From 22ccb684c1cae37411450e6e86a379cd3c29cb8f Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 21 Oct 2024 03:12:10 +0000 Subject: [PATCH 01/16] bonding: return detailed error when loading native XDP fails MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Bonding only supports native XDP for specific modes, which can lead to confusion for users regarding why XDP loads successfully at times and fails at others. This patch enhances error handling by returning detailed error messages, providing users with clearer insights into the specific reasons for the failure when loading native XDP. Reviewed-by: Nikolay Aleksandrov Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20241021031211.814-2-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 3928287f5865..5812e8eaccf1 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -5676,8 +5676,11 @@ static int bond_xdp_set(struct net_device *dev, struct bpf_prog *prog, ASSERT_RTNL(); - if (!bond_xdp_check(bond)) + if (!bond_xdp_check(bond)) { + BOND_NL_ERR(dev, extack, + "No native XDP support for the current bonding mode"); return -EOPNOTSUPP; + } old_prog = bond->xdp_prog; bond->xdp_prog = prog; -- 2.51.0 From 9f59eccd9dd5a4c6a974e02e70b9eed0d3b14245 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 21 Oct 2024 03:12:11 +0000 Subject: [PATCH 02/16] Documentation: bonding: add XDP support explanation MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Add document about which modes have native XDP support. Reviewed-by: Nikolay Aleksandrov Signed-off-by: Hangbin Liu Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20241021031211.814-3-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- Documentation/networking/bonding.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst index e774b48de9f5..7c8d22d68682 100644 --- a/Documentation/networking/bonding.rst +++ b/Documentation/networking/bonding.rst @@ -2916,6 +2916,17 @@ from the bond (``ifenslave -d bond0 eth0``). The bonding driver will then restore the MAC addresses that the slaves had before they were enslaved. +9. What bonding modes support native XDP? +------------------------------------------ + + * balance-rr (0) + * active-backup (1) + * balance-xor (2) + * 802.3ad (4) + +Note that the vlan+srcmac hash policy does not support native XDP. +For other bonding modes, the XDP program must be loaded with generic mode. + 16. Resources and Links ======================= -- 2.51.0 From 25872a079bbbe952eb660249cc9f40fa75623e68 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Wed, 23 Oct 2024 15:41:46 +0200 Subject: [PATCH 03/16] net/mlx5: unique names for per device caches Add the device name to the per device kmem_cache names to ensure their uniqueness. This fixes warnings like this: "kmem_cache of name 'mlx5_fs_fgs' already exists". Signed-off-by: Sebastian Ott Reviewed-by: Breno Leitao Reviewed-by: Tariq Toukan Link: https://patch.msgid.link/20241023134146.28448-1-sebott@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 8505d5e241e1..c2db0a1c132b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -3689,6 +3689,7 @@ void mlx5_fs_core_free(struct mlx5_core_dev *dev) int mlx5_fs_core_alloc(struct mlx5_core_dev *dev) { struct mlx5_flow_steering *steering; + char name[80]; int err = 0; err = mlx5_init_fc_stats(dev); @@ -3713,10 +3714,12 @@ int mlx5_fs_core_alloc(struct mlx5_core_dev *dev) else steering->mode = MLX5_FLOW_STEERING_MODE_DMFS; - steering->fgs_cache = kmem_cache_create("mlx5_fs_fgs", + snprintf(name, sizeof(name), "%s-mlx5_fs_fgs", dev_name(dev->device)); + steering->fgs_cache = kmem_cache_create(name, sizeof(struct mlx5_flow_group), 0, 0, NULL); - steering->ftes_cache = kmem_cache_create("mlx5_fs_ftes", sizeof(struct fs_fte), 0, + snprintf(name, sizeof(name), "%s-mlx5_fs_ftes", dev_name(dev->device)); + steering->ftes_cache = kmem_cache_create(name, sizeof(struct fs_fte), 0, 0, NULL); if (!steering->ftes_cache || !steering->fgs_cache) { err = -ENOMEM; -- 2.51.0 From ba4e469e42fe1a771b5653d179eb12dc4be6b6a8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Oct 2024 13:48:19 +0000 Subject: [PATCH 04/16] vsock: do not leave dangling sk pointer in vsock_create() syzbot was able to trigger the following warning after recent core network cleanup. On error vsock_create() frees the allocated sk object, but sock_init_data() has already attached it to the provided sock object. We must clear sock->sk to avoid possible use-after-free later. WARNING: CPU: 0 PID: 5282 at net/socket.c:1581 __sock_create+0x897/0x950 net/socket.c:1581 Modules linked in: CPU: 0 UID: 0 PID: 5282 Comm: syz.2.43 Not tainted 6.12.0-rc2-syzkaller-00667-g53bac8330865 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:__sock_create+0x897/0x950 net/socket.c:1581 Code: 7f 06 01 65 48 8b 34 25 00 d8 03 00 48 81 c6 b0 08 00 00 48 c7 c7 60 0b 0d 8d e8 d4 9a 3c 02 e9 11 f8 ff ff e8 0a ab 0d f8 90 <0f> 0b 90 e9 82 fd ff ff 89 e9 80 e1 07 fe c1 38 c1 0f 8c c7 f8 ff RSP: 0018:ffffc9000394fda8 EFLAGS: 00010293 RAX: ffffffff89873c46 RBX: ffff888079f3c818 RCX: ffff8880314b9e00 RDX: 0000000000000000 RSI: 00000000ffffffed RDI: 0000000000000000 RBP: ffffffff8d3337f0 R08: ffffffff8987384e R09: ffffffff8989473a R10: dffffc0000000000 R11: fffffbfff203a276 R12: 00000000ffffffed R13: ffff888079f3c8c0 R14: ffffffff898736e7 R15: dffffc0000000000 FS: 00005555680ab500(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f22b11196d0 CR3: 00000000308c0000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: sock_create net/socket.c:1632 [inline] __sys_socket_create net/socket.c:1669 [inline] __sys_socket+0x150/0x3c0 net/socket.c:1716 __do_sys_socket net/socket.c:1730 [inline] __se_sys_socket net/socket.c:1728 [inline] __x64_sys_socket+0x7a/0x90 net/socket.c:1728 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f22b117dff9 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fff56aec0e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000029 RAX: ffffffffffffffda RBX: 00007f22b1335f80 RCX: 00007f22b117dff9 RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000000028 RBP: 00007f22b11f0296 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007f22b1335f80 R14: 00007f22b1335f80 R15: 00000000000012dd Fixes: 48156296a08c ("net: warn, if pf->create does not clear sock->sk on error") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Ignat Korchagin Reviewed-by: Kuniyuki Iwashima Reviewed-by: Stefano Garzarella Link: https://patch.msgid.link/20241022134819.1085254-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/vmw_vsock/af_vsock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 35681adedd9a..109b7a0bd071 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -2417,6 +2417,7 @@ static int vsock_create(struct net *net, struct socket *sock, if (sock->type == SOCK_DGRAM) { ret = vsock_assign_transport(vsk, NULL); if (ret < 0) { + sock->sk = NULL; sock_put(sk); return ret; } -- 2.51.0 From 63afe0c217dc21457dbccca1da61266c47886e61 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 22 Oct 2024 17:14:18 +0200 Subject: [PATCH 05/16] netlink: specs: Add missing phy-ntf command to ethtool spec ETHTOOL_MSG_PHY_NTF description is missing in the ethtool netlink spec. Add it to the spec. Signed-off-by: Kory Maincent Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/20241022151418.875424-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index f6c5d8214c7e..93369f0eb816 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -1956,3 +1956,7 @@ operations: - upstream-sfp-name - downstream-sfp-name dump: *phy-get-op + - + name: phy-ntf + doc: Notification for change in PHY devices. + notify: phy-get -- 2.51.0 From ab101c553bc1f76a839163d1dc0d1e715ad6bb4e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Oct 2024 15:00:59 +0000 Subject: [PATCH 06/16] neighbour: use kvzalloc()/kvfree() mm layer is providing convenient functions, we do not have to work around old limitations. Signed-off-by: Eric Dumazet Cc: Gilad Naaman Reviewed-by: Joe Damato Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241022150059.1345406-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/neighbour.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 395ae1626eef..4b871cecd2ce 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -14,7 +14,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -#include #include #include #include @@ -538,14 +537,7 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) ret = kmalloc(sizeof(*ret), GFP_ATOMIC); if (!ret) return NULL; - if (size <= PAGE_SIZE) { - buckets = kzalloc(size, GFP_ATOMIC); - } else { - buckets = (struct neighbour __rcu **) - __get_free_pages(GFP_ATOMIC | __GFP_ZERO, - get_order(size)); - kmemleak_alloc(buckets, size, 1, GFP_ATOMIC); - } + buckets = kvzalloc(size, GFP_ATOMIC); if (!buckets) { kfree(ret); return NULL; @@ -562,15 +554,8 @@ static void neigh_hash_free_rcu(struct rcu_head *head) struct neigh_hash_table *nht = container_of(head, struct neigh_hash_table, rcu); - size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *); - struct neighbour __rcu **buckets = nht->hash_buckets; - if (size <= PAGE_SIZE) { - kfree(buckets); - } else { - kmemleak_free(buckets); - free_pages((unsigned long)buckets, get_order(size)); - } + kvfree(nht->hash_buckets); kfree(nht); } -- 2.51.0 From 9cb7e40d388d6c0e4677809c6b2950bc67fd8830 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:28 -0700 Subject: [PATCH 07/16] rtnetlink: Make per-netns RTNL dereference helpers to macro. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit When CONFIG_DEBUG_NET_SMALL_RTNL is off, rtnl_net_dereference() is the static inline wrapper of rtnl_dereference() returning a plain (void *) pointer to make sure net is always evaluated as requested in [0]. But, it makes sparse complain [1] when the pointer has __rcu annotation: net/ipv4/devinet.c:674:47: sparse: warning: incorrect type in argument 2 (different address spaces) net/ipv4/devinet.c:674:47: sparse: expected void *p net/ipv4/devinet.c:674:47: sparse: got struct in_ifaddr [noderef] __rcu * Also, if we evaluate net as (void *) in a macro, then the compiler in turn fails to build due to -Werror=unused-value. #define rtnl_net_dereference(net, p) \ ({ \ (void *)net; \ rtnl_dereference(p); \ }) net/ipv4/devinet.c: In function ‘inet_rtm_deladdr’: ./include/linux/rtnetlink.h:154:17: error: statement with no effect [-Werror=unused-value] 154 | (void *)net; \ net/ipv4/devinet.c:674:21: note: in expansion of macro ‘rtnl_net_dereference’ 674 | (ifa = rtnl_net_dereference(net, *ifap)) != NULL; | ^~~~~~~~~~~~~~~~~~~~ Let's go back to the original simplest macro. Note that checkpatch complains about this approach, but it's one-shot and less noisy than the other two. WARNING: Argument 'net' is not used in function-like macro #76: FILE: include/linux/rtnetlink.h:142: +#define rtnl_net_dereference(net, p) \ + rtnl_dereference(p) Fixes: 844e5e7e656d ("rtnetlink: Add assertion helpers for per-netns RTNL.") Link: https://lore.kernel.org/netdev/20241004132145.7fd208e9@kernel.org/ [0] Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202410200325.SaEJmyZS-lkp@intel.com/ [1] Signed-off-by: Kuniyuki Iwashima Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- include/linux/rtnetlink.h | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 8468a4ce8510..0e62918de63b 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -137,21 +137,12 @@ static inline void ASSERT_RTNL_NET(struct net *net) ASSERT_RTNL(); } -static inline void *rcu_dereference_rtnl_net(struct net *net, void *p) -{ - return rcu_dereference_rtnl(p); -} - -static inline void *rtnl_net_dereference(struct net *net, void *p) -{ - return rtnl_dereference(p); -} - -static inline void *rcu_replace_pointer_rtnl_net(struct net *net, - void *rp, void *p) -{ - return rcu_replace_pointer_rtnl(rp, p); -} +#define rcu_dereference_rtnl_net(net, p) \ + rcu_dereference_rtnl(p) +#define rtnl_net_dereference(net, p) \ + rtnl_dereference(p) +#define rcu_replace_pointer_rtnl_net(net, rp, p) \ + rcu_replace_pointer_rtnl(rp, p) #endif static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) -- 2.51.0 From 26d8db55eeacb7dc78672523f57825916d203de4 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:29 -0700 Subject: [PATCH 08/16] rtnetlink: Define RTNL_FLAG_DOIT_PERNET for per-netns RTNL doit(). We will push RTNL down to each doit() as rtnl_net_lock(). We can use RTNL_FLAG_DOIT_UNLOCKED to call doit() without RTNL, but doit() will still hold RTNL. Let's define RTNL_FLAG_DOIT_PERNET as an alias of RTNL_FLAG_DOIT_UNLOCKED. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/net/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index e0d9a8eae6b6..b260c0cc9671 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -12,6 +12,7 @@ typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *); enum rtnl_link_flags { RTNL_FLAG_DOIT_UNLOCKED = BIT(0), +#define RTNL_FLAG_DOIT_PERNET RTNL_FLAG_DOIT_UNLOCKED RTNL_FLAG_BULK_DEL_SUPPORTED = BIT(1), RTNL_FLAG_DUMP_UNLOCKED = BIT(2), RTNL_FLAG_DUMP_SPLIT_NLM_DONE = BIT(3), /* legacy behavior */ -- 2.51.0 From 2d34429d14f9d09b38a8bee6a972a07228378df6 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:30 -0700 Subject: [PATCH 09/16] ipv4: Factorise RTM_NEWADDR validation to inet_validate_rtm(). rtm_to_ifaddr() validates some attributes, looks up a netdev, allocates struct in_ifaddr, and validates IFA_CACHEINFO. There is no reason to delay IFA_CACHEINFO validation. We will push RTNL down to inet_rtm_newaddr(), and then we want to complete rtnetlink validation before rtnl_net_lock(). Let's factorise the validation parts. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/ipv4/devinet.c | 79 ++++++++++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 35 deletions(-) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 5f859d01cbbe..da5412fb34e7 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -846,35 +846,54 @@ static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft, WRITE_ONCE(ifa->ifa_cstamp, ifa->ifa_tstamp); } -static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, - __u32 *pvalid_lft, __u32 *pprefered_lft, - struct netlink_ext_ack *extack) +static int inet_validate_rtm(struct nlmsghdr *nlh, struct nlattr **tb, + struct netlink_ext_ack *extack, + __u32 *valid_lft, __u32 *prefered_lft) { - struct nlattr *tb[IFA_MAX+1]; - struct in_ifaddr *ifa; - struct ifaddrmsg *ifm; - struct net_device *dev; - struct in_device *in_dev; + struct ifaddrmsg *ifm = nlmsg_data(nlh); int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) - goto errout; - - ifm = nlmsg_data(nlh); - err = -EINVAL; + return err; if (ifm->ifa_prefixlen > 32) { NL_SET_ERR_MSG(extack, "ipv4: Invalid prefix length"); - goto errout; + return -EINVAL; } if (!tb[IFA_LOCAL]) { NL_SET_ERR_MSG(extack, "ipv4: Local address is not supplied"); - goto errout; + return -EINVAL; } + if (tb[IFA_CACHEINFO]) { + struct ifa_cacheinfo *ci; + + ci = nla_data(tb[IFA_CACHEINFO]); + if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) { + NL_SET_ERR_MSG(extack, "ipv4: address lifetime invalid"); + return -EINVAL; + } + + *valid_lft = ci->ifa_valid; + *prefered_lft = ci->ifa_prefered; + } + + return 0; +} + +static struct in_ifaddr *inet_rtm_to_ifa(struct net *net, struct nlmsghdr *nlh, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct ifaddrmsg *ifm = nlmsg_data(nlh); + struct in_device *in_dev; + struct net_device *dev; + struct in_ifaddr *ifa; + int err; + dev = __dev_get_by_index(net, ifm->ifa_index); err = -ENODEV; if (!dev) { @@ -923,23 +942,8 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, if (tb[IFA_PROTO]) ifa->ifa_proto = nla_get_u8(tb[IFA_PROTO]); - if (tb[IFA_CACHEINFO]) { - struct ifa_cacheinfo *ci; - - ci = nla_data(tb[IFA_CACHEINFO]); - if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) { - NL_SET_ERR_MSG(extack, "ipv4: address lifetime invalid"); - err = -EINVAL; - goto errout_free; - } - *pvalid_lft = ci->ifa_valid; - *pprefered_lft = ci->ifa_prefered; - } - return ifa; -errout_free: - inet_free_ifa(ifa); errout: return ERR_PTR(err); } @@ -964,15 +968,21 @@ static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa) static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + __u32 prefered_lft = INFINITY_LIFE_TIME; + __u32 valid_lft = INFINITY_LIFE_TIME; struct net *net = sock_net(skb->sk); - struct in_ifaddr *ifa; struct in_ifaddr *ifa_existing; - __u32 valid_lft = INFINITY_LIFE_TIME; - __u32 prefered_lft = INFINITY_LIFE_TIME; + struct nlattr *tb[IFA_MAX + 1]; + struct in_ifaddr *ifa; + int ret; ASSERT_RTNL(); - ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft, extack); + ret = inet_validate_rtm(nlh, tb, extack, &valid_lft, &prefered_lft); + if (ret < 0) + return ret; + + ifa = inet_rtm_to_ifa(net, nlh, tb, extack); if (IS_ERR(ifa)) return PTR_ERR(ifa); @@ -983,8 +993,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, */ set_ifa_lifetime(ifa, valid_lft, prefered_lft); if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) { - int ret = ip_mc_autojoin_config(net, true, ifa); - + ret = ip_mc_autojoin_config(net, true, ifa); if (ret < 0) { NL_SET_ERR_MSG(extack, "ipv4: Multicast auto join failed"); inet_free_ifa(ifa); -- 2.51.0 From abd0deff03d854cb34818e1e01490296d0314ea1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:31 -0700 Subject: [PATCH 10/16] ipv4: Don't allocate ifa for 0.0.0.0 in inet_rtm_newaddr(). When we pass 0.0.0.0 to __inet_insert_ifa(), it frees ifa and returns 0. We can do this check much earlier for RTM_NEWADDR even before allocating struct in_ifaddr. Let's move the validation to 1. inet_insert_ifa() for ioctl() 2. inet_rtm_newaddr() for RTM_NEWADDR Now, we can remove the same check in find_matching_ifa(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/ipv4/devinet.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index da5412fb34e7..8db84c70ebed 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -508,11 +508,6 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, ASSERT_RTNL(); - if (!ifa->ifa_local) { - inet_free_ifa(ifa); - return 0; - } - ifa->ifa_flags &= ~IFA_F_SECONDARY; last_primary = &in_dev->ifa_list; @@ -584,6 +579,11 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, static int inet_insert_ifa(struct in_ifaddr *ifa) { + if (!ifa->ifa_local) { + inet_free_ifa(ifa); + return 0; + } + return __inet_insert_ifa(ifa, NULL, 0, NULL); } @@ -953,15 +953,13 @@ static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa) struct in_device *in_dev = ifa->ifa_dev; struct in_ifaddr *ifa1; - if (!ifa->ifa_local) - return NULL; - in_dev_for_each_ifa_rtnl(ifa1, in_dev) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa) && ifa1->ifa_local == ifa->ifa_local) return ifa1; } + return NULL; } @@ -982,6 +980,9 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (ret < 0) return ret; + if (!nla_get_in_addr(tb[IFA_LOCAL])) + return 0; + ifa = inet_rtm_to_ifa(net, nlh, tb, extack); if (IS_ERR(ifa)) return PTR_ERR(ifa); -- 2.51.0 From 487257786b71172648664164ba567e807e1e11fc Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:32 -0700 Subject: [PATCH 11/16] ipv4: Convert RTM_NEWADDR to per-netns RTNL. The address hash table and GC are already namespacified. Let's push down RTNL into inet_rtm_newaddr() as rtnl_net_lock(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/ipv4/devinet.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 8db84c70ebed..7f24bc38981b 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -974,8 +974,6 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, struct in_ifaddr *ifa; int ret; - ASSERT_RTNL(); - ret = inet_validate_rtm(nlh, tb, extack, &valid_lft, &prefered_lft); if (ret < 0) return ret; @@ -983,9 +981,13 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (!nla_get_in_addr(tb[IFA_LOCAL])) return 0; + rtnl_net_lock(net); + ifa = inet_rtm_to_ifa(net, nlh, tb, extack); - if (IS_ERR(ifa)) - return PTR_ERR(ifa); + if (IS_ERR(ifa)) { + ret = PTR_ERR(ifa); + goto unlock; + } ifa_existing = find_matching_ifa(ifa); if (!ifa_existing) { @@ -998,11 +1000,11 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (ret < 0) { NL_SET_ERR_MSG(extack, "ipv4: Multicast auto join failed"); inet_free_ifa(ifa); - return ret; + goto unlock; } } - return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid, - extack); + + ret = __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid, extack); } else { u32 new_metric = ifa->ifa_rt_priority; u8 new_proto = ifa->ifa_proto; @@ -1012,7 +1014,8 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (nlh->nlmsg_flags & NLM_F_EXCL || !(nlh->nlmsg_flags & NLM_F_REPLACE)) { NL_SET_ERR_MSG(extack, "ipv4: Address already assigned"); - return -EEXIST; + ret = -EEXIST; + goto unlock; } ifa = ifa_existing; @@ -1029,7 +1032,11 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, &net->ipv4.addr_chk_work, 0); rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid); } - return 0; + +unlock: + rtnl_net_unlock(net); + + return ret; } /* @@ -2823,7 +2830,8 @@ static struct rtnl_af_ops inet_af_ops __read_mostly = { }; static const struct rtnl_msg_handler devinet_rtnl_msg_handlers[] __initconst = { - {.protocol = PF_INET, .msgtype = RTM_NEWADDR, .doit = inet_rtm_newaddr}, + {.protocol = PF_INET, .msgtype = RTM_NEWADDR, .doit = inet_rtm_newaddr, + .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_DELADDR, .doit = inet_rtm_deladdr}, {.protocol = PF_INET, .msgtype = RTM_GETADDR, .dumpit = inet_dump_ifaddr, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, -- 2.51.0 From d4b483208b2606add41a22bdd3c8cd6d36009319 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:33 -0700 Subject: [PATCH 12/16] ipv4: Use per-netns RTNL helpers in inet_rtm_newaddr(). inet_rtm_to_ifa() and find_matching_ifa() are called under rtnl_net_lock(). __in_dev_get_rtnl() and in_dev_for_each_ifa_rtnl() there can use per-netns RTNL helpers. Let's define and use __in_dev_get_rtnl_net() and in_dev_for_each_ifa_rtnl_net(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/linux/inetdevice.h | 9 +++++++++ net/ipv4/devinet.c | 8 ++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index d9c690c8c80b..5730ba6b1cfa 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -226,6 +226,10 @@ static __inline__ bool bad_mask(__be32 mask, __be32 addr) for (ifa = rtnl_dereference((in_dev)->ifa_list); ifa; \ ifa = rtnl_dereference(ifa->ifa_next)) +#define in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev) \ + for (ifa = rtnl_net_dereference(net, (in_dev)->ifa_list); ifa; \ + ifa = rtnl_net_dereference(net, ifa->ifa_next)) + #define in_dev_for_each_ifa_rcu(ifa, in_dev) \ for (ifa = rcu_dereference((in_dev)->ifa_list); ifa; \ ifa = rcu_dereference(ifa->ifa_next)) @@ -252,6 +256,11 @@ static inline struct in_device *__in_dev_get_rtnl(const struct net_device *dev) return rtnl_dereference(dev->ip_ptr); } +static inline struct in_device *__in_dev_get_rtnl_net(const struct net_device *dev) +{ + return rtnl_net_dereference(dev_net(dev), dev->ip_ptr); +} + /* called with rcu_read_lock or rtnl held */ static inline bool ip_ignore_linkdown(const struct net_device *dev) { diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 7f24bc38981b..e14e35c22054 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -901,7 +901,7 @@ static struct in_ifaddr *inet_rtm_to_ifa(struct net *net, struct nlmsghdr *nlh, goto errout; } - in_dev = __in_dev_get_rtnl(dev); + in_dev = __in_dev_get_rtnl_net(dev); err = -ENOBUFS; if (!in_dev) goto errout; @@ -948,12 +948,12 @@ errout: return ERR_PTR(err); } -static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa) +static struct in_ifaddr *find_matching_ifa(struct net *net, struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; struct in_ifaddr *ifa1; - in_dev_for_each_ifa_rtnl(ifa1, in_dev) { + in_dev_for_each_ifa_rtnl_net(net, ifa1, in_dev) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa) && ifa1->ifa_local == ifa->ifa_local) @@ -989,7 +989,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, goto unlock; } - ifa_existing = find_matching_ifa(ifa); + ifa_existing = find_matching_ifa(net, ifa); if (!ifa_existing) { /* It would be best to check for !NLM_F_CREATE here but * userspace already relies on not having to provide this. -- 2.51.0 From 4df5066f079cfbc563c2da031b02b4ba2d9e1ba0 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:34 -0700 Subject: [PATCH 13/16] ipv4: Convert RTM_DELADDR to per-netns RTNL. Let's push down RTNL into inet_rtm_deladdr() as rtnl_net_lock(). Now, ip_mc_autojoin_config() is always called under per-netns RTNL, so ASSERT_RTNL() can be replaced with ASSERT_RTNL_NET(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/ipv4/devinet.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index e14e35c22054..6b7780e12f34 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -645,7 +645,7 @@ static int ip_mc_autojoin_config(struct net *net, bool join, struct sock *sk = net->ipv4.mc_autojoin_sk; int ret; - ASSERT_RTNL(); + ASSERT_RTNL_NET(net); lock_sock(sk); if (join) @@ -671,22 +671,24 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, struct in_ifaddr *ifa; int err; - ASSERT_RTNL(); - err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) - goto errout; + goto out; ifm = nlmsg_data(nlh); + + rtnl_net_lock(net); + in_dev = inetdev_by_index(net, ifm->ifa_index); if (!in_dev) { NL_SET_ERR_MSG(extack, "ipv4: Device not found"); err = -ENODEV; - goto errout; + goto unlock; } - for (ifap = &in_dev->ifa_list; (ifa = rtnl_dereference(*ifap)) != NULL; + for (ifap = &in_dev->ifa_list; + (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) { if (tb[IFA_LOCAL] && ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL])) @@ -702,13 +704,16 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, if (ipv4_is_multicast(ifa->ifa_address)) ip_mc_autojoin_config(net, false, ifa); + __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid); - return 0; + goto unlock; } NL_SET_ERR_MSG(extack, "ipv4: Address not found"); err = -EADDRNOTAVAIL; -errout: +unlock: + rtnl_net_unlock(net); +out: return err; } @@ -2832,7 +2837,8 @@ static struct rtnl_af_ops inet_af_ops __read_mostly = { static const struct rtnl_msg_handler devinet_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_NEWADDR, .doit = inet_rtm_newaddr, .flags = RTNL_FLAG_DOIT_PERNET}, - {.protocol = PF_INET, .msgtype = RTM_DELADDR, .doit = inet_rtm_deladdr}, + {.protocol = PF_INET, .msgtype = RTM_DELADDR, .doit = inet_rtm_deladdr, + .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_GETADDR, .dumpit = inet_dump_ifaddr, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, {.protocol = PF_INET, .msgtype = RTM_GETNETCONF, -- 2.51.0 From c350c4761e7f4767dea59aef036ce13276466fd0 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:35 -0700 Subject: [PATCH 14/16] ipv4: Convert check_lifetime() to per-netns RTNL. Since commit 1675f385213e ("ipv4: Namespacify IPv4 address GC."), check_lifetime() works on a per-netns basis. Let's use rtnl_net_lock() and rtnl_net_dereference(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/ipv4/devinet.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 6b7780e12f34..5eaef3bbb987 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -771,7 +771,8 @@ static void check_lifetime(struct work_struct *work) rcu_read_unlock(); if (!change_needed) continue; - rtnl_lock(); + + rtnl_net_lock(net); hlist_for_each_entry_safe(ifa, n, head, addr_lst) { unsigned long age; @@ -788,7 +789,7 @@ static void check_lifetime(struct work_struct *work) struct in_ifaddr *tmp; ifap = &ifa->ifa_dev->ifa_list; - tmp = rtnl_dereference(*ifap); + tmp = rtnl_net_dereference(net, *ifap); while (tmp) { if (tmp == ifa) { inet_del_ifa(ifa->ifa_dev, @@ -796,7 +797,7 @@ static void check_lifetime(struct work_struct *work) break; } ifap = &tmp->ifa_next; - tmp = rtnl_dereference(*ifap); + tmp = rtnl_net_dereference(net, *ifap); } } else if (ifa->ifa_preferred_lft != INFINITY_LIFE_TIME && @@ -806,7 +807,7 @@ static void check_lifetime(struct work_struct *work) rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); } } - rtnl_unlock(); + rtnl_net_unlock(net); } next_sec = round_jiffies_up(next); -- 2.51.0 From d1c81818aa227b37d65b40f9883109c5256b9bfb Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:36 -0700 Subject: [PATCH 15/16] rtnetlink: Define rtnl_net_trylock(). We will need the per-netns version of rtnl_trylock(). rtnl_net_trylock() calls __rtnl_net_lock() only when rtnl_trylock() successfully holds RTNL. When RTNL is removed, we will use mutex_trylock() for per-netns RTNL. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/linux/rtnetlink.h | 6 ++++++ net/core/rtnetlink.c | 11 +++++++++++ 2 files changed, 17 insertions(+) diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 0e62918de63b..14b88f551920 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -101,6 +101,7 @@ void __rtnl_net_lock(struct net *net); void __rtnl_net_unlock(struct net *net); void rtnl_net_lock(struct net *net); void rtnl_net_unlock(struct net *net); +int rtnl_net_trylock(struct net *net); int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b); bool rtnl_net_is_locked(struct net *net); @@ -132,6 +133,11 @@ static inline void rtnl_net_unlock(struct net *net) rtnl_unlock(); } +static inline int rtnl_net_trylock(struct net *net) +{ + return rtnl_trylock(); +} + static inline void ASSERT_RTNL_NET(struct net *net) { ASSERT_RTNL(); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 194a81e5f608..dda8230fdfd4 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -210,6 +210,17 @@ void rtnl_net_unlock(struct net *net) } EXPORT_SYMBOL(rtnl_net_unlock); +int rtnl_net_trylock(struct net *net) +{ + int ret = rtnl_trylock(); + + if (ret) + __rtnl_net_lock(net); + + return ret; +} +EXPORT_SYMBOL(rtnl_net_trylock); + static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) { if (net_eq(net_a, net_b)) -- 2.51.0 From 77453d428d4c9c613341de7f9b943f0c83f37a27 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 21 Oct 2024 11:32:37 -0700 Subject: [PATCH 16/16] ipv4: Convert devinet_sysctl_forward() to per-netns RTNL. devinet_sysctl_forward() touches only a single netns. Let's use rtnl_trylock() and __in_dev_get_rtnl_net(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/ipv4/devinet.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 5eaef3bbb987..bd65e0ef774e 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2395,7 +2395,7 @@ static void inet_forward_change(struct net *net) if (on) dev_disable_lro(dev); - in_dev = __in_dev_get_rtnl(dev); + in_dev = __in_dev_get_rtnl_net(dev); if (in_dev) { IN_DEV_CONF_SET(in_dev, FORWARDING, on); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, @@ -2486,7 +2486,7 @@ static int devinet_sysctl_forward(const struct ctl_table *ctl, int write, if (write && *valp != val) { if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) { - if (!rtnl_trylock()) { + if (!rtnl_net_trylock(net)) { /* Restore the original values before restarting */ *valp = val; *ppos = pos; @@ -2505,7 +2505,7 @@ static int devinet_sysctl_forward(const struct ctl_table *ctl, int write, idev->dev->ifindex, cnf); } - rtnl_unlock(); + rtnl_net_unlock(net); rt_cache_flush(net); } else inet_netconf_notify_devconf(net, RTM_NEWNETCONF, -- 2.51.0