From d6306b9d98850a1a7247eb10fe5bb78506ee343b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Feb 2025 20:23:22 -0800 Subject: [PATCH 01/16] ipv4: fib: Remove fib_info_hash_size. We will allocate the fib_info hash tables per netns. There are 5 global variables for fib_info hash tables: fib_info_hash, fib_info_laddrhash, fib_info_hash_size, fib_info_hash_bits, fib_info_cnt. However, fib_info_laddrhash and fib_info_hash_size can be easily calculated from fib_info_hash and fib_info_hash_bits. Let's remove fib_info_hash_size and use (1 << fib_info_hash_bits) instead. Now we need not pass the new hash table size to fib_info_hash_move(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250228042328.96624-7-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_semantics.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 6cc518dd665f..9dc09e80b92b 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -51,7 +51,6 @@ #include "fib_lookup.h" static struct hlist_head *fib_info_hash; -static unsigned int fib_info_hash_size; static unsigned int fib_info_hash_bits; static unsigned int fib_info_cnt; @@ -1255,17 +1254,15 @@ int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, return err; } -static void fib_info_hash_move(struct hlist_head *new_info_hash, - unsigned int new_size) +static void fib_info_hash_move(struct hlist_head *new_info_hash) { - unsigned int old_size = fib_info_hash_size; + unsigned int old_size = 1 << fib_info_hash_bits; struct hlist_head *old_info_hash; unsigned int i; ASSERT_RTNL(); old_info_hash = fib_info_hash; - fib_info_hash_size = new_size; - fib_info_hash_bits = ilog2(new_size); + fib_info_hash_bits += 1; fib_info_hash = new_info_hash; for (i = 0; i < old_size; i++) { @@ -1406,13 +1403,12 @@ struct fib_info *fib_create_info(struct fib_config *cfg, } #endif - if (fib_info_cnt >= fib_info_hash_size) { - unsigned int new_hash_bits = fib_info_hash_bits + 1; + if (fib_info_cnt >= (1 << fib_info_hash_bits)) { struct hlist_head *new_info_hash; - new_info_hash = fib_info_hash_alloc(new_hash_bits); + new_info_hash = fib_info_hash_alloc(fib_info_hash_bits + 1); if (new_info_hash) - fib_info_hash_move(new_info_hash, 1 << new_hash_bits); + fib_info_hash_move(new_info_hash); } fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL); @@ -2256,7 +2252,6 @@ int __net_init fib4_semantics_init(struct net *net) return -ENOMEM; fib_info_hash_bits = hash_bits; - fib_info_hash_size = 1 << hash_bits; return 0; } -- 2.51.0 From b79bcaf7d95261bb8c205fa6826100ca4ed961c1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Feb 2025 20:23:23 -0800 Subject: [PATCH 02/16] ipv4: fib: Add fib_info_hash_grow(). When the number of struct fib_info exceeds the hash table size in fib_create_info(), we try to allocate a new hash table with the doubled size. The allocation is done in fib_create_info(), and if successful, each struct fib_info is moved to the new hash table by fib_info_hash_move(). Let's integrate the allocation and fib_info_hash_move() as fib_info_hash_grow() to make the following change cleaner. While at it, fib_info_hash_grow() is placed near other hash-table-specific functions. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250228042328.96624-8-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_semantics.c | 85 +++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 44 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 9dc09e80b92b..0cd40ff18d8b 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -376,6 +376,46 @@ static void fib_info_hash_free(struct hlist_head *head) kvfree(head); } +static void fib_info_hash_grow(void) +{ + struct hlist_head *new_info_hash, *old_info_hash; + unsigned int old_size = 1 << fib_info_hash_bits; + unsigned int i; + + if (fib_info_cnt < old_size) + return; + + new_info_hash = fib_info_hash_alloc(fib_info_hash_bits + 1); + if (!new_info_hash) + return; + + old_info_hash = fib_info_hash; + fib_info_hash = new_info_hash; + fib_info_hash_bits += 1; + + for (i = 0; i < old_size; i++) { + struct hlist_head *head = &old_info_hash[i]; + struct hlist_node *n; + struct fib_info *fi; + + hlist_for_each_entry_safe(fi, n, head, fib_hash) + hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); + } + + for (i = 0; i < old_size; i++) { + struct hlist_head *lhead = &old_info_hash[old_size + i]; + struct hlist_node *n; + struct fib_info *fi; + + hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) + hlist_add_head(&fi->fib_lhash, + fib_info_laddrhash_bucket(fi->fib_net, + fi->fib_prefsrc)); + } + + fib_info_hash_free(old_info_hash); +} + /* no metrics, only nexthop id */ static struct fib_info *fib_find_info_nh(struct net *net, const struct fib_config *cfg) @@ -1254,43 +1294,6 @@ int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, return err; } -static void fib_info_hash_move(struct hlist_head *new_info_hash) -{ - unsigned int old_size = 1 << fib_info_hash_bits; - struct hlist_head *old_info_hash; - unsigned int i; - - ASSERT_RTNL(); - old_info_hash = fib_info_hash; - fib_info_hash_bits += 1; - fib_info_hash = new_info_hash; - - for (i = 0; i < old_size; i++) { - struct hlist_head *head = &old_info_hash[i]; - struct hlist_node *n; - struct fib_info *fi; - - hlist_for_each_entry_safe(fi, n, head, fib_hash) - hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); - } - - for (i = 0; i < old_size; i++) { - struct hlist_head *lhead = &old_info_hash[old_size + i]; - struct hlist_node *n; - struct fib_info *fi; - - hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) { - struct hlist_head *ldest; - - ldest = fib_info_laddrhash_bucket(fi->fib_net, - fi->fib_prefsrc); - hlist_add_head(&fi->fib_lhash, ldest); - } - } - - fib_info_hash_free(old_info_hash); -} - __be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc, unsigned char scope) { @@ -1403,13 +1406,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, } #endif - if (fib_info_cnt >= (1 << fib_info_hash_bits)) { - struct hlist_head *new_info_hash; - - new_info_hash = fib_info_hash_alloc(fib_info_hash_bits + 1); - if (new_info_hash) - fib_info_hash_move(new_info_hash); - } + fib_info_hash_grow(); fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL); if (!fi) { -- 2.51.0 From 9f7f3ebeba9388c30b0cba9dc5df0c43d0e07605 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Feb 2025 20:23:24 -0800 Subject: [PATCH 03/16] ipv4: fib: Namespacify fib_info hash tables. We will convert RTM_NEWROUTE and RTM_DELROUTE to per-netns RTNL. Then, we need to have per-netns hash tables for struct fib_info. Let's allocate the hash tables per netns. fib_info_hash, fib_info_hash_bits, and fib_info_cnt are now moved to struct netns_ipv4 and accessed with net->ipv4.fib_XXX. Also, the netns checks are removed from fib_find_info_nh() and fib_find_info(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250228042328.96624-9-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 3 ++ net/ipv4/fib_semantics.c | 61 +++++++++++++++++----------------------- 2 files changed, 29 insertions(+), 35 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 45ac125e8aeb..650b2dc9199f 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -111,6 +111,9 @@ struct netns_ipv4 { #endif struct hlist_head *fib_table_hash; struct sock *fibnl; + struct hlist_head *fib_info_hash; + unsigned int fib_info_hash_bits; + unsigned int fib_info_cnt; struct sock *mc_autojoin_sk; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 0cd40ff18d8b..f68bb9e34c34 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -50,10 +50,6 @@ #include "fib_lookup.h" -static struct hlist_head *fib_info_hash; -static unsigned int fib_info_hash_bits; -static unsigned int fib_info_cnt; - /* for_nexthops and change_nexthops only used when nexthop object * is not set in a fib_info. The logic within can reference fib_nh. */ @@ -256,8 +252,7 @@ void fib_release_info(struct fib_info *fi) ASSERT_RTNL(); if (fi && refcount_dec_and_test(&fi->fib_treeref)) { hlist_del(&fi->fib_hash); - - fib_info_cnt--; + fi->fib_net->ipv4.fib_info_cnt--; if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); @@ -333,11 +328,12 @@ static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope, static unsigned int fib_info_hashfn_result(const struct net *net, unsigned int val) { - return hash_32(val ^ net_hash_mix(net), fib_info_hash_bits); + return hash_32(val ^ net_hash_mix(net), net->ipv4.fib_info_hash_bits); } static struct hlist_head *fib_info_hash_bucket(struct fib_info *fi) { + struct net *net = fi->fib_net; unsigned int val; val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol, @@ -352,16 +348,18 @@ static struct hlist_head *fib_info_hash_bucket(struct fib_info *fi) } endfor_nexthops(fi) } - return &fib_info_hash[fib_info_hashfn_result(fi->fib_net, val)]; + return &net->ipv4.fib_info_hash[fib_info_hashfn_result(net, val)]; } static struct hlist_head *fib_info_laddrhash_bucket(const struct net *net, __be32 val) { - u32 slot = hash_32(net_hash_mix(net) ^ (__force u32)val, - fib_info_hash_bits); + unsigned int hash_bits = net->ipv4.fib_info_hash_bits; + u32 slot; - return &fib_info_hash[(1 << fib_info_hash_bits) + slot]; + slot = hash_32(net_hash_mix(net) ^ (__force u32)val, hash_bits); + + return &net->ipv4.fib_info_hash[(1 << hash_bits) + slot]; } static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits) @@ -376,22 +374,22 @@ static void fib_info_hash_free(struct hlist_head *head) kvfree(head); } -static void fib_info_hash_grow(void) +static void fib_info_hash_grow(struct net *net) { + unsigned int old_size = 1 << net->ipv4.fib_info_hash_bits; struct hlist_head *new_info_hash, *old_info_hash; - unsigned int old_size = 1 << fib_info_hash_bits; unsigned int i; - if (fib_info_cnt < old_size) + if (net->ipv4.fib_info_cnt < old_size) return; - new_info_hash = fib_info_hash_alloc(fib_info_hash_bits + 1); + new_info_hash = fib_info_hash_alloc(net->ipv4.fib_info_hash_bits + 1); if (!new_info_hash) return; - old_info_hash = fib_info_hash; - fib_info_hash = new_info_hash; - fib_info_hash_bits += 1; + old_info_hash = net->ipv4.fib_info_hash; + net->ipv4.fib_info_hash = new_info_hash; + net->ipv4.fib_info_hash_bits += 1; for (i = 0; i < old_size; i++) { struct hlist_head *head = &old_info_hash[i]; @@ -429,13 +427,12 @@ static struct fib_info *fib_find_info_nh(struct net *net, (__force u32)cfg->fc_prefsrc, cfg->fc_priority); hash = fib_info_hashfn_result(net, hash); - head = &fib_info_hash[hash]; + head = &net->ipv4.fib_info_hash[hash]; hlist_for_each_entry(fi, head, fib_hash) { - if (!net_eq(fi->fib_net, net)) - continue; if (!fi->nh || fi->nh->id != cfg->fc_nh_id) continue; + if (cfg->fc_protocol == fi->fib_protocol && cfg->fc_scope == fi->fib_scope && cfg->fc_prefsrc == fi->fib_prefsrc && @@ -455,10 +452,9 @@ static struct fib_info *fib_find_info(struct fib_info *nfi) struct fib_info *fi; hlist_for_each_entry(fi, head, fib_hash) { - if (!net_eq(fi->fib_net, nfi->fib_net)) - continue; if (fi->fib_nhs != nfi->fib_nhs) continue; + if (nfi->fib_protocol == fi->fib_protocol && nfi->fib_scope == fi->fib_scope && nfi->fib_prefsrc == fi->fib_prefsrc && @@ -1406,7 +1402,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, } #endif - fib_info_hash_grow(); + fib_info_hash_grow(net); fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL); if (!fi) { @@ -1550,7 +1546,7 @@ link_it: refcount_set(&fi->fib_treeref, 1); refcount_set(&fi->fib_clntref, 1); - fib_info_cnt++; + net->ipv4.fib_info_cnt++; hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); if (fi->fib_prefsrc) { @@ -2241,22 +2237,17 @@ int __net_init fib4_semantics_init(struct net *net) { unsigned int hash_bits = 4; - if (!net_eq(net, &init_net)) - return 0; - - fib_info_hash = fib_info_hash_alloc(hash_bits); - if (!fib_info_hash) + net->ipv4.fib_info_hash = fib_info_hash_alloc(hash_bits); + if (!net->ipv4.fib_info_hash) return -ENOMEM; - fib_info_hash_bits = hash_bits; + net->ipv4.fib_info_hash_bits = hash_bits; + net->ipv4.fib_info_cnt = 0; return 0; } void __net_exit fib4_semantics_exit(struct net *net) { - if (!net_eq(net, &init_net)) - return; - - fib_info_hash_free(fib_info_hash); + fib_info_hash_free(net->ipv4.fib_info_hash); } -- 2.51.0 From af5cd2a8f07842d14be2cbc628ea511adfd6bba9 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Feb 2025 20:23:25 -0800 Subject: [PATCH 04/16] ipv4: fib: Hold rtnl_net_lock() for ip_fib_net_exit(). ip_fib_net_exit() requires RTNL and is called from fib_net_init() and fib_net_exit_batch(). Let's hold rtnl_net_lock() before ip_fib_net_exit(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250228042328.96624-10-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_frontend.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 40c062f820f2..c48ed369b179 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1575,7 +1575,7 @@ static void ip_fib_net_exit(struct net *net) { int i; - ASSERT_RTNL(); + ASSERT_RTNL_NET(net); #ifdef CONFIG_IP_MULTIPLE_TABLES RCU_INIT_POINTER(net->ipv4.fib_main, NULL); RCU_INIT_POINTER(net->ipv4.fib_default, NULL); @@ -1635,9 +1635,9 @@ out_proc: out_nlfl: fib4_semantics_exit(net); out_semantics: - rtnl_lock(); + rtnl_net_lock(net); ip_fib_net_exit(net); - rtnl_unlock(); + rtnl_net_unlock(net); goto out; } @@ -1652,9 +1652,11 @@ static void __net_exit fib_net_exit_batch(struct list_head *net_list) struct net *net; rtnl_lock(); - list_for_each_entry(net, net_list, exit_list) + list_for_each_entry(net, net_list, exit_list) { + __rtnl_net_lock(net); ip_fib_net_exit(net); - + __rtnl_net_unlock(net); + } rtnl_unlock(); list_for_each_entry(net, net_list, exit_list) -- 2.51.0 From c0ebe1cdc2cff0dee092a67f2c50377bb5fcf43d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Feb 2025 20:23:26 -0800 Subject: [PATCH 05/16] ipv4: fib: Hold rtnl_net_lock() in ip_rt_ioctl(). ioctl(SIOCADDRT/SIOCDELRT) calls ip_rt_ioctl() to add/remove a route in the netns of the specified socket. Let's hold rtnl_net_lock() there. Note that rtentry_to_fib_config() can be called without rtnl_net_lock() if we convert rtentry.dev handling to RCU later. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250228042328.96624-11-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_frontend.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index c48ed369b179..a76dacc3e577 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -553,18 +553,16 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, const struct in_ifaddr *ifa; struct in_device *in_dev; - in_dev = __in_dev_get_rtnl(dev); + in_dev = __in_dev_get_rtnl_net(dev); if (!in_dev) return -ENODEV; *colon = ':'; - rcu_read_lock(); - in_dev_for_each_ifa_rcu(ifa, in_dev) { + in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev) { if (strcmp(ifa->ifa_label, devname) == 0) break; } - rcu_read_unlock(); if (!ifa) return -ENODEV; @@ -635,7 +633,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt) if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; - rtnl_lock(); + rtnl_net_lock(net); err = rtentry_to_fib_config(net, cmd, rt, &cfg); if (err == 0) { struct fib_table *tb; @@ -659,7 +657,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt) /* allocated by rtentry_to_fib_config() */ kfree(cfg.fc_mx); } - rtnl_unlock(); + rtnl_net_unlock(net); return err; } return -EINVAL; -- 2.51.0 From 254ba7e6032d3fc738050d500b0c1d8197af90ca Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Feb 2025 20:23:27 -0800 Subject: [PATCH 06/16] ipv4: fib: Move fib_valid_key_len() to rtm_to_fib_config(). fib_valid_key_len() is called in the beginning of fib_table_insert() or fib_table_delete() to check if the prefix length is valid. fib_table_insert() and fib_table_delete() are called from 3 paths - ip_rt_ioctl() - inet_rtm_newroute() / inet_rtm_delroute() - fib_magic() In the first ioctl() path, rtentry_to_fib_config() checks the prefix length with bad_mask(). Also, fib_magic() always passes the correct prefix: 32 or ifa->ifa_prefixlen, which is already validated. Let's move fib_valid_key_len() to the rtnetlink path, rtm_to_fib_config(). While at it, 2 direct returns in rtm_to_fib_config() are changed to goto to match other places in the same function Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250228042328.96624-12-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_frontend.c | 18 ++++++++++++++++-- net/ipv4/fib_trie.c | 22 ---------------------- 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index a76dacc3e577..a6372d934e45 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -835,19 +835,33 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, } } + if (cfg->fc_dst_len > 32) { + NL_SET_ERR_MSG(extack, "Invalid prefix length"); + err = -EINVAL; + goto errout; + } + + if (cfg->fc_dst_len < 32 && (ntohl(cfg->fc_dst) << cfg->fc_dst_len)) { + NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length"); + err = -EINVAL; + goto errout; + } + if (cfg->fc_nh_id) { if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_encap || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Nexthop specification and nexthop id are mutually exclusive"); - return -EINVAL; + err = -EINVAL; + goto errout; } } if (has_gw && has_via) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); - return -EINVAL; + err = -EINVAL; + goto errout; } if (!cfg->fc_table) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index d6411ac81096..59a6f0a9638f 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1187,22 +1187,6 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp, return 0; } -static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack) -{ - if (plen > KEYLENGTH) { - NL_SET_ERR_MSG(extack, "Invalid prefix length"); - return false; - } - - if ((plen < KEYLENGTH) && (key << plen)) { - NL_SET_ERR_MSG(extack, - "Invalid prefix for given prefix length"); - return false; - } - - return true; -} - static void fib_remove_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *old); @@ -1223,9 +1207,6 @@ int fib_table_insert(struct net *net, struct fib_table *tb, key = ntohl(cfg->fc_dst); - if (!fib_valid_key_len(key, plen, extack)) - return -EINVAL; - pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); fi = fib_create_info(cfg, extack); @@ -1717,9 +1698,6 @@ int fib_table_delete(struct net *net, struct fib_table *tb, key = ntohl(cfg->fc_dst); - if (!fib_valid_key_len(key, plen, extack)) - return -EINVAL; - l = fib_find_node(t, &tp, key); if (!l) return -ESRCH; -- 2.51.0 From 1dd2af7963e95df90590fe40425fe1bdf982ae8f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Feb 2025 20:23:28 -0800 Subject: [PATCH 07/16] ipv4: fib: Convert RTM_NEWROUTE and RTM_DELROUTE to per-netns RTNL. We converted fib_info hash tables to per-netns one and now ready to convert RTM_NEWROUTE and RTM_DELROUTE to per-netns RTNL. Let's hold rtnl_net_lock() in inet_rtm_newroute() and inet_rtm_delroute(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250228042328.96624-13-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_frontend.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index a6372d934e45..6de77415b5b3 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -884,20 +884,24 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; + rtnl_net_lock(net); + if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); err = -EINVAL; - goto errout; + goto unlock; } tb = fib_get_table(net, cfg.fc_table); if (!tb) { NL_SET_ERR_MSG(extack, "FIB table does not exist"); err = -ESRCH; - goto errout; + goto unlock; } err = fib_table_delete(net, tb, &cfg, extack); +unlock: + rtnl_net_unlock(net); errout: return err; } @@ -914,15 +918,20 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; + rtnl_net_lock(net); + tb = fib_new_table(net, cfg.fc_table); if (!tb) { err = -ENOBUFS; - goto errout; + goto unlock; } err = fib_table_insert(net, tb, &cfg, extack); if (!err && cfg.fc_type == RTN_LOCAL) net->ipv4.fib_has_custom_local_routes = true; + +unlock: + rtnl_net_unlock(net); errout: return err; } @@ -1683,9 +1692,9 @@ static struct pernet_operations fib_net_ops = { static const struct rtnl_msg_handler fib_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_NEWROUTE, - .doit = inet_rtm_newroute}, + .doit = inet_rtm_newroute, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_DELROUTE, - .doit = inet_rtm_delroute}, + .doit = inet_rtm_delroute, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_GETROUTE, .dumpit = inet_dump_fib, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, }; -- 2.51.0 From e34100c2ecbb79a7d69474eb2f58545bb9926a5f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 1 Mar 2025 20:14:19 +0000 Subject: [PATCH 08/16] tcp: add a drop_reason pointer to tcp_check_req() We want to add new drop reasons for packets dropped in 3WHS in the following patches. tcp_rcv_state_process() has to set reason to TCP_FASTOPEN, because tcp_check_req() will conditionally overwrite the drop_reason. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250301201424.2046477-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 2 +- net/ipv4/tcp_input.c | 5 ++--- net/ipv4/tcp_ipv4.c | 3 ++- net/ipv4/tcp_minisocks.c | 3 ++- net/ipv6/tcp_ipv6.c | 3 ++- 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index f9b9377a2897..a9bc959fb102 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -392,7 +392,7 @@ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, u32 *tw_isn); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, - bool *lost_race); + bool *lost_race, enum skb_drop_reason *drop_reason); enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb); void tcp_enter_loss(struct sock *sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d22ad553b45b..4e2212348088 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6812,10 +6812,9 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && sk->sk_state != TCP_FIN_WAIT1); - if (!tcp_check_req(sk, skb, req, true, &req_stolen)) { - SKB_DR_SET(reason, TCP_FASTOPEN); + SKB_DR_SET(reason, TCP_FASTOPEN); + if (!tcp_check_req(sk, skb, req, true, &req_stolen, &reason)) goto discard; - } } if (!th->ack && !th->rst && !th->syn) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7900855237d9..218f01a8cc5f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2265,7 +2265,8 @@ lookup: th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); - nsk = tcp_check_req(sk, skb, req, false, &req_stolen); + nsk = tcp_check_req(sk, skb, req, false, &req_stolen, + &drop_reason); } else { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 29b54ade7572..46c86c4f80e9 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -657,7 +657,8 @@ EXPORT_SYMBOL(tcp_create_openreq_child); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - bool fastopen, bool *req_stolen) + bool fastopen, bool *req_stolen, + enum skb_drop_reason *drop_reason) { struct tcp_options_received tmp_opt; struct sock *child; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index a80608260298..d01088ab80d2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1828,7 +1828,8 @@ lookup: th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); - nsk = tcp_check_req(sk, skb, req, false, &req_stolen); + nsk = tcp_check_req(sk, skb, req, false, &req_stolen, + &drop_reason); } else { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; } -- 2.51.0 From a11a791ca81e5cd9bf2d48e1368d836bc53f00ca Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 1 Mar 2025 20:14:20 +0000 Subject: [PATCH 09/16] tcp: add four drop reasons to tcp_check_req() Use two existing drop reasons in tcp_check_req(): - TCP_RFC7323_PAWS - TCP_OVERWINDOW Add two new ones: - TCP_RFC7323_TSECR (corresponds to LINUX_MIB_TSECRREJECTED) - TCP_LISTEN_OVERFLOW (when a listener accept queue is full) Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250301201424.2046477-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 9 +++++++++ net/ipv4/tcp_minisocks.c | 10 ++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 32a34dfe8cc5..e4fdc6b54cef 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -40,6 +40,8 @@ FN(TCP_OFOMERGE) \ FN(TCP_RFC7323_PAWS) \ FN(TCP_RFC7323_PAWS_ACK) \ + FN(TCP_RFC7323_TSECR) \ + FN(TCP_LISTEN_OVERFLOW) \ FN(TCP_OLD_SEQUENCE) \ FN(TCP_INVALID_SEQUENCE) \ FN(TCP_INVALID_ACK_SEQUENCE) \ @@ -281,6 +283,13 @@ enum skb_drop_reason { * Corresponds to LINUX_MIB_PAWS_OLD_ACK. */ SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK, + /** + * @SKB_DROP_REASON_TCP_RFC7323_TSECR: PAWS check, invalid TSEcr. + * Corresponds to LINUX_MIB_TSECRREJECTED. + */ + SKB_DROP_REASON_TCP_RFC7323_TSECR, + /** @SKB_DROP_REASON_TCP_LISTEN_OVERFLOW: listener queue full. */ + SKB_DROP_REASON_TCP_LISTEN_OVERFLOW, /** @SKB_DROP_REASON_TCP_OLD_SEQUENCE: Old SEQ field (duplicate packet) */ SKB_DROP_REASON_TCP_OLD_SEQUENCE, /** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 46c86c4f80e9..ba4a5d7f251d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -809,10 +809,15 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, &tcp_rsk(req)->last_oow_ack_time)) req->rsk_ops->send_ack(sk, skb, req); - if (paws_reject) + if (paws_reject) { + SKB_DR_SET(*drop_reason, TCP_RFC7323_PAWS); NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); - else if (tsecr_reject) + } else if (tsecr_reject) { + SKB_DR_SET(*drop_reason, TCP_RFC7323_TSECR); NET_INC_STATS(sock_net(sk), LINUX_MIB_TSECRREJECTED); + } else { + SKB_DR_SET(*drop_reason, TCP_OVERWINDOW); + } return NULL; } @@ -882,6 +887,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: + SKB_DR_SET(*drop_reason, TCP_LISTEN_OVERFLOW); if (sk != req->rsk_listener) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); -- 2.51.0 From e7b9ecce562ca6a1de32c56c597fa45e08c44ec0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 1 Mar 2025 20:14:21 +0000 Subject: [PATCH 10/16] tcp: convert to dev_net_rcu() TCP uses of dev_net() are under RCU protection, change them to dev_net_rcu() to get LOCKDEP support. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250301201424.2046477-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet6_hashtables.h | 2 +- include/net/inet_hashtables.h | 2 +- net/ipv4/tcp_ipv4.c | 12 ++++++------ net/ipv4/tcp_metrics.c | 6 +++--- net/ipv6/tcp_ipv6.c | 22 +++++++++++----------- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 74dd90ff5f12..c32878c69179 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -150,7 +150,7 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, int iif, int sdif, bool *refcounted) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net = dev_net_rcu(skb_dst(skb)->dev); const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct sock *sk; diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 5eea47f135a4..da818fb0205f 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -492,7 +492,7 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, const int sdif, bool *refcounted) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net = dev_net_rcu(skb_dst(skb)->dev); const struct iphdr *iph = ip_hdr(skb); struct sock *sk; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 218f01a8cc5f..ae07613e4f33 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -494,14 +494,14 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); - struct tcp_sock *tp; + struct net *net = dev_net_rcu(skb->dev); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; - struct sock *sk; struct request_sock *fastopen; + struct tcp_sock *tp; u32 seq, snd_una; + struct sock *sk; int err; - struct net *net = dev_net(skb->dev); sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, iph->daddr, th->dest, iph->saddr, @@ -786,7 +786,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, arg.iov[0].iov_base = (unsigned char *)&rep; arg.iov[0].iov_len = sizeof(rep.th); - net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); + net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh)) @@ -1961,7 +1961,7 @@ EXPORT_SYMBOL(tcp_v4_do_rcv); int tcp_v4_early_demux(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); const struct iphdr *iph; const struct tcphdr *th; struct sock *sk; @@ -2172,7 +2172,7 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, int tcp_v4_rcv(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); enum skb_drop_reason drop_reason; int sdif = inet_sdif(skb); int dif = inet_iif(skb); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 95669935494e..4251670e328c 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -170,7 +170,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, bool reclaim = false; spin_lock_bh(&tcp_metrics_lock); - net = dev_net(dst->dev); + net = dev_net_rcu(dst->dev); /* While waiting for the spin-lock the cache might have been populated * with this entry and so we have to check again. @@ -273,7 +273,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, return NULL; } - net = dev_net(dst->dev); + net = dev_net_rcu(dst->dev); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); @@ -318,7 +318,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, else return NULL; - net = dev_net(dst->dev); + net = dev_net_rcu(dst->dev); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d01088ab80d2..fe75ad8e606c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -376,7 +376,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, { const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); struct request_sock *fastopen; struct ipv6_pinfo *np; struct tcp_sock *tp; @@ -866,16 +866,16 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 int oif, int rst, u8 tclass, __be32 label, u32 priority, u32 txhash, struct tcp_key *key) { - const struct tcphdr *th = tcp_hdr(skb); - struct tcphdr *t1; - struct sk_buff *buff; - struct flowi6 fl6; - struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); - struct sock *ctl_sk = net->ipv6.tcp_sk; + struct net *net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); unsigned int tot_len = sizeof(struct tcphdr); + struct sock *ctl_sk = net->ipv6.tcp_sk; + const struct tcphdr *th = tcp_hdr(skb); __be32 mrst = 0, *topt; struct dst_entry *dst; - __u32 mark = 0; + struct sk_buff *buff; + struct tcphdr *t1; + struct flowi6 fl6; + u32 mark = 0; if (tsecr) tot_len += TCPOLEN_TSTAMP_ALIGNED; @@ -1041,7 +1041,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, if (!sk && !ipv6_unicast_destination(skb)) return; - net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); + net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(th, &md5_hash_location, &aoh)) return; @@ -1740,6 +1740,7 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) { + struct net *net = dev_net_rcu(skb->dev); enum skb_drop_reason drop_reason; int sdif = inet6_sdif(skb); int dif = inet6_iif(skb); @@ -1749,7 +1750,6 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) bool refcounted; int ret; u32 isn; - struct net *net = dev_net(skb->dev); drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (skb->pkt_type != PACKET_HOST) @@ -2001,7 +2001,7 @@ do_time_wait: void tcp_v6_early_demux(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); const struct ipv6hdr *hdr; const struct tcphdr *th; struct sock *sk; -- 2.51.0 From 9b49f57ccd3aa1560f215b3f8218b2783ff18391 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 1 Mar 2025 20:14:22 +0000 Subject: [PATCH 11/16] net: gro: convert four dev_net() calls tcp4_check_fraglist_gro(), tcp6_check_fraglist_gro(), udp4_gro_lookup_skb() and udp6_gro_lookup_skb() assume RCU is held so that the net structure does not disappear. Use dev_net_rcu() instead of dev_net() to get LOCKDEP support. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250301201424.2046477-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_offload.c | 2 +- net/ipv4/udp_offload.c | 2 +- net/ipv6/tcpv6_offload.c | 2 +- net/ipv6/udp_offload.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 2308665b51c5..ecef16c58c07 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -425,7 +425,7 @@ static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, inet_get_iif_sdif(skb, &iif, &sdif); iph = skb_gro_network_header(skb); - net = dev_net(skb->dev); + net = dev_net_rcu(skb->dev); sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, iph->saddr, th->source, iph->daddr, ntohs(th->dest), diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index a5be6e4ed326..c1a85b300ee8 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -630,7 +630,7 @@ static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport) { const struct iphdr *iph = skb_gro_network_header(skb); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); int iif, sdif; inet_get_iif_sdif(skb, &iif, &sdif); diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index a45bf17cb2a1..91b88daa5b55 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -35,7 +35,7 @@ static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, inet6_get_iif_sdif(skb, &iif, &sdif); hdr = skb_gro_network_header(skb); - net = dev_net(skb->dev); + net = dev_net_rcu(skb->dev); sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index b41152dd4246..404212dfc99a 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -117,7 +117,7 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport) { const struct ipv6hdr *iph = skb_gro_network_header(skb); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); int iif, sdif; inet6_get_iif_sdif(skb, &iif, &sdif); -- 2.51.0 From 5282de17621f80a1e2c6c26229975182ecade5b9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 1 Mar 2025 20:14:23 +0000 Subject: [PATCH 12/16] tcp: remove READ_ONCE(req->ts_recent) After commit 8d52da23b6c6 ("tcp: Defer ts_recent changes until req is owned"), req->ts_recent is not changed anymore. It is set once in tcp_openreq_init(), bpf_sk_assign_tcp_reqsk() or cookie_tcp_reqsk_alloc() before the req can be seen by other cpus/threads. This completes the revert of eba20811f326 ("tcp: annotate data-races around tcp_rsk(req)->ts_recent"). Signed-off-by: Eric Dumazet Cc: Wang Hai Reviewed-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250301201424.2046477-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 4 ++-- net/ipv4/tcp_output.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ae07613e4f33..d9405b012dff 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1155,7 +1155,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_rsk(req)->rcv_nxt, tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), - READ_ONCE(req->ts_recent), + req->ts_recent, 0, &key, inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, ip_hdr(skb)->tos, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index ba4a5d7f251d..3cb8f281186b 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -585,7 +585,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, if (newtp->rx_opt.tstamp_ok) { newtp->tcp_usec_ts = treq->req_usec_ts; - newtp->rx_opt.ts_recent = READ_ONCE(req->ts_recent); + newtp->rx_opt.ts_recent = req->ts_recent; newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; } else { @@ -673,7 +673,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { - tmp_opt.ts_recent = READ_ONCE(req->ts_recent); + tmp_opt.ts_recent = req->ts_recent; if (tmp_opt.rcv_tsecr) { if (inet_rsk(req)->tstamp_ok && !fastopen) tsecr_reject = !between(tmp_opt.rcv_tsecr, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0a660075add5..24e56bf96747 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -949,7 +949,7 @@ static unsigned int tcp_synack_options(const struct sock *sk, tcp_rsk(req)->snt_tsval_first = opts->tsval; } WRITE_ONCE(tcp_rsk(req)->snt_tsval_last, opts->tsval); - opts->tsecr = READ_ONCE(req->ts_recent); + opts->tsecr = req->ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } if (likely(ireq->sack_ok)) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index fe75ad8e606c..85c4820bfe15 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1279,7 +1279,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_rsk(req)->rcv_nxt, tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), - READ_ONCE(req->ts_recent), sk->sk_bound_dev_if, + req->ts_recent, sk->sk_bound_dev_if, &key, ipv6_get_dsfield(ipv6_hdr(skb)), 0, READ_ONCE(sk->sk_priority), READ_ONCE(tcp_rsk(req)->txhash)); -- 2.51.0 From 863a952eb79a6acf2b1f654f4e75ed104ff4cc81 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 1 Mar 2025 20:14:24 +0000 Subject: [PATCH 13/16] tcp: tcp_set_window_clamp() cleanup Remove one indentation level. Use max_t() and clamp() macros. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250301201424.2046477-7-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 118486692213..eb5a60c7a9cc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3693,33 +3693,33 @@ EXPORT_SYMBOL(tcp_sock_set_keepcnt); int tcp_set_window_clamp(struct sock *sk, int val) { + u32 old_window_clamp, new_window_clamp; struct tcp_sock *tp = tcp_sk(sk); if (!val) { if (sk->sk_state != TCP_CLOSE) return -EINVAL; WRITE_ONCE(tp->window_clamp, 0); - } else { - u32 new_rcv_ssthresh, old_window_clamp = tp->window_clamp; - u32 new_window_clamp = val < SOCK_MIN_RCVBUF / 2 ? - SOCK_MIN_RCVBUF / 2 : val; + return 0; + } - if (new_window_clamp == old_window_clamp) - return 0; + old_window_clamp = tp->window_clamp; + new_window_clamp = max_t(int, SOCK_MIN_RCVBUF / 2, val); - WRITE_ONCE(tp->window_clamp, new_window_clamp); - if (new_window_clamp < old_window_clamp) { - /* need to apply the reserved mem provisioning only - * when shrinking the window clamp - */ - __tcp_adjust_rcv_ssthresh(sk, tp->window_clamp); + if (new_window_clamp == old_window_clamp) + return 0; - } else { - new_rcv_ssthresh = min(tp->rcv_wnd, tp->window_clamp); - tp->rcv_ssthresh = max(new_rcv_ssthresh, - tp->rcv_ssthresh); - } - } + WRITE_ONCE(tp->window_clamp, new_window_clamp); + + /* Need to apply the reserved mem provisioning only + * when shrinking the window clamp. + */ + if (new_window_clamp < old_window_clamp) + __tcp_adjust_rcv_ssthresh(sk, new_window_clamp); + else + tp->rcv_ssthresh = clamp(new_window_clamp, + tp->rcv_ssthresh, + tp->rcv_wnd); return 0; } -- 2.51.0 From 456cc675b6d4cadd4872a477d8976ff56eef4b6f Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 28 Feb 2025 18:01:31 +0800 Subject: [PATCH 14/16] sock: add sock_kmemdup helper This patch adds the sock version of kmemdup() helper, named sock_kmemdup(), to duplicate the input "src" memory block using the socket's option memory buffer. Signed-off-by: Geliang Tang Reviewed-by: Kuniyuki Iwashima Acked-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/f828077394c7d1f3560123497348b438c875b510.1740735165.git.tanggeliang@kylinos.cn Signed-off-by: Jakub Kicinski --- include/net/sock.h | 2 ++ net/core/sock.c | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/include/net/sock.h b/include/net/sock.h index e771d99f81b0..8daf1b3b12c6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1797,6 +1797,8 @@ static inline struct sk_buff *sock_alloc_send_skb(struct sock *sk, } void *sock_kmalloc(struct sock *sk, int size, gfp_t priority); +void *sock_kmemdup(struct sock *sk, const void *src, + int size, gfp_t priority); void sock_kfree_s(struct sock *sk, void *mem, int size); void sock_kzfree_s(struct sock *sk, void *mem, int size); void sk_send_sigurg(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index 23bce41f7f1f..a0598518ce89 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2836,6 +2836,22 @@ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) } EXPORT_SYMBOL(sock_kmalloc); +/* + * Duplicate the input "src" memory block using the socket's + * option memory buffer. + */ +void *sock_kmemdup(struct sock *sk, const void *src, + int size, gfp_t priority) +{ + void *mem; + + mem = sock_kmalloc(sk, size, priority); + if (mem) + memcpy(mem, src, size); + return mem; +} +EXPORT_SYMBOL(sock_kmemdup); + /* Free an option memory block. Note, we actually want the inline * here as this allows gcc to detect the nullify and fold away the * condition entirely. -- 2.51.0 From 483cec55c1ccb9deeefb515fbeb181f736c41736 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 28 Feb 2025 18:01:32 +0800 Subject: [PATCH 15/16] net: use sock_kmemdup for ip_options Instead of using sock_kmalloc() to allocate an ip_options and then immediately duplicate another ip_options to the newly allocated one in ipv6_dup_options(), mptcp_copy_ip_options() and sctp_v4_copy_ip_options(), the newly added sock_kmemdup() helper can be used to simplify the code. Signed-off-by: Geliang Tang Reviewed-by: Kuniyuki Iwashima Acked-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/91ae749d66600ec6fb679e0e518fda6acb5c3e6f.1740735165.git.tanggeliang@kylinos.cn Signed-off-by: Jakub Kicinski --- net/ipv6/exthdrs.c | 3 +-- net/mptcp/protocol.c | 7 ++----- net/sctp/protocol.c | 7 ++----- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 6789623b2b0d..457de0745a33 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -1204,10 +1204,9 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) { struct ipv6_txoptions *opt2; - opt2 = sock_kmalloc(sk, opt->tot_len, GFP_ATOMIC); + opt2 = sock_kmemdup(sk, opt, opt->tot_len, GFP_ATOMIC); if (opt2) { long dif = (char *)opt2 - (char *)opt; - memcpy(opt2, opt, opt->tot_len); if (opt2->hopopt) *((char **)&opt2->hopopt) += dif; if (opt2->dst0opt) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 6b61b7dee33b..ec23e65ef0f1 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3178,12 +3178,9 @@ static void mptcp_copy_ip_options(struct sock *newsk, const struct sock *sk) rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { - newopt = sock_kmalloc(newsk, sizeof(*inet_opt) + + newopt = sock_kmemdup(newsk, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen, GFP_ATOMIC); - if (newopt) - memcpy(newopt, inet_opt, sizeof(*inet_opt) + - inet_opt->opt.optlen); - else + if (!newopt) net_warn_ratelimited("%s: Failed to copy ip options\n", __func__); } RCU_INIT_POINTER(newinet->inet_opt, newopt); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 29727ed1008e..5407a3922101 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -185,12 +185,9 @@ static void sctp_v4_copy_ip_options(struct sock *sk, struct sock *newsk) rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { - newopt = sock_kmalloc(newsk, sizeof(*inet_opt) + + newopt = sock_kmemdup(newsk, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen, GFP_ATOMIC); - if (newopt) - memcpy(newopt, inet_opt, sizeof(*inet_opt) + - inet_opt->opt.optlen); - else + if (!newopt) pr_err("%s: Failed to copy ip options\n", __func__); } RCU_INIT_POINTER(newinet->inet_opt, newopt); -- 2.51.0 From 52f83c0b5f857dbe24f66fc9a7f035523e9ffbc9 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 28 Feb 2025 18:01:33 +0800 Subject: [PATCH 16/16] mptcp: use sock_kmemdup for address entry Instead of using sock_kmalloc() to allocate an address entry "e" and then immediately duplicate the input "entry" to it, the newly added sock_kmemdup() helper can be used in mptcp_userspace_pm_append_new_local_addr() to simplify the code. More importantly, the code "*e = *entry;" that assigns "entry" to "e" is not easy to implemented in BPF if we use the same code to implement an append_new_local_addr() helper of a BFP path manager. This patch avoids this type of memory assignment operation. Signed-off-by: Geliang Tang Acked-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/3e5a307aed213038a87e44ff93b5793229b16279.1740735165.git.tanggeliang@kylinos.cn Signed-off-by: Jakub Kicinski --- net/mptcp/pm_userspace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 6bf6a20ef7f3..7e7d01bef5d4 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -71,13 +71,12 @@ static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, /* Memory for the entry is allocated from the * sock option buffer. */ - e = sock_kmalloc(sk, sizeof(*e), GFP_ATOMIC); + e = sock_kmemdup(sk, entry, sizeof(*entry), GFP_ATOMIC); if (!e) { ret = -ENOMEM; goto append_err; } - *e = *entry; if (!e->addr.id && needs_id) e->addr.id = find_next_zero_bit(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, -- 2.51.0