www.infradead.org Git - users/hch/misc.git/commitdiff
tcp: use RCU lookup in __inet_hash_connect()
author Eric Dumazet <edumazet@google.com>
Sun, 2 Mar 2025 12:42:37 +0000 (12:42 +0000)
committer Jakub Kicinski <kuba@kernel.org>
Wed, 5 Mar 2025 01:46:27 +0000 (17:46 -0800)
When __inet_hash_connect() has to try many 4-tuples before
finding an available one, we see a high spinlock cost from
the many spin_lock_bh(&head->lock) performed in its loop.

This patch adds an RCU lookup to avoid the spinlock cost.

check_established() gets a new @rcu_lookup argument, for two reasons.
First, the RCU lookup must not make any changes, because head->lock
is not held at that point.
Second, the RCU lookup should not be repeated
after the spinlock has been acquired.

Tested:

Server:

ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog

Client:

ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog -c -H server

Before series:

  utime_start=0.288582
  utime_end=1.548707
  stime_start=20.637138
  stime_end=2002.489845
  num_transactions=484453
  latency_min=0.156279245
  latency_max=20.922042756
  latency_mean=1.546521274
  latency_stddev=3.936005194
  num_samples=312537
  throughput=47426.00

perf top on the client:

 49.54%  [kernel]       [k] _raw_spin_lock
 25.87%  [kernel]       [k] _raw_spin_lock_bh
  5.97%  [kernel]       [k] queued_spin_lock_slowpath
  5.67%  [kernel]       [k] __inet_hash_connect
  3.53%  [kernel]       [k] __inet6_check_established
  3.48%  [kernel]       [k] inet6_ehashfn
  0.64%  [kernel]       [k] rcu_all_qs

After this series:

  utime_start=0.271607
  utime_end=3.847111
  stime_start=18.407684
  stime_end=1997.485557
  num_transactions=1350742
  latency_min=0.014131929
  latency_max=17.895073144
  latency_mean=0.505675853  # Nice reduction of latency metrics
  latency_stddev=2.125164772
  num_samples=307884
  throughput=139866.80      # 190 % increase

perf top on client:

 56.86%  [kernel]       [k] __inet6_check_established
 17.96%  [kernel]       [k] __inet_hash_connect
 13.88%  [kernel]       [k] inet6_ehashfn
  2.52%  [kernel]       [k] rcu_all_qs
  2.01%  [kernel]       [k] __cond_resched
  0.41%  [kernel]       [k] _raw_spin_lock

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Tested-by: Jason Xing <kerneljasonxing@gmail.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://patch.msgid.link/20250302124237.3913746-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
include/net/inet_hashtables.h
net/ipv4/inet_hashtables.c
net/ipv6/inet6_hashtables.c

index 1061c4f536a6684fff8e73132dd0dcb4cdb3fbe8..f447d61d95982090aac492b31e4199534970c4fb 100644 (file)
@@ -529,7 +529,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
                        struct sock *sk, u64 port_offset,
                        int (*check_established)(struct inet_timewait_death_row *,
                                                 struct sock *, __u16,
-                                                struct inet_timewait_sock **));
+                                                struct inet_timewait_sock **,
+                                                bool rcu_lookup));
 
 int inet_hash_connect(struct inet_timewait_death_row *death_row,
                      struct sock *sk);
index b737e13f8459c53428980221355344327c4bc8dd..d1b5f45ee718410fdf3e78c113c7ebd4a1ddba40 100644 (file)
@@ -537,7 +537,8 @@ EXPORT_SYMBOL_GPL(__inet_lookup_established);
 /* called with local bh disabled */
 static int __inet_check_established(struct inet_timewait_death_row *death_row,
                                    struct sock *sk, __u16 lport,
-                                   struct inet_timewait_sock **twp)
+                                   struct inet_timewait_sock **twp,
+                                   bool rcu_lookup)
 {
        struct inet_hashinfo *hinfo = death_row->hashinfo;
        struct inet_sock *inet = inet_sk(sk);
@@ -556,17 +557,17 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
        struct sock *sk2;
        spinlock_t *lock;
 
-       rcu_read_lock();
-       sk_nulls_for_each(sk2, node, &head->chain) {
-               if (sk2->sk_hash != hash ||
-                   !inet_match(net, sk2, acookie, ports, dif, sdif))
-                       continue;
-               if (sk2->sk_state == TCP_TIME_WAIT)
-                       break;
-               rcu_read_unlock();
-               return -EADDRNOTAVAIL;
+       if (rcu_lookup) {
+               sk_nulls_for_each(sk2, node, &head->chain) {
+                       if (sk2->sk_hash != hash ||
+                           !inet_match(net, sk2, acookie, ports, dif, sdif))
+                               continue;
+                       if (sk2->sk_state == TCP_TIME_WAIT)
+                               break;
+                       return -EADDRNOTAVAIL;
+               }
+               return 0;
        }
-       rcu_read_unlock();
 
        lock = inet_ehash_lockp(hinfo, hash);
        spin_lock(lock);
@@ -1007,7 +1008,8 @@ static u32 *table_perturb;
 int __inet_hash_connect(struct inet_timewait_death_row *death_row,
                struct sock *sk, u64 port_offset,
                int (*check_established)(struct inet_timewait_death_row *,
-                       struct sock *, __u16, struct inet_timewait_sock **))
+                       struct sock *, __u16, struct inet_timewait_sock **,
+                       bool rcu_lookup))
 {
        struct inet_hashinfo *hinfo = death_row->hashinfo;
        struct inet_bind_hashbucket *head, *head2;
@@ -1025,7 +1027,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 
        if (port) {
                local_bh_disable();
-               ret = check_established(death_row, sk, port, NULL);
+               ret = check_established(death_row, sk, port, NULL, false);
                local_bh_enable();
                return ret;
        }
@@ -1061,6 +1063,21 @@ other_parity_scan:
                        continue;
                head = &hinfo->bhash[inet_bhashfn(net, port,
                                                  hinfo->bhash_size)];
+               rcu_read_lock();
+               hlist_for_each_entry_rcu(tb, &head->chain, node) {
+                       if (!inet_bind_bucket_match(tb, net, port, l3mdev))
+                               continue;
+                       if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) {
+                               rcu_read_unlock();
+                               goto next_port;
+                       }
+                       if (!check_established(death_row, sk, port, &tw, true))
+                               break;
+                       rcu_read_unlock();
+                       goto next_port;
+               }
+               rcu_read_unlock();
+
                spin_lock_bh(&head->lock);
 
                /* Does not bother with rcv_saddr checks, because
@@ -1070,12 +1087,12 @@ other_parity_scan:
                        if (inet_bind_bucket_match(tb, net, port, l3mdev)) {
                                if (tb->fastreuse >= 0 ||
                                    tb->fastreuseport >= 0)
-                                       goto next_port;
+                                       goto next_port_unlock;
                                WARN_ON(hlist_empty(&tb->bhash2));
                                if (!check_established(death_row, sk,
-                                                      port, &tw))
+                                                      port, &tw, false))
                                        goto ok;
-                               goto next_port;
+                               goto next_port_unlock;
                        }
                }
 
@@ -1089,8 +1106,9 @@ other_parity_scan:
                tb->fastreuse = -1;
                tb->fastreuseport = -1;
                goto ok;
-next_port:
+next_port_unlock:
                spin_unlock_bh(&head->lock);
+next_port:
                cond_resched();
        }
 
index 3604a5cae5d29a25d24f9513308334ff8e64b083..9be315496459fcb391123a07ac887e2f59d27360 100644 (file)
@@ -263,7 +263,8 @@ EXPORT_SYMBOL_GPL(inet6_lookup);
 
 static int __inet6_check_established(struct inet_timewait_death_row *death_row,
                                     struct sock *sk, const __u16 lport,
-                                    struct inet_timewait_sock **twp)
+                                    struct inet_timewait_sock **twp,
+                                    bool rcu_lookup)
 {
        struct inet_hashinfo *hinfo = death_row->hashinfo;
        struct inet_sock *inet = inet_sk(sk);
@@ -281,17 +282,18 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
        struct sock *sk2;
        spinlock_t *lock;
 
-       rcu_read_lock();
-       sk_nulls_for_each(sk2, node, &head->chain) {
-               if (sk2->sk_hash != hash ||
-                   !inet6_match(net, sk2, saddr, daddr, ports, dif, sdif))
-                       continue;
-               if (sk2->sk_state == TCP_TIME_WAIT)
-                       break;
-               rcu_read_unlock();
-               return -EADDRNOTAVAIL;
+       if (rcu_lookup) {
+               sk_nulls_for_each(sk2, node, &head->chain) {
+                       if (sk2->sk_hash != hash ||
+                           !inet6_match(net, sk2, saddr, daddr,
+                                        ports, dif, sdif))
+                               continue;
+                       if (sk2->sk_state == TCP_TIME_WAIT)
+                               break;
+                       return -EADDRNOTAVAIL;
+               }
+               return 0;
        }
-       rcu_read_unlock();
 
        lock = inet_ehash_lockp(hinfo, hash);
        spin_lock(lock);