www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
net: implement lockless SO_MAX_PACING_RATE
author: Eric Dumazet <edumazet@google.com>
Thu, 21 Sep 2023 20:28:15 +0000 (20:28 +0000)
committer: David S. Miller <davem@davemloft.net>
Sun, 1 Oct 2023 18:09:54 +0000 (19:09 +0100)
SO_MAX_PACING_RATE setsockopt() does not need to hold
the socket lock, because sk->sk_pacing_rate readers
can run fine if the value is changed by other threads,
after adding READ_ONCE() accessors.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/trace/events/mptcp.h
net/core/sock.c
net/ipv4/tcp_bbr.c
net/ipv4/tcp_input.c
net/ipv4/tcp_output.c
net/sched/sch_fq.c

index 563e48617374d3f68dd86b78c13fe6bc28bf6947..09e72215b9f9bb53ec363d7690e9b87a09d172cb 100644 (file)
@@ -44,7 +44,7 @@ TRACE_EVENT(mptcp_subflow_get_send,
                ssk = mptcp_subflow_tcp_sock(subflow);
                if (ssk && sk_fullsock(ssk)) {
                        __entry->snd_wnd = tcp_sk(ssk)->snd_wnd;
-                       __entry->pace = ssk->sk_pacing_rate;
+                       __entry->pace = READ_ONCE(ssk->sk_pacing_rate);
                } else {
                        __entry->snd_wnd = 0;
                        __entry->pace = 0;
index 408081549bd777811058d5de3e9df0f459e6e999..4254ed0e4817d60cb2bf9d8e62ffcd98a90f7ec6 100644 (file)
@@ -1160,6 +1160,27 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
                WRITE_ONCE(sk->sk_busy_poll_budget, val);
                return 0;
 #endif
+       case SO_MAX_PACING_RATE:
+               {
+               unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
+               unsigned long pacing_rate;
+
+               if (sizeof(ulval) != sizeof(val) &&
+                   optlen >= sizeof(ulval) &&
+                   copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
+                       return -EFAULT;
+               }
+               if (ulval != ~0UL)
+                       cmpxchg(&sk->sk_pacing_status,
+                               SK_PACING_NONE,
+                               SK_PACING_NEEDED);
+               /* Pairs with READ_ONCE() from sk_getsockopt() */
+               WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
+               pacing_rate = READ_ONCE(sk->sk_pacing_rate);
+               if (ulval < pacing_rate)
+                       WRITE_ONCE(sk->sk_pacing_rate, ulval);
+               return 0;
+               }
        }
 
        sockopt_lock_sock(sk);
@@ -1423,25 +1444,6 @@ set_sndbuf:
                break;
 
 
-       case SO_MAX_PACING_RATE:
-               {
-               unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
-
-               if (sizeof(ulval) != sizeof(val) &&
-                   optlen >= sizeof(ulval) &&
-                   copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
-                       ret = -EFAULT;
-                       break;
-               }
-               if (ulval != ~0UL)
-                       cmpxchg(&sk->sk_pacing_status,
-                               SK_PACING_NONE,
-                               SK_PACING_NEEDED);
-               /* Pairs with READ_ONCE() from sk_getsockopt() */
-               WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
-               sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
-               break;
-               }
        case SO_INCOMING_CPU:
                reuseport_update_incoming_cpu(sk, val);
                break;
index 146792cd26fed4e61cd72a5d85263b2c7c7b2636..22358032dd484b081d30686fbd03b01fbb9c4214 100644 (file)
@@ -258,7 +258,7 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
        u64 rate = bw;
 
        rate = bbr_rate_bytes_per_sec(sk, rate, gain);
-       rate = min_t(u64, rate, sk->sk_max_pacing_rate);
+       rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate));
        return rate;
 }
 
@@ -278,7 +278,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
        }
        bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT;
        do_div(bw, rtt_us);
-       sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain);
+       WRITE_ONCE(sk->sk_pacing_rate,
+                  bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain));
 }
 
 /* Pace using current bw estimate and a gain factor. */
@@ -290,14 +291,14 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
 
        if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
                bbr_init_pacing_rate_from_rtt(sk);
-       if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate)
-               sk->sk_pacing_rate = rate;
+       if (bbr_full_bw_reached(sk) || rate > READ_ONCE(sk->sk_pacing_rate))
+               WRITE_ONCE(sk->sk_pacing_rate, rate);
 }
 
 /* override sysctl_tcp_min_tso_segs */
 __bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk)
 {
-       return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
+       return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2;
 }
 
 static u32 bbr_tso_segs_goal(struct sock *sk)
@@ -309,7 +310,7 @@ static u32 bbr_tso_segs_goal(struct sock *sk)
         * driver provided sk_gso_max_size.
         */
        bytes = min_t(unsigned long,
-                     sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
+                     READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift),
                      GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER);
        segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
 
index 584825ddd0a09a2037aea7869b137c3ac64a1534..22c2a7c2e65ee749a61b5dc74459e0c7db9f4628 100644 (file)
@@ -927,8 +927,8 @@ static void tcp_update_pacing_rate(struct sock *sk)
         * without any lock. We want to make sure compiler wont store
         * intermediate values in this location.
         */
-       WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate,
-                                            sk->sk_max_pacing_rate));
+       WRITE_ONCE(sk->sk_pacing_rate,
+                  min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)));
 }
 
 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
index 2d1e4b5ac1ca41ff3db8dc58458d4e922a2c4999..970a072894808f2dbd766f54f1c2c54650b8f0da 100644 (file)
@@ -1201,7 +1201,7 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
        struct tcp_sock *tp = tcp_sk(sk);
 
        if (sk->sk_pacing_status != SK_PACING_NONE) {
-               unsigned long rate = sk->sk_pacing_rate;
+               unsigned long rate = READ_ONCE(sk->sk_pacing_rate);
 
                /* Original sch_fq does not pace first 10 MSS
                 * Note that tp->data_segs_out overflows after 2^32 packets,
@@ -1973,7 +1973,7 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
        unsigned long bytes;
        u32 r;
 
-       bytes = sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift);
+       bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift);
 
        r = tcp_min_rtt(tcp_sk(sk)) >> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log);
        if (r < BITS_PER_TYPE(sk->sk_gso_max_size))
@@ -2553,7 +2553,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 
        limit = max_t(unsigned long,
                      2 * skb->truesize,
-                     sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift));
+                     READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift));
        if (sk->sk_pacing_status == SK_PACING_NONE)
                limit = min_t(unsigned long, limit,
                              READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
@@ -2561,7 +2561,8 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 
        if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
            tcp_sk(sk)->tcp_tx_delay) {
-               u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;
+               u64 extra_bytes = (u64)READ_ONCE(sk->sk_pacing_rate) *
+                                 tcp_sk(sk)->tcp_tx_delay;
 
                /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
                 * approximate our needs assuming an ~100% skb->truesize overhead.
index 681bbf34b70763032c68d89003307ceec8ab46b4..d35419db7b9407cedc8f48eaab783f41b366b2a2 100644 (file)
@@ -668,7 +668,7 @@ begin:
         */
        if (!skb->tstamp) {
                if (skb->sk)
-                       rate = min(skb->sk->sk_pacing_rate, rate);
+                       rate = min(READ_ONCE(skb->sk->sk_pacing_rate), rate);
 
                if (rate <= q->low_rate_threshold) {
                        f->credit = 0;