While looking at UDP receive performance, I saw sk_wake_async()
was no longer inlined.
This matters at least on AMD Zen1-4 platforms (see SRSO).
This might be because rcu_read_lock() and rcu_read_unlock()
are no longer nops in recent kernels?
Add sk_wake_async_rcu() variant, which must be called from
contexts already holding rcu lock.
As SOCK_FASYNC is deprecated nowadays, use unlikely()
to give a hint to the compiler.
sk_wake_async_rcu() is properly inlined from
__udp_enqueue_schedule_skb() and sock_def_readable().
Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20240328144032.1864988-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
 
                wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN |
                                                           EPOLLRDNORM |
                                                           EPOLLRDBAND);
-       sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+       sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
        rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(af_alg_wmem_wakeup);
                wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
                                                           EPOLLRDNORM |
                                                           EPOLLRDBAND);
-       sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+       sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
        rcu_read_unlock();
 }
 
 
        }
 }
 
+/**
+ * sk_wake_async_rcu - sk_wake_async() variant for RCU read-side callers
+ * @sk: socket whose SOCK_FASYNC flag and sk_wq pointer are consulted
+ * @how: passed through unchanged to sock_wake_async()
+ * @band: passed through unchanged to sock_wake_async()
+ *
+ * Must be called from a context that already holds the RCU read lock:
+ * sk->sk_wq is read with rcu_dereference() and no RCU lock is taken here.
+ * Does nothing unless SOCK_FASYNC is set on @sk.
+ */
+static inline void sk_wake_async_rcu(const struct sock *sk, int how, int band)
+{
+       /* SOCK_FASYNC is seldom set, so hint the compiler with unlikely(). */
+       if (unlikely(sock_flag(sk, SOCK_FASYNC)))
+               sock_wake_async(rcu_dereference(sk->sk_wq), how, band);
+}
+
 /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might
  * need sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak.
  * Note: for send buffers, TCP works better if we can build two skbs at
 
                if (skwq_has_sleeper(wq))
                        wake_up_interruptible(&wq->wait);
 
-               sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+               sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
        }
 
        rcu_read_unlock();
 
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_poll(&wq->wait, EPOLLERR);
-       sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
+       sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
        rcu_read_unlock();
 }
 
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
                                                EPOLLRDNORM | EPOLLRDBAND);
-       sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+       sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
        rcu_read_unlock();
 }
 
                                                EPOLLWRNORM | EPOLLWRBAND);
 
                /* Should agree with poll, otherwise some programs break */
-               sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+               sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
        }
 
        rcu_read_unlock();
                                                EPOLLWRNORM | EPOLLWRBAND);
 
                /* Should agree with poll, otherwise some programs break */
-               sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+               sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
        }
 }
 
 
                wake_up_interruptible(&wq->wait);
        /* Should agree with poll, otherwise some programs break */
        if (sock_writeable(sk))
-               sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+               sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
 
        rcu_read_unlock();
 }
 
                        INDIRECT_CALL_1(sk->sk_data_ready,
                                        sock_def_readable, sk);
                else
-                       sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+                       sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
        }
        busylock_release(busy);
        return 0;
 
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_all(&wq->wait);
-       sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+       sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
        rcu_read_unlock();
 }
 
 
 
                if (skwq_has_sleeper(wq))
                        wake_up_interruptible(&wq->wait);
-               sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+               sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
        }
        rcu_read_unlock();
 }
 
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN |
                                                EPOLLRDNORM | EPOLLRDBAND);
-       sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+       sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
        rcu_read_unlock();
 }
 
 
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
                                                EPOLLRDNORM | EPOLLRDBAND);
-       sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+       sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
        if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
            (sk->sk_state == SMC_CLOSED))
-               sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+               sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_HUP);
        rcu_read_unlock();
 }
 
 
                if (skwq_has_sleeper(wq))
                        wake_up_interruptible_sync_poll(&wq->wait,
                                EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
-               sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+               sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
        }
        rcu_read_unlock();
 }