]> www.infradead.org Git - users/hch/misc.git/commitdiff
tcp: accecn: AccECN option send control
authorChia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Tue, 16 Sep 2025 08:24:31 +0000 (10:24 +0200)
committerPaolo Abeni <pabeni@redhat.com>
Thu, 18 Sep 2025 06:47:52 +0000 (08:47 +0200)
Instead of sending the option in every ACK, limit sending to
those ACKs where the option is necessary:
- Handshake
- "Change-triggered ACK" + the ACK following it. The
  2nd ACK is necessary to unambiguously indicate which
  of the ECN byte counters is increasing. The first
  ACK has two counters increasing due to the ecnfield
  edge.
- ACKs with CE to allow CEP delta validations to take
  advantage of the option.
- Force option to be sent at least once per 2^22
  bytes. The check is done using the bit edges of the
  byte counters (avoids need for extra variables).
- AccECN option beacon to send a few times per RTT even if
  nothing in the ECN state requires that. The default is 3
  times per RTT, and its period can be set via
  sysctl_tcp_ecn_option_beacon.

Below are the pahole outcomes before and after this patch,
in which the group size of tcp_sock_write_tx is increased
from 89 to 97 due to the new u64 accecn_opt_tstamp member:

[BEFORE THIS PATCH]
struct tcp_sock {
    [...]
    u64                        tcp_wstamp_ns;        /*  2488     8 */
    struct list_head           tsorted_sent_queue;   /*  2496    16 */

    [...]
    __cacheline_group_end__tcp_sock_write_tx[0];     /*  2521     0 */
    __cacheline_group_begin__tcp_sock_write_txrx[0]; /*  2521     0 */
    u8                         nonagle:4;            /*  2521: 0  1 */
    u8                         rate_app_limited:1;   /*  2521: 4  1 */
    /* XXX 3 bits hole, try to pack */

    /* Force alignment to the next boundary: */
    u8                         :0;
    u8                         received_ce_pending:4;/*  2522: 0  1 */
    u8                         unused2:4;            /*  2522: 4  1 */
    u8                         accecn_minlen:2;      /*  2523: 0  1 */
    u8                         est_ecnfield:2;       /*  2523: 2  1 */
    u8                         unused3:4;            /*  2523: 4  1 */

    [...]
    __cacheline_group_end__tcp_sock_write_txrx[0];   /*  2628     0 */

    [...]
    /* size: 3200, cachelines: 50, members: 171 */
}

[AFTER THIS PATCH]
struct tcp_sock {
    [...]
    u64                        tcp_wstamp_ns;        /*  2488     8 */
    u64                        accecn_opt_tstamp;    /*  2496     8 */
    struct list_head           tsorted_sent_queue;   /*  2504    16 */

    [...]
    __cacheline_group_end__tcp_sock_write_tx[0];     /*  2529     0 */
    __cacheline_group_begin__tcp_sock_write_txrx[0]; /*  2529     0 */
    u8                         nonagle:4;            /*  2529: 0  1 */
    u8                         rate_app_limited:1;   /*  2529: 4  1 */
    /* XXX 3 bits hole, try to pack */

    /* Force alignment to the next boundary: */
    u8                         :0;
    u8                         received_ce_pending:4;/*  2530: 0  1 */
    u8                         unused2:4;            /*  2530: 4  1 */
    u8                         accecn_minlen:2;      /*  2531: 0  1 */
    u8                         est_ecnfield:2;       /*  2531: 2  1 */
    u8                         accecn_opt_demand:2;  /*  2531: 4  1 */
    u8                         prev_ecnfield:2;      /*  2531: 6  1 */

    [...]
    __cacheline_group_end__tcp_sock_write_txrx[0];   /*  2636     0 */

    [...]
    /* size: 3200, cachelines: 50, members: 173 */
}

Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Co-developed-by: Ilpo Järvinen <ij@kernel.org>
Signed-off-by: Ilpo Järvinen <ij@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250916082434.100722-8-chia-yu.chang@nokia-bell-labs.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
12 files changed:
Documentation/networking/ip-sysctl.rst
Documentation/networking/net_cachelines/tcp_sock.rst
include/linux/tcp.h
include/net/netns/ipv4.h
include/net/tcp.h
include/net/tcp_ecn.h
net/ipv4/sysctl_net_ipv4.c
net/ipv4/tcp.c
net/ipv4/tcp_input.c
net/ipv4/tcp_ipv4.c
net/ipv4/tcp_minisocks.c
net/ipv4/tcp_output.c

index 1c206501b9731a7f4fcf8a80d79ef09656fa3709..a06cb99d66dcdce763e3450352097c24aae97566 100644 (file)
@@ -487,6 +487,12 @@ tcp_ecn_option - INTEGER
 
        Default: 2
 
+tcp_ecn_option_beacon - INTEGER
+       Control Accurate ECN (AccECN) option sending frequency per RTT and it
+       takes effect only when tcp_ecn_option is set to 2.
+
+       Default: 3 (AccECN will be sent at least 3 times per RTT)
+
 tcp_ecn_fallback - BOOLEAN
        If the kernel detects that ECN connection misbehaves, enable fall
        back to non-ECN. Currently, this knob implements the fallback
index b941151f8c0aec26c11004519c42edce5bcf0751..d4dc018009451261c81a46dac2d6322005901c99 100644 (file)
@@ -109,6 +109,9 @@ u8:2                          syn_ect_snt             write_mostly        read_w
 u8:2                          syn_ect_rcv             read_mostly         read_write
 u8:2                          accecn_minlen           write_mostly        read_write
 u8:2                          est_ecnfield                                read_write
+u8:2                          accecn_opt_demand       read_mostly         read_write
+u8:2                          prev_ecnfield                               read_write
+u64                           accecn_opt_tstamp       read_write
 u8:4                          accecn_fail_mode
 u32                           lost                                        read_mostly         tcp_ack
 u32                           app_limited             read_write          read_mostly         tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx)
index 73557656cb2d0fc4f9278a8d4d57674ed2a86993..f637b659b35aed9de778617d1435ee17d3cb84a8 100644 (file)
@@ -275,6 +275,7 @@ struct tcp_sock {
        u32     mdev_us;        /* medium deviation                     */
        u32     rtt_seq;        /* sequence number to update rttvar     */
        u64     tcp_wstamp_ns;  /* departure time for next sent data packet */
+       u64     accecn_opt_tstamp;      /* Last AccECN option sent timestamp */
        struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
        struct sk_buff *highest_sack;   /* skb just after the highest
                                         * skb with SACKed bit set
@@ -296,7 +297,8 @@ struct tcp_sock {
                unused2:4;
        u8      accecn_minlen:2,/* Minimum length of AccECN option sent */
                est_ecnfield:2,/* ECN field for AccECN delivered estimates */
-               unused3:4;
+               accecn_opt_demand:2,/* Demand AccECN option for n next ACKs */
+               prev_ecnfield:2; /* ECN bits from the previous segment */
        __be32  pred_flags;
        u64     tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
        u64     tcp_mstamp;     /* most recent packet received/sent */
index acbb7dd497e1e5791d467bece039f470b2ff8236..34eb3aecb3f21cfc2a992df7741835b261dd9fdb 100644 (file)
@@ -149,6 +149,7 @@ struct netns_ipv4 {
 
        u8 sysctl_tcp_ecn;
        u8 sysctl_tcp_ecn_option;
+       u8 sysctl_tcp_ecn_option_beacon;
        u8 sysctl_tcp_ecn_fallback;
 
        u8 sysctl_ip_default_ttl;
index 6be29129465e7a7b1047f2621bb06c3520915d10..78dd7b8a414520354e303de3ce059c6faa27f01c 100644 (file)
@@ -100,6 +100,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 /* Maximal number of window scale according to RFC1323 */
 #define TCP_MAX_WSCALE         14U
 
+/* Default sending frequency of accurate ECN option per RTT */
+#define TCP_ACCECN_OPTION_BEACON       3
+
 /* urg_data states */
 #define TCP_URG_VALID  0x0100
 #define TCP_URG_NOTYET 0x0200
index 08c7f4757e4e115560128164f30ae2d5bedb2e0b..133fb6b795006eea576fecc2bf896627b37a3fb7 100644 (file)
@@ -176,6 +176,17 @@ static inline void tcp_accecn_third_ack(struct sock *sk,
        }
 }
 
+/* Demand a minimum number of upcoming ACKs to carry the AccECN option */
+static inline void tcp_accecn_opt_demand_min(struct sock *sk,
+                                            u8 opt_demand_min)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       u8 opt_demand;
+
+       opt_demand = max_t(u8, opt_demand_min, tp->accecn_opt_demand);
+       tp->accecn_opt_demand = opt_demand;
+}
+
 /* Maps IP ECN field ECT/CE code point to AccECN option field number, given
  * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0).
  */
@@ -256,6 +267,7 @@ static inline void tcp_ecn_received_counters(struct sock *sk,
        u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK;
        u8 is_ce = INET_ECN_is_ce(ecnfield);
        struct tcp_sock *tp = tcp_sk(sk);
+       bool ecn_edge;
 
        if (!INET_ECN_is_not_ect(ecnfield)) {
                u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs);
@@ -274,9 +286,34 @@ static inline void tcp_ecn_received_counters(struct sock *sk,
 
                if (len > 0) {
                        u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield);
+                       u32 oldbytes = tp->received_ecn_bytes[ecnfield - 1];
+                       u32 bytes_mask = GENMASK_U32(31, 22);
+
                        tp->received_ecn_bytes[ecnfield - 1] += len;
                        tp->accecn_minlen = max_t(u8, tp->accecn_minlen,
                                                  minlen);
+
+                       /* Send AccECN option at least once per 2^22-byte
+                        * increase in any ECN byte counter.
+                        */
+                       if ((tp->received_ecn_bytes[ecnfield - 1] ^ oldbytes) &
+                           bytes_mask) {
+                               tcp_accecn_opt_demand_min(sk, 1);
+                       }
+               }
+       }
+
+       ecn_edge = tp->prev_ecnfield != ecnfield;
+       if (ecn_edge || is_ce) {
+               tp->prev_ecnfield = ecnfield;
+               /* Demand Accurate ECN change-triggered ACKs. Two ACKs are
+                * demanded to indicate unambiguously the ecnfield value
+                * in the latter ACK.
+                */
+               if (tcp_ecn_mode_accecn(tp)) {
+                       if (ecn_edge)
+                               inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+                       tp->accecn_opt_demand = 2;
                }
        }
 }
@@ -349,6 +386,7 @@ static inline void tcp_accecn_init_counters(struct tcp_sock *tp)
        __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes);
        __tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes);
        tp->accecn_minlen = 0;
+       tp->accecn_opt_demand = 0;
        tp->est_ecnfield = 0;
 }
 
@@ -431,6 +469,7 @@ static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th,
        default:
                tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
                tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK;
+               tp->accecn_opt_demand = 2;
                if (INET_ECN_is_ce(ip_dsfield) &&
                    tcp_accecn_validate_syn_feedback(sk, ace,
                                                     tp->syn_ect_snt)) {
@@ -451,6 +490,7 @@ static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th,
                } else {
                        tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
                                          INET_ECN_MASK;
+                       tp->prev_ecnfield = tp->syn_ect_rcv;
                        tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
                }
        }
@@ -542,4 +582,16 @@ tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
                th->ece = 1;
 }
 
+static inline bool tcp_accecn_option_beacon_check(const struct sock *sk)
+{
+       u32 ecn_beacon = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon);
+       const struct tcp_sock *tp = tcp_sk(sk);
+
+       if (!ecn_beacon)
+               return false;
+
+       return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) * ecn_beacon >=
+              (tp->srtt_us >> 3);
+}
+
 #endif /* _LINUX_TCP_ECN_H */
index 4a697acb4e858a327cca89e9f5ff2b5ddabb5c84..24dbc603cc44db0cffd71915bccc737781b3a3e9 100644 (file)
@@ -740,6 +740,15 @@ static struct ctl_table ipv4_net_table[] = {
                .extra1         = SYSCTL_ZERO,
                .extra2         = SYSCTL_TWO,
        },
+       {
+               .procname       = "tcp_ecn_option_beacon",
+               .data           = &init_net.ipv4.sysctl_tcp_ecn_option_beacon,
+               .maxlen         = sizeof(u8),
+               .mode           = 0644,
+               .proc_handler   = proc_dou8vec_minmax,
+               .extra1         = SYSCTL_ZERO,
+               .extra2         = SYSCTL_THREE,
+       },
        {
                .procname       = "tcp_ecn_fallback",
                .data           = &init_net.ipv4.sysctl_tcp_ecn_fallback,
index 8c4a4b8666fc8c7866dfcbd372b6469842430be1..090f9ac43d4c68d73d7c9b313e762f548d7c6146 100644 (file)
@@ -3410,6 +3410,8 @@ int tcp_disconnect(struct sock *sk, int flags)
        tp->delivered_ce = 0;
        tp->accecn_fail_mode = 0;
        tcp_accecn_init_counters(tp);
+       tp->prev_ecnfield = 0;
+       tp->accecn_opt_tstamp = 0;
        if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release)
                icsk->icsk_ca_ops->release(sk);
        memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
@@ -5134,11 +5136,12 @@ static void __init tcp_struct_check(void)
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns);
+       CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, accecn_opt_tstamp);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack);
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags);
-       CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 89);
+       CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 97);
 
        /* TXRX read-write hotpath cache lines */
        CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags);
index e898a76c485e952e476d523bc36e82893a689704..87154fd86167d59adc252533bbd103b45160f149 100644 (file)
@@ -6121,8 +6121,10 @@ step1:
         * RFC 5961 4.2 : Send a challenge ack
         */
        if (th->syn) {
-               if (tcp_ecn_mode_accecn(tp))
+               if (tcp_ecn_mode_accecn(tp)) {
                        accecn_reflector = true;
+                       tcp_accecn_opt_demand_min(sk, 1);
+               }
                if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack &&
                    TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq &&
                    TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt &&
index aa8dbfe20924343dfebede6b34266d70ecc34975..6a63be1f64617412946fb88e5533be7a137c5cef 100644 (file)
@@ -3562,6 +3562,7 @@ static int __net_init tcp_sk_init(struct net *net)
 {
        net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
        net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL;
+       net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON;
        net->ipv4.sysctl_tcp_ecn_fallback = 1;
 
        net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
index 1dbcc09ff7a90b81af451eceb78960cdad7df0f1..1933434945587dbe043863462e8d977ae0b77cfb 100644 (file)
@@ -463,6 +463,8 @@ static void tcp_ecn_openreq_child(struct sock *sk,
                tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
                tp->syn_ect_snt = treq->syn_ect_snt;
                tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt);
+               tp->prev_ecnfield = treq->syn_ect_rcv;
+               tp->accecn_opt_demand = 1;
                tcp_ecn_received_counters_payload(sk, skb);
        } else {
                tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ?
index 34e5c83bbacef87c8ea23494b1dc5a1d7445c5b5..f897c2594954c096d76ef41ca129b8e2f038ad3f 100644 (file)
@@ -705,8 +705,12 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
                        *ptr++ = htonl(((e0b & 0xffffff) << 8) |
                                       TCPOPT_NOP);
                }
-               if (tp)
+               if (tp) {
                        tp->accecn_minlen = 0;
+                       tp->accecn_opt_tstamp = tp->tcp_mstamp;
+                       if (tp->accecn_opt_demand)
+                               tp->accecn_opt_demand--;
+               }
        }
 
        if (unlikely(OPTION_SACK_ADVERTISE & options)) {
@@ -1149,11 +1153,16 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
                opts->num_sack_blocks = 0;
        }
 
-       if (tcp_ecn_mode_accecn(tp) &&
-           READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option)) {
-               opts->use_synack_ecn_bytes = 0;
-               size += tcp_options_fit_accecn(opts, tp->accecn_minlen,
-                                              MAX_TCP_OPTION_SPACE - size);
+       if (tcp_ecn_mode_accecn(tp)) {
+               int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option);
+
+               if (ecn_opt &&
+                   (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand ||
+                    tcp_accecn_option_beacon_check(sk))) {
+                       opts->use_synack_ecn_bytes = 0;
+                       size += tcp_options_fit_accecn(opts, tp->accecn_minlen,
+                                                      MAX_TCP_OPTION_SPACE - size);
+               }
        }
 
        if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
@@ -2863,6 +2872,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
        sent_pkts = 0;
 
        tcp_mstamp_refresh(tp);
+
+       /* AccECN option beacon depends on mstamp, it may change mss */
+       if (tcp_ecn_mode_accecn(tp) && tcp_accecn_option_beacon_check(sk))
+               mss_now = tcp_current_mss(sk);
+
        if (!push_one) {
                /* Do MTU probing. */
                result = tcp_mtu_probe(sk);