tcp: make cwnd-limited checks measurement-based, and gentler

author Neal Cardwell <ncardwell@google.com>

Thu, 22 May 2014 14:41:08 +0000 (10:41 -0400)

committer David S. Miller <davem@davemloft.net>

Thu, 22 May 2014 16:04:49 +0000 (12:04 -0400)
author Neal Cardwell <ncardwell@google.com>
Thu, 22 May 2014 14:41:08 +0000 (10:41 -0400)
committer David S. Miller <davem@davemloft.net>
Thu, 22 May 2014 16:04:49 +0000 (12:04 -0400)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h

index bc35e4709e8e7175f5ea000b260d61eca44ff530..a0513210798fc9027af01dccccc5ff6c677d3d7e 100644 (file)
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -197,7 +197,8 @@ struct tcp_sock {
         u8      do_early_retrans:1,/* Enable RFC5827 early-retransmit  */
                 syn_data:1,     /* SYN includes data */
                 syn_fastopen:1, /* SYN includes Fast Open option */
-               syn_data_acked:1;/* data in SYN is acked by SYN-ACK */
+               syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
+               is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
         u32     tlp_high_seq;   /* snd_nxt at the time of TLP retransmit. */
  
  /* RTT measurement */
@@ -209,6 +210,8 @@ struct tcp_sock {
  
         u32     packets_out;    /* Packets which are "in flight"        */
         u32     retrans_out;    /* Retransmitted packets out            */
+       u32     max_packets_out;  /* max packets_out in last window */
+       u32     max_packets_seq;  /* right edge of max_packets_out flight */
  
         u16     urg_data;       /* Saved octet of OOB data and control flags */
         u8      ecn_flags;      /* ECN status bits.                     */
@@ -230,7 +233,6 @@ struct tcp_sock {
         u32     snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
         u32     snd_cwnd_used;
         u32     snd_cwnd_stamp;
-       u32     lsnd_pending;   /* packets inflight or unsent since last xmit */
         u32     prior_cwnd;     /* Congestion window at start of Recovery. */
         u32     prr_delivered;  /* Number of newly delivered packets to
                                  * receiver in Recovery. */
diff --git a/include/net/tcp.h b/include/net/tcp.h

index f5d6ca4a9d289af0195fc7c50b3b20579cf7a3c5..e80abe4486cbd252eb556997978fc804d389db7e 100644 (file)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -971,8 +971,9 @@ static inline u32 tcp_wnd_end(const struct tcp_sock *tp)
  
  /* We follow the spirit of RFC2861 to validate cwnd but implement a more
   * flexible approach. The RFC suggests cwnd should not be raised unless
- * it was fully used previously. But we allow cwnd to grow as long as the
- * application has used half the cwnd.
+ * it was fully used previously. And that's exactly what we do in
+ * congestion avoidance mode. But in slow start we allow cwnd to grow
+ * as long as the application has used half the cwnd.
   * Example :
   *    cwnd is 10 (IW10), but application sends 9 frames.
   *    We allow cwnd to reach 18 when all frames are ACKed.
@@ -985,7 +986,11 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk)
  {
         const struct tcp_sock *tp = tcp_sk(sk);
  
-       return tp->snd_cwnd < 2 * tp->lsnd_pending;
+       /* If in slow start, ensure cwnd grows to twice what was ACKed. */
+       if (tp->snd_cwnd <= tp->snd_ssthresh)
+               return tp->snd_cwnd < 2 * tp->max_packets_out;
+
+       return tp->is_cwnd_limited;
  }
  
  static inline void tcp_check_probe_timer(struct sock *sk)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c

index 3d61c52bdf79dad38099ca1f087e1f08d2b86c81..d463c35db33d8a8873bef9ebf51a9a0ab112fd51 100644 (file)
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1402,11 +1402,19 @@ static void tcp_cwnd_application_limited(struct sock *sk)
         tp->snd_cwnd_stamp = tcp_time_stamp;
  }
  
-static void tcp_cwnd_validate(struct sock *sk, u32 unsent_segs)
+static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
  {
         struct tcp_sock *tp = tcp_sk(sk);
  
-       tp->lsnd_pending = tp->packets_out + unsent_segs;
+       /* Track the maximum number of outstanding packets in each
+        * window, and remember whether we were cwnd-limited then.
+        */
+       if (!before(tp->snd_una, tp->max_packets_seq) ||
+           tp->packets_out > tp->max_packets_out) {
+               tp->max_packets_out = tp->packets_out;
+               tp->max_packets_seq = tp->snd_nxt;
+               tp->is_cwnd_limited = is_cwnd_limited;
+       }
  
         if (tcp_is_cwnd_limited(sk)) {
                 /* Network is feed fully. */
@@ -1660,7 +1668,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
   *
   * This algorithm is from John Heffner.
   */
-static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
+static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
+                                bool *is_cwnd_limited)
  {
         struct tcp_sock *tp = tcp_sk(sk);
         const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1724,6 +1733,9 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
         if (!tp->tso_deferred)
                 tp->tso_deferred = 1 | (jiffies << 1);
  
+       if (cong_win < send_win && cong_win < skb->len)
+               *is_cwnd_limited = true;
+
         return true;
  
  send_now:
@@ -1881,9 +1893,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
  {
         struct tcp_sock *tp = tcp_sk(sk);
         struct sk_buff *skb;
-       unsigned int tso_segs, sent_pkts, unsent_segs = 0;
+       unsigned int tso_segs, sent_pkts;
         int cwnd_quota;
         int result;
+       bool is_cwnd_limited = false;
  
         sent_pkts = 0;
  
@@ -1908,6 +1921,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
  
                 cwnd_quota = tcp_cwnd_test(tp, skb);
                 if (!cwnd_quota) {
+                       is_cwnd_limited = true;
                         if (push_one == 2)
                                 /* Force out a loss probe pkt. */
                                 cwnd_quota = 1;
@@ -1924,8 +1938,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                                                       nonagle : TCP_NAGLE_PUSH))))
                                 break;
                 } else {
-                       if (!push_one && tcp_tso_should_defer(sk, skb))
-                               goto compute_unsent_segs;
+                       if (!push_one &&
+                           tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
+                               break;
                 }
  
                 /* TCP Small Queues :
@@ -1950,14 +1965,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
                          * there is no smp_mb__after_set_bit() yet
                          */
                         smp_mb__after_clear_bit();
-                       if (atomic_read(&sk->sk_wmem_alloc) > limit) {
-                               u32 unsent_bytes;
-
-compute_unsent_segs:
-                               unsent_bytes = tp->write_seq - tp->snd_nxt;
-                               unsent_segs = DIV_ROUND_UP(unsent_bytes, mss_now);
+                       if (atomic_read(&sk->sk_wmem_alloc) > limit)
                                 break;
-                       }
                 }
  
                 limit = mss_now;
@@ -1997,7 +2006,7 @@ repair:
                 /* Send one loss probe per tail loss episode. */
                 if (push_one != 2)
                         tcp_schedule_loss_probe(sk);
-               tcp_cwnd_validate(sk, unsent_segs);
+               tcp_cwnd_validate(sk, is_cwnd_limited);
                 return false;
         }
         return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk));
author	Neal Cardwell <ncardwell@google.com>
	Thu, 22 May 2014 14:41:08 +0000 (10:41 -0400)
committer	David S. Miller <davem@davemloft.net>
	Thu, 22 May 2014 16:04:49 +0000 (12:04 -0400)
include/linux/tcp.h		patch \| blob \| history
include/net/tcp.h		patch \| blob \| history
net/ipv4/tcp_output.c		patch \| blob \| history