 #include <linux/sched.h>
 #include <linux/wait.h>
 #include <linux/cgroup-defs.h>
-
+#include <linux/rbtree.h>
 #include <linux/filter.h>
 #include <linux/rculist_nulls.h>
 #include <linux/poll.h>
        int                     sk_wmem_queued;
        refcount_t              sk_wmem_alloc;
        unsigned long           sk_tsq_flags;
-       struct sk_buff          *sk_send_head;
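+       /* TCP overlays the root of its retransmit rb-tree on the slot
+        * previously holding sk_send_head, so struct sock does not grow.
+        */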
+       union {
+               struct sk_buff  *sk_send_head;
+               struct rb_root  tcp_rtx_queue;
+       };
        struct sk_buff_head     sk_write_queue;
        __s32                   sk_peek_off;
        int                     sk_write_pending;
 
 void tcp_simple_retransmit(struct sock *);
 void tcp_enter_recovery(struct sock *sk, bool ece_ack);
 int tcp_trim_head(struct sock *, struct sk_buff *, u32);
-int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t);
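+/* Callers of tcp_fragment() now say on which queue the skb sits:
+ * the write queue (not yet sent) or the rtx rb-tree (sent, not acked).
+ */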
+enum tcp_queue {
+       TCP_FRAG_IN_WRITE_QUEUE,
+       TCP_FRAG_IN_RTX_QUEUE,
+};
+int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+                struct sk_buff *skb, u32 len,
+                unsigned int mss_now, gfp_t gfp);
 
 void tcp_send_probe0(struct sock *);
 void tcp_send_partial(struct sock *);
 
 void tcp_write_queue_purge(struct sock *sk);
 
+static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk)
+{
+       return skb_rb_first(&sk->tcp_rtx_queue);
+}
+
 static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk)
 {
        return skb_peek(&sk->sk_write_queue);
        return skb_queue_prev(&sk->sk_write_queue, skb);
 }
 
-#define tcp_for_write_queue(skb, sk)                                   \
-       skb_queue_walk(&(sk)->sk_write_queue, skb)
-
-#define tcp_for_write_queue_from(skb, sk)                              \
-       skb_queue_walk_from(&(sk)->sk_write_queue, skb)
-
 #define tcp_for_write_queue_from_safe(skb, tmp, sk)                    \
        skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp)
 
 static inline struct sk_buff *tcp_send_head(const struct sock *sk)
 {
-       return sk->sk_send_head;
+       return skb_peek(&sk->sk_write_queue);
 }
 
 static inline bool tcp_skb_is_last(const struct sock *sk,
        return skb_queue_is_last(&sk->sk_write_queue, skb);
 }
 
-static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *skb)
+static inline bool tcp_write_queue_empty(const struct sock *sk)
 {
-       if (tcp_skb_is_last(sk, skb))
-               sk->sk_send_head = NULL;
-       else
-               sk->sk_send_head = tcp_write_queue_next(sk, skb);
+       return skb_queue_empty(&sk->sk_write_queue);
+}
+
+static inline bool tcp_rtx_queue_empty(const struct sock *sk)
+{
+       return RB_EMPTY_ROOT(&sk->tcp_rtx_queue);
+}
+
+static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk)
+{
+       return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk);
 }
 
 static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked)
 {
-       if (sk->sk_send_head == skb_unlinked) {
-               sk->sk_send_head = NULL;
+       if (tcp_write_queue_empty(sk))
                tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
-       }
+
        if (tcp_sk(sk)->highest_sack == skb_unlinked)
                tcp_sk(sk)->highest_sack = NULL;
 }
 
-static inline void tcp_init_send_head(struct sock *sk)
-{
-       sk->sk_send_head = NULL;
-}
-
 static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
 {
        __skb_queue_tail(&sk->sk_write_queue, skb);
        __tcp_add_write_queue_tail(sk, skb);
 
        /* Queue it, remembering where we must start sending. */
-       if (sk->sk_send_head == NULL) {
-               sk->sk_send_head = skb;
+       if (sk->sk_write_queue.next == skb) {
                tcp_chrono_start(sk, TCP_CHRONO_BUSY);
 
                if (tcp_sk(sk)->highest_sack == NULL)
        __skb_queue_head(&sk->sk_write_queue, skb);
 }
 
-/* Insert buff after skb on the write queue of sk.  */
-static inline void tcp_insert_write_queue_after(struct sk_buff *skb,
-                                               struct sk_buff *buff,
-                                               struct sock *sk)
-{
-       __skb_queue_after(&sk->sk_write_queue, skb, buff);
-}
-
 /* Insert new before skb on the write queue of sk.  */
 static inline void tcp_insert_write_queue_before(struct sk_buff *new,
                                                  struct sk_buff *skb,
                                                  struct sock *sk)
 {
        __skb_queue_before(&sk->sk_write_queue, skb, new);
-
-       if (sk->sk_send_head == skb)
-               sk->sk_send_head = new;
 }
 
 static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
 {
-       list_del(&skb->tcp_tsorted_anchor);
-       tcp_skb_tsorted_anchor_cleanup(skb);
        __skb_unlink(skb, &sk->sk_write_queue);
 }
 
-static inline bool tcp_write_queue_empty(struct sock *sk)
+void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb);
+
+static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk)
 {
-       return skb_queue_empty(&sk->sk_write_queue);
+       tcp_skb_tsorted_anchor_cleanup(skb);
+       rb_erase(&skb->rbnode, &sk->tcp_rtx_queue);
+}
+
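+/* Unlink @skb from the rtx rb-tree and the tsorted list, uncharge it
+ * from the socket write-memory accounting and free it.
+ */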
+static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk)
+{
+       list_del(&skb->tcp_tsorted_anchor);
+       tcp_rtx_queue_unlink(skb, sk);
+       sk_wmem_free_skb(sk, skb);
 }
 
 static inline void tcp_push_pending_frames(struct sock *sk)
 
 static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb)
 {
-       tcp_sk(sk)->highest_sack = tcp_skb_is_last(sk, skb) ? NULL :
-                                               tcp_write_queue_next(sk, skb);
+       struct sk_buff *next = skb_rb_next(skb);
+
+       tcp_sk(sk)->highest_sack = next ?: tcp_send_head(sk);
 }
 
 static inline struct sk_buff *tcp_highest_sack(struct sock *sk)
 
 static inline void tcp_highest_sack_reset(struct sock *sk)
 {
-       tcp_sk(sk)->highest_sack = tcp_write_queue_head(sk);
+       struct sk_buff *skb = tcp_rtx_queue_head(sk);
+
+       tcp_sk(sk)->highest_sack = skb ?: tcp_send_head(sk);
 }
 
 /* Called when old skb is about to be deleted (to be combined with new skb) */
 /* At how many usecs into the future should the RTO fire? */
 static inline s64 tcp_rto_delta_us(const struct sock *sk)
 {
-       const struct sk_buff *skb = tcp_write_queue_head(sk);
+       const struct sk_buff *skb = tcp_rtx_queue_head(sk);
        u32 rto = inet_csk(sk)->icsk_rto;
        u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto);
 
 
        struct tcp_sock *tp = tcp_sk(sk);
 
        tp->out_of_order_queue = RB_ROOT;
+       sk->tcp_rtx_queue = RB_ROOT;
        tcp_init_xmit_timers(sk);
        INIT_LIST_HEAD(&tp->tsq_node);
        INIT_LIST_HEAD(&tp->tsorted_sent_queue);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
 
-       if (!tcp_send_head(sk))
-               return;
-
        skb = tcp_write_queue_tail(sk);
+       if (!skb)
+               return;
        if (!(flags & MSG_MORE) || forced_push(tp))
                tcp_mark_push(tp, skb);
 
                int copy, i;
                bool can_coalesce;
 
-               if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 ||
+               if (!skb || (copy = size_goal - skb->len) <= 0 ||
                    !tcp_skb_can_collapse_to(skb)) {
 new_segment:
                        if (!sk_stream_memory_free(sk))
                                goto wait_for_sndbuf;
 
                        skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
-                                                 skb_queue_empty(&sk->sk_write_queue));
+                                       tcp_rtx_and_write_queues_empty(sk));
                        if (!skb)
                                goto wait_for_memory;
 
                        goto out_err;
                }
 
-               skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL;
+               skb = tcp_write_queue_tail(sk);
                uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
                if (!uarg) {
                        err = -ENOBUFS;
                int max = size_goal;
 
                skb = tcp_write_queue_tail(sk);
-               if (tcp_send_head(sk)) {
+               if (skb) {
                        if (skb->ip_summed == CHECKSUM_NONE)
                                max = mss_now;
                        copy = max - skb->len;
                                process_backlog = false;
                                goto restart;
                        }
-                       first_skb = skb_queue_empty(&sk->sk_write_queue);
+                       first_skb = tcp_rtx_and_write_queues_empty(sk);
                        skb = sk_stream_alloc_skb(sk,
                                                  select_size(sk, sg, first_skb),
                                                  sk->sk_allocation,
 
        /* XXX -- need to support SO_PEEK_OFF */
 
+       skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
+               err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
+               if (err)
+                       return err;
+               copied += skb->len;
+       }
+
        skb_queue_walk(&sk->sk_write_queue, skb) {
                err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
                if (err)
                TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
 }
 
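+/* Release every skb still sitting on the rtx rb-tree. */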
+static void tcp_rtx_queue_purge(struct sock *sk)
+{
+       struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
+
+       while (p) {
+               struct sk_buff *skb = rb_to_skb(p);
+
+               p = rb_next(p);
+               /* Since we are deleting whole queue, no need to
+                * list_del(&skb->tcp_tsorted_anchor)
+                */
+               tcp_rtx_queue_unlink(skb, sk);
+               sk_wmem_free_skb(sk, skb);
+       }
+}
+
 void tcp_write_queue_purge(struct sock *sk)
 {
        struct sk_buff *skb;
                tcp_skb_tsorted_anchor_cleanup(skb);
                sk_wmem_free_skb(sk, skb);
        }
+       tcp_rtx_queue_purge(sk);
        INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
        sk_mem_reclaim(sk);
        tcp_clear_all_retrans_hints(tcp_sk(sk));
         * issue in __tcp_select_window()
         */
        icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
-       tcp_init_send_head(sk);
        memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
        __sk_dst_reset(sk);
        dst_release(sk->sk_rx_dst);
 
        u64     last_sackt;
        struct rate_sample *rate;
        int     flag;
+       unsigned int mss_now;
 };
 
 /* Check if skb is fully within the SACK block. In presence of GSO skbs,
                if (pkt_len >= skb->len && !in_sack)
                        return 0;
 
-               err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
+               err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+                                  pkt_len, mss, GFP_ATOMIC);
                if (err < 0)
                        return err;
        }
        if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp))
                TCP_SKB_CB(prev)->tx.delivered_mstamp = 0;
 
-       tcp_unlink_write_queue(skb, sk);
-       sk_wmem_free_skb(sk, skb);
+       tcp_rtx_queue_unlink_and_free(skb, sk);
 
        NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
 
                goto fallback;
 
        /* Can only happen with delayed DSACK + discard craziness */
-       if (unlikely(skb == tcp_write_queue_head(sk)))
+       prev = skb_rb_prev(skb);
+       if (!prev)
                goto fallback;
-       prev = tcp_write_queue_prev(sk, skb);
 
        if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
                goto fallback;
        /* Hole filled allows collapsing with the next as well, this is very
         * useful when hole on every nth skb pattern happens
         */
-       if (prev == tcp_write_queue_tail(sk))
+       skb = skb_rb_next(prev);
+       if (!skb)
                goto out;
-       skb = tcp_write_queue_next(sk, prev);
 
        if (!skb_can_shift(skb) ||
-           (skb == tcp_send_head(sk)) ||
            ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
            (mss != tcp_skb_seglen(skb)))
                goto out;
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *tmp;
 
-       tcp_for_write_queue_from(skb, sk) {
+       skb_rbtree_walk_from(skb) {
                int in_sack = 0;
                bool dup_sack = dup_sack_in;
 
-               if (skb == tcp_send_head(sk))
-                       break;
-
                /* queue is in-order => we can short-circuit the walk early */
                if (!before(TCP_SKB_CB(skb)->seq, end_seq))
                        break;
        return skb;
 }
 
-/* Avoid all extra work that is being done by sacktag while walking in
- * a normal way
- */
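+/* Binary search the rtx rb-tree (ordered by TCP_SKB_CB(skb)->seq) for
+ * the skb whose [seq, end_seq) range covers @seq, and recompute
+ * fack_count for that position.
+ */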
+static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk,
+                                          struct tcp_sacktag_state *state,
+                                          u32 seq)
+{
+       struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node;
+       struct sk_buff *skb;
+       int unack_bytes;
+
+       while (*p) {
+               parent = *p;
+               skb = rb_to_skb(parent);
+               if (before(seq, TCP_SKB_CB(skb)->seq)) {
+                       p = &parent->rb_left;
+                       continue;
+               }
+               if (!before(seq, TCP_SKB_CB(skb)->end_seq)) {
+                       p = &parent->rb_right;
+                       continue;
+               }
+
+               state->fack_count = 0;
+               unack_bytes = TCP_SKB_CB(skb)->seq - tcp_sk(sk)->snd_una;
+               if (state->mss_now && unack_bytes > 0)
+                       state->fack_count = unack_bytes / state->mss_now;
+
+               return skb;
+       }
+       return NULL;
+}
+
 static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
                                        struct tcp_sacktag_state *state,
                                        u32 skip_to_seq)
 {
-       tcp_for_write_queue_from(skb, sk) {
-               if (skb == tcp_send_head(sk))
-                       break;
-
-               if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
-                       break;
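+       /* If @skb already starts past skip_to_seq there is nothing to
+        * skip; otherwise locate the right skb with a tree lookup.
+        */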
+       if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
+               return skb;
 
-               state->fack_count += tcp_skb_pcount(skb);
-       }
-       return skb;
+       return tcp_sacktag_bsearch(sk, state, skip_to_seq);
 }
 
 static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
                }
        }
 
-       skb = tcp_write_queue_head(sk);
+       state->mss_now = tcp_current_mss(sk);
        state->fack_count = 0;
+       skb = NULL;
        i = 0;
 
        if (!tp->sacked_out) {
        if (tcp_is_reno(tp))
                tcp_reset_reno_sack(tp);
 
-       skb = tcp_write_queue_head(sk);
+       skb = tcp_rtx_queue_head(sk);
        is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
        if (is_reneg) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
        }
        tcp_clear_all_retrans_hints(tp);
 
-       tcp_for_write_queue(skb, sk) {
-               if (skb == tcp_send_head(sk))
-                       break;
-
+       skb_rbtree_walk_from(skb) {
                mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
                             is_reneg);
                if (mark_lost)
                        return;
                cnt = tp->lost_cnt_hint;
        } else {
-               skb = tcp_write_queue_head(sk);
+               skb = tcp_rtx_queue_head(sk);
                cnt = 0;
        }
 
-       tcp_for_write_queue_from(skb, sk) {
-               if (skb == tcp_send_head(sk))
-                       break;
+       skb_rbtree_walk_from(skb) {
                /* TODO: do this better */
                /* this is not the most efficient way to do this... */
                tp->lost_skb_hint = skb;
                        /* If needed, chop off the prefix to mark as lost. */
                        lost = (packets - oldcnt) * mss;
                        if (lost < skb->len &&
-                           tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0)
+                           tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+                                        lost, mss, GFP_ATOMIC) < 0)
                                break;
                        cnt = packets;
                }
        if (tp->retrans_out)
                return true;
 
-       skb = tcp_write_queue_head(sk);
+       skb = tcp_rtx_queue_head(sk);
        if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
                return true;
 
        if (unmark_loss) {
                struct sk_buff *skb;
 
-               tcp_for_write_queue(skb, sk) {
-                       if (skb == tcp_send_head(sk))
-                               break;
+               skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
                        TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
                }
                tp->lost_out = 0;
        unsigned int mss = tcp_current_mss(sk);
        u32 prior_lost = tp->lost_out;
 
-       tcp_for_write_queue(skb, sk) {
-               if (skb == tcp_send_head(sk))
-                       break;
+       skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
                if (tcp_skb_seglen(skb) > mss &&
                    !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
                        if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
                         * is updated in tcp_ack()). Otherwise fall back to
                         * the conventional recovery.
                         */
-                       if (tcp_send_head(sk) &&
+                       if (!tcp_write_queue_empty(sk) &&
                            after(tcp_wnd_end(tp), tp->snd_nxt)) {
                                *rexmit = REXMIT_NEW;
                                return;
        struct tcp_sock *tp = tcp_sk(sk);
        u32 prior_sacked = tp->sacked_out;
        u32 reord = tp->packets_out;
+       struct sk_buff *skb, *next;
        bool fully_acked = true;
        long sack_rtt_us = -1L;
        long seq_rtt_us = -1L;
        long ca_rtt_us = -1L;
-       struct sk_buff *skb;
        u32 pkts_acked = 0;
        u32 last_in_flight = 0;
        bool rtt_update;
 
        first_ackt = 0;
 
-       while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
+       for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
                struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
                u8 sacked = scb->sacked;
                u32 acked_pcount;
                                break;
                        fully_acked = false;
                } else {
-                       /* Speedup tcp_unlink_write_queue() and next loop */
-                       prefetchw(skb->next);
                        acked_pcount = tcp_skb_pcount(skb);
                }
 
                if (!fully_acked)
                        break;
 
-               tcp_unlink_write_queue(skb, sk);
-               sk_wmem_free_skb(sk, skb);
+               next = skb_rb_next(skb);
                if (unlikely(skb == tp->retransmit_skb_hint))
                        tp->retransmit_skb_hint = NULL;
                if (unlikely(skb == tp->lost_skb_hint))
                        tp->lost_skb_hint = NULL;
+               tcp_rtx_queue_unlink_and_free(skb, sk);
        }
 
        if (!skb)
 
 static void tcp_ack_probe(struct sock *sk)
 {
-       const struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
+       struct sk_buff *head = tcp_send_head(sk);
+       const struct tcp_sock *tp = tcp_sk(sk);
 
        /* Was it a usable window open? */
-
-       if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) {
+       if (!head)
+               return;
+       if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
                icsk->icsk_backoff = 0;
                inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
                /* Socket must be waked up by subsequent tcp_data_snd_check().
                        tp->pred_flags = 0;
                        tcp_fast_path_check(sk);
 
-                       if (tcp_send_head(sk))
+                       if (!tcp_write_queue_empty(sk))
                                tcp_slow_start_after_idle_check(sk);
 
                        if (nwin > tp->max_window) {
        sack_state.first_sackt = 0;
        sack_state.rate = &rs;
 
-       /* We very likely will need to access write queue head. */
-       prefetchw(sk->sk_write_queue.next);
+       /* We very likely will need to access rtx queue. */
+       prefetch(sk->tcp_rtx_queue.rb_node);
 
        /* If the ack is older than previous acks
         * then we can probably ignore it.
         * being used to time the probes, and is probably far higher than
         * it needs to be for normal retransmission.
         */
-       if (tcp_send_head(sk))
-               tcp_ack_probe(sk);
+       tcp_ack_probe(sk);
 
        if (tp->tlp_high_seq)
                tcp_process_tlp_ack(sk, ack, flag);
 }
 
 /* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
-static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
+void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
 {
        struct rb_node **p = &root->rb_node;
        struct rb_node *parent = NULL;
                                    struct tcp_fastopen_cookie *cookie)
 {
        struct tcp_sock *tp = tcp_sk(sk);
-       struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL;
+       struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL;
        u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
        bool syn_drop = false;
 
        tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
 
        if (data) { /* Retransmit unacked data in SYN */
-               tcp_for_write_queue_from(data, sk) {
-                       if (data == tcp_send_head(sk) ||
-                           __tcp_retransmit_skb(sk, data, 1))
+               skb_rbtree_walk_from(data) {
+                       if (__tcp_retransmit_skb(sk, data, 1))
                                break;
                }
                tcp_rearm_rto(sk);
 
                                               TCP_TIMEOUT_INIT;
                icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
 
-               skb = tcp_write_queue_head(sk);
+               skb = tcp_rtx_queue_head(sk);
                BUG_ON(!skb);
 
                tcp_mstamp_refresh(tp);
 
                           int push_one, gfp_t gfp);
 
 /* Account for new data that has been sent to the network. */
-static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
+static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
 {
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        unsigned int prior_packets = tp->packets_out;
 
-       tcp_advance_send_head(sk, skb);
        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
 
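+       /* The skb has been sent: move it from the write queue to the
+        * rtx rb-tree, which holds packets waiting to be acked.
+        */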
+       __skb_unlink(skb, &sk->sk_write_queue);
+       tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
+
        tp->packets_out += tcp_skb_pcount(skb);
        if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
                tcp_rearm_rto(sk);
        TCP_SKB_CB(skb)->eor = 0;
 }
 
+/* Insert buff after skb on the write or rtx queue of sk.  */
+static void tcp_insert_write_queue_after(struct sk_buff *skb,
+                                        struct sk_buff *buff,
+                                        struct sock *sk,
+                                        enum tcp_queue tcp_queue)
+{
+       if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
+               __skb_queue_after(&sk->sk_write_queue, skb, buff);
+       else
+               tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
+}
+
 /* Function to create two new TCP segments.  Shrinks the given segment
  * to the specified size and appends a new segment with the rest of the
  * packet to the list.  This won't be called frequently, I hope.
  * Remember, these are still headerless SKBs at this point.
  */
-int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
+int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+                struct sk_buff *skb, u32 len,
                 unsigned int mss_now, gfp_t gfp)
 {
        struct tcp_sock *tp = tcp_sk(sk);
 
        /* Link BUFF into the send queue. */
        __skb_header_release(buff);
-       tcp_insert_write_queue_after(skb, buff, sk);
+       tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
        list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
 
        return 0;
                 * is caused by insufficient sender buffer:
                 * 1) just sent some data (see tcp_write_xmit)
                 * 2) not cwnd limited (this else condition)
-                * 3) no more data to send (null tcp_send_head )
+                * 3) no more data to send (tcp_write_queue_empty())
                 * 4) application is hitting buffer limit (SOCK_NOSPACE)
                 */
-               if (!tcp_send_head(sk) && sk->sk_socket &&
+               if (tcp_write_queue_empty(sk) && sk->sk_socket &&
                    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
                    (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
                        tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
  * know that all the data is in scatter-gather pages, and that the
  * packet has never been sent out before (and thus is not cloned).
  */
-static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
+static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+                       struct sk_buff *skb, unsigned int len,
                        unsigned int mss_now, gfp_t gfp)
 {
        struct sk_buff *buff;
 
        /* All of a TSO frame must be composed of paged data.  */
        if (skb->len != skb->data_len)
-               return tcp_fragment(sk, skb, len, mss_now, gfp);
+               return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp);
 
        buff = sk_stream_alloc_skb(sk, 0, gfp, true);
        if (unlikely(!buff))
 
        /* Link BUFF into the send queue. */
        __skb_header_release(buff);
-       tcp_insert_write_queue_after(skb, buff, sk);
+       tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
 
        return 0;
 }
                        goto send_now;
        }
 
-       head = tcp_write_queue_head(sk);
-
+       /* TODO : use tsorted_sent_queue ? */
+       head = tcp_rtx_queue_head(sk);
+       if (!head)
+               goto send_now;
        age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
        /* If next ACK is likely to come too late (half srtt), do not defer */
        if (age < (tp->srtt_us >> 4))
        limit <<= factor;
 
        if (refcount_read(&sk->sk_wmem_alloc) > limit) {
-               /* Always send the 1st or 2nd skb in write queue.
+               /* Always send skb if rtx queue is empty.
                 * No need to wait for TX completion to call us back,
                 * after softirq/tasklet schedule.
                 * This helps when TX completions are delayed too much.
                 */
-               if (skb == sk->sk_write_queue.next ||
-                   skb->prev == sk->sk_write_queue.next)
+               if (tcp_rtx_queue_empty(sk))
                        return false;
 
                set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
         * it's the "most interesting" or current chrono we are
         * tracking and starts busy chrono if we have pending data.
         */
-       if (tcp_write_queue_empty(sk))
+       if (tcp_rtx_and_write_queues_empty(sk))
                tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
        else if (type == tp->chrono_type)
                tcp_chrono_set(tp, TCP_CHRONO_BUSY);
                                                    nonagle);
 
                if (skb->len > limit &&
-                   unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+                   unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
+                                         skb, limit, mss_now, gfp)))
                        break;
 
                if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
                tcp_cwnd_validate(sk, is_cwnd_limited);
                return false;
        }
-       return !tp->packets_out && tcp_send_head(sk);
+       return !tp->packets_out && !tcp_write_queue_empty(sk);
 }
 
 bool tcp_schedule_loss_probe(struct sock *sk)
                return false;
 
        if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
-            tcp_send_head(sk))
+            !tcp_write_queue_empty(sk))
                return false;
 
        /* Probe timeout is 2*rtt. Add minimum RTO to account
        int mss = tcp_current_mss(sk);
 
        skb = tcp_send_head(sk);
-       if (skb) {
-               if (tcp_snd_wnd_test(tp, skb, mss)) {
-                       pcount = tp->packets_out;
-                       tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
-                       if (tp->packets_out > pcount)
-                               goto probe_sent;
-                       goto rearm_timer;
-               }
-               skb = tcp_write_queue_prev(sk, skb);
-       } else {
-               skb = tcp_write_queue_tail(sk);
+       if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
+               pcount = tp->packets_out;
+               tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
+               if (tp->packets_out > pcount)
+                       goto probe_sent;
+               goto rearm_timer;
        }
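+       /* No new data could be sent: probe with the highest-sequence
+        * skb already in the rtx rb-tree.
+        */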
+       skb = skb_rb_last(&sk->tcp_rtx_queue);
 
        /* At most one outstanding TLP retransmission. */
        if (tp->tlp_high_seq)
                goto rearm_timer;
 
        if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
-               if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
+               if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+                                         (pcount - 1) * mss, mss,
                                          GFP_ATOMIC)))
                        goto rearm_timer;
-               skb = tcp_write_queue_next(sk, skb);
+               skb = skb_rb_next(skb);
        }
 
        if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
 static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 {
        struct tcp_sock *tp = tcp_sk(sk);
-       struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
+       struct sk_buff *next_skb = skb_rb_next(skb);
        int skb_size, next_skb_size;
 
        skb_size = skb->len;
        }
        tcp_highest_sack_combine(sk, next_skb, skb);
 
-       tcp_unlink_write_queue(next_skb, sk);
-
        if (next_skb->ip_summed == CHECKSUM_PARTIAL)
                skb->ip_summed = CHECKSUM_PARTIAL;
 
 
        tcp_skb_collapse_tstamp(skb, next_skb);
 
-       sk_wmem_free_skb(sk, next_skb);
+       tcp_rtx_queue_unlink_and_free(next_skb, sk);
        return true;
 }
 
                return false;
        if (skb_cloned(skb))
                return false;
-       if (skb == tcp_send_head(sk))
-               return false;
        /* Some heuristics for collapsing over SACK'd could be invented */
        if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
                return false;
        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
                return;
 
-       tcp_for_write_queue_from_safe(skb, tmp, sk) {
+       skb_rbtree_walk_from_safe(skb, tmp) {
                if (!tcp_can_collapse(sk, skb))
                        break;
 
 
        len = cur_mss * segs;
        if (skb->len > len) {
-               if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
+               if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
+                                cur_mss, GFP_ATOMIC))
                        return -ENOMEM; /* We'll try again later. */
        } else {
                if (skb_unclone(skb, GFP_ATOMIC))
 void tcp_xmit_retransmit_queue(struct sock *sk)
 {
        const struct inet_connection_sock *icsk = inet_csk(sk);
+       struct sk_buff *skb, *rtx_head, *hole = NULL;
        struct tcp_sock *tp = tcp_sk(sk);
-       struct sk_buff *skb;
-       struct sk_buff *hole = NULL;
        u32 max_segs;
        int mib_idx;
 
        if (!tp->packets_out)
                return;
 
-       if (tp->retransmit_skb_hint) {
-               skb = tp->retransmit_skb_hint;
-       } else {
-               skb = tcp_write_queue_head(sk);
-       }
-
+       rtx_head = tcp_rtx_queue_head(sk);
+       skb = tp->retransmit_skb_hint ?: rtx_head;
        max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
-       tcp_for_write_queue_from(skb, sk) {
+       skb_rbtree_walk_from(skb) {
                __u8 sacked;
                int segs;
 
-               if (skb == tcp_send_head(sk))
-                       break;
-
                if (tcp_pacing_check(sk))
                        break;
 
                if (tcp_in_cwnd_reduction(sk))
                        tp->prr_out += tcp_skb_pcount(skb);
 
-               if (skb == tcp_write_queue_head(sk) &&
+               if (skb == rtx_head &&
                    icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
                        inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
                                                  inet_csk(sk)->icsk_rto,
         * Note: in the latter case, FIN packet will be sent after a timeout,
         * as TCP stack thinks it has already been transmitted.
         */
-       if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) {
+       if (!tskb && tcp_under_memory_pressure(sk))
+               tskb = skb_rb_last(&sk->tcp_rtx_queue);
+
+       if (tskb) {
 coalesce:
                TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
                TCP_SKB_CB(tskb)->end_seq++;
                tp->write_seq++;
-               if (!tcp_send_head(sk)) {
+               if (tcp_write_queue_empty(sk)) {
                        /* This means tskb was already sent.
                         * Pretend we included the FIN on previous transmit.
                         * We need to set tp->snd_nxt to the value it would have
 {
        struct sk_buff *skb;
 
-       skb = tcp_write_queue_head(sk);
+       skb = tcp_rtx_queue_head(sk);
        if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
-               pr_debug("%s: wrong queue state\n", __func__);
+               pr_err("%s: wrong queue state\n", __func__);
                return -EFAULT;
        }
        if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
                        if (!nskb)
                                return -ENOMEM;
                        INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
-                       tcp_unlink_write_queue(skb, sk);
+                       tcp_rtx_queue_unlink_and_free(skb, sk);
                        __skb_header_release(nskb);
-                       __tcp_add_write_queue_head(sk, nskb);
-                       sk_wmem_free_skb(sk, skb);
+                       tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
                        sk->sk_wmem_queued += nskb->truesize;
                        sk_mem_charge(sk, nskb->truesize);
                        skb = nskb;
 
        tcb->end_seq += skb->len;
        __skb_header_release(skb);
-       __tcp_add_write_queue_tail(sk, skb);
        sk->sk_wmem_queued += skb->truesize;
        sk_mem_charge(sk, skb->truesize);
        tp->write_seq = tcb->end_seq;
        TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
        if (!err) {
                tp->syn_data = (fo->copied > 0);
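+               /* syn_data went out with the SYN: track it on the rtx tree */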
+               tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
                goto done;
        }
 
-       /* data was not sent, this is our new send_head */
-       sk->sk_send_head = syn_data;
+       /* data was not sent, put it in write_queue */
+       __skb_queue_tail(&sk->sk_write_queue, syn_data);
        tp->packets_out -= tcp_skb_pcount(syn_data);
 
 fallback:
        tp->retrans_stamp = tcp_time_stamp(tp);
        tcp_connect_queue_skb(sk, buff);
        tcp_ecn_send_syn(sk, buff);
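+       /* Keep the SYN on the rtx rb-tree so the retransmit path can find it */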
+       tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
 
        /* Send off SYN; include data in Fast Open. */
        err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
                    skb->len > mss) {
                        seg_size = min(seg_size, mss);
                        TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
-                       if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
+                       if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
+                                        skb, seg_size, mss, GFP_ATOMIC))
                                return -1;
                } else if (!tcp_skb_pcount(skb))
                        tcp_set_skb_tso_segs(skb, mss);
 
        err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
 
-       if (tp->packets_out || !tcp_send_head(sk)) {
+       if (tp->packets_out || tcp_write_queue_empty(sk)) {
                /* Cancel probe timer, if it is not required. */
                icsk->icsk_probes_out = 0;
                icsk->icsk_backoff = 0;
 
                return false;
 
        start_ts = tcp_sk(sk)->retrans_stamp;
-       if (unlikely(!start_ts))
-               start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk));
+       if (unlikely(!start_ts)) {
+               struct sk_buff *head = tcp_rtx_queue_head(sk);
+
+               if (!head)
+                       return false;
+               start_ts = tcp_skb_timestamp(head);
+       }
 
        if (likely(timeout == 0)) {
                linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
 static void tcp_probe_timer(struct sock *sk)
 {
        struct inet_connection_sock *icsk = inet_csk(sk);
+       struct sk_buff *skb = tcp_send_head(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        int max_probes;
        u32 start_ts;
 
-       if (tp->packets_out || !tcp_send_head(sk)) {
+       if (tp->packets_out || !skb) {
                icsk->icsk_probes_out = 0;
                return;
        }
         * corresponding system limit. We also implement similar policy when
         * we use RTO to probe window in tcp_retransmit_timer().
         */
-       start_ts = tcp_skb_timestamp(tcp_send_head(sk));
+       start_ts = tcp_skb_timestamp(skb);
        if (!start_ts)
-               tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp;
+               skb->skb_mstamp = tp->tcp_mstamp;
        else if (icsk->icsk_user_timeout &&
                 (s32)(tcp_time_stamp(tp) - start_ts) >
                 jiffies_to_msecs(icsk->icsk_user_timeout))
        if (!tp->packets_out)
                goto out;
 
-       WARN_ON(tcp_write_queue_empty(sk));
+       WARN_ON(tcp_rtx_queue_empty(sk));
 
        tp->tlp_high_seq = 0;
 
                        goto out;
                }
                tcp_enter_loss(sk);
-               tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1);
+               tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1);
                __sk_dst_reset(sk);
                goto out_reset_timer;
        }
 
        tcp_enter_loss(sk);
 
-       if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) {
+       if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
                /* Retransmission failed because of local congestion,
                 * do not backoff.
                 */
        elapsed = keepalive_time_when(tp);
 
        /* It is alive without keepalive 8) */
-       if (tp->packets_out || tcp_send_head(sk))
+       if (tp->packets_out || !tcp_write_queue_empty(sk))
                goto resched;
 
        elapsed = keepalive_time_elapsed(tp);