u8 backup;
        u32 nonce;
        u64 thmac;
+       u32 token;
+       u8 hmac[20];
        struct mptcp_ext ext_copy;
 #endif
 };
 
                opts->sndr_key = subflow->local_key;
                *size = TCPOLEN_MPTCP_MPC_SYN;
                return true;
+       } else if (subflow->request_join) {
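+               /* ask to join an existing MPTCP connection: advertise the
+                * peer's token, our address id and a locally generated nonce
+                */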
+               pr_debug("remote_token=%u, nonce=%u", subflow->remote_token,
+                        subflow->local_nonce);
+               opts->suboptions = OPTION_MPTCP_MPJ_SYN;
+               opts->join_id = subflow->local_id;
+               opts->token = subflow->remote_token;
+               opts->nonce = subflow->local_nonce;
+               opts->backup = subflow->request_bkup;
+               *size = TCPOLEN_MPTCP_MPJ_SYN;
+               return true;
        }
        return false;
 }
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct tcp_sock *tp = tcp_sk(sk);
 
-       pr_debug("subflow=%p", subflow);
        if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) {
                subflow->mp_capable = 1;
                subflow->can_ack = 1;
                subflow->remote_key = tp->rx_opt.mptcp.sndr_key;
-       } else {
+               pr_debug("subflow=%p, remote_key=%llu", subflow,
+                        subflow->remote_key);
+       } else if (subflow->request_join && tp->rx_opt.mptcp.mp_join) {
+               subflow->mp_join = 1;
+               subflow->thmac = tp->rx_opt.mptcp.thmac;
+               subflow->remote_nonce = tp->rx_opt.mptcp.nonce;
+               pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
+                        subflow->thmac, subflow->remote_nonce);
+       } else if (subflow->request_mptcp) {
                tcp_sk(sk)->is_mptcp = 0;
        }
 }
 
+/* MP_JOIN client subflow must wait for 4th ack before sending any data:
+ * TCP can't schedule delack timer before the subflow is fully established.
+ * MPTCP uses the delack timer to do 3rd ack retransmissions
+ */
+static void schedule_3rdack_retransmission(struct sock *sk)
+{
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       struct tcp_sock *tp = tcp_sk(sk);
+       unsigned long timeout;
+
+       /* reschedule with a timeout above RTT, as we only need to detect
+        * a dropped 3rd ack
+        */
+       if (tp->srtt_us)
+               timeout = tp->srtt_us << 1;
+       else
+               timeout = TCP_TIMEOUT_INIT;
+
+       WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER);
+       icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+       icsk->icsk_ack.timeout = timeout;
+       sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
+}
+
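+/* stop the delack-based 3rd ack retransmission once the MP_JOIN subflow
+ * is fully established
+ */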
+static void clear_3rdack_retransmission(struct sock *sk)
+{
+       struct inet_connection_sock *icsk = inet_csk(sk);
+
+       sk_stop_timer(sk, &icsk->icsk_delack_timer);
+       icsk->icsk_ack.timeout = 0;
+       icsk->icsk_ack.ato = 0;
+       icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER);
+}
+
 static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
                                         unsigned int *size,
                                         unsigned int remaining,
        struct mptcp_ext *mpext;
        unsigned int data_len;
 
-       pr_debug("subflow=%p fully established=%d seq=%x:%x remaining=%d",
-                subflow, subflow->fully_established, subflow->snd_isn,
-                skb ? TCP_SKB_CB(skb)->seq : 0, remaining);
+       /* When skb is not available, we better over-estimate the emitted
+        * options len. A full DSS option (28 bytes) is longer than
+        * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so
+        * tell the caller to defer the estimate to
+        * mptcp_established_options_dss(), which will reserve enough space.
+        */
+       if (!skb)
+               return false;
 
-       if (subflow->mp_capable && !subflow->fully_established && skb &&
-           subflow->snd_isn == TCP_SKB_CB(skb)->seq) {
-               /* When skb is not available, we better over-estimate the
-                * emitted options len. A full DSS option is longer than
-                * TCPOLEN_MPTCP_MPC_ACK_DATA, so let's the caller try to fit
-                * that.
-                */
+       /* MPC/MPJ needed only on 3rd ack packet */
+       if (subflow->fully_established ||
+           subflow->snd_isn != TCP_SKB_CB(skb)->seq)
+               return false;
+
+       if (subflow->mp_capable) {
                mpext = mptcp_get_ext(skb);
                data_len = mpext ? mpext->data_len : 0;
 
                         data_len);
 
                return true;
+       } else if (subflow->mp_join) {
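+               /* the 3rd ack echoes the full hmac computed in
+                * subflow_finish_connect()
+                */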
+               opts->suboptions = OPTION_MPTCP_MPJ_ACK;
+               memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN);
+               *size = TCPOLEN_MPTCP_MPJ_ACK;
+               pr_debug("subflow=%p", subflow);
+
+               schedule_3rdack_retransmission(sk);
+               return true;
        }
        return false;
 }
                return true;
 
        subflow->pm_notified = 1;
-       if (subflow->mp_join)
+       if (subflow->mp_join) {
+               clear_3rdack_retransmission(sk);
                mptcp_pm_subflow_established(msk, subflow);
-       else
+       } else {
                mptcp_pm_fully_established(msk);
+       }
        return true;
 }
 
                                      0, opts->rm_id);
        }
 
+       if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
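+               /* MP_JOIN SYN: address id and backup flag in the option
+                * header, followed by the peer's token and our nonce
+                */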
+               *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
+                                     TCPOLEN_MPTCP_MPJ_SYN,
+                                     opts->backup, opts->join_id);
+               put_unaligned_be32(opts->token, ptr);
+               ptr += 1;
+               put_unaligned_be32(opts->nonce, ptr);
+               ptr += 1;
+       }
+
        if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) {
                *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
                                      TCPOLEN_MPTCP_MPJ_SYNACK,
                ptr += 1;
        }
 
+       if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) {
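+               /* MP_JOIN ACK: 20 bytes of hmac follow the option header */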
+               *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
+                                     TCPOLEN_MPTCP_MPJ_ACK, 0, 0);
+               memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN);
+               ptr += 5;
+       }
+
        if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
                struct mptcp_ext *mpext = &opts->ext_copy;
                u8 len = TCPOLEN_MPTCP_DSS_BASE;
 
        sk->sk_data_ready(sk);
 }
 
+static void __mptcp_flush_join_list(struct mptcp_sock *msk)
+{
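+       /* splice subflows queued by mptcp_finish_join() and
+        * __mptcp_subflow_connect() into the main conn_list
+        */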
+       if (likely(list_empty(&msk->join_list)))
+               return;
+
+       spin_lock_bh(&msk->join_list_lock);
+       list_splice_tail_init(&msk->join_list, &msk->conn_list);
+       spin_unlock_bh(&msk->join_list_lock);
+}
+
 static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
 {
        if (!msk->cached_ext)
                return ret >= 0 ? ret + copied : (copied ? copied : ret);
        }
 
+       __mptcp_flush_join_list(msk);
        ssk = mptcp_subflow_get_send(msk);
        while (!sk_stream_memory_free(sk) || !ssk) {
                ret = sk_stream_wait_memory(sk, &timeo);
 
        len = min_t(size_t, len, INT_MAX);
        target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+       __mptcp_flush_join_list(msk);
 
        while (len > (size_t)copied) {
                int bytes_read;
        struct sock *sk = &msk->sk.icsk_inet.sk;
 
        lock_sock(sk);
+       __mptcp_flush_join_list(msk);
        __mptcp_move_skbs(msk);
        release_sock(sk);
        sock_put(sk);
 {
        struct mptcp_sock *msk = mptcp_sk(sk);
 
+       spin_lock_init(&msk->join_list_lock);
+
        INIT_LIST_HEAD(&msk->conn_list);
+       INIT_LIST_HEAD(&msk->join_list);
        __set_bit(MPTCP_SEND_SPACE, &msk->flags);
        INIT_WORK(&msk->work, mptcp_worker);
 
        mptcp_token_destroy(msk->token);
        inet_sk_state_store(sk, TCP_CLOSE);
 
+       __mptcp_flush_join_list(msk);
+
        list_splice_init(&msk->conn_list, &conn_list);
 
        data_fin_tx_seq = msk->write_seq;
        struct mptcp_sock *msk = mptcp_sk(subflow->conn);
        struct sock *parent = (void *)msk;
        struct socket *parent_sock;
+       bool ret;
 
        pr_debug("msk=%p, subflow=%p", msk, subflow);
 
        if (parent_sock && !sk->sk_socket)
                mptcp_sock_graft(sk, parent_sock);
 
-       return mptcp_pm_allow_new_subflow(msk);
+       ret = mptcp_pm_allow_new_subflow(msk);
+       if (ret) {
+               /* active connections are already on conn_list */
+               spin_lock_bh(&msk->join_list_lock);
+               if (!WARN_ON_ONCE(!list_empty(&subflow->node)))
+                       list_add_tail(&subflow->node, &msk->join_list);
+               spin_unlock_bh(&msk->join_list_lock);
+       }
+       return ret;
 }
 
 bool mptcp_sk_is_subflow(const struct sock *sk)
                /* set ssk->sk_socket of accept()ed flows to mptcp socket.
                 * This is needed so NOSPACE flag can be set from tcp stack.
                 */
+               __mptcp_flush_join_list(msk);
                list_for_each_entry(subflow, &msk->conn_list, node) {
                        struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
                        sock->state = SS_CONNECTED;
        }
 
+       __mptcp_flush_join_list(msk);
        mptcp_for_each_subflow(msk, subflow) {
                struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
 
 
 #define TCPOLEN_MPTCP_PORT_LEN         2
 #define TCPOLEN_MPTCP_RM_ADDR_BASE     4
 
+/* MPTCP MP_JOIN flags */
 #define MPTCPOPT_BACKUP                BIT(0)
 #define MPTCPOPT_HMAC_LEN      20
+#define MPTCPOPT_THMAC_LEN     8
 
 /* MPTCP MP_CAPABLE flags */
 #define MPTCP_VERSION_MASK     (0x0F)
        u32             token;
        unsigned long   flags;
        bool            can_ack;
+       spinlock_t      join_list_lock;
        struct work_struct work;
        struct list_head conn_list;
+       struct list_head join_list;
        struct skb_ext  *cached_ext;    /* for the next sendmsg */
        struct socket   *subflow; /* outgoing connect/listener/!mp_capable */
        struct sock     *first;
        u32     ssn_offset;
        u32     map_data_len;
        u32     request_mptcp : 1,  /* send MP_CAPABLE */
+               request_join : 1,   /* send MP_JOIN */
+               request_bkup : 1,   /* set the MP_JOIN backup flag */
                mp_capable : 1,     /* remote is MPTCP capable */
                mp_join : 1,        /* remote is JOINing */
                fully_established : 1,      /* path validated */
        u32     remote_nonce;
        u64     thmac;
        u32     local_nonce;
+       u32     remote_token;
+       u8      hmac[MPTCPOPT_HMAC_LEN];
        u8      local_id;
        u8      remote_id;
 
 int mptcp_is_enabled(struct net *net);
 bool mptcp_subflow_data_available(struct sock *sk);
 void mptcp_subflow_init(void);
+
+/* called with sk socket lock held */
+int __mptcp_subflow_connect(struct sock *sk, int ifindex,
+                           const struct mptcp_addr_info *loc,
+                           const struct mptcp_addr_info *remote);
 int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock);
 
 static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
 
 static int subflow_rebuild_header(struct sock *sk)
 {
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
-       int err = 0;
+       int local_id, err = 0;
 
        if (subflow->request_mptcp && !subflow->token) {
                pr_debug("subflow=%p", sk);
                err = mptcp_token_new_connect(sk);
+       } else if (subflow->request_join && !subflow->local_nonce) {
+               struct mptcp_sock *msk = (struct mptcp_sock *)subflow->conn;
+
+               pr_debug("subflow=%p", sk);
+
+               do {
+                       get_random_bytes(&subflow->local_nonce, sizeof(u32));
+               } while (!subflow->local_nonce);
+
+               if (subflow->local_id)
+                       goto out;
+
+               local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)sk);
+               if (local_id < 0)
+                       return -EINVAL;
+
+               subflow->local_id = local_id;
        }
 
+out:
        if (err)
                return err;
 
 
                subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
        } else if (rx_opt.mptcp.mp_join && listener->request_mptcp) {
+               subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
                subflow_req->mp_join = 1;
                subflow_req->backup = rx_opt.mptcp.backup;
                subflow_req->remote_id = rx_opt.mptcp.join_id;
 }
 #endif
 
+/* validate the truncated hmac received in the MP_JOIN SYN/ACK */
+static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
+{
+       u8 hmac[MPTCPOPT_HMAC_LEN];
+       u64 thmac;
+
+       subflow_generate_hmac(subflow->remote_key, subflow->local_key,
+                             subflow->remote_nonce, subflow->local_nonce,
+                             hmac);
+
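+       /* the SYN/ACK carries only the first 64 bits of the peer's hmac */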
+       thmac = get_unaligned_be64(hmac);
+       pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
+                subflow, subflow->token,
+                (unsigned long long)thmac,
+                (unsigned long long)subflow->thmac);
+
+       return thmac == subflow->thmac;
+}
+
 static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
 {
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
                parent->sk_state_change(parent);
        }
 
-       if (!subflow->conn_finished) {
+       if (subflow->conn_finished || !tcp_sk(sk)->is_mptcp)
+               return;
+
+       if (subflow->mp_capable) {
                pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
                         subflow->remote_key);
                mptcp_finish_connect(sk);
                        pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
                        subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
                }
+       } else if (subflow->mp_join) {
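+               /* SYN/ACK for an MP_JOIN subflow: verify the peer's
+                * truncated hmac, then compute the full hmac for the 3rd ack
+                */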
+               pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u",
+                        subflow, subflow->thmac,
+                        subflow->remote_nonce);
+               if (!subflow_thmac_valid(subflow)) {
+                       subflow->mp_join = 0;
+                       goto do_reset;
+               }
+
+               subflow_generate_hmac(subflow->local_key, subflow->remote_key,
+                                     subflow->local_nonce,
+                                     subflow->remote_nonce,
+                                     subflow->hmac);
+
+               if (skb)
+                       subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
+
+               if (!mptcp_finish_join(sk))
+                       goto do_reset;
+
+               subflow->conn_finished = 1;
+       } else {
+do_reset:
+               tcp_send_active_reset(sk, GFP_ATOMIC);
+               tcp_done(sk);
        }
 }
 
 }
 #endif
 
+static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
+                               struct sockaddr_storage *addr)
+{
+       memset(addr, 0, sizeof(*addr));
+       addr->ss_family = info->family;
+       if (addr->ss_family == AF_INET) {
+               struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;
+
+               in_addr->sin_addr = info->addr;
+               in_addr->sin_port = info->port;
+       }
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+       else if (addr->ss_family == AF_INET6) {
+               struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;
+
+               in6_addr->sin6_addr = info->addr6;
+               in6_addr->sin6_port = info->port;
+       }
+#endif
+}
+
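+/* create an additional subflow bound to @loc and connect it to @remote,
+ * asking to join the existing MPTCP connection via MP_JOIN
+ */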
+int __mptcp_subflow_connect(struct sock *sk, int ifindex,
+                           const struct mptcp_addr_info *loc,
+                           const struct mptcp_addr_info *remote)
+{
+       struct mptcp_sock *msk = mptcp_sk(sk);
+       struct mptcp_subflow_context *subflow;
+       struct sockaddr_storage addr;
+       struct socket *sf;
+       u32 remote_token;
+       int addrlen;
+       int err;
+
+       if (sk->sk_state != TCP_ESTABLISHED)
+               return -ENOTCONN;
+
+       err = mptcp_subflow_create_socket(sk, &sf);
+       if (err)
+               return err;
+
+       subflow = mptcp_subflow_ctx(sf->sk);
+       subflow->remote_key = msk->remote_key;
+       subflow->local_key = msk->local_key;
+       subflow->token = msk->token;
+       mptcp_info2sockaddr(loc, &addr);
+
+       addrlen = sizeof(struct sockaddr_in);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+       if (loc->family == AF_INET6)
+               addrlen = sizeof(struct sockaddr_in6);
+#endif
+       sf->sk->sk_bound_dev_if = ifindex;
+       err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
+       if (err)
+               goto failed;
+
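+       /* the token advertised in the MP_JOIN SYN is derived from the
+        * remote key
+        */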
+       mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
+       pr_debug("msk=%p remote_token=%u", msk, remote_token);
+       subflow->remote_token = remote_token;
+       subflow->local_id = loc->id;
+       subflow->request_join = 1;
+       subflow->request_bkup = 1;
+       mptcp_info2sockaddr(remote, &addr);
+
+       err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
+       if (err && err != -EINPROGRESS)
+               goto failed;
+
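+       /* queue on join_list; it will be spliced into conn_list on the
+        * next __mptcp_flush_join_list()
+        */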
+       spin_lock_bh(&msk->join_list_lock);
+       list_add_tail(&subflow->node, &msk->join_list);
+       spin_unlock_bh(&msk->join_list_lock);
+
+       return err;
+
+failed:
+       sock_release(sf);
+       return err;
+}
+
 int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
 {
        struct mptcp_subflow_context *subflow;
                new_ctx->ssn_offset = subflow_req->ssn_offset;
                new_ctx->idsn = subflow_req->idsn;
        } else if (subflow_req->mp_join) {
+               new_ctx->ssn_offset = subflow_req->ssn_offset;
                new_ctx->mp_join = 1;
                new_ctx->fully_established = 1;
                new_ctx->backup = subflow_req->backup;