I've recently seen several attempts to do quick failover of sctp transports
by reducing various retransmit timers and counters.  While it's possible to
implement a faster failover on multihomed sctp associations that way, it's not
particularly robust, in that it can lead to unneeded retransmits, as well as
false connection failures due to intermittent latency on a network.
Instead, let's implement the new IETF quick failover draft found here:
http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
This will let the sctp stack quickly identify transports that have had a small
number of errors, and avoid using them until their reliability can be
re-established.  I've tested this out on two virt guests connected via multiple
isolated virt networks and believe it's in compliance with the above draft and
works well.
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Vlad Yasevich <vyasevich@gmail.com>
CC: Sridhar Samudrala <sri@us.ibm.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: linux-sctp@vger.kernel.org
CC: joe@perches.com
Acked-by: Vlad Yasevich <vyasevich@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
 
        Default: 5
 
+pf_retrans - INTEGER
+       The number of retransmissions that will be attempted on a given path
+       before traffic is redirected to an alternate transport (should one
+       exist).  Note this is distinct from path_max_retrans, as a path that
+       passes the pf_retrans threshold can still be used.  It is only
+       deprioritized when a transmission path is selected by the stack.  This
+       setting is primarily used to enable fast failover mechanisms without
+       having to reduce path_max_retrans to a very low value.  See:
+       http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+       for details.  Note also that a value of pf_retrans > path_max_retrans
+       disables this feature.
+
+       Default: 0
+
 rto_initial - INTEGER
        The initial round trip timeout value in milliseconds that will be used
        in calculating round trip times.  This is the initial time interval
 
 typedef enum {
        SCTP_TRANSPORT_UP,
        SCTP_TRANSPORT_DOWN,
+       SCTP_TRANSPORT_PF,      /* move transport to SCTP_PF; ULP is not notified */
 } sctp_transport_cmd_t;
 
 /* These are the address scopes defined mainly for IPv4 addresses
 
        int max_retrans_path;
        int max_retrans_init;
 
+       /* Potentially-Failed.Max.Retrans sysctl value
+        * taken from:
+        * http://tools.ietf.org/html/draft-nishida-tsvwg-sctp-failover-05
+        */
+       int pf_retrans;
+
        /*
         * Policy for preforming sctp/socket accounting
         * 0   - do socket level accounting, all assocs share sk_sndbuf
 #define sctp_sndbuf_policy             (sctp_globals.sndbuf_policy)
 #define sctp_rcvbuf_policy             (sctp_globals.rcvbuf_policy)
 #define sctp_max_retrans_path          (sctp_globals.max_retrans_path)
+#define sctp_pf_retrans                        (sctp_globals.pf_retrans)
 #define sctp_max_retrans_init          (sctp_globals.max_retrans_init)
 #define sctp_sack_timeout              (sctp_globals.sack_timeout)
 #define sctp_hb_interval               (sctp_globals.hb_interval)
 
        /* This is the max_retrans value for the transport and will
         * be initialized from the assocs value.  This can be changed
-        * using SCTP_SET_PEER_ADDR_PARAMS socket option.
+        * using the SCTP_SET_PEER_ADDR_PARAMS socket option.
         */
        __u16 pathmaxrxt;
 
+       /* This is the potentially failed (PF) retrans threshold for the
+        * transport and will be initialized from the assocs value.  This can
+        * be changed using the SCTP_PEER_ADDR_THLDS socket option.
+        */
+       int pf_retrans;
        /* PMTU       : The current known path MTU.  */
        __u32 pathmtu;
 
         */
        int max_retrans;
 
+       /* This is the potentially failed (PF) retrans threshold for the
+        * association.  It is initialized from the pf_retrans sysctl and is
+        * copied to each peer transport when the transport is added.  It can
+        * be changed using the SCTP_PEER_ADDR_THLDS socket option.
+        */
+       int pf_retrans;
+
        /* Maximum number of times the endpoint will retransmit INIT  */
        __u16 max_init_attempts;
 
 
 #define SCTP_GET_ASSOC_NUMBER  28      /* Read only */
 #define SCTP_GET_ASSOC_ID_LIST 29      /* Read only */
 #define SCTP_AUTO_ASCONF       30
+#define SCTP_PEER_ADDR_THLDS   31
 
 /* Internal Socket Options. Some of the sctp library functions are
  * implemented using these socket options.
  */
 enum sctp_spinfo_state {
        SCTP_INACTIVE,
+       SCTP_PF,
        SCTP_ACTIVE,
        SCTP_UNCONFIRMED,
        SCTP_UNKNOWN = 0xffff  /* Value used for transport state unknown */
        int sd;
 } sctp_peeloff_arg_t;
 
+/*
+ *  Peer Address Thresholds socket option
+ *
+ *  Argument structure for the SCTP_PEER_ADDR_THLDS socket option.
+ *
+ *  spt_assoc_id   - id of the association the request refers to.
+ *  spt_address    - peer address selecting one transport; an "any"
+ *                   address (as judged by sctp_is_any()) selects the
+ *                   association as a whole instead.
+ *  spt_pathmaxrxt - path max retransmit threshold (pathmaxrxt).  On set,
+ *                   zero leaves the current value unchanged.
+ *  spt_pathpfthld - potentially failed threshold (pf_retrans).  On set,
+ *                   this value is always stored, including zero.
+ */
+struct sctp_paddrthlds {
+       sctp_assoc_t spt_assoc_id;
+       struct sockaddr_storage spt_address;
+       __u16 spt_pathmaxrxt;
+       __u16 spt_pathpfthld;
+};
 #endif /* __net_sctp_user_h__ */
 
         * socket values.
         */
        asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt;
+       asoc->pf_retrans  = sctp_pf_retrans;
+
        asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial);
        asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max);
        asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min);
        /* Set the path max_retrans.  */
        peer->pathmaxrxt = asoc->pathmaxrxt;
 
+       /* And the potentially failed (PF) retrans threshold */
+       peer->pf_retrans = asoc->pf_retrans;
+
        /* Initialize the peer's SACK delay timeout based on the
         * association configured value.
         */
        struct sctp_ulpevent *event;
        struct sockaddr_storage addr;
        int spc_state = 0;
+       bool ulp_notify = true;
 
        /* Record the transition on the transport.  */
        switch (command) {
                        spc_state = SCTP_ADDR_CONFIRMED;
                else
                        spc_state = SCTP_ADDR_AVAILABLE;
+               /* Don't inform ULP about transition from PF to
+                * active state and set cwnd to 1, see SCTP
+                * Quick failover draft section 5.1, point 5
+                */
+               if (transport->state == SCTP_PF) {
+                       ulp_notify = false;
+                       transport->cwnd = 1;
+               }
                transport->state = SCTP_ACTIVE;
                break;
 
                spc_state = SCTP_ADDR_UNREACHABLE;
                break;
 
+       case SCTP_TRANSPORT_PF:
+               transport->state = SCTP_PF;
+               ulp_notify = false;
+               break;
+
        default:
                return;
        }
        /* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the
         * user.
         */
-       memset(&addr, 0, sizeof(struct sockaddr_storage));
-       memcpy(&addr, &transport->ipaddr, transport->af_specific->sockaddr_len);
-       event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
-                               0, spc_state, error, GFP_ATOMIC);
-       if (event)
-               sctp_ulpq_tail_event(&asoc->ulpq, event);
+       if (ulp_notify) {
+               memset(&addr, 0, sizeof(struct sockaddr_storage));
+               memcpy(&addr, &transport->ipaddr,
+                      transport->af_specific->sockaddr_len);
+               event = sctp_ulpevent_make_peer_addr_change(asoc, &addr,
+                                       0, spc_state, error, GFP_ATOMIC);
+               if (event)
+                       sctp_ulpq_tail_event(&asoc->ulpq, event);
+       }
 
        /* Select new active and retran paths. */
 
                        transports) {
 
                if ((t->state == SCTP_INACTIVE) ||
-                   (t->state == SCTP_UNCONFIRMED))
+                   (t->state == SCTP_UNCONFIRMED) ||
+                   (t->state == SCTP_PF))
                        continue;
                if (!first || t->last_time_heard > first->last_time_heard) {
                        second = first;
 
                        if (!new_transport)
                                new_transport = asoc->peer.active_path;
                } else if ((new_transport->state == SCTP_INACTIVE) ||
-                          (new_transport->state == SCTP_UNCONFIRMED)) {
+                          (new_transport->state == SCTP_UNCONFIRMED) ||
+                          (new_transport->state == SCTP_PF)) {
                        /* If the chunk is Heartbeat or Heartbeat Ack,
                         * send it to chunk->transport, even if it's
                         * inactive.
                        new_transport = chunk->transport;
                        if (!new_transport ||
                            ((new_transport->state == SCTP_INACTIVE) ||
-                            (new_transport->state == SCTP_UNCONFIRMED)))
+                            (new_transport->state == SCTP_UNCONFIRMED) ||
+                            (new_transport->state == SCTP_PF)))
                                new_transport = asoc->peer.active_path;
                        if (new_transport->state == SCTP_UNCONFIRMED)
                                continue;
 
                             sctp_cmd_seq_t *commands,
                             gfp_t gfp);
 
+static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds,
+                                    struct sctp_transport *t);
 /********************************************************************
  * Helper functions
  ********************************************************************/
  * notification SHOULD be sent to the upper layer.
  *
  */
-static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
+static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
+                                        struct sctp_association *asoc,
                                         struct sctp_transport *transport,
                                         int is_hb)
 {
                        transport->error_count++;
        }
 
+       /* If the transport error count is greater than the pf_retrans
+        * threshold, and pf_retrans is less than pathmaxrxt, then mark this
+        * transport as Potentially Failed, see SCTP Quick Failover Draft,
+        * section 5.1, point 1
+        */
+       if ((transport->state != SCTP_PF) &&
+          (asoc->pf_retrans < transport->pathmaxrxt) &&
+          (transport->error_count > asoc->pf_retrans)) {
+
+               sctp_assoc_control_transport(asoc, transport,
+                                            SCTP_TRANSPORT_PF,
+                                            0);
+
+               /* Update the hb timer to resend a heartbeat every rto */
+               sctp_cmd_hb_timer_update(commands, transport);
+       }
+
        if (transport->state != SCTP_INACTIVE &&
            (transport->error_count > transport->pathmaxrxt)) {
                SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p",
                                             SCTP_HEARTBEAT_SUCCESS);
        }
 
+       if (t->state == SCTP_PF)
+               sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP,
+                                            SCTP_HEARTBEAT_SUCCESS);
+
        /* The receiver of the HEARTBEAT ACK should also perform an
         * RTT measurement for that destination transport address
         * using the time value carried in the HEARTBEAT ACK chunk.
 
                case SCTP_CMD_STRIKE:
                        /* Mark one strike against a transport.  */
-                       sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
-                                                   0);
+                       sctp_do_8_2_transport_strike(commands, asoc,
+                                                   cmd->obj.transport, 0);
                        break;
 
                case SCTP_CMD_TRANSPORT_IDLE:
 
                case SCTP_CMD_TRANSPORT_HB_SENT:
                        t = cmd->obj.transport;
-                       sctp_do_8_2_transport_strike(asoc, t, 1);
+                       sctp_do_8_2_transport_strike(commands, asoc,
+                                                    t, 1);
                        t->hb_sent = 1;
                        break;
 
 
 }
 
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to alter the partially failed threshold for one or all
+ * transports in an association.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ *
+ * optval points to a user-space struct sctp_paddrthlds.  optlen must be at
+ * least sizeof(struct sctp_paddrthlds), otherwise -EINVAL is returned;
+ * -EFAULT is returned if the structure cannot be copied in.
+ *
+ * When sctp_is_any() accepts spt_address, the association identified by
+ * spt_assoc_id is looked up and the thresholds are written to every
+ * transport on its peer transport list and to the association itself.
+ * Otherwise only the single transport found for spt_address/spt_assoc_id
+ * is updated.  -ENOENT is returned when no association (or transport) is
+ * found.
+ *
+ * A zero spt_pathmaxrxt leaves pathmaxrxt untouched.  spt_pathpfthld is
+ * stored unconditionally, so zero is a settable pf_retrans value.  The two
+ * values are not checked against each other here.
+ *
+ * Returns 0 on success.
+ */
+static int sctp_setsockopt_paddr_thresholds(struct sock *sk,
+                                           char __user *optval,
+                                           unsigned int optlen)
+{
+       struct sctp_paddrthlds val;
+       struct sctp_transport *trans;
+       struct sctp_association *asoc;
+
+       if (optlen < sizeof(struct sctp_paddrthlds))
+               return -EINVAL;
+       if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval,
+                          sizeof(struct sctp_paddrthlds)))
+               return -EFAULT;
+
+
+       if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+               asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+               if (!asoc)
+                       return -ENOENT;
+               list_for_each_entry(trans, &asoc->peer.transport_addr_list,
+                                   transports) {
+                       if (val.spt_pathmaxrxt)
+                               trans->pathmaxrxt = val.spt_pathmaxrxt;
+                       trans->pf_retrans = val.spt_pathpfthld;
+               }
+
+               if (val.spt_pathmaxrxt)
+                       asoc->pathmaxrxt = val.spt_pathmaxrxt;
+               asoc->pf_retrans = val.spt_pathpfthld;
+       } else {
+               trans = sctp_addr_id2transport(sk, &val.spt_address,
+                                              val.spt_assoc_id);
+               if (!trans)
+                       return -ENOENT;
+
+               if (val.spt_pathmaxrxt)
+                       trans->pathmaxrxt = val.spt_pathmaxrxt;
+               trans->pf_retrans = val.spt_pathpfthld;
+       }
+
+       return 0;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
        case SCTP_AUTO_ASCONF:
                retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
                break;
+       case SCTP_PEER_ADDR_THLDS:
+               retval = sctp_setsockopt_paddr_thresholds(sk, optval, optlen);
+               break;
        default:
                retval = -ENOPROTOOPT;
                break;
        return 0;
 }
 
+/*
+ * SCTP_PEER_ADDR_THLDS
+ *
+ * This option allows us to fetch the partially failed threshold (and the
+ * path max retransmit value) of either a single transport or of the
+ * association as a whole.  See Section 6.1 of:
+ * http://www.ietf.org/id/draft-nishida-tsvwg-sctp-failover-05.txt
+ *
+ * optval points to a user-space struct sctp_paddrthlds that is read first
+ * to obtain spt_assoc_id and spt_address, and is then overwritten with the
+ * result.  len must be at least sizeof(struct sctp_paddrthlds), otherwise
+ * -EINVAL is returned; a larger len is clamped to that size and the
+ * clamped value is written back through optlen.
+ *
+ * When sctp_is_any() accepts spt_address, the values stored on the
+ * association identified by spt_assoc_id are reported.  Otherwise the
+ * values of the transport found for spt_address/spt_assoc_id are reported.
+ * -ENOENT is returned when no association (or transport) is found, and
+ * -EFAULT when copying from or to user space fails.
+ *
+ * Returns 0 on success.
+ */
+static int sctp_getsockopt_paddr_thresholds(struct sock *sk,
+                                           char __user *optval,
+                                           int len,
+                                           int __user *optlen)
+{
+       struct sctp_paddrthlds val;
+       struct sctp_transport *trans;
+       struct sctp_association *asoc;
+
+       if (len < sizeof(struct sctp_paddrthlds))
+               return -EINVAL;
+       len = sizeof(struct sctp_paddrthlds);
+       if (copy_from_user(&val, (struct sctp_paddrthlds __user *)optval, len))
+               return -EFAULT;
+
+       if (sctp_is_any(sk, (const union sctp_addr *)&val.spt_address)) {
+               asoc = sctp_id2assoc(sk, val.spt_assoc_id);
+               if (!asoc)
+                       return -ENOENT;
+
+               val.spt_pathpfthld = asoc->pf_retrans;
+               val.spt_pathmaxrxt = asoc->pathmaxrxt;
+       } else {
+               trans = sctp_addr_id2transport(sk, &val.spt_address,
+                                              val.spt_assoc_id);
+               if (!trans)
+                       return -ENOENT;
+
+               val.spt_pathmaxrxt = trans->pathmaxrxt;
+               val.spt_pathpfthld = trans->pf_retrans;
+       }
+
+       if (put_user(len, optlen) || copy_to_user(optval, &val, len))
+               return -EFAULT;
+
+       return 0;
+}
+
 SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname,
                                char __user *optval, int __user *optlen)
 {
        case SCTP_AUTO_ASCONF:
                retval = sctp_getsockopt_auto_asconf(sk, len, optval, optlen);
                break;
+       case SCTP_PEER_ADDR_THLDS:
+               retval = sctp_getsockopt_paddr_thresholds(sk, optval, len, optlen);
+               break;
        default:
                retval = -ENOPROTOOPT;
                break;
 
                .extra1         = &one,
                .extra2         = &int_max
        },
+       {
+               .procname       = "pf_retrans",
+               .data           = &sctp_pf_retrans,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &zero,
+               .extra2         = &int_max
+       },
        {
                .procname       = "max_init_retransmits",
                .data           = &sctp_max_retrans_init,
 
 
        /* Initialize the default path max_retrans.  */
        peer->pathmaxrxt  = sctp_max_retrans_path;
+       peer->pf_retrans  = sctp_pf_retrans;
 
        INIT_LIST_HEAD(&peer->transmitted);
        INIT_LIST_HEAD(&peer->send_ready);
 {
        unsigned long timeout;
        timeout = t->rto + sctp_jitter(t->rto);
-       if (t->state != SCTP_UNCONFIRMED)
+       if ((t->state != SCTP_UNCONFIRMED) &&
+           (t->state != SCTP_PF))
                timeout += t->hbinterval;
        timeout += jiffies;
        return timeout;