net: ipv4: Consider failed nexthops in multipath routes
author    David Ahern <dsa@cumulusnetworks.com>
Thu, 7 Apr 2016 14:21:00 +0000 (07:21 -0700)
committer Brian Maly <brian.maly@oracle.com>
Mon, 21 May 2018 21:05:32 +0000 (17:05 -0400)
Multipath route lookups should consider known nexthop state and not
select a hop that is known to have failed.

Example:

                     [h2]                   [h3]   15.0.0.5
                      |                      |
                     3|                     3|
                    [SP1]                  [SP2]--+
                     1  2                   1     2
                     |  |     /-------------+     |
                     |   \   /                    |
                     |     X                      |
                     |    / \                     |
                     |   /   \---------------\    |
                     1  2                     1   2
         12.0.0.2  [TOR1] 3-----------------3 [TOR2] 12.0.0.3
                     4                         4
                      \                       /
                        \                    /
                         \                  /
                          -------|   |-----/
                                 1   2
                                [TOR3]
                                  3|
                                   |
                                  [h1]  12.0.0.1

host h1 with IP 12.0.0.1 has 2 paths to host h3 at 15.0.0.5:

    root@h1:~# ip ro ls
    ...
    12.0.0.0/24 dev swp1  proto kernel  scope link  src 12.0.0.1
    15.0.0.0/16
            nexthop via 12.0.0.2  dev swp1 weight 1
            nexthop via 12.0.0.3  dev swp1 weight 1
    ...

If the link between tor3 and tor1 is down and the link between tor1
and tor2 is also down, then tor1 is effectively cut off from h1. Yet
route lookups in h1 alternate between the 2 nexthops: ping 15.0.0.5 gets
one and ssh 15.0.0.5 gets the other. Connections that attempt to use the
12.0.0.2 nexthop fail since that neighbor is not reachable:

    root@h1:~# ip neigh show
    ...
    12.0.0.3 dev swp1 lladdr 00:02:00:00:00:1b REACHABLE
    12.0.0.2 dev swp1  FAILED
    ...

The failed path can be avoided by considering known neighbor information
when selecting next hops. If the neighbor lookup fails, we have no
knowledge about the nexthop, so give it a shot. If there is an entry,
then only select the nexthop if its state is sane. This is similar to
what fib_detect_death does.

To maintain backward compatibility, use of the neighbor information is
controlled by a new sysctl, fib_multipath_use_neigh.
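
For reference, the new knob is registered under net.ipv4 (see the
sysctl_net_ipv4.c hunk below) and can be toggled at runtime. A minimal
usage sketch based on the example topology above; the addresses and the
FAILED neighbor state are taken from the earlier output:

    # requires a kernel built with CONFIG_IP_ROUTE_MULTIPATH; default is 0
    root@h1:~# sysctl -w net.ipv4.fib_multipath_use_neigh=1
    net.ipv4.fib_multipath_use_neigh = 1

    # with 12.0.0.2 still FAILED in the neighbor table, new lookups
    # should now resolve to the healthy 12.0.0.3 nexthop
    root@h1:~# ping -c 1 15.0.0.5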

Orabug: 27547114

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Reviewed-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit a6db4494d218c2e559173661ee972e048dc04fdd)

Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
Signed-off-by: Venkat Venkatsubra <venkat.x.venkatsubra@oracle.com>
Conflicts:
net/ipv4/fib_semantics.c
net/ipv4/sysctl_net_ipv4.c

Signed-off-by: Brian Maly <brian.maly@oracle.com>
Documentation/networking/ip-sysctl.txt
include/net/route.h
net/ipv4/fib_semantics.c
net/ipv4/route.c
net/ipv4/sysctl_net_ipv4.c

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 43227b2bb2e65f2656a78784572602110666aef1..8686ccd8563fd804a53083666a2da2ecd5d8ed02 100644
@@ -63,6 +63,16 @@ fwmark_reflect - BOOLEAN
        fwmark of the packet they are replying to.
        Default: 0
 
+fib_multipath_use_neigh - BOOLEAN
+       Use status of existing neighbor entry when determining nexthop for
+       multipath routes. If disabled, neighbor information is not used and
+       packets could be directed to a failed nexthop. Only valid for kernels
+       built with CONFIG_IP_ROUTE_MULTIPATH enabled.
+       Default: 0 (disabled)
+       Possible values:
+       0 - disabled
+       1 - enabled
+
 route/max_size - INTEGER
        Maximum number of routes allowed in the kernel.  Increase
        this when using large numbers of interfaces and/or routes.
diff --git a/include/net/route.h b/include/net/route.h
index fe22d03afb6a218b6b2dfa7d5329632b8d4936ae..90a956bde3eee008c90fd9bb84613a6bd103daf7 100644
 #define RT_CONN_FLAGS(sk)   (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE))
 #define RT_CONN_FLAGS_TOS(sk,tos)   (RT_TOS(tos) | sock_flag(sk, SOCK_LOCALROUTE))
 
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+extern int sysctl_fib_multipath_use_neigh;
+#endif
+
 struct fib_nh;
 struct fib_info;
 struct uncached_list;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index a867a4f829a91ee4a110ba0ec46f46ed64305bae..b3b10a64b0df0f7cb708a817e3c559a96370a933 100644
@@ -1316,19 +1316,43 @@ int fib_sync_up(struct net_device *dev)
        return ret;
 }
 
+static bool fib_good_nh(const struct fib_nh *nh)
+{
+       int state = NUD_REACHABLE;
+
+       if (nh->nh_scope == RT_SCOPE_LINK) {
+               struct neighbour *n;
+
+               rcu_read_lock_bh();
+
+               n = __ipv4_neigh_lookup_noref(nh->nh_dev, nh->nh_gw);
+               if (n)
+                       state = n->nud_state;
+
+               rcu_read_unlock_bh();
+       }
+
+       return !!(state & NUD_VALID);
+}
+
 void fib_select_multipath(struct fib_result *res, int hash)
 {
        struct fib_info *fi = res->fi;
+       bool first = false;
 
        for_nexthops(fi) {
                if (hash > atomic_read(&nh->nh_upper_bound))
                        continue;
 
-               res->nh_sel = nhsel;
-               return;
+               if (!sysctl_fib_multipath_use_neigh ||
+                   fib_good_nh(nh)) {
+                       res->nh_sel = nhsel;
+                       return;
+               }
+               if (!first) {
+                       res->nh_sel = nhsel;
+                       first = true;
+               }
        } endfor_nexthops(fi);
-
-       /* Race condition: route has just become dead. */
-       res->nh_sel = 0;
 }
 #endif
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 07f82e85fcc64f4857dc6dcb90a6358ea4070446..b0ab3d5e8e6cec48b10e220b83c5afdb2efbb030 100644
@@ -188,6 +188,12 @@ const __u8 ip_tos2prio[16] = {
 };
 EXPORT_SYMBOL(ip_tos2prio);
 
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+int sysctl_fib_multipath_use_neigh = 0;
+EXPORT_SYMBOL(sysctl_fib_multipath_use_neigh);
+
+#endif
+
 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3d2475749728a200a9f710c52ac6a526fb462085..78974c7b98f13532b7b4f8ba463001569289712f 100644
@@ -760,6 +760,17 @@ static struct ctl_table ipv4_table[] = {
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &one
        },
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+       {
+               .procname       = "fib_multipath_use_neigh",
+               .data           = &sysctl_fib_multipath_use_neigh,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
+#endif
        { }
 };