]> www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
rds: HAIP across HCAs
authorBang Nguyen <bang.nguyen@oracle.com>
Mon, 26 Nov 2012 16:10:22 +0000 (08:10 -0800)
committerMukesh Kacker <mukesh.kacker@oracle.com>
Wed, 8 Jul 2015 20:12:29 +0000 (13:12 -0700)
This patch extends HAIP support to failover/failback IPs across HCAs.

Signed-off-by: Bang Nguyen <bang.nguyen@oracle.com>
net/rds/ib.c
net/rds/ib.h

index 0f40a305cd0e3f4f1c8d0af2f3a956c86496f713..205ea0f2e5aa94f1322d90988d362b05b6445767 100644 (file)
@@ -54,6 +54,7 @@ unsigned int rds_ib_apm_enabled = 0;
 unsigned int rds_ib_apm_fallback = 1;
 unsigned int rds_ib_haip_enabled = 0;
 unsigned int rds_ib_haip_fallback = 1;
+unsigned int rds_ib_haip_hca_failover_enabled = 1;
 unsigned int rds_ib_apm_timeout = RDS_IB_DEFAULT_TIMEOUT;
 unsigned int rds_ib_rnr_retry_count = RDS_IB_DEFAULT_RNR_RETRY_COUNT;
 
@@ -75,7 +76,8 @@ module_param(rds_ib_apm_fallback, int, 0444);
 MODULE_PARM_DESC(rds_ib_apm_fallback, " APM failback enabled");
 module_param(rds_ib_haip_fallback, int, 0444);
 MODULE_PARM_DESC(rds_ib_haip_fallback, " HAIP failback Enabled");
-
+module_param(rds_ib_haip_hca_failover_enabled, int, 0444);
+MODULE_PARM_DESC(rds_ib_haip_hca_failover_enabled, " HAIP HCA failover Enabled");
 
 
 /*
@@ -95,6 +97,9 @@ EXPORT_SYMBOL_GPL(rds_aux_wq);
 
 struct socket  *rds_ib_inet_socket;
 
+static struct rds_ib_port *ip_config;
+static u8      ip_port_cnt = 0;
+
 void rds_ib_nodev_connect(void)
 {
        struct rds_ib_connection *ic;
@@ -309,9 +314,45 @@ static int rds_ib_laddr_check(__be32 addr)
        return ret;
 }
 
-static void rds_ib_send_gratuitous_arp(struct net_device *out_dev,
-                                       unsigned char *dev_addr,
-                                       __be32 ip_addr)
+static u8 rds_ib_port_lookup(struct rds_ib_device *rds_ibdev, u8 port_num)
+{
+       u8      i;
+
+       for (i = 1; i <= ip_port_cnt; i++) {
+               if (ip_config[i].rds_ibdev == rds_ibdev &&
+                       ip_config[i].port_num == port_num)
+                       return i;
+       }
+       return 0;
+}
+
+static u8 rds_ib_get_failover_port(u8 port)
+{
+       u8      i;
+
+       for (i = 1; i <= ip_port_cnt; i++) {
+               if (i != port &&
+                       ip_config[i].rds_ibdev == ip_config[port].rds_ibdev &&
+                       ip_config[i].port_state == RDS_IB_PORT_UP) {
+                       return i;
+               }
+       }
+
+       if (rds_ib_haip_hca_failover_enabled) {
+               for (i = 1; i <= ip_port_cnt; i++) {
+                       if (i != port &&
+                               ip_config[i].port_state == RDS_IB_PORT_UP) {
+                                       return i;
+                               }
+               }
+       }
+
+       return 0;
+}
+
+static void rds_ib_send_gratuitous_arp(struct net_device       *out_dev,
+                                       unsigned char           *dev_addr,
+                                       __be32                  ip_addr)
 {
        arp_send(ARPOP_REQUEST, ETH_P_ARP,
                ip_addr, out_dev,
@@ -319,17 +360,17 @@ static void rds_ib_send_gratuitous_arp(struct net_device *out_dev,
                dev_addr, NULL);
 }
 
-static int rds_ib_set_ip(struct net_device *out_dev,
-                       unsigned char *dev_addr,
-                       char *if_name,
-                       __be32 addr,
-                       __be32 bcast,
-                       __be32 mask)
+static int rds_ib_set_ip(struct net_device     *out_dev,
+                       unsigned char           *dev_addr,
+                       char                    *if_name,
+                       __be32                  addr,
+                       __be32                  bcast,
+                       __be32                  mask)
 {
-       struct ifreq *ir;
-       struct sockaddr_in *sin;
-       struct page *page;
-       int ret = 0;
+       struct ifreq            *ir;
+       struct sockaddr_in      *sin;
+       struct page             *page;
+       int                     ret = 0;
 
        page = alloc_page(GFP_HIGHUSER);
        if (!page) {
@@ -346,10 +387,10 @@ static int rds_ib_set_ip(struct net_device *out_dev,
 
        sin->sin_addr.s_addr = addr;
        ret = inet_ioctl(rds_ib_inet_socket, SIOCSIFADDR, (unsigned long) ir);
-       if (ret) {
+       if (ret && addr) {
                printk(KERN_ERR
-                       "RDS/IB: inet_ioctl(SIOCSIFADDR) failed (%d)\n",
-                       ret);
+                       "RDS/IB: inet_ioctl(SIOCSIFADDR) on %s failed (%d)\n",
+                       if_name, ret);
                goto out;
        }
 
@@ -361,8 +402,8 @@ static int rds_ib_set_ip(struct net_device *out_dev,
                        (unsigned long) ir);
        if (ret) {
                printk(KERN_ERR
-                       "RDS/IB: inet_ioctl(SIOCSIFBRDADDR) failed (%d)\n",
-                       ret);
+                       "RDS/IB: inet_ioctl(SIOCSIFBRDADDR) on %s failed (%d)\n",
+                       if_name, ret);
                goto out;
        }
 
@@ -371,8 +412,8 @@ static int rds_ib_set_ip(struct net_device *out_dev,
                        (unsigned long) ir);
        if (ret) {
                printk(KERN_ERR
-                       "RDS/IB: inet_ioctl(SIOCSIFBRDADDR) failed (%d)\n",
-                       ret);
+                       "RDS/IB: inet_ioctl(SIOCSIFBRDADDR) on %s failed (%d)\n",
+                       if_name, ret);
                goto out;
        }
 
@@ -385,24 +426,50 @@ out:
        return ret;
 }
 
-static int rds_ib_move_ip(struct net_device *out_dev,
-                       unsigned char *dev_addr,
-                       char *from_dev,
-                       char *to_dev,
-                       __be32 addr,
-                       __be32 bcast,
-                       __be32 mask,
-                       int failover)
+static int rds_ib_addr_exist(struct net_device *ndev,
+                               __be32          addr,
+                               char            *if_name)
 {
-       struct rds_ib_device *rds_ibdev;
-       struct ifreq *ir;
-       struct sockaddr_in *sin;
-       struct page *page;
-       char from_dev2[2*IFNAMSIZ + 1];
-       char to_dev2[2*IFNAMSIZ + 1];
-       int i, ret = 0;
-       char *from_colon, *to_colon;
-       int from_passive = 0, to_passive = 0;
+       struct in_device        *in_dev;
+       struct in_ifaddr        *ifa;
+       struct in_ifaddr        **ifap;
+       int                     found = 0;
+
+       in_dev = in_dev_get(ndev);
+       if (in_dev) {
+               for (ifap = &in_dev->ifa_list; (ifa = *ifap);
+                       ifap = &ifa->ifa_next) {
+                       if (ifa->ifa_address == addr) {
+                               found = 1;
+                               if (if_name)
+                                       strcpy(if_name, ifa->ifa_label);
+                               break;
+                       }
+               }
+       }
+       in_dev_put(in_dev);
+
+       return found;
+}
+
+static int rds_ib_move_ip(char                 *from_dev,
+                       char                    *to_dev,
+                       u8                      from_port,
+                       u8                      to_port,
+                       u8                      arp_port,
+                       __be32                  addr,
+                       __be32                  bcast,
+                       __be32                  mask,
+                       int                     failover)
+{
+       struct ifreq            *ir;
+       struct sockaddr_in      *sin;
+       struct page             *page;
+       char                    from_dev2[2*IFNAMSIZ + 1];
+       char                    to_dev2[2*IFNAMSIZ + 1];
+       int                     i, ret = 0;
+       u8                      active_port;
+       struct in_device        *in_dev;
 
        page = alloc_page(GFP_HIGHUSER);
        if (!page) {
@@ -413,31 +480,34 @@ static int rds_ib_move_ip(struct net_device *out_dev,
        ir = (struct ifreq *)kmap(page);
        memset(ir, 0, sizeof(struct ifreq));
        sin = (struct sockaddr_in *)&ir->ifr_addr;
+       sin->sin_family = AF_INET;
 
-       from_colon = strchr(from_dev, ':');
-       to_colon = strchr(to_dev, ':');
-       if (!from_colon && !to_colon) {
-               rcu_read_lock();
-               list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
-                       for (i = 1; i <= rds_ibdev->dev->phys_port_cnt; i++) {
-                               if (!strcmp(from_dev,
-                                       rds_ibdev->ports[i].if_name) &&
-                                       !rds_ibdev->ports[i].ip_addr) {
-                                       from_passive = 1;
-                               }
-
-                               if (!strcmp(to_dev,
-                                       rds_ibdev->ports[i].if_name) &&
-                                       !rds_ibdev->ports[i].ip_addr) {
-                                       to_passive = 1;
-                               }
+       /* Set the primary IP if it hasn't been set */
+       if (ip_config[to_port].ip_addr) {
+               strcpy(ir->ifr_ifrn.ifrn_name, ip_config[to_port].dev->name);
+               ret = inet_ioctl(rds_ib_inet_socket, SIOCGIFADDR,
+                                       (unsigned long) ir);
+               if (ret == -EADDRNOTAVAIL) {
+                       sin->sin_addr.s_addr = ip_config[to_port].ip_addr;
+                       ret = inet_ioctl(rds_ib_inet_socket, SIOCSIFADDR,
+                                               (unsigned long) ir);
+                       if (ret) {
+                               printk(KERN_ERR
+                                       "RDS/IB: inet_ioctl(SIOCSIFADDR) "
+                                       "failed (%d)\n", ret);
+                               goto out;
                        }
+               } else if (ret) {
+                       printk(KERN_ERR
+                               "RDS/IB: inet_ioctl(SIOCGIFADDR) "
+                               "failed (%d)\n", ret);
+                       goto out;
                }
-               rcu_read_unlock();
        }
 
        if (failover) {
-               if (to_passive) {
+               in_dev = in_dev_get(ip_config[to_port].dev);
+               if (in_dev && !in_dev->ifa_list) {
                        strcpy(to_dev2, to_dev);
                } else {
                        strcpy(to_dev2, to_dev);
@@ -445,12 +515,31 @@ static int rds_ib_move_ip(struct net_device *out_dev,
                        strcat(to_dev2, from_dev);
                        to_dev2[IFNAMSIZ-1] = 0;
                }
-               strcpy(from_dev2, from_dev);
-       } else {
-               if (from_passive) {
+               in_dev_put(in_dev);
+
+               /* Bail if IP already exists on target port */
+               if (rds_ib_addr_exist(ip_config[to_port].dev, addr, NULL))
+                       goto out;
+
+               active_port = ip_config[from_port].ip_active_port;
+               if (active_port == from_port) {
                        strcpy(from_dev2, from_dev);
+               } else if (ip_config[active_port].port_state ==
+                               RDS_IB_PORT_UP) {
+                       if (!rds_ib_addr_exist(ip_config[active_port].dev,
+                                               addr, from_dev2)) {
+                               strcpy(from_dev2,
+                                       ip_config[active_port].dev->name);
+                               strcat(from_dev2, ":");
+                               strcat(from_dev2, from_dev);
+                       }
                } else {
                        strcpy(from_dev2, from_dev);
+               }
+       } else {
+               if (!rds_ib_addr_exist(ip_config[from_port].dev,
+                                               addr, from_dev2)) {
+                       strcpy(from_dev2, from_dev);
                        strcat(from_dev2, ":");
                        strcat(from_dev2, to_dev);
                        from_dev2[IFNAMSIZ-1] = 0;
@@ -458,193 +547,162 @@ static int rds_ib_move_ip(struct net_device *out_dev,
                strcpy(to_dev2, to_dev);
        }
 
-       /* Clear IP on from Interface */
-       sin->sin_addr.s_addr = 0;
-       sin->sin_family = AF_INET;
-       strcpy(ir->ifr_ifrn.ifrn_name, from_dev2);
-       ret = inet_ioctl(rds_ib_inet_socket, SIOCSIFADDR, (unsigned long) ir);
-       if (ret) {
-               printk(KERN_ERR
-                       "RDS/IB: inet_ioctl(SIOCSIFADDR,%s) failed (%d)\n",
-                       ir->ifr_ifrn.ifrn_name, ret);
-       }
+       /* Clear the IP on old port */
+       ret = rds_ib_set_ip(NULL, NULL, from_dev2, 0, 0, 0);
 
-       ret = rds_ib_set_ip(out_dev, dev_addr, to_dev2, addr, bcast, mask);
+       /* Set the IP on new port */
+       ret = rds_ib_set_ip(ip_config[arp_port].dev,
+                               ip_config[to_port].dev->dev_addr,
+                               to_dev2, addr, bcast, mask);
 
        if (ret) {
-               if (failover)
-                       printk(KERN_NOTICE
-                               "RDS/IP: failed to move IP %u.%u.%u.%u "
-                               "from %s to %s\n",
-                               NIPQUAD(addr), from_dev2, to_dev2);
-               else
-                       printk(KERN_NOTICE
-                               "RDS/IP: failed to move IP %u.%u.%u.%u "
-                               "from %s back to %s\n",
-                               NIPQUAD(addr), from_dev2, to_dev2);
+               printk(KERN_NOTICE
+                       "RDS/IP: failed to move IP %u.%u.%u.%u "
+                       "from %s to %s\n",
+                       NIPQUAD(addr), from_dev2, to_dev2);
        } else {
-               if (failover)
-                       printk(KERN_NOTICE
-                               "RDS/IB: IP %u.%u.%u.%u migrated over to %s\n",
-                               NIPQUAD(addr), to_dev2);
-               else
-                       printk(KERN_NOTICE
-                               "RDS/IB: IP %u.%u.%u.%u migrated back to %s\n",
-                               NIPQUAD(addr), to_dev2);
+               printk(KERN_NOTICE
+                       "RDS/IB: IP %u.%u.%u.%u migrated from %s to %s\n",
+                               NIPQUAD(addr), from_dev2, to_dev2);
        }
 
+out:
        kunmap(page);
        __free_page(page);
 
        return ret;
 }
 
-static void rds_ib_init_port(struct rds_ib_device *rds_ibdev,
-                               struct net_device *net_dev,
-                               u8 port_num)
+static void rds_ib_init_port(struct rds_ib_device      *rds_ibdev,
+                               struct net_device       *net_dev,
+                               u8                      port_num)
 {
-       strcpy(rds_ibdev->ports[port_num].if_name, net_dev->name);
-       rds_ibdev->ports[port_num].dev = net_dev;
-       rds_ibdev->ports[port_num].ip_active_port = 0;
+       ip_port_cnt++;
+       ip_config[ip_port_cnt].port_num = port_num;
+       ip_config[ip_port_cnt].dev = net_dev;
+       ip_config[ip_port_cnt].rds_ibdev = rds_ibdev;
+       ip_config[ip_port_cnt].ip_active_port = 0;
+       strcpy(ip_config[ip_port_cnt].if_name, net_dev->name);
 
        if (net_dev->operstate == IF_OPER_UP)
-               rds_ibdev->ports[port_num].port_state = RDS_IB_PORT_UP;
+               ip_config[ip_port_cnt].port_state = RDS_IB_PORT_UP;
        else
-               rds_ibdev->ports[port_num].port_state = RDS_IB_PORT_DOWN;
+               ip_config[ip_port_cnt].port_state = RDS_IB_PORT_DOWN;
 }
 
-static void rds_ib_set_port(struct rds_ib_device *rds_ibdev,
-                               struct net_device *net_dev,
-                               char *if_name, u8 port_num,
-                               __be32 ip_addr,
-                               __be32 ip_bcast,
-                               __be32 ip_mask)
+static void rds_ib_set_port(struct rds_ib_device       *rds_ibdev,
+                               struct net_device       *net_dev,
+                               char                    *if_name,
+                               u8                      port_num,
+                               __be32                  ip_addr,
+                               __be32                  ip_bcast,
+                               __be32                  ip_mask)
 {
        unsigned int    idx;
+       u8      port = rds_ib_port_lookup(rds_ibdev, port_num);
 
        if (!strcmp(net_dev->name, if_name)) {
-               strcpy(rds_ibdev->ports[port_num].if_name, if_name);
-               rds_ibdev->ports[port_num].ip_addr = ip_addr;
-               rds_ibdev->ports[port_num].ip_bcast = ip_bcast;
-               rds_ibdev->ports[port_num].ip_mask = ip_mask;
-               rds_ibdev->ports[port_num].ip_active_port = port_num;
+               strcpy(ip_config[port].if_name, if_name);
+               ip_config[port].ip_addr = ip_addr;
+               ip_config[port].ip_bcast = ip_bcast;
+               ip_config[port].ip_mask = ip_mask;
+               ip_config[port].ip_active_port = port;
        } else {
-               idx = rds_ibdev->ports[port_num].alias_cnt++;
-               strcpy(rds_ibdev->ports[port_num].
-                       aliases[idx].if_name, if_name);
-               rds_ibdev->ports[port_num].
-                       aliases[idx].ip_addr = ip_addr;
-               rds_ibdev->ports[port_num].
-                       aliases[idx].ip_bcast = ip_bcast;
-               rds_ibdev->ports[port_num].
-                       aliases[idx].ip_mask = ip_mask;
+               idx = ip_config[port].alias_cnt++;
+               strcpy(ip_config[port].aliases[idx].if_name, if_name);
+               ip_config[port].aliases[idx].ip_addr = ip_addr;
+               ip_config[port].aliases[idx].ip_bcast = ip_bcast;
+               ip_config[port].aliases[idx].ip_mask = ip_mask;
        }
 }
 
-static void rds_ib_do_failover(struct rds_ib_device *rds_ibdev,
-                               u8 from_port,
-                               u8 to_port)
+static void rds_ib_do_failover(u8 from_port, u8 to_port, u8 arp_port)
 {
-       u8      i, j;
+       u8      j;
        int     ret;
 
-       for (i = 1; i <= rds_ibdev->dev->phys_port_cnt; i++) {
-               if ((from_port != i &&
-                       i == rds_ibdev->ports[i].ip_active_port) ||
-                       i == to_port) {
-
-                       if (!rds_ib_move_ip(
-                               rds_ibdev->ports[i].dev,
-                               rds_ibdev->ports[i].dev->dev_addr,
-                               rds_ibdev->ports[from_port].if_name,
-                               rds_ibdev->ports[i].if_name,
-                               rds_ibdev->ports[from_port].ip_addr,
-                               rds_ibdev->ports[from_port].ip_bcast,
-                               rds_ibdev->ports[from_port].ip_mask,
-                               1)) {
-
-                               rds_ibdev->ports[from_port].ip_active_port = i;
-                               for (j = 0; j < rds_ibdev->ports[from_port].
-                                       alias_cnt; j++) {
-
-                                       ret = rds_ib_move_ip(
-                                               rds_ibdev->ports[i].dev,
-                                               rds_ibdev->ports[i].
-                                                       dev->dev_addr,
-                                               rds_ibdev->ports[from_port].
-                                                       aliases[j].if_name,
-                                               rds_ibdev->ports[i].if_name,
-                                               rds_ibdev->ports[from_port].
-                                                       aliases[j].ip_addr,
-                                               rds_ibdev->ports[from_port].
-                                                       aliases[j].ip_bcast,
-                                               rds_ibdev->ports[from_port].
-                                                       aliases[j].ip_mask,
-                                               1);
-                               }
-                               break;
-                       }
-               }
-       }
-}
+       if (!ip_config[from_port].ip_addr)
+               return;
 
-static void rds_ib_do_set_ip(struct rds_ib_device *rds_ibdev,
-                               u8 port)
-{
-       int     ret;
-       u8      j;
+       if (!to_port)
+               to_port = rds_ib_get_failover_port(from_port);
+
+       if (!arp_port)
+               arp_port = to_port;
 
-       ret = rds_ib_set_ip(rds_ibdev->ports[port].dev,
-                       rds_ibdev->ports[port].dev->dev_addr,
-                       rds_ibdev->ports[port].if_name,
-                       rds_ibdev->ports[port].ip_addr,
-                       rds_ibdev->ports[port].ip_bcast,
-                       rds_ibdev->ports[port].ip_mask);
-
-       for (j = 0; j < rds_ibdev->ports[port].alias_cnt; j++) {
-               ret = rds_ib_set_ip(rds_ibdev->ports[port].dev,
-                               rds_ibdev->ports[port].dev->dev_addr,
-                               rds_ibdev->ports[port].aliases[j].if_name,
-                               rds_ibdev->ports[port].aliases[j].ip_addr,
-                               rds_ibdev->ports[port].aliases[j].ip_bcast,
-                               rds_ibdev->ports[port].aliases[j].ip_mask);
+       if (to_port) {
+               if (!rds_ib_move_ip(
+                       ip_config[from_port].if_name,
+                       ip_config[to_port].if_name,
+                       from_port,
+                       to_port,
+                       arp_port,
+                       ip_config[from_port].ip_addr,
+                       ip_config[from_port].ip_bcast,
+                       ip_config[from_port].ip_mask,
+                       1)) {
+
+                       ip_config[from_port].ip_active_port = to_port;
+                       for (j = 0; j < ip_config[from_port].
+                               alias_cnt; j++) {
+
+                               ret = rds_ib_move_ip(
+                                       ip_config[from_port].
+                                               aliases[j].if_name,
+                                       ip_config[to_port].if_name,
+                                       from_port,
+                                       to_port,
+                                       arp_port,
+                                       ip_config[from_port].
+                                               aliases[j].ip_addr,
+                                       ip_config[from_port].
+                                               aliases[j].ip_bcast,
+                                       ip_config[from_port].
+                                               aliases[j].ip_mask,
+                                       1);
+                       }
+               }
        }
 }
 
-static void rds_ib_do_failback(struct rds_ib_device *rds_ibdev,
-                               u8 port)
+static void rds_ib_do_failback(u8 port)
 {
-       u8      ip_active_port = rds_ibdev->ports[port].ip_active_port;
+       u8      ip_active_port = ip_config[port].ip_active_port;
        u8      j;
        int     ret;
 
-       if (port != rds_ibdev->ports[port].ip_active_port) {
+       if (!ip_config[port].ip_addr)
+               return;
+
+       if (port != ip_config[port].ip_active_port) {
                if (!rds_ib_move_ip(
-                       rds_ibdev->ports[ip_active_port].dev,
-                       rds_ibdev->ports[port].dev->dev_addr,
-                       rds_ibdev->ports[ip_active_port].if_name,
-                       rds_ibdev->ports[port].if_name,
-                       rds_ibdev->ports[port].ip_addr,
-                       rds_ibdev->ports[port].ip_bcast,
-                       rds_ibdev->ports[port].ip_mask,
+                       ip_config[ip_active_port].if_name,
+                       ip_config[port].if_name,
+                       ip_active_port,
+                       port,
+                       ip_active_port,
+                       ip_config[port].ip_addr,
+                       ip_config[port].ip_bcast,
+                       ip_config[port].ip_mask,
                        0)) {
 
-                       for (j = 0; j < rds_ibdev->ports[port].
+                       ip_config[port].ip_active_port = port;
+                       for (j = 0; j < ip_config[port].
                                alias_cnt; j++) {
 
                                ret = rds_ib_move_ip(
-                                       rds_ibdev->ports[ip_active_port].dev,
-                                       rds_ibdev->ports[port].
-                                               dev->dev_addr,
-                                       rds_ibdev->ports[ip_active_port].
+                                       ip_config[ip_active_port].
                                                if_name,
-                                       rds_ibdev->ports[port].
+                                       ip_config[port].
                                                aliases[j].if_name,
-                                       rds_ibdev->ports[port].
+                                       ip_active_port,
+                                       port,
+                                       ip_active_port,
+                                       ip_config[port].
                                                aliases[j].ip_addr,
-                                       rds_ibdev->ports[port].
+                                       ip_config[port].
                                                aliases[j].ip_bcast,
-                                       rds_ibdev->ports[port].
+                                       ip_config[port].
                                                aliases[j].ip_mask,
                                        0);
                        }
@@ -654,18 +712,25 @@ static void rds_ib_do_failback(struct rds_ib_device *rds_ibdev,
 
 static void rds_ib_failover(struct work_struct *_work)
 {
-       struct rds_ib_port_ud_work *work =
+       struct rds_ib_port_ud_work      *work =
                container_of(_work, struct rds_ib_port_ud_work, work.work);
-       struct rds_ib_device *rds_ibdev = work->rds_ibdev;
-       int ret;
+       int                             ret;
+       u8                              i;
+
+       for (i = 1; i <= ip_port_cnt; i++) {
+               if (i != work->port &&
+                       ip_config[i].port_state == RDS_IB_PORT_DOWN &&
+                       ip_config[i].ip_active_port == work->port) {
+                       rds_ib_do_failover(i, 0, 0);
+               }
+       }
 
-       if (rds_ibdev->ports[work->port].ip_addr)
-               rds_ib_do_failover(rds_ibdev, work->port, 0);
+       if (ip_config[work->port].ip_addr)
+               rds_ib_do_failover(work->port, 0, 0);
 
-       if (rds_ibdev->ports[work->port].ip_active_port == work->port) {
-               ret = rds_ib_set_ip(rds_ibdev->ports[work->port].dev,
-                               rds_ibdev->ports[work->port].dev->dev_addr,
-                               rds_ibdev->ports[work->port].if_name,
+       if (ip_config[work->port].ip_active_port == work->port) {
+               ret = rds_ib_set_ip(NULL, NULL,
+                               ip_config[work->port].if_name,
                                0, 0, 0);
        }
 
@@ -674,51 +739,90 @@ static void rds_ib_failover(struct work_struct *_work)
 
 static void rds_ib_failback(struct work_struct *_work)
 {
-       struct rds_ib_port_ud_work *work =
+       struct rds_ib_port_ud_work      *work =
                container_of(_work, struct rds_ib_port_ud_work, work.work);
-       struct rds_ib_device *rds_ibdev = work->rds_ibdev;
-       u8 i, port = work->port;
-       struct in_device *in_dev;
+       u8                              i, ip_active_port, port = work->port;
+
+       ip_active_port = ip_config[port].ip_active_port;
+
+       rds_ib_do_failback(port);
+
+       for (i = 1; i <= ip_port_cnt; i++) {
+               if (i == port ||
+                       ip_config[i].port_state == RDS_IB_PORT_UP ||
+                       !ip_config[i].ip_addr)
+                       continue;
+
+               if (ip_config[i].ip_active_port == i) {
+                       rds_ib_do_failover(i, 0, ip_active_port);
+               } else if (ip_config[i].ip_active_port == port) {
+                       rds_ib_do_failover(i, port, ip_active_port);
+               } else if (ip_config[ip_config[i].ip_active_port].port_state ==
+                               RDS_IB_PORT_DOWN) {
+                       rds_ib_do_failover(i, 0, ip_active_port);
+               } else if (ip_config[port].rds_ibdev ==
+                               ip_config[i].rds_ibdev) {
+                       rds_ib_do_failover(i, port, ip_active_port);
+               }
+       }
 
-       if (rds_ibdev->ports[port].ip_addr &&
-               rds_ibdev->ports[port].ip_active_port != port) {
+       if (ip_active_port != ip_config[port].ip_active_port) {
+               for (i = 1; i <= ip_port_cnt; i++) {
+                       if (ip_config[i].port_state == RDS_IB_PORT_DOWN &&
+                               i != ip_active_port && ip_config[i].ip_addr &&
+                               ip_config[i].ip_active_port == ip_active_port) {
 
-               rds_ib_do_failback(rds_ibdev, port);
+                               rds_ib_do_failover(i, ip_active_port,
+                                                       ip_active_port);
+                       }
+               }
        }
 
-       rds_ibdev->ports[port].ip_active_port = port;
-       in_dev = in_dev_get(rds_ibdev->ports[port].dev);
+       kfree(work);
+}
 
-       for (i = 1; i <= rds_ibdev->dev->phys_port_cnt; i++) {
-               if (rds_ibdev->ports[i].port_state == RDS_IB_PORT_DOWN &&
-                       i != port && rds_ibdev->ports[i].ip_addr) {
+static int rds_ib_ip_config_down(void)
+{
+       u8      i;
 
-                       if (rds_ibdev->ports[i].ip_active_port == i) {
-                               rds_ib_do_failover(rds_ibdev, i, 0);
-                       } else if (rds_ibdev->ports[i].ip_active_port == port) {
-                               if (in_dev && !in_dev->ifa_list &&
-                                       rds_ibdev->ports[port].ip_addr) {
+       for (i = 1; i <= ip_port_cnt; i++) {
+               if (ip_config[i].port_state == RDS_IB_PORT_UP)
+                       return 0;
+       }
 
-                                       rds_ib_do_set_ip(rds_ibdev, port);
-                               }
+       return 1;
+}
 
-                               rds_ib_do_failover(rds_ibdev, i, port);
-                       }
-               }
+static void rds_ib_net_failback(struct work_struct *_work)
+{
+       struct rds_ib_port_ud_work      *work =
+               container_of(_work, struct rds_ib_port_ud_work, work.work);
+       struct in_device                *in_dev;
+
+       in_dev = in_dev_get(ip_config[work->port].dev);
+       if (in_dev && !in_dev->ifa_list &&
+               ip_config[work->port].ip_addr &&
+               work->timeout > 0) {
+               INIT_DELAYED_WORK(&work->work, rds_ib_net_failback);
+               work->timeout -= msecs_to_jiffies(100);
+               queue_delayed_work(rds_wq, &work->work,
+                       msecs_to_jiffies(100));
+       } else {
+               rds_ib_failback((struct work_struct *)&work->work);
        }
 
-       kfree(work);
+       in_dev_put(in_dev);
 }
 
 static void rds_ib_event_handler(struct ib_event_handler *handler,
                                struct ib_event *event)
 {
-       struct rds_ib_device *rds_ibdev =
+       struct rds_ib_device    *rds_ibdev =
                container_of(handler, typeof(*rds_ibdev), event_handler);
-       u8 port = event->element.port_num;
-       struct rds_ib_port_ud_work *work;
+       u8      port = rds_ib_port_lookup(rds_ibdev, event->element.port_num);
+       struct rds_ib_port_ud_work      *work;
 
-       if (!rds_ib_haip_enabled)
+       if (!rds_ib_haip_enabled || !ip_port_cnt)
                return;
 
        if (event->event != IB_EVENT_PORT_ACTIVE &&
@@ -726,8 +830,9 @@ static void rds_ib_event_handler(struct ib_event_handler *handler,
                return;
 
        printk(KERN_NOTICE "RDS/IB: %s/port_%d/%s is %s\n",
-               rds_ibdev->dev->name, port,
-               rds_ibdev->ports[port].if_name,
+               rds_ibdev->dev->name,
+               event->element.port_num,
+               ip_config[port].if_name,
                (event->event == IB_EVENT_PORT_ACTIVE) ?
                        "ACTIVE" : "ERROR");
 
@@ -737,7 +842,6 @@ static void rds_ib_event_handler(struct ib_event_handler *handler,
                return;
        }
 
-       work->rds_ibdev = rds_ibdev;
        work->port = port;
 
        if (event->event == IB_EVENT_PORT_ACTIVE) {
@@ -746,89 +850,58 @@ static void rds_ib_event_handler(struct ib_event_handler *handler,
                        queue_delayed_work(rds_wq, &work->work, 0);
                } else
                        kfree(work);
-               rds_ibdev->ports[port].port_state = RDS_IB_PORT_UP;
+               ip_config[port].port_state = RDS_IB_PORT_UP;
        } else {
                INIT_DELAYED_WORK(&work->work, rds_ib_failover);
                queue_delayed_work(rds_wq, &work->work, 0);
-               rds_ibdev->ports[port].port_state = RDS_IB_PORT_DOWN;
-       }
-}
-
-static void rds_ib_check_down_port(void)
-{
-       struct rds_ib_device *rds_ibdev;
-       struct rds_ib_port_ud_work *work;
-       u8 i;
-
-       list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
-               for (i = 1; i <= rds_ibdev->dev->phys_port_cnt; i++) {
-                       if (rds_ibdev->ports[i].port_state != RDS_IB_PORT_UP &&
-                               rds_ibdev->ports[i].ip_addr) {
-                               printk(KERN_NOTICE
-                                       "RDS/IB: port %s/%d is NOT UP\n",
-                                       rds_ibdev->dev->name, i);
-
-                               work = kzalloc(sizeof *work, GFP_KERNEL);
-                               if (!work) {
-                                       printk(KERN_ERR
-                                               "RDS/IB: failed to allocate port work\n");
-                                       return;
-                               }
-
-                               work->rds_ibdev = rds_ibdev;
-                               work->port = i;
-                               INIT_DELAYED_WORK(&work->work, rds_ib_failover);                                queue_delayed_work(rds_wq, &work->work, 0);
-                       }
-               }
+               ip_config[port].port_state = RDS_IB_PORT_DOWN;
        }
-       flush_workqueue(rds_wq);
 }
 
 static void rds_ib_dump_ip_config(void)
 {
-       struct rds_ib_device *rds_ibdev;
-       int i, j;
+       int     i, j;
 
        if (!rds_ib_haip_enabled)
                return;
 
        printk(KERN_ERR "RDS/IB: IP configuration ...\n");
-       list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
-               for (i = 1; i <= rds_ibdev->dev->phys_port_cnt; i++) {
-                       printk(KERN_ERR "RDS/IB: %s/port_%d/%s: "
-                               "IP %d.%d.%d.%d/%d.%d.%d.%d/%d.%d.%d.%d "
-                               "state %s\n",
-                               rds_ibdev->dev->name, i,
-                               rds_ibdev->ports[i].if_name,
-                               NIPQUAD(rds_ibdev->ports[i].ip_addr),
-                               NIPQUAD(rds_ibdev->ports[i].ip_bcast),
-                               NIPQUAD(rds_ibdev->ports[i].ip_mask),
-                               (rds_ibdev->ports[i].port_state == RDS_IB_PORT_UP ? "UP" : "DOWN"));
-
-                       for (j = 0; j < rds_ibdev->ports[i].alias_cnt; j++) {
-                               printk(KERN_ERR "Alias %s "
-                                       "IP %d.%d.%d.%d/%d.%d.%d.%d/%d.%d.%d.%d\n",
-                                       rds_ibdev->ports[i].aliases[j].if_name,
-                                       NIPQUAD(rds_ibdev->ports[i].
-                                               aliases[j].ip_addr),
-                                       NIPQUAD(rds_ibdev->ports[i].
-                                               aliases[j].ip_bcast),
-                                       NIPQUAD(rds_ibdev->ports[i].
-                                               aliases[j].ip_mask));
-                       }
+       for (i = 1; i <= ip_port_cnt; i++) {
+               printk(KERN_ERR "RDS/IB: %s/port_%d/%s: "
+                       "IP %d.%d.%d.%d/%d.%d.%d.%d/%d.%d.%d.%d "
+                       "state %s\n",
+                       ip_config[i].rds_ibdev->dev->name,
+                       ip_config[i].port_num,
+                       ip_config[i].if_name,
+                       NIPQUAD(ip_config[i].ip_addr),
+                       NIPQUAD(ip_config[i].ip_bcast),
+                       NIPQUAD(ip_config[i].ip_mask),
+                       (ip_config[i].port_state ==
+                               RDS_IB_PORT_UP ? "UP" : "DOWN"));
+
+               for (j = 0; j < ip_config[i].alias_cnt; j++) {
+                       printk(KERN_ERR "Alias %s "
+                               "IP %d.%d.%d.%d/%d.%d.%d.%d/%d.%d.%d.%d\n",
+                               ip_config[i].aliases[j].if_name,
+                               NIPQUAD(ip_config[i].
+                                       aliases[j].ip_addr),
+                               NIPQUAD(ip_config[i].
+                                       aliases[j].ip_bcast),
+                               NIPQUAD(ip_config[i].
+                                       aliases[j].ip_mask));
                }
        }
 }
 
 static int rds_ib_setup_ports(void)
 {
-       struct net_device *dev;
-       struct in_ifaddr *ifa;
-       struct in_ifaddr **ifap;
-       struct in_device *in_dev;
-       struct rds_ib_device *rds_ibdev;
-       u8      port_num;
-       int     ret = 0;
+       struct net_device       *dev;
+       struct in_ifaddr        *ifa;
+       struct in_ifaddr        **ifap;
+       struct in_device        *in_dev;
+       struct rds_ib_device    *rds_ibdev;
+       u8                      port_num;
+       int                     ret = 0;
 
        if (!rds_ib_haip_enabled)
                return ret;
@@ -873,9 +946,9 @@ static int rds_ib_setup_ports(void)
                                        ifa->ifa_mask);
                        }
                }
+               in_dev_put(in_dev);
        }
 
-       rds_ib_check_down_port();
        rds_ib_dump_ip_config();
 out:
        read_unlock(&dev_base_lock);
@@ -935,11 +1008,12 @@ void rds_ib_add_one(struct ib_device *device)
        }
 
        if (rds_ib_haip_enabled) {
-               rds_ibdev->ports = kzalloc(sizeof(struct rds_ib_port) *
-                                       (device->phys_port_cnt + 1), GFP_KERNEL);
-               if (!rds_ibdev->ports) {
+               ip_config = kzalloc(sizeof(struct rds_ib_port) *
+                                       RDS_IB_MAX_PORTS + 1, GFP_KERNEL);
+
+               if (!ip_config) {
                        printk(KERN_ERR
-                               "RDS/IB: failed to allocate ports\n");
+                               "RDS/IB: failed to allocate IP config\n");
                        goto put_dev;
                }
 
@@ -1046,33 +1120,28 @@ static int rds_ib_netdev_callback(struct notifier_block *self, unsigned long eve
        struct net_device *ndev = netdev_notifier_info_to_dev(ctx);
        u8 port = 0;
        u8 i;
-       struct rds_ib_device    *rds_ibdev;
        struct rds_ib_port_ud_work *work;
 
-       if (!rds_ib_haip_enabled)
+       if (!rds_ib_haip_enabled || !ip_port_cnt)
                return NOTIFY_DONE;
 
        if (event != NETDEV_UP && event != NETDEV_DOWN)
                return NOTIFY_DONE;
 
-       rcu_read_lock();
-       list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
-               for (i = 1; i <= rds_ibdev->dev->phys_port_cnt; i++) {
-                       if (!strcmp(ndev->name,
-                               rds_ibdev->ports[i].if_name)) {
-                                       port = i;
-                                       goto out;
-                               }
+       for (i = 1; i <= ip_port_cnt; i++) {
+               if (!strcmp(ndev->name, ip_config[i].if_name)) {
+                       port = i;
+                       break;
                }
        }
-       rcu_read_unlock();
-out:
+
        if (!port)
                return NOTIFY_DONE;
 
 
        printk(KERN_NOTICE "RDS/IB: %s/port_%d/%s is %s\n",
-               rds_ibdev->dev->name, port, ndev->name,
+               ip_config[port].rds_ibdev->dev->name,
+               ip_config[port].port_num, ndev->name,
                (event == NETDEV_UP) ? "UP" : "DOWN");
 
        work = kzalloc(sizeof *work, GFP_KERNEL);
@@ -1081,24 +1150,32 @@ out:
                return NOTIFY_DONE;
        }
 
-       work->rds_ibdev = rds_ibdev;
        work->dev = ndev;
        work->port = port;
 
        switch (event) {
        case NETDEV_UP:
                if (rds_ib_haip_fallback) {
-                       INIT_DELAYED_WORK(&work->work, rds_ib_failback);
-                       queue_delayed_work(rds_wq, &work->work, msecs_to_jiffies(100));
+                       if (rds_ib_ip_config_down()) {
+                               INIT_DELAYED_WORK(&work->work,
+                                       rds_ib_net_failback);
+                               work->timeout = msecs_to_jiffies(10000);
+                       } else {
+                               INIT_DELAYED_WORK(&work->work,
+                                       rds_ib_net_failback);
+                               work->timeout = msecs_to_jiffies(1000);
+                       }
+                       queue_delayed_work(rds_wq, &work->work,
+                               msecs_to_jiffies(100));
                } else
                        kfree(work);
 
-               rds_ibdev->ports[port].port_state = NETDEV_UP;
+               ip_config[port].port_state = NETDEV_UP;
                break;
        case NETDEV_DOWN:
                INIT_DELAYED_WORK(&work->work, rds_ib_failover);
                queue_delayed_work(rds_wq, &work->work, 0);
-               rds_ibdev->ports[port].port_state = RDS_IB_PORT_DOWN;
+               ip_config[port].port_state = RDS_IB_PORT_DOWN;
                break;
        }
 
index 35d82cd08c93de924c2b88c777cdcf1b5907e60b..58e9a285c3a53d08b0860380945d48ecced6367a 100644 (file)
@@ -266,11 +266,14 @@ enum {
        RDS_IB_PORT_DOWN,
 };
 
-#define RDS_IB_MAX_ALIASES     200
+#define RDS_IB_MAX_ALIASES     100
+#define RDS_IB_MAX_PORTS       10
 struct rds_ib_port {
+       struct rds_ib_device    *rds_ibdev;
        struct net_device       *dev;
+       unsigned int            port_state;
+       u8                      port_num;
        char                    if_name[IFNAMSIZ];
-       unsigned int            port_state;
        __be32                  ip_addr;
        __be32                  ip_bcast;
        __be32                  ip_mask;
@@ -281,9 +284,9 @@ struct rds_ib_port {
 
 struct rds_ib_port_ud_work {
        struct delayed_work             work;
-       struct rds_ib_device            *rds_ibdev;
        struct net_device               *dev;
        unsigned int                    port;
+       int                             timeout;
 };
 
 enum {
@@ -422,6 +425,7 @@ extern unsigned int rds_ib_apm_enabled;
 extern unsigned int rds_ib_apm_fallback;
 extern unsigned int rds_ib_haip_enabled;
 extern unsigned int rds_ib_haip_fallback;
+extern unsigned int rds_ib_haip_failover_enabled;
 extern unsigned int rds_ib_apm_timeout;
 
 extern spinlock_t ib_nodev_conns_lock;