www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
rds: Misc HAIP fixes
author Bang Nguyen <bang.nguyen@oracle.com>
Tue, 13 Nov 2012 20:27:34 +0000 (12:27 -0800)
committer Mukesh Kacker <mukesh.kacker@oracle.com>
Wed, 8 Jul 2015 20:12:28 +0000 (13:12 -0700)
Signed-off-by: Bang Nguyen <bang.nguyen@oracle.com>
net/rds/ib.c
net/rds/ib.h
net/rds/ib_cm.c
net/rds/ib_rdma.c
net/rds/ib_recv.c
net/rds/rdma_transport.c

index 1d24eebdc04f0b5aae3182433319f12a4379933a..0f40a305cd0e3f4f1c8d0af2f3a956c86496f713 100644 (file)
 #include <linux/netdevice.h>
 #include <linux/inetdevice.h>
 #include <linux/if_arp.h>
+#include <net/arp.h>
 #include <linux/delay.h>
 #include <rdma/ib_cache.h>
 #include <net/sock.h>
 #include <net/inet_common.h>
+#include <linux/rtnetlink.h>
 
 #include "rds.h"
 #include "ib.h"
+#include <linux/time.h>
 
 unsigned int rds_ib_fmr_1m_pool_size = RDS_FMR_1M_POOL_SIZE;
 unsigned int rds_ib_fmr_8k_pool_size = RDS_FMR_8K_POOL_SIZE;
 unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
-unsigned int rds_ib_apm_enable = 0;
-unsigned int rds_ib_active_active_enabled = 0;
-unsigned int rds_ib_timeout = RDS_IB_DEFAULT_TIMEOUT;
+unsigned int rds_ib_apm_enabled = 0;
+unsigned int rds_ib_apm_fallback = 1;
+unsigned int rds_ib_haip_enabled = 0;
+unsigned int rds_ib_haip_fallback = 1;
+unsigned int rds_ib_apm_timeout = RDS_IB_DEFAULT_TIMEOUT;
 unsigned int rds_ib_rnr_retry_count = RDS_IB_DEFAULT_RNR_RETRY_COUNT;
 
 module_param(rds_ib_fmr_1m_pool_size, int, 0444);
@@ -58,14 +63,20 @@ module_param(rds_ib_fmr_8k_pool_size, int, 0444);
 MODULE_PARM_DESC(rds_ib_fmr_8k_pool_size, " Max number of 8k fmr per HCA");
 module_param(rds_ib_retry_count, int, 0444);
 MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
-module_param(rds_ib_apm_enable, int, 0444);
-MODULE_PARM_DESC(rds_ib_apm_enable, " Enable APM");
-module_param(rds_ib_active_active_enabled, int, 0444);
-MODULE_PARM_DESC(rds_ib_active_active_enabled, " Active/Active enabled");
-module_param(rds_ib_timeout, int, 0444);
-MODULE_PARM_DESC(rds_ib_timeout, " QP timeout");
+module_param(rds_ib_apm_enabled, int, 0444);
+MODULE_PARM_DESC(rds_ib_apm_enabled, " APM Enabled");
+module_param(rds_ib_haip_enabled, int, 0444);
+MODULE_PARM_DESC(rds_ib_haip_enabled, " High Availability IP enabled");
+module_param(rds_ib_apm_timeout, int, 0444);
+MODULE_PARM_DESC(rds_ib_apm_timeout, " APM timeout");
 module_param(rds_ib_rnr_retry_count, int, 0444);
-MODULE_PARM_DESC(rds_ib_timeout, " QP rnr retry count");
+MODULE_PARM_DESC(rds_ib_rnr_retry_count, " QP rnr retry count");
+module_param(rds_ib_apm_fallback, int, 0444);
+MODULE_PARM_DESC(rds_ib_apm_fallback, " APM failback enabled");
+module_param(rds_ib_haip_fallback, int, 0444);
+MODULE_PARM_DESC(rds_ib_haip_fallback, " HAIP failback Enabled");
+
+
 
 /*
  * we have a clumsy combination of RCU and a rwsem protecting this list
@@ -298,191 +309,278 @@ static int rds_ib_laddr_check(__be32 addr)
        return ret;
 }
 
-static int rds_ib_move_ip(char *from_dev, char *to_dev, __be32 addr, int failover)
+static void rds_ib_send_gratuitous_arp(struct net_device *out_dev,
+                                       unsigned char *dev_addr,
+                                       __be32 ip_addr)
+{
+       arp_send(ARPOP_REQUEST, ETH_P_ARP,
+               ip_addr, out_dev,
+               ip_addr, NULL,
+               dev_addr, NULL);
+}
+
+static int rds_ib_set_ip(struct net_device *out_dev,
+                       unsigned char *dev_addr,
+                       char *if_name,
+                       __be32 addr,
+                       __be32 bcast,
+                       __be32 mask)
 {
        struct ifreq *ir;
        struct sockaddr_in *sin;
-       __be32 down_ip, down_bcast, down_mask;
        struct page *page;
-       char from_dev2[2*IFNAMSIZ + 1];
-       char to_dev2[2*IFNAMSIZ + 1];
        int ret = 0;
 
        page = alloc_page(GFP_HIGHUSER);
        if (!page) {
                printk(KERN_ERR "RDS/IB: alloc_page failed .. NO MEM\n");
-               ret = -ENOMEM;
-               goto out;
+               return 1;
        }
 
        ir = (struct ifreq *)kmap(page);
        memset(ir, 0, sizeof(struct ifreq));
        sin = (struct sockaddr_in *)&ir->ifr_addr;
+       sin->sin_family = AF_INET;
 
-       if (failover) {
-               strcpy(to_dev2, to_dev);
-               strcat(to_dev2, ":");
-               strcat(to_dev2, from_dev);
-               to_dev2[IFNAMSIZ-1] = 0;
-               strcpy(from_dev2, from_dev);
-       } else {
-               strcpy(from_dev2, from_dev);
-               strcat(from_dev2, ":");
-               strcat(from_dev2, to_dev);
-               from_dev2[IFNAMSIZ-1] = 0;
-               strcpy(to_dev2, to_dev);
-       }
+       strcpy(ir->ifr_ifrn.ifrn_name, if_name);
 
-       strcpy(ir->ifr_ifrn.ifrn_name, from_dev2);
-       ret = inet_ioctl(rds_ib_inet_socket, SIOCGIFADDR, (unsigned long) ir);
+       sin->sin_addr.s_addr = addr;
+       ret = inet_ioctl(rds_ib_inet_socket, SIOCSIFADDR, (unsigned long) ir);
        if (ret) {
                printk(KERN_ERR
-                       "RDS/IB: inet_ioctl(SIOCGIFADDR) failed (%d)\n",
+                       "RDS/IB: inet_ioctl(SIOCSIFADDR) failed (%d)\n",
                        ret);
                goto out;
        }
-       down_ip = sin->sin_addr.s_addr;
-       if (addr != down_ip) {
-               printk(KERN_ERR
-                       "RDS/IP: %u.%u.%u.%u not configured on %s\n",
-                       NIPQUAD(addr), ir->ifr_ifrn.ifrn_name);
+
+       if (!addr)
                goto out;
-       }
 
-       ret = inet_ioctl(rds_ib_inet_socket, SIOCGIFBRDADDR, (unsigned long) ir);
+       sin->sin_addr.s_addr = bcast;
+       ret = inet_ioctl(rds_ib_inet_socket, SIOCSIFBRDADDR,
+                       (unsigned long) ir);
        if (ret) {
                printk(KERN_ERR
-                       "RDS/IB: inet_ioctl(SIOCGIFBRDADDR) failed (%d)\n",
+                       "RDS/IB: inet_ioctl(SIOCSIFBRDADDR) failed (%d)\n",
                        ret);
                goto out;
        }
-       down_bcast = sin->sin_addr.s_addr;
 
-       ret = inet_ioctl(rds_ib_inet_socket, SIOCGIFNETMASK, (unsigned long) ir);
+       sin->sin_addr.s_addr = mask;
+       ret = inet_ioctl(rds_ib_inet_socket, SIOCSIFNETMASK,
+                       (unsigned long) ir);
        if (ret) {
                printk(KERN_ERR
-                       "RDS/IB: inet_ioctl(SIOCGIFNETMASK) failed (%d)\n",
+                       "RDS/IB: inet_ioctl(SIOCSIFBRDADDR) failed (%d)\n",
                        ret);
                goto out;
        }
-       down_mask = sin->sin_addr.s_addr;
 
-       /* Clear IP on down Interface */
-       sin->sin_addr.s_addr = 0;
-       ret = inet_ioctl(rds_ib_inet_socket, SIOCSIFADDR, (unsigned long) ir);
-       if (ret) {
-               printk(KERN_ERR
-                       "RDS/IB: inet_ioctl(SIOCSIFADDR) failed (%d)\n",
-                       ret);
-               goto out;
+       rds_ib_send_gratuitous_arp(out_dev, dev_addr, addr);
+
+out:
+       kunmap(page);
+       __free_page(page);
+
+       return ret;
+}
+
+static int rds_ib_move_ip(struct net_device *out_dev,
+                       unsigned char *dev_addr,
+                       char *from_dev,
+                       char *to_dev,
+                       __be32 addr,
+                       __be32 bcast,
+                       __be32 mask,
+                       int failover)
+{
+       struct rds_ib_device *rds_ibdev;
+       struct ifreq *ir;
+       struct sockaddr_in *sin;
+       struct page *page;
+       char from_dev2[2*IFNAMSIZ + 1];
+       char to_dev2[2*IFNAMSIZ + 1];
+       int i, ret = 0;
+       char *from_colon, *to_colon;
+       int from_passive = 0, to_passive = 0;
+
+       page = alloc_page(GFP_HIGHUSER);
+       if (!page) {
+               printk(KERN_ERR "RDS/IB: alloc_page failed .. NO MEM\n");
+               return 1;
        }
 
+       ir = (struct ifreq *)kmap(page);
        memset(ir, 0, sizeof(struct ifreq));
-       strcpy(ir->ifr_ifrn.ifrn_name, to_dev2);
-       sin->sin_family = AF_INET;
+       sin = (struct sockaddr_in *)&ir->ifr_addr;
 
-       sin->sin_addr.s_addr = down_ip;
-       ret = inet_ioctl(rds_ib_inet_socket, SIOCSIFADDR, (unsigned long) ir);
-       if (ret) {
-               printk(KERN_ERR
-                       "RDS/IB: inet_ioctl(SIOCSIFADDR) failed (%d)\n",
-                       ret);
-               goto out;
+       from_colon = strchr(from_dev, ':');
+       to_colon = strchr(to_dev, ':');
+       if (!from_colon && !to_colon) {
+               rcu_read_lock();
+               list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
+                       for (i = 1; i <= rds_ibdev->dev->phys_port_cnt; i++) {
+                               if (!strcmp(from_dev,
+                                       rds_ibdev->ports[i].if_name) &&
+                                       !rds_ibdev->ports[i].ip_addr) {
+                                       from_passive = 1;
+                               }
+
+                               if (!strcmp(to_dev,
+                                       rds_ibdev->ports[i].if_name) &&
+                                       !rds_ibdev->ports[i].ip_addr) {
+                                       to_passive = 1;
+                               }
+                       }
+               }
+               rcu_read_unlock();
        }
 
-       sin->sin_addr.s_addr = down_bcast;
-       ret = inet_ioctl(rds_ib_inet_socket, SIOCSIFBRDADDR, (unsigned long) ir);
+       if (failover) {
+               if (to_passive) {
+                       strcpy(to_dev2, to_dev);
+               } else {
+                       strcpy(to_dev2, to_dev);
+                       strcat(to_dev2, ":");
+                       strcat(to_dev2, from_dev);
+                       to_dev2[IFNAMSIZ-1] = 0;
+               }
+               strcpy(from_dev2, from_dev);
+       } else {
+               if (from_passive) {
+                       strcpy(from_dev2, from_dev);
+               } else {
+                       strcpy(from_dev2, from_dev);
+                       strcat(from_dev2, ":");
+                       strcat(from_dev2, to_dev);
+                       from_dev2[IFNAMSIZ-1] = 0;
+               }
+               strcpy(to_dev2, to_dev);
+       }
+
+       /* Clear IP on from Interface */
+       sin->sin_addr.s_addr = 0;
+       sin->sin_family = AF_INET;
+       strcpy(ir->ifr_ifrn.ifrn_name, from_dev2);
+       ret = inet_ioctl(rds_ib_inet_socket, SIOCSIFADDR, (unsigned long) ir);
        if (ret) {
                printk(KERN_ERR
-                       "RDS/IB: inet_ioctl(SIOCSIFBRDADDR) failed (%d)\n",
-                       ret);
-               goto out;
+                       "RDS/IB: inet_ioctl(SIOCSIFADDR,%s) failed (%d)\n",
+                       ir->ifr_ifrn.ifrn_name, ret);
        }
 
-       sin->sin_addr.s_addr = down_mask;
-       ret = inet_ioctl(rds_ib_inet_socket, SIOCSIFNETMASK, (unsigned long) ir);
+       ret = rds_ib_set_ip(out_dev, dev_addr, to_dev2, addr, bcast, mask);
+
        if (ret) {
-               printk(KERN_ERR
-                       "RDS/IB: inet_ioctl(SIOCSIFBRDADDR) failed (%d)\n",
-                       ret);
-               goto out;
+               if (failover)
+                       printk(KERN_NOTICE
+                               "RDS/IP: failed to move IP %u.%u.%u.%u "
+                               "from %s to %s\n",
+                               NIPQUAD(addr), from_dev2, to_dev2);
+               else
+                       printk(KERN_NOTICE
+                               "RDS/IP: failed to move IP %u.%u.%u.%u "
+                               "from %s back to %s\n",
+                               NIPQUAD(addr), from_dev2, to_dev2);
+       } else {
+               if (failover)
+                       printk(KERN_NOTICE
+                               "RDS/IB: IP %u.%u.%u.%u migrated over to %s\n",
+                               NIPQUAD(addr), to_dev2);
+               else
+                       printk(KERN_NOTICE
+                               "RDS/IB: IP %u.%u.%u.%u migrated back to %s\n",
+                               NIPQUAD(addr), to_dev2);
        }
 
-       if (failover)
-               printk(KERN_NOTICE
-                       "RDS/IB: IP %u.%u.%u.%u migrated over to %s\n",
-                       NIPQUAD(down_ip), ir->ifr_ifrn.ifrn_name);
-       else
-               printk(KERN_NOTICE
-                       "RDS/IB: IP %u.%u.%u.%u migrated back to %s\n",
-                       NIPQUAD(down_ip), ir->ifr_ifrn.ifrn_name);
-out:
        kunmap(page);
        __free_page(page);
 
        return ret;
 }
 
-static void rds_ib_set_port(struct ib_device *ib_dev, struct net_device *net_dev, char *if_name, u8 port_num, __be32 ip_addr)
+static void rds_ib_init_port(struct rds_ib_device *rds_ibdev,
+                               struct net_device *net_dev,
+                               u8 port_num)
 {
-       struct rds_ib_device *rds_ibdev;
-       u8      active_port;
-       unsigned int    idx;
+       strcpy(rds_ibdev->ports[port_num].if_name, net_dev->name);
+       rds_ibdev->ports[port_num].dev = net_dev;
+       rds_ibdev->ports[port_num].ip_active_port = 0;
 
-       active_port = net_dev->operstate == IF_OPER_UP ? port_num : 0;
+       if (net_dev->operstate == IF_OPER_UP)
+               rds_ibdev->ports[port_num].port_state = RDS_IB_PORT_UP;
+       else
+               rds_ibdev->ports[port_num].port_state = RDS_IB_PORT_DOWN;
+}
 
-       list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
-               if (rds_ibdev->dev == ib_dev) {
-                       if (!strcmp(net_dev->name, if_name)) {
-                               strcpy(rds_ibdev->ports[port_num].if_name,
-                                       if_name);
-                               rds_ibdev->ports[port_num].ip_addr = ip_addr;
-                               rds_ibdev->ports[port_num].active_port =
-                                       active_port;
-                       } else {
-                               idx = rds_ibdev->ports[port_num].alias_cnt++;
-                               strcpy(rds_ibdev->ports[port_num].
-                                       aliases[idx].if_name, if_name);
-                               rds_ibdev->ports[port_num].
-                                       aliases[idx].ip_addr = ip_addr;
-                       }
-                       break;
-               }
+static void rds_ib_set_port(struct rds_ib_device *rds_ibdev,
+                               struct net_device *net_dev,
+                               char *if_name, u8 port_num,
+                               __be32 ip_addr,
+                               __be32 ip_bcast,
+                               __be32 ip_mask)
+{
+       unsigned int    idx;
+
+       if (!strcmp(net_dev->name, if_name)) {
+               strcpy(rds_ibdev->ports[port_num].if_name, if_name);
+               rds_ibdev->ports[port_num].ip_addr = ip_addr;
+               rds_ibdev->ports[port_num].ip_bcast = ip_bcast;
+               rds_ibdev->ports[port_num].ip_mask = ip_mask;
+               rds_ibdev->ports[port_num].ip_active_port = port_num;
+       } else {
+               idx = rds_ibdev->ports[port_num].alias_cnt++;
+               strcpy(rds_ibdev->ports[port_num].
+                       aliases[idx].if_name, if_name);
+               rds_ibdev->ports[port_num].
+                       aliases[idx].ip_addr = ip_addr;
+               rds_ibdev->ports[port_num].
+                       aliases[idx].ip_bcast = ip_bcast;
+               rds_ibdev->ports[port_num].
+                       aliases[idx].ip_mask = ip_mask;
        }
 }
 
-static void rds_ib_do_failover(struct rds_ib_device *rds_ibdev, u8 port)
+static void rds_ib_do_failover(struct rds_ib_device *rds_ibdev,
+                               u8 from_port,
+                               u8 to_port)
 {
        u8      i, j;
+       int     ret;
 
        for (i = 1; i <= rds_ibdev->dev->phys_port_cnt; i++) {
-               if (port != i && i == rds_ibdev->ports[i].active_port) {
-                       if (rds_ib_move_ip(
-                               rds_ibdev->ports[port].if_name,
+               if ((from_port != i &&
+                       i == rds_ibdev->ports[i].ip_active_port) ||
+                       i == to_port) {
+
+                       if (!rds_ib_move_ip(
+                               rds_ibdev->ports[i].dev,
+                               rds_ibdev->ports[i].dev->dev_addr,
+                               rds_ibdev->ports[from_port].if_name,
                                rds_ibdev->ports[i].if_name,
-                               rds_ibdev->ports[port].ip_addr,
+                               rds_ibdev->ports[from_port].ip_addr,
+                               rds_ibdev->ports[from_port].ip_bcast,
+                               rds_ibdev->ports[from_port].ip_mask,
                                1)) {
-                               printk(KERN_ERR "RDS/IP: failed to move IP "
-                                       "%u.%u.%u.%u from %s over to %s\n",
-                                       NIPQUAD(rds_ibdev->ports[port].ip_addr),                                        rds_ibdev->ports[port].if_name,
-                                       rds_ibdev->ports[i].if_name);
-                       } else {
-                               rds_ibdev->ports[port].active_port = i;
-
-                               for (j = 0; j < rds_ibdev->ports[port].alias_cnt; j++) {
-                                       if (rds_ib_move_ip(
-                                               rds_ibdev->ports[port].
+
+                               rds_ibdev->ports[from_port].ip_active_port = i;
+                               for (j = 0; j < rds_ibdev->ports[from_port].
+                                       alias_cnt; j++) {
+
+                                       ret = rds_ib_move_ip(
+                                               rds_ibdev->ports[i].dev,
+                                               rds_ibdev->ports[i].
+                                                       dev->dev_addr,
+                                               rds_ibdev->ports[from_port].
                                                        aliases[j].if_name,
                                                rds_ibdev->ports[i].if_name,
-                                               rds_ibdev->ports[port].
+                                               rds_ibdev->ports[from_port].
                                                        aliases[j].ip_addr,
-                                               1)) {
-                                               printk(KERN_ERR "RDS/IP: failed to move alias IP "
-                                                       "%u.%u.%u.%u from %s over to %s\n",
-                                                       NIPQUAD(rds_ibdev->ports[port].aliases[j].ip_addr),
-                                                       rds_ibdev->ports[port].aliases[j].if_name,
-                                                       rds_ibdev->ports[i].if_name);
-                                       }
+                                               rds_ibdev->ports[from_port].
+                                                       aliases[j].ip_bcast,
+                                               rds_ibdev->ports[from_port].
+                                                       aliases[j].ip_mask,
+                                               1);
                                }
                                break;
                        }
@@ -490,64 +588,65 @@ static void rds_ib_do_failover(struct rds_ib_device *rds_ibdev, u8 port)
        }
 }
 
-static void rds_ib_do_failback(struct rds_ib_device *rds_ibdev, u8 port)
+static void rds_ib_do_set_ip(struct rds_ib_device *rds_ibdev,
+                               u8 port)
 {
-       u8      active_port = rds_ibdev->ports[port].active_port;
+       int     ret;
        u8      j;
 
-       if (port != rds_ibdev->ports[port].active_port) {
-               if (rds_ib_move_ip(
-                       rds_ibdev->ports[active_port].if_name,
+       ret = rds_ib_set_ip(rds_ibdev->ports[port].dev,
+                       rds_ibdev->ports[port].dev->dev_addr,
                        rds_ibdev->ports[port].if_name,
                        rds_ibdev->ports[port].ip_addr,
+                       rds_ibdev->ports[port].ip_bcast,
+                       rds_ibdev->ports[port].ip_mask);
+
+       for (j = 0; j < rds_ibdev->ports[port].alias_cnt; j++) {
+               ret = rds_ib_set_ip(rds_ibdev->ports[port].dev,
+                               rds_ibdev->ports[port].dev->dev_addr,
+                               rds_ibdev->ports[port].aliases[j].if_name,
+                               rds_ibdev->ports[port].aliases[j].ip_addr,
+                               rds_ibdev->ports[port].aliases[j].ip_bcast,
+                               rds_ibdev->ports[port].aliases[j].ip_mask);
+       }
+}
+
+static void rds_ib_do_failback(struct rds_ib_device *rds_ibdev,
+                               u8 port)
+{
+       u8      ip_active_port = rds_ibdev->ports[port].ip_active_port;
+       u8      j;
+       int     ret;
+
+       if (port != rds_ibdev->ports[port].ip_active_port) {
+               if (!rds_ib_move_ip(
+                       rds_ibdev->ports[ip_active_port].dev,
+                       rds_ibdev->ports[port].dev->dev_addr,
+                       rds_ibdev->ports[ip_active_port].if_name,
+                       rds_ibdev->ports[port].if_name,
+                       rds_ibdev->ports[port].ip_addr,
+                       rds_ibdev->ports[port].ip_bcast,
+                       rds_ibdev->ports[port].ip_mask,
                        0)) {
-                       printk(KERN_ERR "RDS/IP: failed to move IP "
-                               "%u.%u.%u.%u from %s back to %s\n",
-                               NIPQUAD(rds_ibdev->ports[port].ip_addr),
-                               rds_ibdev->ports[active_port].if_name,
-                               rds_ibdev->ports[port].if_name);
-               } else {
-                       for (j = 0; j < rds_ibdev->ports[port].alias_cnt; j++) {
-                               if (rds_ib_move_ip(
-                                       rds_ibdev->ports[active_port].if_name,
+
+                       for (j = 0; j < rds_ibdev->ports[port].
+                               alias_cnt; j++) {
+
+                               ret = rds_ib_move_ip(
+                                       rds_ibdev->ports[ip_active_port].dev,
+                                       rds_ibdev->ports[port].
+                                               dev->dev_addr,
+                                       rds_ibdev->ports[ip_active_port].
+                                               if_name,
                                        rds_ibdev->ports[port].
                                                aliases[j].if_name,
                                        rds_ibdev->ports[port].
                                                aliases[j].ip_addr,
-                                       0)) {
-                                       printk(KERN_ERR "RDS/IP: failed to move alias IP "
-                                               "%u.%u.%u.%u from %s back to %s\n",
-                                               NIPQUAD(rds_ibdev->ports[port].aliases[j].ip_addr),
-                                               rds_ibdev->ports[active_port].if_name,
-                                               rds_ibdev->ports[port].aliases[j].if_name);
-                               }
-                       }
-                       rds_ibdev->ports[port].active_port = port;
-                       if (!rds_ibdev->ports[active_port].active_port) {
-                               if (rds_ib_move_ip(
-                                       rds_ibdev->ports[active_port].if_name,
-                                       rds_ibdev->ports[port].if_name,
-                                       rds_ibdev->ports[active_port].ip_addr,
-                                       1)) {
-                                       printk(KERN_ERR "RDS/IP: failed to move IP %u.%u.%u.%u from %s to %s\n",
-                                               NIPQUAD(rds_ibdev->ports[active_port].ip_addr),
-                                               rds_ibdev->ports[active_port].if_name,
-                                               rds_ibdev->ports[port].if_name);
-                               } else {
-                                       for (j = 0; j < rds_ibdev->ports[active_port].alias_cnt; j++) {
-                                               if (rds_ib_move_ip(
-                                                       rds_ibdev->ports[active_port].aliases[j].if_name,
-                                                       rds_ibdev->ports[port].if_name,
-                                                       rds_ibdev->ports[active_port].aliases[j].ip_addr,
-                                                       1)) {
-                                                       printk(KERN_ERR "RDS/IP: failed to move alias IP %u.%u.%u.%u from %s to %s\n",
-                                                               NIPQUAD(rds_ibdev->ports[active_port].aliases[j].ip_addr),
-                                                               rds_ibdev->ports[active_port].aliases[j].if_name,
-                                                               rds_ibdev->ports[port].if_name);
-                                               }
-                                       }
-                                       rds_ibdev->ports[active_port].active_port = port;
-                               }
+                                       rds_ibdev->ports[port].
+                                               aliases[j].ip_bcast,
+                                       rds_ibdev->ports[port].
+                                               aliases[j].ip_mask,
+                                       0);
                        }
                }
        }
@@ -558,12 +657,17 @@ static void rds_ib_failover(struct work_struct *_work)
        struct rds_ib_port_ud_work *work =
                container_of(_work, struct rds_ib_port_ud_work, work.work);
        struct rds_ib_device *rds_ibdev = work->rds_ibdev;
+       int ret;
 
        if (rds_ibdev->ports[work->port].ip_addr)
-               rds_ib_do_failover(rds_ibdev, work->port);
+               rds_ib_do_failover(rds_ibdev, work->port, 0);
 
-       if (rds_ibdev->ports[work->port].active_port == work->port)
-               rds_ibdev->ports[work->port].active_port = 0;
+       if (rds_ibdev->ports[work->port].ip_active_port == work->port) {
+               ret = rds_ib_set_ip(rds_ibdev->ports[work->port].dev,
+                               rds_ibdev->ports[work->port].dev->dev_addr,
+                               rds_ibdev->ports[work->port].if_name,
+                               0, 0, 0);
+       }
 
        kfree(work);
 }
@@ -573,40 +677,59 @@ static void rds_ib_failback(struct work_struct *_work)
        struct rds_ib_port_ud_work *work =
                container_of(_work, struct rds_ib_port_ud_work, work.work);
        struct rds_ib_device *rds_ibdev = work->rds_ibdev;
-       u8 i;
+       u8 i, port = work->port;
+       struct in_device *in_dev;
+
+       if (rds_ibdev->ports[port].ip_addr &&
+               rds_ibdev->ports[port].ip_active_port != port) {
 
-       if (rds_ibdev->ports[work->port].ip_addr &&
-               rds_ibdev->ports[work->port].active_port)
-               rds_ib_do_failback(rds_ibdev, work->port);
+               rds_ib_do_failback(rds_ibdev, port);
+       }
 
-       rds_ibdev->ports[work->port].active_port = work->port;
+       rds_ibdev->ports[port].ip_active_port = port;
+       in_dev = in_dev_get(rds_ibdev->ports[port].dev);
 
        for (i = 1; i <= rds_ibdev->dev->phys_port_cnt; i++) {
-               if (i != work->port && rds_ibdev->ports[i].ip_addr &&
-                       !rds_ibdev->ports[i].active_port) {
-                       rds_ib_do_failover(rds_ibdev, i);
+               if (rds_ibdev->ports[i].port_state == RDS_IB_PORT_DOWN &&
+                       i != port && rds_ibdev->ports[i].ip_addr) {
+
+                       if (rds_ibdev->ports[i].ip_active_port == i) {
+                               rds_ib_do_failover(rds_ibdev, i, 0);
+                       } else if (rds_ibdev->ports[i].ip_active_port == port) {
+                               if (in_dev && !in_dev->ifa_list &&
+                                       rds_ibdev->ports[port].ip_addr) {
+
+                                       rds_ib_do_set_ip(rds_ibdev, port);
+                               }
+
+                               rds_ib_do_failover(rds_ibdev, i, port);
+                       }
                }
        }
 
        kfree(work);
 }
 
-static void rds_ib_event_handler(struct ib_event_handler *handler, struct ib_event *event)
+static void rds_ib_event_handler(struct ib_event_handler *handler,
+                               struct ib_event *event)
 {
        struct rds_ib_device *rds_ibdev =
                container_of(handler, typeof(*rds_ibdev), event_handler);
        u8 port = event->element.port_num;
        struct rds_ib_port_ud_work *work;
 
-       if (!rds_ib_active_active_enabled)
+       if (!rds_ib_haip_enabled)
                return;
 
        if (event->event != IB_EVENT_PORT_ACTIVE &&
                event->event != IB_EVENT_PORT_ERR)
                return;
 
-       printk(KERN_NOTICE "RDS/IB: port %s/%d is %s\n", event->device->name,
-               port, (event->event == IB_EVENT_PORT_ACTIVE) ? "UP" : "DOWN");
+       printk(KERN_NOTICE "RDS/IB: %s/port_%d/%s is %s\n",
+               rds_ibdev->dev->name, port,
+               rds_ibdev->ports[port].if_name,
+               (event->event == IB_EVENT_PORT_ACTIVE) ?
+                       "ACTIVE" : "ERROR");
 
        work = kzalloc(sizeof *work, GFP_KERNEL);
        if (!work) {
@@ -618,11 +741,16 @@ static void rds_ib_event_handler(struct ib_event_handler *handler, struct ib_eve
        work->port = port;
 
        if (event->event == IB_EVENT_PORT_ACTIVE) {
-               INIT_DELAYED_WORK(&work->work, rds_ib_failback);
-               queue_delayed_work(rds_wq, &work->work, 0);
+               if (rds_ib_haip_fallback) {
+                       INIT_DELAYED_WORK(&work->work, rds_ib_failback);
+                       queue_delayed_work(rds_wq, &work->work, 0);
+               } else
+                       kfree(work);
+               rds_ibdev->ports[port].port_state = RDS_IB_PORT_UP;
        } else {
                INIT_DELAYED_WORK(&work->work, rds_ib_failover);
                queue_delayed_work(rds_wq, &work->work, 0);
+               rds_ibdev->ports[port].port_state = RDS_IB_PORT_DOWN;
        }
 }
 
@@ -634,10 +762,10 @@ static void rds_ib_check_down_port(void)
 
        list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
                for (i = 1; i <= rds_ibdev->dev->phys_port_cnt; i++) {
-                       if (!rds_ibdev->ports[i].active_port &&
+                       if (rds_ibdev->ports[i].port_state != RDS_IB_PORT_UP &&
                                rds_ibdev->ports[i].ip_addr) {
                                printk(KERN_NOTICE
-                                       "RDS/IB: port %s/%d is DOWN\n",
+                                       "RDS/IB: port %s/%d is NOT UP\n",
                                        rds_ibdev->dev->name, i);
 
                                work = kzalloc(sizeof *work, GFP_KERNEL);
@@ -656,126 +784,99 @@ static void rds_ib_check_down_port(void)
        flush_workqueue(rds_wq);
 }
 
-static void rds_ib_print_port(void)
+static void rds_ib_dump_ip_config(void)
 {
        struct rds_ib_device *rds_ibdev;
        int i, j;
 
+       if (!rds_ib_haip_enabled)
+               return;
+
+       printk(KERN_ERR "RDS/IB: IP configuration ...\n");
        list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
                for (i = 1; i <= rds_ibdev->dev->phys_port_cnt; i++) {
-                       if (!rds_ibdev->ports[i].ip_addr)
-                               continue;
-                       rdsdebug("Device %s / Port %d: name %s, "
-                               "IP %d.%d.%d.%d, active_port %d\n",
+                       printk(KERN_ERR "RDS/IB: %s/port_%d/%s: "
+                               "IP %d.%d.%d.%d/%d.%d.%d.%d/%d.%d.%d.%d "
+                               "state %s\n",
                                rds_ibdev->dev->name, i,
                                rds_ibdev->ports[i].if_name,
                                NIPQUAD(rds_ibdev->ports[i].ip_addr),
-                               rds_ibdev->ports[i].active_port);
+                               NIPQUAD(rds_ibdev->ports[i].ip_bcast),
+                               NIPQUAD(rds_ibdev->ports[i].ip_mask),
+                               (rds_ibdev->ports[i].port_state == RDS_IB_PORT_UP ? "UP" : "DOWN"));
 
                        for (j = 0; j < rds_ibdev->ports[i].alias_cnt; j++) {
-                               rdsdebug("Alias %s IP %d.%d.%d.%d\n",
+                               printk(KERN_ERR "Alias %s "
+                                       "IP %d.%d.%d.%d/%d.%d.%d.%d/%d.%d.%d.%d\n",
                                        rds_ibdev->ports[i].aliases[j].if_name,
                                        NIPQUAD(rds_ibdev->ports[i].
-                                               aliases[j].ip_addr));
+                                               aliases[j].ip_addr),
+                                       NIPQUAD(rds_ibdev->ports[i].
+                                               aliases[j].ip_bcast),
+                                       NIPQUAD(rds_ibdev->ports[i].
+                                               aliases[j].ip_mask));
                        }
                }
        }
 }
 
-static void rds_ib_check_up_port(void)
-{
-       struct net_device *dev;
-       int     downs;
-       int     retries = 0;
-
-retry:
-       downs = 0;
-       read_lock(&dev_base_lock);
-       for_each_netdev(&init_net, dev) {
-               if ((dev->type == ARPHRD_INFINIBAND) &&
-                       !(dev->flags & IFF_MASTER)) {
-                       if (dev->operstate != IF_OPER_UP)
-                               downs++;
-               }
-       }
-       read_unlock(&dev_base_lock);
-
-       if (downs) {
-               if (retries++ <= 60) {
-                       msleep(1000);
-                       goto retry;
-               } else {
-                       printk(KERN_ERR "RDS/IB: Some port(s) not operational\n");
-               }
-       }
-}
-
-static int rds_ib_init_port(void)
+static int rds_ib_setup_ports(void)
 {
        struct net_device *dev;
        struct in_ifaddr *ifa;
        struct in_ifaddr **ifap;
        struct in_device *in_dev;
-       struct rdma_cm_id *cm_id;
-       struct sockaddr_in sin;
-       struct rdma_dev_addr *dev_addr;
-       union ib_gid gid;
+       struct rds_ib_device *rds_ibdev;
        u8      port_num;
        int     ret = 0;
 
-       if (!rds_ib_active_active_enabled)
+       if (!rds_ib_haip_enabled)
                return ret;
 
-       rds_ib_check_up_port();
-
        read_lock(&dev_base_lock);
        for_each_netdev(&init_net, dev) {
                in_dev = in_dev_get(dev);
                if ((dev->type == ARPHRD_INFINIBAND) &&
+                       !(dev->flags & IFF_SLAVE) &&
                        !(dev->flags & IFF_MASTER) &&
                        in_dev) {
-                       for (ifap = &in_dev->ifa_list; (ifa = *ifap);
-                               ifap = &ifa->ifa_next) {
+                       union ib_gid gid;
 
-                               cm_id = rdma_create_id(NULL, NULL,
-                                                       RDMA_PS_TCP);
-                               ret = (IS_ERR(cm_id));
-                               if (ret) {
-                                       printk(KERN_ERR "RDS/IB: rdma_create_id failed\n");
-                                       goto out;
-                               }
-                               memset(&sin, 0, sizeof(sin));
-                               sin.sin_family = AF_INET;
-                               sin.sin_addr.s_addr = ifa->ifa_address;
-                               ret = rdma_bind_addr(cm_id,
-                                               (struct sockaddr *)&sin);
-                               if (ret) {
-                                       printk(KERN_ERR "RDS/IB: rdma_bind_addr failed\n");
-                                       rdma_destroy_id(cm_id);
-                                       goto out;
-                               }
-                               dev_addr = &cm_id->route.addr.dev_addr;
-                               memcpy(&gid, dev_addr->src_dev_addr +
-                                       rdma_addr_gid_offset(dev_addr), sizeof gid);
-                               ret = ib_find_cached_gid(cm_id->device, &gid,
-                                                       &port_num, NULL);
-                               if (ret) {
-                                       printk(KERN_ERR "RDS/IB: ib_find_cached_gid failed\n");
-                                       rdma_destroy_id(cm_id);
-                                       goto out;
-                               }
+                       memcpy(&gid, dev->dev_addr + 4, sizeof gid);
+
+                       rcu_read_lock();
+                       list_for_each_entry_rcu(rds_ibdev,
+                                       &rds_ib_devices, list) {
+                               ret = ib_find_cached_gid(rds_ibdev->dev,
+                                       &gid, &port_num, NULL);
+                               if (!ret)
+                                       break;
+                       }
+                       rcu_read_unlock();
+
+                       if (!port_num) {
+                               printk(KERN_ERR "RDS/IB: GID "RDS_IB_GID_FMT
+                                       " has no associated port\n",
+                                       RDS_IB_GID_ARG(gid));
+                               ret = 1;
+                               goto out;
+                       }
 
-                               rds_ib_set_port(cm_id->device, dev,
-                                               ifa->ifa_label, port_num,
-                                               ifa->ifa_address);
+                       rds_ib_init_port(rds_ibdev, dev, port_num);
 
-                               rdma_destroy_id(cm_id);
+                       for (ifap = &in_dev->ifa_list; (ifa = *ifap);
+                               ifap = &ifa->ifa_next) {
+                               rds_ib_set_port(rds_ibdev, dev,
+                                       ifa->ifa_label, port_num,
+                                       ifa->ifa_address,
+                                       ifa->ifa_broadcast,
+                                       ifa->ifa_mask);
                        }
                }
        }
 
        rds_ib_check_down_port();
-       rds_ib_print_port();
+       rds_ib_dump_ip_config();
 out:
        read_unlock(&dev_base_lock);
        return ret;
@@ -814,14 +915,14 @@ void rds_ib_add_one(struct ib_device *device)
        rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
 
        rds_ibdev->max_1m_fmrs = dev_attr->max_fmr ?
-               min_t(unsigned int, (dev_attr->max_fmr / 2),
-                       rds_ib_fmr_1m_pool_size) :
-                       rds_ib_fmr_1m_pool_size;
+               min_t(unsigned int, dev_attr->max_fmr,
+                       RDS_FMR_1M_POOL_SIZE) :
+                       RDS_FMR_1M_POOL_SIZE;
 
        rds_ibdev->max_8k_fmrs = dev_attr->max_fmr ?
-               min_t(unsigned int, ((dev_attr->max_fmr / 2) * 128),
-                       rds_ib_fmr_8k_pool_size) :
-                       rds_ib_fmr_8k_pool_size;
+               min_t(unsigned int, dev_attr->max_fmr,
+                       RDS_FMR_8K_POOL_SIZE) :
+                       RDS_FMR_8K_POOL_SIZE;
 
        rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
        rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
@@ -833,7 +934,7 @@ void rds_ib_add_one(struct ib_device *device)
                goto put_dev;
        }
 
-       if (rds_ib_active_active_enabled) {
+       if (rds_ib_haip_enabled) {
                rds_ibdev->ports = kzalloc(sizeof(struct rds_ib_port) *
                                        (device->phys_port_cnt + 1), GFP_KERNEL);
                if (!rds_ibdev->ports) {
@@ -940,6 +1041,74 @@ struct rds_transport rds_ib_transport = {
        .t_type                 = RDS_TRANS_IB
 };
 
+/*
+ * Netdevice notifier.  On NETDEV_UP/NETDEV_DOWN for an interface whose name
+ * matches a port's if_name, record the port state and queue failover work, or
+ * failback work when rds_ib_haip_fallback is set.  Always returns NOTIFY_DONE.
+ * NOTE(review): rds_ibdev is used after rcu_read_unlock(); confirm lifetime.
+ */
+static int rds_ib_netdev_callback(struct notifier_block *self, unsigned long event, void *ctx)
+{
+       struct net_device *ndev = netdev_notifier_info_to_dev(ctx);
+       struct rds_ib_device *rds_ibdev;
+       struct rds_ib_port_ud_work *work;
+       int port = 0;
+       int i;
+
+       if (!rds_ib_haip_enabled)
+               return NOTIFY_DONE;
+
+       if (event != NETDEV_UP && event != NETDEV_DOWN)
+               return NOTIFY_DONE;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) {
+               for (i = 1; !port && i <= rds_ibdev->dev->phys_port_cnt; i++) {
+                       if (!strcmp(ndev->name, rds_ibdev->ports[i].if_name))
+                               port = i;
+               }
+               if (port)
+                       break;
+       }
+       /* Unlock on every path: the GFP_KERNEL allocation below may sleep. */
+       rcu_read_unlock();
+       if (!port)
+               return NOTIFY_DONE;
+
+       printk(KERN_NOTICE "RDS/IB: %s/port_%d/%s is %s\n",
+               rds_ibdev->dev->name, port, ndev->name,
+               (event == NETDEV_UP) ? "UP" : "DOWN");
+
+       if (event == NETDEV_UP && !rds_ib_haip_fallback) {
+               /* Failback disabled: no work item needed, just record state. */
+               rds_ibdev->ports[port].port_state = RDS_IB_PORT_UP;
+               return NOTIFY_DONE;
+       }
+
+       work = kzalloc(sizeof *work, GFP_KERNEL);
+       if (!work) {
+               printk(KERN_ERR "RDS/IB: failed to allocate port work\n");
+               return NOTIFY_DONE;
+       }
+       work->rds_ibdev = rds_ibdev;
+       work->dev = ndev;
+       work->port = port;
+       if (event == NETDEV_UP) {
+               INIT_DELAYED_WORK(&work->work, rds_ib_failback);
+               queue_delayed_work(rds_wq, &work->work, msecs_to_jiffies(100));
+               rds_ibdev->ports[port].port_state = RDS_IB_PORT_UP;
+       } else {
+               INIT_DELAYED_WORK(&work->work, rds_ib_failover);
+               queue_delayed_work(rds_wq, &work->work, 0);
+               rds_ibdev->ports[port].port_state = RDS_IB_PORT_DOWN;
+       }
+       return NOTIFY_DONE;
+}
+
+static struct notifier_block rds_ib_nb = {
+       .notifier_call = rds_ib_netdev_callback
+};
+
 int rds_ib_init(void)
 {
        int ret;
@@ -983,7 +1152,7 @@ int rds_ib_init(void)
 
        rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
 
-       ret = rds_ib_init_port();
+       ret = rds_ib_setup_ports();
        if (ret) {
                printk(KERN_ERR "RDS/IB: failed to init port\n");
                goto out_srq;
@@ -995,6 +1164,8 @@ int rds_ib_init(void)
                goto out_srq;
        }
 
+       register_netdevice_notifier(&rds_ib_nb);
+
        goto out;
 
 out_srq:
index 3c1ef0e4decb3839e9f4d8da381b3f954d81db45..35d82cd08c93de924c2b88c777cdcf1b5907e60b 100644 (file)
@@ -9,9 +9,9 @@
 #include "rdma_transport.h"
 
 #define RDS_FMR_1M_POOL_SIZE           (8192 / 2)
-#define RDS_FMR_1M_MSG_SIZE            256
-#define RDS_FMR_8K_POOL_SIZE            128 * (8192 / 2)
-#define RDS_FMR_8K_MSG_SIZE            2
+#define RDS_FMR_1M_MSG_SIZE            256  /* 1M */
+#define RDS_FMR_8K_MSG_SIZE             2
+#define RDS_FMR_8K_POOL_SIZE            ((256 / (RDS_FMR_8K_MSG_SIZE + 1)) * (8192 / 2))
 
 #define RDS_IB_MAX_SGE                 8
 #define RDS_IB_RECV_SGE                        2
@@ -19,8 +19,8 @@
 #define RDS_IB_DEFAULT_RECV_WR         1024
 #define RDS_IB_DEFAULT_SEND_WR         256
 #define RDS_IB_DEFAULT_SRQ_MAX_WR       4096
-#define RDS_IB_DEFAULT_SRQ_REFILL_WR   (RDS_IB_DEFAULT_SRQ_MAX_WR/2)
-#define RDS_IB_DEFAULT_SRQ_LOW_WR      (RDS_IB_DEFAULT_SRQ_MAX_WR/10)
+#define RDS_IB_DEFAULT_SRQ_HWM_REFILL  (RDS_IB_DEFAULT_SRQ_MAX_WR/2)
+#define RDS_IB_DEFAULT_SRQ_LWM_REFILL  (RDS_IB_DEFAULT_SRQ_MAX_WR/10)
 
 #define RDS_IB_DEFAULT_RETRY_COUNT     1
 
@@ -256,13 +256,25 @@ struct rds_ib_srq {
 struct rds_ib_alias {
        char                    if_name[IFNAMSIZ];
        __be32                  ip_addr;
+       __be32                  ip_bcast;
+       __be32                  ip_mask;
+};
+
+enum {
+       RDS_IB_PORT_UNKNOWN = 0,
+       RDS_IB_PORT_UP,
+       RDS_IB_PORT_DOWN,
 };
 
 #define RDS_IB_MAX_ALIASES     200
 struct rds_ib_port {
+       struct net_device       *dev;
        char                    if_name[IFNAMSIZ];
+       unsigned int            port_state;
        __be32                  ip_addr;
-       unsigned int            active_port;
+       __be32                  ip_bcast;
+       __be32                  ip_mask;
+       unsigned int            ip_active_port;
        unsigned int            alias_cnt;
        struct rds_ib_alias     aliases[RDS_IB_MAX_ALIASES];
 };
@@ -270,12 +282,13 @@ struct rds_ib_port {
 struct rds_ib_port_ud_work {
        struct delayed_work             work;
        struct rds_ib_device            *rds_ibdev;
+       struct net_device               *dev;
        unsigned int                    port;
 };
 
 enum {
-        RDS_IB_MR_8K_POOL,
-        RDS_IB_MR_1M_POOL,
+       RDS_IB_MR_8K_POOL,
+       RDS_IB_MR_1M_POOL,
 };
 
 struct rds_ib_device {
@@ -405,8 +418,11 @@ extern unsigned int rds_ib_fmr_1m_pool_size;
 extern unsigned int rds_ib_fmr_8k_pool_size;
 extern unsigned int rds_ib_retry_count;
 extern unsigned int rds_ib_rnr_retry_count;
-extern unsigned int rds_ib_apm_enable;
-extern unsigned int rds_ib_timeout;
+extern unsigned int rds_ib_apm_enabled;
+extern unsigned int rds_ib_apm_fallback;
+extern unsigned int rds_ib_haip_enabled;
+extern unsigned int rds_ib_haip_fallback;
+extern unsigned int rds_ib_apm_timeout;
 
 extern spinlock_t ib_nodev_conns_lock;
 extern struct list_head ib_nodev_conns;
@@ -512,8 +528,8 @@ unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
 
 /* ib_recv.c */
 extern unsigned int rds_ib_srq_max_wr;
-extern unsigned int rds_ib_srq_refill_wr;
-extern unsigned int rds_ib_srq_low_wr;
+extern unsigned int rds_ib_srq_hwm_refill;
+extern unsigned int rds_ib_srq_lwm_refill;
 extern unsigned int rds_ib_srq_enabled;
 
 /* ib_sysctl.c */
index ab8628d1a2ff0cb79e2545ed505c7681944b437b..82abce6a81c519831e960589b967cd13e2e86f9a 100644 (file)
@@ -215,7 +215,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
        if (dp && dp->dp_ack_seq)
                rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
 
-       if (rds_ib_apm_enable && !ic->conn->c_reconnect) {
+       if (rds_ib_apm_enabled && !ic->conn->c_reconnect) {
                memcpy(&ic->i_pri_path.p_sgid,
                        &ic->i_cm_id->route.path_rec[0].sgid,
                        sizeof(union ib_gid));
@@ -389,7 +389,7 @@ void rds_ib_tasklet_fn_recv(unsigned long data)
 
        if (rds_ib_srq_enabled)
                if ((atomic_read(&rds_ibdev->srq->s_num_posted) <
-                                       rds_ib_srq_refill_wr) &&
+                                       rds_ib_srq_hwm_refill) &&
                        !test_and_set_bit(0, &rds_ibdev->srq->s_refill_gate))
                                queue_delayed_work(rds_wq, &rds_ibdev->srq->s_refill_w, 0);
 }
@@ -773,14 +773,14 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
                event->param.conn.responder_resources,
                event->param.conn.initiator_depth);
 
-       if (rds_ib_apm_enable)
-               rdma_set_timeout(cm_id, rds_ib_timeout);
+       if (rds_ib_apm_enabled)
+               rdma_set_timeout(cm_id, rds_ib_apm_timeout);
 
        /* rdma_accept() calls rdma_reject() internally if it fails */
        err = rdma_accept(cm_id, &conn_param);
        if (err)
                rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
-       else if (rds_ib_apm_enable && !conn->c_loopback) {
+       else if (rds_ib_apm_enabled && !conn->c_loopback) {
                err = rdma_enable_apm(cm_id, RDMA_ALT_PATH_BEST);
                if (err)
                        printk(KERN_WARNING "RDS/IB: APM couldn't be enabled for passive side: %d\n", err);
@@ -803,7 +803,7 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
        struct rds_ib_connect_private dp;
        int ret;
 
-       if (rds_ib_apm_enable && !conn->c_loopback) {
+       if (rds_ib_apm_enabled && !conn->c_loopback) {
                ret = rdma_enable_apm(cm_id, RDMA_ALT_PATH_BEST);
                if (ret)
                        printk(KERN_WARNING "RDS/IB: APM couldn't be enabled for active side: %d\n", ret);
@@ -850,6 +850,9 @@ static void rds_ib_migrate(struct work_struct *_work)
        struct rdma_cm_id *cm_id = ic->i_cm_id;
        int ret = 0;
 
+       if (!rds_ib_apm_fallback)
+               return;
+
        if (!ic->i_active_side) {
                ret = ib_query_qp(cm_id->qp, &qp_attr, IB_QP_PATH_MIG_STATE,
                                &qp_init_attr);
@@ -888,7 +891,7 @@ void rds_ib_check_migration(struct rds_connection *conn,
        struct rdma_cm_id *cm_id = ic->i_cm_id;
        int err;
 
-       if (!rds_ib_apm_enable || !rds_conn_up(ic->conn))
+       if (!rds_ib_apm_enabled || !rds_conn_up(ic->conn))
                return ;
 
        ic->i_alt_path_index = event->param.ud.alt_path_index;
@@ -912,7 +915,7 @@ void rds_ib_check_migration(struct rds_connection *conn,
                printk(KERN_ERR "RDS/IB: ib_query_qp failed (%d)\n", err);
                return;
        }
-       qp_attr.alt_timeout = rds_ib_timeout;
+       qp_attr.alt_timeout = rds_ib_apm_timeout;
        err = ib_modify_qp(cm_id->qp, &qp_attr, IB_QP_ALT_PATH);
        if (err) {
                printk(KERN_ERR "RDS/IB: ib_modify_qp failed (%d)\n", err);
index f56380969b2ed984fcbbfa416f914fd0b076bf0a..6daccb10f8f0d70f579d61123e0e6dc8beb5cbf8 100644 (file)
@@ -242,10 +242,10 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
 
        if (pool_type == RDS_IB_MR_1M_POOL) {
                pool->fmr_attr.max_pages = RDS_FMR_1M_MSG_SIZE + 1;
-               pool->max_items = rds_ib_fmr_1m_pool_size;
+               pool->max_items = rds_ibdev->max_1m_fmrs;
        } else /* pool_type == RDS_IB_MR_8K_POOL */ {
                pool->fmr_attr.max_pages = RDS_FMR_8K_MSG_SIZE + 1;
-               pool->max_items = rds_ib_fmr_8k_pool_size;
+               pool->max_items = rds_ibdev->max_8k_fmrs;
        }
        pool->max_free_pinned =
                pool->max_items * pool->fmr_attr.max_pages / 4;
@@ -316,6 +316,7 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev,
 {
        struct rds_ib_mr_pool *pool;
        struct rds_ib_mr *ibmr = NULL;
+       struct rds_ib_mr *tmp_ibmr = NULL;
        int err = 0, iter = 0;
 
        if (npages <= RDS_FMR_8K_MSG_SIZE)
@@ -379,6 +380,25 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev,
                        &pool->fmr_attr);
        if (IS_ERR(ibmr->fmr)) {
                err = PTR_ERR(ibmr->fmr);
+
+               /* Adjust the pool size to reflect the resources available to
+                * the VM.
+                */
+               if (err == -ENOMEM) {
+                       int prev_max = pool->max_items;
+
+                       pool->max_items = max(atomic_read(&pool->item_count),
+                                               RDS_FMR_1M_POOL_SIZE);
+
+                       printk(KERN_ERR "RDS/IB: Adjusted FMR pool (%d->%d)\n",
+                               prev_max, pool->max_items);
+
+                       rds_ib_flush_mr_pool(pool, 0, &tmp_ibmr);
+                       if (tmp_ibmr) {
+                               kfree(ibmr);
+                               return tmp_ibmr;
+                       }
+               }
                ibmr->fmr = NULL;
                printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err);
                goto out_no_cigar;
index af0893e2f7570d33cd057a61bff252096a2156a2..d7636d671877b8f3696cd417e2324604ca6f933f 100644 (file)
 #include "ib.h"
 
 unsigned int rds_ib_srq_max_wr = RDS_IB_DEFAULT_SRQ_MAX_WR;
-unsigned int rds_ib_srq_refill_wr = RDS_IB_DEFAULT_SRQ_REFILL_WR;
-unsigned int rds_ib_srq_low_wr = RDS_IB_DEFAULT_SRQ_LOW_WR;
+unsigned int rds_ib_srq_hwm_refill = RDS_IB_DEFAULT_SRQ_HWM_REFILL;
+unsigned int rds_ib_srq_lwm_refill = RDS_IB_DEFAULT_SRQ_LWM_REFILL;
 unsigned int rds_ib_srq_enabled = 0;
 
 module_param(rds_ib_srq_enabled, int, 0444);
 MODULE_PARM_DESC(rds_ib_srq_enabled, "Set to enabled SRQ");
 module_param(rds_ib_srq_max_wr, int, 0444);
 MODULE_PARM_DESC(rds_ib_srq_max_wr, "Max number of SRQ WRs");
-module_param(rds_ib_srq_refill_wr, int, 0444);
-MODULE_PARM_DESC(rds_ib_srq_refill_wr, "SRQ refill watermark");
-module_param(rds_ib_srq_low_wr, int, 0444);
-MODULE_PARM_DESC(rds_ib_srq_low_wr, "SRQ low watermark");
+module_param(rds_ib_srq_hwm_refill, int, 0444);
+MODULE_PARM_DESC(rds_ib_srq_hwm_refill, "SRQ HWM refill");
+module_param(rds_ib_srq_lwm_refill, int, 0444);
+MODULE_PARM_DESC(rds_ib_srq_lwm_refill, "SRQ LWM refill");
 
 static struct kmem_cache *rds_ib_incoming_slab;
 static struct kmem_cache *rds_ib_frag_slab;
@@ -1477,7 +1477,7 @@ void rds_ib_srq_rearm(struct work_struct *work)
        struct rds_ib_srq *srq = container_of(work, struct rds_ib_srq, s_rearm_w.work);
        struct ib_srq_attr srq_attr;
 
-       srq_attr.srq_limit = rds_ib_srq_low_wr;
+       srq_attr.srq_limit = rds_ib_srq_lwm_refill;
        if (ib_modify_srq(srq->s_srq, &srq_attr, IB_SRQ_LIMIT)) {
                printk(KERN_ERR "RDS: ib_modify_srq failed\n");
                return;
index 1e2f7c39dbf60f65bf393b14be627bc3d66666a5..8808f35756e98460d7af82bf9a54bee25678df29 100644 (file)
@@ -84,8 +84,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
        case RDMA_CM_EVENT_ADDR_RESOLVED:
                rdma_set_service_type(cm_id, conn->c_tos);
 
-               if (rds_ib_apm_enable)
-                       rdma_set_timeout(cm_id, rds_ib_timeout);
+               if (rds_ib_apm_enabled)
+                       rdma_set_timeout(cm_id, rds_ib_apm_timeout);
 
                /* XXX do we need to clean up if this fails? */
                ret = rdma_resolve_route(cm_id,
@@ -161,7 +161,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
                break;
 
        case RDMA_CM_EVENT_ADDR_CHANGE:
-               if (conn && !rds_ib_apm_enable)
+               if (conn && !rds_ib_apm_enabled)
                        rds_conn_drop(conn);
                break;