#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/if_arp.h>
+#include <net/arp.h>
#include <linux/delay.h>
#include <rdma/ib_cache.h>
#include <net/sock.h>
#include <net/inet_common.h>
+#include <linux/rtnetlink.h>
#include "rds.h"
#include "ib.h"
+#include <linux/time.h>
unsigned int rds_ib_fmr_1m_pool_size = RDS_FMR_1M_POOL_SIZE;
unsigned int rds_ib_fmr_8k_pool_size = RDS_FMR_8K_POOL_SIZE;
unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
-unsigned int rds_ib_apm_enable = 0;
-unsigned int rds_ib_active_active_enabled = 0;
-unsigned int rds_ib_timeout = RDS_IB_DEFAULT_TIMEOUT;
+unsigned int rds_ib_apm_enabled = 0;
+unsigned int rds_ib_apm_fallback = 1;
+unsigned int rds_ib_haip_enabled = 0;
+unsigned int rds_ib_haip_fallback = 1;
+unsigned int rds_ib_apm_timeout = RDS_IB_DEFAULT_TIMEOUT;
unsigned int rds_ib_rnr_retry_count = RDS_IB_DEFAULT_RNR_RETRY_COUNT;
module_param(rds_ib_fmr_1m_pool_size, int, 0444);
MODULE_PARM_DESC(rds_ib_fmr_8k_pool_size, " Max number of 8k fmr per HCA");
module_param(rds_ib_retry_count, int, 0444);
MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
-module_param(rds_ib_apm_enable, int, 0444);
-MODULE_PARM_DESC(rds_ib_apm_enable, " Enable APM");
-module_param(rds_ib_active_active_enabled, int, 0444);
-MODULE_PARM_DESC(rds_ib_active_active_enabled, " Active/Active enabled");
-module_param(rds_ib_timeout, int, 0444);
-MODULE_PARM_DESC(rds_ib_timeout, " QP timeout");
+module_param(rds_ib_apm_enabled, int, 0444);
+MODULE_PARM_DESC(rds_ib_apm_enabled, " APM Enabled");
+module_param(rds_ib_haip_enabled, int, 0444);
+MODULE_PARM_DESC(rds_ib_haip_enabled, " High Availability IP enabled");
+module_param(rds_ib_apm_timeout, int, 0444);
+MODULE_PARM_DESC(rds_ib_apm_timeout, " APM timeout");
module_param(rds_ib_rnr_retry_count, int, 0444);
-MODULE_PARM_DESC(rds_ib_timeout, " QP rnr retry count");
+MODULE_PARM_DESC(rds_ib_rnr_retry_count, " QP rnr retry count");
+module_param(rds_ib_apm_fallback, int, 0444);
+MODULE_PARM_DESC(rds_ib_apm_fallback, " APM failback enabled");
+module_param(rds_ib_haip_fallback, int, 0444);
+MODULE_PARM_DESC(rds_ib_haip_fallback, " HAIP failback Enabled");
+
+
/*
* we have a clumsy combination of RCU and a rwsem protecting this list
return ret;
}
-static int rds_ib_move_ip(char *from_dev, char *to_dev, __be32 addr, int failover)
+static void rds_ib_send_gratuitous_arp(struct net_device *out_dev,
+ unsigned char *dev_addr,
+ __be32 ip_addr)
+{
+ arp_send(ARPOP_REQUEST, ETH_P_ARP,
+ ip_addr, out_dev,
+ ip_addr, NULL,
+ dev_addr, NULL);
+}
+
+static int rds_ib_set_ip(struct net_device *out_dev,
+ unsigned char *dev_addr,
+ char *if_name,
+ __be32 addr,
+ __be32 bcast,
+ __be32 mask)
{
struct ifreq *ir;
struct sockaddr_in *sin;
- __be32 down_ip, down_bcast, down_mask;
struct page *page;
- char from_dev2[2*IFNAMSIZ + 1];
- char to_dev2[2*IFNAMSIZ + 1];
int ret = 0;
page = alloc_page(GFP_HIGHUSER);
if (!page) {
printk(KERN_ERR "RDS/IB: alloc_page failed .. NO MEM\n");
- ret = -ENOMEM;
- goto out;
+ return 1;
}
ir = (struct ifreq *)kmap(page);
memset(ir, 0, sizeof(struct ifreq));
sin = (struct sockaddr_in *)&ir->ifr_addr;
+ sin->sin_family = AF_INET;
- if (failover) {
- strcpy(to_dev2, to_dev);
- strcat(to_dev2, ":");
- strcat(to_dev2, from_dev);
- to_dev2[IFNAMSIZ-1] = 0;
- strcpy(from_dev2, from_dev);
- } else {
- strcpy(from_dev2, from_dev);
- strcat(from_dev2, ":");
- strcat(from_dev2, to_dev);
- from_dev2[IFNAMSIZ-1] = 0;
- strcpy(to_dev2, to_dev);
- }
+ strcpy(ir->ifr_ifrn.ifrn_name, if_name);
- strcpy(ir->ifr_ifrn.ifrn_name, from_dev2);
- ret = inet_ioctl(rds_ib_inet_socket, SIOCGIFADDR, (unsigned long) ir);
+ sin->sin_addr.s_addr = addr;
+ ret = inet_ioctl(rds_ib_inet_socket, SIOCSIFADDR, (unsigned long) ir);
if (ret) {
printk(KERN_ERR
- "RDS/IB: inet_ioctl(SIOCGIFADDR) failed (%d)\n",
+ "RDS/IB: inet_ioctl(SIOCSIFADDR) failed (%d)\n",
ret);
goto out;
}
- down_ip = sin->sin_addr.s_addr;
- if (addr != down_ip) {
- printk(KERN_ERR
- "RDS/IP: %u.%u.%u.%u not configured on %s\n",
- NIPQUAD(addr), ir->ifr_ifrn.ifrn_name);
+
+ if (!addr)
goto out;
- }
- ret = inet_ioctl(rds_ib_inet_socket, SIOCGIFBRDADDR, (unsigned long) ir);
+ sin->sin_addr.s_addr = bcast;
+ ret = inet_ioctl(rds_ib_inet_socket, SIOCSIFBRDADDR,
+ (unsigned long) ir);
if (ret) {
printk(KERN_ERR
- "RDS/IB: inet_ioctl(SIOCGIFBRDADDR) failed (%d)\n",
+ "RDS/IB: inet_ioctl(SIOCSIFBRDADDR) failed (%d)\n",
ret);
goto out;
}
- down_bcast = sin->sin_addr.s_addr;
- ret = inet_ioctl(rds_ib_inet_socket, SIOCGIFNETMASK, (unsigned long) ir);
+ sin->sin_addr.s_addr = mask;
+ ret = inet_ioctl(rds_ib_inet_socket, SIOCSIFNETMASK,
+ (unsigned long) ir);
if (ret) {
printk(KERN_ERR
- "RDS/IB: inet_ioctl(SIOCGIFNETMASK) failed (%d)\n",
+ "RDS/IB: inet_ioctl(SIOCSIFBRDADDR) failed (%d)\n",
ret);
goto out;
}
- down_mask = sin->sin_addr.s_addr;
- /* Clear IP on down Interface */
- sin->sin_addr.s_addr = 0;
- ret = inet_ioctl(rds_ib_inet_socket, SIOCSIFADDR, (unsigned long) ir);
- if (ret) {
- printk(KERN_ERR
- "RDS/IB: inet_ioctl(SIOCSIFADDR) failed (%d)\n",
- ret);
- goto out;
+ rds_ib_send_gratuitous_arp(out_dev, dev_addr, addr);
+
+out:
+ kunmap(page);
+ __free_page(page);
+
+ return ret;
+}
+
+static int rds_ib_move_ip(struct net_device *out_dev,
+ unsigned char *dev_addr,
+ char *from_dev,
+ char *to_dev,
+ __be32 addr,
+ __be32 bcast,
+ __be32 mask,
+ int failover)
+{
+ struct rds_ib_device *rds_ibdev;
+ struct ifreq *ir;
+ struct sockaddr_in *sin;
+ struct page *page;
+ char from_dev2[2*IFNAMSIZ + 1];
+ char to_dev2[2*IFNAMSIZ + 1];
+ int i, ret = 0;
+ char *from_colon, *to_colon;
+ int from_passive = 0, to_passive = 0;
+
+ page = alloc_page(GFP_HIGHUSER);
+ if (!page) {
+ printk(KERN_ERR "RDS/IB: alloc_page failed .. NO MEM\n");
+ return 1;
}
+ ir = (struct ifreq *)kmap(page);
memset(ir, 0, sizeof(struct ifreq));
- strcpy(ir->ifr_ifrn.ifrn_name, to_dev2);
- sin->sin_family = AF_INET;
+ sin = (struct sockaddr_in *)&ir->ifr_addr;
- sin->sin_addr.s_addr = down_ip;
- ret = inet_ioctl(rds_ib_inet_socket, SIOCSIFADDR, (unsigned long) ir);
- if (ret) {
- printk(KERN_ERR
- "RDS/IB: inet_ioctl(SIOCSIFADDR) failed (%d)\n",
- ret);
- goto out;
+ from_colon = strchr(from_dev, ':');
+ to_colon = strchr(to_dev, ':');
+ if (!from_colon && !to_colon) {
+ rcu_read_lock();
+ list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
+ for (i = 1; i <= rds_ibdev->dev->phys_port_cnt; i++) {
+ if (!strcmp(from_dev,
+ rds_ibdev->ports[i].if_name) &&
+ !rds_ibdev->ports[i].ip_addr) {
+ from_passive = 1;
+ }
+
+ if (!strcmp(to_dev,
+ rds_ibdev->ports[i].if_name) &&
+ !rds_ibdev->ports[i].ip_addr) {
+ to_passive = 1;
+ }
+ }
+ }
+ rcu_read_unlock();
}
- sin->sin_addr.s_addr = down_bcast;
- ret = inet_ioctl(rds_ib_inet_socket, SIOCSIFBRDADDR, (unsigned long) ir);
+ if (failover) {
+ if (to_passive) {
+ strcpy(to_dev2, to_dev);
+ } else {
+ strcpy(to_dev2, to_dev);
+ strcat(to_dev2, ":");
+ strcat(to_dev2, from_dev);
+ to_dev2[IFNAMSIZ-1] = 0;
+ }
+ strcpy(from_dev2, from_dev);
+ } else {
+ if (from_passive) {
+ strcpy(from_dev2, from_dev);
+ } else {
+ strcpy(from_dev2, from_dev);
+ strcat(from_dev2, ":");
+ strcat(from_dev2, to_dev);
+ from_dev2[IFNAMSIZ-1] = 0;
+ }
+ strcpy(to_dev2, to_dev);
+ }
+
+ /* Clear IP on from Interface */
+ sin->sin_addr.s_addr = 0;
+ sin->sin_family = AF_INET;
+ strcpy(ir->ifr_ifrn.ifrn_name, from_dev2);
+ ret = inet_ioctl(rds_ib_inet_socket, SIOCSIFADDR, (unsigned long) ir);
if (ret) {
printk(KERN_ERR
- "RDS/IB: inet_ioctl(SIOCSIFBRDADDR) failed (%d)\n",
- ret);
- goto out;
+ "RDS/IB: inet_ioctl(SIOCSIFADDR,%s) failed (%d)\n",
+ ir->ifr_ifrn.ifrn_name, ret);
}
- sin->sin_addr.s_addr = down_mask;
- ret = inet_ioctl(rds_ib_inet_socket, SIOCSIFNETMASK, (unsigned long) ir);
+ ret = rds_ib_set_ip(out_dev, dev_addr, to_dev2, addr, bcast, mask);
+
if (ret) {
- printk(KERN_ERR
- "RDS/IB: inet_ioctl(SIOCSIFBRDADDR) failed (%d)\n",
- ret);
- goto out;
+ if (failover)
+ printk(KERN_NOTICE
+ "RDS/IP: failed to move IP %u.%u.%u.%u "
+ "from %s to %s\n",
+ NIPQUAD(addr), from_dev2, to_dev2);
+ else
+ printk(KERN_NOTICE
+ "RDS/IP: failed to move IP %u.%u.%u.%u "
+ "from %s back to %s\n",
+ NIPQUAD(addr), from_dev2, to_dev2);
+ } else {
+ if (failover)
+ printk(KERN_NOTICE
+ "RDS/IB: IP %u.%u.%u.%u migrated over to %s\n",
+ NIPQUAD(addr), to_dev2);
+ else
+ printk(KERN_NOTICE
+ "RDS/IB: IP %u.%u.%u.%u migrated back to %s\n",
+ NIPQUAD(addr), to_dev2);
}
- if (failover)
- printk(KERN_NOTICE
- "RDS/IB: IP %u.%u.%u.%u migrated over to %s\n",
- NIPQUAD(down_ip), ir->ifr_ifrn.ifrn_name);
- else
- printk(KERN_NOTICE
- "RDS/IB: IP %u.%u.%u.%u migrated back to %s\n",
- NIPQUAD(down_ip), ir->ifr_ifrn.ifrn_name);
-out:
kunmap(page);
__free_page(page);
return ret;
}
-static void rds_ib_set_port(struct ib_device *ib_dev, struct net_device *net_dev, char *if_name, u8 port_num, __be32 ip_addr)
+static void rds_ib_init_port(struct rds_ib_device *rds_ibdev,
+ struct net_device *net_dev,
+ u8 port_num)
{
- struct rds_ib_device *rds_ibdev;
- u8 active_port;
- unsigned int idx;
+ strcpy(rds_ibdev->ports[port_num].if_name, net_dev->name);
+ rds_ibdev->ports[port_num].dev = net_dev;
+ rds_ibdev->ports[port_num].ip_active_port = 0;
- active_port = net_dev->operstate == IF_OPER_UP ? port_num : 0;
+ if (net_dev->operstate == IF_OPER_UP)
+ rds_ibdev->ports[port_num].port_state = RDS_IB_PORT_UP;
+ else
+ rds_ibdev->ports[port_num].port_state = RDS_IB_PORT_DOWN;
+}
- list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
- if (rds_ibdev->dev == ib_dev) {
- if (!strcmp(net_dev->name, if_name)) {
- strcpy(rds_ibdev->ports[port_num].if_name,
- if_name);
- rds_ibdev->ports[port_num].ip_addr = ip_addr;
- rds_ibdev->ports[port_num].active_port =
- active_port;
- } else {
- idx = rds_ibdev->ports[port_num].alias_cnt++;
- strcpy(rds_ibdev->ports[port_num].
- aliases[idx].if_name, if_name);
- rds_ibdev->ports[port_num].
- aliases[idx].ip_addr = ip_addr;
- }
- break;
- }
+static void rds_ib_set_port(struct rds_ib_device *rds_ibdev,
+ struct net_device *net_dev,
+ char *if_name, u8 port_num,
+ __be32 ip_addr,
+ __be32 ip_bcast,
+ __be32 ip_mask)
+{
+ unsigned int idx;
+
+ if (!strcmp(net_dev->name, if_name)) {
+ strcpy(rds_ibdev->ports[port_num].if_name, if_name);
+ rds_ibdev->ports[port_num].ip_addr = ip_addr;
+ rds_ibdev->ports[port_num].ip_bcast = ip_bcast;
+ rds_ibdev->ports[port_num].ip_mask = ip_mask;
+ rds_ibdev->ports[port_num].ip_active_port = port_num;
+ } else {
+ idx = rds_ibdev->ports[port_num].alias_cnt++;
+ strcpy(rds_ibdev->ports[port_num].
+ aliases[idx].if_name, if_name);
+ rds_ibdev->ports[port_num].
+ aliases[idx].ip_addr = ip_addr;
+ rds_ibdev->ports[port_num].
+ aliases[idx].ip_bcast = ip_bcast;
+ rds_ibdev->ports[port_num].
+ aliases[idx].ip_mask = ip_mask;
}
}
-static void rds_ib_do_failover(struct rds_ib_device *rds_ibdev, u8 port)
+static void rds_ib_do_failover(struct rds_ib_device *rds_ibdev,
+ u8 from_port,
+ u8 to_port)
{
u8 i, j;
+ int ret;
for (i = 1; i <= rds_ibdev->dev->phys_port_cnt; i++) {
- if (port != i && i == rds_ibdev->ports[i].active_port) {
- if (rds_ib_move_ip(
- rds_ibdev->ports[port].if_name,
+ if ((from_port != i &&
+ i == rds_ibdev->ports[i].ip_active_port) ||
+ i == to_port) {
+
+ if (!rds_ib_move_ip(
+ rds_ibdev->ports[i].dev,
+ rds_ibdev->ports[i].dev->dev_addr,
+ rds_ibdev->ports[from_port].if_name,
rds_ibdev->ports[i].if_name,
- rds_ibdev->ports[port].ip_addr,
+ rds_ibdev->ports[from_port].ip_addr,
+ rds_ibdev->ports[from_port].ip_bcast,
+ rds_ibdev->ports[from_port].ip_mask,
1)) {
- printk(KERN_ERR "RDS/IP: failed to move IP "
- "%u.%u.%u.%u from %s over to %s\n",
- NIPQUAD(rds_ibdev->ports[port].ip_addr), rds_ibdev->ports[port].if_name,
- rds_ibdev->ports[i].if_name);
- } else {
- rds_ibdev->ports[port].active_port = i;
-
- for (j = 0; j < rds_ibdev->ports[port].alias_cnt; j++) {
- if (rds_ib_move_ip(
- rds_ibdev->ports[port].
+
+ rds_ibdev->ports[from_port].ip_active_port = i;
+ for (j = 0; j < rds_ibdev->ports[from_port].
+ alias_cnt; j++) {
+
+ ret = rds_ib_move_ip(
+ rds_ibdev->ports[i].dev,
+ rds_ibdev->ports[i].
+ dev->dev_addr,
+ rds_ibdev->ports[from_port].
aliases[j].if_name,
rds_ibdev->ports[i].if_name,
- rds_ibdev->ports[port].
+ rds_ibdev->ports[from_port].
aliases[j].ip_addr,
- 1)) {
- printk(KERN_ERR "RDS/IP: failed to move alias IP "
- "%u.%u.%u.%u from %s over to %s\n",
- NIPQUAD(rds_ibdev->ports[port].aliases[j].ip_addr),
- rds_ibdev->ports[port].aliases[j].if_name,
- rds_ibdev->ports[i].if_name);
- }
+ rds_ibdev->ports[from_port].
+ aliases[j].ip_bcast,
+ rds_ibdev->ports[from_port].
+ aliases[j].ip_mask,
+ 1);
}
break;
}
}
}
-static void rds_ib_do_failback(struct rds_ib_device *rds_ibdev, u8 port)
+static void rds_ib_do_set_ip(struct rds_ib_device *rds_ibdev,
+ u8 port)
{
- u8 active_port = rds_ibdev->ports[port].active_port;
+ int ret;
u8 j;
- if (port != rds_ibdev->ports[port].active_port) {
- if (rds_ib_move_ip(
- rds_ibdev->ports[active_port].if_name,
+ ret = rds_ib_set_ip(rds_ibdev->ports[port].dev,
+ rds_ibdev->ports[port].dev->dev_addr,
rds_ibdev->ports[port].if_name,
rds_ibdev->ports[port].ip_addr,
+ rds_ibdev->ports[port].ip_bcast,
+ rds_ibdev->ports[port].ip_mask);
+
+ for (j = 0; j < rds_ibdev->ports[port].alias_cnt; j++) {
+ ret = rds_ib_set_ip(rds_ibdev->ports[port].dev,
+ rds_ibdev->ports[port].dev->dev_addr,
+ rds_ibdev->ports[port].aliases[j].if_name,
+ rds_ibdev->ports[port].aliases[j].ip_addr,
+ rds_ibdev->ports[port].aliases[j].ip_bcast,
+ rds_ibdev->ports[port].aliases[j].ip_mask);
+ }
+}
+
+static void rds_ib_do_failback(struct rds_ib_device *rds_ibdev,
+ u8 port)
+{
+ u8 ip_active_port = rds_ibdev->ports[port].ip_active_port;
+ u8 j;
+ int ret;
+
+ if (port != rds_ibdev->ports[port].ip_active_port) {
+ if (!rds_ib_move_ip(
+ rds_ibdev->ports[ip_active_port].dev,
+ rds_ibdev->ports[port].dev->dev_addr,
+ rds_ibdev->ports[ip_active_port].if_name,
+ rds_ibdev->ports[port].if_name,
+ rds_ibdev->ports[port].ip_addr,
+ rds_ibdev->ports[port].ip_bcast,
+ rds_ibdev->ports[port].ip_mask,
0)) {
- printk(KERN_ERR "RDS/IP: failed to move IP "
- "%u.%u.%u.%u from %s back to %s\n",
- NIPQUAD(rds_ibdev->ports[port].ip_addr),
- rds_ibdev->ports[active_port].if_name,
- rds_ibdev->ports[port].if_name);
- } else {
- for (j = 0; j < rds_ibdev->ports[port].alias_cnt; j++) {
- if (rds_ib_move_ip(
- rds_ibdev->ports[active_port].if_name,
+
+ for (j = 0; j < rds_ibdev->ports[port].
+ alias_cnt; j++) {
+
+ ret = rds_ib_move_ip(
+ rds_ibdev->ports[ip_active_port].dev,
+ rds_ibdev->ports[port].
+ dev->dev_addr,
+ rds_ibdev->ports[ip_active_port].
+ if_name,
rds_ibdev->ports[port].
aliases[j].if_name,
rds_ibdev->ports[port].
aliases[j].ip_addr,
- 0)) {
- printk(KERN_ERR "RDS/IP: failed to move alias IP "
- "%u.%u.%u.%u from %s back to %s\n",
- NIPQUAD(rds_ibdev->ports[port].aliases[j].ip_addr),
- rds_ibdev->ports[active_port].if_name,
- rds_ibdev->ports[port].aliases[j].if_name);
- }
- }
- rds_ibdev->ports[port].active_port = port;
- if (!rds_ibdev->ports[active_port].active_port) {
- if (rds_ib_move_ip(
- rds_ibdev->ports[active_port].if_name,
- rds_ibdev->ports[port].if_name,
- rds_ibdev->ports[active_port].ip_addr,
- 1)) {
- printk(KERN_ERR "RDS/IP: failed to move IP %u.%u.%u.%u from %s to %s\n",
- NIPQUAD(rds_ibdev->ports[active_port].ip_addr),
- rds_ibdev->ports[active_port].if_name,
- rds_ibdev->ports[port].if_name);
- } else {
- for (j = 0; j < rds_ibdev->ports[active_port].alias_cnt; j++) {
- if (rds_ib_move_ip(
- rds_ibdev->ports[active_port].aliases[j].if_name,
- rds_ibdev->ports[port].if_name,
- rds_ibdev->ports[active_port].aliases[j].ip_addr,
- 1)) {
- printk(KERN_ERR "RDS/IP: failed to move alias IP %u.%u.%u.%u from %s to %s\n",
- NIPQUAD(rds_ibdev->ports[active_port].aliases[j].ip_addr),
- rds_ibdev->ports[active_port].aliases[j].if_name,
- rds_ibdev->ports[port].if_name);
- }
- }
- rds_ibdev->ports[active_port].active_port = port;
- }
+ rds_ibdev->ports[port].
+ aliases[j].ip_bcast,
+ rds_ibdev->ports[port].
+ aliases[j].ip_mask,
+ 0);
}
}
}
struct rds_ib_port_ud_work *work =
container_of(_work, struct rds_ib_port_ud_work, work.work);
struct rds_ib_device *rds_ibdev = work->rds_ibdev;
+ int ret;
if (rds_ibdev->ports[work->port].ip_addr)
- rds_ib_do_failover(rds_ibdev, work->port);
+ rds_ib_do_failover(rds_ibdev, work->port, 0);
- if (rds_ibdev->ports[work->port].active_port == work->port)
- rds_ibdev->ports[work->port].active_port = 0;
+ if (rds_ibdev->ports[work->port].ip_active_port == work->port) {
+ ret = rds_ib_set_ip(rds_ibdev->ports[work->port].dev,
+ rds_ibdev->ports[work->port].dev->dev_addr,
+ rds_ibdev->ports[work->port].if_name,
+ 0, 0, 0);
+ }
kfree(work);
}
struct rds_ib_port_ud_work *work =
container_of(_work, struct rds_ib_port_ud_work, work.work);
struct rds_ib_device *rds_ibdev = work->rds_ibdev;
- u8 i;
+ u8 i, port = work->port;
+ struct in_device *in_dev;
+
+ if (rds_ibdev->ports[port].ip_addr &&
+ rds_ibdev->ports[port].ip_active_port != port) {
- if (rds_ibdev->ports[work->port].ip_addr &&
- rds_ibdev->ports[work->port].active_port)
- rds_ib_do_failback(rds_ibdev, work->port);
+ rds_ib_do_failback(rds_ibdev, port);
+ }
- rds_ibdev->ports[work->port].active_port = work->port;
+ rds_ibdev->ports[port].ip_active_port = port;
+ in_dev = in_dev_get(rds_ibdev->ports[port].dev);
for (i = 1; i <= rds_ibdev->dev->phys_port_cnt; i++) {
- if (i != work->port && rds_ibdev->ports[i].ip_addr &&
- !rds_ibdev->ports[i].active_port) {
- rds_ib_do_failover(rds_ibdev, i);
+ if (rds_ibdev->ports[i].port_state == RDS_IB_PORT_DOWN &&
+ i != port && rds_ibdev->ports[i].ip_addr) {
+
+ if (rds_ibdev->ports[i].ip_active_port == i) {
+ rds_ib_do_failover(rds_ibdev, i, 0);
+ } else if (rds_ibdev->ports[i].ip_active_port == port) {
+ if (in_dev && !in_dev->ifa_list &&
+ rds_ibdev->ports[port].ip_addr) {
+
+ rds_ib_do_set_ip(rds_ibdev, port);
+ }
+
+ rds_ib_do_failover(rds_ibdev, i, port);
+ }
}
}
kfree(work);
}
-static void rds_ib_event_handler(struct ib_event_handler *handler, struct ib_event *event)
+static void rds_ib_event_handler(struct ib_event_handler *handler,
+ struct ib_event *event)
{
struct rds_ib_device *rds_ibdev =
container_of(handler, typeof(*rds_ibdev), event_handler);
u8 port = event->element.port_num;
struct rds_ib_port_ud_work *work;
- if (!rds_ib_active_active_enabled)
+ if (!rds_ib_haip_enabled)
return;
if (event->event != IB_EVENT_PORT_ACTIVE &&
event->event != IB_EVENT_PORT_ERR)
return;
- printk(KERN_NOTICE "RDS/IB: port %s/%d is %s\n", event->device->name,
- port, (event->event == IB_EVENT_PORT_ACTIVE) ? "UP" : "DOWN");
+ printk(KERN_NOTICE "RDS/IB: %s/port_%d/%s is %s\n",
+ rds_ibdev->dev->name, port,
+ rds_ibdev->ports[port].if_name,
+ (event->event == IB_EVENT_PORT_ACTIVE) ?
+ "ACTIVE" : "ERROR");
work = kzalloc(sizeof *work, GFP_KERNEL);
if (!work) {
work->port = port;
if (event->event == IB_EVENT_PORT_ACTIVE) {
- INIT_DELAYED_WORK(&work->work, rds_ib_failback);
- queue_delayed_work(rds_wq, &work->work, 0);
+ if (rds_ib_haip_fallback) {
+ INIT_DELAYED_WORK(&work->work, rds_ib_failback);
+ queue_delayed_work(rds_wq, &work->work, 0);
+ } else
+ kfree(work);
+ rds_ibdev->ports[port].port_state = RDS_IB_PORT_UP;
} else {
INIT_DELAYED_WORK(&work->work, rds_ib_failover);
queue_delayed_work(rds_wq, &work->work, 0);
+ rds_ibdev->ports[port].port_state = RDS_IB_PORT_DOWN;
}
}
list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
for (i = 1; i <= rds_ibdev->dev->phys_port_cnt; i++) {
- if (!rds_ibdev->ports[i].active_port &&
+ if (rds_ibdev->ports[i].port_state != RDS_IB_PORT_UP &&
rds_ibdev->ports[i].ip_addr) {
printk(KERN_NOTICE
- "RDS/IB: port %s/%d is DOWN\n",
+ "RDS/IB: port %s/%d is NOT UP\n",
rds_ibdev->dev->name, i);
work = kzalloc(sizeof *work, GFP_KERNEL);
flush_workqueue(rds_wq);
}
-static void rds_ib_print_port(void)
+static void rds_ib_dump_ip_config(void)
{
struct rds_ib_device *rds_ibdev;
int i, j;
+ if (!rds_ib_haip_enabled)
+ return;
+
+ printk(KERN_ERR "RDS/IB: IP configuration ...\n");
list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
for (i = 1; i <= rds_ibdev->dev->phys_port_cnt; i++) {
- if (!rds_ibdev->ports[i].ip_addr)
- continue;
- rdsdebug("Device %s / Port %d: name %s, "
- "IP %d.%d.%d.%d, active_port %d\n",
+ printk(KERN_ERR "RDS/IB: %s/port_%d/%s: "
+ "IP %d.%d.%d.%d/%d.%d.%d.%d/%d.%d.%d.%d "
+ "state %s\n",
rds_ibdev->dev->name, i,
rds_ibdev->ports[i].if_name,
NIPQUAD(rds_ibdev->ports[i].ip_addr),
- rds_ibdev->ports[i].active_port);
+ NIPQUAD(rds_ibdev->ports[i].ip_bcast),
+ NIPQUAD(rds_ibdev->ports[i].ip_mask),
+ (rds_ibdev->ports[i].port_state == RDS_IB_PORT_UP ? "UP" : "DOWN"));
for (j = 0; j < rds_ibdev->ports[i].alias_cnt; j++) {
- rdsdebug("Alias %s IP %d.%d.%d.%d\n",
+ printk(KERN_ERR "Alias %s "
+ "IP %d.%d.%d.%d/%d.%d.%d.%d/%d.%d.%d.%d\n",
rds_ibdev->ports[i].aliases[j].if_name,
NIPQUAD(rds_ibdev->ports[i].
- aliases[j].ip_addr));
+ aliases[j].ip_addr),
+ NIPQUAD(rds_ibdev->ports[i].
+ aliases[j].ip_bcast),
+ NIPQUAD(rds_ibdev->ports[i].
+ aliases[j].ip_mask));
}
}
}
}
-static void rds_ib_check_up_port(void)
-{
- struct net_device *dev;
- int downs;
- int retries = 0;
-
-retry:
- downs = 0;
- read_lock(&dev_base_lock);
- for_each_netdev(&init_net, dev) {
- if ((dev->type == ARPHRD_INFINIBAND) &&
- !(dev->flags & IFF_MASTER)) {
- if (dev->operstate != IF_OPER_UP)
- downs++;
- }
- }
- read_unlock(&dev_base_lock);
-
- if (downs) {
- if (retries++ <= 60) {
- msleep(1000);
- goto retry;
- } else {
- printk(KERN_ERR "RDS/IB: Some port(s) not operational\n");
- }
- }
-}
-
-static int rds_ib_init_port(void)
+static int rds_ib_setup_ports(void)
{
struct net_device *dev;
struct in_ifaddr *ifa;
struct in_ifaddr **ifap;
struct in_device *in_dev;
- struct rdma_cm_id *cm_id;
- struct sockaddr_in sin;
- struct rdma_dev_addr *dev_addr;
- union ib_gid gid;
+ struct rds_ib_device *rds_ibdev;
u8 port_num;
int ret = 0;
- if (!rds_ib_active_active_enabled)
+ if (!rds_ib_haip_enabled)
return ret;
- rds_ib_check_up_port();
-
read_lock(&dev_base_lock);
for_each_netdev(&init_net, dev) {
in_dev = in_dev_get(dev);
if ((dev->type == ARPHRD_INFINIBAND) &&
+ !(dev->flags & IFF_SLAVE) &&
!(dev->flags & IFF_MASTER) &&
in_dev) {
- for (ifap = &in_dev->ifa_list; (ifa = *ifap);
- ifap = &ifa->ifa_next) {
+ union ib_gid gid;
- cm_id = rdma_create_id(NULL, NULL,
- RDMA_PS_TCP);
- ret = (IS_ERR(cm_id));
- if (ret) {
- printk(KERN_ERR "RDS/IB: rdma_create_id failed\n");
- goto out;
- }
- memset(&sin, 0, sizeof(sin));
- sin.sin_family = AF_INET;
- sin.sin_addr.s_addr = ifa->ifa_address;
- ret = rdma_bind_addr(cm_id,
- (struct sockaddr *)&sin);
- if (ret) {
- printk(KERN_ERR "RDS/IB: rdma_bind_addr failed\n");
- rdma_destroy_id(cm_id);
- goto out;
- }
- dev_addr = &cm_id->route.addr.dev_addr;
- memcpy(&gid, dev_addr->src_dev_addr +
- rdma_addr_gid_offset(dev_addr), sizeof gid);
- ret = ib_find_cached_gid(cm_id->device, &gid,
- &port_num, NULL);
- if (ret) {
- printk(KERN_ERR "RDS/IB: ib_find_cached_gid failed\n");
- rdma_destroy_id(cm_id);
- goto out;
- }
+ memcpy(&gid, dev->dev_addr + 4, sizeof gid);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(rds_ibdev,
+ &rds_ib_devices, list) {
+ ret = ib_find_cached_gid(rds_ibdev->dev,
+ &gid, &port_num, NULL);
+ if (!ret)
+ break;
+ }
+ rcu_read_unlock();
+
+ if (!port_num) {
+ printk(KERN_ERR "RDS/IB: GID "RDS_IB_GID_FMT
+ " has no associated port\n",
+ RDS_IB_GID_ARG(gid));
+ ret = 1;
+ goto out;
+ }
- rds_ib_set_port(cm_id->device, dev,
- ifa->ifa_label, port_num,
- ifa->ifa_address);
+ rds_ib_init_port(rds_ibdev, dev, port_num);
- rdma_destroy_id(cm_id);
+ for (ifap = &in_dev->ifa_list; (ifa = *ifap);
+ ifap = &ifa->ifa_next) {
+ rds_ib_set_port(rds_ibdev, dev,
+ ifa->ifa_label, port_num,
+ ifa->ifa_address,
+ ifa->ifa_broadcast,
+ ifa->ifa_mask);
}
}
}
rds_ib_check_down_port();
- rds_ib_print_port();
+ rds_ib_dump_ip_config();
out:
read_unlock(&dev_base_lock);
return ret;
rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
rds_ibdev->max_1m_fmrs = dev_attr->max_fmr ?
- min_t(unsigned int, (dev_attr->max_fmr / 2),
- rds_ib_fmr_1m_pool_size) :
- rds_ib_fmr_1m_pool_size;
+ min_t(unsigned int, dev_attr->max_fmr,
+ RDS_FMR_1M_POOL_SIZE) :
+ RDS_FMR_1M_POOL_SIZE;
rds_ibdev->max_8k_fmrs = dev_attr->max_fmr ?
- min_t(unsigned int, ((dev_attr->max_fmr / 2) * 128),
- rds_ib_fmr_8k_pool_size) :
- rds_ib_fmr_8k_pool_size;
+ min_t(unsigned int, dev_attr->max_fmr,
+ RDS_FMR_8K_POOL_SIZE) :
+ RDS_FMR_8K_POOL_SIZE;
rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
goto put_dev;
}
- if (rds_ib_active_active_enabled) {
+ if (rds_ib_haip_enabled) {
rds_ibdev->ports = kzalloc(sizeof(struct rds_ib_port) *
(device->phys_port_cnt + 1), GFP_KERNEL);
if (!rds_ibdev->ports) {
.t_type = RDS_TRANS_IB
};
+static int rds_ib_netdev_callback(struct notifier_block *self, unsigned long event, void *ctx)
+{
+ struct net_device *ndev = netdev_notifier_info_to_dev(ctx);
+ u8 port = 0;
+ u8 i;
+ struct rds_ib_device *rds_ibdev;
+ struct rds_ib_port_ud_work *work;
+
+ if (!rds_ib_haip_enabled)
+ return NOTIFY_DONE;
+
+ if (event != NETDEV_UP && event != NETDEV_DOWN)
+ return NOTIFY_DONE;
+
+ rcu_read_lock();
+ list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
+ for (i = 1; i <= rds_ibdev->dev->phys_port_cnt; i++) {
+ if (!strcmp(ndev->name,
+ rds_ibdev->ports[i].if_name)) {
+ port = i;
+ goto out;
+ }
+ }
+ }
+ rcu_read_unlock();
+out:
+ if (!port)
+ return NOTIFY_DONE;
+
+
+ printk(KERN_NOTICE "RDS/IB: %s/port_%d/%s is %s\n",
+ rds_ibdev->dev->name, port, ndev->name,
+ (event == NETDEV_UP) ? "UP" : "DOWN");
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work) {
+ printk(KERN_ERR "RDS/IB: failed to allocate port work\n");
+ return NOTIFY_DONE;
+ }
+
+ work->rds_ibdev = rds_ibdev;
+ work->dev = ndev;
+ work->port = port;
+
+ switch (event) {
+ case NETDEV_UP:
+ if (rds_ib_haip_fallback) {
+ INIT_DELAYED_WORK(&work->work, rds_ib_failback);
+ queue_delayed_work(rds_wq, &work->work, msecs_to_jiffies(100));
+ } else
+ kfree(work);
+
+ rds_ibdev->ports[port].port_state = NETDEV_UP;
+ break;
+ case NETDEV_DOWN:
+ INIT_DELAYED_WORK(&work->work, rds_ib_failover);
+ queue_delayed_work(rds_wq, &work->work, 0);
+ rds_ibdev->ports[port].port_state = RDS_IB_PORT_DOWN;
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block rds_ib_nb = {
+ .notifier_call = rds_ib_netdev_callback
+};
+
int rds_ib_init(void)
{
int ret;
rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
- ret = rds_ib_init_port();
+ ret = rds_ib_setup_ports();
if (ret) {
printk(KERN_ERR "RDS/IB: failed to init port\n");
goto out_srq;
goto out_srq;
}
+ register_netdevice_notifier(&rds_ib_nb);
+
goto out;
out_srq: