rcu_read_lock();
if (!hlist_unhashed(&conn->c_hash_node)) {
rcu_read_unlock();
- rds_queue_reconnect(conn);
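+		/*
+		 * Only the side with the higher (or equal, for loopback)
+		 * local address queues the reconnect; this should keep both
+		 * peers from racing to re-establish the same connection.
+		 */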
+ if (conn->c_laddr >= conn->c_faddr)
+ rds_queue_reconnect(conn);
} else {
rcu_read_unlock();
}
#include <linux/delay.h>
#include <rdma/ib_cache.h>
#include <net/sock.h>
+#include <net/route.h>
#include <net/inet_common.h>
#include <linux/rtnetlink.h>
unsigned int rds_ib_cq_balance_enabled = 1;
#endif
static char *rds_ib_haip_failover_groups = NULL;
+unsigned int rds_ib_haip_arps = RDS_IB_DEFAULT_NUM_ARPS;
module_param(rds_ib_fmr_1m_pool_size, int, 0444);
MODULE_PARM_DESC(rds_ib_fmr_1m_pool_size, " Max number of 1m fmr per HCA");
module_param(rds_ib_cq_balance_enabled, int, 0444);
MODULE_PARM_DESC(rds_ib_cq_balance_enabled, " CQ load balance Enabled");
#endif
+module_param(rds_ib_haip_arps, int, 0444);
+MODULE_PARM_DESC(rds_ib_haip_arps, " Num ARPs to be sent when IP moved");
/*
* we have a clumsy combination of RCU and a rwsem protecting this list
struct rdma_dev_addr *dev_addr;
ic = conn->c_transport_data;
- dev_addr = &ic->i_cm_id->route.addr.dev_addr;
-
- rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
- rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
+#if RDMA_RDS_APM_SUPPORTED
+ if (rds_ib_apm_enabled) {
+ memcpy((union ib_gid *) &iinfo->src_gid,
+ &ic->i_cur_path.p_sgid, sizeof(union ib_gid));
+ memcpy((union ib_gid *) &iinfo->dst_gid,
+ &ic->i_cur_path.p_dgid, sizeof(union ib_gid));
+ } else
+#endif
+ {
+ dev_addr = &ic->i_cm_id->route.addr.dev_addr;
+ rdma_addr_get_sgid(dev_addr,
+ (union ib_gid *) &iinfo->src_gid);
+ rdma_addr_get_dgid(dev_addr,
+ (union ib_gid *) &iinfo->dst_gid);
+ }
rds_ibdev = ic->rds_ibdev;
iinfo->max_send_wr = ic->i_send_ring.w_nr;
unsigned char *dev_addr,
__be32 ip_addr)
{
- arp_send(ARPOP_REQUEST, ETH_P_ARP,
- ip_addr, out_dev,
- ip_addr, NULL,
- dev_addr, NULL);
+ int i;
+
+ /* Send multiple ARPs to improve reliability */
+ for (i = 0; i < rds_ib_haip_arps; i++) {
+ arp_send(ARPOP_REQUEST, ETH_P_ARP,
+ ip_addr, out_dev,
+ ip_addr, NULL,
+ dev_addr, NULL);
+ }
}
static int rds_ib_set_ip(struct net_device *out_dev,
__be32 addr,
__be32 bcast,
__be32 mask,
+ int event_type,
int failover)
{
struct ifreq *ir;
printk(KERN_NOTICE
"RDS/IB: IP %u.%u.%u.%u migrated from %s to %s\n",
NIPQUAD(addr), from_dev2, to_dev2);
+
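+	/*
+	 * A netdev-triggered move: drop the RDS connections bound to the
+	 * migrated address so they reconnect over the new port.  With APM
+	 * enabled, connections whose current path already points at the
+	 * destination port's GID are left alone.
+	 */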
+ if (event_type == RDS_IB_PORT_EVENT_NET) {
+ unsigned long flags;
+ struct rds_ib_connection *ic;
+ struct rds_ib_device *rds_ibdev;
+
+ rds_ibdev = ip_config[to_port].rds_ibdev;
+ spin_lock_irqsave(&rds_ibdev->spinlock, flags);
+ list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
+ if (ic->conn->c_laddr == addr) {
+#if RDMA_RDS_APM_SUPPORTED
+ if (rds_ib_apm_enabled) {
+ if (!memcmp(
+ &ic->i_cur_path.p_sgid,
+ &ip_config[to_port].gid,
+ sizeof(union ib_gid))) {
+ continue;
+ }
+ }
+#endif
+ rds_conn_drop(ic->conn);
+ }
+ spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
+ }
}
out:
return ret;
}
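+/*
+ * Wait for the InfiniBand interfaces to come up before the HAIP port
+ * configuration is built: poll once a second, for up to a minute, until
+ * every non-bonded IB netdev reports IF_OPER_UP.
+ */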
+static void rds_ib_check_up_port(void)
+{
+ struct net_device *dev;
+ int downs;
+ int retries = 0;
+
+retry:
+ downs = 0;
+ read_lock(&dev_base_lock);
+ for_each_netdev(&init_net, dev) {
+ if ((dev->type == ARPHRD_INFINIBAND) &&
+ !(dev->flags & IFF_SLAVE) &&
+ !(dev->flags & IFF_MASTER)) {
+ if (dev->operstate != IF_OPER_UP)
+ downs++;
+ }
+ }
+ read_unlock(&dev_base_lock);
+
+ if (downs) {
+ if (retries++ <= 60) {
+ msleep(1000);
+ goto retry;
+ } else {
+			printk(KERN_ERR
+			       "RDS/IB: Some port(s) may not be operational\n");
+ }
+ }
+}
+
static u8 rds_ib_init_port(struct rds_ib_device *rds_ibdev,
struct net_device *net_dev,
- u8 port_num)
+ u8 port_num,
+ union ib_gid gid)
{
const char *digits = "0123456789";
ip_config[ip_port_cnt].rds_ibdev = rds_ibdev;
ip_config[ip_port_cnt].ip_active_port = 0;
strcpy(ip_config[ip_port_cnt].if_name, net_dev->name);
+ memcpy(&ip_config[ip_port_cnt].gid, &gid, sizeof(union ib_gid));
if (net_dev->operstate == IF_OPER_UP)
ip_config[ip_port_cnt].port_state = RDS_IB_PORT_UP;
}
}
-static void rds_ib_do_failover(u8 from_port, u8 to_port, u8 arp_port)
+static void rds_ib_do_failover(u8 from_port, u8 to_port, u8 arp_port,
+ int event_type)
{
u8 j;
int ret;
ip_config[from_port].ip_addr,
ip_config[from_port].ip_bcast,
ip_config[from_port].ip_mask,
+ event_type,
1)) {
ip_config[from_port].ip_active_port = to_port;
aliases[j].ip_bcast,
ip_config[from_port].
aliases[j].ip_mask,
+ event_type,
1);
}
}
}
}
-static void rds_ib_do_failback(u8 port)
+static void rds_ib_do_failback(u8 port, int event_type)
{
u8 ip_active_port = ip_config[port].ip_active_port;
u8 j;
ip_config[port].ip_addr,
ip_config[port].ip_bcast,
ip_config[port].ip_mask,
+ event_type,
0)) {
ip_config[port].ip_active_port = port;
aliases[j].ip_bcast,
ip_config[port].
aliases[j].ip_mask,
+ event_type,
0);
}
}
if_name[IFNAMSIZ-1] = 0;
ret = rds_ib_set_ip(NULL, NULL, if_name, 0, 0, 0);
- rds_ib_do_failover(i, 0, 0);
+ rds_ib_do_failover(i, 0, 0, work->event_type);
}
}
if (ip_config[work->port].ip_addr)
- rds_ib_do_failover(work->port, 0, 0);
+ rds_ib_do_failover(work->port, 0, 0, work->event_type);
if (ip_config[work->port].ip_active_port == work->port) {
ret = rds_ib_set_ip(NULL, NULL,
ip_active_port = ip_config[port].ip_active_port;
- rds_ib_do_failback(port);
+ rds_ib_do_failback(port, work->event_type);
for (i = 1; i <= ip_port_cnt; i++) {
if (i == port ||
continue;
if (ip_config[i].ip_active_port == i) {
- rds_ib_do_failover(i, 0, ip_active_port);
+ rds_ib_do_failover(i, 0, ip_active_port,
+ work->event_type);
} else if (ip_config[i].ip_active_port == port) {
- rds_ib_do_failover(i, port, ip_active_port);
+ rds_ib_do_failover(i, port, ip_active_port,
+ work->event_type);
} else if (ip_config[ip_config[i].ip_active_port].port_state ==
RDS_IB_PORT_DOWN) {
- rds_ib_do_failover(i, 0, ip_active_port);
+ rds_ib_do_failover(i, 0, ip_active_port,
+ work->event_type);
} else if (ip_config[port].failover_group ==
ip_config[i].failover_group) {
- rds_ib_do_failover(i, port, ip_active_port);
+ rds_ib_do_failover(i, port, ip_active_port,
+ work->event_type);
}
}
ip_config[i].ip_active_port == ip_active_port) {
rds_ib_do_failover(i, ip_active_port,
- ip_active_port);
+ ip_active_port,
+ work->event_type);
}
}
}
}
work->port = port;
+ work->event_type = RDS_IB_PORT_EVENT_IB;
if (event->event == IB_EVENT_PORT_ACTIVE) {
if (rds_ib_haip_fallback) {
if (!rds_ib_haip_enabled)
return 0;
+ rds_ib_check_up_port();
+
rcu_read_unlock();
ip_config = kzalloc(sizeof(struct rds_ib_port) *
RDS_IB_GID_ARG(gid));
} else {
port = rds_ib_init_port(rds_ibdev, dev,
- port_num);
+ port_num, gid);
if (port > 0) {
for (ifap = &in_dev->ifa_list;
(ifa = *ifap);
work->dev = ndev;
work->port = port;
+ work->event_type = RDS_IB_PORT_EVENT_NET;
switch (event) {
case NETDEV_UP:
#include "rds.h"
#include "rdma_transport.h"
-#define RDS_FMR_1M_POOL_SIZE (8192 / 2)
+#define RDS_FMR_1M_POOL_SIZE (8192 * 3 / 4)
#define RDS_FMR_1M_MSG_SIZE 256 /* 1M */
#define RDS_FMR_8K_MSG_SIZE 2
-#define RDS_FMR_8K_POOL_SIZE ((256 / (RDS_FMR_8K_MSG_SIZE + 1)) * (8192 / 2))
+#define RDS_FMR_8K_POOL_SIZE ((256 / (RDS_FMR_8K_MSG_SIZE + 1)) * (8192 / 4))
#define RDS_IB_MAX_SGE 8
#define RDS_IB_RECV_SGE 2
#define RDS_IB_DEFAULT_RNR_RETRY_COUNT 7
+#define RDS_IB_DEFAULT_NUM_ARPS 100
+
#define RDS_IB_DEFAULT_TIMEOUT 16 /* 4.096 * 2 ^ 16 = 260 msec */
#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
u8 dp_protocol_major;
u8 dp_protocol_minor;
__be16 dp_protocol_minor_mask; /* bitmask */
- u8 dp_tos;
- u8 dp_reserved1;
- __be16 dp_reserved2;
+ u8 dp_tos;
+ u8 dp_reserved1;
+ __be16 dp_reserved2;
__be64 dp_ack_seq;
__be32 dp_credit; /* non-zero enables flow ctl */
};
struct rds_ib_path i_cur_path;
unsigned int i_alt_path_index;
unsigned int i_active_side;
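+	/*
+	 * Set (get_seconds()) when an APM path migration occurs; cleared on
+	 * a successful send completion.  If the connection has to reconnect
+	 * while this is still set, the migration is counted as failed.
+	 */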
+ unsigned long i_last_migration;
int i_scq_vector;
int i_rcq_vector;
struct net_device *dev;
unsigned int port_state;
u8 port_num;
+ union ib_gid gid;
char port_label[4];
char if_name[IFNAMSIZ];
__be32 ip_addr;
struct rds_ib_alias aliases[RDS_IB_MAX_ALIASES];
};
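+/* Origin of a port up/down event: the IB event handler or a netdev notifier */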
+enum {
+ RDS_IB_PORT_EVENT_IB,
+ RDS_IB_PORT_EVENT_NET,
+};
+
struct rds_ib_port_ud_work {
struct delayed_work work;
struct net_device *dev;
unsigned int port;
int timeout;
+ int event_type;
};
enum {
uint64_t s_ib_srq_lows;
uint64_t s_ib_srq_refills;
uint64_t s_ib_srq_empty_refills;
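+	/* reconnects completed while an APM migration was still unconfirmed */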
+ uint64_t s_ib_failed_apm;
};
extern struct workqueue_struct *rds_ib_wq;
rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
#if RDMA_RDS_APM_SUPPORTED
- if (rds_ib_apm_enabled && !ic->conn->c_reconnect) {
- memcpy(&ic->i_pri_path.p_sgid,
- &ic->i_cm_id->route.path_rec[0].sgid,
- sizeof(union ib_gid));
-
- memcpy(&ic->i_pri_path.p_dgid,
- &ic->i_cm_id->route.path_rec[0].dgid,
- sizeof(union ib_gid));
-
- memcpy(&ic->i_cur_path.p_sgid,
- &ic->i_cm_id->route.path_rec[0].sgid,
- sizeof(union ib_gid));
-
- memcpy(&ic->i_cur_path.p_dgid,
- &ic->i_cm_id->route.path_rec[0].dgid,
- sizeof(union ib_gid));
-
- printk(KERN_NOTICE "RDS/IB: connection "
- "<%u.%u.%u.%u,%u.%u.%u.%u,%d> primary path "
- "<"RDS_IB_GID_FMT","RDS_IB_GID_FMT">\n",
- NIPQUAD(conn->c_laddr),
- NIPQUAD(conn->c_faddr),
- conn->c_tos,
- RDS_IB_GID_ARG(ic->i_pri_path.p_sgid),
- RDS_IB_GID_ARG(ic->i_pri_path.p_dgid));
+ if (rds_ib_apm_enabled) {
+ struct rdma_dev_addr *dev_addr;
+
+ dev_addr = &ic->i_cm_id->route.addr.dev_addr;
+
+ if (!ic->conn->c_reconnect) {
+ rdma_addr_get_sgid(dev_addr,
+ (union ib_gid *)&ic->i_pri_path.p_sgid);
+ rdma_addr_get_dgid(dev_addr,
+ (union ib_gid *)&ic->i_pri_path.p_dgid);
+ printk(KERN_NOTICE "RDS/IB: connection "
+ "<%u.%u.%u.%u,%u.%u.%u.%u,%d> primary path "
+ "<"RDS_IB_GID_FMT","RDS_IB_GID_FMT">\n",
+ NIPQUAD(conn->c_laddr),
+ NIPQUAD(conn->c_faddr),
+ conn->c_tos,
+ RDS_IB_GID_ARG(ic->i_pri_path.p_sgid),
+ RDS_IB_GID_ARG(ic->i_pri_path.p_dgid));
+ }
+ rdma_addr_get_sgid(dev_addr,
+ (union ib_gid *)&ic->i_cur_path.p_sgid);
+ rdma_addr_get_dgid(dev_addr,
+ (union ib_gid *)&ic->i_cur_path.p_dgid);
}
#endif
rds_connect_complete(conn);
+
+#if RDMA_RDS_APM_SUPPORTED
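+	/*
+	 * Reaching connection-complete with a migration still pending means
+	 * APM did not keep the connection alive; count it as a failed
+	 * migration.
+	 */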
+ if (ic->i_last_migration) {
+ rds_ib_stats_inc(s_ib_failed_apm);
+ ic->i_last_migration = 0;
+ }
+#endif
}
static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
RDS_IB_GID_ARG(ic->i_cur_path.p_sgid),
RDS_IB_GID_ARG(ic->i_cur_path.p_dgid));
}
+ ic->i_last_migration = get_seconds();
break;
case IB_EVENT_PATH_MIG_ERR:
/* XXX I wonder what affect the port space has */
/* delegate cm event handler to rdma_transport */
ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
- RDMA_PS_TCP, IB_QPT_RC);
+ RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(ic->i_cm_id)) {
ret = PTR_ERR(ic->i_cm_id);
ic->i_cm_id = NULL;
atomic_t free_pinned; /* memory pinned by free MRs */
unsigned long max_items;
- unsigned long max_items_soft;
+ atomic_t max_items_soft;
unsigned long max_free_pinned;
struct ib_fmr_attr fmr_attr;
};
pool->max_items * pool->fmr_attr.max_pages / 4;
pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
pool->fmr_attr.page_shift = PAGE_SHIFT;
- pool->max_items_soft = pool->max_items * 3 / 4;
+ atomic_set(&pool->max_items_soft, pool->max_items);
return pool;
}
{
struct rds_ib_mr_pool *pool;
struct rds_ib_mr *ibmr = NULL;
- struct rds_ib_mr *tmp_ibmr = NULL;
int err = 0, iter = 0;
if (npages <= RDS_FMR_8K_MSG_SIZE)
else
pool = rds_ibdev->mr_1m_pool;
- if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+ if (atomic_read(&pool->dirty_count) >=
+ atomic_read(&pool->max_items_soft) / 10)
queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
while (1) {
if (IS_ERR(ibmr->fmr)) {
err = PTR_ERR(ibmr->fmr);
- /* Adjust the pool size to reflect the resources available to
- * the VM.
+ /* Re-balance the pool sizes to reflect the memory resources
+ * available to the VM.
*/
if (err == -ENOMEM) {
- int prev_max = pool->max_items;
-
- pool->max_items = atomic_read(&pool->item_count);
-
- printk(KERN_ERR "RDS/IB: Adjusted %s FMR pool (%d->%ld)\n", (pool->pool_type == RDS_IB_MR_8K_POOL) ? "8K" : "1M",
- prev_max, pool->max_items);
-
- rds_ib_flush_mr_pool(pool, 0, &tmp_ibmr);
- if (tmp_ibmr) {
- kfree(ibmr);
- return tmp_ibmr;
+ int total_pool_size =
+ atomic_read(&rds_ibdev->mr_8k_pool->item_count)
+ * (RDS_FMR_8K_MSG_SIZE + 1) +
+ atomic_read(&rds_ibdev->mr_1m_pool->item_count)
+ * RDS_FMR_1M_MSG_SIZE;
+
+ if (total_pool_size) {
+				int prev_8k_max = atomic_read(
+					&rds_ibdev->mr_8k_pool->max_items_soft);
+				int prev_1m_max = atomic_read(
+					&rds_ibdev->mr_1m_pool->max_items_soft);
+
+				atomic_set(&rds_ibdev->mr_8k_pool->max_items_soft,
+					   (total_pool_size / 4) /
+					   (RDS_FMR_8K_MSG_SIZE + 1));
+				atomic_set(&rds_ibdev->mr_1m_pool->max_items_soft,
+					   (total_pool_size * 3 / 4) /
+					   RDS_FMR_1M_MSG_SIZE);
+				printk(KERN_ERR "RDS/IB: Adjusted 8K FMR pool (%d->%d)\n",
+				       prev_8k_max,
+				       atomic_read(&rds_ibdev->mr_8k_pool->max_items_soft));
+				printk(KERN_ERR "RDS/IB: Adjusted 1M FMR pool (%d->%d)\n",
+				       prev_1m_max,
+				       atomic_read(&rds_ibdev->mr_1m_pool->max_items_soft));
+				rds_ib_flush_mr_pool(rds_ibdev->mr_8k_pool, 1, NULL);
+				rds_ib_flush_mr_pool(rds_ibdev->mr_1m_pool, 1, NULL);
+
+ err = -EAGAIN;
}
}
ibmr->fmr = NULL;
- printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err);
goto out_no_cigar;
}
rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc);
else
rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc);
+
+ if (atomic_read(&pool->item_count) >
+ atomic_read(&pool->max_items_soft))
+ atomic_set(&pool->max_items_soft, pool->max_items);
+
return ibmr;
out_no_cigar:
/* If we've pinned too many pages, request a flush */
if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned
- || atomic_read(&pool->dirty_count) >= pool->max_items / 5)
+ || atomic_read(&pool->dirty_count) >=
+ atomic_read(&pool->max_items_soft) / 5)
queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
if (invalidate) {
#include "rds.h"
#include "ib.h"
#include "tcp.h"
+
/*
* Convert IB-specific error message to RDS error message and call core
* completion handler.
"send completion on %pI4 "
"had status %u, disconnecting and reconnecting\n",
&conn->c_faddr, wc->status);
- }
+	} else {
+		ic->i_last_migration = 0;
+	}
}
/*
"ib_srq_lows",
"ib_srq_refills",
"ib_srq_empty_refills",
+ "ib_apm_reconnect",
};
unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
static struct rdma_cm_id *rds_iw_listen_id;
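+/*
+ * Per-attempt route-resolution timeouts (ms); conn->c_to_index walks this
+ * array on repeated resolution failures and is reset to 0 once the route
+ * resolves.
+ */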
+int rds_rdma_resolve_to_ms[] = {1000, 1000, 2000, 4000, 5000};
+
int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event)
{
/* XXX do we need to clean up if this fails? */
ret = rdma_resolve_route(cm_id,
- RDS_RDMA_RESOLVE_TIMEOUT_MS);
+ rds_rdma_resolve_to_ms[conn->c_to_index]);
if (ret) {
/*
* The cm_id will get destroyed by addr_handler
if (ibic && ibic->i_cm_id == cm_id)
ibic->i_cm_id = NULL;
rds_conn_drop(conn);
- }
+		} else if (conn->c_to_index < (RDS_RDMA_RESOLVE_TO_MAX_INDEX - 1)) {
+			conn->c_to_index++;
+		}
}
break;
case RDMA_CM_EVENT_ROUTE_RESOLVED:
/* XXX worry about racing with listen acceptance */
+ conn->c_to_index = 0;
ret = trans->cm_initiate_connect(cm_id);
break;
#define RDS_IN_XMIT 2
#define RDS_RECV_REFILL 3
+#define RDS_RDMA_RESOLVE_TO_MAX_INDEX 5
+
struct rds_connection {
struct hlist_node c_hash_node;
__be32 c_laddr;
unsigned int c_reconnect_drops;
int c_reconnect_warn;
int c_reconnect_err;
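+	/* index into rds_rdma_resolve_to_ms[] for the next resolve attempt */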
+ int c_to_index;
unsigned int c_reconnect;