From: Bang Nguyen
Date: Fri, 19 Apr 2013 15:56:14 +0000 (-0700)
Subject: RDS: Fixes to improve throughput performance
X-Git-Tag: v4.1.12-92~293^2^2~68
X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=f09abac0d0147f14ad1af97b652bc47246e44797;p=users%2Fjedix%2Flinux-maple.git

RDS: Fixes to improve throughput performance

This fixes race conditions and adds other enhancements to improve
throughput.

Ported from UEK2 patch dbe1629e3387d8c68009e1da51d1a1ca778f2501
(Changes related to LAP in the original patch in
drivers/infiniband/core/cma.c are NOT ported because we do not have
APM support in rdma_cm)

Orabug: 16571410

Signed-off-by: Bang Nguyen
---
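Note: the connection.c hunk below serializes reconnects. After a shutdown,
only the peer whose (big-endian) local address compares >= the remote
address requeues the reconnect, presumably so the two ends do not race to
re-dial each other at the same time. A minimal userspace sketch of that
tiebreak (illustrative only; the function name is mine, not from the patch):

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    /* Mirrors the new c_laddr >= c_faddr test in rds_conn_shutdown():
     * both peers compare the same two network-byte-order addresses, so
     * exactly one of them re-dials (both, only when laddr == faddr). */
    static int initiates_reconnect(uint32_t laddr_be, uint32_t faddr_be)
    {
        return laddr_be >= faddr_be;
    }

    int main(void)
    {
        uint32_t a = inet_addr("10.0.0.1");
        uint32_t b = inet_addr("10.0.0.2");

        printf("10.0.0.1 re-dials: %d\n", initiates_reconnect(a, b));
        printf("10.0.0.2 re-dials: %d\n", initiates_reconnect(b, a));
        return 0;
    }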
diff --git a/net/rds/connection.c b/net/rds/connection.c
index e21fc5f0a9dc..ed6b2765e134 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -345,7 +345,8 @@ void rds_conn_shutdown(struct rds_connection *conn)
 	rcu_read_lock();
 	if (!hlist_unhashed(&conn->c_hash_node)) {
 		rcu_read_unlock();
-		rds_queue_reconnect(conn);
+		if (conn->c_laddr >= conn->c_faddr)
+			rds_queue_reconnect(conn);
 	} else {
 		rcu_read_unlock();
 	}
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 43415423e76e..38b973072a9f 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -40,6 +40,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -65,6 +66,7 @@ unsigned int rds_ib_rnr_retry_count = RDS_IB_DEFAULT_RNR_RETRY_COUNT;
 unsigned int rds_ib_cq_balance_enabled = 1;
 #endif
 static char *rds_ib_haip_failover_groups = NULL;
+unsigned int rds_ib_haip_arps = RDS_IB_DEFAULT_NUM_ARPS;
 
 module_param(rds_ib_fmr_1m_pool_size, int, 0444);
 MODULE_PARM_DESC(rds_ib_fmr_1m_pool_size, " Max number of 1m fmr per HCA");
@@ -97,6 +99,8 @@ MODULE_PARM_DESC(rds_ib_haip_failover_groups,
 module_param(rds_ib_cq_balance_enabled, int, 0444);
 MODULE_PARM_DESC(rds_ib_cq_balance_enabled, " CQ load balance Enabled");
 #endif
+module_param(rds_ib_haip_arps, int, 0444);
+MODULE_PARM_DESC(rds_ib_haip_arps, " Num ARPs to be sent when IP moved");
 
 /*
  * we have a clumsy combination of RCU and a rwsem protecting this list
@@ -268,10 +272,21 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
 	struct rdma_dev_addr *dev_addr;
 
 	ic = conn->c_transport_data;
-	dev_addr = &ic->i_cm_id->route.addr.dev_addr;
-
-	rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
-	rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
+#if RDMA_RDS_APM_SUPPORTED
+	if (rds_ib_apm_enabled) {
+		memcpy((union ib_gid *) &iinfo->src_gid,
+		       &ic->i_cur_path.p_sgid, sizeof(union ib_gid));
+		memcpy((union ib_gid *) &iinfo->dst_gid,
+		       &ic->i_cur_path.p_dgid, sizeof(union ib_gid));
+	} else
+#endif
+	{
+		dev_addr = &ic->i_cm_id->route.addr.dev_addr;
+		rdma_addr_get_sgid(dev_addr,
+				   (union ib_gid *) &iinfo->src_gid);
+		rdma_addr_get_dgid(dev_addr,
+				   (union ib_gid *) &iinfo->dst_gid);
+	}
 
 	rds_ibdev = ic->rds_ibdev;
 	iinfo->max_send_wr = ic->i_send_ring.w_nr;
@@ -365,10 +380,15 @@ static void rds_ib_send_gratuitous_arp(struct net_device *out_dev,
 				       unsigned char *dev_addr,
 				       __be32 ip_addr)
 {
-	arp_send(ARPOP_REQUEST, ETH_P_ARP,
-		 ip_addr, out_dev,
-		 ip_addr, NULL,
-		 dev_addr, NULL);
+	int i;
+
+	/* Send multiple ARPs to improve reliability */
+	for (i = 0; i < rds_ib_haip_arps; i++) {
+		arp_send(ARPOP_REQUEST, ETH_P_ARP,
+			 ip_addr, out_dev,
+			 ip_addr, NULL,
+			 dev_addr, NULL);
+	}
 }
 
 static int rds_ib_set_ip(struct net_device *out_dev,
@@ -471,6 +491,7 @@ static int rds_ib_move_ip(char *from_dev,
 			  __be32 addr,
 			  __be32 bcast,
 			  __be32 mask,
+			  int event_type,
 			  int failover)
 {
 	struct ifreq *ir;
@@ -577,6 +598,30 @@ static int rds_ib_move_ip(char *from_dev,
 		printk(KERN_NOTICE "RDS/IB: IP %u.%u.%u.%u migrated from %s to %s\n",
 		       NIPQUAD(addr), from_dev2, to_dev2);
+
+	if (event_type == RDS_IB_PORT_EVENT_NET) {
+		unsigned long flags;
+		struct rds_ib_connection *ic;
+		struct rds_ib_device *rds_ibdev;
+
+		rds_ibdev = ip_config[to_port].rds_ibdev;
+		spin_lock_irqsave(&rds_ibdev->spinlock, flags);
+		list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
+			if (ic->conn->c_laddr == addr) {
+#if RDMA_RDS_APM_SUPPORTED
+				if (rds_ib_apm_enabled) {
+					if (!memcmp(
+					    &ic->i_cur_path.p_sgid,
+					    &ip_config[to_port].gid,
+					    sizeof(union ib_gid))) {
+						continue;
+					}
+				}
+#endif
+				rds_conn_drop(ic->conn);
+			}
+		spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
+	}
 	}
 
 out:
@@ -586,9 +631,41 @@ out:
 	return ret;
 }
 
+static void rds_ib_check_up_port(void)
+{
+	struct net_device *dev;
+	int downs;
+	int retries = 0;
+
+retry:
+	downs = 0;
+	read_lock(&dev_base_lock);
+	for_each_netdev(&init_net, dev) {
+		if ((dev->type == ARPHRD_INFINIBAND) &&
+		    !(dev->flags & IFF_SLAVE) &&
+		    !(dev->flags & IFF_MASTER)) {
+			if (dev->operstate != IF_OPER_UP)
+				downs++;
+		}
+	}
+	read_unlock(&dev_base_lock);
+
+	if (downs) {
+		if (retries++ <= 60) {
+			msleep(1000);
+			goto retry;
+		} else {
+			printk(KERN_ERR "RDS/IB: Some port(s) may not be "
+			       "operational\n");
+		}
+	}
+}
+
+
 static u8 rds_ib_init_port(struct rds_ib_device *rds_ibdev,
 			   struct net_device *net_dev,
-			   u8 port_num)
+			   u8 port_num,
+			   union ib_gid gid)
 {
 	const char *digits = "0123456789";
@@ -609,6 +686,7 @@ static u8 rds_ib_init_port(struct rds_ib_device *rds_ibdev,
 	ip_config[ip_port_cnt].rds_ibdev = rds_ibdev;
 	ip_config[ip_port_cnt].ip_active_port = 0;
 	strcpy(ip_config[ip_port_cnt].if_name, net_dev->name);
+	memcpy(&ip_config[ip_port_cnt].gid, &gid, sizeof(union ib_gid));
 
 	if (net_dev->operstate == IF_OPER_UP)
 		ip_config[ip_port_cnt].port_state = RDS_IB_PORT_UP;
@@ -643,7 +721,8 @@ static void rds_ib_set_port(struct rds_ib_device *rds_ibdev,
 	}
 }
 
-static void rds_ib_do_failover(u8 from_port, u8 to_port, u8 arp_port)
+static void rds_ib_do_failover(u8 from_port, u8 to_port, u8 arp_port,
+			       int event_type)
 {
 	u8 j;
 	int ret;
@@ -667,6 +746,7 @@ static void rds_ib_do_failover(u8 from_port, u8 to_port, u8 arp_port,
 				  ip_config[from_port].ip_addr,
 				  ip_config[from_port].ip_bcast,
 				  ip_config[from_port].ip_mask,
+				  event_type,
 				  1)) {
 			ip_config[from_port].ip_active_port = to_port;
@@ -686,13 +766,14 @@ static void rds_ib_do_failover(u8 from_port, u8 to_port, u8 arp_port,
 					  aliases[j].ip_bcast,
 					  ip_config[from_port].
 					  aliases[j].ip_mask,
+					  event_type,
 					  1);
 			}
 		}
 	}
 }
 
-static void rds_ib_do_failback(u8 port)
+static void rds_ib_do_failback(u8 port, int event_type)
 {
 	u8 ip_active_port = ip_config[port].ip_active_port;
 	u8 j;
@@ -711,6 +792,7 @@ static void rds_ib_do_failback(u8 port, int event_type)
 			  ip_config[port].ip_addr,
 			  ip_config[port].ip_bcast,
 			  ip_config[port].ip_mask,
+			  event_type,
 			  0)) {
 			ip_config[port].ip_active_port = port;
@@ -731,6 +813,7 @@ static void rds_ib_do_failback(u8 port, int event_type)
 				  aliases[j].ip_bcast,
 				  ip_config[port].
 				  aliases[j].ip_mask,
+				  event_type,
 				  0);
 			}
 		}
@@ -758,12 +841,12 @@ static void rds_ib_failover(struct work_struct *_work)
 			if_name[IFNAMSIZ-1] = 0;
 			ret = rds_ib_set_ip(NULL, NULL, if_name, 0, 0, 0);
 
-			rds_ib_do_failover(i, 0, 0);
+			rds_ib_do_failover(i, 0, 0, work->event_type);
 		}
 	}
 
 	if (ip_config[work->port].ip_addr)
-		rds_ib_do_failover(work->port, 0, 0);
+		rds_ib_do_failover(work->port, 0, 0, work->event_type);
 
 	if (ip_config[work->port].ip_active_port == work->port) {
 		ret = rds_ib_set_ip(NULL, NULL,
@@ -784,7 +867,7 @@ static void rds_ib_failback(struct work_struct *_work)
 
 	ip_active_port = ip_config[port].ip_active_port;
 
-	rds_ib_do_failback(port);
+	rds_ib_do_failback(port, work->event_type);
 
 	for (i = 1; i <= ip_port_cnt; i++) {
 		if (i == port ||
@@ -793,15 +876,19 @@ static void rds_ib_failback(struct work_struct *_work)
 			continue;
 
 		if (ip_config[i].ip_active_port == i) {
-			rds_ib_do_failover(i, 0, ip_active_port);
+			rds_ib_do_failover(i, 0, ip_active_port,
+					   work->event_type);
 		} else if (ip_config[i].ip_active_port == port) {
-			rds_ib_do_failover(i, port, ip_active_port);
+			rds_ib_do_failover(i, port, ip_active_port,
+					   work->event_type);
 		} else if (ip_config[ip_config[i].ip_active_port].port_state ==
 			   RDS_IB_PORT_DOWN) {
-			rds_ib_do_failover(i, 0, ip_active_port);
+			rds_ib_do_failover(i, 0, ip_active_port,
+					   work->event_type);
 		} else if (ip_config[port].failover_group ==
 			   ip_config[i].failover_group) {
-			rds_ib_do_failover(i, port, ip_active_port);
+			rds_ib_do_failover(i, port, ip_active_port,
+					   work->event_type);
 		}
 	}
 
@@ -812,7 +899,8 @@ static void rds_ib_failback(struct work_struct *_work)
 		    ip_config[i].ip_active_port == ip_active_port) {
 			rds_ib_do_failover(i, ip_active_port,
-					   ip_active_port);
+					   ip_active_port,
+					   work->event_type);
 		}
 	}
 }
@@ -888,6 +976,7 @@ static void rds_ib_event_handler(struct ib_event_handler *handler,
 	}
 
 	work->port = port;
+	work->event_type = RDS_IB_PORT_EVENT_IB;
 
 	if (event->event == IB_EVENT_PORT_ACTIVE) {
 		if (rds_ib_haip_fallback) {
@@ -956,6 +1045,8 @@ static int rds_ib_ip_config_init(void)
 	if (!rds_ib_haip_enabled)
 		return 0;
 
+	rds_ib_check_up_port();
+
 	rcu_read_unlock();
 
 	ip_config = kzalloc(sizeof(struct rds_ib_port) *
@@ -990,7 +1081,7 @@ static int rds_ib_ip_config_init(void)
 				  RDS_IB_GID_ARG(gid));
 		} else {
 			port = rds_ib_init_port(rds_ibdev, dev,
-						port_num);
+						port_num, gid);
 			if (port > 0) {
 				for (ifap = &in_dev->ifa_list;
 				     (ifa = *ifap);
@@ -1235,6 +1326,7 @@ static int rds_ib_netdev_callback(struct notifier_block *self, unsigned long eve
 
 	work->dev = ndev;
 	work->port = port;
+	work->event_type = RDS_IB_PORT_EVENT_NET;
 
 	switch (event) {
 	case NETDEV_UP:
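Note on the ib.c changes above: an IP move now sends a burst of gratuitous
ARPs (rds_ib_haip_arps, default 100) so peers reliably refresh their
neighbor caches; failover/failback work items carry the event source (IB
port event vs. netdev event) so that netdev-triggered moves can drop stale
connections; and HAIP init waits up to roughly 60 seconds for all
non-bonded InfiniBand ports to come up. A rough userspace sketch of the
bounded wait in rds_ib_check_up_port(), with the netdev scan stubbed out
(the stub and function names are mine):

    #include <stdio.h>
    #include <unistd.h>

    /* Stub standing in for the for_each_netdev() scan: returns how many
     * non-bonded InfiniBand ports are not yet IF_OPER_UP. Here we just
     * pretend one port comes up per poll. */
    static int ports_down(void)
    {
        static int downs = 3;
        return downs > 0 ? downs-- : 0;
    }

    /* Same shape as rds_ib_check_up_port(): poll once a second and give
     * up, with a warning, after 60 retries. */
    static void check_up_port(void)
    {
        int retries = 0;

        while (ports_down()) {
            if (retries++ > 60) {
                fprintf(stderr, "some port(s) may not be operational\n");
                return;
            }
            sleep(1);
        }
        printf("all ports up after %d poll(s)\n", retries);
    }

    int main(void)
    {
        check_up_port();
        return 0;
    }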
diff --git a/net/rds/ib.h b/net/rds/ib.h
index dda31af1aa59..6711c6cb64bb 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -8,10 +8,10 @@
 #include "rds.h"
 #include "rdma_transport.h"
 
-#define RDS_FMR_1M_POOL_SIZE	(8192 / 2)
+#define RDS_FMR_1M_POOL_SIZE	(8192 * 3 / 4)
 #define RDS_FMR_1M_MSG_SIZE	256 /* 1M */
 #define RDS_FMR_8K_MSG_SIZE	2
-#define RDS_FMR_8K_POOL_SIZE	((256 / (RDS_FMR_8K_MSG_SIZE + 1)) * (8192 / 2))
+#define RDS_FMR_8K_POOL_SIZE	((256 / (RDS_FMR_8K_MSG_SIZE + 1)) * (8192 / 4))
 
 #define RDS_IB_MAX_SGE		8
 #define RDS_IB_RECV_SGE		2
@@ -26,6 +26,8 @@
 
 #define RDS_IB_DEFAULT_RNR_RETRY_COUNT	7
 
+#define RDS_IB_DEFAULT_NUM_ARPS		100
+
 #define RDS_IB_DEFAULT_TIMEOUT	16 /* 4.096 * 2 ^ 16 = 260 msec */
 
 #define RDS_IB_SUPPORTED_PROTOCOLS	0x00000003 /* minor versions supported */
@@ -83,9 +85,9 @@ struct rds_ib_connect_private {
 	u8			dp_protocol_major;
 	u8			dp_protocol_minor;
 	__be16			dp_protocol_minor_mask; /* bitmask */
-	u8	dp_tos;
-	u8	dp_reserved1;
-	__be16	dp_reserved2;
+	u8			dp_tos;
+	u8			dp_reserved1;
+	__be16			dp_reserved2;
 	__be64			dp_ack_seq;
 	__be32			dp_credit;	/* non-zero enables flow ctl */
 };
@@ -226,6 +228,7 @@ struct rds_ib_connection {
 	struct rds_ib_path	i_cur_path;
 	unsigned int		i_alt_path_index;
 	unsigned int		i_active_side;
+	unsigned long		i_last_migration;
 
 	int			i_scq_vector;
 	int			i_rcq_vector;
@@ -278,6 +281,7 @@ struct rds_ib_port {
 	struct net_device	*dev;
 	unsigned int		port_state;
 	u8			port_num;
+	union ib_gid		gid;
 	char			port_label[4];
 	char			if_name[IFNAMSIZ];
 	__be32			ip_addr;
@@ -288,11 +292,17 @@ struct rds_ib_port {
 	struct rds_ib_alias	aliases[RDS_IB_MAX_ALIASES];
 };
 
+enum {
+	RDS_IB_PORT_EVENT_IB,
+	RDS_IB_PORT_EVENT_NET,
+};
+
 struct rds_ib_port_ud_work {
 	struct delayed_work	work;
 	struct net_device	*dev;
 	unsigned int		port;
 	int			timeout;
+	int			event_type;
 };
 
 enum {
@@ -378,6 +388,7 @@ struct rds_ib_statistics {
 	uint64_t	s_ib_srq_lows;
 	uint64_t	s_ib_srq_refills;
 	uint64_t	s_ib_srq_empty_refills;
+	uint64_t	s_ib_failed_apm;
 };
 
 extern struct workqueue_struct *rds_ib_wq;
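The ib.h constants above shift the default FMR page budget from an even
split to 3/4 for the 1M pool and 1/4 for the 8K pool, matching the runtime
re-balance that ib_rdma.c (further below) applies on allocation failure.
A quick arithmetic check of the defaults, derived directly from the macros:

    #include <stdio.h>

    #define RDS_FMR_1M_MSG_SIZE	256	/* 1M */
    #define RDS_FMR_8K_MSG_SIZE	2

    int main(void)
    {
        /* Old defaults gave each pool half of the 8192-page budget. */
        printf("1M pool: %d -> %d entries\n", 8192 / 2, 8192 * 3 / 4);
        printf("8K pool: %d -> %d entries\n",
               (256 / (RDS_FMR_8K_MSG_SIZE + 1)) * (8192 / 2),
               (256 / (RDS_FMR_8K_MSG_SIZE + 1)) * (8192 / 4));
        return 0;
    }

This prints 4096 -> 6144 for the 1M pool and 348160 -> 174080 for the 8K
pool, i.e. the 1M pool grows at the 8K pool's expense.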
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 8301c58f073f..45cc22311295 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -207,35 +207,40 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 		rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
 
 #if RDMA_RDS_APM_SUPPORTED
-	if (rds_ib_apm_enabled && !ic->conn->c_reconnect) {
-		memcpy(&ic->i_pri_path.p_sgid,
-		       &ic->i_cm_id->route.path_rec[0].sgid,
-		       sizeof(union ib_gid));
-
-		memcpy(&ic->i_pri_path.p_dgid,
-		       &ic->i_cm_id->route.path_rec[0].dgid,
-		       sizeof(union ib_gid));
-
-		memcpy(&ic->i_cur_path.p_sgid,
-		       &ic->i_cm_id->route.path_rec[0].sgid,
-		       sizeof(union ib_gid));
-
-		memcpy(&ic->i_cur_path.p_dgid,
-		       &ic->i_cm_id->route.path_rec[0].dgid,
-		       sizeof(union ib_gid));
-
-		printk(KERN_NOTICE "RDS/IB: connection "
-		       "<%u.%u.%u.%u,%u.%u.%u.%u,%d> primary path "
-		       "<"RDS_IB_GID_FMT","RDS_IB_GID_FMT">\n",
-		       NIPQUAD(conn->c_laddr),
-		       NIPQUAD(conn->c_faddr),
-		       conn->c_tos,
-		       RDS_IB_GID_ARG(ic->i_pri_path.p_sgid),
-		       RDS_IB_GID_ARG(ic->i_pri_path.p_dgid));
+	if (rds_ib_apm_enabled) {
+		struct rdma_dev_addr *dev_addr;
+
+		dev_addr = &ic->i_cm_id->route.addr.dev_addr;
+
+		if (!ic->conn->c_reconnect) {
+			rdma_addr_get_sgid(dev_addr,
+				(union ib_gid *)&ic->i_pri_path.p_sgid);
+			rdma_addr_get_dgid(dev_addr,
+				(union ib_gid *)&ic->i_pri_path.p_dgid);
+			printk(KERN_NOTICE "RDS/IB: connection "
+			       "<%u.%u.%u.%u,%u.%u.%u.%u,%d> primary path "
+			       "<"RDS_IB_GID_FMT","RDS_IB_GID_FMT">\n",
+			       NIPQUAD(conn->c_laddr),
+			       NIPQUAD(conn->c_faddr),
+			       conn->c_tos,
+			       RDS_IB_GID_ARG(ic->i_pri_path.p_sgid),
+			       RDS_IB_GID_ARG(ic->i_pri_path.p_dgid));
+		}
+		rdma_addr_get_sgid(dev_addr,
+			(union ib_gid *)&ic->i_cur_path.p_sgid);
+		rdma_addr_get_dgid(dev_addr,
+			(union ib_gid *)&ic->i_cur_path.p_dgid);
 	}
 #endif
 
 	rds_connect_complete(conn);
+
+#if RDMA_RDS_APM_SUPPORTED
+	if (ic->i_last_migration) {
+		rds_ib_stats_inc(s_ib_failed_apm);
+		ic->i_last_migration = 0;
+	}
+#endif
 }
 
 static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
@@ -435,6 +440,7 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
 			       RDS_IB_GID_ARG(ic->i_cur_path.p_sgid),
 			       RDS_IB_GID_ARG(ic->i_cur_path.p_dgid));
 		}
+		ic->i_last_migration = get_seconds();
 		break;
 
 	case IB_EVENT_PATH_MIG_ERR:
@@ -993,7 +999,7 @@ int rds_ib_conn_connect(struct rds_connection *conn)
 	/* XXX I wonder what affect the port space has */
 	/* delegate cm event handler to rdma_transport */
 	ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
-			RDMA_PS_TCP, IB_QPT_RC);
+				     RDMA_PS_TCP, IB_QPT_RC);
 	if (IS_ERR(ic->i_cm_id)) {
 		ret = PTR_ERR(ic->i_cm_id);
 		ic->i_cm_id = NULL;
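The i_last_migration bookkeeping above gives a cheap way to count failed
APM migrations: the timestamp is set when the QP reports IB_EVENT_PATH_MIG,
cleared on the next successful send completion (see ib_send.c below), and
if it is still set by the time the connection has had to reconnect, the new
"ib_apm_reconnect" counter (s_ib_failed_apm) is bumped. A minimal state
sketch of that life cycle; the struct and handler names are mine, not from
the patch:

    #include <stdio.h>
    #include <time.h>

    /* Illustrative model of the i_last_migration / s_ib_failed_apm logic. */
    struct conn_state {
        unsigned long last_migration; /* nonzero while a migration is unproven */
        unsigned long failed_apm;     /* migrations that ended in a reconnect */
    };

    static void on_path_migrated(struct conn_state *c)
    {
        c->last_migration = (unsigned long)time(NULL); /* like get_seconds() */
    }

    static void on_send_completion_ok(struct conn_state *c)
    {
        c->last_migration = 0; /* traffic flowed: the migration worked */
    }

    static void on_connect_complete(struct conn_state *c)
    {
        if (c->last_migration) { /* reconnected before any send succeeded */
            c->failed_apm++;
            c->last_migration = 0;
        }
    }

    int main(void)
    {
        struct conn_state c = { 0, 0 };

        on_path_migrated(&c);
        on_connect_complete(&c); /* migration never carried traffic */
        printf("failed APM migrations: %lu\n", c.failed_apm);
        return 0;
    }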
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 769941211161..e8d6e48d2e76 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -80,7 +80,7 @@ struct rds_ib_mr_pool {
 	atomic_t		free_pinned;	/* memory pinned by free MRs */
 	unsigned long		max_items;
-	unsigned long		max_items_soft;
+	atomic_t		max_items_soft;
 	unsigned long		max_free_pinned;
 	struct ib_fmr_attr	fmr_attr;
 };
@@ -251,7 +251,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
 		pool->max_items * pool->fmr_attr.max_pages / 4;
 	pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
 	pool->fmr_attr.page_shift = PAGE_SHIFT;
-	pool->max_items_soft = pool->max_items * 3 / 4;
+	atomic_set(&pool->max_items_soft, pool->max_items);
 
 	return pool;
 }
@@ -316,7 +316,6 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev,
 {
 	struct rds_ib_mr_pool *pool;
 	struct rds_ib_mr *ibmr = NULL;
-	struct rds_ib_mr *tmp_ibmr = NULL;
 	int err = 0, iter = 0;
 
 	if (npages <= RDS_FMR_8K_MSG_SIZE)
@@ -324,7 +323,8 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev,
 	else
 		pool = rds_ibdev->mr_1m_pool;
 
-	if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+	if (atomic_read(&pool->dirty_count) >=
+	    atomic_read(&pool->max_items_soft) / 10)
 		queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
 
 	while (1) {
@@ -381,25 +381,39 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev,
 	if (IS_ERR(ibmr->fmr)) {
 		err = PTR_ERR(ibmr->fmr);
 
-		/* Adjust the pool size to reflect the resources available to
-		 * the VM.
+		/* Re-balance the pool sizes to reflect the memory resources
+		 * available to the VM.
 		 */
 		if (err == -ENOMEM) {
-			int prev_max = pool->max_items;
-
-			pool->max_items = atomic_read(&pool->item_count);
-
-			printk(KERN_ERR "RDS/IB: Adjusted %s FMR pool (%d->%ld)\n", (pool->pool_type == RDS_IB_MR_8K_POOL) ? "8K" : "1M",
-			       prev_max, pool->max_items);
-
-			rds_ib_flush_mr_pool(pool, 0, &tmp_ibmr);
-			if (tmp_ibmr) {
-				kfree(ibmr);
-				return tmp_ibmr;
+			int total_pool_size =
+				atomic_read(&rds_ibdev->mr_8k_pool->item_count)
+				* (RDS_FMR_8K_MSG_SIZE + 1) +
+				atomic_read(&rds_ibdev->mr_1m_pool->item_count)
+				* RDS_FMR_1M_MSG_SIZE;
+
+			if (total_pool_size) {
+				int prev_8k_max = atomic_read(&rds_ibdev->mr_8k_pool->max_items_soft);
+				int prev_1m_max = atomic_read(&rds_ibdev->mr_1m_pool->max_items_soft);
+				atomic_set(&rds_ibdev->mr_8k_pool->max_items_soft, (total_pool_size / 4) / (RDS_FMR_8K_MSG_SIZE + 1));
+				atomic_set(&rds_ibdev->mr_1m_pool->max_items_soft, (total_pool_size * 3 / 4) / RDS_FMR_1M_MSG_SIZE);
+				printk(KERN_ERR "RDS/IB: "
+				       "Adjusted 8K FMR pool (%d->%d)\n",
+				       prev_8k_max,
+				       atomic_read(&rds_ibdev->mr_8k_pool->max_items_soft));
+				printk(KERN_ERR "RDS/IB: "
+				       "Adjusted 1K FMR pool (%d->%d)\n",
+				       prev_1m_max,
+				       atomic_read(&rds_ibdev->mr_1m_pool->max_items_soft));
+				rds_ib_flush_mr_pool(rds_ibdev->mr_8k_pool, 1,
+						     NULL);
+
+				rds_ib_flush_mr_pool(rds_ibdev->mr_1m_pool, 1,
+						     NULL);
+
+				err = -EAGAIN;
 			}
 		}
 		ibmr->fmr = NULL;
-		printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err);
 		goto out_no_cigar;
 	}
@@ -408,6 +422,11 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev,
 		rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc);
 	else
 		rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc);
+
+	if (atomic_read(&pool->item_count) >
+	    atomic_read(&pool->max_items_soft))
+		atomic_set(&pool->max_items_soft, pool->max_items);
+
 	return ibmr;
 
 out_no_cigar:
@@ -793,7 +812,8 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
 	/* If we've pinned too many pages, request a flush */
 	if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned
-	    || atomic_read(&pool->dirty_count) >= pool->max_items / 5)
+	    || atomic_read(&pool->dirty_count) >=
+	       atomic_read(&pool->max_items_soft) / 5)
 		queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
 
 	if (invalidate) {
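On an ib_alloc_fmr() ENOMEM, the old code shrank only the pool being
allocated from; the version above instead re-derives both soft limits from
the combined page budget currently pinned by the two pools, again giving
1/4 to the 8K pool and 3/4 to the 1M pool, flushes both pools, and fails
the allocation with -EAGAIN so it can be retried. A worked example of the
split (the occupancy numbers are made up):

    #include <stdio.h>

    #define RDS_FMR_1M_MSG_SIZE	256
    #define RDS_FMR_8K_MSG_SIZE	2

    int main(void)
    {
        /* Hypothetical pool occupancy at the time of the ENOMEM. */
        int items_8k = 30000;
        int items_1m = 2000;

        /* Combined budget in pages, as computed in rds_ib_alloc_fmr(). */
        int total = items_8k * (RDS_FMR_8K_MSG_SIZE + 1) +
                    items_1m * RDS_FMR_1M_MSG_SIZE;

        printf("total pages:     %d\n", total);
        printf("new 8K soft max: %d\n",
               (total / 4) / (RDS_FMR_8K_MSG_SIZE + 1));
        printf("new 1M soft max: %d\n",
               (total * 3 / 4) / RDS_FMR_1M_MSG_SIZE);
        return 0;
    }

With these inputs the 602000-page budget yields soft limits of 50166 8K
entries and 1763 1M entries.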
"8K" : "1M", - prev_max, pool->max_items); - - rds_ib_flush_mr_pool(pool, 0, &tmp_ibmr); - if (tmp_ibmr) { - kfree(ibmr); - return tmp_ibmr; + int total_pool_size = + atomic_read(&rds_ibdev->mr_8k_pool->item_count) + * (RDS_FMR_8K_MSG_SIZE + 1) + + atomic_read(&rds_ibdev->mr_1m_pool->item_count) + * RDS_FMR_1M_MSG_SIZE; + + if (total_pool_size) { + int prev_8k_max = atomic_read(&rds_ibdev->mr_8k_pool->max_items_soft); + int prev_1m_max = atomic_read(&rds_ibdev->mr_1m_pool->max_items_soft); + atomic_set(&rds_ibdev->mr_8k_pool->max_items_soft, (total_pool_size / 4) / (RDS_FMR_8K_MSG_SIZE + 1)); + atomic_set(&rds_ibdev->mr_1m_pool->max_items_soft, (total_pool_size * 3 / 4) / RDS_FMR_1M_MSG_SIZE); + printk(KERN_ERR "RDS/IB: " + "Adjusted 8K FMR pool (%d->%d)\n", + prev_8k_max, + atomic_read(&rds_ibdev->mr_8k_pool->max_items_soft)); + printk(KERN_ERR "RDS/IB: " + "Adjusted 1K FMR pool (%d->%d)\n", + prev_1m_max, + atomic_read(&rds_ibdev->mr_1m_pool->max_items_soft)); + rds_ib_flush_mr_pool(rds_ibdev->mr_8k_pool, 1, + NULL); + + rds_ib_flush_mr_pool(rds_ibdev->mr_1m_pool, 1, + NULL); + + err = -EAGAIN; } } ibmr->fmr = NULL; - printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err); goto out_no_cigar; } @@ -408,6 +422,11 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc); else rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc); + + if (atomic_read(&pool->item_count) > + atomic_read(&pool->max_items_soft)) + atomic_set(&pool->max_items_soft, pool->max_items); + return ibmr; out_no_cigar: @@ -793,7 +812,8 @@ void rds_ib_free_mr(void *trans_private, int invalidate) /* If we've pinned too many pages, request a flush */ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned - || atomic_read(&pool->dirty_count) >= pool->max_items / 5) + || atomic_read(&pool->dirty_count) >= + atomic_read(&pool->max_items_soft) / 5) queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); if (invalidate) { diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 52e497cad5f9..b46114076140 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -38,6 +38,7 @@ #include "rds.h" #include "ib.h" #include "tcp.h" + /* * Convert IB-specific error message to RDS error message and call core * completion handler. @@ -309,7 +310,8 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) "send completion on %pI4 " "had status %u, disconnecting and reconnecting\n", &conn->c_faddr, wc->status); - } + } else + ic->i_last_migration = 0; } /* diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index 80a4c90ac6cf..c93cc19eb617 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c @@ -80,6 +80,7 @@ static char *rds_ib_stat_names[] = { "ib_srq_lows", "ib_srq_refills", "ib_srq_empty_refills", + "ib_apm_reconnect", }; unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 82ca9ee34aee..e756376ea709 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -44,6 +44,8 @@ static struct rdma_cm_id *rds_iw_listen_id; +int rds_rdma_resolve_to_ms[] = {1000, 1000, 2000, 4000, 5000}; + int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { @@ -96,7 +98,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, /* XXX do we need to clean up if this fails? 
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 988aa458660e..5881b3d977ad 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -91,6 +91,8 @@ enum {
 #define RDS_IN_XMIT		2
 #define RDS_RECV_REFILL		3
 
+#define RDS_RDMA_RESOLVE_TO_MAX_INDEX	5
+
 struct rds_connection {
 	struct hlist_node	c_hash_node;
 	__be32			c_laddr;
@@ -147,6 +149,7 @@ struct rds_connection {
 	unsigned int		c_reconnect_drops;
 	int			c_reconnect_warn;
 	int			c_reconnect_err;
+	int			c_to_index;
 
 	unsigned int		c_reconnect
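Finally, the rdma_transport.c and rds.h changes replace the fixed
RDS_RDMA_RESOLVE_TIMEOUT_MS with a per-connection ladder of route
resolution timeouts: each successfully issued rdma_resolve_route() call
advances c_to_index (capped at the last slot), and a ROUTE_RESOLVED event
resets it, so a flaky path gets progressively longer timeouts. An
illustrative userspace walk through the ladder (function name mine):

    #include <stdio.h>

    static const int rds_rdma_resolve_to_ms[] = {1000, 1000, 2000, 4000, 5000};
    #define RDS_RDMA_RESOLVE_TO_MAX_INDEX 5

    struct conn { int c_to_index; };

    /* Timeout for the next rdma_resolve_route() call; the index then
     * advances, as the ADDR_RESOLVED handler does when the call is issued. */
    static int next_resolve_timeout(struct conn *c)
    {
        int ms = rds_rdma_resolve_to_ms[c->c_to_index];

        if (c->c_to_index < RDS_RDMA_RESOLVE_TO_MAX_INDEX - 1)
            c->c_to_index++;
        return ms;
    }

    int main(void)
    {
        struct conn c = { 0 };
        int i;

        for (i = 0; i < 6; i++)
            printf("attempt %d: %d ms\n", i + 1, next_resolve_timeout(&c));

        c.c_to_index = 0; /* RDMA_CM_EVENT_ROUTE_RESOLVED resets the ladder */
        printf("after success: %d ms\n", next_resolve_timeout(&c));
        return 0;
    }

This prints 1000, 1000, 2000, 4000, 5000, 5000 ms for successive attempts,
then 1000 ms again once a route has resolved.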