RDS: Fixes to improve throughput performance
author Bang Nguyen <bang.nguyen@oracle.com>
Fri, 19 Apr 2013 15:56:14 +0000 (08:56 -0700)
committer Mukesh Kacker <mukesh.kacker@oracle.com>
Wed, 8 Jul 2015 20:37:57 +0000 (13:37 -0700)
This fixes race conditions and adds other enhancements to improve
throughput; a sketch of the reconnect arbitration introduced in
net/rds/connection.c follows the sign-off below.

Ported from UEK2 patch dbe1629e3387d8c68009e1da51d1a1ca778f2501

(Changes related to LAP, the load-alternate-path handling, that the
original patch made in drivers/infiniband/core/cma.c are NOT ported
because we do not have APM support in rdma_cm)

Orabug: 16571410
Signed-off-by: Bang Nguyen <bang.nguyen@oracle.com>
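
The reconnect arbitration added in net/rds/connection.c makes both peers
evaluate the same address comparison during shutdown, so only the side
whose local address compares greater or equal requeues the reconnect and
the two ends no longer race each other. Below is a minimal userspace
sketch of that rule, assuming the addresses are handed around as raw
__be32 values the way struct rds_connection stores them;
should_queue_reconnect() and the sample addresses are illustrative only,
not part of the patch:

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

/* Mirrors the "c_laddr >= c_faddr" gate added in rds_conn_shutdown():
 * both peers apply the same total order to the address pair, so exactly
 * one of them requeues the reconnect (ties, i.e. loopback, still pass
 * because of the >=). */
static int should_queue_reconnect(uint32_t laddr_be, uint32_t faddr_be)
{
        return laddr_be >= faddr_be;
}

int main(void)
{
        uint32_t a, b;

        inet_pton(AF_INET, "192.168.1.10", &a); /* sample peer addresses */
        inet_pton(AF_INET, "192.168.1.20", &b);

        printf("%d\n", should_queue_reconnect(a, b)); /* 0: .10 side waits      */
        printf("%d\n", should_queue_reconnect(b, a)); /* 1: .20 side reconnects */
        return 0;
}
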
net/rds/connection.c
net/rds/ib.c
net/rds/ib.h
net/rds/ib_cm.c
net/rds/ib_rdma.c
net/rds/ib_send.c
net/rds/ib_stats.c
net/rds/rdma_transport.c
net/rds/rds.h

net/rds/connection.c
index e21fc5f0a9dcbd9f08811b1a8f8f17151d18fc3a..ed6b2765e134530cffda38c1f1a4a36e116c1a4f 100644
@@ -345,7 +345,8 @@ void rds_conn_shutdown(struct rds_connection *conn)
        rcu_read_lock();
        if (!hlist_unhashed(&conn->c_hash_node)) {
                rcu_read_unlock();
-               rds_queue_reconnect(conn);
+               if (conn->c_laddr >= conn->c_faddr)
+                       rds_queue_reconnect(conn);
        } else {
                rcu_read_unlock();
        }
net/rds/ib.c
index 43415423e76e05c7a022d0ccc99221d16ee97de7..38b973072a9fbbeb91b6bed8ddf11a2785664609 100644
@@ -40,6 +40,7 @@
 #include <linux/delay.h>
 #include <rdma/ib_cache.h>
 #include <net/sock.h>
+#include <net/route.h>
 #include <net/inet_common.h>
 #include <linux/rtnetlink.h>
 
@@ -65,6 +66,7 @@ unsigned int rds_ib_rnr_retry_count = RDS_IB_DEFAULT_RNR_RETRY_COUNT;
 unsigned int rds_ib_cq_balance_enabled = 1;
 #endif
 static char *rds_ib_haip_failover_groups = NULL;
+unsigned int rds_ib_haip_arps = RDS_IB_DEFAULT_NUM_ARPS;
 
 module_param(rds_ib_fmr_1m_pool_size, int, 0444);
 MODULE_PARM_DESC(rds_ib_fmr_1m_pool_size, " Max number of 1m fmr per HCA");
@@ -97,6 +99,8 @@ MODULE_PARM_DESC(rds_ib_haip_failover_groups,
 module_param(rds_ib_cq_balance_enabled, int, 0444);
 MODULE_PARM_DESC(rds_ib_cq_balance_enabled, " CQ load balance Enabled");
 #endif
+module_param(rds_ib_haip_arps, int, 0444);
+MODULE_PARM_DESC(rds_ib_haip_arps, " Num ARPs to be sent when IP moved");
 
 /*
  * we have a clumsy combination of RCU and a rwsem protecting this list
@@ -268,10 +272,21 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
                struct rdma_dev_addr *dev_addr;
 
                ic = conn->c_transport_data;
-               dev_addr = &ic->i_cm_id->route.addr.dev_addr;
-
-               rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
-               rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
+#if RDMA_RDS_APM_SUPPORTED
+               if (rds_ib_apm_enabled) {
+                       memcpy((union ib_gid *) &iinfo->src_gid,
+                               &ic->i_cur_path.p_sgid, sizeof(union ib_gid));
+                       memcpy((union ib_gid *) &iinfo->dst_gid,
+                               &ic->i_cur_path.p_dgid, sizeof(union ib_gid));
+               } else
+#endif
+               {
+                       dev_addr = &ic->i_cm_id->route.addr.dev_addr;
+                       rdma_addr_get_sgid(dev_addr,
+                               (union ib_gid *) &iinfo->src_gid);
+                       rdma_addr_get_dgid(dev_addr,
+                               (union ib_gid *) &iinfo->dst_gid);
+               }
 
                rds_ibdev = ic->rds_ibdev;
                iinfo->max_send_wr = ic->i_send_ring.w_nr;
@@ -365,10 +380,15 @@ static void rds_ib_send_gratuitous_arp(struct net_device  *out_dev,
                                        unsigned char           *dev_addr,
                                        __be32                  ip_addr)
 {
-       arp_send(ARPOP_REQUEST, ETH_P_ARP,
-               ip_addr, out_dev,
-               ip_addr, NULL,
-               dev_addr, NULL);
+       int i;
+
+       /* Send multiple ARPs to improve reliability */
+       for (i = 0; i < rds_ib_haip_arps; i++) {
+               arp_send(ARPOP_REQUEST, ETH_P_ARP,
+                       ip_addr, out_dev,
+                       ip_addr, NULL,
+                       dev_addr, NULL);
+       }
 }
 
 static int rds_ib_set_ip(struct net_device     *out_dev,
@@ -471,6 +491,7 @@ static int rds_ib_move_ip(char                      *from_dev,
                        __be32                  addr,
                        __be32                  bcast,
                        __be32                  mask,
+                       int                     event_type,
                        int                     failover)
 {
        struct ifreq            *ir;
@@ -577,6 +598,30 @@ static int rds_ib_move_ip(char                     *from_dev,
                printk(KERN_NOTICE
                        "RDS/IB: IP %u.%u.%u.%u migrated from %s to %s\n",
                                NIPQUAD(addr), from_dev2, to_dev2);
+
+               if (event_type == RDS_IB_PORT_EVENT_NET) {
+                       unsigned long flags;
+                       struct rds_ib_connection *ic;
+                       struct rds_ib_device *rds_ibdev;
+
+                       rds_ibdev = ip_config[to_port].rds_ibdev;
+                       spin_lock_irqsave(&rds_ibdev->spinlock, flags);
+                       list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
+                               if (ic->conn->c_laddr == addr) {
+#if RDMA_RDS_APM_SUPPORTED
+                                       if (rds_ib_apm_enabled) {
+                                               if (!memcmp(
+                                                       &ic->i_cur_path.p_sgid,
+                                                       &ip_config[to_port].gid,
+                                                       sizeof(union ib_gid))) {
+                                                       continue;
+                                               }
+                                       }
+#endif
+                                       rds_conn_drop(ic->conn);
+                               }
+                       spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
+               }
        }
 
 out:
@@ -586,9 +631,41 @@ out:
        return ret;
 }
 
+static void rds_ib_check_up_port(void)
+{
+       struct net_device *dev;
+       int     downs;
+       int     retries = 0;
+
+retry:
+       downs = 0;
+       read_lock(&dev_base_lock);
+       for_each_netdev(&init_net, dev) {
+               if ((dev->type == ARPHRD_INFINIBAND) &&
+                               !(dev->flags & IFF_SLAVE) &&
+                               !(dev->flags & IFF_MASTER)) {
+                       if (dev->operstate != IF_OPER_UP)
+                               downs++;
+               }
+       }
+       read_unlock(&dev_base_lock);
+
+       if (downs) {
+               if (retries++ <= 60) {
+                       msleep(1000);
+                       goto retry;
+               } else {
+                       printk(KERN_ERR "RDS/IB: Some port(s) may not be "
+                                       "operational\n");
+               }
+       }
+}
+
+
 static u8 rds_ib_init_port(struct rds_ib_device        *rds_ibdev,
                                struct net_device       *net_dev,
-                               u8                      port_num)
+                               u8                      port_num,
+                               union ib_gid            gid)
 {
        const char *digits = "0123456789";
 
@@ -609,6 +686,7 @@ static u8 rds_ib_init_port(struct rds_ib_device     *rds_ibdev,
        ip_config[ip_port_cnt].rds_ibdev = rds_ibdev;
        ip_config[ip_port_cnt].ip_active_port = 0;
        strcpy(ip_config[ip_port_cnt].if_name, net_dev->name);
+       memcpy(&ip_config[ip_port_cnt].gid, &gid, sizeof(union ib_gid));
 
        if (net_dev->operstate == IF_OPER_UP)
                ip_config[ip_port_cnt].port_state = RDS_IB_PORT_UP;
@@ -643,7 +721,8 @@ static void rds_ib_set_port(struct rds_ib_device    *rds_ibdev,
        }
 }
 
-static void rds_ib_do_failover(u8 from_port, u8 to_port, u8 arp_port)
+static void rds_ib_do_failover(u8 from_port, u8 to_port, u8 arp_port,
+                               int event_type)
 {
        u8      j;
        int     ret;
@@ -667,6 +746,7 @@ static void rds_ib_do_failover(u8 from_port, u8 to_port, u8 arp_port)
                        ip_config[from_port].ip_addr,
                        ip_config[from_port].ip_bcast,
                        ip_config[from_port].ip_mask,
+                       event_type,
                        1)) {
 
                        ip_config[from_port].ip_active_port = to_port;
@@ -686,13 +766,14 @@ static void rds_ib_do_failover(u8 from_port, u8 to_port, u8 arp_port)
                                                aliases[j].ip_bcast,
                                        ip_config[from_port].
                                                aliases[j].ip_mask,
+                                       event_type,
                                        1);
                        }
                }
        }
 }
 
-static void rds_ib_do_failback(u8 port)
+static void rds_ib_do_failback(u8 port, int event_type)
 {
        u8      ip_active_port = ip_config[port].ip_active_port;
        u8      j;
@@ -711,6 +792,7 @@ static void rds_ib_do_failback(u8 port)
                        ip_config[port].ip_addr,
                        ip_config[port].ip_bcast,
                        ip_config[port].ip_mask,
+                       event_type,
                        0)) {
 
                        ip_config[port].ip_active_port = port;
@@ -731,6 +813,7 @@ static void rds_ib_do_failback(u8 port)
                                                aliases[j].ip_bcast,
                                        ip_config[port].
                                                aliases[j].ip_mask,
+                                       event_type,
                                        0);
                        }
                }
@@ -758,12 +841,12 @@ static void rds_ib_failover(struct work_struct *_work)
                        if_name[IFNAMSIZ-1] = 0;
                        ret = rds_ib_set_ip(NULL, NULL, if_name, 0, 0, 0);
 
-                       rds_ib_do_failover(i, 0, 0);
+                       rds_ib_do_failover(i, 0, 0, work->event_type);
                }
        }
 
        if (ip_config[work->port].ip_addr)
-               rds_ib_do_failover(work->port, 0, 0);
+               rds_ib_do_failover(work->port, 0, 0, work->event_type);
 
        if (ip_config[work->port].ip_active_port == work->port) {
                ret = rds_ib_set_ip(NULL, NULL,
@@ -784,7 +867,7 @@ static void rds_ib_failback(struct work_struct *_work)
 
        ip_active_port = ip_config[port].ip_active_port;
 
-       rds_ib_do_failback(port);
+       rds_ib_do_failback(port, work->event_type);
 
        for (i = 1; i <= ip_port_cnt; i++) {
                if (i == port ||
@@ -793,15 +876,19 @@ static void rds_ib_failback(struct work_struct *_work)
                        continue;
 
                if (ip_config[i].ip_active_port == i) {
-                       rds_ib_do_failover(i, 0, ip_active_port);
+                       rds_ib_do_failover(i, 0, ip_active_port,
+                                               work->event_type);
                } else if (ip_config[i].ip_active_port == port) {
-                       rds_ib_do_failover(i, port, ip_active_port);
+                       rds_ib_do_failover(i, port, ip_active_port,
+                                               work->event_type);
                } else if (ip_config[ip_config[i].ip_active_port].port_state ==
                                RDS_IB_PORT_DOWN) {
-                       rds_ib_do_failover(i, 0, ip_active_port);
+                       rds_ib_do_failover(i, 0, ip_active_port,
+                                               work->event_type);
                } else if (ip_config[port].failover_group ==
                                ip_config[i].failover_group) {
-                       rds_ib_do_failover(i, port, ip_active_port);
+                       rds_ib_do_failover(i, port, ip_active_port,
+                                               work->event_type);
                }
        }
 
@@ -812,7 +899,8 @@ static void rds_ib_failback(struct work_struct *_work)
                                ip_config[i].ip_active_port == ip_active_port) {
 
                                rds_ib_do_failover(i, ip_active_port,
-                                                       ip_active_port);
+                                                       ip_active_port,
+                                                       work->event_type);
                        }
                }
        }
@@ -888,6 +976,7 @@ static void rds_ib_event_handler(struct ib_event_handler *handler,
                }
 
                work->port = port;
+               work->event_type = RDS_IB_PORT_EVENT_IB;
 
                if (event->event == IB_EVENT_PORT_ACTIVE) {
                        if (rds_ib_haip_fallback) {
@@ -956,6 +1045,8 @@ static int rds_ib_ip_config_init(void)
        if (!rds_ib_haip_enabled)
                return 0;
 
+       rds_ib_check_up_port();
+
        rcu_read_unlock();
 
        ip_config = kzalloc(sizeof(struct rds_ib_port) *
@@ -990,7 +1081,7 @@ static int rds_ib_ip_config_init(void)
                                        RDS_IB_GID_ARG(gid));
                        } else {
                                port = rds_ib_init_port(rds_ibdev, dev,
-                                       port_num);
+                                       port_num, gid);
                                if (port > 0) {
                                        for (ifap = &in_dev->ifa_list;
                                                (ifa = *ifap);
@@ -1235,6 +1326,7 @@ static int rds_ib_netdev_callback(struct notifier_block *self, unsigned long eve
 
        work->dev = ndev;
        work->port = port;
+       work->event_type = RDS_IB_PORT_EVENT_NET;
 
        switch (event) {
        case NETDEV_UP:
net/rds/ib.h
index dda31af1aa595fbedb6ec353ec128d98736c4ece..6711c6cb64bb6fa1dd3a9214fc5db1c07de930d7 100644
@@ -8,10 +8,10 @@
 #include "rds.h"
 #include "rdma_transport.h"
 
-#define RDS_FMR_1M_POOL_SIZE           (8192 / 2)
+#define RDS_FMR_1M_POOL_SIZE           (8192 * 3 / 4)
 #define RDS_FMR_1M_MSG_SIZE            256  /* 1M */
 #define RDS_FMR_8K_MSG_SIZE             2
-#define RDS_FMR_8K_POOL_SIZE            ((256 / (RDS_FMR_8K_MSG_SIZE + 1)) * (8192 / 2))
+#define RDS_FMR_8K_POOL_SIZE           ((256 / (RDS_FMR_8K_MSG_SIZE + 1)) * (8192 / 4))
 
 #define RDS_IB_MAX_SGE                 8
 #define RDS_IB_RECV_SGE                        2
@@ -26,6 +26,8 @@
 
 #define RDS_IB_DEFAULT_RNR_RETRY_COUNT  7
 
+#define RDS_IB_DEFAULT_NUM_ARPS                100
+
 #define RDS_IB_DEFAULT_TIMEOUT          16 /* 4.096 * 2 ^ 16 = 260 msec */
 
 #define RDS_IB_SUPPORTED_PROTOCOLS     0x00000003      /* minor versions supported */
@@ -83,9 +85,9 @@ struct rds_ib_connect_private {
        u8                      dp_protocol_major;
        u8                      dp_protocol_minor;
        __be16                  dp_protocol_minor_mask; /* bitmask */
-       u8                      dp_tos;
-       u8                      dp_reserved1;
-       __be16                  dp_reserved2;
+       u8                      dp_tos;
+       u8                      dp_reserved1;
+       __be16                  dp_reserved2;
        __be64                  dp_ack_seq;
        __be32                  dp_credit;              /* non-zero enables flow ctl */
 };
@@ -226,6 +228,7 @@ struct rds_ib_connection {
        struct rds_ib_path      i_cur_path;
        unsigned int            i_alt_path_index;
        unsigned int            i_active_side;
+       unsigned long           i_last_migration;
 
        int                     i_scq_vector;
        int                     i_rcq_vector;
@@ -278,6 +281,7 @@ struct rds_ib_port {
        struct net_device       *dev;
        unsigned int            port_state;
        u8                      port_num;
+       union ib_gid            gid;
        char                    port_label[4];
        char                    if_name[IFNAMSIZ];
        __be32                  ip_addr;
@@ -288,11 +292,17 @@ struct rds_ib_port {
        struct rds_ib_alias     aliases[RDS_IB_MAX_ALIASES];
 };
 
+enum {
+       RDS_IB_PORT_EVENT_IB,
+       RDS_IB_PORT_EVENT_NET,
+};
+
 struct rds_ib_port_ud_work {
        struct delayed_work             work;
        struct net_device               *dev;
        unsigned int                    port;
        int                             timeout;
+       int                             event_type;
 };
 
 enum {
@@ -378,6 +388,7 @@ struct rds_ib_statistics {
        uint64_t        s_ib_srq_lows;
        uint64_t        s_ib_srq_refills;
        uint64_t        s_ib_srq_empty_refills;
+       uint64_t        s_ib_failed_apm;
 };
 
 extern struct workqueue_struct *rds_ib_wq;
net/rds/ib_cm.c
index 8301c58f073fb340812ee8200ae664d42d2ad163..45cc22311295004423bb2daa191aa4899617353c 100644
@@ -207,35 +207,40 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
                rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
 
 #if RDMA_RDS_APM_SUPPORTED
-       if (rds_ib_apm_enabled && !ic->conn->c_reconnect) {
-               memcpy(&ic->i_pri_path.p_sgid,
-                       &ic->i_cm_id->route.path_rec[0].sgid,
-                       sizeof(union ib_gid));
-
-               memcpy(&ic->i_pri_path.p_dgid,
-                       &ic->i_cm_id->route.path_rec[0].dgid,
-                       sizeof(union ib_gid));
-
-               memcpy(&ic->i_cur_path.p_sgid,
-                       &ic->i_cm_id->route.path_rec[0].sgid,
-                       sizeof(union ib_gid));
-
-               memcpy(&ic->i_cur_path.p_dgid,
-                       &ic->i_cm_id->route.path_rec[0].dgid,
-                       sizeof(union ib_gid));
-
-               printk(KERN_NOTICE "RDS/IB: connection "
-                       "<%u.%u.%u.%u,%u.%u.%u.%u,%d> primary path "
-                       "<"RDS_IB_GID_FMT","RDS_IB_GID_FMT">\n",
-                       NIPQUAD(conn->c_laddr),
-                       NIPQUAD(conn->c_faddr),
-                       conn->c_tos,
-                       RDS_IB_GID_ARG(ic->i_pri_path.p_sgid),
-                       RDS_IB_GID_ARG(ic->i_pri_path.p_dgid));
+       if (rds_ib_apm_enabled) {
+               struct rdma_dev_addr *dev_addr;
+
+               dev_addr = &ic->i_cm_id->route.addr.dev_addr;
+
+               if (!ic->conn->c_reconnect) {
+                       rdma_addr_get_sgid(dev_addr,
+                               (union ib_gid *)&ic->i_pri_path.p_sgid);
+                       rdma_addr_get_dgid(dev_addr,
+                               (union ib_gid *)&ic->i_pri_path.p_dgid);
+                       printk(KERN_NOTICE "RDS/IB: connection "
+                               "<%u.%u.%u.%u,%u.%u.%u.%u,%d> primary path "
+                               "<"RDS_IB_GID_FMT","RDS_IB_GID_FMT">\n",
+                               NIPQUAD(conn->c_laddr),
+                               NIPQUAD(conn->c_faddr),
+                               conn->c_tos,
+                               RDS_IB_GID_ARG(ic->i_pri_path.p_sgid),
+                               RDS_IB_GID_ARG(ic->i_pri_path.p_dgid));
+               }
+               rdma_addr_get_sgid(dev_addr,
+                       (union ib_gid *)&ic->i_cur_path.p_sgid);
+               rdma_addr_get_dgid(dev_addr,
+                       (union ib_gid *)&ic->i_cur_path.p_dgid);
        }
 #endif
 
        rds_connect_complete(conn);
+
+#if RDMA_RDS_APM_SUPPORTED
+        if (ic->i_last_migration) {
+                rds_ib_stats_inc(s_ib_failed_apm);
+                ic->i_last_migration = 0;
+        }
+#endif
 }
 
 static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
@@ -435,6 +440,7 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
                                RDS_IB_GID_ARG(ic->i_cur_path.p_sgid),
                                RDS_IB_GID_ARG(ic->i_cur_path.p_dgid));
                }
+               ic->i_last_migration = get_seconds();
 
                break;
        case IB_EVENT_PATH_MIG_ERR:
@@ -993,7 +999,7 @@ int rds_ib_conn_connect(struct rds_connection *conn)
        /* XXX I wonder what affect the port space has */
        /* delegate cm event handler to rdma_transport */
        ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
-                                    RDMA_PS_TCP, IB_QPT_RC);
+                                       RDMA_PS_TCP, IB_QPT_RC);
        if (IS_ERR(ic->i_cm_id)) {
                ret = PTR_ERR(ic->i_cm_id);
                ic->i_cm_id = NULL;
net/rds/ib_rdma.c
index 76994121116192e3e2c789e50f7b65f3723f4e56..e8d6e48d2e76a5569bbaa38891fdc8de91229ae9 100644
@@ -80,7 +80,7 @@ struct rds_ib_mr_pool {
 
        atomic_t                free_pinned;            /* memory pinned by free MRs */
        unsigned long           max_items;
-       unsigned long           max_items_soft;
+       atomic_t                max_items_soft;
        unsigned long           max_free_pinned;
        struct ib_fmr_attr      fmr_attr;
 };
@@ -251,7 +251,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
                pool->max_items * pool->fmr_attr.max_pages / 4;
        pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
        pool->fmr_attr.page_shift = PAGE_SHIFT;
-       pool->max_items_soft = pool->max_items * 3 / 4;
+       atomic_set(&pool->max_items_soft, pool->max_items);
 
        return pool;
 }
@@ -316,7 +316,6 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev,
 {
        struct rds_ib_mr_pool *pool;
        struct rds_ib_mr *ibmr = NULL;
-       struct rds_ib_mr *tmp_ibmr = NULL;
        int err = 0, iter = 0;
 
        if (npages <= RDS_FMR_8K_MSG_SIZE)
@@ -324,7 +323,8 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev,
        else
                pool = rds_ibdev->mr_1m_pool;
 
-       if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+       if (atomic_read(&pool->dirty_count) >=
+               atomic_read(&pool->max_items_soft) / 10)
                queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
 
        while (1) {
@@ -381,25 +381,39 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev,
        if (IS_ERR(ibmr->fmr)) {
                err = PTR_ERR(ibmr->fmr);
 
-               /* Adjust the pool size to reflect the resources available to
-                * the VM.
+               /* Re-balance the pool sizes to reflect the memory resources
+                * available to the VM.
                 */
                if (err == -ENOMEM) {
-                       int prev_max = pool->max_items;
-
-                       pool->max_items = atomic_read(&pool->item_count);
-
-                       printk(KERN_ERR "RDS/IB: Adjusted %s FMR pool (%d->%ld)\n", (pool->pool_type == RDS_IB_MR_8K_POOL) ? "8K" : "1M",
-                               prev_max, pool->max_items);
-
-                       rds_ib_flush_mr_pool(pool, 0, &tmp_ibmr);
-                       if (tmp_ibmr) {
-                               kfree(ibmr);
-                               return tmp_ibmr;
+                       int total_pool_size =
+                               atomic_read(&rds_ibdev->mr_8k_pool->item_count)
+                                       * (RDS_FMR_8K_MSG_SIZE + 1) +
+                               atomic_read(&rds_ibdev->mr_1m_pool->item_count)
+                                       * RDS_FMR_1M_MSG_SIZE;
+
+                       if (total_pool_size) {
+                               int prev_8k_max = atomic_read(&rds_ibdev->mr_8k_pool->max_items_soft);
+                               int prev_1m_max = atomic_read(&rds_ibdev->mr_1m_pool->max_items_soft);
+                               atomic_set(&rds_ibdev->mr_8k_pool->max_items_soft, (total_pool_size / 4) / (RDS_FMR_8K_MSG_SIZE + 1));
+                               atomic_set(&rds_ibdev->mr_1m_pool->max_items_soft, (total_pool_size * 3 / 4) / RDS_FMR_1M_MSG_SIZE);
+                               printk(KERN_ERR "RDS/IB: "
+                                       "Adjusted 8K FMR pool (%d->%d)\n",
+                                       prev_8k_max,
+                                       atomic_read(&rds_ibdev->mr_8k_pool->max_items_soft));
+                               printk(KERN_ERR "RDS/IB: "
+                                       "Adjusted 1M FMR pool (%d->%d)\n",
+                                       prev_1m_max,
+                                       atomic_read(&rds_ibdev->mr_1m_pool->max_items_soft));
+                               rds_ib_flush_mr_pool(rds_ibdev->mr_8k_pool, 1,
+                                                       NULL);
+
+                               rds_ib_flush_mr_pool(rds_ibdev->mr_1m_pool, 1,
+                                                       NULL);
+
+                               err = -EAGAIN;
                        }
                }
                ibmr->fmr = NULL;
-               printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err);
                goto out_no_cigar;
        }
 
@@ -408,6 +422,11 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev,
                rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc);
        else
                rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc);
+
+       if (atomic_read(&pool->item_count) >
+               atomic_read(&pool->max_items_soft))
+               atomic_set(&pool->max_items_soft, pool->max_items);
+
        return ibmr;
 
 out_no_cigar:
@@ -793,7 +812,8 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
 
        /* If we've pinned too many pages, request a flush */
        if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned
-        || atomic_read(&pool->dirty_count) >= pool->max_items / 5)
+        || atomic_read(&pool->dirty_count) >=
+               atomic_read(&pool->max_items_soft) / 5)
                queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
 
        if (invalidate) {
net/rds/ib_send.c
index 52e497cad5f977b6b7c8d8bb28a117077f30066e..b4611407614075459f4b04db60cf323ae1c742aa 100644
@@ -38,6 +38,7 @@
 #include "rds.h"
 #include "ib.h"
 #include "tcp.h"
+
 /*
  * Convert IB-specific error message to RDS error message and call core
  * completion handler.
@@ -309,7 +310,8 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
                        "send completion on %pI4 "
                        "had status %u, disconnecting and reconnecting\n",
                        &conn->c_faddr, wc->status);
-       }
+       } else
+               ic->i_last_migration = 0;
 }
 
 /*
net/rds/ib_stats.c
index 80a4c90ac6cfd8cd649da30735593a49df9c0afd..c93cc19eb617ebf8cd8b37b1cb6c52c935e08412 100644
@@ -80,6 +80,7 @@ static char *rds_ib_stat_names[] = {
        "ib_srq_lows",
        "ib_srq_refills",
        "ib_srq_empty_refills",
+       "ib_apm_reconnect",
 };
 
 unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
net/rds/rdma_transport.c
index 82ca9ee34aee667921034b8b71739c1348ea3e30..e756376ea7094f9bc6980178ba970304bc08db04 100644
@@ -44,6 +44,8 @@
 
 static struct rdma_cm_id *rds_iw_listen_id;
 
+int rds_rdma_resolve_to_ms[] = {1000, 1000, 2000, 4000, 5000};
+
 int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
                              struct rdma_cm_event *event)
 {
@@ -96,7 +98,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
 
                /* XXX do we need to clean up if this fails? */
                ret = rdma_resolve_route(cm_id,
-                                        RDS_RDMA_RESOLVE_TIMEOUT_MS);
+                               rds_rdma_resolve_to_ms[conn->c_to_index]);
                if (ret) {
                        /*
                         * The cm_id will get destroyed by addr_handler
@@ -111,12 +113,14 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
                                if (ibic && ibic->i_cm_id == cm_id)
                                        ibic->i_cm_id = NULL;
                                rds_conn_drop(conn);
-                       }
+                       } else if (conn->c_to_index < (RDS_RDMA_RESOLVE_TO_MAX_INDEX-1))
+                               conn->c_to_index++;
                }
                break;
 
        case RDMA_CM_EVENT_ROUTE_RESOLVED:
                /* XXX worry about racing with listen acceptance */
+               conn->c_to_index = 0;
                ret = trans->cm_initiate_connect(cm_id);
                break;
 
net/rds/rds.h
index 988aa458660e83e2a53f9d0eab7c6ea3e5fbdfd4..5881b3d977ad964bf0abb26f86fa3924543f270a 100644
@@ -91,6 +91,8 @@ enum {
 #define RDS_IN_XMIT            2
 #define RDS_RECV_REFILL                3
 
+#define RDS_RDMA_RESOLVE_TO_MAX_INDEX   5
+
 struct rds_connection {
        struct hlist_node       c_hash_node;
        __be32                  c_laddr;
@@ -147,6 +149,7 @@ struct rds_connection {
        unsigned int            c_reconnect_drops;
        int                     c_reconnect_warn;
        int                     c_reconnect_err;
+       int                     c_to_index;
 
        unsigned int            c_reconnect;