From: Ajaykumar Hotchandani Date: Wed, 18 May 2016 01:54:42 +0000 (-0700) Subject: IB/ipoib: Change send workqueue size for CM mode X-Git-Tag: v4.1.12-92~114^2~2 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=d47f890662b5b035557d6b33f33be4c529d1a0a9;p=users%2Fjedix%2Flinux-maple.git IB/ipoib: Change send workqueue size for CM mode The idea here is that one misbehaving connection should not become a single point of failure. priv->tx_outstanding is shared by all QPs, and when it reaches sendq_size the network interface queue is stopped. In connected mode, the TX QP size for every connection is sendq_size. So if one QP starts behaving badly and we do not receive its send completions in time, priv->tx_outstanding can reach the limit at which the network interface queue must be stopped. This can bring down the entire cluster, because from that point onwards even a ping will not go through. With this patch, when creating a CM QP for send operations, we limit its size: +int ipoib_cm_sendq_size __read_mostly = ipoib_sendq_size / 8; Based on Yuval's suggestion, a module parameter was added to dictate how many bad connections we want to tolerate (the 8 above is configurable). If the number of outstanding completions for a particular connection reaches ipoib_cm_sendq_size, we halt sending data on that connection until we receive at least one completion. In summary, this change requires multiple QPs (instead of one) to misbehave in order to bring down the entire cluster. As clarification, this patch does not try to recover or change the behavior of a connection that may have gone bad; it only reduces the impact of a bad connection. 
Orabug: 23254764 Signed-off-by: Ajaykumar Hotchandani Reviewed-by: Yuval Shaia --- diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 1343ea796e880..421b4969ccf09 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -71,6 +71,7 @@ enum { IPOIB_CM_BUF_SIZE = IPOIB_CM_MTU + IPOIB_ENCAP_LEN, IPOIB_CM_HEAD_SIZE = IPOIB_CM_BUF_SIZE % PAGE_SIZE, IPOIB_CM_RX_SG = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE, + IPOIB_CM_MAX_BAD_CONNS = 8, /* Keep network interface queue running even with 4 bad connections */ IPOIB_RX_RING_SIZE = 256, IPOIB_TX_RING_SIZE = 128, IPOIB_MAX_QUEUE_SIZE = 8192, @@ -782,6 +783,7 @@ static inline void ipoib_unregister_debugfs(void) { } extern int ipoib_sendq_size; extern int ipoib_recvq_size; +extern int ipoib_cm_sendq_size; extern struct ib_sa_client ipoib_sa_client; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index d4341b8b247d9..60424e9623d88 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -797,6 +797,14 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ if (ipoib_linearize_skb(dev, skb, priv, tx->max_send_sge) < 0) return; + if ((tx->tx_head - tx->tx_tail) >= ipoib_cm_sendq_size) { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + ipoib_dbg_data(priv, "dropping packet: length %d connection 0x%x\n", + skb->len, tx->qp->qp_num); + return; + } + ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n", tx->tx_head, skb->len, tx->qp->qp_num); @@ -807,7 +815,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ * means we have to make sure everything is properly recorded and * our state is consistent before we call post_send(). 
*/ - tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)]; + tx_req = &tx->tx_ring[tx->tx_head & (ipoib_cm_sendq_size - 1)]; tx_req->skb = skb; /* Calculate checksum if we support ibcrc_as_csum but peer is not */ @@ -829,7 +837,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ return; } rc = post_send_sg(priv, tx, tx->tx_head & - (ipoib_sendq_size - 1), + (ipoib_cm_sendq_size - 1), skb, tx_req->mapping); } else { addr = ib_dma_map_single(priv->ca, skb->data, skb->len, @@ -845,7 +853,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ skb_orphan(skb); skb_dst_drop(skb); - rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), + rc = post_send(priv, tx, tx->tx_head & (ipoib_cm_sendq_size - 1), addr, skb->len); } if (unlikely(rc)) { @@ -882,9 +890,9 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n", wr_id, wc->status); - if (unlikely(wr_id >= ipoib_sendq_size)) { + if (unlikely(wr_id >= ipoib_cm_sendq_size)) { ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n", - wr_id, ipoib_sendq_size); + wr_id, ipoib_cm_sendq_size); return; } @@ -1143,7 +1151,7 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_ .send_cq = priv->recv_cq, .recv_cq = priv->recv_cq, .srq = priv->cm.srq, - .cap.max_send_wr = ipoib_sendq_size, + .cap.max_send_wr = ipoib_cm_sendq_size, .cap.max_send_sge = 1, .sq_sig_type = IB_SIGNAL_ALL_WR, .qp_type = IB_QPT_RC, @@ -1243,14 +1251,14 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, struct ipoib_dev_priv *priv = netdev_priv(p->dev); int ret; - p->tx_ring = __vmalloc(ipoib_sendq_size * sizeof *p->tx_ring, + p->tx_ring = __vmalloc(ipoib_cm_sendq_size * sizeof(*p->tx_ring), GFP_NOIO, PAGE_KERNEL); if (!p->tx_ring) { ipoib_warn(priv, "failed to allocate tx ring\n"); ret = -ENOMEM; goto err_tx; } - memset(p->tx_ring, 0, 
ipoib_sendq_size * sizeof *p->tx_ring); + memset(p->tx_ring, 0, ipoib_cm_sendq_size * sizeof(*p->tx_ring)); p->qp = ipoib_cm_create_tx_qp(p->dev, p); if (IS_ERR(p->qp)) { @@ -1325,7 +1333,7 @@ static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p) timeout: while ((int) p->tx_tail - (int) p->tx_head < 0) { - tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)]; + tx_req = &p->tx_ring[p->tx_tail & (ipoib_cm_sendq_size - 1)]; ipoib_cm_dma_unmap_tx(priv, tx_req); dev_kfree_skb_any(tx_req->skb); ++p->tx_tail; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index b5ce55b58b963..b594f5171255f 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -62,6 +62,17 @@ MODULE_VERSION(DRV_VERSION); int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE; int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE; int unload_allowed __read_mostly = 1; +/* IPOIB_CM_MAX_BAD_CONNS default value (8) is inline with current + * Exadata-ZFS deployment. + * We usually have 2 ZFS heads in current deployment. + * Considering this, maximum four connections can go bad (assuming + * unlikely scenario where all connections went bad simultaneously) + * With CM connection workqueue size, which is 1/8th of port limit, + * as defined below; we should hold good. 
+ * Orabug: 22287489 + */ +int ipoib_cm_sendq_size __read_mostly = IPOIB_TX_RING_SIZE / IPOIB_CM_MAX_BAD_CONNS; +int ipoib_cm_max_bad_conns = IPOIB_CM_MAX_BAD_CONNS; module_param_named(module_unload_allowed, unload_allowed, int, 0444); MODULE_PARM_DESC(module_unload_allowed, "Allow this module to be unloaded or not (default 1 for YES)"); @@ -70,6 +81,8 @@ module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); +module_param_named(cm_max_bad_conns, ipoib_cm_max_bad_conns, int, 0444); +MODULE_PARM_DESC(cm_max_bad_conns, "Continue data transfer with other nodes upto certain no of bad connections (Default 8 indicates, data transfer will continue with 4 bad connections)"); #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG int ipoib_debug_level; @@ -2023,6 +2036,13 @@ static int __init ipoib_init_module(void) } #ifdef CONFIG_INFINIBAND_IPOIB_CM + if (ipoib_cm_max_bad_conns <= 0) { + pr_err("invalid value for cm_max_bad_conns %d, seting to default %d\n", + ipoib_cm_max_bad_conns, IPOIB_CM_MAX_BAD_CONNS); + ipoib_cm_max_bad_conns = IPOIB_CM_MAX_BAD_CONNS; + } + + ipoib_cm_sendq_size = ipoib_sendq_size / ipoib_cm_max_bad_conns; ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); #endif