From: Santosh Shilimkar Date: Wed, 21 Oct 2015 23:47:28 +0000 (-0700) Subject: RDS: IB: support larger frag size up to 16KB X-Git-Tag: v4.1.12-92~175^2~5 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=ab9a098e526ad97b34ce21539303de5363bd8945;p=users%2Fjedix%2Flinux-maple.git RDS: IB: support larger frag size up to 16KB The Infiniband (IB) transport supports larger message sizes than RDS_FRAG_SIZE, which is usually the 4KB PAGE_SIZE. Nevertheless, RDS always fragments each payload into RDS_FRAG_SIZE pieces before handing it over to the underlying IB HCA. One of the important message sizes required for databases is 8448 bytes (8K + 256B control message) for BCOPY. This RDS message, even with the IB transport, will generate three IB work requests (WR), each having its own RDS header. This series of patches improves RDS performance by allowing the IB transport to send/receive RDS messages with a larger RDS_FRAG_SIZE (ideally, using a single WR). In order to maintain backward compatibility and interoperability between various RDS versions, and at the same time to support various FRAG_SIZE values, the IB fragment size is negotiated per connection. Although IB is capable of supporting message sizes of up to 4GB, currently we limit the IB RDS_FRAG_SIZE to 16KB for two reasons: 1. This is what is needed for the current 8448-byte RDS message size use case. 2. Minimizing the size of each receive queue entry in order to optimize memory usage. In terms of implementation, the 'dp_reserved2' field of 'struct rds_ib_connect_private' (renamed 'dp_frag_sz') now carries information about the supported IB fragment size. Since we are just using the IB connection private data and a reserved field, the protocol version is not bumped up. Furthermore, the feature is enabled only for RDS_PROTOCOL_v4.1 and above (future). To keep things simpler for the user, a module parameter 'rds_ib_max_frag' is provided. Without the module parameter, the default PAGE_SIZE frag will be used. During connection establishment, the smallest fragment size will be chosen. 
If the fragment size is 0, it means the RDS module doesn't support large fragment sizes and the default RDS_FRAG_SIZE will be used. Up to ~10+ % improvement seen with Orion and ~9+ % with RDBMS update queries. Orabug: 21894138 Reviewed-by: Wei Lin Guay Signed-off-by: Santosh Shilimkar --- diff --git a/net/rds/ib.c b/net/rds/ib.c index 7880023120824..fb05fd99ebd72 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -2659,6 +2659,9 @@ int rds_ib_init(void) sock_net_set(rds_ib_inet_socket->sk, &init_net); + /* Initialise the RDS IB fragment size */ + rds_ib_init_frag(RDS_PROTOCOL_VERSION); + ret = rds_ib_fmr_init(); if (ret) goto out; diff --git a/net/rds/ib.h b/net/rds/ib.h index 231e5de50828c..0d0ea322f5759 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -89,7 +89,7 @@ struct rds_ib_connect_private { __be16 dp_protocol_minor_mask; /* bitmask */ u8 dp_tos; u8 dp_reserved1; - __be16 dp_reserved2; + __be16 dp_frag_sz; __be64 dp_ack_seq; __be32 dp_credit; /* non-zero enables flow ctl */ }; @@ -591,6 +591,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, void rds_ib_check_migration(struct rds_connection *conn, struct rdma_cm_event *event); #endif +void rds_ib_init_frag(unsigned int version); #define rds_ib_conn_error(conn, fmt...) 
\ __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt) diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 93f70d8d18434..1fb8c441876e0 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -40,6 +41,12 @@ #include "ib.h" #include "tcp.h" +static unsigned int rds_ib_max_frag = RDS_FRAG_SIZE; +static unsigned int ib_init_frag_size = RDS_FRAG_SIZE; + +module_param(rds_ib_max_frag, int, 0444); +MODULE_PARM_DESC(rds_ib_max_frag, " RDS IB maximum fragment size"); + static char *rds_ib_event_type_strings[] = { #define RDS_IB_EVENT_STRING(foo) \ [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo) @@ -120,6 +127,68 @@ rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr) printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret); } +static inline u16 rds_ib_get_frag(unsigned int version, u16 ib_frag) +{ + u16 frag = RDS_FRAG_SIZE; + + if (version < RDS_PROTOCOL_4_1) { + pr_err("RDS/IB: Protocol %x default frag %uKB\n", + version, frag / SZ_1K); + return frag; + } + + switch (ib_frag) { + case RDS_MAX_FRAG_SIZE: + frag = RDS_MAX_FRAG_SIZE; + break; + case SZ_8K: + frag = SZ_8K; + break; + default: + frag = RDS_FRAG_SIZE; + } + + return frag; +} + +/* Initialise the RDS IB frag size with host_ib_max_frag */ +void rds_ib_init_frag(unsigned int version) +{ + /* Initialise using Host module parameter */ + ib_init_frag_size = rds_ib_get_frag(version, rds_ib_max_frag); + + pr_debug("RDS/IB: fragment size initialised to %uKB\n", + ib_init_frag_size / SZ_1K); +} + +/* Update the RDS IB frag size */ +static void rds_ib_set_frag_size(struct rds_connection *conn, u16 dp_frag) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + u16 current_frag = ic->i_frag_sz; + u16 frag; + + if (ib_init_frag_size != dp_frag) { + frag = min_t(unsigned int, dp_frag, ib_init_frag_size); + ic->i_frag_sz = rds_ib_get_frag(conn->c_version, frag); + } else { + ic->i_frag_sz = ib_init_frag_size; + } 
+ + ic->i_frag_pages = ic->i_frag_sz / PAGE_SIZE; + pr_debug("RDS/IB: conn <%pI4, %pI4,%d>, Frags : {%d,%d,%d}, updated {%d -> %d}\n", + &conn->c_laddr, &conn->c_faddr, conn->c_tos, + ib_init_frag_size / SZ_1K, ic->i_frag_sz / SZ_1K, dp_frag / SZ_1K, + current_frag / SZ_1K, ic->i_frag_sz / SZ_1K); +} + +/* Init per IC frag size */ +static inline void rds_ib_init_ic_frag(struct rds_ib_connection *ic) +{ + if (ic) + ic->i_frag_sz = ib_init_frag_size; +} + /* * Connection established. * We get here for both outgoing and incoming connection. @@ -142,6 +211,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even RDS_PROTOCOL(dp->dp_protocol_major, dp->dp_protocol_minor)); rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); + rds_ib_set_frag_size(conn, be16_to_cpu(dp->dp_frag_sz)); } } @@ -162,9 +232,9 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even } } - printk(KERN_NOTICE "RDS/IB: %s conn %p i_cm_id %p connected <%pI4,%pI4,%d> version %u.%u%s\n", + printk(KERN_NOTICE "RDS/IB: %s conn %p i_cm_id %p, frag %dKB, connected <%pI4,%pI4,%d> version %u.%u%s\n", ic->i_active_side ? 
"Active " : "Passive", - conn, ic->i_cm_id, + conn, ic->i_cm_id, ic->i_frag_sz / SZ_1K, &conn->c_laddr, &conn->c_faddr, conn->c_tos, RDS_PROTOCOL_MAJOR(conn->c_version), RDS_PROTOCOL_MINOR(conn->c_version), @@ -268,7 +338,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, struct rds_ib_connect_private *dp, u32 protocol_version, u32 max_responder_resources, - u32 max_initiator_depth) + u32 max_initiator_depth, u16 frag) { struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_device *rds_ibdev = ic->rds_ibdev; @@ -302,6 +372,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); } + dp->dp_frag_sz = cpu_to_be16(frag); conn_param->private_data = dp; conn_param->private_data_len = sizeof(*dp); } @@ -813,6 +884,9 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, goto out; } + rds_ib_set_protocol(conn, version); + rds_ib_set_frag_size(conn, be16_to_cpu(dp->dp_frag_sz)); + if (dp->dp_tos && !conn->c_base_conn) { conn->c_base_conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr, @@ -898,7 +972,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, */ conn->c_connection_start = get_seconds(); - rds_ib_set_protocol(conn, version); rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); /* If the peer gave us the last packet it saw, process this as if @@ -925,7 +998,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version, event->param.conn.responder_resources, - event->param.conn.initiator_depth); + event->param.conn.initiator_depth, + ib_init_frag_size); #if RDMA_RDS_APM_SUPPORTED if (rds_ib_apm_enabled) @@ -973,6 +1047,10 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) rds_ib_set_protocol(conn, RDS_PROTOCOL_4_1); ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */ + pr_debug("RDS/IB: Initiate conn <%pI4, %pI4,%d> with Frags : {%d,%d}\n", + 
&conn->c_laddr, &conn->c_faddr, conn->c_tos, + ib_init_frag_size / SZ_1K, ic->i_frag_sz / SZ_1K); + ret = rds_ib_setup_qp(conn); if (ret) { conn->c_drop_source = 28; @@ -981,7 +1059,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) } rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, - conn->c_proposed_version, UINT_MAX, UINT_MAX); + conn->c_proposed_version, UINT_MAX, UINT_MAX, + ib_init_frag_size); ret = rdma_connect(cm_id, &conn_param); if (ret) { conn->c_drop_source = 29; @@ -1287,6 +1366,7 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); + rds_ib_init_ic_frag(ic); if (ic->i_ibinc) { rds_inc_put(&ic->i_ibinc->ii_inc); @@ -1341,6 +1421,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) */ rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr); rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr); + rds_ib_init_ic_frag(ic); ic->conn = conn; conn->c_transport_data = ic; diff --git a/net/rds/rds.h b/net/rds/rds.h index 94abf90a2ddfb..8810398e24722 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -57,6 +57,7 @@ rdsdebug(char *fmt, ...) #define RDS_FRAG_SHIFT 12 #define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) +#define RDS_MAX_FRAG_SIZE SZ_16K #define RDS_CONG_MAP_BYTES (65536 / 8) #define RDS_CONG_PAGE_SIZE (1UL << 12)