]> www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
RDS: IB: support larger frag size up to 16KB
authorSantosh Shilimkar <santosh.shilimkar@oracle.com>
Wed, 21 Oct 2015 23:47:28 +0000 (16:47 -0700)
committerChuck Anderson <chuck.anderson@oracle.com>
Thu, 14 Apr 2016 00:58:55 +0000 (17:58 -0700)
Infiniband (IB) transport supports larger message sizes
than RDS_FRAG_SIZE, which is usually the 4KB PAGE_SIZE.
Nevertheless, RDS always fragments each payload into
RDS_FRAG_SIZE pieces before handing it over to the underlying
IB HCA.

One of the important message sizes required for the database
is 8448 (8K + 256B control message) for BCOPY. This RDS
message, even with IB transport, will generate three
IB work requests (WR), each having its own RDS header.
This series of patches improves RDS performance by allowing
IB transport to send/receive RDS messages with a larger
RDS_FRAG_SIZE (ideally, using a single WR).

In order to maintain the backward compatibility and
interoperability between various RDS versions, and at
the same time to support various FRAG_SIZE, the IB
fragment size is negotiated per connection.
Although IB is capable of supporting 4GB of message size,
currently we limit the IB RDS_FRAG_SIZE up to 16KB due to
two reasons:-
 1. This is needed for current 8448 RDS message size usecase.
 2. Minimizing the size of each receive queue entry in order
    to optimize memory usage.

In terms of implementation, the 'dp_reserved2' field of
'struct rds_ib_connect_private' now carries information about
supported IB fragment size. Since we are just
using the IB connection private data and a reserved field,
the protocol version is not bumped up. Furthermore, the feature
is enabled only for RDS_PROTOCOL_v4.1 and above (future).

To keep things simpler for the user, a module parameter
'rds_ib_max_frag' is provided. Without the module parameter,
the default PAGE_SIZE frag will be used. During the connection
establishment, the smallest fragment size will be
chosen. If the fragment size is 0, it means RDS module
doesn't support large fragment size and the default
RDS_FRAG_SIZE will be used.

Up to ~10+ % improvement seen with Orion and ~9+ % with RDBMS
update queries.

Orabug: 21894138

Reviewed-by: Wei Lin Guay <wei.lin.guay@oracle.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
net/rds/ib.c
net/rds/ib.h
net/rds/ib_cm.c
net/rds/rds.h

index 78800231208243597ca2acd9d4055eeaead21b50..fb05fd99ebd72587102a58cbb6902966a16b3ed5 100644 (file)
@@ -2659,6 +2659,9 @@ int rds_ib_init(void)
 
        sock_net_set(rds_ib_inet_socket->sk, &init_net);
 
+       /* Initialise the RDS IB fragment size */
+       rds_ib_init_frag(RDS_PROTOCOL_VERSION);
+
        ret = rds_ib_fmr_init();
        if (ret)
                goto out;
index 231e5de50828ce17373fa44b64f4771bd4c0e0db..0d0ea322f57596cb0153af3baa56b7912522dc09 100644 (file)
@@ -89,7 +89,7 @@ struct rds_ib_connect_private {
        __be16                  dp_protocol_minor_mask; /* bitmask */
        u8                      dp_tos;
        u8                      dp_reserved1;
-       __be16                  dp_reserved2;
+       __be16                  dp_frag_sz;
        __be64                  dp_ack_seq;
        __be32                  dp_credit;              /* non-zero enables flow ctl */
 };
@@ -591,6 +591,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn,
 void rds_ib_check_migration(struct rds_connection *conn,
                                struct rdma_cm_event *event);
 #endif
+void rds_ib_init_frag(unsigned int version);
 
 #define rds_ib_conn_error(conn, fmt...) \
        __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
index 93f70d8d184341d4cd26da20d0043fc6611321c6..1fb8c441876e079c9d87a42daf64b747c78e14d9 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/kernel.h>
 #include <linux/in.h>
 #include <linux/vmalloc.h>
+#include <asm-generic/sizes.h>
 #include <rdma/rdma_cm_ib.h>
 #include <rdma/ib_cache.h>
 
 #include "ib.h"
 #include "tcp.h"
 
+/*
+ * rds_ib_max_frag is the read-only (0444) module parameter giving the
+ * largest IB fragment size this host asks for; it defaults to
+ * RDS_FRAG_SIZE.  ib_init_frag_size holds the value actually used to
+ * initialise and advertise connections, and is set from rds_ib_max_frag
+ * by rds_ib_init_frag().
+ */
+static unsigned int rds_ib_max_frag = RDS_FRAG_SIZE;
+static unsigned int ib_init_frag_size = RDS_FRAG_SIZE;
+
+/* The variable is unsigned int, so register it as "uint", not "int". */
+module_param(rds_ib_max_frag, uint, 0444);
+MODULE_PARM_DESC(rds_ib_max_frag, " RDS IB maximum fragment size");
+
 static char *rds_ib_event_type_strings[] = {
 #define RDS_IB_EVENT_STRING(foo) \
                [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo)
@@ -120,6 +127,68 @@ rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr)
                printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret);
 }
 
+/*
+ * Map a requested fragment size onto one this code accepts.
+ *
+ * @version: RDS protocol version in effect
+ * @ib_frag: requested fragment size in bytes
+ *
+ * Versions below RDS_PROTOCOL_4_1 always get RDS_FRAG_SIZE.  Otherwise
+ * only RDS_MAX_FRAG_SIZE and SZ_8K are returned as-is; every other value
+ * falls back to RDS_FRAG_SIZE.
+ *
+ * @ib_frag is taken as unsigned int rather than u16 so that an oversized
+ * rds_ib_max_frag module parameter is not truncated modulo 64K into one
+ * of the accepted sizes before it is checked.
+ */
+static inline u16 rds_ib_get_frag(unsigned int version, unsigned int ib_frag)
+{
+       u16 frag = RDS_FRAG_SIZE;
+
+       if (version < RDS_PROTOCOL_4_1) {
+               pr_err("RDS/IB: Protocol %x default frag %uKB\n",
+                        version, frag / SZ_1K);
+               return frag;
+       }
+
+       switch (ib_frag) {
+       case RDS_MAX_FRAG_SIZE:
+       case SZ_8K:
+               /* Both accepted sizes fit in u16, so this cannot truncate. */
+               frag = ib_frag;
+               break;
+       default:
+               frag = RDS_FRAG_SIZE;
+               break;
+       }
+
+       return frag;
+}
+
+/*
+ * Set the host-wide initial IB fragment size from the rds_ib_max_frag
+ * module parameter, as filtered by rds_ib_get_frag() for @version.
+ */
+void rds_ib_init_frag(unsigned int version)
+{
+       unsigned int init_frag = rds_ib_get_frag(version, rds_ib_max_frag);
+
+       ib_init_frag_size = init_frag;
+
+       pr_debug("RDS/IB: fragment size initialised to %uKB\n",
+                init_frag / SZ_1K);
+}
+
+/*
+ * Settle the fragment size for one connection.
+ *
+ * @conn:    connection whose IB transport data is updated
+ * @dp_frag: fragment size taken from the peer's connect private data
+ *
+ * If the peer's value equals ib_init_frag_size it is used directly.
+ * Otherwise the smaller of the two is passed through rds_ib_get_frag()
+ * together with the connection's protocol version.  i_frag_pages is then
+ * recomputed from the result.
+ */
+static void rds_ib_set_frag_size(struct rds_connection *conn, u16 dp_frag)
+{
+       struct rds_ib_connection *ic = conn->c_transport_data;
+       u16 prev_frag = ic->i_frag_sz;
+
+       if (ib_init_frag_size == dp_frag) {
+               ic->i_frag_sz = ib_init_frag_size;
+       } else {
+               u16 smaller = min_t(unsigned int, dp_frag, ib_init_frag_size);
+
+               ic->i_frag_sz = rds_ib_get_frag(conn->c_version, smaller);
+       }
+
+       ic->i_frag_pages = ic->i_frag_sz / PAGE_SIZE;
+
+       pr_debug("RDS/IB: conn <%pI4, %pI4,%d>, Frags <init,ic,dp>: {%d,%d,%d}, updated {%d -> %d}\n",
+                &conn->c_laddr, &conn->c_faddr, conn->c_tos,
+                ib_init_frag_size / SZ_1K, ic->i_frag_sz / SZ_1K,
+                dp_frag / SZ_1K,
+                prev_frag / SZ_1K, ic->i_frag_sz / SZ_1K);
+}
+
+/*
+ * Reset a connection's fragment size to the host-wide initial value.
+ * A NULL @ic is ignored.
+ */
+static inline void rds_ib_init_ic_frag(struct rds_ib_connection *ic)
+{
+       if (!ic)
+               return;
+
+       ic->i_frag_sz = ib_init_frag_size;
+}
+
 /*
  * Connection established.
  * We get here for both outgoing and incoming connection.
@@ -142,6 +211,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
                                RDS_PROTOCOL(dp->dp_protocol_major,
                                dp->dp_protocol_minor));
                        rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+                       rds_ib_set_frag_size(conn, be16_to_cpu(dp->dp_frag_sz));
                }
        }
 
@@ -162,9 +232,9 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
                }
        }
 
-       printk(KERN_NOTICE "RDS/IB: %s conn %p i_cm_id %p connected <%pI4,%pI4,%d> version %u.%u%s\n",
+       printk(KERN_NOTICE "RDS/IB: %s conn %p i_cm_id %p, frag %dKB, connected <%pI4,%pI4,%d> version %u.%u%s\n",
               ic->i_active_side ? "Active " : "Passive",
-              conn, ic->i_cm_id,
+              conn, ic->i_cm_id, ic->i_frag_sz / SZ_1K,
               &conn->c_laddr, &conn->c_faddr, conn->c_tos,
               RDS_PROTOCOL_MAJOR(conn->c_version),
               RDS_PROTOCOL_MINOR(conn->c_version),
@@ -268,7 +338,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
                        struct rds_ib_connect_private *dp,
                        u32 protocol_version,
                        u32 max_responder_resources,
-                       u32 max_initiator_depth)
+                       u32 max_initiator_depth, u16 frag)
 {
        struct rds_ib_connection *ic = conn->c_transport_data;
        struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
@@ -302,6 +372,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
                        atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
                }
 
+               dp->dp_frag_sz = cpu_to_be16(frag);
                conn_param->private_data = dp;
                conn_param->private_data_len = sizeof(*dp);
        }
@@ -813,6 +884,9 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
                goto out;
        }
 
+       rds_ib_set_protocol(conn, version);
+       rds_ib_set_frag_size(conn, be16_to_cpu(dp->dp_frag_sz));
+
        if (dp->dp_tos && !conn->c_base_conn) {
                conn->c_base_conn = rds_conn_create(&init_net,
                                        dp->dp_daddr, dp->dp_saddr,
@@ -898,7 +972,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
         */
        conn->c_connection_start = get_seconds();
 
-       rds_ib_set_protocol(conn, version);
        rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
 
        /* If the peer gave us the last packet it saw, process this as if
@@ -925,7 +998,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 
        rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
                event->param.conn.responder_resources,
-               event->param.conn.initiator_depth);
+               event->param.conn.initiator_depth,
+               ib_init_frag_size);
 
 #if RDMA_RDS_APM_SUPPORTED
        if (rds_ib_apm_enabled)
@@ -973,6 +1047,10 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
        rds_ib_set_protocol(conn, RDS_PROTOCOL_4_1);
        ic->i_flowctl = rds_ib_sysctl_flow_control;     /* advertise flow control */
 
+       pr_debug("RDS/IB: Initiate conn <%pI4, %pI4,%d> with Frags <init,ic>: {%d,%d}\n",
+                &conn->c_laddr, &conn->c_faddr, conn->c_tos,
+                ib_init_frag_size / SZ_1K, ic->i_frag_sz / SZ_1K);
+
        ret = rds_ib_setup_qp(conn);
        if (ret) {
                conn->c_drop_source = 28;
@@ -981,7 +1059,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
        }
 
        rds_ib_cm_fill_conn_param(conn, &conn_param, &dp,
-                               conn->c_proposed_version, UINT_MAX, UINT_MAX);
+                               conn->c_proposed_version, UINT_MAX, UINT_MAX,
+                               ib_init_frag_size);
        ret = rdma_connect(cm_id, &conn_param);
        if (ret) {
                conn->c_drop_source = 29;
@@ -1287,6 +1366,7 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
 
        rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
        rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
+       rds_ib_init_ic_frag(ic);
 
        if (ic->i_ibinc) {
                rds_inc_put(&ic->i_ibinc->ii_inc);
@@ -1341,6 +1421,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
         */
        rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
        rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
+       rds_ib_init_ic_frag(ic);
 
        ic->conn = conn;
        conn->c_transport_data = ic;
index 94abf90a2ddfb15417dbc2dba27213e7e12db3df..8810398e2472242266809fcf53e0cc61b8ec3240 100644 (file)
@@ -57,6 +57,7 @@ rdsdebug(char *fmt, ...)
 
 #define RDS_FRAG_SHIFT 12
 #define RDS_FRAG_SIZE  ((unsigned int)(1 << RDS_FRAG_SHIFT))
+#define RDS_MAX_FRAG_SIZE      SZ_16K
 
 #define RDS_CONG_MAP_BYTES     (65536 / 8)
 #define RDS_CONG_PAGE_SIZE     (1UL << 12)