]> www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
SDP - Zero copy bcopy support
authorJim Mott <jim@mellanox.com>
Tue, 23 Oct 2007 17:59:13 +0000 (10:59 -0700)
committerMukesh Kacker <mukesh.kacker@oracle.com>
Tue, 6 Oct 2015 12:04:10 +0000 (05:04 -0700)
This patch adds zero copy send support to SDP.  Below 2K transfer size,
it is better to bcopy.  With larger transfers, this is a net win on
bandwidth.  Latency testing is yet to be done.

                     BCOPY        BZCOPY
   1K  TCP_STREAM  3555 Mb/sec  2250 Mb/sec
   2K  TCP_STREAM  3650 Mb/sec  3785 Mb/sec
   4K  TCP_STREAM  3560 Mb/sec  6220 Mb/sec
   8K  TCP_STREAM  3555 Mb/sec  6190 Mb/sec
  16K  TCP_STREAM  5100 Mb/sec  6155 Mb/sec
   1M  TCP_STREAM  4630 Mb/sec  6210 Mb/sec

Performance work still remains.  Open issues include correct setsockopt
defines (use previous SDP values?), code cleanup, performance tuning,
rigorous regression testing, and multi-OS build+test.  Simple testing to
date includes netperf and iperf, ^C recovery, unload/load, and checking
for gross memory leaks on Rhat4u4.

Signed-off-by: Jim Mott <jim@mellanox.com>
drivers/infiniband/ulp/sdp/sdp.h
drivers/infiniband/ulp/sdp/sdp_main.c
drivers/infiniband/ulp/sdp/sdp_socket.h

index d98ee3369581f4ea955a0bb6a90fbb73b828357a..d0f080fba28d4c5c15e51da6a7ecc2abf5af925d 100644 (file)
@@ -51,6 +51,9 @@ extern int sdp_data_debug_level;
 #define SDP_HEAD_SIZE (PAGE_SIZE / 2 + sizeof(struct sdp_bsdh))
 #define SDP_NUM_WC 4
 
+#define SDP_MIN_ZCOPY_THRESH    1024
+#define SDP_MAX_ZCOPY_THRESH 1048576
+
 #define SDP_OP_RECV 0x800000000LL
 #define SDP_OP_SEND 0x400000000LL
 
@@ -72,6 +75,13 @@ enum {
        SDP_MIN_BUFS = 2
 };
 
+enum {
+       SDP_ERR_ERROR   = -4,
+       SDP_ERR_FAULT   = -3,
+       SDP_NEW_SEG     = -2,
+       SDP_DO_WAIT_MEM = -1
+};
+
 struct rdma_cm_id;
 struct rdma_cm_event;
 
@@ -158,6 +168,9 @@ struct sdp_sock {
        int recv_frags;         /* max skb frags in recv packets */
        int send_frags;         /* max skb frags in send packets */
 
+       /* ZCOPY data */
+       int zcopy_thresh;
+
        struct ib_sge ibsge[SDP_MAX_SEND_SKB_FRAGS + 1];
        struct ib_wc  ibwc[SDP_NUM_WC];
 };
index 8ef98b2a56c14eaeac5673c395c5fa74e0176ab4..75e9c7ebd17f24813fd498febcf9dc12337685f8 100644 (file)
@@ -65,6 +65,16 @@ unsigned int csum_partial_copy_from_user_new (const char *src, char *dst,
 #include "sdp.h"
 #include <linux/delay.h>
 
+struct bzcopy_state {
+       unsigned char __user  *u_base;
+       int                    u_len;
+       int                    left;
+       int                    page_cnt;
+        int                    cur_page;
+        int                    cur_offset;
+       struct page          **pages;
+};
+
 MODULE_AUTHOR("Michael S. Tsirkin");
 MODULE_DESCRIPTION("InfiniBand SDP module");
 MODULE_LICENSE("Dual BSD/GPL");
@@ -122,6 +132,10 @@ static unsigned int sdp_keepalive_time = SDP_KEEPALIVE_TIME;
 module_param_named(sdp_keepalive_time, sdp_keepalive_time, uint, 0644);
 MODULE_PARM_DESC(sdp_keepalive_time, "Default idle time in seconds before keepalive probe sent.");
 
+static int sdp_zcopy_thresh = 0;
+module_param_named(sdp_zcopy_thresh, sdp_zcopy_thresh, int, 0644);
+MODULE_PARM_DESC(sdp_zcopy_thresh, "Zero copy send threshold; 0=0ff.");
+
 struct workqueue_struct *sdp_workqueue;
 
 static struct list_head sock_list;
@@ -990,6 +1004,12 @@ static int sdp_setsockopt(struct sock *sk, int level, int optname,
                                sdp_reset_keepalive_timer(sk, ssk->keepalive_time);
                }
                break;
+       case SDP_ZCOPY_THRESH:
+               if (val < SDP_MIN_ZCOPY_THRESH || val > SDP_MAX_ZCOPY_THRESH)
+                       err = -EINVAL;
+               else
+                       ssk->zcopy_thresh = val;
+               break;
        default:
                err = -ENOPROTOOPT;
                break;
@@ -1031,6 +1051,9 @@ static int sdp_getsockopt(struct sock *sk, int level, int optname,
        case TCP_KEEPIDLE:
                val = (ssk->keepalive_time ? : sdp_keepalive_time) / HZ;
                break;
+       case SDP_ZCOPY_THRESH:
+               val = ssk->zcopy_thresh ? ssk->zcopy_thresh : sdp_zcopy_thresh;
+               break;
        default:
                return -ENOPROTOOPT;
        }
@@ -1178,10 +1201,248 @@ void sdp_push_one(struct sock *sk, unsigned int mss_now)
 {
 }
 
-/* Like tcp_sendmsg */
-/* TODO: check locking */
+static struct bzcopy_state *sdp_bz_cleanup(struct bzcopy_state *bz)
+{
+       int i;
+
+       if (bz->pages) {
+               for (i = bz->cur_page; i < bz->page_cnt; i++)
+                       put_page(bz->pages[i]);
+
+               kfree(bz->pages);
+       }
+
+       kfree(bz);
+
+       return NULL;
+}
+
+
+static struct bzcopy_state *sdp_bz_setup(struct sdp_sock *ssk,
+                                        unsigned char __user *base,
+                                        int len,
+                                        int size_goal)
+{
+       struct bzcopy_state *bz;
+       unsigned long addr;
+       int done_pages;
+       int thresh;
+
+       thresh = ssk->zcopy_thresh ? : sdp_zcopy_thresh;
+       if (thresh == 0 || len < thresh)
+               return NULL;
+
+       if (!can_do_mlock())
+               return NULL;
+
+       bz = kzalloc(sizeof(*bz), GFP_KERNEL);
+       if (!bz)
+               return NULL;
+
+       /*
+        *   Since we use the TCP segmentation fields of the skb to map user
+        * pages, we must make sure that everything we send in a single chunk
+        * fits into the frags array in the skb.
+        */
+       size_goal = size_goal / PAGE_SIZE + 1;
+       if (size_goal >= MAX_SKB_FRAGS)
+               return NULL;
+
+       addr = (unsigned long)base;
+
+       bz->u_base     = base;
+       bz->u_len      = len;
+       bz->left       = len;
+       bz->cur_offset = addr & ~PAGE_MASK;
+       bz->page_cnt   = PAGE_ALIGN(len + bz->cur_offset) >> PAGE_SHIFT;
+       bz->pages      = kcalloc(bz->page_cnt, sizeof(struct page *), GFP_KERNEL);
+
+       if (!bz->pages)
+               goto out_1;
+
+       down_write(&current->mm->mmap_sem);
+
+       if (!capable(CAP_IPC_LOCK))
+               goto out_2;
+
+       addr &= PAGE_MASK;
+
+       done_pages = get_user_pages(current, current->mm, addr, bz->page_cnt,
+                                   0, 0, bz->pages, NULL);
+       if (unlikely(done_pages != bz->page_cnt)){
+               bz->page_cnt = done_pages;
+               goto out_2;
+       }
+
+       up_write(&current->mm->mmap_sem);
+
+       return bz;
+
+out_2:
+       up_write(&current->mm->mmap_sem);
+out_1:
+       sdp_bz_cleanup(bz);
+
+       return NULL;
+}
+
+
 #define TCP_PAGE(sk)   (sk->sk_sndmsg_page)
 #define TCP_OFF(sk)    (sk->sk_sndmsg_off)
+static inline int sdp_bcopy_get(struct sock *sk, struct sk_buff *skb,
+                               unsigned char __user *from, int copy)
+{
+       int err;
+       struct sdp_sock *ssk = sdp_sk(sk);
+
+       /* Where to copy to? */
+       if (skb_tailroom(skb) > 0) {
+               /* We have some space in skb head. Superb! */
+               if (copy > skb_tailroom(skb))
+                       copy = skb_tailroom(skb);
+               if ((err = skb_add_data(skb, from, copy)) != 0)
+                       return SDP_ERR_FAULT;
+       } else {
+               int merge = 0;
+               int i = skb_shinfo(skb)->nr_frags;
+               struct page *page = TCP_PAGE(sk);
+               int off = TCP_OFF(sk);
+
+               if (skb_can_coalesce(skb, i, page, off) &&
+                   off != PAGE_SIZE) {
+                       /* We can extend the last page
+                        * fragment. */
+                       merge = 1;
+               } else if (i == ssk->send_frags ||
+                          (!i &&
+                          !(sk->sk_route_caps & NETIF_F_SG))) {
+                       /* Need to add new fragment and cannot
+                        * do this because interface is non-SG,
+                        * or because all the page slots are
+                        * busy. */
+                       sdp_mark_push(ssk, skb);
+                       return SDP_NEW_SEG;
+               } else if (page) {
+                       if (off == PAGE_SIZE) {
+                               put_page(page);
+                               TCP_PAGE(sk) = page = NULL;
+                               off = 0;
+                       }
+               } else
+                       off = 0;
+
+               if (copy > PAGE_SIZE - off)
+                       copy = PAGE_SIZE - off;
+
+               if (!sk_stream_wmem_schedule(sk, copy))
+                       return SDP_DO_WAIT_MEM;
+
+               if (!page) {
+                       /* Allocate new cache page. */
+                       if (!(page = sk_stream_alloc_page(sk)))
+                               return SDP_DO_WAIT_MEM;
+               }
+
+               /* Time to copy data. We are close to
+                * the end! */
+               err = skb_copy_to_page(sk, from, skb, page,
+                                      off, copy);
+               if (err) {
+                       /* If this page was new, give it to the
+                        * socket so it does not get leaked.
+                        */
+                       if (!TCP_PAGE(sk)) {
+                               TCP_PAGE(sk) = page;
+                               TCP_OFF(sk) = 0;
+                       }
+                       return SDP_ERR_ERROR;
+               }
+
+               /* Update the skb. */
+               if (merge) {
+                       skb_shinfo(skb)->frags[i - 1].size +=
+                                                       copy;
+               } else {
+                       skb_fill_page_desc(skb, i, page, off, copy);
+                       if (TCP_PAGE(sk)) {
+                               get_page(page);
+                       } else if (off + copy < PAGE_SIZE) {
+                               get_page(page);
+                               TCP_PAGE(sk) = page;
+                       }
+               }
+
+               TCP_OFF(sk) = off + copy;
+       }
+
+       return copy;
+}
+
+
+static inline int sdp_bzcopy_get(struct sock *sk, struct sk_buff *skb,
+                                unsigned char __user *from, int copy,
+                                struct bzcopy_state *bz)
+{
+       int this_page, left;
+       struct sdp_sock *ssk = sdp_sk(sk);
+
+       if (skb_shinfo(skb)->nr_frags == ssk->send_frags) {
+               sdp_mark_push(ssk, skb);
+               return SDP_NEW_SEG;
+       }
+
+       left = copy;
+       BUG_ON(left > bz->left);
+
+       while (left) {
+               if (skb_shinfo(skb)->nr_frags == ssk->send_frags) {
+                       copy = copy - left;
+                       break;
+               }
+
+               this_page = PAGE_SIZE - bz->cur_offset;
+
+               if (left <= this_page)
+                       this_page = left;
+
+               if (!sk_stream_wmem_schedule(sk, copy))
+                       return SDP_DO_WAIT_MEM;
+
+               skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
+                                  bz->pages[bz->cur_page], bz->cur_offset,
+                                  this_page);
+
+               BUG_ON(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS);
+
+               bz->cur_offset += this_page;
+               if (bz->cur_offset == PAGE_SIZE) {
+                       bz->cur_offset = 0;
+                       bz->cur_page++;
+
+                       BUG_ON(bz->cur_page > bz->page_cnt);
+               } else {
+                       BUG_ON(bz->cur_offset > PAGE_SIZE);
+
+                       if (bz->cur_page != bz->page_cnt || left != this_page)
+                               get_page(bz->pages[bz->cur_page]);
+               }
+
+               left -= this_page;
+
+               skb->len             += this_page;
+               skb->data_len         = skb->len;
+               skb->truesize        += this_page;
+               sk->sk_wmem_queued   += this_page;
+               sk->sk_forward_alloc -= this_page;
+       }
+
+       bz->left -= copy;
+       return copy;
+}
+
+
+/* Like tcp_sendmsg */
+/* TODO: check locking */
 int sdp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                size_t size)
 {
@@ -1192,6 +1453,7 @@ int sdp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
        int mss_now, size_goal;
        int err, copied;
        long timeo;
+       struct bzcopy_state *bz = NULL;
 
        lock_sock(sk);
        sdp_dbg_data(sk, "%s\n", __func__);
@@ -1225,6 +1487,8 @@ int sdp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
                iov++;
 
+               bz = sdp_bz_setup(ssk, from, seglen, size_goal);
+
                while (seglen > 0) {
                        int copy;
 
@@ -1268,84 +1532,17 @@ new_segment:
                                sdp_mark_push(ssk, skb);
                                goto new_segment;
                        }
-                       /* Where to copy to? */
-                       if (skb_tailroom(skb) > 0) {
-                               /* We have some space in skb head. Superb! */
-                               if (copy > skb_tailroom(skb))
-                                       copy = skb_tailroom(skb);
-                               if ((err = skb_add_data(skb, from, copy)) != 0)
-                                       goto do_fault;
-                       } else {
-                               int merge = 0;
-                               int i = skb_shinfo(skb)->nr_frags;
-                               struct page *page = TCP_PAGE(sk);
-                               int off = TCP_OFF(sk);
-
-                               if (skb_can_coalesce(skb, i, page, off) &&
-                                   off != PAGE_SIZE) {
-                                       /* We can extend the last page
-                                        * fragment. */
-                                       merge = 1;
-                               } else if (i == ssk->send_frags ||
-                                          (!i &&
-                                          !(sk->sk_route_caps & NETIF_F_SG))) {
-                                       /* Need to add new fragment and cannot
-                                        * do this because interface is non-SG,
-                                        * or because all the page slots are
-                                        * busy. */
-                                       sdp_mark_push(ssk, skb);
-                                       goto new_segment;
-                               } else if (page) {
-                                       if (off == PAGE_SIZE) {
-                                               put_page(page);
-                                               TCP_PAGE(sk) = page = NULL;
-                                               off = 0;
-                                       }
-                               } else
-                                       off = 0;
 
-                               if (copy > PAGE_SIZE - off)
-                                       copy = PAGE_SIZE - off;
-
-                               if (!sk_stream_wmem_schedule(sk, copy))
+                       copy = (bz) ? sdp_bzcopy_get(sk, skb, from, copy, bz) :
+                                     sdp_bcopy_get(sk, skb, from, copy);
+                       if (unlikely(copy < 0)) {
+                               if (!++copy)
                                        goto wait_for_memory;
-
-                               if (!page) {
-                                       /* Allocate new cache page. */
-                                       if (!(page = sk_stream_alloc_page(sk)))
-                                               goto wait_for_memory;
-                               }
-
-                               /* Time to copy data. We are close to
-                                * the end! */
-                               err = skb_copy_to_page(sk, from, skb, page,
-                                                      off, copy);
-                               if (err) {
-                                       /* If this page was new, give it to the
-                                        * socket so it does not get leaked.
-                                        */
-                                       if (!TCP_PAGE(sk)) {
-                                               TCP_PAGE(sk) = page;
-                                               TCP_OFF(sk) = 0;
-                                       }
-                                       goto do_error;
-                               }
-
-                               /* Update the skb. */
-                               if (merge) {
-                                       skb_shinfo(skb)->frags[i - 1].size +=
-                                                                       copy;
-                               } else {
-                                       skb_fill_page_desc(skb, i, page, off, copy);
-                                       if (TCP_PAGE(sk)) {
-                                               get_page(page);
-                                       } else if (off + copy < PAGE_SIZE) {
-                                               get_page(page);
-                                               TCP_PAGE(sk) = page;
-                                       }
-                               }
-
-                               TCP_OFF(sk) = off + copy;
+                               if (!++copy)
+                                       goto new_segment;
+                               if (!++copy)
+                                       goto do_fault;
+                               goto do_error;
                        }
 
                        if (!copied)
@@ -1386,6 +1583,8 @@ wait_for_memory:
        }
 
 out:
+       if (bz)
+               bz = sdp_bz_cleanup(bz);
        if (copied)
                sdp_push(sk, ssk, flags, mss_now, ssk->nonagle);
        if (size > send_poll_thresh)
@@ -1405,6 +1604,8 @@ do_error:
        if (copied)
                goto out;
 out_err:
+       if (bz)
+               bz = sdp_bz_cleanup(bz);
        err = sk_stream_error(sk, flags, err);
        release_sock(sk);
        return err;
index e3563b7cc10d2bd27373f9daf3e929cc88dc5a86..91dbf191eddaaffdca26b03d28096b3b1ca4ca73 100644 (file)
@@ -8,6 +8,10 @@
 #define PF_INET_SDP AF_INET_SDP
 #endif
 
+#ifndef SDP_ZCOPY_THRESH
+#define SDP_ZCOPY_THRESH 80
+#endif
+
 /* TODO: AF_INET6_SDP ? */
 
 #endif