net/rds: reduce memory footprint during ib_post_recv in IB transport
author      Wei Lin Guay <wei.lin.guay@oracle.com>
            Thu, 6 Jul 2017 08:12:48 +0000 (10:12 +0200)
committer   Dhaval Giani <dhaval.giani@oracle.com>
            Wed, 15 Nov 2017 06:18:11 +0000 (01:18 -0500)
The RDS IB large fragment size feature requires order-2 memory
allocations, which puts pressure on the page allocator. Thus, this
patch implements large fragment size support in ib_post_recv() with N
SGEs. Today, RDS assumes that each IB receive work request carries
only two SGEs. This patch removes that assumption and uses a variable
number of SGEs, one page per SGE, to support large fragment sizes.
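
As an illustration (not part of the patch), here is a standalone
sketch of the SGE layout a receive work request ends up with. It
assumes 4 KB pages and the maximum 16 KB (order-2) fragment; the
kernel code derives the same numbers from PAGE_SIZE, PAGE_ALIGN() and
ic->i_frag_pages:

        /* sketch: one header SGE plus one order-0 page per data SGE */
        #include <stdio.h>

        #define PAGE_SZ     4096UL
        #define MAX_FRAG_SZ (16 * 1024UL)  /* was a single order-2 buffer */
        #define NUM_RECV_SG (MAX_FRAG_SZ / PAGE_SZ)

        int main(void)
        {
                unsigned long i;

                printf("sge[0]: rds_header\n");  /* always posted first */
                for (i = 0; i < NUM_RECV_SG; i++)
                        printf("sge[%lu]: data page %lu (%lu bytes)\n",
                               i + 1, i, PAGE_SZ);
                printf("num_sge = %lu (header + %lu pages)\n",
                       NUM_RECV_SG + 1, NUM_RECV_SG);
                return 0;
        }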

Orabug: 26770234

Signed-off-by: Wei Lin Guay <wei.lin.guay@oracle.com>
Reviewed-by: Håkon Bugge <haakon.bugge@oracle.com>
Tested-by: Shih-Yu Huang <shih-yu.huang@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: Dhaval Giani <dhaval.giani@oracle.com>
net/rds/ib.c
net/rds/ib.h
net/rds/ib_cm.c
net/rds/ib_recv.c

diff --git a/net/rds/ib.c b/net/rds/ib.c
index a71c7e8caee0469b65c675a5a0981abdc153d237..d32f1c9fbf6d60f0f6544354ea6b9fa2ea1809a6 100644
@@ -2836,11 +2836,13 @@ int rds_ib_inc_to_skb(struct rds_incoming *inc, struct sk_buff *skb)
        int i;
        struct rds_ib_incoming *ibinc;
        struct rds_page_frag *ibfrag;
+       struct scatterlist *sg;
 
        /* pull out initial pointers */
        ibinc  = container_of(inc, struct rds_ib_incoming, ii_inc);
        ibfrag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
        len    = be32_to_cpu(inc->i_hdr.h_len);
+       sg     = ibfrag->f_sg;
        slen   = len;
        i      = 0;
 
@@ -2848,11 +2850,10 @@ int rds_ib_inc_to_skb(struct rds_incoming *inc, struct sk_buff *skb)
        while (NULL != ibfrag && slen > 0) {
                /* one to one mapping of frags to sg structures */
                frag = &skb_shinfo(skb)->frags[i];
-
                /* save off all the sg pieces to the skb frags we are creating */
-               frag->size        = ibfrag->f_sg.length;
-               frag->page_offset = ibfrag->f_sg.offset;
-               frag->page.p      = sg_page(&ibfrag->f_sg);
+               frag->size        = sg->length;
+               frag->page_offset = sg->offset;
+               frag->page.p      = sg_page(sg);
 
                /* AA:  do we need to bump up the page reference */
                /* get_page(frag->page); */
@@ -2860,8 +2861,12 @@ int rds_ib_inc_to_skb(struct rds_incoming *inc, struct sk_buff *skb)
                /* dec the amount of data we are consuming */
                slen -= frag->size;
 
-               /* bump to the next entry */
-               ibfrag = list_entry(ibfrag->f_item.next, struct rds_page_frag, f_item);
+               sg  = sg_next(sg);
+               if (!sg) {
+                       /* bump to the next entry */
+                       ibfrag = list_entry(ibfrag->f_item.next, struct rds_page_frag, f_item);
+                       sg = ibfrag->f_sg;
+               }
                i++;
 
                /* for now we will only have a single chain of fragments in the skb */
diff --git a/net/rds/ib.h b/net/rds/ib.h
index 9454a6dfb1b2f2720b04fd67bf92a774b4c496f0..47fd839f865c6eea51267a450990a5680865c84c 100644
@@ -50,6 +50,8 @@
 
 #define RDS_WC_MAX 32
 
+#define NUM_RDS_RECV_SG        (PAGE_ALIGN(RDS_MAX_FRAG_SIZE) / PAGE_SIZE)
+
 #define        RDS_IB_CLEAN_CACHE      1
 
 #define RDS_IB_DEFAULT_FREG_PORT_NUM   1
@@ -65,7 +67,7 @@ extern struct list_head rds_ib_devices;
 struct rds_page_frag {
        struct list_head        f_item;
        struct list_head        f_cache_entry;
-       struct scatterlist      f_sg;
+       struct scatterlist      f_sg[NUM_RDS_RECV_SG];
 };
 
 struct rds_ib_incoming {
@@ -110,7 +112,7 @@ struct rds_ib_recv_work {
        struct rds_ib_incoming  *r_ibinc;
        struct rds_page_frag    *r_frag;
        struct ib_recv_wr       r_wr;
-       struct ib_sge           r_sge[2];
+       struct ib_sge           r_sge[RDS_IB_MAX_SGE];
        struct rds_ib_connection        *r_ic;
        int                             r_posted;
 };
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 85fa2a978535841ceb52e49111ba9da9d43d07fe..cd4bbf9a78025454c9a66f219de9ed1809771169 100644
@@ -754,7 +754,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
        attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1 + mr_reg;
        attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
        attr.cap.max_send_sge = rds_ibdev->max_sge;
-       attr.cap.max_recv_sge = RDS_IB_RECV_SGE;
+       attr.cap.max_recv_sge = rds_ibdev->max_sge;
        attr.sq_sig_type = IB_SIGNAL_REQ_WR;
        attr.qp_type = IB_QPT_RC;
        attr.send_cq = ic->i_scq;
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 03cc2119a3009a93aedb55b2409e5d56f3c6ffcd..4d061cabdd729240be56937880f2b0a7052ba86d 100644
@@ -62,7 +62,9 @@ static unsigned long rds_ib_allocation_warn = 1;
 void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
 {
        struct rds_ib_recv_work *recv;
-       u32 i;
+       u32 i, j;
+       /* one SGE for the RDS header, plus one per fragment page */
+       u32 num_recv_sge = ic->i_frag_pages + 1;
 
        for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
                struct ib_sge *sge;
@@ -73,17 +75,19 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
                recv->r_wr.next = NULL;
                recv->r_wr.wr_id = i;
                recv->r_wr.sg_list = recv->r_sge;
-               recv->r_wr.num_sge = RDS_IB_RECV_SGE;
+               recv->r_wr.num_sge = num_recv_sge;
 
-               sge = &recv->r_sge[0];
+               sge = recv->r_sge;
                sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
                sge->length = sizeof(struct rds_header);
                sge->lkey = ic->i_mr->lkey;
 
-               sge = &recv->r_sge[1];
-               sge->addr = 0;
-               sge->length = ic->i_frag_sz;
-               sge->lkey = ic->i_mr->lkey;
+               for (j = 1; j < num_recv_sge; j++) {
+                       sge = recv->r_sge + j;
+                       sge->addr = 0;
+                       sge->length = PAGE_SIZE;
+                       sge->lkey = ic->i_mr->lkey;
+               }
        }
 }
 
@@ -168,11 +172,16 @@ static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
 }
 
 /* Detach and free frags */
-static void rds_ib_recv_free_frag(struct rds_page_frag *frag)
+static void rds_ib_recv_free_frag(struct rds_page_frag *frag, int nent)
 {
-       rdsdebug("RDS/IB: frag %p page %p\n", frag, sg_page(&frag->f_sg));
+       struct scatterlist *s;
+       int i;
+
        list_del_init(&frag->f_item);
-       __free_pages(sg_page(&frag->f_sg), get_order(frag->f_sg.length));
+       for_each_sg(frag->f_sg, s, nent, i) {
+               rdsdebug("RDS/IB: frag %p page %p\n", frag, sg_page(s));
+               __free_pages(sg_page(s), get_order(s->length));
+       }
 }
 
 void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
@@ -200,7 +209,7 @@ void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
        list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
                list_del(&frag->f_cache_entry);
                WARN_ON(!list_empty(&frag->f_item));
-               rds_ib_recv_free_frag(frag);
+               rds_ib_recv_free_frag(frag, ic->i_frag_pages);
                atomic_sub(ic->i_frag_pages, &rds_ib_allocation);
                kmem_cache_free(rds_ib_frag_slab, frag);
                atomic_sub(ic->i_frag_sz / 1024, &ic->i_cache_allocs);
@@ -216,6 +225,8 @@ void rds_ib_recv_rebuild_caches(struct rds_ib_connection *ic)
        /* init it with the used frag size */
        if (!ic->i_frag_cache_sz) {
                ic->i_frag_cache_sz = ic->i_frag_sz;
+               pr_debug("RDS/IB: assigning caches for ic %p i_cm_id %p, frag{%d->%d}\n",
+                        ic, ic->i_cm_id, ic->i_frag_cache_sz, ic->i_frag_sz);
                return;
        }
 
@@ -245,13 +256,28 @@ static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache
 static void rds_ib_frag_free(struct rds_ib_connection *ic,
                             struct rds_page_frag *frag)
 {
-       rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
+       struct scatterlist *sg;
+       int i = 0;
+
+       for_each_sg(frag->f_sg, sg, ic->i_frag_pages, i)
+               rdsdebug("frag %p page %p\n", frag, sg_page(sg));
 
        rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
        atomic_add(ic->i_frag_sz/1024, &ic->i_cache_allocs);
        rds_ib_stats_add(s_ib_recv_added_to_cache, ic->i_frag_sz);
 }
 
+static int sg_total_lens(struct scatterlist *sg)
+{
+       int len = 0;
+
+       while (sg) {
+               len += sg->length;
+               sg = sg_next(sg);
+       }
+       return len;
+}
+
 /* Recycle inc after freeing attached frags */
 void rds_ib_inc_free(struct rds_incoming *inc)
 {
@@ -264,9 +290,9 @@ void rds_ib_inc_free(struct rds_incoming *inc)
 
        /* Free attached frags */
        list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
-               if (frag->f_sg.length != ic->i_frag_sz)
-                       rds_ib_recv_free_frag(frag);
-               else {
+               if (sg_total_lens(frag->f_sg) != ic->i_frag_sz) {
+                       rds_ib_recv_free_frag(frag, sg_total_lens(frag->f_sg) / PAGE_SIZE);
+               } else {
                        list_del_init(&frag->f_item);
                        rds_ib_frag_free(ic, frag);
                }
@@ -285,7 +311,8 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
                recv->r_ibinc = NULL;
        }
        if (recv->r_frag) {
-               ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
+               ib_dma_unmap_sg(ic->i_cm_id->device, recv->r_frag->f_sg, ic->i_frag_pages,
+                               DMA_FROM_DEVICE);
                rds_ib_frag_free(ic, recv->r_frag);
                recv->r_frag = NULL;
        }
@@ -325,8 +352,12 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic
 {
        struct rds_page_frag *frag;
        struct list_head *cache_item;
-       int ret;
+       struct scatterlist *sg;
+       struct scatterlist *s;
        int avail_allocs;
+       int ret;
+       int i;
+       int j;
 
        cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
        if (cache_item) {
@@ -352,13 +383,17 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic
                        return NULL;
                }
 
-               sg_init_table(&frag->f_sg, 1);
-               ret = rds_page_remainder_alloc(&frag->f_sg,
-                                              ic->i_frag_sz, page_mask, true);
-               if (ret) {
-                       kmem_cache_free(rds_ib_frag_slab, frag);
-                       atomic_sub(ic->i_frag_pages, &rds_ib_allocation);
-                       return NULL;
+               sg_init_table(frag->f_sg, ic->i_frag_pages);
+               for_each_sg(frag->f_sg, sg, ic->i_frag_pages, i) {
+                       ret = rds_page_remainder_alloc(sg,
+                                                      PAGE_SIZE, page_mask, false);
+                       if (ret) {
+                               for_each_sg(frag->f_sg, s, i, j)
+                                       __free_pages(sg_page(s), get_order(s->length));
+                               kmem_cache_free(rds_ib_frag_slab, frag);
+                               atomic_sub(ic->i_frag_pages, &rds_ib_allocation);
+                               return NULL;
+                       }
                }
                rds_ib_stats_inc(s_ib_rx_total_frags);
        }
@@ -372,7 +407,9 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
                                  struct rds_ib_recv_work *recv, gfp_t gfp)
 {
        struct rds_ib_connection *ic = conn->c_transport_data;
+       struct scatterlist *sg;
        struct ib_sge *sge;
+       int i;
        int ret = -ENOMEM;
        gfp_t slab_mask = GFP_NOWAIT;
        gfp_t page_mask = GFP_NOWAIT;
@@ -402,17 +439,18 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
        if (!recv->r_frag)
                goto out;
 
-       ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
-                           1, DMA_FROM_DEVICE);
-       WARN_ON(ret != 1);
+       ret = ib_dma_map_sg(ic->i_cm_id->device, recv->r_frag->f_sg,
+                           ic->i_frag_pages, DMA_FROM_DEVICE);
 
-       sge = &recv->r_sge[0];
+       sge = recv->r_sge;
        sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
        sge->length = sizeof(struct rds_header);
 
-       sge = &recv->r_sge[1];
-       sge->addr = sg_dma_address(&recv->r_frag->f_sg);
-       sge->length = sg_dma_len(&recv->r_frag->f_sg);
+       for_each_sg(recv->r_frag->f_sg, sg, ic->i_frag_pages, i) {
+               sge = recv->r_sge + i + 1;
+               sge->addr = sg_dma_address(sg);
+               sge->length = sg_dma_len(sg);
+       }
 
        ret = 0;
 out:
@@ -430,8 +468,8 @@ static void rds_ib_srq_clear_one(struct rds_ib_srq *srq,
                recv->r_ibinc = NULL;
        }
        if (recv->r_frag) {
-               ib_dma_unmap_sg(srq->rds_ibdev->dev, &recv->r_frag->f_sg,
-                               1, DMA_FROM_DEVICE);
+               ib_dma_unmap_sg(srq->rds_ibdev->dev, recv->r_frag->f_sg, recv->r_ic ?
+                               recv->r_ic->i_frag_pages : NUM_RDS_RECV_SG, DMA_FROM_DEVICE);
                if (recv->r_ic)
                        rds_ib_frag_free(recv->r_ic, recv->r_frag);
                else
@@ -445,7 +483,9 @@ static int rds_ib_srq_refill_one(struct rds_ib_srq *srq,
                                struct rds_ib_connection *ic,
                                struct rds_ib_recv_work *recv, gfp_t gfp)
 {
+       struct scatterlist *sg;
        struct ib_sge *sge;
+       int i;
        int ret = -ENOMEM;
        gfp_t slab_mask = GFP_NOWAIT;
        gfp_t page_mask = GFP_NOWAIT;
@@ -476,12 +516,10 @@ static int rds_ib_srq_refill_one(struct rds_ib_srq *srq,
        if (!recv->r_frag)
                goto out;
 
-       ret = ib_dma_map_sg(srq->rds_ibdev->dev, &recv->r_frag->f_sg,
-                       1, DMA_FROM_DEVICE);
+       ret = ib_dma_map_sg(srq->rds_ibdev->dev, recv->r_frag->f_sg,
+                           ic->i_frag_pages, DMA_FROM_DEVICE);
 
-       WARN_ON(ret != 1);
-
-       sge = &recv->r_sge[0];
+       sge = recv->r_sge;
 
        sge->addr = srq->s_recv_hdrs_dma +
                (recv - srq->s_recvs) *
@@ -489,9 +527,11 @@ static int rds_ib_srq_refill_one(struct rds_ib_srq *srq,
 
        sge->length = sizeof(struct rds_header);
 
-       sge = &recv->r_sge[1];
-       sge->addr = sg_dma_address(&recv->r_frag->f_sg);
-       sge->length = sg_dma_len(&recv->r_frag->f_sg);
+       for_each_sg(recv->r_frag->f_sg, sg, ic->i_frag_pages, i) {
+               sge = recv->r_sge + i + 1;
+               sge->addr = sg_dma_address(sg);
+               sge->length = sg_dma_len(sg);
+       }
 
        ret = 0;
 out:
@@ -501,7 +541,12 @@ out:
 static int rds_ib_srq_prefill_one(struct rds_ib_device *rds_ibdev,
                                struct rds_ib_recv_work *recv, int prefill)
 {
+       int num_sge = NUM_RDS_RECV_SG;
+       struct scatterlist *sg;
+       struct scatterlist *s;
        struct ib_sge *sge;
+       int i;
+       int j;
        int ret = -ENOMEM;
        gfp_t slab_mask = GFP_NOWAIT;
        gfp_t page_mask = GFP_NOWAIT;
@@ -523,24 +568,23 @@ static int rds_ib_srq_prefill_one(struct rds_ib_device *rds_ibdev,
        recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
        if (!recv->r_frag)
                goto out;
-       sg_init_table(&recv->r_frag->f_sg, 1);
-       if (recv->r_ic)
-               ret = rds_page_remainder_alloc(&recv->r_frag->f_sg,
-                               recv->r_ic->i_frag_sz, page_mask, true);
-       else
-               ret = rds_page_remainder_alloc(&recv->r_frag->f_sg,
-                               RDS_FRAG_SIZE, page_mask, true);
-       if (ret) {
-               kmem_cache_free(rds_ib_frag_slab, recv->r_frag);
-               goto out;
+       sg_init_table(recv->r_frag->f_sg, num_sge);
+       for_each_sg(recv->r_frag->f_sg, sg, num_sge, i) {
+               ret = rds_page_remainder_alloc(sg,
+                                              PAGE_SIZE, page_mask, false);
+               if (ret) {
+                       for_each_sg(recv->r_frag->f_sg, s, i, j)
+                               __free_pages(sg_page(s), get_order(s->length));
+                       kmem_cache_free(rds_ib_frag_slab, recv->r_frag);
+                       goto out;
+               }
        }
 
        rds_ib_stats_inc(s_ib_rx_total_frags);
        INIT_LIST_HEAD(&recv->r_frag->f_item);
 
-       ret = ib_dma_map_sg(rds_ibdev->dev, &recv->r_frag->f_sg,
-                       1, DMA_FROM_DEVICE);
-       WARN_ON(ret != 1);
+       ret = ib_dma_map_sg(rds_ibdev->dev, recv->r_frag->f_sg,
+                           num_sge, DMA_FROM_DEVICE);
 
        sge = &recv->r_sge[0];
        sge->addr = rds_ibdev->srq->s_recv_hdrs_dma +
@@ -549,10 +593,12 @@ static int rds_ib_srq_prefill_one(struct rds_ib_device *rds_ibdev,
        sge->length = sizeof(struct rds_header);
        sge->lkey = rds_ibdev->mr->lkey;
 
-       sge = &recv->r_sge[1];
-       sge->addr = sg_dma_address(&recv->r_frag->f_sg);
-       sge->length = sg_dma_len(&recv->r_frag->f_sg);
-       sge->lkey = rds_ibdev->mr->lkey;
+       for_each_sg(recv->r_frag->f_sg, sg, num_sge, i) {
+               sge = recv->r_sge + i + 1;
+               sge->addr = sg_dma_address(sg);
+               sge->length = sg_dma_len(sg);
+               sge->lkey = rds_ibdev->mr->lkey;
+       }
 
        ret = 0;
 out:
@@ -594,12 +640,14 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
        struct rds_ib_recv_work *recv;
        struct ib_recv_wr *failed_wr;
        unsigned int posted = 0;
+       struct scatterlist *sg = NULL;
        unsigned int flowctl_credits = 0;
        /* For the time being, 16 seems to be a good starting number to
         * perform flow control update.
         */
        unsigned int flow_cntl_log2_cnt = 16;
        int ret = 0;
+       int i = 0;
        int can_wait = gfp & __GFP_WAIT;
        int must_wake = 0;
        int ring_low = 0;
@@ -645,9 +693,12 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
 
                /* XXX when can this fail? */
                ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
-               rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
-                        recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
-                        (long) sg_dma_address(&recv->r_frag->f_sg), ret);
+               if (recv->r_frag)
+                       for_each_sg(recv->r_frag->f_sg, sg, ic->i_frag_pages, i)
+                               rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
+                                        recv->r_ibinc, sg_page(sg),
+                                        (long)sg_dma_address(sg),
+                                        ret);
                if (ret) {
                        rds_conn_drop(conn, DR_IB_POST_RECV_FAIL);
                        pr_warn("RDS/IB: recv post on %pI4 returned %d, disconnecting and reconnecting\n",
@@ -790,6 +841,7 @@ int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
        struct rds_ib_connection *ic = inc->i_conn->c_transport_data;
        struct rds_ib_incoming *ibinc;
        struct rds_page_frag *frag;
+       struct scatterlist *sg;
        unsigned long to_copy;
        unsigned long frag_off = 0;
        int copied = 0;
@@ -799,21 +851,17 @@ int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
        ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
        frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
        len = be32_to_cpu(inc->i_hdr.h_len);
+       sg = frag->f_sg;
 
        while (iov_iter_count(to) && copied < len) {
-               if (frag_off == ic->i_frag_sz) {
-                       frag = list_entry(frag->f_item.next,
-                                         struct rds_page_frag, f_item);
-                       frag_off = 0;
-               }
                to_copy = min_t(unsigned long, iov_iter_count(to),
-                               ic->i_frag_sz - frag_off);
+                               sg->length - frag_off);
                to_copy = min_t(unsigned long, to_copy, len - copied);
 
                /* XXX needs + offset for multiple recvs per page */
                rds_stats_add(s_copy_to_user, to_copy);
-               ret = copy_page_to_iter(sg_page(&frag->f_sg),
-                                       frag->f_sg.offset + frag_off,
+               ret = copy_page_to_iter(sg_page(sg),
+                                       sg->offset + frag_off,
                                        to_copy,
                                        to);
                if (ret != to_copy)
@@ -821,6 +869,19 @@ int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
 
                frag_off += to_copy;
                copied += to_copy;
+
+               if (frag_off == sg->length) {
+                       frag_off = 0;
+                       sg = sg_next(sg);
+               }
+
+               if (copied % ic->i_frag_sz == 0) {
+                       frag = list_entry(frag->f_item.next,
+                                         struct rds_page_frag, f_item);
+                       frag_off = 0;
+                       sg = frag->f_sg;
+               }
+
        }
 
        return copied;
@@ -1066,9 +1127,9 @@ static void rds_ib_cong_recv(struct rds_connection *conn,
                to_copy = min(ic->i_frag_sz - frag_off, RDS_CONG_PAGE_SIZE - map_off);
                BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
 
-               addr = kmap_atomic(sg_page(&frag->f_sg));
+               addr = kmap_atomic(sg_page(frag->f_sg));
 
-               src = addr + frag->f_sg.offset + frag_off;
+               src = addr + frag->f_sg[0].offset + frag_off;
                dst = (void *)map->m_page_addrs[map_page] + map_off;
                for (k = 0; k < to_copy; k += 8) {
                        /* Record ports that became uncongested, ie
@@ -1340,7 +1401,7 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
        } else
                recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
 
-       ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
+       ib_dma_unmap_sg(ic->i_cm_id->device, recv->r_frag->f_sg, ic->i_frag_pages, DMA_FROM_DEVICE);
 
        if (wc->status == IB_WC_SUCCESS) {
                if (rds_ib_srq_enabled)
@@ -1470,7 +1531,8 @@ int rds_ib_srq_prefill_ring(struct rds_ib_device *rds_ibdev)
                recv->r_wr.next = NULL;
                recv->r_wr.wr_id = i;
                recv->r_wr.sg_list = recv->r_sge;
-               recv->r_wr.num_sge = RDS_IB_RECV_SGE;
+               /* always posted with the max supported SGEs plus one header SGE */
+               recv->r_wr.num_sge = NUM_RDS_RECV_SG + 1;
                recv->r_ibinc = NULL;
                recv->r_frag = NULL;
                recv->r_ic = NULL;
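
For intuition (illustrative only, not from the patch), the per-page
walk that rds_ib_inc_copy_to_user now performs can be modeled in user
space: each fragment holds NUM_SG page-sized chunks, the inner index
advances like sg_next(), and consuming a whole fragment moves to the
next list entry:

        #include <stdio.h>
        #include <string.h>

        #define PAGE_SZ 4096
        #define NUM_SG  4                        /* pages per fragment */

        struct frag { char page[NUM_SG][PAGE_SZ]; };

        /* copy up to len bytes out of a chain of frags, page by page */
        static size_t copy_out(const struct frag *frags, size_t nfrags,
                               char *dst, size_t len)
        {
                size_t copied = 0, off = 0, f = 0, s = 0;

                while (copied < len && f < nfrags) {
                        size_t chunk = PAGE_SZ - off;

                        if (chunk > len - copied)
                                chunk = len - copied;
                        memcpy(dst + copied, frags[f].page[s] + off, chunk);
                        copied += chunk;
                        off += chunk;

                        if (off == PAGE_SZ) {            /* sg_next() */
                                off = 0;
                                if (++s == NUM_SG) {     /* next frag in list */
                                        s = 0;
                                        f++;
                                }
                        }
                }
                return copied;
        }

        int main(void)
        {
                static struct frag frags[2];
                static char out[sizeof(frags)];

                memset(frags, 'x', sizeof(frags));
                printf("copied %zu bytes\n",
                       copy_out(frags, 2, out, sizeof(out)));
                return 0;
        }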