#include <linux/types.h>
+/* These sparse annotated types shouldn't be in any user
+ * visible header file. We should clean this up rather
+ * than kludging around them. */
+#ifndef __KERNEL__
+#define __be16 u_int16_t
+#define __be32 u_int32_t
+#define __be64 u_int64_t
+#endif
+
#define RDS_IB_ABI_VERSION 0x301
/*
/* deprecated: RDS_BARRIER 4 */
#define RDS_RECVERR 5
#define RDS_CONG_MONITOR 6
-#define RDS_GET_MR_FOR_DEST 7
/*
* Control message types for SOL_RDS.
#define RDS_CMSG_RDMA_MAP 3
#define RDS_CMSG_RDMA_STATUS 4
#define RDS_CMSG_CONG_UPDATE 5
-#define RDS_CMSG_ATOMIC_FADD 6
-#define RDS_CMSG_ATOMIC_CSWP 7
-#define RDS_CMSG_MASKED_ATOMIC_FADD 8
-#define RDS_CMSG_MASKED_ATOMIC_CSWP 9
#define RDS_INFO_FIRST 10000
#define RDS_INFO_COUNTERS 10000
#define RDS_INFO_LAST 10010
struct rds_info_counter {
- uint8_t name[32];
- uint64_t value;
+ u_int8_t name[32];
+ u_int64_t value;
} __attribute__((packed));
#define RDS_INFO_CONNECTION_FLAG_SENDING 0x01
#define TRANSNAMSIZ 16
struct rds_info_connection {
- uint64_t next_tx_seq;
- uint64_t next_rx_seq;
+ u_int64_t next_tx_seq;
+ u_int64_t next_rx_seq;
__be32 laddr;
__be32 faddr;
- uint8_t transport[TRANSNAMSIZ]; /* null term ascii */
- uint8_t flags;
+ u_int8_t transport[TRANSNAMSIZ]; /* null term ascii */
+ u_int8_t flags;
+} __attribute__((packed));
+
+struct rds_info_flow {
+ __be32 laddr;
+ __be32 faddr;
+ u_int32_t bytes;
+ __be16 lport;
+ __be16 fport;
} __attribute__((packed));
#define RDS_INFO_MESSAGE_FLAG_ACK 0x01
#define RDS_INFO_MESSAGE_FLAG_FAST_ACK 0x02
struct rds_info_message {
- uint64_t seq;
- uint32_t len;
+ u_int64_t seq;
+ u_int32_t len;
__be32 laddr;
__be32 faddr;
__be16 lport;
__be16 fport;
- uint8_t flags;
+ u_int8_t flags;
} __attribute__((packed));
struct rds_info_socket {
- uint32_t sndbuf;
+ u_int32_t sndbuf;
__be32 bound_addr;
__be32 connected_addr;
__be16 bound_port;
__be16 connected_port;
- uint32_t rcvbuf;
- uint64_t inum;
-} __attribute__((packed));
-
-struct rds_info_tcp_socket {
- __be32 local_addr;
- __be16 local_port;
- __be32 peer_addr;
- __be16 peer_port;
- uint64_t hdr_rem;
- uint64_t data_rem;
- uint32_t last_sent_nxt;
- uint32_t last_expected_una;
- uint32_t last_seen_una;
+ u_int32_t rcvbuf;
+ u_int64_t inum;
} __attribute__((packed));
#define RDS_IB_GID_LEN 16
* (so that the application does not have to worry about
* alignment).
*/
-typedef uint64_t rds_rdma_cookie_t;
+typedef u_int64_t rds_rdma_cookie_t;
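/*
 * For illustration only (not part of this patch): the kernel side, in
 * net/rds/rdma.h, packs the R_Key into the low 32 bits of the cookie and
 * the byte offset into the high 32 bits, roughly along these lines:
 */
static inline rds_rdma_cookie_t rds_rdma_make_cookie(u_int32_t r_key,
						     u_int32_t offset)
{
	/* low half: R_Key from the HCA; high half: offset into the MR */
	return r_key | (u_int64_t) offset << 32;
}

static inline u_int32_t rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
{
	return (u_int32_t) cookie;
}

static inline u_int32_t rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
{
	return (u_int32_t) (cookie >> 32);
}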
struct rds_iovec {
- uint64_t addr;
- uint64_t bytes;
+ u_int64_t addr;
+ u_int64_t bytes;
};
struct rds_get_mr_args {
struct rds_iovec vec;
- uint64_t cookie_addr;
+ u_int64_t cookie_addr;
uint64_t flags;
};
-struct rds_get_mr_for_dest_args {
- struct sockaddr_storage dest_addr;
- struct rds_iovec vec;
- uint64_t cookie_addr;
- uint64_t flags;
-};
-
struct rds_free_mr_args {
rds_rdma_cookie_t cookie;
- uint64_t flags;
+ u_int64_t flags;
};
struct rds_rdma_args {
rds_rdma_cookie_t cookie;
struct rds_iovec remote_vec;
- uint64_t local_vec_addr;
- uint64_t nr_local;
- uint64_t flags;
- uint64_t user_token;
-};
-
-struct rds_atomic_args {
- rds_rdma_cookie_t cookie;
- uint64_t local_addr;
- uint64_t remote_addr;
- union {
- struct {
- uint64_t compare;
- uint64_t swap;
- } cswp;
- struct {
- uint64_t add;
- } fadd;
- struct {
- uint64_t compare;
- uint64_t swap;
- uint64_t compare_mask;
- uint64_t swap_mask;
- } m_cswp;
- struct {
- uint64_t add;
- uint64_t nocarry_mask;
- } m_fadd;
- };
- uint64_t flags;
- uint64_t user_token;
+ u_int64_t local_vec_addr;
+ u_int64_t nr_local;
+ u_int64_t flags;
+ u_int64_t user_token;
};
struct rds_rdma_notify {
- uint64_t user_token;
+ u_int64_t user_token;
int32_t status;
};
#define RDS_RDMA_USE_ONCE 0x0008 /* free MR after use */
#define RDS_RDMA_DONTWAIT 0x0010 /* Don't wait in SET_BARRIER */
#define RDS_RDMA_NOTIFY_ME 0x0020 /* Notify when operation completes */
-#define RDS_RDMA_SILENT 0x0040 /* Do not interrupt remote */
#endif /* IB_RDS_H */
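/*
 * Usage sketch (illustration only, not part of this patch): registering a
 * buffer for RDMA from user space with the definitions above.  PF_RDS,
 * SOL_RDS and RDS_GET_MR come from elided parts of this header; "buf" and
 * "len" stand for an application buffer.
 *
 *	int fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
 *	// bind(fd, ...) to a local RDS address/port first
 *
 *	rds_rdma_cookie_t cookie = 0;
 *	struct rds_get_mr_args args = {
 *		.vec         = { .addr  = (u_int64_t) (unsigned long) buf,
 *				 .bytes = len },
 *		.cookie_addr = (u_int64_t) (unsigned long) &cookie,
 *		.flags       = RDS_RDMA_USE_ONCE,
 *	};
 *	if (setsockopt(fd, SOL_RDS, RDS_GET_MR, &args, sizeof(args)) < 0)
 *		perror("RDS_GET_MR");
 *	// "cookie" now identifies the MR; it is sent to the peer, which
 *	// uses it as rds_rdma_args.cookie for its RDMA operation.
 */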
config RDS
- tristate "The RDS Protocol"
- depends on INET
+ tristate "Reliable Datagram Sockets (RDS) (EXPERIMENTAL)"
+ depends on INET && INFINIBAND_IPOIB && EXPERIMENTAL
+ depends on INFINIBAND && INFINIBAND_ADDR_TRANS
---help---
- The RDS (Reliable Datagram Sockets) protocol provides reliable,
- sequenced delivery of datagrams over Infiniband, iWARP,
- or TCP.
-
-config RDS_RDMA
- tristate "RDS over Infiniband and iWARP"
- depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS
- ---help---
- Allow RDS to use Infiniband and iWARP as a transport.
- This transport supports RDMA operations.
-
-config RDS_TCP
- tristate "RDS over TCP"
- depends on RDS
- ---help---
- Allow RDS to use TCP as a transport.
- This transport does not support RDMA operations.
+ RDS provides reliable, sequenced delivery of datagrams
+ over Infiniband.
config RDS_DEBUG
- bool "RDS debugging messages"
+ bool "Debugging messages"
depends on RDS
default n
obj-$(CONFIG_RDS) += rds.o
rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \
recv.o send.o stats.o sysctl.o threads.o transport.o \
- loop.o page.o rdma.o
-
-obj-$(CONFIG_RDS_RDMA) += rds_rdma.o
-rds_rdma-y := rdma_transport.o \
+ loop.o page.o rdma.o \
+ rdma_transport.o \
ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
ib_sysctl.o ib_rdma.o \
iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \
iw_sysctl.o iw_rdma.o
-
-obj-$(CONFIG_RDS_TCP) += rds_tcp.o
-rds_tcp-y := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \
- tcp_send.o tcp_stats.o
-
-ccflags-$(CONFIG_RDS_DEBUG) := -DDEBUG
+ifeq ($(CONFIG_RDS_DEBUG), y)
+EXTRA_CFLAGS += -DDEBUG
+endif
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/kernel.h>
-#include <linux/gfp.h>
#include <linux/in.h>
#include <linux/poll.h>
+#include <linux/version.h>
#include <net/sock.h>
#include "rds.h"
-
-char *rds_str_array(char **array, size_t elements, size_t index)
-{
- if ((index < elements) && array[index])
- return array[index];
- else
- return "unknown";
-}
-EXPORT_SYMBOL(rds_str_array);
+#include "rdma.h"
+#include "rdma_transport.h"
/* this is just used for stats gathering :/ */
static DEFINE_SPINLOCK(rds_sock_lock);
{
struct sock *sk = sock->sk;
struct rds_sock *rs;
+ unsigned long flags;
- if (!sk)
+ if (sk == NULL)
goto out;
rs = rds_sk_to_rs(sk);
* with the socket. */
rds_clear_recv_queue(rs);
rds_cong_remove_socket(rs);
-
- /*
- * the binding lookup hash uses rcu, we need to
- * make sure we synchronize_rcu before we free our
- * entry
- */
rds_remove_bound(rs);
- synchronize_rcu();
-
rds_send_drop_to(rs, NULL);
rds_rdma_drop_keys(rs);
rds_notify_queue_get(rs, NULL);
- spin_lock_bh(&rds_sock_lock);
+ spin_lock_irqsave(&rds_sock_lock, flags);
list_del_init(&rs->rs_item);
rds_sock_count--;
- spin_unlock_bh(&rds_sock_lock);
-
- rds_trans_put(rs->rs_transport);
+ spin_unlock_irqrestore(&rds_sock_lock, flags);
sock->sk = NULL;
sock_put(sk);
unsigned int mask = 0;
unsigned long flags;
- poll_wait(file, sk_sleep(sk), wait);
+ poll_wait(file, sk->sk_sleep, wait);
- if (rs->rs_seen_congestion)
- poll_wait(file, &rds_poll_waitq, wait);
+ poll_wait(file, &rds_poll_waitq, wait);
read_lock_irqsave(&rs->rs_recv_lock, flags);
if (!rs->rs_cong_monitor) {
mask |= (POLLIN | POLLRDNORM);
spin_unlock(&rs->rs_lock);
}
- if (!list_empty(&rs->rs_recv_queue) ||
- !list_empty(&rs->rs_notify_queue))
+ if (!list_empty(&rs->rs_recv_queue)
+ || !list_empty(&rs->rs_notify_queue))
mask |= (POLLIN | POLLRDNORM);
if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
mask |= (POLLOUT | POLLWRNORM);
read_unlock_irqrestore(&rs->rs_recv_lock, flags);
- /* clear state any time we wake a seen-congested socket */
- if (mask)
- rs->rs_seen_congestion = 0;
-
return mask;
}
}
static int rds_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+ char __user *optval, int optlen)
{
struct rds_sock *rs = rds_sk_to_rs(sock->sk);
int ret;
case RDS_GET_MR:
ret = rds_get_mr(rs, optval, optlen);
break;
- case RDS_GET_MR_FOR_DEST:
- ret = rds_get_mr_for_dest(rs, optval, optlen);
- break;
case RDS_FREE_MR:
ret = rds_free_mr(rs, optval, optlen);
break;
if (len < sizeof(int))
ret = -EINVAL;
else
- if (put_user(rs->rs_recverr, (int __user *) optval) ||
- put_user(sizeof(int), optlen))
+ if (put_user(rs->rs_recverr, (int __user *) optval)
+ || put_user(sizeof(int), optlen))
ret = -EFAULT;
else
ret = 0;
.obj_size = sizeof(struct rds_sock),
};
-static const struct proto_ops rds_proto_ops = {
+static struct proto_ops rds_proto_ops = {
.family = AF_RDS,
.owner = THIS_MODULE,
.release = rds_release,
static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
{
+ unsigned long flags;
struct rds_sock *rs;
sock_init_data(sock, sk);
spin_lock_init(&rs->rs_rdma_lock);
rs->rs_rdma_keys = RB_ROOT;
- spin_lock_bh(&rds_sock_lock);
+ spin_lock_irqsave(&rds_sock_lock, flags);
list_add_tail(&rs->rs_item, &rds_sock_list);
rds_sock_count++;
- spin_unlock_bh(&rds_sock_lock);
+ spin_unlock_irqrestore(&rds_sock_lock, flags);
return 0;
}
-static int rds_create(struct net *net, struct socket *sock, int protocol,
- int kern)
+static int rds_create(struct net *net, struct socket *sock, int protocol)
{
struct sock *sk;
sock_put(rds_rs_to_sk(rs));
}
-static const struct net_proto_family rds_family_ops = {
+static struct net_proto_family rds_family_ops = {
.family = AF_RDS,
.create = rds_create,
.owner = THIS_MODULE,
struct rds_info_lengths *lens)
{
struct rds_sock *rs;
+ struct sock *sk;
struct rds_incoming *inc;
+ unsigned long flags;
unsigned int total = 0;
len /= sizeof(struct rds_info_message);
- spin_lock_bh(&rds_sock_lock);
+ spin_lock_irqsave(&rds_sock_lock, flags);
list_for_each_entry(rs, &rds_sock_list, rs_item) {
+ sk = rds_rs_to_sk(rs);
read_lock(&rs->rs_recv_lock);
/* XXX too lazy to maintain counts.. */
read_unlock(&rs->rs_recv_lock);
}
- spin_unlock_bh(&rds_sock_lock);
+ spin_unlock_irqrestore(&rds_sock_lock, flags);
lens->nr = total;
lens->each = sizeof(struct rds_info_message);
{
struct rds_info_socket sinfo;
struct rds_sock *rs;
+ unsigned long flags;
len /= sizeof(struct rds_info_socket);
- spin_lock_bh(&rds_sock_lock);
+ spin_lock_irqsave(&rds_sock_lock, flags);
if (len < rds_sock_count)
goto out;
lens->nr = rds_sock_count;
lens->each = sizeof(struct rds_info_socket);
- spin_unlock_bh(&rds_sock_lock);
+ spin_unlock_irqrestore(&rds_sock_lock, flags);
}
-static void rds_exit(void)
+static void __exit rds_exit(void)
{
+ rds_rdma_exit();
sock_unregister(rds_family_ops.family);
proto_unregister(&rds_proto);
rds_conn_exit();
}
module_exit(rds_exit);
-static int rds_init(void)
+static int __init rds_init(void)
{
int ret;
rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
+ /* ib/iwarp transports currently compiled-in */
+ ret = rds_rdma_init();
+ if (ret)
+ goto out_sock;
goto out;
+out_sock:
+ sock_unregister(rds_family_ops.family);
out_proto:
proto_unregister(&rds_proto);
out_stats:
#include <net/sock.h>
#include <linux/in.h>
#include <linux/if_arp.h>
-#include <linux/jhash.h>
-#include <linux/ratelimit.h>
#include "rds.h"
-#define BIND_HASH_SIZE 1024
-static struct hlist_head bind_hash_table[BIND_HASH_SIZE];
+/*
+ * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't
+ * particularly zippy.
+ *
+ * This is now called for every incoming frame so we arguably care much more
+ * about it than we used to.
+ */
static DEFINE_SPINLOCK(rds_bind_lock);
+static struct rb_root rds_bind_tree = RB_ROOT;
-static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port)
-{
- return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) &
- (BIND_HASH_SIZE - 1));
-}
-
-static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port,
- struct rds_sock *insert)
+static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
+ struct rds_sock *insert)
{
+ struct rb_node **p = &rds_bind_tree.rb_node;
+ struct rb_node *parent = NULL;
struct rds_sock *rs;
- struct hlist_head *head = hash_to_bucket(addr, port);
u64 cmp;
u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);
- rcu_read_lock();
- hlist_for_each_entry_rcu(rs, head, rs_bound_node) {
+ while (*p) {
+ parent = *p;
+ rs = rb_entry(parent, struct rds_sock, rs_bound_node);
+
cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
be16_to_cpu(rs->rs_bound_port);
- if (cmp == needle) {
- rcu_read_unlock();
+ if (needle < cmp)
+ p = &(*p)->rb_left;
+ else if (needle > cmp)
+ p = &(*p)->rb_right;
+ else
return rs;
- }
}
- rcu_read_unlock();
if (insert) {
- /*
- * make sure our addr and port are set before
- * we are added to the list; other RCU
- * readers will find us as soon as the
- * hlist_add_head_rcu is done
- */
- insert->rs_bound_addr = addr;
- insert->rs_bound_port = port;
- rds_sock_addref(insert);
-
- hlist_add_head_rcu(&insert->rs_bound_node, head);
+ rb_link_node(&insert->rs_bound_node, parent, p);
+ rb_insert_color(&insert->rs_bound_node, &rds_bind_tree);
}
return NULL;
}
struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
{
struct rds_sock *rs;
+ unsigned long flags;
- rs = rds_bind_lookup(addr, port, NULL);
-
+ spin_lock_irqsave(&rds_bind_lock, flags);
+ rs = rds_bind_tree_walk(addr, port, NULL);
if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
rds_sock_addref(rs);
else
rs = NULL;
+ spin_unlock_irqrestore(&rds_bind_lock, flags);
rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
ntohs(port));
rover = be16_to_cpu(*port);
last = rover;
} else {
- rover = max_t(u16, prandom_u32(), 2);
+ rover = max_t(u16, net_random(), 2);
last = rover - 1;
}
do {
if (rover == 0)
rover++;
- if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) {
- *port = rs->rs_bound_port;
+ if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) {
+ *port = cpu_to_be16(rover);
ret = 0;
- rdsdebug("rs %p binding to %pI4:%d\n",
- rs, &addr, (int)ntohs(*port));
break;
}
} while (rover++ != last);
+ if (ret == 0) {
+ rs->rs_bound_addr = addr;
+ rs->rs_bound_port = *port;
+ rds_sock_addref(rs);
+
+ rdsdebug("rs %p binding to %pI4:%d\n",
+ rs, &addr, (int)ntohs(*port));
+ }
+
spin_unlock_irqrestore(&rds_bind_lock, flags);
return ret;
rs, &rs->rs_bound_addr,
ntohs(rs->rs_bound_port));
- hlist_del_init_rcu(&rs->rs_bound_node);
+ rb_erase(&rs->rs_bound_node, &rds_bind_tree);
rds_sock_put(rs);
rs->rs_bound_addr = 0;
}
goto out;
trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
- if (!trans) {
+ if (trans == NULL) {
ret = -EADDRNOTAVAIL;
rds_remove_bound(rs);
- printk_ratelimited(KERN_INFO "RDS: rds_bind() could not find a transport, "
- "load rds_tcp or rds_rdma?\n");
goto out;
}
out:
release_sock(sk);
-
- /* we might have called rds_remove_bound on error */
- if (ret)
- synchronize_rcu();
return ret;
}
* SOFTWARE.
*
*/
-#include <linux/slab.h>
#include <linux/types.h>
#include <linux/rbtree.h>
-#include <linux/bitops.h>
-#include <linux/export.h>
+
+#include <asm-generic/bitops/le.h>
#include "rds.h"
unsigned long flags;
map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
- if (!map)
+ if (map == NULL)
return NULL;
map->m_addr = addr;
ret = rds_cong_tree_walk(addr, map);
spin_unlock_irqrestore(&rds_cong_lock, flags);
- if (!ret) {
+ if (ret == NULL) {
ret = map;
map = NULL;
}
conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
- if (!(conn->c_lcong && conn->c_fcong))
+ if (conn->c_lcong == NULL || conn->c_fcong == NULL)
return -ENOMEM;
return 0;
list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
if (!test_and_set_bit(0, &conn->c_map_queued)) {
rds_stats_inc(s_cong_update_queued);
- /* We cannot inline the call to rds_send_xmit() here
- * for two reasons (both pertaining to a TCP transport):
- * 1. When we get here from the receive path, we
- * are already holding the sock_lock (held by
- * tcp_v4_rcv()). So inlining calls to
- * tcp_setsockopt and/or tcp_sendmsg will deadlock
- * when it tries to get the sock_lock().
- * 2. Interrupts are masked so that we can mark the
- * port congested from both send and recv paths.
- * (See comment around declaration of rds_cong_lock).
- * An attempt to get the sock_lock() here will
- * therefore trigger warnings.
- * Defer the xmit to rds_send_worker() instead.
- */
queue_delayed_work(rds_wq, &conn->c_send_w, 0);
}
}
read_unlock_irqrestore(&rds_cong_monitor_lock, flags);
}
}
-EXPORT_SYMBOL_GPL(rds_cong_map_updated);
int rds_cong_updated_since(unsigned long *recent)
{
i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
- __set_bit_le(off, (void *)map->m_page_addrs[i]);
+ generic___set_le_bit(off, (void *)map->m_page_addrs[i]);
}
void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
- __clear_bit_le(off, (void *)map->m_page_addrs[i]);
+ generic___clear_le_bit(off, (void *)map->m_page_addrs[i]);
}
static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
- return test_bit_le(off, (void *)map->m_page_addrs[i]);
+ return generic_test_le_bit(off, (void *)map->m_page_addrs[i]);
}
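/*
 * Worked example of the port -> (page, bit) mapping used above, assuming
 * 4K pages so that RDS_CONG_MAP_PAGE_BITS == 32768 (illustration only,
 * not part of this patch):
 *
 *	port 40000 -> i = 40000 / 32768 = 1, off = 40000 % 32768 = 7232
 *	port     7 -> i = 0, off = 7
 *
 * so the full 65536-port congestion map spans two pages per map on such
 * a configuration.
 */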
void rds_cong_add_socket(struct rds_sock *rs)
*/
#include <linux/kernel.h>
#include <linux/list.h>
-#include <linux/slab.h>
-#include <linux/export.h>
#include <net/inet_hashtables.h>
#include "rds.h"
#include "loop.h"
+#include "rdma.h"
#define RDS_CONNECTION_HASH_BITS 12
#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
{
- static u32 rds_hash_secret __read_mostly;
-
- unsigned long hash;
-
- net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret));
-
/* Pass NULL, don't need struct net for hash */
- hash = __inet_ehashfn(be32_to_cpu(laddr), 0,
- be32_to_cpu(faddr), 0,
- rds_hash_secret);
+ unsigned long hash = inet_ehashfn(NULL,
+ be32_to_cpu(laddr), 0,
+ be32_to_cpu(faddr), 0);
return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK];
}
var |= RDS_INFO_CONNECTION_FLAG_##suffix; \
} while (0)
-/* rcu read lock must be held or the connection spinlock */
+static inline int rds_conn_is_sending(struct rds_connection *conn)
+{
+ int ret = 0;
+
+ if (!mutex_trylock(&conn->c_send_lock))
+ ret = 1;
+ else
+ mutex_unlock(&conn->c_send_lock);
+
+ return ret;
+}
+
static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
__be32 laddr, __be32 faddr,
struct rds_transport *trans)
{
struct rds_connection *conn, *ret = NULL;
+ struct hlist_node *pos;
- hlist_for_each_entry_rcu(conn, head, c_hash_node) {
+ hlist_for_each_entry(conn, pos, head, c_hash_node) {
if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
conn->c_trans == trans) {
ret = conn;
* and receiving over this connection again in the future. It is up to
* the transport to have serialized this call with its send and recv.
*/
-static void rds_conn_reset(struct rds_connection *conn)
+void rds_conn_reset(struct rds_connection *conn)
{
rdsdebug("connection %pI4 to %pI4 reset\n",
&conn->c_laddr, &conn->c_faddr);
struct rds_transport *trans, gfp_t gfp,
int is_outgoing)
{
- struct rds_connection *conn, *parent = NULL;
+ struct rds_connection *conn, *tmp, *parent = NULL;
struct hlist_head *head = rds_conn_bucket(laddr, faddr);
- struct rds_transport *loop_trans;
unsigned long flags;
int ret;
- struct rds_transport *otrans = trans;
- if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
- goto new_conn;
- rcu_read_lock();
+ spin_lock_irqsave(&rds_conn_lock, flags);
conn = rds_conn_lookup(head, laddr, faddr, trans);
- if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
- laddr == faddr && !is_outgoing) {
+ if (conn
+ && conn->c_loopback
+ && conn->c_trans != &rds_loop_transport
+ && !is_outgoing) {
/* This is a looped back IB connection, and we're
* called by the code handling the incoming connect.
* We need a second connection object into which we
parent = conn;
conn = parent->c_passive;
}
- rcu_read_unlock();
+ spin_unlock_irqrestore(&rds_conn_lock, flags);
if (conn)
goto out;
-new_conn:
- conn = kmem_cache_zalloc(rds_conn_slab, gfp);
- if (!conn) {
+ conn = kmem_cache_alloc(rds_conn_slab, gfp);
+ if (conn == NULL) {
conn = ERR_PTR(-ENOMEM);
goto out;
}
+ memset(conn, 0, sizeof(*conn));
+
INIT_HLIST_NODE(&conn->c_hash_node);
+ conn->c_version = RDS_PROTOCOL_3_0;
conn->c_laddr = laddr;
conn->c_faddr = faddr;
spin_lock_init(&conn->c_lock);
conn->c_next_tx_seq = 1;
- init_waitqueue_head(&conn->c_waitq);
+ mutex_init(&conn->c_send_lock);
INIT_LIST_HEAD(&conn->c_send_queue);
INIT_LIST_HEAD(&conn->c_retrans);
* can bind to the destination address then we'd rather the messages
* flow through loopback rather than either transport.
*/
- loop_trans = rds_trans_get_preferred(faddr);
- if (loop_trans) {
- rds_trans_put(loop_trans);
+ if (rds_trans_get_preferred(faddr)) {
conn->c_loopback = 1;
if (is_outgoing && trans->t_prefer_loopback) {
/* "outgoing" connection - and the transport
}
atomic_set(&conn->c_state, RDS_CONN_DOWN);
- conn->c_send_gen = 0;
conn->c_reconnect_jiffies = 0;
INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker);
INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker);
trans->t_name ? trans->t_name : "[unknown]",
is_outgoing ? "(outgoing)" : "");
- /*
- * Since we ran without holding the conn lock, someone could
- * have created the same conn (either normal or passive) in the
- * interim. We check while holding the lock. If we won, we complete
- * init and return our conn. If we lost, we rollback and return the
- * other one.
- */
spin_lock_irqsave(&rds_conn_lock, flags);
- if (parent) {
- /* Creating passive conn */
- if (parent->c_passive) {
- trans->conn_free(conn->c_transport_data);
- kmem_cache_free(rds_conn_slab, conn);
- conn = parent->c_passive;
- } else {
+ if (parent == NULL) {
+ tmp = rds_conn_lookup(head, laddr, faddr, trans);
+ if (tmp == NULL)
+ hlist_add_head(&conn->c_hash_node, head);
+ } else {
+ tmp = parent->c_passive;
+ if (!tmp)
parent->c_passive = conn;
- rds_cong_add_conn(conn);
- rds_conn_count++;
- }
+ }
+
+ if (tmp) {
+ trans->conn_free(conn->c_transport_data);
+ kmem_cache_free(rds_conn_slab, conn);
+ conn = tmp;
} else {
- /* Creating normal conn */
- struct rds_connection *found;
-
- if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
- found = NULL;
- else
- found = rds_conn_lookup(head, laddr, faddr, trans);
- if (found) {
- trans->conn_free(conn->c_transport_data);
- kmem_cache_free(rds_conn_slab, conn);
- conn = found;
- } else {
- if ((is_outgoing && otrans->t_type == RDS_TRANS_TCP) ||
- (otrans->t_type != RDS_TRANS_TCP)) {
- /* Only the active side should be added to
- * reconnect list for TCP.
- */
- hlist_add_head_rcu(&conn->c_hash_node, head);
- }
- rds_cong_add_conn(conn);
- rds_conn_count++;
- }
+ rds_cong_add_conn(conn);
+ rds_conn_count++;
}
+
spin_unlock_irqrestore(&rds_conn_lock, flags);
out:
{
return __rds_conn_create(laddr, faddr, trans, gfp, 0);
}
-EXPORT_SYMBOL_GPL(rds_conn_create);
struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp)
{
return __rds_conn_create(laddr, faddr, trans, gfp, 1);
}
-EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
-
-void rds_conn_shutdown(struct rds_connection *conn)
-{
- /* shut it down unless it's down already */
- if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
- /*
- * Quiesce the connection mgmt handlers before we start tearing
- * things down. We don't hold the mutex for the entire
- * duration of the shutdown operation, else we may
- * deadlock with the CM handler. Instead, the CM event
- * handler is supposed to check for state DISCONNECTING
- */
- mutex_lock(&conn->c_cm_lock);
- if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
- && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
- rds_conn_error(conn, "shutdown called in state %d\n",
- atomic_read(&conn->c_state));
- mutex_unlock(&conn->c_cm_lock);
- return;
- }
- mutex_unlock(&conn->c_cm_lock);
-
- wait_event(conn->c_waitq,
- !test_bit(RDS_IN_XMIT, &conn->c_flags));
-
- conn->c_trans->conn_shutdown(conn);
- rds_conn_reset(conn);
-
- if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
- /* This can happen - eg when we're in the middle of tearing
- * down the connection, and someone unloads the rds module.
- * Quite reproducible with loopback connections.
- * Mostly harmless.
- */
- rds_conn_error(conn,
- "%s: failed to transition to state DOWN, "
- "current state is %d\n",
- __func__,
- atomic_read(&conn->c_state));
- return;
- }
- }
-
- /* Then reconnect if it's still live.
- * The passive side of an IB loopback connection is never added
- * to the conn hash, so we never trigger a reconnect on this
- * conn - the reconnect is always triggered by the active peer. */
- cancel_delayed_work_sync(&conn->c_conn_w);
- rcu_read_lock();
- if (!hlist_unhashed(&conn->c_hash_node)) {
- rcu_read_unlock();
- rds_queue_reconnect(conn);
- } else {
- rcu_read_unlock();
- }
-}
-/*
- * Stop and free a connection.
- *
- * This can only be used in very limited circumstances. It assumes that once
- * the conn has been shut down, no one else is referencing the connection.
- * We can only ensure this in the rmmod path in the current code.
- */
void rds_conn_destroy(struct rds_connection *conn)
{
struct rds_message *rm, *rtmp;
- unsigned long flags;
rdsdebug("freeing conn %p for %pI4 -> "
"%pI4\n", conn, &conn->c_laddr,
&conn->c_faddr);
- /* Ensure conn will not be scheduled for reconnect */
- spin_lock_irq(&rds_conn_lock);
- hlist_del_init_rcu(&conn->c_hash_node);
- spin_unlock_irq(&rds_conn_lock);
- synchronize_rcu();
-
- /* shut the connection down */
- rds_conn_drop(conn);
- flush_work(&conn->c_down_w);
+ hlist_del_init(&conn->c_hash_node);
- /* make sure lingering queued work won't try to ref the conn */
- cancel_delayed_work_sync(&conn->c_send_w);
- cancel_delayed_work_sync(&conn->c_recv_w);
+ /* wait for the rds thread to shut it down */
+ atomic_set(&conn->c_state, RDS_CONN_ERROR);
+ cancel_delayed_work(&conn->c_conn_w);
+ queue_work(rds_wq, &conn->c_down_w);
+ flush_workqueue(rds_wq);
/* tear down queued messages */
list_for_each_entry_safe(rm, rtmp,
BUG_ON(!list_empty(&conn->c_retrans));
kmem_cache_free(rds_conn_slab, conn);
- spin_lock_irqsave(&rds_conn_lock, flags);
rds_conn_count--;
- spin_unlock_irqrestore(&rds_conn_lock, flags);
}
-EXPORT_SYMBOL_GPL(rds_conn_destroy);
static void rds_conn_message_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
int want_send)
{
struct hlist_head *head;
+ struct hlist_node *pos;
struct list_head *list;
struct rds_connection *conn;
struct rds_message *rm;
- unsigned int total = 0;
unsigned long flags;
+ unsigned int total = 0;
size_t i;
len /= sizeof(struct rds_info_message);
- rcu_read_lock();
+ spin_lock_irqsave(&rds_conn_lock, flags);
for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
i++, head++) {
- hlist_for_each_entry_rcu(conn, head, c_hash_node) {
+ hlist_for_each_entry(conn, pos, head, c_hash_node) {
if (want_send)
list = &conn->c_send_queue;
else
list = &conn->c_retrans;
- spin_lock_irqsave(&conn->c_lock, flags);
+ spin_lock(&conn->c_lock);
/* XXX too lazy to maintain counts.. */
list_for_each_entry(rm, list, m_conn_item) {
conn->c_faddr, 0);
}
- spin_unlock_irqrestore(&conn->c_lock, flags);
+ spin_unlock(&conn->c_lock);
}
}
- rcu_read_unlock();
+
+ spin_unlock_irqrestore(&rds_conn_lock, flags);
lens->nr = total;
lens->each = sizeof(struct rds_info_message);
{
uint64_t buffer[(item_len + 7) / 8];
struct hlist_head *head;
+ struct hlist_node *pos;
+ struct hlist_node *tmp;
struct rds_connection *conn;
+ unsigned long flags;
size_t i;
- rcu_read_lock();
+ spin_lock_irqsave(&rds_conn_lock, flags);
lens->nr = 0;
lens->each = item_len;
for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
i++, head++) {
- hlist_for_each_entry_rcu(conn, head, c_hash_node) {
+ hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) {
/* XXX no c_lock usage.. */
if (!visitor(conn, buffer))
lens->nr++;
}
}
- rcu_read_unlock();
+
+ spin_unlock_irqrestore(&rds_conn_lock, flags);
}
-EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
static int rds_conn_info_visitor(struct rds_connection *conn,
void *buffer)
sizeof(cinfo->transport));
cinfo->flags = 0;
- rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags),
- SENDING);
+ rds_conn_info_set(cinfo->flags,
+ rds_conn_is_sending(conn), SENDING);
/* XXX Future: return the state rather than these funky bits */
rds_conn_info_set(cinfo->flags,
atomic_read(&conn->c_state) == RDS_CONN_CONNECTING,
sizeof(struct rds_info_connection));
}
-int rds_conn_init(void)
+int __init rds_conn_init(void)
{
rds_conn_slab = kmem_cache_create("rds_connection",
sizeof(struct rds_connection),
0, 0, NULL);
- if (!rds_conn_slab)
+ if (rds_conn_slab == NULL)
return -ENOMEM;
rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
atomic_set(&conn->c_state, RDS_CONN_ERROR);
queue_work(rds_wq, &conn->c_down_w);
}
-EXPORT_SYMBOL_GPL(rds_conn_drop);
-
-/*
- * If the connection is down, trigger a connect. We may have scheduled a
- * delayed reconnect however - in this case we should not interfere.
- */
-void rds_conn_connect_if_down(struct rds_connection *conn)
-{
- if (rds_conn_state(conn) == RDS_CONN_DOWN &&
- !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
- queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
-}
-EXPORT_SYMBOL_GPL(rds_conn_connect_if_down);
/*
* An error occurred on the connection
#include <linux/inetdevice.h>
#include <linux/if_arp.h>
#include <linux/delay.h>
-#include <linux/slab.h>
-#include <linux/module.h>
#include "rds.h"
#include "ib.h"
-static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
+unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
-unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
module_param(fmr_pool_size, int, 0444);
MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA");
module_param(fmr_message_size, int, 0444);
MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
-module_param(rds_ib_retry_count, int, 0444);
-MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
-/*
- * we have a clumsy combination of RCU and a rwsem protecting this list
- * because it is used both in the get_mr fast path and while blocking in
- * the FMR flushing path.
- */
-DECLARE_RWSEM(rds_ib_devices_lock);
struct list_head rds_ib_devices;
/* NOTE: if also grabbing ibdev lock, grab this first */
DEFINE_SPINLOCK(ib_nodev_conns_lock);
LIST_HEAD(ib_nodev_conns);
-static void rds_ib_nodev_connect(void)
-{
- struct rds_ib_connection *ic;
-
- spin_lock(&ib_nodev_conns_lock);
- list_for_each_entry(ic, &ib_nodev_conns, ib_node)
- rds_conn_connect_if_down(ic->conn);
- spin_unlock(&ib_nodev_conns_lock);
-}
-
-static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
-{
- struct rds_ib_connection *ic;
- unsigned long flags;
-
- spin_lock_irqsave(&rds_ibdev->spinlock, flags);
- list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
- rds_conn_drop(ic->conn);
- spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
-}
-
-/*
- * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references
- * from interrupt context so we push freeing off into a work struct in krdsd.
- */
-static void rds_ib_dev_free(struct work_struct *work)
-{
- struct rds_ib_ipaddr *i_ipaddr, *i_next;
- struct rds_ib_device *rds_ibdev = container_of(work,
- struct rds_ib_device, free_work);
-
- if (rds_ibdev->mr_pool)
- rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
- if (rds_ibdev->mr)
- ib_dereg_mr(rds_ibdev->mr);
- if (rds_ibdev->pd)
- ib_dealloc_pd(rds_ibdev->pd);
-
- list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
- list_del(&i_ipaddr->list);
- kfree(i_ipaddr);
- }
-
- kfree(rds_ibdev);
-}
-
-void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
-{
- BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
- if (atomic_dec_and_test(&rds_ibdev->refcount))
- queue_work(rds_wq, &rds_ibdev->free_work);
-}
-
-static void rds_ib_add_one(struct ib_device *device)
+void rds_ib_add_one(struct ib_device *device)
{
struct rds_ib_device *rds_ibdev;
struct ib_device_attr *dev_attr;
goto free_attr;
}
- rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
- ibdev_to_node(device));
+ rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL);
if (!rds_ibdev)
goto free_attr;
spin_lock_init(&rds_ibdev->spinlock);
- atomic_set(&rds_ibdev->refcount, 1);
- INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
rds_ibdev->max_wrs = dev_attr->max_qp_wr;
rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
+ rds_ibdev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1);
+ rds_ibdev->fmr_page_size = 1 << rds_ibdev->fmr_page_shift;
+ rds_ibdev->fmr_page_mask = ~((u64) rds_ibdev->fmr_page_size - 1);
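	/*
	 * Example of the fmr_page_* calculation above, assuming the HCA
	 * reports 4K as its smallest supported page size (bit 12 is the
	 * lowest bit set in page_size_cap): ffs() returns 13, so
	 * fmr_page_shift = max(9, 12) = 12, fmr_page_size = 4096 and
	 * fmr_page_mask = ~0xfffULL.  (Illustration only, not part of
	 * this patch.)
	 */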
rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
rds_ibdev->max_fmrs = dev_attr->max_fmr ?
min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
fmr_pool_size;
- rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
- rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
-
rds_ibdev->dev = device;
rds_ibdev->pd = ib_alloc_pd(device);
- if (IS_ERR(rds_ibdev->pd)) {
- rds_ibdev->pd = NULL;
- goto put_dev;
- }
+ if (IS_ERR(rds_ibdev->pd))
+ goto free_dev;
- rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
- if (IS_ERR(rds_ibdev->mr)) {
- rds_ibdev->mr = NULL;
- goto put_dev;
- }
+ rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd,
+ IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(rds_ibdev->mr))
+ goto err_pd;
rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
if (IS_ERR(rds_ibdev->mr_pool)) {
rds_ibdev->mr_pool = NULL;
- goto put_dev;
+ goto err_mr;
}
INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
INIT_LIST_HEAD(&rds_ibdev->conn_list);
-
- down_write(&rds_ib_devices_lock);
- list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices);
- up_write(&rds_ib_devices_lock);
- atomic_inc(&rds_ibdev->refcount);
+ list_add_tail(&rds_ibdev->list, &rds_ib_devices);
ib_set_client_data(device, &rds_ib_client, rds_ibdev);
- atomic_inc(&rds_ibdev->refcount);
- rds_ib_nodev_connect();
+ goto free_attr;
-put_dev:
- rds_ib_dev_put(rds_ibdev);
+err_mr:
+ ib_dereg_mr(rds_ibdev->mr);
+err_pd:
+ ib_dealloc_pd(rds_ibdev->pd);
+free_dev:
+ kfree(rds_ibdev);
free_attr:
kfree(dev_attr);
}
-/*
- * New connections use this to find the device to associate with the
- * connection. It's not in the fast path so we're not concerned about the
- * performance of the IB call. (As of this writing, it uses an interrupt
- * blocking spinlock to serialize walking a per-device list of all registered
- * clients.)
- *
- * RCU is used to handle incoming connections racing with device teardown.
- * Rather than use a lock to serialize removal from the client_data and
- * getting a new reference, we use an RCU grace period. The destruction
- * path removes the device from client_data and then waits for all RCU
- * readers to finish.
- *
- * A new connection can get NULL from this if it's arriving on a
- * device that is in the process of being removed.
- */
-struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
-{
- struct rds_ib_device *rds_ibdev;
-
- rcu_read_lock();
- rds_ibdev = ib_get_client_data(device, &rds_ib_client);
- if (rds_ibdev)
- atomic_inc(&rds_ibdev->refcount);
- rcu_read_unlock();
- return rds_ibdev;
-}
-
-/*
- * The IB stack is letting us know that a device is going away. This can
- * happen if the underlying HCA driver is removed or if PCI hotplug is removing
- * the pci function, for example.
- *
- * This can be called at any time and can be racing with any other RDS path.
- */
-static void rds_ib_remove_one(struct ib_device *device)
+void rds_ib_remove_one(struct ib_device *device)
{
struct rds_ib_device *rds_ibdev;
+ struct rds_ib_ipaddr *i_ipaddr, *i_next;
rds_ibdev = ib_get_client_data(device, &rds_ib_client);
if (!rds_ibdev)
return;
- rds_ib_dev_shutdown(rds_ibdev);
+ list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
+ list_del(&i_ipaddr->list);
+ kfree(i_ipaddr);
+ }
+
+ rds_ib_destroy_conns(rds_ibdev);
- /* stop connection attempts from getting a reference to this device. */
- ib_set_client_data(device, &rds_ib_client, NULL);
+ if (rds_ibdev->mr_pool)
+ rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
- down_write(&rds_ib_devices_lock);
- list_del_rcu(&rds_ibdev->list);
- up_write(&rds_ib_devices_lock);
+ ib_dereg_mr(rds_ibdev->mr);
- /*
- * This synchronize rcu is waiting for readers of both the ib
- * client data and the devices list to finish before we drop
- * both of those references.
- */
- synchronize_rcu();
- rds_ib_dev_put(rds_ibdev);
- rds_ib_dev_put(rds_ibdev);
+ while (ib_dealloc_pd(rds_ibdev->pd)) {
+ rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd);
+ msleep(1);
+ }
+
+ list_del(&rds_ibdev->list);
+ kfree(rds_ibdev);
}
struct ib_client rds_ib_client = {
ic = conn->c_transport_data;
dev_addr = &ic->i_cm_id->route.addr.dev_addr;
- rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
- rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
+ ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
+ ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
- rds_ibdev = ic->rds_ibdev;
+ rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
iinfo->max_send_wr = ic->i_send_ring.w_nr;
iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
iinfo->max_send_sge = rds_ibdev->max_sge;
/* Create a CMA ID and try to bind it. This catches both
* IB and iWARP capable NICs.
*/
- cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
- if (IS_ERR(cm_id))
- return PTR_ERR(cm_id);
+ cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
+ if (!cm_id)
+ return -EADDRNOTAVAIL;
memset(&sin, 0, sizeof(sin));
sin.sin_family = AF_INET;
ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
/* due to this, we will claim to support iWARP devices unless we
check node_type. */
- if (ret || !cm_id->device ||
- cm_id->device->node_type != RDMA_NODE_IB_CA)
+ if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
ret = -EADDRNOTAVAIL;
rdsdebug("addr %pI4 ret %d node type %d\n",
return ret;
}
-static void rds_ib_unregister_client(void)
-{
- ib_unregister_client(&rds_ib_client);
- /* wait for rds_ib_dev_free() to complete */
- flush_workqueue(rds_wq);
-}
-
void rds_ib_exit(void)
{
rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
- rds_ib_unregister_client();
rds_ib_destroy_nodev_conns();
+ ib_unregister_client(&rds_ib_client);
rds_ib_sysctl_exit();
rds_ib_recv_exit();
rds_trans_unregister(&rds_ib_transport);
.laddr_check = rds_ib_laddr_check,
.xmit_complete = rds_ib_xmit_complete,
.xmit = rds_ib_xmit,
+ .xmit_cong_map = NULL,
.xmit_rdma = rds_ib_xmit_rdma,
- .xmit_atomic = rds_ib_xmit_atomic,
.recv = rds_ib_recv,
.conn_alloc = rds_ib_conn_alloc,
.conn_free = rds_ib_conn_free,
.conn_connect = rds_ib_conn_connect,
.conn_shutdown = rds_ib_conn_shutdown,
.inc_copy_to_user = rds_ib_inc_copy_to_user,
+ .inc_purge = rds_ib_inc_purge,
.inc_free = rds_ib_inc_free,
.cm_initiate_connect = rds_ib_cm_initiate_connect,
.cm_handle_connect = rds_ib_cm_handle_connect,
.flush_mrs = rds_ib_flush_mrs,
.t_owner = THIS_MODULE,
.t_name = "infiniband",
- .t_type = RDS_TRANS_IB
};
-int rds_ib_init(void)
+int __init rds_ib_init(void)
{
int ret;
out_sysctl:
rds_ib_sysctl_exit();
out_ibreg:
- rds_ib_unregister_client();
+ ib_unregister_client(&rds_ib_client);
out:
return ret;
}
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
-#include <linux/interrupt.h>
-#include <linux/pci.h>
-#include <linux/slab.h>
#include "rds.h"
#include "rdma_transport.h"
#define RDS_FMR_SIZE 256
-#define RDS_FMR_POOL_SIZE 8192
+#define RDS_FMR_POOL_SIZE 4096
#define RDS_IB_MAX_SGE 8
#define RDS_IB_RECV_SGE 2
#define RDS_IB_DEFAULT_RECV_WR 1024
#define RDS_IB_DEFAULT_SEND_WR 256
-#define RDS_IB_DEFAULT_RETRY_COUNT 2
-
#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
-#define RDS_IB_RECYCLE_BATCH_COUNT 32
-
-extern struct rw_semaphore rds_ib_devices_lock;
extern struct list_head rds_ib_devices;
/*
 * try and minimize the amount of memory tied up in both the device and
* socket receive queues.
*/
+/* page offset of the final full frag that fits in the page */
+#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
struct rds_page_frag {
struct list_head f_item;
- struct list_head f_cache_entry;
- struct scatterlist f_sg;
+ struct page *f_page;
+ unsigned long f_offset;
+ dma_addr_t f_mapped;
};
struct rds_ib_incoming {
struct list_head ii_frags;
- struct list_head ii_cache_entry;
struct rds_incoming ii_inc;
};
-struct rds_ib_cache_head {
- struct list_head *first;
- unsigned long count;
-};
-
-struct rds_ib_refill_cache {
- struct rds_ib_cache_head __percpu *percpu;
- struct list_head *xfer;
- struct list_head *ready;
-};
-
struct rds_ib_connect_private {
/* Add new fields at the end, and don't permute existing fields. */
__be32 dp_saddr;
};
struct rds_ib_send_work {
- void *s_op;
+ struct rds_message *s_rm;
+ struct rds_rdma_op *s_op;
struct ib_send_wr s_wr;
struct ib_sge s_sge[RDS_IB_MAX_SGE];
unsigned long s_queued;
/* tx */
struct rds_ib_work_ring i_send_ring;
- struct rm_data_op *i_data_op;
+ struct rds_message *i_rm;
struct rds_header *i_send_hdrs;
u64 i_send_hdrs_dma;
struct rds_ib_send_work *i_sends;
- atomic_t i_signaled_sends;
/* rx */
- struct tasklet_struct i_recv_tasklet;
struct mutex i_recv_mutex;
struct rds_ib_work_ring i_recv_ring;
struct rds_ib_incoming *i_ibinc;
struct rds_header *i_recv_hdrs;
u64 i_recv_hdrs_dma;
struct rds_ib_recv_work *i_recvs;
+ struct rds_page_frag i_frag;
u64 i_ack_recv; /* last ACK received */
- struct rds_ib_refill_cache i_cache_incs;
- struct rds_ib_refill_cache i_cache_frags;
/* sending acks */
unsigned long i_ack_flags;
/* Batched completions */
unsigned int i_unsignaled_wrs;
+ long i_unsignaled_bytes;
};
/* This assumes that atomic_t is at least 32 bits */
struct ib_pd *pd;
struct ib_mr *mr;
struct rds_ib_mr_pool *mr_pool;
+ int fmr_page_shift;
+ int fmr_page_size;
+ u64 fmr_page_mask;
unsigned int fmr_max_remaps;
unsigned int max_fmrs;
int max_sge;
unsigned int max_wrs;
- unsigned int max_initiator_depth;
- unsigned int max_responder_resources;
spinlock_t spinlock; /* protect the above */
- atomic_t refcount;
- struct work_struct free_work;
};
-#define ibdev_to_node(ibdev) dev_to_node(ibdev->dma_device)
-#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)
-
/* bits for i_ack_flags */
#define IB_ACK_IN_FLIGHT 0
#define IB_ACK_REQUESTED 1
uint64_t s_ib_rdma_mr_pool_flush;
uint64_t s_ib_rdma_mr_pool_wait;
uint64_t s_ib_rdma_mr_pool_depleted;
- uint64_t s_ib_atomic_cswp;
- uint64_t s_ib_atomic_fadd;
};
extern struct workqueue_struct *rds_ib_wq;
/* ib.c */
extern struct rds_transport rds_ib_transport;
-struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
-void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
+extern void rds_ib_add_one(struct ib_device *device);
+extern void rds_ib_remove_one(struct ib_device *device);
extern struct ib_client rds_ib_client;
+extern unsigned int fmr_pool_size;
extern unsigned int fmr_message_size;
-extern unsigned int rds_ib_retry_count;
extern spinlock_t ib_nodev_conns_lock;
extern struct list_head ib_nodev_conns;
int rds_ib_conn_connect(struct rds_connection *conn);
void rds_ib_conn_shutdown(struct rds_connection *conn);
void rds_ib_state_change(struct sock *sk);
-int rds_ib_listen_init(void);
+int __init rds_ib_listen_init(void);
void rds_ib_listen_stop(void);
void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
-void rds_ib_destroy_nodev_conns(void);
+void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock);
+static inline void rds_ib_destroy_nodev_conns(void)
+{
+ __rds_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock);
+}
+static inline void rds_ib_destroy_conns(struct rds_ib_device *rds_ibdev)
+{
+ __rds_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock);
+}
struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
void rds_ib_flush_mrs(void);
/* ib_recv.c */
-int rds_ib_recv_init(void);
+int __init rds_ib_recv_init(void);
void rds_ib_recv_exit(void);
int rds_ib_recv(struct rds_connection *conn);
-int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
-void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
-void rds_ib_recv_refill(struct rds_connection *conn, int prefill);
+int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
+ gfp_t page_gfp, int prefill);
+void rds_ib_inc_purge(struct rds_incoming *inc);
void rds_ib_inc_free(struct rds_incoming *inc);
-int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
+int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
+ size_t size);
void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context);
-void rds_ib_recv_tasklet_fn(unsigned long data);
void rds_ib_recv_init_ring(struct rds_ib_connection *ic);
void rds_ib_recv_clear_ring(struct rds_ib_connection *ic);
void rds_ib_recv_init_ack(struct rds_ib_connection *ic);
extern wait_queue_head_t rds_ib_ring_empty_wait;
/* ib_send.c */
-char *rds_ib_wc_status_str(enum ib_wc_status status);
void rds_ib_xmit_complete(struct rds_connection *conn);
int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off);
void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
void rds_ib_send_init_ring(struct rds_ib_connection *ic);
void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
-int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
- u32 *adv_credits, int need_posted, int max_posted);
-int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
+ u32 *adv_credits, int need_posted);
/* ib_stats.c */
DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
unsigned int avail);
/* ib_sysctl.c */
-int rds_ib_sysctl_init(void);
+int __init rds_ib_sysctl_init(void);
void rds_ib_sysctl_exit(void);
extern unsigned long rds_ib_sysctl_max_send_wr;
extern unsigned long rds_ib_sysctl_max_recv_wr;
extern unsigned long rds_ib_sysctl_max_unsig_bytes;
extern unsigned long rds_ib_sysctl_max_recv_allocation;
extern unsigned int rds_ib_sysctl_flow_control;
+extern ctl_table rds_ib_sysctl_table[];
+
+/*
+ * Helper functions for getting/setting the header and data SGEs in
+ * RDS packets (not RDMA)
+ */
+static inline struct ib_sge *
+rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
+{
+ return &sge[0];
+}
+
+static inline struct ib_sge *
+rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
+{
+ return &sge[1];
+}
#endif
*/
#include <linux/kernel.h>
#include <linux/in.h>
-#include <linux/slab.h>
#include <linux/vmalloc.h>
-#include <linux/ratelimit.h>
#include "rds.h"
#include "ib.h"
-static char *rds_ib_event_type_strings[] = {
-#define RDS_IB_EVENT_STRING(foo) \
- [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo)
- RDS_IB_EVENT_STRING(CQ_ERR),
- RDS_IB_EVENT_STRING(QP_FATAL),
- RDS_IB_EVENT_STRING(QP_REQ_ERR),
- RDS_IB_EVENT_STRING(QP_ACCESS_ERR),
- RDS_IB_EVENT_STRING(COMM_EST),
- RDS_IB_EVENT_STRING(SQ_DRAINED),
- RDS_IB_EVENT_STRING(PATH_MIG),
- RDS_IB_EVENT_STRING(PATH_MIG_ERR),
- RDS_IB_EVENT_STRING(DEVICE_FATAL),
- RDS_IB_EVENT_STRING(PORT_ACTIVE),
- RDS_IB_EVENT_STRING(PORT_ERR),
- RDS_IB_EVENT_STRING(LID_CHANGE),
- RDS_IB_EVENT_STRING(PKEY_CHANGE),
- RDS_IB_EVENT_STRING(SM_CHANGE),
- RDS_IB_EVENT_STRING(SRQ_ERR),
- RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED),
- RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED),
- RDS_IB_EVENT_STRING(CLIENT_REREGISTER),
-#undef RDS_IB_EVENT_STRING
-};
-
-static char *rds_ib_event_str(enum ib_event_type type)
-{
- return rds_str_array(rds_ib_event_type_strings,
- ARRAY_SIZE(rds_ib_event_type_strings), type);
-};
-
/*
* Set the selected protocol version
*/
{
const struct rds_ib_connect_private *dp = NULL;
struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_ib_device *rds_ibdev;
struct ib_qp_attr qp_attr;
int err;
- if (event->param.conn.private_data_len >= sizeof(*dp)) {
+ if (event->param.conn.private_data_len) {
dp = event->param.conn.private_data;
- /* make sure it isn't empty data */
- if (dp->dp_protocol_major) {
- rds_ib_set_protocol(conn,
+ rds_ib_set_protocol(conn,
RDS_PROTOCOL(dp->dp_protocol_major,
- dp->dp_protocol_minor));
- rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
- }
+ dp->dp_protocol_minor));
+ rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
}
- if (conn->c_version < RDS_PROTOCOL(3,1)) {
- printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
- " no longer supported\n",
- &conn->c_faddr,
- RDS_PROTOCOL_MAJOR(conn->c_version),
- RDS_PROTOCOL_MINOR(conn->c_version));
- rds_conn_destroy(conn);
- return;
- } else {
- printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
- &conn->c_faddr,
- RDS_PROTOCOL_MAJOR(conn->c_version),
- RDS_PROTOCOL_MINOR(conn->c_version),
- ic->i_flowctl ? ", flow control" : "");
- }
-
- /*
- * Init rings and fill recv. This needs to wait until protocol negotiation
- * is complete, since ring layout is different from 3.0 to 3.1.
- */
- rds_ib_send_init_ring(ic);
- rds_ib_recv_init_ring(ic);
- /* Post receive buffers - as a side effect, this will update
- * the posted credit count. */
- rds_ib_recv_refill(conn, 1);
+ printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
+ &conn->c_laddr,
+ RDS_PROTOCOL_MAJOR(conn->c_version),
+ RDS_PROTOCOL_MINOR(conn->c_version),
+ ic->i_flowctl ? ", flow control" : "");
/* Tune RNR behavior */
rds_ib_tune_rnr(ic, &qp_attr);
if (err)
printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
- /* update ib_device with this local ipaddr */
- err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr);
+ /* update ib_device with this local ipaddr & conn */
+ rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+ err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
if (err)
- printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
- err);
+ printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err);
+ rds_ib_add_conn(rds_ibdev, conn);
/* If the peer gave us the last packet it saw, process this as if
* we had received a regular ACK. */
- if (dp) {
- /* dp structure start is not guaranteed to be 8-byte aligned.
- * Since dp_ack_seq is 64-bit, extended load operations can be
- * used, so go through get_unaligned to avoid unaligned errors.
- */
- __be64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq);
-
- if (dp_ack_seq)
- rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq),
- NULL);
- }
+ if (dp && dp->dp_ack_seq)
+ rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
rds_connect_complete(conn);
}
static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
struct rdma_conn_param *conn_param,
struct rds_ib_connect_private *dp,
- u32 protocol_version,
- u32 max_responder_resources,
- u32 max_initiator_depth)
+ u32 protocol_version)
{
- struct rds_ib_connection *ic = conn->c_transport_data;
- struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
-
memset(conn_param, 0, sizeof(struct rdma_conn_param));
-
- conn_param->responder_resources =
- min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
- conn_param->initiator_depth =
- min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
- conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
+ /* XXX tune these? */
+ conn_param->responder_resources = 1;
+ conn_param->initiator_depth = 1;
+ conn_param->retry_count = 7;
conn_param->rnr_retry_count = 7;
if (dp) {
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
memset(dp, 0, sizeof(*dp));
dp->dp_saddr = conn->c_laddr;
dp->dp_daddr = conn->c_faddr;
static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
{
- rdsdebug("event %u (%s) data %p\n",
- event->event, rds_ib_event_str(event->event), data);
+ rdsdebug("event %u data %p\n", event->event, data);
}
static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
struct rds_connection *conn = data;
struct rds_ib_connection *ic = conn->c_transport_data;
- rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
- rds_ib_event_str(event->event));
+ rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
switch (event->event) {
case IB_EVENT_COMM_EST:
rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
break;
default:
- rdsdebug("Fatal QP Event %u (%s) "
- "- connection %pI4->%pI4, reconnecting\n",
- event->event, rds_ib_event_str(event->event),
- &conn->c_laddr, &conn->c_faddr);
- rds_conn_drop(conn);
+ printk(KERN_WARNING "RDS/ib: unhandled QP event %u "
+ "on connection to %pI4\n", event->event,
+ &conn->c_faddr);
break;
}
}
struct rds_ib_device *rds_ibdev;
int ret;
- /*
- * It's normal to see a null device if an incoming connection races
- * with device removal, so we don't print a warning.
+ /* rds_ib_add_one creates a rds_ib_device object per IB device,
+ * and allocates a protection domain, memory range and FMR pool
+ * for each. If that fails for any reason, it will not register
+ * the rds_ibdev at all.
*/
- rds_ibdev = rds_ib_get_client_data(dev);
- if (!rds_ibdev)
+ rds_ibdev = ib_get_client_data(dev, &rds_ib_client);
+ if (rds_ibdev == NULL) {
+ if (printk_ratelimit())
+ printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n",
+ dev->name);
return -EOPNOTSUPP;
-
- /* add the conn now so that connection establishment has the dev */
- rds_ib_add_conn(rds_ibdev, conn);
+ }
if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
ic->i_send_ring.w_nr *
sizeof(struct rds_header),
&ic->i_send_hdrs_dma, GFP_KERNEL);
- if (!ic->i_send_hdrs) {
+ if (ic->i_send_hdrs == NULL) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent send failed\n");
goto out;
ic->i_recv_ring.w_nr *
sizeof(struct rds_header),
&ic->i_recv_hdrs_dma, GFP_KERNEL);
- if (!ic->i_recv_hdrs) {
+ if (ic->i_recv_hdrs == NULL) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent recv failed\n");
goto out;
ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
&ic->i_ack_dma, GFP_KERNEL);
- if (!ic->i_ack) {
+ if (ic->i_ack == NULL) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent ack failed\n");
goto out;
}
- ic->i_sends = vzalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work),
- ibdev_to_node(dev));
- if (!ic->i_sends) {
+ ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
+ if (ic->i_sends == NULL) {
ret = -ENOMEM;
rdsdebug("send allocation failed\n");
goto out;
}
+ rds_ib_send_init_ring(ic);
- ic->i_recvs = vzalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work),
- ibdev_to_node(dev));
- if (!ic->i_recvs) {
+ ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
+ if (ic->i_recvs == NULL) {
ret = -ENOMEM;
rdsdebug("recv allocation failed\n");
goto out;
}
+ rds_ib_recv_init_ring(ic);
rds_ib_recv_init_ack(ic);
+ /* Post receive buffers - as a side effect, this will update
+ * the posted credit count. */
+ rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
+
rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
ic->i_send_cq, ic->i_recv_cq);
out:
- rds_ib_dev_put(rds_ibdev);
return ret;
}
-static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event)
+static u32 rds_ib_protocol_compatible(const struct rds_ib_connect_private *dp)
{
- const struct rds_ib_connect_private *dp = event->param.conn.private_data;
u16 common;
u32 version = 0;
- /*
- * rdma_cm private data is odd - when there is any private data in the
+ /* rdma_cm private data is odd - when there is any private data in the
 * request, we will be given a pretty large buffer without being told the
* original size. The only way to tell the difference is by looking at
* the contents, which are initialized to zero.
* If the protocol version fields aren't set, this is a connection attempt
 * from an older version. This could be 3.0 or 2.0 - we can't tell.
- * We really should have changed this for OFED 1.3 :-(
- */
-
- /* Be paranoid. RDS always has privdata */
- if (!event->param.conn.private_data_len) {
- printk(KERN_NOTICE "RDS incoming connection has no private data, "
- "rejecting\n");
- return 0;
- }
-
- /* Even if len is crap *now* I still want to check it. -ASG */
- if (event->param.conn.private_data_len < sizeof (*dp) ||
- dp->dp_protocol_major == 0)
+ * We really should have changed this for OFED 1.3 :-( */
+ if (dp->dp_protocol_major == 0)
return RDS_PROTOCOL_3_0;
common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS;
version = RDS_PROTOCOL_3_0;
while ((common >>= 1) != 0)
version++;
- } else
- printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n",
- &dp->dp_saddr,
- dp->dp_protocol_major,
- dp->dp_protocol_minor);
+ } else if (printk_ratelimit()) {
+ printk(KERN_NOTICE "RDS: Connection from %pI4 using "
+ "incompatible protocol version %u.%u\n",
+ &dp->dp_saddr,
+ dp->dp_protocol_major,
+ dp->dp_protocol_minor);
+ }
return version;
}
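The negotiation above reduces to finding the highest minor version bit shared by both peers: mask the peer's minor-version bitmask with ours, then count shifts. A self-contained sketch of that bit scan, where the base version encoding and mask values are assumptions chosen for illustration:

/* Illustrative sketch only - not part of the patch. Assumes a 3.x base
 * version and a 16-bit minor-version bitmask like dp_protocol_minor_mask. */
#include <stdint.h>
#include <stdio.h>

#define PROTO_3_0            0x0300u   /* assumed encoding of version 3.0 */
#define SUPPORTED_MINOR_MASK 0x0003u   /* assumed: we support minors 0 and 1 */

static unsigned int negotiate(uint16_t peer_minor_mask)
{
	uint16_t common = peer_minor_mask & SUPPORTED_MINOR_MASK;
	unsigned int version = PROTO_3_0;

	/* Highest common bit wins: each shift bumps the minor by one. */
	while ((common >>= 1) != 0)
		version++;
	return version;
}

int main(void)
{
	printf("negotiated 0x%x\n", negotiate(0x0003)); /* -> 0x0301, i.e. 3.1 */
	return 0;
}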
struct rds_ib_connection *ic = NULL;
struct rdma_conn_param conn_param;
u32 version;
- int err = 1, destroy = 1;
+ int err, destroy = 1;
/* Check whether the remote protocol version matches ours. */
- version = rds_ib_protocol_compatible(event);
+ version = rds_ib_protocol_compatible(dp);
if (!version)
goto out;
/* Wait and see - our connect may still be succeeding */
rds_ib_stats_inc(s_ib_connect_raced);
}
+ mutex_unlock(&conn->c_cm_lock);
goto out;
}
goto out;
}
- rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
- event->param.conn.responder_resources,
- event->param.conn.initiator_depth);
+ rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
/* rdma_accept() calls rdma_reject() internally if it fails */
err = rdma_accept(cm_id, &conn_param);
- if (err)
+ mutex_unlock(&conn->c_cm_lock);
+ if (err) {
rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
+ goto out;
+ }
+
+ return 0;
out:
- if (conn)
- mutex_unlock(&conn->c_cm_lock);
- if (err)
- rdma_reject(cm_id, NULL, 0);
+ rdma_reject(cm_id, NULL, 0);
return destroy;
}
goto out;
}
- rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
- UINT_MAX, UINT_MAX);
+ rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
+
ret = rdma_connect(cm_id, &conn_param);
if (ret)
rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
/* XXX I wonder what effect the port space has */
/* delegate cm event handler to rdma_transport */
ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
- RDMA_PS_TCP, IB_QPT_RC);
+ RDMA_PS_TCP);
if (IS_ERR(ic->i_cm_id)) {
ret = PTR_ERR(ic->i_cm_id);
ic->i_cm_id = NULL;
ic->i_cm_id, err);
}
- /*
- * We want to wait for tx and rx completion to finish
- * before we tear down the connection, but we have to be
- * careful not to get stuck waiting on a send ring that
- * only has unsignaled sends in it. We've shutdown new
- * sends before getting here so by waiting for signaled
- * sends to complete we're ensured that there will be no
- * more tx processing.
- */
wait_event(rds_ib_ring_empty_wait,
- rds_ib_ring_empty(&ic->i_recv_ring) &&
- (atomic_read(&ic->i_signaled_sends) == 0));
- tasklet_kill(&ic->i_recv_tasklet);
+ rds_ib_ring_empty(&ic->i_send_ring) &&
+ rds_ib_ring_empty(&ic->i_recv_ring));
if (ic->i_send_hdrs)
ib_dma_free_coherent(dev,
BUG_ON(ic->rds_ibdev);
/* Clear pending transmit */
- if (ic->i_data_op) {
- struct rds_message *rm;
-
- rm = container_of(ic->i_data_op, struct rds_message, data);
- rds_message_put(rm);
- ic->i_data_op = NULL;
+ if (ic->i_rm) {
+ rds_message_put(ic->i_rm);
+ ic->i_rm = NULL;
}
/* Clear the ACK state */
{
struct rds_ib_connection *ic;
unsigned long flags;
- int ret;
/* XXX too lazy? */
- ic = kzalloc(sizeof(struct rds_ib_connection), gfp);
- if (!ic)
+ ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL);
+ if (ic == NULL)
return -ENOMEM;
- ret = rds_ib_recv_alloc_caches(ic);
- if (ret) {
- kfree(ic);
- return ret;
- }
-
INIT_LIST_HEAD(&ic->ib_node);
- tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn,
- (unsigned long) ic);
mutex_init(&ic->i_recv_mutex);
#ifndef KERNEL_HAS_ATOMIC64
spin_lock_init(&ic->i_ack_lock);
#endif
- atomic_set(&ic->i_signaled_sends, 0);
/*
* rds_ib_conn_shutdown() waits for these to be emptied so they
list_del(&ic->ib_node);
spin_unlock_irq(lock_ptr);
- rds_ib_recv_free_caches(ic);
-
kfree(ic);
}
*
*/
#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/rculist.h>
-#include <linux/llist.h>
#include "rds.h"
+#include "rdma.h"
#include "ib.h"
-static DEFINE_PER_CPU(unsigned long, clean_list_grace);
-#define CLEAN_LIST_BUSY_BIT 0
/*
* This is stored as mr->r_trans_private.
struct rds_ib_device *device;
struct rds_ib_mr_pool *pool;
struct ib_fmr *fmr;
-
- struct llist_node llnode;
-
- /* unmap_list is for freeing */
- struct list_head unmap_list;
+ struct list_head list;
unsigned int remap_count;
struct scatterlist *sg;
*/
struct rds_ib_mr_pool {
struct mutex flush_lock; /* serialize fmr invalidate */
- struct delayed_work flush_worker; /* flush worker */
+ struct work_struct flush_worker; /* flush worker */
+ spinlock_t list_lock; /* protect variables below */
atomic_t item_count; /* total # of MRs */
atomic_t dirty_count; /* # dirty of MRs */
-
- struct llist_head drop_list; /* MRs that have reached their max_maps limit */
- struct llist_head free_list; /* unused MRs */
- struct llist_head clean_list; /* global unused & unamapped MRs */
- wait_queue_head_t flush_wait;
-
+ struct list_head drop_list; /* MRs that have reached their max_maps limit */
+ struct list_head free_list; /* unused MRs */
+ struct list_head clean_list; /* unused & unmapped MRs */
atomic_t free_pinned; /* memory pinned by free MRs */
unsigned long max_items;
unsigned long max_items_soft;
struct ib_fmr_attr fmr_attr;
};
-static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all);
static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
struct rds_ib_device *rds_ibdev;
struct rds_ib_ipaddr *i_ipaddr;
- rcu_read_lock();
- list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) {
- list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
+ list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
+ spin_lock_irq(&rds_ibdev->spinlock);
+ list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
if (i_ipaddr->ipaddr == ipaddr) {
- atomic_inc(&rds_ibdev->refcount);
- rcu_read_unlock();
+ spin_unlock_irq(&rds_ibdev->spinlock);
return rds_ibdev;
}
}
+ spin_unlock_irq(&rds_ibdev->spinlock);
}
- rcu_read_unlock();
return NULL;
}
i_ipaddr->ipaddr = ipaddr;
spin_lock_irq(&rds_ibdev->spinlock);
- list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
+ list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
spin_unlock_irq(&rds_ibdev->spinlock);
return 0;
static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
{
- struct rds_ib_ipaddr *i_ipaddr;
- struct rds_ib_ipaddr *to_free = NULL;
-
+ struct rds_ib_ipaddr *i_ipaddr, *next;
spin_lock_irq(&rds_ibdev->spinlock);
- list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
+ list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) {
if (i_ipaddr->ipaddr == ipaddr) {
- list_del_rcu(&i_ipaddr->list);
- to_free = i_ipaddr;
+ list_del(&i_ipaddr->list);
+ kfree(i_ipaddr);
break;
}
}
spin_unlock_irq(&rds_ibdev->spinlock);
-
- if (to_free) {
- synchronize_rcu();
- kfree(to_free);
- }
}
int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
struct rds_ib_device *rds_ibdev_old;
rds_ibdev_old = rds_ib_get_device(ipaddr);
- if (rds_ibdev_old) {
+ if (rds_ibdev_old)
rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
- rds_ib_dev_put(rds_ibdev_old);
- }
return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
}
BUG_ON(list_empty(&ic->ib_node));
list_del(&ic->ib_node);
- spin_lock(&rds_ibdev->spinlock);
+ spin_lock_irq(&rds_ibdev->spinlock);
list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
- spin_unlock(&rds_ibdev->spinlock);
+ spin_unlock_irq(&rds_ibdev->spinlock);
spin_unlock_irq(&ib_nodev_conns_lock);
ic->rds_ibdev = rds_ibdev;
- atomic_inc(&rds_ibdev->refcount);
}
void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
spin_unlock(&ib_nodev_conns_lock);
ic->rds_ibdev = NULL;
- rds_ib_dev_put(rds_ibdev);
}
-void rds_ib_destroy_nodev_conns(void)
+void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock)
{
struct rds_ib_connection *ic, *_ic;
LIST_HEAD(tmp_list);
/* avoid calling conn_destroy with irqs off */
- spin_lock_irq(&ib_nodev_conns_lock);
- list_splice(&ib_nodev_conns, &tmp_list);
- spin_unlock_irq(&ib_nodev_conns_lock);
-
- list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node)
+ spin_lock_irq(list_lock);
+ list_splice(list, &tmp_list);
+ INIT_LIST_HEAD(list);
+ spin_unlock_irq(list_lock);
+
+ list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) {
+ if (ic->conn->c_passive)
+ rds_conn_destroy(ic->conn->c_passive);
rds_conn_destroy(ic->conn);
+ }
}
struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
if (!pool)
return ERR_PTR(-ENOMEM);
- init_llist_head(&pool->free_list);
- init_llist_head(&pool->drop_list);
- init_llist_head(&pool->clean_list);
+ INIT_LIST_HEAD(&pool->free_list);
+ INIT_LIST_HEAD(&pool->drop_list);
+ INIT_LIST_HEAD(&pool->clean_list);
mutex_init(&pool->flush_lock);
- init_waitqueue_head(&pool->flush_wait);
- INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
+ spin_lock_init(&pool->list_lock);
+ INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
pool->fmr_attr.max_pages = fmr_message_size;
pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
- pool->fmr_attr.page_shift = PAGE_SHIFT;
+ pool->fmr_attr.page_shift = rds_ibdev->fmr_page_shift;
pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4;
/* We never allow more than max_items MRs to be allocated.
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
{
- cancel_delayed_work_sync(&pool->flush_worker);
- rds_ib_flush_mr_pool(pool, 1, NULL);
- WARN_ON(atomic_read(&pool->item_count));
- WARN_ON(atomic_read(&pool->free_pinned));
+ flush_workqueue(rds_wq);
+ rds_ib_flush_mr_pool(pool, 1);
+ BUG_ON(atomic_read(&pool->item_count));
+ BUG_ON(atomic_read(&pool->free_pinned));
kfree(pool);
}
static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
{
struct rds_ib_mr *ibmr = NULL;
- struct llist_node *ret;
- unsigned long *flag;
+ unsigned long flags;
- preempt_disable();
- flag = this_cpu_ptr(&clean_list_grace);
- set_bit(CLEAN_LIST_BUSY_BIT, flag);
- ret = llist_del_first(&pool->clean_list);
- if (ret)
- ibmr = llist_entry(ret, struct rds_ib_mr, llnode);
+ spin_lock_irqsave(&pool->list_lock, flags);
+ if (!list_empty(&pool->clean_list)) {
+ ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list);
+ list_del_init(&ibmr->list);
+ }
+ spin_unlock_irqrestore(&pool->list_lock, flags);
- clear_bit(CLEAN_LIST_BUSY_BIT, flag);
- preempt_enable();
return ibmr;
}
-static inline void wait_clean_list_grace(void)
-{
- int cpu;
- unsigned long *flag;
-
- for_each_online_cpu(cpu) {
- flag = &per_cpu(clean_list_grace, cpu);
- while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
- cpu_relax();
- }
-}
-
static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
{
struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
struct rds_ib_mr *ibmr = NULL;
int err = 0, iter = 0;
- if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
- schedule_delayed_work(&pool->flush_worker, 10);
-
while (1) {
ibmr = rds_ib_reuse_fmr(pool);
if (ibmr)
/* We do have some empty MRs. Flush them out. */
rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
- rds_ib_flush_mr_pool(pool, 0, &ibmr);
- if (ibmr)
- return ibmr;
+ rds_ib_flush_mr_pool(pool, 0);
}
- ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev));
+ ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
if (!ibmr) {
err = -ENOMEM;
goto out_no_cigar;
}
- memset(ibmr, 0, sizeof(*ibmr));
-
ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
(IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_READ |
- IB_ACCESS_REMOTE_WRITE|
- IB_ACCESS_REMOTE_ATOMIC),
+ IB_ACCESS_REMOTE_WRITE),
&pool->fmr_attr);
if (IS_ERR(ibmr->fmr)) {
err = PTR_ERR(ibmr->fmr);
unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
- if (dma_addr & ~PAGE_MASK) {
+ if (dma_addr & ~rds_ibdev->fmr_page_mask) {
if (i > 0)
return -EINVAL;
else
++page_cnt;
}
- if ((dma_addr + dma_len) & ~PAGE_MASK) {
+ if ((dma_addr + dma_len) & ~rds_ibdev->fmr_page_mask) {
if (i < sg_dma_len - 1)
return -EINVAL;
else
len += dma_len;
}
- page_cnt += len >> PAGE_SHIFT;
+ page_cnt += len >> rds_ibdev->fmr_page_shift;
if (page_cnt > fmr_message_size)
return -EINVAL;
- dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
- rdsibdev_to_node(rds_ibdev));
+ dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC);
if (!dma_pages)
return -ENOMEM;
unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
- for (j = 0; j < dma_len; j += PAGE_SIZE)
+ for (j = 0; j < dma_len; j += rds_ibdev->fmr_page_size)
dma_pages[page_cnt++] =
- (dma_addr & PAGE_MASK) + j;
+ (dma_addr & rds_ibdev->fmr_page_mask) + j;
}
ret = ib_map_phys_fmr(ibmr->fmr,
/* FIXME we need a way to tell a r/w MR
* from a r/o MR */
- BUG_ON(irqs_disabled());
set_page_dirty(page);
put_page(page);
}
return 0;
}
-/*
- * given an llist of mrs, put them all into the list_head for more processing
- */
-static void llist_append_to_list(struct llist_head *llist, struct list_head *list)
-{
- struct rds_ib_mr *ibmr;
- struct llist_node *node;
- struct llist_node *next;
-
- node = llist_del_all(llist);
- while (node) {
- next = node->next;
- ibmr = llist_entry(node, struct rds_ib_mr, llnode);
- list_add_tail(&ibmr->unmap_list, list);
- node = next;
- }
-}
-
-/*
- * this takes a list head of mrs and turns it into linked llist nodes
- * of clusters. Each cluster has linked llist nodes of
- * MR_CLUSTER_SIZE mrs that are ready for reuse.
- */
-static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
- struct list_head *list,
- struct llist_node **nodes_head,
- struct llist_node **nodes_tail)
-{
- struct rds_ib_mr *ibmr;
- struct llist_node *cur = NULL;
- struct llist_node **next = nodes_head;
-
- list_for_each_entry(ibmr, list, unmap_list) {
- cur = &ibmr->llnode;
- *next = cur;
- next = &cur->next;
- }
- *next = NULL;
- *nodes_tail = cur;
-}
-
/*
* Flush our pool of MRs.
* At a minimum, all currently unused MRs are unmapped.
* If the number of MRs allocated exceeds the limit, we also try
* to free as many MRs as needed to get back to this limit.
*/
-static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
- int free_all, struct rds_ib_mr **ibmr_ret)
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
{
struct rds_ib_mr *ibmr, *next;
- struct llist_node *clean_nodes;
- struct llist_node *clean_tail;
LIST_HEAD(unmap_list);
LIST_HEAD(fmr_list);
unsigned long unpinned = 0;
+ unsigned long flags;
unsigned int nfreed = 0, ncleaned = 0, free_goal;
int ret = 0;
rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);
- if (ibmr_ret) {
- DEFINE_WAIT(wait);
- while(!mutex_trylock(&pool->flush_lock)) {
- ibmr = rds_ib_reuse_fmr(pool);
- if (ibmr) {
- *ibmr_ret = ibmr;
- finish_wait(&pool->flush_wait, &wait);
- goto out_nolock;
- }
-
- prepare_to_wait(&pool->flush_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- if (llist_empty(&pool->clean_list))
- schedule();
-
- ibmr = rds_ib_reuse_fmr(pool);
- if (ibmr) {
- *ibmr_ret = ibmr;
- finish_wait(&pool->flush_wait, &wait);
- goto out_nolock;
- }
- }
- finish_wait(&pool->flush_wait, &wait);
- } else
- mutex_lock(&pool->flush_lock);
-
- if (ibmr_ret) {
- ibmr = rds_ib_reuse_fmr(pool);
- if (ibmr) {
- *ibmr_ret = ibmr;
- goto out;
- }
- }
+ mutex_lock(&pool->flush_lock);
+ spin_lock_irqsave(&pool->list_lock, flags);
/* Get the list of all MRs to be dropped. Ordering matters -
- * we want to put drop_list ahead of free_list.
- */
- llist_append_to_list(&pool->drop_list, &unmap_list);
- llist_append_to_list(&pool->free_list, &unmap_list);
+ * we want to put drop_list ahead of free_list. */
+ list_splice_init(&pool->free_list, &unmap_list);
+ list_splice_init(&pool->drop_list, &unmap_list);
if (free_all)
- llist_append_to_list(&pool->clean_list, &unmap_list);
+ list_splice_init(&pool->clean_list, &unmap_list);
+ spin_unlock_irqrestore(&pool->list_lock, flags);
free_goal = rds_ib_flush_goal(pool, free_all);
goto out;
/* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
- list_for_each_entry(ibmr, &unmap_list, unmap_list)
+ list_for_each_entry(ibmr, &unmap_list, list)
list_add(&ibmr->fmr->list, &fmr_list);
-
ret = ib_unmap_fmr(&fmr_list);
if (ret)
printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
/* Now we can destroy the DMA mapping and unpin any pages */
- list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
+ list_for_each_entry_safe(ibmr, next, &unmap_list, list) {
unpinned += ibmr->sg_len;
__rds_ib_teardown_mr(ibmr);
if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
rds_ib_stats_inc(s_ib_rdma_mr_free);
- list_del(&ibmr->unmap_list);
+ list_del(&ibmr->list);
ib_dealloc_fmr(ibmr->fmr);
kfree(ibmr);
nfreed++;
ncleaned++;
}
- if (!list_empty(&unmap_list)) {
- /* we have to make sure that none of the things we're about
- * to put on the clean list would race with other cpus trying
- * to pull items off. The llist would explode if we managed to
- * remove something from the clean list and then add it back again
- * while another CPU was spinning on that same item in llist_del_first.
- *
- * This is pretty unlikely, but just in case wait for an llist grace period
- * here before adding anything back into the clean list.
- */
- wait_clean_list_grace();
-
- list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail);
- if (ibmr_ret)
- *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
-
- /* more than one entry in llist nodes */
- if (clean_nodes->next)
- llist_add_batch(clean_nodes->next, clean_tail, &pool->clean_list);
-
- }
+ spin_lock_irqsave(&pool->list_lock, flags);
+ list_splice(&unmap_list, &pool->clean_list);
+ spin_unlock_irqrestore(&pool->list_lock, flags);
atomic_sub(unpinned, &pool->free_pinned);
atomic_sub(ncleaned, &pool->dirty_count);
out:
mutex_unlock(&pool->flush_lock);
- if (waitqueue_active(&pool->flush_wait))
- wake_up(&pool->flush_wait);
-out_nolock:
return ret;
}
static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
{
- struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work);
+ struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker);
- rds_ib_flush_mr_pool(pool, 0, NULL);
+ rds_ib_flush_mr_pool(pool, 0);
}
void rds_ib_free_mr(void *trans_private, int invalidate)
struct rds_ib_mr *ibmr = trans_private;
struct rds_ib_device *rds_ibdev = ibmr->device;
struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+ unsigned long flags;
rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
/* Return it to the pool's free list */
+ spin_lock_irqsave(&pool->list_lock, flags);
if (ibmr->remap_count >= pool->fmr_attr.max_maps)
- llist_add(&ibmr->llnode, &pool->drop_list);
+ list_add(&ibmr->list, &pool->drop_list);
else
- llist_add(&ibmr->llnode, &pool->free_list);
+ list_add(&ibmr->list, &pool->free_list);
atomic_add(ibmr->sg_len, &pool->free_pinned);
atomic_inc(&pool->dirty_count);
+ spin_unlock_irqrestore(&pool->list_lock, flags);
/* If we've pinned too many pages, request a flush */
- if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
- atomic_read(&pool->dirty_count) >= pool->max_items / 10)
- schedule_delayed_work(&pool->flush_worker, 10);
+ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned
+ || atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+ queue_work(rds_wq, &pool->flush_worker);
if (invalidate) {
if (likely(!in_interrupt())) {
- rds_ib_flush_mr_pool(pool, 0, NULL);
+ rds_ib_flush_mr_pool(pool, 0);
} else {
/* We get here if the user created a MR marked
* as use_once and invalidate at the same time. */
- schedule_delayed_work(&pool->flush_worker, 10);
+ queue_work(rds_wq, &pool->flush_worker);
}
}
-
- rds_ib_dev_put(rds_ibdev);
}
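The trigger condition used above can be read as a small predicate; a standalone sketch with the names simplified but the thresholds exactly as in the code:

/* Sketch of the flush heuristic in rds_ib_free_mr() above: kick the flush
 * worker once too many pages are pinned by free MRs, or once more than a
 * tenth of the pool is dirty. Illustrative only. */
static int pool_needs_flush(unsigned long free_pinned,
			    unsigned long max_free_pinned,
			    unsigned long dirty_count,
			    unsigned long max_items)
{
	return free_pinned >= max_free_pinned ||
	       dirty_count >= max_items / 10;
}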
void rds_ib_flush_mrs(void)
{
struct rds_ib_device *rds_ibdev;
- down_read(&rds_ib_devices_lock);
list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
if (pool)
- rds_ib_flush_mr_pool(pool, 0, NULL);
+ rds_ib_flush_mr_pool(pool, 0);
}
- up_read(&rds_ib_devices_lock);
}
void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
ibmr->device = rds_ibdev;
- rds_ibdev = NULL;
out:
if (ret) {
rds_ib_free_mr(ibmr, 0);
ibmr = ERR_PTR(ret);
}
- if (rds_ibdev)
- rds_ib_dev_put(rds_ibdev);
return ibmr;
}
-
*
*/
#include <linux/kernel.h>
-#include <linux/slab.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <rdma/rdma_cm.h>
static struct kmem_cache *rds_ib_frag_slab;
static atomic_t rds_ib_allocation = ATOMIC_INIT(0);
+static void rds_ib_frag_drop_page(struct rds_page_frag *frag)
+{
+ rdsdebug("frag %p page %p\n", frag, frag->f_page);
+ __free_page(frag->f_page);
+ frag->f_page = NULL;
+}
+
+static void rds_ib_frag_free(struct rds_page_frag *frag)
+{
+ rdsdebug("frag %p page %p\n", frag, frag->f_page);
+ BUG_ON(frag->f_page != NULL);
+ kmem_cache_free(rds_ib_frag_slab, frag);
+}
+
+/*
+ * We map a page at a time. Its fragments are posted in order. This
+ * is called in fragment order as the fragments get receive completion events.
+ * Only the last frag in the page performs the unmapping.
+ *
+ * It's OK for ring cleanup to call this in whatever order it likes because
+ * DMA is not in flight and so we can unmap while other ring entries still
+ * hold page references in their frags.
+ */
+static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
+ struct rds_ib_recv_work *recv)
+{
+ struct rds_page_frag *frag = recv->r_frag;
+
+ rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
+ if (frag->f_mapped)
+ ib_dma_unmap_page(ic->i_cm_id->device,
+ frag->f_mapped,
+ RDS_FRAG_SIZE, DMA_FROM_DEVICE);
+ frag->f_mapped = 0;
+}
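The comment above describes one receive page being shared by several fragments; the per-connection i_frag offset walks the page in RDS_FRAG_SIZE steps and the page reference is dropped when the last fragment has been handed out (the real work is done in rds_ib_recv_refill_one() later in this patch). A standalone illustration, with the page and fragment sizes assumed purely for the example:

/* Illustrative only: fragment offsets within one receive page.
 * EX_PAGE_SIZE and EX_FRAG_SIZE are assumed values for the example. */
#include <stdio.h>

#define EX_PAGE_SIZE      16384u  /* assume a 16K page */
#define EX_FRAG_SIZE       4096u  /* assume 4K fragments */
#define EX_PAGE_LAST_OFF  (EX_PAGE_SIZE - EX_FRAG_SIZE)

int main(void)
{
	unsigned int off = 0;

	for (;;) {
		printf("post frag at offset %u\n", off);
		if (off < EX_PAGE_LAST_OFF) {
			off += EX_FRAG_SIZE;   /* next frag shares the page */
		} else {
			printf("last frag - drop the page reference\n");
			break;                 /* a fresh page is allocated next time */
		}
	}
	return 0;
}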
+
void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
{
struct rds_ib_recv_work *recv;
recv->r_wr.sg_list = recv->r_sge;
recv->r_wr.num_sge = RDS_IB_RECV_SGE;
- sge = &recv->r_sge[0];
- sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
- sge->length = sizeof(struct rds_header);
- sge->lkey = ic->i_mr->lkey;
-
- sge = &recv->r_sge[1];
+ sge = rds_ib_data_sge(ic, recv->r_sge);
sge->addr = 0;
sge->length = RDS_FRAG_SIZE;
sge->lkey = ic->i_mr->lkey;
- }
-}
-
-/*
- * The entire 'from' list, including the from element itself, is put on
- * to the tail of the 'to' list.
- */
-static void list_splice_entire_tail(struct list_head *from,
- struct list_head *to)
-{
- struct list_head *from_last = from->prev;
-
- list_splice_tail(from_last, to);
- list_add_tail(from_last, to);
-}
-
-static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
-{
- struct list_head *tmp;
-
- tmp = xchg(&cache->xfer, NULL);
- if (tmp) {
- if (cache->ready)
- list_splice_entire_tail(tmp, cache->ready);
- else
- cache->ready = tmp;
- }
-}
-
-static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
-{
- struct rds_ib_cache_head *head;
- int cpu;
-
- cache->percpu = alloc_percpu(struct rds_ib_cache_head);
- if (!cache->percpu)
- return -ENOMEM;
-
- for_each_possible_cpu(cpu) {
- head = per_cpu_ptr(cache->percpu, cpu);
- head->first = NULL;
- head->count = 0;
- }
- cache->xfer = NULL;
- cache->ready = NULL;
-
- return 0;
-}
-
-int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic)
-{
- int ret;
-
- ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs);
- if (!ret) {
- ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags);
- if (ret)
- free_percpu(ic->i_cache_incs.percpu);
- }
-
- return ret;
-}
-static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
- struct list_head *caller_list)
-{
- struct rds_ib_cache_head *head;
- int cpu;
-
- for_each_possible_cpu(cpu) {
- head = per_cpu_ptr(cache->percpu, cpu);
- if (head->first) {
- list_splice_entire_tail(head->first, caller_list);
- head->first = NULL;
- }
- }
-
- if (cache->ready) {
- list_splice_entire_tail(cache->ready, caller_list);
- cache->ready = NULL;
- }
-}
-
-void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
-{
- struct rds_ib_incoming *inc;
- struct rds_ib_incoming *inc_tmp;
- struct rds_page_frag *frag;
- struct rds_page_frag *frag_tmp;
- LIST_HEAD(list);
-
- rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
- rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list);
- free_percpu(ic->i_cache_incs.percpu);
-
- list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) {
- list_del(&inc->ii_cache_entry);
- WARN_ON(!list_empty(&inc->ii_frags));
- kmem_cache_free(rds_ib_incoming_slab, inc);
- }
-
- rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
- rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list);
- free_percpu(ic->i_cache_frags.percpu);
-
- list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
- list_del(&frag->f_cache_entry);
- WARN_ON(!list_empty(&frag->f_item));
- kmem_cache_free(rds_ib_frag_slab, frag);
- }
-}
-
-/* fwd decl */
-static void rds_ib_recv_cache_put(struct list_head *new_item,
- struct rds_ib_refill_cache *cache);
-static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache);
-
-
-/* Recycle frag and attached recv buffer f_sg */
-static void rds_ib_frag_free(struct rds_ib_connection *ic,
- struct rds_page_frag *frag)
-{
- rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
-
- rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
-}
-
-/* Recycle inc after freeing attached frags */
-void rds_ib_inc_free(struct rds_incoming *inc)
-{
- struct rds_ib_incoming *ibinc;
- struct rds_page_frag *frag;
- struct rds_page_frag *pos;
- struct rds_ib_connection *ic = inc->i_conn->c_transport_data;
-
- ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
-
- /* Free attached frags */
- list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
- list_del_init(&frag->f_item);
- rds_ib_frag_free(ic, frag);
+ sge = rds_ib_header_sge(ic, recv->r_sge);
+ sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = ic->i_mr->lkey;
}
- BUG_ON(!list_empty(&ibinc->ii_frags));
-
- rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
- rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs);
}
static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
recv->r_ibinc = NULL;
}
if (recv->r_frag) {
- ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
- rds_ib_frag_free(ic, recv->r_frag);
+ rds_ib_recv_unmap_page(ic, recv);
+ if (recv->r_frag->f_page)
+ rds_ib_frag_drop_page(recv->r_frag);
+ rds_ib_frag_free(recv->r_frag);
recv->r_frag = NULL;
}
}
for (i = 0; i < ic->i_recv_ring.w_nr; i++)
rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
-}
-static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic,
- gfp_t slab_mask)
-{
- struct rds_ib_incoming *ibinc;
- struct list_head *cache_item;
- int avail_allocs;
-
- cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs);
- if (cache_item) {
- ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry);
- } else {
- avail_allocs = atomic_add_unless(&rds_ib_allocation,
- 1, rds_ib_sysctl_max_recv_allocation);
- if (!avail_allocs) {
- rds_ib_stats_inc(s_ib_rx_alloc_limit);
- return NULL;
- }
- ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask);
- if (!ibinc) {
- atomic_dec(&rds_ib_allocation);
- return NULL;
- }
- }
- INIT_LIST_HEAD(&ibinc->ii_frags);
- rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
-
- return ibinc;
-}
-
-static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic,
- gfp_t slab_mask, gfp_t page_mask)
-{
- struct rds_page_frag *frag;
- struct list_head *cache_item;
- int ret;
-
- cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
- if (cache_item) {
- frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
- } else {
- frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
- if (!frag)
- return NULL;
-
- sg_init_table(&frag->f_sg, 1);
- ret = rds_page_remainder_alloc(&frag->f_sg,
- RDS_FRAG_SIZE, page_mask);
- if (ret) {
- kmem_cache_free(rds_ib_frag_slab, frag);
- return NULL;
- }
- }
-
- INIT_LIST_HEAD(&frag->f_item);
-
- return frag;
+ if (ic->i_frag.f_page)
+ rds_ib_frag_drop_page(&ic->i_frag);
}
static int rds_ib_recv_refill_one(struct rds_connection *conn,
- struct rds_ib_recv_work *recv, int prefill)
+ struct rds_ib_recv_work *recv,
+ gfp_t kptr_gfp, gfp_t page_gfp)
{
struct rds_ib_connection *ic = conn->c_transport_data;
+ dma_addr_t dma_addr;
struct ib_sge *sge;
int ret = -ENOMEM;
- gfp_t slab_mask = GFP_NOWAIT;
- gfp_t page_mask = GFP_NOWAIT;
- if (prefill) {
- slab_mask = GFP_KERNEL;
- page_mask = GFP_HIGHUSER;
+ if (recv->r_ibinc == NULL) {
+ if (atomic_read(&rds_ib_allocation) >= rds_ib_sysctl_max_recv_allocation) {
+ rds_ib_stats_inc(s_ib_rx_alloc_limit);
+ goto out;
+ }
+ recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab,
+ kptr_gfp);
+ if (recv->r_ibinc == NULL)
+ goto out;
+ atomic_inc(&rds_ib_allocation);
+ INIT_LIST_HEAD(&recv->r_ibinc->ii_frags);
+ rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
}
- if (!ic->i_cache_incs.ready)
- rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
- if (!ic->i_cache_frags.ready)
- rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
+ if (recv->r_frag == NULL) {
+ recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp);
+ if (recv->r_frag == NULL)
+ goto out;
+ INIT_LIST_HEAD(&recv->r_frag->f_item);
+ recv->r_frag->f_page = NULL;
+ }
- /*
- * ibinc was taken from recv if recv contained the start of a message.
- * recvs that were continuations will still have this allocated.
- */
- if (!recv->r_ibinc) {
- recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask);
- if (!recv->r_ibinc)
+ if (ic->i_frag.f_page == NULL) {
+ ic->i_frag.f_page = alloc_page(page_gfp);
+ if (ic->i_frag.f_page == NULL)
goto out;
+ ic->i_frag.f_offset = 0;
}
- WARN_ON(recv->r_frag); /* leak! */
- recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask);
- if (!recv->r_frag)
+ dma_addr = ib_dma_map_page(ic->i_cm_id->device,
+ ic->i_frag.f_page,
+ ic->i_frag.f_offset,
+ RDS_FRAG_SIZE,
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
goto out;
- ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
- 1, DMA_FROM_DEVICE);
- WARN_ON(ret != 1);
+ /*
+ * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_recv_unmap_page()
+ * must be called on this recv. This happens as completions hit
+ * in order or on connection shutdown.
+ */
+ recv->r_frag->f_page = ic->i_frag.f_page;
+ recv->r_frag->f_offset = ic->i_frag.f_offset;
+ recv->r_frag->f_mapped = dma_addr;
- sge = &recv->r_sge[0];
+ sge = rds_ib_data_sge(ic, recv->r_sge);
+ sge->addr = dma_addr;
+ sge->length = RDS_FRAG_SIZE;
+
+ sge = rds_ib_header_sge(ic, recv->r_sge);
sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
sge->length = sizeof(struct rds_header);
- sge = &recv->r_sge[1];
- sge->addr = ib_sg_dma_address(ic->i_cm_id->device, &recv->r_frag->f_sg);
- sge->length = ib_sg_dma_len(ic->i_cm_id->device, &recv->r_frag->f_sg);
+ get_page(recv->r_frag->f_page);
+
+ if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
+ ic->i_frag.f_offset += RDS_FRAG_SIZE;
+ } else {
+ put_page(ic->i_frag.f_page);
+ ic->i_frag.f_page = NULL;
+ ic->i_frag.f_offset = 0;
+ }
ret = 0;
out:
/*
* This tries to allocate and post unused work requests after making sure that
* they have all the allocations they need to queue received fragments into
- * sockets.
+ * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
+ * pairs don't go unmatched.
*
* -1 is returned if posting fails due to temporary resource exhaustion.
*/
-void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
+int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
+ gfp_t page_gfp, int prefill)
{
struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_ib_recv_work *recv;
int ret = 0;
u32 pos;
- while ((prefill || rds_conn_up(conn)) &&
- rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
+ while ((prefill || rds_conn_up(conn))
+ && rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
if (pos >= ic->i_recv_ring.w_nr) {
printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
pos);
+ ret = -EINVAL;
break;
}
recv = &ic->i_recvs[pos];
- ret = rds_ib_recv_refill_one(conn, recv, prefill);
+ ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
if (ret) {
+ ret = -1;
break;
}
/* XXX when can this fail? */
ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
- recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
- (long) ib_sg_dma_address(
- ic->i_cm_id->device,
- &recv->r_frag->f_sg),
- ret);
+ recv->r_ibinc, recv->r_frag->f_page,
+ (long) recv->r_frag->f_mapped, ret);
if (ret) {
rds_ib_conn_error(conn, "recv post on "
"%pI4 returned %d, disconnecting and "
"reconnecting\n", &conn->c_faddr,
ret);
+ ret = -1;
break;
}
if (ret)
rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
+ return ret;
}
-/*
- * We want to recycle several types of recv allocations, like incs and frags.
- * To use this, the *_free() function passes in the ptr to a list_head within
- * the recyclee, as well as the cache to put it on.
- *
- * First, we put the memory on a percpu list. When this reaches a certain size,
- * We move it to an intermediate non-percpu list in a lockless manner, with some
- * xchg/compxchg wizardry.
- *
- * N.B. Instead of a list_head as the anchor, we use a single pointer, which can
- * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and
- * list_empty() will return true with one element is actually present.
- */
-static void rds_ib_recv_cache_put(struct list_head *new_item,
- struct rds_ib_refill_cache *cache)
+void rds_ib_inc_purge(struct rds_incoming *inc)
{
- unsigned long flags;
- struct list_head *old, *chpfirst;
-
- local_irq_save(flags);
-
- chpfirst = __this_cpu_read(cache->percpu->first);
- if (!chpfirst)
- INIT_LIST_HEAD(new_item);
- else /* put on front */
- list_add_tail(new_item, chpfirst);
-
- __this_cpu_write(cache->percpu->first, new_item);
- __this_cpu_inc(cache->percpu->count);
+ struct rds_ib_incoming *ibinc;
+ struct rds_page_frag *frag;
+ struct rds_page_frag *pos;
- if (__this_cpu_read(cache->percpu->count) < RDS_IB_RECYCLE_BATCH_COUNT)
- goto end;
+ ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+ rdsdebug("purging ibinc %p inc %p\n", ibinc, inc);
- /*
- * Return our per-cpu first list to the cache's xfer by atomically
- * grabbing the current xfer list, appending it to our per-cpu list,
- * and then atomically returning that entire list back to the
- * cache's xfer list as long as it's still empty.
- */
- do {
- old = xchg(&cache->xfer, NULL);
- if (old)
- list_splice_entire_tail(old, chpfirst);
- old = cmpxchg(&cache->xfer, NULL, chpfirst);
- } while (old);
-
-
- __this_cpu_write(cache->percpu->first, NULL);
- __this_cpu_write(cache->percpu->count, 0);
-end:
- local_irq_restore(flags);
+ list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
+ list_del_init(&frag->f_item);
+ rds_ib_frag_drop_page(frag);
+ rds_ib_frag_free(frag);
+ }
}
-static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache)
+void rds_ib_inc_free(struct rds_incoming *inc)
{
- struct list_head *head = cache->ready;
-
- if (head) {
- if (!list_empty(head)) {
- cache->ready = head->next;
- list_del_init(head);
- } else
- cache->ready = NULL;
- }
+ struct rds_ib_incoming *ibinc;
+
+ ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
- return head;
+ rds_ib_inc_purge(inc);
+ rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
+ BUG_ON(!list_empty(&ibinc->ii_frags));
+ kmem_cache_free(rds_ib_incoming_slab, ibinc);
+ atomic_dec(&rds_ib_allocation);
+ BUG_ON(atomic_read(&rds_ib_allocation) < 0);
}
-int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
+int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
+ size_t size)
{
struct rds_ib_incoming *ibinc;
struct rds_page_frag *frag;
+ struct iovec *iov = first_iov;
unsigned long to_copy;
unsigned long frag_off = 0;
+ unsigned long iov_off = 0;
int copied = 0;
int ret;
u32 len;
frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
len = be32_to_cpu(inc->i_hdr.h_len);
- while (iov_iter_count(to) && copied < len) {
+ while (copied < size && copied < len) {
if (frag_off == RDS_FRAG_SIZE) {
frag = list_entry(frag->f_item.next,
struct rds_page_frag, f_item);
frag_off = 0;
}
- to_copy = min_t(unsigned long, iov_iter_count(to),
- RDS_FRAG_SIZE - frag_off);
+ while (iov_off == iov->iov_len) {
+ iov_off = 0;
+ iov++;
+ }
+
+ to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
+ to_copy = min_t(size_t, to_copy, size - copied);
to_copy = min_t(unsigned long, to_copy, len - copied);
+ rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
+ "[%p, %lu] + %lu\n",
+ to_copy, iov->iov_base, iov->iov_len, iov_off,
+ frag->f_page, frag->f_offset, frag_off);
+
/* XXX needs + offset for multiple recvs per page */
- rds_stats_add(s_copy_to_user, to_copy);
- ret = copy_page_to_iter(sg_page(&frag->f_sg),
- frag->f_sg.offset + frag_off,
- to_copy,
- to);
- if (ret != to_copy)
- return -EFAULT;
+ ret = rds_page_copy_to_user(frag->f_page,
+ frag->f_offset + frag_off,
+ iov->iov_base + iov_off,
+ to_copy);
+ if (ret) {
+ copied = ret;
+ break;
+ }
+ iov_off += to_copy;
frag_off += to_copy;
copied += to_copy;
}
{
atomic64_set(&ic->i_ack_next, seq);
if (ack_required) {
- smp_mb__before_atomic();
+ smp_mb__before_clear_bit();
set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
}
}
static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
{
clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
- smp_mb__after_atomic();
+ smp_mb__after_clear_bit();
return atomic64_read(&ic->i_ack_next);
}
set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
rds_ib_stats_inc(s_ib_ack_send_failure);
-
- rds_ib_conn_error(ic->conn, "sending ack failed\n");
+ /* Need to finesse this later. */
+ BUG();
} else
rds_ib_stats_inc(s_ib_ack_sent);
}
}
/* Can we get a send credit? */
- if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
+ if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0)) {
rds_ib_stats_inc(s_ib_tx_throttle);
clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
return;
to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
- addr = kmap_atomic(sg_page(&frag->f_sg));
+ addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
src = addr + frag_off;
dst = (void *)map->m_page_addrs[map_page] + map_off;
uncongested |= ~(*src) & *dst;
*dst++ = *src++;
}
- kunmap_atomic(addr);
+ kunmap_atomic(addr, KM_SOFTIRQ0);
copied += to_copy;
};
static void rds_ib_process_recv(struct rds_connection *conn,
- struct rds_ib_recv_work *recv, u32 data_len,
+ struct rds_ib_recv_work *recv, u32 byte_len,
struct rds_ib_ack_state *state)
{
struct rds_ib_connection *ic = conn->c_transport_data;
/* XXX shut down the connection if port 0,0 are seen? */
rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv,
- data_len);
+ byte_len);
- if (data_len < sizeof(struct rds_header)) {
+ if (byte_len < sizeof(struct rds_header)) {
rds_ib_conn_error(conn, "incoming message "
- "from %pI4 didn't include a "
+ "from %pI4 didn't inclue a "
"header, disconnecting and "
"reconnecting\n",
&conn->c_faddr);
return;
}
- data_len -= sizeof(struct rds_header);
+ byte_len -= sizeof(struct rds_header);
ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
if (ihdr->h_credit)
rds_ib_send_add_credits(conn, ihdr->h_credit);
- if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
+ if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
/* This is an ACK-only packet. The fact that it gets
* special treatment here is that historically, ACKs
* were rather special beasts.
* the inc is freed. We don't go that route, so we have to drop the
* page ref ourselves. We can't just leave the page on the recv
* because that confuses the dma mapping of pages and each recv's use
- * of a partial page.
+ * of a partial page. We can leave the frag, though; it will be
+ * reused.
*
* FIXME: Fold this into the code path below.
*/
- rds_ib_frag_free(ic, recv->r_frag);
- recv->r_frag = NULL;
+ rds_ib_frag_drop_page(recv->r_frag);
return;
}
* into the inc and save the inc so we can hang upcoming fragments
* off its list.
*/
- if (!ibinc) {
+ if (ibinc == NULL) {
ibinc = recv->r_ibinc;
recv->r_ibinc = NULL;
ic->i_ibinc = ibinc;
hdr = &ibinc->ii_inc.i_hdr;
/* We can't just use memcmp here; fragments of a
* single message may carry different ACKs */
- if (hdr->h_sequence != ihdr->h_sequence ||
- hdr->h_len != ihdr->h_len ||
- hdr->h_sport != ihdr->h_sport ||
- hdr->h_dport != ihdr->h_dport) {
+ if (hdr->h_sequence != ihdr->h_sequence
+ || hdr->h_len != ihdr->h_len
+ || hdr->h_sport != ihdr->h_sport
+ || hdr->h_dport != ihdr->h_dport) {
rds_ib_conn_error(conn,
"fragment header mismatch; forcing reconnect\n");
return;
rds_ib_cong_recv(conn, ibinc);
else {
rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
- &ibinc->ii_inc, GFP_ATOMIC);
+ &ibinc->ii_inc, GFP_ATOMIC,
+ KM_SOFTIRQ0);
state->ack_next = be64_to_cpu(hdr->h_sequence);
state->ack_next_valid = 1;
}
{
struct rds_connection *conn = context;
struct rds_ib_connection *ic = conn->c_transport_data;
+ struct ib_wc wc;
+ struct rds_ib_ack_state state = { 0, };
+ struct rds_ib_recv_work *recv;
rdsdebug("conn %p cq %p\n", conn, cq);
rds_ib_stats_inc(s_ib_rx_cq_call);
- tasklet_schedule(&ic->i_recv_tasklet);
-}
+ ib_req_notify_cq(cq, IB_CQ_SOLICITED);
-static inline void rds_poll_cq(struct rds_ib_connection *ic,
- struct rds_ib_ack_state *state)
-{
- struct rds_connection *conn = ic->conn;
- struct ib_wc wc;
- struct rds_ib_recv_work *recv;
-
- while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
- rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
- (unsigned long long)wc.wr_id, wc.status,
- rds_ib_wc_status_str(wc.status), wc.byte_len,
+ while (ib_poll_cq(cq, 1, &wc) > 0) {
+ rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+ (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
be32_to_cpu(wc.ex.imm_data));
rds_ib_stats_inc(s_ib_rx_cq_event);
recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
- ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
+ rds_ib_recv_unmap_page(ic, recv);
/*
* Also process recvs in connecting state because it is possible
* to get a recv completion _before_ the rdmacm ESTABLISHED
* event is processed.
*/
- if (wc.status == IB_WC_SUCCESS) {
- rds_ib_process_recv(conn, recv, wc.byte_len, state);
- } else {
+ if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
/* We expect errors as the qp is drained during shutdown */
- if (rds_conn_up(conn) || rds_conn_connecting(conn))
- rds_ib_conn_error(conn, "recv completion on %pI4 had "
- "status %u (%s), disconnecting and "
- "reconnecting\n", &conn->c_faddr,
- wc.status,
- rds_ib_wc_status_str(wc.status));
+ if (wc.status == IB_WC_SUCCESS) {
+ rds_ib_process_recv(conn, recv, wc.byte_len, &state);
+ } else {
+ rds_ib_conn_error(conn, "recv completion on "
+ "%pI4 had status %u, disconnecting and "
+ "reconnecting\n", &conn->c_faddr,
+ wc.status);
+ }
}
- /*
- * It's very important that we only free this ring entry if we've truly
- * freed the resources allocated to the entry. The refilling path can
- * leak if we don't.
- */
rds_ib_ring_free(&ic->i_recv_ring, 1);
}
-}
-
-void rds_ib_recv_tasklet_fn(unsigned long data)
-{
- struct rds_ib_connection *ic = (struct rds_ib_connection *) data;
- struct rds_connection *conn = ic->conn;
- struct rds_ib_ack_state state = { 0, };
-
- rds_poll_cq(ic, &state);
- ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
- rds_poll_cq(ic, &state);
if (state.ack_next_valid)
rds_ib_set_ack(ic, state.ack_next, state.ack_required);
if (rds_ib_ring_empty(&ic->i_recv_ring))
rds_ib_stats_inc(s_ib_rx_ring_empty);
+ /*
+ * If the ring is running low, then schedule the thread to refill.
+ */
if (rds_ib_ring_low(&ic->i_recv_ring))
- rds_ib_recv_refill(conn, 0);
+ queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
}
int rds_ib_recv(struct rds_connection *conn)
int ret = 0;
rdsdebug("conn %p\n", conn);
+
+ /*
+ * If we get a temporary posting failure in this context then
+ * we're really low and we want the caller to back off for a bit.
+ */
+ mutex_lock(&ic->i_recv_mutex);
+ if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
+ ret = -ENOMEM;
+ else
+ rds_ib_stats_inc(s_ib_rx_refill_from_thread);
+ mutex_unlock(&ic->i_recv_mutex);
+
if (rds_conn_up(conn))
rds_ib_attempt_ack(ic);
return ret;
}
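The -ENOMEM return above is the "back off for a bit" signal described in the comment; a hypothetical caller might react by requeueing itself with a delay. This is a kernel-style sketch, not part of the patch and not compilable on its own; it reuses rds_wq and c_recv_w, which appear elsewhere in this patch, and the delay value is arbitrary:

/* Hypothetical caller sketch: requeue the recv work with a delay instead of
 * retrying immediately when refill reports temporary exhaustion. */
static void recv_worker_sketch(struct rds_connection *conn)
{
	if (rds_ib_recv(conn) == -ENOMEM)
		queue_delayed_work(rds_wq, &conn->c_recv_w, HZ / 10);
}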
-int rds_ib_recv_init(void)
+int __init rds_ib_recv_init(void)
{
struct sysinfo si;
int ret = -ENOMEM;
rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
sizeof(struct rds_ib_incoming),
- 0, SLAB_HWCACHE_ALIGN, NULL);
- if (!rds_ib_incoming_slab)
+ 0, 0, NULL);
+ if (rds_ib_incoming_slab == NULL)
goto out;
rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
sizeof(struct rds_page_frag),
- 0, SLAB_HWCACHE_ALIGN, NULL);
- if (!rds_ib_frag_slab)
+ 0, 0, NULL);
+ if (rds_ib_frag_slab == NULL)
kmem_cache_destroy(rds_ib_incoming_slab);
else
ret = 0;
int rds_ib_ring_low(struct rds_ib_work_ring *ring)
{
- return __rds_ib_ring_used(ring) <= (ring->w_nr >> 1);
+ return __rds_ib_ring_used(ring) <= (ring->w_nr >> 2);
}
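With a concrete ring size the two watermarks that appear above are easy to compare; a throwaway illustration (the ring size 256 is arbitrary):

/* Illustrative only: the "ring low" refill watermark for an example ring. */
#include <stdio.h>

int main(void)
{
	unsigned int w_nr = 256;	/* arbitrary example ring size */

	printf("w_nr >> 1 = %u entries (half)\n", w_nr >> 1);
	printf("w_nr >> 2 = %u entries (quarter)\n", w_nr >> 2);
	return 0;
}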
/*
#include <linux/in.h>
#include <linux/device.h>
#include <linux/dmapool.h>
-#include <linux/ratelimit.h>
#include "rds.h"
+#include "rdma.h"
#include "ib.h"
-static char *rds_ib_wc_status_strings[] = {
-#define RDS_IB_WC_STATUS_STR(foo) \
- [IB_WC_##foo] = __stringify(IB_WC_##foo)
- RDS_IB_WC_STATUS_STR(SUCCESS),
- RDS_IB_WC_STATUS_STR(LOC_LEN_ERR),
- RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR),
- RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR),
- RDS_IB_WC_STATUS_STR(LOC_PROT_ERR),
- RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR),
- RDS_IB_WC_STATUS_STR(MW_BIND_ERR),
- RDS_IB_WC_STATUS_STR(BAD_RESP_ERR),
- RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR),
- RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR),
- RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR),
- RDS_IB_WC_STATUS_STR(REM_OP_ERR),
- RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR),
- RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR),
- RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR),
- RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR),
- RDS_IB_WC_STATUS_STR(REM_ABORT_ERR),
- RDS_IB_WC_STATUS_STR(INV_EECN_ERR),
- RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR),
- RDS_IB_WC_STATUS_STR(FATAL_ERR),
- RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR),
- RDS_IB_WC_STATUS_STR(GENERAL_ERR),
-#undef RDS_IB_WC_STATUS_STR
-};
-
-char *rds_ib_wc_status_str(enum ib_wc_status status)
-{
- return rds_str_array(rds_ib_wc_status_strings,
- ARRAY_SIZE(rds_ib_wc_status_strings), status);
-}
-
-/*
- * Convert IB-specific error message to RDS error message and call core
- * completion handler.
- */
-static void rds_ib_send_complete(struct rds_message *rm,
- int wc_status,
- void (*complete)(struct rds_message *rm, int status))
+static void rds_ib_send_rdma_complete(struct rds_message *rm,
+ int wc_status)
{
int notify_status;
notify_status = RDS_RDMA_OTHER_ERROR;
break;
}
- complete(rm, notify_status);
-}
-
-static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
- struct rm_data_op *op,
- int wc_status)
-{
- if (op->op_nents)
- ib_dma_unmap_sg(ic->i_cm_id->device,
- op->op_sg, op->op_nents,
- DMA_TO_DEVICE);
+ rds_rdma_send_complete(rm, notify_status);
}
static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
- struct rm_rdma_op *op,
- int wc_status)
+ struct rds_rdma_op *op)
{
- if (op->op_mapped) {
+ if (op->r_mapped) {
ib_dma_unmap_sg(ic->i_cm_id->device,
- op->op_sg, op->op_nents,
- op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
- op->op_mapped = 0;
+ op->r_sg, op->r_nents,
+ op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ op->r_mapped = 0;
}
-
- /* If the user asked for a completion notification on this
- * message, we can implement three different semantics:
- * 1. Notify when we received the ACK on the RDS message
- * that was queued with the RDMA. This provides reliable
- * notification of RDMA status at the expense of a one-way
- * packet delay.
- * 2. Notify when the IB stack gives us the completion event for
- * the RDMA operation.
- * 3. Notify when the IB stack gives us the completion event for
- * the accompanying RDS messages.
- * Here, we implement approach #3. To implement approach #2,
- * we would need to take an event for the rdma WR. To implement #1,
- * don't call rds_rdma_send_complete at all, and fall back to the notify
- * handling in the ACK processing code.
- *
- * Note: There's no need to explicitly sync any RDMA buffers using
- * ib_dma_sync_sg_for_cpu - the completion for the RDMA
- * operation itself unmapped the RDMA buffers, which takes care
- * of synching.
- */
- rds_ib_send_complete(container_of(op, struct rds_message, rdma),
- wc_status, rds_rdma_send_complete);
-
- if (op->op_write)
- rds_stats_add(s_send_rdma_bytes, op->op_bytes);
- else
- rds_stats_add(s_recv_rdma_bytes, op->op_bytes);
}
-static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
- struct rm_atomic_op *op,
- int wc_status)
+static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
+ struct rds_ib_send_work *send,
+ int wc_status)
{
- /* unmap atomic recvbuf */
- if (op->op_mapped) {
- ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1,
- DMA_FROM_DEVICE);
- op->op_mapped = 0;
- }
-
- rds_ib_send_complete(container_of(op, struct rds_message, atomic),
- wc_status, rds_atomic_send_complete);
-
- if (op->op_type == RDS_ATOMIC_TYPE_CSWP)
- rds_ib_stats_inc(s_ib_atomic_cswp);
- else
- rds_ib_stats_inc(s_ib_atomic_fadd);
-}
+ struct rds_message *rm = send->s_rm;
+
+ rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
+
+ ib_dma_unmap_sg(ic->i_cm_id->device,
+ rm->m_sg, rm->m_nents,
+ DMA_TO_DEVICE);
+
+ if (rm->m_rdma_op != NULL) {
+ rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
+
+ /* If the user asked for a completion notification on this
+ * message, we can implement three different semantics:
+ * 1. Notify when we received the ACK on the RDS message
+ * that was queued with the RDMA. This provides reliable
+ * notification of RDMA status at the expense of a one-way
+ * packet delay.
+ * 2. Notify when the IB stack gives us the completion event for
+ * the RDMA operation.
+ * 3. Notify when the IB stack gives us the completion event for
+ * the accompanying RDS messages.
+ * Here, we implement approach #3. To implement approach #2,
+ * call rds_rdma_send_complete from the cq_handler. To implement #1,
+ * don't call rds_rdma_send_complete at all, and fall back to the notify
+ * handling in the ACK processing code.
+ *
+ * Note: There's no need to explicitly sync any RDMA buffers using
+ * ib_dma_sync_sg_for_cpu - the completion for the RDMA
+ * operation itself unmapped the RDMA buffers, which takes care
+ * of synching.
+ */
+ rds_ib_send_rdma_complete(rm, wc_status);
-/*
- * Unmap the resources associated with a struct send_work.
- *
- * Returns the rm for no good reason other than it is unobtainable
- * other than by switching on wr.opcode, currently, and the caller,
- * the event handler, needs it.
- */
-static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic,
- struct rds_ib_send_work *send,
- int wc_status)
-{
- struct rds_message *rm = NULL;
-
- /* In the error case, wc.opcode sometimes contains garbage */
- switch (send->s_wr.opcode) {
- case IB_WR_SEND:
- if (send->s_op) {
- rm = container_of(send->s_op, struct rds_message, data);
- rds_ib_send_unmap_data(ic, send->s_op, wc_status);
- }
- break;
- case IB_WR_RDMA_WRITE:
- case IB_WR_RDMA_READ:
- if (send->s_op) {
- rm = container_of(send->s_op, struct rds_message, rdma);
- rds_ib_send_unmap_rdma(ic, send->s_op, wc_status);
- }
- break;
- case IB_WR_ATOMIC_FETCH_AND_ADD:
- case IB_WR_ATOMIC_CMP_AND_SWP:
- if (send->s_op) {
- rm = container_of(send->s_op, struct rds_message, atomic);
- rds_ib_send_unmap_atomic(ic, send->s_op, wc_status);
- }
- break;
- default:
- printk_ratelimited(KERN_NOTICE
- "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
- __func__, send->s_wr.opcode);
- break;
+ if (rm->m_rdma_op->r_write)
+ rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
+ else
+ rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
}
- send->s_wr.opcode = 0xdead;
+ /* If anyone waited for this message to get flushed out, wake
+ * them up now */
+ rds_message_unmapped(rm);
- return rm;
+ rds_message_put(rm);
+ send->s_rm = NULL;
}
void rds_ib_send_init_ring(struct rds_ib_connection *ic)
for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
struct ib_sge *sge;
+ send->s_rm = NULL;
send->s_op = NULL;
send->s_wr.wr_id = i;
send->s_wr.sg_list = send->s_sge;
+ send->s_wr.num_sge = 1;
+ send->s_wr.opcode = IB_WR_SEND;
+ send->s_wr.send_flags = 0;
send->s_wr.ex.imm_data = 0;
- sge = &send->s_sge[0];
+ sge = rds_ib_data_sge(ic, send->s_sge);
+ sge->lkey = ic->i_mr->lkey;
+
+ sge = rds_ib_header_sge(ic, send->s_sge);
sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
sge->length = sizeof(struct rds_header);
sge->lkey = ic->i_mr->lkey;
-
- send->s_sge[1].lkey = ic->i_mr->lkey;
}
}
u32 i;
for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
- if (send->s_op && send->s_wr.opcode != 0xdead)
- rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR);
+ if (send->s_wr.opcode == 0xdead)
+ continue;
+ if (send->s_rm)
+ rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
+ if (send->s_op)
+ rds_ib_send_unmap_rdma(ic, send->s_op);
}
}
-/*
- * The only fast path caller always has a non-zero nr, so we don't
- * bother testing nr before performing the atomic sub.
- */
-static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr)
-{
- if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) &&
- waitqueue_active(&rds_ib_ring_empty_wait))
- wake_up(&rds_ib_ring_empty_wait);
- BUG_ON(atomic_read(&ic->i_signaled_sends) < 0);
-}
-
/*
* The _oldest/_free ring operations here race cleanly with the alloc/unalloc
* operations performed in the send path. As the sender allocs and potentially
{
struct rds_connection *conn = context;
struct rds_ib_connection *ic = conn->c_transport_data;
- struct rds_message *rm = NULL;
struct ib_wc wc;
struct rds_ib_send_work *send;
u32 completed;
u32 oldest;
u32 i = 0;
int ret;
- int nr_sig = 0;
rdsdebug("cq %p conn %p\n", cq, conn);
rds_ib_stats_inc(s_ib_tx_cq_call);
rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
while (ib_poll_cq(cq, 1, &wc) > 0) {
- rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
- (unsigned long long)wc.wr_id, wc.status,
- rds_ib_wc_status_str(wc.status), wc.byte_len,
+ rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+ (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
be32_to_cpu(wc.ex.imm_data));
rds_ib_stats_inc(s_ib_tx_cq_event);
if (wc.wr_id == RDS_IB_ACK_WR_ID) {
- if (time_after(jiffies, ic->i_ack_queued + HZ/2))
+ if (ic->i_ack_queued + HZ/2 < jiffies)
rds_ib_stats_inc(s_ib_tx_stalled);
rds_ib_ack_send_complete(ic);
continue;
for (i = 0; i < completed; i++) {
send = &ic->i_sends[oldest];
- if (send->s_wr.send_flags & IB_SEND_SIGNALED)
- nr_sig++;
- rm = rds_ib_send_unmap_op(ic, send, wc.status);
+ /* In the error case, wc.opcode sometimes contains garbage */
+ switch (send->s_wr.opcode) {
+ case IB_WR_SEND:
+ if (send->s_rm)
+ rds_ib_send_unmap_rm(ic, send, wc.status);
+ break;
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_READ:
+ /* Nothing to be done - the SG list will be unmapped
+ * when the SEND completes. */
+ break;
+ default:
+ if (printk_ratelimit())
+ printk(KERN_NOTICE
+ "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
+ __func__, send->s_wr.opcode);
+ break;
+ }
- if (time_after(jiffies, send->s_queued + HZ/2))
+ send->s_wr.opcode = 0xdead;
+ send->s_wr.num_sge = 1;
+ if (send->s_queued + HZ/2 < jiffies)
rds_ib_stats_inc(s_ib_tx_stalled);
- if (send->s_op) {
- if (send->s_op == rm->m_final_op) {
- /* If anyone waited for this message to get flushed out, wake
- * them up now */
- rds_message_unmapped(rm);
- }
- rds_message_put(rm);
- send->s_op = NULL;
+ /* If a RDMA operation produced an error, signal this right
+ * away. If we don't, the subsequent SEND that goes with this
+ * RDMA will be canceled with ERR_WFLUSH, and the application
+ * will never learn that the RDMA failed. */
+ if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
+ struct rds_message *rm;
+
+ rm = rds_send_get_message(conn, send->s_op);
+ if (rm)
+ rds_ib_send_rdma_complete(rm, wc.status);
}
oldest = (oldest + 1) % ic->i_send_ring.w_nr;
}
rds_ib_ring_free(&ic->i_send_ring, completed);
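/*
 * A rough standalone model (not the kernel code) of the send-ring
 * bookkeeping used above: the completion handler walks the "completed"
 * descriptors starting from the oldest allocated slot, then hands them
 * back in one batch.  Field names and counters are assumptions; the
 * real ring uses atomics so this free side races cleanly with the
 * allocating send path.
 */
#include <stdint.h>

struct work_ring {
	uint32_t w_nr;		/* number of slots in the ring */
	uint32_t alloc_ptr;	/* next slot the sender will hand out */
	uint32_t free_ctr;	/* slots currently free */
};

/* oldest outstanding slot = the one allocated longest ago */
static uint32_t ring_oldest(const struct work_ring *r)
{
	uint32_t used = r->w_nr - r->free_ctr;

	return (r->alloc_ptr + r->w_nr - used) % r->w_nr;
}

/* completion side: return a batch of drained slots to the sender */
static void ring_free(struct work_ring *r, uint32_t n)
{
	r->free_ctr += n;
}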
- rds_ib_sub_signaled(ic, nr_sig);
- nr_sig = 0;
- if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
- test_bit(0, &conn->c_map_queued))
+ if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)
+ || test_bit(0, &conn->c_map_queued))
queue_delayed_work(rds_wq, &conn->c_send_w, 0);
/* We expect errors as the qp is drained during shutdown */
if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
- rds_ib_conn_error(conn, "send completion on %pI4 had status "
- "%u (%s), disconnecting and reconnecting\n",
- &conn->c_faddr, wc.status,
- rds_ib_wc_status_str(wc.status));
+ rds_ib_conn_error(conn,
+ "send completion on %pI4 "
+ "had status %u, disconnecting and reconnecting\n",
+ &conn->c_faddr, wc.status);
}
}
}
*
* Conceptually, we have two counters:
* - send credits: this tells us how many WRs we're allowed
- * to submit without overruning the receiver's queue. For
+ * to submit without overrunning the receiver's queue. For
* each SEND WR we post, we decrement this by one.
*
* - posted credits: this tells us how many WRs we recently
* credits (see rds_ib_send_add_credits below).
*
* The RDS send code is essentially single-threaded; rds_send_xmit
- * sets RDS_IN_XMIT to ensure exclusive access to the send ring.
+ * grabs c_send_lock to ensure exclusive access to the send ring.
* However, the ACK sending code is independent and can race with
* message SENDs.
*
* and using atomic_cmpxchg when updating the two counters.
*/
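/*
 * Standalone sketch (not kernel code) of the credit scheme described
 * above: both counters share one 32-bit word.  The split used here --
 * send credits in the low 16 bits, posted credits in the high 16 bits --
 * is an assumption mirroring the IB_GET/SET_*_CREDITS helpers, and the
 * "reserve the last credit for a credit update" rule is omitted.  Taking
 * send credits and claiming posted credits to advertise happens in one
 * compare-and-swap, retried on contention.
 */
#include <stdint.h>
#include <stdatomic.h>

#define GET_SEND(v)	((uint32_t)(v) & 0xffff)
#define GET_POST(v)	((uint32_t)(v) >> 16)
#define SET_POST(v)	((uint32_t)(v) << 16)

static _Atomic uint32_t credits;

static uint32_t grab_credits(uint32_t wanted, uint32_t *advertise)
{
	uint32_t oldval = atomic_load(&credits);
	uint32_t newval, got, adv;

	do {
		got = GET_SEND(oldval) < wanted ? GET_SEND(oldval) : wanted;
		adv = GET_POST(oldval);
		/* drop what we took from the low half, zero the high half */
		newval = oldval - got - SET_POST(adv);
	} while (!atomic_compare_exchange_weak(&credits, &oldval, newval));

	*advertise = adv;
	return got;
}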
int rds_ib_send_grab_credits(struct rds_ib_connection *ic,
- u32 wanted, u32 *adv_credits, int need_posted, int max_posted)
+ u32 wanted, u32 *adv_credits, int need_posted)
{
unsigned int avail, posted, got = 0, advertise;
long oldval, newval;
posted = IB_GET_POST_CREDITS(oldval);
avail = IB_GET_SEND_CREDITS(oldval);
- rdsdebug("wanted=%u credits=%u posted=%u\n",
+ rdsdebug("rds_ib_send_grab_credits(%u): credits=%u posted=%u\n",
wanted, avail, posted);
/* The last credit must be used to send a credit update. */
* available.
*/
if (posted && (got || need_posted)) {
- advertise = min_t(unsigned int, posted, max_posted);
+ advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT);
newval -= IB_SET_POST_CREDITS(advertise);
}
if (credits == 0)
return;
- rdsdebug("credits=%u current=%u%s\n",
+ rdsdebug("rds_ib_send_add_credits(%u): current=%u%s\n",
credits,
IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
}
-static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic,
- struct rds_ib_send_work *send,
- bool notify)
+static inline void
+rds_ib_xmit_populate_wr(struct rds_ib_connection *ic,
+ struct rds_ib_send_work *send, unsigned int pos,
+ unsigned long buffer, unsigned int length,
+ int send_flags)
{
- /*
- * We want to delay signaling completions just enough to get
- * the batching benefits but not so much that we create dead time
- * on the wire.
- */
- if (ic->i_unsignaled_wrs-- == 0 || notify) {
- ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
- send->s_wr.send_flags |= IB_SEND_SIGNALED;
- return 1;
+ struct ib_sge *sge;
+
+ WARN_ON(pos != send - ic->i_sends);
+
+ send->s_wr.send_flags = send_flags;
+ send->s_wr.opcode = IB_WR_SEND;
+ send->s_wr.num_sge = 2;
+ send->s_wr.next = NULL;
+ send->s_queued = jiffies;
+ send->s_op = NULL;
+
+ if (length != 0) {
+ sge = rds_ib_data_sge(ic, send->s_sge);
+ sge->addr = buffer;
+ sge->length = length;
+ sge->lkey = ic->i_mr->lkey;
+
+ sge = rds_ib_header_sge(ic, send->s_sge);
+ } else {
+ /* We're sending a packet with no payload. There is only
+ * one SGE */
+ send->s_wr.num_sge = 1;
+ sge = &send->s_sge[0];
}
- return 0;
+
+ sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = ic->i_mr->lkey;
}
/*
u32 pos;
u32 i;
u32 work_alloc;
- u32 credit_alloc = 0;
+ u32 credit_alloc;
u32 posted;
u32 adv_credits = 0;
int send_flags = 0;
- int bytes_sent = 0;
+ int sent;
int ret;
int flow_controlled = 0;
- int nr_sig = 0;
BUG_ON(off % RDS_FRAG_SIZE);
BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
- /* Do not send cong updates to IB loopback */
- if (conn->c_loopback
- && rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
- rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
- scat = &rm->data.op_sg[sg];
- ret = max_t(int, RDS_CONG_MAP_BYTES, scat->length);
- return sizeof(struct rds_header) + ret;
- }
-
/* FIXME we may overallocate here */
if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
i = 1;
goto out;
}
+ credit_alloc = work_alloc;
if (ic->i_flowctl) {
- credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
+ credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0);
adv_credits += posted;
if (credit_alloc < work_alloc) {
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
work_alloc = credit_alloc;
- flow_controlled = 1;
+ flow_controlled++;
}
if (work_alloc == 0) {
- set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
rds_ib_stats_inc(s_ib_tx_throttle);
ret = -ENOMEM;
goto out;
}
/* map the message the first time we see it */
- if (!ic->i_data_op) {
- if (rm->data.op_nents) {
- rm->data.op_count = ib_dma_map_sg(dev,
- rm->data.op_sg,
- rm->data.op_nents,
- DMA_TO_DEVICE);
- rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
- if (rm->data.op_count == 0) {
+ if (ic->i_rm == NULL) {
+ /*
+ printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n",
+ be16_to_cpu(rm->m_inc.i_hdr.h_dport),
+ rm->m_inc.i_hdr.h_flags,
+ be32_to_cpu(rm->m_inc.i_hdr.h_len));
+ */
+ if (rm->m_nents) {
+ rm->m_count = ib_dma_map_sg(dev,
+ rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
+ rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
+ if (rm->m_count == 0) {
rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
ret = -ENOMEM; /* XXX ? */
goto out;
}
} else {
- rm->data.op_count = 0;
+ rm->m_count = 0;
}
+ ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+ ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
rds_message_addref(rm);
- ic->i_data_op = &rm->data;
+ ic->i_rm = rm;
/* Finalize the header */
if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
/* If it has a RDMA op, tell the peer we did it. This is
* used by the peer to release use-once RDMA MRs. */
- if (rm->rdma.op_active) {
+ if (rm->m_rdma_op) {
struct rds_ext_header_rdma ext_hdr;
- ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
+ ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
}
/*
* Update adv_credits since we reset the ACK_REQUIRED bit.
*/
- if (ic->i_flowctl) {
- rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
- adv_credits += posted;
- BUG_ON(adv_credits > 255);
- }
- }
+ rds_ib_send_grab_credits(ic, 0, &posted, 1);
+ adv_credits += posted;
+ BUG_ON(adv_credits > 255);
+ } else if (ic->i_rm != rm)
+ BUG();
+
+ send = &ic->i_sends[pos];
+ first = send;
+ prev = NULL;
+ scat = &rm->m_sg[sg];
+ sent = 0;
+ i = 0;
/* Sometimes you want to put a fence between an RDMA
* READ and the following SEND.
* or when requested by the user. Right now, we let
* the application choose.
*/
- if (rm->rdma.op_active && rm->rdma.op_fence)
+ if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
send_flags = IB_SEND_FENCE;
- /* Each frag gets a header. Msgs may be 0 bytes */
- send = &ic->i_sends[pos];
- first = send;
- prev = NULL;
- scat = &ic->i_data_op->op_sg[sg];
- i = 0;
- do {
- unsigned int len = 0;
-
- /* Set up the header */
- send->s_wr.send_flags = send_flags;
- send->s_wr.opcode = IB_WR_SEND;
- send->s_wr.num_sge = 1;
- send->s_wr.next = NULL;
- send->s_queued = jiffies;
- send->s_op = NULL;
+ /*
+ * We could be copying the header into the unused tail of the page.
+ * That would need to be changed in the future when those pages might
+ * be mapped userspace pages or page cache pages. So instead we always
+ * use a second sge and our long-lived ring of mapped headers. We send
+ * the header after the data so that the data payload can be aligned on
+ * the receiver.
+ */
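/*
 * A rough sketch (not the driver code) of the fragment layout the
 * comment above describes: the payload SGE is listed before the header
 * SGE, so the header goes on the wire after the data and the receiver
 * can keep the payload aligned.  The helper name and index order are
 * abstractions; the real code goes through rds_ib_data_sge() /
 * rds_ib_header_sge() and a long-lived, DMA-mapped ring of headers.
 */
#include <stdint.h>

struct sge { uint64_t addr; uint32_t length; };

struct frag_wr {
	struct sge sg[2];
	int num_sge;
};

static void fill_frag(struct frag_wr *wr, uint64_t data_dma, uint32_t len,
		      uint64_t hdr_ring_dma, uint32_t slot, uint32_t hdr_len)
{
	int n = 0;

	if (len) {				/* payload first ... */
		wr->sg[n].addr = data_dma;
		wr->sg[n].length = len;
		n++;
	}
	wr->sg[n].addr = hdr_ring_dma + (uint64_t)slot * hdr_len;
	wr->sg[n].length = hdr_len;		/* ... header last */
	wr->num_sge = n + 1;
}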
- send->s_sge[0].addr = ic->i_send_hdrs_dma
- + (pos * sizeof(struct rds_header));
- send->s_sge[0].length = sizeof(struct rds_header);
+ /* handle a 0-len message */
+ if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
+ rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
+ goto add_header;
+ }
- memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
+ /* if there's data reference it with a chain of work reqs */
+ for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
+ unsigned int len;
- /* Set up the data, if present */
- if (i < work_alloc
- && scat != &rm->data.op_sg[rm->data.op_count]) {
- len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
- send->s_wr.num_sge = 2;
+ send = &ic->i_sends[pos];
- send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off;
- send->s_sge[1].length = len;
+ len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
+ rds_ib_xmit_populate_wr(ic, send, pos,
+ ib_sg_dma_address(dev, scat) + off, len,
+ send_flags);
- bytes_sent += len;
- off += len;
- if (off == ib_sg_dma_len(dev, scat)) {
- scat++;
- off = 0;
- }
+ /*
+ * We want to delay signaling completions just enough to get
+ * the batching benefits but not so much that we create dead time
+ * on the wire.
+ */
+ if (ic->i_unsignaled_wrs-- == 0) {
+ ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+ send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
}
- rds_ib_set_wr_signal_state(ic, send, 0);
+ ic->i_unsignaled_bytes -= len;
+ if (ic->i_unsignaled_bytes <= 0) {
+ ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
+ send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ }
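/*
 * Standalone sketch of the signalling throttle used above: only every
 * Nth work request, or every M bytes, asks the HCA for a completion,
 * so completions are batched without letting the send ring sit idle.
 * The limits are placeholders; the driver takes them from the
 * max_unsignaled_wr / max_unsignaled_bytes sysctls.
 */
#include <stdbool.h>

struct unsig_state {
	long wrs_left;
	long bytes_left;
	long max_wrs;
	long max_bytes;
};

static bool must_signal(struct unsig_state *s, long bytes)
{
	bool signal = false;

	if (s->wrs_left-- == 0) {
		s->wrs_left = s->max_wrs;
		signal = true;
	}

	s->bytes_left -= bytes;
	if (s->bytes_left <= 0) {
		s->bytes_left = s->max_bytes;
		signal = true;
	}

	return signal;	/* caller then sets IB_SEND_SIGNALED | IB_SEND_SOLICITED */
}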
/*
* Always signal the last one if we're stopping due to flow control.
*/
- if (ic->i_flowctl && flow_controlled && i == (work_alloc-1))
+ if (flow_controlled && i == (work_alloc-1))
send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
- if (send->s_wr.send_flags & IB_SEND_SIGNALED)
- nr_sig++;
-
rdsdebug("send %p wr %p num_sge %u next %p\n", send,
&send->s_wr, send->s_wr.num_sge, send->s_wr.next);
- if (ic->i_flowctl && adv_credits) {
+ sent += len;
+ off += len;
+ if (off == ib_sg_dma_len(dev, scat)) {
+ scat++;
+ off = 0;
+ }
+
+add_header:
+ /* Tack on the header after the data. The header SGE should already
+ * have been set up to point to the right header buffer. */
+ memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
+
+ if (0) {
+ struct rds_header *hdr = &ic->i_send_hdrs[pos];
+
+ printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
+ be16_to_cpu(hdr->h_dport),
+ hdr->h_flags,
+ be32_to_cpu(hdr->h_len));
+ }
+ if (adv_credits) {
struct rds_header *hdr = &ic->i_send_hdrs[pos];
/* add credit and redo the header checksum */
prev = send;
pos = (pos + 1) % ic->i_send_ring.w_nr;
- send = &ic->i_sends[pos];
- i++;
-
- } while (i < work_alloc
- && scat != &rm->data.op_sg[rm->data.op_count]);
+ }
/* Account the RDS header in the number of bytes we sent, but just once.
* The caller has no concept of fragmentation. */
if (hdr_off == 0)
- bytes_sent += sizeof(struct rds_header);
+ sent += sizeof(struct rds_header);
/* if we finished the message then send completion owns it */
- if (scat == &rm->data.op_sg[rm->data.op_count]) {
- prev->s_op = ic->i_data_op;
- prev->s_wr.send_flags |= IB_SEND_SOLICITED;
- ic->i_data_op = NULL;
+ if (scat == &rm->m_sg[rm->m_count]) {
+ prev->s_rm = ic->i_rm;
+ prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ ic->i_rm = NULL;
}
- /* Put back wrs & credits we didn't use */
if (i < work_alloc) {
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
work_alloc = i;
if (ic->i_flowctl && i < credit_alloc)
rds_ib_send_add_credits(conn, credit_alloc - i);
- if (nr_sig)
- atomic_add(nr_sig, &ic->i_signaled_sends);
-
/* XXX need to worry about failed_wr and partial sends. */
failed_wr = &first->s_wr;
ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
"returned %d\n", &conn->c_faddr, ret);
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
- rds_ib_sub_signaled(ic, nr_sig);
- if (prev->s_op) {
- ic->i_data_op = prev->s_op;
- prev->s_op = NULL;
+ if (prev->s_rm) {
+ ic->i_rm = prev->s_rm;
+ prev->s_rm = NULL;
}
-
- rds_ib_conn_error(ic->conn, "ib_post_send failed\n");
+ /* Finesse this later */
+ BUG();
goto out;
}
- ret = bytes_sent;
+ ret = sent;
out:
BUG_ON(adv_credits);
return ret;
}
-/*
- * Issue atomic operation.
- * A simplified version of the rdma case, we always map 1 SG, and
- * only 8 bytes, for the return value from the atomic operation.
- */
-int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
-{
- struct rds_ib_connection *ic = conn->c_transport_data;
- struct rds_ib_send_work *send = NULL;
- struct ib_send_wr *failed_wr;
- struct rds_ib_device *rds_ibdev;
- u32 pos;
- u32 work_alloc;
- int ret;
- int nr_sig = 0;
-
- rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
-
- work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
- if (work_alloc != 1) {
- rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
- rds_ib_stats_inc(s_ib_tx_ring_full);
- ret = -ENOMEM;
- goto out;
- }
-
- /* address of send request in ring */
- send = &ic->i_sends[pos];
- send->s_queued = jiffies;
-
- if (op->op_type == RDS_ATOMIC_TYPE_CSWP) {
- send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
- send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare;
- send->s_wr.wr.atomic.swap = op->op_m_cswp.swap;
- send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask;
- send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask;
- } else { /* FADD */
- send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
- send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add;
- send->s_wr.wr.atomic.swap = 0;
- send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask;
- send->s_wr.wr.atomic.swap_mask = 0;
- }
- nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify);
- send->s_wr.num_sge = 1;
- send->s_wr.next = NULL;
- send->s_wr.wr.atomic.remote_addr = op->op_remote_addr;
- send->s_wr.wr.atomic.rkey = op->op_rkey;
- send->s_op = op;
- rds_message_addref(container_of(send->s_op, struct rds_message, atomic));
-
- /* map 8 byte retval buffer to the device */
- ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE);
- rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret);
- if (ret != 1) {
- rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
- rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
- ret = -ENOMEM; /* XXX ? */
- goto out;
- }
-
- /* Convert our struct scatterlist to struct ib_sge */
- send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg);
- send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg);
- send->s_sge[0].lkey = ic->i_mr->lkey;
-
- rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
- send->s_sge[0].addr, send->s_sge[0].length);
-
- if (nr_sig)
- atomic_add(nr_sig, &ic->i_signaled_sends);
-
- failed_wr = &send->s_wr;
- ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr);
- rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic,
- send, &send->s_wr, ret, failed_wr);
- BUG_ON(failed_wr != &send->s_wr);
- if (ret) {
- printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 "
- "returned %d\n", &conn->c_faddr, ret);
- rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
- rds_ib_sub_signaled(ic, nr_sig);
- goto out;
- }
-
- if (unlikely(failed_wr != &send->s_wr)) {
- printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
- BUG_ON(failed_wr != &send->s_wr);
- }
-
-out:
- return ret;
-}
-
-int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
{
struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_ib_send_work *send = NULL;
struct rds_ib_send_work *first;
struct rds_ib_send_work *prev;
struct ib_send_wr *failed_wr;
+ struct rds_ib_device *rds_ibdev;
struct scatterlist *scat;
unsigned long len;
- u64 remote_addr = op->op_remote_addr;
- u32 max_sge = ic->rds_ibdev->max_sge;
+ u64 remote_addr = op->r_remote_addr;
u32 pos;
u32 work_alloc;
u32 i;
int sent;
int ret;
int num_sge;
- int nr_sig = 0;
-
- /* map the op the first time we see it */
- if (!op->op_mapped) {
- op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
- op->op_sg, op->op_nents, (op->op_write) ?
- DMA_TO_DEVICE : DMA_FROM_DEVICE);
- rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
- if (op->op_count == 0) {
+
+ rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+
+ /* map the message the first time we see it */
+ if (!op->r_mapped) {
+ op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
+ op->r_sg, op->r_nents, (op->r_write) ?
+ DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
+ if (op->r_count == 0) {
rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
ret = -ENOMEM; /* XXX ? */
goto out;
}
- op->op_mapped = 1;
+ op->r_mapped = 1;
}
/*
* Instead of knowing how to return a partial rdma read/write we insist that there
* be enough work requests to send the entire message.
*/
- i = ceil(op->op_count, max_sge);
+ i = ceil(op->r_count, rds_ibdev->max_sge);
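/*
 * A standalone sketch of the sizing above: ceil(count, max_sge) work
 * requests are needed to carry "count" scatterlist entries when each WR
 * holds at most max_sge of them, and the RDMA is refused unless that
 * many ring slots can be allocated (no partial transfers).  The ceil()
 * helper is effectively a round-up division.
 */
#include <stdio.h>

static unsigned int ceil_div(unsigned int n, unsigned int d)
{
	return (n + d - 1) / d;
}

int main(void)
{
	unsigned int count = 70, max_sge = 32;

	/* 70 entries at 32 SGEs per WR -> 3 WRs (32 + 32 + 6) */
	printf("%u scatterlist entries -> %u work requests\n",
	       count, ceil_div(count, max_sge));
	return 0;
}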
work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
if (work_alloc != i) {
send = &ic->i_sends[pos];
first = send;
prev = NULL;
- scat = &op->op_sg[0];
+ scat = &op->r_sg[0];
sent = 0;
- num_sge = op->op_count;
+ num_sge = op->r_count;
- for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
+ for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
send->s_wr.send_flags = 0;
send->s_queued = jiffies;
- send->s_op = NULL;
-
- nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify);
+ /*
+ * We want to delay signaling completions just enough to get
+ * the batching benefits but not so much that we create dead time on the wire.
+ */
+ if (ic->i_unsignaled_wrs-- == 0) {
+ ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+ send->s_wr.send_flags = IB_SEND_SIGNALED;
+ }
- send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
+ send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
send->s_wr.wr.rdma.remote_addr = remote_addr;
- send->s_wr.wr.rdma.rkey = op->op_rkey;
+ send->s_wr.wr.rdma.rkey = op->r_key;
+ send->s_op = op;
- if (num_sge > max_sge) {
- send->s_wr.num_sge = max_sge;
- num_sge -= max_sge;
+ if (num_sge > rds_ibdev->max_sge) {
+ send->s_wr.num_sge = rds_ibdev->max_sge;
+ num_sge -= rds_ibdev->max_sge;
} else {
send->s_wr.num_sge = num_sge;
}
if (prev)
prev->s_wr.next = &send->s_wr;
- for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
+ for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
len = ib_sg_dma_len(ic->i_cm_id->device, scat);
send->s_sge[j].addr =
ib_sg_dma_address(ic->i_cm_id->device, scat);
send = ic->i_sends;
}
- /* give a reference to the last op */
- if (scat == &op->op_sg[op->op_count]) {
- prev->s_op = op;
- rds_message_addref(container_of(op, struct rds_message, rdma));
- }
+ /* if we finished the message then send completion owns it */
+ if (scat == &op->r_sg[op->r_count])
+ prev->s_wr.send_flags = IB_SEND_SIGNALED;
if (i < work_alloc) {
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
work_alloc = i;
}
- if (nr_sig)
- atomic_add(nr_sig, &ic->i_signaled_sends);
-
failed_wr = &first->s_wr;
ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
"returned %d\n", &conn->c_faddr, ret);
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
- rds_ib_sub_signaled(ic, nr_sig);
goto out;
}
#include "rds.h"
#include "ib.h"
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats);
+DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned;
-static const char *const rds_ib_stat_names[] = {
+static char *rds_ib_stat_names[] = {
"ib_connect_raced",
"ib_listen_closed_stale",
"ib_tx_cq_call",
"ib_rdma_mr_pool_flush",
"ib_rdma_mr_pool_wait",
"ib_rdma_mr_pool_depleted",
- "ib_atomic_cswp",
- "ib_atomic_fadd",
};
unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1;
static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
-/*
- * This sysctl does nothing.
- *
- * Backwards compatibility with RDS 3.0 wire protocol
- * disables initial FC credit exchange.
- * If it's ever possible to drop 3.0 support,
- * setting this to 1 and moving init/refill of send/recv
- * rings from ib_cm_connect_complete() back into ib_setup_qp()
- * will cause credits to be added before protocol negotiation.
- */
-unsigned int rds_ib_sysctl_flow_control = 0;
+unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20);
+static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1;
+static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL;
-static struct ctl_table rds_ib_sysctl_table[] = {
+unsigned int rds_ib_sysctl_flow_control = 1;
+
+ctl_table rds_ib_sysctl_table[] = {
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "max_send_wr",
.data = &rds_ib_sysctl_max_send_wr,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
+ .proc_handler = &proc_doulongvec_minmax,
.extra1 = &rds_ib_sysctl_max_wr_min,
.extra2 = &rds_ib_sysctl_max_wr_max,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "max_recv_wr",
.data = &rds_ib_sysctl_max_recv_wr,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
+ .proc_handler = &proc_doulongvec_minmax,
.extra1 = &rds_ib_sysctl_max_wr_min,
.extra2 = &rds_ib_sysctl_max_wr_max,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "max_unsignaled_wr",
.data = &rds_ib_sysctl_max_unsig_wrs,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
+ .proc_handler = &proc_doulongvec_minmax,
.extra1 = &rds_ib_sysctl_max_unsig_wr_min,
.extra2 = &rds_ib_sysctl_max_unsig_wr_max,
},
{
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_unsignaled_bytes",
+ .data = &rds_ib_sysctl_max_unsig_bytes,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ .extra1 = &rds_ib_sysctl_max_unsig_bytes_min,
+ .extra2 = &rds_ib_sysctl_max_unsig_bytes_max,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
.procname = "max_recv_allocation",
.data = &rds_ib_sysctl_max_recv_allocation,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
+ .proc_handler = &proc_doulongvec_minmax,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "flow_control",
.data = &rds_ib_sysctl_flow_control,
.maxlen = sizeof(rds_ib_sysctl_flow_control),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = &proc_dointvec,
},
+ { .ctl_name = 0}
+};
+
+static struct ctl_path rds_ib_sysctl_path[] = {
+ { .procname = "net", .ctl_name = CTL_NET, },
+ { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
+ { .procname = "ib", .ctl_name = CTL_UNNUMBERED, },
{ }
};
void rds_ib_sysctl_exit(void)
{
if (rds_ib_sysctl_hdr)
- unregister_net_sysctl_table(rds_ib_sysctl_hdr);
+ unregister_sysctl_table(rds_ib_sysctl_hdr);
}
-int rds_ib_sysctl_init(void)
+int __init rds_ib_sysctl_init(void)
{
- rds_ib_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/ib", rds_ib_sysctl_table);
- if (!rds_ib_sysctl_hdr)
+ rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table);
+ if (rds_ib_sysctl_hdr == NULL)
return -ENOMEM;
return 0;
}
*/
#include <linux/percpu.h>
#include <linux/seq_file.h>
-#include <linux/slab.h>
#include <linux/proc_fs.h>
-#include <linux/export.h>
#include "rds.h"
BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
spin_lock(&rds_info_lock);
- BUG_ON(rds_info_funcs[offset]);
+ BUG_ON(rds_info_funcs[offset] != NULL);
rds_info_funcs[offset] = func;
spin_unlock(&rds_info_lock);
}
-EXPORT_SYMBOL_GPL(rds_info_register_func);
void rds_info_deregister_func(int optname, rds_info_func func)
{
rds_info_funcs[offset] = NULL;
spin_unlock(&rds_info_lock);
}
-EXPORT_SYMBOL_GPL(rds_info_deregister_func);
/*
* Typically we hold an atomic kmap across multiple rds_info_copy() calls
*/
void rds_info_iter_unmap(struct rds_info_iterator *iter)
{
- if (iter->addr) {
- kunmap_atomic(iter->addr);
+ if (iter->addr != NULL) {
+ kunmap_atomic(iter->addr, KM_USER0);
iter->addr = NULL;
}
}
unsigned long this;
while (bytes) {
- if (!iter->addr)
- iter->addr = kmap_atomic(*iter->pages);
+ if (iter->addr == NULL)
+ iter->addr = kmap_atomic(*iter->pages, KM_USER0);
this = min(bytes, PAGE_SIZE - iter->offset);
iter->offset += this;
if (iter->offset == PAGE_SIZE) {
- kunmap_atomic(iter->addr);
+ kunmap_atomic(iter->addr, KM_USER0);
iter->addr = NULL;
iter->offset = 0;
iter->pages++;
}
}
}
-EXPORT_SYMBOL_GPL(rds_info_copy);
/*
* @optval points to the userspace buffer that the information snapshot
>> PAGE_SHIFT;
pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
- if (!pages) {
+ if (pages == NULL) {
ret = -ENOMEM;
goto out;
}
- ret = get_user_pages_fast(start, nr_pages, 1, pages);
+ down_read(&current->mm->mmap_sem);
+ ret = get_user_pages(current, current->mm, start, nr_pages, 1, 0,
+ pages, NULL);
+ up_read(&current->mm->mmap_sem);
if (ret != nr_pages) {
if (ret > 0)
nr_pages = ret;
call_func:
func = rds_info_funcs[optname - RDS_INFO_FIRST];
- if (!func) {
+ if (func == NULL) {
ret = -ENOPROTOOPT;
goto out;
}
ret = -EFAULT;
out:
- for (i = 0; pages && i < nr_pages; i++)
+ for (i = 0; pages != NULL && i < nr_pages; i++)
put_page(pages[i]);
kfree(pages);
#include <linux/inetdevice.h>
#include <linux/if_arp.h>
#include <linux/delay.h>
-#include <linux/slab.h>
-#include <linux/module.h>
#include "rds.h"
#include "iw.h"
DEFINE_SPINLOCK(iw_nodev_conns_lock);
LIST_HEAD(iw_nodev_conns);
-static void rds_iw_add_one(struct ib_device *device)
+void rds_iw_add_one(struct ib_device *device)
{
struct rds_iw_device *rds_iwdev;
struct ib_device_attr *dev_attr;
rds_iwdev->max_wrs = dev_attr->max_qp_wr;
rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE);
+ rds_iwdev->page_shift = max(PAGE_SHIFT, ffs(dev_attr->page_size_cap) - 1);
+
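/*
 * A standalone sketch of the page_shift choice above: page_size_cap is
 * a bitmask of page sizes the device supports, so ffs() - 1 yields the
 * shift of the smallest supported size, and taking the max against the
 * CPU page shift avoids mapping at a finer granularity than PAGE_SIZE.
 * The 4K/2M capability mask and the hard-coded shift of 12 are just
 * example values.
 */
#include <strings.h>
#include <stdio.h>

#define EXAMPLE_CPU_PAGE_SHIFT 12

int main(void)
{
	unsigned int page_size_cap = (1u << 12) | (1u << 21); /* 4K and 2M */
	int dev_shift = ffs((int)page_size_cap) - 1;          /* -> 12 */
	int shift = dev_shift > EXAMPLE_CPU_PAGE_SHIFT ?
			dev_shift : EXAMPLE_CPU_PAGE_SHIFT;

	printf("using page shift %d\n", shift);
	return 0;
}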
rds_iwdev->dev = device;
rds_iwdev->pd = ib_alloc_pd(device);
if (IS_ERR(rds_iwdev->pd))
goto free_dev;
if (!rds_iwdev->dma_local_lkey) {
- rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
- IB_ACCESS_REMOTE_READ |
- IB_ACCESS_REMOTE_WRITE |
- IB_ACCESS_LOCAL_WRITE);
+ if (device->node_type != RDMA_NODE_RNIC) {
+ rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
+ IB_ACCESS_LOCAL_WRITE);
+ } else {
+ rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_LOCAL_WRITE);
+ }
if (IS_ERR(rds_iwdev->mr))
goto err_pd;
} else
kfree(dev_attr);
}
-static void rds_iw_remove_one(struct ib_device *device)
+void rds_iw_remove_one(struct ib_device *device)
{
struct rds_iw_device *rds_iwdev;
struct rds_iw_cm_id *i_cm_id, *next;
ic = conn->c_transport_data;
dev_addr = &ic->i_cm_id->route.addr.dev_addr;
- rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
- rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
+ ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
+ ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
iinfo->max_send_wr = ic->i_send_ring.w_nr;
/* Create a CMA ID and try to bind it. This catches both
* IB and iWARP capable NICs.
*/
- cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
- if (IS_ERR(cm_id))
- return PTR_ERR(cm_id);
+ cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
+ if (!cm_id)
+ return -EADDRNOTAVAIL;
memset(&sin, 0, sizeof(sin));
sin.sin_family = AF_INET;
ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
/* due to this, we will claim to support IB devices unless we
check node_type. */
- if (ret || !cm_id->device ||
- cm_id->device->node_type != RDMA_NODE_RNIC)
+ if (ret || cm_id->device->node_type != RDMA_NODE_RNIC)
ret = -EADDRNOTAVAIL;
rdsdebug("addr %pI4 ret %d node type %d\n",
.laddr_check = rds_iw_laddr_check,
.xmit_complete = rds_iw_xmit_complete,
.xmit = rds_iw_xmit,
+ .xmit_cong_map = NULL,
.xmit_rdma = rds_iw_xmit_rdma,
.recv = rds_iw_recv,
.conn_alloc = rds_iw_conn_alloc,
.conn_connect = rds_iw_conn_connect,
.conn_shutdown = rds_iw_conn_shutdown,
.inc_copy_to_user = rds_iw_inc_copy_to_user,
+ .inc_purge = rds_iw_inc_purge,
.inc_free = rds_iw_inc_free,
.cm_initiate_connect = rds_iw_cm_initiate_connect,
.cm_handle_connect = rds_iw_cm_handle_connect,
.flush_mrs = rds_iw_flush_mrs,
.t_owner = THIS_MODULE,
.t_name = "iwarp",
- .t_type = RDS_TRANS_IWARP,
.t_prefer_loopback = 1,
};
-int rds_iw_init(void)
+int __init rds_iw_init(void)
{
int ret;
#ifndef _RDS_IW_H
#define _RDS_IW_H
-#include <linux/interrupt.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include "rds.h"
struct rds_message *s_rm;
/* We should really put these into a union: */
- struct rm_rdma_op *s_op;
+ struct rds_rdma_op *s_op;
struct rds_iw_mapping *s_mapping;
struct ib_mr *s_mr;
struct ib_fast_reg_page_list *s_page_list;
struct rds_iw_send_work *i_sends;
/* rx */
- struct tasklet_struct i_recv_tasklet;
struct mutex i_recv_mutex;
struct rds_iw_work_ring i_recv_ring;
struct rds_iw_incoming *i_iwinc;
struct ib_pd *pd;
struct ib_mr *mr;
struct rds_iw_mr_pool *mr_pool;
+ int page_shift;
int max_sge;
unsigned int max_wrs;
unsigned int dma_local_lkey:1;
/* ib.c */
extern struct rds_transport rds_iw_transport;
+extern void rds_iw_add_one(struct ib_device *device);
+extern void rds_iw_remove_one(struct ib_device *device);
extern struct ib_client rds_iw_client;
extern unsigned int fastreg_pool_size;
int rds_iw_conn_connect(struct rds_connection *conn);
void rds_iw_conn_shutdown(struct rds_connection *conn);
void rds_iw_state_change(struct sock *sk);
-int rds_iw_listen_init(void);
+int __init rds_iw_listen_init(void);
void rds_iw_listen_stop(void);
void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
void rds_iw_sync_mr(void *trans_private, int dir);
void rds_iw_free_mr(void *trans_private, int invalidate);
void rds_iw_flush_mrs(void);
+void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
/* ib_recv.c */
-int rds_iw_recv_init(void);
+int __init rds_iw_recv_init(void);
void rds_iw_recv_exit(void);
int rds_iw_recv(struct rds_connection *conn);
int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
gfp_t page_gfp, int prefill);
+void rds_iw_inc_purge(struct rds_incoming *inc);
void rds_iw_inc_free(struct rds_incoming *inc);
-int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
+int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
+ size_t size);
void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context);
-void rds_iw_recv_tasklet_fn(unsigned long data);
void rds_iw_recv_init_ring(struct rds_iw_connection *ic);
void rds_iw_recv_clear_ring(struct rds_iw_connection *ic);
void rds_iw_recv_init_ack(struct rds_iw_connection *ic);
void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
void rds_iw_send_init_ring(struct rds_iw_connection *ic);
void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
-int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
- u32 *adv_credits, int need_posted, int max_posted);
+ u32 *adv_credits, int need_posted);
/* ib_stats.c */
DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats);
unsigned int avail);
/* ib_sysctl.c */
-int rds_iw_sysctl_init(void);
+int __init rds_iw_sysctl_init(void);
void rds_iw_sysctl_exit(void);
extern unsigned long rds_iw_sysctl_max_send_wr;
extern unsigned long rds_iw_sysctl_max_recv_wr;
extern unsigned long rds_iw_sysctl_max_unsig_bytes;
extern unsigned long rds_iw_sysctl_max_recv_allocation;
extern unsigned int rds_iw_sysctl_flow_control;
+extern ctl_table rds_iw_sysctl_table[];
/*
* Helper functions for getting/setting the header and data SGEs in
*/
#include <linux/kernel.h>
#include <linux/in.h>
-#include <linux/slab.h>
#include <linux/vmalloc.h>
-#include <linux/ratelimit.h>
#include "rds.h"
#include "iw.h"
case IB_EVENT_QP_REQ_ERR:
case IB_EVENT_QP_FATAL:
default:
- rdsdebug("Fatal QP Event %u "
- "- connection %pI4->%pI4, reconnecting\n",
+ rds_iw_conn_error(conn, "RDS/IW: Fatal QP Event %u - connection %pI4->%pI4...reconnecting\n",
event->event, &conn->c_laddr,
&conn->c_faddr);
- rds_conn_drop(conn);
break;
}
}
unsigned int send_size, recv_size;
int ret;
- /* The offset of 1 is to accommodate the additional ACK WR. */
+ /* The offset of 1 is to accommodate the additional ACK WR. */
send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1);
recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1);
rds_iw_ring_resize(send_ring, send_size - 1);
* the rds_iwdev at all.
*/
rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
- if (!rds_iwdev) {
- printk_ratelimited(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
+ if (rds_iwdev == NULL) {
+ if (printk_ratelimit())
+ printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
dev->name);
return -EOPNOTSUPP;
}
ic->i_send_ring.w_nr *
sizeof(struct rds_header),
&ic->i_send_hdrs_dma, GFP_KERNEL);
- if (!ic->i_send_hdrs) {
+ if (ic->i_send_hdrs == NULL) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent send failed\n");
goto out;
ic->i_recv_ring.w_nr *
sizeof(struct rds_header),
&ic->i_recv_hdrs_dma, GFP_KERNEL);
- if (!ic->i_recv_hdrs) {
+ if (ic->i_recv_hdrs == NULL) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent recv failed\n");
goto out;
ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
&ic->i_ack_dma, GFP_KERNEL);
- if (!ic->i_ack) {
+ if (ic->i_ack == NULL) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent ack failed\n");
goto out;
}
ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
- if (!ic->i_sends) {
+ if (ic->i_sends == NULL) {
ret = -ENOMEM;
rdsdebug("send allocation failed\n");
goto out;
rds_iw_send_init_ring(ic);
ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
- if (!ic->i_recvs) {
+ if (ic->i_recvs == NULL) {
ret = -ENOMEM;
rdsdebug("recv allocation failed\n");
goto out;
version = RDS_PROTOCOL_3_0;
while ((common >>= 1) != 0)
version++;
- }
- printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using "
+ } else if (printk_ratelimit()) {
+ printk(KERN_NOTICE "RDS: Connection from %pI4 using "
"incompatible protocol version %u.%u\n",
&dp->dp_saddr,
dp->dp_protocol_major,
dp->dp_protocol_minor);
+ }
return version;
}
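/*
 * A small standalone sketch of the version negotiation above: each side
 * offers a bitmask of the minor versions it speaks, and the highest bit
 * common to both wins.  Shifting the intersection right until it hits
 * zero counts how far above the base version (3.0 in the driver) that
 * bit sits.  The masks below are made-up example values.
 */
#include <stdio.h>

static unsigned int highest_common_minor(unsigned int ours, unsigned int theirs)
{
	unsigned int common = ours & theirs;
	unsigned int minor = 0;

	while ((common >>= 1) != 0)
		minor++;
	return minor;
}

int main(void)
{
	/* we speak minors 0-2 (0x7), the peer speaks 0-1 (0x3) -> minor 1 */
	printf("negotiated minor %u\n", highest_common_minor(0x7, 0x3));
	return 0;
}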
err = rds_iw_setup_qp(conn);
if (err) {
rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err);
- mutex_unlock(&conn->c_cm_lock);
goto out;
}
/* XXX I wonder what effect the port space has */
/* delegate cm event handler to rdma_transport */
ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
- RDMA_PS_TCP, IB_QPT_RC);
+ RDMA_PS_TCP);
if (IS_ERR(ic->i_cm_id)) {
ret = PTR_ERR(ic->i_cm_id);
ic->i_cm_id = NULL;
/* Actually this may happen quite frequently, when
* an outgoing connect raced with an incoming connect.
*/
- rdsdebug("failed to disconnect, cm: %p err %d\n",
- ic->i_cm_id, err);
+ rdsdebug("rds_iw_conn_shutdown: failed to disconnect,"
+ " cm: %p err %d\n", ic->i_cm_id, err);
}
if (ic->i_cm_id->qp) {
unsigned long flags;
/* XXX too lazy? */
- ic = kzalloc(sizeof(struct rds_iw_connection), gfp);
- if (!ic)
+ ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL);
+ if (ic == NULL)
return -ENOMEM;
INIT_LIST_HEAD(&ic->iw_node);
- tasklet_init(&ic->i_recv_tasklet, rds_iw_recv_tasklet_fn,
- (unsigned long) ic);
mutex_init(&ic->i_recv_mutex);
#ifndef KERNEL_HAS_ATOMIC64
spin_lock_init(&ic->i_ack_lock);
*
*/
#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/ratelimit.h>
#include "rds.h"
+#include "rdma.h"
#include "iw.h"
static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
struct list_head *unmap_list,
- struct list_head *kill_list,
- int *unpinned);
+ struct list_head *kill_list);
static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
-static int rds_iw_get_device(struct sockaddr_in *src, struct sockaddr_in *dst,
- struct rds_iw_device **rds_iwdev,
- struct rdma_cm_id **cm_id)
+static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id)
{
struct rds_iw_device *iwdev;
struct rds_iw_cm_id *i_cm_id;
src_addr->sin_port,
dst_addr->sin_addr.s_addr,
dst_addr->sin_port,
- src->sin_addr.s_addr,
- src->sin_port,
- dst->sin_addr.s_addr,
- dst->sin_port);
+ rs->rs_bound_addr,
+ rs->rs_bound_port,
+ rs->rs_conn_addr,
+ rs->rs_conn_port);
#ifdef WORKING_TUPLE_DETECTION
- if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr &&
- src_addr->sin_port == src->sin_port &&
- dst_addr->sin_addr.s_addr == dst->sin_addr.s_addr &&
- dst_addr->sin_port == dst->sin_port) {
+ if (src_addr->sin_addr.s_addr == rs->rs_bound_addr &&
+ src_addr->sin_port == rs->rs_bound_port &&
+ dst_addr->sin_addr.s_addr == rs->rs_conn_addr &&
+ dst_addr->sin_port == rs->rs_conn_port) {
#else
/* FIXME - needs to compare the local and remote
* ipaddr/port tuple, but the ipaddr is the only
- * available information in the rds_sock (as the rest are
+ * available information in the rds_sock (as the rest are
* zero'ed. It doesn't appear to be properly populated
* during connection setup...
*/
- if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr) {
+ if (src_addr->sin_addr.s_addr == rs->rs_bound_addr) {
#endif
spin_unlock_irq(&iwdev->spinlock);
*rds_iwdev = iwdev;
return 0;
}
-static void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev,
- struct rdma_cm_id *cm_id)
+void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
{
struct rds_iw_cm_id *i_cm_id;
{
struct sockaddr_in *src_addr, *dst_addr;
struct rds_iw_device *rds_iwdev_old;
+ struct rds_sock rs;
struct rdma_cm_id *pcm_id;
int rc;
src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr;
dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr;
- rc = rds_iw_get_device(src_addr, dst_addr, &rds_iwdev_old, &pcm_id);
+ rs.rs_bound_addr = src_addr->sin_addr.s_addr;
+ rs.rs_bound_port = src_addr->sin_port;
+ rs.rs_conn_addr = dst_addr->sin_addr.s_addr;
+ rs.rs_conn_port = dst_addr->sin_port;
+
+ rc = rds_iw_get_device(&rs, &rds_iwdev_old, &pcm_id);
if (rc)
rds_iw_remove_cm_id(rds_iwdev, cm_id);
BUG_ON(list_empty(&ic->iw_node));
list_del(&ic->iw_node);
- spin_lock(&rds_iwdev->spinlock);
+ spin_lock_irq(&rds_iwdev->spinlock);
list_add_tail(&ic->iw_node, &rds_iwdev->conn_list);
- spin_unlock(&rds_iwdev->spinlock);
+ spin_unlock_irq(&rds_iwdev->spinlock);
spin_unlock_irq(&iw_nodev_conns_lock);
ic->rds_iwdev = rds_iwdev;
INIT_LIST_HEAD(list);
spin_unlock_irq(list_lock);
- list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node)
+ list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) {
+ if (ic->conn->c_passive)
+ rds_conn_destroy(ic->conn->c_passive);
rds_conn_destroy(ic->conn);
+ }
}
static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg,
}
static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
- struct rds_iw_scatterlist *sg)
+ struct rds_iw_scatterlist *sg,
+ unsigned int dma_page_shift)
{
struct ib_device *dev = rds_iwdev->dev;
u64 *dma_pages = NULL;
+ u64 dma_mask;
+ unsigned int dma_page_size;
int i, j, ret;
+ dma_page_size = 1 << dma_page_shift;
+ dma_mask = dma_page_size - 1;
+
WARN_ON(sg->dma_len);
sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
sg->bytes += dma_len;
end_addr = dma_addr + dma_len;
- if (dma_addr & PAGE_MASK) {
+ if (dma_addr & dma_mask) {
if (i > 0)
goto out_unmap;
- dma_addr &= ~PAGE_MASK;
+ dma_addr &= ~dma_mask;
}
- if (end_addr & PAGE_MASK) {
+ if (end_addr & dma_mask) {
if (i < sg->dma_len - 1)
goto out_unmap;
- end_addr = (end_addr + PAGE_MASK) & ~PAGE_MASK;
+ end_addr = (end_addr + dma_mask) & ~dma_mask;
}
- sg->dma_npages += (end_addr - dma_addr) >> PAGE_SHIFT;
+ sg->dma_npages += (end_addr - dma_addr) >> dma_page_shift;
}
/* Now gather the dma addrs into one list */
u64 end_addr;
end_addr = dma_addr + dma_len;
- dma_addr &= ~PAGE_MASK;
- for (; dma_addr < end_addr; dma_addr += PAGE_SIZE)
+ dma_addr &= ~dma_mask;
+ for (; dma_addr < end_addr; dma_addr += dma_page_size)
dma_pages[j++] = dma_addr;
BUG_ON(j > sg->dma_npages);
}
}
}
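/*
 * The mapping code above rounds the first chunk's start down and the
 * last chunk's end up to the device's DMA page size before counting
 * pages.  A standalone sketch of that arithmetic, assuming a single
 * contiguous chunk:
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t dma_pages_needed(uint64_t addr, uint64_t len,
				 unsigned int page_shift)
{
	uint64_t mask = (1ULL << page_shift) - 1;
	uint64_t start = addr & ~mask;			/* round down */
	uint64_t end = (addr + len + mask) & ~mask;	/* round up */

	return (end - start) >> page_shift;
}

int main(void)
{
	/* 8 KB starting 512 bytes into a 4 KB page spans 3 pages */
	printf("%llu\n",
	       (unsigned long long)dma_pages_needed(0x1200, 8192, 12));
	return 0;
}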
+static inline unsigned int rds_iw_flush_goal(struct rds_iw_mr_pool *pool, int free_all)
+{
+ unsigned int item_count;
+
+ item_count = atomic_read(&pool->item_count);
+ if (free_all)
+ return item_count;
+
+ return 0;
+}
+
/*
* Flush our pool of MRs.
* At a minimum, all currently unused MRs are unmapped.
LIST_HEAD(unmap_list);
LIST_HEAD(kill_list);
unsigned long flags;
- unsigned int nfreed = 0, ncleaned = 0, unpinned = 0;
+ unsigned int nfreed = 0, ncleaned = 0, free_goal;
int ret = 0;
rds_iw_stats_inc(s_iw_rdma_mr_pool_flush);
list_splice_init(&pool->clean_list, &kill_list);
spin_unlock_irqrestore(&pool->list_lock, flags);
+ free_goal = rds_iw_flush_goal(pool, free_all);
+
/* Batched invalidate of dirty MRs.
* For FMR based MRs, the mappings on the unmap list are
* actually members of an ibmr (ibmr->mapping). They either
* will be destroyed by the unmap function.
*/
if (!list_empty(&unmap_list)) {
- ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list,
- &kill_list, &unpinned);
+ ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, &kill_list);
/* If we've been asked to destroy all MRs, move those
* that were simply cleaned to the kill list */
if (free_all)
spin_unlock_irqrestore(&pool->list_lock, flags);
}
- atomic_sub(unpinned, &pool->free_pinned);
atomic_sub(ncleaned, &pool->dirty_count);
atomic_sub(nfreed, &pool->item_count);
rds_iw_free_fastreg(pool, ibmr);
/* If we've pinned too many pages, request a flush */
- if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
- atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned
+ || atomic_read(&pool->dirty_count) >= pool->max_items / 10)
queue_work(rds_wq, &pool->flush_worker);
if (invalidate) {
struct rds_iw_device *rds_iwdev;
struct rds_iw_mr *ibmr = NULL;
struct rdma_cm_id *cm_id;
- struct sockaddr_in src = {
- .sin_addr.s_addr = rs->rs_bound_addr,
- .sin_port = rs->rs_bound_port,
- };
- struct sockaddr_in dst = {
- .sin_addr.s_addr = rs->rs_conn_addr,
- .sin_port = rs->rs_conn_port,
- };
int ret;
- ret = rds_iw_get_device(&src, &dst, &rds_iwdev, &cm_id);
+ ret = rds_iw_get_device(rs, &rds_iwdev, &cm_id);
if (ret || !cm_id) {
ret = -ENODEV;
goto out;
f_wr.wr.fast_reg.rkey = mapping->m_rkey;
f_wr.wr.fast_reg.page_list = ibmr->page_list;
f_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_len;
- f_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+ f_wr.wr.fast_reg.page_shift = ibmr->device->page_shift;
f_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_READ |
IB_ACCESS_REMOTE_WRITE;
failed_wr = &f_wr;
ret = ib_post_send(ibmr->cm_id->qp, &f_wr, &failed_wr);
BUG_ON(failed_wr != &f_wr);
- if (ret)
- printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
+ if (ret && printk_ratelimit())
+ printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
__func__, __LINE__, ret);
return ret;
}
failed_wr = &s_wr;
ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr);
- if (ret) {
- printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
+ if (ret && printk_ratelimit()) {
+ printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
__func__, __LINE__, ret);
goto out;
}
rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len);
- dma_pages = rds_iw_map_scatterlist(rds_iwdev, &mapping->m_sg);
+ dma_pages = rds_iw_map_scatterlist(rds_iwdev,
+ &mapping->m_sg,
+ rds_iwdev->page_shift);
if (IS_ERR(dma_pages)) {
ret = PTR_ERR(dma_pages);
dma_pages = NULL;
static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
struct list_head *unmap_list,
- struct list_head *kill_list,
- int *unpinned)
+ struct list_head *kill_list)
{
struct rds_iw_mapping *mapping, *next;
unsigned int ncleaned = 0;
spin_lock_irqsave(&pool->list_lock, flags);
list_for_each_entry_safe(mapping, next, unmap_list, m_list) {
- *unpinned += mapping->m_sg.len;
list_move(&mapping->m_list, &laundered);
ncleaned++;
}
*
*/
#include <linux/kernel.h>
-#include <linux/slab.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <rdma/rdma_cm.h>
static void rds_iw_frag_free(struct rds_page_frag *frag)
{
rdsdebug("frag %p page %p\n", frag, frag->f_page);
- BUG_ON(frag->f_page);
+ BUG_ON(frag->f_page != NULL);
kmem_cache_free(rds_iw_frag_slab, frag);
}
struct ib_sge *sge;
int ret = -ENOMEM;
- if (!recv->r_iwinc) {
- if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) {
+ if (recv->r_iwinc == NULL) {
+ if (atomic_read(&rds_iw_allocation) >= rds_iw_sysctl_max_recv_allocation) {
rds_iw_stats_inc(s_iw_rx_alloc_limit);
goto out;
}
recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
kptr_gfp);
- if (!recv->r_iwinc) {
- atomic_dec(&rds_iw_allocation);
+ if (recv->r_iwinc == NULL)
goto out;
- }
+ atomic_inc(&rds_iw_allocation);
INIT_LIST_HEAD(&recv->r_iwinc->ii_frags);
rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
}
- if (!recv->r_frag) {
+ if (recv->r_frag == NULL) {
recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
- if (!recv->r_frag)
+ if (recv->r_frag == NULL)
goto out;
INIT_LIST_HEAD(&recv->r_frag->f_item);
recv->r_frag->f_page = NULL;
}
- if (!ic->i_frag.f_page) {
+ if (ic->i_frag.f_page == NULL) {
ic->i_frag.f_page = alloc_page(page_gfp);
- if (!ic->i_frag.f_page)
+ if (ic->i_frag.f_page == NULL)
goto out;
ic->i_frag.f_offset = 0;
}
int ret = 0;
u32 pos;
- while ((prefill || rds_conn_up(conn)) &&
- rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
+ while ((prefill || rds_conn_up(conn))
+ && rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
if (pos >= ic->i_recv_ring.w_nr) {
printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
pos);
return ret;
}
-static void rds_iw_inc_purge(struct rds_incoming *inc)
+void rds_iw_inc_purge(struct rds_incoming *inc)
{
struct rds_iw_incoming *iwinc;
struct rds_page_frag *frag;
BUG_ON(atomic_read(&rds_iw_allocation) < 0);
}
-int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
+int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
+ size_t size)
{
struct rds_iw_incoming *iwinc;
struct rds_page_frag *frag;
+ struct iovec *iov = first_iov;
unsigned long to_copy;
unsigned long frag_off = 0;
+ unsigned long iov_off = 0;
int copied = 0;
int ret;
u32 len;
frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
len = be32_to_cpu(inc->i_hdr.h_len);
- while (iov_iter_count(to) && copied < len) {
+ while (copied < size && copied < len) {
if (frag_off == RDS_FRAG_SIZE) {
frag = list_entry(frag->f_item.next,
struct rds_page_frag, f_item);
frag_off = 0;
}
- to_copy = min_t(unsigned long, iov_iter_count(to),
- RDS_FRAG_SIZE - frag_off);
+ while (iov_off == iov->iov_len) {
+ iov_off = 0;
+ iov++;
+ }
+
+ to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
+ to_copy = min_t(size_t, to_copy, size - copied);
to_copy = min_t(unsigned long, to_copy, len - copied);
+ rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
+ "[%p, %lu] + %lu\n",
+ to_copy, iov->iov_base, iov->iov_len, iov_off,
+ frag->f_page, frag->f_offset, frag_off);
+
/* XXX needs + offset for multiple recvs per page */
- rds_stats_add(s_copy_to_user, to_copy);
- ret = copy_page_to_iter(frag->f_page,
- frag->f_offset + frag_off,
- to_copy,
- to);
- if (ret != to_copy)
- return -EFAULT;
+ ret = rds_page_copy_to_user(frag->f_page,
+ frag->f_offset + frag_off,
+ iov->iov_base + iov_off,
+ to_copy);
+ if (ret) {
+ copied = ret;
+ break;
+ }
+ iov_off += to_copy;
frag_off += to_copy;
copied += to_copy;
}
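/*
 * A simplified model of the copy loop above: one cursor walks the
 * RDS_FRAG_SIZE fragments, another walks the user's iovec array, and
 * each step copies the largest run that fits the current fragment, the
 * current iovec, and the remaining message length.  frags[] is assumed
 * to contain enough fragments to cover msg_len.
 */
#include <stddef.h>
#include <string.h>
#include <sys/uio.h>

#define FRAG_SIZE 4096		/* stands in for RDS_FRAG_SIZE */

static size_t copy_frags_to_iov(char *frags[], size_t msg_len,
				struct iovec *iov, int iovcnt)
{
	size_t copied = 0, frag_off = 0, iov_off = 0;
	int f = 0, i = 0;

	while (copied < msg_len && i < iovcnt) {
		size_t to_copy;

		if (frag_off == FRAG_SIZE) {	/* next fragment */
			f++;
			frag_off = 0;
		}
		while (i < iovcnt && iov_off == iov[i].iov_len) {
			i++;			/* next iovec */
			iov_off = 0;
		}
		if (i == iovcnt)
			break;

		to_copy = iov[i].iov_len - iov_off;
		if (to_copy > FRAG_SIZE - frag_off)
			to_copy = FRAG_SIZE - frag_off;
		if (to_copy > msg_len - copied)
			to_copy = msg_len - copied;

		memcpy((char *)iov[i].iov_base + iov_off,
		       frags[f] + frag_off, to_copy);
		iov_off += to_copy;
		frag_off += to_copy;
		copied += to_copy;
	}
	return copied;
}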
{
atomic64_set(&ic->i_ack_next, seq);
if (ack_required) {
- smp_mb__before_atomic();
+ smp_mb__before_clear_bit();
set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
}
}
static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
{
clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
- smp_mb__after_atomic();
+ smp_mb__after_clear_bit();
return atomic64_read(&ic->i_ack_next);
}
set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
rds_iw_stats_inc(s_iw_ack_send_failure);
-
- rds_iw_conn_error(ic->conn, "sending ack failed\n");
+ /* Need to finesse this later. */
+ BUG();
} else
rds_iw_stats_inc(s_iw_ack_sent);
}
}
/* Can we get a send credit? */
- if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
+ if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0)) {
rds_iw_stats_inc(s_iw_tx_throttle);
clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
return;
to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
- addr = kmap_atomic(frag->f_page);
+ addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
src = addr + frag_off;
dst = (void *)map->m_page_addrs[map_page] + map_off;
uncongested |= ~(*src) & *dst;
*dst++ = *src++;
}
- kunmap_atomic(addr);
+ kunmap_atomic(addr, KM_SOFTIRQ0);
copied += to_copy;
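/*
 * A tiny standalone illustration of the bit trick in the copy loop
 * above: while overwriting the stored congestion map with the update,
 * ~new & old picks out exactly the ports that went from congested to
 * uncongested, so waiting senders can be woken afterwards.  The values
 * are made up for the example.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t stored = 0xf000000000000003ULL;	/* old map word */
	uint64_t update = 0x3000000000000001ULL;	/* new map word */
	uint64_t uncongested = ~update & stored;

	stored = update;				/* the copy itself */
	printf("stored %#llx, newly uncongested %#llx\n",
	       (unsigned long long)stored,
	       (unsigned long long)uncongested);
	return 0;
}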
if (byte_len < sizeof(struct rds_header)) {
rds_iw_conn_error(conn, "incoming message "
- "from %pI4 didn't include a "
+ "from %pI4 didn't include a "
"header, disconnecting and "
"reconnecting\n",
&conn->c_faddr);
* into the inc and save the inc so we can hang upcoming fragments
* off its list.
*/
- if (!iwinc) {
+ if (iwinc == NULL) {
iwinc = recv->r_iwinc;
recv->r_iwinc = NULL;
ic->i_iwinc = iwinc;
hdr = &iwinc->ii_inc.i_hdr;
/* We can't just use memcmp here; fragments of a
* single message may carry different ACKs */
- if (hdr->h_sequence != ihdr->h_sequence ||
- hdr->h_len != ihdr->h_len ||
- hdr->h_sport != ihdr->h_sport ||
- hdr->h_dport != ihdr->h_dport) {
+ if (hdr->h_sequence != ihdr->h_sequence
+ || hdr->h_len != ihdr->h_len
+ || hdr->h_sport != ihdr->h_sport
+ || hdr->h_dport != ihdr->h_dport) {
rds_iw_conn_error(conn,
"fragment header mismatch; forcing reconnect\n");
return;
rds_iw_cong_recv(conn, iwinc);
else {
rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
- &iwinc->ii_inc, GFP_ATOMIC);
+ &iwinc->ii_inc, GFP_ATOMIC,
+ KM_SOFTIRQ0);
state->ack_next = be64_to_cpu(hdr->h_sequence);
state->ack_next_valid = 1;
}
{
struct rds_connection *conn = context;
struct rds_iw_connection *ic = conn->c_transport_data;
+ struct ib_wc wc;
+ struct rds_iw_ack_state state = { 0, };
+ struct rds_iw_recv_work *recv;
rdsdebug("conn %p cq %p\n", conn, cq);
rds_iw_stats_inc(s_iw_rx_cq_call);
- tasklet_schedule(&ic->i_recv_tasklet);
-}
-
-static inline void rds_poll_cq(struct rds_iw_connection *ic,
- struct rds_iw_ack_state *state)
-{
- struct rds_connection *conn = ic->conn;
- struct ib_wc wc;
- struct rds_iw_recv_work *recv;
+ ib_req_notify_cq(cq, IB_CQ_SOLICITED);
- while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
+ while (ib_poll_cq(cq, 1, &wc) > 0) {
rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
(unsigned long long)wc.wr_id, wc.status, wc.byte_len,
be32_to_cpu(wc.ex.imm_data));
if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
/* We expect errors as the qp is drained during shutdown */
if (wc.status == IB_WC_SUCCESS) {
- rds_iw_process_recv(conn, recv, wc.byte_len, state);
+ rds_iw_process_recv(conn, recv, wc.byte_len, &state);
} else {
rds_iw_conn_error(conn, "recv completion on "
"%pI4 had status %u, disconnecting and "
rds_iw_ring_free(&ic->i_recv_ring, 1);
}
-}
-
-void rds_iw_recv_tasklet_fn(unsigned long data)
-{
- struct rds_iw_connection *ic = (struct rds_iw_connection *) data;
- struct rds_connection *conn = ic->conn;
- struct rds_iw_ack_state state = { 0, };
-
- rds_poll_cq(ic, &state);
- ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
- rds_poll_cq(ic, &state);
if (state.ack_next_valid)
rds_iw_set_ack(ic, state.ack_next, state.ack_required);
return ret;
}
-int rds_iw_recv_init(void)
+int __init rds_iw_recv_init(void)
{
struct sysinfo si;
int ret = -ENOMEM;
rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
sizeof(struct rds_iw_incoming),
0, 0, NULL);
- if (!rds_iw_incoming_slab)
+ if (rds_iw_incoming_slab == NULL)
goto out;
rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
sizeof(struct rds_page_frag),
0, 0, NULL);
- if (!rds_iw_frag_slab)
+ if (rds_iw_frag_slab == NULL)
kmem_cache_destroy(rds_iw_incoming_slab);
else
ret = 0;
int rds_iw_ring_low(struct rds_iw_work_ring *ring)
{
- return __rds_iw_ring_used(ring) <= (ring->w_nr >> 1);
+ return __rds_iw_ring_used(ring) <= (ring->w_nr >> 2);
}
#include <linux/in.h>
#include <linux/device.h>
#include <linux/dmapool.h>
-#include <linux/ratelimit.h>
#include "rds.h"
+#include "rdma.h"
#include "iw.h"
static void rds_iw_send_rdma_complete(struct rds_message *rm,
}
static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic,
- struct rm_rdma_op *op)
+ struct rds_rdma_op *op)
{
- if (op->op_mapped) {
+ if (op->r_mapped) {
ib_dma_unmap_sg(ic->i_cm_id->device,
- op->op_sg, op->op_nents,
- op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
- op->op_mapped = 0;
+ op->r_sg, op->r_nents,
+ op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ op->r_mapped = 0;
}
}
rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
ib_dma_unmap_sg(ic->i_cm_id->device,
- rm->data.op_sg, rm->data.op_nents,
+ rm->m_sg, rm->m_nents,
DMA_TO_DEVICE);
- if (rm->rdma.op_active) {
- rds_iw_send_unmap_rdma(ic, &rm->rdma);
+ if (rm->m_rdma_op != NULL) {
+ rds_iw_send_unmap_rdma(ic, rm->m_rdma_op);
/* If the user asked for a completion notification on this
* message, we can implement three different semantics:
*/
rds_iw_send_rdma_complete(rm, wc_status);
- if (rm->rdma.op_write)
- rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes);
+ if (rm->m_rdma_op->r_write)
+ rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
else
- rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes);
+ rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
}
/* If anyone waited for this message to get flushed out, wake
}
if (wc.wr_id == RDS_IW_ACK_WR_ID) {
- if (time_after(jiffies, ic->i_ack_queued + HZ/2))
+ if (ic->i_ack_queued + HZ/2 < jiffies)
rds_iw_stats_inc(s_iw_tx_stalled);
rds_iw_ack_send_complete(ic);
continue;
* when the SEND completes. */
break;
default:
- printk_ratelimited(KERN_NOTICE
+ if (printk_ratelimit())
+ printk(KERN_NOTICE
"RDS/IW: %s: unexpected opcode 0x%x in WR!\n",
__func__, send->s_wr.opcode);
break;
send->s_wr.opcode = 0xdead;
send->s_wr.num_sge = 1;
- if (time_after(jiffies, send->s_queued + HZ/2))
+ if (send->s_queued + HZ/2 < jiffies)
rds_iw_stats_inc(s_iw_tx_stalled);
/* If a RDMA operation produced an error, signal this right
rds_iw_ring_free(&ic->i_send_ring, completed);
- if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
- test_bit(0, &conn->c_map_queued))
+ if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)
+ || test_bit(0, &conn->c_map_queued))
queue_delayed_work(rds_wq, &conn->c_send_w, 0);
/* We expect errors as the qp is drained during shutdown */
*
* Conceptually, we have two counters:
* - send credits: this tells us how many WRs we're allowed
- * to submit without overruning the receiver's queue. For
+ * to submit without overrunning the receiver's queue. For
* each SEND WR we post, we decrement this by one.
*
* - posted credits: this tells us how many WRs we recently
* and using atomic_cmpxchg when updating the two counters.
*/
int rds_iw_send_grab_credits(struct rds_iw_connection *ic,
- u32 wanted, u32 *adv_credits, int need_posted, int max_posted)
+ u32 wanted, u32 *adv_credits, int need_posted)
{
unsigned int avail, posted, got = 0, advertise;
long oldval, newval;
posted = IB_GET_POST_CREDITS(oldval);
avail = IB_GET_SEND_CREDITS(oldval);
- rdsdebug("wanted=%u credits=%u posted=%u\n",
+ rdsdebug("rds_iw_send_grab_credits(%u): credits=%u posted=%u\n",
wanted, avail, posted);
/* The last credit must be used to send a credit update. */
* available.
*/
if (posted && (got || need_posted)) {
- advertise = min_t(unsigned int, posted, max_posted);
+ advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT);
newval -= IB_SET_POST_CREDITS(advertise);
}
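
Both counters described above live in a single atomic word so that one atomic_cmpxchg() can consume send credits without losing a concurrent update to the posted count. A minimal sketch of that pattern, assuming an invented 16/16-bit layout and helper macros (the driver's real IB_GET_*/IB_SET_* macros and field widths differ):

/* Illustrative only: send credits in the low 16 bits, posted credits in
 * the high 16 bits of a single atomic_t. */
#include <linux/kernel.h>
#include <asm/atomic.h>

#define EX_SEND_CREDITS(v)	((v) & 0xffff)
#define EX_POST_CREDITS(v)	(((v) >> 16) & 0xffff)
#define EX_PACK_CREDITS(s, p)	((s) | ((p) << 16))

static int ex_grab_send_credits(atomic_t *credits, int wanted)
{
	int oldval, newval, got;

	do {
		oldval = atomic_read(credits);
		got = min(wanted, EX_SEND_CREDITS(oldval));
		/* take 'got' send credits, leave the posted count untouched */
		newval = EX_PACK_CREDITS(EX_SEND_CREDITS(oldval) - got,
					 EX_POST_CREDITS(oldval));
	} while (atomic_cmpxchg(credits, oldval, newval) != oldval);

	return got;
}
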
if (credits == 0)
return;
- rdsdebug("credits=%u current=%u%s\n",
+ rdsdebug("rds_iw_send_add_credits(%u): current=%u%s\n",
credits,
IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
/* Fastreg support */
- if (rds_rdma_cookie_key(rm->m_rdma_cookie) && !ic->i_fastreg_posted) {
+ if (rds_rdma_cookie_key(rm->m_rdma_cookie)
+ && !ic->i_fastreg_posted) {
ret = -EAGAIN;
goto out;
}
credit_alloc = work_alloc;
if (ic->i_flowctl) {
- credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
+ credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0);
adv_credits += posted;
if (credit_alloc < work_alloc) {
rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
flow_controlled++;
}
if (work_alloc == 0) {
- set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
rds_iw_stats_inc(s_iw_tx_throttle);
ret = -ENOMEM;
goto out;
}
/* map the message the first time we see it */
- if (!ic->i_rm) {
+ if (ic->i_rm == NULL) {
/*
printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n",
be16_to_cpu(rm->m_inc.i_hdr.h_dport),
rm->m_inc.i_hdr.h_flags,
be32_to_cpu(rm->m_inc.i_hdr.h_len));
*/
- if (rm->data.op_nents) {
- rm->data.op_count = ib_dma_map_sg(dev,
- rm->data.op_sg,
- rm->data.op_nents,
- DMA_TO_DEVICE);
- rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
- if (rm->data.op_count == 0) {
+ if (rm->m_nents) {
+ rm->m_count = ib_dma_map_sg(dev,
+ rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
+ rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
+ if (rm->m_count == 0) {
rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
ret = -ENOMEM; /* XXX ? */
goto out;
}
} else {
- rm->data.op_count = 0;
+ rm->m_count = 0;
}
ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
/* If it has a RDMA op, tell the peer we did it. This is
* used by the peer to release use-once RDMA MRs. */
- if (rm->rdma.op_active) {
+ if (rm->m_rdma_op) {
struct rds_ext_header_rdma ext_hdr;
- ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
+ ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
}
/*
* Update adv_credits since we reset the ACK_REQUIRED bit.
*/
- rds_iw_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
+ rds_iw_send_grab_credits(ic, 0, &posted, 1);
adv_credits += posted;
BUG_ON(adv_credits > 255);
- }
+ } else if (ic->i_rm != rm)
+ BUG();
send = &ic->i_sends[pos];
first = send;
prev = NULL;
- scat = &rm->data.op_sg[sg];
+ scat = &rm->m_sg[sg];
sent = 0;
i = 0;
* or when requested by the user. Right now, we let
* the application choose.
*/
- if (rm->rdma.op_active && rm->rdma.op_fence)
+ if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
send_flags = IB_SEND_FENCE;
/*
}
/* if there's data reference it with a chain of work reqs */
- for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) {
+ for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
unsigned int len;
send = &ic->i_sends[pos];
sent += sizeof(struct rds_header);
/* if we finished the message then send completion owns it */
- if (scat == &rm->data.op_sg[rm->data.op_count]) {
+ if (scat == &rm->m_sg[rm->m_count]) {
prev->s_rm = ic->i_rm;
prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
ic->i_rm = NULL;
send->s_wr.wr.fast_reg.rkey = send->s_mr->rkey;
send->s_wr.wr.fast_reg.page_list = send->s_page_list;
send->s_wr.wr.fast_reg.page_list_len = nent;
- send->s_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+ send->s_wr.wr.fast_reg.page_shift = rds_iwdev->page_shift;
send->s_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE;
send->s_wr.wr.fast_reg.iova_start = sg_addr;
ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
}
-int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
{
struct rds_iw_connection *ic = conn->c_transport_data;
struct rds_iw_send_work *send = NULL;
struct rds_iw_device *rds_iwdev;
struct scatterlist *scat;
unsigned long len;
- u64 remote_addr = op->op_remote_addr;
+ u64 remote_addr = op->r_remote_addr;
u32 pos, fr_pos;
u32 work_alloc;
u32 i;
rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
/* map the message the first time we see it */
- if (!op->op_mapped) {
- op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
- op->op_sg, op->op_nents, (op->op_write) ?
- DMA_TO_DEVICE : DMA_FROM_DEVICE);
- rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
- if (op->op_count == 0) {
+ if (!op->r_mapped) {
+ op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
+ op->r_sg, op->r_nents, (op->r_write) ?
+ DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
+ if (op->r_count == 0) {
rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
ret = -ENOMEM; /* XXX ? */
goto out;
}
- op->op_mapped = 1;
+ op->r_mapped = 1;
}
- if (!op->op_write) {
+ if (!op->r_write) {
/* Alloc space on the send queue for the fastreg */
work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos);
if (work_alloc != 1) {
* Instead of knowing how to return a partial rdma read/write we insist that there
* be enough work requests to send the entire message.
*/
- i = ceil(op->op_count, rds_iwdev->max_sge);
+ i = ceil(op->r_count, rds_iwdev->max_sge);
work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
if (work_alloc != i) {
}
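
As a concrete illustration of the all-or-nothing sizing above (numbers invented for the example): an op mapped to op->r_count == 10 scatterlist entries on a device with max_sge == 4 needs ceil(10, 4) == 3 work requests, and if rds_iw_ring_alloc() cannot hand back all 3 slots at once the attempt is abandoned rather than sending a partial RDMA.
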
send = &ic->i_sends[pos];
- if (!op->op_write) {
+ if (!op->r_write) {
first = prev = &ic->i_sends[fr_pos];
} else {
first = send;
prev = NULL;
}
- scat = &op->op_sg[0];
+ scat = &op->r_sg[0];
sent = 0;
- num_sge = op->op_count;
+ num_sge = op->r_count;
- for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
+ for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
send->s_wr.send_flags = 0;
send->s_queued = jiffies;
* for local access after RDS is finished with it, using
* IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed.
*/
- if (op->op_write)
+ if (op->r_write)
send->s_wr.opcode = IB_WR_RDMA_WRITE;
else
send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
send->s_wr.wr.rdma.remote_addr = remote_addr;
- send->s_wr.wr.rdma.rkey = op->op_rkey;
+ send->s_wr.wr.rdma.rkey = op->r_key;
send->s_op = op;
if (num_sge > rds_iwdev->max_sge) {
if (prev)
prev->s_wr.next = &send->s_wr;
- for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
+ for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
len = ib_sg_dma_len(ic->i_cm_id->device, scat);
if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV)
}
/* if we finished the message then send completion owns it */
- if (scat == &op->op_sg[op->op_count])
+ if (scat == &op->r_sg[op->r_count])
first->s_wr.send_flags = IB_SEND_SIGNALED;
if (i < work_alloc) {
* adapters do not allow using the lkey for this at all. To bypass this use a
* fastreg_mr (or possibly a dma_mr)
*/
- if (!op->op_write) {
+ if (!op->r_write) {
rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos],
- op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
+ op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
work_alloc++;
}
#include "rds.h"
#include "iw.h"
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats);
+DEFINE_PER_CPU(struct rds_iw_statistics, rds_iw_stats) ____cacheline_aligned;
-static const char *const rds_iw_stat_names[] = {
+static char *rds_iw_stat_names[] = {
"iw_connect_raced",
"iw_listen_closed_stale",
"iw_tx_cq_call",
unsigned int rds_iw_sysctl_flow_control = 1;
-static struct ctl_table rds_iw_sysctl_table[] = {
+ctl_table rds_iw_sysctl_table[] = {
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "max_send_wr",
.data = &rds_iw_sysctl_max_send_wr,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
+ .proc_handler = &proc_doulongvec_minmax,
.extra1 = &rds_iw_sysctl_max_wr_min,
.extra2 = &rds_iw_sysctl_max_wr_max,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "max_recv_wr",
.data = &rds_iw_sysctl_max_recv_wr,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
+ .proc_handler = &proc_doulongvec_minmax,
.extra1 = &rds_iw_sysctl_max_wr_min,
.extra2 = &rds_iw_sysctl_max_wr_max,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "max_unsignaled_wr",
.data = &rds_iw_sysctl_max_unsig_wrs,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
+ .proc_handler = &proc_doulongvec_minmax,
.extra1 = &rds_iw_sysctl_max_unsig_wr_min,
.extra2 = &rds_iw_sysctl_max_unsig_wr_max,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "max_unsignaled_bytes",
.data = &rds_iw_sysctl_max_unsig_bytes,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
+ .proc_handler = &proc_doulongvec_minmax,
.extra1 = &rds_iw_sysctl_max_unsig_bytes_min,
.extra2 = &rds_iw_sysctl_max_unsig_bytes_max,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "max_recv_allocation",
.data = &rds_iw_sysctl_max_recv_allocation,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
+ .proc_handler = &proc_doulongvec_minmax,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "flow_control",
.data = &rds_iw_sysctl_flow_control,
.maxlen = sizeof(rds_iw_sysctl_flow_control),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = &proc_dointvec,
},
+ { .ctl_name = 0}
+};
+
+static struct ctl_path rds_iw_sysctl_path[] = {
+ { .procname = "net", .ctl_name = CTL_NET, },
+ { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
+ { .procname = "iw", .ctl_name = CTL_UNNUMBERED, },
{ }
};
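
For reference: registered under the { net, rds, iw } ctl_path above, each table entry appears as /proc/sys/net/rds/iw/<procname> (for example /proc/sys/net/rds/iw/flow_control); the CTL_UNNUMBERED ctl_name values simply mean the entries carry no binary sysctl(2) number.
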
void rds_iw_sysctl_exit(void)
{
- unregister_net_sysctl_table(rds_iw_sysctl_hdr);
+ if (rds_iw_sysctl_hdr)
+ unregister_sysctl_table(rds_iw_sysctl_hdr);
}
-int rds_iw_sysctl_init(void)
+int __init rds_iw_sysctl_init(void)
{
- rds_iw_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/iw", rds_iw_sysctl_table);
- if (!rds_iw_sysctl_hdr)
+ rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table);
+ if (rds_iw_sysctl_hdr == NULL)
return -ENOMEM;
return 0;
}
*
*/
#include <linux/kernel.h>
-#include <linux/slab.h>
#include <linux/in.h>
#include "rds.h"
unsigned int hdr_off, unsigned int sg,
unsigned int off)
{
- struct scatterlist *sgp = &rm->data.op_sg[sg];
- int ret = sizeof(struct rds_header) +
- be32_to_cpu(rm->m_inc.i_hdr.h_len);
-
- /* Do not send cong updates to loopback */
- if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
- rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
- ret = min_t(int, ret, sgp->length - conn->c_xmit_data_off);
- goto out;
- }
-
BUG_ON(hdr_off || sg || off);
rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
- /* For the embedded inc. Matching put is in loop_inc_free() */
- rds_message_addref(rm);
+ rds_message_addref(rm); /* for the inc */
rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
- GFP_KERNEL);
+ GFP_KERNEL, KM_USER0);
rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence),
NULL);
rds_inc_put(&rm->m_inc);
-out:
- return ret;
+
+ return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len);
}
-/*
- * See rds_loop_xmit(). Since our inc is embedded in the rm, we
- * make sure the rm lives at least until the inc is done.
- */
-static void rds_loop_inc_free(struct rds_incoming *inc)
+static int rds_loop_xmit_cong_map(struct rds_connection *conn,
+ struct rds_cong_map *map,
+ unsigned long offset)
{
- struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
- rds_message_put(rm);
+ unsigned long i;
+
+ BUG_ON(offset);
+ BUG_ON(map != conn->c_lcong);
+
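+ /* Loopback "transmits" the congestion bitmap by copying the local map's
+ * pages straight into our own copy of the peer's map (c_fcong) and then
+ * marking the entire map as updated. */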
+ for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
+ memcpy((void *)conn->c_fcong->m_page_addrs[i],
+ (void *)map->m_page_addrs[i], PAGE_SIZE);
+ }
+
+ rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
+
+ return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
}
/* we need to at least give the thread something to succeed */
struct rds_loop_connection *lc;
unsigned long flags;
- lc = kzalloc(sizeof(struct rds_loop_connection), gfp);
- if (!lc)
+ lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL);
+ if (lc == NULL)
return -ENOMEM;
INIT_LIST_HEAD(&lc->loop_node);
static void rds_loop_conn_free(void *arg)
{
struct rds_loop_connection *lc = arg;
- unsigned long flags;
-
rdsdebug("lc %p\n", lc);
- spin_lock_irqsave(&loop_conns_lock, flags);
list_del(&lc->loop_node);
- spin_unlock_irqrestore(&loop_conns_lock, flags);
kfree(lc);
}
*/
struct rds_transport rds_loop_transport = {
.xmit = rds_loop_xmit,
+ .xmit_cong_map = rds_loop_xmit_cong_map,
.recv = rds_loop_recv,
.conn_alloc = rds_loop_conn_alloc,
.conn_free = rds_loop_conn_free,
.conn_connect = rds_loop_conn_connect,
.conn_shutdown = rds_loop_conn_shutdown,
.inc_copy_to_user = rds_message_inc_copy_to_user,
- .inc_free = rds_loop_inc_free,
+ .inc_purge = rds_message_inc_purge,
+ .inc_free = rds_message_inc_free,
.t_name = "loopback",
};
*
*/
#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/export.h>
#include "rds.h"
+#include "rdma.h"
+
+static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq);
static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
[RDS_EXTHDR_NONE] = 0,
rdsdebug("addref rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
atomic_inc(&rm->m_refcount);
}
-EXPORT_SYMBOL_GPL(rds_message_addref);
/*
* This relies on dma_map_sg() not touching sg[].page during merging.
if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
return;
- for (i = 0; i < rm->data.op_nents; i++) {
- rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i]));
+ for (i = 0; i < rm->m_nents; i++) {
+ rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i]));
/* XXX will have to put_page for page refs */
- __free_page(sg_page(&rm->data.op_sg[i]));
+ __free_page(sg_page(&rm->m_sg[i]));
}
- rm->data.op_nents = 0;
+ rm->m_nents = 0;
- if (rm->rdma.op_active)
- rds_rdma_free_op(&rm->rdma);
- if (rm->rdma.op_rdma_mr)
- rds_mr_put(rm->rdma.op_rdma_mr);
+ if (rm->m_rdma_op)
+ rds_rdma_free_op(rm->m_rdma_op);
+ if (rm->m_rdma_mr)
+ rds_mr_put(rm->m_rdma_mr);
+}
- if (rm->atomic.op_active)
- rds_atomic_free_op(&rm->atomic);
- if (rm->atomic.op_rdma_mr)
- rds_mr_put(rm->atomic.op_rdma_mr);
+void rds_message_inc_purge(struct rds_incoming *inc)
+{
+ struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
+ rds_message_purge(rm);
}
void rds_message_put(struct rds_message *rm)
{
rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
- WARN(!atomic_read(&rm->m_refcount), "danger refcount zero on %p\n", rm);
+
if (atomic_dec_and_test(&rm->m_refcount)) {
BUG_ON(!list_empty(&rm->m_sock_item));
BUG_ON(!list_empty(&rm->m_conn_item));
kfree(rm);
}
}
-EXPORT_SYMBOL_GPL(rds_message_put);
+
+void rds_message_inc_free(struct rds_incoming *inc)
+{
+ struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
+ rds_message_put(rm);
+}
void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
__be16 dport, u64 seq)
hdr->h_sequence = cpu_to_be64(seq);
hdr->h_exthdr[0] = RDS_EXTHDR_NONE;
}
-EXPORT_SYMBOL_GPL(rds_message_populate_header);
-int rds_message_add_extension(struct rds_header *hdr, unsigned int type,
- const void *data, unsigned int len)
+int rds_message_add_extension(struct rds_header *hdr,
+ unsigned int type, const void *data, unsigned int len)
{
unsigned int ext_len = sizeof(u8) + len;
unsigned char *dst;
if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE)
return 0;
- if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type])
+ if (type >= __RDS_EXTHDR_MAX
+ || len != rds_exthdr_size[type])
return 0;
if (ext_len >= RDS_HEADER_EXT_SPACE)
dst[len] = RDS_EXTHDR_NONE;
return 1;
}
-EXPORT_SYMBOL_GPL(rds_message_add_extension);
/*
* If a message has extension headers, retrieve them here.
return RDS_EXTHDR_NONE;
}
+int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version)
+{
+ struct rds_ext_header_version ext_hdr;
+
+ ext_hdr.h_version = cpu_to_be32(version);
+ return rds_message_add_extension(hdr, RDS_EXTHDR_VERSION, &ext_hdr, sizeof(ext_hdr));
+}
+
+int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version)
+{
+ struct rds_ext_header_version ext_hdr;
+ unsigned int pos = 0, len = sizeof(ext_hdr);
+
+ /* We assume the version extension is the only one present */
+ if (rds_message_next_extension(hdr, &pos, &ext_hdr, &len) != RDS_EXTHDR_VERSION)
+ return 0;
+ *version = be32_to_cpu(ext_hdr.h_version);
+ return 1;
+}
+
int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset)
{
struct rds_ext_header_rdma_dest ext_hdr;
ext_hdr.h_rdma_offset = cpu_to_be32(offset);
return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr));
}
-EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);
-/*
- * Each rds_message is allocated with extra space for the scatterlist entries
- * rds ops will need. This is to minimize memory allocation count. Then, each rds op
- * can grab SGs when initializing its part of the rds_message.
- */
-struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp)
+struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp)
{
struct rds_message *rm;
- if (extra_len > KMALLOC_MAX_SIZE - sizeof(struct rds_message))
- return NULL;
-
- rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp);
+ rm = kzalloc(sizeof(struct rds_message) +
+ (nents * sizeof(struct scatterlist)), gfp);
if (!rm)
goto out;
- rm->m_used_sgs = 0;
- rm->m_total_sgs = extra_len / sizeof(struct scatterlist);
-
+ if (nents)
+ sg_init_table(rm->m_sg, nents);
atomic_set(&rm->m_refcount, 1);
INIT_LIST_HEAD(&rm->m_sock_item);
INIT_LIST_HEAD(&rm->m_conn_item);
spin_lock_init(&rm->m_rs_lock);
- init_waitqueue_head(&rm->m_flush_wait);
out:
return rm;
}
-/*
- * RDS ops use this to grab SG entries from the rm's sg pool.
- */
-struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents)
-{
- struct scatterlist *sg_first = (struct scatterlist *) &rm[1];
- struct scatterlist *sg_ret;
-
- WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs);
- WARN_ON(!nents);
-
- if (rm->m_used_sgs + nents > rm->m_total_sgs)
- return NULL;
-
- sg_ret = &sg_first[rm->m_used_sgs];
- sg_init_table(sg_ret, nents);
- rm->m_used_sgs += nents;
-
- return sg_ret;
-}
-
struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len)
{
struct rds_message *rm;
unsigned int i;
- int num_sgs = ceil(total_len, PAGE_SIZE);
- int extra_bytes = num_sgs * sizeof(struct scatterlist);
- rm = rds_message_alloc(extra_bytes, GFP_NOWAIT);
- if (!rm)
+ rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
+ if (rm == NULL)
return ERR_PTR(-ENOMEM);
set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
- rm->data.op_nents = ceil(total_len, PAGE_SIZE);
- rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
- if (!rm->data.op_sg) {
- rds_message_put(rm);
- return ERR_PTR(-ENOMEM);
- }
+ rm->m_nents = ceil(total_len, PAGE_SIZE);
- for (i = 0; i < rm->data.op_nents; ++i) {
- sg_set_page(&rm->data.op_sg[i],
+ for (i = 0; i < rm->m_nents; ++i) {
+ sg_set_page(&rm->m_sg[i],
virt_to_page(page_addrs[i]),
PAGE_SIZE, 0);
}
return rm;
}
-int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from)
+struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
+ size_t total_len)
{
- unsigned long to_copy, nbytes;
+ unsigned long to_copy;
+ unsigned long iov_off;
unsigned long sg_off;
+ struct rds_message *rm;
+ struct iovec *iov;
struct scatterlist *sg;
- int ret = 0;
+ int ret;
+
+ rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
+ if (rm == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
- rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from));
+ rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
/*
* now allocate and copy in the data payload.
*/
- sg = rm->data.op_sg;
+ sg = rm->m_sg;
+ iov = first_iov;
+ iov_off = 0;
sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
- while (iov_iter_count(from)) {
- if (!sg_page(sg)) {
- ret = rds_page_remainder_alloc(sg, iov_iter_count(from),
+ while (total_len) {
+ if (sg_page(sg) == NULL) {
+ ret = rds_page_remainder_alloc(sg, total_len,
GFP_HIGHUSER);
if (ret)
- return ret;
- rm->data.op_nents++;
+ goto out;
+ rm->m_nents++;
sg_off = 0;
}
- to_copy = min_t(unsigned long, iov_iter_count(from),
- sg->length - sg_off);
+ while (iov_off == iov->iov_len) {
+ iov_off = 0;
+ iov++;
+ }
+
+ to_copy = min(iov->iov_len - iov_off, sg->length - sg_off);
+ to_copy = min_t(size_t, to_copy, total_len);
+
+ rdsdebug("copying %lu bytes from user iov [%p, %zu] + %lu to "
+ "sg [%p, %u, %u] + %lu\n",
+ to_copy, iov->iov_base, iov->iov_len, iov_off,
+ (void *)sg_page(sg), sg->offset, sg->length, sg_off);
- rds_stats_add(s_copy_from_user, to_copy);
- nbytes = copy_page_from_iter(sg_page(sg), sg->offset + sg_off,
- to_copy, from);
- if (nbytes != to_copy)
- return -EFAULT;
+ ret = rds_page_copy_from_user(sg_page(sg), sg->offset + sg_off,
+ iov->iov_base + iov_off,
+ to_copy);
+ if (ret)
+ goto out;
+ iov_off += to_copy;
+ total_len -= to_copy;
sg_off += to_copy;
if (sg_off == sg->length)
sg++;
}
- return ret;
+ ret = 0;
+out:
+ if (ret) {
+ if (rm)
+ rds_message_put(rm);
+ rm = ERR_PTR(ret);
+ }
+ return rm;
}
-int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
+int rds_message_inc_copy_to_user(struct rds_incoming *inc,
+ struct iovec *first_iov, size_t size)
{
struct rds_message *rm;
+ struct iovec *iov;
struct scatterlist *sg;
unsigned long to_copy;
+ unsigned long iov_off;
unsigned long vec_off;
int copied;
int ret;
rm = container_of(inc, struct rds_message, m_inc);
len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
- sg = rm->data.op_sg;
+ iov = first_iov;
+ iov_off = 0;
+ sg = rm->m_sg;
vec_off = 0;
copied = 0;
- while (iov_iter_count(to) && copied < len) {
- to_copy = min_t(unsigned long, iov_iter_count(to),
- sg->length - vec_off);
+ while (copied < size && copied < len) {
+ while (iov_off == iov->iov_len) {
+ iov_off = 0;
+ iov++;
+ }
+
+ to_copy = min(iov->iov_len - iov_off, sg->length - vec_off);
+ to_copy = min_t(size_t, to_copy, size - copied);
to_copy = min_t(unsigned long, to_copy, len - copied);
- rds_stats_add(s_copy_to_user, to_copy);
- ret = copy_page_to_iter(sg_page(sg), sg->offset + vec_off,
- to_copy, to);
- if (ret != to_copy)
- return -EFAULT;
+ rdsdebug("copying %lu bytes to user iov [%p, %zu] + %lu to "
+ "sg [%p, %u, %u] + %lu\n",
+ to_copy, iov->iov_base, iov->iov_len, iov_off,
+ sg_page(sg), sg->offset, sg->length, vec_off);
+
+ ret = rds_page_copy_to_user(sg_page(sg), sg->offset + vec_off,
+ iov->iov_base + iov_off,
+ to_copy);
+ if (ret) {
+ copied = ret;
+ break;
+ }
+ iov_off += to_copy;
vec_off += to_copy;
copied += to_copy;
*/
void rds_message_wait(struct rds_message *rm)
{
- wait_event_interruptible(rm->m_flush_wait,
+ wait_event(rds_message_flush_waitq,
!test_bit(RDS_MSG_MAPPED, &rm->m_flags));
}
void rds_message_unmapped(struct rds_message *rm)
{
clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
- wake_up_interruptible(&rm->m_flush_wait);
+ if (waitqueue_active(&rds_message_flush_waitq))
+ wake_up(&rds_message_flush_waitq);
}
-EXPORT_SYMBOL_GPL(rds_message_unmapped);
*
*/
#include <linux/highmem.h>
-#include <linux/gfp.h>
-#include <linux/cpu.h>
-#include <linux/export.h>
#include "rds.h"
unsigned long r_offset;
};
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder,
- rds_page_remainders);
+DEFINE_PER_CPU(struct rds_page_remainder, rds_page_remainders) ____cacheline_aligned;
/*
* returns 0 on success or -errno on failure.
unsigned long ret;
void *addr;
- addr = kmap(page);
- if (to_user) {
+ if (to_user)
rds_stats_add(s_copy_to_user, bytes);
- ret = copy_to_user(ptr, addr + offset, bytes);
- } else {
+ else
rds_stats_add(s_copy_from_user, bytes);
- ret = copy_from_user(addr + offset, ptr, bytes);
+
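+ /* Fast path: atomic kmap and an in-atomic copy, which cannot sleep and
+ * so fails if the user page has to be faulted in. On failure, fall back
+ * below to a regular kmap() and a copy that is allowed to fault. */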
+ addr = kmap_atomic(page, KM_USER0);
+ if (to_user)
+ ret = __copy_to_user_inatomic(ptr, addr + offset, bytes);
+ else
+ ret = __copy_from_user_inatomic(addr + offset, ptr, bytes);
+ kunmap_atomic(addr, KM_USER0);
+
+ if (ret) {
+ addr = kmap(page);
+ if (to_user)
+ ret = copy_to_user(ptr, addr + offset, bytes);
+ else
+ ret = copy_from_user(addr + offset, ptr, bytes);
+ kunmap(page);
+ if (ret)
+ return -EFAULT;
}
- kunmap(page);
- return ret ? -EFAULT : 0;
+ return 0;
}
-EXPORT_SYMBOL_GPL(rds_page_copy_user);
-/**
- * rds_page_remainder_alloc - build up regions of a message.
+/*
+ * Message allocation uses this to build up regions of a message.
*
- * @scat: Scatter list for message
- * @bytes: the number of bytes needed.
- * @gfp: the waiting behaviour of the allocation
+ * @bytes - the number of bytes needed.
+ * @gfp - the waiting behaviour of the allocation
*
* @gfp is always ored with __GFP_HIGHMEM. Callers must be prepared to
* kmap the pages, etc.
/* jump straight to allocation if we're trying for a huge page */
if (bytes >= PAGE_SIZE) {
page = alloc_page(gfp);
- if (!page) {
+ if (page == NULL) {
ret = -ENOMEM;
} else {
sg_set_page(scat, page, PAGE_SIZE, 0);
rem = &per_cpu(rds_page_remainders, get_cpu());
local_irq_save(flags);
- if (!page) {
+ if (page == NULL) {
ret = -ENOMEM;
break;
}
ret ? 0 : scat->length);
return ret;
}
-EXPORT_SYMBOL_GPL(rds_page_remainder_alloc);
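
A minimal sketch of a caller of the allocator documented above (hypothetical helper, not part of this patch; it assumes the rds_page_remainder_alloc() declaration in rds.h): because the allocation is always made with __GFP_HIGHMEM, the page must be kmap()ed before it is touched.

#include <linux/highmem.h>
#include <linux/scatterlist.h>
#include <linux/string.h>
#include <linux/gfp.h>
#include "rds.h"

/* Ask for up to 'len' bytes in one scatterlist entry and zero whatever was
 * granted; sg->length may come back smaller than len, so real callers loop. */
static int ex_fill_frag(struct scatterlist *sg, unsigned long len)
{
	void *addr;
	int ret;

	ret = rds_page_remainder_alloc(sg, len, GFP_KERNEL);
	if (ret)
		return ret;

	addr = kmap(sg_page(sg));
	memset(addr + sg->offset, 0, sg->length);
	kunmap(sg_page(sg));
	return 0;
}
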
static int rds_page_remainder_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
*
*/
#include <linux/pagemap.h>
-#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
-#include "rds.h"
+#include "rdma.h"
/*
* XXX
{
struct rds_mr *mr;
struct rb_node *node;
- unsigned long flags;
/* Release any MRs associated with this socket */
- spin_lock_irqsave(&rs->rs_rdma_lock, flags);
while ((node = rb_first(&rs->rs_rdma_keys))) {
mr = container_of(node, struct rds_mr, r_rb_node);
if (mr->r_trans == rs->rs_transport)
mr->r_invalidate = 0;
- rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
- RB_CLEAR_NODE(&mr->r_rb_node);
- spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
- rds_destroy_mr(mr);
rds_mr_put(mr);
- spin_lock_irqsave(&rs->rs_rdma_lock, flags);
}
- spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
if (rs->rs_transport && rs->rs_transport->flush_mrs)
rs->rs_transport->flush_mrs();
{
int ret;
- ret = get_user_pages_fast(user_addr, nr_pages, write, pages);
+ down_read(&current->mm->mmap_sem);
+ ret = get_user_pages(current, current->mm, user_addr,
+ nr_pages, write, 0, pages, NULL);
+ up_read(&current->mm->mmap_sem);
- if (ret >= 0 && ret < nr_pages) {
+ if (0 <= ret && (unsigned) ret < nr_pages) {
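+ /* Partial pin: get_user_pages() gave us fewer pages than requested, so
+ * release the ones we did get and fail the whole mapping. */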
while (ret--)
put_page(pages[ret]);
ret = -EFAULT;
goto out;
}
- if (!rs->rs_transport->get_mr) {
+ if (rs->rs_transport->get_mr == NULL) {
ret = -EOPNOTSUPP;
goto out;
}
/* XXX clamp nr_pages to limit the size of this alloc? */
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
- if (!pages) {
+ if (pages == NULL) {
ret = -ENOMEM;
goto out;
}
mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
- if (!mr) {
+ if (mr == NULL) {
ret = -ENOMEM;
goto out;
}
* r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
* the zero page.
*/
- ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
+ ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1);
if (ret < 0)
goto out;
nents = ret;
sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
- if (!sg) {
+ if (sg == NULL) {
ret = -ENOMEM;
goto out;
}
return __rds_rdma_map(rs, &args, NULL, NULL);
}
-int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen)
-{
- struct rds_get_mr_for_dest_args args;
- struct rds_get_mr_args new_args;
-
- if (optlen != sizeof(struct rds_get_mr_for_dest_args))
- return -EINVAL;
-
- if (copy_from_user(&args, (struct rds_get_mr_for_dest_args __user *)optval,
- sizeof(struct rds_get_mr_for_dest_args)))
- return -EFAULT;
-
- /*
- * Initially, just behave like get_mr().
- * TODO: Implement get_mr as wrapper around this
- * and deprecate it.
- */
- new_args.vec = args.vec;
- new_args.cookie_addr = args.cookie_addr;
- new_args.flags = args.flags;
-
- return __rds_rdma_map(rs, &new_args, NULL, NULL);
-}
-
/*
* Free the MR indicated by the given R_Key
*/
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
- if (!mr) {
- printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key);
- spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
- return;
- }
-
- if (mr->r_use_once || force) {
+ if (mr && (mr->r_use_once || force)) {
rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
RB_CLEAR_NODE(&mr->r_rb_node);
zot_me = 1;
- }
+ } else if (mr)
+ atomic_inc(&mr->r_refcount);
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
/* May have to issue a dma_sync on this memory region.
* Note we could avoid this if the operation was a RDMA READ,
* but at this point we can't tell. */
- if (mr->r_trans->sync_mr)
- mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
-
- /* If the MR was marked as invalidate, this will
- * trigger an async flush. */
- if (zot_me)
- rds_destroy_mr(mr);
- rds_mr_put(mr);
+ if (mr != NULL) {
+ if (mr->r_trans->sync_mr)
+ mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
+
+ /* If the MR was marked as invalidate, this will
+ * trigger an async flush. */
+ if (zot_me)
+ rds_destroy_mr(mr);
+ rds_mr_put(mr);
+ }
}
-void rds_rdma_free_op(struct rm_rdma_op *ro)
+void rds_rdma_free_op(struct rds_rdma_op *ro)
{
unsigned int i;
- for (i = 0; i < ro->op_nents; i++) {
- struct page *page = sg_page(&ro->op_sg[i]);
+ for (i = 0; i < ro->r_nents; i++) {
+ struct page *page = sg_page(&ro->r_sg[i]);
/* Mark page dirty if it was possibly modified, which
* is the case for a RDMA_READ which copies from remote
* to local memory */
- if (!ro->op_write) {
- BUG_ON(irqs_disabled());
+ if (!ro->r_write)
set_page_dirty(page);
- }
put_page(page);
}
- kfree(ro->op_notifier);
- ro->op_notifier = NULL;
- ro->op_active = 0;
-}
-
-void rds_atomic_free_op(struct rm_atomic_op *ao)
-{
- struct page *page = sg_page(ao->op_sg);
-
- /* Mark page dirty if it was possibly modified, which
- * is the case for a RDMA_READ which copies from remote
- * to local memory */
- set_page_dirty(page);
- put_page(page);
-
- kfree(ao->op_notifier);
- ao->op_notifier = NULL;
- ao->op_active = 0;
+ kfree(ro->r_notifier);
+ kfree(ro);
}
-
/*
- * Count the number of pages needed to describe an incoming iovec array.
+ * args points to an in-kernel copy of the rds_rdma_args passed in the sendmsg cmsg.
*/
-static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs)
-{
- int tot_pages = 0;
- unsigned int nr_pages;
- unsigned int i;
-
- /* figure out the number of pages in the vector */
- for (i = 0; i < nr_iovecs; i++) {
- nr_pages = rds_pages_in_vec(&iov[i]);
- if (nr_pages == 0)
- return -EINVAL;
-
- tot_pages += nr_pages;
-
- /*
- * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
- * so tot_pages cannot overflow without first going negative.
- */
- if (tot_pages < 0)
- return -EINVAL;
- }
-
- return tot_pages;
-}
-
-int rds_rdma_extra_size(struct rds_rdma_args *args)
+static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
+ struct rds_rdma_args *args)
{
struct rds_iovec vec;
- struct rds_iovec __user *local_vec;
- int tot_pages = 0;
+ struct rds_rdma_op *op = NULL;
unsigned int nr_pages;
- unsigned int i;
-
- local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
-
- /* figure out the number of pages in the vector */
- for (i = 0; i < args->nr_local; i++) {
- if (copy_from_user(&vec, &local_vec[i],
- sizeof(struct rds_iovec)))
- return -EFAULT;
-
- nr_pages = rds_pages_in_vec(&vec);
- if (nr_pages == 0)
- return -EINVAL;
-
- tot_pages += nr_pages;
-
- /*
- * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
- * so tot_pages cannot overflow without first going negative.
- */
- if (tot_pages < 0)
- return -EINVAL;
- }
-
- return tot_pages * sizeof(struct scatterlist);
-}
-
-/*
- * The application asks for a RDMA transfer.
- * Extract all arguments and set up the rdma_op
- */
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
- struct cmsghdr *cmsg)
-{
- struct rds_rdma_args *args;
- struct rm_rdma_op *op = &rm->rdma;
- int nr_pages;
+ unsigned int max_pages;
unsigned int nr_bytes;
struct page **pages = NULL;
- struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack;
- int iov_size;
+ struct rds_iovec __user *local_vec;
+ struct scatterlist *sg;
+ unsigned int nr;
unsigned int i, j;
- int ret = 0;
-
- if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
- || rm->rdma.op_active)
- return -EINVAL;
+ int ret;
- args = CMSG_DATA(cmsg);
if (rs->rs_bound_addr == 0) {
ret = -ENOTCONN; /* XXX not a great errno */
- goto out_ret;
+ goto out;
}
- if (args->nr_local > UIO_MAXIOV) {
+ if (args->nr_local > (u64)UINT_MAX) {
ret = -EMSGSIZE;
- goto out_ret;
+ goto out;
}
- /* Check whether to allocate the iovec area */
- iov_size = args->nr_local * sizeof(struct rds_iovec);
- if (args->nr_local > UIO_FASTIOV) {
- iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL);
- if (!iovs) {
- ret = -ENOMEM;
- goto out_ret;
+ nr_pages = 0;
+ max_pages = 0;
+
+ local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
+
+ /* figure out the number of pages in the vector */
+ for (i = 0; i < args->nr_local; i++) {
+ if (copy_from_user(&vec, &local_vec[i],
+ sizeof(struct rds_iovec))) {
+ ret = -EFAULT;
+ goto out;
}
- }
- if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) {
- ret = -EFAULT;
- goto out;
- }
+ nr = rds_pages_in_vec(&vec);
+ if (nr == 0) {
+ ret = -EINVAL;
+ goto out;
+ }
- nr_pages = rds_rdma_pages(iovs, args->nr_local);
- if (nr_pages < 0) {
- ret = -EINVAL;
- goto out;
+ max_pages = max(nr, max_pages);
+ nr_pages += nr;
}
- pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
- if (!pages) {
+ pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL);
+ if (pages == NULL) {
ret = -ENOMEM;
goto out;
}
- op->op_write = !!(args->flags & RDS_RDMA_READWRITE);
- op->op_fence = !!(args->flags & RDS_RDMA_FENCE);
- op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
- op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
- op->op_active = 1;
- op->op_recverr = rs->rs_recverr;
- WARN_ON(!nr_pages);
- op->op_sg = rds_message_alloc_sgs(rm, nr_pages);
- if (!op->op_sg) {
+ op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL);
+ if (op == NULL) {
ret = -ENOMEM;
goto out;
}
- if (op->op_notify || op->op_recverr) {
+ op->r_write = !!(args->flags & RDS_RDMA_READWRITE);
+ op->r_fence = !!(args->flags & RDS_RDMA_FENCE);
+ op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+ op->r_recverr = rs->rs_recverr;
+ WARN_ON(!nr_pages);
+ sg_init_table(op->r_sg, nr_pages);
+
+ if (op->r_notify || op->r_recverr) {
/* We allocate an uninitialized notifier here, because
* we don't want to do that in the completion handler. We
* would have to use GFP_ATOMIC there, and don't want to deal
* with failed allocations.
*/
- op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
- if (!op->op_notifier) {
+ op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
+ if (!op->r_notifier) {
ret = -ENOMEM;
goto out;
}
- op->op_notifier->n_user_token = args->user_token;
- op->op_notifier->n_status = RDS_RDMA_SUCCESS;
+ op->r_notifier->n_user_token = args->user_token;
+ op->r_notifier->n_status = RDS_RDMA_SUCCESS;
}
/* The cookie contains the R_Key of the remote memory region, and
* destination address (which is really an offset into the MR)
* FIXME: We may want to move this into ib_rdma.c
*/
- op->op_rkey = rds_rdma_cookie_key(args->cookie);
- op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
+ op->r_key = rds_rdma_cookie_key(args->cookie);
+ op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
nr_bytes = 0;
rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
(unsigned long long)args->nr_local,
(unsigned long long)args->remote_vec.addr,
- op->op_rkey);
+ op->r_key);
for (i = 0; i < args->nr_local; i++) {
- struct rds_iovec *iov = &iovs[i];
- /* don't need to check, rds_rdma_pages() verified nr will be +nonzero */
- unsigned int nr = rds_pages_in_vec(iov);
+ if (copy_from_user(&vec, &local_vec[i],
+ sizeof(struct rds_iovec))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ nr = rds_pages_in_vec(&vec);
+ if (nr == 0) {
+ ret = -EINVAL;
+ goto out;
+ }
- rs->rs_user_addr = iov->addr;
- rs->rs_user_bytes = iov->bytes;
+ rs->rs_user_addr = vec.addr;
+ rs->rs_user_bytes = vec.bytes;
+ /* did the user change the vec under us? */
+ if (nr > max_pages || op->r_nents + nr > nr_pages) {
+ ret = -EINVAL;
+ goto out;
+ }
/* If it's a WRITE operation, we want to pin the pages for reading.
* If it's a READ operation, we need to pin the pages for writing.
*/
- ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
+ ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write);
if (ret < 0)
goto out;
- rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
- nr_bytes, nr, iov->bytes, iov->addr);
+ rdsdebug("RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx\n",
+ nr_bytes, nr, vec.bytes, vec.addr);
- nr_bytes += iov->bytes;
+ nr_bytes += vec.bytes;
for (j = 0; j < nr; j++) {
- unsigned int offset = iov->addr & ~PAGE_MASK;
- struct scatterlist *sg;
+ unsigned int offset = vec.addr & ~PAGE_MASK;
- sg = &op->op_sg[op->op_nents + j];
+ sg = &op->r_sg[op->r_nents + j];
sg_set_page(sg, pages[j],
- min_t(unsigned int, iov->bytes, PAGE_SIZE - offset),
+ min_t(unsigned int, vec.bytes, PAGE_SIZE - offset),
offset);
- rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n",
- sg->offset, sg->length, iov->addr, iov->bytes);
+ rdsdebug("RDS: sg->offset %x sg->len %x vec.addr %llx vec.bytes %llu\n",
+ sg->offset, sg->length, vec.addr, vec.bytes);
- iov->addr += sg->length;
- iov->bytes -= sg->length;
+ vec.addr += sg->length;
+ vec.bytes -= sg->length;
}
- op->op_nents += nr;
+ op->r_nents += nr;
}
+
if (nr_bytes > args->remote_vec.bytes) {
rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
nr_bytes,
ret = -EINVAL;
goto out;
}
- op->op_bytes = nr_bytes;
+ op->r_bytes = nr_bytes;
+ ret = 0;
out:
- if (iovs != iovstack)
- sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size);
kfree(pages);
-out_ret:
- if (ret)
- rds_rdma_free_op(op);
- else
- rds_stats_inc(s_send_rdma);
+ if (ret) {
+ if (op)
+ rds_rdma_free_op(op);
+ op = ERR_PTR(ret);
+ }
+ return op;
+}
- return ret;
+/*
+ * The application asks for a RDMA transfer.
+ * Extract all arguments and set up the rdma_op
+ */
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg)
+{
+ struct rds_rdma_op *op;
+
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
+ || rm->m_rdma_op != NULL)
+ return -EINVAL;
+
+ op = rds_rdma_prepare(rs, CMSG_DATA(cmsg));
+ if (IS_ERR(op))
+ return PTR_ERR(op);
+ rds_stats_inc(s_send_rdma);
+ rm->m_rdma_op = op;
+ return 0;
}
/*
u32 r_key;
int err = 0;
- if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) ||
- rm->m_rdma_cookie != 0)
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t))
+ || rm->m_rdma_cookie != 0)
return -EINVAL;
memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie));
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
- if (!mr)
+ if (mr == NULL)
err = -EINVAL; /* invalid r_key */
else
atomic_inc(&mr->r_refcount);
if (mr) {
mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
- rm->rdma.op_rdma_mr = mr;
+ rm->m_rdma_mr = mr;
}
return err;
}
int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg)
{
- if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) ||
- rm->m_rdma_cookie != 0)
- return -EINVAL;
-
- return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr);
-}
-
-/*
- * Fill in rds_message for an atomic request.
- */
-int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
- struct cmsghdr *cmsg)
-{
- struct page *page = NULL;
- struct rds_atomic_args *args;
- int ret = 0;
-
- if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args))
- || rm->atomic.op_active)
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args))
+ || rm->m_rdma_cookie != 0)
return -EINVAL;
- args = CMSG_DATA(cmsg);
-
- /* Nonmasked & masked cmsg ops converted to masked hw ops */
- switch (cmsg->cmsg_type) {
- case RDS_CMSG_ATOMIC_FADD:
- rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
- rm->atomic.op_m_fadd.add = args->fadd.add;
- rm->atomic.op_m_fadd.nocarry_mask = 0;
- break;
- case RDS_CMSG_MASKED_ATOMIC_FADD:
- rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
- rm->atomic.op_m_fadd.add = args->m_fadd.add;
- rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask;
- break;
- case RDS_CMSG_ATOMIC_CSWP:
- rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
- rm->atomic.op_m_cswp.compare = args->cswp.compare;
- rm->atomic.op_m_cswp.swap = args->cswp.swap;
- rm->atomic.op_m_cswp.compare_mask = ~0;
- rm->atomic.op_m_cswp.swap_mask = ~0;
- break;
- case RDS_CMSG_MASKED_ATOMIC_CSWP:
- rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
- rm->atomic.op_m_cswp.compare = args->m_cswp.compare;
- rm->atomic.op_m_cswp.swap = args->m_cswp.swap;
- rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask;
- rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask;
- break;
- default:
- BUG(); /* should never happen */
- }
-
- rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
- rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT);
- rm->atomic.op_active = 1;
- rm->atomic.op_recverr = rs->rs_recverr;
- rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
- if (!rm->atomic.op_sg) {
- ret = -ENOMEM;
- goto err;
- }
-
- /* verify 8 byte-aligned */
- if (args->local_addr & 0x7) {
- ret = -EFAULT;
- goto err;
- }
-
- ret = rds_pin_pages(args->local_addr, 1, &page, 1);
- if (ret != 1)
- goto err;
- ret = 0;
-
- sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr));
-
- if (rm->atomic.op_notify || rm->atomic.op_recverr) {
- /* We allocate an uninitialized notifier here, because
- * we don't want to do that in the completion handler. We
- * would have to use GFP_ATOMIC there, and don't want to deal
- * with failed allocations.
- */
- rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL);
- if (!rm->atomic.op_notifier) {
- ret = -ENOMEM;
- goto err;
- }
-
- rm->atomic.op_notifier->n_user_token = args->user_token;
- rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS;
- }
-
- rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie);
- rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie);
-
- return ret;
-err:
- if (page)
- put_page(page);
- kfree(rm->atomic.op_notifier);
-
- return ret;
+ return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr);
}
--- /dev/null
+#ifndef _RDS_RDMA_H
+#define _RDS_RDMA_H
+
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/scatterlist.h>
+
+#include "rds.h"
+
+struct rds_mr {
+ struct rb_node r_rb_node;
+ atomic_t r_refcount;
+ u32 r_key;
+
+ /* A copy of the creation flags */
+ unsigned int r_use_once:1;
+ unsigned int r_invalidate:1;
+ unsigned int r_write:1;
+
+ /* This is for RDS_MR_DEAD.
+ * It would be nice & consistent to make this part of the above
+ * bit field here, but we need to use test_and_set_bit.
+ */
+ unsigned long r_state;
+ struct rds_sock *r_sock; /* back pointer to the socket that owns us */
+ struct rds_transport *r_trans;
+ void *r_trans_private;
+};
+
+/* Flags for mr->r_state */
+#define RDS_MR_DEAD 0
+
+struct rds_rdma_op {
+ u32 r_key;
+ u64 r_remote_addr;
+ unsigned int r_write:1;
+ unsigned int r_fence:1;
+ unsigned int r_notify:1;
+ unsigned int r_recverr:1;
+ unsigned int r_mapped:1;
+ struct rds_notifier *r_notifier;
+ unsigned int r_bytes;
+ unsigned int r_nents;
+ unsigned int r_count;
+ struct scatterlist r_sg[0];
+};
+
+static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
+{
+ return r_key | (((u64) offset) << 32);
+}
+
+static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
+{
+ return cookie;
+}
+
+static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
+{
+ return cookie >> 32;
+}
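+/* Example: rds_rdma_make_cookie(0x42, 4096) keeps the R_Key 0x42 in the low
+ * 32 bits and the byte offset 4096 in the high 32 bits, which is what
+ * rds_rdma_cookie_key() and rds_rdma_cookie_offset() hand back. */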
+
+int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
+void rds_rdma_drop_keys(struct rds_sock *rs);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+void rds_rdma_free_op(struct rds_rdma_op *ro);
+void rds_rdma_send_complete(struct rds_message *rm, int);
+
+extern void __rds_put_mr_final(struct rds_mr *mr);
+static inline void rds_mr_put(struct rds_mr *mr)
+{
+ if (atomic_dec_and_test(&mr->r_refcount))
+ __rds_put_mr_final(mr);
+}
+
+#endif
* SOFTWARE.
*
*/
-#include <linux/module.h>
#include <rdma/rdma_cm.h>
#include "rdma_transport.h"
-static struct rdma_cm_id *rds_rdma_listen_id;
-
-static char *rds_cm_event_strings[] = {
-#define RDS_CM_EVENT_STRING(foo) \
- [RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo)
- RDS_CM_EVENT_STRING(ADDR_RESOLVED),
- RDS_CM_EVENT_STRING(ADDR_ERROR),
- RDS_CM_EVENT_STRING(ROUTE_RESOLVED),
- RDS_CM_EVENT_STRING(ROUTE_ERROR),
- RDS_CM_EVENT_STRING(CONNECT_REQUEST),
- RDS_CM_EVENT_STRING(CONNECT_RESPONSE),
- RDS_CM_EVENT_STRING(CONNECT_ERROR),
- RDS_CM_EVENT_STRING(UNREACHABLE),
- RDS_CM_EVENT_STRING(REJECTED),
- RDS_CM_EVENT_STRING(ESTABLISHED),
- RDS_CM_EVENT_STRING(DISCONNECTED),
- RDS_CM_EVENT_STRING(DEVICE_REMOVAL),
- RDS_CM_EVENT_STRING(MULTICAST_JOIN),
- RDS_CM_EVENT_STRING(MULTICAST_ERROR),
- RDS_CM_EVENT_STRING(ADDR_CHANGE),
- RDS_CM_EVENT_STRING(TIMEWAIT_EXIT),
-#undef RDS_CM_EVENT_STRING
-};
-
-static char *rds_cm_event_str(enum rdma_cm_event_type type)
-{
- return rds_str_array(rds_cm_event_strings,
- ARRAY_SIZE(rds_cm_event_strings), type);
-};
+static struct rdma_cm_id *rds_iw_listen_id;
int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event)
struct rds_transport *trans;
int ret = 0;
- rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
- event->event, rds_cm_event_str(event->event));
+ rdsdebug("conn %p id %p handling event %u\n", conn, cm_id,
+ event->event);
if (cm_id->device->node_type == RDMA_NODE_RNIC)
trans = &rds_iw_transport;
break;
case RDMA_CM_EVENT_DISCONNECTED:
- rdsdebug("DISCONNECT event - dropping connection "
+ printk(KERN_WARNING "RDS/IW: DISCONNECT event - dropping connection "
"%pI4->%pI4\n", &conn->c_laddr,
&conn->c_faddr);
rds_conn_drop(conn);
default:
/* things like device disconnect? */
- printk(KERN_ERR "RDS: unknown event %u (%s)!\n",
- event->event, rds_cm_event_str(event->event));
+ printk(KERN_ERR "unknown event %u\n", event->event);
+ BUG();
break;
}
if (conn)
mutex_unlock(&conn->c_cm_lock);
- rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event,
- rds_cm_event_str(event->event), ret);
+ rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret);
return ret;
}
-static int rds_rdma_listen_init(void)
+static int __init rds_rdma_listen_init(void)
{
struct sockaddr_in sin;
struct rdma_cm_id *cm_id;
int ret;
- cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP,
- IB_QPT_RC);
+ cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP);
if (IS_ERR(cm_id)) {
ret = PTR_ERR(cm_id);
- printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
+ printk(KERN_ERR "RDS/IW: failed to setup listener, "
"rdma_create_id() returned %d\n", ret);
- return ret;
+ goto out;
}
- sin.sin_family = AF_INET;
+ sin.sin_family = PF_INET,
sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
sin.sin_port = (__force u16)htons(RDS_PORT);
*/
ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
if (ret) {
- printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
+ printk(KERN_ERR "RDS/IW: failed to setup listener, "
"rdma_bind_addr() returned %d\n", ret);
goto out;
}
ret = rdma_listen(cm_id, 128);
if (ret) {
- printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
+ printk(KERN_ERR "RDS/IW: failed to setup listener, "
"rdma_listen() returned %d\n", ret);
goto out;
}
rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT);
- rds_rdma_listen_id = cm_id;
+ rds_iw_listen_id = cm_id;
cm_id = NULL;
out:
if (cm_id)
static void rds_rdma_listen_stop(void)
{
- if (rds_rdma_listen_id) {
- rdsdebug("cm %p\n", rds_rdma_listen_id);
- rdma_destroy_id(rds_rdma_listen_id);
- rds_rdma_listen_id = NULL;
+ if (rds_iw_listen_id) {
+ rdsdebug("cm %p\n", rds_iw_listen_id);
+ rdma_destroy_id(rds_iw_listen_id);
+ rds_iw_listen_id = NULL;
}
}
-static int rds_rdma_init(void)
+int __init rds_rdma_init(void)
{
int ret;
out:
return ret;
}
-module_init(rds_rdma_init);
-static void rds_rdma_exit(void)
+void rds_rdma_exit(void)
{
/* stop listening first to ensure no new connections are attempted */
rds_rdma_listen_stop();
rds_ib_exit();
rds_iw_exit();
}
-module_exit(rds_rdma_exit);
-
-MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
-MODULE_DESCRIPTION("RDS: IB/iWARP transport");
-MODULE_LICENSE("Dual BSD/GPL");
int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event);
+/* from rdma_transport.c */
+int rds_rdma_init(void);
+void rds_rdma_exit(void);
+
/* from ib.c */
extern struct rds_transport rds_ib_transport;
int rds_ib_init(void);
#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
#else
/* sigh, pr_debug() causes unused variable warnings */
-static inline __printf(1, 2)
-void rdsdebug(char *fmt, ...)
+static inline void __attribute__ ((format (printf, 1, 2)))
+rdsdebug(char *fmt, ...)
{
}
#endif
#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT))
#define RDS_CONG_MAP_BYTES (65536 / 8)
+#define RDS_CONG_MAP_LONGS (RDS_CONG_MAP_BYTES / sizeof(unsigned long))
#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8)
/* Bits for c_flags */
#define RDS_LL_SEND_FULL 0
#define RDS_RECONNECT_PENDING 1
-#define RDS_IN_XMIT 2
struct rds_connection {
struct hlist_node c_hash_node;
struct rds_cong_map *c_lcong;
struct rds_cong_map *c_fcong;
+ struct mutex c_send_lock; /* protect send ring */
struct rds_message *c_xmit_rm;
unsigned long c_xmit_sg;
unsigned int c_xmit_hdr_off;
unsigned int c_xmit_data_off;
- unsigned int c_xmit_atomic_sent;
unsigned int c_xmit_rdma_sent;
- unsigned int c_xmit_data_sent;
spinlock_t c_lock; /* protect msg queues */
u64 c_next_tx_seq;
void *c_transport_data;
atomic_t c_state;
- unsigned long c_send_gen;
unsigned long c_flags;
unsigned long c_reconnect_jiffies;
struct delayed_work c_send_w;
struct delayed_work c_conn_w;
struct work_struct c_down_w;
struct mutex c_cm_lock; /* protect conn state & cm */
- wait_queue_head_t c_waitq;
struct list_head c_map_item;
unsigned long c_map_queued;
+ unsigned long c_map_offset;
+ unsigned long c_map_bytes;
unsigned int c_unacked_packets;
unsigned int c_unacked_bytes;
#define RDS_FLAG_CONG_BITMAP 0x01
#define RDS_FLAG_ACK_REQUIRED 0x02
#define RDS_FLAG_RETRANSMITTED 0x04
-#define RDS_MAX_ADV_CREDIT 255
+#define RDS_MAX_ADV_CREDIT 127
/*
* Maximum space available for extension headers.
rds_rdma_cookie_t i_rdma_cookie;
};
-struct rds_mr {
- struct rb_node r_rb_node;
- atomic_t r_refcount;
- u32 r_key;
-
- /* A copy of the creation flags */
- unsigned int r_use_once:1;
- unsigned int r_invalidate:1;
- unsigned int r_write:1;
-
- /* This is for RDS_MR_DEAD.
- * It would be nice & consistent to make this part of the above
- * bit field here, but we need to use test_and_set_bit.
- */
- unsigned long r_state;
- struct rds_sock *r_sock; /* back pointer to the socket that owns us */
- struct rds_transport *r_trans;
- void *r_trans_private;
-};
-
-/* Flags for mr->r_state */
-#define RDS_MR_DEAD 0
-
-static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
-{
- return r_key | (((u64) offset) << 32);
-}
-
-static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
-{
- return cookie;
-}
-
-static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
-{
- return cookie >> 32;
-}
-
-/* atomic operation types */
-#define RDS_ATOMIC_TYPE_CSWP 0
-#define RDS_ATOMIC_TYPE_FADD 1
-
/*
* m_sock_item and m_conn_item are on lists that are serialized under
* conn->c_lock. m_sock_item has additional meaning in that once it is empty
* -> rs->rs_lock
*/
spinlock_t m_rs_lock;
- wait_queue_head_t m_flush_wait;
-
struct rds_sock *m_rs;
-
- /* cookie to send to remote, in rds header */
+ struct rds_rdma_op *m_rdma_op;
rds_rdma_cookie_t m_rdma_cookie;
-
- unsigned int m_used_sgs;
- unsigned int m_total_sgs;
-
- void *m_final_op;
-
- struct {
- struct rm_atomic_op {
- int op_type;
- union {
- struct {
- uint64_t compare;
- uint64_t swap;
- uint64_t compare_mask;
- uint64_t swap_mask;
- } op_m_cswp;
- struct {
- uint64_t add;
- uint64_t nocarry_mask;
- } op_m_fadd;
- };
-
- u32 op_rkey;
- u64 op_remote_addr;
- unsigned int op_notify:1;
- unsigned int op_recverr:1;
- unsigned int op_mapped:1;
- unsigned int op_silent:1;
- unsigned int op_active:1;
- struct scatterlist *op_sg;
- struct rds_notifier *op_notifier;
-
- struct rds_mr *op_rdma_mr;
- } atomic;
- struct rm_rdma_op {
- u32 op_rkey;
- u64 op_remote_addr;
- unsigned int op_write:1;
- unsigned int op_fence:1;
- unsigned int op_notify:1;
- unsigned int op_recverr:1;
- unsigned int op_mapped:1;
- unsigned int op_silent:1;
- unsigned int op_active:1;
- unsigned int op_bytes;
- unsigned int op_nents;
- unsigned int op_count;
- struct scatterlist *op_sg;
- struct rds_notifier *op_notifier;
-
- struct rds_mr *op_rdma_mr;
- } rdma;
- struct rm_data_op {
- unsigned int op_active:1;
- unsigned int op_nents;
- unsigned int op_count;
- struct scatterlist *op_sg;
- } data;
- };
+ struct rds_mr *m_rdma_mr;
+ unsigned int m_nents;
+ unsigned int m_count;
+ struct scatterlist m_sg[0];
};
/*
* transport is responsible for other serialization, including
* rds_recv_incoming(). This is called in process context but
* should try hard not to block.
+ *
+ * @xmit_cong_map: This asks the transport to send the local bitmap down the
+ * given connection. XXX get a better story about the bitmap
+ * flag and header.
*/
-#define RDS_TRANS_IB 0
-#define RDS_TRANS_IWARP 1
-#define RDS_TRANS_TCP 2
-#define RDS_TRANS_COUNT 3
-
struct rds_transport {
char t_name[TRANSNAMSIZ];
struct list_head t_item;
struct module *t_owner;
unsigned int t_prefer_loopback:1;
- unsigned int t_type;
int (*laddr_check)(__be32 addr);
int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
void (*xmit_complete)(struct rds_connection *conn);
int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off);
- int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
- int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
+ int (*xmit_cong_map)(struct rds_connection *conn,
+ struct rds_cong_map *map, unsigned long offset);
+ int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op);
int (*recv)(struct rds_connection *conn);
- int (*inc_copy_to_user)(struct rds_incoming *inc, struct iov_iter *to);
+ int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
+ size_t size);
+ void (*inc_purge)(struct rds_incoming *inc);
void (*inc_free)(struct rds_incoming *inc);
int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
* bound_addr used for both incoming and outgoing, no INADDR_ANY
* support.
*/
- struct hlist_node rs_bound_node;
+ struct rb_node rs_bound_node;
__be32 rs_bound_addr;
__be32 rs_conn_addr;
__be16 rs_bound_port;
__be16 rs_conn_port;
+
+ /*
+ * This is only used to communicate the transport between bind and
+ * initiating connections. All other transport use goes through
+ * the connection.
+ */
struct rds_transport *rs_transport;
/*
/* flag indicating we were congested or not */
int rs_congested;
- /* seen congestion (ENOBUFS) when sending? */
- int rs_seen_congestion;
/* rs_lock protects all these adjacent members before the newline */
spinlock_t rs_lock;
uint64_t s_recv_ping;
uint64_t s_send_queue_empty;
uint64_t s_send_queue_full;
- uint64_t s_send_lock_contention;
- uint64_t s_send_lock_queue_raced;
+ uint64_t s_send_sem_contention;
+ uint64_t s_send_sem_queue_raced;
uint64_t s_send_immediate_retry;
uint64_t s_send_delayed_retry;
uint64_t s_send_drop_acked;
};
/* af_rds.c */
-char *rds_str_array(char **array, size_t elements, size_t index);
void rds_sock_addref(struct rds_sock *rs);
void rds_sock_put(struct rds_sock *rs);
void rds_wake_sk_sleep(struct rds_sock *rs);
static inline void __rds_wake_sk_sleep(struct sock *sk)
{
- wait_queue_head_t *waitq = sk_sleep(sk);
+ wait_queue_head_t *waitq = sk->sk_sleep;
if (!sock_flag(sk, SOCK_DEAD) && waitq)
wake_up(waitq);
struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
/* conn.c */
-int rds_conn_init(void);
+int __init rds_conn_init(void);
void rds_conn_exit(void);
struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp);
struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp);
-void rds_conn_shutdown(struct rds_connection *conn);
void rds_conn_destroy(struct rds_connection *conn);
+void rds_conn_reset(struct rds_connection *conn);
void rds_conn_drop(struct rds_connection *conn);
-void rds_conn_connect_if_down(struct rds_connection *conn);
void rds_for_each_conn_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens,
int (*visitor)(struct rds_connection *, void *),
size_t item_len);
-__printf(2, 3)
-void __rds_conn_error(struct rds_connection *conn, const char *, ...);
+void __rds_conn_error(struct rds_connection *conn, const char *, ...)
+ __attribute__ ((format (printf, 2, 3)));
#define rds_conn_error(conn, fmt...) \
__rds_conn_error(conn, KERN_WARNING "RDS: " fmt)
/* message.c */
struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
-struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
-int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from);
+struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
+ size_t total_len);
struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
__be16 dport, u64 seq);
unsigned int type, const void *data, unsigned int len);
int rds_message_next_extension(struct rds_header *hdr,
unsigned int *pos, void *buf, unsigned int *buflen);
+int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version);
+int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version);
int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
-int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
+int rds_message_inc_copy_to_user(struct rds_incoming *inc,
+ struct iovec *first_iov, size_t size);
+void rds_message_inc_purge(struct rds_incoming *inc);
void rds_message_inc_free(struct rds_incoming *inc);
void rds_message_addref(struct rds_message *rm);
void rds_message_put(struct rds_message *rm);
/* recv.c */
void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
__be32 saddr);
+void rds_inc_addref(struct rds_incoming *inc);
void rds_inc_put(struct rds_incoming *inc);
void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
- struct rds_incoming *inc, gfp_t gfp);
-int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
- int msg_flags);
+ struct rds_incoming *inc, gfp_t gfp, enum km_type km);
+int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t size, int msg_flags);
void rds_clear_recv_queue(struct rds_sock *rs);
int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg);
void rds_inc_info_copy(struct rds_incoming *inc,
__be32 saddr, __be32 daddr, int flip);
/* send.c */
-int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len);
+int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t payload_len);
void rds_send_reset(struct rds_connection *conn);
int rds_send_xmit(struct rds_connection *conn);
struct sockaddr_in;
typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
is_acked_func is_acked);
+int rds_send_acked_before(struct rds_connection *conn, u64 seq);
+void rds_send_remove_from_sock(struct list_head *messages, int status);
int rds_send_pong(struct rds_connection *conn, __be16 dport);
struct rds_message *rds_send_get_message(struct rds_connection *,
- struct rm_rdma_op *);
+ struct rds_rdma_op *);
/* rdma.c */
void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
-int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
-int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
-int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
-void rds_rdma_drop_keys(struct rds_sock *rs);
-int rds_rdma_extra_size(struct rds_rdma_args *args);
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
- struct cmsghdr *cmsg);
-int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
- struct cmsghdr *cmsg);
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
- struct cmsghdr *cmsg);
-int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
- struct cmsghdr *cmsg);
-void rds_rdma_free_op(struct rm_rdma_op *ro);
-void rds_atomic_free_op(struct rm_atomic_op *ao);
-void rds_rdma_send_complete(struct rds_message *rm, int wc_status);
-void rds_atomic_send_complete(struct rds_message *rm, int wc_status);
-int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
- struct cmsghdr *cmsg);
-
-void __rds_put_mr_final(struct rds_mr *mr);
-static inline void rds_mr_put(struct rds_mr *mr)
-{
- if (atomic_dec_and_test(&mr->r_refcount))
- __rds_put_mr_final(mr);
-}
/* stats.c */
-DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
+DECLARE_PER_CPU(struct rds_statistics, rds_stats);
#define rds_stats_inc_which(which, member) do { \
per_cpu(which, get_cpu()).member++; \
put_cpu(); \
put_cpu(); \
} while (0)
#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
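For reference, rds_stats_inc(s_send_queued), used throughout this code, goes through rds_stats_inc_which() and expands to roughly:

	per_cpu(rds_stats, get_cpu()).s_send_queued++;
	put_cpu();

i.e. a per-CPU counter bumped with preemption disabled, no lock required.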
-int rds_stats_init(void);
+int __init rds_stats_init(void);
void rds_stats_exit(void);
void rds_stats_info_copy(struct rds_info_iterator *iter,
- uint64_t *values, const char *const *names,
- size_t nr);
+ uint64_t *values, char **names, size_t nr);
/* sysctl.c */
-int rds_sysctl_init(void);
+int __init rds_sysctl_init(void);
void rds_sysctl_exit(void);
extern unsigned long rds_sysctl_sndbuf_min;
extern unsigned long rds_sysctl_sndbuf_default;
extern unsigned int rds_sysctl_trace_level;
/* threads.c */
-int rds_threads_init(void);
+int __init rds_threads_init(void);
void rds_threads_exit(void);
extern struct workqueue_struct *rds_wq;
-void rds_queue_reconnect(struct rds_connection *conn);
void rds_connect_worker(struct work_struct *);
void rds_shutdown_worker(struct work_struct *);
void rds_send_worker(struct work_struct *);
int rds_trans_register(struct rds_transport *trans);
void rds_trans_unregister(struct rds_transport *trans);
struct rds_transport *rds_trans_get_preferred(__be32 addr);
-void rds_trans_put(struct rds_transport *trans);
unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail);
-int rds_trans_init(void);
+int __init rds_trans_init(void);
void rds_trans_exit(void);
#endif
*
*/
#include <linux/kernel.h>
-#include <linux/slab.h>
#include <net/sock.h>
#include <linux/in.h>
-#include <linux/export.h>
#include "rds.h"
+#include "rdma.h"
void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
__be32 saddr)
inc->i_saddr = saddr;
inc->i_rdma_cookie = 0;
}
-EXPORT_SYMBOL_GPL(rds_inc_init);
-static void rds_inc_addref(struct rds_incoming *inc)
+void rds_inc_addref(struct rds_incoming *inc)
{
rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
atomic_inc(&inc->i_refcount);
inc->i_conn->c_trans->inc_free(inc);
}
}
-EXPORT_SYMBOL_GPL(rds_inc_put);
static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
struct rds_cong_map *map,
* tell us which roles the addrs in the conn are playing for this message.
*/
void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
- struct rds_incoming *inc, gfp_t gfp)
+ struct rds_incoming *inc, gfp_t gfp, enum km_type km)
{
struct rds_sock *rs = NULL;
struct sock *sk;
* XXX we could spend more on the wire to get more robust failure
* detection, arguably worth it to avoid data corruption.
*/
- if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq &&
- (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
+ if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq
+ && (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
rds_stats_inc(s_recv_drop_old_seq);
goto out;
}
}
rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
- if (!rs) {
+ if (rs == NULL) {
rds_stats_inc(s_recv_drop_no_sock);
goto out;
}
if (rs)
rds_sock_put(rs);
}
-EXPORT_SYMBOL_GPL(rds_recv_incoming);
/*
* be very careful here. This is being called as the condition in
{
unsigned long flags;
- if (!*inc) {
+ if (*inc == NULL) {
read_lock_irqsave(&rs->rs_recv_lock, flags);
if (!list_empty(&rs->rs_recv_queue)) {
*inc = list_entry(rs->rs_recv_queue.next,
int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
{
struct rds_notifier *notifier;
- struct rds_rdma_notify cmsg = { 0 }; /* fill holes with zero */
+ struct rds_rdma_notify cmsg;
unsigned int count = 0, max_messages = ~0U;
unsigned long flags;
LIST_HEAD(copy);
if (msghdr) {
cmsg.user_token = notifier->n_user_token;
- cmsg.status = notifier->n_status;
+ cmsg.status = notifier->n_status;
err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
- sizeof(cmsg), &cmsg);
+ sizeof(cmsg), &cmsg);
if (err)
break;
}
return 0;
}
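On the user-space side these notifications come back as RDS_CMSG_RDMA_STATUS ancillary data on recvmsg(). A hedged sketch of reading them (fd is assumed to be a bound RDS socket, error handling trimmed; field names follow the user-visible struct rds_rdma_notify):

	struct rds_rdma_notify note;
	struct msghdr msg = { 0 };
	char cbuf[CMSG_SPACE(sizeof(note))];
	struct cmsghdr *cmsg;

	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	if (recvmsg(fd, &msg, 0) >= 0) {
		for (cmsg = CMSG_FIRSTHDR(&msg); cmsg;
		     cmsg = CMSG_NXTHDR(&msg, cmsg)) {
			if (cmsg->cmsg_level != SOL_RDS ||
			    cmsg->cmsg_type != RDS_CMSG_RDMA_STATUS)
				continue;
			memcpy(&note, CMSG_DATA(cmsg), sizeof(note));
			/* note.user_token names the RDMA op that completed,
			 * note.status carries its RDS_RDMA_* status code */
		}
	}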
-int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
- int msg_flags)
+int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t size, int msg_flags)
{
struct sock *sk = sock->sk;
struct rds_sock *rs = rds_sk_to_rs(sk);
long timeo;
int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
- DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+ struct sockaddr_in *sin;
struct rds_incoming *inc = NULL;
/* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
if (msg_flags & MSG_OOB)
goto out;
- while (1) {
- struct iov_iter save;
- /* If there are pending notifications, do those - and nothing else */
- if (!list_empty(&rs->rs_notify_queue)) {
- ret = rds_notify_queue_get(rs, msg);
- break;
- }
+ /* If there are pending notifications, do those - and nothing else */
+ if (!list_empty(&rs->rs_notify_queue)) {
+ ret = rds_notify_queue_get(rs, msg);
+ goto out;
+ }
- if (rs->rs_cong_notify) {
- ret = rds_notify_cong(rs, msg);
- break;
- }
+ if (rs->rs_cong_notify) {
+ ret = rds_notify_cong(rs, msg);
+ goto out;
+ }
+ while (1) {
if (!rds_next_incoming(rs, &inc)) {
if (nonblock) {
ret = -EAGAIN;
break;
}
- timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
- (!list_empty(&rs->rs_notify_queue) ||
- rs->rs_cong_notify ||
- rds_next_incoming(rs, &inc)), timeo);
+ timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
+ rds_next_incoming(rs, &inc),
+ timeo);
rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
timeo);
if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
rdsdebug("copying inc %p from %pI4:%u to user\n", inc,
&inc->i_conn->c_faddr,
ntohs(inc->i_hdr.h_sport));
- save = msg->msg_iter;
- ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter);
+ ret = inc->i_conn->c_trans->inc_copy_to_user(inc, msg->msg_iov,
+ size);
if (ret < 0)
break;
rds_inc_put(inc);
inc = NULL;
rds_stats_inc(s_recv_deliver_raced);
- msg->msg_iter = save;
continue;
}
rds_stats_inc(s_recv_delivered);
+ sin = (struct sockaddr_in *)msg->msg_name;
if (sin) {
sin->sin_family = AF_INET;
sin->sin_port = inc->i_hdr.h_sport;
sin->sin_addr.s_addr = inc->i_saddr;
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
- msg->msg_namelen = sizeof(*sin);
}
break;
}
*
*/
#include <linux/kernel.h>
-#include <linux/moduleparam.h>
-#include <linux/gfp.h>
#include <net/sock.h>
#include <linux/in.h>
#include <linux/list.h>
-#include <linux/ratelimit.h>
-#include <linux/export.h>
#include "rds.h"
+#include "rdma.h"
/* When transmitting messages in rds_send_xmit, we need to emerge from
* time to time and briefly release the CPU. Otherwise the softlock watchdog
module_param(send_batch_count, int, 0444);
MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
-static void rds_send_remove_from_sock(struct list_head *messages, int status);
-
/*
- * Reset the send state. Callers must ensure that this doesn't race with
- * rds_send_xmit().
+ * Reset the send state. Caller must hold c_send_lock when calling here.
*/
void rds_send_reset(struct rds_connection *conn)
{
unsigned long flags;
if (conn->c_xmit_rm) {
- rm = conn->c_xmit_rm;
- conn->c_xmit_rm = NULL;
/* Tell the user the RDMA op is no longer mapped by the
* transport. This isn't entirely true (it's flushed out
* independently) but as the connection is down, there's
* no ongoing RDMA to/from that memory */
- rds_message_unmapped(rm);
- rds_message_put(rm);
+ rds_message_unmapped(conn->c_xmit_rm);
+ rds_message_put(conn->c_xmit_rm);
+ conn->c_xmit_rm = NULL;
}
-
conn->c_xmit_sg = 0;
conn->c_xmit_hdr_off = 0;
conn->c_xmit_data_off = 0;
- conn->c_xmit_atomic_sent = 0;
conn->c_xmit_rdma_sent = 0;
- conn->c_xmit_data_sent = 0;
conn->c_map_queued = 0;
spin_unlock_irqrestore(&conn->c_lock, flags);
}
-static int acquire_in_xmit(struct rds_connection *conn)
-{
- return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0;
-}
-
-static void release_in_xmit(struct rds_connection *conn)
-{
- clear_bit(RDS_IN_XMIT, &conn->c_flags);
- smp_mb__after_atomic();
- /*
- * We don't use wait_on_bit()/wake_up_bit() because our waking is in a
- * hot path and finding waiters is very rare. We don't want to walk
- * the system-wide hashed waitqueue buckets in the fast path only to
- * almost never find waiters.
- */
- if (waitqueue_active(&conn->c_waitq))
- wake_up_all(&conn->c_waitq);
-}
-
/*
- * We're making the conscious trade-off here to only send one message
+ * We're making the conscious trade-off here to only send one message
* down the connection at a time.
* Pro:
* - tx queueing is a simple fifo list
struct rds_message *rm;
unsigned long flags;
unsigned int tmp;
+ unsigned int send_quota = send_batch_count;
struct scatterlist *sg;
int ret = 0;
+ int was_empty = 0;
LIST_HEAD(to_be_dropped);
- int batch_count;
- unsigned long send_gen = 0;
-
-restart:
- batch_count = 0;
/*
* sendmsg calls here after having queued its message on the send
* another thread is already feeding the queue then we back off. This
* avoids blocking the caller and trading per-connection data between
* caches per message.
- */
- if (!acquire_in_xmit(conn)) {
- rds_stats_inc(s_send_lock_contention);
- ret = -ENOMEM;
- goto out;
- }
-
- /*
- * we record the send generation after doing the xmit acquire.
- * if someone else manages to jump in and do some work, we'll use
- * this to avoid a goto restart farther down.
*
- * The acquire_in_xmit() check above ensures that only one
- * caller can increment c_send_gen at any time.
+ * The sem holder will issue a retry if they notice that someone queued
+ * a message after they stopped walking the send queue but before they
+ * dropped the sem.
*/
- conn->c_send_gen++;
- send_gen = conn->c_send_gen;
-
- /*
- * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT,
- * we do the opposite to avoid races.
- */
- if (!rds_conn_up(conn)) {
- release_in_xmit(conn);
- ret = 0;
+ if (!mutex_trylock(&conn->c_send_lock)) {
+ rds_stats_inc(s_send_sem_contention);
+ ret = -ENOMEM;
goto out;
}
/*
* spin trying to push headers and data down the connection until
- * the connection doesn't make forward progress.
+ * the connection doesn't make forward progress.
*/
- while (1) {
+ while (--send_quota) {
+ /*
+ * See if we need to send a congestion map update if we're
+ * between sending messages. The send_sem protects our sole
+ * use of c_map_offset and _bytes.
+ * Note this is used only by transports that define a special
+ * xmit_cong_map function. For all others, we allocate
+ * a cong_map message and treat it just like any other send.
+ */
+ if (conn->c_map_bytes) {
+ ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
+ conn->c_map_offset);
+ if (ret <= 0)
+ break;
+
+ conn->c_map_offset += ret;
+ conn->c_map_bytes -= ret;
+ if (conn->c_map_bytes)
+ continue;
+ }
+ /* If we're done sending the current message, clear the
+ * offset and S/G temporaries.
+ */
rm = conn->c_xmit_rm;
+ if (rm != NULL &&
+ conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
+ conn->c_xmit_sg == rm->m_nents) {
+ conn->c_xmit_rm = NULL;
+ conn->c_xmit_sg = 0;
+ conn->c_xmit_hdr_off = 0;
+ conn->c_xmit_data_off = 0;
+ conn->c_xmit_rdma_sent = 0;
- /*
- * If between sending messages, we can send a pending congestion
- * map update.
+ /* Release the reference to the previous message. */
+ rds_message_put(rm);
+ rm = NULL;
+ }
+
+ /* If we're asked to send a cong map update, do so.
*/
- if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
+ if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) {
+ if (conn->c_trans->xmit_cong_map != NULL) {
+ conn->c_map_offset = 0;
+ conn->c_map_bytes = sizeof(struct rds_header) +
+ RDS_CONG_MAP_BYTES;
+ continue;
+ }
+
rm = rds_cong_update_alloc(conn);
if (IS_ERR(rm)) {
ret = PTR_ERR(rm);
break;
}
- rm->data.op_active = 1;
conn->c_xmit_rm = rm;
}
/*
- * If not already working on one, grab the next message.
+ * Grab the next message from the send queue, if there is one.
*
* c_xmit_rm holds a ref while we're sending this message down
* the connection. We can use this ref while holding the
* send_sem.. rds_send_reset() is serialized with it.
*/
- if (!rm) {
+ if (rm == NULL) {
unsigned int len;
- batch_count++;
-
- /* we want to process as big a batch as we can, but
- * we also want to avoid softlockups. If we've been
- * through a lot of messages, lets back off and see
- * if anyone else jumps in
- */
- if (batch_count >= 1024)
- goto over_batch;
-
spin_lock_irqsave(&conn->c_lock, flags);
if (!list_empty(&conn->c_send_queue)) {
spin_unlock_irqrestore(&conn->c_lock, flags);
- if (!rm)
+ if (rm == NULL) {
+ was_empty = 1;
break;
+ }
/* Unfortunately, the way Infiniband deals with
* RDMA to a bad MR key is by moving the entire
* connection.
* Therefore, we never retransmit messages with RDMA ops.
*/
- if (rm->rdma.op_active &&
- test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
+ if (rm->m_rdma_op
+ && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
spin_lock_irqsave(&conn->c_lock, flags);
if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
list_move(&rm->m_conn_item, &to_be_dropped);
spin_unlock_irqrestore(&conn->c_lock, flags);
+ rds_message_put(rm);
continue;
}
/* Require an ACK every once in a while */
len = ntohl(rm->m_inc.i_hdr.h_len);
- if (conn->c_unacked_packets == 0 ||
- conn->c_unacked_bytes < len) {
+ if (conn->c_unacked_packets == 0
+ || conn->c_unacked_bytes < len) {
__set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
conn->c_xmit_rm = rm;
}
- /* The transport either sends the whole rdma or none of it */
- if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
- rm->m_final_op = &rm->rdma;
- ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
+ /*
+ * Try and send an rdma message. Let's see if we can
+ * keep this simple and require that the transport either
+ * send the whole rdma or none of it.
+ */
+ if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) {
+ ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op);
if (ret)
break;
conn->c_xmit_rdma_sent = 1;
-
- /* The transport owns the mapped memory for now.
- * You can't unmap it while it's on the send queue */
- set_bit(RDS_MSG_MAPPED, &rm->m_flags);
- }
-
- if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
- rm->m_final_op = &rm->atomic;
- ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
- if (ret)
- break;
- conn->c_xmit_atomic_sent = 1;
-
/* The transport owns the mapped memory for now.
* You can't unmap it while it's on the send queue */
set_bit(RDS_MSG_MAPPED, &rm->m_flags);
}
- /*
- * A number of cases require an RDS header to be sent
- * even if there is no data.
- * We permit 0-byte sends; rds-ping depends on this.
- * However, if there are exclusively attached silent ops,
- * we skip the hdr/data send, to enable silent operation.
- */
- if (rm->data.op_nents == 0) {
- int ops_present;
- int all_ops_are_silent = 1;
-
- ops_present = (rm->atomic.op_active || rm->rdma.op_active);
- if (rm->atomic.op_active && !rm->atomic.op_silent)
- all_ops_are_silent = 0;
- if (rm->rdma.op_active && !rm->rdma.op_silent)
- all_ops_are_silent = 0;
-
- if (ops_present && all_ops_are_silent
- && !rm->m_rdma_cookie)
- rm->data.op_active = 0;
- }
-
- if (rm->data.op_active && !conn->c_xmit_data_sent) {
- rm->m_final_op = &rm->data;
+ if (conn->c_xmit_hdr_off < sizeof(struct rds_header) ||
+ conn->c_xmit_sg < rm->m_nents) {
ret = conn->c_trans->xmit(conn, rm,
conn->c_xmit_hdr_off,
conn->c_xmit_sg,
ret -= tmp;
}
- sg = &rm->data.op_sg[conn->c_xmit_sg];
+ sg = &rm->m_sg[conn->c_xmit_sg];
while (ret) {
tmp = min_t(int, ret, sg->length -
conn->c_xmit_data_off);
sg++;
conn->c_xmit_sg++;
BUG_ON(ret != 0 &&
- conn->c_xmit_sg == rm->data.op_nents);
+ conn->c_xmit_sg == rm->m_nents);
}
}
-
- if (conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
- (conn->c_xmit_sg == rm->data.op_nents))
- conn->c_xmit_data_sent = 1;
- }
-
- /*
- * A rm will only take multiple times through this loop
- * if there is a data op. Thus, if the data is sent (or there was
- * none), then we're done with the rm.
- */
- if (!rm->data.op_active || conn->c_xmit_data_sent) {
- conn->c_xmit_rm = NULL;
- conn->c_xmit_sg = 0;
- conn->c_xmit_hdr_off = 0;
- conn->c_xmit_data_off = 0;
- conn->c_xmit_rdma_sent = 0;
- conn->c_xmit_atomic_sent = 0;
- conn->c_xmit_data_sent = 0;
-
- rds_message_put(rm);
}
}
-over_batch:
- if (conn->c_trans->xmit_complete)
- conn->c_trans->xmit_complete(conn);
- release_in_xmit(conn);
-
/* Nuke any messages we decided not to retransmit. */
- if (!list_empty(&to_be_dropped)) {
- /* irqs on here, so we can put(), unlike above */
- list_for_each_entry(rm, &to_be_dropped, m_conn_item)
- rds_message_put(rm);
+ if (!list_empty(&to_be_dropped))
rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
- }
+
+ if (conn->c_trans->xmit_complete)
+ conn->c_trans->xmit_complete(conn);
/*
- * Other senders can queue a message after we last test the send queue
- * but before we clear RDS_IN_XMIT. In that case they'd back off and
- * not try and send their newly queued message. We need to check the
- * send queue after having cleared RDS_IN_XMIT so that their message
- * doesn't get stuck on the send queue.
+ * We might be racing with another sender who queued a message but
+ * backed off on noticing that we held the c_send_lock. If we check
+ * for queued messages after dropping the sem then either we'll
+ * see the queued message or the queuer will get the sem. If we
+ * notice the queued message then we trigger an immediate retry.
*
- * If the transport cannot continue (i.e ret != 0), then it must
- * call us when more room is available, such as from the tx
- * completion handler.
- *
- * We have an extra generation check here so that if someone manages
- * to jump in after our release_in_xmit, we'll see that they have done
- * some work and we will skip our goto
+ * We need to be careful only to do this when we stopped processing
+ * the send queue because it was empty. It's the only way we
+ * stop processing the loop when the transport hasn't taken
+ * responsibility for forward progress.
*/
- if (ret == 0) {
- smp_mb();
- if (!list_empty(&conn->c_send_queue) &&
- send_gen == conn->c_send_gen) {
- rds_stats_inc(s_send_lock_queue_raced);
- goto restart;
+ mutex_unlock(&conn->c_send_lock);
+
+ if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) {
+ /* We exhausted the send quota, but there's work left to
+ * do. Return and (re-)schedule the send worker.
+ */
+ ret = -EAGAIN;
+ }
+
+ if (ret == 0 && was_empty) {
+ /* A simple bit test would be way faster than taking the
+ * spin lock */
+ spin_lock_irqsave(&conn->c_lock, flags);
+ if (!list_empty(&conn->c_send_queue)) {
+ rds_stats_inc(s_send_sem_queue_raced);
+ ret = -EAGAIN;
}
+ spin_unlock_irqrestore(&conn->c_lock, flags);
}
out:
return ret;
}
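Stripped of the RDS specifics, the contention and raced-queue handling above follows a "try to become the lone sender, drain, drop the lock, re-check" shape. A reduced sketch of that shape (struct xmit_ctx, send_next_message() and queue_is_empty() are placeholders, not RDS symbols):

static int drain_once(struct xmit_ctx *ctx)
{
	int ret = 0;

	if (!mutex_trylock(&ctx->send_lock))
		return -ENOMEM;		/* another sender is already draining */

	while (send_next_message(ctx) > 0)
		;			/* push until the transport backs off */

	mutex_unlock(&ctx->send_lock);

	/* someone may have queued after we stopped walking the queue but
	 * before we dropped the lock; tell the caller to retry */
	if (!queue_is_empty(ctx))
		ret = -EAGAIN;

	return ret;
}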
/*
- * This is pretty similar to what happens below in the ACK
- * handling code - except that we call here as soon as we get
- * the IB send completion on the RDMA op and the accompanying
- * message.
+ * Returns true if there are no messages left on the send and retransmit
+ * queues with a sequence number smaller than the given one, i.e. everything
+ * before it has been acked.
*/
-void rds_rdma_send_complete(struct rds_message *rm, int status)
+int rds_send_acked_before(struct rds_connection *conn, u64 seq)
{
- struct rds_sock *rs = NULL;
- struct rm_rdma_op *ro;
- struct rds_notifier *notifier;
- unsigned long flags;
+ struct rds_message *rm, *tmp;
+ int ret = 1;
- spin_lock_irqsave(&rm->m_rs_lock, flags);
+ spin_lock(&conn->c_lock);
- ro = &rm->rdma;
- if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
- ro->op_active && ro->op_notify && ro->op_notifier) {
- notifier = ro->op_notifier;
- rs = rm->m_rs;
- sock_hold(rds_rs_to_sk(rs));
-
- notifier->n_status = status;
- spin_lock(&rs->rs_lock);
- list_add_tail(¬ifier->n_list, &rs->rs_notify_queue);
- spin_unlock(&rs->rs_lock);
+ list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+ if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
+ ret = 0;
+ break;
+ }
- ro->op_notifier = NULL;
+ list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
+ if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
+ ret = 0;
+ break;
}
- spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+ spin_unlock(&conn->c_lock);
- if (rs) {
- rds_wake_sk_sleep(rs);
- sock_put(rds_rs_to_sk(rs));
- }
+ return ret;
}
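A transport's ACK path can use this helper to tell whether anything older than a given sequence number is still queued. A hedged usage sketch (ack_seq is assumed to come from the peer's ACK processing):

static void example_handle_peer_ack(struct rds_connection *conn, u64 ack_seq)
{
	/* nothing with a smaller sequence number remains queued, so any
	 * per-message state older than ack_seq can be retired */
	if (rds_send_acked_before(conn, ack_seq))
		rdsdebug("all messages before %llu acked\n",
			 (unsigned long long) ack_seq);
}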
-EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
/*
- * Just like above, except looks at atomic op
+ * This is pretty similar to what happens below in the ACK
+ * handling code - except that we call here as soon as we get
+ * the IB send completion on the RDMA op and the accompanying
+ * message.
*/
-void rds_atomic_send_complete(struct rds_message *rm, int status)
+void rds_rdma_send_complete(struct rds_message *rm, int status)
{
struct rds_sock *rs = NULL;
- struct rm_atomic_op *ao;
+ struct rds_rdma_op *ro;
struct rds_notifier *notifier;
- unsigned long flags;
- spin_lock_irqsave(&rm->m_rs_lock, flags);
+ spin_lock(&rm->m_rs_lock);
- ao = &rm->atomic;
+ ro = rm->m_rdma_op;
if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
- && ao->op_active && ao->op_notify && ao->op_notifier) {
- notifier = ao->op_notifier;
+ && ro && ro->r_notify && ro->r_notifier) {
+ notifier = ro->r_notifier;
rs = rm->m_rs;
sock_hold(rds_rs_to_sk(rs));
list_add_tail(¬ifier->n_list, &rs->rs_notify_queue);
spin_unlock(&rs->rs_lock);
- ao->op_notifier = NULL;
+ ro->r_notifier = NULL;
}
- spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+ spin_unlock(&rm->m_rs_lock);
if (rs) {
rds_wake_sk_sleep(rs);
sock_put(rds_rs_to_sk(rs));
}
}
-EXPORT_SYMBOL_GPL(rds_atomic_send_complete);
/*
* This is the same as rds_rdma_send_complete except we
* socket, socket lock) and can just move the notifier.
*/
static inline void
-__rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
+__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
{
- struct rm_rdma_op *ro;
- struct rm_atomic_op *ao;
-
- ro = &rm->rdma;
- if (ro->op_active && ro->op_notify && ro->op_notifier) {
- ro->op_notifier->n_status = status;
- list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue);
- ro->op_notifier = NULL;
- }
+ struct rds_rdma_op *ro;
- ao = &rm->atomic;
- if (ao->op_active && ao->op_notify && ao->op_notifier) {
- ao->op_notifier->n_status = status;
- list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue);
- ao->op_notifier = NULL;
+ ro = rm->m_rdma_op;
+ if (ro && ro->r_notify && ro->r_notifier) {
+ ro->r_notifier->n_status = status;
+ list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue);
+ ro->r_notifier = NULL;
}
/* No need to wake the app - caller does this */
* So speed is not an issue here.
*/
struct rds_message *rds_send_get_message(struct rds_connection *conn,
- struct rm_rdma_op *op)
+ struct rds_rdma_op *op)
{
struct rds_message *rm, *tmp, *found = NULL;
unsigned long flags;
spin_lock_irqsave(&conn->c_lock, flags);
list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
- if (&rm->rdma == op) {
+ if (rm->m_rdma_op == op) {
atomic_inc(&rm->m_refcount);
found = rm;
goto out;
}
list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
- if (&rm->rdma == op) {
+ if (rm->m_rdma_op == op) {
atomic_inc(&rm->m_refcount);
found = rm;
break;
return found;
}
-EXPORT_SYMBOL_GPL(rds_send_get_message);
/*
* This removes messages from the socket's list if they're on it. The list
* removing the messages from the 'messages' list regardless of if it found
* the messages on the socket list or not.
*/
-static void rds_send_remove_from_sock(struct list_head *messages, int status)
+void rds_send_remove_from_sock(struct list_head *messages, int status)
{
- unsigned long flags;
+ unsigned long flags = 0; /* silence gcc :P */
struct rds_sock *rs = NULL;
struct rds_message *rm;
+ local_irq_save(flags);
while (!list_empty(messages)) {
- int was_on_sock = 0;
-
rm = list_entry(messages->next, struct rds_message,
m_conn_item);
list_del_init(&rm->m_conn_item);
* while we're messing with it. It does not prevent the
* message from being removed from the socket, though.
*/
- spin_lock_irqsave(&rm->m_rs_lock, flags);
+ spin_lock(&rm->m_rs_lock);
if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags))
goto unlock_and_drop;
if (rs != rm->m_rs) {
if (rs) {
+ spin_unlock(&rs->rs_lock);
rds_wake_sk_sleep(rs);
sock_put(rds_rs_to_sk(rs));
}
rs = rm->m_rs;
- if (rs)
- sock_hold(rds_rs_to_sk(rs));
+ spin_lock(&rs->rs_lock);
+ sock_hold(rds_rs_to_sk(rs));
}
- if (!rs)
- goto unlock_and_drop;
- spin_lock(&rs->rs_lock);
if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
- struct rm_rdma_op *ro = &rm->rdma;
+ struct rds_rdma_op *ro = rm->m_rdma_op;
struct rds_notifier *notifier;
list_del_init(&rm->m_sock_item);
rds_send_sndbuf_remove(rs, rm);
- if (ro->op_active && ro->op_notifier &&
- (ro->op_notify || (ro->op_recverr && status))) {
- notifier = ro->op_notifier;
+ if (ro && ro->r_notifier
+ && (status || ro->r_notify)) {
+ notifier = ro->r_notifier;
list_add_tail(¬ifier->n_list,
&rs->rs_notify_queue);
if (!notifier->n_status)
notifier->n_status = status;
- rm->rdma.op_notifier = NULL;
+ rm->m_rdma_op->r_notifier = NULL;
}
- was_on_sock = 1;
+ rds_message_put(rm);
rm->m_rs = NULL;
}
- spin_unlock(&rs->rs_lock);
unlock_and_drop:
- spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+ spin_unlock(&rm->m_rs_lock);
rds_message_put(rm);
- if (was_on_sock)
- rds_message_put(rm);
}
if (rs) {
+ spin_unlock(&rs->rs_lock);
rds_wake_sk_sleep(rs);
sock_put(rds_rs_to_sk(rs));
}
+ local_irq_restore(flags);
}
/*
* queue. This means that in the TCP case, the message may not have been
* assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
* checks the RDS_MSG_HAS_ACK_SEQ bit.
+ *
+ * XXX It's not clear to me how this is safely serialized with socket
+ * destruction. Maybe it should bail if it sees SOCK_DEAD.
*/
void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
is_acked_func is_acked)
/* order flag updates with spin locks */
if (!list_empty(&list))
- smp_mb__after_atomic();
+ smp_mb__after_clear_bit();
spin_unlock_irqrestore(&conn->c_lock, flags);
/* now remove the messages from the sock list as needed */
rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
}
-EXPORT_SYMBOL_GPL(rds_send_drop_acked);
void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
{
struct rds_message *rm, *tmp;
struct rds_connection *conn;
- unsigned long flags;
+ unsigned long flags, flags2;
LIST_HEAD(list);
+ int wake = 0;
/* get all the messages we're dropping under the rs lock */
spin_lock_irqsave(&rs->rs_lock, flags);
dest->sin_port != rm->m_inc.i_hdr.h_dport))
continue;
+ wake = 1;
list_move(&rm->m_sock_item, &list);
rds_send_sndbuf_remove(rs, rm);
clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
+
+ /* If this is a RDMA operation, notify the app. */
+ __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
}
/* order flag updates with the rs lock */
- smp_mb__after_atomic();
+ if (wake)
+ smp_mb__after_clear_bit();
spin_unlock_irqrestore(&rs->rs_lock, flags);
- if (list_empty(&list))
- return;
+ if (wake)
+ rds_wake_sk_sleep(rs);
+
+ conn = NULL;
- /* Remove the messages from the conn */
+ /* now remove the messages from the conn list as needed */
list_for_each_entry(rm, &list, m_sock_item) {
+ /* We do this here rather than in the loop above, so that
+ * we don't have to nest m_rs_lock under rs->rs_lock */
+ spin_lock_irqsave(&rm->m_rs_lock, flags2);
+ rm->m_rs = NULL;
+ spin_unlock_irqrestore(&rm->m_rs_lock, flags2);
- conn = rm->m_inc.i_conn;
-
- spin_lock_irqsave(&conn->c_lock, flags);
/*
- * Maybe someone else beat us to removing rm from the conn.
- * If we race with their flag update we'll get the lock and
- * then really see that the flag has been cleared.
+ * If we see this flag cleared then we're *sure* that someone
+ * else beat us to removing it from the conn. If we race
+ * with their flag update we'll get the lock and then really
+ * see that the flag has been cleared.
*/
- if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
- spin_unlock_irqrestore(&conn->c_lock, flags);
- spin_lock_irqsave(&rm->m_rs_lock, flags);
- rm->m_rs = NULL;
- spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+ if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags))
continue;
- }
- list_del_init(&rm->m_conn_item);
- spin_unlock_irqrestore(&conn->c_lock, flags);
- /*
- * Couldn't grab m_rs_lock in top loop (lock ordering),
- * but we can now.
- */
- spin_lock_irqsave(&rm->m_rs_lock, flags);
-
- spin_lock(&rs->rs_lock);
- __rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
- spin_unlock(&rs->rs_lock);
-
- rm->m_rs = NULL;
- spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+ if (conn != rm->m_inc.i_conn) {
+ if (conn)
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+ conn = rm->m_inc.i_conn;
+ spin_lock_irqsave(&conn->c_lock, flags);
+ }
- rds_message_put(rm);
+ if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
+ list_del_init(&rm->m_conn_item);
+ rds_message_put(rm);
+ }
}
- rds_wake_sk_sleep(rs);
+ if (conn)
+ spin_unlock_irqrestore(&conn->c_lock, flags);
while (!list_empty(&list)) {
rm = list_entry(list.next, struct rds_message, m_sock_item);
return *queued;
}
-/*
- * rds_message is getting to be quite complicated, and we'd like to allocate
- * it all in one go. This figures out how big it needs to be up front.
- */
-static int rds_rm_size(struct msghdr *msg, int data_len)
-{
- struct cmsghdr *cmsg;
- int size = 0;
- int cmsg_groups = 0;
- int retval;
-
- for_each_cmsghdr(cmsg, msg) {
- if (!CMSG_OK(msg, cmsg))
- return -EINVAL;
-
- if (cmsg->cmsg_level != SOL_RDS)
- continue;
-
- switch (cmsg->cmsg_type) {
- case RDS_CMSG_RDMA_ARGS:
- cmsg_groups |= 1;
- retval = rds_rdma_extra_size(CMSG_DATA(cmsg));
- if (retval < 0)
- return retval;
- size += retval;
-
- break;
-
- case RDS_CMSG_RDMA_DEST:
- case RDS_CMSG_RDMA_MAP:
- cmsg_groups |= 2;
- /* these are valid but do no add any size */
- break;
-
- case RDS_CMSG_ATOMIC_CSWP:
- case RDS_CMSG_ATOMIC_FADD:
- case RDS_CMSG_MASKED_ATOMIC_CSWP:
- case RDS_CMSG_MASKED_ATOMIC_FADD:
- cmsg_groups |= 1;
- size += sizeof(struct scatterlist);
- break;
-
- default:
- return -EINVAL;
- }
-
- }
-
- size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
-
- /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
- if (cmsg_groups == 3)
- return -EINVAL;
-
- return size;
-}
-
static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
struct msghdr *msg, int *allocated_mr)
{
struct cmsghdr *cmsg;
int ret = 0;
- for_each_cmsghdr(cmsg, msg) {
+ for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
if (!CMSG_OK(msg, cmsg))
return -EINVAL;
continue;
/* As a side effect, RDMA_DEST and RDMA_MAP will set
- * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr.
+ * rm->m_rdma_cookie and rm->m_rdma_mr.
*/
switch (cmsg->cmsg_type) {
case RDS_CMSG_RDMA_ARGS:
if (!ret)
*allocated_mr = 1;
break;
- case RDS_CMSG_ATOMIC_CSWP:
- case RDS_CMSG_ATOMIC_FADD:
- case RDS_CMSG_MASKED_ATOMIC_CSWP:
- case RDS_CMSG_MASKED_ATOMIC_FADD:
- ret = rds_cmsg_atomic(rs, rm, cmsg);
- break;
default:
return -EINVAL;
return ret;
}
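From user space, the control messages parsed above ride on sendmsg(). A hedged example of queueing an RDMA transfer with RDS_CMSG_RDMA_ARGS, using the user-visible struct rds_rdma_args; fd, dest, cookie, raddr, buf, len and my_token are assumptions of this sketch:

	struct rds_rdma_args args = { 0 };
	struct rds_iovec local = {
		.addr  = (uint64_t)(unsigned long) buf,
		.bytes = len,
	};
	struct msghdr msg = { 0 };
	char cbuf[CMSG_SPACE(sizeof(args))];
	struct cmsghdr *cmsg;

	args.cookie = cookie;		/* from an earlier MR registration */
	args.remote_vec.addr = raddr;
	args.remote_vec.bytes = len;
	args.local_vec_addr = (uint64_t)(unsigned long) &local;
	args.nr_local = 1;
	args.user_token = my_token;	/* echoed back in RDS_CMSG_RDMA_STATUS */

	msg.msg_name = &dest;		/* struct sockaddr_in of the peer */
	msg.msg_namelen = sizeof(dest);
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_RDS;
	cmsg->cmsg_type = RDS_CMSG_RDMA_ARGS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(args));
	memcpy(CMSG_DATA(cmsg), &args, sizeof(args));

	sendmsg(fd, &msg, 0);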
-int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
+int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t payload_len)
{
struct sock *sk = sock->sk;
struct rds_sock *rs = rds_sk_to_rs(sk);
- DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
+ struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
__be32 daddr;
__be16 dport;
struct rds_message *rm = NULL;
int ret = 0;
int queued = 0, allocated_mr = 0;
int nonblock = msg->msg_flags & MSG_DONTWAIT;
- long timeo = sock_sndtimeo(sk, nonblock);
+ long timeo = sock_rcvtimeo(sk, nonblock);
/* Mirror Linux UDP mirror of BSD error message compatibility */
/* XXX: Perhaps MSG_MORE someday */
if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) {
+ printk(KERN_INFO "msg_flags 0x%08X\n", msg->msg_flags);
ret = -EOPNOTSUPP;
goto out;
}
goto out;
}
- /* size of rm including all sgs */
- ret = rds_rm_size(msg, payload_len);
- if (ret < 0)
+ rm = rds_message_copy_from_user(msg->msg_iov, payload_len);
+ if (IS_ERR(rm)) {
+ ret = PTR_ERR(rm);
+ rm = NULL;
goto out;
-
- rm = rds_message_alloc(ret, GFP_KERNEL);
- if (!rm) {
- ret = -ENOMEM;
- goto out;
- }
-
- /* Attach data to the rm */
- if (payload_len) {
- rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
- if (!rm->data.op_sg) {
- ret = -ENOMEM;
- goto out;
- }
- ret = rds_message_copy_from_user(rm, &msg->msg_iter);
- if (ret)
- goto out;
}
- rm->data.op_active = 1;
rm->m_daddr = daddr;
+ /* Parse any control messages the user may have included. */
+ ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
+ if (ret)
+ goto out;
+
/* rds_conn_create has a spinlock that runs with IRQ off.
* Caching the conn in the socket helps a lot. */
if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
rs->rs_conn = conn;
}
- /* Parse any control messages the user may have included. */
- ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
- if (ret)
- goto out;
-
- if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
- printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
- &rm->rdma, conn->c_trans->xmit_rdma);
- ret = -EOPNOTSUPP;
- goto out;
- }
-
- if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) {
- printk_ratelimited(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n",
- &rm->atomic, conn->c_trans->xmit_atomic);
+ if ((rm->m_rdma_cookie || rm->m_rdma_op)
+ && conn->c_trans->xmit_rdma == NULL) {
+ if (printk_ratelimit())
+ printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
+ rm->m_rdma_op, conn->c_trans->xmit_rdma);
ret = -EOPNOTSUPP;
goto out;
}
- rds_conn_connect_if_down(conn);
+ /* If the connection is down, trigger a connect. We may
+ * have scheduled a delayed reconnect however - in this case
+ * we should not interfere.
+ */
+ if (rds_conn_state(conn) == RDS_CONN_DOWN
+ && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
+ queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
- if (ret) {
- rs->rs_seen_congestion = 1;
+ if (ret)
goto out;
- }
while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
dport, &queued)) {
goto out;
}
- timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
+ timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
rds_send_queue_rm(rs, conn, rm,
rs->rs_bound_port,
dport,
rds_stats_inc(s_send_queued);
if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
- rds_send_xmit(conn);
+ rds_send_worker(&conn->c_send_w.work);
rds_message_put(rm);
return payload_len;
int ret = 0;
rm = rds_message_alloc(0, GFP_ATOMIC);
- if (!rm) {
+ if (rm == NULL) {
ret = -ENOMEM;
goto out;
}
rm->m_daddr = conn->c_faddr;
- rm->data.op_active = 1;
- rds_conn_connect_if_down(conn);
+ /* If the connection is down, trigger a connect. We may
+ * have scheduled a delayed reconnect however - in this case
+ * we should not interfere.
+ */
+ if (rds_conn_state(conn) == RDS_CONN_DOWN
+ && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
+ queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
if (ret)
rds_stats_inc(s_send_queued);
rds_stats_inc(s_send_pong);
- if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
-
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
rds_message_put(rm);
return 0;
#include <linux/percpu.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
-#include <linux/export.h>
#include "rds.h"
DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
-EXPORT_PER_CPU_SYMBOL_GPL(rds_stats);
/* :.,$s/unsigned long\>.*\<s_\(.*\);/"\1",/g */
-static const char *const rds_stat_names[] = {
+static char *rds_stat_names[] = {
"conn_reset",
"recv_drop_bad_checksum",
"recv_drop_old_seq",
"recv_ping",
"send_queue_empty",
"send_queue_full",
- "send_lock_contention",
- "send_lock_queue_raced",
+ "send_sem_contention",
+ "send_sem_queue_raced",
"send_immediate_retry",
"send_delayed_retry",
"send_drop_acked",
};
void rds_stats_info_copy(struct rds_info_iterator *iter,
- uint64_t *values, const char *const *names, size_t nr)
+ uint64_t *values, char **names, size_t nr)
{
struct rds_info_counter ctr;
size_t i;
for (i = 0; i < nr; i++) {
BUG_ON(strlen(names[i]) >= sizeof(ctr.name));
strncpy(ctr.name, names[i], sizeof(ctr.name) - 1);
- ctr.name[sizeof(ctr.name) - 1] = '\0';
ctr.value = values[i];
rds_info_copy(iter, &ctr, sizeof(ctr));
}
}
-EXPORT_SYMBOL_GPL(rds_stats_info_copy);
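A transport with its own per-CPU counters follows the same pattern as the global statistics: sum each counter across CPUs into a flat uint64_t array and hand it to rds_stats_info_copy() with a matching name table. A hedged sketch (the example_* names are placeholders):

static char *example_stat_names[] = {
	"example_events",
};

struct example_statistics {
	uint64_t s_example_events;
};

static DEFINE_PER_CPU(struct example_statistics, example_stats);

static unsigned int example_stats_info_copy(struct rds_info_iterator *iter,
					    unsigned int avail)
{
	uint64_t sums[ARRAY_SIZE(example_stat_names)] = { 0 };
	int cpu;
	size_t i;

	if (avail < ARRAY_SIZE(example_stat_names))
		goto out;

	for_each_online_cpu(cpu) {
		uint64_t *src = (uint64_t *)&per_cpu(example_stats, cpu);

		for (i = 0; i < ARRAY_SIZE(example_stat_names); i++)
			sums[i] += src[i];
	}

	rds_stats_info_copy(iter, sums, example_stat_names,
			    ARRAY_SIZE(example_stat_names));
out:
	return ARRAY_SIZE(example_stat_names);
}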
/*
* This gives global counters across all the transports. The strings
rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info);
}
-int rds_stats_init(void)
+int __init rds_stats_init(void)
{
rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info);
return 0;
unsigned int rds_sysctl_ping_enable = 1;
-static struct ctl_table rds_sysctl_rds_table[] = {
+static ctl_table rds_sysctl_rds_table[] = {
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "reconnect_min_delay_ms",
.data = &rds_sysctl_reconnect_min_jiffies,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_ms_jiffies_minmax,
+ .proc_handler = &proc_doulongvec_ms_jiffies_minmax,
.extra1 = &rds_sysctl_reconnect_min,
.extra2 = &rds_sysctl_reconnect_max_jiffies,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "reconnect_max_delay_ms",
.data = &rds_sysctl_reconnect_max_jiffies,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_ms_jiffies_minmax,
+ .proc_handler = &proc_doulongvec_ms_jiffies_minmax,
.extra1 = &rds_sysctl_reconnect_min_jiffies,
.extra2 = &rds_sysctl_reconnect_max,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "max_unacked_packets",
.data = &rds_sysctl_max_unacked_packets,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = &proc_dointvec,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "max_unacked_bytes",
.data = &rds_sysctl_max_unacked_bytes,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = &proc_dointvec,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "ping_enable",
.data = &rds_sysctl_ping_enable,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = &proc_dointvec,
},
+ { .ctl_name = 0}
+};
+
+static struct ctl_path rds_sysctl_path[] = {
+ { .procname = "net", .ctl_name = CTL_NET, },
+ { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
{ }
};
+
void rds_sysctl_exit(void)
{
- unregister_net_sysctl_table(rds_sysctl_reg_table);
+ if (rds_sysctl_reg_table)
+ unregister_sysctl_table(rds_sysctl_reg_table);
}
-int rds_sysctl_init(void)
+int __init rds_sysctl_init(void)
{
rds_sysctl_reconnect_min = msecs_to_jiffies(1);
rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
- rds_sysctl_reg_table = register_net_sysctl(&init_net,"net/rds", rds_sysctl_rds_table);
- if (!rds_sysctl_reg_table)
+ rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table);
+ if (rds_sysctl_reg_table == NULL)
return -ENOMEM;
return 0;
}
+++ /dev/null
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/in.h>
-#include <linux/module.h>
-#include <net/tcp.h>
-
-#include "rds.h"
-#include "tcp.h"
-
-/* only for info exporting */
-static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
-static LIST_HEAD(rds_tcp_tc_list);
-static unsigned int rds_tcp_tc_count;
-
-/* Track rds_tcp_connection structs so they can be cleaned up */
-static DEFINE_SPINLOCK(rds_tcp_conn_lock);
-static LIST_HEAD(rds_tcp_conn_list);
-
-static struct kmem_cache *rds_tcp_conn_slab;
-
-#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024)
-
-/* doing it this way avoids calling tcp_sk() */
-void rds_tcp_nonagle(struct socket *sock)
-{
- mm_segment_t oldfs = get_fs();
- int val = 1;
-
- set_fs(KERNEL_DS);
- sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, (char __user *)&val,
- sizeof(val));
- set_fs(oldfs);
-}
-
-void rds_tcp_tune(struct socket *sock)
-{
- struct sock *sk = sock->sk;
-
- rds_tcp_nonagle(sock);
-
- /*
- * We're trying to saturate gigabit with the default,
- * see svc_sock_setbufsize().
- */
- lock_sock(sk);
- sk->sk_sndbuf = RDS_TCP_DEFAULT_BUFSIZE;
- sk->sk_rcvbuf = RDS_TCP_DEFAULT_BUFSIZE;
- sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK;
- release_sock(sk);
-}
-
-u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
-{
- return tcp_sk(tc->t_sock->sk)->snd_nxt;
-}
-
-u32 rds_tcp_snd_una(struct rds_tcp_connection *tc)
-{
- return tcp_sk(tc->t_sock->sk)->snd_una;
-}
-
-void rds_tcp_restore_callbacks(struct socket *sock,
- struct rds_tcp_connection *tc)
-{
- rdsdebug("restoring sock %p callbacks from tc %p\n", sock, tc);
- write_lock_bh(&sock->sk->sk_callback_lock);
-
- /* done under the callback_lock to serialize with write_space */
- spin_lock(&rds_tcp_tc_list_lock);
- list_del_init(&tc->t_list_item);
- rds_tcp_tc_count--;
- spin_unlock(&rds_tcp_tc_list_lock);
-
- tc->t_sock = NULL;
-
- sock->sk->sk_write_space = tc->t_orig_write_space;
- sock->sk->sk_data_ready = tc->t_orig_data_ready;
- sock->sk->sk_state_change = tc->t_orig_state_change;
- sock->sk->sk_user_data = NULL;
-
- write_unlock_bh(&sock->sk->sk_callback_lock);
-}
-
-/*
- * This is the only path that sets tc->t_sock. Send and receive trust that
- * it is set. The RDS_CONN_CONNECTED bit protects those paths from being
- * called while it isn't set.
- */
-void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn)
-{
- struct rds_tcp_connection *tc = conn->c_transport_data;
-
- rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc);
- write_lock_bh(&sock->sk->sk_callback_lock);
-
- /* done under the callback_lock to serialize with write_space */
- spin_lock(&rds_tcp_tc_list_lock);
- list_add_tail(&tc->t_list_item, &rds_tcp_tc_list);
- rds_tcp_tc_count++;
- spin_unlock(&rds_tcp_tc_list_lock);
-
- /* accepted sockets need our listen data ready undone */
- if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready)
- sock->sk->sk_data_ready = sock->sk->sk_user_data;
-
- tc->t_sock = sock;
- tc->conn = conn;
- tc->t_orig_data_ready = sock->sk->sk_data_ready;
- tc->t_orig_write_space = sock->sk->sk_write_space;
- tc->t_orig_state_change = sock->sk->sk_state_change;
-
- sock->sk->sk_user_data = conn;
- sock->sk->sk_data_ready = rds_tcp_data_ready;
- sock->sk->sk_write_space = rds_tcp_write_space;
- sock->sk->sk_state_change = rds_tcp_state_change;
-
- write_unlock_bh(&sock->sk->sk_callback_lock);
-}
-
-static void rds_tcp_tc_info(struct socket *sock, unsigned int len,
- struct rds_info_iterator *iter,
- struct rds_info_lengths *lens)
-{
- struct rds_info_tcp_socket tsinfo;
- struct rds_tcp_connection *tc;
- unsigned long flags;
- struct sockaddr_in sin;
- int sinlen;
-
- spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
-
- if (len / sizeof(tsinfo) < rds_tcp_tc_count)
- goto out;
-
- list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
-
- sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 0);
- tsinfo.local_addr = sin.sin_addr.s_addr;
- tsinfo.local_port = sin.sin_port;
- sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 1);
- tsinfo.peer_addr = sin.sin_addr.s_addr;
- tsinfo.peer_port = sin.sin_port;
-
- tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
- tsinfo.data_rem = tc->t_tinc_data_rem;
- tsinfo.last_sent_nxt = tc->t_last_sent_nxt;
- tsinfo.last_expected_una = tc->t_last_expected_una;
- tsinfo.last_seen_una = tc->t_last_seen_una;
-
- rds_info_copy(iter, &tsinfo, sizeof(tsinfo));
- }
-
-out:
- lens->nr = rds_tcp_tc_count;
- lens->each = sizeof(tsinfo);
-
- spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
-}
-
-static int rds_tcp_laddr_check(__be32 addr)
-{
- if (inet_addr_type(&init_net, addr) == RTN_LOCAL)
- return 0;
- return -EADDRNOTAVAIL;
-}
-
-static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
-{
- struct rds_tcp_connection *tc;
-
- tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
- if (!tc)
- return -ENOMEM;
-
- tc->t_sock = NULL;
- tc->t_tinc = NULL;
- tc->t_tinc_hdr_rem = sizeof(struct rds_header);
- tc->t_tinc_data_rem = 0;
-
- conn->c_transport_data = tc;
-
- spin_lock_irq(&rds_tcp_conn_lock);
- list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
- spin_unlock_irq(&rds_tcp_conn_lock);
-
- rdsdebug("alloced tc %p\n", conn->c_transport_data);
- return 0;
-}
-
-static void rds_tcp_conn_free(void *arg)
-{
- struct rds_tcp_connection *tc = arg;
- unsigned long flags;
- rdsdebug("freeing tc %p\n", tc);
-
- spin_lock_irqsave(&rds_tcp_conn_lock, flags);
- list_del(&tc->t_tcp_node);
- spin_unlock_irqrestore(&rds_tcp_conn_lock, flags);
-
- kmem_cache_free(rds_tcp_conn_slab, tc);
-}
-
-static void rds_tcp_destroy_conns(void)
-{
- struct rds_tcp_connection *tc, *_tc;
- LIST_HEAD(tmp_list);
-
- /* avoid calling conn_destroy with irqs off */
- spin_lock_irq(&rds_tcp_conn_lock);
- list_splice(&rds_tcp_conn_list, &tmp_list);
- INIT_LIST_HEAD(&rds_tcp_conn_list);
- spin_unlock_irq(&rds_tcp_conn_lock);
-
- list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) {
- if (tc->conn->c_passive)
- rds_conn_destroy(tc->conn->c_passive);
- rds_conn_destroy(tc->conn);
- }
-}
-
-static void rds_tcp_exit(void)
-{
- rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
- rds_tcp_listen_stop();
- rds_tcp_destroy_conns();
- rds_trans_unregister(&rds_tcp_transport);
- rds_tcp_recv_exit();
- kmem_cache_destroy(rds_tcp_conn_slab);
-}
-module_exit(rds_tcp_exit);
-
-struct rds_transport rds_tcp_transport = {
- .laddr_check = rds_tcp_laddr_check,
- .xmit_prepare = rds_tcp_xmit_prepare,
- .xmit_complete = rds_tcp_xmit_complete,
- .xmit = rds_tcp_xmit,
- .recv = rds_tcp_recv,
- .conn_alloc = rds_tcp_conn_alloc,
- .conn_free = rds_tcp_conn_free,
- .conn_connect = rds_tcp_conn_connect,
- .conn_shutdown = rds_tcp_conn_shutdown,
- .inc_copy_to_user = rds_tcp_inc_copy_to_user,
- .inc_free = rds_tcp_inc_free,
- .stats_info_copy = rds_tcp_stats_info_copy,
- .exit = rds_tcp_exit,
- .t_owner = THIS_MODULE,
- .t_name = "tcp",
- .t_type = RDS_TRANS_TCP,
- .t_prefer_loopback = 1,
-};
-
-static int rds_tcp_init(void)
-{
- int ret;
-
- rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection",
- sizeof(struct rds_tcp_connection),
- 0, 0, NULL);
- if (!rds_tcp_conn_slab) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = rds_tcp_recv_init();
- if (ret)
- goto out_slab;
-
- ret = rds_trans_register(&rds_tcp_transport);
- if (ret)
- goto out_recv;
-
- ret = rds_tcp_listen_init();
- if (ret)
- goto out_register;
-
- rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
-
- goto out;
-
-out_register:
- rds_trans_unregister(&rds_tcp_transport);
-out_recv:
- rds_tcp_recv_exit();
-out_slab:
- kmem_cache_destroy(rds_tcp_conn_slab);
-out:
- return ret;
-}
-module_init(rds_tcp_init);
-
-MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
-MODULE_DESCRIPTION("RDS: TCP transport");
-MODULE_LICENSE("Dual BSD/GPL");
-
+++ /dev/null
-#ifndef _RDS_TCP_H
-#define _RDS_TCP_H
-
-#define RDS_TCP_PORT 16385
-
-struct rds_tcp_incoming {
- struct rds_incoming ti_inc;
- struct sk_buff_head ti_skb_list;
-};
-
-struct rds_tcp_connection {
-
- struct list_head t_tcp_node;
- struct rds_connection *conn;
- struct socket *t_sock;
- void *t_orig_write_space;
- void *t_orig_data_ready;
- void *t_orig_state_change;
-
- struct rds_tcp_incoming *t_tinc;
- size_t t_tinc_hdr_rem;
- size_t t_tinc_data_rem;
-
- /* XXX error report? */
- struct work_struct t_conn_w;
- struct work_struct t_send_w;
- struct work_struct t_down_w;
- struct work_struct t_recv_w;
-
- /* for info exporting only */
- struct list_head t_list_item;
- u32 t_last_sent_nxt;
- u32 t_last_expected_una;
- u32 t_last_seen_una;
-};
-
-struct rds_tcp_statistics {
- uint64_t s_tcp_data_ready_calls;
- uint64_t s_tcp_write_space_calls;
- uint64_t s_tcp_sndbuf_full;
- uint64_t s_tcp_connect_raced;
- uint64_t s_tcp_listen_closed_stale;
-};
-
-/* tcp.c */
-void rds_tcp_tune(struct socket *sock);
-void rds_tcp_nonagle(struct socket *sock);
-void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn);
-void rds_tcp_restore_callbacks(struct socket *sock,
- struct rds_tcp_connection *tc);
-u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc);
-u32 rds_tcp_snd_una(struct rds_tcp_connection *tc);
-u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq);
-extern struct rds_transport rds_tcp_transport;
-
-/* tcp_connect.c */
-int rds_tcp_conn_connect(struct rds_connection *conn);
-void rds_tcp_conn_shutdown(struct rds_connection *conn);
-void rds_tcp_state_change(struct sock *sk);
-
-/* tcp_listen.c */
-int rds_tcp_listen_init(void);
-void rds_tcp_listen_stop(void);
-void rds_tcp_listen_data_ready(struct sock *sk);
-
-/* tcp_recv.c */
-int rds_tcp_recv_init(void);
-void rds_tcp_recv_exit(void);
-void rds_tcp_data_ready(struct sock *sk);
-int rds_tcp_recv(struct rds_connection *conn);
-void rds_tcp_inc_free(struct rds_incoming *inc);
-int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
-
-/* tcp_send.c */
-void rds_tcp_xmit_prepare(struct rds_connection *conn);
-void rds_tcp_xmit_complete(struct rds_connection *conn);
-int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
- unsigned int hdr_off, unsigned int sg, unsigned int off);
-void rds_tcp_write_space(struct sock *sk);
-
-/* tcp_stats.c */
-DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats);
-#define rds_tcp_stats_inc(member) rds_stats_inc_which(rds_tcp_stats, member)
-unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter,
- unsigned int avail);
-
-#endif
+++ /dev/null
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/in.h>
-#include <net/tcp.h>
-
-#include "rds.h"
-#include "tcp.h"
-
-void rds_tcp_state_change(struct sock *sk)
-{
- void (*state_change)(struct sock *sk);
- struct rds_connection *conn;
- struct rds_tcp_connection *tc;
-
- read_lock(&sk->sk_callback_lock);
- conn = sk->sk_user_data;
- if (!conn) {
- state_change = sk->sk_state_change;
- goto out;
- }
- tc = conn->c_transport_data;
- state_change = tc->t_orig_state_change;
-
- rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state);
-
- switch(sk->sk_state) {
- /* ignore connecting sockets as they make progress */
- case TCP_SYN_SENT:
- case TCP_SYN_RECV:
- break;
- case TCP_ESTABLISHED:
- rds_connect_complete(conn);
- break;
- case TCP_CLOSE_WAIT:
- case TCP_CLOSE:
- rds_conn_drop(conn);
- default:
- break;
- }
-out:
- read_unlock(&sk->sk_callback_lock);
- state_change(sk);
-}
-
-int rds_tcp_conn_connect(struct rds_connection *conn)
-{
- struct socket *sock = NULL;
- struct sockaddr_in src, dest;
- int ret;
-
- ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
- if (ret < 0)
- goto out;
-
- rds_tcp_tune(sock);
-
- src.sin_family = AF_INET;
- src.sin_addr.s_addr = (__force u32)conn->c_laddr;
- src.sin_port = (__force u16)htons(0);
-
- ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src));
- if (ret) {
- rdsdebug("bind failed with %d at address %pI4\n",
- ret, &conn->c_laddr);
- goto out;
- }
-
- dest.sin_family = AF_INET;
- dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
- dest.sin_port = (__force u16)htons(RDS_TCP_PORT);
-
- /*
- * once we call connect() we can start getting callbacks and they
- * own the socket
- */
- rds_tcp_set_callbacks(sock, conn);
- ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest),
- O_NONBLOCK);
-
- rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret);
- if (ret == -EINPROGRESS)
- ret = 0;
- if (ret == 0)
- sock = NULL;
- else
- rds_tcp_restore_callbacks(sock, conn->c_transport_data);
-
-out:
- if (sock)
- sock_release(sock);
- return ret;
-}
-
-/*
- * Before killing the tcp socket this needs to serialize with callbacks. The
- * caller has already grabbed the sending sem so we're serialized with other
- * senders.
- *
- * TCP calls the callbacks with the sock lock so we hold it while we reset the
- * callbacks to those set by TCP. Our callbacks won't execute again once we
- * hold the sock lock.
- */
-void rds_tcp_conn_shutdown(struct rds_connection *conn)
-{
- struct rds_tcp_connection *tc = conn->c_transport_data;
- struct socket *sock = tc->t_sock;
-
- rdsdebug("shutting down conn %p tc %p sock %p\n", conn, tc, sock);
-
- if (sock) {
- sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN);
- lock_sock(sock->sk);
- rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */
-
- release_sock(sock->sk);
- sock_release(sock);
- }
-
- if (tc->t_tinc) {
- rds_inc_put(&tc->t_tinc->ti_inc);
- tc->t_tinc = NULL;
- }
- tc->t_tinc_hdr_rem = sizeof(struct rds_header);
- tc->t_tinc_data_rem = 0;
-}
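/*
 * Rough sketch of the callback hijacking that rds_tcp_set_callbacks() and
 * rds_tcp_restore_callbacks() perform (the real helpers live in tcp.c and
 * may differ in detail):
 *
 *    write_lock_bh(&sock->sk->sk_callback_lock);
 *    tc->t_orig_data_ready = sock->sk->sk_data_ready;
 *    sock->sk->sk_user_data = conn;
 *    sock->sk->sk_data_ready = rds_tcp_data_ready;
 *    write_unlock_bh(&sock->sk->sk_callback_lock);
 *
 * Restoring writes the saved pointers back and clears sk_user_data under
 * the same lock, which is why rds_tcp_conn_shutdown() above only needs the
 * sock lock, and why the callbacks bail out when sk_user_data is NULL.
 */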
+++ /dev/null
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/gfp.h>
-#include <linux/in.h>
-#include <net/tcp.h>
-
-#include "rds.h"
-#include "tcp.h"
-
-/*
- * cheesy, but simple..
- */
-static void rds_tcp_accept_worker(struct work_struct *work);
-static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker);
-static struct socket *rds_tcp_listen_sock;
-
-static int rds_tcp_keepalive(struct socket *sock)
-{
- /* values below based on xs_udp_default_timeout */
- int keepidle = 5; /* send a probe 'keepidle' secs after last data */
- int keepcnt = 5; /* number of unack'ed probes before declaring dead */
- int keepalive = 1;
- int ret = 0;
-
- ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
- (char *)&keepalive, sizeof(keepalive));
- if (ret < 0)
- goto bail;
-
- ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT,
- (char *)&keepcnt, sizeof(keepcnt));
- if (ret < 0)
- goto bail;
-
- ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE,
- (char *)&keepidle, sizeof(keepidle));
- if (ret < 0)
- goto bail;
-
- /* KEEPINTVL is the interval between successive probes. We follow
- * the model in xs_tcp_finish_connecting() and re-use keepidle.
- */
- ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL,
- (char *)&keepidle, sizeof(keepidle));
-bail:
- return ret;
-}
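/*
 * With the values above (keepidle = keepintvl = 5s, keepcnt = 5) a peer
 * that stops responding is declared dead roughly
 * keepidle + keepcnt * keepintvl = 5 + 5 * 5 = 30 seconds after the
 * connection goes quiet.
 */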
-
-static int rds_tcp_accept_one(struct socket *sock)
-{
- struct socket *new_sock = NULL;
- struct rds_connection *conn;
- int ret;
- struct inet_sock *inet;
- struct rds_tcp_connection *rs_tcp;
-
- ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
- sock->sk->sk_protocol, &new_sock);
- if (ret)
- goto out;
-
- new_sock->type = sock->type;
- new_sock->ops = sock->ops;
- ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
- if (ret < 0)
- goto out;
-
- ret = rds_tcp_keepalive(new_sock);
- if (ret < 0)
- goto out;
-
- rds_tcp_tune(new_sock);
-
- inet = inet_sk(new_sock->sk);
-
- rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n",
- &inet->inet_saddr, ntohs(inet->inet_sport),
- &inet->inet_daddr, ntohs(inet->inet_dport));
-
- conn = rds_conn_create(inet->inet_saddr, inet->inet_daddr,
- &rds_tcp_transport, GFP_KERNEL);
- if (IS_ERR(conn)) {
- ret = PTR_ERR(conn);
- goto out;
- }
- /* An incoming SYN request came in, and TCP just accepted it.
- * We always create a new conn for listen side of TCP, and do not
- * add it to the c_hash_list.
- *
- * If the client reboots, this conn will need to be cleaned up.
- * rds_tcp_state_change() will do that cleanup
- */
- rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data;
- WARN_ON(!rs_tcp || rs_tcp->t_sock);
-
- /*
- * see the comment above rds_queue_delayed_reconnect()
- */
- if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
- if (rds_conn_state(conn) == RDS_CONN_UP)
- rds_tcp_stats_inc(s_tcp_listen_closed_stale);
- else
- rds_tcp_stats_inc(s_tcp_connect_raced);
- rds_conn_drop(conn);
- ret = 0;
- goto out;
- }
-
- rds_tcp_set_callbacks(new_sock, conn);
- rds_connect_complete(conn);
- new_sock = NULL;
- ret = 0;
-
-out:
- if (new_sock)
- sock_release(new_sock);
- return ret;
-}
-
-static void rds_tcp_accept_worker(struct work_struct *work)
-{
- while (rds_tcp_accept_one(rds_tcp_listen_sock) == 0)
- cond_resched();
-}
-
-void rds_tcp_listen_data_ready(struct sock *sk)
-{
- void (*ready)(struct sock *sk);
-
- rdsdebug("listen data ready sk %p\n", sk);
-
- read_lock(&sk->sk_callback_lock);
- ready = sk->sk_user_data;
- if (!ready) { /* check for teardown race */
- ready = sk->sk_data_ready;
- goto out;
- }
-
- /*
- * ->sk_data_ready is also called for a newly established child socket
- * before it has been accepted and the accepter has set up their
- * data_ready.. we only want to queue listen work for our listening
- * socket
- */
- if (sk->sk_state == TCP_LISTEN)
- queue_work(rds_wq, &rds_tcp_listen_work);
-
-out:
- read_unlock(&sk->sk_callback_lock);
- ready(sk);
-}
-
-int rds_tcp_listen_init(void)
-{
- struct sockaddr_in sin;
- struct socket *sock = NULL;
- int ret;
-
- ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
- if (ret < 0)
- goto out;
-
- sock->sk->sk_reuse = SK_CAN_REUSE;
- rds_tcp_nonagle(sock);
-
- write_lock_bh(&sock->sk->sk_callback_lock);
- sock->sk->sk_user_data = sock->sk->sk_data_ready;
- sock->sk->sk_data_ready = rds_tcp_listen_data_ready;
- write_unlock_bh(&sock->sk->sk_callback_lock);
-
- sin.sin_family = PF_INET;
- sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
- sin.sin_port = (__force u16)htons(RDS_TCP_PORT);
-
- ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
- if (ret < 0)
- goto out;
-
- ret = sock->ops->listen(sock, 64);
- if (ret < 0)
- goto out;
-
- rds_tcp_listen_sock = sock;
- sock = NULL;
-out:
- if (sock)
- sock_release(sock);
- return ret;
-}
-
-void rds_tcp_listen_stop(void)
-{
- struct socket *sock = rds_tcp_listen_sock;
- struct sock *sk;
-
- if (!sock)
- return;
-
- sk = sock->sk;
-
- /* serialize with and prevent further callbacks */
- lock_sock(sk);
- write_lock_bh(&sk->sk_callback_lock);
- if (sk->sk_user_data) {
- sk->sk_data_ready = sk->sk_user_data;
- sk->sk_user_data = NULL;
- }
- write_unlock_bh(&sk->sk_callback_lock);
- release_sock(sk);
-
- /* wait for accepts to stop and close the socket */
- flush_workqueue(rds_wq);
- sock_release(sock);
- rds_tcp_listen_sock = NULL;
-}
+++ /dev/null
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <net/tcp.h>
-
-#include "rds.h"
-#include "tcp.h"
-
-static struct kmem_cache *rds_tcp_incoming_slab;
-
-static void rds_tcp_inc_purge(struct rds_incoming *inc)
-{
- struct rds_tcp_incoming *tinc;
- tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
- rdsdebug("purging tinc %p inc %p\n", tinc, inc);
- skb_queue_purge(&tinc->ti_skb_list);
-}
-
-void rds_tcp_inc_free(struct rds_incoming *inc)
-{
- struct rds_tcp_incoming *tinc;
- tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
- rds_tcp_inc_purge(inc);
- rdsdebug("freeing tinc %p inc %p\n", tinc, inc);
- kmem_cache_free(rds_tcp_incoming_slab, tinc);
-}
-
-/*
- * this is pretty lame, but, whatever.
- */
-int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
-{
- struct rds_tcp_incoming *tinc;
- struct sk_buff *skb;
- int ret = 0;
-
- if (!iov_iter_count(to))
- goto out;
-
- tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
-
- skb_queue_walk(&tinc->ti_skb_list, skb) {
- unsigned long to_copy, skb_off;
- for (skb_off = 0; skb_off < skb->len; skb_off += to_copy) {
- to_copy = iov_iter_count(to);
- to_copy = min(to_copy, skb->len - skb_off);
-
- if (skb_copy_datagram_iter(skb, skb_off, to, to_copy))
- return -EFAULT;
-
- rds_stats_add(s_copy_to_user, to_copy);
- ret += to_copy;
-
- if (!iov_iter_count(to))
- goto out;
- }
- }
-out:
- return ret;
-}
-
-/*
- * We have a series of skbs that have fragmented pieces of the congestion
- * bitmap. They must add up to the exact size of the congestion bitmap. We
- * use the skb helpers to copy those into the pages that make up the in-memory
- * congestion bitmap for the remote address of this connection. We then tell
- * the congestion core that the bitmap has been changed so that it can wake up
- * sleepers.
- *
- * This is racing with sending paths which are using test_bit to see if the
- * bitmap indicates that their recipient is congested.
- */
-
-static void rds_tcp_cong_recv(struct rds_connection *conn,
- struct rds_tcp_incoming *tinc)
-{
- struct sk_buff *skb;
- unsigned int to_copy, skb_off;
- unsigned int map_off;
- unsigned int map_page;
- struct rds_cong_map *map;
- int ret;
-
- /* catch completely corrupt packets */
- if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
- return;
-
- map_page = 0;
- map_off = 0;
- map = conn->c_fcong;
-
- skb_queue_walk(&tinc->ti_skb_list, skb) {
- skb_off = 0;
- while (skb_off < skb->len) {
- to_copy = min_t(unsigned int, PAGE_SIZE - map_off,
- skb->len - skb_off);
-
- BUG_ON(map_page >= RDS_CONG_MAP_PAGES);
-
- /* only returns 0 or -error */
- ret = skb_copy_bits(skb, skb_off,
- (void *)map->m_page_addrs[map_page] + map_off,
- to_copy);
- BUG_ON(ret != 0);
-
- skb_off += to_copy;
- map_off += to_copy;
- if (map_off == PAGE_SIZE) {
- map_off = 0;
- map_page++;
- }
- }
- }
-
- rds_cong_map_updated(map, ~(u64) 0);
-}
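/*
 * For scale: the congestion map carries one bit per 16-bit RDS port, so
 * RDS_CONG_MAP_BYTES works out to 65536 / 8 = 8192 bytes, i.e. two pages
 * on a 4 KiB-page system.  The loop above fills page 0 at offsets 0..4095
 * and then advances map_page to page 1, independent of how the bytes were
 * split across skbs.
 */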
-
-struct rds_tcp_desc_arg {
- struct rds_connection *conn;
- gfp_t gfp;
-};
-
-static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
- unsigned int offset, size_t len)
-{
- struct rds_tcp_desc_arg *arg = desc->arg.data;
- struct rds_connection *conn = arg->conn;
- struct rds_tcp_connection *tc = conn->c_transport_data;
- struct rds_tcp_incoming *tinc = tc->t_tinc;
- struct sk_buff *clone;
- size_t left = len, to_copy;
-
- rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset,
- len);
-
- /*
- * tcp_read_sock() interprets partial progress as an indication to stop
- * processing.
- */
- while (left) {
- if (!tinc) {
- tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
- arg->gfp);
- if (!tinc) {
- desc->error = -ENOMEM;
- goto out;
- }
- tc->t_tinc = tinc;
- rdsdebug("alloced tinc %p\n", tinc);
- rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr);
- /*
- * XXX * we might be able to use the __ variants when
- * we've already serialized at a higher level.
- */
- skb_queue_head_init(&tinc->ti_skb_list);
- }
-
- if (left && tc->t_tinc_hdr_rem) {
- to_copy = min(tc->t_tinc_hdr_rem, left);
- rdsdebug("copying %zu header from skb %p\n", to_copy,
- skb);
- skb_copy_bits(skb, offset,
- (char *)&tinc->ti_inc.i_hdr +
- sizeof(struct rds_header) -
- tc->t_tinc_hdr_rem,
- to_copy);
- tc->t_tinc_hdr_rem -= to_copy;
- left -= to_copy;
- offset += to_copy;
-
- if (tc->t_tinc_hdr_rem == 0) {
- /* could be 0 for a 0 len message */
- tc->t_tinc_data_rem =
- be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
- }
- }
-
- if (left && tc->t_tinc_data_rem) {
- clone = skb_clone(skb, arg->gfp);
- if (!clone) {
- desc->error = -ENOMEM;
- goto out;
- }
-
- to_copy = min(tc->t_tinc_data_rem, left);
- pskb_pull(clone, offset);
- pskb_trim(clone, to_copy);
- skb_queue_tail(&tinc->ti_skb_list, clone);
-
- rdsdebug("skb %p data %p len %d off %u to_copy %zu -> "
- "clone %p data %p len %d\n",
- skb, skb->data, skb->len, offset, to_copy,
- clone, clone->data, clone->len);
-
- tc->t_tinc_data_rem -= to_copy;
- left -= to_copy;
- offset += to_copy;
- }
-
- if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) {
- if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
- rds_tcp_cong_recv(conn, tinc);
- else
- rds_recv_incoming(conn, conn->c_faddr,
- conn->c_laddr, &tinc->ti_inc,
- arg->gfp);
-
- tc->t_tinc_hdr_rem = sizeof(struct rds_header);
- tc->t_tinc_data_rem = 0;
- tc->t_tinc = NULL;
- rds_inc_put(&tinc->ti_inc);
- tinc = NULL;
- }
- }
-out:
- rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n",
- len, left, skb->len,
- skb_queue_len(&tc->t_sock->sk->sk_receive_queue));
- return len - left;
-}
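/*
 * Worked example: an RDS message with a 100-byte payload arrives split
 * across two skbs, with only 30 bytes in the first.  The first call copies
 * 30 header bytes and leaves t_tinc_hdr_rem at
 * sizeof(struct rds_header) - 30; the second call finishes the header,
 * sets t_tinc_data_rem to h_len (100), clones the payload bytes onto
 * ti_skb_list, and only when both counters reach zero is the message
 * handed to rds_recv_incoming() (or to rds_tcp_cong_recv() for a
 * congestion-map update).
 */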
-
-/* the caller has to hold the sock lock */
-static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp)
-{
- struct rds_tcp_connection *tc = conn->c_transport_data;
- struct socket *sock = tc->t_sock;
- read_descriptor_t desc;
- struct rds_tcp_desc_arg arg;
-
- /* It's like glib in the kernel! */
- arg.conn = conn;
- arg.gfp = gfp;
- desc.arg.data = &arg;
- desc.error = 0;
- desc.count = 1; /* give more than one skb per call */
-
- tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv);
- rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp,
- desc.error);
-
- return desc.error;
-}
-
-/*
- * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from
- * data_ready.
- *
- * if we fail to allocate we're in trouble.. blindly wait some time before
- * trying again to see if the VM can free up something for us.
- */
-int rds_tcp_recv(struct rds_connection *conn)
-{
- struct rds_tcp_connection *tc = conn->c_transport_data;
- struct socket *sock = tc->t_sock;
- int ret = 0;
-
- rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock);
-
- lock_sock(sock->sk);
- ret = rds_tcp_read_sock(conn, GFP_KERNEL);
- release_sock(sock->sk);
-
- return ret;
-}
-
-void rds_tcp_data_ready(struct sock *sk)
-{
- void (*ready)(struct sock *sk);
- struct rds_connection *conn;
- struct rds_tcp_connection *tc;
-
- rdsdebug("data ready sk %p\n", sk);
-
- read_lock(&sk->sk_callback_lock);
- conn = sk->sk_user_data;
- if (!conn) { /* check for teardown race */
- ready = sk->sk_data_ready;
- goto out;
- }
-
- tc = conn->c_transport_data;
- ready = tc->t_orig_data_ready;
- rds_tcp_stats_inc(s_tcp_data_ready_calls);
-
- if (rds_tcp_read_sock(conn, GFP_ATOMIC) == -ENOMEM)
- queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
-out:
- read_unlock(&sk->sk_callback_lock);
- ready(sk);
-}
-
-int rds_tcp_recv_init(void)
-{
- rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
- sizeof(struct rds_tcp_incoming),
- 0, 0, NULL);
- if (!rds_tcp_incoming_slab)
- return -ENOMEM;
- return 0;
-}
-
-void rds_tcp_recv_exit(void)
-{
- kmem_cache_destroy(rds_tcp_incoming_slab);
-}
+++ /dev/null
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/in.h>
-#include <net/tcp.h>
-
-#include "rds.h"
-#include "tcp.h"
-
-static void rds_tcp_cork(struct socket *sock, int val)
-{
- mm_segment_t oldfs;
-
- oldfs = get_fs();
- set_fs(KERNEL_DS);
- sock->ops->setsockopt(sock, SOL_TCP, TCP_CORK, (char __user *)&val,
- sizeof(val));
- set_fs(oldfs);
-}
-
-void rds_tcp_xmit_prepare(struct rds_connection *conn)
-{
- struct rds_tcp_connection *tc = conn->c_transport_data;
-
- rds_tcp_cork(tc->t_sock, 1);
-}
-
-void rds_tcp_xmit_complete(struct rds_connection *conn)
-{
- struct rds_tcp_connection *tc = conn->c_transport_data;
-
- rds_tcp_cork(tc->t_sock, 0);
-}
-
-/* the core send_sem serializes this with other xmit and shutdown */
-static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
-{
- struct kvec vec = {
- .iov_base = data,
- .iov_len = len,
- };
- struct msghdr msg = {
- .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL,
- };
-
- return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len);
-}
-
-/* the core send_sem serializes this with other xmit and shutdown */
-int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
- unsigned int hdr_off, unsigned int sg, unsigned int off)
-{
- struct rds_tcp_connection *tc = conn->c_transport_data;
- int done = 0;
- int ret = 0;
-
- if (hdr_off == 0) {
- /*
- * m_ack_seq is set to the sequence number of the last byte of
- * header and data. see rds_tcp_is_acked().
- */
- tc->t_last_sent_nxt = rds_tcp_snd_nxt(tc);
- rm->m_ack_seq = tc->t_last_sent_nxt +
- sizeof(struct rds_header) +
- be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1;
- smp_mb__before_atomic();
- set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags);
- tc->t_last_expected_una = rm->m_ack_seq + 1;
-
- rdsdebug("rm %p tcp nxt %u ack_seq %llu\n",
- rm, rds_tcp_snd_nxt(tc),
- (unsigned long long)rm->m_ack_seq);
- }
-
- if (hdr_off < sizeof(struct rds_header)) {
- /* see rds_tcp_write_space() */
- set_bit(SOCK_NOSPACE, &tc->t_sock->sk->sk_socket->flags);
-
- ret = rds_tcp_sendmsg(tc->t_sock,
- (void *)&rm->m_inc.i_hdr + hdr_off,
- sizeof(rm->m_inc.i_hdr) - hdr_off);
- if (ret < 0)
- goto out;
- done += ret;
- if (hdr_off + done != sizeof(struct rds_header))
- goto out;
- }
-
- while (sg < rm->data.op_nents) {
- ret = tc->t_sock->ops->sendpage(tc->t_sock,
- sg_page(&rm->data.op_sg[sg]),
- rm->data.op_sg[sg].offset + off,
- rm->data.op_sg[sg].length - off,
- MSG_DONTWAIT|MSG_NOSIGNAL);
- rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]),
- rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off,
- ret);
- if (ret <= 0)
- break;
-
- off += ret;
- done += ret;
- if (off == rm->data.op_sg[sg].length) {
- off = 0;
- sg++;
- }
- }
-
-out:
- if (ret <= 0) {
- /* write_space will hit after EAGAIN, all else fatal */
- if (ret == -EAGAIN) {
- rds_tcp_stats_inc(s_tcp_sndbuf_full);
- ret = 0;
- } else {
- printk(KERN_WARNING "RDS/tcp: send to %pI4 "
- "returned %d, disconnecting and reconnecting\n",
- &conn->c_faddr, ret);
- rds_conn_drop(conn);
- }
- }
- if (done == 0)
- done = ret;
- return done;
-}
-
-/*
- * rm->m_ack_seq is set to the tcp sequence number that corresponds to the
- * last byte of the message, including the header. This means that the
- * entire message has been received if rm->m_ack_seq is "before" the next
- * unacked byte of the TCP sequence space. We have to do very careful
- * wrapping 32bit comparisons here.
- */
-static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack)
-{
- if (!test_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags))
- return 0;
- return (__s32)((u32)rm->m_ack_seq - (u32)ack) < 0;
-}
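/*
 * Worked example: if rds_tcp_snd_nxt() was 1000 when the header went out
 * and the payload is 100 bytes, m_ack_seq becomes
 * 1000 + sizeof(struct rds_header) + 100 - 1, the TCP sequence number of
 * the message's last byte; the message counts as acked once snd_una has
 * moved past it.  The signed cast above keeps that test correct across
 * sequence wrap: with m_ack_seq == 0xfffffff0 and ack == 0x10 the u32
 * difference is 0xffffffe0, which is negative as an __s32, so the message
 * is still treated as acked.
 */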
-
-void rds_tcp_write_space(struct sock *sk)
-{
- void (*write_space)(struct sock *sk);
- struct rds_connection *conn;
- struct rds_tcp_connection *tc;
-
- read_lock(&sk->sk_callback_lock);
- conn = sk->sk_user_data;
- if (!conn) {
- write_space = sk->sk_write_space;
- goto out;
- }
-
- tc = conn->c_transport_data;
- rdsdebug("write_space for tc %p\n", tc);
- write_space = tc->t_orig_write_space;
- rds_tcp_stats_inc(s_tcp_write_space_calls);
-
- rdsdebug("tcp una %u\n", rds_tcp_snd_una(tc));
- tc->t_last_seen_una = rds_tcp_snd_una(tc);
- rds_send_drop_acked(conn, rds_tcp_snd_una(tc), rds_tcp_is_acked);
-
- if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf)
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
-
-out:
- read_unlock(&sk->sk_callback_lock);
-
- /*
- * write_space is only called when data leaves tcp's send queue if
- * SOCK_NOSPACE is set. We set SOCK_NOSPACE every time we put
- * data in tcp's send queue because we use write_space to parse the
- * sequence numbers and notice that rds messages have been fully
- * received.
- *
- * tcp's write_space clears SOCK_NOSPACE if the send queue has more
- * than a certain amount of space. So we need to set it again *after*
- * we call tcp's write_space or else we might only get called on the
- * first of a series of incoming tcp acks.
- */
- write_space(sk);
-
- if (sk->sk_socket)
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-}
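/*
 * The shifted comparison above is essentially the sock_writeable() test:
 * the send worker is only poked once no more than half of sk_sndbuf is
 * still in flight, rather than on every incoming ack.
 */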
+++ /dev/null
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/percpu.h>
-#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
-
-#include "rds.h"
-#include "tcp.h"
-
-DEFINE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats)
- ____cacheline_aligned;
-
-static const char * const rds_tcp_stat_names[] = {
- "tcp_data_ready_calls",
- "tcp_write_space_calls",
- "tcp_sndbuf_full",
- "tcp_connect_raced",
- "tcp_listen_closed_stale",
-};
-
-unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter,
- unsigned int avail)
-{
- struct rds_tcp_statistics stats = {0, };
- uint64_t *src;
- uint64_t *sum;
- size_t i;
- int cpu;
-
- if (avail < ARRAY_SIZE(rds_tcp_stat_names))
- goto out;
-
- for_each_online_cpu(cpu) {
- src = (uint64_t *)&(per_cpu(rds_tcp_stats, cpu));
- sum = (uint64_t *)&stats;
- for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
- *(sum++) += *(src++);
- }
-
- rds_stats_info_copy(iter, (uint64_t *)&stats, rds_tcp_stat_names,
- ARRAY_SIZE(rds_tcp_stat_names));
-out:
- return ARRAY_SIZE(rds_tcp_stat_names);
-}
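/*
 * The per-cpu summation treats struct rds_tcp_statistics as a flat array
 * of uint64_t, which only works because the struct holds nothing but
 * 64-bit counters in the same order as rds_tcp_stat_names; the
 * rds_stats_info_copy() call above relies on that same pairing of a u64
 * array with the name array.
 */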
*/
#include <linux/kernel.h>
#include <linux/random.h>
-#include <linux/export.h>
#include "rds.h"
*
* Transition to state DISCONNECTING/DOWN:
* - Inside the shutdown worker; synchronizes with xmit path
- * through RDS_IN_XMIT, and with connection management callbacks
+ * through c_send_lock, and with connection management callbacks
* via c_cm_lock.
*
* For receive callbacks, we rely on the underlying transport
* (TCP, IB/RDMA) to provide the necessary synchronisation.
*/
struct workqueue_struct *rds_wq;
-EXPORT_SYMBOL_GPL(rds_wq);
void rds_connect_complete(struct rds_connection *conn)
{
"current state is %d\n",
__func__,
atomic_read(&conn->c_state));
- rds_conn_drop(conn);
+ atomic_set(&conn->c_state, RDS_CONN_ERROR);
+ queue_work(rds_wq, &conn->c_down_w);
return;
}
queue_delayed_work(rds_wq, &conn->c_send_w, 0);
queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
}
-EXPORT_SYMBOL_GPL(rds_connect_complete);
/*
* This random exponential backoff is relied on to eventually resolve racing
* We should *always* start with a random backoff; otherwise a broken connection
* will always take several iterations to be re-established.
*/
-void rds_queue_reconnect(struct rds_connection *conn)
+static void rds_queue_reconnect(struct rds_connection *conn)
{
unsigned long rand;
}
}
+void rds_shutdown_worker(struct work_struct *work)
+{
+ struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
+
+ /* shut it down unless it's down already */
+ if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
+ /*
+ * Quiesce the connection mgmt handlers before we start tearing
+ * things down. We don't hold the mutex for the entire
+ * duration of the shutdown operation, otherwise we could
+ * deadlock with the CM handler. Instead, the CM event
+ * handler is expected to check for state DISCONNECTING.
+ */
+ mutex_lock(&conn->c_cm_lock);
+ if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
+ && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
+ rds_conn_error(conn, "shutdown called in state %d\n",
+ atomic_read(&conn->c_state));
+ mutex_unlock(&conn->c_cm_lock);
+ return;
+ }
+ mutex_unlock(&conn->c_cm_lock);
+
+ mutex_lock(&conn->c_send_lock);
+ conn->c_trans->conn_shutdown(conn);
+ rds_conn_reset(conn);
+ mutex_unlock(&conn->c_send_lock);
+
+ if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
+ /* This can happen - e.g. when we're in the middle of tearing
+ * down the connection and someone unloads the RDS module.
+ * Quite reproducible with loopback connections.
+ * Mostly harmless.
+ */
+ rds_conn_error(conn,
+ "%s: failed to transition to state DOWN, "
+ "current state is %d\n",
+ __func__,
+ atomic_read(&conn->c_state));
+ return;
+ }
+ }
+
+ /* Then reconnect if it's still live.
+ * The passive side of an IB loopback connection is never added
+ * to the conn hash, so we never trigger a reconnect on this
+ * conn - the reconnect is always triggered by the active peer. */
+ cancel_delayed_work(&conn->c_conn_w);
+ if (!hlist_unhashed(&conn->c_hash_node))
+ rds_queue_reconnect(conn);
+}
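/*
 * Rough state flow driven by this worker: a live connection (UP, or
 * flagged ERROR by a failed connect or a drop) is moved to DISCONNECTING
 * under c_cm_lock, the transport's conn_shutdown() and rds_conn_reset()
 * run under c_send_lock, the connection is parked in DOWN, and finally
 * rds_queue_reconnect() may schedule a fresh connect attempt if the conn
 * is still hashed.
 */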
+
void rds_send_worker(struct work_struct *work)
{
struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work);
}
}
-void rds_shutdown_worker(struct work_struct *work)
-{
- struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
-
- rds_conn_shutdown(conn);
-}
-
void rds_threads_exit(void)
{
destroy_workqueue(rds_wq);
}
-int rds_threads_init(void)
+int __init rds_threads_init(void)
{
rds_wq = create_singlethread_workqueue("krdsd");
- if (!rds_wq)
+ if (rds_wq == NULL)
return -ENOMEM;
return 0;
#include "rds.h"
#include "loop.h"
-static struct rds_transport *transports[RDS_TRANS_COUNT];
+static LIST_HEAD(rds_transports);
static DECLARE_RWSEM(rds_trans_sem);
int rds_trans_register(struct rds_transport *trans)
down_write(&rds_trans_sem);
- if (transports[trans->t_type])
- printk(KERN_ERR "RDS Transport type %d already registered\n",
- trans->t_type);
- else {
- transports[trans->t_type] = trans;
- printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name);
- }
+ list_add_tail(&trans->t_item, &rds_transports);
+ printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name);
up_write(&rds_trans_sem);
return 0;
}
-EXPORT_SYMBOL_GPL(rds_trans_register);
void rds_trans_unregister(struct rds_transport *trans)
{
down_write(&rds_trans_sem);
- transports[trans->t_type] = NULL;
+ list_del_init(&trans->t_item);
printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name);
up_write(&rds_trans_sem);
}
-EXPORT_SYMBOL_GPL(rds_trans_unregister);
-
-void rds_trans_put(struct rds_transport *trans)
-{
- if (trans && trans->t_owner)
- module_put(trans->t_owner);
-}
struct rds_transport *rds_trans_get_preferred(__be32 addr)
{
- struct rds_transport *ret = NULL;
struct rds_transport *trans;
- unsigned int i;
+ struct rds_transport *ret = NULL;
if (IN_LOOPBACK(ntohl(addr)))
return &rds_loop_transport;
down_read(&rds_trans_sem);
- for (i = 0; i < RDS_TRANS_COUNT; i++) {
- trans = transports[i];
-
- if (trans && (trans->laddr_check(addr) == 0) &&
- (!trans->t_owner || try_module_get(trans->t_owner))) {
+ list_for_each_entry(trans, &rds_transports, t_item) {
+ if (trans->laddr_check(addr) == 0) {
ret = trans;
break;
}
struct rds_transport *trans;
unsigned int total = 0;
unsigned int part;
- int i;
rds_info_iter_unmap(iter);
down_read(&rds_trans_sem);
- for (i = 0; i < RDS_TRANS_COUNT; i++)
- {
- trans = transports[i];
- if (!trans || !trans->stats_info_copy)
+ list_for_each_entry(trans, &rds_transports, t_item) {
+ if (trans->stats_info_copy == NULL)
continue;
part = trans->stats_info_copy(iter, avail);