#include <linux/types.h>
+/* These sparse annotated types shouldn't be in any user
+ * visible header file. We should clean this up rather
+ * than kludging around them. */
+#ifndef __KERNEL__
+#define __be16 u_int16_t
+#define __be32 u_int32_t
+#define __be64 u_int64_t
+#endif
+
#define RDS_IB_ABI_VERSION 0x301
/*
/* deprecated: RDS_BARRIER 4 */
#define RDS_RECVERR 5
#define RDS_CONG_MONITOR 6
-#define RDS_GET_MR_FOR_DEST 7
/*
* Control message types for SOL_RDS.
#define RDS_CMSG_RDMA_MAP 3
#define RDS_CMSG_RDMA_STATUS 4
#define RDS_CMSG_CONG_UPDATE 5
-#define RDS_CMSG_ATOMIC_FADD 6
-#define RDS_CMSG_ATOMIC_CSWP 7
-#define RDS_CMSG_MASKED_ATOMIC_FADD 8
-#define RDS_CMSG_MASKED_ATOMIC_CSWP 9
#define RDS_INFO_FIRST 10000
#define RDS_INFO_COUNTERS 10000
#define RDS_INFO_LAST 10010
struct rds_info_counter {
- uint8_t name[32];
- uint64_t value;
+ u_int8_t name[32];
+ u_int64_t value;
} __attribute__((packed));
#define RDS_INFO_CONNECTION_FLAG_SENDING 0x01
#define TRANSNAMSIZ 16
struct rds_info_connection {
- uint64_t next_tx_seq;
- uint64_t next_rx_seq;
+ u_int64_t next_tx_seq;
+ u_int64_t next_rx_seq;
__be32 laddr;
__be32 faddr;
- uint8_t transport[TRANSNAMSIZ]; /* null term ascii */
- uint8_t flags;
+ u_int8_t transport[TRANSNAMSIZ]; /* null term ascii */
+ u_int8_t flags;
+} __attribute__((packed));
+
+struct rds_info_flow {
+ __be32 laddr;
+ __be32 faddr;
+ u_int32_t bytes;
+ __be16 lport;
+ __be16 fport;
} __attribute__((packed));
#define RDS_INFO_MESSAGE_FLAG_ACK 0x01
#define RDS_INFO_MESSAGE_FLAG_FAST_ACK 0x02
struct rds_info_message {
- uint64_t seq;
- uint32_t len;
+ u_int64_t seq;
+ u_int32_t len;
__be32 laddr;
__be32 faddr;
__be16 lport;
__be16 fport;
- uint8_t flags;
+ u_int8_t flags;
} __attribute__((packed));
struct rds_info_socket {
- uint32_t sndbuf;
+ u_int32_t sndbuf;
__be32 bound_addr;
__be32 connected_addr;
__be16 bound_port;
__be16 connected_port;
- uint32_t rcvbuf;
- uint64_t inum;
-} __attribute__((packed));
-
-struct rds_info_tcp_socket {
- __be32 local_addr;
- __be16 local_port;
- __be32 peer_addr;
- __be16 peer_port;
- uint64_t hdr_rem;
- uint64_t data_rem;
- uint32_t last_sent_nxt;
- uint32_t last_expected_una;
- uint32_t last_seen_una;
+ u_int32_t rcvbuf;
+ u_int64_t inum;
} __attribute__((packed));
#define RDS_IB_GID_LEN 16
* (so that the application does not have to worry about
* alignment).
*/
-typedef uint64_t rds_rdma_cookie_t;
+typedef u_int64_t rds_rdma_cookie_t;
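/*
 * For illustration only (not part of this patch): the kernel side, in
 * net/rds/rdma.h, packs the R_Key into the low 32 bits of the cookie and
 * the byte offset into the high 32 bits, roughly along these lines:
 */
static inline rds_rdma_cookie_t rds_rdma_make_cookie(u_int32_t r_key,
						     u_int32_t offset)
{
	/* low half: R_Key from the HCA; high half: offset into the MR */
	return r_key | (u_int64_t) offset << 32;
}

static inline u_int32_t rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
{
	return (u_int32_t) cookie;
}

static inline u_int32_t rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
{
	return (u_int32_t) (cookie >> 32);
}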
struct rds_iovec {
- uint64_t addr;
- uint64_t bytes;
+ u_int64_t addr;
+ u_int64_t bytes;
};
struct rds_get_mr_args {
struct rds_iovec vec;
- uint64_t cookie_addr;
+ u_int64_t cookie_addr;
uint64_t flags;
};
-struct rds_get_mr_for_dest_args {
- struct sockaddr_storage dest_addr;
- struct rds_iovec vec;
- uint64_t cookie_addr;
- uint64_t flags;
-};
-
struct rds_free_mr_args {
rds_rdma_cookie_t cookie;
- uint64_t flags;
+ u_int64_t flags;
};
struct rds_rdma_args {
rds_rdma_cookie_t cookie;
struct rds_iovec remote_vec;
- uint64_t local_vec_addr;
- uint64_t nr_local;
- uint64_t flags;
- uint64_t user_token;
-};
-
-struct rds_atomic_args {
- rds_rdma_cookie_t cookie;
- uint64_t local_addr;
- uint64_t remote_addr;
- union {
- struct {
- uint64_t compare;
- uint64_t swap;
- } cswp;
- struct {
- uint64_t add;
- } fadd;
- struct {
- uint64_t compare;
- uint64_t swap;
- uint64_t compare_mask;
- uint64_t swap_mask;
- } m_cswp;
- struct {
- uint64_t add;
- uint64_t nocarry_mask;
- } m_fadd;
- };
- uint64_t flags;
- uint64_t user_token;
+ u_int64_t local_vec_addr;
+ u_int64_t nr_local;
+ u_int64_t flags;
+ u_int64_t user_token;
};
struct rds_rdma_notify {
- uint64_t user_token;
+ u_int64_t user_token;
int32_t status;
};
#define RDS_RDMA_USE_ONCE 0x0008 /* free MR after use */
#define RDS_RDMA_DONTWAIT 0x0010 /* Don't wait in SET_BARRIER */
#define RDS_RDMA_NOTIFY_ME 0x0020 /* Notify when operation completes */
-#define RDS_RDMA_SILENT 0x0040 /* Do not interrupt remote */
#endif /* IB_RDS_H */
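/*
 * Usage sketch (illustration only, not part of this patch): registering a
 * buffer for RDMA from user space with the definitions above.  PF_RDS,
 * SOL_RDS and RDS_GET_MR come from elided parts of this header; "buf" and
 * "len" stand for an application buffer.
 *
 *	int fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
 *	// bind(fd, ...) to a local RDS address/port first
 *
 *	rds_rdma_cookie_t cookie = 0;
 *	struct rds_get_mr_args args = {
 *		.vec         = { .addr  = (u_int64_t) (unsigned long) buf,
 *				 .bytes = len },
 *		.cookie_addr = (u_int64_t) (unsigned long) &cookie,
 *		.flags       = RDS_RDMA_USE_ONCE,
 *	};
 *	if (setsockopt(fd, SOL_RDS, RDS_GET_MR, &args, sizeof(args)) < 0)
 *		perror("RDS_GET_MR");
 *	// "cookie" now identifies the MR; it is sent to the peer, which
 *	// uses it as rds_rdma_args.cookie for its RDMA operation.
 */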
config RDS
- tristate "The RDS Protocol"
- depends on INET
+ tristate "Reliable Datagram Sockets (RDS) (EXPERIMENTAL)"
+ depends on INET && INFINIBAND_IPOIB && EXPERIMENTAL
+ depends on INFINIBAND && INFINIBAND_ADDR_TRANS
---help---
- The RDS (Reliable Datagram Sockets) protocol provides reliable,
- sequenced delivery of datagrams over Infiniband, iWARP,
- or TCP.
-
-config RDS_RDMA
- tristate "RDS over Infiniband and iWARP"
- depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS
- ---help---
- Allow RDS to use Infiniband and iWARP as a transport.
- This transport supports RDMA operations.
-
-config RDS_TCP
- tristate "RDS over TCP"
- depends on RDS
- ---help---
- Allow RDS to use TCP as a transport.
- This transport does not support RDMA operations.
+ RDS provides reliable, sequenced delivery of datagrams
+ over Infiniband.
config RDS_DEBUG
- bool "RDS debugging messages"
+ bool "Debugging messages"
depends on RDS
default n
obj-$(CONFIG_RDS) += rds.o
rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \
recv.o send.o stats.o sysctl.o threads.o transport.o \
- loop.o page.o rdma.o
-
-obj-$(CONFIG_RDS_RDMA) += rds_rdma.o
-rds_rdma-y := rdma_transport.o \
+ loop.o page.o rdma.o \
+ rdma_transport.o \
ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
ib_sysctl.o ib_rdma.o \
iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \
iw_sysctl.o iw_rdma.o
-
-obj-$(CONFIG_RDS_TCP) += rds_tcp.o
-rds_tcp-y := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \
- tcp_send.o tcp_stats.o
-
-ccflags-$(CONFIG_RDS_DEBUG) := -DDEBUG
+ifeq ($(CONFIG_RDS_DEBUG), y)
+EXTRA_CFLAGS += -DDEBUG
+endif
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/kernel.h>
-#include <linux/gfp.h>
#include <linux/in.h>
#include <linux/poll.h>
+#include <linux/version.h>
#include <net/sock.h>
#include "rds.h"
-
-char *rds_str_array(char **array, size_t elements, size_t index)
-{
- if ((index < elements) && array[index])
- return array[index];
- else
- return "unknown";
-}
-EXPORT_SYMBOL(rds_str_array);
+#include "rdma.h"
+#include "rdma_transport.h"
/* this is just used for stats gathering :/ */
static DEFINE_SPINLOCK(rds_sock_lock);
{
struct sock *sk = sock->sk;
struct rds_sock *rs;
+ unsigned long flags;
- if (!sk)
+ if (sk == NULL)
goto out;
rs = rds_sk_to_rs(sk);
* with the socket. */
rds_clear_recv_queue(rs);
rds_cong_remove_socket(rs);
-
- /*
- * the binding lookup hash uses rcu, we need to
- * make sure we synchronize_rcu before we free our
- * entry
- */
rds_remove_bound(rs);
- synchronize_rcu();
-
rds_send_drop_to(rs, NULL);
rds_rdma_drop_keys(rs);
rds_notify_queue_get(rs, NULL);
- spin_lock_bh(&rds_sock_lock);
+ spin_lock_irqsave(&rds_sock_lock, flags);
list_del_init(&rs->rs_item);
rds_sock_count--;
- spin_unlock_bh(&rds_sock_lock);
-
- rds_trans_put(rs->rs_transport);
+ spin_unlock_irqrestore(&rds_sock_lock, flags);
sock->sk = NULL;
sock_put(sk);
unsigned int mask = 0;
unsigned long flags;
- poll_wait(file, sk_sleep(sk), wait);
+ poll_wait(file, sk->sk_sleep, wait);
- if (rs->rs_seen_congestion)
- poll_wait(file, &rds_poll_waitq, wait);
+ poll_wait(file, &rds_poll_waitq, wait);
read_lock_irqsave(&rs->rs_recv_lock, flags);
if (!rs->rs_cong_monitor) {
mask |= (POLLIN | POLLRDNORM);
spin_unlock(&rs->rs_lock);
}
- if (!list_empty(&rs->rs_recv_queue) ||
- !list_empty(&rs->rs_notify_queue))
+ if (!list_empty(&rs->rs_recv_queue)
+ || !list_empty(&rs->rs_notify_queue))
mask |= (POLLIN | POLLRDNORM);
if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
mask |= (POLLOUT | POLLWRNORM);
read_unlock_irqrestore(&rs->rs_recv_lock, flags);
- /* clear state any time we wake a seen-congested socket */
- if (mask)
- rs->rs_seen_congestion = 0;
-
return mask;
}
}
static int rds_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+ char __user *optval, int optlen)
{
struct rds_sock *rs = rds_sk_to_rs(sock->sk);
int ret;
case RDS_GET_MR:
ret = rds_get_mr(rs, optval, optlen);
break;
- case RDS_GET_MR_FOR_DEST:
- ret = rds_get_mr_for_dest(rs, optval, optlen);
- break;
case RDS_FREE_MR:
ret = rds_free_mr(rs, optval, optlen);
break;
if (len < sizeof(int))
ret = -EINVAL;
else
- if (put_user(rs->rs_recverr, (int __user *) optval) ||
- put_user(sizeof(int), optlen))
+ if (put_user(rs->rs_recverr, (int __user *) optval)
+ || put_user(sizeof(int), optlen))
ret = -EFAULT;
else
ret = 0;
.obj_size = sizeof(struct rds_sock),
};
-static const struct proto_ops rds_proto_ops = {
+static struct proto_ops rds_proto_ops = {
.family = AF_RDS,
.owner = THIS_MODULE,
.release = rds_release,
static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
{
+ unsigned long flags;
struct rds_sock *rs;
sock_init_data(sock, sk);
spin_lock_init(&rs->rs_rdma_lock);
rs->rs_rdma_keys = RB_ROOT;
- spin_lock_bh(&rds_sock_lock);
+ spin_lock_irqsave(&rds_sock_lock, flags);
list_add_tail(&rs->rs_item, &rds_sock_list);
rds_sock_count++;
- spin_unlock_bh(&rds_sock_lock);
+ spin_unlock_irqrestore(&rds_sock_lock, flags);
return 0;
}
-static int rds_create(struct net *net, struct socket *sock, int protocol,
- int kern)
+static int rds_create(struct net *net, struct socket *sock, int protocol)
{
struct sock *sk;
sock_put(rds_rs_to_sk(rs));
}
-static const struct net_proto_family rds_family_ops = {
+static struct net_proto_family rds_family_ops = {
.family = AF_RDS,
.create = rds_create,
.owner = THIS_MODULE,
struct rds_info_lengths *lens)
{
struct rds_sock *rs;
+ struct sock *sk;
struct rds_incoming *inc;
+ unsigned long flags;
unsigned int total = 0;
len /= sizeof(struct rds_info_message);
- spin_lock_bh(&rds_sock_lock);
+ spin_lock_irqsave(&rds_sock_lock, flags);
list_for_each_entry(rs, &rds_sock_list, rs_item) {
+ sk = rds_rs_to_sk(rs);
read_lock(&rs->rs_recv_lock);
/* XXX too lazy to maintain counts.. */
read_unlock(&rs->rs_recv_lock);
}
- spin_unlock_bh(&rds_sock_lock);
+ spin_unlock_irqrestore(&rds_sock_lock, flags);
lens->nr = total;
lens->each = sizeof(struct rds_info_message);
{
struct rds_info_socket sinfo;
struct rds_sock *rs;
+ unsigned long flags;
len /= sizeof(struct rds_info_socket);
- spin_lock_bh(&rds_sock_lock);
+ spin_lock_irqsave(&rds_sock_lock, flags);
if (len < rds_sock_count)
goto out;
lens->nr = rds_sock_count;
lens->each = sizeof(struct rds_info_socket);
- spin_unlock_bh(&rds_sock_lock);
+ spin_unlock_irqrestore(&rds_sock_lock, flags);
}
-static void rds_exit(void)
+static void __exit rds_exit(void)
{
+ rds_rdma_exit();
sock_unregister(rds_family_ops.family);
proto_unregister(&rds_proto);
rds_conn_exit();
}
module_exit(rds_exit);
-static int rds_init(void)
+static int __init rds_init(void)
{
int ret;
rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
+ /* ib/iwarp transports currently compiled-in */
+ ret = rds_rdma_init();
+ if (ret)
+ goto out_sock;
goto out;
+out_sock:
+ sock_unregister(rds_family_ops.family);
out_proto:
proto_unregister(&rds_proto);
out_stats:
#include <net/sock.h>
#include <linux/in.h>
#include <linux/if_arp.h>
-#include <linux/jhash.h>
-#include <linux/ratelimit.h>
#include "rds.h"
-#define BIND_HASH_SIZE 1024
-static struct hlist_head bind_hash_table[BIND_HASH_SIZE];
+/*
+ * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't
+ * particularly zippy.
+ *
+ * This is now called for every incoming frame so we arguably care much more
+ * about it than we used to.
+ */
static DEFINE_SPINLOCK(rds_bind_lock);
+static struct rb_root rds_bind_tree = RB_ROOT;
-static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port)
-{
- return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) &
- (BIND_HASH_SIZE - 1));
-}
-
-static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port,
- struct rds_sock *insert)
+static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
+ struct rds_sock *insert)
{
+ struct rb_node **p = &rds_bind_tree.rb_node;
+ struct rb_node *parent = NULL;
struct rds_sock *rs;
- struct hlist_head *head = hash_to_bucket(addr, port);
u64 cmp;
u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);
- rcu_read_lock();
- hlist_for_each_entry_rcu(rs, head, rs_bound_node) {
+ while (*p) {
+ parent = *p;
+ rs = rb_entry(parent, struct rds_sock, rs_bound_node);
+
cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
be16_to_cpu(rs->rs_bound_port);
- if (cmp == needle) {
- rcu_read_unlock();
+ if (needle < cmp)
+ p = &(*p)->rb_left;
+ else if (needle > cmp)
+ p = &(*p)->rb_right;
+ else
return rs;
- }
}
- rcu_read_unlock();
if (insert) {
- /*
- * make sure our addr and port are set before
- * we are added to the list; other RCU
- * readers will find us as soon as the
- * hlist_add_head_rcu is done
- */
- insert->rs_bound_addr = addr;
- insert->rs_bound_port = port;
- rds_sock_addref(insert);
-
- hlist_add_head_rcu(&insert->rs_bound_node, head);
+ rb_link_node(&insert->rs_bound_node, parent, p);
+ rb_insert_color(&insert->rs_bound_node, &rds_bind_tree);
}
return NULL;
}
struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
{
struct rds_sock *rs;
+ unsigned long flags;
- rs = rds_bind_lookup(addr, port, NULL);
-
+ spin_lock_irqsave(&rds_bind_lock, flags);
+ rs = rds_bind_tree_walk(addr, port, NULL);
if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
rds_sock_addref(rs);
else
rs = NULL;
+ spin_unlock_irqrestore(&rds_bind_lock, flags);
rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
ntohs(port));
rover = be16_to_cpu(*port);
last = rover;
} else {
- rover = max_t(u16, prandom_u32(), 2);
+ rover = max_t(u16, net_random(), 2);
last = rover - 1;
}
do {
if (rover == 0)
rover++;
- if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) {
- *port = rs->rs_bound_port;
+ if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) {
+ *port = cpu_to_be16(rover);
ret = 0;
- rdsdebug("rs %p binding to %pI4:%d\n",
- rs, &addr, (int)ntohs(*port));
break;
}
} while (rover++ != last);
+ if (ret == 0) {
+ rs->rs_bound_addr = addr;
+ rs->rs_bound_port = *port;
+ rds_sock_addref(rs);
+
+ rdsdebug("rs %p binding to %pI4:%d\n",
+ rs, &addr, (int)ntohs(*port));
+ }
+
spin_unlock_irqrestore(&rds_bind_lock, flags);
return ret;
rs, &rs->rs_bound_addr,
ntohs(rs->rs_bound_port));
- hlist_del_init_rcu(&rs->rs_bound_node);
+ rb_erase(&rs->rs_bound_node, &rds_bind_tree);
rds_sock_put(rs);
rs->rs_bound_addr = 0;
}
goto out;
trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
- if (!trans) {
+ if (trans == NULL) {
ret = -EADDRNOTAVAIL;
rds_remove_bound(rs);
- printk_ratelimited(KERN_INFO "RDS: rds_bind() could not find a transport, "
- "load rds_tcp or rds_rdma?\n");
goto out;
}
out:
release_sock(sk);
-
- /* we might have called rds_remove_bound on error */
- if (ret)
- synchronize_rcu();
return ret;
}
* SOFTWARE.
*
*/
-#include <linux/slab.h>
#include <linux/types.h>
#include <linux/rbtree.h>
-#include <linux/bitops.h>
-#include <linux/export.h>
+
+#include <asm-generic/bitops/le.h>
#include "rds.h"
unsigned long flags;
map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
- if (!map)
+ if (map == NULL)
return NULL;
map->m_addr = addr;
ret = rds_cong_tree_walk(addr, map);
spin_unlock_irqrestore(&rds_cong_lock, flags);
- if (!ret) {
+ if (ret == NULL) {
ret = map;
map = NULL;
}
conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
- if (!(conn->c_lcong && conn->c_fcong))
+ if (conn->c_lcong == NULL || conn->c_fcong == NULL)
return -ENOMEM;
return 0;
list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
if (!test_and_set_bit(0, &conn->c_map_queued)) {
rds_stats_inc(s_cong_update_queued);
- /* We cannot inline the call to rds_send_xmit() here
- * for two reasons (both pertaining to a TCP transport):
- * 1. When we get here from the receive path, we
- * are already holding the sock_lock (held by
- * tcp_v4_rcv()). So inlining calls to
- * tcp_setsockopt and/or tcp_sendmsg will deadlock
- * when it tries to get the sock_lock().
- * 2. Interrupts are masked so that we can mark the
- * port congested from both send and recv paths.
- * (See comment around declaration of rds_cong_lock).
- * An attempt to get the sock_lock() here will
- * therefore trigger warnings.
- * Defer the xmit to rds_send_worker() instead.
- */
queue_delayed_work(rds_wq, &conn->c_send_w, 0);
}
}
read_unlock_irqrestore(&rds_cong_monitor_lock, flags);
}
}
-EXPORT_SYMBOL_GPL(rds_cong_map_updated);
int rds_cong_updated_since(unsigned long *recent)
{
i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
- __set_bit_le(off, (void *)map->m_page_addrs[i]);
+ generic___set_le_bit(off, (void *)map->m_page_addrs[i]);
}
void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
- __clear_bit_le(off, (void *)map->m_page_addrs[i]);
+ generic___clear_le_bit(off, (void *)map->m_page_addrs[i]);
}
static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
- return test_bit_le(off, (void *)map->m_page_addrs[i]);
+ return generic_test_le_bit(off, (void *)map->m_page_addrs[i]);
}
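/*
 * Worked example of the port -> (page, bit) mapping used above, assuming
 * 4K pages so that RDS_CONG_MAP_PAGE_BITS == 32768 (illustration only,
 * not part of this patch):
 *
 *	port 40000 -> i = 40000 / 32768 = 1, off = 40000 % 32768 = 7232
 *	port     7 -> i = 0, off = 7
 *
 * so the full 65536-port congestion map spans two pages per map on such
 * a configuration.
 */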
void rds_cong_add_socket(struct rds_sock *rs)
*/
#include <linux/kernel.h>
#include <linux/list.h>
-#include <linux/slab.h>
-#include <linux/export.h>
#include <net/inet_hashtables.h>
#include "rds.h"
#include "loop.h"
+#include "rdma.h"
#define RDS_CONNECTION_HASH_BITS 12
#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
{
- static u32 rds_hash_secret __read_mostly;
-
- unsigned long hash;
-
- net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret));
-
/* Pass NULL, don't need struct net for hash */
- hash = __inet_ehashfn(be32_to_cpu(laddr), 0,
- be32_to_cpu(faddr), 0,
- rds_hash_secret);
+ unsigned long hash = inet_ehashfn(NULL,
+ be32_to_cpu(laddr), 0,
+ be32_to_cpu(faddr), 0);
return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK];
}
var |= RDS_INFO_CONNECTION_FLAG_##suffix; \
} while (0)
-/* rcu read lock must be held or the connection spinlock */
+static inline int rds_conn_is_sending(struct rds_connection *conn)
+{
+ int ret = 0;
+
+ if (!mutex_trylock(&conn->c_send_lock))
+ ret = 1;
+ else
+ mutex_unlock(&conn->c_send_lock);
+
+ return ret;
+}
+
static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
__be32 laddr, __be32 faddr,
struct rds_transport *trans)
{
struct rds_connection *conn, *ret = NULL;
+ struct hlist_node *pos;
- hlist_for_each_entry_rcu(conn, head, c_hash_node) {
+ hlist_for_each_entry(conn, pos, head, c_hash_node) {
if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
conn->c_trans == trans) {
ret = conn;
* and receiving over this connection again in the future. It is up to
* the transport to have serialized this call with its send and recv.
*/
-static void rds_conn_reset(struct rds_connection *conn)
+void rds_conn_reset(struct rds_connection *conn)
{
rdsdebug("connection %pI4 to %pI4 reset\n",
&conn->c_laddr, &conn->c_faddr);
struct rds_transport *trans, gfp_t gfp,
int is_outgoing)
{
- struct rds_connection *conn, *parent = NULL;
+ struct rds_connection *conn, *tmp, *parent = NULL;
struct hlist_head *head = rds_conn_bucket(laddr, faddr);
- struct rds_transport *loop_trans;
unsigned long flags;
int ret;
- struct rds_transport *otrans = trans;
- if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
- goto new_conn;
- rcu_read_lock();
+ spin_lock_irqsave(&rds_conn_lock, flags);
conn = rds_conn_lookup(head, laddr, faddr, trans);
- if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
- laddr == faddr && !is_outgoing) {
+ if (conn
+ && conn->c_loopback
+ && conn->c_trans != &rds_loop_transport
+ && !is_outgoing) {
/* This is a looped back IB connection, and we're
* called by the code handling the incoming connect.
* We need a second connection object into which we
parent = conn;
conn = parent->c_passive;
}
- rcu_read_unlock();
+ spin_unlock_irqrestore(&rds_conn_lock, flags);
if (conn)
goto out;
-new_conn:
- conn = kmem_cache_zalloc(rds_conn_slab, gfp);
- if (!conn) {
+ conn = kmem_cache_alloc(rds_conn_slab, gfp);
+ if (conn == NULL) {
conn = ERR_PTR(-ENOMEM);
goto out;
}
+ memset(conn, 0, sizeof(*conn));
+
INIT_HLIST_NODE(&conn->c_hash_node);
+ conn->c_version = RDS_PROTOCOL_3_0;
conn->c_laddr = laddr;
conn->c_faddr = faddr;
spin_lock_init(&conn->c_lock);
conn->c_next_tx_seq = 1;
- init_waitqueue_head(&conn->c_waitq);
+ mutex_init(&conn->c_send_lock);
INIT_LIST_HEAD(&conn->c_send_queue);
INIT_LIST_HEAD(&conn->c_retrans);
* can bind to the destination address then we'd rather the messages
* flow through loopback rather than either transport.
*/
- loop_trans = rds_trans_get_preferred(faddr);
- if (loop_trans) {
- rds_trans_put(loop_trans);
+ if (rds_trans_get_preferred(faddr)) {
conn->c_loopback = 1;
if (is_outgoing && trans->t_prefer_loopback) {
/* "outgoing" connection - and the transport
}
atomic_set(&conn->c_state, RDS_CONN_DOWN);
- conn->c_send_gen = 0;
conn->c_reconnect_jiffies = 0;
INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker);
INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker);
trans->t_name ? trans->t_name : "[unknown]",
is_outgoing ? "(outgoing)" : "");
- /*
- * Since we ran without holding the conn lock, someone could
- * have created the same conn (either normal or passive) in the
- * interim. We check while holding the lock. If we won, we complete
- * init and return our conn. If we lost, we rollback and return the
- * other one.
- */
spin_lock_irqsave(&rds_conn_lock, flags);
- if (parent) {
- /* Creating passive conn */
- if (parent->c_passive) {
- trans->conn_free(conn->c_transport_data);
- kmem_cache_free(rds_conn_slab, conn);
- conn = parent->c_passive;
- } else {
+ if (parent == NULL) {
+ tmp = rds_conn_lookup(head, laddr, faddr, trans);
+ if (tmp == NULL)
+ hlist_add_head(&conn->c_hash_node, head);
+ } else {
+ tmp = parent->c_passive;
+ if (!tmp)
parent->c_passive = conn;
- rds_cong_add_conn(conn);
- rds_conn_count++;
- }
+ }
+
+ if (tmp) {
+ trans->conn_free(conn->c_transport_data);
+ kmem_cache_free(rds_conn_slab, conn);
+ conn = tmp;
} else {
- /* Creating normal conn */
- struct rds_connection *found;
-
- if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
- found = NULL;
- else
- found = rds_conn_lookup(head, laddr, faddr, trans);
- if (found) {
- trans->conn_free(conn->c_transport_data);
- kmem_cache_free(rds_conn_slab, conn);
- conn = found;
- } else {
- if ((is_outgoing && otrans->t_type == RDS_TRANS_TCP) ||
- (otrans->t_type != RDS_TRANS_TCP)) {
- /* Only the active side should be added to
- * reconnect list for TCP.
- */
- hlist_add_head_rcu(&conn->c_hash_node, head);
- }
- rds_cong_add_conn(conn);
- rds_conn_count++;
- }
+ rds_cong_add_conn(conn);
+ rds_conn_count++;
}
+
spin_unlock_irqrestore(&rds_conn_lock, flags);
out:
{
return __rds_conn_create(laddr, faddr, trans, gfp, 0);
}
-EXPORT_SYMBOL_GPL(rds_conn_create);
struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp)
{
return __rds_conn_create(laddr, faddr, trans, gfp, 1);
}
-EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
-
-void rds_conn_shutdown(struct rds_connection *conn)
-{
- /* shut it down unless it's down already */
- if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
- /*
- * Quiesce the connection mgmt handlers before we start tearing
- * things down. We don't hold the mutex for the entire
- * duration of the shutdown operation, else we may
- * deadlock with the CM handler. Instead, the CM event
- * handler is supposed to check for state DISCONNECTING
- */
- mutex_lock(&conn->c_cm_lock);
- if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
- && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
- rds_conn_error(conn, "shutdown called in state %d\n",
- atomic_read(&conn->c_state));
- mutex_unlock(&conn->c_cm_lock);
- return;
- }
- mutex_unlock(&conn->c_cm_lock);
-
- wait_event(conn->c_waitq,
- !test_bit(RDS_IN_XMIT, &conn->c_flags));
-
- conn->c_trans->conn_shutdown(conn);
- rds_conn_reset(conn);
-
- if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
- /* This can happen - eg when we're in the middle of tearing
- * down the connection, and someone unloads the rds module.
- * Quite reproducible with loopback connections.
- * Mostly harmless.
- */
- rds_conn_error(conn,
- "%s: failed to transition to state DOWN, "
- "current state is %d\n",
- __func__,
- atomic_read(&conn->c_state));
- return;
- }
- }
-
- /* Then reconnect if it's still live.
- * The passive side of an IB loopback connection is never added
- * to the conn hash, so we never trigger a reconnect on this
- * conn - the reconnect is always triggered by the active peer. */
- cancel_delayed_work_sync(&conn->c_conn_w);
- rcu_read_lock();
- if (!hlist_unhashed(&conn->c_hash_node)) {
- rcu_read_unlock();
- rds_queue_reconnect(conn);
- } else {
- rcu_read_unlock();
- }
-}
-/*
- * Stop and free a connection.
- *
- * This can only be used in very limited circumstances. It assumes that once
- * the conn has been shut down, no one else is referencing the connection.
- * We can only ensure this in the rmmod path in the current code.
- */
void rds_conn_destroy(struct rds_connection *conn)
{
struct rds_message *rm, *rtmp;
- unsigned long flags;
rdsdebug("freeing conn %p for %pI4 -> "
"%pI4\n", conn, &conn->c_laddr,
&conn->c_faddr);
- /* Ensure conn will not be scheduled for reconnect */
- spin_lock_irq(&rds_conn_lock);
- hlist_del_init_rcu(&conn->c_hash_node);
- spin_unlock_irq(&rds_conn_lock);
- synchronize_rcu();
-
- /* shut the connection down */
- rds_conn_drop(conn);
- flush_work(&conn->c_down_w);
+ hlist_del_init(&conn->c_hash_node);
- /* make sure lingering queued work won't try to ref the conn */
- cancel_delayed_work_sync(&conn->c_send_w);
- cancel_delayed_work_sync(&conn->c_recv_w);
+ /* wait for the rds thread to shut it down */
+ atomic_set(&conn->c_state, RDS_CONN_ERROR);
+ cancel_delayed_work(&conn->c_conn_w);
+ queue_work(rds_wq, &conn->c_down_w);
+ flush_workqueue(rds_wq);
/* tear down queued messages */
list_for_each_entry_safe(rm, rtmp,
BUG_ON(!list_empty(&conn->c_retrans));
kmem_cache_free(rds_conn_slab, conn);
- spin_lock_irqsave(&rds_conn_lock, flags);
rds_conn_count--;
- spin_unlock_irqrestore(&rds_conn_lock, flags);
}
-EXPORT_SYMBOL_GPL(rds_conn_destroy);
static void rds_conn_message_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
int want_send)
{
struct hlist_head *head;
+ struct hlist_node *pos;
struct list_head *list;
struct rds_connection *conn;
struct rds_message *rm;
- unsigned int total = 0;
unsigned long flags;
+ unsigned int total = 0;
size_t i;
len /= sizeof(struct rds_info_message);
- rcu_read_lock();
+ spin_lock_irqsave(&rds_conn_lock, flags);
for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
i++, head++) {
- hlist_for_each_entry_rcu(conn, head, c_hash_node) {
+ hlist_for_each_entry(conn, pos, head, c_hash_node) {
if (want_send)
list = &conn->c_send_queue;
else
list = &conn->c_retrans;
- spin_lock_irqsave(&conn->c_lock, flags);
+ spin_lock(&conn->c_lock);
/* XXX too lazy to maintain counts.. */
list_for_each_entry(rm, list, m_conn_item) {
conn->c_faddr, 0);
}
- spin_unlock_irqrestore(&conn->c_lock, flags);
+ spin_unlock(&conn->c_lock);
}
}
- rcu_read_unlock();
+
+ spin_unlock_irqrestore(&rds_conn_lock, flags);
lens->nr = total;
lens->each = sizeof(struct rds_info_message);
{
uint64_t buffer[(item_len + 7) / 8];
struct hlist_head *head;
+ struct hlist_node *pos;
+ struct hlist_node *tmp;
struct rds_connection *conn;
+ unsigned long flags;
size_t i;
- rcu_read_lock();
+ spin_lock_irqsave(&rds_conn_lock, flags);
lens->nr = 0;
lens->each = item_len;
for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
i++, head++) {
- hlist_for_each_entry_rcu(conn, head, c_hash_node) {
+ hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) {
/* XXX no c_lock usage.. */
if (!visitor(conn, buffer))
lens->nr++;
}
}
- rcu_read_unlock();
+
+ spin_unlock_irqrestore(&rds_conn_lock, flags);
}
-EXPORT_SYMBOL_GPL(rds_for_each_conn_info);
static int rds_conn_info_visitor(struct rds_connection *conn,
void *buffer)
sizeof(cinfo->transport));
cinfo->flags = 0;
- rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags),
- SENDING);
+ rds_conn_info_set(cinfo->flags,
+ rds_conn_is_sending(conn), SENDING);
/* XXX Future: return the state rather than these funky bits */
rds_conn_info_set(cinfo->flags,
atomic_read(&conn->c_state) == RDS_CONN_CONNECTING,
sizeof(struct rds_info_connection));
}
-int rds_conn_init(void)
+int __init rds_conn_init(void)
{
rds_conn_slab = kmem_cache_create("rds_connection",
sizeof(struct rds_connection),
0, 0, NULL);
- if (!rds_conn_slab)
+ if (rds_conn_slab == NULL)
return -ENOMEM;
rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
atomic_set(&conn->c_state, RDS_CONN_ERROR);
queue_work(rds_wq, &conn->c_down_w);
}
-EXPORT_SYMBOL_GPL(rds_conn_drop);
-
-/*
- * If the connection is down, trigger a connect. We may have scheduled a
- * delayed reconnect however - in this case we should not interfere.
- */
-void rds_conn_connect_if_down(struct rds_connection *conn)
-{
- if (rds_conn_state(conn) == RDS_CONN_DOWN &&
- !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
- queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
-}
-EXPORT_SYMBOL_GPL(rds_conn_connect_if_down);
/*
* An error occurred on the connection
#include <linux/inetdevice.h>
#include <linux/if_arp.h>
#include <linux/delay.h>
-#include <linux/slab.h>
-#include <linux/module.h>
#include "rds.h"
#include "ib.h"
-static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
+unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
-unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
module_param(fmr_pool_size, int, 0444);
MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA");
module_param(fmr_message_size, int, 0444);
MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
-module_param(rds_ib_retry_count, int, 0444);
-MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
-/*
- * we have a clumsy combination of RCU and a rwsem protecting this list
- * because it is used both in the get_mr fast path and while blocking in
- * the FMR flushing path.
- */
-DECLARE_RWSEM(rds_ib_devices_lock);
struct list_head rds_ib_devices;
/* NOTE: if also grabbing ibdev lock, grab this first */
DEFINE_SPINLOCK(ib_nodev_conns_lock);
LIST_HEAD(ib_nodev_conns);
-static void rds_ib_nodev_connect(void)
-{
- struct rds_ib_connection *ic;
-
- spin_lock(&ib_nodev_conns_lock);
- list_for_each_entry(ic, &ib_nodev_conns, ib_node)
- rds_conn_connect_if_down(ic->conn);
- spin_unlock(&ib_nodev_conns_lock);
-}
-
-static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev)
-{
- struct rds_ib_connection *ic;
- unsigned long flags;
-
- spin_lock_irqsave(&rds_ibdev->spinlock, flags);
- list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node)
- rds_conn_drop(ic->conn);
- spin_unlock_irqrestore(&rds_ibdev->spinlock, flags);
-}
-
-/*
- * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references
- * from interrupt context so we push freeing off into a work struct in krdsd.
- */
-static void rds_ib_dev_free(struct work_struct *work)
-{
- struct rds_ib_ipaddr *i_ipaddr, *i_next;
- struct rds_ib_device *rds_ibdev = container_of(work,
- struct rds_ib_device, free_work);
-
- if (rds_ibdev->mr_pool)
- rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
- if (rds_ibdev->mr)
- ib_dereg_mr(rds_ibdev->mr);
- if (rds_ibdev->pd)
- ib_dealloc_pd(rds_ibdev->pd);
-
- list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
- list_del(&i_ipaddr->list);
- kfree(i_ipaddr);
- }
-
- kfree(rds_ibdev);
-}
-
-void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
-{
- BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
- if (atomic_dec_and_test(&rds_ibdev->refcount))
- queue_work(rds_wq, &rds_ibdev->free_work);
-}
-
-static void rds_ib_add_one(struct ib_device *device)
+void rds_ib_add_one(struct ib_device *device)
{
struct rds_ib_device *rds_ibdev;
struct ib_device_attr *dev_attr;
goto free_attr;
}
- rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
- ibdev_to_node(device));
+ rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL);
if (!rds_ibdev)
goto free_attr;
spin_lock_init(&rds_ibdev->spinlock);
- atomic_set(&rds_ibdev->refcount, 1);
- INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
rds_ibdev->max_wrs = dev_attr->max_qp_wr;
rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
+ rds_ibdev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1);
+ rds_ibdev->fmr_page_size = 1 << rds_ibdev->fmr_page_shift;
+ rds_ibdev->fmr_page_mask = ~((u64) rds_ibdev->fmr_page_size - 1);
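	/*
	 * Example of the fmr_page_* calculation above, assuming the HCA
	 * reports 4K as its smallest supported page size (bit 12 is the
	 * lowest bit set in page_size_cap): ffs() returns 13, so
	 * fmr_page_shift = max(9, 12) = 12, fmr_page_size = 4096 and
	 * fmr_page_mask = ~0xfffULL.  (Illustration only, not part of
	 * this patch.)
	 */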
rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
rds_ibdev->max_fmrs = dev_attr->max_fmr ?
min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
fmr_pool_size;
- rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
- rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
-
rds_ibdev->dev = device;
rds_ibdev->pd = ib_alloc_pd(device);
- if (IS_ERR(rds_ibdev->pd)) {
- rds_ibdev->pd = NULL;
- goto put_dev;
- }
+ if (IS_ERR(rds_ibdev->pd))
+ goto free_dev;
- rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE);
- if (IS_ERR(rds_ibdev->mr)) {
- rds_ibdev->mr = NULL;
- goto put_dev;
- }
+ rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd,
+ IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(rds_ibdev->mr))
+ goto err_pd;
rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
if (IS_ERR(rds_ibdev->mr_pool)) {
rds_ibdev->mr_pool = NULL;
- goto put_dev;
+ goto err_mr;
}
INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
INIT_LIST_HEAD(&rds_ibdev->conn_list);
-
- down_write(&rds_ib_devices_lock);
- list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices);
- up_write(&rds_ib_devices_lock);
- atomic_inc(&rds_ibdev->refcount);
+ list_add_tail(&rds_ibdev->list, &rds_ib_devices);
ib_set_client_data(device, &rds_ib_client, rds_ibdev);
- atomic_inc(&rds_ibdev->refcount);
- rds_ib_nodev_connect();
+ goto free_attr;
-put_dev:
- rds_ib_dev_put(rds_ibdev);
+err_mr:
+ ib_dereg_mr(rds_ibdev->mr);
+err_pd:
+ ib_dealloc_pd(rds_ibdev->pd);
+free_dev:
+ kfree(rds_ibdev);
free_attr:
kfree(dev_attr);
}
-/*
- * New connections use this to find the device to associate with the
- * connection. It's not in the fast path so we're not concerned about the
- * performance of the IB call. (As of this writing, it uses an interrupt
- * blocking spinlock to serialize walking a per-device list of all registered
- * clients.)
- *
- * RCU is used to handle incoming connections racing with device teardown.
- * Rather than use a lock to serialize removal from the client_data and
- * getting a new reference, we use an RCU grace period. The destruction
- * path removes the device from client_data and then waits for all RCU
- * readers to finish.
- *
- * A new connection can get NULL from this if it's arriving on a
- * device that is in the process of being removed.
- */
-struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device)
-{
- struct rds_ib_device *rds_ibdev;
-
- rcu_read_lock();
- rds_ibdev = ib_get_client_data(device, &rds_ib_client);
- if (rds_ibdev)
- atomic_inc(&rds_ibdev->refcount);
- rcu_read_unlock();
- return rds_ibdev;
-}
-
-/*
- * The IB stack is letting us know that a device is going away. This can
- * happen if the underlying HCA driver is removed or if PCI hotplug is removing
- * the pci function, for example.
- *
- * This can be called at any time and can be racing with any other RDS path.
- */
-static void rds_ib_remove_one(struct ib_device *device)
+void rds_ib_remove_one(struct ib_device *device)
{
struct rds_ib_device *rds_ibdev;
+ struct rds_ib_ipaddr *i_ipaddr, *i_next;
rds_ibdev = ib_get_client_data(device, &rds_ib_client);
if (!rds_ibdev)
return;
- rds_ib_dev_shutdown(rds_ibdev);
+ list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
+ list_del(&i_ipaddr->list);
+ kfree(i_ipaddr);
+ }
+
+ rds_ib_destroy_conns(rds_ibdev);
- /* stop connection attempts from getting a reference to this device. */
- ib_set_client_data(device, &rds_ib_client, NULL);
+ if (rds_ibdev->mr_pool)
+ rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
- down_write(&rds_ib_devices_lock);
- list_del_rcu(&rds_ibdev->list);
- up_write(&rds_ib_devices_lock);
+ ib_dereg_mr(rds_ibdev->mr);
- /*
- * This synchronize rcu is waiting for readers of both the ib
- * client data and the devices list to finish before we drop
- * both of those references.
- */
- synchronize_rcu();
- rds_ib_dev_put(rds_ibdev);
- rds_ib_dev_put(rds_ibdev);
+ while (ib_dealloc_pd(rds_ibdev->pd)) {
+ rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd);
+ msleep(1);
+ }
+
+ list_del(&rds_ibdev->list);
+ kfree(rds_ibdev);
}
struct ib_client rds_ib_client = {
ic = conn->c_transport_data;
dev_addr = &ic->i_cm_id->route.addr.dev_addr;
- rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
- rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
+ ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
+ ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
- rds_ibdev = ic->rds_ibdev;
+ rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
iinfo->max_send_wr = ic->i_send_ring.w_nr;
iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
iinfo->max_send_sge = rds_ibdev->max_sge;
/* Create a CMA ID and try to bind it. This catches both
* IB and iWARP capable NICs.
*/
- cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
- if (IS_ERR(cm_id))
- return PTR_ERR(cm_id);
+ cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
+ if (!cm_id)
+ return -EADDRNOTAVAIL;
memset(&sin, 0, sizeof(sin));
sin.sin_family = AF_INET;
ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
/* due to this, we will claim to support iWARP devices unless we
check node_type. */
- if (ret || !cm_id->device ||
- cm_id->device->node_type != RDMA_NODE_IB_CA)
+ if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
ret = -EADDRNOTAVAIL;
rdsdebug("addr %pI4 ret %d node type %d\n",
return ret;
}
-static void rds_ib_unregister_client(void)
-{
- ib_unregister_client(&rds_ib_client);
- /* wait for rds_ib_dev_free() to complete */
- flush_workqueue(rds_wq);
-}
-
void rds_ib_exit(void)
{
rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
- rds_ib_unregister_client();
rds_ib_destroy_nodev_conns();
+ ib_unregister_client(&rds_ib_client);
rds_ib_sysctl_exit();
rds_ib_recv_exit();
rds_trans_unregister(&rds_ib_transport);
.laddr_check = rds_ib_laddr_check,
.xmit_complete = rds_ib_xmit_complete,
.xmit = rds_ib_xmit,
+ .xmit_cong_map = NULL,
.xmit_rdma = rds_ib_xmit_rdma,
- .xmit_atomic = rds_ib_xmit_atomic,
.recv = rds_ib_recv,
.conn_alloc = rds_ib_conn_alloc,
.conn_free = rds_ib_conn_free,
.conn_connect = rds_ib_conn_connect,
.conn_shutdown = rds_ib_conn_shutdown,
.inc_copy_to_user = rds_ib_inc_copy_to_user,
+ .inc_purge = rds_ib_inc_purge,
.inc_free = rds_ib_inc_free,
.cm_initiate_connect = rds_ib_cm_initiate_connect,
.cm_handle_connect = rds_ib_cm_handle_connect,
.flush_mrs = rds_ib_flush_mrs,
.t_owner = THIS_MODULE,
.t_name = "infiniband",
- .t_type = RDS_TRANS_IB
};
-int rds_ib_init(void)
+int __init rds_ib_init(void)
{
int ret;
out_sysctl:
rds_ib_sysctl_exit();
out_ibreg:
- rds_ib_unregister_client();
+ ib_unregister_client(&rds_ib_client);
out:
return ret;
}
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
-#include <linux/interrupt.h>
-#include <linux/pci.h>
-#include <linux/slab.h>
#include "rds.h"
#include "rdma_transport.h"
#define RDS_FMR_SIZE 256
-#define RDS_FMR_POOL_SIZE 8192
+#define RDS_FMR_POOL_SIZE 4096
#define RDS_IB_MAX_SGE 8
#define RDS_IB_RECV_SGE 2
#define RDS_IB_DEFAULT_RECV_WR 1024
#define RDS_IB_DEFAULT_SEND_WR 256
-#define RDS_IB_DEFAULT_RETRY_COUNT 2
-
#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
-#define RDS_IB_RECYCLE_BATCH_COUNT 32
-
-extern struct rw_semaphore rds_ib_devices_lock;
extern struct list_head rds_ib_devices;
/*
 * try and minimize the amount of memory tied up in both the device and
* socket receive queues.
*/
+/* page offset of the final full frag that fits in the page */
+#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
struct rds_page_frag {
struct list_head f_item;
- struct list_head f_cache_entry;
- struct scatterlist f_sg;
+ struct page *f_page;
+ unsigned long f_offset;
+ dma_addr_t f_mapped;
};
struct rds_ib_incoming {
struct list_head ii_frags;
- struct list_head ii_cache_entry;
struct rds_incoming ii_inc;
};
-struct rds_ib_cache_head {
- struct list_head *first;
- unsigned long count;
-};
-
-struct rds_ib_refill_cache {
- struct rds_ib_cache_head __percpu *percpu;
- struct list_head *xfer;
- struct list_head *ready;
-};
-
struct rds_ib_connect_private {
/* Add new fields at the end, and don't permute existing fields. */
__be32 dp_saddr;
};
struct rds_ib_send_work {
- void *s_op;
+ struct rds_message *s_rm;
+ struct rds_rdma_op *s_op;
struct ib_send_wr s_wr;
struct ib_sge s_sge[RDS_IB_MAX_SGE];
unsigned long s_queued;
/* tx */
struct rds_ib_work_ring i_send_ring;
- struct rm_data_op *i_data_op;
+ struct rds_message *i_rm;
struct rds_header *i_send_hdrs;
u64 i_send_hdrs_dma;
struct rds_ib_send_work *i_sends;
- atomic_t i_signaled_sends;
/* rx */
- struct tasklet_struct i_recv_tasklet;
struct mutex i_recv_mutex;
struct rds_ib_work_ring i_recv_ring;
struct rds_ib_incoming *i_ibinc;
struct rds_header *i_recv_hdrs;
u64 i_recv_hdrs_dma;
struct rds_ib_recv_work *i_recvs;
+ struct rds_page_frag i_frag;
u64 i_ack_recv; /* last ACK received */
- struct rds_ib_refill_cache i_cache_incs;
- struct rds_ib_refill_cache i_cache_frags;
/* sending acks */
unsigned long i_ack_flags;
/* Batched completions */
unsigned int i_unsignaled_wrs;
+ long i_unsignaled_bytes;
};
/* This assumes that atomic_t is at least 32 bits */
struct ib_pd *pd;
struct ib_mr *mr;
struct rds_ib_mr_pool *mr_pool;
+ int fmr_page_shift;
+ int fmr_page_size;
+ u64 fmr_page_mask;
unsigned int fmr_max_remaps;
unsigned int max_fmrs;
int max_sge;
unsigned int max_wrs;
- unsigned int max_initiator_depth;
- unsigned int max_responder_resources;
spinlock_t spinlock; /* protect the above */
- atomic_t refcount;
- struct work_struct free_work;
};
-#define ibdev_to_node(ibdev) dev_to_node(ibdev->dma_device)
-#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)
-
/* bits for i_ack_flags */
#define IB_ACK_IN_FLIGHT 0
#define IB_ACK_REQUESTED 1
uint64_t s_ib_rdma_mr_pool_flush;
uint64_t s_ib_rdma_mr_pool_wait;
uint64_t s_ib_rdma_mr_pool_depleted;
- uint64_t s_ib_atomic_cswp;
- uint64_t s_ib_atomic_fadd;
};
extern struct workqueue_struct *rds_ib_wq;
/* ib.c */
extern struct rds_transport rds_ib_transport;
-struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
-void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
+extern void rds_ib_add_one(struct ib_device *device);
+extern void rds_ib_remove_one(struct ib_device *device);
extern struct ib_client rds_ib_client;
+extern unsigned int fmr_pool_size;
extern unsigned int fmr_message_size;
-extern unsigned int rds_ib_retry_count;
extern spinlock_t ib_nodev_conns_lock;
extern struct list_head ib_nodev_conns;
int rds_ib_conn_connect(struct rds_connection *conn);
void rds_ib_conn_shutdown(struct rds_connection *conn);
void rds_ib_state_change(struct sock *sk);
-int rds_ib_listen_init(void);
+int __init rds_ib_listen_init(void);
void rds_ib_listen_stop(void);
void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
-void rds_ib_destroy_nodev_conns(void);
+void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock);
+static inline void rds_ib_destroy_nodev_conns(void)
+{
+ __rds_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock);
+}
+static inline void rds_ib_destroy_conns(struct rds_ib_device *rds_ibdev)
+{
+ __rds_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock);
+}
struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
void rds_ib_flush_mrs(void);
/* ib_recv.c */
-int rds_ib_recv_init(void);
+int __init rds_ib_recv_init(void);
void rds_ib_recv_exit(void);
int rds_ib_recv(struct rds_connection *conn);
-int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
-void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
-void rds_ib_recv_refill(struct rds_connection *conn, int prefill);
+int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
+ gfp_t page_gfp, int prefill);
+void rds_ib_inc_purge(struct rds_incoming *inc);
void rds_ib_inc_free(struct rds_incoming *inc);
-int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
+int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
+ size_t size);
void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context);
-void rds_ib_recv_tasklet_fn(unsigned long data);
void rds_ib_recv_init_ring(struct rds_ib_connection *ic);
void rds_ib_recv_clear_ring(struct rds_ib_connection *ic);
void rds_ib_recv_init_ack(struct rds_ib_connection *ic);
extern wait_queue_head_t rds_ib_ring_empty_wait;
/* ib_send.c */
-char *rds_ib_wc_status_str(enum ib_wc_status status);
void rds_ib_xmit_complete(struct rds_connection *conn);
int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off);
void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
void rds_ib_send_init_ring(struct rds_ib_connection *ic);
void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
-int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
- u32 *adv_credits, int need_posted, int max_posted);
-int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
+ u32 *adv_credits, int need_posted);
/* ib_stats.c */
DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
unsigned int avail);
/* ib_sysctl.c */
-int rds_ib_sysctl_init(void);
+int __init rds_ib_sysctl_init(void);
void rds_ib_sysctl_exit(void);
extern unsigned long rds_ib_sysctl_max_send_wr;
extern unsigned long rds_ib_sysctl_max_recv_wr;
extern unsigned long rds_ib_sysctl_max_unsig_bytes;
extern unsigned long rds_ib_sysctl_max_recv_allocation;
extern unsigned int rds_ib_sysctl_flow_control;
+extern ctl_table rds_ib_sysctl_table[];
+
+/*
+ * Helper functions for getting/setting the header and data SGEs in
+ * RDS packets (not RDMA)
+ */
+static inline struct ib_sge *
+rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
+{
+ return &sge[0];
+}
+
+static inline struct ib_sge *
+rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
+{
+ return &sge[1];
+}
#endif
*/
#include <linux/kernel.h>
#include <linux/in.h>
-#include <linux/slab.h>
#include <linux/vmalloc.h>
-#include <linux/ratelimit.h>
#include "rds.h"
#include "ib.h"
-static char *rds_ib_event_type_strings[] = {
-#define RDS_IB_EVENT_STRING(foo) \
- [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo)
- RDS_IB_EVENT_STRING(CQ_ERR),
- RDS_IB_EVENT_STRING(QP_FATAL),
- RDS_IB_EVENT_STRING(QP_REQ_ERR),
- RDS_IB_EVENT_STRING(QP_ACCESS_ERR),
- RDS_IB_EVENT_STRING(COMM_EST),
- RDS_IB_EVENT_STRING(SQ_DRAINED),
- RDS_IB_EVENT_STRING(PATH_MIG),
- RDS_IB_EVENT_STRING(PATH_MIG_ERR),
- RDS_IB_EVENT_STRING(DEVICE_FATAL),
- RDS_IB_EVENT_STRING(PORT_ACTIVE),
- RDS_IB_EVENT_STRING(PORT_ERR),
- RDS_IB_EVENT_STRING(LID_CHANGE),
- RDS_IB_EVENT_STRING(PKEY_CHANGE),
- RDS_IB_EVENT_STRING(SM_CHANGE),
- RDS_IB_EVENT_STRING(SRQ_ERR),
- RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED),
- RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED),
- RDS_IB_EVENT_STRING(CLIENT_REREGISTER),
-#undef RDS_IB_EVENT_STRING
-};
-
-static char *rds_ib_event_str(enum ib_event_type type)
-{
- return rds_str_array(rds_ib_event_type_strings,
- ARRAY_SIZE(rds_ib_event_type_strings), type);
-};
-
/*
* Set the selected protocol version
*/
{
const struct rds_ib_connect_private *dp = NULL;
struct rds_ib_connection *ic = conn->c_transport_data;
+ struct rds_ib_device *rds_ibdev;
struct ib_qp_attr qp_attr;
int err;
- if (event->param.conn.private_data_len >= sizeof(*dp)) {
+ if (event->param.conn.private_data_len) {
dp = event->param.conn.private_data;
- /* make sure it isn't empty data */
- if (dp->dp_protocol_major) {
- rds_ib_set_protocol(conn,
+ rds_ib_set_protocol(conn,
RDS_PROTOCOL(dp->dp_protocol_major,
- dp->dp_protocol_minor));
- rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
- }
+ dp->dp_protocol_minor));
+ rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
}
- if (conn->c_version < RDS_PROTOCOL(3,1)) {
- printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
- " no longer supported\n",
- &conn->c_faddr,
- RDS_PROTOCOL_MAJOR(conn->c_version),
- RDS_PROTOCOL_MINOR(conn->c_version));
- rds_conn_destroy(conn);
- return;
- } else {
- printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
- &conn->c_faddr,
- RDS_PROTOCOL_MAJOR(conn->c_version),
- RDS_PROTOCOL_MINOR(conn->c_version),
- ic->i_flowctl ? ", flow control" : "");
- }
-
- /*
- * Init rings and fill recv. This needs to wait until protocol negotiation
- * is complete, since ring layout is different from 3.0 to 3.1.
- */
- rds_ib_send_init_ring(ic);
- rds_ib_recv_init_ring(ic);
- /* Post receive buffers - as a side effect, this will update
- * the posted credit count. */
- rds_ib_recv_refill(conn, 1);
+ printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
+ &conn->c_laddr,
+ RDS_PROTOCOL_MAJOR(conn->c_version),
+ RDS_PROTOCOL_MINOR(conn->c_version),
+ ic->i_flowctl ? ", flow control" : "");
/* Tune RNR behavior */
rds_ib_tune_rnr(ic, &qp_attr);
if (err)
printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
- /* update ib_device with this local ipaddr */
- err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr);
+ /* update ib_device with this local ipaddr & conn */
+ rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+ err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
if (err)
- printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
- err);
+ printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err);
+ rds_ib_add_conn(rds_ibdev, conn);
/* If the peer gave us the last packet it saw, process this as if
* we had received a regular ACK. */
- if (dp) {
- /* dp structure start is not guaranteed to be 8-byte aligned.
- * Since dp_ack_seq is 64-bit, extended load operations can be
- * used, so go through get_unaligned to avoid unaligned errors.
- */
- __be64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq);
-
- if (dp_ack_seq)
- rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq),
- NULL);
- }
+ if (dp && dp->dp_ack_seq)
+ rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
rds_connect_complete(conn);
}
static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
struct rdma_conn_param *conn_param,
struct rds_ib_connect_private *dp,
- u32 protocol_version,
- u32 max_responder_resources,
- u32 max_initiator_depth)
+ u32 protocol_version)
{
- struct rds_ib_connection *ic = conn->c_transport_data;
- struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
-
memset(conn_param, 0, sizeof(struct rdma_conn_param));
-
- conn_param->responder_resources =
- min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
- conn_param->initiator_depth =
- min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
- conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
+ /* XXX tune these? */
+ conn_param->responder_resources = 1;
+ conn_param->initiator_depth = 1;
+ conn_param->retry_count = 7;
conn_param->rnr_retry_count = 7;
if (dp) {
+ struct rds_ib_connection *ic = conn->c_transport_data;
+
memset(dp, 0, sizeof(*dp));
dp->dp_saddr = conn->c_laddr;
dp->dp_daddr = conn->c_faddr;
static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
{
- rdsdebug("event %u (%s) data %p\n",
- event->event, rds_ib_event_str(event->event), data);
+ rdsdebug("event %u data %p\n", event->event, data);
}
static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
struct rds_connection *conn = data;
struct rds_ib_connection *ic = conn->c_transport_data;
- rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
- rds_ib_event_str(event->event));
+ rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
switch (event->event) {
case IB_EVENT_COMM_EST:
rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
break;
default:
- rdsdebug("Fatal QP Event %u (%s) "
- "- connection %pI4->%pI4, reconnecting\n",
- event->event, rds_ib_event_str(event->event),
- &conn->c_laddr, &conn->c_faddr);
- rds_conn_drop(conn);
+ printk(KERN_WARNING "RDS/ib: unhandled QP event %u "
+ "on connection to %pI4\n", event->event,
+ &conn->c_faddr);
break;
}
}
struct rds_ib_device *rds_ibdev;
int ret;
- /*
- * It's normal to see a null device if an incoming connection races
- * with device removal, so we don't print a warning.
+ /* rds_ib_add_one creates a rds_ib_device object per IB device,
+ * and allocates a protection domain, memory range and FMR pool
+ * for each. If that fails for any reason, it will not register
+ * the rds_ibdev at all.
*/
- rds_ibdev = rds_ib_get_client_data(dev);
- if (!rds_ibdev)
+ rds_ibdev = ib_get_client_data(dev, &rds_ib_client);
+ if (rds_ibdev == NULL) {
+ if (printk_ratelimit())
+ printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n",
+ dev->name);
return -EOPNOTSUPP;
-
- /* add the conn now so that connection establishment has the dev */
- rds_ib_add_conn(rds_ibdev, conn);
+ }
if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
ic->i_send_ring.w_nr *
sizeof(struct rds_header),
&ic->i_send_hdrs_dma, GFP_KERNEL);
- if (!ic->i_send_hdrs) {
+ if (ic->i_send_hdrs == NULL) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent send failed\n");
goto out;
ic->i_recv_ring.w_nr *
sizeof(struct rds_header),
&ic->i_recv_hdrs_dma, GFP_KERNEL);
- if (!ic->i_recv_hdrs) {
+ if (ic->i_recv_hdrs == NULL) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent recv failed\n");
goto out;
ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
&ic->i_ack_dma, GFP_KERNEL);
- if (!ic->i_ack) {
+ if (ic->i_ack == NULL) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent ack failed\n");
goto out;
}
- ic->i_sends = vzalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work),
- ibdev_to_node(dev));
- if (!ic->i_sends) {
+ ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
+ if (ic->i_sends == NULL) {
ret = -ENOMEM;
rdsdebug("send allocation failed\n");
goto out;
}
+ rds_ib_send_init_ring(ic);
- ic->i_recvs = vzalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work),
- ibdev_to_node(dev));
- if (!ic->i_recvs) {
+ ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
+ if (ic->i_recvs == NULL) {
ret = -ENOMEM;
rdsdebug("recv allocation failed\n");
goto out;
}
+ rds_ib_recv_init_ring(ic);
rds_ib_recv_init_ack(ic);
+ /* Post receive buffers - as a side effect, this will update
+ * the posted credit count. */
+ rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
+
rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
ic->i_send_cq, ic->i_recv_cq);
out:
- rds_ib_dev_put(rds_ibdev);
return ret;
}
-static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event)
+static u32 rds_ib_protocol_compatible(const struct rds_ib_connect_private *dp)
{
- const struct rds_ib_connect_private *dp = event->param.conn.private_data;
u16 common;
u32 version = 0;
- /*
- * rdma_cm private data is odd - when there is any private data in the
+ /* rdma_cm private data is odd - when there is any private data in the
 * request, we will be given a pretty large buffer without being told the
* original size. The only way to tell the difference is by looking at
* the contents, which are initialized to zero.
* If the protocol version fields aren't set, this is a connection attempt
 * from an older version. This could be 3.0 or 2.0 - we can't tell.
- * We really should have changed this for OFED 1.3 :-(
- */
-
- /* Be paranoid. RDS always has privdata */
- if (!event->param.conn.private_data_len) {
- printk(KERN_NOTICE "RDS incoming connection has no private data, "
- "rejecting\n");
- return 0;
- }
-
- /* Even if len is crap *now* I still want to check it. -ASG */
- if (event->param.conn.private_data_len < sizeof (*dp) ||
- dp->dp_protocol_major == 0)
+ * We really should have changed this for OFED 1.3 :-( */
+ if (dp->dp_protocol_major == 0)
return RDS_PROTOCOL_3_0;
common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS;
version = RDS_PROTOCOL_3_0;
while ((common >>= 1) != 0)
version++;
- } else
- printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n",
- &dp->dp_saddr,
- dp->dp_protocol_major,
- dp->dp_protocol_minor);
+ } else if (printk_ratelimit()) {
+ printk(KERN_NOTICE "RDS: Connection from %pI4 using "
+ "incompatible protocol version %u.%u\n",
+ &dp->dp_saddr,
+ dp->dp_protocol_major,
+ dp->dp_protocol_minor);
+ }
return version;
}
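The negotiation above reduces to finding the highest minor version bit shared by both peers: mask the peer's minor-version bitmask with ours, then count shifts. A self-contained sketch of that bit scan, where the base version encoding and mask values are assumptions chosen for illustration:

/* Illustrative sketch only - not part of the patch. Assumes a 3.x base
 * version and a 16-bit minor-version bitmask like dp_protocol_minor_mask. */
#include <stdint.h>
#include <stdio.h>

#define PROTO_3_0            0x0300u   /* assumed encoding of version 3.0 */
#define SUPPORTED_MINOR_MASK 0x0003u   /* assumed: we support minors 0 and 1 */

static unsigned int negotiate(uint16_t peer_minor_mask)
{
	uint16_t common = peer_minor_mask & SUPPORTED_MINOR_MASK;
	unsigned int version = PROTO_3_0;

	/* Highest common bit wins: each shift bumps the minor by one. */
	while ((common >>= 1) != 0)
		version++;
	return version;
}

int main(void)
{
	printf("negotiated 0x%x\n", negotiate(0x0003)); /* -> 0x0301, i.e. 3.1 */
	return 0;
}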
struct rds_ib_connection *ic = NULL;
struct rdma_conn_param conn_param;
u32 version;
- int err = 1, destroy = 1;
+ int err, destroy = 1;
/* Check whether the remote protocol version matches ours. */
- version = rds_ib_protocol_compatible(event);
+ version = rds_ib_protocol_compatible(dp);
if (!version)
goto out;
/* Wait and see - our connect may still be succeeding */
rds_ib_stats_inc(s_ib_connect_raced);
}
+ mutex_unlock(&conn->c_cm_lock);
goto out;
}
goto out;
}
- rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
- event->param.conn.responder_resources,
- event->param.conn.initiator_depth);
+ rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
/* rdma_accept() calls rdma_reject() internally if it fails */
err = rdma_accept(cm_id, &conn_param);
- if (err)
+ mutex_unlock(&conn->c_cm_lock);
+ if (err) {
rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
+ goto out;
+ }
+
+ return 0;
out:
- if (conn)
- mutex_unlock(&conn->c_cm_lock);
- if (err)
- rdma_reject(cm_id, NULL, 0);
+ rdma_reject(cm_id, NULL, 0);
return destroy;
}
goto out;
}
- rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
- UINT_MAX, UINT_MAX);
+ rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
+
ret = rdma_connect(cm_id, &conn_param);
if (ret)
rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
/* XXX I wonder what effect the port space has */
/* delegate cm event handler to rdma_transport */
ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
- RDMA_PS_TCP, IB_QPT_RC);
+ RDMA_PS_TCP);
if (IS_ERR(ic->i_cm_id)) {
ret = PTR_ERR(ic->i_cm_id);
ic->i_cm_id = NULL;
ic->i_cm_id, err);
}
- /*
- * We want to wait for tx and rx completion to finish
- * before we tear down the connection, but we have to be
- * careful not to get stuck waiting on a send ring that
- * only has unsignaled sends in it. We've shutdown new
- * sends before getting here so by waiting for signaled
- * sends to complete we're ensured that there will be no
- * more tx processing.
- */
wait_event(rds_ib_ring_empty_wait,
- rds_ib_ring_empty(&ic->i_recv_ring) &&
- (atomic_read(&ic->i_signaled_sends) == 0));
- tasklet_kill(&ic->i_recv_tasklet);
+ rds_ib_ring_empty(&ic->i_send_ring) &&
+ rds_ib_ring_empty(&ic->i_recv_ring));
if (ic->i_send_hdrs)
ib_dma_free_coherent(dev,
BUG_ON(ic->rds_ibdev);
/* Clear pending transmit */
- if (ic->i_data_op) {
- struct rds_message *rm;
-
- rm = container_of(ic->i_data_op, struct rds_message, data);
- rds_message_put(rm);
- ic->i_data_op = NULL;
+ if (ic->i_rm) {
+ rds_message_put(ic->i_rm);
+ ic->i_rm = NULL;
}
/* Clear the ACK state */
{
struct rds_ib_connection *ic;
unsigned long flags;
- int ret;
/* XXX too lazy? */
- ic = kzalloc(sizeof(struct rds_ib_connection), gfp);
- if (!ic)
+ ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL);
+ if (ic == NULL)
return -ENOMEM;
- ret = rds_ib_recv_alloc_caches(ic);
- if (ret) {
- kfree(ic);
- return ret;
- }
-
INIT_LIST_HEAD(&ic->ib_node);
- tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn,
- (unsigned long) ic);
mutex_init(&ic->i_recv_mutex);
#ifndef KERNEL_HAS_ATOMIC64
spin_lock_init(&ic->i_ack_lock);
#endif
- atomic_set(&ic->i_signaled_sends, 0);
/*
* rds_ib_conn_shutdown() waits for these to be emptied so they
list_del(&ic->ib_node);
spin_unlock_irq(lock_ptr);
- rds_ib_recv_free_caches(ic);
-
kfree(ic);
}
*
*/
#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/rculist.h>
-#include <linux/llist.h>
#include "rds.h"
+#include "rdma.h"
#include "ib.h"
-static DEFINE_PER_CPU(unsigned long, clean_list_grace);
-#define CLEAN_LIST_BUSY_BIT 0
/*
* This is stored as mr->r_trans_private.
struct rds_ib_device *device;
struct rds_ib_mr_pool *pool;
struct ib_fmr *fmr;
-
- struct llist_node llnode;
-
- /* unmap_list is for freeing */
- struct list_head unmap_list;
+ struct list_head list;
unsigned int remap_count;
struct scatterlist *sg;
*/
struct rds_ib_mr_pool {
struct mutex flush_lock; /* serialize fmr invalidate */
- struct delayed_work flush_worker; /* flush worker */
+ struct work_struct flush_worker; /* flush worker */
+ spinlock_t list_lock; /* protect variables below */
atomic_t item_count; /* total # of MRs */
atomic_t dirty_count; /* # dirty of MRs */
-
- struct llist_head drop_list; /* MRs that have reached their max_maps limit */
- struct llist_head free_list; /* unused MRs */
- struct llist_head clean_list; /* global unused & unamapped MRs */
- wait_queue_head_t flush_wait;
-
+ struct list_head drop_list; /* MRs that have reached their max_maps limit */
+ struct list_head free_list; /* unused MRs */
+ struct list_head clean_list; /* unused & unmapped MRs */
atomic_t free_pinned; /* memory pinned by free MRs */
unsigned long max_items;
unsigned long max_items_soft;
struct ib_fmr_attr fmr_attr;
};
-static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all);
static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
struct rds_ib_device *rds_ibdev;
struct rds_ib_ipaddr *i_ipaddr;
- rcu_read_lock();
- list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) {
- list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
+ list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
+ spin_lock_irq(&rds_ibdev->spinlock);
+ list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
if (i_ipaddr->ipaddr == ipaddr) {
- atomic_inc(&rds_ibdev->refcount);
- rcu_read_unlock();
+ spin_unlock_irq(&rds_ibdev->spinlock);
return rds_ibdev;
}
}
+ spin_unlock_irq(&rds_ibdev->spinlock);
}
- rcu_read_unlock();
return NULL;
}
i_ipaddr->ipaddr = ipaddr;
spin_lock_irq(&rds_ibdev->spinlock);
- list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
+ list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
spin_unlock_irq(&rds_ibdev->spinlock);
return 0;
static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
{
- struct rds_ib_ipaddr *i_ipaddr;
- struct rds_ib_ipaddr *to_free = NULL;
-
+ struct rds_ib_ipaddr *i_ipaddr, *next;
spin_lock_irq(&rds_ibdev->spinlock);
- list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
+ list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) {
if (i_ipaddr->ipaddr == ipaddr) {
- list_del_rcu(&i_ipaddr->list);
- to_free = i_ipaddr;
+ list_del(&i_ipaddr->list);
+ kfree(i_ipaddr);
break;
}
}
spin_unlock_irq(&rds_ibdev->spinlock);
-
- if (to_free) {
- synchronize_rcu();
- kfree(to_free);
- }
}
int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
struct rds_ib_device *rds_ibdev_old;
rds_ibdev_old = rds_ib_get_device(ipaddr);
- if (rds_ibdev_old) {
+ if (rds_ibdev_old)
rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
- rds_ib_dev_put(rds_ibdev_old);
- }
return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
}
BUG_ON(list_empty(&ic->ib_node));
list_del(&ic->ib_node);
- spin_lock(&rds_ibdev->spinlock);
+ spin_lock_irq(&rds_ibdev->spinlock);
list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
- spin_unlock(&rds_ibdev->spinlock);
+ spin_unlock_irq(&rds_ibdev->spinlock);
spin_unlock_irq(&ib_nodev_conns_lock);
ic->rds_ibdev = rds_ibdev;
- atomic_inc(&rds_ibdev->refcount);
}
void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
spin_unlock(&ib_nodev_conns_lock);
ic->rds_ibdev = NULL;
- rds_ib_dev_put(rds_ibdev);
}
-void rds_ib_destroy_nodev_conns(void)
+void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock)
{
struct rds_ib_connection *ic, *_ic;
LIST_HEAD(tmp_list);
/* avoid calling conn_destroy with irqs off */
- spin_lock_irq(&ib_nodev_conns_lock);
- list_splice(&ib_nodev_conns, &tmp_list);
- spin_unlock_irq(&ib_nodev_conns_lock);
-
- list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node)
+ spin_lock_irq(list_lock);
+ list_splice(list, &tmp_list);
+ INIT_LIST_HEAD(list);
+ spin_unlock_irq(list_lock);
+
+ list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) {
+ if (ic->conn->c_passive)
+ rds_conn_destroy(ic->conn->c_passive);
rds_conn_destroy(ic->conn);
+ }
}
struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
if (!pool)
return ERR_PTR(-ENOMEM);
- init_llist_head(&pool->free_list);
- init_llist_head(&pool->drop_list);
- init_llist_head(&pool->clean_list);
+ INIT_LIST_HEAD(&pool->free_list);
+ INIT_LIST_HEAD(&pool->drop_list);
+ INIT_LIST_HEAD(&pool->clean_list);
mutex_init(&pool->flush_lock);
- init_waitqueue_head(&pool->flush_wait);
- INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
+ spin_lock_init(&pool->list_lock);
+ INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
pool->fmr_attr.max_pages = fmr_message_size;
pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
- pool->fmr_attr.page_shift = PAGE_SHIFT;
+ pool->fmr_attr.page_shift = rds_ibdev->fmr_page_shift;
pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4;
/* We never allow more than max_items MRs to be allocated.
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
{
- cancel_delayed_work_sync(&pool->flush_worker);
- rds_ib_flush_mr_pool(pool, 1, NULL);
- WARN_ON(atomic_read(&pool->item_count));
- WARN_ON(atomic_read(&pool->free_pinned));
+ flush_workqueue(rds_wq);
+ rds_ib_flush_mr_pool(pool, 1);
+ BUG_ON(atomic_read(&pool->item_count));
+ BUG_ON(atomic_read(&pool->free_pinned));
kfree(pool);
}
static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
{
struct rds_ib_mr *ibmr = NULL;
- struct llist_node *ret;
- unsigned long *flag;
+ unsigned long flags;
- preempt_disable();
- flag = this_cpu_ptr(&clean_list_grace);
- set_bit(CLEAN_LIST_BUSY_BIT, flag);
- ret = llist_del_first(&pool->clean_list);
- if (ret)
- ibmr = llist_entry(ret, struct rds_ib_mr, llnode);
+ spin_lock_irqsave(&pool->list_lock, flags);
+ if (!list_empty(&pool->clean_list)) {
+ ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list);
+ list_del_init(&ibmr->list);
+ }
+ spin_unlock_irqrestore(&pool->list_lock, flags);
- clear_bit(CLEAN_LIST_BUSY_BIT, flag);
- preempt_enable();
return ibmr;
}
-static inline void wait_clean_list_grace(void)
-{
- int cpu;
- unsigned long *flag;
-
- for_each_online_cpu(cpu) {
- flag = &per_cpu(clean_list_grace, cpu);
- while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
- cpu_relax();
- }
-}
-
static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
{
struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
struct rds_ib_mr *ibmr = NULL;
int err = 0, iter = 0;
- if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
- schedule_delayed_work(&pool->flush_worker, 10);
-
while (1) {
ibmr = rds_ib_reuse_fmr(pool);
if (ibmr)
/* We do have some empty MRs. Flush them out. */
rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
- rds_ib_flush_mr_pool(pool, 0, &ibmr);
- if (ibmr)
- return ibmr;
+ rds_ib_flush_mr_pool(pool, 0);
}
- ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev));
+ ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
if (!ibmr) {
err = -ENOMEM;
goto out_no_cigar;
}
- memset(ibmr, 0, sizeof(*ibmr));
-
ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
(IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_READ |
- IB_ACCESS_REMOTE_WRITE|
- IB_ACCESS_REMOTE_ATOMIC),
+ IB_ACCESS_REMOTE_WRITE),
&pool->fmr_attr);
if (IS_ERR(ibmr->fmr)) {
err = PTR_ERR(ibmr->fmr);
unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
- if (dma_addr & ~PAGE_MASK) {
+ if (dma_addr & ~rds_ibdev->fmr_page_mask) {
if (i > 0)
return -EINVAL;
else
++page_cnt;
}
- if ((dma_addr + dma_len) & ~PAGE_MASK) {
+ if ((dma_addr + dma_len) & ~rds_ibdev->fmr_page_mask) {
if (i < sg_dma_len - 1)
return -EINVAL;
else
len += dma_len;
}
- page_cnt += len >> PAGE_SHIFT;
+ page_cnt += len >> rds_ibdev->fmr_page_shift;
if (page_cnt > fmr_message_size)
return -EINVAL;
- dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
- rdsibdev_to_node(rds_ibdev));
+ dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC);
if (!dma_pages)
return -ENOMEM;
unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
- for (j = 0; j < dma_len; j += PAGE_SIZE)
+ for (j = 0; j < dma_len; j += rds_ibdev->fmr_page_size)
dma_pages[page_cnt++] =
- (dma_addr & PAGE_MASK) + j;
+ (dma_addr & rds_ibdev->fmr_page_mask) + j;
}
ret = ib_map_phys_fmr(ibmr->fmr,
/* FIXME we need a way to tell a r/w MR
* from a r/o MR */
- BUG_ON(irqs_disabled());
set_page_dirty(page);
put_page(page);
}
return 0;
}
-/*
- * given an llist of mrs, put them all into the list_head for more processing
- */
-static void llist_append_to_list(struct llist_head *llist, struct list_head *list)
-{
- struct rds_ib_mr *ibmr;
- struct llist_node *node;
- struct llist_node *next;
-
- node = llist_del_all(llist);
- while (node) {
- next = node->next;
- ibmr = llist_entry(node, struct rds_ib_mr, llnode);
- list_add_tail(&ibmr->unmap_list, list);
- node = next;
- }
-}
-
-/*
- * this takes a list head of mrs and turns it into linked llist nodes
- * of clusters. Each cluster has linked llist nodes of
- * MR_CLUSTER_SIZE mrs that are ready for reuse.
- */
-static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
- struct list_head *list,
- struct llist_node **nodes_head,
- struct llist_node **nodes_tail)
-{
- struct rds_ib_mr *ibmr;
- struct llist_node *cur = NULL;
- struct llist_node **next = nodes_head;
-
- list_for_each_entry(ibmr, list, unmap_list) {
- cur = &ibmr->llnode;
- *next = cur;
- next = &cur->next;
- }
- *next = NULL;
- *nodes_tail = cur;
-}
-
/*
* Flush our pool of MRs.
* At a minimum, all currently unused MRs are unmapped.
* If the number of MRs allocated exceeds the limit, we also try
* to free as many MRs as needed to get back to this limit.
*/
-static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
- int free_all, struct rds_ib_mr **ibmr_ret)
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
{
struct rds_ib_mr *ibmr, *next;
- struct llist_node *clean_nodes;
- struct llist_node *clean_tail;
LIST_HEAD(unmap_list);
LIST_HEAD(fmr_list);
unsigned long unpinned = 0;
+ unsigned long flags;
unsigned int nfreed = 0, ncleaned = 0, free_goal;
int ret = 0;
rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);
- if (ibmr_ret) {
- DEFINE_WAIT(wait);
- while(!mutex_trylock(&pool->flush_lock)) {
- ibmr = rds_ib_reuse_fmr(pool);
- if (ibmr) {
- *ibmr_ret = ibmr;
- finish_wait(&pool->flush_wait, &wait);
- goto out_nolock;
- }
-
- prepare_to_wait(&pool->flush_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- if (llist_empty(&pool->clean_list))
- schedule();
-
- ibmr = rds_ib_reuse_fmr(pool);
- if (ibmr) {
- *ibmr_ret = ibmr;
- finish_wait(&pool->flush_wait, &wait);
- goto out_nolock;
- }
- }
- finish_wait(&pool->flush_wait, &wait);
- } else
- mutex_lock(&pool->flush_lock);
-
- if (ibmr_ret) {
- ibmr = rds_ib_reuse_fmr(pool);
- if (ibmr) {
- *ibmr_ret = ibmr;
- goto out;
- }
- }
+ mutex_lock(&pool->flush_lock);
+ spin_lock_irqsave(&pool->list_lock, flags);
/* Get the list of all MRs to be dropped. Ordering matters -
- * we want to put drop_list ahead of free_list.
- */
- llist_append_to_list(&pool->drop_list, &unmap_list);
- llist_append_to_list(&pool->free_list, &unmap_list);
+ * we want to put drop_list ahead of free_list. */
+ list_splice_init(&pool->free_list, &unmap_list);
+ list_splice_init(&pool->drop_list, &unmap_list);
if (free_all)
- llist_append_to_list(&pool->clean_list, &unmap_list);
+ list_splice_init(&pool->clean_list, &unmap_list);
+ spin_unlock_irqrestore(&pool->list_lock, flags);
free_goal = rds_ib_flush_goal(pool, free_all);
goto out;
/* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
- list_for_each_entry(ibmr, &unmap_list, unmap_list)
+ list_for_each_entry(ibmr, &unmap_list, list)
list_add(&ibmr->fmr->list, &fmr_list);
-
ret = ib_unmap_fmr(&fmr_list);
if (ret)
printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
/* Now we can destroy the DMA mapping and unpin any pages */
- list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
+ list_for_each_entry_safe(ibmr, next, &unmap_list, list) {
unpinned += ibmr->sg_len;
__rds_ib_teardown_mr(ibmr);
if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
rds_ib_stats_inc(s_ib_rdma_mr_free);
- list_del(&ibmr->unmap_list);
+ list_del(&ibmr->list);
ib_dealloc_fmr(ibmr->fmr);
kfree(ibmr);
nfreed++;
ncleaned++;
}
- if (!list_empty(&unmap_list)) {
- /* we have to make sure that none of the things we're about
- * to put on the clean list would race with other cpus trying
- * to pull items off. The llist would explode if we managed to
- * remove something from the clean list and then add it back again
- * while another CPU was spinning on that same item in llist_del_first.
- *
- * This is pretty unlikely, but just in case wait for an llist grace period
- * here before adding anything back into the clean list.
- */
- wait_clean_list_grace();
-
- list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail);
- if (ibmr_ret)
- *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
-
- /* more than one entry in llist nodes */
- if (clean_nodes->next)
- llist_add_batch(clean_nodes->next, clean_tail, &pool->clean_list);
-
- }
+ spin_lock_irqsave(&pool->list_lock, flags);
+ list_splice(&unmap_list, &pool->clean_list);
+ spin_unlock_irqrestore(&pool->list_lock, flags);
atomic_sub(unpinned, &pool->free_pinned);
atomic_sub(ncleaned, &pool->dirty_count);
out:
mutex_unlock(&pool->flush_lock);
- if (waitqueue_active(&pool->flush_wait))
- wake_up(&pool->flush_wait);
-out_nolock:
return ret;
}
static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
{
- struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work);
+ struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker);
- rds_ib_flush_mr_pool(pool, 0, NULL);
+ rds_ib_flush_mr_pool(pool, 0);
}
void rds_ib_free_mr(void *trans_private, int invalidate)
struct rds_ib_mr *ibmr = trans_private;
struct rds_ib_device *rds_ibdev = ibmr->device;
struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
+ unsigned long flags;
rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
/* Return it to the pool's free list */
+ spin_lock_irqsave(&pool->list_lock, flags);
if (ibmr->remap_count >= pool->fmr_attr.max_maps)
- llist_add(&ibmr->llnode, &pool->drop_list);
+ list_add(&ibmr->list, &pool->drop_list);
else
- llist_add(&ibmr->llnode, &pool->free_list);
+ list_add(&ibmr->list, &pool->free_list);
atomic_add(ibmr->sg_len, &pool->free_pinned);
atomic_inc(&pool->dirty_count);
+ spin_unlock_irqrestore(&pool->list_lock, flags);
/* If we've pinned too many pages, request a flush */
- if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
- atomic_read(&pool->dirty_count) >= pool->max_items / 10)
- schedule_delayed_work(&pool->flush_worker, 10);
+ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned
+ || atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+ queue_work(rds_wq, &pool->flush_worker);
if (invalidate) {
if (likely(!in_interrupt())) {
- rds_ib_flush_mr_pool(pool, 0, NULL);
+ rds_ib_flush_mr_pool(pool, 0);
} else {
/* We get here if the user created a MR marked
* as use_once and invalidate at the same time. */
- schedule_delayed_work(&pool->flush_worker, 10);
+ queue_work(rds_wq, &pool->flush_worker);
}
}
-
- rds_ib_dev_put(rds_ibdev);
}
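The trigger condition used above can be read as a small predicate; a standalone sketch with the names simplified but the thresholds exactly as in the code:

/* Sketch of the flush heuristic in rds_ib_free_mr() above: kick the flush
 * worker once too many pages are pinned by free MRs, or once more than a
 * tenth of the pool is dirty. Illustrative only. */
static int pool_needs_flush(unsigned long free_pinned,
			    unsigned long max_free_pinned,
			    unsigned long dirty_count,
			    unsigned long max_items)
{
	return free_pinned >= max_free_pinned ||
	       dirty_count >= max_items / 10;
}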
void rds_ib_flush_mrs(void)
{
struct rds_ib_device *rds_ibdev;
- down_read(&rds_ib_devices_lock);
list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
if (pool)
- rds_ib_flush_mr_pool(pool, 0, NULL);
+ rds_ib_flush_mr_pool(pool, 0);
}
- up_read(&rds_ib_devices_lock);
}
void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
ibmr->device = rds_ibdev;
- rds_ibdev = NULL;
out:
if (ret) {
rds_ib_free_mr(ibmr, 0);
ibmr = ERR_PTR(ret);
}
- if (rds_ibdev)
- rds_ib_dev_put(rds_ibdev);
return ibmr;
}
-
*
*/
#include <linux/kernel.h>
-#include <linux/slab.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <rdma/rdma_cm.h>
static struct kmem_cache *rds_ib_frag_slab;
static atomic_t rds_ib_allocation = ATOMIC_INIT(0);
+static void rds_ib_frag_drop_page(struct rds_page_frag *frag)
+{
+ rdsdebug("frag %p page %p\n", frag, frag->f_page);
+ __free_page(frag->f_page);
+ frag->f_page = NULL;
+}
+
+static void rds_ib_frag_free(struct rds_page_frag *frag)
+{
+ rdsdebug("frag %p page %p\n", frag, frag->f_page);
+ BUG_ON(frag->f_page != NULL);
+ kmem_cache_free(rds_ib_frag_slab, frag);
+}
+
+/*
+ * We map a page at a time. Its fragments are posted in order. This
+ * is called in fragment order as the fragments get receive completion events.
+ * Only the last frag in the page performs the unmapping.
+ *
+ * It's OK for ring cleanup to call this in whatever order it likes because
+ * DMA is not in flight and so we can unmap while other ring entries still
+ * hold page references in their frags.
+ */
+static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
+ struct rds_ib_recv_work *recv)
+{
+ struct rds_page_frag *frag = recv->r_frag;
+
+ rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
+ if (frag->f_mapped)
+ ib_dma_unmap_page(ic->i_cm_id->device,
+ frag->f_mapped,
+ RDS_FRAG_SIZE, DMA_FROM_DEVICE);
+ frag->f_mapped = 0;
+}
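The comment above describes one receive page being shared by several fragments; the per-connection i_frag offset walks the page in RDS_FRAG_SIZE steps and the page reference is dropped when the last fragment has been handed out (the real work is done in rds_ib_recv_refill_one() later in this patch). A standalone illustration, with the page and fragment sizes assumed purely for the example:

/* Illustrative only: fragment offsets within one receive page.
 * EX_PAGE_SIZE and EX_FRAG_SIZE are assumed values for the example. */
#include <stdio.h>

#define EX_PAGE_SIZE      16384u  /* assume a 16K page */
#define EX_FRAG_SIZE       4096u  /* assume 4K fragments */
#define EX_PAGE_LAST_OFF  (EX_PAGE_SIZE - EX_FRAG_SIZE)

int main(void)
{
	unsigned int off = 0;

	for (;;) {
		printf("post frag at offset %u\n", off);
		if (off < EX_PAGE_LAST_OFF) {
			off += EX_FRAG_SIZE;   /* next frag shares the page */
		} else {
			printf("last frag - drop the page reference\n");
			break;                 /* a fresh page is allocated next time */
		}
	}
	return 0;
}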
+
void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
{
struct rds_ib_recv_work *recv;
recv->r_wr.sg_list = recv->r_sge;
recv->r_wr.num_sge = RDS_IB_RECV_SGE;
- sge = &recv->r_sge[0];
- sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
- sge->length = sizeof(struct rds_header);
- sge->lkey = ic->i_mr->lkey;
-
- sge = &recv->r_sge[1];
+ sge = rds_ib_data_sge(ic, recv->r_sge);
sge->addr = 0;
sge->length = RDS_FRAG_SIZE;
sge->lkey = ic->i_mr->lkey;
- }
-}
-
-/*
- * The entire 'from' list, including the from element itself, is put on
- * to the tail of the 'to' list.
- */
-static void list_splice_entire_tail(struct list_head *from,
- struct list_head *to)
-{
- struct list_head *from_last = from->prev;
-
- list_splice_tail(from_last, to);
- list_add_tail(from_last, to);
-}
-
-static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
-{
- struct list_head *tmp;
-
- tmp = xchg(&cache->xfer, NULL);
- if (tmp) {
- if (cache->ready)
- list_splice_entire_tail(tmp, cache->ready);
- else
- cache->ready = tmp;
- }
-}
-
-static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
-{
- struct rds_ib_cache_head *head;
- int cpu;
-
- cache->percpu = alloc_percpu(struct rds_ib_cache_head);
- if (!cache->percpu)
- return -ENOMEM;
-
- for_each_possible_cpu(cpu) {
- head = per_cpu_ptr(cache->percpu, cpu);
- head->first = NULL;
- head->count = 0;
- }
- cache->xfer = NULL;
- cache->ready = NULL;
-
- return 0;
-}
-
-int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic)
-{
- int ret;
-
- ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs);
- if (!ret) {
- ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags);
- if (ret)
- free_percpu(ic->i_cache_incs.percpu);
- }
-
- return ret;
-}
-static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
- struct list_head *caller_list)
-{
- struct rds_ib_cache_head *head;
- int cpu;
-
- for_each_possible_cpu(cpu) {
- head = per_cpu_ptr(cache->percpu, cpu);
- if (head->first) {
- list_splice_entire_tail(head->first, caller_list);
- head->first = NULL;
- }
- }
-
- if (cache->ready) {
- list_splice_entire_tail(cache->ready, caller_list);
- cache->ready = NULL;
- }
-}
-
-void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
-{
- struct rds_ib_incoming *inc;
- struct rds_ib_incoming *inc_tmp;
- struct rds_page_frag *frag;
- struct rds_page_frag *frag_tmp;
- LIST_HEAD(list);
-
- rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
- rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list);
- free_percpu(ic->i_cache_incs.percpu);
-
- list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) {
- list_del(&inc->ii_cache_entry);
- WARN_ON(!list_empty(&inc->ii_frags));
- kmem_cache_free(rds_ib_incoming_slab, inc);
- }
-
- rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
- rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list);
- free_percpu(ic->i_cache_frags.percpu);
-
- list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
- list_del(&frag->f_cache_entry);
- WARN_ON(!list_empty(&frag->f_item));
- kmem_cache_free(rds_ib_frag_slab, frag);
- }
-}
-
-/* fwd decl */
-static void rds_ib_recv_cache_put(struct list_head *new_item,
- struct rds_ib_refill_cache *cache);
-static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache);
-
-
-/* Recycle frag and attached recv buffer f_sg */
-static void rds_ib_frag_free(struct rds_ib_connection *ic,
- struct rds_page_frag *frag)
-{
- rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
-
- rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
-}
-
-/* Recycle inc after freeing attached frags */
-void rds_ib_inc_free(struct rds_incoming *inc)
-{
- struct rds_ib_incoming *ibinc;
- struct rds_page_frag *frag;
- struct rds_page_frag *pos;
- struct rds_ib_connection *ic = inc->i_conn->c_transport_data;
-
- ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
-
- /* Free attached frags */
- list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
- list_del_init(&frag->f_item);
- rds_ib_frag_free(ic, frag);
+ sge = rds_ib_header_sge(ic, recv->r_sge);
+ sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = ic->i_mr->lkey;
}
- BUG_ON(!list_empty(&ibinc->ii_frags));
-
- rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
- rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs);
}
static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
recv->r_ibinc = NULL;
}
if (recv->r_frag) {
- ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
- rds_ib_frag_free(ic, recv->r_frag);
+ rds_ib_recv_unmap_page(ic, recv);
+ if (recv->r_frag->f_page)
+ rds_ib_frag_drop_page(recv->r_frag);
+ rds_ib_frag_free(recv->r_frag);
recv->r_frag = NULL;
}
}
for (i = 0; i < ic->i_recv_ring.w_nr; i++)
rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
-}
-static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic,
- gfp_t slab_mask)
-{
- struct rds_ib_incoming *ibinc;
- struct list_head *cache_item;
- int avail_allocs;
-
- cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs);
- if (cache_item) {
- ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry);
- } else {
- avail_allocs = atomic_add_unless(&rds_ib_allocation,
- 1, rds_ib_sysctl_max_recv_allocation);
- if (!avail_allocs) {
- rds_ib_stats_inc(s_ib_rx_alloc_limit);
- return NULL;
- }
- ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask);
- if (!ibinc) {
- atomic_dec(&rds_ib_allocation);
- return NULL;
- }
- }
- INIT_LIST_HEAD(&ibinc->ii_frags);
- rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
-
- return ibinc;
-}
-
-static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic,
- gfp_t slab_mask, gfp_t page_mask)
-{
- struct rds_page_frag *frag;
- struct list_head *cache_item;
- int ret;
-
- cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
- if (cache_item) {
- frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
- } else {
- frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
- if (!frag)
- return NULL;
-
- sg_init_table(&frag->f_sg, 1);
- ret = rds_page_remainder_alloc(&frag->f_sg,
- RDS_FRAG_SIZE, page_mask);
- if (ret) {
- kmem_cache_free(rds_ib_frag_slab, frag);
- return NULL;
- }
- }
-
- INIT_LIST_HEAD(&frag->f_item);
-
- return frag;
+ if (ic->i_frag.f_page)
+ rds_ib_frag_drop_page(&ic->i_frag);
}
static int rds_ib_recv_refill_one(struct rds_connection *conn,
- struct rds_ib_recv_work *recv, int prefill)
+ struct rds_ib_recv_work *recv,
+ gfp_t kptr_gfp, gfp_t page_gfp)
{
struct rds_ib_connection *ic = conn->c_transport_data;
+ dma_addr_t dma_addr;
struct ib_sge *sge;
int ret = -ENOMEM;
- gfp_t slab_mask = GFP_NOWAIT;
- gfp_t page_mask = GFP_NOWAIT;
- if (prefill) {
- slab_mask = GFP_KERNEL;
- page_mask = GFP_HIGHUSER;
+ if (recv->r_ibinc == NULL) {
+ if (atomic_read(&rds_ib_allocation) >= rds_ib_sysctl_max_recv_allocation) {
+ rds_ib_stats_inc(s_ib_rx_alloc_limit);
+ goto out;
+ }
+ recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab,
+ kptr_gfp);
+ if (recv->r_ibinc == NULL)
+ goto out;
+ atomic_inc(&rds_ib_allocation);
+ INIT_LIST_HEAD(&recv->r_ibinc->ii_frags);
+ rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
}
- if (!ic->i_cache_incs.ready)
- rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
- if (!ic->i_cache_frags.ready)
- rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
+ if (recv->r_frag == NULL) {
+ recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp);
+ if (recv->r_frag == NULL)
+ goto out;
+ INIT_LIST_HEAD(&recv->r_frag->f_item);
+ recv->r_frag->f_page = NULL;
+ }
- /*
- * ibinc was taken from recv if recv contained the start of a message.
- * recvs that were continuations will still have this allocated.
- */
- if (!recv->r_ibinc) {
- recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask);
- if (!recv->r_ibinc)
+ if (ic->i_frag.f_page == NULL) {
+ ic->i_frag.f_page = alloc_page(page_gfp);
+ if (ic->i_frag.f_page == NULL)
goto out;
+ ic->i_frag.f_offset = 0;
}
- WARN_ON(recv->r_frag); /* leak! */
- recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask);
- if (!recv->r_frag)
+ dma_addr = ib_dma_map_page(ic->i_cm_id->device,
+ ic->i_frag.f_page,
+ ic->i_frag.f_offset,
+ RDS_FRAG_SIZE,
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
goto out;
- ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
- 1, DMA_FROM_DEVICE);
- WARN_ON(ret != 1);
+ /*
+ * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_recv_unmap_page()
+ * must be called on this recv. This happens as completions hit
+ * in order or on connection shutdown.
+ */
+ recv->r_frag->f_page = ic->i_frag.f_page;
+ recv->r_frag->f_offset = ic->i_frag.f_offset;
+ recv->r_frag->f_mapped = dma_addr;
- sge = &recv->r_sge[0];
+ sge = rds_ib_data_sge(ic, recv->r_sge);
+ sge->addr = dma_addr;
+ sge->length = RDS_FRAG_SIZE;
+
+ sge = rds_ib_header_sge(ic, recv->r_sge);
sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
sge->length = sizeof(struct rds_header);
- sge = &recv->r_sge[1];
- sge->addr = ib_sg_dma_address(ic->i_cm_id->device, &recv->r_frag->f_sg);
- sge->length = ib_sg_dma_len(ic->i_cm_id->device, &recv->r_frag->f_sg);
+ get_page(recv->r_frag->f_page);
+
+ if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
+ ic->i_frag.f_offset += RDS_FRAG_SIZE;
+ } else {
+ put_page(ic->i_frag.f_page);
+ ic->i_frag.f_page = NULL;
+ ic->i_frag.f_offset = 0;
+ }
ret = 0;
out:
/*
* This tries to allocate and post unused work requests after making sure that
* they have all the allocations they need to queue received fragments into
- * sockets.
+ * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
+ * pairs don't go unmatched.
*
* -1 is returned if posting fails due to temporary resource exhaustion.
*/
-void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
+int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
+ gfp_t page_gfp, int prefill)
{
struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_ib_recv_work *recv;
int ret = 0;
u32 pos;
- while ((prefill || rds_conn_up(conn)) &&
- rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
+ while ((prefill || rds_conn_up(conn))
+ && rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
if (pos >= ic->i_recv_ring.w_nr) {
printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
pos);
+ ret = -EINVAL;
break;
}
recv = &ic->i_recvs[pos];
- ret = rds_ib_recv_refill_one(conn, recv, prefill);
+ ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
if (ret) {
+ ret = -1;
break;
}
/* XXX when can this fail? */
ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
- recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
- (long) ib_sg_dma_address(
- ic->i_cm_id->device,
- &recv->r_frag->f_sg),
- ret);
+ recv->r_ibinc, recv->r_frag->f_page,
+ (long) recv->r_frag->f_mapped, ret);
if (ret) {
rds_ib_conn_error(conn, "recv post on "
"%pI4 returned %d, disconnecting and "
"reconnecting\n", &conn->c_faddr,
ret);
+ ret = -1;
break;
}
if (ret)
rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
+ return ret;
}
-/*
- * We want to recycle several types of recv allocations, like incs and frags.
- * To use this, the *_free() function passes in the ptr to a list_head within
- * the recyclee, as well as the cache to put it on.
- *
- * First, we put the memory on a percpu list. When this reaches a certain size,
- * We move it to an intermediate non-percpu list in a lockless manner, with some
- * xchg/compxchg wizardry.
- *
- * N.B. Instead of a list_head as the anchor, we use a single pointer, which can
- * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and
- * list_empty() will return true with one element is actually present.
- */
-static void rds_ib_recv_cache_put(struct list_head *new_item,
- struct rds_ib_refill_cache *cache)
+void rds_ib_inc_purge(struct rds_incoming *inc)
{
- unsigned long flags;
- struct list_head *old, *chpfirst;
-
- local_irq_save(flags);
-
- chpfirst = __this_cpu_read(cache->percpu->first);
- if (!chpfirst)
- INIT_LIST_HEAD(new_item);
- else /* put on front */
- list_add_tail(new_item, chpfirst);
-
- __this_cpu_write(cache->percpu->first, new_item);
- __this_cpu_inc(cache->percpu->count);
+ struct rds_ib_incoming *ibinc;
+ struct rds_page_frag *frag;
+ struct rds_page_frag *pos;
- if (__this_cpu_read(cache->percpu->count) < RDS_IB_RECYCLE_BATCH_COUNT)
- goto end;
+ ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+ rdsdebug("purging ibinc %p inc %p\n", ibinc, inc);
- /*
- * Return our per-cpu first list to the cache's xfer by atomically
- * grabbing the current xfer list, appending it to our per-cpu list,
- * and then atomically returning that entire list back to the
- * cache's xfer list as long as it's still empty.
- */
- do {
- old = xchg(&cache->xfer, NULL);
- if (old)
- list_splice_entire_tail(old, chpfirst);
- old = cmpxchg(&cache->xfer, NULL, chpfirst);
- } while (old);
-
-
- __this_cpu_write(cache->percpu->first, NULL);
- __this_cpu_write(cache->percpu->count, 0);
-end:
- local_irq_restore(flags);
+ list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
+ list_del_init(&frag->f_item);
+ rds_ib_frag_drop_page(frag);
+ rds_ib_frag_free(frag);
+ }
}
-static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache)
+void rds_ib_inc_free(struct rds_incoming *inc)
{
- struct list_head *head = cache->ready;
-
- if (head) {
- if (!list_empty(head)) {
- cache->ready = head->next;
- list_del_init(head);
- } else
- cache->ready = NULL;
- }
+ struct rds_ib_incoming *ibinc;
+
+ ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
- return head;
+ rds_ib_inc_purge(inc);
+ rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
+ BUG_ON(!list_empty(&ibinc->ii_frags));
+ kmem_cache_free(rds_ib_incoming_slab, ibinc);
+ atomic_dec(&rds_ib_allocation);
+ BUG_ON(atomic_read(&rds_ib_allocation) < 0);
}
-int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
+int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
+ size_t size)
{
struct rds_ib_incoming *ibinc;
struct rds_page_frag *frag;
+ struct iovec *iov = first_iov;
unsigned long to_copy;
unsigned long frag_off = 0;
+ unsigned long iov_off = 0;
int copied = 0;
int ret;
u32 len;
frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
len = be32_to_cpu(inc->i_hdr.h_len);
- while (iov_iter_count(to) && copied < len) {
+ while (copied < size && copied < len) {
if (frag_off == RDS_FRAG_SIZE) {
frag = list_entry(frag->f_item.next,
struct rds_page_frag, f_item);
frag_off = 0;
}
- to_copy = min_t(unsigned long, iov_iter_count(to),
- RDS_FRAG_SIZE - frag_off);
+ while (iov_off == iov->iov_len) {
+ iov_off = 0;
+ iov++;
+ }
+
+ to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
+ to_copy = min_t(size_t, to_copy, size - copied);
to_copy = min_t(unsigned long, to_copy, len - copied);
+ rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
+ "[%p, %lu] + %lu\n",
+ to_copy, iov->iov_base, iov->iov_len, iov_off,
+ frag->f_page, frag->f_offset, frag_off);
+
/* XXX needs + offset for multiple recvs per page */
- rds_stats_add(s_copy_to_user, to_copy);
- ret = copy_page_to_iter(sg_page(&frag->f_sg),
- frag->f_sg.offset + frag_off,
- to_copy,
- to);
- if (ret != to_copy)
- return -EFAULT;
+ ret = rds_page_copy_to_user(frag->f_page,
+ frag->f_offset + frag_off,
+ iov->iov_base + iov_off,
+ to_copy);
+ if (ret) {
+ copied = ret;
+ break;
+ }
+ iov_off += to_copy;
frag_off += to_copy;
copied += to_copy;
}
{
atomic64_set(&ic->i_ack_next, seq);
if (ack_required) {
- smp_mb__before_atomic();
+ smp_mb__before_clear_bit();
set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
}
}
static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
{
clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
- smp_mb__after_atomic();
+ smp_mb__after_clear_bit();
return atomic64_read(&ic->i_ack_next);
}
set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
rds_ib_stats_inc(s_ib_ack_send_failure);
-
- rds_ib_conn_error(ic->conn, "sending ack failed\n");
+ /* Need to finesse this later. */
+ BUG();
} else
rds_ib_stats_inc(s_ib_ack_sent);
}
}
/* Can we get a send credit? */
- if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
+ if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0)) {
rds_ib_stats_inc(s_ib_tx_throttle);
clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
return;
to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
- addr = kmap_atomic(sg_page(&frag->f_sg));
+ addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
src = addr + frag_off;
dst = (void *)map->m_page_addrs[map_page] + map_off;
uncongested |= ~(*src) & *dst;
*dst++ = *src++;
}
- kunmap_atomic(addr);
+ kunmap_atomic(addr, KM_SOFTIRQ0);
copied += to_copy;
};
static void rds_ib_process_recv(struct rds_connection *conn,
- struct rds_ib_recv_work *recv, u32 data_len,
+ struct rds_ib_recv_work *recv, u32 byte_len,
struct rds_ib_ack_state *state)
{
struct rds_ib_connection *ic = conn->c_transport_data;
/* XXX shut down the connection if port 0,0 are seen? */
rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv,
- data_len);
+ byte_len);
- if (data_len < sizeof(struct rds_header)) {
+ if (byte_len < sizeof(struct rds_header)) {
rds_ib_conn_error(conn, "incoming message "
- "from %pI4 didn't include a "
+ "from %pI4 didn't inclue a "
"header, disconnecting and "
"reconnecting\n",
&conn->c_faddr);
return;
}
- data_len -= sizeof(struct rds_header);
+ byte_len -= sizeof(struct rds_header);
ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
if (ihdr->h_credit)
rds_ib_send_add_credits(conn, ihdr->h_credit);
- if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) {
+ if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
/* This is an ACK-only packet. The fact that it gets
* special treatment here is that historically, ACKs
* were rather special beasts.
* the inc is freed. We don't go that route, so we have to drop the
* page ref ourselves. We can't just leave the page on the recv
* because that confuses the dma mapping of pages and each recv's use
- * of a partial page.
+ * of a partial page. We can leave the frag, though; it will be
+ * reused.
*
* FIXME: Fold this into the code path below.
*/
- rds_ib_frag_free(ic, recv->r_frag);
- recv->r_frag = NULL;
+ rds_ib_frag_drop_page(recv->r_frag);
return;
}
* into the inc and save the inc so we can hang upcoming fragments
* off its list.
*/
- if (!ibinc) {
+ if (ibinc == NULL) {
ibinc = recv->r_ibinc;
recv->r_ibinc = NULL;
ic->i_ibinc = ibinc;
hdr = &ibinc->ii_inc.i_hdr;
/* We can't just use memcmp here; fragments of a
* single message may carry different ACKs */
- if (hdr->h_sequence != ihdr->h_sequence ||
- hdr->h_len != ihdr->h_len ||
- hdr->h_sport != ihdr->h_sport ||
- hdr->h_dport != ihdr->h_dport) {
+ if (hdr->h_sequence != ihdr->h_sequence
+ || hdr->h_len != ihdr->h_len
+ || hdr->h_sport != ihdr->h_sport
+ || hdr->h_dport != ihdr->h_dport) {
rds_ib_conn_error(conn,
"fragment header mismatch; forcing reconnect\n");
return;
rds_ib_cong_recv(conn, ibinc);
else {
rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
- &ibinc->ii_inc, GFP_ATOMIC);
+ &ibinc->ii_inc, GFP_ATOMIC,
+ KM_SOFTIRQ0);
state->ack_next = be64_to_cpu(hdr->h_sequence);
state->ack_next_valid = 1;
}
{
struct rds_connection *conn = context;
struct rds_ib_connection *ic = conn->c_transport_data;
+ struct ib_wc wc;
+ struct rds_ib_ack_state state = { 0, };
+ struct rds_ib_recv_work *recv;
rdsdebug("conn %p cq %p\n", conn, cq);
rds_ib_stats_inc(s_ib_rx_cq_call);
- tasklet_schedule(&ic->i_recv_tasklet);
-}
+ ib_req_notify_cq(cq, IB_CQ_SOLICITED);
-static inline void rds_poll_cq(struct rds_ib_connection *ic,
- struct rds_ib_ack_state *state)
-{
- struct rds_connection *conn = ic->conn;
- struct ib_wc wc;
- struct rds_ib_recv_work *recv;
-
- while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
- rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
- (unsigned long long)wc.wr_id, wc.status,
- rds_ib_wc_status_str(wc.status), wc.byte_len,
+ while (ib_poll_cq(cq, 1, &wc) > 0) {
+ rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+ (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
be32_to_cpu(wc.ex.imm_data));
rds_ib_stats_inc(s_ib_rx_cq_event);
recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
- ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
+ rds_ib_recv_unmap_page(ic, recv);
/*
* Also process recvs in connecting state because it is possible
* to get a recv completion _before_ the rdmacm ESTABLISHED
* event is processed.
*/
- if (wc.status == IB_WC_SUCCESS) {
- rds_ib_process_recv(conn, recv, wc.byte_len, state);
- } else {
+ if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
/* We expect errors as the qp is drained during shutdown */
- if (rds_conn_up(conn) || rds_conn_connecting(conn))
- rds_ib_conn_error(conn, "recv completion on %pI4 had "
- "status %u (%s), disconnecting and "
- "reconnecting\n", &conn->c_faddr,
- wc.status,
- rds_ib_wc_status_str(wc.status));
+ if (wc.status == IB_WC_SUCCESS) {
+ rds_ib_process_recv(conn, recv, wc.byte_len, &state);
+ } else {
+ rds_ib_conn_error(conn, "recv completion on "
+ "%pI4 had status %u, disconnecting and "
+ "reconnecting\n", &conn->c_faddr,
+ wc.status);
+ }
}
- /*
- * It's very important that we only free this ring entry if we've truly
- * freed the resources allocated to the entry. The refilling path can
- * leak if we don't.
- */
rds_ib_ring_free(&ic->i_recv_ring, 1);
}
-}
-
-void rds_ib_recv_tasklet_fn(unsigned long data)
-{
- struct rds_ib_connection *ic = (struct rds_ib_connection *) data;
- struct rds_connection *conn = ic->conn;
- struct rds_ib_ack_state state = { 0, };
-
- rds_poll_cq(ic, &state);
- ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
- rds_poll_cq(ic, &state);
if (state.ack_next_valid)
rds_ib_set_ack(ic, state.ack_next, state.ack_required);
if (rds_ib_ring_empty(&ic->i_recv_ring))
rds_ib_stats_inc(s_ib_rx_ring_empty);
+ /*
+ * If the ring is running low, then schedule the thread to refill.
+ */
if (rds_ib_ring_low(&ic->i_recv_ring))
- rds_ib_recv_refill(conn, 0);
+ queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
}
int rds_ib_recv(struct rds_connection *conn)
int ret = 0;
rdsdebug("conn %p\n", conn);
+
+ /*
+ * If we get a temporary posting failure in this context then
+ * we're really low and we want the caller to back off for a bit.
+ */
+ mutex_lock(&ic->i_recv_mutex);
+ if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
+ ret = -ENOMEM;
+ else
+ rds_ib_stats_inc(s_ib_rx_refill_from_thread);
+ mutex_unlock(&ic->i_recv_mutex);
+
if (rds_conn_up(conn))
rds_ib_attempt_ack(ic);
return ret;
}
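The -ENOMEM return above is the "back off for a bit" signal described in the comment; a hypothetical caller might react by requeueing itself with a delay. This is a kernel-style sketch, not part of the patch and not compilable on its own; it reuses rds_wq and c_recv_w, which appear elsewhere in this patch, and the delay value is arbitrary:

/* Hypothetical caller sketch: requeue the recv work with a delay instead of
 * retrying immediately when refill reports temporary exhaustion. */
static void recv_worker_sketch(struct rds_connection *conn)
{
	if (rds_ib_recv(conn) == -ENOMEM)
		queue_delayed_work(rds_wq, &conn->c_recv_w, HZ / 10);
}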
-int rds_ib_recv_init(void)
+int __init rds_ib_recv_init(void)
{
struct sysinfo si;
int ret = -ENOMEM;
rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
sizeof(struct rds_ib_incoming),
- 0, SLAB_HWCACHE_ALIGN, NULL);
- if (!rds_ib_incoming_slab)
+ 0, 0, NULL);
+ if (rds_ib_incoming_slab == NULL)
goto out;
rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
sizeof(struct rds_page_frag),
- 0, SLAB_HWCACHE_ALIGN, NULL);
- if (!rds_ib_frag_slab)
+ 0, 0, NULL);
+ if (rds_ib_frag_slab == NULL)
kmem_cache_destroy(rds_ib_incoming_slab);
else
ret = 0;
int rds_ib_ring_low(struct rds_ib_work_ring *ring)
{
- return __rds_ib_ring_used(ring) <= (ring->w_nr >> 1);
+ return __rds_ib_ring_used(ring) <= (ring->w_nr >> 2);
}
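With a concrete ring size the two watermarks that appear above are easy to compare; a throwaway illustration (the ring size 256 is arbitrary):

/* Illustrative only: the "ring low" refill watermark for an example ring. */
#include <stdio.h>

int main(void)
{
	unsigned int w_nr = 256;	/* arbitrary example ring size */

	printf("w_nr >> 1 = %u entries (half)\n", w_nr >> 1);
	printf("w_nr >> 2 = %u entries (quarter)\n", w_nr >> 2);
	return 0;
}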
/*
#include <linux/in.h>
#include <linux/device.h>
#include <linux/dmapool.h>
-#include <linux/ratelimit.h>
#include "rds.h"
+#include "rdma.h"
#include "ib.h"
-static char *rds_ib_wc_status_strings[] = {
-#define RDS_IB_WC_STATUS_STR(foo) \
- [IB_WC_##foo] = __stringify(IB_WC_##foo)
- RDS_IB_WC_STATUS_STR(SUCCESS),
- RDS_IB_WC_STATUS_STR(LOC_LEN_ERR),
- RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR),
- RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR),
- RDS_IB_WC_STATUS_STR(LOC_PROT_ERR),
- RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR),
- RDS_IB_WC_STATUS_STR(MW_BIND_ERR),
- RDS_IB_WC_STATUS_STR(BAD_RESP_ERR),
- RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR),
- RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR),
- RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR),
- RDS_IB_WC_STATUS_STR(REM_OP_ERR),
- RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR),
- RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR),
- RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR),
- RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR),
- RDS_IB_WC_STATUS_STR(REM_ABORT_ERR),
- RDS_IB_WC_STATUS_STR(INV_EECN_ERR),
- RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR),
- RDS_IB_WC_STATUS_STR(FATAL_ERR),
- RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR),
- RDS_IB_WC_STATUS_STR(GENERAL_ERR),
-#undef RDS_IB_WC_STATUS_STR
-};
-
-char *rds_ib_wc_status_str(enum ib_wc_status status)
-{
- return rds_str_array(rds_ib_wc_status_strings,
- ARRAY_SIZE(rds_ib_wc_status_strings), status);
-}
-
-/*
- * Convert IB-specific error message to RDS error message and call core
- * completion handler.
- */
-static void rds_ib_send_complete(struct rds_message *rm,
- int wc_status,
- void (*complete)(struct rds_message *rm, int status))
+static void rds_ib_send_rdma_complete(struct rds_message *rm,
+ int wc_status)
{
int notify_status;
notify_status = RDS_RDMA_OTHER_ERROR;
break;
}
- complete(rm, notify_status);
-}
-
-static void rds_ib_send_unmap_data(struct rds_ib_connection *ic,
- struct rm_data_op *op,
- int wc_status)
-{
- if (op->op_nents)
- ib_dma_unmap_sg(ic->i_cm_id->device,
- op->op_sg, op->op_nents,
- DMA_TO_DEVICE);
+ rds_rdma_send_complete(rm, notify_status);
}
static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
- struct rm_rdma_op *op,
- int wc_status)
+ struct rds_rdma_op *op)
{
- if (op->op_mapped) {
+ if (op->r_mapped) {
ib_dma_unmap_sg(ic->i_cm_id->device,
- op->op_sg, op->op_nents,
- op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
- op->op_mapped = 0;
+ op->r_sg, op->r_nents,
+ op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ op->r_mapped = 0;
}
-
- /* If the user asked for a completion notification on this
- * message, we can implement three different semantics:
- * 1. Notify when we received the ACK on the RDS message
- * that was queued with the RDMA. This provides reliable
- * notification of RDMA status at the expense of a one-way
- * packet delay.
- * 2. Notify when the IB stack gives us the completion event for
- * the RDMA operation.
- * 3. Notify when the IB stack gives us the completion event for
- * the accompanying RDS messages.
- * Here, we implement approach #3. To implement approach #2,
- * we would need to take an event for the rdma WR. To implement #1,
- * don't call rds_rdma_send_complete at all, and fall back to the notify
- * handling in the ACK processing code.
- *
- * Note: There's no need to explicitly sync any RDMA buffers using
- * ib_dma_sync_sg_for_cpu - the completion for the RDMA
- * operation itself unmapped the RDMA buffers, which takes care
- * of synching.
- */
- rds_ib_send_complete(container_of(op, struct rds_message, rdma),
- wc_status, rds_rdma_send_complete);
-
- if (op->op_write)
- rds_stats_add(s_send_rdma_bytes, op->op_bytes);
- else
- rds_stats_add(s_recv_rdma_bytes, op->op_bytes);
}
-static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic,
- struct rm_atomic_op *op,
- int wc_status)
+static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
+ struct rds_ib_send_work *send,
+ int wc_status)
{
- /* unmap atomic recvbuf */
- if (op->op_mapped) {
- ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1,
- DMA_FROM_DEVICE);
- op->op_mapped = 0;
- }
-
- rds_ib_send_complete(container_of(op, struct rds_message, atomic),
- wc_status, rds_atomic_send_complete);
-
- if (op->op_type == RDS_ATOMIC_TYPE_CSWP)
- rds_ib_stats_inc(s_ib_atomic_cswp);
- else
- rds_ib_stats_inc(s_ib_atomic_fadd);
-}
+ struct rds_message *rm = send->s_rm;
+
+ rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
+
+ ib_dma_unmap_sg(ic->i_cm_id->device,
+ rm->m_sg, rm->m_nents,
+ DMA_TO_DEVICE);
+
+ if (rm->m_rdma_op != NULL) {
+ rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
+
+ /* If the user asked for a completion notification on this
+ * message, we can implement three different semantics:
+ * 1. Notify when we received the ACK on the RDS message
+ * that was queued with the RDMA. This provides reliable
+ * notification of RDMA status at the expense of a one-way
+ * packet delay.
+ * 2. Notify when the IB stack gives us the completion event for
+ * the RDMA operation.
+ * 3. Notify when the IB stack gives us the completion event for
+ * the accompanying RDS messages.
+ * Here, we implement approach #3. To implement approach #2,
+ * call rds_rdma_send_complete from the cq_handler. To implement #1,
+ * don't call rds_rdma_send_complete at all, and fall back to the notify
+ * handling in the ACK processing code.
+ *
+ * Note: There's no need to explicitly sync any RDMA buffers using
+ * ib_dma_sync_sg_for_cpu - the completion for the RDMA
+ * operation itself unmapped the RDMA buffers, which takes care
+ * of synching.
+ */
+ rds_ib_send_rdma_complete(rm, wc_status);
-/*
- * Unmap the resources associated with a struct send_work.
- *
- * Returns the rm for no good reason other than it is unobtainable
- * other than by switching on wr.opcode, currently, and the caller,
- * the event handler, needs it.
- */
-static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic,
- struct rds_ib_send_work *send,
- int wc_status)
-{
- struct rds_message *rm = NULL;
-
- /* In the error case, wc.opcode sometimes contains garbage */
- switch (send->s_wr.opcode) {
- case IB_WR_SEND:
- if (send->s_op) {
- rm = container_of(send->s_op, struct rds_message, data);
- rds_ib_send_unmap_data(ic, send->s_op, wc_status);
- }
- break;
- case IB_WR_RDMA_WRITE:
- case IB_WR_RDMA_READ:
- if (send->s_op) {
- rm = container_of(send->s_op, struct rds_message, rdma);
- rds_ib_send_unmap_rdma(ic, send->s_op, wc_status);
- }
- break;
- case IB_WR_ATOMIC_FETCH_AND_ADD:
- case IB_WR_ATOMIC_CMP_AND_SWP:
- if (send->s_op) {
- rm = container_of(send->s_op, struct rds_message, atomic);
- rds_ib_send_unmap_atomic(ic, send->s_op, wc_status);
- }
- break;
- default:
- printk_ratelimited(KERN_NOTICE
- "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
- __func__, send->s_wr.opcode);
- break;
+ if (rm->m_rdma_op->r_write)
+ rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
+ else
+ rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
}
- send->s_wr.opcode = 0xdead;
+ /* If anyone waited for this message to get flushed out, wake
+ * them up now */
+ rds_message_unmapped(rm);
- return rm;
+ rds_message_put(rm);
+ send->s_rm = NULL;
}
void rds_ib_send_init_ring(struct rds_ib_connection *ic)
for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
struct ib_sge *sge;
+ send->s_rm = NULL;
send->s_op = NULL;
send->s_wr.wr_id = i;
send->s_wr.sg_list = send->s_sge;
+ send->s_wr.num_sge = 1;
+ send->s_wr.opcode = IB_WR_SEND;
+ send->s_wr.send_flags = 0;
send->s_wr.ex.imm_data = 0;
- sge = &send->s_sge[0];
+ sge = rds_ib_data_sge(ic, send->s_sge);
+ sge->lkey = ic->i_mr->lkey;
+
+ sge = rds_ib_header_sge(ic, send->s_sge);
sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
sge->length = sizeof(struct rds_header);
sge->lkey = ic->i_mr->lkey;
-
- send->s_sge[1].lkey = ic->i_mr->lkey;
}
}
u32 i;
for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
- if (send->s_op && send->s_wr.opcode != 0xdead)
- rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR);
+ if (send->s_wr.opcode == 0xdead)
+ continue;
+ if (send->s_rm)
+ rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
+ if (send->s_op)
+ rds_ib_send_unmap_rdma(ic, send->s_op);
}
}
-/*
- * The only fast path caller always has a non-zero nr, so we don't
- * bother testing nr before performing the atomic sub.
- */
-static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr)
-{
- if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) &&
- waitqueue_active(&rds_ib_ring_empty_wait))
- wake_up(&rds_ib_ring_empty_wait);
- BUG_ON(atomic_read(&ic->i_signaled_sends) < 0);
-}
-
/*
* The _oldest/_free ring operations here race cleanly with the alloc/unalloc
* operations performed in the send path. As the sender allocs and potentially
{
struct rds_connection *conn = context;
struct rds_ib_connection *ic = conn->c_transport_data;
- struct rds_message *rm = NULL;
struct ib_wc wc;
struct rds_ib_send_work *send;
u32 completed;
u32 oldest;
u32 i = 0;
int ret;
- int nr_sig = 0;
rdsdebug("cq %p conn %p\n", cq, conn);
rds_ib_stats_inc(s_ib_tx_cq_call);
rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
while (ib_poll_cq(cq, 1, &wc) > 0) {
- rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
- (unsigned long long)wc.wr_id, wc.status,
- rds_ib_wc_status_str(wc.status), wc.byte_len,
+ rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+ (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
be32_to_cpu(wc.ex.imm_data));
rds_ib_stats_inc(s_ib_tx_cq_event);
if (wc.wr_id == RDS_IB_ACK_WR_ID) {
- if (time_after(jiffies, ic->i_ack_queued + HZ/2))
+ if (ic->i_ack_queued + HZ/2 < jiffies)
rds_ib_stats_inc(s_ib_tx_stalled);
rds_ib_ack_send_complete(ic);
continue;
for (i = 0; i < completed; i++) {
send = &ic->i_sends[oldest];
- if (send->s_wr.send_flags & IB_SEND_SIGNALED)
- nr_sig++;
- rm = rds_ib_send_unmap_op(ic, send, wc.status);
+ /* In the error case, wc.opcode sometimes contains garbage */
+ switch (send->s_wr.opcode) {
+ case IB_WR_SEND:
+ if (send->s_rm)
+ rds_ib_send_unmap_rm(ic, send, wc.status);
+ break;
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_READ:
+ /* Nothing to be done - the SG list will be unmapped
+ * when the SEND completes. */
+ break;
+ default:
+ if (printk_ratelimit())
+ printk(KERN_NOTICE
+ "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
+ __func__, send->s_wr.opcode);
+ break;
+ }
- if (time_after(jiffies, send->s_queued + HZ/2))
+ send->s_wr.opcode = 0xdead;
+ send->s_wr.num_sge = 1;
+ if (send->s_queued + HZ/2 < jiffies)
rds_ib_stats_inc(s_ib_tx_stalled);
- if (send->s_op) {
- if (send->s_op == rm->m_final_op) {
- /* If anyone waited for this message to get flushed out, wake
- * them up now */
- rds_message_unmapped(rm);
- }
- rds_message_put(rm);
- send->s_op = NULL;
+ /* If a RDMA operation produced an error, signal this right
+ * away. If we don't, the subsequent SEND that goes with this
+ * RDMA will be canceled with ERR_WFLUSH, and the application
+ * will never learn that the RDMA failed. */
+ if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
+ struct rds_message *rm;
+
+ rm = rds_send_get_message(conn, send->s_op);
+ if (rm)
+ rds_ib_send_rdma_complete(rm, wc.status);
}
oldest = (oldest + 1) % ic->i_send_ring.w_nr;
}
rds_ib_ring_free(&ic->i_send_ring, completed);
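/*
 * A rough standalone model (not the kernel code) of the send-ring
 * bookkeeping used above: the completion handler walks the "completed"
 * descriptors starting from the oldest allocated slot, then hands them
 * back in one batch.  Field names and counters are assumptions; the
 * real ring uses atomics so this free side races cleanly with the
 * allocating send path.
 */
#include <stdint.h>

struct work_ring {
	uint32_t w_nr;		/* number of slots in the ring */
	uint32_t alloc_ptr;	/* next slot the sender will hand out */
	uint32_t free_ctr;	/* slots currently free */
};

/* oldest outstanding slot = the one allocated longest ago */
static uint32_t ring_oldest(const struct work_ring *r)
{
	uint32_t used = r->w_nr - r->free_ctr;

	return (r->alloc_ptr + r->w_nr - used) % r->w_nr;
}

/* completion side: return a batch of drained slots to the sender */
static void ring_free(struct work_ring *r, uint32_t n)
{
	r->free_ctr += n;
}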
- rds_ib_sub_signaled(ic, nr_sig);
- nr_sig = 0;
- if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
- test_bit(0, &conn->c_map_queued))
+ if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)
+ || test_bit(0, &conn->c_map_queued))
queue_delayed_work(rds_wq, &conn->c_send_w, 0);
/* We expect errors as the qp is drained during shutdown */
if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
- rds_ib_conn_error(conn, "send completion on %pI4 had status "
- "%u (%s), disconnecting and reconnecting\n",
- &conn->c_faddr, wc.status,
- rds_ib_wc_status_str(wc.status));
+ rds_ib_conn_error(conn,
+ "send completion on %pI4 "
+ "had status %u, disconnecting and reconnecting\n",
+ &conn->c_faddr, wc.status);
}
}
}
*
* Conceptually, we have two counters:
* - send credits: this tells us how many WRs we're allowed
- * to submit without overruning the receiver's queue. For
+ * to submit without overrunning the receiver's queue. For
* each SEND WR we post, we decrement this by one.
*
* - posted credits: this tells us how many WRs we recently
* credits (see rds_ib_send_add_credits below).
*
* The RDS send code is essentially single-threaded; rds_send_xmit
- * sets RDS_IN_XMIT to ensure exclusive access to the send ring.
+ * grabs c_send_lock to ensure exclusive access to the send ring.
* However, the ACK sending code is independent and can race with
* message SENDs.
*
* and using atomic_cmpxchg when updating the two counters.
*/
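/*
 * Standalone sketch (not kernel code) of the credit scheme described
 * above: both counters share one 32-bit word.  The split used here --
 * send credits in the low 16 bits, posted credits in the high 16 bits --
 * is an assumption mirroring the IB_GET/SET_*_CREDITS helpers, and the
 * "reserve the last credit for a credit update" rule is omitted.  Taking
 * send credits and claiming posted credits to advertise happens in one
 * compare-and-swap, retried on contention.
 */
#include <stdint.h>
#include <stdatomic.h>

#define GET_SEND(v)	((uint32_t)(v) & 0xffff)
#define GET_POST(v)	((uint32_t)(v) >> 16)
#define SET_POST(v)	((uint32_t)(v) << 16)

static _Atomic uint32_t credits;

static uint32_t grab_credits(uint32_t wanted, uint32_t *advertise)
{
	uint32_t oldval = atomic_load(&credits);
	uint32_t newval, got, adv;

	do {
		got = GET_SEND(oldval) < wanted ? GET_SEND(oldval) : wanted;
		adv = GET_POST(oldval);
		/* drop what we took from the low half, zero the high half */
		newval = oldval - got - SET_POST(adv);
	} while (!atomic_compare_exchange_weak(&credits, &oldval, newval));

	*advertise = adv;
	return got;
}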
int rds_ib_send_grab_credits(struct rds_ib_connection *ic,
- u32 wanted, u32 *adv_credits, int need_posted, int max_posted)
+ u32 wanted, u32 *adv_credits, int need_posted)
{
unsigned int avail, posted, got = 0, advertise;
long oldval, newval;
posted = IB_GET_POST_CREDITS(oldval);
avail = IB_GET_SEND_CREDITS(oldval);
- rdsdebug("wanted=%u credits=%u posted=%u\n",
+ rdsdebug("rds_ib_send_grab_credits(%u): credits=%u posted=%u\n",
wanted, avail, posted);
/* The last credit must be used to send a credit update. */
* available.
*/
if (posted && (got || need_posted)) {
- advertise = min_t(unsigned int, posted, max_posted);
+ advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT);
newval -= IB_SET_POST_CREDITS(advertise);
}
if (credits == 0)
return;
- rdsdebug("credits=%u current=%u%s\n",
+ rdsdebug("rds_ib_send_add_credits(%u): current=%u%s\n",
credits,
IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
}
-static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic,
- struct rds_ib_send_work *send,
- bool notify)
+static inline void
+rds_ib_xmit_populate_wr(struct rds_ib_connection *ic,
+ struct rds_ib_send_work *send, unsigned int pos,
+ unsigned long buffer, unsigned int length,
+ int send_flags)
{
- /*
- * We want to delay signaling completions just enough to get
- * the batching benefits but not so much that we create dead time
- * on the wire.
- */
- if (ic->i_unsignaled_wrs-- == 0 || notify) {
- ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
- send->s_wr.send_flags |= IB_SEND_SIGNALED;
- return 1;
+ struct ib_sge *sge;
+
+ WARN_ON(pos != send - ic->i_sends);
+
+ send->s_wr.send_flags = send_flags;
+ send->s_wr.opcode = IB_WR_SEND;
+ send->s_wr.num_sge = 2;
+ send->s_wr.next = NULL;
+ send->s_queued = jiffies;
+ send->s_op = NULL;
+
+ if (length != 0) {
+ sge = rds_ib_data_sge(ic, send->s_sge);
+ sge->addr = buffer;
+ sge->length = length;
+ sge->lkey = ic->i_mr->lkey;
+
+ sge = rds_ib_header_sge(ic, send->s_sge);
+ } else {
+ /* We're sending a packet with no payload. There is only
+ * one SGE */
+ send->s_wr.num_sge = 1;
+ sge = &send->s_sge[0];
}
- return 0;
+
+ sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
+ sge->length = sizeof(struct rds_header);
+ sge->lkey = ic->i_mr->lkey;
}
/*
u32 pos;
u32 i;
u32 work_alloc;
- u32 credit_alloc = 0;
+ u32 credit_alloc;
u32 posted;
u32 adv_credits = 0;
int send_flags = 0;
- int bytes_sent = 0;
+ int sent;
int ret;
int flow_controlled = 0;
- int nr_sig = 0;
BUG_ON(off % RDS_FRAG_SIZE);
BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
- /* Do not send cong updates to IB loopback */
- if (conn->c_loopback
- && rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
- rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
- scat = &rm->data.op_sg[sg];
- ret = max_t(int, RDS_CONG_MAP_BYTES, scat->length);
- return sizeof(struct rds_header) + ret;
- }
-
/* FIXME we may overallocate here */
if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
i = 1;
goto out;
}
+ credit_alloc = work_alloc;
if (ic->i_flowctl) {
- credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
+ credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0);
adv_credits += posted;
if (credit_alloc < work_alloc) {
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
work_alloc = credit_alloc;
- flow_controlled = 1;
+ flow_controlled++;
}
if (work_alloc == 0) {
- set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+ rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
rds_ib_stats_inc(s_ib_tx_throttle);
ret = -ENOMEM;
goto out;
}
/* map the message the first time we see it */
- if (!ic->i_data_op) {
- if (rm->data.op_nents) {
- rm->data.op_count = ib_dma_map_sg(dev,
- rm->data.op_sg,
- rm->data.op_nents,
- DMA_TO_DEVICE);
- rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
- if (rm->data.op_count == 0) {
+ if (ic->i_rm == NULL) {
+ /*
+ printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n",
+ be16_to_cpu(rm->m_inc.i_hdr.h_dport),
+ rm->m_inc.i_hdr.h_flags,
+ be32_to_cpu(rm->m_inc.i_hdr.h_len));
+ */
+ if (rm->m_nents) {
+ rm->m_count = ib_dma_map_sg(dev,
+ rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
+ rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
+ if (rm->m_count == 0) {
rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
ret = -ENOMEM; /* XXX ? */
goto out;
}
} else {
- rm->data.op_count = 0;
+ rm->m_count = 0;
}
+ ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+ ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
rds_message_addref(rm);
- ic->i_data_op = &rm->data;
+ ic->i_rm = rm;
/* Finalize the header */
if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
/* If it has a RDMA op, tell the peer we did it. This is
* used by the peer to release use-once RDMA MRs. */
- if (rm->rdma.op_active) {
+ if (rm->m_rdma_op) {
struct rds_ext_header_rdma ext_hdr;
- ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
+ ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
}
/*
* Update adv_credits since we reset the ACK_REQUIRED bit.
*/
- if (ic->i_flowctl) {
- rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
- adv_credits += posted;
- BUG_ON(adv_credits > 255);
- }
- }
+ rds_ib_send_grab_credits(ic, 0, &posted, 1);
+ adv_credits += posted;
+ BUG_ON(adv_credits > 255);
+ } else if (ic->i_rm != rm)
+ BUG();
+
+ send = &ic->i_sends[pos];
+ first = send;
+ prev = NULL;
+ scat = &rm->m_sg[sg];
+ sent = 0;
+ i = 0;
/* Sometimes you want to put a fence between an RDMA
* READ and the following SEND.
* or when requested by the user. Right now, we let
* the application choose.
*/
- if (rm->rdma.op_active && rm->rdma.op_fence)
+ if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
send_flags = IB_SEND_FENCE;
- /* Each frag gets a header. Msgs may be 0 bytes */
- send = &ic->i_sends[pos];
- first = send;
- prev = NULL;
- scat = &ic->i_data_op->op_sg[sg];
- i = 0;
- do {
- unsigned int len = 0;
-
- /* Set up the header */
- send->s_wr.send_flags = send_flags;
- send->s_wr.opcode = IB_WR_SEND;
- send->s_wr.num_sge = 1;
- send->s_wr.next = NULL;
- send->s_queued = jiffies;
- send->s_op = NULL;
+ /*
+ * We could be copying the header into the unused tail of the page.
+ * That would need to be changed in the future when those pages might
+ * be mapped userspace pages or page cache pages. So instead we always
+ * use a second sge and our long-lived ring of mapped headers. We send
+ * the header after the data so that the data payload can be aligned on
+ * the receiver.
+ */
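/*
 * A rough sketch (not the driver code) of the fragment layout the
 * comment above describes: the payload SGE is listed before the header
 * SGE, so the header goes on the wire after the data and the receiver
 * can keep the payload aligned.  The helper name and index order are
 * abstractions; the real code goes through rds_ib_data_sge() /
 * rds_ib_header_sge() and a long-lived, DMA-mapped ring of headers.
 */
#include <stdint.h>

struct sge { uint64_t addr; uint32_t length; };

struct frag_wr {
	struct sge sg[2];
	int num_sge;
};

static void fill_frag(struct frag_wr *wr, uint64_t data_dma, uint32_t len,
		      uint64_t hdr_ring_dma, uint32_t slot, uint32_t hdr_len)
{
	int n = 0;

	if (len) {				/* payload first ... */
		wr->sg[n].addr = data_dma;
		wr->sg[n].length = len;
		n++;
	}
	wr->sg[n].addr = hdr_ring_dma + (uint64_t)slot * hdr_len;
	wr->sg[n].length = hdr_len;		/* ... header last */
	wr->num_sge = n + 1;
}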
- send->s_sge[0].addr = ic->i_send_hdrs_dma
- + (pos * sizeof(struct rds_header));
- send->s_sge[0].length = sizeof(struct rds_header);
+ /* handle a 0-len message */
+ if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
+ rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
+ goto add_header;
+ }
- memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
+ /* if there's data reference it with a chain of work reqs */
+ for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
+ unsigned int len;
- /* Set up the data, if present */
- if (i < work_alloc
- && scat != &rm->data.op_sg[rm->data.op_count]) {
- len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
- send->s_wr.num_sge = 2;
+ send = &ic->i_sends[pos];
- send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off;
- send->s_sge[1].length = len;
+ len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
+ rds_ib_xmit_populate_wr(ic, send, pos,
+ ib_sg_dma_address(dev, scat) + off, len,
+ send_flags);
- bytes_sent += len;
- off += len;
- if (off == ib_sg_dma_len(dev, scat)) {
- scat++;
- off = 0;
- }
+ /*
+ * We want to delay signaling completions just enough to get
+ * the batching benefits but not so much that we create dead time
+ * on the wire.
+ */
+ if (ic->i_unsignaled_wrs-- == 0) {
+ ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+ send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
}
- rds_ib_set_wr_signal_state(ic, send, 0);
+ ic->i_unsignaled_bytes -= len;
+ if (ic->i_unsignaled_bytes <= 0) {
+ ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
+ send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ }
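/*
 * Standalone sketch of the signalling throttle used above: only every
 * Nth work request, or every M bytes, asks the HCA for a completion,
 * so completions are batched without letting the send ring sit idle.
 * The limits are placeholders; the driver takes them from the
 * max_unsignaled_wr / max_unsignaled_bytes sysctls.
 */
#include <stdbool.h>

struct unsig_state {
	long wrs_left;
	long bytes_left;
	long max_wrs;
	long max_bytes;
};

static bool must_signal(struct unsig_state *s, long bytes)
{
	bool signal = false;

	if (s->wrs_left-- == 0) {
		s->wrs_left = s->max_wrs;
		signal = true;
	}

	s->bytes_left -= bytes;
	if (s->bytes_left <= 0) {
		s->bytes_left = s->max_bytes;
		signal = true;
	}

	return signal;	/* caller then sets IB_SEND_SIGNALED | IB_SEND_SOLICITED */
}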
/*
* Always signal the last one if we're stopping due to flow control.
*/
- if (ic->i_flowctl && flow_controlled && i == (work_alloc-1))
+ if (flow_controlled && i == (work_alloc-1))
send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
- if (send->s_wr.send_flags & IB_SEND_SIGNALED)
- nr_sig++;
-
rdsdebug("send %p wr %p num_sge %u next %p\n", send,
&send->s_wr, send->s_wr.num_sge, send->s_wr.next);
- if (ic->i_flowctl && adv_credits) {
+ sent += len;
+ off += len;
+ if (off == ib_sg_dma_len(dev, scat)) {
+ scat++;
+ off = 0;
+ }
+
+add_header:
+ /* Tack on the header after the data. The header SGE should already
+ * have been set up to point to the right header buffer. */
+ memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
+
+ if (0) {
+ struct rds_header *hdr = &ic->i_send_hdrs[pos];
+
+ printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
+ be16_to_cpu(hdr->h_dport),
+ hdr->h_flags,
+ be32_to_cpu(hdr->h_len));
+ }
+ if (adv_credits) {
struct rds_header *hdr = &ic->i_send_hdrs[pos];
/* add credit and redo the header checksum */
prev = send;
pos = (pos + 1) % ic->i_send_ring.w_nr;
- send = &ic->i_sends[pos];
- i++;
-
- } while (i < work_alloc
- && scat != &rm->data.op_sg[rm->data.op_count]);
+ }
/* Account the RDS header in the number of bytes we sent, but just once.
* The caller has no concept of fragmentation. */
if (hdr_off == 0)
- bytes_sent += sizeof(struct rds_header);
+ sent += sizeof(struct rds_header);
/* if we finished the message then send completion owns it */
- if (scat == &rm->data.op_sg[rm->data.op_count]) {
- prev->s_op = ic->i_data_op;
- prev->s_wr.send_flags |= IB_SEND_SOLICITED;
- ic->i_data_op = NULL;
+ if (scat == &rm->m_sg[rm->m_count]) {
+ prev->s_rm = ic->i_rm;
+ prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+ ic->i_rm = NULL;
}
- /* Put back wrs & credits we didn't use */
if (i < work_alloc) {
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
work_alloc = i;
if (ic->i_flowctl && i < credit_alloc)
rds_ib_send_add_credits(conn, credit_alloc - i);
- if (nr_sig)
- atomic_add(nr_sig, &ic->i_signaled_sends);
-
/* XXX need to worry about failed_wr and partial sends. */
failed_wr = &first->s_wr;
ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
"returned %d\n", &conn->c_faddr, ret);
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
- rds_ib_sub_signaled(ic, nr_sig);
- if (prev->s_op) {
- ic->i_data_op = prev->s_op;
- prev->s_op = NULL;
+ if (prev->s_rm) {
+ ic->i_rm = prev->s_rm;
+ prev->s_rm = NULL;
}
-
- rds_ib_conn_error(ic->conn, "ib_post_send failed\n");
+ /* Finesse this later */
+ BUG();
goto out;
}
- ret = bytes_sent;
+ ret = sent;
out:
BUG_ON(adv_credits);
return ret;
}
-/*
- * Issue atomic operation.
- * A simplified version of the rdma case, we always map 1 SG, and
- * only 8 bytes, for the return value from the atomic operation.
- */
-int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
-{
- struct rds_ib_connection *ic = conn->c_transport_data;
- struct rds_ib_send_work *send = NULL;
- struct ib_send_wr *failed_wr;
- struct rds_ib_device *rds_ibdev;
- u32 pos;
- u32 work_alloc;
- int ret;
- int nr_sig = 0;
-
- rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
-
- work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
- if (work_alloc != 1) {
- rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
- rds_ib_stats_inc(s_ib_tx_ring_full);
- ret = -ENOMEM;
- goto out;
- }
-
- /* address of send request in ring */
- send = &ic->i_sends[pos];
- send->s_queued = jiffies;
-
- if (op->op_type == RDS_ATOMIC_TYPE_CSWP) {
- send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
- send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare;
- send->s_wr.wr.atomic.swap = op->op_m_cswp.swap;
- send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask;
- send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask;
- } else { /* FADD */
- send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
- send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add;
- send->s_wr.wr.atomic.swap = 0;
- send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask;
- send->s_wr.wr.atomic.swap_mask = 0;
- }
- nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify);
- send->s_wr.num_sge = 1;
- send->s_wr.next = NULL;
- send->s_wr.wr.atomic.remote_addr = op->op_remote_addr;
- send->s_wr.wr.atomic.rkey = op->op_rkey;
- send->s_op = op;
- rds_message_addref(container_of(send->s_op, struct rds_message, atomic));
-
- /* map 8 byte retval buffer to the device */
- ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE);
- rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret);
- if (ret != 1) {
- rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
- rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
- ret = -ENOMEM; /* XXX ? */
- goto out;
- }
-
- /* Convert our struct scatterlist to struct ib_sge */
- send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg);
- send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg);
- send->s_sge[0].lkey = ic->i_mr->lkey;
-
- rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr,
- send->s_sge[0].addr, send->s_sge[0].length);
-
- if (nr_sig)
- atomic_add(nr_sig, &ic->i_signaled_sends);
-
- failed_wr = &send->s_wr;
- ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr);
- rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic,
- send, &send->s_wr, ret, failed_wr);
- BUG_ON(failed_wr != &send->s_wr);
- if (ret) {
- printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 "
- "returned %d\n", &conn->c_faddr, ret);
- rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
- rds_ib_sub_signaled(ic, nr_sig);
- goto out;
- }
-
- if (unlikely(failed_wr != &send->s_wr)) {
- printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
- BUG_ON(failed_wr != &send->s_wr);
- }
-
-out:
- return ret;
-}
-
-int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
{
struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_ib_send_work *send = NULL;
struct rds_ib_send_work *first;
struct rds_ib_send_work *prev;
struct ib_send_wr *failed_wr;
+ struct rds_ib_device *rds_ibdev;
struct scatterlist *scat;
unsigned long len;
- u64 remote_addr = op->op_remote_addr;
- u32 max_sge = ic->rds_ibdev->max_sge;
+ u64 remote_addr = op->r_remote_addr;
u32 pos;
u32 work_alloc;
u32 i;
int sent;
int ret;
int num_sge;
- int nr_sig = 0;
-
- /* map the op the first time we see it */
- if (!op->op_mapped) {
- op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
- op->op_sg, op->op_nents, (op->op_write) ?
- DMA_TO_DEVICE : DMA_FROM_DEVICE);
- rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
- if (op->op_count == 0) {
+
+ rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
+
+ /* map the message the first time we see it */
+ if (!op->r_mapped) {
+ op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
+ op->r_sg, op->r_nents, (op->r_write) ?
+ DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
+ if (op->r_count == 0) {
rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
ret = -ENOMEM; /* XXX ? */
goto out;
}
- op->op_mapped = 1;
+ op->r_mapped = 1;
}
/*
* Instead of knowing how to return a partial rdma read/write we insist that there
* be enough work requests to send the entire message.
*/
- i = ceil(op->op_count, max_sge);
+ i = ceil(op->r_count, rds_ibdev->max_sge);
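/*
 * A standalone sketch of the sizing above: ceil(count, max_sge) work
 * requests are needed to carry "count" scatterlist entries when each WR
 * holds at most max_sge of them, and the RDMA is refused unless that
 * many ring slots can be allocated (no partial transfers).  The ceil()
 * helper is effectively a round-up division.
 */
#include <stdio.h>

static unsigned int ceil_div(unsigned int n, unsigned int d)
{
	return (n + d - 1) / d;
}

int main(void)
{
	unsigned int count = 70, max_sge = 32;

	/* 70 entries at 32 SGEs per WR -> 3 WRs (32 + 32 + 6) */
	printf("%u scatterlist entries -> %u work requests\n",
	       count, ceil_div(count, max_sge));
	return 0;
}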
work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
if (work_alloc != i) {
send = &ic->i_sends[pos];
first = send;
prev = NULL;
- scat = &op->op_sg[0];
+ scat = &op->r_sg[0];
sent = 0;
- num_sge = op->op_count;
+ num_sge = op->r_count;
- for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
+ for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
send->s_wr.send_flags = 0;
send->s_queued = jiffies;
- send->s_op = NULL;
-
- nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify);
+ /*
+ * We want to delay signaling completions just enough to get
+ * the batching benefits but not so much that we create dead time on the wire.
+ */
+ if (ic->i_unsignaled_wrs-- == 0) {
+ ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
+ send->s_wr.send_flags = IB_SEND_SIGNALED;
+ }
- send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
+ send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
send->s_wr.wr.rdma.remote_addr = remote_addr;
- send->s_wr.wr.rdma.rkey = op->op_rkey;
+ send->s_wr.wr.rdma.rkey = op->r_key;
+ send->s_op = op;
- if (num_sge > max_sge) {
- send->s_wr.num_sge = max_sge;
- num_sge -= max_sge;
+ if (num_sge > rds_ibdev->max_sge) {
+ send->s_wr.num_sge = rds_ibdev->max_sge;
+ num_sge -= rds_ibdev->max_sge;
} else {
send->s_wr.num_sge = num_sge;
}
if (prev)
prev->s_wr.next = &send->s_wr;
- for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
+ for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
len = ib_sg_dma_len(ic->i_cm_id->device, scat);
send->s_sge[j].addr =
ib_sg_dma_address(ic->i_cm_id->device, scat);
send = ic->i_sends;
}
- /* give a reference to the last op */
- if (scat == &op->op_sg[op->op_count]) {
- prev->s_op = op;
- rds_message_addref(container_of(op, struct rds_message, rdma));
- }
+ /* if we finished the message then send completion owns it */
+ if (scat == &op->r_sg[op->r_count])
+ prev->s_wr.send_flags = IB_SEND_SIGNALED;
if (i < work_alloc) {
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
work_alloc = i;
}
- if (nr_sig)
- atomic_add(nr_sig, &ic->i_signaled_sends);
-
failed_wr = &first->s_wr;
ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
"returned %d\n", &conn->c_faddr, ret);
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
- rds_ib_sub_signaled(ic, nr_sig);
goto out;
}
#include "rds.h"
#include "ib.h"
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats);
+DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned;
-static const char *const rds_ib_stat_names[] = {
+static char *rds_ib_stat_names[] = {
"ib_connect_raced",
"ib_listen_closed_stale",
"ib_tx_cq_call",
"ib_rdma_mr_pool_flush",
"ib_rdma_mr_pool_wait",
"ib_rdma_mr_pool_depleted",
- "ib_atomic_cswp",
- "ib_atomic_fadd",
};
unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1;
static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
-/*
- * This sysctl does nothing.
- *
- * Backwards compatibility with RDS 3.0 wire protocol
- * disables initial FC credit exchange.
- * If it's ever possible to drop 3.0 support,
- * setting this to 1 and moving init/refill of send/recv
- * rings from ib_cm_connect_complete() back into ib_setup_qp()
- * will cause credits to be added before protocol negotiation.
- */
-unsigned int rds_ib_sysctl_flow_control = 0;
+unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20);
+static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1;
+static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL;
-static struct ctl_table rds_ib_sysctl_table[] = {
+unsigned int rds_ib_sysctl_flow_control = 1;
+
+ctl_table rds_ib_sysctl_table[] = {
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "max_send_wr",
.data = &rds_ib_sysctl_max_send_wr,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
+ .proc_handler = &proc_doulongvec_minmax,
.extra1 = &rds_ib_sysctl_max_wr_min,
.extra2 = &rds_ib_sysctl_max_wr_max,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "max_recv_wr",
.data = &rds_ib_sysctl_max_recv_wr,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
+ .proc_handler = &proc_doulongvec_minmax,
.extra1 = &rds_ib_sysctl_max_wr_min,
.extra2 = &rds_ib_sysctl_max_wr_max,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "max_unsignaled_wr",
.data = &rds_ib_sysctl_max_unsig_wrs,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
+ .proc_handler = &proc_doulongvec_minmax,
.extra1 = &rds_ib_sysctl_max_unsig_wr_min,
.extra2 = &rds_ib_sysctl_max_unsig_wr_max,
},
{
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max_unsignaled_bytes",
+ .data = &rds_ib_sysctl_max_unsig_bytes,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_doulongvec_minmax,
+ .extra1 = &rds_ib_sysctl_max_unsig_bytes_min,
+ .extra2 = &rds_ib_sysctl_max_unsig_bytes_max,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
.procname = "max_recv_allocation",
.data = &rds_ib_sysctl_max_recv_allocation,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
+ .proc_handler = &proc_doulongvec_minmax,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "flow_control",
.data = &rds_ib_sysctl_flow_control,
.maxlen = sizeof(rds_ib_sysctl_flow_control),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = &proc_dointvec,
},
+ { .ctl_name = 0}
+};
+
+static struct ctl_path rds_ib_sysctl_path[] = {
+ { .procname = "net", .ctl_name = CTL_NET, },
+ { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
+ { .procname = "ib", .ctl_name = CTL_UNNUMBERED, },
{ }
};
void rds_ib_sysctl_exit(void)
{
if (rds_ib_sysctl_hdr)
- unregister_net_sysctl_table(rds_ib_sysctl_hdr);
+ unregister_sysctl_table(rds_ib_sysctl_hdr);
}
-int rds_ib_sysctl_init(void)
+int __init rds_ib_sysctl_init(void)
{
- rds_ib_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/ib", rds_ib_sysctl_table);
- if (!rds_ib_sysctl_hdr)
+ rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table);
+ if (rds_ib_sysctl_hdr == NULL)
return -ENOMEM;
return 0;
}
*/
#include <linux/percpu.h>
#include <linux/seq_file.h>
-#include <linux/slab.h>
#include <linux/proc_fs.h>
-#include <linux/export.h>
#include "rds.h"
BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
spin_lock(&rds_info_lock);
- BUG_ON(rds_info_funcs[offset]);
+ BUG_ON(rds_info_funcs[offset] != NULL);
rds_info_funcs[offset] = func;
spin_unlock(&rds_info_lock);
}
-EXPORT_SYMBOL_GPL(rds_info_register_func);
void rds_info_deregister_func(int optname, rds_info_func func)
{
rds_info_funcs[offset] = NULL;
spin_unlock(&rds_info_lock);
}
-EXPORT_SYMBOL_GPL(rds_info_deregister_func);
/*
* Typically we hold an atomic kmap across multiple rds_info_copy() calls
*/
void rds_info_iter_unmap(struct rds_info_iterator *iter)
{
- if (iter->addr) {
- kunmap_atomic(iter->addr);
+ if (iter->addr != NULL) {
+ kunmap_atomic(iter->addr, KM_USER0);
iter->addr = NULL;
}
}
unsigned long this;
while (bytes) {
- if (!iter->addr)
- iter->addr = kmap_atomic(*iter->pages);
+ if (iter->addr == NULL)
+ iter->addr = kmap_atomic(*iter->pages, KM_USER0);
this = min(bytes, PAGE_SIZE - iter->offset);
iter->offset += this;
if (iter->offset == PAGE_SIZE) {
- kunmap_atomic(iter->addr);
+ kunmap_atomic(iter->addr, KM_USER0);
iter->addr = NULL;
iter->offset = 0;
iter->pages++;
}
}
}
-EXPORT_SYMBOL_GPL(rds_info_copy);
/*
* @optval points to the userspace buffer that the information snapshot
>> PAGE_SHIFT;
pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
- if (!pages) {
+ if (pages == NULL) {
ret = -ENOMEM;
goto out;
}
- ret = get_user_pages_fast(start, nr_pages, 1, pages);
+ down_read(&current->mm->mmap_sem);
+ ret = get_user_pages(current, current->mm, start, nr_pages, 1, 0,
+ pages, NULL);
+ up_read(&current->mm->mmap_sem);
if (ret != nr_pages) {
if (ret > 0)
nr_pages = ret;
call_func:
func = rds_info_funcs[optname - RDS_INFO_FIRST];
- if (!func) {
+ if (func == NULL) {
ret = -ENOPROTOOPT;
goto out;
}
ret = -EFAULT;
out:
- for (i = 0; pages && i < nr_pages; i++)
+ for (i = 0; pages != NULL && i < nr_pages; i++)
put_page(pages[i]);
kfree(pages);
#include <linux/inetdevice.h>
#include <linux/if_arp.h>
#include <linux/delay.h>
-#include <linux/slab.h>
-#include <linux/module.h>
#include "rds.h"
#include "iw.h"
DEFINE_SPINLOCK(iw_nodev_conns_lock);
LIST_HEAD(iw_nodev_conns);
-static void rds_iw_add_one(struct ib_device *device)
+void rds_iw_add_one(struct ib_device *device)
{
struct rds_iw_device *rds_iwdev;
struct ib_device_attr *dev_attr;
rds_iwdev->max_wrs = dev_attr->max_qp_wr;
rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE);
+ rds_iwdev->page_shift = max(PAGE_SHIFT, ffs(dev_attr->page_size_cap) - 1);
+
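/*
 * A standalone sketch of the page_shift choice above: page_size_cap is
 * a bitmask of page sizes the device supports, so ffs() - 1 yields the
 * shift of the smallest supported size, and taking the max against the
 * CPU page shift avoids mapping at a finer granularity than PAGE_SIZE.
 * The 4K/2M capability mask and the hard-coded shift of 12 are just
 * example values.
 */
#include <strings.h>
#include <stdio.h>

#define EXAMPLE_CPU_PAGE_SHIFT 12

int main(void)
{
	unsigned int page_size_cap = (1u << 12) | (1u << 21); /* 4K and 2M */
	int dev_shift = ffs((int)page_size_cap) - 1;          /* -> 12 */
	int shift = dev_shift > EXAMPLE_CPU_PAGE_SHIFT ?
			dev_shift : EXAMPLE_CPU_PAGE_SHIFT;

	printf("using page shift %d\n", shift);
	return 0;
}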
rds_iwdev->dev = device;
rds_iwdev->pd = ib_alloc_pd(device);
if (IS_ERR(rds_iwdev->pd))
goto free_dev;
if (!rds_iwdev->dma_local_lkey) {
- rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
- IB_ACCESS_REMOTE_READ |
- IB_ACCESS_REMOTE_WRITE |
- IB_ACCESS_LOCAL_WRITE);
+ if (device->node_type != RDMA_NODE_RNIC) {
+ rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
+ IB_ACCESS_LOCAL_WRITE);
+ } else {
+ rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_LOCAL_WRITE);
+ }
if (IS_ERR(rds_iwdev->mr))
goto err_pd;
} else
kfree(dev_attr);
}
-static void rds_iw_remove_one(struct ib_device *device)
+void rds_iw_remove_one(struct ib_device *device)
{
struct rds_iw_device *rds_iwdev;
struct rds_iw_cm_id *i_cm_id, *next;
ic = conn->c_transport_data;
dev_addr = &ic->i_cm_id->route.addr.dev_addr;
- rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
- rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
+ ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
+ ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
iinfo->max_send_wr = ic->i_send_ring.w_nr;
/* Create a CMA ID and try to bind it. This catches both
* IB and iWARP capable NICs.
*/
- cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
- if (IS_ERR(cm_id))
- return PTR_ERR(cm_id);
+ cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
+ if (!cm_id)
+ return -EADDRNOTAVAIL;
memset(&sin, 0, sizeof(sin));
sin.sin_family = AF_INET;
ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
/* due to this, we will claim to support IB devices unless we
check node_type. */
- if (ret || !cm_id->device ||
- cm_id->device->node_type != RDMA_NODE_RNIC)
+ if (ret || cm_id->device->node_type != RDMA_NODE_RNIC)
ret = -EADDRNOTAVAIL;
rdsdebug("addr %pI4 ret %d node type %d\n",
.laddr_check = rds_iw_laddr_check,
.xmit_complete = rds_iw_xmit_complete,
.xmit = rds_iw_xmit,
+ .xmit_cong_map = NULL,
.xmit_rdma = rds_iw_xmit_rdma,
.recv = rds_iw_recv,
.conn_alloc = rds_iw_conn_alloc,
.conn_connect = rds_iw_conn_connect,
.conn_shutdown = rds_iw_conn_shutdown,
.inc_copy_to_user = rds_iw_inc_copy_to_user,
+ .inc_purge = rds_iw_inc_purge,
.inc_free = rds_iw_inc_free,
.cm_initiate_connect = rds_iw_cm_initiate_connect,
.cm_handle_connect = rds_iw_cm_handle_connect,
.flush_mrs = rds_iw_flush_mrs,
.t_owner = THIS_MODULE,
.t_name = "iwarp",
- .t_type = RDS_TRANS_IWARP,
.t_prefer_loopback = 1,
};
-int rds_iw_init(void)
+int __init rds_iw_init(void)
{
int ret;
#ifndef _RDS_IW_H
#define _RDS_IW_H
-#include <linux/interrupt.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include "rds.h"
struct rds_message *s_rm;
/* We should really put these into a union: */
- struct rm_rdma_op *s_op;
+ struct rds_rdma_op *s_op;
struct rds_iw_mapping *s_mapping;
struct ib_mr *s_mr;
struct ib_fast_reg_page_list *s_page_list;
struct rds_iw_send_work *i_sends;
/* rx */
- struct tasklet_struct i_recv_tasklet;
struct mutex i_recv_mutex;
struct rds_iw_work_ring i_recv_ring;
struct rds_iw_incoming *i_iwinc;
struct ib_pd *pd;
struct ib_mr *mr;
struct rds_iw_mr_pool *mr_pool;
+ int page_shift;
int max_sge;
unsigned int max_wrs;
unsigned int dma_local_lkey:1;
/* ib.c */
extern struct rds_transport rds_iw_transport;
+extern void rds_iw_add_one(struct ib_device *device);
+extern void rds_iw_remove_one(struct ib_device *device);
extern struct ib_client rds_iw_client;
extern unsigned int fastreg_pool_size;
int rds_iw_conn_connect(struct rds_connection *conn);
void rds_iw_conn_shutdown(struct rds_connection *conn);
void rds_iw_state_change(struct sock *sk);
-int rds_iw_listen_init(void);
+int __init rds_iw_listen_init(void);
void rds_iw_listen_stop(void);
void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
void rds_iw_sync_mr(void *trans_private, int dir);
void rds_iw_free_mr(void *trans_private, int invalidate);
void rds_iw_flush_mrs(void);
+void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
/* ib_recv.c */
-int rds_iw_recv_init(void);
+int __init rds_iw_recv_init(void);
void rds_iw_recv_exit(void);
int rds_iw_recv(struct rds_connection *conn);
int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
gfp_t page_gfp, int prefill);
+void rds_iw_inc_purge(struct rds_incoming *inc);
void rds_iw_inc_free(struct rds_incoming *inc);
-int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
+int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
+ size_t size);
void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context);
-void rds_iw_recv_tasklet_fn(unsigned long data);
void rds_iw_recv_init_ring(struct rds_iw_connection *ic);
void rds_iw_recv_clear_ring(struct rds_iw_connection *ic);
void rds_iw_recv_init_ack(struct rds_iw_connection *ic);
void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
void rds_iw_send_init_ring(struct rds_iw_connection *ic);
void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
-int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
- u32 *adv_credits, int need_posted, int max_posted);
+ u32 *adv_credits, int need_posted);
/* ib_stats.c */
DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats);
unsigned int avail);
/* ib_sysctl.c */
-int rds_iw_sysctl_init(void);
+int __init rds_iw_sysctl_init(void);
void rds_iw_sysctl_exit(void);
extern unsigned long rds_iw_sysctl_max_send_wr;
extern unsigned long rds_iw_sysctl_max_recv_wr;
extern unsigned long rds_iw_sysctl_max_unsig_bytes;
extern unsigned long rds_iw_sysctl_max_recv_allocation;
extern unsigned int rds_iw_sysctl_flow_control;
+extern ctl_table rds_iw_sysctl_table[];
/*
* Helper functions for getting/setting the header and data SGEs in
*/
#include <linux/kernel.h>
#include <linux/in.h>
-#include <linux/slab.h>
#include <linux/vmalloc.h>
-#include <linux/ratelimit.h>
#include "rds.h"
#include "iw.h"
case IB_EVENT_QP_REQ_ERR:
case IB_EVENT_QP_FATAL:
default:
- rdsdebug("Fatal QP Event %u "
- "- connection %pI4->%pI4, reconnecting\n",
+ rds_iw_conn_error(conn, "RDS/IW: Fatal QP Event %u - connection %pI4->%pI4...reconnecting\n",
event->event, &conn->c_laddr,
&conn->c_faddr);
- rds_conn_drop(conn);
break;
}
}
unsigned int send_size, recv_size;
int ret;
- /* The offset of 1 is to accommodate the additional ACK WR. */
+ /* The offset of 1 is to accommodate the additional ACK WR. */
send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1);
recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1);
rds_iw_ring_resize(send_ring, send_size - 1);
* the rds_iwdev at all.
*/
rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
- if (!rds_iwdev) {
- printk_ratelimited(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
+ if (rds_iwdev == NULL) {
+ if (printk_ratelimit())
+ printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
dev->name);
return -EOPNOTSUPP;
}
ic->i_send_ring.w_nr *
sizeof(struct rds_header),
&ic->i_send_hdrs_dma, GFP_KERNEL);
- if (!ic->i_send_hdrs) {
+ if (ic->i_send_hdrs == NULL) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent send failed\n");
goto out;
ic->i_recv_ring.w_nr *
sizeof(struct rds_header),
&ic->i_recv_hdrs_dma, GFP_KERNEL);
- if (!ic->i_recv_hdrs) {
+ if (ic->i_recv_hdrs == NULL) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent recv failed\n");
goto out;
ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
&ic->i_ack_dma, GFP_KERNEL);
- if (!ic->i_ack) {
+ if (ic->i_ack == NULL) {
ret = -ENOMEM;
rdsdebug("ib_dma_alloc_coherent ack failed\n");
goto out;
}
ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
- if (!ic->i_sends) {
+ if (ic->i_sends == NULL) {
ret = -ENOMEM;
rdsdebug("send allocation failed\n");
goto out;
rds_iw_send_init_ring(ic);
ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
- if (!ic->i_recvs) {
+ if (ic->i_recvs == NULL) {
ret = -ENOMEM;
rdsdebug("recv allocation failed\n");
goto out;
version = RDS_PROTOCOL_3_0;
while ((common >>= 1) != 0)
version++;
- }
- printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using "
+ } else if (printk_ratelimit()) {
+ printk(KERN_NOTICE "RDS: Connection from %pI4 using "
"incompatible protocol version %u.%u\n",
&dp->dp_saddr,
dp->dp_protocol_major,
dp->dp_protocol_minor);
+ }
return version;
}
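/*
 * A small standalone sketch of the version negotiation above: each side
 * offers a bitmask of the minor versions it speaks, and the highest bit
 * common to both wins.  Shifting the intersection right until it hits
 * zero counts how far above the base version (3.0 in the driver) that
 * bit sits.  The masks below are made-up example values.
 */
#include <stdio.h>

static unsigned int highest_common_minor(unsigned int ours, unsigned int theirs)
{
	unsigned int common = ours & theirs;
	unsigned int minor = 0;

	while ((common >>= 1) != 0)
		minor++;
	return minor;
}

int main(void)
{
	/* we speak minors 0-2 (0x7), the peer speaks 0-1 (0x3) -> minor 1 */
	printf("negotiated minor %u\n", highest_common_minor(0x7, 0x3));
	return 0;
}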
err = rds_iw_setup_qp(conn);
if (err) {
rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err);
- mutex_unlock(&conn->c_cm_lock);
goto out;
}
/* XXX I wonder what effect the port space has */
/* delegate cm event handler to rdma_transport */
ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
- RDMA_PS_TCP, IB_QPT_RC);
+ RDMA_PS_TCP);
if (IS_ERR(ic->i_cm_id)) {
ret = PTR_ERR(ic->i_cm_id);
ic->i_cm_id = NULL;
/* Actually this may happen quite frequently, when
* an outgoing connect raced with an incoming connect.
*/
- rdsdebug("failed to disconnect, cm: %p err %d\n",
- ic->i_cm_id, err);
+ rdsdebug("rds_iw_conn_shutdown: failed to disconnect,"
+ " cm: %p err %d\n", ic->i_cm_id, err);
}
if (ic->i_cm_id->qp) {
unsigned long flags;
/* XXX too lazy? */
- ic = kzalloc(sizeof(struct rds_iw_connection), gfp);
- if (!ic)
+ ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL);
+ if (ic == NULL)
return -ENOMEM;
INIT_LIST_HEAD(&ic->iw_node);
- tasklet_init(&ic->i_recv_tasklet, rds_iw_recv_tasklet_fn,
- (unsigned long) ic);
mutex_init(&ic->i_recv_mutex);
#ifndef KERNEL_HAS_ATOMIC64
spin_lock_init(&ic->i_ack_lock);
*
*/
#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/ratelimit.h>
#include "rds.h"
+#include "rdma.h"
#include "iw.h"
static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
struct list_head *unmap_list,
- struct list_head *kill_list,
- int *unpinned);
+ struct list_head *kill_list);
static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
-static int rds_iw_get_device(struct sockaddr_in *src, struct sockaddr_in *dst,
- struct rds_iw_device **rds_iwdev,
- struct rdma_cm_id **cm_id)
+static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id)
{
struct rds_iw_device *iwdev;
struct rds_iw_cm_id *i_cm_id;
src_addr->sin_port,
dst_addr->sin_addr.s_addr,
dst_addr->sin_port,
- src->sin_addr.s_addr,
- src->sin_port,
- dst->sin_addr.s_addr,
- dst->sin_port);
+ rs->rs_bound_addr,
+ rs->rs_bound_port,
+ rs->rs_conn_addr,
+ rs->rs_conn_port);
#ifdef WORKING_TUPLE_DETECTION
- if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr &&
- src_addr->sin_port == src->sin_port &&
- dst_addr->sin_addr.s_addr == dst->sin_addr.s_addr &&
- dst_addr->sin_port == dst->sin_port) {
+ if (src_addr->sin_addr.s_addr == rs->rs_bound_addr &&
+ src_addr->sin_port == rs->rs_bound_port &&
+ dst_addr->sin_addr.s_addr == rs->rs_conn_addr &&
+ dst_addr->sin_port == rs->rs_conn_port) {
#else
/* FIXME - needs to compare the local and remote
* ipaddr/port tuple, but the ipaddr is the only
- * available information in the rds_sock (as the rest are
+ * available information in the rds_sock (as the rest are
* zero'ed. It doesn't appear to be properly populated
* during connection setup...
*/
- if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr) {
+ if (src_addr->sin_addr.s_addr == rs->rs_bound_addr) {
#endif
spin_unlock_irq(&iwdev->spinlock);
*rds_iwdev = iwdev;
return 0;
}
-static void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev,
- struct rdma_cm_id *cm_id)
+void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
{
struct rds_iw_cm_id *i_cm_id;
{
struct sockaddr_in *src_addr, *dst_addr;
struct rds_iw_device *rds_iwdev_old;
+ struct rds_sock rs;
struct rdma_cm_id *pcm_id;
int rc;
src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr;
dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr;
- rc = rds_iw_get_device(src_addr, dst_addr, &rds_iwdev_old, &pcm_id);
+ rs.rs_bound_addr = src_addr->sin_addr.s_addr;
+ rs.rs_bound_port = src_addr->sin_port;
+ rs.rs_conn_addr = dst_addr->sin_addr.s_addr;
+ rs.rs_conn_port = dst_addr->sin_port;
+
+ rc = rds_iw_get_device(&rs, &rds_iwdev_old, &pcm_id);
if (rc)
rds_iw_remove_cm_id(rds_iwdev, cm_id);
BUG_ON(list_empty(&ic->iw_node));
list_del(&ic->iw_node);
- spin_lock(&rds_iwdev->spinlock);
+ spin_lock_irq(&rds_iwdev->spinlock);
list_add_tail(&ic->iw_node, &rds_iwdev->conn_list);
- spin_unlock(&rds_iwdev->spinlock);
+ spin_unlock_irq(&rds_iwdev->spinlock);
spin_unlock_irq(&iw_nodev_conns_lock);
ic->rds_iwdev = rds_iwdev;
INIT_LIST_HEAD(list);
spin_unlock_irq(list_lock);
- list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node)
+ list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) {
+ if (ic->conn->c_passive)
+ rds_conn_destroy(ic->conn->c_passive);
rds_conn_destroy(ic->conn);
+ }
}
static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg,
}
static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
- struct rds_iw_scatterlist *sg)
+ struct rds_iw_scatterlist *sg,
+ unsigned int dma_page_shift)
{
struct ib_device *dev = rds_iwdev->dev;
u64 *dma_pages = NULL;
+ u64 dma_mask;
+ unsigned int dma_page_size;
int i, j, ret;
+ dma_page_size = 1 << dma_page_shift;
+ dma_mask = dma_page_size - 1;
+
WARN_ON(sg->dma_len);
sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
sg->bytes += dma_len;
end_addr = dma_addr + dma_len;
- if (dma_addr & PAGE_MASK) {
+ if (dma_addr & dma_mask) {
if (i > 0)
goto out_unmap;
- dma_addr &= ~PAGE_MASK;
+ dma_addr &= ~dma_mask;
}
- if (end_addr & PAGE_MASK) {
+ if (end_addr & dma_mask) {
if (i < sg->dma_len - 1)
goto out_unmap;
- end_addr = (end_addr + PAGE_MASK) & ~PAGE_MASK;
+ end_addr = (end_addr + dma_mask) & ~dma_mask;
}
- sg->dma_npages += (end_addr - dma_addr) >> PAGE_SHIFT;
+ sg->dma_npages += (end_addr - dma_addr) >> dma_page_shift;
}
/* Now gather the dma addrs into one list */
u64 end_addr;
end_addr = dma_addr + dma_len;
- dma_addr &= ~PAGE_MASK;
- for (; dma_addr < end_addr; dma_addr += PAGE_SIZE)
+ dma_addr &= ~dma_mask;
+ for (; dma_addr < end_addr; dma_addr += dma_page_size)
dma_pages[j++] = dma_addr;
BUG_ON(j > sg->dma_npages);
}
}
}
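/*
 * The mapping code above rounds the first chunk's start down and the
 * last chunk's end up to the device's DMA page size before counting
 * pages.  A standalone sketch of that arithmetic, assuming a single
 * contiguous chunk:
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t dma_pages_needed(uint64_t addr, uint64_t len,
				 unsigned int page_shift)
{
	uint64_t mask = (1ULL << page_shift) - 1;
	uint64_t start = addr & ~mask;			/* round down */
	uint64_t end = (addr + len + mask) & ~mask;	/* round up */

	return (end - start) >> page_shift;
}

int main(void)
{
	/* 8 KB starting 512 bytes into a 4 KB page spans 3 pages */
	printf("%llu\n",
	       (unsigned long long)dma_pages_needed(0x1200, 8192, 12));
	return 0;
}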
+static inline unsigned int rds_iw_flush_goal(struct rds_iw_mr_pool *pool, int free_all)
+{
+ unsigned int item_count;
+
+ item_count = atomic_read(&pool->item_count);
+ if (free_all)
+ return item_count;
+
+ return 0;
+}
+
/*
* Flush our pool of MRs.
* At a minimum, all currently unused MRs are unmapped.
LIST_HEAD(unmap_list);
LIST_HEAD(kill_list);
unsigned long flags;
- unsigned int nfreed = 0, ncleaned = 0, unpinned = 0;
+ unsigned int nfreed = 0, ncleaned = 0, free_goal;
int ret = 0;
rds_iw_stats_inc(s_iw_rdma_mr_pool_flush);
list_splice_init(&pool->clean_list, &kill_list);
spin_unlock_irqrestore(&pool->list_lock, flags);
+ free_goal = rds_iw_flush_goal(pool, free_all);
+
/* Batched invalidate of dirty MRs.
* For FMR based MRs, the mappings on the unmap list are
* actually members of an ibmr (ibmr->mapping). They either
* will be destroyed by the unmap function.
*/
if (!list_empty(&unmap_list)) {
- ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list,
- &kill_list, &unpinned);
+ ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, &kill_list);
/* If we've been asked to destroy all MRs, move those
* that were simply cleaned to the kill list */
if (free_all)
spin_unlock_irqrestore(&pool->list_lock, flags);
}
- atomic_sub(unpinned, &pool->free_pinned);
atomic_sub(ncleaned, &pool->dirty_count);
atomic_sub(nfreed, &pool->item_count);
rds_iw_free_fastreg(pool, ibmr);
/* If we've pinned too many pages, request a flush */
- if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
- atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned
+ || atomic_read(&pool->dirty_count) >= pool->max_items / 10)
queue_work(rds_wq, &pool->flush_worker);
if (invalidate) {
struct rds_iw_device *rds_iwdev;
struct rds_iw_mr *ibmr = NULL;
struct rdma_cm_id *cm_id;
- struct sockaddr_in src = {
- .sin_addr.s_addr = rs->rs_bound_addr,
- .sin_port = rs->rs_bound_port,
- };
- struct sockaddr_in dst = {
- .sin_addr.s_addr = rs->rs_conn_addr,
- .sin_port = rs->rs_conn_port,
- };
int ret;
- ret = rds_iw_get_device(&src, &dst, &rds_iwdev, &cm_id);
+ ret = rds_iw_get_device(rs, &rds_iwdev, &cm_id);
if (ret || !cm_id) {
ret = -ENODEV;
goto out;
f_wr.wr.fast_reg.rkey = mapping->m_rkey;
f_wr.wr.fast_reg.page_list = ibmr->page_list;
f_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_len;
- f_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+ f_wr.wr.fast_reg.page_shift = ibmr->device->page_shift;
f_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE |
IB_ACCESS_REMOTE_READ |
IB_ACCESS_REMOTE_WRITE;
failed_wr = &f_wr;
ret = ib_post_send(ibmr->cm_id->qp, &f_wr, &failed_wr);
BUG_ON(failed_wr != &f_wr);
- if (ret)
- printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
+ if (ret && printk_ratelimit())
+ printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
__func__, __LINE__, ret);
return ret;
}
failed_wr = &s_wr;
ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr);
- if (ret) {
- printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
+ if (ret && printk_ratelimit()) {
+ printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
__func__, __LINE__, ret);
goto out;
}
rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len);
- dma_pages = rds_iw_map_scatterlist(rds_iwdev, &mapping->m_sg);
+ dma_pages = rds_iw_map_scatterlist(rds_iwdev,
+ &mapping->m_sg,
+ rds_iwdev->page_shift);
if (IS_ERR(dma_pages)) {
ret = PTR_ERR(dma_pages);
dma_pages = NULL;
static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
struct list_head *unmap_list,
- struct list_head *kill_list,
- int *unpinned)
+ struct list_head *kill_list)
{
struct rds_iw_mapping *mapping, *next;
unsigned int ncleaned = 0;
spin_lock_irqsave(&pool->list_lock, flags);
list_for_each_entry_safe(mapping, next, unmap_list, m_list) {
- *unpinned += mapping->m_sg.len;
list_move(&mapping->m_list, &laundered);
ncleaned++;
}
*
*/
#include <linux/kernel.h>
-#include <linux/slab.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <rdma/rdma_cm.h>
static void rds_iw_frag_free(struct rds_page_frag *frag)
{
rdsdebug("frag %p page %p\n", frag, frag->f_page);
- BUG_ON(frag->f_page);
+ BUG_ON(frag->f_page != NULL);
kmem_cache_free(rds_iw_frag_slab, frag);
}
struct ib_sge *sge;
int ret = -ENOMEM;
- if (!recv->r_iwinc) {
- if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) {
+ if (recv->r_iwinc == NULL) {
+ if (atomic_read(&rds_iw_allocation) >= rds_iw_sysctl_max_recv_allocation) {
rds_iw_stats_inc(s_iw_rx_alloc_limit);
goto out;
}
recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
kptr_gfp);
- if (!recv->r_iwinc) {
- atomic_dec(&rds_iw_allocation);
+ if (recv->r_iwinc == NULL)
goto out;
- }
+ atomic_inc(&rds_iw_allocation);
INIT_LIST_HEAD(&recv->r_iwinc->ii_frags);
rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
}
- if (!recv->r_frag) {
+ if (recv->r_frag == NULL) {
recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
- if (!recv->r_frag)
+ if (recv->r_frag == NULL)
goto out;
INIT_LIST_HEAD(&recv->r_frag->f_item);
recv->r_frag->f_page = NULL;
}
- if (!ic->i_frag.f_page) {
+ if (ic->i_frag.f_page == NULL) {
ic->i_frag.f_page = alloc_page(page_gfp);
- if (!ic->i_frag.f_page)
+ if (ic->i_frag.f_page == NULL)
goto out;
ic->i_frag.f_offset = 0;
}
int ret = 0;
u32 pos;
- while ((prefill || rds_conn_up(conn)) &&
- rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
+ while ((prefill || rds_conn_up(conn))
+ && rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
if (pos >= ic->i_recv_ring.w_nr) {
printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
pos);
return ret;
}
-static void rds_iw_inc_purge(struct rds_incoming *inc)
+void rds_iw_inc_purge(struct rds_incoming *inc)
{
struct rds_iw_incoming *iwinc;
struct rds_page_frag *frag;
BUG_ON(atomic_read(&rds_iw_allocation) < 0);
}
-int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
+int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
+ size_t size)
{
struct rds_iw_incoming *iwinc;
struct rds_page_frag *frag;
+ struct iovec *iov = first_iov;
unsigned long to_copy;
unsigned long frag_off = 0;
+ unsigned long iov_off = 0;
int copied = 0;
int ret;
u32 len;
frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
len = be32_to_cpu(inc->i_hdr.h_len);
- while (iov_iter_count(to) && copied < len) {
+ while (copied < size && copied < len) {
if (frag_off == RDS_FRAG_SIZE) {
frag = list_entry(frag->f_item.next,
struct rds_page_frag, f_item);
frag_off = 0;
}
- to_copy = min_t(unsigned long, iov_iter_count(to),
- RDS_FRAG_SIZE - frag_off);
+ while (iov_off == iov->iov_len) {
+ iov_off = 0;
+ iov++;
+ }
+
+ to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
+ to_copy = min_t(size_t, to_copy, size - copied);
to_copy = min_t(unsigned long, to_copy, len - copied);
+ rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
+ "[%p, %lu] + %lu\n",
+ to_copy, iov->iov_base, iov->iov_len, iov_off,
+ frag->f_page, frag->f_offset, frag_off);
+
/* XXX needs + offset for multiple recvs per page */
- rds_stats_add(s_copy_to_user, to_copy);
- ret = copy_page_to_iter(frag->f_page,
- frag->f_offset + frag_off,
- to_copy,
- to);
- if (ret != to_copy)
- return -EFAULT;
+ ret = rds_page_copy_to_user(frag->f_page,
+ frag->f_offset + frag_off,
+ iov->iov_base + iov_off,
+ to_copy);
+ if (ret) {
+ copied = ret;
+ break;
+ }
+ iov_off += to_copy;
frag_off += to_copy;
copied += to_copy;
}
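/*
 * A simplified model of the copy loop above: one cursor walks the
 * RDS_FRAG_SIZE fragments, another walks the user's iovec array, and
 * each step copies the largest run that fits the current fragment, the
 * current iovec, and the remaining message length.  frags[] is assumed
 * to contain enough fragments to cover msg_len.
 */
#include <stddef.h>
#include <string.h>
#include <sys/uio.h>

#define FRAG_SIZE 4096		/* stands in for RDS_FRAG_SIZE */

static size_t copy_frags_to_iov(char *frags[], size_t msg_len,
				struct iovec *iov, int iovcnt)
{
	size_t copied = 0, frag_off = 0, iov_off = 0;
	int f = 0, i = 0;

	while (copied < msg_len && i < iovcnt) {
		size_t to_copy;

		if (frag_off == FRAG_SIZE) {	/* next fragment */
			f++;
			frag_off = 0;
		}
		while (i < iovcnt && iov_off == iov[i].iov_len) {
			i++;			/* next iovec */
			iov_off = 0;
		}
		if (i == iovcnt)
			break;

		to_copy = iov[i].iov_len - iov_off;
		if (to_copy > FRAG_SIZE - frag_off)
			to_copy = FRAG_SIZE - frag_off;
		if (to_copy > msg_len - copied)
			to_copy = msg_len - copied;

		memcpy((char *)iov[i].iov_base + iov_off,
		       frags[f] + frag_off, to_copy);
		iov_off += to_copy;
		frag_off += to_copy;
		copied += to_copy;
	}
	return copied;
}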
{
atomic64_set(&ic->i_ack_next, seq);
if (ack_required) {
- smp_mb__before_atomic();
+ smp_mb__before_clear_bit();
set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
}
}
static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
{
clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
- smp_mb__after_atomic();
+ smp_mb__after_clear_bit();
return atomic64_read(&ic->i_ack_next);
}
set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
rds_iw_stats_inc(s_iw_ack_send_failure);
-
- rds_iw_conn_error(ic->conn, "sending ack failed\n");
+ /* Need to finesse this later. */
+ BUG();
} else
rds_iw_stats_inc(s_iw_ack_sent);
}
}
/* Can we get a send credit? */
- if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
+ if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0)) {
rds_iw_stats_inc(s_iw_tx_throttle);
clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
return;
to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
- addr = kmap_atomic(frag->f_page);
+ addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
src = addr + frag_off;
dst = (void *)map->m_page_addrs[map_page] + map_off;
uncongested |= ~(*src) & *dst;
*dst++ = *src++;
}
- kunmap_atomic(addr);
+ kunmap_atomic(addr, KM_SOFTIRQ0);
copied += to_copy;
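/*
 * A tiny standalone illustration of the bit trick in the copy loop
 * above: while overwriting the stored congestion map with the update,
 * ~new & old picks out exactly the ports that went from congested to
 * uncongested, so waiting senders can be woken afterwards.  The values
 * are made up for the example.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t stored = 0xf000000000000003ULL;	/* old map word */
	uint64_t update = 0x3000000000000001ULL;	/* new map word */
	uint64_t uncongested = ~update & stored;

	stored = update;				/* the copy itself */
	printf("stored %#llx, newly uncongested %#llx\n",
	       (unsigned long long)stored,
	       (unsigned long long)uncongested);
	return 0;
}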
if (byte_len < sizeof(struct rds_header)) {
rds_iw_conn_error(conn, "incoming message "
- "from %pI4 didn't include a "
+ "from %pI4 didn't include a "
"header, disconnecting and "
"reconnecting\n",
&conn->c_faddr);
* into the inc and save the inc so we can hang upcoming fragments
* off its list.
*/
- if (!iwinc) {
+ if (iwinc == NULL) {
iwinc = recv->r_iwinc;
recv->r_iwinc = NULL;
ic->i_iwinc = iwinc;
hdr = &iwinc->ii_inc.i_hdr;
/* We can't just use memcmp here; fragments of a
* single message may carry different ACKs */
- if (hdr->h_sequence != ihdr->h_sequence ||
- hdr->h_len != ihdr->h_len ||
- hdr->h_sport != ihdr->h_sport ||
- hdr->h_dport != ihdr->h_dport) {
+ if (hdr->h_sequence != ihdr->h_sequence
+ || hdr->h_len != ihdr->h_len
+ || hdr->h_sport != ihdr->h_sport
+ || hdr->h_dport != ihdr->h_dport) {
rds_iw_conn_error(conn,
"fragment header mismatch; forcing reconnect\n");
return;
rds_iw_cong_recv(conn, iwinc);
else {
rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
- &iwinc->ii_inc, GFP_ATOMIC);
+ &iwinc->ii_inc, GFP_ATOMIC,
+ KM_SOFTIRQ0);
state->ack_next = be64_to_cpu(hdr->h_sequence);
state->ack_next_valid = 1;
}
{
struct rds_connection *conn = context;
struct rds_iw_connection *ic = conn->c_transport_data;
+ struct ib_wc wc;
+ struct rds_iw_ack_state state = { 0, };
+ struct rds_iw_recv_work *recv;
rdsdebug("conn %p cq %p\n", conn, cq);
rds_iw_stats_inc(s_iw_rx_cq_call);
- tasklet_schedule(&ic->i_recv_tasklet);
-}
-
-static inline void rds_poll_cq(struct rds_iw_connection *ic,
- struct rds_iw_ack_state *state)
-{
- struct rds_connection *conn = ic->conn;
- struct ib_wc wc;
- struct rds_iw_recv_work *recv;
+ ib_req_notify_cq(cq, IB_CQ_SOLICITED);
- while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
+ while (ib_poll_cq(cq, 1, &wc) > 0) {
rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
(unsigned long long)wc.wr_id, wc.status, wc.byte_len,
be32_to_cpu(wc.ex.imm_data));
if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
/* We expect errors as the qp is drained during shutdown */
if (wc.status == IB_WC_SUCCESS) {
- rds_iw_process_recv(conn, recv, wc.byte_len, state);
+ rds_iw_process_recv(conn, recv, wc.byte_len, &state);
} else {
rds_iw_conn_error(conn, "recv completion on "
"%pI4 had status %u, disconnecting and "
rds_iw_ring_free(&ic->i_recv_ring, 1);
}
-}
-
-void rds_iw_recv_tasklet_fn(unsigned long data)
-{
- struct rds_iw_connection *ic = (struct rds_iw_connection *) data;
- struct rds_connection *conn = ic->conn;
- struct rds_iw_ack_state state = { 0, };
-
- rds_poll_cq(ic, &state);
- ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
- rds_poll_cq(ic, &state);
if (state.ack_next_valid)
rds_iw_set_ack(ic, state.ack_next, state.ack_required);
return ret;
}
-int rds_iw_recv_init(void)
+int __init rds_iw_recv_init(void)
{
struct sysinfo si;
int ret = -ENOMEM;
rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
sizeof(struct rds_iw_incoming),
0, 0, NULL);
- if (!rds_iw_incoming_slab)
+ if (rds_iw_incoming_slab == NULL)
goto out;
rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
sizeof(struct rds_page_frag),
0, 0, NULL);
- if (!rds_iw_frag_slab)
+ if (rds_iw_frag_slab == NULL)
kmem_cache_destroy(rds_iw_incoming_slab);
else
ret = 0;
int rds_iw_ring_low(struct rds_iw_work_ring *ring)
{
- return __rds_iw_ring_used(ring) <= (ring->w_nr >> 1);
+ return __rds_iw_ring_used(ring) <= (ring->w_nr >> 2);
}
#include <linux/in.h>
#include <linux/device.h>
#include <linux/dmapool.h>
-#include <linux/ratelimit.h>
#include "rds.h"
+#include "rdma.h"
#include "iw.h"
static void rds_iw_send_rdma_complete(struct rds_message *rm,
}
static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic,
- struct rm_rdma_op *op)
+ struct rds_rdma_op *op)
{
- if (op->op_mapped) {
+ if (op->r_mapped) {
ib_dma_unmap_sg(ic->i_cm_id->device,
- op->op_sg, op->op_nents,
- op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
- op->op_mapped = 0;
+ op->r_sg, op->r_nents,
+ op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ op->r_mapped = 0;
}
}
rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
ib_dma_unmap_sg(ic->i_cm_id->device,
- rm->data.op_sg, rm->data.op_nents,
+ rm->m_sg, rm->m_nents,
DMA_TO_DEVICE);
- if (rm->rdma.op_active) {
- rds_iw_send_unmap_rdma(ic, &rm->rdma);
+ if (rm->m_rdma_op != NULL) {
+ rds_iw_send_unmap_rdma(ic, rm->m_rdma_op);
/* If the user asked for a completion notification on this
* message, we can implement three different semantics:
*/
rds_iw_send_rdma_complete(rm, wc_status);
- if (rm->rdma.op_write)
- rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes);
+ if (rm->m_rdma_op->r_write)
+ rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
else
- rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes);
+ rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
}
/* If anyone waited for this message to get flushed out, wake
}
if (wc.wr_id == RDS_IW_ACK_WR_ID) {
- if (time_after(jiffies, ic->i_ack_queued + HZ/2))
+ if (ic->i_ack_queued + HZ/2 < jiffies)
rds_iw_stats_inc(s_iw_tx_stalled);
rds_iw_ack_send_complete(ic);
continue;
* when the SEND completes. */
break;
default:
- printk_ratelimited(KERN_NOTICE
+ if (printk_ratelimit())
+ printk(KERN_NOTICE
"RDS/IW: %s: unexpected opcode 0x%x in WR!\n",
__func__, send->s_wr.opcode);
break;
send->s_wr.opcode = 0xdead;
send->s_wr.num_sge = 1;
- if (time_after(jiffies, send->s_queued + HZ/2))
+ if (send->s_queued + HZ/2 < jiffies)
rds_iw_stats_inc(s_iw_tx_stalled);
/* If a RDMA operation produced an error, signal this right
rds_iw_ring_free(&ic->i_send_ring, completed);
- if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
- test_bit(0, &conn->c_map_queued))
+ if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)
+ || test_bit(0, &conn->c_map_queued))
queue_delayed_work(rds_wq, &conn->c_send_w, 0);
/* We expect errors as the qp is drained during shutdown */
*
* Conceptually, we have two counters:
* - send credits: this tells us how many WRs we're allowed
- * to submit without overruning the receiver's queue. For
+ * to submit without overrunning the receiver's queue. For
* each SEND WR we post, we decrement this by one.
*
* - posted credits: this tells us how many WRs we recently
* and using atomic_cmpxchg when updating the two counters.
*/
int rds_iw_send_grab_credits(struct rds_iw_connection *ic,
- u32 wanted, u32 *adv_credits, int need_posted, int max_posted)
+ u32 wanted, u32 *adv_credits, int need_posted)
{
unsigned int avail, posted, got = 0, advertise;
long oldval, newval;
posted = IB_GET_POST_CREDITS(oldval);
avail = IB_GET_SEND_CREDITS(oldval);
- rdsdebug("wanted=%u credits=%u posted=%u\n",
+ rdsdebug("rds_iw_send_grab_credits(%u): credits=%u posted=%u\n",
wanted, avail, posted);
/* The last credit must be used to send a credit update. */
* available.
*/
if (posted && (got || need_posted)) {
- advertise = min_t(unsigned int, posted, max_posted);
+ advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT);
newval -= IB_SET_POST_CREDITS(advertise);
}
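
Both counters described above live in a single atomic word so that one atomic_cmpxchg() can consume send credits without losing a concurrent update to the posted count. A minimal sketch of that pattern, assuming an invented 16/16-bit layout and helper macros (the driver's real IB_GET_*/IB_SET_* macros and field widths differ):

/* Illustrative only: send credits in the low 16 bits, posted credits in
 * the high 16 bits of a single atomic_t. */
#include <linux/kernel.h>
#include <asm/atomic.h>

#define EX_SEND_CREDITS(v)	((v) & 0xffff)
#define EX_POST_CREDITS(v)	(((v) >> 16) & 0xffff)
#define EX_PACK_CREDITS(s, p)	((s) | ((p) << 16))

static int ex_grab_send_credits(atomic_t *credits, int wanted)
{
	int oldval, newval, got;

	do {
		oldval = atomic_read(credits);
		got = min(wanted, EX_SEND_CREDITS(oldval));
		/* take 'got' send credits, leave the posted count untouched */
		newval = EX_PACK_CREDITS(EX_SEND_CREDITS(oldval) - got,
					 EX_POST_CREDITS(oldval));
	} while (atomic_cmpxchg(credits, oldval, newval) != oldval);

	return got;
}
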
if (credits == 0)
return;
- rdsdebug("credits=%u current=%u%s\n",
+ rdsdebug("rds_iw_send_add_credits(%u): current=%u%s\n",
credits,
IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
/* Fastreg support */
- if (rds_rdma_cookie_key(rm->m_rdma_cookie) && !ic->i_fastreg_posted) {
+ if (rds_rdma_cookie_key(rm->m_rdma_cookie)
+ && !ic->i_fastreg_posted) {
ret = -EAGAIN;
goto out;
}
credit_alloc = work_alloc;
if (ic->i_flowctl) {
- credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
+ credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0);
adv_credits += posted;
if (credit_alloc < work_alloc) {
rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
flow_controlled++;
}
if (work_alloc == 0) {
- set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
+ rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
rds_iw_stats_inc(s_iw_tx_throttle);
ret = -ENOMEM;
goto out;
}
/* map the message the first time we see it */
- if (!ic->i_rm) {
+ if (ic->i_rm == NULL) {
/*
printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n",
be16_to_cpu(rm->m_inc.i_hdr.h_dport),
rm->m_inc.i_hdr.h_flags,
be32_to_cpu(rm->m_inc.i_hdr.h_len));
*/
- if (rm->data.op_nents) {
- rm->data.op_count = ib_dma_map_sg(dev,
- rm->data.op_sg,
- rm->data.op_nents,
- DMA_TO_DEVICE);
- rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
- if (rm->data.op_count == 0) {
+ if (rm->m_nents) {
+ rm->m_count = ib_dma_map_sg(dev,
+ rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
+ rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
+ if (rm->m_count == 0) {
rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
ret = -ENOMEM; /* XXX ? */
goto out;
}
} else {
- rm->data.op_count = 0;
+ rm->m_count = 0;
}
ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
/* If it has a RDMA op, tell the peer we did it. This is
* used by the peer to release use-once RDMA MRs. */
- if (rm->rdma.op_active) {
+ if (rm->m_rdma_op) {
struct rds_ext_header_rdma ext_hdr;
- ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
+ ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
}
/*
* Update adv_credits since we reset the ACK_REQUIRED bit.
*/
- rds_iw_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
+ rds_iw_send_grab_credits(ic, 0, &posted, 1);
adv_credits += posted;
BUG_ON(adv_credits > 255);
- }
+ } else if (ic->i_rm != rm)
+ BUG();
send = &ic->i_sends[pos];
first = send;
prev = NULL;
- scat = &rm->data.op_sg[sg];
+ scat = &rm->m_sg[sg];
sent = 0;
i = 0;
* or when requested by the user. Right now, we let
* the application choose.
*/
- if (rm->rdma.op_active && rm->rdma.op_fence)
+ if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
send_flags = IB_SEND_FENCE;
/*
}
/* if there's data reference it with a chain of work reqs */
- for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) {
+ for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
unsigned int len;
send = &ic->i_sends[pos];
sent += sizeof(struct rds_header);
/* if we finished the message then send completion owns it */
- if (scat == &rm->data.op_sg[rm->data.op_count]) {
+ if (scat == &rm->m_sg[rm->m_count]) {
prev->s_rm = ic->i_rm;
prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
ic->i_rm = NULL;
send->s_wr.wr.fast_reg.rkey = send->s_mr->rkey;
send->s_wr.wr.fast_reg.page_list = send->s_page_list;
send->s_wr.wr.fast_reg.page_list_len = nent;
- send->s_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+ send->s_wr.wr.fast_reg.page_shift = rds_iwdev->page_shift;
send->s_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE;
send->s_wr.wr.fast_reg.iova_start = sg_addr;
ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
}
-int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
+int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
{
struct rds_iw_connection *ic = conn->c_transport_data;
struct rds_iw_send_work *send = NULL;
struct rds_iw_device *rds_iwdev;
struct scatterlist *scat;
unsigned long len;
- u64 remote_addr = op->op_remote_addr;
+ u64 remote_addr = op->r_remote_addr;
u32 pos, fr_pos;
u32 work_alloc;
u32 i;
rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
/* map the message the first time we see it */
- if (!op->op_mapped) {
- op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
- op->op_sg, op->op_nents, (op->op_write) ?
- DMA_TO_DEVICE : DMA_FROM_DEVICE);
- rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
- if (op->op_count == 0) {
+ if (!op->r_mapped) {
+ op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
+ op->r_sg, op->r_nents, (op->r_write) ?
+ DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
+ if (op->r_count == 0) {
rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
ret = -ENOMEM; /* XXX ? */
goto out;
}
- op->op_mapped = 1;
+ op->r_mapped = 1;
}
- if (!op->op_write) {
+ if (!op->r_write) {
/* Alloc space on the send queue for the fastreg */
work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos);
if (work_alloc != 1) {
* Instead of knowing how to return a partial rdma read/write we insist that there
* be enough work requests to send the entire message.
*/
- i = ceil(op->op_count, rds_iwdev->max_sge);
+ i = ceil(op->r_count, rds_iwdev->max_sge);
work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
if (work_alloc != i) {
}
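
As a concrete illustration of the all-or-nothing sizing above (numbers invented for the example): an op mapped to op->r_count == 10 scatterlist entries on a device with max_sge == 4 needs ceil(10, 4) == 3 work requests, and if rds_iw_ring_alloc() cannot hand back all 3 slots at once the attempt is abandoned rather than sending a partial RDMA.
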
send = &ic->i_sends[pos];
- if (!op->op_write) {
+ if (!op->r_write) {
first = prev = &ic->i_sends[fr_pos];
} else {
first = send;
prev = NULL;
}
- scat = &op->op_sg[0];
+ scat = &op->r_sg[0];
sent = 0;
- num_sge = op->op_count;
+ num_sge = op->r_count;
- for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
+ for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
send->s_wr.send_flags = 0;
send->s_queued = jiffies;
* for local access after RDS is finished with it, using
* IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed.
*/
- if (op->op_write)
+ if (op->r_write)
send->s_wr.opcode = IB_WR_RDMA_WRITE;
else
send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
send->s_wr.wr.rdma.remote_addr = remote_addr;
- send->s_wr.wr.rdma.rkey = op->op_rkey;
+ send->s_wr.wr.rdma.rkey = op->r_key;
send->s_op = op;
if (num_sge > rds_iwdev->max_sge) {
if (prev)
prev->s_wr.next = &send->s_wr;
- for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
+ for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
len = ib_sg_dma_len(ic->i_cm_id->device, scat);
if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV)
}
/* if we finished the message then send completion owns it */
- if (scat == &op->op_sg[op->op_count])
+ if (scat == &op->r_sg[op->r_count])
first->s_wr.send_flags = IB_SEND_SIGNALED;
if (i < work_alloc) {
* adapters do not allow using the lkey for this at all. To bypass this use a
* fastreg_mr (or possibly a dma_mr)
*/
- if (!op->op_write) {
+ if (!op->r_write) {
rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos],
- op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
+ op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
work_alloc++;
}
#include "rds.h"
#include "iw.h"
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats);
+DEFINE_PER_CPU(struct rds_iw_statistics, rds_iw_stats) ____cacheline_aligned;
-static const char *const rds_iw_stat_names[] = {
+static char *rds_iw_stat_names[] = {
"iw_connect_raced",
"iw_listen_closed_stale",
"iw_tx_cq_call",
unsigned int rds_iw_sysctl_flow_control = 1;
-static struct ctl_table rds_iw_sysctl_table[] = {
+ctl_table rds_iw_sysctl_table[] = {
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "max_send_wr",
.data = &rds_iw_sysctl_max_send_wr,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
+ .proc_handler = &proc_doulongvec_minmax,
.extra1 = &rds_iw_sysctl_max_wr_min,
.extra2 = &rds_iw_sysctl_max_wr_max,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "max_recv_wr",
.data = &rds_iw_sysctl_max_recv_wr,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
+ .proc_handler = &proc_doulongvec_minmax,
.extra1 = &rds_iw_sysctl_max_wr_min,
.extra2 = &rds_iw_sysctl_max_wr_max,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "max_unsignaled_wr",
.data = &rds_iw_sysctl_max_unsig_wrs,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
+ .proc_handler = &proc_doulongvec_minmax,
.extra1 = &rds_iw_sysctl_max_unsig_wr_min,
.extra2 = &rds_iw_sysctl_max_unsig_wr_max,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "max_unsignaled_bytes",
.data = &rds_iw_sysctl_max_unsig_bytes,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
+ .proc_handler = &proc_doulongvec_minmax,
.extra1 = &rds_iw_sysctl_max_unsig_bytes_min,
.extra2 = &rds_iw_sysctl_max_unsig_bytes_max,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "max_recv_allocation",
.data = &rds_iw_sysctl_max_recv_allocation,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
+ .proc_handler = &proc_doulongvec_minmax,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "flow_control",
.data = &rds_iw_sysctl_flow_control,
.maxlen = sizeof(rds_iw_sysctl_flow_control),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = &proc_dointvec,
},
+ { .ctl_name = 0}
+};
+
+static struct ctl_path rds_iw_sysctl_path[] = {
+ { .procname = "net", .ctl_name = CTL_NET, },
+ { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
+ { .procname = "iw", .ctl_name = CTL_UNNUMBERED, },
{ }
};
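
For reference: registered under the { net, rds, iw } ctl_path above, each table entry appears as /proc/sys/net/rds/iw/<procname> (for example /proc/sys/net/rds/iw/flow_control); the CTL_UNNUMBERED ctl_name values simply mean the entries carry no binary sysctl(2) number.
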
void rds_iw_sysctl_exit(void)
{
- unregister_net_sysctl_table(rds_iw_sysctl_hdr);
+ if (rds_iw_sysctl_hdr)
+ unregister_sysctl_table(rds_iw_sysctl_hdr);
}
-int rds_iw_sysctl_init(void)
+int __init rds_iw_sysctl_init(void)
{
- rds_iw_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/iw", rds_iw_sysctl_table);
- if (!rds_iw_sysctl_hdr)
+ rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table);
+ if (rds_iw_sysctl_hdr == NULL)
return -ENOMEM;
return 0;
}
*
*/
#include <linux/kernel.h>
-#include <linux/slab.h>
#include <linux/in.h>
#include "rds.h"
unsigned int hdr_off, unsigned int sg,
unsigned int off)
{
- struct scatterlist *sgp = &rm->data.op_sg[sg];
- int ret = sizeof(struct rds_header) +
- be32_to_cpu(rm->m_inc.i_hdr.h_len);
-
- /* Do not send cong updates to loopback */
- if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) {
- rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
- ret = min_t(int, ret, sgp->length - conn->c_xmit_data_off);
- goto out;
- }
-
BUG_ON(hdr_off || sg || off);
rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
- /* For the embedded inc. Matching put is in loop_inc_free() */
- rds_message_addref(rm);
+ rds_message_addref(rm); /* for the inc */
rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
- GFP_KERNEL);
+ GFP_KERNEL, KM_USER0);
rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence),
NULL);
rds_inc_put(&rm->m_inc);
-out:
- return ret;
+
+ return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len);
}
-/*
- * See rds_loop_xmit(). Since our inc is embedded in the rm, we
- * make sure the rm lives at least until the inc is done.
- */
-static void rds_loop_inc_free(struct rds_incoming *inc)
+static int rds_loop_xmit_cong_map(struct rds_connection *conn,
+ struct rds_cong_map *map,
+ unsigned long offset)
{
- struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
- rds_message_put(rm);
+ unsigned long i;
+
+ BUG_ON(offset);
+ BUG_ON(map != conn->c_lcong);
+
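+ /* Loopback "transmits" the congestion bitmap by copying the local map's
+ * pages straight into our own copy of the peer's map (c_fcong) and then
+ * marking the entire map as updated. */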
+ for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
+ memcpy((void *)conn->c_fcong->m_page_addrs[i],
+ (void *)map->m_page_addrs[i], PAGE_SIZE);
+ }
+
+ rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
+
+ return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
}
/* we need to at least give the thread something to succeed */
struct rds_loop_connection *lc;
unsigned long flags;
- lc = kzalloc(sizeof(struct rds_loop_connection), gfp);
- if (!lc)
+ lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL);
+ if (lc == NULL)
return -ENOMEM;
INIT_LIST_HEAD(&lc->loop_node);
static void rds_loop_conn_free(void *arg)
{
struct rds_loop_connection *lc = arg;
- unsigned long flags;
-
rdsdebug("lc %p\n", lc);
- spin_lock_irqsave(&loop_conns_lock, flags);
list_del(&lc->loop_node);
- spin_unlock_irqrestore(&loop_conns_lock, flags);
kfree(lc);
}
*/
struct rds_transport rds_loop_transport = {
.xmit = rds_loop_xmit,
+ .xmit_cong_map = rds_loop_xmit_cong_map,
.recv = rds_loop_recv,
.conn_alloc = rds_loop_conn_alloc,
.conn_free = rds_loop_conn_free,
.conn_connect = rds_loop_conn_connect,
.conn_shutdown = rds_loop_conn_shutdown,
.inc_copy_to_user = rds_message_inc_copy_to_user,
- .inc_free = rds_loop_inc_free,
+ .inc_purge = rds_message_inc_purge,
+ .inc_free = rds_message_inc_free,
.t_name = "loopback",
};
*
*/
#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/export.h>
#include "rds.h"
+#include "rdma.h"
+
+static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq);
static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
[RDS_EXTHDR_NONE] = 0,
rdsdebug("addref rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
atomic_inc(&rm->m_refcount);
}
-EXPORT_SYMBOL_GPL(rds_message_addref);
/*
* This relies on dma_map_sg() not touching sg[].page during merging.
if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
return;
- for (i = 0; i < rm->data.op_nents; i++) {
- rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i]));
+ for (i = 0; i < rm->m_nents; i++) {
+ rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i]));
/* XXX will have to put_page for page refs */
- __free_page(sg_page(&rm->data.op_sg[i]));
+ __free_page(sg_page(&rm->m_sg[i]));
}
- rm->data.op_nents = 0;
+ rm->m_nents = 0;
- if (rm->rdma.op_active)
- rds_rdma_free_op(&rm->rdma);
- if (rm->rdma.op_rdma_mr)
- rds_mr_put(rm->rdma.op_rdma_mr);
+ if (rm->m_rdma_op)
+ rds_rdma_free_op(rm->m_rdma_op);
+ if (rm->m_rdma_mr)
+ rds_mr_put(rm->m_rdma_mr);
+}
- if (rm->atomic.op_active)
- rds_atomic_free_op(&rm->atomic);
- if (rm->atomic.op_rdma_mr)
- rds_mr_put(rm->atomic.op_rdma_mr);
+void rds_message_inc_purge(struct rds_incoming *inc)
+{
+ struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
+ rds_message_purge(rm);
}
void rds_message_put(struct rds_message *rm)
{
rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
- WARN(!atomic_read(&rm->m_refcount), "danger refcount zero on %p\n", rm);
+
if (atomic_dec_and_test(&rm->m_refcount)) {
BUG_ON(!list_empty(&rm->m_sock_item));
BUG_ON(!list_empty(&rm->m_conn_item));
kfree(rm);
}
}
-EXPORT_SYMBOL_GPL(rds_message_put);
+
+void rds_message_inc_free(struct rds_incoming *inc)
+{
+ struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
+ rds_message_put(rm);
+}
void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
__be16 dport, u64 seq)
hdr->h_sequence = cpu_to_be64(seq);
hdr->h_exthdr[0] = RDS_EXTHDR_NONE;
}
-EXPORT_SYMBOL_GPL(rds_message_populate_header);
-int rds_message_add_extension(struct rds_header *hdr, unsigned int type,
- const void *data, unsigned int len)
+int rds_message_add_extension(struct rds_header *hdr,
+ unsigned int type, const void *data, unsigned int len)
{
unsigned int ext_len = sizeof(u8) + len;
unsigned char *dst;
if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE)
return 0;
- if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type])
+ if (type >= __RDS_EXTHDR_MAX
+ || len != rds_exthdr_size[type])
return 0;
if (ext_len >= RDS_HEADER_EXT_SPACE)
dst[len] = RDS_EXTHDR_NONE;
return 1;
}
-EXPORT_SYMBOL_GPL(rds_message_add_extension);
/*
* If a message has extension headers, retrieve them here.
return RDS_EXTHDR_NONE;
}
+int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version)
+{
+ struct rds_ext_header_version ext_hdr;
+
+ ext_hdr.h_version = cpu_to_be32(version);
+ return rds_message_add_extension(hdr, RDS_EXTHDR_VERSION, &ext_hdr, sizeof(ext_hdr));
+}
+
+int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version)
+{
+ struct rds_ext_header_version ext_hdr;
+ unsigned int pos = 0, len = sizeof(ext_hdr);
+
+ /* We assume the version extension is the only one present */
+ if (rds_message_next_extension(hdr, &pos, &ext_hdr, &len) != RDS_EXTHDR_VERSION)
+ return 0;
+ *version = be32_to_cpu(ext_hdr.h_version);
+ return 1;
+}
+
int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset)
{
struct rds_ext_header_rdma_dest ext_hdr;
ext_hdr.h_rdma_offset = cpu_to_be32(offset);
return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr));
}
-EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);
-/*
- * Each rds_message is allocated with extra space for the scatterlist entries
- * rds ops will need. This is to minimize memory allocation count. Then, each rds op
- * can grab SGs when initializing its part of the rds_message.
- */
-struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp)
+struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp)
{
struct rds_message *rm;
- if (extra_len > KMALLOC_MAX_SIZE - sizeof(struct rds_message))
- return NULL;
-
- rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp);
+ rm = kzalloc(sizeof(struct rds_message) +
+ (nents * sizeof(struct scatterlist)), gfp);
if (!rm)
goto out;
- rm->m_used_sgs = 0;
- rm->m_total_sgs = extra_len / sizeof(struct scatterlist);
-
+ if (nents)
+ sg_init_table(rm->m_sg, nents);
atomic_set(&rm->m_refcount, 1);
INIT_LIST_HEAD(&rm->m_sock_item);
INIT_LIST_HEAD(&rm->m_conn_item);
spin_lock_init(&rm->m_rs_lock);
- init_waitqueue_head(&rm->m_flush_wait);
out:
return rm;
}
-/*
- * RDS ops use this to grab SG entries from the rm's sg pool.
- */
-struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents)
-{
- struct scatterlist *sg_first = (struct scatterlist *) &rm[1];
- struct scatterlist *sg_ret;
-
- WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs);
- WARN_ON(!nents);
-
- if (rm->m_used_sgs + nents > rm->m_total_sgs)
- return NULL;
-
- sg_ret = &sg_first[rm->m_used_sgs];
- sg_init_table(sg_ret, nents);
- rm->m_used_sgs += nents;
-
- return sg_ret;
-}
-
struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len)
{
struct rds_message *rm;
unsigned int i;
- int num_sgs = ceil(total_len, PAGE_SIZE);
- int extra_bytes = num_sgs * sizeof(struct scatterlist);
- rm = rds_message_alloc(extra_bytes, GFP_NOWAIT);
- if (!rm)
+ rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
+ if (rm == NULL)
return ERR_PTR(-ENOMEM);
set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
- rm->data.op_nents = ceil(total_len, PAGE_SIZE);
- rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
- if (!rm->data.op_sg) {
- rds_message_put(rm);
- return ERR_PTR(-ENOMEM);
- }
+ rm->m_nents = ceil(total_len, PAGE_SIZE);
- for (i = 0; i < rm->data.op_nents; ++i) {
- sg_set_page(&rm->data.op_sg[i],
+ for (i = 0; i < rm->m_nents; ++i) {
+ sg_set_page(&rm->m_sg[i],
virt_to_page(page_addrs[i]),
PAGE_SIZE, 0);
}
return rm;
}
-int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from)
+struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
+ size_t total_len)
{
- unsigned long to_copy, nbytes;
+ unsigned long to_copy;
+ unsigned long iov_off;
unsigned long sg_off;
+ struct rds_message *rm;
+ struct iovec *iov;
struct scatterlist *sg;
- int ret = 0;
+ int ret;
+
+ rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
+ if (rm == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
- rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from));
+ rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
/*
* now allocate and copy in the data payload.
*/
- sg = rm->data.op_sg;
+ sg = rm->m_sg;
+ iov = first_iov;
+ iov_off = 0;
sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
- while (iov_iter_count(from)) {
- if (!sg_page(sg)) {
- ret = rds_page_remainder_alloc(sg, iov_iter_count(from),
+ while (total_len) {
+ if (sg_page(sg) == NULL) {
+ ret = rds_page_remainder_alloc(sg, total_len,
GFP_HIGHUSER);
if (ret)
- return ret;
- rm->data.op_nents++;
+ goto out;
+ rm->m_nents++;
sg_off = 0;
}
- to_copy = min_t(unsigned long, iov_iter_count(from),
- sg->length - sg_off);
+ while (iov_off == iov->iov_len) {
+ iov_off = 0;
+ iov++;
+ }
+
+ to_copy = min(iov->iov_len - iov_off, sg->length - sg_off);
+ to_copy = min_t(size_t, to_copy, total_len);
+
+ rdsdebug("copying %lu bytes from user iov [%p, %zu] + %lu to "
+ "sg [%p, %u, %u] + %lu\n",
+ to_copy, iov->iov_base, iov->iov_len, iov_off,
+ (void *)sg_page(sg), sg->offset, sg->length, sg_off);
- rds_stats_add(s_copy_from_user, to_copy);
- nbytes = copy_page_from_iter(sg_page(sg), sg->offset + sg_off,
- to_copy, from);
- if (nbytes != to_copy)
- return -EFAULT;
+ ret = rds_page_copy_from_user(sg_page(sg), sg->offset + sg_off,
+ iov->iov_base + iov_off,
+ to_copy);
+ if (ret)
+ goto out;
+ iov_off += to_copy;
+ total_len -= to_copy;
sg_off += to_copy;
if (sg_off == sg->length)
sg++;
}
- return ret;
+ ret = 0;
+out:
+ if (ret) {
+ if (rm)
+ rds_message_put(rm);
+ rm = ERR_PTR(ret);
+ }
+ return rm;
}
-int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
+int rds_message_inc_copy_to_user(struct rds_incoming *inc,
+ struct iovec *first_iov, size_t size)
{
struct rds_message *rm;
+ struct iovec *iov;
struct scatterlist *sg;
unsigned long to_copy;
+ unsigned long iov_off;
unsigned long vec_off;
int copied;
int ret;
rm = container_of(inc, struct rds_message, m_inc);
len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
- sg = rm->data.op_sg;
+ iov = first_iov;
+ iov_off = 0;
+ sg = rm->m_sg;
vec_off = 0;
copied = 0;
- while (iov_iter_count(to) && copied < len) {
- to_copy = min_t(unsigned long, iov_iter_count(to),
- sg->length - vec_off);
+ while (copied < size && copied < len) {
+ while (iov_off == iov->iov_len) {
+ iov_off = 0;
+ iov++;
+ }
+
+ to_copy = min(iov->iov_len - iov_off, sg->length - vec_off);
+ to_copy = min_t(size_t, to_copy, size - copied);
to_copy = min_t(unsigned long, to_copy, len - copied);
- rds_stats_add(s_copy_to_user, to_copy);
- ret = copy_page_to_iter(sg_page(sg), sg->offset + vec_off,
- to_copy, to);
- if (ret != to_copy)
- return -EFAULT;
+ rdsdebug("copying %lu bytes to user iov [%p, %zu] + %lu to "
+ "sg [%p, %u, %u] + %lu\n",
+ to_copy, iov->iov_base, iov->iov_len, iov_off,
+ sg_page(sg), sg->offset, sg->length, vec_off);
+
+ ret = rds_page_copy_to_user(sg_page(sg), sg->offset + vec_off,
+ iov->iov_base + iov_off,
+ to_copy);
+ if (ret) {
+ copied = ret;
+ break;
+ }
+ iov_off += to_copy;
vec_off += to_copy;
copied += to_copy;
*/
void rds_message_wait(struct rds_message *rm)
{
- wait_event_interruptible(rm->m_flush_wait,
+ wait_event(rds_message_flush_waitq,
!test_bit(RDS_MSG_MAPPED, &rm->m_flags));
}
void rds_message_unmapped(struct rds_message *rm)
{
clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
- wake_up_interruptible(&rm->m_flush_wait);
+ if (waitqueue_active(&rds_message_flush_waitq))
+ wake_up(&rds_message_flush_waitq);
}
-EXPORT_SYMBOL_GPL(rds_message_unmapped);
*
*/
#include <linux/highmem.h>
-#include <linux/gfp.h>
-#include <linux/cpu.h>
-#include <linux/export.h>
#include "rds.h"
unsigned long r_offset;
};
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder,
- rds_page_remainders);
+DEFINE_PER_CPU(struct rds_page_remainder, rds_page_remainders) ____cacheline_aligned;
/*
* returns 0 on success or -errno on failure.
unsigned long ret;
void *addr;
- addr = kmap(page);
- if (to_user) {
+ if (to_user)
rds_stats_add(s_copy_to_user, bytes);
- ret = copy_to_user(ptr, addr + offset, bytes);
- } else {
+ else
rds_stats_add(s_copy_from_user, bytes);
- ret = copy_from_user(addr + offset, ptr, bytes);
+
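+ /* Fast path: atomic kmap and an in-atomic copy, which cannot sleep and
+ * so fails if the user page has to be faulted in. On failure, fall back
+ * below to a regular kmap() and a copy that is allowed to fault. */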
+ addr = kmap_atomic(page, KM_USER0);
+ if (to_user)
+ ret = __copy_to_user_inatomic(ptr, addr + offset, bytes);
+ else
+ ret = __copy_from_user_inatomic(addr + offset, ptr, bytes);
+ kunmap_atomic(addr, KM_USER0);
+
+ if (ret) {
+ addr = kmap(page);
+ if (to_user)
+ ret = copy_to_user(ptr, addr + offset, bytes);
+ else
+ ret = copy_from_user(addr + offset, ptr, bytes);
+ kunmap(page);
+ if (ret)
+ return -EFAULT;
}
- kunmap(page);
- return ret ? -EFAULT : 0;
+ return 0;
}
-EXPORT_SYMBOL_GPL(rds_page_copy_user);
-/**
- * rds_page_remainder_alloc - build up regions of a message.
+/*
+ * Message allocation uses this to build up regions of a message.
*
- * @scat: Scatter list for message
- * @bytes: the number of bytes needed.
- * @gfp: the waiting behaviour of the allocation
+ * @bytes - the number of bytes needed.
+ * @gfp - the waiting behaviour of the allocation
*
* @gfp is always ored with __GFP_HIGHMEM. Callers must be prepared to
* kmap the pages, etc.
/* jump straight to allocation if we're trying for a huge page */
if (bytes >= PAGE_SIZE) {
page = alloc_page(gfp);
- if (!page) {
+ if (page == NULL) {
ret = -ENOMEM;
} else {
sg_set_page(scat, page, PAGE_SIZE, 0);
rem = &per_cpu(rds_page_remainders, get_cpu());
local_irq_save(flags);
- if (!page) {
+ if (page == NULL) {
ret = -ENOMEM;
break;
}
ret ? 0 : scat->length);
return ret;
}
-EXPORT_SYMBOL_GPL(rds_page_remainder_alloc);
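
A minimal sketch of a caller of the allocator documented above (hypothetical helper, not part of this patch; it assumes the rds_page_remainder_alloc() declaration in rds.h): because the allocation is always made with __GFP_HIGHMEM, the page must be kmap()ed before it is touched.

#include <linux/highmem.h>
#include <linux/scatterlist.h>
#include <linux/string.h>
#include <linux/gfp.h>
#include "rds.h"

/* Ask for up to 'len' bytes in one scatterlist entry and zero whatever was
 * granted; sg->length may come back smaller than len, so real callers loop. */
static int ex_fill_frag(struct scatterlist *sg, unsigned long len)
{
	void *addr;
	int ret;

	ret = rds_page_remainder_alloc(sg, len, GFP_KERNEL);
	if (ret)
		return ret;

	addr = kmap(sg_page(sg));
	memset(addr + sg->offset, 0, sg->length);
	kunmap(sg_page(sg));
	return 0;
}
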
static int rds_page_remainder_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
*
*/
#include <linux/pagemap.h>
-#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
-#include "rds.h"
+#include "rdma.h"
/*
* XXX
{
struct rds_mr *mr;
struct rb_node *node;
- unsigned long flags;
/* Release any MRs associated with this socket */
- spin_lock_irqsave(&rs->rs_rdma_lock, flags);
while ((node = rb_first(&rs->rs_rdma_keys))) {
mr = container_of(node, struct rds_mr, r_rb_node);
if (mr->r_trans == rs->rs_transport)
mr->r_invalidate = 0;
- rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
- RB_CLEAR_NODE(&mr->r_rb_node);
- spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
- rds_destroy_mr(mr);
rds_mr_put(mr);
- spin_lock_irqsave(&rs->rs_rdma_lock, flags);
}
- spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
if (rs->rs_transport && rs->rs_transport->flush_mrs)
rs->rs_transport->flush_mrs();
{
int ret;
- ret = get_user_pages_fast(user_addr, nr_pages, write, pages);
+ down_read(&current->mm->mmap_sem);
+ ret = get_user_pages(current, current->mm, user_addr,
+ nr_pages, write, 0, pages, NULL);
+ up_read(&current->mm->mmap_sem);
- if (ret >= 0 && ret < nr_pages) {
+ if (0 <= ret && (unsigned) ret < nr_pages) {
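+ /* Partial pin: get_user_pages() gave us fewer pages than requested, so
+ * release the ones we did get and fail the whole mapping. */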
while (ret--)
put_page(pages[ret]);
ret = -EFAULT;
goto out;
}
- if (!rs->rs_transport->get_mr) {
+ if (rs->rs_transport->get_mr == NULL) {
ret = -EOPNOTSUPP;
goto out;
}
/* XXX clamp nr_pages to limit the size of this alloc? */
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
- if (!pages) {
+ if (pages == NULL) {
ret = -ENOMEM;
goto out;
}
mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
- if (!mr) {
+ if (mr == NULL) {
ret = -ENOMEM;
goto out;
}
* r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
* the zero page.
*/
- ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
+ ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1);
if (ret < 0)
goto out;
nents = ret;
sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
- if (!sg) {
+ if (sg == NULL) {
ret = -ENOMEM;
goto out;
}
return __rds_rdma_map(rs, &args, NULL, NULL);
}
-int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen)
-{
- struct rds_get_mr_for_dest_args args;
- struct rds_get_mr_args new_args;
-
- if (optlen != sizeof(struct rds_get_mr_for_dest_args))
- return -EINVAL;
-
- if (copy_from_user(&args, (struct rds_get_mr_for_dest_args __user *)optval,
- sizeof(struct rds_get_mr_for_dest_args)))
- return -EFAULT;
-
- /*
- * Initially, just behave like get_mr().
- * TODO: Implement get_mr as wrapper around this
- * and deprecate it.
- */
- new_args.vec = args.vec;
- new_args.cookie_addr = args.cookie_addr;
- new_args.flags = args.flags;
-
- return __rds_rdma_map(rs, &new_args, NULL, NULL);
-}
-
/*
* Free the MR indicated by the given R_Key
*/
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
- if (!mr) {
- printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key);
- spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
- return;
- }
-
- if (mr->r_use_once || force) {
+ if (mr && (mr->r_use_once || force)) {
rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
RB_CLEAR_NODE(&mr->r_rb_node);
zot_me = 1;
- }
+ } else if (mr)
+ atomic_inc(&mr->r_refcount);
spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
/* May have to issue a dma_sync on this memory region.
* Note we could avoid this if the operation was a RDMA READ,
* but at this point we can't tell. */
- if (mr->r_trans->sync_mr)
- mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
-
- /* If the MR was marked as invalidate, this will
- * trigger an async flush. */
- if (zot_me)
- rds_destroy_mr(mr);
- rds_mr_put(mr);
+ if (mr != NULL) {
+ if (mr->r_trans->sync_mr)
+ mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
+
+ /* If the MR was marked as invalidate, this will
+ * trigger an async flush. */
+ if (zot_me)
+ rds_destroy_mr(mr);
+ rds_mr_put(mr);
+ }
}
-void rds_rdma_free_op(struct rm_rdma_op *ro)
+void rds_rdma_free_op(struct rds_rdma_op *ro)
{
unsigned int i;
- for (i = 0; i < ro->op_nents; i++) {
- struct page *page = sg_page(&ro->op_sg[i]);
+ for (i = 0; i < ro->r_nents; i++) {
+ struct page *page = sg_page(&ro->r_sg[i]);
/* Mark page dirty if it was possibly modified, which
* is the case for a RDMA_READ which copies from remote
* to local memory */
- if (!ro->op_write) {
- BUG_ON(irqs_disabled());
+ if (!ro->r_write)
set_page_dirty(page);
- }
put_page(page);
}
- kfree(ro->op_notifier);
- ro->op_notifier = NULL;
- ro->op_active = 0;
-}
-
-void rds_atomic_free_op(struct rm_atomic_op *ao)
-{
- struct page *page = sg_page(ao->op_sg);
-
- /* Mark page dirty if it was possibly modified, which
- * is the case for a RDMA_READ which copies from remote
- * to local memory */
- set_page_dirty(page);
- put_page(page);
-
- kfree(ao->op_notifier);
- ao->op_notifier = NULL;
- ao->op_active = 0;
+ kfree(ro->r_notifier);
+ kfree(ro);
}
-
/*
- * Count the number of pages needed to describe an incoming iovec array.
+ * args points to an in-kernel copy of the rds_rdma_args passed in the sendmsg cmsg.
*/
-static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs)
-{
- int tot_pages = 0;
- unsigned int nr_pages;
- unsigned int i;
-
- /* figure out the number of pages in the vector */
- for (i = 0; i < nr_iovecs; i++) {
- nr_pages = rds_pages_in_vec(&iov[i]);
- if (nr_pages == 0)
- return -EINVAL;
-
- tot_pages += nr_pages;
-
- /*
- * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
- * so tot_pages cannot overflow without first going negative.
- */
- if (tot_pages < 0)
- return -EINVAL;
- }
-
- return tot_pages;
-}
-
-int rds_rdma_extra_size(struct rds_rdma_args *args)
+static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
+ struct rds_rdma_args *args)
{
struct rds_iovec vec;
- struct rds_iovec __user *local_vec;
- int tot_pages = 0;
+ struct rds_rdma_op *op = NULL;
unsigned int nr_pages;
- unsigned int i;
-
- local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
-
- /* figure out the number of pages in the vector */
- for (i = 0; i < args->nr_local; i++) {
- if (copy_from_user(&vec, &local_vec[i],
- sizeof(struct rds_iovec)))
- return -EFAULT;
-
- nr_pages = rds_pages_in_vec(&vec);
- if (nr_pages == 0)
- return -EINVAL;
-
- tot_pages += nr_pages;
-
- /*
- * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
- * so tot_pages cannot overflow without first going negative.
- */
- if (tot_pages < 0)
- return -EINVAL;
- }
-
- return tot_pages * sizeof(struct scatterlist);
-}
-
-/*
- * The application asks for a RDMA transfer.
- * Extract all arguments and set up the rdma_op
- */
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
- struct cmsghdr *cmsg)
-{
- struct rds_rdma_args *args;
- struct rm_rdma_op *op = &rm->rdma;
- int nr_pages;
+ unsigned int max_pages;
unsigned int nr_bytes;
struct page **pages = NULL;
- struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack;
- int iov_size;
+ struct rds_iovec __user *local_vec;
+ struct scatterlist *sg;
+ unsigned int nr;
unsigned int i, j;
- int ret = 0;
-
- if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
- || rm->rdma.op_active)
- return -EINVAL;
+ int ret;
- args = CMSG_DATA(cmsg);
if (rs->rs_bound_addr == 0) {
ret = -ENOTCONN; /* XXX not a great errno */
- goto out_ret;
+ goto out;
}
- if (args->nr_local > UIO_MAXIOV) {
+ if (args->nr_local > (u64)UINT_MAX) {
ret = -EMSGSIZE;
- goto out_ret;
+ goto out;
}
- /* Check whether to allocate the iovec area */
- iov_size = args->nr_local * sizeof(struct rds_iovec);
- if (args->nr_local > UIO_FASTIOV) {
- iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL);
- if (!iovs) {
- ret = -ENOMEM;
- goto out_ret;
+ nr_pages = 0;
+ max_pages = 0;
+
+ local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
+
+ /* figure out the number of pages in the vector */
+ for (i = 0; i < args->nr_local; i++) {
+ if (copy_from_user(&vec, &local_vec[i],
+ sizeof(struct rds_iovec))) {
+ ret = -EFAULT;
+ goto out;
}
- }
- if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) {
- ret = -EFAULT;
- goto out;
- }
+ nr = rds_pages_in_vec(&vec);
+ if (nr == 0) {
+ ret = -EINVAL;
+ goto out;
+ }
- nr_pages = rds_rdma_pages(iovs, args->nr_local);
- if (nr_pages < 0) {
- ret = -EINVAL;
- goto out;
+ max_pages = max(nr, max_pages);
+ nr_pages += nr;
}
- pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
- if (!pages) {
+ pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL);
+ if (pages == NULL) {
ret = -ENOMEM;
goto out;
}
- op->op_write = !!(args->flags & RDS_RDMA_READWRITE);
- op->op_fence = !!(args->flags & RDS_RDMA_FENCE);
- op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
- op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
- op->op_active = 1;
- op->op_recverr = rs->rs_recverr;
- WARN_ON(!nr_pages);
- op->op_sg = rds_message_alloc_sgs(rm, nr_pages);
- if (!op->op_sg) {
+ op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL);
+ if (op == NULL) {
ret = -ENOMEM;
goto out;
}
- if (op->op_notify || op->op_recverr) {
+ op->r_write = !!(args->flags & RDS_RDMA_READWRITE);
+ op->r_fence = !!(args->flags & RDS_RDMA_FENCE);
+ op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
+ op->r_recverr = rs->rs_recverr;
+ WARN_ON(!nr_pages);
+ sg_init_table(op->r_sg, nr_pages);
+
+ if (op->r_notify || op->r_recverr) {
/* We allocate an uninitialized notifier here, because
* we don't want to do that in the completion handler. We
* would have to use GFP_ATOMIC there, and don't want to deal
* with failed allocations.
*/
- op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
- if (!op->op_notifier) {
+ op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
+ if (!op->r_notifier) {
ret = -ENOMEM;
goto out;
}
- op->op_notifier->n_user_token = args->user_token;
- op->op_notifier->n_status = RDS_RDMA_SUCCESS;
+ op->r_notifier->n_user_token = args->user_token;
+ op->r_notifier->n_status = RDS_RDMA_SUCCESS;
}
/* The cookie contains the R_Key of the remote memory region, and
* destination address (which is really an offset into the MR)
* FIXME: We may want to move this into ib_rdma.c
*/
- op->op_rkey = rds_rdma_cookie_key(args->cookie);
- op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
+ op->r_key = rds_rdma_cookie_key(args->cookie);
+ op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
nr_bytes = 0;
rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
(unsigned long long)args->nr_local,
(unsigned long long)args->remote_vec.addr,
- op->op_rkey);
+ op->r_key);
for (i = 0; i < args->nr_local; i++) {
- struct rds_iovec *iov = &iovs[i];
- /* don't need to check, rds_rdma_pages() verified nr will be +nonzero */
- unsigned int nr = rds_pages_in_vec(iov);
+ if (copy_from_user(&vec, &local_vec[i],
+ sizeof(struct rds_iovec))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ nr = rds_pages_in_vec(&vec);
+ if (nr == 0) {
+ ret = -EINVAL;
+ goto out;
+ }
- rs->rs_user_addr = iov->addr;
- rs->rs_user_bytes = iov->bytes;
+ rs->rs_user_addr = vec.addr;
+ rs->rs_user_bytes = vec.bytes;
+ /* did the user change the vec under us? */
+ if (nr > max_pages || op->r_nents + nr > nr_pages) {
+ ret = -EINVAL;
+ goto out;
+ }
/* If it's a WRITE operation, we want to pin the pages for reading.
* If it's a READ operation, we need to pin the pages for writing.
*/
- ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
+ ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write);
if (ret < 0)
goto out;
- rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
- nr_bytes, nr, iov->bytes, iov->addr);
+ rdsdebug("RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx\n",
+ nr_bytes, nr, vec.bytes, vec.addr);
- nr_bytes += iov->bytes;
+ nr_bytes += vec.bytes;
for (j = 0; j < nr; j++) {
- unsigned int offset = iov->addr & ~PAGE_MASK;
- struct scatterlist *sg;
+ unsigned int offset = vec.addr & ~PAGE_MASK;
- sg = &op->op_sg[op->op_nents + j];
+ sg = &op->r_sg[op->r_nents + j];
sg_set_page(sg, pages[j],
- min_t(unsigned int, iov->bytes, PAGE_SIZE - offset),
+ min_t(unsigned int, vec.bytes, PAGE_SIZE - offset),
offset);
- rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n",
- sg->offset, sg->length, iov->addr, iov->bytes);
+ rdsdebug("RDS: sg->offset %x sg->len %x vec.addr %llx vec.bytes %llu\n",
+ sg->offset, sg->length, vec.addr, vec.bytes);
- iov->addr += sg->length;
- iov->bytes -= sg->length;
+ vec.addr += sg->length;
+ vec.bytes -= sg->length;
}
- op->op_nents += nr;
+ op->r_nents += nr;
}
+
if (nr_bytes > args->remote_vec.bytes) {
rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
nr_bytes,
ret = -EINVAL;
goto out;
}
- op->op_bytes = nr_bytes;
+ op->r_bytes = nr_bytes;
+ ret = 0;
out:
- if (iovs != iovstack)
- sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size);
kfree(pages);
-out_ret:
- if (ret)
- rds_rdma_free_op(op);
- else
- rds_stats_inc(s_send_rdma);
+ if (ret) {
+ if (op)
+ rds_rdma_free_op(op);
+ op = ERR_PTR(ret);
+ }
+ return op;
+}
- return ret;
+/*
+ * The application asks for a RDMA transfer.
+ * Extract all arguments and set up the rdma_op
+ */
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg)
+{
+ struct rds_rdma_op *op;
+
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
+ || rm->m_rdma_op != NULL)
+ return -EINVAL;
+
+ op = rds_rdma_prepare(rs, CMSG_DATA(cmsg));
+ if (IS_ERR(op))
+ return PTR_ERR(op);
+ rds_stats_inc(s_send_rdma);
+ rm->m_rdma_op = op;
+ return 0;
}
/*
u32 r_key;
int err = 0;
- if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) ||
- rm->m_rdma_cookie != 0)
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t))
+ || rm->m_rdma_cookie != 0)
return -EINVAL;
memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie));
spin_lock_irqsave(&rs->rs_rdma_lock, flags);
mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
- if (!mr)
+ if (mr == NULL)
err = -EINVAL; /* invalid r_key */
else
atomic_inc(&mr->r_refcount);
if (mr) {
mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
- rm->rdma.op_rdma_mr = mr;
+ rm->m_rdma_mr = mr;
}
return err;
}
int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
struct cmsghdr *cmsg)
{
- if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) ||
- rm->m_rdma_cookie != 0)
- return -EINVAL;
-
- return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr);
-}
-
-/*
- * Fill in rds_message for an atomic request.
- */
-int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
- struct cmsghdr *cmsg)
-{
- struct page *page = NULL;
- struct rds_atomic_args *args;
- int ret = 0;
-
- if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args))
- || rm->atomic.op_active)
+ if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args))
+ || rm->m_rdma_cookie != 0)
return -EINVAL;
- args = CMSG_DATA(cmsg);
-
- /* Nonmasked & masked cmsg ops converted to masked hw ops */
- switch (cmsg->cmsg_type) {
- case RDS_CMSG_ATOMIC_FADD:
- rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
- rm->atomic.op_m_fadd.add = args->fadd.add;
- rm->atomic.op_m_fadd.nocarry_mask = 0;
- break;
- case RDS_CMSG_MASKED_ATOMIC_FADD:
- rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
- rm->atomic.op_m_fadd.add = args->m_fadd.add;
- rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask;
- break;
- case RDS_CMSG_ATOMIC_CSWP:
- rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
- rm->atomic.op_m_cswp.compare = args->cswp.compare;
- rm->atomic.op_m_cswp.swap = args->cswp.swap;
- rm->atomic.op_m_cswp.compare_mask = ~0;
- rm->atomic.op_m_cswp.swap_mask = ~0;
- break;
- case RDS_CMSG_MASKED_ATOMIC_CSWP:
- rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
- rm->atomic.op_m_cswp.compare = args->m_cswp.compare;
- rm->atomic.op_m_cswp.swap = args->m_cswp.swap;
- rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask;
- rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask;
- break;
- default:
- BUG(); /* should never happen */
- }
-
- rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
- rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT);
- rm->atomic.op_active = 1;
- rm->atomic.op_recverr = rs->rs_recverr;
- rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
- if (!rm->atomic.op_sg) {
- ret = -ENOMEM;
- goto err;
- }
-
- /* verify 8 byte-aligned */
- if (args->local_addr & 0x7) {
- ret = -EFAULT;
- goto err;
- }
-
- ret = rds_pin_pages(args->local_addr, 1, &page, 1);
- if (ret != 1)
- goto err;
- ret = 0;
-
- sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr));
-
- if (rm->atomic.op_notify || rm->atomic.op_recverr) {
- /* We allocate an uninitialized notifier here, because
- * we don't want to do that in the completion handler. We
- * would have to use GFP_ATOMIC there, and don't want to deal
- * with failed allocations.
- */
- rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL);
- if (!rm->atomic.op_notifier) {
- ret = -ENOMEM;
- goto err;
- }
-
- rm->atomic.op_notifier->n_user_token = args->user_token;
- rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS;
- }
-
- rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie);
- rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie);
-
- return ret;
-err:
- if (page)
- put_page(page);
- kfree(rm->atomic.op_notifier);
-
- return ret;
+ return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr);
}
--- /dev/null
+#ifndef _RDS_RDMA_H
+#define _RDS_RDMA_H
+
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/scatterlist.h>
+
+#include "rds.h"
+
+struct rds_mr {
+ struct rb_node r_rb_node;
+ atomic_t r_refcount;
+ u32 r_key;
+
+ /* A copy of the creation flags */
+ unsigned int r_use_once:1;
+ unsigned int r_invalidate:1;
+ unsigned int r_write:1;
+
+ /* This is for RDS_MR_DEAD.
+ * It would be nice & consistent to make this part of the above
+ * bit field here, but we need to use test_and_set_bit.
+ */
+ unsigned long r_state;
+ struct rds_sock *r_sock; /* back pointer to the socket that owns us */
+ struct rds_transport *r_trans;
+ void *r_trans_private;
+};
+
+/* Flags for mr->r_state */
+#define RDS_MR_DEAD 0
+
+struct rds_rdma_op {
+ u32 r_key;
+ u64 r_remote_addr;
+ unsigned int r_write:1;
+ unsigned int r_fence:1;
+ unsigned int r_notify:1;
+ unsigned int r_recverr:1;
+ unsigned int r_mapped:1;
+ struct rds_notifier *r_notifier;
+ unsigned int r_bytes;
+ unsigned int r_nents;
+ unsigned int r_count;
+ struct scatterlist r_sg[0];
+};
+
+static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
+{
+ return r_key | (((u64) offset) << 32);
+}
+
+static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
+{
+ return cookie;
+}
+
+static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
+{
+ return cookie >> 32;
+}
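+/* Example: rds_rdma_make_cookie(0x42, 4096) keeps the R_Key 0x42 in the low
+ * 32 bits and the byte offset 4096 in the high 32 bits, which is what
+ * rds_rdma_cookie_key() and rds_rdma_cookie_offset() hand back. */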
+
+int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
+void rds_rdma_drop_keys(struct rds_sock *rs);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
+ struct cmsghdr *cmsg);
+void rds_rdma_free_op(struct rds_rdma_op *ro);
+void rds_rdma_send_complete(struct rds_message *rm, int);
+
+extern void __rds_put_mr_final(struct rds_mr *mr);
+static inline void rds_mr_put(struct rds_mr *mr)
+{
+ if (atomic_dec_and_test(&mr->r_refcount))
+ __rds_put_mr_final(mr);
+}
+
+#endif
* SOFTWARE.
*
*/
-#include <linux/module.h>
#include <rdma/rdma_cm.h>
#include "rdma_transport.h"
-static struct rdma_cm_id *rds_rdma_listen_id;
-
-static char *rds_cm_event_strings[] = {
-#define RDS_CM_EVENT_STRING(foo) \
- [RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo)
- RDS_CM_EVENT_STRING(ADDR_RESOLVED),
- RDS_CM_EVENT_STRING(ADDR_ERROR),
- RDS_CM_EVENT_STRING(ROUTE_RESOLVED),
- RDS_CM_EVENT_STRING(ROUTE_ERROR),
- RDS_CM_EVENT_STRING(CONNECT_REQUEST),
- RDS_CM_EVENT_STRING(CONNECT_RESPONSE),
- RDS_CM_EVENT_STRING(CONNECT_ERROR),
- RDS_CM_EVENT_STRING(UNREACHABLE),
- RDS_CM_EVENT_STRING(REJECTED),
- RDS_CM_EVENT_STRING(ESTABLISHED),
- RDS_CM_EVENT_STRING(DISCONNECTED),
- RDS_CM_EVENT_STRING(DEVICE_REMOVAL),
- RDS_CM_EVENT_STRING(MULTICAST_JOIN),
- RDS_CM_EVENT_STRING(MULTICAST_ERROR),
- RDS_CM_EVENT_STRING(ADDR_CHANGE),
- RDS_CM_EVENT_STRING(TIMEWAIT_EXIT),
-#undef RDS_CM_EVENT_STRING
-};
-
-static char *rds_cm_event_str(enum rdma_cm_event_type type)
-{
- return rds_str_array(rds_cm_event_strings,
- ARRAY_SIZE(rds_cm_event_strings), type);
-};
+static struct rdma_cm_id *rds_iw_listen_id;
int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event)
struct rds_transport *trans;
int ret = 0;
- rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
- event->event, rds_cm_event_str(event->event));
+ rdsdebug("conn %p id %p handling event %u\n", conn, cm_id,
+ event->event);
if (cm_id->device->node_type == RDMA_NODE_RNIC)
trans = &rds_iw_transport;
break;
case RDMA_CM_EVENT_DISCONNECTED:
- rdsdebug("DISCONNECT event - dropping connection "
+ printk(KERN_WARNING "RDS/IW: DISCONNECT event - dropping connection "
"%pI4->%pI4\n", &conn->c_laddr,
&conn->c_faddr);
rds_conn_drop(conn);
default:
/* things like device disconnect? */
- printk(KERN_ERR "RDS: unknown event %u (%s)!\n",
- event->event, rds_cm_event_str(event->event));
+ printk(KERN_ERR "unknown event %u\n", event->event);
+ BUG();
break;
}
if (conn)
mutex_unlock(&conn->c_cm_lock);
- rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event,
- rds_cm_event_str(event->event), ret);
+ rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret);
return ret;
}
-static int rds_rdma_listen_init(void)
+static int __init rds_rdma_listen_init(void)
{
struct sockaddr_in sin;
struct rdma_cm_id *cm_id;
int ret;
- cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP,
- IB_QPT_RC);
+ cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP);
if (IS_ERR(cm_id)) {
ret = PTR_ERR(cm_id);
- printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
+ printk(KERN_ERR "RDS/IW: failed to setup listener, "
"rdma_create_id() returned %d\n", ret);
- return ret;
+ goto out;
}
- sin.sin_family = AF_INET;
+ sin.sin_family = PF_INET,
sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
sin.sin_port = (__force u16)htons(RDS_PORT);
*/
ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
if (ret) {
- printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
+ printk(KERN_ERR "RDS/IW: failed to setup listener, "
"rdma_bind_addr() returned %d\n", ret);
goto out;
}
ret = rdma_listen(cm_id, 128);
if (ret) {
- printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
+ printk(KERN_ERR "RDS/IW: failed to setup listener, "
"rdma_listen() returned %d\n", ret);
goto out;
}
rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT);
- rds_rdma_listen_id = cm_id;
+ rds_iw_listen_id = cm_id;
cm_id = NULL;
out:
if (cm_id)
static void rds_rdma_listen_stop(void)
{
- if (rds_rdma_listen_id) {
- rdsdebug("cm %p\n", rds_rdma_listen_id);
- rdma_destroy_id(rds_rdma_listen_id);
- rds_rdma_listen_id = NULL;
+ if (rds_iw_listen_id) {
+ rdsdebug("cm %p\n", rds_iw_listen_id);
+ rdma_destroy_id(rds_iw_listen_id);
+ rds_iw_listen_id = NULL;
}
}
-static int rds_rdma_init(void)
+int __init rds_rdma_init(void)
{
int ret;
out:
return ret;
}
-module_init(rds_rdma_init);
-static void rds_rdma_exit(void)
+void rds_rdma_exit(void)
{
/* stop listening first to ensure no new connections are attempted */
rds_rdma_listen_stop();
rds_ib_exit();
rds_iw_exit();
}
-module_exit(rds_rdma_exit);
-
-MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
-MODULE_DESCRIPTION("RDS: IB/iWARP transport");
-MODULE_LICENSE("Dual BSD/GPL");
int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event);
+/* from rdma_transport.c */
+int rds_rdma_init(void);
+void rds_rdma_exit(void);
+
/* from ib.c */
extern struct rds_transport rds_ib_transport;
int rds_ib_init(void);
#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
#else
/* sigh, pr_debug() causes unused variable warnings */
-static inline __printf(1, 2)
-void rdsdebug(char *fmt, ...)
+static inline void __attribute__ ((format (printf, 1, 2)))
+rdsdebug(char *fmt, ...)
{
}
#endif
#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT))
#define RDS_CONG_MAP_BYTES (65536 / 8)
+#define RDS_CONG_MAP_LONGS (RDS_CONG_MAP_BYTES / sizeof(unsigned long))
#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8)
/* Bits for c_flags */
#define RDS_LL_SEND_FULL 0
#define RDS_RECONNECT_PENDING 1
-#define RDS_IN_XMIT 2
struct rds_connection {
struct hlist_node c_hash_node;
struct rds_cong_map *c_lcong;
struct rds_cong_map *c_fcong;
+ struct mutex c_send_lock; /* protect send ring */
struct rds_message *c_xmit_rm;
unsigned long c_xmit_sg;
unsigned int c_xmit_hdr_off;
unsigned int c_xmit_data_off;
- unsigned int c_xmit_atomic_sent;
unsigned int c_xmit_rdma_sent;
- unsigned int c_xmit_data_sent;
spinlock_t c_lock; /* protect msg queues */
u64 c_next_tx_seq;
void *c_transport_data;
atomic_t c_state;
- unsigned long c_send_gen;
unsigned long c_flags;
unsigned long c_reconnect_jiffies;
struct delayed_work c_send_w;
struct delayed_work c_conn_w;
struct work_struct c_down_w;
struct mutex c_cm_lock; /* protect conn state & cm */
- wait_queue_head_t c_waitq;
struct list_head c_map_item;
unsigned long c_map_queued;
+ unsigned long c_map_offset;
+ unsigned long c_map_bytes;
unsigned int c_unacked_packets;
unsigned int c_unacked_bytes;
#define RDS_FLAG_CONG_BITMAP 0x01
#define RDS_FLAG_ACK_REQUIRED 0x02
#define RDS_FLAG_RETRANSMITTED 0x04
-#define RDS_MAX_ADV_CREDIT 255
+#define RDS_MAX_ADV_CREDIT 127
/*
* Maximum space available for extension headers.
rds_rdma_cookie_t i_rdma_cookie;
};
-struct rds_mr {
- struct rb_node r_rb_node;
- atomic_t r_refcount;
- u32 r_key;
-
- /* A copy of the creation flags */
- unsigned int r_use_once:1;
- unsigned int r_invalidate:1;
- unsigned int r_write:1;
-
- /* This is for RDS_MR_DEAD.
- * It would be nice & consistent to make this part of the above
- * bit field here, but we need to use test_and_set_bit.
- */
- unsigned long r_state;
- struct rds_sock *r_sock; /* back pointer to the socket that owns us */
- struct rds_transport *r_trans;
- void *r_trans_private;
-};
-
-/* Flags for mr->r_state */
-#define RDS_MR_DEAD 0
-
-static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
-{
- return r_key | (((u64) offset) << 32);
-}
-
-static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
-{
- return cookie;
-}
-
-static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
-{
- return cookie >> 32;
-}
-
-/* atomic operation types */
-#define RDS_ATOMIC_TYPE_CSWP 0
-#define RDS_ATOMIC_TYPE_FADD 1
-
/*
* m_sock_item and m_conn_item are on lists that are serialized under
* conn->c_lock. m_sock_item has additional meaning in that once it is empty
* -> rs->rs_lock
*/
spinlock_t m_rs_lock;
- wait_queue_head_t m_flush_wait;
-
struct rds_sock *m_rs;
-
- /* cookie to send to remote, in rds header */
+ struct rds_rdma_op *m_rdma_op;
rds_rdma_cookie_t m_rdma_cookie;
-
- unsigned int m_used_sgs;
- unsigned int m_total_sgs;
-
- void *m_final_op;
-
- struct {
- struct rm_atomic_op {
- int op_type;
- union {
- struct {
- uint64_t compare;
- uint64_t swap;
- uint64_t compare_mask;
- uint64_t swap_mask;
- } op_m_cswp;
- struct {
- uint64_t add;
- uint64_t nocarry_mask;
- } op_m_fadd;
- };
-
- u32 op_rkey;
- u64 op_remote_addr;
- unsigned int op_notify:1;
- unsigned int op_recverr:1;
- unsigned int op_mapped:1;
- unsigned int op_silent:1;
- unsigned int op_active:1;
- struct scatterlist *op_sg;
- struct rds_notifier *op_notifier;
-
- struct rds_mr *op_rdma_mr;
- } atomic;
- struct rm_rdma_op {
- u32 op_rkey;
- u64 op_remote_addr;
- unsigned int op_write:1;
- unsigned int op_fence:1;
- unsigned int op_notify:1;
- unsigned int op_recverr:1;
- unsigned int op_mapped:1;
- unsigned int op_silent:1;
- unsigned int op_active:1;
- unsigned int op_bytes;
- unsigned int op_nents;
- unsigned int op_count;
- struct scatterlist *op_sg;
- struct rds_notifier *op_notifier;
-
- struct rds_mr *op_rdma_mr;
- } rdma;
- struct rm_data_op {
- unsigned int op_active:1;
- unsigned int op_nents;
- unsigned int op_count;
- struct scatterlist *op_sg;
- } data;
- };
+ struct rds_mr *m_rdma_mr;
+ unsigned int m_nents;
+ unsigned int m_count;
+ struct scatterlist m_sg[0];
};
/*
* transport is responsible for other serialization, including
* rds_recv_incoming(). This is called in process context but
* should try hard not to block.
+ *
+ * @xmit_cong_map: This asks the transport to send the local bitmap down the
+ * given connection. XXX get a better story about the bitmap
+ * flag and header.
*/
-#define RDS_TRANS_IB 0
-#define RDS_TRANS_IWARP 1
-#define RDS_TRANS_TCP 2
-#define RDS_TRANS_COUNT 3
-
struct rds_transport {
char t_name[TRANSNAMSIZ];
struct list_head t_item;
struct module *t_owner;
unsigned int t_prefer_loopback:1;
- unsigned int t_type;
int (*laddr_check)(__be32 addr);
int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
void (*xmit_complete)(struct rds_connection *conn);
int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off);
- int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
- int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
+ int (*xmit_cong_map)(struct rds_connection *conn,
+ struct rds_cong_map *map, unsigned long offset);
+ int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op);
int (*recv)(struct rds_connection *conn);
- int (*inc_copy_to_user)(struct rds_incoming *inc, struct iov_iter *to);
+ int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
+ size_t size);
+ void (*inc_purge)(struct rds_incoming *inc);
void (*inc_free)(struct rds_incoming *inc);
int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
* bound_addr used for both incoming and outgoing, no INADDR_ANY
* support.
*/
- struct hlist_node rs_bound_node;
+ struct rb_node rs_bound_node;
__be32 rs_bound_addr;
__be32 rs_conn_addr;
__be16 rs_bound_port;
__be16 rs_conn_port;
+
+ /*
+ * This is only used to communicate the transport between bind and
+ * initiating connections. All other transport use goes through
+ * the connection.
+ */
struct rds_transport *rs_transport;
/*
/* flag indicating we were congested or not */
int rs_congested;
- /* seen congestion (ENOBUFS) when sending? */
- int rs_seen_congestion;
/* rs_lock protects all these adjacent members before the newline */
spinlock_t rs_lock;
uint64_t s_recv_ping;
uint64_t s_send_queue_empty;
uint64_t s_send_queue_full;
- uint64_t s_send_lock_contention;
- uint64_t s_send_lock_queue_raced;
+ uint64_t s_send_sem_contention;
+ uint64_t s_send_sem_queue_raced;
uint64_t s_send_immediate_retry;
uint64_t s_send_delayed_retry;
uint64_t s_send_drop_acked;
};
/* af_rds.c */
-char *rds_str_array(char **array, size_t elements, size_t index);
void rds_sock_addref(struct rds_sock *rs);
void rds_sock_put(struct rds_sock *rs);
void rds_wake_sk_sleep(struct rds_sock *rs);
static inline void __rds_wake_sk_sleep(struct sock *sk)
{
- wait_queue_head_t *waitq = sk_sleep(sk);
+ wait_queue_head_t *waitq = sk->sk_sleep;
if (!sock_flag(sk, SOCK_DEAD) && waitq)
wake_up(waitq);
struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
/* conn.c */
-int rds_conn_init(void);
+int __init rds_conn_init(void);
void rds_conn_exit(void);
struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp);
struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp);
-void rds_conn_shutdown(struct rds_connection *conn);
void rds_conn_destroy(struct rds_connection *conn);
+void rds_conn_reset(struct rds_connection *conn);
void rds_conn_drop(struct rds_connection *conn);
-void rds_conn_connect_if_down(struct rds_connection *conn);
void rds_for_each_conn_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens,
int (*visitor)(struct rds_connection *, void *),
size_t item_len);
-__printf(2, 3)
-void __rds_conn_error(struct rds_connection *conn, const char *, ...);
+void __rds_conn_error(struct rds_connection *conn, const char *, ...)
+ __attribute__ ((format (printf, 2, 3)));
#define rds_conn_error(conn, fmt...) \
__rds_conn_error(conn, KERN_WARNING "RDS: " fmt)
/* message.c */
struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
-struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
-int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from);
+struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
+ size_t total_len);
struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
__be16 dport, u64 seq);
unsigned int type, const void *data, unsigned int len);
int rds_message_next_extension(struct rds_header *hdr,
unsigned int *pos, void *buf, unsigned int *buflen);
+int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version);
+int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version);
int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
-int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
+int rds_message_inc_copy_to_user(struct rds_incoming *inc,
+ struct iovec *first_iov, size_t size);
+void rds_message_inc_purge(struct rds_incoming *inc);
void rds_message_inc_free(struct rds_incoming *inc);
void rds_message_addref(struct rds_message *rm);
void rds_message_put(struct rds_message *rm);
/* recv.c */
void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
__be32 saddr);
+void rds_inc_addref(struct rds_incoming *inc);
void rds_inc_put(struct rds_incoming *inc);
void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
- struct rds_incoming *inc, gfp_t gfp);
-int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
- int msg_flags);
+ struct rds_incoming *inc, gfp_t gfp, enum km_type km);
+int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t size, int msg_flags);
void rds_clear_recv_queue(struct rds_sock *rs);
int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg);
void rds_inc_info_copy(struct rds_incoming *inc,
__be32 saddr, __be32 daddr, int flip);
/* send.c */
-int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len);
+int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t payload_len);
void rds_send_reset(struct rds_connection *conn);
int rds_send_xmit(struct rds_connection *conn);
struct sockaddr_in;
typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
is_acked_func is_acked);
+int rds_send_acked_before(struct rds_connection *conn, u64 seq);
+void rds_send_remove_from_sock(struct list_head *messages, int status);
int rds_send_pong(struct rds_connection *conn, __be16 dport);
struct rds_message *rds_send_get_message(struct rds_connection *,
- struct rm_rdma_op *);
+ struct rds_rdma_op *);
/* rdma.c */
void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
-int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
-int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
-int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
-void rds_rdma_drop_keys(struct rds_sock *rs);
-int rds_rdma_extra_size(struct rds_rdma_args *args);
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
- struct cmsghdr *cmsg);
-int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
- struct cmsghdr *cmsg);
-int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
- struct cmsghdr *cmsg);
-int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
- struct cmsghdr *cmsg);
-void rds_rdma_free_op(struct rm_rdma_op *ro);
-void rds_atomic_free_op(struct rm_atomic_op *ao);
-void rds_rdma_send_complete(struct rds_message *rm, int wc_status);
-void rds_atomic_send_complete(struct rds_message *rm, int wc_status);
-int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
- struct cmsghdr *cmsg);
-
-void __rds_put_mr_final(struct rds_mr *mr);
-static inline void rds_mr_put(struct rds_mr *mr)
-{
- if (atomic_dec_and_test(&mr->r_refcount))
- __rds_put_mr_final(mr);
-}
/* stats.c */
-DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
+DECLARE_PER_CPU(struct rds_statistics, rds_stats);
#define rds_stats_inc_which(which, member) do { \
per_cpu(which, get_cpu()).member++; \
put_cpu(); \
put_cpu(); \
} while (0)
#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
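For reference, rds_stats_inc(s_send_queued), used throughout this code, goes through rds_stats_inc_which() and expands to roughly:

	per_cpu(rds_stats, get_cpu()).s_send_queued++;
	put_cpu();

i.e. a per-CPU counter bumped with preemption disabled, no lock required.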
-int rds_stats_init(void);
+int __init rds_stats_init(void);
void rds_stats_exit(void);
void rds_stats_info_copy(struct rds_info_iterator *iter,
- uint64_t *values, const char *const *names,
- size_t nr);
+ uint64_t *values, char **names, size_t nr);
/* sysctl.c */
-int rds_sysctl_init(void);
+int __init rds_sysctl_init(void);
void rds_sysctl_exit(void);
extern unsigned long rds_sysctl_sndbuf_min;
extern unsigned long rds_sysctl_sndbuf_default;
extern unsigned int rds_sysctl_trace_level;
/* threads.c */
-int rds_threads_init(void);
+int __init rds_threads_init(void);
void rds_threads_exit(void);
extern struct workqueue_struct *rds_wq;
-void rds_queue_reconnect(struct rds_connection *conn);
void rds_connect_worker(struct work_struct *);
void rds_shutdown_worker(struct work_struct *);
void rds_send_worker(struct work_struct *);
int rds_trans_register(struct rds_transport *trans);
void rds_trans_unregister(struct rds_transport *trans);
struct rds_transport *rds_trans_get_preferred(__be32 addr);
-void rds_trans_put(struct rds_transport *trans);
unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail);
-int rds_trans_init(void);
+int __init rds_trans_init(void);
void rds_trans_exit(void);
#endif
*
*/
#include <linux/kernel.h>
-#include <linux/slab.h>
#include <net/sock.h>
#include <linux/in.h>
-#include <linux/export.h>
#include "rds.h"
+#include "rdma.h"
void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
__be32 saddr)
inc->i_saddr = saddr;
inc->i_rdma_cookie = 0;
}
-EXPORT_SYMBOL_GPL(rds_inc_init);
-static void rds_inc_addref(struct rds_incoming *inc)
+void rds_inc_addref(struct rds_incoming *inc)
{
rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
atomic_inc(&inc->i_refcount);
inc->i_conn->c_trans->inc_free(inc);
}
}
-EXPORT_SYMBOL_GPL(rds_inc_put);
static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
struct rds_cong_map *map,
* tell us which roles the addrs in the conn are playing for this message.
*/
void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
- struct rds_incoming *inc, gfp_t gfp)
+ struct rds_incoming *inc, gfp_t gfp, enum km_type km)
{
struct rds_sock *rs = NULL;
struct sock *sk;
* XXX we could spend more on the wire to get more robust failure
* detection, arguably worth it to avoid data corruption.
*/
- if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq &&
- (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
+ if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq
+ && (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
rds_stats_inc(s_recv_drop_old_seq);
goto out;
}
}
rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
- if (!rs) {
+ if (rs == NULL) {
rds_stats_inc(s_recv_drop_no_sock);
goto out;
}
if (rs)
rds_sock_put(rs);
}
-EXPORT_SYMBOL_GPL(rds_recv_incoming);
/*
* be very careful here. This is being called as the condition in
{
unsigned long flags;
- if (!*inc) {
+ if (*inc == NULL) {
read_lock_irqsave(&rs->rs_recv_lock, flags);
if (!list_empty(&rs->rs_recv_queue)) {
*inc = list_entry(rs->rs_recv_queue.next,
int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
{
struct rds_notifier *notifier;
- struct rds_rdma_notify cmsg = { 0 }; /* fill holes with zero */
+ struct rds_rdma_notify cmsg;
unsigned int count = 0, max_messages = ~0U;
unsigned long flags;
LIST_HEAD(copy);
if (msghdr) {
cmsg.user_token = notifier->n_user_token;
- cmsg.status = notifier->n_status;
+ cmsg.status = notifier->n_status;
err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
- sizeof(cmsg), &cmsg);
+ sizeof(cmsg), &cmsg);
if (err)
break;
}
return 0;
}
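On the user-space side these notifications come back as RDS_CMSG_RDMA_STATUS ancillary data on recvmsg(). A hedged sketch of reading them (fd is assumed to be a bound RDS socket, error handling trimmed; field names follow the user-visible struct rds_rdma_notify):

	struct rds_rdma_notify note;
	struct msghdr msg = { 0 };
	char cbuf[CMSG_SPACE(sizeof(note))];
	struct cmsghdr *cmsg;

	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	if (recvmsg(fd, &msg, 0) >= 0) {
		for (cmsg = CMSG_FIRSTHDR(&msg); cmsg;
		     cmsg = CMSG_NXTHDR(&msg, cmsg)) {
			if (cmsg->cmsg_level != SOL_RDS ||
			    cmsg->cmsg_type != RDS_CMSG_RDMA_STATUS)
				continue;
			memcpy(&note, CMSG_DATA(cmsg), sizeof(note));
			/* note.user_token names the RDMA op that completed,
			 * note.status carries its RDS_RDMA_* status code */
		}
	}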
-int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
- int msg_flags)
+int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t size, int msg_flags)
{
struct sock *sk = sock->sk;
struct rds_sock *rs = rds_sk_to_rs(sk);
long timeo;
int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
- DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
+ struct sockaddr_in *sin;
struct rds_incoming *inc = NULL;
/* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
if (msg_flags & MSG_OOB)
goto out;
- while (1) {
- struct iov_iter save;
- /* If there are pending notifications, do those - and nothing else */
- if (!list_empty(&rs->rs_notify_queue)) {
- ret = rds_notify_queue_get(rs, msg);
- break;
- }
+ /* If there are pending notifications, do those - and nothing else */
+ if (!list_empty(&rs->rs_notify_queue)) {
+ ret = rds_notify_queue_get(rs, msg);
+ goto out;
+ }
- if (rs->rs_cong_notify) {
- ret = rds_notify_cong(rs, msg);
- break;
- }
+ if (rs->rs_cong_notify) {
+ ret = rds_notify_cong(rs, msg);
+ goto out;
+ }
+ while (1) {
if (!rds_next_incoming(rs, &inc)) {
if (nonblock) {
ret = -EAGAIN;
break;
}
- timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
- (!list_empty(&rs->rs_notify_queue) ||
- rs->rs_cong_notify ||
- rds_next_incoming(rs, &inc)), timeo);
+ timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
+ rds_next_incoming(rs, &inc),
+ timeo);
rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
timeo);
if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
rdsdebug("copying inc %p from %pI4:%u to user\n", inc,
&inc->i_conn->c_faddr,
ntohs(inc->i_hdr.h_sport));
- save = msg->msg_iter;
- ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter);
+ ret = inc->i_conn->c_trans->inc_copy_to_user(inc, msg->msg_iov,
+ size);
if (ret < 0)
break;
rds_inc_put(inc);
inc = NULL;
rds_stats_inc(s_recv_deliver_raced);
- msg->msg_iter = save;
continue;
}
rds_stats_inc(s_recv_delivered);
+ sin = (struct sockaddr_in *)msg->msg_name;
if (sin) {
sin->sin_family = AF_INET;
sin->sin_port = inc->i_hdr.h_sport;
sin->sin_addr.s_addr = inc->i_saddr;
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
- msg->msg_namelen = sizeof(*sin);
}
break;
}
*
*/
#include <linux/kernel.h>
-#include <linux/moduleparam.h>
-#include <linux/gfp.h>
#include <net/sock.h>
#include <linux/in.h>
#include <linux/list.h>
-#include <linux/ratelimit.h>
-#include <linux/export.h>
#include "rds.h"
+#include "rdma.h"
/* When transmitting messages in rds_send_xmit, we need to emerge from
* time to time and briefly release the CPU. Otherwise the softlock watchdog
module_param(send_batch_count, int, 0444);
MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
-static void rds_send_remove_from_sock(struct list_head *messages, int status);
-
/*
- * Reset the send state. Callers must ensure that this doesn't race with
- * rds_send_xmit().
+ * Reset the send state. Caller must hold c_send_lock when calling here.
*/
void rds_send_reset(struct rds_connection *conn)
{
unsigned long flags;
if (conn->c_xmit_rm) {
- rm = conn->c_xmit_rm;
- conn->c_xmit_rm = NULL;
/* Tell the user the RDMA op is no longer mapped by the
* transport. This isn't entirely true (it's flushed out
* independently) but as the connection is down, there's
* no ongoing RDMA to/from that memory */
- rds_message_unmapped(rm);
- rds_message_put(rm);
+ rds_message_unmapped(conn->c_xmit_rm);
+ rds_message_put(conn->c_xmit_rm);
+ conn->c_xmit_rm = NULL;
}
-
conn->c_xmit_sg = 0;
conn->c_xmit_hdr_off = 0;
conn->c_xmit_data_off = 0;
- conn->c_xmit_atomic_sent = 0;
conn->c_xmit_rdma_sent = 0;
- conn->c_xmit_data_sent = 0;
conn->c_map_queued = 0;
spin_unlock_irqrestore(&conn->c_lock, flags);
}
-static int acquire_in_xmit(struct rds_connection *conn)
-{
- return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0;
-}
-
-static void release_in_xmit(struct rds_connection *conn)
-{
- clear_bit(RDS_IN_XMIT, &conn->c_flags);
- smp_mb__after_atomic();
- /*
- * We don't use wait_on_bit()/wake_up_bit() because our waking is in a
- * hot path and finding waiters is very rare. We don't want to walk
- * the system-wide hashed waitqueue buckets in the fast path only to
- * almost never find waiters.
- */
- if (waitqueue_active(&conn->c_waitq))
- wake_up_all(&conn->c_waitq);
-}
-
/*
- * We're making the conscious trade-off here to only send one message
+ * We're making the conscious trade-off here to only send one message
* down the connection at a time.
* Pro:
* - tx queueing is a simple fifo list
struct rds_message *rm;
unsigned long flags;
unsigned int tmp;
+ unsigned int send_quota = send_batch_count;
struct scatterlist *sg;
int ret = 0;
+ int was_empty = 0;
LIST_HEAD(to_be_dropped);
- int batch_count;
- unsigned long send_gen = 0;
-
-restart:
- batch_count = 0;
/*
* sendmsg calls here after having queued its message on the send
* another thread is already feeding the queue then we back off. This
* avoids blocking the caller and trading per-connection data between
* caches per message.
- */
- if (!acquire_in_xmit(conn)) {
- rds_stats_inc(s_send_lock_contention);
- ret = -ENOMEM;
- goto out;
- }
-
- /*
- * we record the send generation after doing the xmit acquire.
- * if someone else manages to jump in and do some work, we'll use
- * this to avoid a goto restart farther down.
*
- * The acquire_in_xmit() check above ensures that only one
- * caller can increment c_send_gen at any time.
+ * The sem holder will issue a retry if they notice that someone queued
+ * a message after they stopped walking the send queue but before they
+ * dropped the sem.
*/
- conn->c_send_gen++;
- send_gen = conn->c_send_gen;
-
- /*
- * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT,
- * we do the opposite to avoid races.
- */
- if (!rds_conn_up(conn)) {
- release_in_xmit(conn);
- ret = 0;
+ if (!mutex_trylock(&conn->c_send_lock)) {
+ rds_stats_inc(s_send_sem_contention);
+ ret = -ENOMEM;
goto out;
}
/*
* spin trying to push headers and data down the connection until
- * the connection doesn't make forward progress.
+ * the connection doesn't make forward progress.
*/
- while (1) {
+ while (--send_quota) {
+ /*
+ * See if we need to send a congestion map update if we're
+ * between sending messages. The send_sem protects our sole
+ * use of c_map_offset and _bytes.
+ * Note this is used only by transports that define a special
+ * xmit_cong_map function. For all others, we allocate
+ * a cong_map message and treat it just like any other send.
+ */
+ if (conn->c_map_bytes) {
+ ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
+ conn->c_map_offset);
+ if (ret <= 0)
+ break;
+
+ conn->c_map_offset += ret;
+ conn->c_map_bytes -= ret;
+ if (conn->c_map_bytes)
+ continue;
+ }
+ /* If we're done sending the current message, clear the
+ * offset and S/G temporaries.
+ */
rm = conn->c_xmit_rm;
+ if (rm != NULL &&
+ conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
+ conn->c_xmit_sg == rm->m_nents) {
+ conn->c_xmit_rm = NULL;
+ conn->c_xmit_sg = 0;
+ conn->c_xmit_hdr_off = 0;
+ conn->c_xmit_data_off = 0;
+ conn->c_xmit_rdma_sent = 0;
- /*
- * If between sending messages, we can send a pending congestion
- * map update.
+ /* Release the reference to the previous message. */
+ rds_message_put(rm);
+ rm = NULL;
+ }
+
+ /* If we're asked to send a cong map update, do so.
*/
- if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) {
+ if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) {
+ if (conn->c_trans->xmit_cong_map != NULL) {
+ conn->c_map_offset = 0;
+ conn->c_map_bytes = sizeof(struct rds_header) +
+ RDS_CONG_MAP_BYTES;
+ continue;
+ }
+
rm = rds_cong_update_alloc(conn);
if (IS_ERR(rm)) {
ret = PTR_ERR(rm);
break;
}
- rm->data.op_active = 1;
conn->c_xmit_rm = rm;
}
/*
- * If not already working on one, grab the next message.
+ * Grab the next message from the send queue, if there is one.
*
* c_xmit_rm holds a ref while we're sending this message down
* the connection. We can use this ref while holding the
* send_sem.. rds_send_reset() is serialized with it.
*/
- if (!rm) {
+ if (rm == NULL) {
unsigned int len;
- batch_count++;
-
- /* we want to process as big a batch as we can, but
- * we also want to avoid softlockups. If we've been
- * through a lot of messages, lets back off and see
- * if anyone else jumps in
- */
- if (batch_count >= 1024)
- goto over_batch;
-
spin_lock_irqsave(&conn->c_lock, flags);
if (!list_empty(&conn->c_send_queue)) {
spin_unlock_irqrestore(&conn->c_lock, flags);
- if (!rm)
+ if (rm == NULL) {
+ was_empty = 1;
break;
+ }
/* Unfortunately, the way Infiniband deals with
* RDMA to a bad MR key is by moving the entire
* connection.
* Therefore, we never retransmit messages with RDMA ops.
*/
- if (rm->rdma.op_active &&
- test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
+ if (rm->m_rdma_op
+ && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
spin_lock_irqsave(&conn->c_lock, flags);
if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
list_move(&rm->m_conn_item, &to_be_dropped);
spin_unlock_irqrestore(&conn->c_lock, flags);
+ rds_message_put(rm);
continue;
}
/* Require an ACK every once in a while */
len = ntohl(rm->m_inc.i_hdr.h_len);
- if (conn->c_unacked_packets == 0 ||
- conn->c_unacked_bytes < len) {
+ if (conn->c_unacked_packets == 0
+ || conn->c_unacked_bytes < len) {
__set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
conn->c_xmit_rm = rm;
}
- /* The transport either sends the whole rdma or none of it */
- if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) {
- rm->m_final_op = &rm->rdma;
- ret = conn->c_trans->xmit_rdma(conn, &rm->rdma);
+ /*
+ * Try and send an rdma message. Let's see if we can
+ * keep this simple and require that the transport either
+ * send the whole rdma or none of it.
+ */
+ if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) {
+ ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op);
if (ret)
break;
conn->c_xmit_rdma_sent = 1;
-
- /* The transport owns the mapped memory for now.
- * You can't unmap it while it's on the send queue */
- set_bit(RDS_MSG_MAPPED, &rm->m_flags);
- }
-
- if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) {
- rm->m_final_op = &rm->atomic;
- ret = conn->c_trans->xmit_atomic(conn, &rm->atomic);
- if (ret)
- break;
- conn->c_xmit_atomic_sent = 1;
-
/* The transport owns the mapped memory for now.
* You can't unmap it while it's on the send queue */
set_bit(RDS_MSG_MAPPED, &rm->m_flags);
}
- /*
- * A number of cases require an RDS header to be sent
- * even if there is no data.
- * We permit 0-byte sends; rds-ping depends on this.
- * However, if there are exclusively attached silent ops,
- * we skip the hdr/data send, to enable silent operation.
- */
- if (rm->data.op_nents == 0) {
- int ops_present;
- int all_ops_are_silent = 1;
-
- ops_present = (rm->atomic.op_active || rm->rdma.op_active);
- if (rm->atomic.op_active && !rm->atomic.op_silent)
- all_ops_are_silent = 0;
- if (rm->rdma.op_active && !rm->rdma.op_silent)
- all_ops_are_silent = 0;
-
- if (ops_present && all_ops_are_silent
- && !rm->m_rdma_cookie)
- rm->data.op_active = 0;
- }
-
- if (rm->data.op_active && !conn->c_xmit_data_sent) {
- rm->m_final_op = &rm->data;
+ if (conn->c_xmit_hdr_off < sizeof(struct rds_header) ||
+ conn->c_xmit_sg < rm->m_nents) {
ret = conn->c_trans->xmit(conn, rm,
conn->c_xmit_hdr_off,
conn->c_xmit_sg,
ret -= tmp;
}
- sg = &rm->data.op_sg[conn->c_xmit_sg];
+ sg = &rm->m_sg[conn->c_xmit_sg];
while (ret) {
tmp = min_t(int, ret, sg->length -
conn->c_xmit_data_off);
sg++;
conn->c_xmit_sg++;
BUG_ON(ret != 0 &&
- conn->c_xmit_sg == rm->data.op_nents);
+ conn->c_xmit_sg == rm->m_nents);
}
}
-
- if (conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
- (conn->c_xmit_sg == rm->data.op_nents))
- conn->c_xmit_data_sent = 1;
- }
-
- /*
- * A rm will only take multiple times through this loop
- * if there is a data op. Thus, if the data is sent (or there was
- * none), then we're done with the rm.
- */
- if (!rm->data.op_active || conn->c_xmit_data_sent) {
- conn->c_xmit_rm = NULL;
- conn->c_xmit_sg = 0;
- conn->c_xmit_hdr_off = 0;
- conn->c_xmit_data_off = 0;
- conn->c_xmit_rdma_sent = 0;
- conn->c_xmit_atomic_sent = 0;
- conn->c_xmit_data_sent = 0;
-
- rds_message_put(rm);
}
}
-over_batch:
- if (conn->c_trans->xmit_complete)
- conn->c_trans->xmit_complete(conn);
- release_in_xmit(conn);
-
/* Nuke any messages we decided not to retransmit. */
- if (!list_empty(&to_be_dropped)) {
- /* irqs on here, so we can put(), unlike above */
- list_for_each_entry(rm, &to_be_dropped, m_conn_item)
- rds_message_put(rm);
+ if (!list_empty(&to_be_dropped))
rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
- }
+
+ if (conn->c_trans->xmit_complete)
+ conn->c_trans->xmit_complete(conn);
/*
- * Other senders can queue a message after we last test the send queue
- * but before we clear RDS_IN_XMIT. In that case they'd back off and
- * not try and send their newly queued message. We need to check the
- * send queue after having cleared RDS_IN_XMIT so that their message
- * doesn't get stuck on the send queue.
+ * We might be racing with another sender who queued a message but
+ * backed off on noticing that we held the c_send_lock. If we check
+ * for queued messages after dropping the sem then either we'll
+ * see the queued message or the queuer will get the sem. If we
+ * notice the queued message then we trigger an immediate retry.
*
- * If the transport cannot continue (i.e ret != 0), then it must
- * call us when more room is available, such as from the tx
- * completion handler.
- *
- * We have an extra generation check here so that if someone manages
- * to jump in after our release_in_xmit, we'll see that they have done
- * some work and we will skip our goto
+ * We need to be careful only to do this when we stopped processing
+ * the send queue because it was empty. It's the only way we
+ * stop processing the loop when the transport hasn't taken
+ * responsibility for forward progress.
*/
- if (ret == 0) {
- smp_mb();
- if (!list_empty(&conn->c_send_queue) &&
- send_gen == conn->c_send_gen) {
- rds_stats_inc(s_send_lock_queue_raced);
- goto restart;
+ mutex_unlock(&conn->c_send_lock);
+
+ if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) {
+ /* We exhausted the send quota, but there's work left to
+ * do. Return and (re-)schedule the send worker.
+ */
+ ret = -EAGAIN;
+ }
+
+ if (ret == 0 && was_empty) {
+ /* A simple bit test would be way faster than taking the
+ * spin lock */
+ spin_lock_irqsave(&conn->c_lock, flags);
+ if (!list_empty(&conn->c_send_queue)) {
+ rds_stats_inc(s_send_sem_queue_raced);
+ ret = -EAGAIN;
}
+ spin_unlock_irqrestore(&conn->c_lock, flags);
}
out:
return ret;
}
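Stripped of the RDS specifics, the contention and raced-queue handling above follows a "try to become the lone sender, drain, drop the lock, re-check" shape. A reduced sketch of that shape (struct xmit_ctx, send_next_message() and queue_is_empty() are placeholders, not RDS symbols):

static int drain_once(struct xmit_ctx *ctx)
{
	int ret = 0;

	if (!mutex_trylock(&ctx->send_lock))
		return -ENOMEM;		/* another sender is already draining */

	while (send_next_message(ctx) > 0)
		;			/* push until the transport backs off */

	mutex_unlock(&ctx->send_lock);

	/* someone may have queued after we stopped walking the queue but
	 * before we dropped the lock; tell the caller to retry */
	if (!queue_is_empty(ctx))
		ret = -EAGAIN;

	return ret;
}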
/*
- * This is pretty similar to what happens below in the ACK
- * handling code - except that we call here as soon as we get
- * the IB send completion on the RDMA op and the accompanying
- * message.
+ * Returns true if there are no messages left on the send and retransmit
+ * queues with a sequence number smaller than the given one, i.e. everything
+ * before it has been acked.
*/
-void rds_rdma_send_complete(struct rds_message *rm, int status)
+int rds_send_acked_before(struct rds_connection *conn, u64 seq)
{
- struct rds_sock *rs = NULL;
- struct rm_rdma_op *ro;
- struct rds_notifier *notifier;
- unsigned long flags;
+ struct rds_message *rm, *tmp;
+ int ret = 1;
- spin_lock_irqsave(&rm->m_rs_lock, flags);
+ spin_lock(&conn->c_lock);
- ro = &rm->rdma;
- if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) &&
- ro->op_active && ro->op_notify && ro->op_notifier) {
- notifier = ro->op_notifier;
- rs = rm->m_rs;
- sock_hold(rds_rs_to_sk(rs));
-
- notifier->n_status = status;
- spin_lock(&rs->rs_lock);
- list_add_tail(¬ifier->n_list, &rs->rs_notify_queue);
- spin_unlock(&rs->rs_lock);
+ list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
+ if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
+ ret = 0;
+ break;
+ }
- ro->op_notifier = NULL;
+ list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
+ if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
+ ret = 0;
+ break;
}
- spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+ spin_unlock(&conn->c_lock);
- if (rs) {
- rds_wake_sk_sleep(rs);
- sock_put(rds_rs_to_sk(rs));
- }
+ return ret;
}
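A transport's ACK path can use this helper to tell whether anything older than a given sequence number is still queued. A hedged usage sketch (ack_seq is assumed to come from the peer's ACK processing):

static void example_handle_peer_ack(struct rds_connection *conn, u64 ack_seq)
{
	/* nothing with a smaller sequence number remains queued, so any
	 * per-message state older than ack_seq can be retired */
	if (rds_send_acked_before(conn, ack_seq))
		rdsdebug("all messages before %llu acked\n",
			 (unsigned long long) ack_seq);
}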
-EXPORT_SYMBOL_GPL(rds_rdma_send_complete);
/*
- * Just like above, except looks at atomic op
+ * This is pretty similar to what happens below in the ACK
+ * handling code - except that we call here as soon as we get
+ * the IB send completion on the RDMA op and the accompanying
+ * message.
*/
-void rds_atomic_send_complete(struct rds_message *rm, int status)
+void rds_rdma_send_complete(struct rds_message *rm, int status)
{
struct rds_sock *rs = NULL;
- struct rm_atomic_op *ao;
+ struct rds_rdma_op *ro;
struct rds_notifier *notifier;
- unsigned long flags;
- spin_lock_irqsave(&rm->m_rs_lock, flags);
+ spin_lock(&rm->m_rs_lock);
- ao = &rm->atomic;
+ ro = rm->m_rdma_op;
if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
- && ao->op_active && ao->op_notify && ao->op_notifier) {
- notifier = ao->op_notifier;
+ && ro && ro->r_notify && ro->r_notifier) {
+ notifier = ro->r_notifier;
rs = rm->m_rs;
sock_hold(rds_rs_to_sk(rs));
list_add_tail(¬ifier->n_list, &rs->rs_notify_queue);
spin_unlock(&rs->rs_lock);
- ao->op_notifier = NULL;
+ ro->r_notifier = NULL;
}
- spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+ spin_unlock(&rm->m_rs_lock);
if (rs) {
rds_wake_sk_sleep(rs);
sock_put(rds_rs_to_sk(rs));
}
}
-EXPORT_SYMBOL_GPL(rds_atomic_send_complete);
/*
* This is the same as rds_rdma_send_complete except we
* socket, socket lock) and can just move the notifier.
*/
static inline void
-__rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
+__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
{
- struct rm_rdma_op *ro;
- struct rm_atomic_op *ao;
-
- ro = &rm->rdma;
- if (ro->op_active && ro->op_notify && ro->op_notifier) {
- ro->op_notifier->n_status = status;
- list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue);
- ro->op_notifier = NULL;
- }
+ struct rds_rdma_op *ro;
- ao = &rm->atomic;
- if (ao->op_active && ao->op_notify && ao->op_notifier) {
- ao->op_notifier->n_status = status;
- list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue);
- ao->op_notifier = NULL;
+ ro = rm->m_rdma_op;
+ if (ro && ro->r_notify && ro->r_notifier) {
+ ro->r_notifier->n_status = status;
+ list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue);
+ ro->r_notifier = NULL;
}
/* No need to wake the app - caller does this */
* So speed is not an issue here.
*/
struct rds_message *rds_send_get_message(struct rds_connection *conn,
- struct rm_rdma_op *op)
+ struct rds_rdma_op *op)
{
struct rds_message *rm, *tmp, *found = NULL;
unsigned long flags;
spin_lock_irqsave(&conn->c_lock, flags);
list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
- if (&rm->rdma == op) {
+ if (rm->m_rdma_op == op) {
atomic_inc(&rm->m_refcount);
found = rm;
goto out;
}
list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
- if (&rm->rdma == op) {
+ if (rm->m_rdma_op == op) {
atomic_inc(&rm->m_refcount);
found = rm;
break;
return found;
}
-EXPORT_SYMBOL_GPL(rds_send_get_message);
/*
* This removes messages from the socket's list if they're on it. The list
* removing the messages from the 'messages' list regardless of if it found
* the messages on the socket list or not.
*/
-static void rds_send_remove_from_sock(struct list_head *messages, int status)
+void rds_send_remove_from_sock(struct list_head *messages, int status)
{
- unsigned long flags;
+ unsigned long flags = 0; /* silence gcc :P */
struct rds_sock *rs = NULL;
struct rds_message *rm;
+ local_irq_save(flags);
while (!list_empty(messages)) {
- int was_on_sock = 0;
-
rm = list_entry(messages->next, struct rds_message,
m_conn_item);
list_del_init(&rm->m_conn_item);
* while we're messing with it. It does not prevent the
* message from being removed from the socket, though.
*/
- spin_lock_irqsave(&rm->m_rs_lock, flags);
+ spin_lock(&rm->m_rs_lock);
if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags))
goto unlock_and_drop;
if (rs != rm->m_rs) {
if (rs) {
+ spin_unlock(&rs->rs_lock);
rds_wake_sk_sleep(rs);
sock_put(rds_rs_to_sk(rs));
}
rs = rm->m_rs;
- if (rs)
- sock_hold(rds_rs_to_sk(rs));
+ spin_lock(&rs->rs_lock);
+ sock_hold(rds_rs_to_sk(rs));
}
- if (!rs)
- goto unlock_and_drop;
- spin_lock(&rs->rs_lock);
if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
- struct rm_rdma_op *ro = &rm->rdma;
+ struct rds_rdma_op *ro = rm->m_rdma_op;
struct rds_notifier *notifier;
list_del_init(&rm->m_sock_item);
rds_send_sndbuf_remove(rs, rm);
- if (ro->op_active && ro->op_notifier &&
- (ro->op_notify || (ro->op_recverr && status))) {
- notifier = ro->op_notifier;
+ if (ro && ro->r_notifier
+ && (status || ro->r_notify)) {
+ notifier = ro->r_notifier;
list_add_tail(¬ifier->n_list,
&rs->rs_notify_queue);
if (!notifier->n_status)
notifier->n_status = status;
- rm->rdma.op_notifier = NULL;
+ rm->m_rdma_op->r_notifier = NULL;
}
- was_on_sock = 1;
+ rds_message_put(rm);
rm->m_rs = NULL;
}
- spin_unlock(&rs->rs_lock);
unlock_and_drop:
- spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+ spin_unlock(&rm->m_rs_lock);
rds_message_put(rm);
- if (was_on_sock)
- rds_message_put(rm);
}
if (rs) {
+ spin_unlock(&rs->rs_lock);
rds_wake_sk_sleep(rs);
sock_put(rds_rs_to_sk(rs));
}
+ local_irq_restore(flags);
}
/*
* queue. This means that in the TCP case, the message may not have been
* assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
* checks the RDS_MSG_HAS_ACK_SEQ bit.
+ *
+ * XXX It's not clear to me how this is safely serialized with socket
+ * destruction. Maybe it should bail if it sees SOCK_DEAD.
*/
void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
is_acked_func is_acked)
/* order flag updates with spin locks */
if (!list_empty(&list))
- smp_mb__after_atomic();
+ smp_mb__after_clear_bit();
spin_unlock_irqrestore(&conn->c_lock, flags);
/* now remove the messages from the sock list as needed */
rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
}
-EXPORT_SYMBOL_GPL(rds_send_drop_acked);
void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
{
struct rds_message *rm, *tmp;
struct rds_connection *conn;
- unsigned long flags;
+ unsigned long flags, flags2;
LIST_HEAD(list);
+ int wake = 0;
/* get all the messages we're dropping under the rs lock */
spin_lock_irqsave(&rs->rs_lock, flags);
dest->sin_port != rm->m_inc.i_hdr.h_dport))
continue;
+ wake = 1;
list_move(&rm->m_sock_item, &list);
rds_send_sndbuf_remove(rs, rm);
clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
+
+ /* If this is a RDMA operation, notify the app. */
+ __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
}
/* order flag updates with the rs lock */
- smp_mb__after_atomic();
+ if (wake)
+ smp_mb__after_clear_bit();
spin_unlock_irqrestore(&rs->rs_lock, flags);
- if (list_empty(&list))
- return;
+ if (wake)
+ rds_wake_sk_sleep(rs);
+
+ conn = NULL;
- /* Remove the messages from the conn */
+ /* now remove the messages from the conn list as needed */
list_for_each_entry(rm, &list, m_sock_item) {
+ /* We do this here rather than in the loop above, so that
+ * we don't have to nest m_rs_lock under rs->rs_lock */
+ spin_lock_irqsave(&rm->m_rs_lock, flags2);
+ rm->m_rs = NULL;
+ spin_unlock_irqrestore(&rm->m_rs_lock, flags2);
- conn = rm->m_inc.i_conn;
-
- spin_lock_irqsave(&conn->c_lock, flags);
/*
- * Maybe someone else beat us to removing rm from the conn.
- * If we race with their flag update we'll get the lock and
- * then really see that the flag has been cleared.
+ * If we see this flag cleared then we're *sure* that someone
+ * else beat us to removing it from the conn. If we race
+ * with their flag update we'll get the lock and then really
+ * see that the flag has been cleared.
*/
- if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
- spin_unlock_irqrestore(&conn->c_lock, flags);
- spin_lock_irqsave(&rm->m_rs_lock, flags);
- rm->m_rs = NULL;
- spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+ if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags))
continue;
- }
- list_del_init(&rm->m_conn_item);
- spin_unlock_irqrestore(&conn->c_lock, flags);
- /*
- * Couldn't grab m_rs_lock in top loop (lock ordering),
- * but we can now.
- */
- spin_lock_irqsave(&rm->m_rs_lock, flags);
-
- spin_lock(&rs->rs_lock);
- __rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
- spin_unlock(&rs->rs_lock);
-
- rm->m_rs = NULL;
- spin_unlock_irqrestore(&rm->m_rs_lock, flags);
+ if (conn != rm->m_inc.i_conn) {
+ if (conn)
+ spin_unlock_irqrestore(&conn->c_lock, flags);
+ conn = rm->m_inc.i_conn;
+ spin_lock_irqsave(&conn->c_lock, flags);
+ }
- rds_message_put(rm);
+ if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
+ list_del_init(&rm->m_conn_item);
+ rds_message_put(rm);
+ }
}
- rds_wake_sk_sleep(rs);
+ if (conn)
+ spin_unlock_irqrestore(&conn->c_lock, flags);
while (!list_empty(&list)) {
rm = list_entry(list.next, struct rds_message, m_sock_item);
return *queued;
}
-/*
- * rds_message is getting to be quite complicated, and we'd like to allocate
- * it all in one go. This figures out how big it needs to be up front.
- */
-static int rds_rm_size(struct msghdr *msg, int data_len)
-{
- struct cmsghdr *cmsg;
- int size = 0;
- int cmsg_groups = 0;
- int retval;
-
- for_each_cmsghdr(cmsg, msg) {
- if (!CMSG_OK(msg, cmsg))
- return -EINVAL;
-
- if (cmsg->cmsg_level != SOL_RDS)
- continue;
-
- switch (cmsg->cmsg_type) {
- case RDS_CMSG_RDMA_ARGS:
- cmsg_groups |= 1;
- retval = rds_rdma_extra_size(CMSG_DATA(cmsg));
- if (retval < 0)
- return retval;
- size += retval;
-
- break;
-
- case RDS_CMSG_RDMA_DEST:
- case RDS_CMSG_RDMA_MAP:
- cmsg_groups |= 2;
- /* these are valid but do no add any size */
- break;
-
- case RDS_CMSG_ATOMIC_CSWP:
- case RDS_CMSG_ATOMIC_FADD:
- case RDS_CMSG_MASKED_ATOMIC_CSWP:
- case RDS_CMSG_MASKED_ATOMIC_FADD:
- cmsg_groups |= 1;
- size += sizeof(struct scatterlist);
- break;
-
- default:
- return -EINVAL;
- }
-
- }
-
- size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
-
- /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
- if (cmsg_groups == 3)
- return -EINVAL;
-
- return size;
-}
-
static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
struct msghdr *msg, int *allocated_mr)
{
struct cmsghdr *cmsg;
int ret = 0;
- for_each_cmsghdr(cmsg, msg) {
+ for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
if (!CMSG_OK(msg, cmsg))
return -EINVAL;
continue;
/* As a side effect, RDMA_DEST and RDMA_MAP will set
- * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr.
+ * rm->m_rdma_cookie and rm->m_rdma_mr.
*/
switch (cmsg->cmsg_type) {
case RDS_CMSG_RDMA_ARGS:
if (!ret)
*allocated_mr = 1;
break;
- case RDS_CMSG_ATOMIC_CSWP:
- case RDS_CMSG_ATOMIC_FADD:
- case RDS_CMSG_MASKED_ATOMIC_CSWP:
- case RDS_CMSG_MASKED_ATOMIC_FADD:
- ret = rds_cmsg_atomic(rs, rm, cmsg);
- break;
default:
return -EINVAL;
return ret;
}
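From user space, the control messages parsed above ride on sendmsg(). A hedged example of queueing an RDMA transfer with RDS_CMSG_RDMA_ARGS, using the user-visible struct rds_rdma_args; fd, dest, cookie, raddr, buf, len and my_token are assumptions of this sketch:

	struct rds_rdma_args args = { 0 };
	struct rds_iovec local = {
		.addr  = (uint64_t)(unsigned long) buf,
		.bytes = len,
	};
	struct msghdr msg = { 0 };
	char cbuf[CMSG_SPACE(sizeof(args))];
	struct cmsghdr *cmsg;

	args.cookie = cookie;		/* from an earlier MR registration */
	args.remote_vec.addr = raddr;
	args.remote_vec.bytes = len;
	args.local_vec_addr = (uint64_t)(unsigned long) &local;
	args.nr_local = 1;
	args.user_token = my_token;	/* echoed back in RDS_CMSG_RDMA_STATUS */

	msg.msg_name = &dest;		/* struct sockaddr_in of the peer */
	msg.msg_namelen = sizeof(dest);
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_RDS;
	cmsg->cmsg_type = RDS_CMSG_RDMA_ARGS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(args));
	memcpy(CMSG_DATA(cmsg), &args, sizeof(args));

	sendmsg(fd, &msg, 0);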
-int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
+int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t payload_len)
{
struct sock *sk = sock->sk;
struct rds_sock *rs = rds_sk_to_rs(sk);
- DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
+ struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
__be32 daddr;
__be16 dport;
struct rds_message *rm = NULL;
int ret = 0;
int queued = 0, allocated_mr = 0;
int nonblock = msg->msg_flags & MSG_DONTWAIT;
- long timeo = sock_sndtimeo(sk, nonblock);
+ long timeo = sock_rcvtimeo(sk, nonblock);
/* Mirror Linux UDP mirror of BSD error message compatibility */
/* XXX: Perhaps MSG_MORE someday */
if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) {
+ printk(KERN_INFO "msg_flags 0x%08X\n", msg->msg_flags);
ret = -EOPNOTSUPP;
goto out;
}
goto out;
}
- /* size of rm including all sgs */
- ret = rds_rm_size(msg, payload_len);
- if (ret < 0)
+ rm = rds_message_copy_from_user(msg->msg_iov, payload_len);
+ if (IS_ERR(rm)) {
+ ret = PTR_ERR(rm);
+ rm = NULL;
goto out;
-
- rm = rds_message_alloc(ret, GFP_KERNEL);
- if (!rm) {
- ret = -ENOMEM;
- goto out;
- }
-
- /* Attach data to the rm */
- if (payload_len) {
- rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
- if (!rm->data.op_sg) {
- ret = -ENOMEM;
- goto out;
- }
- ret = rds_message_copy_from_user(rm, &msg->msg_iter);
- if (ret)
- goto out;
}
- rm->data.op_active = 1;
rm->m_daddr = daddr;
+ /* Parse any control messages the user may have included. */
+ ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
+ if (ret)
+ goto out;
+
/* rds_conn_create has a spinlock that runs with IRQ off.
* Caching the conn in the socket helps a lot. */
if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
rs->rs_conn = conn;
}
- /* Parse any control messages the user may have included. */
- ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
- if (ret)
- goto out;
-
- if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) {
- printk_ratelimited(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
- &rm->rdma, conn->c_trans->xmit_rdma);
- ret = -EOPNOTSUPP;
- goto out;
- }
-
- if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) {
- printk_ratelimited(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n",
- &rm->atomic, conn->c_trans->xmit_atomic);
+ if ((rm->m_rdma_cookie || rm->m_rdma_op)
+ && conn->c_trans->xmit_rdma == NULL) {
+ if (printk_ratelimit())
+ printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
+ rm->m_rdma_op, conn->c_trans->xmit_rdma);
ret = -EOPNOTSUPP;
goto out;
}
- rds_conn_connect_if_down(conn);
+ /* If the connection is down, trigger a connect. We may
+ * have scheduled a delayed reconnect however - in this case
+ * we should not interfere.
+ */
+ if (rds_conn_state(conn) == RDS_CONN_DOWN
+ && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
+ queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
- if (ret) {
- rs->rs_seen_congestion = 1;
+ if (ret)
goto out;
- }
while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
dport, &queued)) {
goto out;
}
- timeo = wait_event_interruptible_timeout(*sk_sleep(sk),
+ timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
rds_send_queue_rm(rs, conn, rm,
rs->rs_bound_port,
dport,
rds_stats_inc(s_send_queued);
if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
- rds_send_xmit(conn);
+ rds_send_worker(&conn->c_send_w.work);
rds_message_put(rm);
return payload_len;
int ret = 0;
rm = rds_message_alloc(0, GFP_ATOMIC);
- if (!rm) {
+ if (rm == NULL) {
ret = -ENOMEM;
goto out;
}
rm->m_daddr = conn->c_faddr;
- rm->data.op_active = 1;
- rds_conn_connect_if_down(conn);
+ /* If the connection is down, trigger a connect. We may
+ * have scheduled a delayed reconnect however - in this case
+ * we should not interfere.
+ */
+ if (rds_conn_state(conn) == RDS_CONN_DOWN
+ && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
+ queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
if (ret)
rds_stats_inc(s_send_queued);
rds_stats_inc(s_send_pong);
- if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
-
+ queue_delayed_work(rds_wq, &conn->c_send_w, 0);
rds_message_put(rm);
return 0;
#include <linux/percpu.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
-#include <linux/export.h>
#include "rds.h"
DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
-EXPORT_PER_CPU_SYMBOL_GPL(rds_stats);
/* :.,$s/unsigned long\>.*\<s_\(.*\);/"\1",/g */
-static const char *const rds_stat_names[] = {
+static char *rds_stat_names[] = {
"conn_reset",
"recv_drop_bad_checksum",
"recv_drop_old_seq",
"recv_ping",
"send_queue_empty",
"send_queue_full",
- "send_lock_contention",
- "send_lock_queue_raced",
+ "send_sem_contention",
+ "send_sem_queue_raced",
"send_immediate_retry",
"send_delayed_retry",
"send_drop_acked",
};
void rds_stats_info_copy(struct rds_info_iterator *iter,
- uint64_t *values, const char *const *names, size_t nr)
+ uint64_t *values, char **names, size_t nr)
{
struct rds_info_counter ctr;
size_t i;
for (i = 0; i < nr; i++) {
BUG_ON(strlen(names[i]) >= sizeof(ctr.name));
strncpy(ctr.name, names[i], sizeof(ctr.name) - 1);
- ctr.name[sizeof(ctr.name) - 1] = '\0';
ctr.value = values[i];
rds_info_copy(iter, &ctr, sizeof(ctr));
}
}
-EXPORT_SYMBOL_GPL(rds_stats_info_copy);
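A transport with its own per-CPU counters follows the same pattern as the global statistics: sum each counter across CPUs into a flat uint64_t array and hand it to rds_stats_info_copy() with a matching name table. A hedged sketch (the example_* names are placeholders):

static char *example_stat_names[] = {
	"example_events",
};

struct example_statistics {
	uint64_t s_example_events;
};

static DEFINE_PER_CPU(struct example_statistics, example_stats);

static unsigned int example_stats_info_copy(struct rds_info_iterator *iter,
					    unsigned int avail)
{
	uint64_t sums[ARRAY_SIZE(example_stat_names)] = { 0 };
	int cpu;
	size_t i;

	if (avail < ARRAY_SIZE(example_stat_names))
		goto out;

	for_each_online_cpu(cpu) {
		uint64_t *src = (uint64_t *)&per_cpu(example_stats, cpu);

		for (i = 0; i < ARRAY_SIZE(example_stat_names); i++)
			sums[i] += src[i];
	}

	rds_stats_info_copy(iter, sums, example_stat_names,
			    ARRAY_SIZE(example_stat_names));
out:
	return ARRAY_SIZE(example_stat_names);
}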
/*
* This gives global counters across all the transports. The strings
rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info);
}
-int rds_stats_init(void)
+int __init rds_stats_init(void)
{
rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info);
return 0;
unsigned int rds_sysctl_ping_enable = 1;
-static struct ctl_table rds_sysctl_rds_table[] = {
+static ctl_table rds_sysctl_rds_table[] = {
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "reconnect_min_delay_ms",
.data = &rds_sysctl_reconnect_min_jiffies,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_ms_jiffies_minmax,
+ .proc_handler = &proc_doulongvec_ms_jiffies_minmax,
.extra1 = &rds_sysctl_reconnect_min,
.extra2 = &rds_sysctl_reconnect_max_jiffies,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "reconnect_max_delay_ms",
.data = &rds_sysctl_reconnect_max_jiffies,
.maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_doulongvec_ms_jiffies_minmax,
+ .proc_handler = &proc_doulongvec_ms_jiffies_minmax,
.extra1 = &rds_sysctl_reconnect_min_jiffies,
.extra2 = &rds_sysctl_reconnect_max,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "max_unacked_packets",
.data = &rds_sysctl_max_unacked_packets,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = &proc_dointvec,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "max_unacked_bytes",
.data = &rds_sysctl_max_unacked_bytes,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = &proc_dointvec,
},
{
+ .ctl_name = CTL_UNNUMBERED,
.procname = "ping_enable",
.data = &rds_sysctl_ping_enable,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = &proc_dointvec,
},
+ { .ctl_name = 0}
+};
+
+static struct ctl_path rds_sysctl_path[] = {
+ { .procname = "net", .ctl_name = CTL_NET, },
+ { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
{ }
};
+
void rds_sysctl_exit(void)
{
- unregister_net_sysctl_table(rds_sysctl_reg_table);
+ if (rds_sysctl_reg_table)
+ unregister_sysctl_table(rds_sysctl_reg_table);
}
-int rds_sysctl_init(void)
+int __init rds_sysctl_init(void)
{
rds_sysctl_reconnect_min = msecs_to_jiffies(1);
rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
- rds_sysctl_reg_table = register_net_sysctl(&init_net,"net/rds", rds_sysctl_rds_table);
- if (!rds_sysctl_reg_table)
+ rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table);
+ if (rds_sysctl_reg_table == NULL)
return -ENOMEM;
return 0;
}
+++ /dev/null
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/in.h>
-#include <linux/module.h>
-#include <net/tcp.h>
-
-#include "rds.h"
-#include "tcp.h"
-
-/* only for info exporting */
-static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
-static LIST_HEAD(rds_tcp_tc_list);
-static unsigned int rds_tcp_tc_count;
-
-/* Track rds_tcp_connection structs so they can be cleaned up */
-static DEFINE_SPINLOCK(rds_tcp_conn_lock);
-static LIST_HEAD(rds_tcp_conn_list);
-
-static struct kmem_cache *rds_tcp_conn_slab;
-
-#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024)
-
-/* doing it this way avoids calling tcp_sk() */
-void rds_tcp_nonagle(struct socket *sock)
-{
- mm_segment_t oldfs = get_fs();
- int val = 1;
-
- set_fs(KERNEL_DS);
- sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, (char __user *)&val,
- sizeof(val));
- set_fs(oldfs);
-}
-
-void rds_tcp_tune(struct socket *sock)
-{
- struct sock *sk = sock->sk;
-
- rds_tcp_nonagle(sock);
-
- /*
- * We're trying to saturate gigabit with the default,
- * see svc_sock_setbufsize().
- */
- lock_sock(sk);
- sk->sk_sndbuf = RDS_TCP_DEFAULT_BUFSIZE;
- sk->sk_rcvbuf = RDS_TCP_DEFAULT_BUFSIZE;
- sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK;
- release_sock(sk);
-}
-
-u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
-{
- return tcp_sk(tc->t_sock->sk)->snd_nxt;
-}
-
-u32 rds_tcp_snd_una(struct rds_tcp_connection *tc)
-{
- return tcp_sk(tc->t_sock->sk)->snd_una;
-}
-
-void rds_tcp_restore_callbacks(struct socket *sock,
- struct rds_tcp_connection *tc)
-{
- rdsdebug("restoring sock %p callbacks from tc %p\n", sock, tc);
- write_lock_bh(&sock->sk->sk_callback_lock);
-
- /* done under the callback_lock to serialize with write_space */
- spin_lock(&rds_tcp_tc_list_lock);
- list_del_init(&tc->t_list_item);
- rds_tcp_tc_count--;
- spin_unlock(&rds_tcp_tc_list_lock);
-
- tc->t_sock = NULL;
-
- sock->sk->sk_write_space = tc->t_orig_write_space;
- sock->sk->sk_data_ready = tc->t_orig_data_ready;
- sock->sk->sk_state_change = tc->t_orig_state_change;
- sock->sk->sk_user_data = NULL;
-
- write_unlock_bh(&sock->sk->sk_callback_lock);
-}
-
-/*
- * This is the only path that sets tc->t_sock. Send and receive trust that
- * it is set. The RDS_CONN_CONNECTED bit protects those paths from being
- * called while it isn't set.
- */
-void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn)
-{
- struct rds_tcp_connection *tc = conn->c_transport_data;
-
- rdsdebug("setting sock %p callbacks to tc %p\n", sock, tc);
- write_lock_bh(&sock->sk->sk_callback_lock);
-
- /* done under the callback_lock to serialize with write_space */
- spin_lock(&rds_tcp_tc_list_lock);
- list_add_tail(&tc->t_list_item, &rds_tcp_tc_list);
- rds_tcp_tc_count++;
- spin_unlock(&rds_tcp_tc_list_lock);
-
- /* accepted sockets need our listen data ready undone */
- if (sock->sk->sk_data_ready == rds_tcp_listen_data_ready)
- sock->sk->sk_data_ready = sock->sk->sk_user_data;
-
- tc->t_sock = sock;
- tc->conn = conn;
- tc->t_orig_data_ready = sock->sk->sk_data_ready;
- tc->t_orig_write_space = sock->sk->sk_write_space;
- tc->t_orig_state_change = sock->sk->sk_state_change;
-
- sock->sk->sk_user_data = conn;
- sock->sk->sk_data_ready = rds_tcp_data_ready;
- sock->sk->sk_write_space = rds_tcp_write_space;
- sock->sk->sk_state_change = rds_tcp_state_change;
-
- write_unlock_bh(&sock->sk->sk_callback_lock);
-}
-
-static void rds_tcp_tc_info(struct socket *sock, unsigned int len,
- struct rds_info_iterator *iter,
- struct rds_info_lengths *lens)
-{
- struct rds_info_tcp_socket tsinfo;
- struct rds_tcp_connection *tc;
- unsigned long flags;
- struct sockaddr_in sin;
- int sinlen;
-
- spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
-
- if (len / sizeof(tsinfo) < rds_tcp_tc_count)
- goto out;
-
- list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
-
- sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 0);
- tsinfo.local_addr = sin.sin_addr.s_addr;
- tsinfo.local_port = sin.sin_port;
- sock->ops->getname(sock, (struct sockaddr *)&sin, &sinlen, 1);
- tsinfo.peer_addr = sin.sin_addr.s_addr;
- tsinfo.peer_port = sin.sin_port;
-
- tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
- tsinfo.data_rem = tc->t_tinc_data_rem;
- tsinfo.last_sent_nxt = tc->t_last_sent_nxt;
- tsinfo.last_expected_una = tc->t_last_expected_una;
- tsinfo.last_seen_una = tc->t_last_seen_una;
-
- rds_info_copy(iter, &tsinfo, sizeof(tsinfo));
- }
-
-out:
- lens->nr = rds_tcp_tc_count;
- lens->each = sizeof(tsinfo);
-
- spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
-}
-
-static int rds_tcp_laddr_check(__be32 addr)
-{
- if (inet_addr_type(&init_net, addr) == RTN_LOCAL)
- return 0;
- return -EADDRNOTAVAIL;
-}
-
-static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
-{
- struct rds_tcp_connection *tc;
-
- tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp);
- if (!tc)
- return -ENOMEM;
-
- tc->t_sock = NULL;
- tc->t_tinc = NULL;
- tc->t_tinc_hdr_rem = sizeof(struct rds_header);
- tc->t_tinc_data_rem = 0;
-
- conn->c_transport_data = tc;
-
- spin_lock_irq(&rds_tcp_conn_lock);
- list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
- spin_unlock_irq(&rds_tcp_conn_lock);
-
- rdsdebug("alloced tc %p\n", conn->c_transport_data);
- return 0;
-}
-
-static void rds_tcp_conn_free(void *arg)
-{
- struct rds_tcp_connection *tc = arg;
- unsigned long flags;
- rdsdebug("freeing tc %p\n", tc);
-
- spin_lock_irqsave(&rds_tcp_conn_lock, flags);
- list_del(&tc->t_tcp_node);
- spin_unlock_irqrestore(&rds_tcp_conn_lock, flags);
-
- kmem_cache_free(rds_tcp_conn_slab, tc);
-}
-
-static void rds_tcp_destroy_conns(void)
-{
- struct rds_tcp_connection *tc, *_tc;
- LIST_HEAD(tmp_list);
-
- /* avoid calling conn_destroy with irqs off */
- spin_lock_irq(&rds_tcp_conn_lock);
- list_splice(&rds_tcp_conn_list, &tmp_list);
- INIT_LIST_HEAD(&rds_tcp_conn_list);
- spin_unlock_irq(&rds_tcp_conn_lock);
-
- list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) {
- if (tc->conn->c_passive)
- rds_conn_destroy(tc->conn->c_passive);
- rds_conn_destroy(tc->conn);
- }
-}
-
-static void rds_tcp_exit(void)
-{
- rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
- rds_tcp_listen_stop();
- rds_tcp_destroy_conns();
- rds_trans_unregister(&rds_tcp_transport);
- rds_tcp_recv_exit();
- kmem_cache_destroy(rds_tcp_conn_slab);
-}
-module_exit(rds_tcp_exit);
-
-struct rds_transport rds_tcp_transport = {
- .laddr_check = rds_tcp_laddr_check,
- .xmit_prepare = rds_tcp_xmit_prepare,
- .xmit_complete = rds_tcp_xmit_complete,
- .xmit = rds_tcp_xmit,
- .recv = rds_tcp_recv,
- .conn_alloc = rds_tcp_conn_alloc,
- .conn_free = rds_tcp_conn_free,
- .conn_connect = rds_tcp_conn_connect,
- .conn_shutdown = rds_tcp_conn_shutdown,
- .inc_copy_to_user = rds_tcp_inc_copy_to_user,
- .inc_free = rds_tcp_inc_free,
- .stats_info_copy = rds_tcp_stats_info_copy,
- .exit = rds_tcp_exit,
- .t_owner = THIS_MODULE,
- .t_name = "tcp",
- .t_type = RDS_TRANS_TCP,
- .t_prefer_loopback = 1,
-};
-
-static int rds_tcp_init(void)
-{
- int ret;
-
- rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection",
- sizeof(struct rds_tcp_connection),
- 0, 0, NULL);
- if (!rds_tcp_conn_slab) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = rds_tcp_recv_init();
- if (ret)
- goto out_slab;
-
- ret = rds_trans_register(&rds_tcp_transport);
- if (ret)
- goto out_recv;
-
- ret = rds_tcp_listen_init();
- if (ret)
- goto out_register;
-
- rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
-
- goto out;
-
-out_register:
- rds_trans_unregister(&rds_tcp_transport);
-out_recv:
- rds_tcp_recv_exit();
-out_slab:
- kmem_cache_destroy(rds_tcp_conn_slab);
-out:
- return ret;
-}
-module_init(rds_tcp_init);
-
-MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
-MODULE_DESCRIPTION("RDS: TCP transport");
-MODULE_LICENSE("Dual BSD/GPL");
-
+++ /dev/null
-#ifndef _RDS_TCP_H
-#define _RDS_TCP_H
-
-#define RDS_TCP_PORT 16385
-
-struct rds_tcp_incoming {
- struct rds_incoming ti_inc;
- struct sk_buff_head ti_skb_list;
-};
-
-struct rds_tcp_connection {
-
- struct list_head t_tcp_node;
- struct rds_connection *conn;
- struct socket *t_sock;
- void *t_orig_write_space;
- void *t_orig_data_ready;
- void *t_orig_state_change;
-
- struct rds_tcp_incoming *t_tinc;
- size_t t_tinc_hdr_rem;
- size_t t_tinc_data_rem;
-
- /* XXX error report? */
- struct work_struct t_conn_w;
- struct work_struct t_send_w;
- struct work_struct t_down_w;
- struct work_struct t_recv_w;
-
- /* for info exporting only */
- struct list_head t_list_item;
- u32 t_last_sent_nxt;
- u32 t_last_expected_una;
- u32 t_last_seen_una;
-};
-
-struct rds_tcp_statistics {
- uint64_t s_tcp_data_ready_calls;
- uint64_t s_tcp_write_space_calls;
- uint64_t s_tcp_sndbuf_full;
- uint64_t s_tcp_connect_raced;
- uint64_t s_tcp_listen_closed_stale;
-};
-
-/* tcp.c */
-void rds_tcp_tune(struct socket *sock);
-void rds_tcp_nonagle(struct socket *sock);
-void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn);
-void rds_tcp_restore_callbacks(struct socket *sock,
- struct rds_tcp_connection *tc);
-u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc);
-u32 rds_tcp_snd_una(struct rds_tcp_connection *tc);
-u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq);
-extern struct rds_transport rds_tcp_transport;
-
-/* tcp_connect.c */
-int rds_tcp_conn_connect(struct rds_connection *conn);
-void rds_tcp_conn_shutdown(struct rds_connection *conn);
-void rds_tcp_state_change(struct sock *sk);
-
-/* tcp_listen.c */
-int rds_tcp_listen_init(void);
-void rds_tcp_listen_stop(void);
-void rds_tcp_listen_data_ready(struct sock *sk);
-
-/* tcp_recv.c */
-int rds_tcp_recv_init(void);
-void rds_tcp_recv_exit(void);
-void rds_tcp_data_ready(struct sock *sk);
-int rds_tcp_recv(struct rds_connection *conn);
-void rds_tcp_inc_free(struct rds_incoming *inc);
-int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
-
-/* tcp_send.c */
-void rds_tcp_xmit_prepare(struct rds_connection *conn);
-void rds_tcp_xmit_complete(struct rds_connection *conn);
-int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
- unsigned int hdr_off, unsigned int sg, unsigned int off);
-void rds_tcp_write_space(struct sock *sk);
-
-/* tcp_stats.c */
-DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats);
-#define rds_tcp_stats_inc(member) rds_stats_inc_which(rds_tcp_stats, member)
-unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter,
- unsigned int avail);
-
-#endif
+++ /dev/null
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/in.h>
-#include <net/tcp.h>
-
-#include "rds.h"
-#include "tcp.h"
-
-void rds_tcp_state_change(struct sock *sk)
-{
- void (*state_change)(struct sock *sk);
- struct rds_connection *conn;
- struct rds_tcp_connection *tc;
-
- read_lock(&sk->sk_callback_lock);
- conn = sk->sk_user_data;
- if (!conn) {
- state_change = sk->sk_state_change;
- goto out;
- }
- tc = conn->c_transport_data;
- state_change = tc->t_orig_state_change;
-
- rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state);
-
- switch(sk->sk_state) {
- /* ignore connecting sockets as they make progress */
- case TCP_SYN_SENT:
- case TCP_SYN_RECV:
- break;
- case TCP_ESTABLISHED:
- rds_connect_complete(conn);
- break;
- case TCP_CLOSE_WAIT:
- case TCP_CLOSE:
- rds_conn_drop(conn);
- default:
- break;
- }
-out:
- read_unlock(&sk->sk_callback_lock);
- state_change(sk);
-}
-
-int rds_tcp_conn_connect(struct rds_connection *conn)
-{
- struct socket *sock = NULL;
- struct sockaddr_in src, dest;
- int ret;
-
- ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
- if (ret < 0)
- goto out;
-
- rds_tcp_tune(sock);
-
- src.sin_family = AF_INET;
- src.sin_addr.s_addr = (__force u32)conn->c_laddr;
- src.sin_port = (__force u16)htons(0);
-
- ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src));
- if (ret) {
- rdsdebug("bind failed with %d at address %pI4\n",
- ret, &conn->c_laddr);
- goto out;
- }
-
- dest.sin_family = AF_INET;
- dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
- dest.sin_port = (__force u16)htons(RDS_TCP_PORT);
-
- /*
- * once we call connect() we can start getting callbacks and they
- * own the socket
- */
- rds_tcp_set_callbacks(sock, conn);
- ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest),
- O_NONBLOCK);
-
- rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret);
- if (ret == -EINPROGRESS)
- ret = 0;
- if (ret == 0)
- sock = NULL;
- else
- rds_tcp_restore_callbacks(sock, conn->c_transport_data);
-
-out:
- if (sock)
- sock_release(sock);
- return ret;
-}
-
-/*
- * Before killing the tcp socket this needs to serialize with callbacks. The
- * caller has already grabbed the sending sem so we're serialized with other
- * senders.
- *
- * TCP calls the callbacks with the sock lock so we hold it while we reset the
- * callbacks to those set by TCP. Our callbacks won't execute again once we
- * hold the sock lock.
- */
-void rds_tcp_conn_shutdown(struct rds_connection *conn)
-{
- struct rds_tcp_connection *tc = conn->c_transport_data;
- struct socket *sock = tc->t_sock;
-
- rdsdebug("shutting down conn %p tc %p sock %p\n", conn, tc, sock);
-
- if (sock) {
- sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN);
- lock_sock(sock->sk);
- rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */
-
- release_sock(sock->sk);
- sock_release(sock);
- }
-
- if (tc->t_tinc) {
- rds_inc_put(&tc->t_tinc->ti_inc);
- tc->t_tinc = NULL;
- }
- tc->t_tinc_hdr_rem = sizeof(struct rds_header);
- tc->t_tinc_data_rem = 0;
-}
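/*
 * Rough sketch of the callback hijacking that rds_tcp_set_callbacks() and
 * rds_tcp_restore_callbacks() perform (the real helpers live in tcp.c and
 * may differ in detail):
 *
 *    write_lock_bh(&sock->sk->sk_callback_lock);
 *    tc->t_orig_data_ready = sock->sk->sk_data_ready;
 *    sock->sk->sk_user_data = conn;
 *    sock->sk->sk_data_ready = rds_tcp_data_ready;
 *    write_unlock_bh(&sock->sk->sk_callback_lock);
 *
 * Restoring writes the saved pointers back and clears sk_user_data under
 * the same lock, which is why rds_tcp_conn_shutdown() above only needs the
 * sock lock, and why the callbacks bail out when sk_user_data is NULL.
 */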
+++ /dev/null
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/gfp.h>
-#include <linux/in.h>
-#include <net/tcp.h>
-
-#include "rds.h"
-#include "tcp.h"
-
-/*
- * cheesy, but simple..
- */
-static void rds_tcp_accept_worker(struct work_struct *work);
-static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker);
-static struct socket *rds_tcp_listen_sock;
-
-static int rds_tcp_keepalive(struct socket *sock)
-{
- /* values below based on xs_udp_default_timeout */
- int keepidle = 5; /* send a probe 'keepidle' secs after last data */
- int keepcnt = 5; /* number of unack'ed probes before declaring dead */
- int keepalive = 1;
- int ret = 0;
-
- ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
- (char *)&keepalive, sizeof(keepalive));
- if (ret < 0)
- goto bail;
-
- ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT,
- (char *)&keepcnt, sizeof(keepcnt));
- if (ret < 0)
- goto bail;
-
- ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE,
- (char *)&keepidle, sizeof(keepidle));
- if (ret < 0)
- goto bail;
-
- /* KEEPINTVL is the interval between successive probes. We follow
- * the model in xs_tcp_finish_connecting() and re-use keepidle.
- */
- ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL,
- (char *)&keepidle, sizeof(keepidle));
-bail:
- return ret;
-}
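/*
 * With the values above (keepidle = keepintvl = 5s, keepcnt = 5) a peer
 * that stops responding is declared dead roughly
 * keepidle + keepcnt * keepintvl = 5 + 5 * 5 = 30 seconds after the
 * connection goes quiet.
 */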
-
-static int rds_tcp_accept_one(struct socket *sock)
-{
- struct socket *new_sock = NULL;
- struct rds_connection *conn;
- int ret;
- struct inet_sock *inet;
- struct rds_tcp_connection *rs_tcp;
-
- ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
- sock->sk->sk_protocol, &new_sock);
- if (ret)
- goto out;
-
- new_sock->type = sock->type;
- new_sock->ops = sock->ops;
- ret = sock->ops->accept(sock, new_sock, O_NONBLOCK);
- if (ret < 0)
- goto out;
-
- ret = rds_tcp_keepalive(new_sock);
- if (ret < 0)
- goto out;
-
- rds_tcp_tune(new_sock);
-
- inet = inet_sk(new_sock->sk);
-
- rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n",
- &inet->inet_saddr, ntohs(inet->inet_sport),
- &inet->inet_daddr, ntohs(inet->inet_dport));
-
- conn = rds_conn_create(inet->inet_saddr, inet->inet_daddr,
- &rds_tcp_transport, GFP_KERNEL);
- if (IS_ERR(conn)) {
- ret = PTR_ERR(conn);
- goto out;
- }
- /* An incoming SYN request came in, and TCP just accepted it.
- * We always create a new conn for listen side of TCP, and do not
- * add it to the c_hash_list.
- *
- * If the client reboots, this conn will need to be cleaned up.
- * rds_tcp_state_change() will do that cleanup
- */
- rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data;
- WARN_ON(!rs_tcp || rs_tcp->t_sock);
-
- /*
- * see the comment above rds_queue_delayed_reconnect()
- */
- if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
- if (rds_conn_state(conn) == RDS_CONN_UP)
- rds_tcp_stats_inc(s_tcp_listen_closed_stale);
- else
- rds_tcp_stats_inc(s_tcp_connect_raced);
- rds_conn_drop(conn);
- ret = 0;
- goto out;
- }
-
- rds_tcp_set_callbacks(new_sock, conn);
- rds_connect_complete(conn);
- new_sock = NULL;
- ret = 0;
-
-out:
- if (new_sock)
- sock_release(new_sock);
- return ret;
-}
-
-static void rds_tcp_accept_worker(struct work_struct *work)
-{
- while (rds_tcp_accept_one(rds_tcp_listen_sock) == 0)
- cond_resched();
-}
-
-void rds_tcp_listen_data_ready(struct sock *sk)
-{
- void (*ready)(struct sock *sk);
-
- rdsdebug("listen data ready sk %p\n", sk);
-
- read_lock(&sk->sk_callback_lock);
- ready = sk->sk_user_data;
- if (!ready) { /* check for teardown race */
- ready = sk->sk_data_ready;
- goto out;
- }
-
- /*
- * ->sk_data_ready is also called for a newly established child socket
- * before it has been accepted and the accepter has set up their
- * data_ready.. we only want to queue listen work for our listening
- * socket
- */
- if (sk->sk_state == TCP_LISTEN)
- queue_work(rds_wq, &rds_tcp_listen_work);
-
-out:
- read_unlock(&sk->sk_callback_lock);
- ready(sk);
-}
-
-int rds_tcp_listen_init(void)
-{
- struct sockaddr_in sin;
- struct socket *sock = NULL;
- int ret;
-
- ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
- if (ret < 0)
- goto out;
-
- sock->sk->sk_reuse = SK_CAN_REUSE;
- rds_tcp_nonagle(sock);
-
- write_lock_bh(&sock->sk->sk_callback_lock);
- sock->sk->sk_user_data = sock->sk->sk_data_ready;
- sock->sk->sk_data_ready = rds_tcp_listen_data_ready;
- write_unlock_bh(&sock->sk->sk_callback_lock);
-
- sin.sin_family = PF_INET;
- sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
- sin.sin_port = (__force u16)htons(RDS_TCP_PORT);
-
- ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
- if (ret < 0)
- goto out;
-
- ret = sock->ops->listen(sock, 64);
- if (ret < 0)
- goto out;
-
- rds_tcp_listen_sock = sock;
- sock = NULL;
-out:
- if (sock)
- sock_release(sock);
- return ret;
-}
-
-void rds_tcp_listen_stop(void)
-{
- struct socket *sock = rds_tcp_listen_sock;
- struct sock *sk;
-
- if (!sock)
- return;
-
- sk = sock->sk;
-
- /* serialize with and prevent further callbacks */
- lock_sock(sk);
- write_lock_bh(&sk->sk_callback_lock);
- if (sk->sk_user_data) {
- sk->sk_data_ready = sk->sk_user_data;
- sk->sk_user_data = NULL;
- }
- write_unlock_bh(&sk->sk_callback_lock);
- release_sock(sk);
-
- /* wait for accepts to stop and close the socket */
- flush_workqueue(rds_wq);
- sock_release(sock);
- rds_tcp_listen_sock = NULL;
-}
+++ /dev/null
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <net/tcp.h>
-
-#include "rds.h"
-#include "tcp.h"
-
-static struct kmem_cache *rds_tcp_incoming_slab;
-
-static void rds_tcp_inc_purge(struct rds_incoming *inc)
-{
- struct rds_tcp_incoming *tinc;
- tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
- rdsdebug("purging tinc %p inc %p\n", tinc, inc);
- skb_queue_purge(&tinc->ti_skb_list);
-}
-
-void rds_tcp_inc_free(struct rds_incoming *inc)
-{
- struct rds_tcp_incoming *tinc;
- tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
- rds_tcp_inc_purge(inc);
- rdsdebug("freeing tinc %p inc %p\n", tinc, inc);
- kmem_cache_free(rds_tcp_incoming_slab, tinc);
-}
-
-/*
- * this is pretty lame, but, whatever.
- */
-int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
-{
- struct rds_tcp_incoming *tinc;
- struct sk_buff *skb;
- int ret = 0;
-
- if (!iov_iter_count(to))
- goto out;
-
- tinc = container_of(inc, struct rds_tcp_incoming, ti_inc);
-
- skb_queue_walk(&tinc->ti_skb_list, skb) {
- unsigned long to_copy, skb_off;
- for (skb_off = 0; skb_off < skb->len; skb_off += to_copy) {
- to_copy = iov_iter_count(to);
- to_copy = min(to_copy, skb->len - skb_off);
-
- if (skb_copy_datagram_iter(skb, skb_off, to, to_copy))
- return -EFAULT;
-
- rds_stats_add(s_copy_to_user, to_copy);
- ret += to_copy;
-
- if (!iov_iter_count(to))
- goto out;
- }
- }
-out:
- return ret;
-}
-
-/*
- * We have a series of skbs that have fragmented pieces of the congestion
- * bitmap. They must add up to the exact size of the congestion bitmap. We
- * use the skb helpers to copy those into the pages that make up the in-memory
- * congestion bitmap for the remote address of this connection. We then tell
- * the congestion core that the bitmap has been changed so that it can wake up
- * sleepers.
- *
- * This is racing with sending paths which are using test_bit to see if the
- * bitmap indicates that their recipient is congested.
- */
-
-static void rds_tcp_cong_recv(struct rds_connection *conn,
- struct rds_tcp_incoming *tinc)
-{
- struct sk_buff *skb;
- unsigned int to_copy, skb_off;
- unsigned int map_off;
- unsigned int map_page;
- struct rds_cong_map *map;
- int ret;
-
- /* catch completely corrupt packets */
- if (be32_to_cpu(tinc->ti_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
- return;
-
- map_page = 0;
- map_off = 0;
- map = conn->c_fcong;
-
- skb_queue_walk(&tinc->ti_skb_list, skb) {
- skb_off = 0;
- while (skb_off < skb->len) {
- to_copy = min_t(unsigned int, PAGE_SIZE - map_off,
- skb->len - skb_off);
-
- BUG_ON(map_page >= RDS_CONG_MAP_PAGES);
-
- /* only returns 0 or -error */
- ret = skb_copy_bits(skb, skb_off,
- (void *)map->m_page_addrs[map_page] + map_off,
- to_copy);
- BUG_ON(ret != 0);
-
- skb_off += to_copy;
- map_off += to_copy;
- if (map_off == PAGE_SIZE) {
- map_off = 0;
- map_page++;
- }
- }
- }
-
- rds_cong_map_updated(map, ~(u64) 0);
-}
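/*
 * For scale: the congestion map carries one bit per 16-bit RDS port, so
 * RDS_CONG_MAP_BYTES works out to 65536 / 8 = 8192 bytes, i.e. two pages
 * on a 4 KiB-page system.  The loop above fills page 0 at offsets 0..4095
 * and then advances map_page to page 1, independent of how the bytes were
 * split across skbs.
 */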
-
-struct rds_tcp_desc_arg {
- struct rds_connection *conn;
- gfp_t gfp;
-};
-
-static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
- unsigned int offset, size_t len)
-{
- struct rds_tcp_desc_arg *arg = desc->arg.data;
- struct rds_connection *conn = arg->conn;
- struct rds_tcp_connection *tc = conn->c_transport_data;
- struct rds_tcp_incoming *tinc = tc->t_tinc;
- struct sk_buff *clone;
- size_t left = len, to_copy;
-
- rdsdebug("tcp data tc %p skb %p offset %u len %zu\n", tc, skb, offset,
- len);
-
- /*
- * tcp_read_sock() interprets partial progress as an indication to stop
- * processing.
- */
- while (left) {
- if (!tinc) {
- tinc = kmem_cache_alloc(rds_tcp_incoming_slab,
- arg->gfp);
- if (!tinc) {
- desc->error = -ENOMEM;
- goto out;
- }
- tc->t_tinc = tinc;
- rdsdebug("alloced tinc %p\n", tinc);
- rds_inc_init(&tinc->ti_inc, conn, conn->c_faddr);
- /*
- * XXX * we might be able to use the __ variants when
- * we've already serialized at a higher level.
- */
- skb_queue_head_init(&tinc->ti_skb_list);
- }
-
- if (left && tc->t_tinc_hdr_rem) {
- to_copy = min(tc->t_tinc_hdr_rem, left);
- rdsdebug("copying %zu header from skb %p\n", to_copy,
- skb);
- skb_copy_bits(skb, offset,
- (char *)&tinc->ti_inc.i_hdr +
- sizeof(struct rds_header) -
- tc->t_tinc_hdr_rem,
- to_copy);
- tc->t_tinc_hdr_rem -= to_copy;
- left -= to_copy;
- offset += to_copy;
-
- if (tc->t_tinc_hdr_rem == 0) {
- /* could be 0 for a 0 len message */
- tc->t_tinc_data_rem =
- be32_to_cpu(tinc->ti_inc.i_hdr.h_len);
- }
- }
-
- if (left && tc->t_tinc_data_rem) {
- clone = skb_clone(skb, arg->gfp);
- if (!clone) {
- desc->error = -ENOMEM;
- goto out;
- }
-
- to_copy = min(tc->t_tinc_data_rem, left);
- pskb_pull(clone, offset);
- pskb_trim(clone, to_copy);
- skb_queue_tail(&tinc->ti_skb_list, clone);
-
- rdsdebug("skb %p data %p len %d off %u to_copy %zu -> "
- "clone %p data %p len %d\n",
- skb, skb->data, skb->len, offset, to_copy,
- clone, clone->data, clone->len);
-
- tc->t_tinc_data_rem -= to_copy;
- left -= to_copy;
- offset += to_copy;
- }
-
- if (tc->t_tinc_hdr_rem == 0 && tc->t_tinc_data_rem == 0) {
- if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
- rds_tcp_cong_recv(conn, tinc);
- else
- rds_recv_incoming(conn, conn->c_faddr,
- conn->c_laddr, &tinc->ti_inc,
- arg->gfp);
-
- tc->t_tinc_hdr_rem = sizeof(struct rds_header);
- tc->t_tinc_data_rem = 0;
- tc->t_tinc = NULL;
- rds_inc_put(&tinc->ti_inc);
- tinc = NULL;
- }
- }
-out:
- rdsdebug("returning len %zu left %zu skb len %d rx queue depth %d\n",
- len, left, skb->len,
- skb_queue_len(&tc->t_sock->sk->sk_receive_queue));
- return len - left;
-}
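/*
 * Worked example: an RDS message with a 100-byte payload arrives split
 * across two skbs, with only 30 bytes in the first.  The first call copies
 * 30 header bytes and leaves t_tinc_hdr_rem at
 * sizeof(struct rds_header) - 30; the second call finishes the header,
 * sets t_tinc_data_rem to h_len (100), clones the payload bytes onto
 * ti_skb_list, and only when both counters reach zero is the message
 * handed to rds_recv_incoming() (or to rds_tcp_cong_recv() for a
 * congestion-map update).
 */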
-
-/* the caller has to hold the sock lock */
-static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp)
-{
- struct rds_tcp_connection *tc = conn->c_transport_data;
- struct socket *sock = tc->t_sock;
- read_descriptor_t desc;
- struct rds_tcp_desc_arg arg;
-
- /* It's like glib in the kernel! */
- arg.conn = conn;
- arg.gfp = gfp;
- desc.arg.data = &arg;
- desc.error = 0;
- desc.count = 1; /* give more than one skb per call */
-
- tcp_read_sock(sock->sk, &desc, rds_tcp_data_recv);
- rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp,
- desc.error);
-
- return desc.error;
-}
-
-/*
- * We hold the sock lock to serialize our rds_tcp_recv->tcp_read_sock from
- * data_ready.
- *
- * if we fail to allocate we're in trouble.. blindly wait some time before
- * trying again to see if the VM can free up something for us.
- */
-int rds_tcp_recv(struct rds_connection *conn)
-{
- struct rds_tcp_connection *tc = conn->c_transport_data;
- struct socket *sock = tc->t_sock;
- int ret = 0;
-
- rdsdebug("recv worker conn %p tc %p sock %p\n", conn, tc, sock);
-
- lock_sock(sock->sk);
- ret = rds_tcp_read_sock(conn, GFP_KERNEL);
- release_sock(sock->sk);
-
- return ret;
-}
-
-void rds_tcp_data_ready(struct sock *sk)
-{
- void (*ready)(struct sock *sk);
- struct rds_connection *conn;
- struct rds_tcp_connection *tc;
-
- rdsdebug("data ready sk %p\n", sk);
-
- read_lock(&sk->sk_callback_lock);
- conn = sk->sk_user_data;
- if (!conn) { /* check for teardown race */
- ready = sk->sk_data_ready;
- goto out;
- }
-
- tc = conn->c_transport_data;
- ready = tc->t_orig_data_ready;
- rds_tcp_stats_inc(s_tcp_data_ready_calls);
-
- if (rds_tcp_read_sock(conn, GFP_ATOMIC) == -ENOMEM)
- queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
-out:
- read_unlock(&sk->sk_callback_lock);
- ready(sk);
-}
-
-int rds_tcp_recv_init(void)
-{
- rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming",
- sizeof(struct rds_tcp_incoming),
- 0, 0, NULL);
- if (!rds_tcp_incoming_slab)
- return -ENOMEM;
- return 0;
-}
-
-void rds_tcp_recv_exit(void)
-{
- kmem_cache_destroy(rds_tcp_incoming_slab);
-}
+++ /dev/null
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/in.h>
-#include <net/tcp.h>
-
-#include "rds.h"
-#include "tcp.h"
-
-static void rds_tcp_cork(struct socket *sock, int val)
-{
- mm_segment_t oldfs;
-
- oldfs = get_fs();
- set_fs(KERNEL_DS);
- sock->ops->setsockopt(sock, SOL_TCP, TCP_CORK, (char __user *)&val,
- sizeof(val));
- set_fs(oldfs);
-}
-
-void rds_tcp_xmit_prepare(struct rds_connection *conn)
-{
- struct rds_tcp_connection *tc = conn->c_transport_data;
-
- rds_tcp_cork(tc->t_sock, 1);
-}
-
-void rds_tcp_xmit_complete(struct rds_connection *conn)
-{
- struct rds_tcp_connection *tc = conn->c_transport_data;
-
- rds_tcp_cork(tc->t_sock, 0);
-}
-
-/* the core send_sem serializes this with other xmit and shutdown */
-static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len)
-{
- struct kvec vec = {
- .iov_base = data,
- .iov_len = len,
- };
- struct msghdr msg = {
- .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL,
- };
-
- return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len);
-}
-
-/* the core send_sem serializes this with other xmit and shutdown */
-int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
- unsigned int hdr_off, unsigned int sg, unsigned int off)
-{
- struct rds_tcp_connection *tc = conn->c_transport_data;
- int done = 0;
- int ret = 0;
-
- if (hdr_off == 0) {
- /*
- * m_ack_seq is set to the sequence number of the last byte of
- * header and data. see rds_tcp_is_acked().
- */
- tc->t_last_sent_nxt = rds_tcp_snd_nxt(tc);
- rm->m_ack_seq = tc->t_last_sent_nxt +
- sizeof(struct rds_header) +
- be32_to_cpu(rm->m_inc.i_hdr.h_len) - 1;
- smp_mb__before_atomic();
- set_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags);
- tc->t_last_expected_una = rm->m_ack_seq + 1;
-
- rdsdebug("rm %p tcp nxt %u ack_seq %llu\n",
- rm, rds_tcp_snd_nxt(tc),
- (unsigned long long)rm->m_ack_seq);
- }
-
- if (hdr_off < sizeof(struct rds_header)) {
- /* see rds_tcp_write_space() */
- set_bit(SOCK_NOSPACE, &tc->t_sock->sk->sk_socket->flags);
-
- ret = rds_tcp_sendmsg(tc->t_sock,
- (void *)&rm->m_inc.i_hdr + hdr_off,
- sizeof(rm->m_inc.i_hdr) - hdr_off);
- if (ret < 0)
- goto out;
- done += ret;
- if (hdr_off + done != sizeof(struct rds_header))
- goto out;
- }
-
- while (sg < rm->data.op_nents) {
- ret = tc->t_sock->ops->sendpage(tc->t_sock,
- sg_page(&rm->data.op_sg[sg]),
- rm->data.op_sg[sg].offset + off,
- rm->data.op_sg[sg].length - off,
- MSG_DONTWAIT|MSG_NOSIGNAL);
- rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]),
- rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off,
- ret);
- if (ret <= 0)
- break;
-
- off += ret;
- done += ret;
- if (off == rm->data.op_sg[sg].length) {
- off = 0;
- sg++;
- }
- }
-
-out:
- if (ret <= 0) {
- /* write_space will hit after EAGAIN, all else fatal */
- if (ret == -EAGAIN) {
- rds_tcp_stats_inc(s_tcp_sndbuf_full);
- ret = 0;
- } else {
- printk(KERN_WARNING "RDS/tcp: send to %pI4 "
- "returned %d, disconnecting and reconnecting\n",
- &conn->c_faddr, ret);
- rds_conn_drop(conn);
- }
- }
- if (done == 0)
- done = ret;
- return done;
-}
-
-/*
- * rm->m_ack_seq is set to the tcp sequence number that corresponds to the
- * last byte of the message, including the header. This means that the
- * entire message has been received if rm->m_ack_seq is "before" the next
- * unacked byte of the TCP sequence space. We have to do very careful
- * wrapping 32bit comparisons here.
- */
-static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack)
-{
- if (!test_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags))
- return 0;
- return (__s32)((u32)rm->m_ack_seq - (u32)ack) < 0;
-}
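/*
 * Worked example: if rds_tcp_snd_nxt() was 1000 when the header went out
 * and the payload is 100 bytes, m_ack_seq becomes
 * 1000 + sizeof(struct rds_header) + 100 - 1, the TCP sequence number of
 * the message's last byte; the message counts as acked once snd_una has
 * moved past it.  The signed cast above keeps that test correct across
 * sequence wrap: with m_ack_seq == 0xfffffff0 and ack == 0x10 the u32
 * difference is 0xffffffe0, which is negative as an __s32, so the message
 * is still treated as acked.
 */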
-
-void rds_tcp_write_space(struct sock *sk)
-{
- void (*write_space)(struct sock *sk);
- struct rds_connection *conn;
- struct rds_tcp_connection *tc;
-
- read_lock(&sk->sk_callback_lock);
- conn = sk->sk_user_data;
- if (!conn) {
- write_space = sk->sk_write_space;
- goto out;
- }
-
- tc = conn->c_transport_data;
- rdsdebug("write_space for tc %p\n", tc);
- write_space = tc->t_orig_write_space;
- rds_tcp_stats_inc(s_tcp_write_space_calls);
-
- rdsdebug("tcp una %u\n", rds_tcp_snd_una(tc));
- tc->t_last_seen_una = rds_tcp_snd_una(tc);
- rds_send_drop_acked(conn, rds_tcp_snd_una(tc), rds_tcp_is_acked);
-
- if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf)
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
-
-out:
- read_unlock(&sk->sk_callback_lock);
-
- /*
- * write_space is only called when data leaves tcp's send queue if
- * SOCK_NOSPACE is set. We set SOCK_NOSPACE every time we put
- * data in tcp's send queue because we use write_space to parse the
- * sequence numbers and notice that rds messages have been fully
- * received.
- *
- * tcp's write_space clears SOCK_NOSPACE if the send queue has more
- * than a certain amount of space. So we need to set it again *after*
- * we call tcp's write_space or else we might only get called on the
- * first of a series of incoming tcp acks.
- */
- write_space(sk);
-
- if (sk->sk_socket)
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-}
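/*
 * The shifted comparison above is essentially the sock_writeable() test:
 * the send worker is only poked once no more than half of sk_sndbuf is
 * still in flight, rather than on every incoming ack.
 */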
+++ /dev/null
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/percpu.h>
-#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
-
-#include "rds.h"
-#include "tcp.h"
-
-DEFINE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats)
- ____cacheline_aligned;
-
-static const char * const rds_tcp_stat_names[] = {
- "tcp_data_ready_calls",
- "tcp_write_space_calls",
- "tcp_sndbuf_full",
- "tcp_connect_raced",
- "tcp_listen_closed_stale",
-};
-
-unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter,
- unsigned int avail)
-{
- struct rds_tcp_statistics stats = {0, };
- uint64_t *src;
- uint64_t *sum;
- size_t i;
- int cpu;
-
- if (avail < ARRAY_SIZE(rds_tcp_stat_names))
- goto out;
-
- for_each_online_cpu(cpu) {
- src = (uint64_t *)&(per_cpu(rds_tcp_stats, cpu));
- sum = (uint64_t *)&stats;
- for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
- *(sum++) += *(src++);
- }
-
- rds_stats_info_copy(iter, (uint64_t *)&stats, rds_tcp_stat_names,
- ARRAY_SIZE(rds_tcp_stat_names));
-out:
- return ARRAY_SIZE(rds_tcp_stat_names);
-}
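/*
 * The per-cpu summation treats struct rds_tcp_statistics as a flat array
 * of uint64_t, which only works because the struct holds nothing but
 * 64-bit counters in the same order as rds_tcp_stat_names; the
 * rds_stats_info_copy() call above relies on that same pairing of a u64
 * array with the name array.
 */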
*/
#include <linux/kernel.h>
#include <linux/random.h>
-#include <linux/export.h>
#include "rds.h"
*
* Transition to state DISCONNECTING/DOWN:
* - Inside the shutdown worker; synchronizes with xmit path
- * through RDS_IN_XMIT, and with connection management callbacks
+ * through c_send_lock, and with connection management callbacks
* via c_cm_lock.
*
* For receive callbacks, we rely on the underlying transport
* (TCP, IB/RDMA) to provide the necessary synchronisation.
*/
struct workqueue_struct *rds_wq;
-EXPORT_SYMBOL_GPL(rds_wq);
void rds_connect_complete(struct rds_connection *conn)
{
"current state is %d\n",
__func__,
atomic_read(&conn->c_state));
- rds_conn_drop(conn);
+ atomic_set(&conn->c_state, RDS_CONN_ERROR);
+ queue_work(rds_wq, &conn->c_down_w);
return;
}
queue_delayed_work(rds_wq, &conn->c_send_w, 0);
queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
}
-EXPORT_SYMBOL_GPL(rds_connect_complete);
/*
* This random exponential backoff is relied on to eventually resolve racing
* We should *always* start with a random backoff; otherwise a broken connection
* will always take several iterations to be re-established.
*/
-void rds_queue_reconnect(struct rds_connection *conn)
+static void rds_queue_reconnect(struct rds_connection *conn)
{
unsigned long rand;
}
}
+void rds_shutdown_worker(struct work_struct *work)
+{
+ struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
+
+ /* shut it down unless it's down already */
+ if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
+ /*
+ * Quiesce the connection mgmt handlers before we start tearing
+ * things down. We don't hold the mutex for the entire
+ * duration of the shutdown operation, otherwise we could
+ * deadlock with the CM handler. Instead, the CM event
+ * handler is expected to check for state DISCONNECTING.
+ */
+ mutex_lock(&conn->c_cm_lock);
+ if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
+ && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
+ rds_conn_error(conn, "shutdown called in state %d\n",
+ atomic_read(&conn->c_state));
+ mutex_unlock(&conn->c_cm_lock);
+ return;
+ }
+ mutex_unlock(&conn->c_cm_lock);
+
+ mutex_lock(&conn->c_send_lock);
+ conn->c_trans->conn_shutdown(conn);
+ rds_conn_reset(conn);
+ mutex_unlock(&conn->c_send_lock);
+
+ if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
+ /* This can happen - e.g. when we're in the middle of tearing
+ * down the connection and someone unloads the RDS module.
+ * Quite reproducible with loopback connections.
+ * Mostly harmless.
+ */
+ rds_conn_error(conn,
+ "%s: failed to transition to state DOWN, "
+ "current state is %d\n",
+ __func__,
+ atomic_read(&conn->c_state));
+ return;
+ }
+ }
+
+ /* Then reconnect if it's still live.
+ * The passive side of an IB loopback connection is never added
+ * to the conn hash, so we never trigger a reconnect on this
+ * conn - the reconnect is always triggered by the active peer. */
+ cancel_delayed_work(&conn->c_conn_w);
+ if (!hlist_unhashed(&conn->c_hash_node))
+ rds_queue_reconnect(conn);
+}
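/*
 * Rough state flow driven by this worker: a live connection (UP, or
 * flagged ERROR by a failed connect or a drop) is moved to DISCONNECTING
 * under c_cm_lock, the transport's conn_shutdown() and rds_conn_reset()
 * run under c_send_lock, the connection is parked in DOWN, and finally
 * rds_queue_reconnect() may schedule a fresh connect attempt if the conn
 * is still hashed.
 */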
+
void rds_send_worker(struct work_struct *work)
{
struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work);
}
}
-void rds_shutdown_worker(struct work_struct *work)
-{
- struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
-
- rds_conn_shutdown(conn);
-}
-
void rds_threads_exit(void)
{
destroy_workqueue(rds_wq);
}
-int rds_threads_init(void)
+int __init rds_threads_init(void)
{
rds_wq = create_singlethread_workqueue("krdsd");
- if (!rds_wq)
+ if (rds_wq == NULL)
return -ENOMEM;
return 0;
#include "rds.h"
#include "loop.h"
-static struct rds_transport *transports[RDS_TRANS_COUNT];
+static LIST_HEAD(rds_transports);
static DECLARE_RWSEM(rds_trans_sem);
int rds_trans_register(struct rds_transport *trans)
down_write(&rds_trans_sem);
- if (transports[trans->t_type])
- printk(KERN_ERR "RDS Transport type %d already registered\n",
- trans->t_type);
- else {
- transports[trans->t_type] = trans;
- printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name);
- }
+ list_add_tail(&trans->t_item, &rds_transports);
+ printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name);
up_write(&rds_trans_sem);
return 0;
}
-EXPORT_SYMBOL_GPL(rds_trans_register);
void rds_trans_unregister(struct rds_transport *trans)
{
down_write(&rds_trans_sem);
- transports[trans->t_type] = NULL;
+ list_del_init(&trans->t_item);
printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name);
up_write(&rds_trans_sem);
}
-EXPORT_SYMBOL_GPL(rds_trans_unregister);
-
-void rds_trans_put(struct rds_transport *trans)
-{
- if (trans && trans->t_owner)
- module_put(trans->t_owner);
-}
struct rds_transport *rds_trans_get_preferred(__be32 addr)
{
- struct rds_transport *ret = NULL;
struct rds_transport *trans;
- unsigned int i;
+ struct rds_transport *ret = NULL;
if (IN_LOOPBACK(ntohl(addr)))
return &rds_loop_transport;
down_read(&rds_trans_sem);
- for (i = 0; i < RDS_TRANS_COUNT; i++) {
- trans = transports[i];
-
- if (trans && (trans->laddr_check(addr) == 0) &&
- (!trans->t_owner || try_module_get(trans->t_owner))) {
+ list_for_each_entry(trans, &rds_transports, t_item) {
+ if (trans->laddr_check(addr) == 0) {
ret = trans;
break;
}
struct rds_transport *trans;
unsigned int total = 0;
unsigned int part;
- int i;
rds_info_iter_unmap(iter);
down_read(&rds_trans_sem);
- for (i = 0; i < RDS_TRANS_COUNT; i++)
- {
- trans = transports[i];
- if (!trans || !trans->stats_info_copy)
+ list_for_each_entry(trans, &rds_transports, t_item) {
+ if (trans->stats_info_copy == NULL)
continue;
part = trans->stats_info_copy(iter, avail);