www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
rds: Enable RDS IPv6 support
author Ka-Cheong Poon <ka-cheong.poon@oracle.com>
Mon, 23 Oct 2017 13:21:49 +0000 (06:21 -0700)
committer Chuck Anderson <chuck.anderson@oracle.com>
Mon, 11 Dec 2017 04:20:16 +0000 (20:20 -0800)
This patch enables RDS to use IPv6 addresses.  Many data
structures (RDS socket options) used by RDS apps use a 32-bit
integer to store an IP address. To support IPv6, struct in6_addr needs to
be used instead. To ensure backward compatibility, a new data structure is
introduced for each of those data structures that uses a 32-bit
integer to represent an IP address, and new socket options are
introduced to use those new structures. This means that existing apps
should work without a problem with the new RDS module.  Apps that
want to use IPv6 can use the new data structures and socket options.
An IPv4-mapped address is used to represent an IPv4 address in the new
data structures.

RDS/RDMA/IB uses a private data (struct rds_ib_connect_private)
exchange between endpoints at RDS connection establishment time to
support RDMA. This private data exchange uses a 32-bit integer to
represent an IP address. This needs to be changed in order to support
IPv6. A new private data struct rds6_ib_connect_private is introduced
to handle this. To ensure backward compatibility, an IPv6-capable RDS
stack uses another RDMA listener port (RDS_CM_PORT, which is 16385,
the same value as the RDS/TCP listener port number) to accept IPv6
connections, and it continues to use the original RDS_PORT for IPv4
RDS connections. When it needs to communicate with an IPv6 peer, it
uses RDS_CM_PORT to send the connection setup request.

Orabug: 25410192

Signed-off-by: Ka-Cheong Poon <ka-cheong.poon@oracle.com>
Reviewed-by: Håkon Bugge <haakon.bugge@oracle.com>
19 files changed:
include/uapi/linux/rds.h
net/rds/af_rds.c
net/rds/bind.c
net/rds/connection.c
net/rds/ib.c
net/rds/ib.h
net/rds/ib_cm.c
net/rds/ib_rdma.c
net/rds/ib_send.c
net/rds/rdma_transport.c
net/rds/rdma_transport.h
net/rds/rds.h
net/rds/recv.c
net/rds/send.c
net/rds/tcp.c
net/rds/tcp.h
net/rds/tcp_connect.c
net/rds/tcp_listen.c
net/rds/threads.c

index ac631250c04b8889004be15d623d1ef9a067b8d4..651dcc6ae6c92a012041035e7cf3085675734289 100644 (file)
 #define RDS_GET_MR_FOR_DEST            7
 #define RDS_CONN_RESET                  8
 #define SO_RDS_TRANSPORT               9
+/* Socket option to tap receive path latency
+ *     SO_RDS: SO_RDS_MSG_RXPATH_LATENCY
+ *     Format used struct rds_rx_trace_so
+ */
+#define SO_RDS_MSG_RXPATH_LATENCY      10
+#define RDS6_CONN_RESET                        11
 
 /* supported values for SO_RDS_TRANSPORT */
 #define        RDS_TRANS_IB    0
 #define        RDS_TRANS_COUNT 3
 #define        RDS_TRANS_NONE  (~0)
 
-/* Socket option to tap receive path latency
- *     SO_RDS: SO_RDS_MSG_RXPATH_LATENCY
- *     Format used struct rds_rx_trace_so
- */
-#define SO_RDS_MSG_RXPATH_LATENCY      10
-
 /*
  * ioctl commands for SOL_RDS
 */
 #define SIOCRDSSETTOS                   (SIOCPROTOPRIVATE)
-#define SIOCRDSGETTOS                  (SIOCPROTOPRIVATE + 1)
+#define SIOCRDSGETTOS                   (SIOCPROTOPRIVATE + 1)
 #define SIOCRDSENABLENETFILTER          (SIOCPROTOPRIVATE + 2)
 
 #define IPPROTO_OKA (142)
@@ -142,9 +142,9 @@ struct rds_cmsg_rx_trace {
 #define RDS_CMSG_CONG_UPDATE           5
 #define RDS_CMSG_ATOMIC_FADD           6
 #define RDS_CMSG_ATOMIC_CSWP           7
-#define RDS_CMSG_MASKED_ATOMIC_FADD     8
-#define RDS_CMSG_MASKED_ATOMIC_CSWP     9
-#define RDS_CMSG_ASYNC_SEND             10
+#define RDS_CMSG_MASKED_ATOMIC_FADD    8
+#define RDS_CMSG_MASKED_ATOMIC_CSWP    9
+#define RDS_CMSG_ASYNC_SEND            10
 #define RDS_CMSG_RXPATH_LATENCY                11
 
 #define RDS_INFO_FIRST                 10000
@@ -159,7 +159,17 @@ struct rds_cmsg_rx_trace {
 #define RDS_INFO_IB_CONNECTIONS                10008
 #define RDS_INFO_CONNECTION_STATS      10009
 #define RDS_INFO_IWARP_CONNECTIONS     10010
-#define RDS_INFO_LAST                  10010
+
+/* PF_RDS6 options */
+#define RDS6_INFO_CONNECTIONS          10011
+#define RDS6_INFO_SEND_MESSAGES                10012
+#define RDS6_INFO_RETRANS_MESSAGES     10013
+#define RDS6_INFO_RECV_MESSAGES                10014
+#define RDS6_INFO_SOCKETS              10015
+#define RDS6_INFO_TCP_SOCKETS          10016
+#define RDS6_INFO_IB_CONNECTIONS       10017
+
+#define RDS_INFO_LAST                  10017
 
 struct rds_info_counter {
        u_int8_t        name[32];
@@ -169,7 +179,7 @@ struct rds_info_counter {
 #define RDS_INFO_CONNECTION_FLAG_SENDING       0x01
 #define RDS_INFO_CONNECTION_FLAG_CONNECTING    0x02
 #define RDS_INFO_CONNECTION_FLAG_CONNECTED     0x04
-#define RDS_INFO_CONNECTION_FLAG_ERROR          0x08
+#define RDS_INFO_CONNECTION_FLAG_ERROR         0x08
 
 #define TRANSNAMSIZ    16
 
@@ -183,12 +193,14 @@ struct rds_info_connection {
        u_int8_t        tos;
 } __attribute__((packed));
 
-struct rds_info_flow {
-       __be32          laddr;
-       __be32          faddr;
-       u_int32_t       bytes;
-       __be16          lport;
-       __be16          fport;
+struct rds6_info_connection {
+       uint64_t        next_tx_seq;
+       uint64_t        next_rx_seq;
+       struct in6_addr laddr;
+       struct in6_addr faddr;
+       uint8_t         transport[TRANSNAMSIZ];         /* null term ascii */
+       uint8_t         flags;
+       uint8_t         tos;
 } __attribute__((packed));
 
 #define RDS_INFO_MESSAGE_FLAG_ACK               0x01
@@ -205,6 +217,17 @@ struct rds_info_message {
        u_int8_t        tos;
 } __attribute__((packed));
 
+/* IPv6-capable counterpart of struct rds_info_message: one record per
+ * message, with the local and foreign addresses held as struct in6_addr
+ * (an IPv4 address is expected in IPv4-mapped form).  The struct is packed,
+ * so there is no compiler-inserted padding between fields.
+ */
+struct rds6_info_message {
+       uint64_t        seq;
+       uint32_t        len;
+       struct in6_addr laddr;
+       struct in6_addr faddr;
+       __be16          lport;
+       __be16          fport;
+       uint8_t         flags;
+       uint8_t         tos;
+} __attribute__((packed));
+
 struct rds_info_socket {
        u_int32_t       sndbuf;
        __be32          bound_addr;
@@ -215,6 +238,16 @@ struct rds_info_socket {
        u_int64_t       inum;
 } __attribute__((packed));
 
+/* IPv6-capable counterpart of struct rds_info_socket: one record per RDS
+ * socket.  The bound and connected addresses are struct in6_addr (an IPv4
+ * address is expected in IPv4-mapped form); the ports are big-endian.
+ * Packed, so there is no compiler-inserted padding between fields.
+ */
+struct rds6_info_socket {
+       uint32_t        sndbuf;
+       struct in6_addr bound_addr;
+       struct in6_addr connected_addr;
+       __be16          bound_port;
+       __be16          connected_port;
+       uint32_t        rcvbuf;
+       uint64_t        inum;
+} __attribute__((packed));
+
 struct rds_info_tcp_socket {
        __be32          local_addr;
        __be16          local_port;
@@ -227,6 +260,18 @@ struct rds_info_tcp_socket {
        u_int32_t       last_seen_una;
 } __attribute__((packed));
 
+/* IPv6-capable counterpart of struct rds_info_tcp_socket.  The local and
+ * peer addresses are struct in6_addr (an IPv4 address is expected in
+ * IPv4-mapped form); the ports are big-endian.  Packed, so there is no
+ * compiler-inserted padding between fields.
+ */
+struct rds6_info_tcp_socket {
+       struct in6_addr local_addr;
+       __be16          local_port;
+       struct in6_addr peer_addr;
+       __be16          peer_port;
+       uint64_t        hdr_rem;
+       uint64_t        data_rem;
+       uint32_t        last_sent_nxt;
+       uint32_t        last_expected_una;
+       uint32_t        last_seen_una;
+} __attribute__((packed));
+
 #define RDS_IB_GID_LEN 16
 struct rds_info_rdma_connection {
        __be32          src_addr;
@@ -251,6 +296,28 @@ struct rds_info_rdma_connection {
 
 };
 
+/* IPv6-capable counterpart of struct rds_info_rdma_connection.  src_addr
+ * and dst_addr are struct in6_addr (an IPv4 address is expected in
+ * IPv4-mapped form); the GIDs are RDS_IB_GID_LEN raw bytes each.
+ *
+ * NOTE(review): unlike the other rds6_info_* records this struct carries
+ * no packed attribute, so the compiler may insert padding (for example
+ * after tos/sl).  Confirm that consumers expect the unpacked layout.
+ */
+struct rds6_info_rdma_connection {
+       struct in6_addr src_addr;
+       struct in6_addr dst_addr;
+       uint8_t         src_gid[RDS_IB_GID_LEN];
+       uint8_t         dst_gid[RDS_IB_GID_LEN];
+
+       uint32_t        max_send_wr;
+       uint32_t        max_recv_wr;
+       uint32_t        max_send_sge;
+       uint32_t        rdma_mr_max;
+       uint32_t        rdma_mr_size;
+       uint8_t         tos;
+       uint8_t         sl;
+       uint32_t        cache_allocs;
+       uint32_t        frag;
+       uint16_t        flow_ctl_post_credit;
+       uint16_t        flow_ctl_send_credit;
+       uint32_t        qp_num;
+       uint32_t        w_alloc_ctr;
+       uint32_t        w_free_ctr;
+};
+
 /*
  * Congestion monitoring.
  * Congestion control in RDS happens at the host connection
@@ -339,6 +406,12 @@ struct rds_reset {
        struct in_addr  dst;
 };
 
+/* Argument of the RDS6_CONN_RESET socket option: the IPv6 counterpart of
+ * struct rds_reset.  Names the connection to reset by TOS, source address
+ * and destination address.
+ */
+struct rds6_reset {
+       uint8_t tos;
+       struct in6_addr src;
+       struct in6_addr dst;
+};
+
 struct rds_asend_args {
        u_int64_t       user_token;
        u_int64_t       flags;
@@ -349,10 +422,10 @@ struct rds_rdma_send_notify {
        int32_t         status;
 };
 
-#define RDS_RDMA_SEND_SUCCESS  0
-#define RDS_RDMA_REMOTE_ERROR  1
-#define RDS_RDMA_SEND_CANCELED 2
-#define RDS_RDMA_SEND_DROPPED  3
+#define RDS_RDMA_SEND_SUCCESS          0
+#define RDS_RDMA_REMOTE_ERROR          1
+#define RDS_RDMA_SEND_CANCELED         2
+#define RDS_RDMA_SEND_DROPPED          3
 #define RDS_RDMA_SEND_OTHER_ERROR      4
 
 /*
index e84b71dc9dc446860610ee860f40e230669c4e4b..c7b30e619bab0d5e2643a40ba60dc1c980b56abc 100644 (file)
@@ -42,7 +42,7 @@
 #include <net/sock.h>
 
 #include "rds.h"
-#include "tcp.h"
+
 /* UNUSED for backwards compat only */
 static unsigned int rds_ib_retry_count = 0xdead;
 module_param(rds_ib_retry_count, int, 0444);
@@ -427,6 +427,51 @@ done:
        return 0;
 }
 
+/* Handle the RDS6_CONN_RESET socket option: drop the connection(s) named
+ * by the struct rds6_reset that userspace passes in @optval.
+ *
+ * Returns -EINVAL if @optlen is not exactly sizeof(struct rds6_reset),
+ * -EFAULT if the option cannot be copied in, and 0 otherwise -- including
+ * when no matching connection is found.
+ */
+static int rds6_user_reset(struct rds_sock *rs, char __user *optval, int optlen)
+{
+       struct rds6_reset reset;
+       struct rds_connection *conn;
+       LIST_HEAD(s_addr_conns);
+
+       if (optlen != sizeof(struct rds6_reset))
+               return -EINVAL;
+
+       if (copy_from_user(&reset, (struct rds6_reset __user *)optval,
+                          sizeof(struct rds6_reset)))
+               return -EFAULT;
+
+       /* Reset all conns associated with source addr */
+       if (ipv6_addr_any(&reset.dst)) {
+               pr_info("RDS: Reset ALL conns for Source %pI6c\n",
+                       &reset.src);
+
+               rds_conn_laddr_list(sock_net(rds_rs_to_sk(rs)),
+                                   &reset.src, &s_addr_conns);
+               if (list_empty(&s_addr_conns))
+                       goto done;
+
+               /* NOTE(review): the drop reason is the literal 1 here but
+                * DR_USER_RESET in the single-connection path below --
+                * presumably the same value; confirm.
+                */
+               list_for_each_entry(conn, &s_addr_conns, c_laddr_node)
+                       if (conn)
+                               rds_user_conn_paths_drop(conn, 1);
+               goto done;
+       }
+
+       /* A specific destination was given: look up that one connection
+        * using the socket's transport and bound scope id.
+        */
+       conn = rds_conn_find(sock_net(rds_rs_to_sk(rs)),
+                            &reset.src, &reset.dst, rs->rs_transport,
+                            reset.tos, rs->rs_bound_scope_id);
+
+       if (conn) {
+               bool is_tcp = conn->c_trans->t_type == RDS_TRANS_TCP;
+
+               printk(KERN_NOTICE "Resetting RDS/%s connection <%pI6c,%pI6c,%d>\n",
+                      is_tcp ? "tcp" : "IB",
+                      &reset.src, &reset.dst, conn->c_tos);
+               rds_user_conn_paths_drop(conn, DR_USER_RESET);
+       }
+done:
+       return 0;
+}
+
 static int rds_set_transport(struct rds_sock *rs, char __user *optval,
                             int optlen)
 {
@@ -533,6 +578,13 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
                }
                ret = rds_user_reset(rs, optval, optlen);
                break;
+       case RDS6_CONN_RESET:
+               if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) {
+                       ret =  -EACCES;
+                       break;
+               }
+               ret = rds6_user_reset(rs, optval, optlen);
+               break;
        case SO_RDS_TRANSPORT:
                lock_sock(sock->sk);
                ret = rds_set_transport(rs, optval, optlen);
@@ -611,7 +663,9 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
 {
        struct sock *sk = sock->sk;
        struct sockaddr_in *sin;
+       struct sockaddr_in6 *sin6;
        struct rds_sock *rs = rds_sk_to_rs(sk);
+       int addr_type;
        int ret = 0;
 
        lock_sock(sk);
@@ -637,7 +691,23 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
                break;
 
        case sizeof(struct sockaddr_in6):
-               ret = -EPROTONOSUPPORT;
+               sin6 = (struct sockaddr_in6 *)uaddr;
+               if (sin6->sin6_family != AF_INET6) {
+                       ret = -EAFNOSUPPORT;
+                       break;
+               }
+               addr_type = ipv6_addr_type(&sin6->sin6_addr);
+               if (!(addr_type & IPV6_ADDR_UNICAST)) {
+                       ret = -EPROTOTYPE;
+                       break;
+               }
+               if (addr_type & IPV6_ADDR_LINKLOCAL &&
+                   sin6->sin6_scope_id == 0) {
+                       ret = -EINVAL;
+                       break;
+               }
+               rs->rs_conn_addr = sin6->sin6_addr;
+               rs->rs_conn_port = sin6->sin6_port;
                break;
 
        default:
@@ -822,6 +892,38 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len,
        lens->each = sizeof(struct rds_info_message);
 }
 
+/* Info handler registered for RDS6_INFO_RECV_MESSAGES.  Walks the receive
+ * queue of every socket on rds_sock_list and copies one
+ * struct rds6_info_message per queued incoming message into @iter.
+ *
+ * @len is the caller's buffer size in bytes; it is converted to a record
+ * count.  Messages beyond that count are still counted but not copied, so
+ * lens->nr always reports the total number of queued messages.
+ */
+static void rds6_sock_inc_info(struct socket *sock, unsigned int len,
+                              struct rds_info_iterator *iter,
+                              struct rds_info_lengths *lens)
+{
+       struct rds_sock *rs;
+       struct rds_incoming *inc;
+       unsigned int total = 0;
+
+       len /= sizeof(struct rds6_info_message);
+
+       spin_lock_bh(&rds_sock_lock);
+
+       list_for_each_entry(rs, &rds_sock_list, rs_item) {
+               read_lock(&rs->rs_recv_lock);
+
+               /* XXX too lazy to maintain counts.. */
+               list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
+                       total++;
+                       /* NOTE(review): the trailing 1 is presumably the
+                        * "flip" argument of rds6_inc_info_copy(); confirm.
+                        */
+                       if (total <= len)
+                               rds6_inc_info_copy(inc, iter, &inc->i_saddr,
+                                                  &rs->rs_bound_addr, 1);
+               }
+
+               read_unlock(&rs->rs_recv_lock);
+       }
+
+       spin_unlock_bh(&rds_sock_lock);
+
+       lens->nr = total;
+       lens->each = sizeof(struct rds6_info_message);
+}
+
 static void rds_sock_info(struct socket *sock, unsigned int len,
                          struct rds_info_iterator *iter,
                          struct rds_info_lengths *lens)
@@ -855,6 +957,39 @@ out:
        spin_unlock_bh(&rds_sock_lock);
 }
 
+/* Info handler registered for RDS6_INFO_SOCKETS.  Copies one
+ * struct rds6_info_socket per socket on rds_sock_list into @iter.
+ *
+ * The copy is all-or-nothing: if the caller's buffer (@len bytes) cannot
+ * hold rds_sock_count records, nothing is copied.  In either case lens
+ * reports the record count and record size.
+ */
+static void rds6_sock_info(struct socket *sock, unsigned int len,
+                          struct rds_info_iterator *iter,
+                          struct rds_info_lengths *lens)
+{
+       struct rds6_info_socket sinfo6;
+       struct rds_sock *rs;
+
+       len /= sizeof(struct rds6_info_socket);
+
+       spin_lock_bh(&rds_sock_lock);
+
+       if (len < rds_sock_count)
+               goto out;
+
+       list_for_each_entry(rs, &rds_sock_list, rs_item) {
+               /* Every field of the packed record is assigned below. */
+               sinfo6.sndbuf = rds_sk_sndbuf(rs);
+               sinfo6.rcvbuf = rds_sk_rcvbuf(rs);
+               sinfo6.bound_addr = rs->rs_bound_addr;
+               sinfo6.connected_addr = rs->rs_conn_addr;
+               sinfo6.bound_port = rs->rs_bound_port;
+               sinfo6.connected_port = rs->rs_conn_port;
+               sinfo6.inum = sock_i_ino(rds_rs_to_sk(rs));
+
+               rds_info_copy(iter, &sinfo6, sizeof(sinfo6));
+       }
+
+out:
+       lens->nr = rds_sock_count;
+       lens->each = sizeof(struct rds6_info_socket);
+
+       spin_unlock_bh(&rds_sock_lock);
+}
+
 static unsigned long parse_ul(char *ptr, unsigned long max)
 {
        unsigned long val;
@@ -966,6 +1101,8 @@ static void __exit rds_exit(void)
        rds_page_exit();
        rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info);
        rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
+       rds_info_deregister_func(RDS6_INFO_SOCKETS, rds6_sock_info);
+       rds_info_deregister_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info);
 }
 
 module_exit(rds_exit);
@@ -1001,6 +1138,8 @@ static int __init rds_init(void)
 
        rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
        rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
+       rds_info_register_func(RDS6_INFO_SOCKETS, rds6_sock_info);
+       rds_info_register_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info);
 
        rds_qos_threshold_init();
 
index 9676d565433b94bf2766c016a02a64cc783b759e..31d56d469276ca5f4f148219a12db7176a49be8c 100644 (file)
@@ -189,9 +189,13 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
        struct in6_addr v6addr, *binding_addr;
        struct rds_transport *trans;
        __u32 scope_id = 0;
+       int addr_type;
        int ret = 0;
        __be16 port;
 
+       /* We allow an RDS socket to be bound to either IPv4 or IPv6
+        * address.
+        */
        if (addr_len == sizeof(struct sockaddr_in)) {
                struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
 
@@ -202,7 +206,21 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
                binding_addr = &v6addr;
                port = sin->sin_port;
        } else if (addr_len == sizeof(struct sockaddr_in6)) {
-               return -EPROTONOSUPPORT;
+               struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)uaddr;
+
+               addr_type = ipv6_addr_type(&sin6->sin6_addr);
+               if (sin6->sin6_family != AF_INET6 ||
+                   !(addr_type & IPV6_ADDR_UNICAST)) {
+                       return -EINVAL;
+               }
+               /* The scope ID must be specified for link local address. */
+               if (addr_type & IPV6_ADDR_LINKLOCAL) {
+                       if (sin6->sin6_scope_id == 0)
+                               return -EINVAL;
+                       scope_id = sin6->sin6_scope_id;
+               }
+               binding_addr = &sin6->sin6_addr;
+               port = sin6->sin6_port;
        } else {
                return -EINVAL;
        }
index 6121a186e46d94b5607e8b620c830a8cf5357e30..c13c7976353bf080c269be2d14bdf044188b3df1 100644 (file)
@@ -37,7 +37,6 @@
 
 #include "rds.h"
 #include "loop.h"
-#include "tcp.h"
 
 #define RDS_CONNECTION_HASH_BITS 12
 #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
@@ -556,10 +555,21 @@ void rds_conn_destroy(struct rds_connection *conn, int shutdown)
 }
 EXPORT_SYMBOL_GPL(rds_conn_destroy);
 
-static void rds_conn_message_info(struct socket *sock, unsigned int len,
-                                 struct rds_info_iterator *iter,
-                                 struct rds_info_lengths *lens,
-                                 int want_send)
+/* Copy the info record for @inc into @iter in either the IPv6 or the
+ * legacy IPv4 format, selected by @isv6.
+ *
+ * @saddr and @daddr point to struct in6_addr (the caller passes
+ * &conn->c_laddr and &conn->c_faddr).  For the IPv4 format the address is
+ * the last 32-bit word of the IPv4-mapped address, s6_addr32[3], which is
+ * what the code this helper replaced passed.  Dereferencing the pointer
+ * as a __be32 would read s6_addr32[0], which is zero for a mapped address.
+ */
+static void __rds_inc_msg_cp(struct rds_incoming *inc,
+                            struct rds_info_iterator *iter,
+                            void *saddr, void *daddr, int flip, bool isv6)
+{
+       if (isv6) {
+               rds6_inc_info_copy(inc, iter, saddr, daddr, flip);
+       } else {
+               const struct in6_addr *src = saddr;
+               const struct in6_addr *dst = daddr;
+
+               rds_inc_info_copy(inc, iter, src->s6_addr32[3],
+                                 dst->s6_addr32[3], flip);
+       }
+}
+
+static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
+                                     struct rds_info_iterator *iter,
+                                     struct rds_info_lengths *lens,
+                                     int want_send, bool isv6)
 {
        struct hlist_head *head;
        struct list_head *list;
@@ -570,7 +580,10 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
        size_t i;
        int j;
 
-       len /= sizeof(struct rds_info_message);
+       if (isv6)
+               len /= sizeof(struct rds6_info_message);
+       else
+               len /= sizeof(struct rds_info_message);
 
        rcu_read_lock();
 
@@ -595,18 +608,13 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
 
                                /* XXX too lazy to maintain counts.. */
                                list_for_each_entry(rm, list, m_conn_item) {
-                                       __be32 laddr;
-                                       __be32 faddr;
-
                                        total++;
-                                       laddr = conn->c_laddr.s6_addr32[3];
-                                       faddr = conn->c_faddr.s6_addr32[3];
                                        if (total <= len)
-                                               rds_inc_info_copy(&rm->m_inc,
-                                                                 iter,
-                                                                 laddr,
-                                                                 faddr,
-                                                                 0);
+                                               __rds_inc_msg_cp(&rm->m_inc,
+                                                                iter,
+                                                                &conn->c_laddr,
+                                                                &conn->c_faddr,
+                                                                0, isv6);
                                }
 
                                cp->cp_rdsinfo_pending = 0;
@@ -617,7 +625,26 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
        rcu_read_unlock();
 
        lens->nr = total;
-       lens->each = sizeof(struct rds_info_message);
+       if (isv6)
+               lens->each = sizeof(struct rds6_info_message);
+       else
+               lens->each = sizeof(struct rds_info_message);
+}
+
+/* IPv4-format wrapper: forwards to rds_conn_message_info_cmn() with
+ * isv6 = false, so records are struct rds_info_message.
+ */
+static void rds_conn_message_info(struct socket *sock, unsigned int len,
+                                 struct rds_info_iterator *iter,
+                                 struct rds_info_lengths *lens,
+                                 int want_send)
+{
+       rds_conn_message_info_cmn(sock, len, iter, lens, want_send, false);
+}
+
+/* IPv6-format wrapper: forwards to rds_conn_message_info_cmn() with
+ * isv6 = true, so records are struct rds6_info_message.
+ */
+static void rds6_conn_message_info(struct socket *sock, unsigned int len,
+                                  struct rds_info_iterator *iter,
+                                  struct rds_info_lengths *lens,
+                                  int want_send)
+{
+       rds_conn_message_info_cmn(sock, len, iter, lens, want_send, true);
 }
 
 static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
@@ -627,6 +654,13 @@ static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
        rds_conn_message_info(sock, len, iter, lens, 1);
 }
 
+/* Info handler registered for RDS6_INFO_SEND_MESSAGES: calls
+ * rds6_conn_message_info() with want_send = 1.
+ */
+static void rds6_conn_message_info_send(struct socket *sock, unsigned int len,
+                                       struct rds_info_iterator *iter,
+                                       struct rds_info_lengths *lens)
+{
+       rds6_conn_message_info(sock, len, iter, lens, 1);
+}
+
 static void rds_conn_message_info_retrans(struct socket *sock,
                                          unsigned int len,
                                          struct rds_info_iterator *iter,
@@ -635,6 +669,14 @@ static void rds_conn_message_info_retrans(struct socket *sock,
        rds_conn_message_info(sock, len, iter, lens, 0);
 }
 
+/* Info handler registered for RDS6_INFO_RETRANS_MESSAGES: calls
+ * rds6_conn_message_info() with want_send = 0.
+ */
+static void rds6_conn_message_info_retrans(struct socket *sock,
+                                          unsigned int len,
+                                          struct rds_info_iterator *iter,
+                                          struct rds_info_lengths *lens)
+{
+       rds6_conn_message_info(sock, len, iter, lens, 0);
+}
+
 void rds_for_each_conn_info(struct socket *sock, unsigned int len,
                          struct rds_info_iterator *iter,
                          struct rds_info_lengths *lens,
@@ -727,6 +769,9 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
        struct rds_info_connection *cinfo = buffer;
        struct rds_connection *conn = cp->cp_conn;
 
+       if (conn->c_isv6)
+               return 0;
+
        cinfo->next_tx_seq = cp->cp_next_tx_seq;
        cinfo->next_rx_seq = cp->cp_next_rx_seq;
        cinfo->laddr = conn->c_laddr.s6_addr32[3];
@@ -750,6 +795,37 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
        return 1;
 }
 
+/* Fill @buffer (a struct rds6_info_connection) from connection path @cp.
+ *
+ * Unlike rds_conn_info_visitor(), there is no c_isv6 check here, so every
+ * connection is reported with its full struct in6_addr addresses.
+ * Always returns 1.
+ */
+static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
+{
+       struct rds6_info_connection *cinfo6 = buffer;
+       struct rds_connection *conn = cp->cp_conn;
+
+       cinfo6->next_tx_seq = cp->cp_next_tx_seq;
+       cinfo6->next_rx_seq = cp->cp_next_rx_seq;
+       cinfo6->laddr = conn->c_laddr;
+       cinfo6->faddr = conn->c_faddr;
+       cinfo6->tos = conn->c_tos;
+       /* NOTE(review): strncpy() leaves the destination unterminated when
+        * the source is at least sizeof(cinfo6->transport) bytes long; this
+        * presumably relies on t_name being a shorter, terminated string --
+        * confirm.
+        */
+       strncpy(cinfo6->transport, conn->c_trans->t_name,
+               sizeof(cinfo6->transport));
+       cinfo6->flags = 0;
+
+       rds_conn_info_set(cinfo6->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags),
+                         SENDING);
+       /* XXX Future: return the state rather than these funky bits */
+       rds_conn_info_set(cinfo6->flags,
+                         atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING,
+                         CONNECTING);
+       rds_conn_info_set(cinfo6->flags,
+                         atomic_read(&cp->cp_state) == RDS_CONN_UP,
+                         CONNECTED);
+       rds_conn_info_set(cinfo6->flags, cp->cp_pending_flush,
+                         ERROR);
+       /* Just return 1 as there is no error case. This is a helper function
+        * for rds_walk_conn_path_info() and it wants a return value.
+        */
+       return 1;
+}
+
 static void rds_conn_info(struct socket *sock, unsigned int len,
                          struct rds_info_iterator *iter,
                          struct rds_info_lengths *lens)
@@ -759,6 +835,15 @@ static void rds_conn_info(struct socket *sock, unsigned int len,
                                sizeof(struct rds_info_connection));
 }
 
+/* Info handler registered for RDS6_INFO_CONNECTIONS: walks the connection
+ * paths with rds6_conn_info_visitor(), using records of
+ * sizeof(struct rds6_info_connection) bytes.
+ */
+static void rds6_conn_info(struct socket *sock, unsigned int len,
+                          struct rds_info_iterator *iter,
+                          struct rds_info_lengths *lens)
+{
+       rds_walk_conn_path_info(sock, len, iter, lens,
+                               rds6_conn_info_visitor,
+                               sizeof(struct rds6_info_connection));
+}
+
 int rds_conn_init(void)
 {
        rds_conn_slab = kmem_cache_create("rds_connection",
@@ -772,6 +857,11 @@ int rds_conn_init(void)
                               rds_conn_message_info_send);
        rds_info_register_func(RDS_INFO_RETRANS_MESSAGES,
                               rds_conn_message_info_retrans);
+       rds_info_register_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
+       rds_info_register_func(RDS6_INFO_SEND_MESSAGES,
+                              rds6_conn_message_info_send);
+       rds_info_register_func(RDS6_INFO_RETRANS_MESSAGES,
+                              rds6_conn_message_info_retrans);
 
        return 0;
 }
@@ -789,7 +879,11 @@ void rds_conn_exit(void)
                                 rds_conn_message_info_send);
        rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES,
                                 rds_conn_message_info_retrans);
-
+       rds_info_deregister_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
+       rds_info_deregister_func(RDS6_INFO_SEND_MESSAGES,
+                                rds6_conn_message_info_send);
+       rds_info_deregister_func(RDS6_INFO_RETRANS_MESSAGES,
+                                rds6_conn_message_info_retrans);
 }
 
 static char *conn_drop_reasons[] = {
index 60637ed5f8f72763af4967f8fa953b76a19b89ca..dc26513791a20232426652830fa1c1bdd1f11609 100644 (file)
 #include <net/inet_common.h>
 #include <net/ipoib/if_ipoib.h>
 #include <linux/rtnetlink.h>
+#include <net/addrconf.h>
 
 #include "rds.h"
 #include "ib.h"
-#include "tcp.h"
 #include "rds_single_path.h"
 #include <linux/time.h>
 
@@ -109,6 +109,7 @@ LIST_HEAD(ib_nodev_conns);
 struct workqueue_struct *rds_aux_wq;
 
 struct socket  *rds_ib_inet_socket;
+struct socket  *rds_ib_inet6_socket;
 
 static struct rds_ib_port *ip_config;
 static u8      ip_port_cnt = 0;
@@ -355,6 +356,9 @@ struct ib_client rds_ib_client = {
        .remove = rds_ib_remove_one
 };
 
+/* Report IB connection information.  This function only reports IPv4
+ * connections for backward compatibility.
+ */
 static int rds_ib_conn_info_visitor(struct rds_connection *conn,
                                    void *buffer)
 {
@@ -396,7 +400,8 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
                iinfo->max_send_sge = rds_ibdev->max_sge;
                iinfo->qp_num = ic->i_cm_id->qp->qp_num;
                iinfo->w_alloc_ctr = ic->i_recv_ring.w_alloc_ctr;
-               iinfo->w_free_ctr  = (u32) atomic_read(&ic->i_recv_ring.w_free_ctr);
+               iinfo->w_free_ctr =
+                       (u32)atomic_read(&ic->i_recv_ring.w_free_ctr);
                iinfo->flow_ctl_post_credit =
                        IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
                iinfo->flow_ctl_send_credit =
@@ -407,6 +412,58 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
        return 1;
 }
 
+/* IPv6 version of rds_ib_conn_info_visitor().
+ *
+ * Fills @buffer (a struct rds6_info_rdma_connection) for @conn.  Returns 0
+ * to skip a connection that does not use the IB transport, and 1 once the
+ * record has been filled in.
+ */
+static int rds6_ib_conn_info_visitor(struct rds_connection *conn,
+                                    void *buffer)
+{
+       struct rds6_info_rdma_connection *iinfo6 = buffer;
+       struct rds_ib_connection *ic = conn->c_transport_data;
+
+       /* We will only ever look at IB transports */
+       if (conn->c_trans != &rds_ib_transport)
+               return 0;
+
+       /* Zero the whole record, not just the GIDs.  tos/sl/frag are only
+        * set when ic is non-NULL, the ring/device fields only when the
+        * connection is up, and the struct is not packed, so any field or
+        * padding left unset would otherwise keep whatever the buffer
+        * held before.
+        */
+       memset(iinfo6, 0, sizeof(*iinfo6));
+
+       iinfo6->src_addr = conn->c_laddr;
+       iinfo6->dst_addr = conn->c_faddr;
+
+       if (ic) {
+               iinfo6->tos = conn->c_tos;
+               iinfo6->sl = ic->i_sl;
+               iinfo6->frag = ic->i_frag_sz;
+       }
+
+       if (rds_conn_state(conn) == RDS_CONN_UP) {
+               struct rds_ib_device *rds_ibdev;
+               struct rdma_dev_addr *dev_addr;
+
+               /* NOTE(review): ic is dereferenced without a NULL check;
+                * presumably an UP connection always has transport data --
+                * confirm.
+                */
+               ic = conn->c_transport_data;
+               dev_addr = &ic->i_cm_id->route.addr.dev_addr;
+               rdma_addr_get_sgid(dev_addr,
+                                  (union ib_gid *)&iinfo6->src_gid);
+               rdma_addr_get_dgid(dev_addr,
+                                  (union ib_gid *)&iinfo6->dst_gid);
+
+               rds_ibdev = ic->rds_ibdev;
+               iinfo6->max_send_wr = ic->i_send_ring.w_nr;
+               iinfo6->max_recv_wr = ic->i_recv_ring.w_nr;
+               iinfo6->max_send_sge = rds_ibdev->max_sge;
+               iinfo6->qp_num = ic->i_cm_id->qp->qp_num;
+               iinfo6->w_alloc_ctr = ic->i_recv_ring.w_alloc_ctr;
+               iinfo6->w_free_ctr =
+                       (u32)atomic_read(&ic->i_recv_ring.w_free_ctr);
+               iinfo6->flow_ctl_post_credit =
+                       IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
+               iinfo6->flow_ctl_send_credit =
+                       IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits));
+               rds6_ib_get_mr_info(rds_ibdev, iinfo6);
+               iinfo6->cache_allocs = atomic_read(&ic->i_cache_allocs);
+       }
+       return 1;
+}
+
 static void rds_ib_ic_info(struct socket *sock, unsigned int len,
                           struct rds_info_iterator *iter,
                           struct rds_info_lengths *lens)
@@ -416,6 +473,15 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len,
                                sizeof(struct rds_info_rdma_connection));
 }
 
+/* IPv6 version of rds_ib_ic_info().  Iterates the connections with
+ * rds6_ib_conn_info_visitor(), using records of
+ * sizeof(struct rds6_info_rdma_connection) bytes.
+ */
+static void rds6_ib_ic_info(struct socket *sock, unsigned int len,
+                           struct rds_info_iterator *iter,
+                           struct rds_info_lengths *lens)
+{
+       rds_for_each_conn_info(sock, len, iter, lens,
+                              rds6_ib_conn_info_visitor,
+                              sizeof(struct rds6_info_rdma_connection));
+}
 
 /*
  * Early RDS/IB was built to only bind to an address if there is an IPoIB
@@ -432,10 +498,14 @@ static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr,
 {
        int ret;
        struct rdma_cm_id *cm_id;
+       struct sockaddr_in6 sin6;
        struct sockaddr_in sin;
+       struct sockaddr *sa;
+       bool isv4;
 
+       isv4 = ipv6_addr_v4mapped(addr);
        /* Link-local addresses don't play well with IB */
-       if (ipv4_is_linklocal_169(addr->s6_addr32[3])) {
+       if (isv4 && ipv4_is_linklocal_169(addr->s6_addr32[3])) {
                pr_info_once("\n");
                pr_info_once("****************************************************\n");
                pr_info_once("** WARNING WARNING WARNING WARNING WARNING        **\n");
@@ -458,19 +528,52 @@ static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr,
        if (IS_ERR(cm_id))
                return -EADDRNOTAVAIL;
 
-       memset(&sin, 0, sizeof(sin));
-       sin.sin_family = AF_INET;
-       sin.sin_addr.s_addr = addr->s6_addr32[3];
+       if (isv4) {
+               memset(&sin, 0, sizeof(sin));
+               sin.sin_family = AF_INET;
+               sin.sin_addr.s_addr = addr->s6_addr32[3];
+               sa = (struct sockaddr *)&sin;
+       } else {
+               memset(&sin6, 0, sizeof(sin6));
+               sin6.sin6_family = AF_INET6;
+               sin6.sin6_addr = *addr;
+               sin6.sin6_scope_id = scope_id;
+               sa = (struct sockaddr *)&sin6;
+
+               /* XXX Do a special IPv6 link local address check here.  The
+                * reason is that rdma_bind_addr() always succeeds with IPv6
+                * link local address regardless if it is configured or not in
+                * a system.
+                */
+               if (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL) {
+                       struct net_device *dev = NULL;
+                       bool found = false;
+
+                       /* Use init_net for now as RDS is not network
+                        * name space aware.  A zero scope_id cannot
+                        * name a device, so no lookup is attempted.
+                        */
+                       if (scope_id != 0)
+                               dev = dev_get_by_index(&init_net, scope_id);
+                       if (dev) {
+                               found = ipv6_chk_addr(&init_net, addr, dev, 1);
+                               dev_put(dev);
+                       }
+                       if (!found) {
+                               /* cm_id was created above and is only
+                                * destroyed at the end of this function,
+                                * so release it before bailing out.
+                                */
+                               rdma_destroy_id(cm_id);
+                               return -EADDRNOTAVAIL;
+                       }
+               }
+       }
 
        /* rdma_bind_addr will only succeed for IB & iWARP devices */
-       ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+       ret = rdma_bind_addr(cm_id, sa);
        /* due to this, we will claim to support iWARP devices unless we
           check node_type. */
        if (ret || !cm_id->device || cm_id->device->node_type != RDMA_NODE_IB_CA)
                ret = -EADDRNOTAVAIL;
 
-       rdsdebug("addr %pI6c ret %d node type %d\n",
-                addr, ret,
+       rdsdebug("addr %pI6c%%%u ret %d node type %d\n",
+                addr, scope_id, ret,
                 cm_id->device ? cm_id->device->node_type : -1);
 
        rdma_destroy_id(cm_id);
@@ -2715,8 +2818,16 @@ int rds_ib_init(void)
                printk(KERN_ERR "RDS/IB: can't create TCP transport socket (%d).\n", -ret);
                goto out;
        }
+       ret = sock_create_kern(&init_net, PF_INET6, SOCK_DGRAM, 0,
+                              &rds_ib_inet6_socket);
+       if (ret < 0) {
+               printk(KERN_ERR "RDS/IB: can't create IPv6 configuration socket (%d).\n",
+                      -ret);
+               goto out;
+       }
 
        sock_net_set(rds_ib_inet_socket->sk, &init_net);
+       sock_net_set(rds_ib_inet6_socket->sk, &init_net);
 
        /* Initialise the RDS IB fragment size */
        rds_ib_init_frag(RDS_PROTOCOL_VERSION);
@@ -2748,6 +2859,7 @@ int rds_ib_init(void)
                goto out_aux_wq;
 
        rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+       rds_info_register_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info);
 
        ret = rds_ip_threads_init();
        if (ret) {
@@ -2790,6 +2902,7 @@ void rds_ib_exit(void)
 {
        unregister_netdevice_notifier(&rds_ib_nb);
        rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+       rds_info_deregister_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info);
        rds_ib_unregister_client();
        rds_ib_destroy_nodev_conns();
        rds_ib_sysctl_exit();
index b2bcce89f93cfe3508c2dfbc079b76c7767c2e42..be0b3dee3addb6a7b78204002c19728fde2c1515 100644 (file)
@@ -301,7 +301,7 @@ struct rds_ib_srq {
 
 struct rds_ib_alias {
        char                    if_name[IFNAMSIZ];
-       __be32                  ip_addr;
+       __be32                  ip_addr;
        __be32                  ip_bcast;
        __be32                  ip_mask;
 };
@@ -390,7 +390,7 @@ struct rds_ib_port {
        union ib_gid            gid;
        char                    port_label[4];
        char                    if_name[IFNAMSIZ];
-       __be32                  ip_addr;
+       __be32                  ip_addr;
        __be32                  ip_bcast;
        __be32                  ip_mask;
        unsigned int            ip_active_port;
@@ -634,6 +634,8 @@ void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *
 void rds_ib_destroy_nodev_conns(void);
 struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev, int npages);
 void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
+void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
+                        struct rds6_info_rdma_connection *iinfo6);
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
 void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
                    struct rds_sock *rs, u32 *key_ret,
index ea87f9afd466554943a671f247386d907a66b438..2445f161cbfab31b92f028154276387f58594b01 100644 (file)
@@ -41,7 +41,6 @@
 
 #include "rds.h"
 #include "ib.h"
-#include "tcp.h"
 #include "rds_single_path.h"
 
 static unsigned int rds_ib_max_frag = RDS_MAX_FRAG_SIZE;
@@ -1029,12 +1028,16 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
                (unsigned long long)be64_to_cpu(fguid),
                dp_cmn->ricpc_tos);
 
-       /* XXX IPoIB ACL Only support IPv4 */
-       acl_ret = rds_ib_match_acl(cm_id, saddr6->s6_addr32[3]);
-       if (acl_ret < 0) {
-               err = RDS_ACL_FAILURE;
-               rdsdebug("RDS: IB: passive: rds_ib_match_acl failed\n");
-               goto out;
+       /* IPoIB ACL only supports IPv4.  Let all IPv6 traffic pass. */
+       if (ipv6_addr_v4mapped(saddr6)) {
+               acl_ret = rds_ib_match_acl(cm_id, saddr6->s6_addr32[3]);
+               if (acl_ret < 0) {
+                       err = RDS_ACL_FAILURE;
+                       rdsdebug("RDS: IB: passive: rds_ib_match_acl failed\n");
+                       goto out;
+               }
+       } else {
+               acl_ret = 0;
        }
 
        /* RDS/IB is not currently netns aware, thus init_net */
@@ -1200,7 +1203,11 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
        u16 frag;
        int ret;
 
-       ret = rds_ib_match_acl(ic->i_cm_id, conn->c_faddr.s6_addr32[3]);
+       /* IPoIB ACL only supports IPv4.  Let all IPv6 traffic pass. */
+       if (ipv6_addr_v4mapped(&conn->c_faddr))
+               ret = rds_ib_match_acl(ic->i_cm_id, conn->c_faddr.s6_addr32[3]);
+       else
+               ret = 0;
        if (ret < 0) {
                pr_err("RDS: IB: active conn=%p, <%pI6c,%pI6c,%d> destroyed due ACL violation\n",
                       conn, &conn->c_laddr, &conn->c_faddr,
@@ -1266,7 +1273,10 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp)
 
        /* XXX I wonder what affect the port space has */
        /* delegate cm event handler to rdma_transport */
-       handler = rds_rdma_cm_event_handler;
+       if (conn->c_isv6)
+               handler = rds6_rdma_cm_event_handler;
+       else
+               handler = rds_rdma_cm_event_handler;
        ic->i_cm_id = rdma_create_id(handler, conn, RDMA_PS_TCP, IB_QPT_RC);
 
        if (IS_ERR(ic->i_cm_id)) {
@@ -1305,7 +1315,7 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp)
                sin6 = (struct sockaddr_in6 *)&dest;
                sin6->sin6_family = AF_INET6;
                sin6->sin6_addr = conn->c_faddr;
-               sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT);
+               sin6->sin6_port = (__force u16)htons(RDS_CM_PORT);
                sin6->sin6_scope_id = conn->c_dev_if;
        }
 
index a096a33cdbd1616af88179dc0a843675234cb509..a48ffeaceec160ba375a9d85d267099c086080a0 100644 (file)
@@ -314,6 +314,15 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
        iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages;
 }
 
+/* Fill the MR related members of @iinfo6 from the mr_1m_pool of
+ * @rds_ibdev: the pool's max_items and its fmr_attr.max_pages.
+ */
+void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
+                        struct rds6_info_rdma_connection *iinfo6)
+{
+       const struct rds_ib_mr_pool *mr_pool = rds_ibdev->mr_1m_pool;
+
+       iinfo6->rdma_mr_size = mr_pool->fmr_attr.max_pages;
+       iinfo6->rdma_mr_max = mr_pool->max_items;
+}
+
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
 {
        struct rds_ib_mr *ibmr;
index d071c00e32c0cd23b8625717d31b1119a2cf0dbd..462d88ee6a4d12b06620ce165451b7dd1a30d446 100644 (file)
@@ -37,7 +37,6 @@
 
 #include "rds.h"
 #include "ib.h"
-#include "tcp.h"
 #include "rds_single_path.h"
 
 static char *rds_ib_wc_status_strings[] = {
index 1b70b42dd8dba40738f6c85ad03524036714aaba..d26fc8d3dd742303d77ce29cbd89680f0967770d 100644 (file)
@@ -36,7 +36,6 @@
 #include "rdma_transport.h"
 #include "ib.h"
 #include "net/arp.h"
-#include "tcp.h"
 #include "rds_single_path.h"
 
 #include <net/sock.h>
@@ -44,7 +43,9 @@
 
 #define RDS_REJ_CONSUMER_DEFINED 28
 
+/* Global IPv4 and IPv6 RDS RDMA listener cm_id */
 static struct rdma_cm_id *rds_rdma_listen_id;
+static struct rdma_cm_id *rds6_rdma_listen_id;
 
 int unload_allowed __initdata;
 
@@ -325,6 +326,12 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
        return rds_rdma_cm_event_handler_cmn(cm_id, event, false);
 }
 
+/* RDMA CM event callback used for IPv6.  It differs from
+ * rds_rdma_cm_event_handler() only in passing true instead of false as
+ * the last argument of rds_rdma_cm_event_handler_cmn(); the return value
+ * of that common handler is returned unchanged.
+ */
+int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+                              struct rdma_cm_event *event)
+{
+       return rds_rdma_cm_event_handler_cmn(cm_id, event, true);
+}
+
 static int rds_rdma_listen_init_common(rdma_cm_event_handler handler,
                                       struct sockaddr *sa,
                                       struct rdma_cm_id **ret_cm_id)
@@ -358,7 +365,9 @@ static int rds_rdma_listen_init_common(rdma_cm_event_handler handler,
        }
 
        rdsdebug("cm %p listening on port %u\n", cm_id,
-                ntohs(((struct sockaddr_in *)sa)->sin_port));
+                sa->sa_family == PF_INET ?
+                ntohs(((struct sockaddr_in *)sa)->sin_port) :
+                ntohs(((struct sockaddr_in6 *)sa)->sin6_port));
 
        *ret_cm_id = cm_id;
        cm_id = NULL;
@@ -370,13 +379,14 @@ out:
 
 /* Initialize the RDS RDMA listeners.  We create two listeners for
  * compatibility reason.  The one on RDS_PORT is used for IPv4
- * requests only.  The one on RDS_TCP_PORT is used for IPv6 requests
+ * requests only.  The one on RDS_CM_PORT is used for IPv6 requests
  * only.  So only IPv6 enabled RDS module will communicate using this
  * port.
  */
 static int rds_rdma_listen_init(void)
 {
        int ret;
+       struct sockaddr_in6 sin6;
        struct sockaddr_in sin;
 
        sin.sin_family = PF_INET;
@@ -385,7 +395,21 @@ static int rds_rdma_listen_init(void)
        ret = rds_rdma_listen_init_common(rds_rdma_cm_event_handler,
                                          (struct sockaddr *)&sin,
                                          &rds_rdma_listen_id);
-       return ret;
+       if (ret)
+               return ret;
+
+       sin6.sin6_family = PF_INET6;
+       sin6.sin6_addr = in6addr_any;
+       sin6.sin6_port = htons(RDS_CM_PORT);
+       sin6.sin6_scope_id = 0;
+       sin6.sin6_flowinfo = 0;
+       ret = rds_rdma_listen_init_common(rds6_rdma_cm_event_handler,
+                                         (struct sockaddr *)&sin6,
+                                         &rds6_rdma_listen_id);
+       /* Keep going even when IPv6 is not enabled in the system. */
+       if (ret)
+               rdsdebug("Cannot set up IPv6 RDMA listener\n");
+       return 0;
 }
 
 static void rds_rdma_listen_stop(void)
@@ -395,6 +419,11 @@ static void rds_rdma_listen_stop(void)
                rdma_destroy_id(rds_rdma_listen_id);
                rds_rdma_listen_id = NULL;
        }
+       if (rds6_rdma_listen_id) {
+               rdsdebug("cm %p\n", rds6_rdma_listen_id);
+               rdma_destroy_id(rds6_rdma_listen_id);
+               rds6_rdma_listen_id = NULL;
+       }
 }
 
 #define MODULE_NAME "rds_rdma"
index 63aa4a091865af5bc6b36e153b3290212250eeb4..065fb17ed4971e5bf4f193b7de4e1be92eed69d2 100644 (file)
@@ -10,6 +10,8 @@
 int rds_rdma_conn_connect(struct rds_connection *conn);
 int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
                              struct rdma_cm_event *event);
+int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+                              struct rdma_cm_event *event);
 
 /* from rdma_transport.c */
 int rds_rdma_init(void);
index ab20763fedff3681f236d2c5f28b700e3777e47e..9049d35b80a84f6f259f611e14f7b1b98824ed5a 100644 (file)
  */
 #define RDS_ACL_FAILURE                0x04010020
 
-/*
- * XXX randomly chosen, but at least seems to be unused:
- * #               18464-18768 Unassigned
- * We should do better.  We want a reserved port to discourage unpriv'ed
- * userspace from listening.
+/* The following ports, 16385, 18634, 18635, are registered with IANA as
+ * the ports to be used for RDS over TCP and UDP.  18634 is the historical
+ * value used for the RDMA_CM listener port.  RDS/TCP uses port 16385.  After
+ * IPv6 work, RDMA_CM also uses 16385 as the listener port.  18634 is kept
+ * to ensure compatibility with older RDS modules.
  */
 #define RDS_PORT       18634
+#define RDS_CM_PORT    16385
+#define RDS_TCP_PORT   RDS_CM_PORT
 
 #ifdef ATOMIC64_INIT
 #define KERNEL_HAS_ATOMIC64
@@ -1046,6 +1048,10 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg);
 void rds_inc_info_copy(struct rds_incoming *inc,
                       struct rds_info_iterator *iter,
                       __be32 saddr, __be32 daddr, int flip);
+void rds6_inc_info_copy(struct rds_incoming *inc,
+                       struct rds_info_iterator *iter,
+                       struct in6_addr *saddr, struct in6_addr *daddr,
+                       int flip);
 int rds_skb_local(struct sk_buff *skb);
 
 /* send.c */
index a162546432593fcaf9cef155d9cc4d54e16bb794..a763250c7a2509941394242cf54caa7d3cd90791 100644 (file)
@@ -39,7 +39,6 @@
 #include <linux/rds.h>
 
 #include "rds.h"
-#include "tcp.h"
 
 /* forward prototypes */
 static void
@@ -1112,6 +1111,31 @@ void rds_inc_info_copy(struct rds_incoming *inc,
        rds_info_copy(iter, &minfo, sizeof(minfo));
 }
 
+/* Build one struct rds6_info_message describing @inc and append it to
+ * @iter with rds_info_copy().
+ *
+ * The sequence number and length are taken from the header of @inc and
+ * converted to CPU byte order; the ports are copied as stored in the
+ * header.  When @flip is non-zero, @daddr and the header's destination
+ * port are reported as the local side and @saddr and the source port as
+ * the foreign side; otherwise the roles are the other way round.
+ */
+void rds6_inc_info_copy(struct rds_incoming *inc,
+                       struct rds_info_iterator *iter,
+                       struct in6_addr *saddr, struct in6_addr *daddr,
+                       int flip)
+{
+       struct rds6_info_message minfo6;
+
+       /* All sizeof(minfo6) bytes are handed to rds_info_copy() below, so
+        * start from a zeroed record: a member that is not assigned here
+        * (or any padding) must not expose stale stack contents.
+        */
+       memset(&minfo6, 0, sizeof(minfo6));
+
+       minfo6.seq = be64_to_cpu(inc->i_hdr.h_sequence);
+       minfo6.len = be32_to_cpu(inc->i_hdr.h_len);
+
+       if (flip) {
+               minfo6.laddr = *daddr;
+               minfo6.faddr = *saddr;
+               minfo6.lport = inc->i_hdr.h_dport;
+               minfo6.fport = inc->i_hdr.h_sport;
+       } else {
+               minfo6.laddr = *saddr;
+               minfo6.faddr = *daddr;
+               minfo6.lport = inc->i_hdr.h_sport;
+               minfo6.fport = inc->i_hdr.h_dport;
+       }
+
+       rds_info_copy(iter, &minfo6, sizeof(minfo6));
+}
+
 int rds_skb_local(struct sk_buff *skb)
 {
        struct rds_nf_hdr *dst, *org;
index 97e239bcbd9fa5ef4819184d2cf3e1cbf57bd1cb..c43b21b83207d39416826b3b510903148c102c05 100644 (file)
@@ -36,7 +36,6 @@
 #include <linux/list.h>
 
 #include "rds.h"
-#include "tcp.h"
 
 /* When transmitting messages in rds_send_xmit, we need to emerge from
  * time to time and briefly release the CPU. Otherwise the softlock watchdog
@@ -1251,8 +1250,27 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
                        break;
 
                case sizeof(*sin6): {
-                       ret = -EPROTONOSUPPORT;
-                       goto out;
+                       int addr_type;
+
+                       if (sin6->sin6_family != AF_INET6) {
+                               ret = -EINVAL;
+                               goto out;
+                       }
+                       addr_type = ipv6_addr_type(&sin6->sin6_addr);
+                       if (!(addr_type & IPV6_ADDR_UNICAST)) {
+                               ret = -EINVAL;
+                               goto out;
+                       }
+                       if (addr_type & IPV6_ADDR_LINKLOCAL &&
+                           sin6->sin6_scope_id == 0) {
+                               ret = -EINVAL;
+                               goto out;
+                       }
+
+                       daddr = sin6->sin6_addr;
+                       dport = sin6->sin6_port;
+                       scope_id = sin6->sin6_scope_id;
+                       break;
                }
 
                default:
index 54b58c1c21f8dbf2c31b7f38627b310a7b5b7173..72d1b915f83fb4a930106f589eb94df0128c19d9 100644 (file)
 /* only for info exporting */
 static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
 static LIST_HEAD(rds_tcp_tc_list);
+
+/* rds_tcp_tc_count counts only IPv4 connections.
+ * rds6_tcp_tc_count counts both IPv4 and IPv6 connections.
+ */
 static unsigned int rds_tcp_tc_count;
+static unsigned int rds6_tcp_tc_count;
 
 /* Track rds_tcp_connection structs so they can be cleaned up */
 static DEFINE_SPINLOCK(rds_tcp_conn_lock);
@@ -111,7 +116,9 @@ void rds_tcp_restore_callbacks(struct socket *sock,
        /* done under the callback_lock to serialize with write_space */
        spin_lock(&rds_tcp_tc_list_lock);
        list_del_init(&tc->t_list_item);
-       rds_tcp_tc_count--;
+       rds6_tcp_tc_count--;
+       if (!tc->t_cpath->cp_conn->c_isv6)
+               rds_tcp_tc_count--;
        spin_unlock(&rds_tcp_tc_list_lock);
 
        tc->t_sock = NULL;
@@ -198,7 +205,9 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
        /* done under the callback_lock to serialize with write_space */
        spin_lock(&rds_tcp_tc_list_lock);
        list_add_tail(&tc->t_list_item, &rds_tcp_tc_list);
-       rds_tcp_tc_count++;
+       if (!tc->t_cpath->cp_conn->c_isv6)
+               rds_tcp_tc_count++;
+       rds6_tcp_tc_count++;
        spin_unlock(&rds_tcp_tc_list_lock);
 
        /* accepted sockets need our listen data ready undone */
@@ -219,16 +228,16 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
        write_unlock_bh(&sock->sk->sk_callback_lock);
 }
 
-static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
+/* Handle RDS_INFO_TCP_SOCKETS socket option.  It only returns IPv4
+ * connections for backward compatibility.
+ */
+static void rds_tcp_tc_info(struct socket *sock, unsigned int len,
                            struct rds_info_iterator *iter,
                            struct rds_info_lengths *lens)
 {
        struct rds_info_tcp_socket tsinfo;
        struct rds_tcp_connection *tc;
        unsigned long flags;
-       struct sockaddr_in sin;
-       int sinlen;
-       struct socket *sock;
 
        spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
 
@@ -236,18 +245,15 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
                goto out;
 
        list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
+               struct inet_sock *inet = inet_sk(tc->t_sock->sk);
 
-               sock = tc->t_sock;
-               if (sock) {
-                       sock->ops->getname(sock, (struct sockaddr *)&sin,
-                                          &sinlen, 0);
-                       tsinfo.local_addr = sin.sin_addr.s_addr;
-                       tsinfo.local_port = sin.sin_port;
-                       sock->ops->getname(sock, (struct sockaddr *)&sin,
-                                          &sinlen, 1);
-                       tsinfo.peer_addr = sin.sin_addr.s_addr;
-                       tsinfo.peer_port = sin.sin_port;
-               }
+               if (tc->t_cpath->cp_conn->c_isv6)
+                       continue;
+
+               tsinfo.local_addr = inet->inet_saddr;
+               tsinfo.local_port = inet->inet_sport;
+               tsinfo.peer_addr = inet->inet_daddr;
+               tsinfo.peer_port = inet->inet_dport;
 
                tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
                tsinfo.data_rem = tc->t_tinc_data_rem;
@@ -265,6 +271,48 @@ out:
        spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
 }
 
+/* RDS6_INFO_TCP_SOCKETS handler.  Unlike rds_tcp_tc_info() it reports
+ * every entry on rds_tcp_tc_list, IPv4 and IPv6 alike; an IPv4 connection
+ * address is returned as an IPv4 mapped address.
+ *
+ * Records are copied to @iter only when @len has room for
+ * rds6_tcp_tc_count of them; @lens is filled in either way.
+ */
+static void rds6_tcp_tc_info(struct socket *sock, unsigned int len,
+                            struct rds_info_iterator *iter,
+                            struct rds_info_lengths *lens)
+{
+       struct rds6_info_tcp_socket entry;
+       struct rds_tcp_connection *tconn;
+       unsigned long irq_flags;
+
+       spin_lock_irqsave(&rds_tcp_tc_list_lock, irq_flags);
+
+       if (len / sizeof(entry) >= rds6_tcp_tc_count) {
+               list_for_each_entry(tconn, &rds_tcp_tc_list, t_list_item) {
+                       struct sock *tsk = tconn->t_sock->sk;
+                       struct inet_sock *isk = inet_sk(tsk);
+
+                       /* Endpoint addresses and ports of the TCP socket. */
+                       entry.local_addr = tsk->sk_v6_rcv_saddr;
+                       entry.peer_addr = tsk->sk_v6_daddr;
+                       entry.local_port = isk->inet_sport;
+                       entry.peer_port = isk->inet_dport;
+
+                       /* Per-connection receive and send progress. */
+                       entry.hdr_rem = tconn->t_tinc_hdr_rem;
+                       entry.data_rem = tconn->t_tinc_data_rem;
+                       entry.last_sent_nxt = tconn->t_last_sent_nxt;
+                       entry.last_expected_una = tconn->t_last_expected_una;
+                       entry.last_seen_una = tconn->t_last_seen_una;
+
+                       rds_info_copy(iter, &entry, sizeof(entry));
+               }
+       }
+
+       lens->nr = rds6_tcp_tc_count;
+       lens->each = sizeof(entry);
+
+       spin_unlock_irqrestore(&rds_tcp_tc_list_lock, irq_flags);
+}
+
 static int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr,
                               __u32 scope_id)
 {
@@ -469,13 +517,18 @@ static __net_init int rds_tcp_init_net(struct net *net)
                err = -ENOMEM;
                goto fail;
        }
-       rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net);
+       rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, true);
        if (!rtn->rds_tcp_listen_sock) {
-               pr_warn("could not set up listen sock\n");
-               unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
-               rtn->rds_tcp_sysctl = NULL;
-               err = -EAFNOSUPPORT;
-               goto fail;
+               pr_warn("could not set up IPv6 listen sock\n");
+
+               /* Try IPv4 as some systems disable IPv6 */
+               rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false);
+               if (!rtn->rds_tcp_listen_sock) {
+                       unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
+                       rtn->rds_tcp_sysctl = NULL;
+                       err = -EAFNOSUPPORT;
+                       goto fail;
+               }
        }
        INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker);
        return 0;
@@ -642,6 +695,7 @@ static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
 static void __exit rds_tcp_exit(void)
 {
        rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+       rds_info_deregister_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info);
        unregister_pernet_subsys(&rds_tcp_net_ops);
        if (unregister_netdevice_notifier(&rds_tcp_dev_notifier))
                pr_warn("could not unregister rds_tcp_dev_notifier\n");
@@ -681,6 +735,7 @@ static int __init rds_tcp_init(void)
        ret = rds_trans_register(&rds_tcp_transport);
 
        rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+       rds_info_register_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info);
 
        goto out;
 
index e3eff146bd0ba4204701c00306f63c911d8285f6..c4fbafc2ef5aab3ca70df247ec14fa45748585f7 100644 (file)
@@ -1,8 +1,6 @@
 #ifndef _RDS_TCP_H
 #define _RDS_TCP_H
 
-#define RDS_TCP_PORT   16385
-
 struct rds_tcp_incoming {
        struct rds_incoming     ti_inc;
        struct sk_buff_head     ti_skb_list;
@@ -65,7 +63,7 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp);
 void rds_tcp_state_change(struct sock *sk);
 
 /* tcp_listen.c */
-struct socket *rds_tcp_listen_init(struct net *);
+struct socket *rds_tcp_listen_init(struct net *net, bool isv6);
 void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor);
 void rds_tcp_listen_data_ready(struct sock *sk);
 int rds_tcp_accept_one(struct socket *sock);
index 01449f3fb358572ba864553495588243b5e33f65..a993296927801e373fab66fc78635124f9038450 100644 (file)
@@ -83,9 +83,11 @@ out:
 int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
 {
        struct socket *sock = NULL;
+       struct sockaddr_in6 sin6;
        struct sockaddr_in sin;
        struct sockaddr *addr;
        int addrlen;
+       bool isv6;
        int ret;
        struct rds_connection *conn = cp->cp_conn;
        struct rds_tcp_connection *tc = cp->cp_transport_data;
@@ -103,18 +105,35 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
                return 0;
        }
 
-       ret = sock_create_kern(rds_conn_net(conn), PF_INET, SOCK_STREAM,
-                              IPPROTO_TCP, &sock);
+       if (ipv6_addr_v4mapped(&conn->c_laddr)) {
+               ret = sock_create_kern(rds_conn_net(conn), PF_INET,
+                                      SOCK_STREAM, IPPROTO_TCP, &sock);
+               isv6 = false;
+       } else {
+               ret = sock_create_kern(rds_conn_net(conn), PF_INET6,
+                                      SOCK_STREAM, IPPROTO_TCP, &sock);
+               isv6 = true;
+       }
        if (ret < 0)
                goto out;
 
        rds_tcp_tune(sock);
 
-       sin.sin_family = AF_INET;
-       sin.sin_addr.s_addr = (__force u32)conn->c_laddr.s6_addr32[3];
-       sin.sin_port = (__force u16)htons(0);
-       addr = (struct sockaddr *)&sin;
-       addrlen = sizeof(sin);
+       if (isv6) {
+               sin6.sin6_family = AF_INET6;
+               sin6.sin6_addr = conn->c_laddr;
+               sin6.sin6_port = 0;
+               sin6.sin6_flowinfo = 0;
+               sin6.sin6_scope_id = conn->c_dev_if;
+               addr = (struct sockaddr *)&sin6;
+               addrlen = sizeof(sin6);
+       } else {
+               sin.sin_family = AF_INET;
+               sin.sin_addr.s_addr = (__force u32)conn->c_laddr.s6_addr32[3];
+               sin.sin_port = (__force u16)htons(0);
+               addr = (struct sockaddr *)&sin;
+               addrlen = sizeof(sin);
+       }
 
        ret = sock->ops->bind(sock, addr, addrlen);
        if (ret) {
@@ -123,11 +142,21 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
                goto out;
        }
 
-       sin.sin_family = AF_INET;
-       sin.sin_addr.s_addr = (__force u32)conn->c_faddr.s6_addr32[3];
-       sin.sin_port = (__force u16)htons(RDS_TCP_PORT);
-       addr = (struct sockaddr *)&sin;
-       addrlen = sizeof(sin);
+       if (isv6) {
+               sin6.sin6_family = AF_INET6;
+               sin6.sin6_addr = conn->c_faddr;
+               sin6.sin6_port = htons(RDS_TCP_PORT);
+               sin6.sin6_flowinfo = 0;
+               sin6.sin6_scope_id = conn->c_dev_if;
+               addr = (struct sockaddr *)&sin6;
+               addrlen = sizeof(sin6);
+       } else {
+               sin.sin_family = AF_INET;
+               sin.sin_addr.s_addr = (__force u32)conn->c_faddr.s6_addr32[3];
+               sin.sin_port = (__force u16)htons(RDS_TCP_PORT);
+               addr = (struct sockaddr *)&sin;
+               addrlen = sizeof(sin);
+       }
 
        /*
         * once we call connect() we can start getting callbacks and they
index 7951f5eaf0218ff5f7590bd414603b582346e9e8..072ef91d101580fd9d1bf070ac368d924f74fbef 100644 (file)
@@ -146,7 +146,8 @@ int rds_tcp_accept_one(struct socket *sock)
 
        inet = inet_sk(new_sock->sk);
 
-       rdsdebug("accepted tcp %pI6c:%u -> %pI6c:%u\n",
+       rdsdebug("accepted family %d tcp %pI6c:%u -> %pI6c:%u\n",
+                sock->sk->sk_family,
                 &new_sock->sk->sk_v6_rcv_saddr, ntohs(inet->inet_sport),
                 &new_sock->sk->sk_v6_daddr, ntohs(inet->inet_dport));
 
@@ -240,15 +241,21 @@ out:
                ready(sk);
 }
 
-struct socket *rds_tcp_listen_init(struct net *net)
+struct socket *rds_tcp_listen_init(struct net *net, bool isv6)
 {
-       struct sockaddr_in sin;
        struct socket *sock = NULL;
+       struct sockaddr_storage ss;
+       struct sockaddr_in6 *sin6;
+       struct sockaddr_in *sin;
+       int addr_len;
        int ret;
 
-       ret = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
-       if (ret < 0)
+       ret = sock_create_kern(net, isv6 ? PF_INET6 : PF_INET, SOCK_STREAM,
+                              IPPROTO_TCP, &sock);
+       if (ret < 0) {
+               rdsdebug("could not create listener socket: %d\n", ret);
                goto out;
+       }
 
        sock->sk->sk_reuse = 1;
        rds_tcp_nonagle(sock);
@@ -258,13 +265,28 @@ struct socket *rds_tcp_listen_init(struct net *net)
        sock->sk->sk_data_ready = rds_tcp_listen_data_ready;
        write_unlock_bh(&sock->sk->sk_callback_lock);
 
-       sin.sin_family = PF_INET;
-       sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
-       sin.sin_port = (__force u16)htons(RDS_TCP_PORT);
+       if (isv6) {
+               sin6 = (struct sockaddr_in6 *)&ss;
+               sin6->sin6_family = PF_INET6;
+               sin6->sin6_addr = in6addr_any;
+               sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT);
+               sin6->sin6_scope_id = 0;
+               sin6->sin6_flowinfo = 0;
+               addr_len = sizeof(*sin6);
+       } else {
+               sin = (struct sockaddr_in *)&ss;
+               sin->sin_family = PF_INET;
+               sin->sin_addr.s_addr = INADDR_ANY;
+               sin->sin_port = (__force u16)htons(RDS_TCP_PORT);
+               addr_len = sizeof(*sin);
+       }
 
-       ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
-       if (ret < 0)
+       ret = sock->ops->bind(sock, (struct sockaddr *)&ss, addr_len);
+       if (ret < 0) {
+               rdsdebug("could not bind %s listener socket: %d\n",
+                        isv6 ? "IPv6" : "IPv4", ret);
                goto out;
+       }
 
        ret = sock->ops->listen(sock, 64);
        if (ret < 0)
index 70a9b55070f082ef7453e434c6a7eb109d070ab4..4985baca4f38118019fdb4764687f83e89cd7731 100644 (file)
@@ -34,7 +34,7 @@
 #include <linux/random.h>
 
 #include "rds.h"
-#include "tcp.h"
+
 static unsigned int rds_conn_hb_timeout = 0;
 module_param(rds_conn_hb_timeout, int, 0444);
 MODULE_PARM_DESC(rds_conn_hb_timeout, " Connection heartbeat timeout");