From: Ka-Cheong Poon Date: Mon, 23 Oct 2017 13:21:49 +0000 (-0700) Subject: rds: Enable RDS IPv6 support X-Git-Tag: v4.1.12-124.31.3~1488 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=160bb966577428612461b710095138eedbd8e9be;p=users%2Fjedix%2Flinux-maple.git rds: Enable RDS IPv6 support This patch enables RDS to use IPv6 addresses. There are many data structures (RDS socket options) used by RDS apps which use a 32-bit integer to store an IP address. To support IPv6, struct in6_addr needs to be used. To ensure backward compatibility, a new data structure is introduced for each of those data structures which use a 32-bit integer to represent an IP address. New socket options are introduced to use those new structures. This means that existing apps should work without a problem with the new RDS module. For apps which want to use IPv6, those new data structures and socket options can be used. An IPv4-mapped address is used to represent an IPv4 address in the new data structures. RDS/RDMA/IB uses a private data (struct rds_ib_connect_private) exchange between endpoints at RDS connection establishment time to support RDMA. This private data exchange uses a 32-bit integer to represent an IP address. This needs to be changed in order to support IPv6. A new private data struct rds6_ib_connect_private is introduced to handle this. To ensure backward compatibility, an IPv6-capable RDS stack uses another RDMA listener port (RDS_CM_PORT which is 16385, the same value as the RDS/TCP listener port number) to accept IPv6 connections. And it continues to use the original RDS_PORT for IPv4 RDS connections. When it needs to communicate with an IPv6 peer, it uses the RDS_CM_PORT to send the connection setup request. 
Orabug: 25410192 Signed-off-by: Ka-Cheong Poon Reviewed-by: Håkon Bugge --- diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index ac631250c04b..651dcc6ae6c9 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -67,6 +67,12 @@ #define RDS_GET_MR_FOR_DEST 7 #define RDS_CONN_RESET 8 #define SO_RDS_TRANSPORT 9 +/* Socket option to tap receive path latency + * SO_RDS: SO_RDS_MSG_RXPATH_LATENCY + * Format used struct rds_rx_trace_so + */ +#define SO_RDS_MSG_RXPATH_LATENCY 10 +#define RDS6_CONN_RESET 11 /* supported values for SO_RDS_TRANSPORT */ #define RDS_TRANS_IB 0 @@ -74,17 +80,11 @@ #define RDS_TRANS_COUNT 3 #define RDS_TRANS_NONE (~0) -/* Socket option to tap receive path latency - * SO_RDS: SO_RDS_MSG_RXPATH_LATENCY - * Format used struct rds_rx_trace_so - */ -#define SO_RDS_MSG_RXPATH_LATENCY 10 - /* * ioctl commands for SOL_RDS */ #define SIOCRDSSETTOS (SIOCPROTOPRIVATE) -#define SIOCRDSGETTOS (SIOCPROTOPRIVATE + 1) +#define SIOCRDSGETTOS (SIOCPROTOPRIVATE + 1) #define SIOCRDSENABLENETFILTER (SIOCPROTOPRIVATE + 2) #define IPPROTO_OKA (142) @@ -142,9 +142,9 @@ struct rds_cmsg_rx_trace { #define RDS_CMSG_CONG_UPDATE 5 #define RDS_CMSG_ATOMIC_FADD 6 #define RDS_CMSG_ATOMIC_CSWP 7 -#define RDS_CMSG_MASKED_ATOMIC_FADD 8 -#define RDS_CMSG_MASKED_ATOMIC_CSWP 9 -#define RDS_CMSG_ASYNC_SEND 10 +#define RDS_CMSG_MASKED_ATOMIC_FADD 8 +#define RDS_CMSG_MASKED_ATOMIC_CSWP 9 +#define RDS_CMSG_ASYNC_SEND 10 #define RDS_CMSG_RXPATH_LATENCY 11 #define RDS_INFO_FIRST 10000 @@ -159,7 +159,17 @@ struct rds_cmsg_rx_trace { #define RDS_INFO_IB_CONNECTIONS 10008 #define RDS_INFO_CONNECTION_STATS 10009 #define RDS_INFO_IWARP_CONNECTIONS 10010 -#define RDS_INFO_LAST 10010 + +/* PF_RDS6 options */ +#define RDS6_INFO_CONNECTIONS 10011 +#define RDS6_INFO_SEND_MESSAGES 10012 +#define RDS6_INFO_RETRANS_MESSAGES 10013 +#define RDS6_INFO_RECV_MESSAGES 10014 +#define RDS6_INFO_SOCKETS 10015 +#define RDS6_INFO_TCP_SOCKETS 10016 +#define 
RDS6_INFO_IB_CONNECTIONS 10017 + +#define RDS_INFO_LAST 10017 struct rds_info_counter { u_int8_t name[32]; @@ -169,7 +179,7 @@ struct rds_info_counter { #define RDS_INFO_CONNECTION_FLAG_SENDING 0x01 #define RDS_INFO_CONNECTION_FLAG_CONNECTING 0x02 #define RDS_INFO_CONNECTION_FLAG_CONNECTED 0x04 -#define RDS_INFO_CONNECTION_FLAG_ERROR 0x08 +#define RDS_INFO_CONNECTION_FLAG_ERROR 0x08 #define TRANSNAMSIZ 16 @@ -183,12 +193,14 @@ struct rds_info_connection { u_int8_t tos; } __attribute__((packed)); -struct rds_info_flow { - __be32 laddr; - __be32 faddr; - u_int32_t bytes; - __be16 lport; - __be16 fport; +struct rds6_info_connection { + uint64_t next_tx_seq; + uint64_t next_rx_seq; + struct in6_addr laddr; + struct in6_addr faddr; + uint8_t transport[TRANSNAMSIZ]; /* null term ascii */ + uint8_t flags; + uint8_t tos; } __attribute__((packed)); #define RDS_INFO_MESSAGE_FLAG_ACK 0x01 @@ -205,6 +217,17 @@ struct rds_info_message { u_int8_t tos; } __attribute__((packed)); +struct rds6_info_message { + uint64_t seq; + uint32_t len; + struct in6_addr laddr; + struct in6_addr faddr; + __be16 lport; + __be16 fport; + uint8_t flags; + uint8_t tos; +} __attribute__((packed)); + struct rds_info_socket { u_int32_t sndbuf; __be32 bound_addr; @@ -215,6 +238,16 @@ struct rds_info_socket { u_int64_t inum; } __attribute__((packed)); +struct rds6_info_socket { + uint32_t sndbuf; + struct in6_addr bound_addr; + struct in6_addr connected_addr; + __be16 bound_port; + __be16 connected_port; + uint32_t rcvbuf; + uint64_t inum; +} __attribute__((packed)); + struct rds_info_tcp_socket { __be32 local_addr; __be16 local_port; @@ -227,6 +260,18 @@ struct rds_info_tcp_socket { u_int32_t last_seen_una; } __attribute__((packed)); +struct rds6_info_tcp_socket { + struct in6_addr local_addr; + __be16 local_port; + struct in6_addr peer_addr; + __be16 peer_port; + uint64_t hdr_rem; + uint64_t data_rem; + uint32_t last_sent_nxt; + uint32_t last_expected_una; + uint32_t last_seen_una; +} 
__attribute__((packed)); + #define RDS_IB_GID_LEN 16 struct rds_info_rdma_connection { __be32 src_addr; @@ -251,6 +296,28 @@ struct rds_info_rdma_connection { }; +struct rds6_info_rdma_connection { + struct in6_addr src_addr; + struct in6_addr dst_addr; + uint8_t src_gid[RDS_IB_GID_LEN]; + uint8_t dst_gid[RDS_IB_GID_LEN]; + + uint32_t max_send_wr; + uint32_t max_recv_wr; + uint32_t max_send_sge; + uint32_t rdma_mr_max; + uint32_t rdma_mr_size; + uint8_t tos; + uint8_t sl; + uint32_t cache_allocs; + uint32_t frag; + uint16_t flow_ctl_post_credit; + uint16_t flow_ctl_send_credit; + uint32_t qp_num; + uint32_t w_alloc_ctr; + uint32_t w_free_ctr; +}; + /* * Congestion monitoring. * Congestion control in RDS happens at the host connection @@ -339,6 +406,12 @@ struct rds_reset { struct in_addr dst; }; +struct rds6_reset { + uint8_t tos; + struct in6_addr src; + struct in6_addr dst; +}; + struct rds_asend_args { u_int64_t user_token; u_int64_t flags; @@ -349,10 +422,10 @@ struct rds_rdma_send_notify { int32_t status; }; -#define RDS_RDMA_SEND_SUCCESS 0 -#define RDS_RDMA_REMOTE_ERROR 1 -#define RDS_RDMA_SEND_CANCELED 2 -#define RDS_RDMA_SEND_DROPPED 3 +#define RDS_RDMA_SEND_SUCCESS 0 +#define RDS_RDMA_REMOTE_ERROR 1 +#define RDS_RDMA_SEND_CANCELED 2 +#define RDS_RDMA_SEND_DROPPED 3 #define RDS_RDMA_SEND_OTHER_ERROR 4 /* diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index e84b71dc9dc4..c7b30e619bab 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -42,7 +42,7 @@ #include #include "rds.h" -#include "tcp.h" + /* UNUSED for backwards compat only */ static unsigned int rds_ib_retry_count = 0xdead; module_param(rds_ib_retry_count, int, 0444); @@ -427,6 +427,51 @@ done: return 0; } +static int rds6_user_reset(struct rds_sock *rs, char __user *optval, int optlen) +{ + struct rds6_reset reset; + struct rds_connection *conn; + LIST_HEAD(s_addr_conns); + + if (optlen != sizeof(struct rds6_reset)) + return -EINVAL; + + if (copy_from_user(&reset, (struct rds6_reset __user 
*)optval, + sizeof(struct rds6_reset))) + return -EFAULT; + + /* Reset all conns associated with source addr */ + if (ipv6_addr_any(&reset.dst)) { + pr_info("RDS: Reset ALL conns for Source %pI6c\n", + &reset.src); + + rds_conn_laddr_list(sock_net(rds_rs_to_sk(rs)), + &reset.src, &s_addr_conns); + if (list_empty(&s_addr_conns)) + goto done; + + list_for_each_entry(conn, &s_addr_conns, c_laddr_node) + if (conn) + rds_user_conn_paths_drop(conn, 1); + goto done; + } + + conn = rds_conn_find(sock_net(rds_rs_to_sk(rs)), + &reset.src, &reset.dst, rs->rs_transport, + reset.tos, rs->rs_bound_scope_id); + + if (conn) { + bool is_tcp = conn->c_trans->t_type == RDS_TRANS_TCP; + + printk(KERN_NOTICE "Resetting RDS/%s connection <%pI6c,%pI6c,%d>\n", + is_tcp ? "tcp" : "IB", + &reset.src, &reset.dst, conn->c_tos); + rds_user_conn_paths_drop(conn, DR_USER_RESET); + } +done: + return 0; +} + static int rds_set_transport(struct rds_sock *rs, char __user *optval, int optlen) { @@ -533,6 +578,13 @@ static int rds_setsockopt(struct socket *sock, int level, int optname, } ret = rds_user_reset(rs, optval, optlen); break; + case RDS6_CONN_RESET: + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) { + ret = -EACCES; + break; + } + ret = rds6_user_reset(rs, optval, optlen); + break; case SO_RDS_TRANSPORT: lock_sock(sock->sk); ret = rds_set_transport(rs, optval, optlen); @@ -611,7 +663,9 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr, { struct sock *sk = sock->sk; struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; struct rds_sock *rs = rds_sk_to_rs(sk); + int addr_type; int ret = 0; lock_sock(sk); @@ -637,7 +691,23 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr, break; case sizeof(struct sockaddr_in6): - ret = -EPROTONOSUPPORT; + sin6 = (struct sockaddr_in6 *)uaddr; + if (sin6->sin6_family != AF_INET6) { + ret = -EAFNOSUPPORT; + break; + } + addr_type = ipv6_addr_type(&sin6->sin6_addr); + if (!(addr_type & IPV6_ADDR_UNICAST)) { + ret = 
-EPROTOTYPE; + break; + } + if (addr_type & IPV6_ADDR_LINKLOCAL && + sin6->sin6_scope_id == 0) { + ret = -EINVAL; + break; + } + rs->rs_conn_addr = sin6->sin6_addr; + rs->rs_conn_port = sin6->sin6_port; break; default: @@ -822,6 +892,38 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len, lens->each = sizeof(struct rds_info_message); } +static void rds6_sock_inc_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds_sock *rs; + struct rds_incoming *inc; + unsigned int total = 0; + + len /= sizeof(struct rds6_info_message); + + spin_lock_bh(&rds_sock_lock); + + list_for_each_entry(rs, &rds_sock_list, rs_item) { + read_lock(&rs->rs_recv_lock); + + /* XXX too lazy to maintain counts.. */ + list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { + total++; + if (total <= len) + rds6_inc_info_copy(inc, iter, &inc->i_saddr, + &rs->rs_bound_addr, 1); + } + + read_unlock(&rs->rs_recv_lock); + } + + spin_unlock_bh(&rds_sock_lock); + + lens->nr = total; + lens->each = sizeof(struct rds6_info_message); +} + static void rds_sock_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) @@ -855,6 +957,39 @@ out: spin_unlock_bh(&rds_sock_lock); } +static void rds6_sock_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds6_info_socket sinfo6; + struct rds_sock *rs; + + len /= sizeof(struct rds6_info_socket); + + spin_lock_bh(&rds_sock_lock); + + if (len < rds_sock_count) + goto out; + + list_for_each_entry(rs, &rds_sock_list, rs_item) { + sinfo6.sndbuf = rds_sk_sndbuf(rs); + sinfo6.rcvbuf = rds_sk_rcvbuf(rs); + sinfo6.bound_addr = rs->rs_bound_addr; + sinfo6.connected_addr = rs->rs_conn_addr; + sinfo6.bound_port = rs->rs_bound_port; + sinfo6.connected_port = rs->rs_conn_port; + sinfo6.inum = sock_i_ino(rds_rs_to_sk(rs)); + + rds_info_copy(iter, &sinfo6, sizeof(sinfo6)); 
+ } + +out: + lens->nr = rds_sock_count; + lens->each = sizeof(struct rds6_info_socket); + + spin_unlock_bh(&rds_sock_lock); +} + static unsigned long parse_ul(char *ptr, unsigned long max) { unsigned long val; @@ -966,6 +1101,8 @@ static void __exit rds_exit(void) rds_page_exit(); rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info); rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); + rds_info_deregister_func(RDS6_INFO_SOCKETS, rds6_sock_info); + rds_info_deregister_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info); } module_exit(rds_exit); @@ -1001,6 +1138,8 @@ static int __init rds_init(void) rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info); rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); + rds_info_register_func(RDS6_INFO_SOCKETS, rds6_sock_info); + rds_info_register_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info); rds_qos_threshold_init(); diff --git a/net/rds/bind.c b/net/rds/bind.c index 9676d565433b..31d56d469276 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -189,9 +189,13 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct in6_addr v6addr, *binding_addr; struct rds_transport *trans; __u32 scope_id = 0; + int addr_type; int ret = 0; __be16 port; + /* We allow an RDS socket to be bound to either IPv4 or IPv6 + * address. + */ if (addr_len == sizeof(struct sockaddr_in)) { struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; @@ -202,7 +206,21 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) binding_addr = &v6addr; port = sin->sin_port; } else if (addr_len == sizeof(struct sockaddr_in6)) { - return -EPROTONOSUPPORT; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)uaddr; + + addr_type = ipv6_addr_type(&sin6->sin6_addr); + if (sin6->sin6_family != AF_INET6 || + !(addr_type & IPV6_ADDR_UNICAST)) { + return -EINVAL; + } + /* The scope ID must be specified for link local address. 
*/ + if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (sin6->sin6_scope_id == 0) + return -EINVAL; + scope_id = sin6->sin6_scope_id; + } + binding_addr = &sin6->sin6_addr; + port = sin6->sin6_port; } else { return -EINVAL; } diff --git a/net/rds/connection.c b/net/rds/connection.c index 6121a186e46d..c13c7976353b 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -37,7 +37,6 @@ #include "rds.h" #include "loop.h" -#include "tcp.h" #define RDS_CONNECTION_HASH_BITS 12 #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) @@ -556,10 +555,21 @@ void rds_conn_destroy(struct rds_connection *conn, int shutdown) } EXPORT_SYMBOL_GPL(rds_conn_destroy); -static void rds_conn_message_info(struct socket *sock, unsigned int len, - struct rds_info_iterator *iter, - struct rds_info_lengths *lens, - int want_send) +static void __rds_inc_msg_cp(struct rds_incoming *inc, + struct rds_info_iterator *iter, + void *saddr, void *daddr, int flip, bool isv6) +{ + if (isv6) + rds6_inc_info_copy(inc, iter, saddr, daddr, flip); + else + rds_inc_info_copy(inc, iter, *(__be32 *)saddr, + *(__be32 *)daddr, flip); +} + +static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int want_send, bool isv6) { struct hlist_head *head; struct list_head *list; @@ -570,7 +580,10 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, size_t i; int j; - len /= sizeof(struct rds_info_message); + if (isv6) + len /= sizeof(struct rds6_info_message); + else + len /= sizeof(struct rds_info_message); rcu_read_lock(); @@ -595,18 +608,13 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, /* XXX too lazy to maintain counts.. 
*/ list_for_each_entry(rm, list, m_conn_item) { - __be32 laddr; - __be32 faddr; - total++; - laddr = conn->c_laddr.s6_addr32[3]; - faddr = conn->c_faddr.s6_addr32[3]; if (total <= len) - rds_inc_info_copy(&rm->m_inc, - iter, - laddr, - faddr, - 0); + __rds_inc_msg_cp(&rm->m_inc, + iter, + &conn->c_laddr, + &conn->c_faddr, + 0, isv6); } cp->cp_rdsinfo_pending = 0; @@ -617,7 +625,26 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, rcu_read_unlock(); lens->nr = total; - lens->each = sizeof(struct rds_info_message); + if (isv6) + lens->each = sizeof(struct rds6_info_message); + else + lens->each = sizeof(struct rds_info_message); +} + +static void rds_conn_message_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int want_send) +{ + rds_conn_message_info_cmn(sock, len, iter, lens, want_send, false); +} + +static void rds6_conn_message_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens, + int want_send) +{ + rds_conn_message_info_cmn(sock, len, iter, lens, want_send, true); } static void rds_conn_message_info_send(struct socket *sock, unsigned int len, @@ -627,6 +654,13 @@ static void rds_conn_message_info_send(struct socket *sock, unsigned int len, rds_conn_message_info(sock, len, iter, lens, 1); } +static void rds6_conn_message_info_send(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds6_conn_message_info(sock, len, iter, lens, 1); +} + static void rds_conn_message_info_retrans(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, @@ -635,6 +669,14 @@ static void rds_conn_message_info_retrans(struct socket *sock, rds_conn_message_info(sock, len, iter, lens, 0); } +static void rds6_conn_message_info_retrans(struct socket *sock, + unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + 
rds6_conn_message_info(sock, len, iter, lens, 0); +} + void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, @@ -727,6 +769,9 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) struct rds_info_connection *cinfo = buffer; struct rds_connection *conn = cp->cp_conn; + if (conn->c_isv6) + return 0; + cinfo->next_tx_seq = cp->cp_next_tx_seq; cinfo->next_rx_seq = cp->cp_next_rx_seq; cinfo->laddr = conn->c_laddr.s6_addr32[3]; @@ -750,6 +795,37 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) return 1; } +static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer) +{ + struct rds6_info_connection *cinfo6 = buffer; + struct rds_connection *conn = cp->cp_conn; + + cinfo6->next_tx_seq = cp->cp_next_tx_seq; + cinfo6->next_rx_seq = cp->cp_next_rx_seq; + cinfo6->laddr = conn->c_laddr; + cinfo6->faddr = conn->c_faddr; + cinfo6->tos = conn->c_tos; + strncpy(cinfo6->transport, conn->c_trans->t_name, + sizeof(cinfo6->transport)); + cinfo6->flags = 0; + + rds_conn_info_set(cinfo6->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), + SENDING); + /* XXX Future: return the state rather than these funky bits */ + rds_conn_info_set(cinfo6->flags, + atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING, + CONNECTING); + rds_conn_info_set(cinfo6->flags, + atomic_read(&cp->cp_state) == RDS_CONN_UP, + CONNECTED); + rds_conn_info_set(cinfo6->flags, cp->cp_pending_flush, + ERROR); + /* Just return 1 as there is no error case. This is a helper function + * for rds_walk_conn_path_info() and it wants a return value. 
+ */ + return 1; +} + static void rds_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) @@ -759,6 +835,15 @@ static void rds_conn_info(struct socket *sock, unsigned int len, sizeof(struct rds_info_connection)); } +static void rds6_conn_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_walk_conn_path_info(sock, len, iter, lens, + rds6_conn_info_visitor, + sizeof(struct rds6_info_connection)); +} + int rds_conn_init(void) { rds_conn_slab = kmem_cache_create("rds_connection", @@ -772,6 +857,11 @@ int rds_conn_init(void) rds_conn_message_info_send); rds_info_register_func(RDS_INFO_RETRANS_MESSAGES, rds_conn_message_info_retrans); + rds_info_register_func(RDS6_INFO_CONNECTIONS, rds6_conn_info); + rds_info_register_func(RDS6_INFO_SEND_MESSAGES, + rds6_conn_message_info_send); + rds_info_register_func(RDS6_INFO_RETRANS_MESSAGES, + rds6_conn_message_info_retrans); return 0; } @@ -789,7 +879,11 @@ void rds_conn_exit(void) rds_conn_message_info_send); rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES, rds_conn_message_info_retrans); - + rds_info_deregister_func(RDS6_INFO_CONNECTIONS, rds6_conn_info); + rds_info_deregister_func(RDS6_INFO_SEND_MESSAGES, + rds6_conn_message_info_send); + rds_info_deregister_func(RDS6_INFO_RETRANS_MESSAGES, + rds6_conn_message_info_retrans); } static char *conn_drop_reasons[] = { diff --git a/net/rds/ib.c b/net/rds/ib.c index 60637ed5f8f7..dc26513791a2 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -44,10 +44,10 @@ #include #include #include +#include #include "rds.h" #include "ib.h" -#include "tcp.h" #include "rds_single_path.h" #include @@ -109,6 +109,7 @@ LIST_HEAD(ib_nodev_conns); struct workqueue_struct *rds_aux_wq; struct socket *rds_ib_inet_socket; +struct socket *rds_ib_inet6_socket; static struct rds_ib_port *ip_config; static u8 ip_port_cnt = 0; @@ -355,6 +356,9 @@ struct ib_client rds_ib_client = { 
.remove = rds_ib_remove_one }; +/* Remove IB connection information. This function only reports IPv4 + * connections for backward compatibility. + */ static int rds_ib_conn_info_visitor(struct rds_connection *conn, void *buffer) { @@ -396,7 +400,8 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, iinfo->max_send_sge = rds_ibdev->max_sge; iinfo->qp_num = ic->i_cm_id->qp->qp_num; iinfo->w_alloc_ctr = ic->i_recv_ring.w_alloc_ctr; - iinfo->w_free_ctr = (u32) atomic_read(&ic->i_recv_ring.w_free_ctr); + iinfo->w_free_ctr = + (u32)atomic_read(&ic->i_recv_ring.w_free_ctr); iinfo->flow_ctl_post_credit = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); iinfo->flow_ctl_send_credit = @@ -407,6 +412,58 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, return 1; } +/* IPv6 version of rds_ib_conn_info_visitor(). */ +static int rds6_ib_conn_info_visitor(struct rds_connection *conn, + void *buffer) +{ + struct rds6_info_rdma_connection *iinfo6 = buffer; + struct rds_ib_connection *ic = conn->c_transport_data; + + /* We will only ever look at IB transports */ + if (conn->c_trans != &rds_ib_transport) + return 0; + + iinfo6->src_addr = conn->c_laddr; + iinfo6->dst_addr = conn->c_faddr; + + memset(&iinfo6->src_gid, 0, sizeof(iinfo6->src_gid)); + memset(&iinfo6->dst_gid, 0, sizeof(iinfo6->dst_gid)); + + if (ic) { + iinfo6->tos = conn->c_tos; + iinfo6->sl = ic->i_sl; + iinfo6->frag = ic->i_frag_sz; + } + + if (rds_conn_state(conn) == RDS_CONN_UP) { + struct rds_ib_device *rds_ibdev; + struct rdma_dev_addr *dev_addr; + + ic = conn->c_transport_data; + dev_addr = &ic->i_cm_id->route.addr.dev_addr; + rdma_addr_get_sgid(dev_addr, + (union ib_gid *)&iinfo6->src_gid); + rdma_addr_get_dgid(dev_addr, + (union ib_gid *)&iinfo6->dst_gid); + + rds_ibdev = ic->rds_ibdev; + iinfo6->max_send_wr = ic->i_send_ring.w_nr; + iinfo6->max_recv_wr = ic->i_recv_ring.w_nr; + iinfo6->max_send_sge = rds_ibdev->max_sge; + iinfo6->qp_num = ic->i_cm_id->qp->qp_num; + 
iinfo6->w_alloc_ctr = ic->i_recv_ring.w_alloc_ctr; + iinfo6->w_free_ctr = + (u32)atomic_read(&ic->i_recv_ring.w_free_ctr); + iinfo6->flow_ctl_post_credit = + IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); + iinfo6->flow_ctl_send_credit = + IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)); + rds6_ib_get_mr_info(rds_ibdev, iinfo6); + iinfo6->cache_allocs = atomic_read(&ic->i_cache_allocs); + } + return 1; +} + static void rds_ib_ic_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) @@ -416,6 +473,15 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len, sizeof(struct rds_info_rdma_connection)); } +/* IPv6 version of rds_ib_ic_info(). */ +static void rds6_ib_ic_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + rds_for_each_conn_info(sock, len, iter, lens, + rds6_ib_conn_info_visitor, + sizeof(struct rds6_info_rdma_connection)); +} /* * Early RDS/IB was built to only bind to an address if there is an IPoIB @@ -432,10 +498,14 @@ static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr, { int ret; struct rdma_cm_id *cm_id; + struct sockaddr_in6 sin6; struct sockaddr_in sin; + struct sockaddr *sa; + bool isv4; + isv4 = ipv6_addr_v4mapped(addr); /* Link-local addresses don't play well with IB */ - if (ipv4_is_linklocal_169(addr->s6_addr32[3])) { + if (isv4 && ipv4_is_linklocal_169(addr->s6_addr32[3])) { pr_info_once("\n"); pr_info_once("****************************************************\n"); pr_info_once("** WARNING WARNING WARNING WARNING WARNING **\n"); @@ -458,19 +528,52 @@ static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr, if (IS_ERR(cm_id)) return -EADDRNOTAVAIL; - memset(&sin, 0, sizeof(sin)); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = addr->s6_addr32[3]; + if (isv4) { + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr->s6_addr32[3]; + sa = 
(struct sockaddr *)&sin; + } else { + memset(&sin6, 0, sizeof(sin6)); + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = *addr; + sin6.sin6_scope_id = scope_id; + sa = (struct sockaddr *)&sin6; + + /* XXX Do a special IPv6 link local address check here. The + * reason is that rdma_bind_addr() always succeeds with IPv6 + * link local address regardless if it is configured or not in + * a system. + */ + if (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL) { + struct net_device *dev; + + if (scope_id == 0) + return -EADDRNOTAVAIL; + + /* Use init_net for now as RDS is not network + * name space aware. + */ + dev = dev_get_by_index(&init_net, scope_id); + if (!dev) + return -EADDRNOTAVAIL; + if (!ipv6_chk_addr(&init_net, addr, dev, 1)) { + dev_put(dev); + return -EADDRNOTAVAIL; + } + dev_put(dev); + } + } /* rdma_bind_addr will only succeed for IB & iWARP devices */ - ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + ret = rdma_bind_addr(cm_id, sa); /* due to this, we will claim to support iWARP devices unless we check node_type. */ if (ret || !cm_id->device || cm_id->device->node_type != RDMA_NODE_IB_CA) ret = -EADDRNOTAVAIL; - rdsdebug("addr %pI6c ret %d node type %d\n", - addr, ret, + rdsdebug("addr %pI6c%%%u ret %d node type %d\n", + addr, scope_id, ret, cm_id->device ? 
cm_id->device->node_type : -1); rdma_destroy_id(cm_id); @@ -2715,8 +2818,16 @@ int rds_ib_init(void) printk(KERN_ERR "RDS/IB: can't create TCP transport socket (%d).\n", -ret); goto out; } + ret = sock_create_kern(&init_net, PF_INET6, SOCK_DGRAM, 0, + &rds_ib_inet6_socket); + if (ret < 0) { + printk(KERN_ERR "RDS/IB: can't create IPv6 configuration socket (%d).\n", + -ret); + goto out; + } sock_net_set(rds_ib_inet_socket->sk, &init_net); + sock_net_set(rds_ib_inet6_socket->sk, &init_net); /* Initialise the RDS IB fragment size */ rds_ib_init_frag(RDS_PROTOCOL_VERSION); @@ -2748,6 +2859,7 @@ int rds_ib_init(void) goto out_aux_wq; rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); + rds_info_register_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info); ret = rds_ip_threads_init(); if (ret) { @@ -2790,6 +2902,7 @@ void rds_ib_exit(void) { unregister_netdevice_notifier(&rds_ib_nb); rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); + rds_info_deregister_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info); rds_ib_unregister_client(); rds_ib_destroy_nodev_conns(); rds_ib_sysctl_exit(); diff --git a/net/rds/ib.h b/net/rds/ib.h index b2bcce89f93c..be0b3dee3add 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -301,7 +301,7 @@ struct rds_ib_srq { struct rds_ib_alias { char if_name[IFNAMSIZ]; - __be32 ip_addr; + __be32 ip_addr; __be32 ip_bcast; __be32 ip_mask; }; @@ -390,7 +390,7 @@ struct rds_ib_port { union ib_gid gid; char port_label[4]; char if_name[IFNAMSIZ]; - __be32 ip_addr; + __be32 ip_addr; __be32 ip_bcast; __be32 ip_mask; unsigned int ip_active_port; @@ -634,6 +634,8 @@ void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection * void rds_ib_destroy_nodev_conns(void); struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev, int npages); void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); +void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev, + struct 
rds6_info_rdma_connection *iinfo6); void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, struct rds_sock *rs, u32 *key_ret, diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index ea87f9afd466..2445f161cbfa 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -41,7 +41,6 @@ #include "rds.h" #include "ib.h" -#include "tcp.h" #include "rds_single_path.h" static unsigned int rds_ib_max_frag = RDS_MAX_FRAG_SIZE; @@ -1029,12 +1028,16 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, (unsigned long long)be64_to_cpu(fguid), dp_cmn->ricpc_tos); - /* XXX IPoIB ACL Only support IPv4 */ - acl_ret = rds_ib_match_acl(cm_id, saddr6->s6_addr32[3]); - if (acl_ret < 0) { - err = RDS_ACL_FAILURE; - rdsdebug("RDS: IB: passive: rds_ib_match_acl failed\n"); - goto out; + /* IPoIB ACL only supports IPv4. Let all IPv6 traffic pass. */ + if (ipv6_addr_v4mapped(saddr6)) { + acl_ret = rds_ib_match_acl(cm_id, saddr6->s6_addr32[3]); + if (acl_ret < 0) { + err = RDS_ACL_FAILURE; + rdsdebug("RDS: IB: passive: rds_ib_match_acl failed\n"); + goto out; + } + } else { + acl_ret = 0; } /* RDS/IB is not currently netns aware, thus init_net */ @@ -1200,7 +1203,11 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6) u16 frag; int ret; - ret = rds_ib_match_acl(ic->i_cm_id, conn->c_faddr.s6_addr32[3]); + /* IPoIB ACL only supports IPv4. Let all IPv6 traffic pass. 
*/ + if (ipv6_addr_v4mapped(&conn->c_faddr)) + ret = rds_ib_match_acl(ic->i_cm_id, conn->c_faddr.s6_addr32[3]); + else + ret = 0; if (ret < 0) { pr_err("RDS: IB: active conn=%p, <%pI6c,%pI6c,%d> destroyed due ACL violation\n", conn, &conn->c_laddr, &conn->c_faddr, @@ -1266,7 +1273,10 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp) /* XXX I wonder what affect the port space has */ /* delegate cm event handler to rdma_transport */ - handler = rds_rdma_cm_event_handler; + if (conn->c_isv6) + handler = rds6_rdma_cm_event_handler; + else + handler = rds_rdma_cm_event_handler; ic->i_cm_id = rdma_create_id(handler, conn, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(ic->i_cm_id)) { @@ -1305,7 +1315,7 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp) sin6 = (struct sockaddr_in6 *)&dest; sin6->sin6_family = AF_INET6; sin6->sin6_addr = conn->c_faddr; - sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT); + sin6->sin6_port = (__force u16)htons(RDS_CM_PORT); sin6->sin6_scope_id = conn->c_dev_if; } diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index a096a33cdbd1..a48ffeaceec1 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -314,6 +314,15 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages; } +void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev, + struct rds6_info_rdma_connection *iinfo6) +{ + struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; + + iinfo6->rdma_mr_max = pool_1m->max_items; + iinfo6->rdma_mr_size = pool_1m->fmr_attr.max_pages; +} + void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) { struct rds_ib_mr *ibmr; diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index d071c00e32c0..462d88ee6a4d 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -37,7 +37,6 @@ #include "rds.h" #include "ib.h" -#include "tcp.h" #include "rds_single_path.h" static char *rds_ib_wc_status_strings[] = { diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c 
index 1b70b42dd8db..d26fc8d3dd74 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -36,7 +36,6 @@ #include "rdma_transport.h" #include "ib.h" #include "net/arp.h" -#include "tcp.h" #include "rds_single_path.h" #include @@ -44,7 +43,9 @@ #define RDS_REJ_CONSUMER_DEFINED 28 +/* Global IPv4 and IPv6 RDS RDMA listener cm_id */ static struct rdma_cm_id *rds_rdma_listen_id; +static struct rdma_cm_id *rds6_rdma_listen_id; int unload_allowed __initdata; @@ -325,6 +326,12 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, return rds_rdma_cm_event_handler_cmn(cm_id, event, false); } +int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + return rds_rdma_cm_event_handler_cmn(cm_id, event, true); +} + static int rds_rdma_listen_init_common(rdma_cm_event_handler handler, struct sockaddr *sa, struct rdma_cm_id **ret_cm_id) @@ -358,7 +365,9 @@ static int rds_rdma_listen_init_common(rdma_cm_event_handler handler, } rdsdebug("cm %p listening on port %u\n", cm_id, - ntohs(((struct sockaddr_in *)sa)->sin_port)); + sa->sa_family == PF_INET ? + ntohs(((struct sockaddr_in *)sa)->sin_port) : + ntohs(((struct sockaddr_in6 *)sa)->sin6_port)); *ret_cm_id = cm_id; cm_id = NULL; @@ -370,13 +379,14 @@ out: /* Initialize the RDS RDMA listeners. We create two listeners for * compatibility reason. The one on RDS_PORT is used for IPv4 - * requests only. The one on RDS_TCP_PORT is used for IPv6 requests + * requests only. The one on RDS_CM_PORT is used for IPv6 requests * only. So only IPv6 enabled RDS module will communicate using this * port. 
*/ static int rds_rdma_listen_init(void) { int ret; + struct sockaddr_in6 sin6; struct sockaddr_in sin; sin.sin_family = PF_INET; @@ -385,7 +395,21 @@ static int rds_rdma_listen_init(void) ret = rds_rdma_listen_init_common(rds_rdma_cm_event_handler, (struct sockaddr *)&sin, &rds_rdma_listen_id); - return ret; + if (ret) + return ret; + + sin6.sin6_family = PF_INET6; + sin6.sin6_addr = in6addr_any; + sin6.sin6_port = htons(RDS_CM_PORT); + sin6.sin6_scope_id = 0; + sin6.sin6_flowinfo = 0; + ret = rds_rdma_listen_init_common(rds6_rdma_cm_event_handler, + (struct sockaddr *)&sin6, + &rds6_rdma_listen_id); + /* Keep going even when IPv6 is not enabled in the system. */ + if (ret) + rdsdebug("Cannot set up IPv6 RDMA listener\n"); + return 0; } static void rds_rdma_listen_stop(void) @@ -395,6 +419,11 @@ static void rds_rdma_listen_stop(void) rdma_destroy_id(rds_rdma_listen_id); rds_rdma_listen_id = NULL; } + if (rds6_rdma_listen_id) { + rdsdebug("cm %p\n", rds6_rdma_listen_id); + rdma_destroy_id(rds6_rdma_listen_id); + rds6_rdma_listen_id = NULL; + } } #define MODULE_NAME "rds_rdma" diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h index 63aa4a091865..065fb17ed497 100644 --- a/net/rds/rdma_transport.h +++ b/net/rds/rdma_transport.h @@ -10,6 +10,8 @@ int rds_rdma_conn_connect(struct rds_connection *conn); int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event); +int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); /* from rdma_transport.c */ int rds_rdma_init(void); diff --git a/net/rds/rds.h b/net/rds/rds.h index ab20763fedff..9049d35b80a8 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -34,13 +34,15 @@ */ #define RDS_ACL_FAILURE 0x04010020 -/* - * XXX randomly chosen, but at least seems to be unused: - * # 18464-18768 Unassigned - * We should do better. We want a reserved port to discourage unpriv'ed - * userspace from listening. 
+/* The following ports, 16385, 18634, 18635, are registered with IANA as + * the ports to be used for RDS over TCP and UDP. 18634 is the historical + * value used for the RDMA_CM listener port. RDS/TCP uses port 16385. After + * IPv6 work, RDMA_CM also uses 16385 as the listener port. 18634 is kept + * to ensure compatibility with older RDS modules. */ #define RDS_PORT 18634 +#define RDS_CM_PORT 16385 +#define RDS_TCP_PORT RDS_CM_PORT #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 @@ -1046,6 +1048,10 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg); void rds_inc_info_copy(struct rds_incoming *inc, struct rds_info_iterator *iter, __be32 saddr, __be32 daddr, int flip); +void rds6_inc_info_copy(struct rds_incoming *inc, + struct rds_info_iterator *iter, + struct in6_addr *saddr, struct in6_addr *daddr, + int flip); int rds_skb_local(struct sk_buff *skb); /* send.c */ diff --git a/net/rds/recv.c b/net/rds/recv.c index a16254643259..a763250c7a25 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -39,7 +39,6 @@ #include #include "rds.h" -#include "tcp.h" /* forward prototypes */ static void @@ -1112,6 +1111,31 @@ void rds_inc_info_copy(struct rds_incoming *inc, rds_info_copy(iter, &minfo, sizeof(minfo)); } +void rds6_inc_info_copy(struct rds_incoming *inc, + struct rds_info_iterator *iter, + struct in6_addr *saddr, struct in6_addr *daddr, + int flip) +{ + struct rds6_info_message minfo6; + + minfo6.seq = be64_to_cpu(inc->i_hdr.h_sequence); + minfo6.len = be32_to_cpu(inc->i_hdr.h_len); + + if (flip) { + minfo6.laddr = *daddr; + minfo6.faddr = *saddr; + minfo6.lport = inc->i_hdr.h_dport; + minfo6.fport = inc->i_hdr.h_sport; + } else { + minfo6.laddr = *saddr; + minfo6.faddr = *daddr; + minfo6.lport = inc->i_hdr.h_sport; + minfo6.fport = inc->i_hdr.h_dport; + } + + rds_info_copy(iter, &minfo6, sizeof(minfo6)); +} + int rds_skb_local(struct sk_buff *skb) { struct rds_nf_hdr *dst, *org; diff --git a/net/rds/send.c b/net/rds/send.c index 
97e239bcbd9f..c43b21b83207 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -36,7 +36,6 @@ #include #include "rds.h" -#include "tcp.h" /* When transmitting messages in rds_send_xmit, we need to emerge from * time to time and briefly release the CPU. Otherwise the softlock watchdog @@ -1251,8 +1250,27 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) break; case sizeof(*sin6): { - ret = -EPROTONOSUPPORT; - goto out; + int addr_type; + + if (sin6->sin6_family != AF_INET6) { + ret = -EINVAL; + goto out; + } + addr_type = ipv6_addr_type(&sin6->sin6_addr); + if (!(addr_type & IPV6_ADDR_UNICAST)) { + ret = -EINVAL; + goto out; + } + if (addr_type & IPV6_ADDR_LINKLOCAL && + sin6->sin6_scope_id == 0) { + ret = -EINVAL; + goto out; + } + + daddr = sin6->sin6_addr; + dport = sin6->sin6_port; + scope_id = sin6->sin6_scope_id; + break; } default: diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 54b58c1c21f8..72d1b915f83f 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -43,7 +43,12 @@ /* only for info exporting */ static DEFINE_SPINLOCK(rds_tcp_tc_list_lock); static LIST_HEAD(rds_tcp_tc_list); + +/* rds_tcp_tc_count counts only IPv4 connections. + * rds6_tcp_tc_count counts both IPv4 and IPv6 connections. 
+ */ static unsigned int rds_tcp_tc_count; +static unsigned int rds6_tcp_tc_count; /* Track rds_tcp_connection structs so they can be cleaned up */ static DEFINE_SPINLOCK(rds_tcp_conn_lock); @@ -111,7 +116,9 @@ void rds_tcp_restore_callbacks(struct socket *sock, /* done under the callback_lock to serialize with write_space */ spin_lock(&rds_tcp_tc_list_lock); list_del_init(&tc->t_list_item); - rds_tcp_tc_count--; + rds6_tcp_tc_count--; + if (!tc->t_cpath->cp_conn->c_isv6) + rds_tcp_tc_count--; spin_unlock(&rds_tcp_tc_list_lock); tc->t_sock = NULL; @@ -198,7 +205,9 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp) /* done under the callback_lock to serialize with write_space */ spin_lock(&rds_tcp_tc_list_lock); list_add_tail(&tc->t_list_item, &rds_tcp_tc_list); - rds_tcp_tc_count++; + if (!tc->t_cpath->cp_conn->c_isv6) + rds_tcp_tc_count++; + rds6_tcp_tc_count++; spin_unlock(&rds_tcp_tc_list_lock); /* accepted sockets need our listen data ready undone */ @@ -219,16 +228,16 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp) write_unlock_bh(&sock->sk->sk_callback_lock); } -static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, +/* Handle RDS_INFO_TCP_SOCKETS socket option. It only returns IPv4 + * connections for backward compatibility. 
+ */ +static void rds_tcp_tc_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens) { struct rds_info_tcp_socket tsinfo; struct rds_tcp_connection *tc; unsigned long flags; - struct sockaddr_in sin; - int sinlen; - struct socket *sock; spin_lock_irqsave(&rds_tcp_tc_list_lock, flags); @@ -236,18 +245,15 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, goto out; list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) { + struct inet_sock *inet = inet_sk(tc->t_sock->sk); - sock = tc->t_sock; - if (sock) { - sock->ops->getname(sock, (struct sockaddr *)&sin, - &sinlen, 0); - tsinfo.local_addr = sin.sin_addr.s_addr; - tsinfo.local_port = sin.sin_port; - sock->ops->getname(sock, (struct sockaddr *)&sin, - &sinlen, 1); - tsinfo.peer_addr = sin.sin_addr.s_addr; - tsinfo.peer_port = sin.sin_port; - } + if (tc->t_cpath->cp_conn->c_isv6) + continue; + + tsinfo.local_addr = inet->inet_saddr; + tsinfo.local_port = inet->inet_sport; + tsinfo.peer_addr = inet->inet_daddr; + tsinfo.peer_port = inet->inet_dport; tsinfo.hdr_rem = tc->t_tinc_hdr_rem; tsinfo.data_rem = tc->t_tinc_data_rem; @@ -265,6 +271,48 @@ out: spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags); } +/* Handle RDS6_INFO_TCP_SOCKETS socket option. It returns both IPv4 and + * IPv6 connections. IPv4 connection address is returned in an IPv4 mapped + * address. 
+ */ +static void rds6_tcp_tc_info(struct socket *sock, unsigned int len, + struct rds_info_iterator *iter, + struct rds_info_lengths *lens) +{ + struct rds6_info_tcp_socket tsinfo6; + struct rds_tcp_connection *tc; + unsigned long flags; + + spin_lock_irqsave(&rds_tcp_tc_list_lock, flags); + + if (len / sizeof(tsinfo6) < rds6_tcp_tc_count) + goto out; + + list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) { + struct sock *sk = tc->t_sock->sk; + struct inet_sock *inet = inet_sk(sk); + + tsinfo6.local_addr = sk->sk_v6_rcv_saddr; + tsinfo6.local_port = inet->inet_sport; + tsinfo6.peer_addr = sk->sk_v6_daddr; + tsinfo6.peer_port = inet->inet_dport; + + tsinfo6.hdr_rem = tc->t_tinc_hdr_rem; + tsinfo6.data_rem = tc->t_tinc_data_rem; + tsinfo6.last_sent_nxt = tc->t_last_sent_nxt; + tsinfo6.last_expected_una = tc->t_last_expected_una; + tsinfo6.last_seen_una = tc->t_last_seen_una; + + rds_info_copy(iter, &tsinfo6, sizeof(tsinfo6)); + } + +out: + lens->nr = rds6_tcp_tc_count; + lens->each = sizeof(tsinfo6); + + spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags); +} + static int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr, __u32 scope_id) { @@ -469,13 +517,18 @@ static __net_init int rds_tcp_init_net(struct net *net) err = -ENOMEM; goto fail; } - rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net); + rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, true); if (!rtn->rds_tcp_listen_sock) { - pr_warn("could not set up listen sock\n"); - unregister_net_sysctl_table(rtn->rds_tcp_sysctl); - rtn->rds_tcp_sysctl = NULL; - err = -EAFNOSUPPORT; - goto fail; + pr_warn("could not set up IPv6 listen sock\n"); + + /* Try IPv4 as some systems disable IPv6 */ + rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false); + if (!rtn->rds_tcp_listen_sock) { + unregister_net_sysctl_table(rtn->rds_tcp_sysctl); + rtn->rds_tcp_sysctl = NULL; + err = -EAFNOSUPPORT; + goto fail; + } } INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker); return 0; @@ -642,6 +695,7 
@@ static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, static void __exit rds_tcp_exit(void) { rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); + rds_info_deregister_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info); unregister_pernet_subsys(&rds_tcp_net_ops); if (unregister_netdevice_notifier(&rds_tcp_dev_notifier)) pr_warn("could not unregister rds_tcp_dev_notifier\n"); @@ -681,6 +735,7 @@ static int __init rds_tcp_init(void) ret = rds_trans_register(&rds_tcp_transport); rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); + rds_info_register_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info); goto out; diff --git a/net/rds/tcp.h b/net/rds/tcp.h index e3eff146bd0b..c4fbafc2ef5a 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -1,8 +1,6 @@ #ifndef _RDS_TCP_H #define _RDS_TCP_H -#define RDS_TCP_PORT 16385 - struct rds_tcp_incoming { struct rds_incoming ti_inc; struct sk_buff_head ti_skb_list; @@ -65,7 +63,7 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp); void rds_tcp_state_change(struct sock *sk); /* tcp_listen.c */ -struct socket *rds_tcp_listen_init(struct net *); +struct socket *rds_tcp_listen_init(struct net *net, bool isv6); void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor); void rds_tcp_listen_data_ready(struct sock *sk); int rds_tcp_accept_one(struct socket *sock); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 01449f3fb358..a99329692780 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -83,9 +83,11 @@ out: int rds_tcp_conn_path_connect(struct rds_conn_path *cp) { struct socket *sock = NULL; + struct sockaddr_in6 sin6; struct sockaddr_in sin; struct sockaddr *addr; int addrlen; + bool isv6; int ret; struct rds_connection *conn = cp->cp_conn; struct rds_tcp_connection *tc = cp->cp_transport_data; @@ -103,18 +105,35 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) return 0; } - ret = sock_create_kern(rds_conn_net(conn), PF_INET, SOCK_STREAM, - 
IPPROTO_TCP, &sock); + if (ipv6_addr_v4mapped(&conn->c_laddr)) { + ret = sock_create_kern(rds_conn_net(conn), PF_INET, + SOCK_STREAM, IPPROTO_TCP, &sock); + isv6 = false; + } else { + ret = sock_create_kern(rds_conn_net(conn), PF_INET6, + SOCK_STREAM, IPPROTO_TCP, &sock); + isv6 = true; + } if (ret < 0) goto out; rds_tcp_tune(sock); - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = (__force u32)conn->c_laddr.s6_addr32[3]; - sin.sin_port = (__force u16)htons(0); - addr = (struct sockaddr *)&sin; - addrlen = sizeof(sin); + if (isv6) { + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = conn->c_laddr; + sin6.sin6_port = 0; + sin6.sin6_flowinfo = 0; + sin6.sin6_scope_id = conn->c_dev_if; + addr = (struct sockaddr *)&sin6; + addrlen = sizeof(sin6); + } else { + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = (__force u32)conn->c_laddr.s6_addr32[3]; + sin.sin_port = (__force u16)htons(0); + addr = (struct sockaddr *)&sin; + addrlen = sizeof(sin); + } ret = sock->ops->bind(sock, addr, addrlen); if (ret) { @@ -123,11 +142,21 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) goto out; } - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = (__force u32)conn->c_faddr.s6_addr32[3]; - sin.sin_port = (__force u16)htons(RDS_TCP_PORT); - addr = (struct sockaddr *)&sin; - addrlen = sizeof(sin); + if (isv6) { + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = conn->c_faddr; + sin6.sin6_port = htons(RDS_TCP_PORT); + sin6.sin6_flowinfo = 0; + sin6.sin6_scope_id = conn->c_dev_if; + addr = (struct sockaddr *)&sin6; + addrlen = sizeof(sin6); + } else { + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = (__force u32)conn->c_faddr.s6_addr32[3]; + sin.sin_port = (__force u16)htons(RDS_TCP_PORT); + addr = (struct sockaddr *)&sin; + addrlen = sizeof(sin); + } /* * once we call connect() we can start getting callbacks and they diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 7951f5eaf021..072ef91d1015 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ 
-146,7 +146,8 @@ int rds_tcp_accept_one(struct socket *sock) inet = inet_sk(new_sock->sk); - rdsdebug("accepted tcp %pI6c:%u -> %pI6c:%u\n", + rdsdebug("accepted family %d tcp %pI6c:%u -> %pI6c:%u\n", + sock->sk->sk_family, &new_sock->sk->sk_v6_rcv_saddr, ntohs(inet->inet_sport), &new_sock->sk->sk_v6_daddr, ntohs(inet->inet_dport)); @@ -240,15 +241,21 @@ out: ready(sk); } -struct socket *rds_tcp_listen_init(struct net *net) +struct socket *rds_tcp_listen_init(struct net *net, bool isv6) { - struct sockaddr_in sin; struct socket *sock = NULL; + struct sockaddr_storage ss; + struct sockaddr_in6 *sin6; + struct sockaddr_in *sin; + int addr_len; int ret; - ret = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); - if (ret < 0) + ret = sock_create_kern(net, isv6 ? PF_INET6 : PF_INET, SOCK_STREAM, + IPPROTO_TCP, &sock); + if (ret < 0) { + rdsdebug("could not create listener socket: %d\n", ret); goto out; + } sock->sk->sk_reuse = 1; rds_tcp_nonagle(sock); @@ -258,13 +265,28 @@ struct socket *rds_tcp_listen_init(struct net *net) sock->sk->sk_data_ready = rds_tcp_listen_data_ready; write_unlock_bh(&sock->sk->sk_callback_lock); - sin.sin_family = PF_INET; - sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); - sin.sin_port = (__force u16)htons(RDS_TCP_PORT); + if (isv6) { + sin6 = (struct sockaddr_in6 *)&ss; + sin6->sin6_family = PF_INET6; + sin6->sin6_addr = in6addr_any; + sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT); + sin6->sin6_scope_id = 0; + sin6->sin6_flowinfo = 0; + addr_len = sizeof(*sin6); + } else { + sin = (struct sockaddr_in *)&ss; + sin->sin_family = PF_INET; + sin->sin_addr.s_addr = INADDR_ANY; + sin->sin_port = (__force u16)htons(RDS_TCP_PORT); + addr_len = sizeof(*sin); + } - ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); - if (ret < 0) + ret = sock->ops->bind(sock, (struct sockaddr *)&ss, addr_len); + if (ret < 0) { + rdsdebug("could not bind %s listener socket: %d\n", + isv6 ? 
"IPv6" : "IPv4", ret); goto out; + } ret = sock->ops->listen(sock, 64); if (ret < 0) diff --git a/net/rds/threads.c b/net/rds/threads.c index 70a9b55070f0..4985baca4f38 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -34,7 +34,7 @@ #include #include "rds.h" -#include "tcp.h" + static unsigned int rds_conn_hb_timeout = 0; module_param(rds_conn_hb_timeout, int, 0444); MODULE_PARM_DESC(rds_conn_hb_timeout, " Connection heartbeat timeout");