From 4d2cc57ee46bca9a504a4bb2face13c7a189af49 Mon Sep 17 00:00:00 2001 From: Ka-Cheong Poon Date: Fri, 20 Oct 2017 02:08:36 -0700 Subject: [PATCH] rds: Changed IP address internal representation to struct in6_addr MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This patch changed the internal representation of an IP address to use struct in6_addr. IPv4 address is stored as an IPv4 mapped address. All the functions which take an IP address as argument are also changed to use struct in6_addr. But RDS socket layer is not modified such that it still does not accept IPv6 address from an application. And RDS layer does not accept nor initiate IPv6 connections. The RDS netfilter header is changed. User of RDS netfilter will need to be re-compiled. Orabug: 25410192 Signed-off-by: Ka-Cheong Poon Reviewed-by: Håkon Bugge --- include/uapi/linux/rds.h | 12 +- net/rds/af_rds.c | 147 +++++++++++------ net/rds/bind.c | 100 ++++++----- net/rds/cong.c | 34 ++-- net/rds/connection.c | 167 +++++++++++-------- net/rds/ib.c | 23 +-- net/rds/ib.h | 52 ++++-- net/rds/ib_cm.c | 346 ++++++++++++++++++++++++++++----------- net/rds/ib_rdma.c | 17 +- net/rds/ib_recv.c | 28 ++-- net/rds/ib_send.c | 8 +- net/rds/loop.c | 10 +- net/rds/rdma.c | 6 +- net/rds/rdma_transport.c | 117 +++++++------ net/rds/rds.h | 78 +++++---- net/rds/recv.c | 133 ++++++++------- net/rds/send.c | 77 ++++++--- net/rds/tcp.c | 30 +++- net/rds/tcp_connect.c | 40 +++-- net/rds/tcp_listen.c | 17 +- net/rds/tcp_recv.c | 9 +- net/rds/tcp_send.c | 4 +- net/rds/threads.c | 71 ++++++-- net/rds/transport.c | 15 +- 24 files changed, 1002 insertions(+), 539 deletions(-) diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index 219c84630919..ac631250c04b 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2008 Oracle. All rights reserved. + * Copyright (c) 2008, 2017 Oracle and/or its affiliates. All rights reserved. 
* * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -370,11 +370,11 @@ struct rds_rdma_send_notify { /* netfilter related components */ struct rds_nf_hdr { - __be32 saddr; /* source address of request */ - __be32 daddr; /* destination address */ - __be16 sport; /* source port number */ - __be16 dport; /* destination port number */ - __be16 protocol; /* rds socket protocol family to use */ + struct in6_addr saddr; /* source address of request */ + struct in6_addr daddr; /* destination address */ + __be16 sport; /* source port number */ + __be16 dport; /* destination port number */ + __be16 protocol; /* rds socket protocol family to use */ #define RDS_NF_HDR_FLAG_BOTH (0x1) /* request needs to go locally and remote */ #define RDS_NF_HDR_FLAG_DONE (0x2) /* the request is consumed and done */ diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 1fb7b34b8553..e84b71dc9dc4 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -145,26 +146,51 @@ void rds_wake_sk_sleep(struct rds_sock *rs) static int rds_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer) { - struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; struct rds_sock *rs = rds_sk_to_rs(sock->sk); - - memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + struct sockaddr_in6 *sin6; + struct sockaddr_in *sin; /* racey, don't care */ if (peer) { - if (!rs->rs_conn_addr) + if (ipv6_addr_any(&rs->rs_conn_addr)) return -ENOTCONN; - sin->sin_port = rs->rs_conn_port; - sin->sin_addr.s_addr = rs->rs_conn_addr; + if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) { + sin = (struct sockaddr_in *)uaddr; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + sin->sin_family = AF_INET; + sin->sin_port = rs->rs_conn_port; + sin->sin_addr.s_addr = rs->rs_conn_addr_v4; + *uaddr_len = sizeof(*sin); + } else { + sin6 = (struct sockaddr_in6 
*)uaddr; + sin6->sin6_family = AF_INET6; + sin6->sin6_port = rs->rs_conn_port; + sin6->sin6_addr = rs->rs_conn_addr; + sin6->sin6_flowinfo = 0; + /* scope_id is the same as in the bound address. */ + sin6->sin6_scope_id = rs->rs_bound_scope_id; + *uaddr_len = sizeof(*sin6); + } } else { - sin->sin_port = rs->rs_bound_port; - sin->sin_addr.s_addr = rs->rs_bound_addr; + if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) { + sin = (struct sockaddr_in *)uaddr; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + sin->sin_family = AF_INET; + sin->sin_port = rs->rs_bound_port; + sin->sin_addr.s_addr = rs->rs_bound_addr_v4; + *uaddr_len = sizeof(*sin); + } else { + sin6 = (struct sockaddr_in6 *)uaddr; + sin6->sin6_family = AF_INET6; + sin6->sin6_port = rs->rs_bound_port; + sin6->sin6_addr = rs->rs_bound_addr; + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = rs->rs_bound_scope_id; + *uaddr_len = sizeof(*sin6); + } } - sin->sin_family = AF_INET; - - *uaddr_len = sizeof(*sin); return 0; } @@ -269,11 +295,12 @@ static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, int len) { + struct sockaddr_in6 sin6; struct sockaddr_in sin; int ret = 0; /* racing with another thread binding seems ok here */ - if (rs->rs_bound_addr == 0) { + if (ipv6_addr_any(&rs->rs_bound_addr)) { ret = -ENOTCONN; /* XXX not a great errno */ goto out; } @@ -281,14 +308,23 @@ static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, if (len < sizeof(struct sockaddr_in)) { ret = -EINVAL; goto out; + } else if (len < sizeof(struct sockaddr_in6)) { + /* Assume IPv4 */ + if (copy_from_user(&sin, optval, sizeof(struct sockaddr_in))) { + ret = -EFAULT; + goto out; + } + ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr); + sin6.sin6_port = sin.sin_port; + } else { + if (copy_from_user(&sin6, optval, + sizeof(struct sockaddr_in6))) { + ret = -EFAULT; + goto out; + } } - if (copy_from_user(&sin, 
optval, sizeof(sin))) { - ret = -EFAULT; - goto out; - } - - rds_send_drop_to(rs, &sin); + rds_send_drop_to(rs, &sin6); out: return ret; } @@ -346,6 +382,7 @@ static int rds_user_reset(struct rds_sock *rs, char __user *optval, int optlen) { struct rds_reset reset; struct rds_connection *conn; + struct in6_addr src6, dst6; LIST_HEAD(s_addr_conns); if (optlen != sizeof(struct rds_reset)) @@ -356,30 +393,32 @@ static int rds_user_reset(struct rds_sock *rs, char __user *optval, int optlen) return -EFAULT; /* Reset all conns associated with source addr */ + ipv6_addr_set_v4mapped(reset.src.s_addr, &src6); if (reset.dst.s_addr == 0) { pr_info("RDS: Reset ALL conns for Source %pI4\n", &reset.src.s_addr); rds_conn_laddr_list(sock_net(rds_rs_to_sk(rs)), - reset.src.s_addr, &s_addr_conns); + &src6, &s_addr_conns); if (list_empty(&s_addr_conns)) goto done; list_for_each_entry(conn, &s_addr_conns, c_laddr_node) if (conn) - rds_user_conn_paths_drop(conn, 1); + rds_conn_drop(conn, DR_USER_RESET); goto done; } - conn = rds_conn_find(sock_net(rds_rs_to_sk(rs)), - reset.src.s_addr, reset.dst.s_addr, - rs->rs_transport, reset.tos); + ipv6_addr_set_v4mapped(reset.dst.s_addr, &dst6); + conn = rds_conn_find(sock_net(rds_rs_to_sk(rs)), &src6, &dst6, + rs->rs_transport, reset.tos, + rs->rs_bound_scope_id); if (conn) { bool is_tcp = conn->c_trans->t_type == RDS_TRANS_TCP; printk(KERN_NOTICE "Resetting RDS/%s connection <%pI4,%pI4,%d>\n", - is_tcp ? "tcp" : "IB", + is_tcp ? 
"TCP" : "IB", &reset.src.s_addr, &reset.dst.s_addr, conn->c_tos); rds_user_conn_paths_drop(conn, DR_USER_RESET); @@ -571,31 +610,41 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; - struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + struct sockaddr_in *sin; struct rds_sock *rs = rds_sk_to_rs(sk); int ret = 0; lock_sock(sk); - if (addr_len != sizeof(struct sockaddr_in)) { - ret = -EINVAL; - goto out; - } + switch (addr_len) { + case sizeof(struct sockaddr_in): + sin = (struct sockaddr_in *)uaddr; + if (sin->sin_family != AF_INET) { + ret = -EAFNOSUPPORT; + break; + } + if (sin->sin_addr.s_addr == INADDR_ANY) { + ret = -EDESTADDRREQ; + break; + } + if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) || + sin->sin_addr.s_addr == INADDR_BROADCAST) { + ret = -EINVAL; + break; + } + ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr); + rs->rs_conn_port = sin->sin_port; + break; - if (sin->sin_family != AF_INET) { - ret = -EAFNOSUPPORT; - goto out; - } + case sizeof(struct sockaddr_in6): + ret = -EPROTONOSUPPORT; + break; - if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) { - ret = -EDESTADDRREQ; - goto out; + default: + ret = -EINVAL; + break; } - rs->rs_conn_addr = sin->sin_addr.s_addr; - rs->rs_conn_port = sin->sin_port; - -out: release_sock(sk); return ret; } @@ -659,8 +708,10 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol) rs->rs_netfilter_enabled = 0; rs->rs_rx_traces = 0; - if (rs->rs_bound_addr) - printk(KERN_CRIT "bound addr %x at create\n", rs->rs_bound_addr); + if (!ipv6_addr_any(&rs->rs_bound_addr)) { + printk(KERN_CRIT "bound addr %pI6c at create\n", + &rs->rs_bound_addr); + } spin_lock_bh(&rds_sock_lock); list_add_tail(&rs->rs_item, &rds_sock_list); @@ -756,8 +807,10 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len, list_for_each_entry(inc, &rs->rs_recv_queue, i_item) { total++; if (total <= len) - 
rds_inc_info_copy(inc, iter, inc->i_saddr, - rs->rs_bound_addr, 1); + rds_inc_info_copy(inc, iter, + inc->i_saddr.s6_addr32[3], + rs->rs_bound_addr_v4, + 1); } read_unlock(&rs->rs_recv_lock); @@ -786,8 +839,8 @@ static void rds_sock_info(struct socket *sock, unsigned int len, list_for_each_entry(rs, &rds_sock_list, rs_item) { sinfo.sndbuf = rds_sk_sndbuf(rs); sinfo.rcvbuf = rds_sk_rcvbuf(rs); - sinfo.bound_addr = rs->rs_bound_addr; - sinfo.connected_addr = rs->rs_conn_addr; + sinfo.bound_addr = rs->rs_bound_addr_v4; + sinfo.connected_addr = rs->rs_conn_addr_v4; sinfo.bound_port = rs->rs_bound_port; sinfo.connected_port = rs->rs_conn_port; sinfo.inum = sock_i_ino(rds_rs_to_sk(rs)); @@ -913,8 +966,8 @@ static void __exit rds_exit(void) rds_page_exit(); rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info); rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info); - } + module_exit(rds_exit); u32 rds_gen_num; diff --git a/net/rds/bind.c b/net/rds/bind.c index da29cdf6644b..9676d565433b 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include "rds.h" @@ -45,29 +46,31 @@ struct bind_bucket { #define BIND_HASH_SIZE 8192 static struct bind_bucket bind_hash_table[BIND_HASH_SIZE]; -static struct bind_bucket *hash_to_bucket(__be32 addr, __be16 port) +static struct bind_bucket *hash_to_bucket(struct in6_addr *addr, __be16 port) { - return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) & - (BIND_HASH_SIZE - 1)); + return bind_hash_table + + (jhash_3words(addr->s6_addr32[0] ^ addr->s6_addr32[1], + addr->s6_addr32[2] ^ addr->s6_addr32[3], + (u32)port, 0) & (BIND_HASH_SIZE - 1)); } /* * must hold either read or write lock (write lock for insert != NULL) */ static struct rds_sock *rds_bind_lookup(struct bind_bucket *bucket, - __be32 addr, __be16 port, - struct rds_sock *insert) + const struct in6_addr *addr, + __be16 port, + struct rds_sock *insert, + __u32 scope_id) { struct rds_sock *rs; struct hlist_head *head = &bucket->head; - u64 cmp; - u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); + u16 lport = be16_to_cpu(port); hlist_for_each_entry(rs, head, rs_bound_node) { - cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | - be16_to_cpu(rs->rs_bound_port); - - if (cmp == needle) { + if (lport == be16_to_cpu(rs->rs_bound_port) && + ipv6_addr_equal(addr, &rs->rs_bound_addr) && + rs->rs_bound_scope_id == scope_id) { rds_sock_addref(rs); return rs; } @@ -78,8 +81,9 @@ static struct rds_sock *rds_bind_lookup(struct bind_bucket *bucket, * make sure our addr and port are set before * we are added to the list. */ - insert->rs_bound_addr = addr; + insert->rs_bound_addr = *addr; insert->rs_bound_port = port; + insert->rs_bound_scope_id = scope_id; rds_sock_addref(insert); hlist_add_head(&insert->rs_bound_node, head); @@ -93,14 +97,15 @@ static struct rds_sock *rds_bind_lookup(struct bind_bucket *bucket, * The rx path can race with rds_release. 
We notice if rds_release() has * marked this socket and don't return a rs ref to the rx path. */ -struct rds_sock *rds_find_bound(__be32 addr, __be16 port) +struct rds_sock *rds_find_bound(struct in6_addr *addr, __be16 port, + __u32 scope_id) { struct rds_sock *rs; unsigned long flags; struct bind_bucket *bucket = hash_to_bucket(addr, port); read_lock_irqsave(&bucket->lock, flags); - rs = rds_bind_lookup(bucket, addr, port, NULL); + rs = rds_bind_lookup(bucket, addr, port, NULL, scope_id); read_unlock_irqrestore(&bucket->lock, flags); if (rs && sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) { @@ -108,14 +113,15 @@ struct rds_sock *rds_find_bound(__be32 addr, __be16 port) rs = NULL; } - rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, - ntohs(port)); + rdsdebug("returning rs %p for %pI6c:%u\n", rs, addr, + ntohs(port)); return rs; } /* returns -ve errno or +ve port */ -static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) +static int rds_add_bound(struct rds_sock *rs, struct in6_addr *addr, + __be16 *port, __u32 scope_id) { unsigned long flags; int ret = -EADDRINUSE; @@ -138,14 +144,15 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) bucket = hash_to_bucket(addr, cpu_to_be16(rover)); write_lock_irqsave(&bucket->lock, flags); - rrs = rds_bind_lookup(bucket, addr, cpu_to_be16(rover), rs); + rrs = rds_bind_lookup(bucket, addr, cpu_to_be16(rover), rs, + scope_id); write_unlock_irqrestore(&bucket->lock, flags); if (!rrs) { *port = rs->rs_bound_port; ret = 0; - rdsdebug("rs %p binding to %pI4:%d\n", - rs, &addr, (int)ntohs(*port)); + rdsdebug("rs %p binding to %pI6c:%d\n", + rs, addr, (int)ntohs(*port)); break; } else rds_sock_put(rrs); @@ -158,18 +165,18 @@ void rds_remove_bound(struct rds_sock *rs) { unsigned long flags; struct bind_bucket *bucket = - hash_to_bucket(rs->rs_bound_addr, rs->rs_bound_port); + hash_to_bucket(&rs->rs_bound_addr, rs->rs_bound_port); write_lock_irqsave(&bucket->lock, flags); - if (rs->rs_bound_addr) { 
- rdsdebug("rs %p unbinding from %pI4:%d\n", - rs, &rs->rs_bound_addr, - ntohs(rs->rs_bound_port)); + if (!ipv6_addr_any(&rs->rs_bound_addr)) { + rdsdebug("rs %p unbinding from %pI6c:%d\n", + rs, &rs->rs_bound_addr, + ntohs(rs->rs_bound_port)); hlist_del_init(&rs->rs_bound_node); rds_sock_put(rs); - rs->rs_bound_addr = 0; + rs->rs_bound_addr = in6addr_any; } write_unlock_irqrestore(&bucket->lock, flags); @@ -178,29 +185,38 @@ void rds_remove_bound(struct rds_sock *rs) int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; - struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; struct rds_sock *rs = rds_sk_to_rs(sk); + struct in6_addr v6addr, *binding_addr; struct rds_transport *trans; + __u32 scope_id = 0; int ret = 0; + __be16 port; + + if (addr_len == sizeof(struct sockaddr_in)) { + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + + if (sin->sin_family != AF_INET || + sin->sin_addr.s_addr == INADDR_ANY) + return -EINVAL; + ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr); + binding_addr = &v6addr; + port = sin->sin_port; + } else if (addr_len == sizeof(struct sockaddr_in6)) { + return -EPROTONOSUPPORT; + } else { + return -EINVAL; + } lock_sock(sk); - if (addr_len != sizeof(struct sockaddr_in) || - sin->sin_family != AF_INET || - rs->rs_bound_addr || - sin->sin_addr.s_addr == htonl(INADDR_ANY)) { - ret = -EINVAL; - goto out; - } - - ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port); + ret = rds_add_bound(rs, binding_addr, &port, scope_id); if (ret) goto out; if (rs->rs_transport) { /* previously bound */ trans = rs->rs_transport; if (trans->laddr_check(sock_net(sock->sk), - sin->sin_addr.s_addr) != 0) { + binding_addr, scope_id) != 0) { ret = -ENOPROTOOPT; rds_remove_bound(rs); } else { @@ -208,14 +224,14 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) } goto out; } - trans = rds_trans_get_preferred(sock_net(sock->sk), - sin->sin_addr.s_addr); + trans = 
rds_trans_get_preferred(sock_net(sock->sk), binding_addr, + scope_id); if (!trans) { ret = -EADDRNOTAVAIL; rds_remove_bound(rs); if (printk_ratelimit()) - printk(KERN_INFO "RDS: rds_bind() could not find a transport for %pI4, " - "load rds_tcp or rds_rdma?\n", &sin->sin_addr.s_addr); + printk(KERN_INFO "RDS: rds_bind() could not find a transport for %pI6c, load rds_tcp or rds_rdma?\n", + binding_addr); goto out; } diff --git a/net/rds/cong.c b/net/rds/cong.c index e2540be3deb0..7fe693f7e08b 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Oracle. All rights reserved. + * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -100,7 +100,7 @@ static DEFINE_RWLOCK(rds_cong_monitor_lock); static DEFINE_SPINLOCK(rds_cong_lock); static struct rb_root rds_cong_tree = RB_ROOT; -static struct rds_cong_map *rds_cong_tree_walk(__be32 addr, +static struct rds_cong_map *rds_cong_tree_walk(struct in6_addr *addr, struct rds_cong_map *insert) { struct rb_node **p = &rds_cong_tree.rb_node; @@ -108,12 +108,14 @@ static struct rds_cong_map *rds_cong_tree_walk(__be32 addr, struct rds_cong_map *map; while (*p) { + int diff; parent = *p; map = rb_entry(parent, struct rds_cong_map, m_rb_node); - if (addr < map->m_addr) + diff = rds_addr_cmp(addr, &map->m_addr); + if (diff < 0) p = &(*p)->rb_left; - else if (addr > map->m_addr) + else if (diff > 0) p = &(*p)->rb_right; else return map; @@ -131,7 +133,7 @@ static struct rds_cong_map *rds_cong_tree_walk(__be32 addr, * these bitmaps in the process getting pointers to them. The bitmaps are only * ever freed as the module is removed after all connections have been freed. 
*/ -static struct rds_cong_map *rds_cong_from_addr(__be32 addr) +static struct rds_cong_map *rds_cong_from_addr(struct in6_addr *addr) { struct rds_cong_map *map; struct rds_cong_map *ret = NULL; @@ -143,7 +145,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr) if (!map) return NULL; - map->m_addr = addr; + map->m_addr = *addr; init_waitqueue_head(&map->m_waitq); INIT_LIST_HEAD(&map->m_conn_list); @@ -170,7 +172,7 @@ out: kfree(map); } - rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr)); + rdsdebug("map %p for addr %pI6c\n", ret, addr); return ret; } @@ -226,8 +228,8 @@ void rds_cong_remove_conn(struct rds_connection *conn) int rds_cong_get_maps(struct rds_connection *conn) { - conn->c_lcong = rds_cong_from_addr(conn->c_laddr); - conn->c_fcong = rds_cong_from_addr(conn->c_faddr); + conn->c_lcong = rds_cong_from_addr(&conn->c_laddr); + conn->c_fcong = rds_cong_from_addr(&conn->c_faddr); if (!(conn->c_lcong && conn->c_fcong)) return -ENOMEM; @@ -255,8 +257,8 @@ void rds_cong_queue_updates(struct rds_cong_map *map) void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask) { - rdsdebug("waking map %p for %pI4\n", - map, &map->m_addr); + rdsdebug("waking map %p for %pI6c\n", + map, &map->m_addr); rds_stats_inc(s_cong_update_received); atomic_inc(&rds_cong_generation); if (waitqueue_active(&map->m_waitq)) @@ -304,8 +306,8 @@ void rds_cong_set_bit(struct rds_cong_map *map, __be16 port) unsigned long i; unsigned long off; - rdsdebug("setting congestion for %pI4:%u in map %p\n", - &map->m_addr, ntohs(port), map); + rdsdebug("setting congestion for %pI6c:%u in map %p\n", + &map->m_addr, ntohs(port), map); i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; @@ -318,8 +320,8 @@ void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) unsigned long i; unsigned long off; - rdsdebug("clearing congestion for %pI4:%u in map %p\n", - &map->m_addr, ntohs(port), map); + rdsdebug("clearing congestion 
for %pI6c:%u in map %p\n", + &map->m_addr, ntohs(port), map); i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; @@ -359,7 +361,7 @@ void rds_cong_remove_socket(struct rds_sock *rs) /* update congestion map for now-closed port */ spin_lock_irqsave(&rds_cong_lock, flags); - map = rds_cong_tree_walk(rs->rs_bound_addr, NULL); + map = rds_cong_tree_walk(&rs->rs_bound_addr, NULL); spin_unlock_irqrestore(&rds_cong_lock, flags); if (map && rds_cong_test_bit(map, rs->rs_bound_port)) { diff --git a/net/rds/connection.c b/net/rds/connection.c index 7adea0f53390..6121a186e46d 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -32,7 +32,8 @@ */ #include #include -#include +#include +#include #include "rds.h" #include "loop.h" @@ -48,18 +49,21 @@ static unsigned long rds_conn_count; static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES]; static struct kmem_cache *rds_conn_slab; -static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) +static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr, + const struct in6_addr *faddr) { + static u32 rds6_hash_secret __read_mostly; static u32 rds_hash_secret __read_mostly; - unsigned long hash; + u32 lhash, fhash, hash; net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret)); + net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret)); + + lhash = (__force u32)laddr->s6_addr32[3]; + fhash = __ipv6_addr_jhash(faddr, rds6_hash_secret); + hash = __inet6_ehashfn(lhash, 0, fhash, 0, rds_hash_secret); - /* Pass NULL, don't need struct net for hash */ - hash = __inet_ehashfn(be32_to_cpu(laddr), 0, - be32_to_cpu(faddr), 0, - rds_hash_secret); return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK]; } @@ -71,28 +75,30 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) /* rcu read lock must be held or the connection spinlock */ static struct rds_connection *rds_conn_lookup(struct net *net, struct hlist_head *head, - 
__be32 laddr, __be32 faddr, + const struct in6_addr *laddr, + const struct in6_addr *faddr, struct rds_transport *trans, - u8 tos) + u8 tos, + int dev_if) { struct rds_connection *conn, *ret = NULL; hlist_for_each_entry_rcu(conn, head, c_hash_node) { - if (conn->c_faddr == faddr && conn->c_laddr == laddr && - conn->c_tos == tos && - conn->c_trans == trans && - net == rds_conn_net(conn)) { + if (ipv6_addr_equal(&conn->c_faddr, faddr) && + ipv6_addr_equal(&conn->c_laddr, laddr) && + conn->c_tos == tos && conn->c_trans == trans && + net == rds_conn_net(conn) && + conn->c_dev_if == dev_if) { ret = conn; break; } } - rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret, - &laddr, &faddr); + rdsdebug("returning conn %p for %pI6c -> %pI6c\n", ret, laddr, faddr); return ret; } -void rds_conn_laddr_list(struct net *net, - __be32 laddr, struct list_head *laddr_conns) +void rds_conn_laddr_list(struct net *net, struct in6_addr *laddr, + struct list_head *laddr_conns) { struct rds_connection *conn; struct hlist_head *head; @@ -103,7 +109,7 @@ void rds_conn_laddr_list(struct net *net, for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { hlist_for_each_entry_rcu(conn, head, c_hash_node) - if (conn->c_laddr == laddr && + if (ipv6_addr_equal(&conn->c_laddr, laddr) && net == rds_conn_net(conn)) list_add(&conn->c_laddr_node, laddr_conns); } @@ -121,8 +127,8 @@ void rds_conn_path_reset(struct rds_conn_path *cp) { struct rds_connection *conn = cp->cp_conn; - rdsdebug("connection %pI4 to %pI4 reset\n", - &conn->c_laddr, &conn->c_faddr); + rdsdebug("connection %pI6c to %pI6c reset\n", + &conn->c_laddr, &conn->c_faddr); rds_stats_inc(s_conn_reset); rds_send_path_reset(cp); @@ -172,10 +178,12 @@ static void __rds_conn_path_init(struct rds_connection *conn, * are torn down as the module is removed, if ever. 
*/ static struct rds_connection *__rds_conn_create(struct net *net, - __be32 laddr, __be32 faddr, - struct rds_transport *trans, gfp_t gfp, - u8 tos, - int is_outgoing) + const struct in6_addr *laddr, + const struct in6_addr *faddr, + struct rds_transport *trans, + gfp_t gfp, u8 tos, + int is_outgoing, + int dev_if) { struct rds_connection *conn, *parent = NULL; struct hlist_head *head = rds_conn_bucket(laddr, faddr); @@ -185,12 +193,12 @@ static struct rds_connection *__rds_conn_create(struct net *net, int npaths; rcu_read_lock(); - conn = rds_conn_lookup(net, head, laddr, faddr, trans, tos); - if (conn - && conn->c_loopback - && conn->c_trans != &rds_loop_transport - && laddr == faddr - && !is_outgoing) { + conn = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if); + if (conn && + conn->c_loopback && + conn->c_trans != &rds_loop_transport && + ipv6_addr_equal(laddr, faddr) && + !is_outgoing) { /* This is a looped back IB connection, and we're * called by the code handling the incoming connect. * We need a second connection object into which we @@ -210,8 +218,10 @@ static struct rds_connection *__rds_conn_create(struct net *net, memset(conn, 0, sizeof(*conn)); INIT_HLIST_NODE(&conn->c_hash_node); - conn->c_laddr = laddr; - conn->c_faddr = faddr; + conn->c_laddr = *laddr; + conn->c_isv6 = !ipv6_addr_v4mapped(laddr); + conn->c_faddr = *faddr; + conn->c_dev_if = dev_if; rds_conn_net_set(conn, net); conn->c_tos = tos; @@ -228,7 +238,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, * can bind to the destination address then we'd rather the messages * flow through loopback rather than either transport. 
*/ - loop_trans = rds_trans_get_preferred(net, faddr); + loop_trans = rds_trans_get_preferred(net, faddr, conn->c_dev_if); if (loop_trans) { rds_trans_put(loop_trans); conn->c_loopback = 1; @@ -275,10 +285,10 @@ static struct rds_connection *__rds_conn_create(struct net *net, goto out; } - rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n", - conn, &laddr, &faddr, - trans->t_name ? trans->t_name : "[unknown]", - is_outgoing ? "(outgoing)" : ""); + rdsdebug("allocated conn %p for %pI6c -> %pI6c over %s %s\n", + conn, laddr, faddr, + trans->t_name ? trans->t_name : "[unknown]", + is_outgoing ? "(outgoing)" : ""); /* * Since we ran without holding the conn lock, someone could @@ -304,7 +314,8 @@ static struct rds_connection *__rds_conn_create(struct net *net, /* Creating normal conn */ struct rds_connection *found; - found = rds_conn_lookup(net, head, laddr, faddr, trans, tos); + found = rds_conn_lookup(net, head, laddr, faddr, trans, tos, + dev_if); if (found) { struct rds_conn_path *cp; int i; @@ -336,32 +347,35 @@ out: } struct rds_connection *rds_conn_create(struct net *net, - __be32 laddr, __be32 faddr, - struct rds_transport *trans, - u8 tos, gfp_t gfp) + const struct in6_addr *laddr, + const struct in6_addr *faddr, + struct rds_transport *trans, + u8 tos, gfp_t gfp, int dev_if) { - return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 0); + return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 0, dev_if); } EXPORT_SYMBOL_GPL(rds_conn_create); struct rds_connection *rds_conn_create_outgoing(struct net *net, - __be32 laddr, __be32 faddr, - struct rds_transport *trans, - u8 tos, gfp_t gfp) + struct in6_addr *laddr, + struct in6_addr *faddr, + struct rds_transport *trans, + u8 tos, gfp_t gfp, int dev_if) { - return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 1); + return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 1, dev_if); } EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); -struct rds_connection *rds_conn_find(struct net 
*net, __be32 laddr, - __be32 faddr, struct rds_transport *trans, - u8 tos) +struct rds_connection *rds_conn_find(struct net *net, struct in6_addr *laddr, + struct in6_addr *faddr, + struct rds_transport *trans, u8 tos, + int dev_if) { struct rds_connection *conn; struct hlist_head *head = rds_conn_bucket(laddr, faddr); rcu_read_lock(); - conn = rds_conn_lookup(net, head, laddr, faddr, trans, tos); + conn = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if); rcu_read_unlock(); return conn; @@ -375,7 +389,7 @@ void rds_conn_shutdown(struct rds_conn_path *cp) /* shut it down unless it's down already */ if (!rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_DOWN)) { rds_rtd(RDS_RTD_CM_EXT, - "RDS/%s: shutdown init <%pI4,%pI4,%d>, cn %p, cn->c_p %p\n", + "RDS/%s: shutdown init <%pI6c,%pI6c,%d>, cn %p, cn->c_p %p\n", conn->c_trans->t_type == RDS_TRANS_TCP ? "TCP" : "IB", &conn->c_laddr, &conn->c_faddr, conn->c_tos, conn, conn->c_passive); @@ -505,7 +519,7 @@ void rds_conn_destroy(struct rds_connection *conn, int shutdown) int i; int npaths = (conn->c_trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); - rds_rtd(RDS_RTD_CM, "freeing conn %p <%pI4,%pI4,%d>\n", + rds_rtd(RDS_RTD_CM, "freeing conn %p <%pI6c,%pI6c,%d>\n", conn, &conn->c_laddr, &conn->c_faddr, conn->c_tos); @@ -581,12 +595,17 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, /* XXX too lazy to maintain counts.. 
*/ list_for_each_entry(rm, list, m_conn_item) { + __be32 laddr; + __be32 faddr; + total++; + laddr = conn->c_laddr.s6_addr32[3]; + faddr = conn->c_faddr.s6_addr32[3]; if (total <= len) rds_inc_info_copy(&rm->m_inc, iter, - conn->c_laddr, - conn->c_faddr, + laddr, + faddr, 0); } @@ -664,7 +683,6 @@ void rds_walk_conn_path_info(struct socket *sock, unsigned int len, struct hlist_head *head; struct rds_connection *conn; size_t i; - int j; rcu_read_lock(); @@ -675,17 +693,20 @@ void rds_walk_conn_path_info(struct socket *sock, unsigned int len, i++, head++) { hlist_for_each_entry_rcu(conn, head, c_hash_node) { struct rds_conn_path *cp; - int npaths; - npaths = (conn->c_trans->t_mp_capable ? - RDS_MPATH_WORKERS : 1); - for (j = 0; j < npaths; j++) { - cp = &conn->c_path[j]; + /* XXX We only copy the information from the first + * path for now. The problem is that if there are + * more than one underlying paths, we cannot report + * information of all of them using the exisitng + * API. For example, there is only one next_tx_seq, + * which path's next_tx_seq should we report? It is + * a bug in the design of MPRDS. + */ + cp = conn->c_path; - /* XXX no cp_lock usage.. */ - if (!visitor(cp, buffer)) - continue; - } + /* XXX no cp_lock usage.. 
*/ + if (!visitor(cp, buffer)) + continue; /* We copy as much as we can fit in the buffer, * but we count all items so that the caller @@ -704,13 +725,14 @@ void rds_walk_conn_path_info(struct socket *sock, unsigned int len, static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) { struct rds_info_connection *cinfo = buffer; + struct rds_connection *conn = cp->cp_conn; cinfo->next_tx_seq = cp->cp_next_tx_seq; cinfo->next_rx_seq = cp->cp_next_rx_seq; - cinfo->laddr = cp->cp_conn->c_laddr; - cinfo->faddr = cp->cp_conn->c_faddr; - cinfo->tos = cp->cp_conn->c_tos; - strncpy(cinfo->transport, cp->cp_conn->c_trans->t_name, + cinfo->laddr = conn->c_laddr.s6_addr32[3]; + cinfo->faddr = conn->c_faddr.s6_addr32[3]; + cinfo->tos = conn->c_tos; + strncpy(cinfo->transport, conn->c_trans->t_name, sizeof(cinfo->transport)); cinfo->flags = 0; @@ -767,6 +789,7 @@ void rds_conn_exit(void) rds_conn_message_info_send); rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES, rds_conn_message_info_retrans); + } static char *conn_drop_reasons[] = { @@ -849,7 +872,7 @@ void rds_conn_path_drop(struct rds_conn_path *cp, int reason) cp->cp_reconnect_err = 0; cp->cp_reconnect_racing = 0; if (conn->c_trans->t_type != RDS_TRANS_TCP) - printk(KERN_INFO "RDS/IB: connection <%pI4,%pI4,%d> dropped due to '%s'\n", + printk(KERN_INFO "RDS/IB: connection <%pI6c,%pI6c,%d> dropped due to '%s'\n", &conn->c_laddr, &conn->c_faddr, conn->c_tos, @@ -857,7 +880,7 @@ void rds_conn_path_drop(struct rds_conn_path *cp, int reason) } else if ((cp->cp_reconnect_warn) && (now - cp->cp_reconnect_start > 60)) { - printk(KERN_INFO "RDS/%s: re-connect <%pI4,%pI4,%d> stalling for more than 1 min...(drops=%u err=%d)\n", + printk(KERN_INFO "RDS/%s: re-connect <%pI6c,%pI6c,%d> stalling for more than 1 min...(drops=%u err=%d)\n", conn->c_trans->t_type == RDS_TRANS_TCP ? 
"TCP" : "IB", &conn->c_laddr, &conn->c_faddr, @@ -871,7 +894,7 @@ void rds_conn_path_drop(struct rds_conn_path *cp, int reason) atomic_set(&cp->cp_state, RDS_CONN_ERROR); rds_rtd(RDS_RTD_CM_EXT, - "RDS/%s: queueing shutdown work, conn %p, <%pI4,%pI4,%d>\n", + "RDS/%s: queueing shutdown work, conn %p, <%pI6c,%pI6c,%d>\n", conn->c_trans->t_type == RDS_TRANS_TCP ? "TCP" : "IB", conn, &conn->c_laddr, &conn->c_faddr, conn->c_tos); @@ -898,7 +921,7 @@ void rds_conn_path_connect_if_down(struct rds_conn_path *cp) if (rds_conn_path_state(cp) == RDS_CONN_DOWN && !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags)) { rds_rtd(RDS_RTD_CM_EXT, - "queueing connect work, conn %p, <%pI4,%pI4,%d>\n", + "queueing connect work, conn %p, <%pI6c,%pI6c,%d>\n", conn, &conn->c_laddr, &conn->c_faddr, conn->c_tos); queue_delayed_work(cp->cp_wq, &cp->cp_conn_w, 0); diff --git a/net/rds/ib.c b/net/rds/ib.c index cc465c5a01f7..60637ed5f8f7 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -364,9 +364,11 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, /* We will only ever look at IB transports */ if (conn->c_trans != &rds_ib_transport) return 0; + if (conn->c_isv6) + return 0; - iinfo->src_addr = conn->c_laddr; - iinfo->dst_addr = conn->c_faddr; + iinfo->src_addr = conn->c_laddr.s6_addr32[3]; + iinfo->dst_addr = conn->c_faddr.s6_addr32[3]; memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); @@ -425,20 +427,21 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len, * allowed to influence which paths have priority. We could call userspace * asserting this policy "routing". 
*/ -static int rds_ib_laddr_check(struct net *net, __be32 addr) +static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr, + __u32 scope_id) { int ret; struct rdma_cm_id *cm_id; struct sockaddr_in sin; /* Link-local addresses don't play well with IB */ - if (ipv4_is_linklocal_169(addr)) { + if (ipv4_is_linklocal_169(addr->s6_addr32[3])) { pr_info_once("\n"); pr_info_once("****************************************************\n"); pr_info_once("** WARNING WARNING WARNING WARNING WARNING **\n"); pr_info_once("** **\n"); - pr_info_once("** RDS/IB: Link local address %pI4 NOT SUPPORTED **\n", - &addr); + pr_info_once("** RDS/IB: Link local address %pI6c NOT SUPPORTED **\n", + addr); pr_info_once("** **\n"); pr_info_once("** HAIP IP addresses should not be used on ORACLE **\n"); pr_info_once("** engineered systems **\n"); @@ -457,7 +460,7 @@ static int rds_ib_laddr_check(struct net *net, __be32 addr) memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; - sin.sin_addr.s_addr = addr; + sin.sin_addr.s_addr = addr->s6_addr32[3]; /* rdma_bind_addr will only succeed for IB & iWARP devices */ ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); @@ -466,9 +469,9 @@ static int rds_ib_laddr_check(struct net *net, __be32 addr) if (ret || !cm_id->device || cm_id->device->node_type != RDMA_NODE_IB_CA) ret = -EADDRNOTAVAIL; - rdsdebug("addr %pI4 ret %d node type %d\n", - &addr, ret, - cm_id->device ? cm_id->device->node_type : -1); + rdsdebug("addr %pI6c ret %d node type %d\n", + addr, ret, + cm_id->device ? 
cm_id->device->node_type : -1); rdma_destroy_id(cm_id); diff --git a/net/rds/ib.h b/net/rds/ib.h index 47fd839f865c..b2bcce89f93c 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -87,18 +87,43 @@ struct rds_ib_refill_cache { struct list_head *ready; }; +struct rds_ib_conn_priv_cmn { + u8 ricpc_protocol_major; + u8 ricpc_protocol_minor; + __be16 ricpc_protocol_minor_mask; /* bitmask */ + u8 ricpc_tos; + u8 ricpc_reserved1; + __be16 ricpc_frag_sz; + __be64 ricpc_ack_seq; + __be32 ricpc_credit; /* non-zero enables flow ctl */ +}; + struct rds_ib_connect_private { /* Add new fields at the end, and don't permute existing fields. */ - __be32 dp_saddr; - __be32 dp_daddr; - u8 dp_protocol_major; - u8 dp_protocol_minor; - __be16 dp_protocol_minor_mask; /* bitmask */ - u8 dp_tos; - u8 dp_reserved1; - __be16 dp_frag_sz; - __be64 dp_ack_seq; - __be32 dp_credit; /* non-zero enables flow ctl */ + __be32 dp_saddr; + __be32 dp_daddr; + struct rds_ib_conn_priv_cmn dp_cmn; +}; + +struct rds6_ib_connect_private { + /* Add new fields at the end, and don't permute existing fields. 
*/ + struct in6_addr dp_saddr; + struct in6_addr dp_daddr; + struct rds_ib_conn_priv_cmn dp_cmn; +}; + +#define dp_protocol_major dp_cmn.ricpc_protocol_major +#define dp_protocol_minor dp_cmn.ricpc_protocol_minor +#define dp_protocol_minor_mask dp_cmn.ricpc_protocol_minor_mask +#define dp_tos dp_cmn.ricpc_tos +#define dp_reserved1 dp_cmn.ricpc_reserved1 +#define dp_frag_sz dp_cmn.ricpc_frag_sz +#define dp_ack_seq dp_cmn.ricpc_ack_seq +#define dp_credit dp_cmn.ricpc_credit + +union rds_ib_conn_priv { + struct rds_ib_connect_private ricp_v4; + struct rds6_ib_connect_private ricp_v6; }; struct rds_ib_send_work { @@ -591,8 +616,8 @@ void rds_ib_state_change(struct sock *sk); int rds_ib_listen_init(void); void rds_ib_listen_stop(void); int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, - struct rdma_cm_event *event); -int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id); + struct rdma_cm_event *event, bool isv6); +int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6); void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event); void rds_ib_init_frag(unsigned int version); @@ -602,7 +627,8 @@ int rds_ib_setup_fastreg(struct rds_ib_device *rds_ibdev); void rds_ib_reset_fastreg(struct work_struct *work); /* ib_rdma.c */ -int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); +int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, + struct in6_addr *ipaddr); void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_destroy_nodev_conns(void); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 356312a2fc23..ea87f9afd466 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "rds.h" #include "ib.h" @@ -182,7 +183,7 @@ static u16 rds_ib_set_frag_size(struct rds_connection *conn, u16 dp_frag) ic->i_frag_pages = 
ceil(ic->i_frag_sz, PAGE_SIZE); - pr_debug("RDS/IB: conn <%pI4, %pI4,%d>, Frags : {%d,%d,%d}, updated {%d -> %d}\n", + pr_debug("RDS/IB: conn <%pI6c, %pI6c,%d>, Frags : {%d,%d,%d}, updated {%d -> %d}\n", &conn->c_laddr, &conn->c_faddr, conn->c_tos, ib_init_frag_size / SZ_1K, ic->i_frag_sz / SZ_1K, dp_frag / SZ_1K, current_frag / SZ_1K, ic->i_frag_sz / SZ_1K); @@ -251,27 +252,50 @@ out: */ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event) { - const struct rds_ib_connect_private *dp = NULL; struct rds_ib_connection *ic = conn->c_transport_data; + const union rds_ib_conn_priv *dp = NULL; struct ib_qp_attr qp_attr; + __be16 frag_sz = 0; + __be64 ack_seq = 0; + __be32 credit = 0; + u8 major = 0; + u8 minor = 0; int err; - if (event->param.conn.private_data_len >= sizeof(*dp)) { - dp = event->param.conn.private_data; - - /* make sure it isn't empty data */ - if (dp->dp_protocol_major) { - rds_ib_set_protocol(conn, - RDS_PROTOCOL(dp->dp_protocol_major, - dp->dp_protocol_minor)); - rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); - rds_ib_set_frag_size(conn, be16_to_cpu(dp->dp_frag_sz)); + dp = event->param.conn.private_data; + if (conn->c_isv6) { + if (event->param.conn.private_data_len >= + sizeof(struct rds6_ib_connect_private)) { + major = dp->ricp_v6.dp_protocol_major; + minor = dp->ricp_v6.dp_protocol_minor; + credit = dp->ricp_v6.dp_credit; + frag_sz = dp->ricp_v6.dp_frag_sz; + /* dp structure start is not guaranteed to be 8 bytes + * aligned. Since dp_ack_seq is 64-bit extended load + * operations can be used so go through get_unaligned + * to avoid unaligned errors. 
+ */ + ack_seq = get_unaligned(&dp->ricp_v6.dp_ack_seq); } + } else if (event->param.conn.private_data_len >= + sizeof(struct rds_ib_connect_private)) { + major = dp->ricp_v4.dp_protocol_major; + minor = dp->ricp_v4.dp_protocol_minor; + credit = dp->ricp_v4.dp_credit; + frag_sz = dp->ricp_v4.dp_frag_sz; + ack_seq = get_unaligned(&dp->ricp_v4.dp_ack_seq); + } + + /* make sure it isn't empty data */ + if (major) { + rds_ib_set_protocol(conn, RDS_PROTOCOL(major, minor)); + rds_ib_set_flow_control(conn, be32_to_cpu(credit)); + rds_ib_set_frag_size(conn, be16_to_cpu(frag_sz)); } if (conn->c_version < RDS_PROTOCOL_VERSION) { if (conn->c_version != RDS_PROTOCOL_COMPAT_VERSION) { - printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed, no longer supported\n", + printk(KERN_NOTICE "RDS/IB: Connection to %pI6c version %u.%u failed, no longer supported\n", &conn->c_faddr, RDS_PROTOCOL_MAJOR(conn->c_version), RDS_PROTOCOL_MINOR(conn->c_version)); @@ -280,7 +304,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even } } - printk(KERN_NOTICE "RDS/IB: %s conn %p i_cm_id %p, frag %dKB, connected <%pI4,%pI4,%d> version %u.%u%s%s\n", + printk(KERN_NOTICE "RDS/IB: %s conn %p i_cm_id %p, frag %dKB, connected <%pI6c,%pI6c,%d> version %u.%u%s%s\n", ic->i_active_side ? 
"Active " : "Passive", conn, ic->i_cm_id, ic->i_frag_sz / SZ_1K, &conn->c_laddr, &conn->c_faddr, conn->c_tos, @@ -326,7 +350,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); /* update ib_device with this local ipaddr */ - err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr); + err = rds_ib_update_ipaddr(ic->rds_ibdev, &conn->c_laddr); if (err) printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err); @@ -334,14 +358,8 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even /* If the peer gave us the last packet it saw, process this as if * we had received a regular ACK. */ if (dp) { - /* dp structure start is not guaranteed to be 8 bytes aligned. - * Since dp_ack_seq is 64-bit extended load operations can be - * used so go through get_unaligned to avoid unaligned errors. - */ - __be64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq); - - if (dp_ack_seq) - rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq), + if (ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(ack_seq), NULL); } @@ -349,11 +367,12 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even } static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, - struct rdma_conn_param *conn_param, - struct rds_ib_connect_private *dp, - u32 protocol_version, - u32 max_responder_resources, - u32 max_initiator_depth, u16 frag) + struct rdma_conn_param *conn_param, + union rds_ib_conn_priv *dp, + u32 protocol_version, + u32 max_responder_resources, + u32 max_initiator_depth, u16 frag, + bool isv6) { struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_device *rds_ibdev = ic->rds_ibdev; @@ -370,26 +389,53 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, if (dp) { memset(dp, 0, sizeof(*dp)); - dp->dp_saddr = conn->c_laddr; - dp->dp_daddr = conn->c_faddr; - dp->dp_protocol_major = 
RDS_PROTOCOL_MAJOR(protocol_version); - dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); - dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); - dp->dp_ack_seq = cpu_to_be64(rds_ib_piggyb_ack(ic)); - dp->dp_tos = conn->c_tos; + if (isv6) { + dp->ricp_v6.dp_saddr = conn->c_laddr; + dp->ricp_v6.dp_daddr = conn->c_faddr; + dp->ricp_v6.dp_protocol_major = + RDS_PROTOCOL_MAJOR(protocol_version); + dp->ricp_v6.dp_protocol_minor = + RDS_PROTOCOL_MINOR(protocol_version); + dp->ricp_v6.dp_protocol_minor_mask = + cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); + dp->ricp_v6.dp_ack_seq = + cpu_to_be64(rds_ib_piggyb_ack(ic)); + dp->ricp_v6.dp_tos = conn->c_tos; + dp->ricp_v6.dp_frag_sz = cpu_to_be16(frag); + + conn_param->private_data = &dp->ricp_v6; + conn_param->private_data_len = sizeof(dp->ricp_v6); + } else { + dp->ricp_v4.dp_saddr = conn->c_laddr.s6_addr32[3]; + dp->ricp_v4.dp_daddr = conn->c_faddr.s6_addr32[3]; + dp->ricp_v4.dp_protocol_major = + RDS_PROTOCOL_MAJOR(protocol_version); + dp->ricp_v4.dp_protocol_minor = + RDS_PROTOCOL_MINOR(protocol_version); + dp->ricp_v4.dp_protocol_minor_mask = + cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); + dp->ricp_v4.dp_ack_seq = + cpu_to_be64(rds_ib_piggyb_ack(ic)); + dp->ricp_v4.dp_tos = conn->c_tos; + dp->ricp_v4.dp_frag_sz = cpu_to_be16(frag); + + conn_param->private_data = &dp->ricp_v4; + conn_param->private_data_len = sizeof(dp->ricp_v4); + } /* Advertise flow control */ if (ic->i_flowctl) { unsigned int credits; - credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)); - dp->dp_credit = cpu_to_be32(credits); - atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits); + credits = IB_GET_POST_CREDITS( + atomic_read(&ic->i_credits)); + if (isv6) + dp->ricp_v6.dp_credit = cpu_to_be32(credits); + else + dp->ricp_v4.dp_credit = cpu_to_be32(credits); + atomic_sub(IB_SET_POST_CREDITS(credits), + &ic->i_credits); } - - dp->dp_frag_sz = cpu_to_be16(frag); - conn_param->private_data = dp; - 
conn_param->private_data_len = sizeof(*dp); } } @@ -476,7 +522,7 @@ static void poll_rcq(struct rds_ib_connection *ic, struct ib_cq *cq, while ((nr = ib_poll_cq(cq, RDS_WC_MAX, wcs)) > 0) { for (i = 0; i < nr; i++) { if ((++ic->i_rx_poll_cq % RDS_IB_RX_LIMIT) == 0) { - rdsdebug("connection <%pI4,%pI4,%d> RX poll_cq processed %d\n", + rdsdebug("connection <%pI6c,%pI6c,%d> RX poll_cq processed %d\n", &ic->conn->c_laddr, &ic->conn->c_faddr, ic->conn->c_tos, @@ -617,7 +663,7 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data) break; default: rds_rtd(RDS_RTD_ERR, - "Fatal QP Event %u (%s) - connection %pI4->%pI4 tos %d, reconnecting\n", + "Fatal QP Event %u (%s) - connection %pI6c->%pI6c tos %d, reconnecting\n", event->event, rds_ib_event_str(event->event), &conn->c_laddr, &conn->c_faddr, conn->c_tos); rds_conn_drop(conn, DR_IB_QP_EVENT); @@ -834,11 +880,13 @@ out: return ret; } -static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event) +static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6) { - const struct rds_ib_connect_private *dp = event->param.conn.private_data; - u16 common; + const union rds_ib_conn_priv *dp = event->param.conn.private_data; + u8 data_len, major, minor; u32 version = 0; + __be16 mask; + u16 common; /* * rdma_cm private data is odd - when there is any private data in the @@ -857,59 +905,132 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event) return 0; } + if (isv6) { + data_len = sizeof(struct rds6_ib_connect_private); + major = dp->ricp_v6.dp_protocol_major; + minor = dp->ricp_v6.dp_protocol_minor; + mask = dp->ricp_v6.dp_protocol_minor_mask; + } else { + data_len = sizeof(struct rds_ib_connect_private); + major = dp->ricp_v4.dp_protocol_major; + minor = dp->ricp_v4.dp_protocol_minor; + mask = dp->ricp_v4.dp_protocol_minor_mask; + } /* Even if len is crap *now* I still want to check it. 
-ASG */ - if (event->param.conn.private_data_len < sizeof(*dp) - || dp->dp_protocol_major == 0) + if (event->param.conn.private_data_len < data_len || major == 0) return RDS_PROTOCOL_4_0; - common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS; - if (dp->dp_protocol_major == 4 && common) { + common = be16_to_cpu(mask) & RDS_IB_SUPPORTED_PROTOCOLS; + if (major == 4 && common) { version = RDS_PROTOCOL_4_0; while ((common >>= 1) != 0) version++; - } else if (RDS_PROTOCOL_COMPAT_VERSION == - RDS_PROTOCOL(dp->dp_protocol_major, dp->dp_protocol_minor)) { + } else if (RDS_PROTOCOL_COMPAT_VERSION == RDS_PROTOCOL(major, minor)) { version = RDS_PROTOCOL_COMPAT_VERSION; - } else if (printk_ratelimit()) { - printk(KERN_NOTICE "RDS: Connection from %pI4 using " - "incompatible protocol version %u.%u\n", - &dp->dp_saddr, - dp->dp_protocol_major, - dp->dp_protocol_minor); + } else { + if (isv6) { + printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI6c using incompatible protocol version %u.%u\n", + &dp->ricp_v6.dp_saddr, major, minor); + } else { + printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n", + &dp->ricp_v4.dp_saddr, major, minor); + } } return version; } +/* Given an IPv6 address, find the IB net_device which hosts that address and + * return its index. This is used by the rds_ib_cm_handle_connect() code to + * find the interface index of where an incoming request comes from when + * the request is using a link local address. + * + * Note one problem in this search. It is possible that two interfaces have + * the same link local address. Unfortunately, this cannot be solved unless + * the underlying layer gives us the interface which an incoming RDMA connect + * request comes from. 
+ */ +static u32 __rds_find_ifindex(struct net *net, const struct in6_addr *addr) +{ + struct net_device *dev; + int idx = 0; + + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + if (dev->type == ARPHRD_INFINIBAND && + ipv6_chk_addr(net, addr, dev, 0)) { + idx = dev->ifindex; + break; + } + } + rcu_read_unlock(); + + return idx; +} + int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, - struct rdma_cm_event *event) + struct rdma_cm_event *event, bool isv6) { __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id; __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id; - const struct rds_ib_connect_private *dp = event->param.conn.private_data; - struct rds_ib_connect_private dp_rep; - struct rds_connection *conn = NULL; + const struct rds_ib_conn_priv_cmn *dp_cmn; struct rds_ib_connection *ic = NULL; + struct rds_connection *conn = NULL; struct rdma_conn_param conn_param; - u32 version; - int err = 1, destroy = 1; + const union rds_ib_conn_priv *dp; + union rds_ib_conn_priv dp_rep; + struct in6_addr s_mapped_addr; + struct in6_addr d_mapped_addr; + const struct in6_addr *saddr6; + const struct in6_addr *daddr6; + int destroy = 1; int acl_ret = 0; + u32 ifindex = 0; + u32 version; + int err = 1; u16 frag; /* Check whether the remote protocol version matches ours. */ - version = rds_ib_protocol_compatible(event); + version = rds_ib_protocol_compatible(event, isv6); if (!version) goto out; + dp = event->param.conn.private_data; + if (isv6) { + dp_cmn = &dp->ricp_v6.dp_cmn; + saddr6 = &dp->ricp_v6.dp_saddr; + daddr6 = &dp->ricp_v6.dp_daddr; + /* If the local address is link local, need to find the + * interface index in order to create a proper RDS + * connection. + */ + if (ipv6_addr_type(daddr6) & IPV6_ADDR_LINKLOCAL) { + /* Using init_net for now .. */ + ifindex = __rds_find_ifindex(&init_net, daddr6); + /* No index found... Need to bail out. 
*/ + if (ifindex == 0) { + err = -EOPNOTSUPP; + goto out; + } + } + } else { + dp_cmn = &dp->ricp_v4.dp_cmn; + ipv6_addr_set_v4mapped(dp->ricp_v4.dp_saddr, &s_mapped_addr); + ipv6_addr_set_v4mapped(dp->ricp_v4.dp_daddr, &d_mapped_addr); + saddr6 = &s_mapped_addr; + daddr6 = &d_mapped_addr; + } + rds_rtd(RDS_RTD_CM, - "saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid 0x%llx tos %d\n", - &dp->dp_saddr, &dp->dp_daddr, + "saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid 0x%llx tos %d\n", + saddr6, daddr6, RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version), (unsigned long long)be64_to_cpu(lguid), (unsigned long long)be64_to_cpu(fguid), - dp->dp_tos); + dp_cmn->ricpc_tos); - acl_ret = rds_ib_match_acl(cm_id, dp->dp_saddr); + /* XXX IPoIB ACL Only support IPv4 */ + acl_ret = rds_ib_match_acl(cm_id, saddr6->s6_addr32[3]); if (acl_ret < 0) { err = RDS_ACL_FAILURE; rdsdebug("RDS: IB: passive: rds_ib_match_acl failed\n"); @@ -917,8 +1038,10 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, } /* RDS/IB is not currently netns aware, thus init_net */ - conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr, - &rds_ib_transport, dp->dp_tos, GFP_KERNEL); + conn = rds_conn_create(&init_net, daddr6, saddr6, + &rds_ib_transport, dp_cmn->ricpc_tos, + GFP_KERNEL, ifindex); + if (IS_ERR(conn)) { rds_rtd(RDS_RTD_ERR, "rds_conn_create failed (%ld)\n", PTR_ERR(conn)); @@ -965,7 +1088,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, retry = DIV_ROUND_UP(retry, 1000); if (now > conn->c_connection_start && now - conn->c_connection_start > retry) { - pr_info("RDS/IB: conn <%pI4,%pI4,%d> racing for more than %lus, retry\n", + pr_info("RDS/IB: conn <%pI6c,%pI6c,%d> racing for more than %lus, retry\n", &conn->c_laddr, &conn->c_faddr, conn->c_tos, retry); set_bit(RDS_RECONNECT_TIMEDOUT, @@ -988,7 +1111,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, */ conn->c_connection_start = get_seconds(); - rds_ib_set_flow_control(conn, 
be32_to_cpu(dp->dp_credit)); + rds_ib_set_flow_control(conn, be32_to_cpu(dp_cmn->ricpc_credit)); /* Use ic->i_flowctl as the first post credit to enable * IB transport flow control. This first post credit is * deducted after advertise the credit to the remote @@ -998,8 +1121,9 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, /* If the peer gave us the last packet it saw, process this as if * we had received a regular ACK. */ - if (dp->dp_ack_seq) - rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + if (dp_cmn->ricpc_ack_seq) + rds_send_drop_acked(conn, be64_to_cpu(dp_cmn->ricpc_ack_seq), + NULL); BUG_ON(cm_id->context); BUG_ON(ic->i_cm_id); @@ -1013,22 +1137,22 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, err = rds_ib_setup_qp(conn); if (err) { - pr_warn("RDS/IB: rds_ib_setup_qp failed with err(%d) for conn <%pI4,%pI4,%d>\n", + pr_warn("RDS/IB: rds_ib_setup_qp failed with err(%d) for conn <%pI6c,%pI6c,%d>\n", err, &conn->c_laddr, &conn->c_faddr, conn->c_tos); rds_conn_drop(conn, DR_IB_PAS_SETUP_QP_FAIL); goto out; } - frag = rds_ib_set_frag_size(conn, be16_to_cpu(dp->dp_frag_sz)); + frag = rds_ib_set_frag_size(conn, be16_to_cpu(dp_cmn->ricpc_frag_sz)); rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version, - event->param.conn.responder_resources, - event->param.conn.initiator_depth, - frag); + event->param.conn.responder_resources, + event->param.conn.initiator_depth, + frag, isv6); /* rdma_accept() calls rdma_reject() internally if it fails */ err = rdma_accept(cm_id, &conn_param); if (err) { - pr_warn("RDS/IB: rdma_accept failed with err(%d) for conn <%pI4,%pI4,%d>\n", + pr_warn("RDS/IB: rdma_accept failed with err(%d) for conn <%pI6c,%pI6c,%d>\n", err, &conn->c_laddr, &conn->c_faddr, conn->c_tos); rds_conn_drop(conn, DR_IB_RDMA_ACCEPT_FAIL); } @@ -1067,18 +1191,18 @@ void rds_ib_conn_destroy_init(struct rds_connection *conn) queue_delayed_work(rds_aux_wq, &work->work, 0); } -int rds_ib_cm_initiate_connect(struct 
rdma_cm_id *cm_id) +int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6) { struct rds_connection *conn = cm_id->context; struct rds_ib_connection *ic = conn->c_transport_data; struct rdma_conn_param conn_param; - struct rds_ib_connect_private dp; + union rds_ib_conn_priv dp; u16 frag; int ret; - ret = rds_ib_match_acl(ic->i_cm_id, conn->c_faddr); + ret = rds_ib_match_acl(ic->i_cm_id, conn->c_faddr.s6_addr32[3]); if (ret < 0) { - pr_err("RDS: IB: active conn=%p, <%pI4,%pI4,%d> destroyed due ACL violation\n", + pr_err("RDS: IB: active conn=%p, <%pI6c,%pI6c,%d> destroyed due ACL violation\n", conn, &conn->c_laddr, &conn->c_faddr, conn->c_tos); rds_ib_conn_destroy_init(conn); @@ -1097,7 +1221,7 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) */ atomic_set(&ic->i_credits, IB_SET_POST_CREDITS(ic->i_flowctl)); - pr_debug("RDS/IB: Initiate conn <%pI4, %pI4,%d> with Frags : {%d,%d}\n", + pr_debug("RDS/IB: Initiate conn <%pI6c, %pI6c,%d> with Frags : {%d,%d}\n", &conn->c_laddr, &conn->c_faddr, conn->c_tos, ib_init_frag_size / SZ_1K, ic->i_frag_sz / SZ_1K); @@ -1110,7 +1234,7 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) frag = rds_ib_set_frag_size(conn, ib_init_frag_size); rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, conn->c_proposed_version, UINT_MAX, UINT_MAX, - frag); + frag, isv6); ret = rdma_connect(cm_id, &conn_param); if (ret) { pr_warn("RDS/IB: rdma_connect failed (%d)\n", ret); @@ -1133,14 +1257,18 @@ out: int rds_ib_conn_path_connect(struct rds_conn_path *cp) { struct rds_connection *conn = cp->cp_conn; - struct rds_ib_connection *ic = conn->c_transport_data; - struct sockaddr_in src, dest; + struct sockaddr_storage src, dest; + rdma_cm_event_handler handler; + struct rds_ib_connection *ic; int ret; + ic = conn->c_transport_data; + /* XXX I wonder what affect the port space has */ /* delegate cm event handler to rdma_transport */ - ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, - RDMA_PS_TCP, IB_QPT_RC); 
+ handler = rds_rdma_cm_event_handler; + ic->i_cm_id = rdma_create_id(handler, conn, RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(ic->i_cm_id)) { ret = PTR_ERR(ic->i_cm_id); ic->i_cm_id = NULL; @@ -1149,17 +1277,37 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp) } rds_rtd(RDS_RTD_CM_EXT, - "RDS/IB: conn init <%pI4,%pI4,%d> cm_id %p\n", + "RDS/IB: conn init <%pI6c,%pI6c,%d> cm_id %p\n", &conn->c_laddr, &conn->c_faddr, conn->c_tos, ic->i_cm_id); - src.sin_family = AF_INET; - src.sin_addr.s_addr = (__force u32)conn->c_laddr; - src.sin_port = (__force u16)htons(0); + if (ipv6_addr_v4mapped(&conn->c_faddr)) { + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)&src; + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = conn->c_laddr.s6_addr32[3]; + sin->sin_port = (__force u16)htons(0); - dest.sin_family = AF_INET; - dest.sin_addr.s_addr = (__force u32)conn->c_faddr; - dest.sin_port = (__force u16)htons(RDS_PORT); + sin = (struct sockaddr_in *)&dest; + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = conn->c_faddr.s6_addr32[3]; + sin->sin_port = (__force u16)htons(RDS_PORT); + } else { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)&src; + sin6->sin6_family = AF_INET6; + sin6->sin6_addr = conn->c_laddr; + sin6->sin6_port = (__force u16)htons(0); + sin6->sin6_scope_id = conn->c_dev_if; + + sin6 = (struct sockaddr_in6 *)&dest; + sin6->sin6_family = AF_INET6; + sin6->sin6_addr = conn->c_faddr; + sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT); + sin6->sin6_scope_id = conn->c_dev_if; + } ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src, (struct sockaddr *)&dest, diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index d9f0649bf249..a096a33cdbd1 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -179,18 +179,19 @@ static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) kfree_rcu(to_free, rcu_head); } -int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) +int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, + struct in6_addr *ipaddr) { struct rds_ib_device *rds_ibdev_old; - rds_ibdev_old = rds_ib_get_device(ipaddr); + rds_ibdev_old = rds_ib_get_device(ipaddr->s6_addr32[3]); if (!rds_ibdev_old) - return rds_ib_add_ipaddr(rds_ibdev, ipaddr); + return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]); if (rds_ibdev_old != rds_ibdev) { - rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); + rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr->s6_addr32[3]); rds_ib_dev_put(rds_ibdev_old); - return rds_ib_add_ipaddr(rds_ibdev, ipaddr); + return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]); } rds_ib_dev_put(rds_ibdev_old); @@ -1025,7 +1026,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, struct rds_ib_connection *ic = NULL; int ret; - rds_ibdev = rds_ib_get_device(rs->rs_bound_addr); + rds_ibdev = rds_ib_get_device(rs->rs_bound_addr.s6_addr32[3]); if (!rds_ibdev) { ret = -ENODEV; goto out; @@ -1312,7 +1313,7 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) if (wc->status != IB_WC_SUCCESS) { if (rds_conn_up(ic->conn)) { - pr_warn("RDS: IB: MR completion <%pI4,%pI4,%d> status %u " + pr_warn("RDS: IB: MR completion <%pI6c,%pI6c,%d> status %u " "vendor_err %u, disconnecting and reconnecting\n", &ic->conn->c_laddr, &ic->conn->c_faddr, ic->conn->c_tos, wc->status, wc->vendor_err); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 26f1edebeac6..e6b853de541d 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. 
* * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -342,7 +342,7 @@ static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *i rds_ib_stats_inc(s_ib_rx_total_incs); } INIT_LIST_HEAD(&ibinc->ii_frags); - rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr); + rds_inc_init(&ibinc->ii_inc, ic->conn, &ic->conn->c_faddr); return ibinc; } @@ -700,7 +700,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); if (ret) { rds_conn_drop(conn, DR_IB_POST_RECV_FAIL); - pr_warn("RDS/IB: recv post on %pI4 returned %d, disconnecting and reconnecting\n", + pr_warn("RDS/IB: recv post on %pI6c returned %d, disconnecting and reconnecting\n", &conn->c_faddr, ret); break; } @@ -1183,7 +1183,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, if (data_len < sizeof(struct rds_header)) { rds_conn_drop(conn, DR_IB_HEADER_MISSING); - pr_warn("RDS/IB: incoming message from %pI4 didn't inclue a header, disconnecting and reconnecting\n", + pr_warn("RDS/IB: incoming message from %pI6c didn't inclue a header, disconnecting and reconnecting\n", &conn->c_faddr); return; } @@ -1194,7 +1194,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, /* Validate the checksum. 
*/ if (!rds_message_verify_checksum(ihdr)) { rds_conn_drop(conn, DR_IB_HEADER_CORRUPTED); - pr_warn("RDS/IB: incoming message from %pI4 has corrupted header - forcing a reconnect\n", + pr_warn("RDS/IB: incoming message from %pI6c has corrupted header - forcing a reconnect\n", &conn->c_faddr); rds_stats_inc(s_recv_drop_bad_checksum); return; @@ -1273,10 +1273,10 @@ static void rds_ib_process_recv(struct rds_connection *conn, ic->i_recv_data_rem = 0; ic->i_ibinc = NULL; - if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) + if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) { rds_ib_cong_recv(conn, ibinc); - else { - rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, + } else { + rds_recv_incoming(conn, &conn->c_faddr, &conn->c_laddr, &ibinc->ii_inc, GFP_ATOMIC); state->ack_next = be64_to_cpu(hdr->h_sequence); state->ack_next_valid = 1; @@ -1303,7 +1303,7 @@ void rds_ib_srq_process_recv(struct rds_connection *conn, struct rds_header *ihdr, *hdr; if (data_len < sizeof(struct rds_header)) { - printk(KERN_WARNING "RDS: from %pI4 didn't inclue a " + printk(KERN_WARNING "RDS: from %pI6c didn't inclue a " "header, disconnecting and " "reconnecting\n", &conn->c_faddr); @@ -1317,7 +1317,7 @@ void rds_ib_srq_process_recv(struct rds_connection *conn, /* Validate the checksum. 
*/ if (!rds_message_verify_checksum(ihdr)) { - printk(KERN_WARNING "RDS: from %pI4 has corrupted header - " + printk(KERN_WARNING "RDS: from %pI6c has corrupted header - " "forcing a reconnect\n", &conn->c_faddr); rds_stats_inc(s_recv_drop_bad_checksum); @@ -1340,7 +1340,7 @@ void rds_ib_srq_process_recv(struct rds_connection *conn, if (!ibinc) { ibinc = recv->r_ibinc; - rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr); + rds_inc_init(&ibinc->ii_inc, ic->conn, &ic->conn->c_faddr); recv->r_ibinc = NULL; ic->i_ibinc = ibinc; hdr = &ibinc->ii_inc.i_hdr; @@ -1373,8 +1373,8 @@ void rds_ib_srq_process_recv(struct rds_connection *conn, if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) rds_ib_cong_recv(conn, ibinc); else { - rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, - &ibinc->ii_inc, GFP_ATOMIC); + rds_recv_incoming(conn, &conn->c_faddr, &conn->c_laddr, + &ibinc->ii_inc, GFP_ATOMIC); state->ack_next = be64_to_cpu(hdr->h_sequence); state->ack_next_valid = 1; @@ -1420,7 +1420,7 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, if (rds_conn_up(conn) || rds_conn_connecting(conn)) { /* Flush errors are normal while draining the QP */ if (wc->status != IB_WC_WR_FLUSH_ERR) - pr_warn("RDS/IB: recv completion <%pI4,%pI4,%d> had status %u vendor_err 0x%x, disconnecting and reconnecting\n", + pr_warn("RDS/IB: recv completion <%pI6c,%pI6c,%d> had status %u vendor_err 0x%x, disconnecting and reconnecting\n", &conn->c_laddr, &conn->c_faddr, conn->c_tos, wc->status, wc->vendor_err); if (wc->status == IB_WC_LOC_LEN_ERR) diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index dba94adf6950..d071c00e32c0 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -367,7 +367,7 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) /* Flush errors are normal while draining the QP */ if (!(wc->status == IB_WC_WR_FLUSH_ERR || wc->status == IB_WC_RETRY_EXC_ERR)) - pr_warn("RDS/IB: send completion <%pI4,%pI4,%d> status %u vendor_err 
0x%x, disconnecting and reconnecting\n", + pr_warn("RDS/IB: send completion <%pI6c,%pI6c,%d> status %u vendor_err 0x%x, disconnecting and reconnecting\n", &conn->c_laddr, &conn->c_faddr, conn->c_tos, wc->status, wc->vendor_err); rds_conn_drop(conn, DR_IB_SEND_COMP_ERR); @@ -833,7 +833,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, first, &first->s_wr, ret, failed_wr); BUG_ON(failed_wr != &first->s_wr); if (ret) { - printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " + printk(KERN_WARNING "RDS/IB: ib_post_send to %pI6c " "returned %d\n", &conn->c_faddr, ret); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); rds_ib_sub_signaled(ic, nr_sig); @@ -926,7 +926,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) send, &send->s_wr, ret, failed_wr); BUG_ON(failed_wr != &send->s_wr); if (ret) { - printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 returned %d\n", + printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI6c returned %d\n", &conn->c_faddr, ret); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); rds_ib_sub_signaled(ic, nr_sig); @@ -1097,7 +1097,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) first, &first->s_wr, ret, failed_wr); BUG_ON(failed_wr != &first->s_wr); if (ret) { - printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " + printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI6c " "returned %d\n", &conn->c_faddr, ret); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); rds_ib_sub_signaled(ic, nr_sig); diff --git a/net/rds/loop.c b/net/rds/loop.c index 49fbd3f9eb3c..779bcc0effdb 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -32,6 +32,7 @@ */ #include #include +#include #include "rds.h" #include "loop.h" @@ -69,11 +70,11 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, BUG_ON(hdr_off || sg || off); - rds_inc_init(&rm->m_inc, conn, conn->c_laddr); + rds_inc_init(&rm->m_inc, conn, &conn->c_laddr); /* For the embedded inc. Matching put is in loop_inc_free() */ rds_message_addref(rm); - rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, + rds_recv_incoming(conn, &conn->c_laddr, &conn->c_faddr, &rm->m_inc, GFP_KERNEL); rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence), @@ -162,7 +163,8 @@ static int rds_message_skb_local(struct sk_buff *skb) org = rds_nf_hdr_org(skb); /* assuming original and dest are exactly the same then it's our own node */ - if (dst->daddr == org->daddr && dst->saddr == org->saddr && + if (ipv6_addr_equal(&dst->daddr, &org->daddr) && + ipv6_addr_equal(&dst->saddr, &org->saddr) && dst->sport == org->sport && dst->dport == org->dport) { return 1; } diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 8930b5563982..9cbfceabf166 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007 Oracle. All rights reserved. + * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -191,7 +191,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, long i; int ret; - if (rs->rs_bound_addr == 0) { + if (ipv6_addr_any(&rs->rs_bound_addr)) { ret = -ENOTCONN; /* XXX not a great errno */ goto out; } @@ -554,7 +554,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, args = CMSG_DATA(cmsg); - if (rs->rs_bound_addr == 0) { + if (ipv6_addr_any(&rs->rs_bound_addr)) { ret = -ENOTCONN; /* XXX not a great errno */ goto out; } diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 6da92ca64a8a..1b70b42dd8db 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -81,8 +81,9 @@ static char *rds_cm_event_str(enum rdma_cm_event_type type) ARRAY_SIZE(rds_cm_event_strings), type); }; -int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, - struct rdma_cm_event *event) +int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event, + bool isv6) { /* this can be null in the listening path */ struct rds_connection *conn = cm_id->context; @@ -117,7 +118,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, switch (event->event) { case RDMA_CM_EVENT_CONNECT_REQUEST: - ret = trans->cm_handle_connect(cm_id, event); + ret = trans->cm_handle_connect(cm_id, event, isv6); break; case RDMA_CM_EVENT_ADDR_RESOLVED: @@ -134,7 +135,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, if (conn) { struct rds_ib_connection *ibic; - printk(KERN_CRIT "rds dropping connection after rdma_resolve_route failure connection %pI4->%pI4\n", + printk(KERN_CRIT "rds dropping connection after rdma_resolve_route failure connection %pI6c->%pI6c\n", &conn->c_laddr, &conn->c_faddr); ibic = conn->c_transport_data; if (ibic && ibic->i_cm_id == cm_id) @@ -164,10 +165,10 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, */ cm_id->route.path_rec[0].sl = conn->c_tos; cm_id->route.path_rec[0].qos_class = 
conn->c_tos; - ret = trans->cm_initiate_connect(cm_id); + ret = trans->cm_initiate_connect(cm_id, isv6); } else { rds_rtd(RDS_RTD_CM, - "ROUTE_RESOLVED: calling rds_conn_drop, conn %p <%pI4,%pI4,%d>\n", + "ROUTE_RESOLVED: calling rds_conn_drop, conn %p <%pI6c,%pI6c,%d>\n", conn, &conn->c_laddr, &conn->c_faddr, conn->c_tos); rds_conn_drop(conn, DR_IB_RDMA_CM_ID_MISMATCH); @@ -182,19 +183,23 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, printk(KERN_ERR "alloc_page failed .. NO MEM\n"); ret = -ENOMEM; } else { - r = (struct arpreq *)kmap(page); - memset(r, 0, sizeof(struct arpreq)); - sin = (struct sockaddr_in *)&r->arp_pa; - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = conn->c_faddr; - inet_ioctl(rds_ib_inet_socket, SIOCDARP, (unsigned long) r); - kunmap(page); - __free_page(page); + if (ipv6_addr_v4mapped(&conn->c_faddr)) { + r = (struct arpreq *)kmap(page); + memset(r, 0, sizeof(struct arpreq)); + sin = (struct sockaddr_in *)&r->arp_pa; + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = + conn->c_faddr.s6_addr32[3]; + inet_ioctl(rds_ib_inet_socket, SIOCDARP, + (unsigned long)r); + kunmap(page); + __free_page(page); + } } if (conn) { rds_rtd(RDS_RTD_ERR, - "ROUTE_ERROR: conn %p, calling rds_conn_drop <%pI4,%pI4,%d>\n", + "ROUTE_ERROR: conn %p, calling rds_conn_drop <%pI6c,%pI6c,%d>\n", conn, &conn->c_laddr, &conn->c_faddr, conn->c_tos); rds_conn_drop(conn, DR_IB_ROUTE_ERR); @@ -208,7 +213,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, case RDMA_CM_EVENT_ADDR_ERROR: if (conn) { rds_rtd(RDS_RTD_ERR, - "ADDR_ERROR: conn %p, calling rds_conn_drop <%pI4,%pI4,%d>\n", + "ADDR_ERROR: conn %p, calling rds_conn_drop <%pI6c,%pI6c,%d>\n", conn, &conn->c_laddr, &conn->c_faddr, conn->c_tos); rds_conn_drop(conn, DR_IB_ADDR_ERR); @@ -220,7 +225,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, case RDMA_CM_EVENT_DEVICE_REMOVAL: if (conn) { rds_rtd(RDS_RTD_ERR, - "CONN/UNREACHABLE/RMVAL ERR: conn %p, calling rds_conn_drop 
<%pI4,%pI4,%d>\n", + "CONN/UNREACHABLE/RMVAL ERR: conn %p, calling rds_conn_drop <%pI6c,%pI6c,%d>\n", conn, &conn->c_laddr, &conn->c_faddr, conn->c_tos); rds_conn_drop(conn, DR_IB_CONNECT_ERR); @@ -234,7 +239,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, if (event->status == RDS_REJ_CONSUMER_DEFINED && (*err) == 0) { /* Rejection from RDSV3.1 */ - pr_warn("Rejected: CSR_DEF err 0, calling rds_conn_drop <%pI4,%pI4,%d>\n", + pr_warn("Rejected: CSR_DEF err 0, calling rds_conn_drop <%pI6c,%pI6c,%d>\n", &conn->c_laddr, &conn->c_faddr, conn->c_tos); if (!conn->c_tos) @@ -245,14 +250,14 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, } else if (event->status == RDS_REJ_CONSUMER_DEFINED && (*err) == RDS_ACL_FAILURE) { /* Rejection due to ACL violation */ - pr_err("RDS: IB: conn=%p, <%pI4,%pI4,%d> destroyed due to ACL violation\n", + pr_err("RDS: IB: conn=%p, <%pI6c,%pI6c,%d> destroyed due to ACL violation\n", conn, &conn->c_laddr, &conn->c_faddr, conn->c_tos); rds_ib_conn_destroy_init(conn); } else { rds_rtd(RDS_RTD_ERR, - "Rejected: *err %d status %d calling rds_conn_drop <%pI4,%pI4,%d>\n", + "Rejected: *err %d status %d calling rds_conn_drop <%pI6c,%pI6c,%d>\n", *err, event->status, &conn->c_laddr, &conn->c_faddr, @@ -264,12 +269,12 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, case RDMA_CM_EVENT_ADDR_CHANGE: rds_rtd(RDS_RTD_CM_EXT, - "ADDR_CHANGE event <%pI4,%pI4>\n", + "ADDR_CHANGE event <%pI6c,%pI6c>\n", &conn->c_laddr, &conn->c_faddr); if (conn) { rds_rtd(RDS_RTD_CM, - "ADDR_CHANGE: calling rds_conn_drop <%pI4,%pI4,%d>\n", + "ADDR_CHANGE: calling rds_conn_drop <%pI6c,%pI6c,%d>\n", &conn->c_laddr, &conn->c_faddr, conn->c_tos); if (!rds_conn_self_loopback_passive(conn)) { @@ -283,17 +288,15 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, case RDMA_CM_EVENT_DISCONNECTED: rds_rtd(RDS_RTD_CM, - "DISCONNECT event - dropping connection %pI4->%pI4 tos %d\n", + "DISCONNECT event - dropping connection %pI6c->%pI6c tos %d\n", 
&conn->c_laddr, &conn->c_faddr, conn->c_tos); rds_conn_drop(conn, DR_IB_DISCONNECTED_EVENT); break; case RDMA_CM_EVENT_TIMEWAIT_EXIT: if (conn) { - printk(KERN_INFO "TIMEWAIT_EXIT event - " - "dropping connection " - "%pI4->%pI4\n", &conn->c_laddr, - &conn->c_faddr); + printk(KERN_INFO "TIMEWAIT_EXIT event - dropping connection %pI6c->%pI6c\n", + &conn->c_laddr, &conn->c_faddr); rds_conn_drop(conn, DR_IB_TIMEWAIT_EXIT); } else printk(KERN_INFO "TIMEWAIT_EXIT event - conn=NULL\n"); @@ -316,46 +319,48 @@ out: return ret; } -static int rds_rdma_listen_init(void) +int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + return rds_rdma_cm_event_handler_cmn(cm_id, event, false); +} + +static int rds_rdma_listen_init_common(rdma_cm_event_handler handler, + struct sockaddr *sa, + struct rdma_cm_id **ret_cm_id) { - struct sockaddr_in sin; struct rdma_cm_id *cm_id; int ret; - cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP, - IB_QPT_RC); + cm_id = rdma_create_id(handler, NULL, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(cm_id)) { ret = PTR_ERR(cm_id); - printk(KERN_ERR "RDS/RDMA: failed to setup listener, " - "rdma_create_id() returned %d\n", ret); + printk(KERN_ERR "RDS/RDMA: failed to setup listener, rdma_create_id() returned %d\n", + ret); return ret; } - sin.sin_family = PF_INET, - sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); - sin.sin_port = (__force u16)htons(RDS_PORT); - - /* - * XXX I bet this binds the cm_id to a device. If we want to support + /* XXX I bet this binds the cm_id to a device. If we want to support * fail-over we'll have to take this into consideration. 
*/ - ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + ret = rdma_bind_addr(cm_id, sa); if (ret) { - printk(KERN_ERR "RDS/RDMA: failed to setup listener, " - "rdma_bind_addr() returned %d\n", ret); + printk(KERN_ERR "RDS/RDMA: failed to setup listener, rdma_bind_addr() returned %d\n", + ret); goto out; } ret = rdma_listen(cm_id, 128); if (ret) { - printk(KERN_ERR "RDS/RDMA: failed to setup listener, " - "rdma_listen() returned %d\n", ret); + printk(KERN_ERR "RDS/RDMA: failed to setup listener, rdma_listen() returned %d\n", + ret); goto out; } - rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT); + rdsdebug("cm %p listening on port %u\n", cm_id, + ntohs(((struct sockaddr_in *)sa)->sin_port)); - rds_rdma_listen_id = cm_id; + *ret_cm_id = cm_id; cm_id = NULL; out: if (cm_id) @@ -363,6 +368,26 @@ out: return ret; } +/* Initialize the RDS RDMA listeners. We create two listeners for + * compatibility reason. The one on RDS_PORT is used for IPv4 + * requests only. The one on RDS_TCP_PORT is used for IPv6 requests + * only. So only IPv6 enabled RDS module will communicate using this + * port. 
+ */ +static int rds_rdma_listen_init(void) +{ + int ret; + struct sockaddr_in sin; + + sin.sin_family = PF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(RDS_PORT); + ret = rds_rdma_listen_init_common(rds_rdma_cm_event_handler, + (struct sockaddr *)&sin, + &rds_rdma_listen_id); + return ret; +} + static void rds_rdma_listen_stop(void) { if (rds_rdma_listen_id) { diff --git a/net/rds/rds.h b/net/rds/rds.h index 6fdbcac20d6a..ab20763fedff 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include "info.h" @@ -108,7 +109,7 @@ enum { struct rds_cong_map { struct rb_node m_rb_node; - __be32 m_addr; + struct in6_addr m_addr; wait_queue_head_t m_waitq; struct list_head m_conn_list; unsigned long m_page_addrs[RDS_CONG_MAP_PAGES]; @@ -290,12 +291,15 @@ struct rds_conn_path { struct rds_connection { struct hlist_node c_hash_node; - __be32 c_laddr; - __be32 c_faddr; + struct in6_addr c_laddr; + struct in6_addr c_faddr; + int c_dev_if; /* c_laddr's interface index */ unsigned int c_loopback:1, + c_isv6:1, c_ping_triggered:1, c_destroy_in_prog:1, - c_pad_to_32:29; + + c_pad_to_32:28; int c_npaths; struct rds_connection *c_passive; struct rds_transport *c_trans; @@ -446,7 +450,7 @@ struct rds_incoming { struct rds_conn_path *i_conn_path; struct rds_header i_hdr; unsigned long i_rx_jiffies; - __be32 i_saddr; + struct in6_addr i_saddr; /* extension fields for dealing with netfilter */ struct rds_connection *i_oconn; @@ -543,7 +547,7 @@ struct rds_message { struct list_head m_conn_item; struct rds_incoming m_inc; u64 m_ack_seq; - __be32 m_daddr; + struct in6_addr m_daddr; unsigned long m_flags; /* Never access m_rs without holding m_rs_lock. 
@@ -666,7 +670,8 @@ struct rds_transport { t_mp_capable:1; unsigned int t_type; - int (*laddr_check)(struct net *net, __be32 addr); + int (*laddr_check)(struct net *net, const struct in6_addr *addr, + __u32 scope_id); int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); void (*conn_free)(void *data); int (*conn_path_connect)(struct rds_conn_path *cp); @@ -685,8 +690,8 @@ struct rds_transport { int (*skb_local)(struct sk_buff *skb); int (*cm_handle_connect)(struct rdma_cm_id *cm_id, - struct rdma_cm_event *event); - int (*cm_initiate_connect)(struct rdma_cm_id *cm_id); + struct rdma_cm_event *event, bool isv6); + int (*cm_initiate_connect)(struct rdma_cm_id *cm_id, bool isv6); void (*cm_connect_complete)(struct rds_connection *conn, struct rdma_cm_event *event); @@ -713,9 +718,13 @@ struct rds_sock { * support. */ struct hlist_node rs_bound_node; - __be32 rs_bound_addr; - __be32 rs_conn_addr; - __be16 rs_bound_port; + struct sockaddr_in6 rs_bound_sin6; +#define rs_bound_addr rs_bound_sin6.sin6_addr +#define rs_bound_addr_v4 rs_bound_sin6.sin6_addr.s6_addr32[3] +#define rs_bound_port rs_bound_sin6.sin6_port +#define rs_bound_scope_id rs_bound_sin6.sin6_scope_id + struct in6_addr rs_conn_addr; +#define rs_conn_addr_v4 rs_conn_addr.s6_addr32[3] __be16 rs_conn_port; struct rds_transport *rs_transport; @@ -860,7 +869,8 @@ void debug_sock_put(struct sock *sock); /* bind.c */ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); void rds_remove_bound(struct rds_sock *rs); -struct rds_sock *rds_find_bound(__be32 addr, __be16 port); +struct rds_sock *rds_find_bound(struct in6_addr *addr, __be16 port, + __u32 scope_id); void rds_bind_lock_init(void); /* cong.c */ @@ -883,23 +893,26 @@ extern u32 rds_gen_num; int rds_conn_init(void); void rds_conn_exit(void); struct rds_connection *rds_conn_create(struct net *net, - __be32 laddr, __be32 faddr, - struct rds_transport *trans, - u8 tos, gfp_t gfp); + const struct in6_addr *laddr, + const struct in6_addr 
*faddr, + struct rds_transport *trans, + u8 tos, gfp_t gfp, int dev_if); struct rds_connection *rds_conn_create_outgoing(struct net *net, - __be32 laddr, __be32 faddr, - struct rds_transport *trans, - u8 tos, gfp_t gfp); -struct rds_connection *rds_conn_find(struct net *net, __be32 laddr, - __be32 faddr, - struct rds_transport *trans, u8 tos); + struct in6_addr *laddr, + struct in6_addr *faddr, + struct rds_transport *trans, + u8 tos, gfp_t gfp, int dev_if); +struct rds_connection *rds_conn_find(struct net *net, struct in6_addr *laddr, + struct in6_addr *faddr, + struct rds_transport *trans, u8 tos, + int dev_if); void rds_conn_shutdown(struct rds_conn_path *cp); void rds_conn_destroy(struct rds_connection *conn, int shutdown); void rds_conn_reset(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn, int reason); void rds_conn_path_drop(struct rds_conn_path *cp, int reason); -void rds_conn_laddr_list(struct net *net, - __be32 laddr, struct list_head *laddr_conns); +void rds_conn_laddr_list(struct net *net, struct in6_addr *laddr, + struct list_head *laddr_conns); void rds_conn_connect_if_down(struct rds_connection *conn); void rds_conn_path_connect_if_down(struct rds_conn_path *conn); void rds_for_each_conn_info(struct socket *sock, unsigned int len, @@ -964,7 +977,7 @@ rds_conn_connecting(struct rds_connection *conn) static inline bool rds_conn_self_loopback_passive(struct rds_connection *conn) { - if (conn->c_laddr == conn->c_faddr && !conn->c_passive) + if (ipv6_addr_equal(&conn->c_laddr, &conn->c_faddr) && !conn->c_passive) return true; else return false; @@ -1018,12 +1031,13 @@ void rds_page_exit(void); /* recv.c */ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, - __be32 saddr); + struct in6_addr *saddr); void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn, - __be32 saddr); + struct in6_addr *saddr); void rds_inc_addref(struct rds_incoming *inc); void rds_inc_put(struct rds_incoming 
*inc); -void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, +void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr, + struct in6_addr *daddr, struct rds_incoming *inc, gfp_t gfp); int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int msg_flags); @@ -1038,8 +1052,7 @@ int rds_skb_local(struct sk_buff *skb); int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len); void rds_send_path_reset(struct rds_conn_path *cp); int rds_send_xmit(struct rds_conn_path *cp); -struct sockaddr_in; -void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); +void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest); typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); void rds_send_drop_acked(struct rds_connection *conn, u64 ack, is_acked_func is_acked); @@ -1135,11 +1148,14 @@ void rds_hb_worker(struct work_struct *); void rds_reconnect_timeout(struct work_struct *); void rds_connect_path_complete(struct rds_conn_path *cp, int curr); void rds_connect_complete(struct rds_connection *conn); +int rds_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2); /* transport.c */ int rds_trans_register(struct rds_transport *trans); void rds_trans_unregister(struct rds_transport *trans); -struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr); +struct rds_transport *rds_trans_get_preferred(struct net *net, + const struct in6_addr *addr, + __u32 scope_id); void rds_trans_put(struct rds_transport *trans); unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); diff --git a/net/rds/recv.c b/net/rds/recv.c index 6133a3437ed7..a16254643259 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -43,7 +43,8 @@ /* forward prototypes */ static void -rds_recv_drop(struct rds_connection *conn, __be32 saddr, __be32 daddr, +rds_recv_drop(struct rds_connection *conn, struct in6_addr *saddr, + struct in6_addr *daddr, struct 
rds_incoming *inc, gfp_t gfp); static void @@ -55,7 +56,8 @@ rds_recv_forward(struct rds_conn_path *cp, struct rds_incoming *inc, gfp_t gfp); static void -rds_recv_local(struct rds_conn_path *cp, __be32 saddr, __be32 daddr, +rds_recv_local(struct rds_conn_path *cp, struct in6_addr *saddr, + struct in6_addr *daddr, struct rds_incoming *inc, gfp_t gfp, struct rds_sock *rs); static int @@ -66,14 +68,14 @@ rds_recv_ok(struct sock *sk, struct sk_buff *skb) } void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, - __be32 saddr) + struct in6_addr *saddr) { int i; atomic_set(&inc->i_refcount, 1); INIT_LIST_HEAD(&inc->i_item); inc->i_conn = conn; - inc->i_saddr = saddr; + inc->i_saddr = *saddr; inc->i_rdma_cookie = 0; inc->i_oconn = NULL; inc->i_skb = NULL; @@ -86,7 +88,7 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, EXPORT_SYMBOL_GPL(rds_inc_init); void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp, - __be32 saddr) + struct in6_addr *saddr) { int i; @@ -94,7 +96,7 @@ void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp, INIT_LIST_HEAD(&inc->i_item); inc->i_conn = cp->cp_conn; inc->i_conn_path = cp; - inc->i_saddr = saddr; + inc->i_saddr = *saddr; inc->i_rdma_cookie = 0; inc->i_oconn = NULL; inc->i_skb = NULL; @@ -150,7 +152,7 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk, rds_stats_add(s_recv_bytes_removed_from_socket, -delta); now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs); - rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d " + rdsdebug("rs %p (%pI6c:%u) recv bytes %d buf %d " "now_cong %d delta %d\n", rs, &rs->rs_bound_addr, ntohs(rs->rs_bound_port), rs->rs_rcv_bytes, @@ -306,7 +308,7 @@ static void rds_start_mprds(struct rds_connection *conn) struct rds_conn_path *cp; if (conn->c_npaths > 1 && - IS_CANONICAL(conn->c_laddr, conn->c_faddr)) { + rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) < 0) { for (i = 0; i < conn->c_npaths; i++) { cp = 
&conn->c_path[i]; rds_conn_path_connect_if_down(cp); @@ -330,7 +332,8 @@ static void rds_start_mprds(struct rds_connection *conn) * conn. This lets loopback, who only has one conn for both directions, * tell us which roles the addrs in the conn are playing for this message. */ -void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, +void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr, + struct in6_addr *daddr, struct rds_incoming *inc, gfp_t gfp) { struct sk_buff *skb; @@ -340,8 +343,8 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, int ret; struct rds_conn_path *cp; - rdsdebug(KERN_ALERT "incoming: conn %p, inc %p, %pI4:%d -> %pI4:%d\n", - conn, inc, &saddr, inc->i_hdr.h_sport, &daddr, + rdsdebug(KERN_ALERT "incoming: conn %p, inc %p, %pI6c : %d -> %pI6c : %d\n", + conn, inc, saddr, inc->i_hdr.h_sport, daddr, inc->i_hdr.h_dport); /* initialize some globals */ @@ -357,7 +360,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, cp = &conn->c_path[0]; /* lets find a socket to which this request belongs */ - rs = rds_find_bound(daddr, inc->i_hdr.h_dport); + rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_dev_if); /* pass it on locally if there is no socket bound, or if netfilter is * disabled for this socket */ @@ -379,8 +382,8 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, if (NULL == skb) { /* if we have allocation problems, then we just need to depart */ rds_rtd(RDS_RTD_ERR, - "failure to allocate space for inc %p, %pI4 -> %pI4 tos %d\n", - inc, &saddr, &daddr, conn->c_tos); + "failure to allocate space for inc %p, %pI6c -> %pI6c tos %d\n", + inc, saddr, daddr, conn->c_tos); rds_recv_local(cp, saddr, daddr, inc, gfp, rs); /* drop the reference if we had taken one */ if (NULL != rs) @@ -396,8 +399,8 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, org = rds_nf_hdr_org(skb); /* 
now update our rds_nf_hdr for tracking locations of the request */ - dst->saddr = saddr; - dst->daddr = daddr; + dst->saddr = *saddr; + dst->daddr = *daddr; dst->sport = inc->i_hdr.h_sport; dst->dport = inc->i_hdr.h_dport; dst->flags = 0; @@ -428,8 +431,8 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, /* if we had a failure to convert, then just assuming to continue as local */ else { rds_rtd(RDS_RTD_RCV_EXT, - "failed to create skb form, conn %p, inc %p, %pI4 -> %pI4 tos %d\n", - conn, inc, &saddr, &daddr, conn->c_tos); + "failed to create skb form, conn %p, inc %p, %pI6c -> %pI6c tos %d\n", + conn, inc, saddr, daddr, conn->c_tos); ret = 1; } @@ -444,7 +447,8 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, /* this is the normal good processed state */ else if (ret >= 0) { /* check the original header and if changed do the needful */ - if (dst->saddr == org->saddr && dst->daddr == org->daddr && + if (ipv6_addr_equal(&dst->saddr, &org->saddr) && + ipv6_addr_equal(&dst->daddr, &org->daddr) && conn->c_trans->skb_local(skb)) { rds_recv_local(cp, saddr, daddr, inc, gfp, NULL); } @@ -471,19 +475,20 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, /* we don't really expect an error state from this call that isn't the done above */ else { /* we don't really know how to handle this yet - just ignore for now */ - printk(KERN_ERR "unacceptible state for skb ret %d, conn %p, inc %p, %pI4 -> %pI4\n", - ret, conn, inc, &saddr, &daddr); + printk(KERN_ERR "unacceptible state for skb ret %d, conn %p, inc %p, %pI6c -> %pI6c\n", + ret, conn, inc, saddr, daddr); } } EXPORT_SYMBOL_GPL(rds_recv_incoming); static void -rds_recv_drop(struct rds_connection *conn, __be32 saddr, __be32 daddr, +rds_recv_drop(struct rds_connection *conn, struct in6_addr *saddr, + struct in6_addr *daddr, struct rds_incoming *inc, gfp_t gfp) { /* drop the existing incoming message */ - rdsdebug("dropping 
request on conn %p, inc %p, %pI4 -> %pI4", - conn, inc, &saddr, &daddr); + rdsdebug("dropping request on conn %p, inc %p, %pI6c -> %pI6c", + conn, inc, saddr, daddr); } static void @@ -498,30 +503,31 @@ rds_recv_route(struct rds_connection *conn, struct rds_incoming *inc, org = rds_nf_hdr_org(inc->i_skb); /* special case where we are swapping the message back on the same connection */ - if (dst->saddr == org->daddr && dst->daddr == org->saddr) { + if (ipv6_addr_equal(&dst->saddr, &org->daddr) && + ipv6_addr_equal(&dst->daddr, &org->saddr)) { nconn = conn; } else { /* reroute to a new conn structure, possibly the same one */ nconn = rds_conn_find(rds_conn_net(conn), - dst->saddr, dst->daddr, conn->c_trans, - conn->c_tos); + &dst->saddr, &dst->daddr, conn->c_trans, + conn->c_tos, conn->c_dev_if); } /* cannot find a matching connection so drop the request */ if (NULL == nconn) { - printk(KERN_ALERT "cannot find matching conn for inc %p, %pI4 -> %pI4\n", + printk(KERN_ALERT "cannot find matching conn for inc %p, %pI6c -> %pI6c\n", inc, &dst->saddr, &dst->daddr); - rdsdebug("cannot find matching conn for inc %p, %pI4 -> %pI4", + rdsdebug("cannot find matching conn for inc %p, %pI6c -> %pI6c", inc, &dst->saddr, &dst->daddr); - rds_recv_drop(conn, dst->saddr, dst->daddr, inc, gfp); + rds_recv_drop(conn, &dst->saddr, &dst->daddr, inc, gfp); } /* this is a request for our local node, but potentially a different source * either way we process it locally */ else if (conn->c_trans->skb_local(inc->i_skb)) { WARN_ON(nconn->c_trans->t_mp_capable); rds_recv_local(&nconn->c_path[0], - dst->saddr, dst->daddr, inc, gfp, NULL); + &dst->saddr, &dst->daddr, inc, gfp, NULL); } /* looks like this request is going out to another node */ else { @@ -548,10 +554,10 @@ rds_recv_forward(struct rds_conn_path *cp, struct rds_incoming *inc, org = rds_nf_hdr_org(inc->i_skb); /* find the proper output socket - it should be the local one on which we originated */ - rs = rds_find_bound(dst->saddr, 
dst->sport); + rs = rds_find_bound(&dst->saddr, dst->sport, conn->c_dev_if); if (!rs) { rds_rtd(RDS_RTD_RCV, - "failed to find output rds_socket dst %pI4 : %u, inc %p, conn %p tos %d\n", + "failed to find output rds_socket dst %pI6c : %u, inc %p, conn %p tos %d\n", &dst->daddr, dst->dport, inc, conn, conn->c_tos); rds_stats_inc(s_recv_drop_no_sock); @@ -565,7 +571,7 @@ rds_recv_forward(struct rds_conn_path *cp, struct rds_incoming *inc, ret = rds_send_internal(conn, rs, inc->i_skb, gfp); if (len != ret) { rds_rtd(RDS_RTD_RCV, - "failed to send rds_data dst %pI4 : %u, inc %p, conn %p tos %d, len %d != ret %d\n", + "failed to send rds_data dst %pI6c : %u, inc %p, conn %p tos %d, len %d != ret %d\n", &dst->daddr, dst->dport, inc, conn, conn->c_tos, len, ret); goto out; @@ -588,14 +594,14 @@ out: NF_HOOK(PF_RDS_HOOK, NF_RDS_FORWARD_ERROR, sk, inc->i_skb, NULL, NULL, rds_recv_ok); /* then hand the request off to normal local processing on the old connection */ - rds_recv_local(&inc->i_oconn->c_path[0], org->saddr, org->daddr, + rds_recv_local(&inc->i_oconn->c_path[0], &org->saddr, &org->daddr, inc, gfp, NULL); - } static void -rds_recv_local(struct rds_conn_path *cp, __be32 saddr, __be32 daddr, - struct rds_incoming *inc, gfp_t gfp, struct rds_sock *rs) +rds_recv_local(struct rds_conn_path *cp, struct in6_addr *saddr, + struct in6_addr *daddr, struct rds_incoming *inc, gfp_t gfp, + struct rds_sock *rs) { struct sock *sk; unsigned long flags; @@ -641,7 +647,7 @@ rds_recv_local(struct rds_conn_path *cp, __be32 saddr, __be32 daddr, if (inc_hdr_h_sequence != cp->cp_next_rx_seq) { rds_rtd(RDS_RTD_RCV, - "conn %p <%pI4,%pI4,%d> expect seq# %llu, recved seq# %llu, retrans bit %d\n", + "conn %p <%pI6c,%pI6c,%d> expect seq# %llu, recved seq# %llu, retrans bit %d\n", conn, &conn->c_laddr, &conn->c_faddr, conn->c_tos, cp->cp_next_rx_seq, inc_hdr_h_sequence, inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED); @@ -656,7 +662,8 @@ rds_recv_local(struct rds_conn_path *cp, __be32 saddr, 
__be32 daddr, if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { if (inc->i_hdr.h_sport == 0) { - rdsdebug("ignore ping with 0 sport from 0x%x\n", saddr); + rdsdebug("ignore ping with 0 sport from %pI6c\n", + saddr); goto out; } if (inc->i_hdr.h_flags & RDS_FLAG_HB_PING) { @@ -687,7 +694,7 @@ rds_recv_local(struct rds_conn_path *cp, __be32 saddr, __be32 daddr, } if (!rs) - rs = rds_find_bound(daddr, inc->i_hdr.h_dport); + rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_dev_if); if (!rs) { rds_stats_inc(s_recv_drop_no_sock); goto out; @@ -940,6 +947,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, struct rds_sock *rs = rds_sk_to_rs(sk); long timeo; int ret = 0, nonblock = msg_flags & MSG_DONTWAIT; + struct sockaddr_in6 *sin6; struct sockaddr_in *sin; struct rds_incoming *inc = NULL; @@ -988,7 +996,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, break; } - rdsdebug("copying inc %p from %pI4:%u to user\n", inc, + rdsdebug("copying inc %p from %pI6c:%u to user\n", inc, &inc->i_conn->c_faddr, ntohs(inc->i_hdr.h_sport)); save = msg->msg_iter; @@ -1022,13 +1030,26 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, rds_stats_inc(s_recv_delivered); - sin = (struct sockaddr_in *)msg->msg_name; - if (sin) { - sin->sin_family = AF_INET; - sin->sin_port = inc->i_hdr.h_sport; - sin->sin_addr.s_addr = inc->i_saddr; - memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); - msg->msg_namelen = sizeof(*sin); + if (msg->msg_name) { + if (ipv6_addr_v4mapped(&inc->i_saddr)) { + sin = (struct sockaddr_in *)msg->msg_name; + + sin->sin_family = AF_INET; + sin->sin_port = inc->i_hdr.h_sport; + sin->sin_addr.s_addr = + inc->i_saddr.s6_addr32[3]; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + msg->msg_namelen = sizeof(*sin); + } else { + sin6 = (struct sockaddr_in6 *)msg->msg_name; + + sin6->sin6_family = AF_INET6; + sin6->sin6_port = inc->i_hdr.h_sport; + sin6->sin6_addr = inc->i_saddr; +
sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = rs->rs_bound_scope_id; + msg->msg_namelen = sizeof(*sin6); + } } break; } @@ -1099,14 +1120,14 @@ int rds_skb_local(struct sk_buff *skb) dst = rds_nf_hdr_dst(skb); org = rds_nf_hdr_org(skb); - /* just check to see that the destination is still the same */ - if (dst->daddr == org->daddr && dst->dport == org->dport) { + /* Just check to see that the destination is still the same. + * Otherwise, the sport/dport have likely swapped so consider + * it a different node. + */ + if (ipv6_addr_equal(&dst->daddr, &org->daddr) && + dst->dport == org->dport) return 1; - } - /* otherwise, the sport/dport have likely swapped so consider - * it a different node */ - else { + else return 0; - } } EXPORT_SYMBOL(rds_skb_local); diff --git a/net/rds/send.c b/net/rds/send.c index 86a92011d6d7..97e239bcbd9f 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -845,7 +845,7 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack, } EXPORT_SYMBOL_GPL(rds_send_drop_acked); -void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) +void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest) { struct rds_message *rm, *tmp; struct rds_connection *conn = NULL; @@ -858,8 +858,9 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) spin_lock_irqsave(&rs->rs_lock, flags); list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) { - if (dest && (dest->sin_addr.s_addr != rm->m_daddr || - dest->sin_port != rm->m_inc.i_hdr.h_dport)) + if (dest && + (!ipv6_addr_equal(&dest->sin6_addr, &rm->m_daddr) || + dest->sin6_port != rm->m_inc.i_hdr.h_dport)) continue; list_move(&rm->m_sock_item, &list); @@ -1207,8 +1208,8 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)msg->msg_name; struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name; - 
__be32 daddr; __be16 dport; struct rds_message *rm = NULL; struct rds_connection *conn; @@ -1218,6 +1219,9 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) long timeo = sock_sndtimeo(sk, nonblock); size_t total_payload_len = payload_len, rdma_payload_len = 0; struct rds_conn_path *cpath; + struct in6_addr daddr; + __u32 scope_id = 0; + int namelen; /* Mirror Linux UDP mirror of BSD error message compatibility */ /* XXX: Perhaps MSG_MORE someday */ @@ -1227,27 +1231,61 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) goto out; } - if (msg->msg_namelen) { - /* XXX fail non-unicast destination IPs? */ - if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) { + namelen = msg->msg_namelen; + if (namelen != 0) { + if (namelen < sizeof(*usin)) { + ret = -EINVAL; + goto out; + } + switch (namelen) { + case sizeof(*usin): + if (usin->sin_family != AF_INET || + usin->sin_addr.s_addr == INADDR_ANY || + usin->sin_addr.s_addr == INADDR_BROADCAST || + IN_MULTICAST(ntohl(usin->sin_addr.s_addr))) { + ret = -EINVAL; + goto out; + } + ipv6_addr_set_v4mapped(usin->sin_addr.s_addr, &daddr); + dport = usin->sin_port; + break; + + case sizeof(*sin6): { + ret = -EPROTONOSUPPORT; + goto out; + } + + default: ret = -EINVAL; goto out; } - daddr = usin->sin_addr.s_addr; - dport = usin->sin_port; } else { /* We only care about consistency with ->connect() */ lock_sock(sk); daddr = rs->rs_conn_addr; dport = rs->rs_conn_port; + scope_id = rs->rs_bound_scope_id; release_sock(sk); } lock_sock(sk); - if (daddr == 0 || rs->rs_bound_addr == 0) { - release_sock(sk); - ret = -ENOTCONN; /* XXX not a great errno */ - goto out; + /* Both a bound source and a destination are required. */ + if (ipv6_addr_any(&rs->rs_bound_addr) || + ipv6_addr_any(&daddr)) { + release_sock(sk); + ret = -ENOTCONN; + goto out; + } else if (namelen != 0) { + /* Cannot send to an IPv4 address using an IPv6 source + * address and cannot send to an IPv6 address using an + * IPv4 source address.
+ */ + if (ipv6_addr_v4mapped(&daddr) ^ + ipv6_addr_v4mapped(&rs->rs_bound_addr)) { + release_sock(sk); + ret = -EOPNOTSUPP; + goto out; + } } release_sock(sk); @@ -1294,14 +1332,15 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) /* rds_conn_create has a spinlock that runs with IRQ off. * Caching the conn in the socket helps a lot. */ - if (rs->rs_conn && rs->rs_conn->c_faddr == daddr && - rs->rs_tos == rs->rs_conn->c_tos) + if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr) && + rs->rs_tos == rs->rs_conn->c_tos) conn = rs->rs_conn; else { conn = rds_conn_create_outgoing(sock_net(sock->sk), - rs->rs_bound_addr, daddr, - rs->rs_transport, rs->rs_tos, - sock->sk->sk_allocation); + &rs->rs_bound_addr, &daddr, + rs->rs_transport, rs->rs_tos, + sock->sk->sk_allocation, + scope_id); if (IS_ERR(conn)) { ret = PTR_ERR(conn); goto out; @@ -1549,7 +1588,7 @@ int rds_send_internal(struct rds_connection *conn, struct rds_sock *rs, queue_delayed_work(conn->c_path[0].cp_wq, &conn->c_path[0].cp_send_w, 1); - rdsdebug("message sent for rs %p, conn %p, len %d, %pI4:%u->%pI4:%u\n", + rdsdebug("message sent for rs %p, conn %p, len %d, %pI6c:%u->%pI6c:%u\n", rs, conn, skb->len, &dst->saddr, dst->sport, &dst->daddr, dst->dport); ret = skb->len; diff --git a/net/rds/tcp.c b/net/rds/tcp.c index db1838759c0c..54b58c1c21f8 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "rds.h" #include "tcp.h" @@ -264,9 +265,34 @@ out: spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags); } -static int rds_tcp_laddr_check(struct net *net, __be32 addr) +static int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr, + __u32 scope_id) { - if (inet_addr_type(net, addr) == RTN_LOCAL) + struct net_device *dev = NULL; + int ret; + + if (ipv6_addr_v4mapped(addr)) { + if (inet_addr_type(net, addr->s6_addr32[3]) == RTN_LOCAL) + return 0; + return -EADDRNOTAVAIL; + } + + /* If the scope_id is 
specified, check only those addresses + * hosted on the specified interface. + */ + if (scope_id != 0) { + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, scope_id); + /* scope_id is not valid... */ + if (!dev) { + rcu_read_unlock(); + return -EADDRNOTAVAIL; + } + } + ret = ipv6_chk_addr(net, addr, dev, 0); + if (scope_id != 0) + rcu_read_unlock(); + if (ret) return 0; return -EADDRNOTAVAIL; } diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index dc157a78b6ac..01449f3fb358 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -60,10 +60,10 @@ void rds_tcp_state_change(struct sock *sk) case TCP_SYN_RECV: break; case TCP_ESTABLISHED: - if (!IS_CANONICAL(cp->cp_conn->c_laddr, - cp->cp_conn->c_faddr) && - rds_conn_path_transition(cp, RDS_CONN_CONNECTING, - RDS_CONN_ERROR)) { + if (rds_addr_cmp(&cp->cp_conn->c_laddr, + &cp->cp_conn->c_faddr) >= 0 && + rds_conn_path_transition(cp, RDS_CONN_CONNECTING, + RDS_CONN_ERROR)) { rds_conn_path_drop(cp, DR_TCP_STATE_CLOSE); } else { rds_connect_path_complete(cp, RDS_CONN_CONNECTING); @@ -83,7 +83,9 @@ out: int rds_tcp_conn_path_connect(struct rds_conn_path *cp) { struct socket *sock = NULL; - struct sockaddr_in src, dest; + struct sockaddr_in sin; + struct sockaddr *addr; + int addrlen; int ret; struct rds_connection *conn = cp->cp_conn; struct rds_tcp_connection *tc = cp->cp_transport_data; @@ -108,29 +110,33 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) rds_tcp_tune(sock); - src.sin_family = AF_INET; - src.sin_addr.s_addr = (__force u32)conn->c_laddr; - src.sin_port = (__force u16)htons(0); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = (__force u32)conn->c_laddr.s6_addr32[3]; + sin.sin_port = (__force 
u16)htons(0); + addr = (struct sockaddr *)&sin; + addrlen = sizeof(sin); - ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src)); + ret = sock->ops->bind(sock, addr, addrlen); if (ret) { - rdsdebug("bind failed with %d at address %pI4\n", + rdsdebug("bind failed with %d at address %pI6c\n", ret, &conn->c_laddr); goto out; } - dest.sin_family = AF_INET; - dest.sin_addr.s_addr = (__force u32)conn->c_faddr; - dest.sin_port = (__force u16)htons(RDS_TCP_PORT); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = (__force u32)conn->c_faddr.s6_addr32[3]; + sin.sin_port = (__force u16)htons(RDS_TCP_PORT); + addr = (struct sockaddr *)&sin; + addrlen = sizeof(sin); /* * once we call connect() we can start getting callbacks and they * own the socket */ rds_tcp_set_callbacks(sock, cp); - ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest), - O_NONBLOCK); - rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret); + ret = sock->ops->connect(sock, addr, addrlen, O_NONBLOCK); + rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, + ret); if (ret == -EINPROGRESS) ret = 0; diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 00e6a461219c..7951f5eaf021 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -81,10 +81,9 @@ bail: struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) { int i; - bool peer_is_smaller = IS_CANONICAL(conn->c_faddr, conn->c_laddr); int npaths = max_t(int, 1, conn->c_npaths); - if (!peer_is_smaller) { + if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) >= 0) { if (npaths <= 1) rds_conn_path_connect_if_down(&conn->c_path[0]); return NULL; @@ -147,13 +146,15 @@ int rds_tcp_accept_one(struct socket *sock) inet = inet_sk(new_sock->sk); - rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n", - &inet->inet_saddr, ntohs(inet->inet_sport), - &inet->inet_daddr, ntohs(inet->inet_dport)); + rdsdebug("accepted tcp %pI6c:%u -> %pI6c:%u\n", + &new_sock->sk->sk_v6_rcv_saddr, ntohs(inet->inet_sport), + &new_sock->sk->sk_v6_daddr, ntohs(inet->inet_dport)); conn = rds_conn_create(sock_net(sock->sk), - inet->inet_saddr, inet->inet_daddr, - &rds_tcp_transport, 0, GFP_KERNEL); + &new_sock->sk->sk_v6_rcv_saddr, + &new_sock->sk->sk_v6_daddr, + &rds_tcp_transport, 0, GFP_KERNEL, + new_sock->sk->sk_bound_dev_if); if (IS_ERR(conn)) { ret = PTR_ERR(conn); goto out; diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index 3319a1a95c09..9eef9c7f5d4b 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -178,7 +178,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, tc->t_tinc = tinc; rdsdebug("alloced tinc %p\n", tinc); rds_inc_path_init(&tinc->ti_inc, cp, - cp->cp_conn->c_faddr); + &cp->cp_conn->c_faddr); tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] = local_clock(); @@ -238,8 +238,9 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) rds_tcp_cong_recv(conn, tinc); else - rds_recv_incoming(conn, conn->c_faddr, - conn->c_laddr, &tinc->ti_inc, + rds_recv_incoming(conn, &conn->c_faddr, + &conn->c_laddr, + &tinc->ti_inc, arg->gfp); tc->t_tinc_hdr_rem = sizeof(struct rds_header); diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index e0ede55bdf3d..cb10a6fb39c1 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -158,7 +158,7 @@ out: * an incoming RST. 
*/ if (rds_conn_path_up(cp)) { - pr_warn("RDS/tcp: send to %pI4 on cp [%d]" + pr_warn("RDS/tcp: send to %pI6c on cp [%d] " "returned %d, " "disconnecting and reconnecting\n", &conn->c_faddr, cp->cp_index, ret); diff --git a/net/rds/threads.c b/net/rds/threads.c index 81e037b59a20..70a9b55070f0 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -82,14 +82,14 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr) struct rds_connection *conn = cp->cp_conn; if (!rds_conn_path_transition(cp, curr, RDS_CONN_UP)) { - pr_warn("RDS: Cannot transition conn <%pI4,%pI4,%d> to state UP, current state is %d\n", + pr_warn("RDS: Cannot transition conn <%pI6c,%pI6c,%d> to state UP, current state is %d\n", &conn->c_laddr, &conn->c_faddr, conn->c_tos, atomic_read(&cp->cp_state)); rds_conn_path_drop(cp, DR_IB_NOT_CONNECTING_STATE); return; } - rds_rtd(RDS_RTD_CM_EXT, "conn %p for %pI4 to %pI4 tos %d complete\n", + rds_rtd(RDS_RTD_CM_EXT, "conn %p for %pI6c to %pI6c tos %d complete\n", conn, &conn->c_laddr, &conn->c_faddr, conn->c_tos); cp->cp_reconnect_jiffies = 0; @@ -138,12 +138,12 @@ void rds_queue_reconnect(struct rds_conn_path *cp) bool is_tcp = conn->c_trans->t_type == RDS_TRANS_TCP; rds_rtd(RDS_RTD_CM_EXT, - "conn %p for %pI4 to %pI4 tos %d reconnect jiffies %lu\n", conn, - &conn->c_laddr, &conn->c_faddr, conn->c_tos, + "conn %p for %pI6c to %pI6c tos %d reconnect jiffies %lu\n", + conn, &conn->c_laddr, &conn->c_faddr, conn->c_tos, cp->cp_reconnect_jiffies); /* let peer with smaller addr initiate reconnect, to avoid duels */ - if (is_tcp && !IS_CANONICAL(conn->c_laddr, conn->c_faddr)) + if (is_tcp && rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) >= 0) return; set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); @@ -156,7 +156,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp) get_random_bytes(&rand, sizeof(rand)); rds_rtd(RDS_RTD_CM_EXT, - "%lu delay %lu ceil conn %p for %pI4 -> %pI4 tos %d\n", + "%lu delay %lu ceil conn %p for %pI6c -> %pI6c tos %d\n", rand
% cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies, conn, &conn->c_laddr, &conn->c_faddr, conn->c_tos); @@ -177,7 +177,7 @@ void rds_connect_worker(struct work_struct *work) bool is_tcp = conn->c_trans->t_type == RDS_TRANS_TCP; if (is_tcp && cp->cp_index > 0 && - !IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr)) + rds_addr_cmp(&cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr) >= 0) return; clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING); @@ -191,7 +191,7 @@ void rds_connect_worker(struct work_struct *work) ret = conn->c_trans->conn_path_connect(cp); rds_rtd(RDS_RTD_CM_EXT, - "conn %p for %pI4 to %pI4 tos %d dispatched, ret %d\n", + "conn %p for %pI6c to %pI6c tos %d dispatched, ret %d\n", conn, &conn->c_laddr, &conn->c_faddr, conn->c_tos, ret); if (ret) { @@ -287,7 +287,7 @@ void rds_hb_worker(struct work_struct *work) cp->cp_hb_start = now; } else if (now - cp->cp_hb_start > rds_conn_hb_timeout) { rds_rtd(RDS_RTD_CM, - "RDS/IB: connection <%pI4,%pI4,%d> timed out (0x%lx,0x%lx)..discon and recon\n", + "RDS/IB: connection <%pI6c,%pI6c,%d> timed out (0x%lx,0x%lx)..discon and recon\n", &conn->c_laddr, &conn->c_faddr, conn->c_tos, cp->cp_hb_start, now); rds_conn_path_drop(cp, DR_HB_TIMEOUT); @@ -305,7 +305,7 @@ void rds_reconnect_timeout(struct work_struct *work) struct rds_connection *conn = cp->cp_conn; if (cp->cp_reconnect_retry_count > rds_sysctl_reconnect_max_retries) { - pr_info("RDS: connection <%pI4,%pI4,%d> reconnect retries(%d) exceeded, stop retry\n", + pr_info("RDS: connection <%pI6c,%pI6c,%d> reconnect retries(%d) exceeded, stop retry\n", &conn->c_laddr, &conn->c_faddr, conn->c_tos, cp->cp_reconnect_retry_count); return; @@ -318,7 +318,7 @@ void rds_reconnect_timeout(struct work_struct *work) } else { cp->cp_reconnect_retry_count++; rds_rtd(RDS_RTD_CM, - "conn <%pI4,%pI4,%d> not up, retry(%d)\n", + "conn <%pI6c,%pI6c,%d> not up, retry(%d)\n", &conn->c_laddr, &conn->c_faddr,
conn->c_tos, cp->cp_reconnect_retry_count); queue_delayed_work(cp->cp_wq, &cp->cp_reconn_w, @@ -342,7 +342,7 @@ void rds_shutdown_worker(struct work_struct *work) rds_sysctl_shutdown_trace_start_time) && (now - cp->cp_reconnect_start < rds_sysctl_shutdown_trace_end_time)) - pr_info("RDS/%s: connection <%pI4,%pI4,%d> shutdown init due to '%s'\n", + pr_info("RDS/%s: connection <%pI6c,%pI6c,%d> shutdown init due to '%s'\n", (is_tcp ? "TCP" : "IB"), &conn->c_laddr, &conn->c_faddr, @@ -370,3 +370,50 @@ int rds_threads_init(void) return 0; } + +/* Compare two IPv6 addresses. Return 0 if the two addresses are equal. + * Return 1 if the first is greater. Return -1 if the second is greater. + */ +int rds_addr_cmp(const struct in6_addr *addr1, + const struct in6_addr *addr2) +{ +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 + const __be64 *a1, *a2; + __be64 x, y; + + a1 = (__be64 *)addr1; + a2 = (__be64 *)addr2; + + if (*a1 != *a2) { + if (be64_to_cpu(*a1) < be64_to_cpu(*a2)) + return -1; + else + return 1; + } else { + x = be64_to_cpu(*++a1); + y = be64_to_cpu(*++a2); + if (x < y) + return -1; + else if (x > y) + return 1; + else + return 0; + } +#else + u32 a, b; + int i; + + for (i = 0; i < 4; i++) { + if (addr1->s6_addr32[i] != addr2->s6_addr32[i]) { + a = ntohl(addr1->s6_addr32[i]); + b = ntohl(addr2->s6_addr32[i]); + if (a < b) + return -1; + else if (a > b) + return 1; + } + } + return 0; +#endif +} +EXPORT_SYMBOL_GPL(rds_addr_cmp); diff --git a/net/rds/transport.c b/net/rds/transport.c index d2bb778bd55a..7a5f6c66f5bb 100644 --- a/net/rds/transport.c +++ b/net/rds/transport.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU @@ -33,6 +33,7 @@ #include #include #include +#include #include "rds.h" #include "loop.h" @@ -77,20 +78,26 @@ void rds_trans_put(struct rds_transport *trans) module_put(trans->t_owner); } -struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr) +struct rds_transport *rds_trans_get_preferred(struct net *net, + const struct in6_addr *addr, + __u32 scope_id) { struct rds_transport *ret = NULL; struct rds_transport *trans; unsigned int i; - if (IN_LOOPBACK(ntohl(addr))) + if (ipv6_addr_v4mapped(addr)) { + if (*(u_int8_t *)&addr->s6_addr32[3] == IN_LOOPBACKNET) + return &rds_loop_transport; + } else if (ipv6_addr_loopback(addr)) { return &rds_loop_transport; + } down_read(&rds_trans_sem); for (i = 0; i < RDS_TRANS_COUNT; i++) { trans = transports[i]; - if (trans && (trans->laddr_check(net, addr) == 0) && + if (trans && (trans->laddr_check(net, addr, scope_id) == 0) && (!trans->t_owner || try_module_get(trans->t_owner))) { ret = trans; break; -- 2.50.1