From: Sowmini Varadhan Date: Fri, 28 Aug 2015 11:16:01 +0000 (-0400) Subject: RDS-TCP: Make RDS-TCP work correctly when it is set up in a netns other than init_net X-Git-Tag: v4.1.12-92~287^2^2~2 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=2a4426bf6efa8131d4ada583a56031652b7ce215;p=users%2Fjedix%2Flinux-maple.git RDS-TCP: Make RDS-TCP work correctly when it is set up in a netns other than init_net Open the sockets calling sock_create_kern() with the correct struct net pointer, and use that struct net pointer when verifying the address passed to rds_bind(). Backport of upstream commit: d5a8ac28a7ff2f250d1bedbb6008dd2f6f6f1638 Orabug: 21437445 Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- diff --git a/net/rds/bind.c b/net/rds/bind.c index 9510dc64691b..7c14afe1b394 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -201,7 +201,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) ret = 0; goto out; } - trans = rds_trans_get_preferred(sin->sin_addr.s_addr); + trans = rds_trans_get_preferred(sock_net(sock->sk), + sin->sin_addr.s_addr); if (!trans) { ret = -EADDRNOTAVAIL; rds_remove_bound(rs); diff --git a/net/rds/connection.c b/net/rds/connection.c index 1d118e053725..0dd0cef5ca8d 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -118,7 +118,8 @@ void rds_conn_reset(struct rds_connection *conn) * For now they are not garbage collected once they're created. They * are torn down as the module is removed, if ever. */ -static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, +static struct rds_connection *__rds_conn_create(struct net *net, + __be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp, u8 tos, int is_outgoing) @@ -164,6 +165,7 @@ new_conn: conn->c_faddr = faddr; spin_lock_init(&conn->c_lock); conn->c_next_tx_seq = 1; + rds_conn_net_set(conn, net); init_waitqueue_head(&conn->c_waitq); INIT_LIST_HEAD(&conn->c_send_queue); @@ -183,7 +185,7 @@ new_conn: * can bind to the destination address then we'd rather the messages * flow through loopback rather than either transport. */ - loop_trans = rds_trans_get_preferred(faddr); + loop_trans = rds_trans_get_preferred(net, faddr); if (loop_trans) { rds_trans_put(loop_trans); conn->c_loopback = 1; @@ -279,19 +281,21 @@ out: return conn; } -struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, +struct rds_connection *rds_conn_create(struct net *net, + __be32 laddr, __be32 faddr, struct rds_transport *trans, u8 tos, gfp_t gfp) { - return __rds_conn_create(laddr, faddr, trans, gfp, tos, 0); + return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 0); } EXPORT_SYMBOL_GPL(rds_conn_create); -struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, +struct rds_connection *rds_conn_create_outgoing(struct net *net, + __be32 laddr, __be32 faddr, struct rds_transport *trans, u8 tos, gfp_t gfp) { - return __rds_conn_create(laddr, faddr, trans, gfp, tos, 1); + return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 1); } EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); diff --git a/net/rds/ib.c b/net/rds/ib.c index 3034308c69c3..7bdae3901437 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -403,7 +403,7 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len, * allowed to influence which paths have priority. We could call userspace * asserting this policy "routing". */ -static int rds_ib_laddr_check(__be32 addr) +static int rds_ib_laddr_check(struct net *net, __be32 addr) { int ret; struct rdma_cm_id *cm_id; diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 4f3b52cdd95a..a1bbf0467cd9 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -792,8 +792,9 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, (unsigned long long)be64_to_cpu(lguid), (unsigned long long)be64_to_cpu(fguid)); - conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport, - dp->dp_tos, GFP_KERNEL); + /* RDS/IB is not currently netns aware, thus init_net */ + conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr, + &rds_ib_transport, dp->dp_tos, GFP_KERNEL); if (IS_ERR(conn)) { rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); conn = NULL; @@ -801,7 +802,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, } if (dp->dp_tos && !conn->c_base_conn) { - conn->c_base_conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, + conn->c_base_conn = rds_conn_create(&init_net, + dp->dp_daddr, dp->dp_saddr, &rds_ib_transport, 0, GFP_KERNEL); if (IS_ERR(conn->c_base_conn)) { conn = NULL; diff --git a/net/rds/iw.c b/net/rds/iw.c index d9451aa1d9e4..2e57057a1434 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c @@ -216,7 +216,7 @@ static void rds_iw_ic_info(struct socket *sock, unsigned int len, * allowed to influence which paths have priority. We could call userspace * asserting this policy "routing". */ -static int rds_iw_laddr_check(__be32 addr) +static int rds_iw_laddr_check(struct net *net, __be32 addr) { int ret; struct rdma_cm_id *cm_id; diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c index 5be52ba4f835..edcc5858fbbe 100644 --- a/net/rds/iw_cm.c +++ b/net/rds/iw_cm.c @@ -395,8 +395,9 @@ int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, &dp->dp_saddr, &dp->dp_daddr, RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version)); - conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport, - 0, GFP_KERNEL); + /* RDS/IW is not currently netns aware, thus init_net */ + conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr, + &rds_iw_transport, 0, GFP_KERNEL); if (IS_ERR(conn)) { rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); conn = NULL; diff --git a/net/rds/rds.h b/net/rds/rds.h index a80dfdb3ee4b..be8306c35b37 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -146,6 +146,7 @@ struct rds_connection { /* Protocol version */ unsigned int c_proposed_version; unsigned int c_version; + possible_net_t c_net; /* Re-connect stall diagnostics */ unsigned long c_reconnect_start; @@ -173,6 +174,18 @@ struct rds_connection { unsigned int c_route_resolved; }; +static inline +struct net *rds_conn_net(struct rds_connection *conn) +{ + return read_pnet(&conn->c_net); +} + +static inline +void rds_conn_net_set(struct rds_connection *conn, struct net *net) +{ + write_pnet(&conn->c_net, net); +} + #define RDS_FLAG_CONG_BITMAP 0x01 #define RDS_FLAG_ACK_REQUIRED 0x02 #define RDS_FLAG_RETRANSMITTED 0x04 @@ -476,7 +489,7 @@ struct rds_transport { unsigned int t_prefer_loopback:1; unsigned int t_type; - int (*laddr_check)(__be32 addr); + int (*laddr_check)(struct net *net, __be32 addr); int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); void (*conn_free)(void *data); int (*conn_connect)(struct rds_connection *conn); @@ -683,10 +696,12 @@ struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); /* conn.c */ int rds_conn_init(void); void rds_conn_exit(void); -struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, +struct rds_connection *rds_conn_create(struct net *net, + __be32 laddr, __be32 faddr, struct rds_transport *trans, u8 tos, gfp_t gfp); -struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, +struct rds_connection *rds_conn_create_outgoing(struct net *net, + __be32 laddr, __be32 faddr, struct rds_transport *trans, u8 tos, gfp_t gfp); struct rds_connection *rds_conn_find(__be32 laddr, __be32 faddr, @@ -892,7 +907,7 @@ void rds_connect_complete(struct rds_connection *conn); /* transport.c */ int rds_trans_register(struct rds_transport *trans); void rds_trans_unregister(struct rds_transport *trans); -struct rds_transport *rds_trans_get_preferred(__be32 addr); +struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr); void rds_trans_put(struct rds_transport *trans); unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); diff --git a/net/rds/send.c b/net/rds/send.c index 3174eb02c166..cc4835a81afe 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1240,7 +1240,8 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) rs->rs_tos == rs->rs_conn->c_tos) conn = rs->rs_conn; else { - conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr, + conn = rds_conn_create_outgoing(sock_net(sock->sk), + rs->rs_bound_addr, daddr, rs->rs_transport, rs->rs_tos, sock->sk->sk_allocation); if (IS_ERR(conn)) { @@ -1250,6 +1251,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) if (rs->rs_tos && !conn->c_base_conn) { conn->c_base_conn = rds_conn_create_outgoing( + sock_net(sock->sk), rs->rs_bound_addr, daddr, rs->rs_transport, 0, sock->sk->sk_allocation); diff --git a/net/rds/tcp.c b/net/rds/tcp.c index a3dcc1e381a6..07918d42c0f5 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -187,9 +187,9 @@ out: spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags); } -static int rds_tcp_laddr_check(__be32 addr) +static int rds_tcp_laddr_check(struct net *net, __be32 addr) { - if (inet_addr_type(&init_net, addr) == RTN_LOCAL) + if (inet_addr_type(net, addr) == RTN_LOCAL) return 0; return -EADDRNOTAVAIL; } diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 62ac2bbf686e..79bf36f79e3f 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -79,7 +79,8 @@ int rds_tcp_conn_connect(struct rds_connection *conn) struct sockaddr_in src, dest; int ret; - ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + ret = sock_create_kern(rds_conn_net(conn), PF_INET, SOCK_STREAM, + IPPROTO_TCP, &sock); if (ret < 0) goto out; diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 3d7df924c4ef..41abc08c9b2c 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -84,8 +84,9 @@ static int rds_tcp_accept_one(struct socket *sock) struct inet_sock *inet; struct rds_tcp_connection *rs_tcp; - ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, - sock->sk->sk_protocol, &new_sock); + ret = sock_create_kern(sock_net(sock->sk), sock->sk->sk_family, + sock->sk->sk_type, sock->sk->sk_protocol, + &new_sock); if (ret) goto out; @@ -107,8 +108,9 @@ static int rds_tcp_accept_one(struct socket *sock) NIPQUAD(inet->inet_saddr), ntohs(inet->inet_sport), NIPQUAD(inet->inet_daddr), ntohs(inet->inet_dport)); - conn = rds_conn_create(inet->inet_saddr, inet->inet_daddr, &rds_tcp_transport, - 0, GFP_KERNEL); + conn = rds_conn_create(sock_net(sock->sk), + inet->inet_saddr, inet->inet_daddr, + &rds_tcp_transport, 0, GFP_KERNEL); if (IS_ERR(conn)) { ret = PTR_ERR(conn); goto out; @@ -186,7 +188,13 @@ int rds_tcp_listen_init(void) struct socket *sock = NULL; int ret; - ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + /* MUST call sock_create_kern directly so that we avoid get_net() + * in sk_alloc(). Doing a get_net() will result in cleanup_net() + * never getting invoked, which will leave sock and other things + * in limbo. + */ + ret = sock_create_kern(current->nsproxy->net_ns, PF_INET, SOCK_STREAM, + IPPROTO_TCP, &sock); if (ret < 0) goto out; diff --git a/net/rds/transport.c b/net/rds/transport.c index e3a4811d8245..d2bb778bd55a 100644 --- a/net/rds/transport.c +++ b/net/rds/transport.c @@ -77,7 +77,7 @@ void rds_trans_put(struct rds_transport *trans) module_put(trans->t_owner); } -struct rds_transport *rds_trans_get_preferred(__be32 addr) +struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr) { struct rds_transport *ret = NULL; struct rds_transport *trans; @@ -90,7 +90,7 @@ struct rds_transport *rds_trans_get_preferred(__be32 addr) for (i = 0; i < RDS_TRANS_COUNT; i++) { trans = transports[i]; - if (trans && (trans->laddr_check(addr) == 0) && + if (trans && (trans->laddr_check(net, addr) == 0) && (!trans->t_owner || try_module_get(trans->t_owner))) { ret = trans; break;