From edc31ebe9e36d87dcd08931784a49c2e75eed359 Mon Sep 17 00:00:00 2001
From: Ajaykumar Hotchandani
Date: Sun, 13 Jan 2013 21:21:22 -0800
Subject: [PATCH] [patch1/3] Merge for Mellanox OFED R2, 0080 release

Orabug: 15997083

Code files for the following directories are copied from the Mellanox OFED
R2, 0080 release:
  drivers/infiniband/core
  drivers/infiniband/hw/mlx4
  drivers/infiniband/ulp/ipoib
  drivers/infiniband/ulp/sdp
  drivers/infiniband/ulp/iser
  drivers/infiniband/ulp/srp
  drivers/net/mlx4
  drivers/net/mlx4_vnic
  net/rds
  include/linux/mlx4
  include/rdma

Furthermore, this patch contains the following:
- Kconfig and Makefile specific changes.
- Resolution of compilation errors that arose due to this merge.

Signed-off-by: Ajaykumar Hotchandani
---
 drivers/infiniband/Kconfig | 2 +
 drivers/infiniband/Makefile | 1 +
 drivers/infiniband/core/Makefile | 2 +-
 drivers/infiniband/core/addr.c | 34 +-
 drivers/infiniband/core/agent.c | 21 +-
 drivers/infiniband/core/cache.c | 67 +-
 drivers/infiniband/core/cm.c | 407 +-
 drivers/infiniband/core/cm_msgs.h | 71 +-
 drivers/infiniband/core/cma.c | 2233 +++++
 drivers/infiniband/core/core_priv.h | 4 +-
 drivers/infiniband/core/device.c | 66 +-
 drivers/infiniband/core/fmr_pool.c | 233 +-
 drivers/infiniband/core/iwcm.c | 9 +-
 drivers/infiniband/core/local_sa.c | 1273 ++++++
 drivers/infiniband/core/mad.c | 174 +-
 drivers/infiniband/core/mad_priv.h | 3 +-
 drivers/infiniband/core/multicast.c | 60 +-
 drivers/infiniband/core/notice.c | 749 ++++
 drivers/infiniband/core/sa.h | 39 +
 drivers/infiniband/core/sa_query.c | 610 ++-
 drivers/infiniband/core/smi.c | 8 -
 drivers/infiniband/core/sysfs.c | 180 +-
 drivers/infiniband/core/ucm.c | 96 +-
 drivers/infiniband/core/ucma.c | 133 +-
 drivers/infiniband/core/ud_header.c | 83 +-
 drivers/infiniband/core/umem.c | 67 +-
 drivers/infiniband/core/user_mad.c | 217 +-
 drivers/infiniband/core/uverbs.h | 46 +-
 drivers/infiniband/core/uverbs_cmd.c | 1798 +++++++-
 drivers/infiniband/core/uverbs_main.c | 366 +-
 drivers/infiniband/core/uverbs_marshall.c | 4 -
 drivers/infiniband/core/verbs.c | 200 +-
 drivers/infiniband/hw/amso1100/c2_provider.c | 2 +-
 drivers/infiniband/hw/cxgb3/iwch_provider.c | 2 +-
 drivers/infiniband/hw/cxgb4/provider.c | 2 +-
 drivers/infiniband/hw/ehca/ehca_main.c | 2 +-
 drivers/infiniband/hw/ipath/ipath_verbs.c | 2 +-
 drivers/infiniband/hw/mlx4/Makefile | 4 +-
 drivers/infiniband/hw/mlx4/ah.c | 119 +-
 drivers/infiniband/hw/mlx4/alias_GUID.c | 817 ++++
 drivers/infiniband/hw/mlx4/alias_GUID.h | 112 +
 drivers/infiniband/hw/mlx4/cm.c | 538 +++
 drivers/infiniband/hw/mlx4/cq.c | 144 +-
 drivers/infiniband/hw/mlx4/ib_events.c | 326 ++
 drivers/infiniband/hw/mlx4/ib_events.h | 115 +
 drivers/infiniband/hw/mlx4/mad.c | 1771 +++++++-
 drivers/infiniband/hw/mlx4/main.c | 1044 ++++-
 drivers/infiniband/hw/mlx4/mcg.c | 1504 +++++++
 drivers/infiniband/hw/mlx4/mlx4_ib.h | 419 +-
 drivers/infiniband/hw/mlx4/mr.c | 123 +-
 drivers/infiniband/hw/mlx4/qp.c | 1489 ++++++-
 drivers/infiniband/hw/mlx4/srq.c | 70 +-
 drivers/infiniband/hw/mlx4/sysfs.c | 801 ++++
 drivers/infiniband/hw/mlx4/wc.c | 74 +
 drivers/infiniband/hw/mlx4/wc.h | 41 +
 drivers/infiniband/hw/mthca/mthca_cmd.c | 2 +-
 drivers/infiniband/hw/mthca/mthca_provider.c | 2 +-
 drivers/infiniband/hw/nes/nes_verbs.c | 2 +-
 drivers/infiniband/hw/qib/qib_verbs.c | 2 +-
 drivers/infiniband/ulp/ipoib/ipoib.h | 168 +-
 drivers/infiniband/ulp/ipoib/ipoib_cm.c | 215 +-
 drivers/infiniband/ulp/ipoib/ipoib_ethtool.c | 204 +-
 drivers/infiniband/ulp/ipoib/ipoib_fs.c | 33 +-
 drivers/infiniband/ulp/ipoib/ipoib_ib.c | 171 +-
drivers/infiniband/ulp/ipoib/ipoib_main.c | 1177 ++++-- .../infiniband/ulp/ipoib/ipoib_multicast.c | 269 +- drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 17 +- drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 72 +- drivers/infiniband/ulp/iser/iscsi_iser.c | 165 +- drivers/infiniband/ulp/iser/iscsi_iser.h | 104 +- drivers/infiniband/ulp/iser/iser_initiator.c | 522 ++- drivers/infiniband/ulp/iser/iser_memory.c | 186 +- drivers/infiniband/ulp/iser/iser_verbs.c | 392 +- drivers/infiniband/ulp/sdp/Kconfig | 28 + drivers/infiniband/ulp/sdp/Makefile | 6 + drivers/infiniband/ulp/sdp/sdp.h | 997 +++++ drivers/infiniband/ulp/sdp/sdp_bcopy.c | 313 ++ drivers/infiniband/ulp/sdp/sdp_cma.c | 650 +++ drivers/infiniband/ulp/sdp/sdp_dbg.h | 301 ++ drivers/infiniband/ulp/sdp/sdp_main.c | 3014 ++++++++++++++ drivers/infiniband/ulp/sdp/sdp_proc.c | 912 +++++ drivers/infiniband/ulp/sdp/sdp_rx.c | 951 +++++ drivers/infiniband/ulp/sdp/sdp_tx.c | 530 +++ drivers/infiniband/ulp/sdp/sdp_zcopy.c | 795 ++++ drivers/infiniband/ulp/srp/ib_srp.c | 1693 ++++---- drivers/infiniband/ulp/srp/ib_srp.h | 104 +- drivers/net/Kconfig | 7 + drivers/net/Makefile | 1 + drivers/net/mlx4/Makefile | 7 +- drivers/net/mlx4/alloc.c | 90 +- drivers/net/mlx4/catas.c | 20 +- drivers/net/mlx4/cmd.c | 1609 +++++++- drivers/net/mlx4/cq.c | 169 +- drivers/net/mlx4/en_cq.c | 47 +- drivers/net/mlx4/en_ethtool.c | 183 +- drivers/net/mlx4/en_frag.c | 211 + drivers/net/mlx4/en_main.c | 167 +- drivers/net/mlx4/en_netdev.c | 407 +- drivers/net/mlx4/en_params.c | 485 +++ drivers/net/mlx4/en_port.c | 226 +- drivers/net/mlx4/en_port.h | 547 +-- drivers/net/mlx4/en_resources.c | 4 +- drivers/net/mlx4/en_rx.c | 464 ++- drivers/net/mlx4/en_selftest.c | 9 +- drivers/net/mlx4/en_tx.c | 236 +- drivers/net/mlx4/eq.c | 1023 +++-- drivers/net/mlx4/fmr_api.h | 102 + drivers/net/mlx4/fmr_master.c | 279 ++ drivers/net/mlx4/fmr_master.h | 26 + drivers/net/mlx4/fmr_slave.c | 179 + drivers/net/mlx4/fmr_slave.h | 16 + drivers/net/mlx4/fw.c | 635 ++- drivers/net/mlx4/fw.h | 22 + drivers/net/mlx4/icm.c | 476 ++- drivers/net/mlx4/icm.h | 53 +- drivers/net/mlx4/intf.c | 53 +- drivers/net/mlx4/main.c | 1389 +++++-- drivers/net/mlx4/mcg.c | 329 +- drivers/net/mlx4/mlx4.h | 1117 ++++- drivers/net/mlx4/mlx4_en.h | 171 +- drivers/net/mlx4/mr.c | 809 +++- drivers/net/mlx4/pd.c | 31 +- drivers/net/mlx4/pkey.c | 81 + drivers/net/mlx4/port.c | 733 +++- drivers/net/mlx4/profile.c | 31 +- drivers/net/mlx4/qp.c | 306 +- drivers/net/mlx4/reset.c | 36 + drivers/net/mlx4/resource_tracker.c | 3634 +++++++++++++++++ drivers/net/mlx4/rt_torture.c | 312 ++ drivers/net/mlx4/sense.c | 36 +- drivers/net/mlx4/srq.c | 159 +- drivers/net/mlx4/xen_fmr.h | 129 + drivers/net/mlx4/xen_fmr_master.c | 573 +++ drivers/net/mlx4/xen_fmr_slave.c | 360 ++ drivers/net/mlx4/xrcd.c | 106 + drivers/net/mlx4_vnic/Makefile | 14 + drivers/net/mlx4_vnic/fip_parser.c | 510 +++ drivers/net/mlx4_vnic/vnic.h | 1385 +++++++ drivers/net/mlx4_vnic/vnic_data.h | 132 + drivers/net/mlx4_vnic/vnic_data_ethtool.c | 427 ++ drivers/net/mlx4_vnic/vnic_data_fs.c | 922 +++++ drivers/net/mlx4_vnic/vnic_data_ib.c | 1632 ++++++++ drivers/net/mlx4_vnic/vnic_data_mac.c | 375 ++ drivers/net/mlx4_vnic/vnic_data_main.c | 1119 +++++ drivers/net/mlx4_vnic/vnic_data_neigh.c | 164 + drivers/net/mlx4_vnic/vnic_data_netdev.c | 1071 +++++ drivers/net/mlx4_vnic/vnic_data_rx.c | 678 +++ drivers/net/mlx4_vnic/vnic_data_tx.c | 476 +++ drivers/net/mlx4_vnic/vnic_fip.h | 1025 +++++ drivers/net/mlx4_vnic/vnic_fip_discover.c | 1936 +++++++++ 
drivers/net/mlx4_vnic/vnic_fip_discover.h | 167 + drivers/net/mlx4_vnic/vnic_fip_ib.c | 440 ++ drivers/net/mlx4_vnic/vnic_fip_login.c | 1727 ++++++++ drivers/net/mlx4_vnic/vnic_fip_main.c | 174 + drivers/net/mlx4_vnic/vnic_fip_pkt.c | 856 ++++ drivers/net/mlx4_vnic/vnic_fip_pkt.h | 40 + drivers/net/mlx4_vnic/vnic_fip_vhub.c | 635 +++ drivers/net/mlx4_vnic/vnic_main.c | 84 + drivers/net/mlx4_vnic/vnic_mcast.c | 1095 +++++ drivers/net/mlx4_vnic/vnic_mcast.h | 0 drivers/net/mlx4_vnic/vnic_param.c | 181 + drivers/net/mlx4_vnic/vnic_port.c | 518 +++ drivers/net/mlx4_vnic/vnic_qp.c | 1496 +++++++ drivers/net/mlx4_vnic/vnic_stats_helper.c | 104 + drivers/net/mlx4_vnic/vnic_utils.h | 317 ++ include/linux/mlx4/cmd.h | 95 +- include/linux/mlx4/device.h | 1068 ++++- include/linux/mlx4/driver.h | 29 +- include/linux/mlx4/qp.h | 31 +- include/linux/mlx4/srq.h | 12 + include/linux/rds.h | 156 +- include/rdma/ib_addr.h | 47 +- include/rdma/ib_cache.h | 16 + include/rdma/ib_cm.h | 73 +- include/rdma/ib_fmr_pool.h | 15 +- include/rdma/ib_mad.h | 11 +- include/rdma/ib_pack.h | 20 +- include/rdma/ib_pma.h | 156 + include/rdma/ib_sa.h | 209 +- include/rdma/ib_umem.h | 2 + include/rdma/ib_user_cm.h | 1 - include/rdma/ib_user_verbs.h | 161 +- include/rdma/ib_verbs.h | 224 +- include/rdma/iw_cm.h | 11 +- include/rdma/rdma_cm.h | 60 +- include/rdma/rdma_user_cm.h | 8 +- include/rdma/sdp_socket.h | 24 + net/9p/trans_rdma.c | 3 +- net/rds/af_rds.c | 167 +- net/rds/bind.c | 49 +- net/rds/cong.c | 22 +- net/rds/connection.c | 79 +- net/rds/ib.c | 1111 ++++- net/rds/ib.h | 233 +- net/rds/ib_cm.c | 618 ++- net/rds/ib_rdma.c | 178 +- net/rds/ib_recv.c | 777 +++- net/rds/ib_ring.c | 4 +- net/rds/ib_send.c | 268 +- net/rds/ib_stats.c | 31 +- net/rds/ib_sysctl.c | 15 +- net/rds/info.c | 1 - net/rds/iw.c | 11 +- net/rds/iw.h | 6 +- net/rds/iw_cm.c | 8 +- net/rds/iw_rdma.c | 14 +- net/rds/iw_recv.c | 15 +- net/rds/iw_ring.c | 2 +- net/rds/iw_send.c | 23 +- net/rds/iw_stats.c | 4 +- net/rds/iw_sysctl.c | 16 +- net/rds/loop.c | 16 +- net/rds/message.c | 39 +- net/rds/page.c | 39 +- net/rds/rdma.c | 210 +- net/rds/rdma_transport.c | 130 +- net/rds/rdma_transport.h | 4 + net/rds/rds.h | 80 +- net/rds/recv.c | 48 +- net/rds/send.c | 470 ++- net/rds/stats.c | 4 +- net/rds/sysctl.c | 31 +- net/rds/tcp.c | 13 +- net/rds/tcp.h | 14 + net/rds/tcp_connect.c | 13 +- net/rds/tcp_listen.c | 15 +- net/rds/tcp_recv.c | 8 +- net/rds/tcp_send.c | 10 +- net/rds/tcp_stats.c | 2 +- net/rds/threads.c | 66 +- net/rds/transport.c | 3 +- net/sunrpc/xprtrdma/svc_rdma_transport.c | 3 +- net/sunrpc/xprtrdma/verbs.c | 2 +- 233 files changed, 69811 insertions(+), 8062 deletions(-) create mode 100644 drivers/infiniband/core/local_sa.c create mode 100644 drivers/infiniband/core/notice.c create mode 100644 drivers/infiniband/hw/mlx4/alias_GUID.c create mode 100644 drivers/infiniband/hw/mlx4/alias_GUID.h create mode 100644 drivers/infiniband/hw/mlx4/cm.c create mode 100644 drivers/infiniband/hw/mlx4/ib_events.c create mode 100644 drivers/infiniband/hw/mlx4/ib_events.h create mode 100644 drivers/infiniband/hw/mlx4/mcg.c create mode 100644 drivers/infiniband/hw/mlx4/sysfs.c create mode 100644 drivers/infiniband/hw/mlx4/wc.c create mode 100644 drivers/infiniband/hw/mlx4/wc.h create mode 100644 drivers/infiniband/ulp/sdp/Kconfig create mode 100644 drivers/infiniband/ulp/sdp/Makefile create mode 100644 drivers/infiniband/ulp/sdp/sdp.h create mode 100644 drivers/infiniband/ulp/sdp/sdp_bcopy.c create mode 100644 drivers/infiniband/ulp/sdp/sdp_cma.c create mode 
100644 drivers/infiniband/ulp/sdp/sdp_dbg.h create mode 100644 drivers/infiniband/ulp/sdp/sdp_main.c create mode 100644 drivers/infiniband/ulp/sdp/sdp_proc.c create mode 100644 drivers/infiniband/ulp/sdp/sdp_rx.c create mode 100644 drivers/infiniband/ulp/sdp/sdp_tx.c create mode 100644 drivers/infiniband/ulp/sdp/sdp_zcopy.c create mode 100644 drivers/net/mlx4/en_frag.c create mode 100644 drivers/net/mlx4/en_params.c create mode 100644 drivers/net/mlx4/fmr_api.h create mode 100644 drivers/net/mlx4/fmr_master.c create mode 100644 drivers/net/mlx4/fmr_master.h create mode 100644 drivers/net/mlx4/fmr_slave.c create mode 100644 drivers/net/mlx4/fmr_slave.h create mode 100644 drivers/net/mlx4/pkey.c create mode 100644 drivers/net/mlx4/resource_tracker.c create mode 100644 drivers/net/mlx4/rt_torture.c create mode 100644 drivers/net/mlx4/xen_fmr.h create mode 100644 drivers/net/mlx4/xen_fmr_master.c create mode 100644 drivers/net/mlx4/xen_fmr_slave.c create mode 100644 drivers/net/mlx4/xrcd.c create mode 100644 drivers/net/mlx4_vnic/Makefile create mode 100644 drivers/net/mlx4_vnic/fip_parser.c create mode 100644 drivers/net/mlx4_vnic/vnic.h create mode 100644 drivers/net/mlx4_vnic/vnic_data.h create mode 100644 drivers/net/mlx4_vnic/vnic_data_ethtool.c create mode 100644 drivers/net/mlx4_vnic/vnic_data_fs.c create mode 100644 drivers/net/mlx4_vnic/vnic_data_ib.c create mode 100644 drivers/net/mlx4_vnic/vnic_data_mac.c create mode 100644 drivers/net/mlx4_vnic/vnic_data_main.c create mode 100644 drivers/net/mlx4_vnic/vnic_data_neigh.c create mode 100644 drivers/net/mlx4_vnic/vnic_data_netdev.c create mode 100644 drivers/net/mlx4_vnic/vnic_data_rx.c create mode 100644 drivers/net/mlx4_vnic/vnic_data_tx.c create mode 100644 drivers/net/mlx4_vnic/vnic_fip.h create mode 100644 drivers/net/mlx4_vnic/vnic_fip_discover.c create mode 100644 drivers/net/mlx4_vnic/vnic_fip_discover.h create mode 100644 drivers/net/mlx4_vnic/vnic_fip_ib.c create mode 100644 drivers/net/mlx4_vnic/vnic_fip_login.c create mode 100644 drivers/net/mlx4_vnic/vnic_fip_main.c create mode 100644 drivers/net/mlx4_vnic/vnic_fip_pkt.c create mode 100644 drivers/net/mlx4_vnic/vnic_fip_pkt.h create mode 100644 drivers/net/mlx4_vnic/vnic_fip_vhub.c create mode 100644 drivers/net/mlx4_vnic/vnic_main.c create mode 100644 drivers/net/mlx4_vnic/vnic_mcast.c create mode 100644 drivers/net/mlx4_vnic/vnic_mcast.h create mode 100644 drivers/net/mlx4_vnic/vnic_param.c create mode 100644 drivers/net/mlx4_vnic/vnic_port.c create mode 100644 drivers/net/mlx4_vnic/vnic_qp.c create mode 100644 drivers/net/mlx4_vnic/vnic_stats_helper.c create mode 100644 drivers/net/mlx4_vnic/vnic_utils.h create mode 100644 include/rdma/ib_pma.h create mode 100644 include/rdma/sdp_socket.h diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 0f9a84c1046ab..be97cdbe30676 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -58,4 +58,6 @@ source "drivers/infiniband/ulp/srp/Kconfig" source "drivers/infiniband/ulp/iser/Kconfig" +source "drivers/infiniband/ulp/sdp/Kconfig" + endif # INFINIBAND diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile index 9cc7a47d3e673..ef6dea2c99a76 100644 --- a/drivers/infiniband/Makefile +++ b/drivers/infiniband/Makefile @@ -11,3 +11,4 @@ obj-$(CONFIG_INFINIBAND_NES) += hw/nes/ obj-$(CONFIG_INFINIBAND_IPOIB) += ulp/ipoib/ obj-$(CONFIG_INFINIBAND_SRP) += ulp/srp/ obj-$(CONFIG_INFINIBAND_ISER) += ulp/iser/ +obj-$(CONFIG_INFINIBAND_SDP) += ulp/sdp/ diff --git 
a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index c8bbaef1becb1..f21773e833a14 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -13,7 +13,7 @@ ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o ib_mad-y := mad.o smi.o agent.o mad_rmpp.o -ib_sa-y := sa_query.o multicast.o +ib_sa-y := sa_query.o multicast.o notice.o local_sa.o ib_cm-y := cm.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index f2a84c6f85433..a07dc93f1512b 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -35,7 +35,6 @@ #include #include -#include #include #include #include @@ -130,8 +129,8 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) case AF_INET6: - rcu_read_lock(); - for_each_netdev_rcu(&init_net, dev) { + read_lock(&dev_base_lock); + for_each_netdev(&init_net, dev) { if (ipv6_chk_addr(&init_net, &((struct sockaddr_in6 *) addr)->sin6_addr, dev, 1)) { @@ -139,7 +138,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) break; } } - rcu_read_unlock(); + read_unlock(&dev_base_lock); break; #endif } @@ -185,20 +184,15 @@ static int addr4_resolve(struct sockaddr_in *src_in, __be32 dst_ip = dst_in->sin_addr.s_addr; struct rtable *rt; struct neighbour *neigh; - struct flowi4 fl4; int ret; - memset(&fl4, 0, sizeof(fl4)); - fl4.daddr = dst_ip; - fl4.saddr = src_ip; - fl4.flowi4_oif = addr->bound_dev_if; - rt = ip_route_output_key(&init_net, &fl4); + rt = ip_route_output(&init_net, dst_ip, src_ip, 0, addr->bound_dev_if); if (IS_ERR(rt)) { ret = PTR_ERR(rt); goto out; } src_in->sin_family = AF_INET; - src_in->sin_addr.s_addr = fl4.saddr; + src_in->sin_addr.s_addr = rt->rt_src; if (rt->dst.dev->flags & IFF_LOOPBACK) { ret = rdma_translate_ip((struct sockaddr *) dst_in, addr); @@ -215,9 +209,7 @@ static int addr4_resolve(struct sockaddr_in *src_in, neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, rt->dst.dev); if (!neigh || !(neigh->nud_state & NUD_VALID)) { - rcu_read_lock(); neigh_event_send(dst_get_neighbour(&rt->dst), NULL); - rcu_read_unlock(); ret = -ENODATA; if (neigh) goto release; @@ -274,17 +266,15 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, ret = rdma_copy_addr(addr, dst->dev, NULL); goto put; } - - rcu_read_lock(); + neigh = dst_get_neighbour(dst); if (!neigh || !(neigh->nud_state & NUD_VALID)) { - if (neigh) - neigh_event_send(neigh, NULL); + neigh_event_send(neigh, NULL); ret = -ENODATA; - } else { - ret = rdma_copy_addr(addr, dst->dev, neigh->ha); + goto put; } - rcu_read_unlock(); + + ret = rdma_copy_addr(addr, dst->dev, neigh->ha); put: dst_release(dst); return ret; @@ -440,7 +430,7 @@ static struct notifier_block nb = { .notifier_call = netevent_callback }; -static int __init addr_init(void) +static int addr_init(void) { addr_wq = create_singlethread_workqueue("ib_addr"); if (!addr_wq) @@ -450,7 +440,7 @@ static int __init addr_init(void) return 0; } -static void __exit addr_cleanup(void) +static void addr_cleanup(void) { unregister_netevent_notifier(&nb); destroy_workqueue(addr_wq); diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c index 2bc7f5af64f42..964f4fb301484 100644 --- a/drivers/infiniband/core/agent.c +++ b/drivers/infiniband/core/agent.c @@ -48,6 +48,8 @@ struct ib_agent_port_private { struct list_head port_list; struct ib_mad_agent *agent[2]; + struct ib_device *device; + u8 port_num; }; static 
DEFINE_SPINLOCK(ib_agent_port_list_lock); @@ -58,11 +60,10 @@ __ib_get_agent_port(struct ib_device *device, int port_num) { struct ib_agent_port_private *entry; - list_for_each_entry(entry, &ib_agent_port_list, port_list) { - if (entry->agent[1]->device == device && - entry->agent[1]->port_num == port_num) + list_for_each_entry(entry, &ib_agent_port_list, port_list) + if (entry->device == device && entry->port_num == port_num) return entry; - } + return NULL; } @@ -101,8 +102,7 @@ void agent_send_response(struct ib_mad *mad, struct ib_grh *grh, agent = port_priv->agent[qpn]; ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num); if (IS_ERR(ah)) { - printk(KERN_ERR SPFX "ib_create_ah_from_wc error %ld\n", - PTR_ERR(ah)); + printk(KERN_ERR SPFX "ib_create_ah_from_wc error\n"); return; } @@ -156,7 +156,7 @@ int ib_agent_port_open(struct ib_device *device, int port_num) goto error1; } - if (rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND) { + if (rdma_port_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND) { /* Obtain send only MAD agent for SMI QP */ port_priv->agent[0] = ib_register_mad_agent(device, port_num, IB_QPT_SMI, NULL, 0, @@ -178,6 +178,9 @@ int ib_agent_port_open(struct ib_device *device, int port_num) goto error3; } + port_priv->device = device; + port_priv->port_num = port_num; + spin_lock_irqsave(&ib_agent_port_list_lock, flags); list_add_tail(&port_priv->port_list, &ib_agent_port_list); spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); @@ -185,7 +188,7 @@ int ib_agent_port_open(struct ib_device *device, int port_num) return 0; error3: - if (port_priv->agent[0]) + if (rdma_port_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND) ib_unregister_mad_agent(port_priv->agent[0]); error2: kfree(port_priv); @@ -209,7 +212,7 @@ int ib_agent_port_close(struct ib_device *device, int port_num) spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); ib_unregister_mad_agent(port_priv->agent[1]); - if (port_priv->agent[0]) + if (rdma_port_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND) ib_unregister_mad_agent(port_priv->agent[0]); kfree(port_priv); diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index f9ba7d74dfc03..ea4f4b8afa760 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -85,7 +85,7 @@ int ib_get_cached_gid(struct ib_device *device, cache = device->cache.gid_cache[port_num - start_port(device)]; - if (index < 0 || index >= cache->table_len) + if (!cache || index < 0 || index >= cache->table_len) ret = -EINVAL; else *gid = cache->table[index]; @@ -114,6 +114,10 @@ int ib_find_cached_gid(struct ib_device *device, for (p = 0; p <= end_port(device) - start_port(device); ++p) { cache = device->cache.gid_cache[p]; + if (!cache) { + ret = -EINVAL; + goto found; + } for (i = 0; i < cache->table_len; ++i) { if (!memcmp(gid, &cache->table[i], sizeof *gid)) { *port_num = p + start_port(device); @@ -147,7 +151,7 @@ int ib_get_cached_pkey(struct ib_device *device, cache = device->cache.pkey_cache[port_num - start_port(device)]; - if (index < 0 || index >= cache->table_len) + if (!cache || index < 0 || index >= cache->table_len) ret = -EINVAL; else *pkey = cache->table[index]; @@ -167,6 +171,7 @@ int ib_find_cached_pkey(struct ib_device *device, unsigned long flags; int i; int ret = -ENOENT; + int partial_ix = -1; if (port_num < start_port(device) || port_num > end_port(device)) return -EINVAL; @@ -174,21 +179,68 @@ int ib_find_cached_pkey(struct ib_device *device, 
read_lock_irqsave(&device->cache.lock, flags); cache = device->cache.pkey_cache[port_num - start_port(device)]; + if (!cache) { + ret = -EINVAL; + goto out; + } *index = -1; - for (i = 0; i < cache->table_len; ++i) if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) { + if (cache->table[i] & 0x8000) { + *index = i; + ret = 0; + break; + } else + partial_ix = i; + } + if (ret && partial_ix >= 0) { + *index = partial_ix; + ret = 0; + } +out: + read_unlock_irqrestore(&device->cache.lock, flags); + + return ret; +} +EXPORT_SYMBOL(ib_find_cached_pkey); + +int ib_find_exact_cached_pkey(struct ib_device *device, + u8 port_num, + u16 pkey, + u16 *index) +{ + struct ib_pkey_cache *cache; + unsigned long flags; + int i; + int ret = -ENOENT; + + if (port_num < start_port(device) || port_num > end_port(device)) + return -EINVAL; + + read_lock_irqsave(&device->cache.lock, flags); + + cache = device->cache.pkey_cache[port_num - start_port(device)]; + if (!cache) { + ret = -EINVAL; + goto out; + } + + *index = -1; + + for (i = 0; i < cache->table_len; ++i) + if (cache->table[i] == pkey) { *index = i; ret = 0; break; } +out: read_unlock_irqrestore(&device->cache.lock, flags); return ret; } -EXPORT_SYMBOL(ib_find_cached_pkey); +EXPORT_SYMBOL(ib_find_exact_cached_pkey); int ib_get_cached_lmc(struct ib_device *device, u8 port_num, @@ -302,13 +354,14 @@ static void ib_cache_event(struct ib_event_handler *handler, event->event == IB_EVENT_LID_CHANGE || event->event == IB_EVENT_PKEY_CHANGE || event->event == IB_EVENT_SM_CHANGE || - event->event == IB_EVENT_CLIENT_REREGISTER) { + event->event == IB_EVENT_CLIENT_REREGISTER || + event->event == IB_EVENT_GID_CHANGE) { work = kmalloc(sizeof *work, GFP_ATOMIC); if (work) { INIT_WORK(&work->work, ib_cache_task); work->device = event->device; work->port_num = event->element.port_num; - queue_work(ib_wq, &work->work); + schedule_work(&work->work); } } } @@ -368,7 +421,7 @@ static void ib_cache_cleanup_one(struct ib_device *device) int p; ib_unregister_event_handler(&device->cache.event_handler); - flush_workqueue(ib_wq); + flush_scheduled_work(); for (p = 0; p <= end_port(device) - start_port(device); ++p) { kfree(device->cache.pkey_cache[p]); diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index fc0f2bd9ca825..96b0d95fe10d7 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -55,6 +55,17 @@ MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("InfiniBand CM"); MODULE_LICENSE("Dual BSD/GPL"); +#define PFX "ib_cm: " + +/* + * Limit CM message timeouts to something reasonable: + * 8 seconds per message, with up to 15 retries + */ +static int max_timeout = 21; +module_param(max_timeout, int, 0644); +MODULE_PARM_DESC(max_timeout, "Maximum IB CM per message timeout " + "(default=21, or ~8 seconds)"); + static void cm_add_one(struct ib_device *device); static void cm_remove_one(struct ib_device *device); @@ -92,7 +103,9 @@ enum { CM_SIDR_REQ_COUNTER, CM_SIDR_REP_COUNTER, CM_LAP_COUNTER, + CM_SAP_COUNTER, CM_APR_COUNTER, + CM_SPR_COUNTER, CM_ATTR_COUNT, CM_ATTR_ID_OFFSET = 0x0010, }; @@ -210,6 +223,7 @@ struct cm_id_private { atomic_t refcount; struct ib_mad_send_buf *msg; + struct ib_mad_send_buf *lap_msg; struct cm_timewait_info *timewait_info; /* todo: use alternate port on send failure */ struct cm_av av; @@ -352,6 +366,7 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) unsigned long flags; int ret; u8 p; + int force_grh; read_lock_irqsave(&cm.device_lock, flags); list_for_each_entry(cm_dev, 
&cm.device_list, list) { @@ -372,8 +387,10 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) return ret; av->port = port; + force_grh = rdma_port_link_layer(cm_dev->ib_device, port->port_num) == + IB_LINK_LAYER_ETHERNET ? 1 : 0; ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path, - &av->ah_attr); + &av->ah_attr, force_grh); av->timeout = path->packet_life_time + 1; return 0; } @@ -781,11 +798,11 @@ static void cm_cleanup_timewait(struct cm_timewait_info *timewait_info) } } -static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id) +static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id, gfp_t flags) { struct cm_timewait_info *timewait_info; - timewait_info = kzalloc(sizeof *timewait_info, GFP_KERNEL); + timewait_info = kzalloc(sizeof *timewait_info, flags); if (!timewait_info) return ERR_PTR(-ENOMEM); @@ -839,6 +856,24 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err) cm_id_priv = container_of(cm_id, struct cm_id_private, id); retest: spin_lock_irq(&cm_id_priv->lock); + + /* handle lap states first */ + switch (cm_id->lap_state) { + case IB_CM_LAP_UNINIT: + case IB_CM_LAP_IDLE: + break; + case IB_CM_LAP_SENT: + cm_id_priv->id.lap_state = IB_CM_LAP_IDLE; + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->lap_msg); + cm_id_priv->lap_msg = NULL; + break; + case IB_CM_LAP_RCVD: + case IB_CM_MRA_LAP_SENT: + case IB_CM_MRA_LAP_RCVD: + default: + break; + } + switch (cm_id->state) { case IB_CM_LISTEN: cm_id->state = IB_CM_IDLE; @@ -905,6 +940,7 @@ retest: break; } + cm_free_id(cm_id->local_id); cm_deref_id(cm_id_priv); wait_for_completion(&cm_id_priv->comp); @@ -1012,11 +1048,23 @@ static void cm_format_req(struct cm_req_msg *req_msg, cm_req_set_init_depth(req_msg, param->initiator_depth); cm_req_set_remote_resp_timeout(req_msg, param->remote_cm_response_timeout); + if (param->remote_cm_response_timeout > (u8) max_timeout) { + printk(KERN_WARNING PFX "req remote_cm_response_timeout %d > " + "%d, decreasing\n", param->remote_cm_response_timeout, + max_timeout); + cm_req_set_remote_resp_timeout(req_msg, (u8) max_timeout); + } cm_req_set_qp_type(req_msg, param->qp_type); cm_req_set_flow_ctrl(req_msg, param->flow_control); cm_req_set_starting_psn(req_msg, cpu_to_be32(param->starting_psn)); cm_req_set_local_resp_timeout(req_msg, param->local_cm_response_timeout); + if (param->local_cm_response_timeout > (u8) max_timeout) { + printk(KERN_WARNING PFX "req local_cm_response_timeout %d > " + "%d, decreasing\n", param->local_cm_response_timeout, + max_timeout); + cm_req_set_local_resp_timeout(req_msg, (u8) max_timeout); + } cm_req_set_retry_count(req_msg, param->retry_count); req_msg->pkey = param->primary_path->pkey; cm_req_set_path_mtu(req_msg, param->primary_path->mtu); @@ -1066,6 +1114,12 @@ static void cm_format_req(struct cm_req_msg *req_msg, alt_path->packet_life_time)); } + /* + * this version supports APM extensions. R1 and and drivers + * not supporting SAP extensions ignore this field. 
+ */ + cm_req_set_sap_support(req_msg, 1); + if (param->private_data && param->private_data_len) memcpy(req_msg->private_data, param->private_data, param->private_data_len); @@ -1112,33 +1166,39 @@ int ib_send_cm_req(struct ib_cm_id *cm_id, spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_IDLE) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); - ret = -EINVAL; - goto out; + return -EINVAL; } - spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv-> - id.local_id); + id.local_id, + GFP_ATOMIC); if (IS_ERR(cm_id_priv->timewait_info)) { - ret = PTR_ERR(cm_id_priv->timewait_info); - goto out; + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return (PTR_ERR(cm_id_priv->timewait_info)); } ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av); - if (ret) - goto error1; - if (param->alternate_path) { + if (!ret && param->alternate_path) { ret = cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av); - if (ret) - goto error1; } + if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + goto error1; + } + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_id->service_id = param->service_id; cm_id->service_mask = ~cpu_to_be64(0); cm_id_priv->timeout_ms = cm_convert_to_ms( param->primary_path->packet_life_time) * 2 + cm_convert_to_ms( param->remote_cm_response_timeout); + if (cm_id_priv->timeout_ms > cm_convert_to_ms(max_timeout)) { + printk(KERN_WARNING PFX "req timeout_ms %d > %d, decreasing\n", + cm_id_priv->timeout_ms, cm_convert_to_ms(max_timeout)); + cm_id_priv->timeout_ms = cm_convert_to_ms(max_timeout); + } cm_id_priv->max_cm_retries = param->max_cm_retries; cm_id_priv->initiator_depth = param->initiator_depth; cm_id_priv->responder_resources = param->responder_resources; @@ -1171,9 +1231,11 @@ int ib_send_cm_req(struct ib_cm_id *cm_id, spin_unlock_irqrestore(&cm_id_priv->lock, flags); return 0; -error2: cm_free_msg(cm_id_priv->msg); -error1: kfree(cm_id_priv->timewait_info); -out: return ret; +error2: + cm_free_msg(cm_id_priv->msg); +error1: + kfree(cm_id_priv->timewait_info); + return ret; } EXPORT_SYMBOL(ib_send_cm_req); @@ -1526,7 +1588,8 @@ static int cm_req_handler(struct cm_work *work) work->mad_recv_wc->recv_buf.grh, &cm_id_priv->av); cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv-> - id.local_id); + id.local_id, + GFP_KERNEL); if (IS_ERR(cm_id_priv->timewait_info)) { ret = PTR_ERR(cm_id_priv->timewait_info); goto destroy; @@ -1570,6 +1633,13 @@ static int cm_req_handler(struct cm_work *work) cm_id_priv->tid = req_msg->hdr.tid; cm_id_priv->timeout_ms = cm_convert_to_ms( cm_req_get_local_resp_timeout(req_msg)); + if (cm_req_get_local_resp_timeout(req_msg) > (u8) max_timeout) { + printk(KERN_WARNING PFX "rcvd cm_local_resp_timeout %d > %d, " + "decreasing used timeout_ms\n", + cm_req_get_local_resp_timeout(req_msg), max_timeout); + cm_id_priv->timeout_ms = cm_convert_to_ms(max_timeout); + } + cm_id_priv->max_cm_retries = cm_req_get_max_cm_retries(req_msg); cm_id_priv->remote_qpn = cm_req_get_local_qpn(req_msg); cm_id_priv->initiator_depth = cm_req_get_resp_res(req_msg); @@ -1580,6 +1650,14 @@ static int cm_req_handler(struct cm_work *work) cm_id_priv->retry_count = cm_req_get_retry_count(req_msg); cm_id_priv->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg); cm_id_priv->qp_type = cm_req_get_qp_type(req_msg); + /* We only mark whether the remote explicitly declared SAP support. 
+ * Even if it did not, we assume it could be R1 implementation so + * we will not refrain from sending SAP messgaes. However, if we + * don't get a SPR for a SAP message, we will assume that the + * remote simply does not support SAP extenstions and we will + * refrain from sending any more SAPs + */ + cm_id->remote_sap_support = cm_req_get_sap_support(req_msg); cm_format_req_event(work, cm_id_priv, &listen_cm_id_priv->id); cm_process_work(cm_id_priv, work); @@ -2498,6 +2576,12 @@ static int cm_mra_handler(struct cm_work *work) cm_mra_get_service_timeout(mra_msg); timeout = cm_convert_to_ms(cm_mra_get_service_timeout(mra_msg)) + cm_convert_to_ms(cm_id_priv->av.timeout); + if (timeout > cm_convert_to_ms(max_timeout)) { + printk(KERN_WARNING PFX "calculated mra timeout %d > %d, " + "decreasing used timeout_ms\n", timeout, + cm_convert_to_ms(max_timeout)); + timeout = cm_convert_to_ms(max_timeout); + } spin_lock_irq(&cm_id_priv->lock); switch (cm_id_priv->id.state) { @@ -2555,6 +2639,25 @@ out: return -EINVAL; } +static void cm_format_sap(struct cm_sap_msg *sap_msg, + struct cm_id_private *cm_id_priv, + struct ib_sa_path_rec *alternate_path, + const void *private_data, + u8 private_data_len) +{ + cm_format_mad_hdr(&sap_msg->hdr, CM_SAP_ATTR_ID, + cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_LAP)); + sap_msg->local_comm_id = cm_id_priv->id.local_id; + sap_msg->remote_comm_id = cm_id_priv->id.remote_id; + cm_sap_set_remote_qpn(sap_msg, cm_id_priv->local_qpn); + /* todo: need remote CM response timeout */ + sap_msg->alt_local_lid = alternate_path->slid; + sap_msg->alt_local_gid = alternate_path->sgid; + + if (private_data && private_data_len) + memcpy(sap_msg->private_data, private_data, private_data_len); +} + static void cm_format_lap(struct cm_lap_msg *lap_msg, struct cm_id_private *cm_id_priv, struct ib_sa_path_rec *alternate_path, @@ -2632,13 +2735,71 @@ int ib_send_cm_lap(struct ib_cm_id *cm_id, } cm_id->lap_state = IB_CM_LAP_SENT; - cm_id_priv->msg = msg; + cm_id_priv->lap_msg = msg; out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_lap); +int ib_send_cm_sap(struct ib_cm_id *cm_id, + struct ib_sa_path_rec *alternate_path, + const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + unsigned long flags; + int ret; + + if (private_data && private_data_len > IB_CM_LAP_PRIVATE_DATA_SIZE) + return -EINVAL; + + if (cm_id->sap_support_disabled) + return -EPERM; + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state != IB_CM_ESTABLISHED || + (cm_id->sap_state != IB_CM_SAP_UNINIT && + cm_id->sap_state != IB_CM_SAP_IDLE)) { + ret = -EINVAL; + goto out; + } + + ret = cm_alloc_msg(cm_id_priv, &msg); + if (ret) + goto out; + + cm_format_sap((struct cm_sap_msg *) msg->mad, cm_id_priv, + alternate_path, private_data, private_data_len); + msg->timeout_ms = cm_id_priv->timeout_ms; + msg->context[1] = (void *) (unsigned long) IB_CM_ESTABLISHED; + + ret = ib_post_send_mad(msg, NULL); + if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_free_msg(msg); + return ret; + } + + cm_id->sap_state = IB_CM_SAP_SENT; + cm_id_priv->lap_msg = msg; + +out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_sap); + +static void cm_format_path_from_sap(struct cm_id_private *cm_id_priv, + struct ib_sa_path_rec *path, + struct cm_sap_msg *sap_msg) +{ + memset(path, 0, sizeof *path); + 
path->dgid = sap_msg->alt_local_gid; + path->dlid = sap_msg->alt_local_lid; +} + static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv, struct ib_sa_path_rec *path, struct cm_lap_msg *lap_msg) @@ -2663,6 +2824,64 @@ static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv, path->packet_life_time -= (path->packet_life_time > 0); } +static int cm_sap_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_sap_msg *sap_msg; + struct ib_cm_sap_event_param *param; + int ret; + __be32 qpn; + + /* todo: verify LAP request and send reject APR if invalid. */ + sap_msg = (struct cm_sap_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id(sap_msg->remote_comm_id, + sap_msg->local_comm_id); + if (!cm_id_priv) + return -EINVAL; + + param = &work->cm_event.param.sap_rcvd; + param->alternate_path = &work->path[0]; + cm_format_path_from_sap(cm_id_priv, param->alternate_path, sap_msg); + work->cm_event.private_data = &sap_msg->private_data; + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state != IB_CM_ESTABLISHED) + goto unlock; + + switch (cm_id_priv->id.sap_state) { + case IB_CM_SAP_UNINIT: + case IB_CM_SAP_IDLE: + break; + case IB_CM_SAP_RCVD: + atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. + counter[CM_SAP_COUNTER]); + goto unlock; + default: + goto unlock; + } + + qpn = cm_sap_get_remote_qpn(sap_msg); + if (qpn && qpn != cm_id_priv->remote_qpn) + goto unlock; + + cm_id_priv->id.sap_state = IB_CM_SAP_RCVD; + cm_id_priv->tid = sap_msg->hdr.tid; + ret = atomic_inc_and_test(&cm_id_priv->work_count); + if (!ret) + list_add_tail(&work->list, &cm_id_priv->work_list); + spin_unlock_irq(&cm_id_priv->lock); + + if (ret) + cm_process_work(cm_id_priv, work); + else + cm_deref_id(cm_id_priv); + return 0; + +unlock: spin_unlock_irq(&cm_id_priv->lock); + cm_deref_id(cm_id_priv); + return -EINVAL; +} + static int cm_lap_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; @@ -2803,6 +3022,72 @@ out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); } EXPORT_SYMBOL(ib_send_cm_apr); +static void cm_format_spr(struct cm_spr_msg *spr_msg, + struct cm_id_private *cm_id_priv, + enum ib_cm_spr_status status, + void *info, + u8 info_length, + const void *private_data, + u8 private_data_len) +{ + cm_format_mad_hdr(&spr_msg->hdr, CM_SPR_ATTR_ID, cm_id_priv->tid); + spr_msg->local_comm_id = cm_id_priv->id.local_id; + spr_msg->remote_comm_id = cm_id_priv->id.remote_id; + spr_msg->ap_status = (u8) status; + + if (info && info_length) { + spr_msg->info_length = info_length; + memcpy(spr_msg->info, info, info_length); + } + + if (private_data && private_data_len) + memcpy(spr_msg->private_data, private_data, private_data_len); +} + +int ib_send_cm_spr(struct ib_cm_id *cm_id, + enum ib_cm_spr_status status, + void *info, + u8 info_length, + const void *private_data, + u8 private_data_len) +{ + struct cm_id_private *cm_id_priv; + struct ib_mad_send_buf *msg; + unsigned long flags; + int ret; + + if ((private_data && private_data_len > IB_CM_APR_PRIVATE_DATA_SIZE) || + (info && info_length > IB_CM_APR_INFO_LENGTH)) { + return -EINVAL; + } + + cm_id_priv = container_of(cm_id, struct cm_id_private, id); + spin_lock_irqsave(&cm_id_priv->lock, flags); + if (cm_id->state != IB_CM_ESTABLISHED || + (cm_id->sap_state != IB_CM_SAP_RCVD )) { + ret = -EINVAL; + goto out; + } + + ret = cm_alloc_msg(cm_id_priv, &msg); + if (ret) + goto out; + + cm_format_spr((struct cm_spr_msg *) msg->mad, cm_id_priv, status, + info, info_length, 
private_data, private_data_len); + ret = ib_post_send_mad(msg, NULL); + if (ret) { + spin_unlock_irqrestore(&cm_id_priv->lock, flags); + cm_free_msg(msg); + return ret; + } + + cm_id->sap_state = IB_CM_SAP_IDLE; +out: spin_unlock_irqrestore(&cm_id_priv->lock, flags); + return ret; +} +EXPORT_SYMBOL(ib_send_cm_spr); + static int cm_apr_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; @@ -2828,8 +3113,50 @@ static int cm_apr_handler(struct cm_work *work) goto out; } cm_id_priv->id.lap_state = IB_CM_LAP_IDLE; - ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - cm_id_priv->msg = NULL; + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->lap_msg); + cm_id_priv->lap_msg = NULL; + + ret = atomic_inc_and_test(&cm_id_priv->work_count); + if (!ret) + list_add_tail(&work->list, &cm_id_priv->work_list); + spin_unlock_irq(&cm_id_priv->lock); + + if (ret) + cm_process_work(cm_id_priv, work); + else + cm_deref_id(cm_id_priv); + return 0; +out: + cm_deref_id(cm_id_priv); + return -EINVAL; +} + +static int cm_spr_handler(struct cm_work *work) +{ + struct cm_id_private *cm_id_priv; + struct cm_spr_msg *spr_msg; + int ret; + + spr_msg = (struct cm_spr_msg *)work->mad_recv_wc->recv_buf.mad; + cm_id_priv = cm_acquire_id(spr_msg->remote_comm_id, + spr_msg->local_comm_id); + if (!cm_id_priv) + return -EINVAL; /* Unmatched reply. */ + + work->cm_event.param.spr_rcvd.ap_status = spr_msg->ap_status; + work->cm_event.param.spr_rcvd.spr_info = &spr_msg->info; + work->cm_event.param.spr_rcvd.info_len = spr_msg->info_length; + work->cm_event.private_data = &spr_msg->private_data; + + spin_lock_irq(&cm_id_priv->lock); + if (cm_id_priv->id.state != IB_CM_ESTABLISHED || + (cm_id_priv->id.sap_state != IB_CM_SAP_SENT)) { + spin_unlock_irq(&cm_id_priv->lock); + goto out; + } + cm_id_priv->id.sap_state = IB_CM_SAP_IDLE; + ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->lap_msg); + cm_id_priv->lap_msg = NULL; ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) @@ -2912,6 +3239,9 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); + + spin_lock_irqsave(&cm_id_priv->lock, flags); + ret = cm_init_av_by_path(param->path, &cm_id_priv->av); if (ret) goto out; @@ -2919,6 +3249,12 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, cm_id->service_id = param->service_id; cm_id->service_mask = ~cpu_to_be64(0); cm_id_priv->timeout_ms = param->timeout_ms; + if (cm_id_priv->timeout_ms > cm_convert_to_ms(max_timeout)) { + printk(KERN_WARNING PFX "sidr req timeout_ms %d > %d, " + "decreasing used timeout_ms\n", param->timeout_ms, + cm_convert_to_ms(max_timeout)); + cm_id_priv->timeout_ms = cm_convert_to_ms(max_timeout); + } cm_id_priv->max_cm_retries = param->max_cm_retries; ret = cm_alloc_msg(cm_id_priv, &msg); if (ret) @@ -2929,21 +3265,19 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, msg->timeout_ms = cm_id_priv->timeout_ms; msg->context[1] = (void *) (unsigned long) IB_CM_SIDR_REQ_SENT; - spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state == IB_CM_IDLE) ret = ib_post_send_mad(msg, NULL); else ret = -EINVAL; if (ret) { - spin_unlock_irqrestore(&cm_id_priv->lock, flags); cm_free_msg(msg); goto out; } cm_id->state = IB_CM_SIDR_REQ_SENT; cm_id_priv->msg = msg; - spin_unlock_irqrestore(&cm_id_priv->lock, flags); out: + spin_unlock_irqrestore(&cm_id_priv->lock, flags); return ret; } EXPORT_SYMBOL(ib_send_cm_sidr_req); @@ -3142,7 +3476,6 @@ static void cm_process_send_error(struct ib_mad_send_buf 
*msg, memset(&cm_event, 0, sizeof cm_event); cm_id_priv = msg->context[0]; - /* Discard old sends or ones without a response. */ spin_lock_irq(&cm_id_priv->lock); state = (enum ib_cm_state) (unsigned long) msg->context[1]; @@ -3264,9 +3597,15 @@ static void cm_work_handler(struct work_struct *_work) case IB_CM_LAP_RECEIVED: ret = cm_lap_handler(work); break; + case IB_CM_SAP_RECEIVED: + ret = cm_sap_handler(work); + break; case IB_CM_APR_RECEIVED: ret = cm_apr_handler(work); break; + case IB_CM_SPR_RECEIVED: + ret = cm_spr_handler(work); + break; case IB_CM_TIMEWAIT_EXIT: ret = cm_timewait_handler(work); break; @@ -3408,9 +3747,16 @@ static void cm_recv_handler(struct ib_mad_agent *mad_agent, paths = 1; event = IB_CM_LAP_RECEIVED; break; + case CM_SAP_ATTR_ID: + paths = 1; + event = IB_CM_SAP_RECEIVED; + break; case CM_APR_ATTR_ID: event = IB_CM_APR_RECEIVED; break; + case CM_SPR_ATTR_ID: + event = IB_CM_SPR_RECEIVED; + break; default: ib_free_recv_mad(mad_recv_wc); return; @@ -3618,7 +3964,7 @@ static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr, atomic_long_read(&group->counter[cm_attr->index])); } -static const struct sysfs_ops cm_counter_ops = { +static struct sysfs_ops cm_counter_ops = { .show = cm_show_counter }; @@ -3639,17 +3985,8 @@ static struct kobj_type cm_port_obj_type = { .release = cm_release_port_obj }; -static char *cm_devnode(struct device *dev, mode_t *mode) -{ - if (mode) - *mode = 0666; - return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); -} - struct class cm_class = { - .owner = THIS_MODULE, .name = "infiniband_cm", - .devnode = cm_devnode, }; EXPORT_SYMBOL(cm_class); diff --git a/drivers/infiniband/core/cm_msgs.h b/drivers/infiniband/core/cm_msgs.h index 7e63c08f697c3..514e9587abac3 100644 --- a/drivers/infiniband/core/cm_msgs.h +++ b/drivers/infiniband/core/cm_msgs.h @@ -55,6 +55,8 @@ #define CM_SIDR_REP_ATTR_ID cpu_to_be16(0x0018) #define CM_LAP_ATTR_ID cpu_to_be16(0x0019) #define CM_APR_ATTR_ID cpu_to_be16(0x001A) +#define CM_SAP_ATTR_ID cpu_to_be16(0x001B) +#define CM_SPR_ATTR_ID cpu_to_be16(0x001C) enum cm_msg_sequence { CM_MSG_SEQUENCE_REQ, @@ -112,7 +114,7 @@ struct cm_req_msg { u8 alt_hop_limit; /* SL:4, subnet local:1, rsvd:3 */ u8 alt_offset138; - /* local ACK timeout:5, rsvd:3 */ + /* local ACK timeout:5, SAP support:1, rsvd:2 */ u8 alt_offset139; u8 private_data[IB_CM_REQ_PRIVATE_DATA_SIZE]; @@ -412,6 +414,17 @@ static inline void cm_req_set_alt_local_ack_timeout(struct cm_req_msg *req_msg, (local_ack_timeout << 3)); } +static inline void cm_req_set_sap_support(struct cm_req_msg *req_msg, u8 supported) +{ + req_msg->alt_offset139 = (u8) ((req_msg->alt_offset139 & 0xfb) | + (!!supported << 2)); +} + +static inline u8 cm_req_get_sap_support(struct cm_req_msg *req_msg) +{ + return (u8) ((req_msg->alt_offset139 >> 2 & 1)); +} + /* Message REJected or MRAed */ enum cm_msg_response { CM_MSG_RESPONSE_REQ = 0x0, @@ -668,6 +681,24 @@ struct cm_lap_msg { u8 private_data[IB_CM_LAP_PRIVATE_DATA_SIZE]; } __attribute__ ((packed)); +struct cm_sap_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 remote_comm_id; + + __be32 rsvd8; + /* remote QPN/EECN:24, rsvd:8 */ + __be32 offset12; + __be32 rsvd16; + + __be16 alt_local_lid; + __be16 rsvd22; + union ib_gid alt_local_gid; + __be32 rsvd6[6]; + u8 private_data[IB_CM_LAP_PRIVATE_DATA_SIZE]; +} __attribute__ ((packed)); + static inline __be32 cm_lap_get_remote_qpn(struct cm_lap_msg *lap_msg) { return cpu_to_be32(be32_to_cpu(lap_msg->offset12) >> 8); @@ -763,6 +794,17 @@ static 
inline void cm_lap_set_local_ack_timeout(struct cm_lap_msg *lap_msg, (lap_msg->offset63 & 0x07); } +static inline void cm_sap_set_remote_qpn(struct cm_sap_msg *sap_msg, __be32 qpn) +{ + sap_msg->offset12 = cpu_to_be32((be32_to_cpu(qpn) << 8) | + (be32_to_cpu(sap_msg->offset12) & 0x000000FF)); +} + +static inline __be32 cm_sap_get_remote_qpn(struct cm_sap_msg *sap_msg) +{ + return cpu_to_be32(be32_to_cpu(sap_msg->offset12) >> 8); +} + struct cm_apr_msg { struct ib_mad_hdr hdr; @@ -771,6 +813,33 @@ struct cm_apr_msg { u8 info_length; u8 ap_status; + /* + * __be16 rsvd; + * + * This field should be here according to the spec + * but since they are not implemneted in R1 we + * don't put them here either + */ + u8 info[IB_CM_APR_INFO_LENGTH]; + + u8 private_data[IB_CM_APR_PRIVATE_DATA_SIZE]; +} __attribute__ ((packed)); + +struct cm_spr_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 remote_comm_id; + + u8 info_length; + u8 ap_status; + /* + * __be16 rsvd10; + * + * This field should be here according to the spec + * but since they are not implemneted in R1 we + * don't put them here either + */ u8 info[IB_CM_APR_INFO_LENGTH]; u8 private_data[IB_CM_APR_PRIVATE_DATA_SIZE]; diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index b6a33b3c516de..554db157691d2 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -40,14 +40,12 @@ #include #include #include -#include #include #include #include #include -#include #include #include #include @@ -57,10 +55,57 @@ MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); +static int debug_level = 0; +#define cma_pr(level, priv, format, arg...) \ + printk(level "CMA: %p: %s: " format, ((struct rdma_id_priv *) priv) , __func__, ## arg) + +#define cma_dbg(priv, format, arg...) \ + do { if (debug_level) cma_pr(KERN_DEBUG, priv, format, ## arg); } while (0) + +#define cma_warn(priv, format, arg...) \ + cma_pr(KERN_WARNING, priv, format, ## arg); + + +#define CMA_GID_FMT "%2.2x%2.2x:%2.2x%2.2x" + +#define CMA_GID_RAW_ARG(gid) ((u8 *)(gid))[12],\ + ((u8 *)(gid))[13],\ + ((u8 *)(gid))[14],\ + ((u8 *)(gid))[15] + +#define CMA_GID_ARG(gid) CMA_GID_RAW_ARG((gid).raw) + +#define cma_debug_path(priv, pfx, p) \ + cma_dbg(priv, pfx "sgid=" CMA_GID_FMT ",dgid=" CMA_GID_FMT "\n", CMA_GID_ARG(p.sgid), CMA_GID_ARG(p.dgid)) + +#define cma_debug_gid(priv, g) \ + cma_dbg(priv, "gid=" CMA_GID_FMT "\n", CMA_GID_ARG(g) + + +static int tavor_quirk = 0; +module_param_named(tavor_quirk, tavor_quirk, int, 0644); +MODULE_PARM_DESC(tavor_quirk, "Tavor performance quirk: limit MTU to 1K if > 0"); + +int unify_tcp_port_space = 0; +module_param(unify_tcp_port_space, int, 0644); +MODULE_PARM_DESC(unify_tcp_port_space, "Unify the host TCP and RDMA port " + "space allocation (default=0)"); + #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) -#define CMA_IBOE_PACKET_LIFETIME 18 +#define IBOE_PACKET_LIFETIME 18 + +module_param_named(debug_level, debug_level, int, 0644); +MODULE_PARM_DESC(debug_level, "debug level default=0"); + +static int cma_response_timeout = CMA_CM_RESPONSE_TIMEOUT; +module_param_named(cma_response_timeout, cma_response_timeout, int, 0644); +MODULE_PARM_DESC(cma_response_timeout, "CMA_CM_RESPONSE_TIMEOUT default=20"); + +static int def_prec2sl = 3; +module_param_named(def_prec2sl, def_prec2sl, int, 0644); +MODULE_PARM_DESC(def_prec2sl, "Default value for SL priority with RoCE. 
Valid values 0 - 7"); static void cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device); @@ -77,17 +122,35 @@ static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); static struct workqueue_struct *cma_wq; +static struct workqueue_struct *cma_free_wq; static DEFINE_IDR(sdp_ps); static DEFINE_IDR(tcp_ps); static DEFINE_IDR(udp_ps); static DEFINE_IDR(ipoib_ps); +static int next_port; struct cma_device { struct list_head list; struct ib_device *device; + struct ib_event_handler event_handler; struct completion comp; atomic_t refcount; struct list_head id_list; + int *port_active; +}; + +enum cma_state { + CMA_IDLE, + CMA_ADDR_QUERY, + CMA_ADDR_RESOLVED, + CMA_ROUTE_QUERY, + CMA_ROUTE_RESOLVED, + CMA_CONNECT, + CMA_DISCONNECT, + CMA_ADDR_BOUND, + CMA_LISTEN, + CMA_DEVICE_REMOVAL, + CMA_DESTROYING }; struct rdma_bind_list { @@ -96,6 +159,11 @@ struct rdma_bind_list { unsigned short port; }; +enum cma_apm_flags { + CMA_APM_ACTIVE_SIDE = 1, + CMA_APM_ENABLED = (1<<1) +}; + /* * Device removal can occur at anytime, so we need extra handling to * serialize notifying the user of device removal with other callbacks. @@ -106,6 +174,7 @@ struct rdma_id_private { struct rdma_cm_id id; struct rdma_bind_list *bind_list; + struct socket *sock; struct hlist_node node; struct list_head list; /* listen_any_list or cma_device.list */ struct list_head listen_list; /* per device listens */ @@ -113,13 +182,15 @@ struct rdma_id_private { struct list_head mc_list; int internal_id; - enum rdma_cm_state state; + enum cma_state state; spinlock_t lock; + spinlock_t cm_lock; struct mutex qp_mutex; struct completion comp; atomic_t refcount; struct mutex handler_mutex; + struct work_struct work; /* garbage coll */ int backlog; int timeout_ms; @@ -133,12 +204,23 @@ struct rdma_id_private { u32 seq_num; u32 qkey; u32 qp_num; - pid_t owner; u8 srq; u8 tos; - u8 reuseaddr; + enum cma_apm_flags apm_flags; + int alt_path_index; + void (*qp_event_handler)(struct ib_event *, void *); + void *qp_context; + int qp_timeout; }; +void cma_debug_routes(struct rdma_id_private *id_priv) +{ + struct rdma_route *route = &id_priv->id.route; + cma_dbg(id_priv, "***num_paths: %d, alt path index: %d\n", route->num_paths, id_priv->alt_path_index); + cma_debug_path(id_priv, "path-0: ", route->path_rec[0]); + cma_debug_path(id_priv, "path-1: ", route->path_rec[1]); +} + struct cma_multicast { struct rdma_id_private *id_priv; union { @@ -153,17 +235,42 @@ struct cma_multicast { struct cma_work { struct work_struct work; struct rdma_id_private *id; - enum rdma_cm_state old_state; - enum rdma_cm_state new_state; + enum cma_state old_state; + enum cma_state new_state; struct rdma_cm_event event; }; +struct alt_path_work { + struct delayed_work work; + struct rdma_id_private *id; + struct ib_sa_path_rec path_rec; +}; + +struct cma_active_mig_send_lap_work { + struct work_struct work; + struct rdma_id_private *id; +}; + + struct cma_ndev_work { struct work_struct work; struct rdma_id_private *id; struct rdma_cm_event event; }; +struct cma_port_ud_work { + struct work_struct work; + struct cma_device *cma_dev; + u8 port_num; + u8 up; +}; + +struct cma_sap_work { + struct delayed_work work; + struct rdma_id_private *id; + int from_portdown; +}; + struct iboe_mcast_work { struct work_struct work; struct rdma_id_private *id; @@ -205,7 +312,15 @@ struct sdp_hah { #define CMA_VERSION 0x00 #define SDP_MAJ_VERSION 0x2 -static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state 
comp) +static void cma_work_handler(struct work_struct *_work); +static int cma_resolve_alt_ib_route(struct rdma_id_private *id_priv); +static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, + struct cma_work *work, + struct ib_sa_path_rec *path_rec, + ib_sa_comp_mask comp_mask); +static void cma_mig_send_lap_work(struct work_struct *_work); + +static int cma_comp(struct rdma_id_private *id_priv, enum cma_state comp) { unsigned long flags; int ret; @@ -217,7 +332,7 @@ static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp) } static int cma_comp_exch(struct rdma_id_private *id_priv, - enum rdma_cm_state comp, enum rdma_cm_state exch) + enum cma_state comp, enum cma_state exch) { unsigned long flags; int ret; @@ -229,11 +344,11 @@ static int cma_comp_exch(struct rdma_id_private *id_priv, return ret; } -static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv, - enum rdma_cm_state exch) +static enum cma_state cma_exch(struct rdma_id_private *id_priv, + enum cma_state exch) { unsigned long flags; - enum rdma_cm_state old; + enum cma_state old; spin_lock_irqsave(&id_priv->lock, flags); old = id_priv->state; @@ -267,6 +382,11 @@ static inline void sdp_set_ip_ver(struct sdp_hh *hh, u8 ip_ver) hh->ip_version = (ip_ver << 4) | (hh->ip_version & 0xF); } +static inline int cma_is_ud_ps(enum rdma_port_space ps) +{ + return (ps == RDMA_PS_UDP || ps == RDMA_PS_IPOIB); +} + static void cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { @@ -292,13 +412,11 @@ static inline void release_mc(struct kref *kref) kfree(mc); } -static void cma_release_dev(struct rdma_id_private *id_priv) +static void cma_detach_from_dev(struct rdma_id_private *id_priv) { - mutex_lock(&lock); list_del(&id_priv->list); cma_deref_dev(id_priv->cma_dev); id_priv->cma_dev = NULL; - mutex_unlock(&lock); } static int cma_set_qkey(struct rdma_id_private *id_priv) @@ -327,65 +445,36 @@ static int cma_set_qkey(struct rdma_id_private *id_priv) return ret; } -static int find_gid_port(struct ib_device *device, union ib_gid *gid, u8 port_num) -{ - int i; - int err; - struct ib_port_attr props; - union ib_gid tmp; - - err = ib_query_port(device, port_num, &props); - if (err) - return 1; - - for (i = 0; i < props.gid_tbl_len; ++i) { - err = ib_query_gid(device, port_num, i, &tmp); - if (err) - return 1; - if (!memcmp(&tmp, gid, sizeof tmp)) - return 0; - } - - return -EAGAIN; -} - static int cma_acquire_dev(struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct cma_device *cma_dev; - union ib_gid gid, iboe_gid; + union ib_gid gid; int ret = -ENODEV; - u8 port; - enum rdma_link_layer dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ? 
- IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; - mutex_lock(&lock); - iboe_addr_get_sgid(dev_addr, &iboe_gid); + if (dev_addr->dev_type != ARPHRD_INFINIBAND) { + iboe_addr_get_sgid(dev_addr, &gid); + list_for_each_entry(cma_dev, &dev_list, list) { + ret = ib_find_cached_gid(cma_dev->device, &gid, + &id_priv->id.port_num, NULL); + if (!ret) + goto out; + } + } + memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof gid); list_for_each_entry(cma_dev, &dev_list, list) { - for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) { - if (rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) { - if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB && - rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET) - ret = find_gid_port(cma_dev->device, &iboe_gid, port); - else - ret = find_gid_port(cma_dev->device, &gid, port); - - if (!ret) { - id_priv->id.port_num = port; - goto out; - } else if (ret == 1) - break; - } - } + ret = ib_find_cached_gid(cma_dev->device, &gid, + &id_priv->id.port_num, NULL); + if (!ret) + break; } out: if (!ret) cma_attach_to_dev(id_priv, cma_dev); - mutex_unlock(&lock); return ret; } @@ -396,7 +485,7 @@ static void cma_deref_id(struct rdma_id_private *id_priv) } static int cma_disable_callback(struct rdma_id_private *id_priv, - enum rdma_cm_state state) + enum cma_state state) { mutex_lock(&id_priv->handler_mutex); if (id_priv->state != state) { @@ -412,8 +501,7 @@ static int cma_has_cm_dev(struct rdma_id_private *id_priv) } struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, - void *context, enum rdma_port_space ps, - enum ib_qp_type qp_type) + void *context, enum rdma_port_space ps) { struct rdma_id_private *id_priv; @@ -421,13 +509,13 @@ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, if (!id_priv) return ERR_PTR(-ENOMEM); - id_priv->owner = task_pid_nr(current); - id_priv->state = RDMA_CM_IDLE; + id_priv->state = CMA_IDLE; id_priv->id.context = context; id_priv->id.event_handler = event_handler; id_priv->id.ps = ps; - id_priv->id.qp_type = qp_type; + id_priv->alt_path_index = 1; spin_lock_init(&id_priv->lock); + spin_lock_init(&id_priv->cm_lock); mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); atomic_set(&id_priv->refcount, 1); @@ -466,6 +554,128 @@ static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) return ret; } +static void cma_send_sap(struct rdma_id_private *id_priv, + struct ib_sa_path_rec *path_rec) +{ + int ret; + struct ib_cm_id *cm_id = id_priv->cm_id.ib; + unsigned long flags; + + + cma_dbg(id_priv, "send SAP. suggest gid=" CMA_GID_FMT "\n", CMA_GID_ARG(path_rec->sgid)); + spin_lock_irqsave(&id_priv->cm_lock, flags); + if (id_priv->cm_id.ib) { + if (id_priv->cm_id.ib->state != IB_CM_ESTABLISHED) { + cma_dbg(id_priv, "sap not sent. 
CM state not established (%d)\n", + id_priv->cm_id.ib->state); + } else { + cma_dbg(id_priv, "sending SAP\n"); + ret = ib_send_cm_sap(cm_id, path_rec, NULL, 0); + if (ret) { + cma_dbg(id_priv, "failed to send SAP (%d)\n", ret); + } else { + cma_dbg(id_priv, "sap was sent\n"); + } + } + } else { + cma_dbg(id_priv, "invalid CM id\n"); + } + spin_unlock_irqrestore(&id_priv->cm_lock, flags); +} + +static void cma_send_lap(struct rdma_id_private *id_priv, + struct ib_sa_path_rec *path_rec) +{ + int ret; + struct ib_cm_id *cm_id = id_priv->cm_id.ib; + unsigned long flags; + + cma_debug_path(id_priv, "send LAP with path ", (*path_rec)); + spin_lock_irqsave(&id_priv->cm_lock, flags); + if (id_priv->cm_id.ib) { + cma_dbg(id_priv, "sending LAP\n"); + ret = ib_send_cm_lap(cm_id, path_rec, NULL, 0); + if (ret) { + cma_dbg(id_priv, "failed to send LAP (%d)\n", ret); + } else { + cma_dbg(id_priv, "lap was sent\n"); + } + } else { + cma_dbg(id_priv, "invalid CM id\n"); + } + spin_unlock_irqrestore(&id_priv->cm_lock, flags); +} + +/* this function needs some error handling!!! */ +static void cma_qp_event_handler(struct ib_event *event, void *data) +{ + struct rdma_id_private *id_priv = data; + struct rdma_route *route = &id_priv->id.route; + struct cma_active_mig_send_lap_work *work; + unsigned long flags; + int ret; + u8 port; + int migrated_index; + + /* call the consumer's event handler first */ + if (id_priv->qp_event_handler && !id_priv->id.ucontext) + id_priv->qp_event_handler(event, id_priv->qp_context); + + if (event->event == IB_EVENT_PATH_MIG_ERR && !id_priv->id.ucontext) + cma_dbg(id_priv, "\ngot event IB_EVENT_PATH_MIG_ERR, qpn=0x%x\n\n", event->element.qp->qp_num); + + if (event->event != IB_EVENT_PATH_MIG) + return; + + if (!id_priv->id.ucontext) + cma_dbg(id_priv, "\ngot event IB_EVENT_PATH_MIG, qpn=0x%x\n", event->element.qp->qp_num); + + spin_lock_irqsave(&id_priv->cm_lock, flags); + if (id_priv->cm_id.ib) { + ret = ib_cm_notify(id_priv->cm_id.ib, event->event); + if (ret) { + cma_dbg(id_priv, "ib_cm_notify failed (%d)\n", ret); + } else { + cma_dbg(id_priv, "cm notified\n"); + } + } else { + cma_dbg(id_priv, "invalid CM id\n"); + } + spin_unlock_irqrestore(&id_priv->cm_lock, flags); + + spin_lock_irqsave(&id_priv->lock, flags); + + migrated_index = id_priv->alt_path_index; + id_priv->alt_path_index ^= 0x1; + cma_dbg(id_priv, "new primary path index %d, alt path idx %d\n", + migrated_index, id_priv->alt_path_index); + /* update active port in cma_priv */ + if (!ib_find_cached_gid(event->device, + &route->path_rec[migrated_index].sgid, + &port, NULL)) { + cma_dbg(id_priv, "Setting port num. old=%d, new=%d\n", + id_priv->id.port_num, port); + id_priv->id.port_num = port; + } + cma_dbg(id_priv, "Migrated. 
new gids:\n"); + cma_debug_routes(id_priv); + if (id_priv->apm_flags & CMA_APM_ACTIVE_SIDE) { + work = kzalloc(sizeof *work, GFP_ATOMIC); + if (work) { + atomic_inc(&id_priv->refcount); + work->id = id_priv; + INIT_WORK(&work->work, cma_mig_send_lap_work); + queue_work(cma_wq, &work->work); + } else { + cma_dbg(id_priv, "Failed to allocate send_lap work struct\n"); + } + + } + //cma_send_lap(id_priv, &route->path_rec[id_priv->alt_path_index]); + + spin_unlock_irqrestore(&id_priv->lock, flags); +} + static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; @@ -489,12 +699,15 @@ int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, id_priv = container_of(id, struct rdma_id_private, id); if (id->device != pd->device) return -EINVAL; - + id_priv->qp_event_handler = qp_init_attr->event_handler; + id_priv->qp_context = qp_init_attr->qp_context; + qp_init_attr->event_handler = cma_qp_event_handler; + qp_init_attr->qp_context = id_priv; qp = ib_create_qp(pd, qp_init_attr); if (IS_ERR(qp)) return PTR_ERR(qp); - if (id->qp_type == IB_QPT_UD) + if (cma_is_ud_ps(id_priv->id.ps)) ret = cma_init_ud_qp(id_priv, qp); else ret = cma_init_conn_qp(id_priv, qp); @@ -577,6 +790,12 @@ static int cma_modify_qp_rts(struct rdma_id_private *id_priv, if (conn_param) qp_attr.max_rd_atomic = conn_param->initiator_depth; + + if (id_priv->qp_timeout) { + qp_attr.timeout = id_priv->qp_timeout; + qp_attr_mask |= IB_QP_TIMEOUT; + } + ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); @@ -608,7 +827,7 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, int ret; u16 pkey; - if (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) == + if (rdma_port_link_layer(id_priv->id.device, id_priv->id.port_num) == IB_LINK_LAYER_INFINIBAND) pkey = ib_addr_get_pkey(dev_addr); else @@ -622,7 +841,7 @@ static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; - if (id_priv->id.qp_type == IB_QPT_UD) { + if (cma_is_ud_ps(id_priv->id.ps)) { ret = cma_set_qkey(id_priv); if (ret) return ret; @@ -645,7 +864,7 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, id_priv = container_of(id, struct rdma_id_private, id); switch (rdma_node_get_transport(id_priv->id.device->node_type)) { case RDMA_TRANSPORT_IB: - if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD)) + if (!id_priv->cm_id.ib || cma_is_ud_ps(id_priv->id.ps)) ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); else ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, @@ -699,21 +918,6 @@ static inline int cma_any_addr(struct sockaddr *addr) return cma_zero_addr(addr) || cma_loopback_addr(addr); } -static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst) -{ - if (src->sa_family != dst->sa_family) - return -1; - - switch (src->sa_family) { - case AF_INET: - return ((struct sockaddr_in *) src)->sin_addr.s_addr != - ((struct sockaddr_in *) dst)->sin_addr.s_addr; - default: - return ipv6_addr_cmp(&((struct sockaddr_in6 *) src)->sin6_addr, - &((struct sockaddr_in6 *) dst)->sin6_addr); - } -} - static inline __be16 cma_port(struct sockaddr *addr) { if (addr->sa_family == AF_INET) @@ -808,7 +1012,7 @@ static inline int cma_user_data_offset(enum rdma_port_space ps) static void cma_cancel_route(struct rdma_id_private *id_priv) { - switch (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num)) { + 
switch (rdma_port_link_layer(id_priv->id.device, id_priv->id.port_num)) { case IB_LINK_LAYER_INFINIBAND: if (id_priv->query) ib_sa_cancel_query(id_priv->query_id, id_priv->query); @@ -844,16 +1048,16 @@ static void cma_cancel_listens(struct rdma_id_private *id_priv) } static void cma_cancel_operation(struct rdma_id_private *id_priv, - enum rdma_cm_state state) + enum cma_state state) { switch (state) { - case RDMA_CM_ADDR_QUERY: + case CMA_ADDR_QUERY: rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); break; - case RDMA_CM_ROUTE_QUERY: + case CMA_ROUTE_QUERY: cma_cancel_route(id_priv); break; - case RDMA_CM_LISTEN: + case CMA_LISTEN: if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr) && !id_priv->cma_dev) cma_cancel_listens(id_priv); @@ -877,6 +1081,8 @@ static void cma_release_port(struct rdma_id_private *id_priv) kfree(bind_list); } mutex_unlock(&lock); + if (id_priv->sock) + sock_release(id_priv->sock); } static void cma_leave_mc_groups(struct rdma_id_private *id_priv) @@ -887,7 +1093,7 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv) mc = container_of(id_priv->mc_list.next, struct cma_multicast, list); list_del(&mc->list); - switch (rdma_port_get_link_layer(id_priv->cma_dev->device, id_priv->id.port_num)) { + switch (rdma_port_link_layer(id_priv->cma_dev->device, id_priv->id.port_num)) { case IB_LINK_LAYER_INFINIBAND: ib_sa_free_multicast(mc->multicast.ib); kfree(mc); @@ -900,98 +1106,723 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv) } } } +static void __rdma_free(struct work_struct *work) +{ + struct rdma_id_private *id_priv; + id_priv = container_of(work, struct rdma_id_private, work); + + wait_for_completion(&id_priv->comp); + + if (id_priv->internal_id) + cma_deref_id(id_priv->id.context); + + kfree(id_priv->id.route.path_rec); + kfree(id_priv); +} void rdma_destroy_id(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; - enum rdma_cm_state state; + enum cma_state state; + unsigned long flags; + struct ib_cm_id *ib; id_priv = container_of(id, struct rdma_id_private, id); - state = cma_exch(id_priv, RDMA_CM_DESTROYING); + state = cma_exch(id_priv, CMA_DESTROYING); cma_cancel_operation(id_priv, state); - /* - * Wait for any active callback to finish. New callbacks will find - * the id_priv state set to destroying and abort. 
- */ - mutex_lock(&id_priv->handler_mutex); - mutex_unlock(&id_priv->handler_mutex); - + mutex_lock(&lock); if (id_priv->cma_dev) { + mutex_unlock(&lock); switch (rdma_node_get_transport(id_priv->id.device->node_type)) { case RDMA_TRANSPORT_IB: - if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib)) - ib_destroy_cm_id(id_priv->cm_id.ib); + spin_lock_irqsave(&id_priv->cm_lock, flags); + if (id_priv->cm_id.ib) { + ib = id_priv->cm_id.ib; + id_priv->cm_id.ib = NULL; + spin_unlock_irqrestore(&id_priv->cm_lock, flags); + ib_destroy_cm_id(ib); + } else + spin_unlock_irqrestore(&id_priv->cm_lock, flags); break; case RDMA_TRANSPORT_IWARP: - if (id_priv->cm_id.iw && !IS_ERR(id_priv->cm_id.iw)) + if (id_priv->cm_id.iw) iw_destroy_cm_id(id_priv->cm_id.iw); break; default: break; } cma_leave_mc_groups(id_priv); - cma_release_dev(id_priv); + mutex_lock(&lock); + cma_detach_from_dev(id_priv); + } + mutex_unlock(&lock); + + cma_release_port(id_priv); + cma_deref_id(id_priv); + INIT_WORK(&id_priv->work, __rdma_free); + queue_work(cma_free_wq, &id_priv->work); +} +EXPORT_SYMBOL(rdma_destroy_id); + +static int cma_rep_recv(struct rdma_id_private *id_priv) +{ + int ret; + + ret = cma_modify_qp_rtr(id_priv, NULL); + if (ret) + goto reject; + + ret = cma_modify_qp_rts(id_priv, NULL); + if (ret) + goto reject; + + cma_dbg(id_priv, "sending RTU\n"); + ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); + if (ret) + goto reject; + + return 0; +reject: + cma_modify_qp_err(id_priv); + cma_dbg(id_priv, "sending REJ\n"); + ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, + NULL, 0, NULL, 0); + return ret; +} + +static int cma_verify_rep(struct rdma_id_private *id_priv, void *data) +{ + if (id_priv->id.ps == RDMA_PS_SDP && + sdp_get_majv(((struct sdp_hah *) data)->sdp_version) != + SDP_MAJ_VERSION) + return -EINVAL; + + return 0; +} + +static void cma_set_rep_event_data(struct rdma_cm_event *event, + struct ib_cm_rep_event_param *rep_data, + void *private_data) +{ + event->param.conn.private_data = private_data; + event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE; + event->param.conn.responder_resources = rep_data->responder_resources; + event->param.conn.initiator_depth = rep_data->initiator_depth; + event->param.conn.flow_control = rep_data->flow_control; + event->param.conn.rnr_retry_count = rep_data->rnr_retry_count; + event->param.conn.srq = rep_data->srq; + event->param.conn.qp_num = rep_data->remote_qpn; +} + +static int cma_qp_set_alt_path(struct rdma_id_private *id_priv) +{ + struct ib_qp_attr qp_attr; + int qp_attr_mask, ret = 0; + unsigned long flags; + struct ib_qp_init_attr qp_init_attr; + enum ib_mig_state path_mig_state; + struct rdma_cm_event event; + + if (id_priv->id.ucontext) { + /* let userspace consumers know */ + memset(&event, 0, sizeof event); + event.event = RDMA_CM_EVENT_LOAD_ALT_PATH; + event.status = 0; + ret = id_priv->id.event_handler(&id_priv->id, &event); + return ret; + } + + qp_attr_mask = IB_QP_PATH_MIG_STATE; + + mutex_lock(&id_priv->qp_mutex); + if (!id_priv->id.qp) { + cma_dbg(id_priv, "cma-id qp has been destroyed.\n"); + ret = -EINVAL; + } else + ret = ib_query_qp(id_priv->id.qp, &qp_attr, qp_attr_mask, &qp_init_attr); + + if (ret) { + cma_dbg(id_priv, "fail to query qp (%d)", ret); + mutex_unlock(&id_priv->qp_mutex); + return ret; + } + path_mig_state = qp_attr.path_mig_state; + cma_dbg(id_priv, "qp 0x%x is in path migration state %d/%s\n", id_priv->qp_num, + path_mig_state, + (!path_mig_state) ? "MIGRATED" : + (path_mig_state == 1) ? 
"REARM" : + (path_mig_state == 2) ? "ARMED" : "UNKNOWN" ); + + spin_lock_irqsave(&id_priv->cm_lock, flags); + if (!id_priv->id.qp) { + cma_dbg(id_priv, "qp is null. qpn=%d\n", id_priv->qp_num); + ret = -EINVAL; + goto out; + } + if (id_priv->cm_id.ib) { + qp_attr.qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, &qp_attr, &qp_attr_mask); + if (ret) { + cma_dbg(id_priv, "failed to init alt path QP attr (%d)\n", ret); + goto out; + } + /* fix requested state. later, do it in CM level */ + switch (path_mig_state) { + case IB_MIG_MIGRATED: + break; + case IB_MIG_REARM: + case IB_MIG_ARMED: + /* FIXME: exclude IB_QP_PATH_MIG_STATE from mask in ib_cm_init_qp_attr() */ + qp_attr_mask &= ~IB_QP_PATH_MIG_STATE; + break; + default: + cma_warn(id_priv, "qp is in unexpected state\n"); + ret = -EINVAL; + goto out; + + } + + spin_unlock_irqrestore(&id_priv->cm_lock, flags); + ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); + if (ret) { + cma_warn(id_priv, "failed to modify QP to REARM (%d)\n", ret); + } else { + memset(&event, 0, sizeof event); + event.event = RDMA_CM_EVENT_ALT_PATH_LOADED; + event.param.ud.alt_path_index = id_priv->alt_path_index; + ret = id_priv->id.event_handler(&id_priv->id, &event); + } + goto out1; + } else { + cma_dbg(id_priv, "invalid CM id\n"); + ret = -EINVAL; + goto out; + } + cma_dbg(id_priv, "alternate path was loaded\n"); +out: + spin_unlock_irqrestore(&id_priv->cm_lock, flags); +out1: + mutex_unlock(&id_priv->qp_mutex); + return ret; +} + +static int cma_suggest_alt_sgid(struct rdma_id_private *id_priv, + union ib_gid *ref_gid, + union ib_gid *alt_gid) +{ + int ret; + u8 p, port; + union ib_gid gid; + + if (!id_priv->id.ucontext && !id_priv->id.qp) { + cma_dbg(id_priv, "cma-id qp has been destroyed.\n"); + return -EINVAL; + } + + port = id_priv->id.port_num; + if (port < 1 || port > id_priv->id.device->phys_port_cnt) { + cma_dbg(id_priv, "cma-id port num is invalid (%u).\n", port); + return -EINVAL; + } + + cma_dbg(id_priv, "current port is %d\n", port); + + /* give preference to ports other than current qp port */ + for (p = 1; p <= id_priv->id.device->phys_port_cnt; ++p) { + if (p == port) + continue; + if (id_priv->cma_dev->port_active[p]) { + cma_dbg(id_priv, "found alternate ACTIVE port %d\n", p); + ret = ib_get_cached_gid(id_priv->id.device, p, 0, &gid); + if (ret) { + cma_warn(id_priv, "port %d: failed to get gid\n", p); + continue; + } + if (!memcmp(ref_gid, &gid, sizeof (union ib_gid))) { + cma_dbg(id_priv, "can have alt sgid different from qp sgid," + " but same as requested\n"); + memcpy(alt_gid, &gid, sizeof (union ib_gid)); + cma_dbg(id_priv, "returning 1, no improvement, alt gid=" + CMA_GID_FMT "\n", CMA_GID_ARG(*alt_gid)); + return 1; + } else { + cma_dbg(id_priv, "can have alt sgid different from " + "qp sgid, but different than requested\n"); + memcpy(alt_gid, &gid, sizeof (union ib_gid)); + cma_dbg(id_priv, "improvement. returning 0, alt gid=" + CMA_GID_FMT "\n", CMA_GID_ARG(*alt_gid)); + return 0; + } + } + } + + /* did not find different active port. 
Try same port */ + cma_dbg(id_priv, "Trying current port %d \n", port); + if (id_priv->cma_dev->port_active[port]) { + cma_dbg(id_priv, "current port %d ACTIVE\n", port); + ret = ib_get_cached_gid(id_priv->id.device, port, 0, &gid); + if (ret) { + cma_warn(id_priv, "current port %d: failed to get gid\n", port); + return ret; + } + if (!memcmp(ref_gid, &gid, sizeof (union ib_gid))) { + cma_dbg(id_priv, "can have alt sgid same as qp sgid, " + "and same as requested\n"); + memcpy(alt_gid, &gid, sizeof (union ib_gid)); + cma_dbg(id_priv, "returning 1, no improvement, alt gid=" + CMA_GID_FMT "\n", CMA_GID_ARG(*alt_gid)); + return 1; + } else { + cma_dbg(id_priv, "can have alt sgid same as qp sgid, " + "but different than requested\n"); + memcpy(alt_gid, &gid, sizeof (union ib_gid)); + cma_dbg(id_priv, "improvement. returning 0, alt gid=" + CMA_GID_FMT "\n", CMA_GID_ARG(*alt_gid)); + return 0; + } + } + + cma_dbg(id_priv, "No active ports found. Returning -EINVAL\n"); + return -EINVAL; +} + +static void cma_alt_path_work_handler(struct work_struct *_work) +{ + struct alt_path_work *work = container_of(_work, struct alt_path_work, work.work); + struct rdma_id_private *id_priv = work->id; + struct rdma_route *route = &id_priv->id.route; + + mutex_lock(&id_priv->handler_mutex); + cma_dbg(id_priv, "setting alt_path\n"); + + route->path_rec[id_priv->alt_path_index] = work->path_rec; + if (cma_qp_set_alt_path(id_priv)) + cma_dbg(id_priv, "fail to set alt path\n"); + + mutex_unlock(&id_priv->handler_mutex); + cma_deref_id(id_priv); + kfree(work); +} + +void cma_sap_handler(struct rdma_id_private *id_priv, union ib_gid *dgid) +{ + struct cma_work *work; + struct ib_sa_path_rec path_rec; + union ib_gid ref_gid; + ib_sa_comp_mask comp_mask; + int ret, status; + + int pri_path_index; + unsigned long flags; + struct rdma_route *route = &id_priv->id.route; + + spin_lock_irqsave(&id_priv->lock, flags); + pri_path_index = (route->num_paths == 2) ? + id_priv->alt_path_index ^ 0x1 : 0; + memcpy(&ref_gid, &route->path_rec[pri_path_index].sgid, sizeof(ref_gid)); + spin_unlock_irqrestore(&id_priv->lock, flags); + + + cma_dbg(id_priv, "sap received. suggested dgid=" CMA_GID_FMT "\n", CMA_GID_ARG(*dgid)); + + if (!id_priv->cm_id.ib) + return; + + spin_lock_irqsave(&id_priv->cm_lock, flags); + + if (id_priv->cm_id.ib) { + if (!(id_priv->apm_flags & CMA_APM_ENABLED) || + id_priv->cm_id.ib->state != IB_CM_ESTABLISHED) { + cma_dbg(id_priv, "reject SAP (no APM or no connection)\n"); + status = IB_CM_SPR_REJECT; + } else if (id_priv->cm_id.ib->lap_state != IB_CM_LAP_UNINIT && + id_priv->cm_id.ib->lap_state != IB_CM_LAP_IDLE) { + cma_dbg(id_priv, "reject SAP. 
LAP is pending APR\n"); + status = IB_CM_SPR_BUSY; + } else + status = IB_CM_SPR_SUCCESS; + + cma_dbg(id_priv, "sending SPR\n"); + ret = ib_send_cm_spr(id_priv->cm_id.ib, status, NULL, 0, NULL, 0); + if (ret) + cma_warn(id_priv, "failed to send spr\n"); + } else { + cma_dbg(id_priv, "invalid CM id\n"); + ret = 1; + } + spin_unlock_irqrestore(&id_priv->cm_lock, flags); + + if (ret) + return; + + if (cma_suggest_alt_sgid(id_priv, &ref_gid, &path_rec.sgid) < 0) + return; + + path_rec.dgid = *dgid; + cma_debug_path(id_priv, "adjust path ", path_rec); + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) { + cma_warn(id_priv, "failed to allocate work\n"); + return; + } + atomic_inc(&id_priv->refcount); + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = CMA_CONNECT; + work->new_state = CMA_CONNECT; + work->event.event = RDMA_CM_EVENT_ALT_ROUTE_RESOLVED; + + comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID; + if (cma_query_ib_route(id_priv, id_priv->timeout_ms, work, + &path_rec, comp_mask)) { + kfree(work); + cma_deref_id(id_priv); + cma_warn(id_priv, "Failed querying for improved alt route\n"); + } +} + +static void cma_lap_handler(struct rdma_id_private *id_priv, struct ib_sa_path_rec *path_rec) +{ + int ret = 0; + enum ib_cm_apr_status status; + union ib_gid new_gid; + u8 info_len; + struct alt_path_work *work; + struct rdma_route *route; + unsigned long flags, flags1; + + route = &id_priv->id.route; + + status = IB_CM_APR_SUCCESS; + info_len = 0; + + cma_dbg(id_priv, "lap received with sgid=" CMA_GID_FMT + ", dgid=" CMA_GID_FMT "\n", + CMA_GID_ARG(path_rec->sgid), + CMA_GID_ARG(path_rec->dgid) ); + cma_debug_routes(id_priv); + + spin_lock_irqsave(&id_priv->lock, flags1); + ret = cma_suggest_alt_sgid(id_priv, &path_rec->sgid, &new_gid); + + if (!ret) { + /* have better GID */ + status = IB_CM_APR_INVALID_GID; + info_len = sizeof new_gid; + cma_dbg(id_priv, "will send apr with status IB_CM_APR_INVALID_GID and gid=" CMA_GID_FMT "\n", CMA_GID_ARG(new_gid)); + } else if (ret < 0) { + status = IB_CM_APR_REJECT; + cma_dbg(id_priv, "Could not verify gid. IB_CM_APR_REJECT\n"); + } else if (ret > 0 && id_priv->id.route.num_paths < 2 && + !memcmp(&path_rec->dgid, &route->path_rec[0].dgid, sizeof(union ib_gid)) && + !memcmp(&path_rec->sgid, &route->path_rec[0].sgid, sizeof(union ib_gid))){ + status = IB_CM_APR_REJECT; + cma_dbg(id_priv, "suggested alt identical to primary and cannot improve. IB_CM_APR_REJECT\n"); + } else { + cma_dbg(id_priv, "no better gid. IB_CM_APR_SUCCESS\n"); + } + + spin_lock_irqsave(&id_priv->cm_lock, flags); + if (!(id_priv->apm_flags & CMA_APM_ENABLED) || !id_priv->cm_id.ib || + id_priv->cm_id.ib->state != IB_CM_ESTABLISHED) { + status = IB_CM_APR_REJECT; + info_len = 0; + cma_dbg(id_priv, " Rejecting LAP (connection not established/LAP not enabled)\n"); + } + + ret = 0; + if (id_priv->cm_id.ib) { + cma_dbg(id_priv, "sending APR\n"); + ret = ib_send_cm_apr(id_priv->cm_id.ib, status, &new_gid, + info_len, NULL, 0); + } + spin_unlock_irqrestore(&id_priv->cm_lock, flags); + + if (ret) + cma_dbg(id_priv, "failed to send APR (%d)\n", ret); + + if (status != IB_CM_APR_SUCCESS || ret) { + spin_unlock_irqrestore(&id_priv->lock, flags1); + return; + } + + cma_dbg(id_priv, "APR success. 
setting alt path.\n"); + cma_debug_routes(id_priv); + if (id_priv->id.route.num_paths == 2) { + spin_unlock_irqrestore(&id_priv->lock, flags1); + cma_dbg(id_priv, "wait before setting alt path\n"); + work = kzalloc(sizeof *work, GFP_KERNEL); + if (work) { + atomic_inc(&id_priv->refcount); + work->id = id_priv; + /* + * cma_qp_event_handler can change the alt_path_index + * under us while we wait. This can cause the alt path +` * record to be out-of-synced with the active side, + * so delaying the alt path record setting as well. + */ + work->path_rec = *path_rec; + INIT_DELAYED_WORK(&work->work, cma_alt_path_work_handler); + queue_delayed_work(cma_wq, &work->work, msecs_to_jiffies(1000)); + } + } else { + route->path_rec[id_priv->alt_path_index] = *path_rec; + spin_unlock_irqrestore(&id_priv->lock, flags1); + cma_dbg(id_priv, "num_paths !=2, setting alt path immediately.\n"); + /* we set num_paths to 2 unprotected below. However, since this + * variable is never decreased, and never goes above 2, there is + * no risk here. + */ + if (cma_qp_set_alt_path(id_priv)) { + cma_dbg(id_priv, "fail to set alt path\n"); + } else + id_priv->id.route.num_paths = 2; + cma_debug_routes(id_priv); + } + return; +} + +void cma_apr_handler(struct rdma_id_private *id_priv, + enum ib_cm_apr_status status, + void *info, u8 info_len) +{ + struct cma_work *work; + struct ib_sa_path_rec path_rec; + ib_sa_comp_mask comp_mask; + union ib_gid new_gid; + unsigned long flags; + + cma_dbg(id_priv, "apr received\n"); + if (status == IB_CM_APR_INVALID_GID && info_len == sizeof (union ib_gid)) { + memcpy((void*) &new_gid, info, info_len); + + spin_lock_irqsave(&id_priv->lock, flags); + path_rec.sgid = id_priv->id.route.path_rec[id_priv->alt_path_index].sgid; + id_priv->id.route.path_rec[id_priv->alt_path_index].dgid = new_gid; + spin_unlock_irqrestore(&id_priv->lock, flags); + path_rec.dgid = new_gid; + cma_dbg(id_priv, "request to improve dgid. alt ix=%d, sgid=" CMA_GID_FMT + ", NEW dgid=" CMA_GID_FMT "\n", id_priv->alt_path_index, + CMA_GID_ARG(path_rec.sgid), CMA_GID_ARG(new_gid)); + + cma_debug_path(id_priv, "resend path query ", path_rec); + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) { + cma_warn(id_priv, "failed to allocate work\n"); + return; + } + atomic_inc(&id_priv->refcount); + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = CMA_CONNECT; + work->new_state = CMA_CONNECT; + work->event.event = RDMA_CM_EVENT_ALT_ROUTE_RESOLVED; + + comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID; + if (cma_query_ib_route(id_priv, id_priv->timeout_ms, work, + &path_rec, comp_mask)) { + kfree(work); + cma_deref_id(id_priv); + cma_warn(id_priv, "failed querying for improved alt route\n"); + } + + } else if (status == IB_CM_APR_SUCCESS) { + cma_dbg(id_priv, "APR success. 
setting alt path\n"); + cma_debug_routes(id_priv); + if (cma_qp_set_alt_path(id_priv)) + cma_dbg(id_priv, "fail to set alt path\n"); + /* TBD do another round of LAP if check_alt_path returns not optimum */ + } else { + cma_dbg(id_priv, "received failure APR.(status=%d)\n", status); + } +} + +static void cma_mig_send_lap_work(struct work_struct *_work) +{ + struct cma_active_mig_send_lap_work *work = + container_of(_work, struct cma_active_mig_send_lap_work, work); + struct rdma_id_private *id_priv = work->id; + + mutex_lock(&id_priv->handler_mutex); + cma_dbg(id_priv, "sending mig-lap -- via resolve_alt_ib_route\n"); + + cma_resolve_alt_ib_route(id_priv); + mutex_unlock(&id_priv->handler_mutex); + cma_deref_id(id_priv); + kfree(work); +} + +static void cma_sap_work_handler(struct work_struct *_work) +{ + struct cma_sap_work *work = + container_of(_work, struct cma_sap_work, work.work); + struct rdma_id_private *id_priv = work->id; + struct ib_sa_path_rec path_rec; + union ib_gid pri_sgid, alt_sgid; + struct rdma_route *route = &id_priv->id.route; + int pri_ix, alt_ix, num_paths; + unsigned long flags; + int ret; + + + spin_lock_irqsave(&id_priv->cm_lock, flags); + if (!(id_priv->apm_flags & CMA_APM_ENABLED) || !id_priv->cm_id.ib || + id_priv->cm_id.ib->state != IB_CM_ESTABLISHED) { + spin_unlock_irqrestore(&id_priv->cm_lock, flags); + goto out; } + spin_unlock_irqrestore(&id_priv->cm_lock, flags); - cma_release_port(id_priv); - cma_deref_id(id_priv); - wait_for_completion(&id_priv->comp); + spin_lock_irqsave(&id_priv->lock, flags); + num_paths = route->num_paths; + alt_ix = id_priv->alt_path_index; + pri_ix = (num_paths == 2) ? alt_ix ^ 0x1 : 0; + memcpy(&pri_sgid, &route->path_rec[pri_ix].sgid, sizeof(pri_sgid)); + if (num_paths == 2) + memcpy(&alt_sgid, &route->path_rec[alt_ix].sgid, sizeof(alt_sgid)); + spin_unlock_irqrestore(&id_priv->lock, flags); - if (id_priv->internal_id) - cma_deref_id(id_priv->id.context); + mutex_lock(&id_priv->handler_mutex); + cma_dbg(id_priv, "sending sap\n"); + + ret = cma_suggest_alt_sgid(id_priv, &pri_sgid, &path_rec.sgid); + if (ret > 0 && work->from_portdown) { + if ((num_paths > 1) && + memcmp(&pri_sgid, &alt_sgid, sizeof (union ib_gid))) + cma_send_sap(id_priv, &path_rec); + else { + cma_dbg(id_priv, "sap not sent -- no need\n"); + } + } else if (!ret && !work->from_portdown) { + if ((num_paths < 2) || + !memcmp(&pri_sgid, &alt_sgid, sizeof (union ib_gid))) + cma_send_sap(id_priv, &path_rec); + else { + cma_dbg(id_priv, "sap not sent -- no need\n"); + } + } else + cma_dbg(id_priv, "no suggestion for alt sgid," + " ret = %d, port_down = %d\n", + ret, work->from_portdown); - kfree(id_priv->id.route.path_rec); - kfree(id_priv); + mutex_unlock(&id_priv->handler_mutex); +out: + cma_deref_id(id_priv); + kfree(work); } -EXPORT_SYMBOL(rdma_destroy_id); -static int cma_rep_recv(struct rdma_id_private *id_priv) +static int cma_schedule_sap(struct rdma_id_private *id_priv, int ms, int from_portdown) { - int ret; + struct cma_sap_work *sap_work; + unsigned long flags; + int ret = -EINVAL; - ret = cma_modify_qp_rtr(id_priv, NULL); - if (ret) - goto reject; + if (!(id_priv->apm_flags & CMA_APM_ENABLED)) { + cma_dbg(NULL, "No SAP scheduled -- APM not enabled\n"); + return -ENOSYS; + } - ret = cma_modify_qp_rts(id_priv, NULL); - if (ret) - goto reject; + sap_work = kzalloc(sizeof *sap_work, GFP_KERNEL); + if (!sap_work) { + cma_warn(NULL, "failed to allocate work\n"); + return -ENOMEM; + } - ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); - if (ret) - goto reject; 
+ spin_lock_irqsave(&id_priv->cm_lock, flags); + if (id_priv->cm_id.ib) { + if (id_priv->cm_id.ib->state != IB_CM_ESTABLISHED) { + kfree(sap_work); + cma_dbg(id_priv, "sap not scheduled." + "CM not in correct state (%d)\n", + id_priv->cm_id.ib->state); + goto out; + } + atomic_inc(&id_priv->refcount); + sap_work->id = id_priv; + sap_work->from_portdown = from_portdown; + INIT_DELAYED_WORK(&sap_work->work, cma_sap_work_handler); + queue_delayed_work(cma_wq, &sap_work->work, msecs_to_jiffies(ms)); + cma_dbg(id_priv, "sap is scheduled to %d ms from now\n", ms); + ret = 0; + } else { + kfree(sap_work); + cma_dbg(id_priv, "sap not scheduled. CM ID no longer valid\n"); + } +out: + spin_unlock_irqrestore(&id_priv->cm_lock, flags); return 0; -reject: - cma_modify_qp_err(id_priv); - ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, - NULL, 0, NULL, 0); - return ret; } -static int cma_verify_rep(struct rdma_id_private *id_priv, void *data) +void cma_spr_handler(struct rdma_id_private *id_priv, + enum ib_cm_apr_status status, + void *info, u8 info_len) { - if (id_priv->id.ps == RDMA_PS_SDP && - sdp_get_majv(((struct sdp_hah *) data)->sdp_version) != - SDP_MAJ_VERSION) - return -EINVAL; + cma_dbg(id_priv, "spr received, status=%d\n", status); + if (id_priv->cm_id.ib->sap_support_disabled) { + cma_dbg(id_priv, "ignoring spr since remote does not support SAP\n"); + return; + } + + if (status == IB_CM_SPR_BUSY) + cma_schedule_sap(id_priv, 4000, 0); + else if (status != IB_CM_SPR_SUCCESS) + id_priv->cm_id.ib->sap_support_disabled = 1; - return 0; } -static void cma_set_rep_event_data(struct rdma_cm_event *event, - struct ib_cm_rep_event_param *rep_data, - void *private_data) +static const char *cm_event_str(enum ib_cm_event_type event) { - event->param.conn.private_data = private_data; - event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE; - event->param.conn.responder_resources = rep_data->responder_resources; - event->param.conn.initiator_depth = rep_data->initiator_depth; - event->param.conn.flow_control = rep_data->flow_control; - event->param.conn.rnr_retry_count = rep_data->rnr_retry_count; - event->param.conn.srq = rep_data->srq; - event->param.conn.qp_num = rep_data->remote_qpn; + switch (event) { + case IB_CM_REQ_ERROR: return "IB_CM_REQ_ERROR"; + case IB_CM_REQ_RECEIVED: return "IB_CM_REQ_RECEIVED"; + case IB_CM_REP_ERROR: return "IB_CM_REP_ERROR"; + case IB_CM_REP_RECEIVED: return "IB_CM_REP_RECEIVED"; + case IB_CM_RTU_RECEIVED: return "IB_CM_RTU_RECEIVED"; + case IB_CM_USER_ESTABLISHED: return "IB_CM_USER_ESTABLISHED"; + case IB_CM_DREQ_ERROR: return "IB_CM_DREQ_ERROR"; + case IB_CM_DREQ_RECEIVED: return "IB_CM_DREQ_RECEIVED"; + case IB_CM_DREP_RECEIVED: return "IB_CM_DREP_RECEIVED"; + case IB_CM_TIMEWAIT_EXIT: return "IB_CM_TIMEWAIT_EXIT"; + case IB_CM_MRA_RECEIVED: return "IB_CM_MRA_RECEIVED"; + case IB_CM_REJ_RECEIVED: return "IB_CM_REJ_RECEIVED"; + case IB_CM_LAP_ERROR: return "IB_CM_LAP_ERROR"; + case IB_CM_LAP_RECEIVED: return "IB_CM_LAP_RECEIVED"; + case IB_CM_APR_RECEIVED: return "IB_CM_APR_RECEIVED"; + case IB_CM_SIDR_REQ_ERROR: return "IB_CM_SIDR_REQ_ERROR"; + case IB_CM_SIDR_REQ_RECEIVED: return "IB_CM_SIDR_REQ_RECEIVED"; + case IB_CM_SIDR_REP_RECEIVED: return "IB_CM_SIDR_REP_RECEIVED"; + case IB_CM_SAP_RECEIVED: return "IB_CM_SAP_RECEIVED"; + case IB_CM_SPR_RECEIVED: return "IB_CM_SPR_RECEIVED"; + default: return "unknown CM event"; + } +} + +static const char *cma_event_str(enum rdma_cm_event_type event) +{ + switch (event) { + case 
RDMA_CM_EVENT_ADDR_RESOLVED: return "RDMA_CM_EVENT_ADDR_RESOLVED"; + case RDMA_CM_EVENT_ADDR_ERROR: return "RDMA_CM_EVENT_ADDR_ERROR"; + case RDMA_CM_EVENT_ROUTE_RESOLVED: return "RDMA_CM_EVENT_ROUTE_RESOLVED"; + case RDMA_CM_EVENT_ROUTE_ERROR: return "RDMA_CM_EVENT_ROUTE_ERROR"; + case RDMA_CM_EVENT_CONNECT_REQUEST: return "RDMA_CM_EVENT_CONNECT_REQUEST"; + case RDMA_CM_EVENT_CONNECT_RESPONSE: return "RDMA_CM_EVENT_CONNECT_RESPONSE"; + case RDMA_CM_EVENT_CONNECT_ERROR: return "RDMA_CM_EVENT_CONNECT_ERROR"; + case RDMA_CM_EVENT_UNREACHABLE: return "RDMA_CM_EVENT_UNREACHABLE"; + case RDMA_CM_EVENT_REJECTED: return "RDMA_CM_EVENT_REJECTED"; + case RDMA_CM_EVENT_ESTABLISHED: return "RDMA_CM_EVENT_ESTABLISHED"; + case RDMA_CM_EVENT_DISCONNECTED: return "RDMA_CM_EVENT_DISCONNECTED"; + case RDMA_CM_EVENT_DEVICE_REMOVAL: return "RDMA_CM_EVENT_DEVICE_REMOVAL"; + case RDMA_CM_EVENT_MULTICAST_JOIN: return "RDMA_CM_EVENT_MULTICAST_JOIN"; + case RDMA_CM_EVENT_MULTICAST_ERROR: return "RDMA_CM_EVENT_MULTICAST_ERROR"; + case RDMA_CM_EVENT_ADDR_CHANGE: return "RDMA_CM_EVENT_ADDR_CHANGE"; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: return "RDMA_CM_EVENT_TIMEWAIT_EXIT"; + case RDMA_CM_EVENT_ALT_ROUTE_RESOLVED: return "RDMA_CM_EVENT_ALT_ROUTE_RESOLVED"; + case RDMA_CM_EVENT_ALT_ROUTE_ERROR: return "RDMA_CM_EVENT_ALT_ROUTE_ERROR"; + case RDMA_CM_EVENT_ALT_PATH_LOADED: return "RDMA_CM_EVENT_ALT_PATH_LOADED"; + default: return "unknown CMA event"; + } } static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) @@ -1001,11 +1832,12 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) int ret = 0; if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && - cma_disable_callback(id_priv, RDMA_CM_CONNECT)) || + cma_disable_callback(id_priv, CMA_CONNECT)) || (ib_event->event == IB_CM_TIMEWAIT_EXIT && - cma_disable_callback(id_priv, RDMA_CM_DISCONNECT))) + cma_disable_callback(id_priv, CMA_DISCONNECT))) return 0; + cma_dbg(id_priv, "received event %s(%d)\n", cm_event_str(ib_event->event), ib_event->event); memset(&event, 0, sizeof event); switch (ib_event->event) { case IB_CM_REQ_ERROR: @@ -1034,8 +1866,7 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) event.status = -ETIMEDOUT; /* fall through */ case IB_CM_DREQ_RECEIVED: case IB_CM_DREP_RECEIVED: - if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT, - RDMA_CM_DISCONNECT)) + if (!cma_comp_exch(id_priv, CMA_CONNECT, CMA_DISCONNECT)) goto out; event.event = RDMA_CM_EVENT_DISCONNECTED; break; @@ -1052,6 +1883,24 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) event.param.conn.private_data = ib_event->private_data; event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; break; + case IB_CM_LAP_RECEIVED: + cma_lap_handler(id_priv, + ib_event->param.lap_rcvd.alternate_path); + goto out; + case IB_CM_SAP_RECEIVED: + cma_sap_handler(id_priv, + &ib_event->param.sap_rcvd.alternate_path->dgid); + goto out; + case IB_CM_APR_RECEIVED: + cma_apr_handler(id_priv, ib_event->param.apr_rcvd.ap_status, + ib_event->param.apr_rcvd.apr_info, + ib_event->param.apr_rcvd.info_len); + goto out; + case IB_CM_SPR_RECEIVED: + cma_spr_handler(id_priv, ib_event->param.spr_rcvd.ap_status, + ib_event->param.spr_rcvd.spr_info, + ib_event->param.spr_rcvd.info_len); + goto out; default: printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); @@ -1062,11 +1911,27 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) if (ret) { /* Destroy the CM ID by 
returning a non-zero value. */ id_priv->cm_id.ib = NULL; - cma_exch(id_priv, RDMA_CM_DESTROYING); + cma_exch(id_priv, CMA_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; } + cma_dbg(id_priv, "apm enabled %d, event %s(%d), ps %d, active %d\n", + !!(id_priv->apm_flags & CMA_APM_ENABLED), + cma_event_str(event.event), event.event, + id_priv->id.ps, !!(id_priv->apm_flags & CMA_APM_ACTIVE_SIDE)); + if ( id_priv->apm_flags & CMA_APM_ENABLED && + (event.event == RDMA_CM_EVENT_ESTABLISHED || + (id_priv->id.ps == RDMA_PS_SDP && + event.event == RDMA_CM_EVENT_CONNECT_RESPONSE))) { + if (id_priv->apm_flags & CMA_APM_ACTIVE_SIDE) { + cma_dbg(id_priv, "begin resolve alt route\n"); + cma_resolve_alt_ib_route(id_priv); + } else { + cma_dbg(id_priv, "calling cma_schedule_sap\n"); + cma_schedule_sap(id_priv, 2000, 0); + } + } out: mutex_unlock(&id_priv->handler_mutex); return ret; @@ -1088,7 +1953,7 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, goto err; id = rdma_create_id(listen_id->event_handler, listen_id->context, - listen_id->ps, ib_event->param.req_rcvd.qp_type); + listen_id->ps); if (IS_ERR(id)) goto err; @@ -1097,8 +1962,7 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, rt = &id->route; rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1; - rt->path_rec = kmalloc(sizeof *rt->path_rec * rt->num_paths, - GFP_KERNEL); + rt->path_rec = kmalloc(sizeof *rt->path_rec * 2, GFP_KERNEL); if (!rt->path_rec) goto destroy_id; @@ -1119,7 +1983,7 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); id_priv = container_of(id, struct rdma_id_private, id); - id_priv->state = RDMA_CM_CONNECT; + id_priv->state = CMA_CONNECT; return id_priv; destroy_id: @@ -1139,7 +2003,7 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, int ret; id = rdma_create_id(listen_id->event_handler, listen_id->context, - listen_id->ps, IB_QPT_UD); + listen_id->ps); if (IS_ERR(id)) return NULL; @@ -1159,7 +2023,7 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, } id_priv = container_of(id, struct rdma_id_private, id); - id_priv->state = RDMA_CM_CONNECT; + id_priv->state = CMA_CONNECT; return id_priv; err: rdma_destroy_id(id); @@ -1188,13 +2052,14 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) int offset, ret; listen_id = cm_id->context; - if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) + cma_dbg(listen_id, "received %s\n", cm_event_str(ib_event->event)); + if (cma_disable_callback(listen_id, CMA_LISTEN)) return -ECONNABORTED; memset(&event, 0, sizeof event); offset = cma_user_data_offset(listen_id->id.ps); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; - if (listen_id->id.qp_type == IB_QPT_UD) { + if (cma_is_ud_ps(listen_id->id.ps)) { conn_id = cma_new_udp_id(&listen_id->id, ib_event); event.param.ud.private_data = ib_event->private_data + offset; event.param.ud.private_data_len = @@ -1210,7 +2075,9 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); + mutex_lock(&lock); ret = cma_acquire_dev(conn_id); + mutex_unlock(&lock); if (ret) goto release_conn_id; @@ -1230,8 +2097,11 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) * while we're accessing the cm_id. 
*/ mutex_lock(&lock); - if (cma_comp(conn_id, RDMA_CM_CONNECT) && (conn_id->id.qp_type != IB_QPT_UD)) + if (cma_comp(conn_id, CMA_CONNECT) && + !cma_is_ud_ps(conn_id->id.ps)) { + cma_dbg(container_of(&conn_id->id, struct rdma_id_private, id), "sending MRA\n"); ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); + } mutex_unlock(&lock); mutex_unlock(&conn_id->handler_mutex); cma_deref_id(conn_id); @@ -1243,7 +2113,7 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) conn_id->cm_id.ib = NULL; release_conn_id: - cma_exch(conn_id, RDMA_CM_DESTROYING); + cma_exch(conn_id, CMA_DESTROYING); mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(&conn_id->id); @@ -1314,7 +2184,7 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) struct sockaddr_in *sin; int ret = 0; - if (cma_disable_callback(id_priv, RDMA_CM_CONNECT)) + if (cma_disable_callback(id_priv, CMA_CONNECT)) return 0; memset(&event, 0, sizeof event); @@ -1357,7 +2227,7 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.iw = NULL; - cma_exch(id_priv, RDMA_CM_DESTROYING); + cma_exch(id_priv, CMA_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; @@ -1379,20 +2249,20 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct ib_device_attr attr; listen_id = cm_id->context; - if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) + if (cma_disable_callback(listen_id, CMA_LISTEN)) return -ECONNABORTED; /* Create a new RDMA id for the new IW CM ID */ new_cm_id = rdma_create_id(listen_id->id.event_handler, listen_id->id.context, - RDMA_PS_TCP, IB_QPT_RC); + RDMA_PS_TCP); if (IS_ERR(new_cm_id)) { ret = -ENOMEM; goto out; } conn_id = container_of(new_cm_id, struct rdma_id_private, id); mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); - conn_id->state = RDMA_CM_CONNECT; + conn_id->state = CMA_CONNECT; dev = ip_dev_find(&init_net, iw_event->local_addr.sin_addr.s_addr); if (!dev) { @@ -1408,7 +2278,9 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, goto out; } + mutex_lock(&lock); ret = cma_acquire_dev(conn_id); + mutex_unlock(&lock); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); @@ -1447,7 +2319,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, if (ret) { /* User wants to destroy the CM ID */ conn_id->cm_id.iw = NULL; - cma_exch(conn_id, RDMA_CM_DESTROYING); + cma_exch(conn_id, CMA_DESTROYING); mutex_unlock(&conn_id->handler_mutex); cma_deref_id(conn_id); rdma_destroy_id(&conn_id->id); @@ -1468,13 +2340,15 @@ static int cma_ib_listen(struct rdma_id_private *id_priv) { struct ib_cm_compare_data compare_data; struct sockaddr *addr; + struct ib_cm_id *id; __be64 svc_id; int ret; - id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, cma_req_handler, - id_priv); - if (IS_ERR(id_priv->cm_id.ib)) - return PTR_ERR(id_priv->cm_id.ib); + id = ib_create_cm_id(id_priv->id.device, cma_req_handler, id_priv); + if (IS_ERR(id)) + return PTR_ERR(id); + + id_priv->cm_id.ib = id; addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr; svc_id = cma_get_service_id(id_priv->id.ps, addr); @@ -1497,12 +2371,15 @@ static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) { int ret; struct sockaddr_in *sin; + struct iw_cm_id *id; + + id = iw_create_cm_id(id_priv->id.device, + iw_conn_req_handler, + id_priv); + if (IS_ERR(id)) + return PTR_ERR(id); - 
id_priv->cm_id.iw = iw_create_cm_id(id_priv->id.device, - iw_conn_req_handler, - id_priv); - if (IS_ERR(id_priv->cm_id.iw)) - return PTR_ERR(id_priv->cm_id.iw); + id_priv->cm_id.iw = id; sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; id_priv->cm_id.iw->local_addr = *sin; @@ -1534,14 +2411,13 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, struct rdma_cm_id *id; int ret; - id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps, - id_priv->id.qp_type); + id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps); if (IS_ERR(id)) return; dev_id_priv = container_of(id, struct rdma_id_private, id); - dev_id_priv->state = RDMA_CM_ADDR_BOUND; + dev_id_priv->state = CMA_ADDR_BOUND; memcpy(&id->route.addr.src_addr, &id_priv->id.route.addr.src_addr, ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr)); @@ -1552,8 +2428,7 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, ret = rdma_listen(id, id_priv->backlog); if (ret) - printk(KERN_WARNING "RDMA CMA: cma_listen_on_dev, error %d, " - "listening on device %s\n", ret, cma_dev->device->name); + cma_warn(id_priv, "cma_listen_on_dev, error %d, listening on device %s\n", ret, cma_dev->device->name); } static void cma_listen_on_all(struct rdma_id_private *id_priv) @@ -1567,6 +2442,50 @@ static void cma_listen_on_all(struct rdma_id_private *id_priv) mutex_unlock(&lock); } +int rdma_listen(struct rdma_cm_id *id, int backlog) +{ + struct rdma_id_private *id_priv; + int ret; + + id_priv = container_of(id, struct rdma_id_private, id); + if (id_priv->state == CMA_IDLE) { + ((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET; + ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr); + if (ret) + return ret; + } + + if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_LISTEN)) + return -EINVAL; + + id_priv->backlog = backlog; + if (id->device) { + switch (rdma_node_get_transport(id->device->node_type)) { + case RDMA_TRANSPORT_IB: + ret = cma_ib_listen(id_priv); + if (ret) + goto err; + break; + case RDMA_TRANSPORT_IWARP: + ret = cma_iw_listen(id_priv, backlog); + if (ret) + goto err; + break; + default: + ret = -ENOSYS; + goto err; + } + } else + cma_listen_on_all(id_priv); + + return 0; +err: + id_priv->backlog = 0; + cma_comp_exch(id_priv, CMA_LISTEN, CMA_ADDR_BOUND); + return ret; +} +EXPORT_SYMBOL(rdma_listen); + void rdma_set_service_type(struct rdma_cm_id *id, int tos) { struct rdma_id_private *id_priv; @@ -1576,59 +2495,104 @@ void rdma_set_service_type(struct rdma_cm_id *id, int tos) } EXPORT_SYMBOL(rdma_set_service_type); +void rdma_set_timeout(struct rdma_cm_id *id, int timeout) +{ + struct rdma_id_private *id_priv; + + id_priv = container_of(id, struct rdma_id_private, id); + id_priv->qp_timeout = (u8) timeout; +} +EXPORT_SYMBOL(rdma_set_timeout); + static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec, void *context) { struct cma_work *work = context; struct rdma_route *route; + struct rdma_id_private *id_priv = work->id; + unsigned long flags; route = &work->id->id.route; if (!status) { - route->num_paths = 1; - *route->path_rec = *path_rec; + cma_debug_path(id_priv, "got path: ", (*path_rec)); + cma_dbg(id_priv, "current num_paths=%d\n", route->num_paths); + + spin_lock_irqsave(&id_priv->lock, flags); + if (route->num_paths == 0) { + route->path_rec[0] = *path_rec; + route->num_paths = 1; + } else { + if ((route->num_paths < 2) && + (memcmp(&path_rec->sgid, &route->path_rec[0].sgid, + sizeof(union ib_gid)) || + 
memcmp(&path_rec->dgid, &route->path_rec[0].dgid, + sizeof(union ib_gid)))) + { + cma_dbg(id_priv, "new path_rec gids different" + " from primary path gids. Accepting\n"); + route->num_paths++; + } + if (route->num_paths == 2) { + route->path_rec[id_priv->alt_path_index] = *path_rec; + cma_send_lap(id_priv, &route->path_rec[id_priv->alt_path_index]); + } + } + spin_unlock_irqrestore(&id_priv->lock, flags); } else { - work->old_state = RDMA_CM_ROUTE_QUERY; - work->new_state = RDMA_CM_ADDR_RESOLVED; - work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; - work->event.status = status; + cma_warn(id_priv, "bad status %d from path query\n", status); + if (!route->num_paths) { + work->old_state = CMA_ROUTE_QUERY; + work->new_state = CMA_ADDR_RESOLVED; + work->event.status = status; + work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; + } else { + work->old_state = CMA_CONNECT; + work->new_state = CMA_CONNECT; + work->event.status = status; + work->event.event = RDMA_CM_EVENT_ALT_ROUTE_ERROR; + cma_dbg(id_priv, "failed to get alternate path record\n"); + } } - queue_work(cma_wq, &work->work); } static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, - struct cma_work *work) + struct cma_work *work, + struct ib_sa_path_rec *path_rec, + ib_sa_comp_mask comp_mask) { - struct rdma_addr *addr = &id_priv->id.route.addr; - struct ib_sa_path_rec path_rec; - ib_sa_comp_mask comp_mask; struct sockaddr_in6 *sin6; + struct rdma_addr *addr = &id_priv->id.route.addr; - memset(&path_rec, 0, sizeof path_rec); - rdma_addr_get_sgid(&addr->dev_addr, &path_rec.sgid); - rdma_addr_get_dgid(&addr->dev_addr, &path_rec.dgid); - path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(&addr->dev_addr)); - path_rec.numb_path = 1; - path_rec.reversible = 1; - path_rec.service_id = cma_get_service_id(id_priv->id.ps, + cma_debug_path(id_priv, "query for ", (*path_rec)); + + path_rec->pkey = cpu_to_be16(ib_addr_get_pkey(&addr->dev_addr)); + path_rec->numb_path = 1; + path_rec->reversible = 1; + path_rec->service_id = cma_get_service_id(id_priv->id.ps, (struct sockaddr *) &addr->dst_addr); - comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | - IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | - IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID; + comp_mask |= IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | + IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID; if (addr->src_addr.ss_family == AF_INET) { - path_rec.qos_class = cpu_to_be16((u16) id_priv->tos); + path_rec->qos_class = cpu_to_be16((u16) id_priv->tos); comp_mask |= IB_SA_PATH_REC_QOS_CLASS; } else { sin6 = (struct sockaddr_in6 *) &addr->src_addr; - path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20); + path_rec->traffic_class = + (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; } + if (tavor_quirk) { + path_rec->mtu_selector = IB_SA_LT; + path_rec->mtu = IB_MTU_2048; + } + id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device, - id_priv->id.port_num, &path_rec, + id_priv->id.port_num, path_rec, comp_mask, timeout_ms, GFP_KERNEL, cma_query_handler, work, &id_priv->query); @@ -1636,6 +2600,54 @@ static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, return (id_priv->query_id < 0) ? 
id_priv->query_id : 0; } +static int cma_query_primary_ib_route(struct rdma_id_private *id_priv, + int timeout_ms, struct cma_work *work) +{ + struct rdma_addr *addr = &id_priv->id.route.addr; + struct ib_sa_path_rec path_rec; + ib_sa_comp_mask comp_mask; + + memset(&path_rec, 0, sizeof path_rec); + rdma_addr_get_sgid(&addr->dev_addr, &path_rec.sgid); + rdma_addr_get_dgid(&addr->dev_addr, &path_rec.dgid); + + comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID; + + return cma_query_ib_route(id_priv, timeout_ms, work, + &path_rec, comp_mask); +} + +static int cma_query_alt_ib_route(struct rdma_id_private *id_priv, + int timeout_ms, struct cma_work *work) +{ + struct ib_sa_path_rec path_rec; + struct rdma_route *route; + ib_sa_comp_mask comp_mask; + int pri_path_index; + unsigned long flags; + int ret; + + route = &id_priv->id.route; + memset(&path_rec, 0, sizeof path_rec); + + spin_lock_irqsave(&id_priv->lock, flags); + cma_dbg(id_priv, "num route %d\n", route->num_paths); + pri_path_index = (route->num_paths ==2) ? id_priv->alt_path_index ^ 0x1 : 0; + path_rec.sgid = route->path_rec[pri_path_index].sgid; + path_rec.dgid = route->path_rec[pri_path_index].dgid; + spin_unlock_irqrestore(&id_priv->lock, flags); + + ret = cma_suggest_alt_sgid(id_priv, &path_rec.sgid, &path_rec.sgid); + if (ret < 0) { + cma_dbg(id_priv, "failed to get alt sgid\n"); + return ret; + } + + comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID; + return cma_query_ib_route(id_priv, timeout_ms, work, + &path_rec, comp_mask); +} + static void cma_work_handler(struct work_struct *_work) { struct cma_work *work = container_of(_work, struct cma_work, work); @@ -1647,7 +2659,7 @@ static void cma_work_handler(struct work_struct *_work) goto out; if (id_priv->id.event_handler(&id_priv->id, &work->event)) { - cma_exch(id_priv, RDMA_CM_DESTROYING); + cma_exch(id_priv, CMA_DESTROYING); destroy = 1; } out: @@ -1665,12 +2677,12 @@ static void cma_ndev_work_handler(struct work_struct *_work) int destroy = 0; mutex_lock(&id_priv->handler_mutex); - if (id_priv->state == RDMA_CM_DESTROYING || - id_priv->state == RDMA_CM_DEVICE_REMOVAL) + if (id_priv->state == CMA_DESTROYING || + id_priv->state == CMA_DEVICE_REMOVAL) goto out; if (id_priv->id.event_handler(&id_priv->id, &work->event)) { - cma_exch(id_priv, RDMA_CM_DESTROYING); + cma_exch(id_priv, CMA_DESTROYING); destroy = 1; } @@ -1682,7 +2694,39 @@ out: kfree(work); } -static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) +static int cma_resolve_alt_ib_route(struct rdma_id_private *id_priv) +{ + struct cma_work *work; + int ret = 0; + struct rdma_route *route = &id_priv->id.route; + + /* it does not make sense to handle alternate routes if we don't have a primary route */ + if (route->num_paths == 0) + return -EINVAL; + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) + return -ENOMEM; + + atomic_inc(&id_priv->refcount); + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = CMA_CONNECT; + work->new_state = CMA_CONNECT; + work->event.event = RDMA_CM_EVENT_ALT_ROUTE_RESOLVED; + + ret = cma_query_alt_ib_route(id_priv, id_priv->timeout_ms, work); + if (ret) { + kfree(work); + cma_deref_id(id_priv); + cma_dbg(id_priv, "failed to start alt route discovery (%d)\n", ret); + return ret; + } + return 0; +} + +static int cma_resolve_primary_ib_route(struct rdma_id_private *id_priv, + int timeout_ms) { struct rdma_route *route = &id_priv->id.route; struct cma_work *work; @@ -1692,19 +2736,21 @@ static int 
cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) if (!work) return -ENOMEM; + id_priv->timeout_ms = timeout_ms; + work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); - work->old_state = RDMA_CM_ROUTE_QUERY; - work->new_state = RDMA_CM_ROUTE_RESOLVED; + work->old_state = CMA_ROUTE_QUERY; + work->new_state = CMA_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; - route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); + route->path_rec = kmalloc(sizeof *route->path_rec * 2, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } - ret = cma_query_ib_route(id_priv, timeout_ms, work); + ret = cma_query_primary_ib_route(id_priv, timeout_ms, work); if (ret) goto err2; @@ -1724,21 +2770,20 @@ int rdma_set_ib_paths(struct rdma_cm_id *id, int ret; id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, - RDMA_CM_ROUTE_RESOLVED)) + if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_RESOLVED)) return -EINVAL; - id->route.path_rec = kmemdup(path_rec, sizeof *path_rec * num_paths, - GFP_KERNEL); + id->route.path_rec = kmalloc(sizeof *path_rec * 2, GFP_KERNEL); if (!id->route.path_rec) { ret = -ENOMEM; goto err; } + memcpy(id->route.path_rec, path_rec, sizeof *path_rec * num_paths); id->route.num_paths = num_paths; return 0; err: - cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED); + cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_ADDR_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_set_ib_paths); @@ -1753,13 +2798,18 @@ static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); - work->old_state = RDMA_CM_ROUTE_QUERY; - work->new_state = RDMA_CM_ROUTE_RESOLVED; + work->old_state = CMA_ROUTE_QUERY; + work->new_state = CMA_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; queue_work(cma_wq, &work->work); return 0; } +static u8 tos_to_sl(u8 tos) +{ + return def_prec2sl & 7; +} + static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; @@ -1769,7 +2819,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) struct sockaddr_in *src_addr = (struct sockaddr_in *)&route->addr.src_addr; struct sockaddr_in *dst_addr = (struct sockaddr_in *)&route->addr.dst_addr; struct net_device *ndev = NULL; - u16 vid; + u16 vid = 0; if (src_addr->sin_family != dst_addr->sin_family) return -EINVAL; @@ -1796,7 +2846,9 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) goto err2; } - vid = rdma_vlan_dev_vlan_id(ndev); +#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) + vid = vlan_dev_vlan_id(ndev); +#endif iboe_mac_vlan_to_ll(&route->path_rec->sgid, addr->dev_addr.src_dev_addr, vid); iboe_mac_vlan_to_ll(&route->path_rec->dgid, addr->dev_addr.dst_dev_addr, vid); @@ -1805,21 +2857,21 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) route->path_rec->reversible = 1; route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; - route->path_rec->sl = id_priv->tos >> 5; + route->path_rec->sl = tos_to_sl(id_priv->tos); route->path_rec->mtu = iboe_get_mtu(ndev->mtu); route->path_rec->rate_selector = IB_SA_EQ; route->path_rec->rate = iboe_get_rate(ndev); dev_put(ndev); route->path_rec->packet_life_time_selector = IB_SA_EQ; - route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; + route->path_rec->packet_life_time = IBOE_PACKET_LIFETIME; if 
(!route->path_rec->mtu) { ret = -EINVAL; goto err2; } - work->old_state = RDMA_CM_ROUTE_QUERY; - work->new_state = RDMA_CM_ROUTE_RESOLVED; + work->old_state = CMA_ROUTE_QUERY; + work->new_state = CMA_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; work->event.status = 0; @@ -1841,15 +2893,15 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) int ret; id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY)) + if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ROUTE_QUERY)) return -EINVAL; atomic_inc(&id_priv->refcount); switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: - switch (rdma_port_get_link_layer(id->device, id->port_num)) { + switch (rdma_port_link_layer(id->device, id->port_num)) { case IB_LINK_LAYER_INFINIBAND: - ret = cma_resolve_ib_route(id_priv, timeout_ms); + ret = cma_resolve_primary_ib_route(id_priv, timeout_ms); break; case IB_LINK_LAYER_ETHERNET: ret = cma_resolve_iboe_route(id_priv); @@ -1870,12 +2922,38 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) return 0; err: - cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED); + cma_comp_exch(id_priv, CMA_ROUTE_QUERY, CMA_ADDR_RESOLVED); cma_deref_id(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_route); + +int rdma_enable_apm(struct rdma_cm_id *id, enum alt_path_type alt_type) +{ + struct rdma_id_private *id_priv; + id_priv = container_of(id, struct rdma_id_private, id); + + if (!id->device) + return -EINVAL; + + if (rdma_node_get_transport(id->device->node_type) != + RDMA_TRANSPORT_IB) { + cma_warn(id_priv, "wrong transport\n"); + return -EINVAL; + } + + if (rdma_port_link_layer(id->device, id->port_num) != + IB_LINK_LAYER_INFINIBAND) { + cma_warn(id_priv, "wrong link layer\n"); + return -EINVAL; + } + id_priv->apm_flags |= CMA_APM_ENABLED; + cma_dbg(id_priv, "apm is enabled\n"); + return 0; +} +EXPORT_SYMBOL(rdma_enable_apm); + static int cma_bind_loopback(struct rdma_id_private *id_priv) { struct cma_device *cma_dev; @@ -1909,7 +2987,7 @@ port_found: goto out; id_priv->id.route.addr.dev_addr.dev_type = - (rdma_port_get_link_layer(cma_dev->device, p) == IB_LINK_LAYER_INFINIBAND) ? + (rdma_port_link_layer(cma_dev->device, p) == IB_LINK_LAYER_INFINIBAND) ? ARPHRD_INFINIBAND : ARPHRD_ETHER; rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); @@ -1929,16 +3007,23 @@ static void addr_handler(int status, struct sockaddr *src_addr, memset(&event, 0, sizeof event); mutex_lock(&id_priv->handler_mutex); - if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, - RDMA_CM_ADDR_RESOLVED)) + + /* + * Grab mutex to block rdma_destroy_id() from removing the device while + * we're trying to acquire it. 
+ */ + mutex_lock(&lock); + if (!cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_RESOLVED)) { + mutex_unlock(&lock); goto out; + } if (!status && !id_priv->cma_dev) status = cma_acquire_dev(id_priv); + mutex_unlock(&lock); if (status) { - if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, - RDMA_CM_ADDR_BOUND)) + if (!cma_comp_exch(id_priv, CMA_ADDR_RESOLVED, CMA_ADDR_BOUND)) goto out; event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = status; @@ -1949,7 +3034,7 @@ static void addr_handler(int status, struct sockaddr *src_addr, } if (id_priv->id.event_handler(&id_priv->id, &event)) { - cma_exch(id_priv, RDMA_CM_DESTROYING); + cma_exch(id_priv, CMA_DESTROYING); mutex_unlock(&id_priv->handler_mutex); cma_deref_id(id_priv); rdma_destroy_id(&id_priv->id); @@ -1994,8 +3079,8 @@ static int cma_resolve_loopback(struct rdma_id_private *id_priv) work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); - work->old_state = RDMA_CM_ADDR_QUERY; - work->new_state = RDMA_CM_ADDR_RESOLVED; + work->old_state = CMA_ADDR_QUERY; + work->new_state = CMA_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; queue_work(cma_wq, &work->work); return 0; @@ -2024,13 +3109,13 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, int ret; id_priv = container_of(id, struct rdma_id_private, id); - if (id_priv->state == RDMA_CM_IDLE) { + if (id_priv->state == CMA_IDLE) { ret = cma_bind_addr(id, src_addr, dst_addr); if (ret) return ret; } - if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) + if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_ADDR_QUERY)) return -EINVAL; atomic_inc(&id_priv->refcount); @@ -2040,36 +3125,17 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, else ret = rdma_resolve_ip(&addr_client, (struct sockaddr *) &id->route.addr.src_addr, dst_addr, &id->route.addr.dev_addr, - timeout_ms, addr_handler, id_priv); - if (ret) - goto err; - - return 0; -err: - cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); - cma_deref_id(id_priv); - return ret; -} -EXPORT_SYMBOL(rdma_resolve_addr); - -int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) -{ - struct rdma_id_private *id_priv; - unsigned long flags; - int ret; - - id_priv = container_of(id, struct rdma_id_private, id); - spin_lock_irqsave(&id_priv->lock, flags); - if (id_priv->state == RDMA_CM_IDLE) { - id_priv->reuseaddr = reuse; - ret = 0; - } else { - ret = -EINVAL; - } - spin_unlock_irqrestore(&id_priv->lock, flags); + timeout_ms, addr_handler, id_priv); + if (ret) + goto err; + + return 0; +err: + cma_comp_exch(id_priv, CMA_ADDR_QUERY, CMA_ADDR_BOUND); + cma_deref_id(id_priv); return ret; } -EXPORT_SYMBOL(rdma_set_reuseaddr); +EXPORT_SYMBOL(rdma_resolve_addr); static void cma_bind_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) @@ -2117,100 +3183,112 @@ err1: static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv) { - static unsigned int last_used_port; - int low, high, remaining; - unsigned int rover; - - inet_get_local_port_range(&low, &high); - remaining = (high - low) + 1; - rover = net_random() % remaining + low; -retry: - if (last_used_port != rover && - !idr_find(ps, (unsigned short) rover)) { - int ret = cma_alloc_port(ps, id_priv, rover); - /* - * Remember previously used port number in order to avoid - * re-using same port immediately after it is closed. 
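The addr_handler() hunk just above now holds the global cma lock across the state check and cma_acquire_dev(), so a concurrent rdma_destroy_id() (which detaches the device under the same mutex) cannot free the cma_dev while the handler is still attaching to it. A rough pthreads sketch of that pattern; the names are illustrative, not the kernel API:

#include <pthread.h>
#include <stdio.h>

enum state { ADDR_QUERY, ADDR_RESOLVED, DESTROYING };

struct cma_dev { int refcount; };
struct id_priv {
	enum state state;
	struct cma_dev *dev;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; /* plays the role of cma.c's global 'lock' */
static struct cma_dev the_dev;

/* resolution callback: state change and device acquisition happen under one lock */
static void addr_handler(struct id_priv *id)
{
	pthread_mutex_lock(&lock);
	if (id->state != ADDR_QUERY) {          /* lost the race with destroy */
		pthread_mutex_unlock(&lock);
		return;
	}
	id->state = ADDR_RESOLVED;
	id->dev = &the_dev;                     /* stands in for cma_acquire_dev() */
	id->dev->refcount++;
	pthread_mutex_unlock(&lock);
}

/* teardown path: takes the same lock, so it either sees no device at all
 * or a fully acquired one, never a half-attached state */
static void destroy_id(struct id_priv *id)
{
	pthread_mutex_lock(&lock);
	id->state = DESTROYING;
	if (id->dev) {
		id->dev->refcount--;
		id->dev = NULL;
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	struct id_priv id = { .state = ADDR_QUERY, .dev = NULL };

	addr_handler(&id);
	destroy_id(&id);
	printf("refcount back to %d\n", the_dev.refcount);
	return 0;
}

The point is only that the state transition and the device acquisition form one atomic step with respect to teardown.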
- */ - if (!ret) - last_used_port = rover; - if (ret != -EADDRNOTAVAIL) - return ret; - } - if (--remaining) { - rover++; - if ((rover < low) || (rover > high)) - rover = low; - goto retry; - } - return -EADDRNOTAVAIL; -} - -/* - * Check that the requested port is available. This is called when trying to - * bind to a specific port, or when trying to listen on a bound port. In - * the latter case, the provided id_priv may already be on the bind_list, but - * we still need to check that it's okay to start listening. - */ -static int cma_check_port(struct rdma_bind_list *bind_list, - struct rdma_id_private *id_priv, uint8_t reuseaddr) -{ - struct rdma_id_private *cur_id; - struct sockaddr *addr, *cur_addr; - struct hlist_node *node; + struct rdma_bind_list *bind_list; + int port, ret, low, high; - addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr; - if (cma_any_addr(addr) && !reuseaddr) - return -EADDRNOTAVAIL; + bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); + if (!bind_list) + return -ENOMEM; - hlist_for_each_entry(cur_id, node, &bind_list->owners, node) { - if (id_priv == cur_id) - continue; +retry: + /* FIXME: add proper port randomization per like inet_csk_get_port */ + do { + ret = idr_get_new_above(ps, bind_list, next_port, &port); + } while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL)); - if ((cur_id->state == RDMA_CM_LISTEN) || - !reuseaddr || !cur_id->reuseaddr) { - cur_addr = (struct sockaddr *) &cur_id->id.route.addr.src_addr; - if (cma_any_addr(cur_addr)) - return -EADDRNOTAVAIL; + if (ret) + goto err1; - if (!cma_addr_cmp(addr, cur_addr)) - return -EADDRINUSE; + inet_get_local_port_range(&low, &high); + if (port > high) { + if (next_port != low) { + idr_remove(ps, port); + next_port = low; + goto retry; } + ret = -EADDRNOTAVAIL; + goto err2; } + + if (port == high) + next_port = low; + else + next_port = port + 1; + + bind_list->ps = ps; + bind_list->port = (unsigned short) port; + cma_bind_port(bind_list, id_priv); return 0; +err2: + idr_remove(ps, port); +err1: + kfree(bind_list); + return ret; } static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv) { + struct rdma_id_private *cur_id; + struct sockaddr_in *sin, *cur_sin; struct rdma_bind_list *bind_list; + struct hlist_node *node; unsigned short snum; - int ret; - snum = ntohs(cma_port((struct sockaddr *) &id_priv->id.route.addr.src_addr)); + sin = (struct sockaddr_in *) &id_priv->id.route.addr.src_addr; + snum = ntohs(sin->sin_port); if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; bind_list = idr_find(ps, snum); - if (!bind_list) { - ret = cma_alloc_port(ps, id_priv, snum); - } else { - ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr); - if (!ret) - cma_bind_port(bind_list, id_priv); + if (!bind_list) + return cma_alloc_port(ps, id_priv, snum); + + /* + * We don't support binding to any address if anyone is bound to + * a specific address on the same port. 
+ */ + if (cma_any_addr((struct sockaddr *) &id_priv->id.route.addr.src_addr)) + return -EADDRNOTAVAIL; + + hlist_for_each_entry(cur_id, node, &bind_list->owners, node) { + if (cma_any_addr((struct sockaddr *) &cur_id->id.route.addr.src_addr)) + return -EADDRNOTAVAIL; + + cur_sin = (struct sockaddr_in *) &cur_id->id.route.addr.src_addr; + if (sin->sin_addr.s_addr == cur_sin->sin_addr.s_addr) + return -EADDRINUSE; } - return ret; + + cma_bind_port(bind_list, id_priv); + return 0; } -static int cma_bind_listen(struct rdma_id_private *id_priv) +static int cma_get_tcp_port(struct rdma_id_private *id_priv) { - struct rdma_bind_list *bind_list = id_priv->bind_list; - int ret = 0; + int ret; + int size; + struct socket *sock; - mutex_lock(&lock); - if (bind_list->owners.first->next) - ret = cma_check_port(bind_list, id_priv, 0); - mutex_unlock(&lock); - return ret; + ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (ret) + return ret; + ret = sock->ops->bind(sock, + (struct sockaddr *) &id_priv->id.route.addr.src_addr, + ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr)); + if (ret) { + sock_release(sock); + return ret; + } + size = ip_addr_size((struct sockaddr *) &id_priv->id.route.addr.src_addr); + ret = sock->ops->getname(sock, + (struct sockaddr *) &id_priv->id.route.addr.src_addr, + &size, 0); + if (ret) { + sock_release(sock); + return ret; + } + id_priv->sock = sock; + return 0; } static int cma_get_port(struct rdma_id_private *id_priv) @@ -2224,6 +3302,11 @@ static int cma_get_port(struct rdma_id_private *id_priv) break; case RDMA_PS_TCP: ps = &tcp_ps; + if (unify_tcp_port_space) { + ret = cma_get_tcp_port(id_priv); + if (ret) + goto out; + } break; case RDMA_PS_UDP: ps = &udp_ps; @@ -2241,14 +3324,14 @@ static int cma_get_port(struct rdma_id_private *id_priv) else ret = cma_use_port(ps, id_priv); mutex_unlock(&lock); - +out: return ret; } static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, struct sockaddr *addr) { -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#if defined(CONFIG_IPv6) || defined(CONFIG_IPV6_MODULE) struct sockaddr_in6 *sin6; if (addr->sa_family != AF_INET6) @@ -2264,56 +3347,6 @@ static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, return 0; } -int rdma_listen(struct rdma_cm_id *id, int backlog) -{ - struct rdma_id_private *id_priv; - int ret; - - id_priv = container_of(id, struct rdma_id_private, id); - if (id_priv->state == RDMA_CM_IDLE) { - ((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET; - ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr); - if (ret) - return ret; - } - - if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) - return -EINVAL; - - if (id_priv->reuseaddr) { - ret = cma_bind_listen(id_priv); - if (ret) - goto err; - } - - id_priv->backlog = backlog; - if (id->device) { - switch (rdma_node_get_transport(id->device->node_type)) { - case RDMA_TRANSPORT_IB: - ret = cma_ib_listen(id_priv); - if (ret) - goto err; - break; - case RDMA_TRANSPORT_IWARP: - ret = cma_iw_listen(id_priv, backlog); - if (ret) - goto err; - break; - default: - ret = -ENOSYS; - goto err; - } - } else - cma_listen_on_all(id_priv); - - return 0; -err: - id_priv->backlog = 0; - cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND); - return ret; -} -EXPORT_SYMBOL(rdma_listen); - int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv; @@ -2323,7 +3356,7 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) 
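cma_get_tcp_port() above reserves the RDMA CM port in the regular TCP port space by creating a kernel socket, binding it to the ID's source address, and reading the chosen port back via ops->getname(); the socket is then parked in id_priv->sock so the reservation holds for the lifetime of the binding. The same idea expressed with ordinary userspace sockets, with libc calls standing in for the in-kernel sock_create_kern()/ops->bind() path:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in addr;
	socklen_t len = sizeof(addr);
	int fd;

	/* analogous to sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock) */
	fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
	if (fd < 0) { perror("socket"); return 1; }

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = 0;              /* 0 = let TCP pick a free port */

	/* bind() claims the port in the normal TCP port space, which is what
	 * the unify_tcp_port_space option is after */
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("bind"); close(fd); return 1;
	}

	/* getsockname() plays the role of sock->ops->getname(): learn which
	 * port was actually reserved */
	if (getsockname(fd, (struct sockaddr *)&addr, &len) < 0) {
		perror("getsockname"); close(fd); return 1;
	}
	printf("reserved TCP port %u\n", ntohs(addr.sin_port));

	/* keep fd open for as long as the reservation should hold; the patch
	 * keeps the kernel socket in id_priv->sock for the same reason */
	close(fd);
	return 0;
}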
return -EAFNOSUPPORT; id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND)) + if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_BOUND)) return -EINVAL; ret = cma_check_linklocal(&id->route.addr.dev_addr, addr); @@ -2335,7 +3368,9 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) if (ret) goto err1; + mutex_lock(&lock); ret = cma_acquire_dev(id_priv); + mutex_unlock(&lock); if (ret) goto err1; } @@ -2347,10 +3382,13 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) return 0; err2: - if (id_priv->cma_dev) - cma_release_dev(id_priv); + if (id_priv->cma_dev) { + mutex_lock(&lock); + cma_detach_from_dev(id_priv); + mutex_unlock(&lock); + } err1: - cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); + cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_IDLE); return ret; } EXPORT_SYMBOL(rdma_bind_addr); @@ -2422,8 +3460,9 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, struct rdma_cm_event event; struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; int ret = 0; + int force_grh; - if (cma_disable_callback(id_priv, RDMA_CM_CONNECT)) + if (cma_disable_callback(id_priv, CMA_CONNECT)) return 0; memset(&event, 0, sizeof event); @@ -2451,9 +3490,11 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, event.status = -EINVAL; break; } + force_grh = rdma_port_link_layer(cm_id->device, id_priv->id.port_num) == + IB_LINK_LAYER_ETHERNET ? 1 : 0; ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num, id_priv->id.route.path_rec, - &event.param.ud.ah_attr); + &event.param.ud.ah_attr, force_grh); event.param.ud.qp_num = rep->qpn; event.param.ud.qkey = rep->qkey; event.event = RDMA_CM_EVENT_ESTABLISHED; @@ -2469,7 +3510,7 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, if (ret) { /* Destroy the CM ID by returning a non-zero value. 
*/ id_priv->cm_id.ib = NULL; - cma_exch(id_priv, RDMA_CM_DESTROYING); + cma_exch(id_priv, CMA_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return ret; @@ -2484,6 +3525,7 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, { struct ib_cm_sidr_req_param req; struct rdma_route *route; + struct ib_cm_id *id; int ret; req.private_data_len = sizeof(struct cma_hdr) + @@ -2501,19 +3543,21 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, if (ret) goto out; - id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, - cma_sidr_rep_handler, id_priv); - if (IS_ERR(id_priv->cm_id.ib)) { - ret = PTR_ERR(id_priv->cm_id.ib); + id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler, + id_priv); + if (IS_ERR(id)) { + ret = PTR_ERR(id); goto out; } + id_priv->cm_id.ib = id; req.path = route->path_rec; req.service_id = cma_get_service_id(id_priv->id.ps, (struct sockaddr *) &route->addr.dst_addr); - req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8); + req.timeout_ms = 1 << (cma_response_timeout - 8); req.max_cm_retries = CMA_MAX_CM_RETRIES; + cma_dbg(id_priv, "sending SIDR\n"); ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req); if (ret) { ib_destroy_cm_id(id_priv->cm_id.ib); @@ -2530,6 +3574,7 @@ static int cma_connect_ib(struct rdma_id_private *id_priv, struct ib_cm_req_param req; struct rdma_route *route; void *private_data; + struct ib_cm_id *id; int offset, ret; memset(&req, 0, sizeof req); @@ -2543,12 +3588,12 @@ static int cma_connect_ib(struct rdma_id_private *id_priv, memcpy(private_data + offset, conn_param->private_data, conn_param->private_data_len); - id_priv->cm_id.ib = ib_create_cm_id(id_priv->id.device, cma_ib_handler, - id_priv); - if (IS_ERR(id_priv->cm_id.ib)) { - ret = PTR_ERR(id_priv->cm_id.ib); + id = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv); + if (IS_ERR(id)) { + ret = PTR_ERR(id); goto out; } + id_priv->cm_id.ib = id; route = &id_priv->id.route; ret = cma_format_hdr(private_data, id_priv->id.ps, route); @@ -2570,14 +3615,15 @@ static int cma_connect_ib(struct rdma_id_private *id_priv, req.flow_control = conn_param->flow_control; req.retry_count = conn_param->retry_count; req.rnr_retry_count = conn_param->rnr_retry_count; - req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; - req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; + req.remote_cm_response_timeout = cma_response_timeout; + req.local_cm_response_timeout = cma_response_timeout; req.max_cm_retries = CMA_MAX_CM_RETRIES; req.srq = id_priv->srq ? 
1 : 0; + cma_dbg(id_priv, "sending REQ\n"); ret = ib_send_cm_req(id_priv->cm_id.ib, &req); out: - if (ret && !IS_ERR(id_priv->cm_id.ib)) { + if (ret && !IS_ERR(id)) { ib_destroy_cm_id(id_priv->cm_id.ib); id_priv->cm_id.ib = NULL; } @@ -2635,9 +3681,11 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) int ret; id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT)) + + if (!cma_comp_exch(id_priv, CMA_ROUTE_RESOLVED, CMA_CONNECT)) return -EINVAL; + id_priv->apm_flags |= CMA_APM_ACTIVE_SIDE; if (!id->qp) { id_priv->qp_num = conn_param->qp_num; id_priv->srq = conn_param->srq; @@ -2645,7 +3693,7 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: - if (id->qp_type == IB_QPT_UD) + if (cma_is_ud_ps(id->ps)) ret = cma_resolve_ib_udp(id_priv, conn_param); else ret = cma_connect_ib(id_priv, conn_param); @@ -2662,7 +3710,7 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) return 0; err: - cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED); + cma_comp_exch(id_priv, CMA_CONNECT, CMA_ROUTE_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_connect); @@ -2693,6 +3741,7 @@ static int cma_accept_ib(struct rdma_id_private *id_priv, rep.rnr_retry_count = conn_param->rnr_retry_count; rep.srq = id_priv->srq ? 1 : 0; + cma_dbg(id_priv, "sending REP\n"); ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); out: return ret; @@ -2739,6 +3788,7 @@ static int cma_send_sidr_rep(struct rdma_id_private *id_priv, rep.private_data = private_data; rep.private_data_len = private_data_len; + cma_dbg(id_priv, "sending SIDR\n"); return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); } @@ -2748,10 +3798,7 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) int ret; id_priv = container_of(id, struct rdma_id_private, id); - - id_priv->owner = task_pid_nr(current); - - if (!cma_comp(id_priv, RDMA_CM_CONNECT)) + if (!cma_comp(id_priv, CMA_CONNECT)) return -EINVAL; if (!id->qp && conn_param) { @@ -2761,7 +3808,7 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: - if (id->qp_type == IB_QPT_UD) + if (cma_is_ud_ps(id->ps)) ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, conn_param->private_data, conn_param->private_data_len); @@ -2789,6 +3836,24 @@ reject: } EXPORT_SYMBOL(rdma_accept); +static int ib_ca_notify(struct rdma_cm_id *id, enum ib_event_type event) +{ + struct ib_event ev; + struct rdma_id_private *id_priv; + id_priv = container_of(id, struct rdma_id_private, id); + + switch (event) { + case IB_EVENT_PATH_MIG: + memset(&ev, 0, sizeof ev); + ev.event = event; + ev.device = id_priv->cma_dev->device; + cma_qp_event_handler(&ev, id_priv); + return 0; + default: + return ib_cm_notify(id_priv->cm_id.ib, event); + } +} + int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) { struct rdma_id_private *id_priv; @@ -2800,7 +3865,7 @@ int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) switch (id->device->node_type) { case RDMA_NODE_IB_CA: - ret = ib_cm_notify(id_priv->cm_id.ib, event); + ret = ib_ca_notify(id, event); break; default: ret = 0; @@ -2822,13 +3887,15 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data, switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: - if (id->qp_type == IB_QPT_UD) + 
if (cma_is_ud_ps(id->ps)) ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, private_data, private_data_len); - else + else { + cma_dbg(id_priv, "sending REJ\n"); ret = ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, private_data, private_data_len); + } break; case RDMA_TRANSPORT_IWARP: ret = iw_cm_reject(id_priv->cm_id.iw, @@ -2857,8 +3924,11 @@ int rdma_disconnect(struct rdma_cm_id *id) if (ret) goto out; /* Initiate or respond to a disconnect. */ - if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) + cma_dbg(id_priv, "sending DREQ\n"); + if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) { + cma_dbg(id_priv, "sending DREP\n"); ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0); + } break; case RDMA_TRANSPORT_IWARP: ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); @@ -2880,8 +3950,8 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) int ret; id_priv = mc->id_priv; - if (cma_disable_callback(id_priv, RDMA_CM_ADDR_BOUND) && - cma_disable_callback(id_priv, RDMA_CM_ADDR_RESOLVED)) + if (cma_disable_callback(id_priv, CMA_ADDR_BOUND) && + cma_disable_callback(id_priv, CMA_ADDR_RESOLVED)) return 0; mutex_lock(&id_priv->qp_mutex); @@ -2905,7 +3975,7 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) ret = id_priv->id.event_handler(&id_priv->id, &event); if (ret) { - cma_exch(id_priv, RDMA_CM_DESTROYING); + cma_exch(id_priv, CMA_DESTROYING); mutex_unlock(&id_priv->handler_mutex); rdma_destroy_id(&id_priv->id); return 0; @@ -2984,6 +4054,7 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, return 0; } + static void iboe_mcast_work_handler(struct work_struct *work) { struct iboe_mcast_work *mw = container_of(work, struct iboe_mcast_work, work); @@ -3003,9 +4074,9 @@ static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid) if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); - } else if (addr->sa_family == AF_INET6) { + } else if (addr->sa_family == AF_INET6) memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); - } else { + else { mgid->raw[0] = 0xff; mgid->raw[1] = 0x0e; mgid->raw[2] = 0; @@ -3023,7 +4094,7 @@ static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid) } static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, - struct cma_multicast *mc) + struct cma_multicast *mc) { struct iboe_mcast_work *work; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; @@ -3056,6 +4127,7 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, err = -ENODEV; goto out2; } + mc->multicast.ib->rec.rate = iboe_get_rate(ndev); mc->multicast.ib->rec.hop_limit = 1; mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu); @@ -3088,8 +4160,8 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, int ret; id_priv = container_of(id, struct rdma_id_private, id); - if (!cma_comp(id_priv, RDMA_CM_ADDR_BOUND) && - !cma_comp(id_priv, RDMA_CM_ADDR_RESOLVED)) + if (!cma_comp(id_priv, CMA_ADDR_BOUND) && + !cma_comp(id_priv, CMA_ADDR_RESOLVED)) return -EINVAL; mc = kmalloc(sizeof *mc, GFP_KERNEL); @@ -3106,7 +4178,7 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: - switch (rdma_port_get_link_layer(id->device, id->port_num)) { + switch (rdma_port_link_layer(id->device, id->port_num)) { case IB_LINK_LAYER_INFINIBAND: ret = cma_join_ib_multicast(id_priv, mc); break; @@ -3129,6 +4201,7 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, 
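The cma_resolve_ib_udp() and cma_connect_ib() hunks above swap the fixed CMA_CM_RESPONSE_TIMEOUT for a cma_response_timeout variable (presumably a module parameter defined earlier in cma.c) while keeping the expression req.timeout_ms = 1 << (cma_response_timeout - 8). The CM encodes response timeouts as an exponent t meaning 4.096 us * 2^t; since 4.096 us is roughly 2^-8 ms, shifting by (t - 8) yields a close millisecond approximation. A quick check of that arithmetic:

#include <stdio.h>

int main(void)
{
	int t;

	for (t = 12; t <= 23; t++) {
		double exact_ms = 4.096e-3 * (double)(1u << t); /* 4.096 us * 2^t, in ms */
		unsigned approx_ms = 1u << (t - 8);             /* the expression used in cma.c */
		printf("t=%2d  exact %.1f ms   1<<(t-8) = %u ms\n", t, exact_ms, approx_ms);
	}
	return 0;
}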
spin_unlock_irq(&id_priv->lock); kfree(mc); } + return ret; } EXPORT_SYMBOL(rdma_join_multicast); @@ -3150,7 +4223,7 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) &mc->multicast.ib->rec.mgid, mc->multicast.ib->rec.mlid); if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) == RDMA_TRANSPORT_IB) { - switch (rdma_port_get_link_layer(id->device, id->port_num)) { + switch (rdma_port_link_layer(id->device, id->port_num)) { case IB_LINK_LAYER_INFINIBAND: ib_sa_free_multicast(mc->multicast.ib); kfree(mc); @@ -3228,22 +4301,188 @@ static struct notifier_block cma_nb = { .notifier_call = cma_netdev_callback }; +static void cma_port_ud_handler(struct work_struct *work) +{ + + struct cma_port_ud_work *w = container_of(work, struct cma_port_ud_work, work); + struct cma_device *cma_dev = w->cma_dev; + + struct rdma_id_private *id_priv; + struct rdma_route *route; + union ib_gid sgid[2]; + int pri_path_index; + int num_paths; + unsigned long flags; + int ret; + + + /* iterate through all the cma ids bound to this device */ + list_for_each_entry(id_priv, &cma_dev->id_list, list) { + mutex_lock(&id_priv->handler_mutex); + if (!(id_priv->apm_flags & CMA_APM_ENABLED)) { + /* + * we're only interested in APM enabled ids. If someone + * wants to enable APM for a given id, it must do it + * through calling rdma_enable_apm + */ + cma_dbg(id_priv, "APM is not enabled. skip\n"); + mutex_unlock(&id_priv->handler_mutex); + continue; + } + if (id_priv->state != CMA_CONNECT) { + /* + * If the object is not connected then no use in handling such events. + */ + cma_dbg(id_priv, "state (%d) is not CMA_CONNECT. skip\n", id_priv->state); + mutex_unlock(&id_priv->handler_mutex); + continue; + } + + route = &id_priv->id.route; + spin_lock_irqsave(&id_priv->lock, flags); + num_paths = route->num_paths; + /* + * we only keep an indication of the alt path index we use. The primary + * path is always the other index. hence the calculation using xor. + * If we only have one path then it is always at index 0. + */ + pri_path_index = (num_paths == 2) ? id_priv->alt_path_index ^ 0x1 : 0; + if (num_paths == 2) { + sgid[0] = route->path_rec[0].sgid; + sgid[1] = route->path_rec[1].sgid; + } + spin_unlock_irqrestore(&id_priv->lock, flags); + + if (w->up) { + /* port changed from DOWN to UP */ + cma_dbg(id_priv, "port %d UP\n", w->port_num); + cma_debug_routes(id_priv); + if ((num_paths < 2) || + !memcmp(&sgid[0], &sgid[1], sizeof (union ib_gid))) { + cma_dbg(id_priv, "UP not yet at best paths. will try to improve\n"); + if (id_priv->apm_flags & CMA_APM_ACTIVE_SIDE) { + cma_dbg(id_priv, "will try to find alt route\n"); + ret = cma_resolve_alt_ib_route(id_priv); + if (ret) { + cma_dbg(id_priv, "fail to resolve alt route (%d)\n", ret); + } + } else { + if (cma_schedule_sap(id_priv, 2000, 0)) { + cma_dbg(id_priv, "failed to schedule sap\n"); + } + } + } + } else { + u8 port = 0; + cma_dbg(id_priv, "port %d DOWN\n", w->port_num); + cma_debug_routes(id_priv); + if (num_paths > 1) { + if (ib_find_cached_gid(id_priv->id.device, &sgid[pri_path_index], + &port, NULL)) { + cma_dbg(id_priv, "Find cached gid of primary sgid failed\n"); + } + + /* if the primary path port went down, leave to path MIG to fix */ + if (w->port_num == (int) port) { + cma_dbg(id_priv, "Primary path port DOWN. Leave to MIG\n"); + mutex_unlock(&id_priv->handler_mutex); + continue; + } + if (memcmp(&sgid[0], &sgid[1], sizeof (union ib_gid))) { + cma_dbg(id_priv, "DOWN not yet at best paths. 
will try to improve\n"); + if (id_priv->apm_flags & CMA_APM_ACTIVE_SIDE) { + cma_dbg(id_priv, "will try to find alt route\n"); + ret = cma_resolve_alt_ib_route(id_priv); + if (ret) { + cma_dbg(id_priv, "fail to resolve alt route (%d)\n", ret); + } + } else { + cma_dbg(id_priv, "scheduling immediate SAP\n"); + if (cma_schedule_sap(id_priv, 0, (int) w->port_num)) { + cma_dbg(id_priv, "failed to schedule sap\n"); + } + } + } + } + } + mutex_unlock(&id_priv->handler_mutex); + } + kfree(work); +} + +/* + * this is handler that receives asynchronous events for the device, + * including port events + */ +static void cma_event_handler(struct ib_event_handler *handler, struct ib_event *event) +{ + struct cma_device *cma_dev = + container_of(handler, typeof(*cma_dev), event_handler); + u8 port = event->element.port_num; + struct cma_port_ud_work *work; + + /* we're only interested in port Up/Down events */ + if ( event->event != IB_EVENT_PORT_ACTIVE && + event->event != IB_EVENT_PORT_ERR) + return; + + cma_dbg(NULL, "port %s/%d is %s\n", event->device->name, port, + (event->event == IB_EVENT_PORT_ACTIVE) ? "UP" : "DOWN"); + + /* cache the state of the port */ + if (event->event == IB_EVENT_PORT_ACTIVE) + cma_dev->port_active[port] = 1; + else + cma_dev->port_active[port] = 0; + + work = kzalloc(sizeof *work, GFP_ATOMIC); + if (!work) { + cma_warn(NULL, "failed to allocate work\n"); + return; + } + + INIT_WORK(&work->work, cma_port_ud_handler); + work->cma_dev = cma_dev; + work->port_num = port; + work->up = cma_dev->port_active[port]; + queue_work(cma_wq, &work->work); +} + static void cma_add_one(struct ib_device *device) { struct cma_device *cma_dev; struct rdma_id_private *id_priv; + struct ib_port_attr port_attr; + int p; cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL); if (!cma_dev) return; + cma_dev->port_active = kmalloc(sizeof (*cma_dev->port_active) * + device->phys_port_cnt + 1, GFP_KERNEL); + if (!cma_dev->port_active) { + kfree(cma_dev); + return; + } + cma_dev->device = device; init_completion(&cma_dev->comp); atomic_set(&cma_dev->refcount, 1); INIT_LIST_HEAD(&cma_dev->id_list); ib_set_client_data(device, &cma_client, cma_dev); + INIT_IB_EVENT_HANDLER(&cma_dev->event_handler, device, cma_event_handler); + for (p = 1; p <= device->phys_port_cnt; ++p) { + if (!ib_query_port(cma_dev->device, p, &port_attr)) { + cma_dev->port_active[p] = + port_attr.state == IB_PORT_ACTIVE ? 1 : 0; + } else + cma_dev->port_active[p] = 0; + } + if (ib_register_event_handler(&cma_dev->event_handler)) + cma_warn(NULL, "fail to register event handler\n"); mutex_lock(&lock); list_add_tail(&cma_dev->list, &dev_list); list_for_each_entry(id_priv, &listen_any_list, list) @@ -3254,19 +4493,19 @@ static void cma_add_one(struct ib_device *device) static int cma_remove_id_dev(struct rdma_id_private *id_priv) { struct rdma_cm_event event; - enum rdma_cm_state state; + enum cma_state state; int ret = 0; /* Record that we want to remove the device */ - state = cma_exch(id_priv, RDMA_CM_DEVICE_REMOVAL); - if (state == RDMA_CM_DESTROYING) + state = cma_exch(id_priv, CMA_DEVICE_REMOVAL); + if (state == CMA_DESTROYING) return 0; cma_cancel_operation(id_priv, state); mutex_lock(&id_priv->handler_mutex); /* Check for destruction from another callback. 
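cma_port_ud_handler() above stores only alt_path_index and derives the primary slot from it: with two path records the primary is always the other entry, hence pri_path_index = id_priv->alt_path_index ^ 0x1, and with a single record it is slot 0. The index trick in isolation:

#include <assert.h>
#include <stdio.h>

/* given the alternate slot (0 or 1) in a two-entry path_rec array,
 * the primary slot is simply the other one */
static int primary_index(int num_paths, int alt_path_index)
{
	return (num_paths == 2) ? (alt_path_index ^ 0x1) : 0;
}

int main(void)
{
	assert(primary_index(2, 0) == 1);
	assert(primary_index(2, 1) == 0);
	assert(primary_index(1, 0) == 0);  /* a single path always lives at index 0 */
	printf("xor index trick holds\n");
	return 0;
}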
*/ - if (!cma_comp(id_priv, RDMA_CM_DEVICE_REMOVAL)) + if (!cma_comp(id_priv, CMA_DEVICE_REMOVAL)) goto out; memset(&event, 0, sizeof event); @@ -3312,117 +4551,34 @@ static void cma_remove_one(struct ib_device *device) cma_dev = ib_get_client_data(device, &cma_client); if (!cma_dev) return; + ib_unregister_event_handler(&cma_dev->event_handler); mutex_lock(&lock); list_del(&cma_dev->list); mutex_unlock(&lock); cma_process_remove(cma_dev); + kfree(cma_dev->port_active); kfree(cma_dev); } -static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb) +static int cma_init(void) { - struct nlmsghdr *nlh; - struct rdma_cm_id_stats *id_stats; - struct rdma_id_private *id_priv; - struct rdma_cm_id *id = NULL; - struct cma_device *cma_dev; - int i_dev = 0, i_id = 0; - - /* - * We export all of the IDs as a sequence of messages. Each - * ID gets its own netlink message. - */ - mutex_lock(&lock); - - list_for_each_entry(cma_dev, &dev_list, list) { - if (i_dev < cb->args[0]) { - i_dev++; - continue; - } - - i_id = 0; - list_for_each_entry(id_priv, &cma_dev->id_list, list) { - if (i_id < cb->args[1]) { - i_id++; - continue; - } - - id_stats = ibnl_put_msg(skb, &nlh, cb->nlh->nlmsg_seq, - sizeof *id_stats, RDMA_NL_RDMA_CM, - RDMA_NL_RDMA_CM_ID_STATS); - if (!id_stats) - goto out; - - memset(id_stats, 0, sizeof *id_stats); - id = &id_priv->id; - id_stats->node_type = id->route.addr.dev_addr.dev_type; - id_stats->port_num = id->port_num; - id_stats->bound_dev_if = - id->route.addr.dev_addr.bound_dev_if; - - if (id->route.addr.src_addr.ss_family == AF_INET) { - if (ibnl_put_attr(skb, nlh, - sizeof(struct sockaddr_in), - &id->route.addr.src_addr, - RDMA_NL_RDMA_CM_ATTR_SRC_ADDR)) { - goto out; - } - if (ibnl_put_attr(skb, nlh, - sizeof(struct sockaddr_in), - &id->route.addr.dst_addr, - RDMA_NL_RDMA_CM_ATTR_DST_ADDR)) { - goto out; - } - } else if (id->route.addr.src_addr.ss_family == AF_INET6) { - if (ibnl_put_attr(skb, nlh, - sizeof(struct sockaddr_in6), - &id->route.addr.src_addr, - RDMA_NL_RDMA_CM_ATTR_SRC_ADDR)) { - goto out; - } - if (ibnl_put_attr(skb, nlh, - sizeof(struct sockaddr_in6), - &id->route.addr.dst_addr, - RDMA_NL_RDMA_CM_ATTR_DST_ADDR)) { - goto out; - } - } - - id_stats->pid = id_priv->owner; - id_stats->port_space = id->ps; - id_stats->cm_state = id_priv->state; - id_stats->qp_num = id_priv->qp_num; - id_stats->qp_type = id->qp_type; - - i_id++; - } - - cb->args[1] = 0; - i_dev++; - } - -out: - mutex_unlock(&lock); - cb->args[0] = i_dev; - cb->args[1] = i_id; - - return skb->len; -} - -static const struct ibnl_client_cbs cma_cb_table[] = { - [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats }, -}; + int ret = -ENOMEM, low, high, remaining; -static int __init cma_init(void) -{ - int ret; + get_random_bytes(&next_port, sizeof next_port); + inet_get_local_port_range(&low, &high); + remaining = (high - low) + 1; + next_port = ((unsigned int) next_port % remaining) + low; cma_wq = create_singlethread_workqueue("rdma_cm"); if (!cma_wq) return -ENOMEM; + cma_free_wq = create_singlethread_workqueue("rdma_cm_fr"); + if (!cma_free_wq) + goto err1; + ib_sa_register_client(&sa_client); rdma_addr_register_client(&addr_client); register_netdevice_notifier(&cma_nb); @@ -3430,27 +4586,27 @@ static int __init cma_init(void) ret = ib_register_client(&cma_client); if (ret) goto err; - - if (ibnl_add_client(RDMA_NL_RDMA_CM, RDMA_NL_RDMA_CM_NUM_OPS, cma_cb_table)) - printk(KERN_WARNING "RDMA CMA: failed to add netlink callback\n"); - return 0; err: 
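cma_init() above seeds next_port from get_random_bytes() and folds it into the local port range with ((unsigned int) next_port % remaining) + low; cma_alloc_any_port() earlier in this section then hands ports out sequentially from next_port, wrapping from high back to low. A userspace sketch of the same arithmetic, leaving out the idr bookkeeping and in-use checks:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static unsigned int next_port;

/* stand-in for inet_get_local_port_range() */
static void get_local_port_range(int *low, int *high)
{
	*low = 32768;
	*high = 60999;
}

/* advance next_port the way cma_alloc_any_port() does: sequentially,
 * wrapping from high back to low */
static unsigned int alloc_port(void)
{
	int low, high;
	unsigned int port;

	get_local_port_range(&low, &high);
	port = next_port;
	if (port > (unsigned int)high)
		port = low;                           /* wrapped past the top of the range */
	next_port = (port == (unsigned int)high) ? (unsigned int)low : port + 1;
	return port;
}

int main(void)
{
	int low, high, remaining, i;

	get_local_port_range(&low, &high);
	remaining = (high - low) + 1;

	/* cma_init(): random starting point folded into [low, high] */
	srand((unsigned)time(NULL));                  /* get_random_bytes() stand-in */
	next_port = ((unsigned)rand() % remaining) + low;

	for (i = 0; i < 5; i++)
		printf("allocated port %u\n", alloc_port());
	return 0;
}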
unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); + + destroy_workqueue(cma_free_wq); +err1: destroy_workqueue(cma_wq); return ret; } -static void __exit cma_cleanup(void) +static void cma_cleanup(void) { - ibnl_remove_client(RDMA_NL_RDMA_CM); ib_unregister_client(&cma_client); unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); + flush_workqueue(cma_free_wq); + destroy_workqueue(cma_free_wq); destroy_workqueue(cma_wq); idr_destroy(&sdp_ps); idr_destroy(&tcp_ps); @@ -3458,5 +4614,6 @@ static void __exit cma_cleanup(void) idr_destroy(&ipoib_ps); } + module_init(cma_init); module_exit(cma_cleanup); diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index a565af5c2d2e8..05ac36e6acdb5 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -38,9 +38,7 @@ #include -int ib_device_register_sysfs(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)); +int ib_device_register_sysfs(struct ib_device *device); void ib_device_unregister_sysfs(struct ib_device *device); int ib_sysfs_setup(void); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 4007f721d25d2..2276f6883ddb8 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include "core_priv.h" @@ -46,6 +46,12 @@ MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("core kernel InfiniBand API"); MODULE_LICENSE("Dual BSD/GPL"); +#ifdef __ia64__ +/* workaround for a bug in hp chipset that would cause kernel + panic when dma resources are exhaused */ +int dma_map_sg_hp_wa = 0; +#endif + struct ib_client_data { struct list_head list; struct ib_client *client; @@ -270,9 +276,7 @@ out: * callback for each device that is added. @device must be allocated * with ib_alloc_device(). 
*/ -int ib_register_device(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)) +int ib_register_device(struct ib_device *device) { int ret; @@ -293,6 +297,10 @@ int ib_register_device(struct ib_device *device, INIT_LIST_HEAD(&device->client_data_list); spin_lock_init(&device->event_handler_lock); spin_lock_init(&device->client_data_lock); + device->ib_uverbs_xrcd_table = RB_ROOT; + mutex_init(&device->xrcd_table_mutex); + device->relaxed_pd = NULL; + INIT_LIST_HEAD(&device->relaxed_pool_list); ret = read_port_table_lengths(device); if (ret) { @@ -301,7 +309,7 @@ int ib_register_device(struct ib_device *device, goto out; } - ret = ib_device_register_sysfs(device, port_callback); + ret = ib_device_register_sysfs(device); if (ret) { printk(KERN_WARNING "Couldn't register device %s with driver model\n", device->name); @@ -699,20 +707,29 @@ EXPORT_SYMBOL(ib_find_gid); int ib_find_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index) { - int ret, i; + int i; u16 tmp_pkey; + int ret; + int partial_ix = -1; for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) { ret = ib_query_pkey(device, port_num, i, &tmp_pkey); if (ret) return ret; - if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { - *index = i; - return 0; + /* if there is full-member pkey take it.*/ + if (tmp_pkey & 0x8000) { + *index = i; + return 0; + } + partial_ix = i; } } - + /*no full-member, if exists take the limited*/ + if (partial_ix >= 0) { + *index = partial_ix; + return 0; + } return -ENOENT; } EXPORT_SYMBOL(ib_find_pkey); @@ -724,44 +741,31 @@ static int __init ib_core_init(void) ib_wq = alloc_workqueue("infiniband", 0, 0); if (!ib_wq) return -ENOMEM; +#ifdef __ia64__ + if (ia64_platform_is("hpzx1")) + dma_map_sg_hp_wa = 1; +#endif ret = ib_sysfs_setup(); - if (ret) { + if (ret) printk(KERN_WARNING "Couldn't create InfiniBand device class\n"); - goto err; - } - - ret = ibnl_init(); - if (ret) { - printk(KERN_WARNING "Couldn't init IB netlink interface\n"); - goto err_sysfs; - } ret = ib_cache_setup(); if (ret) { printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n"); - goto err_nl; + destroy_workqueue(ib_wq); + ib_sysfs_cleanup(); } - return 0; - -err_nl: - ibnl_cleanup(); - -err_sysfs: - ib_sysfs_cleanup(); - -err: - destroy_workqueue(ib_wq); return ret; } static void __exit ib_core_cleanup(void) { ib_cache_cleanup(); - ibnl_cleanup(); ib_sysfs_cleanup(); /* Make sure that any pending umem accounting work is done. */ + flush_scheduled_work(); destroy_workqueue(ib_wq); } diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c index 4507043d24c8c..7ed408bb7c2ba 100644 --- a/drivers/infiniband/core/fmr_pool.c +++ b/drivers/infiniband/core/fmr_pool.c @@ -79,8 +79,11 @@ enum { * pool_lock to maintain consistency. 
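The ib_find_pkey() hunk above keeps the 15-bit P_Key comparison but now prefers a table entry with the full-membership bit (0x8000) set, remembering a limited-membership match in partial_ix and returning it only when no full member exists. The selection rule on a plain array:

#include <stdint.h>
#include <stdio.h>

/* mirror of the patched ib_find_pkey() rule: match on the low 15 bits,
 * prefer a full-member entry (bit 15 set), else fall back to a
 * limited-member match */
static int find_pkey(const uint16_t *tbl, int len, uint16_t pkey, int *index)
{
	int i, partial_ix = -1;

	for (i = 0; i < len; i++) {
		if ((pkey & 0x7fff) != (tbl[i] & 0x7fff))
			continue;
		if (tbl[i] & 0x8000) {          /* full member: take it immediately */
			*index = i;
			return 0;
		}
		partial_ix = i;                 /* remember the limited-member hit */
	}
	if (partial_ix >= 0) {                  /* no full member; settle for limited */
		*index = partial_ix;
		return 0;
	}
	return -1;                              /* -ENOENT in the kernel version */
}

int main(void)
{
	const uint16_t tbl[] = { 0x7fff /* limited */, 0xffff /* full */, 0x0001 };
	int idx = -1;

	if (!find_pkey(tbl, 3, 0xffff, &idx))
		printf("pkey 0xffff found at index %d (full member preferred)\n", idx);
	return 0;
}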
*/ +#define FMR_SPLIT_COUNT 3 + struct ib_fmr_pool { spinlock_t pool_lock; + spinlock_t used_pool_lock; int pool_size; int max_pages; @@ -88,6 +91,7 @@ struct ib_fmr_pool { int dirty_watermark; int dirty_len; struct list_head free_list; + struct list_head used_list; struct list_head dirty_list; struct hlist_head *cache_bucket; @@ -101,6 +105,8 @@ struct ib_fmr_pool { atomic_t flush_ser; wait_queue_head_t force_wait; + struct ib_pd *pd; + int relaxed; }; static inline u32 ib_fmr_hash(u64 first_page) @@ -113,7 +119,8 @@ static inline u32 ib_fmr_hash(u64 first_page) static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool, u64 *page_list, int page_list_len, - u64 io_virtual_address) + u64 io_virtual_address, + struct ib_pd *pd) { struct hlist_head *bucket; struct ib_pool_fmr *fmr; @@ -127,6 +134,7 @@ static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool, hlist_for_each_entry(fmr, pos, bucket, cache_node) if (io_virtual_address == fmr->io_virtual_address && page_list_len == fmr->page_list_len && + pd == fmr->pd && !memcmp(page_list, fmr->page_list, page_list_len * sizeof *page_list)) return fmr; @@ -134,13 +142,98 @@ static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool, return NULL; } -static void ib_fmr_batch_release(struct ib_fmr_pool *pool) + +static void fmr_teardown_mr(struct ib_pool_fmr *fmr) +{ + + if (fmr->sg_len) { + ib_dma_unmap_sg(fmr->pd->device, + fmr->sg, fmr->sg_len, + DMA_BIDIRECTIONAL); + } + + /* Release the s/g list */ + if (fmr->sg_len) { + unsigned int i; + + for (i = 0; i < fmr->sg_len; ++i) { + struct page *page = sg_page(&fmr->sg[i]); + + /* FIXME we need a way to tell a r/w MR + * from a r/o MR */ + BUG_ON(irqs_disabled()); + set_page_dirty(page); + put_page(page); + } + kfree(fmr->sg); + + fmr->sg = NULL; + fmr->sg_len = 0; + } +} + +static void ib_fmr_batch_release(struct ib_fmr_pool *pool, int unmap_usedonce) { int ret; struct ib_pool_fmr *fmr; LIST_HEAD(unmap_list); LIST_HEAD(fmr_list); + if (unmap_usedonce) { + /* force a flush */ + struct ib_pool_fmr *fmr; + int already_split = 0; + int count = 0; + LIST_HEAD(temp_list); + + spin_lock_irq(&pool->used_pool_lock); + list_splice_init(&pool->used_list, &temp_list); + spin_unlock_irq(&pool->used_pool_lock); + list_for_each_entry(fmr, &temp_list, list) { + /* find first fmr that is not mapped yet */ + if (fmr->remap_count == 0 || + (count > (pool->pool_size / FMR_SPLIT_COUNT))) { + /* split the list 2 two */ + list_cut_position(&unmap_list, &temp_list, + &fmr->list); + spin_lock_irq(&pool->used_pool_lock); + list_splice(&temp_list, &pool->used_list); + spin_unlock_irq(&pool->used_pool_lock); + already_split = 1; + break; + } else { + hlist_del_init(&fmr->cache_node); + fmr->remap_count = 0; + list_add_tail(&fmr->fmr->list, &fmr_list); + count++; + } + } + + if (!already_split) { + /* All are mapped once */ + list_splice_tail(&temp_list, &unmap_list); + } + if (!list_empty(&unmap_list)) { + ret = ib_unmap_fmr(&fmr_list); + if (ret) + printk(KERN_WARNING PFX "ib_unmap_fmr returned" + " %d\n", ret); + + if (pool->relaxed) { + list_for_each_entry(fmr, &unmap_list, list) { + fmr_teardown_mr(fmr); + } + } + spin_lock_irq(&pool->pool_lock); + list_splice(&unmap_list, &pool->free_list); + spin_unlock_irq(&pool->pool_lock); + } + INIT_LIST_HEAD(&unmap_list); + INIT_LIST_HEAD(&fmr_list); + + + } + spin_lock_irq(&pool->pool_lock); list_for_each_entry(fmr, &pool->dirty_list, list) { @@ -150,8 +243,9 @@ static void ib_fmr_batch_release(struct ib_fmr_pool 
*pool) #ifdef DEBUG if (fmr->ref_count !=0) { - printk(KERN_WARNING PFX "Unmapping FMR 0x%08x with ref count %d\n", - fmr, fmr->ref_count); + printk(KERN_WARNING PFX "Unmapping FMR 0x%08x with ref" + " count %d\n", + fmr, fmr->ref_count); } #endif } @@ -169,6 +263,12 @@ static void ib_fmr_batch_release(struct ib_fmr_pool *pool) if (ret) printk(KERN_WARNING PFX "ib_unmap_fmr returned %d\n", ret); + if (pool->relaxed) { + list_for_each_entry(fmr, &unmap_list, list) { + fmr_teardown_mr(fmr); + } + } + spin_lock_irq(&pool->pool_lock); list_splice(&unmap_list, &pool->free_list); spin_unlock_irq(&pool->pool_lock); @@ -177,10 +277,12 @@ static void ib_fmr_batch_release(struct ib_fmr_pool *pool) static int ib_fmr_cleanup_thread(void *pool_ptr) { struct ib_fmr_pool *pool = pool_ptr; + int time_left = 1; do { if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) { - ib_fmr_batch_release(pool); + ib_fmr_batch_release(pool, 0); + time_left = 1; atomic_inc(&pool->flush_ser); wake_up_interruptible(&pool->force_wait); @@ -189,16 +291,26 @@ static int ib_fmr_cleanup_thread(void *pool_ptr) pool->flush_function(pool, pool->flush_arg); } + if (!time_left && pool->relaxed) { + ib_fmr_batch_release(pool, 1); + + if (pool->flush_function) + pool->flush_function(pool, pool->flush_arg); + } + set_current_state(TASK_INTERRUPTIBLE); if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) >= 0 && - !kthread_should_stop()) - schedule(); + !kthread_should_stop()) { + /* run once in 50 mills */ + time_left = schedule_timeout(msecs_to_jiffies(50)); + } __set_current_state(TASK_RUNNING); } while (!kthread_should_stop()); return 0; } + /** * ib_create_fmr_pool - Create an FMR pool * @pd:Protection domain for FMRs @@ -220,6 +332,9 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, if (!params) return ERR_PTR(-EINVAL); + if (params->cache && params->relaxed) + return ERR_PTR(-EINVAL); + device = pd->device; if (!device->alloc_fmr || !device->dealloc_fmr || !device->map_phys_fmr || !device->unmap_fmr) { @@ -228,6 +343,13 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, return ERR_PTR(-ENOSYS); } + if (params->relaxed && !device->set_fmr_pd) { + printk(KERN_INFO PFX "Device %s does not support relaxed FMRs\n", + device->name); + return ERR_PTR(-ENOSYS); + } + + attr = kmalloc(sizeof *attr, GFP_KERNEL); if (!attr) { printk(KERN_WARNING PFX "couldn't allocate device attr struct\n"); @@ -260,6 +382,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, pool->flush_arg = params->flush_arg; INIT_LIST_HEAD(&pool->free_list); + INIT_LIST_HEAD(&pool->used_list); INIT_LIST_HEAD(&pool->dirty_list); if (params->cache) { @@ -282,9 +405,12 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, pool->dirty_watermark = params->dirty_watermark; pool->dirty_len = 0; spin_lock_init(&pool->pool_lock); + spin_lock_init(&pool->used_pool_lock); atomic_set(&pool->req_ser, 0); atomic_set(&pool->flush_ser, 0); init_waitqueue_head(&pool->force_wait); + pool->pd = pd; + pool->relaxed = params->relaxed; pool->thread = kthread_run(ib_fmr_cleanup_thread, pool, @@ -319,12 +445,14 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, fmr->pool = pool; fmr->remap_count = 0; fmr->ref_count = 0; + fmr->pd = pd; + fmr->page_list_len = 0; + fmr->sg = NULL; + fmr->sg_len = 0; INIT_HLIST_NODE(&fmr->cache_node); fmr->fmr = ib_alloc_fmr(pd, params->access, &fmr_attr); if (IS_ERR(fmr->fmr)) { - printk(KERN_WARNING PFX "fmr_create failed " - "for FMR %d\n", i); kfree(fmr); goto out_fail; } @@ -363,15 +491,25 
@@ void ib_destroy_fmr_pool(struct ib_fmr_pool *pool) int i; kthread_stop(pool->thread); - ib_fmr_batch_release(pool); + ib_fmr_batch_release(pool, 0); i = 0; list_for_each_entry_safe(fmr, tmp, &pool->free_list, list) { + ib_set_fmr_pd(fmr->fmr, pool->pd); + ib_dealloc_fmr(fmr->fmr); + list_del(&fmr->list); + kfree(fmr); + ++i; + } + list_for_each_entry_safe(fmr, tmp, &pool->used_list, list) { if (fmr->remap_count) { INIT_LIST_HEAD(&fmr_list); list_add_tail(&fmr->fmr->list, &fmr_list); ib_unmap_fmr(&fmr_list); + if (pool->relaxed) + fmr_teardown_mr(fmr); } + ib_set_fmr_pd(fmr->fmr, pool->pd); ib_dealloc_fmr(fmr->fmr); list_del(&fmr->list); kfree(fmr); @@ -396,7 +534,6 @@ EXPORT_SYMBOL(ib_destroy_fmr_pool); int ib_flush_fmr_pool(struct ib_fmr_pool *pool) { int serial; - struct ib_pool_fmr *fmr, *next; /* * The free_list holds FMRs that may have been used @@ -404,12 +541,9 @@ int ib_flush_fmr_pool(struct ib_fmr_pool *pool) * Put them on the dirty list now so that the cleanup * thread will reap them too. */ - spin_lock_irq(&pool->pool_lock); - list_for_each_entry_safe(fmr, next, &pool->free_list, list) { - if (fmr->remap_count > 0) - list_move(&fmr->list, &pool->dirty_list); - } - spin_unlock_irq(&pool->pool_lock); + spin_lock_irq(&pool->used_pool_lock); + list_splice_init(&pool->used_list, &pool->dirty_list); + spin_unlock_irq(&pool->used_pool_lock); serial = atomic_inc_return(&pool->req_ser); wake_up_process(pool->thread); @@ -428,13 +562,15 @@ EXPORT_SYMBOL(ib_flush_fmr_pool); * @page_list:List of pages to map * @list_len:Number of pages in @page_list * @io_virtual_address:I/O virtual address for new FMR + * @rargs: argument sepecified when relaxed MR is used. * * Map an FMR from an FMR pool. */ struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle, u64 *page_list, int list_len, - u64 io_virtual_address) + u64 io_virtual_address, + struct ib_fmr_args_relaxed *rargs) { struct ib_fmr_pool *pool = pool_handle; struct ib_pool_fmr *fmr; @@ -444,11 +580,15 @@ struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle, if (list_len < 1 || list_len > pool->max_pages) return ERR_PTR(-EINVAL); + if (pool->relaxed && rargs == NULL) + return ERR_PTR(-EINVAL); + + spin_lock_irqsave(&pool->pool_lock, flags); fmr = ib_fmr_cache_lookup(pool, page_list, list_len, - io_virtual_address); + io_virtual_address, rargs ? 
rargs->pd : NULL); if (fmr) { /* found in cache */ ++fmr->ref_count; @@ -463,21 +603,44 @@ struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle, if (list_empty(&pool->free_list)) { spin_unlock_irqrestore(&pool->pool_lock, flags); - return ERR_PTR(-EAGAIN); + spin_lock_irqsave(&pool->used_pool_lock, flags); + if (list_empty(&pool->used_list)) { + spin_unlock_irqrestore(&pool->used_pool_lock, flags); + return ERR_PTR(-EAGAIN); + } + fmr = list_entry(pool->used_list.next, struct ib_pool_fmr, + list); + list_del(&fmr->list); + hlist_del_init(&fmr->cache_node); + spin_unlock_irqrestore(&pool->used_pool_lock, flags); + } else { + fmr = list_entry(pool->free_list.next, struct ib_pool_fmr, + list); + list_del(&fmr->list); + hlist_del_init(&fmr->cache_node); + spin_unlock_irqrestore(&pool->pool_lock, flags); } - fmr = list_entry(pool->free_list.next, struct ib_pool_fmr, list); - list_del(&fmr->list); - hlist_del_init(&fmr->cache_node); - spin_unlock_irqrestore(&pool->pool_lock, flags); + if (pool->relaxed && fmr->pd != rargs->pd) { + result = ib_set_fmr_pd(fmr->fmr, rargs->pd); + if (result) { + spin_lock_irqsave(&pool->used_pool_lock, flags); + list_add(&fmr->list, &pool->used_list); + spin_unlock_irqrestore(&pool->used_pool_lock, flags); + + printk(KERN_WARNING PFX "set_fmr_pd returns %d\n", result); + + return ERR_PTR(result); + } + } result = ib_map_phys_fmr(fmr->fmr, page_list, list_len, io_virtual_address); if (result) { - spin_lock_irqsave(&pool->pool_lock, flags); - list_add(&fmr->list, &pool->free_list); - spin_unlock_irqrestore(&pool->pool_lock, flags); + spin_lock_irqsave(&pool->used_pool_lock, flags); + list_add(&fmr->list, &pool->used_list); + spin_unlock_irqrestore(&pool->used_pool_lock, flags); printk(KERN_WARNING PFX "fmr_map returns %d\n", result); @@ -498,6 +661,16 @@ struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle, spin_unlock_irqrestore(&pool->pool_lock, flags); } + if (pool->relaxed) { + fmr->pd = rargs->pd; + /* if it was mapped earlier */ + if (fmr->remap_count > 1) + fmr_teardown_mr(fmr); + + fmr->sg = rargs->sg; + fmr->sg_len = rargs->sg_len; + } + return fmr; } EXPORT_SYMBOL(ib_fmr_pool_map_phys); @@ -516,12 +689,12 @@ int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr) pool = fmr->pool; - spin_lock_irqsave(&pool->pool_lock, flags); + spin_lock_irqsave(&pool->used_pool_lock, flags); --fmr->ref_count; if (!fmr->ref_count) { if (fmr->remap_count < pool->max_remaps) { - list_add_tail(&fmr->list, &pool->free_list); + list_add_tail(&fmr->list, &pool->used_list); } else { list_add_tail(&fmr->list, &pool->dirty_list); if (++pool->dirty_len >= pool->dirty_watermark) { @@ -537,7 +710,7 @@ int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr) fmr, fmr->ref_count); #endif - spin_unlock_irqrestore(&pool->pool_lock, flags); + spin_unlock_irqrestore(&pool->used_pool_lock, flags); return 0; } diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index a9c042345c6fe..1bd1161f238b5 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -40,11 +40,10 @@ #include #include #include -#include #include #include #include -#include +#include #include #include @@ -506,8 +505,6 @@ int iw_cm_accept(struct iw_cm_id *cm_id, qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); if (!qp) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); - clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); - wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } cm_id->device->iwcm->add_ref(qp); @@ -567,8 +564,6 @@ int 
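The fmr_pool.c changes above introduce a used_list between the free and dirty lists: an FMR that is released but still has remaps left is parked on used_list so it can be recycled without an unmap, ib_fmr_pool_map_phys() falls back to stealing from used_list when free_list is empty, and the cleanup thread periodically flushes it. A toy userspace model of just that list flow (no locking, no FMR semantics; all names are illustrative):

#include <stdio.h>

struct entry {
	int id;
	int remap_count;
	struct entry *next;
};

static struct entry *free_list, *used_list, *dirty_list;
#define MAX_REMAPS 3

static struct entry *pop(struct entry **l)
{
	struct entry *e = *l;
	if (e)
		*l = e->next;
	return e;
}
static void push(struct entry **l, struct entry *e) { e->next = *l; *l = e; }

/* map: prefer a never-mapped entry, otherwise steal a mapped-but-idle one
 * (mirrors ib_fmr_pool_map_phys() falling back to pool->used_list) */
static struct entry *map_entry(void)
{
	struct entry *e = pop(&free_list);
	if (!e)
		e = pop(&used_list);
	if (e)
		e->remap_count++;
	return e;
}

/* release: idle entries go back to 'used' until they run out of remaps,
 * then to 'dirty' for the cleanup thread's batch release */
static void unmap_entry(struct entry *e)
{
	if (e->remap_count < MAX_REMAPS)
		push(&used_list, e);
	else
		push(&dirty_list, e);
}

/* batch release: everything dirty becomes free (and unmapped) again */
static void batch_release(void)
{
	struct entry *e;
	while ((e = pop(&dirty_list))) {
		e->remap_count = 0;
		push(&free_list, e);
	}
}

int main(void)
{
	struct entry pool[2] = { { .id = 0 }, { .id = 1 } };
	int i;

	push(&free_list, &pool[0]);
	push(&free_list, &pool[1]);

	for (i = 0; i < 8; i++) {
		struct entry *e = map_entry();
		if (!e) {
			batch_release();
			e = map_entry();
		}
		if (!e)
			break;
		printf("mapped entry %d (remap_count %d)\n", e->id, e->remap_count);
		unmap_entry(e);
	}
	return 0;
}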
iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); if (!qp) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); - clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); - wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } cm_id->device->iwcm->add_ref(qp); @@ -725,7 +720,7 @@ static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv, */ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT); - if (iw_event->status == 0) { + if (iw_event->status == IW_CM_EVENT_STATUS_ACCEPTED) { cm_id_priv->id.local_addr = iw_event->local_addr; cm_id_priv->id.remote_addr = iw_event->remote_addr; cm_id_priv->state = IW_CM_STATE_ESTABLISHED; diff --git a/drivers/infiniband/core/local_sa.c b/drivers/infiniband/core/local_sa.c new file mode 100644 index 0000000000000..eb62c429528e6 --- /dev/null +++ b/drivers/infiniband/core/local_sa.c @@ -0,0 +1,1273 @@ +/* + * Copyright (c) 2006 Intel Corporation.  All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "sa.h" + +MODULE_AUTHOR("Sean Hefty"); +MODULE_DESCRIPTION("InfiniBand subnet administration caching"); +MODULE_LICENSE("Dual BSD/GPL"); + +enum { + SA_DB_MAX_PATHS_PER_DEST = 0x7F, + SA_DB_MIN_RETRY_TIMER = 4000, /* 4 sec */ + SA_DB_MAX_RETRY_TIMER = 256000 /* 256 sec */ +}; + +static int set_paths_per_dest(const char *val, struct kernel_param *kp); +static unsigned long paths_per_dest = 0; +module_param_call(paths_per_dest, set_paths_per_dest, param_get_ulong, + &paths_per_dest, 0644); +MODULE_PARM_DESC(paths_per_dest, "Maximum number of paths to retrieve " + "to each destination (DGID). 
Set to 0 " + "to disable cache."); + +static int set_subscribe_inform_info(const char *val, struct kernel_param *kp); +static char subscribe_inform_info = 1; +module_param_call(subscribe_inform_info, set_subscribe_inform_info, + param_get_bool, &subscribe_inform_info, 0644); +MODULE_PARM_DESC(subscribe_inform_info, + "Subscribe for SA InformInfo/Notice events."); + +static int do_refresh(const char *val, struct kernel_param *kp); +module_param_call(refresh, do_refresh, NULL, NULL, 0200); + +static unsigned long retry_timer = SA_DB_MIN_RETRY_TIMER; + +enum sa_db_lookup_method { + SA_DB_LOOKUP_LEAST_USED, + SA_DB_LOOKUP_RANDOM +}; + +static int set_lookup_method(const char *val, struct kernel_param *kp); +static int get_lookup_method(char *buf, struct kernel_param *kp); +static unsigned long lookup_method; +module_param_call(lookup_method, set_lookup_method, get_lookup_method, + &lookup_method, 0644); +MODULE_PARM_DESC(lookup_method, "Method used to return path records when " + "multiple paths exist to a given destination."); + +static void sa_db_add_dev(struct ib_device *device); +static void sa_db_remove_dev(struct ib_device *device); + +static struct ib_client sa_db_client = { + .name = "local_sa", + .add = sa_db_add_dev, + .remove = sa_db_remove_dev +}; + +static LIST_HEAD(dev_list); +static DEFINE_MUTEX(lock); +static rwlock_t rwlock; +static struct workqueue_struct *sa_wq; +static struct ib_sa_client sa_client; + +enum sa_db_state { + SA_DB_IDLE, + SA_DB_REFRESH, + SA_DB_DESTROY +}; + +struct sa_db_port { + struct sa_db_device *dev; + struct ib_mad_agent *agent; + /* Limit number of outstanding MADs to SA to reduce SA flooding */ + struct ib_mad_send_buf *msg; + u16 sm_lid; + u8 sm_sl; + struct ib_inform_info *in_info; + struct ib_inform_info *out_info; + struct rb_root paths; + struct list_head update_list; + unsigned long update_id; + enum sa_db_state state; + struct work_struct work; + union ib_gid gid; + int port_num; +}; + +struct sa_db_device { + struct list_head list; + struct ib_device *device; + struct ib_event_handler event_handler; + int start_port; + int port_count; + struct sa_db_port port[0]; +}; + +struct ib_sa_iterator { + struct ib_sa_iterator *next; +}; + +struct ib_sa_attr_iter { + struct ib_sa_iterator *iter; + unsigned long flags; +}; + +struct ib_sa_attr_list { + struct ib_sa_iterator iter; + struct ib_sa_iterator *tail; + int update_id; + union ib_gid gid; + struct rb_node node; +}; + +struct ib_path_rec_info { + struct ib_sa_iterator iter; /* keep first */ + struct ib_sa_path_rec rec; + unsigned long lookups; +}; + +struct ib_sa_mad_iter { + struct ib_mad_recv_wc *recv_wc; + struct ib_mad_recv_buf *recv_buf; + int attr_size; + int attr_offset; + int data_offset; + int data_left; + void *attr; + u8 attr_data[0]; +}; + +enum sa_update_type { + SA_UPDATE_FULL, + SA_UPDATE_ADD, + SA_UPDATE_REMOVE +}; + +struct update_info { + struct list_head list; + union ib_gid gid; + enum sa_update_type type; +}; + +struct sa_path_request { + struct work_struct work; + struct ib_sa_client *client; + void (*callback)(int, struct ib_sa_path_rec *, void *); + void *context; + struct ib_sa_path_rec path_rec; +}; + +static void process_updates(struct sa_db_port *port); + +static void free_attr_list(struct ib_sa_attr_list *attr_list) +{ + struct ib_sa_iterator *cur; + + for (cur = attr_list->iter.next; cur; cur = attr_list->iter.next) { + attr_list->iter.next = cur->next; + kfree(cur); + } + attr_list->tail = &attr_list->iter; +} + +static void remove_attr(struct rb_root *root, struct 
ib_sa_attr_list *attr_list) +{ + rb_erase(&attr_list->node, root); + free_attr_list(attr_list); + kfree(attr_list); +} + +static void remove_all_attrs(struct rb_root *root) +{ + struct rb_node *node, *next_node; + struct ib_sa_attr_list *attr_list; + + write_lock_irq(&rwlock); + for (node = rb_first(root); node; node = next_node) { + next_node = rb_next(node); + attr_list = rb_entry(node, struct ib_sa_attr_list, node); + remove_attr(root, attr_list); + } + write_unlock_irq(&rwlock); +} + +static void remove_old_attrs(struct rb_root *root, unsigned long update_id) +{ + struct rb_node *node, *next_node; + struct ib_sa_attr_list *attr_list; + + write_lock_irq(&rwlock); + for (node = rb_first(root); node; node = next_node) { + next_node = rb_next(node); + attr_list = rb_entry(node, struct ib_sa_attr_list, node); + if (attr_list->update_id != update_id) + remove_attr(root, attr_list); + } + write_unlock_irq(&rwlock); +} + +static struct ib_sa_attr_list *insert_attr_list(struct rb_root *root, + struct ib_sa_attr_list *attr_list) +{ + struct rb_node **link = &root->rb_node; + struct rb_node *parent = NULL; + struct ib_sa_attr_list *cur_attr_list; + int cmp; + + while (*link) { + parent = *link; + cur_attr_list = rb_entry(parent, struct ib_sa_attr_list, node); + cmp = memcmp(&cur_attr_list->gid, &attr_list->gid, + sizeof attr_list->gid); + if (cmp < 0) + link = &(*link)->rb_left; + else if (cmp > 0) + link = &(*link)->rb_right; + else + return cur_attr_list; + } + rb_link_node(&attr_list->node, parent, link); + rb_insert_color(&attr_list->node, root); + return NULL; +} + +static struct ib_sa_attr_list *find_attr_list(struct rb_root *root, u8 *gid) +{ + struct rb_node *node = root->rb_node; + struct ib_sa_attr_list *attr_list; + int cmp; + + while (node) { + attr_list = rb_entry(node, struct ib_sa_attr_list, node); + cmp = memcmp(&attr_list->gid, gid, sizeof attr_list->gid); + if (cmp < 0) + node = node->rb_left; + else if (cmp > 0) + node = node->rb_right; + else + return attr_list; + } + return NULL; +} + +static int insert_attr(struct rb_root *root, unsigned long update_id, void *key, + struct ib_sa_iterator *iter) +{ + struct ib_sa_attr_list *attr_list; + void *err; + + write_lock_irq(&rwlock); + attr_list = find_attr_list(root, key); + if (!attr_list) { + write_unlock_irq(&rwlock); + attr_list = kmalloc(sizeof *attr_list, GFP_KERNEL); + if (!attr_list) + return -ENOMEM; + + attr_list->iter.next = NULL; + attr_list->tail = &attr_list->iter; + attr_list->update_id = update_id; + memcpy(attr_list->gid.raw, key, sizeof attr_list->gid); + + write_lock_irq(&rwlock); + err = insert_attr_list(root, attr_list); + if (err) { + write_unlock_irq(&rwlock); + kfree(attr_list); + return PTR_ERR(err); + } + } else if (attr_list->update_id != update_id) { + free_attr_list(attr_list); + attr_list->update_id = update_id; + } + + attr_list->tail->next = iter; + iter->next = NULL; + attr_list->tail = iter; + write_unlock_irq(&rwlock); + return 0; +} + +static struct ib_sa_mad_iter *ib_sa_iter_create(struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_sa_mad_iter *iter; + struct ib_sa_mad *mad = (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad; + int attr_size, attr_offset; + + attr_offset = be16_to_cpu(mad->sa_hdr.attr_offset) * 8; + attr_size = 64; /* path record length */ + if (attr_offset < attr_size) + return ERR_PTR(-EINVAL); + + iter = kzalloc(sizeof *iter + attr_size, GFP_KERNEL); + if (!iter) + return ERR_PTR(-ENOMEM); + + iter->data_left = mad_recv_wc->mad_len - IB_MGMT_SA_HDR; + iter->recv_wc = mad_recv_wc; 
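insert_attr_list() and find_attr_list() above key the per-port path cache on the destination GID, comparing raw 16-byte GIDs with memcmp() and descending left when the stored key compares lower than the one being inserted or looked up. The same comparator convention on a plain, unbalanced binary search tree; the kernel code uses rb_node/rb_insert_color for balancing, which is omitted here:

#include <stdio.h>
#include <string.h>

#define GID_LEN 16

struct node {
	unsigned char gid[GID_LEN];
	struct node *left, *right;
};

/* insert keyed by memcmp() over the raw GID, as insert_attr_list() does;
 * returns the existing node with the same GID instead of inserting twice */
static struct node *insert(struct node **root, struct node *n)
{
	struct node **link = root;

	while (*link) {
		int cmp = memcmp((*link)->gid, n->gid, GID_LEN);
		if (cmp < 0)
			link = &(*link)->left;
		else if (cmp > 0)
			link = &(*link)->right;
		else
			return *link;           /* duplicate GID */
	}
	*link = n;
	return NULL;
}

/* lookup with the same comparison direction, as find_attr_list() does */
static struct node *find(struct node *root, const unsigned char *gid)
{
	while (root) {
		int cmp = memcmp(root->gid, gid, GID_LEN);
		if (cmp < 0)
			root = root->left;
		else if (cmp > 0)
			root = root->right;
		else
			return root;
	}
	return NULL;
}

int main(void)
{
	struct node a = { .gid = { 0xfe, 0x80, [15] = 0x01 } };
	struct node b = { .gid = { 0xfe, 0x80, [15] = 0x02 } };
	struct node *root = NULL;

	insert(&root, &a);
	insert(&root, &b);
	printf("lookup of second GID: %s\n", find(root, b.gid) ? "hit" : "miss");
	return 0;
}

Only the agreement between insert and lookup matters; the left/right orientation above mirrors the comparisons in the patch.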
+ iter->recv_buf = &mad_recv_wc->recv_buf; + iter->attr_offset = attr_offset; + iter->attr_size = attr_size; + return iter; +} + +static void ib_sa_iter_free(struct ib_sa_mad_iter *iter) +{ + kfree(iter); +} + +static void *ib_sa_iter_next(struct ib_sa_mad_iter *iter) +{ + struct ib_sa_mad *mad; + int left, offset = 0; + + while (iter->data_left >= iter->attr_offset) { + while (iter->data_offset < IB_MGMT_SA_DATA) { + mad = (struct ib_sa_mad *) iter->recv_buf->mad; + + left = IB_MGMT_SA_DATA - iter->data_offset; + if (left < iter->attr_size) { + /* copy first piece of the attribute */ + iter->attr = &iter->attr_data; + memcpy(iter->attr, + &mad->data[iter->data_offset], left); + offset = left; + break; + } else if (offset) { + /* copy the second piece of the attribute */ + memcpy(iter->attr + offset, &mad->data[0], + iter->attr_size - offset); + iter->data_offset = iter->attr_size - offset; + offset = 0; + } else { + iter->attr = &mad->data[iter->data_offset]; + iter->data_offset += iter->attr_size; + } + + iter->data_left -= iter->attr_offset; + goto out; + } + iter->data_offset = 0; + iter->recv_buf = list_entry(iter->recv_buf->list.next, + struct ib_mad_recv_buf, list); + } + iter->attr = NULL; +out: + return iter->attr; +} + +/* + * Copy path records from a received response and insert them into our cache. + * A path record in the MADs are in network order, packed, and may + * span multiple MAD buffers, just to make our life hard. + */ +static void update_path_db(struct sa_db_port *port, + struct ib_mad_recv_wc *mad_recv_wc, + enum sa_update_type type) +{ + struct ib_sa_mad_iter *iter; + struct ib_path_rec_info *path_info; + void *attr; + int ret; + + iter = ib_sa_iter_create(mad_recv_wc); + if (IS_ERR(iter)) + return; + + port->update_id += (type == SA_UPDATE_FULL); + + while ((attr = ib_sa_iter_next(iter)) && + (path_info = kmalloc(sizeof *path_info, GFP_KERNEL))) { + + ib_sa_unpack_attr(&path_info->rec, attr, IB_SA_ATTR_PATH_REC); + + ret = insert_attr(&port->paths, port->update_id, + path_info->rec.dgid.raw, &path_info->iter); + if (ret) { + kfree(path_info); + break; + } + } + ib_sa_iter_free(iter); + + if (type == SA_UPDATE_FULL) + remove_old_attrs(&port->paths, port->update_id); +} + +static struct ib_mad_send_buf *get_sa_msg(struct sa_db_port *port, + struct update_info *update) +{ + struct ib_ah_attr ah_attr; + struct ib_mad_send_buf *msg; + + msg = ib_create_send_mad(port->agent, 1, 0, 0, IB_MGMT_SA_HDR, + IB_MGMT_SA_DATA, GFP_KERNEL); + if (IS_ERR(msg)) + return NULL; + + memset(&ah_attr, 0, sizeof ah_attr); + ah_attr.dlid = port->sm_lid; + ah_attr.sl = port->sm_sl; + ah_attr.port_num = port->port_num; + + msg->ah = ib_create_ah(port->agent->qp->pd, &ah_attr); + if (IS_ERR(msg->ah)) { + ib_free_send_mad(msg); + return NULL; + } + + msg->timeout_ms = retry_timer; + msg->retries = 0; + msg->context[0] = port; + msg->context[1] = update; + return msg; +} + +static __be64 form_tid(u32 hi_tid) +{ + static atomic_t tid; + return cpu_to_be64((((u64) hi_tid) << 32) | + ((u32) atomic_inc_return(&tid))); +} + +static void format_path_req(struct sa_db_port *port, + struct update_info *update, + struct ib_mad_send_buf *msg) +{ + struct ib_sa_mad *mad = msg->mad; + struct ib_sa_path_rec path_rec; + + mad->mad_hdr.base_version = IB_MGMT_BASE_VERSION; + mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; + mad->mad_hdr.class_version = IB_SA_CLASS_VERSION; + mad->mad_hdr.method = IB_SA_METHOD_GET_TABLE; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_PATH_REC); + mad->mad_hdr.tid = 
form_tid(msg->mad_agent->hi_tid); + + mad->sa_hdr.comp_mask = IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH; + + path_rec.sgid = port->gid; + path_rec.numb_path = (u8) paths_per_dest; + + if (update->type == SA_UPDATE_ADD) { + mad->sa_hdr.comp_mask |= IB_SA_PATH_REC_DGID; + memcpy(&path_rec.dgid, &update->gid, sizeof path_rec.dgid); + } + + ib_sa_pack_attr(mad->data, &path_rec, IB_SA_ATTR_PATH_REC); +} + +static int send_query(struct sa_db_port *port, + struct update_info *update) +{ + int ret; + + port->msg = get_sa_msg(port, update); + if (!port->msg) + return -ENOMEM; + + format_path_req(port, update, port->msg); + + ret = ib_post_send_mad(port->msg, NULL); + if (ret) + goto err; + + return 0; + +err: + ib_destroy_ah(port->msg->ah); + ib_free_send_mad(port->msg); + return ret; +} + +static void add_update(struct sa_db_port *port, u8 *gid, + enum sa_update_type type) +{ + struct update_info *update; + + update = kmalloc(sizeof *update, GFP_KERNEL); + if (update) { + if (gid) + memcpy(&update->gid, gid, sizeof update->gid); + update->type = type; + list_add(&update->list, &port->update_list); + } + + if (port->state == SA_DB_IDLE) { + port->state = SA_DB_REFRESH; + process_updates(port); + } +} + +static void clean_update_list(struct sa_db_port *port) +{ + struct update_info *update; + + while (!list_empty(&port->update_list)) { + update = list_entry(port->update_list.next, + struct update_info, list); + list_del(&update->list); + kfree(update); + } +} + +static int notice_handler(int status, struct ib_inform_info *info, + struct ib_sa_notice *notice) +{ + struct sa_db_port *port = info->context; + struct ib_sa_notice_data_gid *gid_data; + struct ib_inform_info **pinfo; + enum sa_update_type type; + + if (info->trap_number == IB_SA_SM_TRAP_GID_IN_SERVICE) { + pinfo = &port->in_info; + type = SA_UPDATE_ADD; + } else { + pinfo = &port->out_info; + type = SA_UPDATE_REMOVE; + } + + mutex_lock(&lock); + if (port->state == SA_DB_DESTROY || !*pinfo) { + mutex_unlock(&lock); + return 0; + } + + if (notice) { + gid_data = (struct ib_sa_notice_data_gid *) + ¬ice->data_details; + add_update(port, gid_data->gid, type); + mutex_unlock(&lock); + } else if (status == -ENETRESET) { + *pinfo = NULL; + mutex_unlock(&lock); + } else { + if (status) + *pinfo = ERR_PTR(-EINVAL); + port->state = SA_DB_IDLE; + clean_update_list(port); + mutex_unlock(&lock); + queue_work(sa_wq, &port->work); + } + + return status; +} + +static int reg_in_info(struct sa_db_port *port) +{ + int ret = 0; + + port->in_info = ib_sa_register_inform_info(&sa_client, + port->dev->device, + port->port_num, + IB_SA_SM_TRAP_GID_IN_SERVICE, + GFP_KERNEL, notice_handler, + port); + if (IS_ERR(port->in_info)) + ret = PTR_ERR(port->in_info); + + return ret; +} + +static int reg_out_info(struct sa_db_port *port) +{ + int ret = 0; + + port->out_info = ib_sa_register_inform_info(&sa_client, + port->dev->device, + port->port_num, + IB_SA_SM_TRAP_GID_OUT_OF_SERVICE, + GFP_KERNEL, notice_handler, + port); + if (IS_ERR(port->out_info)) + ret = PTR_ERR(port->out_info); + + return ret; +} + +static void unsubscribe_port(struct sa_db_port *port) +{ + if (port->in_info && !IS_ERR(port->in_info)) + ib_sa_unregister_inform_info(port->in_info); + + if (port->out_info && !IS_ERR(port->out_info)) + ib_sa_unregister_inform_info(port->out_info); + + port->out_info = NULL; + port->in_info = NULL; + +} + +static void cleanup_port(struct sa_db_port *port) +{ + unsubscribe_port(port); + + clean_update_list(port); + remove_all_attrs(&port->paths); +} + +static int 
update_port_info(struct sa_db_port *port) +{ + struct ib_port_attr port_attr; + int ret; + + ret = ib_query_port(port->dev->device, port->port_num, &port_attr); + if (ret) + return ret; + + if (port_attr.state != IB_PORT_ACTIVE) + return -ENODATA; + + port->sm_lid = port_attr.sm_lid; + port->sm_sl = port_attr.sm_sl; + return 0; +} + +static void process_updates(struct sa_db_port *port) +{ + struct update_info *update; + struct ib_sa_attr_list *attr_list; + int ret; + + if (!paths_per_dest || update_port_info(port)) { + cleanup_port(port); + goto out; + } + + /* Event registration is an optimization, so ignore failures. */ + if (subscribe_inform_info) { + if (!port->out_info) { + ret = reg_out_info(port); + if (!ret) + return; + } + + if (!port->in_info) { + ret = reg_in_info(port); + if (!ret) + return; + } + } else + unsubscribe_port(port); + + while (!list_empty(&port->update_list)) { + update = list_entry(port->update_list.next, + struct update_info, list); + + if (update->type == SA_UPDATE_REMOVE) { + write_lock_irq(&rwlock); + attr_list = find_attr_list(&port->paths, + update->gid.raw); + if (attr_list) + remove_attr(&port->paths, attr_list); + write_unlock_irq(&rwlock); + } else { + ret = send_query(port, update); + if (!ret) + return; + + } + list_del(&update->list); + kfree(update); + } +out: + port->state = SA_DB_IDLE; +} + +static void refresh_port_db(struct sa_db_port *port) +{ + if (port->state == SA_DB_DESTROY) + return; + + if (port->state == SA_DB_REFRESH) { + clean_update_list(port); + ib_cancel_mad(port->agent, port->msg); + } + + add_update(port, NULL, SA_UPDATE_FULL); +} + +static void refresh_dev_db(struct sa_db_device *dev) +{ + int i; + + for (i = 0; i < dev->port_count; i++) + refresh_port_db(&dev->port[i]); +} + +static void refresh_db(void) +{ + struct sa_db_device *dev; + + list_for_each_entry(dev, &dev_list, list) + refresh_dev_db(dev); +} + +static int do_refresh(const char *val, struct kernel_param *kp) +{ + mutex_lock(&lock); + refresh_db(); + mutex_unlock(&lock); + return 0; +} + +static int get_lookup_method(char *buf, struct kernel_param *kp) +{ + return sprintf(buf, + "%c %d round robin\n" + "%c %d random", + (lookup_method == SA_DB_LOOKUP_LEAST_USED) ? '*' : ' ', + SA_DB_LOOKUP_LEAST_USED, + (lookup_method == SA_DB_LOOKUP_RANDOM) ? 
'*' : ' ', + SA_DB_LOOKUP_RANDOM); +} + +static int set_lookup_method(const char *val, struct kernel_param *kp) +{ + unsigned long method; + int ret = 0; + + method = simple_strtoul(val, NULL, 0); + + switch (method) { + case SA_DB_LOOKUP_LEAST_USED: + case SA_DB_LOOKUP_RANDOM: + lookup_method = method; + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int set_paths_per_dest(const char *val, struct kernel_param *kp) +{ + int ret; + + mutex_lock(&lock); + ret = param_set_ulong(val, kp); + if (ret) + goto out; + + if (paths_per_dest > SA_DB_MAX_PATHS_PER_DEST) + paths_per_dest = SA_DB_MAX_PATHS_PER_DEST; + refresh_db(); +out: + mutex_unlock(&lock); + return ret; +} + +static int set_subscribe_inform_info(const char *val, struct kernel_param *kp) +{ + int ret; + + ret = param_set_bool(val, kp); + if (ret) + return ret; + + return do_refresh(val, kp); +} + +static void port_work_handler(struct work_struct *work) +{ + struct sa_db_port *port; + + port = container_of(work, typeof(*port), work); + mutex_lock(&lock); + refresh_port_db(port); + mutex_unlock(&lock); +} + +static void handle_event(struct ib_event_handler *event_handler, + struct ib_event *event) +{ + struct sa_db_device *dev; + struct sa_db_port *port; + + dev = container_of(event_handler, typeof(*dev), event_handler); + port = &dev->port[event->element.port_num - dev->start_port]; + + switch (event->event) { + case IB_EVENT_PORT_ERR: + case IB_EVENT_LID_CHANGE: + case IB_EVENT_SM_CHANGE: + case IB_EVENT_CLIENT_REREGISTER: + case IB_EVENT_PKEY_CHANGE: + case IB_EVENT_PORT_ACTIVE: + queue_work(sa_wq, &port->work); + break; + default: + break; + } +} + +static void ib_free_path_iter(struct ib_sa_attr_iter *iter) +{ + read_unlock_irqrestore(&rwlock, iter->flags); +} + +static int ib_create_path_iter(struct ib_device *device, u8 port_num, + union ib_gid *dgid, struct ib_sa_attr_iter *iter) +{ + struct sa_db_device *dev; + struct sa_db_port *port; + struct ib_sa_attr_list *list; + + dev = ib_get_client_data(device, &sa_db_client); + if (!dev) + return -ENODEV; + + port = &dev->port[port_num - dev->start_port]; + + read_lock_irqsave(&rwlock, iter->flags); + list = find_attr_list(&port->paths, dgid->raw); + if (!list) { + ib_free_path_iter(iter); + return -ENODATA; + } + + iter->iter = &list->iter; + return 0; +} + +static struct ib_sa_path_rec *ib_get_next_path(struct ib_sa_attr_iter *iter) +{ + struct ib_path_rec_info *next_path; + + iter->iter = iter->iter->next; + if (iter->iter) { + next_path = container_of(iter->iter, struct ib_path_rec_info, iter); + return &next_path->rec; + } else + return NULL; +} + +static int cmp_rec(struct ib_sa_path_rec *src, + struct ib_sa_path_rec *dst, ib_sa_comp_mask comp_mask) +{ + /* DGID check already done */ + if (comp_mask & IB_SA_PATH_REC_SGID && + memcmp(&src->sgid, &dst->sgid, sizeof src->sgid)) + return -EINVAL; + if (comp_mask & IB_SA_PATH_REC_DLID && src->dlid != dst->dlid) + return -EINVAL; + if (comp_mask & IB_SA_PATH_REC_SLID && src->slid != dst->slid) + return -EINVAL; + if (comp_mask & IB_SA_PATH_REC_RAW_TRAFFIC && + src->raw_traffic != dst->raw_traffic) + return -EINVAL; + + if (comp_mask & IB_SA_PATH_REC_FLOW_LABEL && + src->flow_label != dst->flow_label) + return -EINVAL; + if (comp_mask & IB_SA_PATH_REC_HOP_LIMIT && + src->hop_limit != dst->hop_limit) + return -EINVAL; + if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS && + src->traffic_class != dst->traffic_class) + return -EINVAL; + if (comp_mask & IB_SA_PATH_REC_REVERSIBLE && + dst->reversible && 
!src->reversible) + return -EINVAL; + /* Numb path check already done */ + if (comp_mask & IB_SA_PATH_REC_PKEY && src->pkey != dst->pkey) + return -EINVAL; + + if (comp_mask & IB_SA_PATH_REC_SL && src->sl != dst->sl) + return -EINVAL; + + if (ib_sa_check_selector(comp_mask, IB_SA_PATH_REC_MTU_SELECTOR, + IB_SA_PATH_REC_MTU, dst->mtu_selector, + src->mtu, dst->mtu)) + return -EINVAL; + if (ib_sa_check_selector(comp_mask, IB_SA_PATH_REC_RATE_SELECTOR, + IB_SA_PATH_REC_RATE, dst->rate_selector, + src->rate, dst->rate)) + return -EINVAL; + if (ib_sa_check_selector(comp_mask, + IB_SA_PATH_REC_PACKET_LIFE_TIME_SELECTOR, + IB_SA_PATH_REC_PACKET_LIFE_TIME, + dst->packet_life_time_selector, + src->packet_life_time, dst->packet_life_time)) + return -EINVAL; + + return 0; +} + +static struct ib_sa_path_rec *get_random_path(struct ib_sa_attr_iter *iter, + struct ib_sa_path_rec *req_path, + ib_sa_comp_mask comp_mask) +{ + struct ib_sa_path_rec *path, *rand_path = NULL; + int num, count = 0; + + for (path = ib_get_next_path(iter); path; + path = ib_get_next_path(iter)) { + if (!cmp_rec(path, req_path, comp_mask)) { + get_random_bytes(&num, sizeof num); + if ((num % ++count) == 0) + rand_path = path; + } + } + + return rand_path; +} + +static struct ib_sa_path_rec *get_next_path(struct ib_sa_attr_iter *iter, + struct ib_sa_path_rec *req_path, + ib_sa_comp_mask comp_mask) +{ + struct ib_path_rec_info *cur_path, *next_path = NULL; + struct ib_sa_path_rec *path; + unsigned long lookups = ~0; + + for (path = ib_get_next_path(iter); path; + path = ib_get_next_path(iter)) { + if (!cmp_rec(path, req_path, comp_mask)) { + + cur_path = container_of(iter->iter, struct ib_path_rec_info, + iter); + if (cur_path->lookups < lookups) { + lookups = cur_path->lookups; + next_path = cur_path; + } + } + } + + if (next_path) { + next_path->lookups++; + return &next_path->rec; + } else + return NULL; +} + +static void report_path(struct work_struct *work) +{ + struct sa_path_request *req; + + req = container_of(work, struct sa_path_request, work); + req->callback(0, &req->path_rec, req->context); + ib_sa_client_put(req->client); + kfree(req); +} + +/** + * ib_sa_path_rec_get - Start a Path get query + * @client:SA client + * @device:device to send query on + * @port_num: port number to send query on + * @rec:Path Record to send in query + * @comp_mask:component mask to send in query + * @timeout_ms:time to wait for response + * @gfp_mask:GFP mask to use for internal allocations + * @callback:function called when query completes, times out or is + * canceled + * @context:opaque user context passed to callback + * @sa_query:query context, used to cancel query + * + * Send a Path Record Get query to the SA to look up a path. The + * callback function will be called when the query completes (or + * fails); status is 0 for a successful response, -EINTR if the query + * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error + * occurred sending the query. The resp parameter of the callback is + * only valid if status is 0. + * + * If the return value of ib_sa_path_rec_get() is negative, it is an + * error code. Otherwise it is a query ID that can be used to cancel + * the query. 
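+ *
+ * When paths_per_dest is non-zero and the request asks for a single path
+ * by DGID (IB_SA_PATH_REC_DGID and IB_SA_PATH_REC_NUMB_PATH set with
+ * numb_path == 1), the record may be served from the local SA cache: the
+ * callback then runs from a work queue, the function returns 0 and
+ * *sa_query is set to ERR_PTR(-EEXIST), so there is nothing to cancel.
+ * Requests that cannot be served locally fall back to
+ * ib_sa_path_rec_query() and behave as described above.
+ *
+ * A minimal, illustrative call (assuming an ib_sa_client already
+ * registered with ib_sa_register_client(), a caller-defined path_handler
+ * with the callback signature above, and local rec/query/context
+ * variables):
+ *
+ *	memset(&rec, 0, sizeof rec);
+ *	rec.dgid = dgid;
+ *	rec.numb_path = 1;
+ *	ret = ib_sa_path_rec_get(&client, device, port_num, &rec,
+ *				 IB_SA_PATH_REC_DGID |
+ *				 IB_SA_PATH_REC_NUMB_PATH,
+ *				 2000, GFP_KERNEL, path_handler,
+ *				 context, &query);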
+ */ +int ib_sa_path_rec_get(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_path_rec *rec, + ib_sa_comp_mask comp_mask, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_path_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) +{ + struct sa_path_request *req; + struct ib_sa_attr_iter iter; + struct ib_sa_path_rec *path_rec; + int ret; + + if (!paths_per_dest) + goto query_sa; + + if (!(comp_mask & IB_SA_PATH_REC_DGID) || + !(comp_mask & IB_SA_PATH_REC_NUMB_PATH) || rec->numb_path != 1) + goto query_sa; + + req = kmalloc(sizeof *req, gfp_mask); + if (!req) + goto query_sa; + + ret = ib_create_path_iter(device, port_num, &rec->dgid, &iter); + if (ret) + goto free_req; + + if (lookup_method == SA_DB_LOOKUP_RANDOM) + path_rec = get_random_path(&iter, rec, comp_mask); + else + path_rec = get_next_path(&iter, rec, comp_mask); + + if (!path_rec) + goto free_iter; + + memcpy(&req->path_rec, path_rec, sizeof *path_rec); + ib_free_path_iter(&iter); + + INIT_WORK(&req->work, report_path); + req->client = client; + req->callback = callback; + req->context = context; + + ib_sa_client_get(client); + queue_work(sa_wq, &req->work); + *sa_query = ERR_PTR(-EEXIST); + return 0; + +free_iter: + ib_free_path_iter(&iter); +free_req: + kfree(req); +query_sa: + return ib_sa_path_rec_query(client, device, port_num, rec, comp_mask, + timeout_ms, gfp_mask, callback, context, + sa_query); +} +EXPORT_SYMBOL(ib_sa_path_rec_get); + +static void recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct sa_db_port *port; + struct update_info *update; + struct ib_mad_send_buf *msg; + enum sa_update_type type; + + msg = (struct ib_mad_send_buf *) (unsigned long) mad_recv_wc->wc->wr_id; + port = msg->context[0]; + update = msg->context[1]; + + mutex_lock(&lock); + if (port->state == SA_DB_DESTROY || + update != list_entry(port->update_list.next, + struct update_info, list)) { + mutex_unlock(&lock); + } else { + type = update->type; + mutex_unlock(&lock); + update_path_db(mad_agent->context, mad_recv_wc, type); + } + + ib_free_recv_mad(mad_recv_wc); +} + +static void send_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *mad_send_wc) +{ + struct ib_mad_send_buf *msg; + struct sa_db_port *port; + struct update_info *update; + int ret; + + msg = mad_send_wc->send_buf; + port = msg->context[0]; + update = msg->context[1]; + + mutex_lock(&lock); + if (port->state == SA_DB_DESTROY) + goto unlock; + + if (update == list_entry(port->update_list.next, + struct update_info, list)) { + + if (mad_send_wc->status == IB_WC_RESP_TIMEOUT_ERR && + msg->timeout_ms < SA_DB_MAX_RETRY_TIMER) { + + msg->timeout_ms <<= 1; + ret = ib_post_send_mad(msg, NULL); + if (!ret) { + mutex_unlock(&lock); + return; + } + } + list_del(&update->list); + kfree(update); + } + process_updates(port); +unlock: + mutex_unlock(&lock); + + ib_destroy_ah(msg->ah); + ib_free_send_mad(msg); +} + +static int init_port(struct sa_db_device *dev, int port_num) +{ + struct sa_db_port *port; + int ret; + + port = &dev->port[port_num - dev->start_port]; + port->dev = dev; + port->port_num = port_num; + INIT_WORK(&port->work, port_work_handler); + port->paths = RB_ROOT; + INIT_LIST_HEAD(&port->update_list); + + ret = ib_get_cached_gid(dev->device, port_num, 0, &port->gid); + if (ret) + return ret; + + port->agent = ib_register_mad_agent(dev->device, port_num, IB_QPT_GSI, + NULL, IB_MGMT_RMPP_VERSION, + send_handler, recv_handler, port); 
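+ /* The agent is registered with RMPP support (IB_MGMT_RMPP_VERSION), so a
+  * segmented GetTable response reaches recv_handler() reassembled as a
+  * chain of receive buffers. */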
+ if (IS_ERR(port->agent)) + ret = PTR_ERR(port->agent); + + return ret; +} + +static void destroy_port(struct sa_db_port *port) +{ + mutex_lock(&lock); + port->state = SA_DB_DESTROY; + mutex_unlock(&lock); + + ib_unregister_mad_agent(port->agent); + cleanup_port(port); + flush_workqueue(sa_wq); +} + +static void sa_db_add_dev(struct ib_device *device) +{ + struct sa_db_device *dev; + struct sa_db_port *port; + int s, e, i, ret; + + if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + return; + + if (device->node_type == RDMA_NODE_IB_SWITCH) { + s = e = 0; + } else { + s = 1; + e = device->phys_port_cnt; + } + + dev = kzalloc(sizeof *dev + (e - s + 1) * sizeof *port, GFP_KERNEL); + if (!dev) + return; + + dev->start_port = s; + dev->port_count = e - s + 1; + dev->device = device; + for (i = 0; i < dev->port_count; i++) { + ret = init_port(dev, s + i); + if (ret) + goto err; + } + + ib_set_client_data(device, &sa_db_client, dev); + + INIT_IB_EVENT_HANDLER(&dev->event_handler, device, handle_event); + + mutex_lock(&lock); + list_add_tail(&dev->list, &dev_list); + refresh_dev_db(dev); + mutex_unlock(&lock); + + ib_register_event_handler(&dev->event_handler); + return; +err: + while (i--) + destroy_port(&dev->port[i]); + kfree(dev); +} + +static void sa_db_remove_dev(struct ib_device *device) +{ + struct sa_db_device *dev; + int i; + + dev = ib_get_client_data(device, &sa_db_client); + if (!dev) + return; + + ib_unregister_event_handler(&dev->event_handler); + flush_workqueue(sa_wq); + + for (i = 0; i < dev->port_count; i++) + destroy_port(&dev->port[i]); + + mutex_lock(&lock); + list_del(&dev->list); + mutex_unlock(&lock); + + kfree(dev); +} + +int sa_db_init(void) +{ + int ret; + + rwlock_init(&rwlock); + sa_wq = create_singlethread_workqueue("local_sa"); + if (!sa_wq) + return -ENOMEM; + + ib_sa_register_client(&sa_client); + ret = ib_register_client(&sa_db_client); + if (ret) + goto err; + + return 0; + +err: + ib_sa_unregister_client(&sa_client); + destroy_workqueue(sa_wq); + return ret; +} + +void sa_db_cleanup(void) +{ + ib_unregister_client(&sa_db_client); + ib_sa_unregister_client(&sa_client); + destroy_workqueue(sa_wq); +} diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index b4d8672a3e4ef..ae9671558733c 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -47,8 +47,8 @@ MODULE_DESCRIPTION("kernel IB MAD API"); MODULE_AUTHOR("Hal Rosenstock"); MODULE_AUTHOR("Sean Hefty"); -static int mad_sendq_size = IB_MAD_QP_SEND_SIZE; -static int mad_recvq_size = IB_MAD_QP_RECV_SIZE; +int mad_sendq_size = IB_MAD_QP_SEND_SIZE; +int mad_recvq_size = IB_MAD_QP_RECV_SIZE; module_param_named(send_queue_size, mad_sendq_size, int, 0444); MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests"); @@ -61,7 +61,8 @@ static struct list_head ib_mad_port_list; static u32 ib_mad_client_id = 0; /* Port list lock */ -static DEFINE_SPINLOCK(ib_mad_port_list_lock); +static spinlock_t ib_mad_port_list_lock; + /* Forward declarations */ static int method_in_use(struct ib_mad_mgmt_method_table **method, @@ -184,6 +185,15 @@ int ib_response_mad(struct ib_mad *mad) } EXPORT_SYMBOL(ib_response_mad); +static void timeout_callback(unsigned long data) +{ + struct ib_mad_agent_private *mad_agent_priv = + (struct ib_mad_agent_private *) data; + + queue_work(mad_agent_priv->qp_info->port_priv->wq, + &mad_agent_priv->timeout_work); +} + /* * ib_register_mad_agent - Register to send/receive MADs */ @@ -276,13 +286,6 @@ struct 
ib_mad_agent *ib_register_mad_agent(struct ib_device *device, goto error1; } - /* Verify the QP requested is supported. For example, Ethernet devices - * will not have QP0 */ - if (!port_priv->qp_info[qpn].qp) { - ret = ERR_PTR(-EPROTONOSUPPORT); - goto error1; - } - /* Allocate structures */ mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL); if (!mad_agent_priv) { @@ -298,11 +301,13 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, } if (mad_reg_req) { - reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL); + reg_req = kmalloc(sizeof *reg_req, GFP_KERNEL); if (!reg_req) { ret = ERR_PTR(-ENOMEM); goto error3; } + /* Make a copy of the MAD registration request */ + memcpy(reg_req, mad_reg_req, sizeof *reg_req); } /* Now, fill in the various structures */ @@ -320,7 +325,9 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, INIT_LIST_HEAD(&mad_agent_priv->wait_list); INIT_LIST_HEAD(&mad_agent_priv->done_list); INIT_LIST_HEAD(&mad_agent_priv->rmpp_list); - INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends); + INIT_WORK(&mad_agent_priv->timeout_work, timeout_sends); + setup_timer(&mad_agent_priv->timeout_timer, timeout_callback, + (unsigned long) mad_agent_priv); INIT_LIST_HEAD(&mad_agent_priv->local_list); INIT_WORK(&mad_agent_priv->local_work, local_completions); atomic_set(&mad_agent_priv->refcount, 1); @@ -527,7 +534,8 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) */ cancel_mads(mad_agent_priv); port_priv = mad_agent_priv->qp_info->port_priv; - cancel_delayed_work(&mad_agent_priv->timed_work); + del_timer_sync(&mad_agent_priv->timeout_timer); + cancel_work_sync(&mad_agent_priv->timeout_work); spin_lock_irqsave(&port_priv->reg_lock, flags); remove_mad_reg_req(mad_agent_priv); @@ -688,8 +696,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, struct ib_wc mad_wc; struct ib_send_wr *send_wr = &mad_send_wr->send_wr; - if (device->node_type == RDMA_NODE_IB_SWITCH && - smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + if (device->node_type == RDMA_NODE_IB_SWITCH) port_num = send_wr->wr.ud.port_num; else port_num = mad_agent_priv->agent.port_num; @@ -700,9 +707,10 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, * If we are at the start of the LID routed part, don't update the * hop_ptr or hop_cnt. See section 14.2.2, Vol 1 IB spec. */ - if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) == - IB_LID_PERMISSIVE && - smi_handle_dr_smp_send(smp, device->node_type, port_num) == + if ((ib_get_smp_direction(smp) ? 
smp->dr_dlid : smp->dr_slid) != + IB_LID_PERMISSIVE) + goto out; + if (smi_handle_dr_smp_send(smp, device->node_type, port_num) == IB_SMI_DISCARD) { ret = -EINVAL; printk(KERN_ERR PFX "Invalid directed route\n"); @@ -1021,12 +1029,20 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) mad_send_wr->send_buf.mad, sge[0].length, DMA_TO_DEVICE); - mad_send_wr->header_mapping = sge[0].addr; + if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[0].addr))) + return -ENOMEM; sge[1].addr = ib_dma_map_single(mad_agent->device, ib_get_payload(mad_send_wr), sge[1].length, DMA_TO_DEVICE); + + if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[1].addr))) { + ret = -ENOMEM; + goto dma1_err; + } + + mad_send_wr->header_mapping = sge[0].addr; mad_send_wr->payload_mapping = sge[1].addr; spin_lock_irqsave(&qp_info->send_queue.lock, flags); @@ -1044,14 +1060,17 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) list_add_tail(&mad_send_wr->mad_list.list, list); } spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); - if (ret) { - ib_dma_unmap_single(mad_agent->device, - mad_send_wr->header_mapping, - sge[0].length, DMA_TO_DEVICE); - ib_dma_unmap_single(mad_agent->device, - mad_send_wr->payload_mapping, - sge[1].length, DMA_TO_DEVICE); - } + + if (!ret) + return 0; + + ib_dma_unmap_single(mad_agent->device, + mad_send_wr->header_mapping, + sge[1].length, DMA_TO_DEVICE); +dma1_err: + ib_dma_unmap_single(mad_agent->device, + mad_send_wr->payload_mapping, + sge[0].length, DMA_TO_DEVICE); return ret; } @@ -1199,7 +1218,10 @@ static int method_in_use(struct ib_mad_mgmt_method_table **method, { int i; - for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) { + for (i = find_first_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS); + i < IB_MGMT_MAX_METHODS; + i = find_next_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS, + 1+i)) { if ((*method)->agent[i]) { printk(KERN_ERR PFX "Method %d already in use\n", i); return -EINVAL; @@ -1333,9 +1355,13 @@ static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req, goto error3; /* Finally, add in methods being registered */ - for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) + for (i = find_first_bit(mad_reg_req->method_mask, + IB_MGMT_MAX_METHODS); + i < IB_MGMT_MAX_METHODS; + i = find_next_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS, + 1+i)) { (*method)->agent[i] = agent_priv; - + } return 0; error3: @@ -1428,9 +1454,13 @@ check_in_use: goto error4; /* Finally, add in methods being registered */ - for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) + for (i = find_first_bit(mad_reg_req->method_mask, + IB_MGMT_MAX_METHODS); + i < IB_MGMT_MAX_METHODS; + i = find_next_bit(mad_reg_req->method_mask, IB_MGMT_MAX_METHODS, + 1+i)) { (*method)->agent[i] = agent_priv; - + } return 0; error4: @@ -1838,6 +1868,28 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, } } +static int generate_unmatched_resp(struct ib_mad_private *recv, + struct ib_mad_private *response) +{ + int matched = 0; + + if ((recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_GET) || + (recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_SET)) { + memcpy(response, recv, sizeof(*response)); + response->header.recv_wc.wc = &response->header.wc; + response->header.recv_wc.recv_buf.mad = &response->mad.mad; + response->header.recv_wc.recv_buf.grh = &response->grh; + response->mad.mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP; + response->mad.mad.mad_hdr.status = + 
cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB); + if (recv->mad.mad.mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + response->mad.mad.mad_hdr.status |=IB_SMP_DIRECTION; + matched = 1; + } + + return matched; +} + static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, struct ib_wc *wc) { @@ -1847,6 +1899,7 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, struct ib_mad_list_head *mad_list; struct ib_mad_agent_private *mad_agent; int port_num; + int ret = IB_MAD_RESULT_SUCCESS; mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; qp_info = mad_list->mad_queue->qp_info; @@ -1930,7 +1983,6 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, local: /* Give driver "right of first refusal" on incoming MAD */ if (port_priv->device->process_mad) { - int ret; ret = port_priv->device->process_mad(port_priv->device, 0, port_priv->port_num, @@ -1959,7 +2011,10 @@ local: * or via recv_handler in ib_mad_complete_recv() */ recv = NULL; - } + } else if ((ret & IB_MAD_RESULT_SUCCESS) && + generate_unmatched_resp(recv, response)) + agent_send_response(&response->mad.mad, &recv->grh, wc, + port_priv->device, port_num, qp_info->qp->qp_num); out: /* Post another receive request for this QP */ @@ -1974,10 +2029,9 @@ out: static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv) { struct ib_mad_send_wr_private *mad_send_wr; - unsigned long delay; if (list_empty(&mad_agent_priv->wait_list)) { - __cancel_delayed_work(&mad_agent_priv->timed_work); + del_timer(&mad_agent_priv->timeout_timer); } else { mad_send_wr = list_entry(mad_agent_priv->wait_list.next, struct ib_mad_send_wr_private, @@ -1986,13 +2040,8 @@ static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv) if (time_after(mad_agent_priv->timeout, mad_send_wr->timeout)) { mad_agent_priv->timeout = mad_send_wr->timeout; - __cancel_delayed_work(&mad_agent_priv->timed_work); - delay = mad_send_wr->timeout - jiffies; - if ((long)delay <= 0) - delay = 1; - queue_delayed_work(mad_agent_priv->qp_info-> - port_priv->wq, - &mad_agent_priv->timed_work, delay); + mod_timer(&mad_agent_priv->timeout_timer, + mad_send_wr->timeout); } } } @@ -2019,17 +2068,14 @@ static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr) temp_mad_send_wr->timeout)) break; } - } - else + } else list_item = &mad_agent_priv->wait_list; list_add(&mad_send_wr->agent_list, list_item); /* Reschedule a work item if we have a shorter timeout */ - if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list) { - __cancel_delayed_work(&mad_agent_priv->timed_work); - queue_delayed_work(mad_agent_priv->qp_info->port_priv->wq, - &mad_agent_priv->timed_work, delay); - } + if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list) + mod_timer(&mad_agent_priv->timeout_timer, + mad_send_wr->timeout); } void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, @@ -2473,10 +2519,10 @@ static void timeout_sends(struct work_struct *work) struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_wr_private *mad_send_wr; struct ib_mad_send_wc mad_send_wc; - unsigned long flags, delay; + unsigned long flags; mad_agent_priv = container_of(work, struct ib_mad_agent_private, - timed_work.work); + timeout_work); mad_send_wc.vendor_err = 0; spin_lock_irqsave(&mad_agent_priv->lock, flags); @@ -2486,12 +2532,8 @@ static void timeout_sends(struct work_struct *work) agent_list); if (time_after(mad_send_wr->timeout, jiffies)) { - delay = 
mad_send_wr->timeout - jiffies; - if ((long)delay <= 0) - delay = 1; - queue_delayed_work(mad_agent_priv->qp_info-> - port_priv->wq, - &mad_agent_priv->timed_work, delay); + mod_timer(&mad_agent_priv->timeout_timer, + mad_send_wr->timeout); break; } @@ -2567,6 +2609,14 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, sizeof *mad_priv - sizeof mad_priv->header, DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device, + sg_list.addr))) { + ret = -ENOMEM; + kmem_cache_free(ib_mad_cache, mad_priv); + printk(KERN_ERR PFX "ib_dma_map_single failed\n"); + break; + } + mad_priv->header.mapping = sg_list.addr; recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list; mad_priv->header.mad_list.mad_queue = recv_queue; @@ -2809,7 +2859,7 @@ static int ib_mad_port_open(struct ib_device *device, init_mad_qp(port_priv, &port_priv->qp_info[1]); cq_size = mad_sendq_size + mad_recvq_size; - has_smi = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND; + has_smi = rdma_port_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND; if (has_smi) cq_size *= 2; @@ -2978,9 +3028,6 @@ static void ib_mad_remove_device(struct ib_device *device) { int i, num_ports, cur_port; - if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) - return; - if (device->node_type == RDMA_NODE_IB_SWITCH) { num_ports = 1; cur_port = 0; @@ -3015,6 +3062,8 @@ static int __init ib_mad_init_module(void) mad_sendq_size = min(mad_sendq_size, IB_MAD_QP_MAX_SIZE); mad_sendq_size = max(mad_sendq_size, IB_MAD_QP_MIN_SIZE); + spin_lock_init(&ib_mad_port_list_lock); + ib_mad_cache = kmem_cache_create("ib_mad", sizeof(struct ib_mad_private), 0, @@ -3050,3 +3099,4 @@ static void __exit ib_mad_cleanup_module(void) module_init(ib_mad_init_module); module_exit(ib_mad_cleanup_module); + diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h index 9430ab4969c55..8b4df0a33e0b1 100644 --- a/drivers/infiniband/core/mad_priv.h +++ b/drivers/infiniband/core/mad_priv.h @@ -102,7 +102,8 @@ struct ib_mad_agent_private { struct list_head send_list; struct list_head wait_list; struct list_head done_list; - struct delayed_work timed_work; + struct work_struct timeout_work; + struct timer_list timeout_timer; unsigned long timeout; struct list_head local_list; struct work_struct local_work; diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index 68b4162fd9d2b..a49ac583691a1 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -251,34 +251,6 @@ static u8 get_leave_state(struct mcast_group *group) return leave_state & group->rec.join_state; } -static int check_selector(ib_sa_comp_mask comp_mask, - ib_sa_comp_mask selector_mask, - ib_sa_comp_mask value_mask, - u8 selector, u8 src_value, u8 dst_value) -{ - int err; - - if (!(comp_mask & selector_mask) || !(comp_mask & value_mask)) - return 0; - - switch (selector) { - case IB_SA_GT: - err = (src_value <= dst_value); - break; - case IB_SA_LT: - err = (src_value >= dst_value); - break; - case IB_SA_EQ: - err = (src_value != dst_value); - break; - default: - err = 0; - break; - } - - return err; -} - static int cmp_rec(struct ib_sa_mcmember_rec *src, struct ib_sa_mcmember_rec *dst, ib_sa_comp_mask comp_mask) { @@ -291,24 +263,24 @@ static int cmp_rec(struct ib_sa_mcmember_rec *src, return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid) return -EINVAL; - if (check_selector(comp_mask, 
IB_SA_MCMEMBER_REC_MTU_SELECTOR, - IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector, - src->mtu, dst->mtu)) + if (ib_sa_check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR, + IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector, + src->mtu, dst->mtu)) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS && src->traffic_class != dst->traffic_class) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey) return -EINVAL; - if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR, - IB_SA_MCMEMBER_REC_RATE, dst->rate_selector, - src->rate, dst->rate)) + if (ib_sa_check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR, + IB_SA_MCMEMBER_REC_RATE, dst->rate_selector, + src->rate, dst->rate)) return -EINVAL; - if (check_selector(comp_mask, - IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR, - IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME, - dst->packet_life_time_selector, - src->packet_life_time, dst->packet_life_time)) + if (ib_sa_check_selector(comp_mask, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME, + dst->packet_life_time_selector, + src->packet_life_time, dst->packet_life_time)) return -EINVAL; if (comp_mask & IB_SA_MCMEMBER_REC_SL && src->sl != dst->sl) return -EINVAL; @@ -546,7 +518,7 @@ static void leave_handler(int status, struct ib_sa_mcmember_rec *rec, { struct mcast_group *group = context; - if (status && group->retries > 0 && + if (status && (group->retries > 0) && !send_leave(group, group->leave_state)) group->retries--; else @@ -774,7 +746,7 @@ static void mcast_event_handler(struct ib_event_handler *handler, int index; dev = container_of(handler, struct mcast_device, event_handler); - if (rdma_port_get_link_layer(dev->device, event->element.port_num) != + if (rdma_port_link_layer(dev->device, event->element.port_num) != IB_LINK_LAYER_INFINIBAND) return; @@ -805,7 +777,7 @@ static void mcast_add_one(struct ib_device *device) if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) return; - dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port, + dev = kzalloc(sizeof *dev + device->phys_port_cnt * sizeof *port, GFP_KERNEL); if (!dev) return; @@ -818,7 +790,7 @@ static void mcast_add_one(struct ib_device *device) } for (i = 0; i <= dev->end_port - dev->start_port; i++) { - if (rdma_port_get_link_layer(device, dev->start_port + i) != + if (rdma_port_link_layer(device, dev->start_port + i) != IB_LINK_LAYER_INFINIBAND) continue; port = &dev->port[i]; @@ -857,7 +829,7 @@ static void mcast_remove_one(struct ib_device *device) flush_workqueue(mcast_wq); for (i = 0; i <= dev->end_port - dev->start_port; i++) { - if (rdma_port_get_link_layer(device, dev->start_port + i) == + if (rdma_port_link_layer(device, dev->start_port + i) == IB_LINK_LAYER_INFINIBAND) { port = &dev->port[i]; deref_port(port); diff --git a/drivers/infiniband/core/notice.c b/drivers/infiniband/core/notice.c new file mode 100644 index 0000000000000..4a8d98f3e06ce --- /dev/null +++ b/drivers/infiniband/core/notice.c @@ -0,0 +1,749 @@ +/* + * Copyright (c) 2006 Intel Corporation.  All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sa.h" + +MODULE_AUTHOR("Sean Hefty"); +MODULE_DESCRIPTION("InfiniBand InformInfo & Notice event handling"); +MODULE_LICENSE("Dual BSD/GPL"); + +static void inform_add_one(struct ib_device *device); +static void inform_remove_one(struct ib_device *device); + +static struct ib_client inform_client = { + .name = "ib_notice", + .add = inform_add_one, + .remove = inform_remove_one +}; + +static struct ib_sa_client sa_client; +static struct workqueue_struct *inform_wq; + +struct inform_device; + +struct inform_port { + struct inform_device *dev; + spinlock_t lock; + struct rb_root table; + atomic_t refcount; + struct completion comp; + u8 port_num; +}; + +struct inform_device { + struct ib_device *device; + struct ib_event_handler event_handler; + int start_port; + int end_port; + struct inform_port port[0]; +}; + +enum inform_state { + INFORM_IDLE, + INFORM_REGISTERING, + INFORM_MEMBER, + INFORM_BUSY, + INFORM_ERROR +}; + +struct inform_member; + +struct inform_group { + u16 trap_number; + struct rb_node node; + struct inform_port *port; + spinlock_t lock; + struct work_struct work; + struct list_head pending_list; + struct list_head active_list; + struct list_head notice_list; + struct inform_member *last_join; + int members; + enum inform_state join_state; /* State relative to SA */ + atomic_t refcount; + enum inform_state state; + struct ib_sa_query *query; + int query_id; +}; + +struct inform_member { + struct ib_inform_info info; + struct ib_sa_client *client; + struct inform_group *group; + struct list_head list; + enum inform_state state; + atomic_t refcount; + struct completion comp; +}; + +struct inform_notice { + struct list_head list; + struct ib_sa_notice notice; +}; + +static void reg_handler(int status, struct ib_sa_inform *inform, + void *context); +static void unreg_handler(int status, struct ib_sa_inform *inform, + void *context); + +static struct inform_group *inform_find(struct inform_port *port, + u16 trap_number) +{ + struct rb_node *node = port->table.rb_node; + struct inform_group *group; + + while (node) { + group = rb_entry(node, struct inform_group, node); + if (trap_number < group->trap_number) + node = node->rb_left; + else if (trap_number > 
group->trap_number) + node = node->rb_right; + else + return group; + } + return NULL; +} + +static struct inform_group *inform_insert(struct inform_port *port, + struct inform_group *group) +{ + struct rb_node **link = &port->table.rb_node; + struct rb_node *parent = NULL; + struct inform_group *cur_group; + + while (*link) { + parent = *link; + cur_group = rb_entry(parent, struct inform_group, node); + if (group->trap_number < cur_group->trap_number) + link = &(*link)->rb_left; + else if (group->trap_number > cur_group->trap_number) + link = &(*link)->rb_right; + else + return cur_group; + } + rb_link_node(&group->node, parent, link); + rb_insert_color(&group->node, &port->table); + return NULL; +} + +static void deref_port(struct inform_port *port) +{ + if (atomic_dec_and_test(&port->refcount)) + complete(&port->comp); +} + +static void release_group(struct inform_group *group) +{ + struct inform_port *port = group->port; + unsigned long flags; + + spin_lock_irqsave(&port->lock, flags); + if (atomic_dec_and_test(&group->refcount)) { + rb_erase(&group->node, &port->table); + spin_unlock_irqrestore(&port->lock, flags); + kfree(group); + deref_port(port); + } else + spin_unlock_irqrestore(&port->lock, flags); +} + +static void deref_member(struct inform_member *member) +{ + if (atomic_dec_and_test(&member->refcount)) + complete(&member->comp); +} + +static void queue_reg(struct inform_member *member) +{ + struct inform_group *group = member->group; + unsigned long flags; + + spin_lock_irqsave(&group->lock, flags); + list_add(&member->list, &group->pending_list); + if (group->state == INFORM_IDLE) { + group->state = INFORM_BUSY; + atomic_inc(&group->refcount); + queue_work(inform_wq, &group->work); + } + spin_unlock_irqrestore(&group->lock, flags); +} + +static int send_reg(struct inform_group *group, struct inform_member *member) +{ + struct inform_port *port = group->port; + struct ib_sa_inform inform; + int ret; + + memset(&inform, 0, sizeof inform); + inform.lid_range_begin = cpu_to_be16(0xFFFF); + inform.is_generic = 1; + inform.subscribe = 1; + inform.type = cpu_to_be16(IB_SA_EVENT_TYPE_ALL); + inform.trap.generic.trap_num = cpu_to_be16(member->info.trap_number); + inform.trap.generic.resp_time = 19; + inform.trap.generic.producer_type = + cpu_to_be32(IB_SA_EVENT_PRODUCER_TYPE_ALL); + + group->last_join = member; + ret = ib_sa_informinfo_query(&sa_client, port->dev->device, + port->port_num, &inform, 3000, GFP_KERNEL, + reg_handler, group,&group->query); + if (ret >= 0) { + group->query_id = ret; + ret = 0; + } + return ret; +} + +static int send_unreg(struct inform_group *group) +{ + struct inform_port *port = group->port; + struct ib_sa_inform inform; + int ret; + + memset(&inform, 0, sizeof inform); + inform.lid_range_begin = cpu_to_be16(0xFFFF); + inform.is_generic = 1; + inform.type = cpu_to_be16(IB_SA_EVENT_TYPE_ALL); + inform.trap.generic.trap_num = cpu_to_be16(group->trap_number); + inform.trap.generic.qpn = IB_QP1; + inform.trap.generic.resp_time = 19; + inform.trap.generic.producer_type = + cpu_to_be32(IB_SA_EVENT_PRODUCER_TYPE_ALL); + + ret = ib_sa_informinfo_query(&sa_client, port->dev->device, + port->port_num, &inform, 3000, GFP_KERNEL, + unreg_handler, group, &group->query); + if (ret >= 0) { + group->query_id = ret; + ret = 0; + } + return ret; +} + +static void join_group(struct inform_group *group, struct inform_member *member) +{ + member->state = INFORM_MEMBER; + group->members++; + list_move(&member->list, &group->active_list); +} + +static int fail_join(struct 
inform_group *group, struct inform_member *member, + int status) +{ + spin_lock_irq(&group->lock); + list_del_init(&member->list); + spin_unlock_irq(&group->lock); + return member->info.callback(status, &member->info, NULL); +} + +static void process_group_error(struct inform_group *group) +{ + struct inform_member *member; + int ret; + + spin_lock_irq(&group->lock); + while (!list_empty(&group->active_list)) { + member = list_entry(group->active_list.next, + struct inform_member, list); + atomic_inc(&member->refcount); + list_del_init(&member->list); + group->members--; + member->state = INFORM_ERROR; + spin_unlock_irq(&group->lock); + + ret = member->info.callback(-ENETRESET, &member->info, NULL); + deref_member(member); + if (ret) + ib_sa_unregister_inform_info(&member->info); + spin_lock_irq(&group->lock); + } + + group->join_state = INFORM_IDLE; + group->state = INFORM_BUSY; + spin_unlock_irq(&group->lock); +} + +/* + * Report a notice to all active subscribers. We use a temporary list to + * handle unsubscription requests while the notice is being reported, which + * avoids holding the group lock while in the user's callback. + */ +static void process_notice(struct inform_group *group, + struct inform_notice *info_notice) +{ + struct inform_member *member; + struct list_head list; + int ret; + + INIT_LIST_HEAD(&list); + + spin_lock_irq(&group->lock); + list_splice_init(&group->active_list, &list); + while (!list_empty(&list)) { + + member = list_entry(list.next, struct inform_member, list); + atomic_inc(&member->refcount); + list_move(&member->list, &group->active_list); + spin_unlock_irq(&group->lock); + + ret = member->info.callback(0, &member->info, + &info_notice->notice); + deref_member(member); + if (ret) + ib_sa_unregister_inform_info(&member->info); + spin_lock_irq(&group->lock); + } + spin_unlock_irq(&group->lock); +} + +static void inform_work_handler(struct work_struct *work) +{ + struct inform_group *group; + struct inform_member *member; + struct ib_inform_info *info; + struct inform_notice *info_notice; + int status, ret; + + group = container_of(work, typeof(*group), work); +retest: + spin_lock_irq(&group->lock); + while (!list_empty(&group->pending_list) || + !list_empty(&group->notice_list) || + (group->state == INFORM_ERROR)) { + + if (group->state == INFORM_ERROR) { + spin_unlock_irq(&group->lock); + process_group_error(group); + goto retest; + } + + if (!list_empty(&group->notice_list)) { + info_notice = list_entry(group->notice_list.next, + struct inform_notice, list); + list_del(&info_notice->list); + spin_unlock_irq(&group->lock); + process_notice(group, info_notice); + kfree(info_notice); + goto retest; + } + + member = list_entry(group->pending_list.next, + struct inform_member, list); + info = &member->info; + atomic_inc(&member->refcount); + + if (group->join_state == INFORM_MEMBER) { + join_group(group, member); + spin_unlock_irq(&group->lock); + ret = info->callback(0, info, NULL); + } else { + spin_unlock_irq(&group->lock); + status = send_reg(group, member); + if (!status) { + deref_member(member); + return; + } + ret = fail_join(group, member, status); + } + + deref_member(member); + if (ret) + ib_sa_unregister_inform_info(&member->info); + spin_lock_irq(&group->lock); + } + + if (!group->members && (group->join_state == INFORM_MEMBER)) { + group->join_state = INFORM_IDLE; + spin_unlock_irq(&group->lock); + if (send_unreg(group)) + goto retest; + } else { + group->state = INFORM_IDLE; + spin_unlock_irq(&group->lock); + release_group(group); + } +} + 
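+/*
+ * Typical use of the InformInfo API exported below (illustrative sketch
+ * only; assumes an ib_sa_client already registered with
+ * ib_sa_register_client() and a caller-defined handler):
+ *
+ *	static int gid_trap_handler(int status, struct ib_inform_info *info,
+ *				    struct ib_sa_notice *notice)
+ *	{
+ *		if (status)
+ *			return 0;
+ *		...
+ *		return 0;
+ *	}
+ *
+ *	info = ib_sa_register_inform_info(&client, device, port_num,
+ *					  IB_SA_SM_TRAP_GID_IN_SERVICE,
+ *					  GFP_KERNEL, gid_trap_handler, NULL);
+ *
+ * The handler is called with a NULL notice to report registration status
+ * (including -ENETRESET when the subscription is lost) and with a notice
+ * for each matching trap; returning non-zero from the handler drops the
+ * registration.
+ */
+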
+/* + * Fail a join request if it is still active - at the head of the pending queue. + */ +static void process_join_error(struct inform_group *group, int status) +{ + struct inform_member *member; + int ret; + + spin_lock_irq(&group->lock); + member = list_entry(group->pending_list.next, + struct inform_member, list); + if (group->last_join == member) { + atomic_inc(&member->refcount); + list_del_init(&member->list); + spin_unlock_irq(&group->lock); + ret = member->info.callback(status, &member->info, NULL); + deref_member(member); + if (ret) + ib_sa_unregister_inform_info(&member->info); + } else + spin_unlock_irq(&group->lock); +} + +static void reg_handler(int status, struct ib_sa_inform *inform, void *context) +{ + struct inform_group *group = context; + + if (status) + process_join_error(group, status); + else + group->join_state = INFORM_MEMBER; + + inform_work_handler(&group->work); +} + +static void unreg_handler(int status, struct ib_sa_inform *rec, void *context) +{ + struct inform_group *group = context; + + inform_work_handler(&group->work); +} + +int notice_dispatch(struct ib_device *device, u8 port_num, + struct ib_sa_notice *notice) +{ + struct inform_device *dev; + struct inform_port *port; + struct inform_group *group; + struct inform_notice *info_notice; + + dev = ib_get_client_data(device, &inform_client); + if (!dev) + return 0; /* No one to give notice to. */ + + port = &dev->port[port_num - dev->start_port]; + spin_lock_irq(&port->lock); + group = inform_find(port, __be16_to_cpu(notice->trap. + generic.trap_num)); + if (!group) { + spin_unlock_irq(&port->lock); + return 0; + } + + atomic_inc(&group->refcount); + spin_unlock_irq(&port->lock); + + info_notice = kmalloc(sizeof *info_notice, GFP_KERNEL); + if (!info_notice) { + release_group(group); + return -ENOMEM; + } + + info_notice->notice = *notice; + + spin_lock_irq(&group->lock); + list_add(&info_notice->list, &group->notice_list); + if (group->state == INFORM_IDLE) { + group->state = INFORM_BUSY; + spin_unlock_irq(&group->lock); + inform_work_handler(&group->work); + } else { + spin_unlock_irq(&group->lock); + release_group(group); + } + + return 0; +} + +static struct inform_group *acquire_group(struct inform_port *port, + u16 trap_number, gfp_t gfp_mask) +{ + struct inform_group *group, *cur_group; + unsigned long flags; + + spin_lock_irqsave(&port->lock, flags); + group = inform_find(port, trap_number); + if (group) + goto found; + spin_unlock_irqrestore(&port->lock, flags); + + group = kzalloc(sizeof *group, gfp_mask); + if (!group) + return NULL; + + group->port = port; + group->trap_number = trap_number; + INIT_LIST_HEAD(&group->pending_list); + INIT_LIST_HEAD(&group->active_list); + INIT_LIST_HEAD(&group->notice_list); + INIT_WORK(&group->work, inform_work_handler); + spin_lock_init(&group->lock); + + spin_lock_irqsave(&port->lock, flags); + cur_group = inform_insert(port, group); + if (cur_group) { + kfree(group); + group = cur_group; + } else + atomic_inc(&port->refcount); +found: + atomic_inc(&group->refcount); + spin_unlock_irqrestore(&port->lock, flags); + return group; +} + +/* + * We serialize all join requests to a single group to make our lives much + * easier. Otherwise, two users could try to join the same group + * simultaneously, with different configurations, one could leave while the + * join is in progress, etc., which makes locking around error recovery + * difficult. 
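+ *
+ * Serialization works by only starting a group's work when the group is
+ * INFORM_IDLE: whoever starts it marks the group INFORM_BUSY under
+ * group->lock first, so at most one instance of inform_work_handler()
+ * drains a group's pending, notice and error work at any time.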
+ */ +struct ib_inform_info * +ib_sa_register_inform_info(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + u16 trap_number, gfp_t gfp_mask, + int (*callback)(int status, + struct ib_inform_info *info, + struct ib_sa_notice *notice), + void *context) +{ + struct inform_device *dev; + struct inform_member *member; + struct ib_inform_info *info; + int ret; + + dev = ib_get_client_data(device, &inform_client); + if (!dev) + return ERR_PTR(-ENODEV); + + member = kzalloc(sizeof *member, gfp_mask); + if (!member) + return ERR_PTR(-ENOMEM); + + ib_sa_client_get(client); + member->client = client; + member->info.trap_number = trap_number; + member->info.callback = callback; + member->info.context = context; + init_completion(&member->comp); + atomic_set(&member->refcount, 1); + member->state = INFORM_REGISTERING; + + member->group = acquire_group(&dev->port[port_num - dev->start_port], + trap_number, gfp_mask); + if (!member->group) { + ret = -ENOMEM; + goto err; + } + + /* + * The user will get the info structure in their callback. They + * could then free the info structure before we can return from + * this routine. So we save the pointer to return before queuing + * any callback. + */ + info = &member->info; + queue_reg(member); + return info; + +err: + ib_sa_client_put(member->client); + kfree(member); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(ib_sa_register_inform_info); + +void ib_sa_unregister_inform_info(struct ib_inform_info *info) +{ + struct inform_member *member; + struct inform_group *group; + + member = container_of(info, struct inform_member, info); + group = member->group; + + spin_lock_irq(&group->lock); + if (member->state == INFORM_MEMBER) + group->members--; + + list_del_init(&member->list); + + if (group->state == INFORM_IDLE) { + group->state = INFORM_BUSY; + spin_unlock_irq(&group->lock); + /* Continue to hold reference on group until callback */ + queue_work(inform_wq, &group->work); + } else { + spin_unlock_irq(&group->lock); + release_group(group); + } + + deref_member(member); + wait_for_completion(&member->comp); + ib_sa_client_put(member->client); + kfree(member); +} +EXPORT_SYMBOL(ib_sa_unregister_inform_info); + +static void inform_groups_lost(struct inform_port *port) +{ + struct inform_group *group; + struct rb_node *node; + unsigned long flags; + + spin_lock_irqsave(&port->lock, flags); + for (node = rb_first(&port->table); node; node = rb_next(node)) { + group = rb_entry(node, struct inform_group, node); + spin_lock(&group->lock); + if (group->state == INFORM_IDLE) { + atomic_inc(&group->refcount); + queue_work(inform_wq, &group->work); + } + group->state = INFORM_ERROR; + spin_unlock(&group->lock); + } + spin_unlock_irqrestore(&port->lock, flags); +} + +static void inform_event_handler(struct ib_event_handler *handler, + struct ib_event *event) +{ + struct inform_device *dev; + + dev = container_of(handler, struct inform_device, event_handler); + + switch (event->event) { + case IB_EVENT_PORT_ERR: + case IB_EVENT_LID_CHANGE: + case IB_EVENT_SM_CHANGE: + case IB_EVENT_CLIENT_REREGISTER: + inform_groups_lost(&dev->port[event->element.port_num - + dev->start_port]); + break; + default: + break; + } +} + +static void inform_add_one(struct ib_device *device) +{ + struct inform_device *dev; + struct inform_port *port; + int i; + + if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + return; + + dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port, + GFP_KERNEL); + if (!dev) + return; + + if (device->node_type == 
RDMA_NODE_IB_SWITCH) + dev->start_port = dev->end_port = 0; + else { + dev->start_port = 1; + dev->end_port = device->phys_port_cnt; + } + + for (i = 0; i <= dev->end_port - dev->start_port; i++) { + port = &dev->port[i]; + port->dev = dev; + port->port_num = dev->start_port + i; + spin_lock_init(&port->lock); + port->table = RB_ROOT; + init_completion(&port->comp); + atomic_set(&port->refcount, 1); + } + + dev->device = device; + ib_set_client_data(device, &inform_client, dev); + + INIT_IB_EVENT_HANDLER(&dev->event_handler, device, inform_event_handler); + ib_register_event_handler(&dev->event_handler); +} + +static void inform_remove_one(struct ib_device *device) +{ + struct inform_device *dev; + struct inform_port *port; + int i; + + dev = ib_get_client_data(device, &inform_client); + if (!dev) + return; + + ib_unregister_event_handler(&dev->event_handler); + flush_workqueue(inform_wq); + + for (i = 0; i <= dev->end_port - dev->start_port; i++) { + port = &dev->port[i]; + deref_port(port); + wait_for_completion(&port->comp); + } + + kfree(dev); +} + +int notice_init(void) +{ + int ret; + + inform_wq = create_singlethread_workqueue("ib_inform"); + if (!inform_wq) + return -ENOMEM; + + ib_sa_register_client(&sa_client); + + ret = ib_register_client(&inform_client); + if (ret) + goto err; + return 0; + +err: + ib_sa_unregister_client(&sa_client); + destroy_workqueue(inform_wq); + return ret; +} + +void notice_cleanup(void) +{ + ib_unregister_client(&inform_client); + ib_sa_unregister_client(&sa_client); + destroy_workqueue(inform_wq); +} diff --git a/drivers/infiniband/core/sa.h b/drivers/infiniband/core/sa.h index b1d4bbf4ce5c7..b8abdd767b6c6 100644 --- a/drivers/infiniband/core/sa.h +++ b/drivers/infiniband/core/sa.h @@ -48,6 +48,29 @@ static inline void ib_sa_client_put(struct ib_sa_client *client) complete(&client->comp); } +int ib_sa_check_selector(ib_sa_comp_mask comp_mask, + ib_sa_comp_mask selector_mask, + ib_sa_comp_mask value_mask, + u8 selector, u8 src_value, u8 dst_value); + +int ib_sa_pack_attr(void *dst, void *src, int attr_id); + +int ib_sa_unpack_attr(void *dst, void *src, int attr_id); + +int ib_sa_path_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_path_rec *rec, + ib_sa_comp_mask comp_mask, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_path_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query); + +int sa_db_init(void); +void sa_db_cleanup(void); + int ib_sa_mcmember_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, u8 method, @@ -63,4 +86,20 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client, int mcast_init(void); void mcast_cleanup(void); +int ib_sa_informinfo_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_inform *rec, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_inform *resp, + void *context), + void *context, + struct ib_sa_query **sa_query); + +int notice_dispatch(struct ib_device *device, u8 port_num, + struct ib_sa_notice *notice); + +int notice_init(void); +void notice_cleanup(void); + #endif /* SA_H */ diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index fbbfa24cf5724..623562e69a625 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -60,10 +60,12 @@ struct ib_sa_sm_ah { struct ib_sa_port { struct ib_mad_agent *agent; + struct ib_mad_agent *notice_agent; struct 
ib_sa_sm_ah *sm_ah; struct work_struct update_task; spinlock_t ah_lock; u8 port_num; + struct ib_device *device; }; struct ib_sa_device { @@ -93,13 +95,23 @@ struct ib_sa_path_query { void *context; struct ib_sa_query sa_query; }; - +struct ib_sa_guidinfo_query { + void (*callback)(int, struct ib_sa_guidinfo_rec *, void *); + void *context; + struct ib_sa_query sa_query; +}; struct ib_sa_mcmember_query { void (*callback)(int, struct ib_sa_mcmember_rec *, void *); void *context; struct ib_sa_query sa_query; }; +struct ib_sa_inform_query { + void (*callback)(int, struct ib_sa_inform *, void *); + void *context; + struct ib_sa_query sa_query; +}; + static void ib_sa_add_one(struct ib_device *device); static void ib_sa_remove_one(struct ib_device *device); @@ -109,10 +121,10 @@ static struct ib_client sa_client = { .remove = ib_sa_remove_one }; -static DEFINE_SPINLOCK(idr_lock); +static spinlock_t idr_lock; static DEFINE_IDR(query_idr); -static DEFINE_SPINLOCK(tid_lock); +static spinlock_t tid_lock; static u32 tid; #define PATH_REC_FIELD(field) \ @@ -215,6 +227,36 @@ static const struct ib_field path_rec_table[] = { .size_bits = 48 }, }; + +#define GUIDINFO_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \ + .struct_size_bytes = sizeof ((struct ib_sa_guidinfo_rec *) 0)->field, \ + .field_name = "sa_guidinfo_rec:" #field + +static const struct ib_field guidinfo_rec_table[] = { + { GUIDINFO_REC_FIELD(lid), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 16 }, + { GUIDINFO_REC_FIELD(block_num), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 8 }, + { GUIDINFO_REC_FIELD(res1), + .offset_words = 0, + .offset_bits = 24, + .size_bits = 8 }, + { GUIDINFO_REC_FIELD(res2), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 32 }, + { GUIDINFO_REC_FIELD(guid_info_list), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 512 }, +}; + + #define MCMEMBER_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_mcmember_rec, field), \ .struct_size_bytes = sizeof ((struct ib_sa_mcmember_rec *) 0)->field, \ @@ -347,6 +389,162 @@ static const struct ib_field service_rec_table[] = { .size_bits = 2*64 }, }; +#define INFORM_FIELD(field) \ + .struct_offset_bytes = offsetof(struct ib_sa_inform, field), \ + .struct_size_bytes = sizeof ((struct ib_sa_inform *) 0)->field, \ + .field_name = "sa_inform:" #field + +static const struct ib_field inform_table[] = { + { INFORM_FIELD(gid), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 128 }, + { INFORM_FIELD(lid_range_begin), + .offset_words = 4, + .offset_bits = 0, + .size_bits = 16 }, + { INFORM_FIELD(lid_range_end), + .offset_words = 4, + .offset_bits = 16, + .size_bits = 16 }, + { RESERVED, + .offset_words = 5, + .offset_bits = 0, + .size_bits = 16 }, + { INFORM_FIELD(is_generic), + .offset_words = 5, + .offset_bits = 16, + .size_bits = 8 }, + { INFORM_FIELD(subscribe), + .offset_words = 5, + .offset_bits = 24, + .size_bits = 8 }, + { INFORM_FIELD(type), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 16 }, + { INFORM_FIELD(trap.generic.trap_num), + .offset_words = 6, + .offset_bits = 16, + .size_bits = 16 }, + { INFORM_FIELD(trap.generic.qpn), + .offset_words = 7, + .offset_bits = 0, + .size_bits = 24 }, + { RESERVED, + .offset_words = 7, + .offset_bits = 24, + .size_bits = 3 }, + { INFORM_FIELD(trap.generic.resp_time), + .offset_words = 7, + .offset_bits = 27, + .size_bits = 5 }, + { RESERVED, + .offset_words = 8, + .offset_bits = 0, + .size_bits = 8 }, + { 
INFORM_FIELD(trap.generic.producer_type), + .offset_words = 8, + .offset_bits = 8, + .size_bits = 24 }, +}; + +#define NOTICE_FIELD(field) \ + .struct_offset_bytes = offsetof(struct ib_sa_notice, field), \ + .struct_size_bytes = sizeof ((struct ib_sa_notice *) 0)->field, \ + .field_name = "sa_notice:" #field + +static const struct ib_field notice_table[] = { + { NOTICE_FIELD(is_generic), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 1 }, + { NOTICE_FIELD(type), + .offset_words = 0, + .offset_bits = 1, + .size_bits = 7 }, + { NOTICE_FIELD(trap.generic.producer_type), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 24 }, + { NOTICE_FIELD(trap.generic.trap_num), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { NOTICE_FIELD(issuer_lid), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 }, + { NOTICE_FIELD(notice_toggle), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 1 }, + { NOTICE_FIELD(notice_count), + .offset_words = 2, + .offset_bits = 1, + .size_bits = 15 }, + { NOTICE_FIELD(data_details), + .offset_words = 2, + .offset_bits = 16, + .size_bits = 432 }, + { NOTICE_FIELD(issuer_gid), + .offset_words = 16, + .offset_bits = 0, + .size_bits = 128 }, +}; + +int ib_sa_check_selector(ib_sa_comp_mask comp_mask, + ib_sa_comp_mask selector_mask, + ib_sa_comp_mask value_mask, + u8 selector, u8 src_value, u8 dst_value) +{ + int err; + + if (!(comp_mask & selector_mask) || !(comp_mask & value_mask)) + return 0; + + switch (selector) { + case IB_SA_GT: + err = (src_value <= dst_value); + break; + case IB_SA_LT: + err = (src_value >= dst_value); + break; + case IB_SA_EQ: + err = (src_value != dst_value); + break; + default: + err = 0; + break; + } + + return err; +} + +int ib_sa_pack_attr(void *dst, void *src, int attr_id) +{ + switch (attr_id) { + case IB_SA_ATTR_PATH_REC: + ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), src, dst); + break; + default: + return -EINVAL; + } + return 0; +} + +int ib_sa_unpack_attr(void *dst, void *src, int attr_id) +{ + switch (attr_id) { + case IB_SA_ATTR_PATH_REC: + ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), src, dst); + break; + default: + return -EINVAL; + } + return 0; +} + static void free_sm_ah(struct kref *kref) { struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref); @@ -416,17 +614,17 @@ static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event struct ib_sa_port *port = &sa_dev->port[event->element.port_num - sa_dev->start_port]; - if (rdma_port_get_link_layer(handler->device, port->port_num) != IB_LINK_LAYER_INFINIBAND) - return; + if (rdma_port_link_layer(handler->device, port->port_num) == IB_LINK_LAYER_INFINIBAND) { + spin_lock_irqsave(&port->ah_lock, flags); + if (port->sm_ah) + kref_put(&port->sm_ah->ref, free_sm_ah); + port->sm_ah = NULL; + spin_unlock_irqrestore(&port->ah_lock, flags); - spin_lock_irqsave(&port->ah_lock, flags); - if (port->sm_ah) - kref_put(&port->sm_ah->ref, free_sm_ah); - port->sm_ah = NULL; - spin_unlock_irqrestore(&port->ah_lock, flags); + schedule_work(&sa_dev->port[event->element.port_num - + sa_dev->start_port].update_task); + } - queue_work(ib_wq, &sa_dev->port[event->element.port_num - - sa_dev->start_port].update_task); } } @@ -492,11 +690,11 @@ static u8 get_src_path_mask(struct ib_device *device, u8 port_num) } int ib_init_ah_from_path(struct ib_device *device, u8 port_num, - struct ib_sa_path_rec *rec, struct ib_ah_attr *ah_attr) + struct ib_sa_path_rec *rec, struct ib_ah_attr *ah_attr, + int force_grh) { int ret; 
u16 gid_index; - int force_grh; memset(ah_attr, 0, sizeof *ah_attr); ah_attr->dlid = be16_to_cpu(rec->dlid); @@ -506,8 +704,6 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, ah_attr->port_num = port_num; ah_attr->static_rate = rec->rate; - force_grh = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_ETHERNET; - if (rec->hop_limit > 1 || force_grh) { ah_attr->ah_flags = IB_AH_GRH; ah_attr->grh.dgid = rec->dgid; @@ -638,41 +834,16 @@ static void ib_sa_path_rec_release(struct ib_sa_query *sa_query) kfree(container_of(sa_query, struct ib_sa_path_query, sa_query)); } -/** - * ib_sa_path_rec_get - Start a Path get query - * @client:SA client - * @device:device to send query on - * @port_num: port number to send query on - * @rec:Path Record to send in query - * @comp_mask:component mask to send in query - * @timeout_ms:time to wait for response - * @gfp_mask:GFP mask to use for internal allocations - * @callback:function called when query completes, times out or is - * canceled - * @context:opaque user context passed to callback - * @sa_query:query context, used to cancel query - * - * Send a Path Record Get query to the SA to look up a path. The - * callback function will be called when the query completes (or - * fails); status is 0 for a successful response, -EINTR if the query - * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error - * occurred sending the query. The resp parameter of the callback is - * only valid if status is 0. - * - * If the return value of ib_sa_path_rec_get() is negative, it is an - * error code. Otherwise it is a query ID that can be used to cancel - * the query. - */ -int ib_sa_path_rec_get(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - struct ib_sa_path_rec *rec, - ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct ib_sa_path_rec *resp, - void *context), - void *context, - struct ib_sa_query **sa_query) +int ib_sa_path_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_path_rec *rec, + ib_sa_comp_mask comp_mask, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_path_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) { struct ib_sa_path_query *query; struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); @@ -729,7 +900,104 @@ err1: kfree(query); return ret; } -EXPORT_SYMBOL(ib_sa_path_rec_get); + +/*Support GuidInfoRecord*/ +static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query, + int status, + struct ib_sa_mad *mad) +{ + struct ib_sa_guidinfo_query *query = + container_of(sa_query, struct ib_sa_guidinfo_query, sa_query); + + if (mad) { + struct ib_sa_guidinfo_rec rec; + + ib_unpack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), + mad->data, &rec); + query->callback(status, &rec, query->context); + } else + query->callback(status, NULL, query->context); +} + +static void ib_sa_guidinfo_rec_release(struct ib_sa_query *sa_query) +{ + kfree(container_of(sa_query, struct ib_sa_guidinfo_query, sa_query)); +} + +int ib_sa_guid_info_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_guidinfo_rec *rec, + ib_sa_comp_mask comp_mask, u8 method, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_guidinfo_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) +{ + struct ib_sa_guidinfo_query *query; + struct ib_sa_device *sa_dev = 
ib_get_client_data(device, &sa_client); + struct ib_sa_port *port; + struct ib_mad_agent *agent; + struct ib_sa_mad *mad; + int ret; + + if (!sa_dev) + return -ENODEV; + + if (method != IB_MGMT_METHOD_SET && + method != IB_SA_METHOD_DELETE) { + return -EINVAL; + } + + port = &sa_dev->port[port_num - sa_dev->start_port]; + agent = port->agent; + + query = kmalloc(sizeof *query, gfp_mask); + if (!query) + return -ENOMEM; + + query->sa_query.port = port; + ret = alloc_mad(&query->sa_query, gfp_mask); + if (ret) + goto err1; + + ib_sa_client_get(client); + query->sa_query.client = client; + query->callback = callback; + query->context = context; + + mad = query->sa_query.mad_buf->mad; + init_mad(mad, agent); + + query->sa_query.callback = callback ? ib_sa_guidinfo_rec_callback : NULL; + query->sa_query.release = ib_sa_guidinfo_rec_release; + + mad->mad_hdr.method = method; /*IB_MGMT_METHOD_SET or IB_SA_METHOD_DELETE;*/ + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_GUID_INFO_REC); + mad->sa_hdr.comp_mask = comp_mask; + + ib_pack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), rec, mad->data); + + *sa_query = &query->sa_query; + + ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); + if (ret < 0) + goto err2; + + return ret; + +err2: + *sa_query = NULL; + ib_sa_client_put(query->sa_query.client); + free_mad(&query->sa_query); + +err1: + kfree(query); + return ret; +} +EXPORT_SYMBOL(ib_sa_guid_info_rec_query); + static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query, int status, @@ -945,11 +1213,166 @@ err1: return ret; } +static void ib_sa_inform_callback(struct ib_sa_query *sa_query, + int status, + struct ib_sa_mad *mad) +{ + struct ib_sa_inform_query *query = + container_of(sa_query, struct ib_sa_inform_query, sa_query); + + if (mad) { + struct ib_sa_inform rec; + + ib_unpack(inform_table, ARRAY_SIZE(inform_table), + mad->data, &rec); + query->callback(status, &rec, query->context); + } else + query->callback(status, NULL, query->context); +} + +static void ib_sa_inform_release(struct ib_sa_query *sa_query) +{ + kfree(container_of(sa_query, struct ib_sa_inform_query, sa_query)); +} + +/** + * ib_sa_informinfo_query - Start an InformInfo registration. + * @client:SA client + * @device:device to send query on + * @port_num: port number to send query on + * @rec:Inform record to send in query + * @timeout_ms:time to wait for response + * @gfp_mask:GFP mask to use for internal allocations + * @callback:function called when notice handler registration completes, + * times out or is canceled + * @context:opaque user context passed to callback + * @sa_query:query context, used to cancel query + * + * This function sends inform info to register with SA to receive + * in-service notice. + * The callback function will be called when the query completes (or + * fails); status is 0 for a successful response, -EINTR if the query + * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error + * occurred sending the query. The resp parameter of the callback is + * only valid if status is 0. + * + * If the return value of ib_sa_inform_query() is negative, it is an + * error code. Otherwise it is a query ID that can be used to cancel + * the query. 
+ */ +int ib_sa_informinfo_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_inform *rec, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_inform *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) +{ + struct ib_sa_inform_query *query; + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_port *port; + struct ib_mad_agent *agent; + struct ib_sa_mad *mad; + int ret; + + if (!sa_dev) + return -ENODEV; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + agent = port->agent; + + query = kmalloc(sizeof *query, gfp_mask); + if (!query) + return -ENOMEM; + + query->sa_query.port = port; + ret = alloc_mad(&query->sa_query, gfp_mask); + if (ret) + goto err1; + + ib_sa_client_get(client); + query->sa_query.client = client; + query->callback = callback; + query->context = context; + + mad = query->sa_query.mad_buf->mad; + init_mad(mad, agent); + + query->sa_query.callback = callback ? ib_sa_inform_callback : NULL; + query->sa_query.release = ib_sa_inform_release; + query->sa_query.port = port; + mad->mad_hdr.method = IB_MGMT_METHOD_SET; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_INFORM_INFO); + + ib_pack(inform_table, ARRAY_SIZE(inform_table), rec, mad->data); + + *sa_query = &query->sa_query; + ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); + if (ret < 0) + goto err2; + + return ret; + +err2: + *sa_query = NULL; + ib_sa_client_put(query->sa_query.client); + free_mad(&query->sa_query); +err1: + kfree(query); + return ret; +} + +static void ib_sa_notice_resp(struct ib_sa_port *port, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_mad_send_buf *mad_buf; + struct ib_sa_mad *mad; + int ret; + unsigned long flags; + + mad_buf = ib_create_send_mad(port->notice_agent, 1, 0, 0, + IB_MGMT_SA_HDR, IB_MGMT_SA_DATA, + GFP_KERNEL); + if (IS_ERR(mad_buf)) + return; + + mad = mad_buf->mad; + memcpy(mad, mad_recv_wc->recv_buf.mad, sizeof *mad); + mad->mad_hdr.method = IB_MGMT_METHOD_REPORT_RESP; + + spin_lock_irqsave(&port->ah_lock, flags); + if (!port->sm_ah) { + spin_unlock_irqrestore(&port->ah_lock, flags); + ib_free_send_mad(mad_buf); + return; + } + kref_get(&port->sm_ah->ref); + mad_buf->context[0] = &port->sm_ah->ref; + mad_buf->ah = port->sm_ah->ah; + spin_unlock_irqrestore(&port->ah_lock, flags); + + ret = ib_post_send_mad(mad_buf, NULL); + if (ret) + goto err; + + return; +err: + kref_put(mad_buf->context[0], free_sm_ah); + ib_free_send_mad(mad_buf); +} + static void send_handler(struct ib_mad_agent *agent, struct ib_mad_send_wc *mad_send_wc) { - struct ib_sa_query *query = mad_send_wc->send_buf->context[0]; unsigned long flags; + struct ib_sa_query *query ; + + if(NULL == mad_send_wc->send_buf->context[0]) + return; + + query = mad_send_wc->send_buf->context[0]; if (query->callback) switch (mad_send_wc->status) { @@ -998,9 +1421,36 @@ static void recv_handler(struct ib_mad_agent *mad_agent, ib_free_recv_mad(mad_recv_wc); } +static void notice_resp_handler(struct ib_mad_agent *agent, + struct ib_mad_send_wc *mad_send_wc) +{ + kref_put(mad_send_wc->send_buf->context[0], free_sm_ah); + ib_free_send_mad(mad_send_wc->send_buf); +} + +static void notice_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_recv_wc *mad_recv_wc) +{ + struct ib_sa_port *port; + struct ib_sa_mad *mad; + struct ib_sa_notice notice; + + port = mad_agent->context; + mad = (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad; + ib_unpack(notice_table, ARRAY_SIZE(notice_table), 
mad->data, &notice); + + if (!notice_dispatch(port->device, port->port_num, &notice)) + ib_sa_notice_resp(port, mad_recv_wc); + ib_free_recv_mad(mad_recv_wc); +} + static void ib_sa_add_one(struct ib_device *device) { struct ib_sa_device *sa_dev; + struct ib_mad_reg_req reg_req = { + .mgmt_class = IB_MGMT_CLASS_SUBN_ADM, + .mgmt_class_version = 2 + }; int s, e, i; if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) @@ -1024,7 +1474,7 @@ static void ib_sa_add_one(struct ib_device *device) for (i = 0; i <= e - s; ++i) { spin_lock_init(&sa_dev->port[i].ah_lock); - if (rdma_port_get_link_layer(device, i + 1) != IB_LINK_LAYER_INFINIBAND) + if (rdma_port_link_layer(device, i + 1) != IB_LINK_LAYER_INFINIBAND) continue; sa_dev->port[i].sm_ah = NULL; @@ -1034,7 +1484,19 @@ static void ib_sa_add_one(struct ib_device *device) ib_register_mad_agent(device, i + s, IB_QPT_GSI, NULL, 0, send_handler, recv_handler, sa_dev); - if (IS_ERR(sa_dev->port[i].agent)) + if (IS_ERR(sa_dev->port[i].agent)) { + sa_dev->port[i].notice_agent = ERR_PTR(-ENOMEM); + goto err; + } + + sa_dev->port[i].device = device; + set_bit(IB_MGMT_METHOD_REPORT, reg_req.method_mask); + sa_dev->port[i].notice_agent = + ib_register_mad_agent(device, i + s, IB_QPT_GSI, + &reg_req, 0, notice_resp_handler, + notice_handler, &sa_dev->port[i]); + + if (IS_ERR(sa_dev->port[i].notice_agent)) + goto err; INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah); @@ -1054,15 +1516,19 @@ static void ib_sa_add_one(struct ib_device *device) goto err; for (i = 0; i <= e - s; ++i) - if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) + if (rdma_port_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) update_sm_ah(&sa_dev->port[i].update_task); return; err: - while (--i >= 0) - if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) - ib_unregister_mad_agent(sa_dev->port[i].agent); + for (; i >= 0; --i) + if (rdma_port_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) { + if (!IS_ERR(sa_dev->port[i].notice_agent)) + ib_unregister_mad_agent(sa_dev->port[i].notice_agent); + if (!IS_ERR(sa_dev->port[i].agent)) + ib_unregister_mad_agent(sa_dev->port[i].agent); + } kfree(sa_dev); @@ -1079,10 +1545,11 @@ static void ib_sa_remove_one(struct ib_device *device) ib_unregister_event_handler(&sa_dev->event_handler); - flush_workqueue(ib_wq); + flush_scheduled_work(); for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) { - if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) { + if (rdma_port_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) { + ib_unregister_mad_agent(sa_dev->port[i].notice_agent); ib_unregister_mad_agent(sa_dev->port[i].agent); if (sa_dev->port[i].sm_ah) kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah); @@ -1097,6 +1564,9 @@ static int __init ib_sa_init(void) { int ret; + spin_lock_init(&idr_lock); + spin_lock_init(&tid_lock); + get_random_bytes(&tid, sizeof tid); ret = ib_register_client(&sa_client); @@ -1111,7 +1581,23 @@ static int __init ib_sa_init(void) goto err2; } + ret = notice_init(); + if (ret) { + printk(KERN_ERR "Couldn't initialize notice handling\n"); + goto err3; + } + + ret = sa_db_init(); + if (ret) { + printk(KERN_ERR "Couldn't initialize local SA\n"); + goto err4; + } + return 0; +err4: + notice_cleanup(); +err3: + mcast_cleanup(); err2: ib_unregister_client(&sa_client); err1: @@ -1120,7 +1606,9 @@ err1: static void __exit ib_sa_cleanup(void) { + sa_db_cleanup(); mcast_cleanup(); + notice_cleanup(); ib_unregister_client(&sa_client);
idr_destroy(&query_idr); } diff --git a/drivers/infiniband/core/smi.c b/drivers/infiniband/core/smi.c index 5855e4405d9bf..87236753bce9b 100644 --- a/drivers/infiniband/core/smi.c +++ b/drivers/infiniband/core/smi.c @@ -52,10 +52,6 @@ enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp, hop_cnt = smp->hop_cnt; /* See section 14.2.2.2, Vol 1 IB spec */ - /* C14-6 -- valid hop_cnt values are from 0 to 63 */ - if (hop_cnt >= IB_SMP_MAX_PATH_HOPS) - return IB_SMI_DISCARD; - if (!ib_get_smp_direction(smp)) { /* C14-9:1 */ if (hop_cnt && hop_ptr == 0) { @@ -137,10 +133,6 @@ enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type, hop_cnt = smp->hop_cnt; /* See section 14.2.2.2, Vol 1 IB spec */ - /* C14-6 -- valid hop_cnt values are from 0 to 63 */ - if (hop_cnt >= IB_SMP_MAX_PATH_HOPS) - return IB_SMI_DISCARD; - if (!ib_get_smp_direction(smp)) { /* C14-9:1 -- sender should have incremented hop_ptr */ if (hop_cnt && hop_ptr == 0) diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 9ab5df72df7bf..ae182f1520f2d 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -38,6 +38,7 @@ #include #include +#include struct ib_port { struct kobject kobj; @@ -79,7 +80,7 @@ static ssize_t port_attr_show(struct kobject *kobj, return port_attr->show(p, port_attr, buf); } -static const struct sysfs_ops port_sysfs_ops = { +static struct sysfs_ops port_sysfs_ops = { .show = port_attr_show }; @@ -185,18 +186,41 @@ static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused, if (ret) return ret; - switch (attr.active_speed) { - case 2: speed = " DDR"; break; - case 4: speed = " QDR"; break; - } + if ((attr.port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) && + attr.ext_active_speed) { + switch (attr.ext_active_speed) { + case 1: speed = " FDR"; break; + case 2: speed = " EDR"; break; + default: return -EINVAL; + } + + /* Legacy software will report QDR for higher speeds than QDR */ + attr.active_speed = 4; + + return sprintf(buf, "%d Gb/sec (%dX%s)\n", + ib_ext_active_speed_to_rate(attr.ext_active_speed) * + ib_width_enum_to_int(attr.active_width), + ib_width_enum_to_int(attr.active_width), speed); + } else { + switch (attr.active_speed) { + case 2: + speed = " DDR"; + break; + case 4: + speed = attr.link_encoding == 0 ? " QDR" : " FDR10"; + break; + } - rate = 25 * ib_width_enum_to_int(attr.active_width) * attr.active_speed; - if (rate < 0) - return -EINVAL; + rate = 25 * ib_width_enum_to_int(attr.active_width) * + attr.active_speed; + if (rate < 0) + return -EINVAL; + + return sprintf(buf, "%d%s Gb/sec (%dX%s)\n", + rate / 10, rate % 10 ? ".5" : "", + ib_width_enum_to_int(attr.active_width), speed); + } - return sprintf(buf, "%d%s Gb/sec (%dX%s)\n", - rate / 10, rate % 10 ? 
".5" : "", - ib_width_enum_to_int(attr.active_width), speed); } static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused, @@ -225,9 +249,9 @@ static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused, static ssize_t link_layer_show(struct ib_port *p, struct port_attribute *unused, char *buf) { - switch (rdma_port_get_link_layer(p->ibdev, p->port_num)) { + switch (rdma_port_link_layer(p->ibdev, p->port_num)) { case IB_LINK_LAYER_INFINIBAND: - return sprintf(buf, "%s\n", "InfiniBand"); + return sprintf(buf, "%s\n", "IB"); case IB_LINK_LAYER_ETHERNET: return sprintf(buf, "%s\n", "Ethernet"); default: @@ -288,14 +312,8 @@ static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr, return sprintf(buf, "0x%04x\n", pkey); } -#define PORT_PMA_ATTR(_name, _counter, _width, _offset) \ -struct port_table_attribute port_pma_attr_##_name = { \ - .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ - .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \ -} - -static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, - char *buf) +static ssize_t get_pma_counters(struct ib_port *p, struct port_attribute *attr, + char *buf, int c_ext) { struct port_table_attribute *tab_attr = container_of(attr, struct port_table_attribute, attr); @@ -306,7 +324,7 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, ssize_t ret; if (!p->ibdev->process_mad) - return sprintf(buf, "N/A (no PMA)\n"); + return -ENXIO; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); @@ -319,7 +337,10 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT; in_mad->mad_hdr.class_version = 1; in_mad->mad_hdr.method = IB_MGMT_METHOD_GET; - in_mad->mad_hdr.attr_id = cpu_to_be16(0x12); /* PortCounters */ + if (c_ext) + in_mad->mad_hdr.attr_id = IB_PMA_PORT_COUNTERS_EXT; + else + in_mad->mad_hdr.attr_id = IB_PMA_PORT_COUNTERS; in_mad->data[41] = p->port_num; /* PortSelect field */ @@ -347,6 +368,10 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, ret = sprintf(buf, "%u\n", be32_to_cpup((__be32 *)(out_mad->data + 40 + offset / 8))); break; + case 64: + ret = sprintf(buf, "%llu\n", + be64_to_cpup((__be64 *)(out_mad->data + 40 + offset / 8))); + break; default: ret = 0; } @@ -358,6 +383,18 @@ out: return ret; } +#define PORT_PMA_ATTR(_name, _counter, _width, _offset) \ +struct port_table_attribute port_pma_attr_##_name = { \ + .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ + .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \ +} + +static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, + char *buf) +{ + return get_pma_counters(p, attr, buf, 0); +} + static PORT_PMA_ATTR(symbol_error , 0, 16, 32); static PORT_PMA_ATTR(link_error_recovery , 1, 8, 48); static PORT_PMA_ATTR(link_downed , 2, 8, 56); @@ -374,6 +411,12 @@ static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192); static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224); static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256); static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288); +/* + * There is no bit allocated for port_xmit_wait in the CounterSelect field + * (IB spec). However, since this bit is ignored when reading + * (show_pma_counter), the _counter field of port_xmit_wait can be set to zero. 
+ */ +static PORT_PMA_ATTR(port_xmit_wait , 0, 32, 320); static struct attribute *pma_attrs[] = { &port_pma_attr_symbol_error.attr.attr, @@ -392,6 +435,7 @@ static struct attribute *pma_attrs[] = { &port_pma_attr_port_rcv_data.attr.attr, &port_pma_attr_port_xmit_packets.attr.attr, &port_pma_attr_port_rcv_packets.attr.attr, + &port_pma_attr_port_xmit_wait.attr.attr, NULL }; @@ -400,6 +444,44 @@ static struct attribute_group pma_group = { .attrs = pma_attrs }; +#define PORT_PMA_ATTR_EXT(_name, _counter, _width, _offset) \ +struct port_table_attribute port_pma_attr_ext_##_name = { \ + .attr = __ATTR(_name, S_IRUGO, show_pma_counter_ext, NULL), \ + .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \ +} + +static ssize_t show_pma_counter_ext(struct ib_port *p, + struct port_attribute *attr, char *buf) +{ + return get_pma_counters(p, attr, buf, 1); +} + +static PORT_PMA_ATTR_EXT(port_xmit_data_64 , 0, 64, 64); +static PORT_PMA_ATTR_EXT(port_rcv_data_64 , 0, 64, 128); +static PORT_PMA_ATTR_EXT(port_xmit_packets_64 , 0, 64, 192); +static PORT_PMA_ATTR_EXT(port_rcv_packets_64 , 0, 64, 256); +static PORT_PMA_ATTR_EXT(port_unicast_xmit_packets , 0, 64, 320); +static PORT_PMA_ATTR_EXT(port_unicast_rcv_packets , 0, 64, 384); +static PORT_PMA_ATTR_EXT(port_multicast_xmit_packets , 0, 64, 448); +static PORT_PMA_ATTR_EXT(port_multicast_rcv_packets , 0, 64, 512); + +static struct attribute *pma_attrs_ext[] = { + &port_pma_attr_ext_port_xmit_data_64.attr.attr, + &port_pma_attr_ext_port_rcv_data_64.attr.attr, + &port_pma_attr_ext_port_xmit_packets_64.attr.attr, + &port_pma_attr_ext_port_rcv_packets_64.attr.attr, + &port_pma_attr_ext_port_unicast_xmit_packets.attr.attr, + &port_pma_attr_ext_port_unicast_rcv_packets.attr.attr, + &port_pma_attr_ext_port_multicast_xmit_packets.attr.attr, + &port_pma_attr_ext_port_multicast_rcv_packets.attr.attr, + NULL +}; + +static struct attribute_group pma_ext_group = { + .name = "counters_ext", + .attrs = pma_attrs_ext +}; + static void ib_port_release(struct kobject *kobj) { struct ib_port *p = container_of(kobj, struct ib_port, kobj); @@ -476,7 +558,6 @@ alloc_group_attrs(ssize_t (*show)(struct ib_port *, element->attr.attr.mode = S_IRUGO; element->attr.show = show; element->index = i; - sysfs_attr_init(&element->attr.attr); tab_attr[i] = &element->attr.attr; } @@ -490,9 +571,7 @@ err: return NULL; } -static int add_port(struct ib_device *device, int port_num, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)) +static int add_port(struct ib_device *device, int port_num) { struct ib_port *p; struct ib_port_attr attr; @@ -520,10 +599,14 @@ static int add_port(struct ib_device *device, int port_num, if (ret) goto err_put; + ret = sysfs_create_group(&p->kobj, &pma_ext_group); + if (ret) + goto err_remove_pma; + p->gid_group.name = "gids"; p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len); if (!p->gid_group.attrs) - goto err_remove_pma; + goto err_remove_pma_ext; ret = sysfs_create_group(&p->kobj, &p->gid_group); if (ret) @@ -539,20 +622,11 @@ static int add_port(struct ib_device *device, int port_num, if (ret) goto err_free_pkey; - if (port_callback) { - ret = port_callback(device, port_num, &p->kobj); - if (ret) - goto err_remove_pkey; - } - list_add_tail(&p->kobj.entry, &device->port_list); kobject_uevent(&p->kobj, KOBJ_ADD); return 0; -err_remove_pkey: - sysfs_remove_group(&p->kobj, &p->pkey_group); - err_free_pkey: for (i = 0; i < attr.pkey_tbl_len; ++i) kfree(p->pkey_group.attrs[i]); @@ -568,6 +642,9 @@ err_free_gid: 
kfree(p->gid_group.attrs); +err_remove_pma_ext: + sysfs_remove_group(&p->kobj, &pma_ext_group); + err_remove_pma: sysfs_remove_group(&p->kobj, &pma_group); @@ -780,9 +857,7 @@ static struct attribute_group iw_stats_group = { .attrs = iw_proto_stats_attrs, }; -int ib_device_register_sysfs(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)) +int ib_device_register_sysfs(struct ib_device *device) { struct device *class_dev = &device->dev; int ret; @@ -790,8 +865,8 @@ int ib_device_register_sysfs(struct ib_device *device, class_dev->class = &ib_class; class_dev->parent = device->dma_device; - dev_set_name(class_dev, device->name); dev_set_drvdata(class_dev, device); + dev_set_name(class_dev, device->name); INIT_LIST_HEAD(&device->port_list); @@ -813,12 +888,12 @@ int ib_device_register_sysfs(struct ib_device *device, } if (device->node_type == RDMA_NODE_IB_SWITCH) { - ret = add_port(device, 0, port_callback); + ret = add_port(device, 0); if (ret) goto err_put; } else { for (i = 1; i <= device->phys_port_cnt; ++i) { - ret = add_port(device, i, port_callback); + ret = add_port(device, i); if (ret) goto err_put; } @@ -886,3 +961,22 @@ void ib_sysfs_cleanup(void) { class_unregister(&ib_class); } + +int ib_sysfs_create_port_files(struct ib_device *device, + int (*create)(struct ib_device *dev, u8 port_num, + struct kobject *kobj)) +{ + struct kobject *p; + struct ib_port *port; + int ret = 0; + + list_for_each_entry(p, &device->port_list, entry) { + port = container_of(p, struct ib_port, kobj); + ret = create(device, port->port_num, &port->kobj); + if (ret) + break; + } + + return ret; +} +EXPORT_SYMBOL(ib_sysfs_create_port_files); diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 08f948df8fa98..2690f5dfaf998 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -38,12 +38,12 @@ #include #include #include -#include #include #include #include #include #include +#include #include #include @@ -706,9 +706,14 @@ static int ib_ucm_alloc_data(const void **dest, u64 src, u32 len) if (!len) return 0; - data = memdup_user((void __user *)(unsigned long)src, len); - if (IS_ERR(data)) - return PTR_ERR(data); + data = kmalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + + if (copy_from_user(data, (void __user *)(unsigned long)src, len)) { + kfree(data); + return -EFAULT; + } *dest = data; return 0; @@ -1122,7 +1127,7 @@ static ssize_t ib_ucm_write(struct file *filp, const char __user *buf, if (copy_from_user(&hdr, buf, sizeof(hdr))) return -EFAULT; - if (hdr.cmd < 0 || hdr.cmd >= ARRAY_SIZE(ucm_cmd_table)) + if (hdr.cmd >= ARRAY_SIZE(ucm_cmd_table)) return -EINVAL; if (hdr.in + sizeof(hdr) > len) @@ -1176,7 +1181,7 @@ static int ib_ucm_open(struct inode *inode, struct file *filp) file->filp = filp; file->device = container_of(inode->i_cdev, struct ib_ucm_device, cdev); - return nonseekable_open(inode, filp); + return 0; } static int ib_ucm_close(struct inode *inode, struct file *filp) @@ -1211,20 +1216,16 @@ static void ib_ucm_release_dev(struct device *dev) ucm_dev = container_of(dev, struct ib_ucm_device, dev); cdev_del(&ucm_dev->cdev); - if (ucm_dev->devnum < IB_UCM_MAX_DEVICES) - clear_bit(ucm_dev->devnum, dev_map); - else - clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, dev_map); + clear_bit(ucm_dev->devnum, dev_map); kfree(ucm_dev); } static const struct file_operations ucm_fops = { - .owner = THIS_MODULE, - .open = ib_ucm_open, + .owner = THIS_MODULE, + .open = ib_ucm_open, .release = ib_ucm_close, 
- .write = ib_ucm_write, + .write = ib_ucm_write, .poll = ib_ucm_poll, - .llseek = no_llseek, }; static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, @@ -1237,32 +1238,8 @@ static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); -static dev_t overflow_maj; -static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES); -static int find_overflow_devnum(void) -{ - int ret; - - if (!overflow_maj) { - ret = alloc_chrdev_region(&overflow_maj, 0, IB_UCM_MAX_DEVICES, - "infiniband_cm"); - if (ret) { - printk(KERN_ERR "ucm: couldn't register dynamic device number\n"); - return ret; - } - } - - ret = find_first_zero_bit(overflow_map, IB_UCM_MAX_DEVICES); - if (ret >= IB_UCM_MAX_DEVICES) - return -1; - - return ret; -} - static void ib_ucm_add_one(struct ib_device *device) { - int devnum; - dev_t base; struct ib_ucm_device *ucm_dev; if (!device->alloc_ucontext || @@ -1275,25 +1252,16 @@ static void ib_ucm_add_one(struct ib_device *device) ucm_dev->ib_dev = device; - devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES); - if (devnum >= IB_UCM_MAX_DEVICES) { - devnum = find_overflow_devnum(); - if (devnum < 0) - goto err; - - ucm_dev->devnum = devnum + IB_UCM_MAX_DEVICES; - base = devnum + overflow_maj; - set_bit(devnum, overflow_map); - } else { - ucm_dev->devnum = devnum; - base = devnum + IB_UCM_BASE_DEV; - set_bit(devnum, dev_map); - } + ucm_dev->devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES); + if (ucm_dev->devnum >= IB_UCM_MAX_DEVICES) + goto err; + + set_bit(ucm_dev->devnum, dev_map); cdev_init(&ucm_dev->cdev, &ucm_fops); ucm_dev->cdev.owner = THIS_MODULE; kobject_set_name(&ucm_dev->cdev.kobj, "ucm%d", ucm_dev->devnum); - if (cdev_add(&ucm_dev->cdev, base, 1)) + if (cdev_add(&ucm_dev->cdev, IB_UCM_BASE_DEV + ucm_dev->devnum, 1)) goto err; ucm_dev->dev.class = &cm_class; @@ -1314,10 +1282,7 @@ err_dev: device_unregister(&ucm_dev->dev); err_cdev: cdev_del(&ucm_dev->cdev); - if (ucm_dev->devnum < IB_UCM_MAX_DEVICES) - clear_bit(devnum, dev_map); - else - clear_bit(devnum, overflow_map); + clear_bit(ucm_dev->devnum, dev_map); err: kfree(ucm_dev); return; @@ -1333,8 +1298,13 @@ static void ib_ucm_remove_one(struct ib_device *device) device_unregister(&ucm_dev->dev); } -static CLASS_ATTR_STRING(abi_version, S_IRUGO, - __stringify(IB_USER_CM_ABI_VERSION)); +static ssize_t show_abi_version(struct class *class, + struct class_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", IB_USER_CM_ABI_VERSION); +} +static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL); static int __init ib_ucm_init(void) { @@ -1347,7 +1317,7 @@ static int __init ib_ucm_init(void) goto error1; } - ret = class_create_file(&cm_class, &class_attr_abi_version.attr); + ret = class_create_file(&cm_class, &class_attr_abi_version); if (ret) { printk(KERN_ERR "ucm: couldn't create abi_version attribute\n"); goto error2; @@ -1361,7 +1331,7 @@ static int __init ib_ucm_init(void) return 0; error3: - class_remove_file(&cm_class, &class_attr_abi_version.attr); + class_remove_file(&cm_class, &class_attr_abi_version); error2: unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES); error1: @@ -1371,10 +1341,8 @@ error1: static void __exit ib_ucm_cleanup(void) { ib_unregister_client(&ucm_client); - class_remove_file(&cm_class, &class_attr_abi_version.attr); + class_remove_file(&cm_class, &class_attr_abi_version); unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES); - if (overflow_maj) - 
unregister_chrdev_region(overflow_maj, IB_UCM_MAX_DEVICES); idr_destroy(&ctx_id_table); } diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 71be5eebd683a..6d0af621cee11 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -34,13 +34,11 @@ #include #include #include -#include #include #include #include #include -#include -#include +#include #include #include @@ -51,24 +49,8 @@ MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access"); MODULE_LICENSE("Dual BSD/GPL"); -static unsigned int max_backlog = 1024; - -static struct ctl_table_header *ucma_ctl_table_hdr; -static ctl_table ucma_ctl_table[] = { - { - .procname = "max_backlog", - .data = &max_backlog, - .maxlen = sizeof max_backlog, - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { } -}; - -static struct ctl_path ucma_ctl_path[] = { - { .procname = "net" }, - { .procname = "rdma_ucm" }, - { } +enum { + UCMA_MAX_BACKLOG = 128 }; struct ucma_file { @@ -349,6 +331,7 @@ static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, ctx->cm_id = uevent->cm_id; ctx->cm_id->context = ctx; uevent->resp.id = ctx->id; + ctx->cm_id->ucontext = ctx; } if (copy_to_user((void __user *)(unsigned long)cmd.response, @@ -367,28 +350,13 @@ done: return ret; } -static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_type) -{ - switch (cmd->ps) { - case RDMA_PS_TCP: - *qp_type = IB_QPT_RC; - return 0; - case RDMA_PS_UDP: - case RDMA_PS_IPOIB: - *qp_type = IB_QPT_UD; - return 0; - default: - return -EINVAL; - } -} - -static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, - int in_len, int out_len) +static ssize_t ucma_create_id(struct ucma_file *file, + const char __user *inbuf, + int in_len, int out_len) { struct rdma_ucm_create_id cmd; struct rdma_ucm_create_id_resp resp; struct ucma_context *ctx; - enum ib_qp_type qp_type; int ret; if (out_len < sizeof(resp)) @@ -397,10 +365,6 @@ static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; - ret = ucma_get_qp_type(&cmd, &qp_type); - if (ret) - return ret; - mutex_lock(&file->mut); ctx = ucma_alloc_ctx(file); mutex_unlock(&file->mut); @@ -408,11 +372,12 @@ static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, return -ENOMEM; ctx->uid = cmd.uid; - ctx->cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps, qp_type); + ctx->cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps); if (IS_ERR(ctx->cm_id)) { ret = PTR_ERR(ctx->cm_id); goto err1; } + ctx->cm_id->ucontext = ctx; resp.id = ctx->id; if (copy_to_user((void __user *)(unsigned long)cmd.response, @@ -620,7 +585,7 @@ static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp, } static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp, - struct rdma_route *route) + struct rdma_route *route) { struct rdma_dev_addr *dev_addr; struct net_device *dev; @@ -631,15 +596,17 @@ static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp, case 0: dev_addr = &route->addr.dev_addr; dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); - if (dev) { - vid = rdma_vlan_dev_vlan_id(dev); - dev_put(dev); - } + if (dev) { +#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) + vid = vlan_dev_vlan_id(dev); +#endif + dev_put(dev); + } iboe_mac_vlan_to_ll((union ib_gid *) &resp->ib_route[0].dgid, dev_addr->dst_dev_addr, vid); 
iboe_addr_get_sgid(dev_addr, - (union ib_gid *) &resp->ib_route[0].sgid); + (union ib_gid *) &resp->ib_route[0].sgid); resp->ib_route[0].pkey = cpu_to_be16(0xffff); break; case 2: @@ -655,16 +622,6 @@ static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp, } } -static void ucma_copy_iw_route(struct rdma_ucm_query_route_resp *resp, - struct rdma_route *route) -{ - struct rdma_dev_addr *dev_addr; - - dev_addr = &route->addr.dev_addr; - rdma_addr_get_dgid(dev_addr, (union ib_gid *) &resp->ib_route[0].dgid); - rdma_addr_get_sgid(dev_addr, (union ib_gid *) &resp->ib_route[0].sgid); -} - static ssize_t ucma_query_route(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) @@ -699,10 +656,8 @@ static ssize_t ucma_query_route(struct ucma_file *file, resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid; resp.port_num = ctx->cm_id->port_num; - switch (rdma_node_get_transport(ctx->cm_id->device->node_type)) { - case RDMA_TRANSPORT_IB: - switch (rdma_port_get_link_layer(ctx->cm_id->device, - ctx->cm_id->port_num)) { + if (rdma_node_get_transport(ctx->cm_id->device->node_type) == RDMA_TRANSPORT_IB) { + switch (rdma_port_link_layer(ctx->cm_id->device, ctx->cm_id->port_num)) { case IB_LINK_LAYER_INFINIBAND: ucma_copy_ib_route(&resp, &ctx->cm_id->route); break; @@ -712,12 +667,6 @@ static ssize_t ucma_query_route(struct ucma_file *file, default: break; } - break; - case RDMA_TRANSPORT_IWARP: - ucma_copy_iw_route(&resp, &ctx->cm_id->route); - break; - default: - break; } out: @@ -781,8 +730,8 @@ static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf, if (IS_ERR(ctx)) return PTR_ERR(ctx); - ctx->backlog = cmd.backlog > 0 && cmd.backlog < max_backlog ? - cmd.backlog : max_backlog; + ctx->backlog = cmd.backlog > 0 && cmd.backlog < UCMA_MAX_BACKLOG ? + cmd.backlog : UCMA_MAX_BACKLOG; ret = rdma_listen(ctx->cm_id, ctx->backlog); ucma_put_ctx(ctx); return ret; @@ -902,13 +851,17 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname, } rdma_set_service_type(ctx->cm_id, *((u8 *) optval)); break; - case RDMA_OPTION_ID_REUSEADDR: - if (optlen != sizeof(int)) { + + case RDMA_OPTION_IB_APM: + if (optlen != sizeof(u8)) { ret = -EINVAL; break; } - ret = rdma_set_reuseaddr(ctx->cm_id, *((int *) optval) ? 
1 : 0); - break; + + if (*(u8 *)optval) + ret = rdma_enable_apm(ctx->cm_id, RDMA_ALT_PATH_BEST); + break; + default: ret = -ENOSYS; } @@ -1270,7 +1223,7 @@ static ssize_t ucma_write(struct file *filp, const char __user *buf, if (copy_from_user(&hdr, buf, sizeof(hdr))) return -EFAULT; - if (hdr.cmd < 0 || hdr.cmd >= ARRAY_SIZE(ucma_cmd_table)) + if (hdr.cmd >= ARRAY_SIZE(ucma_cmd_table)) return -EINVAL; if (hdr.in + sizeof(hdr) > len) @@ -1322,8 +1275,7 @@ static int ucma_open(struct inode *inode, struct file *filp) filp->private_data = file; file->filp = filp; - - return nonseekable_open(inode, filp); + return 0; } static int ucma_close(struct inode *inode, struct file *filp) @@ -1353,15 +1305,12 @@ static const struct file_operations ucma_fops = { .release = ucma_close, .write = ucma_write, .poll = ucma_poll, - .llseek = no_llseek, }; static struct miscdevice ucma_misc = { - .minor = MISC_DYNAMIC_MINOR, - .name = "rdma_cm", - .nodename = "infiniband/rdma_cm", - .mode = 0666, - .fops = &ucma_fops, + .minor = MISC_DYNAMIC_MINOR, + .name = "rdma_cm", + .fops = &ucma_fops, }; static ssize_t show_abi_version(struct device *dev, @@ -1383,26 +1332,16 @@ static int __init ucma_init(void) ret = device_create_file(ucma_misc.this_device, &dev_attr_abi_version); if (ret) { printk(KERN_ERR "rdma_ucm: couldn't create abi_version attr\n"); - goto err1; - } - - ucma_ctl_table_hdr = register_sysctl_paths(ucma_ctl_path, ucma_ctl_table); - if (!ucma_ctl_table_hdr) { - printk(KERN_ERR "rdma_ucm: couldn't register sysctl paths\n"); - ret = -ENOMEM; - goto err2; + goto err; } return 0; -err2: - device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); -err1: +err: misc_deregister(&ucma_misc); return ret; } static void __exit ucma_cleanup(void) { - unregister_sysctl_table(ucma_ctl_table_hdr); device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); misc_deregister(&ucma_misc); idr_destroy(&ctx_idr); diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c index 9b737ff133e21..959868e3e557a 100644 --- a/drivers/infiniband/core/ud_header.c +++ b/drivers/infiniband/core/ud_header.c @@ -230,39 +230,47 @@ void ib_ud_header_init(int payload_bytes, int immediate_present, struct ib_ud_header *header) { + u16 packet_length; + memset(header, 0, sizeof *header); if (lrh_present) { - u16 packet_length; - header->lrh.link_version = 0; header->lrh.link_next_header = grh_present ? IB_LNH_IBA_GLOBAL : IB_LNH_IBA_LOCAL; - packet_length = (IB_LRH_BYTES + - IB_BTH_BYTES + - IB_DETH_BYTES + - (grh_present ? 
IB_GRH_BYTES : 0) + - payload_bytes + - 4 + /* ICRC */ - 3) / 4; /* round up */ - header->lrh.packet_length = cpu_to_be16(packet_length); - } + packet_length = IB_LRH_BYTES; + } else + packet_length = 0; - if (vlan_present) - header->eth.type = cpu_to_be16(ETH_P_8021Q); + if (eth_present) { + if (vlan_present) { + header->eth.type = cpu_to_be16(ETH_P_8021Q); + packet_length += IB_VLAN_BYTES; + } + packet_length += IB_ETH_BYTES; + } + + packet_length += IB_BTH_BYTES + IB_DETH_BYTES + payload_bytes + + 4 + /* ICRC */ + 3; /* round up */ + packet_length /= 4; if (grh_present) { - header->grh.ip_version = 6; - header->grh.payload_length = - cpu_to_be16((IB_BTH_BYTES + - IB_DETH_BYTES + - payload_bytes + - 4 + /* ICRC */ - 3) & ~3); /* round up */ + packet_length += IB_GRH_BYTES / 4; + header->grh.ip_version = 6; + header->grh.payload_length = + cpu_to_be16((IB_BTH_BYTES + + IB_DETH_BYTES + + payload_bytes + + 4 + /* ICRC */ + 3) & ~3); /* round up */ header->grh.next_header = 0x1b; } - if (immediate_present) + if (lrh_present) + header->lrh.packet_length = cpu_to_be16(packet_length); + + if (header->immediate_present) header->bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; else header->bth.opcode = IB_OPCODE_UD_SEND_ONLY; @@ -277,6 +285,36 @@ void ib_ud_header_init(int payload_bytes, } EXPORT_SYMBOL(ib_ud_header_init); +/** + * ib_lrh_header_pack - Pack LRH header struct into wire format + * @lrh:unpacked LRH header struct + * @buf:Buffer to pack into + * + * ib_lrh_header_pack() packs the LRH header structure @lrh into + * wire format in the buffer @buf. + */ +int ib_lrh_header_pack(struct ib_unpacked_lrh *lrh, void *buf) +{ + ib_pack(lrh_table, ARRAY_SIZE(lrh_table), lrh, buf); + return 0; +} +EXPORT_SYMBOL(ib_lrh_header_pack); + +/** + * ib_lrh_header_unpack - Unpack LRH structure from wire format + * @lrh:unpacked LRH header struct + * @buf:Buffer to pack into + * + * ib_lrh_header_unpack() unpacks the LRH header structure from + * wire format (in buf) into @lrh. 
+ */ +int ib_lrh_header_unpack(void *buf, struct ib_unpacked_lrh *lrh) +{ + ib_unpack(lrh_table, ARRAY_SIZE(lrh_table), buf, lrh); + return 0; +} +EXPORT_SYMBOL(ib_lrh_header_unpack); + /** * ib_ud_header_pack - Pack UD header struct into wire format * @header:UD header struct @@ -300,11 +338,14 @@ int ib_ud_header_pack(struct ib_ud_header *header, &header->eth, buf + len); len += IB_ETH_BYTES; } + + if (header->vlan_present) { ib_pack(vlan_table, ARRAY_SIZE(vlan_table), &header->vlan, buf + len); len += IB_VLAN_BYTES; } + if (header->grh_present) { ib_pack(grh_table, ARRAY_SIZE(grh_table), &header->grh, buf + len); diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index b645e558876f8..a14f7a4da7825 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -37,23 +37,76 @@ #include #include #include -#include #include "uverbs.h" +static int allow_weak_ordering; +module_param(allow_weak_ordering, bool, 0444); +MODULE_PARM_DESC(allow_weak_ordering, "Allow weak ordering for data registered memory"); + #define IB_UMEM_MAX_PAGE_CHUNK \ ((PAGE_SIZE - offsetof(struct ib_umem_chunk, page_list)) / \ ((void *) &((struct ib_umem_chunk *) 0)->page_list[1] - \ (void *) &((struct ib_umem_chunk *) 0)->page_list[0])) +#ifdef __ia64__ +extern int dma_map_sg_hp_wa; + +static int dma_map_sg_ia64(struct ib_device *ibdev, + struct scatterlist *sg, + int nents, + enum dma_data_direction dir) +{ + int i, rc, j, lents = 0; + struct device *dev; + + if (!dma_map_sg_hp_wa) + return ib_dma_map_sg(ibdev, sg, nents, dir); + + dev = ibdev->dma_device; + for (i = 0; i < nents; ++i) { + rc = dma_map_sg(dev, sg + i, 1, dir); + if (rc <= 0) { + for (j = 0; j < i; ++j) + dma_unmap_sg(dev, sg + j, 1, dir); + + return 0; + } + lents += rc; + } + + return lents; +} + +static void dma_unmap_sg_ia64(struct ib_device *ibdev, + struct scatterlist *sg, + int nents, + enum dma_data_direction dir) +{ + int i; + struct device *dev; + + if (!dma_map_sg_hp_wa) + return ib_dma_unmap_sg(ibdev, sg, nents, dir); + + dev = ibdev->dma_device; + for (i = 0; i < nents; ++i) + dma_unmap_sg(dev, sg + i, 1, dir); +} + +#define ib_dma_map_sg(dev, sg, nents, dir) dma_map_sg_ia64(dev, sg, nents, dir) +#define ib_dma_unmap_sg(dev, sg, nents, dir) dma_unmap_sg_ia64(dev, sg, nents, dir) + +#endif + static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) { struct ib_umem_chunk *chunk, *tmp; int i; list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) { - ib_dma_unmap_sg(dev, chunk->page_list, - chunk->nents, DMA_BIDIRECTIONAL); + ib_dma_unmap_sg_attrs(dev, chunk->page_list, + chunk->nents, DMA_BIDIRECTIONAL, &chunk->attrs); for (i = 0; i < chunk->nents; ++i) { struct page *page = sg_page(&chunk->page_list[i]); @@ -92,6 +145,9 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (dmasync) dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs); + else if (allow_weak_ordering) + dma_set_attr(DMA_ATTR_WEAK_ORDERING, &attrs); + if (!can_do_mlock()) return ERR_PTR(-EPERM); @@ -137,7 +193,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, down_write(&current->mm->mmap_sem); locked = npages + current->mm->locked_vm; - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { ret = -ENOMEM; @@ -170,6 +226,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, goto out; } +
chunk->attrs = attrs; chunk->nents = min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK); sg_init_table(chunk->page_list, chunk->nents); for (i = 0; i < chunk->nents; ++i) { @@ -262,7 +319,7 @@ void ib_umem_release(struct ib_umem *umem) umem->mm = mm; umem->diff = diff; - queue_work(ib_wq, &umem->work); + schedule_work(&umem->work); return; } } else diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 8d261b6ea5fea..db3883b3259aa 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -44,8 +44,8 @@ #include #include #include -#include #include +#include #include #include @@ -66,9 +66,12 @@ enum { }; /* - * Our lifetime rules for these structs are the following: - * device special file is opened, we take a reference on the - * ib_umad_port's struct ib_umad_device. We drop these + * Our lifetime rules for these structs are the following: each time a + * device special file is opened, we look up the corresponding struct + * ib_umad_port by minor in the umad_port[] table while holding the + * port_lock. If this lookup succeeds, we take a reference on the + * ib_umad_port's struct ib_umad_device while still holding the + * port_lock; if the lookup fails, we fail the open(). We drop these * references in the corresponding close(). * * In addition to references coming from open character devices, there @@ -76,14 +79,19 @@ enum { * module's reference taken when allocating the ib_umad_device in * ib_umad_add_one(). * - * When destroying an ib_umad_device, we drop the module's reference. + * When destroying an ib_umad_device, we clear all of its + * ib_umad_ports from umad_port[] while holding port_lock before + * dropping the module's reference to the ib_umad_device. This is + * always safe because any open() calls will either succeed and obtain + * a reference before we clear the umad_port[] entries, or fail after + * we clear the umad_port[] entries. 
*/ struct ib_umad_port { - struct cdev cdev; + struct cdev *cdev; struct device *dev; - struct cdev sm_cdev; + struct cdev *sm_cdev; struct device *sm_dev; struct semaphore sm_sem; @@ -129,6 +137,7 @@ static struct class *umad_class; static const dev_t base_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE); static DEFINE_SPINLOCK(port_lock); +static struct ib_umad_port *umad_port[IB_UMAD_MAX_PORTS]; static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS); static void ib_umad_add_one(struct ib_device *device); @@ -458,8 +467,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, goto err; } - if (packet->mad.hdr.id < 0 || - packet->mad.hdr.id >= IB_UMAD_MAX_AGENTS) { + if (packet->mad.hdr.id >= IB_UMAD_MAX_AGENTS) { ret = -EINVAL; goto err; } @@ -488,8 +496,8 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, ah_attr.ah_flags = IB_AH_GRH; memcpy(ah_attr.grh.dgid.raw, packet->mad.hdr.gid, 16); ah_attr.grh.sgid_index = packet->mad.hdr.gid_index; - ah_attr.grh.flow_label = be32_to_cpu(packet->mad.hdr.flow_label); - ah_attr.grh.hop_limit = packet->mad.hdr.hop_limit; + ah_attr.grh.flow_label = be32_to_cpu(packet->mad.hdr.flow_label); + ah_attr.grh.hop_limit = packet->mad.hdr.hop_limit; ah_attr.grh.traffic_class = packet->mad.hdr.traffic_class; } @@ -520,9 +528,9 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, goto err_ah; } - packet->msg->ah = ah; + packet->msg->ah = ah; packet->msg->timeout_ms = packet->mad.hdr.timeout_ms; - packet->msg->retries = packet->mad.hdr.retries; + packet->msg->retries = packet->mad.hdr.retries; packet->msg->context[0] = packet; /* Copy MAD header. Any RMPP header is already in place. */ @@ -703,7 +711,7 @@ static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg) mutex_lock(&file->port->file_mutex); mutex_lock(&file->mutex); - if (id < 0 || id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) { + if (id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) { ret = -EINVAL; goto out; } @@ -771,22 +779,29 @@ static long ib_umad_compat_ioctl(struct file *filp, unsigned int cmd, /* * ib_umad_open() does not need the BKL: * - * - the ib_umad_port structures are properly reference counted, and + * - umad_port[] accesses are protected by port_lock, the + * ib_umad_port structures are properly reference counted, and * everything else is purely local to the file being created, so * races against other open calls are not a problem; * - the ioctl method does not affect any global state outside of the * file structure being operated on; + * - the port is added to umad_port[] as the last part of module + * initialization so the open method will either immediately run + * -ENXIO, or all required initialization will be done. 
*/ static int ib_umad_open(struct inode *inode, struct file *filp) { struct ib_umad_port *port; struct ib_umad_file *file; - int ret; + int ret = 0; - port = container_of(inode->i_cdev, struct ib_umad_port, cdev); + spin_lock(&port_lock); + port = umad_port[iminor(inode) - IB_UMAD_MINOR_BASE]; if (port) kref_get(&port->umad_dev->ref); - else + spin_unlock(&port_lock); + + if (!port) return -ENXIO; mutex_lock(&port->file_mutex); @@ -814,8 +829,6 @@ static int ib_umad_open(struct inode *inode, struct file *filp) list_add_tail(&file->port_list, &port->file_list); - ret = nonseekable_open(inode, filp); - out: mutex_unlock(&port->file_mutex); return ret; @@ -859,17 +872,16 @@ static int ib_umad_close(struct inode *inode, struct file *filp) } static const struct file_operations umad_fops = { - .owner = THIS_MODULE, - .read = ib_umad_read, - .write = ib_umad_write, - .poll = ib_umad_poll, + .owner = THIS_MODULE, + .read = ib_umad_read, + .write = ib_umad_write, + .poll = ib_umad_poll, .unlocked_ioctl = ib_umad_ioctl, #ifdef CONFIG_COMPAT - .compat_ioctl = ib_umad_compat_ioctl, + .compat_ioctl = ib_umad_compat_ioctl, #endif - .open = ib_umad_open, - .release = ib_umad_close, - .llseek = no_llseek, + .open = ib_umad_open, + .release = ib_umad_close }; static int ib_umad_sm_open(struct inode *inode, struct file *filp) @@ -880,10 +892,13 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp) }; int ret; - port = container_of(inode->i_cdev, struct ib_umad_port, sm_cdev); + spin_lock(&port_lock); + port = umad_port[iminor(inode) - IB_UMAD_MINOR_BASE - IB_UMAD_MAX_PORTS]; if (port) kref_get(&port->umad_dev->ref); - else + spin_unlock(&port_lock); + + if (!port) return -ENXIO; if (filp->f_flags & O_NONBLOCK) { @@ -906,7 +921,7 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp) filp->private_data = port; - return nonseekable_open(inode, filp); + return 0; fail: kref_put(&port->umad_dev->ref, ib_umad_release_dev); @@ -934,10 +949,9 @@ static int ib_umad_sm_close(struct inode *inode, struct file *filp) } static const struct file_operations umad_sm_fops = { - .owner = THIS_MODULE, - .open = ib_umad_sm_open, - .release = ib_umad_sm_close, - .llseek = no_llseek, + .owner = THIS_MODULE, + .open = ib_umad_sm_open, + .release = ib_umad_sm_close }; static struct ib_client umad_client = { @@ -970,54 +984,24 @@ static ssize_t show_port(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR(port, S_IRUGO, show_port, NULL); -static CLASS_ATTR_STRING(abi_version, S_IRUGO, - __stringify(IB_USER_MAD_ABI_VERSION)); - -static dev_t overflow_maj; -static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS); -static int find_overflow_devnum(void) +static ssize_t show_abi_version(struct class *class, + struct class_attribute *attr, + char *buf) { - int ret; - - if (!overflow_maj) { - ret = alloc_chrdev_region(&overflow_maj, 0, IB_UMAD_MAX_PORTS * 2, - "infiniband_mad"); - if (ret) { - printk(KERN_ERR "user_mad: couldn't register dynamic device number\n"); - return ret; - } - } - - ret = find_first_zero_bit(overflow_map, IB_UMAD_MAX_PORTS); - if (ret >= IB_UMAD_MAX_PORTS) - return -1; - - return ret; + return sprintf(buf, "%d\n", IB_USER_MAD_ABI_VERSION); } +static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL); static int ib_umad_init_port(struct ib_device *device, int port_num, struct ib_umad_port *port) { - int devnum; - dev_t base; - spin_lock(&port_lock); - devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS); - if (devnum >= IB_UMAD_MAX_PORTS) { + port->dev_num 
= find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS); + if (port->dev_num >= IB_UMAD_MAX_PORTS) { spin_unlock(&port_lock); - devnum = find_overflow_devnum(); - if (devnum < 0) - return -1; - - spin_lock(&port_lock); - port->dev_num = devnum + IB_UMAD_MAX_PORTS; - base = devnum + overflow_maj; - set_bit(devnum, overflow_map); - } else { - port->dev_num = devnum; - base = devnum + base_dev; - set_bit(devnum, dev_map); + return -1; } + set_bit(port->dev_num, dev_map); spin_unlock(&port_lock); port->ib_dev = device; @@ -1026,14 +1010,17 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, mutex_init(&port->file_mutex); INIT_LIST_HEAD(&port->file_list); - cdev_init(&port->cdev, &umad_fops); - port->cdev.owner = THIS_MODULE; - kobject_set_name(&port->cdev.kobj, "umad%d", port->dev_num); - if (cdev_add(&port->cdev, base, 1)) + port->cdev = cdev_alloc(); + if (!port->cdev) + return -1; + port->cdev->owner = THIS_MODULE; + port->cdev->ops = &umad_fops; + kobject_set_name(&port->cdev->kobj, "umad%d", port->dev_num); + if (cdev_add(port->cdev, base_dev + port->dev_num, 1)) goto err_cdev; port->dev = device_create(umad_class, device->dma_device, - port->cdev.dev, port, + port->cdev->dev, port, "umad%d", port->dev_num); if (IS_ERR(port->dev)) goto err_cdev; @@ -1043,15 +1030,17 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, if (device_create_file(port->dev, &dev_attr_port)) goto err_dev; - base += IB_UMAD_MAX_PORTS; - cdev_init(&port->sm_cdev, &umad_sm_fops); - port->sm_cdev.owner = THIS_MODULE; - kobject_set_name(&port->sm_cdev.kobj, "issm%d", port->dev_num); - if (cdev_add(&port->sm_cdev, base, 1)) + port->sm_cdev = cdev_alloc(); + if (!port->sm_cdev) + goto err_dev; + port->sm_cdev->owner = THIS_MODULE; + port->sm_cdev->ops = &umad_sm_fops; + kobject_set_name(&port->sm_cdev->kobj, "issm%d", port->dev_num); + if (cdev_add(port->sm_cdev, base_dev + port->dev_num + IB_UMAD_MAX_PORTS, 1)) goto err_sm_cdev; port->sm_dev = device_create(umad_class, device->dma_device, - port->sm_cdev.dev, port, + port->sm_cdev->dev, port, "issm%d", port->dev_num); if (IS_ERR(port->sm_dev)) goto err_sm_cdev; @@ -1061,23 +1050,24 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, if (device_create_file(port->sm_dev, &dev_attr_port)) goto err_sm_dev; + spin_lock(&port_lock); + umad_port[port->dev_num] = port; + spin_unlock(&port_lock); + return 0; err_sm_dev: - device_destroy(umad_class, port->sm_cdev.dev); + device_destroy(umad_class, port->sm_cdev->dev); err_sm_cdev: - cdev_del(&port->sm_cdev); + cdev_del(port->sm_cdev); err_dev: - device_destroy(umad_class, port->cdev.dev); + device_destroy(umad_class, port->cdev->dev); err_cdev: - cdev_del(&port->cdev); - if (port->dev_num < IB_UMAD_MAX_PORTS) - clear_bit(devnum, dev_map); - else - clear_bit(devnum, overflow_map); + cdev_del(port->cdev); + clear_bit(port->dev_num, dev_map); return -1; } @@ -1085,16 +1075,21 @@ err_cdev: static void ib_umad_kill_port(struct ib_umad_port *port) { struct ib_umad_file *file; + int already_dead; int id; dev_set_drvdata(port->dev, NULL); dev_set_drvdata(port->sm_dev, NULL); - device_destroy(umad_class, port->cdev.dev); - device_destroy(umad_class, port->sm_cdev.dev); + device_destroy(umad_class, port->cdev->dev); + device_destroy(umad_class, port->sm_cdev->dev); + + cdev_del(port->cdev); + cdev_del(port->sm_cdev); - cdev_del(&port->cdev); - cdev_del(&port->sm_cdev); + spin_lock(&port_lock); + umad_port[port->dev_num] = NULL; + spin_unlock(&port_lock); mutex_lock(&port->file_mutex); 
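/*
 * Illustrative sketch (hypothetical helper, not in the patch): the
 * open-time lookup performed by the reworked ib_umad_open() and
 * ib_umad_sm_open() above -- translate the character-device minor into a
 * umad_port[] slot and take a reference on the owning ib_umad_device
 * while port_lock is held, as described in the lifetime-rules comment.
 */
static struct ib_umad_port *umad_port_lookup(int minor)
{
	struct ib_umad_port *port;

	spin_lock(&port_lock);
	port = umad_port[minor - IB_UMAD_MINOR_BASE];
	if (port)
		kref_get(&port->umad_dev->ref);	/* dropped in close() */
	spin_unlock(&port_lock);

	return port;	/* NULL => caller fails the open() with -ENXIO */
}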
@@ -1102,6 +1097,7 @@ static void ib_umad_kill_port(struct ib_umad_port *port) list_for_each_entry(file, &port->file_list, port_list) { mutex_lock(&file->mutex); + already_dead = file->agents_dead; file->agents_dead = 1; mutex_unlock(&file->mutex); @@ -1112,10 +1108,7 @@ static void ib_umad_kill_port(struct ib_umad_port *port) mutex_unlock(&port->file_mutex); - if (port->dev_num < IB_UMAD_MAX_PORTS) - clear_bit(port->dev_num, dev_map); - else - clear_bit(port->dev_num - IB_UMAD_MAX_PORTS, overflow_map); + clear_bit(port->dev_num, dev_map); } static void ib_umad_add_one(struct ib_device *device) @@ -1147,8 +1140,9 @@ static void ib_umad_add_one(struct ib_device *device) for (i = s; i <= e; ++i) { umad_dev->port[i - s].umad_dev = umad_dev; - if (ib_umad_init_port(device, i, &umad_dev->port[i - s])) - goto err; + if (rdma_port_link_layer(device, i) == IB_LINK_LAYER_INFINIBAND) + if (ib_umad_init_port(device, i, &umad_dev->port[i - s])) + goto err; } ib_set_client_data(device, &umad_client, umad_dev); @@ -1157,7 +1151,8 @@ static void ib_umad_add_one(struct ib_device *device) err: while (--i >= s) - ib_umad_kill_port(&umad_dev->port[i - s]); + if (rdma_port_link_layer(device, i) == IB_LINK_LAYER_INFINIBAND) + ib_umad_kill_port(&umad_dev->port[i - s]); kref_put(&umad_dev->ref, ib_umad_release_dev); } @@ -1171,16 +1166,12 @@ static void ib_umad_remove_one(struct ib_device *device) return; for (i = 0; i <= umad_dev->end_port - umad_dev->start_port; ++i) - ib_umad_kill_port(&umad_dev->port[i]); + if (rdma_port_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) + ib_umad_kill_port(&umad_dev->port[i]); kref_put(&umad_dev->ref, ib_umad_release_dev); } -static char *umad_devnode(struct device *dev, mode_t *mode) -{ - return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); -} - static int __init ib_umad_init(void) { int ret; @@ -1199,9 +1190,7 @@ static int __init ib_umad_init(void) goto out_chrdev; } - umad_class->devnode = umad_devnode; - - ret = class_create_file(umad_class, &class_attr_abi_version.attr); + ret = class_create_file(umad_class, &class_attr_abi_version); if (ret) { printk(KERN_ERR "user_mad: couldn't create abi_version attribute\n"); goto out_class; @@ -1230,8 +1219,6 @@ static void __exit ib_umad_cleanup(void) ib_unregister_client(&umad_client); class_destroy(umad_class); unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2); - if (overflow_maj) - unregister_chrdev_region(overflow_maj, IB_UMAD_MAX_PORTS * 2); } module_init(ib_umad_init); diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index a078e5624d22f..7997c0fed1594 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -41,11 +41,11 @@ #include #include #include -#include #include #include #include +#include /* * Our lifetime rules for these structs are the following: @@ -70,23 +70,23 @@ struct ib_uverbs_device { struct kref ref; - int num_comp_vectors; struct completion comp; + int devnum; + struct cdev *cdev; struct device *dev; struct ib_device *ib_dev; - int devnum; - struct cdev cdev; + int num_comp_vectors; }; struct ib_uverbs_event_file { struct kref ref; - int is_async; struct ib_uverbs_file *uverbs_file; spinlock_t lock; - int is_closed; wait_queue_head_t poll_wait; struct fasync_struct *async_queue; struct list_head event_list; + int is_async; + int is_closed; }; struct ib_uverbs_file { @@ -134,14 +134,22 @@ struct ib_ucq_object { u32 async_events_reported; }; +struct ib_uxrcd_object { + struct ib_uobject uobject; + struct list_head 
xrc_reg_qp_list; +}; + extern spinlock_t ib_uverbs_idr_lock; extern struct idr ib_uverbs_pd_idr; +extern struct idr ib_uverbs_shpd_idr; extern struct idr ib_uverbs_mr_idr; +extern struct idr ib_uverbs_fmr_idr; extern struct idr ib_uverbs_mw_idr; extern struct idr ib_uverbs_ah_idr; extern struct idr ib_uverbs_cq_idr; extern struct idr ib_uverbs_qp_idr; extern struct idr ib_uverbs_srq_idr; +extern struct idr ib_uverbs_xrc_domain_idr; void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj); @@ -161,6 +169,12 @@ void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_event_handler(struct ib_event_handler *handler, struct ib_event *event); +void ib_uverbs_xrc_rcv_qp_event_handler(struct ib_event *event, + void *context_ptr); +void ib_uverbs_dealloc_xrcd(struct ib_device *ib_dev, + struct ib_xrcd *xrcd); +int ib_uverbs_cleanup_xrc_rcv_qp(struct ib_uverbs_file *file, + struct ib_xrcd *xrcd, u32 qp_num); #define IB_UVERBS_DECLARE_CMD(name) \ ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \ @@ -195,5 +209,25 @@ IB_UVERBS_DECLARE_CMD(create_srq); IB_UVERBS_DECLARE_CMD(modify_srq); IB_UVERBS_DECLARE_CMD(query_srq); IB_UVERBS_DECLARE_CMD(destroy_srq); +IB_UVERBS_DECLARE_CMD(create_xrc_srq); +IB_UVERBS_DECLARE_CMD(open_xrc_domain); +IB_UVERBS_DECLARE_CMD(close_xrc_domain); +IB_UVERBS_DECLARE_CMD(create_xrc_rcv_qp); +IB_UVERBS_DECLARE_CMD(modify_xrc_rcv_qp); +IB_UVERBS_DECLARE_CMD(query_xrc_rcv_qp); +IB_UVERBS_DECLARE_CMD(reg_xrc_rcv_qp); +IB_UVERBS_DECLARE_CMD(unreg_xrc_rcv_qp); +IB_UVERBS_DECLARE_CMD(get_eth_l2_addr); +IB_UVERBS_DECLARE_CMD(alloc_shpd); +IB_UVERBS_DECLARE_CMD(share_pd); +IB_UVERBS_DECLARE_CMD(reg_mr_relaxed); +IB_UVERBS_DECLARE_CMD(dereg_mr_relaxed); +IB_UVERBS_DECLARE_CMD(flush_relaxed_mr); + +/* FMR parameters */ +extern int ufmr_pool1_blocksize; +extern int ufmr_pool1_nelems; +extern int ufmr_pool2_blocksize; +extern int ufmr_pool2_nelems; #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index c42699285f8eb..91026d9322fdb 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -38,11 +38,27 @@ #include #include +#include +#include #include "uverbs.h" + + +/* FMR parameters */ +/* default Pool 1 block size */ +int ufmr_pool1_blocksize = 8 * 1024; +/* default no of fmrs in Pool 1 */ +int ufmr_pool1_nelems = 32 * 1024; +/* default Pool 2 block size */ +int ufmr_pool2_blocksize = 1 * 1024 * 1024; +/* default no of fmrs in Pool 2 */ +int ufmr_pool2_nelems = 4 * 1024; + static struct lock_class_key pd_lock_key; +static struct lock_class_key shpd_lock_key; static struct lock_class_key mr_lock_key; +static struct lock_class_key fmr_lock_key; static struct lock_class_key cq_lock_key; static struct lock_class_key qp_lock_key; static struct lock_class_key ah_lock_key; @@ -215,6 +231,11 @@ static void put_pd_read(struct ib_pd *pd) put_uobj_read(pd->uobject); } +static void put_pd_write(struct ib_pd *pd) +{ + put_uobj_write(pd->uobject); +} + static struct ib_cq *idr_read_cq(int cq_handle, struct ib_ucontext *context, int nested) { return idr_read_obj(&ib_uverbs_cq_idr, cq_handle, context, nested); @@ -255,6 +276,123 @@ static void put_srq_read(struct ib_srq *srq) put_uobj_read(srq->uobject); } +static struct ib_xrcd *idr_read_xrcd(int xrcd_handle, + struct ib_ucontext *context, + struct ib_uobject **uobj) +{ + *uobj = idr_read_uobj(&ib_uverbs_xrc_domain_idr, 
xrcd_handle, + context, 0); + return *uobj ? (*uobj)->object : NULL; +} + +static void put_xrcd_read(struct ib_uobject *uobj) +{ + put_uobj_read(uobj); +} + +/* + * get the number of pages by looking at the page indices that the start and + * end addresses fall in. + * + * Returns 0 if the vec is invalid. It is invalid if the number of bytes + * causes the address to wrap or overflows an unsigned int. This comes + * from being stored in the 'length' member of 'struct scatterlist'. + */ +static unsigned int get_pages_in_range(u64 addr, u64 bytes) +{ + if ((addr + bytes <= addr) || + (bytes > (u64)UINT_MAX)) + return 0; + + return ((addr + bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) - + (addr >> PAGE_SHIFT); +} + +/* Pin user pages*/ +static int fmr_pin_pages(unsigned long user_addr, unsigned int nr_pages, + struct page **pages, int write) +{ + int ret; + + down_read(¤t->mm->mmap_sem); + ret = get_user_pages(current, current->mm, user_addr, + nr_pages, write, 0, pages, NULL); + up_read(¤t->mm->mmap_sem); + + if (0 <= ret && (unsigned) ret < nr_pages) { + while (ret--) + put_page(pages[ret]); + ret = -EFAULT; + } + + return ret; +} + +static int create_fmr_pool(struct ib_pd *pd, int pages, int size, u32 access) +{ + + int ret = 0; + struct ib_fmr_pool_param fmr_param; + struct ib_fmr_pool *fmr_pool; + struct ib_relaxed_pool_data *pool_data; + struct ib_relaxed_pool_data *pos; + int found = 0; + + /*create pools - 32k fmrs of 8k buf, 4k fmrs of 1meg */ + memset(&fmr_param, 0, sizeof fmr_param); + fmr_param.pool_size = size; + fmr_param.dirty_watermark = 512; + fmr_param.cache = 0; + fmr_param.relaxed = 1; + fmr_param.max_pages_per_fmr = pages; + fmr_param.page_shift = PAGE_SHIFT; + fmr_param.access = access; + + fmr_pool = ib_create_fmr_pool(pd, &fmr_param); + + if (IS_ERR(fmr_pool)) { + ret = PTR_ERR(fmr_pool); + goto err_exit; + } + + pool_data = kmalloc(sizeof *pool_data, GFP_KERNEL); + + if (!pool_data) { + ret = -ENOMEM; + (void)ib_destroy_fmr_pool(fmr_pool); + goto err_exit; + } + + pool_data->fmr_pool = fmr_pool; + pool_data->access_flags = access; + pool_data->max_pages = pages; + list_for_each_entry(pos, &pd->device->relaxed_pool_list, pool_list) { + if (pages <= pos->max_pages) { + list_add_tail(&pool_data->pool_list, &pos->pool_list); + found = 1; + break; + } + } + if (!found) + list_add_tail(&pool_data->pool_list, + &pd->device->relaxed_pool_list); + +#ifdef DEBUG + printk(KERN_INFO "FMR POOLS :\n"); + list_for_each_entry(pos, &pd->device->relaxed_pool_list, pool_list) { + printk(KERN_INFO "\t pos -> %p, pages = %d, access = %x, " + "pool = %p\n", + pos, pos->max_pages, pos->access_flags, + pos->fmr_pool); + } +#endif + + return 0; + +err_exit: + return ret; +} + ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) @@ -293,11 +431,13 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, ucontext->device = ibdev; INIT_LIST_HEAD(&ucontext->pd_list); INIT_LIST_HEAD(&ucontext->mr_list); + INIT_LIST_HEAD(&ucontext->fmr_list); INIT_LIST_HEAD(&ucontext->mw_list); INIT_LIST_HEAD(&ucontext->cq_list); INIT_LIST_HEAD(&ucontext->qp_list); INIT_LIST_HEAD(&ucontext->srq_list); INIT_LIST_HEAD(&ucontext->ah_list); + INIT_LIST_HEAD(&ucontext->xrc_domain_list); ucontext->closing = 0; resp.num_comp_vectors = file->device->num_comp_vectors; @@ -460,8 +600,8 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, resp.active_width = attr.active_width; resp.active_speed = attr.active_speed; resp.phys_state = attr.phys_state; - 
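/*
 * Illustrative example (assumes 4 KiB pages; not part of the patch):
 * get_pages_in_range() above counts pages by page index, so even a tiny
 * buffer that straddles a page boundary needs two pages, while the same
 * length kept inside one page needs only one.
 */
static unsigned int fmr_page_count_example(void)
{
	/* 0x0ff8..0x1007 touches page indices 0 and 1 -> returns 2 */
	unsigned int spanning  = get_pages_in_range(0x0ff8, 0x10);
	/* 0x2000..0x200f stays inside page index 2 -> returns 1 */
	unsigned int contained = get_pages_in_range(0x2000, 0x10);

	return spanning + contained;	/* 3 with 4 KiB pages */
}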
resp.link_layer = rdma_port_get_link_layer(file->device->ib_dev, - cmd.port_num); + resp.link_layer = attr.link_layer; + resp.ext_active_speed = attr.ext_active_speed; if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) @@ -507,6 +647,7 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, pd->device = file->device->ib_dev; pd->uobject = uobj; + pd->shpd = NULL; /* will be filled in when pd is shared */ atomic_set(&pd->usecnt, 0); uobj->object = pd; @@ -544,13 +685,228 @@ err: return ret; } +ssize_t ib_uverbs_alloc_shpd(struct ib_uverbs_file *file, + const char __user *buf, + int in_len, int out_len) +{ + struct ib_uverbs_alloc_shpd cmd; + struct ib_uverbs_alloc_shpd_resp resp; + struct ib_udata udata; + struct ib_uobject *uobj; + struct ib_uobject *shuobj = NULL; + struct ib_pd *pd; + struct ib_shpd *shpd = NULL; + int ret; + + if (!file->device->ib_dev->alloc_shpd || + !file->device->ib_dev->share_pd || + !file->device->ib_dev->remove_shpd) + return -ENOSYS; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd, out_len - sizeof resp); + + uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext); + if (!uobj) + return -EINVAL; + + pd = uobj->object; + + /* pd can be shared only once */ + if (pd->shpd) { + ret = -EINVAL; + goto err_pd; + } + + + /* create a new uobj */ + shuobj = kmalloc(sizeof *shuobj, GFP_KERNEL); + if (!shuobj) { + ret = -ENOMEM; + goto err_pd; + } + + init_uobj(shuobj, 0, 0/* global */, &shpd_lock_key); + down_write(&shuobj->mutex); + + /* alloc shared pd from device driver */ + shpd = file->device->ib_dev->alloc_shpd(file->device->ib_dev, pd); + if (IS_ERR(shpd)) { + ret = PTR_ERR(shpd); + goto err_shobj; + } + + shpd->device = file->device->ib_dev; + shpd->uobject = shuobj; + shpd->share_key = cmd.share_key; + /* initialize shared count for this shpd */ + atomic_set(&shpd->shared, 1); + + shuobj->object = shpd; + + /* link new uobj to device level list */ + ret = idr_add_uobj(&ib_uverbs_shpd_idr, shuobj); + if (ret) + goto err_idr; + + /* return pd_handle */ + memset(&resp, 0, sizeof resp); + resp.shpd_handle = shuobj->id; + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_copy; + } + + shuobj->live = 1; + + /* mark pd as shared */ + pd->shpd = shpd; + + up_write(&shuobj->mutex); + put_pd_write(pd); + + return in_len; + +err_copy: + idr_remove_uobj(&ib_uverbs_shpd_idr, shuobj); + +err_idr: + file->device->ib_dev->remove_shpd(file->device->ib_dev, shpd, 1); + +err_shobj: + put_uobj_write(shuobj); + +err_pd: + put_pd_write(pd); + + return ret; +} + +ssize_t ib_uverbs_share_pd(struct ib_uverbs_file *file, + const char __user *buf, + int in_len, int out_len) +{ + struct ib_uverbs_share_pd cmd; + struct ib_uverbs_share_pd_resp resp; + struct ib_udata udata; + struct ib_uobject *uobj = NULL; + struct ib_uobject *shuobj; + struct ib_pd *pd; + struct ib_shpd *shpd; + int ret; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd, out_len - sizeof resp); + + /* get global uobject for the shared pd */ + shuobj = idr_read_uobj(&ib_uverbs_shpd_idr, cmd.shpd_handle, 0/* global */, 0); + if (!shuobj) + return -EINVAL; + + shpd = shuobj->object; + + /* check if the key matches */ + if (shpd->share_key != cmd.share_key) { + 
printk(KERN_WARNING "WARNING : invalid shared pd key\n"); + ret = -EINVAL; + goto err_putshpd; + } + + /* check if the devices match */ + if (strncmp(file->device->ib_dev->name, shpd->device->name, IB_DEVICE_NAME_MAX)) { + ret = -EINVAL; + goto err_putshpd; + } + + /* allocate a new user object */ + uobj = kmalloc(sizeof *uobj, GFP_KERNEL); + if (!uobj) { + ret = -ENOMEM; + goto err_putshpd; + } + + + init_uobj(uobj, 0, file->ucontext, &pd_lock_key); + down_write(&uobj->mutex); + + /* share the pd at device driver level */ + pd = file->device->ib_dev->share_pd(file->device->ib_dev, + file->ucontext, &udata, shpd); + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); + goto err_putuobj; + } + + pd->device = file->device->ib_dev; + pd->uobject = uobj; + pd->shpd = shpd; + atomic_set(&pd->usecnt, 0); + + /* initialize uobj and return pd_handle */ + uobj->object = pd; + ret = idr_add_uobj(&ib_uverbs_pd_idr, uobj); + if (ret) + goto err_idr; + + memset(&resp, 0, sizeof resp); + resp.pd_handle = uobj->id; + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_copy; + } + + mutex_lock(&file->mutex); + list_add_tail(&uobj->list, &file->ucontext->pd_list); + mutex_unlock(&file->mutex); + + uobj->live = 1; + atomic_inc(&shpd->shared); + + up_write(&uobj->mutex); + + put_uobj_read(shuobj); + + return in_len; + +err_copy: + idr_remove_uobj(&ib_uverbs_pd_idr, uobj); + +err_idr: + ib_dealloc_pd(pd); + +err_putuobj: + + put_uobj_write(uobj); + +err_putshpd: + put_uobj_read(shuobj); + + return ret; +} + ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_dealloc_pd cmd; struct ib_uobject *uobj; - int ret; + int ret = 0; + struct ib_uobject *shuobj = 0; + struct ib_pd *pd = NULL; + struct ib_shpd *shpd = NULL; + struct ib_relaxed_pool_data *pos; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; @@ -559,10 +915,38 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file, if (!uobj) return -EINVAL; + pd = uobj->object; + + /* flush all pd reference from HCA - relaxed FMR */ + list_for_each_entry(pos, &pd->device->relaxed_pool_list, pool_list) { + ib_flush_fmr_pool(pos->fmr_pool); + } + + /* is pd shared ?*/ + if (pd->shpd) { + shpd = pd->shpd; + shuobj = shpd->uobject; + } + ret = ib_dealloc_pd(uobj->object); if (!ret) uobj->live = 0; + if (!ret && shpd) { + down_write(&shuobj->mutex); + + /* if this shpd is no longer shared */ + if (atomic_dec_and_test(&shpd->shared)) { + /* free the shpd info from device driver */ + file->device->ib_dev->remove_shpd(file->device->ib_dev, shpd, 0); + shuobj->live = 0; + up_write(&shuobj->mutex); + idr_remove_uobj(&ib_uverbs_shpd_idr, shuobj); + put_uobj(shuobj); + } else + up_write(&shuobj->mutex); + } + put_uobj_write(uobj); if (ret) @@ -680,58 +1064,404 @@ err_free: return ret; } -ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) +ssize_t ib_uverbs_reg_mr_relaxed(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) { - struct ib_uverbs_dereg_mr cmd; - struct ib_mr *mr; - struct ib_uobject *uobj; - int ret = -EINVAL; + struct ib_uverbs_reg_mr cmd; + struct ib_uverbs_reg_mr_resp resp; + struct ib_udata udata; + struct ib_uobject *uobj; + struct ib_pd *pd; + int ret; + + struct ib_relaxed_pool_data *pos; + struct ib_fmr_args_relaxed rel_args; + unsigned int n; + int found = 0; + struct page **pages; + int page_cnt; + u64 *dma_pages; + struct 
scatterlist *sg; + struct ib_pool_fmr *fmr; + int fmr_mapped = 0; + + if (out_len < sizeof resp) + return -ENOSPC; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, file->ucontext); - if (!uobj) + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd, out_len - sizeof resp); + + if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)) return -EINVAL; - mr = uobj->object; + /* + * Local write permission is required if remote write or + * remote atomic permission is also requested. + */ + if (cmd.access_flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) && + !(cmd.access_flags & IB_ACCESS_LOCAL_WRITE)) + return -EINVAL; - ret = ib_dereg_mr(mr); - if (!ret) - uobj->live = 0; + /* FMRs are limited to less than 1M for now */ + if (cmd.length >= (1*1024*1024 + PAGE_SIZE - 1)) + return -EINVAL; - put_uobj_write(uobj); + uobj = kmalloc(sizeof *uobj, GFP_KERNEL); + if (!uobj) + return -ENOMEM; - if (ret) - return ret; + init_uobj(uobj, 0, file->ucontext, &fmr_lock_key); + down_write(&uobj->mutex); - idr_remove_uobj(&ib_uverbs_mr_idr, uobj); + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto err_free; + } - mutex_lock(&file->mutex); - list_del(&uobj->list); - mutex_unlock(&file->mutex); + /* Relaxed MR */ + /* pd->device has a list of FMR pools, sorted by size & access_flags */ + /* if pool is already available use that pool and map the address. if + it is not available then allocate a new pool & allocate from there */ + { - put_uobj(uobj); + n = get_pages_in_range(cmd.start, cmd.length); + if (n == 0) { + ret = -EINVAL; + goto err_put; + } - return in_len; -} + found = 0; -ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) -{ - struct ib_uverbs_create_comp_channel cmd; - struct ib_uverbs_create_comp_channel_resp resp; - struct file *filp; - int ret; + list_for_each_entry(pos, &pd->device->relaxed_pool_list, pool_list) { + if (cmd.access_flags == pos->access_flags + && n <= pos->max_pages){ + found = 1; + break; + } + } - if (out_len < sizeof resp) - return -ENOSPC; + if (!found) { + int pool1pages = (ufmr_pool1_blocksize + PAGE_SIZE) >> PAGE_SHIFT; + int pool2pages = (ufmr_pool2_blocksize + PAGE_SIZE) >> PAGE_SHIFT; + struct ib_pd *pool_pd = file->device->ib_dev->relaxed_pd; - if (copy_from_user(&cmd, buf, sizeof cmd)) - return -EFAULT; + /* Create pool for 8kb buffers */ + ret = create_fmr_pool(pool_pd, pool1pages, ufmr_pool1_nelems, + cmd.access_flags); + if (ret < 0) + goto err_put; + + /* Create pool for 1mb buffers */ + ret = create_fmr_pool(pool_pd, pool2pages, ufmr_pool2_nelems, + cmd.access_flags); + if (ret < 0) + goto err_put; + + list_for_each_entry(pos, &pd->device->relaxed_pool_list, pool_list) { + if (cmd.access_flags == pos->access_flags + && n <= pos->max_pages){ + found = 1; + break; + } + } + if (!found) { + ret = -EINVAL; + goto err_put; + } + } + + + pages = kcalloc(n, sizeof(struct page *), GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + goto err_put; + } + + ret = fmr_pin_pages(cmd.start & PAGE_MASK, n, pages, + cmd.access_flags & IB_ACCESS_LOCAL_WRITE ? 
1 : 0); + if (ret < 0) + goto err_pages_alloc; + + + /* TODO: define following as a separate function */ + if (1) { + u32 len = 0; + int sg_dma_len; + int i, j; + + page_cnt = 0; + + sg = kcalloc(n, sizeof(*sg), GFP_KERNEL); + if (sg == NULL) { + ret = -ENOMEM; + goto err_unpin; + } + sg_init_table(sg, n); + /* Stick all pages into the scatterlist */ + for (i = 0 ; i < n; i++) + sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); + + sg_dma_len = ib_dma_map_sg(pd->device, sg, n, + DMA_BIDIRECTIONAL); + if (unlikely(!sg_dma_len)) { + printk(KERN_WARNING "RFMR/IB: dma_map_sg failed!\n"); + ret = -EBUSY; + goto err_free_sg; + } + + + for (i = 0; i < sg_dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(pd->device, &sg[i]); + u64 dma_addr = ib_sg_dma_address(pd->device, &sg[i]); + + if (dma_addr & ~PAGE_MASK) { + if (i > 0) { + ret = -EINVAL; + goto err_free_sg; + } else + ++page_cnt; + } + if ((dma_addr + dma_len) & ~PAGE_MASK) { + if (i < sg_dma_len - 1) { + ret = -EINVAL; + goto err_free_sg; + } else + ++page_cnt; + } + + len += dma_len; + } + + page_cnt += len >> PAGE_SHIFT; + + dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC); + if (!dma_pages) { + ret = -ENOMEM; + goto err_free_sg; + } + + page_cnt = 0; + for (i = 0; i < sg_dma_len; ++i) { + unsigned int dma_len = ib_sg_dma_len(pd->device, &sg[i]); + u64 dma_addr = ib_sg_dma_address(pd->device, &sg[i]); + + for (j = 0; j < dma_len; j += PAGE_SIZE) { + dma_pages[page_cnt++] = + (dma_addr & PAGE_MASK) + j; + } + } + } + + + rel_args.pd = pd; + rel_args.sg = sg; + rel_args.sg_len = n; + + fmr = ib_fmr_pool_map_phys(pos->fmr_pool, dma_pages, page_cnt, + cmd.hca_va & PAGE_MASK, &rel_args); + + kfree(dma_pages); + + if (IS_ERR(fmr)) { + ret = PTR_ERR(fmr); + goto err_free_sg; + } + + fmr_mapped = 1; + + } + + fmr->fmr->device = pd->device; + fmr->fmr->pd = pd; + atomic_inc(&pd->usecnt); + + uobj->object = fmr; + ret = idr_add_uobj(&ib_uverbs_fmr_idr, uobj); + if (ret) + goto err_unreg; + + memset(&resp, 0, sizeof resp); + resp.lkey = fmr->fmr->lkey; + resp.rkey = fmr->fmr->rkey; + resp.mr_handle = uobj->id; + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_copy; + } + + kfree(pages); + + put_pd_read(pd); + + mutex_lock(&file->mutex); + list_add_tail(&uobj->list, &file->ucontext->fmr_list); + mutex_unlock(&file->mutex); + + uobj->live = 1; + + up_write(&uobj->mutex); + + return in_len; + +err_copy: + idr_remove_uobj(&ib_uverbs_fmr_idr, uobj); + +err_unreg: + ib_fmr_pool_unmap(fmr); + atomic_dec(&pd->usecnt); + +err_free_sg: + /* if mapped already, this will be freed while flushing */ + if (!fmr_mapped) + kfree(sg); + +err_unpin: + /* if mapped already, pages will be unpinned during flushing */ + if (!fmr_mapped) + while (n--) + put_page(pages[n]); + +err_pages_alloc: + kfree(pages); + + +err_put: + put_pd_read(pd); + +err_free: + put_uobj_write(uobj); + return ret; +} + +ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_dereg_mr cmd; + struct ib_mr *mr; + struct ib_uobject *uobj; + int ret = -EINVAL; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, file->ucontext); + if (!uobj) + return -EINVAL; + + mr = uobj->object; + + ret = ib_dereg_mr(mr); + if (!ret) + uobj->live = 0; + + put_uobj_write(uobj); + + if (ret) + return ret; + + idr_remove_uobj(&ib_uverbs_mr_idr, uobj); + + mutex_lock(&file->mutex); + 
list_del(&uobj->list); + mutex_unlock(&file->mutex); + + put_uobj(uobj); + + return in_len; +} + +ssize_t ib_uverbs_dereg_mr_relaxed(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_dereg_mr cmd; + struct ib_uobject *uobj; + int ret = -EINVAL; + struct ib_pool_fmr *fmr; + struct ib_pd *pd; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + uobj = idr_write_uobj(&ib_uverbs_fmr_idr, cmd.mr_handle, file->ucontext); + if (!uobj) + return -EINVAL; + + fmr = uobj->object; + pd = fmr->fmr->pd; + + ret = ib_fmr_pool_unmap(fmr); + if (!ret) + uobj->live = 0; + + put_uobj_write(uobj); + + if (ret) + return ret; + + atomic_dec(&pd->usecnt); + + idr_remove_uobj(&ib_uverbs_fmr_idr, uobj); + + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); + + put_uobj(uobj); + + return in_len; +} + +ssize_t ib_uverbs_flush_relaxed_mr(struct ib_uverbs_file *file, + const char __user *buf, + int in_len, int out_len) +{ + struct ib_uverbs_flush_relaxed_mr cmd; + struct ib_uobject *uobj; + struct ib_pd *pd; + struct ib_relaxed_pool_data *pos; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext); + if (!uobj) + return -EINVAL; + + /* flush all the pools associated with the pd */ + pd = uobj->object; + list_for_each_entry(pos, &pd->device->relaxed_pool_list, pool_list) { + ib_flush_fmr_pool(pos->fmr_pool); + } + + put_uobj_write(uobj); + + return in_len; +} + +ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_create_comp_channel cmd; + struct ib_uverbs_create_comp_channel_resp resp; + struct file *filp; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; ret = get_unused_fd(); if (ret < 0) @@ -777,7 +1507,8 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, (unsigned long) cmd.response + sizeof resp, in_len - sizeof cmd, out_len - sizeof resp); - if (cmd.comp_vector >= file->device->num_comp_vectors) + if (cmd.comp_vector >= file->device->num_comp_vectors && + cmd.comp_vector != IB_CQ_VECTOR_LEAST_ATTACHED) return -EINVAL; obj = kmalloc(sizeof *obj, GFP_KERNEL); @@ -893,81 +1624,68 @@ out: return ret ? 
ret : in_len; } -static int copy_wc_to_user(void __user *dest, struct ib_wc *wc) -{ - struct ib_uverbs_wc tmp; - - tmp.wr_id = wc->wr_id; - tmp.status = wc->status; - tmp.opcode = wc->opcode; - tmp.vendor_err = wc->vendor_err; - tmp.byte_len = wc->byte_len; - tmp.ex.imm_data = (__u32 __force) wc->ex.imm_data; - tmp.qp_num = wc->qp->qp_num; - tmp.src_qp = wc->src_qp; - tmp.wc_flags = wc->wc_flags; - tmp.pkey_index = wc->pkey_index; - tmp.slid = wc->slid; - tmp.sl = wc->sl; - tmp.dlid_path_bits = wc->dlid_path_bits; - tmp.port_num = wc->port_num; - tmp.reserved = 0; - - if (copy_to_user(dest, &tmp, sizeof tmp)) - return -EFAULT; - - return 0; -} - ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_poll_cq cmd; - struct ib_uverbs_poll_cq_resp resp; - u8 __user *header_ptr; - u8 __user *data_ptr; + struct ib_uverbs_poll_cq_resp *resp; struct ib_cq *cq; - struct ib_wc wc; - int ret; + struct ib_wc *wc; + int ret = 0; + int i; + int rsize; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); - if (!cq) - return -EINVAL; + wc = kmalloc(cmd.ne * sizeof *wc, GFP_KERNEL); + if (!wc) + return -ENOMEM; - /* we copy a struct ib_uverbs_poll_cq_resp to user space */ - header_ptr = (void __user *)(unsigned long) cmd.response; - data_ptr = header_ptr + sizeof resp; + rsize = sizeof *resp + cmd.ne * sizeof(struct ib_uverbs_wc); + resp = kmalloc(rsize, GFP_KERNEL); + if (!resp) { + ret = -ENOMEM; + goto out_wc; + } - memset(&resp, 0, sizeof resp); - while (resp.count < cmd.ne) { - ret = ib_poll_cq(cq, 1, &wc); - if (ret < 0) - goto out_put; - if (!ret) - break; + cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0); + if (!cq) { + ret = -EINVAL; + goto out; + } - ret = copy_wc_to_user(data_ptr, &wc); - if (ret) - goto out_put; + resp->count = ib_poll_cq(cq, cmd.ne, wc); - data_ptr += sizeof(struct ib_uverbs_wc); - ++resp.count; + put_cq_read(cq); + + for (i = 0; i < resp->count; i++) { + resp->wc[i].wr_id = wc[i].wr_id; + resp->wc[i].status = wc[i].status; + resp->wc[i].opcode = wc[i].opcode; + resp->wc[i].vendor_err = wc[i].vendor_err; + resp->wc[i].byte_len = wc[i].byte_len; + resp->wc[i].ex.imm_data = (__u32 __force) wc[i].ex.imm_data; + resp->wc[i].qp_num = wc[i].qp->qp_num; + resp->wc[i].src_qp = wc[i].src_qp; + resp->wc[i].wc_flags = wc[i].wc_flags; + resp->wc[i].pkey_index = wc[i].pkey_index; + resp->wc[i].slid = wc[i].slid; + resp->wc[i].sl = wc[i].sl; + resp->wc[i].dlid_path_bits = wc[i].dlid_path_bits; + resp->wc[i].port_num = wc[i].port_num; } - if (copy_to_user(header_ptr, &resp, sizeof resp)) { + if (copy_to_user((void __user *) (unsigned long) cmd.response, resp, rsize)) ret = -EFAULT; - goto out_put; - } - ret = in_len; +out: + kfree(resp); -out_put: - put_cq_read(cq); - return ret; +out_wc: + kfree(wc); + return ret ? ret : in_len; } ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file, @@ -1057,6 +1775,8 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, struct ib_srq *srq; struct ib_qp *qp; struct ib_qp_init_attr attr; + struct ib_xrcd *xrcd; + struct ib_uobject *xrcd_uobj; int ret; if (out_len < sizeof resp) @@ -1076,17 +1796,22 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_key); down_write(&obj->uevent.uobject.mutex); - srq = cmd.is_srq ? idr_read_srq(cmd.srq_handle, file->ucontext) : NULL; + srq = (cmd.is_srq && cmd.qp_type != IB_QPT_XRC) ? 
+ idr_read_srq(cmd.srq_handle, file->ucontext) : NULL; + xrcd = cmd.qp_type == IB_QPT_XRC ? + idr_read_xrcd(cmd.srq_handle, file->ucontext, &xrcd_uobj) : NULL; pd = idr_read_pd(cmd.pd_handle, file->ucontext); scq = idr_read_cq(cmd.send_cq_handle, file->ucontext, 0); rcq = cmd.recv_cq_handle == cmd.send_cq_handle ? scq : idr_read_cq(cmd.recv_cq_handle, file->ucontext, 1); - if (!pd || !scq || !rcq || (cmd.is_srq && !srq)) { + if (!pd || !scq || !rcq || (cmd.is_srq && !srq) || + (cmd.qp_type == IB_QPT_XRC && !xrcd)) { ret = -EINVAL; goto err_put; } + attr.create_flags = 0; attr.event_handler = ib_uverbs_qp_event_handler; attr.qp_context = file; attr.send_cq = scq; @@ -1094,6 +1819,7 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, attr.srq = srq; attr.sq_sig_type = cmd.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; attr.qp_type = cmd.qp_type; + attr.xrc_domain = xrcd; attr.create_flags = 0; attr.cap.max_send_wr = cmd.max_send_wr; @@ -1121,11 +1847,14 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, qp->event_handler = attr.event_handler; qp->qp_context = attr.qp_context; qp->qp_type = attr.qp_type; + qp->xrcd = attr.xrc_domain; atomic_inc(&pd->usecnt); atomic_inc(&attr.send_cq->usecnt); atomic_inc(&attr.recv_cq->usecnt); if (attr.srq) atomic_inc(&attr.srq->usecnt); + else if (attr.xrc_domain) + atomic_inc(&attr.xrc_domain->usecnt); obj->uevent.uobject.object = qp; ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject); @@ -1153,6 +1882,8 @@ ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, put_cq_read(rcq); if (srq) put_srq_read(srq); + if (xrcd) + put_xrcd_read(xrcd_uobj); mutex_lock(&file->mutex); list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list); @@ -1179,6 +1910,8 @@ err_put: put_cq_read(rcq); if (srq) put_srq_read(srq); + if (xrcd) + put_xrcd_read(xrcd_uobj); put_uobj_write(&obj->uevent.uobject); return ret; @@ -1855,6 +2588,38 @@ err: return ret; } +ssize_t ib_uverbs_get_eth_l2_addr(struct ib_uverbs_file *file, const char __user *buf, + int in_len, int out_len) +{ + struct ib_uverbs_get_eth_l2_addr cmd; + struct ib_uverbs_get_eth_l2_addr_resp resp; + int ret; + struct ib_pd *pd; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd) + return -EINVAL; + + ret = ib_get_eth_l2_addr(pd->device, cmd.port, (union ib_gid *)cmd.gid, + cmd.sgid_idx, resp.mac, &resp.vlan_id); + put_pd_read(pd); + if (!ret) { + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) + return -EFAULT; + + return in_len; + } + + return ret; +} + ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { @@ -2031,6 +2796,8 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, srq->uobject = &obj->uobject; srq->event_handler = attr.event_handler; srq->srq_context = attr.srq_context; + srq->xrc_cq = NULL; + srq->xrcd = NULL; atomic_inc(&pd->usecnt); atomic_set(&srq->usecnt, 0); @@ -2076,16 +2843,147 @@ err: return ret; } -ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file, +ssize_t ib_uverbs_create_xrc_srq(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) { - struct ib_uverbs_modify_srq cmd; - struct ib_udata udata; - struct ib_srq *srq; - struct ib_srq_attr attr; - int ret; - + struct ib_uverbs_create_xrc_srq cmd; + struct ib_uverbs_create_srq_resp resp; + struct ib_udata udata; + struct ib_uevent_object *obj; + struct ib_pd 
*pd; + struct ib_srq *srq; + struct ib_cq *xrc_cq; + struct ib_xrcd *xrcd; + struct ib_srq_init_attr attr; + struct ib_uobject *xrcd_uobj; + int ret; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd, out_len - sizeof resp); + + obj = kmalloc(sizeof *obj, GFP_KERNEL); + if (!obj) + return -ENOMEM; + + init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, + &srq_lock_key); + down_write(&obj->uobject.mutex); + + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto err; + } + + xrc_cq = idr_read_cq(cmd.xrc_cq, file->ucontext, 0); + if (!xrc_cq) { + ret = -EINVAL; + goto err_put_pd; + } + + xrcd = idr_read_xrcd(cmd.xrcd_handle, file->ucontext, &xrcd_uobj); + if (!xrcd) { + ret = -EINVAL; + goto err_put_cq; + } + + + attr.event_handler = ib_uverbs_srq_event_handler; + attr.srq_context = file; + attr.attr.max_wr = cmd.max_wr; + attr.attr.max_sge = cmd.max_sge; + attr.attr.srq_limit = cmd.srq_limit; + + obj->events_reported = 0; + INIT_LIST_HEAD(&obj->event_list); + + srq = pd->device->create_xrc_srq(pd, xrc_cq, xrcd, &attr, &udata); + if (IS_ERR(srq)) { + ret = PTR_ERR(srq); + goto err_put; + } + + srq->device = pd->device; + srq->pd = pd; + srq->uobject = &obj->uobject; + srq->event_handler = attr.event_handler; + srq->srq_context = attr.srq_context; + srq->xrc_cq = xrc_cq; + srq->xrcd = xrcd; + atomic_inc(&pd->usecnt); + atomic_inc(&xrc_cq->usecnt); + atomic_inc(&xrcd->usecnt); + + atomic_set(&srq->usecnt, 0); + + obj->uobject.object = srq; + ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uobject); + if (ret) + goto err_destroy; + + memset(&resp, 0, sizeof resp); + resp.srq_handle = obj->uobject.id; + resp.max_wr = attr.attr.max_wr; + resp.max_sge = attr.attr.max_sge; + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_copy; + } + + put_xrcd_read(xrcd_uobj); + put_cq_read(xrc_cq); + put_pd_read(pd); + + mutex_lock(&file->mutex); + list_add_tail(&obj->uobject.list, &file->ucontext->srq_list); + mutex_unlock(&file->mutex); + + obj->uobject.live = 1; + + up_write(&obj->uobject.mutex); + + return in_len; + +err_copy: + idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uobject); + +err_destroy: + ib_destroy_srq(srq); + +err_put: + put_xrcd_read(xrcd_uobj); + +err_put_cq: + put_cq_read(xrc_cq); + +err_put_pd: + put_pd_read(pd); + +err: + put_uobj_write(&obj->uobject); + return ret; +} + +ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_modify_srq cmd; + struct ib_udata udata; + struct ib_srq *srq; + struct ib_srq_attr attr; + int ret; + if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; @@ -2194,3 +3092,695 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, return ret ? 
ret : in_len; } + +static struct inode *xrc_file2inode(struct file *f) +{ + return f->f_dentry->d_inode; +} + +struct xrcd_table_entry { + struct rb_node node; + struct inode *inode; + struct ib_xrcd *xrcd; +}; + +static int xrcd_table_insert(struct ib_device *dev, + struct inode *i_n, + struct ib_xrcd *xrcd) +{ + struct xrcd_table_entry *entry, *scan; + struct rb_node **p = &dev->ib_uverbs_xrcd_table.rb_node; + struct rb_node *parent = NULL; + + entry = kmalloc(sizeof(struct xrcd_table_entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->inode = i_n; + entry->xrcd = xrcd; + + while (*p) { + parent = *p; + scan = rb_entry(parent, struct xrcd_table_entry, node); + + if (i_n < scan->inode) + p = &(*p)->rb_left; + else if (i_n > scan->inode) + p = &(*p)->rb_right; + else { + kfree(entry); + return -EEXIST; + } + } + + rb_link_node(&entry->node, parent, p); + rb_insert_color(&entry->node, &dev->ib_uverbs_xrcd_table); + igrab(i_n); + return 0; +} + +static struct xrcd_table_entry *xrcd_table_search(struct ib_device *dev, + struct inode *i_n) +{ + struct xrcd_table_entry *scan; + struct rb_node **p = &dev->ib_uverbs_xrcd_table.rb_node; + struct rb_node *parent = NULL; + + while (*p) { + parent = *p; + scan = rb_entry(parent, struct xrcd_table_entry, node); + + if (i_n < scan->inode) + p = &(*p)->rb_left; + else if (i_n > scan->inode) + p = &(*p)->rb_right; + else + return scan; + } + return NULL; +} + +static int find_xrcd(struct ib_device *dev, struct inode *i_n, + struct ib_xrcd **xrcd) +{ + struct xrcd_table_entry *entry; + + entry = xrcd_table_search(dev, i_n); + if (!entry) + return -EINVAL; + + *xrcd = entry->xrcd; + return 0; +} + + +static void xrcd_table_delete(struct ib_device *dev, + struct inode *i_n) +{ + struct xrcd_table_entry *entry = xrcd_table_search(dev, i_n); + + if (entry) { + iput(i_n); + rb_erase(&entry->node, &dev->ib_uverbs_xrcd_table); + kfree(entry); + } +} + +ssize_t ib_uverbs_open_xrc_domain(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_open_xrc_domain cmd; + struct ib_uverbs_open_xrc_domain_resp resp; + struct ib_udata udata; + struct ib_uobject *uobj; + struct ib_uxrcd_object *xrcd_uobj; + struct ib_xrcd *xrcd = NULL; + struct file *f = NULL; + struct inode *inode = NULL; + int ret = 0; + int new_xrcd = 0; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof cmd, + (unsigned long) cmd.response + sizeof resp, + in_len - sizeof cmd, out_len - sizeof resp); + + mutex_lock(&file->device->ib_dev->xrcd_table_mutex); + if (cmd.fd != (u32) (-1)) { + /* search for file descriptor */ + f = fget(cmd.fd); + if (!f) { + ret = -EBADF; + goto err_table_mutex_unlock; + } + + inode = xrc_file2inode(f); + if (!inode) { + ret = -EBADF; + goto err_table_mutex_unlock; + } + + ret = find_xrcd(file->device->ib_dev, inode, &xrcd); + if (ret && !(cmd.oflags & O_CREAT)) { + /* no file descriptor. 
Need CREATE flag */ + ret = -EAGAIN; + goto err_table_mutex_unlock; + } + + if (xrcd && cmd.oflags & O_EXCL) { + ret = -EINVAL; + goto err_table_mutex_unlock; + } + } + + xrcd_uobj = kmalloc(sizeof *xrcd_uobj, GFP_KERNEL); + if (!xrcd_uobj) { + ret = -ENOMEM; + goto err_table_mutex_unlock; + } + + uobj = &xrcd_uobj->uobject; + init_uobj(uobj, 0, file->ucontext, &pd_lock_key); + down_write(&uobj->mutex); + + if (!xrcd) { + xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev, + file->ucontext, &udata); + if (IS_ERR(xrcd)) { + ret = PTR_ERR(xrcd); + goto err; + } + xrcd->uobject = (cmd.fd == -1) ? uobj : NULL; + xrcd->inode = inode; + xrcd->device = file->device->ib_dev; + atomic_set(&xrcd->usecnt, 0); + new_xrcd = 1; + } + + uobj->object = xrcd; + ret = idr_add_uobj(&ib_uverbs_xrc_domain_idr, uobj); + if (ret) + goto err_idr; + + memset(&resp, 0, sizeof resp); + resp.xrcd_handle = uobj->id; + + if (inode) { + if (new_xrcd) { + /* create new inode/xrcd table entry */ + ret = xrcd_table_insert(file->device->ib_dev, inode, xrcd); + if (ret) + goto err_insert_xrcd; + } + atomic_inc(&xrcd->usecnt); + } + if (f) + fput(f); + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) { + ret = -EFAULT; + goto err_copy; + } + + INIT_LIST_HEAD(&xrcd_uobj->xrc_reg_qp_list); + + mutex_lock(&file->mutex); + list_add_tail(&uobj->list, &file->ucontext->xrc_domain_list); + mutex_unlock(&file->mutex); + + uobj->live = 1; + + up_write(&uobj->mutex); + + mutex_unlock(&file->device->ib_dev->xrcd_table_mutex); + return in_len; + +err_copy: + + if (inode) { + if (new_xrcd) + xrcd_table_delete(file->device->ib_dev, inode); + atomic_dec(&xrcd->usecnt); + } + +err_insert_xrcd: + idr_remove_uobj(&ib_uverbs_xrc_domain_idr, uobj); + +err_idr: + ib_dealloc_xrcd(xrcd); + +err: + put_uobj_write(uobj); + +err_table_mutex_unlock: + + if (f) + fput(f); + mutex_unlock(&file->device->ib_dev->xrcd_table_mutex); + return ret; +} + +ssize_t ib_uverbs_close_xrc_domain(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_close_xrc_domain cmd; + struct ib_uobject *uobj, *t_uobj; + struct ib_uxrcd_object *xrcd_uobj; + struct ib_xrcd *xrcd = NULL; + struct inode *inode = NULL; + int ret = 0; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + mutex_lock(&file->device->ib_dev->xrcd_table_mutex); + uobj = idr_write_uobj(&ib_uverbs_xrc_domain_idr, cmd.xrcd_handle, + file->ucontext); + if (!uobj) { + ret = -EINVAL; + goto err_unlock_mutex; + } + + mutex_lock(&file->mutex); + if (!ret) { + list_for_each_entry(t_uobj, &file->ucontext->qp_list, list) { + struct ib_qp *qp = t_uobj->object; + if (qp->xrcd && qp->xrcd == uobj->object) { + ret = -EBUSY; + break; + } + } + } + if (!ret) { + list_for_each_entry(t_uobj, &file->ucontext->srq_list, list) { + struct ib_srq *srq = t_uobj->object; + if (srq->xrcd && srq->xrcd == uobj->object) { + ret = -EBUSY; + break; + } + } + } + mutex_unlock(&file->mutex); + if (ret) { + put_uobj_write(uobj); + goto err_unlock_mutex; + } + + xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject); + if (!list_empty(&xrcd_uobj->xrc_reg_qp_list)) { + ret = -EBUSY; + put_uobj_write(uobj); + goto err_unlock_mutex; + } + + xrcd = (struct ib_xrcd *) (uobj->object); + inode = xrcd->inode; + + if (inode) + atomic_dec(&xrcd->usecnt); + + ret = ib_dealloc_xrcd(uobj->object); + if (!ret) + uobj->live = 0; + + put_uobj_write(uobj); + + if (ret && !inode) + goto err_unlock_mutex; + + if (!ret && inode) + 
xrcd_table_delete(file->device->ib_dev, inode); + + idr_remove_uobj(&ib_uverbs_xrc_domain_idr, uobj); + + mutex_lock(&file->mutex); + list_del(&uobj->list); + mutex_unlock(&file->mutex); + + put_uobj(uobj); + + mutex_unlock(&file->device->ib_dev->xrcd_table_mutex); + return in_len; + +err_unlock_mutex: + mutex_unlock(&file->device->ib_dev->xrcd_table_mutex); + return ret; +} + +void ib_uverbs_dealloc_xrcd(struct ib_device *ib_dev, + struct ib_xrcd *xrcd) +{ + struct inode *inode = NULL; + int ret = 0; + + inode = xrcd->inode; + if (inode) + atomic_dec(&xrcd->usecnt); + + ret = ib_dealloc_xrcd(xrcd); + if (!ret && inode) + xrcd_table_delete(ib_dev, inode); +} + +ssize_t ib_uverbs_create_xrc_rcv_qp(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_create_xrc_rcv_qp cmd; + struct ib_uverbs_create_xrc_rcv_qp_resp resp; + struct ib_uxrc_rcv_object *obj; + struct ib_qp_init_attr init_attr; + struct ib_xrcd *xrcd; + struct ib_uobject *uobj; + struct ib_uxrcd_object *xrcd_uobj; + u32 qp_num; + int err; + + if (out_len < sizeof resp) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + obj = kzalloc(sizeof *obj, GFP_KERNEL); + if (!obj) + return -ENOMEM; + + xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj); + if (!xrcd) { + err = -EINVAL; + goto err_out; + } + + init_attr.event_handler = ib_uverbs_xrc_rcv_qp_event_handler; + init_attr.qp_context = file; + init_attr.srq = NULL; + init_attr.sq_sig_type = + cmd.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + init_attr.qp_type = IB_QPT_XRC; + init_attr.xrc_domain = xrcd; + + init_attr.cap.max_send_wr = 1; + init_attr.cap.max_recv_wr = 0; + init_attr.cap.max_send_sge = 1; + init_attr.cap.max_recv_sge = 0; + init_attr.cap.max_inline_data = 0; + + err = xrcd->device->create_xrc_rcv_qp(&init_attr, &qp_num); + if (err) + goto err_put; + + memset(&resp, 0, sizeof resp); + resp.qpn = qp_num; + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) { + err = -EFAULT; + goto err_destroy; + } + + atomic_inc(&xrcd->usecnt); + put_xrcd_read(uobj); + obj->qp_num = qp_num; + obj->domain_handle = cmd.xrc_domain_handle; + xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject); + mutex_lock(&file->device->ib_dev->xrcd_table_mutex); + list_add_tail(&obj->list, &xrcd_uobj->xrc_reg_qp_list); + mutex_unlock(&file->device->ib_dev->xrcd_table_mutex); + + return in_len; + +err_destroy: + xrcd->device->unreg_xrc_rcv_qp(xrcd, file, qp_num); +err_put: + put_xrcd_read(uobj); +err_out: + kfree(obj); + return err; +} + +ssize_t ib_uverbs_modify_xrc_rcv_qp(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_modify_xrc_rcv_qp cmd; + struct ib_qp_attr *attr; + struct ib_xrcd *xrcd; + struct ib_uobject *uobj; + int err; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + attr = kzalloc(sizeof *attr, GFP_KERNEL); + if (!attr) + return -ENOMEM; + + xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj); + if (!xrcd) { + kfree(attr); + return -EINVAL; + } + + attr->qp_state = cmd.qp_state; + attr->cur_qp_state = cmd.cur_qp_state; + attr->qp_access_flags = cmd.qp_access_flags; + attr->pkey_index = cmd.pkey_index; + attr->port_num = cmd.port_num; + attr->path_mtu = cmd.path_mtu; + attr->path_mig_state = cmd.path_mig_state; + attr->qkey = cmd.qkey; + attr->rq_psn = cmd.rq_psn; + attr->sq_psn = cmd.sq_psn; + attr->dest_qp_num = cmd.dest_qp_num; + 
attr->alt_pkey_index = cmd.alt_pkey_index; + attr->en_sqd_async_notify = cmd.en_sqd_async_notify; + attr->max_rd_atomic = cmd.max_rd_atomic; + attr->max_dest_rd_atomic = cmd.max_dest_rd_atomic; + attr->min_rnr_timer = cmd.min_rnr_timer; + attr->port_num = cmd.port_num; + attr->timeout = cmd.timeout; + attr->retry_cnt = cmd.retry_cnt; + attr->rnr_retry = cmd.rnr_retry; + attr->alt_port_num = cmd.alt_port_num; + attr->alt_timeout = cmd.alt_timeout; + + memcpy(attr->ah_attr.grh.dgid.raw, cmd.dest.dgid, 16); + attr->ah_attr.grh.flow_label = cmd.dest.flow_label; + attr->ah_attr.grh.sgid_index = cmd.dest.sgid_index; + attr->ah_attr.grh.hop_limit = cmd.dest.hop_limit; + attr->ah_attr.grh.traffic_class = cmd.dest.traffic_class; + attr->ah_attr.dlid = cmd.dest.dlid; + attr->ah_attr.sl = cmd.dest.sl; + attr->ah_attr.src_path_bits = cmd.dest.src_path_bits; + attr->ah_attr.static_rate = cmd.dest.static_rate; + attr->ah_attr.ah_flags = cmd.dest.is_global ? IB_AH_GRH : 0; + attr->ah_attr.port_num = cmd.dest.port_num; + + memcpy(attr->alt_ah_attr.grh.dgid.raw, cmd.alt_dest.dgid, 16); + attr->alt_ah_attr.grh.flow_label = cmd.alt_dest.flow_label; + attr->alt_ah_attr.grh.sgid_index = cmd.alt_dest.sgid_index; + attr->alt_ah_attr.grh.hop_limit = cmd.alt_dest.hop_limit; + attr->alt_ah_attr.grh.traffic_class = cmd.alt_dest.traffic_class; + attr->alt_ah_attr.dlid = cmd.alt_dest.dlid; + attr->alt_ah_attr.sl = cmd.alt_dest.sl; + attr->alt_ah_attr.src_path_bits = cmd.alt_dest.src_path_bits; + attr->alt_ah_attr.static_rate = cmd.alt_dest.static_rate; + attr->alt_ah_attr.ah_flags = cmd.alt_dest.is_global ? IB_AH_GRH : 0; + attr->alt_ah_attr.port_num = cmd.alt_dest.port_num; + + err = xrcd->device->modify_xrc_rcv_qp(xrcd, cmd.qp_num, attr, cmd.attr_mask); + put_xrcd_read(uobj); + kfree(attr); + return err ? 
err : in_len; +} + +ssize_t ib_uverbs_query_xrc_rcv_qp(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_query_xrc_rcv_qp cmd; + struct ib_uverbs_query_qp_resp resp; + struct ib_qp_attr *attr; + struct ib_qp_init_attr *init_attr; + struct ib_xrcd *xrcd; + struct ib_uobject *uobj; + int ret; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + attr = kmalloc(sizeof *attr, GFP_KERNEL); + init_attr = kmalloc(sizeof *init_attr, GFP_KERNEL); + if (!attr || !init_attr) { + ret = -ENOMEM; + goto out; + } + + xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj); + if (!xrcd) { + ret = -EINVAL; + goto out; + } + + ret = xrcd->device->query_xrc_rcv_qp(xrcd, cmd.qp_num, attr, + cmd.attr_mask, init_attr); + + put_xrcd_read(uobj); + + if (ret) + goto out; + + memset(&resp, 0, sizeof resp); + resp.qp_state = attr->qp_state; + resp.cur_qp_state = attr->cur_qp_state; + resp.path_mtu = attr->path_mtu; + resp.path_mig_state = attr->path_mig_state; + resp.qkey = attr->qkey; + resp.rq_psn = attr->rq_psn; + resp.sq_psn = attr->sq_psn; + resp.dest_qp_num = attr->dest_qp_num; + resp.qp_access_flags = attr->qp_access_flags; + resp.pkey_index = attr->pkey_index; + resp.alt_pkey_index = attr->alt_pkey_index; + resp.sq_draining = attr->sq_draining; + resp.max_rd_atomic = attr->max_rd_atomic; + resp.max_dest_rd_atomic = attr->max_dest_rd_atomic; + resp.min_rnr_timer = attr->min_rnr_timer; + resp.port_num = attr->port_num; + resp.timeout = attr->timeout; + resp.retry_cnt = attr->retry_cnt; + resp.rnr_retry = attr->rnr_retry; + resp.alt_port_num = attr->alt_port_num; + resp.alt_timeout = attr->alt_timeout; + + memcpy(resp.dest.dgid, attr->ah_attr.grh.dgid.raw, 16); + resp.dest.flow_label = attr->ah_attr.grh.flow_label; + resp.dest.sgid_index = attr->ah_attr.grh.sgid_index; + resp.dest.hop_limit = attr->ah_attr.grh.hop_limit; + resp.dest.traffic_class = attr->ah_attr.grh.traffic_class; + resp.dest.dlid = attr->ah_attr.dlid; + resp.dest.sl = attr->ah_attr.sl; + resp.dest.src_path_bits = attr->ah_attr.src_path_bits; + resp.dest.static_rate = attr->ah_attr.static_rate; + resp.dest.is_global = !!(attr->ah_attr.ah_flags & IB_AH_GRH); + resp.dest.port_num = attr->ah_attr.port_num; + + memcpy(resp.alt_dest.dgid, attr->alt_ah_attr.grh.dgid.raw, 16); + resp.alt_dest.flow_label = attr->alt_ah_attr.grh.flow_label; + resp.alt_dest.sgid_index = attr->alt_ah_attr.grh.sgid_index; + resp.alt_dest.hop_limit = attr->alt_ah_attr.grh.hop_limit; + resp.alt_dest.traffic_class = attr->alt_ah_attr.grh.traffic_class; + resp.alt_dest.dlid = attr->alt_ah_attr.dlid; + resp.alt_dest.sl = attr->alt_ah_attr.sl; + resp.alt_dest.src_path_bits = attr->alt_ah_attr.src_path_bits; + resp.alt_dest.static_rate = attr->alt_ah_attr.static_rate; + resp.alt_dest.is_global = !!(attr->alt_ah_attr.ah_flags & IB_AH_GRH); + resp.alt_dest.port_num = attr->alt_ah_attr.port_num; + + resp.max_send_wr = init_attr->cap.max_send_wr; + resp.max_recv_wr = init_attr->cap.max_recv_wr; + resp.max_send_sge = init_attr->cap.max_send_sge; + resp.max_recv_sge = init_attr->cap.max_recv_sge; + resp.max_inline_data = init_attr->cap.max_inline_data; + resp.sq_sig_all = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR; + + if (copy_to_user((void __user *) (unsigned long) cmd.response, + &resp, sizeof resp)) + ret = -EFAULT; + +out: + kfree(attr); + kfree(init_attr); + + return ret ? 
ret : in_len; +} + +ssize_t ib_uverbs_reg_xrc_rcv_qp(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_reg_xrc_rcv_qp cmd; + struct ib_uxrc_rcv_object *qp_obj, *tmp; + struct ib_xrcd *xrcd; + struct ib_uobject *uobj; + struct ib_uxrcd_object *xrcd_uobj; + int ret; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + qp_obj = kmalloc(sizeof *qp_obj, GFP_KERNEL); + if (!qp_obj) + return -ENOMEM; + + xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj); + if (!xrcd) { + ret = -EINVAL; + goto err_out; + } + + ret = xrcd->device->reg_xrc_rcv_qp(xrcd, file, cmd.qp_num); + if (ret) + goto err_put; + + xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject); + mutex_lock(&file->device->ib_dev->xrcd_table_mutex); + list_for_each_entry(tmp, &xrcd_uobj->xrc_reg_qp_list, list) + if (cmd.qp_num == tmp->qp_num) { + kfree(qp_obj); + mutex_unlock(&file->device->ib_dev->xrcd_table_mutex); + put_xrcd_read(uobj); + return in_len; + } + qp_obj->qp_num = cmd.qp_num; + qp_obj->domain_handle = cmd.xrc_domain_handle; + list_add_tail(&qp_obj->list, &xrcd_uobj->xrc_reg_qp_list); + mutex_unlock(&file->device->ib_dev->xrcd_table_mutex); + atomic_inc(&xrcd->usecnt); + put_xrcd_read(uobj); + return in_len; + +err_put: + put_xrcd_read(uobj); +err_out: + + kfree(qp_obj); + return ret; +} + +int ib_uverbs_cleanup_xrc_rcv_qp(struct ib_uverbs_file *file, + struct ib_xrcd *xrcd, u32 qp_num) +{ + int err; + err = xrcd->device->unreg_xrc_rcv_qp(xrcd, file, qp_num); + if (!err) + atomic_dec(&xrcd->usecnt); + return err; +} + +ssize_t ib_uverbs_unreg_xrc_rcv_qp(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_unreg_xrc_rcv_qp cmd; + struct ib_uxrc_rcv_object *qp_obj, *tmp; + struct ib_xrcd *xrcd; + struct ib_uobject *uobj; + struct ib_uxrcd_object *xrcd_uobj; + int ret; + + if (copy_from_user(&cmd, buf, sizeof cmd)) + return -EFAULT; + + xrcd = idr_read_xrcd(cmd.xrc_domain_handle, file->ucontext, &uobj); + if (!xrcd) + return -EINVAL; + + ret = xrcd->device->unreg_xrc_rcv_qp(xrcd, file, cmd.qp_num); + if (ret) { + put_xrcd_read(uobj); + return -EINVAL; + } + atomic_dec(&xrcd->usecnt); + + xrcd_uobj = container_of(uobj, struct ib_uxrcd_object, uobject); + mutex_lock(&file->device->ib_dev->xrcd_table_mutex); + list_for_each_entry_safe(qp_obj, tmp, &xrcd_uobj->xrc_reg_qp_list, list) + if (cmd.qp_num == qp_obj->qp_num) { + list_del(&qp_obj->list); + kfree(qp_obj); + break; + } + mutex_unlock(&file->device->ib_dev->xrcd_table_mutex); + put_xrcd_read(uobj); + return in_len; +} diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 56898b6578a49..dd8395e370799 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -40,9 +40,9 @@ #include #include #include -#include #include #include +#include #include #include @@ -54,6 +54,18 @@ MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand userspace verbs access"); MODULE_LICENSE("Dual BSD/GPL"); +module_param(ufmr_pool1_blocksize, int, 0444); +MODULE_PARM_DESC(ufmr_pool1_blocksize, "Block size for Usermode FMR Pool 1"); + +module_param(ufmr_pool1_nelems, int, 0444); +MODULE_PARM_DESC(ufmr_pool1_nelems, "No of FMRs in Usermode FMR Pool 1"); + +module_param(ufmr_pool2_blocksize, int, 0444); +MODULE_PARM_DESC(ufmr_pool2_blocksize, "Block size for Usermode FMR Pool 2"); + +module_param(ufmr_pool2_nelems, int, 0444); +MODULE_PARM_DESC(ufmr_pool2_nelems, "No of 
FMRs in Usermode FMR Pool 2"); + enum { IB_UVERBS_MAJOR = 231, IB_UVERBS_BASE_MINOR = 192, @@ -66,52 +78,80 @@ static struct class *uverbs_class; DEFINE_SPINLOCK(ib_uverbs_idr_lock); DEFINE_IDR(ib_uverbs_pd_idr); +DEFINE_IDR(ib_uverbs_shpd_idr); DEFINE_IDR(ib_uverbs_mr_idr); +DEFINE_IDR(ib_uverbs_fmr_idr); DEFINE_IDR(ib_uverbs_mw_idr); DEFINE_IDR(ib_uverbs_ah_idr); DEFINE_IDR(ib_uverbs_cq_idr); DEFINE_IDR(ib_uverbs_qp_idr); DEFINE_IDR(ib_uverbs_srq_idr); +DEFINE_IDR(ib_uverbs_xrc_domain_idr); -static DEFINE_SPINLOCK(map_lock); +static spinlock_t map_lock; +static struct ib_uverbs_device *dev_table[IB_UVERBS_MAX_DEVICES]; static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) = { - [IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context, - [IB_USER_VERBS_CMD_QUERY_DEVICE] = ib_uverbs_query_device, - [IB_USER_VERBS_CMD_QUERY_PORT] = ib_uverbs_query_port, - [IB_USER_VERBS_CMD_ALLOC_PD] = ib_uverbs_alloc_pd, - [IB_USER_VERBS_CMD_DEALLOC_PD] = ib_uverbs_dealloc_pd, - [IB_USER_VERBS_CMD_REG_MR] = ib_uverbs_reg_mr, - [IB_USER_VERBS_CMD_DEREG_MR] = ib_uverbs_dereg_mr, + [IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context, + [IB_USER_VERBS_CMD_QUERY_DEVICE] = ib_uverbs_query_device, + [IB_USER_VERBS_CMD_QUERY_PORT] = ib_uverbs_query_port, + [IB_USER_VERBS_CMD_ALLOC_PD] = ib_uverbs_alloc_pd, + [IB_USER_VERBS_CMD_DEALLOC_PD] = ib_uverbs_dealloc_pd, + [IB_USER_VERBS_CMD_REG_MR] = ib_uverbs_reg_mr, + [IB_USER_VERBS_CMD_DEREG_MR] = ib_uverbs_dereg_mr, [IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL] = ib_uverbs_create_comp_channel, - [IB_USER_VERBS_CMD_CREATE_CQ] = ib_uverbs_create_cq, - [IB_USER_VERBS_CMD_RESIZE_CQ] = ib_uverbs_resize_cq, - [IB_USER_VERBS_CMD_POLL_CQ] = ib_uverbs_poll_cq, - [IB_USER_VERBS_CMD_REQ_NOTIFY_CQ] = ib_uverbs_req_notify_cq, - [IB_USER_VERBS_CMD_DESTROY_CQ] = ib_uverbs_destroy_cq, - [IB_USER_VERBS_CMD_CREATE_QP] = ib_uverbs_create_qp, - [IB_USER_VERBS_CMD_QUERY_QP] = ib_uverbs_query_qp, - [IB_USER_VERBS_CMD_MODIFY_QP] = ib_uverbs_modify_qp, - [IB_USER_VERBS_CMD_DESTROY_QP] = ib_uverbs_destroy_qp, - [IB_USER_VERBS_CMD_POST_SEND] = ib_uverbs_post_send, - [IB_USER_VERBS_CMD_POST_RECV] = ib_uverbs_post_recv, - [IB_USER_VERBS_CMD_POST_SRQ_RECV] = ib_uverbs_post_srq_recv, - [IB_USER_VERBS_CMD_CREATE_AH] = ib_uverbs_create_ah, - [IB_USER_VERBS_CMD_DESTROY_AH] = ib_uverbs_destroy_ah, - [IB_USER_VERBS_CMD_ATTACH_MCAST] = ib_uverbs_attach_mcast, - [IB_USER_VERBS_CMD_DETACH_MCAST] = ib_uverbs_detach_mcast, - [IB_USER_VERBS_CMD_CREATE_SRQ] = ib_uverbs_create_srq, - [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq, - [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq, - [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq, + [IB_USER_VERBS_CMD_CREATE_CQ] = ib_uverbs_create_cq, + [IB_USER_VERBS_CMD_RESIZE_CQ] = ib_uverbs_resize_cq, + [IB_USER_VERBS_CMD_POLL_CQ] = ib_uverbs_poll_cq, + [IB_USER_VERBS_CMD_REQ_NOTIFY_CQ] = ib_uverbs_req_notify_cq, + [IB_USER_VERBS_CMD_DESTROY_CQ] = ib_uverbs_destroy_cq, + [IB_USER_VERBS_CMD_CREATE_QP] = ib_uverbs_create_qp, + [IB_USER_VERBS_CMD_QUERY_QP] = ib_uverbs_query_qp, + [IB_USER_VERBS_CMD_MODIFY_QP] = ib_uverbs_modify_qp, + [IB_USER_VERBS_CMD_DESTROY_QP] = ib_uverbs_destroy_qp, + [IB_USER_VERBS_CMD_POST_SEND] = ib_uverbs_post_send, + [IB_USER_VERBS_CMD_POST_RECV] = ib_uverbs_post_recv, + [IB_USER_VERBS_CMD_POST_SRQ_RECV] = ib_uverbs_post_srq_recv, + [IB_USER_VERBS_CMD_CREATE_AH] = ib_uverbs_create_ah, + [IB_USER_VERBS_CMD_DESTROY_AH] = 
ib_uverbs_destroy_ah, + [IB_USER_VERBS_CMD_ATTACH_MCAST] = ib_uverbs_attach_mcast, + [IB_USER_VERBS_CMD_DETACH_MCAST] = ib_uverbs_detach_mcast, + [IB_USER_VERBS_CMD_CREATE_SRQ] = ib_uverbs_create_srq, + [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq, + [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq, + [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq, + [IB_USER_VERBS_CMD_CREATE_XRC_SRQ] = ib_uverbs_create_xrc_srq, + [IB_USER_VERBS_CMD_OPEN_XRC_DOMAIN] = ib_uverbs_open_xrc_domain, + [IB_USER_VERBS_CMD_CLOSE_XRC_DOMAIN] = ib_uverbs_close_xrc_domain, + [IB_USER_VERBS_CMD_CREATE_XRC_RCV_QP] = ib_uverbs_create_xrc_rcv_qp, + [IB_USER_VERBS_CMD_MODIFY_XRC_RCV_QP] = ib_uverbs_modify_xrc_rcv_qp, + [IB_USER_VERBS_CMD_QUERY_XRC_RCV_QP] = ib_uverbs_query_xrc_rcv_qp, + [IB_USER_VERBS_CMD_REG_XRC_RCV_QP] = ib_uverbs_reg_xrc_rcv_qp, + [IB_USER_VERBS_CMD_UNREG_XRC_RCV_QP] = ib_uverbs_unreg_xrc_rcv_qp, + [IB_USER_VERBS_CMD_GET_ETH_L2_ADDR] = ib_uverbs_get_eth_l2_addr, + [IB_USER_VERBS_CMD_ALLOC_SHPD] = ib_uverbs_alloc_shpd, + [IB_USER_VERBS_CMD_SHARE_PD] = ib_uverbs_share_pd, + [IB_USER_VERBS_CMD_REG_MR_RELAXED] = ib_uverbs_reg_mr_relaxed, + [IB_USER_VERBS_CMD_DEREG_MR_RELAXED] = ib_uverbs_dereg_mr_relaxed, + [IB_USER_VERBS_CMD_FLUSH_RELAXED_MR] = ib_uverbs_flush_relaxed_mr, }; static void ib_uverbs_add_one(struct ib_device *device); static void ib_uverbs_remove_one(struct ib_device *device); +static void release_uobj(struct kref *kref) +{ + kfree(container_of(kref, struct ib_uobject, ref)); +} + +static void put_uobj(struct ib_uobject *uobj) +{ + kref_put(&uobj->ref, release_uobj); +} + static void ib_uverbs_release_dev(struct kref *ref) { struct ib_uverbs_device *dev = @@ -208,17 +248,6 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, kfree(uqp); } - list_for_each_entry_safe(uobj, tmp, &context->cq_list, list) { - struct ib_cq *cq = uobj->object; - struct ib_uverbs_event_file *ev_file = cq->cq_context; - struct ib_ucq_object *ucq = - container_of(uobj, struct ib_ucq_object, uobject); - - idr_remove_uobj(&ib_uverbs_cq_idr, uobj); - ib_destroy_cq(cq); - ib_uverbs_release_ucq(file, ev_file, ucq); - kfree(ucq); - } list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) { struct ib_srq *srq = uobj->object; @@ -231,6 +260,18 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, kfree(uevent); } + list_for_each_entry_safe(uobj, tmp, &context->cq_list, list) { + struct ib_cq *cq = uobj->object; + struct ib_uverbs_event_file *ev_file = cq->cq_context; + struct ib_ucq_object *ucq = + container_of(uobj, struct ib_ucq_object, uobject); + + idr_remove_uobj(&ib_uverbs_cq_idr, uobj); + ib_destroy_cq(cq); + ib_uverbs_release_ucq(file, ev_file, ucq); + kfree(ucq); + } + /* XXX Free MWs */ list_for_each_entry_safe(uobj, tmp, &context->mr_list, list) { @@ -241,11 +282,73 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, kfree(uobj); } + list_for_each_entry_safe(uobj, tmp, &context->fmr_list, list) { + struct ib_pool_fmr *fmr = uobj->object; + struct ib_pd *pd = fmr->pd; + idr_remove_uobj(&ib_uverbs_fmr_idr, uobj); + ib_fmr_pool_unmap(fmr); + atomic_dec(&pd->usecnt); + kfree(uobj); + } + + mutex_lock(&file->device->ib_dev->xrcd_table_mutex); + list_for_each_entry_safe(uobj, tmp, &context->xrc_domain_list, list) { + struct ib_xrcd *xrcd = uobj->object; + struct ib_uxrc_rcv_object *xrc_qp_obj, *tmp1; + struct ib_uxrcd_object *xrcd_uobj = + container_of(uobj, struct ib_uxrcd_object, uobject); + + list_for_each_entry_safe(xrc_qp_obj, tmp1, + 
&xrcd_uobj->xrc_reg_qp_list, list) { + list_del(&xrc_qp_obj->list); + ib_uverbs_cleanup_xrc_rcv_qp(file, xrcd, + xrc_qp_obj->qp_num); + kfree(xrc_qp_obj); + } + + idr_remove_uobj(&ib_uverbs_xrc_domain_idr, uobj); + ib_uverbs_dealloc_xrcd(file->device->ib_dev, xrcd); + kfree(uobj); + } + mutex_unlock(&file->device->ib_dev->xrcd_table_mutex); + list_for_each_entry_safe(uobj, tmp, &context->pd_list, list) { struct ib_pd *pd = uobj->object; + struct ib_uobject *shuobj = NULL; + struct ib_shpd *shpd = NULL; + struct ib_relaxed_pool_data *pos; + + /* flush fmr pool associated with this pd */ + list_for_each_entry(pos, &pd->device->relaxed_pool_list, pool_list) { + ib_flush_fmr_pool(pos->fmr_pool); + } idr_remove_uobj(&ib_uverbs_pd_idr, uobj); + + /* is pd shared ?*/ + if (pd->shpd) { + shpd = pd->shpd; + shuobj = shpd->uobject; + } + ib_dealloc_pd(pd); + + if(shpd) { + + down_write(&shuobj->mutex); + + /* if this shpd is no longer shared */ + if (atomic_dec_and_test(&shpd->shared)) { + /* free the shpd info from device driver */ + file->device->ib_dev->remove_shpd(file->device->ib_dev, shpd, 0); + shuobj->live = 0; + up_write(&shuobj->mutex); + idr_remove_uobj(&ib_uverbs_shpd_idr, shuobj); + /* there could some one waiting to lock this shared object */ + put_uobj(shuobj); + } else + up_write(&shuobj->mutex); + } kfree(uobj); } @@ -366,11 +469,10 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp) static const struct file_operations uverbs_event_fops = { .owner = THIS_MODULE, - .read = ib_uverbs_event_read, + .read = ib_uverbs_event_read, .poll = ib_uverbs_event_poll, .release = ib_uverbs_event_close, - .fasync = ib_uverbs_event_fasync, - .llseek = no_llseek, + .fasync = ib_uverbs_event_fasync }; void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context) @@ -485,6 +587,13 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler, NULL, NULL); } +void ib_uverbs_xrc_rcv_qp_event_handler(struct ib_event *event, + void *context_ptr) +{ + ib_uverbs_async_handler(context_ptr, event->element.xrc_qp_num, + event->event, NULL, NULL); +} + struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file, int is_async) { @@ -557,18 +666,15 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, if (hdr.in_words * 4 != count) return -EINVAL; - if (hdr.command < 0 || - hdr.command >= ARRAY_SIZE(uverbs_cmd_table) || - !uverbs_cmd_table[hdr.command]) + if (hdr.command >= ARRAY_SIZE(uverbs_cmd_table) || + !uverbs_cmd_table[hdr.command] || + !(file->device->ib_dev->uverbs_cmd_mask & (1ull << hdr.command))) return -EINVAL; if (!file->ucontext && hdr.command != IB_USER_VERBS_CMD_GET_CONTEXT) return -EINVAL; - if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << hdr.command))) - return -ENOSYS; - return uverbs_cmd_table[hdr.command](file, buf + sizeof hdr, hdr.in_words * 4, hdr.out_words * 4); } @@ -586,12 +692,14 @@ static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) /* * ib_uverbs_open() does not need the BKL: * - * - the ib_uverbs_device structures are properly reference counted and + * - dev_table[] accesses are protected by map_lock, the + * ib_uverbs_device structures are properly reference counted, and * everything else is purely local to the file being created, so * races against other open calls are not a problem; * - there is no ioctl method to race against; - * - the open method will either immediately run -ENXIO, or all - * required initialization will be done. 
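(Editor's illustrative aside.) The ucontext cleanup above relies on a last-user-frees rule for shared PDs: every PD referencing an shpd holds one count on shpd->shared, and only the closer that drops the count to zero calls remove_shpd(). A stand-alone sketch of that pattern, with C11 atomics standing in for atomic_dec_and_test() and hypothetical demo_* names:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdlib.h>

    struct demo_shpd {
            atomic_int shared;              /* number of PDs still sharing this object */
    };

    /* Called whenever one of the sharing PDs is destroyed. */
    static void demo_release_shared_pd(struct demo_shpd *shpd)
    {
            /* atomic_fetch_sub() returns the old value, so old == 1 means this
             * caller dropped the final reference -- the same condition that
             * atomic_dec_and_test() reports in the cleanup code above. */
            bool last = (atomic_fetch_sub(&shpd->shared, 1) == 1);

            if (last)
                    free(shpd);             /* stands in for remove_shpd() */
    }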
+ * - the device is added to dev_table[] as the last part of module + * initialization, the open method will either immediately run + * -ENXIO, or all required initialization will be done. */ static int ib_uverbs_open(struct inode *inode, struct file *filp) { @@ -599,10 +707,13 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) struct ib_uverbs_file *file; int ret; - dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev); + spin_lock(&map_lock); + dev = dev_table[iminor(inode) - IB_UVERBS_BASE_MINOR]; if (dev) kref_get(&dev->ref); - else + spin_unlock(&map_lock); + + if (!dev) return -ENXIO; if (!try_module_get(dev->ib_dev->owner)) { @@ -624,7 +735,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) filp->private_data = file; - return nonseekable_open(inode, filp); + return 0; err_module: module_put(dev->ib_dev->owner); @@ -649,20 +760,18 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp) } static const struct file_operations uverbs_fops = { - .owner = THIS_MODULE, - .write = ib_uverbs_write, - .open = ib_uverbs_open, - .release = ib_uverbs_close, - .llseek = no_llseek, + .owner = THIS_MODULE, + .write = ib_uverbs_write, + .open = ib_uverbs_open, + .release = ib_uverbs_close }; static const struct file_operations uverbs_mmap_fops = { - .owner = THIS_MODULE, - .write = ib_uverbs_write, + .owner = THIS_MODULE, + .write = ib_uverbs_write, .mmap = ib_uverbs_mmap, - .open = ib_uverbs_open, - .release = ib_uverbs_close, - .llseek = no_llseek, + .open = ib_uverbs_open, + .release = ib_uverbs_close }; static struct ib_client uverbs_client = { @@ -695,41 +804,16 @@ static ssize_t show_dev_abi_version(struct device *device, } static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL); -static CLASS_ATTR_STRING(abi_version, S_IRUGO, - __stringify(IB_USER_VERBS_ABI_VERSION)); - -static dev_t overflow_maj; -static DECLARE_BITMAP(overflow_map, IB_UVERBS_MAX_DEVICES); - -/* - * If we have more than IB_UVERBS_MAX_DEVICES, dynamically overflow by - * requesting a new major number and doubling the number of max devices we - * support. It's stupid, but simple. 
- */ -static int find_overflow_devnum(void) +static ssize_t show_abi_version(struct class *class, + struct class_attribute *attr, + char *buf) { - int ret; - - if (!overflow_maj) { - ret = alloc_chrdev_region(&overflow_maj, 0, IB_UVERBS_MAX_DEVICES, - "infiniband_verbs"); - if (ret) { - printk(KERN_ERR "user_verbs: couldn't register dynamic device number\n"); - return ret; - } - } - - ret = find_first_zero_bit(overflow_map, IB_UVERBS_MAX_DEVICES); - if (ret >= IB_UVERBS_MAX_DEVICES) - return -1; - - return ret; + return sprintf(buf, "%d\n", IB_USER_VERBS_ABI_VERSION); } +static CLASS_ATTR(abi_version, S_IRUGO, show_abi_version, NULL); static void ib_uverbs_add_one(struct ib_device *device) { - int devnum; - dev_t base; struct ib_uverbs_device *uverbs_dev; if (!device->alloc_ucontext) @@ -743,36 +827,28 @@ static void ib_uverbs_add_one(struct ib_device *device) init_completion(&uverbs_dev->comp); spin_lock(&map_lock); - devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); - if (devnum >= IB_UVERBS_MAX_DEVICES) { + uverbs_dev->devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); + if (uverbs_dev->devnum >= IB_UVERBS_MAX_DEVICES) { spin_unlock(&map_lock); - devnum = find_overflow_devnum(); - if (devnum < 0) - goto err; - - spin_lock(&map_lock); - uverbs_dev->devnum = devnum + IB_UVERBS_MAX_DEVICES; - base = devnum + overflow_maj; - set_bit(devnum, overflow_map); - } else { - uverbs_dev->devnum = devnum; - base = devnum + IB_UVERBS_BASE_DEV; - set_bit(devnum, dev_map); + goto err; } + set_bit(uverbs_dev->devnum, dev_map); spin_unlock(&map_lock); uverbs_dev->ib_dev = device; uverbs_dev->num_comp_vectors = device->num_comp_vectors; - cdev_init(&uverbs_dev->cdev, NULL); - uverbs_dev->cdev.owner = THIS_MODULE; - uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops; - kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum); - if (cdev_add(&uverbs_dev->cdev, base, 1)) + uverbs_dev->cdev = cdev_alloc(); + if (!uverbs_dev->cdev) + goto err; + uverbs_dev->cdev->owner = THIS_MODULE; + uverbs_dev->cdev->ops = device->mmap ? 
&uverbs_mmap_fops : &uverbs_fops; + kobject_set_name(&uverbs_dev->cdev->kobj, "uverbs%d", uverbs_dev->devnum); + if (cdev_add(uverbs_dev->cdev, IB_UVERBS_BASE_DEV + uverbs_dev->devnum, 1)) goto err_cdev; uverbs_dev->dev = device_create(uverbs_class, device->dma_device, - uverbs_dev->cdev.dev, uverbs_dev, + uverbs_dev->cdev->dev, uverbs_dev, "uverbs%d", uverbs_dev->devnum); if (IS_ERR(uverbs_dev->dev)) goto err_cdev; @@ -782,19 +858,26 @@ static void ib_uverbs_add_one(struct ib_device *device) if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version)) goto err_class; + device->relaxed_pd = ib_alloc_pd(device); + if (IS_ERR(device->relaxed_pd)) { + device->relaxed_pd = NULL; + goto err_class; + } + + spin_lock(&map_lock); + dev_table[uverbs_dev->devnum] = uverbs_dev; + spin_unlock(&map_lock); + ib_set_client_data(device, &uverbs_client, uverbs_dev); return; err_class: - device_destroy(uverbs_class, uverbs_dev->cdev.dev); + device_destroy(uverbs_class, uverbs_dev->cdev->dev); err_cdev: - cdev_del(&uverbs_dev->cdev); - if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES) - clear_bit(devnum, dev_map); - else - clear_bit(devnum, overflow_map); + cdev_del(uverbs_dev->cdev); + clear_bit(uverbs_dev->devnum, dev_map); err: kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); @@ -806,35 +889,42 @@ err: static void ib_uverbs_remove_one(struct ib_device *device) { struct ib_uverbs_device *uverbs_dev = ib_get_client_data(device, &uverbs_client); + struct ib_relaxed_pool_data *pos; + struct ib_relaxed_pool_data *tmp; + int ret = 0; if (!uverbs_dev) return; + list_for_each_entry_safe(pos, tmp, &device->relaxed_pool_list, pool_list) { + ib_destroy_fmr_pool(pos->fmr_pool); + list_del(&pos->pool_list); + kfree(pos); + } + + ret = ib_dealloc_pd(device->relaxed_pd); + device->relaxed_pd = NULL; dev_set_drvdata(uverbs_dev->dev, NULL); - device_destroy(uverbs_class, uverbs_dev->cdev.dev); - cdev_del(&uverbs_dev->cdev); + device_destroy(uverbs_class, uverbs_dev->cdev->dev); + cdev_del(uverbs_dev->cdev); - if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES) - clear_bit(uverbs_dev->devnum, dev_map); - else - clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map); + spin_lock(&map_lock); + dev_table[uverbs_dev->devnum] = NULL; + spin_unlock(&map_lock); + + clear_bit(uverbs_dev->devnum, dev_map); kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); wait_for_completion(&uverbs_dev->comp); kfree(uverbs_dev); } -static char *uverbs_devnode(struct device *dev, mode_t *mode) -{ - if (mode) - *mode = 0666; - return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev)); -} - static int __init ib_uverbs_init(void) { int ret; + spin_lock_init(&map_lock); + ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES, "infiniband_verbs"); if (ret) { @@ -849,9 +939,7 @@ static int __init ib_uverbs_init(void) goto out_chrdev; } - uverbs_class->devnode = uverbs_devnode; - - ret = class_create_file(uverbs_class, &class_attr_abi_version.attr); + ret = class_create_file(uverbs_class, &class_attr_abi_version); if (ret) { printk(KERN_ERR "user_verbs: couldn't create abi_version attribute\n"); goto out_class; @@ -880,10 +968,10 @@ static void __exit ib_uverbs_cleanup(void) ib_unregister_client(&uverbs_client); class_destroy(uverbs_class); unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES); - if (overflow_maj) - unregister_chrdev_region(overflow_maj, IB_UVERBS_MAX_DEVICES); idr_destroy(&ib_uverbs_pd_idr); + idr_destroy(&ib_uverbs_shpd_idr); idr_destroy(&ib_uverbs_mr_idr); + 
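(Editor's illustrative aside.) ib_uverbs_add_one()/ib_uverbs_open() above switch to a dev_table[] keyed by minor number: the entry is published under map_lock only after the device is fully initialised, and open() takes its reference while holding the same lock, so a half-constructed or concurrently removed device can never be handed out. A stand-alone model of that publish-last / lookup-under-lock pattern, with a pthread mutex in place of the kernel spinlock and hypothetical demo_* names:

    #include <pthread.h>
    #include <stddef.h>

    #define DEMO_MAX_DEVICES 32

    struct demo_dev {
            int refcount;
    };

    static struct demo_dev *demo_table[DEMO_MAX_DEVICES];
    static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Registration side: publish only once the device is fully set up. */
    static void demo_publish(struct demo_dev *dev, unsigned int minor)
    {
            pthread_mutex_lock(&demo_lock);
            demo_table[minor] = dev;
            pthread_mutex_unlock(&demo_lock);
    }

    /* open() side: look up and take the reference under the same lock. */
    static struct demo_dev *demo_open(unsigned int minor)
    {
            struct demo_dev *dev = NULL;

            pthread_mutex_lock(&demo_lock);
            if (minor < DEMO_MAX_DEVICES && demo_table[minor]) {
                    dev = demo_table[minor];
                    dev->refcount++;        /* kref_get() in the real code */
            }
            pthread_mutex_unlock(&demo_lock);

            return dev;                     /* NULL maps to -ENXIO */
    }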
idr_destroy(&ib_uverbs_fmr_idr); idr_destroy(&ib_uverbs_mw_idr); idr_destroy(&ib_uverbs_ah_idr); idr_destroy(&ib_uverbs_cq_idr); diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c index 1b1146f87124e..5440da0e59b4d 100644 --- a/drivers/infiniband/core/uverbs_marshall.c +++ b/drivers/infiniband/core/uverbs_marshall.c @@ -40,21 +40,18 @@ void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, dst->grh.sgid_index = src->grh.sgid_index; dst->grh.hop_limit = src->grh.hop_limit; dst->grh.traffic_class = src->grh.traffic_class; - memset(&dst->grh.reserved, 0, sizeof(dst->grh.reserved)); dst->dlid = src->dlid; dst->sl = src->sl; dst->src_path_bits = src->src_path_bits; dst->static_rate = src->static_rate; dst->is_global = src->ah_flags & IB_AH_GRH ? 1 : 0; dst->port_num = src->port_num; - dst->reserved = 0; } EXPORT_SYMBOL(ib_copy_ah_attr_to_user); void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, struct ib_qp_attr *src) { - dst->qp_state = src->qp_state; dst->cur_qp_state = src->cur_qp_state; dst->path_mtu = src->path_mtu; dst->path_mig_state = src->path_mig_state; @@ -86,7 +83,6 @@ void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, dst->rnr_retry = src->rnr_retry; dst->alt_port_num = src->alt_port_num; dst->alt_timeout = src->alt_timeout; - memset(dst->reserved, 0, sizeof(dst->reserved)); } EXPORT_SYMBOL(ib_copy_qp_attr_to_user); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index af7a8b08b2e95..efaf918d30b6a 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -77,6 +77,22 @@ enum ib_rate mult_to_ib_rate(int mult) } EXPORT_SYMBOL(mult_to_ib_rate); +int ib_ext_rate_to_int(enum ib_rate rate) +{ + switch (rate) { + case IB_RATE_14_GBPS: return 14; + case IB_RATE_56_GBPS: return 56; + case IB_RATE_112_GBPS: return 112; + case IB_RATE_168_GBPS: return 168; + case IB_RATE_25_GBPS: return 25; + case IB_RATE_100_GBPS: return 100; + case IB_RATE_200_GBPS: return 200; + case IB_RATE_300_GBPS: return 300; + default: return -1; + } +} +EXPORT_SYMBOL(ib_ext_rate_to_int); + enum rdma_transport_type rdma_node_get_transport(enum rdma_node_type node_type) { @@ -94,7 +110,7 @@ rdma_node_get_transport(enum rdma_node_type node_type) } EXPORT_SYMBOL(rdma_node_get_transport); -enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num) +enum rdma_link_layer rdma_port_link_layer(struct ib_device *device, u8 port_num) { if (device->get_link_layer) return device->get_link_layer(device, port_num); @@ -108,7 +124,7 @@ enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_ return IB_LINK_LAYER_UNSPECIFIED; } } -EXPORT_SYMBOL(rdma_port_get_link_layer); +EXPORT_SYMBOL(rdma_port_link_layer); /* Protection domains */ @@ -250,6 +266,8 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd, srq->uobject = NULL; srq->event_handler = srq_init_attr->event_handler; srq->srq_context = srq_init_attr->srq_context; + srq->xrc_cq = NULL; + srq->xrcd = NULL; atomic_inc(&pd->usecnt); atomic_set(&srq->usecnt, 0); } @@ -258,6 +276,36 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd, } EXPORT_SYMBOL(ib_create_srq); +struct ib_srq *ib_create_xrc_srq(struct ib_pd *pd, + struct ib_cq *xrc_cq, + struct ib_xrcd *xrcd, + struct ib_srq_init_attr *srq_init_attr) +{ + struct ib_srq *srq; + + if (!pd->device->create_xrc_srq) + return ERR_PTR(-ENOSYS); + + srq = pd->device->create_xrc_srq(pd, xrc_cq, xrcd, srq_init_attr, NULL); + + if (!IS_ERR(srq)) { + srq->device = 
pd->device; + srq->pd = pd; + srq->uobject = NULL; + srq->event_handler = srq_init_attr->event_handler; + srq->srq_context = srq_init_attr->srq_context; + srq->xrc_cq = xrc_cq; + srq->xrcd = xrcd; + atomic_inc(&pd->usecnt); + atomic_inc(&xrcd->usecnt); + atomic_inc(&xrc_cq->usecnt); + atomic_set(&srq->usecnt, 0); + } + + return srq; +} +EXPORT_SYMBOL(ib_create_xrc_srq); + int ib_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask) @@ -279,16 +327,25 @@ EXPORT_SYMBOL(ib_query_srq); int ib_destroy_srq(struct ib_srq *srq) { struct ib_pd *pd; + struct ib_cq *xrc_cq; + struct ib_xrcd *xrcd; int ret; if (atomic_read(&srq->usecnt)) return -EBUSY; pd = srq->pd; + xrc_cq = srq->xrc_cq; + xrcd = srq->xrcd; ret = srq->device->destroy_srq(srq); - if (!ret) + if (!ret) { atomic_dec(&pd->usecnt); + if (xrc_cq) + atomic_dec(&xrc_cq->usecnt); + if (xrcd) + atomic_dec(&xrcd->usecnt); + } return ret; } @@ -313,11 +370,15 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, qp->event_handler = qp_init_attr->event_handler; qp->qp_context = qp_init_attr->qp_context; qp->qp_type = qp_init_attr->qp_type; + qp->xrcd = qp->qp_type == IB_QPT_XRC ? + qp_init_attr->xrc_domain : NULL; atomic_inc(&pd->usecnt); atomic_inc(&qp_init_attr->send_cq->usecnt); atomic_inc(&qp_init_attr->recv_cq->usecnt); if (qp_init_attr->srq) atomic_inc(&qp_init_attr->srq->usecnt); + if (qp->qp_type == IB_QPT_XRC) + atomic_inc(&qp->xrcd->usecnt); } return qp; @@ -326,8 +387,8 @@ EXPORT_SYMBOL(ib_create_qp); static const struct { int valid; - enum ib_qp_attr_mask req_param[IB_QPT_RAW_ETHERTYPE + 1]; - enum ib_qp_attr_mask opt_param[IB_QPT_RAW_ETHERTYPE + 1]; + enum ib_qp_attr_mask req_param[IB_QPT_RAW_ETY + 1]; + enum ib_qp_attr_mask opt_param[IB_QPT_RAW_ETY + 1]; } qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { [IB_QPS_RESET] = { [IB_QPS_RESET] = { .valid = 1 }, @@ -343,6 +404,9 @@ static const struct { [IB_QPT_RC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | @@ -365,6 +429,9 @@ static const struct { [IB_QPT_RC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), + [IB_QPT_XRC] = (IB_QP_PKEY_INDEX | + IB_QP_PORT | + IB_QP_ACCESS_FLAGS), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | @@ -384,6 +451,12 @@ static const struct { IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER), + [IB_QPT_XRC] = (IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_DEST_QPN | + IB_QP_RQ_PSN | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_MIN_RNR_TIMER), }, .opt_param = { [IB_QPT_UD] = (IB_QP_PKEY_INDEX | @@ -394,6 +467,9 @@ static const struct { [IB_QPT_RC] = (IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX), + [IB_QPT_XRC] = (IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | @@ -414,6 +490,11 @@ static const struct { IB_QP_RNR_RETRY | IB_QP_SQ_PSN | IB_QP_MAX_QP_RD_ATOMIC), + [IB_QPT_XRC] = (IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_SQ_PSN | + IB_QP_MAX_QP_RD_ATOMIC), [IB_QPT_SMI] = IB_QP_SQ_PSN, [IB_QPT_GSI] = IB_QP_SQ_PSN, }, @@ -429,6 +510,11 @@ static const struct { IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), [IB_QPT_SMI] = (IB_QP_CUR_STATE | 
IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_CUR_STATE | @@ -453,6 +539,11 @@ static const struct { IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE | IB_QP_MIN_RNR_TIMER), + [IB_QPT_XRC] = (IB_QP_CUR_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_ALT_PATH | + IB_QP_PATH_MIG_STATE | + IB_QP_MIN_RNR_TIMER), [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_CUR_STATE | @@ -465,6 +556,7 @@ static const struct { [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY, + [IB_QPT_XRC] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY } @@ -487,6 +579,11 @@ static const struct { IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC] = (IB_QP_CUR_STATE | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_CUR_STATE | @@ -515,6 +612,18 @@ static const struct { IB_QP_PKEY_INDEX | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), + [IB_QPT_XRC] = (IB_QP_PORT | + IB_QP_AV | + IB_QP_TIMEOUT | + IB_QP_RETRY_CNT | + IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC | + IB_QP_MAX_DEST_RD_ATOMIC | + IB_QP_ALT_PATH | + IB_QP_ACCESS_FLAGS | + IB_QP_PKEY_INDEX | + IB_QP_MIN_RNR_TIMER | + IB_QP_PATH_MIG_STATE), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | @@ -599,12 +708,15 @@ int ib_destroy_qp(struct ib_qp *qp) struct ib_pd *pd; struct ib_cq *scq, *rcq; struct ib_srq *srq; + struct ib_xrcd *xrcd; + enum ib_qp_type qp_type = qp->qp_type; int ret; pd = qp->pd; scq = qp->send_cq; rcq = qp->recv_cq; srq = qp->srq; + xrcd = qp->xrcd; ret = qp->device->destroy_qp(qp); if (!ret) { @@ -613,6 +725,8 @@ int ib_destroy_qp(struct ib_qp *qp) atomic_dec(&rcq->usecnt); if (srq) atomic_dec(&srq->usecnt); + if (qp_type == IB_QPT_XRC) + atomic_dec(&xrcd->usecnt); } return ret; @@ -871,6 +985,21 @@ struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, } EXPORT_SYMBOL(ib_alloc_fmr); +int ib_set_fmr_pd(struct ib_fmr *fmr, struct ib_pd *pd) +{ + int ret = 0; + if (fmr->device->set_fmr_pd) { + ret = fmr->device->set_fmr_pd(fmr, pd); + if (!ret) + fmr->pd = pd; + + return ret; + } else + return -ENOSYS; +} +EXPORT_SYMBOL(ib_set_fmr_pd); + + int ib_unmap_fmr(struct list_head *fmr_list) { struct ib_fmr *fmr; @@ -903,9 +1032,17 @@ int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) { if (!qp->device->attach_mcast) return -ENOSYS; - if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) - return -EINVAL; + switch (rdma_node_get_transport(qp->device->node_type)) { + case RDMA_TRANSPORT_IB: + if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) + return -EINVAL; + break; + case RDMA_TRANSPORT_IWARP: + if (qp->qp_type != IB_QPT_RAW_ETY) + return -EINVAL; + break; + } return qp->device->attach_mcast(qp, gid, lid); } EXPORT_SYMBOL(ib_attach_mcast); @@ -914,9 +1051,54 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) { if (!qp->device->detach_mcast) return -ENOSYS; - if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) - return -EINVAL; + switch (rdma_node_get_transport(qp->device->node_type)) { + case RDMA_TRANSPORT_IB: + if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) + return -EINVAL; + break; + case RDMA_TRANSPORT_IWARP: + if (qp->qp_type != IB_QPT_RAW_ETY) + return -EINVAL; + break; + } return qp->device->detach_mcast(qp, gid, lid); } EXPORT_SYMBOL(ib_detach_mcast); + +int ib_dealloc_xrcd(struct ib_xrcd *xrcd) +{ + if 
(atomic_read(&xrcd->usecnt)) + return -EBUSY; + + return xrcd->device->dealloc_xrcd(xrcd); +} +EXPORT_SYMBOL(ib_dealloc_xrcd); + +struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device) +{ + struct ib_xrcd *xrcd; + + if (!device->alloc_xrcd) + return ERR_PTR(-ENOSYS); + + xrcd = device->alloc_xrcd(device, NULL, NULL); + if (!IS_ERR(xrcd)) { + xrcd->device = device; + xrcd->inode = NULL; + xrcd->uobject = NULL; + atomic_set(&xrcd->usecnt, 0); + } + return xrcd; +} +EXPORT_SYMBOL(ib_alloc_xrcd); + +int ib_get_eth_l2_addr(struct ib_device *device, u8 port, union ib_gid *gid, + int sgid_idx, u8 *mac, __u16 *vlan_id) +{ + if (!device->get_eth_l2_addr) + return -ENOSYS; + + return device->get_eth_l2_addr(device, port, gid, sgid_idx, mac, vlan_id); +} +EXPORT_SYMBOL(ib_get_eth_l2_addr); diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/infiniband/hw/amso1100/c2_provider.c index aeebc4d37e336..c47f618d12e89 100644 --- a/drivers/infiniband/hw/amso1100/c2_provider.c +++ b/drivers/infiniband/hw/amso1100/c2_provider.c @@ -865,7 +865,7 @@ int c2_register_device(struct c2_dev *dev) dev->ibdev.iwcm->create_listen = c2_service_create; dev->ibdev.iwcm->destroy_listen = c2_service_destroy; - ret = ib_register_device(&dev->ibdev, NULL); + ret = ib_register_device(&dev->ibdev); if (ret) goto out_free_iwcm; diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 2e2741307af4b..bbfd8b370a8d2 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1442,7 +1442,7 @@ int iwch_register_device(struct iwch_dev *dev) dev->ibdev.iwcm->rem_ref = iwch_qp_rem_ref; dev->ibdev.iwcm->get_qp = iwch_get_qp; - ret = ib_register_device(&dev->ibdev, NULL); + ret = ib_register_device(&dev->ibdev); if (ret) goto bail1; diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 5b9e4220ca08f..92187a926fb99 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -506,7 +506,7 @@ int c4iw_register_device(struct c4iw_dev *dev) dev->ibdev.iwcm->rem_ref = c4iw_qp_rem_ref; dev->ibdev.iwcm->get_qp = c4iw_get_qp; - ret = ib_register_device(&dev->ibdev, NULL); + ret = ib_register_device(&dev->ibdev); if (ret) goto bail1; diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index c240e9972cb0e..128f77c8d25a6 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -800,7 +800,7 @@ static int __devinit ehca_probe(struct platform_device *dev, goto probe5; } - ret = ib_register_device(&shca->ib_device, NULL); + ret = ib_register_device(&shca->ib_device); if (ret) { ehca_err(&shca->ib_device, "ib_register_device() failed ret=%i", ret); diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index dd7f26d04d46b..559f39be0dcc1 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -2182,7 +2182,7 @@ int ipath_register_ib_device(struct ipath_devdata *dd) snprintf(dev->node_desc, sizeof(dev->node_desc), IPATH_IDSTR " %s", init_utsname()->nodename); - ret = ib_register_device(dev, NULL); + ret = ib_register_device(dev); if (ret) goto err_reg; diff --git a/drivers/infiniband/hw/mlx4/Makefile b/drivers/infiniband/hw/mlx4/Makefile index 70f09c7826da4..cb3cfe83ec170 100644 --- a/drivers/infiniband/hw/mlx4/Makefile +++ b/drivers/infiniband/hw/mlx4/Makefile @@ -1,3 +1,5 @@ 
obj-$(CONFIG_MLX4_INFINIBAND) += mlx4_ib.o -mlx4_ib-y := ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o +mlx4_ib-y := ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o alias_GUID.o sysfs.o cm.o mcg.o +mlx4_ib-y += wc.o +mlx4_ib-y += ib_events.o diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index 4b8f9c49397e2..0f7aa3a8514f5 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -30,21 +30,25 @@ * SOFTWARE. */ +#include "mlx4_ib.h" #include -#include - -#include #include #include - -#include "mlx4_ib.h" +#include int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, u8 *mac, int *is_mcast, u8 port) { + struct mlx4_ib_iboe *iboe = &dev->iboe; struct in6_addr in6; *is_mcast = 0; + spin_lock(&iboe->lock); + if (!iboe->netdevs[port - 1]) { + spin_unlock(&iboe->lock); + return -EINVAL; + } + spin_unlock(&iboe->lock); memcpy(&in6, ah_attr->grh.dgid.raw, sizeof in6); if (rdma_link_local_addr(&in6)) @@ -61,13 +65,28 @@ int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_att static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, struct mlx4_ib_ah *ah) { - struct mlx4_dev *dev = to_mdev(pd->device)->dev; + struct mlx4_ib_dev *dev = to_mdev(pd->device); + int gid_index; + u8 ah_port_num = ah_attr->port_num; + u8 ah_sgid_index; + - ah->av.ib.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24)); + ah->av.ib.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_port_num << 24)); ah->av.ib.g_slid = ah_attr->src_path_bits; if (ah_attr->ah_flags & IB_AH_GRH) { + ah_sgid_index = ah_attr->grh.sgid_index; ah->av.ib.g_slid |= 0x80; - ah->av.ib.gid_index = ah_attr->grh.sgid_index; + if (mlx4_is_mfunc(dev->dev)) { + /* Map to function-specific gid */ + gid_index = be16_to_cpu(dev->virt2phys_gids[ah_port_num][ah_sgid_index]); + if (!is_gid_idx_valid(gid_index)) { + mlx4_ib_warn(pd->device, "cannot create ah with " + "gid %d\n", gid_index); + return ERR_PTR(-EINVAL); + } + ah->av.ib.gid_index = gid_index; + } else + ah->av.ib.gid_index = ah_sgid_index; ah->av.ib.hop_limit = ah_attr->grh.hop_limit; ah->av.ib.sl_tclass_flowlabel |= cpu_to_be32((ah_attr->grh.traffic_class << 20) | @@ -79,7 +98,7 @@ static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, if (ah_attr->static_rate) { ah->av.ib.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET; while (ah->av.ib.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET && - !(1 << ah->av.ib.stat_rate & dev->caps.stat_rate_support)) + !(1 << ah->av.ib.stat_rate & dev->dev->caps.stat_rate_support)) --ah->av.ib.stat_rate; } ah->av.ib.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28); @@ -87,12 +106,12 @@ static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, return &ah->ibah; } +/* TODO: Add support in double GUID feature */ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr, - struct mlx4_ib_ah *ah) + struct mlx4_ib_ah *ah) { struct mlx4_ib_dev *ibdev = to_mdev(pd->device); struct mlx4_dev *dev = ibdev->dev; - union ib_gid sgid; u8 mac[6]; int err; int is_mcast; @@ -103,12 +122,8 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr return ERR_PTR(err); memcpy(ah->av.eth.mac, mac, 6); - err = ib_get_cached_gid(pd->device, ah_attr->port_num, ah_attr->grh.sgid_index, &sgid); - if (err) - return ERR_PTR(err); - vlan_tag = rdma_get_vlan_id(&sgid); - if (vlan_tag < 0x1000) - vlan_tag |= (ah_attr->sl & 7) << 13; + 
vlan_tag = rdma_get_vlan_id(&ah_attr->grh.dgid); + vlan_tag |= (ah_attr->sl & 7) << 13; ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24)); ah->av.eth.gid_index = ah_attr->grh.sgid_index; ah->av.eth.vlan = cpu_to_be16(vlan_tag); @@ -140,27 +155,41 @@ struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) if (!ah) return ERR_PTR(-ENOMEM); - if (rdma_port_get_link_layer(pd->device, ah_attr->port_num) == IB_LINK_LAYER_ETHERNET) { + + ah->ex = NULL; + + /* By default, when operating in multi-function mode, the demux function's + * GSI mads are also looped back to the tunnel QP for canonical processing + * with other functions */ + ah->gsi_demux_lb = 1; + if (rdma_port_link_layer(pd->device, + (ah_attr->port_num & 0x7f)) == IB_LINK_LAYER_ETHERNET) { if (!(ah_attr->ah_flags & IB_AH_GRH)) { ret = ERR_PTR(-EINVAL); + goto out; } else { - /* - * TBD: need to handle the case when we get - * called in an atomic context and there we - * might sleep. We don't expect this - * currently since we're working with link - * local addresses which we can translate - * without going to sleep. - */ + /* TBD: need to handle the case when we get called + in an atomic context and there we might sleep. We + don't expect this currently since we're working with + link local addresses which we can translate without + going to sleep */ ret = create_iboe_ah(pd, ah_attr, ah); + if (IS_ERR(ret)) + goto out; + else + return ret; } - + } else { + ret = create_ib_ah(pd, ah_attr, ah); if (IS_ERR(ret)) - kfree(ah); + goto out; + else + return ret; + } - return ret; - } else - return create_ib_ah(pd, ah_attr, ah); /* never fails */ +out: + kfree(ah); + return ret; } int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) @@ -171,7 +200,7 @@ int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) memset(ah_attr, 0, sizeof *ah_attr); ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; ah_attr->port_num = be32_to_cpu(ah->av.ib.port_pd) >> 24; - ll = rdma_port_get_link_layer(ibah->device, ah_attr->port_num); + ll = rdma_port_link_layer(ibah->device, ah_attr->port_num); ah_attr->dlid = ll == IB_LINK_LAYER_INFINIBAND ? be16_to_cpu(ah->av.ib.dlid) : 0; if (ah->av.ib.stat_rate) ah_attr->static_rate = ah->av.ib.stat_rate - MLX4_STAT_RATE_OFFSET; @@ -194,6 +223,32 @@ int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) int mlx4_ib_destroy_ah(struct ib_ah *ah) { + if (to_mah(ah)->ex) + kfree(to_mah(ah)->ex); kfree(to_mah(ah)); return 0; } + +int mlx4_ib_get_eth_l2_addr(struct ib_device *device, u8 port, union ib_gid *dgid, + int sgid_idx, u8 *mac, u16 *vlan_id) +{ + int err; + struct mlx4_ib_dev *ibdev = to_mdev(device); + struct ib_ah_attr ah_attr = { + .port_num = port, + }; + int is_mcast; + union ib_gid sgid; + + memcpy(ah_attr.grh.dgid.raw, dgid, 16); + err = mlx4_ib_resolve_grh(ibdev, &ah_attr, mac, &is_mcast, port); + if (err) + return err; + + err = ib_get_cached_gid(device, port, sgid_idx, &sgid); + if (err) + return err; + *vlan_id = rdma_get_vlan_id(&sgid); + return 0; +} + diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c new file mode 100644 index 0000000000000..3884837f6bd57 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -0,0 +1,817 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + /***********************************************************/ +/*This file support the handling of the Alias GUID feature. */ +/***********************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mlx4_ib.h" +#include "alias_GUID.h" + +/* + +The driver keeps the current state of all guids, as they are in the HW. +Whenever smp mad for GUIDInfo record came, it will be cached. + +*/ +void update_cache_on_guid_change(struct mlx4_ib_dev *dev, int block_num, u8 port_num, u8* p_data) +{ + int i; + u64 guid_indexes; + int slave_id; + int gid_index; + __be64 tmp_cur_ag; + int port_index = port_num -1; + + if ((!mlx4_is_mfunc(dev->dev)) || (!dev->dev->caps.sqp_demux)) + return; + + if (block_num >= NUM_ALIAS_GUID_REC_IN_PORT) { + printk(KERN_ERR "Failed to update guid cache. bn %d is out of range", block_num); + return; + } + + guid_indexes = be64_to_cpu(dev->sriov.alias_guid. + ports_guid[port_num - 1]. + all_rec_per_port[block_num]. + guid_indexes); + mlx4_ib_dbg("%s:port:%d, guid_indexes: 0x%llx\n", __func__, port_num, guid_indexes); + + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + /*the location of the specific index starts from bit number 4 till bit num 11*/ + if (test_bit(i + 4, (unsigned long *)&guid_indexes)) { + gid_index = (block_num * NUM_ALIAS_GUID_IN_REC) + i; + slave_id = mlx4_gid_idx_to_slave(dev->dev, gid_index); + if (slave_id >= dev->dev->num_slaves) { + mlx4_ib_dbg("%s:The last slave: %d\n", __func__, slave_id); + goto out; + } + tmp_cur_ag = *(__be64*)&p_data[i * GUID_REC_SIZE]; + + /*cache the guid:*/ + memcpy(&dev->sriov.demux[port_index].guid_cache[gid_index], + &p_data[i * GUID_REC_SIZE], + GUID_REC_SIZE); + } + else + mlx4_ib_dbg("%s: Guid number :%d in block :%d" + " was not updated\n", + __func__, i, block_num); + } +out: + return; +} + +/* + Whenever new GUID was set/unset (guid table change) create event and + notify the relevant slave (master also should be notify) + If the GUID value is not as we have in the cache the slave will not be updated, + in this case it waits for the smp_snoop to call the function and to updatea the slave. 
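(Editor's illustrative aside.) update_cache_on_guid_change() above decodes guid_indexes by treating bits 4..11 as a per-record presence mask: record i of a GUIDInfo block is flagged by bit i + 4, and the absolute GID index is block_num * NUM_ALIAS_GUID_IN_REC + i. A small stand-alone sketch of that decoding, assuming (as the loop comments state) eight GUIDs per block and the i + 4 bit layout; the demo_* names are illustrative, not the patch's helpers:

    #include <stdint.h>
    #include <stdio.h>

    #define DEMO_GUIDS_PER_REC 8    /* NUM_ALIAS_GUID_IN_REC in the patch */

    /* Bit that covers record i of a GUIDInfo block (bits 4..11). */
    static uint64_t demo_guid_index_bit(int i)
    {
            return 1ULL << (i + 4);
    }

    static void demo_walk_changed_guids(uint64_t guid_indexes, int block_num)
    {
            int i;

            for (i = 0; i < DEMO_GUIDS_PER_REC; i++) {
                    if (!(guid_indexes & demo_guid_index_bit(i)))
                            continue;       /* this record was not touched */
                    printf("record %d of block %d -> gid index %d changed\n",
                           i, block_num, block_num * DEMO_GUIDS_PER_REC + i);
            }
    }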
+ block_number - the index of the block (16 blocks available) + port_number - 1 or 2 + + GUID change event on the master should be handled outside this function. + the return value of the function should be checked to find out wheather or not + on of the master's GUIDs was changed. + + return value: 0 - master GUID was not changed. + 1 - master GUID was changed. +*/ +int notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, int block_num, u8 port_num, u8* p_data) +{ + int i; + u64 guid_indexes; + int slave_id; + int gid_index; + int slave0_gid_changed = 0; + enum slave_port_state new_state; + enum slave_port_state prev_state; + __be64 tmp_cur_ag, form_cache_ag; + enum slave_port_gen_event gen_event; + + if ((!mlx4_is_mfunc(dev->dev)) || (!dev->dev->caps.sqp_demux)) + return 0; /* dummy value for compilation only */ + + guid_indexes = be64_to_cpu(dev->sriov.alias_guid. + ports_guid[port_num - 1]. + all_rec_per_port[block_num]. + guid_indexes); + mlx4_ib_dbg("%s:port:%d, guid_indexes: 0x%llx\n", __func__, port_num, guid_indexes); + + /*calculate the slaves and notify them*/ + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + /*the location of the specific index starts from bit number 4 till bit num 11*/ + if (test_bit(i + 4, (unsigned long *)&guid_indexes)) { + gid_index = (block_num * NUM_ALIAS_GUID_IN_REC) + i; + slave_id = mlx4_gid_idx_to_slave(dev->dev, gid_index); + + tmp_cur_ag = *(__be64*)&p_data[i * GUID_REC_SIZE]; + form_cache_ag = get_cached_alias_guid(dev, port_num, gid_index); + + if (slave_id >= dev->dev->num_slaves) { + mlx4_ib_dbg("%s:The last slave: %d\n", __func__, slave_id); + goto out; + /* GID change for slave 0 will be handled outside this function */ + } else if (slave_id == 0) { + mlx4_ib_dbg("%s: GID change event on gid %d of slave0\n", + __func__, gid_index); + if (tmp_cur_ag != form_cache_ag) + slave0_gid_changed = 1; + continue; + } + + /*check if guid is not the same as in the cache, and notify slaves.*/ + if (tmp_cur_ag != form_cache_ag) { + mlx4_ib_dbg("%s: (tmp_cur_ag: 0x%llx, form_cache_ag: 0x%llx) notifing relevant slaves...\n", + __func__, be64_to_cpu(tmp_cur_ag), be64_to_cpu(form_cache_ag)); + mlx4_gen_guid_change_eqe(dev->dev, slave_id, port_num); + } + + /* The GID at index 0 controls the state of the port - + * when it is invalid the port is considered to be down. 
+ * No need to further act on GIDs at other indexes */ + if (ACT_GID_TO_SLAVE_GID(dev->dev, gid_index) != 0) + continue; + + /*2 cases: Valid GUID, and Invalid Guid*/ + if (MLX4_NOT_SET_GUID != tmp_cur_ag) { /*valid GUID*/ + prev_state = mlx4_get_slave_port_state(dev->dev, slave_id, port_num); + new_state = set_and_calc_slave_port_state(dev->dev, slave_id, port_num, + MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID, + &gen_event); + mlx4_ib_dbg("%s: slave: %d, port:%d prev_port_state: %d," + " new_port_state: %d, gen_event :%d\n", + __func__, slave_id, port_num, prev_state, + new_state, gen_event); + if (SLAVE_PORT_GEN_EVENT_UP == gen_event) { + mlx4_ib_dbg("%s: sending PORT_UP event to slave: %d, port:%d\n", + __func__, slave_id, port_num); + mlx4_gen_port_state_change_eqe(dev->dev, slave_id, + port_num, + MLX4_PORT_CHANGE_SUBTYPE_ACTIVE); + } else { + mlx4_ib_dbg("%s: GOT: %d event to slave: %d, port:%d\n", + __func__, gen_event, slave_id, port_num); + } + } + else { /*Invalidate GUID*/ + set_and_calc_slave_port_state(dev->dev, + slave_id, + port_num, + MLX4_PORT_STATE_IB_EVENT_GID_INVALID, + &gen_event); + mlx4_ib_dbg("%s: sending MLX4_PORT_STATE_IB_EVENT_GID_INVALID" + " event to slave: %d, port:%d [got gen_event: %d]\n", + __func__, slave_id, port_num, gen_event); + mlx4_gen_port_state_change_eqe(dev->dev, slave_id, + port_num, + MLX4_PORT_CHANGE_SUBTYPE_DOWN); + } + } + else + mlx4_ib_dbg("%s: Guid number :%d in block :%d" + " was not updated\n", + __func__, i, block_num); + } +out: + return slave0_gid_changed; + +} +/**************************************************************************** +* aliasguid_query_handler : callback function whenever we have success/failure/timeout +******************************************************************************/ +static void aliasguid_query_handler(int status, + struct ib_sa_guidinfo_rec *guid_rec, + void *context) +{ + struct mlx4_ib_dev *dev; + struct mlx4_alias_guid_work_context *cb_ctx = context; + u8 port_index; + int i; + struct mlx4_sriov_alias_guid_info_rec_det *rec; + + /*ib_sa_comp_mask comp_mask = 0;*/ + unsigned long flags, flags1; + + if (!context) { + printk(KERN_ERR "alias_guid: context is null. 
This is a BUG!!!\n"); + return; + } + + dev = cb_ctx->dev; + port_index = cb_ctx->port - 1; + rec = &dev->sriov.alias_guid.ports_guid[port_index].all_rec_per_port[cb_ctx->block_num]; + if (status) { + rec->status = MLX4_GUID_INFO_STATUS_IDLE; + mlx4_ib_dbg("%s: (port: %d) failed: status = %d\n", + __func__, cb_ctx->port, status); + } else { + if (guid_rec->block_num == cb_ctx->block_num) { + mlx4_ib_dbg("%s: lid/port: %d/%d, block_num: %d", __func__, + be16_to_cpu(guid_rec->lid), cb_ctx->port, guid_rec->block_num); + + rec = &dev->sriov.alias_guid.ports_guid[port_index].all_rec_per_port[guid_rec->block_num]; + /*update the status on the adminstratively records*/ + rec->status = MLX4_GUID_INFO_STATUS_SET; + /*update metod to be set (default)*/ + rec->method = MLX4_GUID_INFO_RECORD_SET; + /*rec->guid_indexes = comp_mask;*/ + + for (i = 0 ; i < NUM_ALIAS_GUID_IN_REC; i++){ + __be64 tmp_cur_ag; + tmp_cur_ag = guid_rec->guid_info_list[i]; + if (cb_ctx->method == MLX4_GUID_INFO_RECORD_DELETE) { + if (MLX4_NOT_SET_GUID == tmp_cur_ag) { + mlx4_ib_dbg("%s:Record num %d in block_num:%d was deleted by SM, " + "ownership by %d (0 = driver, 1=sysAdmin, 2=None)\n", + __func__, i, guid_rec->block_num, rec->ownership); + } else { + /* FIXME : in case of record wasn't deleted we only print an error + we can't reschedule the task since the next task can be a set and + not delete task.*/ + mlx4_ib_dbg("ERROR: %s:Record num %d in block_num:%d was Not deleted " + "by SM, ownership by %d (0 = driver, 1=sysAdmin, 2=None)\n", + __func__, i, guid_rec->block_num, rec->ownership); + } + /* turn OFF the block index bit so it won't be modified in next tasks */ + rec->guid_indexes = rec->guid_indexes & ~get_alias_guid_comp_mask_from_index(i); + continue; + } + + /* + check if the SM didn't assign one of the records. + if it didn't, if it was not sysadmin request: + asks the SM to give a new GUID, (instead of the driver request). 
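(Editor's illustrative aside.) The branch that follows implements the reconciliation rule spelled out in this comment: a record the SM left unset is cleared and re-queued only when the driver, not the sysadmin, owns it; a sysadmin value the SM refused is kept locally; anything else is accepted from the SM's reply. A compact stand-alone restatement of that decision (the demo_* enum, fields, and the zero "not set" value are stand-ins for the patch's record structures, not its actual definitions):

    #include <stdint.h>
    #include <stdbool.h>

    #define DEMO_NOT_SET_GUID 0ULL          /* stand-in for MLX4_NOT_SET_GUID */

    enum demo_ownership { DEMO_DRIVER_ASSIGN, DEMO_SYSADMIN_ASSIGN, DEMO_NONE_ASSIGN };

    struct demo_rec {
            uint64_t admin_guid;            /* value we asked the SM to set */
            enum demo_ownership ownership;
            bool retry;                     /* record must be re-sent to the SM */
    };

    static void demo_reconcile(struct demo_rec *rec, uint64_t sm_guid)
    {
            if (sm_guid == DEMO_NOT_SET_GUID) {
                    /* SM declined: only a driver-owned record is re-requested. */
                    if (rec->ownership == DEMO_DRIVER_ASSIGN) {
                            rec->admin_guid = DEMO_NOT_SET_GUID;
                            rec->retry = true;
                    }
            } else if (rec->ownership == DEMO_SYSADMIN_ASSIGN &&
                       sm_guid != rec->admin_guid) {
                    /* Sysadmin value was rejected by the SM: keep the local value. */
            } else {
                    rec->admin_guid = sm_guid;      /* accept what the SM assigned */
            }
    }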
+ */ + if (MLX4_NOT_SET_GUID == tmp_cur_ag) { + mlx4_ib_dbg("%s:Record num %d in block_num:%d was declined by SM, " + "ownership by %d (0 = driver, 1=sysAdmin, 2=None)\n", + __func__, i, guid_rec->block_num,rec->ownership); + if (MLX4_GUID_DRIVER_ASSIGN == rec->ownership) { + /*if it is driver assign, asks for new GUID from SM*/ + rec->all_recs[i] = MLX4_NOT_SET_GUID; + /*Mark the record as it wasn't assined, and let it to be sent again + in the next work sched.*/ + rec->status = MLX4_GUID_INFO_STATUS_IDLE; + rec->guid_indexes = rec->guid_indexes | get_alias_guid_comp_mask_from_index(i); + } + } + else { /*properly assigned record*/ + /*We save the GUID we just got from the SM in the admin_guid in order to be + persistance, and in the request from the sm the process will ask for the same GUID */ + if (MLX4_GUID_SYSADMIN_ASSIGN == rec->ownership && + tmp_cur_ag != rec->all_recs[i]) { + /*the case the sysadmin assignment failed.*/ + mlx4_ib_dbg("%s: Failed to set admin guid after SysAdmin configuration " + "Record num %d in block_num:%d was declined by SM " + "new val(0x%llx) was kept\n", + __func__, i, guid_rec->block_num, + be64_to_cpu(rec->all_recs[i])); + } else + rec->all_recs[i] = guid_rec->guid_info_list[i]; + } + } + } else + printk(KERN_ERR "block num mismatch: %d != %d", + cb_ctx->block_num, guid_rec->block_num); + } + + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + if (!dev->sriov.is_going_down) + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port_index].wq, + &dev->sriov.alias_guid.ports_guid[port_index].alias_guid_work, 0); + if (cb_ctx->sa_query) { + list_del(&cb_ctx->list); + kfree(cb_ctx); + } else + complete(&cb_ctx->done); + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +static void invalidate_guid_record(struct mlx4_ib_dev *dev, u8 port, int index) +{ + int i; + __be64 cur_admin_val; + ib_sa_comp_mask comp_mask = 0; + + dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].status + = MLX4_GUID_INFO_STATUS_IDLE; + dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].method + = MLX4_GUID_INFO_RECORD_SET; + + /* calculate the comp_mask for that record.*/ + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + cur_admin_val = dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].all_recs[i]; + /* + check the admin value: if it for delete (~00LL) or + we are in the first guid (hw guid)dont put it for assigment or + the records isnot in ownership of he sysadmin and the sm doesn't + need to assign GUIDs. 
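+ (i.e. skip the entry when its admin value is the delete sentinel
+ MLX4_GUID_FOR_DELETE_VAL, when it is the read-only HW GUID at index 0 of
+ record 0, or when the record ownership is MLX4_GUID_NONE_ASSIGN)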
+ */ + if (MLX4_GUID_FOR_DELETE_VAL == cur_admin_val || + (!index && i == 0) || + MLX4_GUID_NONE_ASSIGN == dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].ownership) + continue; + + comp_mask = comp_mask | get_alias_guid_comp_mask_from_index(i); + } + dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].guid_indexes = comp_mask; +} + +static int mlx4_ib_set_guid_rec(struct ib_device *ibdev, + u8 port, int index, + struct mlx4_sriov_alias_guid_info_rec_det *rec_det) +{ + int err; + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_sa_guidinfo_rec guid_info_rec; + ib_sa_comp_mask comp_mask; + struct ib_port_attr attr; + struct mlx4_alias_guid_work_context *callback_context; + unsigned long resched_delay, flags, flags1; + struct list_head *head = &dev->sriov.alias_guid.ports_guid[port - 1].cb_list; + + err = mlx4_ib_query_port(ibdev, port, &attr); + if (err) { + mlx4_ib_dbg( "failed to mlx4_ib_query_port (err:%d), port:%d !!!\n", + err, port); + return err; + } + /*check the port was configured by the sm, otherwise no need to send */ + if (attr.state != IB_PORT_ACTIVE) { + mlx4_ib_dbg("port: %d not active...rescheduling", port); + resched_delay = 5 * HZ; + err = -EAGAIN; + goto new_schedule; + } + + callback_context = kmalloc(sizeof *callback_context, GFP_KERNEL); + if (!callback_context) { + err = -ENOMEM; + mlx4_ib_dbg("mlx4_ib_set_guid_rec: no Mem\n"); + resched_delay = HZ * 5; + goto new_schedule; + } + callback_context->port = port; + callback_context->dev = dev; + callback_context->block_num = index; + callback_context->method = rec_det->method; + + memset(&guid_info_rec, 0, sizeof guid_info_rec); + + guid_info_rec.lid = cpu_to_be16(attr.lid); + guid_info_rec.block_num = index; + + memcpy(guid_info_rec.guid_info_list, rec_det->all_recs, sizeof rec_det->all_recs); + comp_mask = IB_SA_GUIDINFO_REC_LID | + IB_SA_GUIDINFO_REC_BLOCK_NUM | + rec_det->guid_indexes; + + init_completion(&callback_context->done); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + list_add_tail(&callback_context->list, head); + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + + callback_context->query_id = ib_sa_guid_info_rec_query(&dev->sriov.alias_guid.sa_client, ibdev, port, + &guid_info_rec, comp_mask, + rec_det->method, 1000/*timeout*/, + GFP_KERNEL, + aliasguid_query_handler, callback_context, + &callback_context->sa_query); + if (callback_context->query_id < 0) { + mlx4_ib_dbg("mlx4_ib_set_guid_rec: failed to ib_sa_guid_info_rec_query," + "query_id: %d will reschedule to the next 1 sec.\n", callback_context->query_id); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + list_del(&callback_context->list); + kfree(callback_context); + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + resched_delay = 1 * HZ; + err = -EAGAIN; + goto new_schedule; + } + err = 0; + goto out; + +new_schedule: + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + invalidate_guid_record(dev, port, index); + if (!dev->sriov.is_going_down) { + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq, + &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work, + resched_delay); + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); + +out: + return err; +} + +void invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port) +{ + int i; + unsigned long flags, 
flags1; + + mlx4_ib_dbg("%s: port %d", __func__, port); + + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + for (i = 0; i < NUM_ALIAS_GUID_REC_IN_PORT; i++) + invalidate_guid_record(dev, port, i); + + if ((!mlx4_is_mfunc(dev->dev)) || (!dev->dev->caps.sqp_demux)) + goto out; + if (!dev->sriov.is_going_down) { + /* + make sure no work waits in the queue, if the work is already queued(not on the timer) + the cancel will faild, it is not a problem because that is excactly what we want, + the work started.. + */ + __cancel_delayed_work(&dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work); + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq, + &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work, + 0); + } +out: + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} +/* The function returns the next record that was not configured (or failed to configured)*/ +int get_next_record_to_update(struct mlx4_ib_dev *dev, u8 port, struct mlx4_next_alias_guid_work *rec) +{ + int j; + unsigned long flags; + + for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++ ) { + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags); + if (dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j].status == MLX4_GUID_INFO_STATUS_IDLE) { + memcpy(&rec->rec_det, &dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j], + sizeof(struct mlx4_sriov_alias_guid_info_rec_det)); + rec->port = port; + rec->block_num = j; + dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j].status = MLX4_GUID_INFO_STATUS_PENDING; + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); + return 0; + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); + } + mlx4_ib_dbg("no more work to do"); + return -ENOENT; +} + +void set_administratively_guid_record(struct mlx4_ib_dev *dev, int port, + int rec_index, + struct mlx4_sriov_alias_guid_info_rec_det *rec_det) +{ + dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].guid_indexes = + rec_det->guid_indexes; + memcpy(dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].all_recs, + rec_det->all_recs, + sizeof rec_det->all_recs); + dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].status = + rec_det->status; +} + +int mlx4_ib_set_all_slaves_guids(struct mlx4_ib_dev *dev, int port) +{ + int j; + int is_first_rec = 1; /* The first guid in the first rec is RO */ + struct mlx4_sriov_alias_guid_info_rec_det rec_det ; + + for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT ; j++) { + memset(rec_det.all_recs, 0, sizeof rec_det.all_recs); + rec_det.guid_indexes = (is_first_rec ? 
0 :IB_SA_COMPMASK_GID0) | + IB_SA_COMPMASK_GID1 | IB_SA_COMPMASK_GID2 | + IB_SA_COMPMASK_GID3 | IB_SA_COMPMASK_GID4 | + IB_SA_COMPMASK_GID5 | IB_SA_COMPMASK_GID6 | + IB_SA_COMPMASK_GID7; + rec_det.status = MLX4_GUID_INFO_STATUS_IDLE; + is_first_rec = 0; + set_administratively_guid_record(dev, port, j, &rec_det); + } + is_first_rec = 1; + return 0; +} + +ib_sa_comp_mask get_alias_guid_comp_mask_from_index(int index) +{ + return IB_SA_COMP_MASK(4 + index); +} + +int mlx4_ib_process_get_response_set_GUID(struct ib_device *ibdev, + u8 port_num, struct ib_mad *in_mad) +{ + mlx4_ib_dbg("processing GETRESP"); + return 0; +} + +static void alias_guid_work(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + int ret = 0; + struct mlx4_next_alias_guid_work *rec; + struct mlx4_sriov_alias_guid_port_rec_det *sriov_alias_port = + container_of(delay, struct mlx4_sriov_alias_guid_port_rec_det, + alias_guid_work); + struct mlx4_sriov_alias_guid *sriov_alias_guid = sriov_alias_port->parent; + struct mlx4_ib_sriov *ib_sriov = container_of(sriov_alias_guid, + struct mlx4_ib_sriov, + alias_guid); + struct mlx4_ib_dev *dev = container_of(ib_sriov, struct mlx4_ib_dev, sriov); + + rec = kzalloc(sizeof *rec, GFP_KERNEL); + if (!rec) { + printk(KERN_ERR "alias_guid_work: No Memory\n"); + return; + } + + mlx4_ib_dbg("starting [port: %d]...", sriov_alias_port->port + 1); + ret = get_next_record_to_update(dev, sriov_alias_port->port, rec); + if (ret) { + mlx4_ib_dbg("No more records to update."); + goto out; + } + + mlx4_ib_set_guid_rec(&dev->ib_dev, rec->port + 1, rec->block_num, + &rec->rec_det); + +out: + kfree(rec); +} + + +int init_alias_guid_work(struct mlx4_ib_dev *dev, int port) +{ + unsigned long flags, flags1; + if ((!mlx4_is_mfunc(dev->dev)) || (!dev->dev->caps.sqp_demux)) + return 0; + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + if (!dev->sriov.is_going_down) { + queue_delayed_work(dev->sriov.alias_guid.ports_guid[port].wq, + &dev->sriov.alias_guid.ports_guid[port].alias_guid_work, 0); + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); + mlx4_ib_dbg("queue work for port: %d", port); + return 0; +} + +/*new function for Oracle only: driver setting the even GUIDs*/ +/* + * generate the GUID using the following formula: + * change the fourth byte to be: the GUID index in the port GUID table. 
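+ * (byte index 4 of the 8-byte GUID, i.e. the byte right after the
+ *  00:02:C9:03 prefix; the code below also adds the mlx4_ib_guid_gen_magic
+ *  offset to that byte)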
+ * For example: + * 00:02:C9:03:YY:XX:XX:XX + * Where: + * 00:02:C9:03 - Mellanox prefix GUID + * YY - is the GUID index in the GUID table + * XX:XX:XX - rest of the original GUID + */ +__be64 get_generated_guid(struct mlx4_ib_dev *dev, int port_num, int record_index, int guid_index_in_rec) +{ + static union ib_gid gid = {.raw={0}}; + __be64 gen_guid = 0; + static int queried_port = 1; + + /* if the gid of this port was not already queried - + query and act accordingly */ + if ((!gid.global.interface_id || (queried_port != port_num)) && + dev->ib_dev.query_gid(&dev->ib_dev, port_num, 0, &gid)) + goto exit; + + queried_port = port_num; + gen_guid = gid.global.interface_id; + ((u8 *)(&gen_guid))[4] = record_index * NUM_ALIAS_GUID_IN_REC + + guid_index_in_rec + mlx4_ib_guid_gen_magic; + + mlx4_ib_dbg("record: %d, index:%d, port_guid: 0x%llx got: 0x%llx", + record_index, guid_index_in_rec, gid.global.interface_id, gen_guid); + +exit: + return gen_guid; +} + +void clear_alias_guid_work(struct mlx4_ib_dev *dev) +{ + int i; + struct mlx4_ib_sriov *sriov = &dev->sriov; + struct mlx4_alias_guid_work_context *cb_ctx; + struct mlx4_sriov_alias_guid_port_rec_det *det; + struct ib_sa_query *sa_query; + unsigned long flags; + + for (i = 0 ; i < MLX4_MAX_PORTS; i++) { + cancel_delayed_work(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work); + det = &sriov->alias_guid.ports_guid[i]; + spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags); + while(!list_empty(&det->cb_list)) { + cb_ctx = list_entry(det->cb_list.next, + struct mlx4_alias_guid_work_context, + list); + sa_query = cb_ctx->sa_query; + cb_ctx->sa_query = NULL; + list_del(&cb_ctx->list); + spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags); + ib_sa_cancel_query(cb_ctx->query_id, sa_query); + wait_for_completion(&cb_ctx->done); + kfree(cb_ctx); + spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags); + } + spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags); + } + for (i = 0 ; i < MLX4_MAX_PORTS; i++) { + /*force flush anyway.*/ + flush_workqueue(dev->sriov.alias_guid.ports_guid[i].wq); + destroy_workqueue(dev->sriov.alias_guid.ports_guid[i].wq); + } + ib_sa_unregister_client(&dev->sriov.alias_guid.sa_client); +} + +int init_alias_guid_service(struct mlx4_ib_dev *dev) +{ + char alias_wq_name[15]; + int ret = 0; + int i, j, k; + int curr_gid; + int slave_gid_idx; + struct mlx4_sriov_alias_guid *ag; + struct mlx4_sriov_alias_guid_port_rec_det *pg; + __be64 gen_guid; + + if ((!mlx4_is_mfunc(dev->dev)) || (!dev->dev->caps.sqp_demux)) + return 0; + + ag = &dev->sriov.alias_guid; + ib_sa_register_client(&ag->sa_client); + + spin_lock_init(&ag->ag_work_lock); + + for (i = 0 ; i < MLX4_MAX_PORTS; ++i) { + pg = &ag->ports_guid[i]; + INIT_LIST_HEAD(&pg->cb_list); + /* Check if the SM doesn't need to assign the GUIDs */ + for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; ++j) { + if (mlx4_ib_sm_guid_assign) + pg->all_rec_per_port[j].ownership = MLX4_GUID_DRIVER_ASSIGN; + else { + pg->all_rec_per_port[j].ownership = MLX4_GUID_SYSADMIN_ASSIGN; + + /* mark each val as it was deleted, till the sysAdmin will give it valid val */ + for (k = 0; k < NUM_ALIAS_GUID_IN_REC; ++k) { + /* Oracle request for guid-0 driver assignment: + all GUIDs at index 0 and Dom0 GUID-1 */ + curr_gid = j * NUM_ALIAS_GUID_IN_REC + k; + slave_gid_idx = ACT_GID_TO_SLAVE_GID(dev->dev, curr_gid); + if (slave_gid_idx == 0 || + (slave_gid_idx == 1 && + mlx4_gid_idx_to_slave(dev->dev, curr_gid) == 0)) { + gen_guid = get_generated_guid(dev, i + 1, j, k); + if 
(!gen_guid) { + ret = -EINVAL; + goto err; + } + } else + gen_guid = MLX4_GUID_FOR_DELETE_VAL; + + pg->all_rec_per_port[j].all_recs[k] = gen_guid; + } + } + + /* prepare the records, set them to be allocated by sm */ + invalidate_guid_record(dev, i + 1, j); + } + + pg->parent = ag; + pg->port = i; + if (mlx4_ib_sm_guid_assign) + mlx4_ib_set_all_slaves_guids(dev, i); + + snprintf(alias_wq_name, sizeof alias_wq_name, "alias_guid%d", i); + pg->wq = + create_singlethread_workqueue(alias_wq_name); + if (!pg->wq) { + ret = -ENOMEM; + goto err; + } + INIT_DELAYED_WORK(&pg->alias_guid_work, alias_guid_work); + } + return 0; +err: + printk(KERN_ERR "init_alias_guid_service: Failed. (ret:%d)\n", ret); + return ret; +} + +static void init_query_mad(struct ib_smp *mad) +{ + mad->base_version = 1; + mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + mad->class_version = 1; + mad->method = IB_MGMT_METHOD_GET; +} + +int mlx4_ib_get_indexed_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + struct mlx4_ib_dev *dev = to_mdev(ibdev); + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; + in_mad->attr_mod = cpu_to_be32(port); + + err = mlx4_MAD_IFC(dev, 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + + memcpy(gid->raw, out_mad->data + 8, 8); + + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(index / 8); + + err = mlx4_MAD_IFC(dev, 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); +out: + kfree(in_mad); + kfree(out_mad); + return err; +} + +__be64 get_cached_alias_guid(struct mlx4_ib_dev *dev, int port, int index) +{ + __be64 cur_admin_val; + + if (index >= NUM_ALIAS_GUID_PER_PORT) { + printk(KERN_ERR "%s: BUG: asked for index:%d\n", __func__, index); + return -1; + } + cur_admin_val = *(__be64*)&dev->sriov.demux[port - 1].guid_cache[index]; + +/* cur_admin_val = *(__be64*)&dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[record_num].all_recs[GUID_REC_SIZE * guid_index_in_rec]; +*/ + return cur_admin_val; +} + +enum mlx4_guid_alias_rec_status get_record_status(struct mlx4_ib_dev *dev, int port, int index) +{ + int record_num; + + record_num = index / 8; + if (record_num >= NUM_ALIAS_GUID_REC_IN_PORT) { + printk(KERN_ERR "%s: BUG: asked for index:%d (record:%d)\n", __func__, index, record_num); + return MLX4_GUID_INFO_STATUS_IDLE; + } + return dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[record_num].status; +} diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.h b/drivers/infiniband/hw/mlx4/alias_GUID.h new file mode 100644 index 0000000000000..82479ecee5fcc --- /dev/null +++ b/drivers/infiniband/hw/mlx4/alias_GUID.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + /***********************************************************/ +/*This file support the handling of the Alias GUID feature. */ +/***********************************************************/ +#ifndef MLX4_ALIAS_GUID_H +#define MLX4_ALIAS_GUID_H + +#include +#include +#include +#include + +#include +#include +#include +#include +#include "mlx4_ib.h" + +#define MLX4_PORT_DOWN_WAIT_TIME (HZ * 10) +#define MLX4_GUID_FOR_DELETE_VAL cpu_to_be64(~0ULL) + +enum mlx4_guid_alias_rec_method { + MLX4_GUID_INFO_RECORD_SET = IB_MGMT_METHOD_SET, + MLX4_GUID_INFO_RECORD_DELETE = IB_SA_METHOD_DELETE, +}; + +/*work completion status */ +enum guid_alias_status { + MLX4_PORT_NOT_CONFIGURED = 1971, +}; + +/*structures*/ + +struct mlx4_alias_guid_work_context { + u8 port; + struct mlx4_ib_dev *dev ; + struct ib_sa_query *sa_query; + struct completion done; + int query_id; + struct list_head list; + int block_num; + u8 method; +}; + +struct mlx4_next_alias_guid_work { + u8 port; + u8 block_num; + struct mlx4_sriov_alias_guid_info_rec_det rec_det; +}; + +/*Functions*/ + +/*init work for port, send the (port_num - 1) for port number*/ +int init_alias_guid_work(struct mlx4_ib_dev *dev, int port); + +void clear_alias_guid_work(struct mlx4_ib_dev *dev); + +int init_alias_guid_service(struct mlx4_ib_dev *dev); + +/*When ever you want all the record to be assign*/ +void invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port); + +/*sysfs function:*/ +int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *device) ; + +void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device); + +int mlx4_ib_get_indexed_gid(struct ib_device *ibdev, u8 port, int index, + union ib_gid *gid); + +ib_sa_comp_mask get_alias_guid_comp_mask_from_index(int index); + +int notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, int block_num, + u8 port_num, u8* p_data); + +void update_cache_on_guid_change(struct mlx4_ib_dev *dev, int block_num, u8 port_num, u8* p_data); + +__be64 get_cached_alias_guid(struct mlx4_ib_dev *dev, int port, int index); + +enum mlx4_guid_alias_rec_status get_record_status(struct mlx4_ib_dev *dev, + int port, int index); +#endif /*MLX4_ALIAS_GUID_H*/ diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c new file mode 100644 index 0000000000000..9430c2f01c678 --- 
/dev/null +++ b/drivers/infiniband/hw/mlx4/cm.c @@ -0,0 +1,538 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include +#include +#include + +#include "mlx4_ib.h" + +#ifndef DEBUG +#define TRACE(format, arg...) mlx4_ib_dbg(format, ## arg) +#else +#define TRACE(format, arg...) \ + do { \ + printk("%30s:%d - " format, __func__, __LINE__, ## arg); \ + } while (0) +#endif +#define CM_CLEANUP_CACHE_TIMEOUT ( 5 * HZ ) + +struct id_map_entry { + struct rb_node node; + + u32 sl_cm_id; + u32 pv_cm_id; + int slave_id; + int scheduled_delete; + struct mlx4_ib_dev *dev; + + struct list_head list; + struct delayed_work timeout; +}; + +struct cm_generic_msg { + struct ib_mad_hdr hdr; + + __be32 local_comm_id; + __be32 remote_comm_id; +}; + +struct cm_req_msg { + unsigned char unused[0x60]; + union ib_gid primary_path_sgid; +}; + +#define CM_REQ_ATTR_ID cpu_to_be16(0x0010) +#define CM_MRA_ATTR_ID cpu_to_be16(0x0011) +#define CM_REJ_ATTR_ID cpu_to_be16(0x0012) +#define CM_REP_ATTR_ID cpu_to_be16(0x0013) +#define CM_RTU_ATTR_ID cpu_to_be16(0x0014) +#define CM_DREQ_ATTR_ID cpu_to_be16(0x0015) +#define CM_DREP_ATTR_ID cpu_to_be16(0x0016) +#define CM_SIDR_REQ_ATTR_ID cpu_to_be16(0x0017) +#define CM_SIDR_REP_ATTR_ID cpu_to_be16(0x0018) +#define CM_LAP_ATTR_ID cpu_to_be16(0x0019) +#define CM_APR_ATTR_ID cpu_to_be16(0x001A) + +#define CODE2STR(__code) { __code, #__code } +static const char *attr2str(int code) +{ + int i; + struct { + int code; + const char *str; + } code2str[] = { + CODE2STR(CM_REQ_ATTR_ID), + CODE2STR(CM_MRA_ATTR_ID), + CODE2STR(CM_REJ_ATTR_ID), + CODE2STR(CM_REP_ATTR_ID), + CODE2STR(CM_RTU_ATTR_ID), + CODE2STR(CM_DREQ_ATTR_ID), + CODE2STR(CM_DREP_ATTR_ID), + CODE2STR(CM_SIDR_REQ_ATTR_ID), + CODE2STR(CM_SIDR_REP_ATTR_ID), + CODE2STR(CM_LAP_ATTR_ID), + CODE2STR(CM_APR_ATTR_ID), + }; + + for (i = 0; i < ARRAY_SIZE(code2str); i++) { + if (code2str[i].code == code) + return code2str[i].str; + } + + return "Unknown"; +} + +static void set_local_comm_id(struct ib_mad *mad, u32 cm_id) +{ + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + msg->local_comm_id = cpu_to_be32(cm_id); +} + +static u32 get_local_comm_id(struct ib_mad *mad) +{ + 
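+	/*
+	 * struct cm_generic_msg simply overlays the raw MAD: the two __be32
+	 * communication IDs sit right after the common MAD header, so the CM
+	 * paravirtualization code can read and rewrite them in place.
+	 */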
struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + + return be32_to_cpu(msg->local_comm_id); +} + +static void set_remote_comm_id(struct ib_mad *mad, u32 cm_id) +{ + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + msg->remote_comm_id = cpu_to_be32(cm_id); + //TRACE("Replacing cm_id\n"); +} + +static u32 get_remote_comm_id(struct ib_mad *mad) +{ + struct cm_generic_msg *msg = (struct cm_generic_msg *)mad; + + return be32_to_cpu(msg->remote_comm_id); +} + +static union ib_gid gid_from_req_msg(struct ib_device *ibdev, struct ib_mad *mad) +{ + struct cm_req_msg *msg = (struct cm_req_msg *)mad; + + return msg->primary_path_sgid; +} + +/* Lock should be taken before called */ +static struct id_map_entry * +id_map_find_by_sl_id(struct ib_device *ibdev, u32 slave_id, u32 sl_cm_id) +{ + struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map; + struct rb_node *node = sl_id_map->rb_node; + + //TRACE("looking id for {slave: %d, sl_cm_id: 0x%x}\n", +// slave_id, sl_cm_id); + while (node) + { + struct id_map_entry *id_map_entry = + rb_entry(node, struct id_map_entry, node); + + if (id_map_entry->sl_cm_id > sl_cm_id) + node = node->rb_left; + else if (id_map_entry->sl_cm_id < sl_cm_id) + node = node->rb_right; + else if (id_map_entry->slave_id > slave_id) + node = node->rb_left; + else if (id_map_entry->slave_id < slave_id) + node = node->rb_right; + else { + //TRACE("Found id\n"); + return id_map_entry; + } + } + //TRACE("Couldn't find id\n"); + return NULL; +} + +static void id_map_ent_timeout(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + struct id_map_entry *ent = container_of(delay, struct id_map_entry, timeout); + struct id_map_entry *db_ent, *found_ent; + struct mlx4_ib_dev *dev = ent->dev; + struct mlx4_ib_sriov *sriov = &dev->sriov; + struct rb_root *sl_id_map = &sriov->sl_id_map; + int pv_id = (int) ent->pv_cm_id; + + spin_lock(&sriov->id_map_lock); + db_ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_id); + if (db_ent) { + TRACE("timeout cleanup: id[pv_cm_id: 0x%x] = " + "{slave_id: %d, sl_cm_id: 0x%x}\n", + pv_id, ent->slave_id, ent->sl_cm_id); + } else { + TRACE("timeout cleanup: No entry for pv_cm_id 0x%x\n", pv_id); + goto out; + } + found_ent = id_map_find_by_sl_id(&dev->ib_dev, ent->slave_id, ent->sl_cm_id); + if (found_ent && found_ent == ent) + rb_erase(&found_ent->node, sl_id_map); + idr_remove(&sriov->pv_id_table, pv_id); + +out: + list_del(&ent->list); + spin_unlock(&sriov->id_map_lock); + TRACE("Freeing ent [pv_cm_id: 0x%x]\n", pv_id); + kfree(ent); +} + +static void id_map_find_del(struct ib_device *ibdev, int pv_cm_id) +{ + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + struct rb_root *sl_id_map = &sriov->sl_id_map; + struct id_map_entry *ent, *found_ent; + + spin_lock(&sriov->id_map_lock); + ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_cm_id); + if (ent) { + TRACE("id[pv_cm_id: 0x%x] = {slave_id: %d, sl_cm_id: 0x%x}\n", + pv_cm_id, ent->slave_id, ent->sl_cm_id); + } else { + TRACE("No entry for pv_cm_id 0x%x\n", pv_cm_id); + goto out; + } + found_ent = id_map_find_by_sl_id(ibdev, ent->slave_id, ent->sl_cm_id); + if (found_ent && found_ent == ent) + rb_erase(&found_ent->node, sl_id_map); + idr_remove(&sriov->pv_id_table, pv_cm_id); +out: + spin_unlock(&sriov->id_map_lock); +} + +static void sl_id_map_add(struct ib_device *ibdev, struct id_map_entry *new) +{ + struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map; + struct rb_node **link = &sl_id_map->rb_node, *parent 
= NULL; + struct id_map_entry *ent; + int slave_id = new->slave_id; + int sl_cm_id = new->sl_cm_id; + + //TRACE("Storing slave_id: %d, sl_cm_id: 0x%x\n", slave_id, sl_cm_id); + ent = id_map_find_by_sl_id(ibdev, slave_id, sl_cm_id); + if (ent) { + mlx4_ib_dbg("overriding existing sl_id_map entry (cm_id = %x)", + sl_cm_id); + + rb_replace_node(&ent->node, &new->node, sl_id_map); + //id_map_entry_free(ibdev, ent); + + return; + } + + /* Go to the bottom of the tree */ + while (*link) + { + struct id_map_entry *ent; + + parent = *link; + ent = rb_entry(parent, struct id_map_entry, node); + + if (ent->sl_cm_id > sl_cm_id || (ent->sl_cm_id == sl_cm_id && ent->slave_id > slave_id)) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, sl_id_map); +} + +static struct id_map_entry * +id_map_alloc(struct ib_device *ibdev, int slave_id, u32 sl_cm_id) +{ + int ret, id; + static int next_id; + struct id_map_entry *ent; + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + + //TRACE("Allocating new id\n"); + ent = kmalloc(sizeof(struct id_map_entry), GFP_KERNEL); + if (!ent) { + mlx4_ib_warn(ibdev, "Couldn't allocate id cache entry - out of memory\n"); + return ERR_PTR(-ENOMEM); + } + + ent->sl_cm_id = sl_cm_id; + ent->slave_id = slave_id; + ent->scheduled_delete = 0; + ent->dev = to_mdev(ibdev); + INIT_DELAYED_WORK(&ent->timeout, id_map_ent_timeout); + + do { + spin_lock(&to_mdev(ibdev)->sriov.id_map_lock); + ret = idr_get_new_above(&sriov->pv_id_table, ent, + next_id, &id); + if (!ret) { + next_id = ((unsigned) id + 1) & MAX_ID_MASK; + ent->pv_cm_id = (u32)id; + TRACE("allocated pv_cm_id: 0x%x sl_cm_id: 0x%x\n", + id, sl_cm_id); + sl_id_map_add(ibdev, ent); + } else { + TRACE("Error allocating idr %d\n", ret); + } + + spin_unlock(&sriov->id_map_lock); + } while ( (ret == -EAGAIN) && idr_pre_get(&sriov->pv_id_table, GFP_KERNEL) ); + /*the function idr_get_new_above can return -ENOSPC, so don't insert in that case.*/ + if (!ret) { + spin_lock(&sriov->id_map_lock); + list_add_tail(&ent->list, &sriov->cm_list); + spin_unlock(&sriov->id_map_lock); + return ent; + } + /*error flow*/ + kfree(ent); + mlx4_ib_warn(ibdev, "No more space in the idr (err:0x%x)\n", ret); + return ERR_PTR(-ENOMEM); +} + +static struct id_map_entry * +id_map_get(struct ib_device *ibdev, int *pv_cm_id, int sl_cm_id, int slave_id) +{ + struct id_map_entry *ent; + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + + spin_lock(&sriov->id_map_lock); + if (*pv_cm_id == -1) { + ent = id_map_find_by_sl_id(ibdev, sl_cm_id, slave_id); + if (ent) + *pv_cm_id = (int) ent->pv_cm_id; + } else { + ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, *pv_cm_id); + if (ent) { + TRACE("id[pv_cm_id: 0x%x] = {slave_id: %d, sl_cm_id: 0x%x}\n", + *pv_cm_id, ent->slave_id, ent->sl_cm_id); + } else { + TRACE("No entry for pv_cm_id 0x%x\n", *pv_cm_id); + } + } + spin_unlock(&sriov->id_map_lock); + + return ent; +} + +static void schedule_delayed(struct ib_device *ibdev, struct id_map_entry *id) +{ + struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov; + unsigned long flags; + + spin_lock_irqsave(&sriov->going_down_lock, flags); + spin_lock(&sriov->id_map_lock); + /*make sure that there is no schedule inside the scheduled work.*/ + if (!sriov->is_going_down) { + id->scheduled_delete = 1; + schedule_delayed_work(&id->timeout, CM_CLEANUP_CACHE_TIMEOUT); + } + spin_unlock(&sriov->id_map_lock); + spin_unlock_irqrestore(&sriov->going_down_lock, flags); +} + 
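+/*
+ * Paravirtualized CM flow, illustrated (example values only, not taken from
+ * the code):
+ *
+ *   slave 2 sends a REQ carrying its local sl_cm_id, say 0x17
+ *     -> id_map_alloc() records {slave_id = 2, sl_cm_id = 0x17} and returns
+ *        a host-unique pv_cm_id, say 0x5
+ *     -> set_local_comm_id(mad, 0x5) before the MAD leaves the host
+ *
+ *   the remote peer answers with remote_comm_id = 0x5
+ *     -> id_map_get() finds pv_cm_id 0x5 and recovers slave 2 / sl_cm_id 0x17
+ *     -> set_remote_comm_id(mad, 0x17) and the reply is demuxed to slave 2
+ *
+ * DREQ arms the CM_CLEANUP_CACHE_TIMEOUT delayed work (schedule_delayed()
+ * above); DREP removes the mapping immediately via id_map_find_del().
+ */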
+int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id, + struct ib_mad *mad) +{ + struct id_map_entry *id; + u32 sl_cm_id; + int pv_cm_id = -1; + + TRACE("CM packet to send. type: %s\n", + attr2str(mad->mad_hdr.attr_id)); + + sl_cm_id = get_local_comm_id(mad); + + if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID || + mad->mad_hdr.attr_id == CM_REP_ATTR_ID) { + id = id_map_alloc(ibdev, slave_id, sl_cm_id); + if (IS_ERR(id)) { + mlx4_ib_warn(ibdev, "%s: id{slave: %d, sl_cm_id: 0x%x} Failed to id_map_alloc\n", + __func__, slave_id, sl_cm_id); + return PTR_ERR(id); + } + } else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID) { + return 0; + } else { + id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id); + } + + if (!id) { + mlx4_ib_dbg("id{slave: %d, sl_cm_id: 0x%x} is NULL!\n", + slave_id, sl_cm_id); + return -EINVAL; + } + + set_local_comm_id(mad, id->pv_cm_id); + + if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID) { + //TRACE("Starting cm cleanup timeout\n"); + schedule_delayed(ibdev, id); + } else if (mad->mad_hdr.attr_id == CM_DREP_ATTR_ID) + id_map_find_del(ibdev, pv_cm_id); + + return 0; +} + +int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, + struct ib_mad *mad) +{ + u32 pv_cm_id; + int gid_idx; + struct id_map_entry *id; + + TRACE("A CM packet arrived. type: %s\n", + attr2str(mad->mad_hdr.attr_id)); + + if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID) { + union ib_gid gid; + + gid = gid_from_req_msg(ibdev, mad); + gid_idx = mlx4_ib_find_real_gid(ibdev, port, gid.global.interface_id); + //TRACE("Slave by gid is: %d\n", *slave); + if (gid_idx < 0) { + mlx4_ib_warn(ibdev, "failed matching gid index by gid (0x%llx)\n", + gid.global.interface_id); + return -ENOENT; + } + *slave = mlx4_gid_idx_to_slave(to_mdev(ibdev)->dev, gid_idx); + return 0; + } + + pv_cm_id = get_remote_comm_id(mad); + //TRACE("pv_cm_id = 0x%x\n", pv_cm_id); + id = id_map_get(ibdev, (int *)&pv_cm_id, -1, -1); + + if (!id) { + mlx4_ib_dbg("Couldn't find an entry for pv_cm_id 0x%x\n", pv_cm_id); + return -ENOENT; + } + + *slave = id->slave_id; + set_remote_comm_id(mad, id->sl_cm_id); + TRACE("id[0x%x] = {slave: %d, sl_cm_id: 0x%x}\n", + pv_cm_id, id->slave_id, id->sl_cm_id); + + if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID) { + //TRACE("Starting cm cleanup timeout\n"); + schedule_delayed(ibdev, id); + } else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID || + mad->mad_hdr.attr_id == CM_DREP_ATTR_ID) { + id_map_find_del(ibdev, (int) pv_cm_id); + } + + return 0; +} + +void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev) +{ + TRACE("%s\n", __func__); + spin_lock_init(&dev->sriov.id_map_lock); + INIT_LIST_HEAD(&dev->sriov.cm_list); + dev->sriov.sl_id_map = RB_ROOT; + idr_init(&dev->sriov.pv_id_table); + idr_pre_get(&dev->sriov.pv_id_table, GFP_KERNEL); +} + +/* slave = -1 ==> all slaves */ +/* TBD -- call paravirt clean for single slave. 
Need for slave RESET event */ +void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave) +{ + struct mlx4_ib_sriov *sriov = &dev->sriov; + struct rb_root *sl_id_map = &sriov->sl_id_map; + struct list_head lh; + struct rb_node *nd; + int need_flush = 1; + struct id_map_entry *map, *tmp_map; + TRACE("%s\n", __func__); + /* cancel all delayed work queue entries */ + INIT_LIST_HEAD(&lh); + spin_lock(&sriov->id_map_lock); + list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) { + if (slave < 0 || slave == map->slave_id) { + if (map->scheduled_delete) + need_flush &= !!cancel_delayed_work(&map->timeout); + } + } + + spin_unlock(&sriov->id_map_lock); + + if (!need_flush) + flush_scheduled_work(); /* make sure all timers were flushed */ + + /* now, remove all leftover entries from databases*/ + spin_lock(&sriov->id_map_lock); + if (slave < 0) { + while (rb_first(sl_id_map)) { + struct id_map_entry *ent = + rb_entry(rb_first(sl_id_map), + struct id_map_entry, node); + + rb_erase(&ent->node, sl_id_map); + idr_remove(&sriov->pv_id_table, (int) ent->pv_cm_id); + } + list_splice_init(&dev->sriov.cm_list, &lh); + } else { + /* first, move nodes belonging to slave to db remove list */ + nd = rb_first(sl_id_map); + while (nd) { + struct id_map_entry *ent = + rb_entry(nd, struct id_map_entry, node); + nd = rb_next(nd); + if (ent->slave_id == slave) + list_move_tail(&ent->list, &lh); + } + /* remove those nodes from databases */ + list_for_each_entry_safe(map, tmp_map, &lh, list) { + rb_erase(&map->node, sl_id_map); + idr_remove(&sriov->pv_id_table, (int) map->pv_cm_id); + } + + /* add remaining nodes from cm_list */ + list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) { + if (slave == map->slave_id) + list_move_tail(&map->list, &lh); + } + } + + spin_unlock(&sriov->id_map_lock); + + /* free any map entries left behind due to cancel_delayed_work above */ + list_for_each_entry_safe(map, tmp_map, &lh, list) { + list_del(&map->list); + kfree(map); + } +} + diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index e8df155bc3b07..f3bb0ed0f834d 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -33,11 +33,14 @@ #include #include -#include +#include #include "mlx4_ib.h" #include "user.h" +/* Which firmware version adds support for Resize CQ */ +#define MLX4_FW_VER_RESIZE_CQ mlx4_fw_ver(2, 5, 0) + static void mlx4_ib_cq_comp(struct mlx4_cq *cq) { struct ib_cq *ibcq = &to_mibcq(cq)->ibcq; @@ -106,7 +109,7 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf * goto out; err = mlx4_mtt_init(dev->dev, buf->buf.npages, buf->buf.page_shift, - &buf->mtt); + &buf->mtt, MLX4_MR_FLAG_NONE); if (err) goto err_buf; @@ -117,7 +120,7 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf * return 0; err_mtt: - mlx4_mtt_cleanup(dev->dev, &buf->mtt); + mlx4_mtt_cleanup(dev->dev, &buf->mtt, MLX4_MR_FLAG_NONE); err_buf: mlx4_buf_free(dev->dev, nent * sizeof(struct mlx4_cqe), @@ -144,7 +147,8 @@ static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *cont return PTR_ERR(*umem); err = mlx4_mtt_init(dev->dev, ib_umem_page_count(*umem), - ilog2((*umem)->page_size), &buf->mtt); + ilog2((*umem)->page_size), &buf->mtt, + MLX4_MR_FLAG_NONE); if (err) goto err_buf; @@ -155,7 +159,7 @@ static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *cont return 0; err_mtt: - mlx4_mtt_cleanup(dev->dev, &buf->mtt); + mlx4_mtt_cleanup(dev->dev, &buf->mtt, 
MLX4_MR_FLAG_NONE); err_buf: ib_umem_release(*umem); @@ -172,10 +176,12 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector struct mlx4_uar *uar; int err; - if (entries < 1 || entries > dev->dev->caps.max_cqes) + if (entries < 1 || entries > dev->dev->caps.max_cqes) { + mlx4_ib_dbg("invalid num of entries: %d", entries); return ERR_PTR(-EINVAL); + } - cq = kmalloc(sizeof *cq, GFP_KERNEL); + cq = kzalloc(sizeof *cq, GFP_KERNEL); if (!cq) return ERR_PTR(-ENOMEM); @@ -223,7 +229,9 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector } err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar, - cq->db.dma, &cq->mcq, vector, 0); + cq->db.dma, &cq->mcq, + vector == IB_CQ_VECTOR_LEAST_ATTACHED ? + MLX4_LEAST_ATTACHED_VECTOR : vector, 0); if (err) goto err_dbmap; @@ -243,7 +251,7 @@ err_dbmap: mlx4_ib_db_unmap_user(to_mucontext(context), &cq->db); err_mtt: - mlx4_mtt_cleanup(dev->dev, &cq->buf.mtt); + mlx4_mtt_cleanup(dev->dev, &cq->buf.mtt, MLX4_MR_FLAG_NONE); if (context) ib_umem_release(cq->umem); @@ -350,6 +358,9 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) int outst_cqe; int err; + if (dev->dev->caps.fw_ver < MLX4_FW_VER_RESIZE_CQ) + return -ENOSYS; + mutex_lock(&cq->resize_mutex); if (entries < 1 || entries > dev->dev->caps.max_cqes) { @@ -386,7 +397,7 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) if (err) goto err_buf; - mlx4_mtt_cleanup(dev->dev, &mtt); + mlx4_mtt_cleanup(dev->dev, &mtt, MLX4_MR_FLAG_NONE); if (ibcq->uobject) { cq->buf = cq->resize_buf->buf; cq->ibcq.cqe = cq->resize_buf->cqe; @@ -399,7 +410,7 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) } else { struct mlx4_ib_cq_buf tmp_buf; int tmp_cqe = 0; - + spin_lock_irq(&cq->lock); if (cq->resize_buf) { mlx4_ib_cq_resize_copy_cqes(cq); @@ -420,7 +431,7 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) goto out; err_buf: - mlx4_mtt_cleanup(dev->dev, &cq->resize_buf->buf.mtt); + mlx4_mtt_cleanup(dev->dev, &cq->resize_buf->buf.mtt, MLX4_MR_FLAG_NONE); if (!ibcq->uobject) mlx4_ib_free_cq_buf(dev, &cq->resize_buf->buf, cq->resize_buf->cqe); @@ -444,7 +455,7 @@ int mlx4_ib_destroy_cq(struct ib_cq *cq) struct mlx4_ib_cq *mcq = to_mcq(cq); mlx4_cq_free(dev->dev, &mcq->mcq); - mlx4_mtt_cleanup(dev->dev, &mcq->buf.mtt); + mlx4_mtt_cleanup(dev->dev, &mcq->buf.mtt, MLX4_MR_FLAG_NONE); if (cq->uobject) { mlx4_ib_db_unmap_user(to_mucontext(cq->uobject->context), &mcq->db); @@ -544,6 +555,37 @@ static int mlx4_ib_ipoib_csum_ok(__be16 status, __be16 checksum) checksum == cpu_to_be16(0xffff); } +static int ph_to_virt_pkey(struct mlx4_ib_dev *dev, int port, u16 ph_idx, u16 *virt_idx) +{ + if (port < 1 || port > dev->num_ports) { + mlx4_ib_warn(&dev->ib_dev, "port = %d is out of range\n", port); + return -EINVAL; + } + + *virt_idx = dev->pkeys.phys2virt_pkey[port - 1][ph_idx]; + return 0; +} + +static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct ib_wc *wc, + unsigned tail, struct mlx4_cqe *cqe) +{ + struct mlx4_ib_proxy_sqp_hdr *hdr; + + ib_dma_sync_single_for_cpu(qp->ibqp.device, + qp->sqp_proxy_rcv[tail].map, + sizeof(struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + hdr = (struct mlx4_ib_proxy_sqp_hdr *) (qp->sqp_proxy_rcv[tail].addr); + wc->pkey_index = hdr->tun.pkey_index; + wc->slid = hdr->tun.slid; + wc->sl = hdr->tun.sl; + wc->src_qp = hdr->tun.src_qp; + wc->wc_flags |= hdr->tun.wc_flags & IB_WC_GRH; + 
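+	/*
+	 * The tunnel header written by the proxy QP carries the pkey index,
+	 * slid, sl and src_qp of the original sender; path bits are not
+	 * forwarded, so they are cleared here and csum_ok is taken from the
+	 * real CQE.
+	 */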
wc->dlid_path_bits = 0; + wc->csum_ok = mlx4_ib_ipoib_csum_ok(cqe->status, cqe->checksum); + return 0; +} + static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, struct mlx4_ib_qp **cur_qp, struct ib_wc *wc) @@ -551,11 +593,17 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, struct mlx4_cqe *cqe; struct mlx4_qp *mqp; struct mlx4_ib_wq *wq; - struct mlx4_ib_srq *srq; + struct mlx4_ib_srq *uninitialized_var(srq); + struct mlx4_srq *msrq; int is_send; int is_error; u32 g_mlpath_rqpn; + int is_xrc_recv = 0; u16 wqe_ctr; + u16 ph_pkey_index, virt_pkey_index; + int port; + int err; + unsigned tail = 0; repoll: cqe = next_cqe_sw(cq); @@ -596,7 +644,24 @@ repoll: goto repoll; } - if (!*cur_qp || + if ((be32_to_cpu(cqe->vlan_my_qpn) & (1 << 23)) && !is_send) { + /* + * We do not have to take the XRC SRQ table lock here, + * because CQs will be locked while XRC SRQs are removed + * from the table. + */ + msrq = __mlx4_srq_lookup(to_mdev(cq->ibcq.device)->dev, + be32_to_cpu(cqe->g_mlpath_rqpn) & + 0xffffff); + if (unlikely(!msrq)) { + printk(KERN_WARNING "CQ %06x with entry for unknown " + "XRC SRQ %06x\n", cq->mcq.cqn, + be32_to_cpu(cqe->g_mlpath_rqpn) & 0xffffff); + return -EINVAL; + } + is_xrc_recv = 1; + srq = to_mibsrq(msrq); + } else if (!*cur_qp || (be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) != (*cur_qp)->mqp.qpn) { /* * We do not have to take the QP table lock here, @@ -614,7 +679,7 @@ repoll: *cur_qp = to_mibqp(mqp); } - wc->qp = &(*cur_qp)->ibqp; + wc->qp = is_xrc_recv ? NULL: &(*cur_qp)->ibqp; if (is_send) { wq = &(*cur_qp)->sq; @@ -624,6 +689,10 @@ repoll: } wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; ++wq->tail; + } else if (is_xrc_recv) { + wqe_ctr = be16_to_cpu(cqe->wqe_index); + wc->wr_id = srq->wrid[wqe_ctr]; + mlx4_ib_free_srq_wqe(srq, wqe_ctr); } else if ((*cur_qp)->ibqp.srq) { srq = to_msrq((*cur_qp)->ibqp.srq); wqe_ctr = be16_to_cpu(cqe->wqe_index); @@ -631,12 +700,14 @@ repoll: mlx4_ib_free_srq_wqe(srq, wqe_ctr); } else { wq = &(*cur_qp)->rq; - wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + tail = wq->tail & (wq->wqe_cnt - 1); + wc->wr_id = wq->wrid[tail]; ++wq->tail; } if (unlikely(is_error)) { mlx4_ib_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc); + wc->opcode = is_send; return 0; } @@ -714,13 +785,39 @@ repoll: break; } + if (mlx4_is_mfunc(to_mdev(cq->ibcq.device)->dev)) { + if (is_xrc_recv) { + mqp = __mlx4_qp_lookup(to_mdev(cq->ibcq.device)->dev, + be32_to_cpu(cqe->vlan_my_qpn)); + if (unlikely(!mqp)) { + printk(KERN_WARNING "CQ %06x, XRC SRQ 0x%x with unknown QPN %06x\n", + cq->mcq.cqn, srq->ibsrq.xrc_srq_num, + be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK); + return -EINVAL; + } + port = to_mibqp(mqp)->port; + } else if ((*cur_qp)->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI || + (*cur_qp)->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) { + return use_tunnel_data(*cur_qp, cq, wc, tail, cqe); + } else + port = (*cur_qp)->port; + + ph_pkey_index = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f; + err = ph_to_virt_pkey(to_mdev(cq->ibcq.device), port, + ph_pkey_index, &virt_pkey_index); + if (err) + return err; + wc->pkey_index = virt_pkey_index; + } else { + wc->pkey_index = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f; + } + wc->slid = be16_to_cpu(cqe->rlid); wc->sl = be16_to_cpu(cqe->sl_vid) >> 12; g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn); wc->src_qp = g_mlpath_rqpn & 0xffffff; wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f; wc->wc_flags |= g_mlpath_rqpn & 0x80000000 ? 
IB_WC_GRH : 0; - wc->pkey_index = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f; wc->csum_ok = mlx4_ib_ipoib_csum_ok(cqe->status, cqe->checksum); } @@ -759,7 +856,7 @@ int mlx4_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) mlx4_cq_arm(&to_mcq(ibcq)->mcq, (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT, - to_mdev(ibcq->device)->uar_map, + to_mdev(ibcq->device)->priv_uar.map, MLX4_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->uar_lock)); return 0; @@ -771,6 +868,10 @@ void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq) int nfreed = 0; struct mlx4_cqe *cqe, *dest; u8 owner_bit; + int is_xrc_srq = 0; + + if (srq && srq->ibsrq.xrc_cq) + is_xrc_srq = 1; /* * First we need to find the current producer index, so we @@ -789,7 +890,10 @@ void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq) */ while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) { cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); - if ((be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) { + if (((be32_to_cpu(cqe->vlan_my_qpn) & 0xffffff) == qpn) || + (is_xrc_srq && + (be32_to_cpu(cqe->g_mlpath_rqpn) & 0xffffff) == + srq->msrq.srqn)) { if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index)); ++nfreed; diff --git a/drivers/infiniband/hw/mlx4/ib_events.c b/drivers/infiniband/hw/mlx4/ib_events.c new file mode 100644 index 0000000000000..4247353fbf210 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/ib_events.c @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + /***************************************************************/ +/* This file supports the handling of mlx4_ib events. 
*/ +/****************************************************************/ + +#include +#include "mlx4_ib.h" +#include "ib_events.h" +#include "alias_GUID.h" + +#define GET_BLK_PTR_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.block_ptr) +#define GET_MASK_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.tbl_entries_mask) +#define NUM_IDX_IN_PKEY_TBL_BLK 32 +#define GUID_TBL_ENTRY_SIZE 8 /* size in bytes */ +#define GUID_TBL_BLK_NUM_ENTRIES 8 +#define GUID_TBL_BLK_SIZE (GUID_TBL_ENTRY_SIZE * GUID_TBL_BLK_NUM_ENTRIES) + +#define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK) + +enum { + MLX4_DEV_PMC_SUBTYPE_GUID_INFO = 0x14, + MLX4_DEV_PMC_SUBTYPE_PORT_INFO = 0x15, + MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE = 0x16, +}; + +enum { + MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK = 1 << 0, + MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK = 1 << 1, + MLX4_EQ_PORT_INFO_LID_CHANGE_MASK = 1 << 2, + MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK = 1 << 3, + MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK = 1 << 4, +}; + +void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num) +{ + struct ib_event event; + + event.device = &dev->ib_dev; + event.element.port_num = port_num; + event.event = IB_EVENT_LID_CHANGE; + + ib_dispatch_event(&event); + + if (mlx4_is_mfunc(dev->dev) && dev->dev->caps.sqp_demux && (!dev->sriov.is_going_down)) + mlx4_gen_all_sw_eqe(dev->dev, port_num, + LID_CHANGE_AVIAL); +} + +void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num) +{ + struct ib_event event; + + event.device = &dev->ib_dev; + event.element.port_num = port_num; + event.event = IB_EVENT_CLIENT_REREGISTER; + + /*also re-configure the alias-guid and mcg's */ + if (dev->dev->caps.sqp_demux) { + invalidate_all_guid_record(dev, port_num); + + if (!dev->sriov.is_going_down) { + mlx4_ib_mcg_port_cleanup(&dev->sriov.demux[port_num - 1], 0); + mlx4_gen_all_sw_eqe(dev->dev, port_num, + CLIENT_REREGISTER_AVIAL); + } + } + ib_dispatch_event(&event); +} + +static void propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, + struct mlx4_ib_eqe *eqe) +{ + int pkey_idx_base; + int i, ix, slave; + int have_event = 0; + int err; + u32 change_bitmap; + + change_bitmap = GET_MASK_FROM_EQE(eqe); + pkey_idx_base = (GET_BLK_PTR_FROM_EQE(eqe) * NUM_IDX_IN_PKEY_TBL_BLK); + + for (slave = 0; slave < dev->dev->caps.sqp_demux; slave++) { + if (slave == dev->dev->caps.function) + continue; + + if (!mlx4_is_slave_active(dev->dev, slave)) + continue; + + have_event = 0; + + /* go through the bitmap to see which indexes in the pkeys block + were modified */ + for (i = 0; i < NUM_IDX_IN_PKEY_TBL_BLK; i++) { + if (!(change_bitmap & (1 << i))) + continue; + + for (ix = 0; ix < dev->dev->caps.pkey_table_len[port_num]; ix++) { + if (dev->pkeys.virt2phys_pkey[slave][port_num - 1][ix] == + (pkey_idx_base + i)) { + mlx4_ib_dbg("%s: slave %d, port %d, ix %d", + __func__, slave, port_num, ix); + + err = mlx4_gen_pkey_eqe(dev->dev, slave, port_num); + mlx4_ib_dbg("propagate_pkey_ev: slave %d," + " port %d, ix %d (%d)", + slave, port_num, ix, err); + have_event = 1; + break; + } + } + + if (have_event) + break; + } + } +} + +static void handle_pkey_change_event(struct mlx4_ib_eqe *eqe, + struct mlx4_ib_dev *dev) +{ + struct ib_event event; + u8 port_num = eqe->event.port_mgmt_change.port; + + mlx4_ib_dbg("PKEY Change event: port=%d\n", port_num); + + event.device = &dev->ib_dev; + event.event = IB_EVENT_PKEY_CHANGE; + event.element.port_num = port_num; + + 
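+	/* deliver IB_EVENT_PKEY_CHANGE to the ULPs on this port first; the
+	 * master then propagates it to the active slaves below (unless the
+	 * device is not multi-function or SR-IOV is going down) */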
ib_dispatch_event(&event); + + if (!mlx4_is_mfunc(dev->dev) || !dev->dev->caps.sqp_demux || dev->sriov.is_going_down) + return; + + propagate_pkey_ev(dev, port_num, eqe); +} + +static inline void handle_master_sm_change_event(struct mlx4_ib_dev *dev, + struct mlx4_ib_eqe *eqe) +{ + u16 lid = be16_to_cpu(eqe->event.port_mgmt_change.params.port_info.mstr_sm_lid); + u8 sl = eqe->event.port_mgmt_change.params.port_info.mstr_sm_sl & 0xf; + u8 port_num = eqe->event.port_mgmt_change.port; + + update_sm_ah(dev, port_num, lid, sl); +} + +static void handle_slaves_guid_change(struct mlx4_ib_dev *dev, u8 port_num, + u32 guid_tbl_blk_num, u32 change_bitmap) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + u16 i; + + if (!mlx4_is_mfunc(dev->dev) || !mlx4_is_master(dev->dev)) + return; + + in_mad = kmalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) { + mlx4_ib_warn(&dev->ib_dev, "failed to allocate memory for guid info mads\n"); + goto out; + } + + guid_tbl_blk_num *= 4; + + for (i = 0; i < 4; i++) { + if (change_bitmap && (!((change_bitmap >> (8 * i)) & 0xff))) + continue; + memset(in_mad, 0, sizeof *in_mad); + memset(out_mad, 0, sizeof *out_mad); + + in_mad->base_version = 1; + in_mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + in_mad->class_version = 1; + in_mad->method = IB_MGMT_METHOD_GET; + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(guid_tbl_blk_num + i); + + if (mlx4_MAD_IFC(dev, 1, 1, port_num, NULL, NULL, + in_mad, out_mad)) { + mlx4_ib_warn(&dev->ib_dev, "Failed in get GUID INFO MAD_IFC\n"); + goto out; + } + + update_cache_on_guid_change(dev, guid_tbl_blk_num + i, port_num, + (u8*)(&((struct ib_smp *)out_mad)->data)); + notify_slaves_on_guid_change(dev, guid_tbl_blk_num + i, port_num, + (u8*)(&((struct ib_smp *)out_mad)->data)); + } + +out: + if (in_mad) + kfree(in_mad); + + if (out_mad) + kfree(out_mad); + + return; +} + +static void handle_guid_change_event(struct mlx4_ib_dev *dev, + struct mlx4_ib_eqe *eqe) +{ + struct ib_event event; + u32 tbl_block; + u32 change_bitmap; + u8 port = eqe->event.port_mgmt_change.port; + + /* The mfunc master's GUID is always the default GUID + and will never change, so there's no need to dispatch the event */ + if (!mlx4_is_mfunc(dev->dev) || + (mlx4_is_mfunc(dev->dev) && !mlx4_is_master(dev->dev))) { + event.device = &dev->ib_dev; + event.event = IB_EVENT_GID_CHANGE; + event.element.port_num = port; + ib_dispatch_event(&event); + + return; + } + + /*if master, notify relevant slaves*/ + if (dev->dev->caps.sqp_demux && (!dev->sriov.is_going_down)) { + tbl_block = GET_BLK_PTR_FROM_EQE(eqe); + change_bitmap = GET_MASK_FROM_EQE(eqe); + + handle_slaves_guid_change(dev, port, tbl_block, change_bitmap); + } +} + +void handle_port_mgmt_change_event(struct work_struct *work) +{ + struct ib_event event; + struct ib_event_work *ew = container_of(work, struct ib_event_work, work); + struct mlx4_ib_dev *dev = ew->ib_dev; + struct mlx4_ib_eqe *eqe = &(ew->ib_eqe); + u8 port = eqe->event.port_mgmt_change.port; + u32 changed_attr; + + switch(eqe->subtype) { + case MLX4_DEV_PMC_SUBTYPE_PORT_INFO: + changed_attr = be32_to_cpu(eqe->event.port_mgmt_change.params.port_info.changed_attr); + + /* Update the SM ah - This should be done before handling + the other changed attributes*/ + if (changed_attr & MSTR_SM_CHANGE_MASK) { + mlx4_ib_dbg("Master SM changed on port %d", port); + + handle_master_sm_change_event(dev, eqe); + } + + /* Check if it is a lid change 
event */ + if (changed_attr & MLX4_EQ_PORT_INFO_LID_CHANGE_MASK) { + mlx4_ib_dbg("LID change event on port %d", port); + + handle_lid_change_event(dev, port); + } + + /* Generate GUID changed event */ + if (changed_attr & MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK) { + mlx4_ib_dbg("GID prefix changed on port %d", port); + + event.device = &dev->ib_dev; + event.event = IB_EVENT_GID_CHANGE; + event.element.port_num = port; + ib_dispatch_event(&event); + + if (mlx4_is_mfunc(dev->dev) && mlx4_is_master(dev->dev)) + /*if master, notify all slaves*/ + mlx4_gen_all_sw_eqe(dev->dev, port, + GUID_CHANGE_AVIAL); + } + + if (changed_attr & MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK) { + mlx4_ib_dbg("CLIENT REREGISTER event on port %d", port); + handle_client_rereg_event(dev, port); + } + break; + + case MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE: + mlx4_ib_dbg("PKEY Change event on port=%d", port); + + handle_pkey_change_event(eqe, dev); + break; + case MLX4_DEV_PMC_SUBTYPE_GUID_INFO: + mlx4_ib_dbg("GUID change event on port %d", port); + + handle_guid_change_event(dev, eqe); + break; + default: + printk(KERN_WARNING "Unsupported subtype 0x%x for " + "Port Management Change event\n", eqe->subtype); + } + + kfree(ew); +} diff --git a/drivers/infiniband/hw/mlx4/ib_events.h b/drivers/infiniband/hw/mlx4/ib_events.h new file mode 100644 index 0000000000000..c5ca9d6dec88e --- /dev/null +++ b/drivers/infiniband/hw/mlx4/ib_events.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + /***********************************************************/ +/*This file support the handling of mlx4_ib events. 
*/ +/***********************************************************/ +#ifndef MLX4_IB_EVENTS_H +#define MLX4_IB_EVENTS_H + +#include "mlx4_ib.h" + +#define MAX_SET_PORT_INFO_GEN_EVENTS 4 + + + +struct mlx4_ib_eqe { + u8 reserved1; + u8 type; + u8 reserved2; + u8 subtype; + union { + u32 raw[6]; + struct { + u16 reserved1; + __be16 token; + u32 reserved2; + u8 reserved3[3]; + u8 status; + __be64 out_param; + } __attribute__((packed)) cmd; + struct { + u32 reserved1[2]; + __be32 port; + } __attribute__((packed)) port_change; + struct { + #define COMM_CHANNEL_BIT_ARRAY_SIZE 4 + u32 reserved; + u32 bit_vec[COMM_CHANNEL_BIT_ARRAY_SIZE]; + } __attribute__((packed)) comm_channel_arm; + struct { + u8 reserved[3]; + u8 vep_num; + } __attribute__((packed)) vep_config; + struct { + u8 port; + u8 reserved[3]; + __be64 mac; + } __attribute__((packed)) mac_update; + struct { + u8 port; + } __attribute__((packed)) sw_event; + struct { + __be32 slave_id; + } __attribute__((packed)) flr_event; + struct { + u8 reserved[3]; + u8 port; + union { + struct { + __be16 mstr_sm_lid; + __be16 port_lid; + __be32 changed_attr; + u8 reserved[3]; + u8 mstr_sm_sl; + } __attribute__((packed)) port_info; + struct { + __be32 block_ptr; + __be32 tbl_entries_mask; + } __attribute__((packed)) tbl_change_info; + } params; + } __attribute__((packed)) port_mgmt_change; + } event; + u8 reserved3[3]; + u8 owner; +}; + +struct ib_event_work { + struct work_struct work; + struct mlx4_ib_dev *ib_dev; + struct mlx4_ib_eqe ib_eqe; +}; + + +void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num); +void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num); +void handle_port_mgmt_change_event(struct work_struct *work); +#endif /* MLX4_IB_EVENTSMLX4_DEV_EVENT_PORT_MGMT_CHANGE_H */ diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 44fc3104e9185..76ec0d985f32e 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -32,17 +32,87 @@ #include #include +#include +#include +#include + #include -#include #include "mlx4_ib.h" +#include "alias_GUID.h" +#include "ib_events.h" enum { MLX4_IB_VENDOR_CLASS1 = 0x9, MLX4_IB_VENDOR_CLASS2 = 0xa }; +#define MLX4_TUN_SEND_WRID_SHIFT 34 +#define MLX4_TUN_QPN_SHIFT 32 +#define MLX4_TUN_WRID_RECV (((u64) 1) << MLX4_TUN_SEND_WRID_SHIFT) +#define MLX4_TUN_SET_WRID_QPN(a) (((u64) ((a) & 0x3)) << MLX4_TUN_QPN_SHIFT) + +#define MLX4_TUN_IS_RECV(a) (((a) >> MLX4_TUN_SEND_WRID_SHIFT) & 0x1) +#define MLX4_TUN_WRID_QPN(a) (((a) >> MLX4_TUN_QPN_SHIFT) & 0x3) + +/* QP and CQ parameters */ +#define MLX4_IB_MAD_QP_SEND_SIZE 256 +#define MLX4_IB_MAD_QP_RECV_SIZE 256 +#define MLX4_IB_MAD_QP_MIN_SIZE 64 +#define MLX4_IB_MAD_QP_MAX_SIZE 8192 +#define MLX4_IB_MAD_SEND_REQ_MAX_SG 2 +#define MLX4_IB_MAD_RECV_REQ_MAX_SG 1 + +#define MLX4_IB_MAD_SEND_Q_PSN 0 + +struct mlx4_mad_rcv_buf { + struct ib_grh grh; + u8 payload[256]; +} __attribute__ ((packed)); + +struct mlx4_mad_snd_buf { + u8 payload[256]; +} __attribute__ ((packed)); + +struct mlx4_tunnel_mad { + struct ib_grh grh; + struct mlx4_ib_tunnel_header hdr; + struct ib_mad mad; +} __attribute__ ((packed)); + +struct mlx4_rcv_tunnel_mad { + struct mlx4_rcv_tunnel_hdr hdr; + struct ib_grh grh; + struct ib_mad mad; +} __attribute__ ((packed)); + +/* This function should only be called by the master, to get the function + number */ +static inline int get_master_func_num(struct mlx4_ib_dev *dev) +{ +#ifdef CONFIG_MLX4_DEBUG + if (!dev->dev->caps.sqp_demux) + printk(KERN_ERR "function %s was 
called by non-master\n", + __func__); +#endif /* CONFIG_MLX4_DEBUG */ + + return dev->dev->caps.function; +} + +static enum rdma_link_layer +mlx4_ib_port_link_layer(struct mlx4_ib_dev *device, u8 port_num) +{ + return device->dev->caps.port_mask[port_num] == MLX4_PORT_TYPE_IB ? + IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; +} + +__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx) +{ + return cpu_to_be64(atomic_inc_return(&ctx->tid)) | + cpu_to_be64(0xff00000000000000LL); +} + int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey, int port, struct ib_wc *in_wc, struct ib_grh *in_grh, void *in_mad, void *response_mad) @@ -106,9 +176,22 @@ int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey, in_modifier |= in_wc->slid << 16; } + /* Currently, MADs in Dom0 (which is also currently the MAD master) + * are not modified in paravirtualization (wrapper) code. (The Dom0 pkey + * paravirt table is a 1-1 mapping which is not modifiable). Therefore, + * we can safely set the native flag to 1 (true) for MAD_IFC, so that MAD_IFC + * in Dom0 * will be executed directly, and not via the wrapper. + * Doing this saves significant time in MAD processing on Dom0. + * + * TBD -- need to review MAD_IFC paravirtualization, so that when doing MAD_IFC + * as part of processing received MADS to generate a response MAD -- no paravirt + * should be performed. However, when MAD_IFC is called inside API calls + * (e.g., ib_query_port), paravirt should be done (again, though, in Dom0 the current + * paravirt code is a NOP). + */ err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, in_modifier, op_modifier, - MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C); + MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C, 1); if (!err) memcpy(response_mad, outmailbox->buf, 256); @@ -119,73 +202,142 @@ int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey, return err; } -static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl) +/* + * Snoop SM MADs for port info and P_Key table sets, so we can + * synthesize LID change and P_Key change events. + */ +static void propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num, + int block, u32 change_bitmap) { - struct ib_ah *new_ah; - struct ib_ah_attr ah_attr; + int i, ix, slave, err; + int have_event = 0; - if (!dev->send_agent[port_num - 1][0]) - return; - - memset(&ah_attr, 0, sizeof ah_attr); - ah_attr.dlid = lid; - ah_attr.sl = sl; - ah_attr.port_num = port_num; - - new_ah = ib_create_ah(dev->send_agent[port_num - 1][0]->qp->pd, - &ah_attr); - if (IS_ERR(new_ah)) - return; + for (slave = 0; slave < dev->dev->caps.sqp_demux; slave++) { + if (slave == dev->dev->caps.function) + continue; + if (!mlx4_is_slave_active(dev->dev, slave)) + continue; - spin_lock(&dev->sm_lock); - if (dev->sm_ah[port_num - 1]) - ib_destroy_ah(dev->sm_ah[port_num - 1]); - dev->sm_ah[port_num - 1] = new_ah; - spin_unlock(&dev->sm_lock); + have_event = 0; + for (i = 0; i < 32; i++) { + if (!(change_bitmap & (1 << i))) + continue; + for (ix = 0; + ix < dev->dev->caps.pkey_table_len[port_num]; ix++) { + if (dev->pkeys.virt2phys_pkey[slave][port_num - 1] + [ix] == i + 32 * block) { + err = mlx4_gen_pkey_eqe(dev->dev, slave, port_num); + mlx4_ib_dbg("propagate_pkey_ev: slave %d," + " port %d, ix %d (%d)", + slave, port_num, ix, err); + have_event = 1; + break; + } + } + if (have_event) + break; + } + } } -/* - * Snoop SM MADs for port info and P_Key table sets, so we can - * synthesize LID change and P_Key change events. 
- */ static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad, u16 prev_lid) { struct ib_event event; + struct ib_port_info *pinfo; + u16 lid, *base; + int slave0_gid_changed = 0; /* dummy value. to avoid comp warnings */ + u32 bn, pkey_change_bitmap; + int i; + struct mlx4_ib_dev *dev = to_mdev(ibdev); if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED || mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) && - mad->mad_hdr.method == IB_MGMT_METHOD_SET) { - if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO) { - struct ib_port_info *pinfo = - (struct ib_port_info *) ((struct ib_smp *) mad)->data; - u16 lid = be16_to_cpu(pinfo->lid); + mad->mad_hdr.method == IB_MGMT_METHOD_SET) + switch(mad->mad_hdr.attr_id) { + case IB_SMP_ATTR_PORT_INFO: + pinfo = (struct ib_port_info *) ((struct ib_smp *) mad)->data; + lid = be16_to_cpu(pinfo->lid); + + update_sm_ah(to_mdev(ibdev), port_num, + be16_to_cpu(pinfo->sm_lid), + pinfo->neighbormtu_mastersmsl & 0xf); - update_sm_ah(to_mdev(ibdev), port_num, - be16_to_cpu(pinfo->sm_lid), - pinfo->neighbormtu_mastersmsl & 0xf); + if (pinfo->clientrereg_resv_subnetto & 0x80) + handle_client_rereg_event(dev, port_num); + + if (prev_lid != lid) + handle_lid_change_event(dev, port_num); + + break; + case IB_SMP_ATTR_PKEY_TABLE: event.device = ibdev; + event.event = IB_EVENT_PKEY_CHANGE; event.element.port_num = port_num; - if (pinfo->clientrereg_resv_subnetto & 0x80) { - event.event = IB_EVENT_CLIENT_REREGISTER; + if (!mlx4_is_mfunc(dev->dev)) { ib_dispatch_event(&event); + break; } - if (prev_lid != lid) { - event.event = IB_EVENT_LID_CHANGE; + bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod) & 0xFFFF; + base = (u16 *) &(((struct ib_smp *)mad)->data[0]); + pkey_change_bitmap = 0; + for (i = 0; i < 32; i++) { + mlx4_ib_dbg("PKEY[%d] = x%x", + i + bn*32, be16_to_cpu(base[i])); + if (be16_to_cpu(base[i]) != + dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32]) { + pkey_change_bitmap |= (1 << i); + dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32] = + be16_to_cpu(base[i]); + } + } + mlx4_ib_dbg("PKEY Change event: port=%d, " + "block=0x%x, change_bitmap=0x%x\n", + port_num, bn, pkey_change_bitmap); + + if (pkey_change_bitmap) { ib_dispatch_event(&event); + if (dev->dev->caps.sqp_demux && (!dev->sriov.is_going_down)) + propagate_pkey_ev(dev, port_num, bn, pkey_change_bitmap); } - } + break; - if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PKEY_TABLE) { - event.device = ibdev; - event.event = IB_EVENT_PKEY_CHANGE; - event.element.port_num = port_num; - ib_dispatch_event(&event); + case IB_SMP_ATTR_GUID_INFO: + /* For SRIOV slave IB GID change event will be dispatched + only if a value was writen to one of it's GIDs. + for slave > 0 this event will only be generated in a + relevant GID was changed. 
+ for slave 0, the IB event will be dispatched only if a relevant + GID was changed, according to slave0_gid_changed value */ + + /*if master, notify relevant slaves*/ + if (dev->dev->caps.sqp_demux && mlx4_is_master(dev->dev) && + (!dev->sriov.is_going_down)) { + bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod); + slave0_gid_changed = + notify_slaves_on_guid_change(dev, bn, port_num, + (u8 *)(&((struct ib_smp *)mad)->data)); + update_cache_on_guid_change(dev, bn, port_num, (u8*)(&((struct ib_smp *)mad)->data)); + } + + /* IB GID change event will be dispached for non-sriov drivers + and for sriov slaves, or for sriov master if one of it's GIDs + */ + if (!mlx4_is_mfunc(dev->dev) || + (mlx4_is_mfunc(dev->dev) && + (!mlx4_is_master(dev->dev) || slave0_gid_changed))) { + event.device = ibdev; + event.event = IB_EVENT_GID_CHANGE; + event.element.port_num = port_num; + ib_dispatch_event(&event); + } + break; + default: + break; } - } } static void node_desc_override(struct ib_device *dev, @@ -211,8 +363,6 @@ static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, struct ib_mad *ma if (agent) { send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, GFP_ATOMIC); - if (IS_ERR(send_buf)) - return; /* * We rely here on the fact that MLX QPs don't use the * address handle after the send is posted (this is @@ -232,18 +382,310 @@ static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, struct ib_mad *ma } } -int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, - struct ib_wc *in_wc, struct ib_grh *in_grh, - struct ib_mad *in_mad, struct ib_mad *out_mad) +static int mlx4_ib_demux_sa_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *sa_mad) +{ + int ret = 0; + + /* dispatch to different sa handlers */ + switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) { + case IB_SA_ATTR_MC_MEMBER_REC: + ret = mlx4_ib_mcg_demux_handler(ibdev, port, slave, sa_mad); + break; + default: + break; + } + return ret; +} + +int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int i; + + /* Look for the guid in all the possibly assigned gids, up to the GIDs table + length - there can be gids_per_func gids assigned per slave, + and the max number of slaves is sqp_demux. */ + for (i = 0; + i < min(dev->dev->caps.sqp_demux * dev->dev->gids_per_func, + MLX4_MAX_NUM_GIDS); i++) { + if (dev->sriov.demux[port - 1].guid_cache[i] == guid) + return i; + } + return -1; +} + +static inline int slave_from_gid_idx(struct mlx4_ib_dev *dev, + int gid_index) +{ + int index; + + index = slave_gid_index(dev->dev, gid_index); + + return ((index < 0) ? 
-EINVAL : mlx4_gid_idx_to_slave(dev->dev, index)); +} + +static int find_slave_port_pkey_ix(struct mlx4_ib_dev *dev, int slave, + u8 port, u16 pkey, u16 *ix) +{ + int i, ret; + u8 unassigned_pkey_ix, pkey_ix, partial_ix = 0xFF; + u16 slot_pkey; + + if (slave == get_master_func_num(dev)) + return ib_find_cached_pkey(&dev->ib_dev, port, pkey, ix); + + unassigned_pkey_ix = dev->dev->caps.pkey_table_max_len[port] - 1; + + for (i = 0; i < dev->dev->caps.pkey_table_len[port]; i++) { + if (dev->pkeys.virt2phys_pkey[slave][port - 1][i] == unassigned_pkey_ix) + continue; + + pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][i]; + + ret = ib_get_cached_pkey(&dev->ib_dev, port, pkey_ix, &slot_pkey); + if (ret) + continue; + if ((slot_pkey & 0x7FFF) == (pkey & 0x7FFF)) { + if (slot_pkey & 0x8000) { + *ix = (u16) pkey_ix; + return 0; + } else { + /* take first partial pkey index found */ + if (partial_ix == 0xFF) + partial_ix = pkey_ix; + } + } + } + + if (partial_ix < 0xFF) { + *ix = (u16) partial_ix; + return 0; + } + + return -EINVAL; +} + +int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type dest_qpt, struct ib_wc *wc, + struct ib_grh *grh, struct ib_mad *mad) +{ + struct ib_sge list; + struct ib_send_wr wr, *bad_wr; + struct mlx4_ib_demux_pv_ctx *tun_ctx; + struct mlx4_ib_demux_pv_qp *tun_qp; + struct mlx4_rcv_tunnel_mad *tun_mad; + struct ib_ah_attr attr; + struct ib_ah *ah; + struct ib_qp *src_qp = NULL; + unsigned tun_tx_ix = 0; + int dqpn; + int ret = 0; + u16 tun_pkey_ix; + u16 cached_pkey; + + if (dest_qpt > IB_QPT_GSI) + return -EINVAL; + + tun_ctx = dev->sriov.demux[port-1].tun[slave]; + + /* check if proxy qp created */ + if (!tun_ctx || tun_ctx->state != DEMUX_PV_STATE_ACTIVE) + return -EAGAIN; + + /* QP0 forwarding only for Dom0 */ + if (!dest_qpt && (get_master_func_num(dev) != slave)) + return -EINVAL; + + if (!dest_qpt) + tun_qp = &tun_ctx->qp[0]; + else + tun_qp = &tun_ctx->qp[1]; + + /* compute pkey index to put in tunnel header for slave */ + if (dest_qpt) { + u16 pkey_ix; + ret = ib_get_cached_pkey(&dev->ib_dev, port, wc->pkey_index, &cached_pkey); + if (ret) + return -EINVAL; + + ret = find_slave_port_pkey_ix(dev, slave, port, cached_pkey, &pkey_ix); + if (ret) + return -EINVAL; + tun_pkey_ix = pkey_ix; + } else + tun_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0]; + + dqpn = dev->dev->caps.tunnel_qpn + 8 * (slave + 1) + port + (dest_qpt * 2) - 1; + + /* get tunnel tx data buf for slave */ + src_qp = tun_qp->qp; + + /* create ah */ + memset(&attr, 0, sizeof attr); + attr.dlid = dev->sriov.local_lid[port - 1]; /* What about dlid path-bits? do we want to integrate them? */ + attr.port_num = port | 0x80; /* force loopback */ + /* attr.sl = 0; XXX is that OK? + attr.src_path_bits = 0; N.A. 
- we forward the original source lid + attr.static_rate = 0; */ + ah = ib_create_ah(tun_ctx->pd, &attr); + if (IS_ERR(ah)) + return -ENOMEM; + + /* allocate tunnel tx buf after pass failure returns */ + spin_lock(&tun_qp->tx_lock); + if (tun_qp->tx_ix_head - tun_qp->tx_ix_tail >= + (MLX4_NUM_TUNNEL_BUFS - 1)) + ret = -EAGAIN; + else + tun_tx_ix = (++tun_qp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1); + spin_unlock(&tun_qp->tx_lock); + if (ret) + goto out; + + tun_mad = (struct mlx4_rcv_tunnel_mad *) (tun_qp->tx_ring[tun_tx_ix].buf.addr); + if (tun_qp->tx_ring[tun_tx_ix].ah) + ib_destroy_ah(tun_qp->tx_ring[tun_tx_ix].ah); + tun_qp->tx_ring[tun_tx_ix].ah = ah; + ib_dma_sync_single_for_cpu(&dev->ib_dev, + tun_qp->tx_ring[tun_tx_ix].buf.map, + sizeof (struct mlx4_rcv_tunnel_mad), + DMA_TO_DEVICE); + + /* copy over to tunnel buffer */ + if (grh) + memcpy(&tun_mad->grh, grh, sizeof (*grh)); + memcpy(&tun_mad->mad, mad, sizeof (*mad)); + + /* adjust tunnel data */ + tun_mad->hdr.pkey_index = tun_pkey_ix; + tun_mad->hdr.sl = wc->sl; + tun_mad->hdr.slid = wc->slid; + tun_mad->hdr.src_qp = wc->src_qp; + tun_mad->hdr.wc_flags = (grh) ? wc->wc_flags : 0; + + ib_dma_sync_single_for_device(&dev->ib_dev, + tun_qp->tx_ring[tun_tx_ix].buf.map, + sizeof (struct mlx4_rcv_tunnel_mad), + DMA_TO_DEVICE); + + list.addr = tun_qp->tx_ring[tun_tx_ix].buf.map; + list.length = sizeof(struct mlx4_rcv_tunnel_mad); + list.lkey = tun_ctx->mr->lkey; + + wr.wr.ud.ah = ah; + wr.wr.ud.port_num = port; + wr.wr.ud.remote_qkey = IB_QP_SET_QKEY; + wr.wr.ud.remote_qpn = dqpn; + wr.next = NULL; + wr.wr_id = ((u64) tun_tx_ix) | MLX4_TUN_SET_WRID_QPN(dest_qpt); + wr.sg_list = &list; + wr.num_sge = 1; + wr.opcode = IB_WR_SEND; + wr.send_flags = IB_SEND_SIGNALED; + + ret = ib_post_send(src_qp, &wr, &bad_wr); +out: + if (ret) + ib_destroy_ah(ah); + return ret; +} + +static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port, + struct ib_wc *wc, struct ib_grh *grh, + struct ib_mad *mad) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int err; + int slave; + int gid_idx; + u8 *slave_id; + + /* Initially assume that this mad is for us */ + slave = get_master_func_num(dev); + + /* See if the slave id is encoded in a response mad */ + if (mad->mad_hdr.method & 0x80) { + slave_id = (u8*) &mad->mad_hdr.tid; + slave = *slave_id; + if (slave != 255) /*255 indicates the dom0*/ + *slave_id = 0; /* remap tid */ + } + + /* If a grh is present, we demux according to it */ + if (wc->wc_flags & IB_WC_GRH) { + gid_idx = mlx4_ib_find_real_gid(ibdev, port, grh->dgid.global.interface_id); + slave = mlx4_gid_idx_to_slave(dev->dev, gid_idx); + if ((gid_idx < 0) || (slave < 0)) { + mlx4_ib_warn(ibdev, "failed matching grh\n"); + return -ENOENT; + } + } + /* Class-specific handling */ + switch (mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_ADM: + if (mlx4_ib_demux_sa_handler(ibdev, port, slave, + (struct ib_sa_mad *) mad)) + return 0; + break; + case IB_MGMT_CLASS_CM: + if (mlx4_ib_demux_cm_handler(ibdev, port, &slave, mad)) + return 0; + break; + default: + /* Drop unsupported classes for slaves in tunnel mode */ + if (slave != get_master_func_num(dev)) { + mlx4_ib_dbg("dropping unsupported ingress mad from class:%d " + "for slave:%d", mad->mad_hdr.mgmt_class, slave); + return 0; + } + } + /*make sure that no slave==255 was not handled yet.*/ + if (slave > dev->dev->caps.sqp_demux){ + mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n", slave, dev->dev->caps.sqp_demux); + return -ENOENT; + } + + err = mlx4_ib_send_to_slave(dev, slave, 
port, wc->qp->qp_type, wc, grh, mad); + if (err) + mlx4_ib_dbg("failed sending to slave %d via tunnel qp (%d)", + slave, err); + return 0; +} + +static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) { u16 slid, prev_lid = 0; int err; struct ib_port_attr pattr; + struct mlx4_ib_dev *dev = to_mdev(ibdev); +#ifdef DEBUG + /* XXX debug - print source of qp1 messages */ + if (in_wc && in_wc->qp->qp_num) { + mlx4_ib_dbg("received MAD: slid:%d sqpn:%d " + "dlid_bits:%d dqpn:%d wc_flags:0x%x, cls %x, mtd %x, atr %x", + in_wc->slid, in_wc->src_qp, + in_wc->dlid_path_bits, + in_wc->qp->qp_num, + in_wc->wc_flags, + in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.method, be16_to_cpu(in_mad->mad_hdr.attr_id)); + if (in_wc->wc_flags & IB_WC_GRH) { + mlx4_ib_dbg("sgid_hi:0x%016llx sgid_lo:0x%016llx", + be64_to_cpu(in_grh->sgid.global.subnet_prefix), + be64_to_cpu(in_grh->sgid.global.interface_id)); + mlx4_ib_dbg("dgid_hi:0x%016llx dgid_lo:0x%016llx", + be64_to_cpu(in_grh->dgid.global.subnet_prefix), + be64_to_cpu(in_grh->dgid.global.interface_id)); + } + } +#endif /*DEBUG*/ slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) { - forward_trap(to_mdev(ibdev), port_num, in_mad); + forward_trap(dev, port_num, in_mad); return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; } @@ -255,7 +697,8 @@ int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, return IB_MAD_RESULT_SUCCESS; /* - * Don't process SMInfo queries -- the SMA can't handle them. + * Don't process SMInfo queries + * MADs -- the SMA can't handle them. */ if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO) return IB_MAD_RESULT_SUCCESS; @@ -276,7 +719,7 @@ int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, !ib_query_port(ibdev, port_num, &pattr)) prev_lid = pattr.lid; - err = mlx4_MAD_IFC(to_mdev(ibdev), + err = mlx4_MAD_IFC(dev, mad_flags & IB_MAD_IGNORE_MKEY, mad_flags & IB_MAD_IGNORE_BKEY, port_num, in_wc, in_grh, in_mad, out_mad); @@ -284,7 +727,9 @@ int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, return IB_MAD_RESULT_FAILURE; if (!out_mad->mad_hdr.status) { - smp_snoop(ibdev, port_num, in_mad, prev_lid); + if (!dev->dev->is_internal_sma) + smp_snoop(ibdev, port_num, in_mad, prev_lid); + node_desc_override(ibdev, out_mad); } @@ -299,12 +744,1227 @@ int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; } +static __be32 be64_to_be32(__be64 b64) +{ + return cpu_to_be32(be64_to_cpu(b64) & 0xffffffff); +} + + +static void edit_counter(struct mlx4_counters *cnt, void *counters, + __be16 attr_id) +{ + switch (attr_id) { + case IB_PMA_PORT_COUNTERS: + { + struct ib_pma_portcounters *pma_cnt = + (struct ib_pma_portcounters *) counters; + + pma_cnt->port_xmit_data = + cpu_to_be32((be64_to_cpu(cnt->tx_bytes) >> 2)); + pma_cnt->port_rcv_data = + cpu_to_be32((be64_to_cpu(cnt->rx_bytes) >> 2)); + pma_cnt->port_xmit_packets = + cpu_to_be32(be64_to_cpu(cnt->tx_frames)); + pma_cnt->port_rcv_packets = + cpu_to_be32(be64_to_cpu(cnt->rx_frames)); + break; + } + case IB_PMA_PORT_COUNTERS_EXT: + { + struct ib_pma_portcounters_ext *pma_cnt_ext = + (struct ib_pma_portcounters_ext *) counters; + + pma_cnt_ext->port_xmit_data = cpu_to_be64(be64_to_cpu(cnt->tx_bytes) >> 2); + pma_cnt_ext->port_rcv_data = 
cpu_to_be64(be64_to_cpu(cnt->rx_bytes) >> 2); + pma_cnt_ext->port_xmit_packets = cnt->tx_frames; + pma_cnt_ext->port_rcv_packets = cnt->rx_frames; + break; + } + default: + pr_warn("Unsupported attr_id 0x%x\n", attr_id); + break; + } +} + + +static void edit_ext_counter(struct mlx4_counters_ext *cnt, void *counters, + __be16 attr_id) +{ + switch (attr_id) { + case IB_PMA_PORT_COUNTERS: + { + struct ib_pma_portcounters *pma_cnt = + (struct ib_pma_portcounters *) counters; + + pma_cnt->port_xmit_data = + cpu_to_be32((be64_to_cpu(cnt->tx_uni_bytes) >> 2)); + pma_cnt->port_rcv_data = + cpu_to_be32((be64_to_cpu(cnt->rx_uni_bytes) >> 2)); + pma_cnt->port_xmit_packets = + cpu_to_be32(be64_to_cpu(cnt->tx_uni_frames)); + pma_cnt->port_rcv_packets = + cpu_to_be32(be64_to_cpu(cnt->rx_uni_frames)); + break; + } + case IB_PMA_PORT_COUNTERS_EXT: + { + struct ib_pma_portcounters_ext *pma_cnt_ext = + (struct ib_pma_portcounters_ext *) counters; + + pma_cnt_ext->port_xmit_data = cpu_to_be64(be64_to_cpu(cnt->tx_uni_bytes) >> 2); + pma_cnt_ext->port_rcv_data = cpu_to_be64(be64_to_cpu(cnt->rx_uni_bytes) >> 2); + pma_cnt_ext->port_unicast_xmit_packets = cnt->tx_uni_frames; + pma_cnt_ext->port_unicast_rcv_packets = cnt->rx_uni_frames; + break; + } + default: + pr_warn("Unsupported attr_id 0x%x\n", attr_id); + break; + } + +} + +static int rdmaoe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_ib_dev *dev = to_mdev(ibdev); + int err; + u32 inmod = dev->counters[port_num - 1] & 0xffff; + int mode; + + mailbox = mlx4_alloc_cmd_mailbox(dev->dev); + if (IS_ERR(mailbox)) + return IB_MAD_RESULT_FAILURE; + + err = mlx4_cmd_box(dev->dev, 0, mailbox->dma, inmod, 0, + MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C, 0); + if (err) + err = IB_MAD_RESULT_FAILURE; + else { + memset(&out_mad->data, 0, IB_MGMT_MAD_DATA); + memcpy(&out_mad->mad_hdr, &in_mad->mad_hdr, sizeof(out_mad->mad_hdr)); + out_mad->mad_hdr.method = 0x81; + out_mad->mad_hdr.status = 0; + mode = be32_to_cpu(((struct mlx4_counters *)mailbox->buf)->counter_mode) & 0xf; + switch (mode) { + case 0: + edit_counter(mailbox->buf, + (void *)(out_mad->data + 40), + in_mad->mad_hdr.attr_id); + + err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + break; + case 1: + edit_ext_counter(mailbox->buf, + (void *)(out_mad->data + 40), + in_mad->mad_hdr.attr_id); + err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; + break; + default: + err = IB_MAD_RESULT_FAILURE; + } + } + + mlx4_free_cmd_mailbox(dev->dev, mailbox); + + return err; +} + +int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, + struct ib_wc *in_wc, struct ib_grh *in_grh, + struct ib_mad *in_mad, struct ib_mad *out_mad) +{ + if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && !in_wc && + in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT && + in_mad->mad_hdr.method == IB_MGMT_METHOD_GET && ( + in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS || + in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT /* port counters */ )) + return rdmaoe_process_mad(ibdev, mad_flags, port_num, in_wc, + in_grh, in_mad, out_mad); + + switch (rdma_port_link_layer(ibdev, port_num)) { + case IB_LINK_LAYER_INFINIBAND: + return ib_process_mad(ibdev, mad_flags, port_num, in_wc, + in_grh, in_mad, out_mad); + case IB_LINK_LAYER_ETHERNET: + return rdmaoe_process_mad(ibdev, mad_flags, port_num, in_wc, + in_grh, in_mad, out_mad); + default: + return -EINVAL; + } +} 
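The edit_counter()/edit_ext_counter() helpers above translate the HCA's byte counters into the units the IB performance-management agent expects: PortXmitData and PortRcvData are counted in 4-octet words, hence the right shift by 2, and the non-extended PortCounters attribute additionally truncates the result to 32 bits. A minimal standalone sketch of that conversion follows; the struct and helper names here are invented for illustration only and are not part of this patch.

#include <stdint.h>

/* Hypothetical stand-in for the byte counters read back from the device. */
struct hw_byte_counters {
	uint64_t tx_bytes;
	uint64_t rx_bytes;
};

/* PortXmitData/PortRcvData are reported in 4-octet words, not octets. */
static inline uint64_t octets_to_pma_words(uint64_t octets)
{
	return octets >> 2;
}

/* The non-extended PortCounters attribute carries only 32-bit fields, so
 * the word count is further truncated, matching the cpu_to_be32() stores
 * performed in edit_counter(). */
static inline uint32_t octets_to_pma_words32(uint64_t octets)
{
	return (uint32_t)(octets >> 2);
}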
+ static void send_handler(struct ib_mad_agent *agent, struct ib_mad_send_wc *mad_send_wc) { + if (mad_send_wc->send_buf->context[0]) + ib_destroy_ah(mad_send_wc->send_buf->context[0]); ib_free_send_mad(mad_send_wc->send_buf); } +static void mlx4_ib_tunnel_comp_handler(struct ib_cq *cq, void *arg) +{ + unsigned long flags; + struct mlx4_ib_demux_pv_ctx *ctx = cq->cq_context; + struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + if (!dev->sriov.is_going_down && ctx->state == DEMUX_PV_STATE_ACTIVE) + queue_work(ctx->wq, &ctx->work); + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); +} + +static int mlx4_ib_post_pv_qp_buf(struct mlx4_ib_demux_pv_ctx *ctx, + struct mlx4_ib_demux_pv_qp *tun_qp, + int index) +{ + struct ib_sge sg_list; + struct ib_recv_wr recv_wr, *bad_recv_wr; + int size; + + size = (tun_qp->qp->qp_type == IB_QPT_UD) ? + sizeof (struct mlx4_tunnel_mad) : sizeof (struct mlx4_mad_rcv_buf); + + sg_list.addr = tun_qp->ring[index].map; + sg_list.length = size; + sg_list.lkey = ctx->mr->lkey; + + recv_wr.next = NULL; + recv_wr.sg_list = &sg_list; + recv_wr.num_sge = 1; + recv_wr.wr_id = (u64) index | MLX4_TUN_WRID_RECV | + MLX4_TUN_SET_WRID_QPN(tun_qp->proxy_qpt); + ib_dma_sync_single_for_device(ctx->ib_dev, tun_qp->ring[index].map, + size, DMA_FROM_DEVICE); + return ib_post_recv(tun_qp->qp, &recv_wr, &bad_recv_wr); +} + +static int mlx4_ib_multiplex_sa_handler(struct ib_device *ibdev, int port, + int slave, struct ib_sa_mad *sa_mad) +{ + int ret = 0; + + /* dispatch to different sa handlers */ + switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) { + case IB_SA_ATTR_MC_MEMBER_REC: + ret = mlx4_ib_mcg_multiplex_handler(ibdev, port, slave, sa_mad); + break; + default: + break; + } + return ret; +} + +static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave) +{ + int slave_start = dev->dev->caps.tunnel_qpn + 8 * (slave + 1); + + return (qpn >= slave_start && qpn <= slave_start + 1); +} + + +int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, + u32 qkey, struct ib_ah_attr *attr, struct ib_mad *mad) +{ + struct ib_sge list; + struct ib_send_wr wr, *bad_wr; + struct mlx4_ib_demux_pv_ctx *sqp_ctx; + struct mlx4_ib_demux_pv_qp *sqp; + struct mlx4_mad_snd_buf *sqp_mad; + struct ib_ah *ah; + struct ib_qp *send_qp = NULL; + unsigned wire_tx_ix = 0; + int ret = 0; + int src_qpnum; + u16 wire_pkey_ix; + u8 sgid_index; + + + sqp_ctx = dev->sriov.sqps[port-1]; + + /* check if proxy qp created */ + if (!sqp_ctx || sqp_ctx->state != DEMUX_PV_STATE_ACTIVE) + return -EAGAIN; + + /* QP0 forwarding only for Dom0 */ + if (dest_qpt == IB_QPT_SMI && (get_master_func_num(dev) != slave)) + return -EINVAL; + + if (dest_qpt == IB_QPT_SMI) { + src_qpnum = 0; + sqp = &sqp_ctx->qp[0]; + wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0]; + } else { + src_qpnum = 1; + sqp = &sqp_ctx->qp[1]; + wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][pkey_index]; + } + + send_qp = sqp->qp; + + /* create ah */ + sgid_index = attr->grh.sgid_index; + attr->grh.sgid_index = 0; + ah = ib_create_ah(sqp_ctx->pd, attr); + if (IS_ERR(ah)) + return -ENOMEM; + attr->grh.sgid_index = sgid_index; + to_mah(ah)->av.ib.gid_index = sgid_index; + /* get rid of force-loopback bit */ + to_mah(ah)->av.ib.port_pd &= cpu_to_be32(0x7FFFFFFF); + spin_lock(&sqp->tx_lock); + if (sqp->tx_ix_head - sqp->tx_ix_tail >= + (MLX4_NUM_TUNNEL_BUFS - 1)) + ret = -EAGAIN; + else + wire_tx_ix = 
(++sqp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1); + spin_unlock(&sqp->tx_lock); + if (ret) + goto out; + + sqp_mad = (struct mlx4_mad_snd_buf *) (sqp->tx_ring[wire_tx_ix].buf.addr); + if (sqp->tx_ring[wire_tx_ix].ah) + ib_destroy_ah(sqp->tx_ring[wire_tx_ix].ah); + sqp->tx_ring[wire_tx_ix].ah = ah; + ib_dma_sync_single_for_cpu(&dev->ib_dev, + sqp->tx_ring[wire_tx_ix].buf.map, + sizeof (struct mlx4_mad_snd_buf), + DMA_TO_DEVICE); + + memcpy(&sqp_mad->payload, mad, sizeof (*mad)); + + ib_dma_sync_single_for_device(&dev->ib_dev, + sqp->tx_ring[wire_tx_ix].buf.map, + sizeof (struct mlx4_mad_snd_buf), + DMA_TO_DEVICE); + + list.addr = sqp->tx_ring[wire_tx_ix].buf.map; + list.length = sizeof(struct mlx4_mad_snd_buf); + list.lkey = sqp_ctx->mr->lkey; + + wr.wr.ud.ah = ah; + wr.wr.ud.port_num = port; + wr.wr.ud.pkey_index = wire_pkey_ix; + wr.wr.ud.remote_qkey = qkey; + wr.wr.ud.remote_qpn = remote_qpn; + wr.next = NULL; + wr.wr_id = ((u64) wire_tx_ix) | MLX4_TUN_SET_WRID_QPN(src_qpnum); + wr.sg_list = &list; + wr.num_sge = 1; + wr.opcode = IB_WR_SEND; + wr.send_flags = IB_SEND_SIGNALED; + + ret = ib_post_send(send_qp, &wr, &bad_wr); +out: + if (ret) + ib_destroy_ah(ah); + return ret; +} + +static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc *wc) +{ + struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); + struct mlx4_ib_demux_pv_qp *tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc->wr_id)]; + int wr_ix = wc->wr_id & (MLX4_NUM_TUNNEL_BUFS - 1); + struct mlx4_tunnel_mad *tunnel = tun_qp->ring[wr_ix].addr; + struct mlx4_ib_ah ah; + struct ib_ah_attr ah_attr; + u8 *slave_id; + int is_master; /* 1 if master */ + int slave; + + /* Get slave that sent this packet */ + if (wc->src_qp < dev->dev->caps.tunnel_qpn + 8 || + wc->src_qp >= dev->dev->caps.tunnel_qpn + 8 * (dev->dev->caps.sqp_demux + 1) || + (wc->src_qp & 0x1) != ctx->port - 1 || + wc->src_qp & 0x4) { + mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d\n", wc->src_qp); + return; + } + slave = ((wc->src_qp & ~0x7) - dev->dev->caps.tunnel_qpn) / 8 - 1; + if (slave != ctx->slave) { + mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: " + "belongs to another slave\n", wc->src_qp); + return; + } + + is_master = (slave == get_master_func_num(dev)); + + if (!is_master && !(wc->src_qp & 0x2)) { + mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: " + "non-master trying to send QP0 packets\n", wc->src_qp); + return; + } + + /* Map transaction ID */ + ib_dma_sync_single_for_cpu(ctx->ib_dev, tun_qp->ring[wr_ix].map, + sizeof(struct mlx4_tunnel_mad), + DMA_FROM_DEVICE); + switch (tunnel->mad.mad_hdr.method) { + case IB_MGMT_METHOD_SET: + case IB_MGMT_METHOD_GET: + case IB_MGMT_METHOD_REPORT: + case IB_SA_METHOD_GET_TABLE: + case IB_SA_METHOD_DELETE: + case IB_SA_METHOD_GET_MULTI: + case IB_SA_METHOD_GET_TRACE_TBL: + slave_id = (u8*) &tunnel->mad.mad_hdr.tid; + if (*slave_id) { + /* XXX TODO: hold a mapping instead of failing */ + mlx4_ib_warn(ctx->ib_dev, "egress mad has non-null tid msb:%d " + "class:%d slave:%d\n", *slave_id, + tunnel->mad.mad_hdr.mgmt_class, slave); + return; + } else + *slave_id = slave; + default: + /* nothing */; + } + + /* Class-specific handling */ + switch (tunnel->mad.mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_ADM: + if (mlx4_ib_multiplex_sa_handler(ctx->ib_dev, ctx->port, slave, + (struct ib_sa_mad *) &tunnel->mad)) + return; + break; + case IB_MGMT_CLASS_CM: + if (mlx4_ib_multiplex_cm_handler(ctx->ib_dev, ctx->port, slave, + (struct ib_mad *) &tunnel->mad)) + return; + break; + default: + /* Drop 
unsupported classes for slaves in tunnel mode */ + if (!is_master) { + mlx4_ib_warn(ctx->ib_dev, "dropping unsupported egress mad from class:%d " + "for slave:%d\n", tunnel->mad.mad_hdr.mgmt_class, slave); + return; + } + } + + /* We are using standard ib_core services to send the mad, so generate a + * stadard address handle by decoding the tunnelled mlx4_ah fields */ + memcpy(&ah.av, &tunnel->hdr.av, sizeof (struct mlx4_av)); + ah.ibah.device = ctx->ib_dev; + mlx4_ib_query_ah(&ah.ibah, &ah_attr); + if ((ah_attr.ah_flags & IB_AH_GRH) && + (mlx4_gid_idx_to_slave(dev->dev, ah_attr.grh.sgid_index) != slave)) { + mlx4_ib_warn(ctx->ib_dev, "slave:%d accessed invalid sgid_index:%d\n", + slave, ah_attr.grh.sgid_index); + return; + } + + mlx4_ib_send_to_wire(dev, slave, ctx->port, + is_proxy_qp0(dev, wc->src_qp, slave) ? + IB_QPT_SMI : IB_QPT_GSI, + tunnel->hdr.pkey_index, tunnel->hdr.remote_qpn, + tunnel->hdr.qkey, + &ah_attr, &tunnel->mad); +} + +static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx, + enum ib_qp_type qp_type, int is_tun) +{ + int i; + struct mlx4_ib_demux_pv_qp *tun_qp; + int rx_buf_size, tx_buf_size; + + if (qp_type > IB_QPT_GSI) + return -EINVAL; + + tun_qp = &ctx->qp[qp_type]; + + tun_qp->ring = kzalloc(sizeof(struct mlx4_ib_buf) * MLX4_NUM_TUNNEL_BUFS, + GFP_KERNEL); + if (!tun_qp->ring) + return -ENOMEM; + + tun_qp->tx_ring = kzalloc(sizeof(struct mlx4_ib_tun_tx_buf) * + MLX4_NUM_TUNNEL_BUFS, + GFP_KERNEL); + if (!tun_qp->tx_ring) { + kfree(tun_qp->ring); + tun_qp->ring = NULL; + return -ENOMEM; + } + + if (is_tun) { + rx_buf_size = sizeof (struct mlx4_tunnel_mad); + tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad); + } else { + rx_buf_size = sizeof (struct mlx4_mad_rcv_buf); + tx_buf_size = sizeof (struct mlx4_mad_snd_buf); + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + tun_qp->ring[i].addr = kmalloc(rx_buf_size, GFP_KERNEL); + if (!tun_qp->ring[i].addr) + goto err; + tun_qp->ring[i].map = ib_dma_map_single(ctx->ib_dev, + tun_qp->ring[i].addr, + rx_buf_size, + DMA_FROM_DEVICE); + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + tun_qp->tx_ring[i].buf.addr = + kmalloc(tx_buf_size, GFP_KERNEL); + if (!tun_qp->tx_ring[i].buf.addr) + goto tx_err; + tun_qp->tx_ring[i].buf.map = + ib_dma_map_single(ctx->ib_dev, + tun_qp->tx_ring[i].buf.addr, + tx_buf_size, + DMA_TO_DEVICE); + tun_qp->tx_ring[i].ah = NULL; + } + spin_lock_init(&tun_qp->tx_lock); + tun_qp->tx_ix_head = 0; + tun_qp->tx_ix_tail = 0; + tun_qp->proxy_qpt = qp_type; + + return 0; + +tx_err: + while (i > 0) { + --i; + ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map, + tx_buf_size, DMA_TO_DEVICE); + kfree(tun_qp->tx_ring[i].buf.addr); + } + kfree(tun_qp->tx_ring); + tun_qp->tx_ring = NULL; + i = MLX4_NUM_TUNNEL_BUFS; +err: + while (i > 0) { + --i; + ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map, + rx_buf_size, DMA_FROM_DEVICE); + kfree(tun_qp->ring[i].addr); + } + kfree(tun_qp->ring); + tun_qp->ring = NULL; + return -ENOMEM; +} + +static void mlx4_ib_free_pv_qp_bufs(struct mlx4_ib_demux_pv_ctx *ctx, + enum ib_qp_type qp_type, int is_tun) +{ + int i; + struct mlx4_ib_demux_pv_qp *tun_qp; + int rx_buf_size, tx_buf_size; + + if (qp_type > IB_QPT_GSI) + return; + + tun_qp = &ctx->qp[qp_type]; + if (is_tun) { + rx_buf_size = sizeof (struct mlx4_tunnel_mad); + tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad); + } else { + rx_buf_size = sizeof (struct mlx4_mad_rcv_buf); + tx_buf_size = sizeof (struct mlx4_mad_snd_buf); + } + + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + 
ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map, + rx_buf_size, DMA_FROM_DEVICE); + kfree(tun_qp->ring[i].addr); + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map, + tx_buf_size, DMA_TO_DEVICE); + kfree(tun_qp->tx_ring[i].buf.addr); + if (tun_qp->tx_ring[i].ah) + ib_destroy_ah(tun_qp->tx_ring[i].ah); + } + kfree(tun_qp->tx_ring); + kfree(tun_qp->ring); +} + +static void mlx4_ib_tunnel_comp_worker(struct work_struct *work) +{ + struct mlx4_ib_demux_pv_ctx *ctx; + struct mlx4_ib_demux_pv_qp *tun_qp; + struct ib_wc wc; + int ret; + ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work); + ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); + + while (ib_poll_cq(ctx->cq, 1, &wc) == 1) { + tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)]; + if (wc.status == IB_WC_SUCCESS) { + switch (wc.opcode) { + case IB_WC_RECV: + mlx4_ib_multiplex_mad(ctx, &wc); + ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, + wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)); + if (ret) + printk(KERN_ERR "Failed reposting tunnel " + "buf:%lld\n", wc.wr_id); + break; + case IB_WC_SEND: +#ifdef DEBUG + mlx4_ib_dbg("received tunnel send completion:" + "wrid=0x%llx, status=0x%x", + wc.wr_id, wc.status); +#endif /*DEBUG*/ + ib_destroy_ah(tun_qp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&tun_qp->tx_lock); + tun_qp->tx_ix_tail++; + spin_unlock(&tun_qp->tx_lock); + + break; + default: + break; + } + } else { + mlx4_ib_dbg("mlx4_ib: completion error in tunnel: %d." + " status = %d, wrid = 0x%llx", + ctx->slave, wc.status, wc.wr_id); + if (!MLX4_TUN_IS_RECV(wc.wr_id)) { + ib_destroy_ah(tun_qp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&tun_qp->tx_lock); + tun_qp->tx_ix_tail++; + spin_unlock(&tun_qp->tx_lock); + } + } + } +} + +static void pv_qp_event_handler(struct ib_event *event, void *qp_context) +{ + struct mlx4_ib_demux_pv_ctx *sqp = qp_context; + + /* It's worse than that! He's dead, Jim! 
*/ + printk(KERN_ERR "Fatal error (%d) on a MAD QP on port %d\n", + event->event, sqp->port); +} + +static int create_pv_sqp(struct mlx4_ib_demux_pv_ctx *ctx, + enum ib_qp_type qp_type, int create_tun) +{ + int i, ret; + struct mlx4_ib_demux_pv_qp *tun_qp; + struct mlx4_ib_qp_tunnel_init_attr qp_init_attr; + struct ib_qp_attr attr; + int qp_attr_mask_INIT; + + if (qp_type > IB_QPT_GSI) + return -EINVAL; + + tun_qp = &ctx->qp[qp_type]; + + memset(&qp_init_attr, 0, sizeof qp_init_attr); + qp_init_attr.init_attr.send_cq = ctx->cq; + qp_init_attr.init_attr.recv_cq = ctx->cq; + qp_init_attr.init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.init_attr.cap.max_send_wr = MLX4_NUM_TUNNEL_BUFS; + qp_init_attr.init_attr.cap.max_recv_wr = MLX4_NUM_TUNNEL_BUFS; + qp_init_attr.init_attr.cap.max_send_sge = 1; + qp_init_attr.init_attr.cap.max_recv_sge = 1; + if (create_tun) { + qp_init_attr.init_attr.qp_type = IB_QPT_UD; + qp_init_attr.init_attr.create_flags = MLX4_IB_QP_TUNNEL; + qp_init_attr.port = ctx->port; + qp_init_attr.slave = ctx->slave; + qp_init_attr.proxy_qp_type = qp_type; + qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | + IB_QP_QKEY | IB_QP_PORT; + } else { + qp_init_attr.init_attr.qp_type = qp_type; + qp_init_attr.init_attr.create_flags = MLX4_IB_SRIOV_SQP; + qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY; + } + qp_init_attr.init_attr.port_num = ctx->port; + qp_init_attr.init_attr.qp_context = ctx; + qp_init_attr.init_attr.event_handler = pv_qp_event_handler; + tun_qp->qp = ib_create_qp(ctx->pd, &qp_init_attr.init_attr); + if (IS_ERR(tun_qp->qp)) { + ret = PTR_ERR(tun_qp->qp); + tun_qp->qp = NULL; + printk(KERN_ERR "Couldn't create %s QP (%d)\n", + create_tun ? "tunnel" : "special", ret); + return ret; + } + + memset(&attr, 0, sizeof attr); + attr.qp_state = IB_QPS_INIT; + attr.pkey_index = + to_mdev(ctx->ib_dev)->pkeys.virt2phys_pkey[ctx->slave][ctx->port - 1][0]; + attr.qkey = IB_QP1_QKEY; + attr.port_num = ctx->port; + ret = ib_modify_qp(tun_qp->qp, &attr, qp_attr_mask_INIT); + if (ret) { + printk(KERN_ERR "Couldn't change %s qp state to INIT (%d)\n", + create_tun ? "tunnel" : "special", ret); + goto err_qp; + } + attr.qp_state = IB_QPS_RTR; + ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE); + if (ret) { + printk(KERN_ERR "Couldn't change %s qp state to RTR (%d)\n", + create_tun ? "tunnel" : "special", ret); + goto err_qp; + } + attr.qp_state = IB_QPS_RTS; + attr.sq_psn = 0; + ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN); + if (ret) { + printk(KERN_ERR "Couldn't change %s qp state to RTS (%d)\n", + create_tun ? 
"tunnel" : "special", ret); + goto err_qp; + } + + for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { + ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, i); + if (ret) { + printk(KERN_ERR " mlx4_ib_post_pv_buf error" + " (err = %d, i = %d)\n", ret, i); + goto err_qp; + } + } + return 0; + +err_qp: + ib_destroy_qp(tun_qp->qp); + tun_qp->qp = NULL; + return ret; +} + +/* + * IB MAD completion callback for real SQPs + */ +static void mlx4_ib_sqp_comp_worker(struct work_struct *work) +{ + struct mlx4_ib_demux_pv_ctx *ctx; + struct mlx4_ib_demux_pv_qp *sqp; + struct ib_wc wc; + struct ib_grh *grh; + struct ib_mad *mad; + + ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work); + ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); + + while (mlx4_ib_poll_cq(ctx->cq, 1, &wc) == 1) { + sqp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)]; + if (wc.status == IB_WC_SUCCESS) { + switch (wc.opcode) { + case IB_WC_SEND: + ib_destroy_ah(sqp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&sqp->tx_lock); + sqp->tx_ix_tail++; + spin_unlock(&sqp->tx_lock); + break; + case IB_WC_RECV: + mad = (struct ib_mad *) &(((struct mlx4_mad_rcv_buf *) + (sqp->ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].addr))->payload); + grh = &(((struct mlx4_mad_rcv_buf *) + (sqp->ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].addr))->grh); + mlx4_ib_demux_mad(ctx->ib_dev, ctx->port, &wc, grh, mad); + if (mlx4_ib_post_pv_qp_buf(ctx, sqp, wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1))) + printk(KERN_ERR "Failed reposting SQP " + "buf:%lld\n", wc.wr_id); + break; + default: + BUG_ON(1); + break; + } + } else { + mlx4_ib_dbg("mlx4_ib: completion error in tunnel: %d." + " status = %d, wrid = 0x%llx", + ctx->slave, wc.status, wc.wr_id); + if (!MLX4_TUN_IS_RECV(wc.wr_id)) { + ib_destroy_ah(sqp->tx_ring[wc.wr_id & + (MLX4_NUM_TUNNEL_BUFS - 1)].ah); + sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah + = NULL; + spin_lock(&sqp->tx_lock); + sqp->tx_ix_tail++; + spin_unlock(&sqp->tx_lock); + } + } + } +} + +static int alloc_pv_object(struct mlx4_ib_dev *dev, int slave, int port, + struct mlx4_ib_demux_pv_ctx **ret_ctx) +{ + struct mlx4_ib_demux_pv_ctx *ctx; + + *ret_ctx = NULL; + ctx = kzalloc(sizeof(struct mlx4_ib_demux_pv_ctx), GFP_KERNEL); + if (!ctx) { + printk(KERN_ERR "failed allocating pv resource context" + " for port %d, slave %d\n", port, slave); + return -ENOMEM; + } + + ctx->ib_dev = &dev->ib_dev; + ctx->port = port; + ctx->slave = slave; + *ret_ctx = ctx; + return 0; +} + +static void free_pv_object(struct mlx4_ib_dev *dev, int slave, int port) +{ + if (dev->sriov.demux[port - 1].tun[slave]) { + kfree(dev->sriov.demux[port - 1].tun[slave]); + dev->sriov.demux[port - 1].tun[slave] = NULL; + } +} + +static int create_pv_resources(struct ib_device *ibdev, int slave, int port, + int create_tun, struct mlx4_ib_demux_pv_ctx *ctx) +{ + int ret, cq_size; + + ctx->state = DEMUX_PV_STATE_STARTING; + if ((ctx->slave == to_mdev(ctx->ib_dev)->dev->caps.function) && + mlx4_ib_port_link_layer(to_mdev(ctx->ib_dev), ctx->port) == + IB_LINK_LAYER_INFINIBAND) + ctx->has_smi = 1; + + if (ctx->has_smi) { + ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_SMI, create_tun); + if (ret) { + printk(KERN_ERR "Failed allocating qp0 tunnel bufs (%d)\n", ret); + goto err_out; + } + } + + ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_GSI, create_tun); + if (ret) { + printk(KERN_ERR "Failed allocating qp1 tunnel bufs (%d)\n", ret); + goto err_out_qp0; + } + + cq_size = 2 * MLX4_NUM_TUNNEL_BUFS; + if (ctx->has_smi) 
+ cq_size *= 2; + + ctx->cq = ib_create_cq(ctx->ib_dev, mlx4_ib_tunnel_comp_handler, + NULL, ctx, cq_size, 0); + if (IS_ERR(ctx->cq)) { + ret = PTR_ERR(ctx->cq); + printk(KERN_ERR "Couldn't create tunnel CQ (%d)\n", ret); + goto err_buf; + } + + ctx->pd = ib_alloc_pd(ctx->ib_dev); + if (IS_ERR(ctx->pd)) { + ret = PTR_ERR(ctx->pd); + printk(KERN_ERR "Couldn't create tunnel PD (%d)\n", ret); + goto err_cq; + } + + ctx->mr = ib_get_dma_mr(ctx->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(ctx->mr)) { + ret = PTR_ERR(ctx->mr); + printk(KERN_ERR "Couldn't get tunnel DMA MR (%d)\n", ret); + goto err_pd; + } + + if (ctx->has_smi) { + ret = create_pv_sqp(ctx, IB_QPT_SMI, create_tun); + if (ret) { + printk(KERN_ERR "Couldn't create %s QP0 (%d)\n", + create_tun ? "tunnel for" : "", ret); + goto err_mr; + } + } + + ret = create_pv_sqp(ctx, IB_QPT_GSI, create_tun); + if (ret) { + printk(KERN_ERR "Couldn't create %s QP1 (%d)\n", + create_tun ? "tunnel for" : "", ret); + goto err_qp0; + } + + if (create_tun) + INIT_WORK(&ctx->work, mlx4_ib_tunnel_comp_worker); + else + INIT_WORK(&ctx->work, mlx4_ib_sqp_comp_worker); + + ctx->wq = to_mdev(ibdev)->sriov.demux[port - 1].wq; + + ret = ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP); + if (ret) { + printk(KERN_ERR "Couldn't arm tunnel cq (%d)\n", ret); + goto err_wq; + } + ctx->state = DEMUX_PV_STATE_ACTIVE; + return 0; + +err_wq: + ctx->wq = NULL; + ib_destroy_qp(ctx->qp[1].qp); + ctx->qp[1].qp = NULL; + + +err_qp0: + if (ctx->has_smi) + ib_destroy_qp(ctx->qp[0].qp); + ctx->qp[0].qp = NULL; + +err_mr: + ib_dereg_mr(ctx->mr); + ctx->mr = NULL; + +err_pd: + ib_dealloc_pd(ctx->pd); + ctx->pd = NULL; + +err_cq: + ib_destroy_cq(ctx->cq); + ctx->cq = NULL; + +err_buf: + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, create_tun); + +err_out_qp0: + if (ctx->has_smi) + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, create_tun); +err_out: + ctx->state = DEMUX_PV_STATE_DOWN; + return ret; +} + +void mlx4_ib_tunnels_update_work(struct work_struct *work) +{ + struct mlx4_ib_demux_work *dmxw; + + dmxw = container_of(work, struct mlx4_ib_demux_work, work); + mlx4_ib_tunnels_update(dmxw->dev, dmxw->slave, (int) dmxw->port, dmxw->do_init, 1); + kfree(dmxw); + return; +} + +void destroy_pv_resources(struct mlx4_ib_dev *dev, int slave, int port, + struct mlx4_ib_demux_pv_ctx *ctx, int flush) +{ + if (!ctx) + return; + if (ctx->state > DEMUX_PV_STATE_DOWN) { + ctx->state = DEMUX_PV_STATE_DOWNING; + if (flush) + flush_workqueue(ctx->wq); + if (ctx->has_smi) { + ib_destroy_qp(ctx->qp[0].qp); + ctx->qp[0].qp = NULL; + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, 1); + } + ib_destroy_qp(ctx->qp[1].qp); + ctx->qp[1].qp = NULL; + mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, 1); + ib_dereg_mr(ctx->mr); + ctx->mr = NULL; + ib_dealloc_pd(ctx->pd); + ctx->pd = NULL; + ib_destroy_cq(ctx->cq); + ctx->cq = NULL; + ctx->state = DEMUX_PV_STATE_DOWN; + } +} + +int mlx4_ib_tunnels_update(struct mlx4_ib_dev *dev, int slave, int port, + int do_init, int from_wq) +{ + int ret = 0; + + if (!do_init) { + clean_vf_mcast(&dev->sriov.demux[port -1], slave); + /* for master, destroy real sqp resources */ + if (slave == dev->dev->caps.function) + destroy_pv_resources(dev, slave, port, + dev->sriov.sqps[port - 1], 1); + + destroy_pv_resources(dev, slave, port, + dev->sriov.demux[port - 1].tun[slave], 1); + return 0; + } + + ret = create_pv_resources(&dev->ib_dev, slave, port, 1, + dev->sriov.demux[port - 1].tun[slave]); + if (!ret && slave == dev->dev->caps.function) { + ret = create_pv_resources(&dev->ib_dev, slave, port, 0, + 
dev->sriov.sqps[port - 1]); + } + return ret; +} + +static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev, + struct mlx4_ib_demux_ctx *ctx, + int port) +{ + char name[12]; + int ret = 0; + int i; + + ctx->tun = kzalloc(dev->dev->caps.sqp_demux * + sizeof(struct mlx4_ib_demux_pv_ctx *), GFP_KERNEL); + if (!ctx->tun) + return -ENOMEM; + + ctx->dev = dev; + ctx->port = port; + ctx->ib_dev = &dev->ib_dev; + + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + ret = alloc_pv_object(dev, i, port, &ctx->tun[i]); + if (ret) { + ret = -ENOMEM; + goto err_mcg; + } + } + + ret = mlx4_ib_mcg_port_init(ctx); + if (ret) { + printk(KERN_ERR "Failed initializing mcg para-virt (%d)\n", ret); + goto err_mcg; + } + + snprintf(name, sizeof name, "mlx4_ibt%d", port); + ctx->wq = create_singlethread_workqueue(name); + if (!ctx->wq) { + printk(KERN_ERR "Failed to create tunnelling WQ for port %d\n", port); + ret = -ENOMEM; + goto err_wq; + } + + snprintf(name, sizeof name, "mlx4_ibud%d", port); + ctx->ud_wq = create_singlethread_workqueue(name); + if (!ctx->ud_wq) { + printk(KERN_ERR "Failed to create up/down WQ for port %d\n", port); + ret = -ENOMEM; + goto err_udwq; + } + + return 0; + +err_udwq: + destroy_workqueue(ctx->wq); + ctx->wq = NULL; + +err_wq: + mlx4_ib_mcg_port_cleanup(ctx, 1); +err_mcg: + for (i = 0; i < dev->dev->caps.sqp_demux; i++) + free_pv_object(dev, i, port); + kfree(ctx->tun); + ctx->tun = NULL; + return ret; +} + +static void mlx4_ib_free_sqp_ctx(struct mlx4_ib_demux_pv_ctx *sqp_ctx) +{ + if (sqp_ctx->state > DEMUX_PV_STATE_DOWN) { + sqp_ctx->state = DEMUX_PV_STATE_DOWNING; + flush_workqueue(sqp_ctx->wq); + if (sqp_ctx->has_smi) { + ib_destroy_qp(sqp_ctx->qp[0].qp); + sqp_ctx->qp[0].qp = NULL; + mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_SMI, 0); + } + ib_destroy_qp(sqp_ctx->qp[1].qp); + sqp_ctx->qp[1].qp = NULL; + mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_GSI, 0); + ib_dereg_mr(sqp_ctx->mr); + sqp_ctx->mr = NULL; + ib_dealloc_pd(sqp_ctx->pd); + sqp_ctx->pd = NULL; + ib_destroy_cq(sqp_ctx->cq); + sqp_ctx->cq = NULL; + sqp_ctx->state = DEMUX_PV_STATE_DOWN; + } +} + +static void mlx4_ib_free_demux_ctx(struct mlx4_ib_demux_ctx *ctx) +{ + int i; + if (ctx) { + struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev); + mlx4_ib_mcg_port_cleanup(ctx, 1); + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + if (!ctx->tun[i]) + continue; + if (ctx->tun[i]->state > DEMUX_PV_STATE_DOWN) + ctx->tun[i]->state = DEMUX_PV_STATE_DOWNING; + } + flush_workqueue(ctx->wq); + for (i = 0; i < dev->dev->caps.sqp_demux; i++) { + destroy_pv_resources(dev, i, ctx->port, ctx->tun[i], 0); + free_pv_object(dev, i, ctx->port); + } + kfree(ctx->tun); + destroy_workqueue(ctx->ud_wq); + destroy_workqueue(ctx->wq); + } +} + +void mlx4_ib_master_tunnels(struct mlx4_ib_dev *dev, int do_init) +{ + int i; + + if (!dev->dev->caps.sqp_demux) + return; + /* initialize or tear down tunnel QPs for the slave */ + for (i = 0; i < dev->dev->caps.num_ports; i++) + mlx4_ib_tunnels_update(dev, dev->dev->caps.function, i + 1, do_init, 0); + return; +} + +static inline void set_gids_per_func(struct mlx4_ib_dev *dev) +{ + u8 *gids_per_func = &(dev->dev->gids_per_func); + u8 max_gids_per_func; + + max_gids_per_func = MLX4_MAX_NUM_GIDS / dev->dev->sr_iov; + if (mlx4_ib_gids_per_func > max_gids_per_func || + mlx4_ib_gids_per_func < 0) { + *gids_per_func = 1; + + mlx4_ib_warn(&dev->ib_dev, "Invalid parameter gids_per_func %d. " + "value must be between 0 and %d. 
" + "defaulting to %d\n", + max_gids_per_func, + mlx4_ib_gids_per_func, + *gids_per_func); + /* if the parameter is set to 0 - use as many gids per function as possible */ + } else if (mlx4_ib_gids_per_func == 0) + *gids_per_func = max_gids_per_func; + else + *gids_per_func = mlx4_ib_gids_per_func; + + mlx4_ib_dbg("Requested gids per func: %d, setting gids per func to %d. " + "Num VFs is: %d\n", mlx4_ib_gids_per_func, *gids_per_func, + dev->dev->sr_iov); + + return; +} + + +int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev) +{ + int i = 0; + int err; + + if (!mlx4_is_mfunc(dev->dev)) + return 0; + + dev->sriov.is_going_down = 0; + spin_lock_init(&dev->sriov.going_down_lock); + mlx4_ib_cm_paravirt_init(dev); + + /* XXX user-space RDMACM relies on unqie node guids to distinguish among + * cma devices. */ + ((u8*) &dev->ib_dev.node_guid)[MLX4_SLAVE_ID_NODE_GUID_OFFSET] += + mlx4_ib_get_virt2phys_gid(dev, 1, 0); + + mlx4_ib_warn(&dev->ib_dev, "multi-function enabled\n"); + mlx4_ib_warn(&dev->ib_dev, "using node_guid:0x%016llx\n", + be64_to_cpu(dev->ib_dev.node_guid)); + + if (!dev->dev->caps.sqp_demux) + mlx4_ib_warn(&dev->ib_dev, "operating in qp1 tunnel mode\n"); + + if (dev->dev->caps.sqp_demux) { + set_gids_per_func(dev); + + err = init_alias_guid_service(dev); + if (err) { + mlx4_ib_warn(&dev->ib_dev, "Failed init alias guid process.\n"); + goto paravirt_err; + } + err = mlx4_ib_device_register_sysfs(dev); + if (err) { + mlx4_ib_warn(&dev->ib_dev, "Failed to register sysfs\n"); + goto sysfs_err; + } + + /* XXX until we have proper SM support, we mannually assign + * additional port guids for guests */ + /*if (mlx4_ib_set_slave_guids(&dev->ib_dev)) + mlx4_ib_warn(&dev->ib_dev, "Failed setting slave guids\n"); + */ + + mlx4_ib_warn(&dev->ib_dev, "initializing demux service for %d qp1 clients\n", + dev->dev->caps.sqp_demux); + for (i = 0; i < dev->num_ports; i++) { + err = alloc_pv_object(dev, dev->dev->caps.function, i + 1, + &dev->sriov.sqps[i]); + if (err) + goto demux_err; + err = mlx4_ib_alloc_demux_ctx(dev, &dev->sriov.demux[i], i + 1); + if (err) + goto demux_err; + } + mlx4_ib_master_tunnels(dev, 1); + } + return 0; + +demux_err: + while (i > 0) { + free_pv_object(dev, dev->dev->caps.function, i + 1); + mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]); + --i; + } + mlx4_ib_device_unregister_sysfs(dev); + +sysfs_err: + clear_alias_guid_work(dev); + +paravirt_err: + mlx4_ib_cm_paravirt_clean(dev, -1); + + return err; +} + +void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev) +{ + int i; + unsigned long flags; + + if (!mlx4_is_mfunc(dev->dev)) + return; + + spin_lock_irqsave(&dev->sriov.going_down_lock, flags); + dev->sriov.is_going_down = 1; + spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); + if (dev->dev->caps.sqp_demux) { + for (i = 0; i < dev->num_ports; i++) { + flush_workqueue(dev->sriov.demux[i].ud_wq); + mlx4_ib_free_sqp_ctx(dev->sriov.sqps[i]); + kfree(dev->sriov.sqps[i]); + dev->sriov.sqps[i] = NULL; + mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]); + } + + mlx4_ib_cm_paravirt_clean(dev, -1); + clear_alias_guid_work(dev); + mlx4_ib_device_unregister_sysfs(dev); + //mlx4_ib_master_tunnels(dev, 0); + } +} + int mlx4_ib_mad_init(struct mlx4_ib_dev *dev) { struct ib_mad_agent *agent; @@ -313,7 +1973,7 @@ int mlx4_ib_mad_init(struct mlx4_ib_dev *dev) enum rdma_link_layer ll; for (p = 0; p < dev->num_ports; ++p) { - ll = rdma_port_get_link_layer(&dev->ib_dev, p + 1); + ll = rdma_port_link_layer(&dev->ib_dev, p + 1); for (q = 0; q <= 1; ++q) { if (ll == IB_LINK_LAYER_INFINIBAND) { 
agent = ib_register_mad_agent(&dev->ib_dev, p + 1, @@ -330,6 +1990,7 @@ int mlx4_ib_mad_init(struct mlx4_ib_dev *dev) } } + return 0; err: diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index fbe1973f77b0d..d3966dcbc1fcf 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -49,29 +49,61 @@ #include "mlx4_ib.h" #include "user.h" +#include "wc.h" +#include "alias_GUID.h" +#include "ib_events.h" -#define DRV_NAME "mlx4_ib" -#define DRV_VERSION "1.0" +#define DRV_NAME MLX4_IB_DRV_NAME +#define DRV_VERSION "1.0-ofed1.5.5" #define DRV_RELDATE "April 4, 2008" MODULE_AUTHOR("Roland Dreier"); -MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); +MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver - supports multi-func"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(DRV_VERSION); +#ifdef CONFIG_MLX4_DEBUG + +int mlx4_ib_debug_level = 0; +module_param_named(debug_level, mlx4_ib_debug_level, int, 0644); +MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); + +#endif /* CONFIG_MLX4_DEBUG */ + +int mlx4_ib_sm_guid_assign = 0; +module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444); +MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0"); + + +int mlx4_ib_guid_gen_magic = 100; +module_param_named(guid_gen_magic, mlx4_ib_guid_gen_magic, int, 0444); +MODULE_PARM_DESC(guid_gen_magic, "Magic num to add to the generated guid, default is 100."); + +int mlx4_ib_gids_per_func = 1; +module_param_named(guids_per_func, mlx4_ib_gids_per_func, int, 0444); +MODULE_PARM_DESC(guids_per_func, "Number of guids per function, default is 1."); + + static const char mlx4_ib_version[] = DRV_NAME ": Mellanox ConnectX InfiniBand driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; +static void *get_ibdev(struct mlx4_dev *dev, void *ctx, u8 port) +{ + struct mlx4_ib_dev *mlxibdev = ctx; + return &mlxibdev->ib_dev; +} + struct update_gid_work { - struct work_struct work; - union ib_gid gids[128]; - struct mlx4_ib_dev *dev; - int port; + struct work_struct work; + union ib_gid gids[128]; + int port; + struct mlx4_ib_dev *dev; }; static struct workqueue_struct *wq; + static void init_query_mad(struct ib_smp *mad) { mad->base_version = 1; @@ -128,17 +160,21 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_REMOTE_INV) && (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_FAST_REG_WR)) props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) + props->device_cap_flags |= IB_DEVICE_XRC; + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_RAW_ETY) + props->max_raw_ethy_qp = dev->ib_dev.phys_port_cnt; props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & 0xffffff; - props->vendor_part_id = be16_to_cpup((__be16 *) (out_mad->data + 30)); + props->vendor_part_id = dev->dev->pdev->device; props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&props->sys_image_guid, out_mad->data + 4, 8); props->max_mr_size = ~0ull; props->page_size_cap = dev->dev->caps.page_size_cap; props->max_qp = dev->dev->caps.num_qps - dev->dev->caps.reserved_qps; - props->max_qp_wr = dev->dev->caps.max_wqes; + props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; props->max_sge = min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); props->max_cq = dev->dev->caps.num_cqs - dev->dev->caps.reserved_cqs; @@ -151,7 +187,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, 
props->max_srq = dev->dev->caps.num_srqs - dev->dev->caps.reserved_srqs; props->max_srq_wr = dev->dev->caps.max_srq_wqes - 1; props->max_srq_sge = dev->dev->caps.max_srq_sge; - props->max_fast_reg_page_list_len = MLX4_MAX_FAST_REG_PAGES; + props->max_fast_reg_page_list_len = PAGE_SIZE / sizeof (u64); props->local_ca_ack_delay = dev->dev->caps.local_ca_ack_delay; props->atomic_cap = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ? IB_ATOMIC_HCA : IB_ATOMIC_NONE; @@ -175,13 +211,13 @@ mlx4_ib_port_link_layer(struct ib_device *device, u8 port_num) { struct mlx4_dev *dev = to_mdev(device)->dev; - return dev->caps.port_mask & (1 << (port_num - 1)) ? + return dev->caps.port_mask[port_num] == MLX4_PORT_TYPE_IB ? IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; } -static int ib_link_query_port(struct ib_device *ibdev, u8 port, - struct ib_port_attr *props, - struct ib_smp *out_mad) +static void ib_link_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props, + struct ib_smp *out_mad) { props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16)); props->lmc = out_mad->data[34] & 0x7; @@ -202,8 +238,41 @@ static int ib_link_query_port(struct ib_device *ibdev, u8 port, props->subnet_timeout = out_mad->data[51] & 0x1f; props->max_vl_num = out_mad->data[37] >> 4; props->init_type_reply = out_mad->data[41] >> 4; + props->link_layer = IB_LINK_LAYER_INFINIBAND; + props->ext_active_speed = out_mad->data[62] >> 4; - return 0; + /* Cache local lid for qp1 tunneling in sriov */ + to_mdev(ibdev)->sriov.local_lid[port - 1] = props->lid; + +} + +int eth_to_ib_width(int w) +{ + switch (w) { + case 4: + return IB_WIDTH_4X; + case 8: + case 16: + return IB_WIDTH_8X; + case 32: + return IB_WIDTH_12X; + default: + return IB_WIDTH_1X; + } +} + +int eth_to_ib_speed(int s) +{ + switch (s) { + case 256: + return 1; + case 512: + return 2; + case 1024: + return 4; + default: + return 1; + } } static u8 state_to_phys_state(enum ib_port_state state) @@ -217,10 +286,35 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port, { struct mlx4_ib_iboe *iboe = &to_mdev(ibdev)->iboe; struct net_device *ndev; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_dev *dev = to_mdev(ibdev)->dev; + int err; + u32 *outbox; + u32 fl; enum ib_mtu tmp; - props->active_width = IB_WIDTH_1X; - props->active_speed = 4; + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(dev, 0, mailbox->dma, port, 0, MLX4_CMD_QUERY_PORT, + MLX4_CMD_TIME_CLASS_B, 0); + if (err) + goto out; + outbox = mailbox->buf; + + fl = be32_to_cpu(outbox[0x40 / 4]); + if (fl & 1 << 31) + props->active_width = eth_to_ib_width(fl & 0xffff); + else + props->active_width = IB_WIDTH_1X; + + fl = be32_to_cpu(outbox[0x44 / 4]); + if (fl & 1 << 31) + props->active_speed = eth_to_ib_speed(fl & 0xffff); + else + props->active_speed = 1; + props->port_cap_flags = IB_PORT_CM_SUP; props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port]; props->max_msg_sz = to_mdev(ibdev)->dev->caps.max_msg_sz; @@ -231,27 +325,29 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port, props->subnet_timeout = 0; props->max_vl_num = out_mad->data[37] >> 4; props->init_type_reply = 0; + props->link_layer = IB_LINK_LAYER_ETHERNET; props->state = IB_PORT_DOWN; props->phys_state = state_to_phys_state(props->state); props->active_mtu = IB_MTU_256; spin_lock(&iboe->lock); ndev = iboe->netdevs[port - 1]; if (!ndev) - goto out; + goto out_ul; tmp = iboe_get_mtu(ndev->mtu); - props->active_mtu = tmp ? 
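/*
 * Editor's note -- illustrative sketch only, not part of the patch.
 * eth_link_query_port() above reads QUERY_PORT dwords in which bit 31 is a
 * "value valid" flag and the low 16 bits carry the raw Ethernet width or
 * speed, then maps them to InfiniBand codes via eth_to_ib_width() and
 * eth_to_ib_speed().  The userspace sketch below mirrors the width decode
 * (the speed dword at the next offset follows the same pattern); the enum
 * values are illustrative stand-ins for the kernel's IB_WIDTH_* constants.
 */
#include <stdint.h>
#include <stdio.h>

enum demo_ib_width { W_1X = 1, W_4X = 2, W_8X = 4, W_12X = 8 };

static enum demo_ib_width demo_eth_to_ib_width(int w)
{
	switch (w) {
	case 4:			return W_4X;
	case 8: case 16:	return W_8X;
	case 32:		return W_12X;
	default:		return W_1X;
	}
}

static enum demo_ib_width decode_width_dword(uint32_t fl)
{
	/* bit 31: value reported by firmware is valid */
	if (fl & (1u << 31))
		return demo_eth_to_ib_width(fl & 0xffff);
	return W_1X;		/* conservative default, as in the hunk above */
}

int main(void)
{
	printf("0x80000008 -> width code %d\n", decode_width_dword(0x80000008u));
	printf("0x00000020 -> width code %d\n", decode_width_dword(0x00000020u));
	return 0;
}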
min(props->max_mtu, tmp) : IB_MTU_256; - - props->state = (netif_running(ndev) && netif_carrier_ok(ndev)) ? + props->active_mtu = tmp ? min(props->max_mtu, tmp) : 0; + props->state = netif_running(ndev) && netif_oper_up(ndev) ? IB_PORT_ACTIVE : IB_PORT_DOWN; props->phys_state = state_to_phys_state(props->state); -out: +out_ul: spin_unlock(&iboe->lock); - return 0; +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; } -static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, +int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { struct ib_smp *in_mad = NULL; @@ -273,7 +369,7 @@ static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, if (err) goto out; - err = mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ? + mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ? ib_link_query_port(ibdev, port, props, out_mad) : eth_link_query_port(ibdev, port, props, out_mad); @@ -289,7 +385,10 @@ static int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; + struct mlx4_ib_dev *dev = to_mdev(ibdev); int err = -ENOMEM; + int clear = 0; + int gid_index = index; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); @@ -300,12 +399,67 @@ static int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); - err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + err = mlx4_MAD_IFC(dev, 1, 1, port, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(gid->raw, out_mad->data + 8, 8); + if (dev->dev->caps.sqp_demux) { + /* Cache subnet prefix */ + dev->sriov.demux[port - 1].subnet_prefix = gid->global.subnet_prefix; + } + + if (mlx4_is_mfunc(dev->dev)) { + /* If this function is demuxing qp1, we need to cache + * the real guids */ + gid_index = (dev->dev->caps.sqp_demux ? index : + mlx4_ib_get_virt2phys_gid(dev, port, index)); + if (!is_gid_idx_valid(gid_index)) { + err = 0; + clear = 1; + + goto out; + } + } + init_query_mad(in_mad); + in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; + in_mad->attr_mod = cpu_to_be32(gid_index / 8); + + err = mlx4_MAD_IFC(dev, 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) + goto out; + memcpy(gid->raw + 8, out_mad->data + (gid_index % 8) * 8, 8); + + if (dev->dev->caps.sqp_demux) { + dev->sriov.demux[port - 1].guid_cache[gid_index] = + gid->global.interface_id; + } +out: + if (clear) + memset(gid->raw + 8, 0, 8); + kfree(in_mad); + kfree(out_mad); + return err; +} + + +/* Disable this function, since it's not in use. 
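/*
 * Editor's note -- illustrative sketch only, not part of the patch.
 * __mlx4_ib_query_gid() above fetches the port GUID through the SMP
 * GUID_INFO attribute, which returns GUIDs in blocks of eight 8-byte
 * entries: the attribute modifier selects block gid_index / 8, and the
 * wanted GUID sits at byte offset (gid_index % 8) * 8 inside the 64-byte
 * block.  The snippet below only demonstrates that indexing arithmetic
 * on a fake block buffer.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define GUIDS_PER_BLOCK 8

static uint64_t guid_from_block(const uint8_t block[GUIDS_PER_BLOCK * 8],
				int gid_index)
{
	uint64_t guid;

	/* same arithmetic as the attr_mod / memcpy pair in the hunk above */
	memcpy(&guid, block + (gid_index % GUIDS_PER_BLOCK) * 8, 8);
	return guid;
}

int main(void)
{
	uint8_t block[GUIDS_PER_BLOCK * 8];
	int i;

	for (i = 0; i < GUIDS_PER_BLOCK * 8; ++i)
		block[i] = (uint8_t)i;

	printf("gid_index 10 -> block %d, bytes start at %d\n",
	       10 / GUIDS_PER_BLOCK, (10 % GUIDS_PER_BLOCK) * 8);
	printf("guid (raw) = 0x%016llx\n",
	       (unsigned long long)guid_from_block(block, 10));
	return 0;
}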
+ * It was only being used by mlx4_ib_set_slave_guids(), which is no longer in + * use */ +#if 0 +static int mlx4_ib_set_guid(struct ib_device *ibdev, u8 port, int index, u8 *guid) +{ + struct ib_smp *in_mad = NULL; + struct ib_smp *out_mad = NULL; + int err = -ENOMEM; + + in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); + out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); + if (!in_mad || !out_mad) + goto out; + + /* First get relevant block */ init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; in_mad->attr_mod = cpu_to_be32(index / 8); @@ -314,16 +468,25 @@ static int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, if (err) goto out; - memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); + /* Copy block to set mad and update proper GUID */ + in_mad->method = IB_MGMT_METHOD_SET; + memcpy(in_mad->data, out_mad->data, IB_SMP_DATA_SIZE); + memcpy(in_mad->data + (index % 8) * 8, guid, 8); + + err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) + mlx4_ib_warn(ibdev, "Failed setting guid block\n"); out: kfree(in_mad); kfree(out_mad); return err; } +#endif +/* TODO - add support in double-GUIDs feature */ static int iboe_query_gid(struct ib_device *ibdev, u8 port, int index, - union ib_gid *gid) + union ib_gid *gid) { struct mlx4_ib_dev *dev = to_mdev(ibdev); @@ -335,14 +498,37 @@ static int iboe_query_gid(struct ib_device *ibdev, u8 port, int index, static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { - if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) + if (rdma_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) return __mlx4_ib_query_gid(ibdev, port, index, gid); else return iboe_query_gid(ibdev, port, index, gid); } -static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, - u16 *pkey) +/* Disable this function, since it's not in use */ +#if 0 +int mlx4_ib_set_slave_guids(struct ib_device *ibdev) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + union ib_gid gid; + int i, j; + u8 base; + + for (i = 1; i <= dev->num_ports; ++i) { + if (mlx4_ib_query_gid(ibdev, i, 0, &gid)) + return -EFAULT; + + dev->sriov.demux[i - 1].gid_id_base = base = gid.raw[MLX4_SLAVE_ID_GID_OFFSET]; + for (j = 1; j < dev->dev->caps.sqp_demux; j++) { /* slave0 gets the hw guid */ + gid.raw[MLX4_SLAVE_ID_GID_OFFSET] = base + j; /* allow overflows */ + if (mlx4_ib_set_guid(ibdev, i, j, (u8*) &gid.global.interface_id)) + return -EFAULT; + } + } + return 0; +} +#endif + +int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; @@ -373,20 +559,25 @@ static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, struct ib_device_modify *props) { struct mlx4_cmd_mailbox *mailbox; + int err; if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) return -EOPNOTSUPP; if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) - return 0; + return 0; spin_lock(&to_mdev(ibdev)->sm_lock); memcpy(ibdev->node_desc, props->node_desc, 64); spin_unlock(&to_mdev(ibdev)->sm_lock); - /* - * If possible, pass node desc to FW, so it can generate - * a 144 trap. If cmd fails, just ignore. + /* do not pass description to FW if we are a slave */ + if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && + !mlx4_is_master(to_mdev(ibdev)->dev)) + return 0; + + /* if possible, pass node desc to FW, so it can generate + * a 144 trap. If cmd fails, just ignore. 
*/ mailbox = mlx4_alloc_cmd_mailbox(to_mdev(ibdev)->dev); if (IS_ERR(mailbox)) @@ -394,8 +585,10 @@ static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, memset(mailbox->buf, 0, 256); memcpy(mailbox->buf, props->node_desc, 64); - mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0, - MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A); + err = mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0, + MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, 1); + if (err) + mlx4_ib_dbg("SET_NODE command failed (%d)", err); mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox); @@ -424,7 +617,7 @@ static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, } err = mlx4_cmd(dev->dev, mailbox->dma, port, is_eth, MLX4_CMD_SET_PORT, - MLX4_CMD_TIME_CLASS_B); + MLX4_CMD_TIME_CLASS_B, 0); mlx4_free_cmd_mailbox(dev->dev, mailbox); return err; @@ -434,6 +627,7 @@ static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, struct ib_port_modify *props) { struct ib_port_attr attr; + struct mlx4_ib_dev *dev = to_mdev(ibdev); u32 cap_mask; int err; @@ -443,6 +637,13 @@ static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, if (err) goto out; + /* XXX ToDo: Remove 2 lines below once MAD_IFC paravirtualization + * code is completed. Once this is done, slave will only see + * cap bits relevant to it. + */ + if (mlx4_is_mfunc(dev->dev) && !mlx4_is_master(dev->dev)) + attr.port_cap_flags &= ~IB_PORT_SM; + cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) & ~props->clr_port_cap_mask; @@ -467,8 +668,14 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, return ERR_PTR(-EAGAIN); resp.qp_tab_size = dev->dev->caps.num_qps; - resp.bf_reg_size = dev->dev->caps.bf_reg_size; - resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; + + if (mlx4_wc_enabled()) { + resp.bf_reg_size = dev->dev->caps.bf_reg_size; + resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; + } else { + resp.bf_reg_size = 0; + resp.bf_regs_per_page = 0; + } context = kmalloc(sizeof *context, GFP_KERNEL); if (!context) @@ -518,7 +725,7 @@ static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; } else if (vma->vm_pgoff == 1 && dev->dev->caps.bf_reg_size != 0) { - vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + vma->vm_page_prot = pgprot_wc(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, to_mucontext(context)->uar.pfn + @@ -538,7 +745,7 @@ static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev, struct mlx4_ib_pd *pd; int err; - pd = kmalloc(sizeof *pd, GFP_KERNEL); + pd = kzalloc(sizeof *pd, GFP_KERNEL); if (!pd) return ERR_PTR(-ENOMEM); @@ -558,19 +765,91 @@ static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev, return &pd->ibpd; } +static struct ib_shpd *mlx4_ib_alloc_shpd(struct ib_device *ibdev, + struct ib_pd *pd) +{ + struct mlx4_ib_shpd *shpd; + + shpd = kzalloc(sizeof *shpd, GFP_KERNEL); + if (!shpd) + return ERR_PTR(-ENOMEM); + + shpd->pdn = to_mpd(pd)->pdn; + + return &shpd->ibshpd; +} + +static struct ib_pd *mlx4_ib_share_pd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata, struct ib_shpd *shpd) +{ + struct mlx4_ib_pd *pd; + + pd = kzalloc(sizeof *pd, GFP_KERNEL); + if (!pd) + return ERR_PTR(-ENOMEM); + + pd->pdn = to_mshpd(shpd)->pdn; + + if (context) + if (ib_copy_to_udata(udata, &pd->pdn, sizeof(__u32))) { + kfree(pd); + return ERR_PTR(-EFAULT); + } + + return &pd->ibpd; +} + +static int mlx4_ib_remove_shpd(struct 
ib_device *ibdev, + struct ib_shpd *shpd, int atinit) +{ + + /* + * if remove shpd is called during shpd creation time itself, then + * pd should not be freed from device. it will be freed when deall_pd + * is called + */ + if (!atinit) + mlx4_pd_free(to_mdev(ibdev)->dev, to_mshpd(shpd)->pdn); + kfree(shpd); + + return 0; +} + static int mlx4_ib_dealloc_pd(struct ib_pd *pd) { - mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn); - kfree(pd); + struct ib_shpd *shpd = pd->shpd; + if (shpd) { + /* if pd is shared, pd number will be freed by remove_shpd call */ + kfree(pd); + } else { + mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn); + kfree(pd); + } return 0; } +/* Return value: + on Success: virt2phys_gids are of type u16. + on Failure: GID_INDEX_INVALID */ +u16 mlx4_ib_get_virt2phys_gid(struct mlx4_ib_dev *dev, u8 port, u8 gid_index) +{ + if (port < 1 || port > MLX4_MAX_PORTS || gid_index > MLX4_MAX_NUM_GIDS) { + mlx4_ib_warn(&dev->ib_dev, "Invalid parameter sent to function." + " port %d, gid_index: %d\n", + port, gid_index); + return GID_INDEX_INVALID; + } + + return be16_to_cpu(dev->virt2phys_gids[port][gid_index]); +} + static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid) { struct mlx4_ib_qp *mqp = to_mqp(ibqp); struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); - struct mlx4_ib_gid_entry *ge; + struct gid_entry *ge; ge = kzalloc(sizeof *ge, GFP_KERNEL); if (!ge) @@ -604,7 +883,6 @@ int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, if (ndev) dev_hold(ndev); spin_unlock(&mdev->iboe.lock); - if (ndev) { rdma_get_mcast_mac((struct in6_addr *)gid, mac); rtnl_lock(); @@ -623,8 +901,8 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); - err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, - !!(mqp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), + err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, !!(mqp->flags & + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), MLX4_PROT_IB_IPV6); if (err) return err; @@ -640,11 +918,11 @@ err_add: return err; } -static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw) +struct gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw) { - struct mlx4_ib_gid_entry *ge; - struct mlx4_ib_gid_entry *tmp; - struct mlx4_ib_gid_entry *ret = NULL; + struct gid_entry *ge; + struct gid_entry *tmp; + struct gid_entry *ret = NULL; list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { if (!memcmp(raw, ge->gid.raw, 16)) { @@ -663,10 +941,11 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) struct mlx4_ib_qp *mqp = to_mqp(ibqp); u8 mac[6]; struct net_device *ndev; - struct mlx4_ib_gid_entry *ge; + struct gid_entry *ge; err = mlx4_multicast_detach(mdev->dev, - &mqp->mqp, gid->raw, MLX4_PROT_IB_IPV6); + &mqp->mqp, gid->raw, + MLX4_PROT_IB_IPV6); if (err) return err; @@ -695,6 +974,80 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) return 0; } +static void mlx4_dummy_comp_handler(struct ib_cq *cq, void *cq_context) +{ +} + +static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct mlx4_ib_xrcd *xrcd; + struct mlx4_ib_dev *mdev = to_mdev(ibdev); + struct ib_pd *pd; + struct ib_cq *cq; + int err; + + if (!(mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) + return ERR_PTR(-ENOSYS); + + xrcd = kmalloc(sizeof *xrcd, GFP_KERNEL); + if (!xrcd) + return 
ERR_PTR(-ENOMEM); + + err = mlx4_xrcd_alloc(mdev->dev, &xrcd->xrcdn); + if (err) + goto err_xrcd; + + pd = mlx4_ib_alloc_pd(ibdev, NULL, NULL); + if (IS_ERR(pd)) { + err = PTR_ERR(pd); + goto err_pd; + } + pd->device = ibdev; + + cq = mlx4_ib_create_cq(ibdev, 1, 0, NULL, NULL); + if (IS_ERR(cq)) { + err = PTR_ERR(cq); + goto err_cq; + } + cq->device = ibdev; + cq->comp_handler = mlx4_dummy_comp_handler; + + if (context) + if (ib_copy_to_udata(udata, &xrcd->xrcdn, sizeof(__u32))) { + err = -EFAULT; + goto err_copy; + } + + xrcd->cq = cq; + xrcd->pd = pd; + return &xrcd->ibxrcd; + +err_copy: + mlx4_ib_destroy_cq(cq); +err_cq: + mlx4_ib_dealloc_pd(pd); +err_pd: + mlx4_xrcd_free(mdev->dev, xrcd->xrcdn); +err_xrcd: + kfree(xrcd); + return ERR_PTR(err); +} + +static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd) +{ + struct mlx4_ib_xrcd *mxrcd = to_mxrcd(xrcd); + + mlx4_ib_destroy_cq(mxrcd->cq); + mlx4_ib_dealloc_pd(mxrcd->pd); + mlx4_xrcd_free(to_mdev(xrcd->device)->dev, to_mxrcd(xrcd)->xrcdn); + kfree(xrcd); + + return 0; +} + + static int init_node_data(struct mlx4_ib_dev *dev) { struct ib_smp *in_mad = NULL; @@ -721,6 +1074,7 @@ static int init_node_data(struct mlx4_ib_dev *dev) if (err) goto out; + dev->dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); out: @@ -776,11 +1130,127 @@ static struct device_attribute *mlx4_class_attributes[] = { &dev_attr_board_id }; -static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev) +/* + * create show function and a device_attribute struct pointing to + * the function for _name + */ +#define DEVICE_DIAG_RPRT_ATTR(_name, _offset, _op_mod) \ +static ssize_t show_rprt_##_name(struct device *dev, \ + struct device_attribute *attr, \ + char *buf){ \ + return show_diag_rprt(dev, buf, _offset, _op_mod); \ +} \ +static DEVICE_ATTR(_name, S_IRUGO, show_rprt_##_name, NULL); + +#define MLX4_DIAG_RPRT_CLEAR_DIAGS 3 + +static size_t show_diag_rprt(struct device *device, char *buf, + u32 offset, u8 op_modifier) +{ + size_t ret; + u32 counter_offset = offset; + u32 diag_counter = 0; + struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, + ib_dev.dev); + + ret = mlx4_query_diag_counters(dev->dev, 1, op_modifier, + &counter_offset, &diag_counter); + if (ret) + return ret; + + return sprintf(buf,"%d\n", diag_counter); +} + +static ssize_t clear_diag_counters(struct device *device, + struct device_attribute *attr, + const char *buf, size_t length) +{ + size_t ret; + struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, + ib_dev.dev); + + ret = mlx4_query_diag_counters(dev->dev, 0, MLX4_DIAG_RPRT_CLEAR_DIAGS, + NULL, NULL); + if (ret) + return ret; + + return length; +} + +DEVICE_DIAG_RPRT_ATTR(rq_num_lle , 0x00, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_lle , 0x04, 2); +DEVICE_DIAG_RPRT_ATTR(rq_num_lqpoe , 0x08, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_lqpoe , 0x0C, 2); +DEVICE_DIAG_RPRT_ATTR(rq_num_lpe , 0x18, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_lpe , 0x1C, 2); +DEVICE_DIAG_RPRT_ATTR(rq_num_wrfe , 0x20, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_wrfe , 0x24, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_mwbe , 0x2C, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_bre , 0x34, 2); +DEVICE_DIAG_RPRT_ATTR(rq_num_lae , 0x38, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_rire , 0x44, 2); +DEVICE_DIAG_RPRT_ATTR(rq_num_rire , 0x48, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_rae , 0x4C, 2); +DEVICE_DIAG_RPRT_ATTR(rq_num_rae , 0x50, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_roe , 0x54, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_tree , 0x5C, 2); 
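/*
 * Editor's note -- illustrative sketch only, not part of the patch.
 * DEVICE_DIAG_RPRT_ATTR() above stamps out one sysfs "show" routine per
 * diagnostic counter, each bound to a fixed byte offset in the query
 * output and a fixed op_modifier.  The userspace sketch below shows the
 * same stamp-out-a-reader-per-offset pattern with an ordinary macro; the
 * two counter names and offsets reuse entries from the list above purely
 * as examples, and the buffer stands in for the command mailbox.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint8_t fake_diag_buf[0x200];	/* stands in for the mailbox output */

static uint32_t read_diag(uint32_t offset)
{
	uint32_t v;

	memcpy(&v, fake_diag_buf + offset, sizeof(v));
	return v;
}

#define DEMO_DIAG_ATTR(_name, _offset)				\
static uint32_t show_##_name(void)				\
{								\
	return read_diag(_offset);				\
}

DEMO_DIAG_ATTR(rq_num_lle, 0x00)
DEMO_DIAG_ATTR(sq_num_lle, 0x04)

int main(void)
{
	uint32_t one = 1, two = 2;

	memcpy(fake_diag_buf + 0x00, &one, sizeof(one));
	memcpy(fake_diag_buf + 0x04, &two, sizeof(two));
	printf("rq_num_lle=%u sq_num_lle=%u\n",
	       show_rq_num_lle(), show_sq_num_lle());
	return 0;
}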
+DEVICE_DIAG_RPRT_ATTR(sq_num_rree , 0x64, 2); +DEVICE_DIAG_RPRT_ATTR(rq_num_rnr , 0x68, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_rnr , 0x6C, 2); +DEVICE_DIAG_RPRT_ATTR(rq_num_oos , 0x100, 2); +DEVICE_DIAG_RPRT_ATTR(sq_num_oos , 0x104, 2); +DEVICE_DIAG_RPRT_ATTR(rq_num_mce , 0x108, 2); +DEVICE_DIAG_RPRT_ATTR(rq_num_udsdprd , 0x118, 2); +DEVICE_DIAG_RPRT_ATTR(rq_num_ucsdprd , 0x120, 2); +DEVICE_DIAG_RPRT_ATTR(num_cqovf , 0x1A0, 2); +DEVICE_DIAG_RPRT_ATTR(num_eqovf , 0x1A4, 2); +DEVICE_DIAG_RPRT_ATTR(num_baddb , 0x1A8, 2); + +static DEVICE_ATTR(clear_diag, S_IWUSR, NULL, clear_diag_counters); + +static struct attribute *diag_rprt_attrs[] = { + &dev_attr_rq_num_lle.attr, + &dev_attr_sq_num_lle.attr, + &dev_attr_rq_num_lqpoe.attr, + &dev_attr_sq_num_lqpoe.attr, + &dev_attr_rq_num_lpe.attr, + &dev_attr_sq_num_lpe.attr, + &dev_attr_rq_num_wrfe.attr, + &dev_attr_sq_num_wrfe.attr, + &dev_attr_sq_num_mwbe.attr, + &dev_attr_sq_num_bre.attr, + &dev_attr_rq_num_lae.attr, + &dev_attr_sq_num_rire.attr, + &dev_attr_rq_num_rire.attr, + &dev_attr_sq_num_rae.attr, + &dev_attr_rq_num_rae.attr, + &dev_attr_sq_num_roe.attr, + &dev_attr_sq_num_tree.attr, + &dev_attr_sq_num_rree.attr, + &dev_attr_rq_num_rnr.attr, + &dev_attr_sq_num_rnr.attr, + &dev_attr_rq_num_oos.attr, + &dev_attr_sq_num_oos.attr, + &dev_attr_rq_num_mce.attr, + &dev_attr_rq_num_udsdprd.attr, + &dev_attr_rq_num_ucsdprd.attr, + &dev_attr_num_cqovf.attr, + &dev_attr_num_eqovf.attr, + &dev_attr_num_baddb.attr, + &dev_attr_clear_diag.attr, + NULL +}; + +static struct attribute_group diag_counters_group = { + .name = "diag_counters", + .attrs = diag_rprt_attrs +}; + +static void mlx4_addrconf_ifid_eui48(u8 *eui, int is_vlan, u16 vlan_id, struct net_device *dev) { memcpy(eui, dev->dev_addr, 3); memcpy(eui + 5, dev->dev_addr + 3, 3); - if (vlan_id < 0x1000) { + if (is_vlan) { eui[3] = vlan_id >> 8; eui[4] = vlan_id & 0xff; } else { @@ -809,7 +1279,7 @@ static void update_gids_task(struct work_struct *work) memcpy(gids, gw->gids, sizeof gw->gids); err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port, - 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B); + 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, 0); if (err) printk(KERN_WARNING "set port command failed\n"); else { @@ -836,6 +1306,7 @@ static int update_ipv6_gids(struct mlx4_ib_dev *dev, int port, int clear) int free; int found; int need_update = 0; + int is_vlan; u16 vid; work = kzalloc(sizeof *work, GFP_ATOMIC); @@ -848,12 +1319,23 @@ static int update_ipv6_gids(struct mlx4_ib_dev *dev, int port, int clear) goto out; } - rcu_read_lock(); - for_each_netdev_rcu(&init_net, tmp) { - if (ndev && (tmp == ndev || rdma_vlan_dev_real_dev(tmp) == ndev)) { + read_lock(&dev_base_lock); + for_each_netdev(&init_net, tmp) { + if (ndev && (tmp == ndev +#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) + || vlan_dev_real_dev(tmp) == ndev)) { +#else + )) { +#endif gid.global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); - vid = rdma_vlan_dev_vlan_id(tmp); - mlx4_addrconf_ifid_eui48(&gid.raw[8], vid, ndev); +#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) + vid = vlan_dev_vlan_id(tmp); + is_vlan = tmp->priv_flags & IFF_802_1Q_VLAN; +#else + vid = 0; + is_vlan = 0; +#endif + mlx4_addrconf_ifid_eui48(&gid.raw[8], is_vlan, vid, ndev); found = 0; free = -1; for (i = 0; i < 128; ++i) { @@ -868,11 +1350,7 @@ static int update_ipv6_gids(struct mlx4_ib_dev *dev, int port, int clear) } if (!found) { - if (tmp == ndev && - (memcmp(&dev->iboe.gid_table[port - 1][0], - &gid, 
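/*
 * Editor's note -- illustrative sketch only, not part of the patch.
 * mlx4_addrconf_ifid_eui48() above builds the low 8 bytes of an IBoE GID
 * from the netdev MAC: the first and last three MAC bytes land in
 * eui[0..2] and eui[5..7], and for VLAN devices the VLAN id fills
 * eui[3..4].  The non-VLAN filler bytes and any universal/local bit flip
 * are outside the visible hunk, so the 0xff/0xfe filler and the
 * eui[0] ^= 2 step below follow the usual EUI-48 -> modified EUI-64
 * convention and should be read as assumptions, not as the patch's code.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void demo_ifid_eui48(uint8_t eui[8], const uint8_t mac[6],
			    int is_vlan, uint16_t vlan_id)
{
	memcpy(eui, mac, 3);
	memcpy(eui + 5, mac + 3, 3);
	if (is_vlan) {
		eui[3] = vlan_id >> 8;
		eui[4] = vlan_id & 0xff;
	} else {
		eui[3] = 0xff;		/* assumed filler */
		eui[4] = 0xfe;
	}
	eui[0] ^= 2;			/* assumed universal/local bit flip */
}

int main(void)
{
	const uint8_t mac[6] = { 0x00, 0x02, 0xc9, 0x11, 0x22, 0x33 };
	uint8_t eui[8];
	int i;

	demo_ifid_eui48(eui, mac, 1, 100);
	for (i = 0; i < 8; ++i)
		printf("%02x%s", eui[i], i == 7 ? "\n" : ":");
	return 0;
}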
sizeof gid) || - !memcmp(&dev->iboe.gid_table[port - 1][0], - &zgid, sizeof gid))) { + if (tmp == ndev && (memcmp(&dev->iboe.gid_table[port - 1][0], &gid, sizeof gid) || !memcmp(&dev->iboe.gid_table[port - 1][0], &zgid, sizeof gid))) { dev->iboe.gid_table[port - 1][0] = gid; ++need_update; hits[0] = 1; @@ -884,7 +1362,7 @@ static int update_ipv6_gids(struct mlx4_ib_dev *dev, int port, int clear) } } } - rcu_read_unlock(); + read_unlock(&dev_base_lock); for (i = 0; i < 128; ++i) if (!hits[i]) { @@ -952,8 +1430,7 @@ static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event spin_lock(&iboe->lock); mlx4_foreach_ib_transport_port(port, ibdev->dev) { oldnd = iboe->netdevs[port - 1]; - iboe->netdevs[port - 1] = - mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port); + iboe->netdevs[port - 1] = mlx4_get_prot_dev(ibdev->dev, MLX4_PROT_EN, port); if (oldnd != iboe->netdevs[port - 1]) { if (iboe->netdevs[port - 1]) netdev_added(ibdev, port); @@ -962,11 +1439,19 @@ static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event } } - if (dev == iboe->netdevs[0] || - (iboe->netdevs[0] && rdma_vlan_dev_real_dev(dev) == iboe->netdevs[0])) + if (dev == iboe->netdevs[0] +#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) + || vlan_dev_real_dev(dev) == iboe->netdevs[0]) +#else + ) +#endif handle_en_event(ibdev, 1, event); else if (dev == iboe->netdevs[1] - || (iboe->netdevs[1] && rdma_vlan_dev_real_dev(dev) == iboe->netdevs[1])) +#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) + || vlan_dev_real_dev(dev) == iboe->netdevs[1]) +#else + ) +#endif handle_en_event(ibdev, 2, event); spin_unlock(&iboe->lock); @@ -974,15 +1459,126 @@ static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event return NOTIFY_DONE; } +static int mlx4_GET_GID_MAP(struct mlx4_dev *dev, u8 port, __be16 *phys_gid_idx) +{ + struct mlx4_cmd_mailbox *mailbox; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(dev, 0, mailbox->dma, port, 0, MLX4_CMD_GET_GID_MAP, + MLX4_CMD_TIME_CLASS_A, 0); + if (!err) + memcpy(phys_gid_idx, mailbox->buf, sizeof(__be16) * MLX4_MAX_NUM_GIDS); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +static void guide_update(struct mlx4_ib_dev *dev, u8 port) +{ + struct ib_event ibev; + + ibev.event = IB_EVENT_GID_CHANGE; + ibev.device = &dev->ib_dev; + ibev.element.port_num = port; + ib_dispatch_event(&ibev); +} + +/* This function should only be called for multi-func slaves (domUs) */ +static int mlx4_get_gid_map(struct mlx4_ib_dev *dev) +{ + u8 i; + int err = 0; + + if (!mlx4_is_mfunc(dev->dev)) + return 0; + + for (i = 1; i <= dev->dev->caps.num_ports; i++) { + if (mlx4_GET_GID_MAP(dev->dev, i, dev->virt2phys_gids[i])) { + mlx4_ib_warn(&dev->ib_dev, "Failed to get GID map " + "for port %d\n", i); + err = 1; + } + + guide_update(dev, i); + } + + return err; +} + +int mlx4_request_pkey_table_update(struct mlx4_ib_dev *dev, u8 port) +{ + return mlx4_GET_PKEY_TABLE(dev->dev, port, + dev->pkeys.phys2virt_pkey[port - 1]); +} + +static void init_pkeys(struct mlx4_ib_dev *ibdev) +{ + int port; + int slave; + int i; + + if (ibdev->dev->caps.sqp_demux) { + for (slave = 0; slave <= ibdev->dev->sr_iov; ++slave) { + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { + for (i = 0; i < ibdev->dev->caps.pkey_table_max_len[port]; ++i) { + ibdev->pkeys.virt2phys_pkey[slave][port - 1][i] = + /* master has the identity virt2phys pkey mapping 
*/ + (slave == ibdev->dev->caps.function || !i) ? i : + ibdev->dev->caps.pkey_table_max_len[port] - 1; + mlx4_sync_pkey_table(ibdev->dev, slave, port, i, + ibdev->pkeys.virt2phys_pkey[slave][port - 1][i]); + } + } + } + /* initialize pkey cache */ + for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { + for (i = 0; i < ibdev->dev->caps.pkey_table_max_len[port]; ++i) + ibdev->pkeys.phys_pkey_cache[port-1][i] = (i) ? 0 : 0xFFFF; + } + } + + for (port = 1; port <= ibdev->num_ports; ++port) + if (mlx4_request_pkey_table_update(ibdev, port)) + printk(KERN_WARNING "pkey table update failed for slave %d, port %d\n", + ibdev->dev->caps.function, port); +} + +static int clear_counter_set(struct mlx4_dev *dev, int index) +{ + struct mlx4_cmd_mailbox *mailbox; + int err; + u32 inmod = index | (1 << 31); + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(dev, 0, mailbox->dma, inmod, 0, + MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C, 0); + + mlx4_free_cmd_mailbox(dev, mailbox); + + return err; +} + static void *mlx4_ib_add(struct mlx4_dev *dev) { + static int mlx4_ib_version_printed; struct mlx4_ib_dev *ibdev; int num_ports = 0; int i; int err; struct mlx4_ib_iboe *iboe; + int k; - printk_once(KERN_INFO "%s", mlx4_ib_version); + if (!mlx4_ib_version_printed) { + printk(KERN_INFO "%s", mlx4_ib_version); + ++mlx4_ib_version_printed; + } mlx4_foreach_ib_transport_port(i, dev) num_ports++; @@ -1005,9 +1601,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) if (mlx4_uar_alloc(dev, &ibdev->priv_uar)) goto err_pd; - ibdev->uar_map = ioremap((phys_addr_t) ibdev->priv_uar.pfn << PAGE_SHIFT, - PAGE_SIZE); - if (!ibdev->uar_map) + ibdev->priv_uar.map = ioremap(ibdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE); + if (!ibdev->priv_uar.map) goto err_uar; MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock); @@ -1044,7 +1639,13 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | - (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ); + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_GET_ETH_L2_ADDR) | + (1ull << IB_USER_VERBS_CMD_ALLOC_SHPD) | + (1ull << IB_USER_VERBS_CMD_SHARE_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR_RELAXED) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR_RELAXED) | + (1ull << IB_USER_VERBS_CMD_FLUSH_RELAXED_MR); ibdev->ib_dev.query_device = mlx4_ib_query_device; ibdev->ib_dev.query_port = mlx4_ib_query_port; @@ -1092,35 +1693,82 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr; ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr; ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc; + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) { + ibdev->ib_dev.create_xrc_srq = mlx4_ib_create_xrc_srq; + ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd; + ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd; + ibdev->ib_dev.create_xrc_rcv_qp = mlx4_ib_create_xrc_rcv_qp; + ibdev->ib_dev.modify_xrc_rcv_qp = mlx4_ib_modify_xrc_rcv_qp; + ibdev->ib_dev.query_xrc_rcv_qp = mlx4_ib_query_xrc_rcv_qp; + ibdev->ib_dev.reg_xrc_rcv_qp = mlx4_ib_reg_xrc_rcv_qp; + ibdev->ib_dev.unreg_xrc_rcv_qp = mlx4_ib_unreg_xrc_rcv_qp; + ibdev->ib_dev.uverbs_cmd_mask |= + (1ull << IB_USER_VERBS_CMD_CREATE_XRC_SRQ) | + (1ull << IB_USER_VERBS_CMD_OPEN_XRC_DOMAIN) | + (1ull << IB_USER_VERBS_CMD_CLOSE_XRC_DOMAIN) | + (1ull << IB_USER_VERBS_CMD_CREATE_XRC_RCV_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_XRC_RCV_QP) | + (1ull << 
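/*
 * Editor's note -- illustrative sketch only, not part of the patch.
 * init_pkeys() above gives the master function an identity virt->phys
 * P_Key mapping, while each slave sees only virtual index 0 mapped to
 * physical index 0 and every other virtual index pointed at the last
 * table entry.  The snippet below reproduces just that mapping rule for
 * one port; the table length is a made-up stand-in for
 * pkey_table_max_len.
 */
#include <stdio.h>

#define DEMO_PKEY_TBL_LEN 8	/* hypothetical pkey_table_max_len */

static int virt2phys_pkey(int slave, int master, int virt_idx)
{
	/* master keeps the identity mapping; slaves keep only index 0 */
	if (slave == master || virt_idx == 0)
		return virt_idx;
	return DEMO_PKEY_TBL_LEN - 1;
}

int main(void)
{
	int i;

	for (i = 0; i < DEMO_PKEY_TBL_LEN; ++i)
		printf("slave 2, virt %d -> phys %d\n",
		       i, virt2phys_pkey(2, 0, i));
	return 0;
}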
IB_USER_VERBS_CMD_QUERY_XRC_RCV_QP) | + (1ull << IB_USER_VERBS_CMD_REG_XRC_RCV_QP) | + (1ull << IB_USER_VERBS_CMD_UNREG_XRC_RCV_QP); + } - spin_lock_init(&iboe->lock); + ibdev->ib_dev.get_eth_l2_addr = mlx4_ib_get_eth_l2_addr; + ibdev->ib_dev.alloc_shpd = mlx4_ib_alloc_shpd; + ibdev->ib_dev.share_pd = mlx4_ib_share_pd; + ibdev->ib_dev.remove_shpd = mlx4_ib_remove_shpd; + ibdev->ib_dev.set_fmr_pd = mlx4_ib_set_fmr_pd; + spin_lock_init(&iboe->lock); if (init_node_data(ibdev)) goto err_map; + for (k = 0; k < ibdev->num_ports; ++k) { + err = mlx4_counter_alloc(ibdev->dev, &ibdev->counters[k]); + if (err) + ibdev->counters[k] = -1; + else if (clear_counter_set(dev, ibdev->counters[k])) + printk(KERN_WARNING "failed to clear counters set %d\n", + ibdev->counters[k]); + } + spin_lock_init(&ibdev->sm_lock); mutex_init(&ibdev->cap_mask_mutex); + mutex_init(&ibdev->xrc_reg_mutex); - if (ib_register_device(&ibdev->ib_dev, NULL)) - goto err_map; + if (ib_register_device(&ibdev->ib_dev)) + goto err_counter; if (mlx4_ib_mad_init(ibdev)) goto err_reg; + if (mlx4_is_mfunc(ibdev->dev)) { + if (mlx4_ib_init_sriov(ibdev)) + goto err_mad; + + if (mlx4_get_gid_map(ibdev)) + goto err_mad; + } + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE && !iboe->nb.notifier_call) { iboe->nb.notifier_call = mlx4_ib_netdev_event; err = register_netdevice_notifier(&iboe->nb); if (err) - goto err_reg; + goto err_sriov; } - for (i = 0; i < ARRAY_SIZE(mlx4_class_attributes); ++i) { if (device_create_file(&ibdev->ib_dev.dev, mlx4_class_attributes[i])) goto err_notif; } - ibdev->ib_active = true; + if(sysfs_create_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group)) + goto err_notif; + + ibdev->ib_active = 1; + + if (mlx4_is_mfunc(ibdev->dev)) + init_pkeys(ibdev); return ibdev; @@ -1129,11 +1777,22 @@ err_notif: printk(KERN_WARNING "failure unregistering notifier\n"); flush_workqueue(wq); +err_sriov: + mlx4_ib_close_sriov(ibdev); + +err_mad: + mlx4_ib_mad_cleanup(ibdev); + err_reg: ib_unregister_device(&ibdev->ib_dev); +err_counter: + for (; k; --k) + if (ibdev->counters[k - 1] >= 0) + mlx4_counter_free(ibdev->dev, ibdev->counters[k - 1]); + err_map: - iounmap(ibdev->uar_map); + iounmap(ibdev->priv_uar.map); err_uar: mlx4_uar_free(dev, &ibdev->priv_uar); @@ -1151,15 +1810,23 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) { struct mlx4_ib_dev *ibdev = ibdev_ptr; int p; + int k; + sysfs_remove_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group); + + mlx4_ib_close_sriov(ibdev); mlx4_ib_mad_cleanup(ibdev); ib_unregister_device(&ibdev->ib_dev); + for (k = 0; k < ibdev->num_ports; ++k) + if (ibdev->counters[k] >= 0) + mlx4_counter_free(ibdev->dev, ibdev->counters[k]); + if (ibdev->iboe.nb.notifier_call) { - if (unregister_netdevice_notifier(&ibdev->iboe.nb)) - printk(KERN_WARNING "failure unregistering notifier\n"); + unregister_netdevice_notifier(&ibdev->iboe.nb); + flush_workqueue(wq); ibdev->iboe.nb.notifier_call = NULL; } - iounmap(ibdev->uar_map); + iounmap(ibdev->priv_uar.map); mlx4_foreach_port(p, dev, MLX4_PORT_TYPE_IB) mlx4_CLOSE_PORT(dev, p); @@ -1169,29 +1836,193 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) ib_dealloc_device(&ibdev->ib_dev); } -static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, - enum mlx4_dev_event event, int port) +struct pkey_work { + struct work_struct work; + struct mlx4_ib_dev *dev; + u8 port; +}; + +static void pkey_update(struct work_struct *work) { + struct pkey_work *pw = container_of(work, struct pkey_work, work); struct ib_event ibev; - 
struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr); - if (port > ibdev->num_ports) + printk("%s: port %d\n", __func__, pw->port); + + if (mlx4_request_pkey_table_update(pw->dev, pw->port)) + printk(KERN_ERR "update pkey phys to virt failed\n"); + + ibev.event = IB_EVENT_PKEY_CHANGE; + ibev.device = &pw->dev->ib_dev; + ibev.element.port_num = pw->port; + ib_dispatch_event(&ibev); + kfree(pw); + +} + +struct guid_check_work { + struct delayed_work work; + struct mlx4_ib_dev *dev; + u8 port; +}; + +static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init) +{ + struct mlx4_ib_demux_work **dm = NULL; + struct mlx4_dev *dev = ibdev->dev; + int i; + unsigned long flags; + + if (!dev->caps.sqp_demux) return; + dm = kzalloc(sizeof(*dm) * dev->caps.num_ports, GFP_ATOMIC); + if (!dm) { + printk(KERN_ERR "failed to allocate memory for tunneling qp update\n"); + goto out; + } + + for (i = 0; i < dev->caps.num_ports; i++) { + dm[i] = kmalloc(sizeof(struct mlx4_ib_demux_work), GFP_ATOMIC); + if (!dm[i]) { + printk(KERN_ERR "failed to allocate memory for tunneling qp update work struct\n"); + for (i = 0; i < dev->caps.num_ports; i++) { + if (dm[i]) + kfree(dm[i]); + } + goto out; + } + } + /* initialize or tear down tunnel QPs for the slave */ + for (i = 0; i < dev->caps.num_ports; i++) { + INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work); + dm[i]->port = i+1; + dm[i]->slave = slave; + dm[i]->do_init = do_init; + dm[i]->dev = ibdev; + spin_lock_irqsave(&ibdev->sriov.going_down_lock, flags); + if (!ibdev->sriov.is_going_down) + queue_work(ibdev->sriov.demux[i].ud_wq, &dm[i]->work); + spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags); + } +out: + if (dm) + kfree(dm); + return; +} + +void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl) +{ + struct ib_ah *new_ah; + struct ib_ah_attr ah_attr; + + if (!dev->send_agent[port_num - 1][0]) + return; + + memset(&ah_attr, 0, sizeof ah_attr); + ah_attr.dlid = lid; + ah_attr.sl = sl; + ah_attr.port_num = port_num; + + new_ah = ib_create_ah(dev->send_agent[port_num - 1][0]->qp->pd, + &ah_attr); + if (IS_ERR(new_ah)) + return; + + spin_lock(&dev->sm_lock); + if (dev->sm_ah[port_num - 1]) + ib_destroy_ah(dev->sm_ah[port_num - 1]); + dev->sm_ah[port_num - 1] = new_ah; + spin_unlock(&dev->sm_lock); +} + +static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, + enum mlx4_dev_event event, unsigned long param) +{ + struct ib_event ibev; + struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr); + struct pkey_work *pw; + u8 port = 0; + struct mlx4_ib_eqe *eqe = NULL; + struct ib_event_work *ew; + + if (event == MLX4_DEV_EVENT_PORT_MGMT_CHANGE) + eqe = (struct mlx4_ib_eqe *)param; + else + port = (u8)param; + switch (event) { case MLX4_DEV_EVENT_PORT_UP: + if (port > ibdev->num_ports) + return; + if (dev->caps.sqp_demux) { + invalidate_all_guid_record(ibdev, port); + } + printk(KERN_ERR "mlx4_ib_event MLX4_DEV_EVENT_PORT_UP (port:%d)\n", port); ibev.event = IB_EVENT_PORT_ACTIVE; break; case MLX4_DEV_EVENT_PORT_DOWN: + printk(KERN_ERR "mlx4_ib_event MLX4_DEV_EVENT_PORT_DOWN (port:%d)\n", port); + if (port > ibdev->num_ports) + return; ibev.event = IB_EVENT_PORT_ERR; break; case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: - ibdev->ib_active = false; + ibdev->ib_active = 0; + port = 0; ibev.event = IB_EVENT_DEVICE_FATAL; break; + case MLX4_DEV_EVENT_PKEY_UPDATE: + if (port > ibdev->num_ports) + return; + pw = kmalloc(sizeof *pw, GFP_ATOMIC); + if (pw) { + INIT_WORK(&pw->work, pkey_update); + pw->port 
= port; + pw->dev = ibdev; + queue_work(wq, &pw->work); + } else + printk(KERN_ERR "failed to allocate memory for pkey update\n"); + + return; + case MLX4_DEV_EVENT_SLAVE_INIT: + do_slave_init(ibdev, port, 1); + return; + case MLX4_DEV_EVENT_SLAVE_SHUTDOWN: + do_slave_init(ibdev, port, 0); + return; + + case MLX4_DEV_EVENT_GUID_CHANGE: + ibev.event = IB_EVENT_GID_CHANGE; + break; + + case MLX4_DEV_EVENT_LID_CHANGE: + ibev.event = IB_EVENT_LID_CHANGE; + break; + + case MLX4_DEV_EVENT_CLIENT_REREGISTER: + ibev.event = IB_EVENT_CLIENT_REREGISTER; + break; + /* This event will only be received in non-mfunc driver + or for the master, in mfunc driver */ + case MLX4_DEV_EVENT_PORT_MGMT_CHANGE: + ew = kmalloc(sizeof *ew, GFP_ATOMIC); + if (!ew) { + printk(KERN_ERR "failed to allocate memory for " + "events work\n"); + break; + } + + INIT_WORK(&ew->work, handle_port_mgmt_change_event); + memcpy(&ew->ib_eqe, eqe, sizeof *eqe); + ew->ib_dev = ibdev; + + queue_work(wq, &ew->work); + + return; default: return; } @@ -1203,10 +2034,11 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, } static struct mlx4_interface mlx4_ib_interface = { - .add = mlx4_ib_add, - .remove = mlx4_ib_remove, - .event = mlx4_ib_event, - .protocol = MLX4_PROT_IB_IPV6 + .add = mlx4_ib_add, + .remove = mlx4_ib_remove, + .event = mlx4_ib_event, + .get_prot_dev = get_ibdev, + .protocol = MLX4_PROT_IB, }; static int __init mlx4_ib_init(void) @@ -1217,18 +2049,28 @@ static int __init mlx4_ib_init(void) if (!wq) return -ENOMEM; + err = mlx4_ib_mcg_init(); + if (err) + goto clean_wq; + err = mlx4_register_interface(&mlx4_ib_interface); - if (err) { - destroy_workqueue(wq); - return err; - } + if (err) + goto clean_mcg; return 0; + +clean_mcg: + mlx4_ib_mcg_destroy(); + +clean_wq: + destroy_workqueue(wq); + return err; } static void __exit mlx4_ib_cleanup(void) { mlx4_unregister_interface(&mlx4_ib_interface); + mlx4_ib_mcg_destroy(); destroy_workqueue(wq); } diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c new file mode 100644 index 0000000000000..03ca1102ea48a --- /dev/null +++ b/drivers/infiniband/hw/mlx4/mcg.c @@ -0,0 +1,1504 @@ +/* + * Copyright (c) 2010 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "mlx4_ib.h" + +#define MAX_VFS 80 +#define MAX_PEND_REQS_PER_FUNC 4 +#define MAD_TIMEOUT_MS 2000 + +#define mcg_warn(fmt, arg...) printk("MCG WARNING: " fmt, ##arg ) +#define mcg_error(fmt, arg...) printk(KERN_ERR fmt, ##arg ) +#define mcg_warn_group(group, format, arg...) \ + printk("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\ + (group)->name, group->demux->port, ## arg) + +#define mcg_error_group(group, format, arg...) \ + printk(KERN_ERR " %16s: " format, (group)->name, ## arg) + + +#define DEBUG_MCG + +static union ib_gid mgid0; + +static struct workqueue_struct *clean_wq; + +enum mcast_state { + MCAST_NOT_MEMBER = 0, + MCAST_MEMBER, +}; + +enum mcast_group_state { + MCAST_IDLE, + MCAST_JOIN_SENT, + MCAST_LEAVE_SENT, + MCAST_RESP_READY + //MCAST_GROUP_ERROR, +}; + +struct mcast_member { + enum mcast_state state; + uint8_t join_state; + int num_pend_reqs; + struct list_head pending; +}; + +struct ib_sa_mcmember_data { + union ib_gid mgid; + union ib_gid port_gid; + __be32 qkey; + __be16 mlid; + u8 mtusel_mtu; + u8 tclass; + __be16 pkey; + u8 ratesel_rate; + u8 lifetmsel_lifetm; + __be32 sl_flowlabel_hoplimit; + u8 scope_join_state; + u8 proxy_join; + u8 reserved[2]; +}; + +struct mcast_group { + struct ib_sa_mcmember_data rec; + struct rb_node node; + struct list_head mgid0_list; + struct mlx4_ib_demux_ctx *demux; + struct mcast_member func[MAX_VFS]; + struct mutex lock; + struct work_struct work; + struct list_head pending_list; + int members[3]; + enum mcast_group_state state; + enum mcast_group_state prev_state; + struct ib_sa_mad response_sa_mad; + __be64 last_req_tid; + + char name[33]; /* MGID string */ + struct device_attribute dentry; + + /* refcount is the reference count for the following: + 1. Each queued request + 2. Each invocation of the worker thread + 3. 
Membership of the port at the SA + */ + atomic_t refcount; + + /* delayed work to clean pending SM request */ + struct delayed_work timeout_work; + struct list_head cleanup_list; +}; + +struct mcast_req { + int func; + struct ib_sa_mad sa_mad; + struct list_head group_list; + struct list_head func_list; + struct mcast_group *group; + int clean; +}; + + +#define safe_atomic_dec(ref) \ + do \ + if (atomic_dec_and_test(ref)) \ + mcg_warn_group(group, "did not expect to reach zero\n"); \ + while(0) + +#ifdef DEBUG_MCG + +char *ib_mgmt_method_name[] = { + "", // 0x0 + "IB_MGMT_METHOD_GET", // 0x01 + "IB_MGMT_METHOD_SET", // 0x02 + "IB_MGMT_METHOD_SEND", // 0x03 + "", // 0x04 + "IB_MGMT_METHOD_TRAP", // 0x05 + "IB_MGMT_METHOD_REPORT", // 0x06 + "IB_MGMT_METHOD_TRAP_REPRESS", // 0x07 + "", "", "", "", "", "", "", "", "", "", //0x8-0x11 + "IB_SA_METHOD_GET_TABLE", // 0x12 + "IB_SA_METHOD_GET_TRACE_TBL", // 0x13 + "IB_SA_METHOD_GET_MULTI", // 0x14 + "IB_SA_METHOD_DELETE", // 0x15 +}; + +char *ib_mgmt_method_resp_name[] = { + "IB_MGMT_METHOD_RESP", // 0x80 + "IB_MGMT_METHOD_GET_RESP", // 0x81 + "", "", "", "", // 0x82-0x85 + "IB_MGMT_METHOD_REPORT_RESP", // 0x86 + "", "", "", "", "", "", "", "", "", "", "", // 0x87-0x91 + "IB_SA_METHOD_GET_TABLE_RESP", // 0x92 + "", // 0x93 + "IB_SA_METHOD_GET_MULTI_RESP", // 0x94 + "IB_SA_METHOD_DELETE_RESP", // 0x95 +}; + +#define dump_groups(ctx) do { \ + struct mcast_group *tg; \ + struct rb_node *p; \ + mcg_debug("dumping\n"); \ + for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) { \ + tg = rb_entry(p, struct mcast_group, node); \ + mcg_debug_group(tg, "available\n"); \ + } \ + } while (0) + +char *debug_mcg_method_name(uint8_t method) +{ + if (method < ARRAY_SIZE(ib_mgmt_method_name)) + return ib_mgmt_method_name[method]; + else if ((method >= 0x80) && (method < 0x80 + ARRAY_SIZE(ib_mgmt_method_resp_name))) + return ib_mgmt_method_resp_name[method - 0x80]; + else + return "unknown"; +} + +static const char *get_state_string(enum mcast_group_state state) +{ + switch(state) { + case MCAST_IDLE: + return "MCAST_IDLE"; + case MCAST_JOIN_SENT: + return "MCAST_JOIN_SENT"; + case MCAST_LEAVE_SENT: + return "MCAST_LEAVE_SENT"; + case MCAST_RESP_READY: + return "MCAST_RESP_READY"; + } + return "Invalid State"; +} + +static inline void hexdump(char *p, int size) { + int i; + for (i = 0; i < size; ++i) { + if (i % 16 == 0) + printk("\n%06x: ", i); + if (i % 8 == 0) + printk(" "); + printk("%02x ", (unsigned char)*(p+i)); + } + printk("\n\n"); +} + +#define mcg_debug(fmt, arg...) if (mlx4_ib_debug_level & 2)\ + printk("%s-%d:" fmt, __func__, __LINE__, ##arg) +#define mcg_debug_group(group, format, arg...) \ + if (mlx4_ib_debug_level & 2) \ + printk("%s-%d: %16s (refcount %d) (port %d): " format, __func__, __LINE__, (group)->name,\ + atomic_read(&group->refcount), group->demux->port, ## arg) + +#else /* DEBUG_MCG */ + +#define mcg_debug(fmt, arg...) do { } while (0) +#define mcg_debug_group(group, format, arg...) 
do { } while (0) + +#endif /* DEBUG_MCG */ + + +static struct mcast_group *mcast_find(struct mlx4_ib_demux_ctx *ctx, + union ib_gid *mgid) +{ + struct rb_node *node = ctx->mcg_table.rb_node; + struct mcast_group *group; + int ret; + + while (node) { + group = rb_entry(node, struct mcast_group, node); + ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid); + if (!ret) + return group; + + if (ret < 0) + node = node->rb_left; + else + node = node->rb_right; + } + return NULL; +} + +static struct mcast_group *mcast_insert(struct mlx4_ib_demux_ctx *ctx, + struct mcast_group *group) +{ + struct rb_node **link = &ctx->mcg_table.rb_node; + struct rb_node *parent = NULL; + struct mcast_group *cur_group; + int ret; + + while (*link) { + parent = *link; + cur_group = rb_entry(parent, struct mcast_group, node); + + ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw, + sizeof group->rec.mgid); + if (ret < 0) + link = &(*link)->rb_left; + else if (ret > 0) + link = &(*link)->rb_right; + else + return cur_group; + } + rb_link_node(&group->node, parent, link); + rb_insert_color(&group->node, &ctx->mcg_table); + return NULL; +} + +/* Should be replaced by common SRIOV send_to_wire() function */ +static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad) +{ + struct mlx4_ib_dev *dev = ctx->dev; + struct ib_ah_attr ah_attr; + + spin_lock(&dev->sm_lock); + if (!dev->sm_ah[ctx->port - 1]) { /* port is not yet Active, sm_ah not ready */ + spin_unlock(&dev->sm_lock); + mcg_debug("No sm_ah. Port %d not ready? Skipping send\n", ctx->port); + return -EAGAIN; + } + mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr); + spin_unlock(&dev->sm_lock); + return mlx4_ib_send_to_wire(dev, dev->dev->caps.function, ctx->port, + IB_QPT_GSI, 0, 1, IB_QP1_QKEY, &ah_attr, mad); +} + +/* Should be replaced by common SRIOV send_to_slave() function */ +static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad) +{ + struct mlx4_ib_dev *dev = ctx->dev; + struct ib_mad_agent *agent = dev->send_agent[ctx->port - 1][1]; + struct ib_wc wc; + struct ib_ah_attr ah_attr; + + /* Our agent might not yet be registered when mads start to arrive */ + if (!agent) + return -EAGAIN; + + ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr); + + if (ib_find_cached_pkey(&dev->ib_dev, ctx->port, IB_DEFAULT_PKEY_FULL, &wc.pkey_index)) + return -EINVAL; + wc.sl = 0; + wc.dlid_path_bits = 0; + wc.port_num = ctx->port; + wc.slid = ah_attr.dlid; /* opensm lid */ + wc.src_qp = 1; + return mlx4_ib_send_to_slave(dev, slave, ctx->port, IB_QPT_GSI, &wc, NULL, mad); +} + +static int send_join_to_wire(struct mcast_group *group, struct ib_sa_mad *sa_mad) +{ + struct ib_sa_mad mad; + struct ib_sa_mcmember_data *sa_mad_data = (struct ib_sa_mcmember_data *)&mad.data; + int ret; + + mcg_debug_group(group, "Sending join to wire\n"); + + /* we rely on a mad request as arrived from a VF */ + memcpy(&mad, sa_mad, sizeof mad); + + /* fix port GID to be the real one (slave 0) */ + sa_mad_data->port_gid.global.interface_id = group->demux->guid_cache[0]; + + /* assign our own TID */ + mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux); + group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */ + + ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad); + if (ret) + mcg_debug_group(group, "Error sending join mad to wire (%d)\n", ret); + + /* set timeout handler */ + if (!ret) { + /* calls mlx4_ib_mcg_timeout_handler */ + queue_delayed_work(group->demux->mcg_wq, &group->timeout_work, + 
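/*
 * Editor's note -- illustrative sketch only, not part of the patch.
 * mcast_find()/mcast_insert() above keep multicast groups in a red-black
 * tree ordered by memcmp() over the 16-byte MGID.  The sketch below shows
 * the same comparison-driven lookup over a small sorted array instead of
 * the kernel rb-tree, just to make the ordering rule concrete.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MGID_LEN 16

struct demo_group {
	uint8_t mgid[MGID_LEN];
	const char *name;
};

static const struct demo_group *demo_find(const struct demo_group *tbl,
					  int n, const uint8_t *mgid)
{
	int lo = 0, hi = n - 1;

	while (lo <= hi) {
		int mid = (lo + hi) / 2;
		int ret = memcmp(mgid, tbl[mid].mgid, MGID_LEN);

		if (!ret)
			return &tbl[mid];
		if (ret < 0)
			hi = mid - 1;	/* go "left", as in the rb-tree walk */
		else
			lo = mid + 1;	/* go "right" */
	}
	return NULL;
}

int main(void)
{
	struct demo_group tbl[2] = {
		{ { 0xff, 0x12 }, "group-a" },
		{ { 0xff, 0x14 }, "group-b" },
	};
	uint8_t key[MGID_LEN] = { 0xff, 0x14 };
	const struct demo_group *g = demo_find(tbl, 2, key);

	printf("found: %s\n", g ? g->name : "none");
	return 0;
}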
msecs_to_jiffies(MAD_TIMEOUT_MS)); + } + + return ret; +} + +static int send_leave_to_wire(struct mcast_group *group, u8 join_state) +{ + struct ib_sa_mad mad; + struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data; + int ret; + + memset(&mad, 0, sizeof mad); + mad.mad_hdr.base_version = 1; + mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; + mad.mad_hdr.class_version = 2; + mad.mad_hdr.method = IB_SA_METHOD_DELETE; + mad.mad_hdr.status = cpu_to_be16(0); + mad.mad_hdr.class_specific = cpu_to_be16(0); + mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux); + group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */ + mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC); + mad.mad_hdr.attr_mod = cpu_to_be32(0); + mad.sa_hdr.sm_key = 0x0; + mad.sa_hdr.attr_offset = cpu_to_be16(7); + mad.sa_hdr.comp_mask = IB_SA_MCMEMBER_REC_MGID | + IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_JOIN_STATE; + + *sa_data = group->rec; + sa_data->scope_join_state = join_state; + + mcg_debug_group(group, "Sending leave to wire, join_state=%x, tid 0x%llx\n", join_state, + be64_to_cpu(mad.mad_hdr.tid)); + ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad); + if (ret) { + mcg_debug_group(group, "Error sending leave mad to wire (%d)\n", ret); + group->state = MCAST_IDLE; + } + + /* set timeout handler */ + if (!ret) { + /* calls mlx4_ib_mcg_timeout_handler */ + queue_delayed_work(group->demux->mcg_wq, &group->timeout_work, + msecs_to_jiffies(MAD_TIMEOUT_MS)); + } + + return ret; +} + +static int send_reply_to_slave(int slave, struct mcast_group *group, + struct ib_sa_mad *req_sa_mad, u16 status) +{ + struct ib_sa_mad mad; + struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data; + struct ib_sa_mcmember_data *req_sa_data = (struct ib_sa_mcmember_data *)&req_sa_mad->data; + int ret; + + mcg_debug_group(group, "Sending reply to slave %d, status=0x%x\n", slave, status); + memset(&mad, 0, sizeof mad); + mad.mad_hdr.base_version = 1; + mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; + mad.mad_hdr.class_version = 2; + mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP; + mad.mad_hdr.status = cpu_to_be16(status); + mad.mad_hdr.class_specific = cpu_to_be16(0); + mad.mad_hdr.tid = req_sa_mad->mad_hdr.tid; + *(u8 *)&mad.mad_hdr.tid = 0; /* resetting tid to 0 */ + mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC); + mad.mad_hdr.attr_mod = cpu_to_be32(0); + mad.sa_hdr.sm_key = req_sa_mad->sa_hdr.sm_key; + mad.sa_hdr.attr_offset = cpu_to_be16(7); + mad.sa_hdr.comp_mask = 0; /* ignored on responses, see IBTA spec */ + + *sa_data = group->rec; + + /* reconstruct VF's requested join_state and port_gid */ + sa_data->scope_join_state &= 0xf0; + sa_data->scope_join_state |= (group->func[slave].join_state & 0x0f); + memcpy(&sa_data->port_gid, &req_sa_data->port_gid, sizeof req_sa_data->port_gid); + + ret = send_mad_to_slave(slave, group->demux, (struct ib_mad *)&mad); + if (ret) + mcg_debug_group(group, "Error sending mad to slave %d (%d)\n", + slave, ret); + return ret; +} + +static int check_selector(ib_sa_comp_mask comp_mask, + ib_sa_comp_mask selector_mask, + ib_sa_comp_mask value_mask, + u8 src_value, u8 dst_value) +{ + int err; + u8 selector = dst_value >> 6; + dst_value &= 0x3f; + src_value &= 0x3f; + + if (!(comp_mask & selector_mask) || !(comp_mask & value_mask)) + return 0; + + switch (selector) { + case IB_SA_GT: + err = (src_value <= dst_value); + break; + case IB_SA_LT: + err = (src_value >= dst_value); + break; + case IB_SA_EQ: + 
err = (src_value != dst_value); + break; + default: + err = 0; + break; + } + + return err; +} + +static u16 cmp_rec(struct ib_sa_mcmember_data *src, + struct ib_sa_mcmember_data *dst, ib_sa_comp_mask comp_mask) +{ + /* src is group record, dst is request record */ + /* MGID must already match */ + /* Port_GID we always replace to our Port_GID, so it is a match */ + +#define MAD_STATUS_REQ_INVALID 0x0200 + if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid) + return MAD_STATUS_REQ_INVALID; + if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR, + IB_SA_MCMEMBER_REC_MTU, + src->mtusel_mtu, dst->mtusel_mtu)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS && + src->tclass != dst->tclass) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey) + return MAD_STATUS_REQ_INVALID; + if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR, + IB_SA_MCMEMBER_REC_RATE, + src->ratesel_rate, dst->ratesel_rate)) + return MAD_STATUS_REQ_INVALID; + if (check_selector(comp_mask, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR, + IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME, + src->lifetmsel_lifetm, dst->lifetmsel_lifetm)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_SL && + (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0xf0000000) != + (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0xf0000000)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL && + (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x0fffff00) != + (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x0fffff00)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT && + (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x000000ff) != + (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x000000ff)) + return MAD_STATUS_REQ_INVALID; + if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && + (src->scope_join_state & 0xf0) != + (dst->scope_join_state & 0xf0)) + return MAD_STATUS_REQ_INVALID; + + /* join_state checked separately, proxy_join ignored */ + + return 0; +} + +/* release group, return 1 if this was last release and group is destroyed + * timout work is canceled sync */ +static int release_group(struct mcast_group *group, int from_timeout_handler) +{ + struct mlx4_ib_demux_ctx *ctx = group->demux; + int nzgroup; + + mutex_lock(&ctx->mcg_table_lock); + mutex_lock(&group->lock); + if (atomic_dec_and_test(&group->refcount)) { + if (!from_timeout_handler) { + if(group->state != MCAST_IDLE && + !cancel_delayed_work(&group->timeout_work)) { + atomic_inc(&group->refcount); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + return 0; + } + } + + nzgroup = memcmp(&group->rec.mgid, &mgid0, sizeof mgid0); + mcg_debug_group(group, "nzgroup %d\n", nzgroup); + if (nzgroup) { + del_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); + mcg_debug_group(group, "deleted from sysfs\n"); + } + if (!list_empty(&group->pending_list)) + mcg_warn_group(group, "releasing a group with non empty pending list\n"); + if (nzgroup) + rb_erase(&group->node, &ctx->mcg_table); + list_del_init(&group->mgid0_list); + mcg_debug_group(group, "freeing group %p\n", group); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + kfree(group); + return 1; + } else { + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + } + return 0; +} + +static void adjust_membership(struct mcast_group 
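/*
 * Editor's note -- illustrative sketch only, not part of the patch.
 * check_selector() above treats the top two bits of an SA MCMemberRecord
 * field as a selector (greater-than / less-than / exactly) and the low
 * six bits as the value, reporting a mismatch when the group's value does
 * not satisfy the requested relation.  The standalone sketch below
 * repeats that decode; the numeric selector codes are local demo values,
 * not the kernel's IB_SA_* constants.
 */
#include <stdint.h>
#include <stdio.h>

enum demo_sel { DEMO_SEL_GT, DEMO_SEL_LT, DEMO_SEL_EQ };	/* demo encoding */

/* returns non-zero when src does NOT satisfy the requested relation */
static int demo_check_selector(uint8_t src_value, uint8_t dst_value)
{
	uint8_t selector = dst_value >> 6;

	dst_value &= 0x3f;
	src_value &= 0x3f;

	switch (selector) {
	case DEMO_SEL_GT:
		return src_value <= dst_value;
	case DEMO_SEL_LT:
		return src_value >= dst_value;
	case DEMO_SEL_EQ:
		return src_value != dst_value;
	default:
		return 0;	/* nothing requested: nothing to check */
	}
}

int main(void)
{
	/* ask for "value greater than 4": selector in bits 7:6, value 4 */
	uint8_t req = (DEMO_SEL_GT << 6) | 4;

	printf("group value 5 vs request: %s\n",
	       demo_check_selector(5, req) ? "mismatch" : "ok");
	printf("group value 3 vs request: %s\n",
	       demo_check_selector(3, req) ? "mismatch" : "ok");
	return 0;
}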
*group, u8 join_state, int inc) +{ + int i; + + for (i = 0; i < 3; i++, join_state >>= 1) + if (join_state & 0x1) + group->members[i] += inc; +} + +static u8 get_leave_state(struct mcast_group *group) +{ + u8 leave_state = 0; + int i; + + for (i = 0; i < 3; i++) + if (!group->members[i]) + leave_state |= (1 << i); + + return leave_state & (group->rec.scope_join_state & 7); +} + +static int join_group(struct mcast_group *group, int slave, u8 join_mask) +{ + int ret = 0; + u8 join_state; + + /* remove bits that slave is already member of, and adjust */ + join_state = join_mask & (~group->func[slave].join_state); + adjust_membership(group, join_state, 1); + group->func[slave].join_state |= join_state; + if (group->func[slave].state != MCAST_MEMBER && join_state) { + group->func[slave].state = MCAST_MEMBER; + ret = 1; + } + return ret; +} + +static int leave_group(struct mcast_group *group, int slave, u8 leave_state) +{ + int ret = 0; + + adjust_membership(group, leave_state, -1); + group->func[slave].join_state &= ~leave_state; + if (!group->func[slave].join_state) { + group->func[slave].state = MCAST_NOT_MEMBER; + ret = 1; + } + return ret; +} + +static int check_leave(struct mcast_group *group, int slave, u8 leave_mask) +{ + if (group->func[slave].state != MCAST_MEMBER) { + mcg_debug_group(group, "not a member\n"); + return MAD_STATUS_REQ_INVALID; + } + + /* make sure we're not deleting unset bits */ + if (~group->func[slave].join_state & leave_mask) { + mcg_debug_group(group, "VF state %d, leave mask %d\n", group->func[slave].join_state, leave_mask); + return MAD_STATUS_REQ_INVALID; + } + + if (!leave_mask) + return MAD_STATUS_REQ_INVALID; + + return 0; +} + +static void mlx4_ib_mcg_timeout_handler(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + struct mcast_group *group; + struct mcast_req *req = NULL; + int rel = 0; + + group = container_of(delay, typeof(*group), timeout_work); + + mcg_debug_group(group, "mcg Timeout\n"); + mutex_lock(&group->lock); + if (group->state == MCAST_JOIN_SENT) { + if (!list_empty(&group->pending_list)) { + req = list_first_entry(&group->pending_list, struct mcast_req, group_list); + list_del(&req->group_list); + list_del(&req->func_list); + --group->func[req->func].num_pend_reqs; + mutex_unlock(&group->lock); + kfree(req); + if (memcmp(&group->rec.mgid, &mgid0, sizeof mgid0)) { + if (release_group(group, 1)) + return; + } else { + kfree(group); + return; + } + mutex_lock(&group->lock); + } else + mcg_warn_group(group, "DRIVER BUG\n"); + } else if (group->state == MCAST_LEAVE_SENT) { + if (group->rec.scope_join_state & 7) { + group->rec.scope_join_state &= 0xf8; + rel = 1; + } + group->state = MCAST_IDLE; + mutex_unlock(&group->lock); + if (release_group(group, 1)) + return; + mutex_lock(&group->lock); + } else + mcg_warn_group(group, "invalid state %s\n", get_state_string(group->state)); + group->state = MCAST_IDLE; + atomic_inc(&group->refcount); + mcg_debug_group(group, "\n"); + if (!queue_work(group->demux->mcg_wq, &group->work)) { + safe_atomic_dec(&group->refcount); + mcg_debug_group(group, "\n"); + } + + mutex_unlock(&group->lock); +} + +static int handle_leave_req(struct mcast_group *group, u8 leave_mask, + struct mcast_req *req) +{ + u16 status; + + if (req->clean) + leave_mask = group->func[req->func].join_state; + + status = check_leave(group, req->func, leave_mask); + if (!status) + leave_group(group, req->func, leave_mask); + + mcg_debug_group(group, "req %p status 0x%04x, VF %d, clean %d\n", + req, status, 
req->func, req->clean); + if (!req->clean) + send_reply_to_slave(req->func, group, &req->sa_mad, status); + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + return 1; +} + +static int handle_join_req(struct mcast_group *group, u8 join_mask, + struct mcast_req *req) +{ + u8 group_join_state = group->rec.scope_join_state & 7; + int ref = 0; + u16 status; + struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data; + + if (join_mask == (group_join_state & join_mask)) { + /* port's membership need not change */ + status = cmp_rec(&group->rec, sa_data, req->sa_mad.sa_hdr.comp_mask); + if (!status) + join_group(group, req->func, join_mask); + + mcg_debug_group(group, "req %p status 0x%04x, VF %d, ref %d, join %d, group join %d\n", + req, status, req->func, ref, join_mask, group_join_state); + --group->func[req->func].num_pend_reqs; + send_reply_to_slave(req->func, group, &req->sa_mad, status); + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + ++ref; + } else { + /* port's membership needs to be updated */ + mcg_debug_group(group, "req %p, VF %d, join %d, group join %d\n", + req, req->func, join_mask, group_join_state); + group->prev_state = group->state; + if (send_join_to_wire(group, &req->sa_mad)) { + mcg_debug_group(group, "Error sending mad to wire\n"); + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + ref = 1; + group->state = group->prev_state; + } else + group->state = MCAST_JOIN_SENT; + } + + return ref; +} + +static void mlx4_ib_mcg_work_handler(struct work_struct *work) +{ + struct mcast_group *group; + struct mcast_req *req = NULL; + struct ib_sa_mcmember_data *sa_data; + u8 group_join_state, req_join_state; + int rc = 1; /* release_count - this is for the scheduled work */ + u16 status; + u8 method; + + group = container_of(work, typeof(*group), work); + + mutex_lock(&group->lock); + + mcg_debug_group(group, "group state %s, pending %c\n", + get_state_string(group->state), + (list_empty(&group->pending_list))? 'N' : 'Y'); + + /* First, let's see if a response from SM is waiting regarding this group. + * If so, we need to update the group's REC. If this is a bad response, we + * may need to send a bad response to a VF waiting for it. If VF is waiting + * and this is a good response, the VF will be answered later in this func. */ + if (group->state == MCAST_RESP_READY) { + /* cancels mlx4_ib_mcg_timeout_handler */ + cancel_delayed_work(&group->timeout_work); + status = be16_to_cpu(group->response_sa_mad.mad_hdr.status); + method = group->response_sa_mad.mad_hdr.method; + if (group->last_req_tid != group->response_sa_mad.mad_hdr.tid) { + mcg_warn_group(group, "Got MAD response to existing MGID but wrong TID, dropping. Resp TID=%llx, group TID=%llx\n", + be64_to_cpu(group->response_sa_mad.mad_hdr.tid), + be64_to_cpu(group->last_req_tid)); + group->state = group->prev_state; + goto process_requests; + } + if (status) { + mcg_debug_group(group, "work: got bad response from SM on %s request: status=0x%x\n", + (method == IB_MGMT_METHOD_GET_RESP)? 
"JOIN" : "LEAVE", + status); + if (!list_empty(&group->pending_list)) + req = list_first_entry(&group->pending_list, + struct mcast_req, group_list); + if ((method == IB_MGMT_METHOD_GET_RESP)) { + if (req) { + send_reply_to_slave(req->func, group, &req->sa_mad, status); + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + ++rc; + } else + mcg_warn_group(group, "no request for failed join\n"); + } else if (method == IB_SA_METHOD_DELETE_RESP && group->demux->flushing) { + mcg_debug_group(group, "SM rejected delete\n"); + ++rc; + } + } else { + u8 resp_join_state; + u8 cur_join_state; + + resp_join_state = ((struct ib_sa_mcmember_data *)group->response_sa_mad.data)->scope_join_state & 7; + cur_join_state = group->rec.scope_join_state & 7; + + mcg_debug_group(group, "method %s, orig join %d, new join %d\n", + (method == IB_MGMT_METHOD_GET_RESP)? "JOIN" : "LEAVE", + cur_join_state, resp_join_state); + if (method == IB_MGMT_METHOD_GET_RESP) { + /* successfull join */ + if (!cur_join_state && resp_join_state) + --rc; + } else if (!resp_join_state) + ++rc; + + if (memcmp(&group->rec.mgid, group->response_sa_mad.data, 16)) + mcg_debug_group(group, "MGIDs mismatch\n"); + + memcpy(&group->rec, group->response_sa_mad.data, sizeof group->rec); + mcg_debug_group(group, "update join %d, rc %d\n", + group->rec.scope_join_state & 7, rc); + } + group->state = MCAST_IDLE; + } + +process_requests: + /* We should now go over pending join/leave requests, as long as we are idle. */ + while (!list_empty(&group->pending_list) && group->state == MCAST_IDLE) { + req = list_first_entry(&group->pending_list, struct mcast_req, + group_list); + mcg_debug_group(group, "req %p, VF %d, rc %d\n", + req, req->func, rc); + sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data; + group_join_state = group->rec.scope_join_state & 0x7; + req_join_state = sa_data->scope_join_state & 0x7; + + /* For a leave request, we will immediately answer the VF, and + * update our internal counters. The actual leave will be sent + * to SM later, if at all needed. We dequeue the request now. */ + if (req->sa_mad.mad_hdr.method == IB_SA_METHOD_DELETE) + rc += handle_leave_req(group, req_join_state, req); + else + rc += handle_join_req(group, req_join_state, req); + } + + /* Handle leaves */ + if (group->state == MCAST_IDLE) { + req_join_state = get_leave_state(group); + mcg_debug_group(group, "req_join_state %d\n", req_join_state); + if (req_join_state) { + group->rec.scope_join_state &= ~req_join_state; + group->prev_state = group->state; + if (send_leave_to_wire(group, req_join_state)) { + group->state = group->prev_state; + ++rc; + } else + group->state = MCAST_LEAVE_SENT; + } + } + + mcg_debug_group(group, "rc %d\n", rc); + if (!list_empty(&group->pending_list) && group->state == MCAST_IDLE) { + mcg_debug_group(group, "list not empty rc %d\n", rc); + goto process_requests; + } + mutex_unlock(&group->lock); + + while (rc--) + release_group(group, 0); +} + +static struct mcast_group *search_relocate_mgid0_group(struct mlx4_ib_demux_ctx *ctx, + __be64 tid, + union ib_gid *new_mgid) +{ + struct mcast_group *group = NULL, *cur_group; + struct mcast_req *req; + struct list_head *pos; + struct list_head *n; + + mutex_lock(&ctx->mcg_table_lock); + list_for_each_safe(pos, n, &ctx->mcg_mgid0_list) { + group = list_entry(pos, struct mcast_group, mgid0_list); + mutex_lock(&group->lock); + if (group->last_req_tid == tid) { + mcg_debug_group(group, "Found mgid0 group with correct TID. 
Relocating to new MGID: %016llx%016llx\n", + be64_to_cpu(new_mgid->global.subnet_prefix), + be64_to_cpu(new_mgid->global.interface_id)); + if (memcmp(new_mgid, &mgid0, sizeof mgid0)) { + group->rec.mgid = *new_mgid; + sprintf(group->name, "%016llx%016llx", + be64_to_cpu(group->rec.mgid.global.subnet_prefix), + be64_to_cpu(group->rec.mgid.global.interface_id)); + list_del_init(&group->mgid0_list); + cur_group = mcast_insert(ctx, group); + if (cur_group) { + /* A race between our code and SM. Silently cleaning the new one */ + mcg_debug_group(group, "found that group already exists while trying to " + "relocate mgid0 group, removing duplicate\n"); + + req = list_first_entry(&group->pending_list, + struct mcast_req, group_list); + --group->func[req->func].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + release_group(group, 0); + return NULL; + } + + atomic_inc(&group->refcount); + add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); + mcg_debug_group(group, "added to sysfs, refcount %d\n", + atomic_read(&group->refcount)); + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + return group; + } else { + struct mcast_req *tmp1, *tmp2; + + mcg_debug("got 0 mgid response. removing group %p\n", group); + list_del(&group->mgid0_list); + if (!list_empty(&group->pending_list) && group->state != MCAST_IDLE) + cancel_delayed_work_sync(&group->timeout_work); + + list_for_each_entry_safe(tmp1, tmp2, &group->pending_list, group_list) { + list_del(&tmp1->group_list); + kfree(tmp1); + } + mutex_unlock(&group->lock); + mutex_unlock(&ctx->mcg_table_lock); + kfree(group); + return NULL; + } + } + mutex_unlock(&group->lock); + } + mutex_unlock(&ctx->mcg_table_lock); + + return NULL; +} + +static ssize_t sysfs_show_group(struct device *dev, + struct device_attribute *attr, char *buf); + +static struct mcast_group *acquire_group(struct mlx4_ib_demux_ctx *ctx, + union ib_gid *mgid, int create, + gfp_t gfp_mask) +{ + struct mcast_group *group, *cur_group; + int is_mgid0; + int i; + + is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0); + if (!is_mgid0) { + group = mcast_find(ctx, mgid); + if (group) + goto found; + } + + if (!create) + return ERR_PTR(-ENOENT); + + group = kzalloc(sizeof *group, gfp_mask); + if (!group) + return ERR_PTR(-ENOMEM); + + group->demux = ctx; + group->rec.mgid = *mgid; + INIT_LIST_HEAD(&group->pending_list); + INIT_LIST_HEAD(&group->mgid0_list); + for (i = 0; i < MAX_VFS; ++i) + INIT_LIST_HEAD(&group->func[i].pending); + INIT_WORK(&group->work, mlx4_ib_mcg_work_handler); + INIT_DELAYED_WORK(&group->timeout_work, mlx4_ib_mcg_timeout_handler); + mutex_init(&group->lock); + sprintf(group->name, "%016llx%016llx", + be64_to_cpu(group->rec.mgid.global.subnet_prefix), + be64_to_cpu(group->rec.mgid.global.interface_id)); + group->dentry.show = sysfs_show_group; + group->dentry.store = NULL; + group->dentry.attr.name = group->name; + group->dentry.attr.mode = 0400; + group->state = MCAST_IDLE; + mcg_debug_group(group, "created (pointer %p)\n", group); + + if (is_mgid0) { + list_add(&group->mgid0_list, &ctx->mcg_mgid0_list); + goto found; + } + + cur_group = mcast_insert(ctx, group); + if (cur_group) { + mcg_warn("group just showed up %s - confused\n", cur_group->name); + kfree(group); + return ERR_PTR(-EINVAL); + } + + mcg_debug_group(group, "adding to sysfs\n"); + add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr); + +found: + 
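/* + * The group returned here holds a reference taken on behalf of the + * caller: it is either an existing entry found in the rb-tree, a newly + * created and inserted one, or (for MGID 0) a placeholder queued on + * mcg_mgid0_list. The reference is dropped later via release_group(). + */ + 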
atomic_inc(&group->refcount); + return group; +} + +static void queue_req(struct mcast_req *req) +{ + struct mcast_group *group = req->group; + + atomic_inc(&group->refcount); /* for the request */ + atomic_inc(&group->refcount); /* for scheduling the work */ + list_add_tail(&req->group_list, &group->pending_list); + list_add_tail(&req->func_list, &group->func[req->func].pending); + /* calls mlx4_ib_mcg_work_handler */ + if (!queue_work(group->demux->mcg_wq, &group->work)) + safe_atomic_dec(&group->refcount); +} + +int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *mad) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)mad->data; + struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1]; + struct mcast_group *group; + + switch (mad->mad_hdr.method) { + case IB_MGMT_METHOD_GET_RESP: + case IB_SA_METHOD_DELETE_RESP: + mcg_debug("slave %d, port %d method 0x%x (%s) MGID %016llx%016llx join_state= %d tid 0x%llx\n", + slave, port, mad->mad_hdr.method, debug_mcg_method_name(mad->mad_hdr.method), + be64_to_cpu(rec->mgid.global.subnet_prefix), be64_to_cpu(rec->mgid.global.interface_id), + rec->scope_join_state & 0xf, be64_to_cpu(mad->mad_hdr.tid)); + + mutex_lock(&ctx->mcg_table_lock); + group = acquire_group(ctx, &rec->mgid, 0, GFP_KERNEL); + mutex_unlock(&ctx->mcg_table_lock); + if (IS_ERR(group)) { + if (mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP) { + __be64 tid = mad->mad_hdr.tid; + *(u8 *)(&tid) = (u8)slave; /* in group we kept the modified TID */ + group = search_relocate_mgid0_group(ctx, tid, &rec->mgid); + } else + group = NULL; + } + + if (!group) { + mcg_debug("demux: port %d: response to non-existing group, dropping.\n", port); + return 1; + } + + mutex_lock(&group->lock); + group->response_sa_mad = *mad; + group->prev_state = group->state; + group->state = MCAST_RESP_READY; + /* calls mlx4_ib_mcg_work_handler */ + atomic_inc(&group->refcount); + if (!queue_work(ctx->mcg_wq, &group->work)) + safe_atomic_dec(&group->refcount); + mutex_unlock(&group->lock); + release_group(group, 0); + return 1; /* consumed */ + case IB_MGMT_METHOD_SET: + case IB_SA_METHOD_GET_TABLE: + case IB_SA_METHOD_GET_TABLE_RESP: + case IB_SA_METHOD_DELETE: + return 0; /* not consumed, pass-through to guest over tunnel */ + default: + mcg_warn("In demux, port %d: unexpected MCMember method: 0x%x, dropping\n", + port, mad->mad_hdr.method); + return 1; /* consumed */ + } +} + +int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, + int slave, struct ib_sa_mad *sa_mad) +{ + struct mlx4_ib_dev *dev = to_mdev(ibdev); + struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)sa_mad->data; + struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1]; + struct mcast_group *group; + struct mcast_req *req; + int may_create = 0; + + if (ctx->flushing) + return -EAGAIN; + + switch (sa_mad->mad_hdr.method) { + case IB_MGMT_METHOD_SET: + may_create = 1; + case IB_SA_METHOD_DELETE: + mcg_debug("slave %d, port %d, method 0x%x (%s), MGID %016llx%016llx, join_state %d, tid 0x%llx\n", + slave, port, sa_mad->mad_hdr.method, debug_mcg_method_name(sa_mad->mad_hdr.method), + be64_to_cpu(rec->mgid.global.subnet_prefix), be64_to_cpu(rec->mgid.global.interface_id), + rec->scope_join_state & 0xf, be64_to_cpu(sa_mad->mad_hdr.tid)); + + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) + return -ENOMEM; + + req->func = slave; + req->sa_mad = *sa_mad; + + mutex_lock(&ctx->mcg_table_lock); + group = 
acquire_group(ctx, &rec->mgid, may_create, GFP_KERNEL); + mutex_unlock(&ctx->mcg_table_lock); + if (IS_ERR(group)) { + mcg_debug("group %016llx%016llx not found, return %ld\n", + be64_to_cpu(rec->mgid.global.subnet_prefix), + be64_to_cpu(rec->mgid.global.interface_id), + PTR_ERR(group)); + kfree(req); + return PTR_ERR(group); + } + mutex_lock(&group->lock); + if (group->func[slave].num_pend_reqs > MAX_PEND_REQS_PER_FUNC) { + mutex_unlock(&group->lock); + mcg_warn_group(group, "Port %d, Func %d has too many pending requests (%d), dropping\n", + port, slave, MAX_PEND_REQS_PER_FUNC); + release_group(group, 0); + kfree(req); + return -ENOMEM; + } + ++group->func[slave].num_pend_reqs; + req->group = group; + queue_req(req); + mutex_unlock(&group->lock); + release_group(group, 0); + return 1; /* consumed */ + case IB_SA_METHOD_GET_TABLE: + case IB_MGMT_METHOD_GET_RESP: + case IB_SA_METHOD_GET_TABLE_RESP: + case IB_SA_METHOD_DELETE_RESP: + return 0; /* not consumed, pass-through */ + default: + mcg_warn("In multiplex, port %d, func %d: unexpected MCMember method: 0x%x, dropping\n", + port, slave, sa_mad->mad_hdr.method); + return 1; /* consumed */ + } +} + +#ifdef DEBUG_MCG +static int check_consistency_ok(struct mcast_group *group, char *tmp_str) +{ + struct list_head *pos; + int funcs = 0, pending = 0; + int members[3] = {0, 0, 0}; + int f, i; + int ok_refs, ok_members, ok_group_state, ok_pend_req = 1; + int len = 0; + u8 join_state; + + for (f = 0; f < MAX_VFS; ++f) { + if (group->func[f].state > MCAST_NOT_MEMBER) { + u8 join_state = group->func[f].join_state; + ++funcs; + for (i = 0; i < 3; i++, join_state >>= 1) + if (join_state & 0x1) + members[i]++; + } + if (ok_pend_req && (group->func[f].num_pend_reqs < 0 || + group->func[f].num_pend_reqs > MAX_PEND_REQS_PER_FUNC)) { + ok_pend_req = 0; + len += sprintf(tmp_str, "VF%d_pend=%d", f, group->func[f].num_pend_reqs); + } + } + list_for_each(pos, &group->pending_list) + ++pending; + + ok_refs = ((1 + pending) == atomic_read(&group->refcount)); + if (!ok_refs && tmp_str) + len += sprintf(tmp_str + len, "VFs=%d Pend=%d ", funcs, pending); + + ok_members = 1; + for (i = 0; i < 3; i++) + if (members[i] != group->members[i]) { + ok_members = 0; + if (tmp_str) + len += sprintf(tmp_str + len, "MEMBERS[%d]=%d ", i, members[i]); + } + + join_state = 0; + for (i = 2; i >= 0; i--) { + join_state <<= 1; + if (group->members[i]) + join_state |= 0x1; + } + ok_group_state = (join_state == (group->rec.scope_join_state & 0xf)); + if (!ok_group_state && tmp_str) + len += sprintf(tmp_str + len, "STATE=%d ", join_state); + + return (ok_refs && ok_members && ok_group_state); +} +#else +static int check_consistency_ok(struct mcast_group *group, char *tmp_str) +{ + sprintf(tmp_str, "NO_DEBUG"); + return 0; +} +#endif /* DEBUG_MCG */ + +static ssize_t sysfs_show_group(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct mcast_group *group = + container_of(attr, struct mcast_group, dentry); + struct mcast_req *req = NULL; + char consistency_str[40]; + char pending_str[40]; + char state_str[40]; + ssize_t len = 0; + int f; + + if (group->state == MCAST_IDLE) + sprintf(state_str, "%s", get_state_string(group->state)); + else + sprintf(state_str, "%s(TID=0x%llx)", + get_state_string(group->state), + be64_to_cpu(group->last_req_tid)); + if (list_empty(&group->pending_list)) { + sprintf(pending_str, "No"); + } else { + req = list_first_entry(&group->pending_list, struct mcast_req, group_list); + sprintf(pending_str, "Yes(TID=0x%llx)", + 
be64_to_cpu(req->sa_mad.mad_hdr.tid)); + } + sprintf(consistency_str, "OK"); + check_consistency_ok(group, consistency_str); + len += sprintf(buf + len, "%1d [%02d,%02d,%02d] %4d %4s %5s %11s ", + group->rec.scope_join_state & 0xf, + group->members[2], group->members[1], group->members[0], + atomic_read(&group->refcount), + pending_str, + state_str, + consistency_str); + for (f = 0; f < MAX_VFS; ++f) + if (group->func[f].state == MCAST_MEMBER) + len += sprintf(buf + len, "%d[%1x] ", + f, group->func[f].join_state); + + len += sprintf(buf + len, "\t\t(%4hx %4x %2x %2x %2x %2x %2x " + "%4x %4x %2x %2x)\n", + be16_to_cpu(group->rec.pkey), + be32_to_cpu(group->rec.qkey), + (group->rec.mtusel_mtu & 0xc0) >> 6, + group->rec.mtusel_mtu & 0x3f, + group->rec.tclass, + (group->rec.ratesel_rate & 0xc0) >> 6, + group->rec.ratesel_rate & 0x3f, + (be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0xf0000000) >> 28, + (be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x0fffff00) >> 8, + be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x000000ff, + group->rec.proxy_join); + + return len; +} + +int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx) +{ + char name[20]; + + mcg_debug("mcg init\n"); + atomic_set(&ctx->tid, 0); + sprintf(name, "mlx4_ib_mcg%d", ctx->port); + ctx->mcg_wq = create_singlethread_workqueue(name); + if (!ctx->mcg_wq) + return -ENOMEM; + + mutex_init(&ctx->mcg_table_lock); + ctx->mcg_table = RB_ROOT; + INIT_LIST_HEAD(&ctx->mcg_mgid0_list); + ctx->flushing = 0; + + return 0; +} + +static void force_clean_group(struct mcast_group *group) +{ + struct mcast_req *req, *tmp; + + list_for_each_entry_safe(req, tmp, &group->pending_list, group_list) { + list_del(&req->group_list); + kfree(req); + } + del_sysfs_port_mcg_attr(group->demux->dev, group->demux->port, &group->dentry.attr); + mcg_debug_group(group, "deleting group\n"); + rb_erase(&group->node, &group->demux->mcg_table); + kfree(group); +} + +static void _mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq) +{ + int i; + struct rb_node *p; + struct mcast_group *group; + unsigned long end; + int count; + + mcg_debug("port %d, destroy_wq %d\n", ctx->port, destroy_wq); + for (i = 0; i < MAX_VFS; ++i) + clean_vf_mcast(ctx, i); + + end = jiffies + msecs_to_jiffies(MAD_TIMEOUT_MS + 3000); + do { + count = 0; + mutex_lock(&ctx->mcg_table_lock); + for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) + ++count; + mutex_unlock(&ctx->mcg_table_lock); + if (!count) + break; + + msleep(1); + } while (time_after(end, jiffies)); + + flush_workqueue(ctx->mcg_wq); + if (destroy_wq) + destroy_workqueue(ctx->mcg_wq); + + mutex_lock(&ctx->mcg_table_lock); + while ((p = rb_first(&ctx->mcg_table)) != NULL) { + group = rb_entry(p, struct mcast_group, node); + if (atomic_read(&group->refcount)) + mcg_warn_group(group, "group refcount %d!!! 
(pointer %p)\n", atomic_read(&group->refcount), group); + + force_clean_group(group); + } + mutex_unlock(&ctx->mcg_table_lock); +} + +struct clean_work { + struct work_struct work; + struct mlx4_ib_demux_ctx *ctx; + int destroy_wq; +}; + +static void mcg_clean_task(struct work_struct *work) +{ + struct clean_work *cw = container_of(work, struct clean_work, work); + + _mlx4_ib_mcg_port_cleanup(cw->ctx, cw->destroy_wq); + cw->ctx->flushing = 0; + kfree(cw); +} + +void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq) +{ + struct clean_work *work; + + if (ctx->flushing) + return; + + ctx->flushing = 1; + + if (destroy_wq) { + _mlx4_ib_mcg_port_cleanup(ctx, destroy_wq); + ctx->flushing = 0; + return; + } + + work = kmalloc(sizeof *work, GFP_KERNEL); + if (!work) { + ctx->flushing = 0; + mcg_warn("failed allocating work for cleanup\n"); + return; + } + + work->ctx = ctx; + work->destroy_wq = destroy_wq; + INIT_WORK(&work->work, mcg_clean_task); + queue_work(clean_wq, &work->work); +} + +static void build_leave_mad(struct mcast_req *req) +{ + struct ib_sa_mad *mad = &req->sa_mad; + + mad->mad_hdr.method = IB_SA_METHOD_DELETE; +} + + +static void clear_pending_reqs(struct mcast_group *group, int vf) +{ + struct mcast_req *req, *tmp, *group_first = NULL; + int clear; + int pend = 0; + + if (!list_empty(&group->pending_list)) + group_first = list_first_entry(&group->pending_list, struct mcast_req, group_list); + + mcg_debug_group(group, "group_first %p\n", group_first); + list_for_each_entry_safe(req, tmp, &group->func[vf].pending, func_list) { + clear = 1; + mcg_debug_group(group, "req %p, tid 0x%llx\n", req, be64_to_cpu(req->sa_mad.mad_hdr.tid)); + if (group_first == req && + (group->state == MCAST_JOIN_SENT || + group->state == MCAST_LEAVE_SENT)) { + clear = cancel_delayed_work(&group->timeout_work); + pend = !clear; + mcg_debug_group(group, "tid 0x%llx, clear %d\n", be64_to_cpu(req->sa_mad.mad_hdr.tid), clear); + group->state = MCAST_IDLE; + } + if (clear) { + --group->func[vf].num_pend_reqs; + list_del(&req->group_list); + list_del(&req->func_list); + kfree(req); + atomic_dec(&group->refcount); + } + } + + if (!pend && (!list_empty(&group->func[vf].pending) || group->func[vf].num_pend_reqs)) { + mcg_warn_group(group, "DRIVER BUG: list_empty %d, num_pend_reqs %d\n", + list_empty(&group->func[vf].pending), group->func[vf].num_pend_reqs); + if (!list_empty(&group->func[vf].pending)) + mcg_debug_group(group, "first req ptr is %p\n", + list_first_entry(&group->func[vf].pending, + struct mcast_req, group_list)); + } +} + +static int push_deleteing_req(struct mcast_group *group, int slave) +{ + union ib_gid mgid; + struct mcast_req *req; + struct mcast_req *pend_req; + + if (!group->func[slave].join_state) + return 0; + + mgid = group->rec.mgid; + req = kzalloc(sizeof *req, GFP_KERNEL); + if (!req) { + mcg_warn_group(group, "failed allocation - may leave stale groups\n"); + return -ENOMEM; + } + + mcg_debug_group(group, "vf %d, num_pend_reqs %d\n", slave, group->func[slave].num_pend_reqs); + if (!list_empty(&group->func[slave].pending)) { + pend_req = list_entry(group->func[slave].pending.prev, struct mcast_req, group_list); + if (pend_req->clean) { + mcg_debug_group(group, "pending delete already exists %p\n", pend_req); + kfree(req); + return 0; + } + } + + req->clean = 1; + req->func = slave; + req->group = group; + ++group->func[slave].num_pend_reqs; + build_leave_mad(req); + mcg_debug_group(group, "pushing delete request, VF %d, join_state %d\n", + slave, 
group->func[slave].join_state); + queue_req(req); + return 0; +} + +void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave) +{ + struct mcast_group *group; + struct rb_node *p; + + mcg_debug("called for vf %d, port %d\n", slave, ctx->port); + mutex_lock(&ctx->mcg_table_lock); + for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) { + group = rb_entry(p, struct mcast_group, node); + mutex_lock(&group->lock); + if (atomic_read(&group->refcount)) { + /* clear pending requests of this VF */ + clear_pending_reqs(group, slave); + push_deleteing_req(group, slave); + } + mutex_unlock(&group->lock); + } + mutex_unlock(&ctx->mcg_table_lock); +} + + +int mlx4_ib_mcg_init(void) +{ + clean_wq = create_singlethread_workqueue("mlx4_ib_mcg"); + if (!clean_wq) + return -ENOMEM; + + return 0; +} + +void mlx4_ib_mcg_destroy(void) +{ + destroy_workqueue(clean_wq); +} diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 2a322f21049fa..1d86251c9bd97 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -37,13 +37,59 @@ #include #include #include +#include #include #include +#include +#include #include #include + +#define MLX4_IB_DRV_NAME "mlx4_ib" + +#ifdef CONFIG_MLX4_DEBUG +extern int mlx4_ib_debug_level; + +#define mlx4_ib_dbg(format, arg...) \ + do { \ + if (mlx4_ib_debug_level & 1) \ + printk(KERN_DEBUG "<" MLX4_IB_DRV_NAME "> %s: " format "\n",\ + __func__, ## arg); \ + } while (0) + +#else /* CONFIG_MLX4_DEBUG */ + +#define mlx4_ib_dbg(format, arg...) do {} while (0) + +#endif /* CONFIG_MLX4_DEBUG */ +/*module param to indicates if SM assigns the alias_GUID*/ +extern int mlx4_ib_sm_guid_assign; + +/*module param for generating even GUIDs, Oracle only*/ +extern int mlx4_ib_guid_gen_magic; + +extern int mlx4_ib_gids_per_func; + +enum { + MLX4_IB_SQ_MIN_WQE_SHIFT = 6 +}; + +#define MLX4_IB_SQ_HEADROOM(shift) ((2048 >> (shift)) + 1) +#define MLX4_IB_SQ_MAX_SPARE (MLX4_IB_SQ_HEADROOM(MLX4_IB_SQ_MIN_WQE_SHIFT)) + + +#define mlx4_ib_warn(ibdev, format, arg...) \ + dev_warn((ibdev)->dma_device, "mlx4_ib: " format, ## arg) + +#define ACT_GID_INDEX(mlx4_dev_ptr, func_gid_idx, func) \ + ((mlx4_dev_ptr->sr_iov + 1) * func_gid_idx + func) +#define ACT_GID_TO_SLAVE_GID(mlx4_dev_ptr, act_gid_idx) \ + (act_gid_idx / (mlx4_dev_ptr->sr_iov + 1)) +#define GID_INDEX_INVALID 0xFFFF + struct mlx4_ib_ucontext { struct ib_ucontext ibucontext; struct mlx4_uar uar; @@ -56,6 +102,18 @@ struct mlx4_ib_pd { u32 pdn; }; +struct mlx4_ib_shpd { + struct ib_shpd ibshpd; + u32 pdn; +}; + +struct mlx4_ib_xrcd { + struct ib_xrcd ibxrcd; + u32 xrcdn; + struct ib_pd *pd; + struct ib_cq *cq; +}; + struct mlx4_ib_cq_buf { struct mlx4_buf buf; struct mlx4_mtt mtt; @@ -110,15 +168,54 @@ struct mlx4_ib_wq { enum mlx4_ib_qp_flags { MLX4_IB_QP_LSO = 1 << 0, MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 1, + MLX4_IB_XRC_RCV = 1 << 2, + MLX4_IB_QP_TUNNEL = 1 << 30, + MLX4_IB_SRIOV_SQP = 1 << 31, }; -struct mlx4_ib_gid_entry { +struct gid_entry { struct list_head list; union ib_gid gid; int added; u8 port; }; +enum mlx4_ib_qp_type { + /* + * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries + * here (and in that order) since the MAD layer uses them as + * indices into a 2-entry table. 
+ */ + MLX4_IB_QPT_SMI = IB_QPT_SMI, + MLX4_IB_QPT_GSI = IB_QPT_GSI, + + MLX4_IB_QPT_RC = IB_QPT_RC, + MLX4_IB_QPT_UC = IB_QPT_UC, + MLX4_IB_QPT_UD = IB_QPT_UD, + MLX4_IB_QPT_XRC = IB_QPT_XRC, + MLX4_IB_QPT_RAW_IPV6 = IB_QPT_RAW_IPV6, + MLX4_IB_QPT_RAW_ETY = IB_QPT_RAW_ETY, + + MLX4_IB_QPT_PROXY_SMI = 100, + MLX4_IB_QPT_PROXY_GSI, + MLX4_IB_QPT_TUN_SMI, + MLX4_IB_QPT_TUN_GSI, +}; + +struct mlx4_rcv_tunnel_hdr { + u32 src_qp; + u32 wc_flags; + u16 pkey_index; + u16 slid; + u8 sl; + u8 reserved[5]; /* align to 16 bytes */ +}; + +struct mlx4_ib_proxy_sqp_hdr { + struct ib_grh grh; + struct mlx4_rcv_tunnel_hdr tun; +} __attribute__ ((packed)); + struct mlx4_ib_qp { struct ib_qp ibqp; struct mlx4_qp mqp; @@ -133,12 +230,14 @@ struct mlx4_ib_qp { int sq_max_wqes_per_wr; int sq_spare_wqes; struct mlx4_ib_wq sq; - + enum mlx4_ib_qp_type mlx4_ib_qp_type; struct ib_umem *umem; struct mlx4_mtt mtt; int buf_size; struct mutex mutex; u32 flags; + struct list_head xrc_reg_list; + u16 xrcdn; u8 port; u8 alt_port; u8 atomic_rd_en; @@ -147,6 +246,9 @@ struct mlx4_ib_qp { u8 state; int mlx_type; struct list_head gid_list; + struct mlx4_ib_buf *sqp_proxy_rcv; + int max_inline_data; + struct mlx4_bf bf; }; struct mlx4_ib_srq { @@ -164,24 +266,218 @@ struct mlx4_ib_srq { struct mutex mutex; }; +/* source info for sriov forwarding */ +struct mlx4_ib_ah_ext { + u16 slid; + u8 sgid[16]; + u32 sqpn; +}; + struct mlx4_ib_ah { struct ib_ah ibah; union mlx4_ext_av av; + struct mlx4_ib_ah_ext *ex; + u8 gsi_demux_lb; +}; + +enum { + MLX4_NUM_TUNNEL_BUFS = 256, + MLX4_SLAVE_ID_GID_OFFSET = 13, + MLX4_SLAVE_ID_NODE_GUID_OFFSET = 5 +}; + +struct mlx4_ib_tunnel_header { + struct mlx4_av av; + u32 remote_qpn; + u32 qkey; + u16 pkey_index; + u8 reserved[6]; /* align to 8 bytes */ +}; + +struct mlx4_ib_buf { + void *addr; + dma_addr_t map; +}; + +#define MLX4_MAX_NUM_GIDS 128 + +/****************************************/ +/* alias guid support */ +/****************************************/ +#define NUM_PORT_ALIAS_GUID 2 +#define NUM_ALIAS_GUID_IN_REC 8 +#define NUM_ALIAS_GUID_REC_IN_PORT 16 +#define GUID_REC_SIZE 8 +#define NUM_ALIAS_GUID_PER_PORT 128 + +enum mlx4_guid_alias_rec_status { + MLX4_GUID_INFO_STATUS_IDLE, + MLX4_GUID_INFO_STATUS_SET, + MLX4_GUID_INFO_STATUS_PENDING, +}; + +enum mlx4_guid_alias_rec_ownership { + MLX4_GUID_DRIVER_ASSIGN, + MLX4_GUID_SYSADMIN_ASSIGN, + MLX4_GUID_NONE_ASSIGN, /*init state of each record*/ +}; + +struct mlx4_sriov_alias_guid_info_rec_det { + __be64 all_recs[NUM_ALIAS_GUID_IN_REC]; + ib_sa_comp_mask guid_indexes; /*indicates which of the 8 records are valid*/ + enum mlx4_guid_alias_rec_status status; /*indicates the administrative status of the record.*/ + u8 method; /*set or delete*/ + enum mlx4_guid_alias_rec_ownership ownership; /*indicates who assigned that alias_guid record*/ +}; + +struct mlx4_sriov_alias_guid_port_rec_det { + struct mlx4_sriov_alias_guid_info_rec_det all_rec_per_port[NUM_ALIAS_GUID_REC_IN_PORT]; + struct workqueue_struct *wq; + struct delayed_work alias_guid_work; + u8 port; + struct mlx4_sriov_alias_guid *parent; + struct list_head cb_list; +}; + +struct mlx4_sriov_alias_guid { + struct mlx4_sriov_alias_guid_port_rec_det ports_guid[MLX4_MAX_PORTS]; + spinlock_t ag_work_lock; + struct ib_sa_client sa_client; +}; + +struct mlx4_ib_demux_work { + struct work_struct work; + struct mlx4_ib_dev *dev; + int slave; + int do_init; + u8 port; + +}; + +struct mlx4_ib_tun_tx_buf { + struct mlx4_ib_buf buf; + struct ib_ah *ah; +}; + +struct mlx4_ib_demux_pv_qp { + struct 
ib_qp *qp; + enum ib_qp_type proxy_qpt; + struct mlx4_ib_buf *ring; + struct mlx4_ib_tun_tx_buf *tx_ring; + spinlock_t tx_lock; + unsigned tx_ix_head; + unsigned tx_ix_tail; +}; + +enum mlx4_ib_demux_pv_state { + DEMUX_PV_STATE_DOWN, + DEMUX_PV_STATE_STARTING, + DEMUX_PV_STATE_ACTIVE, + DEMUX_PV_STATE_DOWNING, +}; + +struct mlx4_ib_demux_pv_ctx { + int port; + int slave; + enum mlx4_ib_demux_pv_state state; + int has_smi; + struct ib_device *ib_dev; + struct ib_cq *cq; + struct ib_pd *pd; + struct ib_mr *mr; + struct work_struct work; + struct workqueue_struct *wq; + struct mlx4_ib_demux_pv_qp qp[2]; +}; + +struct mlx4_ib_demux_ctx { + struct ib_device *ib_dev; + int port; + struct workqueue_struct *wq; + struct workqueue_struct *ud_wq; + spinlock_t ud_lock; + __be64 subnet_prefix; + __be64 guid_cache[MLX4_MAX_NUM_GIDS]; + u8 gid_id_base; + + atomic_t tid; + + struct mlx4_ib_dev *dev; + /* the following lock protects both mcg_table and mcg_mgid0_list */ + struct mutex mcg_table_lock; + struct rb_root mcg_table; + struct list_head mcg_mgid0_list; + struct workqueue_struct *mcg_wq; + struct mlx4_ib_demux_pv_ctx **tun; + int flushing; /* flushing the work queue */ +}; + +struct mlx4_ib_sriov { + struct mlx4_ib_demux_ctx demux[MLX4_MAX_PORTS]; + u16 local_lid[MLX4_MAX_PORTS]; + struct mlx4_sriov_alias_guid alias_guid; + struct list_head cm_list; + spinlock_t id_map_lock; + struct rb_root sl_id_map; + struct idr pv_id_table; + struct mlx4_ib_demux_pv_ctx *sqps[MLX4_MAX_PORTS]; + /*when using that spinlock you should use "irq" because it may be called + from interrupt context.*/ + spinlock_t going_down_lock; + int is_going_down; }; struct mlx4_ib_iboe { spinlock_t lock; struct net_device *netdevs[MLX4_MAX_PORTS]; struct notifier_block nb; - union ib_gid gid_table[MLX4_MAX_PORTS][128]; + union ib_gid gid_table[MLX4_MAX_PORTS][MLX4_MAX_NUM_GIDS]; +}; + +struct pkey_mgt { + u8 phys2virt_pkey[MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; + u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; + u16 phys_pkey_cache[MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; + struct list_head pkey_port_list[MLX4_MFUNC_MAX]; + struct kobject *device_parent[MLX4_MFUNC_MAX]; +}; + +struct mlx4_ib_iov_sysfs_attr { + void *ctx; + struct kobject *kobj; + unsigned long data; + u32 entry_num; + char name[15]; + struct device_attribute dentry; + struct device *dev; +}; + +struct mlx4_ib_iov_sysfs_attr_ar { + struct mlx4_ib_iov_sysfs_attr dentries[3 * NUM_ALIAS_GUID_PER_PORT + 1]; +}; + +struct mlx4_ib_iov_port { + char name[100]; + u8 num; + struct mlx4_ib_dev *dev; + struct list_head list; + /* 3 = 1 for admin_alias + 1 gids + 1 gid is assigned to which vf */ + struct mlx4_ib_iov_sysfs_attr_ar *dentr_ar; + struct ib_port_attr attr; + struct kobject *cur_port; + struct kobject *admin_alias_parent; + struct kobject *gids_parent; + struct kobject *pkeys_parent; + struct kobject *mcgs_parent; + struct mlx4_ib_iov_sysfs_attr mcg_dentry; }; +struct mlx4_ib_dev; + struct mlx4_ib_dev { struct ib_device ib_dev; struct mlx4_dev *dev; int num_ports; - void __iomem *uar_map; - struct mlx4_uar priv_uar; u32 priv_pdn; MLX4_DECLARE_DOORBELL_LOCK(uar_lock); @@ -189,12 +485,42 @@ struct mlx4_ib_dev { struct ib_mad_agent *send_agent[MLX4_MAX_PORTS][2]; struct ib_ah *sm_ah[MLX4_MAX_PORTS]; spinlock_t sm_lock; + struct mlx4_ib_sriov sriov; struct mutex cap_mask_mutex; - bool ib_active; + struct mutex xrc_reg_mutex; + int ib_active; struct mlx4_ib_iboe iboe; + int counters[MLX4_MAX_PORTS]; + /*iov sysfs support*/ + struct kobject 
*iov_parent; + struct kobject *ports_parent; + struct kobject *dev_ports_parent[MLX4_MFUNC_MAX]; + struct mlx4_ib_iov_port iov_ports[MLX4_MAX_PORTS]; + struct pkey_mgt pkeys; + __be16 virt2phys_gids[MLX4_MAX_PORTS + 1][MLX4_MAX_NUM_GIDS]; +}; + +struct mlx4_ib_qp_tunnel_init_attr { + struct ib_qp_init_attr init_attr; + int slave; + enum ib_qp_type proxy_qp_type; + u8 port; }; +static inline int is_gid_idx_valid(u16 gid_index) +{ + return (gid_index != GID_INDEX_INVALID); +} + +static inline int slave_gid_index(struct mlx4_dev *dev, int index) +{ + if (index < 0 || index >= dev->gids_per_func) + return -EINVAL; + + return ACT_GID_INDEX(dev, index, dev->caps.function); +} + static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) { return container_of(ibdev, struct mlx4_ib_dev, ib_dev); @@ -210,6 +536,16 @@ static inline struct mlx4_ib_pd *to_mpd(struct ib_pd *ibpd) return container_of(ibpd, struct mlx4_ib_pd, ibpd); } +static inline struct mlx4_ib_shpd *to_mshpd(struct ib_shpd *ibshpd) +{ + return container_of(ibshpd, struct mlx4_ib_shpd, ibshpd); +} + +static inline struct mlx4_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd) +{ + return container_of(ibxrcd, struct mlx4_ib_xrcd, ibxrcd); +} + static inline struct mlx4_ib_cq *to_mcq(struct ib_cq *ibcq) { return container_of(ibcq, struct mlx4_ib_cq, ibcq); @@ -259,6 +595,14 @@ static inline struct mlx4_ib_ah *to_mah(struct ib_ah *ibah) return container_of(ibah, struct mlx4_ib_ah, ibah); } +int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev); +void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev); + +#if 0 +/* Disable this function as it is not in use */ +int mlx4_ib_set_slave_guids(struct ib_device *ibdev); +#endif + int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt, struct mlx4_db *db); void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db); @@ -294,6 +638,11 @@ int mlx4_ib_destroy_ah(struct ib_ah *ah); struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *init_attr, struct ib_udata *udata); +struct ib_srq *mlx4_ib_create_xrc_srq(struct ib_pd *pd, + struct ib_cq *xrc_cq, + struct ib_xrcd *xrcd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata); int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); int mlx4_ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); @@ -326,19 +675,46 @@ void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev); struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int mr_access_flags, struct ib_fmr_attr *fmr_attr); +int mlx4_ib_set_fmr_pd(struct ib_fmr *ibfmr, struct ib_pd *pd); int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages, u64 iova); int mlx4_ib_unmap_fmr(struct list_head *fmr_list); int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr); +int mlx4_ib_create_xrc_rcv_qp(struct ib_qp_init_attr *init_attr, + u32 *qp_num); +int mlx4_ib_modify_xrc_rcv_qp(struct ib_xrcd *xrcd, u32 qp_num, + struct ib_qp_attr *attr, int attr_mask); +int mlx4_ib_query_xrc_rcv_qp(struct ib_xrcd *xrcd, u32 qp_num, + struct ib_qp_attr *attr, int attr_mask, + struct ib_qp_init_attr *init_attr); +int mlx4_ib_reg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num); +int mlx4_ib_unreg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num); + +int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, + struct ib_port_attr *props); int mlx4_ib_resolve_grh(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah_attr, u8 *mac, int *is_mcast, u8 port); +int 
mlx4_ib_get_eth_l2_addr(struct ib_device *device, u8 port, union ib_gid *dgid, + int sgid_idx, u8 *mac, u16 *vlan_id); + +int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx); +void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq); +void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave); +int mlx4_ib_mcg_init(void); +void mlx4_ib_mcg_destroy(void); +int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *sa_mad); +int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave, + struct ib_sa_mad *mad); +ssize_t print_mcg_table(struct mlx4_ib_demux_ctx *ctx, char *page); + static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah) { u8 port = be32_to_cpu(ah->av.ib.port_pd) >> 24 & 3; - if (rdma_port_get_link_layer(ah->ibah.device, port) == IB_LINK_LAYER_ETHERNET) + if (rdma_port_link_layer(ah->ibah.device, port) == IB_LINK_LAYER_ETHERNET) return 1; return !!(ah->av.ib.g_slid & 0x80); @@ -347,4 +723,33 @@ static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah) int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, union ib_gid *gid); +int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid); + +int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave, + struct ib_mad *mad); + +int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id, + struct ib_mad *mad); + +void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev); +void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave_id); +int mlx4_request_pkey_table_update(struct mlx4_ib_dev *dev, u8 port); +int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey); +u64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx); +int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr); +void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr); +void mlx4_ib_tunnels_update_work(struct work_struct *work); +int mlx4_ib_tunnels_update(struct mlx4_ib_dev *dev, int slave, int port, + int do_init, int from_wq); +int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type qpt, struct ib_wc *wc, + struct ib_grh *grh, struct ib_mad *mad); +int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, + enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn, + u32 qkey, struct ib_ah_attr *attr, struct ib_mad *mad); +void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl); +u16 mlx4_ib_get_virt2phys_gid(struct mlx4_ib_dev *dev, u8 port, u8 gid_index); + #endif /* MLX4_IB_H */ diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index dca55b19a6f19..2d9608ef53d13 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -31,8 +31,6 @@ * SOFTWARE. 
*/ -#include - #include "mlx4_ib.h" static u32 convert_access(int acc) @@ -49,7 +47,7 @@ struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc) struct mlx4_ib_mr *mr; int err; - mr = kmalloc(sizeof *mr, GFP_KERNEL); + mr = kzalloc(sizeof *mr, GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); @@ -121,6 +119,70 @@ out: return err; } +static int handle_hugetlb_user_mr(struct ib_pd *pd, struct mlx4_ib_mr *mr, + u64 start, u64 virt_addr, int access_flags) +{ +#if defined(CONFIG_HUGETLB_PAGE) && !defined(__powerpc__) && !defined(__ia64__) + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct ib_umem_chunk *chunk; + unsigned dsize; + dma_addr_t daddr; + unsigned cur_size = 0; + dma_addr_t uninitialized_var(cur_addr); + int n; + struct ib_umem *umem = mr->umem; + u64 *arr; + int err = 0; + int i; + int j = 0; + int off = start & (HPAGE_SIZE - 1); + + n = DIV_ROUND_UP(off + umem->length, HPAGE_SIZE); + arr = kmalloc(n * sizeof *arr, GFP_KERNEL); + if (!arr) + return -ENOMEM; + + list_for_each_entry(chunk, &umem->chunk_list, list) + for (i = 0; i < chunk->nmap; ++i) { + daddr = sg_dma_address(&chunk->page_list[i]); + dsize = sg_dma_len(&chunk->page_list[i]); + if (!cur_size) { + cur_addr = daddr; + cur_size = dsize; + } else if (cur_addr + cur_size != daddr) { + err = -EINVAL; + goto out; + } else + cur_size += dsize; + + if (cur_size > HPAGE_SIZE) { + err = -EINVAL; + goto out; + } else if (cur_size == HPAGE_SIZE) { + cur_size = 0; + arr[j++] = cur_addr; + } + } + + if (cur_size) { + arr[j++] = cur_addr; + } + + err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, umem->length, + convert_access(access_flags), n, HPAGE_SHIFT, &mr->mmr); + if (err) + goto out; + + err = mlx4_write_mtt(dev->dev, &mr->mmr.mtt, 0, n, arr); + +out: + kfree(arr); + return err; +#else + return -ENOSYS; +#endif +} + struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata) @@ -131,7 +193,7 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, int err; int n; - mr = kmalloc(sizeof *mr, GFP_KERNEL); + mr = kzalloc(sizeof *mr, GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); @@ -142,17 +204,20 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, goto err_free; } - n = ib_umem_page_count(mr->umem); - shift = ilog2(mr->umem->page_size); + if (!mr->umem->hugetlb || + handle_hugetlb_user_mr(pd, mr, start, virt_addr, access_flags)) { + n = ib_umem_page_count(mr->umem); + shift = ilog2(mr->umem->page_size); - err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length, - convert_access(access_flags), n, shift, &mr->mmr); - if (err) - goto err_umem; + err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length, + convert_access(access_flags), n, shift, &mr->mmr); + if (err) + goto err_umem; - err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem); - if (err) - goto err_mr; + err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem); + if (err) + goto err_mr; + } err = mlx4_mr_enable(dev->dev, &mr->mmr); if (err) @@ -193,7 +258,7 @@ struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd, struct mlx4_ib_mr *mr; int err; - mr = kmalloc(sizeof *mr, GFP_KERNEL); + mr = kzalloc(sizeof *mr, GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); @@ -226,7 +291,7 @@ struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device struct mlx4_ib_fast_reg_page_list *mfrpl; int size = page_list_len * sizeof (u64); - if (page_list_len > MLX4_MAX_FAST_REG_PAGES) + if (size > PAGE_SIZE) return 
ERR_PTR(-EINVAL); mfrpl = kmalloc(sizeof *mfrpl, GFP_KERNEL); @@ -240,7 +305,7 @@ struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device mfrpl->mapped_page_list = dma_alloc_coherent(&dev->dev->pdev->dev, size, &mfrpl->map, GFP_KERNEL); - if (!mfrpl->mapped_page_list) + if (!mfrpl->ibfrpl.page_list) goto err_free; WARN_ON(mfrpl->map & 0x3f); @@ -272,10 +337,12 @@ struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc, struct mlx4_ib_fmr *fmr; int err = -ENOMEM; - fmr = kmalloc(sizeof *fmr, GFP_KERNEL); + fmr = kzalloc(sizeof *fmr, GFP_KERNEL); if (!fmr) return ERR_PTR(-ENOMEM); + fmr->mfmr.mr.flags |= MLX4_MR_FLAG_FMR; + err = mlx4_fmr_alloc(dev->dev, to_mpd(pd)->pdn, convert_access(acc), fmr_attr->max_pages, fmr_attr->max_maps, fmr_attr->page_shift, &fmr->mfmr); @@ -299,6 +366,14 @@ err_free: return ERR_PTR(err); } +int mlx4_ib_set_fmr_pd(struct ib_fmr *ibfmr, struct ib_pd *pd) +{ + struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); + u32 pdn = to_mpd(pd)->pdn; + + return mlx4_set_fmr_pd(&ifmr->mfmr, pdn); +} + int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages, u64 iova) { @@ -312,7 +387,6 @@ int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int mlx4_ib_unmap_fmr(struct list_head *fmr_list) { struct ib_fmr *ibfmr; - int err; struct mlx4_dev *mdev = NULL; list_for_each_entry(ibfmr, fmr_list, list) { @@ -330,20 +404,8 @@ int mlx4_ib_unmap_fmr(struct list_head *fmr_list) mlx4_fmr_unmap(mdev, &ifmr->mfmr, &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey); } - /* - * Make sure all MPT status updates are visible before issuing - * SYNC_TPT firmware command. - */ - wmb(); - - err = mlx4_SYNC_TPT(mdev); - if (err) - printk(KERN_WARNING "mlx4_ib: SYNC_TPT error %d when " - "unmapping FMRs\n", err); - return 0; } - int mlx4_ib_fmr_dealloc(struct ib_fmr *ibfmr) { struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); @@ -351,7 +413,6 @@ int mlx4_ib_fmr_dealloc(struct ib_fmr *ibfmr) int err; err = mlx4_fmr_free(dev->dev, &ifmr->mfmr); - if (!err) kfree(ifmr); diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 23c04ff6519b1..fffb9c06db979 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -32,7 +32,6 @@ */ #include -#include #include #include @@ -40,6 +39,7 @@ #include #include +#include #include "mlx4_ib.h" #include "user.h" @@ -52,22 +52,25 @@ enum { MLX4_IB_DEFAULT_SCHED_QUEUE = 0x83, MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f, MLX4_IB_LINK_TYPE_IB = 0, - MLX4_IB_LINK_TYPE_ETH = 1 + MLX4_IB_LINK_TYPE_ETH = 1, }; enum { /* - * Largest possible UD header: send with GRH and immediate - * data plus 18 bytes for an Ethernet header with VLAN/802.1Q - * tag. (LRH would only use 8 bytes, so Ethernet is the - * biggest case) + * Largest possible UD header: send with GRH and immediate data. 
+ * 4 bytes added to accommodate for eth header instead of lrh */ - MLX4_IB_UD_HEADER_SIZE = 82, - MLX4_IB_LSO_HEADER_SPARE = 128, + MLX4_IB_UD_HEADER_SIZE = 76, + MLX4_IB_MAX_RAW_ETY_HDR_SIZE = 12 }; enum { - MLX4_IB_IBOE_ETHERTYPE = 0x8915 + MLX4_IBOE_ETHERTYPE = 0x8915 +}; + +struct mlx4_ib_xrc_reg_entry { + struct list_head list; + void *context; }; struct mlx4_ib_sqp { @@ -80,8 +83,7 @@ struct mlx4_ib_sqp { }; enum { - MLX4_IB_MIN_SQ_STRIDE = 6, - MLX4_IB_CACHE_LINE_SIZE = 64, + MLX4_IB_MIN_SQ_STRIDE = 6 }; static const __be32 mlx4_ib_opcode[] = { @@ -100,21 +102,56 @@ static const __be32 mlx4_ib_opcode[] = { [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA), }; +#ifndef wc_wmb + #if defined(__i386__) + #define wc_wmb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory") + #elif defined(__x86_64__) + #define wc_wmb() asm volatile("sfence" ::: "memory") + #elif defined(__ia64__) + #define wc_wmb() asm volatile("fwb" ::: "memory") + #else + #define wc_wmb() wmb() + #endif +#endif + + static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp) { return container_of(mqp, struct mlx4_ib_sqp, qp); } +static int is_tunnel_qp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + if (!mlx4_is_mfunc(dev->dev)) + return 0; + + return qp->mqp.qpn >= dev->dev->caps.tunnel_qpn && + qp->mqp.qpn < dev->dev->caps.tunnel_qpn + + 8 + 16 * MLX4_MFUNC_MAX; +} + static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) { - return qp->mqp.qpn >= dev->dev->caps.sqp_start && - qp->mqp.qpn <= dev->dev->caps.sqp_start + 3; + return ((mlx4_is_master(dev->dev) && + qp->mqp.qpn >= dev->dev->caps.tunnel_qpn && + qp->mqp.qpn <= dev->dev->caps.tunnel_qpn + 3) || + (qp->mqp.qpn >= dev->dev->caps.sqp_start && + qp->mqp.qpn <= dev->dev->caps.sqp_start + 3)); } +/* used for INIT/CLOSE port logic */ static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) { - return qp->mqp.qpn >= dev->dev->caps.sqp_start && - qp->mqp.qpn <= dev->dev->caps.sqp_start + 1; + int qp0; + + /* qp0 is either the proxy qp0, or the real qp0 */ + qp0 = (qp->mqp.qpn >= dev->dev->caps.sqp_start && + qp->mqp.qpn <= dev->dev->caps.sqp_start + 1) || + (mlx4_is_mfunc(dev->dev) && + qp->mqp.qpn >= dev->dev->caps.tunnel_qpn && + qp->mqp.qpn <= dev->dev->caps.tunnel_qpn + 1); + + return qp0; } static void *get_wqe(struct mlx4_ib_qp *qp, int offset) @@ -222,14 +259,15 @@ static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind) static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) { struct ib_event event; - struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; + struct mlx4_ib_qp *mqp = to_mibqp(qp); + struct ib_qp *ibqp = &mqp->ibqp; + struct mlx4_ib_xrc_reg_entry *ctx_entry; if (type == MLX4_EVENT_TYPE_PATH_MIG) to_mibqp(qp)->port = to_mibqp(qp)->alt_port; if (ibqp->event_handler) { event.device = ibqp->device; - event.element.qp = ibqp; switch (type) { case MLX4_EVENT_TYPE_PATH_MIG: event.event = IB_EVENT_PATH_MIG; @@ -261,11 +299,20 @@ static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) return; } + if (unlikely(ibqp->qp_type == IB_QPT_XRC && + mqp->flags & MLX4_IB_XRC_RCV)) { + event.event |= IB_XRC_QP_EVENT_FLAG; + event.element.xrc_qp_num = ibqp->qp_num; + list_for_each_entry(ctx_entry, &mqp->xrc_reg_list, list) + ibqp->event_handler(&event, ctx_entry->context); + return; + } + event.element.qp = ibqp; ibqp->event_handler(&event, ibqp->qp_context); } } -static int send_wqe_overhead(enum ib_qp_type type, u32 flags) +static int send_wqe_overhead(enum mlx4_ib_qp_type type, 
u32 flags) { /* * UD WQEs must have a datagram segment. @@ -274,19 +321,29 @@ static int send_wqe_overhead(enum ib_qp_type type, u32 flags) * header and space for the ICRC). */ switch (type) { - case IB_QPT_UD: + case MLX4_IB_QPT_UD: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_datagram_seg) + - ((flags & MLX4_IB_QP_LSO) ? MLX4_IB_LSO_HEADER_SPARE : 0); - case IB_QPT_UC: + ((flags & MLX4_IB_QP_LSO) ? 128 : 0); + case MLX4_IB_QPT_PROXY_SMI: + case MLX4_IB_QPT_PROXY_GSI: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg) + 64; + case MLX4_IB_QPT_TUN_SMI: + case MLX4_IB_QPT_TUN_GSI: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg); + + case MLX4_IB_QPT_UC: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_raddr_seg); - case IB_QPT_RC: + case MLX4_IB_QPT_XRC: + case MLX4_IB_QPT_RC: return sizeof (struct mlx4_wqe_ctrl_seg) + sizeof (struct mlx4_wqe_atomic_seg) + sizeof (struct mlx4_wqe_raddr_seg); - case IB_QPT_SMI: - case IB_QPT_GSI: + case MLX4_IB_QPT_SMI: + case MLX4_IB_QPT_GSI: return sizeof (struct mlx4_wqe_ctrl_seg) + ALIGN(MLX4_IB_UD_HEADER_SIZE + DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE, @@ -296,60 +353,97 @@ static int send_wqe_overhead(enum ib_qp_type type, u32 flags) ALIGN(4 + sizeof (struct mlx4_wqe_inline_seg), sizeof (struct mlx4_wqe_data_seg)); + case MLX4_IB_QPT_RAW_ETY: + return sizeof(struct mlx4_wqe_ctrl_seg) + + ALIGN(MLX4_IB_MAX_RAW_ETY_HDR_SIZE + + sizeof(struct mlx4_wqe_inline_seg), + sizeof(struct mlx4_wqe_data_seg)); + default: return sizeof (struct mlx4_wqe_ctrl_seg); } } static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, - int is_user, int has_srq, struct mlx4_ib_qp *qp) + int is_user, int has_srq_or_is_xrc, struct mlx4_ib_qp *qp) { /* Sanity check RQ size before proceeding */ - if (cap->max_recv_wr > dev->dev->caps.max_wqes || - cap->max_recv_sge > dev->dev->caps.max_rq_sg) + if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE || + cap->max_recv_sge > + min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)) { + mlx4_ib_dbg("Requested RQ size (sge or wr) too large"); return -EINVAL; + } - if (has_srq) { + if (has_srq_or_is_xrc) { /* QPs attached to an SRQ should have no RQ */ - if (cap->max_recv_wr) + if (cap->max_recv_wr) { + mlx4_ib_dbg("non-zero RQ size for QP using SRQ"); return -EINVAL; + } qp->rq.wqe_cnt = qp->rq.max_gs = 0; } else { /* HW requires >= 1 RQ entry with >= 1 gather entry */ - if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) + if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) { + mlx4_ib_dbg("user QP RQ has 0 wr's or 0 sge's " + "(wr: 0x%x, sge: 0x%x)", cap->max_recv_wr, + cap->max_recv_sge); return -EINVAL; + } qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, cap->max_recv_wr)); qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge)); qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg)); } - cap->max_recv_wr = qp->rq.max_post = qp->rq.wqe_cnt; - cap->max_recv_sge = qp->rq.max_gs; + /* leave userspace return values as they were, so as not to break ABI */ + if (is_user) { + cap->max_recv_wr = qp->rq.max_post = qp->rq.wqe_cnt; + cap->max_recv_sge = qp->rq.max_gs; + } else { + cap->max_recv_wr = qp->rq.max_post = + min_t(int, dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt); + cap->max_recv_sge = min(qp->rq.max_gs, + min_t(int, dev->dev->caps.max_sq_sg, + dev->dev->caps.max_rq_sg)); + } + /* We don't support inline sends for kernel QPs (yet) */ + return 
0; } static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, - enum ib_qp_type type, struct mlx4_ib_qp *qp) + enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp) { int s; /* Sanity check SQ size before proceeding */ - if (cap->max_send_wr > dev->dev->caps.max_wqes || - cap->max_send_sge > dev->dev->caps.max_sq_sg || + if (cap->max_send_wr > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) || + cap->max_send_sge > + min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) || cap->max_inline_data + send_wqe_overhead(type, qp->flags) + - sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz) + sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz) { + mlx4_ib_dbg("Requested SQ resources exceed device maxima"); return -EINVAL; + } /* * For MLX transport we need 2 extra S/G entries: * one for the header and one for the checksum at the end */ if ((type == IB_QPT_SMI || type == IB_QPT_GSI) && - cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg) + cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg) { + mlx4_ib_dbg("No space for SQP hdr/csum sge's"); + return -EINVAL; + } + + if (type == IB_QPT_RAW_ETY && + cap->max_send_sge + 1 > dev->dev->caps.max_sq_sg) { + mlx4_ib_dbg("No space for RAW ETY hdr"); return -EINVAL; + } s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg), cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) + @@ -368,7 +462,7 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, * anymore, so we do this only if selective signaling is off. * * Further, on 32-bit platforms, we can't use vmap() to make - * the QP buffer virtually contiguous. Thus we have to use + * the QP buffer virtually contigious. Thus we have to use * constant-sized WRs to make sure a WR is always fully within * a single page-sized chunk. 
* @@ -391,7 +485,9 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, */ if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC && qp->sq_signal_bits && BITS_PER_LONG == 64 && - type != IB_QPT_SMI && type != IB_QPT_GSI) + type != IB_QPT_SMI && type != IB_QPT_GSI && type != IB_QPT_RAW_ETY && + type != MLX4_IB_QPT_PROXY_SMI && type != MLX4_IB_QPT_PROXY_GSI && + type != MLX4_IB_QPT_TUN_SMI) qp->sq.wqe_shift = ilog2(64); else qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s)); @@ -417,7 +513,7 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, ++qp->sq.wqe_shift; } - qp->sq.max_gs = (min(dev->dev->caps.max_sq_desc_sz, + qp->sq.max_gs = (min_t(int, dev->dev->caps.max_sq_desc_sz, (qp->sq_max_wqes_per_wr << qp->sq.wqe_shift)) - send_wqe_overhead(type, qp->flags)) / sizeof (struct mlx4_wqe_data_seg); @@ -434,11 +530,10 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, cap->max_send_wr = qp->sq.max_post = (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr; - cap->max_send_sge = min(qp->sq.max_gs, + cap->max_send_sge = min_t(int, qp->sq.max_gs, min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)); - /* We don't support inline sends for kernel QPs (yet) */ - cap->max_inline_data = 0; + qp->max_inline_data = cap->max_inline_data; return 0; } @@ -451,8 +546,10 @@ static int set_user_sq_size(struct mlx4_ib_dev *dev, if ((1 << ucmd->log_sq_bb_count) > dev->dev->caps.max_wqes || ucmd->log_sq_stride > ilog2(roundup_pow_of_two(dev->dev->caps.max_sq_desc_sz)) || - ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE) + ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE) { + mlx4_ib_dbg("Requested max wqes or wqe stride exceeds max"); return -EINVAL; + } qp->sq.wqe_cnt = 1 << ucmd->log_sq_bb_count; qp->sq.wqe_shift = ucmd->log_sq_stride; @@ -463,12 +560,111 @@ static int set_user_sq_size(struct mlx4_ib_dev *dev, return 0; } +static int alloc_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) +{ + int i; + + qp->sqp_proxy_rcv = + kmalloc(sizeof (struct mlx4_ib_buf) * qp->rq.wqe_cnt, + GFP_KERNEL); + if (!qp->sqp_proxy_rcv) + return -ENOMEM; + for (i = 0; i < qp->rq.wqe_cnt; i++) { + qp->sqp_proxy_rcv[i].addr = + kmalloc(sizeof (struct mlx4_ib_proxy_sqp_hdr), + GFP_KERNEL); + if (!qp->sqp_proxy_rcv[i].addr) + goto err; + qp->sqp_proxy_rcv[i].map = + ib_dma_map_single(dev, qp->sqp_proxy_rcv[i].addr, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + } + return 0; + +err: + while (i > 0) { + --i; + ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + kfree(qp->sqp_proxy_rcv[i].addr); + } + kfree(qp->sqp_proxy_rcv); + qp->sqp_proxy_rcv = NULL; + return -ENOMEM; +} + +static void free_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) +{ + int i; + + for (i = 0; i < qp->rq.wqe_cnt; i++) { + ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map, + sizeof (struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + kfree(qp->sqp_proxy_rcv[i].addr); + } + kfree(qp->sqp_proxy_rcv); +} + static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, - struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp) + struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp) { int qpn; int err; + struct mlx4_ib_sqp *sqp; + struct mlx4_ib_qp *qp; + enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type; + + /* When tunneling special qps, we use a plain UD qp */ + if (sqpn) { + if (mlx4_is_mfunc(dev->dev) 
&& + (!dev->dev->caps.sqp_demux || + !(init_attr->create_flags & MLX4_IB_SRIOV_SQP))) + qp_type = init_attr->qp_type == IB_QPT_SMI ? + MLX4_IB_QPT_PROXY_SMI : MLX4_IB_QPT_PROXY_GSI; + qpn = sqpn; + /* add extra sg entry for tunneling */ + init_attr->cap.max_recv_sge++; + } else if (init_attr->create_flags & MLX4_IB_QP_TUNNEL) { + struct mlx4_ib_qp_tunnel_init_attr *tnl_init = + container_of(init_attr, + struct mlx4_ib_qp_tunnel_init_attr, init_attr); + if (tnl_init->proxy_qp_type != IB_QPT_SMI && + tnl_init->proxy_qp_type != IB_QPT_GSI) + return -EINVAL; + qp_type = (tnl_init->proxy_qp_type == IB_QPT_SMI) ? + MLX4_IB_QPT_TUN_SMI : MLX4_IB_QPT_TUN_GSI; + qpn = dev->dev->caps.tunnel_qpn + 8 * (1 + MLX4_MFUNC_MAX + tnl_init->slave) + + tnl_init->proxy_qp_type * 2 + tnl_init->port - 1; + sqpn = qpn; + } + + if (!*caller_qp) { + if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI || + qp_type == MLX4_IB_QPT_PROXY_SMI || qp_type == MLX4_IB_QPT_PROXY_GSI || + qp_type == MLX4_IB_QPT_TUN_SMI) { + sqp = kzalloc(sizeof(struct mlx4_ib_sqp), GFP_KERNEL); + if (!sqp) + return -ENOMEM; + qp = &sqp->qp; + } else { + qp = kzalloc(sizeof(struct mlx4_ib_qp), GFP_KERNEL); + if (!qp) + return -ENOMEM; + } + } else + qp = *caller_qp; + + qp->mlx4_ib_qp_type = qp_type; + + if (mlx4_is_mfunc (dev->dev) && + (qp_type == MLX4_IB_QPT_SMI || qp_type == MLX4_IB_QPT_GSI)) { + qpn -= 8; + sqpn -= 8; + } mutex_init(&qp->mutex); spin_lock_init(&qp->sq.lock); @@ -479,7 +675,8 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); - err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp); + err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, + !!init_attr->srq || !!init_attr->xrc_domain , qp); if (err) goto err; @@ -501,23 +698,31 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, qp->buf_size, 0, 0); if (IS_ERR(qp->umem)) { err = PTR_ERR(qp->umem); + mlx4_ib_dbg("ib_umem_get error (%d)", err); goto err; } err = mlx4_mtt_init(dev->dev, ib_umem_page_count(qp->umem), - ilog2(qp->umem->page_size), &qp->mtt); - if (err) + ilog2(qp->umem->page_size), &qp->mtt, + MLX4_MR_FLAG_NONE); + if (err) { + mlx4_ib_dbg("mlx4_mtt_init error (%d)", err); goto err_buf; + } err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem); - if (err) + if (err) { + mlx4_ib_dbg("mlx4_ib_umem_write_mtt error (%d)", err); goto err_mtt; + } - if (!init_attr->srq) { + if (!init_attr->srq && init_attr->qp_type != IB_QPT_XRC) { err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context), ucmd.db_addr, &qp->db); - if (err) + if (err) { + mlx4_ib_dbg("mlx4_ib_db_map_user error (%d)", err); goto err_mtt; + } } } else { qp->sq_no_prefetch = 0; @@ -528,11 +733,11 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) qp->flags |= MLX4_IB_QP_LSO; - err = set_kernel_sq_size(dev, &init_attr->cap, init_attr->qp_type, qp); + err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp); if (err) goto err; - if (!init_attr->srq) { + if (!init_attr->srq && init_attr->qp_type != IB_QPT_XRC) { err = mlx4_db_alloc(dev->dev, &qp->db, 0); if (err) goto err; @@ -540,19 +745,32 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, *qp->db.db = 0; } + if (qp->max_inline_data) { + err = mlx4_bf_alloc(dev->dev, &qp->bf); + if (err) { + mlx4_ib_dbg("failed to allocate blue flame register (%d)", err); + qp->bf.uar = 
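The tunnel QPN arithmetic above packs (slave, qp type, port) into a fixed layout after caps.tunnel_qpn: an 8-QP block for the PF's own proxy QP0/QP1 pairs, then one 8-QP block per slave. A minimal sketch of that layout, assuming MLX4_MFUNC_MAX == 64 and the SMI=0/GSI=1 encoding used by proxy_qp_type; all DEMO_ names are hypothetical.

/* Illustrative sketch only, not part of the patch.  Mirrors the tunnel-QP
 * number computation in create_qp_common() and the range test in
 * is_tunnel_qp(). */
#include <stdio.h>

#define DEMO_MFUNC_MAX  64          /* assumed value of MLX4_MFUNC_MAX */
enum { DEMO_QPT_SMI = 0, DEMO_QPT_GSI = 1 };

static int tunnel_qpn(int base, int slave, int qp_type, int port)
{
        /* mirrors: base + 8 * (1 + MLX4_MFUNC_MAX + slave) + type * 2 + port - 1 */
        return base + 8 * (1 + DEMO_MFUNC_MAX + slave) + qp_type * 2 + port - 1;
}

static int in_tunnel_range(int base, int qpn)
{
        /* mirrors is_tunnel_qp(): [base, base + 8 + 16 * MLX4_MFUNC_MAX) */
        return qpn >= base && qpn < base + 8 + 16 * DEMO_MFUNC_MAX;
}

int main(void)
{
        int base = 0x1000;          /* hypothetical caps.tunnel_qpn */
        int qpn = tunnel_qpn(base, /*slave*/ 3, DEMO_QPT_GSI, /*port*/ 2);

        printf("slave 3, GSI, port 2 -> qpn 0x%x (in range: %d)\n",
               qpn, in_tunnel_range(base, qpn));
        return 0;
}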
&dev->priv_uar; + } + } else + qp->bf.uar = &dev->priv_uar; + if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf)) { err = -ENOMEM; goto err_db; } err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift, - &qp->mtt); - if (err) + &qp->mtt, MLX4_MR_FLAG_NONE); + if (err) { + mlx4_ib_dbg("kernel qp mlx4_mtt_init error (%d)", err); goto err_buf; + } err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf); - if (err) + if (err) { + mlx4_ib_dbg("mlx4_buf_write_mtt error (%d)", err); goto err_mtt; + } qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), GFP_KERNEL); qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), GFP_KERNEL); @@ -564,17 +782,26 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, } if (sqpn) { - qpn = sqpn; + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) { + if (alloc_proxy_bufs(pd->device, qp)) { + err = -ENOMEM; + goto err_wrid; + } + } } else { err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn); if (err) - goto err_wrid; + goto err_proxy; } err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp); if (err) goto err_qpn; + if (init_attr->qp_type == IB_QPT_XRC) + qp->mqp.qpn |= (1 << 23); + /* * Hardware wants QPN written in big-endian order (after * shifting) for send doorbell. Precompute this value to save @@ -583,16 +810,19 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); qp->mqp.event = mlx4_ib_qp_event; - + if (!*caller_qp) + *caller_qp = qp; return 0; err_qpn: if (!sqpn) mlx4_qp_release_range(dev->dev, qpn, 1); - +err_proxy: + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) + free_proxy_bufs(pd->device, qp); err_wrid: if (pd->uobject) { - if (!init_attr->srq) + if (!init_attr->srq && init_attr->qp_type != IB_QPT_XRC) mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db); } else { @@ -601,7 +831,7 @@ err_wrid: } err_mtt: - mlx4_mtt_cleanup(dev->dev, &qp->mtt); + mlx4_mtt_cleanup(dev->dev, &qp->mtt, MLX4_MR_FLAG_NONE); err_buf: if (pd->uobject) @@ -610,10 +840,15 @@ err_buf: mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); err_db: - if (!pd->uobject && !init_attr->srq) + if (!pd->uobject && !init_attr->srq && init_attr->qp_type != IB_QPT_XRC) mlx4_db_free(dev->dev, &qp->db); + if (qp->max_inline_data) + mlx4_bf_free(dev->dev, &qp->bf); + err: + if (!*caller_qp) + kfree(qp); return err; } @@ -632,12 +867,10 @@ static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state) } static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq) - __acquires(&send_cq->lock) __acquires(&recv_cq->lock) { - if (send_cq == recv_cq) { + if (send_cq == recv_cq) spin_lock_irq(&send_cq->lock); - __acquire(&recv_cq->lock); - } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { spin_lock_irq(&send_cq->lock); spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); } else { @@ -647,12 +880,10 @@ static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv } static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq) - __releases(&send_cq->lock) __releases(&recv_cq->lock) { - if (send_cq == recv_cq) { - __release(&recv_cq->lock); + if (send_cq == recv_cq) spin_unlock_irq(&send_cq->lock); - } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { + else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { spin_unlock(&recv_cq->lock); spin_unlock_irq(&send_cq->lock); } else { @@ -663,7 +894,7 @@ static void 
mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *re static void del_gid_entries(struct mlx4_ib_qp *qp) { - struct mlx4_ib_gid_entry *ge, *tmp; + struct gid_entry *ge, *tmp; list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { list_del(&ge->list); @@ -700,21 +931,26 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, mlx4_qp_free(dev->dev, &qp->mqp); - if (!is_sqp(dev, qp)) + if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); - mlx4_mtt_cleanup(dev->dev, &qp->mtt); + mlx4_mtt_cleanup(dev->dev, &qp->mtt, MLX4_MR_FLAG_NONE); if (is_user) { - if (!qp->ibqp.srq) + if (!qp->ibqp.srq && qp->ibqp.qp_type != IB_QPT_XRC) mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context), &qp->db); ib_umem_release(qp->umem); } else { kfree(qp->sq.wrid); kfree(qp->rq.wrid); + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) + free_proxy_bufs(&dev->ib_dev, qp); mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); - if (!qp->ibqp.srq) + if (qp->max_inline_data) + mlx4_bf_free(dev->dev, &qp->bf); + if (!qp->ibqp.srq && qp->ibqp.qp_type != IB_QPT_XRC) mlx4_db_free(dev->dev, &qp->db); } @@ -726,63 +962,68 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(pd->device); - struct mlx4_ib_sqp *sqp; - struct mlx4_ib_qp *qp; + struct mlx4_ib_qp *qp = NULL; int err; /* - * We only support LSO and multicast loopback blocking, and + * We only support LSO, vendor flag1, and multicast loopback blocking, and * only for kernel UD QPs. */ - if (init_attr->create_flags & ~(IB_QP_CREATE_IPOIB_UD_LSO | - IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)) + if (init_attr->create_flags & ~(MLX4_IB_QP_LSO | + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK | + MLX4_IB_QP_TUNNEL | MLX4_IB_SRIOV_SQP)) return ERR_PTR(-EINVAL); if (init_attr->create_flags && - (pd->uobject || init_attr->qp_type != IB_QPT_UD)) + (pd->uobject || + ((init_attr->create_flags & ~MLX4_IB_SRIOV_SQP) && + init_attr->qp_type != IB_QPT_UD) || + ((init_attr->create_flags & MLX4_IB_SRIOV_SQP) && + init_attr->qp_type > IB_QPT_GSI))) return ERR_PTR(-EINVAL); switch (init_attr->qp_type) { + case IB_QPT_XRC: + if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) + return ERR_PTR(-ENOSYS); case IB_QPT_RC: case IB_QPT_UC: case IB_QPT_UD: { - qp = kzalloc(sizeof *qp, GFP_KERNEL); - if (!qp) - return ERR_PTR(-ENOMEM); - - err = create_qp_common(dev, pd, init_attr, udata, 0, qp); + err = create_qp_common(dev, pd, init_attr, udata, 0, &qp); if (err) { - kfree(qp); return ERR_PTR(err); } + if (init_attr->qp_type == IB_QPT_XRC) + qp->xrcdn = to_mxrcd(init_attr->xrc_domain)->xrcdn; + else + qp->xrcdn = 0; + qp->ibqp.qp_num = qp->mqp.qpn; break; } + case IB_QPT_RAW_ETY: + if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_RAW_ETY)) + return ERR_PTR(-ENOSYS); case IB_QPT_SMI: case IB_QPT_GSI: { /* Userspace is not allowed to create special QPs: */ - if (pd->uobject) + if (pd->uobject) { + mlx4_ib_dbg("Userspace is not allowed to create special QPs"); return ERR_PTR(-EINVAL); - - sqp = kzalloc(sizeof *sqp, GFP_KERNEL); - if (!sqp) - return ERR_PTR(-ENOMEM); - - qp = &sqp->qp; + } err = create_qp_common(dev, pd, init_attr, udata, dev->dev->caps.sqp_start + - (init_attr->qp_type == IB_QPT_SMI ? 0 : 2) + + (init_attr->qp_type == IB_QPT_RAW_ETY ? 4 : + (init_attr->qp_type == IB_QPT_SMI ? 
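The lock helpers above always take the CQ with the lower CQN first, so two paths that lock the same pair of CQs in opposite send/receive roles cannot deadlock, and only one lock is taken when both roles use the same CQ. A user-space pthread analogy of that ordering rule; the driver's spinlocks, IRQ disabling and sparse annotations are not modelled here.

/* Illustrative sketch only, not part of the patch. */
#include <pthread.h>

struct demo_cq {
        int cqn;                        /* stands in for mcq.cqn */
        pthread_mutex_t lock;           /* stands in for the CQ spinlock */
};

static void demo_lock_cqs(struct demo_cq *send_cq, struct demo_cq *recv_cq)
{
        if (send_cq == recv_cq) {
                pthread_mutex_lock(&send_cq->lock);
        } else if (send_cq->cqn < recv_cq->cqn) {
                pthread_mutex_lock(&send_cq->lock);
                pthread_mutex_lock(&recv_cq->lock);
        } else {
                pthread_mutex_lock(&recv_cq->lock);
                pthread_mutex_lock(&send_cq->lock);
        }
}

static void demo_unlock_cqs(struct demo_cq *send_cq, struct demo_cq *recv_cq)
{
        /* release in the reverse order of acquisition */
        if (send_cq == recv_cq) {
                pthread_mutex_unlock(&send_cq->lock);
        } else if (send_cq->cqn < recv_cq->cqn) {
                pthread_mutex_unlock(&recv_cq->lock);
                pthread_mutex_unlock(&send_cq->lock);
        } else {
                pthread_mutex_unlock(&send_cq->lock);
                pthread_mutex_unlock(&recv_cq->lock);
        }
}

int main(void)
{
        struct demo_cq a = { .cqn = 1, .lock = PTHREAD_MUTEX_INITIALIZER };
        struct demo_cq b = { .cqn = 2, .lock = PTHREAD_MUTEX_INITIALIZER };

        demo_lock_cqs(&a, &b);
        demo_unlock_cqs(&a, &b);
        return 0;
}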
0 : 2)) + init_attr->port_num - 1, - qp); - if (err) { - kfree(sqp); + &qp); + if (err) return ERR_PTR(err); - } qp->port = init_attr->port_num; qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1; @@ -790,7 +1031,8 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, break; } default: - /* Don't support raw QPs */ + mlx4_ib_dbg("Invalid QP type requested for create_qp (%d)", + init_attr->qp_type); return ERR_PTR(-EINVAL); } @@ -815,15 +1057,25 @@ int mlx4_ib_destroy_qp(struct ib_qp *qp) return 0; } -static int to_mlx4_st(enum ib_qp_type type) +static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type) { switch (type) { - case IB_QPT_RC: return MLX4_QP_ST_RC; - case IB_QPT_UC: return MLX4_QP_ST_UC; - case IB_QPT_UD: return MLX4_QP_ST_UD; - case IB_QPT_SMI: - case IB_QPT_GSI: return MLX4_QP_ST_MLX; - default: return -1; + case MLX4_IB_QPT_RC: return MLX4_QP_ST_RC; + case MLX4_IB_QPT_UC: return MLX4_QP_ST_UC; + case MLX4_IB_QPT_UD: return MLX4_QP_ST_UD; + case MLX4_IB_QPT_XRC: return MLX4_QP_ST_XRC; + case MLX4_IB_QPT_RAW_IPV6: return MLX4_QP_ST_MLX; + case MLX4_IB_QPT_RAW_ETY: return MLX4_QP_ST_MLX; + case MLX4_IB_QPT_SMI: return MLX4_QP_ST_MLX; + case MLX4_IB_QPT_GSI: return MLX4_QP_ST_MLX; + + case MLX4_IB_QPT_PROXY_SMI: + case MLX4_IB_QPT_TUN_SMI: return (mlx4_is_mfunc(dev->dev) ? + MLX4_QP_ST_MLX : -1); + case MLX4_IB_QPT_PROXY_GSI: + case MLX4_IB_QPT_TUN_GSI: return (mlx4_is_mfunc(dev->dev) ? + MLX4_QP_ST_UD : -1); + default: return -1; } } @@ -877,12 +1129,13 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, struct mlx4_qp_path *path, u8 port) { int err; - int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port) == - IB_LINK_LAYER_ETHERNET; + int is_eth = rdma_port_link_layer(&dev->ib_dev, port) == + IB_LINK_LAYER_ETHERNET ? 
1 : 0; u8 mac[6]; int is_mcast; u16 vlan_tag; int vidx; + int gid_index; path->grh_mylmc = ah->src_path_bits & 0x7f; path->rlid = cpu_to_be16(ah->dlid); @@ -893,7 +1146,6 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, --path->static_rate; } else path->static_rate = 0; - path->counter_index = 0xff; if (ah->ah_flags & IB_AH_GRH) { if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) { @@ -903,7 +1155,19 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, } path->grh_mylmc |= 1 << 7; - path->mgid_index = ah->grh.sgid_index; + if (mlx4_is_mfunc(dev->dev)) { + gid_index = + mlx4_ib_get_virt2phys_gid(dev, port, + ah->grh.sgid_index); + if (!is_gid_idx_valid(gid_index)) { + mlx4_ib_warn(&dev->ib_dev, + "cannot modify qp with gid " + "index %d\n", gid_index); + return -1; + } + path->mgid_index = gid_index & 0x7f; + } else + path->mgid_index = ah->grh.sgid_index; path->hop_limit = ah->grh.hop_limit; path->tclass_flowlabel = cpu_to_be32((ah->grh.traffic_class << 20) | @@ -911,9 +1175,10 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, memcpy(path->rgid, ah->grh.dgid.raw, 16); } + /* TODO - fix to handle double GUID feature */ if (is_eth) { path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | - ((port - 1) << 6) | ((ah->sl & 7) << 3) | ((ah->sl & 8) >> 1); + ((port - 1) << 6) | ((ah->sl & 0x7) << 3) | ((ah->sl & 8) >> 1); if (!(ah->ah_flags & IB_AH_GRH)) return -1; @@ -928,7 +1193,7 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, path->grh_mylmc &= 0x80; vlan_tag = rdma_get_vlan_id(&dev->iboe.gid_table[port - 1][ah->grh.sgid_index]); - if (vlan_tag < 0x1000) { + if (vlan_tag) { if (mlx4_find_cached_vlan(dev->dev, port, vlan_tag, &vidx)) return -ENOENT; @@ -944,13 +1209,11 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) { - struct mlx4_ib_gid_entry *ge, *tmp; + struct gid_entry *ge, *tmp; list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { - if (!ge->added && mlx4_ib_add_mc(dev, qp, &ge->gid)) { + if (!ge->added && mlx4_ib_add_mc(dev, qp, &ge->gid)) ge->added = 1; - ge->port = qp->port; - } } } @@ -970,7 +1233,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, return -ENOMEM; context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) | - (to_mlx4_st(ibqp->qp_type) << 16)); + (to_mlx4_st(dev, qp->mlx4_ib_qp_type) << 16)); if (!(attr_mask & IB_QP_PATH_MIG_STATE)) context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11); @@ -989,7 +1252,8 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } } - if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) + if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI || + ibqp->qp_type == IB_QPT_RAW_ETY) context->mtu_msgmax = (IB_MTU_4096 << 5) | 11; else if (ibqp->qp_type == IB_QPT_UD) { if (qp->flags & MLX4_IB_QP_LSO) @@ -1015,13 +1279,16 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3; context->sq_size_stride |= qp->sq.wqe_shift - 4; - if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { context->sq_size_stride |= !!qp->sq_no_prefetch << 7; + if (ibqp->qp_type == IB_QPT_XRC) + context->xrcd = cpu_to_be32((u32) qp->xrcdn); + } if (qp->ibqp.uobject) context->usr_page = cpu_to_be32(to_mucontext(ibqp->uobject->context)->uar.index); else - context->usr_page = 
cpu_to_be32(dev->priv_uar.index); + context->usr_page = cpu_to_be32(qp->bf.uar->index); if (attr_mask & IB_QP_DEST_QPN) context->remote_qpn = cpu_to_be32(attr->dest_qp_num); @@ -1034,40 +1301,64 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } } + if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR && + dev->counters[qp->port - 1] != -1) { + context->pri_path.counter_index = dev->counters[qp->port - 1]; + optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX; + } + if (attr_mask & IB_QP_PKEY_INDEX) { + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_SMI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) + context->pri_path.disable_pkey_check = 0x40; context->pri_path.pkey_index = attr->pkey_index; optpar |= MLX4_QP_OPTPAR_PKEY_INDEX; } if (attr_mask & IB_QP_AV) { if (mlx4_set_path(dev, &attr->ah_attr, &context->pri_path, - attr_mask & IB_QP_PORT ? attr->port_num : qp->port)) + attr_mask & IB_QP_PORT ? attr->port_num : qp->port)) { + mlx4_ib_dbg("qpn 0x%x: could not set pri path params", + ibqp->qp_num); goto out; + } optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH | MLX4_QP_OPTPAR_SCHED_QUEUE); } if (attr_mask & IB_QP_TIMEOUT) { - context->pri_path.ackto |= attr->timeout << 3; + context->pri_path.ackto |= (attr->timeout << 3); optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT; } if (attr_mask & IB_QP_ALT_PATH) { if (attr->alt_port_num == 0 || - attr->alt_port_num > dev->dev->caps.num_ports) + attr->alt_port_num > dev->num_ports) { + mlx4_ib_dbg("qpn 0x%x: invalid alternate port num (%d)", + ibqp->qp_num, attr->alt_port_num); goto out; + } if (attr->alt_pkey_index >= - dev->dev->caps.pkey_table_len[attr->alt_port_num]) + dev->dev->caps.pkey_table_len[attr->alt_port_num]) { + mlx4_ib_dbg("qpn 0x%x: invalid alt pkey index (0x%x)", + ibqp->qp_num, attr->alt_pkey_index); goto out; + } if (mlx4_set_path(dev, &attr->alt_ah_attr, &context->alt_path, - attr->alt_port_num)) + attr->alt_port_num)) { + mlx4_ib_dbg("qpn 0x%x: could not set alt path params", + ibqp->qp_num); goto out; + } context->alt_path.pkey_index = attr->alt_pkey_index; context->alt_path.ackto = attr->alt_timeout << 3; + context->alt_path.counter_index = dev->counters[attr->alt_port_num - 1]; optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH; } @@ -1124,26 +1415,52 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, context->cqn_recv = cpu_to_be32(to_mcq(ibqp->recv_cq)->mcq.cqn); + /* proxy and tunnel qp qkeys will be changed in modify-qp wrappers */ if (attr_mask & IB_QP_QKEY) { - context->qkey = cpu_to_be32(attr->qkey); + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_SMI) + context->qkey = cpu_to_be32(IB_QP_SET_QKEY); + else { + if (mlx4_is_mfunc(dev->dev) && + qp->mlx4_ib_qp_type != MLX4_IB_QPT_PROXY_GSI && + qp->mlx4_ib_qp_type != MLX4_IB_QPT_TUN_GSI && + (attr->qkey & MLX4_RESERVED_QKEY_MASK) == + MLX4_RESERVED_QKEY_BASE) { + printk(KERN_WARNING "Cannot use reserved QKEY 0x%x " + "(range 0xffff0000..0xffffffff is reserved)\n", + attr->qkey); + err = -EINVAL; + goto out; + } + context->qkey = cpu_to_be32(attr->qkey); + } optpar |= MLX4_QP_OPTPAR_Q_KEY; } if (ibqp->srq) context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->msrq.srqn); - if (!ibqp->srq && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + if (!ibqp->srq && ibqp->qp_type != IB_QPT_XRC && + cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) context->db_rec_addr = cpu_to_be64(qp->db.dma); if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR 
&& (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI || - ibqp->qp_type == IB_QPT_UD)) { + ibqp->qp_type == IB_QPT_UD || ibqp->qp_type == IB_QPT_RAW_ETY)) { context->pri_path.sched_queue = (qp->port - 1) << 6; - if (is_qp0(dev, qp)) + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_SMI) { context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE; - else + if (qp->mlx4_ib_qp_type != MLX4_IB_QPT_SMI) + context->pri_path.fl = 0x80; + } else { + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) + context->pri_path.fl = 0x80; context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE; + } } if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && @@ -1227,7 +1544,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, qp->sq.head = 0; qp->sq.tail = 0; qp->sq_next_wqe = 0; - if (!ibqp->srq) + if (!ibqp->srq && ibqp->qp_type != IB_QPT_XRC) *qp->db.db = 0; } @@ -1249,27 +1566,49 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask)) { + mlx4_ib_dbg("qpn 0x%x: invalid attribute mask specified " + "for transition %d to %d. qp_type %d, attr_mask 0x%x", + ibqp->qp_num, cur_state, new_state, + ibqp->qp_type, attr_mask); goto out; + } if ((attr_mask & IB_QP_PORT) && - (attr->port_num == 0 || attr->port_num > dev->dev->caps.num_ports)) { + (attr->port_num == 0 || attr->port_num > dev->num_ports)) { + mlx4_ib_dbg("qpn 0x%x: invalid port number (%d) specified " + "for transition %d to %d. qp_type %d", + ibqp->qp_num, attr->port_num, cur_state, + new_state, ibqp->qp_type); goto out; } if (attr_mask & IB_QP_PKEY_INDEX) { int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; - if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) + if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) { + mlx4_ib_dbg("qpn 0x%x: invalid pkey index (%d) specified " + "for transition %d to %d. qp_type %d", + ibqp->qp_num, attr->pkey_index, cur_state, + new_state, ibqp->qp_type); goto out; + } } if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) { + mlx4_ib_dbg("qpn 0x%x: max_rd_atomic (%d) too large. " + "Transition %d to %d. qp_type %d", + ibqp->qp_num, attr->max_rd_atomic, cur_state, + new_state, ibqp->qp_type); goto out; } if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) { + mlx4_ib_dbg("qpn 0x%x: max_dest_rd_atomic (%d) too large. " + "Transition %d to %d. 
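A small model of the reserved-QKEY test added above for multi-function devices. The mask and base values below are assumptions inferred from the warning text ("range 0xffff0000..0xffffffff is reserved"); the real MLX4_RESERVED_QKEY_* macros live in the mlx4 headers and may differ.

/* Illustrative sketch only, not part of the patch. */
#include <stdio.h>
#include <stdint.h>

#define DEMO_RESERVED_QKEY_MASK  0xffff0000u   /* assumed */
#define DEMO_RESERVED_QKEY_BASE  0xffff0000u   /* assumed */

static int qkey_is_reserved(uint32_t qkey)
{
        return (qkey & DEMO_RESERVED_QKEY_MASK) == DEMO_RESERVED_QKEY_BASE;
}

int main(void)
{
        printf("0x80010000 reserved? %d\n", qkey_is_reserved(0x80010000u)); /* 0 */
        printf("0xffff1234 reserved? %d\n", qkey_is_reserved(0xffff1234u)); /* 1 */
        return 0;
}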
qp_type %d", + ibqp->qp_num, attr->max_dest_rd_atomic, cur_state, + new_state, ibqp->qp_type); goto out; } @@ -1285,43 +1624,230 @@ out: return err; } +static int build_raw_ety_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, + void *wqe, unsigned *mlx_seg_len) +{ + int payload = 0; + int header_size, packet_length; + struct mlx4_wqe_mlx_seg *mlx = wqe; + struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; + u32 *lrh = wqe + sizeof *mlx + sizeof *inl; + int i; + + /* Only IB_WR_SEND is supported */ + if (wr->opcode != IB_WR_SEND) + return -EINVAL; + + for (i = 0; i < wr->num_sge; ++i) + payload += wr->sg_list[i].length; + + header_size = IB_LRH_BYTES + 4; /* LRH + RAW_HEADER (32 bits) */ + + /* headers + payload and round up */ + packet_length = (header_size + payload + 3) / 4; + + mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + + mlx->flags |= cpu_to_be32(MLX4_WQE_MLX_ICRC | + (wr->wr.raw_ety.lrh->service_level << 8)); + + mlx->rlid = wr->wr.raw_ety.lrh->destination_lid; + + wr->wr.raw_ety.lrh->packet_length = cpu_to_be16(packet_length); + + ib_lrh_header_pack(wr->wr.raw_ety.lrh, lrh); + lrh += IB_LRH_BYTES / 4; /* LRH size is a dword multiple */ + *lrh = cpu_to_be32(wr->wr.raw_ety.eth_type); + + inl->byte_count = cpu_to_be32(1 << 31 | header_size); + + *mlx_seg_len = + ALIGN(sizeof(struct mlx4_wqe_inline_seg) + header_size, 16); + + return 0; +} + +static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, + void *wqe, unsigned *mlx_seg_len) +{ + struct mlx4_ib_dev *mdev = to_mdev(sqp->qp.ibqp.device); + struct ib_device *ib_dev = &mdev->ib_dev; + struct mlx4_wqe_mlx_seg *mlx = wqe; + struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; + struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); + u16 pkey; + u32 qkey; + int send_size; + int header_size; + int spc; + int i; + + if (wr->opcode != IB_WR_SEND) + return -EINVAL; + + send_size = 0; + + for (i = 0; i < wr->num_sge; ++i) + send_size += wr->sg_list[i].length; + + /* for proxy-qp0 sends, need to add in size of tunnel header */ + /* for tunnel-qp0 sends, tunnel header is already in s/g list */ + if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI) + send_size += sizeof(struct mlx4_ib_tunnel_header); + + ib_ud_header_init(send_size, 1, 0, 0, 0, 0, &sqp->ud_header); + + if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI) { + sqp->ud_header.lrh.service_level = + be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; + sqp->ud_header.lrh.destination_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f); + sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f); + } else { + sqp->ud_header.lrh.destination_lid = + cpu_to_be16(mdev->sriov.local_lid[sqp->qp.port - 1]); + sqp->ud_header.lrh.source_lid = + cpu_to_be16(mdev->sriov.local_lid[sqp->qp.port - 1]); + } + + mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + + mlx->flags |= cpu_to_be32(MLX4_WQE_MLX_VL15 | 0x1 | MLX4_WQE_MLX_SLR); /* force loopback */ + mlx->rlid = sqp->ud_header.lrh.destination_lid; + + sqp->ud_header.lrh.virtual_lane = 0; + sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED); + ib_get_cached_pkey(ib_dev, sqp->qp.port, 0, &pkey); + sqp->ud_header.bth.pkey = cpu_to_be16(pkey); + if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_TUN_SMI) + sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); + else + sqp->ud_header.bth.destination_qpn = + cpu_to_be32(mdev->dev->caps.tunnel_qpn + 8 * + (1 + MLX4_MFUNC_MAX + mdev->dev->caps.function) + + sqp->qp.port - 1); + + sqp->ud_header.bth.psn = 
cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); + if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey)) + return -EINVAL; + sqp->ud_header.deth.qkey = cpu_to_be32(qkey); + sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.mqp.qpn); + + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; + sqp->ud_header.immediate_present = 0; + + header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf); + + if (0) { + printk(KERN_ERR "built UD header of size %d:\n", header_size); + for (i = 0; i < header_size / 4; ++i) { + if (i % 8 == 0) + printk(" [%02x] ", i * 4); + printk(" %08x", + be32_to_cpu(((__be32 *) sqp->header_buf)[i])); + if ((i + 1) % 8 == 0) + printk("\n"); + } + printk("\n"); + } + + /* + * Inline data segments may not cross a 64 byte boundary. If + * our UD header is bigger than the space available up to the + * next 64 byte boundary in the WQE, use two inline data + * segments to hold the UD header. + */ + spc = MLX4_INLINE_ALIGN - + ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); + if (header_size <= spc) { + inl->byte_count = cpu_to_be32(1 << 31 | header_size); + memcpy(inl + 1, sqp->header_buf, header_size); + i = 1; + } else { + inl->byte_count = cpu_to_be32(1 << 31 | spc); + memcpy(inl + 1, sqp->header_buf, spc); + + inl = (void *) (inl + 1) + spc; + memcpy(inl + 1, sqp->header_buf + spc, header_size - spc); + /* + * Need a barrier here to make sure all the data is + * visible before the byte_count field is set. + * Otherwise the HCA prefetcher could grab the 64-byte + * chunk with this inline segment and get a valid (!= + * 0xffffffff) byte count but stale data, and end up + * generating a packet with bad headers. + * + * The first inline segment's byte_count field doesn't + * need a barrier, because it comes after a + * control/MLX segment and therefore is at an offset + * of 16 mod 64. + */ + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc)); + i = 2; + } + + *mlx_seg_len = + ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16); + return 0; +} + static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, void *wqe, unsigned *mlx_seg_len) { - struct ib_device *ib_dev = sqp->qp.ibqp.device; + struct ib_device *ib_dev = &to_mdev(sqp->qp.ibqp.device)->ib_dev; struct mlx4_wqe_mlx_seg *mlx = wqe; struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); - union ib_gid sgid; u16 pkey; int send_size; int header_size; int spc; int i; + union ib_gid sgid; int is_eth; - int is_vlan = 0; int is_grh; - u16 vlan = 0; + int is_vlan = 0; + int err = 0; + __be16 vlan = 0; send_size = 0; for (i = 0; i < wr->num_sge; ++i) send_size += wr->sg_list[i].length; - is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET; + is_eth = rdma_port_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET; is_grh = mlx4_ib_ah_grh_present(ah); + + /* TODO - add support in double GUID feature */ if (is_eth) { - ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, - ah->av.ib.gid_index, &sgid); - vlan = rdma_get_vlan_id(&sgid); - is_vlan = vlan < 0x1000; + if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { + /* When multi-function is enabled, the ib_core gid + * indexes don't necessarily match the hw ones, so + * we must use our own cache */ + sgid.global.subnet_prefix = + to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. + subnet_prefix; + sgid.global.interface_id = + to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. 
+ guid_cache[ah->av.ib.gid_index]; + } else { + err = ib_get_cached_gid(ib_dev, + be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &sgid); + if (err) + return err; + } + + vlan = cpu_to_be16(rdma_get_vlan_id(&sgid)); + is_vlan = !!vlan; } - ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header); + ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header); if (!is_eth) { sqp->ud_header.lrh.service_level = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid; - sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f); + sqp->ud_header.lrh.source_lid = ah->ex ? cpu_to_be16(ah->ex->slid) : + cpu_to_be16(ah->av.ib.g_slid & 0x7f); } if (is_grh) { @@ -1330,8 +1856,23 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, sqp->ud_header.grh.flow_label = ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff); sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit; - ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, - ah->av.ib.gid_index, &sqp->ud_header.grh.source_gid); + if (ah->ex) + memcpy(&sqp->ud_header.grh.source_gid, ah->ex->sgid, 16); + else { + if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { + /* When multi-function is enabled, the ib_core gid + * indexes don't necessarily match the hw ones, so + * we must use our own cache */ + sqp->ud_header.grh.source_gid.global.subnet_prefix = + to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. + subnet_prefix; + sqp->ud_header.grh.source_gid.global.interface_id = + to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. + guid_cache[ah->av.ib.gid_index]; + } else + ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &sqp->ud_header.grh.source_gid); + } memcpy(sqp->ud_header.grh.destination_gid.raw, ah->av.ib.dgid, 16); } @@ -1340,19 +1881,22 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, if (!is_eth) { mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) | - (sqp->ud_header.lrh.destination_lid == - IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) | - (sqp->ud_header.lrh.service_level << 8)); + ((sqp->ud_header.lrh.destination_lid == + IB_LID_PERMISSIVE || ah->ex) ? MLX4_WQE_MLX_SLR : 0) | + (sqp->ud_header.lrh.service_level << 8)); + if (ah->av.ib.port_pd & cpu_to_be32(0x80000000)) { + mlx->flags |= cpu_to_be32(0x1); /* force loopback */ + } mlx->rlid = sqp->ud_header.lrh.destination_lid; } switch (wr->opcode) { case IB_WR_SEND: - sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; sqp->ud_header.immediate_present = 0; break; case IB_WR_SEND_WITH_IMM: - sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; sqp->ud_header.immediate_present = 1; sqp->ud_header.immediate_data = wr->ex.imm_data; break; @@ -1364,19 +1908,18 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, u8 *smac; memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6); - /* FIXME: cache smac value? 
*/ - smac = to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]->dev_addr; + smac = to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]->dev_addr; /* fixme: cache this value */ memcpy(sqp->ud_header.eth.smac_h, smac, 6); if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); - if (!is_vlan) { - sqp->ud_header.eth.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE); - } else { + if (!is_vlan) + sqp->ud_header.eth.type = cpu_to_be16(MLX4_IBOE_ETHERTYPE); + else { u16 pcp; - sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE); - pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 27 & 3) << 13; - sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp); + sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IBOE_ETHERTYPE); + pcp = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 23 & 0xe0; + sqp->ud_header.vlan.tag = vlan | pcp; } } else { sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0; @@ -1393,7 +1936,8 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ? sqp->qkey : wr->wr.ud.remote_qkey); - sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num); + sqp->ud_header.deth.source_qpn = ah->ex ? cpu_to_be32(ah->ex->sqpn) : + cpu_to_be32(sqp->qp.ibqp.qp_num); header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf); @@ -1417,7 +1961,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, * segments to hold the UD header. */ spc = MLX4_INLINE_ALIGN - - ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); + ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); if (header_size <= spc) { inl->byte_count = cpu_to_be32(1 << 31 | header_size); memcpy(inl + 1, sqp->header_buf, header_size); @@ -1447,7 +1991,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, } *mlx_seg_len = - ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16); + ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16); return 0; } @@ -1549,6 +2093,63 @@ static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, *vlan = dseg->vlan; } +static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev, + struct mlx4_wqe_datagram_seg *dseg, + struct ib_send_wr *wr, enum ib_qp_type qpt) +{ + union mlx4_ext_av *av = &to_mah(wr->wr.ud.ah)->av; + struct mlx4_av sqp_av = {0}; + int port = *((u8*) &av->ib.port_pd) & 0x3; + + /* XXX see if we weren't better off creating an AH for qp1 in the SA... 
*/ + sqp_av.port_pd = av->ib.port_pd | cpu_to_be32(0x80000000); /* force loopback */ + sqp_av.g_slid = av->ib.g_slid & 0x7f; /* no GRH */ + sqp_av.dlid = cpu_to_be16(dev->sriov.local_lid[port - 1]); + sqp_av.sl_tclass_flowlabel = av->ib.sl_tclass_flowlabel & cpu_to_be32(0xf0000000); + + memcpy(dseg->av, &sqp_av, sizeof (struct mlx4_av)); + dseg->dqpn = cpu_to_be32(dev->dev->caps.tunnel_qpn + 8 * + (1 + MLX4_MFUNC_MAX + dev->dev->caps.function) + + qpt * 2 + port - 1); + dseg->qkey = cpu_to_be32(0x80000000); /* use well-known qkey from the QPC */ +} + +static void build_tunnel_header(struct ib_send_wr *wr, void *wqe, unsigned *mlx_seg_len) +{ + struct mlx4_wqe_inline_seg *inl = wqe; + struct mlx4_ib_tunnel_header hdr; + struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); + int spc; + int i; + + memcpy(&hdr.av, &ah->av, sizeof hdr.av); + hdr.remote_qpn = wr->wr.ud.remote_qpn; + hdr.pkey_index = wr->wr.ud.pkey_index; + hdr.qkey = wr->wr.ud.remote_qkey; + + spc = MLX4_INLINE_ALIGN - + ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); + if (sizeof(hdr) <= spc) { + memcpy(inl + 1, &hdr, sizeof(hdr)); + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | sizeof(hdr)); + i = 1; + } else { + memcpy(inl + 1, &hdr, spc); + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | spc); + + inl = (void *) (inl + 1) + spc; + memcpy(inl + 1, (void *) &hdr + spc, sizeof(hdr) - spc); + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (sizeof(hdr) - spc)); + i = 2; + } + + *mlx_seg_len = + ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + sizeof(hdr), 16); +} + static void set_mlx_icrc_seg(void *dseg) { u32 *t = dseg; @@ -1596,12 +2197,11 @@ static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr, struct mlx4_ib_qp *qp, unsigned *lso_seg_len, - __be32 *lso_hdr_sz, __be32 *blh) + __be32 *lso_hdr_sz, int *blh) { unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16); - if (unlikely(halign > MLX4_IB_CACHE_LINE_SIZE)) - *blh = cpu_to_be32(1 << 6); + *blh = unlikely(halign > 64) ? 1 : 0; if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) && wr->num_sge > qp->sq.max_gs - (halign >> 4))) @@ -1630,12 +2230,99 @@ static __be32 send_ieth(struct ib_send_wr *wr) } } +static int lay_inline_data(struct mlx4_ib_qp *qp, struct ib_send_wr *wr, + void *wqe, int *sz) +{ + struct mlx4_wqe_inline_seg *seg; + void *addr; + int len, seg_len; + int num_seg; + int off, to_copy; + int i; + int inl = 0; + + seg = wqe; + wqe += sizeof *seg; + off = ((unsigned long)wqe) & (unsigned long)(MLX4_INLINE_ALIGN - 1); + num_seg = 0; + seg_len = 0; + + for (i = 0; i < wr->num_sge; ++i) { + addr = (void *) (unsigned long)(wr->sg_list[i].addr); + len = wr->sg_list[i].length; + inl += len; + + if (inl > qp->max_inline_data) { + inl = 0; + return -1; + } + + while (len >= MLX4_INLINE_ALIGN - off) { + to_copy = MLX4_INLINE_ALIGN - off; + memcpy(wqe, addr, to_copy); + len -= to_copy; + wqe += to_copy; + addr += to_copy; + seg_len += to_copy; + wmb(); /* see comment below */ + seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len); + seg_len = 0; + seg = wqe; + wqe += sizeof *seg; + off = sizeof *seg; + ++num_seg; + } + + memcpy(wqe, addr, len); + wqe += len; + seg_len += len; + off += len; + } + + if (seg_len) { + ++num_seg; + /* + * Need a barrier here to make sure + * all the data is visible before the + * byte_count field is set. 
Otherwise + * the HCA prefetcher could grab the + * 64-byte chunk with this inline + * segment and get a valid (!= + * 0xffffffff) byte count but stale + * data, and end up sending the wrong + * data. + */ + wmb(); + seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len); + } + + *sz = (inl + num_seg * sizeof *seg + 15) / 16; + + return 0; +} + +static void add_zero_len_inline(void *wqe) +{ + struct mlx4_wqe_inline_seg *inl = wqe; + memset(wqe, 0, 16); + inl->byte_count = cpu_to_be32(1 << 31); +} +/* + * Avoid using memcpy() to copy to BlueFlame page, since memcpy() + * implementations may use move-string-buffer assembler instructions, + * which do not guarantee order of copying. + */ +static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt) +{ + __iowrite64_copy(dst, src, bytecnt / 8); +} + int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr) { struct mlx4_ib_qp *qp = to_mqp(ibqp); void *wqe; - struct mlx4_wqe_ctrl_seg *ctrl; + struct mlx4_wqe_ctrl_seg *uninitialized_var(ctrl); struct mlx4_wqe_data_seg *dseg; unsigned long flags; int nreq; @@ -1647,9 +2334,10 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, __be32 dummy; __be32 *lso_wqe; __be32 uninitialized_var(lso_hdr_sz); - __be32 blh; int i; - __be16 vlan = cpu_to_be16(0xffff); + int blh = 0; + __be16 vlan = 0; + int inl = 0; spin_lock_irqsave(&qp->sq.lock, flags); @@ -1657,21 +2345,24 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, for (nreq = 0; wr; ++nreq, wr = wr->next) { lso_wqe = &dummy; - blh = 0; if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { + mlx4_ib_dbg("QP 0x%x: WQE overflow", ibqp->qp_num); err = -ENOMEM; *bad_wr = wr; goto out; } if (unlikely(wr->num_sge > qp->sq.max_gs)) { + mlx4_ib_dbg("QP 0x%x: too many sg entries (%d)", + ibqp->qp_num, wr->num_sge); err = -EINVAL; *bad_wr = wr; goto out; } ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); + *((u32 *) (&ctrl->vlan_tag)) = 0; qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id; ctrl->srcrb_flags = @@ -1689,9 +2380,13 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, wqe += sizeof *ctrl; size = sizeof *ctrl / 16; - switch (ibqp->qp_type) { - case IB_QPT_RC: - case IB_QPT_UC: + switch (qp->mlx4_ib_qp_type) { + case MLX4_IB_QPT_XRC: + ctrl->srcrb_flags |= + cpu_to_be32(wr->xrc_remote_srq_num << 8); + /* fall thru */ + case MLX4_IB_QPT_RC: + case MLX4_IB_QPT_UC: switch (wr->opcode) { case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: @@ -1752,7 +2447,22 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } break; - case IB_QPT_UD: + case MLX4_IB_QPT_TUN_SMI: + err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + wqe += seglen; + size += seglen / 16; + break; + case MLX4_IB_QPT_TUN_GSI: + /* this is a UD qp used in MAD responses to slaves. 
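lay_inline_data() above chunks the caller's scatter list into inline segments that never cross a 64-byte (MLX4_INLINE_ALIGN) boundary, and publishes each segment's byte_count only after its data has landed, which is why every byte_count store is preceded by wmb(). A user-space model of just the chunking, assuming a 4-byte inline-segment header; the barriers are reduced to comments and the real per-SGE loop is simplified to a single buffer.

/* Illustrative sketch only, not part of the patch. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define DEMO_INLINE_ALIGN 64
#define DEMO_INLINE_FLAG  (1u << 31)

/* Lay "len" bytes of "data" into the WQE at byte offset "wqe_off";
 * returns the number of inline segments used. */
static int demo_lay_inline(uint8_t *wqe, size_t wqe_off, const uint8_t *data, size_t len)
{
        uint32_t *seg = (uint32_t *)(wqe + wqe_off);  /* 4-byte byte_count header */
        uint8_t *p = (uint8_t *)(seg + 1);
        size_t seg_len = 0, off = (wqe_off + 4) % DEMO_INLINE_ALIGN;
        int nseg = 1;

        while (len) {
                size_t room = DEMO_INLINE_ALIGN - off;
                size_t copy = len < room ? len : room;

                memcpy(p, data, copy);
                p += copy; data += copy; len -= copy; seg_len += copy; off += copy;

                if (len && off == DEMO_INLINE_ALIGN) {
                        /* boundary reached: close this segment, open the next one */
                        *seg = DEMO_INLINE_FLAG | (uint32_t)seg_len;  /* driver does wmb() first */
                        seg = (uint32_t *)p;
                        p = (uint8_t *)(seg + 1);
                        seg_len = 0;
                        off = 4;
                        nseg++;
                }
        }
        *seg = DEMO_INLINE_FLAG | (uint32_t)seg_len;          /* driver does wmb() first */
        return nseg;
}

int main(void)
{
        uint32_t wqe_buf[64] = {0};
        uint8_t *wqe = (uint8_t *)wqe_buf;
        uint8_t payload[100];

        memset(payload, 0xab, sizeof(payload));
        /* a 100-byte payload starting 16 bytes into the WQE splits into 2 segments */
        printf("segments used: %d\n", demo_lay_inline(wqe, 16, payload, sizeof(payload)));
        return 0;
}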
*/ + set_datagram_seg(wqe, wr, &vlan); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + break; + case MLX4_IB_QPT_UD: set_datagram_seg(wqe, wr, &vlan); wqe += sizeof (struct mlx4_wqe_datagram_seg); size += sizeof (struct mlx4_wqe_datagram_seg) / 16; @@ -1769,8 +2479,42 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } break; - case IB_QPT_SMI: - case IB_QPT_GSI: + case MLX4_IB_QPT_PROXY_SMI: + if (unlikely(!to_mdev(ibqp->device)->dev->caps.sqp_demux)) { + err = -ENOSYS; + *bad_wr = wr; + goto out; + } + err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + wqe += seglen; + size += seglen / 16; + /* to start tunnel header on a cache-line boundary */ + add_zero_len_inline(wqe); + wqe += 16; + size++; + build_tunnel_header(wr, wqe, &seglen); + wqe += seglen; + size += seglen / 16; + break; + case MLX4_IB_QPT_PROXY_GSI: + /* If we are tunneling special qps, this is a UD qp. + * In this case we first add a UD segment targeting + * the tunnel qp, and then add a header with address + * information */ + set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, wr, ibqp->qp_type); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + build_tunnel_header(wr, wqe, &seglen); + wqe += seglen; + size += seglen / 16; + break; + + case MLX4_IB_QPT_SMI: + case MLX4_IB_QPT_GSI: err = build_mlx_header(to_msqp(qp), wr, ctrl, &seglen); if (unlikely(err)) { *bad_wr = wr; @@ -1780,6 +2524,17 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, size += seglen / 16; break; + case MLX4_IB_QPT_RAW_ETY: + err = build_raw_ety_header(to_msqp(qp), wr, ctrl, + &seglen); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + wqe += seglen; + size += seglen / 16; + break; + default: break; } @@ -1793,17 +2548,28 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, dseg = wqe; dseg += wr->num_sge - 1; - size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16); /* Add one more inline data segment for ICRC for MLX sends */ - if (unlikely(qp->ibqp.qp_type == IB_QPT_SMI || - qp->ibqp.qp_type == IB_QPT_GSI)) { + if (unlikely(qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_SMI)) { set_mlx_icrc_seg(dseg + 1); size += sizeof (struct mlx4_wqe_data_seg) / 16; } - for (i = wr->num_sge - 1; i >= 0; --i, --dseg) - set_data_seg(dseg, wr->sg_list + i); + if (wr->send_flags & IB_SEND_INLINE && wr->num_sge) { + int sz; + err = lay_inline_data(qp, wr, wqe, &sz); + if (!err) { + inl = 1; + size += sz; + } + } else { + size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16); + for (i = wr->num_sge - 1; i >= 0; --i, --dseg) + set_data_seg(dseg, wr->sg_list + i); + } /* * Possibly overwrite stamping in cacheline with LSO @@ -1816,11 +2582,6 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ? MLX4_WQE_CTRL_FENCE : 0) | size; - if (be16_to_cpu(vlan) < 0x1000) { - ctrl->ins_vlan = 1 << 6; - ctrl->vlan_tag = vlan; - } - /* * Make sure descriptor is fully written before * setting ownership bit (because HW can start @@ -1834,7 +2595,13 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] | - (ind & qp->sq.wqe_cnt ? 
cpu_to_be32(1 << 31) : 0) | blh; + (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | + (blh ? cpu_to_be32(1 << 6) : 0); + + if (vlan) { + ctrl->ins_vlan = 1 << 6; + ctrl->vlan_tag = vlan; + } stamp = ind + qp->sq_spare_wqes; ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift); @@ -1855,7 +2622,24 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } out: - if (likely(nreq)) { + if (nreq == 1 && inl && size > 1 && size < qp->bf.buf_size / 16) { + ctrl->owner_opcode |= htonl((qp->sq_next_wqe & 0xffff) << 8); + *(u32 *) (&ctrl->vlan_tag) |= qp->doorbell_qpn; + /* + * Make sure that descriptor is written to memory + * before writing to BlueFlame page. + */ + wmb(); + + ++qp->sq.head; + + mlx4_bf_copy(qp->bf.reg + qp->bf.offset, (unsigned long *) ctrl, + ALIGN(size * 16, 64)); + wc_wmb(); + + qp->bf.offset ^= qp->bf.buf_size; + + } else if (nreq) { qp->sq.head += nreq; /* @@ -1864,8 +2648,7 @@ out: */ wmb(); - writel(qp->doorbell_qpn, - to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL); + writel(qp->doorbell_qpn, qp->bf.uar->map + MLX4_SEND_DOORBELL); /* * Make sure doorbells don't leak out of SQ spinlock @@ -1873,8 +2656,10 @@ out: */ mmiowb(); - stamp_send_wqe(qp, stamp, size * 16); + } + if (likely(nreq)) { + stamp_send_wqe(qp, stamp, size * 16); ind = pad_wraparound(qp, ind); qp->sq_next_wqe = ind; } @@ -1893,20 +2678,25 @@ int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, int err = 0; int nreq; int ind; + int max_gs; int i; + max_gs = qp->rq.max_gs; spin_lock_irqsave(&qp->rq.lock, flags); ind = qp->rq.head & (qp->rq.wqe_cnt - 1); for (nreq = 0; wr; ++nreq, wr = wr->next) { if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + mlx4_ib_dbg("QP 0x%x: WQE overflow", ibqp->qp_num); err = -ENOMEM; *bad_wr = wr; goto out; } if (unlikely(wr->num_sge > qp->rq.max_gs)) { + mlx4_ib_dbg("QP 0x%x: too many sg entries (%d)", + ibqp->qp_num, wr->num_sge); err = -EINVAL; *bad_wr = wr; goto out; @@ -1914,10 +2704,25 @@ int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, scat = get_recv_wqe(qp, ind); + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI || + qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI) { + ib_dma_sync_single_for_device(ibqp->device, + qp->sqp_proxy_rcv[ind].map, + sizeof(struct mlx4_ib_proxy_sqp_hdr), + DMA_FROM_DEVICE); + scat->byte_count = + cpu_to_be32(sizeof(struct mlx4_ib_proxy_sqp_hdr)); + /* use dma lkey from upper layer entry */ + scat->lkey = cpu_to_be32(wr->sg_list->lkey); + scat->addr = cpu_to_be64(qp->sqp_proxy_rcv[ind].map); + scat++; + max_gs--; + } + for (i = 0; i < wr->num_sge; ++i) __set_data_seg(scat + i, wr->sg_list + i); - if (i < qp->rq.max_gs) { + if (i < max_gs) { scat[i].byte_count = 0; scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY); scat[i].addr = 0; @@ -1985,10 +2790,10 @@ static int to_ib_qp_access_flags(int mlx4_flags) return ib_flags; } -static void to_ib_ah_attr(struct mlx4_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr, - struct mlx4_qp_path *path) +static void to_ib_ah_attr(struct mlx4_ib_dev *ib_dev, struct ib_ah_attr *ib_ah_attr, + struct mlx4_qp_path *path) { - struct mlx4_dev *dev = ibdev->dev; + struct mlx4_dev *dev = ib_dev->dev; int is_eth; memset(ib_ah_attr, 0, sizeof *ib_ah_attr); @@ -1997,8 +2802,8 @@ static void to_ib_ah_attr(struct mlx4_ib_dev *ibdev, struct ib_ah_attr *ib_ah_at if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports) return; - is_eth = rdma_port_get_link_layer(&ibdev->ib_dev, ib_ah_attr->port_num) == - IB_LINK_LAYER_ETHERNET; + is_eth = 
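The post-send epilogue above chooses between the BlueFlame fast path (exactly one small inline WQE, copied straight into the BlueFlame page and flushed with a write-combining barrier) and the ordinary SQ doorbell write. A hedged user-space model of that decision follows; MMIO, wmb()/wc_wmb() and __iowrite64_copy() are represented only by comments, and the sizes are in bytes rather than the driver's 16-byte units.

/* Illustrative sketch only, not part of the patch. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct demo_bf {
        uint8_t page[512];      /* stands in for the mapped BlueFlame register page */
        size_t  buf_size;       /* half of the page; the driver ping-pongs between halves */
        size_t  offset;
};

static void demo_ring(struct demo_bf *bf, const void *ctrl, size_t wqe_bytes,
                      int nreq, int inl, uint32_t doorbell_qpn)
{
        if (nreq == 1 && inl && wqe_bytes > 16 && wqe_bytes < bf->buf_size) {
                /* driver: fold owner_opcode/qpn into ctrl, wmb(), then
                 * mlx4_bf_copy() (an __iowrite64_copy) and wc_wmb() */
                memcpy(bf->page + bf->offset, ctrl, (wqe_bytes + 63) & ~(size_t)63);
                bf->offset ^= bf->buf_size;          /* switch to the other half */
                printf("posted %zu bytes via BlueFlame\n", wqe_bytes);
        } else {
                /* driver: wmb(), then writel(doorbell_qpn, uar + MLX4_SEND_DOORBELL) */
                printf("rang SQ doorbell for qpn word 0x%x\n", doorbell_qpn);
        }
}

int main(void)
{
        struct demo_bf bf = { .buf_size = 256, .offset = 0 };
        uint8_t wqe[192] = {0};

        demo_ring(&bf, wqe, 64, 1, 1, 0x11223344);   /* small inline post -> BlueFlame */
        demo_ring(&bf, wqe, 192, 3, 0, 0x11223344);  /* batch of 3 WRs -> doorbell */
        return 0;
}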
rdma_port_link_layer(&ib_dev->ib_dev, ib_ah_attr->port_num) == + IB_LINK_LAYER_ETHERNET ? 1 : 0; if (is_eth) ib_ah_attr->sl = ((path->sched_queue >> 3) & 0x7) | ((path->sched_queue & 4) << 1); @@ -2006,6 +2811,7 @@ static void to_ib_ah_attr(struct mlx4_ib_dev *ibdev, struct ib_ah_attr *ib_ah_at ib_ah_attr->sl = (path->sched_queue >> 2) & 0xf; ib_ah_attr->dlid = be16_to_cpu(path->rlid); + ib_ah_attr->src_path_bits = path->grh_mylmc & 0x7f; ib_ah_attr->static_rate = path->static_rate ? path->static_rate - 5 : 0; ib_ah_attr->ah_flags = (path->grh_mylmc & (1 << 7)) ? IB_AH_GRH : 0; @@ -2057,7 +2863,8 @@ int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr qp_attr->qp_access_flags = to_ib_qp_access_flags(be32_to_cpu(context.params2)); - if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC || + qp->ibqp.qp_type == IB_QPT_XRC) { to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path); to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path); qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f; @@ -2112,8 +2919,290 @@ done: if (qp->flags & MLX4_IB_QP_LSO) qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; + qp_init_attr->sq_sig_type = qp->sq_signal_bits == cpu_to_be32( + MLX4_WQE_CTRL_CQ_UPDATE) ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; + out: mutex_unlock(&qp->mutex); return err; } +int mlx4_ib_create_xrc_rcv_qp(struct ib_qp_init_attr *init_attr, + u32 *qp_num) +{ + struct mlx4_ib_dev *dev = to_mdev(init_attr->xrc_domain->device); + struct mlx4_ib_xrcd *xrcd = to_mxrcd(init_attr->xrc_domain); + struct mlx4_ib_qp *qp; + struct ib_qp *ibqp; + struct mlx4_ib_xrc_reg_entry *ctx_entry; + int err; + + if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) + return -ENOSYS; + + if (init_attr->qp_type != IB_QPT_XRC) + return -EINVAL; + + ctx_entry = kmalloc(sizeof *ctx_entry, GFP_KERNEL); + if (!ctx_entry) + return -ENOMEM; + + qp = kzalloc(sizeof *qp, GFP_KERNEL); + if (!qp) { + kfree(ctx_entry); + return -ENOMEM; + } + qp->flags = MLX4_IB_XRC_RCV; + qp->xrcdn = to_mxrcd(init_attr->xrc_domain)->xrcdn; + INIT_LIST_HEAD(&qp->xrc_reg_list); + err = create_qp_common(dev, xrcd->pd, init_attr, NULL, 0, &qp); + if (err) { + kfree(ctx_entry); + kfree(qp); + return err; + } + + ibqp = &qp->ibqp; + /* set the ibpq attributes which will be used by the mlx4 module */ + ibqp->qp_num = qp->mqp.qpn; + ibqp->device = init_attr->xrc_domain->device; + ibqp->pd = xrcd->pd; + ibqp->send_cq = ibqp->recv_cq = xrcd->cq; + ibqp->event_handler = init_attr->event_handler; + ibqp->qp_context = init_attr->qp_context; + ibqp->qp_type = init_attr->qp_type; + ibqp->xrcd = init_attr->xrc_domain; + + mutex_lock(&qp->mutex); + ctx_entry->context = init_attr->qp_context; + list_add_tail(&ctx_entry->list, &qp->xrc_reg_list); + mutex_unlock(&qp->mutex); + *qp_num = qp->mqp.qpn; + return 0; +} + +int mlx4_ib_modify_xrc_rcv_qp(struct ib_xrcd *ibxrcd, u32 qp_num, + struct ib_qp_attr *attr, int attr_mask) +{ + struct mlx4_ib_dev *dev = to_mdev(ibxrcd->device); + struct mlx4_ib_xrcd *xrcd = to_mxrcd(ibxrcd); + struct mlx4_qp *mqp; + int err = -EINVAL; + + if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) + return -ENOSYS; + + mutex_lock(&dev->xrc_reg_mutex); + mqp = __mlx4_qp_lookup(dev->dev, qp_num); + if (unlikely(!mqp)) { + printk(KERN_WARNING "mlx4_ib_reg_xrc_rcv_qp: " + "unknown QPN %06x\n", qp_num); + goto err_out; + } + + if (xrcd->xrcdn != to_mxrcd(to_mibqp(mqp)->ibqp.xrcd)->xrcdn) + goto err_out; + + err = 
mlx4_ib_modify_qp(&(to_mibqp(mqp)->ibqp), attr, attr_mask, NULL); + mutex_unlock(&dev->xrc_reg_mutex); + return err; + +err_out: + mutex_unlock(&dev->xrc_reg_mutex); + return err; +} + +int mlx4_ib_query_xrc_rcv_qp(struct ib_xrcd *ibxrcd, u32 qp_num, + struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx4_ib_dev *dev = to_mdev(ibxrcd->device); + struct mlx4_ib_xrcd *xrcd = to_mxrcd(ibxrcd); + struct mlx4_ib_qp *qp; + struct mlx4_qp *mqp; + struct mlx4_qp_context context; + int mlx4_state; + int err = -EINVAL; + + if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) + return -ENOSYS; + + mutex_lock(&dev->xrc_reg_mutex); + mqp = __mlx4_qp_lookup(dev->dev, qp_num); + if (unlikely(!mqp)) { + printk(KERN_WARNING "mlx4_ib_reg_xrc_rcv_qp: " + "unknown QPN %06x\n", qp_num); + goto err_out; + } + + qp = to_mibqp(mqp); + if (xrcd->xrcdn != to_mxrcd(qp->ibqp.xrcd)->xrcdn) + goto err_out; + + if (qp->state == IB_QPS_RESET) { + qp_attr->qp_state = IB_QPS_RESET; + goto done; + } + + err = mlx4_qp_query(dev->dev, mqp, &context); + if (err) + goto err_out; + + mlx4_state = be32_to_cpu(context.flags) >> 28; + + qp_attr->qp_state = to_ib_qp_state(mlx4_state); + qp_attr->path_mtu = context.mtu_msgmax >> 5; + qp_attr->path_mig_state = + to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3); + qp_attr->qkey = be32_to_cpu(context.qkey); + qp_attr->rq_psn = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff; + qp_attr->sq_psn = be32_to_cpu(context.next_send_psn) & 0xffffff; + qp_attr->dest_qp_num = be32_to_cpu(context.remote_qpn) & 0xffffff; + qp_attr->qp_access_flags = + to_ib_qp_access_flags(be32_to_cpu(context.params2)); + + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC || + qp->ibqp.qp_type == IB_QPT_XRC) { + to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path); + to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, + &context.alt_path); + qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f; + qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; + } + + qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f; + if (qp_attr->qp_state == IB_QPS_INIT) + qp_attr->port_num = qp->port; + else + qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 
2 : 1; + + /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ + qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING; + + qp_attr->max_rd_atomic = + 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7); + + qp_attr->max_dest_rd_atomic = + 1 << ((be32_to_cpu(context.params2) >> 21) & 0x7); + qp_attr->min_rnr_timer = + (be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f; + qp_attr->timeout = context.pri_path.ackto >> 3; + qp_attr->retry_cnt = (be32_to_cpu(context.params1) >> 16) & 0x7; + qp_attr->rnr_retry = (be32_to_cpu(context.params1) >> 13) & 0x7; + qp_attr->alt_timeout = context.alt_path.ackto >> 3; + +done: + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->cap.max_recv_wr = 0; + qp_attr->cap.max_recv_sge = 0; + qp_attr->cap.max_send_wr = 0; + qp_attr->cap.max_send_sge = 0; + qp_attr->cap.max_inline_data = 0; + qp_init_attr->cap = qp_attr->cap; + + mutex_unlock(&dev->xrc_reg_mutex); + return 0; + +err_out: + mutex_unlock(&dev->xrc_reg_mutex); + return err; +} + +int mlx4_ib_reg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num) +{ + + struct mlx4_ib_xrcd *mxrcd = to_mxrcd(xrcd); + + struct mlx4_qp *mqp; + struct mlx4_ib_qp *mibqp; + struct mlx4_ib_xrc_reg_entry *ctx_entry, *tmp; + int err = -EINVAL; + + mutex_lock(&to_mdev(xrcd->device)->xrc_reg_mutex); + mqp = __mlx4_qp_lookup(to_mdev(xrcd->device)->dev, qp_num); + if (unlikely(!mqp)) { + printk(KERN_WARNING "mlx4_ib_reg_xrc_rcv_qp: " + "unknown QPN %06x\n", qp_num); + goto err_out; + } + + mibqp = to_mibqp(mqp); + + if (mxrcd->xrcdn != to_mxrcd(mibqp->ibqp.xrcd)->xrcdn) + goto err_out; + + ctx_entry = kmalloc(sizeof *ctx_entry, GFP_KERNEL); + if (!ctx_entry) { + err = -ENOMEM; + goto err_out; + } + + mutex_lock(&mibqp->mutex); + list_for_each_entry(tmp, &mibqp->xrc_reg_list, list) + if (tmp->context == context) { + mutex_unlock(&mibqp->mutex); + kfree(ctx_entry); + mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex); + return 0; + } + + ctx_entry->context = context; + list_add_tail(&ctx_entry->list, &mibqp->xrc_reg_list); + mutex_unlock(&mibqp->mutex); + mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex); + return 0; + +err_out: + mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex); + return err; +} + +int mlx4_ib_unreg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num) +{ + + struct mlx4_ib_xrcd *mxrcd = to_mxrcd(xrcd); + + struct mlx4_qp *mqp; + struct mlx4_ib_qp *mibqp; + struct mlx4_ib_xrc_reg_entry *ctx_entry, *tmp; + int found = 0; + int err = -EINVAL; + + mutex_lock(&to_mdev(xrcd->device)->xrc_reg_mutex); + mqp = __mlx4_qp_lookup(to_mdev(xrcd->device)->dev, qp_num); + if (unlikely(!mqp)) { + printk(KERN_WARNING "mlx4_ib_unreg_xrc_rcv_qp: " + "unknown QPN %06x\n", qp_num); + goto err_out; + } + + mibqp = to_mibqp(mqp); + + if (mxrcd->xrcdn != (mibqp->xrcdn & 0xffff)) + goto err_out; + + mutex_lock(&mibqp->mutex); + list_for_each_entry_safe(ctx_entry, tmp, &mibqp->xrc_reg_list, list) + if (ctx_entry->context == context) { + found = 1; + list_del(&ctx_entry->list); + kfree(ctx_entry); + break; + } + + mutex_unlock(&mibqp->mutex); + if (!found) + goto err_out; + + /* destroy the QP if the registration list is empty */ + if (list_empty(&mibqp->xrc_reg_list)) + mlx4_ib_destroy_qp(&mibqp->ibqp); + + mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex); + return 0; + +err_out: + mutex_unlock(&to_mdev(xrcd->device)->xrc_reg_mutex); + return err; +} + diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 818b7ecace5e3..f0a4262da4b6a 100644 --- 
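The mlx4_ib_reg_xrc_rcv_qp()/mlx4_ib_unreg_xrc_rcv_qp() pair above keeps a per-QP list of registered user contexts: registering the same context twice is a no-op, and the XRC receive QP is destroyed only when the last registered context goes away. The self-contained userspace sketch below models just that list discipline; the plain linked list and all names here are illustrative only (the driver uses list_head entries under qp->mutex and the device xrc_reg_mutex).

#include <stdio.h>
#include <stdlib.h>

struct reg_entry {
        void *context;
        struct reg_entry *next;
};

struct xrc_rcv_qp_model {
        struct reg_entry *reg_list;     /* mirrors qp->xrc_reg_list */
        int destroyed;
};

static int model_reg(struct xrc_rcv_qp_model *qp, void *context)
{
        struct reg_entry *e;

        for (e = qp->reg_list; e; e = e->next)
                if (e->context == context)
                        return 0;       /* already registered: no-op */

        e = malloc(sizeof(*e));
        if (!e)
                return -1;
        e->context = context;
        e->next = qp->reg_list;
        qp->reg_list = e;
        return 0;
}

static int model_unreg(struct xrc_rcv_qp_model *qp, void *context)
{
        struct reg_entry **pp, *e;

        for (pp = &qp->reg_list; (e = *pp); pp = &e->next) {
                if (e->context == context) {
                        *pp = e->next;
                        free(e);
                        if (!qp->reg_list)
                                qp->destroyed = 1; /* last user gone */
                        return 0;
                }
        }
        return -1;      /* context was never registered */
}

int main(void)
{
        struct xrc_rcv_qp_model qp = { NULL, 0 };
        int a, b;

        model_reg(&qp, &a);
        model_reg(&qp, &a);     /* idempotent */
        model_reg(&qp, &b);
        model_unreg(&qp, &a);
        model_unreg(&qp, &b);
        printf("destroyed=%d\n", qp.destroyed); /* destroyed=1 */
        return 0;
}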
a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -33,7 +33,6 @@ #include #include -#include #include "mlx4_ib.h" #include "user.h" @@ -68,14 +67,17 @@ static void mlx4_ib_srq_event(struct mlx4_srq *srq, enum mlx4_event type) } } -struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, - struct ib_srq_init_attr *init_attr, - struct ib_udata *udata) +struct ib_srq *mlx4_ib_create_xrc_srq(struct ib_pd *pd, + struct ib_cq *xrc_cq, + struct ib_xrcd *xrcd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(pd->device); struct mlx4_ib_srq *srq; struct mlx4_wqe_srq_next_seg *next; - struct mlx4_wqe_data_seg *scatter; + u32 cqn; + u16 xrcdn; int desc_size; int buf_size; int err; @@ -83,8 +85,12 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, /* Sanity check SRQ size before proceeding */ if (init_attr->attr.max_wr >= dev->dev->caps.max_srq_wqes || - init_attr->attr.max_sge > dev->dev->caps.max_srq_sge) + init_attr->attr.max_sge > dev->dev->caps.max_srq_sge) { + mlx4_ib_dbg("a size param is out of range. " + "max_wr = 0x%x, max_sge = 0x%x", + init_attr->attr.max_wr, init_attr->attr.max_sge); return ERR_PTR(-EINVAL); + } srq = kmalloc(sizeof *srq, GFP_KERNEL); if (!srq) @@ -119,7 +125,8 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, } err = mlx4_mtt_init(dev->dev, ib_umem_page_count(srq->umem), - ilog2(srq->umem->page_size), &srq->mtt); + ilog2(srq->umem->page_size), &srq->mtt, + MLX4_MR_FLAG_NONE); if (err) goto err_buf; @@ -132,6 +139,8 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, if (err) goto err_mtt; } else { + struct mlx4_wqe_data_seg *scatter; + err = mlx4_db_alloc(dev->dev, &srq->db, 0); if (err) goto err_srq; @@ -159,7 +168,7 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, } err = mlx4_mtt_init(dev->dev, srq->buf.npages, srq->buf.page_shift, - &srq->mtt); + &srq->mtt, MLX4_MR_FLAG_NONE); if (err) goto err_buf; @@ -174,18 +183,24 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, } } - err = mlx4_srq_alloc(dev->dev, to_mpd(pd)->pdn, &srq->mtt, + cqn = xrc_cq ? (u32) (to_mcq(xrc_cq)->mcq.cqn) : 0; + xrcdn = xrcd ? (u16) (to_mxrcd(xrcd)->xrcdn) : + (u16) dev->dev->caps.reserved_xrcds; + + err = mlx4_srq_alloc(dev->dev, to_mpd(pd)->pdn, cqn, xrcdn, &srq->mtt, srq->db.dma, &srq->msrq); if (err) goto err_wrid; srq->msrq.event = mlx4_ib_srq_event; - if (pd->uobject) + if (pd->uobject) { if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof (__u32))) { err = -EFAULT; goto err_wrid; } + } else + srq->ibsrq.xrc_srq_num = srq->msrq.srqn; init_attr->attr.max_wr = srq->msrq.max - 1; @@ -198,7 +213,7 @@ err_wrid: kfree(srq->wrid); err_mtt: - mlx4_mtt_cleanup(dev->dev, &srq->mtt); + mlx4_mtt_cleanup(dev->dev, &srq->mtt, MLX4_MR_FLAG_NONE); err_buf: if (pd->uobject) @@ -224,12 +239,16 @@ int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, int ret; /* We don't support resizing SRQs (yet?) 
*/ - if (attr_mask & IB_SRQ_MAX_WR) + if (attr_mask & IB_SRQ_MAX_WR) { + mlx4_ib_dbg("resize not yet supported"); return -EINVAL; + } if (attr_mask & IB_SRQ_LIMIT) { - if (attr->srq_limit >= srq->msrq.max) + if (attr->srq_limit >= srq->msrq.max){ + mlx4_ib_dbg("limit (0x%x) too high", attr->srq_limit); return -EINVAL; + } mutex_lock(&srq->mutex); ret = mlx4_srq_arm(dev->dev, &srq->msrq, attr->srq_limit); @@ -242,6 +261,13 @@ int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, return 0; } +struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) +{ + return mlx4_ib_create_xrc_srq(pd, NULL, NULL, init_attr, udata); +} + int mlx4_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) { struct mlx4_ib_dev *dev = to_mdev(ibsrq->device); @@ -264,9 +290,21 @@ int mlx4_ib_destroy_srq(struct ib_srq *srq) { struct mlx4_ib_dev *dev = to_mdev(srq->device); struct mlx4_ib_srq *msrq = to_msrq(srq); + struct mlx4_ib_cq *cq; + + mlx4_srq_invalidate(dev->dev, &msrq->msrq); + + if (srq->xrc_cq && !srq->uobject) { + cq = to_mcq(srq->xrc_cq); + spin_lock_irq(&cq->lock); + __mlx4_ib_cq_clean(cq, -1, msrq); + mlx4_srq_remove(dev->dev, &msrq->msrq); + spin_unlock_irq(&cq->lock); + } else + mlx4_srq_remove(dev->dev, &msrq->msrq); mlx4_srq_free(dev->dev, &msrq->msrq); - mlx4_mtt_cleanup(dev->dev, &msrq->mtt); + mlx4_mtt_cleanup(dev->dev, &msrq->mtt, MLX4_MR_FLAG_NONE); if (srq->uobject) { mlx4_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db); @@ -312,12 +350,16 @@ int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, for (nreq = 0; wr; ++nreq, wr = wr->next) { if (unlikely(wr->num_sge > srq->msrq.max_gs)) { + mlx4_ib_dbg("srq num 0x%x: num s/g entries too large (%d)", + srq->msrq.srqn, wr->num_sge); err = -EINVAL; *bad_wr = wr; break; } if (unlikely(srq->head == srq->tail)) { + mlx4_ib_dbg("srq num 0x%x: No entries available to post.", + srq->msrq.srqn); err = -ENOMEM; *bad_wr = wr; break; diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c new file mode 100644 index 0000000000000..610709075d437 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/sysfs.c @@ -0,0 +1,801 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. + * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/*#include "core_priv.h"*/ +#include "mlx4_ib.h" +#include "alias_GUID.h" +#include +#include + +#include +/*The function returns the administartively value of that GUID. +meaning, the value that was setted by the administrator. +Values: + 0 - let the opensm to assign. + 0xff - delete this entry. + other - assigned by administrator. +*/ +static ssize_t show_admin_alias_guid(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int record_num;/*0-15*/ + int guid_index_in_rec; /*0 - 7*/ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + + record_num = mlx4_ib_iov_dentry->entry_num / 8 ; + guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8 ; + + return sprintf(buf, "%llx\n", + be64_to_cpu(mdev->sriov.alias_guid.ports_guid[port->num - 1]. + all_rec_per_port[record_num].all_recs[guid_index_in_rec])); +} + +/*The function stors the (new)administartively value of that GUID. +Values: + 0 - let the opensm to assign. + 0xff - delete this entry. + other - assigned by administrator. +*/ +static ssize_t store_admin_alias_guid(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int record_num;/*0-15*/ + int guid_index_in_rec; /*0 - 7*/ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + u64 sysadmin_ag_val; + + record_num = mlx4_ib_iov_dentry->entry_num / 8; + guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8; + if (0 == record_num && 0 == guid_index_in_rec) { + printk(KERN_ERR "GUID 0 block 0 is RO\n"); + return count; + } + + sscanf(buf,"%llx", &sysadmin_ag_val); + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num]. 
+ all_recs[guid_index_in_rec] = cpu_to_be64(sysadmin_ag_val); + + /*change the state to be be pending for update*/ + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].status + = MLX4_GUID_INFO_STATUS_IDLE ; + + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].method + = MLX4_GUID_INFO_RECORD_SET; + + /*set the method, is it set or delete*/ + switch (sysadmin_ag_val) { + case MLX4_GUID_FOR_DELETE_VAL: + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].method + = MLX4_GUID_INFO_RECORD_DELETE; + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership + = MLX4_GUID_SYSADMIN_ASSIGN; + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].guid_indexes = 0; + break; + /*if the sysadmin asks for the SM to re-assign:*/ + case MLX4_NOT_SET_GUID: + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership + = MLX4_GUID_DRIVER_ASSIGN; + break; + /*The sysadmin asks for specific value.*/ + default: + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership + = MLX4_GUID_SYSADMIN_ASSIGN; + break; + } + + /*set the record index*/ + + mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].guid_indexes + |= get_alias_guid_comp_mask_from_index(guid_index_in_rec); + + init_alias_guid_work(mdev, port->num - 1); + + return count; +} + +static ssize_t show_port_gid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + union ib_gid gid; + ssize_t ret; + + ret = mlx4_ib_get_indexed_gid(&mdev->ib_dev, + port->num, + mlx4_ib_iov_dentry->entry_num, + &gid); + if (ret) + return ret; + ret = sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + be16_to_cpu(((__be16 *) gid.raw)[0]), + be16_to_cpu(((__be16 *) gid.raw)[1]), + be16_to_cpu(((__be16 *) gid.raw)[2]), + be16_to_cpu(((__be16 *) gid.raw)[3]), + be16_to_cpu(((__be16 *) gid.raw)[4]), + be16_to_cpu(((__be16 *) gid.raw)[5]), + be16_to_cpu(((__be16 *) gid.raw)[6]), + be16_to_cpu(((__be16 *) gid.raw)[7])); + return ret; +} + +static ssize_t show_phys_port_pkey(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = + container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); + struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; + struct mlx4_ib_dev *mdev = port->dev; + u16 pkey; + ssize_t ret; + + ret = mlx4_ib_query_pkey(&mdev->ib_dev, port->num, + mlx4_ib_iov_dentry->entry_num, &pkey); + if (ret) + return ret; + + return sprintf(buf, "0x%04x\n", pkey); +} + +#define DENTRY_REMOVE(_dentry) \ +do { \ + sysfs_remove_file((_dentry)->kobj, &(_dentry)->dentry.attr); \ +} while (0); + +static int create_sysfs_entry(void *_ctx, struct mlx4_ib_iov_sysfs_attr *_dentry, + char *_name, struct kobject *_kobj, + ssize_t (*show)(struct device *dev, + struct device_attribute *attr, + char *buf), + ssize_t (*store)(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) + ) +{ + int ret = 0; + struct mlx4_ib_iov_sysfs_attr *vdentry = _dentry; + + vdentry->ctx = _ctx; + vdentry->dentry.show = show; + vdentry->dentry.store = store; + vdentry->dentry.attr.name = vdentry->name; + vdentry->dentry.attr.mode = 0; + vdentry->kobj = 
_kobj; + snprintf(vdentry->name, 15, "%s", _name); + + if (vdentry->dentry.store) + vdentry->dentry.attr.mode |= S_IWUGO; + + if (vdentry->dentry.show) + vdentry->dentry.attr.mode |= S_IRUGO; + + ret = sysfs_create_file(vdentry->kobj, &vdentry->dentry.attr); + if (ret) { + printk(KERN_ERR "failed to create %s\n", vdentry->dentry.attr.name); + vdentry->ctx = NULL; + return ret; + } + + return ret; +} + +int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr) +{ + struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1]; + int ret; + + ret = sysfs_create_file(port->mcgs_parent, attr); + if (ret) + printk(KERN_ERR "failed to create %s\n", attr->name); + + return ret; +} + +void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, + struct attribute *attr) +{ + struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1]; + + sysfs_remove_file(port->mcgs_parent, attr); +} + +static int add_port_entries(struct mlx4_ib_dev *device, int port_num) +{ + int i; + char buff[10]; + struct mlx4_ib_iov_port *port = NULL; + int ret = 0 ; + struct ib_port_attr attr; + /*get the table size.*/ + ret = mlx4_ib_query_port(&device->ib_dev, port_num, &attr); + if (ret) + goto err; + + port = &device->iov_ports[port_num - 1]; + port->dev = device; + port->num = port_num; +/* iov - + port num - + --admin_guids + --(operational)gids + --mcg_table +*/ + port->dentr_ar = kzalloc(sizeof (struct mlx4_ib_iov_sysfs_attr_ar), + GFP_KERNEL); + if (!port->dentr_ar) { + printk(KERN_ERR "add_port_entries: could not allocate dentry array\n"); + ret = -ENOMEM; + goto err; + } + sprintf(buff, "%d", port_num); + port->cur_port = kobject_create_and_add(buff, + kobject_get(device->ports_parent)); + if (!port->cur_port) { + ret = -ENOMEM; + goto kobj_create_err; + } + /*setting the admin GUID*/ + port->admin_alias_parent = kobject_create_and_add("admin_guids", + kobject_get(port->cur_port)); + if (!port->admin_alias_parent) { + ret = -ENOMEM; + goto err_admin_guids1; + } + + for (i = 0 ; i < attr.gid_tbl_len; i++) { + sprintf(buff, "%d",i); + port->dentr_ar->dentries[i].entry_num = i; + ret = create_sysfs_entry(port, &port->dentr_ar->dentries[i], + buff, port->admin_alias_parent, + show_admin_alias_guid, store_admin_alias_guid); + if (ret) + goto err_admin_guids2; + } + + /*setting the operational GUID*/ + port->gids_parent = kobject_create_and_add("gids", + kobject_get(port->cur_port)); + if (!port->gids_parent) { + ret = -ENOMEM; + goto err_gids1; + } + + for (i = 0 ; i < attr.gid_tbl_len; i++) { + sprintf(buff, "%d",i); + port->dentr_ar->dentries[attr.gid_tbl_len + i].entry_num = i; + ret = create_sysfs_entry(port, + &port->dentr_ar->dentries[attr.gid_tbl_len + i], + buff, + port->gids_parent, show_port_gid, NULL); + if (ret) + goto err_gids2; + } + + /* physical port pkey table */ + port->pkeys_parent = kobject_create_and_add("pkeys", + kobject_get(port->cur_port)); + if (!port->pkeys_parent) { + ret = -ENOMEM; + goto err_pkeys1; + } + + for (i = 0 ; i < attr.pkey_tbl_len; i++) { + sprintf(buff, "%d",i); + port->dentr_ar->dentries[2 * attr.gid_tbl_len + i].entry_num = i; + ret = create_sysfs_entry(port, + &port->dentr_ar->dentries[2 * attr.gid_tbl_len + i], + buff, port->pkeys_parent, + show_phys_port_pkey, NULL); + if (ret) + goto err_pkeys2; + } + + /* MCGs table */ + port->mcgs_parent = kobject_create_and_add("mcgs", + kobject_get(port->cur_port)); + if (!port->mcgs_parent) { + ret = -ENOMEM; + goto err_mcgs1; + } + + return 0 ; + +err_mcgs1: +err_pkeys2: + 
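show_admin_alias_guid()/store_admin_alias_guid() above treat each sysfs entry number as GUIDInfo record entry_num / 8, index entry_num % 8, and interpret the written value as: 0 — let the SM assign, the delete sentinel (described as 0xff in the comment; the value of MLX4_GUID_FOR_DELETE_VAL is not visible in this hunk) — remove the entry, anything else — sysadmin-assigned. A small userspace sketch of that mapping, with assumed constant values:

#include <stdio.h>
#include <stdint.h>

/* Hypothetical constants standing in for the patch's MLX4_NOT_SET_GUID
 * and MLX4_GUID_FOR_DELETE_VAL; all-ones for "delete" is an assumption. */
#define NOT_SET_GUID        0x0ULL
#define GUID_FOR_DELETE_VAL 0xffffffffffffffffULL

static void classify_admin_guid(unsigned int entry_num, uint64_t val)
{
        /* Each GUIDInfo record holds 8 GUIDs, so entry N lives in record
         * N / 8 at index N % 8 — the arithmetic used by the handlers above. */
        unsigned int record_num = entry_num / 8;
        unsigned int guid_index_in_rec = entry_num % 8;

        printf("entry %u -> record %u, index %u: ", entry_num,
               record_num, guid_index_in_rec);

        if (val == NOT_SET_GUID)
                printf("SM-assigned\n");
        else if (val == GUID_FOR_DELETE_VAL)
                printf("delete request\n");
        else
                printf("sysadmin-assigned 0x%llx\n", (unsigned long long)val);
}

int main(void)
{
        classify_admin_guid(0, 0);                      /* record 0, index 0 */
        classify_admin_guid(13, 0x0002c90300ULL);       /* record 1, index 5 */
        classify_admin_guid(13, GUID_FOR_DELETE_VAL);
        return 0;
}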
kobject_put(port->pkeys_parent); + +err_pkeys1: +err_gids2: + kobject_put(port->gids_parent); + +err_gids1: + +err_admin_guids2: + kobject_put(port->admin_alias_parent); + +err_admin_guids1: + kobject_put(port->cur_port); + +kobj_create_err: + kfree(port->dentr_ar); + +err: + printk(KERN_ERR "add_port_entries FAILED: for port:%d, error: %d\n", + port_num, ret); + return ret; +} + +static void get_name(struct mlx4_ib_dev *dev, char *name, int i, int max) +{ + char base_name[9]; + + /*pci_name format is: bus:dev:func -> xxxx:yy:zz.n*/ + strlcpy(name, pci_name(dev->dev->pdev), max); + strncpy(base_name, name,8); /*till xxxx:yy:*/ + base_name[8] ='\0'; + /*with no ARI only 3 last bits are used so when the fn it higher than 8 + need to add it to the dev num, so till 8 wil be count in the last number*/ + sprintf(name, "%s%.2d.%d", base_name,(i/8), (i%8)); +} + +struct mlx4_port { + struct kobject kobj; + struct mlx4_ib_dev *dev; + struct attribute_group pkey_group; + struct attribute_group gid_group; + u8 port_num; + int slave; +}; + + +static void mlx4_port_release(struct kobject *kobj) +{ + struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj); + struct attribute *a; + int i; + + for (i = 0; (a = p->pkey_group.attrs[i]); ++i) + kfree(a); + + kfree(p->pkey_group.attrs); + + for (i = 0; (a = p->gid_group.attrs[i]); ++i) + kfree(a); + + kfree(p->gid_group.attrs); + + kfree(p); +} + +struct port_attribute { + struct attribute attr; + ssize_t (*show)(struct mlx4_port *, struct port_attribute *, char *buf); + ssize_t (*store)(struct mlx4_port *, struct port_attribute *, + const char *buf, size_t count); +}; + +static ssize_t port_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj); + + if (!port_attr->show) + return -EIO; + + return port_attr->show(p, port_attr, buf); +} + +static ssize_t port_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t size) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj); + + if (!port_attr->store) + return -EIO; + + return port_attr->store(p, port_attr, buf, size); +} + +static struct sysfs_ops port_sysfs_ops = { + .show = port_attr_show, + .store = port_attr_store, +}; + +static struct kobj_type port_type = { + .release = mlx4_port_release, + .sysfs_ops = &port_sysfs_ops, +}; + +struct port_table_attribute { + struct port_attribute attr; + char name[8]; + int index; +}; + +static ssize_t show_port_pkey(struct mlx4_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + ssize_t ret = -ENODEV; + + if (p->dev->pkeys.virt2phys_pkey[p->slave][p->port_num - 1][tab_attr->index] >= + (p->dev->dev->caps.pkey_table_len[p->port_num])) + ret = sprintf(buf, "none\n"); + else + ret = sprintf(buf, "%d\n", + p->dev->pkeys.virt2phys_pkey[p->slave] + [p->port_num - 1][tab_attr->index]); + + + return ret; +} + +static ssize_t store_port_pkey(struct mlx4_port *p, struct port_attribute *attr, + const char *buf, size_t count) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + int idx; + int err; + + /* do not allow remapping Dom0 virtual pkey table */ + if (p->slave == p->dev->dev->caps.function) + return -EINVAL; + + if 
(!strncasecmp(buf, "no", 2)) + idx = p->dev->dev->caps.pkey_table_max_len[p->port_num] - 1; + else if (sscanf(buf, "%i", &idx) != 1 || + idx >= p->dev->dev->caps.pkey_table_len[p->port_num] || + idx < 0) + return -EINVAL; + + p->dev->pkeys.virt2phys_pkey[p->slave][p->port_num - 1][tab_attr->index] = idx; + mlx4_sync_pkey_table(p->dev->dev, p->slave, p->port_num, tab_attr->index, idx); + err = mlx4_gen_pkey_eqe(p->dev->dev, p->slave, p->port_num); + if (err) { + printk("mlx4_gen_pkey_eqe failed for slave %d, port %d, index %d\n", + p->slave, p->port_num, idx); + return err; + } + + return count; +} + +static ssize_t show_port_gid_idx(struct mlx4_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + + return sprintf(buf, "%d\n", ACT_GID_INDEX(p->dev->dev, tab_attr->index, p->slave)); +} + + +static struct attribute ** +alloc_group_attrs(ssize_t (*show)(struct mlx4_port *, + struct port_attribute *, char *buf), + ssize_t (*store)(struct mlx4_port *, struct port_attribute *, + const char *buf, size_t count), + int len) +{ + struct attribute **tab_attr; + struct port_table_attribute *element; + int i; + + tab_attr = kcalloc(1 + len, sizeof(struct attribute *), GFP_KERNEL); + if (!tab_attr) + return NULL; + + for (i = 0; i < len; i++) { + element = kzalloc(sizeof(struct port_table_attribute), + GFP_KERNEL); + if (!element) + goto err; + + if (snprintf(element->name, sizeof(element->name), + "%d", i) >= sizeof(element->name)) { + kfree(element); + goto err; + } + + element->attr.attr.name = element->name; + if (store) { + element->attr.attr.mode = S_IWUSR | S_IRUGO; + element->attr.store = store; + } else + element->attr.attr.mode = S_IRUGO; + + element->attr.show = show; + element->index = i; + + tab_attr[i] = &element->attr.attr; + } + + return tab_attr; + +err: + while (--i >= 0) + kfree(tab_attr[i]); + kfree(tab_attr); + return NULL; +} + +static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave) +{ + struct mlx4_port *p; + int i; + int ret; + + p = kzalloc(sizeof *p, GFP_KERNEL); + if (!p) + return -ENOMEM; + + p->dev = dev; + p->port_num = port_num; + p->slave = slave; + + ret = kobject_init_and_add(&p->kobj, &port_type, + kobject_get(dev->dev_ports_parent[slave]), + "%d", port_num); + if (ret) + goto err_alloc; + + p->pkey_group.name = "pkey_idx"; + p->pkey_group.attrs = alloc_group_attrs(show_port_pkey, store_port_pkey, + dev->dev->caps.pkey_table_len[port_num]); + if (!p->pkey_group.attrs) + goto err_put; + + ret = sysfs_create_group(&p->kobj, &p->pkey_group); + if (ret) + goto err_free_pkey; + + p->gid_group.name = "gid_idx"; + p->gid_group.attrs = alloc_group_attrs(show_port_gid_idx, NULL, dev->dev->gids_per_func); + if (!p->gid_group.attrs) + goto err_free_pkey; + + ret = sysfs_create_group(&p->kobj, &p->gid_group); + if (ret) + goto err_free_gid; + + list_add_tail(&p->kobj.entry, &dev->pkeys.pkey_port_list[slave]); + return 0; + +err_free_gid: + for (i = 0; i < dev->dev->gids_per_func; ++i) + kfree(p->gid_group.attrs[i]); + + kfree(p->gid_group.attrs); + +err_free_pkey: + for (i = 0; i < dev->dev->caps.pkey_table_len[port_num]; ++i) + kfree(p->pkey_group.attrs[i]); + + kfree(p->pkey_group.attrs); + +err_put: + kobject_put(dev->dev_ports_parent[slave]); + +err_alloc: + kfree(p); + + return ret; +} + +static int register_one_pkey_tree(struct mlx4_ib_dev *device, int slave) +{ + char name[32]; + int err; + int port; + + get_name(device, name, slave, sizeof name); + + 
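get_name() above derives the per-slave sysfs directory name from the PF's PCI name: without ARI only three function bits exist, so slave i becomes device (i / 8), function (i % 8) under the same bus prefix. A userspace re-implementation, for illustration only:

#include <stdio.h>
#include <string.h>

static void slave_pci_name(const char *pci_name, int i, char *out, size_t max)
{
        char base[9];   /* "xxxx:yy:" */

        strncpy(base, pci_name, 8);
        base[8] = '\0';
        snprintf(out, max, "%s%.2d.%d", base, i / 8, i % 8);
}

int main(void)
{
        char name[32];

        slave_pci_name("0000:07:00.0", 0, name, sizeof(name));
        printf("%s\n", name);   /* 0000:07:00.0 */
        slave_pci_name("0000:07:00.0", 10, name, sizeof(name));
        printf("%s\n", name);   /* 0000:07:01.2 */
        return 0;
}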
device->pkeys.device_parent[slave] = kobject_create_and_add(name, + kobject_get(device->iov_parent)); + if (!device->pkeys.device_parent[slave]) { + err = -ENOMEM; + goto fail_dev; + } + + INIT_LIST_HEAD(&device->pkeys.pkey_port_list[slave]); + + device->dev_ports_parent[slave] = kobject_create_and_add("ports", + kobject_get(device->pkeys.device_parent[slave])); + if (!device->dev_ports_parent[slave]) { + err = -ENOMEM; + goto err_ports; + } + + for (port = 1; port <= device->dev->caps.num_ports; ++port) { + err = add_port(device, port, slave); + if (err) + goto err_add; + } + + return 0; + +err_add: + { + struct kobject *p, *t; + struct mlx4_port *port; + + list_for_each_entry_safe(p, t, &device->pkeys.pkey_port_list[slave], entry) { + list_del(&p->entry); + port = container_of(p, struct mlx4_port, kobj); + sysfs_remove_group(p, &port->pkey_group); + sysfs_remove_group(p, &port->gid_group); + kobject_put(p); + } + } + kobject_put(device->dev_ports_parent[slave]); +err_ports: + kobject_put(device->pkeys.device_parent[slave]); + +fail_dev: + return err; +} + +static int register_pkey_tree(struct mlx4_ib_dev *device) +{ + int i; + + if (!device->dev->caps.sqp_demux) + return 0; + + for (i = 0; i <= device->dev->sr_iov; ++i) + register_one_pkey_tree(device, i); + + return 0; +} + +static void unregister_pkey_tree(struct mlx4_ib_dev *device) +{ + int slave; + struct kobject *p, *t; + struct mlx4_port *port; + + if (!device->dev->caps.sqp_demux) + return; + + for (slave = device->dev->sr_iov; slave >= 0; --slave) { + list_for_each_entry_safe(p, t, &device->pkeys.pkey_port_list[slave], entry) { + list_del(&p->entry); + port = container_of(p, struct mlx4_port, kobj); + sysfs_remove_group(p, &port->pkey_group); + sysfs_remove_group(p, &port->gid_group); + kobject_put(p); + kobject_put(device->dev_ports_parent[slave]); + } + kobject_put(device->dev_ports_parent[slave]); + kobject_put(device->pkeys.device_parent[slave]); + kobject_put(device->pkeys.device_parent[slave]); + kobject_put(device->iov_parent); + } +} + +int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *device) +{ + + int i; + int ret = 0; + + if (!device->dev->caps.sqp_demux) + return 0; + + device->iov_parent = kobject_create_and_add("iov", + kobject_get(device->ib_dev.ports_parent->parent)); + if (!device->iov_parent) { + ret = -ENOMEM; + goto err; + } + device->ports_parent = kobject_create_and_add("ports", + kobject_get(device->iov_parent)); + if (!device->iov_parent) { + ret = -ENOMEM; + goto err_port; + } + for (i = 1; i <= device->ib_dev.phys_port_cnt; ++i) { + ret = add_port_entries(device, i); + if (ret) + goto err_port; + } + + ret = register_pkey_tree(device); + if (ret) + goto err_pkey; + + return ret; + + +err_pkey: + +err_port: + kobject_put(device->ib_dev.ports_parent->parent); +err: + printk(KERN_ERR "mlx4_ib_device_register_sysfs Error\n"); + return ret; + +} +void unregister_alias_guid_tree(struct mlx4_ib_dev *device) +{ + struct mlx4_ib_iov_port *p; + int i; + + if (!device->dev->caps.sqp_demux) + return; + + for (i = 0; i < MLX4_MAX_PORTS; i++) { + p = &device->iov_ports[i]; + kobject_put(p->admin_alias_parent); + kobject_put(p->gids_parent); + kobject_put(p->pkeys_parent); + kobject_put(p->mcgs_parent); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->cur_port); + kobject_put(p->dev->ports_parent); + kfree(p->dentr_ar); + } +} + +void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device) +{ + unregister_alias_guid_tree(device); 
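The sysfs registration code above always passes kobject_get(parent) into kobject_create_and_add(), so every child directory pins an extra reference on its parent; that is why the teardown paths (unregister_alias_guid_tree(), unregister_pkey_tree(), mlx4_ib_device_unregister_sysfs()) issue one kobject_put() per child created in addition to the final put of the object itself. A toy refcount model (not kobject) of that bookkeeping:

#include <stdio.h>

struct toy_obj {
        const char *name;
        int refcount;
};

static struct toy_obj *toy_get(struct toy_obj *o)
{
        o->refcount++;
        return o;
}

static void toy_put(struct toy_obj *o)
{
        if (--o->refcount == 0)
                printf("%s released\n", o->name);
}

int main(void)
{
        struct toy_obj parent = { "iov", 1 };

        /* three children created against toy_get(&parent) */
        toy_get(&parent);
        toy_get(&parent);
        toy_get(&parent);

        /* teardown: one put per child, plus the original reference */
        toy_put(&parent);
        toy_put(&parent);
        toy_put(&parent);
        toy_put(&parent);       /* prints "iov released" */
        return 0;
}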
+ unregister_pkey_tree(device); + kobject_put(device->ports_parent); + kobject_put(device->iov_parent); + kobject_put(device->iov_parent); + kobject_put(device->ib_dev.ports_parent->parent); +} diff --git a/drivers/infiniband/hw/mlx4/wc.c b/drivers/infiniband/hw/mlx4/wc.c new file mode 100644 index 0000000000000..827de14a068c9 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/wc.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2006-2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "wc.h" + +#if defined(__i386__) || defined(__x86_64__) + +pgprot_t pgprot_wc(pgprot_t _prot) +{ + return pgprot_writecombine(_prot); +} + +int mlx4_wc_enabled(void) +{ + return 1; +} + +#elif defined(CONFIG_PPC64) + +pgprot_t pgprot_wc(pgprot_t _prot) +{ + return __pgprot((pgprot_val(_prot) | _PAGE_NO_CACHE) & + ~(pgprot_t)_PAGE_GUARDED); +} + +int mlx4_wc_enabled(void) +{ + return 1; +} + +#else /* !(defined(__i386__) || defined(__x86_64__)) */ + +pgprot_t pgprot_wc(pgprot_t _prot) +{ + return pgprot_noncached(_prot); +} + +int mlx4_wc_enabled(void) +{ + return 0; +} + +#endif + diff --git a/drivers/infiniband/hw/mlx4/wc.h b/drivers/infiniband/hw/mlx4/wc.h new file mode 100644 index 0000000000000..f32fe1ee55e76 --- /dev/null +++ b/drivers/infiniband/hw/mlx4/wc.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2006-2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
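wc.c above selects a write-combining page protection on x86 and PPC64 and falls back to uncached mappings elsewhere, with mlx4_wc_enabled() reporting whether WC is actually usable. A hedged usage sketch (assumed caller, not taken from the patch) of how an mmap handler might consume these helpers when mapping a doorbell/BlueFlame page:

#include <linux/mm.h>
#include "wc.h"

static int example_mmap_bf(struct vm_area_struct *vma, unsigned long pfn)
{
        /* prefer write-combining when the architecture supports it */
        vma->vm_page_prot = mlx4_wc_enabled() ?
                pgprot_wc(vma->vm_page_prot) :
                pgprot_noncached(vma->vm_page_prot);

        /* one page, mapped with the protection chosen above */
        return io_remap_pfn_range(vma, vma->vm_start, pfn,
                                  PAGE_SIZE, vma->vm_page_prot);
}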
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef mlx4_WC_H +#define mlx4_WC_H + +#include + +int mlx4_wc_enabled(void); +pgprot_t pgprot_wc(pgprot_t _prot); + +#endif diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c index 7bfa2a1649551..5510f8c99208c 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -1817,7 +1817,7 @@ int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn, case IB_QPT_RAW_IPV6: op_mod = 2; break; - case IB_QPT_RAW_ETHERTYPE: + case IB_QPT_RAW_ETY: op_mod = 3; break; default: diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 1e0b4b6074ad0..f080a784bc795 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1403,7 +1403,7 @@ int mthca_register_device(struct mthca_dev *dev) mutex_init(&dev->cap_mask_mutex); - ret = ib_register_device(&dev->ib_dev, NULL); + ret = ib_register_device(&dev->ib_dev); if (ret) return ret; diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 95ca93ceedac9..593d9121901f4 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -4008,7 +4008,7 @@ int nes_register_ofa_device(struct nes_ib_device *nesibdev) struct nes_adapter *nesadapter = nesdev->nesadapter; int i, ret; - ret = ib_register_device(&nesvnic->nesibdev->ibdev, NULL); + ret = ib_register_device(&nesvnic->nesibdev->ibdev); if (ret) { return ret; } diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 9fab404888505..052824ebeeb6e 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -2160,7 +2160,7 @@ int qib_register_ib_device(struct qib_devdata *dd) snprintf(ibdev->node_desc, sizeof(ibdev->node_desc), QIB_IDSTR " %s", init_utsname()->nodename); - ret = ib_register_device(ibdev, qib_create_port_files); + ret = ib_register_device(ibdev); if (ret) goto err_reg; diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 936804efb7768..8a5c4c97a37c9 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -51,7 +51,7 @@ #include #include #include -#include +#include /* constants */ @@ -71,8 +71,8 @@ enum { IPOIB_CM_BUF_SIZE = IPOIB_CM_MTU + IPOIB_ENCAP_LEN, IPOIB_CM_HEAD_SIZE = IPOIB_CM_BUF_SIZE % PAGE_SIZE, IPOIB_CM_RX_SG = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE, - IPOIB_RX_RING_SIZE = 256, - IPOIB_TX_RING_SIZE = 128, + IPOIB_RX_RING_SIZE = 512, + IPOIB_TX_RING_SIZE = 512, IPOIB_MAX_QUEUE_SIZE = 8192, IPOIB_MIN_QUEUE_SIZE = 2, IPOIB_CM_MAX_CONN_QP = 4096, @@ -92,13 +92,24 @@ enum { IPOIB_STOP_REAPER = 7, IPOIB_FLAG_ADMIN_CM = 9, IPOIB_FLAG_UMCAST = 10, + IPOIB_FLAG_CSUM = 11, + IPOIB_MCAST_RUN_GC = 12, + IPOIB_FLAG_AUTO_MODER = 13, /*indicates moderation is running*/ + IPOIB_STOP_NEIGH_GC = 14, + IPOIB_NEIGH_TBL_FLUSH = 15, IPOIB_MAX_BACKOFF_SECONDS = 16, + 
IPOIB_FLAG_MODULE_DOWN = 17, /*indicates module is his way down*/ IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */ IPOIB_MCAST_FLAG_SENDONLY = 1, IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ IPOIB_MCAST_FLAG_ATTACHED = 3, + IPOIB_MCAST_JOIN_STARTED = 4, + IPOIB_MCAST_UMCAST_ATTACHED = 5, + + IPOIB_MAX_LRO_DESCRIPTORS = 8, + IPOIB_LRO_MAX_AGGR = 64, MAX_SEND_CQE = 16, IPOIB_CM_COPYBREAK = 256, @@ -133,6 +144,7 @@ struct ipoib_mcast { struct list_head list; unsigned long created; + unsigned long used; unsigned long backoff; unsigned long flags; @@ -143,6 +155,7 @@ struct ipoib_mcast { struct sk_buff_head pkt_queue; struct net_device *dev; + struct completion done; }; struct ipoib_rx_buf { @@ -160,6 +173,13 @@ struct ipoib_cm_tx_buf { u64 mapping; }; +/* in order to call dst->ops->update_pmtu out of spin-lock*/ +struct ipoib_pmtu_update { + struct work_struct work; + struct sk_buff *skb; + unsigned int mtu; +}; + struct ib_cm_id; struct ipoib_cm_data { @@ -255,9 +275,83 @@ struct ipoib_cm_dev_priv { int num_frags; }; + +struct ipoib_arp_repath { + struct work_struct work; + u16 lid; + union ib_gid sgid; + struct net_device *dev; +}; + +/* adaptive moderation parameters: */ +enum { + /* Target number of packets to coalesce with interrupt moderation */ + IPOIB_RX_COAL_TARGET = 88, + IPOIB_RX_COAL_TIME = 16, + IPOIB_TX_COAL_PKTS = 5, + IPOIB_TX_COAL_TIME = 0x80, + IPOIB_RX_RATE_LOW = 400000, + IPOIB_RX_COAL_TIME_LOW = 0, + IPOIB_RX_RATE_HIGH = 450000, + IPOIB_RX_COAL_TIME_HIGH = 128, + IPOIB_RX_SIZE_THRESH = 1024, + IPOIB_RX_RATE_THRESH = 1000000 / IPOIB_RX_COAL_TIME_HIGH, + IPOIB_SAMPLE_INTERVAL = 0, + IPOIB_AVG_PKT_SMALL = 256, + IPOIB_AUTO_CONF = 0xffff, + ADAPT_MODERATION_DELAY = HZ / 4, +}; + struct ipoib_ethtool_st { - u16 coalesce_usecs; + __u32 rx_max_coalesced_frames; + __u32 rx_coalesce_usecs; +/* u16 coalesce_usecs; u16 max_coalesced_frames; +*/ + __u32 pkt_rate_low; + __u32 pkt_rate_high; + __u32 rx_coalesce_usecs_low; + __u32 rx_coalesce_usecs_high; + __u32 rate_sample_interval; + __u32 use_adaptive_rx_coalesce; + int last_moder_time; + u16 sample_interval; + unsigned long last_moder_jiffies; + unsigned long last_moder_packets; + unsigned long last_moder_tx_packets; + unsigned long last_moder_bytes; +}; + +struct ipoib_lro { + struct net_lro_mgr lro_mgr; + struct net_lro_desc lro_desc[IPOIB_MAX_LRO_DESCRIPTORS]; +}; + +#define SOCK_ACCL_POLL_TCP 1UL << 28 +#define SOCK_ACCL_POLL_UDP 1UL << 29 + +struct sock_accl_ops { + void (*poll)(struct net_device *dev, int ring_num); + void (*get_tcp_ring)(struct net_device *dev, u8 *poll_ring, + u32 saddr, u32 daddr, u16 sport, u16 dport); + void (*get_udp_rings)(struct net_device *dev, u8 *poll_rings, + u8 *num_rings); +}; + +struct ipoib_neigh_table; +struct ipoib_neigh_hash { + struct ipoib_neigh_table *ntbl; + struct ipoib_neigh __rcu **buckets; + struct rcu_head rcu; + u32 mask; + u32 size; +}; + +struct ipoib_neigh_table { + struct ipoib_neigh_hash __rcu *htbl; + atomic_t entries; + struct completion flushed; + struct completion deleted; }; /* @@ -266,6 +360,10 @@ struct ipoib_ethtool_st { * of tx_lock (ie tx_lock must be acquired first if needed). 
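The IPOIB_RX_RATE_LOW/HIGH and IPOIB_RX_COAL_TIME_LOW/HIGH constants above parameterize the adaptive_moder_task added elsewhere in this patch. The sketch below assumes the usual mlx4_en-style linear ramp of the coalescing time between the two rate thresholds; this is an assumption for illustration — the actual policy lives in ipoib_main.c and is not part of this hunk.

#include <stdio.h>

enum {
        RX_RATE_LOW = 400000,           /* mirrors IPOIB_RX_RATE_LOW */
        RX_COAL_TIME_LOW = 0,
        RX_RATE_HIGH = 450000,
        RX_COAL_TIME_HIGH = 128,
};

static int pick_moder_time(unsigned long pkt_rate)
{
        if (pkt_rate <= RX_RATE_LOW)
                return RX_COAL_TIME_LOW;
        if (pkt_rate >= RX_RATE_HIGH)
                return RX_COAL_TIME_HIGH;
        /* linear interpolation between the two operating points */
        return (pkt_rate - RX_RATE_LOW) *
               (RX_COAL_TIME_HIGH - RX_COAL_TIME_LOW) /
               (RX_RATE_HIGH - RX_RATE_LOW) + RX_COAL_TIME_LOW;
}

int main(void)
{
        printf("%d %d %d\n", pick_moder_time(300000),
               pick_moder_time(425000), pick_moder_time(500000));
        /* prints: 0 64 128 */
        return 0;
}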
*/ struct ipoib_dev_priv { + + struct sock_accl_ops accl_priv; + spinlock_t rx_ring_lock; + spinlock_t lock; struct net_device *dev; @@ -279,18 +377,23 @@ struct ipoib_dev_priv { struct rb_root path_tree; struct list_head path_list; + struct ipoib_neigh_table ntbl; + struct ipoib_mcast *broadcast; struct list_head multicast_list; struct rb_root multicast_tree; struct delayed_work pkey_poll_task; - struct delayed_work mcast_task; + struct delayed_work mcast_join_task; + struct delayed_work mcast_leave_task; struct work_struct carrier_on_task; struct work_struct flush_light; struct work_struct flush_normal; struct work_struct flush_heavy; struct work_struct restart_task; struct delayed_work ah_reap_task; + struct delayed_work adaptive_moder_task; + struct delayed_work neigh_reap_task; struct ib_device *ca; u8 port; @@ -332,6 +435,7 @@ struct ipoib_dev_priv { struct net_device *parent; struct list_head child_intfs; struct list_head list; + int child_index; #ifdef CONFIG_INFINIBAND_IPOIB_CM struct ipoib_cm_dev_priv cm; @@ -345,6 +449,9 @@ struct ipoib_dev_priv { int hca_caps; struct ipoib_ethtool_st ethtool; struct timer_list poll_timer; + + struct ipoib_lro lro; + struct mutex state_lock; }; struct ipoib_ah { @@ -377,13 +484,16 @@ struct ipoib_neigh { #ifdef CONFIG_INFINIBAND_IPOIB_CM struct ipoib_cm_tx *cm; #endif - union ib_gid dgid; + u8 daddr[INFINIBAND_ALEN]; struct sk_buff_head queue; - struct neighbour *neighbour; struct net_device *dev; struct list_head list; + struct ipoib_neigh __rcu *hnext; + struct rcu_head rcu; + atomic_t refcnt; + unsigned long alive; }; #define IPOIB_UD_MTU(ib_mtu) (ib_mtu - IPOIB_ENCAP_LEN) @@ -394,26 +504,29 @@ static inline int ipoib_ud_need_sg(unsigned int ib_mtu) return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE; } -/* - * We stash a pointer to our private neighbour information after our - * hardware address in neigh->ha. The ALIGN() expression here makes - * sure that this pointer is stored aligned so that an unaligned - * load is not needed to dereference it. 
- */ -static inline struct ipoib_neigh **to_ipoib_neigh(struct neighbour *neigh) +void ipoib_neigh_dtor(struct ipoib_neigh *neigh); +static inline void ipoib_neigh_put(struct ipoib_neigh *neigh) { - return (void*) neigh + ALIGN(offsetof(struct neighbour, ha) + - INFINIBAND_ALEN, sizeof(void *)); + if (atomic_dec_and_test(&neigh->refcnt)) + ipoib_neigh_dtor(neigh); } - -struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neigh, +struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr); +struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, struct net_device *dev); -void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh); +void ipoib_neigh_free(struct ipoib_neigh *neigh); +void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid); extern struct workqueue_struct *ipoib_workqueue; +extern struct workqueue_struct *ipoib_auto_moder_workqueue; + +extern int ipoib_mc_sendonly_timeout; /* functions */ +void ipoib_get_tcp_ring(struct net_device *dev, u8 *poll_ring, u32 saddr, u32 daddr, u16 sport, u16 dport); +void ipoib_get_udp_rings(struct net_device *dev, u8 *poll_rings, u8 *num_rings); +void ipoib_accl_poll(struct net_device *dev, int ring_num); + int ipoib_poll(struct napi_struct *napi, int budget); void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr); void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr); @@ -425,7 +538,6 @@ static inline void ipoib_put_ah(struct ipoib_ah *ah) { kref_put(&ah->ref, ipoib_free_ah); } - int ipoib_open(struct net_device *dev); int ipoib_add_pkey_attr(struct net_device *dev); int ipoib_add_umcast_attr(struct net_device *dev); @@ -433,6 +545,7 @@ int ipoib_add_umcast_attr(struct net_device *dev); void ipoib_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_ah *address, u32 qpn); void ipoib_reap_ah(struct work_struct *work); +void ipoib_repath_ah(struct work_struct *work); void ipoib_mark_paths_invalid(struct net_device *dev); void ipoib_flush_paths(struct net_device *dev); @@ -454,8 +567,9 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); void ipoib_dev_cleanup(struct net_device *dev); void ipoib_mcast_join_task(struct work_struct *work); +void ipoib_mcast_leave_task(struct work_struct *work); void ipoib_mcast_carrier_on_task(struct work_struct *work); -void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb); +void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb); void ipoib_mcast_restart_task(struct work_struct *work); int ipoib_mcast_start_thread(struct net_device *dev); @@ -490,8 +604,10 @@ void ipoib_transport_dev_cleanup(struct net_device *dev); void ipoib_event(struct ib_event_handler *handler, struct ib_event *record); -int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey); -int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey); +int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey, + unsigned char clone_index); +int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey, + unsigned char clone_index); void ipoib_pkey_poll(struct work_struct *work); int ipoib_pkey_dev_delay_open(struct net_device *dev); @@ -517,10 +633,10 @@ static inline int ipoib_cm_admin_enabled(struct net_device *dev) test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); } -static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n) +static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr) { struct ipoib_dev_priv *priv = netdev_priv(dev); - return IPOIB_CM_SUPPORTED(n->ha) && + return 
IPOIB_CM_SUPPORTED(hwaddr) && test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); } @@ -575,7 +691,7 @@ static inline int ipoib_cm_admin_enabled(struct net_device *dev) { return 0; } -static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n) +static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr) { return 0; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 39913a065f99d..1af105cedef10 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -31,11 +31,11 @@ */ #include +#include #include #include #include #include -#include #include #include "ipoib.h" @@ -45,14 +45,16 @@ int ipoib_max_conn_qp = 128; module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444); MODULE_PARM_DESC(max_nonsrq_conn_qp, "Max number of connected-mode QPs per interface " - "(applied only if shared receive queue is not available)"); + "(applied only if shared receive queue is not available) " + "(default: 128)"); #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA static int data_debug_level; module_param_named(cm_data_debug_level, data_debug_level, int, 0644); MODULE_PARM_DESC(cm_data_debug_level, - "Enable data path debug tracing for connected mode if > 0"); + "Enable data path debug tracing for connected mode if > 0 " + "(default: 0)"); #endif #define IPOIB_CM_IETF_ID 0x1000000000000000ULL @@ -84,7 +86,7 @@ static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags, ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); for (i = 0; i < frags; ++i) - ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE); + ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE); } static int ipoib_cm_post_receive_srq(struct net_device *dev, int id) @@ -183,7 +185,7 @@ partial_error: ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE); for (; i > 0; --i) - ib_dma_unmap_single(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE); + ib_dma_unmap_page(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE); dev_kfree_skb_any(skb); return NULL; @@ -352,13 +354,15 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i int ret; int i; - rx->rx_ring = vzalloc(ipoib_recvq_size * sizeof *rx->rx_ring); + rx->rx_ring = vmalloc(ipoib_recvq_size * sizeof *rx->rx_ring); if (!rx->rx_ring) { printk(KERN_WARNING "%s: failed to allocate CM non-SRQ ring (%d entries)\n", priv->ca->name, ipoib_recvq_size); return -ENOMEM; } + memset(rx->rx_ring, 0, ipoib_recvq_size * sizeof *rx->rx_ring); + t = kmalloc(sizeof *t, GFP_KERNEL); if (!t) { ret = -ENOMEM; @@ -661,6 +665,7 @@ copied: skb_reset_mac_header(skb); skb_pull(skb, IPOIB_ENCAP_LEN); + dev->last_rx = jiffies; ++dev->stats.rx_packets; dev->stats.rx_bytes += skb->len; @@ -707,14 +712,15 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_cm_tx_buf *tx_req; u64 addr; - int rc; if (unlikely(skb->len > tx->mtu)) { - ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", - skb->len, tx->mtu); + ipoib_warn(priv, "%s: packet len %d (> %d) too long to send, dropping\n", + __func__, skb->len, tx->mtu); ++dev->stats.tx_dropped; ++dev->stats.tx_errors; ipoib_cm_skb_too_long(dev, skb, tx->mtu - IPOIB_ENCAP_LEN); + + dev_kfree_skb_any(skb); return; } @@ -739,10 +745,9 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ 
tx_req->mapping = addr; - rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), - addr, skb->len); - if (unlikely(rc)) { - ipoib_warn(priv, "post_send failed, error %d\n", rc); + if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), + addr, skb->len))) { + ipoib_warn(priv, "post_send failed\n"); ++dev->stats.tx_errors; ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE); dev_kfree_skb_any(skb); @@ -790,7 +795,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) netif_tx_lock(dev); ++tx->tx_tail; - if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && + if (unlikely(--priv->tx_outstanding <= ipoib_sendq_size >> 1) && netif_queue_stopped(dev) && test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) netif_wake_queue(dev); @@ -799,19 +804,23 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) wc->status != IB_WC_WR_FLUSH_ERR) { struct ipoib_neigh *neigh; - ipoib_dbg(priv, "failed cm send event " - "(status=%d, wrid=%d vend_err %x)\n", - wc->status, wr_id, wc->vendor_err); + /*IB_WC_RNR_RETRY_EXC_ERR error is part of the life cycle, so don't make waves.*/ + if (IB_WC_RNR_RETRY_EXC_ERR != wc->status) + ipoib_warn(priv, "%s: failed cm send event " + "(status=%d, wrid=%d vend_err %x)\n", + __func__, wc->status, wr_id, wc->vendor_err); + else + ipoib_dbg(priv, "%s: failed cm send event " + "(status=%d, wrid=%d vend_err %x)\n", + __func__, wc->status, wr_id, wc->vendor_err); spin_lock_irqsave(&priv->lock, flags); neigh = tx->neigh; if (neigh) { neigh->cm = NULL; - list_del(&neigh->list); - if (neigh->ah) - ipoib_put_ah(neigh->ah); - ipoib_neigh_free(dev, neigh); + list_del_init(&neigh->list); + ipoib_neigh_free(neigh); tx->neigh = NULL; } @@ -997,9 +1006,10 @@ static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even while ((skb = __skb_dequeue(&skqueue))) { skb->dev = p->dev; - if (dev_queue_xmit(skb)) - ipoib_warn(priv, "dev_queue_xmit failed " - "to requeue packet\n"); + ret = dev_queue_xmit(skb); + if (ret) + ipoib_warn(priv, "%s:dev_queue_xmit failed (ret = %d) " + "to requeue packet\n",__func__, ret); } ret = ib_send_cm_rtu(cm_id, NULL, 0); @@ -1095,12 +1105,13 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, struct ipoib_dev_priv *priv = netdev_priv(p->dev); int ret; - p->tx_ring = vzalloc(ipoib_sendq_size * sizeof *p->tx_ring); + p->tx_ring = vmalloc(ipoib_sendq_size * sizeof *p->tx_ring); if (!p->tx_ring) { ipoib_warn(priv, "failed to allocate tx ring\n"); ret = -ENOMEM; goto err_tx; } + memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring); p->qp = ipoib_cm_create_tx_qp(p->dev, p); if (IS_ERR(p->qp)) { @@ -1149,8 +1160,8 @@ err_tx: static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p) { struct ipoib_dev_priv *priv = netdev_priv(p->dev); - struct ipoib_cm_tx_buf *tx_req; unsigned long begin; + int num_tries = 0; ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n", p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail); @@ -1158,36 +1169,50 @@ static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p) if (p->id) ib_destroy_cm_id(p->id); + /*move the qp to ERROR state*/ + if (p->qp) { + if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE)) + ipoib_warn(priv, "%s: Failed to modify QP to ERROR state\n", + __func__); + } + if (p->tx_ring) { - /* Wait for all sends to complete */ + /* + * Wait for all sends to complete, + * All of them should return here after ERROR state in the qp. 
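ipoib_cm_tx_destroy() now moves the QP to the error state and waits for pending sends to drain, force-draining or re-arming the CQ when NAPI is disabled, and giving up after a fixed number of 5-second rounds instead of unmapping the ring by hand. A simplified userspace model of that bounded-wait-with-retries pattern (illustrative only; the driver sleeps between polls and drains the CQ as shown below):

#include <stdio.h>
#include <time.h>

static int wait_drained(int (*outstanding)(void *), void *ctx,
                        int round_secs, int max_rounds)
{
        time_t begin = time(NULL);
        int rounds = 0;

        while (outstanding(ctx)) {
                if (time(NULL) > begin + round_secs) {
                        if (++rounds == max_rounds)
                                return -1;      /* give up, like num_tries == 5 */
                        begin = time(NULL);     /* start another round */
                }
                /* the driver sleeps briefly here and may force a CQ drain */
        }
        return 0;
}

static int fake_outstanding(void *ctx)
{
        int *n = ctx;
        return (*n)-- > 0;
}

int main(void)
{
        int pending = 3;

        printf("drained: %d\n",
               wait_drained(fake_outstanding, &pending, 5, 5));
        return 0;
}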
+ */ begin = jiffies; - while ((int) p->tx_tail - (int) p->tx_head < 0) { + while (p->tx_tail != p->tx_head) { if (time_after(jiffies, begin + 5 * HZ)) { - ipoib_warn(priv, "timing out; %d sends not completed\n", + ipoib_warn(priv, "timing out; %d sends not completed still waiting..\n", p->tx_head - p->tx_tail); - goto timeout; + /* + * check if we are in napi_disable state (in port/module down etc.), + * if so we need to force drain over the qp in order to get all the comp + * otherwise ib_req_notify_cq to get the poll_tx at the next time. + */ + if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) { + ipoib_warn(priv, "%s: start drain CQ \n", __func__); + ipoib_drain_cq(p->dev); + + ipoib_warn(priv, "%s: re-arm CQ\n", __func__); + if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP)) + ipoib_warn(priv, "request notify on RECIVE CQ failed\n"); + } + begin = jiffies; + num_tries++; + if (num_tries == 5) { + ipoib_warn(priv, "%s: %d not completed Going out.\n", + __func__, p->tx_head - p->tx_tail); + goto out; + } } - - msleep(1); + /*let the wc to arrived.*/ + msleep(2); } } - -timeout: - - while ((int) p->tx_tail - (int) p->tx_head < 0) { - tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)]; - ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, - DMA_TO_DEVICE); - dev_kfree_skb_any(tx_req->skb); - ++p->tx_tail; - netif_tx_lock_bh(p->dev); - if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && - netif_queue_stopped(p->dev) && - test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) - netif_wake_queue(p->dev); - netif_tx_unlock_bh(p->dev); - } - +out: + /* assume all the wc are reached.*/ if (p->qp) ib_destroy_qp(p->qp); @@ -1227,10 +1252,8 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, if (neigh) { neigh->cm = NULL; - list_del(&neigh->list); - if (neigh->ah) - ipoib_put_ah(neigh->ah); - ipoib_neigh_free(dev, neigh); + list_del_init(&neigh->list); + ipoib_neigh_free(neigh); tx->neigh = NULL; } @@ -1273,12 +1296,15 @@ struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) { struct ipoib_dev_priv *priv = netdev_priv(tx->dev); + unsigned long flags; if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { + spin_lock_irqsave(&priv->lock, flags); list_move(&tx->list, &priv->cm.reap_list); queue_work(ipoib_workqueue, &priv->cm.reap_task); ipoib_dbg(priv, "Reap connection for gid %pI6\n", - tx->neigh->dgid.raw); + tx->neigh->daddr + 4); tx->neigh = NULL; + spin_unlock_irqrestore(&priv->lock, flags); } } @@ -1302,7 +1328,7 @@ static void ipoib_cm_tx_start(struct work_struct *work) p = list_entry(priv->cm.start_list.next, typeof(*p), list); list_del_init(&p->list); neigh = p->neigh; - qpn = IPOIB_QPN(neigh->neighbour->ha); + qpn = IPOIB_QPN(neigh->daddr); memcpy(&pathrec, &p->path->pathrec, sizeof pathrec); spin_unlock_irqrestore(&priv->lock, flags); @@ -1317,12 +1343,10 @@ static void ipoib_cm_tx_start(struct work_struct *work) neigh = p->neigh; if (neigh) { neigh->cm = NULL; - list_del(&neigh->list); - if (neigh->ah) - ipoib_put_ah(neigh->ah); - ipoib_neigh_free(dev, neigh); + list_del_init(&neigh->list); + ipoib_neigh_free(neigh); } - list_del(&p->list); + list_del_init(&p->list); kfree(p); } } @@ -1344,7 +1368,7 @@ static void ipoib_cm_tx_reap(struct work_struct *work) while (!list_empty(&priv->cm.reap_list)) { p = list_entry(priv->cm.reap_list.next, typeof(*p), list); - list_del(&p->list); + list_del_init(&p->list); spin_unlock_irqrestore(&priv->lock, flags); 
netif_tx_unlock_bh(dev); ipoib_cm_tx_destroy(p); @@ -1388,18 +1412,50 @@ static void ipoib_cm_skb_reap(struct work_struct *work) netif_tx_unlock_bh(dev); } +static void ipoib_cm_update_pmtu_task(struct work_struct *work) +{ + struct ipoib_pmtu_update *pmtu_update = + container_of(work, struct ipoib_pmtu_update, work); + struct sk_buff *skb = pmtu_update->skb; + + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), pmtu_update->mtu); + + consume_skb(skb); + + kfree(pmtu_update); +} + void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, unsigned int mtu) { struct ipoib_dev_priv *priv = netdev_priv(dev); - int e = skb_queue_empty(&priv->cm.skb_queue); - - if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); - +/* int e = skb_queue_empty(&priv->cm.skb_queue);*/ + struct ipoib_pmtu_update *pmtu_update; + + if (skb_dst(skb)) { + /* take the pmtu_update ouf ot spin-lock context */ + pmtu_update = kzalloc(sizeof *pmtu_update, GFP_ATOMIC); + if (pmtu_update) { + pmtu_update->skb = skb; + pmtu_update->mtu = mtu; + /* in order to keep the skb available */ + skb_get(skb); + + INIT_WORK(&pmtu_update->work, ipoib_cm_update_pmtu_task); + /* + * in order to have it serial, push that task to + * the same queue which the function will push + * the priv->cm.skb_task work. + */ + queue_work(ipoib_workqueue, &pmtu_update->work); + } else + ipoib_warn(priv, "Failed alloc pmtu_update and update_pmtu(skb->dst, mtu)\n"); + } +/* TODO: check how to do that without the panic: skb_queue_tail(&priv->cm.skb_queue, skb); if (e) queue_work(ipoib_workqueue, &priv->cm.skb_task); +*/ } static void ipoib_cm_rx_reap(struct work_struct *work) @@ -1455,32 +1511,41 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr, struct net_device *dev = to_net_dev(d); struct ipoib_dev_priv *priv = netdev_priv(dev); - if (!rtnl_trylock()) - return restart_syscall(); - /* flush paths if we switch modes so that connections are restarted */ if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) { set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); ipoib_warn(priv, "enabling connected mode " "will cause multicast packet drops\n"); - netdev_update_features(dev); - rtnl_unlock(); + + rtnl_lock(); + dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_SG | NETIF_F_TSO); priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; + if (ipoib_cm_max_mtu(dev) > priv->mcast_mtu) + ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", + priv->mcast_mtu); + dev_set_mtu(dev, ipoib_cm_max_mtu(dev)); + rtnl_unlock(); + ipoib_flush_paths(dev); return count; } if (!strcmp(buf, "datagram\n")) { clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); - netdev_update_features(dev); + + rtnl_lock(); + if (test_bit(IPOIB_FLAG_CSUM, &priv->flags)) { + dev->features |= NETIF_F_IP_CSUM | NETIF_F_SG; + if (priv->hca_caps & IB_DEVICE_UD_TSO) + dev->features |= NETIF_F_TSO; + } dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); rtnl_unlock(); ipoib_flush_paths(dev); return count; } - rtnl_unlock(); return -EINVAL; } @@ -1511,7 +1576,7 @@ static void ipoib_cm_create_srq(struct net_device *dev, int max_sge) return; } - priv->cm.srq_ring = vzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring); + priv->cm.srq_ring = vmalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring); if (!priv->cm.srq_ring) { printk(KERN_WARNING "%s: failed to allocate CM SRQ ring (%d entries)\n", priv->ca->name, ipoib_recvq_size); @@ -1520,6 +1585,7 @@ static void ipoib_cm_create_srq(struct net_device *dev, int max_sge) return; } + memset(priv->cm.srq_ring, 0, 
ipoib_recvq_size * sizeof *priv->cm.srq_ring); } int ipoib_cm_dev_init(struct net_device *dev) @@ -1554,6 +1620,7 @@ int ipoib_cm_dev_init(struct net_device *dev) attr.max_srq_sge = min_t(int, IPOIB_CM_RX_SG, attr.max_srq_sge); ipoib_cm_create_srq(dev, attr.max_srq_sge); if (ipoib_cm_has_srq(dev)) { + priv->cm.max_cm_mtu = attr.max_srq_sge * PAGE_SIZE - 0x10; priv->cm.num_frags = attr.max_srq_sge; ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n", diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index 29bc7b5724ace..7b587a5e42805 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -42,48 +42,210 @@ static void ipoib_get_drvinfo(struct net_device *netdev, strncpy(drvinfo->driver, "ipoib", sizeof(drvinfo->driver) - 1); } +static u32 ipoib_get_rx_csum(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + return test_bit(IPOIB_FLAG_CSUM, &priv->flags) && + !test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); +} + +static int ipoib_set_tso(struct net_device *dev, u32 data) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + if (data) { + if (!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) && + (dev->features & NETIF_F_SG) && + (priv->hca_caps & IB_DEVICE_UD_TSO)) { + dev->features |= NETIF_F_TSO; + } else { + ipoib_warn(priv, "can't set TSO on\n"); + return -EOPNOTSUPP; + } + } else + dev->features &= ~NETIF_F_TSO; + + return 0; +} + static int ipoib_get_coalesce(struct net_device *dev, struct ethtool_coalesce *coal) { struct ipoib_dev_priv *priv = netdev_priv(dev); - coal->rx_coalesce_usecs = priv->ethtool.coalesce_usecs; + coal->rx_coalesce_usecs = priv->ethtool.rx_coalesce_usecs; + coal->rx_max_coalesced_frames = priv->ethtool.rx_max_coalesced_frames; + coal->pkt_rate_low = priv->ethtool.pkt_rate_low; + coal->rx_coalesce_usecs_low = priv->ethtool.rx_coalesce_usecs_low; + coal->rx_coalesce_usecs_high = priv->ethtool.rx_coalesce_usecs_high; + coal->pkt_rate_high = priv->ethtool.pkt_rate_high; + coal->rate_sample_interval = priv->ethtool.rate_sample_interval; + coal->use_adaptive_rx_coalesce = priv->ethtool.use_adaptive_rx_coalesce; +/* coal->rx_coalesce_usecs = priv->ethtool.coalesce_usecs; coal->rx_max_coalesced_frames = priv->ethtool.max_coalesced_frames; - +*/ return 0; } +enum ipoib_auto_moder_operation { + NONE, + MOVING_TO_ON, + MOVING_TO_OFF +}; + static int ipoib_set_coalesce(struct net_device *dev, - struct ethtool_coalesce *coal) + struct ethtool_coalesce *coal) { - struct ipoib_dev_priv *priv = netdev_priv(dev); - int ret; - - /* - * These values are saved in the private data and returned - * when ipoib_get_coalesce() is called - */ - if (coal->rx_coalesce_usecs > 0xffff || - coal->rx_max_coalesced_frames > 0xffff) - return -EINVAL; - - ret = ib_modify_cq(priv->recv_cq, coal->rx_max_coalesced_frames, - coal->rx_coalesce_usecs); - if (ret && ret != -ENOSYS) { - ipoib_warn(priv, "failed modifying CQ (%d)\n", ret); - return ret; + struct ipoib_dev_priv *priv = netdev_priv(dev); + int ret; + enum ipoib_auto_moder_operation moder_operation = NONE; + + /* + * These values are saved in the private data and returned + * when ipoib_get_coalesce() is called + */ + if (coal->rx_coalesce_usecs > 0xffff || + coal->rx_max_coalesced_frames > 0xffff) + return -EINVAL; + priv->ethtool.rx_max_coalesced_frames = + (coal->rx_max_coalesced_frames == + IPOIB_AUTO_CONF) ? 
+ IPOIB_RX_COAL_TARGET : + coal->rx_max_coalesced_frames; + priv->ethtool.rx_coalesce_usecs = (coal->rx_coalesce_usecs == + IPOIB_AUTO_CONF) ? + IPOIB_RX_COAL_TIME : + coal->rx_coalesce_usecs; + + ret = ib_modify_cq(priv->recv_cq, coal->rx_max_coalesced_frames, + coal->rx_coalesce_usecs); + if (ret && ret != -ENOSYS) { + ipoib_warn(priv, "failed modifying CQ (%d)\n", ret); + return ret; + } + + priv->ethtool.pkt_rate_low = coal->pkt_rate_low; + priv->ethtool.rx_coalesce_usecs_low = coal->rx_coalesce_usecs_low; + priv->ethtool.rx_coalesce_usecs_high = coal->rx_coalesce_usecs_high; + priv->ethtool.pkt_rate_high = coal->pkt_rate_high; + priv->ethtool.rate_sample_interval = coal->rate_sample_interval; + + if (priv->ethtool.use_adaptive_rx_coalesce && + !coal->use_adaptive_rx_coalesce) { + /* switch from adaptive-mode to non-adaptive mode: + cancell the adaptive moderation task. */ + clear_bit(IPOIB_FLAG_AUTO_MODER, &priv->flags); + cancel_delayed_work(&priv->adaptive_moder_task); + moder_operation = MOVING_TO_OFF; + } else if ((!priv->ethtool.use_adaptive_rx_coalesce && + coal->use_adaptive_rx_coalesce)) { + /* switch from non-adaptive-mode to adaptive mode, + starts it now */ + set_bit(IPOIB_FLAG_AUTO_MODER, &priv->flags); + moder_operation = MOVING_TO_ON; + priv->ethtool.use_adaptive_rx_coalesce = 1; + queue_delayed_work(ipoib_auto_moder_workqueue, + &priv->adaptive_moder_task, 0); + } + + if (MOVING_TO_OFF == moder_operation) + flush_workqueue(ipoib_auto_moder_workqueue); + else if (MOVING_TO_ON == moder_operation) { + /* move to initial values */ + ret = ib_modify_cq(priv->recv_cq, + priv->ethtool.rx_max_coalesced_frames, + priv->ethtool.rx_coalesce_usecs); + if (ret && ret != -ENOSYS) { + ipoib_warn(priv, "failed modifying CQ (%d)" + "(when moving to auto-moderation)\n", + ret); + return ret; + } + } + priv->ethtool.use_adaptive_rx_coalesce = coal->use_adaptive_rx_coalesce; + + return 0; +} + +static const char ipoib_stats_keys[][ETH_GSTRING_LEN] = { + "LRO aggregated", "LRO flushed", + "LRO avg aggr", "LRO no desc" +}; + +static void ipoib_get_strings(struct net_device *netdev, u32 stringset, u8 *data) +{ + switch (stringset) { + case ETH_SS_STATS: + memcpy(data, *ipoib_stats_keys, sizeof(ipoib_stats_keys)); + break; } +} + +static int ipoib_get_sset_count(struct net_device *dev, int sset) +{ + switch (sset) { + case ETH_SS_STATS: + return ARRAY_SIZE(ipoib_stats_keys); + default: + return -EOPNOTSUPP; + } +} + +static void ipoib_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, uint64_t *data) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int index = 0; + + /* Get LRO statistics */ + data[index++] = priv->lro.lro_mgr.stats.aggregated; + data[index++] = priv->lro.lro_mgr.stats.flushed; + if (priv->lro.lro_mgr.stats.flushed) + data[index++] = priv->lro.lro_mgr.stats.aggregated / + priv->lro.lro_mgr.stats.flushed; + else + data[index++] = 0; + data[index++] = priv->lro.lro_mgr.stats.no_desc; +} - priv->ethtool.coalesce_usecs = coal->rx_coalesce_usecs; - priv->ethtool.max_coalesced_frames = coal->rx_max_coalesced_frames; +static void ipoib_get_ringparam(struct net_device *dev, + struct ethtool_ringparam *param) +{ + + memset(param, 0, sizeof(*param)); + param->rx_max_pending = IPOIB_MAX_QUEUE_SIZE; + param->tx_max_pending = IPOIB_MAX_QUEUE_SIZE; + param->rx_pending = ipoib_recvq_size; + param->tx_pending = ipoib_sendq_size; +} +int ipoib_set_flags(struct net_device *dev, u32 data, u32 supported) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + 
ethtool_op_set_flags(dev, data, supported); + /*no support in LRO with 4k mtu.*/ + if (ipoib_ud_need_sg(priv->max_ib_mtu) && (data & NETIF_F_LRO)) { + + priv->dev->features &= ~NETIF_F_LRO; + return -EOPNOTSUPP; + } return 0; } static const struct ethtool_ops ipoib_ethtool_ops = { .get_drvinfo = ipoib_get_drvinfo, + .get_rx_csum = ipoib_get_rx_csum, + .get_tso = ethtool_op_get_tso, + .set_tso = ipoib_set_tso, .get_coalesce = ipoib_get_coalesce, .set_coalesce = ipoib_set_coalesce, + .get_flags = ethtool_op_get_flags, + .set_flags = ipoib_set_flags, + .get_strings = ipoib_get_strings, + .get_sset_count = ipoib_get_sset_count, + .get_ethtool_stats = ipoib_get_ethtool_stats, + .get_ringparam = ipoib_get_ringparam, }; void ipoib_set_ethtool_ops(struct net_device *dev) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c index 86eae229dc49e..83ae898e48aaa 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_fs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c @@ -32,7 +32,6 @@ #include #include -#include struct file_operations; @@ -212,16 +211,28 @@ static int ipoib_path_seq_show(struct seq_file *file, void *iter_ptr) gid_buf, path.pathrec.dlid ? "yes" : "no"); if (path.pathrec.dlid) { - rate = ib_rate_to_mult(path.pathrec.rate) * 25; - - seq_printf(file, - " DLID: 0x%04x\n" - " SL: %12d\n" - " rate: %*d%s Gb/sec\n", - be16_to_cpu(path.pathrec.dlid), - path.pathrec.sl, - 10 - ((rate % 10) ? 2 : 0), - rate / 10, rate % 10 ? ".5" : ""); + if (path.pathrec.rate > IB_RATE_120_GBPS) { + rate = ib_ext_rate_to_int(path.pathrec.rate); + + seq_printf(file, + " DLID: 0x%04x\n" + " SL: %12d\n" + " rate: %3d Gb/sec\n", + be16_to_cpu(path.pathrec.dlid), + path.pathrec.sl, + rate); + } else { + rate = ib_rate_to_mult(path.pathrec.rate) * 25; + + seq_printf(file, + " DLID: 0x%04x\n" + " SL: %12d\n" + " rate: %*d%s Gb/sec\n", + be16_to_cpu(path.pathrec.dlid), + path.pathrec.sl, + 10 - ((rate % 10) ? 2 : 0), + rate / 10, rate % 10 ? 
".5" : ""); + } } seq_putc(file, '\n'); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 81ae61d68a222..3f2ecf8a29e46 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -35,10 +35,11 @@ #include #include -#include +#include #include #include +#include /* For ARPHRD_xxx */ #include "ipoib.h" @@ -217,6 +218,41 @@ static int ipoib_ib_post_receives(struct net_device *dev) return 0; } +static inline void ipoib_create_repath_ent(struct net_device *dev, + struct sk_buff *skb, + u16 lid) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_arp_repath *arp_repath; + struct arphdr *parphdr; + + parphdr = (struct arphdr *)(skb->data); + if (((ARPOP_REPLY != be16_to_cpu(parphdr->ar_op)) && + (ARPOP_REQUEST != be16_to_cpu(parphdr->ar_op))) || + (parphdr->ar_hln != INFINIBAND_ALEN) || + (skb->len < (sizeof(struct arphdr) + INFINIBAND_ALEN))) { + return; + } + + arp_repath = kzalloc(sizeof *arp_repath, GFP_ATOMIC); + if (!arp_repath) { + ipoib_warn(priv, "Failed alloc ipoib_arp_repath.\n"); + return; + } + + INIT_WORK(&arp_repath->work, ipoib_repath_ah); + + arp_repath->lid = lid; + memcpy(&arp_repath->sgid, skb->data + sizeof(struct arphdr) + 4, + sizeof(union ib_gid)); + arp_repath->dev = dev; + + if (!test_bit(IPOIB_STOP_REAPER, &priv->flags)) + queue_work(ipoib_workqueue, &arp_repath->work); + else + kfree(arp_repath); +} + static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -288,14 +324,20 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) skb_reset_mac_header(skb); skb_pull(skb, IPOIB_ENCAP_LEN); + dev->last_rx = jiffies; ++dev->stats.rx_packets; dev->stats.rx_bytes += skb->len; + if (unlikely(be16_to_cpu(skb->protocol) == ETH_P_ARP)) + ipoib_create_repath_ent(dev, skb, wc->slid); skb->dev = dev; - if ((dev->features & NETIF_F_RXCSUM) && likely(wc->csum_ok)) + if (test_bit(IPOIB_FLAG_CSUM, &priv->flags) && likely(wc->csum_ok)) skb->ip_summed = CHECKSUM_UNNECESSARY; - napi_gro_receive(&priv->napi, skb); + if (dev->features & NETIF_F_LRO) + lro_receive_skb(&priv->lro.lro_mgr, skb, NULL); + else + netif_receive_skb(skb); repost: if (unlikely(ipoib_ib_post_receive(dev, wr_id))) @@ -389,7 +431,7 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) dev_kfree_skb_any(tx_req->skb); ++priv->tx_tail; - if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && + if (unlikely(--priv->tx_outstanding <= ipoib_sendq_size >> 1) && netif_queue_stopped(dev) && test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) netif_wake_queue(dev); @@ -422,6 +464,7 @@ int ipoib_poll(struct napi_struct *napi, int budget) done = 0; + spin_lock(&priv->rx_ring_lock); poll_more: while (done < budget) { int max = (budget - done); @@ -447,6 +490,9 @@ poll_more: } if (done < budget) { + if (dev->features & NETIF_F_LRO) + lro_flush_all(&priv->lro.lro_mgr); + napi_complete(napi); if (unlikely(ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP | @@ -455,9 +501,54 @@ poll_more: goto poll_more; } + spin_unlock(&priv->rx_ring_lock); return done; } +void ipoib_get_tcp_ring(struct net_device *dev, u8 *poll_ring, u32 saddr, u32 daddr, u16 sport, u16 dport) +{ + *poll_ring = 0; +} + +void ipoib_get_udp_rings(struct net_device *dev, u8 *poll_rings, u8 *num_rings) +{ + *poll_rings = 0; + *num_rings = 1; +} + +void ipoib_accl_poll(struct net_device *dev, int ring_num) +{ + int budget = 64; + struct 
ipoib_dev_priv *priv = netdev_priv(dev); + int n, i, num_recv = 0; + struct ib_wc *wc; + + if (!spin_trylock_bh(&priv->rx_ring_lock)) + return; + while (num_recv < budget) { + n = ib_poll_cq(priv->recv_cq, budget, priv->ibwc); + for (i = 0; i < n; i++) { + wc = priv->ibwc + i; + + if (wc->wr_id & IPOIB_OP_RECV) { + num_recv++; + if (wc->wr_id & IPOIB_OP_CM) + ipoib_cm_handle_rx_wc(dev, wc); + else + ipoib_ib_handle_rx_wc(dev, wc); + } else + ipoib_cm_handle_tx_wc(priv->dev, wc); + } + if (n < budget) + break; + } + /* We always want to flush all of the accumulated skb's */ + if (dev->features & NETIF_F_LRO) + lro_flush_all(&priv->lro.lro_mgr); + + spin_unlock_bh(&priv->rx_ring_lock); +} + void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr) { struct net_device *dev = dev_ptr; @@ -532,7 +623,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_tx_buf *tx_req; - int hlen, rc; + int hlen; void *phead; if (skb_is_gso(skb)) { @@ -547,11 +638,13 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, } } else { if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) { - ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", - skb->len, priv->mcast_mtu + IPOIB_ENCAP_LEN); + ipoib_warn(priv, "%s: packet len %d (> %d) too long to send, dropping\n", + __func__, skb->len, priv->mcast_mtu + IPOIB_ENCAP_LEN); ++dev->stats.tx_dropped; ++dev->stats.tx_errors; ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu); + + dev_kfree_skb_any(skb); return; } phead = NULL; @@ -588,10 +681,9 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, netif_stop_queue(dev); } - rc = post_send(priv, priv->tx_head & (ipoib_sendq_size - 1), - address->ah, qpn, tx_req, phead, hlen); - if (unlikely(rc)) { - ipoib_warn(priv, "post_send failed, error %d\n", rc); + if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1), + address->ah, qpn, tx_req, phead, hlen))) { + ipoib_warn(priv, "post_send failed\n"); ++dev->stats.tx_errors; --priv->tx_outstanding; ipoib_dma_unmap_tx(priv->ca, tx_req); @@ -646,6 +738,25 @@ void ipoib_reap_ah(struct work_struct *work) round_jiffies_relative(HZ)); } +static void ipoib_ah_dev_cleanup(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + unsigned long begin; + + begin = jiffies; + + while (!list_empty(&priv->dead_ahs)) { + __ipoib_reap_ah(dev); + + if (time_after(jiffies, begin + HZ)) { + ipoib_warn(priv, "timing out; will leak address handles\n"); + break; + } + + msleep(1); + } +} + static void ipoib_ib_tx_timer_func(unsigned long ctx) { drain_tx_cq((struct net_device *)ctx); @@ -717,6 +828,8 @@ int ipoib_ib_dev_up(struct net_device *dev) set_bit(IPOIB_FLAG_OPER_UP, &priv->flags); + set_bit(IPOIB_FLAG_AUTO_MODER, &priv->flags); + return ipoib_mcast_start_thread(dev); } @@ -739,6 +852,12 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush) flush_workqueue(ipoib_workqueue); } + /* cancell the adaptive moderation task. 
*/ + if (test_and_clear_bit(IPOIB_FLAG_AUTO_MODER, &priv->flags)) + cancel_delayed_work(&priv->adaptive_moder_task); + + flush_workqueue(ipoib_auto_moder_workqueue); + ipoib_mcast_stop_thread(dev, flush); ipoib_mcast_dev_flush(dev); @@ -803,6 +922,8 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_qp_attr qp_attr; + struct ib_qp_init_attr query_init_attr; + int ret; unsigned long begin; struct ipoib_tx_buf *tx_req; int i; @@ -816,9 +937,17 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush) * Move our QP to the error state and then reinitialize in * when all work requests have completed or have been flushed. */ - qp_attr.qp_state = IB_QPS_ERR; - if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) - ipoib_warn(priv, "Failed to modify QP to ERROR state\n"); + ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr); + + /* Cannot move to Error state if we still in RESET state.*/ + if (!ret && qp_attr.qp_state != IB_QPS_RESET) { + qp_attr.qp_state = IB_QPS_ERR; + if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) + ipoib_warn(priv, "Failed to modify QP to ERROR state\n"); + } else + ipoib_dbg(priv, "ib_query_qp returned: %d," + "qp state is %d, no need to move to ERROR.\n", + ret, qp_attr.qp_state); /* Wait for all sends and receives to complete */ begin = jiffies; @@ -875,18 +1004,7 @@ timeout: if (flush) flush_workqueue(ipoib_workqueue); - begin = jiffies; - - while (!list_empty(&priv->dead_ahs)) { - __ipoib_reap_ah(dev); - - if (time_after(jiffies, begin + HZ)) { - ipoib_warn(priv, "timing out; will leak address handles\n"); - break; - } - - msleep(1); - } + ipoib_ah_dev_cleanup(dev); ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP); @@ -1022,6 +1140,7 @@ void ipoib_ib_dev_cleanup(struct net_device *dev) ipoib_mcast_stop_thread(dev, 1); ipoib_mcast_dev_flush(dev); + ipoib_ah_dev_cleanup(dev); ipoib_transport_dev_cleanup(dev); } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 8abdfae3ac48c..2059bcee6c498 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -46,7 +46,8 @@ #include #include -#include +#include +#include MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); @@ -56,17 +57,33 @@ int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE; int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE; module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); -MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); +MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue" + "(default: 512)"); module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); -MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); +MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue " + "(default: 512)"); + +static int lro = 1; +module_param(lro, bool, 0444); +MODULE_PARM_DESC(lro, "Enable LRO (Large Receive Offload) (default: 0)"); + +static int lro_max_aggr = IPOIB_LRO_MAX_AGGR; +module_param(lro_max_aggr, int, 0644); +MODULE_PARM_DESC(lro_max_aggr, "LRO: Max packets to be aggregated " + "(default = 64)"); #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG int ipoib_debug_level; module_param_named(debug_level, ipoib_debug_level, int, 0644); -MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); +MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default: 0)"); #endif +int 
ipoib_mc_sendonly_timeout; + +module_param_named(mc_sendonly_timeout, ipoib_mc_sendonly_timeout, int, 0644); +MODULE_PARM_DESC(mc_sendonly_timeout, "Multicast sendonly GC timeout (default: 0)"); + struct ipoib_path_iter { struct net_device *dev; struct ipoib_path path; @@ -80,10 +97,13 @@ static const u8 ipv4_bcast_addr[] = { struct workqueue_struct *ipoib_workqueue; +struct workqueue_struct *ipoib_auto_moder_workqueue; + struct ib_sa_client ipoib_sa_client; static void ipoib_add_one(struct ib_device *device); static void ipoib_remove_one(struct ib_device *device); +static void ipoib_neigh_reclaim(struct rcu_head *rp); static struct ib_client ipoib_client = { .name = "ipoib", @@ -127,6 +147,13 @@ int ipoib_open(struct net_device *dev) netif_start_queue(dev); + if (priv->ethtool.use_adaptive_rx_coalesce) { + set_bit(IPOIB_FLAG_AUTO_MODER, &priv->flags); + queue_delayed_work(ipoib_auto_moder_workqueue, + &priv->adaptive_moder_task, + ADAPT_MODERATION_DELAY); + } + return 0; err_stop: @@ -143,12 +170,13 @@ static int ipoib_stop(struct net_device *dev) struct ipoib_dev_priv *priv = netdev_priv(dev); ipoib_dbg(priv, "stopping interface\n"); - + mutex_lock(&priv->state_lock); clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); + mutex_unlock(&priv->state_lock); netif_stop_queue(dev); - ipoib_ib_dev_down(dev, 1); + ipoib_ib_dev_down(dev, 0); ipoib_ib_dev_stop(dev, 0); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { @@ -171,16 +199,6 @@ static int ipoib_stop(struct net_device *dev) return 0; } -static u32 ipoib_fix_features(struct net_device *dev, u32 features) -{ - struct ipoib_dev_priv *priv = netdev_priv(dev); - - if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) - features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO); - - return features; -} - static int ipoib_change_mtu(struct net_device *dev, int new_mtu) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -205,6 +223,8 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu) dev->mtu = min(priv->mcast_mtu, priv->admin_mtu); + queue_work(ipoib_workqueue, &priv->flush_light); + return 0; } @@ -264,30 +284,15 @@ static int __path_add(struct net_device *dev, struct ipoib_path *path) static void path_free(struct net_device *dev, struct ipoib_path *path) { - struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ipoib_neigh *neigh, *tn; struct sk_buff *skb; - unsigned long flags; while ((skb = __skb_dequeue(&path->queue))) dev_kfree_skb_irq(skb); - spin_lock_irqsave(&priv->lock, flags); - - list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { - /* - * It's safe to call ipoib_put_ah() inside priv->lock - * here, because we know that path->ah will always - * hold one more reference, so ipoib_put_ah() will - * never do more than decrement the ref count. 
- */ - if (neigh->ah) - ipoib_put_ah(neigh->ah); - - ipoib_neigh_free(dev, neigh); - } + ipoib_dbg(netdev_priv(dev), "path_free\n"); - spin_unlock_irqrestore(&priv->lock, flags); + /* remove all neigh connected to this path */ + ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); if (path->ah) ipoib_put_ah(path->ah); @@ -391,6 +396,7 @@ void ipoib_flush_paths(struct net_device *dev) spin_unlock_irqrestore(&priv->lock, flags); netif_tx_unlock_bh(dev); wait_for_completion(&path->done); + list_del(&path->list); path_free(dev, path); netif_tx_lock_bh(dev); spin_lock_irqsave(&priv->lock, flags); @@ -413,6 +419,7 @@ static void path_rec_completion(int status, struct sk_buff_head skqueue; struct sk_buff *skb; unsigned long flags; + int ret; if (!status) ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n", @@ -426,7 +433,7 @@ static void path_rec_completion(int status, if (!status) { struct ib_ah_attr av; - if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av)) + if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av, 0)) ah = ipoib_create_ah(dev, priv->pd, &av); } @@ -458,19 +465,15 @@ static void path_rec_completion(int status, } kref_get(&path->ah->ref); neigh->ah = path->ah; - memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, - sizeof(union ib_gid)); - if (ipoib_cm_enabled(dev, neigh->neighbour)) { + if (ipoib_cm_enabled(dev, neigh->daddr)) { if (!ipoib_cm_get(neigh)) ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); if (!ipoib_cm_get(neigh)) { - list_del(&neigh->list); - if (neigh->ah) - ipoib_put_ah(neigh->ah); - ipoib_neigh_free(dev, neigh); + list_del_init(&neigh->list); + ipoib_neigh_free(neigh); continue; } } @@ -491,9 +494,10 @@ static void path_rec_completion(int status, while ((skb = __skb_dequeue(&skqueue))) { skb->dev = dev; - if (dev_queue_xmit(skb)) - ipoib_warn(priv, "dev_queue_xmit failed " - "to requeue packet\n"); + ret = dev_queue_xmit(skb); + if (ret) + ipoib_warn(priv, "%s: dev_queue_xmit failed to requeue" + " packet (ret:%d)\n", __func__, ret); } } @@ -528,15 +532,41 @@ static int path_rec_start(struct net_device *dev, struct ipoib_path *path) { struct ipoib_dev_priv *priv = netdev_priv(dev); + ib_sa_comp_mask comp_mask = IB_SA_PATH_REC_MTU_SELECTOR | IB_SA_PATH_REC_MTU; + struct ib_sa_path_rec p_rec; + + p_rec = path->pathrec; + p_rec.mtu_selector = IB_SA_GT; + + switch (roundup_pow_of_two(dev->mtu + IPOIB_ENCAP_LEN)) { + case 512: + p_rec.mtu = IB_MTU_256; + break; + case 1024: + p_rec.mtu = IB_MTU_512; + break; + case 2048: + p_rec.mtu = IB_MTU_1024; + break; + case 4096: + p_rec.mtu = IB_MTU_2048; + break; + default: + /* Wildcard everything */ + comp_mask = 0; + p_rec.mtu = 0; + p_rec.mtu_selector = 0; + } - ipoib_dbg(priv, "Start path record lookup for %pI6\n", - path->pathrec.dgid.raw); + ipoib_dbg(priv, "Start path record lookup for %pI6 MTU > %d\n", + p_rec.dgid.raw, + comp_mask ? 
ib_mtu_enum_to_int(p_rec.mtu) : 0); init_completion(&path->done); path->query_id = ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port, - &path->pathrec, + &p_rec, comp_mask | IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH | @@ -555,28 +585,27 @@ static int path_rec_start(struct net_device *dev, return 0; } -/* called with rcu_read_lock */ -static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) +static void neigh_add_path(struct sk_buff *skb, u8 *daddr, + struct net_device *dev) + { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_path *path; struct ipoib_neigh *neigh; - struct neighbour *n; unsigned long flags; - n = dst_get_neighbour(skb_dst(skb)); - neigh = ipoib_neigh_alloc(n, skb->dev); + spin_lock_irqsave(&priv->lock, flags); + neigh = ipoib_neigh_alloc(daddr, dev); if (!neigh) { + spin_unlock_irqrestore(&priv->lock, flags); ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); return; } - spin_lock_irqsave(&priv->lock, flags); - - path = __path_find(dev, n->ha + 4); + path = __path_find(dev, daddr + 4); if (!path) { - path = path_rec_create(dev, n->ha + 4); + path = path_rec_create(dev, daddr + 4); if (!path) goto err_path; @@ -588,17 +617,13 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) if (path->ah) { kref_get(&path->ah->ref); neigh->ah = path->ah; - memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, - sizeof(union ib_gid)); - if (ipoib_cm_enabled(dev, neigh->neighbour)) { + if (ipoib_cm_enabled(dev, neigh->daddr)) { if (!ipoib_cm_get(neigh)) ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); if (!ipoib_cm_get(neigh)) { - list_del(&neigh->list); - if (neigh->ah) - ipoib_put_ah(neigh->ah); - ipoib_neigh_free(dev, neigh); + list_del_init(&neigh->list); + ipoib_neigh_free(neigh); goto err_drop; } if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) @@ -608,11 +633,8 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) skb_queue_len(&neigh->queue)); goto err_drop; } - } else { - spin_unlock_irqrestore(&priv->lock, flags); - ipoib_send(dev, skb, path->ah, IPOIB_QPN(n->ha)); - return; - } + } else + ipoib_send(dev, skb, path->ah, IPOIB_QPN(daddr)); } else { neigh->ah = NULL; @@ -623,38 +645,20 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) } spin_unlock_irqrestore(&priv->lock, flags); + ipoib_neigh_put(neigh); return; err_list: - list_del(&neigh->list); + list_del_init(&neigh->list); err_path: - ipoib_neigh_free(dev, neigh); + ipoib_neigh_free(neigh); err_drop: ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); spin_unlock_irqrestore(&priv->lock, flags); -} - -/* called with rcu_read_lock */ -static void ipoib_path_lookup(struct sk_buff *skb, struct net_device *dev) -{ - struct ipoib_dev_priv *priv = netdev_priv(skb->dev); - struct dst_entry *dst = skb_dst(skb); - struct neighbour *n; - - /* Look up path record for unicasts */ - n = dst_get_neighbour(dst); - if (n->ha[4] != 0xff) { - neigh_add_path(skb, dev); - return; - } - - /* Add in the P_Key for multicasts */ - n->ha[8] = (priv->pkey >> 8) & 0xff; - n->ha[9] = priv->pkey & 0xff; - ipoib_mcast_send(dev, n->ha + 4, skb); + ipoib_neigh_put(neigh); } static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, @@ -697,9 +701,7 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, ipoib_dbg(priv, "Send unicast ARP to %04x\n", be16_to_cpu(path->pathrec.dlid)); - spin_unlock_irqrestore(&priv->lock, flags); ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr)); - 
return; } else if ((path->query || !path_rec_start(dev, path)) && skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { __skb_queue_tail(&path->queue, skb); @@ -713,92 +715,81 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) { - struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ipoib_neigh *neigh; - struct neighbour *n = NULL; - unsigned long flags; - - rcu_read_lock(); - if (likely(skb_dst(skb))) - n = dst_get_neighbour(skb_dst(skb)); - - if (likely(n)) { - if (unlikely(!*to_ipoib_neigh(n))) { - ipoib_path_lookup(skb, dev); - goto unlock; - } - - neigh = *to_ipoib_neigh(n); - - if (unlikely((memcmp(&neigh->dgid.raw, - n->ha + 4, - sizeof(union ib_gid))) || - (neigh->dev != dev))) { - spin_lock_irqsave(&priv->lock, flags); - /* - * It's safe to call ipoib_put_ah() inside - * priv->lock here, because we know that - * path->ah will always hold one more reference, - * so ipoib_put_ah() will never do more than - * decrement the ref count. - */ - if (neigh->ah) - ipoib_put_ah(neigh->ah); - list_del(&neigh->list); - ipoib_neigh_free(dev, neigh); - spin_unlock_irqrestore(&priv->lock, flags); - ipoib_path_lookup(skb, dev); - goto unlock; - } - - if (ipoib_cm_get(neigh)) { - if (ipoib_cm_up(neigh)) { - ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); - goto unlock; - } - } else if (neigh->ah) { - ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(n->ha)); - goto unlock; - } - - if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { - spin_lock_irqsave(&priv->lock, flags); - __skb_queue_tail(&neigh->queue, skb); - spin_unlock_irqrestore(&priv->lock, flags); - } else { - ++dev->stats.tx_dropped; - dev_kfree_skb_any(skb); - } - } else { - struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb; - - if (cb->hwaddr[4] == 0xff) { - /* Add in the P_Key for multicast*/ - cb->hwaddr[8] = (priv->pkey >> 8) & 0xff; - cb->hwaddr[9] = priv->pkey & 0xff; - - ipoib_mcast_send(dev, cb->hwaddr + 4, skb); - } else { - /* unicast GID -- should be ARP or RARP reply */ - - if ((be16_to_cpup((__be16 *) skb->data) != ETH_P_ARP) && - (be16_to_cpup((__be16 *) skb->data) != ETH_P_RARP)) { - ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x %pI6\n", - skb_dst(skb) ? 
"neigh" : "dst", - be16_to_cpup((__be16 *) skb->data), - IPOIB_QPN(cb->hwaddr), - cb->hwaddr + 4); - dev_kfree_skb_any(skb); - ++dev->stats.tx_dropped; - goto unlock; - } - - unicast_arp_send(skb, dev, cb); - } - } -unlock: - rcu_read_unlock(); - return NETDEV_TX_OK; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh *neigh; + struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb; + struct ipoib_header *header; + unsigned long flags; + + header = (struct ipoib_header *) skb->data; + + if (unlikely(cb->hwaddr[4] == 0xff)) { + /* multicast, arrange "if" according to probability */ + if ((header->proto != htons(ETH_P_IP)) && + (header->proto != htons(ETH_P_IPV6)) && + (header->proto != htons(ETH_P_ARP)) && + (header->proto != htons(ETH_P_RARP))) { + /* ethertype not supported by IPoIB */ + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + /* Add in the P_Key for multicast*/ + cb->hwaddr[8] = (priv->pkey >> 8) & 0xff; + cb->hwaddr[9] = priv->pkey & 0xff; + + neigh = ipoib_neigh_get(dev, cb->hwaddr); + if (likely(neigh)) + goto send_using_neigh; + ipoib_mcast_send(dev, cb->hwaddr, skb); + return NETDEV_TX_OK; + } + /* unicast, arrange "switch" according to probability */ + switch (header->proto) { + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + neigh = ipoib_neigh_get(dev, cb->hwaddr); + if (unlikely(!neigh)) { + neigh_add_path(skb, cb->hwaddr, dev); + return NETDEV_TX_OK; + } + break; + case htons(ETH_P_ARP): + case htons(ETH_P_RARP): + /* for unicast ARP and RARP should always perform path find */ + unicast_arp_send(skb, dev, cb); + return NETDEV_TX_OK; + default: + /* ethertype not supported by IPoIB */ + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } +send_using_neigh: + /* note we now hold a ref to neigh */ + if (ipoib_cm_get(neigh)) { + if (ipoib_cm_up(neigh)) { + ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); + goto unref; + } + } else if (neigh->ah) { + ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr)); + goto unref; + } + + if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + spin_lock_irqsave(&priv->lock, flags); + __skb_queue_tail(&neigh->queue, skb); + spin_unlock_irqrestore(&priv->lock, flags); + } else { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + } + +unref: + ipoib_neigh_put(neigh); + + return NETDEV_TX_OK; } static void ipoib_timeout(struct net_device *dev) @@ -807,9 +798,16 @@ static void ipoib_timeout(struct net_device *dev) ipoib_warn(priv, "transmit timeout: latency %d msecs\n", jiffies_to_msecs(jiffies - dev->trans_start)); - ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n", - netif_queue_stopped(dev), - priv->tx_head, priv->tx_tail); + ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u, tx_outstanding %u ipoib_sendq_size: %d \n", + netif_queue_stopped(dev),priv->tx_head, priv->tx_tail, priv->tx_outstanding, ipoib_sendq_size); + + if (unlikely(priv->tx_outstanding < ipoib_sendq_size >> 1) && + netif_queue_stopped(dev) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { + ipoib_warn(priv, "%s: waking the queue\n", __func__); + netif_wake_queue(dev); + } + /* XXX reset QP, etc. 
*/ } @@ -819,8 +817,7 @@ static int ipoib_hard_header(struct sk_buff *skb, const void *daddr, const void *saddr, unsigned len) { struct ipoib_header *header; - struct dst_entry *dst; - struct neighbour *n; + struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb; header = (struct ipoib_header *) skb_push(skb, sizeof *header); @@ -828,14 +825,11 @@ static int ipoib_hard_header(struct sk_buff *skb, header->reserved = 0; /* - * If we don't have a dst_entry structure, stuff the + * we don't rely on dst_entry structure, always stuff the * destination address into skb->cb so we can figure out where * to send the packet later. */ - if (!skb_dst(skb)) { - struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb; - memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN); - } + memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN); return 0; } @@ -852,100 +846,655 @@ static void ipoib_set_mcast_list(struct net_device *dev) queue_work(ipoib_workqueue, &priv->restart_task); } -static void ipoib_neigh_cleanup(struct neighbour *n) +static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr) { - struct ipoib_neigh *neigh; - struct ipoib_dev_priv *priv = netdev_priv(n->dev); + /* + * * Use only the address parts that contributes to spreading + * * The subnet prefix is not used as one can not connect to + * * same remote port (GUID) using the same remote QPN via two + * * different subnets. + * */ + /* qpn octets[1:4) & port GUID octets[12:20) */ + u32 *daddr_32 = (u32 *) daddr; + u32 hv; + + hv = jhash_3words(daddr_32[3], daddr_32[4], 0xFFFFFF & daddr_32[0], 0); + return hv & htbl->mask; +} + +struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh *neigh = NULL; + u32 hash_val; + + rcu_read_lock_bh(); + + htbl = rcu_dereference_bh(ntbl->htbl); + + if (!htbl) + goto out_unlock; + + hash_val = ipoib_addr_hash(htbl, daddr); + for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]); + neigh != NULL; + neigh = rcu_dereference_bh(neigh->hnext)) { + /* don't use flags for the comapre */ + if (memcmp(daddr+1, neigh->daddr+1, INFINIBAND_ALEN-1) == 0) { + /* found, take one ref on behalf of the caller */ + if (!atomic_inc_not_zero(&neigh->refcnt)) { + /* deleted */ + neigh = NULL; + goto out_unlock; + } + neigh->alive = jiffies; + goto out_unlock; + } + } + +out_unlock: + rcu_read_unlock_bh(); + return neigh; +} + +static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) +{ + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + unsigned long neigh_obsolete; + unsigned long dt; unsigned long flags; - struct ipoib_ah *ah = NULL; + int i; - neigh = *to_ipoib_neigh(n); - if (neigh) - priv = netdev_priv(neigh->dev); - else + if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) return; - ipoib_dbg(priv, - "neigh_cleanup for %06x %pI6\n", - IPOIB_QPN(n->ha), - n->ha + 4); spin_lock_irqsave(&priv->lock, flags); - if (neigh->ah) - ah = neigh->ah; - list_del(&neigh->list); - ipoib_neigh_free(n->dev, neigh); + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + + if (!htbl) + goto out_unlock; + + /* neigh is obsolete if it was idle for two GC periods */ + dt = 2 * arp_tbl.gc_interval; + neigh_obsolete = jiffies - dt; + /* handle possible race condition */ + if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) + goto out_unlock; + + for (i = 0; i < htbl->size; i++) { + struct ipoib_neigh *neigh; + struct ipoib_neigh __rcu **np = 
&htbl->buckets[i]; + + while ((neigh = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) != NULL) { + /* was the neigh idle for two GC periods */ + if (time_after(neigh_obsolete, neigh->alive)) { + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from path/mc list */ + list_del_init(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + } else { + np = &neigh->hnext; + } + + } + } +out_unlock: spin_unlock_irqrestore(&priv->lock, flags); +} + +static void ipoib_reap_neigh(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, neigh_reap_task.work); + + __ipoib_reap_neigh(priv); - if (ah) - ipoib_put_ah(ah); + if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) + queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, + arp_tbl.gc_interval); } -struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour, +static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr, struct net_device *dev) { struct ipoib_neigh *neigh; - neigh = kmalloc(sizeof *neigh, GFP_ATOMIC); + neigh = kzalloc(sizeof *neigh, GFP_ATOMIC); if (!neigh) return NULL; - neigh->neighbour = neighbour; neigh->dev = dev; - memset(&neigh->dgid.raw, 0, sizeof (union ib_gid)); - *to_ipoib_neigh(neighbour) = neigh; + memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr)); skb_queue_head_init(&neigh->queue); + INIT_LIST_HEAD(&neigh->list); ipoib_cm_set(neigh, NULL); + /* one ref on behalf of the caller */ + atomic_set(&neigh->refcnt, 1); + + ipoib_dbg(netdev_priv(dev), + "neigh ctor for %06x %pI6\n", + IPOIB_QPN(neigh->daddr), + neigh->daddr + 4); return neigh; } -void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh) +struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, + struct net_device *dev) { + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh *neigh; + u32 hash_val; + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + if (!htbl) { + neigh = NULL; + goto out_unlock; + } + + /* need to add a new neigh, but maybe some other thread succeeded? 
+ * recalc hash, maybe hash resize took place so we do a search + */ + hash_val = ipoib_addr_hash(htbl, daddr); + for (neigh = rcu_dereference_protected(htbl->buckets[hash_val], + lockdep_is_held(&priv->lock)); + neigh != NULL; + neigh = rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))) { + /* don't use flags for the comapre */ + if (memcmp(daddr+1, neigh->daddr+1, INFINIBAND_ALEN-1) == 0) { + /* found, take one ref on behalf of the caller */ + if (!atomic_inc_not_zero(&neigh->refcnt)) { + /* deleted */ + neigh = NULL; + break; + } + neigh->alive = jiffies; + goto out_unlock; + } + } + + neigh = ipoib_neigh_ctor(daddr, dev); + if (!neigh) + goto out_unlock; + + /* one ref on behalf of the hash table */ + atomic_inc(&neigh->refcnt); + neigh->alive = jiffies; + /* put in hash */ + rcu_assign_pointer(neigh->hnext, + rcu_dereference_protected(htbl->buckets[hash_val], + lockdep_is_held(&priv->lock))); + rcu_assign_pointer(htbl->buckets[hash_val], neigh); + atomic_inc(&ntbl->entries); + +out_unlock: + return neigh; +} + +void ipoib_neigh_dtor(struct ipoib_neigh *neigh) +{ + /* neigh reference count was dropprd to zero */ + struct net_device *dev = neigh->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); struct sk_buff *skb; - *to_ipoib_neigh(neigh->neighbour) = NULL; + if (neigh->ah) + ipoib_put_ah(neigh->ah); while ((skb = __skb_dequeue(&neigh->queue))) { ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); } if (ipoib_cm_get(neigh)) ipoib_cm_destroy_tx(ipoib_cm_get(neigh)); + ipoib_dbg(netdev_priv(dev), + "neigh free for %06x %pI6\n", + IPOIB_QPN(neigh->daddr), + neigh->daddr + 4); kfree(neigh); + if (atomic_dec_and_test(&priv->ntbl.entries)) { + if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags)) + complete(&priv->ntbl.flushed); + } } -static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms) +static void ipoib_neigh_reclaim(struct rcu_head *rp) { - parms->neigh_cleanup = ipoib_neigh_cleanup; + /* Called as a result of removal from hash table */ + struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu); + /* note TX context may hold another ref */ + ipoib_neigh_put(neigh); +} - return 0; +/* +* clean_path_from_cache: free path from both caches +* (list and rb tree) +* call that function under lock. (netif_tx_lock_bh && priv->lock) +*/ +static inline void clean_path_from_cache(struct ipoib_path *path, + struct ipoib_dev_priv *priv) +{ + list_del(&path->list); + rb_erase(&path->rb_node, &priv->path_tree); + if (path->query) + ib_sa_cancel_query(path->query_id, path->query); +} + +/* +* clean_path_dependencies: free path from neigths. +* Do not call this function under locks. +*/ +static inline void clean_path_references(struct ipoib_path *path, + struct net_device *dev) +{ + wait_for_completion(&path->done); + path_free(dev, path); +} + +/* +* ipoib_repath_ah: for each arp response/request: +* check that the lid ipoib kept for this gid +* is the same as it has in the arp packet. +* if not, delete that path from the cache. 
+*/ +void ipoib_repath_ah(struct work_struct *work) +{ + struct ipoib_arp_repath *repath = + container_of(work, struct ipoib_arp_repath, work); + + struct net_device *dev = repath->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_path *path_from_cache; + u16 lid_from_cache; + unsigned long flags; + + netif_tx_lock_bh(dev); + spin_lock_irqsave(&priv->lock, flags); + + path_from_cache = __path_find(dev, &repath->sgid); + + if (path_from_cache) { + lid_from_cache = be16_to_cpu(path_from_cache->pathrec.dlid); + /*check if we have the same path in the path cache:*/ + if ((lid_from_cache && repath->lid) && + (repath->lid != lid_from_cache)) { + ipoib_warn(priv, "Found gid with mismach lids." + "(cache:%d,from arp: %d)\n", + lid_from_cache, repath->lid); + clean_path_from_cache(path_from_cache, priv); + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + clean_path_references(path_from_cache, dev); + goto free_res; + } + } + + spin_unlock_irqrestore(&priv->lock, flags); + netif_tx_unlock_bh(dev); + + free_res: + kfree(repath); +} + +void ipoib_neigh_free(struct ipoib_neigh *neigh) +{ + struct net_device *dev = neigh->dev; + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh __rcu **np; + struct ipoib_neigh *n; + u32 hash_val; + + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + if (!htbl) + return; + + hash_val = ipoib_addr_hash(htbl, neigh->daddr); + np = &htbl->buckets[hash_val]; + for (n = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock)); + n != NULL; + n = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) { + if (n == neigh) { + /* found */ + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + return; + } else { + np = &n->hnext; + } + } +} + +static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) +{ + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + struct ipoib_neigh **buckets; + u32 size; + + clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); + ntbl->htbl = NULL; + htbl = kzalloc(sizeof(*htbl), GFP_KERNEL); + if (!htbl) + return -ENOMEM; + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + size = roundup_pow_of_two(arp_tbl.gc_thresh3); + buckets = kzalloc(size * sizeof(*buckets), GFP_KERNEL); + if (!buckets) { + kfree(htbl); + return -ENOMEM; + } + htbl->size = size; + htbl->mask = (size - 1); + htbl->buckets = buckets; + ntbl->htbl = htbl; + htbl->ntbl = ntbl; + atomic_set(&ntbl->entries, 0); + + /* start garbage collection */ + clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, + arp_tbl.gc_interval); + + return 0; +} + +static void neigh_hash_free_rcu(struct rcu_head *head) +{ + struct ipoib_neigh_hash *htbl = container_of(head, + struct ipoib_neigh_hash, + rcu); + struct ipoib_neigh __rcu **buckets = htbl->buckets; + struct ipoib_neigh_table *ntbl = htbl->ntbl; + + kfree(buckets); + kfree(htbl); + complete(&ntbl->deleted); +} + +void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + unsigned long flags; + int i; + + /* remove all neigh connected to a given path or mcast */ + spin_lock_irqsave(&priv->lock, flags); + + htbl = rcu_dereference_protected(ntbl->htbl, + 
lockdep_is_held(&priv->lock)); + + if (!htbl) + goto out_unlock; + + for (i = 0; i < htbl->size; i++) { + struct ipoib_neigh *neigh; + struct ipoib_neigh __rcu **np = &htbl->buckets[i]; + + while ((neigh = rcu_dereference_protected(*np, + lockdep_is_held(&priv->lock))) != NULL) { + /* delete neighs belong to this parent */ + if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) { + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&priv->lock))); + /* remove from parent list */ + list_del_init(&neigh->list); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + } else { + np = &neigh->hnext; + } + + } + } + +out_unlock: + spin_unlock_irqrestore(&priv->lock, flags); +} + +static void ipoib_flush_neighs(struct ipoib_dev_priv *priv) +{ + struct ipoib_neigh_table *ntbl = &priv->ntbl; + struct ipoib_neigh_hash *htbl; + unsigned long flags; + int i; + + spin_lock_irqsave(&priv->lock, flags); + + htbl = rcu_dereference_protected(ntbl->htbl, + lockdep_is_held(&priv->lock)); + + if (!htbl) + goto out_unlock; + + for (i = 0; i < htbl->size; i++) { + struct ipoib_neigh *neigh; + struct ipoib_neigh __rcu **np = &htbl->buckets[i]; + + while ((neigh = rcu_dereference_protected(*np, + lockdep_is_held(&ntbl->rwlock))) != NULL) { + rcu_assign_pointer(*np, + rcu_dereference_protected(neigh->hnext, + lockdep_is_held(&ntbl->rwlock))); + /* remove from path/mc list */ + spin_lock_irqsave(&priv->lock, flags); + list_del_init(&neigh->list); + spin_unlock_irqrestore(&priv->lock, flags); + call_rcu(&neigh->rcu, ipoib_neigh_reclaim); + } + } + + rcu_assign_pointer(ntbl->htbl, NULL); + call_rcu(&htbl->rcu, neigh_hash_free_rcu); + +out_unlock: + spin_unlock_irqrestore(&priv->lock, flags); +} + +static void ipoib_neigh_hash_uninit(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + int stopped; + + ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n"); + init_completion(&priv->ntbl.deleted); + set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); + + /* Stop GC if called at init fail need to cancel work */ + stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + if (!stopped) + cancel_delayed_work(&priv->neigh_reap_task); + + ipoib_flush_neighs(priv); + + wait_for_completion(&priv->ntbl.deleted); + +} + +static void ipoib_set_default_moderation(struct ipoib_dev_priv *priv) +{ + + /* If we haven't received a specific coalescing setting + * (module param), we set the moderation parameters as follows: + * - moder_cnt is set to the number of mtu sized packets to + * satisfy our coaelscing target. + * - moder_time is set to a fixed value. 
+ */ + priv->ethtool.rx_max_coalesced_frames = IPOIB_RX_COAL_TARGET; + priv->ethtool.rx_coalesce_usecs = IPOIB_RX_COAL_TIME; + printk(KERN_ERR "Default coalescing params for mtu:%d - " + "rx_frames:%d rx_usecs:%d\n", + priv->dev->mtu, priv->ethtool.rx_max_coalesced_frames, + priv->ethtool.rx_coalesce_usecs); + + /* Reset auto-moderation params */ + priv->ethtool.pkt_rate_low = IPOIB_RX_RATE_LOW; + priv->ethtool.rx_coalesce_usecs_low = IPOIB_RX_COAL_TIME_LOW; + priv->ethtool.pkt_rate_high = IPOIB_RX_RATE_HIGH; + priv->ethtool.rx_coalesce_usecs_high = IPOIB_RX_COAL_TIME_HIGH; + priv->ethtool.sample_interval = IPOIB_SAMPLE_INTERVAL; + priv->ethtool.use_adaptive_rx_coalesce = 1; + priv->ethtool.last_moder_time = IPOIB_AUTO_CONF; + priv->ethtool.last_moder_jiffies = 0; + priv->ethtool.last_moder_packets = 0; + priv->ethtool.last_moder_tx_packets = 0; + priv->ethtool.last_moder_bytes = 0; +} +/* +The function classifies the incoming traffic during each sampling interval +into classes. The rx_usec value (i.e., moderation time) is then adjusted +appropriately per class. +There are two classes defined: + A. Bulk traffic: for heavy traffic consisting of packets of normal size. + This class is further divided into two sub-classes: + 1. Traffic that is mainly BW bound + - This traffic will get maximum moderation. + 2. Traffic that is mostly latency bound + - For situations where low latency is vital. + - The rx_usec will be changed to a value in the range: + (ethtool.rx_coalesce_usecs_low .. ethtool.rx_coalesce_usecs_high) + depending on sampled packet rate. + B. Low latency traffic: for minimal traffic, or small packets. + - This traffic will get minimum moderation. +*/ +static void ipoib_auto_moderation(struct ipoib_dev_priv *priv) +{ + unsigned long period = jiffies - priv->ethtool.last_moder_jiffies; + unsigned long packets; + unsigned long rate; + unsigned long avg_pkt_size; + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long tx_packets; + unsigned long tx_pkt_diff; + unsigned long rx_pkt_diff; + int moder_time; + int ret; + + if (!priv->ethtool.use_adaptive_rx_coalesce) + return; + + rx_packets = priv->dev->stats.rx_packets; + rx_bytes = priv->dev->stats.rx_bytes; + tx_packets = priv->dev->stats.tx_packets; + + tx_pkt_diff = tx_packets - priv->ethtool.last_moder_tx_packets; + rx_pkt_diff = rx_packets - priv->ethtool.last_moder_packets; + packets = max(tx_pkt_diff, rx_pkt_diff); + rate = packets * HZ / period; + avg_pkt_size = packets ? 
+ (rx_bytes - priv->ethtool.last_moder_bytes) / packets : 0; + + /* Apply auto-moderation only when packet rate exceeds a rate that + * it matters */ + if (rate > IPOIB_RX_RATE_THRESH && + avg_pkt_size > IPOIB_AVG_PKT_SMALL) { + if (rate < priv->ethtool.pkt_rate_low) + moder_time = + priv->ethtool.rx_coalesce_usecs_low; + else if (rate > priv->ethtool.pkt_rate_high) + moder_time = + priv->ethtool.rx_coalesce_usecs_high; + else + moder_time = (rate - priv->ethtool.pkt_rate_low) * + (priv->ethtool.rx_coalesce_usecs_high - priv->ethtool.rx_coalesce_usecs_low) / + (priv->ethtool.pkt_rate_high - priv->ethtool.pkt_rate_low) + + priv->ethtool.rx_coalesce_usecs_low; + + } else + moder_time = priv->ethtool.rx_coalesce_usecs_low; + + if (moder_time != priv->ethtool.last_moder_time) { + ipoib_dbg(priv, "%s: Rx moder_time changed from:%d to %d\n", + __func__, priv->ethtool.last_moder_time, moder_time); + priv->ethtool.last_moder_time = moder_time; + ret = ib_modify_cq(priv->recv_cq, + priv->ethtool.rx_max_coalesced_frames, + moder_time); + if (ret && ret != -ENOSYS) + ipoib_warn(priv, "%s: failed modifying CQ (%d)\n", + __func__, ret); + } + + priv->ethtool.last_moder_packets = rx_packets; + priv->ethtool.last_moder_tx_packets = tx_packets; + priv->ethtool.last_moder_bytes = rx_bytes; + priv->ethtool.last_moder_jiffies = jiffies; +} + +static void ipoib_config_adapt_moder(struct work_struct *work) +{ + struct delayed_work *delay = to_delayed_work(work); + struct ipoib_dev_priv *priv = container_of(delay, + struct ipoib_dev_priv, + adaptive_moder_task); + + if (!(netif_running(priv->dev) && netif_carrier_ok(priv->dev))) { + ipoib_dbg(priv, "%s: port is not ACTIVE, no configuration" + " for adaptive moderation\n", + __func__); + return; + } + + ipoib_auto_moderation(priv); + + if (test_bit(IPOIB_FLAG_AUTO_MODER, &priv->flags) && + priv->ethtool.use_adaptive_rx_coalesce) + queue_delayed_work(ipoib_auto_moder_workqueue, + &priv->adaptive_moder_task, + ADAPT_MODERATION_DELAY); } int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) { struct ipoib_dev_priv *priv = netdev_priv(dev); + if (ipoib_neigh_hash_init(priv) < 0) + goto out; + /* Allocate RX/TX "rings" to hold queued skbs */ priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, GFP_KERNEL); if (!priv->rx_ring) { printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", ca->name, ipoib_recvq_size); - goto out; + goto out_neigh_hash_cleanup; } - priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring); + priv->tx_ring = vmalloc(ipoib_sendq_size * sizeof *priv->tx_ring); if (!priv->tx_ring) { printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n", ca->name, ipoib_sendq_size); goto out_rx_ring_cleanup; } + memset(priv->tx_ring, 0, ipoib_sendq_size * sizeof *priv->tx_ring); /* priv->tx_head, tx_tail & tx_outstanding are already 0 */ - if (ipoib_ib_dev_init(dev, ca, port)) goto out_tx_ring_cleanup; + ipoib_set_default_moderation(priv); + return 0; out_tx_ring_cleanup: @@ -954,6 +1503,8 @@ out_tx_ring_cleanup: out_rx_ring_cleanup: kfree(priv->rx_ring); +out_neigh_hash_cleanup: + ipoib_neigh_hash_uninit(dev); out: return -ENOMEM; } @@ -966,6 +1517,9 @@ void ipoib_dev_cleanup(struct net_device *dev) /* Delete any child interfaces first */ list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { + /* Stop GC on child */ + set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags); + cancel_delayed_work(&cpriv->neigh_reap_task); unregister_netdev(cpriv->dev); ipoib_dev_cleanup(cpriv->dev); 
free_netdev(cpriv->dev); @@ -978,21 +1532,69 @@ void ipoib_dev_cleanup(struct net_device *dev) priv->rx_ring = NULL; priv->tx_ring = NULL; + + ipoib_neigh_hash_uninit(dev); } static const struct header_ops ipoib_header_ops = { .create = ipoib_hard_header, }; +static int get_skb_hdr(struct sk_buff *skb, void **iphdr, + void **tcph, u64 *hdr_flags, void *priv) +{ + unsigned int ip_len; + struct iphdr *iph; + + if (unlikely(skb->protocol != htons(ETH_P_IP))) + return -1; + + /* + * In the future we may add an else clause that verifies the + * checksum and allows devices which do not calculate checksum + * to use LRO. + */ + if (unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY)) + return -1; + + /* Check for non-TCP packet */ + skb_reset_network_header(skb); + iph = ip_hdr(skb); + if (iph->protocol != IPPROTO_TCP) + return -1; + + ip_len = ip_hdrlen(skb); + skb_set_transport_header(skb, ip_len); + *tcph = tcp_hdr(skb); + + /* check if IP header and TCP header are complete */ + if (ntohs(iph->tot_len) < ip_len + tcp_hdrlen(skb)) + return -1; + + *hdr_flags = LRO_IPV4 | LRO_TCP; + *iphdr = iph; + + return 0; +} + +static void ipoib_lro_setup(struct ipoib_dev_priv *priv) +{ + priv->lro.lro_mgr.max_aggr = lro_max_aggr; + priv->lro.lro_mgr.max_desc = IPOIB_MAX_LRO_DESCRIPTORS; + priv->lro.lro_mgr.lro_arr = priv->lro.lro_desc; + priv->lro.lro_mgr.get_skb_header = get_skb_hdr; + priv->lro.lro_mgr.features = LRO_F_NAPI; + priv->lro.lro_mgr.dev = priv->dev; + priv->lro.lro_mgr.ip_summed_aggr = CHECKSUM_UNNECESSARY; +} + static const struct net_device_ops ipoib_netdev_ops = { .ndo_open = ipoib_open, .ndo_stop = ipoib_stop, .ndo_change_mtu = ipoib_change_mtu, - .ndo_fix_features = ipoib_fix_features, .ndo_start_xmit = ipoib_start_xmit, .ndo_tx_timeout = ipoib_timeout, .ndo_set_multicast_list = ipoib_set_mcast_list, - .ndo_neigh_setup = ipoib_neigh_setup_dev, }; static void ipoib_setup(struct net_device *dev) @@ -1006,7 +1608,7 @@ static void ipoib_setup(struct net_device *dev) netif_napi_add(dev, &priv->napi, ipoib_poll, 100); - dev->watchdog_timeo = HZ; + dev->watchdog_timeo = 5 * HZ; dev->flags |= IFF_BROADCAST | IFF_MULTICAST; @@ -1024,9 +1626,13 @@ static void ipoib_setup(struct net_device *dev) priv->dev = dev; + ipoib_lro_setup(priv); + spin_lock_init(&priv->lock); + spin_lock_init(&priv->rx_ring_lock); mutex_init(&priv->vlan_mutex); + mutex_init(&priv->state_lock); INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); @@ -1034,13 +1640,16 @@ static void ipoib_setup(struct net_device *dev) INIT_LIST_HEAD(&priv->multicast_list); INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll); - INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); + INIT_DELAYED_WORK(&priv->mcast_join_task, ipoib_mcast_join_task); + INIT_DELAYED_WORK(&priv->mcast_leave_task, ipoib_mcast_leave_task); INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal); INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); + INIT_DELAYED_WORK(&priv->adaptive_moder_task, ipoib_config_adapt_moder); + INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh); } struct ipoib_dev_priv *ipoib_intf_alloc(const char *name) @@ -1095,17 +1704,44 @@ int ipoib_add_umcast_attr(struct net_device *dev) return device_create_file(&dev->dev, &dev_attr_umcast); } +static int parse_child(struct 
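get_skb_hdr() above lets LRO aggregate only frames that are IPv4, already checksum-verified, carry TCP, and have complete IP and TCP headers. A user-space sketch of the header-completeness checks on a raw IPv4 datagram follows; the function name and the manual offset parsing are for illustration, the kernel code uses ip_hdr()/tcp_hdr() on the skb instead.

/* Sketch: accept a frame for aggregation only if IPv4 + complete TCP header. */
#include <stdint.h>
#include <stddef.h>

#define PROTO_TCP 6

static int is_aggregatable_tcp(const uint8_t *pkt, size_t len)
{
	size_t ip_hlen, tcp_hlen, tot_len;

	if (len < 20 || (pkt[0] >> 4) != 4)		/* IPv4 only */
		return 0;
	ip_hlen = (size_t)(pkt[0] & 0x0f) * 4;		/* IHL in 32-bit words */
	if (ip_hlen < 20 || len < ip_hlen + 20)
		return 0;
	if (pkt[9] != PROTO_TCP)			/* protocol field */
		return 0;

	tot_len = ((size_t)pkt[2] << 8) | pkt[3];	/* IP total length */
	tcp_hlen = (size_t)(pkt[ip_hlen + 12] >> 4) * 4;/* TCP data offset */

	/* both headers must be fully contained in the datagram */
	return tot_len >= ip_hlen + tcp_hlen;
}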
device *dev, const char *buf, int *pkey, + int *child_index) +{ + int ret; + struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev)); + + *pkey = *child_index = -1; + + /* 'pkey' or 'pkey.child_index' or '.child_index' are allowed */ + ret = sscanf(buf, "%i.%i", pkey, child_index); + if (ret == 1) /* just pkey, implicit child index is 0 */ + *child_index = 0; + else if (ret != 2) { /* pkey same as parent, specified child index */ + *pkey = priv->pkey; + ret = sscanf(buf, ".%i", child_index); + if (ret != 1 || *child_index == 0) + return -EINVAL; + } + + if (*child_index < 0 || *child_index > 0xff) + return -EINVAL; + + if (*pkey < 0 || *pkey > 0xffff) + return -EINVAL; + + ipoib_dbg(priv, "parse_child inp %s out pkey %04x index %d\n", + buf, *pkey, *child_index); + return 0; +} + static ssize_t create_child(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - int pkey; + int pkey, child_index; int ret; - if (sscanf(buf, "%i", &pkey) != 1) - return -EINVAL; - - if (pkey < 0 || pkey > 0xffff) + if (parse_child(dev, buf, &pkey, &child_index)) return -EINVAL; /* @@ -1114,37 +1750,43 @@ static ssize_t create_child(struct device *dev, */ pkey |= 0x8000; - ret = ipoib_vlan_add(to_net_dev(dev), pkey); + ret = ipoib_vlan_add(to_net_dev(dev), pkey, child_index); return ret ? ret : count; } -static DEVICE_ATTR(create_child, S_IWUSR, NULL, create_child); +static DEVICE_ATTR(create_child, S_IWUGO, NULL, create_child); static ssize_t delete_child(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - int pkey; + int pkey, child_index; int ret; - if (sscanf(buf, "%i", &pkey) != 1) - return -EINVAL; - - if (pkey < 0 || pkey > 0xffff) + if (parse_child(dev, buf, &pkey, &child_index)) return -EINVAL; - ret = ipoib_vlan_delete(to_net_dev(dev), pkey); + ret = ipoib_vlan_delete(to_net_dev(dev), pkey, child_index); return ret ? 
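parse_child() above accepts "pkey", "pkey.child_index", or ".child_index" (the last form inherits the parent's P_Key). A user-space sketch of that parsing follows; parse_child_arg() and the values in main() are illustrative.

/* Sketch of the sysfs "pkey[.index]" / ".index" argument parsing. */
#include <stdio.h>

static int parse_child_arg(const char *buf, int parent_pkey,
			   int *pkey, int *child_index)
{
	int ret;

	*pkey = *child_index = -1;

	ret = sscanf(buf, "%i.%i", pkey, child_index);
	if (ret == 1)			/* "pkey": implicit child index 0 */
		*child_index = 0;
	else if (ret != 2) {		/* ".index": pkey same as parent */
		*pkey = parent_pkey;
		if (sscanf(buf, ".%i", child_index) != 1 || *child_index == 0)
			return -1;
	}

	if (*child_index < 0 || *child_index > 0xff)
		return -1;
	if (*pkey < 0 || *pkey > 0xffff)
		return -1;
	return 0;
}

int main(void)
{
	int pkey, idx;

	if (!parse_child_arg("0x8001.3", 0x7fff, &pkey, &idx))
		printf("pkey 0x%04x index %d\n", pkey, idx);
	return 0;
}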
ret : count; } -static DEVICE_ATTR(delete_child, S_IWUSR, NULL, delete_child); +static DEVICE_ATTR(delete_child, S_IWUGO, NULL, delete_child); int ipoib_add_pkey_attr(struct net_device *dev) { return device_create_file(&dev->dev, &dev_attr_pkey); } +void set_lro_features_bit(struct ipoib_dev_priv *priv) +{ + if (lro) + priv->dev->features |= NETIF_F_LRO; + /*no support in LRO with 4k mtu.*/ + if (ipoib_ud_need_sg(priv->max_ib_mtu)) + priv->dev->features &= ~NETIF_F_LRO; +} + int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) { struct ib_device_attr *device_attr; @@ -1169,18 +1811,21 @@ int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) kfree(device_attr); if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { - priv->dev->hw_features = NETIF_F_SG | - NETIF_F_IP_CSUM | NETIF_F_RXCSUM; + set_bit(IPOIB_FLAG_CSUM, &priv->flags); + priv->dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM; + } - if (priv->hca_caps & IB_DEVICE_UD_TSO) - priv->dev->hw_features |= NETIF_F_TSO; + set_lro_features_bit(priv); - priv->dev->features |= priv->dev->hw_features; - } + if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO) + priv->dev->features |= NETIF_F_TSO; + + priv->dev->features |= SOCK_ACCL_POLL_TCP | SOCK_ACCL_POLL_UDP; return 0; } + static struct net_device *ipoib_add_port(const char *format, struct ib_device *hca, u8 port) { @@ -1214,8 +1859,12 @@ static struct net_device *ipoib_add_port(const char *format, goto device_init_failed; } - if (ipoib_set_dev_features(priv, hca)) + result = ipoib_set_dev_features(priv, hca); + if (result) { + printk(KERN_WARNING "%s: failed to set device features for port %d (ret = %d)\n", + hca->name, port, result); goto device_init_failed; + } /* * Set the full membership bit, so that we join the right @@ -1226,6 +1875,10 @@ static struct net_device *ipoib_add_port(const char *format, priv->dev->broadcast[8] = priv->pkey >> 8; priv->dev->broadcast[9] = priv->pkey & 0xff; + priv->accl_priv.poll = &ipoib_accl_poll; + priv->accl_priv.get_tcp_ring = ipoib_get_tcp_ring; + priv->accl_priv.get_udp_rings = ipoib_get_udp_rings; + result = ib_query_gid(hca, port, 0, &priv->local_gid); if (result) { printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n", @@ -1257,9 +1910,14 @@ static struct net_device *ipoib_add_port(const char *format, hca->name, port, result); goto register_failed; } + /*force lro on the dev->features, because the function + register_netdev disable it according to our private lro*/ + set_lro_features_bit(priv); ipoib_create_debug_files(priv->dev); + result = -ENOMEM; + if (ipoib_cm_add_mode_attr(priv->dev)) goto sysfs_failed; if (ipoib_add_pkey_attr(priv->dev)) @@ -1279,6 +1937,9 @@ sysfs_failed: register_failed: ib_unregister_event_handler(&priv->event_handler); + /* Stop GC if started before flush */ + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + cancel_delayed_work(&priv->neigh_reap_task); flush_workqueue(ipoib_workqueue); event_failed: @@ -1316,7 +1977,7 @@ static void ipoib_add_one(struct ib_device *device) } for (p = s; p <= e; ++p) { - if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND) + if (rdma_port_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND) continue; dev = ipoib_add_port("ib%d", device, p); if (!IS_ERR(dev)) { @@ -1337,15 +1998,25 @@ static void ipoib_remove_one(struct ib_device *device) return; dev_list = ib_get_client_data(device, &ipoib_client); + if (!dev_list) + return; list_for_each_entry_safe(priv, tmp, dev_list, list) { + if 
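In ipoib_set_dev_features() above, a checksum-capable HCA enables scatter/gather and IP checksum offload, LRO is kept only when the MTU does not force scatter/gather receives, and TSO is advertised only if SG ended up enabled. A small sketch of that flag derivation; the flag and capability values are made up for the example.

#include <stdint.h>

#define F_SG		(1u << 0)
#define F_IPCSUM	(1u << 1)
#define F_TSO		(1u << 2)
#define F_LRO		(1u << 3)

#define CAP_UD_IP_CSUM	(1u << 0)
#define CAP_UD_TSO	(1u << 1)

static uint32_t derive_features(uint32_t hca_caps, int lro_enabled,
				int needs_sg_mtu)
{
	uint32_t f = 0;

	if (hca_caps & CAP_UD_IP_CSUM)
		f |= F_SG | F_IPCSUM;
	if (lro_enabled && !needs_sg_mtu)	/* no LRO with 4K MTU */
		f |= F_LRO;
	if ((f & F_SG) && (hca_caps & CAP_UD_TSO))
		f |= F_TSO;			/* TSO only on top of SG */
	return f;
}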
(rdma_port_link_layer(device, priv->port) != IB_LINK_LAYER_INFINIBAND) + continue; + + set_bit(IPOIB_FLAG_MODULE_DOWN, &priv->flags); ib_unregister_event_handler(&priv->event_handler); rtnl_lock(); dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); rtnl_unlock(); + /* Stop GC */ + set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); + cancel_delayed_work(&priv->neigh_reap_task); flush_workqueue(ipoib_workqueue); + flush_workqueue(ipoib_auto_moder_workqueue); unregister_netdev(priv->dev); ipoib_dev_cleanup(priv->dev); @@ -1365,7 +2036,8 @@ static int __init ipoib_init_module(void) ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size); ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE); - ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE); + ipoib_sendq_size = max(ipoib_sendq_size, max(2 * MAX_SEND_CQE, + IPOIB_MIN_QUEUE_SIZE)); #ifdef CONFIG_INFINIBAND_IPOIB_CM ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); #endif @@ -1394,6 +2066,14 @@ static int __init ipoib_init_module(void) goto err_fs; } + ipoib_auto_moder_workqueue = + create_singlethread_workqueue("ipoib_auto_moder"); + if (!ipoib_auto_moder_workqueue) { + ret = -ENOMEM; + goto err_am; + } + + ib_sa_register_client(&ipoib_sa_client); ret = ib_register_client(&ipoib_client); @@ -1404,6 +2084,8 @@ static int __init ipoib_init_module(void) err_sa: ib_sa_unregister_client(&ipoib_sa_client); + destroy_workqueue(ipoib_auto_moder_workqueue); +err_am: destroy_workqueue(ipoib_workqueue); err_fs: @@ -1418,6 +2100,7 @@ static void __exit ipoib_cleanup_module(void) ib_sa_unregister_client(&ipoib_sa_client); ipoib_unregister_debugfs(); destroy_workqueue(ipoib_workqueue); + destroy_workqueue(ipoib_auto_moder_workqueue); } module_init(ipoib_init_module); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index fc045946298ee..81e2d678bc0ba 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -40,7 +40,6 @@ #include #include #include -#include #include @@ -51,7 +50,7 @@ static int mcast_debug_level; module_param(mcast_debug_level, int, 0644); MODULE_PARM_DESC(mcast_debug_level, - "Enable multicast debug tracing if > 0"); + "Enable multicast debug tracing if > 0 (default: 0)"); #endif static DEFINE_MUTEX(mcast_mutex); @@ -68,28 +67,14 @@ struct ipoib_mcast_iter { static void ipoib_mcast_free(struct ipoib_mcast *mcast) { struct net_device *dev = mcast->dev; - struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ipoib_neigh *neigh, *tmp; int tx_dropped = 0; ipoib_dbg_mcast(netdev_priv(dev), "deleting multicast group %pI6\n", mcast->mcmember.mgid.raw); - spin_lock_irq(&priv->lock); - list_for_each_entry_safe(neigh, tmp, &mcast->neigh_list, list) { - /* - * It's safe to call ipoib_put_ah() inside priv->lock - * here, because we know that mcast->ah will always - * hold one more reference, so ipoib_put_ah() will - * never do more than decrement the ref count. 
- */ - if (neigh->ah) - ipoib_put_ah(neigh->ah); - ipoib_neigh_free(dev, neigh); - } - - spin_unlock_irq(&priv->lock); + /* remove all neigh connected to this mcast */ + ipoib_del_neighs_by_gid(dev, mcast->mcmember.mgid.raw); if (mcast->ah) ipoib_put_ah(mcast->ah); @@ -117,6 +102,7 @@ static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev, mcast->dev = dev; mcast->created = jiffies; + mcast->used = jiffies; mcast->backoff = 1; INIT_LIST_HEAD(&mcast->list); @@ -189,9 +175,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, mcast->mcmember = *mcmember; - /* Set the multicast MTU and cached Q_Key before we attach if it's - * the broadcast group. - */ + /* Set the cached Q_Key before we attach if it's the broadcast group */ if (!memcmp(mcast->mcmember.mgid.raw, priv->dev->broadcast + 4, sizeof (union ib_gid))) { spin_lock_irq(&priv->lock); @@ -199,17 +183,10 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, spin_unlock_irq(&priv->lock); return -EAGAIN; } - priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey); spin_unlock_irq(&priv->lock); priv->tx_wr.wr.ud.remote_qkey = priv->qkey; set_qkey = 1; - - if (!ipoib_cm_admin_enabled(dev)) { - rtnl_lock(); - dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu)); - rtnl_unlock(); - } } if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { @@ -267,14 +244,13 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, netif_tx_lock_bh(dev); while (!skb_queue_empty(&mcast->pkt_queue)) { struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue); - netif_tx_unlock_bh(dev); skb->dev = dev; - - if (dev_queue_xmit(skb)) - ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n"); - + ret = dev_queue_xmit(skb); + if (ret) + ipoib_warn(priv, "%s: dev_queue_xmit failed to " + "requeue packet(ret: %d)\n", __func__, ret); netif_tx_lock_bh(dev); } netif_tx_unlock_bh(dev); @@ -327,6 +303,7 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) .join_state = 1 #endif }; + ib_sa_comp_mask comp_mask; int ret = 0; if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { @@ -343,12 +320,38 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) rec.port_gid = priv->local_gid; rec.pkey = cpu_to_be16(priv->pkey); + comp_mask = + IB_SA_MCMEMBER_REC_MGID | + IB_SA_MCMEMBER_REC_PORT_GID | + IB_SA_MCMEMBER_REC_PKEY | + IB_SA_MCMEMBER_REC_JOIN_STATE; + + if (priv->broadcast) { + comp_mask |= + IB_SA_MCMEMBER_REC_QKEY | + IB_SA_MCMEMBER_REC_MTU_SELECTOR | + IB_SA_MCMEMBER_REC_MTU | + IB_SA_MCMEMBER_REC_TRAFFIC_CLASS | + IB_SA_MCMEMBER_REC_RATE_SELECTOR | + IB_SA_MCMEMBER_REC_RATE | + IB_SA_MCMEMBER_REC_SL | + IB_SA_MCMEMBER_REC_FLOW_LABEL | + IB_SA_MCMEMBER_REC_HOP_LIMIT; + + rec.qkey = priv->broadcast->mcmember.qkey; + rec.mtu_selector = IB_SA_EQ; + rec.mtu = priv->broadcast->mcmember.mtu; + rec.traffic_class = priv->broadcast->mcmember.traffic_class; + rec.rate_selector = IB_SA_EQ; + rec.rate = priv->broadcast->mcmember.rate; + rec.sl = priv->broadcast->mcmember.sl; + rec.flow_label = priv->broadcast->mcmember.flow_label; + rec.hop_limit = priv->broadcast->mcmember.hop_limit; + } + mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, &rec, - IB_SA_MCMEMBER_REC_MGID | - IB_SA_MCMEMBER_REC_PORT_GID | - IB_SA_MCMEMBER_REC_PKEY | - IB_SA_MCMEMBER_REC_JOIN_STATE, + comp_mask, GFP_ATOMIC, ipoib_mcast_sendonly_join_complete, mcast); @@ -371,20 +374,31 @@ void ipoib_mcast_carrier_on_task(struct work_struct 
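The send-only join above always specifies MGID, port GID, P_Key and join state, and, once the broadcast group is known, also copies its Q_Key, MTU, rate, SL and related parameters so the SA returns a compatible membership. A sketch of that conditional mask building; the flag values and structure are illustrative, not the ib_sa definitions.

#include <stdint.h>

#define REC_MGID	(1u << 0)
#define REC_PORT_GID	(1u << 1)
#define REC_PKEY	(1u << 2)
#define REC_JOIN_STATE	(1u << 3)
#define REC_QKEY	(1u << 4)
#define REC_MTU		(1u << 5)
#define REC_RATE	(1u << 6)
#define REC_SL		(1u << 7)

struct mc_rec {
	uint32_t qkey;
	uint8_t mtu, rate, sl;
};

static uint32_t build_comp_mask(const struct mc_rec *bcast, struct mc_rec *rec)
{
	uint32_t mask = REC_MGID | REC_PORT_GID | REC_PKEY | REC_JOIN_STATE;

	if (bcast) {		/* inherit the broadcast group's parameters */
		mask |= REC_QKEY | REC_MTU | REC_RATE | REC_SL;
		rec->qkey = bcast->qkey;
		rec->mtu = bcast->mtu;
		rec->rate = bcast->rate;
		rec->sl = bcast->sl;
	}
	return mask;
}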
*work) carrier_on_task); struct ib_port_attr attr; - /* - * Take rtnl_lock to avoid racing with ipoib_stop() and - * turning the carrier back on while a device is being - * removed. - */ + mutex_lock(&priv->state_lock); + if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { + ipoib_dbg(priv, "Keeping carrier off - IPOIB_FLAG_ADMIN_UP not set.\n"); + goto out; + } + if (ib_query_port(priv->ca, priv->port, &attr) || attr.state != IB_PORT_ACTIVE) { ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); - return; + goto out; } - rtnl_lock(); netif_carrier_on(priv->dev); - rtnl_unlock(); + + /* enable auto-moderation */ + if (priv->ethtool.use_adaptive_rx_coalesce && + test_bit(IPOIB_FLAG_AUTO_MODER, &priv->flags)) + queue_delayed_work(ipoib_auto_moder_workqueue, + &priv->adaptive_moder_task, + ADAPT_MODERATION_DELAY); + +out: + mutex_unlock(&priv->state_lock); + + } static int ipoib_mcast_join_complete(int status, @@ -398,8 +412,10 @@ static int ipoib_mcast_join_complete(int status, mcast->mcmember.mgid.raw, status); /* We trap for port events ourselves. */ - if (status == -ENETRESET) - return 0; + if (status == -ENETRESET){ + status = 0; + goto out; + } if (!status) status = ipoib_mcast_join_finish(mcast, &multicast->rec); @@ -409,7 +425,7 @@ static int ipoib_mcast_join_complete(int status, mutex_lock(&mcast_mutex); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, - &priv->mcast_task, 0); + &priv->mcast_join_task, 0); mutex_unlock(&mcast_mutex); /* @@ -419,7 +435,8 @@ static int ipoib_mcast_join_complete(int status, if (mcast == priv->broadcast) queue_work(ipoib_workqueue, &priv->carrier_on_task); - return 0; + status = 0; + goto out; } if (mcast->logcount++ < 20) { @@ -442,11 +459,12 @@ static int ipoib_mcast_join_complete(int status, mutex_lock(&mcast_mutex); spin_lock_irq(&priv->lock); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, &priv->mcast_task, + queue_delayed_work(ipoib_workqueue, &priv->mcast_join_task, mcast->backoff * HZ); spin_unlock_irq(&priv->lock); mutex_unlock(&mcast_mutex); - +out: + complete(&mcast->done); return status; } @@ -496,11 +514,15 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, } set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + init_completion(&mcast->done); + set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags); + mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, &rec, comp_mask, GFP_KERNEL, ipoib_mcast_join_complete, mcast); if (IS_ERR(mcast->mc)) { clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + complete(&mcast->done); ret = PTR_ERR(mcast->mc); ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); @@ -511,7 +533,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, mutex_lock(&mcast_mutex); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, - &priv->mcast_task, + &priv->mcast_join_task, mcast->backoff * HZ); mutex_unlock(&mcast_mutex); } @@ -520,12 +542,20 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, void ipoib_mcast_join_task(struct work_struct *work) { struct ipoib_dev_priv *priv = - container_of(work, struct ipoib_dev_priv, mcast_task.work); + container_of(work, struct ipoib_dev_priv, mcast_join_task.work); struct net_device *dev = priv->dev; + struct ib_port_attr attr; if (!test_bit(IPOIB_MCAST_RUN, &priv->flags)) return; + if (ib_query_port(priv->ca, priv->port, &attr) || + attr.state != IB_PORT_ACTIVE) { + 
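The join path above pairs init_completion()/complete() with the wait_for_completion() done in the flush path further below, so a multicast group whose SA join is still in flight is never freed under its callback. A minimal kernel-style sketch of the pattern, assuming a module context and using illustrative names.

#include <linux/completion.h>
#include <linux/bitops.h>
#include <linux/slab.h>

struct demo_join {
	struct completion done;
	unsigned long flags;
#define DEMO_JOIN_STARTED 0
};

static void demo_join_start(struct demo_join *j)
{
	init_completion(&j->done);
	set_bit(DEMO_JOIN_STARTED, &j->flags);
	/* ... hand j to an asynchronous join callback here ... */
}

static void demo_join_callback(struct demo_join *j)
{
	/* ... handle the join result ... */
	complete(&j->done);		/* always signal, success or failure */
}

static void demo_join_teardown(struct demo_join *j)
{
	if (test_bit(DEMO_JOIN_STARTED, &j->flags))
		wait_for_completion(&j->done);
	kfree(j);			/* safe: callback can no longer touch j */
}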
ipoib_dbg(priv, "%s: port state is not ACTIVE (state = %d) suspend task.\n", + __func__, attr.state); + return; + } + if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid)) ipoib_warn(priv, "ib_query_gid() failed\n"); else @@ -552,7 +582,7 @@ void ipoib_mcast_join_task(struct work_struct *work) mutex_lock(&mcast_mutex); if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) queue_delayed_work(ipoib_workqueue, - &priv->mcast_task, HZ); + &priv->mcast_join_task, HZ); mutex_unlock(&mcast_mutex); return; } @@ -566,8 +596,10 @@ void ipoib_mcast_join_task(struct work_struct *work) spin_unlock_irq(&priv->lock); } - if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { - if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) + if (priv->broadcast && + !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { + if (priv->broadcast && + !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) ipoib_mcast_join(dev, priv->broadcast, 0); return; } @@ -595,6 +627,19 @@ void ipoib_mcast_join_task(struct work_struct *work) return; } + spin_lock_irq(&priv->lock); + if (priv->broadcast) + priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); + else + priv->mcast_mtu = priv->admin_mtu; + spin_unlock_irq(&priv->lock); + + if (!ipoib_cm_admin_enabled(dev)) { + rtnl_lock(); + dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu)); + rtnl_unlock(); + } + ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n"); clear_bit(IPOIB_MCAST_RUN, &priv->flags); @@ -608,7 +653,9 @@ int ipoib_mcast_start_thread(struct net_device *dev) mutex_lock(&mcast_mutex); if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0); + queue_delayed_work(ipoib_workqueue, &priv->mcast_join_task, 0); + if (!test_and_set_bit(IPOIB_MCAST_RUN_GC, &priv->flags)) + queue_delayed_work(ipoib_workqueue, &priv->mcast_leave_task, 0); mutex_unlock(&mcast_mutex); return 0; @@ -622,7 +669,9 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush) mutex_lock(&mcast_mutex); clear_bit(IPOIB_MCAST_RUN, &priv->flags); - cancel_delayed_work(&priv->mcast_task); + clear_bit(IPOIB_MCAST_RUN_GC, &priv->flags); + cancel_delayed_work(&priv->mcast_join_task); + cancel_delayed_work(&priv->mcast_leave_task); mutex_unlock(&mcast_mutex); if (flush) @@ -653,11 +702,12 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) return 0; } -void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb) +void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_mcast *mcast; unsigned long flags; + void *mgid = daddr + 4; spin_lock_irqsave(&priv->lock, flags); @@ -686,6 +736,25 @@ void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb) set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags); memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid)); + + /* + * Check if user-space already attached to that mcg. + * if yes, marks the mcg as user-space-attached, and when + * the kernel will call ipoib to add it as full memeber + * in set_mc_list callback, ipoib ignores that mcg. 
+ */ + if (test_bit(IPOIB_FLAG_UMCAST, &priv->flags)) { + union ib_gid sa_mgid; + struct ib_sa_mcmember_rec rec; + + memcpy(sa_mgid.raw, mgid, sizeof sa_mgid); + if (!ib_sa_get_mcmember_rec(priv->ca, priv->port, &sa_mgid, &rec)) { + ipoib_dbg_mcast(priv, "Found send-only that already attached" + " by user-space mgid %pI6\n", &mgid); + set_bit(IPOIB_MCAST_UMCAST_ATTACHED, &mcast->flags); + } + } + __ipoib_mcast_add(dev, mcast); list_add_tail(&mcast->list, &priv->multicast_list); } @@ -713,26 +782,26 @@ void ipoib_mcast_send(struct net_device *dev, void *mgid, struct sk_buff *skb) out: if (mcast && mcast->ah) { - struct dst_entry *dst = skb_dst(skb); - struct neighbour *n = NULL; - - rcu_read_lock(); - if (dst) - n = dst_get_neighbour(dst); - if (n && !*to_ipoib_neigh(n)) { - struct ipoib_neigh *neigh = ipoib_neigh_alloc(n, - skb->dev); - - if (neigh) { - kref_get(&mcast->ah->ref); - neigh->ah = mcast->ah; - list_add_tail(&neigh->list, &mcast->neigh_list); - } - } - rcu_read_unlock(); - spin_unlock_irqrestore(&priv->lock, flags); - ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN); - return; + struct ipoib_neigh *neigh; + + spin_unlock_irqrestore(&priv->lock, flags); + neigh = ipoib_neigh_get(dev, daddr); + spin_lock_irqsave(&priv->lock, flags); + if (!neigh) { + neigh = ipoib_neigh_alloc(daddr, dev); + if (neigh) { + kref_get(&mcast->ah->ref); + neigh->ah = mcast->ah; + list_add_tail(&neigh->list, &mcast->neigh_list); + } + } + spin_unlock_irqrestore(&priv->lock, flags); + mcast->used = jiffies; + ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN); + if (neigh) + ipoib_neigh_put(neigh); + return; + } unlock: @@ -764,6 +833,11 @@ void ipoib_mcast_dev_flush(struct net_device *dev) spin_unlock_irqrestore(&priv->lock, flags); + /*seperate between the wait to the leav.*/ + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) + if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags)) + wait_for_completion(&mcast->done); + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { ipoib_mcast_leave(dev, mcast); ipoib_mcast_free(mcast); @@ -824,8 +898,9 @@ void ipoib_mcast_restart_task(struct work_struct *work) struct ipoib_mcast *nmcast; /* ignore group which is directly joined by userspace */ - if (test_bit(IPOIB_FLAG_UMCAST, &priv->flags) && - !ib_sa_get_mcmember_rec(priv->ca, priv->port, &mgid, &rec)) { + if ((!mcast && test_bit(IPOIB_FLAG_UMCAST, &priv->flags) && + !ib_sa_get_mcmember_rec(priv->ca, priv->port, &mgid, &rec)) || + (mcast && test_bit(IPOIB_MCAST_UMCAST_ATTACHED, &mcast->flags))) { ipoib_dbg_mcast(priv, "ignoring multicast entry for mgid %pI6\n", mgid.raw); continue; @@ -890,6 +965,38 @@ void ipoib_mcast_restart_task(struct work_struct *work) ipoib_mcast_start_thread(dev); } +void ipoib_mcast_leave_task(struct work_struct *work) +{ + struct ipoib_dev_priv *priv = + container_of(work, struct ipoib_dev_priv, mcast_leave_task.work); + struct net_device *dev = priv->dev; + struct ipoib_mcast *mcast, *tmcast; + LIST_HEAD(remove_list); + + if (!test_bit(IPOIB_MCAST_RUN_GC, &priv->flags)) + return; + + if (ipoib_mc_sendonly_timeout > 0) { + list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) { + if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) && + time_before(mcast->used, jiffies - ipoib_mc_sendonly_timeout * HZ)) { + rb_erase(&mcast->rb_node, &priv->multicast_tree); + list_move_tail(&mcast->list, &remove_list); + } + } + + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { + ipoib_mcast_leave(dev, mcast); + ipoib_mcast_free(mcast); + } + } + + 
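ipoib_mcast_leave_task() above reaps send-only groups whose last-use timestamp is older than ipoib_mc_sendonly_timeout seconds. A kernel-style sketch of just that expiry test, with illustrative names.

#include <linux/types.h>
#include <linux/jiffies.h>

static bool demo_mcast_expired(unsigned long used, unsigned int timeout_secs)
{
	/* true when "used" is older than (now - timeout) */
	return time_before(used, jiffies - (unsigned long)timeout_secs * HZ);
}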
mutex_lock(&mcast_mutex); + if (test_bit(IPOIB_MCAST_RUN_GC, &priv->flags)) + queue_delayed_work(ipoib_workqueue, &priv->mcast_leave_task, 60 * HZ); + mutex_unlock(&mcast_mutex); +} + #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 049a997caff35..2e94e2d4c6627 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -31,9 +31,8 @@ * SOFTWARE. */ -#include - #include "ipoib.h" +#include int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid, int set_qkey) { @@ -144,6 +143,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) int ret, size; int i; + struct ethtool_coalesce *coal; priv->pd = ib_alloc_pd(priv->ca); if (IS_ERR(priv->pd)) { @@ -167,7 +167,8 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) size += ipoib_recvq_size * ipoib_max_conn_qp; } - priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0); + priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, + priv->child_index % priv->ca->num_comp_vectors); if (IS_ERR(priv->recv_cq)) { printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name); goto out_free_mr; @@ -183,6 +184,16 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP)) goto out_free_send_cq; + coal = kzalloc(sizeof *coal, GFP_KERNEL); + if (coal) { + coal->rx_coalesce_usecs = 10; + coal->tx_coalesce_usecs = 10; + coal->rx_max_coalesced_frames = 16; + coal->tx_max_coalesced_frames = 16; + dev->ethtool_ops->set_coalesce(dev, coal); + kfree(coal); + } + init_attr.send_cq = priv->send_cq; init_attr.recv_cq = priv->recv_cq; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index d7e9740c72480..96889d8ab8051 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -33,6 +33,7 @@ #include #include +#include #include #include @@ -49,7 +50,8 @@ static ssize_t show_parent(struct device *d, struct device_attribute *attr, } static DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL); -int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) +int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey, + unsigned char child_index) { struct ipoib_dev_priv *ppriv, *priv; char intf_name[IFNAMSIZ]; @@ -59,31 +61,52 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) return -EPERM; ppriv = netdev_priv(pdev); - - if (!rtnl_trylock()) - return restart_syscall(); + while (!rtnl_trylock()) { + if (test_bit(IPOIB_FLAG_MODULE_DOWN, &ppriv->flags)) { + ipoib_dbg(ppriv, "%s: module is going down - nop\n", + __func__); + return -ENODEV; + } + /* enable other tasks to unlock the rtnl */ + msleep(5); + } mutex_lock(&ppriv->vlan_mutex); /* - * First ensure this isn't a duplicate. We check the parent device and - * then all of the child interfaces to make sure the Pkey doesn't match. + * First ensure this isn't a duplicate. We check all of the child + * interfaces to make sure the Pkey AND the child index + * don't match. 
*/ - if (ppriv->pkey == pkey) { - result = -ENOTUNIQ; - priv = NULL; - goto err; - } - list_for_each_entry(priv, &ppriv->child_intfs, list) { - if (priv->pkey == pkey) { + if (priv->pkey == pkey && priv->child_index == child_index) { result = -ENOTUNIQ; priv = NULL; goto err; } } - snprintf(intf_name, sizeof intf_name, "%s.%04x", - ppriv->dev->name, pkey); + /* + * for the case of non-legacy and same pkey childs we wanted to use + * a notation of ibN.pkey:index and ibN:index but this is problematic + * with tools like ifconfig who treat devices with ":" in their names + * as aliases which are restriced, e.t w.r.t counters, etc + */ + if (ppriv->pkey != pkey && child_index == 0) /* legacy child */ + snprintf(intf_name, sizeof intf_name, "%s.%04x", + ppriv->dev->name, pkey); + else if (ppriv->pkey != pkey && child_index != 0) /* non-legacy child */ + snprintf(intf_name, sizeof intf_name, "%s.%04x.%d", + ppriv->dev->name, pkey, child_index); + else if (ppriv->pkey == pkey && child_index != 0) /* same pkey child */ + snprintf(intf_name, sizeof intf_name, "%s.%d", + ppriv->dev->name, child_index); + else { + ipoib_warn(ppriv, "wrong pkey/child_index pairing %04x %d\n", + pkey, child_index); + result = -EINVAL; + goto err; + } + priv = ipoib_intf_alloc(intf_name); if (!priv) { result = -ENOMEM; @@ -101,6 +124,7 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) goto err; priv->pkey = pkey; + priv->child_index = child_index; memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN); priv->dev->broadcast[8] = pkey >> 8; @@ -157,7 +181,8 @@ err: return result; } -int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) +int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey, + unsigned char child_index) { struct ipoib_dev_priv *ppriv, *priv, *tpriv; struct net_device *dev = NULL; @@ -166,14 +191,20 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) return -EPERM; ppriv = netdev_priv(pdev); + while (!rtnl_trylock()) { + if (test_bit(IPOIB_FLAG_MODULE_DOWN, &ppriv->flags)) { + ipoib_dbg(ppriv, "%s: module is going down - nop\n", + __func__); + return -ENODEV; + } + /* enable other tasks to unlock the rtnl */ + msleep(5); + } - if (!rtnl_trylock()) - return restart_syscall(); mutex_lock(&ppriv->vlan_mutex); list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { - if (priv->pkey == pkey) { + if (priv->pkey == pkey && priv->child_index == child_index) { unregister_netdevice(priv->dev); - ipoib_dev_cleanup(priv->dev); list_del(&priv->list); dev = priv->dev; break; @@ -183,6 +214,7 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) rtnl_unlock(); if (dev) { + ipoib_dev_cleanup(dev); free_netdev(dev); return 0; } diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 3c442c34d33cc..f7bd97c7466ed 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -56,7 +56,6 @@ #include #include #include -#include #include @@ -129,28 +128,6 @@ static int iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode) return 0; } -int iser_initialize_task_headers(struct iscsi_task *task, - struct iser_tx_desc *tx_desc) -{ - struct iscsi_iser_conn *iser_conn = task->conn->dd_data; - struct iser_device *device = iser_conn->ib_conn->device; - struct iscsi_iser_task *iser_task = task->dd_data; - u64 dma_addr; - - dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc, - ISER_HEADERS_LEN, DMA_TO_DEVICE); - if 
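The naming chosen above yields ibN.pkey for legacy children, ibN.pkey.index for non-legacy children, and ibN.index for same-P_Key children, deliberately avoiding ':' so tools like ifconfig do not treat them as restricted aliases. A user-space sketch of that selection; child_ifname() and the inputs in main() are illustrative.

#include <stdio.h>

#define IFNAMSIZ 16

static int child_ifname(char *buf, const char *parent, int parent_pkey,
			int pkey, int index)
{
	if (pkey != parent_pkey && index == 0)	/* legacy child */
		return snprintf(buf, IFNAMSIZ, "%s.%04x", parent, pkey);
	if (pkey != parent_pkey)		/* non-legacy child */
		return snprintf(buf, IFNAMSIZ, "%s.%04x.%d", parent, pkey, index);
	if (index != 0)				/* same-pkey child */
		return snprintf(buf, IFNAMSIZ, "%s.%d", parent, index);
	return -1;				/* invalid pkey/index pairing */
}

int main(void)
{
	char name[IFNAMSIZ];

	if (child_ifname(name, "ib0", 0xffff, 0x8001, 2) > 0)
		printf("%s\n", name);		/* prints ib0.8001.2 */
	return 0;
}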
(ib_dma_mapping_error(device->ib_device, dma_addr)) - return -ENOMEM; - - tx_desc->dma_addr = dma_addr; - tx_desc->tx_sg[0].addr = tx_desc->dma_addr; - tx_desc->tx_sg[0].length = ISER_HEADERS_LEN; - tx_desc->tx_sg[0].lkey = device->mr->lkey; - - iser_task->headers_initialized = 1; - iser_task->iser_conn = iser_conn; - return 0; -} /** * iscsi_iser_task_init - Initialize task * @task: iscsi task @@ -160,17 +137,17 @@ int iser_initialize_task_headers(struct iscsi_task *task, static int iscsi_iser_task_init(struct iscsi_task *task) { + struct iscsi_iser_conn *iser_conn = task->conn->dd_data; struct iscsi_iser_task *iser_task = task->dd_data; - if (!iser_task->headers_initialized) - if (iser_initialize_task_headers(task, &iser_task->desc)) - return -ENOMEM; - /* mgmt task */ - if (!task->sc) + if (!task->sc) { + iser_task->desc.data = task->data; return 0; + } iser_task->command_sent = 0; + iser_task->iser_conn = iser_conn; iser_task_rdma_init(iser_task); return 0; } @@ -191,7 +168,7 @@ iscsi_iser_mtask_xmit(struct iscsi_conn *conn, struct iscsi_task *task) { int error = 0; - iser_dbg("mtask xmit [cid %d itt 0x%x]\n", conn->id, task->itt); + iser_dbg("task deq [cid %d itt 0x%x]\n", conn->id, task->itt); error = iser_send_control(conn, task); @@ -201,6 +178,9 @@ iscsi_iser_mtask_xmit(struct iscsi_conn *conn, struct iscsi_task *task) * - if yes, the task is recycled at iscsi_complete_pdu * - if no, the task is recycled at iser_snd_completion */ + if (error && error != -ENOBUFS) + iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED); + return error; } @@ -252,7 +232,7 @@ iscsi_iser_task_xmit(struct iscsi_task *task) task->imm_count, task->unsol_r2t.data_length); } - iser_dbg("ctask xmit [cid %d itt 0x%x]\n", + iser_dbg("task deq [cid %d itt 0x%x]\n", conn->id, task->itt); /* Send the cmd PDU */ @@ -268,6 +248,8 @@ iscsi_iser_task_xmit(struct iscsi_task *task) error = iscsi_iser_task_xmit_unsol_data(conn, task); iscsi_iser_task_xmit_exit: + if (error && error != -ENOBUFS) + iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED); return error; } @@ -275,8 +257,11 @@ static void iscsi_iser_cleanup_task(struct iscsi_task *task) { struct iscsi_iser_task *iser_task = task->dd_data; - /* mgmt tasks do not need special cleanup */ - if (!task->sc) + /* + * mgmt tasks do not need special cleanup and we do not + * allocate anything in the init task callout + */ + if (!task->sc || task->state == ISCSI_TASK_PENDING) return; if (iser_task->status == ISER_TASK_STATUS_STARTED) { @@ -301,7 +286,7 @@ iscsi_iser_conn_create(struct iscsi_cls_session *cls_session, uint32_t conn_idx) * due to issues with the login code re iser sematics * this not set in iscsi_conn_setup - FIXME */ - conn->max_recv_dlength = ISER_RECV_DATA_SEG_LEN; + conn->max_recv_dlength = 128; iser_conn = conn->dd_data; conn->dd_data = iser_conn; @@ -325,7 +310,7 @@ iscsi_iser_conn_destroy(struct iscsi_cls_conn *cls_conn) */ if (ib_conn) { ib_conn->iser_conn = NULL; - iser_conn_put(ib_conn, 1); /* deref iscsi/ib conn unbinding */ + iser_conn_put(ib_conn); } } @@ -354,18 +339,14 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session, } ib_conn = ep->dd_data; - if (iser_alloc_rx_descriptors(ib_conn)) - return -ENOMEM; - /* binds the iSER connection retrieved from the previously * connected ep_handle to the iSCSI layer connection. 
exchanges * connection pointers */ - iser_err("binding iscsi/iser conn %p %p to ib_conn %p\n", - conn, conn->dd_data, ib_conn); + iser_err("binding iscsi conn %p to iser_conn %p\n",conn,ib_conn); iser_conn = conn->dd_data; ib_conn->iser_conn = iser_conn; iser_conn->ib_conn = ib_conn; - iser_conn_get(ib_conn); /* ref iscsi/ib conn binding */ + iser_conn_get(ib_conn); return 0; } @@ -386,11 +367,24 @@ iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag) * There is no unbind event so the stop callback * must release the ref from the bind. */ - iser_conn_put(ib_conn, 1); /* deref iscsi/ib conn unbinding */ + iser_conn_put(ib_conn); } iser_conn->ib_conn = NULL; } +static int +iscsi_iser_conn_start(struct iscsi_cls_conn *cls_conn) +{ + struct iscsi_conn *conn = cls_conn->dd_data; + int err; + + err = iser_conn_set_full_featured_mode(conn); + if (err) + return err; + + return iscsi_conn_start(cls_conn); +} + static void iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session) { struct Scsi_Host *shost = iscsi_session_to_shost(cls_session); @@ -410,7 +404,7 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, struct Scsi_Host *shost; struct iser_conn *ib_conn; - shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0); + shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 1); if (!shost) return NULL; shost->transportt = iscsi_iser_scsi_transport; @@ -522,32 +516,8 @@ iscsi_iser_conn_get_stats(struct iscsi_cls_conn *cls_conn, struct iscsi_stats *s stats->custom[3].value = conn->fmr_unalign_cnt; } -static int iscsi_iser_get_ep_param(struct iscsi_endpoint *ep, - enum iscsi_param param, char *buf) -{ - struct iser_conn *ib_conn = ep->dd_data; - int len; - - switch (param) { - case ISCSI_PARAM_CONN_PORT: - case ISCSI_PARAM_CONN_ADDRESS: - if (!ib_conn || !ib_conn->cma_id) - return -ENOTCONN; - - return iscsi_conn_get_addr_param((struct sockaddr_storage *) - &ib_conn->cma_id->route.addr.dst_addr, - param, buf); - break; - default: - return -ENOSYS; - } - - return len; -} - static struct iscsi_endpoint * -iscsi_iser_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr, - int non_blocking) +iscsi_iser_ep_connect(struct sockaddr *dst_addr, int non_blocking) { int err; struct iser_conn *ib_conn; @@ -618,59 +588,6 @@ iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep) iser_conn_terminate(ib_conn); } -static mode_t iser_attr_is_visible(int param_type, int param) -{ - switch (param_type) { - case ISCSI_HOST_PARAM: - switch (param) { - case ISCSI_HOST_PARAM_NETDEV_NAME: - case ISCSI_HOST_PARAM_HWADDRESS: - case ISCSI_HOST_PARAM_INITIATOR_NAME: - return S_IRUGO; - default: - return 0; - } - case ISCSI_PARAM: - switch (param) { - case ISCSI_PARAM_MAX_RECV_DLENGTH: - case ISCSI_PARAM_MAX_XMIT_DLENGTH: - case ISCSI_PARAM_HDRDGST_EN: - case ISCSI_PARAM_DATADGST_EN: - case ISCSI_PARAM_CONN_ADDRESS: - case ISCSI_PARAM_CONN_PORT: - case ISCSI_PARAM_EXP_STATSN: - case ISCSI_PARAM_PERSISTENT_ADDRESS: - case ISCSI_PARAM_PERSISTENT_PORT: - case ISCSI_PARAM_PING_TMO: - case ISCSI_PARAM_RECV_TMO: - case ISCSI_PARAM_INITIAL_R2T_EN: - case ISCSI_PARAM_MAX_R2T: - case ISCSI_PARAM_IMM_DATA_EN: - case ISCSI_PARAM_FIRST_BURST: - case ISCSI_PARAM_MAX_BURST: - case ISCSI_PARAM_PDU_INORDER_EN: - case ISCSI_PARAM_DATASEQ_INORDER_EN: - case ISCSI_PARAM_TARGET_NAME: - case ISCSI_PARAM_TPGT: - case ISCSI_PARAM_USERNAME: - case ISCSI_PARAM_PASSWORD: - case ISCSI_PARAM_USERNAME_IN: - case ISCSI_PARAM_PASSWORD_IN: - case ISCSI_PARAM_FAST_ABORT: - case ISCSI_PARAM_ABORT_TMO: - case ISCSI_PARAM_LU_RESET_TMO: - case 
ISCSI_PARAM_TGT_RESET_TMO: - case ISCSI_PARAM_IFACE_NAME: - case ISCSI_PARAM_INITIATOR_NAME: - return S_IRUGO; - default: - return 0; - } - } - - return 0; -} - static struct scsi_host_template iscsi_iser_sht = { .module = THIS_MODULE, .name = "iSCSI Initiator over iSER, v." DRV_VER, @@ -681,7 +598,7 @@ static struct scsi_host_template iscsi_iser_sht = { .cmd_per_lun = ISER_DEF_CMD_PER_LUN, .eh_abort_handler = iscsi_eh_abort, .eh_device_reset_handler= iscsi_eh_device_reset, - .eh_target_reset_handler = iscsi_eh_recover_target, + .eh_target_reset_handler= iscsi_eh_recover_target, .target_alloc = iscsi_target_alloc, .use_clustering = DISABLE_CLUSTERING, .proc_name = "iscsi_iser", @@ -699,12 +616,10 @@ static struct iscsi_transport iscsi_iser_transport = { .create_conn = iscsi_iser_conn_create, .bind_conn = iscsi_iser_conn_bind, .destroy_conn = iscsi_iser_conn_destroy, - .attr_is_visible = iser_attr_is_visible, .set_param = iscsi_iser_set_param, .get_conn_param = iscsi_conn_get_param, - .get_ep_param = iscsi_iser_get_ep_param, .get_session_param = iscsi_session_get_param, - .start_conn = iscsi_conn_start, + .start_conn = iscsi_iser_conn_start, .stop_conn = iscsi_iser_conn_stop, /* iscsi host params */ .get_host_param = iscsi_host_get_param, @@ -738,7 +653,7 @@ static int __init iser_init(void) memset(&ig, 0, sizeof(struct iser_global)); ig.desc_cache = kmem_cache_create("iser_descriptors", - sizeof(struct iser_tx_desc), + sizeof (struct iser_desc), 0, SLAB_HWCACHE_ALIGN, NULL); if (ig.desc_cache == NULL) diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 634aef039fe25..9d529cae1f0d9 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -91,7 +91,7 @@ #define SIZE_4K (1UL << SHIFT_4K) #define MASK_4K (~(SIZE_4K-1)) - /* support up to 512KB in one RDMA */ + /* support upto 512KB in one RDMA */ #define ISCSI_ISER_SG_TABLESIZE (0x80000 >> SHIFT_4K) #define ISER_DEF_CMD_PER_LUN 128 @@ -102,9 +102,9 @@ #define ISER_MAX_TX_MISC_PDUS 6 /* NOOP_OUT(2), TEXT(1), * * SCSI_TMFUNC(2), LOGOUT(1) */ -#define ISER_QP_MAX_RECV_DTOS (ISCSI_DEF_XMIT_CMDS_MAX) - -#define ISER_MIN_POSTED_RX (ISCSI_DEF_XMIT_CMDS_MAX >> 2) +#define ISER_QP_MAX_RECV_DTOS (ISCSI_DEF_XMIT_CMDS_MAX + \ + ISER_MAX_RX_MISC_PDUS + \ + ISER_MAX_TX_MISC_PDUS) /* the max TX (send) WR supported by the iSER QP is defined by * * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect * @@ -132,12 +132,6 @@ struct iser_hdr { __be64 read_va; } __attribute__((packed)); -/* Constant PDU lengths calculations */ -#define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr)) - -#define ISER_RECV_DATA_SEG_LEN 128 -#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN) -#define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN) /* Length of an object name string */ #define ISER_OBJECT_NAME_SIZE 64 @@ -193,46 +187,53 @@ struct iser_regd_buf { struct iser_mem_reg reg; /* memory registration info */ void *virt_addr; struct iser_device *device; /* device->device for dma_unmap */ + u64 dma_addr; /* if non zero, addr for dma_unmap */ enum dma_data_direction direction; /* direction for dma_unmap */ unsigned int data_size; + atomic_t ref_count; /* refcount, freed when dec to 0 */ +}; + +#define MAX_REGD_BUF_VECTOR_LEN 2 + +struct iser_dto { + struct iscsi_iser_task *task; + struct iser_conn *ib_conn; + int notify_enable; + + /* vector of registered buffers */ + unsigned int regd_vector_len; + 
struct iser_regd_buf *regd[MAX_REGD_BUF_VECTOR_LEN]; + + /* offset into the registered buffer may be specified */ + unsigned int offset[MAX_REGD_BUF_VECTOR_LEN]; + + /* a smaller size may be specified, if 0, then full size is used */ + unsigned int used_sz[MAX_REGD_BUF_VECTOR_LEN]; }; enum iser_desc_type { + ISCSI_RX, ISCSI_TX_CONTROL , ISCSI_TX_SCSI_COMMAND, ISCSI_TX_DATAOUT }; -struct iser_tx_desc { +struct iser_desc { struct iser_hdr iser_header; struct iscsi_hdr iscsi_header; + struct iser_regd_buf hdr_regd_buf; + void *data; /* used by RX & TX_CONTROL */ + struct iser_regd_buf data_regd_buf; /* used by RX & TX_CONTROL */ enum iser_desc_type type; - u64 dma_addr; - /* sg[0] points to iser/iscsi headers, sg[1] optionally points to either - of immediate data, unsolicited data-out or control (login,text) */ - struct ib_sge tx_sg[2]; - int num_sge; + struct iser_dto dto; }; -#define ISER_RX_PAD_SIZE (256 - (ISER_RX_PAYLOAD_SIZE + \ - sizeof(u64) + sizeof(struct ib_sge))) -struct iser_rx_desc { - struct iser_hdr iser_header; - struct iscsi_hdr iscsi_header; - char data[ISER_RECV_DATA_SEG_LEN]; - u64 dma_addr; - struct ib_sge rx_sg; - char pad[ISER_RX_PAD_SIZE]; -} __attribute__((packed)); - struct iser_device { struct ib_device *ib_device; struct ib_pd *pd; - struct ib_cq *rx_cq; - struct ib_cq *tx_cq; + struct ib_cq *cq; struct ib_mr *mr; struct tasklet_struct cq_tasklet; - struct ib_event_handler event_handler; struct list_head ig_list; /* entry in ig devices list */ int refcount; }; @@ -247,19 +248,17 @@ struct iser_conn { struct rdma_cm_id *cma_id; /* CMA ID */ struct ib_qp *qp; /* QP */ struct ib_fmr_pool *fmr_pool; /* pool of IB FMRs */ + int disc_evt_flag; /* disconn event delivered */ wait_queue_head_t wait; /* waitq for conn/disconn */ - int post_recv_buf_count; /* posted rx count */ + atomic_t post_recv_buf_count; /* posted rx count */ atomic_t post_send_buf_count; /* posted tx count */ + atomic_t unexpected_pdu_count;/* count of received * + * unexpected pdus * + * not yet retired */ char name[ISER_OBJECT_NAME_SIZE]; struct iser_page_vec *page_vec; /* represents SG to fmr maps* * maps serialized as tx is*/ struct list_head conn_list; /* entry in ig conn list */ - - char *login_buf; - u64 login_dma; - unsigned int rx_desc_head; - struct iser_rx_desc *rx_descs; - struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX]; }; struct iscsi_iser_conn { @@ -268,7 +267,7 @@ struct iscsi_iser_conn { }; struct iscsi_iser_task { - struct iser_tx_desc desc; + struct iser_desc desc; struct iscsi_iser_conn *iser_conn; enum iser_task_status status; int command_sent; /* set if command sent */ @@ -276,7 +275,6 @@ struct iscsi_iser_task { struct iser_regd_buf rdma_regd[ISER_DIRS_NUM];/* regd rdma buf */ struct iser_data_buf data[ISER_DIRS_NUM]; /* orig. data des*/ struct iser_data_buf data_copy[ISER_DIRS_NUM];/* contig. 
copy */ - int headers_initialized; }; struct iser_page_vec { @@ -320,21 +318,26 @@ void iser_conn_init(struct iser_conn *ib_conn); void iser_conn_get(struct iser_conn *ib_conn); -int iser_conn_put(struct iser_conn *ib_conn, int destroy_cma_id_allowed); +void iser_conn_put(struct iser_conn *ib_conn); void iser_conn_terminate(struct iser_conn *ib_conn); -void iser_rcv_completion(struct iser_rx_desc *desc, - unsigned long dto_xfer_len, - struct iser_conn *ib_conn); +void iser_rcv_completion(struct iser_desc *desc, + unsigned long dto_xfer_len); -void iser_snd_completion(struct iser_tx_desc *desc, struct iser_conn *ib_conn); +void iser_snd_completion(struct iser_desc *desc); void iser_task_rdma_init(struct iscsi_iser_task *task); void iser_task_rdma_finalize(struct iscsi_iser_task *task); -void iser_free_rx_descriptors(struct iser_conn *ib_conn); +void iser_dto_buffs_release(struct iser_dto *dto); + +int iser_regd_buff_release(struct iser_regd_buf *regd_buf); + +void iser_reg_single(struct iser_device *device, + struct iser_regd_buf *regd_buf, + enum dma_data_direction direction); void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *task, enum iser_data_dir cmd_dir); @@ -353,9 +356,11 @@ int iser_reg_page_vec(struct iser_conn *ib_conn, void iser_unreg_mem(struct iser_mem_reg *mem_reg); -int iser_post_recvl(struct iser_conn *ib_conn); -int iser_post_recvm(struct iser_conn *ib_conn, int count); -int iser_post_send(struct iser_conn *ib_conn, struct iser_tx_desc *tx_desc); +int iser_post_recv(struct iser_desc *rx_desc); +int iser_post_send(struct iser_desc *tx_desc); + +int iser_conn_state_comp(struct iser_conn *ib_conn, + enum iser_ib_conn_state comp); int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, struct iser_data_buf *data, @@ -363,7 +368,4 @@ int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, enum dma_data_direction dma_dir); void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task); -int iser_initialize_task_headers(struct iscsi_task *task, - struct iser_tx_desc *tx_desc); -int iser_alloc_rx_descriptors(struct iser_conn *ib_conn); #endif diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index eb1ee6f8d8948..9de640200ad3b 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -39,6 +39,29 @@ #include "iscsi_iser.h" +/* Constant PDU lengths calculations */ +#define ISER_TOTAL_HEADERS_LEN (sizeof (struct iser_hdr) + \ + sizeof (struct iscsi_hdr)) + +/* iser_dto_add_regd_buff - increments the reference count for * + * the registered buffer & adds it to the DTO object */ +static void iser_dto_add_regd_buff(struct iser_dto *dto, + struct iser_regd_buf *regd_buf, + unsigned long use_offset, + unsigned long use_size) +{ + int add_idx; + + atomic_inc(®d_buf->ref_count); + + add_idx = dto->regd_vector_len; + dto->regd[add_idx] = regd_buf; + dto->used_sz[add_idx] = use_size; + dto->offset[add_idx] = use_offset; + + dto->regd_vector_len++; +} + /* Register user buffer memory and initialize passive rdma * dto descriptor. 
Total data size is stored in * iser_task->data[ISER_DIR_IN].data_len @@ -99,9 +122,9 @@ iser_prepare_write_cmd(struct iscsi_task *task, struct iscsi_iser_task *iser_task = task->dd_data; struct iser_regd_buf *regd_buf; int err; + struct iser_dto *send_dto = &iser_task->desc.dto; struct iser_hdr *hdr = &iser_task->desc.iser_header; struct iser_data_buf *buf_out = &iser_task->data[ISER_DIR_OUT]; - struct ib_sge *tx_dsg = &iser_task->desc.tx_sg[1]; err = iser_dma_map_task_data(iser_task, buf_out, @@ -140,127 +163,184 @@ iser_prepare_write_cmd(struct iscsi_task *task, if (imm_sz > 0) { iser_dbg("Cmd itt:%d, WRITE, adding imm.data sz: %d\n", task->itt, imm_sz); - tx_dsg->addr = regd_buf->reg.va; - tx_dsg->length = imm_sz; - tx_dsg->lkey = regd_buf->reg.lkey; - iser_task->desc.num_sge = 2; + iser_dto_add_regd_buff(send_dto, + regd_buf, + 0, + imm_sz); } return 0; } -/* creates a new tx descriptor and adds header regd buffer */ -static void iser_create_send_desc(struct iser_conn *ib_conn, - struct iser_tx_desc *tx_desc) +/** + * iser_post_receive_control - allocates, initializes and posts receive DTO. + */ +static int iser_post_receive_control(struct iscsi_conn *conn) { - struct iser_device *device = ib_conn->device; + struct iscsi_iser_conn *iser_conn = conn->dd_data; + struct iser_desc *rx_desc; + struct iser_regd_buf *regd_hdr; + struct iser_regd_buf *regd_data; + struct iser_dto *recv_dto = NULL; + struct iser_device *device = iser_conn->ib_conn->device; + int rx_data_size, err; + int posts, outstanding_unexp_pdus; + + /* for the login sequence we must support rx of upto 8K; login is done + * after conn create/bind (connect) and conn stop/bind (reconnect), + * what's common for both schemes is that the connection is not started + */ + if (conn->c_stage != ISCSI_CONN_STARTED) + rx_data_size = ISCSI_DEF_MAX_RECV_SEG_LEN; + else /* FIXME till user space sets conn->max_recv_dlength correctly */ + rx_data_size = 128; - ib_dma_sync_single_for_cpu(device->ib_device, - tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); + outstanding_unexp_pdus = + atomic_xchg(&iser_conn->ib_conn->unexpected_pdu_count, 0); - memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr)); - tx_desc->iser_header.flags = ISER_VER; + /* + * in addition to the response buffer, replace those consumed by + * unexpected pdus. 
+ */ + for (posts = 0; posts < 1 + outstanding_unexp_pdus; posts++) { + rx_desc = kmem_cache_alloc(ig.desc_cache, GFP_NOIO); + if (rx_desc == NULL) { + iser_err("Failed to alloc desc for post recv %d\n", + posts); + err = -ENOMEM; + goto post_rx_cache_alloc_failure; + } + rx_desc->type = ISCSI_RX; + rx_desc->data = kmalloc(rx_data_size, GFP_NOIO); + if (rx_desc->data == NULL) { + iser_err("Failed to alloc data buf for post recv %d\n", + posts); + err = -ENOMEM; + goto post_rx_kmalloc_failure; + } - tx_desc->num_sge = 1; + recv_dto = &rx_desc->dto; + recv_dto->ib_conn = iser_conn->ib_conn; + recv_dto->regd_vector_len = 0; - if (tx_desc->tx_sg[0].lkey != device->mr->lkey) { - tx_desc->tx_sg[0].lkey = device->mr->lkey; - iser_dbg("sdesc %p lkey mismatch, fixing\n", tx_desc); - } -} + regd_hdr = &rx_desc->hdr_regd_buf; + memset(regd_hdr, 0, sizeof(struct iser_regd_buf)); + regd_hdr->device = device; + regd_hdr->virt_addr = rx_desc; /* == &rx_desc->iser_header */ + regd_hdr->data_size = ISER_TOTAL_HEADERS_LEN; + iser_reg_single(device, regd_hdr, DMA_FROM_DEVICE); -int iser_alloc_rx_descriptors(struct iser_conn *ib_conn) -{ - int i, j; - u64 dma_addr; - struct iser_rx_desc *rx_desc; - struct ib_sge *rx_sg; - struct iser_device *device = ib_conn->device; - - ib_conn->rx_descs = kmalloc(ISER_QP_MAX_RECV_DTOS * - sizeof(struct iser_rx_desc), GFP_KERNEL); - if (!ib_conn->rx_descs) - goto rx_desc_alloc_fail; - - rx_desc = ib_conn->rx_descs; - - for (i = 0; i < ISER_QP_MAX_RECV_DTOS; i++, rx_desc++) { - dma_addr = ib_dma_map_single(device->ib_device, (void *)rx_desc, - ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); - if (ib_dma_mapping_error(device->ib_device, dma_addr)) - goto rx_desc_dma_map_failed; - - rx_desc->dma_addr = dma_addr; - - rx_sg = &rx_desc->rx_sg; - rx_sg->addr = rx_desc->dma_addr; - rx_sg->length = ISER_RX_PAYLOAD_SIZE; - rx_sg->lkey = device->mr->lkey; - } + iser_dto_add_regd_buff(recv_dto, regd_hdr, 0, 0); - ib_conn->rx_desc_head = 0; + regd_data = &rx_desc->data_regd_buf; + memset(regd_data, 0, sizeof(struct iser_regd_buf)); + regd_data->device = device; + regd_data->virt_addr = rx_desc->data; + regd_data->data_size = rx_data_size; + + iser_reg_single(device, regd_data, DMA_FROM_DEVICE); + + iser_dto_add_regd_buff(recv_dto, regd_data, 0, 0); + + err = iser_post_recv(rx_desc); + if (err) { + iser_err("Failed iser_post_recv for post %d\n", posts); + goto post_rx_post_recv_failure; + } + } + /* all posts successful */ return 0; -rx_desc_dma_map_failed: - rx_desc = ib_conn->rx_descs; - for (j = 0; j < i; j++, rx_desc++) - ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr, - ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); - kfree(ib_conn->rx_descs); - ib_conn->rx_descs = NULL; -rx_desc_alloc_fail: - iser_err("failed allocating rx descriptors / data buffers\n"); - return -ENOMEM; +post_rx_post_recv_failure: + iser_dto_buffs_release(recv_dto); + kfree(rx_desc->data); +post_rx_kmalloc_failure: + kmem_cache_free(ig.desc_cache, rx_desc); +post_rx_cache_alloc_failure: + if (posts > 0) { + /* + * response buffer posted, but did not replace all unexpected + * pdu recv bufs. 
Ignore error, retry occurs next send + */ + outstanding_unexp_pdus -= (posts - 1); + err = 0; + } + atomic_add(outstanding_unexp_pdus, + &iser_conn->ib_conn->unexpected_pdu_count); + + return err; } -void iser_free_rx_descriptors(struct iser_conn *ib_conn) +/* creates a new tx descriptor and adds header regd buffer */ +static void iser_create_send_desc(struct iscsi_iser_conn *iser_conn, + struct iser_desc *tx_desc) { - int i; - struct iser_rx_desc *rx_desc; - struct iser_device *device = ib_conn->device; + struct iser_regd_buf *regd_hdr = &tx_desc->hdr_regd_buf; + struct iser_dto *send_dto = &tx_desc->dto; - if (ib_conn->login_buf) { - ib_dma_unmap_single(device->ib_device, ib_conn->login_dma, - ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE); - kfree(ib_conn->login_buf); - } + memset(regd_hdr, 0, sizeof(struct iser_regd_buf)); + regd_hdr->device = iser_conn->ib_conn->device; + regd_hdr->virt_addr = tx_desc; /* == &tx_desc->iser_header */ + regd_hdr->data_size = ISER_TOTAL_HEADERS_LEN; + + send_dto->ib_conn = iser_conn->ib_conn; + send_dto->notify_enable = 1; + send_dto->regd_vector_len = 0; - if (!ib_conn->rx_descs) - return; + memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr)); + tx_desc->iser_header.flags = ISER_VER; - rx_desc = ib_conn->rx_descs; - for (i = 0; i < ISER_QP_MAX_RECV_DTOS; i++, rx_desc++) - ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr, - ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); - kfree(ib_conn->rx_descs); + iser_dto_add_regd_buff(send_dto, regd_hdr, 0, 0); } -static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req) +/** + * iser_conn_set_full_featured_mode - (iSER API) + */ +int iser_conn_set_full_featured_mode(struct iscsi_conn *conn) { struct iscsi_iser_conn *iser_conn = conn->dd_data; - iser_dbg("req op %x flags %x\n", req->opcode, req->flags); - /* check if this is the last login - going to full feature phase */ - if ((req->flags & ISCSI_FULL_FEATURE_PHASE) != ISCSI_FULL_FEATURE_PHASE) - return 0; - + int i; /* - * Check that there is one posted recv buffer (for the last login - * response) and no posted send buffers left - they must have been - * consumed during previous login phases. 
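iser_post_receive_control() above posts one buffer for the expected response plus one for every unexpected PDU consumed since the last post, claiming that count atomically and crediting back whatever could not be replaced so a later send retries it. A kernel-style sketch of that accounting, with illustrative names and assuming a module context.

#include <linux/atomic.h>
#include <linux/errno.h>

static int demo_replenish(atomic_t *unexpected, int (*post_one)(void))
{
	int todo = 1 + atomic_xchg(unexpected, 0);	/* response + consumed */
	int posted;

	for (posted = 0; posted < todo; posted++)
		if (post_one())
			break;

	if (posted < todo)				/* credit back the rest */
		atomic_add(todo - posted, unexpected);

	return posted ? 0 : -ENOMEM;
}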
+ * FIXME this value should be declared to the target during login with + * the MaxOutstandingUnexpectedPDUs key when supported */ - WARN_ON(iser_conn->ib_conn->post_recv_buf_count != 1); - WARN_ON(atomic_read(&iser_conn->ib_conn->post_send_buf_count) != 0); + int initial_post_recv_bufs_num = ISER_MAX_RX_MISC_PDUS; + + iser_dbg("Initially post: %d\n", initial_post_recv_bufs_num); + + /* Check that there is no posted recv or send buffers left - */ + /* they must be consumed during the login phase */ + BUG_ON(atomic_read(&iser_conn->ib_conn->post_recv_buf_count) != 0); + BUG_ON(atomic_read(&iser_conn->ib_conn->post_send_buf_count) != 0); - iser_dbg("Initially post: %d\n", ISER_MIN_POSTED_RX); /* Initial post receive buffers */ - if (iser_post_recvm(iser_conn->ib_conn, ISER_MIN_POSTED_RX)) - return -ENOMEM; + for (i = 0; i < initial_post_recv_bufs_num; i++) { + if (iser_post_receive_control(conn) != 0) { + iser_err("Failed to post recv bufs at:%d conn:0x%p\n", + i, conn); + return -ENOMEM; + } + } + iser_dbg("Posted %d post recv bufs, conn:0x%p\n", i, conn); + return 0; +} +static int +iser_check_xmit(struct iscsi_conn *conn, void *task) +{ + struct iscsi_iser_conn *iser_conn = conn->dd_data; + + if (atomic_read(&iser_conn->ib_conn->post_send_buf_count) == + ISER_QP_MAX_REQ_DTOS) { + iser_dbg("%ld can't xmit task %p\n",jiffies,task); + return -ENOBUFS; + } return 0; } + /** * iser_send_command - send command PDU */ @@ -269,18 +349,27 @@ int iser_send_command(struct iscsi_conn *conn, { struct iscsi_iser_conn *iser_conn = conn->dd_data; struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_dto *send_dto = NULL; unsigned long edtl; - int err; + int err = 0; struct iser_data_buf *data_buf; struct iscsi_cmd *hdr = (struct iscsi_cmd *)task->hdr; struct scsi_cmnd *sc = task->sc; - struct iser_tx_desc *tx_desc = &iser_task->desc; + + if (!iser_conn_state_comp(iser_conn->ib_conn, ISER_CONN_UP)) { + iser_err("Failed to send, conn: 0x%p is not up\n", iser_conn->ib_conn); + return -EPERM; + } + if (iser_check_xmit(conn, task)) + return -ENOBUFS; edtl = ntohl(hdr->data_length); /* build the tx desc regd header and add it to the tx desc dto */ - tx_desc->type = ISCSI_TX_SCSI_COMMAND; - iser_create_send_desc(iser_conn->ib_conn, tx_desc); + iser_task->desc.type = ISCSI_TX_SCSI_COMMAND; + send_dto = &iser_task->desc.dto; + send_dto->task = iser_task; + iser_create_send_desc(iser_conn, &iser_task->desc); if (hdr->flags & ISCSI_FLAG_CMD_READ) data_buf = &iser_task->data[ISER_DIR_IN]; @@ -309,13 +398,23 @@ int iser_send_command(struct iscsi_conn *conn, goto send_command_error; } + iser_reg_single(iser_conn->ib_conn->device, + send_dto->regd[0], DMA_TO_DEVICE); + + if (iser_post_receive_control(conn) != 0) { + iser_err("post_recv failed!\n"); + err = -ENOMEM; + goto send_command_error; + } + iser_task->status = ISER_TASK_STATUS_STARTED; - err = iser_post_send(iser_conn->ib_conn, tx_desc); + err = iser_post_send(&iser_task->desc); if (!err) return 0; send_command_error: + iser_dto_buffs_release(send_dto); iser_err("conn %p failed task->itt %d err %d\n",conn, task->itt, err); return err; } @@ -329,13 +428,20 @@ int iser_send_data_out(struct iscsi_conn *conn, { struct iscsi_iser_conn *iser_conn = conn->dd_data; struct iscsi_iser_task *iser_task = task->dd_data; - struct iser_tx_desc *tx_desc = NULL; - struct iser_regd_buf *regd_buf; + struct iser_desc *tx_desc = NULL; + struct iser_dto *send_dto = NULL; unsigned long buf_offset; unsigned long data_seg_len; uint32_t itt; int err = 0; - struct ib_sge 
*tx_dsg; + + if (!iser_conn_state_comp(iser_conn->ib_conn, ISER_CONN_UP)) { + iser_err("Failed to send, conn: 0x%p is not up\n", iser_conn->ib_conn); + return -EPERM; + } + + if (iser_check_xmit(conn, task)) + return -ENOBUFS; itt = (__force uint32_t)hdr->itt; data_seg_len = ntoh24(hdr->dlength); @@ -344,25 +450,28 @@ int iser_send_data_out(struct iscsi_conn *conn, iser_dbg("%s itt %d dseg_len %d offset %d\n", __func__,(int)itt,(int)data_seg_len,(int)buf_offset); - tx_desc = kmem_cache_zalloc(ig.desc_cache, GFP_ATOMIC); + tx_desc = kmem_cache_alloc(ig.desc_cache, GFP_NOIO); if (tx_desc == NULL) { iser_err("Failed to alloc desc for post dataout\n"); return -ENOMEM; } tx_desc->type = ISCSI_TX_DATAOUT; - tx_desc->iser_header.flags = ISER_VER; memcpy(&tx_desc->iscsi_header, hdr, sizeof(struct iscsi_hdr)); - /* build the tx desc */ - iser_initialize_task_headers(task, tx_desc); + /* build the tx desc regd header and add it to the tx desc dto */ + send_dto = &tx_desc->dto; + send_dto->task = iser_task; + iser_create_send_desc(iser_conn, tx_desc); - regd_buf = &iser_task->rdma_regd[ISER_DIR_OUT]; - tx_dsg = &tx_desc->tx_sg[1]; - tx_dsg->addr = regd_buf->reg.va + buf_offset; - tx_dsg->length = data_seg_len; - tx_dsg->lkey = regd_buf->reg.lkey; - tx_desc->num_sge = 2; + iser_reg_single(iser_conn->ib_conn->device, + send_dto->regd[0], DMA_TO_DEVICE); + + /* all data was registered for RDMA, we can use the lkey */ + iser_dto_add_regd_buff(send_dto, + &iser_task->rdma_regd[ISER_DIR_OUT], + buf_offset, + data_seg_len); if (buf_offset + data_seg_len > iser_task->data[ISER_DIR_OUT].data_len) { iser_err("Offset:%ld & DSL:%ld in Data-Out " @@ -376,11 +485,12 @@ int iser_send_data_out(struct iscsi_conn *conn, itt, buf_offset, data_seg_len); - err = iser_post_send(iser_conn->ib_conn, tx_desc); + err = iser_post_send(tx_desc); if (!err) return 0; send_data_out_error: + iser_dto_buffs_release(send_dto); kmem_cache_free(ig.desc_cache, tx_desc); iser_err("conn %p failed err %d\n",conn, err); return err; @@ -391,47 +501,64 @@ int iser_send_control(struct iscsi_conn *conn, { struct iscsi_iser_conn *iser_conn = conn->dd_data; struct iscsi_iser_task *iser_task = task->dd_data; - struct iser_tx_desc *mdesc = &iser_task->desc; + struct iser_desc *mdesc = &iser_task->desc; + struct iser_dto *send_dto = NULL; unsigned long data_seg_len; int err = 0; + struct iser_regd_buf *regd_buf; struct iser_device *device; + unsigned char opcode; + + if (!iser_conn_state_comp(iser_conn->ib_conn, ISER_CONN_UP)) { + iser_err("Failed to send, conn: 0x%p is not up\n", iser_conn->ib_conn); + return -EPERM; + } + + if (iser_check_xmit(conn, task)) + return -ENOBUFS; /* build the tx desc regd header and add it to the tx desc dto */ mdesc->type = ISCSI_TX_CONTROL; - iser_create_send_desc(iser_conn->ib_conn, mdesc); + send_dto = &mdesc->dto; + send_dto->task = NULL; + iser_create_send_desc(iser_conn, mdesc); device = iser_conn->ib_conn->device; + iser_reg_single(device, send_dto->regd[0], DMA_TO_DEVICE); + data_seg_len = ntoh24(task->hdr->dlength); if (data_seg_len > 0) { - struct ib_sge *tx_dsg = &mdesc->tx_sg[1]; - if (task != conn->login_task) { - iser_err("data present on non login task!!!\n"); - goto send_control_error; - } - memcpy(iser_conn->ib_conn->login_buf, task->data, - task->data_count); - tx_dsg->addr = iser_conn->ib_conn->login_dma; - tx_dsg->length = data_seg_len; - tx_dsg->lkey = device->mr->lkey; - mdesc->num_sge = 2; + regd_buf = &mdesc->data_regd_buf; + memset(regd_buf, 0, sizeof(struct iser_regd_buf)); + regd_buf->device 
= device; + regd_buf->virt_addr = task->data; + regd_buf->data_size = task->data_count; + iser_reg_single(device, regd_buf, + DMA_TO_DEVICE); + iser_dto_add_regd_buff(send_dto, regd_buf, + 0, + data_seg_len); } - if (task == conn->login_task) { - err = iser_post_recvl(iser_conn->ib_conn); - if (err) - goto send_control_error; - err = iser_post_rx_bufs(conn, task->hdr); - if (err) + opcode = task->hdr->opcode & ISCSI_OPCODE_MASK; + + /* post recv buffer for response if one is expected */ + if (!(opcode == ISCSI_OP_NOOP_OUT && task->hdr->itt == RESERVED_ITT)) { + if (iser_post_receive_control(conn) != 0) { + iser_err("post_rcv_buff failed!\n"); + err = -ENOMEM; goto send_control_error; + } } - err = iser_post_send(iser_conn->ib_conn, mdesc); + err = iser_post_send(mdesc); if (!err) return 0; send_control_error: + iser_dto_buffs_release(send_dto); iser_err("conn %p failed err %d\n",conn, err); return err; } @@ -439,71 +566,104 @@ send_control_error: /** * iser_rcv_dto_completion - recv DTO completion */ -void iser_rcv_completion(struct iser_rx_desc *rx_desc, - unsigned long rx_xfer_len, - struct iser_conn *ib_conn) +void iser_rcv_completion(struct iser_desc *rx_desc, + unsigned long dto_xfer_len) { - struct iscsi_iser_conn *conn = ib_conn->iser_conn; + struct iser_dto *dto = &rx_desc->dto; + struct iscsi_iser_conn *conn = dto->ib_conn->iser_conn; + struct iscsi_task *task; + struct iscsi_iser_task *iser_task; struct iscsi_hdr *hdr; - u64 rx_dma; - int rx_buflen, outstanding, count, err; - - /* differentiate between login to all other PDUs */ - if ((char *)rx_desc == ib_conn->login_buf) { - rx_dma = ib_conn->login_dma; - rx_buflen = ISER_RX_LOGIN_SIZE; - } else { - rx_dma = rx_desc->dma_addr; - rx_buflen = ISER_RX_PAYLOAD_SIZE; - } - - ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, rx_dma, - rx_buflen, DMA_FROM_DEVICE); + char *rx_data = NULL; + int rx_data_len = 0; + unsigned char opcode; hdr = &rx_desc->iscsi_header; - iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode, - hdr->itt, (int)(rx_xfer_len - ISER_HEADERS_LEN)); + iser_dbg("op 0x%x itt 0x%x\n", hdr->opcode,hdr->itt); + + if (dto_xfer_len > ISER_TOTAL_HEADERS_LEN) { /* we have data */ + rx_data_len = dto_xfer_len - ISER_TOTAL_HEADERS_LEN; + rx_data = dto->regd[1]->virt_addr; + rx_data += dto->offset[1]; + } + + opcode = hdr->opcode & ISCSI_OPCODE_MASK; + + if (opcode == ISCSI_OP_SCSI_CMD_RSP) { + spin_lock(&conn->iscsi_conn->session->lock); + task = iscsi_itt_to_ctask(conn->iscsi_conn, hdr->itt); + if (task) + __iscsi_get_task(task); + spin_unlock(&conn->iscsi_conn->session->lock); + + if (!task) + iser_err("itt can't be matched to task!!! " + "conn %p opcode %d itt %d\n", + conn->iscsi_conn, opcode, hdr->itt); + else { + iser_task = task->dd_data; + iser_dbg("itt %d task %p\n",hdr->itt, task); + iser_task->status = ISER_TASK_STATUS_COMPLETED; + iser_task_rdma_finalize(iser_task); + iscsi_put_task(task); + } + } + iser_dto_buffs_release(dto); - iscsi_iser_recv(conn->iscsi_conn, hdr, - rx_desc->data, rx_xfer_len - ISER_HEADERS_LEN); + iscsi_iser_recv(conn->iscsi_conn, hdr, rx_data, rx_data_len); - ib_dma_sync_single_for_device(ib_conn->device->ib_device, rx_dma, - rx_buflen, DMA_FROM_DEVICE); + kfree(rx_desc->data); + kmem_cache_free(ig.desc_cache, rx_desc); /* decrementing conn->post_recv_buf_count only --after-- freeing the * * task eliminates the need to worry on tasks which are completed in * * parallel to the execution of iser_conn_term. 
So the code that waits * * for the posted rx bufs refcount to become zero handles everything */ - conn->ib_conn->post_recv_buf_count--; - - if (rx_dma == ib_conn->login_dma) - return; + atomic_dec(&conn->ib_conn->post_recv_buf_count); - outstanding = ib_conn->post_recv_buf_count; - if (outstanding + ISER_MIN_POSTED_RX <= ISER_QP_MAX_RECV_DTOS) { - count = min(ISER_QP_MAX_RECV_DTOS - outstanding, - ISER_MIN_POSTED_RX); - err = iser_post_recvm(ib_conn, count); - if (err) - iser_err("posting %d rx bufs err %d\n", count, err); + /* + * if an unexpected PDU was received then the recv wr consumed must + * be replaced, this is done in the next send of a control-type PDU + */ + if (opcode == ISCSI_OP_NOOP_IN && hdr->itt == RESERVED_ITT) { + /* nop-in with itt = 0xffffffff */ + atomic_inc(&conn->ib_conn->unexpected_pdu_count); + } + else if (opcode == ISCSI_OP_ASYNC_EVENT) { + /* asynchronous message */ + atomic_inc(&conn->ib_conn->unexpected_pdu_count); } + /* a reject PDU consumes the recv buf posted for the response */ } -void iser_snd_completion(struct iser_tx_desc *tx_desc, - struct iser_conn *ib_conn) +void iser_snd_completion(struct iser_desc *tx_desc) { + struct iser_dto *dto = &tx_desc->dto; + struct iser_conn *ib_conn = dto->ib_conn; + struct iscsi_iser_conn *iser_conn = ib_conn->iser_conn; + struct iscsi_conn *conn = iser_conn->iscsi_conn; struct iscsi_task *task; - struct iser_device *device = ib_conn->device; + int resume_tx = 0; + + iser_dbg("Initiator, Data sent dto=0x%p\n", dto); - if (tx_desc->type == ISCSI_TX_DATAOUT) { - ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr, - ISER_HEADERS_LEN, DMA_TO_DEVICE); + iser_dto_buffs_release(dto); + + if (tx_desc->type == ISCSI_TX_DATAOUT) kmem_cache_free(ig.desc_cache, tx_desc); - } + + if (atomic_read(&iser_conn->ib_conn->post_send_buf_count) == + ISER_QP_MAX_REQ_DTOS) + resume_tx = 1; atomic_dec(&ib_conn->post_send_buf_count); + if (resume_tx) { + iser_dbg("%ld resuming tx\n",jiffies); + iscsi_conn_queue_work(conn); + } + if (tx_desc->type == ISCSI_TX_CONTROL) { /* this arithmetic is legal by libiscsi dd_data allocation */ task = (void *) ((long)(void *)tx_desc - @@ -532,6 +692,7 @@ void iser_task_rdma_init(struct iscsi_iser_task *iser_task) void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task) { + int deferred; int is_rdma_aligned = 1; struct iser_regd_buf *regd; @@ -549,17 +710,32 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task) if (iser_task->dir[ISER_DIR_IN]) { regd = &iser_task->rdma_regd[ISER_DIR_IN]; - if (regd->reg.is_fmr) - iser_unreg_mem(&regd->reg); + deferred = iser_regd_buff_release(regd); + if (deferred) { + iser_err("%d references remain for BUF-IN rdma reg\n", + atomic_read(&regd->ref_count)); + } } if (iser_task->dir[ISER_DIR_OUT]) { regd = &iser_task->rdma_regd[ISER_DIR_OUT]; - if (regd->reg.is_fmr) - iser_unreg_mem(&regd->reg); + deferred = iser_regd_buff_release(regd); + if (deferred) { + iser_err("%d references remain for BUF-OUT rdma reg\n", + atomic_read(&regd->ref_count)); + } } /* if the data was unaligned, it was already unmapped and then copied */ if (is_rdma_aligned) iser_dma_unmap_task_data(iser_task); } + +void iser_dto_buffs_release(struct iser_dto *dto) +{ + int i; + + for (i = 0; i < dto->regd_vector_len; i++) + iser_regd_buff_release(dto->regd[i]); +} + diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index fb88d6896b677..b9453d068e9d8 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ 
b/drivers/infiniband/ulp/iser/iser_memory.c @@ -40,6 +40,62 @@ #define ISER_KMALLOC_THRESHOLD 0x20000 /* 128K - kmalloc limit */ +/** + * Decrements the reference count for the + * registered buffer & releases it + * + * returns 0 if released, 1 if deferred + */ +int iser_regd_buff_release(struct iser_regd_buf *regd_buf) +{ + struct ib_device *dev; + + if ((atomic_read(&regd_buf->ref_count) == 0) || + atomic_dec_and_test(&regd_buf->ref_count)) { + /* if we used the dma mr, unreg is just NOP */ + if (regd_buf->reg.is_fmr) + iser_unreg_mem(&regd_buf->reg); + + if (regd_buf->dma_addr) { + dev = regd_buf->device->ib_device; + ib_dma_unmap_single(dev, + regd_buf->dma_addr, + regd_buf->data_size, + regd_buf->direction); + } + /* else this regd buf is associated with task which we */ + /* dma_unmap_single/sg later */ + return 0; + } else { + iser_dbg("Release deferred, regd.buff: 0x%p\n", regd_buf); + return 1; + } +} + +/** + * iser_reg_single - fills registered buffer descriptor with + * registration information + */ +void iser_reg_single(struct iser_device *device, + struct iser_regd_buf *regd_buf, + enum dma_data_direction direction) +{ + u64 dma_addr; + + dma_addr = ib_dma_map_single(device->ib_device, + regd_buf->virt_addr, + regd_buf->data_size, direction); + BUG_ON(ib_dma_mapping_error(device->ib_device, dma_addr)); + + regd_buf->reg.lkey = device->mr->lkey; + regd_buf->reg.len = regd_buf->data_size; + regd_buf->reg.va = dma_addr; + regd_buf->reg.is_fmr = 0; + + regd_buf->dma_addr = dma_addr; + regd_buf->direction = direction; +} + /** * iser_start_rdma_unaligned_sg */ @@ -53,10 +109,10 @@ static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, unsigned long cmd_data_len = data->data_len; if (cmd_data_len > ISER_KMALLOC_THRESHOLD) - mem = (void *)__get_free_pages(GFP_ATOMIC, + mem = (void *)__get_free_pages(GFP_NOIO, ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT); else - mem = kmalloc(cmd_data_len, GFP_ATOMIC); + mem = kmalloc(cmd_data_len, GFP_NOIO); if (mem == NULL) { iser_err("Failed to allocate mem size %d %d for copying sglist\n", @@ -153,8 +209,6 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, mem_copy->copy_buf = NULL; } -#define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0) - /** * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses * and returns the length of resulting physical address array (may be less than * @@ -167,52 +221,62 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, * where --few fragments of the same page-- are present in the SG as * consecutive elements. Also, it handles one entry SG. 
*/ - static int iser_sg_to_page_vec(struct iser_data_buf *data, struct iser_page_vec *page_vec, struct ib_device *ibdev) { - struct scatterlist *sg, *sgl = (struct scatterlist *)data->buf; - u64 start_addr, end_addr, page, chunk_start = 0; + struct scatterlist *sgl = (struct scatterlist *)data->buf; + struct scatterlist *sg; + u64 first_addr, last_addr, page; + int end_aligned; + unsigned int cur_page = 0; unsigned long total_sz = 0; - unsigned int dma_len; - int i, new_chunk, cur_page, last_ent = data->dma_nents - 1; + int i; /* compute the offset of first element */ page_vec->offset = (u64) sgl[0].offset & ~MASK_4K; - new_chunk = 1; - cur_page = 0; for_each_sg(sgl, sg, data->dma_nents, i) { - start_addr = ib_sg_dma_address(ibdev, sg); - if (new_chunk) - chunk_start = start_addr; - dma_len = ib_sg_dma_len(ibdev, sg); - end_addr = start_addr + dma_len; + unsigned int dma_len = ib_sg_dma_len(ibdev, sg); + total_sz += dma_len; - /* collect page fragments until aligned or end of SG list */ - if (!IS_4K_ALIGNED(end_addr) && i < last_ent) { - new_chunk = 0; - continue; + first_addr = ib_sg_dma_address(ibdev, sg); + last_addr = first_addr + dma_len; + + end_aligned = !(last_addr & ~MASK_4K); + + /* continue to collect page fragments till aligned or SG ends */ + while (!end_aligned && (i + 1 < data->dma_nents)) { + sg = sg_next(sg); + i++; + dma_len = ib_sg_dma_len(ibdev, sg); + total_sz += dma_len; + last_addr = ib_sg_dma_address(ibdev, sg) + dma_len; + end_aligned = !(last_addr & ~MASK_4K); } - new_chunk = 1; - - /* address of the first page in the contiguous chunk; - masking relevant for the very first SG entry, - which might be unaligned */ - page = chunk_start & MASK_4K; - do { - page_vec->pages[cur_page++] = page; + + /* handle the 1st page in the 1st DMA element */ + if (cur_page == 0) { + page = first_addr & MASK_4K; + page_vec->pages[cur_page] = page; + cur_page++; page += SIZE_4K; - } while (page < end_addr); - } + } else + page = first_addr; + + for (; page < last_addr; page += SIZE_4K) { + page_vec->pages[cur_page] = page; + cur_page++; + } + } page_vec->data_size = total_sz; iser_dbg("page_vec->data_size:%d cur_page %d\n", page_vec->data_size,cur_page); return cur_page; } +#define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0) /** * iser_data_buf_aligned_len - Tries to determine the maximal correctly aligned @@ -220,40 +284,42 @@ static int iser_sg_to_page_vec(struct iser_data_buf *data, * the number of entries which are aligned correctly. Supports the case where * consecutive SG elements are actually fragments of the same physcial page. 
*/ -static int iser_data_buf_aligned_len(struct iser_data_buf *data, - struct ib_device *ibdev) +static unsigned int iser_data_buf_aligned_len(struct iser_data_buf *data, + struct ib_device *ibdev) { - struct scatterlist *sgl, *sg, *next_sg = NULL; - u64 start_addr, end_addr; - int i, ret_len, start_check = 0; - - if (data->dma_nents == 1) - return 1; + struct scatterlist *sgl, *sg; + u64 end_addr, next_addr; + int i, cnt; + unsigned int ret_len = 0; sgl = (struct scatterlist *)data->buf; - start_addr = ib_sg_dma_address(ibdev, sgl); + cnt = 0; for_each_sg(sgl, sg, data->dma_nents, i) { - if (start_check && !IS_4K_ALIGNED(start_addr)) - break; - - next_sg = sg_next(sg); - if (!next_sg) - break; - - end_addr = start_addr + ib_sg_dma_len(ibdev, sg); - start_addr = ib_sg_dma_address(ibdev, next_sg); - - if (end_addr == start_addr) { - start_check = 0; - continue; - } else - start_check = 1; - - if (!IS_4K_ALIGNED(end_addr)) - break; + /* iser_dbg("Checking sg iobuf [%d]: phys=0x%08lX " + "offset: %ld sz: %ld\n", i, + (unsigned long)sg_phys(sg), + (unsigned long)sg->offset, + (unsigned long)sg->length); */ + end_addr = ib_sg_dma_address(ibdev, sg) + + ib_sg_dma_len(ibdev, sg); + /* iser_dbg("Checking sg iobuf end address " + "0x%08lX\n", end_addr); */ + if (i + 1 < data->dma_nents) { + next_addr = ib_sg_dma_address(ibdev, sg_next(sg)); + /* are i, i+1 fragments of the same page? */ + if (end_addr == next_addr) { + cnt++; + continue; + } else if (!IS_4K_ALIGNED(end_addr)) { + ret_len = cnt + 1; + break; + } + } + cnt++; } - ret_len = (next_sg) ? i : i+1; + if (i == data->dma_nents) + ret_len = cnt; /* loop ended */ iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n", ret_len, data->dma_nents, data); return ret_len; @@ -418,5 +484,9 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *iser_task, return err; } } + + /* take a reference on this regd buf such that it will not be released * + * (eg in send dto completion) before we get the scsi response */ + atomic_inc(®d_buf->ref_count); return 0; } diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index ede1475bee09c..62d2151c8d997 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -32,14 +32,14 @@ */ #include #include -#include #include #include "iscsi_iser.h" #define ISCSI_ISER_MAX_CONN 8 -#define ISER_MAX_RX_CQ_LEN (ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN) -#define ISER_MAX_TX_CQ_LEN (ISER_QP_MAX_REQ_DTOS * ISCSI_ISER_MAX_CONN) +#define ISER_MAX_CQ_LEN ((ISER_QP_MAX_RECV_DTOS + \ + ISER_QP_MAX_REQ_DTOS) * \ + ISCSI_ISER_MAX_CONN) static void iser_cq_tasklet_fn(unsigned long data); static void iser_cq_callback(struct ib_cq *cq, void *cq_context); @@ -54,13 +54,6 @@ static void iser_qp_event_callback(struct ib_event *cause, void *context) iser_err("got qp event %d\n",cause->event); } -static void iser_event_handler(struct ib_event_handler *handler, - struct ib_event *event) -{ - iser_err("async event %d on device %s port %d\n", event->event, - event->device->name, event->element.port_num); -} - /** * iser_create_device_ib_res - creates Protection Domain (PD), Completion * Queue (CQ), DMA Memory Region (DMA MR) with the device associated with @@ -74,23 +67,15 @@ static int iser_create_device_ib_res(struct iser_device *device) if (IS_ERR(device->pd)) goto pd_err; - device->rx_cq = ib_create_cq(device->ib_device, + device->cq = ib_create_cq(device->ib_device, iser_cq_callback, iser_cq_event_callback, (void *)device, - ISER_MAX_RX_CQ_LEN, 0); - 
if (IS_ERR(device->rx_cq)) - goto rx_cq_err; - - device->tx_cq = ib_create_cq(device->ib_device, - NULL, iser_cq_event_callback, - (void *)device, - ISER_MAX_TX_CQ_LEN, 0); + ISER_MAX_CQ_LEN, 0); + if (IS_ERR(device->cq)) + goto cq_err; - if (IS_ERR(device->tx_cq)) - goto tx_cq_err; - - if (ib_req_notify_cq(device->rx_cq, IB_CQ_NEXT_COMP)) + if (ib_req_notify_cq(device->cq, IB_CQ_NEXT_COMP)) goto cq_arm_err; tasklet_init(&device->cq_tasklet, @@ -103,22 +88,13 @@ static int iser_create_device_ib_res(struct iser_device *device) if (IS_ERR(device->mr)) goto dma_mr_err; - INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device, - iser_event_handler); - if (ib_register_event_handler(&device->event_handler)) - goto handler_err; - return 0; -handler_err: - ib_dereg_mr(device->mr); dma_mr_err: tasklet_kill(&device->cq_tasklet); cq_arm_err: - ib_destroy_cq(device->tx_cq); -tx_cq_err: - ib_destroy_cq(device->rx_cq); -rx_cq_err: + ib_destroy_cq(device->cq); +cq_err: ib_dealloc_pd(device->pd); pd_err: iser_err("failed to allocate an IB resource\n"); @@ -134,15 +110,13 @@ static void iser_free_device_ib_res(struct iser_device *device) BUG_ON(device->mr == NULL); tasklet_kill(&device->cq_tasklet); - (void)ib_unregister_event_handler(&device->event_handler); + (void)ib_dereg_mr(device->mr); - (void)ib_destroy_cq(device->tx_cq); - (void)ib_destroy_cq(device->rx_cq); + (void)ib_destroy_cq(device->cq); (void)ib_dealloc_pd(device->pd); device->mr = NULL; - device->tx_cq = NULL; - device->rx_cq = NULL; + device->cq = NULL; device->pd = NULL; } @@ -155,27 +129,20 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn) { struct iser_device *device; struct ib_qp_init_attr init_attr; - int ret = -ENOMEM; + int ret; struct ib_fmr_pool_param params; BUG_ON(ib_conn->device == NULL); device = ib_conn->device; - ib_conn->login_buf = kmalloc(ISER_RX_LOGIN_SIZE, GFP_KERNEL); - if (!ib_conn->login_buf) - goto out_err; - - ib_conn->login_dma = ib_dma_map_single(ib_conn->device->ib_device, - (void *)ib_conn->login_buf, ISER_RX_LOGIN_SIZE, - DMA_FROM_DEVICE); - ib_conn->page_vec = kmalloc(sizeof(struct iser_page_vec) + (sizeof(u64) * (ISCSI_ISER_SG_TABLESIZE +1)), GFP_KERNEL); - if (!ib_conn->page_vec) - goto out_err; - + if (!ib_conn->page_vec) { + ret = -ENOMEM; + goto alloc_err; + } ib_conn->page_vec->pages = (u64 *) (ib_conn->page_vec + 1); params.page_shift = SHIFT_4K; @@ -187,6 +154,7 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn) params.pool_size = ISCSI_DEF_XMIT_CMDS_MAX * 2; params.dirty_watermark = ISCSI_DEF_XMIT_CMDS_MAX; params.cache = 0; + params.relaxed = 0; params.flush_function = NULL; params.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | @@ -195,26 +163,25 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn) ib_conn->fmr_pool = ib_create_fmr_pool(device->pd, ¶ms); if (IS_ERR(ib_conn->fmr_pool)) { ret = PTR_ERR(ib_conn->fmr_pool); - ib_conn->fmr_pool = NULL; - goto out_err; + goto fmr_pool_err; } memset(&init_attr, 0, sizeof init_attr); init_attr.event_handler = iser_qp_event_callback; init_attr.qp_context = (void *)ib_conn; - init_attr.send_cq = device->tx_cq; - init_attr.recv_cq = device->rx_cq; + init_attr.send_cq = device->cq; + init_attr.recv_cq = device->cq; init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS; init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS; - init_attr.cap.max_send_sge = 2; - init_attr.cap.max_recv_sge = 1; + init_attr.cap.max_send_sge = MAX_REGD_BUF_VECTOR_LEN; + init_attr.cap.max_recv_sge = 2; init_attr.sq_sig_type = 
IB_SIGNAL_REQ_WR; init_attr.qp_type = IB_QPT_RC; ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr); if (ret) - goto out_err; + goto qp_err; ib_conn->qp = ib_conn->cma_id->qp; iser_err("setting conn %p cma_id %p: fmr_pool %p qp %p\n", @@ -222,7 +189,11 @@ static int iser_create_ib_conn_res(struct iser_conn *ib_conn) ib_conn->fmr_pool, ib_conn->cma_id->qp); return ret; -out_err: +qp_err: + (void)ib_destroy_fmr_pool(ib_conn->fmr_pool); +fmr_pool_err: + kfree(ib_conn->page_vec); +alloc_err: iser_err("unable to alloc mem or create resource, err %d\n", ret); return ret; } @@ -231,7 +202,7 @@ out_err: * releases the FMR pool, QP and CMA ID objects, returns 0 on success, * -1 on failure */ -static int iser_free_ib_conn_res(struct iser_conn *ib_conn, int can_destroy_id) +static int iser_free_ib_conn_res(struct iser_conn *ib_conn) { BUG_ON(ib_conn == NULL); @@ -246,8 +217,7 @@ static int iser_free_ib_conn_res(struct iser_conn *ib_conn, int can_destroy_id) if (ib_conn->qp != NULL) rdma_destroy_qp(ib_conn->cma_id); - /* if cma handler context, the caller acts s.t the cma destroy the id */ - if (ib_conn->cma_id != NULL && can_destroy_id) + if (ib_conn->cma_id != NULL) rdma_destroy_id(ib_conn->cma_id); ib_conn->fmr_pool = NULL; @@ -309,6 +279,17 @@ static void iser_device_try_release(struct iser_device *device) mutex_unlock(&ig.device_list_mutex); } +int iser_conn_state_comp(struct iser_conn *ib_conn, + enum iser_ib_conn_state comp) +{ + int ret; + + spin_lock_bh(&ib_conn->lock); + ret = (ib_conn->state == comp); + spin_unlock_bh(&ib_conn->lock); + return ret; +} + static int iser_conn_state_comp_exch(struct iser_conn *ib_conn, enum iser_ib_conn_state comp, enum iser_ib_conn_state exch) @@ -325,7 +306,7 @@ static int iser_conn_state_comp_exch(struct iser_conn *ib_conn, /** * Frees all conn objects and deallocs conn descriptor */ -static void iser_conn_release(struct iser_conn *ib_conn, int can_destroy_id) +static void iser_conn_release(struct iser_conn *ib_conn) { struct iser_device *device = ib_conn->device; @@ -334,12 +315,14 @@ static void iser_conn_release(struct iser_conn *ib_conn, int can_destroy_id) mutex_lock(&ig.connlist_mutex); list_del(&ib_conn->conn_list); mutex_unlock(&ig.connlist_mutex); - iser_free_rx_descriptors(ib_conn); - iser_free_ib_conn_res(ib_conn, can_destroy_id); + + iser_free_ib_conn_res(ib_conn); ib_conn->device = NULL; /* on EVENT_ADDR_ERROR there's no device yet for this conn */ if (device != NULL) iser_device_try_release(device); + if (ib_conn->iser_conn) + ib_conn->iser_conn->ib_conn = NULL; iscsi_destroy_endpoint(ib_conn->ep); } @@ -348,13 +331,10 @@ void iser_conn_get(struct iser_conn *ib_conn) atomic_inc(&ib_conn->refcount); } -int iser_conn_put(struct iser_conn *ib_conn, int can_destroy_id) +void iser_conn_put(struct iser_conn *ib_conn) { - if (atomic_dec_and_test(&ib_conn->refcount)) { - iser_conn_release(ib_conn, can_destroy_id); - return 1; - } - return 0; + if (atomic_dec_and_test(&ib_conn->refcount)) + iser_conn_release(ib_conn); } /** @@ -378,20 +358,19 @@ void iser_conn_terminate(struct iser_conn *ib_conn) wait_event_interruptible(ib_conn->wait, ib_conn->state == ISER_CONN_DOWN); - iser_conn_put(ib_conn, 1); /* deref ib conn deallocate */ + iser_conn_put(ib_conn); } -static int iser_connect_error(struct rdma_cm_id *cma_id) +static void iser_connect_error(struct rdma_cm_id *cma_id) { struct iser_conn *ib_conn; ib_conn = (struct iser_conn *)cma_id->context; ib_conn->state = ISER_CONN_DOWN; wake_up_interruptible(&ib_conn->wait); - return 
iser_conn_put(ib_conn, 0); /* deref ib conn's cma id */ } -static int iser_addr_handler(struct rdma_cm_id *cma_id) +static void iser_addr_handler(struct rdma_cm_id *cma_id) { struct iser_device *device; struct iser_conn *ib_conn; @@ -400,7 +379,8 @@ static int iser_addr_handler(struct rdma_cm_id *cma_id) device = iser_device_find_by_ib_device(cma_id); if (!device) { iser_err("device lookup/creation failed\n"); - return iser_connect_error(cma_id); + iser_connect_error(cma_id); + return; } ib_conn = (struct iser_conn *)cma_id->context; @@ -409,13 +389,11 @@ static int iser_addr_handler(struct rdma_cm_id *cma_id) ret = rdma_resolve_route(cma_id, 1000); if (ret) { iser_err("resolve route failed: %d\n", ret); - return iser_connect_error(cma_id); + iser_connect_error(cma_id); } - - return 0; } -static int iser_route_handler(struct rdma_cm_id *cma_id) +static void iser_route_handler(struct rdma_cm_id *cma_id) { struct rdma_conn_param conn_param; int ret; @@ -436,9 +414,9 @@ static int iser_route_handler(struct rdma_cm_id *cma_id) goto failure; } - return 0; + return; failure: - return iser_connect_error(cma_id); + iser_connect_error(cma_id); } static void iser_connected_handler(struct rdma_cm_id *cma_id) @@ -450,12 +428,12 @@ static void iser_connected_handler(struct rdma_cm_id *cma_id) wake_up_interruptible(&ib_conn->wait); } -static int iser_disconnected_handler(struct rdma_cm_id *cma_id) +static void iser_disconnected_handler(struct rdma_cm_id *cma_id) { struct iser_conn *ib_conn; - int ret; ib_conn = (struct iser_conn *)cma_id->context; + ib_conn->disc_evt_flag = 1; /* getting here when the state is UP means that the conn is being * * terminated asynchronously from the iSCSI layer's perspective. */ @@ -465,29 +443,25 @@ static int iser_disconnected_handler(struct rdma_cm_id *cma_id) ISCSI_ERR_CONN_FAILED); /* Complete the termination process if no posts are pending */ - if (ib_conn->post_recv_buf_count == 0 && + if ((atomic_read(&ib_conn->post_recv_buf_count) == 0) && (atomic_read(&ib_conn->post_send_buf_count) == 0)) { ib_conn->state = ISER_CONN_DOWN; wake_up_interruptible(&ib_conn->wait); } - - ret = iser_conn_put(ib_conn, 0); /* deref ib conn's cma id */ - return ret; } static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { int ret = 0; - iser_err("event %d status %d conn %p id %p\n", - event->event, event->status, cma_id->context, cma_id); + iser_err("event %d conn %p id %p\n",event->event,cma_id->context,cma_id); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: - ret = iser_addr_handler(cma_id); + iser_addr_handler(cma_id); break; case RDMA_CM_EVENT_ROUTE_RESOLVED: - ret = iser_route_handler(cma_id); + iser_route_handler(cma_id); break; case RDMA_CM_EVENT_ESTABLISHED: iser_connected_handler(cma_id); @@ -497,12 +471,13 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_REJECTED: - ret = iser_connect_error(cma_id); + iser_err("event: %d, error: %d\n", event->event, event->status); + iser_connect_error(cma_id); break; case RDMA_CM_EVENT_DISCONNECTED: case RDMA_CM_EVENT_DEVICE_REMOVAL: case RDMA_CM_EVENT_ADDR_CHANGE: - ret = iser_disconnected_handler(cma_id); + iser_disconnected_handler(cma_id); break; default: iser_err("Unexpected RDMA CM event (%d)\n", event->event); @@ -515,16 +490,17 @@ void iser_conn_init(struct iser_conn *ib_conn) { ib_conn->state = ISER_CONN_INIT; init_waitqueue_head(&ib_conn->wait); - 
ib_conn->post_recv_buf_count = 0; + atomic_set(&ib_conn->post_recv_buf_count, 0); atomic_set(&ib_conn->post_send_buf_count, 0); - atomic_set(&ib_conn->refcount, 1); /* ref ib conn allocation */ + atomic_set(&ib_conn->unexpected_pdu_count, 0); + atomic_set(&ib_conn->refcount, 1); INIT_LIST_HEAD(&ib_conn->conn_list); spin_lock_init(&ib_conn->lock); } /** * starts the process of connecting to the target - * sleeps until the connection is established or rejected + * sleeps untill the connection is established or rejected */ int iser_connect(struct iser_conn *ib_conn, struct sockaddr_in *src_addr, @@ -545,10 +521,9 @@ int iser_connect(struct iser_conn *ib_conn, ib_conn->state = ISER_CONN_PENDING; - iser_conn_get(ib_conn); /* ref ib conn's cma id */ ib_conn->cma_id = rdma_create_id(iser_cma_handler, (void *)ib_conn, - RDMA_PS_TCP, IB_QPT_RC); + RDMA_PS_TCP); if (IS_ERR(ib_conn->cma_id)) { err = PTR_ERR(ib_conn->cma_id); iser_err("rdma_create_id failed: %d\n", err); @@ -583,7 +558,7 @@ id_failure: addr_failure: ib_conn->state = ISER_CONN_DOWN; connect_failure: - iser_conn_release(ib_conn, 1); + iser_conn_release(ib_conn); return err; } @@ -607,7 +582,7 @@ int iser_reg_page_vec(struct iser_conn *ib_conn, mem = ib_fmr_pool_map_phys(ib_conn->fmr_pool, page_list, page_vec->length, - io_addr); + io_addr, NULL); if (IS_ERR(mem)) { status = (int)PTR_ERR(mem); @@ -652,97 +627,136 @@ void iser_unreg_mem(struct iser_mem_reg *reg) reg->mem_h = NULL; } -int iser_post_recvl(struct iser_conn *ib_conn) +/** + * iser_dto_to_iov - builds IOV from a dto descriptor + */ +static void iser_dto_to_iov(struct iser_dto *dto, struct ib_sge *iov, int iov_len) { - struct ib_recv_wr rx_wr, *rx_wr_failed; - struct ib_sge sge; - int ib_ret; - - sge.addr = ib_conn->login_dma; - sge.length = ISER_RX_LOGIN_SIZE; - sge.lkey = ib_conn->device->mr->lkey; + int i; + struct ib_sge *sge; + struct iser_regd_buf *regd_buf; + + if (dto->regd_vector_len > iov_len) { + iser_err("iov size %d too small for posting dto of len %d\n", + iov_len, dto->regd_vector_len); + BUG(); + } - rx_wr.wr_id = (unsigned long)ib_conn->login_buf; - rx_wr.sg_list = &sge; - rx_wr.num_sge = 1; - rx_wr.next = NULL; + for (i = 0; i < dto->regd_vector_len; i++) { + sge = &iov[i]; + regd_buf = dto->regd[i]; + + sge->addr = regd_buf->reg.va; + sge->length = regd_buf->reg.len; + sge->lkey = regd_buf->reg.lkey; + + if (dto->used_sz[i] > 0) /* Adjust size */ + sge->length = dto->used_sz[i]; + + /* offset and length should not exceed the regd buf length */ + if (sge->length + dto->offset[i] > regd_buf->reg.len) { + iser_err("Used len:%ld + offset:%d, exceed reg.buf.len:" + "%ld in dto:0x%p [%d], va:0x%08lX\n", + (unsigned long)sge->length, dto->offset[i], + (unsigned long)regd_buf->reg.len, dto, i, + (unsigned long)sge->addr); + BUG(); + } - ib_conn->post_recv_buf_count++; - ib_ret = ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed); - if (ib_ret) { - iser_err("ib_post_recv failed ret=%d\n", ib_ret); - ib_conn->post_recv_buf_count--; + sge->addr += dto->offset[i]; /* Adjust offset */ } - return ib_ret; } -int iser_post_recvm(struct iser_conn *ib_conn, int count) +/** + * iser_post_recv - Posts a receive buffer. 
+ * + * returns 0 on success, -1 on failure + */ +int iser_post_recv(struct iser_desc *rx_desc) { - struct ib_recv_wr *rx_wr, *rx_wr_failed; - int i, ib_ret; - unsigned int my_rx_head = ib_conn->rx_desc_head; - struct iser_rx_desc *rx_desc; - - for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) { - rx_desc = &ib_conn->rx_descs[my_rx_head]; - rx_wr->wr_id = (unsigned long)rx_desc; - rx_wr->sg_list = &rx_desc->rx_sg; - rx_wr->num_sge = 1; - rx_wr->next = rx_wr + 1; - my_rx_head = (my_rx_head + 1) & (ISER_QP_MAX_RECV_DTOS - 1); - } + int ib_ret, ret_val = 0; + struct ib_recv_wr recv_wr, *recv_wr_failed; + struct ib_sge iov[2]; + struct iser_conn *ib_conn; + struct iser_dto *recv_dto = &rx_desc->dto; + + /* Retrieve conn */ + ib_conn = recv_dto->ib_conn; - rx_wr--; - rx_wr->next = NULL; /* mark end of work requests list */ + iser_dto_to_iov(recv_dto, iov, 2); - ib_conn->post_recv_buf_count += count; - ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed); + recv_wr.next = NULL; + recv_wr.sg_list = iov; + recv_wr.num_sge = recv_dto->regd_vector_len; + recv_wr.wr_id = (unsigned long)rx_desc; + + atomic_inc(&ib_conn->post_recv_buf_count); + ib_ret = ib_post_recv(ib_conn->qp, &recv_wr, &recv_wr_failed); if (ib_ret) { iser_err("ib_post_recv failed ret=%d\n", ib_ret); - ib_conn->post_recv_buf_count -= count; - } else - ib_conn->rx_desc_head = my_rx_head; - return ib_ret; -} + atomic_dec(&ib_conn->post_recv_buf_count); + ret_val = -1; + } + return ret_val; +} /** * iser_start_send - Initiate a Send DTO operation * * returns 0 on success, -1 on failure */ -int iser_post_send(struct iser_conn *ib_conn, struct iser_tx_desc *tx_desc) +int iser_post_send(struct iser_desc *tx_desc) { - int ib_ret; + int ib_ret, ret_val = 0; struct ib_send_wr send_wr, *send_wr_failed; + struct ib_sge iov[MAX_REGD_BUF_VECTOR_LEN]; + struct iser_conn *ib_conn; + struct iser_dto *dto = &tx_desc->dto; - ib_dma_sync_single_for_device(ib_conn->device->ib_device, - tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); + ib_conn = dto->ib_conn; + + iser_dto_to_iov(dto, iov, MAX_REGD_BUF_VECTOR_LEN); send_wr.next = NULL; send_wr.wr_id = (unsigned long)tx_desc; - send_wr.sg_list = tx_desc->tx_sg; - send_wr.num_sge = tx_desc->num_sge; + send_wr.sg_list = iov; + send_wr.num_sge = dto->regd_vector_len; send_wr.opcode = IB_WR_SEND; - send_wr.send_flags = IB_SEND_SIGNALED; + send_wr.send_flags = dto->notify_enable ? 
IB_SEND_SIGNALED : 0; atomic_inc(&ib_conn->post_send_buf_count); ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed); if (ib_ret) { + iser_err("Failed to start SEND DTO, dto: 0x%p, IOV len: %d\n", + dto, dto->regd_vector_len); iser_err("ib_post_send failed, ret:%d\n", ib_ret); atomic_dec(&ib_conn->post_send_buf_count); + ret_val = -1; } - return ib_ret; + + return ret_val; } -static void iser_handle_comp_error(struct iser_tx_desc *desc, - struct iser_conn *ib_conn) +static void iser_handle_comp_error(struct iser_desc *desc) { - if (desc && desc->type == ISCSI_TX_DATAOUT) + struct iser_dto *dto = &desc->dto; + struct iser_conn *ib_conn = dto->ib_conn; + + iser_dto_buffs_release(dto); + + if (desc->type == ISCSI_RX) { + kfree(desc->data); kmem_cache_free(ig.desc_cache, desc); + atomic_dec(&ib_conn->post_recv_buf_count); + } else { /* type is TX control/command/dataout */ + if (desc->type == ISCSI_TX_DATAOUT) + kmem_cache_free(ig.desc_cache, desc); + atomic_dec(&ib_conn->post_send_buf_count); + } - if (ib_conn->post_recv_buf_count == 0 && + if (atomic_read(&ib_conn->post_recv_buf_count) == 0 && atomic_read(&ib_conn->post_send_buf_count) == 0) { /* getting here when the state is UP means that the conn is * * being terminated asynchronously from the iSCSI layer's * @@ -752,81 +766,41 @@ static void iser_handle_comp_error(struct iser_tx_desc *desc, iscsi_conn_failure(ib_conn->iser_conn->iscsi_conn, ISCSI_ERR_CONN_FAILED); - /* no more non completed posts to the QP, complete the - * termination process w.o worrying on disconnect event */ - ib_conn->state = ISER_CONN_DOWN; - wake_up_interruptible(&ib_conn->wait); - } -} - -static int iser_drain_tx_cq(struct iser_device *device) -{ - struct ib_cq *cq = device->tx_cq; - struct ib_wc wc; - struct iser_tx_desc *tx_desc; - struct iser_conn *ib_conn; - int completed_tx = 0; - - while (ib_poll_cq(cq, 1, &wc) == 1) { - tx_desc = (struct iser_tx_desc *) (unsigned long) wc.wr_id; - ib_conn = wc.qp->qp_context; - if (wc.status == IB_WC_SUCCESS) { - if (wc.opcode == IB_WC_SEND) - iser_snd_completion(tx_desc, ib_conn); - else - iser_err("expected opcode %d got %d\n", - IB_WC_SEND, wc.opcode); - } else { - iser_err("tx id %llx status %d vend_err %x\n", - wc.wr_id, wc.status, wc.vendor_err); - atomic_dec(&ib_conn->post_send_buf_count); - iser_handle_comp_error(tx_desc, ib_conn); + /* complete the termination process if disconnect event was delivered * + * note there are no more non completed posts to the QP */ + if (ib_conn->disc_evt_flag) { + ib_conn->state = ISER_CONN_DOWN; + wake_up_interruptible(&ib_conn->wait); } - completed_tx++; } - return completed_tx; } - static void iser_cq_tasklet_fn(unsigned long data) { struct iser_device *device = (struct iser_device *)data; - struct ib_cq *cq = device->rx_cq; + struct ib_cq *cq = device->cq; struct ib_wc wc; - struct iser_rx_desc *desc; + struct iser_desc *desc; unsigned long xfer_len; - struct iser_conn *ib_conn; - int completed_tx, completed_rx; - completed_tx = completed_rx = 0; while (ib_poll_cq(cq, 1, &wc) == 1) { - desc = (struct iser_rx_desc *) (unsigned long) wc.wr_id; + desc = (struct iser_desc *) (unsigned long) wc.wr_id; BUG_ON(desc == NULL); - ib_conn = wc.qp->qp_context; + if (wc.status == IB_WC_SUCCESS) { - if (wc.opcode == IB_WC_RECV) { + if (desc->type == ISCSI_RX) { xfer_len = (unsigned long)wc.byte_len; - iser_rcv_completion(desc, xfer_len, ib_conn); - } else - iser_err("expected opcode %d got %d\n", - IB_WC_RECV, wc.opcode); + iser_rcv_completion(desc, xfer_len); + } else /* type == 
ISCSI_TX_CONTROL/SCSI_CMD/DOUT */ + iser_snd_completion(desc); } else { - if (wc.status != IB_WC_WR_FLUSH_ERR) - iser_err("rx id %llx status %d vend_err %x\n", - wc.wr_id, wc.status, wc.vendor_err); - ib_conn->post_recv_buf_count--; - iser_handle_comp_error(NULL, ib_conn); + iser_err("comp w. error op %d status %d\n",desc->type,wc.status); + iser_handle_comp_error(desc); } - completed_rx++; - if (!(completed_rx & 63)) - completed_tx += iser_drain_tx_cq(device); } /* #warning "it is assumed here that arming CQ only once it's empty" * * " would not cause interrupts to be missed" */ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - - completed_tx += iser_drain_tx_cq(device); - iser_dbg("got %d rx %d tx completions\n", completed_rx, completed_tx); } static void iser_cq_callback(struct ib_cq *cq, void *cq_context) diff --git a/drivers/infiniband/ulp/sdp/Kconfig b/drivers/infiniband/ulp/sdp/Kconfig new file mode 100644 index 0000000000000..b5fadf4452c33 --- /dev/null +++ b/drivers/infiniband/ulp/sdp/Kconfig @@ -0,0 +1,28 @@ +config INFINIBAND_SDP + tristate "Sockets Direct Protocol" + depends on INFINIBAND && INFINIBAND_IPOIB + ---help--- + Support for Sockets Direct Protocol (SDP). This provides + sockets semantics over InfiniBand via address family + AF_INET_SDP (address family 27). You can also LD_PRELOAD the + libsdp library from to have standard + sockets applications use SDP. + +config INFINIBAND_SDP_DEBUG + bool "Sockets Direct Protocol debugging" + depends on INFINIBAND_SDP + ---help--- + This option causes debugging code to be compiled into the + SDP driver. The output can be turned on via the debug_level + module parameter (which can also be set through sysfs after the + driver is loaded). + +config INFINIBAND_SDP_DEBUG_DATA + bool "Sockets Direct Protocol data path debugging" + depends on INFINIBAND_SDP_DEBUG + ---help--- + This option compiles debugging code into the data path + of the SDP driver. The output can be turned on via the + data_debug_level module parameter; however, even with output + turned off, this debugging code will have some performance + impact. 
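The Kconfig help above is the only user-visible statement of how applications reach SDP: either by opening address family 27 (AF_INET_SDP) directly, or by running an unmodified TCP application under LD_PRELOAD with libsdp so its socket() calls are redirected. A minimal userspace sketch of the direct route follows; it is illustrative only and not part of the patch. AF_INET_SDP is defined locally because libc headers do not export it, and sdp_stream_connect(), the destination address and the port are placeholder names chosen here.

#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef AF_INET_SDP
#define AF_INET_SDP 27	/* address family quoted in the Kconfig help text */
#endif

/* Hypothetical helper: open a stream connection over SDP instead of TCP. */
static int sdp_stream_connect(in_addr_t daddr, unsigned short port)
{
	struct sockaddr_in sin;
	/* Only the first socket() argument differs from a plain TCP client. */
	int fd = socket(AF_INET_SDP, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;	/* addressing stays IPv4; some SDP stacks
					 * may expect AF_INET_SDP here instead */
	sin.sin_addr.s_addr = daddr;
	sin.sin_port = htons(port);

	if (connect(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
		close(fd);
		return -1;
	}
	return fd;	/* subsequent read()/write() traffic rides the SDP ULP */
}

The LD_PRELOAD route mentioned in the help text reaches the same code path by interposing on socket() and substituting the address family, so unmodified binaries need no source changes.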
diff --git a/drivers/infiniband/ulp/sdp/Makefile b/drivers/infiniband/ulp/sdp/Makefile new file mode 100644 index 0000000000000..67e4300a07f7e --- /dev/null +++ b/drivers/infiniband/ulp/sdp/Makefile @@ -0,0 +1,6 @@ +EXTRA_CFLAGS += -Idrivers/infiniband/include +EXTRA_CFLAGS += -ggdb + +obj-$(CONFIG_INFINIBAND_SDP) += ib_sdp.o + +ib_sdp-y := sdp_main.o sdp_cma.o sdp_bcopy.o sdp_proc.o sdp_tx.o sdp_rx.o sdp_zcopy.o diff --git a/drivers/infiniband/ulp/sdp/sdp.h b/drivers/infiniband/ulp/sdp/sdp.h new file mode 100644 index 0000000000000..0017384d71af9 --- /dev/null +++ b/drivers/infiniband/ulp/sdp/sdp.h @@ -0,0 +1,997 @@ +#ifndef _SDP_H_ +#define _SDP_H_ + +#include +#include +#include +#include +#include /* For urgent data flags */ +#include +#include +#include +#include +#include "sdp_dbg.h" + +#ifndef NIPQUAD +#define NIPQUAD(addr) \ + ((unsigned char *)&(addr))[0], \ + ((unsigned char *)&(addr))[1], \ + ((unsigned char *)&(addr))[2], \ + ((unsigned char *)&(addr))[3] +#endif + +#ifndef NIPQUAD_FMT +#define NIPQUAD_FMT "%u.%u.%u.%u" +#endif + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#ifndef NIP6 +#define NIP6(addr) \ + ntohs((addr).s6_addr16[0]), \ + ntohs((addr).s6_addr16[1]), \ + ntohs((addr).s6_addr16[2]), \ + ntohs((addr).s6_addr16[3]), \ + ntohs((addr).s6_addr16[4]), \ + ntohs((addr).s6_addr16[5]), \ + ntohs((addr).s6_addr16[6]), \ + ntohs((addr).s6_addr16[7]) +#endif + +#ifndef NIP6_FMT +#define NIP6_FMT "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x" +#endif +#endif + +#define inet_num(sk) inet_sk(sk)->inet_num +#define inet_sport(sk) inet_sk(sk)->inet_sport +#define inet_dport(sk) inet_sk(sk)->inet_dport +#define inet_saddr(sk) inet_sk(sk)->inet_saddr +#define sdp_inet_daddr(sk) inet_sk(sk)->inet_daddr +#define sdp_inet_rcv_saddr(sk) inet_sk(sk)->inet_rcv_saddr + +#define sdp_sk_sleep(sk) sk_sleep(sk) +#define sk_ssk(ssk) ((struct sock *)ssk) + +/* Interval between sucessive polls in the Tx routine when polling is used + instead of interrupts (in per-core Tx rings) - should be power of 2 */ +#define SDP_TX_POLL_MODER 16 +#define SDP_TX_POLL_TIMEOUT (HZ / 20) +#define SDP_NAGLE_TIMEOUT (HZ / 10) + +#define SDP_RX_ARMING_DELAY (msecs_to_jiffies(10)) +#define SDP_RDMA_READ_TIMEOUT (60 * HZ) /* timeout - fatal hw error */ + +#define SDP_SRCAVAIL_CANCEL_TIMEOUT (HZ * 60) +#define SDP_SRCAVAIL_ADV_TIMEOUT (1 * HZ) + +#define SDP_RESOLVE_TIMEOUT 1000 +#define SDP_ROUTE_TIMEOUT 1000 +#define SDP_KEEPALIVE_TIME (120 * 60 * HZ) +#define SDP_FIN_WAIT_TIMEOUT (60 * HZ) /* like TCP_FIN_TIMEOUT */ +#define SDP_CMA_TIMEWAIT_TIMEOUT (150 * HZ) + +extern int sdp_rx_size; +#define SDP_TX_SIZE 0x40 +#define SDP_DEF_INLINE_THRESH 256 + +#define SDP_FMR_SIZE (MIN(0x1000, PAGE_SIZE) / sizeof(u64)) + +#define SDP_MAX_RDMA_READ_LEN (PAGE_SIZE * (SDP_FMR_SIZE - 2)) + +#define SDP_MAX_RECV_SGES 9 /* 1 for sdp header + 8 for payload */ +#define SDP_MAX_SEND_SGES 9 /* same as above */ + +/* skb inlined data len - rest will be rx'ed into frags */ +#define SDP_SKB_HEAD_SIZE (0x500 + sizeof(struct sdp_bsdh)) + +/* limit tx payload len, if the sink supports bigger buffers than the source + * can handle. 
+ * or rx fragment size (limited by sge->length size) */ +#define SDP_MAX_PAYLOAD ((1UL << 16) - SDP_SKB_HEAD_SIZE) + +#define SDP_NUM_WC 4 + +#define SDP_DEF_ZCOPY_THRESH 64*1024 +#define SDP_MIN_ZCOPY_THRESH PAGE_SIZE +#define SDP_MAX_ZCOPY_THRESH 1048576 + +#define SDP_OP_RECV 0x800000000LL +#define SDP_OP_SEND 0x400000000LL +#define SDP_OP_RDMA 0x200000000LL +#define SDP_OP_NOP 0x100000000LL + +/* how long (in jiffies) to block sender till tx completion*/ +#define SDP_BZCOPY_POLL_TIMEOUT (HZ / 10) + +#define SDP_AUTO_CONF 0xffff + +struct sdp_skb_cb { + __u32 seq; /* Starting sequence number */ + __u32 end_seq; /* SEQ + FIN + SYN + datalen */ + __u8 flags; /* TCP header flags. */ + struct bzcopy_state *bz; + struct rx_srcavail_state *rx_sa; + struct tx_srcavail_state *tx_sa; +}; + +#define SDP_SKB_CB(__skb) ((struct sdp_skb_cb *)&((__skb)->cb[0])) +#define BZCOPY_STATE(skb) (SDP_SKB_CB(skb)->bz) +#define RX_SRCAVAIL_STATE(skb) (SDP_SKB_CB(skb)->rx_sa) +#define TX_SRCAVAIL_STATE(skb) (SDP_SKB_CB(skb)->tx_sa) + +#ifndef MIN +#define MIN(a, b) (a < b ? a : b) +#endif + +#define ring_head(ring) (atomic_read(&(ring).head)) +#define ring_tail(ring) (atomic_read(&(ring).tail)) +#define ring_posted(ring) (ring_head(ring) - ring_tail(ring)) + +#define rx_ring_posted(ssk) ring_posted(ssk->rx_ring) +#define tx_ring_posted(ssk) (ring_posted(ssk->tx_ring) + \ + (ssk->tx_ring.rdma_inflight ? ssk->tx_ring.rdma_inflight->busy : 0)) + +#define posts_handler(ssk) atomic_read(&ssk->somebody_is_doing_posts) +#define posts_handler_get(ssk) \ + do { \ + atomic_inc(&ssk->somebody_is_doing_posts); \ + sdp_postpone_rx_timer(ssk); \ + } while (0) + +#define posts_handler_put(ssk, intr_delay) \ + do { \ + sdp_do_posts(ssk); \ + if (atomic_dec_and_test(&ssk->somebody_is_doing_posts) && \ + likely(ssk->qp_active)) \ + sdp_schedule_arm_rx_cq(ssk, intr_delay);\ + } while (0) + +#define sdp_common_release(sk) do { \ + sdp_dbg(sk, "%s:%d - sock_put(SOCK_REF_ALIVE" \ + ") - refcount = %d from withing sk_common_release\n",\ + __func__, __LINE__, atomic_read(&(sk)->sk_refcnt));\ + percpu_counter_inc((sk)->sk_prot->orphan_count);\ + sdp_add_to_history(sk, "sdp_common_release"); \ + _sdp_add_to_history(sk, "SOCK_REF_ALIVE", __func__, __LINE__, \ + 2, SOCK_REF_ALIVE); \ + sk_common_release(sk); \ +} while (0) + +extern int sdp_inline_thresh; +extern int sdp_zcopy_thresh; +extern struct workqueue_struct *sdp_wq; +extern struct list_head sock_list; +extern spinlock_t sock_list_lock; +extern int rcvbuf_initial_size; +extern struct proto sdp_proto; +extern struct workqueue_struct *rx_comp_wq; +extern spinlock_t sdp_large_sockets_lock; +extern struct ib_client sdp_client; +#ifdef SDPSTATS_ON +DECLARE_PER_CPU(struct sdpstats, sdpstats); +#endif + +enum sdp_mid { + SDP_MID_HELLO = 0x0, + SDP_MID_HELLO_ACK = 0x1, + SDP_MID_DISCONN = 0x2, + SDP_MID_ABORT = 0x3, + SDP_MID_SENDSM = 0x4, + SDP_MID_RDMARDCOMPL = 0x6, + SDP_MID_SRCAVAIL_CANCEL = 0x8, + SDP_MID_CHRCVBUF = 0xB, + SDP_MID_CHRCVBUF_ACK = 0xC, + SDP_MID_SINKAVAIL = 0xFD, + SDP_MID_SRCAVAIL = 0xFE, + SDP_MID_DATA = 0xFF, +}; + +enum sdp_flags { + SDP_OOB_PRES = 1 << 0, + SDP_OOB_PEND = 1 << 1, +}; + +enum { + SDP_MIN_TX_CREDITS = 2 +}; + +enum { + SDP_ERR_ERROR = -4, + SDP_ERR_FAULT = -3, + SDP_NEW_SEG = -2, + SDP_DO_WAIT_MEM = -1 +}; + +struct sdp_bsdh { + u8 mid; + u8 flags; + __u16 bufs; + __u32 len; + __u32 mseq; + __u32 mseq_ack; +} __attribute__((__packed__)); + +union cma_ip_addr { + struct in6_addr ip6; + struct { + __u32 pad[3]; + __u32 addr; + } ip4; +} 
__attribute__((__packed__)); + +#define HH_IPV_MASK 0xf0 +#define HH_IPV4 0x40 +#define HH_IPV6 0x60 +/* TODO: too much? Can I avoid having the src/dst and port here? */ +struct sdp_hh { + struct sdp_bsdh bsdh; + u8 majv_minv; + u8 ipv_cap; + u8 rsvd1; + u8 max_adverts; + __u32 desremrcvsz; + __u32 localrcvsz; + __u16 port; + __u16 rsvd2; + union cma_ip_addr src_addr; + union cma_ip_addr dst_addr; + u8 rsvd3[IB_CM_REQ_PRIVATE_DATA_SIZE - sizeof(struct sdp_bsdh) - 48]; +} __attribute__((__packed__)); + +struct sdp_hah { + struct sdp_bsdh bsdh; + u8 majv_minv; + u8 ipv_cap; + u8 rsvd1; + u8 ext_max_adverts; + __u32 actrcvsz; + u8 rsvd2[IB_CM_REP_PRIVATE_DATA_SIZE - sizeof(struct sdp_bsdh) - 8]; +} __attribute__((__packed__)); + +struct sdp_rrch { + __u32 len; +} __attribute__((__packed__)); + +struct sdp_srcah { + __u32 len; + __u32 rkey; + __u64 vaddr; +} __attribute__((__packed__)); + +struct sdp_buf { + struct sk_buff *skb; + /* The relation of mapping <-> pages is like this: + * mapping[0] doesn't have a correspondent page. + * mapping[i + 1] <-> pages[i] + */ + u64 mapping[SDP_MAX_SEND_SGES]; + struct page *pages[SDP_MAX_SEND_SGES - 1]; +} __attribute__((__packed__)); + +struct sdp_chrecvbuf { + u32 size; +} __attribute__((__packed__)); + +/* Context used for synchronous zero copy bcopy (BZCOPY) */ +struct bzcopy_state { + unsigned char __user *u_base; + int u_len; + int left; + int page_cnt; + int cur_page; + int cur_offset; + int busy; + struct sdp_sock *ssk; + struct page **pages; +}; + +enum tx_sa_flag { + TX_SA_SENDSM = 0x01, + TX_SA_CROSS_SEND = 0x02, + TX_SA_INTRRUPTED = 0x04, + TX_SA_TIMEDOUT = 0x08, + TX_SA_ERROR = 0x10, +}; + +struct rx_srcavail_state { + /* Advertised buffer stuff */ + u32 mseq; + u32 reported; + u32 copied; + u32 len; + u32 rkey; + u64 vaddr; + + /* Dest buff info */ + struct ib_umem *umem; + struct ib_pool_fmr *fmr; + + /* Utility */ + u8 busy; + struct sk_buff *skb; /* SrcAvail skb */ +}; + +struct tx_srcavail_state { + /* Data below 'busy' will be reset */ + u8 busy; + + struct ib_umem *umem; + struct ib_pool_fmr *fmr; + + u32 bytes_sent; + u32 bytes_acked; + + enum tx_sa_flag abort_flags; + u8 posted; + + u32 mseq; +}; + +struct sdp_tx_ring { + struct rx_srcavail_state *rdma_inflight; + struct sdp_buf *buffer; + atomic_t head; + atomic_t tail; + struct ib_cq *cq; + + u32 una_seq; + atomic_t credits; +#define tx_credits(ssk) (atomic_read(&ssk->tx_ring.credits)) + + struct timer_list timer; + struct tasklet_struct tasklet; + u16 poll_cnt; +}; + +struct sdp_rx_ring { + struct sdp_buf *buffer; + atomic_t head; + atomic_t tail; + struct ib_cq *cq; + + struct timer_list cq_arm_timer; +}; + +struct sdp_device { + struct ib_pd *pd; + struct ib_mr *mr; + struct ib_fmr_pool *fmr_pool; +}; + +struct sdp_moderation { + unsigned long last_moder_packets; + unsigned long last_moder_tx_packets; + unsigned long last_moder_bytes; + unsigned long last_moder_jiffies; + int last_moder_time; + u16 rx_usecs; + u16 rx_frames; + u16 tx_usecs; + u32 pkt_rate_low; + u16 rx_usecs_low; + u32 pkt_rate_high; + u16 rx_usecs_high; + u16 sample_interval; + u16 adaptive_rx_coal; + u32 msg_enable; + + int moder_cnt; + int moder_time; +}; + +struct sdp_sock { + /* sk has to be the first member of inet_sock */ + struct inet_sock isk; + struct list_head sock_list; + struct list_head accept_queue; + struct list_head backlog_queue; + struct sk_buff_head rx_ctl_q; + struct sock *parent; + struct sdp_device *sdp_dev; + int cpu; + + unsigned int sk_id; + +#ifdef SDP_SOCK_HISTORY + struct 
sdp_sock_hist hst[SDP_SOCK_HISTORY_LEN]; + unsigned long hst_idx; /* next free slot */ + spinlock_t hst_lock; + struct dentry *hst_dentr; +#endif /* SDP_SOCK_HISTORY */ + + int qp_active; + spinlock_t tx_sa_lock; + struct tx_srcavail_state *tx_sa; + + /* set when SrcAvail received, reset when SendSM/RdmaRdCompl sent */ + struct rx_srcavail_state *rx_sa; + int sa_post_sendsm; /* Need to send SendSM */ + int sa_post_rdma_rd_compl; /* Number of finished RDMA read bytes not reported */ + /* If > 0, need to send RdmaRdCompl */ + u32 sa_cancel_mseq; + int sa_cancel_arrived; /* is 'sa_cancel_mseq' relevant or not, sticky */ + + struct ib_ucontext context; + + int max_sge; + + struct work_struct rx_comp_work; + + struct delayed_work dreq_wait_work; + struct delayed_work cma_timewait_work; + struct work_struct destroy_work; + + int tx_compl_pending; + atomic_t somebody_is_doing_posts; + + /* Like tcp_sock */ + u16 urg_data; + u32 urg_seq; + u32 copied_seq; +#define rcv_nxt(ssk) atomic_read(&(ssk->rcv_nxt)) + atomic_t rcv_nxt; + + u32 write_seq; + int xmit_size_goal; + int nonagle; + + int dreq_wait_timeout; + int cma_timewait_timeout; + + unsigned keepalive_time; + + spinlock_t lock; + + /* tx_head/rx_head when keepalive timer started */ + unsigned keepalive_tx_head; + unsigned keepalive_rx_head; + + int destructed_already; + int sdp_disconnect; /* Need to send SDP_MID_DISCONNECT */ + int id_destroyed_already; /* for sdp_remove_device() only */ + + struct sdp_rx_ring rx_ring; + struct sdp_tx_ring tx_ring; + + /* Data below will be reset on error */ + struct rdma_cm_id *id; + struct ib_device *ib_device; + + /* SDP specific */ + atomic_t mseq_ack; +#define mseq_ack(ssk) (atomic_read(&ssk->mseq_ack)) + unsigned max_bufs; /* Initial buffers offered by other side */ + unsigned min_bufs; /* Low water mark to wake senders */ + + u32 nagle_last_unacked; /* mseq of lastest unacked packet */ + struct timer_list nagle_timer; /* timeout waiting for ack */ + + atomic_t remote_credits; +#define remote_credits(ssk) (atomic_read(&ssk->remote_credits)) + int poll_cq; + + /* rdma specific */ + struct ib_qp *qp; + + /* SDP slow start */ + int sent_request_head; /* mark the tx_head of the last send resize + request */ + int sent_request; /* 0 - not sent yet, 1 - request pending + -1 - resize done succesfully */ + int recv_request_head; /* mark the rx_head when the resize request + was recieved */ + int recv_request; /* flag if request to resize was recieved */ + int recv_frags; /* max skb frags in recv packets */ + int send_frags; /* max skb frags in send packets */ + + unsigned long tx_packets; + unsigned long rx_packets; + unsigned long rx_bytes; + struct sdp_moderation auto_mod; + + /* ZCOPY data: -1:use global; 0:disable zcopy; >0: zcopy threshold */ + int zcopy_thresh; + int inline_thresh; + + int last_bind_err; + + /* ipv6_pinfo has to be the last member of tcp6_sock, see inet6_sk_generic */ + struct ipv6_pinfo inet6; +}; + +static inline void tx_sa_reset(struct tx_srcavail_state *tx_sa) +{ + memset((void *)&tx_sa->busy, 0, + sizeof(*tx_sa) - offsetof(typeof(*tx_sa), busy)); +} + +static inline int sdp_chk_sa_cancel(struct sdp_sock *ssk, struct rx_srcavail_state *rx_sa) +{ + return ssk->sa_cancel_arrived && + before(rx_sa->mseq, ssk->sa_cancel_mseq); +} + +static inline struct sdp_sock *sdp_sk(const struct sock *sk) +{ + return (struct sdp_sock *)sk; +} + +static inline int _sdp_exch_state(const char *func, int line, struct sock *sk, + int from_states, int state) +{ + unsigned long flags; + int old; + + 
spin_lock_irqsave(&sdp_sk(sk)->lock, flags); + + sdp_dbg(sk, "%s:%d - set state: %s -> %s 0x%x\n", func, line, + sdp_state_str(sk->sk_state), + sdp_state_str(state), from_states); + + if ((1 << sk->sk_state) & ~from_states) { + sdp_warn(sk, "%s:%d: trying to exchange state from unexpected " + "state %s to state %s. expected states: 0x%x\n", + func, line, sdp_state_str(sk->sk_state), + sdp_state_str(state), from_states); + } + + old = sk->sk_state; + sk->sk_state = state; + + spin_unlock_irqrestore(&sdp_sk(sk)->lock, flags); + + sdp_add_to_history(sk, sdp_state_str(state)); + + return old; +} +#define sdp_exch_state(sk, from_states, state) \ + _sdp_exch_state(__func__, __LINE__, sk, from_states, state) + +static inline void sdp_set_error(struct sock *sk, int err) +{ + int ib_teardown_states = TCPF_FIN_WAIT1 | TCPF_CLOSE_WAIT + | TCPF_LAST_ACK; + sk->sk_err = -err; + if (sk->sk_socket) + sk->sk_socket->state = SS_DISCONNECTING; + + if ((1 << sk->sk_state) & ib_teardown_states) + sdp_exch_state(sk, ib_teardown_states, TCP_TIME_WAIT); + else if (TCP_TIME_WAIT != sk->sk_state) + sdp_exch_state(sk, ~0, TCP_CLOSE); + + sk->sk_error_report(sk); +} + +/* return the min of: + * - tx credits + * - free slots in tx_ring (not including SDP_MIN_TX_CREDITS + */ +static inline int tx_slots_free(struct sdp_sock *ssk) +{ + int min_free; + + min_free = MIN(tx_credits(ssk), + SDP_TX_SIZE - tx_ring_posted(ssk)); + if (min_free < SDP_MIN_TX_CREDITS) + return 0; + + return min_free - SDP_MIN_TX_CREDITS; +}; + +static inline unsigned sdp_cycles_to_usecs(unsigned long c) +{ +#ifdef CONFIG_PPC + return c / tb_ticks_per_usec; +#elif defined(__ia64__) + return c / local_cpu_data->cyc_per_usec; +#else + return c * 1000 / cpu_khz; +#endif +} + +/* utilities */ +static inline char *mid2str(int mid) +{ +#define ENUM2STR(e) [e] = #e + static char *mid2str[] = { + ENUM2STR(SDP_MID_HELLO), + ENUM2STR(SDP_MID_HELLO_ACK), + ENUM2STR(SDP_MID_ABORT), + ENUM2STR(SDP_MID_DISCONN), + ENUM2STR(SDP_MID_SENDSM), + ENUM2STR(SDP_MID_RDMARDCOMPL), + ENUM2STR(SDP_MID_SRCAVAIL_CANCEL), + ENUM2STR(SDP_MID_CHRCVBUF), + ENUM2STR(SDP_MID_CHRCVBUF_ACK), + ENUM2STR(SDP_MID_DATA), + ENUM2STR(SDP_MID_SRCAVAIL), + ENUM2STR(SDP_MID_SINKAVAIL), + }; + + if (mid < 0 || mid >= ARRAY_SIZE(mid2str)) { + printk(KERN_WARNING "mid %d is illegal\n", mid); + return NULL; + } + + return mid2str[mid]; +} + +static inline struct sk_buff *sdp_stream_alloc_skb(struct sock *sk, int size, + gfp_t gfp, int kind) +{ + struct sk_buff *skb; + + /* The TCP header must be at least 32-bit aligned. */ + size = ALIGN(size, 4); + + skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); + if (skb) { + + if ((kind == SK_MEM_RECV && sk_rmem_schedule(sk, skb->truesize)) || + (kind == SK_MEM_SEND && sk_wmem_schedule(sk, skb->truesize))) { + /* + * Make sure that we have exactly size bytes + * available to the caller, no more, no less. 
+ */ + skb_reserve(skb, skb_tailroom(skb) - size); + return skb; + } + __kfree_skb(skb); + } else { + sk->sk_prot->enter_memory_pressure(sk); + sk_stream_moderate_sndbuf(sk); + } + return NULL; +} + +static inline struct sk_buff *sdp_alloc_skb(struct sock *sk, u8 mid, int size, + gfp_t gfp) +{ + struct sdp_bsdh *h; + struct sk_buff *skb; + + if (!gfp) { + if (unlikely(sk->sk_allocation)) + gfp = sk->sk_allocation; + else + gfp = GFP_KERNEL; + } + + skb = sdp_stream_alloc_skb(sk, size, gfp, SK_MEM_SEND); + if (unlikely(!skb)) + return NULL; + + skb_header_release(skb); + + h = (struct sdp_bsdh *)skb_push(skb, sizeof *h); + h->mid = mid; + + skb_reset_transport_header(skb); + + return skb; +} +static inline struct sk_buff *sdp_alloc_skb_data(struct sock *sk, int size, gfp_t gfp) +{ + return sdp_alloc_skb(sk, SDP_MID_DATA, size, gfp); +} + +static inline struct sk_buff *sdp_alloc_skb_disconnect(struct sock *sk, + gfp_t gfp) +{ + return sdp_alloc_skb(sk, SDP_MID_DISCONN, 0, gfp); +} + +static inline struct sk_buff *sdp_alloc_skb_chrcvbuf_ack(struct sock *sk, + int size, gfp_t gfp) +{ + struct sk_buff *skb; + struct sdp_chrecvbuf *resp_size; + + skb = sdp_alloc_skb(sk, SDP_MID_CHRCVBUF_ACK, sizeof(*resp_size), gfp); + if (unlikely(!skb)) + return NULL; + + resp_size = (struct sdp_chrecvbuf *)skb_put(skb, sizeof *resp_size); + resp_size->size = htonl(size); + + return skb; +} + +static inline struct sk_buff *sdp_alloc_skb_srcavail(struct sock *sk, + u32 len, u32 rkey, u64 vaddr, gfp_t gfp) +{ + struct sk_buff *skb; + struct sdp_srcah *srcah; + + skb = sdp_alloc_skb(sk, SDP_MID_SRCAVAIL, sizeof(*srcah), gfp); + if (unlikely(!skb)) + return NULL; + + srcah = (struct sdp_srcah *)skb_put(skb, sizeof(*srcah)); + srcah->len = htonl(len); + srcah->rkey = htonl(rkey); + srcah->vaddr = cpu_to_be64(vaddr); + + return skb; +} + +static inline struct sk_buff *sdp_alloc_skb_srcavail_cancel(struct sock *sk, + gfp_t gfp) +{ + return sdp_alloc_skb(sk, SDP_MID_SRCAVAIL_CANCEL, 0, gfp); +} + +static inline struct sk_buff *sdp_alloc_skb_rdmardcompl(struct sock *sk, + u32 len, gfp_t gfp) +{ + struct sk_buff *skb; + struct sdp_rrch *rrch; + + skb = sdp_alloc_skb(sk, SDP_MID_RDMARDCOMPL, sizeof(*rrch), gfp); + if (unlikely(!skb)) + return NULL; + + rrch = (struct sdp_rrch *)skb_put(skb, sizeof(*rrch)); + rrch->len = htonl(len); + + return skb; +} + +static inline struct sk_buff *sdp_alloc_skb_sendsm(struct sock *sk, gfp_t gfp) +{ + return sdp_alloc_skb(sk, SDP_MID_SENDSM, 0, gfp); +} +static inline int sdp_tx_ring_slots_left(struct sdp_sock *ssk) +{ + return SDP_TX_SIZE - tx_ring_posted(ssk); +} + +/* Return true if need to send credit update. 
Rules are: + * - at least half of the RX buffer is available + * - 1.5 * c < p + * - has TX credits + * - has room in tx Q + * + * p = number of posted buffers + * c = current credits count at the peer + */ +static inline int credit_update_needed(struct sdp_sock *ssk) +{ + int c; + + c = remote_credits(ssk); + if (likely(c > SDP_MIN_TX_CREDITS)) + c += c/2; + return unlikely(c < rx_ring_posted(ssk)) && + likely(tx_credits(ssk) > 0) && + likely(sdp_tx_ring_slots_left(ssk)); +} + + +#ifdef SDPSTATS_ON + +#define SDPSTATS_MAX_HIST_SIZE 256 +struct sdpstats { + u64 rx_bytes; + u64 tx_bytes; + u32 post_send[256]; + u32 inline_sends; + u32 sendmsg_bcopy_segment; + u32 sendmsg_bzcopy_segment; + u32 sendmsg_zcopy_segment; + u32 sendmsg; + u32 recvmsg; + u32 post_send_credits; + u32 sendmsg_seglen[25]; + u32 send_size[25]; + u32 post_recv; + u32 rx_int_arm; + u32 tx_int_arm; + u32 rx_int_count; + u32 tx_int_count; + u32 rx_int_wake_up; + u32 rx_int_queue; + u32 rx_int_no_op; + u32 rx_cq_modified; + u32 rx_cq_arm_timer; + u32 rx_wq; + u32 bzcopy_poll_miss; + u32 send_wait_for_mem; + u32 send_miss_no_credits; + u32 rx_poll_miss; + u32 rx_poll_hit; + u32 poll_hit_usec[16]; + u32 tx_poll_miss; + u32 tx_poll_hit; + u32 tx_poll_busy; + u32 tx_poll_no_op; + u32 memcpy_count; + u32 credits_before_update[64]; + u32 zcopy_tx_timeout; + u32 zcopy_cross_send; + u32 zcopy_tx_aborted; + u32 zcopy_tx_error; + u32 fmr_alloc_error; + u32 keepalive_timer; + u32 nagle_timer; +}; + +static inline void sdpstats_hist(u32 *h, u32 val, u32 maxidx, int is_log) +{ + int idx = is_log ? ilog2(val) : val; + + /* ilog2(0) == -1 */ + if (idx < 0) + idx = 0; + else if (unlikely(idx > maxidx)) + idx = maxidx; + + h[idx]++; +} + +#define SDPSTATS_COUNTER_INC(stat) do { __get_cpu_var(sdpstats).stat++; } while (0) +#define SDPSTATS_COUNTER_ADD(stat, val) do { __get_cpu_var(sdpstats).stat += val; } while (0) +#define SDPSTATS_COUNTER_MID_INC(stat, mid) do { __get_cpu_var(sdpstats).stat[mid]++; } \ + while (0) +#define SDPSTATS_HIST(stat, size) \ + sdpstats_hist(__get_cpu_var(sdpstats).stat, size, ARRAY_SIZE(__get_cpu_var(sdpstats).stat) - 1, 1) + +#define SDPSTATS_HIST_LINEAR(stat, size) \ + sdpstats_hist(__get_cpu_var(sdpstats).stat, size, ARRAY_SIZE(__get_cpu_var(sdpstats).stat) - 1, 0) + +#else +#define SDPSTATS_COUNTER_INC(stat) +#define SDPSTATS_COUNTER_ADD(stat, val) +#define SDPSTATS_COUNTER_MID_INC(stat, mid) +#define SDPSTATS_HIST_LINEAR(stat, size) +#define SDPSTATS_HIST(stat, size) +#endif + +static inline void sdp_cleanup_sdp_buf(struct sdp_sock *ssk, struct sdp_buf *sbuf, + size_t head_size, enum dma_data_direction dir) +{ + int i; + struct sk_buff *skb; + struct ib_device *dev = ssk->ib_device; + + skb = sbuf->skb; + sbuf->skb = NULL; + + if (!sbuf->mapping[0]) + return; /* Inlined send - nothing to cleanup */ + + ib_dma_unmap_single(dev, sbuf->mapping[0], head_size, dir); + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + ib_dma_unmap_page(dev, sbuf->mapping[i + 1], + skb_shinfo(skb)->frags[i].size, + dir); + sbuf->mapping[i + 1] = 0; + } +} + +static inline void sdp_postpone_rx_timer(struct sdp_sock *ssk) +{ + if (timer_pending(&ssk->rx_ring.cq_arm_timer) && ssk->qp_active) + mod_timer(&ssk->rx_ring.cq_arm_timer, MAX_JIFFY_OFFSET); +} + +static inline void sdp_arm_rx_cq(struct sock *sk) +{ + if (unlikely(!sdp_sk(sk)->rx_ring.cq)) + return; + + SDPSTATS_COUNTER_INC(rx_int_arm); + + sdp_postpone_rx_timer(sdp_sk(sk)); + + if (unlikely(0 > ib_req_notify_cq(sdp_sk(sk)->rx_ring.cq, + IB_CQ_NEXT_COMP))) + sdp_warn(sk, 
"error arming rx cq\n"); +} + +static inline void sdp_arm_tx_cq(struct sock *sk) +{ + if (unlikely(!sdp_sk(sk)->tx_ring.cq)) + return; + + SDPSTATS_COUNTER_INC(tx_int_arm); + sdp_dbg_data(sk, "Arming TX cq. credits: %d, posted: %d\n", + tx_credits(sdp_sk(sk)), tx_ring_posted(sdp_sk(sk))); + + if (unlikely(0 > ib_req_notify_cq(sdp_sk(sk)->tx_ring.cq, + IB_CQ_NEXT_COMP))) + sdp_warn(sk, "error arming tx cq\n"); +} + +static inline void sdp_schedule_arm_rx_cq(struct sdp_sock *ssk, + unsigned long delay) +{ + if (unlikely(!ssk->rx_ring.cq)) + return; + + if (delay && ssk->qp_active) + mod_timer(&ssk->rx_ring.cq_arm_timer, jiffies + delay); + else { + /* There is no point of setting up a timer for an immediate + * cq-arming, better arm it now. */ + sdp_arm_rx_cq(sk_ssk(ssk)); + } +} + +static inline int somebody_is_waiting(struct sock *sk) +{ + return sk->sk_socket && + test_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static inline struct ipv6_pinfo *sdp_inet6_sk_generic(struct sock *sk) +{ + const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo); + + return (struct ipv6_pinfo *)(((u8 *)sk) + offset); +} +#endif + +/* sdp_main.c */ +void sdp_set_default_moderation(struct sdp_sock *ssk); +int sdp_init_sock(struct sock *sk); +void sdp_start_keepalive_timer(struct sock *sk); +void sdp_remove_sock(struct sdp_sock *ssk); +void sdp_add_sock(struct sdp_sock *ssk); +void sdp_urg(struct sdp_sock *ssk, struct sk_buff *skb); +void sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk); +void sdp_reset_sk(struct sock *sk, int rc); +void sdp_reset(struct sock *sk); +int sdp_tx_wait_memory(struct sdp_sock *ssk, long *timeo_p, int *credits_needed); +void sdp_skb_entail(struct sock *sk, struct sk_buff *skb); +void sdp_start_cma_timewait_timeout(struct sdp_sock *ssk, int timeo); +int sdp_abort_rx_srcavail(struct sock *sk, int post_sendsm); +extern struct rw_semaphore device_removal_lock; +extern int sdp_apm_enable; + +/* sdp_proc.c */ +int __init sdp_proc_init(void); +void sdp_proc_unregister(void); + +/* sdp_cma.c */ +int sdp_cma_handler(struct rdma_cm_id *, struct rdma_cm_event *); + +/* sdp_tx.c */ +int sdp_tx_ring_create(struct sdp_sock *ssk, struct ib_device *device); +void sdp_tx_ring_destroy(struct sdp_sock *ssk); +int sdp_xmit_poll(struct sdp_sock *ssk, int force); +void sdp_post_send(struct sdp_sock *ssk, struct sk_buff *skb); +int sdp_post_sends(struct sdp_sock *ssk, gfp_t gfp); +void sdp_nagle_timeout(unsigned long data); +void sdp_post_keepalive(struct sdp_sock *ssk); + +/* sdp_rx.c */ +int sdp_rx_ring_create(struct sdp_sock *ssk, struct ib_device *device); +void sdp_rx_ring_destroy(struct sdp_sock *ssk); +int sdp_resize_buffers(struct sdp_sock *ssk, u32 new_size); +int sdp_init_buffers(struct sdp_sock *ssk, u32 new_size); +void sdp_do_posts(struct sdp_sock *ssk); +void sdp_rx_comp_full(struct sdp_sock *ssk); +void sdp_remove_large_sock(const struct sdp_sock *ssk); +void sdp_handle_disconn(struct sock *sk); +int sdp_poll_rx_cq(struct sdp_sock *ssk); + +/* sdp_zcopy.c */ +int sdp_sendmsg_zcopy(struct kiocb *iocb, struct sock *sk, struct iovec *iov); +int sdp_handle_srcavail(struct sdp_sock *ssk, struct sdp_srcah *srcah); +void sdp_handle_sendsm(struct sdp_sock *ssk, u32 mseq_ack); +void sdp_handle_rdma_read_compl(struct sdp_sock *ssk, u32 mseq_ack, + u32 bytes_completed); +int sdp_handle_rdma_read_cqe(struct sdp_sock *ssk); +int sdp_rdma_to_iovec(struct sock *sk, struct iovec *iov, int msg_iovlen, + struct sk_buff *skb, 
unsigned long *used, u32 offset); +int sdp_post_rdma_rd_compl(struct sock *sk, + struct rx_srcavail_state *rx_sa); +int sdp_post_sendsm(struct sock *sk); +void sdp_abort_srcavail(struct sock *sk); +void sdp_abort_rdma_read(struct sock *sk); + +#endif diff --git a/drivers/infiniband/ulp/sdp/sdp_bcopy.c b/drivers/infiniband/ulp/sdp/sdp_bcopy.c new file mode 100644 index 0000000000000..508ac8490887c --- /dev/null +++ b/drivers/infiniband/ulp/sdp/sdp_bcopy.c @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2006 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * $Id$ + */ +#include "sdp.h" + +#ifdef CONFIG_INFINIBAND_SDP_DEBUG_DATA +void _dump_packet(const char *func, int line, struct sock *sk, char *str, + struct sk_buff *skb, const struct sdp_bsdh *h) +{ + struct sdp_hh *hh; + struct sdp_hah *hah; + struct sdp_chrecvbuf *req_size; + struct sdp_rrch *rrch; + struct sdp_srcah *srcah; + int len = 0; + char buf[256]; + len += snprintf(buf, 255-len, "mid: %-20s flags: 0x%x " + "bufs: 0x%x len: 0x%x mseq: 0x%x mseq_ack: 0x%x | ", + mid2str(h->mid), h->flags, + ntohs(h->bufs), ntohl(h->len), ntohl(h->mseq), + ntohl(h->mseq_ack)); + + switch (h->mid) { + case SDP_MID_HELLO: + hh = (struct sdp_hh *)h; + len += snprintf(buf + len, 255-len, + "max_adverts: %d majv_minv: 0x%x " + "localrcvsz: 0x%x desremrcvsz: 0x%x |", + hh->max_adverts, hh->majv_minv, + ntohl(hh->localrcvsz), + ntohl(hh->desremrcvsz)); + break; + case SDP_MID_HELLO_ACK: + hah = (struct sdp_hah *)h; + len += snprintf(buf + len, 255-len, "actrcvz: 0x%x |", + ntohl(hah->actrcvsz)); + break; + case SDP_MID_CHRCVBUF: + case SDP_MID_CHRCVBUF_ACK: + req_size = (struct sdp_chrecvbuf *)(h+1); + len += snprintf(buf + len, 255-len, "req_size: 0x%x |", + ntohl(req_size->size)); + break; + case SDP_MID_DATA: + len += snprintf(buf + len, 255-len, "data_len: 0x%zx |", + ntohl(h->len) - sizeof(struct sdp_bsdh)); + break; + case SDP_MID_RDMARDCOMPL: + rrch = (struct sdp_rrch *)(h+1); + + len += snprintf(buf + len, 255-len, " | len: 0x%x |", + ntohl(rrch->len)); + break; + case SDP_MID_SRCAVAIL: + srcah = (struct sdp_srcah *)(h+1); + + len += snprintf(buf + len, 255-len, " | payload: 0x%zx, " + "len: 0x%x, rkey: 0x%x, vaddr: 0x%llx |", + ntohl(h->len) - sizeof(struct sdp_bsdh) - + sizeof(struct sdp_srcah), + ntohl(srcah->len), ntohl(srcah->rkey), + be64_to_cpu(srcah->vaddr)); + break; + default: + break; + } + buf[len] = 0; + if (sdp_data_debug_level & 0x1) + _sdp_printk(func, line, KERN_WARNING, sk, "%s: %s\n", str, buf); + _sdp_prf(sk, skb, func, line, "%s: %s", str, buf); +} +#endif + +static inline void update_send_head(struct sock *sk, struct sk_buff *skb) +{ + struct page *page; + sk->sk_send_head = skb->next; + if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue) { + sk->sk_send_head = NULL; + page = sk->sk_sndmsg_page; + if (page) { + put_page(page); + sk->sk_sndmsg_page = NULL; + } + } +} + +static inline int sdp_nagle_off(struct sdp_sock *ssk, struct sk_buff *skb) +{ + struct sdp_bsdh *h = (struct sdp_bsdh *)skb_transport_header(skb); + int send_now = + BZCOPY_STATE(skb) || + unlikely(h->mid != SDP_MID_DATA) || + (ssk->nonagle & TCP_NAGLE_OFF) || + !ssk->nagle_last_unacked || + skb->next != (struct sk_buff *)&sk_ssk(ssk)->sk_write_queue || + skb->len + sizeof(struct sdp_bsdh) >= ssk->xmit_size_goal || + (SDP_SKB_CB(skb)->flags & TCPHDR_PSH) || + (SDP_SKB_CB(skb)->flags & TCPHDR_URG); + + if (send_now) { + unsigned long mseq = ring_head(ssk->tx_ring); + ssk->nagle_last_unacked = mseq; + } else { + if (!timer_pending(&ssk->nagle_timer) && ssk->qp_active) { + mod_timer(&ssk->nagle_timer, + jiffies + SDP_NAGLE_TIMEOUT); + sdp_dbg_data(sk_ssk(ssk), "Starting nagle timer\n"); + } + } + + return send_now; +} + +void sdp_nagle_timeout(unsigned long data) +{ + struct sdp_sock *ssk = (struct sdp_sock *)data; + struct sock *sk = sk_ssk(ssk); + + SDPSTATS_COUNTER_INC(nagle_timer); + sdp_dbg_data(sk, "last_unacked = %u\n", ssk->nagle_last_unacked); + + if (!ssk->nagle_last_unacked) + goto out2; + + /* Only process if the socket is not in use */ + bh_lock_sock(sk); + if 
(sock_owned_by_user(sk)) { + sdp_dbg_data(sk, "socket is busy - will try later\n"); + goto out; + } + + if (sk->sk_state == TCP_CLOSE) { + bh_unlock_sock(sk); + return; + } + + ssk->nagle_last_unacked = 0; + sdp_post_sends(ssk, GFP_ATOMIC); + + if (sdp_sk_sleep(sk) && waitqueue_active(sdp_sk_sleep(sk))) + sk_stream_write_space(sk); +out: + bh_unlock_sock(sk); +out2: + if (sk->sk_send_head && ssk->qp_active) { + /* If has pending sends - rearm */ + mod_timer(&ssk->nagle_timer, jiffies + SDP_NAGLE_TIMEOUT); + } +} + +static inline int sdp_should_rearm(struct sock *sk) +{ + return sk->sk_state != TCP_ESTABLISHED || sdp_sk(sk)->tx_sa || + somebody_is_waiting(sk); +} + +int sdp_post_sends(struct sdp_sock *ssk, gfp_t gfp) +{ + /* TODO: nonagle? */ + struct sk_buff *skb; + int post_count = 0; + struct sock *sk = sk_ssk(ssk); + + if (unlikely(!ssk->id)) { + if (sk->sk_send_head) { + sdp_dbg(sk, "Send on socket without cmid ECONNRESET\n"); + /* TODO: flush send queue? */ + sdp_reset(sk); + } + return -ECONNRESET; + } +again: + if (sdp_tx_ring_slots_left(ssk) < SDP_TX_SIZE / 2) + sdp_xmit_poll(ssk, 1); + + /* Run out of credits, check if got a credit update */ + if (unlikely(tx_credits(ssk) <= SDP_MIN_TX_CREDITS)) { + sdp_poll_rx_cq(ssk); + + if (unlikely(sdp_should_rearm(sk) || !posts_handler(ssk))) + sdp_arm_rx_cq(sk); + } + + if (unlikely((ssk->sa_post_rdma_rd_compl || ssk->sa_post_sendsm) && + tx_credits(ssk) < SDP_MIN_TX_CREDITS)) { + sdp_dbg_data(sk, "Run out of credits, can't abort SrcAvail. " + "RdmaRdCompl: %d SendSm: %d\n", + ssk->sa_post_rdma_rd_compl, ssk->sa_post_sendsm); + } + + if (ssk->sa_post_rdma_rd_compl && tx_credits(ssk) >= SDP_MIN_TX_CREDITS) { + int unreported = ssk->sa_post_rdma_rd_compl; + + skb = sdp_alloc_skb_rdmardcompl(sk, unreported, gfp); + if (!skb) + goto no_mem; + sdp_post_send(ssk, skb); + post_count++; + ssk->sa_post_rdma_rd_compl = 0; + } + + if (ssk->sa_post_sendsm && tx_credits(ssk) >= SDP_MIN_TX_CREDITS) { + skb = sdp_alloc_skb_sendsm(sk, gfp); + if (unlikely(!skb)) + goto no_mem; + sdp_post_send(ssk, skb); + ssk->sa_post_sendsm = 0; + post_count++; + } + + if (ssk->recv_request && + ring_tail(ssk->rx_ring) >= SDP_MIN_TX_CREDITS && + tx_credits(ssk) >= SDP_MIN_TX_CREDITS && + sdp_tx_ring_slots_left(ssk)) { + skb = sdp_alloc_skb_chrcvbuf_ack(sk, + ssk->recv_frags * PAGE_SIZE, gfp); + if (!skb) + goto no_mem; + ssk->recv_request = 0; + sdp_post_send(ssk, skb); + post_count++; + } + + if (tx_credits(ssk) <= SDP_MIN_TX_CREDITS && + sdp_tx_ring_slots_left(ssk) && + sk->sk_send_head && + sdp_nagle_off(ssk, sk->sk_send_head)) { + SDPSTATS_COUNTER_INC(send_miss_no_credits); + } + + while (tx_credits(ssk) > SDP_MIN_TX_CREDITS && + sdp_tx_ring_slots_left(ssk) && + (skb = sk->sk_send_head) && + sdp_nagle_off(ssk, skb)) { + update_send_head(sk, skb); + __skb_dequeue(&sk->sk_write_queue); + + sdp_post_send(ssk, skb); + + post_count++; + } + + if (credit_update_needed(ssk) && + likely((1 << sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_FIN_WAIT1))) { + + skb = sdp_alloc_skb_data(sk, 0, gfp); + if (!skb) + goto no_mem; + + sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(sk, skb->truesize); + + sdp_post_send(ssk, skb); + SDPSTATS_COUNTER_INC(post_send_credits); + post_count++; + } + + /* send DisConn if needed + * Do not send DisConn if there is only 1 credit. Compliance with CA4-82 + * If one credit is available, an implementation shall only send SDP + * messages that provide additional credits and also do not contain ULP + * payload. 
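+	 * (Hence the condition below posts the DisConn only once nothing is
+	 * left on sk_send_head and tx_credits(ssk) has reached
+	 * SDP_MIN_TX_CREDITS.)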
*/ + if (unlikely(ssk->sdp_disconnect) && + !sk->sk_send_head && + tx_credits(ssk) >= SDP_MIN_TX_CREDITS) { + skb = sdp_alloc_skb_disconnect(sk, gfp); + if (!skb) + goto no_mem; + ssk->sdp_disconnect = 0; + sdp_post_send(ssk, skb); + post_count++; + } + + if (!sdp_tx_ring_slots_left(ssk) || post_count) { + if (sdp_xmit_poll(ssk, 1)) + goto again; + } + +no_mem: + return post_count; +} diff --git a/drivers/infiniband/ulp/sdp/sdp_cma.c b/drivers/infiniband/ulp/sdp/sdp_cma.c new file mode 100644 index 0000000000000..250e5af2194ba --- /dev/null +++ b/drivers/infiniband/ulp/sdp/sdp_cma.c @@ -0,0 +1,650 @@ +/* + * Copyright (c) 2006 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id$ + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#include +#include +#endif +#include "sdp.h" + +#define SDP_MAJV_MINV 0x22 + +SDP_MODPARAM_INT(sdp_rx_size, 0x40, "HW rx queue size (max num of credits)." 
+ " Must be power of 2."); + +SDP_MODPARAM_SINT(sdp_retry_count, 5, "IB layer retry count"); + +SDP_MODPARAM_SINT(sdp_link_layer_ib_only, 0, "Support only link layer of " + "type Infiniband"); + +static void sdp_qp_event_handler(struct ib_event *event, void *data) +{ + if (event->event == IB_EVENT_PATH_MIG) { + sdp_dbg(NULL, "Path migration event\n"); + return; + } + sdp_warn(NULL, "unexpected invocation: event: %d, data=%p\n", + event->event, data); +} + +static int sdp_get_max_dev_sge(struct ib_device *dev) +{ + struct ib_device_attr attr; + static int max_sges = -1; + int rc; + + if (max_sges > 0) + goto out; + + rc = ib_query_device(dev, &attr); + if (rc) { + sdp_warn(NULL, "ib_query_device failed: %d\n", rc); + goto out; + } + + max_sges = attr.max_sge; + +out: + return max_sges; +} + +static int sdp_init_qp(struct sock *sk, struct rdma_cm_id *id) +{ + struct ib_qp_init_attr qp_init_attr = { + .event_handler = sdp_qp_event_handler, + .cap.max_send_wr = SDP_TX_SIZE, + .cap.max_recv_wr = sdp_rx_size, + .cap.max_inline_data = sdp_inline_thresh, + .sq_sig_type = IB_SIGNAL_REQ_WR, + .qp_type = IB_QPT_RC, + }; + struct ib_device *device = id->device; + int rc; + + sdp_dbg(sk, "%s\n", __func__); + + sdp_sk(sk)->max_sge = sdp_get_max_dev_sge(device); + sdp_dbg(sk, "Max sges: %d\n", sdp_sk(sk)->max_sge); + + qp_init_attr.cap.max_send_sge = MIN(sdp_sk(sk)->max_sge, SDP_MAX_SEND_SGES); + sdp_dbg(sk, "Setting max send sge to: %d\n", qp_init_attr.cap.max_send_sge); + + qp_init_attr.cap.max_recv_sge = MIN(sdp_sk(sk)->max_sge, SDP_MAX_RECV_SGES); + sdp_dbg(sk, "Setting max recv sge to: %d\n", qp_init_attr.cap.max_recv_sge); + + sdp_sk(sk)->sdp_dev = ib_get_client_data(device, &sdp_client); + if (!sdp_sk(sk)->sdp_dev) { + sdp_warn(sk, "SDP not available on device %s\n", device->name); + rc = -ENODEV; + goto err_rx; + } + + rc = sdp_rx_ring_create(sdp_sk(sk), device); + if (rc) + goto err_rx; + + rc = sdp_tx_ring_create(sdp_sk(sk), device); + if (rc) + goto err_tx; + + qp_init_attr.recv_cq = sdp_sk(sk)->rx_ring.cq; + qp_init_attr.send_cq = sdp_sk(sk)->tx_ring.cq; + + rc = rdma_create_qp(id, sdp_sk(sk)->sdp_dev->pd, &qp_init_attr); + if (rc) { + sdp_warn(sk, "Unable to create QP: %d.\n", rc); + goto err_qp; + } + sdp_sk(sk)->qp = id->qp; + sdp_sk(sk)->ib_device = device; + sdp_sk(sk)->qp_active = 1; + sdp_sk(sk)->context.device = device; + sdp_sk(sk)->inline_thresh = qp_init_attr.cap.max_inline_data; + + sdp_dbg(sk, "%s done\n", __func__); + return 0; + +err_qp: + sdp_tx_ring_destroy(sdp_sk(sk)); +err_tx: + sdp_rx_ring_destroy(sdp_sk(sk)); +err_rx: + return rc; +} + +static int sdp_get_max_send_frags(u32 buf_size) +{ + return MIN( + /* +1 to conpensate on not aligned buffers */ + (PAGE_ALIGN(buf_size) >> PAGE_SHIFT) + 1, + SDP_MAX_SEND_SGES - 1); +} + +static int sdp_connect_handler(struct sock *sk, struct rdma_cm_id *id, + struct rdma_cm_event *event) +{ + struct sockaddr_in *dst_addr; + struct sock *child; + const struct sdp_hh *h; + int rc = 0; + + sdp_dbg(sk, "%s %p -> %p\n", __func__, sdp_sk(sk)->id, id); + + h = event->param.conn.private_data; + SDP_DUMP_PACKET(sk, "RX", NULL, &h->bsdh); + + if (h->ipv_cap & HH_IPV_MASK & ~(HH_IPV4 | HH_IPV6)) { + sdp_warn(sk, "Bad IPV field in SDP Hello header: 0x%x\n", + h->ipv_cap & HH_IPV_MASK); + return -EINVAL; + } + + if (!h->max_adverts) + return -EINVAL; + + child = sk_clone(sk, GFP_KERNEL); + if (!child) + return -ENOMEM; + + sdp_init_sock(child); + + dst_addr = (struct sockaddr_in *)&id->route.addr.dst_addr; + inet_dport(child) = dst_addr->sin_port; + 
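+	/*
+	 * Worked example of the buffer negotiation finished further down in
+	 * this handler (numbers are illustrative, not from the patch): with
+	 * 4 KiB pages, a 16-byte BSDH and a peer localrcvsz of 0x10000, the
+	 * child ends up with xmit_size_goal = 0x10000 - 16 = 65520, and
+	 * sdp_get_max_send_frags() returns min(16 + 1, SDP_MAX_SEND_SGES - 1)
+	 * scatter/gather fragments per send.
+	 */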
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (inet6_sk(sk)) { + struct ipv6_pinfo *newnp; + + newnp = inet_sk(child)->pinet6 = sdp_inet6_sk_generic(child); + + memcpy(newnp, inet6_sk(sk), sizeof(struct ipv6_pinfo)); + + if ((h->ipv_cap & HH_IPV_MASK) == HH_IPV4) { + /* V6 mapped */ + sdp_inet_daddr(child) = dst_addr->sin_addr.s_addr; + ipv6_addr_set(&newnp->daddr, 0, 0, htonl(0x0000FFFF), + h->src_addr.ip4.addr); + + ipv6_addr_set(&newnp->saddr, 0, 0, htonl(0x0000FFFF), + h->dst_addr.ip4.addr); + + ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr); + } else if ((h->ipv_cap & HH_IPV_MASK) == HH_IPV6) { + struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst_addr; + struct sockaddr_in6 *src_addr6 = + (struct sockaddr_in6 *)&id->route.addr.src_addr; + + ipv6_addr_copy(&newnp->daddr, &dst_addr6->sin6_addr); + ipv6_addr_copy(&newnp->saddr, &src_addr6->sin6_addr); + ipv6_addr_copy(&newnp->rcv_saddr, &src_addr6->sin6_addr); + } else { + sdp_warn(child, "Bad IPV field: 0x%x\n", h->ipv_cap & HH_IPV_MASK); + } + + sdp_inet_daddr(child) = inet_saddr(child) = sdp_inet_rcv_saddr(child) = LOOPBACK4_IPV6; + } else +#endif + { + sdp_inet_daddr(child) = dst_addr->sin_addr.s_addr; + } + +#ifdef SDP_SOCK_HISTORY + sdp_ssk_hist_rename(sk); +#endif + __sock_put(child, SOCK_REF_CLONE); + + down_read(&device_removal_lock); + + rc = sdp_init_qp(child, id); + if (rc) { + bh_unlock_sock(child); + up_read(&device_removal_lock); + sdp_sk(child)->destructed_already = 1; +#ifdef SDP_SOCK_HISTORY + sdp_ssk_hist_close(child); +#endif + sk_free(child); + return rc; + } + + sdp_sk(child)->max_bufs = ntohs(h->bsdh.bufs); + atomic_set(&sdp_sk(child)->tx_ring.credits, sdp_sk(child)->max_bufs); + + sdp_sk(child)->min_bufs = tx_credits(sdp_sk(child)) / 4; + sdp_sk(child)->xmit_size_goal = ntohl(h->localrcvsz) - + sizeof(struct sdp_bsdh); + + sdp_sk(child)->send_frags = sdp_get_max_send_frags(sdp_sk(child)->xmit_size_goal); + sdp_init_buffers(sdp_sk(child), rcvbuf_initial_size); + + id->context = child; + sdp_sk(child)->id = id; + + list_add_tail(&sdp_sk(child)->backlog_queue, + &sdp_sk(sk)->backlog_queue); + sdp_sk(child)->parent = sk; + + bh_unlock_sock(child); + sdp_add_sock(sdp_sk(child)); + up_read(&device_removal_lock); + + sdp_exch_state(child, TCPF_LISTEN | TCPF_CLOSE, TCP_SYN_RECV); + + /* child->sk_write_space(child); */ + /* child->sk_data_ready(child, 0); */ + sk->sk_data_ready(sk, 0); + + return 0; +} + +static int sdp_response_handler(struct sock *sk, struct rdma_cm_id *id, + struct rdma_cm_event *event) +{ + const struct sdp_hah *h; + struct sockaddr_in *dst_addr; + sdp_dbg(sk, "%s\n", __func__); + + sdp_exch_state(sk, TCPF_SYN_SENT, TCP_ESTABLISHED); + sdp_set_default_moderation(sdp_sk(sk)); + + if (sock_flag(sk, SOCK_KEEPOPEN)) + sdp_start_keepalive_timer(sk); + + if (sock_flag(sk, SOCK_DEAD)) + return 0; + + h = event->param.conn.private_data; + SDP_DUMP_PACKET(sk, "RX", NULL, &h->bsdh); + sdp_sk(sk)->max_bufs = ntohs(h->bsdh.bufs); + atomic_set(&sdp_sk(sk)->tx_ring.credits, sdp_sk(sk)->max_bufs); + sdp_sk(sk)->min_bufs = tx_credits(sdp_sk(sk)) / 4; + sdp_sk(sk)->xmit_size_goal = + ntohl(h->actrcvsz) - sizeof(struct sdp_bsdh); + sdp_sk(sk)->send_frags = sdp_get_max_send_frags(sdp_sk(sk)->xmit_size_goal); + sdp_sk(sk)->xmit_size_goal = MIN(sdp_sk(sk)->xmit_size_goal, + sdp_sk(sk)->send_frags * PAGE_SIZE); + + sdp_sk(sk)->poll_cq = 1; + + sk->sk_state_change(sk); + sk_wake_async(sk, 0, POLL_OUT); + + dst_addr = (struct sockaddr_in *)&id->route.addr.dst_addr; + inet_dport(sk) = 
dst_addr->sin_port; + sdp_inet_daddr(sk) = dst_addr->sin_addr.s_addr; + +#ifdef SDP_SOCK_HISTORY + sdp_ssk_hist_rename(sk); +#endif + return 0; +} + +static int sdp_connected_handler(struct sock *sk) +{ + struct sock *parent; + sdp_dbg(sk, "%s\n", __func__); + + parent = sdp_sk(sk)->parent; + BUG_ON(!parent); + + sdp_exch_state(sk, TCPF_SYN_RECV, TCP_ESTABLISHED); + +#ifdef SDP_SOCK_HISTORY + sdp_ssk_hist_rename(sk); +#endif + sdp_set_default_moderation(sdp_sk(sk)); + + if (sock_flag(sk, SOCK_KEEPOPEN)) + sdp_start_keepalive_timer(sk); + + if (sock_flag(sk, SOCK_DEAD)) + return 0; + + lock_sock(parent); + if (!sdp_sk(parent)->id) { /* TODO: look at SOCK_DEAD? */ + sdp_dbg(sk, "parent is going away.\n"); + goto done; + } + + sk_acceptq_added(parent); + sdp_dbg(parent, "%s child connection established\n", __func__); + list_del_init(&sdp_sk(sk)->backlog_queue); + list_add_tail(&sdp_sk(sk)->accept_queue, + &sdp_sk(parent)->accept_queue); + + parent->sk_state_change(parent); + sk_wake_async(parent, 0, POLL_OUT); +done: + release_sock(parent); + + return 0; +} + +static int sdp_disconnected_handler(struct sock *sk) +{ + struct sdp_sock *ssk = sdp_sk(sk); + + sdp_dbg(sk, "%s\n", __func__); + + if (ssk->tx_ring.cq) + if (sdp_xmit_poll(ssk, 1)) + sdp_post_sends(ssk, 0); + + if (sk->sk_state == TCP_SYN_RECV) { + sdp_connected_handler(sk); + + if (rcv_nxt(ssk)) + return 0; + } + + return -ECONNRESET; +} + +int sdp_cma_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) +{ + struct rdma_conn_param conn_param; + struct sock *parent = NULL; + struct sock *child = NULL; + struct sock *sk; + struct sdp_hah hah; + struct sdp_hh hh; + + int rc = 0, rc2; + + sk = id->context; + if (!sk) { + sdp_dbg(NULL, "cm_id is being torn down, event %s\n", + rdma_cm_event_str(event->event)); + return event->event == RDMA_CM_EVENT_CONNECT_REQUEST ? + -EINVAL : 0; + } + + sdp_add_to_history(sk, rdma_cm_event_str(event->event)); + + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); + sdp_dbg(sk, "event: %s\n", rdma_cm_event_str(event->event)); + if (!sdp_sk(sk)->id) { + sdp_dbg(sk, "socket is being torn down\n"); + rc = event->event == RDMA_CM_EVENT_CONNECT_REQUEST ? + -EINVAL : 0; + release_sock(sk); + return rc; + } + + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + if (sdp_link_layer_ib_only && + rdma_node_get_transport(id->device->node_type) == + RDMA_TRANSPORT_IB && + rdma_port_link_layer(id->device, id->port_num) != + IB_LINK_LAYER_INFINIBAND) { + sdp_dbg(sk, "Link layer is: %d. 
Only IB link layer " + "is allowed\n", + rdma_port_link_layer(id->device, id->port_num)); + rc = -ENETUNREACH; + break; + } + + rc = rdma_resolve_route(id, SDP_ROUTE_TIMEOUT); + break; + case RDMA_CM_EVENT_ADDR_ERROR: + rc = -ENETUNREACH; + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + rc = sdp_init_qp(sk, id); + if (rc) + break; + memset(&hh, 0, sizeof hh); + hh.bsdh.mid = SDP_MID_HELLO; + hh.bsdh.len = htonl(sizeof(struct sdp_hh)); + hh.max_adverts = 1; + + hh.majv_minv = SDP_MAJV_MINV; + sdp_init_buffers(sdp_sk(sk), rcvbuf_initial_size); + hh.bsdh.bufs = htons(rx_ring_posted(sdp_sk(sk))); + atomic_set(&sdp_sk(sk)->remote_credits, + rx_ring_posted(sdp_sk(sk))); + hh.localrcvsz = hh.desremrcvsz = htonl(sdp_sk(sk)->recv_frags * + PAGE_SIZE + sizeof(struct sdp_bsdh)); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (inet6_sk(sk)) { + struct sockaddr *src_addr = (struct sockaddr *)&id->route.addr.src_addr; + struct sockaddr_in *addr4 = (struct sockaddr_in *)src_addr; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)src_addr; + + if (src_addr->sa_family == AF_INET) { + /* IPv4 over IPv6 */ + ipv6_addr_set(&inet6_sk(sk)->rcv_saddr, 0, 0, htonl(0xFFFF), + addr4->sin_addr.s_addr); + } else { + inet6_sk(sk)->rcv_saddr = addr6->sin6_addr; + } + inet6_sk(sk)->saddr = inet6_sk(sk)->rcv_saddr; + } + else +#endif + { + inet_saddr(sk) = sdp_inet_rcv_saddr(sk) = + ((struct sockaddr_in *)&id->route.addr.src_addr)->sin_addr.s_addr; + } + memset(&conn_param, 0, sizeof conn_param); + conn_param.private_data_len = sizeof hh; + conn_param.private_data = &hh; + conn_param.responder_resources = 4 /* TODO */; + conn_param.initiator_depth = 4 /* TODO */; + conn_param.retry_count = sdp_retry_count; + SDP_DUMP_PACKET(sk, "TX", NULL, &hh.bsdh); + + if (sdp_apm_enable) { + rc = rdma_enable_apm(id, RDMA_ALT_PATH_BEST); + if (rc) + sdp_warn(sk, "APM couldn't be enabled for active side: %d\n", rc); + } + + rc = rdma_connect(id, &conn_param); + break; + + case RDMA_CM_EVENT_ALT_ROUTE_RESOLVED: + sdp_dbg(sk, "alt route was resolved slid=%d, dlid=%d\n", + id->route.path_rec[1].slid, id->route.path_rec[1].dlid); + break; + + case RDMA_CM_EVENT_ALT_PATH_LOADED: + sdp_dbg(sk, "alt route path loaded\n"); + break; + + case RDMA_CM_EVENT_ALT_ROUTE_ERROR: + sdp_warn(sk, "alt route resolve error\n"); + break; + + case RDMA_CM_EVENT_ROUTE_ERROR: + rc = -ETIMEDOUT; + break; + case RDMA_CM_EVENT_CONNECT_REQUEST: + rc = sdp_connect_handler(sk, id, event); + if (rc) { + sdp_dbg(sk, "Destroying qp\n"); + rdma_reject(id, NULL, 0); + break; + } + child = id->context; + atomic_set(&sdp_sk(child)->remote_credits, + rx_ring_posted(sdp_sk(child))); + memset(&hah, 0, sizeof hah); + hah.bsdh.mid = SDP_MID_HELLO_ACK; + hah.bsdh.bufs = htons(rx_ring_posted(sdp_sk(child))); + hah.bsdh.len = htonl(sizeof(struct sdp_hah)); + hah.majv_minv = SDP_MAJV_MINV; + hah.ext_max_adverts = 1; /* Doesn't seem to be mandated by spec, + but just in case */ + hah.actrcvsz = htonl(sdp_sk(child)->recv_frags * PAGE_SIZE + + sizeof(struct sdp_bsdh)); + memset(&conn_param, 0, sizeof conn_param); + conn_param.private_data_len = sizeof hah; + conn_param.private_data = &hah; + conn_param.responder_resources = 4 /* TODO */; + conn_param.initiator_depth = 4 /* TODO */; + conn_param.retry_count = sdp_retry_count; + SDP_DUMP_PACKET(sk, "TX", NULL, &hah.bsdh); + rc = rdma_accept(id, &conn_param); + if (rc) { + sdp_sk(child)->id = NULL; + id->qp = NULL; + id->context = NULL; + parent = sdp_sk(child)->parent; /* TODO: hold ? 
*/ + } else if (sdp_apm_enable) { + rc2 = rdma_enable_apm(id, RDMA_ALT_PATH_BEST); + if (rc2) + sdp_warn(sk, "APM couldn't be enabled for passive side: %d\n", rc2); + } + break; + case RDMA_CM_EVENT_CONNECT_RESPONSE: + rc = sdp_response_handler(sk, id, event); + if (rc) { + sdp_dbg(sk, "Destroying qp\n"); + rdma_reject(id, NULL, 0); + } else { + rc = rdma_accept(id, NULL); + if (!rc && sdp_apm_enable) { + rc2 = rdma_enable_apm(id, RDMA_ALT_PATH_BEST); + if (rc2) + sdp_warn(sk, "APM couldn't be enabled for passive side:%d \n", rc2); + } + } + break; + case RDMA_CM_EVENT_CONNECT_ERROR: + rc = -ETIMEDOUT; + break; + case RDMA_CM_EVENT_UNREACHABLE: + rc = -ENETUNREACH; + break; + case RDMA_CM_EVENT_REJECTED: + rc = -ECONNREFUSED; + break; + case RDMA_CM_EVENT_ESTABLISHED: + inet_saddr(sk) = sdp_inet_rcv_saddr(sk) = + ((struct sockaddr_in *)&id->route.addr.src_addr)->sin_addr.s_addr; + rc = sdp_connected_handler(sk); + break; + case RDMA_CM_EVENT_DISCONNECTED: /* This means DREQ/DREP received */ + if (sk->sk_state == TCP_LAST_ACK) { + sdp_cancel_dreq_wait_timeout(sdp_sk(sk)); + + sdp_exch_state(sk, TCPF_LAST_ACK, TCP_TIME_WAIT); + + sdp_dbg(sk, "%s: waiting for Infiniband tear down\n", + __func__); + } + + sdp_sk(sk)->qp_active = 0; + rdma_disconnect(id); + + if (sk->sk_state != TCP_TIME_WAIT) { + if (sk->sk_state == TCP_CLOSE_WAIT) { + sdp_dbg(sk, "IB teardown while in " + "TCP_CLOSE_WAIT taking reference to " + "let close() finish the work\n"); + sock_hold(sk, SOCK_REF_CMA); + sdp_start_cma_timewait_timeout(sdp_sk(sk), + SDP_CMA_TIMEWAIT_TIMEOUT); + + } + sdp_set_error(sk, -EPIPE); + rc = sdp_disconnected_handler(sk); + } + break; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + rc = sdp_disconnected_handler(sk); + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + rc = -ENETRESET; + break; + + case RDMA_CM_EVENT_ADDR_CHANGE: + sdp_dbg(sk, "Got Address change event\n"); + rc = 0; + break; + default: + printk(KERN_ERR "SDP: Unexpected CMA event: %d\n", + event->event); + rc = -ECONNABORTED; + break; + } + + sdp_dbg(sk, "event: %s handled\n", rdma_cm_event_str(event->event)); + + if (rc && sdp_sk(sk)->id == id) { + child = sk; + sdp_sk(sk)->id = NULL; + id->qp = NULL; + id->context = NULL; + parent = sdp_sk(sk)->parent; + sdp_reset_sk(sk, rc); + } + + release_sock(sk); + + sdp_dbg(sk, "event: %s done. status %d\n", + rdma_cm_event_str(event->event), rc); + + if (parent) { + lock_sock(parent); + if (!sdp_sk(parent)->id) { /* TODO: look at SOCK_DEAD? */ + sdp_dbg(sk, "parent is going away.\n"); + child = NULL; + goto done; + } + if (!list_empty(&sdp_sk(child)->backlog_queue)) + list_del_init(&sdp_sk(child)->backlog_queue); + else + child = NULL; +done: + release_sock(parent); + if (child) + sdp_common_release(child); + } + return rc; +} diff --git a/drivers/infiniband/ulp/sdp/sdp_dbg.h b/drivers/infiniband/ulp/sdp/sdp_dbg.h new file mode 100644 index 0000000000000..3526d676ea710 --- /dev/null +++ b/drivers/infiniband/ulp/sdp/sdp_dbg.h @@ -0,0 +1,301 @@ +#ifndef _SDP_DBG_H_ +#define _SDP_DBG_H_ + +#define SDPSTATS_ON + +#ifdef CONFIG_INFINIBAND_SDP_DEBUG +#define SDP_SOCK_HISTORY +#endif + +#ifdef CONFIG_INFINIBAND_SDP_DEBUG_DATA +#define SDP_PROFILING +#endif + +#define SDP_WARN_ON(x) WARN_ON(x) +static inline struct sdp_sock *sdp_sk(const struct sock *sk); + +#define _sdp_printk(func, line, level, sk, format, arg...) do { \ + preempt_disable(); \ + printk(level "%s:%d sdp_sock(%5d:%d %d:%d): " format, \ + func, line, \ + current->pid, smp_processor_id(), \ + (sk) ? inet_num(sk) : -1, \ + (sk) ? 
ntohs(inet_dport(sk)) : -1, ## arg); \ + preempt_enable(); \ +} while (0) +#define sdp_printk(level, sk, format, arg...) \ + _sdp_printk(__func__, __LINE__, level, sk, format, ## arg) +#define sdp_warn(sk, format, arg...) \ + do { \ + sdp_printk(KERN_WARNING, sk, format, ## arg); \ + sdp_prf(sk, NULL, format , ## arg); \ + } while (0) + +#define SDP_MODPARAM_SINT(var, def_val, msg) \ + static int var = def_val; \ + module_param_named(var, var, int, 0644); \ + MODULE_PARM_DESC(var, msg " [" #def_val "]"); \ + +#define SDP_MODPARAM_INT(var, def_val, msg) \ + int var = def_val; \ + module_param_named(var, var, int, 0644); \ + MODULE_PARM_DESC(var, msg " [" #def_val "]"); \ + +#ifdef SDP_PROFILING +struct sk_buff; +struct sdpprf_log { + int idx; + int pid; + int cpu; + int sk_num; + int sk_dport; + struct sk_buff *skb; + char msg[256]; + + cycles_t time; + + const char *func; + int line; +}; + +#define SDPPRF_LOG_SIZE 0x20000 /* must be a power of 2 */ + +extern struct sdpprf_log sdpprf_log[SDPPRF_LOG_SIZE]; +extern atomic_t sdpprf_log_count; + +#define _sdp_prf(sk, s, _func, _line, format, arg...) ({ \ + int idx = atomic_add_return(1, &sdpprf_log_count) - 1; \ + struct sdpprf_log *l = \ + &sdpprf_log[idx & (SDPPRF_LOG_SIZE - 1)]; \ + preempt_disable(); \ + l->idx = idx; \ + l->pid = current->pid; \ + l->sk_num = (sk) ? inet_num(sk) : -1; \ + l->sk_dport = (sk) ? ntohs(inet_dport(sk)) : -1; \ + l->cpu = smp_processor_id(); \ + l->skb = s; \ + snprintf(l->msg, sizeof(l->msg) - 1, format, ## arg); \ + l->time = get_cycles(); \ + l->func = _func; \ + l->line = _line; \ + preempt_enable(); \ + 1; \ +}) +#define sdp_prf1(sk, s, format, arg...) \ + _sdp_prf(sk, s, __func__, __LINE__, format, ## arg) +#define sdp_prf(sk, s, format, arg...) sdp_prf1(sk, s, format, ## arg) + +#else +#define _sdp_prf(sk, s, _func, _line, format, arg...) +#define sdp_prf1(sk, s, format, arg...) +#define sdp_prf(sk, s, format, arg...) +#endif + +#ifdef CONFIG_INFINIBAND_SDP_DEBUG +extern int sdp_debug_level; + +#define sdp_dbg(sk, format, arg...) \ + do { \ + if (sdp_debug_level > 0) \ + sdp_printk(KERN_WARNING, sk, format , ## arg); \ + sdp_prf(sk, NULL, format , ## arg); \ + } while (0) + +#define sock_ref(sk, msg, sock_op) ({ \ + if (!atomic_read(&(sk)->sk_refcnt)) {\ + sdp_warn(sk, "%s:%d - %s (%s) ref = 0.\n", \ + __func__, __LINE__, #sock_op, msg); \ + sdp_print_history(sk); \ + SDP_WARN_ON(1); \ + } else { \ + sdp_dbg_data(sk, "%s:%d - %s (%s) ref = %d.\n", __func__, __LINE__, \ + #sock_op, msg, atomic_read(&(sk)->sk_refcnt)); \ + sock_op(sk); \ + }\ +}) + +#else /* CONFIG_INFINIBAND_SDP_DEBUG */ +#define sdp_dbg(priv, format, arg...) \ + do { (void) (priv); } while (0) +#define sock_ref(sk, msg, sock_op) sock_op(sk) +#endif /* CONFIG_INFINIBAND_SDP_DEBUG */ + +#ifdef CONFIG_INFINIBAND_SDP_DEBUG_DATA + +extern int sdp_data_debug_level; +#define sdp_dbg_data(sk, format, arg...) \ + do { \ + if (sdp_data_debug_level & 0x2) \ + sdp_printk(KERN_WARNING, sk, format , ## arg); \ + sdp_prf(sk, NULL, format , ## arg); \ + } while (0) +#define SDP_DUMP_PACKET(sk, str, skb, h) \ + do { \ + dump_packet(sk, str, skb, h); \ + } while (0) +#else +#define sdp_dbg_data(priv, format, arg...) 
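+/*
+ * For reference, the SDP_MODPARAM_* helpers above expand in place; e.g.
+ * SDP_MODPARAM_SINT(sdp_retry_count, 5, "IB layer retry count") in
+ * sdp_cma.c becomes:
+ *
+ *	static int sdp_retry_count = 5;
+ *	module_param_named(sdp_retry_count, sdp_retry_count, int, 0644);
+ *	MODULE_PARM_DESC(sdp_retry_count, "IB layer retry count [5]");
+ */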
+#define SDP_DUMP_PACKET(sk, str, skb, h) +#endif + +enum sdp_ref { + SOCK_REF_RESET, + SOCK_REF_ALIVE, /* sock_alloc -> destruct_sock */ + SOCK_REF_CLONE, + SOCK_REF_CMA, /* sdp_cma_handler is expected to be invoked */ + SOCK_REF_SEQ, /* during proc read */ + SOCK_REF_DREQ_TO, /* dreq timeout is pending */ + SOCK_REF_ZCOPY, /* zcopy send in process */ + SOCK_REF_RDMA_RD, /* RDMA read in process */ + SOCK_REF_KEEPALIVE /* socket is held by sk_reset_timer */ +}; + +#ifdef SDP_SOCK_HISTORY +#define SDP_SOCK_HISTORY_LEN 128 + +enum sdp_ref_type { + NOT_REF, + HOLD_REF, + PUT_REF, + __PUT_REF, + BOTH_REF +}; + +struct sdp_sock_hist { + char *str; + char *func; + int line; + int pid; + u8 cnt; + u8 ref_type; /* enum sdp_ref_type */ + u8 ref_enum; /* enum sdp_ref */ +}; + +static inline char *reftype2str(int reftype) +{ +#define ENUM2STR(e) [e] = #e + static char *enum2str[] = { + ENUM2STR(NOT_REF), + ENUM2STR(HOLD_REF), + ENUM2STR(PUT_REF), + ENUM2STR(__PUT_REF), + ENUM2STR(BOTH_REF) + }; + + if (reftype < 0 || reftype >= ARRAY_SIZE(enum2str)) { + printk(KERN_WARNING "reftype %d is illegal\n", reftype); + return NULL; + } + + return enum2str[reftype]; +} + +void _sdp_add_to_history(struct sock *sk, const char *str, + const char *func, int line, int ref_type, int ref_enum); +void sdp_print_history(struct sock *sk); + +#define sdp_add_to_history(sk, str) \ + _sdp_add_to_history(sk, str, __func__, __LINE__, 0, 0) + +#define sock_hold(sk, msg) \ + do { \ + _sdp_add_to_history(sk, #msg, __func__, __LINE__, \ + HOLD_REF, msg); \ + sock_ref(sk, #msg, sock_hold); \ + } while (0) + +#define sock_put(sk, msg) \ + do { \ + _sdp_add_to_history(sk, #msg, __func__, __LINE__, \ + PUT_REF, msg); \ + sock_ref(sk, #msg, sock_put); \ + } while (0) + +#define __sock_put(sk, msg) \ + do { \ + _sdp_add_to_history(sk, #msg, __func__, __LINE__, \ + __PUT_REF, msg); \ + sock_ref(sk, #msg, __sock_put); \ + } while (0) + +int sdp_ssk_hist_open(struct sock *sk); +int sdp_ssk_hist_close(struct sock *sk); +int sdp_ssk_hist_rename(struct sock *sk); + +#else +#define sock_hold(sk, msg) sock_ref(sk, #msg, sock_hold) +#define sock_put(sk, msg) sock_ref(sk, #msg, sock_put) +#define __sock_put(sk, msg) sock_ref(sk, #msg, __sock_put) + +#define _sdp_add_to_history(sk, str, func, line, ref_type, ref_enum) +#define sdp_add_to_history(sk, str) +#define sdp_print_history(sk) + +#endif /* SDP_SOCK_HISTORY */ + +#define ENUM2STR(e) [e] = #e + +static inline char *sdp_state_str(int state) +{ + static char *state2str[] = { + ENUM2STR(TCP_ESTABLISHED), + ENUM2STR(TCP_SYN_SENT), + ENUM2STR(TCP_SYN_RECV), + ENUM2STR(TCP_FIN_WAIT1), + ENUM2STR(TCP_FIN_WAIT2), + ENUM2STR(TCP_TIME_WAIT), + ENUM2STR(TCP_CLOSE), + ENUM2STR(TCP_CLOSE_WAIT), + ENUM2STR(TCP_LAST_ACK), + ENUM2STR(TCP_LISTEN), + ENUM2STR(TCP_CLOSING), + }; + + if (state < 0 || state >= ARRAY_SIZE(state2str)) { + printk(KERN_WARNING "state %d is illegal\n", state); + return NULL; + } + + return state2str[state]; +} + +static inline const char* rdma_cm_event_str(int event) +{ + static const char* state2str[] = { + ENUM2STR(RDMA_CM_EVENT_ADDR_RESOLVED), + ENUM2STR(RDMA_CM_EVENT_ADDR_ERROR), + ENUM2STR(RDMA_CM_EVENT_ROUTE_RESOLVED), + ENUM2STR(RDMA_CM_EVENT_ROUTE_ERROR), + ENUM2STR(RDMA_CM_EVENT_CONNECT_REQUEST), + ENUM2STR(RDMA_CM_EVENT_CONNECT_RESPONSE), + ENUM2STR(RDMA_CM_EVENT_CONNECT_ERROR), + ENUM2STR(RDMA_CM_EVENT_UNREACHABLE), + ENUM2STR(RDMA_CM_EVENT_REJECTED), + ENUM2STR(RDMA_CM_EVENT_ESTABLISHED), + ENUM2STR(RDMA_CM_EVENT_DISCONNECTED), + ENUM2STR(RDMA_CM_EVENT_DEVICE_REMOVAL), 
+ ENUM2STR(RDMA_CM_EVENT_MULTICAST_JOIN), + ENUM2STR(RDMA_CM_EVENT_MULTICAST_ERROR), + ENUM2STR(RDMA_CM_EVENT_ADDR_CHANGE), + ENUM2STR(RDMA_CM_EVENT_TIMEWAIT_EXIT), + ENUM2STR(RDMA_CM_EVENT_ALT_ROUTE_RESOLVED), + ENUM2STR(RDMA_CM_EVENT_ALT_ROUTE_ERROR) + }; + + if (event < 0 || event >= ARRAY_SIZE(state2str)) { + printk(KERN_WARNING "event %d is illegal\n", event); + return NULL; + } + + return state2str[event]; +} + +struct sdp_bsdh; +#ifdef CONFIG_INFINIBAND_SDP_DEBUG_DATA +void _dump_packet(const char *func, int line, struct sock *sk, char *str, + struct sk_buff *skb, const struct sdp_bsdh *h); +#define dump_packet(sk, str, skb, h) \ + _dump_packet(__func__, __LINE__, sk, str, skb, h) +#endif + +#endif diff --git a/drivers/infiniband/ulp/sdp/sdp_main.c b/drivers/infiniband/ulp/sdp/sdp_main.c new file mode 100644 index 0000000000000..3fd77d15f6b45 --- /dev/null +++ b/drivers/infiniband/ulp/sdp/sdp_main.c @@ -0,0 +1,3014 @@ +/* + * Copyright (c) 2006 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +/* + * This file is based on net/ipv4/tcp.c + * under the following permission notice: + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or(at your option) any later version. + */ + +#if defined(__ia64__) +/* csum_partial_copy_from_user is not exported on ia64. + We don't really need it for SDP - skb_copy_to_page happens to call it + but for SDP HW checksum is always set, so ... */ + +#include +#include +#include + +static inline +unsigned int csum_partial_copy_from_user_new (const char *src, char *dst, + int len, unsigned int sum, + int *errp) +{ + *errp = -EINVAL; + return 0; +} + +#define csum_partial_copy_from_user csum_partial_copy_from_user_new +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "sdp.h" +#include + +MODULE_AUTHOR("Michael S. 
Tsirkin"); +MODULE_DESCRIPTION("InfiniBand SDP module"); +MODULE_LICENSE("Dual BSD/GPL"); + +#ifdef CONFIG_INFINIBAND_SDP_DEBUG +SDP_MODPARAM_INT(sdp_debug_level, 0, "Enable debug tracing if > 0."); +#endif +#ifdef CONFIG_INFINIBAND_SDP_DEBUG_DATA +SDP_MODPARAM_INT(sdp_data_debug_level, 0, + "Enable data path debug tracing if > 0."); +#endif + +SDP_MODPARAM_INT(sdp_apm_enable, 1, "Enable APM."); +SDP_MODPARAM_SINT(sdp_fmr_pool_size, 20, "Number of FMRs to allocate for pool"); +SDP_MODPARAM_SINT(sdp_fmr_dirty_wm, 5, "Watermark to flush fmr pool"); + +SDP_MODPARAM_SINT(recv_poll, 700, "usecs to poll recv before arming interrupt."); +SDP_MODPARAM_SINT(sdp_keepalive_time, SDP_KEEPALIVE_TIME, + "Default idle time in seconds before keepalive probe sent."); + +SDP_MODPARAM_INT(sdp_inline_thresh, SDP_DEF_INLINE_THRESH, + "Inline copy threshold. effective to new sockets only; 0=Off."); + +SDP_MODPARAM_INT(sdp_zcopy_thresh, SDP_DEF_ZCOPY_THRESH , + "Zero copy using RDMA threshold; 0=Off."); +#define SDP_RX_COAL_TIME_HIGH 128 +SDP_MODPARAM_SINT(sdp_rx_coal_target, 0x50000, + "Target number of bytes to coalesce with interrupt moderation."); +SDP_MODPARAM_SINT(sdp_rx_coal_time, 0x10, "rx coal time (jiffies)."); +SDP_MODPARAM_SINT(sdp_rx_rate_low, 80000, "rx_rate low (packets/sec)."); +SDP_MODPARAM_SINT(sdp_rx_coal_time_low, 0, "low moderation usec."); +SDP_MODPARAM_SINT(sdp_rx_rate_high, 100000, "rx_rate high (packets/sec)."); +SDP_MODPARAM_SINT(sdp_rx_coal_time_high, 128, "high moderation usec."); +SDP_MODPARAM_SINT(sdp_rx_rate_thresh, (200000 / SDP_RX_COAL_TIME_HIGH), + "rx rate thresh ()."); +SDP_MODPARAM_SINT(sdp_sample_interval, (HZ / 4), "sample interval (jiffies)."); + +SDP_MODPARAM_SINT(hw_int_mod_count, -1, + "forced hw int moderation val. -1 for auto (packets)."); +SDP_MODPARAM_SINT(hw_int_mod_usec, -1, + "forced hw int moderation val. -1 for auto (usec)."); + +struct workqueue_struct *sdp_wq; +struct workqueue_struct *rx_comp_wq; + +struct list_head sock_list; +spinlock_t sock_list_lock; + +DECLARE_RWSEM(device_removal_lock); + +static inline unsigned int sdp_keepalive_time_when(const struct sdp_sock *ssk) +{ + return ssk->keepalive_time ? 
: sdp_keepalive_time; +} + +inline void sdp_add_sock(struct sdp_sock *ssk) +{ + spin_lock_irq(&sock_list_lock); + list_add_tail(&ssk->sock_list, &sock_list); + spin_unlock_irq(&sock_list_lock); +} + +inline void sdp_remove_sock(struct sdp_sock *ssk) +{ + spin_lock_irq(&sock_list_lock); + BUG_ON(list_empty(&sock_list)); + list_del_init(&(ssk->sock_list)); + spin_unlock_irq(&sock_list_lock); +} + +static int sdp_get_port(struct sock *sk, unsigned short snum) +{ + struct sdp_sock *ssk = sdp_sk(sk); + struct sockaddr_in *src_addr; + int addr_len; + int rc; + + struct sockaddr_storage addr; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&addr; + struct sockaddr_in *addr4 = (struct sockaddr_in *)&addr; + + sdp_add_to_history(sk, __func__); + + if (!ssk->id) + ssk->id = rdma_create_id(sdp_cma_handler, sk, RDMA_PS_SDP); + + if (!ssk->id) + return -ENOMEM; + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (inet6_sk(sk)) { + int addr_type = ipv6_addr_type(&inet6_sk(sk)->rcv_saddr); + if (addr_type == IPV6_ADDR_MAPPED) { + addr4->sin_family = AF_INET; + addr4->sin_port = htons(snum); + addr4->sin_addr.s_addr = inet6_sk(sk)->rcv_saddr.s6_addr32[3]; + addr_len = sizeof(*addr4); + } else { + addr6->sin6_family = AF_INET6; + addr6->sin6_port = htons(snum); + addr6->sin6_scope_id = sk->sk_bound_dev_if; + ipv6_addr_copy(&addr6->sin6_addr, &inet6_sk(sk)->rcv_saddr); + addr_len = sizeof(*addr6); + } + } + else +#endif + { + + addr4->sin_family = AF_INET; + addr4->sin_port = htons(snum); + addr4->sin_addr.s_addr = sdp_inet_rcv_saddr(sk); + + addr_len = sizeof(*addr4); + + sdp_dbg(sk, "%s: " NIPQUAD_FMT ":%u\n", __func__, + NIPQUAD(addr4->sin_addr.s_addr), ntohs(addr4->sin_port)); + } + + /* IP core seems to bind many times to the same address */ + /* TODO: I don't really understand why. Find out. 
*/ + if (!memcmp(&addr, &ssk->id->route.addr.src_addr, addr_len)) + return 0; + + rc = ssk->last_bind_err = rdma_bind_addr(ssk->id, (struct sockaddr *)&addr); + if (rc) { + sdp_dbg(sk, "Destroying rdma id rc = %d\n", rc); + rdma_destroy_id(ssk->id); + ssk->id = NULL; + return rc; + } + + src_addr = (struct sockaddr_in *)&(ssk->id->route.addr.src_addr); + inet_num(sk) = ntohs(src_addr->sin_port); +#ifdef SDP_SOCK_HISTORY + sdp_ssk_hist_rename(sk); +#endif + return 0; +} + +static void sdp_destroy_qp(struct sdp_sock *ssk) +{ + sdp_dbg(sk_ssk(ssk), "destroying qp\n"); + sdp_prf(sk_ssk(ssk), NULL, "destroying qp"); + + sdp_add_to_history(sk_ssk(ssk), __func__); + ssk->qp_active = 0; + + if (ssk->qp) { + ib_destroy_qp(ssk->qp); + ssk->qp = NULL; + } + + sdp_rx_ring_destroy(ssk); + sdp_tx_ring_destroy(ssk); + + sdp_remove_large_sock(ssk); +} + +static void sdp_reset_keepalive_timer(struct sock *sk, unsigned long len) +{ + struct sdp_sock *ssk = sdp_sk(sk); + + sdp_dbg(sk, "%s\n", __func__); + + ssk->keepalive_tx_head = ring_head(ssk->tx_ring); + ssk->keepalive_rx_head = ring_head(ssk->rx_ring); + + sk_reset_timer(sk, &sk->sk_timer, jiffies + len); +} + +static void sdp_delete_keepalive_timer(struct sock *sk) +{ + struct sdp_sock *ssk = sdp_sk(sk); + + sdp_dbg(sk, "%s\n", __func__); + + ssk->keepalive_tx_head = 0; + ssk->keepalive_rx_head = 0; + + sk_stop_timer(sk, &sk->sk_timer); +} + +static void sdp_keepalive_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + struct sdp_sock *ssk = sdp_sk(sk); + + sdp_dbg(sk, "%s\n", __func__); + SDPSTATS_COUNTER_INC(keepalive_timer); + + /* Only process if the socket is not in use */ + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + sdp_reset_keepalive_timer(sk, HZ / 20); + goto out; + } + + if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_LISTEN || + sk->sk_state == TCP_CLOSE || !ssk->qp) + goto out; + + if (ssk->keepalive_tx_head == ring_head(ssk->tx_ring) && + ssk->keepalive_rx_head == ring_head(ssk->rx_ring)) + sdp_post_keepalive(ssk); + + sdp_reset_keepalive_timer(sk, sdp_keepalive_time_when(ssk)); + +out: + bh_unlock_sock(sk); + sock_put(sk, SOCK_REF_KEEPALIVE); +} + +static void sdp_set_keepalive(struct sock *sk, int val) +{ + sdp_dbg(sk, "%s %d\n", __func__, val); + + if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) + return; + + if (val && !sock_flag(sk, SOCK_KEEPOPEN)) + sdp_start_keepalive_timer(sk); + else if (!val) + sdp_delete_keepalive_timer(sk); +} + +void sdp_start_keepalive_timer(struct sock *sk) +{ + sdp_reset_keepalive_timer(sk, sdp_keepalive_time_when(sdp_sk(sk))); +} + +void sdp_set_default_moderation(struct sdp_sock *ssk) +{ + struct sock *sk = sk_ssk(ssk); + struct sdp_moderation *mod = &ssk->auto_mod; + int rx_buf_size; + + if (hw_int_mod_count > -1 || hw_int_mod_usec > -1) { + int err; + + mod->adaptive_rx_coal = 0; + + if (hw_int_mod_count > 0 && hw_int_mod_usec > 0) { + err = ib_modify_cq(ssk->rx_ring.cq, hw_int_mod_count, + hw_int_mod_usec); + if (unlikely(err)) + sdp_warn(sk, + "Failed modifying moderation for cq\n"); + else + sdp_dbg(sk, + "Using fixed interrupt moderation\n"); + SDPSTATS_COUNTER_INC(rx_cq_modified); + } + return; + } + + mod->adaptive_rx_coal = 1; + sdp_dbg(sk, "Using adaptive interrupt moderation\n"); + + /* If we haven't received a specific coalescing setting + * (module param), we set the moderation paramters as follows: + * - moder_cnt is set to the number of mtu sized packets to + * satisfy our coelsing target. + * - moder_time is set to a fixed value. 
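+	 * As a worked example (assumed values: 4 KiB pages, recv_frags = 16
+	 * and a 16-byte BSDH, so rx_buf_size = 65552), the default coalescing
+	 * target of 0x50000 bytes gives moder_cnt = 0x50000 / 65552 + 1 = 5
+	 * packets per interrupt, while moder_time stays at sdp_rx_coal_time
+	 * (default 0x10).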
+ */ + rx_buf_size = (ssk->recv_frags * PAGE_SIZE) + sizeof(struct sdp_bsdh); + mod->moder_cnt = sdp_rx_coal_target / rx_buf_size + 1; + mod->moder_time = sdp_rx_coal_time; + sdp_dbg(sk, "Default coalesing params for buf size:%d - " + "moder_cnt:%d moder_time:%d\n", + rx_buf_size, mod->moder_cnt, mod->moder_time); + + /* Reset auto-moderation params */ + mod->pkt_rate_low = sdp_rx_rate_low; + mod->rx_usecs_low = sdp_rx_coal_time_low; + mod->pkt_rate_high = sdp_rx_rate_high; + mod->rx_usecs_high = sdp_rx_coal_time_high; + mod->sample_interval = sdp_sample_interval; + + mod->last_moder_time = SDP_AUTO_CONF; + mod->last_moder_jiffies = 0; + mod->last_moder_packets = 0; + mod->last_moder_tx_packets = 0; + mod->last_moder_bytes = 0; +} + +/* If tx and rx packet rates are not balanced, assume that + * traffic is mainly BW bound and apply maximum moderation. + * Otherwise, moderate according to packet rate */ +static inline int calc_moder_time(int rate, struct sdp_moderation *mod, + int tx_pkt_diff, int rx_pkt_diff) +{ + if (2 * tx_pkt_diff > 3 * rx_pkt_diff || + 2 * rx_pkt_diff > 3 * tx_pkt_diff) + return mod->rx_usecs_high; + + if (rate < mod->pkt_rate_low) + return mod->rx_usecs_low; + + if (rate > mod->pkt_rate_high) + return mod->rx_usecs_high; + + return (rate - mod->pkt_rate_low) * + (mod->rx_usecs_high - mod->rx_usecs_low) / + (mod->pkt_rate_high - mod->pkt_rate_low) + + mod->rx_usecs_low; +} + +static void sdp_auto_moderation(struct sdp_sock *ssk) +{ + struct sdp_moderation *mod = &ssk->auto_mod; + + unsigned long period = jiffies - mod->last_moder_jiffies; + unsigned long packets; + unsigned long rate; + unsigned long avg_pkt_size; + unsigned long tx_pkt_diff; + unsigned long rx_pkt_diff; + int moder_time; + int err; + + if (unlikely(!ssk->rx_ring.cq)) + return; + + if (!mod->adaptive_rx_coal) + return; + + if (period < mod->sample_interval) + return; + + if (!mod->last_moder_jiffies || !period) + goto out; + + tx_pkt_diff = ((unsigned long) (ssk->tx_packets - + mod->last_moder_tx_packets)); + rx_pkt_diff = ((unsigned long) (ssk->rx_packets - + mod->last_moder_packets)); + packets = max(tx_pkt_diff, rx_pkt_diff); + rate = packets * HZ / period; + avg_pkt_size = packets ? 
((unsigned long) (ssk->rx_bytes - + mod->last_moder_bytes)) / packets : 0; + + /* Apply auto-moderation only when packet rate exceeds a rate that + * it matters */ + if (rate > sdp_rx_rate_thresh) { + moder_time = calc_moder_time(rate, mod, tx_pkt_diff, + rx_pkt_diff); + } else { + /* When packet rate is low, use default moderation rather + * than 0 to prevent interrupt storms if traffic suddenly + * increases */ + moder_time = mod->moder_time; + } + + sdp_dbg_data(sk_ssk(ssk), "tx rate:%lu rx_rate:%lu\n", + tx_pkt_diff * HZ / period, rx_pkt_diff * HZ / period); + + sdp_dbg_data(sk_ssk(ssk), "Rx moder_time changed from:%d to %d " + "period:%lu [jiff] packets:%lu avg_pkt_size:%lu " + "rate:%lu [p/s])\n", + mod->last_moder_time, moder_time, period, packets, + avg_pkt_size, rate); + + if (moder_time != mod->last_moder_time) { + mod->last_moder_time = moder_time; + err = ib_modify_cq(ssk->rx_ring.cq, mod->moder_cnt, moder_time); + if (unlikely(err)) { + sdp_dbg_data(sk_ssk(ssk), + "Failed modifying moderation for cq"); + } + SDPSTATS_COUNTER_INC(rx_cq_modified); + } + +out: + mod->last_moder_packets = ssk->rx_packets; + mod->last_moder_tx_packets = ssk->tx_packets; + mod->last_moder_bytes = ssk->rx_bytes; + mod->last_moder_jiffies = jiffies; +} + +void sdp_reset_sk(struct sock *sk, int rc) +{ + struct sdp_sock *ssk = sdp_sk(sk); + + sdp_dbg(sk, "%s\n", __func__); + + if (ssk->tx_ring.cq) + if (sdp_xmit_poll(ssk, 1)) + sdp_post_sends(ssk, 0); + + sdp_abort_srcavail(sk); + + if (!(sk->sk_shutdown & RCV_SHUTDOWN) || !sk_stream_memory_free(sk)) { + sdp_dbg(sk, "setting state to error\n"); + sdp_set_error(sk, rc); + } + + sk->sk_state_change(sk); + + /* Don't destroy socket before destroy work does its job */ + sock_hold(sk, SOCK_REF_RESET); + queue_work(sdp_wq, &ssk->destroy_work); +} + +/* Like tcp_reset */ +/* When we get a reset (completion with error) we do this. */ +void sdp_reset(struct sock *sk) +{ + int err; + + sdp_dbg(sk, "%s state=%s\n", __func__, sdp_state_str(sk->sk_state)); + + if (sk->sk_state != TCP_ESTABLISHED) + return; + + /* We want the right error as BSD sees it (and indeed as we do). */ + + /* On fin we currently only set RCV_SHUTDOWN, so .. */ + err = (sk->sk_shutdown & RCV_SHUTDOWN) ? EPIPE : ECONNRESET; + + sdp_set_error(sk, -err); + sk->sk_state_change(sk); +} + +/* TODO: linger? */ +static void sdp_destroy_resources(struct sock *sk) +{ + struct sdp_sock *ssk = sdp_sk(sk); + struct rdma_cm_id *id = NULL; + sdp_dbg(sk, "%s\n", __func__); + + lock_sock(sk); + + sk->sk_send_head = NULL; + skb_queue_purge(&sk->sk_write_queue); + /* + * If sendmsg cached page exists, toss it. + */ + if (sk->sk_sndmsg_page) { + __free_page(sk->sk_sndmsg_page); + sk->sk_sndmsg_page = NULL; + } + + id = ssk->id; + if (ssk->id) { + id->qp = NULL; + ssk->id = NULL; + release_sock(sk); + rdma_destroy_id(id); + lock_sock(sk); + } + + sdp_destroy_qp(ssk); + + /* QP is destroyed, so no one will queue skbs anymore. 
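+ * It is therefore safe to abort a pending SrcAvail and to purge
+ * both the receive queue and the RX control queue here.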
*/ + if (ssk->rx_sa) + sdp_abort_rx_srcavail(sk, 0); + + skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge(&ssk->rx_ctl_q); + + sdp_dbg(sk, "%s done; releasing sock\n", __func__); + release_sock(sk); +} + +static inline void sdp_kill_id_and_release(struct sdp_sock *ssk) +{ + struct sock *sk = sk_ssk(ssk); + struct rdma_cm_id *id; + + lock_sock(sk); + id = ssk->id; + ssk->id = NULL; + release_sock(sk); + + if (id) + rdma_destroy_id(id); + sdp_common_release(sk); +} + +static void sdp_destruct(struct sock *sk) +{ + struct sdp_sock *ssk = sdp_sk(sk); + struct sdp_sock *s, *t; + + sdp_dbg(sk, "%s\n", __func__); + if (ssk->destructed_already) { + sdp_warn(sk, "redestructing sk!\n"); + return; + } + + sdp_add_to_history(sk, __func__); + percpu_counter_dec(sk->sk_prot->orphan_count); + percpu_counter_dec(sk->sk_prot->sockets_allocated); + ssk->destructed_already = 1; + + down_read(&device_removal_lock); + sdp_remove_sock(ssk); + sdp_destroy_resources(sk); + up_read(&device_removal_lock); + +#ifdef SDP_SOCK_HISTORY + sdp_add_to_history(sk, __func__); + sdp_ssk_hist_close(sk); +#endif + + flush_workqueue(rx_comp_wq); + /* Consider use cancel_work_sync(&ssk->rx_comp_work) */ + + sk_mem_reclaim(sk); + + if (sk->sk_wmem_queued || atomic_read(&sk->sk_rmem_alloc) || sk->sk_forward_alloc) { + sdp_dbg(sk, "wmem_queued: 0x%x rmem_alloc: 0x%x forward: 0x%x " + "proto: 0x%lx\n", sk->sk_wmem_queued, + atomic_read(&sk->sk_rmem_alloc), + sk->sk_forward_alloc, + atomic_long_read(sk->sk_prot->memory_allocated)); + } + + if (ssk->parent) + goto done; + + list_for_each_entry_safe(s, t, &ssk->backlog_queue, backlog_queue) { + sdp_kill_id_and_release(s); + } + list_for_each_entry_safe(s, t, &ssk->accept_queue, accept_queue) { + sdp_kill_id_and_release(s); + } + +done: + sdp_dbg(sk, "%s done\n", __func__); +} + +static inline void sdp_start_dreq_wait_timeout(struct sdp_sock *ssk, int timeo) +{ + sdp_dbg(sk_ssk(ssk), "Starting dreq wait timeout\n"); + + queue_delayed_work(sdp_wq, &ssk->dreq_wait_work, timeo); + ssk->dreq_wait_timeout = 1; +} + +static void sdp_send_disconnect(struct sock *sk) +{ + sock_hold(sk, SOCK_REF_DREQ_TO); + sdp_start_dreq_wait_timeout(sdp_sk(sk), SDP_FIN_WAIT_TIMEOUT); + + sdp_sk(sk)->sdp_disconnect = 1; + sdp_post_sends(sdp_sk(sk), 0); + + sdp_arm_rx_cq(sk); +} + +/* + * State processing on a close. + * TCP_ESTABLISHED -> TCP_FIN_WAIT1 -> TCP_CLOSE + */ +static int sdp_close_state(struct sock *sk) +{ + switch (sk->sk_state) { + case TCP_ESTABLISHED: + sdp_exch_state(sk, TCPF_ESTABLISHED, TCP_FIN_WAIT1); + break; + case TCP_CLOSE_WAIT: + sdp_exch_state(sk, TCPF_CLOSE_WAIT, TCP_LAST_ACK); + break; + default: + return 0; + } + + return 1; +} + +/* + * In order to prevent asynchronous-events handling after the last reference + * count removed, we destroy rdma_id so cma_handler() won't be invoked. + * This function should be called under lock_sock(sk). 
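+ * As in sdp_destroy_resources(), the socket lock is dropped around
+ * rdma_destroy_id() and re-taken afterwards.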
+ */ +static inline void disable_cma_handler(struct sock *sk) +{ + if (sdp_sk(sk)->id) { + struct rdma_cm_id *id = sdp_sk(sk)->id; + sdp_sk(sk)->id = NULL; + release_sock(sk); + rdma_destroy_id(id); + lock_sock(sk); + } +} + +static void sdp_cma_timewait_timeout_work(struct work_struct *work) +{ + struct sdp_sock *ssk = + container_of(work, struct sdp_sock, cma_timewait_work.work); + struct sock *sk = sk_ssk(ssk); + + lock_sock(sk); + if (!ssk->cma_timewait_timeout) { + release_sock(sk); + return; + } + + ssk->cma_timewait_timeout = 0; + release_sock(sk); + sock_put(sk, SOCK_REF_CMA); +} + +static int sdp_cancel_cma_timewait_timeout(struct sdp_sock *ssk) +{ + if (!ssk->cma_timewait_timeout) + return 0; + + ssk->cma_timewait_timeout = 0; + return cancel_delayed_work(&ssk->cma_timewait_work); + /* No need to use the sync'ed function because the socket's refcnt is + * pre-taken and multiple invocations of sock_put() are self sync'ed + * (atomic operation). + */ +} + +void sdp_start_cma_timewait_timeout(struct sdp_sock *ssk, int timeo) +{ + queue_delayed_work(sdp_wq, &ssk->cma_timewait_work, timeo); + ssk->cma_timewait_timeout = 1; +} + +/* Like tcp_close */ +static void sdp_close(struct sock *sk, long timeout) +{ + struct sk_buff *skb; + int data_was_unread = 0; + + sdp_add_to_history(sk, __func__); + lock_sock(sk); + + sdp_dbg(sk, "%s\n", __func__); + sdp_prf(sk, NULL, __func__); + + sdp_sk(sk)->cpu = smp_processor_id(); + sdp_delete_keepalive_timer(sk); + + sk->sk_shutdown = SHUTDOWN_MASK; + + if ((1 << sk->sk_state) & (TCPF_TIME_WAIT | TCPF_CLOSE)) { + /* this could happen if socket was closed by a CM teardown + and after that the user called close() */ + disable_cma_handler(sk); + goto out; + } + + if (sk->sk_state == TCP_LISTEN || sk->sk_state == TCP_SYN_SENT) { + sdp_exch_state(sk, TCPF_LISTEN | TCPF_SYN_SENT, TCP_CLOSE); + disable_cma_handler(sk); + + /* Special case: stop listening. + This is done by sdp_destruct. */ + goto out; + } + + sock_hold(sk, SOCK_REF_CMA); + sdp_start_cma_timewait_timeout(sdp_sk(sk), SDP_CMA_TIMEWAIT_TIMEOUT); + + /* We need to flush the recv. buffs. We do this only on the + * descriptor close, not protocol-sourced closes, because the + * reader process may not have drained the data yet! + */ + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + struct sdp_bsdh *h = (struct sdp_bsdh *)skb_transport_header(skb); + if (h->mid == SDP_MID_DISCONN) { + sdp_handle_disconn(sk); + } else { + if (h->mid == SDP_MID_SRCAVAIL && sdp_sk(sk)->rx_sa) + sdp_abort_rx_srcavail(sk, 1); + + sdp_dbg(sk, "Data was unread. skb: %p\n", skb); + data_was_unread = 1; + } + __kfree_skb(skb); + } + + sk_mem_reclaim(sk); + + /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section + * 3.10, we send a RST here because data was lost. To + * witness the awful effects of the old behavior of always + * doing a FIN, run an older 2.1.x kernel or 2.0.x, start + * a bulk GET in an FTP client, suspend the process, wait + * for the client to advertise a zero window, then kill -9 + * the FTP client, wheee... Note: timeout is always zero + * in such a case. + */ + if (data_was_unread || + (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime)) { + /* Unread data was tossed, zap the connection. 
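+ * The state moves straight to TCP_TIME_WAIT and an abortive
+ * disconnect is issued instead of the orderly DisConn exchange.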
*/ + NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); + sdp_exch_state(sk, TCPF_CLOSE_WAIT | TCPF_ESTABLISHED, + TCP_TIME_WAIT); + + /* Go into abortive close */ + sk->sk_prot->disconnect(sk, 0); + } else if (sdp_close_state(sk)) { + /* We FIN if the application ate all the data before + * zapping the connection. + */ + + sdp_send_disconnect(sk); + } + + /* TODO: state should move to CLOSE or CLOSE_WAIT etc on disconnect. + Since it currently doesn't, do it here to avoid blocking below. */ + if (!sdp_sk(sk)->id) + sdp_exch_state(sk, TCPF_FIN_WAIT1 | TCPF_LAST_ACK | + TCPF_CLOSE_WAIT, TCP_CLOSE); + + sk_stream_wait_close(sk, timeout); +out: + release_sock(sk); + + sdp_common_release(sk); +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static int sdp_ipv6_connect(struct sock *sk, struct sockaddr_storage *saddr, + struct sockaddr *uaddr, int addr_len) +{ + struct sdp_sock *ssk = sdp_sk(sk); + struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr; + struct sockaddr_in6 *src_addr = (struct sockaddr_in6 *)saddr; + int rc; + int addr_type; + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + if (uaddr->sa_family == AF_INET6_SDP) + uaddr->sa_family = AF_INET6; + + if (uaddr->sa_family != AF_INET6) + return -EAFNOSUPPORT; + + /* + * connect() to INADDR_ANY means loopback (BSD'ism). + */ + if(ipv6_addr_any(&usin->sin6_addr)) + usin->sin6_addr.s6_addr[15] = 0x1; + + addr_type = ipv6_addr_type(&usin->sin6_addr); + + if(addr_type & IPV6_ADDR_MULTICAST) + return -ENETUNREACH; + + sk->sk_bound_dev_if = usin->sin6_scope_id; + + src_addr->sin6_family = AF_INET6; + src_addr->sin6_port = htons(inet_sport(sk)); + src_addr->sin6_addr = inet6_sk(sk)->saddr; + + if (ssk->id && (addr_type != ipv6_addr_type(&inet6_sk(sk)->rcv_saddr))) { + sdp_dbg(sk, "Existing address type is different for the " + "requested. 
rebinding socket\n"); + rdma_destroy_id(ssk->id); + ssk->id = NULL; + } + + if (!ssk->id) { + /* If IPv4 over IPv6, make sure rdma_bind will expect ipv4 address */ + if (addr_type == IPV6_ADDR_MAPPED) + ipv6_addr_set(&inet6_sk(sk)->rcv_saddr, 0, 0, htonl(0x0000FFFF), 0); + + rc = sdp_get_port(sk, htons(inet_sport(sk))); + if (rc) + return rc; + inet_sport(sk) = htons(inet_num(sk)); + } + + ipv6_addr_copy(&inet6_sk(sk)->daddr, &usin->sin6_addr); + + if (addr_type == IPV6_ADDR_MAPPED) { + struct sockaddr_in *addr4 = (struct sockaddr_in *)uaddr; + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)uaddr; + + if (__ipv6_only_sock(sk)) + return -ENETUNREACH; + + addr4->sin_addr.s_addr = addr6->sin6_addr.s6_addr32[3]; + addr4->sin_family = AF_INET; + } + + sdp_dbg(sk, "%s " NIP6_FMT ":%hu -> " NIP6_FMT ":%hu\n", __func__, + NIP6(src_addr->sin6_addr), + ntohs(src_addr->sin6_port), + NIP6(((struct sockaddr_in6 *)uaddr)->sin6_addr), + ntohs(((struct sockaddr_in6 *)uaddr)->sin6_port)); + + return 0; +} +#endif + +static int sdp_ipv4_connect(struct sock *sk, struct sockaddr_storage *saddr, + struct sockaddr *uaddr, int addr_len) +{ + struct sdp_sock *ssk = sdp_sk(sk); + struct sockaddr_in *src_addr = (struct sockaddr_in *)saddr; + int rc; + + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + if (uaddr->sa_family == AF_INET_SDP) + uaddr->sa_family = AF_INET; + + if (uaddr->sa_family != AF_INET) + return -EAFNOSUPPORT; + + if (!ssk->id) { + rc = sdp_get_port(sk, htons(inet_num(sk))); + if (rc) + return rc; + inet_sport(sk) = htons(inet_num(sk)); + } + + src_addr->sin_family = AF_INET; + src_addr->sin_port = htons(inet_sport(sk)); + src_addr->sin_addr.s_addr = inet_saddr(sk); + + sdp_dbg(sk, "%s " NIPQUAD_FMT ":%hu -> " NIPQUAD_FMT ":%hu\n", __func__, + NIPQUAD(src_addr->sin_addr.s_addr), + ntohs(src_addr->sin_port), + NIPQUAD(((struct sockaddr_in *)uaddr)->sin_addr.s_addr), + ntohs(((struct sockaddr_in *)uaddr)->sin_port)); + + return 0; +} + +static int sdp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct sdp_sock *ssk = sdp_sk(sk); + struct sockaddr_storage src_addr = { 0 }; + int rc; + + sdp_add_to_history(sk, __func__); + ssk->cpu = smp_processor_id(); + release_sock(sk); + flush_workqueue(sdp_wq); + lock_sock(sk); + if (sk->sk_err) { + sdp_dbg(sk, "Can't connect, socket marked with error: %d\n", + sk->sk_err); + return -sk->sk_err; + } + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (inet6_sk(sk)) + rc = sdp_ipv6_connect(sk, &src_addr, uaddr, addr_len); + else +#endif + rc = sdp_ipv4_connect(sk, &src_addr, uaddr, addr_len); + + if (rc) + goto err; + + rc = rdma_resolve_addr(ssk->id, (struct sockaddr *)&src_addr, + uaddr, SDP_RESOLVE_TIMEOUT); + if (rc) { + sdp_dbg(sk, "rdma_resolve_addr failed: %d\n", rc); + goto err; + } + + sdp_exch_state(sk, TCPF_CLOSE, TCP_SYN_SENT); + + return rc; + +err: + sdp_dbg(sk, "Error: rc = %d\n", rc); + return rc; +} + +static int sdp_disconnect(struct sock *sk, int flags) +{ + struct sdp_sock *ssk = sdp_sk(sk); + int rc = 0; + struct sdp_sock *s, *t; + struct rdma_cm_id *id; + + sdp_dbg(sk, "%s\n", __func__); + + ssk->cpu = smp_processor_id(); + if (sk->sk_state != TCP_LISTEN) { + if (ssk->id) { + sdp_sk(sk)->qp_active = 0; + rc = rdma_disconnect(ssk->id); + } + + return rc; + } + + sdp_exch_state(sk, TCPF_LISTEN, TCP_CLOSE); + id = ssk->id; + ssk->id = NULL; + release_sock(sk); /* release socket since locking semantics is parent + inside child */ + if (id) + rdma_destroy_id(id); + + 
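+ /* Tear down every child connection still parked on the listener's
+  * backlog and accept queues; each child owns its own cm_id and is
+  * released through sdp_kill_id_and_release(). */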
list_for_each_entry_safe(s, t, &ssk->backlog_queue, backlog_queue) { + sdp_kill_id_and_release(s); + } + list_for_each_entry_safe(s, t, &ssk->accept_queue, accept_queue) { + sdp_kill_id_and_release(s); + } + + lock_sock(sk); + + return 0; +} + +/* Like inet_csk_wait_for_connect */ +static int sdp_wait_for_connect(struct sock *sk, long timeo) +{ + struct sdp_sock *ssk = sdp_sk(sk); + DEFINE_WAIT(wait); + int err; + + sdp_dbg(sk, "%s\n", __func__); + /* + * True wake-one mechanism for incoming connections: only + * one process gets woken up, not the 'whole herd'. + * Since we do not 'race & poll' for established sockets + * anymore, the common case will execute the loop only once. + * + * Subtle issue: "add_wait_queue_exclusive()" will be added + * after any current non-exclusive waiters, and we know that + * it will always _stay_ after any new non-exclusive waiters + * because all non-exclusive waiters are added at the + * beginning of the wait-queue. As such, it's ok to "drop" + * our exclusiveness temporarily when we get woken up without + * having to remove and re-insert us on the wait queue. + */ + for (;;) { + prepare_to_wait_exclusive(sdp_sk_sleep(sk), &wait, + TASK_INTERRUPTIBLE); + release_sock(sk); + if (list_empty(&ssk->accept_queue)) { + timeo = schedule_timeout(timeo); + } + lock_sock(sk); + err = 0; + if (!list_empty(&ssk->accept_queue)) + break; + err = -EINVAL; + if (sk->sk_state != TCP_LISTEN) + break; + err = sock_intr_errno(timeo); + if (signal_pending(current)) + break; + err = -EAGAIN; + if (!timeo) + break; + } + finish_wait(sdp_sk_sleep(sk), &wait); + sdp_dbg(sk, "%s returns %d\n", __func__, err); + return err; +} + +/* Consider using request_sock_queue instead of duplicating all this */ +/* Like inet_csk_accept */ +static struct sock *sdp_accept(struct sock *sk, int flags, int *err) +{ + struct sdp_sock *newssk = NULL, *ssk; + struct sock *newsk; + int error; + + sdp_add_to_history(sk, __func__); + sdp_dbg(sk, "%s state %s expected %s *err %d\n", __func__, + sdp_state_str(sk->sk_state), "TCP_LISTEN", *err); + + ssk = sdp_sk(sk); + lock_sock(sk); + ssk->cpu = smp_processor_id(); + + /* We need to make sure that this socket is listening, + * and that it has something pending. 
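+ * If nothing is queued yet, fail with -EAGAIN on a non-blocking
+ * socket, or wait in sdp_wait_for_connect() until a child
+ * connection arrives.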
+ */ + error = -EINVAL; + if (sk->sk_state != TCP_LISTEN) + goto out_err; + + /* Find already established connection */ + if (list_empty(&ssk->accept_queue)) { + long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + + /* If this is a non blocking socket don't sleep */ + error = -EAGAIN; + if (!timeo) + goto out_err; + + error = sdp_wait_for_connect(sk, timeo); + if (error) + goto out_err; + } + + newssk = list_entry(ssk->accept_queue.next, struct sdp_sock, + accept_queue); + list_del_init(&newssk->accept_queue); + newssk->parent = NULL; + sk_acceptq_removed(sk); + newsk = sk_ssk(newssk); +out: + release_sock(sk); + if (newsk) { + lock_sock(newsk); + if (newssk->rx_ring.cq) { + newssk->poll_cq = 1; + sdp_arm_rx_cq(sk_ssk(newssk)); + } + release_sock(newsk); + } + sdp_dbg(sk, "%s: status %d sk %p newsk %p\n", __func__, + *err, sk, newsk); + return newsk; +out_err: + sdp_dbg(sk, "%s: error %d\n", __func__, error); + newsk = NULL; + *err = error; + goto out; +} + +/* Like tcp_ioctl */ +static int sdp_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + struct sdp_sock *ssk = sdp_sk(sk); + int answ; + + sdp_add_to_history(sk, __func__); + sdp_dbg(sk, "%s\n", __func__); + + switch (cmd) { + case SIOCINQ: + if (sk->sk_state == TCP_LISTEN) + return -EINVAL; + + lock_sock(sk); + ssk->cpu = smp_processor_id(); + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) + answ = 0; + else if (sock_flag(sk, SOCK_URGINLINE) || + !ssk->urg_data || + before(ssk->urg_seq, ssk->copied_seq) || + !before(ssk->urg_seq, rcv_nxt(ssk))) { + answ = rcv_nxt(ssk) - ssk->copied_seq; + + /* Subtract 1, if FIN is in queue. */ + if (answ && !skb_queue_empty(&sk->sk_receive_queue)) + answ -= + (skb_transport_header(sk->sk_receive_queue.prev))[0] + == SDP_MID_DISCONN ? 1 : 0; + } else + answ = ssk->urg_seq - ssk->copied_seq; + release_sock(sk); + break; + case SIOCATMARK: + answ = ssk->urg_data && ssk->urg_seq == ssk->copied_seq; + break; + case SIOCOUTQ: + if (sk->sk_state == TCP_LISTEN) + return -EINVAL; + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) + answ = 0; + else + answ = ssk->write_seq - ssk->tx_ring.una_seq; + break; + default: + return -ENOIOCTLCMD; + } + /* TODO: Need to handle: + case SIOCOUTQ: + */ + return put_user(answ, (int __user *)arg); +} + +void sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk) +{ + if (!ssk->dreq_wait_timeout) + return; + + sdp_dbg(sk_ssk(ssk), "cancelling dreq wait timeout\n"); + + ssk->dreq_wait_timeout = 0; + if (cancel_delayed_work_sync(&ssk->dreq_wait_work)) { + /* The timeout hasn't reached - need to clean ref count */ + sock_put(sk_ssk(ssk), SOCK_REF_DREQ_TO); + } +} + +static void sdp_destroy_work(struct work_struct *work) +{ + struct sdp_sock *ssk = container_of(work, struct sdp_sock, + destroy_work); + struct sock *sk = sk_ssk(ssk); + sdp_dbg(sk, "%s: refcnt %d\n", __func__, atomic_read(&sk->sk_refcnt)); + + lock_sock(sk); + sdp_destroy_qp(ssk); + release_sock(sk); + + /* Can be sure that rx_comp_work won't be queued from here cause + * ssk->rx_ring.cq is NULL from here + */ + cancel_work_sync(&ssk->rx_comp_work); + + lock_sock(sk); + memset((void *)&ssk->id, 0, sizeof(*ssk) - offsetof(typeof(*ssk), id)); + release_sock(sk); + + sdp_cancel_dreq_wait_timeout(ssk); + + lock_sock(sk); + if (sk->sk_state == TCP_TIME_WAIT) { + if (sdp_cancel_cma_timewait_timeout(ssk)) + sock_put(sk, SOCK_REF_CMA); + } + + /* In normal close current state is TCP_TIME_WAIT or TCP_CLOSE + but if a CM connection is dropped below our legs state could + be any state */ + 
sdp_exch_state(sk, ~0, TCP_CLOSE); + release_sock(sk); + + sock_put(sk, SOCK_REF_RESET); +} + +static void sdp_dreq_wait_timeout_work(struct work_struct *work) +{ + struct sdp_sock *ssk = + container_of(work, struct sdp_sock, dreq_wait_work.work); + struct sock *sk = sk_ssk(ssk); + + if (!ssk->dreq_wait_timeout) + goto out; + + lock_sock(sk); + + if (!ssk->dreq_wait_timeout || + !((1 << sk->sk_state) & (TCPF_FIN_WAIT1 | TCPF_LAST_ACK))) { + release_sock(sk); + goto out; + } + + sdp_dbg(sk, "timed out waiting for FIN/DREQ. " + "going into abortive close.\n"); + + ssk->dreq_wait_timeout = 0; + sdp_exch_state(sk, TCPF_LAST_ACK | TCPF_FIN_WAIT1, TCP_TIME_WAIT); + + if (ssk->id) { + sdp_dbg(sk, "Destroyed QP\n"); + ssk->qp_active = 0; + rdma_disconnect(ssk->id); + release_sock(sk); + } else { + release_sock(sk); + sock_put(sk, SOCK_REF_CMA); + } + +out: + sock_put(sk, SOCK_REF_DREQ_TO); +} + +/* + * Only SDP interact with this receive queue. Don't want + * lockdep warnings that using spinlock irqsave + */ +static struct lock_class_key ib_sdp_sk_receive_queue_lock_key; + +static struct lock_class_key ib_sdp_sk_callback_lock_key; + +static void sdp_destroy_work(struct work_struct *work); +static void sdp_dreq_wait_timeout_work(struct work_struct *work); +static void sdp_cma_timewait_timeout_work(struct work_struct *work); + +atomic_t socket_idx = ATOMIC_INIT(0); + +int sdp_init_sock(struct sock *sk) +{ + struct sdp_sock *ssk = sdp_sk(sk); + + sdp_dbg(sk, "%s\n", __func__); + + ssk->sk_id = atomic_inc_return(&socket_idx); + + INIT_LIST_HEAD(&ssk->accept_queue); + INIT_LIST_HEAD(&ssk->backlog_queue); + INIT_DELAYED_WORK(&ssk->dreq_wait_work, sdp_dreq_wait_timeout_work); + INIT_DELAYED_WORK(&ssk->cma_timewait_work, sdp_cma_timewait_timeout_work); + INIT_WORK(&ssk->destroy_work, sdp_destroy_work); + + lockdep_set_class(&sk->sk_receive_queue.lock, + &ib_sdp_sk_receive_queue_lock_key); + + lockdep_set_class(&sk->sk_callback_lock, + &ib_sdp_sk_callback_lock_key); + + sk->sk_route_caps |= NETIF_F_SG | NETIF_F_NO_CSUM; + + skb_queue_head_init(&ssk->rx_ctl_q); + + atomic_set(&ssk->mseq_ack, 0); + + ssk->rx_ring.buffer = NULL; + ssk->tx_ring.buffer = NULL; + ssk->sdp_disconnect = 0; + ssk->destructed_already = 0; + ssk->id_destroyed_already = 0; + spin_lock_init(&ssk->lock); + spin_lock_init(&ssk->tx_sa_lock); + ssk->tx_compl_pending = 0; + + atomic_set(&ssk->somebody_is_doing_posts, 0); + ssk->cpu = smp_processor_id(); + ssk->tx_ring.rdma_inflight = NULL; + + init_timer(&ssk->rx_ring.cq_arm_timer); + init_timer(&ssk->tx_ring.timer); + init_timer(&ssk->nagle_timer); + init_timer(&sk->sk_timer); + setup_timer(&sk->sk_timer, sdp_keepalive_timer, (unsigned long)sk); + ssk->sa_cancel_arrived = 0; + ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */ + ssk->last_bind_err = 0; + +#ifdef SDP_SOCK_HISTORY + memset(ssk->hst, 0, sizeof ssk->hst); + ssk->hst_idx = 0; + spin_lock_init(&ssk->hst_lock); + sdp_ssk_hist_open(sk); +#endif + + return 0; +} + +static void sdp_shutdown(struct sock *sk, int how) +{ + struct sdp_sock *ssk = sdp_sk(sk); + + sdp_add_to_history(sk, __func__); + sdp_dbg(sk, "%s\n", __func__); + if (!(how & SEND_SHUTDOWN)) + return; + + /* If we've already sent a FIN, or it's a closed state, skip this. */ + if (!((1 << sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_SYN_SENT | + TCPF_SYN_RECV | TCPF_CLOSE_WAIT))) { + return; + } + + if (!sdp_close_state(sk)) + return; + + /* + * Just turn off CORK here. 
+ * We could check for socket shutting down in main data path, + * but this costs no extra cycles there. + */ + ssk->nonagle &= ~TCP_NAGLE_CORK; + if (ssk->nonagle & TCP_NAGLE_OFF) + ssk->nonagle |= TCP_NAGLE_PUSH; + + sdp_send_disconnect(sk); +} + +static void sdp_mark_push(struct sdp_sock *ssk, struct sk_buff *skb) +{ + SDP_SKB_CB(skb)->flags |= TCPHDR_PSH; + sdp_do_posts(ssk); +} + +static inline void sdp_push_pending_frames(struct sock *sk) +{ + struct sk_buff *skb = sk->sk_send_head; + if (skb) { + sdp_mark_push(sdp_sk(sk), skb); + } +} + +/* SOL_SOCKET level options are handled by sock_setsockopt */ +static int sdp_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, unsigned optlen) +{ + struct sdp_sock *ssk = sdp_sk(sk); + int val; + int err = 0; + + sdp_add_to_history(sk, __func__); + sdp_dbg(sk, "%s\n", __func__); + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + lock_sock(sk); + ssk->cpu = smp_processor_id(); + + /* SOCK_KEEPALIVE is really a SOL_SOCKET level option but there + * is a problem handling it at that level. In order to start + * the keepalive timer on an SDP socket, we must call an SDP + * specific routine. Since sock_setsockopt() can not be modifed + * to understand SDP, the application must pass that option + * through to us. Since SO_KEEPALIVE and TCP_DEFER_ACCEPT both + * use the same optname, the level must not be SOL_TCP or SOL_SOCKET + */ + if (level == PF_INET_SDP && optname == SO_KEEPALIVE) { + sdp_set_keepalive(sk, val); + if (val) + sock_set_flag(sk, SOCK_KEEPOPEN); + else + sock_reset_flag(sk, SOCK_KEEPOPEN); + goto out; + } + + if (level != SOL_TCP) { + err = -ENOPROTOOPT; + goto out; + } + + switch (optname) { + case TCP_NODELAY: + if (val) { + /* TCP_NODELAY is weaker than TCP_CORK, so that + * this option on corked socket is remembered, but + * it is not activated until cork is cleared. + * + * However, when TCP_NODELAY is set we make + * an explicit push, which overrides even TCP_CORK + * for currently queued segments. + */ + ssk->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; + sdp_push_pending_frames(sk); + } else { + ssk->nonagle &= ~TCP_NAGLE_OFF; + } + break; + case TCP_CORK: + /* When set indicates to always queue non-full frames. + * Later the user clears this option and we transmit + * any pending partial frames in the queue. This is + * meant to be used alongside sendfile() to get properly + * filled frames when the user (for example) must write + * out headers with a write() call first and then use + * sendfile to send out the data parts. + * + * TCP_CORK can be set together with TCP_NODELAY and it is + * stronger than TCP_NODELAY. 
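+ *
+ * A typical user-space sequence is, e.g.:
+ *   setsockopt(fd, SOL_TCP, TCP_CORK, &one, sizeof(one));
+ *   write(fd, hdr, hdr_len);
+ *   sendfile(fd, file_fd, NULL, file_len);
+ *   setsockopt(fd, SOL_TCP, TCP_CORK, &zero, sizeof(zero));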
+ */ + if (val) { + ssk->nonagle |= TCP_NAGLE_CORK; + } else { + ssk->nonagle &= ~TCP_NAGLE_CORK; + if (ssk->nonagle&TCP_NAGLE_OFF) + ssk->nonagle |= TCP_NAGLE_PUSH; + sdp_push_pending_frames(sk); + } + break; + case TCP_KEEPIDLE: + if (val < 1 || val > MAX_TCP_KEEPIDLE) + err = -EINVAL; + else { + ssk->keepalive_time = val * HZ; + + if (sock_flag(sk, SOCK_KEEPOPEN) && + !((1 << sk->sk_state) & + (TCPF_CLOSE | TCPF_LISTEN))) { + sdp_reset_keepalive_timer(sk, + ssk->keepalive_time); + } + } + break; + case SDP_ZCOPY_THRESH: + if (val != 0 && (val < SDP_MIN_ZCOPY_THRESH || + val > SDP_MAX_ZCOPY_THRESH)) + err = -EINVAL; + else + ssk->zcopy_thresh = val; + break; + default: + err = -ENOPROTOOPT; + break; + } + +out: + release_sock(sk); + return err; +} + +/* SOL_SOCKET level options are handled by sock_getsockopt */ +static int sdp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *option) +{ + /* TODO */ + struct sdp_sock *ssk = sdp_sk(sk); + int val, len; + + sdp_add_to_history(sk, __func__); + sdp_dbg(sk, "%s\n", __func__); + + if (level != SOL_TCP) + return -EOPNOTSUPP; + + if (get_user(len, option)) + return -EFAULT; + + len = min_t(unsigned int, len, sizeof(int)); + + if (len < 0) + return -EINVAL; + + switch (optname) { + case TCP_NODELAY: + val = !!(ssk->nonagle&TCP_NAGLE_OFF); + break; + case TCP_CORK: + val = !!(ssk->nonagle&TCP_NAGLE_CORK); + break; + case TCP_KEEPIDLE: + val = (ssk->keepalive_time ? : sdp_keepalive_time) / HZ; + break; + case TCP_MAXSEG: + val = ssk->xmit_size_goal; + break; + case SDP_ZCOPY_THRESH: + val = ssk->zcopy_thresh; + break; + case SDP_LAST_BIND_ERR: + val = ssk->last_bind_err; + break; + default: + return -ENOPROTOOPT; + } + + if (put_user(len, option)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + return 0; +} + +static inline int cycles_before(cycles_t a, cycles_t b) +{ + /* cycles_t is unsigned, but may be int/long/long long. */ + + if (sizeof(cycles_t) == 4) + return before(a, b); + else + return (s64)(a - b) < 0; +} + +static inline cycles_t sdp_usec_to_cycles(int usecs) +{ +#ifdef CONFIG_PPC + return usecs * tb_ticks_per_usec; +#elif defined(__ia64__) + return usecs * local_cpu_data->cyc_per_usec; +#else + return usecs * cpu_khz / 1000; +#endif +} + +static inline int poll_recv_cq(struct sock *sk) +{ + cycles_t start = get_cycles(); + cycles_t end = start + sdp_usec_to_cycles(recv_poll); + + sdp_prf(sk, NULL, "polling recv"); + + if (unlikely(!sdp_sk(sk)->rx_ring.cq)) + return 0; + + do { + if (sdp_poll_rx_cq(sdp_sk(sk))) { + SDPSTATS_COUNTER_INC(rx_poll_hit); + SDPSTATS_HIST(poll_hit_usec, sdp_cycles_to_usecs( + (unsigned long)(get_cycles() - start))); + return 0; + } + } while (cycles_before(get_cycles(), end)); + + SDPSTATS_COUNTER_INC(rx_poll_miss); + return 1; +} + +/* Like tcp_recv_urg */ +/* + * Handle reading urgent data. BSD has very simple semantics for + * this, no blocking and very strange errors 8) + */ + +static int sdp_recv_urg(struct sock *sk, long timeo, + struct msghdr *msg, int len, int flags, + int *addr_len) +{ + struct sdp_sock *ssk = sdp_sk(sk); + + poll_recv_cq(sk); + + /* No URG data to read. */ + if (sock_flag(sk, SOCK_URGINLINE) || !ssk->urg_data || + ssk->urg_data == TCP_URG_READ) + return -EINVAL; /* Yes this is right ! 
*/ + + if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE)) + return -ENOTCONN; + + if (ssk->urg_data & TCP_URG_VALID) { + int err = 0; + char c = ssk->urg_data; + + if (!(flags & MSG_PEEK)) + ssk->urg_data = TCP_URG_READ; + + /* Read urgent data. */ + msg->msg_flags |= MSG_OOB; + + if (len > 0) { + if (!(flags & MSG_TRUNC)) + err = memcpy_toiovec(msg->msg_iov, &c, 1); + len = 1; + } else + msg->msg_flags |= MSG_TRUNC; + + return err ? -EFAULT : len; + } + + if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) + return 0; + + /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and + * the available implementations agree in this case: + * this call should never block, independent of the + * blocking state of the socket. + * Mike + */ + return -EAGAIN; +} + +static inline void sdp_mark_urg(struct sock *sk, int flags) +{ + if (unlikely(flags & MSG_OOB)) { + struct sk_buff *skb = sk->sk_write_queue.prev; + SDP_SKB_CB(skb)->flags |= TCPHDR_URG; + } +} + +static inline void sdp_push(struct sock *sk, int flags) +{ + if (sk->sk_send_head) + sdp_mark_urg(sk, flags); + sdp_do_posts(sdp_sk(sk)); +} + +void sdp_skb_entail(struct sock *sk, struct sk_buff *skb) +{ + __skb_queue_tail(&sk->sk_write_queue, skb); + sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(sk, skb->truesize); + if (!sk->sk_send_head) + sk->sk_send_head = skb; + if (sdp_sk(sk)->nonagle & TCP_NAGLE_PUSH) + sdp_sk(sk)->nonagle &= ~TCP_NAGLE_PUSH; +} + +#define TCP_PAGE(sk) (sk->sk_sndmsg_page) +#define TCP_OFF(sk) (sk->sk_sndmsg_off) +static inline int sdp_bcopy_get(struct sock *sk, struct sk_buff *skb, + char __user *from, int copy) +{ + int err; + struct sdp_sock *ssk = sdp_sk(sk); + + /* Where to copy to? */ + if (skb_tailroom(skb) > 0) { + /* We have some space in skb head. Superb! */ + if (copy > skb_tailroom(skb)) + copy = skb_tailroom(skb); + if ((err = skb_add_data(skb, from, copy)) != 0) + return SDP_ERR_FAULT; + } else { + /* Put data in skb->frags */ + int merge = 0; + int i = skb_shinfo(skb)->nr_frags; + struct page *page = TCP_PAGE(sk); + int off = TCP_OFF(sk); + + if (skb_can_coalesce(skb, i, page, off) && + off != PAGE_SIZE) { + /* We can extend the last page + * fragment. */ + merge = 1; + } else if (i == ssk->send_frags) { + /* Need to add new fragment and cannot + * do this because all the page slots are + * busy. */ + sdp_mark_push(ssk, skb); + return SDP_NEW_SEG; + } else if (page) { + if (off == PAGE_SIZE) { + put_page(page); + TCP_PAGE(sk) = page = NULL; + off = 0; + } + } else + off = 0; + + if (copy > PAGE_SIZE - off) + copy = PAGE_SIZE - off; + + if (!sk_wmem_schedule(sk, copy)) + return SDP_DO_WAIT_MEM; + + if (!page) { + /* Allocate new cache page. */ + page = sk_stream_alloc_page(sk); + if (!page) + return SDP_DO_WAIT_MEM; + } + + /* Time to copy data. We are close to + * the end! */ + SDPSTATS_COUNTER_ADD(memcpy_count, copy); + err = skb_copy_to_page(sk, from, skb, page, + off, copy); + if (err) { + /* If this page was new, give it to the + * socket so it does not get leaked. + */ + if (!TCP_PAGE(sk)) { + TCP_PAGE(sk) = page; + TCP_OFF(sk) = 0; + } + return SDP_ERR_ERROR; + } + + /* Update the skb. 
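+ * Either grow the last page fragment (merge case) or attach the
+ * page as a new fragment, taking an extra reference when the page
+ * is kept cached in TCP_PAGE(sk) for the next copy.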
*/ + if (merge) { + skb_shinfo(skb)->frags[i - 1].size += copy; + } else { + skb_fill_page_desc(skb, i, page, off, copy); + if (TCP_PAGE(sk)) { + get_page(page); + } else if (off + copy < PAGE_SIZE) { + get_page(page); + TCP_PAGE(sk) = page; + } + } + + TCP_OFF(sk) = off + copy; + } + + return copy; +} + +/* like sk_stream_wait_memory - except: + * - if credits_needed provided - wait for enough credits + * - TX irq will use this (in sendmsg context) to do the actual tx + * comp poll and post + */ +int sdp_tx_wait_memory(struct sdp_sock *ssk, long *timeo_p, int *credits_needed) +{ + struct sock *sk = sk_ssk(ssk); + int err = 0; + long vm_wait = 0; + long current_timeo = *timeo_p; + DEFINE_WAIT(wait); + + if (sk_stream_memory_free(sk)) + current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2; + + while (1) { + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + prepare_to_wait(sdp_sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + goto do_error; + if (!*timeo_p) + goto do_nonblock; + if (signal_pending(current)) + goto do_interrupted; + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + sdp_do_posts(ssk); + + if (credits_needed) { + if (tx_slots_free(ssk) >= *credits_needed) + break; + } else { + if (sk_stream_memory_free(sk) && !vm_wait) + break; + } + + /* Before going to sleep, make sure no credit update is missed, + * rx_cq will be armed now. */ + posts_handler_put(ssk, 0); + + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + sk->sk_write_pending++; + + sdp_prf1(sk, NULL, "Going to sleep"); + + if (tx_credits(ssk) > SDP_MIN_TX_CREDITS) + sdp_arm_tx_cq(sk); + + if (credits_needed) { + sk_wait_event(sk, ¤t_timeo, + !sk->sk_err && + !(sk->sk_shutdown & SEND_SHUTDOWN) && + !ssk->tx_compl_pending && + tx_slots_free(ssk) >= *credits_needed && + vm_wait); + } else { + sk_wait_event(sk, ¤t_timeo, + !sk->sk_err && + !(sk->sk_shutdown & SEND_SHUTDOWN) && + !ssk->tx_compl_pending && + sk_stream_memory_free(sk) && + tx_credits(ssk) > SDP_MIN_TX_CREDITS && + vm_wait); + } + + sdp_prf(sk, NULL, "Woke up. memfree: %d", sk_stream_memory_free(sk)); + sk->sk_write_pending--; + + posts_handler_get(ssk); + + if (!ssk->qp_active) + goto do_error; + + if (vm_wait) { + vm_wait -= current_timeo; + current_timeo = *timeo_p; + if (current_timeo != MAX_SCHEDULE_TIMEOUT && + (current_timeo -= vm_wait) < 0) + current_timeo = 0; + vm_wait = 0; + } + *timeo_p = current_timeo; + } +out: + finish_wait(sdp_sk_sleep(sk), &wait); + return err; + +do_error: + err = -EPIPE; + goto out; +do_nonblock: + err = -EAGAIN; + goto out; +do_interrupted: + err = sock_intr_errno(*timeo_p); + goto out; +} + +/* Like tcp_sendmsg */ +/* TODO: check locking */ +static int sdp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t size) +{ + int i; + struct sdp_sock *ssk = sdp_sk(sk); + struct sk_buff *skb; + int flags; + const int size_goal = MIN(ssk->xmit_size_goal, SDP_MAX_PAYLOAD); + int err, copied; + long timeo; + int zcopy_thresh = + -1 != ssk->zcopy_thresh ? ssk->zcopy_thresh : sdp_zcopy_thresh; + + SDPSTATS_COUNTER_INC(sendmsg); + + lock_sock(sk); + ssk->cpu = smp_processor_id(); + sdp_dbg_data(sk, "%s size = 0x%zx\n", __func__, size); + + posts_handler_get(ssk); + SDP_WARN_ON(ssk->tx_sa); + + flags = msg->msg_flags; + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + + /* Wait for a connection to finish. 
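+ * Sending is only allowed in ESTABLISHED or CLOSE_WAIT; in any
+ * other state we block in sk_stream_wait_connect() until the
+ * connection completes or the send timeout expires.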
*/ + if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) + if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) + goto out_err; + + /* This should be in poll */ + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + /* Ok commence sending. */ + copied = 0; + + err = -EPIPE; + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + goto do_error; + + for (i = 0; i < msg->msg_iovlen; i++) { + struct iovec *iov = &msg->msg_iov[i]; + int seglen = iov->iov_len; + char __user *from = iov->iov_base; + + sdp_dbg_data(sk, "Sending iov: 0x%x/0x%zx %p\n", i, msg->msg_iovlen, from); + + SDPSTATS_HIST(sendmsg_seglen, seglen); + + if (zcopy_thresh && seglen > zcopy_thresh && + seglen > SDP_MIN_ZCOPY_THRESH && + tx_slots_free(ssk) && ssk->sdp_dev && + ssk->sdp_dev->fmr_pool && !(flags & MSG_OOB)) { + int zcopied = 0; + + zcopied = sdp_sendmsg_zcopy(iocb, sk, iov); + + if (zcopied < 0) { + sdp_dbg_data(sk, "ZCopy send err: %d\n", zcopied); + err = zcopied; + goto out_err; + } + + copied += zcopied; + seglen = iov->iov_len; + from = iov->iov_base; + + sdp_dbg_data(sk, "ZCopied: 0x%x/0x%x\n", zcopied, seglen); + } + + SDPSTATS_COUNTER_INC(sendmsg_bcopy_segment); + + while (seglen > 0) { + int copy; + + skb = sk->sk_write_queue.prev; + + if (!sk->sk_send_head || + (copy = size_goal - (skb->len - sizeof(struct sdp_bsdh))) <= 0) { +new_segment: + /* + * Allocate a new segment + * For bcopy, we stop sending once we have + * SO_SENDBUF bytes in flight. For bzcopy + * we stop sending once we run out of remote + * receive credits. + */ + if (unlikely(!sk_stream_memory_free(sk))) { + if (!poll_recv_cq(sk)) + sdp_do_posts(ssk); + if ((!sk_stream_memory_free(sk))) + goto wait_for_sndbuf; + } + + skb = sdp_alloc_skb_data(sk, min(seglen, size_goal), 0); + if (!skb) { + err = -ENOMEM; + goto do_error; + } + + /* + * Check whether we can use HW checksum. + */ + if (sk->sk_route_caps & + (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | + NETIF_F_HW_CSUM)) + skb->ip_summed = CHECKSUM_PARTIAL; + + sdp_skb_entail(sk, skb); + copy = size_goal; + + sdp_dbg_data(sk, "created new skb: %p" + " len = 0x%zx, sk_send_head: %p " + "copy: 0x%x size_goal: 0x%x\n", + skb, skb->len - sizeof(struct sdp_bsdh), + sk->sk_send_head, copy, size_goal); + + + } else { + sdp_dbg_data(sk, "adding to existing skb: %p" + " len = 0x%zx, sk_send_head: %p " + "copy: 0x%x\n", + skb, skb->len - sizeof(struct sdp_bsdh), + sk->sk_send_head, copy); + } + + /* Try to append data to the end of skb. */ + if (copy > seglen) + copy = seglen; + + copy = sdp_bcopy_get(sk, skb, from, copy); + + if (unlikely(copy < 0)) { + switch (copy) { + case SDP_DO_WAIT_MEM: + goto wait_for_sndbuf; + case SDP_NEW_SEG: + goto new_segment; + case SDP_ERR_FAULT: + goto do_fault; + default: + goto do_error; + } + } + + if (!copied) + SDP_SKB_CB(skb)->flags &= ~TCPHDR_PSH; + + ssk->write_seq += copy; + SDP_SKB_CB(skb)->end_seq += copy; + /*unused: skb_shinfo(skb)->gso_segs = 0;*/ + + from += copy; + copied += copy; + seglen -= copy; + continue; + +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + sdp_prf(sk, skb, "wait for mem. 
credits: %d", tx_credits(ssk)); + SDPSTATS_COUNTER_INC(send_wait_for_mem); + if (copied) + sdp_push(sk, flags & ~MSG_MORE); + + err = sdp_tx_wait_memory(ssk, &timeo, NULL); + if (err) + goto do_error; + } + } + +out: + if (copied) + sdp_push(sk, flags); + + sdp_auto_moderation(ssk); + + err = copied; + + sdp_dbg_data(sk, "copied: 0x%x\n", copied); + if (copied > 0) + SDPSTATS_COUNTER_ADD(tx_bytes, copied); + + goto fin; + +do_fault: + sdp_prf(sk, skb, "prepare fault"); + + if (skb->len <= sizeof(struct sdp_bsdh)) { + if (sk->sk_send_head == skb) + sk->sk_send_head = NULL; + __skb_unlink(skb, &sk->sk_write_queue); + sk_wmem_free_skb(sk, skb); + } + +do_error: + if (copied) + goto out; +out_err: + err = sk_stream_error(sk, flags, err); + sdp_dbg_data(sk, "err: %d\n", err); + +fin: + posts_handler_put(ssk, SDP_RX_ARMING_DELAY); + + if (!err && !ssk->qp_active) { + err = -EPIPE; + sdp_set_error(sk, err); + sdp_dbg(sk, "can't send anymore\n"); + } + + + sk_mem_reclaim(sk); + + release_sock(sk); + + return err; +} + +int sdp_abort_rx_srcavail(struct sock *sk, int post_sendsm) +{ + struct sdp_sock *ssk = sdp_sk(sk); + struct rx_srcavail_state *rx_sa = ssk->rx_sa; + struct sk_buff *skb = rx_sa->skb; + struct sdp_bsdh *h = + (struct sdp_bsdh *)skb_transport_header(skb); + + sdp_dbg_data(sk, "SrcAvail aborted\n"); + + ssk->rx_sa = NULL; + + h->mid = SDP_MID_DATA; + + sdp_post_rdma_rd_compl(sk, rx_sa); + if (post_sendsm) + sdp_post_sendsm(sk); + + /* arriving SrcAvailCancel might be handled by sdp_do_posts(). Must set + * ssk->rx_sa to NULL before, to prevent it from reentering this + * function. + * XXX: Coming to think of that, Why to call sdp_do_posts() and not + * sdp_post_sends()? */ + sdp_do_posts(ssk); + + RX_SRCAVAIL_STATE(skb) = NULL; + kfree(rx_sa); + + return 0; +} + +/* Like tcp_recvmsg */ +/* Maybe use skb_recv_datagram here? */ +/* Note this does not seem to handle vectored messages. Relevant? */ +static int sdp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int noblock, int flags, + int *addr_len) +{ + struct sk_buff *skb = NULL; + struct sdp_sock *ssk = sdp_sk(sk); + long timeo; + int target; + unsigned long used; + int err; + u32 peek_seq; + u32 *seq; + int copied = 0; + int avail_bytes_count = 0; /* Could be inlined in skb */ + /* or advertised for RDMA */ + SDPSTATS_COUNTER_INC(recvmsg); + + lock_sock(sk); + ssk->cpu = smp_processor_id(); + sdp_dbg_data(sk, "iovlen: %zd iov_len: 0x%zx flags: 0x%x peek: 0x%x\n", + msg->msg_iovlen, msg->msg_iov[0].iov_len, flags, + MSG_PEEK); + + posts_handler_get(ssk); + + err = -ENOTCONN; + if (sk->sk_state == TCP_LISTEN) + goto out; + + timeo = sock_rcvtimeo(sk, noblock); + /* Urgent data needs to be handled specially. */ + if (flags & MSG_OOB) + goto recv_urg; + + seq = &ssk->copied_seq; + if (flags & MSG_PEEK) { + peek_seq = ssk->copied_seq; + seq = &peek_seq; + } + + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + + do { + struct rx_srcavail_state *rx_sa = NULL; + u32 offset; + + /* Are we at urgent data? Stop if we have read anything or have + * SIGURG pending. */ + if (ssk->urg_data && ssk->urg_seq == *seq) { + if (copied) + break; + if (signal_pending(current)) { + copied = timeo ? 
sock_intr_errno(timeo) : + -EAGAIN; + break; + } + } + + skb = skb_peek(&sk->sk_receive_queue); + do { + struct sdp_bsdh *h; + if (!skb) + break; + + offset = *seq - SDP_SKB_CB(skb)->seq; + avail_bytes_count = 0; + + h = (struct sdp_bsdh *)skb_transport_header(skb); + + switch (h->mid) { + case SDP_MID_DISCONN: + if (flags & MSG_PEEK) { + /* There is no point of handling a + * remote disconnection request while + * MSG_PEEK. The remote disconnection + * request will be handled upon regular + * recv. */ + goto got_disconn_in_peek; + } + sdp_dbg(sk, "Handle RX SDP_MID_DISCONN\n"); + sdp_prf(sk, NULL, "Handle RX SDP_MID_DISCONN"); + sdp_handle_disconn(sk); + goto found_fin_ok; + + case SDP_MID_SRCAVAIL: + rx_sa = RX_SRCAVAIL_STATE(skb); + if (unlikely(!rx_sa)) { + /* SrcAvailCancel arrived and handled */ + h->mid = SDP_MID_DATA; + goto check_srcavail_skb; + } + + if (sdp_chk_sa_cancel(ssk, rx_sa) || + !ssk->sdp_dev || + !ssk->sdp_dev->fmr_pool) { + sdp_dbg_data(sk, "Aborting SA " + "due to SACancel or " + "no fmr pool\n"); + sdp_abort_rx_srcavail(sk, 1); + rx_sa = NULL; +check_srcavail_skb: + if (offset < skb->len) { + sdp_prf(sk, skb, "Converted SA to DATA"); + goto sdp_mid_data; + } else { + sdp_prf(sk, skb, "Cancelled SA with no payload left"); + goto skb_cleanup; + } + } + + /* if has payload - handle as if MID_DATA */ + if (offset < skb->len) { + sdp_dbg_data(sk, "SrcAvail has " + "payload: %d/%d\n", + offset, + skb->len); + avail_bytes_count = skb->len; + } else { + sdp_dbg_data(sk, "Finished payload. " + "RDMAing: %d/%d\n", + offset, rx_sa->len); + + if (flags & MSG_PEEK) { + u32 real_offset = + ssk->copied_seq - + SDP_SKB_CB(skb)->seq; + sdp_dbg_data(sk, "Peek on RDMA data - " + "fallback to BCopy\n"); + sdp_abort_rx_srcavail(sk, 1); + rx_sa = NULL; + if (real_offset >= skb->len) + goto force_skb_cleanup; + } else { + avail_bytes_count = rx_sa->len; + } + } + + break; + + case SDP_MID_DATA: +sdp_mid_data: + rx_sa = NULL; + avail_bytes_count = skb->len; + break; + default: + break; + } + + if (before(*seq, SDP_SKB_CB(skb)->seq)) { + sdp_warn(sk, "skb: %p recvmsg bug: copied %X seq %X\n", + skb, *seq, SDP_SKB_CB(skb)->seq); + sdp_reset(sk); + break; + } + + if (offset < avail_bytes_count) + goto found_ok_skb; + + if (unlikely(!(flags & MSG_PEEK))) { + /* Could happen when SrcAvail was canceled + * and transformed into DATA SKB */ + goto skb_cleanup; + } + + SDP_WARN_ON(h->mid == SDP_MID_SRCAVAIL); + + skb = skb->next; + } while (skb != (struct sk_buff *)&sk->sk_receive_queue); + + if (copied >= target) + break; + + if (copied) { + if (sk->sk_err || + sk->sk_state == TCP_CLOSE || + (sk->sk_shutdown & RCV_SHUTDOWN) || + !timeo || + signal_pending(current) || + (flags & MSG_PEEK)) + break; + } else { + if (sock_flag(sk, SOCK_DONE)) + break; + + if (sk->sk_err) { + copied = sock_error(sk); + break; + } + + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + + if (sk->sk_state == TCP_CLOSE) { + if (!sock_flag(sk, SOCK_DONE)) { + /* This occurs when user tries to read + * from never connected socket. 
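+ * Report -ENOTCONN rather than a clean end of stream.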
+ */ + copied = -ENOTCONN; + break; + } + break; + } + + if (!timeo) { + copied = -EAGAIN; + break; + } + + if (signal_pending(current)) { + copied = sock_intr_errno(timeo); + break; + } + } + + if (poll_recv_cq(sk)) { + sdp_dbg_data(sk, "sk_wait_data %ld\n", timeo); + posts_handler_put(ssk, 0); + + sk_wait_data(sk, &timeo); + posts_handler_get(ssk); + + sdp_dbg_data(sk, "got data/timeout\n"); + } + sdp_do_posts(ssk); + continue; + + found_ok_skb: + sdp_dbg_data(sk, "bytes avail: %d\n", avail_bytes_count); + sdp_dbg_data(sk, "buf len %Zd offset %d\n", len, offset); + sdp_dbg_data(sk, "copied %d target %d\n", copied, target); + used = avail_bytes_count - offset; + if (len < used) + used = len; + + sdp_dbg_data(sk, "%s: used %ld\n", __func__, used); + + if (ssk->urg_data) { + u32 urg_offset = ssk->urg_seq - *seq; + if (urg_offset < used) { + if (!urg_offset) { + if (!sock_flag(sk, SOCK_URGINLINE)) { + ++*seq; + offset++; + used--; + if (!used) + goto skip_copy; + } + } else + used = urg_offset; + } + } + if (!(flags & MSG_TRUNC)) { + if (rx_sa && offset >= skb->len) { + /* No more payload - start rdma copy */ + sdp_dbg_data(sk, "RDMA copy of 0x%lx bytes\n", used); + err = sdp_rdma_to_iovec(sk, msg->msg_iov, msg->msg_iovlen, skb, + &used, offset); + if (unlikely(err)) { + /* ssk->rx_sa might had been freed when + * we slept. */ + if (ssk->rx_sa) + sdp_abort_rx_srcavail(sk, 1); + rx_sa = NULL; + if (err == -EAGAIN || err == -ETIME) + goto skb_cleanup; + sdp_warn(sk, "err from rdma %d - sendSM\n", err); + skb_unlink(skb, &sk->sk_receive_queue); + __kfree_skb(skb); + } + } else { + sdp_dbg_data(sk, "memcpy 0x%lx bytes +0x%x -> %p\n", + used, offset, msg->msg_iov[0].iov_base); + + err = skb_copy_datagram_iovec(skb, offset, + /* TODO: skip header? */ + msg->msg_iov, used); + if (rx_sa && !(flags & MSG_PEEK)) { + rx_sa->copied += used; + rx_sa->reported += used; + } + } + if (err) { + sdp_dbg(sk, "%s: data copy failed" + "offset %d size %ld status %d\n", + __func__, offset, used, err); + /* Exception. Bailout! */ + if (!copied) + copied = err; + break; + } + } + + copied += used; + len -= used; + *seq += used; + offset = *seq - SDP_SKB_CB(skb)->seq; + sdp_dbg_data(sk, "done copied 0x%x target 0x%x\n", copied, target); + + sdp_do_posts(sdp_sk(sk)); + if (rx_sa && !ssk->rx_sa) { + /* SrcAvail canceled. Must not access local rx_sa */ + rx_sa = NULL; + } +skip_copy: + if (ssk->urg_data && after(ssk->copied_seq, ssk->urg_seq)) + ssk->urg_data = 0; + + + if (rx_sa && !(flags & MSG_PEEK)) { + sdp_post_rdma_rd_compl(sk, rx_sa); + sdp_post_sends(ssk, 0); + } + + if (!rx_sa && offset < skb->len) + continue; + + if (rx_sa && offset < rx_sa->len) + continue; + + offset = 0; + +skb_cleanup: + if (!(flags & MSG_PEEK)) { + struct sdp_bsdh *h; + h = (struct sdp_bsdh *)skb_transport_header(skb); + sdp_prf1(sk, skb, "READ finished. mseq: %d mseq_ack:%d", + ntohl(h->mseq), ntohl(h->mseq_ack)); + + if (rx_sa) { + /* ssk->rx_sa might had been freed when we slept. 
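+ * Re-check ssk->rx_sa itself before aborting; the stale local
+ * rx_sa pointer must not be dereferenced here.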
+ */ + if (ssk->rx_sa) + sdp_abort_rx_srcavail(sk, 0); + rx_sa = NULL; + } +force_skb_cleanup: + sdp_dbg_data(sk, "unlinking skb %p\n", skb); + skb_unlink(skb, &sk->sk_receive_queue); + __kfree_skb(skb); + } + continue; +found_fin_ok: + ++*seq; + if (!(flags & MSG_PEEK)) { + skb_unlink(skb, &sk->sk_receive_queue); + __kfree_skb(skb); + } + break; + + } while (len > 0); + +got_disconn_in_peek: + err = copied; + if (copied > 0) + SDPSTATS_COUNTER_ADD(rx_bytes, copied); +out: + + posts_handler_put(ssk, SDP_RX_ARMING_DELAY); + + sdp_auto_moderation(ssk); + + if (!err && !ssk->qp_active) { + err = -EPIPE; + sdp_set_error(sk, err); + sdp_dbg(sk, "data won't be available anymore\n"); + } + + + sk_mem_reclaim(sk); + + release_sock(sk); + sdp_dbg_data(sk, "recvmsg finished. ret = %d\n", err); + return err; + +recv_urg: + err = sdp_recv_urg(sk, timeo, msg, len, flags, addr_len); + goto out; +} + +static int sdp_listen(struct sock *sk, int backlog) +{ + struct sdp_sock *ssk = sdp_sk(sk); + int rc; + + sdp_dbg(sk, "%s\n", __func__); + sdp_add_to_history(sk, __func__); + + if (!ssk->id) { + rc = sdp_get_port(sk, 0); + if (rc) + return rc; + inet_sport(sk) = htons(inet_num(sk)); + } + + rc = rdma_listen(ssk->id, backlog); + if (rc) { + sdp_dbg(sk, "rdma_listen failed: %d\n", rc); + sdp_set_error(sk, rc); + } else + sdp_exch_state(sk, TCPF_CLOSE, TCP_LISTEN); + return rc; +} + +/* We almost could use inet_listen, but that calls + inet_csk_listen_start. Longer term we'll want to add + a listen callback to struct proto, similiar to bind. */ +static int sdp_inet_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + unsigned char old_state; + int err; + + lock_sock(sk); + sdp_sk(sk)->cpu = smp_processor_id(); + + err = -EINVAL; + if (sock->state != SS_UNCONNECTED) + goto out; + + old_state = sk->sk_state; + if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) + goto out; + + /* Really, if the socket is already in listen state + * we can only allow the backlog to be adjusted. + */ + if (old_state != TCP_LISTEN) { + err = sdp_listen(sk, backlog); + if (err) + goto out; + } + sk->sk_max_ack_backlog = backlog; + err = 0; + +out: + release_sock(sk); + return err; +} + +static void sdp_unhash(struct sock *sk) +{ + sdp_dbg(sk, "%s\n", __func__); +} + +static inline unsigned int sdp_listen_poll(const struct sock *sk) +{ + return !list_empty(&sdp_sk(sk)->accept_queue) ? + (POLLIN | POLLRDNORM) : 0; +} + +static unsigned int sdp_poll(struct file *file, struct socket *socket, + struct poll_table_struct *wait) +{ + unsigned int mask; + struct sock *sk = socket->sk; + + lock_sock(sk); + sdp_sk(sk)->cpu = smp_processor_id(); + + if (sk->sk_state == TCP_ESTABLISHED) { + sdp_do_posts(sdp_sk(sk)); + } + mask = datagram_poll(file, socket, wait); + if (!(mask & POLLIN)) + sdp_arm_rx_cq(sk); + + /* + * Adjust for memory in later kernels + */ + if (!sk_stream_memory_free(sk)) + mask &= ~(POLLOUT | POLLWRNORM | POLLWRBAND); + + /* TODO: Slightly ugly: it would be nicer if there was function + * like datagram_poll that didn't include poll_wait, + * then we could reverse the order. 
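+ * For listening sockets the datagram_poll() result is discarded
+ * below and replaced by sdp_listen_poll(), which reports POLLIN
+ * only while the accept queue is non-empty.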
*/ + if (sk->sk_state == TCP_LISTEN) { + mask = sdp_listen_poll(sk); + goto out; + } + + if (sdp_sk(sk)->urg_data & TCP_URG_VALID) + mask |= POLLPRI; +out: + release_sock(sk); + return mask; +} + +static void sdp_enter_memory_pressure(struct sock *sk) +{ + sdp_dbg(sk, "%s\n", __func__); +} + +void sdp_urg(struct sdp_sock *ssk, struct sk_buff *skb) +{ + struct sock *sk = sk_ssk(ssk); + u8 tmp; + u32 ptr = skb->len - 1; + + ssk->urg_seq = SDP_SKB_CB(skb)->seq + ptr; + + if (skb_copy_bits(skb, ptr, &tmp, 1)) + BUG(); + ssk->urg_data = TCP_URG_VALID | tmp; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, 0); +} + +static struct percpu_counter *sockets_allocated; +static atomic_long_t memory_allocated; +static struct percpu_counter *orphan_count; +static int memory_pressure; +struct proto sdp_proto = { + .close = sdp_close, + .connect = sdp_connect, + .disconnect = sdp_disconnect, + .accept = sdp_accept, + .ioctl = sdp_ioctl, + .init = sdp_init_sock, + .shutdown = sdp_shutdown, + .setsockopt = sdp_setsockopt, + .getsockopt = sdp_getsockopt, + .sendmsg = sdp_sendmsg, + .recvmsg = sdp_recvmsg, + .unhash = sdp_unhash, + .get_port = sdp_get_port, + /* Wish we had this: .listen = sdp_listen */ + .enter_memory_pressure = sdp_enter_memory_pressure, + .memory_allocated = &memory_allocated, + .memory_pressure = &memory_pressure, + .sysctl_mem = sysctl_tcp_mem, + .sysctl_wmem = sysctl_tcp_wmem, + .sysctl_rmem = sysctl_tcp_rmem, + .max_header = sizeof(struct sdp_bsdh), + .obj_size = sizeof(struct sdp_sock), + .owner = THIS_MODULE, + .name = "SDP", +}; + +static struct proto_ops sdp_ipv4_proto_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = inet_release, + .bind = inet_bind, + .connect = inet_stream_connect, /* TODO: inet_datagram connect would + autobind, but need to fix get_port + with port 0 first. */ + .socketpair = sock_no_socketpair, + .accept = inet_accept, + .getname = inet_getname, + .poll = sdp_poll, + .ioctl = inet_ioctl, + .listen = sdp_inet_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static struct proto_ops sdp_ipv6_proto_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = inet6_bind, + .connect = inet_stream_connect, /* TODO: inet_datagram connect would + autobind, but need to fix get_port + with port 0 first. 
*/ + .socketpair = sock_no_socketpair, + .accept = inet_accept, + .getname = inet6_getname, + .poll = sdp_poll, + .ioctl = inet6_ioctl, + .listen = sdp_inet_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; +#endif + +static int sdp_create_ipvx_socket(struct net *net, struct socket *sock, int protocol, + struct proto_ops *proto_ops) +{ + struct sock *sk; + int rc; + + sdp_dbg(NULL, "type %d protocol %d\n", sock->type, protocol); + + if (net != &init_net) + return -EAFNOSUPPORT; + + if (sock->type != SOCK_STREAM) { + sdp_warn(NULL, "SDP: unsupported type %d.\n", sock->type); + return -ESOCKTNOSUPPORT; + } + + /* IPPROTO_IP is a wildcard match */ + if (protocol != IPPROTO_TCP && protocol != IPPROTO_IP) { + sdp_warn(NULL, "SDP: unsupported protocol %d.\n", protocol); + return -EPROTONOSUPPORT; + } + + sk = sk_alloc(net, PF_INET_SDP, GFP_KERNEL, &sdp_proto); + if (!sk) { + sdp_warn(NULL, "SDP: failed to allocate socket.\n"); + return -ENOMEM; + } + sock_init_data(sock, sk); + sk->sk_protocol = 0x0 /* TODO: inherit tcp socket to use IPPROTO_TCP */; + percpu_counter_inc(sk->sk_prot->sockets_allocated); + + memset((struct inet_sock *)sk + 1, 0, + sizeof(struct sdp_sock) - sizeof(struct inet_sock)); + rc = sdp_init_sock(sk); + if (rc) { + sdp_warn(sk, "SDP: failed to init sock.\n"); + sdp_common_release(sk); + return -ENOMEM; + } + + sdp_add_to_history(sk, __func__); + sk->sk_destruct = sdp_destruct; + sock->ops = proto_ops; + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (proto_ops->family == PF_INET6) + inet_sk(sock->sk)->pinet6 = sdp_inet6_sk_generic(sock->sk); +#endif + + sock->state = SS_UNCONNECTED; + + sdp_add_sock(sdp_sk(sk)); + + return 0; +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static int sdp_create_v6_socket(struct net *net, struct socket *sock, int protocol, + int kern) +{ + return sdp_create_ipvx_socket(net, sock, protocol, &sdp_ipv6_proto_ops); +} +#endif + +static int sdp_create_v4_socket(struct net *net, struct socket *sock, int protocol, + int kern) +{ + return sdp_create_ipvx_socket(net, sock, protocol, &sdp_ipv4_proto_ops); +} + +static void sdp_add_device(struct ib_device *device) +{ + struct sdp_device *sdp_dev; + struct ib_fmr_pool_param fmr_param; + + sdp_dev = kmalloc(sizeof *sdp_dev, GFP_KERNEL); + if (!sdp_dev) + return; + + sdp_dev->pd = ib_alloc_pd(device); + if (IS_ERR(sdp_dev->pd)) { + printk(KERN_WARNING "Unable to allocate PD: %ld.\n", + PTR_ERR(sdp_dev->pd)); + goto err_pd_alloc; + } + + sdp_dev->mr = ib_get_dma_mr(sdp_dev->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(sdp_dev->mr)) { + printk(KERN_WARNING "Unable to get dma MR: %ld.\n", + PTR_ERR(sdp_dev->mr)); + goto err_mr; + } + + memset(&fmr_param, 0, sizeof fmr_param); + fmr_param.pool_size = sdp_fmr_pool_size; + fmr_param.dirty_watermark = sdp_fmr_dirty_wm; + fmr_param.cache = 1; + fmr_param.relaxed = 0; + fmr_param.max_pages_per_fmr = SDP_FMR_SIZE; + fmr_param.page_shift = PAGE_SHIFT; + fmr_param.access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ); + + sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &fmr_param); + if (IS_ERR(sdp_dev->fmr_pool)) { + sdp_dev->fmr_pool = NULL; + } + + ib_set_client_data(device, &sdp_client, sdp_dev); + + return; + +err_mr: + ib_dealloc_pd(sdp_dev->pd); +err_pd_alloc: + kfree(sdp_dev); +} + +static void sdp_remove_device(struct ib_device *device) +{ 
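+ /* Two passes over sock_list: first destroy the rdma_cm id of every
+  * socket on this device (dropping the socket lock around
+  * rdma_destroy_id()), then tear down the QPs and mark the sockets
+  * dead with -ENODEV, before releasing the device's FMR pool, MR
+  * and PD. */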
+ struct sdp_sock *ssk; + struct sock *sk; + struct rdma_cm_id *id; + struct sdp_device *sdp_dev; + + sdp_dev = ib_get_client_data(device, &sdp_client); + ib_set_client_data(device, &sdp_client, NULL); + + /* destroy_ids: */ +do_next: + down_write(&device_removal_lock); + + spin_lock_irq(&sock_list_lock); + list_for_each_entry(ssk, &sock_list, sock_list) { + if (ssk->ib_device == device && !ssk->id_destroyed_already) { + spin_unlock_irq(&sock_list_lock); + sk = sk_ssk(ssk); + sdp_add_to_history(sk, __func__); + lock_sock(sk); + /* ssk->id must be lock-protected, + * to enable mutex with sdp_close() */ + id = ssk->id; + ssk->id = NULL; + ssk->id_destroyed_already = 1; + + release_sock(sk); + up_write(&device_removal_lock); + + if (id) + rdma_destroy_id(id); + schedule(); + goto do_next; + } + } + + /* destroy qps: */ +kill_socks: + list_for_each_entry(ssk, &sock_list, sock_list) { + if (ssk->ib_device == device) { + spin_unlock_irq(&sock_list_lock); + sk = sk_ssk(ssk); + lock_sock(sk); + + sdp_abort_srcavail(sk); + sdp_abort_rdma_read(sk); + sdp_destroy_qp(ssk); + sdp_set_error(sk, -ENODEV); + ssk->ib_device = NULL; + ssk->sdp_dev = NULL; + + release_sock(sk); + flush_workqueue(rx_comp_wq); + schedule(); + spin_lock_irq(&sock_list_lock); + + goto kill_socks; + } + } + + spin_unlock_irq(&sock_list_lock); + + up_write(&device_removal_lock); + + if (!sdp_dev) + return; + + if (sdp_dev->fmr_pool) { + ib_flush_fmr_pool(sdp_dev->fmr_pool); + ib_destroy_fmr_pool(sdp_dev->fmr_pool); + } + + ib_dereg_mr(sdp_dev->mr); + + ib_dealloc_pd(sdp_dev->pd); + + kfree(sdp_dev); +} + +static struct net_proto_family sdp_net_proto = { + .family = AF_INET_SDP, + .create = sdp_create_v4_socket, + .owner = THIS_MODULE, +}; + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static struct net_proto_family sdp_net6_proto = { + .family = AF_INET6_SDP, + .create = sdp_create_v6_socket, + .owner = THIS_MODULE, +}; +#endif + +struct ib_client sdp_client = { + .name = "sdp", + .add = sdp_add_device, + .remove = sdp_remove_device +}; + +static int __init sdp_init(void) +{ + int rc = -ENOMEM; + + INIT_LIST_HEAD(&sock_list); + spin_lock_init(&sock_list_lock); + spin_lock_init(&sdp_large_sockets_lock); + + sockets_allocated = kzalloc(sizeof(*sockets_allocated), GFP_KERNEL); + if (!sockets_allocated) + goto no_mem_sockets_allocated; + + orphan_count = kzalloc(sizeof(*orphan_count), GFP_KERNEL); + if (!orphan_count) + goto no_mem_orphan_count; + + percpu_counter_init(sockets_allocated, 0); + percpu_counter_init(orphan_count, 0); + + sdp_proto.sockets_allocated = sockets_allocated; + sdp_proto.orphan_count = orphan_count; + + rx_comp_wq = create_workqueue("rx_comp_wq"); + if (!rx_comp_wq) + goto no_mem_rx_wq; + + sdp_wq = create_singlethread_workqueue("sdp_wq"); + if (!sdp_wq) + goto no_mem_sdp_wq; + + rc = proto_register(&sdp_proto, 1); + if (rc) { + printk(KERN_WARNING "proto_register failed: %d\n", rc); + goto error_proto_reg; + } + + rc = sock_register(&sdp_net_proto); + if (rc) { + printk(KERN_WARNING "sock_register sdp IPv4 failed: %d\n", rc); + goto error_sock_reg; + } + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + rc = sock_register(&sdp_net6_proto); + if (rc) { + printk(KERN_WARNING "sock_register sdp IPv6 failed: %d\n", rc); + goto error_sock_reg6; + } +#endif + + sdp_proc_init(); + + rc = ib_register_client(&sdp_client); + if (rc) { + printk(KERN_WARNING "ib_register_client failed: %d\n", rc); + goto error_ib_reg; + } + + return 0; + +error_ib_reg: + sdp_proc_unregister(); + +#if 
defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +error_sock_reg6: + sock_unregister(PF_INET_SDP); +#endif +error_sock_reg: + proto_unregister(&sdp_proto); +error_proto_reg: + destroy_workqueue(sdp_wq); +no_mem_sdp_wq: + destroy_workqueue(rx_comp_wq); +no_mem_rx_wq: + kfree(orphan_count); +no_mem_orphan_count: + kfree(sockets_allocated); +no_mem_sockets_allocated: + return rc; +} + +static void __exit sdp_exit(void) +{ + sock_unregister(PF_INET6_SDP); + sock_unregister(PF_INET_SDP); + proto_unregister(&sdp_proto); + + if (percpu_counter_sum(orphan_count)) + printk(KERN_WARNING "%s: orphan_count %lld\n", __func__, + percpu_counter_sum(orphan_count)); + + destroy_workqueue(rx_comp_wq); + destroy_workqueue(sdp_wq); + + BUG_ON(!list_empty(&sock_list)); + + if (atomic_long_read(&memory_allocated)) + sdp_dbg(NULL, "SDP detected memory leak. Memory_allocated: %ld\n", + atomic_long_read(&memory_allocated)); + + if (percpu_counter_sum(sockets_allocated)) + printk(KERN_WARNING "%s: sockets_allocated %lld\n", __func__, + percpu_counter_sum(sockets_allocated)); + + sdp_proc_unregister(); + + ib_unregister_client(&sdp_client); + + percpu_counter_destroy(orphan_count); + percpu_counter_destroy(sockets_allocated); + + kfree(orphan_count); + kfree(sockets_allocated); +} + +module_init(sdp_init); +module_exit(sdp_exit); diff --git a/drivers/infiniband/ulp/sdp/sdp_proc.c b/drivers/infiniband/ulp/sdp/sdp_proc.c new file mode 100644 index 0000000000000..5dbd3fe94d3d4 --- /dev/null +++ b/drivers/infiniband/ulp/sdp/sdp_proc.c @@ -0,0 +1,912 @@ +/* + * Copyright (c) 2008 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include "sdp.h" + +#ifdef CONFIG_PROC_FS + +#define DEBUGFS_SDP_BASE "sdp" +#define PROC_SDP_STATS "sdpstats" +#define PROC_SDP_PERF "sdpprf" + +#if defined(SDP_SOCK_HISTORY) || defined(SDP_PROFILING) +struct dentry *sdp_dbgfs_base; +#endif +#ifdef SDP_PROFILING +struct dentry *sdp_prof_file = NULL; +#endif + +/* just like TCP fs */ +struct sdp_seq_afinfo { + struct module *owner; + char *name; + sa_family_t family; + int (*seq_show) (struct seq_file *m, void *v); + struct file_operations *seq_fops; +}; + +struct sdp_iter_state { + sa_family_t family; + int num; + struct seq_operations seq_ops; +}; + +static void *sdp_get_idx(struct seq_file *seq, loff_t pos) +{ + int i = 0; + struct sdp_sock *ssk; + + if (!list_empty(&sock_list)) + list_for_each_entry(ssk, &sock_list, sock_list) { + if (i == pos) + return ssk; + i++; + } + + return NULL; +} + +#define sdp_sock_hold_return(sk, msg) \ + ({ \ + _sdp_add_to_history(sk, #msg, __func__, __LINE__, HOLD_REF, msg); \ + sdp_dbg(sk, "%s:%d - %s (%s) ref = %d.\n", __func__, __LINE__, \ + "sock_hold", #msg, atomic_read(&(sk)->sk_refcnt)); \ + atomic_inc_return(&(sk)->sk_refcnt); \ + }) + +static void *sdp_seq_start(struct seq_file *seq, loff_t *pos) +{ + void *start = NULL; + struct sdp_iter_state *st = seq->private; + + st->num = 0; + + if (!*pos) + return SEQ_START_TOKEN; + + spin_lock_irq(&sock_list_lock); + start = sdp_get_idx(seq, *pos - 1); + if (!start) + goto out; + + if (sdp_sock_hold_return((struct sock *)start, SOCK_REF_SEQ) < 2) + start = NULL; +out: + spin_unlock_irq(&sock_list_lock); + + return start; +} + +static void *sdp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sdp_iter_state *st = seq->private; + void *next = NULL; + + spin_lock_irq(&sock_list_lock); + if (v == SEQ_START_TOKEN) + next = sdp_get_idx(seq, 0); + else + next = sdp_get_idx(seq, *pos); + if (!next) + goto out; + + if (sdp_sock_hold_return((struct sock *)next, SOCK_REF_SEQ) < 2) + next = NULL; +out: + spin_unlock_irq(&sock_list_lock); + *pos += 1; + st->num++; + + return next; +} + +static void sdp_seq_stop(struct seq_file *seq, void *v) +{ +} + +#define TMPSZ 150 + +static int sdp_v4_seq_show(struct seq_file *seq, int num, struct sock *sk) +{ + char tmpbuf[TMPSZ + 1]; + unsigned int dest; + unsigned int src; + int uid; + unsigned long inode; + __u16 destp; + __u16 srcp; + __u32 rx_queue, tx_queue; + + dest = sdp_inet_daddr(sk); + src = sdp_inet_rcv_saddr(sk); + destp = ntohs(inet_dport(sk)); + srcp = ntohs(inet_sport(sk)); + uid = sock_i_uid(sk); + inode = sock_i_ino(sk); + rx_queue = rcv_nxt(sdp_sk(sk)) - sdp_sk(sk)->copied_seq; + tx_queue = sdp_sk(sk)->write_seq - sdp_sk(sk)->tx_ring.una_seq; + + sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %5d %lu %08X:%08X %X", + num, src, srcp, dest, destp, uid, inode, + rx_queue, tx_queue, sk->sk_state); + + seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf); + + return 0; +} + +static int sdp_v6_seq_show(struct seq_file *seq, int num, struct sock *sk) +{ + char tmpbuf[TMPSZ + 1]; + struct in6_addr *src; + struct in6_addr *dest; + int uid; + unsigned long inode; + __u16 destp; + __u16 srcp; + __u32 rx_queue, tx_queue; + + dest = &inet6_sk(sk)->daddr; + src = &inet6_sk(sk)->rcv_saddr; + destp = ntohs(inet_dport(sk)); + srcp = ntohs(inet_sport(sk)); + uid = sock_i_uid(sk); + inode = sock_i_ino(sk); + rx_queue = rcv_nxt(sdp_sk(sk)) - sdp_sk(sk)->copied_seq; + tx_queue = sdp_sk(sk)->write_seq - sdp_sk(sk)->tx_ring.una_seq; + + sprintf(tmpbuf, + "%4d: %08X%08X%08X%08X:%04X 
%08X%08X%08X%08X:%04X " + "%5d %lu %08X:%08X %X", + num, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], + srcp, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], + destp, + uid, inode, + rx_queue, tx_queue, sk->sk_state); + + seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf); + + return 0; +} + +static int sdp_seq_show(struct seq_file *seq, void *v) +{ + struct sdp_iter_state *st; + struct sock *sk = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "%-*s\n", TMPSZ - 1, + " sl local_address rem_address " + "uid inode rx_queue tx_queue state"); + goto out; + } + + st = seq->private; + + if (inet6_sk(sk)) + sdp_v6_seq_show(seq, st->num, sk); + else + sdp_v4_seq_show(seq, st->num, sk); + + sock_put(sk, SOCK_REF_SEQ); +out: + return 0; +} + +static int sdp_seq_open(struct inode *inode, struct file *file) +{ + struct sdp_seq_afinfo *afinfo = PDE(inode)->data; + struct seq_file *seq; + struct sdp_iter_state *s; + int rc; + + if (unlikely(afinfo == NULL)) + return -EINVAL; + +/* Workaround bogus warning by memtrack */ +#define _kzalloc(size,flags) kzalloc(size,flags) +#undef kzalloc + s = kzalloc(sizeof(*s), GFP_KERNEL); +#define kzalloc(s,f) _kzalloc(s,f) + if (!s) + return -ENOMEM; + s->family = afinfo->family; + s->seq_ops.start = sdp_seq_start; + s->seq_ops.next = sdp_seq_next; + s->seq_ops.show = afinfo->seq_show; + s->seq_ops.stop = sdp_seq_stop; + + rc = seq_open(file, &s->seq_ops); + if (rc) + goto out_kfree; + seq = file->private_data; + seq->private = s; +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + + +static struct file_operations sdp_seq_fops; +static struct sdp_seq_afinfo sdp_seq_afinfo = { + .owner = THIS_MODULE, + .name = "sdp", + .family = AF_INET_SDP, + .seq_show = sdp_seq_show, + .seq_fops = &sdp_seq_fops, +}; + +#ifdef SDPSTATS_ON +DEFINE_PER_CPU(struct sdpstats, sdpstats); + +static void sdpstats_seq_hist(struct seq_file *seq, char *str, u32 *h, int n, + int is_log) +{ + int i; + u32 max = 0; + int first = -1, last = n - 1; + + seq_printf(seq, "%s:\n", str); + + for (i = 0; i < n; i++) { + if (h[i] > max) + max = h[i]; + + if (first == -1 && h[i]) + first = i; + + if (h[i]) + last = i; + } + + if (max == 0) { + seq_printf(seq, " - all values are 0\n"); + return; + } + + for (i = first; i <= last; i++) { + char s[51]; + int j = 50 * h[i] / max; + int val = is_log ? (i == n-1 ? 
0 : 1<time - start_t); + usec_rem = do_div(t, USEC_PER_SEC); + remove_newline(l->msg); + seq_printf(m, "%-6d: [%5lu.%06lu] %-50s - [%d{%d} %d:%d] " + "skb: %p %s:%d\n", + l->idx, t, usec_rem, + l->msg, l->pid, l->cpu, l->sk_num, l->sk_dport, + l->skb, l->func, l->line); +out: + return 0; +} + +static void *sdpprf_start(struct seq_file *p, loff_t *pos) +{ + int count = atomic_read(&sdpprf_log_count); + int first = sdpprf_first_idx(count); + struct sdpprf_log *l = NULL; + + if (!count) + return SEQ_START_TOKEN; + + if (*pos >= MIN(count, SDPPRF_LOG_SIZE)) + return NULL; + + l = &sdpprf_log[(first + *pos) & (SDPPRF_LOG_SIZE - 1)]; + + start_t = l->time; + + + return l; +} + +static void *sdpprf_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct sdpprf_log *l = v; + + (*pos)++; + if (*pos >= MIN(atomic_read(&sdpprf_log_count), SDPPRF_LOG_SIZE)) + return NULL; + + ++l; + if (l - &sdpprf_log[0] >= SDPPRF_LOG_SIZE) + return &sdpprf_log[0]; + + return l; +} + +static void sdpprf_stop(struct seq_file *p, void *v) +{ +} + +static struct seq_operations sdpprf_ops = { + .start = sdpprf_start, + .stop = sdpprf_stop, + .next = sdpprf_next, + .show = sdpprf_show, +}; + +static int sdpprf_open(struct inode *inode, struct file *file) +{ + int res; + + res = seq_open(file, &sdpprf_ops); + + return res; +} + +static ssize_t sdpprf_write(struct file *file, const char __user *buf, + size_t count, loff_t *offs) +{ + atomic_set(&sdpprf_log_count, 0); + printk(KERN_INFO "Cleared sdpprf statistics\n"); + + return count; +} + +static struct file_operations sdpprf_fops = { + .open = sdpprf_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = sdpprf_write, +}; +#endif /* SDP_PROFILING */ + +#ifdef SDP_SOCK_HISTORY + +void sdp_print_history(struct sock *sk) +{ + struct sdp_sock *ssk = sdp_sk(sk); + unsigned i; + unsigned long flags; + + spin_lock_irqsave(&ssk->hst_lock, flags); + + sdp_warn(sk, "############## %p %s %lu/%zu ##############\n", + sk, sdp_state_str(sk->sk_state), + ssk->hst_idx, ARRAY_SIZE(ssk->hst)); + + for (i = 0; i < ssk->hst_idx; ++i) { + struct sdp_sock_hist *hst = &ssk->hst[i]; + char *ref_str = reftype2str(hst->ref_type); + + if (hst->ref_type == NOT_REF) + ref_str = ""; + + if (hst->cnt != 1) { + sdp_warn(sk, "[%s:%d pid: %d] %s %s : %d\n", + hst->func, hst->line, hst->pid, + ref_str, hst->str, hst->cnt); + } else { + sdp_warn(sk, "[%s:%d pid: %d] %s %s\n", + hst->func, hst->line, hst->pid, + ref_str, hst->str); + } + } + + spin_unlock_irqrestore(&ssk->hst_lock, flags); +} + +void _sdp_add_to_history(struct sock *sk, const char *str, + const char *func, int line, int ref_type, int ref_enum) +{ + struct sdp_sock *ssk = sdp_sk(sk); + unsigned i; + unsigned long flags; + struct sdp_sock_hist *hst; + + spin_lock_irqsave(&ssk->hst_lock, flags); + + i = ssk->hst_idx; + + if (i >= ARRAY_SIZE(ssk->hst)) { + //sdp_warn(sk, "overflow, drop: %s\n", s); + ++ssk->hst_idx; + goto out; + } + + if (ssk->hst[i].str) + sdp_warn(sk, "overwriting %s\n", ssk->hst[i].str); + + switch (ref_type) { + case NOT_REF: + case HOLD_REF: +simple_add: + hst = &ssk->hst[i]; + hst->str = (char *)str; + hst->func = (char *)func; + hst->line = line; + hst->ref_type = ref_type; + hst->ref_enum = ref_enum; + hst->cnt = 1; + hst->pid = current->pid; + ++ssk->hst_idx; + break; + case PUT_REF: + case __PUT_REF: + /* Try to shrink history by attaching HOLD+PUT + * together */ + hst = i > 0 ? 
&ssk->hst[i - 1] : NULL; + if (hst && hst->ref_type == HOLD_REF && + hst->ref_enum == ref_enum) { + hst->ref_type = BOTH_REF; + hst->func = (char *)func; + hst->line = line; + hst->pid = current->pid; + + /* try to shrink some more - by summing up */ + --i; + hst = i > 0 ? &ssk->hst[i - 1] : NULL; + if (hst && hst->ref_type == BOTH_REF && + hst->ref_enum == ref_enum) { + ++hst->cnt; + hst->func = (char *)func; + hst->line = line; + hst->pid = current->pid; + ssk->hst[i].str = NULL; + + --ssk->hst_idx; + } + } else + goto simple_add; + break; + default: + sdp_warn(sk, "error\n"); + } +out: + spin_unlock_irqrestore(&ssk->hst_lock, flags); +} +static int sdp_ssk_hist_seq_show(struct seq_file *seq, void *v) +{ + struct sock *sk = seq->private; + struct sdp_sock *ssk = sdp_sk(sk); + unsigned i; + unsigned long flags; + + spin_lock_irqsave(&ssk->hst_lock, flags); + + seq_printf(seq, "############## %p %s %lu/%zu ##############\n", + sk, sdp_state_str(sk->sk_state), + ssk->hst_idx, ARRAY_SIZE(ssk->hst)); + + seq_printf(seq, "rmem: %d wmem: %d wqueue: %d " + "fw: %d prot->alloc: %ld\n", + atomic_read(&sk->sk_rmem_alloc), + atomic_read(&sk->sk_wmem_alloc), + sk->sk_wmem_queued, + sk->sk_forward_alloc, + atomic_long_read(sk->sk_prot->memory_allocated)); + + for (i = 0; i < min(ssk->hst_idx, ARRAY_SIZE(ssk->hst)); ++i) { + struct sdp_sock_hist *hst = &ssk->hst[i]; + char *ref_str = reftype2str(hst->ref_type); + + if (hst->ref_type == NOT_REF) + ref_str = ""; + + if (hst->cnt != 1) { + seq_printf(seq, "[%30s:%-5d pid: %-6d] %s %s : %d\n", + hst->func, hst->line, hst->pid, + ref_str, hst->str, hst->cnt); + } else { + seq_printf(seq, "[%30s:%-5d pid: %-6d] %s %s\n", + hst->func, hst->line, hst->pid, + ref_str, hst->str); + } + } + + spin_unlock_irqrestore(&ssk->hst_lock, flags); + return 0; +} + +static int sdp_ssk_hist_seq_open(struct inode *inode, struct file *file) +{ + struct sock *sk = inode->i_private; + + return single_open(file, sdp_ssk_hist_seq_show, sk); +} + +static struct file_operations ssk_hist_fops = { + .owner = THIS_MODULE, + .open = sdp_ssk_hist_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void sdp_ssk_hist_name(char *sk_name, int len, struct sock *sk) +{ + int lport = inet_num(sk); + int rport = ntohs(inet_dport(sk)); + + snprintf(sk_name, len, "%05x_%d:%d", + sdp_sk(sk)->sk_id, lport, rport); +} + +int sdp_ssk_hist_open(struct sock *sk) +{ + int ret = 0; + char sk_name[256]; + struct sdp_sock *ssk = sdp_sk(sk); + + if (!sdp_dbgfs_base) { + return 0; + } + + sdp_ssk_hist_name(sk_name, sizeof(sk_name), sk); + + ssk->hst_dentr = debugfs_create_file(sk_name, S_IRUGO | S_IWUGO, + sdp_dbgfs_base, sk, &ssk_hist_fops); + if (IS_ERR(ssk->hst_dentr)) { + ret = PTR_ERR(ssk->hst_dentr); + ssk->hst_dentr = NULL; + } + + return ret; +} + +int sdp_ssk_hist_close(struct sock *sk) +{ + if (sk && sdp_sk(sk)->hst_dentr) + debugfs_remove(sdp_sk(sk)->hst_dentr); + return 0; +} + +int sdp_ssk_hist_rename(struct sock *sk) +{ + char sk_name[256]; + struct dentry *d; + + if (!sk || !sdp_sk(sk)->hst_dentr) + return 0; + + sdp_ssk_hist_name(sk_name, sizeof(sk_name), sk); + + d = debugfs_rename(sdp_dbgfs_base, sdp_sk(sk)->hst_dentr, sdp_dbgfs_base, sk_name); + if (IS_ERR(d)) + return PTR_ERR(d); + + return 0; +} +#endif + +int __init sdp_proc_init(void) +{ + struct proc_dir_entry *p = NULL; +#ifdef SDPSTATS_ON + struct proc_dir_entry *stats = NULL; +#endif + + sdp_seq_afinfo.seq_fops->owner = sdp_seq_afinfo.owner; + sdp_seq_afinfo.seq_fops->open = sdp_seq_open; 
+ sdp_seq_afinfo.seq_fops->read = seq_read; + sdp_seq_afinfo.seq_fops->llseek = seq_lseek; + sdp_seq_afinfo.seq_fops->release = seq_release_private; + +#if defined(SDP_PROFILING) || defined(SDP_SOCK_HISTORY) + sdp_dbgfs_base = debugfs_create_dir(DEBUGFS_SDP_BASE, NULL); + if (!sdp_dbgfs_base || IS_ERR(sdp_dbgfs_base)) { + if (PTR_ERR(sdp_dbgfs_base) == -ENODEV) + printk(KERN_WARNING "sdp: debugfs is not supported.\n"); + else { + printk(KERN_ERR "sdp: error creating debugfs information %ld\n", + PTR_ERR(sdp_dbgfs_base)); + return -EINVAL; + } + } +#endif + + p = proc_net_fops_create(&init_net, sdp_seq_afinfo.name, S_IRUGO, + sdp_seq_afinfo.seq_fops); + if (p) + p->data = &sdp_seq_afinfo; + else + goto no_mem; + +#ifdef SDPSTATS_ON + + stats = proc_net_fops_create(&init_net, PROC_SDP_STATS, + S_IRUGO | S_IWUGO, &sdpstats_fops); + if (!stats) + goto no_mem_stats; + +#endif + +#ifdef SDP_PROFILING + sdp_prof_file = debugfs_create_file(PROC_SDP_PERF, S_IRUGO | S_IWUGO, + sdp_dbgfs_base, NULL, &sdpprf_fops); + if (!sdp_prof_file) + goto no_mem_prof; +#endif + + return 0; + +#ifdef SDP_PROFILING +no_mem_prof: +#endif + +#ifdef SDPSTATS_ON + proc_net_remove(&init_net, PROC_SDP_STATS); + +no_mem_stats: +#endif + proc_net_remove(&init_net, sdp_seq_afinfo.name); + +no_mem: + return -ENOMEM; +} + +void sdp_proc_unregister(void) +{ + proc_net_remove(&init_net, sdp_seq_afinfo.name); + memset(sdp_seq_afinfo.seq_fops, 0, sizeof(*sdp_seq_afinfo.seq_fops)); + +#ifdef SDPSTATS_ON + proc_net_remove(&init_net, PROC_SDP_STATS); +#endif +#ifdef SDP_PROFILING + debugfs_remove(sdp_prof_file); +#endif +#if defined(SDP_PROFILING) || defined(SDP_SOCK_HISTORY) + debugfs_remove(sdp_dbgfs_base); +#endif +} + +#else /* CONFIG_PROC_FS */ + +int __init sdp_proc_init(void) +{ + return 0; +} + +void sdp_proc_unregister(void) +{ + +} +#endif /* CONFIG_PROC_FS */ diff --git a/drivers/infiniband/ulp/sdp/sdp_rx.c b/drivers/infiniband/ulp/sdp/sdp_rx.c new file mode 100644 index 0000000000000..7fa392d5955a8 --- /dev/null +++ b/drivers/infiniband/ulp/sdp/sdp_rx.c @@ -0,0 +1,951 @@ +/* + * Copyright (c) 2009 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include "sdp.h" + +SDP_MODPARAM_INT(rcvbuf_initial_size, 32 * 1024, + "Receive buffer initial size in bytes."); + +#ifdef CONFIG_PPC +SDP_MODPARAM_SINT(max_large_sockets, 100, + "Max number of large sockets (32k buffers)."); +#else +SDP_MODPARAM_SINT(max_large_sockets, 1000, + "Max number of large sockets (32k buffers)."); +#endif + +static int curr_large_sockets; +spinlock_t sdp_large_sockets_lock; + +static int sdp_get_large_socket(const struct sdp_sock *ssk) +{ + int ret; + + if (ssk->recv_request) + return 1; + + spin_lock_irq(&sdp_large_sockets_lock); + ret = curr_large_sockets < max_large_sockets; + if (ret) + curr_large_sockets++; + spin_unlock_irq(&sdp_large_sockets_lock); + + return ret; +} + +void sdp_remove_large_sock(const struct sdp_sock *ssk) +{ + if (ssk->recv_frags) { + spin_lock_irq(&sdp_large_sockets_lock); + curr_large_sockets--; + spin_unlock_irq(&sdp_large_sockets_lock); + } +} + +/* Like tcp_fin - called when SDP_MID_DISCONNECT is received */ +void sdp_handle_disconn(struct sock *sk) +{ + sdp_dbg(sk, "%s\n", __func__); + + sk->sk_shutdown |= RCV_SHUTDOWN; + sock_set_flag(sk, SOCK_DONE); + + switch (sk->sk_state) { + case TCP_SYN_RECV: + case TCP_ESTABLISHED: + sdp_exch_state(sk, TCPF_SYN_RECV | TCPF_ESTABLISHED, + TCP_CLOSE_WAIT); + break; + + case TCP_FIN_WAIT1: + /* Received a reply FIN - start Infiniband tear down */ + sdp_dbg(sk, "%s: Starting Infiniband tear down sending DREQ\n", + __func__); + + sdp_cancel_dreq_wait_timeout(sdp_sk(sk)); + + sdp_exch_state(sk, TCPF_FIN_WAIT1, TCP_TIME_WAIT); + + if (sdp_sk(sk)->id) { + sdp_sk(sk)->qp_active = 0; + rdma_disconnect(sdp_sk(sk)->id); + } else { + /* possible in a case of device removal */ + sdp_dbg(sk, "sdp_sk(sk)->id is NULL\n"); + return; + } + break; + case TCP_TIME_WAIT: + /* This is a mutual close situation and we've got the DREQ from + the peer before the SDP_MID_DISCONNECT */ + break; + case TCP_CLOSE: + /* FIN arrived after IB teardown started - do nothing */ + sdp_dbg(sk, "%s: fin in state %s\n", + __func__, sdp_state_str(sk->sk_state)); + return; + default: + sdp_warn(sk, "%s: FIN in unexpected state. sk->sk_state=%s\n", + __func__, sdp_state_str(sk->sk_state)); + break; + } + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + + /* Do not send POLL_HUP for half duplex close. 
*/ + if (sk->sk_shutdown == SHUTDOWN_MASK || + sk->sk_state == TCP_CLOSE) + sk_wake_async(sk, 1, POLL_HUP); + else + sk_wake_async(sk, 1, POLL_IN); + } +} + +static void sdp_sock_rfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + atomic_sub(skb->truesize, &sk->sk_rmem_alloc); + sk_mem_uncharge(sk, skb->truesize); +} + +static int sdp_post_recv(struct sdp_sock *ssk) +{ + struct sock *sk = sk_ssk(ssk); + struct sdp_buf *rx_req; + int i, frags; + int rc = 0; + u64 addr; + struct ib_device *dev; + struct ib_recv_wr rx_wr = { NULL }; + struct ib_sge ibsge[SDP_MAX_RECV_SGES]; + struct ib_sge *sge = ibsge; + struct ib_recv_wr *bad_wr; + struct sk_buff *skb; + struct page *page; + skb_frag_t *frag; + struct sdp_bsdh *h; + int id = ring_head(ssk->rx_ring); + gfp_t gfp_page; + int pages_alloced = 0; + + /* Now, allocate and repost recv */ + /* TODO: allocate from cache */ + + if (unlikely(sk_ssk(ssk)->sk_allocation)) { + skb = sdp_stream_alloc_skb(sk_ssk(ssk), SDP_SKB_HEAD_SIZE, + sk_ssk(ssk)->sk_allocation, SK_MEM_RECV); + gfp_page = sk_ssk(ssk)->sk_allocation | __GFP_HIGHMEM; + } else { + skb = sdp_stream_alloc_skb(sk_ssk(ssk), SDP_SKB_HEAD_SIZE, + GFP_KERNEL, SK_MEM_RECV); + gfp_page = GFP_HIGHUSER; + } + + if (unlikely(!skb)) + return -1; + + sdp_prf(sk_ssk(ssk), skb, "Posting skb"); + h = (struct sdp_bsdh *)skb->head; + + rx_req = ssk->rx_ring.buffer + (id & (sdp_rx_size - 1)); + rx_req->skb = skb; + + for (i = 0; i < ssk->recv_frags; ++i) { + if (rx_req->mapping[i + 1]) + page = rx_req->pages[i]; + else { + rx_req->pages[i] = page = alloc_pages(gfp_page, 0); + if (unlikely(!page)) + goto err; + pages_alloced++; + } + frag = &skb_shinfo(skb)->frags[i]; + frag->page = page; + frag->page_offset = 0; + frag->size = min(PAGE_SIZE, SDP_MAX_PAYLOAD); + ++skb_shinfo(skb)->nr_frags; + } + skb->truesize += ssk->recv_frags * min(PAGE_SIZE, SDP_MAX_PAYLOAD); + if (!sk_rmem_schedule(sk, ssk->recv_frags * min(PAGE_SIZE, SDP_MAX_PAYLOAD))) { + sdp_dbg(sk, "RX couldn't post, rx posted = %d.", + rx_ring_posted(sdp_sk(sk))); + sdp_dbg(sk, "Out of memory\n"); + goto err; + } + + dev = ssk->ib_device; + addr = ib_dma_map_single(dev, h, SDP_SKB_HEAD_SIZE, DMA_FROM_DEVICE); + BUG_ON(ib_dma_mapping_error(dev, addr)); + + rx_req->mapping[0] = addr; + + /* TODO: proper error handling */ + sge->addr = (u64)addr; + sge->length = SDP_SKB_HEAD_SIZE; + sge->lkey = ssk->sdp_dev->mr->lkey; + frags = skb_shinfo(skb)->nr_frags; + for (i = 0; i < frags; ++i) { + ++sge; + if (rx_req->mapping[i + 1]) { + addr = rx_req->mapping[i + 1]; + } else { + addr = ib_dma_map_page(dev, skb_shinfo(skb)->frags[i].page, + skb_shinfo(skb)->frags[i].page_offset, + skb_shinfo(skb)->frags[i].size, + DMA_FROM_DEVICE); + BUG_ON(ib_dma_mapping_error(dev, addr)); + rx_req->mapping[i + 1] = addr; + } + sge->addr = addr; + sge->length = skb_shinfo(skb)->frags[i].size; + sge->lkey = ssk->sdp_dev->mr->lkey; + } + + rx_wr.next = NULL; + rx_wr.wr_id = id | SDP_OP_RECV; + rx_wr.sg_list = ibsge; + rx_wr.num_sge = frags + 1; + rc = ib_post_recv(ssk->qp, &rx_wr, &bad_wr); + if (unlikely(rc)) { + sdp_warn(sk_ssk(ssk), "ib_post_recv failed. 
status %d\n", rc); + goto err; + } + + skb_set_owner_r(skb, sk_ssk(ssk)); + skb->destructor=sdp_sock_rfree; + + atomic_inc(&ssk->rx_ring.head); + SDPSTATS_COUNTER_INC(post_recv); + + return 0; + +err: + sdp_cleanup_sdp_buf(ssk, rx_req, SDP_SKB_HEAD_SIZE, DMA_FROM_DEVICE); + __kfree_skb(skb); + + if (rc) + sdp_reset(sk_ssk(ssk)); + return -1; +} + +static inline int sdp_post_recvs_needed(struct sdp_sock *ssk) +{ + struct sock *sk = sk_ssk(ssk); + int buffer_size = ssk->recv_frags * PAGE_SIZE; + int posted = rx_ring_posted(ssk); + + if (unlikely(!ssk->qp_active)) + return 0; + + if (likely(posted >= sdp_rx_size)) + return 0; + + if (unlikely(posted < SDP_MIN_TX_CREDITS)) + return 1; + + if (rcv_nxt(ssk) - ssk->copied_seq + (posted - SDP_MIN_TX_CREDITS) * + buffer_size >= sk->sk_rcvbuf) { + return 0; + } + + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_prot->sysctl_rmem[2]) + return 0; + + return 1; +} + +static inline void sdp_post_recvs(struct sdp_sock *ssk) +{ +again: + while (sdp_post_recvs_needed(ssk)) { + if (sdp_post_recv(ssk)) + return; + } + + if (sdp_post_recvs_needed(ssk)) + goto again; +} + +static inline struct sk_buff *sdp_sock_queue_rcv_skb(struct sock *sk, + struct sk_buff *skb) +{ + int skb_len; + struct sdp_sock *ssk = sdp_sk(sk); + struct sdp_bsdh *h = (struct sdp_bsdh *)skb_transport_header(skb); + + SDP_SKB_CB(skb)->seq = rcv_nxt(ssk); + if (unlikely(h->flags & SDP_OOB_PRES)) + sdp_urg(ssk, skb); + + if (h->mid == SDP_MID_SRCAVAIL) { + struct sdp_srcah *srcah = (struct sdp_srcah *)(h+1); + struct rx_srcavail_state *rx_sa; + + SDP_WARN_ON(ssk->rx_sa); + ssk->rx_sa = rx_sa = RX_SRCAVAIL_STATE(skb) = kzalloc( + sizeof(struct rx_srcavail_state), GFP_ATOMIC); + if (unlikely(!rx_sa)) { + /* if there is no space, fall to BCopy. */ + sdp_dbg(sk, "Can't allocate memory for rx_sa\n"); + h->mid = SDP_MID_DATA; + goto mid_data; + } + + rx_sa->mseq = ntohl(h->mseq); + rx_sa->len = skb_len = ntohl(srcah->len); + rx_sa->rkey = ntohl(srcah->rkey); + rx_sa->vaddr = be64_to_cpu(srcah->vaddr); + rx_sa->skb = skb; + + if (ssk->tx_sa) { + sdp_dbg_data(sk_ssk(ssk), "got RX SrcAvail while waiting " + "for TX SrcAvail. waking up TX SrcAvail" + "to be aborted\n"); + wake_up(sdp_sk_sleep(sk)); + } + + atomic_add(skb->len, &ssk->rcv_nxt); + sdp_dbg_data(sk, "queueing SrcAvail. 
skb_len = %d vaddr = %lld\n", + skb_len, rx_sa->vaddr); + } else { +mid_data: + skb_len = skb->len; + + atomic_add(skb_len, &ssk->rcv_nxt); + } + + skb_queue_tail(&sk->sk_receive_queue, skb); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, skb_len); + return skb; +} + +static int sdp_get_recv_sges(struct sdp_sock *ssk, u32 new_size) +{ + int recv_sges = ssk->max_sge - 1; /* 1 sge is dedicated to sdp header */ + + recv_sges = MIN(recv_sges, PAGE_ALIGN(new_size) >> PAGE_SHIFT); + recv_sges = MIN(recv_sges, SDP_MAX_RECV_SGES - 1); + + return recv_sges; +} + +int sdp_init_buffers(struct sdp_sock *ssk, u32 new_size) +{ + ssk->recv_frags = sdp_get_recv_sges(ssk, new_size); + + sdp_post_recvs(ssk); + + return 0; +} + +int sdp_resize_buffers(struct sdp_sock *ssk, u32 new_size) +{ + u32 curr_size = ssk->recv_frags << PAGE_SHIFT; + u32 max_size = (ssk->max_sge - 1) << PAGE_SHIFT; + + if (new_size > curr_size && new_size <= max_size && + sdp_get_large_socket(ssk)) { + ssk->recv_frags = sdp_get_recv_sges(ssk, new_size); + return 0; + } else + return -1; +} + +static void sdp_handle_resize_request(struct sdp_sock *ssk, + struct sdp_chrecvbuf *buf) +{ + if (sdp_resize_buffers(ssk, ntohl(buf->size)) == 0) + ssk->recv_request_head = ring_head(ssk->rx_ring) + 1; + else + ssk->recv_request_head = ring_tail(ssk->rx_ring); + ssk->recv_request = 1; +} + +static void sdp_handle_resize_ack(struct sdp_sock *ssk, + struct sdp_chrecvbuf *buf) +{ + u32 new_size = ntohl(buf->size); + + if (new_size > ssk->xmit_size_goal) { + ssk->sent_request = -1; + ssk->xmit_size_goal = new_size; + ssk->send_frags = + PAGE_ALIGN(ssk->xmit_size_goal) / PAGE_SIZE + 1; + } else + ssk->sent_request = 0; +} + +static void sdp_reuse_sdp_buf(struct sdp_sock *ssk, struct sdp_buf *sbuf, int len) +{ + struct sock *sk = sk_ssk(ssk); + int i; + struct sk_buff *skb; + struct ib_device *dev = ssk->ib_device; + enum dma_data_direction dir = DMA_FROM_DEVICE; + int bytes_reused = 0; + int used; + + skb = sbuf->skb; + + ib_dma_unmap_single(dev, sbuf->mapping[0], SDP_SKB_HEAD_SIZE, dir); + used = SDP_SKB_HEAD_SIZE; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + if (used >= len) { + int count = min(PAGE_SIZE, SDP_MAX_PAYLOAD) * + (skb_shinfo(skb)->nr_frags - i); + skb->truesize -= count; + + skb_shinfo(skb)->nr_frags = i; + bytes_reused += count; + break; + } + + ib_dma_unmap_page(dev, sbuf->mapping[i + 1], + skb_shinfo(skb)->frags[i].size, + dir); + sbuf->mapping[i + 1] = 0; + + used += skb_shinfo(skb)->frags[i].size; + } + atomic_sub(bytes_reused, &sk->sk_rmem_alloc); + sk_mem_uncharge(sk, bytes_reused); + +} + +static struct sk_buff *sdp_recv_completion(struct sdp_sock *ssk, int id, int len) +{ + struct sdp_buf *rx_req; + struct ib_device *dev; + struct sk_buff *skb; + + if (unlikely(id != ring_tail(ssk->rx_ring))) { + sdp_warn(sk_ssk(ssk), "Bogus recv completion id %d tail %d\n", + id, ring_tail(ssk->rx_ring)); + return NULL; + } + + dev = ssk->ib_device; + rx_req = &ssk->rx_ring.buffer[id & (sdp_rx_size - 1)]; + skb = rx_req->skb; + sdp_reuse_sdp_buf(ssk, rx_req, len); + + atomic_inc(&ssk->rx_ring.tail); + atomic_dec(&ssk->remote_credits); + return skb; +} + +/* socket lock should be taken before calling this */ +static int sdp_process_rx_ctl_skb(struct sdp_sock *ssk, struct sk_buff *skb) +{ + struct sdp_bsdh *h = (struct sdp_bsdh *)skb_transport_header(skb); + struct sock *sk = sk_ssk(ssk); + + sdp_dbg_data(sk, "Handling %s\n", mid2str(h->mid)); + sdp_prf(sk, skb, "Handling %s", mid2str(h->mid)); + + switch (h->mid) { + case 
SDP_MID_DATA: + case SDP_MID_SRCAVAIL: + SDP_WARN_ON(!(sk->sk_shutdown & RCV_SHUTDOWN)); + + sdp_dbg(sk, "DATA after socket rcv was shutdown\n"); + + /* got data in RCV_SHUTDOWN */ + if (sk->sk_state == TCP_FIN_WAIT1) { + sdp_dbg(sk, "RX data when state = FIN_WAIT1\n"); + /* go into abortive close */ + sdp_exch_state(sk, TCPF_FIN_WAIT1, + TCP_TIME_WAIT); + + sk->sk_prot->disconnect(sk, 0); + } + break; + case SDP_MID_RDMARDCOMPL: + sdp_warn(sk, "Handling RdmaRdCompl - ERROR\n"); + break; + case SDP_MID_SENDSM: + sdp_handle_sendsm(ssk, ntohl(h->mseq_ack)); + break; + case SDP_MID_SRCAVAIL_CANCEL: + if (ssk->rx_sa && after(ntohl(h->mseq), ssk->rx_sa->mseq) && + !ssk->tx_ring.rdma_inflight) { + sdp_abort_rx_srcavail(sk, 1); + } + break; + case SDP_MID_SINKAVAIL: + case SDP_MID_ABORT: + sdp_reset(sk); + break; + case SDP_MID_DISCONN: + sdp_handle_disconn(sk); + break; + case SDP_MID_CHRCVBUF: + sdp_handle_resize_request(ssk, (struct sdp_chrecvbuf *)(h+1)); + break; + case SDP_MID_CHRCVBUF_ACK: + sdp_handle_resize_ack(ssk, (struct sdp_chrecvbuf *)(h+1)); + break; + default: + /* TODO: Handle other messages */ + sdp_warn(sk, "SDP: FIXME MID %d\n", h->mid); + } + + __kfree_skb(skb); + return 0; +} + +static int sdp_process_rx_skb(struct sdp_sock *ssk, struct sk_buff *skb) +{ + struct sock *sk = sk_ssk(ssk); + int frags; + struct sdp_bsdh *h; + int pagesz, i; + unsigned long mseq_ack; + int credits_before; + + h = (struct sdp_bsdh *)skb_transport_header(skb); + + SDPSTATS_HIST_LINEAR(credits_before_update, tx_credits(ssk)); + + mseq_ack = ntohl(h->mseq_ack); + credits_before = tx_credits(ssk); + atomic_set(&ssk->tx_ring.credits, mseq_ack - ring_head(ssk->tx_ring) + + 1 + ntohs(h->bufs)); + if (!before(mseq_ack, ssk->nagle_last_unacked)) + ssk->nagle_last_unacked = 0; + + sdp_prf1(sk_ssk(ssk), skb, "RX: %s +%d c:%d->%d mseq:%d ack:%d", + mid2str(h->mid), ntohs(h->bufs), credits_before, + tx_credits(ssk), ntohl(h->mseq), ntohl(h->mseq_ack)); + + frags = skb_shinfo(skb)->nr_frags; + pagesz = PAGE_ALIGN(skb->data_len); + skb_shinfo(skb)->nr_frags = pagesz / PAGE_SIZE; + + for (i = skb_shinfo(skb)->nr_frags; i < frags; ++i) { + put_page(skb_shinfo(skb)->frags[i].page); + } + + if (unlikely(h->flags & SDP_OOB_PEND)) + sk_send_sigurg(sk); + + skb_pull(skb, sizeof(struct sdp_bsdh)); + + if (unlikely(h->mid == SDP_MID_SRCAVAIL)) { + if (ssk->rx_sa) { + sdp_dbg_data(sk, "SrcAvail in the middle of another SrcAvail. Aborting\n"); + h->mid = SDP_MID_DATA; + sdp_post_sendsm(sk); + sdp_do_posts(ssk); + } else { + skb_pull(skb, sizeof(struct sdp_srcah)); + } + } + + if (unlikely(h->mid == SDP_MID_DATA && skb->len == 0)) { + /* Credit update is valid even after RCV_SHUTDOWN */ + __kfree_skb(skb); + return 0; + } + + if ((h->mid != SDP_MID_DATA && h->mid != SDP_MID_SRCAVAIL && + h->mid != SDP_MID_DISCONN) || + unlikely(sk->sk_shutdown & RCV_SHUTDOWN)) { + sdp_prf(sk, NULL, "Control skb - queing to control queue"); + if (h->mid == SDP_MID_SRCAVAIL_CANCEL) { + sdp_dbg_data(sk, "Got SrcAvailCancel. 
" + "seq: 0x%d seq_ack: 0x%d\n", + ntohl(h->mseq), ntohl(h->mseq_ack)); + ssk->sa_cancel_mseq = ntohl(h->mseq); + ssk->sa_cancel_arrived = 1; + if (ssk->rx_sa) + wake_up(sdp_sk_sleep(sk)); + + skb_queue_tail(&ssk->rx_ctl_q, skb); + } else if (h->mid == SDP_MID_RDMARDCOMPL) { + struct sdp_rrch *rrch = (struct sdp_rrch *)(h+1); + sdp_dbg_data(sk, "RdmaRdCompl message arrived\n"); + sdp_handle_rdma_read_compl(ssk, ntohl(h->mseq_ack), + ntohl(rrch->len)); + __kfree_skb(skb); + } else + skb_queue_tail(&ssk->rx_ctl_q, skb); + + return 0; + } + + sdp_prf(sk, NULL, "queueing %s skb", mid2str(h->mid)); + sdp_sock_queue_rcv_skb(sk, skb); + + return 0; +} + +static struct sk_buff *sdp_process_rx_wc(struct sdp_sock *ssk, + struct ib_wc *wc) +{ + struct sk_buff *skb; + struct sdp_bsdh *h; + struct sock *sk = sk_ssk(ssk); + int mseq; + + skb = sdp_recv_completion(ssk, wc->wr_id, wc->byte_len); + if (unlikely(!skb)) + return NULL; + + if (unlikely(wc->status)) { + if (ssk->qp_active) { + sdp_dbg(sk, "Recv completion with error. " + "Status %d, vendor: %d\n", + wc->status, wc->vendor_err); + sdp_reset(sk); + ssk->qp_active = 0; + } + __kfree_skb(skb); + return NULL; + } + + sdp_dbg_data(sk, "Recv completion. ID %d Length %d\n", + (int)wc->wr_id, wc->byte_len); + if (unlikely(wc->byte_len < sizeof(struct sdp_bsdh))) { + sdp_warn(sk, "SDP BUG! byte_len %d < %zd\n", + wc->byte_len, sizeof(struct sdp_bsdh)); + __kfree_skb(skb); + return NULL; + } + skb->len = wc->byte_len; + skb->data = skb->head; + + h = (struct sdp_bsdh *)skb->data; + + if (likely(wc->byte_len > SDP_SKB_HEAD_SIZE)) + skb->data_len = wc->byte_len - SDP_SKB_HEAD_SIZE; + else + skb->data_len = 0; + +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->tail = skb_headlen(skb); +#else + skb->tail = skb->head + skb_headlen(skb); +#endif + SDP_DUMP_PACKET(sk_ssk(ssk), "RX", skb, h); + skb_reset_transport_header(skb); + + ssk->rx_packets++; + ssk->rx_bytes += skb->len; + + mseq = ntohl(h->mseq); + atomic_set(&ssk->mseq_ack, mseq); + if (unlikely(mseq != (int)wc->wr_id)) + sdp_warn(sk, "SDP BUG! 
mseq %d != wrid %d\n", + mseq, (int)wc->wr_id); + + return skb; +} + +/* like sk_stream_write_space - execpt measures remote credits */ +static void sdp_bzcopy_write_space(struct sdp_sock *ssk) +{ + struct sock *sk = sk_ssk(ssk); + struct socket *sock = sk->sk_socket; + struct socket_wq *wq; + + if (tx_credits(ssk) < ssk->min_bufs || !sock) + return; + + clear_bit(SOCK_NOSPACE, &sock->flags); + sdp_prf1(sk, NULL, "Waking up sleepers"); + + rcu_read_lock(); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible(&wq->wait); + if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) + sock_wake_async(sock, 2, POLL_OUT); + rcu_read_unlock(); +} + +int sdp_poll_rx_cq(struct sdp_sock *ssk) +{ + struct ib_cq *cq = ssk->rx_ring.cq; + struct ib_wc ibwc[SDP_NUM_WC]; + int n, i; + int wc_processed = 0; + struct sk_buff *skb; + + do { + n = ib_poll_cq(cq, SDP_NUM_WC, ibwc); + for (i = 0; i < n; ++i) { + struct ib_wc *wc = &ibwc[i]; + + BUG_ON(!(wc->wr_id & SDP_OP_RECV)); + skb = sdp_process_rx_wc(ssk, wc); + if (!skb) + continue; + + sdp_process_rx_skb(ssk, skb); + wc_processed++; + } + } while (n == SDP_NUM_WC); + + if (wc_processed) { + sdp_prf(sk_ssk(ssk), NULL, "processed %d", wc_processed); + + sk_mem_reclaim(sk_ssk(ssk)); + + sdp_bzcopy_write_space(ssk); + } + + return wc_processed; +} + +static void sdp_rx_comp_work(struct work_struct *work) +{ + struct sdp_sock *ssk = container_of(work, struct sdp_sock, + rx_comp_work); + struct sock *sk = sk_ssk(ssk); + + SDPSTATS_COUNTER_INC(rx_wq); + + sdp_prf(sk, NULL, "%s", __func__); + + if (unlikely(!ssk->qp)) { + sdp_prf(sk, NULL, "qp was destroyed"); + return; + } + if (unlikely(!ssk->rx_ring.cq)) { + sdp_prf(sk, NULL, "rx_ring.cq is NULL"); + return; + } + + if (unlikely(!ssk->poll_cq)) { + struct rdma_cm_id *id = ssk->id; + if (id && id->qp) + rdma_notify(id, RDMA_CM_EVENT_ESTABLISHED); + return; + } + + lock_sock(sk); + + posts_handler_get(ssk); + sdp_do_posts(ssk); + posts_handler_put(ssk, SDP_RX_ARMING_DELAY); + release_sock(sk); +} + +void sdp_do_posts(struct sdp_sock *ssk) +{ + struct sock *sk = sk_ssk(ssk); + int xmit_poll_force; + struct sk_buff *skb; + + if (!ssk->qp_active) { + sdp_dbg(sk, "QP is deactivated\n"); + return; + } + + if (likely(ssk->rx_ring.cq)) + sdp_poll_rx_cq(ssk); + + while ((skb = skb_dequeue(&ssk->rx_ctl_q))) + sdp_process_rx_ctl_skb(ssk, skb); + + if (sk->sk_state == TCP_TIME_WAIT) + return; + + if (!ssk->rx_ring.cq || !ssk->tx_ring.cq) + return; + + sdp_post_recvs(ssk); + + if (tx_ring_posted(ssk)) + sdp_xmit_poll(ssk, 1); + + sdp_post_sends(ssk, 0); + + xmit_poll_force = sk->sk_write_pending && + (tx_credits(ssk) > SDP_MIN_TX_CREDITS); + + if (credit_update_needed(ssk) || xmit_poll_force) { + /* if has pending tx because run out of tx_credits - xmit it */ + sdp_prf(sk, NULL, "Processing to free pending sends"); + sdp_xmit_poll(ssk, xmit_poll_force); + sdp_prf(sk, NULL, "Sending credit update"); + sdp_post_sends(ssk, 0); + } + +} + +static inline int should_wake_up(struct sock *sk) +{ + return sdp_sk_sleep(sk) && waitqueue_active(sdp_sk_sleep(sk)) && + (posts_handler(sdp_sk(sk)) || somebody_is_waiting(sk)); +} + +static void sdp_rx_irq(struct ib_cq *cq, void *cq_context) +{ + struct sock *sk = cq_context; + struct sdp_sock *ssk = sdp_sk(sk); + + if (unlikely(cq != ssk->rx_ring.cq)) { + sdp_warn(sk, "cq = %p, ssk->cq = %p\n", cq, ssk->rx_ring.cq); + return; + } + + SDPSTATS_COUNTER_INC(rx_int_count); + + sdp_prf(sk, NULL, "rx irq"); + + if (should_wake_up(sk)) { + 
wake_up_interruptible(sdp_sk_sleep(sk)); + SDPSTATS_COUNTER_INC(rx_int_wake_up); + } else { + if (queue_work_on(ssk->cpu, rx_comp_wq, &ssk->rx_comp_work)) + SDPSTATS_COUNTER_INC(rx_int_queue); + else + SDPSTATS_COUNTER_INC(rx_int_no_op); + } +} + +static void sdp_rx_ring_purge(struct sdp_sock *ssk) +{ + struct ib_device *dev = ssk->ib_device; + int id, i; + + while (rx_ring_posted(ssk) > 0) { + struct sk_buff *skb; + skb = sdp_recv_completion(ssk, ring_tail(ssk->rx_ring), INT_MAX); + if (!skb) + break; + __kfree_skb(skb); + } + + for (id = 0; id < sdp_rx_size; id++) { + struct sdp_buf *sbuf = &ssk->rx_ring.buffer[id]; + + for (i = 1; i < SDP_MAX_SEND_SGES; i++) { + if (!sbuf->mapping[i]) + continue; + + ib_dma_unmap_page(dev, sbuf->mapping[i], + min(PAGE_SIZE, SDP_MAX_PAYLOAD), + DMA_FROM_DEVICE); + sbuf->mapping[i] = 0; + put_page(sbuf->pages[i - 1]); + } + } +} + +static void sdp_rx_cq_event_handler(struct ib_event *event, void *data) +{ +} + +static void sdp_arm_cq_timer(unsigned long data) +{ + struct sdp_sock *ssk = (struct sdp_sock *)data; + + SDPSTATS_COUNTER_INC(rx_cq_arm_timer); + sdp_arm_rx_cq(sk_ssk(ssk)); +} + +int sdp_rx_ring_create(struct sdp_sock *ssk, struct ib_device *device) +{ + struct ib_cq *rx_cq; + int rc = 0; + + atomic_set(&ssk->rx_ring.head, 1); + atomic_set(&ssk->rx_ring.tail, 1); + + ssk->rx_ring.buffer = kzalloc( + sizeof *ssk->rx_ring.buffer * sdp_rx_size, GFP_KERNEL); + if (!ssk->rx_ring.buffer) { + sdp_warn(sk_ssk(ssk), + "Unable to allocate RX Ring size %zd.\n", + sizeof(*ssk->rx_ring.buffer) * sdp_rx_size); + + return -ENOMEM; + } + + rx_cq = ib_create_cq(device, sdp_rx_irq, sdp_rx_cq_event_handler, + sk_ssk(ssk), sdp_rx_size, IB_CQ_VECTOR_LEAST_ATTACHED); + + if (IS_ERR(rx_cq)) { + rc = PTR_ERR(rx_cq); + sdp_warn(sk_ssk(ssk), "Unable to allocate RX CQ: %d.\n", rc); + goto err_cq; + } + + ssk->rx_ring.cq = rx_cq; + + INIT_WORK(&ssk->rx_comp_work, sdp_rx_comp_work); + setup_timer(&ssk->rx_ring.cq_arm_timer, sdp_arm_cq_timer, + (unsigned long)ssk); + sdp_arm_rx_cq(sk_ssk(ssk)); + + return 0; + +err_cq: + kfree(ssk->rx_ring.buffer); + ssk->rx_ring.buffer = NULL; + return rc; +} + +void sdp_rx_ring_destroy(struct sdp_sock *ssk) +{ + del_timer_sync(&ssk->rx_ring.cq_arm_timer); + + if (ssk->rx_ring.buffer) { + sdp_rx_ring_purge(ssk); + + kfree(ssk->rx_ring.buffer); + ssk->rx_ring.buffer = NULL; + } + + if (ssk->rx_ring.cq) { + if (ib_destroy_cq(ssk->rx_ring.cq)) { + sdp_warn(sk_ssk(ssk), "destroy cq(%p) failed\n", + ssk->rx_ring.cq); + } else { + ssk->rx_ring.cq = NULL; + } + } + + SDP_WARN_ON(ring_head(ssk->rx_ring) != ring_tail(ssk->rx_ring)); +} diff --git a/drivers/infiniband/ulp/sdp/sdp_tx.c b/drivers/infiniband/ulp/sdp/sdp_tx.c new file mode 100644 index 0000000000000..f99b87ec99ce6 --- /dev/null +++ b/drivers/infiniband/ulp/sdp/sdp_tx.c @@ -0,0 +1,530 @@ +/* + * Copyright (c) 2009 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include "sdp.h" + +#define sdp_cnt(var) do { (var)++; } while (0) + +SDP_MODPARAM_SINT(sdp_keepalive_probes_sent, 0, + "Total number of keepalive probes sent."); + +static int sdp_process_tx_cq(struct sdp_sock *ssk); + +int sdp_xmit_poll(struct sdp_sock *ssk, int force) +{ + int wc_processed = 0; + + sdp_prf(sk_ssk(ssk), NULL, "%s", __func__); + + /* If we don't have a pending timer, set one up to catch our recent + post in case the interface becomes idle */ + if (likely(ssk->qp_active && sk_ssk(ssk)->sk_state != TCP_CLOSE) && + !timer_pending(&ssk->tx_ring.timer)) { + mod_timer(&ssk->tx_ring.timer, jiffies + SDP_TX_POLL_TIMEOUT); + } + + ssk->tx_compl_pending = 0; + + /* Poll the CQ every SDP_TX_POLL_MODER packets */ + if (force || (++ssk->tx_ring.poll_cnt & (SDP_TX_POLL_MODER - 1)) == 0) + wc_processed = sdp_process_tx_cq(ssk); + + return wc_processed; +} + +void sdp_post_send(struct sdp_sock *ssk, struct sk_buff *skb) +{ + struct sdp_buf *tx_req; + struct sdp_bsdh *h = (struct sdp_bsdh *)skb_transport_header(skb); + unsigned long mseq = ring_head(ssk->tx_ring); + int i, rc, frags; + u64 addr; + struct ib_device *dev; + struct ib_send_wr *bad_wr; + + struct ib_sge ibsge[SDP_MAX_SEND_SGES]; + struct ib_sge *sge = ibsge; + struct ib_send_wr tx_wr = { NULL }; + u32 send_flags = IB_SEND_SIGNALED; + + SDPSTATS_COUNTER_MID_INC(post_send, h->mid); + SDPSTATS_HIST(send_size, skb->len); + + if (!ssk->qp_active) + goto err; + + ssk->tx_packets++; + + if (h->mid != SDP_MID_SRCAVAIL && + h->mid != SDP_MID_DATA && + h->mid != SDP_MID_SRCAVAIL_CANCEL) { + struct sock *sk = sk_ssk(ssk); + + sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(sk, skb->truesize); + } + + if (unlikely(h->mid == SDP_MID_SRCAVAIL)) { + struct tx_srcavail_state *tx_sa = TX_SRCAVAIL_STATE(skb); + if (ssk->tx_sa != tx_sa) { + sdp_dbg_data(sk_ssk(ssk), "SrcAvail cancelled " + "before being sent!\n"); + SDP_WARN_ON(1); + sk_wmem_free_skb(sk_ssk(ssk), skb); + return; + } + TX_SRCAVAIL_STATE(skb)->mseq = mseq; + } + + if (unlikely(SDP_SKB_CB(skb)->flags & TCPHDR_URG)) + h->flags = SDP_OOB_PRES | SDP_OOB_PEND; + else + h->flags = 0; + + h->bufs = htons(rx_ring_posted(ssk)); + h->len = htonl(skb->len); + h->mseq = htonl(mseq); + h->mseq_ack = htonl(mseq_ack(ssk)); + + sdp_prf(sk_ssk(ssk), skb, "TX: %s bufs: %d mseq:%ld ack:%d c: %d", + mid2str(h->mid), rx_ring_posted(ssk), mseq, + ntohl(h->mseq_ack), tx_credits(ssk)); + + SDP_DUMP_PACKET(sk_ssk(ssk), "TX", skb, h); + + tx_req = &ssk->tx_ring.buffer[mseq & (SDP_TX_SIZE - 1)]; + tx_req->skb = skb; + dev = ssk->ib_device; + + if (skb->len <= ssk->inline_thresh && !skb_shinfo(skb)->nr_frags) { + SDPSTATS_COUNTER_INC(inline_sends); + sge->addr = (u64) skb->data; + sge->length = skb->len; + sge->lkey = 0; + frags = 0; + 
tx_req->mapping[0] = 0; /* Nothing to be cleaned up by sdp_cleanup_sdp_buf() */ + send_flags |= IB_SEND_INLINE; + } else { + addr = ib_dma_map_single(dev, skb->data, skb->len - skb->data_len, + DMA_TO_DEVICE); + tx_req->mapping[0] = addr; + + /* TODO: proper error handling */ + BUG_ON(ib_dma_mapping_error(dev, addr)); + + sge->addr = addr; + sge->length = skb->len - skb->data_len; + sge->lkey = ssk->sdp_dev->mr->lkey; + frags = skb_shinfo(skb)->nr_frags; + for (i = 0; i < frags; ++i) { + ++sge; + addr = ib_dma_map_page(dev, skb_shinfo(skb)->frags[i].page, + skb_shinfo(skb)->frags[i].page_offset, + skb_shinfo(skb)->frags[i].size, + DMA_TO_DEVICE); + BUG_ON(ib_dma_mapping_error(dev, addr)); + tx_req->mapping[i + 1] = addr; + sge->addr = addr; + sge->length = skb_shinfo(skb)->frags[i].size; + sge->lkey = ssk->sdp_dev->mr->lkey; + } + } + + tx_wr.next = NULL; + tx_wr.wr_id = ring_head(ssk->tx_ring) | SDP_OP_SEND; + tx_wr.sg_list = ibsge; + tx_wr.num_sge = frags + 1; + tx_wr.opcode = IB_WR_SEND; + tx_wr.send_flags = send_flags; + if (unlikely(SDP_SKB_CB(skb)->flags & TCPHDR_URG)) + tx_wr.send_flags |= IB_SEND_SOLICITED; + + rc = ib_post_send(ssk->qp, &tx_wr, &bad_wr); + if (unlikely(rc)) { + sdp_dbg(sk_ssk(ssk), + "ib_post_send failed with status %d.\n", rc); + + sdp_cleanup_sdp_buf(ssk, tx_req, skb->len - skb->data_len, DMA_TO_DEVICE); + + sdp_set_error(sk_ssk(ssk), -ECONNRESET); + + goto err; + } + + atomic_inc(&ssk->tx_ring.head); + atomic_dec(&ssk->tx_ring.credits); + atomic_set(&ssk->remote_credits, rx_ring_posted(ssk)); + + return; + +err: + sk_wmem_free_skb(sk_ssk(ssk), skb); +} + +static struct sk_buff *sdp_send_completion(struct sdp_sock *ssk, int mseq) +{ + struct ib_device *dev; + struct sdp_buf *tx_req; + struct sk_buff *skb = NULL; + struct sdp_tx_ring *tx_ring = &ssk->tx_ring; + if (unlikely(mseq != ring_tail(*tx_ring))) { + printk(KERN_WARNING "Bogus send completion id %d tail %d\n", + mseq, ring_tail(*tx_ring)); + goto out; + } + + dev = ssk->ib_device; + tx_req = &tx_ring->buffer[mseq & (SDP_TX_SIZE - 1)]; + skb = tx_req->skb; + if (!skb) + goto skip; /* This slot was used by RDMA WR */ + + sdp_cleanup_sdp_buf(ssk, tx_req, skb->len - skb->data_len, DMA_TO_DEVICE); + + tx_ring->una_seq += SDP_SKB_CB(skb)->end_seq; + + /* TODO: AIO and real zcopy code; add their context support here */ + if (BZCOPY_STATE(skb)) + BZCOPY_STATE(skb)->busy--; + +skip: + atomic_inc(&tx_ring->tail); + +out: + return skb; +} + +static inline void sdp_process_tx_wc(struct sdp_sock *ssk, struct ib_wc *wc) +{ + struct sock *sk = sk_ssk(ssk); + + if (likely(wc->wr_id & SDP_OP_SEND)) { + struct sk_buff *skb; + + skb = sdp_send_completion(ssk, wc->wr_id); + if (likely(skb)) + sk_wmem_free_skb(sk, skb); + } else if (wc->wr_id & SDP_OP_RDMA) { + if (ssk->tx_ring.rdma_inflight && + ssk->tx_ring.rdma_inflight->busy) { + /* Only last RDMA read WR is signalled. Order is guaranteed - + * therefore if Last RDMA read WR is completed - all other + * have, too */ + ssk->tx_ring.rdma_inflight->busy = 0; + } else { + sdp_warn(sk, "Unexpected RDMA read completion, " + "probably was canceled already\n"); + } + + wake_up(sdp_sk_sleep(sk)); + } else { + /* Keepalive probe sent cleanup */ + sdp_cnt(sdp_keepalive_probes_sent); + } + + if (likely(!wc->status) || wc->status == IB_WC_WR_FLUSH_ERR) + return; + + sdp_warn(sk, "Send completion with error. 
wr_id 0x%llx Status %d\n", + wc->wr_id, wc->status); + + sdp_set_error(sk, -ECONNRESET); +} + +static int sdp_process_tx_cq(struct sdp_sock *ssk) +{ + struct ib_wc ibwc[SDP_NUM_WC]; + int n, i; + int wc_processed = 0; + + if (!ssk->tx_ring.cq) { + sdp_dbg(sk_ssk(ssk), "tx irq on destroyed tx_cq\n"); + return 0; + } + + do { + n = ib_poll_cq(ssk->tx_ring.cq, SDP_NUM_WC, ibwc); + for (i = 0; i < n; ++i) { + sdp_process_tx_wc(ssk, ibwc + i); + wc_processed++; + } + } while (n == SDP_NUM_WC); + + if (wc_processed) { + struct sock *sk = sk_ssk(ssk); + sdp_prf1(sk, NULL, "Waking sendmsg. inflight=%d", + (u32) tx_ring_posted(ssk)); + + sk_mem_reclaim(sk); + + sk_stream_write_space(sk_ssk(ssk)); + if (sk->sk_write_pending && + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && + tx_ring_posted(ssk)) { + /* a write is pending and still no room in tx queue, + * arm tx cq + */ + sdp_prf(sk_ssk(ssk), NULL, "pending tx - rearming"); + sdp_arm_tx_cq(sk); + } + + } + + return wc_processed; +} + +/* Select who will handle tx completion: + * - a write is pending - wake it up and let it do the poll + post + * - post handler is taken - taker will do the poll + post + * else return 1 and let the caller do it + */ +static int sdp_tx_handler_select(struct sdp_sock *ssk) +{ + struct sock *sk = sk_ssk(ssk); + + if (sk->sk_write_pending) { + /* Do the TX posts from sender context */ + if (sdp_sk_sleep(sk) && waitqueue_active(sdp_sk_sleep(sk))) { + sdp_prf1(sk, NULL, "Waking up pending sendmsg"); + wake_up_interruptible(sdp_sk_sleep(sk)); + return 0; + } else + sdp_prf1(sk, NULL, "Unexpected: sk_sleep=%p, " + "waitqueue_active: %d\n", + sdp_sk_sleep(sk), waitqueue_active(sdp_sk_sleep(sk))); + } + + if (posts_handler(ssk)) { + /* Somebody else available to check for completion */ + sdp_prf1(sk, NULL, "Somebody else will call do_posts"); + return 0; + } + + return 1; +} + +static void sdp_poll_tx_timeout(unsigned long data) +{ + struct sdp_sock *ssk = (struct sdp_sock *)data; + struct sock *sk = sk_ssk(ssk); + u32 inflight, wc_processed; + + sdp_prf1(sk_ssk(ssk), NULL, "TX timeout: inflight=%d, head=%d tail=%d", + (u32) tx_ring_posted(ssk), + ring_head(ssk->tx_ring), ring_tail(ssk->tx_ring)); + + /* Only process if the socket is not in use */ + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + sdp_prf(sk_ssk(ssk), NULL, "TX comp: socket is busy"); + + if (sdp_tx_handler_select(ssk) && sk->sk_state != TCP_CLOSE && + likely(ssk->qp_active)) { + sdp_prf1(sk, NULL, "schedule a timer"); + mod_timer(&ssk->tx_ring.timer, jiffies + SDP_TX_POLL_TIMEOUT); + } + + SDPSTATS_COUNTER_INC(tx_poll_busy); + goto out; + } + + if (unlikely(!ssk->qp || sk->sk_state == TCP_CLOSE)) { + SDPSTATS_COUNTER_INC(tx_poll_no_op); + goto out; + } + + wc_processed = sdp_process_tx_cq(ssk); + if (!wc_processed) + SDPSTATS_COUNTER_INC(tx_poll_miss); + else { + sdp_post_sends(ssk, GFP_ATOMIC); + SDPSTATS_COUNTER_INC(tx_poll_hit); + } + + inflight = (u32) tx_ring_posted(ssk); + sdp_prf1(sk_ssk(ssk), NULL, "finished tx proccessing. 
inflight = %d", + tx_ring_posted(ssk)); + + /* If there are still packets in flight and the timer has not already + * been scheduled by the Tx routine then schedule it here to guarantee + * completion processing of these packets */ + if (inflight && likely(ssk->qp_active)) + mod_timer(&ssk->tx_ring.timer, jiffies + SDP_TX_POLL_TIMEOUT); + +out: + if (ssk->tx_ring.rdma_inflight && ssk->tx_ring.rdma_inflight->busy) { + sdp_prf1(sk, NULL, "RDMA is inflight - arming irq"); + sdp_arm_tx_cq(sk); + } + + bh_unlock_sock(sk); +} + +static void sdp_tx_irq(struct ib_cq *cq, void *cq_context) +{ + struct sock *sk = cq_context; + struct sdp_sock *ssk = sdp_sk(sk); + + sdp_prf1(sk, NULL, "tx irq"); + sdp_dbg_data(sk, "Got tx comp interrupt\n"); + + SDPSTATS_COUNTER_INC(tx_int_count); + + ssk->tx_compl_pending = 1; + + if (sdp_tx_handler_select(ssk) && likely(ssk->qp_active && + sk->sk_state != TCP_CLOSE)) { + sdp_prf1(sk, NULL, "poll and post from tasklet"); + mod_timer(&ssk->tx_ring.timer, jiffies + SDP_TX_POLL_TIMEOUT); + tasklet_schedule(&ssk->tx_ring.tasklet); + } +} + +static void sdp_tx_ring_purge(struct sdp_sock *ssk) +{ + while (ring_posted(ssk->tx_ring)) { + struct sk_buff *skb; + skb = sdp_send_completion(ssk, ring_tail(ssk->tx_ring)); + if (!skb) + break; + sk_wmem_free_skb(sk_ssk(ssk), skb); + } +} + +void sdp_post_keepalive(struct sdp_sock *ssk) +{ + int rc; + struct ib_send_wr wr, *bad_wr; + + sdp_dbg(sk_ssk(ssk), "%s\n", __func__); + + memset(&wr, 0, sizeof(wr)); + + wr.next = NULL; + wr.wr_id = 0; + wr.sg_list = NULL; + wr.num_sge = 0; + wr.opcode = IB_WR_RDMA_WRITE; + + rc = ib_post_send(ssk->qp, &wr, &bad_wr); + if (rc) { + sdp_dbg(sk_ssk(ssk), + "ib_post_keepalive failed with status %d.\n", rc); + sdp_set_error(sk_ssk(ssk), -ECONNRESET); + } + + sdp_cnt(sdp_keepalive_probes_sent); +} + +static void sdp_tx_cq_event_handler(struct ib_event *event, void *data) +{ +} + +int sdp_tx_ring_create(struct sdp_sock *ssk, struct ib_device *device) +{ + struct ib_cq *tx_cq; + int rc = 0; + + atomic_set(&ssk->tx_ring.head, 1); + atomic_set(&ssk->tx_ring.tail, 1); + + ssk->tx_ring.buffer = kmalloc( + sizeof *ssk->tx_ring.buffer * SDP_TX_SIZE, GFP_KERNEL); + if (!ssk->tx_ring.buffer) { + rc = -ENOMEM; + sdp_warn(sk_ssk(ssk), "Can't allocate TX Ring size %zd.\n", + sizeof(*ssk->tx_ring.buffer) * SDP_TX_SIZE); + + goto out; + } + + tx_cq = ib_create_cq(device, sdp_tx_irq, sdp_tx_cq_event_handler, + sk_ssk(ssk), SDP_TX_SIZE, IB_CQ_VECTOR_LEAST_ATTACHED); + + if (IS_ERR(tx_cq)) { + rc = PTR_ERR(tx_cq); + sdp_warn(sk_ssk(ssk), "Unable to allocate TX CQ: %d.\n", rc); + goto err_cq; + } + + ssk->tx_ring.cq = tx_cq; + + setup_timer(&ssk->tx_ring.timer, sdp_poll_tx_timeout, + (unsigned long)ssk); + ssk->tx_ring.poll_cnt = 0; + + tasklet_init(&ssk->tx_ring.tasklet, sdp_poll_tx_timeout, + (unsigned long) ssk); + + setup_timer(&ssk->nagle_timer, sdp_nagle_timeout, (unsigned long) ssk); + + return 0; + +err_cq: + kfree(ssk->tx_ring.buffer); + ssk->tx_ring.buffer = NULL; +out: + return rc; +} + +void sdp_tx_ring_destroy(struct sdp_sock *ssk) +{ + del_timer_sync(&ssk->tx_ring.timer); + + if (ssk->nagle_timer.function) + del_timer_sync(&ssk->nagle_timer); + + if (ssk->tx_ring.buffer) { + sdp_tx_ring_purge(ssk); + + kfree(ssk->tx_ring.buffer); + ssk->tx_ring.buffer = NULL; + } + + if (ssk->tx_ring.cq) { + if (ib_destroy_cq(ssk->tx_ring.cq)) { + sdp_warn(sk_ssk(ssk), "destroy cq(%p) failed\n", + ssk->tx_ring.cq); + } else { + ssk->tx_ring.cq = NULL; + } + } + + tasklet_kill(&ssk->tx_ring.tasklet); + /* tx_cq is 
destroyed, so no more tx_irq, so no one will schedule this + * tasklet. */ + + SDP_WARN_ON(ring_head(ssk->tx_ring) != ring_tail(ssk->tx_ring)); +} diff --git a/drivers/infiniband/ulp/sdp/sdp_zcopy.c b/drivers/infiniband/ulp/sdp/sdp_zcopy.c new file mode 100644 index 0000000000000..0a36bdc7d7b54 --- /dev/null +++ b/drivers/infiniband/ulp/sdp/sdp_zcopy.c @@ -0,0 +1,795 @@ +/* + * Copyright (c) 2006 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for memcpy_toiovec */ +#include +#include +#include +#include "sdp.h" + +static int sdp_post_srcavail(struct sock *sk, struct tx_srcavail_state *tx_sa) +{ + struct sdp_sock *ssk = sdp_sk(sk); + struct sk_buff *skb; + int payload_len; + struct page *payload_pg; + int off, len; + struct ib_umem_chunk *chunk; + + if (ssk->tx_sa) { + /* ssk->tx_sa might already be there in a case of + * multithreading: user thread initiated Zcopy and went to + * sleep, and now another user thread tries to ZCopy. + * Fallback to BCopy - data might be mixed. + * TODO: Fix it. fallback to BCopy is not enough because recv + * side has seq warnings. 
+ */ + sdp_dbg_data(sk, "user already initiated ZCopy transmission\n"); + return -EAGAIN; + } + + BUG_ON(!tx_sa); + BUG_ON(!tx_sa->fmr || !tx_sa->fmr->fmr->lkey); + BUG_ON(!tx_sa->umem); + BUG_ON(!tx_sa->umem->chunk_list.next); + + chunk = list_entry(tx_sa->umem->chunk_list.next, struct ib_umem_chunk, list); + BUG_ON(!chunk->nmap); + + off = tx_sa->umem->offset; + len = tx_sa->umem->length; + + tx_sa->bytes_sent = tx_sa->bytes_acked = 0; + + skb = sdp_alloc_skb_srcavail(sk, len, tx_sa->fmr->fmr->lkey, off, 0); + if (!skb) { + return -ENOMEM; + } + sdp_dbg_data(sk, "sending SrcAvail\n"); + + TX_SRCAVAIL_STATE(skb) = tx_sa; /* tx_sa is attached to the skb + * but continues to live after the skb is freed */ + ssk->tx_sa = tx_sa; + + /* must have payload inlined in SrcAvail packet in combined mode */ + payload_len = MIN(tx_sa->umem->page_size - off, len); + payload_len = MIN(payload_len, ssk->xmit_size_goal - sizeof(struct sdp_srcah)); + payload_pg = sg_page(&chunk->page_list[0]); + get_page(payload_pg); + + sdp_dbg_data(sk, "payload: off: 0x%x, pg: %p, len: 0x%x\n", + off, payload_pg, payload_len); + + skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags, + payload_pg, off, payload_len); + + skb->len += payload_len; + skb->data_len = payload_len; + + sdp_skb_entail(sk, skb); + + ssk->write_seq += payload_len; + SDP_SKB_CB(skb)->end_seq += payload_len; + + tx_sa->bytes_sent = tx_sa->umem->length; + tx_sa->bytes_acked = payload_len; + + /* TODO: pushing the skb into the tx_queue should be enough */ + + return 0; +} + +static int sdp_post_srcavail_cancel(struct sock *sk) +{ + struct sdp_sock *ssk = sdp_sk(sk); + struct sk_buff *skb; + + sdp_dbg_data(sk_ssk(ssk), "Posting srcavail cancel\n"); + + skb = sdp_alloc_skb_srcavail_cancel(sk, 0); + if (unlikely(!skb)) + return -ENOMEM; + + sdp_skb_entail(sk, skb); + + sdp_post_sends(ssk, 0); + + return 0; +} + +static int sdp_wait_rdmardcompl(struct sdp_sock *ssk, long *timeo_p, + int ignore_signals) +{ + struct sock *sk = sk_ssk(ssk); + int err = 0; + long current_timeo = *timeo_p; + struct tx_srcavail_state *tx_sa = ssk->tx_sa; + DEFINE_WAIT(wait); + + sdp_dbg_data(sk, "sleep till RdmaRdCompl. 
timeo = %ld.\n", *timeo_p); + sdp_prf1(sk, NULL, "Going to sleep"); + while (ssk->qp_active) { + prepare_to_wait(sdp_sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + + if (unlikely(!*timeo_p)) { + err = -ETIME; + tx_sa->abort_flags |= TX_SA_TIMEDOUT; + sdp_prf1(sk, NULL, "timeout"); + SDPSTATS_COUNTER_INC(zcopy_tx_timeout); + break; + } + + else if (tx_sa->bytes_acked > tx_sa->bytes_sent) { + err = -EINVAL; + sdp_dbg_data(sk, "acked bytes > sent bytes\n"); + tx_sa->abort_flags |= TX_SA_ERROR; + break; + } + + if (tx_sa->abort_flags & TX_SA_SENDSM) { + sdp_prf1(sk, NULL, "Aborting SrcAvail sending"); + SDPSTATS_COUNTER_INC(zcopy_tx_aborted); + err = -EAGAIN; + break ; + } + + if (!ignore_signals) { + if (signal_pending(current)) { + err = -EINTR; + sdp_prf1(sk, NULL, "signalled"); + tx_sa->abort_flags |= TX_SA_INTRRUPTED; + break; + } + + if (ssk->rx_sa && (tx_sa->bytes_acked < tx_sa->bytes_sent)) { + sdp_dbg_data(sk, "Crossing SrcAvail - aborting this\n"); + tx_sa->abort_flags |= TX_SA_CROSS_SEND; + SDPSTATS_COUNTER_INC(zcopy_cross_send); + err = -ETIME; + break ; + } + } + + posts_handler_put(ssk, 0); + + sk_wait_event(sk, &current_timeo, + tx_sa->abort_flags && + ssk->rx_sa && + (tx_sa->bytes_acked < tx_sa->bytes_sent)); + + posts_handler_get(ssk); + + if (tx_sa->bytes_acked == tx_sa->bytes_sent) + break; + + *timeo_p = current_timeo; + } + + finish_wait(sdp_sk_sleep(sk), &wait); + + sdp_dbg_data(sk, "Finished waiting - RdmaRdCompl: %d/%d bytes, flags: 0x%x\n", + tx_sa->bytes_acked, tx_sa->bytes_sent, tx_sa->abort_flags); + + if (!ssk->qp_active) { + sdp_dbg(sk, "QP destroyed while waiting\n"); + return -EINVAL; + } + return err; +} + +static int sdp_wait_rdma_wr_finished(struct sdp_sock *ssk) +{ + struct sock *sk = sk_ssk(ssk); + long timeo = SDP_RDMA_READ_TIMEOUT; + int rc = 0; + DEFINE_WAIT(wait); + + sdp_dbg_data(sk, "Sleep till RDMA wr finished.\n"); + while (1) { + prepare_to_wait(sdp_sk_sleep(sk), &wait, TASK_UNINTERRUPTIBLE); + + if (!ssk->tx_ring.rdma_inflight->busy) { + sdp_dbg_data(sk, "got rdma cqe\n"); + if (sk->sk_err == ECONNRESET) + rc = -EPIPE; + break; + } + + if (!ssk->qp_active) { + sdp_dbg_data(sk, "QP destroyed\n"); + rc = -EPIPE; + break; + } + + if (!timeo) { + sdp_warn(sk, "Fatal: no RDMA read completion\n"); + rc = -EIO; + sdp_set_error(sk, rc); + break; + } + + posts_handler_put(ssk, 0); + + sdp_prf1(sk, NULL, "Going to sleep"); + sk_wait_event(sk, &timeo, + !ssk->tx_ring.rdma_inflight->busy || + !ssk->qp_active); + sdp_prf1(sk, NULL, "Woke up"); + sdp_dbg_data(sk_ssk(ssk), "woke up sleepers\n"); + + posts_handler_get(ssk); + } + + finish_wait(sdp_sk_sleep(sk), &wait); + + sdp_dbg_data(sk, "Finished waiting\n"); + return rc; +} + +int sdp_post_rdma_rd_compl(struct sock *sk, struct rx_srcavail_state *rx_sa) +{ + int unreported = rx_sa->copied - rx_sa->reported; + + if (rx_sa->copied <= rx_sa->reported) + return 0; + + sdp_sk(sk)->sa_post_rdma_rd_compl += unreported; + rx_sa->reported += unreported; + + return 0; +} + +int sdp_post_sendsm(struct sock *sk) +{ + struct sdp_sock *ssk = sdp_sk(sk); + + ssk->sa_post_sendsm = 1; + + return 0; +} + +static int sdp_update_iov_used(struct sock *sk, struct iovec *iov, int len) +{ + sdp_dbg_data(sk, "updating consumed 0x%x bytes from iov\n", len); + while (len > 0) { + if (iov->iov_len) { + int copy = min_t(unsigned int, iov->iov_len, len); + len -= copy; + iov->iov_len -= copy; + iov->iov_base += copy; + } + iov++; + } + + return 0; +} + +static inline int sge_bytes(struct ib_sge *sge, int sge_cnt) +{ + int bytes = 0; + + while 
(sge_cnt > 0) { + bytes += sge->length; + sge++; + sge_cnt--; + } + + return bytes; +} +void sdp_handle_sendsm(struct sdp_sock *ssk, u32 mseq_ack) +{ + struct sock *sk = sk_ssk(ssk); + unsigned long flags; + + spin_lock_irqsave(&ssk->tx_sa_lock, flags); + + if (!ssk->tx_sa) { + sdp_prf1(sk, NULL, "SendSM for cancelled/finished SrcAvail"); + goto out; + } + + if (after(ssk->tx_sa->mseq, mseq_ack)) { + sdp_dbg_data(sk, "SendSM arrived for old SrcAvail. " + "SendSM mseq_ack: 0x%x, SrcAvail mseq: 0x%x\n", + mseq_ack, ssk->tx_sa->mseq); + goto out; + } + + sdp_dbg_data(sk, "Got SendSM - aborting SrcAvail\n"); + + ssk->tx_sa->abort_flags |= TX_SA_SENDSM; + wake_up(sdp_sk_sleep(sk)); + sdp_dbg_data(sk, "woke up sleepers\n"); + +out: + spin_unlock_irqrestore(&ssk->tx_sa_lock, flags); +} + +void sdp_handle_rdma_read_compl(struct sdp_sock *ssk, u32 mseq_ack, + u32 bytes_completed) +{ + struct sock *sk = sk_ssk(ssk); + unsigned long flags; + + sdp_prf1(sk, NULL, "RdmaRdCompl ssk=%p tx_sa=%p", ssk, ssk->tx_sa); + sdp_dbg_data(sk, "RdmaRdCompl ssk=%p tx_sa=%p\n", ssk, ssk->tx_sa); + + spin_lock_irqsave(&ssk->tx_sa_lock, flags); + + if (!ssk->tx_sa) { + sdp_dbg_data(sk, "Got RdmaRdCompl for aborted SrcAvail\n"); + goto out; + } + + if (after(ssk->tx_sa->mseq, mseq_ack)) { + sdp_dbg_data(sk, "RdmaRdCompl arrived for old SrcAvail. " + "SendSM mseq_ack: 0x%x, SrcAvail mseq: 0x%x\n", + mseq_ack, ssk->tx_sa->mseq); + goto out; + } + + ssk->tx_sa->bytes_acked += bytes_completed; + + wake_up(sdp_sk_sleep(sk)); + sdp_dbg_data(sk, "woke up sleepers\n"); + +out: + spin_unlock_irqrestore(&ssk->tx_sa_lock, flags); +} + +static unsigned long sdp_get_max_memlockable_bytes(unsigned long offset) +{ + unsigned long avail; + unsigned long lock_limit; + + if (capable(CAP_IPC_LOCK)) + return ULONG_MAX; + + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + avail = lock_limit - (current->mm->locked_vm << PAGE_SHIFT); + + return avail < offset ? 
0 : avail - offset; +} + +static int sdp_alloc_fmr(struct sock *sk, void *uaddr, size_t len, + struct ib_pool_fmr **_fmr, struct ib_umem **_umem, int access, int min_len) +{ + struct ib_pool_fmr *fmr; + struct ib_umem *umem; + struct ib_device *dev = sdp_sk(sk)->ib_device; + u64 *pages; + struct ib_umem_chunk *chunk; + int n = 0, j, k; + int rc = 0; + unsigned long max_lockable_bytes; + + if (unlikely(len > SDP_MAX_RDMA_READ_LEN)) { + sdp_dbg_data(sk, "len:0x%zx > FMR_SIZE: 0x%lx\n", + len, SDP_MAX_RDMA_READ_LEN); + len = SDP_MAX_RDMA_READ_LEN; + } + + max_lockable_bytes = sdp_get_max_memlockable_bytes((unsigned long)uaddr & ~PAGE_MASK); + if (unlikely(len > max_lockable_bytes)) { + sdp_dbg_data(sk, "len:0x%zx > RLIMIT_MEMLOCK available: 0x%lx\n", + len, max_lockable_bytes); + len = max_lockable_bytes; + } + + if (unlikely(len <= min_len)) + return -EAGAIN; + + sdp_dbg_data(sk, "user buf: %p, len:0x%zx max_lockable_bytes: 0x%lx\n", + uaddr, len, max_lockable_bytes); + + umem = ib_umem_get(&sdp_sk(sk)->context, (unsigned long)uaddr, len, + access, 0); + + if (IS_ERR(umem)) { + rc = -EAGAIN; + sdp_dbg_data(sk, "Error doing umem_get 0x%zx bytes: %ld\n", len, PTR_ERR(umem)); + sdp_dbg_data(sk, "RLIMIT_MEMLOCK: 0x%lx[cur] 0x%lx[max] CAP_IPC_LOCK: %d\n", + current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur, + current->signal->rlim[RLIMIT_MEMLOCK].rlim_max, + capable(CAP_IPC_LOCK)); + goto err_umem_get; + } + + sdp_dbg_data(sk, "umem->offset = 0x%x, length = 0x%zx\n", + umem->offset, umem->length); + + pages = (u64 *) __get_free_page(GFP_KERNEL); + if (!pages) { + rc = -ENOMEM; + goto err_pages_alloc; + } + + list_for_each_entry(chunk, &umem->chunk_list, list) { + for (j = 0; j < chunk->nmap; ++j) { + unsigned len2; + len2 = ib_sg_dma_len(dev, + &chunk->page_list[j]) >> PAGE_SHIFT; + + SDP_WARN_ON(len2 > len); + len -= len2; + + for (k = 0; k < len2; ++k) { + pages[n++] = ib_sg_dma_address(dev, + &chunk->page_list[j]) + + umem->page_size * k; + BUG_ON(n >= SDP_FMR_SIZE); + } + } + } + + fmr = ib_fmr_pool_map_phys(sdp_sk(sk)->sdp_dev->fmr_pool, pages, n, 0, NULL); + if (IS_ERR(fmr)) { + sdp_dbg_data(sk, "Error allocating fmr: %ld\n", PTR_ERR(fmr)); + SDPSTATS_COUNTER_INC(fmr_alloc_error); + rc = PTR_ERR(fmr); + goto err_fmr_alloc; + } + + free_page((unsigned long) pages); + + *_umem = umem; + *_fmr = fmr; + + return 0; + +err_fmr_alloc: + free_page((unsigned long) pages); + +err_pages_alloc: + ib_umem_release(umem); + +err_umem_get: + + return rc; +} + +static inline void sdp_free_fmr(struct sock *sk, struct ib_pool_fmr **_fmr, + struct ib_umem **_umem) +{ + if (*_fmr) { + ib_fmr_pool_unmap(*_fmr); + *_fmr = NULL; + } + + if (*_umem) { + ib_umem_release(*_umem); + *_umem = NULL; + } +} + +static int sdp_post_rdma_read(struct sock *sk, struct rx_srcavail_state *rx_sa, + u32 offset) +{ + struct sdp_sock *ssk = sdp_sk(sk); + struct ib_send_wr *bad_wr; + struct ib_send_wr wr = { NULL }; + struct ib_sge sge; + int rc; + + wr.opcode = IB_WR_RDMA_READ; + wr.next = NULL; + wr.wr_id = SDP_OP_RDMA; + wr.wr.rdma.rkey = rx_sa->rkey; + wr.send_flags = 0; + + ssk->tx_ring.rdma_inflight = rx_sa; + + sge.addr = rx_sa->umem->offset; + sge.length = rx_sa->umem->length; + sge.lkey = rx_sa->fmr->fmr->lkey; + + wr.wr.rdma.remote_addr = rx_sa->vaddr + offset; + wr.num_sge = 1; + wr.sg_list = &sge; + rx_sa->busy++; + + wr.send_flags = IB_SEND_SIGNALED; + + rc = ib_post_send(ssk->qp, &wr, &bad_wr); + if (unlikely(rc)) { + rx_sa->busy--; + ssk->tx_ring.rdma_inflight = NULL; + } + + return rc; +} + +int 
sdp_rdma_to_iovec(struct sock *sk, struct iovec *iov, int msg_iovlen, + struct sk_buff *skb, unsigned long *used, u32 offset) +{ + struct sdp_sock *ssk = sdp_sk(sk); + struct rx_srcavail_state *rx_sa = RX_SRCAVAIL_STATE(skb); + int rc = 0; + int len = *used; + int copied; + int i = 0; + + if (unlikely(!ssk->ib_device)) + return -ENODEV; + + while (!iov->iov_len) { + ++iov; + i++; + } + WARN_ON(i >= msg_iovlen); + + sdp_dbg_data(sk_ssk(ssk), "preparing RDMA read." + " len: 0x%x. buffer len: 0x%zx\n", len, iov->iov_len); + + sock_hold(sk, SOCK_REF_RDMA_RD); + + if (len > rx_sa->len) { + sdp_warn(sk, "len:0x%x > rx_sa->len: 0x%x\n", len, rx_sa->len); + SDP_WARN_ON(1); + len = rx_sa->len; + } + + rc = sdp_alloc_fmr(sk, iov->iov_base, len, &rx_sa->fmr, &rx_sa->umem, + IB_ACCESS_LOCAL_WRITE, 0); + if (rc) { + sdp_dbg_data(sk, "Error allocating fmr: %d\n", rc); + goto err_alloc_fmr; + } + + rc = sdp_post_rdma_read(sk, rx_sa, offset); + if (unlikely(rc)) { + sdp_warn(sk, "ib_post_send failed with status %d.\n", rc); + sdp_set_error(sk_ssk(ssk), -ECONNRESET); + goto err_post_send; + } + + sdp_prf(sk, skb, "Finished posting, now to wait"); + sdp_arm_tx_cq(sk); + + rc = sdp_wait_rdma_wr_finished(ssk); + if (unlikely(rc)) + goto err_wait; + + copied = rx_sa->umem->length; + + sdp_update_iov_used(sk, iov, copied); + atomic_add(copied, &ssk->rcv_nxt); + *used = copied; + rx_sa->copied += copied; + +err_wait: + ssk->tx_ring.rdma_inflight = NULL; + +err_post_send: + sdp_free_fmr(sk, &rx_sa->fmr, &rx_sa->umem); + +err_alloc_fmr: + sock_put(sk, SOCK_REF_RDMA_RD); + + return rc; +} + +static inline int wait_for_sndbuf(struct sock *sk, long *timeo_p) +{ + struct sdp_sock *ssk = sdp_sk(sk); + int ret = 0; + int credits_needed = 1; + + sdp_dbg_data(sk, "Wait for mem\n"); + + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + + SDPSTATS_COUNTER_INC(send_wait_for_mem); + + sdp_do_posts(ssk); + + if (sdp_xmit_poll(ssk, 1)) + sdp_post_sends(ssk, 0); + + ret = sdp_tx_wait_memory(ssk, timeo_p, &credits_needed); + + return ret; +} + +static int do_sdp_sendmsg_zcopy(struct sock *sk, struct tx_srcavail_state *tx_sa, + struct iovec *iov, long *timeo) +{ + struct sdp_sock *ssk = sdp_sk(sk); + int rc = 0; + unsigned long lock_flags; + + rc = sdp_alloc_fmr(sk, iov->iov_base, iov->iov_len, + &tx_sa->fmr, &tx_sa->umem, IB_ACCESS_REMOTE_READ, sdp_zcopy_thresh); + if (unlikely(rc)) { + sdp_dbg_data(sk, "Error allocating fmr: %d\n", rc); + goto err_alloc_fmr; + } + + if (tx_slots_free(ssk) == 0) { + rc = wait_for_sndbuf(sk, timeo); + if (unlikely(rc)) { + sdp_warn(sk, "Couldn't get send buffer\n"); + goto err_no_tx_slots; + } + } + + rc = sdp_post_srcavail(sk, tx_sa); + if (unlikely(rc)) { + sdp_dbg(sk, "Error posting SrcAvail: %d\n", rc); + goto err_abort_send; + } + + rc = sdp_wait_rdmardcompl(ssk, timeo, 0); + if (unlikely(rc)) { + enum tx_sa_flag f = tx_sa->abort_flags; + + if (f & TX_SA_SENDSM) { + sdp_dbg_data(sk, "Got SendSM. use SEND verb.\n"); + } else if (f & TX_SA_ERROR) { + sdp_dbg_data(sk, "SrcAvail error completion\n"); + sdp_reset(sk); + SDPSTATS_COUNTER_INC(zcopy_tx_error); + } else if (ssk->qp_active) { + sdp_post_srcavail_cancel(sk); + + /* Wait for RdmaRdCompl/SendSM to + * finish the transaction */ + *timeo = SDP_SRCAVAIL_CANCEL_TIMEOUT; + rc = sdp_wait_rdmardcompl(ssk, timeo, 1); + if (unlikely(rc == -ETIME || rc == -EINVAL)) { + /* didn't get RdmaRdCompl/SendSM after sending + * SrcAvailCancel - There is a connection + * problem. 
*/ + sdp_reset(sk); + rc = -sk->sk_err; + } + } else { + sdp_dbg_data(sk, "QP was destroyed while waiting\n"); + } + } else { + sdp_dbg_data(sk, "got RdmaRdCompl\n"); + } + + spin_lock_irqsave(&ssk->tx_sa_lock, lock_flags); + ssk->tx_sa = NULL; + spin_unlock_irqrestore(&ssk->tx_sa_lock, lock_flags); + +err_abort_send: + sdp_update_iov_used(sk, iov, tx_sa->bytes_acked); + +err_no_tx_slots: + sdp_free_fmr(sk, &tx_sa->fmr, &tx_sa->umem); + +err_alloc_fmr: + return rc; +} + +int sdp_sendmsg_zcopy(struct kiocb *iocb, struct sock *sk, struct iovec *iov) +{ + struct sdp_sock *ssk = sdp_sk(sk); + int rc = 0; + long timeo = SDP_SRCAVAIL_ADV_TIMEOUT; + struct tx_srcavail_state *tx_sa; + size_t bytes_to_copy = iov->iov_len; + int copied = 0; + + sdp_dbg_data(sk, "Sending ZCopy iov: %p, iov_len: 0x%zx\n", + iov->iov_base, iov->iov_len); + if (ssk->rx_sa) { + /* Don't want both sides to send SrcAvail because both of them + * will wait on sendmsg() until timeout. + */ + sdp_dbg_data(sk, "Deadlock prevention: crossing SrcAvail\n"); + return 0; + } + + sock_hold(sk_ssk(ssk), SOCK_REF_ZCOPY); + SDPSTATS_COUNTER_INC(sendmsg_zcopy_segment); + + /* Ok commence sending. */ + + tx_sa = kmalloc(sizeof(struct tx_srcavail_state), GFP_KERNEL); + if (!tx_sa) { + sdp_warn(sk, "Error allocating zcopy context\n"); + rc = -EAGAIN; /* Buffer too big - fallback to bcopy */ + goto err_alloc_tx_sa; + } + + do { + tx_sa_reset(tx_sa); + + rc = do_sdp_sendmsg_zcopy(sk, tx_sa, iov, &timeo); + + if (iov->iov_len && iov->iov_len < sdp_zcopy_thresh) { + sdp_dbg_data(sk, "0x%zx bytes left, switching to bcopy\n", + iov->iov_len); + break; + } + } while (!rc && iov->iov_len > 0 && !tx_sa->abort_flags); + + kfree(tx_sa); +err_alloc_tx_sa: + copied = bytes_to_copy - iov->iov_len; + + sdp_prf1(sk, NULL, "sdp_sendmsg_zcopy end rc: %d copied: %d", rc, copied); + + sock_put(sk_ssk(ssk), SOCK_REF_ZCOPY); + + if (rc < 0 && rc != -EAGAIN && rc != -ETIME) + return rc; + + return copied; +} + +void sdp_abort_srcavail(struct sock *sk) +{ + struct sdp_sock *ssk = sdp_sk(sk); + struct tx_srcavail_state *tx_sa = ssk->tx_sa; + unsigned long flags; + + if (!tx_sa) + return; + + spin_lock_irqsave(&ssk->tx_sa_lock, flags); + + sdp_free_fmr(sk, &tx_sa->fmr, &tx_sa->umem); + + ssk->tx_sa = NULL; + + spin_unlock_irqrestore(&ssk->tx_sa_lock, flags); +} + +void sdp_abort_rdma_read(struct sock *sk) +{ + struct sdp_sock *ssk = sdp_sk(sk); + struct rx_srcavail_state *rx_sa; + + rx_sa = ssk->rx_sa; + if (!rx_sa) + return; + + sdp_free_fmr(sk, &rx_sa->fmr, &rx_sa->umem); + + /* kfree(rx_sa) and posting SendSM will be handled in the normal + * flows. 
+ */ +} diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index aa5eafa194abb..1e6759e1fd1da 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -59,35 +59,35 @@ MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol initiator " "v" DRV_VERSION " (" DRV_RELDATE ")"); MODULE_LICENSE("Dual BSD/GPL"); -static unsigned int srp_sg_tablesize; -static unsigned int cmd_sg_entries; -static unsigned int indirect_sg_entries; -static bool allow_ext_sg; -static int topspin_workarounds = 1; - -module_param(srp_sg_tablesize, uint, 0444); -MODULE_PARM_DESC(srp_sg_tablesize, "Deprecated name for cmd_sg_entries"); - -module_param(cmd_sg_entries, uint, 0444); -MODULE_PARM_DESC(cmd_sg_entries, - "Default number of gather/scatter entries in the SRP command (default is 12, max 255)"); +static int srp_sg_tablesize = SRP_DEF_SG_TABLESIZE; +static int srp_max_iu_len; -module_param(indirect_sg_entries, uint, 0444); -MODULE_PARM_DESC(indirect_sg_entries, - "Default max number of gather/scatter entries (default is 12, max is " __stringify(SCSI_MAX_SG_CHAIN_SEGMENTS) ")"); +module_param(srp_sg_tablesize, int, 0444); +MODULE_PARM_DESC(srp_sg_tablesize, + "Max number of gather/scatter entries per I/O (default is 12, max 255)"); -module_param(allow_ext_sg, bool, 0444); -MODULE_PARM_DESC(allow_ext_sg, - "Default behavior when there are more than cmd_sg_entries S/G entries after mapping; fails the request when false (default false)"); +static int topspin_workarounds = 1; module_param(topspin_workarounds, int, 0444); MODULE_PARM_DESC(topspin_workarounds, "Enable workarounds for Topspin/Cisco SRP target bugs if != 0"); +static int mellanox_workarounds = 1; + +module_param(mellanox_workarounds, int, 0444); +MODULE_PARM_DESC(mellanox_workarounds, + "Enable workarounds for Mellanox SRP target bugs if != 0"); + +static int srp_dev_loss_tmo = 60; + +module_param(srp_dev_loss_tmo, int, 0444); +MODULE_PARM_DESC(srp_dev_loss_tmo, + "Default number of seconds that srp transport should \ + tolerate the loss of a remote port (default is 60 secs)"); + static void srp_add_one(struct ib_device *device); static void srp_remove_one(struct ib_device *device); -static void srp_recv_completion(struct ib_cq *cq, void *target_ptr); -static void srp_send_completion(struct ib_cq *cq, void *target_ptr); +static void srp_completion(struct ib_cq *cq, void *target_ptr); static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event); static struct scsi_transport_template *ib_srp_transport_template; @@ -120,6 +120,14 @@ static int srp_target_is_topspin(struct srp_target_port *target) !memcmp(&target->ioc_guid, cisco_oui, sizeof cisco_oui)); } +static int srp_target_is_mellanox(struct srp_target_port *target) +{ + static const u8 mellanox_oui[3] = { 0x00, 0x02, 0xc9 }; + + return mellanox_workarounds && + !memcmp(&target->ioc_guid, mellanox_oui, sizeof mellanox_oui); +} + static struct srp_iu *srp_alloc_iu(struct srp_host *host, size_t size, gfp_t gfp_mask, enum dma_data_direction direction) @@ -226,21 +234,14 @@ static int srp_create_target_ib(struct srp_target_port *target) if (!init_attr) return -ENOMEM; - target->recv_cq = ib_create_cq(target->srp_host->srp_dev->dev, - srp_recv_completion, NULL, target, SRP_RQ_SIZE, 0); - if (IS_ERR(target->recv_cq)) { - ret = PTR_ERR(target->recv_cq); - goto err; - } - - target->send_cq = ib_create_cq(target->srp_host->srp_dev->dev, - srp_send_completion, NULL, target, SRP_SQ_SIZE, 0); - if (IS_ERR(target->send_cq)) { - ret = 
PTR_ERR(target->send_cq); - goto err_recv_cq; + target->cq = ib_create_cq(target->srp_host->srp_dev->dev, + srp_completion, NULL, target, SRP_CQ_SIZE, 0); + if (IS_ERR(target->cq)) { + ret = PTR_ERR(target->cq); + goto out; } - ib_req_notify_cq(target->recv_cq, IB_CQ_NEXT_COMP); + ib_req_notify_cq(target->cq, IB_CQ_NEXT_COMP); init_attr->event_handler = srp_qp_event; init_attr->cap.max_send_wr = SRP_SQ_SIZE; @@ -249,32 +250,24 @@ static int srp_create_target_ib(struct srp_target_port *target) init_attr->cap.max_send_sge = 1; init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; init_attr->qp_type = IB_QPT_RC; - init_attr->send_cq = target->send_cq; - init_attr->recv_cq = target->recv_cq; + init_attr->send_cq = target->cq; + init_attr->recv_cq = target->cq; target->qp = ib_create_qp(target->srp_host->srp_dev->pd, init_attr); if (IS_ERR(target->qp)) { ret = PTR_ERR(target->qp); - goto err_send_cq; + ib_destroy_cq(target->cq); + goto out; } ret = srp_init_qp(target, target->qp); - if (ret) - goto err_qp; - - kfree(init_attr); - return 0; - -err_qp: - ib_destroy_qp(target->qp); - -err_send_cq: - ib_destroy_cq(target->send_cq); - -err_recv_cq: - ib_destroy_cq(target->recv_cq); + if (ret) { + ib_destroy_qp(target->qp); + ib_destroy_cq(target->cq); + goto out; + } -err: +out: kfree(init_attr); return ret; } @@ -284,12 +277,11 @@ static void srp_free_target_ib(struct srp_target_port *target) int i; ib_destroy_qp(target->qp); - ib_destroy_cq(target->send_cq); - ib_destroy_cq(target->recv_cq); + ib_destroy_cq(target->cq); for (i = 0; i < SRP_RQ_SIZE; ++i) srp_free_iu(target->srp_host, target->rx_ring[i]); - for (i = 0; i < SRP_SQ_SIZE; ++i) + for (i = 0; i < SRP_SQ_SIZE + 1; ++i) srp_free_iu(target->srp_host, target->tx_ring[i]); } @@ -376,7 +368,7 @@ static int srp_send_req(struct srp_target_port *target) req->priv.opcode = SRP_LOGIN_REQ; req->priv.tag = 0; - req->priv.req_it_iu_len = cpu_to_be32(target->max_iu_len); + req->priv.req_it_iu_len = cpu_to_be32(srp_max_iu_len); req->priv.req_buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT | SRP_BUF_FORMAT_INDIRECT); /* @@ -430,46 +422,11 @@ static void srp_disconnect_target(struct srp_target_port *target) { /* XXX should send SRP_I_LOGOUT request */ - init_completion(&target->done); if (ib_send_cm_dreq(target->cm_id, NULL, 0)) { shost_printk(KERN_DEBUG, target->scsi_host, PFX "Sending CM DREQ failed\n"); return; } - wait_for_completion(&target->done); -} - -static bool srp_change_state(struct srp_target_port *target, - enum srp_target_state old, - enum srp_target_state new) -{ - bool changed = false; - - spin_lock_irq(&target->lock); - if (target->state == old) { - target->state = new; - changed = true; - } - spin_unlock_irq(&target->lock); - return changed; -} - -static void srp_free_req_data(struct srp_target_port *target) -{ - struct ib_device *ibdev = target->srp_host->srp_dev->dev; - struct srp_request *req; - int i; - - for (i = 0, req = target->req_ring; i < SRP_CMD_SQ_SIZE; ++i, ++req) { - kfree(req->fmr_list); - kfree(req->map_page); - if (req->indirect_dma_addr) { - ib_dma_unmap_single(ibdev, req->indirect_dma_addr, - target->indirect_size, - DMA_TO_DEVICE); - } - kfree(req->indirect_desc); - } } static void srp_remove_work(struct work_struct *work) @@ -477,8 +434,14 @@ static void srp_remove_work(struct work_struct *work) struct srp_target_port *target = container_of(work, struct srp_target_port, work); - if (!srp_change_state(target, SRP_TARGET_DEAD, SRP_TARGET_REMOVED)) + spin_lock_irq(target->scsi_host->host_lock); + if (target->state != SRP_TARGET_DEAD) 
{ + spin_unlock_irq(target->scsi_host->host_lock); return; + } + target->state = SRP_TARGET_REMOVED; + del_timer(&target->qp_err_timer); + spin_unlock_irq(target->scsi_host->host_lock); spin_lock(&target->srp_host->target_lock); list_del(&target->list); @@ -488,7 +451,6 @@ static void srp_remove_work(struct work_struct *work) scsi_remove_host(target->scsi_host); ib_destroy_cm_id(target->cm_id); srp_free_target_ib(target); - srp_free_req_data(target); scsi_host_put(target->scsi_host); } @@ -552,88 +514,48 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, struct srp_request *req) { - struct ib_device *ibdev = target->srp_host->srp_dev->dev; - struct ib_pool_fmr **pfmr; - if (!scsi_sglist(scmnd) || (scmnd->sc_data_direction != DMA_TO_DEVICE && scmnd->sc_data_direction != DMA_FROM_DEVICE)) return; - pfmr = req->fmr_list; - while (req->nfmr--) - ib_fmr_pool_unmap(*pfmr++); - - ib_dma_unmap_sg(ibdev, scsi_sglist(scmnd), scsi_sg_count(scmnd), - scmnd->sc_data_direction); -} - -/** - * srp_claim_req - Take ownership of the scmnd associated with a request. - * @target: SRP target port. - * @req: SRP request. - * @scmnd: If NULL, take ownership of @req->scmnd. If not NULL, only take - * ownership of @req->scmnd if it equals @scmnd. - * - * Return value: - * Either NULL or a pointer to the SCSI command the caller became owner of. - */ -static struct scsi_cmnd *srp_claim_req(struct srp_target_port *target, - struct srp_request *req, - struct scsi_cmnd *scmnd) -{ - unsigned long flags; - - spin_lock_irqsave(&target->lock, flags); - if (!scmnd) { - scmnd = req->scmnd; - req->scmnd = NULL; - } else if (req->scmnd == scmnd) { - req->scmnd = NULL; - } else { - scmnd = NULL; + if (req->fmr) { + ib_fmr_pool_unmap(req->fmr); + req->fmr = NULL; } - spin_unlock_irqrestore(&target->lock, flags); - return scmnd; + ib_dma_unmap_sg(target->srp_host->srp_dev->dev, scsi_sglist(scmnd), + scsi_sg_count(scmnd), scmnd->sc_data_direction); } -/** - * srp_free_req() - Unmap data and add request to the free request list. 
- */ -static void srp_free_req(struct srp_target_port *target, - struct srp_request *req, struct scsi_cmnd *scmnd, - s32 req_lim_delta) +static void srp_remove_req(struct srp_target_port *target, struct srp_request *req) { - unsigned long flags; - - srp_unmap_data(scmnd, target, req); - - spin_lock_irqsave(&target->lock, flags); - target->req_lim += req_lim_delta; - list_add_tail(&req->list, &target->free_reqs); - spin_unlock_irqrestore(&target->lock, flags); + srp_unmap_data(req->scmnd, target, req); + list_move_tail(&req->list, &target->free_reqs); } -static void srp_reset_req(struct srp_target_port *target, struct srp_request *req) +static void srp_reset_req(struct srp_target_port *target, + struct srp_request *req, int status) { - struct scsi_cmnd *scmnd = srp_claim_req(target, req, NULL); - - if (scmnd) { - srp_free_req(target, req, scmnd, 0); - scmnd->result = DID_RESET << 16; - scmnd->scsi_done(scmnd); - } + req->scmnd->result = status << 16; + req->scmnd->scsi_done(req->scmnd); + srp_remove_req(target, req); } static int srp_reconnect_target(struct srp_target_port *target) { - struct ib_qp_attr qp_attr; - struct ib_wc wc; - int i, ret; + struct srp_request *req, *tmp; + int ret; + struct ib_cq *old_cq; + struct ib_qp *old_qp; - if (!srp_change_state(target, SRP_TARGET_LIVE, SRP_TARGET_CONNECTING)) + spin_lock_irq(target->scsi_host->host_lock); + if (target->state != SRP_TARGET_LIVE) { + spin_unlock_irq(target->scsi_host->host_lock); return -EAGAIN; + } + target->state = SRP_TARGET_CONNECTING; + spin_unlock_irq(target->scsi_host->host_lock); srp_disconnect_target(target); /* @@ -644,37 +566,39 @@ static int srp_reconnect_target(struct srp_target_port *target) if (ret) goto err; - qp_attr.qp_state = IB_QPS_RESET; - ret = ib_modify_qp(target->qp, &qp_attr, IB_QP_STATE); - if (ret) - goto err; - - ret = srp_init_qp(target, target->qp); - if (ret) + old_qp = target->qp; + old_cq = target->cq; + ret = srp_create_target_ib(target); + if (ret) { + target->qp = old_qp; + target->cq = old_cq; goto err; + } - while (ib_poll_cq(target->recv_cq, 1, &wc) > 0) - ; /* nothing */ - while (ib_poll_cq(target->send_cq, 1, &wc) > 0) - ; /* nothing */ + ib_destroy_qp(old_qp); + ib_destroy_cq(old_cq); - for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { - struct srp_request *req = &target->req_ring[i]; - if (req->scmnd) - srp_reset_req(target, req); - } + spin_lock_irq(target->scsi_host->host_lock); + list_for_each_entry_safe(req, tmp, &target->req_queue, list) + srp_reset_req(target, req, DID_RESET); + spin_unlock_irq(target->scsi_host->host_lock); - INIT_LIST_HEAD(&target->free_tx); - for (i = 0; i < SRP_SQ_SIZE; ++i) - list_add(&target->tx_ring[i]->list, &target->free_tx); + target->rx_head = 0; + target->tx_head = 0; + target->tx_tail = 0; target->qp_in_error = 0; ret = srp_connect_target(target); if (ret) goto err; - if (!srp_change_state(target, SRP_TARGET_CONNECTING, SRP_TARGET_LIVE)) + spin_lock_irq(target->scsi_host->host_lock); + if (target->state == SRP_TARGET_CONNECTING) { + ret = 0; + target->state = SRP_TARGET_LIVE; + } else ret = -EAGAIN; + spin_unlock_irq(target->scsi_host->host_lock); return ret; @@ -684,169 +608,114 @@ err: /* * We couldn't reconnect, so kill our target port off. - * However, we have to defer the real removal because we - * are in the context of the SCSI error handler now, which - * will deadlock if we call scsi_remove_host(). - * - * Schedule our work inside the lock to avoid a race with - * the flush_scheduled_work() in srp_remove_one(). 
+ * However, we have to defer the real removal because we might + * be in the context of the SCSI error handler now, which + * would deadlock if we call scsi_remove_host(). */ - spin_lock_irq(&target->lock); + spin_lock_irq(target->scsi_host->host_lock); if (target->state == SRP_TARGET_CONNECTING) { target->state = SRP_TARGET_DEAD; + target->work_in_progress = 1; + del_timer(&target->qp_err_timer); + INIT_WORK(&target->work, srp_remove_work); - queue_work(ib_wq, &target->work); + schedule_work(&target->work); } - spin_unlock_irq(&target->lock); + spin_unlock_irq(target->scsi_host->host_lock); return ret; } -static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr, - unsigned int dma_len, u32 rkey) -{ - struct srp_direct_buf *desc = state->desc; - - desc->va = cpu_to_be64(dma_addr); - desc->key = cpu_to_be32(rkey); - desc->len = cpu_to_be32(dma_len); - - state->total_len += dma_len; - state->desc++; - state->ndesc++; -} - -static int srp_map_finish_fmr(struct srp_map_state *state, - struct srp_target_port *target) +static int srp_map_fmr(struct srp_target_port *target, struct scatterlist *scat, + int sg_cnt, struct srp_request *req, + struct srp_direct_buf *buf) { - struct srp_device *dev = target->srp_host->srp_dev; - struct ib_pool_fmr *fmr; u64 io_addr = 0; + u64 *dma_pages; + u32 len; + int page_cnt; + int i, j; + int ret; + struct srp_device *dev = target->srp_host->srp_dev; + struct ib_device *ibdev = dev->dev; + struct scatterlist *sg; - if (!state->npages) - return 0; + if (!dev->fmr_pool) + return -ENODEV; - if (state->npages == 1) { - srp_map_desc(state, state->base_dma_addr, state->fmr_len, - target->rkey); - state->npages = state->fmr_len = 0; - return 0; - } + if (srp_target_is_mellanox(target) && + (ib_sg_dma_address(ibdev, &scat[0]) & ~dev->fmr_page_mask)) + return -EINVAL; - fmr = ib_fmr_pool_map_phys(dev->fmr_pool, state->pages, - state->npages, io_addr); - if (IS_ERR(fmr)) - return PTR_ERR(fmr); + len = page_cnt = 0; + scsi_for_each_sg(req->scmnd, sg, sg_cnt, i) { + unsigned int dma_len = ib_sg_dma_len(ibdev, sg); - *state->next_fmr++ = fmr; - state->nfmr++; + if (ib_sg_dma_address(ibdev, sg) & ~dev->fmr_page_mask) { + if (i > 0) + return -EINVAL; + else + ++page_cnt; + } + if ((ib_sg_dma_address(ibdev, sg) + dma_len) & + ~dev->fmr_page_mask) { + if (i < sg_cnt - 1) + return -EINVAL; + else + ++page_cnt; + } - srp_map_desc(state, 0, state->fmr_len, fmr->fmr->rkey); - state->npages = state->fmr_len = 0; - return 0; -} + len += dma_len; + } -static void srp_map_update_start(struct srp_map_state *state, - struct scatterlist *sg, int sg_index, - dma_addr_t dma_addr) -{ - state->unmapped_sg = sg; - state->unmapped_index = sg_index; - state->unmapped_addr = dma_addr; -} + page_cnt += len >> dev->fmr_page_shift; + if (page_cnt > SRP_FMR_SIZE) + return -ENOMEM; -static int srp_map_sg_entry(struct srp_map_state *state, - struct srp_target_port *target, - struct scatterlist *sg, int sg_index, - int use_fmr) -{ - struct srp_device *dev = target->srp_host->srp_dev; - struct ib_device *ibdev = dev->dev; - dma_addr_t dma_addr = ib_sg_dma_address(ibdev, sg); - unsigned int dma_len = ib_sg_dma_len(ibdev, sg); - unsigned int len; - int ret; + dma_pages = kmalloc(sizeof (u64) * page_cnt, GFP_ATOMIC); + if (!dma_pages) + return -ENOMEM; - if (!dma_len) - return 0; + page_cnt = 0; + scsi_for_each_sg(req->scmnd, sg, sg_cnt, i) { + unsigned int dma_len = ib_sg_dma_len(ibdev, sg); - if (use_fmr == SRP_MAP_NO_FMR) { - /* Once we're in direct map mode for a request, we don't - * 
go back to FMR mode, so no need to update anything - * other than the descriptor. - */ - srp_map_desc(state, dma_addr, dma_len, target->rkey); - return 0; + for (j = 0; j < dma_len; j += dev->fmr_page_size) + dma_pages[page_cnt++] = + (ib_sg_dma_address(ibdev, sg) & + dev->fmr_page_mask) + j; } - /* If we start at an offset into the FMR page, don't merge into - * the current FMR. Finish it out, and use the kernel's MR for this - * sg entry. This is to avoid potential bugs on some SRP targets - * that were never quite defined, but went away when the initiator - * avoided using FMR on such page fragments. - */ - if (dma_addr & ~dev->fmr_page_mask || dma_len > dev->fmr_max_size) { - ret = srp_map_finish_fmr(state, target); - if (ret) - return ret; - - srp_map_desc(state, dma_addr, dma_len, target->rkey); - srp_map_update_start(state, NULL, 0, 0); - return 0; + req->fmr = ib_fmr_pool_map_phys(dev->fmr_pool, + dma_pages, page_cnt, io_addr, NULL); + if (IS_ERR(req->fmr)) { + ret = PTR_ERR(req->fmr); + req->fmr = NULL; + goto out; } - /* If this is the first sg to go into the FMR, save our position. - * We need to know the first unmapped entry, its index, and the - * first unmapped address within that entry to be able to restart - * mapping after an error. - */ - if (!state->unmapped_sg) - srp_map_update_start(state, sg, sg_index, dma_addr); - - while (dma_len) { - if (state->npages == SRP_FMR_SIZE) { - ret = srp_map_finish_fmr(state, target); - if (ret) - return ret; - - srp_map_update_start(state, sg, sg_index, dma_addr); - } + buf->va = cpu_to_be64(ib_sg_dma_address(ibdev, &scat[0]) & + ~dev->fmr_page_mask); + buf->key = cpu_to_be32(req->fmr->fmr->rkey); + buf->len = cpu_to_be32(len); - len = min_t(unsigned int, dma_len, dev->fmr_page_size); + ret = 0; - if (!state->npages) - state->base_dma_addr = dma_addr; - state->pages[state->npages++] = dma_addr; - state->fmr_len += len; - dma_addr += len; - dma_len -= len; - } +out: + kfree(dma_pages); - /* If the last entry of the FMR wasn't a full page, then we need to - * close it out and start a new one -- we can only merge at page - * boundries. 
- */ - ret = 0; - if (len != dev->fmr_page_size) { - ret = srp_map_finish_fmr(state, target); - if (!ret) - srp_map_update_start(state, NULL, 0, 0); - } return ret; } static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, struct srp_request *req) { - struct scatterlist *scat, *sg; + struct scatterlist *scat; struct srp_cmd *cmd = req->cmd->buf; - int i, len, nents, count, use_fmr; + int len, nents, count; + u8 fmt = SRP_DATA_DESC_DIRECT; struct srp_device *dev; struct ib_device *ibdev; - struct srp_map_state state; - struct srp_indirect_buf *indirect_hdr; - u32 table_len; - u8 fmt; if (!scsi_sglist(scmnd) || scmnd->sc_data_direction == DMA_NONE) return sizeof (struct srp_cmd); @@ -866,8 +735,6 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, ibdev = dev->dev; count = ib_dma_map_sg(ibdev, scat, nents, scmnd->sc_data_direction); - if (unlikely(count == 0)) - return -EIO; fmt = SRP_DATA_DESC_DIRECT; len = sizeof (struct srp_cmd) + sizeof (struct srp_direct_buf); @@ -882,101 +749,51 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_target_port *target, struct srp_direct_buf *buf = (void *) cmd->add_data; buf->va = cpu_to_be64(ib_sg_dma_address(ibdev, scat)); - buf->key = cpu_to_be32(target->rkey); + buf->key = cpu_to_be32(dev->mr->rkey); buf->len = cpu_to_be32(ib_sg_dma_len(ibdev, scat)); - - req->nfmr = 0; - goto map_complete; - } - - /* We have more than one scatter/gather entry, so build our indirect - * descriptor table, trying to merge as many entries with FMR as we - * can. - */ - indirect_hdr = (void *) cmd->add_data; - - ib_dma_sync_single_for_cpu(ibdev, req->indirect_dma_addr, - target->indirect_size, DMA_TO_DEVICE); - - memset(&state, 0, sizeof(state)); - state.desc = req->indirect_desc; - state.pages = req->map_page; - state.next_fmr = req->fmr_list; - - use_fmr = dev->fmr_pool ? SRP_MAP_ALLOW_FMR : SRP_MAP_NO_FMR; - - for_each_sg(scat, sg, count, i) { - if (srp_map_sg_entry(&state, target, sg, i, use_fmr)) { - /* FMR mapping failed, so backtrack to the first - * unmapped entry and continue on without using FMR. - */ - dma_addr_t dma_addr; - unsigned int dma_len; - -backtrack: - sg = state.unmapped_sg; - i = state.unmapped_index; - - dma_addr = ib_sg_dma_address(ibdev, sg); - dma_len = ib_sg_dma_len(ibdev, sg); - dma_len -= (state.unmapped_addr - dma_addr); - dma_addr = state.unmapped_addr; - use_fmr = SRP_MAP_NO_FMR; - srp_map_desc(&state, dma_addr, dma_len, target->rkey); + } else if (srp_map_fmr(target, scat, count, req, + (void *) cmd->add_data)) { + /* + * FMR mapping failed, and the scatterlist has more + * than one entry. Generate an indirect memory + * descriptor. 
+ */ + struct srp_indirect_buf *buf = (void *) cmd->add_data; + struct scatterlist *sg; + u32 datalen = 0; + int i; + + fmt = SRP_DATA_DESC_INDIRECT; + len = sizeof (struct srp_cmd) + + sizeof (struct srp_indirect_buf) + + count * sizeof (struct srp_direct_buf); + + scsi_for_each_sg(scmnd, sg, count, i) { + unsigned int dma_len = ib_sg_dma_len(ibdev, sg); + + buf->desc_list[i].va = + cpu_to_be64(ib_sg_dma_address(ibdev, sg)); + buf->desc_list[i].key = + cpu_to_be32(dev->mr->rkey); + buf->desc_list[i].len = cpu_to_be32(dma_len); + datalen += dma_len; } - } - if (use_fmr == SRP_MAP_ALLOW_FMR && srp_map_finish_fmr(&state, target)) - goto backtrack; + if (scmnd->sc_data_direction == DMA_TO_DEVICE) + cmd->data_out_desc_cnt = count; + else + cmd->data_in_desc_cnt = count; - /* We've mapped the request, now pull as much of the indirect - * descriptor table as we can into the command buffer. If this - * target is not using an external indirect table, we are - * guaranteed to fit into the command, as the SCSI layer won't - * give us more S/G entries than we allow. - */ - req->nfmr = state.nfmr; - if (state.ndesc == 1) { - /* FMR mapping was able to collapse this to one entry, - * so use a direct descriptor. - */ - struct srp_direct_buf *buf = (void *) cmd->add_data; + buf->table_desc.va = + cpu_to_be64(req->cmd->dma + sizeof *cmd + sizeof *buf); + buf->table_desc.key = + cpu_to_be32(target->srp_host->srp_dev->mr->rkey); + buf->table_desc.len = + cpu_to_be32(count * sizeof (struct srp_direct_buf)); - *buf = req->indirect_desc[0]; - goto map_complete; - } - - if (unlikely(target->cmd_sg_cnt < state.ndesc && - !target->allow_ext_sg)) { - shost_printk(KERN_ERR, target->scsi_host, - "Could not fit S/G list into SRP_CMD\n"); - return -EIO; + buf->len = cpu_to_be32(datalen); } - count = min(state.ndesc, target->cmd_sg_cnt); - table_len = state.ndesc * sizeof (struct srp_direct_buf); - - fmt = SRP_DATA_DESC_INDIRECT; - len = sizeof(struct srp_cmd) + sizeof (struct srp_indirect_buf); - len += count * sizeof (struct srp_direct_buf); - - memcpy(indirect_hdr->desc_list, req->indirect_desc, - count * sizeof (struct srp_direct_buf)); - - indirect_hdr->table_desc.va = cpu_to_be64(req->indirect_dma_addr); - indirect_hdr->table_desc.key = cpu_to_be32(target->rkey); - indirect_hdr->table_desc.len = cpu_to_be32(table_len); - indirect_hdr->len = cpu_to_be32(state.total_len); - - if (scmnd->sc_data_direction == DMA_TO_DEVICE) - cmd->data_out_desc_cnt = count; - else - cmd->data_in_desc_cnt = count; - - ib_dma_sync_single_for_device(ibdev, req->indirect_dma_addr, table_len, - DMA_TO_DEVICE); - -map_complete: if (scmnd->sc_data_direction == DMA_TO_DEVICE) cmd->buf_fmt = fmt << 4; else @@ -985,126 +802,33 @@ map_complete: return len; } -/* - * Return an IU and possible credit to the free pool - */ -static void srp_put_tx_iu(struct srp_target_port *target, struct srp_iu *iu, - enum srp_iu_type iu_type) +static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) { + struct srp_request *req; + struct scsi_cmnd *scmnd; unsigned long flags; + s32 delta; - spin_lock_irqsave(&target->lock, flags); - list_add(&iu->list, &target->free_tx); - if (iu_type != SRP_IU_RSP) - ++target->req_lim; - spin_unlock_irqrestore(&target->lock, flags); -} - -/* - * Must be called with target->lock held to protect req_lim and free_tx. - * If IU is not sent, it must be returned using srp_put_tx_iu(). 
- * - * Note: - * An upper limit for the number of allocated information units for each - * request type is: - * - SRP_IU_CMD: SRP_CMD_SQ_SIZE, since the SCSI mid-layer never queues - * more than Scsi_Host.can_queue requests. - * - SRP_IU_TSK_MGMT: SRP_TSK_MGMT_SQ_SIZE. - * - SRP_IU_RSP: 1, since a conforming SRP target never sends more than - * one unanswered SRP request to an initiator. - */ -static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target, - enum srp_iu_type iu_type) -{ - s32 rsv = (iu_type == SRP_IU_TSK_MGMT) ? 0 : SRP_TSK_MGMT_SQ_SIZE; - struct srp_iu *iu; + delta = (s32) be32_to_cpu(rsp->req_lim_delta); - srp_send_completion(target->send_cq, target); + spin_lock_irqsave(target->scsi_host->host_lock, flags); - if (list_empty(&target->free_tx)) - return NULL; + target->req_lim += delta; - /* Initiator responses to target requests do not consume credits */ - if (iu_type != SRP_IU_RSP) { - if (target->req_lim <= rsv) { - ++target->zero_req_lim; - return NULL; - } - - --target->req_lim; - } - - iu = list_first_entry(&target->free_tx, struct srp_iu, list); - list_del(&iu->list); - return iu; -} - -static int srp_post_send(struct srp_target_port *target, - struct srp_iu *iu, int len) -{ - struct ib_sge list; - struct ib_send_wr wr, *bad_wr; - - list.addr = iu->dma; - list.length = len; - list.lkey = target->lkey; - - wr.next = NULL; - wr.wr_id = (uintptr_t) iu; - wr.sg_list = &list; - wr.num_sge = 1; - wr.opcode = IB_WR_SEND; - wr.send_flags = IB_SEND_SIGNALED; - - return ib_post_send(target->qp, &wr, &bad_wr); -} - -static int srp_post_recv(struct srp_target_port *target, struct srp_iu *iu) -{ - struct ib_recv_wr wr, *bad_wr; - struct ib_sge list; - - list.addr = iu->dma; - list.length = iu->size; - list.lkey = target->lkey; - - wr.next = NULL; - wr.wr_id = (uintptr_t) iu; - wr.sg_list = &list; - wr.num_sge = 1; - - return ib_post_recv(target->qp, &wr, &bad_wr); -} - -static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) -{ - struct srp_request *req; - struct scsi_cmnd *scmnd; - unsigned long flags; + req = &target->req_ring[rsp->tag & ~SRP_TAG_TSK_MGMT]; if (unlikely(rsp->tag & SRP_TAG_TSK_MGMT)) { - spin_lock_irqsave(&target->lock, flags); - target->req_lim += be32_to_cpu(rsp->req_lim_delta); - spin_unlock_irqrestore(&target->lock, flags); - - target->tsk_mgmt_status = -1; - if (be32_to_cpu(rsp->resp_data_len) >= 4) - target->tsk_mgmt_status = rsp->data[3]; - complete(&target->tsk_mgmt_done); + if (be32_to_cpu(rsp->resp_data_len) < 4) + req->tsk_status = -1; + else + req->tsk_status = rsp->data[3]; + complete(&req->done); } else { - req = &target->req_ring[rsp->tag]; - scmnd = srp_claim_req(target, req, NULL); - if (!scmnd) { + scmnd = req->scmnd; + if (!scmnd) shost_printk(KERN_ERR, target->scsi_host, "Null scmnd for RSP w/tag %016llx\n", (unsigned long long) rsp->tag); - - spin_lock_irqsave(&target->lock, flags); - target->req_lim += be32_to_cpu(rsp->req_lim_delta); - spin_unlock_irqrestore(&target->lock, flags); - - return; - } scmnd->result = rsp->status; if (rsp->flags & SRP_RSP_FLAG_SNSVALID) { @@ -1119,205 +843,281 @@ static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) else if (rsp->flags & (SRP_RSP_FLAG_DIOVER | SRP_RSP_FLAG_DIUNDER)) scsi_set_resid(scmnd, be32_to_cpu(rsp->data_in_res_cnt)); - srp_free_req(target, req, scmnd, - be32_to_cpu(rsp->req_lim_delta)); + if (!req->tsk_mgmt) { + scmnd->host_scribble = (void *) -1L; + scmnd->scsi_done(scmnd); - scmnd->host_scribble = NULL; - 
scmnd->scsi_done(scmnd); + srp_remove_req(target, req); + } else + req->cmd_done = 1; } + + spin_unlock_irqrestore(target->scsi_host->host_lock, flags); } -static int srp_response_common(struct srp_target_port *target, s32 req_delta, - void *rsp, int len) +static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc) { - struct ib_device *dev = target->srp_host->srp_dev->dev; - unsigned long flags; + struct ib_device *dev; struct srp_iu *iu; - int err; + u8 opcode; - spin_lock_irqsave(&target->lock, flags); - target->req_lim += req_delta; - iu = __srp_get_tx_iu(target, SRP_IU_RSP); - spin_unlock_irqrestore(&target->lock, flags); + iu = target->rx_ring[wc->wr_id & ~SRP_OP_RECV]; + + dev = target->srp_host->srp_dev->dev; + ib_dma_sync_single_for_cpu(dev, iu->dma, target->max_ti_iu_len, + DMA_FROM_DEVICE); + + opcode = *(u8 *) iu->buf; - if (!iu) { - shost_printk(KERN_ERR, target->scsi_host, PFX - "no IU available to send response\n"); - return 1; + if (0) { + int i; + + shost_printk(KERN_ERR, target->scsi_host, + PFX "recv completion, opcode 0x%02x\n", opcode); + + for (i = 0; i < wc->byte_len; ++i) { + if (i % 8 == 0) + printk(KERN_ERR " [%02x] ", i); + printk(" %02x", ((u8 *) iu->buf)[i]); + if ((i + 1) % 8 == 0) + printk("\n"); + } + + if (wc->byte_len % 8) + printk("\n"); } - ib_dma_sync_single_for_cpu(dev, iu->dma, len, DMA_TO_DEVICE); - memcpy(iu->buf, rsp, len); - ib_dma_sync_single_for_device(dev, iu->dma, len, DMA_TO_DEVICE); + switch (opcode) { + case SRP_RSP: + srp_process_rsp(target, iu->buf); + break; - err = srp_post_send(target, iu, len); - if (err) { - shost_printk(KERN_ERR, target->scsi_host, PFX - "unable to post response: %d\n", err); - srp_put_tx_iu(target, iu, SRP_IU_RSP); + case SRP_T_LOGOUT: + /* XXX Handle target logout */ + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Got target logout request\n"); + break; + + default: + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled SRP opcode 0x%02x\n", opcode); + break; } - return err; + ib_dma_sync_single_for_device(dev, iu->dma, target->max_ti_iu_len, + DMA_FROM_DEVICE); } -static void srp_process_cred_req(struct srp_target_port *target, - struct srp_cred_req *req) +static void srp_reconnect_work(struct work_struct *work) { - struct srp_cred_rsp rsp = { - .opcode = SRP_CRED_RSP, - .tag = req->tag, - }; - s32 delta = be32_to_cpu(req->req_lim_delta); + struct srp_target_port *target = + container_of(work, struct srp_target_port, work); - if (srp_response_common(target, delta, &rsp, sizeof rsp)) - shost_printk(KERN_ERR, target->scsi_host, PFX - "problems processing SRP_CRED_REQ\n"); + srp_reconnect_target(target); + spin_lock_irq(target->scsi_host->host_lock); + target->work_in_progress = 0; + spin_unlock_irq(target->scsi_host->host_lock); } -static void srp_process_aer_req(struct srp_target_port *target, - struct srp_aer_req *req) +static void srp_qp_in_err_timer(unsigned long data) { - struct srp_aer_rsp rsp = { - .opcode = SRP_AER_RSP, - .tag = req->tag, - }; - s32 delta = be32_to_cpu(req->req_lim_delta); + struct srp_target_port *target = (struct srp_target_port *)data; + struct srp_request *req, *tmp; + + if (target->state != SRP_TARGET_LIVE) + return; - shost_printk(KERN_ERR, target->scsi_host, PFX - "ignoring AER for LUN %llu\n", be64_to_cpu(req->lun)); + spin_lock_irq(target->scsi_host->host_lock); + list_for_each_entry_safe(req, tmp, &target->req_queue, list) + srp_reset_req(target, req, DID_RESET); + spin_unlock_irq(target->scsi_host->host_lock); - if (srp_response_common(target, 
delta, &rsp, sizeof rsp)) - shost_printk(KERN_ERR, target->scsi_host, PFX - "problems processing SRP_AER_REQ\n"); + spin_lock_irq(target->scsi_host->host_lock); + if (!target->work_in_progress) { + target->work_in_progress = 1; + INIT_WORK(&target->work, srp_reconnect_work); + schedule_work(&target->work); + } + spin_unlock_irq(target->scsi_host->host_lock); } -static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc) +static void srp_qp_err_add_timer(struct srp_target_port *target, int time) { - struct ib_device *dev = target->srp_host->srp_dev->dev; - struct srp_iu *iu = (struct srp_iu *) (uintptr_t) wc->wr_id; - int res; - u8 opcode; + if (!timer_pending(&target->qp_err_timer)) { + setup_timer(&target->qp_err_timer, + srp_qp_in_err_timer, + (unsigned long)target); + target->qp_err_timer.expires = round_jiffies(time*HZ + jiffies); + add_timer(&target->qp_err_timer); + } +} - ib_dma_sync_single_for_cpu(dev, iu->dma, target->max_ti_iu_len, - DMA_FROM_DEVICE); +static void srp_completion(struct ib_cq *cq, void *target_ptr) +{ + struct srp_target_port *target = target_ptr; + struct ib_wc wc; - opcode = *(u8 *) iu->buf; + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + while (ib_poll_cq(cq, 1, &wc) > 0) { + if (wc.status) { + unsigned long flags; - if (0) { - shost_printk(KERN_ERR, target->scsi_host, - PFX "recv completion, opcode 0x%02x\n", opcode); - print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 8, 1, - iu->buf, wc->byte_len, true); + shost_printk(KERN_ERR, target->scsi_host, + PFX "failed %s status %d\n", + wc.wr_id & SRP_OP_RECV ? "receive" : "send", + wc.status); + spin_lock_irqsave(target->scsi_host->host_lock, flags); + if (!target->qp_in_error && + target->state == SRP_TARGET_LIVE) { + target->qp_in_error = 1; + srp_qp_err_add_timer(target, + (srp_dev_loss_tmo > + SRP_CONN_ERR_TIMEOUT) ? 
+ (srp_dev_loss_tmo - + SRP_CONN_ERR_TIMEOUT) : + 1); + } + spin_unlock_irqrestore(target->scsi_host->host_lock, flags); + break; + } + + if (wc.wr_id & SRP_OP_RECV) + srp_handle_recv(target, &wc); + else + ++target->tx_tail; } +} - switch (opcode) { - case SRP_RSP: - srp_process_rsp(target, iu->buf); - break; +static int __srp_post_recv(struct srp_target_port *target) +{ + struct srp_iu *iu; + struct ib_sge list; + struct ib_recv_wr wr, *bad_wr; + unsigned int next; + int ret; - case SRP_CRED_REQ: - srp_process_cred_req(target, iu->buf); - break; + next = target->rx_head & (SRP_RQ_SIZE - 1); + wr.wr_id = next | SRP_OP_RECV; + iu = target->rx_ring[next]; - case SRP_AER_REQ: - srp_process_aer_req(target, iu->buf); - break; + list.addr = iu->dma; + list.length = iu->size; + list.lkey = target->srp_host->srp_dev->mr->lkey; - case SRP_T_LOGOUT: - /* XXX Handle target logout */ - shost_printk(KERN_WARNING, target->scsi_host, - PFX "Got target logout request\n"); - break; + wr.next = NULL; + wr.sg_list = &list; + wr.num_sge = 1; + + ret = ib_post_recv(target->qp, &wr, &bad_wr); + if (!ret) + ++target->rx_head; + + return ret; +} - default: - shost_printk(KERN_WARNING, target->scsi_host, - PFX "Unhandled SRP opcode 0x%02x\n", opcode); - break; - } +static int srp_post_recv(struct srp_target_port *target) +{ + unsigned long flags; + int ret; - ib_dma_sync_single_for_device(dev, iu->dma, target->max_ti_iu_len, - DMA_FROM_DEVICE); + spin_lock_irqsave(target->scsi_host->host_lock, flags); + ret = __srp_post_recv(target); + spin_unlock_irqrestore(target->scsi_host->host_lock, flags); - res = srp_post_recv(target, iu); - if (res != 0) - shost_printk(KERN_ERR, target->scsi_host, - PFX "Recv failed with error code %d\n", res); + return ret; } -static void srp_recv_completion(struct ib_cq *cq, void *target_ptr) +/* + * Must be called with target->scsi_host->host_lock held to protect + * req_lim and tx_head. Lock cannot be dropped between call here and + * call to __srp_post_send(). + */ +static struct srp_iu *__srp_get_tx_iu(struct srp_target_port *target, + enum srp_request_type req_type) { - struct srp_target_port *target = target_ptr; - struct ib_wc wc; + s32 min = (req_type == SRP_REQ_TASK_MGMT) ? 1 : 2; - ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - while (ib_poll_cq(cq, 1, &wc) > 0) { - if (wc.status) { - shost_printk(KERN_ERR, target->scsi_host, - PFX "failed receive status %d\n", - wc.status); - target->qp_in_error = 1; - break; - } + if (target->tx_head - target->tx_tail >= SRP_SQ_SIZE) + return NULL; - srp_handle_recv(target, &wc); + if (target->req_lim < min) { + ++target->zero_req_lim; + return NULL; } + + return target->tx_ring[target->tx_head & SRP_SQ_SIZE]; } -static void srp_send_completion(struct ib_cq *cq, void *target_ptr) +/* + * Must be called with target->scsi_host->host_lock held to protect + * req_lim and tx_head. 
+ */ +static int __srp_post_send(struct srp_target_port *target, + struct srp_iu *iu, int len) { - struct srp_target_port *target = target_ptr; - struct ib_wc wc; - struct srp_iu *iu; + struct ib_sge list; + struct ib_send_wr wr, *bad_wr; + int ret = 0; - while (ib_poll_cq(cq, 1, &wc) > 0) { - if (wc.status) { - shost_printk(KERN_ERR, target->scsi_host, - PFX "failed send status %d\n", - wc.status); - target->qp_in_error = 1; - break; - } + list.addr = iu->dma; + list.length = len; + list.lkey = target->srp_host->srp_dev->mr->lkey; + + wr.next = NULL; + wr.wr_id = target->tx_head & SRP_SQ_SIZE; + wr.sg_list = &list; + wr.num_sge = 1; + wr.opcode = IB_WR_SEND; + wr.send_flags = IB_SEND_SIGNALED; - iu = (struct srp_iu *) (uintptr_t) wc.wr_id; - list_add(&iu->list, &target->free_tx); + ret = ib_post_send(target->qp, &wr, &bad_wr); + + if (!ret) { + ++target->tx_head; + --target->req_lim; } + + return ret; } -static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) +static int srp_queuecommand(struct scsi_cmnd *scmnd, + void (*done)(struct scsi_cmnd *)) { - struct srp_target_port *target = host_to_target(shost); + struct srp_target_port *target = host_to_target(scmnd->device->host); struct srp_request *req; struct srp_iu *iu; struct srp_cmd *cmd; struct ib_device *dev; - unsigned long flags; int len; - if (target->state == SRP_TARGET_CONNECTING) + if (target->state == SRP_TARGET_CONNECTING || + target->qp_in_error) goto err; if (target->state == SRP_TARGET_DEAD || - target->state == SRP_TARGET_REMOVED) { - scmnd->result = DID_BAD_TARGET << 16; - scmnd->scsi_done(scmnd); + target->state == SRP_TARGET_REMOVED || + target->scsi_id != scmnd->device->id) { + if (target->scsi_id != scmnd->device->id) + scmnd->result = DID_BAD_TARGET << 16; + else + scmnd->result = DID_NO_CONNECT << 16; + done(scmnd); return 0; } - spin_lock_irqsave(&target->lock, flags); - iu = __srp_get_tx_iu(target, SRP_IU_CMD); + iu = __srp_get_tx_iu(target, SRP_REQ_NORMAL); if (!iu) - goto err_unlock; - - req = list_first_entry(&target->free_reqs, struct srp_request, list); - list_del(&req->list); - spin_unlock_irqrestore(&target->lock, flags); + goto err; dev = target->srp_host->srp_dev->dev; - ib_dma_sync_single_for_cpu(dev, iu->dma, target->max_iu_len, + ib_dma_sync_single_for_cpu(dev, iu->dma, srp_max_iu_len, DMA_TO_DEVICE); + req = list_entry(target->free_reqs.next, struct srp_request, list); + + scmnd->scsi_done = done; scmnd->result = 0; - scmnd->host_scribble = (void *) req; + scmnd->host_scribble = (void *) (long) req->index; cmd = iu->buf; memset(cmd, 0, sizeof *cmd); @@ -1329,36 +1129,36 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) req->scmnd = scmnd; req->cmd = iu; + req->cmd_done = 0; + req->tsk_mgmt = NULL; len = srp_map_data(scmnd, target, req); if (len < 0) { shost_printk(KERN_ERR, target->scsi_host, PFX "Failed to map data\n"); - goto err_iu; + goto err; } - ib_dma_sync_single_for_device(dev, iu->dma, target->max_iu_len, + if (__srp_post_recv(target)) { + shost_printk(KERN_ERR, target->scsi_host, PFX "Recv failed\n"); + goto err_unmap; + } + + ib_dma_sync_single_for_device(dev, iu->dma, srp_max_iu_len, DMA_TO_DEVICE); - if (srp_post_send(target, iu, len)) { + if (__srp_post_send(target, iu, len)) { shost_printk(KERN_ERR, target->scsi_host, PFX "Send failed\n"); goto err_unmap; } + list_move_tail(&req->list, &target->req_queue); + return 0; err_unmap: srp_unmap_data(scmnd, target, req); -err_iu: - srp_put_tx_iu(target, iu, SRP_IU_CMD); - - 
spin_lock_irqsave(&target->lock, flags); - list_add(&req->list, &target->free_reqs); - -err_unlock: - spin_unlock_irqrestore(&target->lock, flags); - err: return SCSI_MLQUEUE_HOST_BUSY; } @@ -1375,14 +1175,12 @@ static int srp_alloc_iu_bufs(struct srp_target_port *target) goto err; } - for (i = 0; i < SRP_SQ_SIZE; ++i) { + for (i = 0; i < SRP_SQ_SIZE + 1; ++i) { target->tx_ring[i] = srp_alloc_iu(target->srp_host, - target->max_iu_len, + srp_max_iu_len, GFP_KERNEL, DMA_TO_DEVICE); if (!target->tx_ring[i]) goto err; - - list_add(&target->tx_ring[i]->list, &target->free_tx); } return 0; @@ -1393,7 +1191,7 @@ err: target->rx_ring[i] = NULL; } - for (i = 0; i < SRP_SQ_SIZE; ++i) { + for (i = 0; i < SRP_SQ_SIZE + 1; ++i) { srp_free_iu(target->srp_host, target->tx_ring[i]); target->tx_ring[i] = NULL; } @@ -1401,78 +1199,6 @@ err: return -ENOMEM; } -static void srp_cm_rep_handler(struct ib_cm_id *cm_id, - struct srp_login_rsp *lrsp, - struct srp_target_port *target) -{ - struct ib_qp_attr *qp_attr = NULL; - int attr_mask = 0; - int ret; - int i; - - if (lrsp->opcode == SRP_LOGIN_RSP) { - target->max_ti_iu_len = be32_to_cpu(lrsp->max_ti_iu_len); - target->req_lim = be32_to_cpu(lrsp->req_lim_delta); - - /* - * Reserve credits for task management so we don't - * bounce requests back to the SCSI mid-layer. - */ - target->scsi_host->can_queue - = min(target->req_lim - SRP_TSK_MGMT_SQ_SIZE, - target->scsi_host->can_queue); - } else { - shost_printk(KERN_WARNING, target->scsi_host, - PFX "Unhandled RSP opcode %#x\n", lrsp->opcode); - ret = -ECONNRESET; - goto error; - } - - if (!target->rx_ring[0]) { - ret = srp_alloc_iu_bufs(target); - if (ret) - goto error; - } - - ret = -ENOMEM; - qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL); - if (!qp_attr) - goto error; - - qp_attr->qp_state = IB_QPS_RTR; - ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); - if (ret) - goto error_free; - - ret = ib_modify_qp(target->qp, qp_attr, attr_mask); - if (ret) - goto error_free; - - for (i = 0; i < SRP_RQ_SIZE; i++) { - struct srp_iu *iu = target->rx_ring[i]; - ret = srp_post_recv(target, iu); - if (ret) - goto error_free; - } - - qp_attr->qp_state = IB_QPS_RTS; - ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); - if (ret) - goto error_free; - - ret = ib_modify_qp(target->qp, qp_attr, attr_mask); - if (ret) - goto error_free; - - ret = ib_send_cm_rtu(cm_id, NULL, 0); - -error_free: - kfree(qp_attr); - -error: - target->status = ret; -} - static void srp_cm_rej_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event, struct srp_target_port *target) @@ -1556,7 +1282,10 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id, static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) { struct srp_target_port *target = cm_id->context; + struct ib_qp_attr *qp_attr = NULL; + int attr_mask = 0; int comp = 0; + int opcode = 0; switch (event->event) { case IB_CM_REQ_ERROR: @@ -1568,7 +1297,61 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) case IB_CM_REP_RECEIVED: comp = 1; - srp_cm_rep_handler(cm_id, event->private_data, target); + opcode = *(u8 *) event->private_data; + + if (opcode == SRP_LOGIN_RSP) { + struct srp_login_rsp *rsp = event->private_data; + + target->max_ti_iu_len = be32_to_cpu(rsp->max_ti_iu_len); + target->req_lim = be32_to_cpu(rsp->req_lim_delta); + + target->scsi_host->can_queue = min(target->req_lim, + target->scsi_host->can_queue); + } else { + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled RSP opcode %#x\n", opcode); + 
target->status = -ECONNRESET; + break; + } + + if (!target->rx_ring[0]) { + target->status = srp_alloc_iu_bufs(target); + if (target->status) + break; + } + + qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL); + if (!qp_attr) { + target->status = -ENOMEM; + break; + } + + qp_attr->qp_state = IB_QPS_RTR; + target->status = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); + if (target->status) + break; + + target->status = ib_modify_qp(target->qp, qp_attr, attr_mask); + if (target->status) + break; + + target->status = srp_post_recv(target); + if (target->status) + break; + + qp_attr->qp_state = IB_QPS_RTS; + target->status = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); + if (target->status) + break; + + target->status = ib_modify_qp(target->qp, qp_attr, attr_mask); + if (target->status) + break; + + target->status = ib_send_cm_rtu(cm_id, NULL, 0); + if (target->status) + break; + break; case IB_CM_REJ_RECEIVED: @@ -1590,7 +1373,6 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) shost_printk(KERN_ERR, target->scsi_host, PFX "connection closed\n"); - comp = 1; target->status = 0; break; @@ -1608,92 +1390,124 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) if (comp) complete(&target->done); + kfree(qp_attr); + return 0; } static int srp_send_tsk_mgmt(struct srp_target_port *target, - u64 req_tag, unsigned int lun, u8 func) + struct srp_request *req, u8 func) { - struct ib_device *dev = target->srp_host->srp_dev->dev; struct srp_iu *iu; struct srp_tsk_mgmt *tsk_mgmt; - if (target->state == SRP_TARGET_DEAD || - target->state == SRP_TARGET_REMOVED) - return -1; + spin_lock_irq(target->scsi_host->host_lock); - init_completion(&target->tsk_mgmt_done); + if (target->state == SRP_TARGET_DEAD || + target->state == SRP_TARGET_REMOVED) { + req->scmnd->result = DID_BAD_TARGET << 16; + goto out; + } - spin_lock_irq(&target->lock); - iu = __srp_get_tx_iu(target, SRP_IU_TSK_MGMT); - spin_unlock_irq(&target->lock); + init_completion(&req->done); + iu = __srp_get_tx_iu(target, SRP_REQ_TASK_MGMT); if (!iu) - return -1; + goto out; - ib_dma_sync_single_for_cpu(dev, iu->dma, sizeof *tsk_mgmt, - DMA_TO_DEVICE); tsk_mgmt = iu->buf; memset(tsk_mgmt, 0, sizeof *tsk_mgmt); tsk_mgmt->opcode = SRP_TSK_MGMT; - tsk_mgmt->lun = cpu_to_be64((u64) lun << 48); - tsk_mgmt->tag = req_tag | SRP_TAG_TSK_MGMT; + tsk_mgmt->lun = cpu_to_be64((u64) req->scmnd->device->lun << 48); + tsk_mgmt->tag = req->index | SRP_TAG_TSK_MGMT; tsk_mgmt->tsk_mgmt_func = func; - tsk_mgmt->task_tag = req_tag; + tsk_mgmt->task_tag = req->index; - ib_dma_sync_single_for_device(dev, iu->dma, sizeof *tsk_mgmt, - DMA_TO_DEVICE); - if (srp_post_send(target, iu, sizeof *tsk_mgmt)) { - srp_put_tx_iu(target, iu, SRP_IU_TSK_MGMT); - return -1; - } + if (__srp_post_send(target, iu, sizeof *tsk_mgmt)) + goto out; + + req->tsk_mgmt = iu; - if (!wait_for_completion_timeout(&target->tsk_mgmt_done, + spin_unlock_irq(target->scsi_host->host_lock); + + if (!wait_for_completion_timeout(&req->done, msecs_to_jiffies(SRP_ABORT_TIMEOUT_MS))) return -1; return 0; + +out: + spin_unlock_irq(target->scsi_host->host_lock); + return -1; +} + +static int srp_find_req(struct srp_target_port *target, + struct scsi_cmnd *scmnd, + struct srp_request **req) +{ + if (scmnd->host_scribble == (void *) -1L) + return -1; + + *req = &target->req_ring[(long) scmnd->host_scribble]; + + return 0; } static int srp_abort(struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(scmnd->device->host); - struct 
srp_request *req = (struct srp_request *) scmnd->host_scribble; + struct srp_request *req; + int ret = SUCCESS; shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n"); - if (!req || target->qp_in_error || !srp_claim_req(target, req, scmnd)) + if (target->qp_in_error) + return FAILED; + if (srp_find_req(target, scmnd, &req)) + return FAILED; + if (srp_send_tsk_mgmt(target, req, SRP_TSK_ABORT_TASK)) return FAILED; - srp_send_tsk_mgmt(target, req->index, scmnd->device->lun, - SRP_TSK_ABORT_TASK); - srp_free_req(target, req, scmnd, 0); - scmnd->result = DID_ABORT << 16; - scmnd->scsi_done(scmnd); - return SUCCESS; + spin_lock_irq(target->scsi_host->host_lock); + + if (req->cmd_done) { + srp_remove_req(target, req); + scmnd->scsi_done(scmnd); + } else if (!req->tsk_status) { + srp_remove_req(target, req); + scmnd->result = DID_ABORT << 16; + } else + ret = FAILED; + + spin_unlock_irq(target->scsi_host->host_lock); + + return ret; } static int srp_reset_device(struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(scmnd->device->host); - int i; + struct srp_request *req, *tmp; shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n"); if (target->qp_in_error) return FAILED; - if (srp_send_tsk_mgmt(target, SRP_TAG_NO_REQ, scmnd->device->lun, - SRP_TSK_LUN_RESET)) + if (srp_find_req(target, scmnd, &req)) + return FAILED; + if (srp_send_tsk_mgmt(target, req, SRP_TSK_LUN_RESET)) return FAILED; - if (target->tsk_mgmt_status) + if (req->tsk_status) return FAILED; - for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { - struct srp_request *req = &target->req_ring[i]; - if (req->scmnd && req->scmnd->device == scmnd->device) - srp_reset_req(target, req); - } + spin_lock_irq(target->scsi_host->host_lock); + + list_for_each_entry_safe(req, tmp, &target->req_queue, list) + if (req->scmnd->device == scmnd->device) + srp_reset_req(target, req, DID_RESET); + + spin_unlock_irq(target->scsi_host->host_lock); return SUCCESS; } @@ -1701,9 +1515,22 @@ static int srp_reset_device(struct scsi_cmnd *scmnd) static int srp_reset_host(struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(scmnd->device->host); + struct srp_request *req, *tmp; int ret = FAILED; - shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n"); + shost_printk(KERN_ERR, target->scsi_host, + PFX "SRP reset_host called state %d qp_err %d\n", + target->state, target->qp_in_error); + + spin_lock_irq(target->scsi_host->host_lock); + if (timer_pending(&target->qp_err_timer) || target->qp_in_error || + target->state != SRP_TARGET_LIVE) { + list_for_each_entry_safe(req, tmp, &target->req_queue, list) + srp_reset_req(target, req, DID_RESET); + spin_unlock_irq(target->scsi_host->host_lock); + return SUCCESS; + } + spin_unlock_irq(target->scsi_host->host_lock); if (!srp_reconnect_target(target)) ret = SUCCESS; @@ -1786,18 +1613,6 @@ static ssize_t show_orig_dgid(struct device *dev, return sprintf(buf, "%pI6\n", target->orig_dgid); } -static ssize_t show_req_lim(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct srp_target_port *target = host_to_target(class_to_shost(dev)); - - if (target->state == SRP_TARGET_DEAD || - target->state == SRP_TARGET_REMOVED) - return -ENODEV; - - return sprintf(buf, "%d\n", target->req_lim); -} - static ssize_t show_zero_req_lim(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1826,20 +1641,53 @@ static ssize_t show_local_ib_device(struct device *dev, return sprintf(buf, "%s\n", target->srp_host->srp_dev->dev->name); } 
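The srp_queuecommand/__srp_get_tx_iu/__srp_post_send hunks above reintroduce tx_head/tx_tail counters plus req_lim credits guarded by the SCSI host lock. A minimal userspace sketch of that accounting rule follows; it is not part of the patch, the struct and helper names are invented for illustration, and only the "ring not full, and 1 credit for task management or 2 for a normal command" check is modeled.

/*
 * Illustrative, userspace-only model of the tx accounting in this patch:
 * a slot is available only if (tx_head - tx_tail) < SRP_SQ_SIZE and
 * req_lim still holds enough credits (1 for task management, 2 for a
 * normal command), which keeps one credit in reserve for task management.
 */
#include <stdint.h>
#include <stdio.h>

#define SRP_RQ_SHIFT 6
#define SRP_RQ_SIZE  (1 << SRP_RQ_SHIFT)      /* 64 */
#define SRP_SQ_SIZE  (SRP_RQ_SIZE - 1)        /* 63 */

struct tx_ring_model {
	unsigned tx_head;   /* producer counter, wraps naturally */
	unsigned tx_tail;   /* advanced by send completions */
	int32_t  req_lim;   /* credits granted by the target */
};

/* Mirrors the "1 credit for TSK_MGMT, 2 for a normal request" rule. */
static int can_get_tx_slot(const struct tx_ring_model *r, int is_tsk_mgmt)
{
	int32_t min = is_tsk_mgmt ? 1 : 2;

	if (r->tx_head - r->tx_tail >= SRP_SQ_SIZE)
		return 0;                     /* send queue full */
	if (r->req_lim < min)
		return 0;                     /* out of credits */
	return 1;
}

static void post_send(struct tx_ring_model *r)
{
	++r->tx_head;                         /* as in __srp_post_send() */
	--r->req_lim;
}

int main(void)
{
	struct tx_ring_model r = { .tx_head = 0, .tx_tail = 0, .req_lim = 4 };
	int sent = 0;

	while (can_get_tx_slot(&r, 0)) {      /* normal requests */
		post_send(&r);
		++sent;
	}
	/* With req_lim = 4 this stops after 3 sends: one credit is reserved. */
	printf("sent %d normal requests, req_lim now %d\n", sent, r.req_lim);
	printf("task mgmt still possible: %d\n", can_get_tx_slot(&r, 1));
	return 0;
}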
-static ssize_t show_cmd_sg_entries(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t srp_target_oofabric(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) { struct srp_target_port *target = host_to_target(class_to_shost(dev)); - return sprintf(buf, "%u\n", target->cmd_sg_cnt); + shost_printk(KERN_DEBUG, target->scsi_host, PFX + "Get async_event out-of-fabric at state=%d qp_err=%d\n", + target->state, target->qp_in_error); + + if (target->state != SRP_TARGET_LIVE) + return -EINVAL; + + spin_lock_irq(target->scsi_host->host_lock); + if (!target->qp_in_error) + srp_qp_err_add_timer(target, srp_dev_loss_tmo); + spin_unlock_irq(target->scsi_host->host_lock); + + return count; } -static ssize_t show_allow_ext_sg(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t srp_target_infabric(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) { struct srp_target_port *target = host_to_target(class_to_shost(dev)); - return sprintf(buf, "%s\n", target->allow_ext_sg ? "true" : "false"); + shost_printk(KERN_DEBUG, target->scsi_host, PFX + "Get async_event in-fabric at state=%d qp_err=%d\n", + target->state, target->qp_in_error); + + spin_lock_irq(target->scsi_host->host_lock); + if (timer_pending(&target->qp_err_timer) + && target->qp_in_error) { + shost_printk(KERN_WARNING PFX, target->scsi_host, + "delete qp_in_err timer\n"); + del_timer(&target->qp_err_timer); + if (target->state == SRP_TARGET_LIVE && + !target->work_in_progress) { + target->work_in_progress = 1; + INIT_WORK(&target->work, srp_reconnect_work); + schedule_work(&target->work); + } + } + spin_unlock_irq(target->scsi_host->host_lock); + + return count; } static DEVICE_ATTR(id_ext, S_IRUGO, show_id_ext, NULL); @@ -1848,12 +1696,11 @@ static DEVICE_ATTR(service_id, S_IRUGO, show_service_id, NULL); static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL); static DEVICE_ATTR(dgid, S_IRUGO, show_dgid, NULL); static DEVICE_ATTR(orig_dgid, S_IRUGO, show_orig_dgid, NULL); -static DEVICE_ATTR(req_lim, S_IRUGO, show_req_lim, NULL); static DEVICE_ATTR(zero_req_lim, S_IRUGO, show_zero_req_lim, NULL); static DEVICE_ATTR(local_ib_port, S_IRUGO, show_local_ib_port, NULL); static DEVICE_ATTR(local_ib_device, S_IRUGO, show_local_ib_device, NULL); -static DEVICE_ATTR(cmd_sg_entries, S_IRUGO, show_cmd_sg_entries, NULL); -static DEVICE_ATTR(allow_ext_sg, S_IRUGO, show_allow_ext_sg, NULL); +static DEVICE_ATTR(target_oofabric, S_IWUSR, NULL, srp_target_oofabric); +static DEVICE_ATTR(target_infabric, S_IWUSR, NULL, srp_target_infabric); static struct device_attribute *srp_host_attrs[] = { &dev_attr_id_ext, @@ -1862,12 +1709,11 @@ static struct device_attribute *srp_host_attrs[] = { &dev_attr_pkey, &dev_attr_dgid, &dev_attr_orig_dgid, - &dev_attr_req_lim, &dev_attr_zero_req_lim, &dev_attr_local_ib_port, &dev_attr_local_ib_device, - &dev_attr_cmd_sg_entries, - &dev_attr_allow_ext_sg, + &dev_attr_target_oofabric, + &dev_attr_target_infabric, NULL }; @@ -1880,10 +1726,9 @@ static struct scsi_host_template srp_template = { .eh_abort_handler = srp_abort, .eh_device_reset_handler = srp_reset_device, .eh_host_reset_handler = srp_reset_host, - .sg_tablesize = SRP_DEF_SG_TABLESIZE, - .can_queue = SRP_CMD_SQ_SIZE, + .can_queue = SRP_SQ_SIZE, .this_id = -1, - .cmd_per_lun = SRP_CMD_SQ_SIZE, + .cmd_per_lun = SRP_SQ_SIZE, .use_clustering = ENABLE_CLUSTERING, .shost_attrs = srp_host_attrs }; @@ -1952,9 +1797,6 @@ enum { SRP_OPT_MAX_CMD_PER_LUN = 1 << 6, 
SRP_OPT_IO_CLASS = 1 << 7, SRP_OPT_INITIATOR_EXT = 1 << 8, - SRP_OPT_CMD_SG_ENTRIES = 1 << 9, - SRP_OPT_ALLOW_EXT_SG = 1 << 10, - SRP_OPT_SG_TABLESIZE = 1 << 11, SRP_OPT_ALL = (SRP_OPT_ID_EXT | SRP_OPT_IOC_GUID | SRP_OPT_DGID | @@ -1972,9 +1814,6 @@ static const match_table_t srp_opt_tokens = { { SRP_OPT_MAX_CMD_PER_LUN, "max_cmd_per_lun=%d" }, { SRP_OPT_IO_CLASS, "io_class=%x" }, { SRP_OPT_INITIATOR_EXT, "initiator_ext=%s" }, - { SRP_OPT_CMD_SG_ENTRIES, "cmd_sg_entries=%u" }, - { SRP_OPT_ALLOW_EXT_SG, "allow_ext_sg=%u" }, - { SRP_OPT_SG_TABLESIZE, "sg_tablesize=%u" }, { SRP_OPT_ERR, NULL } }; @@ -2074,7 +1913,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) printk(KERN_WARNING PFX "bad max cmd_per_lun parameter '%s'\n", p); goto out; } - target->scsi_host->cmd_per_lun = min(token, SRP_CMD_SQ_SIZE); + target->scsi_host->cmd_per_lun = min(token, SRP_SQ_SIZE); break; case SRP_OPT_IO_CLASS: @@ -2102,31 +1941,6 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) kfree(p); break; - case SRP_OPT_CMD_SG_ENTRIES: - if (match_int(args, &token) || token < 1 || token > 255) { - printk(KERN_WARNING PFX "bad max cmd_sg_entries parameter '%s'\n", p); - goto out; - } - target->cmd_sg_cnt = token; - break; - - case SRP_OPT_ALLOW_EXT_SG: - if (match_int(args, &token)) { - printk(KERN_WARNING PFX "bad allow_ext_sg parameter '%s'\n", p); - goto out; - } - target->allow_ext_sg = !!token; - break; - - case SRP_OPT_SG_TABLESIZE: - if (match_int(args, &token) || token < 1 || - token > SCSI_MAX_SG_CHAIN_SEGMENTS) { - printk(KERN_WARNING PFX "bad max sg_tablesize parameter '%s'\n", p); - goto out; - } - target->sg_tablesize = token; - break; - default: printk(KERN_WARNING PFX "unknown parameter or missing value " "'%s' in target creation request\n", p); @@ -2157,75 +1971,36 @@ static ssize_t srp_create_target(struct device *dev, container_of(dev, struct srp_host, dev); struct Scsi_Host *target_host; struct srp_target_port *target; - struct ib_device *ibdev = host->srp_dev->dev; - dma_addr_t dma_addr; - int i, ret; + int ret; + int i; target_host = scsi_host_alloc(&srp_template, sizeof (struct srp_target_port)); if (!target_host) return -ENOMEM; - target_host->transportt = ib_srp_transport_template; - target_host->max_channel = 0; - target_host->max_id = 1; + target_host->transportt = ib_srp_transport_template; target_host->max_lun = SRP_MAX_LUN; target_host->max_cmd_len = sizeof ((struct srp_cmd *) (void *) 0L)->cdb; target = host_to_target(target_host); - target->io_class = SRP_REV16A_IB_IO_CLASS; - target->scsi_host = target_host; - target->srp_host = host; - target->lkey = host->srp_dev->mr->lkey; - target->rkey = host->srp_dev->mr->rkey; - target->cmd_sg_cnt = cmd_sg_entries; - target->sg_tablesize = indirect_sg_entries ? 
: cmd_sg_entries; - target->allow_ext_sg = allow_ext_sg; + target->io_class = SRP_REV16A_IB_IO_CLASS; + target->scsi_host = target_host; + target->srp_host = host; + + INIT_LIST_HEAD(&target->free_reqs); + INIT_LIST_HEAD(&target->req_queue); + for (i = 0; i < SRP_SQ_SIZE; ++i) { + target->req_ring[i].index = i; + list_add_tail(&target->req_ring[i].list, &target->free_reqs); + } ret = srp_parse_options(buf, target); if (ret) goto err; - if (!host->srp_dev->fmr_pool && !target->allow_ext_sg && - target->cmd_sg_cnt < target->sg_tablesize) { - printk(KERN_WARNING PFX "No FMR pool and no external indirect descriptors, limiting sg_tablesize to cmd_sg_cnt\n"); - target->sg_tablesize = target->cmd_sg_cnt; - } - - target_host->sg_tablesize = target->sg_tablesize; - target->indirect_size = target->sg_tablesize * - sizeof (struct srp_direct_buf); - target->max_iu_len = sizeof (struct srp_cmd) + - sizeof (struct srp_indirect_buf) + - target->cmd_sg_cnt * sizeof (struct srp_direct_buf); - - spin_lock_init(&target->lock); - INIT_LIST_HEAD(&target->free_tx); - INIT_LIST_HEAD(&target->free_reqs); - for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { - struct srp_request *req = &target->req_ring[i]; - - req->fmr_list = kmalloc(target->cmd_sg_cnt * sizeof (void *), - GFP_KERNEL); - req->map_page = kmalloc(SRP_FMR_SIZE * sizeof (void *), - GFP_KERNEL); - req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL); - if (!req->fmr_list || !req->map_page || !req->indirect_desc) - goto err_free_mem; - - dma_addr = ib_dma_map_single(ibdev, req->indirect_desc, - target->indirect_size, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(ibdev, dma_addr)) - goto err_free_mem; - - req->indirect_dma_addr = dma_addr; - req->index = i; - list_add_tail(&req->list, &target->free_reqs); - } - - ib_query_gid(ibdev, host->port, 0, &target->path.sgid); + ib_query_gid(host->srp_dev->dev, host->port, 0, &target->path.sgid); shost_printk(KERN_DEBUG, target->scsi_host, PFX "new target: id_ext %016llx ioc_guid %016llx pkey %04x " @@ -2238,11 +2013,11 @@ static ssize_t srp_create_target(struct device *dev, ret = srp_create_target_ib(target); if (ret) - goto err_free_mem; + goto err; ret = srp_new_cm_id(target); if (ret) - goto err_free_ib; + goto err_free; target->qp_in_error = 0; ret = srp_connect_target(target); @@ -2264,12 +2039,9 @@ err_disconnect: err_cm_id: ib_destroy_cm_id(target->cm_id); -err_free_ib: +err_free: srp_free_target_ib(target); -err_free_mem: - srp_free_req_data(target); - err: scsi_host_put(target_host); @@ -2336,13 +2108,88 @@ free_host: return NULL; } +static void srp_event_handler(struct ib_event_handler *handler, + struct ib_event *event) +{ + struct srp_device *srp_dev = + ib_get_client_data(event->device, &srp_client); + struct srp_host *host, *tmp_host; + struct srp_target_port *target, *tmp_target; + + if (!srp_dev || srp_dev->dev != event->device) + return; + + printk(KERN_WARNING PFX "ASYNC event= %d on device= %s\n", + event->event, srp_dev->dev->name); + + switch (event->event) { + case IB_EVENT_PORT_ERR: + spin_lock(&srp_dev->dev_lock); + list_for_each_entry_safe(host, tmp_host, + &srp_dev->dev_list, list) { + if (event->element.port_num == host->port) { + spin_lock(&host->target_lock); + list_for_each_entry_safe(target, tmp_target, + &host->target_list, list) { + unsigned long flags; + + spin_lock_irqsave(target->scsi_host->host_lock, + flags); + if (!target->qp_in_error && + target->state == SRP_TARGET_LIVE) + srp_qp_err_add_timer(target, srp_dev_loss_tmo); + + 
spin_unlock_irqrestore(target->scsi_host->host_lock, + flags); + } + spin_unlock(&host->target_lock); + } + } + spin_unlock(&srp_dev->dev_lock); + break; + case IB_EVENT_PORT_ACTIVE: + case IB_EVENT_LID_CHANGE: + case IB_EVENT_PKEY_CHANGE: + case IB_EVENT_SM_CHANGE: + spin_lock(&srp_dev->dev_lock); + list_for_each_entry_safe(host, tmp_host, &srp_dev->dev_list, + list) { + if (event->element.port_num == host->port) { + spin_lock(&host->target_lock); + list_for_each_entry_safe(target, tmp_target, + &host->target_list, list) { + unsigned long flags; + + spin_lock_irqsave(target->scsi_host->host_lock, + flags); + if (timer_pending(&target->qp_err_timer) + && !target->qp_in_error) { + shost_printk(KERN_WARNING PFX, + target->scsi_host, + "delete qp_in_err timer\n"); + del_timer(&target->qp_err_timer); + } + spin_unlock_irqrestore(target->scsi_host->host_lock, + flags); + } + spin_unlock(&host->target_lock); + } + } + spin_unlock(&srp_dev->dev_lock); + break; + default: + break; + } + +} + static void srp_add_one(struct ib_device *device) { struct srp_device *srp_dev; struct ib_device_attr *dev_attr; struct ib_fmr_pool_param fmr_param; struct srp_host *host; - int max_pages_per_fmr, fmr_page_shift, s, e, p; + int s, e, p; dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); if (!dev_attr) @@ -2360,15 +2207,15 @@ static void srp_add_one(struct ib_device *device) /* * Use the smallest page size supported by the HCA, down to a - * minimum of 4096 bytes. We're unlikely to build large sglists - * out of smaller entries. + * minimum of 512 bytes (which is the smallest sector that a + * SCSI command will ever carry). */ - fmr_page_shift = max(12, ffs(dev_attr->page_size_cap) - 1); - srp_dev->fmr_page_size = 1 << fmr_page_shift; - srp_dev->fmr_page_mask = ~((u64) srp_dev->fmr_page_size - 1); - srp_dev->fmr_max_size = srp_dev->fmr_page_size * SRP_FMR_SIZE; + srp_dev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1); + srp_dev->fmr_page_size = 1 << srp_dev->fmr_page_shift; + srp_dev->fmr_page_mask = ~((u64) srp_dev->fmr_page_size - 1); INIT_LIST_HEAD(&srp_dev->dev_list); + spin_lock_init(&srp_dev->dev_lock); srp_dev->dev = device; srp_dev->pd = ib_alloc_pd(device); @@ -2382,24 +2229,23 @@ static void srp_add_one(struct ib_device *device) if (IS_ERR(srp_dev->mr)) goto err_pd; - for (max_pages_per_fmr = SRP_FMR_SIZE; - max_pages_per_fmr >= SRP_FMR_MIN_SIZE; - max_pages_per_fmr /= 2, srp_dev->fmr_max_size /= 2) { - memset(&fmr_param, 0, sizeof fmr_param); - fmr_param.pool_size = SRP_FMR_POOL_SIZE; - fmr_param.dirty_watermark = SRP_FMR_DIRTY_SIZE; - fmr_param.cache = 1; - fmr_param.max_pages_per_fmr = max_pages_per_fmr; - fmr_param.page_shift = fmr_page_shift; - fmr_param.access = (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ); - - srp_dev->fmr_pool = ib_create_fmr_pool(srp_dev->pd, &fmr_param); - if (!IS_ERR(srp_dev->fmr_pool)) - break; - } + INIT_IB_EVENT_HANDLER(&srp_dev->event_handler, srp_dev->dev, + srp_event_handler); + if (ib_register_event_handler(&srp_dev->event_handler)) + goto err_pd; + memset(&fmr_param, 0, sizeof fmr_param); + fmr_param.pool_size = SRP_FMR_POOL_SIZE; + fmr_param.dirty_watermark = SRP_FMR_DIRTY_SIZE; + fmr_param.cache = 1; + fmr_param.relaxed = 0; + fmr_param.max_pages_per_fmr = SRP_FMR_SIZE; + fmr_param.page_shift = srp_dev->fmr_page_shift; + fmr_param.access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ); + + srp_dev->fmr_pool = ib_create_fmr_pool(srp_dev->pd, &fmr_param); if (IS_ERR(srp_dev->fmr_pool)) 
srp_dev->fmr_pool = NULL; @@ -2440,6 +2286,9 @@ static void srp_remove_one(struct ib_device *device) srp_dev = ib_get_client_data(device, &srp_client); + ib_unregister_event_handler(&srp_dev->event_handler); + + spin_lock(&srp_dev->dev_lock); list_for_each_entry_safe(host, tmp_host, &srp_dev->dev_list, list) { device_unregister(&host->dev); /* @@ -2454,9 +2303,9 @@ static void srp_remove_one(struct ib_device *device) */ spin_lock(&host->target_lock); list_for_each_entry(target, &host->target_list, list) { - spin_lock_irq(&target->lock); + spin_lock_irq(target->scsi_host->host_lock); target->state = SRP_TARGET_REMOVED; - spin_unlock_irq(&target->lock); + spin_unlock_irq(target->scsi_host->host_lock); } spin_unlock(&host->target_lock); @@ -2465,7 +2314,7 @@ static void srp_remove_one(struct ib_device *device) * started before we marked our target ports as * removed, and any target port removal tasks. */ - flush_workqueue(ib_wq); + flush_scheduled_work(); list_for_each_entry_safe(target, tmp_target, &host->target_list, list) { @@ -2473,13 +2322,14 @@ static void srp_remove_one(struct ib_device *device) scsi_remove_host(target->scsi_host); srp_disconnect_target(target); ib_destroy_cm_id(target->cm_id); + del_timer(&target->qp_err_timer); srp_free_target_ib(target); - srp_free_req_data(target); scsi_host_put(target->scsi_host); } kfree(host); } + spin_unlock(&srp_dev->dev_lock); if (srp_dev->fmr_pool) ib_destroy_fmr_pool(srp_dev->fmr_pool); @@ -2496,27 +2346,9 @@ static int __init srp_init_module(void) { int ret; - BUILD_BUG_ON(FIELD_SIZEOF(struct ib_wc, wr_id) < sizeof(void *)); - - if (srp_sg_tablesize) { - printk(KERN_WARNING PFX "srp_sg_tablesize is deprecated, please use cmd_sg_entries\n"); - if (!cmd_sg_entries) - cmd_sg_entries = srp_sg_tablesize; - } - - if (!cmd_sg_entries) - cmd_sg_entries = SRP_DEF_SG_TABLESIZE; - - if (cmd_sg_entries > 255) { - printk(KERN_WARNING PFX "Clamping cmd_sg_entries to 255\n"); - cmd_sg_entries = 255; - } - - if (!indirect_sg_entries) - indirect_sg_entries = cmd_sg_entries; - else if (indirect_sg_entries < cmd_sg_entries) { - printk(KERN_WARNING PFX "Bumping up indirect_sg_entries to match cmd_sg_entries (%u)\n", cmd_sg_entries); - indirect_sg_entries = cmd_sg_entries; + if (srp_sg_tablesize > 255) { + printk(KERN_WARNING PFX "Clamping srp_sg_tablesize to 255\n"); + srp_sg_tablesize = 255; } ib_srp_transport_template = @@ -2524,6 +2356,11 @@ static int __init srp_init_module(void) if (!ib_srp_transport_template) return -ENOMEM; + srp_template.sg_tablesize = srp_sg_tablesize; + srp_max_iu_len = (sizeof (struct srp_cmd) + + sizeof (struct srp_indirect_buf) + + srp_sg_tablesize * 16); + ret = class_register(&srp_class); if (ret) { printk(KERN_ERR PFX "couldn't register class infiniband_srp\n"); diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 020caf0c3789e..a492f87d5bb83 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -49,6 +49,7 @@ enum { SRP_PATH_REC_TIMEOUT_MS = 1000, SRP_ABORT_TIMEOUT_MS = 5000, + SRP_CONN_ERR_TIMEOUT = 30, SRP_PORT_REDIRECT = 1, SRP_DLID_REDIRECT = 2, @@ -59,25 +60,18 @@ enum { SRP_RQ_SHIFT = 6, SRP_RQ_SIZE = 1 << SRP_RQ_SHIFT, + SRP_SQ_SIZE = SRP_RQ_SIZE - 1, + SRP_CQ_SIZE = SRP_SQ_SIZE + SRP_RQ_SIZE, - SRP_SQ_SIZE = SRP_RQ_SIZE, - SRP_RSP_SQ_SIZE = 1, - SRP_REQ_SQ_SIZE = SRP_SQ_SIZE - SRP_RSP_SQ_SIZE, - SRP_TSK_MGMT_SQ_SIZE = 1, - SRP_CMD_SQ_SIZE = SRP_REQ_SQ_SIZE - SRP_TSK_MGMT_SQ_SIZE, + SRP_TAG_TSK_MGMT = 1 << (SRP_RQ_SHIFT + 1), - 
SRP_TAG_NO_REQ = ~0U, - SRP_TAG_TSK_MGMT = 1U << 31, - - SRP_FMR_SIZE = 512, - SRP_FMR_MIN_SIZE = 128, + SRP_FMR_SIZE = 256, SRP_FMR_POOL_SIZE = 1024, - SRP_FMR_DIRTY_SIZE = SRP_FMR_POOL_SIZE / 4, - - SRP_MAP_ALLOW_FMR = 0, - SRP_MAP_NO_FMR = 1, + SRP_FMR_DIRTY_SIZE = SRP_FMR_POOL_SIZE / 4 }; +#define SRP_OP_RECV (1 << 31) + enum srp_target_state { SRP_TARGET_LIVE, SRP_TARGET_CONNECTING, @@ -85,21 +79,22 @@ enum srp_target_state { SRP_TARGET_REMOVED }; -enum srp_iu_type { - SRP_IU_CMD, - SRP_IU_TSK_MGMT, - SRP_IU_RSP, +enum srp_request_type { + SRP_REQ_NORMAL, + SRP_REQ_TASK_MGMT, }; struct srp_device { struct list_head dev_list; struct ib_device *dev; + spinlock_t dev_lock; struct ib_pd *pd; struct ib_mr *mr; + struct ib_event_handler event_handler; struct ib_fmr_pool *fmr_pool; - u64 fmr_page_mask; + int fmr_page_shift; int fmr_page_size; - int fmr_max_size; + u64 fmr_page_mask; }; struct srp_host { @@ -116,37 +111,15 @@ struct srp_request { struct list_head list; struct scsi_cmnd *scmnd; struct srp_iu *cmd; - struct ib_pool_fmr **fmr_list; - u64 *map_page; - struct srp_direct_buf *indirect_desc; - dma_addr_t indirect_dma_addr; - short nfmr; + struct srp_iu *tsk_mgmt; + struct ib_pool_fmr *fmr; + struct completion done; short index; + u8 cmd_done; + u8 tsk_status; }; struct srp_target_port { - /* These are RW in the hot path, and commonly used together */ - struct list_head free_tx; - struct list_head free_reqs; - spinlock_t lock; - s32 req_lim; - - /* These are read-only in the hot path */ - struct ib_cq *send_cq ____cacheline_aligned_in_smp; - struct ib_cq *recv_cq; - struct ib_qp *qp; - u32 lkey; - u32 rkey; - enum srp_target_state state; - unsigned int max_iu_len; - unsigned int cmd_sg_cnt; - unsigned int indirect_size; - bool allow_ext_sg; - - /* Everything above this point is used in the hot path of - * command processing. Try to keep them packed into cachelines. 
- */ - __be64 id_ext; __be64 ioc_guid; __be64 service_id; @@ -156,7 +129,6 @@ struct srp_target_port { struct Scsi_Host *scsi_host; char target_name[32]; unsigned int scsi_id; - unsigned int sg_tablesize; struct ib_sa_path_rec path; __be16 orig_dgid[8]; @@ -164,47 +136,41 @@ struct srp_target_port { int path_query_id; struct ib_cm_id *cm_id; + struct ib_cq *cq; + struct ib_qp *qp; int max_ti_iu_len; + s32 req_lim; int zero_req_lim; - struct srp_iu *tx_ring[SRP_SQ_SIZE]; + unsigned rx_head; struct srp_iu *rx_ring[SRP_RQ_SIZE]; - struct srp_request req_ring[SRP_CMD_SQ_SIZE]; + + unsigned tx_head; + unsigned tx_tail; + struct srp_iu *tx_ring[SRP_SQ_SIZE + 1]; + + struct list_head free_reqs; + struct list_head req_queue; + struct srp_request req_ring[SRP_SQ_SIZE]; struct work_struct work; + int work_in_progress; struct list_head list; struct completion done; int status; + enum srp_target_state state; int qp_in_error; - - struct completion tsk_mgmt_done; - u8 tsk_mgmt_status; + struct timer_list qp_err_timer; }; struct srp_iu { - struct list_head list; u64 dma; void *buf; size_t size; enum dma_data_direction direction; }; -struct srp_map_state { - struct ib_pool_fmr **next_fmr; - struct srp_direct_buf *desc; - u64 *pages; - dma_addr_t base_dma_addr; - u32 fmr_len; - u32 total_len; - unsigned int npages; - unsigned int nfmr; - unsigned int ndesc; - struct scatterlist *unmapped_sg; - int unmapped_index; - dma_addr_t unmapped_addr; -}; - #endif /* IB_SRP_H */ diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index e536e8b85b30b..874fffa9d558b 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -2874,6 +2874,13 @@ config MLX4_DEBUG debug_level module parameter (which can also be set after the driver is loaded through sysfs). +config MLX4_VNIC + tristate "Mellanox Technologies VNIC support" + depends on PCI && INFINIBAND && INFINIBAND_ADDR_TRANS + select MLX4_CORE + help + Mellanox Technologies VNIC functionality. 
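The ib_srp.h hunk above defines the wr_id convention used on the driver's single completion queue: receives carry SRP_OP_RECV (bit 31) plus an rx ring index, sends carry a bare tx index masked with SRP_SQ_SIZE, and the completion handler dispatches on that bit. A minimal userspace sketch of that encoding follows; the three constants are copied from the header, everything else (types, function names, the main harness) is invented for illustration and is not part of the patch.

/*
 * Illustrative, userspace-only model of the wr_id encoding/decoding
 * used by the receive/send paths and the completion handler above.
 */
#include <stdint.h>
#include <stdio.h>

#define SRP_RQ_SHIFT 6
#define SRP_RQ_SIZE  (1u << SRP_RQ_SHIFT)   /* 64 receive ring entries    */
#define SRP_SQ_SIZE  (SRP_RQ_SIZE - 1)      /* 63, also the tx index mask */
#define SRP_OP_RECV  (1u << 31)             /* high bit marks a recv WR   */

static uint64_t make_recv_wr_id(unsigned rx_head)
{
	return (rx_head & (SRP_RQ_SIZE - 1)) | SRP_OP_RECV;
}

static uint64_t make_send_wr_id(unsigned tx_head)
{
	return tx_head & SRP_SQ_SIZE;
}

/* Mirrors the dispatch in the completion handler: recv vs. send. */
static void handle_completion(uint64_t wr_id, unsigned *tx_tail)
{
	if (wr_id & SRP_OP_RECV)
		printf("recv completion, rx slot %llu\n",
		       (unsigned long long)(wr_id & (SRP_RQ_SIZE - 1)));
	else {
		++*tx_tail;
		printf("send completion, tx slot %llu, tx_tail now %u\n",
		       (unsigned long long)wr_id, *tx_tail);
	}
}

int main(void)
{
	unsigned tx_tail = 0;

	handle_completion(make_recv_wr_id(70), &tx_tail);  /* wraps to slot 6 */
	handle_completion(make_send_wr_id(1), &tx_tail);
	return 0;
}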
+ config TEHUTI tristate "Tehuti Networks 10G Ethernet" depends on PCI diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 9fccee4fded77..cbd8f916b6d33 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -259,6 +259,7 @@ obj-$(CONFIG_DM9000) += dm9000.o obj-$(CONFIG_PASEMI_MAC) += pasemi_mac_driver.o pasemi_mac_driver-objs := pasemi_mac.o pasemi_mac_ethtool.o obj-$(CONFIG_MLX4_CORE) += mlx4/ +obj-$(CONFIG_MLX4_VNIC) += mlx4_vnic/ obj-$(CONFIG_ENC28J60) += enc28j60.o obj-$(CONFIG_ETHOC) += ethoc.o obj-$(CONFIG_GRETH) += greth.o diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile index d1aa45a158541..d003b14a1f66b 100644 --- a/drivers/net/mlx4/Makefile +++ b/drivers/net/mlx4/Makefile @@ -1,9 +1,12 @@ obj-$(CONFIG_MLX4_CORE) += mlx4_core.o mlx4_core-y := alloc.o catas.o cmd.o cq.o eq.o fw.o icm.o intf.o main.o mcg.o \ - mr.o pd.o port.o profile.o qp.o reset.o sense.o srq.o + mr.o pd.o port.o profile.o qp.o reset.o sense.o srq.o xrcd.o resource_tracker.o pkey.o \ + fmr_master.o fmr_slave.o + +mlx4_core-$(CONFIG_MLX4_RTT_TESTS) += rt_torture.o obj-$(CONFIG_MLX4_EN) += mlx4_en.o mlx4_en-y := en_main.o en_tx.o en_rx.o en_ethtool.o en_port.o en_cq.o \ - en_resources.o en_netdev.o en_selftest.o + en_resources.o en_netdev.o en_frag.o en_selftest.o diff --git a/drivers/net/mlx4/alloc.c b/drivers/net/mlx4/alloc.c index 116cae334dadc..400d05fc55bb0 100644 --- a/drivers/net/mlx4/alloc.c +++ b/drivers/net/mlx4/alloc.c @@ -62,9 +62,6 @@ u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap) } else obj = -1; - if (obj != -1) - --bitmap->avail; - spin_unlock(&bitmap->lock); return obj; @@ -75,26 +72,56 @@ void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj) mlx4_bitmap_free_range(bitmap, obj, 1); } +static unsigned long find_aligned_range(unsigned long *bitmap, + u32 start, u32 nbits, + int len, int align) +{ + unsigned long end, i; + +again: + start = ALIGN(start, align); + + while ((start < nbits) && test_bit(start, bitmap)) + start += align; + + if (start >= nbits) + return -1; + + end = start+len; + if (end > nbits) + return -1; + + for (i = start + 1; i < end; i++) { + if (test_bit(i, bitmap)) { + start = i + 1; + goto again; + } + } + + return start; +} + u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align) { - u32 obj; + u32 obj, i; if (likely(cnt == 1 && align == 1)) return mlx4_bitmap_alloc(bitmap); spin_lock(&bitmap->lock); - obj = bitmap_find_next_zero_area(bitmap->table, bitmap->max, - bitmap->last, cnt, align - 1); + obj = find_aligned_range(bitmap->table, bitmap->last, + bitmap->max, cnt, align); if (obj >= bitmap->max) { bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top) & bitmap->mask; - obj = bitmap_find_next_zero_area(bitmap->table, bitmap->max, - 0, cnt, align - 1); + obj = find_aligned_range(bitmap->table, 0, bitmap->max, + cnt, align); } if (obj < bitmap->max) { - bitmap_set(bitmap->table, obj, cnt); + for (i = 0; i < cnt; i++) + set_bit(obj + i, bitmap->table); if (obj == bitmap->last) { bitmap->last = (obj + cnt); if (bitmap->last >= bitmap->max) @@ -104,35 +131,35 @@ u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align) } else obj = -1; - if (obj != -1) - bitmap->avail -= cnt; - spin_unlock(&bitmap->lock); return obj; } -u32 mlx4_bitmap_avail(struct mlx4_bitmap *bitmap) -{ - return bitmap->avail; -} - void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt) { + u32 i; + obj &= bitmap->max + bitmap->reserved_top - 1; spin_lock(&bitmap->lock); - bitmap_clear(bitmap->table, 
obj, cnt); + for (i = 0; i < cnt; i++) + clear_bit(obj + i, bitmap->table); bitmap->last = min(bitmap->last, obj); bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top) & bitmap->mask; - bitmap->avail += cnt; spin_unlock(&bitmap->lock); } int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask, u32 reserved_bot, u32 reserved_top) { + int i; + + /* sanity check */ + if (num <= (u64)reserved_top + reserved_bot) + return -EINVAL; + /* num must be a power of 2 */ if (num != roundup_pow_of_two(num)) return -EINVAL; @@ -142,18 +169,28 @@ int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask, bitmap->max = num - reserved_top; bitmap->mask = mask; bitmap->reserved_top = reserved_top; - bitmap->avail = num - reserved_top - reserved_bot; spin_lock_init(&bitmap->lock); bitmap->table = kzalloc(BITS_TO_LONGS(bitmap->max) * sizeof (long), GFP_KERNEL); if (!bitmap->table) return -ENOMEM; - bitmap_set(bitmap->table, 0, reserved_bot); + for (i = 0; i < reserved_bot; ++i) + set_bit(i, bitmap->table); return 0; } +/* Like bitmap_init, but doesn't require 'num' to be a power of 2 or + * a non-trivial mask */ +int mlx4_bitmap_init_no_mask(struct mlx4_bitmap *bitmap, u32 num, + u32 reserved_bot, u32 reserved_top) +{ + u32 num_rounded = roundup_pow_of_two(num); + return mlx4_bitmap_init(bitmap, num_rounded, num_rounded - 1, + reserved_bot, num_rounded - num + reserved_top); +} + void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap) { kfree(bitmap->table); @@ -191,11 +228,10 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, } else { int i; - buf->direct.buf = NULL; buf->nbufs = (size + PAGE_SIZE - 1) / PAGE_SIZE; buf->npages = buf->nbufs; buf->page_shift = PAGE_SHIFT; - buf->page_list = kcalloc(buf->nbufs, sizeof(*buf->page_list), + buf->page_list = kzalloc(buf->nbufs * sizeof *buf->page_list, GFP_KERNEL); if (!buf->page_list) return -ENOMEM; @@ -243,7 +279,7 @@ void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf) dma_free_coherent(&dev->pdev->dev, size, buf->direct.buf, buf->direct.map); else { - if (BITS_PER_LONG == 64 && buf->direct.buf) + if (BITS_PER_LONG == 64) vunmap(buf->direct.buf); for (i = 0; i < buf->nbufs; ++i) @@ -383,7 +419,7 @@ int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres, goto err_db; err = mlx4_mtt_init(dev, wqres->buf.npages, wqres->buf.page_shift, - &wqres->mtt); + &wqres->mtt, MLX4_MR_FLAG_NONE); if (err) goto err_buf; @@ -394,7 +430,7 @@ int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres, return 0; err_mtt: - mlx4_mtt_cleanup(dev, &wqres->mtt); + mlx4_mtt_cleanup(dev, &wqres->mtt, MLX4_MR_FLAG_NONE); err_buf: mlx4_buf_free(dev, size, &wqres->buf); err_db: @@ -407,7 +443,7 @@ EXPORT_SYMBOL_GPL(mlx4_alloc_hwq_res); void mlx4_free_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres, int size) { - mlx4_mtt_cleanup(dev, &wqres->mtt); + mlx4_mtt_cleanup(dev, &wqres->mtt, MLX4_MR_FLAG_NONE); mlx4_buf_free(dev, size, &wqres->buf); mlx4_db_free(dev, &wqres->db); } diff --git a/drivers/net/mlx4/catas.c b/drivers/net/mlx4/catas.c index 32f947154c33c..f8356ba06eff5 100644 --- a/drivers/net/mlx4/catas.c +++ b/drivers/net/mlx4/catas.c @@ -47,7 +47,7 @@ static struct work_struct catas_work; static int internal_err_reset = 1; module_param(internal_err_reset, int, 0644); MODULE_PARM_DESC(internal_err_reset, - "Reset device on internal errors if non-zero (default 1)"); + "Reset device on internal errors if non-zero (default 1, in SRIOV driver default is 0)"); static 
void dump_err_buf(struct mlx4_dev *dev) { @@ -91,6 +91,9 @@ static void catas_reset(struct work_struct *work) LIST_HEAD(tlist); int ret; + if (!mutex_trylock(&drv_mutex)) + return; + spin_lock_irq(&catas_lock); list_splice_init(&catas_list, &tlist); spin_unlock_irq(&catas_lock); @@ -101,19 +104,24 @@ static void catas_reset(struct work_struct *work) ret = mlx4_restart_one(priv->dev.pdev); /* 'priv' now is not valid */ if (ret) - pr_err("mlx4 %s: Reset failed (%d)\n", - pci_name(pdev), ret); + printk(KERN_ERR "mlx4 %s: Reset failed (%d)\n", + pci_name(pdev), ret); else { dev = pci_get_drvdata(pdev); mlx4_dbg(dev, "Reset succeeded\n"); } } + mutex_unlock(&drv_mutex); } void mlx4_start_catas_poll(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); - phys_addr_t addr; + unsigned long addr; + + /*If we are in SRIOV the default of the is 0*/ + if (mlx4_is_mfunc(dev)) + internal_err_reset = 0; INIT_LIST_HEAD(&priv->catas_err.list); init_timer(&priv->catas_err.timer); @@ -124,8 +132,8 @@ void mlx4_start_catas_poll(struct mlx4_dev *dev) priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4); if (!priv->catas_err.map) { - mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n", - (unsigned long long) addr); + mlx4_warn(dev, "Failed to map internal error buffer at 0x%lx\n", + addr); return; } diff --git a/drivers/net/mlx4/cmd.c b/drivers/net/mlx4/cmd.c index 23cee7b6af918..98dabe257dbfe 100644 --- a/drivers/net/mlx4/cmd.c +++ b/drivers/net/mlx4/cmd.c @@ -33,17 +33,23 @@ */ #include -#include #include #include - -#include +#include +#include #include #include "mlx4.h" +#include "fw.h" +#include "icm.h" +#include "fmr_master.h" #define CMD_POLL_TOKEN 0xffff +#define INBOX_MASK 0xffffffffffffff00ULL + +#define CMD_CHAN_VER 1 +#define CMD_CHAN_IF_REV 1 enum { /* command completed successfully: */ @@ -109,8 +115,11 @@ struct mlx4_cmd_context { int next; u64 out_param; u16 token; + u8 fw_status; }; +static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *in_vhcr); + static int mlx4_status_to_errno(u8 status) { static const int trans_table[] = { @@ -141,6 +150,112 @@ static int mlx4_status_to_errno(u8 status) return trans_table[status]; } +static int comm_pending(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u32 status = readl(&priv->mfunc.comm->slave_read); + + return (swab32(status) >> 31) != priv->cmd.comm_toggle; +} + +static void mlx4_comm_cmd_post(struct mlx4_dev *dev, u8 cmd, u16 param) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u32 val; + + priv->cmd.comm_toggle ^= 1; + val = param | (cmd << 16) | (priv->cmd.comm_toggle << 31); + __raw_writel((__force u32) cpu_to_be32(val), &priv->mfunc.comm->slave_write); + mmiowb(); +} + +int mlx4_comm_cmd_poll(struct mlx4_dev *dev, u8 cmd, u16 param, unsigned long timeout) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + unsigned long end; + int err = 0; + int ret_from_pending = 0; + + /* First, verify that the master reports correct status */ + if (comm_pending(dev)) { + mlx4_warn(dev, "Communication channel is not idle. my toggle is %d (cmd:0x%x)\n", + priv->cmd.comm_toggle, cmd); + return -EAGAIN; + } + + /* Write command */ + down(&priv->cmd.poll_sem); + mlx4_comm_cmd_post(dev, cmd, param); + + end = msecs_to_jiffies(timeout) + jiffies; + while (comm_pending(dev) && time_before(jiffies, end)) + cond_resched(); + ret_from_pending = comm_pending(dev); + if (ret_from_pending) { + /*check if the slave is trying to boot in the middle of FLR process. 
+ The only result in the RESET command that is not 0 is the MLX4_DELAY_RESET_SLAVE*/ + if ((MLX4_COMM_CMD_RESET == cmd)) { + mlx4_warn(dev, "Got slave FLRed from Communication channel (ret:0x%x)\n", ret_from_pending); + err = MLX4_DELAY_RESET_SLAVE; + } else { + mlx4_warn(dev, "Communication channel timed out\n"); + err = -ETIMEDOUT; + } + } + + up(&priv->cmd.poll_sem); + return err; +} + +static int mlx4_comm_cmd_wait(struct mlx4_dev *dev, u8 op, + u16 param, unsigned long timeout) +{ + struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd; + struct mlx4_cmd_context *context; + int err = 0; + + down(&cmd->event_sem); + + spin_lock(&cmd->context_lock); + BUG_ON(cmd->free_head < 0); + context = &cmd->context[cmd->free_head]; + context->token += cmd->token_mask + 1; + cmd->free_head = context->next; + spin_unlock(&cmd->context_lock); + + init_completion(&context->done); + + mlx4_comm_cmd_post(dev, op, param); + + if (!wait_for_completion_timeout(&context->done, msecs_to_jiffies(timeout))) { + err = -EBUSY; + goto out; + } + + err = context->result; + if (err && context->fw_status != CMD_STAT_MULTI_FUNC_REQ) { + mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n", + op, context->fw_status); + goto out; + } + +out: + spin_lock(&cmd->context_lock); + context->next = cmd->free_head; + cmd->free_head = context - cmd->context; + spin_unlock(&cmd->context_lock); + + up(&cmd->event_sem); + return err; +} + +int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param, unsigned long timeout) +{ + if (mlx4_priv(dev)->cmd.use_events) + return mlx4_comm_cmd_wait(dev, cmd, param, timeout); + return mlx4_comm_cmd_poll(dev, cmd, param, timeout); +} + static int cmd_pending(struct mlx4_dev *dev) { u32 status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET); @@ -166,8 +281,10 @@ static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param, end += msecs_to_jiffies(GO_BIT_TIMEOUT_MSECS); while (cmd_pending(dev)) { - if (time_after_eq(jiffies, end)) + if (time_after_eq(jiffies, end)) { + mlx4_err(dev, "%s:cmd_pending failed\n", __func__); goto out; + } cond_resched(); } @@ -208,6 +325,59 @@ out: return ret; } +static int mlx4_slave_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, + int out_is_imm, u32 in_modifier, u8 op_modifier, + u16 op, unsigned long timeout) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_vhcr *vhcr = priv->mfunc.vhcr; + int ret; + + down(&priv->cmd.slave_sem); + vhcr->in_param = in_param; + vhcr->out_param = out_param ? 
*out_param : 0; + vhcr->in_modifier = in_modifier; + vhcr->timeout = timeout; + vhcr->op = op; + vhcr->token = CMD_POLL_TOKEN; + vhcr->op_modifier = op_modifier; + vhcr->errno = 0; + vhcr->cookie++; + if (mlx4_is_master(dev)) { + ret = mlx4_master_process_vhcr(dev, dev->caps.function, vhcr); + if (!ret) { + if (out_is_imm) { + if (out_param) + *out_param = vhcr->out_param; + else { + mlx4_err(dev, "response expected while output mailbox is " + "NULL for command 0x%x\n", op); + vhcr->errno = -EINVAL; + } + } + ret = vhcr->errno; + } + } else { + ret = mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_POST, 0, + MLX4_COMM_TIME + timeout); + if (!ret) { + if (out_is_imm) { + if (out_param) + *out_param = vhcr->out_param; + else { + mlx4_err(dev, "response expected while output mailbox is " + "NULL for command 0x%x\n", op); + vhcr->errno = -EINVAL; + } + } + ret = vhcr->errno; + } else + mlx4_err(dev, "failed execution of VHCR_POST command opcode 0x%x, cookie %d\n", op, vhcr->cookie); + } + up(&priv->cmd.slave_sem); + return ret; +} + static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, int out_is_imm, u32 in_modifier, u8 op_modifier, u16 op, unsigned long timeout) @@ -216,19 +386,23 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, void __iomem *hcr = priv->cmd.hcr; int err = 0; unsigned long end; + u32 stat; down(&priv->cmd.poll_sem); err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0, in_modifier, op_modifier, op, CMD_POLL_TOKEN, 0); - if (err) + if (err) { + mlx4_err(dev, "%s:command 0x%x, mlx4_cmd_post failed\n", __func__, op); goto out; + } end = msecs_to_jiffies(timeout) + jiffies; while (cmd_pending(dev) && time_before(jiffies, end)) cond_resched(); if (cmd_pending(dev)) { + mlx4_err(dev, "%s:command 0x%x, cmd_pending failed\n", __func__, op); err = -ETIMEDOUT; goto out; } @@ -239,9 +413,10 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, __raw_readl(hcr + HCR_OUT_PARAM_OFFSET)) << 32 | (u64) be32_to_cpu((__force __be32) __raw_readl(hcr + HCR_OUT_PARAM_OFFSET + 4)); - - err = mlx4_status_to_errno(be32_to_cpu((__force __be32) - __raw_readl(hcr + HCR_STATUS_OFFSET)) >> 24); + stat = be32_to_cpu((__force __be32) __raw_readl(hcr + HCR_STATUS_OFFSET)) >> 24; + err = mlx4_status_to_errno(stat); + if (err && stat != CMD_STAT_MULTI_FUNC_REQ) + mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n", op, stat); out: up(&priv->cmd.poll_sem); @@ -258,6 +433,7 @@ void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param) if (token != context->token) return; + context->fw_status = status; context->result = mlx4_status_to_errno(status); context->out_param = out_param; @@ -288,15 +464,25 @@ static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 in_param, u64 *out_param, if (!wait_for_completion_timeout(&context->done, msecs_to_jiffies(timeout))) { err = -EBUSY; + mlx4_err(dev, "%s:command 0x%x, wait_for_completion_timeout failed\n", __func__, op); goto out; } err = context->result; - if (err) + if (err && context->fw_status != CMD_STAT_MULTI_FUNC_REQ) { + mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n", + op, context->fw_status); goto out; + } - if (out_is_imm) - *out_param = context->out_param; + if (out_is_imm) { + if (out_param) + *out_param = context->out_param; + else { + mlx4_err(dev, "response expected while output mailbox is NULL for command 0x%x\n", op); + err = -EINVAL; + } + } out: spin_lock(&cmd->context_lock); @@ -310,17 +496,1312 @@ out: int __mlx4_cmd(struct mlx4_dev *dev, u64 
in_param, u64 *out_param, int out_is_imm, u32 in_modifier, u8 op_modifier, - u16 op, unsigned long timeout) + u16 op, unsigned long timeout, int native) { - if (mlx4_priv(dev)->cmd.use_events) - return mlx4_cmd_wait(dev, in_param, out_param, out_is_imm, - in_modifier, op_modifier, op, timeout); - else - return mlx4_cmd_poll(dev, in_param, out_param, out_is_imm, - in_modifier, op_modifier, op, timeout); + if (!mlx4_is_mfunc(dev) || (native && mlx4_is_master(dev))) { + if (mlx4_priv(dev)->cmd.use_events) + return mlx4_cmd_wait(dev, in_param, out_param, out_is_imm, + in_modifier, op_modifier, op, timeout); + else + return mlx4_cmd_poll(dev, in_param, out_param, out_is_imm, + in_modifier, op_modifier, op, timeout); + } + return mlx4_slave_cmd(dev, in_param, out_param, out_is_imm, + in_modifier, op_modifier, op, timeout); } EXPORT_SYMBOL_GPL(__mlx4_cmd); + +static int mlx4_ARM_COMM_CHANNEL(struct mlx4_dev *dev) +{ + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_ARM_COMM_CHANNEL, MLX4_CMD_TIME_CLASS_B, 1); +} + +static int mlx4_ACCESS_MEM(struct mlx4_dev *dev, u64 master_addr, + int slave, u64 slave_addr, + int size, int is_read) +{ + u64 in_param; + u64 out_param; + + if ((slave_addr & 0xfff) | (master_addr & 0xfff) | + (slave & ~0x7f) | (size & 0xff)) { + mlx4_err(dev, "Bad access mem params - slave_addr:0x%llx " + "master_addr:0x%llx slave_id:%d size:%d\n", + slave_addr, master_addr, slave, size); + return -EINVAL; + } + + if (is_read) { + in_param = (u64) slave | slave_addr; + out_param = (u64) dev->caps.function | master_addr; + } else { + in_param = (u64) dev->caps.function | master_addr; + out_param = (u64) slave | slave_addr; + } + + return mlx4_cmd_imm(dev, in_param, &out_param, size, 0, + MLX4_CMD_ACCESS_MEM, + MLX4_CMD_TIME_CLASS_A, 1); +} + +static int MAD_IFC(struct mlx4_dev *dev, int ignore_mkey, int ignore_bkey, + int port, struct ib_wc *in_wc, struct ib_grh *in_grh, + void *in_mad, void *response_mad) +{ + struct mlx4_cmd_mailbox *inmailbox, *outmailbox; + void *inbox; + int err; + u32 in_modifier = port; + u8 op_modifier = 0; + + inmailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(inmailbox)) + return PTR_ERR(inmailbox); + inbox = inmailbox->buf; + + outmailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(outmailbox)) { + mlx4_free_cmd_mailbox(dev, inmailbox); + return PTR_ERR(outmailbox); + } + + memcpy(inbox, in_mad, 256); + + /* + * Key check traps can't be generated unless we have in_wc to + * tell us where to send the trap. + */ + if (ignore_mkey || !in_wc) + op_modifier |= 0x1; + if (ignore_bkey || !in_wc) + op_modifier |= 0x2; + + if (in_wc) { + struct { + __be32 my_qpn; + u32 reserved1; + __be32 rqpn; + u8 sl; + u8 g_path; + u16 reserved2[2]; + __be16 pkey; + u32 reserved3[11]; + u8 grh[40]; + } *ext_info; + + memset(inbox + 256, 0, 256); + ext_info = inbox + 256; + + ext_info->my_qpn = cpu_to_be32(in_wc->qp->qp_num); + ext_info->rqpn = cpu_to_be32(in_wc->src_qp); + ext_info->sl = in_wc->sl << 4; + ext_info->g_path = in_wc->dlid_path_bits | + (in_wc->wc_flags & IB_WC_GRH ? 
0x80 : 0); + ext_info->pkey = cpu_to_be16(in_wc->pkey_index); + + if (in_grh) + memcpy(ext_info->grh, in_grh, 40); + + op_modifier |= 0x4; + + in_modifier |= in_wc->slid << 16; + } + + err = mlx4_cmd_box(dev, inmailbox->dma, outmailbox->dma, + in_modifier, op_modifier, + MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C, 1); + + if (!err) + memcpy(response_mad, outmailbox->buf, 256); + + mlx4_free_cmd_mailbox(dev, inmailbox); + mlx4_free_cmd_mailbox(dev, outmailbox); + + return err; +} + +static int query_pkey_block(struct mlx4_dev *dev, u8 port, u16 index, u16 *pkey, struct ib_smp *in_mad, struct ib_smp *out_mad) +{ + int err = -ENOMEM; + int i; + + if (index & 0x1f) + return -EINVAL; + + in_mad->attr_mod = cpu_to_be32(index / 32); + + err = MAD_IFC(dev, 1, 1, port, NULL, NULL, in_mad, out_mad); + if (err) + return err; + + for (i = 0; i < 32; ++i) + pkey[i] = be16_to_cpu(((__be16 *) out_mad->data)[i]); + + return err; +} + +static int get_full_pkey_table(struct mlx4_dev *dev, u8 port, u16 *table, struct ib_smp * in_mad, struct ib_smp *outsmp) +{ + int i; + int err; + + for (i = 0; i < dev->caps.pkey_table_len[port]; i += 32) { + err = query_pkey_block(dev, port, i, table + i, in_mad, outsmp); + if (err) + return err; + } + + return 0; +} +#define PORT_CAPABILITY_LOCATION_IN_SMP 20 +#define PORT_STATE_OFFSET 32 + +static enum ib_port_state vf_port_state(struct mlx4_dev *dev, int port, int vf) +{ + if (mlx4_get_slave_port_state(dev, vf, port) == SLAVE_PORT_UP) + return IB_PORT_ACTIVE; + else + return IB_PORT_DOWN; +} + +static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct ib_smp *smp = inbox->buf; + u32 index; + u8 port; + u16 *table; + int err; + int vidx, pidx; + struct mlx4_priv *priv = mlx4_priv(dev); + struct ib_smp *outsmp = outbox->buf; + __be16 *outtab = (__be16 *)(outsmp->data); + __be32 slave_cap_mask; + port = vhcr->in_modifier; + + /*mlx4_dbg(dev, "%s, slave %d, bv %d, mc %d, cv %d, mtd %d atrr %d\n", + __func__, slave, smp->base_version, smp->mgmt_class, smp->class_version, smp->method, be16_to_cpu(smp->attr_id)); + */ + if (smp->base_version == 1 && + smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED && + smp->class_version == 1) { + if (smp->method == IB_MGMT_METHOD_GET) { + if (smp->attr_id == IB_SMP_ATTR_PKEY_TABLE) { + index = be32_to_cpu(smp->attr_mod); + /*mlx4_dbg(dev, "query pkey for slave %d, port %d, index %d\n", slave, port, index);*/ + if (port < 1 || port > dev->caps.num_ports) + return -EINVAL; + table = kzalloc(dev->caps.pkey_table_len[port] * sizeof *table, GFP_KERNEL); + if (!table) + return -ENOMEM; + err = get_full_pkey_table(dev, port, table, smp, outsmp); + if (!err) { + for (vidx = index * 32; vidx < (index + 1) * 32; ++vidx) { + pidx = priv->virt2phys_pkey[slave][port - 1][vidx]; + outtab[vidx % 32] = cpu_to_be16(table[pidx]); + /*mlx4_dbg(dev, "vidx = %d, pidx = %d, pkey = 0x%04x\n", vidx, pidx, table[pidx]);*/ + } + } + kfree(table); + return err; + } + if (smp->attr_id == IB_SMP_ATTR_PORT_INFO) { + /*get the slave specific caps:*/ + /*do the command */ + err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, + vhcr->in_modifier, vhcr->op_modifier, + vhcr->op, vhcr->timeout, 1); + /*mlx4_dbg(dev, "query port_info for slave %d, port %d, slave_cap: 0x%x\n", slave, port, slave_cap_mask);*/ + /* modify the response for slaves */ + if (!err) { + u8 *state = outsmp->data + PORT_STATE_OFFSET; + + *state = (*state & 0xf0) | 
vf_port_state(dev, port, slave); + if (slave != dev->caps.function) { + slave_cap_mask = priv->mfunc.master.slave_state[slave].ib_cap_mask[port]; + memcpy(outsmp->data + PORT_CAPABILITY_LOCATION_IN_SMP, &slave_cap_mask, 4); + } + } + return err; + } else if (smp->attr_id == IB_SMP_ATTR_GUID_INFO && mlx4_is_mfunc(dev)) { + __be64 *gids_block = (__be64 *)outsmp->data; + u8 block_idx; + int i; + + /* for every record, we need to go index by index and see if it's valid for this slave */ + err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, + vhcr->in_modifier, vhcr->op_modifier, + vhcr->op, vhcr->timeout, 1); + if (!err) { + /* Verify that the slave only access GIDs that are + mapped to it */ + block_idx = be32_to_cpu(smp->attr_mod) * 8; + for (i = 0; i < 8; i++, block_idx++) { + if (slave != mlx4_gid_idx_to_slave(dev, block_idx)) + gids_block[i] = 0x0; + } + } + return err; + } + } + if (smp->method == IB_MGMT_METHOD_SET && slave != dev->caps.function) { + mlx4_err(dev, "slave %d is trying to execute a IB_MGMT_METHOD_SET operation (%d). aborting\n", slave,smp->attr_id); + return -EPERM; + } + } + /*default:*/ + return mlx4_cmd_box(dev, inbox->dma, outbox->dma, + vhcr->in_modifier, vhcr->op_modifier, + vhcr->op, vhcr->timeout, 1); +} + +int mlx4_DMA_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + u64 in_param; + u64 out_param; + int err; + + in_param = cmd->has_inbox ? (u64) inbox->dma : vhcr->in_param; + out_param = cmd->has_outbox ? (u64) outbox->dma : vhcr->out_param; + if (cmd->encode_slave_id) { + in_param &= 0xffffffffffffff00ll; + in_param |= slave; + } + + err = __mlx4_cmd(dev, in_param, &out_param, cmd->out_is_imm, + vhcr->in_modifier, vhcr->op_modifier, vhcr->op, + vhcr->timeout, 1); + + if (cmd->out_is_imm) + vhcr->out_param = out_param; + + return err; +} + +static struct mlx4_cmd_info cmd_info[] = { + { + .opcode = MLX4_CMD_QUERY_FW, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_QUERY_SLAVE_CAP, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_SLAVE_CAP_wrapper + }, + { + .opcode = MLX4_CMD_ENABLE_FMR, + .has_outbox = true, + .wrapper = mlx4_ENABLE_FMR_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_ADAPTER, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_COMM_INT, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_COMM_INT_wrapper + }, + { + .opcode = MLX4_CMD_INIT_PORT, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_INIT_PORT_wrapper + }, + { + .opcode = MLX4_CMD_CLOSE_PORT, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CLOSE_PORT_wrapper + }, + { + .opcode = MLX4_CMD_SENSE_PORT, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = true, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_QUERY_PORT, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_PORT_wrapper + }, + { + 
.opcode = MLX4_CMD_SET_PORT, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SET_PORT_wrapper + }, + { + .opcode = MLX4_CMD_SET_NODE, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_MAP_EQ, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_MAP_EQ_wrapper + }, + { + .opcode = MLX4_CMD_SW2HW_EQ, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_SW2HW_EQ_wrapper + }, + { + .opcode = MLX4_CMD_HW_HEALTH_CHECK, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_NOP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_ALLOC_RES, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = true, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_ALLOC_RES_wrapper + }, + { + .opcode = MLX4_CMD_FREE_RES, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_FREE_RES_wrapper + }, + { + .opcode = MLX4_CMD_GET_EVENT, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = true, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_GET_EVENT_wrapper + }, + + { + .opcode = MLX4_CMD_REPLACE_RES, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = true, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_SW2HW_MPT, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_SW2HW_MPT_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_MPT, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_MPT_wrapper + }, + { + .opcode = MLX4_CMD_HW2SW_MPT, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_HW2SW_MPT_wrapper + }, + { + .opcode = MLX4_CMD_READ_MTT, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_WRITE_MTT, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_WRITE_MTT_wrapper + }, + { + .opcode = MLX4_CMD_SYNC_TPT, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, + + { + .opcode = MLX4_CMD_HW2SW_EQ, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_HW2SW_EQ_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_EQ, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_QUERY_EQ_wrapper + }, + { + .opcode = MLX4_CMD_SW2HW_CQ, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_SW2HW_CQ_wrapper + }, + { + .opcode = MLX4_CMD_HW2SW_CQ, + .has_inbox = false, + .has_outbox = 
false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_HW2SW_CQ_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_CQ, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_CQ_wrapper + }, + { + .opcode = MLX4_CMD_MODIFY_CQ, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = true, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_MODIFY_CQ_wrapper + }, + { + .opcode = MLX4_CMD_SW2HW_SRQ, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_SW2HW_SRQ_wrapper + }, + { + .opcode = MLX4_CMD_HW2SW_SRQ, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_HW2SW_SRQ_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_SRQ, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_SRQ_wrapper + }, + { + .opcode = MLX4_CMD_ARM_SRQ, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_ARM_SRQ_wrapper + }, + { + .opcode = MLX4_CMD_RST2INIT_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = true, + .verify = NULL, + .wrapper = mlx4_RST2INIT_QP_wrapper + }, + { + .opcode = MLX4_CMD_INIT2INIT_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_INIT2INIT_QP_wrapper + }, + { + .opcode = MLX4_CMD_INIT2RTR_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_INIT2RTR_QP_wrapper + }, + { + .opcode = MLX4_CMD_RTR2RTS_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_RTR2RTS_QP_wrapper + }, + { + .opcode = MLX4_CMD_RTS2RTS_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_RTS2RTS_QP_wrapper + }, + { + .opcode = MLX4_CMD_SQERR2RTS_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SQERR2RTS_QP_wrapper + }, + { + .opcode = MLX4_CMD_2ERR_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_2ERR_QP_wrapper + }, + { + .opcode = MLX4_CMD_RTS2SQD_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_RTS2SQD_QP_wrapper + }, + { + .opcode = MLX4_CMD_SQD2SQD_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SQD2SQD_QP_wrapper + }, + { + .opcode = MLX4_CMD_SQD2RTS_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SQD2RTS_QP_wrapper + }, + { + .opcode = MLX4_CMD_2RST_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_2RST_QP_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_QP, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_QP_wrapper + }, + 
{ + .opcode = MLX4_CMD_INIT2INIT_QP, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_INIT2INIT_QP_wrapper + }, + { + .opcode = MLX4_CMD_SUSPEND_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SUSPEND_QP_wrapper + }, + { + .opcode = MLX4_CMD_UNSUSPEND_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_UNSUSPEND_QP_wrapper + }, + { + .opcode = MLX4_CMD_CONF_SPECIAL_QP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, /* XXX verify: only demux can do this */ + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_MAD_IFC, + .has_inbox = true, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_MAD_IFC_wrapper + }, + { + .opcode = MLX4_CMD_QUERY_IF_STAT, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_QUERY_IF_STAT_wrapper + }, + { + .opcode = MLX4_CMD_MAP_ICM, + .has_inbox = true, + .wrapper = mlx4_MAP_ICM_wrapper + }, + { + .opcode = MLX4_CMD_UNMAP_ICM, + .wrapper = mlx4_UNMAP_ICM_wrapper + }, + { + .opcode = MLX4_CMD_GET_GID_MAP, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_GET_GID_MAP_wrapper + }, + /* Native multicast commands are not available for guests */ + { + .opcode = MLX4_CMD_MCAST_ATTACH, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_MCAST_wrapper + }, + { + .opcode = MLX4_CMD_PROMISC, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_PROMISC_wrapper + }, + { + .opcode = MLX4_CMD_DIAG_RPRT, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, /* need verifier */ + .wrapper = NULL + }, + { + .opcode = MLX4_CMD_GET_PKEY_TABLE, + .has_outbox = true, + .wrapper = mlx4_PKEY_TABLE_wrapper, + }, + + /* Ethernet specific commands */ + { + .opcode = MLX4_CMD_SET_VLAN_FLTR, + .has_inbox = true, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SET_VLAN_FLTR_wrapper + }, + { + .opcode = MLX4_CMD_SET_MCAST_FLTR, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_SET_MCAST_FLTR_wrapper + }, + { + .opcode = MLX4_CMD_DUMP_ETH_STATS, + .has_inbox = false, + .has_outbox = true, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_DUMP_ETH_STATS_wrapper + }, + { + .opcode = MLX4_CMD_INFORM_FLR_DONE, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = NULL + }, +}; + +static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *in_vhcr) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cmd_info *cmd = NULL; + struct mlx4_vhcr *vhcr = in_vhcr ? 
in_vhcr : priv->mfunc.vhcr; + struct mlx4_cmd_mailbox *inbox = NULL; + struct mlx4_cmd_mailbox *outbox = NULL; + u64 in_param; + u64 out_param; + int ret = 0; + int i; + + /* DMA in the vHCR */ + if (!in_vhcr) { + ret = mlx4_ACCESS_MEM(dev, priv->mfunc.vhcr_dma, slave, + priv->mfunc.master.slave_state[slave].vhcr_dma, + ALIGN(sizeof(struct mlx4_vhcr), + MLX4_ACCESS_MEM_ALIGN), 1); + if (ret) { + mlx4_err(dev, "%s:Failed reading vhcr ret: 0x%x\n", __func__, ret); + return ret; + } + if (vhcr->cookie != ++priv->mfunc.master.slave_state[slave].cookie) + mlx4_err(dev, "**** VHCR INCONSISTENCY vhcr cookie %d, expected state cookie %d\n", + vhcr->cookie, priv->mfunc.master.slave_state[slave].cookie); + } + + /* Lookup command */ + for (i = 0; i < ARRAY_SIZE(cmd_info); ++i) { + if (vhcr->op == cmd_info[i].opcode) { + cmd = &cmd_info[i]; + break; + } + } + if (!cmd) { + mlx4_err(dev, "Unknown command:0x%x accepted from slave:%d\n", + vhcr->op, slave); + vhcr->errno = -EINVAL; + goto out_status; + } + + /* Read inbox */ + if (cmd->has_inbox) { + vhcr->in_param &= INBOX_MASK; + inbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(inbox)) { + ret = PTR_ERR(inbox); + inbox = NULL; + goto out; + } + + /* FIXME: add mailbox size per-command */ + ret = mlx4_ACCESS_MEM(dev, inbox->dma, slave, + vhcr->in_param, + MLX4_MAILBOX_SIZE, 1); + if (ret) { + mlx4_err(dev, "%s: Failed reading inbox (cmd:0x%x)\n", __func__, cmd->opcode); + goto out; + } + } + + /* Apply permission and bound checks if applicable */ + if (cmd->verify && cmd->verify(dev, slave, vhcr, inbox)) { + mlx4_warn(dev, "Command:0x%x from slave: %d failed protection checks for resource_id:%d\n", vhcr->op, slave, vhcr->in_modifier); + vhcr->errno = -EPERM; + goto out_status; + } + + /* Allocate outbox */ + if (cmd->has_outbox) { + outbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(outbox)) { + ret = PTR_ERR(outbox); + outbox = NULL; + goto out; + } + } + + /* Execute the command! */ + if (cmd->wrapper) + vhcr->errno = cmd->wrapper(dev, slave, vhcr, inbox, outbox, cmd); + else { + in_param = cmd->has_inbox ? (u64) inbox->dma : vhcr->in_param; + out_param = cmd->has_outbox ? (u64) outbox->dma : vhcr->out_param; + vhcr->errno = __mlx4_cmd(dev, in_param, &out_param, + cmd->out_is_imm, + vhcr->in_modifier, + vhcr->op_modifier, + vhcr->op, + vhcr->timeout, 1); + if (cmd->out_is_imm) + vhcr->out_param = out_param; + } + + /* Write outbox if command completed successfully */ + if (cmd->has_outbox && !vhcr->errno) { + ret = mlx4_ACCESS_MEM(dev, outbox->dma, slave, + vhcr->out_param, + MLX4_MAILBOX_SIZE, 0); + if (ret) { + mlx4_err(dev, "%s:Failed writing outbox\n", __func__); + goto out; + } + } + +out_status: + /* DMA back vhcr result */ + if (!in_vhcr) { + ret = mlx4_ACCESS_MEM(dev, priv->mfunc.vhcr_dma, slave, + priv->mfunc.master.slave_state[slave].vhcr_dma, + ALIGN(sizeof(struct mlx4_vhcr), + MLX4_ACCESS_MEM_ALIGN), 0); + if (ret) + mlx4_err(dev, "%s:Failed writing vhcr result\n", __func__); + } + + if (vhcr->errno) + mlx4_swarn("vhcr command:0x%x slave:%d failed with error:%d\n", + vhcr->op, slave, vhcr->errno); + /* Fall through... 
*/ + +out: + mlx4_free_cmd_mailbox(dev, inbox); + mlx4_free_cmd_mailbox(dev, outbox); + return ret; +} + +static void mlx4_master_do_cmd(struct mlx4_dev *dev, int slave, u8 cmd, u16 param, u8 toggle) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state; + u32 reply; + u32 slave_status = 0; + u8 is_going_down = 0; + + slave_state[slave].comm_toggle ^= 1; + reply = (u32) slave_state[slave].comm_toggle << 31; + if (toggle != slave_state[slave].comm_toggle) { + mlx4_warn(dev, "Incorrect toggle %d from slave %d. *** MASTER STATE COMPROMISIED ***\n", + toggle, slave); + goto reset_slave; + } + if (cmd == MLX4_COMM_CMD_RESET) { + mlx4_warn(dev, "Received reset from slave:%d\n", slave); + slave_state[slave].active = false; + /*check if we are in the middle of FLR process, + if so return "retry" status to the slave*/ + if (MLX4_COMM_CMD_FLR == slave_state[slave].last_cmd) { + slave_status = MLX4_DELAY_RESET_SLAVE; + goto inform_slave_state; + } + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_SLAVE_SHUTDOWN, + (unsigned long)slave); + + /* write the version in the event field */ + reply |= mlx4_comm_get_version(); + + goto reset_slave; + } + /*command from slave in the middle of FLR*/ + if (cmd != MLX4_COMM_CMD_RESET && MLX4_COMM_CMD_FLR == slave_state[slave].last_cmd) { + mlx4_warn(dev, "slave:%d is Trying to run cmd(0x%x) in the middle of FLR\n", slave, cmd); + return; + } + + switch (cmd) { + case MLX4_COMM_CMD_VHCR0: + if (slave_state[slave].last_cmd != MLX4_COMM_CMD_RESET) + goto reset_slave; + slave_state[slave].vhcr_dma = ((u64) param) << 48; + priv->mfunc.master.slave_state[slave].cookie = 0; + mutex_init(&priv->mfunc.master.gen_eqe_mutex[slave]); + break; + case MLX4_COMM_CMD_VHCR1: + if (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR0) + goto reset_slave; + slave_state[slave].vhcr_dma |= ((u64) param) << 32; + break; + case MLX4_COMM_CMD_VHCR2: + if (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR1) + goto reset_slave; + slave_state[slave].vhcr_dma |= ((u64) param) << 16; + break; + case MLX4_COMM_CMD_VHCR_EN: + if (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR2) + goto reset_slave; + slave_state[slave].vhcr_dma |= param; + if (mlx4_QUERY_FUNC(dev, slave, &slave_state[slave].pf_num)) { + mlx4_err(dev, "Failed to determine physical function " + "number for slave %d\n", slave); + goto reset_slave; + } + slave_state[slave].vep_num = slave_state[slave].pf_num >> 1; + slave_state[slave].active = true; + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_SLAVE_INIT, slave); + break; + case MLX4_COMM_CMD_VHCR_POST: + if ((slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_EN) && + (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_POST)) + goto reset_slave; + down(&priv->cmd.slave_sem); + if (mlx4_master_process_vhcr(dev, slave, NULL)) { + mlx4_err(dev, "Failed processing vhcr for slave:%d, reseting slave.\n", slave); + up(&priv->cmd.slave_sem); + goto reset_slave; + } + up(&priv->cmd.slave_sem); + break; + default: + mlx4_warn(dev, "Bad comm cmd:%d from slave:%d\n", cmd, slave); + goto reset_slave; + } + spin_lock(&priv->mfunc.master.slave_state_lock); + if (!slave_state[slave].is_slave_going_down) { + slave_state[slave].last_cmd = cmd; + } else { + is_going_down = 1; + } + spin_unlock(&priv->mfunc.master.slave_state_lock); + if (is_going_down) { + mlx4_warn(dev, "Slave is going down aborting command(%d) executing from slave:%d\n", + cmd, slave); + return; + } + __raw_writel((__force u32) cpu_to_be32(reply), + 
&priv->mfunc.comm[slave].slave_read); + mmiowb(); + if (mlx4_GEN_EQE(dev, slave, &priv->mfunc.master.cmd_eqe)) + mlx4_warn(dev, "Failed to generate command completion eqe " + "for slave %d\n", slave); + + return; + +reset_slave: + /* cleanup any slave resources */ + mlx4_delete_all_resources_for_slave(dev, slave); + spin_lock(&priv->mfunc.master.slave_state_lock); + if (!slave_state[slave].is_slave_going_down) { + slave_state[slave].last_cmd = MLX4_COMM_CMD_RESET; + } + spin_unlock(&priv->mfunc.master.slave_state_lock); + /*with slave in the middle of flr, no need to clean resources again.*/ +inform_slave_state: + memset(&slave_state[slave].event_eq, 0, + sizeof(struct mlx4_slave_event_eq_info)); + __raw_writel((__force u32) cpu_to_be32(reply), + &priv->mfunc.comm[slave].slave_read); + wmb(); +} + +/* master command processing */ +void mlx4_master_comm_channel(struct work_struct *work) +{ + struct mlx4_mfunc_master_ctx *master = container_of(work, + struct mlx4_mfunc_master_ctx, + comm_work); + struct mlx4_mfunc *mfunc = container_of(master, struct mlx4_mfunc, master); + struct mlx4_priv *priv = container_of(mfunc, struct mlx4_priv, mfunc); + struct mlx4_dev *dev = &priv->dev; + u32 *bit_vec; + u32 comm_cmd; + u32 vec; + int i, j, slave; + int toggle; + int served = 0; + int reported = 0; + u32 slt; + + bit_vec = master->comm_arm_bit_vector; + for (i = 0; i < COMM_CHANNEL_BIT_ARRAY_SIZE; i++) { + vec = be32_to_cpu(bit_vec[i]); + for (j = 0; j < 32; j++) { + if (!(vec & (1 << j))) + continue; + ++reported; + slave = (i * 32) + j; + comm_cmd = swab32(readl(&mfunc->comm[slave].slave_write)); + slt = swab32(readl(&mfunc->comm[slave].slave_read)) >> 31; + toggle = comm_cmd >> 31; + if (toggle != slt) { + if (master->slave_state[slave].comm_toggle != slt) { + printk("slave %d out of sync. read toggle %d, state toggle %d. Resynching.\n", + slave, slt, master->slave_state[slave].comm_toggle); + master->slave_state[slave].comm_toggle = slt; + } + mlx4_master_do_cmd(dev, slave, comm_cmd >> 16 & 0xff, + comm_cmd & 0xffff, toggle); + ++served; + } + } + } + + if (reported && reported != served) + mlx4_warn(dev, "Got command event with bitmask from %d slaves but %d were served\n", + reported, served); + + if (mlx4_ARM_COMM_CHANNEL(dev)) + mlx4_warn(dev, "Failed to arm comm channel events\n"); +} + +static int sync_toggles(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int wr_toggle; + int rd_toggle; + unsigned long end; + + wr_toggle = swab32(readl(&priv->mfunc.comm->slave_write)) >> 31; + end = jiffies + msecs_to_jiffies(5000); + + while (time_before(jiffies, end)) { + rd_toggle = swab32(readl(&priv->mfunc.comm->slave_read)) >> 31; + if (rd_toggle == wr_toggle) { + priv->cmd.comm_toggle = rd_toggle; + return 0; + } + + cond_resched(); + } + + /* + * we coud reach here if for example the previous VM using this function + * misbehaved and left the channel with unsynced state. 
We should fix + * this here and give this VM a chance to use a properly synced channel + */ + mlx4_warn(dev, "recovering from previously mis-behaved VM\n"); + __raw_writel((__force u32) 0, &priv->mfunc.comm->slave_read); + __raw_writel((__force u32) 0, &priv->mfunc.comm->slave_write); + priv->cmd.comm_toggle = 0; + + return 0; +} + +int mlx4_multi_func_init(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_state; + int i, err, port; + + priv->mfunc.vhcr = dma_alloc_coherent(&(dev->pdev->dev), PAGE_SIZE, + &priv->mfunc.vhcr_dma, + GFP_KERNEL); + if (!priv->mfunc.vhcr) { + mlx4_err(dev, "Couldn't allocate vhcr.\n"); + return -ENOMEM; + } + priv->mfunc.vhcr->cookie = 0; + + if (mlx4_is_master(dev)) + priv->mfunc.comm = ioremap(pci_resource_start(dev->pdev, + priv->fw.comm_bar) + + priv->fw.comm_base, + MLX4_COMM_PAGESIZE); + else + priv->mfunc.comm = ioremap(pci_resource_start(dev->pdev, 2) + + MLX4_SLAVE_COMM_BASE, + MLX4_COMM_PAGESIZE); + if (!priv->mfunc.comm) { + mlx4_err(dev, "Couldn't map communication vector.\n"); + goto err_vhcr; + } + + if (mlx4_is_master(dev)) { + priv->mfunc.master.slave_state = kzalloc(dev->num_slaves * + sizeof(struct mlx4_slave_state), + GFP_KERNEL); + if (!priv->mfunc.master.slave_state) + goto err_comm; + + for (i = 0; i < dev->num_slaves; ++i) { + s_state = &priv->mfunc.master.slave_state[i]; + s_state->last_cmd = MLX4_COMM_CMD_RESET; + __raw_writel((__force u32) 0, &priv->mfunc.comm[i].slave_write); + __raw_writel((__force u32) 0, &priv->mfunc.comm[i].slave_read); + mmiowb(); + for (port = 1; port <= MLX4_MAX_PORTS; port++) { + s_state->vlan_filter[port] = + kzalloc(sizeof(struct mlx4_vlan_fltr), + GFP_KERNEL); + if (!s_state->vlan_filter[port]) { + if (--port) + kfree(s_state->vlan_filter[port]); + goto err_slaves; + } + INIT_LIST_HEAD(&s_state->mcast_filters[port]); + } + spin_lock_init(&s_state->lock); + } + + memset(&priv->mfunc.master.cmd_eqe, 0, sizeof(struct mlx4_eqe)); + priv->mfunc.master.cmd_eqe.type = MLX4_EVENT_TYPE_CMD; + INIT_WORK(&priv->mfunc.master.comm_work, mlx4_master_comm_channel); + INIT_WORK(&priv->mfunc.master.slave_event_work, mlx4_gen_slave_eqe); + INIT_WORK(&priv->mfunc.master.vep_config_work, mlx4_update_vep_config); + INIT_WORK(&priv->mfunc.master.slave_flr_event_work, mlx4_master_handle_slave_flr); + spin_lock_init(&priv->mfunc.master.vep_config_lock); + spin_lock_init(&priv->mfunc.master.slave_state_lock); + spin_lock_init(&priv->mfunc.master.slave_eq.event_lock); + priv->mfunc.master.comm_wq = create_singlethread_workqueue("mlx4_comm"); + if (!priv->mfunc.master.comm_wq) + goto err_slaves; + + if (mlx4_init_resource_tracker(dev)) + goto err_thread; + + mlx4_QUERY_VEP_CFG(dev, dev->caps.function, + &priv->mfunc.master.slave_state[dev->caps.function].vep_cfg); + priv->mfunc.master.slave_state[dev->caps.function].pf_num = dev->caps.function; + priv->mfunc.master.slave_state[dev->caps.function].vep_num = dev->caps.function >> 1; + + sema_init(&priv->cmd.slave_sem, 1); + err = mlx4_ARM_COMM_CHANNEL(dev); + if (err) { + mlx4_err(dev, " Failed to arm comm channel eq: %x\n", err); + goto err_resource; + } + + } else { + err = sync_toggles(dev); + if (err) { + mlx4_err(dev, "Couldn't sync toggles\n"); + goto err_comm; + } + + sema_init(&priv->cmd.slave_sem, 1); + } + return 0; + +err_resource: + mlx4_free_resource_tracker(dev); +err_thread: + flush_workqueue(priv->mfunc.master.comm_wq); + destroy_workqueue(priv->mfunc.master.comm_wq); +err_slaves: + while (--i) { + for (port = 1; port <= 
MLX4_MAX_PORTS; port++) + kfree(priv->mfunc.master.slave_state[i].vlan_filter[port]); + } + kfree(priv->mfunc.master.slave_state); +err_comm: + iounmap(priv->mfunc.comm); +err_vhcr: + dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE, + priv->mfunc.vhcr, + priv->mfunc.vhcr_dma); + priv->mfunc.vhcr = NULL; + return -ENOMEM; +} + int mlx4_cmd_init(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -330,22 +1811,50 @@ int mlx4_cmd_init(struct mlx4_dev *dev) priv->cmd.use_events = 0; priv->cmd.toggle = 1; - priv->cmd.hcr = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_HCR_BASE, - MLX4_HCR_SIZE); - if (!priv->cmd.hcr) { - mlx4_err(dev, "Couldn't map command register."); - return -ENOMEM; + priv->cmd.hcr = NULL; + priv->mfunc.vhcr = NULL; + + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) { + priv->cmd.hcr = ioremap(pci_resource_start(dev->pdev, 0) + + MLX4_HCR_BASE, MLX4_HCR_SIZE); + if (!priv->cmd.hcr) { + mlx4_err(dev, "Couldn't map command register.\n"); + return -ENOMEM; + } } priv->cmd.pool = pci_pool_create("mlx4_cmd", dev->pdev, MLX4_MAILBOX_SIZE, MLX4_MAILBOX_SIZE, 0); - if (!priv->cmd.pool) { - iounmap(priv->cmd.hcr); - return -ENOMEM; - } + if (!priv->cmd.pool) + goto err_hcr; return 0; + +err_hcr: + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) + iounmap(priv->cmd.hcr); + return -ENOMEM; +} + +void mlx4_multi_func_cleanup(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i, port; + + if (mlx4_is_master(dev)) { + flush_workqueue(priv->mfunc.master.comm_wq); + destroy_workqueue(priv->mfunc.master.comm_wq); + for (i = 0; i < dev->num_slaves; i++) { + for (port = 1; port <= MLX4_MAX_PORTS; port++) + kfree(priv->mfunc.master.slave_state[i].vlan_filter[port]); + } + kfree(priv->mfunc.master.slave_state); + } + iounmap(priv->mfunc.comm); + dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE, + priv->mfunc.vhcr, priv->mfunc.vhcr_dma); + priv->mfunc.vhcr = NULL; } void mlx4_cmd_cleanup(struct mlx4_dev *dev) @@ -353,7 +1862,9 @@ void mlx4_cmd_cleanup(struct mlx4_dev *dev) struct mlx4_priv *priv = mlx4_priv(dev); pci_pool_destroy(priv->cmd.pool); - iounmap(priv->cmd.hcr); + + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) + iounmap(priv->cmd.hcr); } /* @@ -364,6 +1875,7 @@ int mlx4_cmd_use_events(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); int i; + int err = 0; priv->cmd.context = kmalloc(priv->cmd.max_cmds * sizeof (struct mlx4_cmd_context), @@ -388,11 +1900,23 @@ int mlx4_cmd_use_events(struct mlx4_dev *dev) ; /* nothing */ --priv->cmd.token_mask; + down(&priv->cmd.poll_sem); priv->cmd.use_events = 1; - down(&priv->cmd.poll_sem); + if (mlx4_is_mfunc(dev) && !mlx4_is_master(dev)) { + err = mlx4_cmd(dev, 0, 1, 0, MLX4_CMD_COMM_INT, MLX4_CMD_TIME_CLASS_A, 0); + if (err) { + mlx4_err(dev, "Failed to move to events for the slave\n"); + priv->cmd.use_events = 0; + for (i = 0; i < priv->cmd.max_cmds; ++i) + down(&priv->cmd.event_sem); - return 0; + up(&priv->cmd.poll_sem); + } + } + + + return err; } /* @@ -411,6 +1935,9 @@ void mlx4_cmd_use_polling(struct mlx4_dev *dev) kfree(priv->cmd.context); up(&priv->cmd.poll_sem); + + if (mlx4_is_mfunc(dev) && !mlx4_is_master(dev)) + mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_COMM_INT, MLX4_CMD_TIME_CLASS_A, 0); } struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev) @@ -441,3 +1968,25 @@ void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbo kfree(mailbox); } EXPORT_SYMBOL_GPL(mlx4_free_cmd_mailbox); + +int mlx4_COMM_INT_wrapper(struct mlx4_dev *dev, int slave, struct 
mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_event_eq_info *event_eq = + &priv->mfunc.master.slave_state[slave].event_eq; + + if (vhcr->in_modifier) + event_eq->use_int = true; + else + event_eq->use_int = false; + + return 0; +} + +u32 mlx4_comm_get_version(void) +{ + return (((u32)CMD_CHAN_IF_REV << 8) | (u32)CMD_CHAN_VER); +} diff --git a/drivers/net/mlx4/cq.c b/drivers/net/mlx4/cq.c index bd8ef9f2fa715..88f45a1fc2b68 100644 --- a/drivers/net/mlx4/cq.c +++ b/drivers/net/mlx4/cq.c @@ -34,8 +34,8 @@ * SOFTWARE. */ +#include #include -#include #include #include @@ -43,27 +43,6 @@ #include "mlx4.h" #include "icm.h" -struct mlx4_cq_context { - __be32 flags; - u16 reserved1[3]; - __be16 page_offset; - __be32 logsize_usrpage; - __be16 cq_period; - __be16 cq_max_count; - u8 reserved2[3]; - u8 comp_eqn; - u8 log_page_size; - u8 reserved3[2]; - u8 mtt_base_addr_h; - __be32 mtt_base_addr_l; - __be32 last_notified_index; - __be32 solicit_producer_index; - __be32 consumer_index; - __be32 producer_index; - u32 reserved4[2]; - __be64 db_rec_addr; -}; - #define MLX4_CQ_STATUS_OK ( 0 << 28) #define MLX4_CQ_STATUS_OVERFLOW ( 9 << 28) #define MLX4_CQ_STATUS_WRITE_FAIL (10 << 28) @@ -80,7 +59,7 @@ void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn) cq = radix_tree_lookup(&mlx4_priv(dev)->cq_table.tree, cqn & (dev->caps.num_cqs - 1)); if (!cq) { - mlx4_warn(dev, "Completion event for bogus CQ %08x\n", cqn); + mlx4_dbg(dev, "Completion event for bogus CQ %08x\n", cqn); return; } @@ -116,23 +95,23 @@ void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type) static int mlx4_SW2HW_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, int cq_num) { - return mlx4_cmd(dev, mailbox->dma, cq_num, 0, MLX4_CMD_SW2HW_CQ, - MLX4_CMD_TIME_CLASS_A); + return mlx4_cmd(dev, mailbox->dma | dev->caps.function, cq_num, 0, + MLX4_CMD_SW2HW_CQ, MLX4_CMD_TIME_CLASS_A, 0); } static int mlx4_MODIFY_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, int cq_num, u32 opmod) { return mlx4_cmd(dev, mailbox->dma, cq_num, opmod, MLX4_CMD_MODIFY_CQ, - MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_TIME_CLASS_A, 0); } static int mlx4_HW2SW_CQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, int cq_num) { - return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, cq_num, - mailbox ? 0 : 1, MLX4_CMD_HW2SW_CQ, - MLX4_CMD_TIME_CLASS_A); + return mlx4_cmd_box(dev, dev->caps.function, mailbox ? mailbox->dma : 0, + cq_num, mailbox ? 
0 : 1, MLX4_CMD_HW2SW_CQ, + MLX4_CMD_TIME_CLASS_A, 0); } int mlx4_cq_modify(struct mlx4_dev *dev, struct mlx4_cq *cq, @@ -187,6 +166,96 @@ int mlx4_cq_resize(struct mlx4_dev *dev, struct mlx4_cq *cq, } EXPORT_SYMBOL_GPL(mlx4_cq_resize); +static int mlx4_find_least_loaded_vector(struct mlx4_priv *priv) +{ + int i; + int index = 0; + int min = priv->eq_table.eq[0].load; + + for (i = 1; i < priv->dev.caps.num_comp_vectors; i++) { + if (priv->eq_table.eq[i].load < min) { + index = i; + min = priv->eq_table.eq[i].load; + } + } + + return index; +} + +int __mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cq_table *cq_table = &priv->cq_table; + int err; + + *cqn = mlx4_bitmap_alloc(&cq_table->bitmap); + if (*cqn == -1) + return -ENOMEM; + + err = mlx4_table_get(dev, &cq_table->table, *cqn, MLX4_MR_FLAG_NONE); + if (err) + goto err_out; + + err = mlx4_table_get(dev, &cq_table->cmpt_table, *cqn, + MLX4_MR_FLAG_NONE); + if (err) + goto err_put; + return 0; + +err_put: + mlx4_table_put(dev, &cq_table->table, *cqn, MLX4_MR_FLAG_NONE); + +err_out: + mlx4_bitmap_free(&cq_table->bitmap, *cqn); + return err; +} + +int mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn) +{ + u64 out_param; + int err; + + if (mlx4_is_mfunc(dev)) { + err = mlx4_cmd_imm(dev, 0, &out_param, RES_CQ, + RES_OP_RESERVE_AND_MAP, + MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, 0); + if (err) + return err; + else { + *cqn = get_param_l(&out_param); + return 0; + } + } + return __mlx4_cq_alloc_icm(dev, cqn); +} + +void __mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cq_table *cq_table = &priv->cq_table; + + mlx4_table_put(dev, &cq_table->cmpt_table, cqn, MLX4_MR_FLAG_NONE); + mlx4_table_put(dev, &cq_table->table, cqn, MLX4_MR_FLAG_NONE); + mlx4_bitmap_free(&cq_table->bitmap, cqn); +} + +void mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn) +{ + u64 in_param; + int err; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, cqn); + err = mlx4_cmd(dev, in_param, RES_CQ, RES_OP_RESERVE_AND_MAP, + MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, 0); + if (err) + mlx4_warn(dev, "Failed freeing cq:%d\n", cqn); + } else + __mlx4_cq_free_icm(dev, cqn); +} + int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt, struct mlx4_uar *uar, u64 db_rec, struct mlx4_cq *cq, unsigned vector, int collapsed) @@ -198,28 +267,21 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt, u64 mtt_addr; int err; - if (vector > dev->caps.num_comp_vectors + dev->caps.comp_pool) - return -EINVAL; - - cq->vector = vector; - - cq->cqn = mlx4_bitmap_alloc(&cq_table->bitmap); - if (cq->cqn == -1) - return -ENOMEM; + cq->vector = (vector == MLX4_LEAST_ATTACHED_VECTOR) ? 
+ mlx4_find_least_loaded_vector(priv) : vector; - err = mlx4_table_get(dev, &cq_table->table, cq->cqn); - if (err) - goto err_out; + if (cq->vector >= dev->caps.num_comp_vectors) + return -EINVAL; - err = mlx4_table_get(dev, &cq_table->cmpt_table, cq->cqn); + err = mlx4_cq_alloc_icm(dev, &cq->cqn); if (err) - goto err_put; + return err; spin_lock_irq(&cq_table->lock); err = radix_tree_insert(&cq_table->tree, cq->cqn, cq); spin_unlock_irq(&cq_table->lock); if (err) - goto err_cmpt_put; + goto err_icm; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) { @@ -232,7 +294,7 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt, cq_context->flags = cpu_to_be32(!!collapsed << 18); cq_context->logsize_usrpage = cpu_to_be32((ilog2(nent) << 24) | uar->index); - cq_context->comp_eqn = priv->eq_table.eq[vector].eqn; + cq_context->comp_eqn = priv->eq_table.eq[cq->vector].eqn; cq_context->log_page_size = mtt->page_shift - MLX4_ICM_PAGE_SHIFT; mtt_addr = mlx4_mtt_addr(dev, mtt); @@ -245,6 +307,7 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt, if (err) goto err_radix; + priv->eq_table.eq[cq->vector].load++; cq->cons_index = 0; cq->arm_sn = 1; cq->uar = uar; @@ -258,14 +321,8 @@ err_radix: radix_tree_delete(&cq_table->tree, cq->cqn); spin_unlock_irq(&cq_table->lock); -err_cmpt_put: - mlx4_table_put(dev, &cq_table->cmpt_table, cq->cqn); - -err_put: - mlx4_table_put(dev, &cq_table->table, cq->cqn); - -err_out: - mlx4_bitmap_free(&cq_table->bitmap, cq->cqn); +err_icm: + mlx4_cq_free_icm(dev, cq->cqn); return err; } @@ -282,6 +339,7 @@ void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq) mlx4_warn(dev, "HW2SW_CQ failed (%d) for CQN %06x\n", err, cq->cqn); synchronize_irq(priv->eq_table.eq[cq->vector].irq); + priv->eq_table.eq[cq->vector].load--; spin_lock_irq(&cq_table->lock); radix_tree_delete(&cq_table->tree, cq->cqn); @@ -291,8 +349,7 @@ void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq) complete(&cq->free); wait_for_completion(&cq->free); - mlx4_table_put(dev, &cq_table->table, cq->cqn); - mlx4_bitmap_free(&cq_table->bitmap, cq->cqn); + mlx4_cq_free_icm(dev, cq->cqn); } EXPORT_SYMBOL_GPL(mlx4_cq_free); @@ -303,6 +360,8 @@ int mlx4_init_cq_table(struct mlx4_dev *dev) spin_lock_init(&cq_table->lock); INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC); + if (mlx4_is_mfunc(dev) && !mlx4_is_master(dev)) + return 0; err = mlx4_bitmap_init(&cq_table->bitmap, dev->caps.num_cqs, dev->caps.num_cqs - 1, dev->caps.reserved_cqs, 0); @@ -314,6 +373,8 @@ int mlx4_init_cq_table(struct mlx4_dev *dev) void mlx4_cleanup_cq_table(struct mlx4_dev *dev) { + if (mlx4_is_mfunc(dev) && !mlx4_is_master(dev)) + return; /* Nothing to do to clean up radix_tree */ mlx4_bitmap_cleanup(&mlx4_priv(dev)->cq_table.bitmap); } diff --git a/drivers/net/mlx4/en_cq.c b/drivers/net/mlx4/en_cq.c index 2d1a34267b80a..093f43ffed895 100644 --- a/drivers/net/mlx4/en_cq.c +++ b/drivers/net/mlx4/en_cq.c @@ -51,7 +51,14 @@ int mlx4_en_create_cq(struct mlx4_en_priv *priv, int err; cq->size = entries; - cq->buf_size = cq->size * sizeof(struct mlx4_cqe); + if (mode == RX) { + cq->buf_size = cq->size * sizeof(struct mlx4_cqe); + cq->vector = (ring + priv->port) % + mdev->dev->caps.num_comp_vectors; + } else { + cq->buf_size = sizeof(struct mlx4_cqe); + cq->vector = MLX4_LEAST_ATTACHED_VECTOR; + } cq->ring = ring; cq->is_tx = mode; @@ -71,12 +78,10 @@ int mlx4_en_create_cq(struct mlx4_en_priv *priv, return err; } -int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, - int 
cq_idx) +int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq) { struct mlx4_en_dev *mdev = priv->mdev; - int err = 0; - char name[25]; + int err; cq->dev = mdev->pndev[priv->port]; cq->mcq.set_ci_db = cq->wqres.db.db; @@ -85,39 +90,11 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, *cq->mcq.arm_db = 0; memset(cq->buf, 0, cq->buf_size); - if (cq->is_tx == RX) { - if (mdev->dev->caps.comp_pool) { - if (!cq->vector) { - sprintf(name, "%s-%d", priv->dev->name, - cq->ring); - /* Set IRQ for specific name (per ring) */ - if (mlx4_assign_eq(mdev->dev, name, &cq->vector)) { - cq->vector = (cq->ring + 1 + priv->port) - % mdev->dev->caps.num_comp_vectors; - mlx4_warn(mdev, "Failed Assigning an EQ to " - "%s ,Falling back to legacy EQ's\n", - name); - } - } - } else { - cq->vector = (cq->ring + 1 + priv->port) % - mdev->dev->caps.num_comp_vectors; - } - } else { - /* For TX we use the same irq per - ring we assigned for the RX */ - struct mlx4_en_cq *rx_cq; - - cq_idx = cq_idx % priv->rx_ring_num; - rx_cq = &priv->rx_cq[cq_idx]; - cq->vector = rx_cq->vector; - } - if (!cq->is_tx) cq->size = priv->rx_ring[cq->ring].actual_size; err = mlx4_cq_alloc(mdev->dev, cq->size, &cq->wqres.mtt, &mdev->priv_uar, - cq->wqres.db.dma, &cq->mcq, cq->vector, 0); + cq->wqres.db.dma, &cq->mcq, cq->vector, cq->is_tx); if (err) return err; @@ -142,8 +119,6 @@ void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq) mlx4_en_unmap_buffer(&cq->wqres.buf); mlx4_free_hwq_res(mdev->dev, &cq->wqres, cq->buf_size); - if (priv->mdev->dev->caps.comp_pool && cq->vector) - mlx4_release_eq(priv->mdev->dev, cq->vector); cq->buf_size = 0; cq->buf = NULL; } diff --git a/drivers/net/mlx4/en_ethtool.c b/drivers/net/mlx4/en_ethtool.c index 25440e91daa1b..de201fe461de7 100644 --- a/drivers/net/mlx4/en_ethtool.c +++ b/drivers/net/mlx4/en_ethtool.c @@ -34,18 +34,34 @@ #include #include #include +#include #include "mlx4_en.h" #include "en_port.h" +static void mlx4_en_update_lro_stats(struct mlx4_en_priv *priv) +{ + int i; + + priv->port_stats.lro_aggregated = 0; + priv->port_stats.lro_flushed = 0; + priv->port_stats.lro_no_desc = 0; + + for (i = 0; i < priv->rx_ring_num; i++) { + priv->port_stats.lro_aggregated += priv->rx_ring[i].lro.stats.aggregated; + priv->port_stats.lro_flushed += priv->rx_ring[i].lro.stats.flushed; + priv->port_stats.lro_no_desc += priv->rx_ring[i].lro.stats.no_desc; + } +} + static void mlx4_en_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; - strncpy(drvinfo->driver, DRV_NAME, 32); + sprintf(drvinfo->driver, DRV_NAME " (%s)", mdev->dev->board_id); strncpy(drvinfo->version, DRV_VERSION " (" DRV_RELDATE ")", 32); sprintf(drvinfo->fw_version, "%d.%d.%d", (u16) (mdev->dev->caps.fw_ver >> 32), @@ -57,6 +73,64 @@ mlx4_en_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) drvinfo->eedump_len = 0; } +static u32 mlx4_en_get_tso(struct net_device *dev) +{ + return (dev->features & NETIF_F_TSO) != 0; +} + +static int mlx4_en_set_tso(struct net_device *dev, u32 data) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + if (data) { + if (!priv->mdev->LSO_support) + return -EPERM; + dev->features |= (NETIF_F_TSO | NETIF_F_TSO6); +#ifdef HAVE_NETDEV_VLAN_FEATURES + dev->vlan_features |= (NETIF_F_TSO | NETIF_F_TSO6); +#else + if (priv->vlgrp) { + int i; + struct net_device *vdev; + for (i = 0; i < VLAN_N_VID; i++) { + vdev = 
vlan_group_get_device(priv->vlgrp, i); + vdev->features |= (NETIF_F_TSO | NETIF_F_TSO6); + vlan_group_set_device(priv->vlgrp, i, vdev); + } + } +#endif + } else { + dev->features &= ~(NETIF_F_TSO | NETIF_F_TSO6); +#ifdef HAVE_NETDEV_VLAN_FEATURES + dev->vlan_features &= ~(NETIF_F_TSO | NETIF_F_TSO6); +#else + if (priv->vlgrp) { + int i; + struct net_device *vdev; + for (i = 0; i < VLAN_N_VID; i++) { + vdev = vlan_group_get_device(priv->vlgrp, i); + vdev->features &= ~(NETIF_F_TSO | NETIF_F_TSO6); + vlan_group_set_device(priv->vlgrp, i, vdev); + } + } +#endif + } + return 0; +} + +static u32 mlx4_en_get_rx_csum(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + return priv->rx_csum; +} + +static int mlx4_en_set_rx_csum(struct net_device *dev, u32 data) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + priv->rx_csum = (data != 0); + return 0; +} + static const char main_strings[][ETH_GSTRING_LEN] = { "rx_packets", "tx_packets", "rx_bytes", "tx_bytes", "rx_errors", "tx_errors", "rx_dropped", "tx_dropped", "multicast", "collisions", @@ -66,7 +140,7 @@ static const char main_strings[][ETH_GSTRING_LEN] = { "tx_heartbeat_errors", "tx_window_errors", /* port statistics */ - "tso_packets", + "lro_aggregated", "lro_flushed", "lro_no_desc", "tso_packets", "queue_stopped", "wake_queue", "tx_timeout", "rx_alloc_failed", "rx_csum_good", "rx_csum_none", "tx_chksum_offload", @@ -102,35 +176,28 @@ static void mlx4_en_get_wol(struct net_device *netdev, { struct mlx4_en_priv *priv = netdev_priv(netdev); int err = 0; - u64 config = 0; - u64 mask; - - if ((priv->port < 1) || (priv->port > 2)) { - en_err(priv, "Failed to get WoL information\n"); - return; - } + struct mlx4_wol_struct wol_info; - mask = (priv->port == 1) ? MLX4_DEV_CAP_FLAG_WOL_PORT1 : - MLX4_DEV_CAP_FLAG_WOL_PORT2; - - if (!(priv->mdev->dev->caps.flags & mask)) { + if (!priv->mdev->dev->caps.wol) { wol->supported = 0; wol->wolopts = 0; return; } - err = mlx4_wol_read(priv->mdev->dev, &config, priv->port); + memset(&wol_info, 0, sizeof(wol_info)); + + err = mlx4_wol_read(priv->mdev->dev, &wol_info, priv->port); if (err) { en_err(priv, "Failed to get WoL information\n"); return; } - if (config & MLX4_EN_WOL_MAGIC) + if (be32_to_cpu(wol_info.flags) & MLX4_EN_WOL_MAGIC) wol->supported = WAKE_MAGIC; else wol->supported = 0; - if (config & MLX4_EN_WOL_ENABLED) + if (be32_to_cpu(wol_info.flags) & MLX4_EN_WOL_ENABLED) wol->wolopts = WAKE_MAGIC; else wol->wolopts = 0; @@ -140,42 +207,41 @@ static int mlx4_en_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) { struct mlx4_en_priv *priv = netdev_priv(netdev); - u64 config = 0; + struct mlx4_wol_struct wol_info; int err = 0; - u64 mask; - if ((priv->port < 1) || (priv->port > 2)) - return -EOPNOTSUPP; - - mask = (priv->port == 1) ? 
MLX4_DEV_CAP_FLAG_WOL_PORT1 : - MLX4_DEV_CAP_FLAG_WOL_PORT2; - - if (!(priv->mdev->dev->caps.flags & mask)) + if (!priv->mdev->dev->caps.wol) { + wol->supported = 0; + wol->wolopts = 0; return -EOPNOTSUPP; + } if (wol->supported & ~WAKE_MAGIC) return -EINVAL; - err = mlx4_wol_read(priv->mdev->dev, &config, priv->port); - if (err) { - en_err(priv, "Failed to get WoL info, unable to modify\n"); - return err; - } + memset(&wol_info, 0, sizeof(wol_info)); + + err = mlx4_wol_read(priv->mdev->dev, &wol_info, priv->port); + if (err) + en_err(priv, "Failed to get WoL information\n"); if (wol->wolopts & WAKE_MAGIC) { - config |= MLX4_EN_WOL_DO_MODIFY | MLX4_EN_WOL_ENABLED | - MLX4_EN_WOL_MAGIC; + wol_info.flags |= cpu_to_be32(MLX4_EN_WOL_DO_MODIFY | + MLX4_EN_WOL_ENABLED | + MLX4_EN_WOL_MAGIC); } else { - config &= ~(MLX4_EN_WOL_ENABLED | MLX4_EN_WOL_MAGIC); - config |= MLX4_EN_WOL_DO_MODIFY; + wol_info.flags |= + cpu_to_be32(MLX4_EN_WOL_DO_MODIFY & ~MLX4_EN_WOL_MAGIC); } + err = mlx4_wol_write(priv->mdev->dev, &wol_info, priv->port); - err = mlx4_wol_write(priv->mdev->dev, config, priv->port); - if (err) + if (err) { en_err(priv, "Failed to set WoL information\n"); + return err; + } - return err; -} + return 0; + } static int mlx4_en_get_sset_count(struct net_device *dev, int sset) { @@ -186,8 +252,7 @@ static int mlx4_en_get_sset_count(struct net_device *dev, int sset) return NUM_ALL_STATS + (priv->tx_ring_num + priv->rx_ring_num) * 2; case ETH_SS_TEST: - return MLX4_EN_NUM_SELF_TEST - !(priv->mdev->dev->caps.flags - & MLX4_DEV_CAP_FLAG_UC_LOOPBACK) * 2; + return MLX4_EN_NUM_SELF_TEST - !(priv->mdev->dev->caps.loopback_support) * 2; default: return -EOPNOTSUPP; } @@ -202,6 +267,8 @@ static void mlx4_en_get_ethtool_stats(struct net_device *dev, spin_lock_bh(&priv->stats_lock); + mlx4_en_update_lro_stats(priv); + for (i = 0; i < NUM_MAIN_STATS; i++) data[index++] = ((unsigned long *) &priv->stats)[i]; for (i = 0; i < NUM_PORT_STATS; i++) @@ -237,7 +304,7 @@ static void mlx4_en_get_strings(struct net_device *dev, case ETH_SS_TEST: for (i = 0; i < MLX4_EN_NUM_SELF_TEST - 2; i++) strcpy(data + i * ETH_GSTRING_LEN, mlx4_en_test_names[i]); - if (priv->mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UC_LOOPBACK) + if (priv->mdev->dev->caps.loopback_support) for (; i < MLX4_EN_NUM_SELF_TEST; i++) strcpy(data + i * ETH_GSTRING_LEN, mlx4_en_test_names[i]); break; @@ -282,10 +349,10 @@ static int mlx4_en_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) trans_type = priv->port_state.transciver; if (netif_carrier_ok(dev)) { - ethtool_cmd_speed_set(cmd, priv->port_state.link_speed); + cmd->speed = priv->port_state.link_speed; cmd->duplex = DUPLEX_FULL; } else { - ethtool_cmd_speed_set(cmd, -1); + cmd->speed = -1; cmd->duplex = -1; } @@ -309,8 +376,7 @@ static int mlx4_en_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) static int mlx4_en_set_settings(struct net_device *dev, struct ethtool_cmd *cmd) { if ((cmd->autoneg == AUTONEG_ENABLE) || - (ethtool_cmd_speed(cmd) != SPEED_10000) || - (cmd->duplex != DUPLEX_FULL)) + (cmd->speed != SPEED_10000) || (cmd->duplex != DUPLEX_FULL)) return -EINVAL; /* Nothing to change */ @@ -344,7 +410,8 @@ static int mlx4_en_set_coalesce(struct net_device *dev, priv->rx_frames = (coal->rx_max_coalesced_frames == MLX4_EN_AUTO_CONF) ? - MLX4_EN_RX_COAL_TARGET : + MLX4_EN_RX_COAL_TARGET / + priv->dev->mtu + 1 : coal->rx_max_coalesced_frames; priv->rx_usecs = (coal->rx_coalesce_usecs == MLX4_EN_AUTO_CONF) ? 
@@ -358,13 +425,13 @@ static int mlx4_en_set_coalesce(struct net_device *dev, priv->rx_usecs_high = coal->rx_coalesce_usecs_high; priv->sample_interval = coal->rate_sample_interval; priv->adaptive_rx_coal = coal->use_adaptive_rx_coalesce; + priv->last_moder_time = MLX4_EN_AUTO_CONF; if (priv->adaptive_rx_coal) return 0; for (i = 0; i < priv->rx_ring_num; i++) { priv->rx_cq[i].moder_cnt = priv->rx_frames; priv->rx_cq[i].moder_time = priv->rx_usecs; - priv->last_moder_time[i] = MLX4_EN_AUTO_CONF; err = mlx4_en_set_cq_moder(priv, &priv->rx_cq[i]); if (err) return err; @@ -410,7 +477,6 @@ static int mlx4_en_set_ringparam(struct net_device *dev, u32 rx_size, tx_size; int port_up = 0; int err = 0; - int i; if (param->rx_jumbo_pending || param->rx_mini_pending) return -EINVAL; @@ -449,15 +515,6 @@ static int mlx4_en_set_ringparam(struct net_device *dev, en_err(priv, "Failed starting port\n"); } - for (i = 0; i < priv->rx_ring_num; i++) { - priv->rx_cq[i].moder_cnt = priv->rx_frames; - priv->rx_cq[i].moder_time = priv->rx_usecs; - priv->last_moder_time[i] = MLX4_EN_AUTO_CONF; - err = mlx4_en_set_cq_moder(priv, &priv->rx_cq[i]); - if (err) - goto out; - } - out: mutex_unlock(&mdev->state_lock); return err; @@ -480,7 +537,17 @@ const struct ethtool_ops mlx4_en_ethtool_ops = { .get_drvinfo = mlx4_en_get_drvinfo, .get_settings = mlx4_en_get_settings, .set_settings = mlx4_en_set_settings, +#ifdef NETIF_F_TSO + .get_tso = mlx4_en_get_tso, + .set_tso = mlx4_en_set_tso, +#endif + .get_sg = ethtool_op_get_sg, + .set_sg = ethtool_op_set_sg, .get_link = ethtool_op_get_link, + .get_rx_csum = mlx4_en_get_rx_csum, + .set_rx_csum = mlx4_en_set_rx_csum, + .get_tx_csum = ethtool_op_get_tx_csum, + .set_tx_csum = ethtool_op_set_tx_ipv6_csum, .get_strings = mlx4_en_get_strings, .get_sset_count = mlx4_en_get_sset_count, .get_ethtool_stats = mlx4_en_get_ethtool_stats, @@ -495,6 +562,8 @@ const struct ethtool_ops mlx4_en_ethtool_ops = { .set_pauseparam = mlx4_en_set_pauseparam, .get_ringparam = mlx4_en_get_ringparam, .set_ringparam = mlx4_en_set_ringparam, + .get_flags = ethtool_op_get_flags, + .set_flags = ethtool_op_set_flags, }; diff --git a/drivers/net/mlx4/en_frag.c b/drivers/net/mlx4/en_frag.c new file mode 100644 index 0000000000000..9d34e1214c973 --- /dev/null +++ b/drivers/net/mlx4/en_frag.c @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include + +#include "mlx4_en.h" + + +static struct mlx4_en_ipfrag *find_session(struct mlx4_en_rx_ring *ring, + struct iphdr *iph) +{ + struct mlx4_en_ipfrag *session; + int i; + + for (i = 0; i < MLX4_EN_NUM_IPFRAG_SESSIONS; i++) { + session = &ring->ipfrag[i]; + if (session->fragments == NULL) + continue; + if (session->daddr == iph->daddr && + session->saddr == iph->saddr && + session->id == iph->id && + session->protocol == iph->protocol) { + return session; + } + } + return NULL; +} + +static struct mlx4_en_ipfrag *start_session(struct mlx4_en_rx_ring *ring, + struct iphdr *iph) +{ + struct mlx4_en_ipfrag *session; + int index = -1; + int i; + + for (i = 0; i < MLX4_EN_NUM_IPFRAG_SESSIONS; i++) { + if (ring->ipfrag[i].fragments == NULL) { + index = i; + break; + } + } + if (index < 0) + return NULL; + + session = &ring->ipfrag[index]; + + return session; +} + + +static void flush_session(struct mlx4_en_priv *priv, + struct mlx4_en_ipfrag *session, + u16 more) +{ + struct sk_buff *skb = session->fragments; + struct iphdr *iph = ip_hdr(skb); + struct net_device *dev = skb->dev; + + /* Update IP length and checksum */ + iph->tot_len = htons(session->total_len); + iph->frag_off = htons(more | (session->offset >> 3)); + iph->check = 0; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + + if (session->vlan) + vlan_hwaccel_receive_skb(skb, priv->vlgrp, + be16_to_cpu(session->sl_vid)); + else + netif_receive_skb(skb); + dev->last_rx = jiffies; + session->fragments = NULL; + session->last = NULL; +} + + +static inline void frag_append(struct mlx4_en_priv *priv, + struct mlx4_en_ipfrag *session, + struct sk_buff *skb, + unsigned int data_len) +{ + struct sk_buff *parent = session->fragments; + + /* Update skb bookkeeping */ + parent->len += data_len; + parent->data_len += data_len; + session->total_len += data_len; + + skb_pull(skb, skb->len - data_len); + parent->truesize += skb->truesize; + + if (session->last) + session->last->next = skb; + else + skb_shinfo(parent)->frag_list = skb; + + session->last = skb; +} + +int mlx4_en_rx_frags(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring, + struct sk_buff *skb, struct mlx4_cqe *cqe) +{ + struct mlx4_en_ipfrag *session; + struct iphdr *iph; + u16 ip_len; + u16 ip_hlen; + int data_len; + u16 offset; + + skb_reset_network_header(skb); + skb_reset_transport_header(skb); + iph = ip_hdr(skb); + ip_len = ntohs(iph->tot_len); + ip_hlen = iph->ihl * 4; + data_len = ip_len - ip_hlen; + offset = ntohs(iph->frag_off); + offset &= IP_OFFSET; + offset <<= 3; + + session = find_session(ring, iph); + if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) { + if (session) + flush_session(priv, session, IP_MF); + return -EINVAL; + } + if (session) { + if (unlikely(session->offset + session->total_len != + offset + ip_hlen)) { + flush_session(priv, session, IP_MF); + goto new_session; + } + /* Packets smaller then 60 bytes are padded to that size + * Need to fix len field of the skb to fit the actual data size + * Since ethernet header already removed, the IP total length + * is exactly the data size (the skb is linear) + */ + skb->len = ip_len; + + frag_append(priv, session, skb, data_len); + } else { +new_session: + session = 
start_session(ring, iph); + if (unlikely(!session)) + return -ENOSPC; + + session->fragments = skb; + session->daddr = iph->daddr; + session->saddr = iph->saddr; + session->id = iph->id; + session->protocol = iph->protocol; + session->total_len = ip_len; + session->offset = offset; + session->vlan = (priv->vlgrp && + (be32_to_cpu(cqe->vlan_my_qpn) & + MLX4_CQE_VLAN_PRESENT_MASK)) ? 1 : 0; + session->sl_vid = cqe->sl_vid; + } + if (!(ntohs(iph->frag_off) & IP_MF)) + flush_session(priv, session, 0); + else if (session->fragments->len + priv->dev->mtu > 65536) + flush_session(priv, session, IP_MF); + + return 0; +} + + +void mlx4_en_flush_frags(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring *ring) +{ + struct mlx4_en_ipfrag *session; + int i; + + for (i = 0; i < MLX4_EN_NUM_IPFRAG_SESSIONS; i++) { + session = &ring->ipfrag[i]; + if (session->fragments) + flush_session(priv, session, IP_MF); + } +} diff --git a/drivers/net/mlx4/en_main.c b/drivers/net/mlx4/en_main.c index a06096fcc0b8b..ece207d517cae 100644 --- a/drivers/net/mlx4/en_main.c +++ b/drivers/net/mlx4/en_main.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include @@ -70,35 +69,19 @@ MLX4_EN_PARM_INT(tcp_rss, 1, MLX4_EN_PARM_INT(udp_rss, 1, "Enable RSS for incomming UDP traffic or disabled (0)"); +/* Number of LRO sessions per Rx ring (rounded up to a power of two) */ +MLX4_EN_PARM_INT(num_lro, MLX4_EN_MAX_LRO_DESCRIPTORS, + "Number of LRO sessions per ring or disabled (0)"); + +/* Allow reassembly of fragmented IP packets */ +MLX4_EN_PARM_INT(ip_reasm, 1, "Allow reassembly of fragmented IP packets (!0)"); + /* Priority pausing */ MLX4_EN_PARM_INT(pfctx, 0, "Priority based Flow Control policy on TX[7:0]." " Per priority bit mask"); MLX4_EN_PARM_INT(pfcrx, 0, "Priority based Flow Control policy on RX[7:0]." " Per priority bit mask"); -int en_print(const char *level, const struct mlx4_en_priv *priv, - const char *format, ...) 
-{ - va_list args; - struct va_format vaf; - int i; - - va_start(args, format); - - vaf.fmt = format; - vaf.va = &args; - if (priv->registered) - i = printk("%s%s: %s: %pV", - level, DRV_NAME, priv->dev->name, &vaf); - else - i = printk("%s%s: %s: Port %d: %pV", - level, DRV_NAME, dev_name(&priv->mdev->pdev->dev), - priv->port, &vaf); - va_end(args); - - return i; -} - static int mlx4_en_get_profile(struct mlx4_en_dev *mdev) { struct mlx4_en_profile *params = &mdev->profile; @@ -106,11 +89,12 @@ static int mlx4_en_get_profile(struct mlx4_en_dev *mdev) params->tcp_rss = tcp_rss; params->udp_rss = udp_rss; - if (params->udp_rss && !(mdev->dev->caps.flags - & MLX4_DEV_CAP_FLAG_UDP_RSS)) { + if (params->udp_rss && !mdev->dev->caps.udp_rss) { mlx4_warn(mdev, "UDP RSS is not supported on this device.\n"); params->udp_rss = 0; } + params->num_lro = min_t(int, num_lro , MLX4_EN_MAX_LRO_DESCRIPTORS); + params->ip_reasm = ip_reasm; for (i = 1; i <= MLX4_MAX_PORTS; i++) { params->prof[i].rx_pause = 1; params->prof[i].rx_ppp = pfcrx; @@ -118,14 +102,14 @@ static int mlx4_en_get_profile(struct mlx4_en_dev *mdev) params->prof[i].tx_ppp = pfctx; params->prof[i].tx_ring_size = MLX4_EN_DEF_TX_RING_SIZE; params->prof[i].rx_ring_size = MLX4_EN_DEF_RX_RING_SIZE; - params->prof[i].tx_ring_num = MLX4_EN_NUM_TX_RINGS + + params->prof[i].tx_ring_num = MLX4_EN_NUM_HASH_RINGS + 1 + (!!pfcrx) * MLX4_EN_NUM_PPP_RINGS; } return 0; } -static void *mlx4_en_get_netdev(struct mlx4_dev *dev, void *ctx, u8 port) +static void *get_netdev(struct mlx4_dev *dev, void *ctx, u8 port) { struct mlx4_en_dev *endev = ctx; @@ -133,10 +117,12 @@ static void *mlx4_en_get_netdev(struct mlx4_dev *dev, void *ctx, u8 port) } static void mlx4_en_event(struct mlx4_dev *dev, void *endev_ptr, - enum mlx4_dev_event event, int port) + enum mlx4_dev_event event, + unsigned long port) { struct mlx4_en_dev *mdev = (struct mlx4_en_dev *) endev_ptr; struct mlx4_en_priv *priv; + int i; if (!mdev->pndev[port]) return; @@ -151,6 +137,15 @@ static void mlx4_en_event(struct mlx4_dev *dev, void *endev_ptr, queue_work(mdev->workqueue, &priv->linkstate_task); break; + case MLX4_EVENT_TYPE_MAC_UPDATE: + priv->mac = dev->caps.def_mac[port]; + for (i = 0; i < ETH_ALEN; i++) { + priv->dev->dev_addr[ETH_ALEN - 1 - i] = (u8) (priv->mac >> (8 * i)); + priv->dev->perm_addr[ETH_ALEN - 1 - i] = (u8) (priv->mac >> (8 * i)); + } + queue_work(mdev->workqueue, &priv->mac_task); + break; + case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: mlx4_err(mdev, "Internal error detected, restarting device\n"); break; @@ -176,7 +171,6 @@ static void mlx4_en_remove(struct mlx4_dev *dev, void *endev_ptr) flush_workqueue(mdev->workqueue); destroy_workqueue(mdev->workqueue); mlx4_mr_free(dev, &mdev->mr); - iounmap(mdev->uar_map); mlx4_uar_free(dev, &mdev->priv_uar); mlx4_pd_free(dev, mdev->priv_pdn); kfree(mdev); @@ -184,11 +178,15 @@ static void mlx4_en_remove(struct mlx4_dev *dev, void *endev_ptr) static void *mlx4_en_add(struct mlx4_dev *dev) { + static int mlx4_en_version_printed; struct mlx4_en_dev *mdev; int i; int err; - printk_once(KERN_INFO "%s", mlx4_en_version); + if (!mlx4_en_version_printed) { + printk(KERN_INFO "%s", mlx4_en_version); + mlx4_en_version_printed++; + } mdev = kzalloc(sizeof *mdev, GFP_KERNEL); if (!mdev) { @@ -204,8 +202,7 @@ static void *mlx4_en_add(struct mlx4_dev *dev) if (mlx4_uar_alloc(dev, &mdev->priv_uar)) goto err_pd; - mdev->uar_map = ioremap((phys_addr_t) mdev->priv_uar.pfn << PAGE_SHIFT, - PAGE_SIZE); + mdev->uar_map = ioremap(mdev->priv_uar.pfn << 
PAGE_SHIFT, PAGE_SIZE); if (!mdev->uar_map) goto err_uar; spin_lock_init(&mdev->uar_lock); @@ -224,7 +221,7 @@ static void *mlx4_en_add(struct mlx4_dev *dev) MLX4_PERM_LOCAL_WRITE | MLX4_PERM_LOCAL_READ, 0, 0, &mdev->mr)) { mlx4_err(mdev, "Failed allocating memory region\n"); - goto err_map; + goto err_uar; } if (mlx4_mr_enable(mdev->dev, &mdev->mr)) { mlx4_err(mdev, "Failed enabling memory region\n"); @@ -238,24 +235,17 @@ static void *mlx4_en_add(struct mlx4_dev *dev) goto err_mr; } - /* Configure which ports to start according to module parameters */ + /* Configure wich ports to start according to module parameters */ mdev->port_cnt = 0; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) mdev->port_cnt++; - mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) { - if (!dev->caps.comp_pool) { - mdev->profile.prof[i].rx_ring_num = - rounddown_pow_of_two(max_t(int, MIN_RX_RINGS, - min_t(int, - dev->caps.num_comp_vectors, - MAX_RX_RINGS))); - } else { - mdev->profile.prof[i].rx_ring_num = rounddown_pow_of_two( - min_t(int, dev->caps.comp_pool/ - dev->caps.num_ports - 1 , MAX_MSIX_P_PORT - 1)); - } - } + /* Number of RX rings is between (MIN_RX_RINGS, MAX_RX_RINGS) + 1 + * and depends on number of completion vectors */ + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) + mdev->profile.prof[i].rx_ring_num = rounddown_pow_of_two( + max_t(int, MIN_RX_RINGS, + min_t(int, dev->caps.num_comp_vectors, MAX_RX_RINGS - 1))) + 1; /* Create our own workqueue for reset/multicast tasks * Note: we cannot use the shared workqueue because of deadlocks caused @@ -276,16 +266,30 @@ static void *mlx4_en_add(struct mlx4_dev *dev) /* Create a netdev for each port */ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) { mlx4_info(mdev, "Activating port:%d\n", i); - if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i])) + if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i])) { mdev->pndev[i] = NULL; + goto err_free_netdev; + } } return mdev; + +err_free_netdev: + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) { + if (mdev->pndev[i]) + mlx4_en_destroy_netdev(mdev->pndev[i]); + } + + mutex_lock(&mdev->state_lock); + mdev->device_up = false; + mutex_unlock(&mdev->state_lock); + flush_workqueue(mdev->workqueue); + + /* Stop event queue before we drop down to release shared SW state */ + destroy_workqueue(mdev->workqueue); + err_mr: mlx4_mr_free(dev, &mdev->mr); -err_map: - if (!mdev->uar_map) - iounmap(mdev->uar_map); err_uar: mlx4_uar_free(dev, &mdev->priv_uar); err_pd: @@ -296,12 +300,61 @@ err_free_res: return NULL; } +enum mlx4_query_reply mlx4_en_query(void *endev_ptr, void *int_dev) +{ + struct mlx4_en_dev *mdev = endev_ptr; + struct net_device *netdev = int_dev; + int p; + + for (p = 1; p <= MLX4_MAX_PORTS; ++p) + if (mdev->pndev[p] == netdev) + return p; + + return MLX4_QUERY_NOT_MINE; +} + +static struct pci_device_id mlx4_en_pci_table[] = { + { PCI_VDEVICE(MELLANOX, 0x6340) }, /* MT25408 "Hermon" SDR */ + { PCI_VDEVICE(MELLANOX, 0x634a) }, /* MT25408 "Hermon" DDR */ + { PCI_VDEVICE(MELLANOX, 0x6354) }, /* MT25408 "Hermon" QDR */ + { PCI_VDEVICE(MELLANOX, 0x6732) }, /* MT25408 "Hermon" DDR PCIe gen2 */ + { PCI_VDEVICE(MELLANOX, 0x673c) }, /* MT25408 "Hermon" QDR PCIe gen2 */ + { PCI_VDEVICE(MELLANOX, 0x6368) }, /* MT25408 "Hermon" EN 10GigE */ + { PCI_VDEVICE(MELLANOX, 0x6750) }, /* MT25408 "Hermon" EN 10GigE PCIe gen2 */ + { PCI_VDEVICE(MELLANOX, 0x6372) }, /* MT25458 ConnectX EN 10GBASE-T 10GigE */ + { PCI_VDEVICE(MELLANOX, 0x675a) }, /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */ + { PCI_VDEVICE(MELLANOX, 0x6764) }, /* 
MT26468 ConnectX EN 10GigE PCIe gen2 */ + { PCI_VDEVICE(MELLANOX, 0x6746) }, /* MT26438 ConnectX VPI PCIe 2.0 5GT/s - IB QDR / 10GigE Virt+ */ + { PCI_VDEVICE(MELLANOX, 0x676e) }, /* MT26478 ConnectX EN 40GigE PCIe 2.0 5GT/s */ + { PCI_VDEVICE(MELLANOX, 0x6778) }, /* MT26488 ConnectX VPI PCIe 2.0 5GT/s - IB DDR / 10GigE Virt+ */ + { PCI_VDEVICE(MELLANOX, 0x1000) }, + { PCI_VDEVICE(MELLANOX, 0x1001) }, + { PCI_VDEVICE(MELLANOX, 0x1002) }, + { PCI_VDEVICE(MELLANOX, 0x1003) }, + { PCI_VDEVICE(MELLANOX, 0x1004) }, + { PCI_VDEVICE(MELLANOX, 0x1005) }, + { PCI_VDEVICE(MELLANOX, 0x1006) }, + { PCI_VDEVICE(MELLANOX, 0x1007) }, + { PCI_VDEVICE(MELLANOX, 0x1008) }, + { PCI_VDEVICE(MELLANOX, 0x1009) }, + { PCI_VDEVICE(MELLANOX, 0x100a) }, + { PCI_VDEVICE(MELLANOX, 0x100b) }, + { PCI_VDEVICE(MELLANOX, 0x100c) }, + { PCI_VDEVICE(MELLANOX, 0x100d) }, + { PCI_VDEVICE(MELLANOX, 0x100e) }, + { PCI_VDEVICE(MELLANOX, 0x100f) }, + { 0, } +}; + +MODULE_DEVICE_TABLE(pci, mlx4_en_pci_table); + static struct mlx4_interface mlx4_en_interface = { - .add = mlx4_en_add, - .remove = mlx4_en_remove, - .event = mlx4_en_event, - .get_dev = mlx4_en_get_netdev, - .protocol = MLX4_PROT_ETH, + .add = mlx4_en_add, + .remove = mlx4_en_remove, + .event = mlx4_en_event, + .query = mlx4_en_query, + .get_prot_dev = get_netdev, + .protocol = MLX4_PROT_EN, }; static int __init mlx4_en_init(void) diff --git a/drivers/net/mlx4/en_netdev.c b/drivers/net/mlx4/en_netdev.c index aae1fb00fd218..0e9603bcd84b9 100644 --- a/drivers/net/mlx4/en_netdev.c +++ b/drivers/net/mlx4/en_netdev.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include @@ -45,28 +44,58 @@ #include "mlx4_en.h" #include "en_port.h" + +static void mlx4_en_vlan_rx_register(struct net_device *dev, struct vlan_group *grp) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int err; + + en_dbg(HW, priv, "Registering VLAN group:%p\n", grp); + priv->vlgrp = grp; + + mutex_lock(&mdev->state_lock); + if (mdev->device_up && priv->port_up && !(priv->flags & MLX4_EN_FLAG_PROMISC)) { + err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, grp); + if (err) + en_err(priv, "Failed configuring VLAN filter\n"); + } + mutex_unlock(&mdev->state_lock); +} + static void mlx4_en_vlan_rx_add_vid(struct net_device *dev, unsigned short vid) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; int err; int idx; +#ifndef HAVE_NETDEV_VLAN_FEATURES + struct net_device *vdev; +#endif - en_dbg(HW, priv, "adding VLAN:%d\n", vid); + if (!priv->vlgrp) + return; - set_bit(vid, priv->active_vlans); + en_dbg(HW, priv, "adding VLAN:%d (vlgrp entry:%p)\n", + vid, vlan_group_get_device(priv->vlgrp, vid)); /* Add VID to port VLAN filter */ mutex_lock(&mdev->state_lock); - if (mdev->device_up && priv->port_up) { - err = mlx4_SET_VLAN_FLTR(mdev->dev, priv); + if (mdev->device_up && priv->port_up && !(priv->flags & MLX4_EN_FLAG_PROMISC)) { + err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, priv->vlgrp); if (err) en_err(priv, "Failed configuring VLAN filter\n"); } if (mlx4_register_vlan(mdev->dev, priv->port, vid, &idx)) - en_err(priv, "failed adding vlan %d\n", vid); + en_dbg(HW, priv, "failed adding vlan %d\n", vid); mutex_unlock(&mdev->state_lock); +#ifndef HAVE_NETDEV_VLAN_FEATURES + vdev = vlan_group_get_device(priv->vlgrp, vid); + vdev->features |= dev->features; + vdev->features |= NETIF_F_LLTX; + vlan_group_set_device(priv->vlgrp, vid, vdev); +#endif } static void mlx4_en_vlan_rx_kill_vid(struct net_device *dev, unsigned 
short vid) @@ -76,19 +105,22 @@ static void mlx4_en_vlan_rx_kill_vid(struct net_device *dev, unsigned short vid) int err; int idx; - en_dbg(HW, priv, "Killing VID:%d\n", vid); + if (!priv->vlgrp) + return; - clear_bit(vid, priv->active_vlans); + en_dbg(HW, priv, "Killing VID:%d (vlgrp:%p vlgrp entry:%p)\n", + vid, priv->vlgrp, vlan_group_get_device(priv->vlgrp, vid)); + vlan_group_set_device(priv->vlgrp, vid, NULL); /* Remove VID from port VLAN filter */ mutex_lock(&mdev->state_lock); if (!mlx4_find_cached_vlan(mdev->dev, priv->port, vid, &idx)) mlx4_unregister_vlan(mdev->dev, priv->port, idx); else - en_err(priv, "could not find vid %d in cache\n", vid); + en_dbg(HW, priv, "could not find vid %d in cache\n", vid); - if (mdev->device_up && priv->port_up) { - err = mlx4_SET_VLAN_FLTR(mdev->dev, priv); + if (mdev->device_up && priv->port_up && !(priv->flags & MLX4_EN_FLAG_PROMISC)) { + err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, priv->vlgrp); if (err) en_err(priv, "Failed configuring VLAN filter\n"); } @@ -133,7 +165,7 @@ static void mlx4_en_do_set_mac(struct work_struct *work) if (priv->port_up) { /* Remove old MAC and insert the new one */ err = mlx4_replace_mac(mdev->dev, priv->port, - priv->base_qpn, priv->mac, 0); + priv->base_qpn, priv->mac); if (err) en_err(priv, "Failed changing HW MAC address\n"); } else @@ -164,6 +196,7 @@ static void mlx4_en_cache_mclist(struct net_device *dev) en_err(priv, "failed to allocate multicast list\n"); return; } + i = 0; netdev_for_each_mc_addr(ha, dev) memcpy(mc_addrs + i++ * ETH_ALEN, ha->addr, ETH_ALEN); @@ -204,6 +237,16 @@ static void mlx4_en_do_set_multicast(struct work_struct *work) goto out; } + if (!netif_carrier_ok(dev)) { + if (!mlx4_en_QUERY_PORT(mdev, priv->port)) { + if (priv->port_state.link_state) { + priv->last_link_state = MLX4_DEV_EVENT_PORT_UP; + netif_carrier_on(dev); + en_dbg(LINK, priv, "Link Up\n"); + } + } + } + /* * Promsicuous mode: disable all filters */ @@ -215,16 +258,15 @@ static void mlx4_en_do_set_multicast(struct work_struct *work) priv->flags |= MLX4_EN_FLAG_PROMISC; /* Enable promiscouos mode */ - if (!(mdev->dev->caps.flags & - MLX4_DEV_CAP_FLAG_VEP_UC_STEER)) + if (!mdev->dev->caps.vep_uc_steering) err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port, priv->base_qpn, 1); else err = mlx4_unicast_promisc_add(mdev->dev, priv->base_qpn, - priv->port); + priv->port - 1); if (err) en_err(priv, "Failed enabling " - "promiscuous mode\n"); + "promiscous mode\n"); /* Disable port multicast filter (unconditionally) */ err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, @@ -236,22 +278,24 @@ static void mlx4_en_do_set_multicast(struct work_struct *work) /* Add the default qp number as multicast promisc */ if (!(priv->flags & MLX4_EN_FLAG_MC_PROMISC)) { err = mlx4_multicast_promisc_add(mdev->dev, priv->base_qpn, - priv->port); + priv->port - 1); if (err) en_err(priv, "Failed entering multicast promisc mode\n"); priv->flags |= MLX4_EN_FLAG_MC_PROMISC; } - /* Disable port VLAN filter */ - err = mlx4_SET_VLAN_FLTR(mdev->dev, priv); - if (err) - en_err(priv, "Failed disabling VLAN filter\n"); + if (priv->vlgrp) { + /* Disable port VLAN filter */ + err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, NULL); + if (err) + en_err(priv, "Failed disabling VLAN filter\n"); + } } goto out; } /* - * Not in promiscuous mode + * Not in promiscous mode */ if (priv->flags & MLX4_EN_FLAG_PROMISC) { @@ -260,28 +304,30 @@ static void mlx4_en_do_set_multicast(struct work_struct *work) priv->flags &= ~MLX4_EN_FLAG_PROMISC; /* Disable promiscouos mode */ 
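The promiscuous-mode handling above, and the mirror-image block that follows, use a transition-guarded pattern: the MLX4_EN_FLAG_PROMISC bit in priv->flags tracks what the firmware was last told, so the SET_PORT/promisc commands are issued only when the state actually changes. A condensed, hypothetical sketch of that pattern (the branch bodies stand in for the firmware calls shown above; driver types assumed from mlx4_en.h):

    /* Sketch of the transition guard used by mlx4_en_do_set_multicast();
     * the branch bodies are placeholders for the firmware commands above. */
    static void sketch_set_promisc(struct mlx4_en_priv *priv, bool on)
    {
            bool cur = priv->flags & MLX4_EN_FLAG_PROMISC;

            if (on == cur)
                    return;         /* no state change: nothing to program */

            if (on) {
                    priv->flags |= MLX4_EN_FLAG_PROMISC;
                    /* enable unicast promisc, drop MC and VLAN filtering */
            } else {
                    priv->flags &= ~MLX4_EN_FLAG_PROMISC;
                    /* restore unicast steering and MC/VLAN filtering */
            }
    }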
- if (!(mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER)) + if (!mdev->dev->caps.vep_uc_steering) err = mlx4_SET_PORT_qpn_calc(mdev->dev, priv->port, priv->base_qpn, 0); else err = mlx4_unicast_promisc_remove(mdev->dev, priv->base_qpn, - priv->port); + priv->port - 1); if (err) - en_err(priv, "Failed disabling promiscuous mode\n"); + en_err(priv, "Failed disabling promiscous mode\n"); /* Disable Multicast promisc */ if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) { err = mlx4_multicast_promisc_remove(mdev->dev, priv->base_qpn, - priv->port); + priv->port - 1); if (err) - en_err(priv, "Failed disabling multicast promiscuous mode\n"); + en_err(priv, "Failed disabling multicast promiscous mode\n"); priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC; } /* Enable port VLAN filter */ - err = mlx4_SET_VLAN_FLTR(mdev->dev, priv); - if (err) - en_err(priv, "Failed enabling VLAN filter\n"); + if (priv->vlgrp) { + err = mlx4_SET_VLAN_FLTR(mdev->dev, priv->port, priv->vlgrp); + if (err) + en_err(priv, "Failed enabling VLAN filter\n"); + } } /* Enable/disable the multicast filter according to IFF_ALLMULTI */ @@ -294,7 +340,7 @@ static void mlx4_en_do_set_multicast(struct work_struct *work) /* Add the default qp number as multicast promisc */ if (!(priv->flags & MLX4_EN_FLAG_MC_PROMISC)) { err = mlx4_multicast_promisc_add(mdev->dev, priv->base_qpn, - priv->port); + priv->port - 1); if (err) en_err(priv, "Failed entering multicast promisc mode\n"); priv->flags |= MLX4_EN_FLAG_MC_PROMISC; @@ -304,9 +350,9 @@ static void mlx4_en_do_set_multicast(struct work_struct *work) /* Disable Multicast promisc */ if (priv->flags & MLX4_EN_FLAG_MC_PROMISC) { err = mlx4_multicast_promisc_remove(mdev->dev, priv->base_qpn, - priv->port); + priv->port - 1); if (err) - en_err(priv, "Failed disabling multicast promiscuous mode\n"); + en_err(priv, "Failed disabling multicast promiscous mode\n"); priv->flags &= ~MLX4_EN_FLAG_MC_PROMISC; } @@ -333,18 +379,20 @@ static void mlx4_en_do_set_multicast(struct work_struct *work) netif_tx_unlock_bh(dev); for (i = 0; i < priv->mc_addrs_cnt; i++) { mcast_addr = - mlx4_en_mac_to_u64(priv->mc_addrs + i * ETH_ALEN); + mlx4_en_mac_to_u64(priv->mc_addrs + i * ETH_ALEN); memcpy(&mc_list[10], priv->mc_addrs + i * ETH_ALEN, ETH_ALEN); mc_list[5] = priv->port; mlx4_multicast_attach(mdev->dev, &priv->rss_map.indir_qp, - mc_list, 0, MLX4_PROT_ETH); + mc_list, 0, MLX4_PROT_ETH); + mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, - mcast_addr, 0, MLX4_MCAST_CONFIG); + mcast_addr, 0, MLX4_MCAST_CONFIG); } err = mlx4_SET_MCAST_FLTR(mdev->dev, priv->port, 0, 0, MLX4_MCAST_ENABLE); if (err) en_err(priv, "Failed enabling multicast filter\n"); + } out: mutex_unlock(&mdev->state_lock); @@ -362,7 +410,10 @@ static void mlx4_en_netpoll(struct net_device *dev) cq = &priv->rx_cq[i]; spin_lock_irqsave(&cq->lock, flags); napi_synchronize(&cq->napi); - mlx4_en_process_rx_cq(dev, cq, 0); + if (priv->rx_ring[i].use_frags) + mlx4_en_process_rx_cq(dev, cq, 0); + else + mlx4_en_process_rx_cq_skb(dev, cq, 0); spin_unlock_irqrestore(&cq->lock, flags); } } @@ -399,12 +450,12 @@ static void mlx4_en_set_default_moderation(struct mlx4_en_priv *priv) int i; /* If we haven't received a specific coalescing setting - * (module param), we set the moderation parameters as follows: + * (module param), we set the moderation paramters as follows: * - moder_cnt is set to the number of mtu sized packets to * satisfy our coelsing target. * - moder_time is set to a fixed value. 
*/ - priv->rx_frames = MLX4_EN_RX_COAL_TARGET; + priv->rx_frames = MLX4_EN_RX_COAL_TARGET / priv->dev->mtu + 1; priv->rx_usecs = MLX4_EN_RX_COAL_TIME; en_dbg(INTR, priv, "Default coalesing params for mtu:%d - " "rx_frames:%d rx_usecs:%d\n", @@ -415,9 +466,6 @@ static void mlx4_en_set_default_moderation(struct mlx4_en_priv *priv) cq = &priv->rx_cq[i]; cq->moder_cnt = priv->rx_frames; cq->moder_time = priv->rx_usecs; - priv->last_moder_time[i] = MLX4_EN_AUTO_CONF; - priv->last_moder_packets[i] = 0; - priv->last_moder_bytes[i] = 0; } for (i = 0; i < priv->tx_ring_num; i++) { @@ -433,8 +481,11 @@ static void mlx4_en_set_default_moderation(struct mlx4_en_priv *priv) priv->rx_usecs_high = MLX4_EN_RX_COAL_TIME_HIGH; priv->sample_interval = MLX4_EN_SAMPLE_INTERVAL; priv->adaptive_rx_coal = 1; + priv->last_moder_time = MLX4_EN_AUTO_CONF; priv->last_moder_jiffies = 0; + priv->last_moder_packets = 0; priv->last_moder_tx_packets = 0; + priv->last_moder_bytes = 0; } static void mlx4_en_auto_moderation(struct mlx4_en_priv *priv) @@ -446,31 +497,45 @@ static void mlx4_en_auto_moderation(struct mlx4_en_priv *priv) unsigned long avg_pkt_size; unsigned long rx_packets; unsigned long rx_bytes; + unsigned long tx_packets; + unsigned long tx_pkt_diff; unsigned long rx_pkt_diff; int moder_time; - int ring, err; + int i, err; if (!priv->adaptive_rx_coal || period < priv->sample_interval * HZ) return; - for (ring = 0; ring < priv->rx_ring_num; ring++) { - spin_lock_bh(&priv->stats_lock); - rx_packets = priv->rx_ring[ring].packets; - rx_bytes = priv->rx_ring[ring].bytes; - spin_unlock_bh(&priv->stats_lock); - - rx_pkt_diff = ((unsigned long) (rx_packets - - priv->last_moder_packets[ring])); - packets = rx_pkt_diff; - rate = packets * HZ / period; - avg_pkt_size = packets ? ((unsigned long) (rx_bytes - - priv->last_moder_bytes[ring])) / packets : 0; - - /* Apply auto-moderation only when packet rate - * exceeds a rate that it matters */ - if (rate > (MLX4_EN_RX_RATE_THRESH / priv->rx_ring_num) && - avg_pkt_size > MLX4_EN_AVG_PKT_SMALL) { - if (rate < priv->pkt_rate_low) + spin_lock_bh(&priv->stats_lock); + rx_packets = priv->stats.rx_packets; + rx_bytes = priv->stats.rx_bytes; + tx_packets = priv->stats.tx_packets; + spin_unlock_bh(&priv->stats_lock); + + if (!priv->last_moder_jiffies || !period) + goto out; + + tx_pkt_diff = ((unsigned long) (tx_packets - + priv->last_moder_tx_packets)); + rx_pkt_diff = ((unsigned long) (rx_packets - + priv->last_moder_packets)); + packets = max(tx_pkt_diff, rx_pkt_diff); + rate = packets * HZ / period; + avg_pkt_size = packets ? ((unsigned long) (rx_bytes - + priv->last_moder_bytes)) / packets : 0; + + /* Apply auto-moderation only when packet rate exceeds a rate that + * it matters */ + if (rate > MLX4_EN_RX_RATE_THRESH) { + /* If tx and rx packet rates are not balanced, assume that + * traffic is mainly BW bound and apply maximum moderation. 
+ * Otherwise, moderate according to packet rate */ + if (2 * tx_pkt_diff > 3 * rx_pkt_diff || + 2 * rx_pkt_diff > 3 * tx_pkt_diff) { + moder_time = priv->rx_usecs_high; + } else { + if (rate < priv->pkt_rate_low || + avg_pkt_size < MLX4_EN_AVG_PKT_SMALL) moder_time = priv->rx_usecs_low; else if (rate > priv->pkt_rate_high) moder_time = priv->rx_usecs_high; @@ -479,37 +544,111 @@ static void mlx4_en_auto_moderation(struct mlx4_en_priv *priv) (priv->rx_usecs_high - priv->rx_usecs_low) / (priv->pkt_rate_high - priv->pkt_rate_low) + priv->rx_usecs_low; - } else { - moder_time = priv->rx_usecs_low; } + } else { + /* When packet rate is low, use default moderation rather than + * 0 to prevent interrupt storms if traffic suddenly increases */ + moder_time = priv->rx_usecs; + } + + en_dbg(INTR, priv, "tx rate:%lu rx_rate:%lu\n", + tx_pkt_diff * HZ / period, rx_pkt_diff * HZ / period); - if (moder_time != priv->last_moder_time[ring]) { - priv->last_moder_time[ring] = moder_time; - cq = &priv->rx_cq[ring]; + en_dbg(INTR, priv, "Rx moder_time changed from:%d to %d period:%lu " + "[jiff] packets:%lu avg_pkt_size:%lu rate:%lu [p/s])\n", + priv->last_moder_time, moder_time, period, packets, + avg_pkt_size, rate); + + if (moder_time != priv->last_moder_time) { + priv->last_moder_time = moder_time; + for (i = 0; i < priv->rx_ring_num; i++) { + cq = &priv->rx_cq[i]; cq->moder_time = moder_time; err = mlx4_en_set_cq_moder(priv, cq); - if (err) - en_err(priv, "Failed modifying moderation " - "for cq:%d\n", ring); + if (err) { + en_err(priv, "Failed modifying moderation for cq:%d\n", i); + break; + } } - priv->last_moder_packets[ring] = rx_packets; - priv->last_moder_bytes[ring] = rx_bytes; } +out: + priv->last_moder_packets = rx_packets; + priv->last_moder_tx_packets = tx_packets; + priv->last_moder_bytes = rx_bytes; priv->last_moder_jiffies = jiffies; } +static void set_ring_counters(struct mlx4_en_priv *priv) +{ + struct net_device_stats *stats = &priv->stats; + int i; + + spin_lock_bh(&priv->stats_lock); + + stats->rx_packets = 0; + stats->rx_bytes = 0; + for (i = 0; i < priv->rx_ring_num; i++) { + stats->rx_packets += priv->rx_ring[i].packets; + stats->rx_bytes += priv->rx_ring[i].bytes; + } + stats->tx_packets = 0; + stats->tx_bytes = 0; + for (i = 0; i <= priv->tx_ring_num; i++) { + stats->tx_packets += priv->tx_ring[i].packets; + stats->tx_bytes += priv->tx_ring[i].bytes; + } + spin_unlock_bh(&priv->stats_lock); +} + +static void mlx4_en_set_stats(struct mlx4_en_priv *priv, + struct mlx4_eth_common_counters *eth_counters) +{ + struct net_device_stats *stats = &priv->stats; + + spin_lock_bh(&priv->stats_lock); + + stats->rx_errors = eth_counters->rx_errors; + + stats->tx_errors = eth_counters->tx_errors; + stats->multicast = eth_counters->multicast; + stats->collisions = 0; + stats->rx_length_errors = eth_counters->rx_length_errors; + stats->rx_over_errors = eth_counters->rx_over_errors; + stats->rx_crc_errors = eth_counters->rx_crc_errors; + stats->rx_frame_errors = 0; + stats->rx_fifo_errors = eth_counters->rx_fifo_errors; + stats->rx_missed_errors = eth_counters->rx_missed_errors; + stats->tx_aborted_errors = 0; + stats->tx_carrier_errors = 0; + stats->tx_fifo_errors = 0; + stats->tx_heartbeat_errors = 0; + stats->tx_window_errors = 0; + + priv->pkstats.broadcast = eth_counters->broadcast; + + spin_unlock_bh(&priv->stats_lock); +} + static void mlx4_en_do_get_stats(struct work_struct *work) { struct delayed_work *delay = to_delayed_work(work); struct mlx4_en_priv *priv = container_of(delay, struct 
mlx4_en_priv, stats_task); struct mlx4_en_dev *mdev = priv->mdev; + struct mlx4_eth_common_counters eth_counters; int err; - err = mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 0); - if (err) - en_dbg(HW, priv, "Could not update stats\n"); + memset(ð_counters, 0, sizeof(eth_counters)); + + if (!(priv->stat_cnt++ & STATS_FREQ_MASK)) { + err = mlx4_DUMP_ETH_STATS(mdev->dev, priv->port, 0, ð_counters); + if (!err) + mlx4_en_set_stats(priv, ð_counters); + else + en_dbg(HW, priv, "Could not update stats \n"); + } + set_ring_counters(priv); mutex_lock(&mdev->state_lock); if (mdev->device_up) { @@ -548,7 +687,6 @@ static void mlx4_en_linkstate(struct work_struct *work) mutex_unlock(&mdev->state_lock); } - int mlx4_en_start_port(struct net_device *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); @@ -578,10 +716,11 @@ int mlx4_en_start_port(struct net_device *dev) en_err(priv, "Failed to activate RX rings\n"); return err; } + for (i = 0; i < priv->rx_ring_num; i++) { cq = &priv->rx_cq[i]; - err = mlx4_en_activate_cq(priv, cq, i); + err = mlx4_en_activate_cq(priv, cq); if (err) { en_err(priv, "Failed activating Rx CQ\n"); goto cq_err; @@ -619,7 +758,7 @@ int mlx4_en_start_port(struct net_device *dev) for (i = 0; i < priv->tx_ring_num; i++) { /* Configure cq */ cq = &priv->tx_cq[i]; - err = mlx4_en_activate_cq(priv, cq, i); + err = mlx4_en_activate_cq(priv, cq); if (err) { en_err(priv, "Failed allocating Tx CQ\n"); goto tx_err; @@ -676,14 +815,11 @@ int mlx4_en_start_port(struct net_device *dev) /* Attach rx QP to bradcast address */ memset(&mc_list[10], 0xff, ETH_ALEN); - mc_list[5] = priv->port; + mc_list[7] = (priv->port - 1) << 4; if (mlx4_multicast_attach(mdev->dev, &priv->rss_map.indir_qp, mc_list, 0, MLX4_PROT_ETH)) mlx4_warn(mdev, "Failed Attaching Broadcast\n"); - /* Must redo promiscuous mode setup. 
*/ - priv->flags &= ~(MLX4_EN_FLAG_PROMISC | MLX4_EN_FLAG_MC_PROMISC); - /* Schedule multicast task to populate multicast list */ queue_work(mdev->workqueue, &priv->mcast_task); @@ -732,14 +868,14 @@ void mlx4_en_stop_port(struct net_device *dev) /* Detach All multicasts */ memset(&mc_list[10], 0xff, ETH_ALEN); - mc_list[5] = priv->port; + mc_list[7] = (priv->port - 1) << 4; mlx4_multicast_detach(mdev->dev, &priv->rss_map.indir_qp, mc_list, MLX4_PROT_ETH); for (i = 0; i < priv->mc_addrs_cnt; i++) { memcpy(&mc_list[10], priv->mc_addrs + i * ETH_ALEN, ETH_ALEN); mc_list[5] = priv->port; mlx4_multicast_detach(mdev->dev, &priv->rss_map.indir_qp, - mc_list, MLX4_PROT_ETH); + mc_list, MLX4_PROT_ETH); } mlx4_en_clear_list(dev); /* Flush multicast filter */ @@ -809,7 +945,7 @@ static int mlx4_en_open(struct net_device *dev) } /* Reset HW statistics and performance counters */ - if (mlx4_en_DUMP_ETH_STATS(mdev, priv->port, 1)) + if (mlx4_DUMP_ETH_STATS(mdev->dev, priv->port, 1, NULL)) en_dbg(HW, priv, "Failed dumping statistics\n"); memset(&priv->stats, 0, sizeof(priv->stats)); @@ -824,6 +960,7 @@ static int mlx4_en_open(struct net_device *dev) priv->rx_ring[i].packets = 0; } + mlx4_en_set_default_moderation(priv); err = mlx4_en_start_port(dev); if (err) en_err(priv, "Failed starting port:%d\n", priv->port); @@ -873,13 +1010,6 @@ int mlx4_en_alloc_resources(struct mlx4_en_priv *priv) { struct mlx4_en_port_profile *prof = priv->prof; int i; - int base_tx_qpn, err; - - err = mlx4_qp_reserve_range(priv->mdev->dev, priv->tx_ring_num, 256, &base_tx_qpn); - if (err) { - en_err(priv, "failed reserving range for TX rings\n"); - return err; - } /* Create tx Rings */ for (i = 0; i < priv->tx_ring_num; i++) { @@ -887,7 +1017,7 @@ int mlx4_en_alloc_resources(struct mlx4_en_priv *priv) prof->tx_ring_size, i, TX)) goto err; - if (mlx4_en_create_tx_ring(priv, &priv->tx_ring[i], base_tx_qpn + i, + if (mlx4_en_create_tx_ring(priv, &priv->tx_ring[i], prof->tx_ring_size, TXBB_SIZE)) goto err; } @@ -898,8 +1028,12 @@ int mlx4_en_alloc_resources(struct mlx4_en_priv *priv) prof->rx_ring_size, i, RX)) goto err; + if (i > priv->rx_ring_num - priv->udp_rings - 1) + priv->rx_ring[i].use_frags = 0; + else + priv->rx_ring[i].use_frags = 1; if (mlx4_en_create_rx_ring(priv, &priv->rx_ring[i], - prof->rx_ring_size, priv->stride)) + prof->rx_ring_size)) goto err; } @@ -907,7 +1041,6 @@ int mlx4_en_alloc_resources(struct mlx4_en_priv *priv) err: en_err(priv, "Failed to allocate NIC resources\n"); - mlx4_qp_release_range(priv->mdev->dev, base_tx_qpn, priv->tx_ring_num); return -ENOMEM; } @@ -962,6 +1095,7 @@ static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu) en_dbg(DRV, priv, "Change MTU called with card down!?\n"); } else { mlx4_en_stop_port(dev); + mlx4_en_set_default_moderation(priv); err = mlx4_en_start_port(dev); if (err) { en_err(priv, "Failed restarting port:%d\n", @@ -974,21 +1108,6 @@ static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu) return 0; } -static int mlx4_en_set_features(struct net_device *netdev, - netdev_features_t features) -{ - struct mlx4_en_priv *priv = netdev_priv(netdev); - - if (features & NETIF_F_LOOPBACK) - priv->ctrl_flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); - else - priv->ctrl_flags &= - cpu_to_be32(~MLX4_WQE_CTRL_FORCE_LOOPBACK); - - return 0; - -} - static const struct net_device_ops mlx4_netdev_ops = { .ndo_open = mlx4_en_open, .ndo_stop = mlx4_en_close, @@ -1000,12 +1119,12 @@ static const struct net_device_ops mlx4_netdev_ops = { .ndo_validate_addr = 
eth_validate_addr, .ndo_change_mtu = mlx4_en_change_mtu, .ndo_tx_timeout = mlx4_en_tx_timeout, + .ndo_vlan_rx_register = mlx4_en_vlan_rx_register, .ndo_vlan_rx_add_vid = mlx4_en_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = mlx4_en_vlan_rx_kill_vid, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = mlx4_en_netpoll, #endif - .ndo_set_features = mlx4_en_set_features, }; int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, @@ -1016,8 +1135,7 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, int i; int err; - dev = alloc_etherdev_mqs(sizeof(struct mlx4_en_priv), - prof->tx_ring_num, prof->rx_ring_num); + dev = alloc_etherdev_mq(sizeof(struct mlx4_en_priv), prof->tx_ring_num); if (dev == NULL) { mlx4_err(mdev, "Net device allocation failed\n"); return -ENOMEM; @@ -1037,11 +1155,11 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, priv->prof = prof; priv->port = port; priv->port_up = false; + priv->rx_csum = 1; priv->flags = prof->flags; - priv->ctrl_flags = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE | - MLX4_WQE_CTRL_SOLICITED); priv->tx_ring_num = prof->tx_ring_num; priv->rx_ring_num = prof->rx_ring_num; + priv->udp_rings = mdev->profile.udp_rss ? prof->rx_ring_num / 2 : 1; priv->mac_index = -1; priv->msg_enable = MLX4_EN_MSG_LEVEL; spin_lock_init(&priv->stats_lock); @@ -1061,8 +1179,6 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, goto out; } - priv->stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) + - DS_SIZE * MLX4_EN_MAX_RX_FRAGS); err = mlx4_en_alloc_resources(priv); if (err) goto out; @@ -1076,13 +1192,15 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, } priv->allocated = 1; + /* Populate Tx priority mappings */ + mlx4_en_set_prio_map(priv, priv->tx_prio_map, + prof->tx_ring_num - MLX4_EN_NUM_HASH_RINGS); + /* * Initialize netdev entry points */ dev->netdev_ops = &mlx4_netdev_ops; dev->watchdog_timeo = MLX4_EN_WATCHDOG_TIMEOUT; - netif_set_real_num_tx_queues(dev, priv->tx_ring_num); - netif_set_real_num_rx_queues(dev, priv->rx_ring_num); SET_ETHTOOL_OPS(dev, &mlx4_en_ethtool_ops); @@ -1096,49 +1214,40 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, /* * Set driver features */ - dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; - if (mdev->LSO_support) - dev->hw_features |= NETIF_F_TSO | NETIF_F_TSO6; - - dev->vlan_features = dev->hw_features; - - dev->hw_features |= NETIF_F_RXCSUM | NETIF_F_RXHASH; - dev->features = dev->hw_features | NETIF_F_HIGHDMA | - NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX | - NETIF_F_HW_VLAN_FILTER; - dev->hw_features |= NETIF_F_LOOPBACK; + dev->features |= NETIF_F_SG; + dev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; +#ifdef HAVE_NETDEV_VLAN_FEATURES + dev->vlan_features |= NETIF_F_SG; + dev->vlan_features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; +#endif + dev->features |= NETIF_F_HIGHDMA; + dev->features |= NETIF_F_HW_VLAN_TX | + NETIF_F_HW_VLAN_RX | + NETIF_F_HW_VLAN_FILTER; + if (mdev->profile.num_lro) + dev->features |= NETIF_F_LRO; + if (mdev->LSO_support) { + dev->features |= NETIF_F_TSO; + dev->features |= NETIF_F_TSO6; +#ifdef HAVE_NETDEV_VLAN_FEATURES + dev->vlan_features |= NETIF_F_TSO; + dev->vlan_features |= NETIF_F_TSO6; +#endif + } mdev->pndev[port] = dev; netif_carrier_off(dev); err = register_netdev(dev); if (err) { - en_err(priv, "Netdev registration failed for port %d\n", port); + mlx4_err(mdev, "Netdev registration failed for port %d\n", port); goto out; } - priv->registered = 1; en_warn(priv, "Using %d TX rings\n", prof->tx_ring_num); 
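To make the RX-ring split above concrete: mlx4_en_alloc_resources() leaves the first rings on the paged-fragment path and marks the last priv->udp_rings rings as skb-based, where udp_rings is half the rings when UDP RSS is enabled and 1 otherwise. A hedged restatement of that selection, with a worked example in the comment:

    /* Equivalent form of the use_frags selection in mlx4_en_alloc_resources():
     * e.g. with rx_ring_num = 8 and UDP RSS enabled, udp_rings = 4, so
     * rings 0-3 use page fragments and rings 4-7 use the skb path. */
    priv->udp_rings = mdev->profile.udp_rss ? prof->rx_ring_num / 2 : 1;
    for (i = 0; i < priv->rx_ring_num; i++)
            priv->rx_ring[i].use_frags =
                    (i <= priv->rx_ring_num - priv->udp_rings - 1);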
en_warn(priv, "Using %d RX rings\n", prof->rx_ring_num); - /* Configure port */ - err = mlx4_SET_PORT_general(mdev->dev, priv->port, - MLX4_EN_MIN_MTU, - 0, 0, 0, 0); - if (err) { - en_err(priv, "Failed setting port general configurations " - "for port %d, with error %d\n", priv->port, err); - goto out; - } - - /* Init port */ - en_warn(priv, "Initializing port\n"); - err = mlx4_INIT_PORT(mdev->dev, priv->port); - if (err) { - en_err(priv, "Failed Initializing port\n"); - goto out; - } - mlx4_en_set_default_moderation(priv); + priv->registered = 1; queue_delayed_work(mdev->workqueue, &priv->stats_task, STATS_DELAY); return 0; diff --git a/drivers/net/mlx4/en_params.c b/drivers/net/mlx4/en_params.c new file mode 100644 index 0000000000000..c1bd040b9e05d --- /dev/null +++ b/drivers/net/mlx4/en_params.c @@ -0,0 +1,485 @@ +/* + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include + +#include "mlx4_en.h" +#include "en_port.h" + +#define MLX4_EN_PARM_INT(X, def_val, desc) \ + static unsigned int X = def_val;\ + module_param(X , uint, 0444); \ + MODULE_PARM_DESC(X, desc); + + +/* + * Device scope module parameters + */ + + +/* Use a XOR rathern than Toeplitz hash function for RSS */ +MLX4_EN_PARM_INT(rss_xor, 0, "Use XOR hash function for RSS"); + +/* RSS hash type mask - default to */ +MLX4_EN_PARM_INT(rss_mask, 0xf, "RSS hash type bitmask"); + +/* Number of LRO sessions per Rx ring (rounded up to a power of two) */ +MLX4_EN_PARM_INT(num_lro, MLX4_EN_MAX_LRO_DESCRIPTORS, + "Number of LRO sessions per ring or disabled (0)"); + +/* Priority pausing */ +MLX4_EN_PARM_INT(pfctx, 0, "Priority based Flow Control policy on TX[7:0]." + " Per priority bit mask"); +MLX4_EN_PARM_INT(pfcrx, 0, "Priority based Flow Control policy on RX[7:0]." 
+ " Per priority bit mask"); + +int mlx4_en_get_profile(struct mlx4_en_dev *mdev) +{ + struct mlx4_en_profile *params = &mdev->profile; + int i; + + params->rss_xor = (rss_xor != 0); + params->rss_mask = rss_mask & 0x1f; + params->num_lro = min_t(int, num_lro , MLX4_EN_MAX_LRO_DESCRIPTORS); + for (i = 1; i <= MLX4_MAX_PORTS; i++) { + params->prof[i].rx_pause = 1; + params->prof[i].rx_ppp = pfcrx; + params->prof[i].tx_pause = 1; + params->prof[i].tx_ppp = pfctx; + params->prof[i].tx_ring_size = MLX4_EN_DEF_TX_RING_SIZE; + params->prof[i].rx_ring_size = MLX4_EN_DEF_RX_RING_SIZE; + } + if (pfcrx || pfctx) { + params->prof[1].tx_ring_num = MLX4_EN_TX_RING_NUM; + params->prof[2].tx_ring_num = MLX4_EN_TX_RING_NUM; + } else { + params->prof[1].tx_ring_num = 1; + params->prof[2].tx_ring_num = 1; + } + + return 0; +} + + +/* + * Ethtool support + */ + +static void mlx4_en_update_lro_stats(struct mlx4_en_priv *priv) +{ + int i; + + priv->port_stats.lro_aggregated = 0; + priv->port_stats.lro_flushed = 0; + priv->port_stats.lro_no_desc = 0; + + for (i = 0; i < priv->rx_ring_num; i++) { + priv->port_stats.lro_aggregated += priv->rx_ring[i].lro.stats.aggregated; + priv->port_stats.lro_flushed += priv->rx_ring[i].lro.stats.flushed; + priv->port_stats.lro_no_desc += priv->rx_ring[i].lro.stats.no_desc; + } +} + +static void +mlx4_en_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + + sprintf(drvinfo->driver, DRV_NAME " (%s)", mdev->dev->board_id); + strncpy(drvinfo->version, DRV_VERSION " (" DRV_RELDATE ")", 32); + sprintf(drvinfo->fw_version, "%d.%d.%d", + (u16) (mdev->dev->caps.fw_ver >> 32), + (u16) ((mdev->dev->caps.fw_ver >> 16) & 0xffff), + (u16) (mdev->dev->caps.fw_ver & 0xffff)); + strncpy(drvinfo->bus_info, pci_name(mdev->dev->pdev), 32); + drvinfo->n_stats = 0; + drvinfo->regdump_len = 0; + drvinfo->eedump_len = 0; +} + +static u32 mlx4_en_get_tso(struct net_device *dev) +{ + return (dev->features & NETIF_F_TSO) != 0; +} + +static int mlx4_en_set_tso(struct net_device *dev, u32 data) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + if (data) { + if (!priv->mdev->LSO_support) + return -EPERM; + dev->features |= (NETIF_F_TSO | NETIF_F_TSO6); + } else + dev->features &= ~(NETIF_F_TSO | NETIF_F_TSO6); + return 0; +} + +static u32 mlx4_en_get_rx_csum(struct net_device *dev) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + return priv->rx_csum; +} + +static int mlx4_en_set_rx_csum(struct net_device *dev, u32 data) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + priv->rx_csum = (data != 0); + return 0; +} + +static const char main_strings[][ETH_GSTRING_LEN] = { + "rx_packets", "tx_packets", "rx_bytes", "tx_bytes", "rx_errors", + "tx_errors", "rx_dropped", "tx_dropped", "multicast", "collisions", + "rx_length_errors", "rx_over_errors", "rx_crc_errors", + "rx_frame_errors", "rx_fifo_errors", "rx_missed_errors", + "tx_aborted_errors", "tx_carrier_errors", "tx_fifo_errors", + "tx_heartbeat_errors", "tx_window_errors", + + /* port statistics */ + "lro_aggregated", "lro_flushed", "lro_no_desc", "tso_packets", + "queue_stopped", "wake_queue", "tx_timeout", "rx_alloc_failed", + "rx_csum_good", "rx_csum_none", "tx_chksum_offload", + + /* packet statistics */ + "broadcast", "rx_prio_0", "rx_prio_1", "rx_prio_2", "rx_prio_3", + "rx_prio_4", "rx_prio_5", "rx_prio_6", "rx_prio_7", "tx_prio_0", + "tx_prio_1", "tx_prio_2", "tx_prio_3", "tx_prio_4", "tx_prio_5", + "tx_prio_6", "tx_prio_7", +}; 
+#define NUM_MAIN_STATS 21 +#define NUM_ALL_STATS (NUM_MAIN_STATS + NUM_PORT_STATS + NUM_PKT_STATS + NUM_PERF_STATS) + +static u32 mlx4_en_get_msglevel(struct net_device *dev) +{ + return ((struct mlx4_en_priv *) netdev_priv(dev))->msg_enable; +} + +static void mlx4_en_set_msglevel(struct net_device *dev, u32 val) +{ + ((struct mlx4_en_priv *) netdev_priv(dev))->msg_enable = val; +} + +static void mlx4_en_get_wol(struct net_device *netdev, + struct ethtool_wolinfo *wol) +{ + wol->supported = 0; + wol->wolopts = 0; + + return; +} + +static int mlx4_en_get_sset_count(struct net_device *dev, int sset) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + if (sset != ETH_SS_STATS) + return -EOPNOTSUPP; + + return NUM_ALL_STATS + (priv->tx_ring_num + priv->rx_ring_num) * 2; +} + +static void mlx4_en_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, uint64_t *data) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int index = 0; + int i; + + spin_lock_bh(&priv->stats_lock); + + mlx4_en_update_lro_stats(priv); + + for (i = 0; i < NUM_MAIN_STATS; i++) + data[index++] = ((unsigned long *) &priv->stats)[i]; + for (i = 0; i < NUM_PORT_STATS; i++) + data[index++] = ((unsigned long *) &priv->port_stats)[i]; + for (i = 0; i < priv->tx_ring_num; i++) { + data[index++] = priv->tx_ring[i].packets; + data[index++] = priv->tx_ring[i].bytes; + } + for (i = 0; i < priv->rx_ring_num; i++) { + data[index++] = priv->rx_ring[i].packets; + data[index++] = priv->rx_ring[i].bytes; + } + for (i = 0; i < NUM_PKT_STATS; i++) + data[index++] = ((unsigned long *) &priv->pkstats)[i]; + spin_unlock_bh(&priv->stats_lock); + +} + +static void mlx4_en_get_strings(struct net_device *dev, + uint32_t stringset, uint8_t *data) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int index = 0; + int i; + + if (stringset != ETH_SS_STATS) + return; + + /* Add main counters */ + for (i = 0; i < NUM_MAIN_STATS; i++) + strcpy(data + (index++) * ETH_GSTRING_LEN, main_strings[i]); + for (i = 0; i < NUM_PORT_STATS; i++) + strcpy(data + (index++) * ETH_GSTRING_LEN, + main_strings[i + NUM_MAIN_STATS]); + for (i = 0; i < priv->tx_ring_num; i++) { + sprintf(data + (index++) * ETH_GSTRING_LEN, + "tx%d_packets", i); + sprintf(data + (index++) * ETH_GSTRING_LEN, + "tx%d_bytes", i); + } + for (i = 0; i < priv->rx_ring_num; i++) { + sprintf(data + (index++) * ETH_GSTRING_LEN, + "rx%d_packets", i); + sprintf(data + (index++) * ETH_GSTRING_LEN, + "rx%d_bytes", i); + } + for (i = 0; i < NUM_PKT_STATS; i++) + strcpy(data + (index++) * ETH_GSTRING_LEN, + main_strings[i + NUM_MAIN_STATS + NUM_PORT_STATS]); +} + +static int mlx4_en_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) +{ + cmd->autoneg = AUTONEG_DISABLE; + cmd->supported = SUPPORTED_10000baseT_Full; + cmd->advertising = SUPPORTED_10000baseT_Full; + if (netif_carrier_ok(dev)) { + cmd->speed = SPEED_10000; + cmd->duplex = DUPLEX_FULL; + } else { + cmd->speed = -1; + cmd->duplex = -1; + } + return 0; +} + +static int mlx4_en_set_settings(struct net_device *dev, struct ethtool_cmd *cmd) +{ + if ((cmd->autoneg == AUTONEG_ENABLE) || + (cmd->speed != SPEED_10000) || (cmd->duplex != DUPLEX_FULL)) + return -EINVAL; + + /* Nothing to change */ + return 0; +} + +static int mlx4_en_get_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + coal->tx_coalesce_usecs = 0; + coal->tx_max_coalesced_frames = 0; + coal->rx_coalesce_usecs = priv->rx_usecs; + coal->rx_max_coalesced_frames = priv->rx_frames; + 
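    /*
     * The adaptive parameters reported here feed the interpolation in
     * mlx4_en_auto_moderation() above:
     *
     *   moder_time = (rate - pkt_rate_low) *
     *                (rx_usecs_high - rx_usecs_low) /
     *                (pkt_rate_high - pkt_rate_low) + rx_usecs_low
     *
     * Hypothetical example (numbers for illustration only): with
     * pkt_rate_low = 100000 pps, pkt_rate_high = 400000 pps,
     * rx_usecs_low = 16 and rx_usecs_high = 128, a measured rate of
     * 250000 pps gives 150000 * 112 / 300000 + 16 = 72 usec.
     */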
+ coal->pkt_rate_low = priv->pkt_rate_low; + coal->rx_coalesce_usecs_low = priv->rx_usecs_low; + coal->pkt_rate_high = priv->pkt_rate_high; + coal->rx_coalesce_usecs_high = priv->rx_usecs_high; + coal->rate_sample_interval = priv->sample_interval; + coal->use_adaptive_rx_coalesce = priv->adaptive_rx_coal; + return 0; +} + +static int mlx4_en_set_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + int err, i; + + priv->rx_frames = (coal->rx_max_coalesced_frames == + MLX4_EN_AUTO_CONF) ? + MLX4_EN_RX_COAL_TARGET / + priv->dev->mtu + 1 : + coal->rx_max_coalesced_frames; + priv->rx_usecs = (coal->rx_coalesce_usecs == + MLX4_EN_AUTO_CONF) ? + MLX4_EN_RX_COAL_TIME : + coal->rx_coalesce_usecs; + + /* Set adaptive coalescing params */ + priv->pkt_rate_low = coal->pkt_rate_low; + priv->rx_usecs_low = coal->rx_coalesce_usecs_low; + priv->pkt_rate_high = coal->pkt_rate_high; + priv->rx_usecs_high = coal->rx_coalesce_usecs_high; + priv->sample_interval = coal->rate_sample_interval; + priv->adaptive_rx_coal = coal->use_adaptive_rx_coalesce; + priv->last_moder_time = MLX4_EN_AUTO_CONF; + if (priv->adaptive_rx_coal) + return 0; + + for (i = 0; i < priv->rx_ring_num; i++) { + priv->rx_cq[i].moder_cnt = priv->rx_frames; + priv->rx_cq[i].moder_time = priv->rx_usecs; + err = mlx4_en_set_cq_moder(priv, &priv->rx_cq[i]); + if (err) + return err; + } + return 0; +} + +static int mlx4_en_set_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *pause) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + int err; + + priv->prof->tx_pause = pause->tx_pause != 0; + priv->prof->rx_pause = pause->rx_pause != 0; + err = mlx4_SET_PORT_general(mdev->dev, priv->port, + priv->rx_skb_size + ETH_FCS_LEN, + priv->prof->tx_pause, + priv->prof->tx_ppp, + priv->prof->rx_pause, + priv->prof->rx_ppp); + if (err) + mlx4_err(mdev, "Failed setting pause params to\n"); + + return err; +} + +static void mlx4_en_get_pauseparam(struct net_device *dev, + struct ethtool_pauseparam *pause) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + + pause->tx_pause = priv->prof->tx_pause; + pause->rx_pause = priv->prof->rx_pause; +} + +static int mlx4_en_set_ringparam(struct net_device *dev, + struct ethtool_ringparam *param) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + u32 rx_size, tx_size; + int port_up = 0; + int err = 0; + + if (param->rx_jumbo_pending || param->rx_mini_pending) + return -EINVAL; + + rx_size = roundup_pow_of_two(param->rx_pending); + rx_size = max_t(u32, rx_size, MLX4_EN_MIN_RX_SIZE); + rx_size = min_t(u32, rx_size, MLX4_EN_MAX_RX_SIZE); + tx_size = roundup_pow_of_two(param->tx_pending); + tx_size = max_t(u32, tx_size, MLX4_EN_MIN_TX_SIZE); + tx_size = min_t(u32, tx_size, MLX4_EN_MAX_TX_SIZE); + + if (rx_size == priv->prof->rx_ring_size && + tx_size == priv->prof->tx_ring_size) + return 0; + + mutex_lock(&mdev->state_lock); + if (priv->port_up) { + port_up = 1; + mlx4_en_stop_port(dev); + } + + mlx4_en_free_resources(priv); + + priv->prof->tx_ring_size = tx_size; + priv->prof->rx_ring_size = rx_size; + + err = mlx4_en_alloc_resources(priv); + if (err) { + mlx4_err(mdev, "Failed reallocating port resources\n"); + goto out; + } + if (port_up) { + err = mlx4_en_start_port(dev); + if (err) + mlx4_err(mdev, "Failed starting port\n"); + } + +out: + mutex_unlock(&mdev->state_lock); + return err; +} + +static void mlx4_en_get_ringparam(struct net_device *dev, + struct 
ethtool_ringparam *param) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv->mdev; + + memset(param, 0, sizeof(*param)); + param->rx_max_pending = MLX4_EN_MAX_RX_SIZE; + param->tx_max_pending = MLX4_EN_MAX_TX_SIZE; + param->rx_pending = mdev->profile.prof[priv->port].rx_ring_size; + param->tx_pending = mdev->profile.prof[priv->port].tx_ring_size; +} + +const struct ethtool_ops mlx4_en_ethtool_ops = { + .get_drvinfo = mlx4_en_get_drvinfo, + .get_settings = mlx4_en_get_settings, + .set_settings = mlx4_en_set_settings, +#ifdef NETIF_F_TSO + .get_tso = mlx4_en_get_tso, + .set_tso = mlx4_en_set_tso, +#endif + .get_sg = ethtool_op_get_sg, + .set_sg = ethtool_op_set_sg, + .get_link = ethtool_op_get_link, + .get_rx_csum = mlx4_en_get_rx_csum, + .set_rx_csum = mlx4_en_set_rx_csum, + .get_tx_csum = ethtool_op_get_tx_csum, + .set_tx_csum = ethtool_op_set_tx_ipv6_csum, + .get_strings = mlx4_en_get_strings, + .get_sset_count = mlx4_en_get_sset_count, + .get_ethtool_stats = mlx4_en_get_ethtool_stats, + .get_wol = mlx4_en_get_wol, + .get_msglevel = mlx4_en_get_msglevel, + .set_msglevel = mlx4_en_set_msglevel, + .get_coalesce = mlx4_en_get_coalesce, + .set_coalesce = mlx4_en_set_coalesce, + .get_pauseparam = mlx4_en_get_pauseparam, + .set_pauseparam = mlx4_en_set_pauseparam, + .get_ringparam = mlx4_en_get_ringparam, + .set_ringparam = mlx4_en_set_ringparam, + .get_flags = ethtool_op_get_flags, + .set_flags = ethtool_op_set_flags, +}; + + + + + diff --git a/drivers/net/mlx4/en_port.c b/drivers/net/mlx4/en_port.c index 03c84cd78cdee..e323cc50f173e 100644 --- a/drivers/net/mlx4/en_port.c +++ b/drivers/net/mlx4/en_port.c @@ -41,111 +41,6 @@ #include "mlx4_en.h" -int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, - u64 mac, u64 clear, u8 mode) -{ - return mlx4_cmd(dev, (mac | (clear << 63)), port, mode, - MLX4_CMD_SET_MCAST_FLTR, MLX4_CMD_TIME_CLASS_B); -} - -int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, struct mlx4_en_priv *priv) -{ - struct mlx4_cmd_mailbox *mailbox; - struct mlx4_set_vlan_fltr_mbox *filter; - int i; - int j; - int index = 0; - u32 entry; - int err = 0; - - mailbox = mlx4_alloc_cmd_mailbox(dev); - if (IS_ERR(mailbox)) - return PTR_ERR(mailbox); - - filter = mailbox->buf; - memset(filter, 0, sizeof(*filter)); - for (i = VLAN_FLTR_SIZE - 1; i >= 0; i--) { - entry = 0; - for (j = 0; j < 32; j++) - if (test_bit(index++, priv->active_vlans)) - entry |= 1 << j; - filter->entry[i] = cpu_to_be32(entry); - } - err = mlx4_cmd(dev, mailbox->dma, priv->port, 0, MLX4_CMD_SET_VLAN_FLTR, - MLX4_CMD_TIME_CLASS_B); - mlx4_free_cmd_mailbox(dev, mailbox); - return err; -} - - -int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, - u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx) -{ - struct mlx4_cmd_mailbox *mailbox; - struct mlx4_set_port_general_context *context; - int err; - u32 in_mod; - - mailbox = mlx4_alloc_cmd_mailbox(dev); - if (IS_ERR(mailbox)) - return PTR_ERR(mailbox); - context = mailbox->buf; - memset(context, 0, sizeof *context); - - context->flags = SET_PORT_GEN_ALL_VALID; - context->mtu = cpu_to_be16(mtu); - context->pptx = (pptx * (!pfctx)) << 7; - context->pfctx = pfctx; - context->pprx = (pprx * (!pfcrx)) << 7; - context->pfcrx = pfcrx; - - in_mod = MLX4_SET_PORT_GENERAL << 8 | port; - err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, - MLX4_CMD_TIME_CLASS_B); - - mlx4_free_cmd_mailbox(dev, mailbox); - return err; -} - -int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, - u8 promisc) -{ - struct mlx4_cmd_mailbox 
*mailbox; - struct mlx4_set_port_rqp_calc_context *context; - int err; - u32 in_mod; - u32 m_promisc = (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) ? - MCAST_DIRECT : MCAST_DEFAULT; - - if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER && - dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER) - return 0; - - mailbox = mlx4_alloc_cmd_mailbox(dev); - if (IS_ERR(mailbox)) - return PTR_ERR(mailbox); - context = mailbox->buf; - memset(context, 0, sizeof *context); - - context->base_qpn = cpu_to_be32(base_qpn); - context->n_mac = dev->caps.log_num_macs; - context->promisc = cpu_to_be32(promisc << SET_PORT_PROMISC_SHIFT | - base_qpn); - context->mcast = cpu_to_be32(m_promisc << SET_PORT_MC_PROMISC_SHIFT | - base_qpn); - context->intra_no_vlan = 0; - context->no_vlan = MLX4_NO_VLAN_IDX; - context->intra_vlan_miss = 0; - context->vlan_miss = MLX4_VLAN_MISS_IDX; - - in_mod = MLX4_SET_PORT_RQP_CALC << 8 | port; - err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, - MLX4_CMD_TIME_CLASS_B); - - mlx4_free_cmd_mailbox(dev, mailbox); - return err; -} - int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port) { struct mlx4_en_query_port_context *qport_context; @@ -159,7 +54,7 @@ int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port) return PTR_ERR(mailbox); memset(mailbox->buf, 0, sizeof(*qport_context)); err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0, - MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B); + MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, 0); if (err) goto out; qport_context = mailbox->buf; @@ -167,21 +62,11 @@ int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port) /* This command is always accessed from Ethtool context * already synchronized, no need in locking */ state->link_state = !!(qport_context->link_up & MLX4_EN_LINK_UP_MASK); - switch (qport_context->link_speed & MLX4_EN_SPEED_MASK) { - case MLX4_EN_1G_SPEED: + if ((qport_context->link_speed & MLX4_EN_SPEED_MASK) == + MLX4_EN_1G_SPEED) state->link_speed = 1000; - break; - case MLX4_EN_10G_SPEED_XAUI: - case MLX4_EN_10G_SPEED_XFI: + else state->link_speed = 10000; - break; - case MLX4_EN_40G_SPEED: - state->link_speed = 40000; - break; - default: - state->link_speed = -1; - break; - } state->transciver = qport_context->transceiver; out: @@ -189,106 +74,3 @@ out: return err; } -int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset) -{ - struct mlx4_en_stat_out_mbox *mlx4_en_stats; - struct mlx4_en_priv *priv = netdev_priv(mdev->pndev[port]); - struct net_device_stats *stats = &priv->stats; - struct mlx4_cmd_mailbox *mailbox; - u64 in_mod = reset << 8 | port; - int err; - int i; - - mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); - if (IS_ERR(mailbox)) - return PTR_ERR(mailbox); - memset(mailbox->buf, 0, sizeof(*mlx4_en_stats)); - err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, in_mod, 0, - MLX4_CMD_DUMP_ETH_STATS, MLX4_CMD_TIME_CLASS_B); - if (err) - goto out; - - mlx4_en_stats = mailbox->buf; - - spin_lock_bh(&priv->stats_lock); - - stats->rx_packets = 0; - stats->rx_bytes = 0; - priv->port_stats.rx_chksum_good = 0; - priv->port_stats.rx_chksum_none = 0; - for (i = 0; i < priv->rx_ring_num; i++) { - stats->rx_packets += priv->rx_ring[i].packets; - stats->rx_bytes += priv->rx_ring[i].bytes; - priv->port_stats.rx_chksum_good += priv->rx_ring[i].csum_ok; - priv->port_stats.rx_chksum_none += priv->rx_ring[i].csum_none; - } - stats->tx_packets = 0; - stats->tx_bytes = 0; - priv->port_stats.tx_chksum_offload = 0; - for (i = 0; i < priv->tx_ring_num; i++) { - stats->tx_packets += 
priv->tx_ring[i].packets; - stats->tx_bytes += priv->tx_ring[i].bytes; - priv->port_stats.tx_chksum_offload += priv->tx_ring[i].tx_csum; - } - - stats->rx_errors = be64_to_cpu(mlx4_en_stats->PCS) + - be32_to_cpu(mlx4_en_stats->RdropLength) + - be32_to_cpu(mlx4_en_stats->RJBBR) + - be32_to_cpu(mlx4_en_stats->RCRC) + - be32_to_cpu(mlx4_en_stats->RRUNT); - stats->tx_errors = be32_to_cpu(mlx4_en_stats->TDROP); - stats->multicast = be64_to_cpu(mlx4_en_stats->MCAST_prio_0) + - be64_to_cpu(mlx4_en_stats->MCAST_prio_1) + - be64_to_cpu(mlx4_en_stats->MCAST_prio_2) + - be64_to_cpu(mlx4_en_stats->MCAST_prio_3) + - be64_to_cpu(mlx4_en_stats->MCAST_prio_4) + - be64_to_cpu(mlx4_en_stats->MCAST_prio_5) + - be64_to_cpu(mlx4_en_stats->MCAST_prio_6) + - be64_to_cpu(mlx4_en_stats->MCAST_prio_7) + - be64_to_cpu(mlx4_en_stats->MCAST_novlan); - stats->collisions = 0; - stats->rx_length_errors = be32_to_cpu(mlx4_en_stats->RdropLength); - stats->rx_over_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw); - stats->rx_crc_errors = be32_to_cpu(mlx4_en_stats->RCRC); - stats->rx_frame_errors = 0; - stats->rx_fifo_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw); - stats->rx_missed_errors = be32_to_cpu(mlx4_en_stats->RdropOvflw); - stats->tx_aborted_errors = 0; - stats->tx_carrier_errors = 0; - stats->tx_fifo_errors = 0; - stats->tx_heartbeat_errors = 0; - stats->tx_window_errors = 0; - - priv->pkstats.broadcast = - be64_to_cpu(mlx4_en_stats->RBCAST_prio_0) + - be64_to_cpu(mlx4_en_stats->RBCAST_prio_1) + - be64_to_cpu(mlx4_en_stats->RBCAST_prio_2) + - be64_to_cpu(mlx4_en_stats->RBCAST_prio_3) + - be64_to_cpu(mlx4_en_stats->RBCAST_prio_4) + - be64_to_cpu(mlx4_en_stats->RBCAST_prio_5) + - be64_to_cpu(mlx4_en_stats->RBCAST_prio_6) + - be64_to_cpu(mlx4_en_stats->RBCAST_prio_7) + - be64_to_cpu(mlx4_en_stats->RBCAST_novlan); - priv->pkstats.rx_prio[0] = be64_to_cpu(mlx4_en_stats->RTOT_prio_0); - priv->pkstats.rx_prio[1] = be64_to_cpu(mlx4_en_stats->RTOT_prio_1); - priv->pkstats.rx_prio[2] = be64_to_cpu(mlx4_en_stats->RTOT_prio_2); - priv->pkstats.rx_prio[3] = be64_to_cpu(mlx4_en_stats->RTOT_prio_3); - priv->pkstats.rx_prio[4] = be64_to_cpu(mlx4_en_stats->RTOT_prio_4); - priv->pkstats.rx_prio[5] = be64_to_cpu(mlx4_en_stats->RTOT_prio_5); - priv->pkstats.rx_prio[6] = be64_to_cpu(mlx4_en_stats->RTOT_prio_6); - priv->pkstats.rx_prio[7] = be64_to_cpu(mlx4_en_stats->RTOT_prio_7); - priv->pkstats.tx_prio[0] = be64_to_cpu(mlx4_en_stats->TTOT_prio_0); - priv->pkstats.tx_prio[1] = be64_to_cpu(mlx4_en_stats->TTOT_prio_1); - priv->pkstats.tx_prio[2] = be64_to_cpu(mlx4_en_stats->TTOT_prio_2); - priv->pkstats.tx_prio[3] = be64_to_cpu(mlx4_en_stats->TTOT_prio_3); - priv->pkstats.tx_prio[4] = be64_to_cpu(mlx4_en_stats->TTOT_prio_4); - priv->pkstats.tx_prio[5] = be64_to_cpu(mlx4_en_stats->TTOT_prio_5); - priv->pkstats.tx_prio[6] = be64_to_cpu(mlx4_en_stats->TTOT_prio_6); - priv->pkstats.tx_prio[7] = be64_to_cpu(mlx4_en_stats->TTOT_prio_7); - spin_unlock_bh(&priv->stats_lock); - -out: - mlx4_free_cmd_mailbox(mdev->dev, mailbox); - return err; -} - diff --git a/drivers/net/mlx4/en_port.h b/drivers/net/mlx4/en_port.h index 19eb244f51653..75e6b0d28785c 100644 --- a/drivers/net/mlx4/en_port.h +++ b/drivers/net/mlx4/en_port.h @@ -35,73 +35,12 @@ #define _MLX4_EN_PORT_H_ -#define SET_PORT_GEN_ALL_VALID 0x7 -#define SET_PORT_PROMISC_SHIFT 31 -#define SET_PORT_MC_PROMISC_SHIFT 30 - -enum { - MLX4_CMD_SET_VLAN_FLTR = 0x47, - MLX4_CMD_SET_MCAST_FLTR = 0x48, - MLX4_CMD_DUMP_ETH_STATS = 0x49, -}; - -enum { - MCAST_DIRECT_ONLY = 0, - MCAST_DIRECT = 1, - 
MCAST_DEFAULT = 2 -}; - -struct mlx4_set_port_general_context { - u8 reserved[3]; - u8 flags; - u16 reserved2; - __be16 mtu; - u8 pptx; - u8 pfctx; - u16 reserved3; - u8 pprx; - u8 pfcrx; - u16 reserved4; -}; - -struct mlx4_set_port_rqp_calc_context { - __be32 base_qpn; - u8 rererved; - u8 n_mac; - u8 n_vlan; - u8 n_prio; - u8 reserved2[3]; - u8 mac_miss; - u8 intra_no_vlan; - u8 no_vlan; - u8 intra_vlan_miss; - u8 vlan_miss; - u8 reserved3[3]; - u8 no_vlan_prio; - __be32 promisc; - __be32 mcast; -}; - -#define VLAN_FLTR_SIZE 128 -struct mlx4_set_vlan_fltr_mbox { - __be32 entry[VLAN_FLTR_SIZE]; -}; - - enum { MLX4_MCAST_CONFIG = 0, MLX4_MCAST_DISABLE = 1, MLX4_MCAST_ENABLE = 2, }; -enum { - MLX4_EN_1G_SPEED = 0x02, - MLX4_EN_10G_SPEED_XFI = 0x01, - MLX4_EN_10G_SPEED_XAUI = 0x00, - MLX4_EN_40G_SPEED = 0x40, - MLX4_EN_OTHER_SPEED = 0x0f, -}; - struct mlx4_en_query_port_context { u8 link_up; #define MLX4_EN_LINK_UP_MASK 0x80 @@ -109,493 +48,11 @@ struct mlx4_en_query_port_context { __be16 mtu; u8 reserved2; u8 link_speed; -#define MLX4_EN_SPEED_MASK 0x43 +#define MLX4_EN_SPEED_MASK 0x3 +#define MLX4_EN_1G_SPEED 0x2 u16 reserved3[5]; __be64 mac; u8 transceiver; }; - -struct mlx4_en_stat_out_mbox { - /* Received frames with a length of 64 octets */ - __be64 R64_prio_0; - __be64 R64_prio_1; - __be64 R64_prio_2; - __be64 R64_prio_3; - __be64 R64_prio_4; - __be64 R64_prio_5; - __be64 R64_prio_6; - __be64 R64_prio_7; - __be64 R64_novlan; - /* Received frames with a length of 127 octets */ - __be64 R127_prio_0; - __be64 R127_prio_1; - __be64 R127_prio_2; - __be64 R127_prio_3; - __be64 R127_prio_4; - __be64 R127_prio_5; - __be64 R127_prio_6; - __be64 R127_prio_7; - __be64 R127_novlan; - /* Received frames with a length of 255 octets */ - __be64 R255_prio_0; - __be64 R255_prio_1; - __be64 R255_prio_2; - __be64 R255_prio_3; - __be64 R255_prio_4; - __be64 R255_prio_5; - __be64 R255_prio_6; - __be64 R255_prio_7; - __be64 R255_novlan; - /* Received frames with a length of 511 octets */ - __be64 R511_prio_0; - __be64 R511_prio_1; - __be64 R511_prio_2; - __be64 R511_prio_3; - __be64 R511_prio_4; - __be64 R511_prio_5; - __be64 R511_prio_6; - __be64 R511_prio_7; - __be64 R511_novlan; - /* Received frames with a length of 1023 octets */ - __be64 R1023_prio_0; - __be64 R1023_prio_1; - __be64 R1023_prio_2; - __be64 R1023_prio_3; - __be64 R1023_prio_4; - __be64 R1023_prio_5; - __be64 R1023_prio_6; - __be64 R1023_prio_7; - __be64 R1023_novlan; - /* Received frames with a length of 1518 octets */ - __be64 R1518_prio_0; - __be64 R1518_prio_1; - __be64 R1518_prio_2; - __be64 R1518_prio_3; - __be64 R1518_prio_4; - __be64 R1518_prio_5; - __be64 R1518_prio_6; - __be64 R1518_prio_7; - __be64 R1518_novlan; - /* Received frames with a length of 1522 octets */ - __be64 R1522_prio_0; - __be64 R1522_prio_1; - __be64 R1522_prio_2; - __be64 R1522_prio_3; - __be64 R1522_prio_4; - __be64 R1522_prio_5; - __be64 R1522_prio_6; - __be64 R1522_prio_7; - __be64 R1522_novlan; - /* Received frames with a length of 1548 octets */ - __be64 R1548_prio_0; - __be64 R1548_prio_1; - __be64 R1548_prio_2; - __be64 R1548_prio_3; - __be64 R1548_prio_4; - __be64 R1548_prio_5; - __be64 R1548_prio_6; - __be64 R1548_prio_7; - __be64 R1548_novlan; - /* Received frames with a length of 1548 < octets < MTU */ - __be64 R2MTU_prio_0; - __be64 R2MTU_prio_1; - __be64 R2MTU_prio_2; - __be64 R2MTU_prio_3; - __be64 R2MTU_prio_4; - __be64 R2MTU_prio_5; - __be64 R2MTU_prio_6; - __be64 R2MTU_prio_7; - __be64 R2MTU_novlan; - /* Received frames with a length of MTU< 
octets and good CRC */ - __be64 RGIANT_prio_0; - __be64 RGIANT_prio_1; - __be64 RGIANT_prio_2; - __be64 RGIANT_prio_3; - __be64 RGIANT_prio_4; - __be64 RGIANT_prio_5; - __be64 RGIANT_prio_6; - __be64 RGIANT_prio_7; - __be64 RGIANT_novlan; - /* Received broadcast frames with good CRC */ - __be64 RBCAST_prio_0; - __be64 RBCAST_prio_1; - __be64 RBCAST_prio_2; - __be64 RBCAST_prio_3; - __be64 RBCAST_prio_4; - __be64 RBCAST_prio_5; - __be64 RBCAST_prio_6; - __be64 RBCAST_prio_7; - __be64 RBCAST_novlan; - /* Received multicast frames with good CRC */ - __be64 MCAST_prio_0; - __be64 MCAST_prio_1; - __be64 MCAST_prio_2; - __be64 MCAST_prio_3; - __be64 MCAST_prio_4; - __be64 MCAST_prio_5; - __be64 MCAST_prio_6; - __be64 MCAST_prio_7; - __be64 MCAST_novlan; - /* Received unicast not short or GIANT frames with good CRC */ - __be64 RTOTG_prio_0; - __be64 RTOTG_prio_1; - __be64 RTOTG_prio_2; - __be64 RTOTG_prio_3; - __be64 RTOTG_prio_4; - __be64 RTOTG_prio_5; - __be64 RTOTG_prio_6; - __be64 RTOTG_prio_7; - __be64 RTOTG_novlan; - - /* Count of total octets of received frames, includes framing characters */ - __be64 RTTLOCT_prio_0; - /* Count of total octets of received frames, not including framing - characters */ - __be64 RTTLOCT_NOFRM_prio_0; - /* Count of Total number of octets received - (only for frames without errors) */ - __be64 ROCT_prio_0; - - __be64 RTTLOCT_prio_1; - __be64 RTTLOCT_NOFRM_prio_1; - __be64 ROCT_prio_1; - - __be64 RTTLOCT_prio_2; - __be64 RTTLOCT_NOFRM_prio_2; - __be64 ROCT_prio_2; - - __be64 RTTLOCT_prio_3; - __be64 RTTLOCT_NOFRM_prio_3; - __be64 ROCT_prio_3; - - __be64 RTTLOCT_prio_4; - __be64 RTTLOCT_NOFRM_prio_4; - __be64 ROCT_prio_4; - - __be64 RTTLOCT_prio_5; - __be64 RTTLOCT_NOFRM_prio_5; - __be64 ROCT_prio_5; - - __be64 RTTLOCT_prio_6; - __be64 RTTLOCT_NOFRM_prio_6; - __be64 ROCT_prio_6; - - __be64 RTTLOCT_prio_7; - __be64 RTTLOCT_NOFRM_prio_7; - __be64 ROCT_prio_7; - - __be64 RTTLOCT_novlan; - __be64 RTTLOCT_NOFRM_novlan; - __be64 ROCT_novlan; - - /* Count of Total received frames including bad frames */ - __be64 RTOT_prio_0; - /* Count of Total number of received frames with 802.1Q encapsulation */ - __be64 R1Q_prio_0; - __be64 reserved1; - - __be64 RTOT_prio_1; - __be64 R1Q_prio_1; - __be64 reserved2; - - __be64 RTOT_prio_2; - __be64 R1Q_prio_2; - __be64 reserved3; - - __be64 RTOT_prio_3; - __be64 R1Q_prio_3; - __be64 reserved4; - - __be64 RTOT_prio_4; - __be64 R1Q_prio_4; - __be64 reserved5; - - __be64 RTOT_prio_5; - __be64 R1Q_prio_5; - __be64 reserved6; - - __be64 RTOT_prio_6; - __be64 R1Q_prio_6; - __be64 reserved7; - - __be64 RTOT_prio_7; - __be64 R1Q_prio_7; - __be64 reserved8; - - __be64 RTOT_novlan; - __be64 R1Q_novlan; - __be64 reserved9; - - /* Total number of Successfully Received Control Frames */ - __be64 RCNTL; - __be64 reserved10; - __be64 reserved11; - __be64 reserved12; - /* Count of received frames with a length/type field value between 46 - (42 for VLANtagged frames) and 1500 (also 1500 for VLAN-tagged frames), - inclusive */ - __be64 RInRangeLengthErr; - /* Count of received frames with length/type field between 1501 and 1535 - decimal, inclusive */ - __be64 ROutRangeLengthErr; - /* Count of received frames that are longer than max allowed size for - 802.3 frames (1518/1522) */ - __be64 RFrmTooLong; - /* Count frames received with PCS error */ - __be64 PCS; - - /* Transmit frames with a length of 64 octets */ - __be64 T64_prio_0; - __be64 T64_prio_1; - __be64 T64_prio_2; - __be64 T64_prio_3; - __be64 T64_prio_4; - __be64 T64_prio_5; - __be64 
T64_prio_6; - __be64 T64_prio_7; - __be64 T64_novlan; - __be64 T64_loopbk; - /* Transmit frames with a length of 65 to 127 octets. */ - __be64 T127_prio_0; - __be64 T127_prio_1; - __be64 T127_prio_2; - __be64 T127_prio_3; - __be64 T127_prio_4; - __be64 T127_prio_5; - __be64 T127_prio_6; - __be64 T127_prio_7; - __be64 T127_novlan; - __be64 T127_loopbk; - /* Transmit frames with a length of 128 to 255 octets */ - __be64 T255_prio_0; - __be64 T255_prio_1; - __be64 T255_prio_2; - __be64 T255_prio_3; - __be64 T255_prio_4; - __be64 T255_prio_5; - __be64 T255_prio_6; - __be64 T255_prio_7; - __be64 T255_novlan; - __be64 T255_loopbk; - /* Transmit frames with a length of 256 to 511 octets */ - __be64 T511_prio_0; - __be64 T511_prio_1; - __be64 T511_prio_2; - __be64 T511_prio_3; - __be64 T511_prio_4; - __be64 T511_prio_5; - __be64 T511_prio_6; - __be64 T511_prio_7; - __be64 T511_novlan; - __be64 T511_loopbk; - /* Transmit frames with a length of 512 to 1023 octets */ - __be64 T1023_prio_0; - __be64 T1023_prio_1; - __be64 T1023_prio_2; - __be64 T1023_prio_3; - __be64 T1023_prio_4; - __be64 T1023_prio_5; - __be64 T1023_prio_6; - __be64 T1023_prio_7; - __be64 T1023_novlan; - __be64 T1023_loopbk; - /* Transmit frames with a length of 1024 to 1518 octets */ - __be64 T1518_prio_0; - __be64 T1518_prio_1; - __be64 T1518_prio_2; - __be64 T1518_prio_3; - __be64 T1518_prio_4; - __be64 T1518_prio_5; - __be64 T1518_prio_6; - __be64 T1518_prio_7; - __be64 T1518_novlan; - __be64 T1518_loopbk; - /* Counts transmit frames with a length of 1519 to 1522 bytes */ - __be64 T1522_prio_0; - __be64 T1522_prio_1; - __be64 T1522_prio_2; - __be64 T1522_prio_3; - __be64 T1522_prio_4; - __be64 T1522_prio_5; - __be64 T1522_prio_6; - __be64 T1522_prio_7; - __be64 T1522_novlan; - __be64 T1522_loopbk; - /* Transmit frames with a length of 1523 to 1548 octets */ - __be64 T1548_prio_0; - __be64 T1548_prio_1; - __be64 T1548_prio_2; - __be64 T1548_prio_3; - __be64 T1548_prio_4; - __be64 T1548_prio_5; - __be64 T1548_prio_6; - __be64 T1548_prio_7; - __be64 T1548_novlan; - __be64 T1548_loopbk; - /* Counts transmit frames with a length of 1549 to MTU bytes */ - __be64 T2MTU_prio_0; - __be64 T2MTU_prio_1; - __be64 T2MTU_prio_2; - __be64 T2MTU_prio_3; - __be64 T2MTU_prio_4; - __be64 T2MTU_prio_5; - __be64 T2MTU_prio_6; - __be64 T2MTU_prio_7; - __be64 T2MTU_novlan; - __be64 T2MTU_loopbk; - /* Transmit frames with a length greater than MTU octets and a good CRC. 
*/ - __be64 TGIANT_prio_0; - __be64 TGIANT_prio_1; - __be64 TGIANT_prio_2; - __be64 TGIANT_prio_3; - __be64 TGIANT_prio_4; - __be64 TGIANT_prio_5; - __be64 TGIANT_prio_6; - __be64 TGIANT_prio_7; - __be64 TGIANT_novlan; - __be64 TGIANT_loopbk; - /* Transmit broadcast frames with a good CRC */ - __be64 TBCAST_prio_0; - __be64 TBCAST_prio_1; - __be64 TBCAST_prio_2; - __be64 TBCAST_prio_3; - __be64 TBCAST_prio_4; - __be64 TBCAST_prio_5; - __be64 TBCAST_prio_6; - __be64 TBCAST_prio_7; - __be64 TBCAST_novlan; - __be64 TBCAST_loopbk; - /* Transmit multicast frames with a good CRC */ - __be64 TMCAST_prio_0; - __be64 TMCAST_prio_1; - __be64 TMCAST_prio_2; - __be64 TMCAST_prio_3; - __be64 TMCAST_prio_4; - __be64 TMCAST_prio_5; - __be64 TMCAST_prio_6; - __be64 TMCAST_prio_7; - __be64 TMCAST_novlan; - __be64 TMCAST_loopbk; - /* Transmit good frames that are neither broadcast nor multicast */ - __be64 TTOTG_prio_0; - __be64 TTOTG_prio_1; - __be64 TTOTG_prio_2; - __be64 TTOTG_prio_3; - __be64 TTOTG_prio_4; - __be64 TTOTG_prio_5; - __be64 TTOTG_prio_6; - __be64 TTOTG_prio_7; - __be64 TTOTG_novlan; - __be64 TTOTG_loopbk; - - /* total octets of transmitted frames, including framing characters */ - __be64 TTTLOCT_prio_0; - /* total octets of transmitted frames, not including framing characters */ - __be64 TTTLOCT_NOFRM_prio_0; - /* ifOutOctets */ - __be64 TOCT_prio_0; - - __be64 TTTLOCT_prio_1; - __be64 TTTLOCT_NOFRM_prio_1; - __be64 TOCT_prio_1; - - __be64 TTTLOCT_prio_2; - __be64 TTTLOCT_NOFRM_prio_2; - __be64 TOCT_prio_2; - - __be64 TTTLOCT_prio_3; - __be64 TTTLOCT_NOFRM_prio_3; - __be64 TOCT_prio_3; - - __be64 TTTLOCT_prio_4; - __be64 TTTLOCT_NOFRM_prio_4; - __be64 TOCT_prio_4; - - __be64 TTTLOCT_prio_5; - __be64 TTTLOCT_NOFRM_prio_5; - __be64 TOCT_prio_5; - - __be64 TTTLOCT_prio_6; - __be64 TTTLOCT_NOFRM_prio_6; - __be64 TOCT_prio_6; - - __be64 TTTLOCT_prio_7; - __be64 TTTLOCT_NOFRM_prio_7; - __be64 TOCT_prio_7; - - __be64 TTTLOCT_novlan; - __be64 TTTLOCT_NOFRM_novlan; - __be64 TOCT_novlan; - - __be64 TTTLOCT_loopbk; - __be64 TTTLOCT_NOFRM_loopbk; - __be64 TOCT_loopbk; - - /* Total frames transmitted with a good CRC that are not aborted */ - __be64 TTOT_prio_0; - /* Total number of frames transmitted with 802.1Q encapsulation */ - __be64 T1Q_prio_0; - __be64 reserved13; - - __be64 TTOT_prio_1; - __be64 T1Q_prio_1; - __be64 reserved14; - - __be64 TTOT_prio_2; - __be64 T1Q_prio_2; - __be64 reserved15; - - __be64 TTOT_prio_3; - __be64 T1Q_prio_3; - __be64 reserved16; - - __be64 TTOT_prio_4; - __be64 T1Q_prio_4; - __be64 reserved17; - - __be64 TTOT_prio_5; - __be64 T1Q_prio_5; - __be64 reserved18; - - __be64 TTOT_prio_6; - __be64 T1Q_prio_6; - __be64 reserved19; - - __be64 TTOT_prio_7; - __be64 T1Q_prio_7; - __be64 reserved20; - - __be64 TTOT_novlan; - __be64 T1Q_novlan; - __be64 reserved21; - - __be64 TTOT_loopbk; - __be64 T1Q_loopbk; - __be64 reserved22; - - /* Received frames with a length greater than MTU octets and a bad CRC */ - __be32 RJBBR; - /* Received frames with a bad CRC that are not runts, jabbers, - or alignment errors */ - __be32 RCRC; - /* Received frames with SFD with a length of less than 64 octets and a - bad CRC */ - __be32 RRUNT; - /* Received frames with a length less than 64 octets and a good CRC */ - __be32 RSHORT; - /* Total Number of Received Packets Dropped */ - __be32 RDROP; - /* Drop due to overflow */ - __be32 RdropOvflw; - /* Drop due to overflow */ - __be32 RdropLength; - /* Total of good frames. 
Does not include frames received with - frame-too-long, FCS, or length errors */ - __be32 RTOTFRMS; - /* Total dropped Xmited packets */ - __be32 TDROP; -}; - - #endif diff --git a/drivers/net/mlx4/en_resources.c b/drivers/net/mlx4/en_resources.c index 0dfb4ec8a9dd0..df3ff7e1d000e 100644 --- a/drivers/net/mlx4/en_resources.c +++ b/drivers/net/mlx4/en_resources.c @@ -31,7 +31,6 @@ * */ -#include #include #include @@ -47,8 +46,9 @@ void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride, context->flags = cpu_to_be32(7 << 16 | rss << 13); context->pd = cpu_to_be32(mdev->priv_pdn); context->mtu_msgmax = 0xff; - if (!is_tx && !rss) + if (!is_tx && !rss) { context->rq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4); + } if (is_tx) context->sq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4); else diff --git a/drivers/net/mlx4/en_rx.c b/drivers/net/mlx4/en_rx.c index 635f710f200ef..95089d50a8655 100644 --- a/drivers/net/mlx4/en_rx.c +++ b/drivers/net/mlx4/en_rx.c @@ -32,7 +32,6 @@ */ #include -#include #include #include #include @@ -42,6 +41,18 @@ #include "mlx4_en.h" +static int mlx4_en_get_frag_header(struct skb_frag_struct *frags, void **mac_hdr, + void **ip_hdr, void **tcpudp_hdr, + u64 *hdr_flags, void *priv) +{ + *mac_hdr = page_address(frags->page) + frags->page_offset; + *ip_hdr = *mac_hdr + ETH_HLEN; + *tcpudp_hdr = (struct tcphdr *)(*ip_hdr + sizeof(struct iphdr)); + *hdr_flags = LRO_IPV4 | LRO_TCP; + + return 0; +} + static int mlx4_en_alloc_frag(struct mlx4_en_priv *priv, struct mlx4_en_rx_desc *rx_desc, struct skb_frag_struct *skb_frags, @@ -123,6 +134,15 @@ static void mlx4_en_destroy_allocator(struct mlx4_en_priv *priv, } } +static void +mlx4_en_init_rx_desc_skb(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring *ring, int index) +{ + struct mlx4_en_rx_desc *rx_desc = ring->buf + ring->stride * index; + + rx_desc->data->byte_count = cpu_to_be32(priv->rx_skb_size); + rx_desc->data->lkey = cpu_to_be32(priv->mdev->mr.key); +} static void mlx4_en_init_rx_desc(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring, int index) @@ -152,6 +172,40 @@ static void mlx4_en_init_rx_desc(struct mlx4_en_priv *priv, } } +static int +mlx4_en_alloc_rx_skb(struct mlx4_en_priv *priv, + struct mlx4_en_rx_desc *rx_desc, + struct sk_buff **pskb, int unmap) +{ + struct mlx4_en_dev *mdev = priv->mdev; + dma_addr_t dma; + int size = priv->rx_skb_size + NET_IP_ALIGN; + struct sk_buff *new_skb = dev_alloc_skb(size); + + if (unlikely(new_skb == NULL)) + return -ENOMEM; + + if (unmap) + pci_unmap_single(mdev->pdev, be64_to_cpu(rx_desc->data->addr), + be32_to_cpu(rx_desc->data->byte_count), + PCI_DMA_FROMDEVICE); + new_skb->dev = priv->dev; + skb_reserve(new_skb, NET_IP_ALIGN); + dma = pci_map_single(priv->mdev->pdev, new_skb->data, size, DMA_FROM_DEVICE); + *pskb = new_skb; + rx_desc->data->addr = cpu_to_be64(dma); + return 0; +} + +static int +mlx4_en_prepare_rx_desc_skb(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring *ring, int index) +{ + struct mlx4_en_rx_desc *rx_desc = ring->buf + (index * ring->stride); + struct sk_buff **pskb = (struct sk_buff **) ring->rx_info + index; + + return mlx4_en_alloc_rx_skb(priv, rx_desc, pskb, 0); +} static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring, int index) @@ -184,19 +238,29 @@ static void mlx4_en_free_rx_desc(struct mlx4_en_priv *priv, { struct mlx4_en_dev *mdev = priv->mdev; struct skb_frag_struct *skb_frags; + struct sk_buff *skb; struct mlx4_en_rx_desc *rx_desc = ring->buf + (index << 
ring->log_stride); dma_addr_t dma; int nr; - skb_frags = ring->rx_info + (index << priv->log_rx_info); - for (nr = 0; nr < priv->num_frags; nr++) { - en_dbg(DRV, priv, "Freeing fragment:%d\n", nr); - dma = be64_to_cpu(rx_desc->data[nr].addr); - - en_dbg(DRV, priv, "Unmapping buffer at dma:0x%llx\n", (u64) dma); - pci_unmap_single(mdev->pdev, dma, skb_frags[nr].size, + if (ring->use_frags) { + skb_frags = ring->rx_info + (index << priv->log_rx_info); + for (nr = 0; nr < priv->num_frags; nr++) { + en_dbg(DRV, priv, "Freeing fragment:%d\n", nr); + dma = be64_to_cpu(rx_desc->data[nr].addr); + + en_dbg(DRV, priv, "Unmaping buffer at dma:0x%llx\n", (u64) dma); + pci_unmap_single(mdev->pdev, dma, skb_frags[nr].size, + PCI_DMA_FROMDEVICE); + put_page(skb_frags[nr].page); + } + } else { + skb = *((struct sk_buff **) ring->rx_info + index); + dma = be64_to_cpu(rx_desc->data->addr); + pci_unmap_single(mdev->pdev, dma, + priv->rx_skb_size + NET_IP_ALIGN, PCI_DMA_FROMDEVICE); - put_page(skb_frags[nr].page); + kfree_skb(skb); } } @@ -206,13 +270,19 @@ static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv) int ring_ind; int buf_ind; int new_size; + int err; for (buf_ind = 0; buf_ind < priv->prof->rx_ring_size; buf_ind++) { for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) { ring = &priv->rx_ring[ring_ind]; - if (mlx4_en_prepare_rx_desc(priv, ring, - ring->actual_size)) { + if (ring->use_frags) + err = mlx4_en_prepare_rx_desc(priv, ring, + ring->actual_size); + else + err = mlx4_en_prepare_rx_desc_skb(priv, ring, + ring->actual_size); + if (err) { if (ring->actual_size < MLX4_EN_MIN_RX_SIZE) { en_err(priv, "Failed to allocate " "enough rx buffers\n"); @@ -220,7 +290,7 @@ static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv) } else { new_size = rounddown_pow_of_two(ring->actual_size); en_warn(priv, "Only %d buffers allocated " - "reducing ring size to %d", + "reducing ring size to %d\n", ring->actual_size, new_size); goto reduce_rings; } @@ -262,8 +332,9 @@ static void mlx4_en_free_rx_buf(struct mlx4_en_priv *priv, } } + int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv, - struct mlx4_en_rx_ring *ring, u32 size, u16 stride) + struct mlx4_en_rx_ring *ring, u32 size) { struct mlx4_en_dev *mdev = priv->mdev; int err; @@ -274,12 +345,18 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv, ring->cons = 0; ring->size = size; ring->size_mask = size - 1; - ring->stride = stride; + ring->stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) + + DS_SIZE * (ring->use_frags ? 
+ MLX4_EN_MAX_RX_FRAGS : 1)); ring->log_stride = ffs(ring->stride) - 1; ring->buf_size = ring->size * ring->stride + TXBB_SIZE; - tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS * - sizeof(struct skb_frag_struct)); + if (ring->use_frags) + tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS * + sizeof(struct skb_frag_struct)); + else + tmp = size * sizeof(struct sk_buff *); + ring->rx_info = vmalloc(tmp); if (!ring->rx_info) { en_err(priv, "Failed allocating rx_info ring\n"); @@ -300,8 +377,28 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv, } ring->buf = ring->wqres.buf.direct.buf; + /* Configure lro mngr */ + memset(&ring->lro, 0, sizeof(struct net_lro_mgr)); + ring->lro.dev = priv->dev; + ring->lro.features = LRO_F_NAPI; + ring->lro.frag_align_pad = NET_IP_ALIGN; + ring->lro.ip_summed = CHECKSUM_UNNECESSARY; + ring->lro.ip_summed_aggr = CHECKSUM_UNNECESSARY; + ring->lro.max_desc = mdev->profile.num_lro; + ring->lro.max_aggr = MAX_SKB_FRAGS; + ring->lro.lro_arr = kzalloc(mdev->profile.num_lro * + sizeof(struct net_lro_desc), + GFP_KERNEL); + if (!ring->lro.lro_arr) { + en_err(priv, "Failed to allocate lro array\n"); + goto err_map; + } + ring->lro.get_frag_header = mlx4_en_get_frag_header; + return 0; +err_map: + mlx4_en_unmap_buffer(&ring->wqres.buf); err_hwq: mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size); err_ring: @@ -327,7 +424,8 @@ int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv) ring->actual_size = 0; ring->cqn = priv->rx_cq[ring_ind].mcq.cqn; - ring->stride = stride; + if (ring->use_frags) + ring->stride = stride; if (ring->stride <= TXBB_SIZE) ring->buf += TXBB_SIZE; @@ -337,18 +435,21 @@ int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv) memset(ring->buf, 0, ring->buf_size); mlx4_en_update_rx_prod_db(ring); - /* Initailize all descriptors */ - for (i = 0; i < ring->size; i++) - mlx4_en_init_rx_desc(priv, ring, i); - - /* Initialize page allocators */ - err = mlx4_en_init_allocator(priv, ring); - if (err) { - en_err(priv, "Failed initializing ring allocator\n"); - if (ring->stride <= TXBB_SIZE) - ring->buf -= TXBB_SIZE; - ring_ind--; - goto err_allocator; + if (ring->use_frags) { + /* Initailize all descriptors */ + for (i = 0; i < ring->size; i++) + mlx4_en_init_rx_desc(priv, ring, i); + + /* Initialize page allocators */ + err = mlx4_en_init_allocator(priv, ring); + if (err) { + en_err(priv, "Failed initializing ring allocator\n"); + ring_ind--; + goto err_allocator; + } + } else { + for (i = 0; i < ring->size; i++) + mlx4_en_init_rx_desc_skb(priv, ring, i); } } err = mlx4_en_fill_rx_buffers(priv); @@ -371,9 +472,8 @@ err_buffers: ring_ind = priv->rx_ring_num - 1; err_allocator: while (ring_ind >= 0) { - if (priv->rx_ring[ring_ind].stride <= TXBB_SIZE) - priv->rx_ring[ring_ind].buf -= TXBB_SIZE; - mlx4_en_destroy_allocator(priv, &priv->rx_ring[ring_ind]); + if (priv->rx_ring[ring_ind].use_frags) + mlx4_en_destroy_allocator(priv, &priv->rx_ring[ring_ind]); ring_ind--; } return err; @@ -384,6 +484,7 @@ void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv, { struct mlx4_en_dev *mdev = priv->mdev; + kfree(ring->lro.lro_arr); mlx4_en_unmap_buffer(&ring->wqres.buf); mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size + TXBB_SIZE); vfree(ring->rx_info); @@ -396,7 +497,8 @@ void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv, mlx4_en_free_rx_buf(priv, ring); if (ring->stride <= TXBB_SIZE) ring->buf -= TXBB_SIZE; - mlx4_en_destroy_allocator(priv, ring); + if (ring->use_frags) + mlx4_en_destroy_allocator(priv, ring); } @@ -404,11 
+506,10 @@ void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv, static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv, struct mlx4_en_rx_desc *rx_desc, struct skb_frag_struct *skb_frags, - struct sk_buff *skb, + struct skb_frag_struct *skb_frags_rx, struct mlx4_en_rx_alloc *page_alloc, int length) { - struct skb_frag_struct *skb_frags_rx = skb_shinfo(skb)->frags; struct mlx4_en_dev *mdev = priv->mdev; struct mlx4_en_frag_info *frag_info; int nr; @@ -424,7 +525,6 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv, skb_frags_rx[nr].page = skb_frags[nr].page; skb_frags_rx[nr].size = skb_frags[nr].size; skb_frags_rx[nr].page_offset = skb_frags[nr].page_offset; - skb->truesize += frag_info->frag_stride; dma = be64_to_cpu(rx_desc->data[nr].addr); /* Allocate a replacement page */ @@ -436,9 +536,8 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv, PCI_DMA_FROMDEVICE); } /* Adjust size of last fragment to match actual length */ - if (nr > 0) - skb_frags_rx[nr - 1].size = length - - priv->frag_info[nr - 1].frag_prefix_size; + skb_frags_rx[nr - 1].size = length - + priv->frag_info[nr - 1].frag_prefix_size; return nr; fail: @@ -472,6 +571,7 @@ static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv, skb->dev = priv->dev; skb_reserve(skb, NET_IP_ALIGN); skb->len = length; + skb->truesize = length + sizeof(struct sk_buff); /* Get pointer to first fragment so we could copy the headers into the * (linear part of the) skb */ @@ -481,17 +581,18 @@ static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv, /* We are copying all relevant data to the skb - temporarily * synch buffers for the copy */ dma = be64_to_cpu(rx_desc->data[0].addr); - dma_sync_single_for_cpu(&mdev->pdev->dev, dma, length, - DMA_FROM_DEVICE); + dma_sync_single_range_for_cpu(&mdev->pdev->dev, dma, 0, + length, DMA_FROM_DEVICE); skb_copy_to_linear_data(skb, va, length); - dma_sync_single_for_device(&mdev->pdev->dev, dma, length, - DMA_FROM_DEVICE); + dma_sync_single_range_for_device(&mdev->pdev->dev, dma, 0, + length, DMA_FROM_DEVICE); skb->tail += length; } else { /* Move relevant fragments to skb */ used_frags = mlx4_en_complete_rx_desc(priv, rx_desc, skb_frags, - skb, page_alloc, length); + skb_shinfo(skb)->frags, + page_alloc, length); if (unlikely(!used_frags)) { kfree_skb(skb); return NULL; @@ -512,6 +613,64 @@ static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv, return skb; } +static inline int invalid_cqe(struct mlx4_en_priv *priv, + struct mlx4_cqe *cqe) +{ + /* Drop packet on bad receive or bad checksum */ + if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == + MLX4_CQE_OPCODE_ERROR)) { + en_err(priv, "CQE completed in error - vendor " + "syndrom:%d syndrom:%d\n", + ((struct mlx4_err_cqe *) cqe)->vendor_err_syndrome, + ((struct mlx4_err_cqe *) cqe)->syndrome); + return 1; + } + if (unlikely(cqe->badfcs_enc & MLX4_CQE_BAD_FCS)) { + en_dbg(RX_ERR, priv, "Accepted frame with bad FCS\n"); + return 1;; + } + + return 0; +} + +static struct sk_buff * +mlx4_en_get_rx_skb(struct mlx4_en_priv *priv, + struct mlx4_en_rx_desc *rx_desc, + struct sk_buff **pskb, + unsigned int length) +{ + struct mlx4_en_dev *mdev = priv->mdev; + struct sk_buff *skb; + dma_addr_t dma; + + if (length <= SMALL_PACKET_SIZE) { + skb = dev_alloc_skb(length + NET_IP_ALIGN); + if (unlikely(!skb)) + return NULL; + + skb->dev = priv->dev; + skb_reserve(skb, NET_IP_ALIGN); + /* We are copying all relevant data to the skb - temporarily + * synch buffers for the copy */ + dma = 
be64_to_cpu(rx_desc->data->addr); + dma_sync_single_range_for_cpu(&mdev->pdev->dev, dma, 0, + length, DMA_FROM_DEVICE); + skb_copy_to_linear_data(skb, (*pskb)->data, length); + dma_sync_single_range_for_device(&mdev->pdev->dev, dma, 0, + length, DMA_FROM_DEVICE); + + } else { + skb = *pskb; + if (unlikely(mlx4_en_alloc_rx_skb(priv, rx_desc, pskb, 1))) + return NULL; + } + + skb->tail += length; + skb->len = length; + skb->truesize = length + sizeof(struct sk_buff); + return skb; +} + static void validate_loopback(struct mlx4_en_priv *priv, struct sk_buff *skb) { int i; @@ -528,12 +687,112 @@ out_loopback: dev_kfree_skb_any(skb); } +int mlx4_en_process_rx_cq_skb(struct net_device *dev, + struct mlx4_en_cq *cq, int budget) +{ + struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_cqe *cqe; + struct mlx4_en_rx_ring *ring = &priv->rx_ring[cq->ring]; + struct mlx4_en_rx_desc *rx_desc; + struct sk_buff **pskb; + struct sk_buff *skb; + int index; + unsigned int length; + int polled = 0; + + if (!priv->port_up) + return 0; + + /* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx + * descriptor offset can be deduced from the CQE index instead of + * reading 'cqe->index' */ + index = cq->mcq.cons_index & ring->size_mask; + cqe = &cq->buf[index]; + + /* Process all completed CQEs */ + while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK, + cq->mcq.cons_index & cq->size)) { + + pskb = (struct sk_buff **) ring->rx_info + index; + rx_desc = ring->buf + (index << ring->log_stride); + + /* + * make sure we read the CQE after we read the ownership bit + */ + rmb(); + + if (invalid_cqe(priv, cqe)) + goto next; + + /* + * Packet is OK - process it. + */ + length = be32_to_cpu(cqe->byte_cnt); + ring->bytes += length; + ring->packets++; + + skb = mlx4_en_get_rx_skb(priv, rx_desc, pskb, length); + if (unlikely(!skb)) { + priv->stats.rx_dropped++; + goto next; + } + + if (unlikely(priv->validate_loopback)) { + validate_loopback(priv, skb); + goto next; + } + skb->protocol = eth_type_trans(skb, dev); + + if (likely(priv->rx_csum && cqe->checksum == 0xffff)) { + priv->port_stats.rx_chksum_good++; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else { + priv->port_stats.rx_chksum_none++; + skb->ip_summed = CHECKSUM_NONE; + if (priv->mdev->profile.ip_reasm && + cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV4) && + !mlx4_en_rx_frags(priv, ring, skb, cqe)) + goto next; + } + + /* Push it up the stack */ + if (priv->vlgrp && (be32_to_cpu(cqe->vlan_my_qpn) & + MLX4_CQE_VLAN_PRESENT_MASK)) { + vlan_hwaccel_receive_skb(skb, priv->vlgrp, + be16_to_cpu(cqe->sl_vid)); + } else + netif_receive_skb(skb); + + dev->last_rx = jiffies; + +next: + ++cq->mcq.cons_index; + index = (cq->mcq.cons_index) & ring->size_mask; + cqe = &cq->buf[index]; + if (++polled == budget) + goto out; + } + + /* If CQ is empty, flush all pending IP reassembly sessions */ + mlx4_en_flush_frags(priv, ring); + +out: + AVG_PERF_COUNTER(priv->pstats.rx_coal_avg, polled); + mlx4_cq_set_ci(&cq->mcq); + wmb(); /* ensure HW sees CQ consumer before we post new buffers */ + ring->cons = cq->mcq.cons_index; + ring->prod += polled; /* Polled descriptors were realocated in place */ + mlx4_en_update_rx_prod_db(ring); + return polled; +} + int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_cqe *cqe; struct mlx4_en_rx_ring *ring = &priv->rx_ring[cq->ring]; struct skb_frag_struct *skb_frags; + struct skb_frag_struct lro_frags[MLX4_EN_MAX_RX_FRAGS]; struct 
mlx4_en_rx_desc *rx_desc; struct sk_buff *skb; int index; @@ -563,79 +822,63 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud */ rmb(); - /* Drop packet on bad receive or bad checksum */ - if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == - MLX4_CQE_OPCODE_ERROR)) { - en_err(priv, "CQE completed in error - vendor " - "syndrom:%d syndrom:%d\n", - ((struct mlx4_err_cqe *) cqe)->vendor_err_syndrome, - ((struct mlx4_err_cqe *) cqe)->syndrome); - goto next; - } - if (unlikely(cqe->badfcs_enc & MLX4_CQE_BAD_FCS)) { - en_dbg(RX_ERR, priv, "Accepted frame with bad FCS\n"); + if (invalid_cqe(priv, cqe)) goto next; - } /* * Packet is OK - process it. */ length = be32_to_cpu(cqe->byte_cnt); - length -= ring->fcs_del; ring->bytes += length; ring->packets++; - if (likely(dev->features & NETIF_F_RXCSUM)) { + if (likely(priv->rx_csum)) { if ((cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) && (cqe->checksum == cpu_to_be16(0xffff))) { - ring->csum_ok++; + priv->port_stats.rx_chksum_good++; /* This packet is eligible for LRO if it is: * - DIX Ethernet (type interpretation) * - TCP/IP (v4) * - without IP options * - not an IP fragment */ - if (dev->features & NETIF_F_GRO) { - struct sk_buff *gro_skb = napi_get_frags(&cq->napi); - if (!gro_skb) - goto next; + if (mlx4_en_can_lro(cqe->status) && + dev->features & NETIF_F_LRO) { nr = mlx4_en_complete_rx_desc( priv, rx_desc, - skb_frags, gro_skb, + skb_frags, lro_frags, ring->page_alloc, length); if (!nr) goto next; - skb_shinfo(gro_skb)->nr_frags = nr; - gro_skb->len = length; - gro_skb->data_len = length; - gro_skb->ip_summed = CHECKSUM_UNNECESSARY; - - if (cqe->vlan_my_qpn & - cpu_to_be32(MLX4_CQE_VLAN_PRESENT_MASK)) { - u16 vid = be16_to_cpu(cqe->sl_vid); - - __vlan_hwaccel_put_tag(gro_skb, vid); - } - - if (dev->features & NETIF_F_RXHASH) - gro_skb->rxhash = be32_to_cpu(cqe->immed_rss_invalid); - - skb_record_rx_queue(gro_skb, cq->ring); - napi_gro_frags(&cq->napi); + if (priv->vlgrp && (cqe->vlan_my_qpn & + cpu_to_be32(MLX4_CQE_VLAN_PRESENT_MASK))) { + lro_vlan_hwaccel_receive_frags( + &ring->lro, lro_frags, + length, length, + priv->vlgrp, + be16_to_cpu(cqe->sl_vid), + NULL, 0); + } else + lro_receive_frags(&ring->lro, + lro_frags, + length, + length, + NULL, 0); goto next; } /* LRO not possible, complete processing here */ ip_summed = CHECKSUM_UNNECESSARY; + INC_PERF_COUNTER(priv->pstats.lro_misses); } else { ip_summed = CHECKSUM_NONE; - ring->csum_none++; + priv->port_stats.rx_chksum_none++; } } else { ip_summed = CHECKSUM_NONE; - ring->csum_none++; + priv->port_stats.rx_chksum_none++; } skb = mlx4_en_rx_skb(priv, rx_desc, skb_frags, @@ -654,15 +897,13 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud skb->protocol = eth_type_trans(skb, dev); skb_record_rx_queue(skb, cq->ring); - if (dev->features & NETIF_F_RXHASH) - skb->rxhash = be32_to_cpu(cqe->immed_rss_invalid); - - if (be32_to_cpu(cqe->vlan_my_qpn) & - MLX4_CQE_VLAN_PRESENT_MASK) - __vlan_hwaccel_put_tag(skb, be16_to_cpu(cqe->sl_vid)); - /* Push it up the stack */ - netif_receive_skb(skb); + if (priv->vlgrp && (be32_to_cpu(cqe->vlan_my_qpn) & + MLX4_CQE_VLAN_PRESENT_MASK)) { + vlan_hwaccel_receive_skb(skb, priv->vlgrp, + be16_to_cpu(cqe->sl_vid)); + } else + netif_receive_skb(skb); next: ++cq->mcq.cons_index; @@ -671,10 +912,14 @@ next: if (++polled == budget) { /* We are here because we reached the NAPI budget - * flush only pending LRO sessions */ + lro_flush_all(&ring->lro); goto out; } } + /* If CQ is empty 
flush all LRO sessions unconditionally */ + lro_flush_all(&ring->lro); + out: AVG_PERF_COUNTER(priv->pstats.rx_coal_avg, polled); mlx4_cq_set_ci(&cq->mcq); @@ -705,7 +950,10 @@ int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget) struct mlx4_en_priv *priv = netdev_priv(dev); int done; - done = mlx4_en_process_rx_cq(dev, cq, budget); + if (priv->rx_ring[cq->ring].use_frags) + done = mlx4_en_process_rx_cq(dev, cq, budget); + else + done = mlx4_en_process_rx_cq_skb(dev, cq, budget); /* If we used up all the quota - we're probably not done yet... */ if (done == budget) @@ -719,7 +967,7 @@ int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget) } -/* Calculate the last offset position that accommodates a full fragment +/* Calculate the last offset position that accomodates a full fragment * (assuming fagment size = stride-align) */ static int mlx4_en_last_alloc_offset(struct mlx4_en_priv *priv, u16 stride, u16 align) { @@ -813,13 +1061,6 @@ static int mlx4_en_config_rss_qp(struct mlx4_en_priv *priv, int qpn, qpn, ring->cqn, context); context->db_rec_addr = cpu_to_be64(ring->wqres.db.dma); - /* Cancel FCS removal if FW allows */ - if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_FCS_KEEP) { - context->param3 |= cpu_to_be32(1 << 29); - ring->fcs_del = ETH_FCS_LEN; - } else - ring->fcs_del = 0; - err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, context, qp, state); if (err) { mlx4_qp_remove(mdev->dev, qp); @@ -839,17 +1080,14 @@ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv) struct mlx4_qp_context context; struct mlx4_en_rss_context *rss_context; void *ptr; - u8 rss_mask = 0x3f; + u8 rss_mask = (priv->udp_rings > 1) ? 0x3f : 0x14; int i, qpn; int err = 0; int good_qps = 0; - static const u32 rsskey[10] = { 0xD181C62C, 0xF7F4DB5B, 0x1983A2FC, - 0x943E1ADB, 0xD9389E6B, 0xD1039C2C, 0xA74499AD, - 0x593D56D9, 0xF3253C06, 0x2ADC1FFC}; en_dbg(DRV, priv, "Configuring rss steering\n"); err = mlx4_qp_reserve_range(mdev->dev, priv->rx_ring_num, - priv->rx_ring_num, + roundup_pow_of_two(priv->rx_ring_num), &rss_map->base_qpn); if (err) { en_err(priv, "Failed reserving %d qps\n", priv->rx_ring_num); @@ -858,7 +1096,8 @@ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv) for (i = 0; i < priv->rx_ring_num; i++) { qpn = rss_map->base_qpn + i; - err = mlx4_en_config_rss_qp(priv, qpn, &priv->rx_ring[i], + err = mlx4_en_config_rss_qp(priv, qpn, + &priv->rx_ring[i], &rss_map->state[i], &rss_map->qps[i]); if (err) @@ -879,16 +1118,15 @@ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv) ptr = ((void *) &context) + 0x3c; rss_context = (struct mlx4_en_rss_context *) ptr; - rss_context->base_qpn = cpu_to_be32(ilog2(priv->rx_ring_num) << 24 | + rss_context->base_qpn = cpu_to_be32(ilog2(priv->rx_ring_num - priv->udp_rings) << 24 | (rss_map->base_qpn)); - rss_context->default_qpn = cpu_to_be32(rss_map->base_qpn); + rss_context->default_qpn = cpu_to_be32(rss_map->base_qpn + + priv->rx_ring_num - + priv->udp_rings); rss_context->flags = rss_mask; - rss_context->hash_fn = 1; - for (i = 0; i < 10; i++) - rss_context->rss_key[i] = rsskey[i]; - - if (priv->mdev->profile.udp_rss) + if (priv->udp_rings > 1) rss_context->base_qpn_udp = rss_context->default_qpn; + err = mlx4_qp_to_ready(mdev->dev, &priv->res.mtt, &context, &rss_map->indir_qp, &rss_map->indir_state); if (err) diff --git a/drivers/net/mlx4/en_selftest.c b/drivers/net/mlx4/en_selftest.c index 9fdbcecd499da..0e69f5981cb87 100644 --- a/drivers/net/mlx4/en_selftest.c +++ b/drivers/net/mlx4/en_selftest.c @@ -43,7 +43,7 @@ static int 
mlx4_en_test_registers(struct mlx4_en_priv *priv) { return mlx4_cmd(priv->mdev->dev, 0, 0, 0, MLX4_CMD_HW_HEALTH_CHECK, - MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_TIME_CLASS_A, 0); } static int mlx4_en_test_loopback_xmit(struct mlx4_en_priv *priv) @@ -107,7 +107,7 @@ static int mlx4_en_test_loopback(struct mlx4_en_priv *priv) mlx4_en_test_loopback_exit: priv->validate_loopback = 0; - return !loopback_ok; + return (!loopback_ok); } @@ -149,7 +149,7 @@ void mlx4_en_ex_selftest(struct net_device *dev, u32 *flags, u64 *buf) netif_carrier_off(dev); retry_tx: - /* Wait until all tx queues are empty. + /* Wait untill all tx queues are empty. * there should not be any additional incoming traffic * since we turned the carrier off */ msleep(200); @@ -159,8 +159,7 @@ retry_tx: goto retry_tx; } - if (priv->mdev->dev->caps.flags & - MLX4_DEV_CAP_FLAG_UC_LOOPBACK) { + if (priv->mdev->dev->caps.loopback_support) { buf[3] = mlx4_en_test_registers(priv); buf[4] = mlx4_en_test_loopback(priv); } diff --git a/drivers/net/mlx4/en_tx.c b/drivers/net/mlx4/en_tx.c index 5215d9415f1b8..12447845189b0 100644 --- a/drivers/net/mlx4/en_tx.c +++ b/drivers/net/mlx4/en_tx.c @@ -33,27 +33,24 @@ #include #include -#include #include #include #include #include -#include #include "mlx4_en.h" enum { MAX_INLINE = 104, /* 128 - 16 - 4 - 4 */ - MAX_BF = 256, }; static int inline_thold __read_mostly = MAX_INLINE; module_param_named(inline_thold, inline_thold, int, 0444); -MODULE_PARM_DESC(inline_thold, "threshold for using inline data"); +MODULE_PARM_DESC(inline_thold, "treshold for using inline data"); int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, - struct mlx4_en_tx_ring *ring, int qpn, u32 size, + struct mlx4_en_tx_ring *ring, u32 size, u16 stride) { struct mlx4_en_dev *mdev = priv->mdev; @@ -104,25 +101,23 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, "buf_size:%d dma:%llx\n", ring, ring->buf, ring->size, ring->buf_size, (unsigned long long) ring->wqres.buf.direct.map); - ring->qpn = qpn; - err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp); + err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn); if (err) { - en_err(priv, "Failed allocating qp %d\n", ring->qpn); + en_err(priv, "Failed reserving qp for tx ring.\n"); goto err_map; } - ring->qp.event = mlx4_en_sqp_event; - err = mlx4_bf_alloc(mdev->dev, &ring->bf); + err = mlx4_qp_alloc(mdev->dev, ring->qpn, &ring->qp); if (err) { - en_dbg(DRV, priv, "working without blueflame (%d)", err); - ring->bf.uar = &mdev->priv_uar; - ring->bf.uar->map = mdev->uar_map; - ring->bf_enabled = false; - } else - ring->bf_enabled = true; + en_err(priv, "Failed allocating qp %d\n", ring->qpn); + goto err_reserve; + } + ring->qp.event = mlx4_en_sqp_event; return 0; +err_reserve: + mlx4_qp_release_range(mdev->dev, ring->qpn, 1); err_map: mlx4_en_unmap_buffer(&ring->wqres.buf); err_hwq_res: @@ -142,8 +137,6 @@ void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_dev *mdev = priv->mdev; en_dbg(DRV, priv, "Destroying tx ring, qpn: %d\n", ring->qpn); - if (ring->bf_enabled) - mlx4_bf_free(mdev->dev, &ring->bf); mlx4_qp_remove(mdev->dev, &ring->qp); mlx4_qp_free(mdev->dev, &ring->qp); mlx4_qp_release_range(mdev->dev, ring->qpn, 1); @@ -176,8 +169,6 @@ int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv, mlx4_en_fill_qp_context(priv, ring->size, ring->stride, 1, 0, ring->qpn, ring->cqn, &ring->context); - if (ring->bf_enabled) - ring->context.usr_page = cpu_to_be32(ring->bf.uar->index); err = mlx4_qp_to_ready(mdev->dev, &ring->wqres.mtt, &ring->context, &ring->qp, 
&ring->qp_state); @@ -307,60 +298,87 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring) return cnt; } +void mlx4_en_set_prio_map(struct mlx4_en_priv *priv, u16 *prio_map, u32 ring_num) +{ + int block = 8 / ring_num; + int extra = 8 - (block * ring_num); + int num = 0; + u16 ring = 1; + int prio; + + if (ring_num == 1) { + for (prio = 0; prio < 8; prio++) + prio_map[prio] = 0; + return; + } + + for (prio = 0; prio < 8; prio++) { + if (extra && (num == block + 1)) { + ring++; + num = 0; + extra--; + } else if (!extra && (num == block)) { + ring++; + num = 0; + } + prio_map[prio] = ring; + en_dbg(DRV, priv, " prio:%d --> ring:%d\n", prio, ring); + num++; + } +} + static void mlx4_en_process_tx_cq(struct net_device *dev, struct mlx4_en_cq *cq) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_cq *mcq = &cq->mcq; struct mlx4_en_tx_ring *ring = &priv->tx_ring[cq->ring]; - struct mlx4_cqe *cqe; + struct mlx4_cqe *cqe = cq->buf; u16 index; - u16 new_index, ring_index; + u16 new_index; u32 txbbs_skipped = 0; - u32 cons_index = mcq->cons_index; - int size = cq->size; - u32 size_mask = ring->size_mask; - struct mlx4_cqe *buf = cq->buf; + u32 cq_last_sav; - if (!priv->port_up) + /* index always points to the first TXBB of the last polled descriptor */ + index = ring->cons & ring->size_mask; + new_index = be16_to_cpu(cqe->wqe_index) & ring->size_mask; + if (index == new_index) return; - index = cons_index & size_mask; - cqe = &buf[index]; - ring_index = ring->cons & size_mask; - - /* Process all completed CQEs */ - while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK, - cons_index & size)) { - /* - * make sure we read the CQE after we read the - * ownership bit - */ - rmb(); - - /* Skip over last polled CQE */ - new_index = be16_to_cpu(cqe->wqe_index) & size_mask; + if (!priv->port_up) + return; + /* + * We use a two-stage loop: + * - the first samples the HW-updated CQE + * - the second frees TXBBs until the last sample + * This lets us amortize CQE cache misses, while still polling the CQ + * until is quiescent. + */ + cq_last_sav = mcq->cons_index; + do { do { + /* Skip over last polled CQE */ + index = (index + ring->last_nr_txbb) & ring->size_mask; txbbs_skipped += ring->last_nr_txbb; - ring_index = (ring_index + ring->last_nr_txbb) & size_mask; - /* free next descriptor */ + + /* Poll next CQE */ ring->last_nr_txbb = mlx4_en_free_tx_desc( - priv, ring, ring_index, - !!((ring->cons + txbbs_skipped) & - ring->size)); - } while (ring_index != new_index); - - ++cons_index; - index = cons_index & size_mask; - cqe = &buf[index]; - } + priv, ring, index, + !!((ring->cons + txbbs_skipped) & + ring->size)); + ++mcq->cons_index; + } while (index != new_index); + + new_index = be16_to_cpu(cqe->wqe_index) & ring->size_mask; + } while (index != new_index); + AVG_PERF_COUNTER(priv->pstats.tx_coal_avg, + (u32) (mcq->cons_index - cq_last_sav)); /* * To prevent CQ overflow we first update CQ consumer and only then * the ring consumer. 
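/*
 * Illustrative userspace sketch (not part of this patch) of the
 * priority-to-ring distribution performed by mlx4_en_set_prio_map() in the
 * hunk above: the 8 VLAN priorities are spread as evenly as possible over
 * 'ring_num' dedicated rings, with the first 'extra' rings taking one extra
 * priority.  A map value of 0 means "no dedicated ring"; per the
 * mlx4_en_select_queue() hunk further below, such priorities fall back to
 * hash-based ring selection.
 */
#include <stdio.h>

static void demo_prio_map(unsigned short *prio_map, unsigned int ring_num)
{
	int block = 8 / ring_num;		/* base priorities per ring */
	int extra = 8 - block * ring_num;	/* rings that take one more */
	int num = 0, prio;
	unsigned short ring = 1;

	if (ring_num == 1) {
		for (prio = 0; prio < 8; prio++)
			prio_map[prio] = 0;
		return;
	}

	for (prio = 0; prio < 8; prio++) {
		if (extra && num == block + 1) {
			ring++; num = 0; extra--;
		} else if (!extra && num == block) {
			ring++; num = 0;
		}
		prio_map[prio] = ring;
		num++;
	}
}

int main(void)
{
	unsigned short map[8];
	int i;

	demo_prio_map(map, 3);	/* 3 priority rings -> a 3/3/2 split */
	for (i = 0; i < 8; i++)
		printf("prio %d -> ring %u\n", i, map[i]);
	return 0;
}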
*/ - mcq->cons_index = cons_index; mlx4_cq_set_ci(mcq); wmb(); ring->cons += txbbs_skipped; @@ -447,7 +465,6 @@ static inline void mlx4_en_xmit_poll(struct mlx4_en_priv *priv, int tx_ind) { struct mlx4_en_cq *cq = &priv->tx_cq[tx_ind]; struct mlx4_en_tx_ring *ring = &priv->tx_ring[tx_ind]; - unsigned long flags; /* If we don't have a pending timer, set one up to catch our recent post in case the interface becomes idle */ @@ -456,9 +473,9 @@ static inline void mlx4_en_xmit_poll(struct mlx4_en_priv *priv, int tx_ind) /* Poll the CQ every mlx4_en_TX_MODER_POLL packets */ if ((++ring->poll_cnt & (MLX4_EN_TX_POLL_MODER - 1)) == 0) - if (spin_trylock_irqsave(&ring->comp_lock, flags)) { + if (spin_trylock_irq(&ring->comp_lock)) { mlx4_en_process_tx_cq(priv->dev, cq); - spin_unlock_irqrestore(&ring->comp_lock, flags); + spin_unlock_irq(&ring->comp_lock); } } @@ -587,24 +604,55 @@ u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb) { struct mlx4_en_priv *priv = netdev_priv(dev); u16 vlan_tag = 0; + int tx_ind = 0; + struct tcphdr *th = tcp_hdr(skb); + struct iphdr *iph = ip_hdr(skb); + struct mlx4_en_tx_hash_entry *entry; + u32 hash_index; - /* If we support per priority flow control and the packet contains - * a vlan tag, send the packet to the TX ring assigned to that priority - */ - if (priv->prof->rx_ppp && vlan_tx_tag_present(skb)) { + /* Obtain VLAN information if present */ + if (priv->vlgrp && vlan_tx_tag_present(skb)) { vlan_tag = vlan_tx_tag_get(skb); - return MLX4_EN_NUM_TX_RINGS + (vlan_tag >> 13); + /* Set the Tx ring to use according to vlan priority */ + tx_ind = priv->tx_prio_map[vlan_tag >> 13]; + if (tx_ind) + return tx_ind; } - return skb_tx_hash(dev, skb); -} + /* Hashing is only done for TCP/IP or UDP/IP packets */ + if (be16_to_cpu(skb->protocol) != ETH_P_IP) + return MLX4_EN_NUM_HASH_RINGS; + + hash_index = be32_to_cpu(iph->daddr) & MLX4_EN_TX_HASH_MASK; + switch(iph->protocol) { + case IPPROTO_UDP: + break; + case IPPROTO_TCP: + hash_index = (hash_index ^ be16_to_cpu(th->dest ^ th->source)) & + MLX4_EN_TX_HASH_MASK; + break; + default: + return MLX4_EN_NUM_HASH_RINGS; + } -static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt) -{ - __iowrite64_copy(dst, src, bytecnt / 8); + entry = &priv->tx_hash[hash_index]; + if(unlikely(!entry->cnt)) { + tx_ind = hash_index & (MLX4_EN_NUM_HASH_RINGS / 2 - 1); + if (2 * entry->small_pkts > entry->big_pkts) + tx_ind += MLX4_EN_NUM_HASH_RINGS / 2; + entry->small_pkts = entry->big_pkts = 0; + entry->ring = tx_ind; + } + + entry->cnt++; + if (skb->len > MLX4_EN_SMALL_PKT_SIZE) + entry->big_pkts++; + else + entry->small_pkts++; + return entry->ring; } -netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) +int mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) { struct mlx4_en_priv *priv = netdev_priv(dev); struct mlx4_en_dev *mdev = priv->mdev; @@ -622,13 +670,12 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) int desc_size; int real_size; dma_addr_t dma; - u32 index, bf_index; + u32 index; __be32 op_own; u16 vlan_tag = 0; int i; int lso_header_size; void *fragptr; - bool bounce = false; if (!priv->port_up) goto tx_drop; @@ -637,7 +684,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(!real_size)) goto tx_drop; - /* Align descriptor to TXBB size */ + /* Allign descriptor to TXBB size */ desc_size = ALIGN(real_size, TXBB_SIZE); nr_txbb = desc_size / TXBB_SIZE; if (unlikely(nr_txbb > MAX_DESC_TXBBS)) { @@ -648,7 
+695,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) tx_ind = skb->queue_mapping; ring = &priv->tx_ring[tx_ind]; - if (vlan_tx_tag_present(skb)) + if (priv->vlgrp && vlan_tx_tag_present(skb)) vlan_tag = vlan_tx_tag_get(skb); /* Check available TXBBs And 2K spare for prefetch */ @@ -671,16 +718,13 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) /* Packet is good - grab an index and transmit it */ index = ring->prod & ring->size_mask; - bf_index = ring->prod; /* See if we have enough space for whole descriptor TXBB for setting * SW ownership on next descriptor; if not, use a bounce buffer. */ if (likely(index + nr_txbb <= ring->size)) tx_desc = ring->buf + index * TXBB_SIZE; - else { + else tx_desc = (struct mlx4_en_tx_desc *) ring->bounce_buf; - bounce = true; - } /* Save skb in tx_info ring */ tx_info = &ring->tx_info[index]; @@ -692,11 +736,12 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag); tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * !!vlan_tag; tx_desc->ctrl.fence_size = (real_size / 16) & 0x3f; - tx_desc->ctrl.srcrb_flags = priv->ctrl_flags; + tx_desc->ctrl.srcrb_flags = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE | + MLX4_WQE_CTRL_SOLICITED); if (likely(skb->ip_summed == CHECKSUM_PARTIAL)) { tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM | MLX4_WQE_CTRL_TCP_UDP_CSUM); - ring->tx_csum++; + priv->port_stats.tx_chksum_offload++; } if (unlikely(priv->validate_loopback)) { @@ -784,42 +829,27 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) ring->prod += nr_txbb; /* If we used a bounce buffer then copy descriptor back into place */ - if (bounce) + if (tx_desc == (struct mlx4_en_tx_desc *) ring->bounce_buf) tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size); /* Run destructor before passing skb to HW */ if (likely(!skb_shared(skb))) skb_orphan(skb); - if (ring->bf_enabled && desc_size <= MAX_BF && !bounce && !vlan_tag) { - *(u32 *) (&tx_desc->ctrl.vlan_tag) |= ring->doorbell_qpn; - op_own |= htonl((bf_index & 0xffff) << 8); - /* Ensure new descirptor hits memory - * before setting ownership of this descriptor to HW */ - wmb(); - tx_desc->ctrl.owner_opcode = op_own; - - wmb(); - - mlx4_bf_copy(ring->bf.reg + ring->bf.offset, (unsigned long *) &tx_desc->ctrl, - desc_size); - - wmb(); + /* Ensure new descirptor hits memory + * before setting ownership of this descriptor to HW */ + wmb(); + tx_desc->ctrl.owner_opcode = op_own; - ring->bf.offset ^= ring->bf.buf_size; - } else { - /* Ensure new descirptor hits memory - * before setting ownership of this descriptor to HW */ - wmb(); - tx_desc->ctrl.owner_opcode = op_own; - wmb(); - writel(ring->doorbell_qpn, ring->bf.uar->map + MLX4_SEND_DOORBELL); - } + /* Ring doorbell! */ + wmb(); + writel(ring->doorbell_qpn, mdev->uar_map + MLX4_SEND_DOORBELL); + dev->trans_start = jiffies; /* Poll CQ here */ mlx4_en_xmit_poll(priv, tx_ind); - return NETDEV_TX_OK; + return 0; tx_drop: dev_kfree_skb_any(skb); diff --git a/drivers/net/mlx4/eq.c b/drivers/net/mlx4/eq.c index 1ad1f6029af80..3b3566ac22945 100644 --- a/drivers/net/mlx4/eq.c +++ b/drivers/net/mlx4/eq.c @@ -31,8 +31,8 @@ * SOFTWARE. 
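/*
 * Minimal sketch of the flow hash used by mlx4_en_select_queue() in the
 * en_tx.c hunk above to spread TCP/UDP flows over the hash-selected Tx
 * rings.  The mask and ring count below are illustrative assumptions, not
 * the driver's definitions, and the ports are taken in host byte order for
 * simplicity (XOR commutes with the byte swap done in the driver).
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_TX_HASH_MASK	0xff	/* assumed: hash table size - 1 */
#define DEMO_NUM_HASH_RINGS	8	/* assumed number of hash rings */

static unsigned int demo_flow_hash(uint32_t daddr, uint16_t sport,
				   uint16_t dport, int is_tcp)
{
	unsigned int hash = daddr & DEMO_TX_HASH_MASK;

	if (is_tcp)	/* UDP keeps the destination-address-only hash */
		hash = (hash ^ (sport ^ dport)) & DEMO_TX_HASH_MASK;
	return hash;
}

int main(void)
{
	/* e.g. TCP flow 192.168.0.10:45000 -> :80 */
	unsigned int h = demo_flow_hash(0xc0a8000a, 45000, 80, 1);

	/* A new flow first maps into the lower half of the hash rings; the
	 * driver may later bump it to the upper half for small-packet flows. */
	printf("hash index %u -> base ring %u\n",
	       h, h & (DEMO_NUM_HASH_RINGS / 2 - 1));
	return 0;
}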
*/ +#include #include -#include #include #include @@ -41,9 +41,7 @@ #include "mlx4.h" #include "fw.h" -enum { - MLX4_IRQNAME_SIZE = 32 -}; +extern int enable_entropy; enum { MLX4_NUM_ASYNC_EQE = 0x100, @@ -51,30 +49,6 @@ enum { MLX4_EQ_ENTRY_SIZE = 0x20 }; -/* - * Must be packed because start is 64 bits but only aligned to 32 bits. - */ -struct mlx4_eq_context { - __be32 flags; - u16 reserved1[3]; - __be16 page_offset; - u8 log_eq_size; - u8 reserved2[4]; - u8 eq_period; - u8 reserved3; - u8 eq_max_count; - u8 reserved4[3]; - u8 intr; - u8 log_page_size; - u8 reserved5[2]; - u8 mtt_base_addr_h; - __be32 mtt_base_addr_l; - u32 reserved6[2]; - __be32 consumer_index; - __be32 producer_index; - u32 reserved7[4]; -}; - #define MLX4_EQ_STATUS_OK ( 0 << 28) #define MLX4_EQ_STATUS_WRITE_FAIL (10 << 28) #define MLX4_EQ_OWNER_SW ( 0 << 24) @@ -99,46 +73,25 @@ struct mlx4_eq_context { (1ull << MLX4_EVENT_TYPE_SRQ_CATAS_ERROR) | \ (1ull << MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE) | \ (1ull << MLX4_EVENT_TYPE_SRQ_LIMIT) | \ - (1ull << MLX4_EVENT_TYPE_CMD)) - -struct mlx4_eqe { - u8 reserved1; - u8 type; - u8 reserved2; - u8 subtype; - union { - u32 raw[6]; - struct { - __be32 cqn; - } __packed comp; - struct { - u16 reserved1; - __be16 token; - u32 reserved2; - u8 reserved3[3]; - u8 status; - __be64 out_param; - } __packed cmd; - struct { - __be32 qpn; - } __packed qp; - struct { - __be32 srqn; - } __packed srq; - struct { - __be32 cqn; - u32 reserved1; - u8 reserved2[3]; - u8 syndrome; - } __packed cq_err; - struct { - u32 reserved1[2]; - __be32 port; - } __packed port_change; - } event; - u8 reserved3[3]; - u8 owner; -} __packed; + (1ull << MLX4_EVENT_TYPE_CMD) | \ + (1ull << MLX4_EVENT_TYPE_VEP_UPDATE) | \ + (1ull << MLX4_EVENT_TYPE_MAC_UPDATE) | \ + (1ull << MLX4_EVENT_TYPE_COMM_CHANNEL) | \ + (1ull << MLX4_EVENT_TYPE_FLR_EVENT) | \ + (1ull << MLX4_EVENT_TYPE_SW_EVENT)) + +#define OUT_MAD_IFC_DATA_OFFSET 64 /* in bytes */ +#define PORT_MGMT_CHANGE_EV_CAP_BIT 59 + +/* register to port management changed event only if internal sma is supported + * and the event can be generated. */ +static u64 set_port_mgmt_changed_bit(struct mlx4_dev *dev) +{ + if ((dev->caps.flags & (1ull << PORT_MGMT_CHANGE_EV_CAP_BIT)) && dev->is_internal_sma) + return (1ull << MLX4_EVENT_TYPE_PORT_MGMT_CHANGE); + else + return 0; +} static void eq_set_ci(struct mlx4_eq *eq, int req_not) { @@ -161,13 +114,474 @@ static struct mlx4_eqe *next_eqe_sw(struct mlx4_eq *eq) return !!(eqe->owner & 0x80) ^ !!(eq->cons_index & eq->nent) ? NULL : eqe; } +static struct mlx4_eqe *next_slave_event_eqe(struct mlx4_slave_event_eq *slave_eq) +{ + struct mlx4_eqe *eqe = + &slave_eq->event_eqe[slave_eq->cons & (SLAVE_EVENT_EQ_SIZE - 1)]; + return (!!(eqe->owner & 0x80) ^ !!(slave_eq->cons & SLAVE_EVENT_EQ_SIZE)) ? 
+ eqe : NULL; +} +void mlx4_gen_slave_eqe(struct work_struct *work) +{ + struct mlx4_mfunc_master_ctx *master = container_of(work, + struct mlx4_mfunc_master_ctx, + slave_event_work); + struct mlx4_mfunc *mfunc = container_of(master, struct mlx4_mfunc, master); + struct mlx4_priv *priv = container_of(mfunc, struct mlx4_priv, mfunc); + struct mlx4_dev *dev = &priv->dev; + struct mlx4_slave_event_eq *slave_eq = &mfunc->master.slave_eq; + struct mlx4_eqe *eqe; + u8 slave; + int i; + + for (eqe = next_slave_event_eqe(slave_eq); eqe; + eqe = next_slave_event_eqe(slave_eq)) { + slave = eqe->slave_id; + + /* All active slaves need to receive the event */ + if (slave == ALL_SLAVES) { + for (i = 0; i < dev->num_slaves; i++) { + if (i != dev->caps.function && master->slave_state[i].active) + if (mlx4_GEN_EQE(dev, i, eqe)) + mlx4_warn(dev, "Failed to generate event " + "for slave %d\n", i); + } + } else { + if (mlx4_GEN_EQE(dev, slave, eqe)) + mlx4_warn(dev, "Failed to generate event " + "for slave %d\n", slave); + } + ++slave_eq->cons; + } +} + + +static void slave_event(struct mlx4_dev *dev, u8 slave, struct mlx4_eqe *eqe) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_event_eq *slave_eq = &priv->mfunc.master.slave_eq; + struct mlx4_eqe *s_eqe = NULL; + unsigned long flags; + + spin_lock_irqsave(&slave_eq->event_lock, flags); + s_eqe = &slave_eq->event_eqe[slave_eq->prod & (SLAVE_EVENT_EQ_SIZE - 1)]; + if ((!!(s_eqe->owner & 0x80)) ^ (!!(slave_eq->prod & SLAVE_EVENT_EQ_SIZE))) { + mlx4_warn(dev, "Master failed to generate an EQE for slave: %d. " + "No free EQE on slave events queue\n", slave); + spin_unlock_irqrestore(&slave_eq->event_lock, flags); + return; + } + + memcpy(s_eqe, eqe, sizeof(struct mlx4_eqe) - 1); + s_eqe->slave_id = slave; + /* ensure all information is written before setting the ownersip bit */ + wmb(); + s_eqe->owner = !!(slave_eq->prod & SLAVE_EVENT_EQ_SIZE) ? 
0x0 : 0x80; + ++slave_eq->prod; + + queue_work(priv->mfunc.master.comm_wq, &priv->mfunc.master.slave_event_work); + spin_unlock_irqrestore(&slave_eq->event_lock, flags); +} + +int mlx4_gen_pkey_eqe(struct mlx4_dev *dev, int slave, u8 port) +{ + struct mlx4_eqe eqe; + + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_slave = &priv->mfunc.master.slave_state[slave]; + + if (!s_slave->active) + return 0; + + memset(&eqe, 0, sizeof eqe); + + eqe.type = MLX4_EVENT_TYPE_SW_EVENT; + eqe.subtype = PKEY_UPDATE_AVIAL; + eqe.event.sw_event.port = port; + + return mlx4_GEN_EQE(dev, slave, &eqe); +} +EXPORT_SYMBOL(mlx4_gen_pkey_eqe); + +int mlx4_gen_guid_change_eqe(struct mlx4_dev *dev, int slave, u8 port) +{ + struct mlx4_eqe *eqe; + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_slave = &priv->mfunc.master.slave_state[slave]; + + /* don't send if we don't have the that slave or + the slave is inactive */ + if ((dev->sr_iov < slave) || (!s_slave->active)) + return 0; + + eqe = kzalloc(sizeof *eqe, GFP_KERNEL); + if (!eqe) { + mlx4_warn(dev, "Failed to allocate memory for eqe\n"); + return 1; + } + + eqe->type = MLX4_EVENT_TYPE_SW_EVENT; + eqe->subtype = GUID_CHANGE_AVIAL; + eqe->event.sw_event.port = port; + + slave_event(dev, slave, eqe); + kfree(eqe); + + return 0; +} +EXPORT_SYMBOL(mlx4_gen_guid_change_eqe); + +void mlx4_gen_port_state_change_eqe(struct mlx4_dev *dev, int slave, u8 port, u8 port_subtype_change) +{ + struct mlx4_eqe eqe; + + if (dev->sr_iov < slave) /*don't send if we don't have the that slave*/ + return; +/* + if (!s_slave->active) { + //mlx4_warn(dev, "Trying to pass event to inactive slave\n"); + return 0; + } +*/ + memset(&eqe, 0, sizeof eqe); + + eqe.type = MLX4_EVENT_TYPE_PORT_CHANGE; + eqe.subtype = port_subtype_change; + eqe.event.port_change.port = cpu_to_be32(port << 28); + + mlx4_dbg(dev, "%s: sending: %d to slave: %d on port: %d\n", __func__, port_subtype_change, slave, port); + slave_event(dev, slave, &eqe); +} +EXPORT_SYMBOL(mlx4_gen_port_state_change_eqe); + +enum slave_port_state mlx4_get_slave_port_state(struct mlx4_dev *dev, int slave, u8 port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_state = priv->mfunc.master.slave_state; + if (slave >= dev->num_slaves || port > MLX4_MAX_PORTS) { + printk(KERN_ERR "%s: BUG!!! asking for slave:%d, port:%d\n", __func__, slave, port); + return SLAVE_PORT_DOWN; + } + return s_state[slave].port_state[port]; +} +EXPORT_SYMBOL(mlx4_get_slave_port_state); + +static int mlx4_set_slave_port_state(struct mlx4_dev *dev, int slave, u8 port, enum slave_port_state state) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_state = priv->mfunc.master.slave_state; + + if (slave >= dev->num_slaves || port > MLX4_MAX_PORTS || port == 0) { + printk(KERN_ERR "%s: BUG!!! asking for slave:%d, port:%d\n", __func__, slave, port); + return -1; + } + s_state[slave].port_state[port] = state; + + return 0; +} + +void set_all_slave_state(struct mlx4_dev *dev, u8 port, int event) +{ + int i; + enum slave_port_gen_event gen_event; + + for (i = 0; i < dev->num_slaves; i++) + set_and_calc_slave_port_state(dev, i, port, event, &gen_event); +} +/************************************************************************** + The function get as input the new event to that port, + and according to the prev state change the slave's port state. 
+ The events are: + MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN, + MLX4_PORT_STATE_DEV_EVENT_PORT_UP + MLX4_PORT_STATE_IB_EVENT_GID_VALID + MLX4_PORT_STATE_IB_EVENT_GID_INVALID +***************************************************************************/ +int set_and_calc_slave_port_state(struct mlx4_dev *dev, int slave, u8 port, int event, enum slave_port_gen_event* gen_event) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *ctx = NULL; + unsigned long flags; + int ret = -1; + enum slave_port_state cur_state = mlx4_get_slave_port_state(dev, slave, port); + + *gen_event = SLAVE_PORT_GEN_EVENT_NONE; + + if (slave >= dev->num_slaves || port > MLX4_MAX_PORTS || port == 0) { + printk(KERN_ERR "%s: BUG!!! asking for slave:%d, port:%d\n", __func__, slave, port); + return ret; + } + + ctx = &priv->mfunc.master.slave_state[slave]; + spin_lock_irqsave(&ctx->lock, flags); + + mlx4_dbg(dev, "%s: slave: %d, current state: %d new event :%d\n", __func__, slave, cur_state, event); + + switch (cur_state) { + case SLAVE_PORT_DOWN: + if (MLX4_PORT_STATE_DEV_EVENT_PORT_UP == event) + mlx4_set_slave_port_state(dev, slave, port, SLAVE_PENDING_UP); + break; + case SLAVE_PENDING_UP: + if (MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN == event) + mlx4_set_slave_port_state(dev, slave, port, SLAVE_PORT_DOWN); + + else if (MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID == event) { + mlx4_set_slave_port_state(dev, slave, port, SLAVE_PORT_UP); + *gen_event = SLAVE_PORT_GEN_EVENT_UP; + } + + break; + case SLAVE_PORT_UP: + if (MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN == event) { + mlx4_set_slave_port_state(dev, slave, port, SLAVE_PORT_DOWN); + *gen_event = SLAVE_PORT_GEN_EVENT_DOWN; + } + else if (MLX4_PORT_STATE_IB_EVENT_GID_INVALID == event) { + mlx4_set_slave_port_state(dev, slave, port, SLAVE_PENDING_UP); + *gen_event = SLAVE_PORT_GEN_EVENT_DOWN; + } + + break; + default: + printk(KERN_ERR "%s: BUG!!! 
UNKNOWN state: " + "slave:%d, port:%d\n", __func__, slave, port); + goto out; + } + ret = mlx4_get_slave_port_state(dev, slave, port); + mlx4_dbg(dev, "%s: slave: %d, current state: %d new event :%d gen_event: %d\n", + __func__, slave, cur_state, event, *gen_event); + +out: + spin_unlock_irqrestore(&ctx->lock, flags); + return ret; +} + +EXPORT_SYMBOL(set_and_calc_slave_port_state); + +int mlx4_gen_all_sw_eqe(struct mlx4_dev *dev, u8 port, int avial) +{ + struct mlx4_eqe eqe; + + memset(&eqe, 0, sizeof eqe); + + eqe.type = MLX4_EVENT_TYPE_SW_EVENT; + eqe.subtype = avial; + eqe.event.sw_event.port = port; + + slave_event(dev, ALL_SLAVES, &eqe); + return 0; +} +EXPORT_SYMBOL(mlx4_gen_all_sw_eqe); + +static void mlx4_slave_event(struct mlx4_dev *dev, int slave, struct mlx4_eqe* eqe) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_slave = &priv->mfunc.master.slave_state[slave]; + + if (!s_slave->active) { + /*mlx4_warn(dev, "Trying to pass event to inactive slave\n");*/ + return; + } + + slave_event(dev, slave, eqe); +} + +int mlx4_GET_EVENT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *ctx = &priv->mfunc.master.slave_state[slave]; + unsigned long flags; + + spin_lock_irqsave(&ctx->lock, flags); + if (ctx->eq_ci == ctx->eq_pi) { + vhcr->out_param = MLX4_EVENT_TYPE_NONE; + } else if ((u16) (ctx->eq_pi - ctx->eq_ci) > MLX4_MFUNC_MAX_EQES) { + ctx->eq_ci = ctx->eq_pi - MLX4_MFUNC_MAX_EQES; + vhcr->out_param = MLX4_EVENT_TYPE_EQ_OVERFLOW; + } else { + vhcr->out_param = ctx->eq[ctx->eq_ci & MLX4_MFUNC_EQE_MASK].type | + ((u64) ctx->eq[ctx->eq_ci & MLX4_MFUNC_EQE_MASK].port << 8) | + ((u64) ctx->eq[ctx->eq_ci & MLX4_MFUNC_EQE_MASK].param << 32); + ++ctx->eq_ci; + } + spin_unlock_irqrestore(&ctx->lock, flags); + return 0; +} + +void mlx4_update_vep_config(struct work_struct *work) +{ + struct mlx4_mfunc_master_ctx *master = container_of(work, + struct mlx4_mfunc_master_ctx, + vep_config_work); + struct mlx4_mfunc *mfunc = container_of(master, struct mlx4_mfunc, master); + struct mlx4_priv *priv = container_of(mfunc, struct mlx4_priv, mfunc); + struct mlx4_dev *dev = &priv->dev; + struct mlx4_vep_cfg vep_cfg; + struct mlx4_eqe new_eqe; + int vep_num; + int port; + int i; + bool port_updated[MLX4_MAX_PORTS + 1] = {false}; + u16 vep_config_map; + + spin_lock_irq(&mfunc->master.vep_config_lock); + vep_config_map = mfunc->master.vep_config_bitmap; + mfunc->master.vep_config_bitmap = 0; + spin_unlock_irq(&mfunc->master.vep_config_lock); + + while (vep_config_map) { + for (vep_num = 0; vep_num < 16; vep_num++) { + if (!(vep_config_map & (1 << vep_num))) + continue; + + port = (vep_num & 0x1) + 1; + port_updated[port] = true; + + if (mlx4_QUERY_VEP_CFG(dev, vep_num, &vep_cfg)) { + mlx4_warn(dev, "failed to read VEP configuration " + "for function %d\n", vep_num); + continue; + } + if (vep_cfg.link != mfunc->master.slave_state[vep_num].vep_cfg.link) { + new_eqe.type = MLX4_EVENT_TYPE_PORT_CHANGE; + new_eqe.event.port_change.port = cpu_to_be32(port << 28); + new_eqe.subtype = vep_cfg.link ? + MLX4_PORT_CHANGE_SUBTYPE_ACTIVE : + MLX4_PORT_CHANGE_SUBTYPE_DOWN; + if (priv->link_up[port]) { + if (vep_num == dev->caps.function) + mlx4_dispatch_event(dev, vep_cfg.link ? 
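/*
 * Standalone sketch of the per-slave port state machine implemented by
 * set_and_calc_slave_port_state() in the hunk above.  The enum names are
 * local to this example; only the transitions mirror the patch.
 */
#include <stdio.h>

enum demo_state { DEMO_DOWN, DEMO_PENDING_UP, DEMO_UP };
enum demo_event { DEMO_EV_PORT_DOWN, DEMO_EV_PORT_UP,
		  DEMO_EV_GID_VALID, DEMO_EV_GID_INVALID };
enum demo_gen   { DEMO_GEN_NONE, DEMO_GEN_UP, DEMO_GEN_DOWN };

static enum demo_state demo_next(enum demo_state cur, enum demo_event ev,
				 enum demo_gen *gen)
{
	*gen = DEMO_GEN_NONE;

	switch (cur) {
	case DEMO_DOWN:
		if (ev == DEMO_EV_PORT_UP)
			return DEMO_PENDING_UP;	/* wait for a valid GID */
		break;
	case DEMO_PENDING_UP:
		if (ev == DEMO_EV_PORT_DOWN)
			return DEMO_DOWN;
		if (ev == DEMO_EV_GID_VALID) {
			*gen = DEMO_GEN_UP;	/* caller may now report port-up */
			return DEMO_UP;
		}
		break;
	case DEMO_UP:
		if (ev == DEMO_EV_PORT_DOWN) {
			*gen = DEMO_GEN_DOWN;
			return DEMO_DOWN;
		}
		if (ev == DEMO_EV_GID_INVALID) {
			*gen = DEMO_GEN_DOWN;	/* GID invalidated, link still up */
			return DEMO_PENDING_UP;
		}
		break;
	}
	return cur;	/* any other event: no transition */
}

int main(void)
{
	enum demo_gen gen;
	enum demo_state s = DEMO_DOWN;

	s = demo_next(s, DEMO_EV_PORT_UP, &gen);	/* -> PENDING_UP */
	s = demo_next(s, DEMO_EV_GID_VALID, &gen);	/* -> UP, gen == DEMO_GEN_UP */
	printf("state=%d gen=%d\n", s, gen);
	return 0;
}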
+ MLX4_DEV_EVENT_PORT_UP : + MLX4_DEV_EVENT_PORT_DOWN, + (unsigned long)port); + else + mlx4_slave_event(dev, vep_num, + &new_eqe); + } + mfunc->master.slave_state[vep_num].vep_cfg.link = vep_cfg.link; + + } + + if (vep_cfg.mac != mfunc->master.slave_state[vep_num].vep_cfg.mac) { + mfunc->master.slave_state[vep_num].vep_cfg.mac = vep_cfg.mac; + if (vep_num == dev->caps.function) { + dev->caps.def_mac[port] = vep_cfg.mac; + mlx4_dispatch_event(dev, + MLX4_EVENT_TYPE_MAC_UPDATE, + (unsigned long)port); + } else { + new_eqe.type = MLX4_EVENT_TYPE_MAC_UPDATE; + new_eqe.event.mac_update.port = port; + new_eqe.event.mac_update.mac = cpu_to_be64(vep_cfg.mac); + mlx4_slave_event(dev, vep_num, &new_eqe); + } + } + } + spin_lock_irq(&mfunc->master.vep_config_lock); + vep_config_map = mfunc->master.vep_config_bitmap; + mfunc->master.vep_config_bitmap = 0; + spin_unlock_irq(&mfunc->master.vep_config_lock); + } + for (i = 1; i <= dev->caps.num_ports; i++) { + if (port_updated[i]) + mlx4_update_uplink_arbiter(dev, i); + } +} + +void mlx4_master_handle_slave_flr(struct work_struct *work) +{ + struct mlx4_mfunc_master_ctx *master = container_of(work, + struct mlx4_mfunc_master_ctx, + slave_flr_event_work); + struct mlx4_mfunc *mfunc = container_of(master, struct mlx4_mfunc, master); + struct mlx4_priv *priv = container_of(mfunc, struct mlx4_priv, mfunc); + struct mlx4_dev *dev = &priv->dev; + struct mlx4_slave_state *slave_state = priv->mfunc.master.slave_state; + int i; + int err; + + mlx4_dbg(dev, "mlx4_handle_slave_flr\n"); + + for (i = 0 ; i < dev->num_slaves; i++) { + + if (MLX4_COMM_CMD_FLR == slave_state[i].last_cmd) { + mlx4_dbg(dev, "mlx4_handle_slave_flr: clean slave: %d\n", i); + + mlx4_delete_all_resources_for_slave(dev, i); + /*return the slave to running mode*/ + spin_lock(&priv->mfunc.master.slave_state_lock); + slave_state[i].last_cmd = MLX4_COMM_CMD_RESET; + slave_state[i].is_slave_going_down = 0; + spin_unlock(&priv->mfunc.master.slave_state_lock); + /*notify the FW:*/ + err = mlx4_cmd(dev, 0, i, 0, MLX4_CMD_INFORM_FLR_DONE, MLX4_CMD_TIME_CLASS_A, 0); + if (err) + mlx4_warn(dev, "Failed to notify FW on FLR done (slave:%d)\n", i); + } + } +} + +int mlx4_GET_PKEY_TABLE(struct mlx4_dev *dev, u8 port, u8 table[]) +{ + struct mlx4_cmd_mailbox *mailbox; + u8 *outbox; + int err; + int i; + + if (mlx4_is_master(dev)) { + /* master has the identity phys-pkey to virt-pkey mapping */ + for (i = 0; i < dev->caps.pkey_table_len[port]; ++i) + table[i] = i; + return 0; + } + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + err = mlx4_cmd_box(dev, 0, mailbox->dma, port, 0, MLX4_CMD_GET_PKEY_TABLE, + MLX4_CMD_TIME_CLASS_A, 0); + if (err) + goto out; + + mlx4_dbg(dev, "port = %d, pkey table len = %d\n", + port, dev->caps.pkey_table_len[port]); + for (i = 0; i < dev->caps.pkey_table_len[port]; ++i) { + table[i] = outbox[i]; + mlx4_dbg(dev, "pkey index %d maps to port pkey at %d\n", + i, table[i]); + } + +out: + if (!mlx4_is_master(dev)) + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_GET_PKEY_TABLE); + static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) { + struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_eqe *eqe; + u64 mac; int cqn; int eqes_found = 0; int set_ci = 0; int port; + int slave; + int ret; + int i; + enum slave_port_state slave_port_state; + enum slave_port_gen_event gen_event; + u32 flr_slave; + u8 vep_num; + u8 update_slave_state; while ((eqe = next_eqe_sw(eq))) { /* @@ 
-190,14 +604,58 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) case MLX4_EVENT_TYPE_PATH_MIG_FAILED: case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR: case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR: - mlx4_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff, - eqe->type); + mlx4_sdbg("event %d arrived\n", eqe->type); + if (mlx4_is_mfunc(dev) && mlx4_is_master(dev)) { + /* forward only to slave owning the QP */ + ret = mlx4_get_slave_from_resource_id(dev, RES_QP, + be32_to_cpu(eqe->event.qp.qpn) + & 0xffffff, &slave); + if (ret && ret != -ENOENT) { + mlx4_dbg(dev, "QP event %02x(%02x) on EQ %d at index %u:" + "could not get slave id (%d)\n", + eqe->type, eqe->subtype, + eq->eqn, eq->cons_index, ret); + break; + } + + if (!ret && slave != dev->caps.function) { + mlx4_slave_event(dev, slave, eqe); + break; + } + + } + mlx4_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & + 0xffffff, eqe->type); break; case MLX4_EVENT_TYPE_SRQ_LIMIT: + mlx4_warn(dev, "%s: MLX4_EVENT_TYPE_SRQ_LIMIT\n", __func__); case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR: - mlx4_srq_event(dev, be32_to_cpu(eqe->event.srq.srqn) & 0xffffff, - eqe->type); + if (mlx4_is_mfunc(dev) && mlx4_is_master(dev)) { + /* forward only to slave owning the SRQ */ + ret = mlx4_get_slave_from_resource_id(dev, RES_SRQ, + be32_to_cpu(eqe->event.srq.srqn) + & 0xffffff, &slave); + if (ret && ret != -ENOENT) { + mlx4_warn(dev, "SRQ event %02x(%02x) on EQ %d at index %u:" + "could not get slave id (%d)\n", + eqe->type, eqe->subtype, + eq->eqn, eq->cons_index, ret); + break; + } + mlx4_warn(dev, "%s: slave:%d, srq_no:0x%x, event: %02x(%02x)\n", + __func__, slave, be32_to_cpu(eqe->event.srq.srqn), + eqe->type, eqe->subtype); + + if (!ret && slave != dev->caps.function) { + mlx4_warn(dev, "%s: sending event %02x(%02x) to slave:%d\n", + __func__, eqe->type, eqe->subtype, slave); + mlx4_slave_event(dev, slave, eqe); + break; + } + } + mlx4_srq_event(dev, be32_to_cpu(eqe->event.srq.srqn) & + 0xffffff, eqe->type); break; case MLX4_EVENT_TYPE_CMD: @@ -210,13 +668,46 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) case MLX4_EVENT_TYPE_PORT_CHANGE: port = be32_to_cpu(eqe->event.port_change.port) >> 28; if (eqe->subtype == MLX4_PORT_CHANGE_SUBTYPE_DOWN) { + priv->link_up[port] = false; mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_DOWN, - port); + (unsigned long)port); mlx4_priv(dev)->sense.do_sense_port[port] = 1; + if (mlx4_is_mfunc(dev) && mlx4_is_master(dev)) + /*change the state of all slave's port to down:*/ + for (i = 0; i < dev->num_slaves; i++) { + slave_port_state = mlx4_get_slave_port_state(dev, i, port); + set_and_calc_slave_port_state(dev, i, port, MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN, &gen_event); + /*we can be in pending state, than do not send port_down event*/ + if (SLAVE_PORT_GEN_EVENT_DOWN == gen_event) { + mlx4_dbg(dev, "%s: Sending MLX4_PORT_CHANGE_SUBTYPE_DOWN to slave: %d, port:%d\n", + __func__, i, port); + if (i == dev->caps.function) + continue; + mlx4_slave_event(dev, i, eqe); + } + } } else { - mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_UP, - port); + priv->link_up[port] = true; + /* Link UP event is acceptable only in case VEP link is enabled*/ + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_UP, port); mlx4_priv(dev)->sense.do_sense_port[port] = 0; + + if (mlx4_is_mfunc(dev) && mlx4_is_master(dev)) { + set_all_slave_state(dev, port, MLX4_DEV_EVENT_PORT_UP); + /* NO SUPPORT IN EN !!!!!! 
+ u8 vep_num; + for (i = 0; i < dev->num_slaves; i++) { + struct mlx4_slave_state *s_state = priv->mfunc.master.slave_state; + if (i == dev->caps.function || !(s_state[i].active)) + continue; + vep_num = s_state[i].pf_num; + spin_lock(&priv->mfunc.master.vep_config_lock); + if (s_state[vep_num].vep_cfg.link) + mlx4_slave_event(dev, i, eqe); + spin_unlock(&priv->mfunc.master.vep_config_lock); + } + */ + } } break; @@ -225,7 +716,24 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) eqe->event.cq_err.syndrome == 1 ? "overrun" : "access violation", be32_to_cpu(eqe->event.cq_err.cqn) & 0xffffff); - mlx4_cq_event(dev, be32_to_cpu(eqe->event.cq_err.cqn), + if (mlx4_is_mfunc(dev) && mlx4_is_master(dev)) { + ret = mlx4_get_slave_from_resource_id(dev, RES_CQ, + be32_to_cpu(eqe->event.cq_err.cqn) + & 0xffffff, &slave); + if (ret && ret != -ENOENT) { + mlx4_dbg(dev, "CQ event %02x(%02x) on EQ %d at index %u:" + "could not get slave id (%d)\n", + eqe->type, eqe->subtype, + eq->eqn, eq->cons_index, ret); + break; + } + + if (!ret && slave != dev->caps.function) { + mlx4_slave_event(dev, slave, eqe); + break; + } + } + mlx4_cq_event(dev, be32_to_cpu(eqe->event.cq_err.cqn) & 0xffffff, eqe->type); break; @@ -233,13 +741,103 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) mlx4_warn(dev, "EQ overrun on EQN %d\n", eq->eqn); break; + case MLX4_EVENT_TYPE_COMM_CHANNEL: + if (!mlx4_is_mfunc(dev) || + (mlx4_is_mfunc(dev) && !mlx4_is_master(dev))) { + mlx4_warn(dev, "Received comm channel event " + "for non master device\n"); + break; + } + memcpy(&priv->mfunc.master.comm_arm_bit_vector, + eqe->event.comm_channel_arm.bit_vec, + sizeof eqe->event.comm_channel_arm.bit_vec); + queue_work(priv->mfunc.master.comm_wq, + &priv->mfunc.master.comm_work); + break; + + case MLX4_EVENT_TYPE_MAC_UPDATE: + port = eqe->event.mac_update.port; + mac = be64_to_cpu(eqe->event.mac_update.mac); + dev->caps.def_mac[port] = mac; + mlx4_dispatch_event(dev, MLX4_EVENT_TYPE_MAC_UPDATE, port); + break; + + case MLX4_EVENT_TYPE_VEP_UPDATE: + if (!mlx4_is_mfunc(dev) || + (mlx4_is_mfunc(dev) && !mlx4_is_master(dev))) { + mlx4_warn(dev, "Non-master function received" + "VEP_UPDATE event\n"); + break; + } + vep_num = eqe->event.vep_config.vep_num; + spin_lock(&priv->mfunc.master.vep_config_lock); + priv->mfunc.master.vep_config_bitmap |= 1 << vep_num; + spin_unlock(&priv->mfunc.master.vep_config_lock); + queue_work(priv->mfunc.master.comm_wq, &priv->mfunc.master.vep_config_work); + break; + + case MLX4_EVENT_TYPE_SW_EVENT: + mlx4_dbg(dev, "got SW event, subtype = %d\n", eqe->subtype); + if (eqe->subtype == PKEY_UPDATE_AVIAL) + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PKEY_UPDATE, + eqe->event.sw_event.port); + else if (eqe->subtype == GUID_CHANGE_AVIAL) + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_GUID_CHANGE, + eqe->event.sw_event.port); + else if (eqe->subtype == LID_CHANGE_AVIAL) + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_LID_CHANGE, + eqe->event.sw_event.port); + else if (eqe->subtype == CLIENT_REREGISTER_AVIAL) + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CLIENT_REREGISTER, + eqe->event.sw_event.port); + break; + + case MLX4_EVENT_TYPE_FLR_EVENT: + flr_slave = be32_to_cpu(eqe->event.flr_event.slave_id); + if (!mlx4_is_master(dev)) { + mlx4_warn(dev, "Non-master function received" + "FLR event\n"); + break; + } + + mlx4_dbg(dev, "FLR event for slave: %d\n", flr_slave); + + if (flr_slave >= dev->num_slaves) { + mlx4_warn(dev, "Got FLR for unknown function: %d\n", flr_slave); + update_slave_state = 0; + } else + 
update_slave_state = 1; + + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_SLAVE_SHUTDOWN, flr_slave); + spin_lock(&priv->mfunc.master.slave_state_lock); + if(update_slave_state) { + priv->mfunc.master.slave_state[flr_slave].active = false; + priv->mfunc.master.slave_state[flr_slave].last_cmd = MLX4_COMM_CMD_FLR; + priv->mfunc.master.slave_state[flr_slave].is_slave_going_down = 1; + } + spin_unlock(&priv->mfunc.master.slave_state_lock); + queue_work(priv->mfunc.master.comm_wq, &priv->mfunc.master.slave_flr_event_work); + break; + case MLX4_EVENT_TYPE_PORT_MGMT_CHANGE: + if (mlx4_is_mfunc(dev) && !mlx4_is_master(dev)) { + mlx4_dbg(dev, "Non-master function received" + "port mgmt change event\n"); + break; + } else + mlx4_dbg(dev, "Port Management Change event, " + "subtype = 0x%x\n", eqe->subtype); + + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_PORT_MGMT_CHANGE, (unsigned long) eqe); + + break; case MLX4_EVENT_TYPE_EEC_CATAS_ERROR: case MLX4_EVENT_TYPE_ECC_DETECT: default: - mlx4_warn(dev, "Unhandled event %02x(%02x) on EQ %d at index %u\n", - eqe->type, eqe->subtype, eq->eqn, eq->cons_index); + mlx4_warn(dev, "Unhandled event %02x(%02x) on EQ %d at index %u. owner=%x, nent=0x%x, slave=%x, ownership=%s\n", + eqe->type, eqe->subtype, eq->eqn, eq->cons_index, eqe->owner, eq->nent, eqe->slave_id, + !!(eqe->owner & 0x80) ^ !!(eq->cons_index & eq->nent) ? "HW" : "SW"); break; - } + }; ++eq->cons_index; eqes_found = 1; @@ -289,25 +887,54 @@ static irqreturn_t mlx4_msi_x_interrupt(int irq, void *eq_ptr) return IRQ_HANDLED; } +int mlx4_MAP_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_event_eq_info *event_eq = + &priv->mfunc.master.slave_state[slave].event_eq; + u32 in_modifier = vhcr->in_modifier; + u32 eqn = in_modifier & 0x1FF; + u64 in_param = vhcr->in_param; + int err = 0; + + if (slave == dev->caps.function) + err = mlx4_cmd(dev, in_param, (in_modifier & 0x80000000) | eqn, + 0, MLX4_CMD_MAP_EQ, MLX4_CMD_TIME_CLASS_B, 1); + if (!err) { + if (in_modifier >> 31) { + /* unmap */ + event_eq->event_type &= ~in_param; + } else { + event_eq->eqn = eqn; + event_eq->event_type = in_param; + } + } + return err; +} + static int mlx4_MAP_EQ(struct mlx4_dev *dev, u64 event_mask, int unmap, int eq_num) { return mlx4_cmd(dev, event_mask, (unmap << 31) | eq_num, - 0, MLX4_CMD_MAP_EQ, MLX4_CMD_TIME_CLASS_B); + 0, MLX4_CMD_MAP_EQ, MLX4_CMD_TIME_CLASS_B, 0); } static int mlx4_SW2HW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, int eq_num) { - return mlx4_cmd(dev, mailbox->dma, eq_num, 0, MLX4_CMD_SW2HW_EQ, - MLX4_CMD_TIME_CLASS_A); + return mlx4_cmd(dev, mailbox->dma | dev->caps.function, eq_num, 0, + MLX4_CMD_SW2HW_EQ, MLX4_CMD_TIME_CLASS_A, 0); } static int mlx4_HW2SW_EQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, int eq_num) { - return mlx4_cmd_box(dev, 0, mailbox->dma, eq_num, 0, MLX4_CMD_HW2SW_EQ, - MLX4_CMD_TIME_CLASS_A); + return mlx4_cmd_box(dev, dev->caps.function, mailbox->dma, eq_num, + 0, MLX4_CMD_HW2SW_EQ, MLX4_CMD_TIME_CLASS_A, 0); } static int mlx4_num_eq_uar(struct mlx4_dev *dev) @@ -317,8 +944,8 @@ static int mlx4_num_eq_uar(struct mlx4_dev *dev) * we need to map, take the difference of highest index and * the lowest index we'll use and add 1. 
*/ - return (dev->caps.num_comp_vectors + 1 + dev->caps.reserved_eqs + - dev->caps.comp_pool)/4 - dev->caps.reserved_eqs/4 + 1; + return (dev->caps.num_comp_vectors + 1 + dev->caps.reserved_eqs) / 4 - + dev->caps.reserved_eqs / 4 + 1; } static void __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev, struct mlx4_eq *eq) @@ -399,7 +1026,8 @@ static int mlx4_create_eq(struct mlx4_dev *dev, int nent, goto err_out_free_eq; } - err = mlx4_mtt_init(dev, npages, PAGE_SHIFT, &eq->mtt); + err = mlx4_mtt_init(dev, npages, PAGE_SHIFT, &eq->mtt, + MLX4_MR_FLAG_NONE); if (err) goto err_out_free_eq; @@ -432,7 +1060,7 @@ static int mlx4_create_eq(struct mlx4_dev *dev, int nent, return err; err_out_free_mtt: - mlx4_mtt_cleanup(dev, &eq->mtt); + mlx4_mtt_cleanup(dev, &eq->mtt, MLX4_MR_FLAG_NONE); err_out_free_eq: mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn); @@ -475,16 +1103,16 @@ static void mlx4_free_eq(struct mlx4_dev *dev, mlx4_dbg(dev, "Dumping EQ context %02x:\n", eq->eqn); for (i = 0; i < sizeof (struct mlx4_eq_context) / 4; ++i) { if (i % 4 == 0) - pr_cont("[%02x] ", i * 4); - pr_cont(" %08x", be32_to_cpup(mailbox->buf + i * 4)); + printk("[%02x] ", i * 4); + printk(" %08x", be32_to_cpup(mailbox->buf + i * 4)); if ((i + 1) % 4 == 0) - pr_cont("\n"); + printk("\n"); } } - mlx4_mtt_cleanup(dev, &eq->mtt); + mlx4_mtt_cleanup(dev, &eq->mtt, MLX4_MR_FLAG_NONE); for (i = 0; i < npages; ++i) - pci_free_consistent(dev->pdev, PAGE_SIZE, + dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, eq->page_list[i].buf, eq->page_list[i].map); @@ -496,32 +1124,16 @@ static void mlx4_free_eq(struct mlx4_dev *dev, static void mlx4_free_irqs(struct mlx4_dev *dev) { struct mlx4_eq_table *eq_table = &mlx4_priv(dev)->eq_table; - struct mlx4_priv *priv = mlx4_priv(dev); - int i, vec; + int i; if (eq_table->have_irq) free_irq(dev->pdev->irq, dev); - for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) if (eq_table->eq[i].have_irq) { free_irq(eq_table->eq[i].irq, eq_table->eq + i); eq_table->eq[i].have_irq = 0; } - for (i = 0; i < dev->caps.comp_pool; i++) { - /* - * Freeing the assigned irq's - * all bits should be 0, but we need to validate - */ - if (priv->msix_ctl.pool_bm & 1ULL << i) { - /* NO need protecting*/ - vec = dev->caps.num_comp_vectors + 1 + i; - free_irq(priv->eq_table.eq[vec].irq, - &priv->eq_table.eq[vec]); - } - } - - kfree(eq_table->irq_names); } @@ -566,40 +1178,41 @@ void mlx4_free_eq_table(struct mlx4_dev *dev) int mlx4_init_eq_table(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); + const char *dname = &dev_name(&dev->pdev->dev)[5]; int err; int i; + int interrupt_flags; - priv->eq_table.uar_map = kcalloc(sizeof *priv->eq_table.uar_map, - mlx4_num_eq_uar(dev), GFP_KERNEL); - if (!priv->eq_table.uar_map) { - err = -ENOMEM; - goto err_out_free; - } + priv->eq_table.uar_map = kcalloc(mlx4_num_eq_uar(dev), + sizeof *priv->eq_table.uar_map, GFP_KERNEL); + if (!priv->eq_table.uar_map) + return -ENOMEM; - err = mlx4_bitmap_init(&priv->eq_table.bitmap, dev->caps.num_eqs, - dev->caps.num_eqs - 1, dev->caps.reserved_eqs, 0); + err = mlx4_bitmap_init_no_mask(&priv->eq_table.bitmap, dev->caps.num_eqs, + dev->caps.reserved_eqs, 0); if (err) goto err_out_free; for (i = 0; i < mlx4_num_eq_uar(dev); ++i) priv->eq_table.uar_map[i] = NULL; - err = mlx4_map_clr_int(dev); - if (err) - goto err_out_bitmap; + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) { + err = mlx4_map_clr_int(dev); + if (err) + goto err_out_bitmap; - priv->eq_table.clr_mask = - swab32(1 << (priv->eq_table.inta_pin & 31)); - 
priv->eq_table.clr_int = priv->clr_base + - (priv->eq_table.inta_pin < 32 ? 4 : 0); + priv->eq_table.clr_mask = + swab32(1 << (priv->eq_table.inta_pin & 31)); + priv->eq_table.clr_int = priv->clr_base + + (priv->eq_table.inta_pin < 32 ? 4 : 0); + } - priv->eq_table.irq_names = - kmalloc(MLX4_IRQNAME_SIZE * (dev->caps.num_comp_vectors + 1 + - dev->caps.comp_pool), - GFP_KERNEL); + priv->eq_table.irq_names = kmalloc(32 * (dev->caps.num_comp_vectors + 1), + GFP_KERNEL); if (!priv->eq_table.irq_names) { err = -ENOMEM; - goto err_out_bitmap; + i = 0; + goto err_out_unmap; } for (i = 0; i < dev->caps.num_comp_vectors; ++i) { @@ -620,44 +1233,27 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) if (err) goto err_out_comp; - /*if additional completion vectors poolsize is 0 this loop will not run*/ - for (i = dev->caps.num_comp_vectors + 1; - i < dev->caps.num_comp_vectors + dev->caps.comp_pool + 1; ++i) { - - err = mlx4_create_eq(dev, dev->caps.num_cqs - - dev->caps.reserved_cqs + - MLX4_NUM_SPARE_EQE, - (dev->flags & MLX4_FLAG_MSI_X) ? i : 0, - &priv->eq_table.eq[i]); - if (err) { - --i; - goto err_out_unmap; - } - } - - if (dev->flags & MLX4_FLAG_MSI_X) { const char *eq_name; + if (enable_entropy) + interrupt_flags = IRQF_SAMPLE_RANDOM; + else + interrupt_flags = 0; + for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) { - if (i < dev->caps.num_comp_vectors) { - snprintf(priv->eq_table.irq_names + - i * MLX4_IRQNAME_SIZE, - MLX4_IRQNAME_SIZE, - "mlx4-comp-%d@pci:%s", i, - pci_name(dev->pdev)); - } else { - snprintf(priv->eq_table.irq_names + - i * MLX4_IRQNAME_SIZE, - MLX4_IRQNAME_SIZE, - "mlx4-async@pci:%s", - pci_name(dev->pdev)); - } + if (i < dev->caps.num_comp_vectors) + snprintf(priv->eq_table.irq_names + i * 32, 32, + "eth-mlx4-%s-%d", dname, i); + else + snprintf(priv->eq_table.irq_names + i * 32, 32, + "mlx4-%s-(async)", dname); + + eq_name = priv->eq_table.irq_names + i * 32; - eq_name = priv->eq_table.irq_names + - i * MLX4_IRQNAME_SIZE; err = request_irq(priv->eq_table.eq[i].irq, - mlx4_msi_x_interrupt, 0, eq_name, + mlx4_msi_x_interrupt, + interrupt_flags, eq_name, priv->eq_table.eq + i); if (err) goto err_out_async; @@ -665,23 +1261,26 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) priv->eq_table.eq[i].have_irq = 1; } } else { - snprintf(priv->eq_table.irq_names, - MLX4_IRQNAME_SIZE, - DRV_NAME "@pci:%s", - pci_name(dev->pdev)); + if (enable_entropy) + interrupt_flags = IRQF_SHARED | IRQF_SAMPLE_RANDOM; + else + interrupt_flags = IRQF_SHARED; + err = request_irq(dev->pdev->irq, mlx4_interrupt, - IRQF_SHARED, priv->eq_table.irq_names, dev); + interrupt_flags, DRV_NAME, dev); if (err) goto err_out_async; priv->eq_table.have_irq = 1; } - err = mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 0, - priv->eq_table.eq[dev->caps.num_comp_vectors].eqn); + /* If we are working in internal SMA - also register for port mgmt changed events */ + err = mlx4_MAP_EQ(dev, + MLX4_ASYNC_EVENT_MASK | set_port_mgmt_changed_bit(dev), + 0, priv->eq_table.eq[dev->caps.num_comp_vectors].eqn); if (err) mlx4_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n", - priv->eq_table.eq[dev->caps.num_comp_vectors].eqn, err); + priv->eq_table.eq[dev->caps.num_comp_vectors].eqn, err); for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) eq_set_ci(&priv->eq_table.eq[i], 1); @@ -692,14 +1291,15 @@ err_out_async: mlx4_free_eq(dev, &priv->eq_table.eq[dev->caps.num_comp_vectors]); err_out_comp: - i = dev->caps.num_comp_vectors - 1; + i = dev->caps.num_comp_vectors; err_out_unmap: - while (i >= 0) { - mlx4_free_eq(dev, 
&priv->eq_table.eq[i]); + while (i > 0) { --i; + mlx4_free_eq(dev, &priv->eq_table.eq[i]); } - mlx4_unmap_clr_int(dev); + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) + mlx4_unmap_clr_int(dev); mlx4_free_irqs(dev); err_out_bitmap: @@ -716,15 +1316,17 @@ void mlx4_cleanup_eq_table(struct mlx4_dev *dev) struct mlx4_priv *priv = mlx4_priv(dev); int i; - mlx4_MAP_EQ(dev, MLX4_ASYNC_EVENT_MASK, 1, - priv->eq_table.eq[dev->caps.num_comp_vectors].eqn); + mlx4_MAP_EQ(dev, + MLX4_ASYNC_EVENT_MASK | set_port_mgmt_changed_bit(dev), 1, + priv->eq_table.eq[dev->caps.num_comp_vectors].eqn); mlx4_free_irqs(dev); - for (i = 0; i < dev->caps.num_comp_vectors + dev->caps.comp_pool + 1; ++i) + for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) mlx4_free_eq(dev, &priv->eq_table.eq[i]); - mlx4_unmap_clr_int(dev); + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) + mlx4_unmap_clr_int(dev); for (i = 0; i < mlx4_num_eq_uar(dev); ++i) if (priv->eq_table.uar_map[i]) @@ -747,7 +1349,8 @@ int mlx4_test_interrupts(struct mlx4_dev *dev) err = mlx4_NOP(dev); /* When not in MSI_X, there is only one irq to check */ - if (!(dev->flags & MLX4_FLAG_MSI_X)) + if (!(dev->flags & MLX4_FLAG_MSI_X) || + (mlx4_is_mfunc(dev) && !mlx4_is_master(dev))) return err; /* A loop over all completion vectors, for each vector we will check @@ -778,65 +1381,3 @@ int mlx4_test_interrupts(struct mlx4_dev *dev) return err; } EXPORT_SYMBOL(mlx4_test_interrupts); - -int mlx4_assign_eq(struct mlx4_dev *dev, char* name, int * vector) -{ - - struct mlx4_priv *priv = mlx4_priv(dev); - int vec = 0, err = 0, i; - - spin_lock(&priv->msix_ctl.pool_lock); - for (i = 0; !vec && i < dev->caps.comp_pool; i++) { - if (~priv->msix_ctl.pool_bm & 1ULL << i) { - priv->msix_ctl.pool_bm |= 1ULL << i; - vec = dev->caps.num_comp_vectors + 1 + i; - snprintf(priv->eq_table.irq_names + - vec * MLX4_IRQNAME_SIZE, - MLX4_IRQNAME_SIZE, "%s", name); - err = request_irq(priv->eq_table.eq[vec].irq, - mlx4_msi_x_interrupt, 0, - &priv->eq_table.irq_names[vec<<5], - priv->eq_table.eq + vec); - if (err) { - /*zero out bit by fliping it*/ - priv->msix_ctl.pool_bm ^= 1 << i; - vec = 0; - continue; - /*we dont want to break here*/ - } - eq_set_ci(&priv->eq_table.eq[vec], 1); - } - } - spin_unlock(&priv->msix_ctl.pool_lock); - - if (vec) { - *vector = vec; - } else { - *vector = 0; - err = (i == dev->caps.comp_pool) ? -ENOSPC : err; - } - return err; -} -EXPORT_SYMBOL(mlx4_assign_eq); - -void mlx4_release_eq(struct mlx4_dev *dev, int vec) -{ - struct mlx4_priv *priv = mlx4_priv(dev); - /*bm index*/ - int i = vec - dev->caps.num_comp_vectors - 1; - - if (likely(i >= 0)) { - /*sanity check , making sure were not trying to free irq's - Belonging to a legacy EQ*/ - spin_lock(&priv->msix_ctl.pool_lock); - if (priv->msix_ctl.pool_bm & 1ULL << i) { - free_irq(priv->eq_table.eq[vec].irq, - &priv->eq_table.eq[vec]); - priv->msix_ctl.pool_bm &= ~(1ULL << i); - } - spin_unlock(&priv->msix_ctl.pool_lock); - } - -} -EXPORT_SYMBOL(mlx4_release_eq); - diff --git a/drivers/net/mlx4/fmr_api.h b/drivers/net/mlx4/fmr_api.h new file mode 100644 index 0000000000000..b00ae1815dc2b --- /dev/null +++ b/drivers/net/mlx4/fmr_api.h @@ -0,0 +1,102 @@ +#ifndef MLX4_FMR_API_H +#define MLX4_FMR_API_H + +#include + +enum { + FMR_PROTOCOL_KVM = 0, /* default protocol */ + FMR_PROTOCOL_XEN = 1, +}; + +/* + * Info that will be passed between FMR API module and mlx4_core driver + * It is protocol specific, each protocol will add its private data. 
+ */ +struct vpm { + u64 va; + u64 pa_logsz; + u8 info[0]; +}; + +/* + * MASTER FMR API + */ + +struct mlx4_icm_master { + u8 protocol; /* Xen/KVM/... */ + u8 vpm_info_size; /* vpm size specific to current protocol */ + u8 fmr_info_size; /* key size used by protocol during init */ + u8 log_page_size; /* page size used by page allocation */ + + /* Called by each HCA device on load */ + int (*init)(struct pci_dev *ppf, void **ppf_ctx); + + /* Called each time a new vf registers with the ppf */ + int (*add_function)(void *ppf_ctx, struct pci_dev *vf, u8 *fmr_info, + void **vf_ctx); + + /* Called each time a vf unregisters from the ppf */ + int (*del_function)(void *vf_ctx); + + /* Map pages using info from vpm and return a ctx handle */ + dma_addr_t (*dma_map)(void *vf_ctx, struct vpm *vpm, + void **vpm_ctx); + + /* Unmap pages based on ctx handle */ + int (*dma_unmap)(void *vpm_ctx); + + /* Called by each HCA before unload */ + void (*term)(void *ppf_ctx); +}; + +/* + * Master FMR API calls this method on load to register callbacks + * Note: The mlx4_core module is loaded, but it is possible that + * pci probe has not yet been called. + */ +int mlx4_reg_icm_master(struct mlx4_icm_master *master); + +/* + * Master FMR API calls this method before unload + * Note: The module should keep a reference count; if it + * is still in use, the unload will not be allowed + */ +int mlx4_unreg_icm_master(struct mlx4_icm_master *master); + +/* + * SLAVE FMR API + */ + +struct mlx4_icm_slave { + u8 protocol; /* Xen/KVM/... */ + + /* Called by each VF on load */ + int (*init)(struct pci_dev *vf, u8 vpm_info_size, u8 fmr_info_size, + u8 *fmr_info, void **vf_ctx); + + /* Share pages using info from vpm and return a ctx handle */ + int (*share)(void *vf_ctx, void *virt_addr, struct vpm *vpm_page, + void **vpm_ctx); + + /* Release pages based on ctx handle */ + int (*unshare)(void *vpm_ctx); + + /* Called by each VF before unload */ + void (*term)(void *vf_ctx); +}; + +/* + * Slave FMR API calls this method on load to register callbacks + * Note: The mlx4_core module is loaded, but it is possible that + * pci probe has not yet been called.
+ */ +int mlx4_reg_icm_slave(struct mlx4_icm_slave *slave); + +/* + * Slave FMR API calls this method before unload + * Note: The module should keep a reference count; if it + * is still in use, the unload will not be allowed + */ +int mlx4_unreg_icm_slave(struct mlx4_icm_slave *slave); + +#endif /* MLX4_FMR_API_H */ diff --git a/drivers/net/mlx4/fmr_master.c b/drivers/net/mlx4/fmr_master.c new file mode 100644 index 0000000000000..59adbd51a7b30 --- /dev/null +++ b/drivers/net/mlx4/fmr_master.c @@ -0,0 +1,279 @@ + +#include +#include +#include "fmr_api.h" +#include "mlx4.h" + + +struct mlx4_pf_fmr_ctx { + struct mlx4_dev *dev; + void *ctx; +}; + +static struct mlx4_icm_master *icm_master; + +static spinlock_t pf_fmr_ctx_lock; +static struct mlx4_pf_fmr_ctx pf_fmr_ctx[MLX4_MAX_NUM_PF]; +static int reg_pf_num; + +void mlx4_fmr_master_init(void) +{ + spin_lock_init(&pf_fmr_ctx_lock); +} + + +static void fmr_master_delete_vpm_ctx(struct mlx4_dev *dev, + struct mlx4_fmr_vpm_ctx *vpm_ctx) +{ + int err; + + err = icm_master->dma_unmap(vpm_ctx->ctx); + if (err) + mlx4_dbg(dev, "ICM MASTER: delete vpm ctx " + "failed for addr 0x%llx with error %d\n", + (unsigned long long)vpm_ctx->va, err); + + kfree(vpm_ctx); +} + + +static int fmr_master_context_init(struct mlx4_dev *dev) +{ + int err; + + spin_lock_irq(&pf_fmr_ctx_lock); + if (mlx4_priv(dev)->mfunc.master.fmr_ctx) { + spin_unlock_irq(&pf_fmr_ctx_lock); + return 0; + } + + err = icm_master->init(dev->pdev, + &mlx4_priv(dev)->mfunc.master.fmr_ctx); + if (err) { + mlx4_dbg(dev, "ICM MASTER: init failed, error %d\n", err); + spin_unlock_irq(&pf_fmr_ctx_lock); + return err; + } + + pf_fmr_ctx[reg_pf_num].ctx = mlx4_priv(dev)->mfunc.master.fmr_ctx; + pf_fmr_ctx[reg_pf_num].dev = dev; + reg_pf_num++; + spin_unlock_irq(&pf_fmr_ctx_lock); + + mlx4_dbg(dev, "ICM MASTER: module initialized\n"); + return 0; +} + +#define DELETE_BATCH 16 +void mlx4_fmr_master_delete_slave(struct mlx4_dev *dev, int slave) +{ + struct mlx4_slave_fmr_ctx *slave_fmr_ctx; + struct mlx4_fmr_vpm_ctx *vpm_ctx[DELETE_BATCH]; + int num_vpm_ctx, i; + + mlx4_dbg(dev, "ICM MASTER: delete slave %d\n", slave); + + if (!icm_master) { + mlx4_dbg(dev, "ICM MASTER: no module registered\n"); + return; + } + + slave_fmr_ctx = &mlx4_priv(dev)->mfunc.master.slave_fmr_ctx[slave]; + + spin_lock_irq(&slave_fmr_ctx->vpm_ctx_tree_lock); + if (!slave_fmr_ctx->vf_ctx) { + mlx4_dbg(dev, "ICM MASTER: delete - no data for slave %d\n", + slave); + spin_unlock_irq(&slave_fmr_ctx->vpm_ctx_tree_lock); + return; + } + + do { + num_vpm_ctx = radix_tree_gang_lookup( + &slave_fmr_ctx->vpm_ctx_tree, + (void **)vpm_ctx, 0, DELETE_BATCH); + for (i = 0; i < num_vpm_ctx; ++i) { + radix_tree_delete(&slave_fmr_ctx->vpm_ctx_tree, + vpm_ctx[i]->va); + fmr_master_delete_vpm_ctx(dev, vpm_ctx[i]); + } + } while (num_vpm_ctx); + + icm_master->del_function(slave_fmr_ctx->vf_ctx); + slave_fmr_ctx->vf_ctx = NULL; + spin_unlock_irq(&slave_fmr_ctx->vpm_ctx_tree_lock); +} + +int mlx4_reg_icm_master(struct mlx4_icm_master *master) +{ + icm_master = master; + + printk(KERN_INFO "ICM MASTER: module registered\n"); + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_reg_icm_master); + +int mlx4_unreg_icm_master(struct mlx4_icm_master *master) +{ + int i, j; + struct mlx4_dev *dev; + + if (icm_master != master) + return -EINVAL; + + spin_lock_irq(&pf_fmr_ctx_lock); + for (i = 0; i < reg_pf_num; ++i) { + dev = pf_fmr_ctx[i].dev; + for (j = 0; j < dev->num_slaves; j++) + mlx4_fmr_master_delete_slave(dev, j); + icm_master->term(pf_fmr_ctx[i].ctx); + } +
reg_pf_num = 0; + icm_master = NULL; + + spin_unlock_irq(&pf_fmr_ctx_lock); + + printk(KERN_INFO "ICM MASTER: module unregistered\n"); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_unreg_icm_master); + +u8 mlx4_fmr_master_protocol(void) +{ + return icm_master->protocol; +} + +u8 mlx4_fmr_master_vpm_info_size(void) +{ + return icm_master->vpm_info_size; +} + +u8 mlx4_fmr_master_fmr_info_size(void) +{ + return icm_master->fmr_info_size; +} + +u8 mlx4_fmr_master_fmr_log_page_size(void) +{ + return icm_master->log_page_size; +} + +int mlx4_ENABLE_FMR_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_enable_fmr_mbox *enable_fmr_mbox; + struct mlx4_slave_fmr_ctx *slave_fmr_ctx; + int err; + + if (!icm_master) { + mlx4_dbg(dev, "ICM MASTER: no module registered\n"); + return -EINVAL; + } + + err = fmr_master_context_init(dev); + if (err) { + mlx4_dbg(dev, "ICM MASTER: module init failed\n"); + return err; + } + + enable_fmr_mbox = outbox->buf; + memset(enable_fmr_mbox, 0, sizeof *enable_fmr_mbox); + + slave_fmr_ctx = &mlx4_priv(dev)->mfunc.master.slave_fmr_ctx[slave]; + + err = icm_master->add_function(mlx4_priv(dev)->mfunc.master.fmr_ctx, + NULL, /* todo: replace with vf's pci_dev */ + NULL, /* todo: replace with fmr_info */ + &slave_fmr_ctx->vf_ctx); + if (err) { + mlx4_dbg(dev, "ICM MASTER: add function failed," + " err %d\n", err); + return err; + } + + spin_lock_init(&slave_fmr_ctx->vpm_ctx_tree_lock); + INIT_RADIX_TREE(&slave_fmr_ctx->vpm_ctx_tree, GFP_ATOMIC); + + if (!dev->caps.fmr_log_page_size) + dev->caps.fmr_log_page_size = icm_master->log_page_size; + + enable_fmr_mbox->protocol = icm_master->protocol; + enable_fmr_mbox->fmr_info_size = icm_master->fmr_info_size; + enable_fmr_mbox->vpm_info_size = icm_master->vpm_info_size; + enable_fmr_mbox->log_page_size = icm_master->log_page_size; + enable_fmr_mbox->base_mpt_entry = + cpu_to_be32(dev->caps.fmr_dmpt_base_idx + + slave * dev->caps.fmr_num_mpts); + + /* add here protocol specific private info */ + + return 0; +} + +dma_addr_t mlx4_fmr_master_dma_map(struct mlx4_dev *dev, int slave, + struct vpm *vpm) +{ + struct mlx4_slave_fmr_ctx *slave_fmr_ctx; + struct mlx4_fmr_vpm_ctx *vpm_ctx; + dma_addr_t addr; + int err; + + slave_fmr_ctx = &mlx4_priv(dev)->mfunc.master.slave_fmr_ctx[slave]; + if (!slave_fmr_ctx->vf_ctx) { + mlx4_dbg(dev, "ICM MASTER: failed to map dma addr\n"); + return 0; + } + + vpm_ctx = kzalloc(sizeof(*vpm_ctx), GFP_KERNEL); + if (!vpm_ctx) { + mlx4_dbg(dev, "ICM MASTER: dma map has no mem left\n"); + return 0; + } + vpm_ctx->va = be64_to_cpu(vpm->va); + addr = icm_master->dma_map(slave_fmr_ctx->vf_ctx, vpm, &vpm_ctx->ctx); + if (addr) { + spin_lock_irq(&slave_fmr_ctx->vpm_ctx_tree_lock); + err = radix_tree_insert(&slave_fmr_ctx->vpm_ctx_tree, + vpm_ctx->va, vpm_ctx); + spin_unlock_irq(&slave_fmr_ctx->vpm_ctx_tree_lock); + if (err) { + mlx4_dbg(dev, "ICM MASTER: failed to save dma addr\n"); + goto out_free_vpm_ctx; + } + } + + return addr; + +out_free_vpm_ctx: + kfree(vpm_ctx); + return 0; +} + +void mlx4_fmr_master_dma_unmap(struct mlx4_dev *dev, int slave, u64 va) +{ + struct mlx4_slave_fmr_ctx *slave_fmr_ctx; + struct mlx4_fmr_vpm_ctx *vpm_ctx; + + slave_fmr_ctx = &mlx4_priv(dev)->mfunc.master.slave_fmr_ctx[slave]; + if (!slave_fmr_ctx->vf_ctx) { + mlx4_dbg(dev, "ICM MASTER: failed to unmap dma" + " for addr 0x%llx\n", + (unsigned long long)va); + return; + } +
spin_lock_irq(&slave_fmr_ctx->vpm_ctx_tree_lock); + vpm_ctx = radix_tree_delete(&slave_fmr_ctx->vpm_ctx_tree, va); + spin_unlock_irq(&slave_fmr_ctx->vpm_ctx_tree_lock); + if (!vpm_ctx) { + mlx4_dbg(dev, "ICM MASTER: unmap dma failed to get" + " track data for addr 0x%llx\n", + (unsigned long long)va); + return; + } + + fmr_master_delete_vpm_ctx(dev, vpm_ctx); +} diff --git a/drivers/net/mlx4/fmr_master.h b/drivers/net/mlx4/fmr_master.h new file mode 100644 index 0000000000000..f489c5da96e8b --- /dev/null +++ b/drivers/net/mlx4/fmr_master.h @@ -0,0 +1,26 @@ +#ifndef MLX4_FMR_MASTER_H +#define MLX4_FMR_MASTER_H + +#include "fmr_api.h" + +u8 mlx4_fmr_master_protocol(void); +u8 mlx4_fmr_master_vpm_info_size(void); +u8 mlx4_fmr_master_fmr_info_size(void); +u8 mlx4_fmr_master_fmr_log_page_size(void); + +int mlx4_ENABLE_FMR_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); + +int mlx4_fmr_master_init(void); + +void mlx4_fmr_master_delete_slave(struct mlx4_dev *dev, int slave); + +dma_addr_t mlx4_fmr_master_dma_map(struct mlx4_dev *dev, int slave, + struct vpm *vpm_page); + +void mlx4_fmr_master_dma_unmap(struct mlx4_dev *dev, int slave, u64 va); + +#endif /* MLX4_FMR_MASTER_H */ diff --git a/drivers/net/mlx4/fmr_slave.c b/drivers/net/mlx4/fmr_slave.c new file mode 100644 index 0000000000000..a2898f1dd898c --- /dev/null +++ b/drivers/net/mlx4/fmr_slave.c @@ -0,0 +1,179 @@ + +#include +#include "fmr_api.h" +#include "mlx4.h" + +static struct mlx4_icm_slave *icm_slave; +static u8 vpm_info_size; + +static spinlock_t vf_fmr_ctx_lock; +static void *vf_fmr_ctx[MLX4_MAX_NUM_VF]; +static int reg_vf_num; + + +inline int mlx4_fmr_flow(struct mlx4_dev *dev, enum mlx4_mr_flags flags) +{ + return icm_slave && mlx4_is_mfunc(dev) && (flags & MLX4_MR_FLAG_FMR); +} + +void mlx4_fmr_slave_init(void) +{ + spin_lock_init(&vf_fmr_ctx_lock); +} + +int mlx4_fmr_slave_context_init(struct mlx4_dev *dev) +{ + struct mlx4_cmd_mailbox *outbox; + struct mlx4_enable_fmr_mbox *enable_fmr_mbox; + int err = 0; + + if (!icm_slave) + return -EINVAL; + + if (mlx4_priv(dev)->fmr_ctx) + return 0; + + outbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(outbox)) + return PTR_ERR(outbox); + + err = mlx4_cmd_box(dev, 0, outbox->dma, 0, 0, MLX4_CMD_ENABLE_FMR, + MLX4_CMD_TIME_CLASS_A, 0); + if (err) { + mlx4_dbg(dev, "MLX4_CMD_ENABLE_FMR failed, err %d\n", err); + goto out_mailbox_free; + } + + enable_fmr_mbox = (struct mlx4_enable_fmr_mbox *)outbox->buf; + if (icm_slave->protocol != enable_fmr_mbox->protocol) { + mlx4_dbg(dev, "Slave fmr protocol (%d) is different from master" + " protocol (%d)\n", icm_slave->protocol, + enable_fmr_mbox->protocol); + err = -EINVAL; + goto out_mailbox_free; + } + +/* + Moved to query_slave_cap + dev->caps.fmr_dmpt_base_idx = + be32_to_cpu(enable_fmr_mbox->base_mpt_entry); +*/ + dev->caps.fmr_log_page_size = enable_fmr_mbox->log_page_size; + if (dev->caps.fmr_log_page_size != PAGE_SHIFT) { + mlx4_dbg(dev, "Slave fmr supports only the same " + "page size for master and slave\n"); + err = -EINVAL; + goto out_mailbox_free; + } + + err = icm_slave->init(dev->pdev, enable_fmr_mbox->vpm_info_size, + enable_fmr_mbox->fmr_info_size, + enable_fmr_mbox->fmr_info, + &mlx4_priv(dev)->fmr_ctx); + if (err) { + mlx4_dbg(dev, "Slave enable fmr failed, error %d\n", err); + goto out_mailbox_free; + } + + spin_lock_irq(&vf_fmr_ctx_lock); + vf_fmr_ctx[reg_vf_num++] = mlx4_priv(dev)->fmr_ctx; + 
spin_unlock_irq(&vf_fmr_ctx_lock); + + vpm_info_size = enable_fmr_mbox->vpm_info_size; + + mlx4_dbg(dev, "ICM SLAVE: module inited\n"); + +out_mailbox_free: + mlx4_free_cmd_mailbox(dev, outbox); + return err; +} + +int mlx4_reg_icm_slave(struct mlx4_icm_slave *slave) +{ + icm_slave = slave; + + printk(KERN_INFO "ICM SLAVE: module registered\n"); + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_reg_icm_slave); + +int mlx4_unreg_icm_slave(struct mlx4_icm_slave *slave) +{ + int i; + + if (!icm_slave) { + printk(KERN_ERR "ICM SLAVE: no module registered\n"); + return -EINVAL; + } + + spin_lock_irq(&vf_fmr_ctx_lock); + for (i = 0; i < reg_vf_num; ++i) { + icm_slave->term(vf_fmr_ctx[i]); + vf_fmr_ctx[i] = NULL; + } + reg_vf_num = 0; + spin_unlock_irq(&vf_fmr_ctx_lock); + + printk(KERN_INFO "ICM SLAVE: module unregistered\n"); + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_unreg_icm_slave); + +void mlx4_fmr_slave_context_term(struct mlx4_dev *dev) +{ + int i; + + if (!icm_slave) { + mlx4_dbg(dev, "ICM SLAVE: no module registered\n"); + return; + } + spin_lock_irq(&vf_fmr_ctx_lock); + if (!mlx4_priv(dev)->fmr_ctx) { + mlx4_dbg(dev, "ICM SLAVE: no fmr context\n"); + spin_unlock_irq(&vf_fmr_ctx_lock); + return; + } + + for (i = 0; i < reg_vf_num; ++i) + if (vf_fmr_ctx[i] == mlx4_priv(dev)->fmr_ctx) + break; + + if (i == reg_vf_num) { + mlx4_dbg(dev, "ICM SLAVE: fmr context not registered\n"); + spin_unlock_irq(&vf_fmr_ctx_lock); + return; + } + + icm_slave->term(mlx4_priv(dev)->fmr_ctx); + reg_vf_num -= 1; + for (; i < reg_vf_num; ++i) + vf_fmr_ctx[i] = vf_fmr_ctx[i + 1]; + vf_fmr_ctx[reg_vf_num] = NULL; + mlx4_priv(dev)->fmr_ctx = NULL; + + spin_unlock_irq(&vf_fmr_ctx_lock); +} + +int mlx4_fmr_slave_vpm_info_size(void) +{ + return vpm_info_size; +} + +int mlx4_fmr_slave_share(struct mlx4_dev *dev, void *virt_addr, + struct vpm *vpm_page, void **vpm_ctx) +{ + if (!icm_slave) { + mlx4_dbg(dev, "ICM SLAVE: no module registered\n"); + return -EINVAL; + } + + return icm_slave->share(&mlx4_priv(dev)->fmr_ctx, virt_addr, + vpm_page, vpm_ctx); +} + +int mlx4_fmr_slave_unshare(void *vpm_ctx) +{ + return icm_slave->unshare(vpm_ctx); +} + + diff --git a/drivers/net/mlx4/fmr_slave.h b/drivers/net/mlx4/fmr_slave.h new file mode 100644 index 0000000000000..44ec103cd67e2 --- /dev/null +++ b/drivers/net/mlx4/fmr_slave.h @@ -0,0 +1,16 @@ +#ifndef MLX4_FMR_SLAVE_H +#define MLX4_FMR_SLAVE_H + +#include "fmr_api.h" + +int mlx4_fmr_flow(struct mlx4_dev *dev, enum mlx4_mr_flags flags); +int mlx4_fmr_slave_init(void); +int mlx4_fmr_slave_context_init(struct mlx4_dev *dev); +void mlx4_fmr_slave_context_term(struct mlx4_dev *dev); + +int mlx4_fmr_slave_vpm_info_size(void); +int mlx4_fmr_slave_share(struct mlx4_dev *dev, void *virt_addr, + struct vpm *vpm_page, void **vpm_ctx); +int mlx4_fmr_slave_unshare(void *vpm_ctx); + +#endif /* MLX4_FMR_SLAVE_H */ diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c index 60cd522680840..a1e2c17c12caf 100644 --- a/drivers/net/mlx4/fw.c +++ b/drivers/net/mlx4/fw.c @@ -32,8 +32,8 @@ * SOFTWARE. 
*/ +#include #include -#include #include "fw.h" #include "icm.h" @@ -51,6 +51,10 @@ static int enable_qos; module_param(enable_qos, bool, 0444); MODULE_PARM_DESC(enable_qos, "Enable Quality of Service support in the HCA (default: off)"); +static int mlx4_pre_t11_mode = 0; +module_param_named(enable_pre_t11_mode, mlx4_pre_t11_mode, int, 0644); +MODULE_PARM_DESC(enable_pre_t11_mode, "For FCoXX, enable pre-t11 mode if non-zero (default: 0)"); + #define MLX4_GET(dest, source, offset) \ do { \ void *__p = (char *) (source) + (offset); \ @@ -90,7 +94,6 @@ static void dump_dev_cap_flags(struct mlx4_dev *dev, u64 flags) [ 9] = "Q_Key violation counter", [10] = "VMM", [12] = "DPDP", - [15] = "Big LSO headers", [16] = "MW support", [17] = "APM support", [18] = "Atomic ops support", @@ -100,12 +103,8 @@ static void dump_dev_cap_flags(struct mlx4_dev *dev, u64 flags) [24] = "Demand paging support", [25] = "Router support", [30] = "IBoE support", - [32] = "Unicast loopback support", - [34] = "FCS header control", - [38] = "Wake On LAN support", - [40] = "UDP RSS support", - [41] = "Unicast VEP steering support", - [42] = "Multicast VEP steering support" + [48] = "Basic counters support", + [49] = "Extended counters support", }; int i; @@ -137,7 +136,225 @@ int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg) MLX4_PUT(inbox, cfg->log_pg_sz_m, MOD_STAT_CFG_PG_SZ_M_OFFSET); err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_MOD_STAT_CFG, - MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_TIME_CLASS_A, 1); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_CMD_SET_IF_STAT(struct mlx4_dev *dev, int mode) +{ + int err; + u64 inparam; + + inparam = mode == MLX4_CUNTERS_BASIC ? 0 : 1; + err = mlx4_cmd(dev, inparam, 0, 0, MLX4_CMD_SET_IF_STAT, + MLX4_CMD_TIME_CLASS_A, 1); + + return err; +} + +int mlx4_QUERY_VEP_CFG(struct mlx4_dev *dev, u8 pf_num, + struct mlx4_vep_cfg *cfg) +{ + int err; + u32 in_mod; + u64 output; + u8 vep_num = pf_num >> 1; + u8 port_num = (pf_num & 1) + 1; + +#define QUERY_VEP_CFG_OPMOD 3 + +#define QUERY_VEP_CFG_INMOD (2 << 28) +#define QUERY_VEP_CFG_INMOD_VEP_OFFSET 16 +#define QUERY_VEP_CFG_INMOD_PORT_OFFSET 8 + +#define QUERY_VEP_CFG_MAC_OFFSET 0x90 +#define QUERY_VEP_CFG_LINK_OFFSET 0xa0 + + + in_mod = QUERY_VEP_CFG_INMOD | (vep_num << QUERY_VEP_CFG_INMOD_VEP_OFFSET) | + (port_num << QUERY_VEP_CFG_INMOD_PORT_OFFSET); + + err = mlx4_cmd_imm(dev, 0, &output, in_mod | QUERY_VEP_CFG_MAC_OFFSET, + QUERY_VEP_CFG_OPMOD, MLX4_CMD_MOD_STAT_CFG, + MLX4_CMD_TIME_CLASS_A, 1); + if (err) { + mlx4_err(dev, "Failed to retrieve mac for function %d\n", vep_num); + return err; + } + cfg->mac = output & 0xffffffffffffULL; + + err = mlx4_cmd_imm(dev, 0, &output, in_mod | QUERY_VEP_CFG_LINK_OFFSET, + QUERY_VEP_CFG_OPMOD, MLX4_CMD_MOD_STAT_CFG, + MLX4_CMD_TIME_CLASS_A, 1); + if (err) { + mlx4_err(dev, "Failed to retrieve link for function %d\n", vep_num); + return err; + } + cfg->link = (output >> 32) & 1; + + return 0; +} + +int mlx4_update_uplink_arbiter(struct mlx4_dev *dev, u8 port) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 in_mod; + int err; + int i; + u8 *buf; + u64 *buf64; + +#define QUERY_UPLINK_ARB_OPMOD 2 +#define QUERY_UPLINK_ARB_INMOD (3 << 28) +#define QUERY_UPLINK_ARB_PORT_OFFSET 8 +#define SET_PORT_ARB_MOD 2 + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + buf = mailbox->buf; + + in_mod = QUERY_UPLINK_ARB_INMOD | (port << QUERY_UPLINK_ARB_PORT_OFFSET); + err = mlx4_cmd_box(dev, 0, mailbox->dma, in_mod, 
QUERY_UPLINK_ARB_OPMOD, + MLX4_CMD_MOD_STAT_CFG, MLX4_CMD_TIME_CLASS_A, 1); + if (err) { + mlx4_err(dev, "Failed to read uplink arbiter configuration " + "for port %d\n", port); + goto out; + } + +#define UPLINK_VEP_MODE 2 +#define VEP_CONFIG_OFFSET 0x40 +#define VEP_CONFIG_SIZE 0x8 +#define VEP_ENABLE_MASK (1ull << 63 | 1ull << 39 | 1ull << 31) + + if (buf[3] != UPLINK_VEP_MODE) { + /* not running in vep mode, nothing to do */ + /* TODO: config ets mode */ + mlx4_priv(dev)->vep_mode[port] = false; + err = 0; + goto out; + } + + mlx4_priv(dev)->vep_mode[port] = true; + + buf[0] = 1 << 7; + for (i = 0; i < dev->caps.pf_num; i++) { + buf64 = (u64 *) (&buf[VEP_CONFIG_OFFSET + i * VEP_CONFIG_SIZE]); + *buf64 |= cpu_to_be64(VEP_ENABLE_MASK); + } + err = mlx4_cmd(dev, mailbox->dma, (u32) port, SET_PORT_ARB_MOD, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_A, 1); + if (err) + mlx4_err(dev, "Failed to set uplink arbiter configuration " + "for port %d\n", port); + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +static int query_port_common(struct mlx4_dev *dev, + struct mlx4_cmd_mailbox *outbox, u8 port, + u8 function) +{ + int err; + u8 *buf; + u8 vep_num; + + err = mlx4_cmd_box(dev, 0, outbox->dma, port, 0, MLX4_CMD_QUERY_PORT, + MLX4_CMD_TIME_CLASS_B, 1); + if (!err) { + buf = outbox->buf; + vep_num = mlx4_priv(dev)->mfunc.master.slave_state[function].pf_num; + buf[0] &= (mlx4_priv(dev)->mfunc.master.slave_state[vep_num].vep_cfg.link) << 7; + } + return err; +} + +int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + return query_port_common(dev, outbox, vhcr->in_modifier, slave); +} + +int mlx4_QUERY_PORT(struct mlx4_dev *dev, void *ptr, u8 port) +{ + struct mlx4_cmd_mailbox *outbox = ptr; + + return mlx4_cmd_box(dev, 0, outbox->dma, port, 0, + MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, 0); +} +EXPORT_SYMBOL_GPL(mlx4_QUERY_PORT); + +int mlx4_QUERY_SLAVE_CAP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_mfunc_master_ctx *master = &priv->mfunc.master; + struct mlx4_slave_state *slave_st = &master->slave_state[slave]; + struct mlx4_caps *caps = outbox->buf; + u8 pf_num = slave_st->pf_num; + int i; + int err = 0; + + memcpy(caps, &dev->caps, sizeof *caps); + + /* The master function is in charge of qp1 for all slaves */ + caps->sqp_demux = 0; + caps->num_qp_per_mgm = mlx4_get_qp_per_mgm(dev); + if (pf_num == slave) { + err = mlx4_QUERY_VEP_CFG(dev, pf_num, &slave_st->vep_cfg); + if (err) + mlx4_warn(dev, "Failed to retrieve mac address for vep %d\n", pf_num); + else + caps->def_mac[(pf_num & 1) + 1] = slave_st->vep_cfg.mac; + } + + for (i = 1; i <= min((u32)(MLX4_MAX_PORTS + 1), dev->caps.num_ports); ++i) { + if (pf_num != slave || err) + caps->def_mac[i] = dev->caps.def_mac[i] + (slave << 8); + + caps->gid_table_len[i] = dev->gids_per_func; + } + + /* Ports are activated according to physical function number */ + mlx4_set_port_mask(dev, caps, slave); + + caps->function = slave; + /* Should be passed in QUERY_HCA cmd */ + caps->fmr_dmpt_base += slave * dev->caps.fmr_num_mpts * + dev->caps.dmpt_entry_sz; + + /* All other resources are allocated by the master, but we still report + * 'num' and 'reserved' capabilities as follows: + * - num remains the maximum resource index + * -
'num - reserved' is the total available objects of a resource, but + * resource indices may be less than 'reserved' + * TODO: set per-resource quotas */ + return 0; +} + +int mlx4_QUERY_SLAVE_CAP(struct mlx4_dev *dev, struct mlx4_caps *caps) +{ + struct mlx4_cmd_mailbox *mailbox; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_SLAVE_CAP, + MLX4_CMD_TIME_CLASS_A, 0); + if (!err) + memcpy(caps, mailbox->buf, sizeof *caps); mlx4_free_cmd_mailbox(dev, mailbox); return err; @@ -148,11 +365,14 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) struct mlx4_cmd_mailbox *mailbox; u32 *outbox; u8 field; - u32 field32, flags, ext_flags; + u32 field32; u16 size; u16 stat_rate; int err; int i; + u32 in_modifier; + u64 out_param; + u32 tmp1, tmp2; #define QUERY_DEV_CAP_OUT_SIZE 0x100 #define QUERY_DEV_CAP_MAX_SRQ_SZ_OFFSET 0x10 @@ -163,11 +383,12 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_MAX_SRQ_OFFSET 0x15 #define QUERY_DEV_CAP_RSVD_EEC_OFFSET 0x16 #define QUERY_DEV_CAP_MAX_EEC_OFFSET 0x17 +#define QUERY_DEV_CAP_RSVD_EQ_OFFSET 0x18 #define QUERY_DEV_CAP_MAX_CQ_SZ_OFFSET 0x19 #define QUERY_DEV_CAP_RSVD_CQ_OFFSET 0x1a #define QUERY_DEV_CAP_MAX_CQ_OFFSET 0x1b #define QUERY_DEV_CAP_MAX_MPT_OFFSET 0x1d -#define QUERY_DEV_CAP_RSVD_EQ_OFFSET 0x1e +#define QUERY_DEV_CAP_LOG_RSVD_EQ_OFFSET 0x1e #define QUERY_DEV_CAP_MAX_EQ_OFFSET 0x1f #define QUERY_DEV_CAP_RSVD_MTT_OFFSET 0x20 #define QUERY_DEV_CAP_MAX_MRW_SZ_OFFSET 0x21 @@ -178,6 +399,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_MAX_RES_QP_OFFSET 0x2b #define QUERY_DEV_CAP_MAX_GSO_OFFSET 0x2d #define QUERY_DEV_CAP_MAX_RDMA_OFFSET 0x2f +#define QUERY_DEV_CAP_STAT_CFG_INL_OFFSET 0x31 #define QUERY_DEV_CAP_RSZ_SRQ_OFFSET 0x33 #define QUERY_DEV_CAP_ACK_DELAY_OFFSET 0x35 #define QUERY_DEV_CAP_MTU_WIDTH_OFFSET 0x36 @@ -187,6 +409,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_RATE_SUPPORT_OFFSET 0x3c #define QUERY_DEV_CAP_MAX_PKEY_OFFSET 0x3f #define QUERY_DEV_CAP_EXT_FLAGS_OFFSET 0x40 +#define QUERY_DEV_CAP_UDP_RSS_OFFSET 0x42 +#define QUERY_DEV_CAP_ETH_UC_LOOPBACK_OFFSET 0x43 #define QUERY_DEV_CAP_FLAGS_OFFSET 0x44 #define QUERY_DEV_CAP_RSVD_UAR_OFFSET 0x48 #define QUERY_DEV_CAP_UAR_SZ_OFFSET 0x49 @@ -204,6 +428,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_MAX_MCG_OFFSET 0x63 #define QUERY_DEV_CAP_RSVD_PD_OFFSET 0x64 #define QUERY_DEV_CAP_MAX_PD_OFFSET 0x65 +#define QUERY_DEV_CAP_RSVD_XRC_OFFSET 0x66 +#define QUERY_DEV_CAP_MAX_XRC_OFFSET 0x67 #define QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET 0x80 #define QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET 0x82 #define QUERY_DEV_CAP_AUX_ENTRY_SZ_OFFSET 0x84 @@ -217,6 +443,9 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_BMME_FLAGS_OFFSET 0x94 #define QUERY_DEV_CAP_RSVD_LKEY_OFFSET 0x98 #define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET 0xa0 +#define QUERY_DEV_CAP_MAX_BASIC_CNT_OFFSET 0x68 +#define QUERY_DEV_CAP_MAX_EXT_CNT_OFFSET 0x6c +#define QUERY_DEV_CAP_MAD_DEMUX_OFFSET 0xb0 mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) @@ -224,7 +453,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) outbox = mailbox->buf; err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_DEV_CAP, - 
MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_TIME_CLASS_A, 1); if (err) goto out; @@ -245,7 +474,11 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MPT_OFFSET); dev_cap->max_mpts = 1 << (field & 0x3f); MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_EQ_OFFSET); - dev_cap->reserved_eqs = field & 0xf; + if (!field) { + MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_RSVD_EQ_OFFSET); + dev_cap->reserved_eqs = 1 << (field & 0xf); + } else + dev_cap->reserved_eqs = field; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_EQ_OFFSET); dev_cap->max_eqs = 1 << (field & 0xf); MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_MTT_OFFSET); @@ -271,15 +504,26 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap->max_rdma_global = 1 << (field & 0x3f); MLX4_GET(field, outbox, QUERY_DEV_CAP_ACK_DELAY_OFFSET); dev_cap->local_ca_ack_delay = field & 0x1f; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MTU_WIDTH_OFFSET); + dev_cap->pf_num = field; + if (dev_cap->pf_num > 1) + dev->flags |= (MLX4_FLAG_MFUNC | MLX4_FLAG_MASTER); MLX4_GET(field, outbox, QUERY_DEV_CAP_VL_PORT_OFFSET); dev_cap->num_ports = field & 0xf; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_MSG_SZ_OFFSET); dev_cap->max_msg_sz = 1 << (field & 0x1f); MLX4_GET(stat_rate, outbox, QUERY_DEV_CAP_RATE_SUPPORT_OFFSET); dev_cap->stat_rate_support = stat_rate; - MLX4_GET(ext_flags, outbox, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); - MLX4_GET(flags, outbox, QUERY_DEV_CAP_FLAGS_OFFSET); - dev_cap->flags = flags | (u64)ext_flags << 32; + MLX4_GET(field, outbox, QUERY_DEV_CAP_UDP_RSS_OFFSET); + dev_cap->udp_rss = field & 0x1; + MLX4_GET(field, outbox, QUERY_DEV_CAP_ETH_UC_LOOPBACK_OFFSET); + dev_cap->loopback_support = field & 0x1; + dev_cap->vep_uc_steering = field & 0x4; + dev_cap->vep_mc_steering = field & 0x8; + dev_cap->wol = field & 0x40; + MLX4_GET(tmp1, outbox, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); + MLX4_GET(tmp2, outbox, QUERY_DEV_CAP_FLAGS_OFFSET); + dev_cap->flags = tmp2 | (u64)tmp1 << 32; MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_UAR_OFFSET); dev_cap->reserved_uars = field >> 4; MLX4_GET(field, outbox, QUERY_DEV_CAP_UAR_SZ_OFFSET); @@ -292,8 +536,6 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_BF_REG_SZ_OFFSET); dev_cap->bf_reg_size = 1 << (field & 0x1f); MLX4_GET(field, outbox, QUERY_DEV_CAP_LOG_MAX_BF_REGS_PER_PAGE_OFFSET); - if ((1 << (field & 0x3f)) > (PAGE_SIZE / dev_cap->bf_reg_size)) - field = 3; dev_cap->bf_regs_per_page = 1 << (field & 0x3f); mlx4_dbg(dev, "BlueFlame available (reg size %d, regs/page %d)\n", dev_cap->bf_reg_size, dev_cap->bf_regs_per_page); @@ -318,6 +560,11 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_PD_OFFSET); dev_cap->max_pds = 1 << (field & 0x3f); + MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_XRC_OFFSET); + dev_cap->reserved_xrcds = field >> 4; + MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_XRC_OFFSET); + dev_cap->max_xrcds = 1 << (field & 0x1f); + MLX4_GET(size, outbox, QUERY_DEV_CAP_RDMARC_ENTRY_SZ_OFFSET); dev_cap->rdmarc_entry_sz = size; MLX4_GET(size, outbox, QUERY_DEV_CAP_QPC_ENTRY_SZ_OFFSET); @@ -343,6 +590,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap->max_srq_sz = 1 << field; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_SZ_OFFSET); dev_cap->max_qp_sz = 1 << field; + MLX4_GET(field, outbox, QUERY_DEV_CAP_STAT_CFG_INL_OFFSET); + dev_cap->inline_cfg = field & 1; 
MLX4_GET(field, outbox, QUERY_DEV_CAP_RSZ_SRQ_OFFSET); dev_cap->resize_srq = field & 1; MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_SG_RQ_OFFSET); @@ -356,6 +605,12 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) QUERY_DEV_CAP_RSVD_LKEY_OFFSET); MLX4_GET(dev_cap->max_icm_sz, outbox, QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET); + MLX4_GET(dev_cap->max_basic_counters, outbox, + QUERY_DEV_CAP_MAX_BASIC_CNT_OFFSET); + MLX4_GET(dev_cap->max_ext_counters, outbox, + QUERY_DEV_CAP_MAX_EXT_CNT_OFFSET); + MLX4_GET(dev_cap->mad_demux, outbox, + QUERY_DEV_CAP_MAD_DEMUX_OFFSET); if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { for (i = 1; i <= dev_cap->num_ports; ++i) { @@ -382,9 +637,14 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_PORT_WAVELENGTH_OFFSET 0x1c #define QUERY_PORT_TRANS_CODE_OFFSET 0x20 +#define STAT_CFG_PORT_MODE (1 << 28) +#define STAT_CFG_PORT_OFFSET 0x8 +#define STAT_CFG_PORT_MASK (1 << 20) +#define STAT_CFG_MOD_INLINE 0x3 + for (i = 1; i <= dev_cap->num_ports; ++i) { err = mlx4_cmd_box(dev, 0, mailbox->dma, i, 0, MLX4_CMD_QUERY_PORT, - MLX4_CMD_TIME_CLASS_B); + MLX4_CMD_TIME_CLASS_B, 1); if (err) goto out; @@ -403,12 +663,36 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap->log_max_macs[i] = field & 0xf; dev_cap->log_max_vlans[i] = field >> 4; MLX4_GET(dev_cap->eth_mtu[i], outbox, QUERY_PORT_ETH_MTU_OFFSET); - MLX4_GET(dev_cap->def_mac[i], outbox, QUERY_PORT_MAC_OFFSET); + if (mlx4_is_mfunc(dev) && mlx4_is_master(dev)) { + /* master is not always function 0, + * need to avoid mac collisions */ + struct mlx4_vep_cfg cfg; + err = mlx4_QUERY_VEP_CFG(dev, dev->caps.function | (i - 1), &cfg); + if (err) + goto out; + dev_cap->def_mac[i] = cfg.mac; + } else { + MLX4_GET(dev_cap->def_mac[i], outbox, QUERY_PORT_MAC_OFFSET); + } MLX4_GET(field32, outbox, QUERY_PORT_TRANS_VENDOR_OFFSET); dev_cap->trans_type[i] = field32 >> 24; dev_cap->vendor_oui[i] = field32 & 0xffffff; MLX4_GET(dev_cap->wavelength[i], outbox, QUERY_PORT_WAVELENGTH_OFFSET); MLX4_GET(dev_cap->trans_code[i], outbox, QUERY_PORT_TRANS_CODE_OFFSET); + + /* Query stat cfg for port enablement */ + if (dev_cap->inline_cfg) { + in_modifier = STAT_CFG_PORT_MODE | i << 8 | + STAT_CFG_PORT_OFFSET; + err = mlx4_cmd_imm(dev, 0, &out_param, + in_modifier, + STAT_CFG_MOD_INLINE, + MLX4_CMD_MOD_STAT_CFG, + MLX4_CMD_TIME_CLASS_B, 1); + if (!err) + if (!(out_param & STAT_CFG_PORT_MASK)) + dev_cap->supported_port_types[i] = 0; + } } } @@ -506,7 +790,7 @@ int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt) if (++nent == MLX4_MAILBOX_SIZE / 16) { err = mlx4_cmd(dev, mailbox->dma, nent, 0, op, - MLX4_CMD_TIME_CLASS_B); + MLX4_CMD_TIME_CLASS_B, 1); if (err) goto out; nent = 0; @@ -515,7 +799,7 @@ int mlx4_map_cmd(struct mlx4_dev *dev, u16 op, struct mlx4_icm *icm, u64 virt) } if (nent) - err = mlx4_cmd(dev, mailbox->dma, nent, 0, op, MLX4_CMD_TIME_CLASS_B); + err = mlx4_cmd(dev, mailbox->dma, nent, 0, op, MLX4_CMD_TIME_CLASS_B, 1); if (err) goto out; @@ -544,13 +828,13 @@ int mlx4_MAP_FA(struct mlx4_dev *dev, struct mlx4_icm *icm) int mlx4_UNMAP_FA(struct mlx4_dev *dev) { - return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_FA, MLX4_CMD_TIME_CLASS_B); + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_FA, MLX4_CMD_TIME_CLASS_B, 1); } int mlx4_RUN_FW(struct mlx4_dev *dev) { - return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_RUN_FW, MLX4_CMD_TIME_CLASS_A); + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_RUN_FW, MLX4_CMD_TIME_CLASS_A, 1); } int 
mlx4_QUERY_FW(struct mlx4_dev *dev) @@ -566,6 +850,8 @@ int mlx4_QUERY_FW(struct mlx4_dev *dev) #define QUERY_FW_OUT_SIZE 0x100 #define QUERY_FW_VER_OFFSET 0x00 +#define MC_PROMISC_VER 0x2000702bcull +#define QUERY_FW_PPF_ID 0x09 #define QUERY_FW_CMD_IF_REV_OFFSET 0x0a #define QUERY_FW_MAX_CMD_OFFSET 0x0f #define QUERY_FW_ERR_START_OFFSET 0x30 @@ -576,13 +862,16 @@ int mlx4_QUERY_FW(struct mlx4_dev *dev) #define QUERY_FW_CLR_INT_BASE_OFFSET 0x20 #define QUERY_FW_CLR_INT_BAR_OFFSET 0x28 +#define QUERY_FW_COMM_BASE_OFFSET 0x40 +#define QUERY_FW_COMM_BAR_OFFSET 0x48 + mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); outbox = mailbox->buf; err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_FW, - MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_TIME_CLASS_A, 1); if (err) goto out; @@ -594,6 +883,13 @@ int mlx4_QUERY_FW(struct mlx4_dev *dev) dev->caps.fw_ver = (fw_ver & 0xffff00000000ull) | ((fw_ver & 0xffff0000ull) >> 16) | ((fw_ver & 0x0000ffffull) << 16); + if (dev->caps.fw_ver < MC_PROMISC_VER) + dev->caps.mc_promisc_mode = 2; + else + dev->caps.mc_promisc_mode = 1; + + MLX4_GET(lg, outbox, QUERY_FW_PPF_ID); + dev->caps.function = lg; MLX4_GET(cmd_if_rev, outbox, QUERY_FW_CMD_IF_REV_OFFSET); if (cmd_if_rev < MLX4_COMMAND_INTERFACE_MIN_REV || @@ -636,6 +932,11 @@ int mlx4_QUERY_FW(struct mlx4_dev *dev) MLX4_GET(fw->clr_int_bar, outbox, QUERY_FW_CLR_INT_BAR_OFFSET); fw->clr_int_bar = (fw->clr_int_bar >> 6) * 2; + MLX4_GET(fw->comm_base, outbox, QUERY_FW_COMM_BASE_OFFSET); + MLX4_GET(fw->comm_bar, outbox, QUERY_FW_COMM_BAR_OFFSET); + fw->comm_bar = (fw->comm_bar >> 6) * 2; + mlx4_dbg(dev, "Communication vector bar:%d offset:0x%llx\n", fw->comm_bar, + fw->comm_base); mlx4_dbg(dev, "FW size %d KB\n", fw->fw_pages >> 2); /* @@ -698,7 +999,7 @@ int mlx4_QUERY_ADAPTER(struct mlx4_dev *dev, struct mlx4_adapter *adapter) outbox = mailbox->buf; err = mlx4_cmd_box(dev, 0, mailbox->dma, 0, 0, MLX4_CMD_QUERY_ADAPTER, - MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_TIME_CLASS_A, 1); if (err) goto out; @@ -712,6 +1013,11 @@ out: return err; } +static u8 fmr_prot_support(u64 flags) +{ + return (flags >> 58) & 1; +} + int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) { struct mlx4_cmd_mailbox *mailbox; @@ -722,6 +1028,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) #define INIT_HCA_VERSION_OFFSET 0x000 #define INIT_HCA_VERSION 2 #define INIT_HCA_CACHELINE_SZ_OFFSET 0x0e +#define INIT_HCA_X86_64_BYTE_CACHELINE_SZ 0x40 #define INIT_HCA_FLAGS_OFFSET 0x014 #define INIT_HCA_QPC_OFFSET 0x020 #define INIT_HCA_QPC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x10) @@ -744,9 +1051,11 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) #define INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b) #define INIT_HCA_TPT_OFFSET 0x0f0 #define INIT_HCA_DMPT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x00) +#define INIT_HCA_LOG_PROT_FMRS_OFFSET (INIT_HCA_TPT_OFFSET + 0x09) #define INIT_HCA_LOG_MPT_SZ_OFFSET (INIT_HCA_TPT_OFFSET + 0x0b) #define INIT_HCA_MTT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x10) #define INIT_HCA_CMPT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x18) +#define INIT_HCA_PROT_FMR_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x20) #define INIT_HCA_UAR_OFFSET 0x120 #define INIT_HCA_LOG_UAR_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0a) #define INIT_HCA_UAR_PAGE_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0b) @@ -759,9 +1068,9 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) memset(inbox, 0, INIT_HCA_IN_SIZE); *((u8 *) 
mailbox->buf + INIT_HCA_VERSION_OFFSET) = INIT_HCA_VERSION; - - *((u8 *) mailbox->buf + INIT_HCA_CACHELINE_SZ_OFFSET) = - (ilog2(cache_line_size()) - 4) << 5; +#if defined(__x86_64__) || defined(__PPC64__) + *((u8 *) mailbox->buf + INIT_HCA_CACHELINE_SZ_OFFSET) = INIT_HCA_X86_64_BYTE_CACHELINE_SZ; +#endif #if defined(__LITTLE_ENDIAN) *(inbox + INIT_HCA_FLAGS_OFFSET / 4) &= ~cpu_to_be32(1 << 1); @@ -781,6 +1090,9 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) if (enable_qos) *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 2); + /* disable bad pkey traps */ + *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 15); + /* QPC/EEC/CQC/EQC/RDMARC attributes */ MLX4_PUT(inbox, param->qpc_base, INIT_HCA_QPC_BASE_OFFSET); @@ -801,7 +1113,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) MLX4_PUT(inbox, param->mc_base, INIT_HCA_MC_BASE_OFFSET); MLX4_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET); MLX4_PUT(inbox, param->log_mc_hash_sz, INIT_HCA_LOG_MC_HASH_SZ_OFFSET); - if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) + if (dev->caps.vep_mc_steering) MLX4_PUT(inbox, (u8) (1 << 3), INIT_HCA_UC_STEERING_OFFSET); MLX4_PUT(inbox, param->log_mc_table_sz, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET); @@ -812,12 +1124,22 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) MLX4_PUT(inbox, param->mtt_base, INIT_HCA_MTT_BASE_OFFSET); MLX4_PUT(inbox, param->cmpt_base, INIT_HCA_CMPT_BASE_OFFSET); + if (fmr_prot_support(dev->caps.flags)) { + *((u8 *) inbox + INIT_HCA_LOG_PROT_FMRS_OFFSET) |= + ilog2(dev->caps.fmr_num_mpts); + MLX4_PUT(inbox, dev->caps.fmr_dmpt_base - dev->caps.dmpt_base, + INIT_HCA_PROT_FMR_BASE_OFFSET); + } + /* UAR attributes */ MLX4_PUT(inbox, (u8) (PAGE_SHIFT - 12), INIT_HCA_UAR_PAGE_SZ_OFFSET); MLX4_PUT(inbox, param->log_uar_sz, INIT_HCA_LOG_UAR_SZ_OFFSET); + if (!mlx4_pre_t11_mode && dev->caps.flags & (u32) MLX4_DEV_CAP_FLAG_FC_T11) + *(inbox + INIT_HCA_FLAGS_OFFSET / 4) |= cpu_to_be32(1 << 10); + - err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, 10000); + err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, 10000, 1); if (err) mlx4_err(dev, "INIT_HCA returns %d\n", err); @@ -826,6 +1148,93 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) return err; } + + +int mlx4_SET_VEP(struct mlx4_dev *dev, int slave, u8 vep_link) +{ + struct mlx4_cmd_mailbox *mailbox; + u8 *buffer; + int ret; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + buffer = mailbox->buf; + + buffer[0] = 1 << 6; + buffer[3] = vep_link << 1; + + ret = mlx4_cmd(dev, mailbox->dma, slave, 0, MLX4_CMD_SET_VEP, + MLX4_CMD_TIME_CLASS_A, 1); + + mlx4_free_cmd_mailbox(dev, mailbox); + return ret; +} + +/* for IB-type ports only */ +static int check_qp0_state(struct mlx4_dev *dev, int function, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + /* irrelevant if not infiniband */ + if ( + priv->mfunc.master.qp0_state[port].proxy_qp0_active && + priv->mfunc.master.qp0_state[port].qp0_active) + return 1; + return 0; +} + +static int mlx4_common_init_port(struct mlx4_dev *dev, int function, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err; + + if ((priv->vep_mode[port]) && (function == dev->caps.function || + function == priv->mfunc.master.slave_state[function].pf_num)) + mlx4_SET_VEP(dev, function, 1); + + if (priv->mfunc.master.slave_state[function].init_port_mask & (1 << port)) + return 0; + + if 
(dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) { + /* Enable port only if it was previously disabled */ + if (!priv->mfunc.master.init_port_ref[port]) { + mlx4_update_uplink_arbiter(dev, port); + err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT, + MLX4_CMD_TIME_CLASS_A, 1); + if (err) + return err; + priv->mfunc.master.slave_state[function].init_port_mask |= (1 << port); + } + } else { + if (function == dev->caps.function) { + if (check_qp0_state(dev, function, port) && + !priv->mfunc.master.qp0_state[port].port_active) { + mlx4_update_uplink_arbiter(dev, port); + err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT, + MLX4_CMD_TIME_CLASS_A, 1); + if (err) + return err; + priv->mfunc.master.qp0_state[port].port_active = 1; + priv->mfunc.master.slave_state[function].init_port_mask |= (1 << port); + } + } else + priv->mfunc.master.slave_state[function].init_port_mask |= (1 << port); + } + ++priv->mfunc.master.init_port_ref[port]; + return 0; +} + +int mlx4_INIT_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int port = vhcr->in_modifier; + + return mlx4_common_init_port(dev, slave, port); +} + int mlx4_INIT_PORT(struct mlx4_dev *dev, int port) { struct mlx4_cmd_mailbox *mailbox; @@ -869,33 +1278,83 @@ int mlx4_INIT_PORT(struct mlx4_dev *dev, int port) MLX4_PUT(inbox, field, INIT_PORT_MAX_PKEY_OFFSET); err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_INIT_PORT, - MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_TIME_CLASS_A, 1); mlx4_free_cmd_mailbox(dev, mailbox); - } else + } else { err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT, - MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_TIME_CLASS_A, 0); + } return err; } EXPORT_SYMBOL_GPL(mlx4_INIT_PORT); +static int mlx4_common_close_port(struct mlx4_dev *dev, int function, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err; + + if ((priv->vep_mode[port]) && (function == dev->caps.function || + function == priv->mfunc.master.slave_state[function].pf_num)) + mlx4_SET_VEP(dev, function, 0); + + if (!(priv->mfunc.master.slave_state[function].init_port_mask & (1 << port))) + return 0; + + if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) { + if (priv->mfunc.master.init_port_ref[port] == 1) { + err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, 1000, 1); + if (err) + return err; + } + priv->mfunc.master.slave_state[function].init_port_mask &= ~(1 << port); + } else { + /* infiniband port */ + if (function == dev->caps.function) { + if (!priv->mfunc.master.qp0_state[port].qp0_active && + priv->mfunc.master.qp0_state[port].port_active) { + err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, + 1000, 1); + if (err) + return err; + priv->mfunc.master.slave_state[function].init_port_mask &= ~(1 << port); + priv->mfunc.master.qp0_state[port].port_active = 0; + } + } else + priv->mfunc.master.slave_state[function].init_port_mask &= ~(1 << port); + } + --priv->mfunc.master.init_port_ref[port]; + return 0; +} + +int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int port = vhcr->in_modifier; + + return mlx4_common_close_port(dev, slave, port); +} + int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port) { - return mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, 1000); + return mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, 1000, 0); } EXPORT_SYMBOL_GPL(mlx4_CLOSE_PORT); int mlx4_CLOSE_HCA(struct 
mlx4_dev *dev, int panic) { - return mlx4_cmd(dev, 0, 0, panic, MLX4_CMD_CLOSE_HCA, 1000); + return mlx4_cmd(dev, 0, 0, panic, MLX4_CMD_CLOSE_HCA, 1000, 1); } int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages) { int ret = mlx4_cmd_imm(dev, icm_size, aux_pages, 0, 0, MLX4_CMD_SET_ICM_SIZE, - MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_TIME_CLASS_A, 1); if (ret) return ret; @@ -912,24 +1371,112 @@ int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages) int mlx4_NOP(struct mlx4_dev *dev) { /* Input modifier of 0x1f means "finish as soon as possible." */ - return mlx4_cmd(dev, 0, 0x1f, 0, MLX4_CMD_NOP, 100); + return mlx4_cmd(dev, 0, 0x1f, 0, MLX4_CMD_NOP, 100, 1); +} + +int mlx4_QUERY_FUNC(struct mlx4_dev *dev, int func, u8 *pf_num) +{ + struct mlx4_cmd_mailbox *mailbox; + u8 *outbox; + int ret; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + ret = mlx4_cmd_box(dev, 0, mailbox->dma, func & 0xff, 0, + MLX4_CMD_QUERY_FUNC, MLX4_CMD_TIME_CLASS_A, 1); + if (ret) + goto out; + + *pf_num = outbox[3]; + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return ret; +} + +#define ACT_GID_INDEX(mlx4_dev_ptr, func_gid_idx, func) \ + ((mlx4_dev_ptr->sr_iov + 1) * func_gid_idx + func) + +int mlx4_GET_GID_MAP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + __be16 phys_gid_idx[128]; + int i; + + /* Set all the gid indexes to be invalid - 0xffff */ + memset(phys_gid_idx, 0xff, sizeof phys_gid_idx); + + for (i = 0; i < dev->gids_per_func; i++) + phys_gid_idx[i] = cpu_to_be16(ACT_GID_INDEX(dev, i, slave)); + + memcpy(outbox->buf, phys_gid_idx, sizeof phys_gid_idx); + + return 0; +} + +int mlx4_query_diag_counters(struct mlx4_dev *dev, int array_length, + u8 op_modifier, u32 in_offset[], u32 counter_out[]) +{ + struct mlx4_cmd_mailbox *mailbox; + u32 *outbox; + int ret; + int i; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + outbox = mailbox->buf; + + ret = mlx4_cmd_box(dev, 0, mailbox->dma, 0, op_modifier, + MLX4_CMD_DIAG_RPRT, MLX4_CMD_TIME_CLASS_A, 0); + if (ret) + goto out; + + for (i=0; i < array_length; i++) { + if (in_offset[i] > MLX4_MAILBOX_SIZE) { + ret = -EINVAL; + goto out; + } + + MLX4_GET(counter_out[i], outbox, in_offset[i]); + } + +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return ret; +} +EXPORT_SYMBOL_GPL(mlx4_query_diag_counters); + +void mlx4_get_fc_t11_settings(struct mlx4_dev *dev, int *enable_pre_t11, int *t11_supported) +{ + *enable_pre_t11 = !!mlx4_pre_t11_mode; + *t11_supported = !!(dev->caps.flags & MLX4_DEV_CAP_FLAG_FC_T11); } +EXPORT_SYMBOL_GPL(mlx4_get_fc_t11_settings); #define MLX4_WOL_SETUP_MODE (5 << 28) -int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port) +int mlx4_wol_read(struct mlx4_dev *dev, struct mlx4_wol_struct *output, int port) { u32 in_mod = MLX4_WOL_SETUP_MODE | port << 8; - return mlx4_cmd_imm(dev, 0, config, in_mod, 0x3, - MLX4_CMD_MOD_STAT_CFG, MLX4_CMD_TIME_CLASS_A); + return mlx4_cmd_imm(dev, 0, (u64 *)output, in_mod, 0x3, + MLX4_CMD_MOD_STAT_CFG, MLX4_CMD_TIME_CLASS_A, 1); } EXPORT_SYMBOL_GPL(mlx4_wol_read); -int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port) +int mlx4_wol_write(struct mlx4_dev *dev, struct mlx4_wol_struct *input, int port) { u32 in_mod = MLX4_WOL_SETUP_MODE | port << 8; + u64 in_param = ((u64) be32_to_cpu(input->flags) << 32) + | (u64) 
(be32_to_cpu(input->preserved1)); - return mlx4_cmd(dev, config, in_mod, 0x1, MLX4_CMD_MOD_STAT_CFG, - MLX4_CMD_TIME_CLASS_A); + return mlx4_cmd(dev, in_param, in_mod, 0x1, MLX4_CMD_MOD_STAT_CFG, + MLX4_CMD_TIME_CLASS_A, 1); } EXPORT_SYMBOL_GPL(mlx4_wol_write); diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h index 56ed1646ccedf..2ee9325a0c2bc 100644 --- a/drivers/net/mlx4/fw.h +++ b/drivers/net/mlx4/fw.h @@ -64,6 +64,7 @@ struct mlx4_dev_cap { int max_responder_per_qp; int max_rdma_global; int local_ca_ack_delay; + int pf_num; int num_ports; u32 max_msg_sz; int ib_mtu[MLX4_MAX_PORTS + 1]; @@ -78,6 +79,10 @@ struct mlx4_dev_cap { u16 wavelength[MLX4_MAX_PORTS + 1]; u64 trans_code[MLX4_MAX_PORTS + 1]; u16 stat_rate_support; + int udp_rss; + int loopback_support; + int vep_uc_steering; + int vep_mc_steering; u64 flags; int reserved_uars; int uar_size; @@ -93,6 +98,8 @@ struct mlx4_dev_cap { int max_mcgs; int reserved_pds; int max_pds; + int reserved_xrcds; + int max_xrcds; int qpc_entry_sz; int rdmarc_entry_sz; int altc_entry_sz; @@ -103,6 +110,7 @@ struct mlx4_dev_cap { int dmpt_entry_sz; int cmpt_entry_sz; int mtt_entry_sz; + int inline_cfg; int resize_srq; u32 bmme_flags; u32 reserved_lkey; @@ -111,6 +119,10 @@ struct mlx4_dev_cap { u8 supported_port_types[MLX4_MAX_PORTS + 1]; u8 log_max_macs[MLX4_MAX_PORTS + 1]; u8 log_max_vlans[MLX4_MAX_PORTS + 1]; + u32 max_basic_counters; + u32 max_ext_counters; + int wol; + u32 mad_demux; }; struct mlx4_adapter { @@ -164,6 +176,12 @@ struct mlx4_set_ib_param { }; int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap); +int mlx4_QUERY_SLAVE_CAP(struct mlx4_dev *dev, struct mlx4_caps *caps); +int mlx4_QUERY_SLAVE_CAP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); int mlx4_MAP_FA(struct mlx4_dev *dev, struct mlx4_icm *icm); int mlx4_UNMAP_FA(struct mlx4_dev *dev); int mlx4_RUN_FW(struct mlx4_dev *dev); @@ -177,5 +195,9 @@ int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm); int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev); int mlx4_NOP(struct mlx4_dev *dev); int mlx4_MOD_STAT_CFG(struct mlx4_dev *dev, struct mlx4_mod_stat_cfg *cfg); +int mlx4_QUERY_FUNC(struct mlx4_dev *dev, int func, u8 *pf_num); +int mlx4_QUERY_VEP_CFG(struct mlx4_dev *dev, u8 vep_num, struct mlx4_vep_cfg *cfg); +int mlx4_CMD_SET_IF_STAT(struct mlx4_dev *dev, int mode); +int mlx4_update_uplink_arbiter(struct mlx4_dev *dev, u8 port); #endif /* MLX4_FW_H */ diff --git a/drivers/net/mlx4/icm.c b/drivers/net/mlx4/icm.c index 02393fdf44c17..d0a45e373168e 100644 --- a/drivers/net/mlx4/icm.c +++ b/drivers/net/mlx4/icm.c @@ -31,25 +31,19 @@ * SOFTWARE. */ +#include #include #include #include -#include #include #include "mlx4.h" #include "icm.h" #include "fw.h" - -/* - * We allocate in as big chunks as we can, up to a maximum of 256 KB - * per chunk. 
- */ -enum { - MLX4_ICM_ALLOC_SIZE = 1 << 18, - MLX4_TABLE_CHUNK_SIZE = 1 << 18 -}; +#include "fmr_api.h" +#include "fmr_slave.h" +#include "fmr_master.h" static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk) { @@ -74,15 +68,23 @@ static void mlx4_free_icm_coherent(struct mlx4_dev *dev, struct mlx4_icm_chunk * sg_dma_address(&chunk->mem[i])); } -void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent) +void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent, + enum mlx4_mr_flags flags) { struct mlx4_icm_chunk *chunk, *tmp; + int fmr_flow, i; if (!icm) return; + fmr_flow = mlx4_fmr_flow(dev, flags); + list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) { - if (coherent) + if (fmr_flow) + for (i = 0; i < chunk->npages; ++i) { + __free_page(chunk->fmr_pages[i]); + chunk->fmr_pages[i] = NULL; + } else if (coherent) mlx4_free_icm_coherent(dev, chunk); else mlx4_free_icm_pages(dev, chunk); @@ -130,7 +132,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, /* We use sg_set_buf for coherent allocs, which assumes low memory */ BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM)); - icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); + icm = kzalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); if (!icm) return NULL; @@ -141,7 +143,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, while (npages > 0) { if (!chunk) { - chunk = kmalloc(sizeof *chunk, + chunk = kzalloc(sizeof *chunk, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); if (!chunk) goto fail; @@ -163,30 +165,29 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, ret = mlx4_alloc_icm_pages(&chunk->mem[chunk->npages], cur_order, gfp_mask); - if (ret) { - if (--cur_order < 0) - goto fail; - else - continue; - } + if (!ret) { + ++chunk->npages; - ++chunk->npages; + if (coherent) + ++chunk->nsg; + else if (chunk->npages == MLX4_ICM_CHUNK_LEN) { + chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, + chunk->npages, + PCI_DMA_BIDIRECTIONAL); - if (coherent) - ++chunk->nsg; - else if (chunk->npages == MLX4_ICM_CHUNK_LEN) { - chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, - chunk->npages, - PCI_DMA_BIDIRECTIONAL); + if (chunk->nsg <= 0) + goto fail; + } - if (chunk->nsg <= 0) + if (chunk->npages == MLX4_ICM_CHUNK_LEN) + chunk = NULL; + + npages -= 1 << cur_order; + } else { + --cur_order; + if (cur_order < 0) goto fail; } - - if (chunk->npages == MLX4_ICM_CHUNK_LEN) - chunk = NULL; - - npages -= 1 << cur_order; } if (!coherent && chunk) { @@ -201,19 +202,256 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, return icm; fail: - mlx4_free_icm(dev, icm, coherent); + mlx4_free_icm(dev, icm, coherent, MLX4_MR_FLAG_NONE); return NULL; } +static int mlx4_UNMAP_FMR(struct mlx4_dev *dev, u64 virt, u32 page_count, + struct mlx4_icm *icm) +{ + struct mlx4_icm_chunk *chunk; + int err, i; + + err = mlx4_cmd(dev, virt, page_count, 1, MLX4_CMD_UNMAP_ICM, + MLX4_CMD_TIME_CLASS_B, 0); + if (err) { + mlx4_dbg(dev, "UNMAP FMR failed for virt 0x%llx\n", + (unsigned long long) virt); + return err; + } + + /* fmr flow maps all pages into first chunk */ + chunk = list_empty(&icm->chunk_list) ? 
NULL : + list_entry(icm->chunk_list.next, struct mlx4_icm_chunk, list); + if (!chunk) { + mlx4_dbg(dev, "UNMAP FMR got null chunk\n"); + return -EINVAL; + } + + for (i = 0; i < page_count; ++i) { + mlx4_fmr_slave_unshare(chunk->fmr_vpm_ctx[i]); + chunk->fmr_vpm_ctx[i] = NULL; + } + + return 0; +} + +static int mlx4_MAP_FMR(struct mlx4_dev *dev, struct mlx4_icm *icm, u64 virt) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_icm_chunk *chunk; + struct vpm *vpm; + void *vpm_raw; + int fmr_vpm_size, i, nent; + int err; + + + + err = mlx4_fmr_slave_context_init(dev); + if (err) { + mlx4_warn(dev, "FMR init failed. FMR disabled.\n"); + return err; + } + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + /* fmr flow maps all pages into first chunk */ + chunk = list_empty(&icm->chunk_list) ? NULL : + list_entry(icm->chunk_list.next, struct mlx4_icm_chunk, list); + if (!chunk) { + mlx4_dbg(dev, "MAP FMR got null chunk\n"); + err = -EINVAL; + goto out_free_mailbox; + } + + vpm_raw = mailbox->buf; + + /* vpm includes two u64 fields and private data in 2 byte words */ + fmr_vpm_size = sizeof(struct vpm) + mlx4_fmr_slave_vpm_info_size(); + + for (i = 0, nent = 0; i < chunk->npages; ++i, virt += PAGE_SIZE, + vpm_raw += fmr_vpm_size) { + vpm = (struct vpm *)vpm_raw; + memset(vpm_raw, 0, fmr_vpm_size); + err = mlx4_fmr_slave_share(dev, + lowmem_page_address(chunk->fmr_pages[i]), + vpm, &chunk->fmr_vpm_ctx[i]); + + if (err) { + mlx4_dbg(dev, "MAP FMR failed to share page, err %d\n", + err); + goto out; + } + + vpm->va = cpu_to_be64(virt); + + if ((++nent + 1) * fmr_vpm_size > MLX4_MAILBOX_SIZE) { + err = mlx4_cmd(dev, mailbox->dma | dev->caps.function, + nent, 1, MLX4_CMD_MAP_ICM, + MLX4_CMD_TIME_CLASS_B, 0); + if (err) { + mlx4_dbg(dev, "MAP FMR cmd failed, err %d\n", + err); + goto out_unshare; + } + nent = 0; + } + } + + if (nent) { + err = mlx4_cmd(dev, mailbox->dma | dev->caps.function, nent, 1, + MLX4_CMD_MAP_ICM, + MLX4_CMD_TIME_CLASS_B, 0); + if (err) { + mlx4_dbg(dev, "MAP FMR cmd failed, err %d\n", err); + goto out; + } + } + + mlx4_dbg(dev, "MAP FMR %d pages at %llx for ICM.\n", + chunk->npages, (unsigned long long) (virt - i * PAGE_SIZE)); + + mlx4_free_cmd_mailbox(dev, mailbox); + return 0; + +out_unshare: + mlx4_fmr_slave_unshare(chunk->fmr_vpm_ctx[i]); + +out: + mlx4_UNMAP_FMR(dev, virt -= i * PAGE_SIZE, i, icm); + +out_free_mailbox: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_MAP_ICM_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err, nent, i; + u64 va; + struct vpm *vpm; + void *vpm_raw; + int vpm_info_size; + dma_addr_t addr; + + nent = vhcr->in_modifier; + + if (!vhcr->op_modifier) + return mlx4_cmd(dev, inbox->dma, nent, 0, MLX4_CMD_MAP_ICM, + MLX4_CMD_TIME_CLASS_B, 1); + + + vpm_info_size = sizeof(struct vpm) + mlx4_fmr_master_vpm_info_size(); + vpm_raw = inbox->buf; + + for (i = 0; i < nent; ++i, vpm_raw += vpm_info_size) { + vpm = (struct vpm *)vpm_raw; + va = be64_to_cpu(vpm->va); + addr = mlx4_fmr_master_dma_map(dev, slave, vpm); + if (!addr) { + mlx4_dbg(dev, "MAP ICM wrapper failed to get fmr dma" + " addr for va 0x%llx\n", + (unsigned long long)va); + err = -EINVAL; + goto out_addr; + } + + err = mlx4_MAP_ICM_page(dev, (u64)addr, va); + if (err) { + mlx4_dbg(dev, "MAP ICM wrapper failed to map icm" + " addr for va 0x%llx\n", + (unsigned long long)va); + err = -EINVAL; + goto 
out_dma_free; + } + } + + return 0; + +out_dma_free: + mlx4_fmr_master_dma_unmap(dev, slave, be64_to_cpu(vpm->va)); + +out_addr: + for (--i, vpm_raw -= vpm_info_size; i >= 0; --i, vpm_raw -= vpm_info_size) { + vpm = (struct vpm *)vpm_raw; + va = be64_to_cpu(vpm->va); + if (mlx4_UNMAP_ICM(dev, va, 1)) + mlx4_warn(dev, "MAP ICM wrapper failed to unmap icm" + " addr for va 0x%llx with err %d\n", + (unsigned long long)va, err); + mlx4_fmr_master_dma_unmap(dev, slave, va); + } + + return err; +} + static int mlx4_MAP_ICM(struct mlx4_dev *dev, struct mlx4_icm *icm, u64 virt) { return mlx4_map_cmd(dev, MLX4_CMD_MAP_ICM, icm, virt); } -static int mlx4_UNMAP_ICM(struct mlx4_dev *dev, u64 virt, u32 page_count) +int mlx4_UNMAP_ICM_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + u32 page_count = vhcr->in_modifier; + u64 virt = vhcr->in_param; + int err, i; + + err = mlx4_cmd(dev, virt, page_count, 0, MLX4_CMD_UNMAP_ICM, + MLX4_CMD_TIME_CLASS_B, 1); + + if (err) { + mlx4_dbg(dev, "UNMAP ICM wrapper failed for addr 0x%llx," + " page count %d with err %d\n", + (unsigned long long)virt, page_count, err); + return err; + } + + for (i = 0; i < page_count; ++i, virt += PAGE_SIZE) + mlx4_fmr_master_dma_unmap(dev, slave, virt); + + return 0; +} + +int mlx4_UNMAP_ICM(struct mlx4_dev *dev, u64 virt, u32 page_count) { return mlx4_cmd(dev, virt, page_count, 0, MLX4_CMD_UNMAP_ICM, - MLX4_CMD_TIME_CLASS_B); + MLX4_CMD_TIME_CLASS_B, 1); +} + +int mlx4_MAP_ICM_page(struct mlx4_dev *dev, u64 dma_addr, u64 virt) +{ + struct mlx4_cmd_mailbox *mailbox; + __be64 *inbox; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + inbox[0] = cpu_to_be64(virt); + inbox[1] = cpu_to_be64(dma_addr); + + err = mlx4_cmd(dev, mailbox->dma, 1, 0, MLX4_CMD_MAP_ICM, + MLX4_CMD_TIME_CLASS_B, 1); + + mlx4_free_cmd_mailbox(dev, mailbox); + + if (!err) + mlx4_dbg(dev, "Mapped page at %llx to %llx for ICM.\n", + (unsigned long long) dma_addr, (unsigned long long) virt); + + return err; } int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm) @@ -223,13 +461,64 @@ int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm) int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev) { - return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_ICM_AUX, MLX4_CMD_TIME_CLASS_B); + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_UNMAP_ICM_AUX, MLX4_CMD_TIME_CLASS_B, 1); +} + +static struct mlx4_icm *mlx4_alloc_fmr(struct mlx4_dev *dev, int npages, + gfp_t gfp_mask) +{ + struct mlx4_icm *icm; + struct mlx4_icm_chunk *chunk; + int i; + + icm = kzalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); + if (!icm) { + mlx4_dbg(dev, "alloc fmr failed to alloc icm mem\n"); + return NULL; + } + + icm->refcount = 0; + INIT_LIST_HEAD(&icm->chunk_list); + + /* Fmr flow maps all pages into first chunk */ + chunk = kzalloc(sizeof *chunk, + gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN)); + if (!chunk) { + mlx4_dbg(dev, "alloc fmr failed to alloc chunk mem\n"); + goto out_free_icm; + } + + /* The memory is allocated but not dma mapped */ + for (i = 0; i < npages; ++i) { + chunk->fmr_pages[i] = alloc_page(gfp_mask); + if (!chunk->fmr_pages[i]) { + mlx4_dbg(dev, "alloc fmr failed to alloc chunk mem\n"); + goto out_free_chunk; + } + } + + chunk->npages = npages; + list_add_tail(&chunk->list, &icm->chunk_list); + return icm; + +out_free_chunk: + for (; i > 0; --i) + 
__free_page(chunk->fmr_pages[i]); + kfree(chunk); + +out_free_icm: + kfree(icm); + + return NULL; } -int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj) +int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj, + enum mlx4_mr_flags flags) { int i = (obj & (table->num_obj - 1)) / (MLX4_TABLE_CHUNK_SIZE / table->obj_size); int ret = 0; + int fmr_flow; + gfp_t gfp_mask; mutex_lock(&table->mutex); @@ -238,48 +527,71 @@ int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj) goto out; } - table->icm[i] = mlx4_alloc_icm(dev, MLX4_TABLE_CHUNK_SIZE >> PAGE_SHIFT, - (table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) | - __GFP_NOWARN, table->coherent); + fmr_flow = mlx4_fmr_flow(dev, flags); + gfp_mask = (table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) | __GFP_NOWARN; + + table->icm[i] = fmr_flow ? + mlx4_alloc_fmr(dev, MLX4_TABLE_CHUNK_PAGES, gfp_mask) : + mlx4_alloc_icm(dev, MLX4_TABLE_CHUNK_PAGES, gfp_mask, + table->coherent); if (!table->icm[i]) { ret = -ENOMEM; goto out; } - if (mlx4_MAP_ICM(dev, table->icm[i], table->virt + - (u64) i * MLX4_TABLE_CHUNK_SIZE)) { - mlx4_free_icm(dev, table->icm[i], table->coherent); + ret = fmr_flow ? + mlx4_MAP_FMR(dev, table->icm[i], table->virt + + (u64) i * MLX4_TABLE_CHUNK_SIZE) : + mlx4_MAP_ICM(dev, table->icm[i], table->virt + + (u64) i * MLX4_TABLE_CHUNK_SIZE); + + if (ret) { + mlx4_free_icm(dev, table->icm[i], table->coherent, flags); table->icm[i] = NULL; ret = -ENOMEM; goto out; } ++table->icm[i]->refcount; + table->icm[i]->chunk_size = MLX4_TABLE_CHUNK_SIZE; out: mutex_unlock(&table->mutex); return ret; } -void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj) +void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj, + enum mlx4_mr_flags flags) { int i; + int fmr_flow; i = (obj & (table->num_obj - 1)) / (MLX4_TABLE_CHUNK_SIZE / table->obj_size); mutex_lock(&table->mutex); - if (--table->icm[i]->refcount == 0) { + if (--table->icm[i]->refcount > 0) + goto out; + + fmr_flow = mlx4_fmr_flow(dev, flags); + if (fmr_flow) + mlx4_UNMAP_FMR(dev, table->virt + i * MLX4_TABLE_CHUNK_SIZE, + table->icm[i]->chunk_size / MLX4_ICM_PAGE_SIZE, + table->icm[i]); + else mlx4_UNMAP_ICM(dev, table->virt + i * MLX4_TABLE_CHUNK_SIZE, - MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE); - mlx4_free_icm(dev, table->icm[i], table->coherent); - table->icm[i] = NULL; - } + table->icm[i]->chunk_size / MLX4_ICM_PAGE_SIZE); + mlx4_free_icm(dev, table->icm[i], table->coherent, flags); + table->icm[i] = NULL; + +out: mutex_unlock(&table->mutex); } -void *mlx4_table_find(struct mlx4_icm_table *table, int obj, dma_addr_t *dma_handle) +void *mlx4_table_find(struct mlx4_dev *dev, struct mlx4_icm_table *table, + int obj, dma_addr_t *dma_handle, + enum mlx4_mr_flags flags) { int idx, offset, dma_offset, i; struct mlx4_icm_chunk *chunk; @@ -298,6 +610,18 @@ void *mlx4_table_find(struct mlx4_icm_table *table, int obj, dma_addr_t *dma_han if (!icm) goto out; + if (mlx4_fmr_flow(dev, flags)) { + /* fmr flow maps all pages into first chunk */ + chunk = list_empty(&icm->chunk_list) ? 
NULL : + list_entry(icm->chunk_list.next, struct mlx4_icm_chunk, + list); + if (!chunk) + return NULL; + + page = chunk->fmr_pages[offset / PAGE_SIZE]; + offset %= PAGE_SIZE; + goto out; + } list_for_each_entry(chunk, &icm->chunk_list, list) { for (i = 0; i < chunk->npages; ++i) { if (dma_handle && dma_offset >= 0) { @@ -325,13 +649,13 @@ out: } int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, - int start, int end) + int start, int end, enum mlx4_mr_flags flags) { int inc = MLX4_TABLE_CHUNK_SIZE / table->obj_size; int i, err; for (i = start; i <= end; i += inc) { - err = mlx4_table_get(dev, table, i); + err = mlx4_table_get(dev, table, i, flags); if (err) goto fail; } @@ -341,19 +665,19 @@ int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, fail: while (i > start) { i -= inc; - mlx4_table_put(dev, table, i); + mlx4_table_put(dev, table, i, flags); } return err; } void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, - int start, int end) + int start, int end, enum mlx4_mr_flags flags) { int i; for (i = start; i <= end; i += MLX4_TABLE_CHUNK_SIZE / table->obj_size) - mlx4_table_put(dev, table, i); + mlx4_table_put(dev, table, i, flags); } int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, @@ -371,6 +695,10 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, table->icm = kcalloc(num_icm, sizeof *table->icm, GFP_KERNEL); if (!table->icm) return -ENOMEM; + + for (i = 0; i < num_icm; ++i) + table->icm[i] = NULL; + table->virt = virt; table->num_icm = num_icm; table->num_obj = nobj; @@ -389,8 +717,10 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, __GFP_NOWARN, use_coherent); if (!table->icm[i]) goto err; - if (mlx4_MAP_ICM(dev, table->icm[i], virt + i * MLX4_TABLE_CHUNK_SIZE)) { - mlx4_free_icm(dev, table->icm[i], use_coherent); + if (mlx4_MAP_ICM(dev, table->icm[i], virt + + i * MLX4_TABLE_CHUNK_SIZE)) { + mlx4_free_icm(dev, table->icm[i], use_coherent, + MLX4_MR_FLAG_NONE); table->icm[i] = NULL; goto err; } @@ -400,6 +730,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, * gets freed (since it contains reserved firmware objects). 
*/ ++table->icm[i]->refcount; + table->icm[i]->chunk_size = chunk_size; } return 0; @@ -408,23 +739,36 @@ err: for (i = 0; i < num_icm; ++i) if (table->icm[i]) { mlx4_UNMAP_ICM(dev, virt + i * MLX4_TABLE_CHUNK_SIZE, - MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE); - mlx4_free_icm(dev, table->icm[i], use_coherent); + table->icm[i]->chunk_size / + MLX4_ICM_PAGE_SIZE); + mlx4_free_icm(dev, table->icm[i], use_coherent, + MLX4_MR_FLAG_NONE); } return -ENOMEM; } -void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table) +void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, + enum mlx4_mr_flags flags) { int i; - for (i = 0; i < table->num_icm; ++i) - if (table->icm[i]) { + for (i = 0; i < table->num_icm; ++i) { + if (!table->icm[i]) + continue; + + if (mlx4_fmr_flow(dev, flags)) + mlx4_UNMAP_FMR(dev, table->virt + i * MLX4_TABLE_CHUNK_SIZE, + table->icm[i]->chunk_size / MLX4_ICM_PAGE_SIZE, + table->icm[i]); + else mlx4_UNMAP_ICM(dev, table->virt + i * MLX4_TABLE_CHUNK_SIZE, - MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE); - mlx4_free_icm(dev, table->icm[i], table->coherent); - } + table->icm[i]->chunk_size / + MLX4_ICM_PAGE_SIZE); + + mlx4_free_icm(dev, table->icm[i], table->coherent, flags); + } kfree(table->icm); + table->icm = NULL; } diff --git a/drivers/net/mlx4/icm.h b/drivers/net/mlx4/icm.h index b10c07a1dc1a0..1ce6e53a5d5ec 100644 --- a/drivers/net/mlx4/icm.h +++ b/drivers/net/mlx4/icm.h @@ -47,16 +47,30 @@ enum { MLX4_ICM_PAGE_SIZE = 1 << MLX4_ICM_PAGE_SHIFT, }; +/* + * We allocate in as big chunks as we can, up to a maximum of 256 KB + * per chunk. + */ +enum { + MLX4_ICM_ALLOC_SIZE = 1 << 18, + MLX4_TABLE_CHUNK_SIZE = 1 << 18, + MLX4_TABLE_CHUNK_PAGES = (1 << 18) >> PAGE_SHIFT +}; + + struct mlx4_icm_chunk { struct list_head list; int npages; int nsg; struct scatterlist mem[MLX4_ICM_CHUNK_LEN]; + void *fmr_vpm_ctx[MLX4_TABLE_CHUNK_PAGES]; + struct page *fmr_pages[MLX4_TABLE_CHUNK_PAGES]; }; struct mlx4_icm { struct list_head chunk_list; int refcount; + unsigned chunk_size; }; struct mlx4_icm_iter { @@ -69,25 +83,25 @@ struct mlx4_dev; struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, gfp_t gfp_mask, int coherent); -void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent); +void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent, + enum mlx4_mr_flags flags); -int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj); -void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj); +int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj, + enum mlx4_mr_flags flags); +void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj, + enum mlx4_mr_flags flags); int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, - int start, int end); + int start, int end, enum mlx4_mr_flags flags); void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, - int start, int end); + int start, int end, enum mlx4_mr_flags flags); int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, u64 virt, int obj_size, int nobj, int reserved, int use_lowmem, int use_coherent); -void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table); -int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj); -void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj); -void *mlx4_table_find(struct mlx4_icm_table *table, int 
obj, dma_addr_t *dma_handle); -int mlx4_table_get_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, - int start, int end); -void mlx4_table_put_range(struct mlx4_dev *dev, struct mlx4_icm_table *table, - int start, int end); +void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table, + enum mlx4_mr_flags flags); +void *mlx4_table_find(struct mlx4_dev *dev, struct mlx4_icm_table *table, + int obj, dma_addr_t *dma_handle, + enum mlx4_mr_flags flags); static inline void mlx4_icm_first(struct mlx4_icm *icm, struct mlx4_icm_iter *iter) @@ -128,7 +142,20 @@ static inline unsigned long mlx4_icm_size(struct mlx4_icm_iter *iter) return sg_dma_len(&iter->chunk->mem[iter->page_idx]); } +int mlx4_UNMAP_ICM(struct mlx4_dev *dev, u64 virt, u32 page_count); +int mlx4_MAP_ICM_page(struct mlx4_dev *dev, u64 dma_addr, u64 virt); int mlx4_MAP_ICM_AUX(struct mlx4_dev *dev, struct mlx4_icm *icm); int mlx4_UNMAP_ICM_AUX(struct mlx4_dev *dev); +int mlx4_MAP_ICM_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_UNMAP_ICM_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); + #endif /* MLX4_ICM_H */ diff --git a/drivers/net/mlx4/intf.c b/drivers/net/mlx4/intf.c index 73c94fcdfddf0..906746bf26e43 100644 --- a/drivers/net/mlx4/intf.c +++ b/drivers/net/mlx4/intf.c @@ -31,8 +31,6 @@ * SOFTWARE. */ -#include - #include "mlx4.h" struct mlx4_device_context { @@ -114,7 +112,38 @@ void mlx4_unregister_interface(struct mlx4_interface *intf) } EXPORT_SYMBOL_GPL(mlx4_unregister_interface); -void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type, int port) +struct mlx4_dev *mlx4_query_interface(void *int_dev, int *port) +{ + struct mlx4_priv *priv; + struct mlx4_device_context *dev_ctx; + enum mlx4_query_reply r; + unsigned long flags; + + mutex_lock(&intf_mutex); + + list_for_each_entry(priv, &dev_list, dev_list) { + spin_lock_irqsave(&priv->ctx_lock, flags); + list_for_each_entry(dev_ctx, &priv->ctx_list, list) { + if (!dev_ctx->intf->query) + continue; + r = dev_ctx->intf->query(dev_ctx->context, int_dev); + if (r != MLX4_QUERY_NOT_MINE) { + *port = r; + spin_unlock_irqrestore(&priv->ctx_lock, flags); + mutex_unlock(&intf_mutex); + return &priv->dev; + } + } + spin_unlock_irqrestore(&priv->ctx_lock, flags); + } + + mutex_unlock(&intf_mutex); + return NULL; +} +EXPORT_SYMBOL_GPL(mlx4_query_interface); + +void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type, + unsigned long param) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_device_context *dev_ctx; @@ -124,7 +153,7 @@ void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type, int por list_for_each_entry(dev_ctx, &priv->ctx_list, list) if (dev_ctx->intf->event) - dev_ctx->intf->event(dev, dev_ctx->context, type, port); + dev_ctx->intf->event(dev, dev_ctx->context, type, param); spin_unlock_irqrestore(&priv->ctx_lock, flags); } @@ -141,7 +170,8 @@ int mlx4_register_device(struct mlx4_dev *dev) mlx4_add_device(intf, priv); mutex_unlock(&intf_mutex); - mlx4_start_catas_poll(dev); + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) + mlx4_start_catas_poll(dev); return 0; } @@ -151,7 +181,8 @@ void mlx4_unregister_device(struct mlx4_dev *dev) struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_interface *intf; - mlx4_stop_catas_poll(dev); + if 
(!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) + mlx4_stop_catas_poll(dev); mutex_lock(&intf_mutex); list_for_each_entry(intf, &intf_list, list) @@ -162,7 +193,7 @@ void mlx4_unregister_device(struct mlx4_dev *dev) mutex_unlock(&intf_mutex); } -void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int port) +void *mlx4_find_get_prot_dev(struct mlx4_dev *dev, enum mlx4_prot proto, int port) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_device_context *dev_ctx; @@ -172,13 +203,13 @@ void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int spin_lock_irqsave(&priv->ctx_lock, flags); list_for_each_entry(dev_ctx, &priv->ctx_list, list) - if (dev_ctx->intf->protocol == proto && dev_ctx->intf->get_dev) { - result = dev_ctx->intf->get_dev(dev, dev_ctx->context, port); + if (dev_ctx->intf->protocol == proto && dev_ctx->intf->get_prot_dev) { + result = dev_ctx->intf->get_prot_dev(dev, dev_ctx->context, port); break; - } + } spin_unlock_irqrestore(&priv->ctx_lock, flags); return result; } -EXPORT_SYMBOL_GPL(mlx4_get_protocol_dev); + diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 545a771886453..ba4a64e798668 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -39,6 +39,8 @@ #include #include #include +#include +#include #include #include @@ -48,6 +50,9 @@ #include "fw.h" #include "icm.h" +#include "fmr_master.h" +#include "fmr_slave.h" + MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("Mellanox ConnectX HCA low-level driver"); MODULE_LICENSE("Dual BSD/GPL"); @@ -63,6 +68,11 @@ MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); #endif /* CONFIG_MLX4_DEBUG */ +int mlx4_blck_lb=1; +module_param_named(block_loopback, mlx4_blck_lb, int, 0644); +MODULE_PARM_DESC(block_loopback, "Block multicast loopback packets if > 0"); + + #ifdef CONFIG_PCI_MSI static int msi_x = 1; @@ -75,37 +85,137 @@ MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero"); #endif /* CONFIG_PCI_MSI */ +#ifdef CONFIG_PCI_IOV + +static int sr_iov; +module_param(sr_iov, int, 0444); +MODULE_PARM_DESC(sr_iov, "enable #sr_iov functions if sr_iov > 0"); + +static int probe_vf; +module_param(probe_vf, int, 0444); +MODULE_PARM_DESC(probe_vf, "number of vfs to probe by pf driver (sr_iov > 0)"); + +int mlx4_log_num_mgm_entry_size = 10; +module_param_named(log_num_mgm_entry_size, mlx4_log_num_mgm_entry_size, int, 0444); +MODULE_PARM_DESC(log_num_mgm_entry_size, "log mgm size, that defines the num of qp per mcg," + " for example: 10 gives 248." 
+ "range: 9<= log_num_mgm_entry_size <= 12"); + +#else /* CONFIG_PCI_IOV */ +static int sr_iov = 0; +#define probe_vf 0 +int mlx4_log_num_mgm_entry_size = 9; +#endif /* CONFIG_PCI_IOV */ + +/* let the mlx4 generate entropy by default */ + +int enable_entropy = 1; +module_param(enable_entropy, int, 0444); +MODULE_PARM_DESC(enable_entropy, "Allow the mlx4 to seed the entropy pool (default = 1)"); + static char mlx4_version[] __devinitdata = DRV_NAME ": Mellanox ConnectX core driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; +struct mutex drv_mutex; + static struct mlx4_profile default_profile = { - .num_qp = 1 << 17, + .num_qp = 1 << 18, .num_srq = 1 << 16, .rdmarc_per_qp = 1 << 4, .num_cq = 1 << 16, .num_mcg = 1 << 13, - .num_mpt = 1 << 17, - .num_mtt = 1 << 20, + .num_mpt = 1 << 20, + .num_mtt = 1 << 21 }; -static int log_num_mac = 2; +static int log_num_mac = 7; module_param_named(log_num_mac, log_num_mac, int, 0444); MODULE_PARM_DESC(log_num_mac, "Log2 max number of MACs per ETH port (1-7)"); -static int log_num_vlan; -module_param_named(log_num_vlan, log_num_vlan, int, 0444); -MODULE_PARM_DESC(log_num_vlan, "Log2 max number of VLANs per ETH port (0-7)"); - static int use_prio; module_param_named(use_prio, use_prio, bool, 0444); MODULE_PARM_DESC(use_prio, "Enable steering by VLAN priority on ETH ports " "(0/1, default 0)"); +static struct mlx4_profile mod_param_profile = { 0 }; + +module_param_named(log_num_qp, mod_param_profile.num_qp, int, 0444); +MODULE_PARM_DESC(log_num_qp, "log maximum number of QPs per HCA"); + +module_param_named(log_num_srq, mod_param_profile.num_srq, int, 0444); +MODULE_PARM_DESC(log_num_srq, "log maximum number of SRQs per HCA"); + +module_param_named(log_rdmarc_per_qp, mod_param_profile.rdmarc_per_qp, int, 0444); +MODULE_PARM_DESC(log_rdmarc_per_qp, "log number of RDMARC buffers per QP"); + +module_param_named(log_num_cq, mod_param_profile.num_cq, int, 0444); +MODULE_PARM_DESC(log_num_cq, "log maximum number of CQs per HCA"); + +module_param_named(log_num_mcg, mod_param_profile.num_mcg, int, 0444); +MODULE_PARM_DESC(log_num_mcg, "log maximum number of multicast groups per HCA"); + +module_param_named(log_num_mpt, mod_param_profile.num_mpt, int, 0444); +MODULE_PARM_DESC(log_num_mpt, + "log maximum number of memory protection table entries per HCA"); + +module_param_named(log_num_mtt, mod_param_profile.num_mtt, int, 0444); +MODULE_PARM_DESC(log_num_mtt, + "log maximum number of memory translation table segments per HCA"); + static int log_mtts_per_seg = ilog2(MLX4_MTT_ENTRY_PER_SEG); module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-7)"); +static void process_mod_param_profile(void) +{ + default_profile.num_qp = (mod_param_profile.num_qp ? + 1 << mod_param_profile.num_qp : + default_profile.num_qp); + default_profile.num_srq = (mod_param_profile.num_srq ? + 1 << mod_param_profile.num_srq : + default_profile.num_srq); + default_profile.rdmarc_per_qp = (mod_param_profile.rdmarc_per_qp ? + 1 << mod_param_profile.rdmarc_per_qp : + default_profile.rdmarc_per_qp); + default_profile.num_cq = (mod_param_profile.num_cq ? + 1 << mod_param_profile.num_cq : + default_profile.num_cq); + default_profile.num_mcg = (mod_param_profile.num_mcg ? + 1 << mod_param_profile.num_mcg : + default_profile.num_mcg); + default_profile.num_mpt = (mod_param_profile.num_mpt ? + 1 << mod_param_profile.num_mpt : + default_profile.num_mpt); + default_profile.num_mtt = (mod_param_profile.num_mtt ? 
+ 1 << mod_param_profile.num_mtt : + default_profile.num_mtt); +} + +struct mlx4_port_config +{ + struct list_head list; + enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1]; + struct pci_dev *pdev; +}; +static LIST_HEAD(config_list); + +static void mlx4_config_cleanup(void) +{ + struct mlx4_port_config *config, *tmp; + + list_for_each_entry_safe(config, tmp, &config_list, list) { + list_del(&config->list); + kfree(config); + } +} + +void *mlx4_get_prot_dev(struct mlx4_dev *dev, enum mlx4_prot proto, int port) +{ + return mlx4_find_get_prot_dev(dev, proto, port); +} +EXPORT_SYMBOL(mlx4_get_prot_dev); + int mlx4_check_port_params(struct mlx4_dev *dev, enum mlx4_port_type *port_type) { @@ -134,15 +244,30 @@ int mlx4_check_port_params(struct mlx4_dev *dev, return 0; } -static void mlx4_set_port_mask(struct mlx4_dev *dev) +void mlx4_set_port_mask(struct mlx4_dev *dev, struct mlx4_caps *caps, int function) { int i; + int active = (function & 1) + 1; - dev->caps.port_mask = 0; - for (i = 1; i <= dev->caps.num_ports; ++i) - if (dev->caps.port_type[i] == MLX4_PORT_TYPE_IB) - dev->caps.port_mask |= 1 << (i - 1); + for (i = 1; i <= caps->num_ports; ++i) { + caps->port_mask[i] = caps->port_type[i]; + if (dev->caps.pf_num > 1 && i != active) + caps->port_mask[i] = 0; + } +} + +static u8 get_counters_mode(u64 flags) +{ + switch (flags >> 48 & 3) { + case 2: + case 3: + case 1: + return MLX4_CUNTERS_BASIC; + default: + return MLX4_CUNTERS_DISABLED; + } } + static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) { int err; @@ -175,12 +300,14 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) return -ENODEV; } + dev->caps.pf_num = dev_cap->pf_num; dev->caps.num_ports = dev_cap->num_ports; for (i = 1; i <= dev->caps.num_ports; ++i) { dev->caps.vl_cap[i] = dev_cap->max_vl[i]; dev->caps.ib_mtu_cap[i] = dev_cap->ib_mtu[i]; dev->caps.gid_table_len[i] = dev_cap->max_gids[i]; dev->caps.pkey_table_len[i] = dev_cap->max_pkeys[i]; + dev->caps.pkey_table_max_len[i] = dev_cap->max_pkeys[i]; dev->caps.port_width_cap[i] = dev_cap->max_port_width[i]; dev->caps.eth_mtu_cap[i] = dev_cap->eth_mtu[i]; dev->caps.def_mac[i] = dev_cap->def_mac[i]; @@ -191,6 +318,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.trans_code[i] = dev_cap->trans_code[i]; } + dev->caps.uar_page_size = PAGE_SIZE; dev->caps.num_uars = dev_cap->uar_size / PAGE_SIZE; dev->caps.local_ca_ack_delay = dev_cap->local_ca_ack_delay; dev->caps.bf_reg_size = dev_cap->bf_reg_size; @@ -204,7 +332,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.reserved_srqs = dev_cap->reserved_srqs; dev->caps.max_sq_desc_sz = dev_cap->max_sq_desc_sz; dev->caps.max_rq_desc_sz = dev_cap->max_rq_desc_sz; - dev->caps.num_qp_per_mgm = MLX4_QP_PER_MGM; + dev->caps.num_qp_per_mgm = mlx4_get_qp_per_mgm(dev); /* * Subtract 1 from the limit because we need to allocate a * spare CQE so the HCA HW can tell the difference between an @@ -217,26 +345,40 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.reserved_mtts = DIV_ROUND_UP(dev_cap->reserved_mtts, dev->caps.mtts_per_seg); dev->caps.reserved_mrws = dev_cap->reserved_mrws; - dev->caps.reserved_uars = dev_cap->reserved_uars; + + /* The first 128 UARs are used for EQ doorbells */ + dev->caps.reserved_uars = max_t(int, 128, dev_cap->reserved_uars); dev->caps.reserved_pds = dev_cap->reserved_pds; dev->caps.mtt_entry_sz = dev->caps.mtts_per_seg * dev_cap->mtt_entry_sz; + 
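/*
 * Editor's note -- not part of the patch.  Toy model of
 * process_mod_param_profile() above: every log_num_* module parameter is
 * a log2 value, and a non-zero parameter overrides the built-in default
 * while zero leaves the default untouched.  The values used below are
 * only examples.
 */
#include <stdio.h>

static int profile_value(int log_param, int default_value)
{
	return log_param ? 1 << log_param : default_value;
}

int main(void)
{
	printf("num_qp  = %d\n", profile_value(17, 1 << 18));	/* overridden */
	printf("num_mtt = %d\n", profile_value(0,  1 << 21));	/* default    */
	return 0;
}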
dev->caps.dmpt_entry_sz = dev_cap->dmpt_entry_sz; dev->caps.max_msg_sz = dev_cap->max_msg_sz; dev->caps.page_size_cap = ~(u32) (dev_cap->min_page_sz - 1); dev->caps.flags = dev_cap->flags; dev->caps.bmme_flags = dev_cap->bmme_flags; dev->caps.reserved_lkey = dev_cap->reserved_lkey; dev->caps.stat_rate_support = dev_cap->stat_rate_support; + dev->caps.udp_rss = dev_cap->udp_rss; + dev->caps.loopback_support = dev_cap->loopback_support; + dev->caps.vep_uc_steering = dev_cap->vep_uc_steering; + dev->caps.vep_mc_steering = dev_cap->vep_mc_steering; + dev->caps.wol = dev_cap->wol; dev->caps.max_gso_sz = dev_cap->max_gso_sz; + dev->caps.reserved_xrcds = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ? + dev_cap->reserved_xrcds : 0; + dev->caps.max_xrcds = (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) ? + dev_cap->max_xrcds : 0; dev->caps.log_num_macs = log_num_mac; - dev->caps.log_num_vlans = log_num_vlan; dev->caps.log_num_prios = use_prio ? 3 : 0; for (i = 1; i <= dev->caps.num_ports; ++i) { - if (dev->caps.supported_type[i] != MLX4_PORT_TYPE_ETH) - dev->caps.port_type[i] = MLX4_PORT_TYPE_IB; - else - dev->caps.port_type[i] = MLX4_PORT_TYPE_ETH; + dev->caps.port_type[i] = MLX4_PORT_TYPE_NONE; + if (dev->caps.supported_type[i]) { + if (dev->caps.supported_type[i] != MLX4_PORT_TYPE_ETH) + dev->caps.port_type[i] = MLX4_PORT_TYPE_IB; + else + dev->caps.port_type[i] = MLX4_PORT_TYPE_ETH; + } dev->caps.possible_type[i] = dev->caps.port_type[i]; mlx4_priv(dev)->sense.sense_allowed[i] = dev->caps.supported_type[i] == MLX4_PORT_TYPE_AUTO; @@ -247,15 +389,18 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) "for port %d, reducing to %d.\n", i, 1 << dev->caps.log_num_macs); } - if (dev->caps.log_num_vlans > dev_cap->log_max_vlans[i]) { - dev->caps.log_num_vlans = dev_cap->log_max_vlans[i]; - mlx4_warn(dev, "Requested number of VLANs is too much " - "for port %d, reducing to %d.\n", - i, 1 << dev->caps.log_num_vlans); - } + dev->caps.log_num_vlans = dev_cap->log_max_vlans[i]; } - mlx4_set_port_mask(dev); + dev->caps.counters_mode = get_counters_mode(dev_cap->flags); + if (mlx4_CMD_SET_IF_STAT(dev, dev->caps.counters_mode)) + mlx4_warn(dev, "setting counters mode to %d failed\n", + dev->caps.counters_mode); + + dev->caps.max_basic_counters = 1 << ilog2(dev_cap->max_basic_counters); + dev->caps.max_ext_counters = 1 << ilog2(dev_cap->max_ext_counters); + mlx4_dbg(dev, "max_basic_counters %d, max_ext_counters %d\n", + dev->caps.max_basic_counters, dev->caps.max_ext_counters); dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] = dev_cap->reserved_qps; dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] = @@ -264,12 +409,157 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) (1 << dev->caps.log_num_vlans) * (1 << dev->caps.log_num_prios) * dev->caps.num_ports; - dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH] = MLX4_NUM_FEXCH; dev->caps.reserved_qps = dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW] + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_ETH_ADDR] + - dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] + - dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH]; + dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR]; + + dev->caps.mad_demux = dev_cap->mad_demux; + + /* Master function demultiplexes mads */ + dev->caps.sqp_demux = (mlx4_is_master(dev)) ? 
MLX4_MAX_NUM_SLAVES : 0; + return 0; +} +/*The function checks if there are live vf, return the num of them*/ +static int mlx4_how_many_lives_vf(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_state; + int i; + int ret = 0; + + for (i = 1/*the ppf is 0*/; i < dev->num_slaves; ++i) { + s_state = &priv->mfunc.master.slave_state[i]; + if (s_state->active && s_state->last_cmd != MLX4_COMM_CMD_RESET) { + mlx4_warn(dev, "%s: slave: %d is still active\n", __func__, i); + ret++; + } + } + return ret; +} + +int mlx4_get_parav_qkey(struct mlx4_dev *dev, u32 qpn, u32 *qkey) +{ + u32 qk = MLX4_RESERVED_QKEY_BASE; + if (qpn >= dev->caps.tunnel_qpn + 8 + 16 * MLX4_MFUNC_MAX || + qpn < dev->caps.tunnel_qpn + 8) + return -EINVAL; + + if (qpn >= dev->caps.tunnel_qpn + 8 * (MLX4_MFUNC_MAX + 1)) + /* tunnel qp */ + qk += qpn - (dev->caps.tunnel_qpn + 8 * (MLX4_MFUNC_MAX + 1)); + else + qk += qpn - (dev->caps.tunnel_qpn + 8); + *qkey = qk; + return 0; +} +EXPORT_SYMBOL(mlx4_get_parav_qkey); + +int mlx4_is_slave_active(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_state *s_slave; + + if (!mlx4_is_mfunc(dev) || !mlx4_is_master(dev)) + return 0; + + s_slave = &priv->mfunc.master.slave_state[slave]; + return (!!s_slave->active); +} +EXPORT_SYMBOL(mlx4_is_slave_active); + +int mlx4_slave_cap(struct mlx4_dev *dev) +{ + int err; + u32 page_size; + + err = mlx4_QUERY_SLAVE_CAP(dev, &dev->caps); + if (err) + return err; + + page_size = ~dev->caps.page_size_cap + 1; + mlx4_warn(dev, "HCA minimum page size:%d\n", page_size); + if (page_size > PAGE_SIZE) { + mlx4_err(dev, "HCA minimum page size of %d bigger than " + "kernel PAGE_SIZE of %ld, aborting.\n", + page_size, PAGE_SIZE); + return -ENODEV; + } + + /* TODO: relax this assumption */ + if (dev->caps.uar_page_size != PAGE_SIZE) { + mlx4_err(dev, "UAR size:%d != kernel PAGE_SIZE of %ld\n", + dev->caps.uar_page_size, PAGE_SIZE); + return -ENODEV; + } + + if (dev->caps.num_ports > MLX4_MAX_PORTS) { + mlx4_err(dev, "HCA has %d ports, but we only support %d, " + "aborting.\n", dev->caps.num_ports, MLX4_MAX_PORTS); + return -ENODEV; + } + + if (dev->caps.uar_page_size * (dev->caps.num_uars - + dev->caps.reserved_uars) > + pci_resource_len(dev->pdev, 2)) { + mlx4_err(dev, "HCA reported UAR region size of 0x%x bigger than " + "PCI resource 2 size of 0x%llx, aborting.\n", + dev->caps.uar_page_size * dev->caps.num_uars, + (unsigned long long) pci_resource_len(dev->pdev, 2)); + return -ENODEV; + } + + /* Adjust eq number */ + if (dev->caps.num_eqs - dev->caps.reserved_eqs > num_possible_cpus() + 1) + dev->caps.num_eqs = dev->caps.reserved_eqs + num_possible_cpus() + 1; + + /* Calculate our sqp_start */ + dev->caps.sqp_start = dev->caps.tunnel_qpn + 8 * (dev->caps.function + 1); + + /* Calculate fmr dmpt index */ + dev->caps.fmr_dmpt_base_idx = (dev->caps.fmr_dmpt_base - + dev->caps.dmpt_base) / + dev->caps.dmpt_entry_sz; + +#if 0 + mlx4_warn(dev, "sqp_demux:%d\n", dev->caps.sqp_demux); + mlx4_warn(dev, "num_uars:%d reserved_uars:%d uar region:0x%x bar2:0x%llx\n", + dev->caps.num_uars, dev->caps.reserved_uars, + dev->caps.uar_page_size * dev->caps.num_uars, + pci_resource_len(dev->pdev, 2)); + mlx4_warn(dev, "num_eqs:%d reserved_eqs:%d\n", dev->caps.num_eqs, + dev->caps.reserved_eqs); + mlx4_warn(dev, "num_pds:%d reserved_pds:%d slave_pd_shift:%d pd_base:%d\n", + dev->caps.num_pds, + dev->caps.reserved_pds, + dev->caps.slave_pd_shift, + dev->caps.pd_base); +#endif + return 0; +} 
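/*
 * Editor's note -- not part of the patch.  Standalone sketch of the
 * proxy/tunnel QP -> qkey mapping in mlx4_get_parav_qkey() above.  The
 * base, MFUNC_MAX and TUNNEL_QPN constants are hypothetical stand-ins;
 * the point is that both the proxy range and the tunnel range are folded
 * onto consecutive qkeys starting at a reserved base.
 */
#include <stdio.h>
#include <stdint.h>

#define RESERVED_QKEY_BASE	0x80000000u
#define MFUNC_MAX		64
#define TUNNEL_QPN		0x1000u		/* hypothetical */

static int parav_qkey(uint32_t qpn, uint32_t *qkey)
{
	uint32_t proxy_start  = TUNNEL_QPN + 8;
	uint32_t tunnel_start = TUNNEL_QPN + 8 * (MFUNC_MAX + 1);
	uint32_t end          = TUNNEL_QPN + 8 + 16 * MFUNC_MAX;

	if (qpn < proxy_start || qpn >= end)
		return -1;			/* not a proxy/tunnel QP */
	if (qpn >= tunnel_start)
		*qkey = RESERVED_QKEY_BASE + (qpn - tunnel_start);
	else
		*qkey = RESERVED_QKEY_BASE + (qpn - proxy_start);
	return 0;
}

int main(void)
{
	uint32_t qkey;

	if (!parav_qkey(TUNNEL_QPN + 10, &qkey))
		printf("qkey = 0x%x\n", (unsigned)qkey);	/* base + 2 */
	return 0;
}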
+ +static int mlx4_save_config(struct mlx4_dev *dev) +{ + struct mlx4_port_config *config; + int i; + + list_for_each_entry(config, &config_list, list) { + if (config->pdev == dev->pdev) { + for (i = 1; i <= dev->caps.num_ports; i++) + config->port_type[i] = dev->caps.possible_type[i]; + return 0; + } + } + + config = kmalloc(sizeof(struct mlx4_port_config), GFP_KERNEL); + if (!config) + return -ENOMEM; + + config->pdev = dev->pdev; + for (i = 1; i <= dev->caps.num_ports; i++) + config->port_type[i] = dev->caps.possible_type[i]; + + list_add_tail(&config->list, &config_list); return 0; } @@ -297,14 +587,15 @@ int mlx4_change_port_types(struct mlx4_dev *dev, mlx4_unregister_device(dev); for (port = 1; port <= dev->caps.num_ports; port++) { mlx4_CLOSE_PORT(dev, port); - err = mlx4_SET_PORT(dev, port); + err = mlx4_SET_PORT(dev, port, -1); if (err) { mlx4_err(dev, "Failed to set port %d, " "aborting\n", port); goto out; } } - mlx4_set_port_mask(dev); + mlx4_set_port_mask(dev, &dev->caps, dev->caps.function); + mlx4_save_config(dev); err = mlx4_register_device(dev); } @@ -368,6 +659,13 @@ static ssize_t set_port_type(struct device *dev, types[i] = mdev->caps.port_type[i+1]; } + if (priv->trig) { + if (++priv->changed_ports < mdev->caps.num_ports) + goto out; + else + priv->trig = priv->changed_ports = 0; + } + if (!(mdev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) { for (i = 1; i <= mdev->caps.num_ports; i++) { if (mdev->caps.possible_type[i] == MLX4_PORT_TYPE_AUTO) { @@ -403,6 +701,23 @@ out: return err ? err : count; } +static ssize_t trigger_port(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct mlx4_dev *mdev = pci_get_drvdata(pdev); + struct mlx4_priv *priv = container_of(mdev, struct mlx4_priv, dev); + + if (!priv) + return -ENODEV; + + mutex_lock(&priv->port_mutex); + priv->trig = 1; + mutex_unlock(&priv->port_mutex); + return count; +} +DEVICE_ATTR(port_trigger, S_IWUGO, NULL, trigger_port); + static int mlx4_load_fw(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -433,7 +748,7 @@ err_unmap_fa: mlx4_UNMAP_FA(dev); err_free: - mlx4_free_icm(dev, priv->fw.fw_icm, 0); + mlx4_free_icm(dev, priv->fw.fw_icm, 0, MLX4_MR_FLAG_NONE); return err; } @@ -442,6 +757,7 @@ static int mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base, { struct mlx4_priv *priv = mlx4_priv(dev); int err; + int num_eqs; err = mlx4_init_icm_table(dev, &priv->qp_table.cmpt_table, cmpt_base + @@ -471,25 +787,30 @@ static int mlx4_init_cmpt_table(struct mlx4_dev *dev, u64 cmpt_base, if (err) goto err_srq; + num_eqs = (mlx4_is_mfunc(dev) && mlx4_is_master(dev)) ? 
+ roundup_pow_of_two(mlx4_master_get_num_eqs(dev)) : + dev->caps.num_eqs; err = mlx4_init_icm_table(dev, &priv->eq_table.cmpt_table, cmpt_base + ((u64) (MLX4_CMPT_TYPE_EQ * cmpt_entry_sz) << MLX4_CMPT_SHIFT), - cmpt_entry_sz, - dev->caps.num_eqs, dev->caps.num_eqs, 0, 0); + cmpt_entry_sz, num_eqs, num_eqs, 0, 0); if (err) goto err_cq; return 0; err_cq: - mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table); + mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table, + MLX4_MR_FLAG_NONE); err_srq: - mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table); + mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table, + MLX4_MR_FLAG_NONE); err_qp: - mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table); + mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table, + MLX4_MR_FLAG_NONE); err: return err; @@ -500,6 +821,8 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, { struct mlx4_priv *priv = mlx4_priv(dev); u64 aux_pages; + int num_eqs; + u32 num_mpts; int err; err = mlx4_SET_ICM_SIZE(dev, icm_size, &aux_pages); @@ -531,10 +854,13 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, goto err_unmap_aux; } + + num_eqs = (mlx4_is_mfunc(dev) && mlx4_is_master(dev)) ? + roundup_pow_of_two(mlx4_master_get_num_eqs(dev)) : + dev->caps.num_eqs; err = mlx4_init_icm_table(dev, &priv->eq_table.table, init_hca->eqc_base, dev_cap->eqc_entry_sz, - dev->caps.num_eqs, dev->caps.num_eqs, - 0, 0); + num_eqs, num_eqs, 0, 0); if (err) { mlx4_err(dev, "Failed to map EQ context memory, aborting.\n"); goto err_unmap_cmpt; @@ -561,10 +887,18 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, goto err_unmap_eq; } + /* reserve mpts for fmr */ + num_mpts = dev->caps.num_mpts >> 1; + + if ((num_mpts * dev->caps.dmpt_entry_sz) & (PAGE_SIZE - 1)) { + mlx4_err(dev, "MPT size is not page aligned, aborting.\n"); + return -EINVAL; + } + err = mlx4_init_icm_table(dev, &priv->mr_table.dmpt_table, init_hca->dmpt_base, dev_cap->dmpt_entry_sz, - dev->caps.num_mpts, + num_mpts, dev->caps.reserved_mrws, 1, 1); if (err) { mlx4_err(dev, "Failed to map dMPT context memory, aborting.\n"); @@ -641,7 +975,7 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, * and it's a lot easier than trying to track ref counts. 
*/ err = mlx4_init_icm_table(dev, &priv->mcg_table.table, - init_hca->mc_base, MLX4_MGM_ENTRY_SIZE, + init_hca->mc_base, mlx4_get_mgm_entry_size(dev), dev->caps.num_mgms + dev->caps.num_amgms, dev->caps.num_mgms + dev->caps.num_amgms, 0, 0); @@ -653,43 +987,56 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, return 0; err_unmap_srq: - mlx4_cleanup_icm_table(dev, &priv->srq_table.table); + mlx4_cleanup_icm_table(dev, &priv->srq_table.table, + MLX4_MR_FLAG_NONE); err_unmap_cq: - mlx4_cleanup_icm_table(dev, &priv->cq_table.table); + mlx4_cleanup_icm_table(dev, &priv->cq_table.table, + MLX4_MR_FLAG_NONE); err_unmap_rdmarc: - mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table); + mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table, + MLX4_MR_FLAG_NONE); err_unmap_altc: - mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table); + mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table, + MLX4_MR_FLAG_NONE); err_unmap_auxc: - mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table); + mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table, + MLX4_MR_FLAG_NONE); err_unmap_qp: - mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table); + mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table, + MLX4_MR_FLAG_NONE); err_unmap_dmpt: - mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table); + mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table, + MLX4_MR_FLAG_NONE); err_unmap_mtt: - mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table); + mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table, + MLX4_MR_FLAG_NONE); err_unmap_eq: - mlx4_cleanup_icm_table(dev, &priv->eq_table.table); + mlx4_cleanup_icm_table(dev, &priv->eq_table.table, + MLX4_MR_FLAG_NONE); err_unmap_cmpt: - mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table); - mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table); - mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table); - mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table); + mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table, + MLX4_MR_FLAG_NONE); + mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table, + MLX4_MR_FLAG_NONE); + mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table, + MLX4_MR_FLAG_NONE); + mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table, + MLX4_MR_FLAG_NONE); err_unmap_aux: mlx4_UNMAP_ICM_AUX(dev); err_free_aux: - mlx4_free_icm(dev, priv->fw.aux_icm, 0); + mlx4_free_icm(dev, priv->fw.aux_icm, 0, MLX4_MR_FLAG_NONE); return err; } @@ -698,23 +1045,47 @@ static void mlx4_free_icms(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); - mlx4_cleanup_icm_table(dev, &priv->mcg_table.table); - mlx4_cleanup_icm_table(dev, &priv->srq_table.table); - mlx4_cleanup_icm_table(dev, &priv->cq_table.table); - mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table); - mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table); - mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table); - mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table); - mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table); - mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table); - mlx4_cleanup_icm_table(dev, &priv->eq_table.table); - mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table); - mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table); - mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table); - mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table); + mlx4_cleanup_icm_table(dev, &priv->mcg_table.table, + MLX4_MR_FLAG_NONE); + mlx4_cleanup_icm_table(dev, &priv->srq_table.table, + MLX4_MR_FLAG_NONE); + 
mlx4_cleanup_icm_table(dev, &priv->cq_table.table, + MLX4_MR_FLAG_NONE); + mlx4_cleanup_icm_table(dev, &priv->qp_table.rdmarc_table, + MLX4_MR_FLAG_NONE); + mlx4_cleanup_icm_table(dev, &priv->qp_table.altc_table, + MLX4_MR_FLAG_NONE); + mlx4_cleanup_icm_table(dev, &priv->qp_table.auxc_table, + MLX4_MR_FLAG_NONE); + mlx4_cleanup_icm_table(dev, &priv->qp_table.qp_table, + MLX4_MR_FLAG_NONE); + mlx4_cleanup_icm_table(dev, &priv->mr_table.dmpt_table, + MLX4_MR_FLAG_NONE); + mlx4_cleanup_icm_table(dev, &priv->mr_table.mtt_table, + MLX4_MR_FLAG_NONE); + mlx4_cleanup_icm_table(dev, &priv->eq_table.table, + MLX4_MR_FLAG_NONE); + mlx4_cleanup_icm_table(dev, &priv->eq_table.cmpt_table, + MLX4_MR_FLAG_NONE); + mlx4_cleanup_icm_table(dev, &priv->cq_table.cmpt_table, + MLX4_MR_FLAG_NONE); + mlx4_cleanup_icm_table(dev, &priv->srq_table.cmpt_table, + MLX4_MR_FLAG_NONE); + mlx4_cleanup_icm_table(dev, &priv->qp_table.cmpt_table, + MLX4_MR_FLAG_NONE); mlx4_UNMAP_ICM_AUX(dev); - mlx4_free_icm(dev, priv->fw.aux_icm, 0); + mlx4_free_icm(dev, priv->fw.aux_icm, 0, MLX4_MR_FLAG_NONE); +} + +static void mlx4_slave_exit(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + down(&priv->cmd.slave_sem); + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_TIME)) + mlx4_warn(dev, "Failed to close slave function.\n"); + up(&priv->cmd.slave_sem); } static int map_bf_area(struct mlx4_dev *dev) @@ -742,10 +1113,103 @@ static void unmap_bf_area(struct mlx4_dev *dev) static void mlx4_close_hca(struct mlx4_dev *dev) { unmap_bf_area(dev); - mlx4_CLOSE_HCA(dev, 0); - mlx4_free_icms(dev); - mlx4_UNMAP_FA(dev); - mlx4_free_icm(dev, mlx4_priv(dev)->fw.fw_icm, 0); + + if (mlx4_is_mfunc(dev) && !mlx4_is_master(dev)) + mlx4_slave_exit(dev); + else { + mlx4_CLOSE_HCA(dev, 0); + mlx4_free_icms(dev); + mlx4_UNMAP_FA(dev); + mlx4_free_icm(dev, mlx4_priv(dev)->fw.fw_icm, 0, + MLX4_MR_FLAG_NONE); + } +} + +static int mlx4_init_slave(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + u64 dma = (u64) priv->mfunc.vhcr_dma; + int num_of_reset_retries = NUM_OF_RESET_RETRIES; + int ret_from_reset = 0; + u32 slave_read; + u32 cmd_channel_ver; + + down(&priv->cmd.slave_sem); + priv->cmd.max_cmds = 1; + mlx4_warn(dev, "Sending reset\n"); + ret_from_reset = mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_TIME); + /*if we are in the middle of flr the slave will try NUM_OF_RESET_RETRIES times + before leaving.*/ + if(ret_from_reset) { + if (MLX4_DELAY_RESET_SLAVE == ret_from_reset ) { + msleep(SLEEP_TIME_IN_RESET); + while (ret_from_reset && num_of_reset_retries) { + mlx4_warn(dev, "slave is currently in the middle of FLR. 
retrying...(try num:%d)\n", + (NUM_OF_RESET_RETRIES - num_of_reset_retries + 1)); + ret_from_reset = mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_TIME); + num_of_reset_retries = num_of_reset_retries - 1; + } + } else + goto err; + } + + /* check the driver version - the slave I/F revision must match the master's */ + slave_read = swab32(readl(&priv->mfunc.comm->slave_read)); + cmd_channel_ver = mlx4_comm_get_version(); + + if (MLX4_COMM_GET_IF_REV(cmd_channel_ver) != MLX4_COMM_GET_IF_REV(slave_read)) { + mlx4_err(dev, "slave driver version is not supported by the master\n"); + goto err; + } + + mlx4_warn(dev, "Sending vhcr0\n"); + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR0, dma >> 48, + MLX4_COMM_TIME)) + goto err; + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR1, dma >> 32, + MLX4_COMM_TIME)) + goto err; + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR2, dma >> 16, + MLX4_COMM_TIME)) + goto err; + if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_EN, dma, MLX4_COMM_TIME)) + goto err; + up(&priv->cmd.slave_sem); + return 0; + +err: + mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, 0); + up(&priv->cmd.slave_sem); + return -EIO; +} + +static void mlx4_dom0_fmr_cap(struct mlx4_dev *dev, + struct mlx4_init_hca_param *init_hca) +{ + int num_mpts, num_fmr_clients; + + /* fmr clients are the VFs and the PF. Does not support multiple PFs */ + num_fmr_clients = dev->sr_iov + 1; + + /* should be retrieved using QUERY DEV CAP cmd */ + dev->caps.fmr_num_mpts = rounddown_pow_of_two((dev->caps.num_mpts >> 1) + / num_fmr_clients); + + + /* can be replaced by a dynamic mtt allocator */ + dev->caps.fmr_num_mtt_segs = + rounddown_pow_of_two((dev->caps.num_mtt_segs >> 1) / + num_fmr_clients); + + num_mpts = dev->caps.num_mpts >> 1; + + dev->caps.fmr_dmpt_base = init_hca->dmpt_base + num_mpts + * dev->caps.dmpt_entry_sz; + dev->caps.fmr_dmpt_base_idx = num_mpts; + + /* save for fmr mtt tables virtual address computation */ + dev->caps.mtt_base = init_hca->mtt_base; + dev->caps.dmpt_base = init_hca->dmpt_base; } static int mlx4_init_hca(struct mlx4_dev *dev) @@ -756,63 +1220,98 @@ static int mlx4_init_hca(struct mlx4_dev *dev) struct mlx4_mod_stat_cfg mlx4_cfg; struct mlx4_profile profile; struct mlx4_init_hca_param init_hca; + struct mlx4_port_config *config; u64 icm_size; int err; + int i; - err = mlx4_QUERY_FW(dev); - if (err) { - if (err == -EACCES) - mlx4_info(dev, "non-primary physical function, skipping.\n"); - else - mlx4_err(dev, "QUERY_FW command failed, aborting.\n"); - return err; - } + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) { + err = mlx4_QUERY_FW(dev); + if (err) { + if (err == -EACCES) + mlx4_info(dev, "non-primary physical function, skipping.\n"); + else + mlx4_err(dev, "QUERY_FW command failed, aborting.\n"); + goto out; + } - err = mlx4_load_fw(dev); - if (err) { - mlx4_err(dev, "Failed to start FW, aborting.\n"); - return err; - } + err = mlx4_load_fw(dev); + if (err) { + mlx4_err(dev, "Failed to start FW, aborting.\n"); + goto out; + } - mlx4_cfg.log_pg_sz_m = 1; - mlx4_cfg.log_pg_sz = 0; - err = mlx4_MOD_STAT_CFG(dev, &mlx4_cfg); - if (err) - mlx4_warn(dev, "Failed to override log_pg_sz parameter\n"); + mlx4_cfg.log_pg_sz_m = 1; + mlx4_cfg.log_pg_sz = 0; + err = mlx4_MOD_STAT_CFG(dev, &mlx4_cfg); + if (err) + mlx4_warn(dev, "Failed to override log_pg_sz parameter\n"); - err = mlx4_dev_cap(dev, &dev_cap); - if (err) { - mlx4_err(dev, "QUERY_DEV_CAP command failed, aborting.\n"); - goto err_stop_fw; - } + err = mlx4_dev_cap(dev, &dev_cap); + if (err) { + mlx4_err(dev, "QUERY_DEV_CAP command 
failed, aborting.\n"); + goto err_stop_fw; + } - profile = default_profile; + process_mod_param_profile(); + profile = default_profile; - icm_size = mlx4_make_profile(dev, &profile, &dev_cap, &init_hca); - if ((long long) icm_size < 0) { - err = icm_size; - goto err_stop_fw; - } + list_for_each_entry(config, &config_list, list) { + if (config->pdev == dev->pdev) { + for (i = 1; i <= dev->caps.num_ports; i++) { + dev->caps.possible_type[i] = config->port_type[i]; + if (config->port_type[i] != MLX4_PORT_TYPE_AUTO) + dev->caps.port_type[i] = config->port_type[i]; + } + } + } - if (map_bf_area(dev)) - mlx4_dbg(dev, "Failed to map blue flame area\n"); + icm_size = mlx4_make_profile(dev, &profile, &dev_cap, &init_hca); + if ((long long) icm_size < 0) { + err = icm_size; + goto err_stop_fw; + } - init_hca.log_uar_sz = ilog2(dev->caps.num_uars); + init_hca.log_uar_sz = ilog2(dev->caps.num_uars); - err = mlx4_init_icm(dev, &dev_cap, &init_hca, icm_size); - if (err) - goto err_stop_fw; + mlx4_dom0_fmr_cap(dev, &init_hca); - err = mlx4_INIT_HCA(dev, &init_hca); - if (err) { - mlx4_err(dev, "INIT_HCA command failed, aborting.\n"); - goto err_free_icm; + err = mlx4_init_icm(dev, &dev_cap, &init_hca, icm_size); + if (err) + goto err_stop_fw; + + err = mlx4_INIT_HCA(dev, &init_hca); + if (err) { + mlx4_err(dev, "INIT_HCA command failed, aborting.\n"); + goto err_free_icm; + } + } else { + err = mlx4_init_slave(dev); + if (err) { + mlx4_err(dev, "Failed to initialize slave\n"); + goto out; + } + + err = mlx4_slave_cap(dev); + if (err) { + mlx4_err(dev, "Failed to obtain slave caps\n"); + goto err_close_hca; + } + } + + if (map_bf_area(dev)) + mlx4_dbg(dev, "Kernel support for blue flame is not available " + "for kernels < 2.6.28\n"); + + /*Only the master set the ports, all the rest got it from it.*/ + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) { + mlx4_set_port_mask(dev, &dev->caps, dev->caps.function); } err = mlx4_QUERY_ADAPTER(dev, &adapter); if (err) { mlx4_err(dev, "QUERY_ADAPTER command failed, aborting.\n"); - goto err_close; + goto unmap_bf; } priv->eq_table.inta_pin = adapter.inta_pin; @@ -820,17 +1319,203 @@ static int mlx4_init_hca(struct mlx4_dev *dev) return 0; -err_close: - mlx4_CLOSE_HCA(dev, 0); +unmap_bf: + unmap_bf_area(dev); + +err_close_hca: + mlx4_close_hca(dev); + goto out; err_free_icm: - mlx4_free_icms(dev); + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) + mlx4_free_icms(dev); err_stop_fw: - unmap_bf_area(dev); - mlx4_UNMAP_FA(dev); - mlx4_free_icm(dev, priv->fw.fw_icm, 0); + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) { + mlx4_UNMAP_FA(dev); + mlx4_free_icm(dev, priv->fw.fw_icm, 0, MLX4_MR_FLAG_NONE); + } + +out: + return err; +} + +static int mlx4_init_counters_table(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err; + int nent; + + switch (dev->caps.counters_mode) { + case MLX4_CUNTERS_BASIC: + nent = dev->caps.max_basic_counters; + break; + case MLX4_CUNTERS_EXT: + nent = dev->caps.max_ext_counters; + break; + default: + return -ENOENT; + } + err = mlx4_bitmap_init(&priv->counters_bitmap, nent, nent - 1, 0, 0); + if (err) + return err; + + return 0; +} + +static void mlx4_cleanup_counters_table(struct mlx4_dev *dev) +{ + switch (dev->caps.counters_mode) { + case MLX4_CUNTERS_BASIC: + case MLX4_CUNTERS_EXT: + mlx4_bitmap_cleanup(&mlx4_priv(dev)->counters_bitmap); + break; + default: + break; + } +} + +int __mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + switch (dev->caps.counters_mode) 
{ + case MLX4_CUNTERS_BASIC: + case MLX4_CUNTERS_EXT: + *idx = mlx4_bitmap_alloc(&priv->counters_bitmap); + if (*idx == -1) + return -ENOMEM; + return 0; + default: + return -ENOMEM; + } +} + +int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx) +{ + u64 out_param; + int err; + + if (mlx4_is_mfunc(dev)) { + err = mlx4_cmd_imm(dev, 0, &out_param, RES_COUNTER, RES_OP_RESERVE, + MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, 0); + if (!err) + *idx = get_param_l(&out_param); + return err; + } + return __mlx4_counter_alloc(dev, idx); +} +EXPORT_SYMBOL_GPL(mlx4_counter_alloc); + +void __mlx4_counter_free(struct mlx4_dev *dev, u32 idx) +{ + switch (dev->caps.counters_mode) { + case MLX4_CUNTERS_BASIC: + case MLX4_CUNTERS_EXT: + mlx4_bitmap_free(&mlx4_priv(dev)->counters_bitmap, idx); + return; + default: + return; + } +} + +void mlx4_counter_free(struct mlx4_dev *dev, u32 idx) +{ + u64 in_param; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, idx); + if (mlx4_cmd(dev, in_param, RES_COUNTER, RES_OP_RESERVE, + MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, 0)) + mlx4_warn(dev, "Failed freeing counter: %d\n", idx); + return; + } + __mlx4_counter_free(dev, idx); +} +EXPORT_SYMBOL_GPL(mlx4_counter_free); + +void mlx4_slave_handle_guid(struct mlx4_dev *dev, int slave_id, u8 port_num, __be64 cur_ag) +{ + enum slave_port_state new_state; + enum slave_port_gen_event gen_event; + + mlx4_gen_guid_change_eqe(dev, slave_id, port_num); + + mlx4_dbg(dev, "%s: update slave number:%d, port %d, GUID: 0x%llx\n", __func__, + slave_id, port_num, cur_ag); + + if (MLX4_NOT_SET_GUID != cur_ag) { /* valid GUID */ + new_state = set_and_calc_slave_port_state(dev, slave_id, + port_num, + MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID, + &gen_event); + mlx4_dbg(dev, "%s: slave: %d, port:%d , new_port_state: %d, gen_event :%d\n", + __func__, slave_id, port_num, new_state, gen_event); + + if (SLAVE_PORT_GEN_EVENT_UP == gen_event) { + mlx4_dbg(dev, "%s: sending PORT_UP event to slave: %d, port:%d\n", + __func__, slave_id, port_num); + + mlx4_gen_port_state_change_eqe(dev, slave_id, port_num, + MLX4_PORT_CHANGE_SUBTYPE_ACTIVE); + } else + mlx4_dbg(dev, "%s: GOT: %d event to slave: %d, port:%d\n", + __func__, gen_event, slave_id, port_num); + + } else { /*Invalidate GUID*/ + set_and_calc_slave_port_state(dev, + slave_id, + port_num, + MLX4_PORT_STATE_IB_EVENT_GID_INVALID, + &gen_event); + mlx4_dbg(dev, "%s: sending MLX4_PORT_STATE_IB_EVENT_GID_INVALID" + " event to slave: %d, port:%d [got gen_event: %d]\n", + __func__, slave_id, port_num, gen_event); + mlx4_gen_port_state_change_eqe(dev, slave_id, port_num, MLX4_PORT_CHANGE_SUBTYPE_DOWN); + } +} +EXPORT_SYMBOL(mlx4_slave_handle_guid); + +static int mlx4_config_mad_demux(struct mlx4_dev *dev) +{ + struct mlx4_cmd_mailbox *mailbox; + int err = 0; + + /* Check if mad_demux is supported */ + if (!(dev->caps.mad_demux & 0x01)) + return 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + mlx4_warn(dev, "Failed to allocate mailbox for cmd MAD_IFC"); + return 1; + } + + /* Query mad_demux to find out which events can + be generated by the FW */ + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0x01 /* subn class */, + MLX4_CMD_MAD_DEMUX_QUERY_REST, MLX4_CMD_MAD_DEMUX, + MLX4_CMD_TIME_CLASS_B, 1); + if (err) { + mlx4_warn(dev, "Failed in mlx4_cmd_box of MLX4_CMD_MAD_DEMUX, " + "query restrictions"); + goto out; + } + + /* Config mad_demux */ + err = mlx4_cmd(dev, mailbox->dma, 0x01 /* subn class */, + MLX4_CMD_MAD_DEMUX_CONFIG, MLX4_CMD_MAD_DEMUX, + MLX4_CMD_TIME_CLASS_B, 1); 
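/*
 * Editor's annotation, not part of the patch: mlx4_config_mad_demux() above
 * issues two MAD_DEMUX firmware commands back to back -- first
 * MLX4_CMD_MAD_DEMUX_QUERY_REST to read the restrictions the firmware
 * reports, then MLX4_CMD_MAD_DEMUX_CONFIG to apply the configuration.  Only
 * when the CONFIG command succeeds does the error check below fall through
 * and set dev->is_internal_sma; on failure the function warns and jumps to
 * the common "out" label, where the mailbox is freed in either case.
 */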
+ if (err) { + mlx4_warn(dev, "Failed in mlx4_cmd_box of MLX4_CMD_MAD_DEMUX, " + "configure"); + goto out; + } + dev->is_internal_sma = 1; + +out: + mlx4_free_cmd_mailbox(dev, mailbox); return err; } @@ -855,7 +1540,7 @@ static int mlx4_setup_hca(struct mlx4_dev *dev) goto err_uar_table_free; } - priv->kar = ioremap((phys_addr_t) priv->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE); + priv->kar = ioremap(priv->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE); if (!priv->kar) { mlx4_err(dev, "Couldn't map kernel access region, " "aborting.\n"); @@ -870,11 +1555,26 @@ static int mlx4_setup_hca(struct mlx4_dev *dev) goto err_kar_unmap; } + err = mlx4_init_xrcd_table(dev); + if (err) { + mlx4_err(dev, "Failed to initialize extended " + "reliably connected domain table, aborting.\n"); + goto err_pd_table_free; + } + err = mlx4_init_mr_table(dev); if (err) { mlx4_err(dev, "Failed to initialize " "memory region table, aborting.\n"); - goto err_pd_table_free; + goto err_xrcd_table_free; + } + + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) { + err = mlx4_config_mad_demux(dev); + if (err) { + mlx4_err(dev, "Failed in config_mad_demux\n"); + goto err_mr_table_free; + } } err = mlx4_init_eq_table(dev); @@ -887,7 +1587,7 @@ static int mlx4_setup_hca(struct mlx4_dev *dev) err = mlx4_cmd_use_events(dev); if (err) { mlx4_err(dev, "Failed to switch to event-driven " - "firmware commands, aborting.\n"); + "firmware commands, aborting.\n"); goto err_eq_table_free; } @@ -938,29 +1638,45 @@ static int mlx4_setup_hca(struct mlx4_dev *dev) goto err_qp_table_free; } - for (port = 1; port <= dev->caps.num_ports; port++) { - enum mlx4_port_type port_type = 0; - mlx4_SENSE_PORT(dev, port, &port_type); - if (port_type) - dev->caps.port_type[port] = port_type; - ib_port_default_caps = 0; - err = mlx4_get_port_ib_caps(dev, port, &ib_port_default_caps); - if (err) - mlx4_warn(dev, "failed to get port %d default " - "ib capabilities (%d). Continuing with " - "caps = 0\n", port, err); - dev->caps.ib_port_def_cap[port] = ib_port_default_caps; - err = mlx4_SET_PORT(dev, port); - if (err) { - mlx4_err(dev, "Failed to set port %d, aborting\n", - port); - goto err_mcg_table_free; + err = mlx4_init_counters_table(dev); + if (err && err != -ENOENT) { + mlx4_err(dev, "Failed to initialize counters table, aborting.\n"); + goto err_mcg_table_free; + } + + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) { + int pkey_tbl_size; + for (port = 1; port <= dev->caps.num_ports; port++) { + ib_port_default_caps = 0; + pkey_tbl_size = -1; + err = mlx4_get_port_ib_caps(dev, port, &ib_port_default_caps); + if (err) + mlx4_warn(dev, "failed to get port %d default " + "ib capabilities (%d). 
Continuing with " + "caps = 0\n", port, err); + dev->caps.ib_port_def_cap[port] = ib_port_default_caps; + if (mlx4_is_master(dev)) { + int i; + for (i = 0; i < dev->num_slaves; i++) + if (i != dev->caps.function) + priv->mfunc.master.slave_state[i].ib_cap_mask[port] = + ib_port_default_caps; + pkey_tbl_size = dev->caps.pkey_table_len[port] - 1; + } + err = mlx4_SET_PORT(dev, port, pkey_tbl_size); + if (err) { + mlx4_err(dev, "Failed to set port %d, aborting\n", + port); + goto err_counters_table_free; + } } } - mlx4_set_port_mask(dev); return 0; +err_counters_table_free: + mlx4_cleanup_counters_table(dev); + err_mcg_table_free: mlx4_cleanup_mcg_table(dev); @@ -982,6 +1698,9 @@ err_eq_table_free: err_mr_table_free: mlx4_cleanup_mr_table(dev); +err_xrcd_table_free: + mlx4_cleanup_xrcd_table(dev); + err_pd_table_free: mlx4_cleanup_pd_table(dev); @@ -1000,15 +1719,13 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); struct msix_entry *entries; - int nreq = min_t(int, dev->caps.num_ports * - min_t(int, num_online_cpus() + 1, MAX_MSIX_P_PORT) - + MSIX_LEGACY_SZ, MAX_MSIX); + int nreq; int err; int i; if (msi_x) { nreq = min_t(int, dev->caps.num_eqs - dev->caps.reserved_eqs, - nreq); + num_online_cpus() + 1); entries = kcalloc(nreq, sizeof *entries, GFP_KERNEL); if (!entries) goto no_msi; @@ -1031,15 +1748,7 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev) goto no_msi; } - if (nreq < - MSIX_LEGACY_SZ + dev->caps.num_ports * MIN_MSIX_P_PORT) { - /*Working in legacy mode , all EQ's shared*/ - dev->caps.comp_pool = 0; - dev->caps.num_comp_vectors = nreq - 1; - } else { - dev->caps.comp_pool = nreq - MSIX_LEGACY_SZ; - dev->caps.num_comp_vectors = MSIX_LEGACY_SZ - 1; - } + dev->caps.num_comp_vectors = nreq - 1; for (i = 0; i < nreq; ++i) priv->eq_table.eq[i].irq = entries[i].vector; @@ -1051,7 +1760,6 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev) no_msi: dev->caps.num_comp_vectors = 1; - dev->caps.comp_pool = 0; for (i = 0; i < 2; ++i) priv->eq_table.eq[i].irq = dev->pdev->irq; @@ -1064,17 +1772,18 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port) info->dev = dev; info->port = port; - mlx4_init_mac_table(dev, &info->mac_table); - mlx4_init_vlan_table(dev, &info->vlan_table); - info->base_qpn = dev->caps.reserved_qps_base[MLX4_QP_REGION_ETH_ADDR] + + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) { + INIT_RADIX_TREE(&info->mac_tree, GFP_KERNEL); + mlx4_init_mac_table(dev, &info->mac_table); + mlx4_init_vlan_table(dev, &info->vlan_table); + info->base_qpn = dev->caps.reserved_qps_base[MLX4_QP_REGION_ETH_ADDR] + (port - 1) * (1 << log_num_mac); - + } sprintf(info->dev_name, "mlx4_port%d", port); info->port_attr.attr.name = info->dev_name; info->port_attr.attr.mode = S_IRUGO | S_IWUSR; info->port_attr.show = show_port_type; info->port_attr.store = set_port_type; - sysfs_attr_init(&info->port_attr.attr); err = device_create_file(&dev->pdev->dev, &info->port_attr); if (err) { @@ -1093,10 +1802,17 @@ static void mlx4_cleanup_port_info(struct mlx4_port_info *info) device_remove_file(&info->dev->pdev->dev, &info->port_attr); } +static int mlx4_init_trigger(struct mlx4_priv *priv) +{ + memcpy(&priv->trigger_attr, &dev_attr_port_trigger, + sizeof(struct device_attribute)); + return device_create_file(&priv->dev.pdev->dev, &priv->trigger_attr); +} + static int mlx4_init_steering(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); - int num_entries = dev->caps.num_ports; + int num_entries = max(dev->caps.num_ports, 
dev->caps.pf_num); int i, j; priv->steer = kzalloc(sizeof(struct mlx4_steer) * num_entries, GFP_KERNEL); @@ -1108,7 +1824,6 @@ static int mlx4_init_steering(struct mlx4_dev *dev) INIT_LIST_HEAD(&priv->steer[i].promisc_qps[j]); INIT_LIST_HEAD(&priv->steer[i].steer_entries[j]); } - INIT_LIST_HEAD(&priv->steer[i].high_prios); } return 0; } @@ -1118,7 +1833,7 @@ static void mlx4_clear_steering(struct mlx4_dev *dev) struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_steer_index *entry, *tmp_entry; struct mlx4_promisc_qp *pqp, *tmp_pqp; - int num_entries = dev->caps.num_ports; + int num_entries = max(dev->caps.num_ports, dev->caps.pf_num); int i, j; for (i = 0; i < num_entries; i++) { @@ -1146,14 +1861,21 @@ static void mlx4_clear_steering(struct mlx4_dev *dev) kfree(priv->steer); } +static int extended_func_num(struct pci_dev *pdev) +{ + return PCI_SLOT(pdev->devfn) * 8 + PCI_FUNC(pdev->devfn); +} + static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) { struct mlx4_priv *priv; struct mlx4_dev *dev; int err; int port; + int mfunc_cleaned_up = 0; - pr_info(DRV_NAME ": Initializing %s\n", pci_name(pdev)); + printk(KERN_INFO PFX "Initializing %s\n", + pci_name(pdev)); err = pci_enable_device(pdev); if (err) { @@ -1162,12 +1884,19 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) return err; } + /* Since we give to each VF two GUIDs, we can't support more than 63 VFs */ + if (sr_iov > MLX4_MAX_NUM_VF - 1) { + printk(KERN_ERR "There are more VF's(%d) than allowed(%d)\n",sr_iov, MLX4_MAX_NUM_VF - 1); + return -EINVAL; + } /* - * Check for BARs. We expect 0: 1MB + * Check for BARs. */ - if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) || - pci_resource_len(pdev, 0) != 1 << 20) { - dev_err(&pdev->dev, "Missing DCS, aborting.\n"); + if (((id == NULL) || !(id->driver_data & MLX4_VF)) && + !(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { + dev_err(&pdev->dev, "Missing DCS, aborting.(id == 0X%p, id->driver_data: 0x%lx," + " pci_resource_flags(pdev, 0):0x%lx)\n", + id, id ? 
id->driver_data : 0, pci_resource_flags(pdev, 0)); err = -ENODEV; goto err_disable_pdev; } @@ -1177,12 +1906,18 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) goto err_disable_pdev; } - err = pci_request_regions(pdev, DRV_NAME); + err = pci_request_region(pdev, 0, DRV_NAME); if (err) { - dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n"); + dev_err(&pdev->dev, "Cannot request control region (err:0X%x), aborting.\n", err); goto err_disable_pdev; } + err = pci_request_region(pdev, 2, DRV_NAME); + if (err) { + dev_err(&pdev->dev, "Cannot request UAR region (err:0X%x), aborting.\n", err); + goto err_release_bar0; + } + pci_set_master(pdev); err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); @@ -1191,7 +1926,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (err) { dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n"); - goto err_release_regions; + goto err_release_bar2; } } err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); @@ -1202,19 +1937,16 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) if (err) { dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, " "aborting.\n"); - goto err_release_regions; + goto err_release_bar2; } } - /* Allow large DMA segments, up to the firmware limit of 1 GB */ - dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024); - priv = kzalloc(sizeof *priv, GFP_KERNEL); if (!priv) { dev_err(&pdev->dev, "Device struct alloc failed, " "aborting.\n"); err = -ENOMEM; - goto err_release_regions; + goto err_release_bar2; } dev = &priv->dev; @@ -1230,43 +1962,122 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) INIT_LIST_HEAD(&priv->bf_list); mutex_init(&priv->bf_mutex); - dev->rev_id = pdev->revision; + /* Detect if this device is a virtual function */ + if (id && id->driver_data & MLX4_VF) { + /* When acting as pf, we normally skip vfs unless explicitly + * requested to probe them. */ + if (sr_iov && extended_func_num(pdev) > probe_vf) { + mlx4_warn(dev, "Skipping virtual function:%d\n", + extended_func_num(pdev)); + err = -ENODEV; + goto err_free_dev; + } + mlx4_warn(dev, "Detected virtual function - running in slave mode\n"); + dev->flags |= MLX4_FLAG_MFUNC; + } + + /* We reset the device and enable SRIOV only for physical devices */ + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) { + /* Claim ownership on the device, + * if already taken, act as slave*/ + err = mlx4_get_ownership(dev); + if (err) { + if (err < 0) + goto err_free_dev; + else { + err = 0; + dev->flags |= MLX4_FLAG_MFUNC; + dev->flags &= ~MLX4_FLAG_MASTER; + goto slave_start; + } + } - /* - * Now reset the HCA before we touch the PCI capabilities or - * attempt a firmware command, since a boot ROM may have left - * the HCA in an undefined state. - */ - err = mlx4_reset(dev); - if (err) { - mlx4_err(dev, "Failed to reset HCA, aborting.\n"); - goto err_free_dev; + if (sr_iov) { + mlx4_warn(dev, "Enabling sriov with:%d vfs\n", sr_iov); + if (pci_enable_sriov(pdev, sr_iov)) { + mlx4_err(dev, "Failed to enable sriov, aborting.\n"); + goto err_rel_own; + } + mlx4_warn(dev, "Running in master mode\n"); + dev->flags |= MLX4_FLAG_SRIOV | MLX4_FLAG_MASTER; + dev->sr_iov = sr_iov; + } + + /* + * Now reset the HCA before we touch the PCI capabilities or + * attempt a firmware command, since a boot ROM may have left + * the HCA in an undefined state. 
+ */ + err = mlx4_reset(dev); + if (err) { + mlx4_err(dev, "Failed to reset HCA, aborting.\n"); + goto err_sriov; + } } +slave_start: if (mlx4_cmd_init(dev)) { mlx4_err(dev, "Failed to init command interface, aborting.\n"); - goto err_free_dev; + goto err_sriov; + } + + /* In slave functions, the communication channel must be initialized before + * posting commands. Also, init num_slaves before calling mlx4_init_hca */ + if (mlx4_is_mfunc(dev)) { + if(mlx4_is_master(dev)) + dev->num_slaves = MLX4_MAX_NUM_SLAVES; + else { + dev->num_slaves = 0; + if (mlx4_multi_func_init(dev)) { + mlx4_err(dev, "Failed to init slave mfunc" + " interface, aborting.\n"); + goto err_cmd; + } + } } err = mlx4_init_hca(dev); - if (err) - goto err_cmd; + if (err) { + if (err == -EACCES) { + /* Not primary Physical function + * Running in slave mode */ + mlx4_cmd_cleanup(dev); + dev->flags |= MLX4_FLAG_MFUNC; + dev->flags &= ~MLX4_FLAG_MASTER; + goto slave_start; + } else + goto err_mfunc; + } + + /* In master functions, the communication channel must be initialized after obtaining + * its address from fw */ + if (mlx4_is_mfunc(dev) && mlx4_is_master(dev)) { + if (mlx4_multi_func_init(dev)) { + mlx4_err(dev, "Failed to init master mfunc interface, aborting.\n"); + goto err_close; + } + } err = mlx4_alloc_eq_table(dev); if (err) - goto err_close; - - priv->msix_ctl.pool_bm = 0; - spin_lock_init(&priv->msix_ctl.pool_lock); + goto err_master_mfunc; mlx4_enable_msi_x(dev); - - err = mlx4_init_steering(dev); - if (err) + if ((mlx4_is_mfunc(dev) && !mlx4_is_master(dev)) && + !(dev->flags & MLX4_FLAG_MSI_X)) { + mlx4_err(dev, "INTx is not supported in slave mode, aborting.\n"); goto err_free_eq; + } + + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) { + err = mlx4_init_steering(dev); + if (err) + goto err_free_eq; + } err = mlx4_setup_hca(dev); - if (err == -EBUSY && (dev->flags & MLX4_FLAG_MSI_X)) { + if (err == -EBUSY && (dev->flags & MLX4_FLAG_MSI_X) && + (!mlx4_is_mfunc(dev) || mlx4_is_master(dev))) { dev->flags &= ~MLX4_FLAG_MSI_X; pci_disable_msix(pdev); err = mlx4_setup_hca(dev); @@ -1285,17 +2096,35 @@ static int __mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) if (err) goto err_port; - mlx4_sense_init(dev); + err = mlx4_init_trigger(priv); + if (err) + goto err_register; + + err = mlx4_sense_init(dev); + if (err) + goto err_trigger; + mlx4_start_sense(dev); pci_set_drvdata(pdev, dev); + err = mlx4_rtt_init(dev); + if (err) + goto err_sense; + return 0; +err_sense: + mlx4_sense_cleanup(dev); +err_trigger: + device_remove_file(&dev->pdev->dev, &priv->trigger_attr); +err_register: + mlx4_unregister_device(dev); err_port: for (--port; port >= 1; --port) mlx4_cleanup_port_info(&priv->port[port]); + mlx4_cleanup_counters_table(dev); mlx4_cleanup_mcg_table(dev); mlx4_cleanup_qp_table(dev); mlx4_cleanup_srq_table(dev); @@ -1303,29 +2132,52 @@ err_port: mlx4_cmd_use_polling(dev); mlx4_cleanup_eq_table(dev); mlx4_cleanup_mr_table(dev); + mlx4_cleanup_xrcd_table(dev); mlx4_cleanup_pd_table(dev); mlx4_cleanup_uar_table(dev); err_steer: - mlx4_clear_steering(dev); + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) + mlx4_clear_steering(dev); err_free_eq: mlx4_free_eq_table(dev); +err_master_mfunc: + if (mlx4_is_mfunc(dev) && mlx4_is_master(dev)) + mlx4_multi_func_cleanup(dev); + + mfunc_cleaned_up = 1; + err_close: if (dev->flags & MLX4_FLAG_MSI_X) pci_disable_msix(pdev); mlx4_close_hca(dev); +err_mfunc: + if (!mfunc_cleaned_up && mlx4_is_mfunc(dev) && !mlx4_is_master(dev)) + 
mlx4_multi_func_cleanup(dev); + err_cmd: mlx4_cmd_cleanup(dev); +err_sriov: + if (sr_iov && (dev->flags & MLX4_FLAG_SRIOV)) + pci_disable_sriov(pdev); + +err_rel_own: + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) + mlx4_free_ownership(dev); + err_free_dev: kfree(priv); -err_release_regions: - pci_release_regions(pdev); +err_release_bar2: + pci_release_region(pdev, 2); + +err_release_bar0: + pci_release_region(pdev, 0); err_disable_pdev: pci_disable_device(pdev); @@ -1336,7 +2188,12 @@ err_disable_pdev: static int __devinit mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) { - printk_once(KERN_INFO "%s", mlx4_version); + static int mlx4_version_printed; + + if (!mlx4_version_printed) { + printk(KERN_INFO "%s", mlx4_version); + ++mlx4_version_printed; + } return __mlx4_init_one(pdev, id); } @@ -1348,14 +2205,22 @@ static void mlx4_remove_one(struct pci_dev *pdev) int p; if (dev) { - mlx4_stop_sense(dev); + /*in SRIOV it is not allowed to unload the ppf's driver when there is alive vf's*/ + if (mlx4_is_mfunc(dev) && mlx4_is_master(dev)) { + if (mlx4_how_many_lives_vf(dev)) + printk(KERN_ERR "Removing PPF when there are assinged VF's !!!\n"); + } + mlx4_rtt_cleanup(dev); + mlx4_sense_cleanup(dev); mlx4_unregister_device(dev); + device_remove_file(&dev->pdev->dev, &priv->trigger_attr); for (p = 1; p <= dev->caps.num_ports; p++) { mlx4_cleanup_port_info(&priv->port[p]); mlx4_CLOSE_PORT(dev, p); } + mlx4_cleanup_counters_table(dev); mlx4_cleanup_mcg_table(dev); mlx4_cleanup_qp_table(dev); mlx4_cleanup_srq_table(dev); @@ -1363,21 +2228,41 @@ static void mlx4_remove_one(struct pci_dev *pdev) mlx4_cmd_use_polling(dev); mlx4_cleanup_eq_table(dev); mlx4_cleanup_mr_table(dev); + mlx4_cleanup_xrcd_table(dev); mlx4_cleanup_pd_table(dev); + if (mlx4_is_mfunc(dev) && mlx4_is_master(dev)) { + mlx4_free_resource_tracker(dev); + } + + if (mlx4_is_mfunc(dev) && !mlx4_is_master(dev)) + mlx4_fmr_slave_context_term(dev); + iounmap(priv->kar); mlx4_uar_free(dev, &priv->driver_uar); mlx4_cleanup_uar_table(dev); - mlx4_clear_steering(dev); + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) + mlx4_clear_steering(dev); mlx4_free_eq_table(dev); + if (mlx4_is_mfunc(dev) && mlx4_is_master(dev)) + mlx4_multi_func_cleanup(dev); mlx4_close_hca(dev); + if (mlx4_is_mfunc(dev) && !mlx4_is_master(dev)) + mlx4_multi_func_cleanup(dev); mlx4_cmd_cleanup(dev); if (dev->flags & MLX4_FLAG_MSI_X) pci_disable_msix(pdev); + if (sr_iov && (dev->flags & MLX4_FLAG_SRIOV)) { + mlx4_warn(dev, "Disabling sriov\n"); + pci_disable_sriov(pdev); + } + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) + mlx4_free_ownership(dev); kfree(priv); - pci_release_regions(pdev); + pci_release_region(pdev, 2); + pci_release_region(pdev, 0); pci_disable_device(pdev); pci_set_drvdata(pdev, NULL); } @@ -1389,34 +2274,52 @@ int mlx4_restart_one(struct pci_dev *pdev) return __mlx4_init_one(pdev, NULL); } -static DEFINE_PCI_DEVICE_TABLE(mlx4_pci_table) = { - { PCI_VDEVICE(MELLANOX, 0x6340) }, /* MT25408 "Hermon" SDR */ - { PCI_VDEVICE(MELLANOX, 0x634a) }, /* MT25408 "Hermon" DDR */ - { PCI_VDEVICE(MELLANOX, 0x6354) }, /* MT25408 "Hermon" QDR */ - { PCI_VDEVICE(MELLANOX, 0x6732) }, /* MT25408 "Hermon" DDR PCIe gen2 */ - { PCI_VDEVICE(MELLANOX, 0x673c) }, /* MT25408 "Hermon" QDR PCIe gen2 */ - { PCI_VDEVICE(MELLANOX, 0x6368) }, /* MT25408 "Hermon" EN 10GigE */ - { PCI_VDEVICE(MELLANOX, 0x6750) }, /* MT25408 "Hermon" EN 10GigE PCIe gen2 */ - { PCI_VDEVICE(MELLANOX, 0x6372) }, /* MT25458 ConnectX EN 10GBASE-T 10GigE */ - { PCI_VDEVICE(MELLANOX, 
0x675a) }, /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */ - { PCI_VDEVICE(MELLANOX, 0x6764) }, /* MT26468 ConnectX EN 10GigE PCIe gen2*/ - { PCI_VDEVICE(MELLANOX, 0x6746) }, /* MT26438 ConnectX EN 40GigE PCIe gen2 5GT/s */ - { PCI_VDEVICE(MELLANOX, 0x676e) }, /* MT26478 ConnectX2 40GigE PCIe gen2 */ - { PCI_VDEVICE(MELLANOX, 0x1002) }, /* MT25400 Family [ConnectX-2 Virtual Function] */ - { PCI_VDEVICE(MELLANOX, 0x1003) }, /* MT27500 Family [ConnectX-3] */ - { PCI_VDEVICE(MELLANOX, 0x1004) }, /* MT27500 Family [ConnectX-3 Virtual Function] */ - { PCI_VDEVICE(MELLANOX, 0x1005) }, /* MT27510 Family */ - { PCI_VDEVICE(MELLANOX, 0x1006) }, /* MT27511 Family */ - { PCI_VDEVICE(MELLANOX, 0x1007) }, /* MT27520 Family */ - { PCI_VDEVICE(MELLANOX, 0x1008) }, /* MT27521 Family */ - { PCI_VDEVICE(MELLANOX, 0x1009) }, /* MT27530 Family */ - { PCI_VDEVICE(MELLANOX, 0x100a) }, /* MT27531 Family */ - { PCI_VDEVICE(MELLANOX, 0x100b) }, /* MT27540 Family */ - { PCI_VDEVICE(MELLANOX, 0x100c) }, /* MT27541 Family */ - { PCI_VDEVICE(MELLANOX, 0x100d) }, /* MT27550 Family */ - { PCI_VDEVICE(MELLANOX, 0x100e) }, /* MT27551 Family */ - { PCI_VDEVICE(MELLANOX, 0x100f) }, /* MT27560 Family */ - { PCI_VDEVICE(MELLANOX, 0x1010) }, /* MT27561 Family */ +int mlx4_gid_idx_to_slave(struct mlx4_dev *dev, int gid_index) +{ + return gid_index % (dev->sr_iov + 1); +} +EXPORT_SYMBOL_GPL(mlx4_gid_idx_to_slave); + +static struct pci_device_id mlx4_pci_table[] = { + { MLX4_VDEVICE(MELLANOX, 0x6340, 0) }, /* MT25408 "Hermon" SDR */ + { MLX4_VDEVICE(MELLANOX, 0x6341, MLX4_VF) }, /* MT25408 "Hermon" SDR VF */ + { MLX4_VDEVICE(MELLANOX, 0x634a, 0) }, /* MT25408 "Hermon" DDR */ + { MLX4_VDEVICE(MELLANOX, 0x634b, MLX4_VF) }, /* MT25408 "Hermon" DDR VF */ + { MLX4_VDEVICE(MELLANOX, 0x6354, 0) }, /* MT25408 "Hermon" QDR */ + { MLX4_VDEVICE(MELLANOX, 0x6732, 0) }, /* MT25408 "Hermon" DDR PCIe gen2 */ + { MLX4_VDEVICE(MELLANOX, 0x6733, MLX4_VF) }, /* MT25408 "Hermon" DDR PCIe gen2 VF */ + { MLX4_VDEVICE(MELLANOX, 0x673c, 0) }, /* MT25408 "Hermon" QDR PCIe gen2 */ + { MLX4_VDEVICE(MELLANOX, 0x673d, MLX4_VF) }, /* MT25408 "Hermon" QDR PCIe gen2 VF */ + { MLX4_VDEVICE(MELLANOX, 0x6368, 0) }, /* MT25408 "Hermon" EN 10GigE */ + { MLX4_VDEVICE(MELLANOX, 0x6369, MLX4_VF) }, /* MT25408 "Hermon" EN 10GigE VF */ + { MLX4_VDEVICE(MELLANOX, 0x6750, 0) }, /* MT25408 "Hermon" EN 10GigE PCIe gen2 */ + { MLX4_VDEVICE(MELLANOX, 0x6751, MLX4_VF) }, /* MT25408 "Hermon" EN 10GigE PCIe gen2 VF */ + { MLX4_VDEVICE(MELLANOX, 0x6372, 0) }, /* MT25458 ConnectX EN 10GBASE-T 10GigE */ + { MLX4_VDEVICE(MELLANOX, 0x6373, MLX4_VF) }, /* MT25458 ConnectX EN 10GBASE-T 10GigE */ + { MLX4_VDEVICE(MELLANOX, 0x675a, 0) }, /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */ + { MLX4_VDEVICE(MELLANOX, 0x675b, MLX4_VF) }, /* MT25458 ConnectX EN 10GBASE-T+Gen2 10GigE */ + { MLX4_VDEVICE(MELLANOX, 0x6764, 0) }, /* MT26468 ConnectX EN 10GigE PCIe gen2*/ + { MLX4_VDEVICE(MELLANOX, 0x6765, MLX4_VF) }, /* MT26468 ConnectX EN 10GigE PCIe gen2 VF*/ + { MLX4_VDEVICE(MELLANOX, 0x6746, 0) }, /* MT26438 ConnectX VPI PCIe 2.0 5GT/s - IB QDR / 10GigE Virt+ */ + { MLX4_VDEVICE(MELLANOX, 0x6747, MLX4_VF) }, /* MT26438 ConnectX VPI PCIe 2.0 5GT/s - IB QDR / 10GigE Virt+ VF*/ + { MLX4_VDEVICE(MELLANOX, 0x676e, 0) }, /* MT26478 ConnectX EN 40GigE PCIe 2.0 5GT/s */ + { MLX4_VDEVICE(MELLANOX, 0x676f, MLX4_VF) }, /* MT26478 ConnectX EN 40GigE PCIe 2.0 5GT/s VF*/ + { MLX4_VDEVICE(MELLANOX, 0x6778, 0) }, /* MT26488 ConnectX VPI PCIe 2.0 5GT/s - IB DDR / 10GigE Virt+ */ + { MLX4_VDEVICE(MELLANOX, 0x6779, 
MLX4_VF) }, /* MT26488 ConnectX VPI PCIe 2.0 5GT/s - IB DDR / 10GigE Virt+ VF*/ + { MLX4_VDEVICE(MELLANOX, 0x1002, MLX4_VF) }, /* ConnectX-2 Virtual Function */ + { MLX4_VDEVICE(MELLANOX, 0x1003, 0) }, /* ConnectX-3 */ + { MLX4_VDEVICE(MELLANOX, 0x1004, MLX4_VF) }, /* ConnectX-3 Virtual Function */ + { MLX4_VDEVICE(MELLANOX, 0x1005, 0) }, + { MLX4_VDEVICE(MELLANOX, 0x1006, 0) }, + { MLX4_VDEVICE(MELLANOX, 0x1007, 0) }, + { MLX4_VDEVICE(MELLANOX, 0x1008, 0) }, + { MLX4_VDEVICE(MELLANOX, 0x1009, 0) }, + { MLX4_VDEVICE(MELLANOX, 0x100a, 0) }, + { MLX4_VDEVICE(MELLANOX, 0x100b, 0) }, + { MLX4_VDEVICE(MELLANOX, 0x100c, 0) }, + { MLX4_VDEVICE(MELLANOX, 0x100d, 0) }, + { MLX4_VDEVICE(MELLANOX, 0x100e, 0) }, + { MLX4_VDEVICE(MELLANOX, 0x100f, 0) }, { 0, } }; @@ -1432,31 +2335,29 @@ static struct pci_driver mlx4_driver = { static int __init mlx4_verify_params(void) { if ((log_num_mac < 0) || (log_num_mac > 7)) { - pr_warning("mlx4_core: bad num_mac: %d\n", log_num_mac); - return -1; - } - - if ((log_num_vlan < 0) || (log_num_vlan > 7)) { - pr_warning("mlx4_core: bad num_vlan: %d\n", log_num_vlan); + printk(KERN_WARNING "mlx4_core: bad num_mac: %d\n", log_num_mac); return -1; } if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 7)) { - pr_warning("mlx4_core: bad log_mtts_per_seg: %d\n", log_mtts_per_seg); + printk(KERN_WARNING "mlx4_core: bad log_mtts_per_seg: %d\n", log_mtts_per_seg); return -1; } return 0; } - static int __init mlx4_init(void) { int ret; + mutex_init(&drv_mutex); + if (mlx4_verify_params()) return -EINVAL; mlx4_catas_init(); + mlx4_fmr_master_init(); + mlx4_fmr_slave_init(); mlx4_wq = create_singlethread_workqueue("mlx4"); if (!mlx4_wq) @@ -1468,9 +2369,13 @@ static int __init mlx4_init(void) static void __exit mlx4_cleanup(void) { + mutex_lock(&drv_mutex); + mlx4_config_cleanup(); pci_unregister_driver(&mlx4_driver); + mutex_unlock(&drv_mutex); destroy_workqueue(mlx4_wq); } module_init(mlx4_init); module_exit(mlx4_cleanup); + diff --git a/drivers/net/mlx4/mcg.c b/drivers/net/mlx4/mcg.c index cd1784593a3c9..d1a0084f79836 100644 --- a/drivers/net/mlx4/mcg.c +++ b/drivers/net/mlx4/mcg.c @@ -31,8 +31,9 @@ * SOFTWARE. 
*/ +#include #include -#include +#include #include @@ -41,30 +42,45 @@ #define MGM_QPN_MASK 0x00FFFFFF #define MGM_BLCK_LB_BIT 30 -static const u8 zero_gid[16]; /* automatically initialized to 0 */ +struct mlx4_mgm { + __be32 next_gid_index; + __be32 members_count; + u32 reserved[2]; + u8 gid[16]; + __be32 qp[0];/*The array will be overide from the mailbox*/ +}; + +int mlx4_get_mgm_entry_size(struct mlx4_dev *dev) +{ + if (mlx4_is_mfunc(dev) && mlx4_is_master(dev)) + return min((1 << mlx4_log_num_mgm_entry_size), MLX4_MAX_MGM_ENTRY_SIZE); + return MLX4_MGM_ENTRY_SIZE; +} + +int mlx4_get_qp_per_mgm(struct mlx4_dev *dev) +{ + return 4 * (mlx4_get_mgm_entry_size(dev) / 16 - 2); +} static int mlx4_READ_ENTRY(struct mlx4_dev *dev, int index, struct mlx4_cmd_mailbox *mailbox) { - return mlx4_cmd_box(dev, 0, mailbox->dma, index, 0, MLX4_CMD_READ_MCG, - MLX4_CMD_TIME_CLASS_A); + return mlx4_cmd_box(dev, 0, mailbox->dma, index, 0, + MLX4_CMD_READ_MCG, MLX4_CMD_TIME_CLASS_A, 1); } static int mlx4_WRITE_ENTRY(struct mlx4_dev *dev, int index, struct mlx4_cmd_mailbox *mailbox) { - return mlx4_cmd(dev, mailbox->dma, index, 0, MLX4_CMD_WRITE_MCG, - MLX4_CMD_TIME_CLASS_A); + return mlx4_cmd(dev, mailbox->dma, index, 0, + MLX4_CMD_WRITE_MCG, MLX4_CMD_TIME_CLASS_A, 1); } -static int mlx4_WRITE_PROMISC(struct mlx4_dev *dev, u8 vep_num, u8 port, u8 steer, +static int mlx4_WRITE_PROMISC(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox) { - u32 in_mod; - - in_mod = (u32) vep_num << 24 | (u32) port << 16 | steer << 1; - return mlx4_cmd(dev, mailbox->dma, in_mod, 0x1, - MLX4_CMD_WRITE_MCG, MLX4_CMD_TIME_CLASS_A); + return mlx4_cmd(dev, mailbox->dma, 0, 0x1, + MLX4_CMD_WRITE_MCG, MLX4_CMD_TIME_CLASS_A, 1); } static int mlx4_GID_HASH(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, @@ -74,7 +90,7 @@ static int mlx4_GID_HASH(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, int err; err = mlx4_cmd_imm(dev, mailbox->dma, &imm, 0, op_mod, - MLX4_CMD_MGID_HASH, MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_MGID_HASH, MLX4_CMD_TIME_CLASS_A, 1); if (!err) *hash = imm; @@ -82,6 +98,11 @@ static int mlx4_GID_HASH(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, return err; } +/* + * Helper functions to manage multifunction steering data structures. + * Used only for Ethernet steering. + */ + static struct mlx4_promisc_qp *get_promisc_qp(struct mlx4_dev *dev, u8 pf_num, enum mlx4_steer_type steer, u32 qpn) @@ -101,11 +122,11 @@ static struct mlx4_promisc_qp *get_promisc_qp(struct mlx4_dev *dev, u8 pf_num, * Add new entry to steering data structure. * All promisc QPs should be added as well */ -static int new_steering_entry(struct mlx4_dev *dev, u8 vep_num, u8 port, +static int new_steering_entry(struct mlx4_dev *dev, u8 pf_num, enum mlx4_steer_type steer, unsigned int index, u32 qpn) { - struct mlx4_steer *s_steer; + struct mlx4_steer *s_steer = &mlx4_priv(dev)->steer[pf_num]; struct mlx4_cmd_mailbox *mailbox; struct mlx4_mgm *mgm; u32 members_count; @@ -114,10 +135,7 @@ static int new_steering_entry(struct mlx4_dev *dev, u8 vep_num, u8 port, struct mlx4_promisc_qp *dqp = NULL; u32 prot; int err; - u8 pf_num; - pf_num = (dev->caps.num_ports == 1) ? 
vep_num : (vep_num << 1) | (port - 1); - s_steer = &mlx4_priv(dev)->steer[pf_num]; new_entry = kzalloc(sizeof *new_entry, GFP_KERNEL); if (!new_entry) return -ENOMEM; @@ -164,7 +182,7 @@ static int new_steering_entry(struct mlx4_dev *dev, u8 vep_num, u8 port, /* don't add already existing qpn */ if (pqp->qpn == qpn) continue; - if (members_count == MLX4_QP_PER_MGM) { + if (members_count == mlx4_get_qp_per_mgm(dev)) { /* out of space */ err = -ENOMEM; goto out_mailbox; @@ -192,18 +210,14 @@ out_alloc: } /* update the data structures with existing steering entry */ -static int existing_steering_entry(struct mlx4_dev *dev, u8 vep_num, u8 port, +static int existing_steering_entry(struct mlx4_dev *dev, u8 pf_num, enum mlx4_steer_type steer, unsigned int index, u32 qpn) { - struct mlx4_steer *s_steer; + struct mlx4_steer *s_steer = &mlx4_priv(dev)->steer[pf_num]; struct mlx4_steer_index *tmp_entry, *entry = NULL; struct mlx4_promisc_qp *pqp; struct mlx4_promisc_qp *dqp; - u8 pf_num; - - pf_num = (dev->caps.num_ports == 1) ? vep_num : (vep_num << 1) | (port - 1); - s_steer = &mlx4_priv(dev)->steer[pf_num]; pqp = get_promisc_qp(dev, pf_num, steer, qpn); if (!pqp) @@ -222,9 +236,9 @@ static int existing_steering_entry(struct mlx4_dev *dev, u8 vep_num, u8 port, /* the given qpn is listed as a promisc qpn * we need to add it as a duplicate to this entry - * for future references */ + * for future refernce */ list_for_each_entry(dqp, &entry->duplicates, list) { - if (qpn == dqp->qpn) + if (dqp->qpn == pqp->qpn) return 0; /* qp is already duplicated */ } @@ -240,17 +254,13 @@ static int existing_steering_entry(struct mlx4_dev *dev, u8 vep_num, u8 port, /* Check whether a qpn is a duplicate on steering entry * If so, it should not be removed from mgm */ -static bool check_duplicate_entry(struct mlx4_dev *dev, u8 vep_num, u8 port, +static bool check_duplicate_entry(struct mlx4_dev *dev, u8 pf_num, enum mlx4_steer_type steer, unsigned int index, u32 qpn) { - struct mlx4_steer *s_steer; + struct mlx4_steer *s_steer = &mlx4_priv(dev)->steer[pf_num]; struct mlx4_steer_index *tmp_entry, *entry = NULL; struct mlx4_promisc_qp *dqp, *tmp_dqp; - u8 pf_num; - - pf_num = (dev->caps.num_ports == 1) ? vep_num : (vep_num << 1) | (port - 1); - s_steer = &mlx4_priv(dev)->steer[pf_num]; /* if qp is not promisc, it cannot be duplicated */ if (!get_promisc_qp(dev, pf_num, steer, qpn)) @@ -278,11 +288,11 @@ static bool check_duplicate_entry(struct mlx4_dev *dev, u8 vep_num, u8 port, } /* I a steering entry contains only promisc QPs, it can be removed. */ -static bool can_remove_steering_entry(struct mlx4_dev *dev, u8 vep_num, u8 port, +static bool can_remove_steering_entry(struct mlx4_dev *dev, u8 pf_num, enum mlx4_steer_type steer, unsigned int index, u32 tqpn) { - struct mlx4_steer *s_steer; + struct mlx4_steer *s_steer = &mlx4_priv(dev)->steer[pf_num]; struct mlx4_cmd_mailbox *mailbox; struct mlx4_mgm *mgm; struct mlx4_steer_index *entry = NULL, *tmp_entry; @@ -290,10 +300,6 @@ static bool can_remove_steering_entry(struct mlx4_dev *dev, u8 vep_num, u8 port, u32 members_count; bool ret = false; int i; - u8 pf_num; - - pf_num = (dev->caps.num_ports == 1) ? 
vep_num : (vep_num << 1) | (port - 1); - s_steer = &mlx4_priv(dev)->steer[pf_num]; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) @@ -311,18 +317,12 @@ static bool can_remove_steering_entry(struct mlx4_dev *dev, u8 vep_num, u8 port, } } /* All the qps currently registered for this entry are promiscuous, - * Checking for duplicates */ + * it can be removed */ ret = true; list_for_each_entry_safe(entry, tmp_entry, &s_steer->steer_entries[steer], list) { if (entry->index == index) { - if (list_empty(&entry->duplicates)) { - list_del(&entry->list); - kfree(entry); - } else { - /* This entry contains duplicates so it shouldn't be removed */ - ret = false; - goto out; - } + list_del(&entry->list); + kfree(entry); } } @@ -331,10 +331,11 @@ out: return ret; } -static int add_promisc_qp(struct mlx4_dev *dev, u8 vep_num, u8 port, + +static int add_promisc_qp(struct mlx4_dev *dev, u8 pf_num, enum mlx4_steer_type steer, u32 qpn) { - struct mlx4_steer *s_steer; + struct mlx4_steer *s_steer = &mlx4_priv(dev)->steer[pf_num]; struct mlx4_cmd_mailbox *mailbox; struct mlx4_mgm *mgm; struct mlx4_steer_index *entry; @@ -346,23 +347,13 @@ static int add_promisc_qp(struct mlx4_dev *dev, u8 vep_num, u8 port, bool found; int last_index; int err; - u8 pf_num; - struct mlx4_priv *priv = mlx4_priv(dev); - pf_num = (dev->caps.num_ports == 1) ? vep_num : (vep_num << 1) | (port - 1); - s_steer = &mlx4_priv(dev)->steer[pf_num]; - - mutex_lock(&priv->mcg_table.mutex); - if (get_promisc_qp(dev, pf_num, steer, qpn)) { - err = 0; /* Noting to do, already exists */ - goto out_mutex; - } + if (get_promisc_qp(dev, pf_num, steer, qpn)) + return 0; /* Noting to do, already exists */ pqp = kmalloc(sizeof *pqp, GFP_KERNEL); - if (!pqp) { - err = -ENOMEM; - goto out_mutex; - } + if (!pqp) + return -ENOMEM; pqp->qpn = qpn; mailbox = mlx4_alloc_cmd_mailbox(dev); @@ -396,7 +387,7 @@ static int add_promisc_qp(struct mlx4_dev *dev, u8 vep_num, u8 port, } if (!found) { /* Need to add the qpn to mgm */ - if (members_count == MLX4_QP_PER_MGM) { + if (members_count == mlx4_get_qp_per_mgm(dev)) { /* entry is full */ err = -ENOMEM; goto out_mailbox; @@ -414,35 +405,34 @@ static int add_promisc_qp(struct mlx4_dev *dev, u8 vep_num, u8 port, list_add_tail(&pqp->list, &s_steer->promisc_qps[steer]); /* now need to add all the promisc qps to default entry */ memset(mgm, 0, sizeof *mgm); + mgm->gid[7] = pf_num << 4; + mgm->next_gid_index = cpu_to_be32(1 << 4); members_count = 0; list_for_each_entry(dqp, &s_steer->promisc_qps[steer], list) mgm->qp[members_count++] = cpu_to_be32(dqp->qpn & MGM_QPN_MASK); mgm->members_count = cpu_to_be32(members_count | MLX4_PROT_ETH << 30); - err = mlx4_WRITE_PROMISC(dev, vep_num, port, steer, mailbox); + err = mlx4_WRITE_PROMISC(dev, mailbox); if (err) goto out_list; mlx4_free_cmd_mailbox(dev, mailbox); - mutex_unlock(&priv->mcg_table.mutex); return 0; out_list: list_del(&pqp->list); out_mailbox: + /* TODO: undo partial addition of promisc qps */ mlx4_free_cmd_mailbox(dev, mailbox); out_alloc: kfree(pqp); -out_mutex: - mutex_unlock(&priv->mcg_table.mutex); return err; } -static int remove_promisc_qp(struct mlx4_dev *dev, u8 vep_num, u8 port, +static int remove_promisc_qp(struct mlx4_dev *dev, u8 pf_num, enum mlx4_steer_type steer, u32 qpn) { - struct mlx4_priv *priv = mlx4_priv(dev); - struct mlx4_steer *s_steer; + struct mlx4_steer *s_steer = &mlx4_priv(dev)->steer[pf_num]; struct mlx4_cmd_mailbox *mailbox; struct mlx4_mgm *mgm; struct mlx4_steer_index *entry; @@ -453,18 +443,12 @@ static int 
remove_promisc_qp(struct mlx4_dev *dev, u8 vep_num, u8 port, bool back_to_list = false; int loc, i; int err; - u8 pf_num; - - pf_num = (dev->caps.num_ports == 1) ? vep_num : (vep_num << 1) | (port - 1); - s_steer = &mlx4_priv(dev)->steer[pf_num]; - mutex_lock(&priv->mcg_table.mutex); pqp = get_promisc_qp(dev, pf_num, steer, qpn); if (unlikely(!pqp)) { mlx4_warn(dev, "QP %x is not promiscuous QP\n", qpn); /* nothing to do */ - err = 0; - goto out_mutex; + return 0; } /*remove from list of promisc qps */ @@ -477,13 +461,17 @@ static int remove_promisc_qp(struct mlx4_dev *dev, u8 vep_num, u8 port, back_to_list = true; goto out_list; } + kfree(pqp); + mgm = mailbox->buf; + mgm->gid[7] = pf_num << 4; + mgm->next_gid_index = cpu_to_be32(1 << 4); members_count = 0; list_for_each_entry(dqp, &s_steer->promisc_qps[steer], list) mgm->qp[members_count++] = cpu_to_be32(dqp->qpn & MGM_QPN_MASK); mgm->members_count = cpu_to_be32(members_count | MLX4_PROT_ETH << 30); - err = mlx4_WRITE_PROMISC(dev, vep_num, port, steer, mailbox); + err = mlx4_WRITE_PROMISC(dev,mailbox); if (err) goto out_mailbox; @@ -510,6 +498,11 @@ static int remove_promisc_qp(struct mlx4_dev *dev, u8 vep_num, u8 port, if ((be32_to_cpu(mgm->qp[i]) & MGM_QPN_MASK) == qpn) loc = i; + if (loc < 0) { + mlx4_warn(dev, "QPN 0x%x is not found in default entry\n", qpn); + goto out_mailbox; + } + mgm->members_count = cpu_to_be32(--members_count | (MLX4_PROT_ETH << 30)); mgm->qp[loc] = mgm->qp[i - 1]; @@ -527,10 +520,6 @@ out_mailbox: out_list: if (back_to_list) list_add_tail(&pqp->list, &s_steer->promisc_qps[steer]); - else - kfree(pqp); -out_mutex: - mutex_unlock(&priv->mcg_table.mutex); return err; } @@ -549,7 +538,7 @@ out_mutex: * If no AMGM exists for given gid, *index = -1, *prev = index of last * entry in hash chain and *mgm holds end of hash chain. */ -static int find_entry(struct mlx4_dev *dev, u8 port, +static int find_entry(struct mlx4_dev *dev, u8 *gid, enum mlx4_protocol prot, enum mlx4_steer_type steer, struct mlx4_cmd_mailbox *mgm_mailbox, @@ -559,8 +548,7 @@ static int find_entry(struct mlx4_dev *dev, u8 port, struct mlx4_mgm *mgm = mgm_mailbox->buf; u8 *mgid; int err; - u8 op_mod = (prot == MLX4_PROT_ETH) ? - !!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER) : 0; + u8 op_mod = (prot == MLX4_PROT_ETH) ? 
!!(dev->caps.vep_mc_steering) : 0; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) @@ -594,8 +582,8 @@ static int find_entry(struct mlx4_dev *dev, u8 port, } if (!memcmp(mgm->gid, gid, 16) && - be32_to_cpu(mgm->members_count) >> 30 == prot) - return err; + (prot == be32_to_cpu(mgm->members_count) >> 30)) + return err; *prev = *index; *index = be32_to_cpu(mgm->next_gid_index) >> 6; @@ -618,7 +606,7 @@ int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], int link = 0; int i; int err; - u8 port = gid[5]; + u8 pf_num = gid[7] >> 4; u8 new_entry = 0; mailbox = mlx4_alloc_cmd_mailbox(dev); @@ -627,8 +615,8 @@ int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], mgm = mailbox->buf; mutex_lock(&priv->mcg_table.mutex); - err = find_entry(dev, port, gid, prot, steer, - mailbox, &hash, &prev, &index); + + err = find_entry(dev, gid, prot, steer, mailbox, &hash, &prev, &index); if (err) goto out; @@ -653,7 +641,7 @@ int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], } members_count = be32_to_cpu(mgm->members_count) & 0xffffff; - if (members_count == MLX4_QP_PER_MGM) { + if (members_count == mlx4_get_qp_per_mgm(dev)) { mlx4_err(dev, "MGM at index %x is full.\n", index); err = -ENOMEM; goto out; @@ -666,18 +654,14 @@ int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], goto out; } - if (block_mcast_loopback) - mgm->qp[members_count++] = cpu_to_be32((qp->qpn & MGM_QPN_MASK) | - (1U << MGM_BLCK_LB_BIT)); - else - mgm->qp[members_count++] = cpu_to_be32(qp->qpn & MGM_QPN_MASK); + mgm->qp[members_count++] = cpu_to_be32((qp->qpn & MGM_QPN_MASK) | + (!!mlx4_blck_lb << MGM_BLCK_LB_BIT)); - mgm->members_count = cpu_to_be32(members_count | (u32) prot << 30); + mgm->members_count = cpu_to_be32(members_count | ((u32) prot << 30)); err = mlx4_WRITE_ENTRY(dev, index, mailbox); if (err) goto out; - if (!link) goto out; @@ -685,20 +669,20 @@ int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], if (err) goto out; - mgm->next_gid_index = cpu_to_be32(index << 6); + mgm->next_gid_index = cpu_to_be32((index << 6) | + (!!(dev->caps.vep_mc_steering) << 4)); err = mlx4_WRITE_ENTRY(dev, prev, mailbox); if (err) goto out; - out: if (prot == MLX4_PROT_ETH) { /* manage the steering entry for promisc mode */ if (new_entry) - new_steering_entry(dev, 0, port, steer, index, qp->qpn); + err = new_steering_entry(dev, pf_num, steer, index, qp->qpn); else - existing_steering_entry(dev, 0, port, steer, - index, qp->qpn); + err = existing_steering_entry(dev, pf_num, steer, index, qp->qpn); + /* TODO handle an error flow here, need to clean the MGMS */ } if (err && link && index != -1) { if (index < dev->caps.num_mgms) @@ -725,8 +709,7 @@ int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], int prev, index; int i, loc; int err; - u8 port = gid[5]; - bool removed_entry = false; + u8 pf_num = gid[7] >> 4; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) @@ -735,8 +718,7 @@ int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], mutex_lock(&priv->mcg_table.mutex); - err = find_entry(dev, port, gid, prot, steer, - mailbox, &hash, &prev, &index); + err = find_entry(dev, gid, prot, steer, mailbox, &hash, &prev, &index); if (err) goto out; @@ -748,7 +730,7 @@ int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], /* if this pq is also a promisc qp, it shouldn't be removed */ if (prot == MLX4_PROT_ETH && - 
check_duplicate_entry(dev, 0, port, steer, index, qp->qpn)) + check_duplicate_entry(dev, pf_num, steer, index, qp->qpn)) goto out; members_count = be32_to_cpu(mgm->members_count) & 0xffffff; @@ -763,20 +745,16 @@ int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], } - mgm->members_count = cpu_to_be32(--members_count | (u32) prot << 30); + mgm->members_count = cpu_to_be32(--members_count | ((u32) prot << 30)); mgm->qp[loc] = mgm->qp[i - 1]; mgm->qp[i - 1] = 0; - if (prot == MLX4_PROT_ETH) - removed_entry = can_remove_steering_entry(dev, 0, port, steer, index, qp->qpn); - if (i != 1 && (prot != MLX4_PROT_ETH || !removed_entry)) { + if (i != 1 && (prot != MLX4_PROT_ETH || + !can_remove_steering_entry(dev, pf_num, steer, index, qp->qpn))) { err = mlx4_WRITE_ENTRY(dev, index, mailbox); goto out; } - /* We are going to delete the entry, members count should be 0 */ - mgm->members_count = cpu_to_be32((u32) prot << 30); - if (prev == -1) { /* Remove entry from MGM */ int amgm_index = be32_to_cpu(mgm->next_gid_index) >> 6; @@ -801,12 +779,12 @@ int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], } } else { /* Remove entry from AMGM */ - int cur_next_index = be32_to_cpu(mgm->next_gid_index) >> 6; + int cur_next_index = be32_to_cpu(mgm->next_gid_index); err = mlx4_READ_ENTRY(dev, prev, mailbox); if (err) goto out; - mgm->next_gid_index = cpu_to_be32(cur_next_index << 6); + mgm->next_gid_index = cpu_to_be32(cur_next_index); err = mlx4_WRITE_ENTRY(dev, prev, mailbox); if (err) @@ -827,83 +805,136 @@ out: return err; } +static int mlx4_MCAST(struct mlx4_dev *dev, struct mlx4_qp *qp, + u8 gid[16], u8 attach, u8 block_loopback, + enum mlx4_protocol prot) +{ + struct mlx4_cmd_mailbox *mailbox; + int err; + int qpn; + + if (!mlx4_is_mfunc(dev)) + return -EBADF; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + memcpy(mailbox->buf, gid, 16); + qpn = qp->qpn; + qpn |= (prot << 28); + if (attach && block_loopback) + qpn |= (1 << 31); + + err = mlx4_cmd(dev, mailbox->dma, qpn, attach, MLX4_CMD_MCAST_ATTACH, + MLX4_CMD_TIME_CLASS_A, 0); + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], int block_mcast_loopback, enum mlx4_protocol prot) { - enum mlx4_steer_type steer; - - steer = (is_valid_ether_addr(&gid[10])) ? MLX4_UC_STEER : MLX4_MC_STEER; - - if (prot == MLX4_PROT_ETH && - !(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER)) + if (prot == MLX4_PROT_ETH && !dev->caps.vep_mc_steering) return 0; + if (mlx4_is_mfunc(dev)) + return mlx4_MCAST(dev, qp, gid, 1, block_mcast_loopback, prot); + if (prot == MLX4_PROT_ETH) - gid[7] |= (steer << 1); + gid[7] |= (dev->caps.function << 4 | MLX4_MC_STEER << 1); return mlx4_qp_attach_common(dev, qp, gid, block_mcast_loopback, prot, - steer); + MLX4_MC_STEER); } EXPORT_SYMBOL_GPL(mlx4_multicast_attach); int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], - enum mlx4_protocol prot) + enum mlx4_protocol prot) { - enum mlx4_steer_type steer; - - steer = (is_valid_ether_addr(&gid[10])) ? 
MLX4_UC_STEER : MLX4_MC_STEER; - - if (prot == MLX4_PROT_ETH && - !(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER)) + if (prot == MLX4_PROT_ETH && !dev->caps.vep_mc_steering) return 0; - if (prot == MLX4_PROT_ETH) { - gid[7] |= (steer << 1); - } + if (mlx4_is_mfunc(dev)) + return mlx4_MCAST(dev, qp, gid, 0, 0, prot); - return mlx4_qp_detach_common(dev, qp, gid, prot, steer); + if (prot == MLX4_PROT_ETH) + gid[7] |= (dev->caps.function << 4 | MLX4_MC_STEER << 1); + + return mlx4_qp_detach_common(dev, qp, gid, prot, MLX4_MC_STEER); } EXPORT_SYMBOL_GPL(mlx4_multicast_detach); +int mlx4_PROMISC_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + u8 pf_num = mlx4_priv(dev)->mfunc.master.slave_state[slave].pf_num; + u32 qpn = (u32) vhcr->in_param & 0xffffffff; + u8 port = vhcr->in_param >> 63; + enum mlx4_steer_type steer = vhcr->in_modifier; + if (vhcr->op_modifier) + return add_promisc_qp(dev, pf_num | port, steer, qpn); + else + return remove_promisc_qp(dev, pf_num | port, steer, qpn); +} + +static int mlx4_PROMISC(struct mlx4_dev *dev, u32 qpn, + enum mlx4_steer_type steer, u8 add, u8 port) +{ + return mlx4_cmd(dev, (u64) qpn | (u64) port << 63, (u32) steer, add, + MLX4_CMD_PROMISC, MLX4_CMD_TIME_CLASS_A, 0); +} int mlx4_multicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port) { - if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER)) + if (!dev->caps.vep_mc_steering) return 0; + if (mlx4_is_mfunc(dev)) + return mlx4_PROMISC(dev, qpn, MLX4_MC_STEER, 1, port); - return add_promisc_qp(dev, 0, port, MLX4_MC_STEER, qpn); + return add_promisc_qp(dev, dev->caps.function | port, MLX4_MC_STEER, qpn); } EXPORT_SYMBOL_GPL(mlx4_multicast_promisc_add); int mlx4_multicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port) { - if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER)) + if (!dev->caps.vep_mc_steering) return 0; + if (mlx4_is_mfunc(dev) && !mlx4_is_master(dev)) + return mlx4_PROMISC(dev, qpn, MLX4_MC_STEER, 0, port); - return remove_promisc_qp(dev, 0, port, MLX4_MC_STEER, qpn); + return remove_promisc_qp(dev, dev->caps.function | port, MLX4_MC_STEER, qpn); } EXPORT_SYMBOL_GPL(mlx4_multicast_promisc_remove); int mlx4_unicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port) { - if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER)) + if (!dev->caps.vep_mc_steering) return 0; + if (mlx4_is_mfunc(dev) && !mlx4_is_master(dev)) + return mlx4_PROMISC(dev, qpn, MLX4_UC_STEER, 1, port); - return add_promisc_qp(dev, 0, port, MLX4_UC_STEER, qpn); + return add_promisc_qp(dev, dev->caps.function | port, MLX4_UC_STEER, qpn); } EXPORT_SYMBOL_GPL(mlx4_unicast_promisc_add); int mlx4_unicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port) { - if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_MC_STEER)) + if (!dev->caps.vep_mc_steering) return 0; - return remove_promisc_qp(dev, 0, port, MLX4_UC_STEER, qpn); + if (mlx4_is_mfunc(dev) && !mlx4_is_master(dev)) + return mlx4_PROMISC(dev, qpn, MLX4_UC_STEER, 0, port); + + return remove_promisc_qp(dev, dev->caps.function | port, MLX4_UC_STEER, qpn); } EXPORT_SYMBOL_GPL(mlx4_unicast_promisc_remove); @@ -912,6 +943,10 @@ int mlx4_init_mcg_table(struct mlx4_dev *dev) struct mlx4_priv *priv = mlx4_priv(dev); int err; + /* Nothing to do for slaves - mcg handling is para-virtualized */ + if (mlx4_is_mfunc(dev) && !mlx4_is_master(dev)) + return 0; + err = mlx4_bitmap_init(&priv->mcg_table.bitmap, dev->caps.num_amgms, dev->caps.num_amgms 
- 1, 0, 0); if (err) @@ -924,5 +959,7 @@ int mlx4_init_mcg_table(struct mlx4_dev *dev) void mlx4_cleanup_mcg_table(struct mlx4_dev *dev) { + if (mlx4_is_mfunc(dev) && !mlx4_is_master(dev)) + return; mlx4_bitmap_cleanup(&mlx4_priv(dev)->mcg_table.bitmap); } diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index dd7d745fbab49..6e7432cdbc8bc 100644 --- a/drivers/net/mlx4/mlx4.h +++ b/drivers/net/mlx4/mlx4.h @@ -40,31 +40,38 @@ #include #include #include -#include #include +#include #include #include #include +#include +#include #define DRV_NAME "mlx4_core" -#define DRV_VERSION "0.01" -#define DRV_RELDATE "May 1, 2007" +#define PFX DRV_NAME ": " +#define DRV_VERSION "1.0-ofed1.5.5" +#define DRV_RELDATE "April 4, 2008" enum { MLX4_HCR_BASE = 0x80680, + MLX4_HCR_SRIOV_BASE = 0x4080680, /* good for SRIOV FW only */ MLX4_HCR_SIZE = 0x0001c, - MLX4_CLR_INT_SIZE = 0x00008 + MLX4_CLR_INT_SIZE = 0x00008, + MLX4_SLAVE_COMM_BASE = 0x0, + MLX4_COMM_PAGESIZE = 0x1000 }; enum { - MLX4_MGM_ENTRY_SIZE = 0x100, + MLX4_MGM_ENTRY_SIZE = 0x200, MLX4_QP_PER_MGM = 4 * (MLX4_MGM_ENTRY_SIZE / 16 - 2), - MLX4_MTT_ENTRY_PER_SEG = 8 + MLX4_MTT_ENTRY_PER_SEG = 8, + MLX4_MAX_MGM_ENTRY_SIZE = 0x1000, }; enum { - MLX4_NUM_PDS = 1 << 15 + MLX4_NUM_PDS = 1 << 15, }; enum { @@ -80,24 +87,137 @@ enum { MLX4_NUM_CMPTS = MLX4_CMPT_NUM_TYPE << MLX4_CMPT_SHIFT }; +#define MLX4_COMM_TIME 10000 +enum { + /* + * The first entry is a dummy command. It has been + * added to prevent buggy commands from generating a reset. + * This is mainly for debugging. Once all is clean, + * we can remove it. + */ + MLX4_COMM_CMD_DUMMY, + MLX4_COMM_CMD_RESET, + MLX4_COMM_CMD_VHCR0, + MLX4_COMM_CMD_VHCR1, + MLX4_COMM_CMD_VHCR2, + MLX4_COMM_CMD_VHCR_EN, + MLX4_COMM_CMD_VHCR_POST, + MLX4_COMM_CMD_FLR +}; + +/* The flag indicates that the slave should delay the RESET cmd */ +#define MLX4_DELAY_RESET_SLAVE 0xbbbbbbb +/* Indicates how many retries will be done if we are in the middle of FLR */ +#define NUM_OF_RESET_RETRIES 10 +#define SLEEP_TIME_IN_RESET 2 * 1000 +enum mlx4_resource { + RES_QP, + RES_CQ, + RES_SRQ, + RES_MPT, + RES_MTT, + RES_MAC, + RES_EQ, + RES_COUNTER, + RES_XRCDN, + MLX4_NUM_OF_RESOURCE_TYPE +}; + +enum mlx4_alloc_mode { + ICM_RESERVE_AND_ALLOC, + ICM_RESERVE, + ICM_ALLOC, + ICM_MAC_VLAN, + + /* eli added */ + RES_OP_RESERVE, + RES_OP_RESERVE_AND_MAP, + RES_OP_MAP_ICM, +}; + + +struct mlx4_vhcr { + u64 in_param; + u64 out_param; + u32 in_modifier; + u32 timeout; + u32 errno; + u16 op; + u16 token; + u8 op_modifier; + u32 cookie; +}; + +struct mlx4_cmd_info { + u16 opcode; + bool has_inbox; + bool has_outbox; + bool out_is_imm; + bool encode_slave_id; + int (*verify)(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox); + int (*wrapper)(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info * cmd); +}; + #ifdef CONFIG_MLX4_DEBUG extern int mlx4_debug_level; #else /* CONFIG_MLX4_DEBUG */ #define mlx4_debug_level (0) #endif /* CONFIG_MLX4_DEBUG */ +#define mlx4_printk(level, format, arg...) \ + do { \ + printk(level "%s-%d: slave %d, " format, __func__, __LINE__ , slave, ## arg); \ + } while(0) + #define mlx4_dbg(mdev, format, arg...) \ -do { \ - if (mlx4_debug_level) \ - dev_printk(KERN_DEBUG, &mdev->pdev->dev, format, ##arg); \ + do { \ + if (mlx4_debug_level & 1) \ + dev_printk(KERN_DEBUG, &mdev->pdev->dev, format, ## arg); \ + } while (0) + +#define mlx4_sdbg(format, arg...)
\ +do { \ + if (!(mlx4_debug_level & 2)) \ + break; \ + mlx4_printk(KERN_DEBUG, format, ## arg); \ +} while (0) + +#define mlx4_swarn(format, arg...) \ +do { \ + if (!(mlx4_debug_level & 2)) \ + break; \ + mlx4_printk(KERN_WARNING, format, ## arg); \ } while (0) +#ifdef CONFIG_MLX4_RTT_TESTS +#define SASSERT(cond) do \ + if (!(cond)) { \ + printk(KERN_ERR "%s-%d: *** DRIVER BUG ***\n", __func__, __LINE__); \ + dump_stack(); \ + } while(0) +#else +#define SASSERT(cond) do {} while (0) +#endif + #define mlx4_err(mdev, format, arg...) \ - dev_err(&mdev->pdev->dev, format, ##arg) + dev_err(&mdev->pdev->dev, format, ## arg) #define mlx4_info(mdev, format, arg...) \ - dev_info(&mdev->pdev->dev, format, ##arg) + dev_info(&mdev->pdev->dev, format, ## arg) #define mlx4_warn(mdev, format, arg...) \ - dev_warn(&mdev->pdev->dev, format, ##arg) + dev_warn(&mdev->pdev->dev, format, ## arg) + +extern int mlx4_blck_lb; +extern int mlx4_log_num_mgm_entry_size; + +#define MLX4_MAX_NUM_SLAVES (MLX4_MAX_NUM_PF + MLX4_MAX_NUM_VF) + +#define MLX4_VF (1 << 0) +#define MLX4_VDEVICE(vendor, device, flags) \ + PCI_VDEVICE(vendor, device), (flags) struct mlx4_bitmap { u32 last; @@ -105,7 +225,6 @@ struct mlx4_bitmap { u32 max; u32 reserved_top; u32 mask; - u32 avail; spinlock_t lock; unsigned long *table; }; @@ -130,6 +249,68 @@ struct mlx4_icm_table { struct mlx4_icm **icm; }; + +struct mlx4_eqe { + u8 reserved1; + u8 type; + u8 reserved2; + u8 subtype; + union { + u32 raw[6]; + struct { + __be32 cqn; + } __attribute__((packed)) comp; + struct { + u16 reserved1; + __be16 token; + u32 reserved2; + u8 reserved3[3]; + u8 status; + __be64 out_param; + } __attribute__((packed)) cmd; + struct { + __be32 qpn; + } __attribute__((packed)) qp; + struct { + __be32 srqn; + } __attribute__((packed)) srq; + struct { + __be32 cqn; + u32 reserved1; + u8 reserved2[3]; + u8 syndrome; + } __attribute__((packed)) cq_err; + struct { + u32 reserved1[2]; + __be32 port; + } __attribute__((packed)) port_change; + struct { + #define COMM_CHANNEL_BIT_ARRAY_SIZE 4 + u32 reserved; + u32 bit_vec[COMM_CHANNEL_BIT_ARRAY_SIZE]; + } __attribute__((packed)) comm_channel_arm; + struct { + u8 reserved[3]; + u8 vep_num; + } __attribute__((packed)) vep_config; + struct { + u8 port; + u8 reserved[3]; + __be64 mac; + } __attribute__((packed)) mac_update; + struct { + u8 port; + } __attribute__((packed)) sw_event; + struct { + __be32 slave_id; + } __attribute__((packed)) flr_event; + } event; +#define ALL_SLAVES 0xff + u8 slave_id; + u8 reserved3[2]; + u8 owner; +} __attribute__((packed)); + struct mlx4_eq { struct mlx4_dev *dev; void __iomem *doorbell; @@ -138,10 +319,24 @@ struct mlx4_eq { u16 irq; u16 have_irq; int nent; + int load; struct mlx4_buf_list *page_list; struct mlx4_mtt mtt; }; +struct mlx4_slave_eqe { + u8 type; + u8 port; + u32 param; +}; + +struct mlx4_slave_event_eq_info { + u32 eqn; + bool use_int; + u16 token; + u64 event_type; +}; + struct mlx4_profile { int num_qp; int rdmarc_per_qp; @@ -155,16 +350,301 @@ struct mlx4_profile { struct mlx4_fw { u64 clr_int_base; u64 catas_offset; + u64 comm_base; struct mlx4_icm *fw_icm; struct mlx4_icm *aux_icm; u32 catas_size; u16 fw_pages; u8 clr_int_bar; u8 catas_bar; + u8 comm_bar; +}; + +struct mlx4_comm { + u32 slave_write; + u32 slave_read; +}; + +#define VLAN_FLTR_SIZE 128 + +struct mlx4_vlan_fltr { + __be32 entry[VLAN_FLTR_SIZE]; +}; + +#define GID_SIZE 16 + +enum mlx4_resource_state { + RES_INIT = 0, + RES_RESERVED = 1, + RES_ALLOCATED = 2, + RES_ALLOCATED_AFTER_RESERVATION = 3, +/*When 
registered mac the master reserved that qp, but the allocation should be in the slave*/ + RES_ALLOCATED_WITH_MASTER_RESERVATION = 4 +}; + +struct mlx_tracked_qp_mcg { + u8 gid[GID_SIZE]; + enum mlx4_protocol prot; + struct list_head list; +}; + +struct mlx_tracked_vln_fltr { + int port; + struct mlx4_vlan_fltr vlan_fltr; +}; + +struct qp_specific_data { + struct list_head mcg_list; + int state; +}; + +struct mtt_specific_data { + int order; +}; + +struct en_specifica_data { + u8 port; +}; + +struct mlx4_tracked_resource { + int slave_id; + int res_type; + int resource_id; + /* state indicates the allocation stage, + importance where there is reservation and after that allocation + */ + unsigned long state; + union { + struct qp_specific_data qp; + struct mtt_specific_data mtt; + struct en_specifica_data en; + } specific_data; + struct list_head list; }; -#define MGM_QPN_MASK 0x00FFFFFF -#define MGM_BLCK_LB_BIT 30 +struct res_common { + struct list_head list; + u32 res_id; + int owner; + int state; + int from_state; + int to_state; + int removing; +}; + +enum { + RES_ANY_BUSY = 1 +}; + +struct res_gid { + struct list_head list; + u8 gid[16]; + enum mlx4_protocol prot; +}; + +enum res_qp_states { + RES_QP_BUSY = RES_ANY_BUSY, + + /* QP number was allocated */ + RES_QP_RESERVED, + + /* ICM memory for QP context was mapped */ + RES_QP_MAPPED, + + /* QP is in hw ownership */ + RES_QP_HW +}; + +static inline const char *qp_states_str(enum res_qp_states state) +{ + switch (state) { + case RES_QP_BUSY: return "RES_QP_BUSY"; + case RES_QP_RESERVED: return "RES_QP_RESERVED"; + case RES_QP_MAPPED: return "RES_QP_MAPPED"; + case RES_QP_HW: return "RES_QP_HW"; + default: return "Unknown"; + } +} + +struct res_qp { + struct res_common com; + struct res_mtt *mtt; + struct res_cq *rcq; + struct res_cq *scq; + struct res_srq *srq; + struct list_head mcg_list; + spinlock_t mcg_spl; + int local_qpn; +}; + +enum res_mtt_states { + RES_MTT_BUSY = RES_ANY_BUSY, + RES_MTT_RESERVED, + RES_MTT_ALLOCATED, +}; + +static inline const char *mtt_states_str(enum res_mtt_states state) +{ + switch (state) { + case RES_MTT_BUSY: return "RES_MTT_BUSY"; + case RES_MTT_RESERVED: return "RES_MTT_RESERVED"; + case RES_MTT_ALLOCATED: return "RES_MTT_ALLOCATED"; + default: return "Unknown"; + } +} + +struct res_mtt { + struct res_common com; + int order; + atomic_t ref_count; +}; + +enum res_mpt_states { + RES_MPT_BUSY = RES_ANY_BUSY, + RES_MPT_RESERVED, + RES_MPT_MAPPED, + RES_MPT_HW, +}; + +static inline const char *mr_states_str(enum res_mtt_states state) +{ + switch (state) { + case RES_MPT_BUSY: return "RES_MPT_BUSY"; + case RES_MPT_RESERVED: return "RES_MPT_RESERVED"; + case RES_MPT_MAPPED: return "RES_MPT_MAPPED"; + case RES_MPT_HW: return "RES_MPT_HW"; + default: return "Unknown"; + } +} + +struct res_mpt { + struct res_common com; + struct res_mtt *mtt; + int key; + enum mlx4_mr_flags flags; +}; + +enum res_eq_states { + RES_EQ_BUSY = RES_ANY_BUSY, + RES_EQ_RESERVED, + RES_EQ_HW, +}; + +static inline const char *eq_states_str(enum res_mtt_states state) +{ + switch (state) { + case RES_EQ_BUSY: return "RES_EQ_BUSY"; + case RES_EQ_RESERVED: return "RES_EQ_RESERVED"; + case RES_EQ_HW: return "RES_EQ_HW"; + default: return "Unknown"; + } +} + +struct res_eq { + struct res_common com; + struct res_mtt *mtt; +}; + +enum res_cq_states { + RES_CQ_BUSY = RES_ANY_BUSY, + RES_CQ_ALLOCATED, + RES_CQ_HW, +}; + +static inline const char *cq_states_str(enum res_cq_states state) +{ + switch (state) { + case RES_CQ_BUSY: return 
"RES_CQ_BUSY"; + case RES_CQ_ALLOCATED: return "RES_CQ_ALLOCATED"; + case RES_CQ_HW: return "RES_CQ_HW"; + default: return "Unknown"; + } +} + +struct res_cq { + struct res_common com; + struct res_mtt *mtt; + atomic_t ref_count; +}; + +enum res_srq_states { + RES_SRQ_BUSY = RES_ANY_BUSY, + RES_SRQ_ALLOCATED, + RES_SRQ_HW, +}; + +static inline const char *srq_states_str(enum res_srq_states state) +{ + switch (state) { + case RES_SRQ_BUSY: return "RES_SRQ_BUSY"; + case RES_SRQ_ALLOCATED: return "RES_SRQ_ALLOCATED"; + case RES_SRQ_HW: return "RES_SRQ_HW"; + default: return "Unknown"; + } +} + +struct res_srq { + struct res_common com; + struct res_mtt *mtt; + struct res_cq *cq; + atomic_t ref_count; +}; + +enum res_counter_states { + RES_COUNTER_BUSY = RES_ANY_BUSY, + RES_COUNTER_ALLOCATED, +}; + +static inline const char *counter_states_str(enum res_counter_states state) +{ + switch (state) { + case RES_COUNTER_BUSY: return "RES_COUNTER_BUSY"; + case RES_COUNTER_ALLOCATED: return "RES_COUNTER_ALLOCATED"; + default: return "Unknown"; + } +} + +struct res_counter { + struct res_common com; + int port; +}; + +enum res_xrcdn_states { + RES_XRCDN_BUSY = RES_ANY_BUSY, + RES_XRCDN_ALLOCATED, +}; + +static inline const char *xrcdn_states_str(enum res_xrcdn_states state) +{ + switch (state) { + case RES_XRCDN_BUSY: return "RES_XRCDN_BUSY"; + case RES_XRCDN_ALLOCATED: return "RES_XRCDN_ALLOCATED"; + default: return "Unknown"; + } +} + +struct res_xrcdn { + struct res_common com; + int port; +}; + +struct slave_list { + struct mutex mutex; + struct list_head res_list[MLX4_NUM_OF_RESOURCE_TYPE]; +}; + +struct mlx4_resource_tracker { + spinlock_t lock; + /* tree for each resources */ + struct radix_tree_root res_tree[MLX4_NUM_OF_RESOURCE_TYPE]; + /* num_of_slave's lists, one per slave */ + struct slave_list *slave_list; +}; + +struct mlx4_mcast_entry { + struct list_head list; + u64 addr; +}; struct mlx4_promisc_qp { struct list_head list; @@ -177,19 +657,102 @@ struct mlx4_steer_index { struct list_head duplicates; }; -struct mlx4_mgm { - __be32 next_gid_index; - __be32 members_count; - u32 reserved[2]; - u8 gid[16]; - __be32 qp[MLX4_QP_PER_MGM]; +struct mlx4_vep_cfg { + u64 mac; + u8 link; +}; + +struct mlx4_slave_state { + u8 comm_toggle; + u8 last_cmd; + u8 init_port_mask; + u8 pf_num; + u8 vep_num; + bool active; + u8 function; + dma_addr_t vhcr_dma; + u16 mtu[MLX4_MAX_PORTS + 1]; + __be32 ib_cap_mask[MLX4_MAX_PORTS + 1]; + struct mlx4_slave_eqe eq[MLX4_MFUNC_MAX_EQES]; + struct list_head mcast_filters[MLX4_MAX_PORTS + 1]; + struct mlx4_vlan_fltr *vlan_filter[MLX4_MAX_PORTS + 1]; + struct mlx4_slave_event_eq_info event_eq; + struct mlx4_vep_cfg vep_cfg; + u16 eq_pi; + u16 eq_ci; + spinlock_t lock; + /*initialized via the kzalloc*/ + u8 is_slave_going_down; + u32 cookie; + /*save the slave port state*/ + enum slave_port_state port_state[MLX4_MAX_PORTS + 1]; +}; + +#define SLAVE_EVENT_EQ_SIZE 128 +struct mlx4_slave_event_eq { + u32 eqn; + u32 cons; + u32 prod; + spinlock_t event_lock; + struct mlx4_eqe event_eqe[SLAVE_EVENT_EQ_SIZE]; +}; + +struct mlx4_master_qp0_state { + int proxy_qp0_active; + int qp0_active; + int port_active; +}; + +struct mlx4_slave_fmr_ctx { + void *vf_ctx; + /* keeps track of vpm_ctx using va as key */ + struct radix_tree_root vpm_ctx_tree; + spinlock_t vpm_ctx_tree_lock; +}; + +struct mlx4_fmr_vpm_ctx { + u64 va; + void *ctx; +}; + +struct mlx4_mfunc_master_ctx { + struct mlx4_slave_state *slave_state; + struct mlx4_master_qp0_state qp0_state[MLX4_MAX_PORTS + 1]; + int 
init_port_ref[MLX4_MAX_PORTS + 1]; + u16 max_mtu[MLX4_MAX_PORTS + 1]; + int disable_mcast_ref[MLX4_MAX_PORTS + 1]; + struct mlx4_resource_tracker res_tracker; + struct workqueue_struct *comm_wq; + struct work_struct comm_work; + struct work_struct slave_event_work; + struct work_struct vep_config_work; + struct work_struct slave_flr_event_work; + u16 vep_config_bitmap; + spinlock_t vep_config_lock; + spinlock_t slave_state_lock; + u32 comm_arm_bit_vector[4]; + struct mlx4_eqe cmd_eqe; + struct mlx4_slave_event_eq slave_eq; + struct mutex gen_eqe_mutex[MLX4_MFUNC_MAX]; + struct mlx4_slave_fmr_ctx slave_fmr_ctx[MLX4_MFUNC_MAX]; + void *fmr_ctx; +}; + +struct mlx4_mfunc { + struct mlx4_comm __iomem *comm; + struct mlx4_vhcr *vhcr; + dma_addr_t vhcr_dma; + + struct mlx4_mfunc_master_ctx master; }; + struct mlx4_cmd { struct pci_pool *pool; void __iomem *hcr; struct mutex hcr_mutex; struct semaphore poll_sem; struct semaphore event_sem; + struct semaphore slave_sem; int max_cmds; spinlock_t context_lock; int free_head; @@ -197,6 +760,7 @@ struct mlx4_cmd { u16 token_mask; u8 use_events; u8 toggle; + u8 comm_toggle; }; struct mlx4_uar_table { @@ -210,6 +774,12 @@ struct mlx4_mr_table { u64 mpt_base; struct mlx4_icm_table mtt_table; struct mlx4_icm_table dmpt_table; + struct { + struct mlx4_buddy mtt_buddy; + struct mlx4_bitmap mpt_bitmap; + struct mlx4_icm_table mtt_table; + struct mlx4_icm_table dmpt_table; + } fmr; }; struct mlx4_cq_table { @@ -236,7 +806,6 @@ struct mlx4_eq_table { struct mlx4_srq_table { struct mlx4_bitmap bitmap; spinlock_t lock; - struct radix_tree_root tree; struct mlx4_icm_table table; struct mlx4_icm_table cmpt_table; }; @@ -270,7 +839,6 @@ struct mlx4_catas_err { struct mlx4_mac_table { __be64 entries[MLX4_MAX_MAC_NUM]; - int refs[MLX4_MAX_MAC_NUM]; struct mutex mutex; int total; int max; @@ -287,6 +855,49 @@ struct mlx4_vlan_table { int max; }; + +#define SET_PORT_GEN_ALL_VALID 0x7 +#define SET_PORT_PROMISC_SHIFT 31 +#define SET_PORT_MC_PROMISC_SHIFT 30 + +enum { + MCAST_DIRECT_ONLY = 0, + MCAST_DIRECT = 1, + MCAST_DEFAULT = 2 +}; + + +struct mlx4_set_port_general_context { + u8 reserved[3]; + u8 flags; + u16 reserved2; + __be16 mtu; + u8 pptx; + u8 pfctx; + u16 reserved3; + u8 pprx; + u8 pfcrx; + u16 reserved4; +}; + +struct mlx4_set_port_rqp_calc_context { + __be32 base_qpn; + u8 rererved; + u8 n_mac; + u8 n_vlan; + u8 n_prio; + u8 reserved2[3]; + u8 mac_miss; + u8 intra_no_vlan; + u8 no_vlan; + u8 intra_vlan_miss; + u8 vlan_miss; + u8 reserved3[3]; + u8 no_vlan_prio; + __be32 promisc; + __be32 mcast; +}; + struct mlx4_mac_entry { u64 mac; }; @@ -308,17 +919,15 @@ struct mlx4_sense { u8 do_sense_port[MLX4_MAX_PORTS + 1]; u8 sense_allowed[MLX4_MAX_PORTS + 1]; struct delayed_work sense_poll; + struct workqueue_struct *sense_wq; + u32 resched; }; -struct mlx4_msix_ctl { - u64 pool_bm; - spinlock_t pool_lock; -}; +extern struct mutex drv_mutex; struct mlx4_steer { struct list_head promisc_qps[MLX4_NUM_STEERS]; struct list_head steer_entries[MLX4_NUM_STEERS]; - struct list_head high_prios; }; struct mlx4_priv { @@ -333,8 +942,10 @@ struct mlx4_priv { struct mlx4_fw fw; struct mlx4_cmd cmd; + struct mlx4_mfunc mfunc; struct mlx4_bitmap pd_bitmap; + struct mlx4_bitmap xrcd_bitmap; struct mlx4_uar_table uar_table; struct mlx4_mr_table mr_table; struct mlx4_cq_table cq_table; @@ -342,6 +953,9 @@ struct mlx4_priv { struct mlx4_srq_table srq_table; struct mlx4_qp_table qp_table; struct mlx4_mcg_table mcg_table; + struct mlx4_bitmap counters_bitmap; + struct list_head bf_list; 
+ struct mutex bf_mutex; struct mlx4_catas_err catas_err; @@ -350,13 +964,19 @@ struct mlx4_priv { struct mlx4_uar driver_uar; void __iomem *kar; struct mlx4_port_info port[MLX4_MAX_PORTS + 1]; + struct device_attribute trigger_attr; + int trig; + int changed_ports; struct mlx4_sense sense; struct mutex port_mutex; - struct mlx4_msix_ctl msix_ctl; struct mlx4_steer *steer; - struct list_head bf_list; - struct mutex bf_mutex; - struct io_mapping *bf_mapping; + bool link_up[MLX4_MAX_PORTS + 1]; + bool vep_mode[MLX4_MAX_PORTS + 1]; + u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; + int reserved_mtts; + struct io_mapping *bf_mapping; + struct device_attribute test_attr; + void *fmr_ctx; }; static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev) @@ -364,6 +984,12 @@ static inline struct mlx4_priv *mlx4_priv(struct mlx4_dev *dev) return container_of(dev, struct mlx4_priv, dev); } +static inline int mlx4_master_get_num_eqs(struct mlx4_dev *dev) +{ + return (dev->caps.reserved_eqs + + MLX4_MFUNC_EQ_NUM * (dev->num_slaves + 1)); +} + #define MLX4_SENSE_RANGE (HZ * 3) extern struct workqueue_struct *mlx4_wq; @@ -372,17 +998,26 @@ u32 mlx4_bitmap_alloc(struct mlx4_bitmap *bitmap); void mlx4_bitmap_free(struct mlx4_bitmap *bitmap, u32 obj); u32 mlx4_bitmap_alloc_range(struct mlx4_bitmap *bitmap, int cnt, int align); void mlx4_bitmap_free_range(struct mlx4_bitmap *bitmap, u32 obj, int cnt); -u32 mlx4_bitmap_avail(struct mlx4_bitmap *bitmap); int mlx4_bitmap_init(struct mlx4_bitmap *bitmap, u32 num, u32 mask, u32 reserved_bot, u32 resetrved_top); +int mlx4_bitmap_init_no_mask(struct mlx4_bitmap *bitmap, u32 num, + u32 reserved_bot, u32 reserved_top); void mlx4_bitmap_cleanup(struct mlx4_bitmap *bitmap); int mlx4_reset(struct mlx4_dev *dev); +int mlx4_get_ownership(struct mlx4_dev *dev); +void mlx4_free_ownership(struct mlx4_dev *dev); int mlx4_alloc_eq_table(struct mlx4_dev *dev); void mlx4_free_eq_table(struct mlx4_dev *dev); +int mlx4_GET_EVENT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); int mlx4_init_pd_table(struct mlx4_dev *dev); +int mlx4_init_xrcd_table(struct mlx4_dev *dev); int mlx4_init_uar_table(struct mlx4_dev *dev); int mlx4_init_mr_table(struct mlx4_dev *dev); int mlx4_init_eq_table(struct mlx4_dev *dev); @@ -399,6 +1034,87 @@ void mlx4_cleanup_cq_table(struct mlx4_dev *dev); void mlx4_cleanup_qp_table(struct mlx4_dev *dev); void mlx4_cleanup_srq_table(struct mlx4_dev *dev); void mlx4_cleanup_mcg_table(struct mlx4_dev *dev); +void mlx4_cleanup_xrcd_table(struct mlx4_dev *dev); + +int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn); +void __mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn); +void mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn); +int __mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn); +int mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn); +void __mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn); +void mlx4_cq_free_icm(struct mlx4_dev *dev, int cqn); +int __mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn); +int mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn); +void __mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn); +void mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn); +int __mlx4_mr_reserve(struct mlx4_dev *dev); +int mlx4_mr_reserve(struct mlx4_dev *dev, enum mlx4_mr_flags flags); +void __mlx4_mr_release(struct mlx4_dev *dev, u32 index); +void mlx4_mr_release(struct mlx4_dev *dev, u32 index, + enum 
mlx4_mr_flags flags); +int __mlx4_mr_alloc_icm(struct mlx4_dev *dev, u32 index, + enum mlx4_mr_flags flags); +void __mlx4_mr_free_icm(struct mlx4_dev *dev, u32 index, + enum mlx4_mr_flags flags); +void mlx4_mr_free_icm(struct mlx4_dev *dev, u32 index, + enum mlx4_mr_flags flags); +u32 __mlx4_reserve_mtt_range(struct mlx4_dev *dev, int order); +u32 __mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order, + enum mlx4_mr_flags flags); +u32 mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order, + enum mlx4_mr_flags flags); +void __mlx4_free_mtt_reserved_range(struct mlx4_dev *dev, u32 first_seg, + int order); +void __mlx4_free_mtt_range(struct mlx4_dev *dev, u32 first_seg, int order, + enum mlx4_mr_flags flags); +void mlx4_free_mtt_range(struct mlx4_dev *dev, u32 first_seg, int order, + enum mlx4_mr_flags flags); +int mlx4_WRITE_MTT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SYNC_TPT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SW2HW_MPT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_HW2SW_MPT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_MPT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_RST2INIT_QP_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SW2HW_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_DMA_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base); +void __mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt); +int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *qpn, u8 wrap); +void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int qpn); + +void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int qpn); +int __mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac); +int __mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + int start_index, int npages, u64 *page_list); +int __mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn); +void __mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn); void mlx4_start_catas_poll(struct mlx4_dev *dev); void mlx4_stop_catas_poll(struct mlx4_dev *dev); @@ -406,7 +1122,8 @@ void mlx4_catas_init(void); int mlx4_restart_one(struct pci_dev *pdev); int mlx4_register_device(struct mlx4_dev *dev); void mlx4_unregister_device(struct mlx4_dev *dev); -void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type, int port); +void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type, unsigned long param); +void *mlx4_find_get_prot_dev(struct mlx4_dev *dev, enum mlx4_prot proto, int port); struct 
mlx4_dev_cap; struct mlx4_init_hca_param; @@ -415,13 +1132,163 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, struct mlx4_profile *request, struct mlx4_dev_cap *dev_cap, struct mlx4_init_hca_param *init_hca); +void mlx4_master_comm_channel(struct work_struct *work); +void mlx4_gen_slave_eqe(struct work_struct *work); +void mlx4_update_vep_config(struct work_struct *work); +void mlx4_master_handle_slave_flr(struct work_struct *work); + +int mlx4_ALLOC_RES_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_FREE_RES_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_MAP_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_COMM_INT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_RTR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_HW2SW_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SW2HW_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_HW2SW_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_MODIFY_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SW2HW_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_HW2SW_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_ARM_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_INIT2RTR_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_INIT2INIT_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox 
*outbox, + struct mlx4_cmd_info *cmd); +int mlx4_RTS2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SQERR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_2ERR_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_RTS2SQD_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SQD2SQD_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SQD2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_2RST_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_INIT2INIT_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SUSPEND_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_UNSUSPEND_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); + +int mlx4_GEN_EQE(struct mlx4_dev *dev, int slave, struct mlx4_eqe *eqe); int mlx4_cmd_init(struct mlx4_dev *dev); void mlx4_cmd_cleanup(struct mlx4_dev *dev); +int mlx4_multi_func_init(struct mlx4_dev *dev); +void mlx4_multi_func_cleanup(struct mlx4_dev *dev); void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param); int mlx4_cmd_use_events(struct mlx4_dev *dev); void mlx4_cmd_use_polling(struct mlx4_dev *dev); +int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param, unsigned long timeout); + + void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn); void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type); @@ -431,28 +1298,200 @@ void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type); void mlx4_handle_catas_err(struct mlx4_dev *dev); -int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port, - enum mlx4_port_type *type); void mlx4_do_sense_ports(struct mlx4_dev *dev, enum mlx4_port_type *stype, enum mlx4_port_type *defaults); void mlx4_start_sense(struct mlx4_dev *dev); void mlx4_stop_sense(struct mlx4_dev *dev); -void mlx4_sense_init(struct mlx4_dev *dev); +int mlx4_sense_init(struct mlx4_dev *dev); +void mlx4_sense_cleanup(struct mlx4_dev *dev); int mlx4_check_port_params(struct mlx4_dev *dev, enum mlx4_port_type *port_type); int mlx4_change_port_types(struct mlx4_dev *dev, enum mlx4_port_type *port_types); +void mlx4_set_port_mask(struct mlx4_dev *dev, struct mlx4_caps *caps, int function); 
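As an aside on the block of *_wrapper prototypes above: they all share the wrapper callback signature of struct mlx4_cmd_info declared earlier in this header, which suggests a table-driven dispatch on the master, where a command arriving from a slave is looked up by opcode, optionally passed through the verify hook, and then handled by the wrapper on the slave's behalf. The stand-alone C sketch below only illustrates that pattern; the stub types, the opcode value and the dispatch helper are assumptions for illustration and are not taken from this patch.

/* Hypothetical sketch of table-driven command dispatch; stub types stand in
 * for the real mlx4 structures, and the opcode value is made up. */
#include <stdio.h>
#include <stddef.h>

struct dev;				/* stands in for struct mlx4_dev         */
struct mailbox;				/* stands in for struct mlx4_cmd_mailbox */
struct vhcr { unsigned short op; };	/* stands in for struct mlx4_vhcr        */

struct cmd_info {			/* mirrors the shape of mlx4_cmd_info    */
	unsigned short opcode;
	int (*verify)(struct dev *dev, int slave, struct vhcr *vhcr,
		      struct mailbox *inbox);
	int (*wrapper)(struct dev *dev, int slave, struct vhcr *vhcr,
		       struct mailbox *inbox, struct mailbox *outbox,
		       const struct cmd_info *cmd);
};

static int demo_wrapper(struct dev *dev, int slave, struct vhcr *vhcr,
			struct mailbox *inbox, struct mailbox *outbox,
			const struct cmd_info *cmd)
{
	(void)dev; (void)inbox; (void)outbox; (void)cmd;
	printf("slave %d: handling opcode 0x%x\n", slave, vhcr->op);
	return 0;
}

#define DEMO_CMD_QUERY_PORT 0x43	/* illustrative opcode only */

static const struct cmd_info cmd_table[] = {
	{ .opcode = DEMO_CMD_QUERY_PORT, .wrapper = demo_wrapper },
};

/* Look up the opcode, run verify (if present), then call the wrapper. */
static int dispatch(struct dev *dev, int slave, struct vhcr *vhcr,
		    struct mailbox *inbox, struct mailbox *outbox)
{
	size_t i;

	for (i = 0; i < sizeof(cmd_table) / sizeof(cmd_table[0]); i++) {
		const struct cmd_info *cmd = &cmd_table[i];

		if (cmd->opcode != vhcr->op)
			continue;
		if (cmd->verify && cmd->verify(dev, slave, vhcr, inbox))
			return -1;
		return cmd->wrapper(dev, slave, vhcr, inbox, outbox, cmd);
	}
	return -1;			/* unknown command */
}

int main(void)
{
	struct vhcr vhcr = { .op = DEMO_CMD_QUERY_PORT };

	return dispatch(NULL, 3, &vhcr, NULL, NULL);
}

The same shape extends to the full command set by growing the table, which is presumably how the wrapper functions declared in this header are wired up elsewhere in the patch. A second, related aside: many of these paravirtualized commands pass two 32-bit values through the single 64-bit in_param, packed with the set_param_l/set_param_h helpers defined further down in this header and unpacked with get_param_l/get_param_h; mlx4_free_mtt_range in the mr.c hunk below, for example, packs first_seg into the low half and order into the high half. The short stand-alone check below demonstrates an equivalent, endian-neutral shift-based packing; it illustrates the layout only and is not the patch's helpers, which poke the halves through a u32 pointer (hence their FIXME notes).

/* Illustration of packing two u32 values into one u64 command parameter. */
#include <assert.h>
#include <stdint.h>

static void set_low(uint64_t *arg, uint32_t val)
{
	*arg = (*arg & 0xFFFFFFFF00000000ull) | val;
}

static void set_high(uint64_t *arg, uint32_t val)
{
	*arg = (*arg & 0x00000000FFFFFFFFull) | ((uint64_t)val << 32);
}

int main(void)
{
	uint64_t in_param = 0;

	set_low(&in_param, 0x1234);	/* e.g. first_seg */
	set_high(&in_param, 3);		/* e.g. order     */

	assert((uint32_t)in_param == 0x1234);
	assert((uint32_t)(in_param >> 32) == 3);
	return 0;
}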
void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table); void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table); -int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port); +/* resource tracker functions */ +int mlx4_init_resource_tracker(struct mlx4_dev *dev); + +void mlx4_free_resource_tracker(struct mlx4_dev *dev); + +int mlx4_get_slave_from_resource_id(struct mlx4_dev *dev, enum mlx4_resource resource_type, + int resource_id, int *slave); +int mlx4_get_resource_obj(struct mlx4_dev *dev, enum mlx4_resource resource_type, + int resource_id, int slave, struct mlx4_tracked_resource **rt); + +/* the parameter "state" indicates the current status (like in qp/mtt); + the range must be reserved before the allocation */ +int mlx4_add_resource_for_slave(struct mlx4_dev *dev, enum mlx4_resource resource_type, + int slave_id, int resource_id, unsigned long state); +/* use this function when there is a call for reservation of qp/mtt */ +int mlx4_add_range_resource_for_slave(struct mlx4_dev *dev, enum mlx4_resource resource_type, + int slave_id, int from, int cnt); + +int mlx4_delete_resource_for_slave(struct mlx4_dev *dev, enum mlx4_resource resource_type, + int slave_id, int resource_id); + +int mlx4_delete_range_resource_for_slave(struct mlx4_dev *dev, enum mlx4_resource resource_type, + int slave_id, int from, int cnt); + +void mlx4_delete_all_resources_for_slave(struct mlx4_dev *dev, int slave_id); + +int mlx4_add_mcg_to_tracked_qp(struct mlx4_dev *dev, int qpn, u8* gid, enum mlx4_protocol prot) ; +int mlx4_remove_mcg_from_tracked_qp(struct mlx4_dev *dev, int qpn, u8* gid); + +int mlx4_add_port_to_tracked_mac(struct mlx4_dev *dev, int qpn, u8 port) ; + +void mlx4_delete_specific_res_type_for_slave(struct mlx4_dev *dev, int slave_id, + enum mlx4_resource resource_type); +void mlx4_delete_specific_res_id(struct mlx4_dev *dev, int slave_id, + enum mlx4_resource resource_type, int res_id); + +int mlx4_add_mtt_resource_for_slave(struct mlx4_dev *dev, + int slave_id, int resource_id, + unsigned long state, int order); +/* Resource tracker - verification functions. */ + +int mlx4_verify_resource_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); + +int mlx4_verify_mpt_index(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox); + +int mlx4_verify_srq_aram(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox) ; + +int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port, int pk_tbl_sz); +int mlx4_SET_PORT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_INIT_PORT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps); + +int mlx4_MCAST_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, +
struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_PROMISC_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); int mlx4_qp_detach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], enum mlx4_protocol prot, enum mlx4_steer_type steer); int mlx4_qp_attach_common(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], int block_mcast_loopback, enum mlx4_protocol prot, enum mlx4_steer_type steer); +int mlx4_SET_MCAST_FLTR_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_SET_VLAN_FLTR_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_common_set_vlan_fltr(struct mlx4_dev *dev, int function, + int port, void *buf); +int mlx4_common_dump_eth_stats(struct mlx4_dev *dev, int slave, u32 in_mod, + struct mlx4_cmd_mailbox *outbox); +int mlx4_DUMP_ETH_STATS_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_PKEY_TABLE_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_QUERY_IF_STAT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_GET_GID_MAP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd); +int mlx4_register_pkey_tree(struct mlx4_dev *dev, int slave); +void mlx4_unregister_pkey_sysfs(struct mlx4_dev *dev, int slave); +int mlx4_sysfs_setup(void); +void mlx4_sysfs_cleanup(void); +int mlx4_get_mgm_entry_size(struct mlx4_dev *dev); +int mlx4_get_qp_per_mgm(struct mlx4_dev *dev); + +#if defined(GID_FMT) || defined(GID_ARG) +#error redefinition of GID macros +#else +#define GID_FMT "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x" +#define GID_ARG(g) (g)[0],(g)[1],(g)[2],(g)[3],(g)[4],(g)[5],(g)[6],(g)[7],(g)[8],(g)[9],(g)[10],(g)[11],(g)[12],(g)[13],(g)[14],(g)[15] +#endif + +/* FIXME: endianess */ +static inline void set_param_l(void *arg, u32 val) +{ + *((u32 *)arg) = val; +} + +/* FIXME: endianess */ +static inline void set_param_h(void *arg, u32 val) +{ + *(((u32 *)arg) + 1) = val; +} + +/* FIXME: endianess */ +static inline u32 get_param_l(void *arg) +{ + return *((u32 *)arg); +} + +/* FIXME: endianess */ +static inline u32 get_param_h(void *arg) +{ + return *(((u32 *)arg) + 1); +} + +static inline spinlock_t *mlx4_tlock(struct mlx4_dev *dev) +{ + return &mlx4_priv(dev)->mfunc.master.res_tracker.lock; +} + +#define NOT_MASKED_PD_BITS 17 + +#ifdef CONFIG_MLX4_RTT_TESTS +int mlx4_rtt_init(struct mlx4_dev *dev); +void mlx4_rtt_cleanup(struct mlx4_dev *dev); +#else +static inline int mlx4_rtt_init(struct mlx4_dev *dev) +{ + return 0; +} + +static inline void mlx4_rtt_cleanup(struct mlx4_dev *dev) +{ +} +#endif + + + #endif /* MLX4_H */ diff --git a/drivers/net/mlx4/mlx4_en.h b/drivers/net/mlx4/mlx4_en.h index df9053af3b6d3..84ed1d8bc8fe8 100644 --- a/drivers/net/mlx4/mlx4_en.h +++ 
b/drivers/net/mlx4/mlx4_en.h @@ -34,12 +34,11 @@ #ifndef _MLX4_EN_H_ #define _MLX4_EN_H_ -#include #include #include #include #include -#include +#include #include #include @@ -51,11 +50,42 @@ #include "en_port.h" #define DRV_NAME "mlx4_en" -#define DRV_VERSION "1.5.4.2" -#define DRV_RELDATE "October 2011" +#define DRV_VERSION "1.5.4.24" +#define DRV_RELDATE "June 2010" #define MLX4_EN_MSG_LEVEL (NETIF_MSG_LINK | NETIF_MSG_IFDOWN) +#define en_print(level, priv, format, arg...) \ + { \ + if ((priv)->registered) \ + printk(level "%s: %s: " format, DRV_NAME, \ + (priv->dev)->name, ## arg); \ + else \ + printk(level "%s: %s: Port %d: " format, \ + DRV_NAME, dev_name(&priv->mdev->pdev->dev), \ + (priv)->port, ## arg); \ + } + +#define en_dbg(mlevel, priv, format, arg...) \ + if (NETIF_MSG_##mlevel & priv->msg_enable) \ + en_print(KERN_DEBUG, priv, format, ## arg) +#define en_warn(priv, format, arg...) \ + en_print(KERN_WARNING, priv, format, ## arg) +#define en_err(priv, format, arg...) \ + en_print(KERN_ERR, priv, format, ## arg) +#define en_info(priv, format, arg...) \ + en_print(KERN_INFO, priv, format, ## arg) + +#define mlx4_err(mdev, format, arg...) \ + printk(KERN_ERR "%s %s: " format , DRV_NAME ,\ + dev_name(&mdev->pdev->dev) , ## arg) +#define mlx4_info(mdev, format, arg...) \ + printk(KERN_INFO "%s %s: " format , DRV_NAME ,\ + dev_name(&mdev->pdev->dev) , ## arg) +#define mlx4_warn(mdev, format, arg...) \ + printk(KERN_WARNING "%s %s: " format , DRV_NAME ,\ + dev_name(&mdev->pdev->dev) , ## arg) + /* * Device constants */ @@ -63,6 +93,7 @@ #define MLX4_EN_PAGE_SHIFT 12 #define MLX4_EN_PAGE_SIZE (1 << MLX4_EN_PAGE_SHIFT) +#define MAX_TX_RINGS (MLX4_EN_NUM_HASH_RINGS + 1 + MLX4_EN_NUM_PPP_RINGS) #define MAX_RX_RINGS 16 #define MIN_RX_RINGS 4 #define TXBB_SIZE 64 @@ -72,6 +103,7 @@ #define STAMP_SHIFT 31 #define STAMP_VAL 0x7fffffff #define STATS_DELAY (HZ / 4) +#define STATS_FREQ_MASK 0x7 /* Typical TSO descriptor with 16 gather entries is 352 bytes... 
*/ #define MAX_DESC_SIZE 512 @@ -87,6 +119,7 @@ #define MLX4_EN_ALLOC_SIZE (PAGE_SIZE << MLX4_EN_ALLOC_ORDER) #define MLX4_EN_MAX_LRO_DESCRIPTORS 32 +#define MLX4_EN_NUM_IPFRAG_SESSIONS 16 /* Receive fragment sizes; we use at most 4 fragments (for 9600 byte MTU * and 4K allocations) */ @@ -96,7 +129,7 @@ enum { FRAG_SZ2 = 4096, FRAG_SZ3 = MLX4_EN_ALLOC_SIZE }; -#define MLX4_EN_MAX_RX_FRAGS 4 +#define MLX4_EN_MAX_RX_FRAGS 4 /* Maximum ring sizes */ #define MLX4_EN_MAX_TX_SIZE 8192 @@ -107,14 +140,15 @@ enum { #define MLX4_EN_MIN_TX_SIZE (4096 / TXBB_SIZE) #define MLX4_EN_SMALL_PKT_SIZE 64 -#define MLX4_EN_NUM_TX_RINGS 8 +#define MLX4_EN_TX_HASH_SIZE 256 +#define MLX4_EN_TX_HASH_MASK (MLX4_EN_TX_HASH_SIZE - 1) +#define MLX4_EN_NUM_HASH_RINGS 4 #define MLX4_EN_NUM_PPP_RINGS 8 -#define MAX_TX_RINGS (MLX4_EN_NUM_TX_RINGS + MLX4_EN_NUM_PPP_RINGS) #define MLX4_EN_DEF_TX_RING_SIZE 512 #define MLX4_EN_DEF_RX_RING_SIZE 1024 -/* Target number of packets to coalesce with interrupt moderation */ -#define MLX4_EN_RX_COAL_TARGET 44 +/* Target number of bytes to coalesce with interrupt moderation */ +#define MLX4_EN_RX_COAL_TARGET 0x20000 #define MLX4_EN_RX_COAL_TIME 0x10 #define MLX4_EN_TX_COAL_PKTS 5 @@ -134,7 +168,7 @@ enum { #define MLX4_EN_DEF_RX_PAUSE 1 #define MLX4_EN_DEF_TX_PAUSE 1 -/* Interval between successive polls in the Tx routine when polling is used +/* Interval between sucessive polls in the Tx routine when polling is used instead of interrupts (in per-core Tx rings) - should be power of 2 */ #define MLX4_EN_TX_POLL_MODER 16 #define MLX4_EN_TX_POLL_TIMEOUT (HZ / 4) @@ -218,9 +252,6 @@ struct mlx4_en_tx_desc { #define MLX4_EN_USE_SRQ 0x01000000 -#define MLX4_EN_CX3_LOW_ID 0x1000 -#define MLX4_EN_CX3_HIGH_ID 0x1005 - struct mlx4_en_rx_alloc { struct page *page; u16 offset; @@ -249,10 +280,20 @@ struct mlx4_en_tx_ring { struct mlx4_srq dummy; unsigned long bytes; unsigned long packets; - unsigned long tx_csum; spinlock_t comp_lock; - struct mlx4_bf bf; - bool bf_enabled; +}; + +struct mlx4_en_ipfrag { + struct sk_buff *fragments; + struct sk_buff *last; + __be32 saddr; + __be32 daddr; + __be16 id; + u8 protocol; + int total_len; + u16 offset; + unsigned int vlan; + __be16 sl_vid; }; struct mlx4_en_rx_desc { @@ -263,6 +304,7 @@ struct mlx4_en_rx_desc { struct mlx4_en_rx_ring { struct mlx4_hwq_resources wqres; struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS]; + struct net_lro_mgr lro; u32 size ; /* number of Rx descs*/ u32 actual_size; u32 size_mask; @@ -272,16 +314,14 @@ struct mlx4_en_rx_ring { u32 prod; u32 cons; u32 buf_size; - u8 fcs_del; void *buf; void *rx_info; unsigned long bytes; unsigned long packets; - unsigned long csum_ok; - unsigned long csum_none; + struct mlx4_en_ipfrag ipfrag[MLX4_EN_NUM_IPFRAG_SESSIONS]; + unsigned int use_frags; }; - static inline int mlx4_en_can_lro(__be16 status) { return (status & cpu_to_be16(MLX4_CQE_STATUS_IPV4 | @@ -329,6 +369,8 @@ struct mlx4_en_port_profile { struct mlx4_en_profile { int rss_xor; + int num_lro; + int ip_reasm; int tcp_rss; int udp_rss; u8 rss_mask; @@ -390,6 +432,9 @@ struct mlx4_en_pkt_stats { }; struct mlx4_en_port_stats { + unsigned long lro_aggregated; + unsigned long lro_flushed; + unsigned long lro_no_desc; unsigned long tso_packets; unsigned long queue_stopped; unsigned long wake_queue; @@ -398,7 +443,7 @@ struct mlx4_en_port_stats { unsigned long rx_chksum_good; unsigned long rx_chksum_none; unsigned long tx_chksum_offload; -#define NUM_PORT_STATS 8 +#define NUM_PORT_STATS 11 }; struct mlx4_en_perf_stats { @@ -417,24 
+462,31 @@ struct mlx4_en_frag_info { u16 frag_stride; u16 frag_align; u16 last_offset; +}; +struct mlx4_en_tx_hash_entry { + u8 cnt; + unsigned int small_pkts; + unsigned int big_pkts; + unsigned int ring; }; struct mlx4_en_priv { struct mlx4_en_dev *mdev; struct mlx4_en_port_profile *prof; struct net_device *dev; - unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)]; + struct vlan_group *vlgrp; struct net_device_stats stats; struct net_device_stats ret_stats; struct mlx4_en_port_state port_state; spinlock_t stats_lock; + u8 stat_cnt; - unsigned long last_moder_packets[MAX_RX_RINGS]; + unsigned long last_moder_packets; unsigned long last_moder_tx_packets; - unsigned long last_moder_bytes[MAX_RX_RINGS]; + unsigned long last_moder_bytes; unsigned long last_moder_jiffies; - int last_moder_time[MAX_RX_RINGS]; + int last_moder_time; u16 rx_usecs; u16 rx_frames; u16 tx_usecs; @@ -456,19 +508,20 @@ struct mlx4_en_priv { int port; int registered; int allocated; - int stride; + int rx_csum; u64 mac; int mac_index; unsigned max_mtu; int base_qpn; struct mlx4_en_rss_map rss_map; - u32 ctrl_flags; + u16 tx_prio_map[8]; u32 flags; #define MLX4_EN_FLAG_PROMISC 0x1 #define MLX4_EN_FLAG_MC_PROMISC 0x2 u32 tx_ring_num; u32 rx_ring_num; + u32 udp_rings; u32 rx_skb_size; struct mlx4_en_frag_info frag_info[MLX4_EN_MAX_RX_FRAGS]; u16 num_frags; @@ -478,6 +531,7 @@ struct mlx4_en_priv { struct mlx4_en_rx_ring rx_ring[MAX_RX_RINGS]; struct mlx4_en_cq tx_cq[MAX_TX_RINGS]; struct mlx4_en_cq rx_cq[MAX_RX_RINGS]; + struct mlx4_en_tx_hash_entry tx_hash[MLX4_EN_TX_HASH_SIZE]; struct work_struct mcast_task; struct work_struct mac_task; struct work_struct watchdog_task; @@ -488,18 +542,21 @@ struct mlx4_en_priv { struct mlx4_en_port_stats port_stats; char *mc_addrs; int mc_addrs_cnt; - struct mlx4_en_stat_out_mbox hw_stats; + struct mlx4_stat_out_mbox hw_stats; int vids[128]; bool wol; }; enum mlx4_en_wol { - MLX4_EN_WOL_MAGIC = (1ULL << 61), - MLX4_EN_WOL_ENABLED = (1ULL << 62), - MLX4_EN_WOL_DO_MODIFY = (1ULL << 63), + MLX4_EN_WOL_MAGIC = (1 << 29), + MLX4_EN_WOL_ENABLED = (1 << 30), + MLX4_EN_WOL_DO_MODIFY = (1 << 31), }; - +int mlx4_en_rx_frags(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring, + struct sk_buff *skb, struct mlx4_cqe *cqe); +void mlx4_en_flush_frags(struct mlx4_en_priv *priv, + struct mlx4_en_rx_ring *ring); void mlx4_en_destroy_netdev(struct net_device *dev); int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, struct mlx4_en_port_profile *prof); @@ -513,8 +570,7 @@ int mlx4_en_alloc_resources(struct mlx4_en_priv *priv); int mlx4_en_create_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, int entries, int ring, enum cq_type mode); void mlx4_en_destroy_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq); -int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, - int cq_idx); +int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq); void mlx4_en_deactivate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq); int mlx4_en_set_cq_moder(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq); int mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq); @@ -522,10 +578,10 @@ int mlx4_en_arm_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq); void mlx4_en_poll_tx_cq(unsigned long data); void mlx4_en_tx_irq(struct mlx4_cq *mcq); u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb); -netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev); +int mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev); int 
mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring, - int qpn, u32 size, u16 stride); + u32 size, u16 stride); void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring); int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring, @@ -534,8 +590,7 @@ void mlx4_en_deactivate_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring); int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv, - struct mlx4_en_rx_ring *ring, - u32 size, u16 stride); + struct mlx4_en_rx_ring *ring, u32 size); void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring); int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv); @@ -544,6 +599,9 @@ void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv, int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget); +int mlx4_en_process_rx_cq_skb(struct net_device *dev, + struct mlx4_en_cq *cq, + int budget); int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget); void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride, int is_tx, int rss, int qpn, int cqn, @@ -553,19 +611,18 @@ int mlx4_en_map_buffer(struct mlx4_buf *buf); void mlx4_en_unmap_buffer(struct mlx4_buf *buf); void mlx4_en_calc_rx_buf(struct net_device *dev); +void mlx4_en_set_prio_map(struct mlx4_en_priv *priv, u16 *prio_map, u32 ring_num); int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv); void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv); int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring); void mlx4_en_rx_irq(struct mlx4_cq *mcq); -int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode); -int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, struct mlx4_en_priv *priv); +int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, u8 port, struct vlan_group *grp); int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx); int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, u8 promisc); -int mlx4_en_DUMP_ETH_STATS(struct mlx4_en_dev *mdev, u8 port, u8 reset); int mlx4_en_QUERY_PORT(struct mlx4_en_dev *mdev, u8 port); #define MLX4_EN_NUM_SELF_TEST 5 @@ -576,36 +633,4 @@ u64 mlx4_en_mac_to_u64(u8 *addr); * Globals */ extern const struct ethtool_ops mlx4_en_ethtool_ops; - - - -/* - * printk / logging functions - */ - -int en_print(const char *level, const struct mlx4_en_priv *priv, - const char *format, ...) __attribute__ ((format (printf, 3, 4))); - -#define en_dbg(mlevel, priv, format, arg...) \ -do { \ - if (NETIF_MSG_##mlevel & priv->msg_enable) \ - en_print(KERN_DEBUG, priv, format, ##arg); \ -} while (0) -#define en_warn(priv, format, arg...) \ - en_print(KERN_WARNING, priv, format, ##arg) -#define en_err(priv, format, arg...) \ - en_print(KERN_ERR, priv, format, ##arg) -#define en_info(priv, format, arg...) \ - en_print(KERN_INFO, priv, format, ## arg) - -#define mlx4_err(mdev, format, arg...) \ - pr_err("%s %s: " format, DRV_NAME, \ - dev_name(&mdev->pdev->dev), ##arg) -#define mlx4_info(mdev, format, arg...) \ - pr_info("%s %s: " format, DRV_NAME, \ - dev_name(&mdev->pdev->dev), ##arg) -#define mlx4_warn(mdev, format, arg...) \ - pr_warning("%s %s: " format, DRV_NAME, \ - dev_name(&mdev->pdev->dev), ##arg) - #endif diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c index 9c188bdd7f4f2..c388bd8da6c67 100644 --- a/drivers/net/mlx4/mr.c +++ b/drivers/net/mlx4/mr.c @@ -32,34 +32,19 @@ * SOFTWARE. 
*/ +#include #include -#include +#include #include #include "mlx4.h" #include "icm.h" +#include "fmr_slave.h" /* * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits. */ -struct mlx4_mpt_entry { - __be32 flags; - __be32 qpn; - __be32 key; - __be32 pd_flags; - __be64 start; - __be64 length; - __be32 lkey; - __be32 win_cnt; - u8 reserved1[3]; - u8 mtt_rep; - __be64 mtt_seg; - __be32 mtt_sz; - __be32 entity_size; - __be32 first_byte_offset; -} __packed; - #define MLX4_MPT_FLAG_SW_OWNS (0xfUL << 28) #define MLX4_MPT_FLAG_FREE (0x3UL << 28) #define MLX4_MPT_FLAG_MIO (1 << 17) @@ -71,6 +56,8 @@ struct mlx4_mpt_entry { #define MLX4_MPT_PD_FLAG_RAE (1 << 28) #define MLX4_MPT_PD_FLAG_EN_INV (3 << 24) +#define MLX4_MPT_FLAG2_FBO_EN (1 << 7) + #define MLX4_MPT_STATUS_SW 0xF0 #define MLX4_MPT_STATUS_HW 0x00 @@ -91,7 +78,7 @@ static u32 mlx4_buddy_alloc(struct mlx4_buddy *buddy, int order) } spin_unlock(&buddy->lock); - return -1; + return 0xFFFFFFFF; found: clear_bit(seg, buddy->bits[o]); @@ -179,26 +166,74 @@ static void mlx4_buddy_cleanup(struct mlx4_buddy *buddy) kfree(buddy->num_free); } -static u32 mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order) +inline u32 __mlx4_reserve_mtt_range(struct mlx4_dev *dev, int order) +{ + return mlx4_buddy_alloc(&mlx4_priv(dev)->mr_table.mtt_buddy, order); +} + +u32 __mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order, + enum mlx4_mr_flags flags) { struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; u32 seg; + struct mlx4_buddy *buddy; + struct mlx4_icm_table *icm_table; + int fmr_flow; + + fmr_flow = mlx4_fmr_flow(dev, flags); + + if (fmr_flow) { + buddy = &mr_table->fmr.mtt_buddy; + icm_table = &mr_table->fmr.mtt_table; + } else { + buddy = &mr_table->mtt_buddy; + icm_table = &mr_table->mtt_table; + } - seg = mlx4_buddy_alloc(&mr_table->mtt_buddy, order); - if (seg == -1) - return -1; + seg = mlx4_buddy_alloc(buddy, order); + if (seg == 0xFFFFFFFF) { + mlx4_err(dev, "alloc mtt range failed in budddy alloc\n"); + return 0xFFFFFFFF; + } - if (mlx4_table_get_range(dev, &mr_table->mtt_table, seg, - seg + (1 << order) - 1)) { - mlx4_buddy_free(&mr_table->mtt_buddy, seg, order); - return -1; + if (mlx4_table_get_range(dev, icm_table, seg, + seg + (1 << order) - 1, flags)) { + mlx4_buddy_free(buddy, seg, order); + mlx4_err(dev, "alloc mtt range failed to get table range\n"); + return 0xFFFFFFFF; } return seg; } +u32 mlx4_alloc_mtt_range(struct mlx4_dev *dev, int order, + enum mlx4_mr_flags flags) +{ + u64 in_param; + u64 out_param; + u16 op; + int err; + + op = mlx4_fmr_flow(dev, flags) ? RES_OP_RESERVE : + RES_OP_RESERVE_AND_MAP; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, order); + err = mlx4_cmd_imm(dev, in_param, &out_param, RES_MTT, op, + MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, 0); + if (err) { + mlx4_dbg(dev, "Failed to alloc mtt order:%d fmr_flow %d\n", + order, mlx4_fmr_flow(dev, flags)); + return 0xFFFFFFFF; + } + return get_param_l(&out_param); + } + return __mlx4_alloc_mtt_range(dev, order, flags); +} + int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift, - struct mlx4_mtt *mtt) + struct mlx4_mtt *mtt, enum mlx4_mr_flags flags) { int i; @@ -212,24 +247,81 @@ int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift, for (mtt->order = 0, i = dev->caps.mtts_per_seg; i < npages; i <<= 1) ++mtt->order; - mtt->first_seg = mlx4_alloc_mtt_range(dev, mtt->order); - if (mtt->first_seg == -1) + mtt->first_seg = mlx4_fmr_flow(dev, flags) ? 
+ __mlx4_alloc_mtt_range(dev, mtt->order, flags) : + mlx4_alloc_mtt_range(dev, mtt->order, flags); + if (mtt->first_seg == 0xFFFFFFFF) return -ENOMEM; return 0; } EXPORT_SYMBOL_GPL(mlx4_mtt_init); -void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt) +inline void __mlx4_free_mtt_reserved_range(struct mlx4_dev *dev, u32 first_seg, + int order) +{ + mlx4_buddy_free(&mlx4_priv(dev)->mr_table.mtt_buddy, + first_seg, order); +} + +void __mlx4_free_mtt_range(struct mlx4_dev *dev, u32 first_seg, int order, + enum mlx4_mr_flags flags) { struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; + struct mlx4_buddy *buddy; + struct mlx4_icm_table *icm_table; + int fmr_flow; + + fmr_flow = mlx4_fmr_flow(dev, flags); + + if (fmr_flow) { + buddy = &mr_table->fmr.mtt_buddy; + icm_table = &mr_table->fmr.mtt_table; + } else { + buddy = &mr_table->mtt_buddy; + icm_table = &mr_table->mtt_table; + } + + mlx4_buddy_free(buddy, first_seg, order); + mlx4_table_put_range(dev, icm_table, first_seg, + first_seg + (1 << order) - 1, flags); +} + +void mlx4_free_mtt_range(struct mlx4_dev *dev, u32 first_seg, int order, + enum mlx4_mr_flags flags) +{ + u64 in_param; + u16 op; + int err; + + op = mlx4_fmr_flow(dev, flags) ? RES_OP_RESERVE : + RES_OP_RESERVE_AND_MAP; + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, first_seg); + set_param_h(&in_param, order); + err = mlx4_cmd(dev, in_param, RES_MTT, op, + MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, 0); + if (err) + mlx4_warn(dev, "Failed to free mtt range at:%d order:%d" + " fmr_flow %d\n", first_seg, order, + mlx4_fmr_flow(dev, flags)); + return; + } + __mlx4_free_mtt_range(dev, first_seg, order, flags); +} + +void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + enum mlx4_mr_flags flags) +{ if (mtt->order < 0) return; - mlx4_buddy_free(&mr_table->mtt_buddy, mtt->first_seg, mtt->order); - mlx4_table_put_range(dev, &mr_table->mtt_table, mtt->first_seg, - mtt->first_seg + (1 << mtt->order) - 1); + if (mlx4_fmr_flow(dev, flags)) + __mlx4_free_mtt_range(dev, mtt->first_seg, mtt->order, flags); + else + mlx4_free_mtt_range(dev, mtt->first_seg, mtt->order, flags); } EXPORT_SYMBOL_GPL(mlx4_mtt_cleanup); @@ -244,77 +336,278 @@ static u32 hw_index_to_key(u32 ind) return (ind >> 24) | (ind << 8); } -static u32 key_to_hw_index(u32 key) -{ - return (key << 24) | (key >> 8); -} - static int mlx4_SW2HW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, int mpt_index) { - return mlx4_cmd(dev, mailbox->dma, mpt_index, 0, MLX4_CMD_SW2HW_MPT, - MLX4_CMD_TIME_CLASS_B); + return mlx4_cmd(dev, mailbox->dma | dev->caps.function , mpt_index, + 0, MLX4_CMD_SW2HW_MPT, MLX4_CMD_TIME_CLASS_B, 0); } static int mlx4_HW2SW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, int mpt_index) { return mlx4_cmd_box(dev, 0, mailbox ? 
mailbox->dma : 0, mpt_index, - !mailbox, MLX4_CMD_HW2SW_MPT, MLX4_CMD_TIME_CLASS_B); + !mailbox, MLX4_CMD_HW2SW_MPT, MLX4_CMD_TIME_CLASS_B, 0); } -int mlx4_mr_alloc(struct mlx4_dev *dev, u32 pd, u64 iova, u64 size, u32 access, - int npages, int page_shift, struct mlx4_mr *mr) +int mlx4_mr_reserve_range(struct mlx4_dev *dev, int cnt, int align, u32 *base_mridx) { struct mlx4_priv *priv = mlx4_priv(dev); - u32 index; - int err; + u32 mridx; - index = mlx4_bitmap_alloc(&priv->mr_table.mpt_bitmap); - if (index == -1) + mridx = mlx4_bitmap_alloc_range(&priv->mr_table.mpt_bitmap, cnt, align); + if (mridx == -1) return -ENOMEM; + *base_mridx = mridx; + return 0; + +} +EXPORT_SYMBOL_GPL(mlx4_mr_reserve_range); + +void mlx4_mr_release_range(struct mlx4_dev *dev, u32 base_mridx, int cnt) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + mlx4_bitmap_free_range(&priv->mr_table.mpt_bitmap, base_mridx, cnt); +} +EXPORT_SYMBOL_GPL(mlx4_mr_release_range); + +int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd, + u64 iova, u64 size, u32 access, int npages, + int page_shift, struct mlx4_mr *mr) +{ mr->iova = iova; mr->size = size; mr->pd = pd; mr->access = access; - mr->enabled = 0; - mr->key = hw_index_to_key(index); + mr->enabled = MLX4_MR_DISABLED; + mr->key = hw_index_to_key(mridx); + + return mlx4_mtt_init(dev, npages, page_shift, &mr->mtt, mr->flags); +} +EXPORT_SYMBOL_GPL(mlx4_mr_alloc_reserved); + +static int mlx4_WRITE_MTT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, + int num_entries) +{ + return mlx4_cmd(dev, mailbox->dma, num_entries, 0, MLX4_CMD_WRITE_MTT, + MLX4_CMD_TIME_CLASS_A, 0); +} + +int __mlx4_mr_reserve(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + return mlx4_bitmap_alloc(&priv->mr_table.mpt_bitmap); +} - err = mlx4_mtt_init(dev, npages, page_shift, &mr->mtt); +static int mlx4_fmr_reserve(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + return mlx4_bitmap_alloc(&priv->mr_table.fmr.mpt_bitmap); +} + +static void mlx4_fmr_release(struct mlx4_dev *dev, u32 idx) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + mlx4_bitmap_free(&priv->mr_table.fmr.mpt_bitmap, idx); +} + +int mlx4_mr_reserve(struct mlx4_dev *dev, enum mlx4_mr_flags flags) +{ + u64 out_param; + u64 in_param = 0; + int idx = 0; + int fmr_flow; + + fmr_flow = mlx4_fmr_flow(dev, flags); + if (fmr_flow) { + idx = mlx4_fmr_reserve(dev); + if (idx < 0) { + mlx4_warn(dev, "Failed allocating mpt index for FMR\n"); + return -1; + } + set_param_l(&in_param, dev->caps.fmr_dmpt_base_idx + idx); + set_param_h(&in_param, flags); + } + + if (mlx4_is_mfunc(dev)) { + if (mlx4_cmd_imm(dev, in_param, &out_param, RES_MPT, + RES_OP_RESERVE, MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, 0)) { + mlx4_warn(dev, "Failed to reserve mr fmr_flow %d" + ", idx %d\n", fmr_flow, idx); + return -1; + } + mlx4_dbg(dev, "Allocated mpt index = 0x%x, fmr_flow %d\n", + get_param_l(&out_param), fmr_flow); + return get_param_l(&out_param); + } + return __mlx4_mr_reserve(dev); +} + +void __mlx4_mr_release(struct mlx4_dev *dev, u32 index) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + mlx4_bitmap_free(&priv->mr_table.mpt_bitmap, index); +} + +void mlx4_mr_release(struct mlx4_dev *dev, u32 index, enum mlx4_mr_flags flags) +{ + u64 in_param = 0; + int fmr_flow; + + fmr_flow = mlx4_fmr_flow(dev, flags); + if (fmr_flow) { + mlx4_fmr_release(dev, index - dev->caps.fmr_dmpt_base_idx); + set_param_h(&in_param, flags); + } + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, index); + if 
(mlx4_cmd(dev, in_param, RES_MPT, RES_OP_RESERVE, + MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, 0)) + mlx4_warn(dev, "Failed to release mr index:%d," + " fmr_flow %d\n", index, fmr_flow); + else + mlx4_dbg(dev, "Release mpt index = 0x%x, fmr_flow %d\n", + index, fmr_flow); + return; + } + __mlx4_mr_release(dev, index); +} + +int __mlx4_mr_alloc_icm(struct mlx4_dev *dev, u32 index, + enum mlx4_mr_flags flags) +{ + struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; + int fmr_flow = mlx4_fmr_flow(dev, flags); + struct mlx4_icm_table *icm_table; + + icm_table = fmr_flow ? &mr_table->fmr.dmpt_table : + &mr_table->dmpt_table; + + if (fmr_flow) + index -= dev->caps.fmr_dmpt_base_idx; + + return mlx4_table_get(dev, icm_table, index, flags); +} + +int mlx4_mr_alloc_icm(struct mlx4_dev *dev, u32 index, enum mlx4_mr_flags flags) +{ + u64 param; + int fmr_flow = mlx4_fmr_flow(dev, flags); + int err = 0; + + if (mlx4_is_mfunc(dev)) { + set_param_l(¶m, index); + err = mlx4_cmd_imm(dev, param, ¶m, RES_MPT, RES_OP_MAP_ICM, + MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, 0); + if (err) { + mlx4_err(dev, "Alloc icm failed with err %d\n", err); + return err; + } + } + + if (!mlx4_is_mfunc(dev) || fmr_flow) + err = __mlx4_mr_alloc_icm(dev, index, flags); + + return err; +} + +void __mlx4_mr_free_icm(struct mlx4_dev *dev, u32 index, + enum mlx4_mr_flags flags) +{ + struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; + int fmr_flow = mlx4_fmr_flow(dev, flags); + struct mlx4_icm_table *icm_table; + + icm_table = fmr_flow ? &mr_table->fmr.dmpt_table : + &mr_table->dmpt_table; + + if (fmr_flow) + index -= dev->caps.fmr_dmpt_base_idx; + + mlx4_table_put(dev, icm_table, index, flags); +} + +void mlx4_mr_free_icm(struct mlx4_dev *dev, u32 index, enum mlx4_mr_flags flags) +{ + u64 in_param; + int fmr_flow = mlx4_fmr_flow(dev, flags); + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, index); + if (mlx4_cmd(dev, in_param, RES_MPT, RES_OP_MAP_ICM, + MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, 0)) { + mlx4_warn(dev, "Failed to free icm of mr index:%d\n", index); + return; + } + } + + if (!mlx4_is_mfunc(dev) || fmr_flow) + return __mlx4_mr_free_icm(dev, index, flags); +} + +int mlx4_mr_alloc(struct mlx4_dev *dev, u32 pd, u64 iova, u64 size, u32 access, + int npages, int page_shift, struct mlx4_mr *mr) +{ + u32 index; + int err; + + index = mlx4_mr_reserve(dev, mr->flags); + if (index == -1) + return -ENOMEM; + + err = mlx4_mr_alloc_reserved(dev, index, pd, iova, size, + access, npages, page_shift, mr); if (err) - mlx4_bitmap_free(&priv->mr_table.mpt_bitmap, index); + mlx4_mr_release(dev, index, mr->flags); return err; } EXPORT_SYMBOL_GPL(mlx4_mr_alloc); -void mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr) +void mlx4_mr_free_reserved(struct mlx4_dev *dev, struct mlx4_mr *mr) { - struct mlx4_priv *priv = mlx4_priv(dev); int err; + int fmr_flow = mlx4_fmr_flow(dev, mr->flags); - if (mr->enabled) { + if (mr->enabled == MLX4_MR_EN_HW || + (fmr_flow && (mr->enabled == MLX4_MR_EN_SW))) { err = mlx4_HW2SW_MPT(dev, NULL, - key_to_hw_index(mr->key) & - (dev->caps.num_mpts - 1)); + key_to_mpt_index(dev, mr->key)); if (err) - mlx4_warn(dev, "HW2SW_MPT failed (%d)\n", err); + mlx4_warn(dev, "xxx HW2SW_MPT failed (%d)\n", err); + + mr->enabled = MLX4_MR_EN_SW; } + mlx4_mtt_cleanup(dev, &mr->mtt, mr->flags); +} +EXPORT_SYMBOL_GPL(mlx4_mr_free_reserved); - mlx4_mtt_cleanup(dev, &mr->mtt); - mlx4_bitmap_free(&priv->mr_table.mpt_bitmap, key_to_hw_index(mr->key)); +void mlx4_mr_free(struct mlx4_dev *dev, struct 
mlx4_mr *mr) +{ + mlx4_mr_free_reserved(dev, mr); + if (mr->enabled) + mlx4_mr_free_icm(dev, key_to_mpt_index(dev, mr->key), + mr->flags); + mlx4_mr_release(dev, key_to_mpt_index(dev, mr->key), mr->flags); } EXPORT_SYMBOL_GPL(mlx4_mr_free); int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr) { - struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; struct mlx4_cmd_mailbox *mailbox; struct mlx4_mpt_entry *mpt_entry; int err; - err = mlx4_table_get(dev, &mr_table->dmpt_table, key_to_hw_index(mr->key)); + err = mlx4_mr_alloc_icm(dev, key_to_mpt_index(dev, mr->key), mr->flags); if (err) return err; @@ -341,7 +634,12 @@ int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr) mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL); mpt_entry->mtt_seg = 0; } else { - mpt_entry->mtt_seg = cpu_to_be64(mlx4_mtt_addr(dev, &mr->mtt)); + /* compute the GLOBAL segment index */ + mpt_entry->mtt_seg = !mlx4_fmr_flow(dev, mr->flags) ? + cpu_to_be64(mlx4_mtt_addr(dev, &mr->mtt)) : + cpu_to_be64((u64)dev->caps.fmr_mtt_base_idx + * dev->caps.mtt_entry_sz + + mlx4_mtt_addr(dev, &mr->mtt)); } if (mr->mtt.order >= 0 && mr->mtt.page_shift == 0) { @@ -356,13 +654,13 @@ int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr) } err = mlx4_SW2HW_MPT(dev, mailbox, - key_to_hw_index(mr->key) & (dev->caps.num_mpts - 1)); + key_to_mpt_index(dev, mr->key)); if (err) { - mlx4_warn(dev, "SW2HW_MPT failed (%d)\n", err); + mlx4_warn(dev, "SW2HW_MPT failed (%d) got mpt idx %lx\n", err, + (unsigned long)key_to_mpt_index(dev, mr->key)); goto err_cmd; } - - mr->enabled = 1; + mr->enabled = MLX4_MR_EN_HW; mlx4_free_cmd_mailbox(dev, mailbox); @@ -372,7 +670,7 @@ err_cmd: mlx4_free_cmd_mailbox(dev, mailbox); err_table: - mlx4_table_put(dev, &mr_table->dmpt_table, key_to_hw_index(mr->key)); + mlx4_mr_free_icm(dev, key_to_mpt_index(dev, mr->key), mr->flags); return err; } EXPORT_SYMBOL_GPL(mlx4_mr_enable); @@ -394,13 +692,14 @@ static int mlx4_write_mtt_chunk(struct mlx4_dev *dev, struct mlx4_mtt *mtt, if (start_index & (dev->caps.mtts_per_seg - 1)) return -EINVAL; - mtts = mlx4_table_find(&priv->mr_table.mtt_table, mtt->first_seg + - s / dev->caps.mtt_entry_sz, &dma_handle); + mtts = mlx4_table_find(dev, &priv->mr_table.mtt_table, mtt->first_seg + + s / dev->caps.mtt_entry_sz, &dma_handle, + MLX4_MR_FLAG_NONE); if (!mtts) return -ENOMEM; dma_sync_single_for_cpu(&dev->pdev->dev, dma_handle, - npages * sizeof (u64), DMA_TO_DEVICE); + npages * sizeof (u64), DMA_TO_DEVICE); for (i = 0; i < npages; ++i) mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT); @@ -411,27 +710,68 @@ static int mlx4_write_mtt_chunk(struct mlx4_dev *dev, struct mlx4_mtt *mtt, return 0; } -int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, - int start_index, int npages, u64 *page_list) +int __mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + int start_index, int npages, u64 *page_list) { + int err = 0; int chunk; - int err; - - if (mtt->order < 0) - return -EINVAL; while (npages > 0) { chunk = min_t(int, PAGE_SIZE / sizeof(u64), npages); err = mlx4_write_mtt_chunk(dev, mtt, start_index, chunk, page_list); if (err) return err; - npages -= chunk; start_index += chunk; page_list += chunk; } + return err; +} - return 0; +int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + int start_index, int npages, u64 *page_list) +{ + struct mlx4_cmd_mailbox *mailbox = NULL; + __be64 *inbox = NULL; + int chunk; + int err = 0; + int i; + + if (mtt->order < 0) + return -EINVAL; + + if (mlx4_is_mfunc(dev)) { + 
mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + inbox = mailbox->buf; + + while (npages > 0) { + int s = mtt->first_seg * dev->caps.mtts_per_seg + start_index; + chunk = min_t(int, MLX4_MAILBOX_SIZE / sizeof(u64) - dev->caps.mtts_per_seg, npages); + if (s / (PAGE_SIZE / sizeof (u64)) != + (s + chunk - 1) / (PAGE_SIZE / sizeof (u64))) + chunk = PAGE_SIZE / sizeof (u64) - (s % (PAGE_SIZE / sizeof (u64))); + + inbox[0] = cpu_to_be64(mtt->first_seg * dev->caps.mtts_per_seg + start_index); + inbox[1] = 0; + for (i = 0; i < chunk; ++i) + inbox[i + 2] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT); + err = mlx4_WRITE_MTT(dev, mailbox, chunk); + if (err) { + mlx4_free_cmd_mailbox(dev, mailbox); + return err; + } + + npages -= chunk; + start_index += chunk; + page_list += chunk; + } + mlx4_free_cmd_mailbox(dev, mailbox); + return err; + } + + return __mlx4_write_mtt(dev, mtt, start_index, npages, page_list); } EXPORT_SYMBOL_GPL(mlx4_write_mtt); @@ -459,44 +799,182 @@ int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, } EXPORT_SYMBOL_GPL(mlx4_buf_write_mtt); -int mlx4_init_mr_table(struct mlx4_dev *dev) +static int mlx4_init_icm_fmr(struct mlx4_dev *dev) { - struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; + struct mlx4_priv *priv = mlx4_priv(dev); int err; - err = mlx4_bitmap_init(&mr_table->mpt_bitmap, dev->caps.num_mpts, - ~0, dev->caps.reserved_mrws, 0); - if (err) + if (!is_power_of_2(dev->caps.fmr_num_mpts)) { + mlx4_err(dev, "Num mpts is not power of 2, aborting.\n"); + return -EINVAL; + } + + if ((dev->caps.dmpt_entry_sz * dev->caps.fmr_num_mpts) & + (PAGE_SIZE - 1)) { + mlx4_err(dev, "MPT size is not page aligned, aborting.\n"); + return -EINVAL; + } + + err = mlx4_init_icm_table(dev, &priv->mr_table.fmr.dmpt_table, + dev->caps.fmr_dmpt_base, + dev->caps.dmpt_entry_sz, + dev->caps.fmr_num_mpts, + 0, 1, 1); + if (err) { + mlx4_err(dev, "Failed to map FMR dMPT context memory," + " aborting.\n"); return err; + } - err = mlx4_buddy_init(&mr_table->mtt_buddy, - ilog2(dev->caps.num_mtt_segs)); - if (err) - goto err_buddy; - - if (dev->caps.reserved_mtts) { - if (mlx4_alloc_mtt_range(dev, fls(dev->caps.reserved_mtts - 1)) == -1) { - mlx4_warn(dev, "MTT table of order %d is too small.\n", - mr_table->mtt_buddy.max_order); - err = -ENOMEM; - goto err_reserve_mtts; + dev->caps.fmr_mtt_base_idx = mlx4_alloc_mtt_range(dev, + fls(dev->caps.fmr_num_mtt_segs - 1), + MLX4_MR_FLAG_FMR); + if (dev->caps.fmr_mtt_base_idx == 0xFFFFFFFF) { + mlx4_err(dev, "Failed alloc mtt range for fmr\n"); + err = -ENOMEM; + goto out_free_mpt_icm; + } + + dev->caps.fmr_mtt_base = dev->caps.mtt_base + dev->caps.fmr_mtt_base_idx + * dev->caps.mtt_entry_sz; + + err = mlx4_init_icm_table(dev, &priv->mr_table.fmr.mtt_table, + dev->caps.fmr_mtt_base, + dev->caps.mtt_entry_sz, + dev->caps.fmr_num_mtt_segs, + 0, 1, 0); + if (err) { + mlx4_err(dev, "Failed to map FMR MTT context memory," + " aborting.\n"); + goto out_release_mtt; + } + + return 0; + +out_release_mtt: + mlx4_free_mtt_range(dev, dev->caps.fmr_mtt_base_idx, + fls(dev->caps.fmr_num_mtt_segs - 1), + MLX4_MR_FLAG_FMR); + +out_free_mpt_icm: + mlx4_cleanup_icm_table(dev, &mlx4_priv(dev)->mr_table.fmr.dmpt_table, + MLX4_MR_FLAG_FMR); + return err; +} + + +int mlx4_init_mr_table(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_mr_table *mr_table = &priv->mr_table; + int num_mpts; + int err; + + /* compute mpts without the reserved for fmr */ + num_mpts = 
dev->caps.num_mpts >> 1; + if (!is_power_of_2(num_mpts)) { + mlx4_err(dev, "MPT num is not power of 2, aborting.\n"); + return -EINVAL; + } + + + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) { + err = mlx4_bitmap_init(&mr_table->mpt_bitmap, num_mpts, + num_mpts-1, dev->caps.reserved_mrws, 0); + if (err) + return err; + + err = mlx4_buddy_init(&mr_table->mtt_buddy, + ilog2(dev->caps.num_mtt_segs)); + if (err) + goto err_mpt; + + if (dev->caps.reserved_mtts) { + priv->reserved_mtts = mlx4_alloc_mtt_range(dev, + fls(dev->caps.reserved_mtts - 1), + MLX4_MR_FLAG_NONE); + + if (priv->reserved_mtts == 0xFFFFFFFF) { + mlx4_warn(dev, "MTT table of order %d" + " is too small.\n", + mr_table->mtt_buddy.max_order); + err = -ENOMEM; + goto err_buddy; + } + } + } + + if (mlx4_is_mfunc(dev)) { + err = mlx4_bitmap_init(&mr_table->fmr.mpt_bitmap, + dev->caps.fmr_num_mpts, + dev->caps.fmr_num_mpts - 1, 0, 0); + if (err) + goto err_alloc; + + err = mlx4_buddy_init(&mr_table->fmr.mtt_buddy, + ilog2(dev->caps.fmr_num_mtt_segs)); + if (err) + goto err_fmr_mpt; + + err = mlx4_init_icm_fmr(dev); + if (err) { + mlx4_err(dev, "Failed to initialize fmr icm\n"); + goto err_fmr_mpt; } } return 0; -err_reserve_mtts: - mlx4_buddy_cleanup(&mr_table->mtt_buddy); +err_fmr_mpt: + if (mlx4_is_mfunc(dev)) + mlx4_bitmap_cleanup(&mr_table->fmr.mpt_bitmap); + +err_alloc: + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) + if (dev->caps.reserved_mtts) + mlx4_free_mtt_range(dev, priv->reserved_mtts, + fls(dev->caps.reserved_mtts - 1), + MLX4_MR_FLAG_NONE); err_buddy: - mlx4_bitmap_cleanup(&mr_table->mpt_bitmap); + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) + mlx4_buddy_cleanup(&mr_table->mtt_buddy); + +err_mpt: + if (!mlx4_is_mfunc(dev) || mlx4_is_master(dev)) + mlx4_bitmap_cleanup(&mr_table->mpt_bitmap); return err; } void mlx4_cleanup_mr_table(struct mlx4_dev *dev) { - struct mlx4_mr_table *mr_table = &mlx4_priv(dev)->mr_table; + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_mr_table *mr_table = &priv->mr_table; + + if (mlx4_is_mfunc(dev)) { + if (dev->caps.fmr_mtt_base_idx != 0xFFFFFFFF) { + mlx4_free_mtt_range(dev, dev->caps.fmr_mtt_base_idx, + fls(dev->caps.fmr_num_mtt_segs - 1), + MLX4_MR_FLAG_FMR); + mlx4_cleanup_icm_table(dev, + &mlx4_priv(dev)->mr_table.fmr.dmpt_table, + MLX4_MR_FLAG_FMR); + mlx4_cleanup_icm_table(dev, + &mlx4_priv(dev)->mr_table.fmr.mtt_table, + MLX4_MR_FLAG_FMR); + } + + mlx4_bitmap_cleanup(&mr_table->fmr.mpt_bitmap); + mlx4_buddy_cleanup(&mr_table->fmr.mtt_buddy); + } + + if (mlx4_is_mfunc(dev) && !mlx4_is_master(dev)) + return; + if (priv->reserved_mtts >= 0) + mlx4_free_mtt_range(dev, priv->reserved_mtts, + fls(dev->caps.reserved_mtts - 1), + MLX4_MR_FLAG_NONE); mlx4_buddy_cleanup(&mr_table->mtt_buddy); mlx4_bitmap_cleanup(&mr_table->mpt_bitmap); @@ -529,8 +1007,17 @@ static inline int mlx4_check_fmr(struct mlx4_fmr *fmr, u64 *page_list, return 0; } -int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list, - int npages, u64 iova, u32 *lkey, u32 *rkey) +int mlx4_set_fmr_pd(struct mlx4_fmr *fmr, u32 pd) +{ + fmr->mr.pd = pd; + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_set_fmr_pd); + + +int mlx4_map_phys_fmr_fbo(struct mlx4_dev *dev, struct mlx4_fmr *fmr, + u64 *page_list, int npages, u64 iova, u32 fbo, + u32 len, u32 *lkey, u32 *rkey, int same_key) { u32 key; int i, err; @@ -542,9 +1029,9 @@ int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list ++fmr->maps; key = key_to_hw_index(fmr->mr.key); - key += dev->caps.num_mpts; + if (!same_key) + 
key += dev->caps.num_mpts; *lkey = *rkey = fmr->mr.key = hw_index_to_key(key); - *(u8 *) fmr->mpt = MLX4_MPT_STATUS_SW; /* Make sure MPT status is visible before writing MTT entries */ @@ -561,8 +1048,16 @@ int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list fmr->mpt->key = cpu_to_be32(key); fmr->mpt->lkey = cpu_to_be32(key); - fmr->mpt->length = cpu_to_be64(npages * (1ull << fmr->page_shift)); + fmr->mpt->length = cpu_to_be64(len); fmr->mpt->start = cpu_to_be64(iova); + fmr->mpt->first_byte_offset = cpu_to_be32(fbo & 0x001fffff); + fmr->mpt->flags2 = (fbo ? MLX4_MPT_FLAG2_FBO_EN : 0); + + fmr->mpt->pd_flags = cpu_to_be32(fmr->mr.pd | MLX4_MPT_PD_FLAG_EN_INV); + if (fmr->mr.mtt.order >= 0 && fmr->mr.mtt.page_shift == 0) { + fmr->mpt->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_FAST_REG | + MLX4_MPT_PD_FLAG_RAE); + } /* Make MTT entries are visible before setting MPT status */ wmb(); @@ -574,12 +1069,23 @@ int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list return 0; } +EXPORT_SYMBOL_GPL(mlx4_map_phys_fmr_fbo); + +int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list, + int npages, u64 iova, u32 *lkey, u32 *rkey) +{ + u32 len = npages * (1ull << fmr->page_shift); + + return mlx4_map_phys_fmr_fbo(dev, fmr, page_list, npages, iova, 0, + len, lkey, rkey, 0); +} EXPORT_SYMBOL_GPL(mlx4_map_phys_fmr); int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages, int max_maps, u8 page_shift, struct mlx4_fmr *fmr) { struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_icm_table *icm_table; u64 mtt_seg; int err = -ENOMEM; @@ -602,9 +1108,13 @@ int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages, mtt_seg = fmr->mr.mtt.first_seg * dev->caps.mtt_entry_sz; - fmr->mtts = mlx4_table_find(&priv->mr_table.mtt_table, + icm_table = mlx4_fmr_flow(dev, fmr->mr.flags) ? + &priv->mr_table.fmr.mtt_table : + &priv->mr_table.mtt_table; + + fmr->mtts = mlx4_table_find(dev, icm_table, fmr->mr.mtt.first_seg, - &fmr->dma_handle); + &fmr->dma_handle, fmr->mr.flags); if (!fmr->mtts) { err = -ENOMEM; goto err_free; @@ -618,19 +1128,82 @@ err_free: } EXPORT_SYMBOL_GPL(mlx4_fmr_alloc); +int mlx4_fmr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, + u32 pd, u32 access, int max_pages, + int max_maps, u8 page_shift, struct mlx4_fmr *fmr) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_icm_table *icm_table; + u64 mtt_seg; + int err = -ENOMEM; + + if (page_shift < (ffs(dev->caps.page_size_cap) - 1) || page_shift >= 32) + return -EINVAL; + + /* All MTTs must fit in the same page */ + if (max_pages * sizeof *fmr->mtts > PAGE_SIZE) + return -EINVAL; + + fmr->page_shift = page_shift; + fmr->max_pages = max_pages; + fmr->max_maps = max_maps; + fmr->maps = 0; + + err = mlx4_mr_alloc_reserved(dev, mridx, pd, 0, 0, access, max_pages, + page_shift, &fmr->mr); + if (err) { + mlx4_err(dev, "Failed to fmr alloc reserved for mpt idx %lx\n", + (unsigned long)mridx); + return err; + } + + mtt_seg = fmr->mr.mtt.first_seg * dev->caps.mtt_entry_sz; + + icm_table = mlx4_fmr_flow(dev, fmr->mr.flags) ? 
+ &priv->mr_table.fmr.mtt_table : + &priv->mr_table.mtt_table; + + fmr->mtts = mlx4_table_find(dev, icm_table, + fmr->mr.mtt.first_seg, + &fmr->dma_handle, fmr->mr.flags); + if (!fmr->mtts) { + err = -ENOMEM; + mlx4_err(dev, "Failed getting mtts in fmr alloc reserved for mpt idx %lx\n", + (unsigned long)mridx); + goto err_free; + } + + return 0; + +err_free: + mlx4_mr_free_reserved(dev, &fmr->mr); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_fmr_alloc_reserved); + int mlx4_fmr_enable(struct mlx4_dev *dev, struct mlx4_fmr *fmr) { struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_icm_table *icm_table; int err; err = mlx4_mr_enable(dev, &fmr->mr); if (err) return err; - fmr->mpt = mlx4_table_find(&priv->mr_table.dmpt_table, - key_to_hw_index(fmr->mr.key), NULL); - if (!fmr->mpt) + icm_table = mlx4_fmr_flow(dev, fmr->mr.flags) ? + &priv->mr_table.fmr.dmpt_table : + &priv->mr_table.dmpt_table; + + fmr->mpt = mlx4_table_find(dev, icm_table, + key_to_mpt_index(dev, fmr->mr.key) - + dev->caps.fmr_dmpt_base_idx, + NULL, fmr->mr.flags); + if (!fmr->mpt) { + mlx4_err(dev, "Failed to enable fmr for mpt idx %lx\n", + (unsigned long)key_to_mpt_index(dev, fmr->mr.key)); return -ENOMEM; + } return 0; } @@ -644,7 +1217,7 @@ void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr, fmr->maps = 0; - *(u8 *) fmr->mpt = MLX4_MPT_STATUS_SW; + fmr->mr.enabled = MLX4_MR_EN_SW; } EXPORT_SYMBOL_GPL(mlx4_fmr_unmap); @@ -653,15 +1226,27 @@ int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr) if (fmr->maps) return -EBUSY; - fmr->mr.enabled = 0; mlx4_mr_free(dev, &fmr->mr); + fmr->mr.enabled = MLX4_MR_DISABLED; return 0; } EXPORT_SYMBOL_GPL(mlx4_fmr_free); +int mlx4_fmr_free_reserved(struct mlx4_dev *dev, struct mlx4_fmr *fmr) +{ + if (fmr->maps) + return -EBUSY; + + mlx4_mr_free_reserved(dev, &fmr->mr); + fmr->mr.enabled = MLX4_MR_DISABLED; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_fmr_free_reserved); + int mlx4_SYNC_TPT(struct mlx4_dev *dev) { - return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_SYNC_TPT, 1000); + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_SYNC_TPT, 1000, 0); } EXPORT_SYMBOL_GPL(mlx4_SYNC_TPT); diff --git a/drivers/net/mlx4/pd.c b/drivers/net/mlx4/pd.c index 1286b886dcea5..affdb12fa0115 100644 --- a/drivers/net/mlx4/pd.c +++ b/drivers/net/mlx4/pd.c @@ -31,7 +31,9 @@ * SOFTWARE. 
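Summary of the mr.c hunks above: mlx4_alloc_mtt_range(), mlx4_mr_reserve(), mlx4_mr_alloc_icm() and friends now check mlx4_is_mfunc() and, on multi-function devices, forward RES_MTT/RES_MPT requests to the resource owner through mlx4_cmd_imm(..., MLX4_CMD_ALLOC_RES, ...), while the __mlx4_* variants keep the local bitmap/buddy path (failure is now reported as 0xFFFFFFFF because the segment is unsigned); FMR objects draw from a dedicated fmr.mpt_bitmap, fmr.mtt_buddy and fmr ICM tables. The memory-key arithmetic is unchanged: hw_index_to_key() rotates the MPT index left by one byte, and the fast-remap path in mlx4_map_phys_fmr_fbo() advances the hardware index by num_mpts so the derived key changes while the masked MPT slot stays the same. A minimal, compilable sketch of that rotation (key_to_hw_index() reproduced as the inverse rotation; num_mpts is an arbitrary example value, required only to be a power of two):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Taken from the patch: rotate an MPT index left by one byte to form a key. */
static uint32_t hw_index_to_key(uint32_t ind)
{
	return (ind >> 24) | (ind << 8);
}

/* Inverse rotation; this is the helper the old mr.c carried locally. */
static uint32_t key_to_hw_index(uint32_t key)
{
	return (key << 24) | (key >> 8);
}

int main(void)
{
	const uint32_t num_mpts = 1u << 24;	/* example size only */
	uint32_t index = 0x12345;
	uint32_t key = hw_index_to_key(index);

	/* The two rotations are exact inverses. */
	assert(key_to_hw_index(key) == index);

	/* FMR remap: advance the hardware index by num_mpts, rotate back. */
	uint32_t new_key = hw_index_to_key(key_to_hw_index(key) + num_mpts);

	/* Same MPT slot after masking, but a different key value, so a
	 * stale copy of the old key no longer matches the entry. */
	assert((key_to_hw_index(new_key) & (num_mpts - 1)) == index);
	assert(new_key != key);
	printf("key 0x%08x -> 0x%08x, MPT slot 0x%x\n",
	       (unsigned)key, (unsigned)new_key, (unsigned)index);
	return 0;
}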
*/ +#include #include +#include #include #include @@ -39,10 +41,6 @@ #include "mlx4.h" #include "icm.h" -enum { - MLX4_NUM_RESERVED_UARS = 8 -}; - int mlx4_pd_alloc(struct mlx4_dev *dev, u32 *pdn) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -50,7 +48,8 @@ int mlx4_pd_alloc(struct mlx4_dev *dev, u32 *pdn) *pdn = mlx4_bitmap_alloc(&priv->pd_bitmap); if (*pdn == -1) return -ENOMEM; - + if (mlx4_is_mfunc(dev)) + *pdn |= (dev->caps.function + 1) << NOT_MASKED_PD_BITS; return 0; } EXPORT_SYMBOL_GPL(mlx4_pd_alloc); @@ -66,7 +65,7 @@ int mlx4_init_pd_table(struct mlx4_dev *dev) struct mlx4_priv *priv = mlx4_priv(dev); return mlx4_bitmap_init(&priv->pd_bitmap, dev->caps.num_pds, - (1 << 24) - 1, dev->caps.reserved_pds, 0); + (1 << NOT_MASKED_PD_BITS) - 1, dev->caps.reserved_pds, 0); } void mlx4_cleanup_pd_table(struct mlx4_dev *dev) @@ -77,13 +76,19 @@ void mlx4_cleanup_pd_table(struct mlx4_dev *dev) int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar) { + int offset; + uar->index = mlx4_bitmap_alloc(&mlx4_priv(dev)->uar_table.bitmap); if (uar->index == -1) return -ENOMEM; - uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + uar->index; + if (mlx4_is_mfunc(dev) && !mlx4_is_master(dev)) + offset = uar->index % ((int) pci_resource_len(dev->pdev, 2) / + dev->caps.uar_page_size); + else + offset = uar->index; + uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + offset; uar->map = NULL; - return 0; } EXPORT_SYMBOL_GPL(mlx4_uar_alloc); @@ -108,10 +113,6 @@ int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf) if (!list_empty(&priv->bf_list)) uar = list_entry(priv->bf_list.next, struct mlx4_uar, bf_list); else { - if (mlx4_bitmap_avail(&priv->uar_table.bitmap) < MLX4_NUM_RESERVED_UARS) { - err = -ENOMEM; - goto out; - } uar = kmalloc(sizeof *uar, GFP_KERNEL); if (!uar) { err = -ENOMEM; @@ -199,9 +200,9 @@ int mlx4_init_uar_table(struct mlx4_dev *dev) return -ENODEV; } - return mlx4_bitmap_init(&mlx4_priv(dev)->uar_table.bitmap, - dev->caps.num_uars, dev->caps.num_uars - 1, - max(128, dev->caps.reserved_uars), 0); + return mlx4_bitmap_init_no_mask(&mlx4_priv(dev)->uar_table.bitmap, + dev->caps.num_uars, + dev->caps.reserved_uars, 0); } void mlx4_cleanup_uar_table(struct mlx4_dev *dev) diff --git a/drivers/net/mlx4/pkey.c b/drivers/net/mlx4/pkey.c new file mode 100644 index 0000000000000..a6cdf15e5bee6 --- /dev/null +++ b/drivers/net/mlx4/pkey.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2010 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include "mlx4.h" + +static void invalidate_p2v_table(int len, u8 inval, u8 *table) +{ + int i; + + for (i = 0; i < len; ++i) + table[i] = inval; +} + + +void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, int port, int i, int val) +{ + struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev); + + if (!dev->caps.sqp_demux) + return; + + priv->virt2phys_pkey[slave][port - 1][i] = val; +} +EXPORT_SYMBOL(mlx4_sync_pkey_table); + +int mlx4_PKEY_TABLE_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + u8 *p2v = outbox->buf; + u8 port = vhcr->in_modifier; + u8 virt, phys; + struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev); + + mlx4_dbg(dev, "got update request for slave %d, port %d\n", slave, port); + invalidate_p2v_table(dev->caps.pkey_table_len[port], + dev->caps.pkey_table_max_len[port] - 1, + p2v); + + for (virt = 0; virt < dev->caps.pkey_table_len[port]; ++virt) { + phys = priv->virt2phys_pkey[slave][port - 1][virt]; + p2v[phys] = virt; + mlx4_dbg(dev, "phys %d = virt %d\n", phys, virt); + } + + return 0; +} + diff --git a/drivers/net/mlx4/port.c b/drivers/net/mlx4/port.c index 836338a2a0a72..f3ad5b05fddf2 100644 --- a/drivers/net/mlx4/port.c +++ b/drivers/net/mlx4/port.c @@ -32,13 +32,21 @@ #include #include +#include #include +#include + #include "mlx4.h" +#include "en_port.h" + +int mlx4_ib_set_4k_mtu = 0; +module_param_named(set_4k_mtu, mlx4_ib_set_4k_mtu, int, 0444); +MODULE_PARM_DESC(set_4k_mtu, "attempt to set 4K MTU to all ConnectX ports"); #define MLX4_MAC_VALID (1ull << 63) -#define MLX4_MAC_MASK 0xffffffffffffULL +#define MLX4_MAC_MASK 0x7fffffffffffffffULL #define MLX4_VLAN_VALID (1u << 31) #define MLX4_VLAN_MASK 0xfff @@ -48,10 +56,8 @@ void mlx4_init_mac_table(struct mlx4_dev *dev, struct mlx4_mac_table *table) int i; mutex_init(&table->mutex); - for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) table->entries[i] = 0; - table->refs[i] = 0; - } table->max = 1 << dev->caps.log_num_macs; table->total = 0; } @@ -65,7 +71,7 @@ void mlx4_init_vlan_table(struct mlx4_dev *dev, struct mlx4_vlan_table *table) table->entries[i] = 0; table->refs[i] = 0; } - table->max = (1 << dev->caps.log_num_vlans) - MLX4_VLAN_REGULAR; + table->max = 1 << dev->caps.log_num_vlans; table->total = 0; } @@ -84,7 +90,7 @@ static int mlx4_set_port_mac_table(struct mlx4_dev *dev, u8 port, in_mod = MLX4_SET_PORT_MAC_TABLE << 8 | port; err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, - MLX4_CMD_TIME_CLASS_B); + MLX4_CMD_TIME_CLASS_B, 1); mlx4_free_cmd_mailbox(dev, mailbox); return err; @@ -94,6 +100,7 @@ static int mlx4_uc_steer_add(struct mlx4_dev *dev, u8 port, u64 mac, int *qpn, u8 reserve) { struct mlx4_qp qp; + u8 pf_num; u8 gid[16] = {0}; int err; @@ -106,11 +113,11 @@ static int mlx4_uc_steer_add(struct mlx4_dev *dev, u8 port, } qp.qpn = *qpn; + pf_num = ((u8) (mac >> 48)) | (port - 1); mac &= 0xffffffffffffULL; mac = cpu_to_be64(mac << 16); memcpy(&gid[10], &mac, ETH_ALEN); - gid[5] = port; - gid[7] = MLX4_UC_STEER << 1; + gid[7] = pf_num << 4 | MLX4_UC_STEER << 1; err = mlx4_qp_attach_common(dev, &qp, gid, 0, 
MLX4_PROT_ETH, MLX4_UC_STEER); @@ -124,69 +131,62 @@ static void mlx4_uc_steer_release(struct mlx4_dev *dev, u8 port, u64 mac, int qpn, u8 free) { struct mlx4_qp qp; + u8 pf_num; u8 gid[16] = {0}; qp.qpn = qpn; + pf_num = ((u8) (mac >> 48)) | (port - 1); mac &= 0xffffffffffffULL; mac = cpu_to_be64(mac << 16); memcpy(&gid[10], &mac, ETH_ALEN); - gid[5] = port; - gid[7] = MLX4_UC_STEER << 1; + gid[7] = pf_num << 4 | MLX4_UC_STEER << 1; mlx4_qp_detach_common(dev, &qp, gid, MLX4_PROT_ETH, MLX4_UC_STEER); if (free) mlx4_qp_release_range(dev, qpn, 1); } -int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *qpn, u8 wrap) +int __mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *qpn, u8 wrap) { struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; struct mlx4_mac_table *table = &info->mac_table; struct mlx4_mac_entry *entry; int i, err = 0; int free = -1; + if (!wrap) + mac |= (u64) (dev->caps.function) << 48; - if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER) { + if (dev->caps.vep_uc_steering) { err = mlx4_uc_steer_add(dev, port, mac, qpn, 1); - if (err) - return err; - - entry = kmalloc(sizeof *entry, GFP_KERNEL); - if (!entry) { - mlx4_uc_steer_release(dev, port, mac, *qpn, 1); - return -ENOMEM; - } - - entry->mac = mac; - err = radix_tree_insert(&info->mac_tree, *qpn, entry); - if (err) { - kfree(entry); - mlx4_uc_steer_release(dev, port, mac, *qpn, 1); - return err; + if (!err) { + entry = kmalloc(sizeof *entry, GFP_KERNEL); + if (!entry) { + mlx4_uc_steer_release(dev, port, mac, *qpn, 1); + return -ENOMEM; + } + entry->mac = mac; + err = radix_tree_insert(&info->mac_tree, *qpn, entry); + if (err) + mlx4_uc_steer_release(dev, port, mac, *qpn, 1); } + return err; } mlx4_dbg(dev, "Registering MAC: 0x%llx\n", (unsigned long long) mac); - mutex_lock(&table->mutex); - for (i = 0; i < MLX4_MAX_MAC_NUM - 1; i++) { - if (free < 0 && !table->refs[i]) { + for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { + if (free < 0 && !table->entries[i]) { free = i; continue; } if (mac == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) { - /* MAC already registered, increase references count */ - ++table->refs[i]; + /* MAC + PF already registered, Must not have duplicates */ + err = -EEXIST; goto out; } } - if (free < 0) { - err = -ENOMEM; - goto out; - } - mlx4_dbg(dev, "Free MAC index is %d\n", free); if (table->total == table->max) { @@ -196,24 +196,36 @@ int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *qpn, u8 wrap) } /* Register new MAC */ - table->refs[free] = 1; table->entries[free] = cpu_to_be64(mac | MLX4_MAC_VALID); err = mlx4_set_port_mac_table(dev, port, table->entries); if (unlikely(err)) { mlx4_err(dev, "Failed adding MAC: 0x%llx\n", (unsigned long long) mac); - table->refs[free] = 0; table->entries[free] = 0; goto out; } - if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER)) - *qpn = info->base_qpn + free; + *qpn = info->base_qpn + free; ++table->total; out: mutex_unlock(&table->mutex); return err; } + +int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *qpn, u8 wrap) +{ + u64 out_param; + int err; + + if (mlx4_is_mfunc(dev)) { + err = mlx4_cmd_imm(dev, mac, &out_param, RES_MAC, port, + MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, 0); + if (!err) + *qpn = out_param; + return err; + } + return __mlx4_register_mac(dev, port, mac, qpn, wrap); +} EXPORT_SYMBOL_GPL(mlx4_register_mac); static int validate_index(struct mlx4_dev *dev, @@ -228,33 +240,21 @@ static int validate_index(struct mlx4_dev *dev, return err; } -static int 
find_index(struct mlx4_dev *dev, - struct mlx4_mac_table *table, u64 mac) -{ - int i; - for (i = 0; i < MLX4_MAX_MAC_NUM; i++) { - if (mac == (MLX4_MAC_MASK & be64_to_cpu(table->entries[i]))) - return i; - } - /* Mac not found */ - return -EINVAL; -} - -void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int qpn) +void __mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int qpn) { struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; struct mlx4_mac_table *table = &info->mac_table; int index = qpn - info->base_qpn; struct mlx4_mac_entry *entry; - if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER) { + if (dev->caps.vep_uc_steering) { entry = radix_tree_lookup(&info->mac_tree, qpn); if (entry) { mlx4_uc_steer_release(dev, port, entry->mac, qpn, 1); radix_tree_delete(&info->mac_tree, qpn); - index = find_index(dev, table, entry->mac); kfree(entry); } + return; } mutex_lock(&table->mutex); @@ -262,35 +262,40 @@ void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int qpn) if (validate_index(dev, table, index)) goto out; - /* Check whether this address has reference count */ - if (!(--table->refs[index])) { - table->entries[index] = 0; - mlx4_set_port_mac_table(dev, port, table->entries); - --table->total; - } + table->entries[index] = 0; + mlx4_set_port_mac_table(dev, port, table->entries); + --table->total; out: mutex_unlock(&table->mutex); } + +void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int qpn) +{ + + if (mlx4_is_mfunc(dev)) { + mlx4_cmd(dev, qpn, RES_MAC, port, + MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, 0); + return; + } + __mlx4_unregister_mac(dev, port, qpn); + return; +} EXPORT_SYMBOL_GPL(mlx4_unregister_mac); -int mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac, u8 wrap) +int __mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac) { struct mlx4_port_info *info = &mlx4_priv(dev)->port[port]; struct mlx4_mac_table *table = &info->mac_table; - int index = qpn - info->base_qpn; struct mlx4_mac_entry *entry; + int index = qpn - info->base_qpn; int err; - if (dev->caps.flags & MLX4_DEV_CAP_FLAG_VEP_UC_STEER) { + if (dev->caps.vep_uc_steering) { entry = radix_tree_lookup(&info->mac_tree, qpn); if (!entry) return -EINVAL; - index = find_index(dev, table, entry->mac); mlx4_uc_steer_release(dev, port, entry->mac, qpn, 0); - entry->mac = new_mac; - err = mlx4_uc_steer_add(dev, port, entry->mac, &qpn, 0); - if (err || index < 0) - return err; + return mlx4_uc_steer_add(dev, port, entry->mac, &qpn, 0); } mutex_lock(&table->mutex); @@ -310,7 +315,16 @@ out: mutex_unlock(&table->mutex); return err; } + +int mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac) +{ + if (mlx4_is_mfunc(dev)) + return mlx4_cmd_imm(dev, new_mac, (u64 *) &qpn, RES_MAC, port, + MLX4_CMD_REPLACE_RES, MLX4_CMD_TIME_CLASS_A, 0); + return __mlx4_replace_mac(dev, port, qpn, new_mac); +} EXPORT_SYMBOL_GPL(mlx4_replace_mac); + static int mlx4_set_port_vlan_table(struct mlx4_dev *dev, u8 port, __be32 *entries) { @@ -325,7 +339,7 @@ static int mlx4_set_port_vlan_table(struct mlx4_dev *dev, u8 port, memcpy(mailbox->buf, entries, MLX4_VLAN_TABLE_SIZE); in_mod = MLX4_SET_PORT_VLAN_TABLE << 8 | port; err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, - MLX4_CMD_TIME_CLASS_B); + MLX4_CMD_TIME_CLASS_B, 0); mlx4_free_cmd_mailbox(dev, mailbox); @@ -341,7 +355,7 @@ int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx) if (table->refs[i] && (vid == (MLX4_VLAN_MASK & be32_to_cpu(table->entries[i])))) { - /* VLAN already 
registered, increase reference count */ + /* Vlan already registered, increase refernce count */ *idx = i; return 0; } @@ -357,14 +371,10 @@ int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index) int i, err = 0; int free = -1; - mutex_lock(&table->mutex); - - if (table->total == table->max) { - /* No free vlan entries */ - err = -ENOSPC; - goto out; - } + if (mlx4_is_mfunc(dev)) + return 0; + mutex_lock(&table->mutex); for (i = MLX4_VLAN_REGULAR; i < MLX4_MAX_VLAN_NUM; i++) { if (free < 0 && (table->refs[i] == 0)) { free = i; @@ -374,15 +384,16 @@ int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index) if (table->refs[i] && (vlan == (MLX4_VLAN_MASK & be32_to_cpu(table->entries[i])))) { - /* Vlan already registered, increase references count */ + /* Vlan already registered, increase refernce count */ *index = i; ++table->refs[i]; goto out; } } - if (free < 0) { - err = -ENOMEM; + if (table->total == table->max) { + /* No free vlan entries */ + err = -ENOSPC; goto out; } @@ -410,6 +421,9 @@ void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, int index) { struct mlx4_vlan_table *table = &mlx4_priv(dev)->port[port].vlan_table; + if (mlx4_is_mfunc(dev)) + return; + if (index < MLX4_VLAN_REGULAR) { mlx4_warn(dev, "Trying to free special vlan index %d\n", index); return; @@ -461,7 +475,7 @@ int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps) *(__be32 *) (&inbuf[20]) = cpu_to_be32(port); err = mlx4_cmd_box(dev, inmailbox->dma, outmailbox->dma, port, 3, - MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C); + MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C, 1); if (!err) *caps = *(__be32 *) (outbuf + 84); mlx4_free_cmd_mailbox(dev, inmailbox); @@ -469,12 +483,148 @@ int mlx4_get_port_ib_caps(struct mlx4_dev *dev, u8 port, __be32 *caps) return err; } -int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port) +static int mlx4_common_set_port(struct mlx4_dev *dev, int slave, u32 in_mod, + u8 op_mod, struct mlx4_cmd_mailbox *inbox) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_port_info *port_info; + struct mlx4_mfunc_master_ctx *master = &priv->mfunc.master; + struct mlx4_slave_state *slave_st = &master->slave_state[slave]; + struct mlx4_set_port_rqp_calc_context *qpn_context; + struct mlx4_set_port_general_context *gen_context; + int reset_qkey_viols; + int port; + int is_eth; + u32 in_modifier; + u32 promisc; + u16 mtu, prev_mtu; + int err; + int i; + __be32 agg_cap_mask; + __be32 slave_cap_mask; + __be32 new_cap_mask; + + port = in_mod & 0xff; + in_modifier = in_mod >> 8; + is_eth = op_mod; + port_info = &priv->port[port]; + + /* All slaves can perform SET_PORT operations, just need to verify + * we keep the mutual resources unchanged */ + if (is_eth) { + switch (in_modifier) { + case MLX4_SET_PORT_RQP_CALC: + qpn_context = inbox->buf; + qpn_context->base_qpn = cpu_to_be32(port_info->base_qpn); + qpn_context->n_mac = 0x7; + promisc = be32_to_cpu(qpn_context->promisc) >> + SET_PORT_PROMISC_SHIFT; + qpn_context->promisc = cpu_to_be32( + promisc << SET_PORT_PROMISC_SHIFT | + port_info->base_qpn); + promisc = be32_to_cpu(qpn_context->mcast) >> + SET_PORT_MC_PROMISC_SHIFT; + qpn_context->mcast = cpu_to_be32( + promisc << SET_PORT_MC_PROMISC_SHIFT | + port_info->base_qpn); + break; + case MLX4_SET_PORT_GENERAL: + gen_context = inbox->buf; + /* Mtu is configured as the max MTU among all the + * the functions on the port. 
*/ + mtu = be16_to_cpu(gen_context->mtu); + mtu = min_t(int, mtu, dev->caps.eth_mtu_cap[port]); + prev_mtu = slave_st->mtu[port]; + slave_st->mtu[port] = mtu; + if (mtu > master->max_mtu[port]) + master->max_mtu[port] = mtu; + if (mtu < prev_mtu && prev_mtu == master->max_mtu[port]) { + slave_st->mtu[port] = mtu; + master->max_mtu[port] = mtu; + for (i = 0; i < dev->num_slaves; i++) { + master->max_mtu[port] = + max(master->max_mtu[port], + master->slave_state[i].mtu[port]); + } + } + + gen_context->mtu = cpu_to_be16(master->max_mtu[port]); + break; + } + return mlx4_cmd(dev, inbox->dma, in_mod, op_mod, + MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, 1); + } + + /* For IB, we only consider: + * - The capability mask, which is set to the aggregate of all slave frunction + * capabilities + * - The QKey violatin counter - reset according to each request. + */ + + if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { + reset_qkey_viols = (*(u8 *) inbox->buf) & 0x40; + new_cap_mask = ((__be32 *) inbox->buf)[2]; + } else { + reset_qkey_viols = ((u8 *) inbox->buf)[3] & 0x1; + new_cap_mask = ((__be32 *) inbox->buf)[1]; + } + + /* only master has access to qp0 */ + if ((new_cap_mask & cpu_to_be32(IB_PORT_SM)) && slave != dev->caps.function) { + mlx4_warn(dev, "denying sm port capability for slave:%d\n", slave); + return -EINVAL; + } + agg_cap_mask = 0; + slave_cap_mask = priv->mfunc.master.slave_state[slave].ib_cap_mask[port]; + priv->mfunc.master.slave_state[slave].ib_cap_mask[port] = new_cap_mask; + for (i = 0; i < dev->num_slaves; i++) + agg_cap_mask |= priv->mfunc.master.slave_state[i].ib_cap_mask[port]; + +#if 0 + mlx4_warn(dev, "slave=%d, port=%d, old_slave_cap:0x%x, new_slave_cap:0x%x, " + "agg_cap:0x%x qkey_reset:%d\n", slave, port, + be32_to_cpu(slave_cap_mask), + be32_to_cpu(priv->mfunc.master.slave_state[slave].ib_cap_mask[port]), + be32_to_cpu(agg_cap_mask), reset_qkey_viols); +#endif + + /* only clear mailbox for guests. 
Master may be setting + * MTU or PKEY table size + */ + if (slave != dev->caps.function) + memset(inbox->buf, 0, 256); + if (dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { + *(u8 *) inbox->buf |= !!reset_qkey_viols << 6; + ((__be32 *) inbox->buf)[2] = agg_cap_mask; + } else { + ((u8 *) inbox->buf)[3] |= !!reset_qkey_viols; + ((__be32 *) inbox->buf)[1] = agg_cap_mask; + } + + err = mlx4_cmd(dev, inbox->dma, port, is_eth, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, 1); + if (err) + priv->mfunc.master.slave_state[slave].ib_cap_mask[port] = slave_cap_mask; + return err; +} + +int mlx4_SET_PORT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + return mlx4_common_set_port(dev, slave, vhcr->in_modifier, + vhcr->op_modifier, inbox); +} + + +int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port, int pkey_tbl_size) { struct mlx4_cmd_mailbox *mailbox; int err; - if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) + if (dev->caps.port_type[port] != MLX4_PORT_TYPE_IB) return 0; mailbox = mlx4_alloc_cmd_mailbox(dev); @@ -483,10 +633,421 @@ int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port) memset(mailbox->buf, 0, 256); + if (mlx4_ib_set_4k_mtu && (!mlx4_is_mfunc(dev) || mlx4_is_master(dev))) + ((__be32 *) mailbox->buf)[0] |= cpu_to_be32((1 << 22) | (1 << 21) | (5 << 12) | (2 << 4)); + ((__be32 *) mailbox->buf)[1] = dev->caps.ib_port_def_cap[port]; + + if (pkey_tbl_size >= 0 && mlx4_is_master(dev)) { + ((__be32 *) mailbox->buf)[0] |= cpu_to_be32(1 << 20); + ((__be16 *) mailbox->buf)[20] = cpu_to_be16(pkey_tbl_size); + } + err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_SET_PORT, - MLX4_CMD_TIME_CLASS_B); + MLX4_CMD_TIME_CLASS_B, 0); + + if (!err && mlx4_is_master(dev) && pkey_tbl_size >= 0) + dev->caps.pkey_table_len[port] = pkey_tbl_size; + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + + +int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, + u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_general_context *context; + int err; + u32 in_mod; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + context = mailbox->buf; + memset(context, 0, sizeof *context); + + context->flags = SET_PORT_GEN_ALL_VALID; + context->mtu = cpu_to_be16(mtu); + context->pptx = (pptx * (!pfctx)) << 7; + context->pfctx = pfctx; + context->pprx = (pprx * (!pfcrx)) << 7; + context->pfcrx = pfcrx; + in_mod = MLX4_SET_PORT_GENERAL << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, 0); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_general); + +int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, + u8 promisc) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_rqp_calc_context *context; + int err; + u32 in_mod; + u32 m_promisc = (dev->caps.vep_mc_steering) ? 
MCAST_DIRECT : MCAST_DEFAULT; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + context = mailbox->buf; + memset(context, 0, sizeof *context); + + context->base_qpn = cpu_to_be32(base_qpn); + context->n_mac = 0x7; + context->promisc = cpu_to_be32(promisc << SET_PORT_PROMISC_SHIFT | + base_qpn); + context->mcast = cpu_to_be32(m_promisc << SET_PORT_MC_PROMISC_SHIFT | + base_qpn); + context->intra_no_vlan = 0; + context->no_vlan = MLX4_NO_VLAN_IDX; + context->intra_vlan_miss = 0; + context->vlan_miss = MLX4_VLAN_MISS_IDX; + + in_mod = MLX4_SET_PORT_RQP_CALC << 8 | port; + err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT, + MLX4_CMD_TIME_CLASS_B, 0); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_PORT_qpn_calc); + +static int mlx4_common_set_mcast_fltr(struct mlx4_dev *dev, int function, + int port, u64 addr, u64 clear, u8 mode) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err = 0; + struct mlx4_mcast_entry *entry, *tmp; + struct mlx4_slave_state *s_state = &priv->mfunc.master.slave_state[function]; + int i; + + switch (mode) { + case MLX4_MCAST_DISABLE: + /* The multicast filter is disabled only once, + * If some other function already done it, operation + * is ignored */ + if (!(priv->mfunc.master.disable_mcast_ref[port]++)) + err = mlx4_cmd(dev, 0, port, MLX4_MCAST_DISABLE, + MLX4_CMD_SET_MCAST_FLTR, + MLX4_CMD_TIME_CLASS_B, 1); + break; + case MLX4_MCAST_ENABLE: + /* We enable the muticast filter only if all functions + * have the filter enabled */ + if (!(--priv->mfunc.master.disable_mcast_ref[port])) + err = mlx4_cmd(dev, 0, port, MLX4_MCAST_ENABLE, + MLX4_CMD_SET_MCAST_FLTR, + MLX4_CMD_TIME_CLASS_B, 1); + break; + case MLX4_MCAST_CONFIG: + if (clear) { + /* Disable the muticast filter while updating it */ + if (!priv->mfunc.master.disable_mcast_ref[port]) { + err = mlx4_cmd(dev, 0, port, MLX4_MCAST_DISABLE, + MLX4_CMD_SET_MCAST_FLTR, + MLX4_CMD_TIME_CLASS_B, 1); + if (err) { + mlx4_warn(dev, "Failed to disable multicast " + "filter\n"); + goto out; + } + } + /* Clear the multicast filter */ + err = mlx4_cmd(dev, clear << 63, port, + MLX4_MCAST_CONFIG, + MLX4_CMD_SET_MCAST_FLTR, + MLX4_CMD_TIME_CLASS_B, 1); + if (err) { + mlx4_warn(dev, "Failed clearing the multicast filter\n"); + goto out; + } + + /* Clear the multicast addresses for the given slave */ + list_for_each_entry_safe(entry, tmp, + &s_state->mcast_filters[port], + list) { + list_del(&entry->list); + kfree(entry); + } + + /* Assign all the multicast addresses that still exist */ + for (i = 0; i < dev->num_slaves; i++) { + list_for_each_entry(entry, + &priv->mfunc.master.slave_state[function].mcast_filters[port], + list) { + if (mlx4_cmd(dev, entry->addr, port, + MLX4_MCAST_CONFIG, + MLX4_CMD_SET_MCAST_FLTR, + MLX4_CMD_TIME_CLASS_B, 1)) + mlx4_warn(dev, "Failed to reconfigure " + "multicast address: 0x%llx\n", + entry->addr); + } + } + /* Enable the filter */ + if (!priv->mfunc.master.disable_mcast_ref[port]) { + err = mlx4_cmd(dev, 0, port, MLX4_MCAST_ENABLE, + MLX4_CMD_SET_MCAST_FLTR, + MLX4_CMD_TIME_CLASS_B, 1); + if (err) { + mlx4_warn(dev, "Failed to enable multicast " + "filter\n"); + goto out; + } + } + } + /* Add the new address if exists */ + if (addr) { + entry = kzalloc(sizeof (struct mlx4_mcast_entry), + GFP_KERNEL); + if (!entry) { + mlx4_warn(dev, "Failed to allocate entry for " + "muticast address\n"); + err = -ENOMEM; + goto out; + } + INIT_LIST_HEAD(&entry->list); + entry->addr = addr; + 
list_add_tail(&entry->list, &s_state->mcast_filters[port]); + err = mlx4_cmd(dev, addr, port, MLX4_MCAST_CONFIG, + MLX4_CMD_SET_MCAST_FLTR, + MLX4_CMD_TIME_CLASS_B, 1); + if (err) + mlx4_warn(dev, "Failed to add the new address:" + "0x%llx\n", addr); + } + break; + default: + mlx4_warn(dev, "SET_MCAST_FILTER called with illegal modifier\n"); + err = -EINVAL; + } +out: + return err; +} + +int mlx4_SET_MCAST_FLTR_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int port = vhcr->in_modifier; + u64 addr = vhcr->in_param & 0xffffffffffffULL; + u64 clear = vhcr->in_param >> 63; + u8 mode = vhcr->op_modifier; + + return mlx4_common_set_mcast_fltr(dev, slave, port, addr, clear, mode); +} + +int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, + u64 mac, u64 clear, u8 mode) +{ + return mlx4_cmd(dev, (mac | (clear << 63)), port, mode, + MLX4_CMD_SET_MCAST_FLTR, MLX4_CMD_TIME_CLASS_B, 0); +} +EXPORT_SYMBOL(mlx4_SET_MCAST_FLTR); + + +int mlx4_common_set_vlan_fltr(struct mlx4_dev *dev, int function, + int port, void *buf) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_vlan_fltr *filter; + struct mlx4_slave_state *s_state = &priv->mfunc.master.slave_state[function]; + int i, j, err; + + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + /* Update slave's Vlan filter */ + memcpy(s_state->vlan_filter[port]->entry, buf, + sizeof(struct mlx4_vlan_fltr)); + + /* We configure the Vlan filter to allow the vlans of + * all slaves */ + filter = mailbox->buf; + memset(filter, 0, sizeof(*filter)); + for (i = VLAN_FLTR_SIZE - 1; i >= 0; i--) { + for (j = 0; j < dev->num_slaves; j++) { + s_state = &priv->mfunc.master.slave_state[j]; + filter->entry[i] |= s_state->vlan_filter[port]->entry[i]; + } + } + + err = mlx4_cmd(dev, mailbox->dma, port, 0, MLX4_CMD_SET_VLAN_FLTR, + MLX4_CMD_TIME_CLASS_B, 1); + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +int mlx4_SET_VLAN_FLTR_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err, port; + + port = vhcr->in_modifier; + err = mlx4_common_set_vlan_fltr(dev, slave, vhcr->in_modifier, inbox->buf); + + return err; +} + + +int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, u8 port, struct vlan_group *grp) +{ + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_vlan_fltr *filter; + int i; + int j; + int index = 0; + u32 entry; + int err = 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + filter = mailbox->buf; + if (grp) { + memset(filter, 0, sizeof *filter); + for (i = VLAN_FLTR_SIZE - 1; i >= 0; i--) { + entry = 0; + for (j = 0; j < 32; j++) + if (vlan_group_get_device(grp, index++)) + entry |= 1 << j; + filter->entry[i] = cpu_to_be32(entry); + } + } else { + /* When no vlans are configured we block all vlans */ + memset(filter, 0, sizeof(*filter)); + } + err = mlx4_cmd(dev, mailbox->dma, port, 0, + MLX4_CMD_SET_VLAN_FLTR, MLX4_CMD_TIME_CLASS_B, 0); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL(mlx4_SET_VLAN_FLTR); + +int mlx4_common_dump_eth_stats(struct mlx4_dev *dev, int slave, + u32 in_mod, struct mlx4_cmd_mailbox *outbox) +{ + return mlx4_cmd_box(dev, 0, outbox->dma, in_mod, 0, + MLX4_CMD_DUMP_ETH_STATS, MLX4_CMD_TIME_CLASS_B, 1); +} + +int 
mlx4_DUMP_ETH_STATS_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + return mlx4_common_dump_eth_stats(dev, slave, + vhcr->in_modifier, outbox); +} + +static void fill_port_statistics(void *statistics, + struct mlx4_eth_common_counters *stats) +{ + struct mlx4_stat_out_mbox *mlx4_port_stats; + mlx4_port_stats = statistics; + + stats->rx_errors = be64_to_cpu(mlx4_port_stats->PCS) + + be32_to_cpu(mlx4_port_stats->RdropLength) + + be32_to_cpu(mlx4_port_stats->RJBBR) + + be32_to_cpu(mlx4_port_stats->RCRC) + + be32_to_cpu(mlx4_port_stats->RRUNT); + stats->tx_errors = be32_to_cpu(mlx4_port_stats->TDROP); + stats->multicast = be64_to_cpu(mlx4_port_stats->MCAST_prio_0) + + be64_to_cpu(mlx4_port_stats->MCAST_prio_1) + + be64_to_cpu(mlx4_port_stats->MCAST_prio_2) + + be64_to_cpu(mlx4_port_stats->MCAST_prio_3) + + be64_to_cpu(mlx4_port_stats->MCAST_prio_4) + + be64_to_cpu(mlx4_port_stats->MCAST_prio_5) + + be64_to_cpu(mlx4_port_stats->MCAST_prio_6) + + be64_to_cpu(mlx4_port_stats->MCAST_prio_7) + + be64_to_cpu(mlx4_port_stats->MCAST_novlan); + stats->rx_length_errors = be32_to_cpu(mlx4_port_stats->RdropLength); + stats->rx_over_errors = be32_to_cpu(mlx4_port_stats->RdropOvflw); + stats->rx_crc_errors = be32_to_cpu(mlx4_port_stats->RCRC); + stats->rx_fifo_errors = be32_to_cpu(mlx4_port_stats->RdropOvflw); + stats->rx_missed_errors = be32_to_cpu(mlx4_port_stats->RdropOvflw); + stats->broadcast = be64_to_cpu(mlx4_port_stats->RBCAST_prio_0) + + be64_to_cpu(mlx4_port_stats->RBCAST_prio_1) + + be64_to_cpu(mlx4_port_stats->RBCAST_prio_2) + + be64_to_cpu(mlx4_port_stats->RBCAST_prio_3) + + be64_to_cpu(mlx4_port_stats->RBCAST_prio_4) + + be64_to_cpu(mlx4_port_stats->RBCAST_prio_5) + + be64_to_cpu(mlx4_port_stats->RBCAST_prio_6) + + be64_to_cpu(mlx4_port_stats->RBCAST_prio_7) + + be64_to_cpu(mlx4_port_stats->RBCAST_novlan); +} + +static void fill_function_statistics(void *statistics, + struct mlx4_eth_common_counters *stats) +{ + struct mlx4_func_stat_out_mbox *mlx4_function_stats; + mlx4_function_stats = statistics; + + stats->rx_errors = + be64_to_cpu(mlx4_function_stats->etherStatsCRCAlignErrors) + + be64_to_cpu(mlx4_function_stats->etherStatsFragments) + + be64_to_cpu(mlx4_function_stats->etherStatsJabbers); + /* stats->tx_errors = */ + stats->multicast = + be64_to_cpu(mlx4_function_stats->etherStatsMulticastPkts); + /* stats->rx_length_errors = */ + stats->rx_over_errors = + be64_to_cpu(mlx4_function_stats->etherStatsDropEvents); + stats->rx_crc_errors = + be64_to_cpu(mlx4_function_stats->etherStatsCRCAlignErrors); + stats->rx_fifo_errors = + be64_to_cpu(mlx4_function_stats->etherStatsDropEvents); + stats->rx_missed_errors = + be64_to_cpu(mlx4_function_stats->etherStatsDropEvents); + stats->broadcast = + be64_to_cpu(mlx4_function_stats->etherStatsBroadcastPkts); +} + +int mlx4_DUMP_ETH_STATS(struct mlx4_dev *dev, u8 port, u8 reset, + struct mlx4_eth_common_counters *stats) +{ + struct mlx4_cmd_mailbox *mailbox; + void (*do_fill_statistics)(void *, struct mlx4_eth_common_counters *) = NULL; + u32 in_mod; + int err; + + in_mod = (reset << 8) | ((mlx4_is_mfunc(dev)) ? 
+ (MLX4_DUMP_STATS_FUNC_COUNTERS << 12 | dev->caps.function) : + (MLX4_DUMP_STATS_PORT_COUNTERS << 12 | port)); + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(dev, 0, mailbox->dma, in_mod, 0, + MLX4_CMD_DUMP_ETH_STATS, MLX4_CMD_TIME_CLASS_B, 0); + if (err) + goto out; + + do_fill_statistics = mlx4_is_mfunc(dev) ? fill_function_statistics + : fill_port_statistics; + + if (!reset) + do_fill_statistics(mailbox->buf, stats); + +out: mlx4_free_cmd_mailbox(dev, mailbox); return err; } +EXPORT_SYMBOL_GPL(mlx4_DUMP_ETH_STATS); diff --git a/drivers/net/mlx4/profile.c b/drivers/net/mlx4/profile.c index b967647d0c762..66d7f9140c3cb 100644 --- a/drivers/net/mlx4/profile.c +++ b/drivers/net/mlx4/profile.c @@ -32,7 +32,7 @@ * SOFTWARE. */ -#include +#include #include "mlx4.h" #include "fw.h" @@ -85,7 +85,14 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, struct mlx4_resource tmp; int i, j; - profile = kcalloc(MLX4_RES_NUM, sizeof(*profile), GFP_KERNEL); + if ((request->num_qp > 1 << 23) || (request->num_qp < dev_cap->reserved_qps)) { + mlx4_err(dev, "Log number of QPs (%d) in profile isn't supported. " + "valid valued are (%d-%d)\n", ilog2(request->num_qp), + ilog2(dev_cap->reserved_qps), 23); + return -EINVAL; + } + + profile = kzalloc(MLX4_RES_NUM * sizeof *profile, GFP_KERNEL); if (!profile) return -ENOMEM; @@ -99,15 +106,19 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, profile[MLX4_RES_DMPT].size = dev_cap->dmpt_entry_sz; profile[MLX4_RES_CMPT].size = dev_cap->cmpt_entry_sz; profile[MLX4_RES_MTT].size = dev->caps.mtts_per_seg * dev_cap->mtt_entry_sz; - profile[MLX4_RES_MCG].size = MLX4_MGM_ENTRY_SIZE; - + profile[MLX4_RES_MCG].size = mlx4_get_mgm_entry_size(dev); profile[MLX4_RES_QP].num = request->num_qp; profile[MLX4_RES_RDMARC].num = request->num_qp * request->rdmarc_per_qp; profile[MLX4_RES_ALTC].num = request->num_qp; profile[MLX4_RES_AUXC].num = request->num_qp; profile[MLX4_RES_SRQ].num = request->num_srq; profile[MLX4_RES_CQ].num = request->num_cq; - profile[MLX4_RES_EQ].num = min_t(unsigned, dev_cap->max_eqs, MAX_MSIX); + if (mlx4_is_master(dev)) + profile[MLX4_RES_EQ].num = mlx4_master_get_num_eqs(dev); + else + profile[MLX4_RES_EQ].num = min_t(unsigned, dev_cap->max_eqs, + dev_cap->reserved_eqs + + num_possible_cpus() + 1); profile[MLX4_RES_DMPT].num = request->num_mpt; profile[MLX4_RES_CMPT].num = MLX4_NUM_CMPTS; profile[MLX4_RES_MTT].num = request->num_mtt; @@ -196,7 +207,13 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, init_hca->log_num_cqs = profile[i].log_num; break; case MLX4_RES_EQ: - dev->caps.num_eqs = profile[i].num; + if (mlx4_is_master(dev)) { + dev->caps.num_eqs = dev_cap->reserved_eqs + + min_t(unsigned, + MLX4_MFUNC_EQ_NUM, + num_possible_cpus() + 1); + } else + dev->caps.num_eqs = profile[i].num; init_hca->eqc_base = profile[i].start; init_hca->log_num_eqs = profile[i].log_num; break; @@ -218,7 +235,7 @@ u64 mlx4_make_profile(struct mlx4_dev *dev, dev->caps.num_mgms = profile[i].num >> 1; dev->caps.num_amgms = profile[i].num >> 1; init_hca->mc_base = profile[i].start; - init_hca->log_mc_entry_sz = ilog2(MLX4_MGM_ENTRY_SIZE); + init_hca->log_mc_entry_sz = ilog2(mlx4_get_mgm_entry_size(dev)); init_hca->log_mc_table_sz = profile[i].log_num; init_hca->log_mc_hash_sz = profile[i].log_num - 1; break; diff --git a/drivers/net/mlx4/qp.c b/drivers/net/mlx4/qp.c index ec9350e5f21ab..5489364fac77c 100644 --- a/drivers/net/mlx4/qp.c +++ b/drivers/net/mlx4/qp.c @@ -33,7 +33,8 @@ * SOFTWARE. 
*/ -#include +#include + #include #include @@ -54,7 +55,7 @@ void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type) spin_unlock(&qp_table->lock); if (!qp) { - mlx4_warn(dev, "Async event for bogus QP %08x\n", qpn); + mlx4_dbg(dev, "Async event for none existent QP %08x\n", qpn); return; } @@ -64,10 +65,24 @@ void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type) complete(&qp->free); } -int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt, - enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state, - struct mlx4_qp_context *context, enum mlx4_qp_optpar optpar, - int sqd_event, struct mlx4_qp *qp) +/* used for INIT/CLOSE port logic */ +static int is_qp0(struct mlx4_dev *dev, struct mlx4_qp *qp, int *real_qp0, int *proxy_qp0) +{ + /* qp0 is either the proxy qp0, or the real qp0 */ + *proxy_qp0 = qp->qpn >= dev->caps.sqp_start && + qp->qpn <= dev->caps.sqp_start + 1; + + *real_qp0 = mlx4_is_mfunc(dev) && + qp->qpn >= dev->caps.tunnel_qpn && + qp->qpn <= dev->caps.tunnel_qpn + 1; + + return *real_qp0 || *proxy_qp0; +} + +int __mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state, + struct mlx4_qp_context *context, enum mlx4_qp_optpar optpar, + int sqd_event, struct mlx4_qp *qp, int native) { static const u16 op[MLX4_QP_NUM_STATE][MLX4_QP_NUM_STATE] = { [MLX4_QP_STATE_RST] = { @@ -109,16 +124,32 @@ int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt, } }; + struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_cmd_mailbox *mailbox; int ret = 0; + int real_qp0 = 0; + int proxy_qp0 = 0; + u8 port; + u8 vep_num; if (cur_state >= MLX4_QP_NUM_STATE || new_state >= MLX4_QP_NUM_STATE || !op[cur_state][new_state]) return -EINVAL; - if (op[cur_state][new_state] == MLX4_CMD_2RST_QP) - return mlx4_cmd(dev, 0, qp->qpn, 2, - MLX4_CMD_2RST_QP, MLX4_CMD_TIME_CLASS_A); + if (op[cur_state][new_state] == MLX4_CMD_2RST_QP) { + ret = mlx4_cmd(dev, 0, qp->qpn, 2, + MLX4_CMD_2RST_QP, MLX4_CMD_TIME_CLASS_A, native); + if (mlx4_is_master(dev) && cur_state != MLX4_QP_STATE_ERR && + cur_state != MLX4_QP_STATE_RST && + is_qp0(dev, qp, &real_qp0, &proxy_qp0)) { + port = (qp->qpn & 1) + 1; + if (proxy_qp0) + priv->mfunc.master.qp0_state[port].proxy_qp0_active = 0; + else + priv->mfunc.master.qp0_state[port].qp0_active = 0; + } + return ret; + } mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) @@ -131,107 +162,233 @@ int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt, context->log_page_size = mtt->page_shift - MLX4_ICM_PAGE_SHIFT; } + port = ((context->pri_path.sched_queue >> 6) & 1) + 1; + if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) { + vep_num = dev->caps.function >> 1; + context->pri_path.sched_queue = (context->pri_path.sched_queue & 0xc3) | + (vep_num << 3); + } + *(__be32 *) mailbox->buf = cpu_to_be32(optpar); memcpy(mailbox->buf + 8, context, sizeof *context); ((struct mlx4_qp_context *) (mailbox->buf + 8))->local_qpn = cpu_to_be32(qp->qpn); - ret = mlx4_cmd(dev, mailbox->dma, qp->qpn | (!!sqd_event << 31), + ret = mlx4_cmd(dev, mailbox->dma | dev->caps.function, + qp->qpn | (!!sqd_event << 31), new_state == MLX4_QP_STATE_RST ? 
2 : 0, - op[cur_state][new_state], MLX4_CMD_TIME_CLASS_C); + op[cur_state][new_state], MLX4_CMD_TIME_CLASS_C, native); + + if (mlx4_is_master(dev) && cur_state != MLX4_QP_STATE_ERR && + cur_state != MLX4_QP_STATE_RST && new_state == MLX4_QP_STATE_ERR && + is_qp0(dev, qp, &real_qp0, &proxy_qp0)) { + if (proxy_qp0) + priv->mfunc.master.qp0_state[port].proxy_qp0_active = 0; + else + priv->mfunc.master.qp0_state[port].qp0_active = 0; + } + + if (mlx4_is_master(dev) && new_state == MLX4_QP_STATE_RTR && + is_qp0(dev, qp, &real_qp0, &proxy_qp0)) { + if (proxy_qp0) + priv->mfunc.master.qp0_state[port].proxy_qp0_active = 1; + else + priv->mfunc.master.qp0_state[port].qp0_active = 1; + } mlx4_free_cmd_mailbox(dev, mailbox); return ret; } + +int mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + enum mlx4_qp_state cur_state, enum mlx4_qp_state new_state, + struct mlx4_qp_context *context, enum mlx4_qp_optpar optpar, + int sqd_event, struct mlx4_qp *qp) +{ + return __mlx4_qp_modify(dev, mtt, cur_state, new_state, context, + optpar, sqd_event, qp, 0); +} EXPORT_SYMBOL_GPL(mlx4_qp_modify); -int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base) +int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_qp_table *qp_table = &priv->qp_table; - int qpn; - qpn = mlx4_bitmap_alloc_range(&qp_table->bitmap, cnt, align); - if (qpn == -1) + *base = mlx4_bitmap_alloc_range(&qp_table->bitmap, cnt, align); + if (*base == -1) return -ENOMEM; - - *base = qpn; return 0; } + +int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, int *base) +{ + u64 in_param; + u64 out_param; + int err; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, cnt); + set_param_h(&in_param, align); + err = mlx4_cmd_imm(dev, in_param, &out_param, RES_QP, RES_OP_RESERVE, + MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, 0); + if (err) + return err; + + *base = get_param_l(&out_param); + return 0; + } + return __mlx4_qp_reserve_range(dev, cnt, align, base); +} EXPORT_SYMBOL_GPL(mlx4_qp_reserve_range); -void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt) +void __mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_qp_table *qp_table = &priv->qp_table; - if (base_qpn < dev->caps.sqp_start + 8) - return; + if (mlx4_is_qp_reserved(dev, (u32) base_qpn)) + return; mlx4_bitmap_free_range(&qp_table->bitmap, base_qpn, cnt); } + +void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt) +{ + u64 in_param; + int err; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, base_qpn); + set_param_h(&in_param, cnt); + err = mlx4_cmd(dev, in_param, RES_QP, RES_OP_RESERVE, + MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, 0); + if (err) { + mlx4_warn(dev, "Failed to release qp range base:%d cnt:%d\n", + base_qpn, cnt); + } + } else + __mlx4_qp_release_range(dev, base_qpn, cnt); +} EXPORT_SYMBOL_GPL(mlx4_qp_release_range); -int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp) +int __mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_qp_table *qp_table = &priv->qp_table; int err; - if (!qpn) - return -EINVAL; - - qp->qpn = qpn; - - err = mlx4_table_get(dev, &qp_table->qp_table, qp->qpn); + err = mlx4_table_get(dev, &qp_table->qp_table, qpn, MLX4_MR_FLAG_NONE); if (err) goto err_out; - err = mlx4_table_get(dev, &qp_table->auxc_table, qp->qpn); + err = mlx4_table_get(dev, 
&qp_table->auxc_table, qpn, + MLX4_MR_FLAG_NONE); if (err) goto err_put_qp; - err = mlx4_table_get(dev, &qp_table->altc_table, qp->qpn); + err = mlx4_table_get(dev, &qp_table->altc_table, qpn, + MLX4_MR_FLAG_NONE); if (err) goto err_put_auxc; - err = mlx4_table_get(dev, &qp_table->rdmarc_table, qp->qpn); + err = mlx4_table_get(dev, &qp_table->rdmarc_table, qpn, + MLX4_MR_FLAG_NONE); if (err) goto err_put_altc; - err = mlx4_table_get(dev, &qp_table->cmpt_table, qp->qpn); + err = mlx4_table_get(dev, &qp_table->cmpt_table, qpn, + MLX4_MR_FLAG_NONE); if (err) goto err_put_rdmarc; - spin_lock_irq(&qp_table->lock); - err = radix_tree_insert(&dev->qp_table_tree, qp->qpn & (dev->caps.num_qps - 1), qp); - spin_unlock_irq(&qp_table->lock); - if (err) - goto err_put_cmpt; - - atomic_set(&qp->refcount, 1); - init_completion(&qp->free); - return 0; -err_put_cmpt: - mlx4_table_put(dev, &qp_table->cmpt_table, qp->qpn); - err_put_rdmarc: - mlx4_table_put(dev, &qp_table->rdmarc_table, qp->qpn); + mlx4_table_put(dev, &qp_table->rdmarc_table, qpn, MLX4_MR_FLAG_NONE); err_put_altc: - mlx4_table_put(dev, &qp_table->altc_table, qp->qpn); + mlx4_table_put(dev, &qp_table->altc_table, qpn, MLX4_MR_FLAG_NONE); err_put_auxc: - mlx4_table_put(dev, &qp_table->auxc_table, qp->qpn); + mlx4_table_put(dev, &qp_table->auxc_table, qpn, MLX4_MR_FLAG_NONE); err_put_qp: - mlx4_table_put(dev, &qp_table->qp_table, qp->qpn); + mlx4_table_put(dev, &qp_table->qp_table, qpn, MLX4_MR_FLAG_NONE); err_out: return err; } + +static int mlx4_qp_alloc_icm(struct mlx4_dev *dev, int qpn) +{ + u64 param; + + if (mlx4_is_mfunc(dev)) { + set_param_l(¶m, qpn); + return mlx4_cmd_imm(dev, param, ¶m, RES_QP, RES_OP_MAP_ICM, + MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, 0); + } + return __mlx4_qp_alloc_icm(dev, qpn); +} + +void __mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_qp_table *qp_table = &priv->qp_table; + + mlx4_table_put(dev, &qp_table->cmpt_table, qpn, MLX4_MR_FLAG_NONE); + mlx4_table_put(dev, &qp_table->rdmarc_table, qpn, MLX4_MR_FLAG_NONE); + mlx4_table_put(dev, &qp_table->altc_table, qpn, MLX4_MR_FLAG_NONE); + mlx4_table_put(dev, &qp_table->auxc_table, qpn, MLX4_MR_FLAG_NONE); + mlx4_table_put(dev, &qp_table->qp_table, qpn, MLX4_MR_FLAG_NONE); +} + +void mlx4_qp_free_icm(struct mlx4_dev *dev, int qpn) +{ + u64 in_param; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, qpn); + if (mlx4_cmd(dev, in_param, RES_QP, RES_OP_MAP_ICM, + MLX4_CMD_FREE_RES, MLX4_CMD_TIME_CLASS_A, 0)) + mlx4_warn(dev, "Failed to free icm of qp:%d\n", qpn); + } else + __mlx4_qp_free_icm(dev, qpn); +} + +int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_qp_table *qp_table = &priv->qp_table; + int err; + + if (!qpn) + return -EINVAL; + + qp->qpn = qpn; + + err = mlx4_qp_alloc_icm(dev, qpn); + if (err) + return err; + + spin_lock_irq(&qp_table->lock); + err = radix_tree_insert(&dev->qp_table_tree, qp->qpn & (dev->caps.num_qps - 1), qp); + spin_unlock_irq(&qp_table->lock); + if (err) + goto err_icm; + + atomic_set(&qp->refcount, 1); + init_completion(&qp->free); + + return 0; + +err_icm: + mlx4_qp_free_icm(dev, qpn); + return err; +} EXPORT_SYMBOL_GPL(mlx4_qp_alloc); void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp) @@ -247,24 +404,19 @@ EXPORT_SYMBOL_GPL(mlx4_qp_remove); void mlx4_qp_free(struct mlx4_dev *dev, struct mlx4_qp *qp) { - struct mlx4_qp_table *qp_table = &mlx4_priv(dev)->qp_table; - if 
(atomic_dec_and_test(&qp->refcount)) complete(&qp->free); wait_for_completion(&qp->free); - mlx4_table_put(dev, &qp_table->cmpt_table, qp->qpn); - mlx4_table_put(dev, &qp_table->rdmarc_table, qp->qpn); - mlx4_table_put(dev, &qp_table->altc_table, qp->qpn); - mlx4_table_put(dev, &qp_table->auxc_table, qp->qpn); - mlx4_table_put(dev, &qp_table->qp_table, qp->qpn); + mlx4_qp_free_icm(dev, qp->qpn); } EXPORT_SYMBOL_GPL(mlx4_qp_free); static int mlx4_CONF_SPECIAL_QP(struct mlx4_dev *dev, u32 base_qpn) { - return mlx4_cmd(dev, 0, base_qpn, 0, MLX4_CMD_CONF_SPECIAL_QP, - MLX4_CMD_TIME_CLASS_B); + return mlx4_cmd(dev, 0, base_qpn, + (dev->caps.flags & MLX4_DEV_CAP_FLAG_RAW_ETY) ? 4 : 0, + MLX4_CMD_CONF_SPECIAL_QP, MLX4_CMD_TIME_CLASS_B, 1); } int mlx4_init_qp_table(struct mlx4_dev *dev) @@ -275,13 +427,20 @@ int mlx4_init_qp_table(struct mlx4_dev *dev) spin_lock_init(&qp_table->lock); INIT_RADIX_TREE(&dev->qp_table_tree, GFP_ATOMIC); + if (mlx4_is_mfunc(dev) && !mlx4_is_master(dev)) + return 0; /* * We reserve 2 extra QPs per port for the special QPs. The * block of special QPs must be aligned to a multiple of 8, so * round up. + * If we are operating in multi-function mode, we reserve additional + * 8 qps for each function, while the "real" special qps serve as + * a tunnel for para-virtualization purposes. + * We also reserve the MSB of the 24-bit QP number to indicate + * an XRC qp. */ - dev->caps.sqp_start = + dev->caps.tunnel_qpn = ALIGN(dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 8); { @@ -312,21 +471,48 @@ int mlx4_init_qp_table(struct mlx4_dev *dev) } + /* reserve 8 (unused) tunnel QPs, 8 real SQPs, and, per function, + * 8 proxy SQPs (used by slaves), and 8 corresponding tunnel QPs (for master). + * Each proxy SQP works opposite its own Dom0 tunnel QP. + */ err = mlx4_bitmap_init(&qp_table->bitmap, dev->caps.num_qps, - (1 << 23) - 1, dev->caps.sqp_start + 8, + (1 << 23) - 1, dev->caps.tunnel_qpn + 8 + + 16 * MLX4_MFUNC_MAX * + !!(mlx4_is_mfunc(dev) && mlx4_is_master(dev)), reserved_from_top); if (err) return err; + /*In mfunc the start of the sqp is diffrent:*/ + if (mlx4_is_mfunc(dev)) + dev->caps.sqp_start = dev->caps.tunnel_qpn + 8 * (dev->caps.function + 1); + else + dev->caps.sqp_start = dev->caps.tunnel_qpn; - return mlx4_CONF_SPECIAL_QP(dev, dev->caps.sqp_start); + return mlx4_CONF_SPECIAL_QP(dev, dev->caps.tunnel_qpn); } void mlx4_cleanup_qp_table(struct mlx4_dev *dev) { + if (mlx4_is_mfunc(dev) && !mlx4_is_master(dev)) + return; + mlx4_CONF_SPECIAL_QP(dev, 0); mlx4_bitmap_cleanup(&mlx4_priv(dev)->qp_table.bitmap); } +int mlx4_qp_get_region(struct mlx4_dev *dev, enum mlx4_qp_region region, + int *base_qpn, int *cnt) +{ + if ((region < 0) || (region >= MLX4_NUM_QP_REGION)) + return -EINVAL; + + *base_qpn = dev->caps.reserved_qps_base[region]; + *cnt = dev->caps.reserved_qps_cnt[region]; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_qp_get_region); + int mlx4_qp_query(struct mlx4_dev *dev, struct mlx4_qp *qp, struct mlx4_qp_context *context) { @@ -338,7 +524,7 @@ int mlx4_qp_query(struct mlx4_dev *dev, struct mlx4_qp *qp, return PTR_ERR(mailbox); err = mlx4_cmd_box(dev, 0, mailbox->dma, qp->qpn, 0, - MLX4_CMD_QUERY_QP, MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_QUERY_QP, MLX4_CMD_TIME_CLASS_A, 0); if (!err) memcpy(context, mailbox->buf + 8, sizeof *context); diff --git a/drivers/net/mlx4/reset.c b/drivers/net/mlx4/reset.c index e5741dab3825f..f8219b2a5edc1 100644 --- a/drivers/net/mlx4/reset.c +++ b/drivers/net/mlx4/reset.c @@ -31,6 +31,7 @@ * SOFTWARE. 
*/ +#include #include #include #include @@ -39,6 +40,41 @@ #include "mlx4.h" + +#define MLX4_OWNER_BASE 0x8069c +#define MLX4_OWNER_SIZE 4 + +int mlx4_get_ownership(struct mlx4_dev *dev) +{ + void __iomem *owner; + u32 ret; + + owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE, + MLX4_OWNER_SIZE); + if (!owner) { + mlx4_err(dev, "Failed to map ownership sempahore\n"); + return -ENOMEM; + } + + ret = readl(owner); + iounmap(owner); + return (int) !!ret; +} + +void mlx4_free_ownership(struct mlx4_dev *dev) +{ + void __iomem *owner; + + owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE, + MLX4_OWNER_SIZE); + if (!owner) { + mlx4_err(dev, "Failed to map ownership sempahore\n"); + return; + } + writel(0, owner); + iounmap(owner); +} + int mlx4_reset(struct mlx4_dev *dev) { void __iomem *reset; diff --git a/drivers/net/mlx4/resource_tracker.c b/drivers/net/mlx4/resource_tracker.c new file mode 100644 index 0000000000000..4ef1e35b78f43 --- /dev/null +++ b/drivers/net/mlx4/resource_tracker.c @@ -0,0 +1,3634 @@ +/* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "mlx4.h" +#include "fw.h" +#include "fmr_master.h" +#include "fmr_slave.h" + +/* For Debug uses */ +static const char *ResourceType(enum mlx4_resource rt) +{ + switch (rt) { + case RES_QP: return "RES_QP"; + case RES_CQ: return "RES_CQ"; + case RES_SRQ: return "RES_SRQ"; + case RES_MPT: return "RES_MPT"; + case RES_MTT: return "RES_MTT"; + case RES_MAC: return "RES_MAC"; + case RES_EQ: return "RES_EQ"; + case RES_COUNTER: return "RES_COUNTER"; + case RES_XRCDN: return "RES_XRCDN"; + default: return "Unknown resource type !!!"; + }; +} + +int mlx4_init_resource_tracker(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i; + int t; + + priv->mfunc.master.res_tracker.slave_list = + kzalloc(dev->num_slaves * sizeof (struct slave_list), GFP_KERNEL); + if (!priv->mfunc.master.res_tracker.slave_list) + return -ENOMEM; + + for (i = 0 ; i < dev->num_slaves; i++) { + for (t = 0; t < MLX4_NUM_OF_RESOURCE_TYPE; ++t) + INIT_LIST_HEAD(&priv->mfunc.master.res_tracker.slave_list[i].res_list[t]); + mutex_init(&priv->mfunc.master.res_tracker.slave_list[i].mutex); + } + + mlx4_dbg(dev, "Started init_resource_tracker: %ld slaves \n", dev->num_slaves); + for (i = 0 ; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) + INIT_RADIX_TREE(&priv->mfunc.master.res_tracker.res_tree[i], + GFP_ATOMIC|__GFP_NOWARN); + + spin_lock_init(&priv->mfunc.master.res_tracker.lock); + return 0 ; +} + +void mlx4_free_resource_tracker(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i; + + if (priv->mfunc.master.res_tracker.slave_list) { + for (i = 0 ; i < dev->num_slaves; i++) + mlx4_delete_all_resources_for_slave(dev, i); + + kfree(priv->mfunc.master.res_tracker.slave_list); + } +} + +static void update_pkey_index(struct mlx4_dev *dev, int slave, + struct mlx4_cmd_mailbox *inbox) +{ + u8 sched = *(u8 *)(inbox->buf + 64); + u8 orig_index = *(u8 *)(inbox->buf + 35); + u8 new_index; + struct mlx4_priv *priv = mlx4_priv(dev); + int port; + + port = (sched >> 6 & 1) + 1; + + new_index = priv->virt2phys_pkey[slave][port - 1][orig_index]; + *(u8 *)(inbox->buf + 35) = new_index; + + mlx4_dbg(dev, "port = %d, orig pkey index = %d, " + "new pkey index = %d\n", port, orig_index, new_index); +} + +static void update_ud_gid(struct mlx4_qp_context *qp_ctx, u8 slave) +{ + u32 ts = (be32_to_cpu(qp_ctx->flags) >> 16) & 0xff; + + if (MLX4_QP_ST_UD == ts) + qp_ctx->pri_path.mgid_index = 0x80 | slave; + + mlx4_sdbg("slave %d, new gid index: 0x%x ", + slave, qp_ctx->pri_path.mgid_index); +} + +static int mpt_mask(struct mlx4_dev *dev) +{ + return dev->caps.num_mpts - 1; +} + +static void *find_res(struct mlx4_dev *dev, int res_id, + enum mlx4_resource type) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + return radix_tree_lookup(&priv->mfunc.master.res_tracker.res_tree[type], + res_id); +} + +static int get_res(struct mlx4_dev *dev, int slave, int res_id, enum mlx4_resource type, + void *res) +{ + struct res_common *r; + int err = 0; + + spin_lock_irq(mlx4_tlock(dev)); + r = find_res(dev, res_id, type); + if (!r) { + err = -ENONET; + goto exit; + } + + if (r->state == RES_ANY_BUSY) { + err = -EBUSY; + goto exit; + } + + if (r->owner != slave) { + err = -EPERM; + goto exit; + } + + r->from_state = r->state; + r->state = RES_ANY_BUSY; + mlx4_sdbg("res %s id 0x%x to busy\n", ResourceType(type), r->res_id); + + if (res) + *((struct res_common **)res) = r; + +exit: + spin_unlock_irq(mlx4_tlock(dev)); + return err; +} + +#if 0 +static 
void __put_res(struct mlx4_dev *dev, int slave, void *_r) +{ + struct res_common *r = _r; + spin_lock_irq(mlx4_tlock(dev)); + mlx4_sdbg("move back id 0x%x from %d to %d\n", + r->res_id, r->state, r->from_state); + r->state = r->from_state; + spin_unlock_irq(mlx4_tlock(dev)); +} +#endif + +int mlx4_get_slave_from_resource_id(struct mlx4_dev *dev, + enum mlx4_resource type, + int res_id, int *slave) +{ + + struct res_common *r; + int err = -ENOENT; + int id = res_id; + unsigned long flags; + + if (type == RES_QP) + id &= 0x7fffff; + spin_lock_irqsave(mlx4_tlock(dev), flags); + + r = find_res(dev, id, type); + if (r) { + *slave = r->owner; + err = 0; + } + spin_unlock_irqrestore(mlx4_tlock(dev), flags); + + return err; +} + +static void put_res(struct mlx4_dev *dev, int slave, int res_id, enum mlx4_resource type) +{ + struct res_common *r; + + spin_lock_irq(mlx4_tlock(dev)); + r = find_res(dev, res_id, type); + SASSERT(r); + mlx4_sdbg("move back %s id 0x%x from %d to %d\n", + ResourceType(type), r->res_id, r->state, r->from_state); + r->state = r->from_state; + spin_unlock_irq(mlx4_tlock(dev)); +} + +static struct res_common *alloc_qp_tr(int id) +{ + struct res_qp *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_QP_RESERVED; + INIT_LIST_HEAD(&ret->mcg_list); + spin_lock_init(&ret->mcg_spl); + + return &ret->com; +} + +static struct res_common *alloc_mtt_tr(int id, int order) +{ + struct res_mtt *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->order = order; + ret->com.state = RES_MTT_RESERVED; + atomic_set(&ret->ref_count, 0); + + return &ret->com; +} + +static struct res_common *alloc_mpt_tr(int id, int key, + enum mlx4_mr_flags flags) +{ + struct res_mpt *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_MPT_RESERVED; + ret->key = key; + ret->flags = flags; + + return &ret->com; +} + +static struct res_common *alloc_eq_tr(int id) +{ + struct res_eq *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_EQ_RESERVED; + + return &ret->com; +} + +static struct res_common *alloc_cq_tr(int id) +{ + struct res_cq *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_CQ_ALLOCATED; + atomic_set(&ret->ref_count, 0); + + return &ret->com; +} + +static struct res_common *alloc_srq_tr(int id) +{ + struct res_srq *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_SRQ_ALLOCATED; + atomic_set(&ret->ref_count, 0); + + return &ret->com; +} + +static struct res_common *alloc_counter_tr(int id) +{ + struct res_counter *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_COUNTER_ALLOCATED; + + return &ret->com; +} + +static struct res_common *alloc_xrcdn_tr(int id) +{ + struct res_xrcdn *ret; + + ret = kzalloc(sizeof *ret, GFP_KERNEL); + if (!ret) + return NULL; + + ret->com.res_id = id; + ret->com.state = RES_XRCDN_ALLOCATED; + + return &ret->com; +} + +static struct res_common *alloc_tr(int id, enum mlx4_resource type, int slave, + int extra, int extra2) +{ + struct res_common *ret; + + switch (type) { + case RES_QP: + ret = alloc_qp_tr(id); + break; + case RES_MPT: + ret = alloc_mpt_tr(id, extra, extra2); + break; 
+ case RES_MTT: + ret = alloc_mtt_tr(id, extra); + break; + case RES_EQ: + ret = alloc_eq_tr(id); + break; + case RES_CQ: + ret = alloc_cq_tr(id); + break; + case RES_SRQ: + ret = alloc_srq_tr(id); + break; + case RES_MAC: + printk(KERN_ERR "implementation missing\n"); + return NULL; + case RES_COUNTER: + ret = alloc_counter_tr(id); + break; + case RES_XRCDN: + ret = alloc_xrcdn_tr(id); + break; + + default: + return NULL; + } + if (ret) + ret->owner = slave; + + return ret; +} + +static int add_res_range(struct mlx4_dev *dev, int slave, int base, int count, + enum mlx4_resource type, int extra, int extra2) +{ + int i; + int err; + struct mlx4_priv *priv = mlx4_priv(dev); + struct res_common **res_arr; + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct radix_tree_root *root = &tracker->res_tree[type]; + + res_arr = kzalloc(count * sizeof *res_arr, GFP_KERNEL); + if (!res_arr) + return -ENOMEM; + + for (i = 0; i < count; ++i) { + res_arr[i] = alloc_tr(base + i, type, slave, extra, extra2); + if (!res_arr[i]) { + for (--i; i >= 0; --i) + kfree(res_arr[i]); + + kfree(res_arr); + return -ENOMEM; + } + } + + spin_lock_irq(mlx4_tlock(dev)); + for (i = 0; i < count; ++i) { + if (find_res(dev, base + i, type)) { + err = -EEXIST; + goto undo; + } + err = radix_tree_insert(root, base + i, res_arr[i]); + if (err) + goto undo; + list_add_tail(&res_arr[i]->list, &tracker->slave_list[slave].res_list[type]); + } + spin_unlock_irq(mlx4_tlock(dev)); + kfree(res_arr); + + return 0; + +undo: + for (--i; i >= base; --i) + radix_tree_delete(&tracker->res_tree[type], i); + + spin_unlock_irq(mlx4_tlock(dev)); + + for (i = 0; i < count; ++i) + kfree(res_arr[i]); + + kfree(res_arr); + + return err; +} + +static int remove_qp_ok(struct res_qp *res) +{ + if (res->com.state == RES_QP_BUSY) + return -EBUSY; + else if (res->com.state != RES_QP_RESERVED) + return -EPERM; + + return 0; +} + +static int remove_mtt_ok(struct res_mtt *res, int order) +{ + if (res->com.state == RES_MTT_BUSY || atomic_read(&res->ref_count)) { + printk(KERN_DEBUG "%s-%d: state %s, ref_count %d\n", __func__, __LINE__, + mtt_states_str(res->com.state), atomic_read(&res->ref_count)); + return -EBUSY; + } else if (res->com.state != RES_MTT_ALLOCATED && + res->com.state != RES_MTT_RESERVED) + return -EPERM; + else if (res->order != order) + return -EINVAL; + + return 0; +} + +static int remove_mpt_ok(struct res_mpt *res) +{ + if (res->com.state == RES_MPT_BUSY) + return -EBUSY; + else if (res->com.state != RES_MPT_RESERVED) + return -EPERM; + + return 0; +} + +static int remove_eq_ok(struct res_eq *res) +{ + if (res->com.state == RES_MPT_BUSY) + return -EBUSY; + else if (res->com.state != RES_MPT_RESERVED) + return -EPERM; + + return 0; +} + +static int remove_counter_ok(struct res_counter *res) +{ + if (res->com.state == RES_COUNTER_BUSY) + return -EBUSY; + else if (res->com.state != RES_COUNTER_ALLOCATED) + return -EPERM; + + return 0; +} + +static int remove_xrcdn_ok(struct res_xrcdn *res) +{ + if (res->com.state == RES_XRCDN_BUSY) + return -EBUSY; + else if (res->com.state != RES_XRCDN_ALLOCATED) + return -EPERM; + + return 0; +} + +static int remove_cq_ok(struct res_cq *res) +{ + if (res->com.state == RES_CQ_BUSY) + return -EBUSY; + else if (res->com.state != RES_CQ_ALLOCATED) + return -EPERM; + + return 0; +} + +static int remove_srq_ok(struct res_srq *res) +{ + if (res->com.state == RES_SRQ_BUSY) + return -EBUSY; + else if (res->com.state != RES_SRQ_ALLOCATED) + return -EPERM; + + return 0; +} + +static int 
remove_ok(struct res_common *res, enum mlx4_resource type, int extra) +{ + switch (type) { + case RES_QP: + return remove_qp_ok((struct res_qp *)res); + case RES_CQ: + return remove_cq_ok((struct res_cq *)res); + case RES_SRQ: + return remove_srq_ok((struct res_srq *)res); + case RES_MPT: + return remove_mpt_ok((struct res_mpt *)res); + case RES_MTT: + return remove_mtt_ok((struct res_mtt *)res, extra); + case RES_MAC: + return -ENOSYS; + case RES_EQ: + return remove_eq_ok((struct res_eq *)res); + case RES_COUNTER: + return remove_counter_ok((struct res_counter *)res); + case RES_XRCDN: + return remove_xrcdn_ok((struct res_xrcdn *)res); + default: + return -EINVAL; + } +} + +static int rem_res_range(struct mlx4_dev *dev, int slave, int base, int count, + enum mlx4_resource type, int extra) +{ + int i; + int err; + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_common *r; + + spin_lock_irq(mlx4_tlock(dev)); + for (i = base; i < base + count; ++i) { + r = radix_tree_lookup(&tracker->res_tree[type], i); + if (!r) { + err = -ENOENT; + goto out; + } + if (r->owner != slave) { + err = -EPERM; + goto out; + } + if ((err = remove_ok(r, type, extra))) + goto out; + } + + for (i = base; i < base + count; ++i) { + r = radix_tree_lookup(&tracker->res_tree[type], i); + radix_tree_delete(&tracker->res_tree[type], i); + list_del(&r->list); + kfree(r); + } + err = 0; + +out: + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static int qp_res_start_move_to(struct mlx4_dev *dev, int slave, int qpn, + enum res_qp_states state, struct res_qp **qp, int alloc) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_qp *r; + int err = 0; + + spin_lock_irq(mlx4_tlock(dev)); + r = radix_tree_lookup(&tracker->res_tree[RES_QP], qpn); + if (!r) + err = -ENOENT; + else if (r->com.owner != slave) + err = -EPERM; + else { + switch (state) { + case RES_QP_BUSY: + mlx4_sdbg("failed RES_QP, 0x%x\n", r->com.res_id); + err = -EBUSY; + break; + + case RES_QP_RESERVED: + if (r->com.state == RES_QP_MAPPED && !alloc) + break; + + mlx4_sdbg("failed RES_QP, 0x%x\n", r->com.res_id); + err = -EINVAL; + break; + + case RES_QP_MAPPED: + if ((r->com.state == RES_QP_RESERVED && alloc) || + r->com.state == RES_QP_HW) + break; + else { + mlx4_sdbg("failed RES_QP, 0x%x\n", r->com.res_id); + err = -EINVAL; + } + + break; + + case RES_QP_HW: + if (r->com.state != RES_QP_MAPPED) { + mlx4_sdbg("failed RES_QP, 0x%x\n", r->com.res_id); + err = -EINVAL; + } + break; + default: + mlx4_sdbg("failed RES_QP, 0x%x\n", r->com.res_id); + err = -EINVAL; + } + + if (!err) { + r->com.from_state = r->com.state; + r->com.to_state = state; + r->com.state = RES_QP_BUSY; + mlx4_sdbg("move to %s from %s qpn 0x%x\n", qp_states_str(state), + qp_states_str(r->com.from_state), r->com.res_id); + if (qp) + *qp = (struct res_qp *)r; + } + } + + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static int mtt_res_start_move_to(struct mlx4_dev *dev, int slave, int index, + enum res_mtt_states state) { + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = + &priv->mfunc.master.res_tracker; + struct res_mtt *r; + int err = 0; + + spin_lock_irq(mlx4_tlock(dev)); + r = radix_tree_lookup(&tracker->res_tree[RES_MTT], index); + if (!r) + err = -ENOENT; + else if (r->com.owner != slave) + err = -EPERM; + else { + switch (state) { + case RES_MTT_BUSY: + err = -EINVAL; + break; + 
+ case RES_MTT_RESERVED: + if (r->com.state != RES_MTT_ALLOCATED) + err = -EINVAL; + break; + + case RES_MTT_ALLOCATED: + if (r->com.state != RES_MTT_RESERVED) + err = -EINVAL; + break; + + default: + err = -EINVAL; + } + } + + if (!err) { + r->com.from_state = r->com.state; + r->com.to_state = state; + r->com.state = RES_MTT_BUSY; + } + + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static int mr_res_start_move_to(struct mlx4_dev *dev, int slave, int index, + enum res_mpt_states state, struct res_mpt **mpt) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_mpt *r; + int err = 0; + + spin_lock_irq(mlx4_tlock(dev)); + r = radix_tree_lookup(&tracker->res_tree[RES_MPT], index); + if (!r) + err = -ENOENT; + else if (r->com.owner != slave) + err = -EPERM; + else { + switch (state) { + case RES_MPT_BUSY: + err = -EINVAL; + break; + + case RES_MPT_RESERVED: + if (r->com.state != RES_MPT_MAPPED) + err = -EINVAL; + break; + + case RES_MPT_MAPPED: + if (r->com.state != RES_MPT_RESERVED && r->com.state != RES_MPT_HW) + err = -EINVAL; + break; + + case RES_MPT_HW: + if (r->com.state != RES_MPT_MAPPED) + err = -EINVAL; + break; + default: + err = -EINVAL; + } + + if (!err) { + r->com.from_state = r->com.state; + r->com.to_state = state; + r->com.state = RES_MPT_BUSY; + if (mpt) + *mpt = (struct res_mpt *)r; + } + } + + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static int eq_res_start_move_to(struct mlx4_dev *dev, int slave, int index, + enum res_eq_states state, struct res_eq **eq) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_eq *r; + int err = 0; + + spin_lock_irq(mlx4_tlock(dev)); + r = radix_tree_lookup(&tracker->res_tree[RES_EQ], index); + if (!r) + err = -ENOENT; + else if (r->com.owner != slave) { + mlx4_sdbg("EQ res_id 0x%x belongs to slave %d\n", r->com.res_id, r->com.owner); + err = -EPERM; + } else { + switch (state) { + case RES_EQ_BUSY: + err = -EINVAL; + break; + + case RES_EQ_RESERVED: + if (r->com.state != RES_EQ_HW) + err = -EINVAL; + break; + + case RES_EQ_HW: + if (r->com.state != RES_EQ_RESERVED) + err = -EINVAL; + break; + + default: + err = -EINVAL; + } + + if (!err) { + r->com.from_state = r->com.state; + r->com.to_state = state; + r->com.state = RES_EQ_BUSY; + if (eq) + *eq = r; + } + } + + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static int cq_res_start_move_to(struct mlx4_dev *dev, int slave, int cqn, + enum res_cq_states state, struct res_cq **cq) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_cq *r; + int err; + + spin_lock_irq(mlx4_tlock(dev)); + r = radix_tree_lookup(&tracker->res_tree[RES_CQ], cqn); + if (!r) + err = -ENOENT; + else if (r->com.owner != slave) + err = -EPERM; + else { + switch (state) { + case RES_CQ_BUSY: + mlx4_sdbg("CQ 0x%x, ref count %d\n", r->com.res_id, atomic_read(&r->ref_count)); + err = -EBUSY; + break; + + case RES_CQ_ALLOCATED: + if (r->com.state != RES_CQ_HW) + err = -EINVAL; + else if (atomic_read(&r->ref_count)) { + mlx4_sdbg("CQ 0x%x, ref count %d\n", r->com.res_id, atomic_read(&r->ref_count)); + err = -EBUSY; + } + else + err = 0; + break; + + case RES_CQ_HW: + if (r->com.state != RES_CQ_ALLOCATED) + err = -EINVAL; + else + err = 0; + break; + + default: + err = -EINVAL; + } + + if (!err) { + r->com.from_state = r->com.state; + r->com.to_state = 
state; + r->com.state = RES_CQ_BUSY; + if (cq) + *cq = r; + } + } + + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static int srq_res_start_move_to(struct mlx4_dev *dev, int slave, int index, + enum res_cq_states state, struct res_srq **srq) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_srq *r; + int err = 0; + + spin_lock_irq(mlx4_tlock(dev)); + r = radix_tree_lookup(&tracker->res_tree[RES_SRQ], index); + if (!r) + err = -ENOENT; + else if (r->com.owner != slave) + err = -EPERM; + else { + switch (state) { + case RES_SRQ_BUSY: + err = -EINVAL; + break; + + case RES_SRQ_ALLOCATED: + if (r->com.state != RES_SRQ_HW) + err = -EINVAL; + else if (atomic_read(&r->ref_count)) + err = -EBUSY; + break; + + case RES_SRQ_HW: + if (r->com.state != RES_SRQ_ALLOCATED) + err = -EINVAL; + break; + + default: + err = -EINVAL; + } + + if (!err) { + r->com.from_state = r->com.state; + r->com.to_state = state; + r->com.state = RES_SRQ_BUSY; + if (srq) + *srq = r; + } + } + + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +static void res_abort_move(struct mlx4_dev *dev, int slave, + enum mlx4_resource type, int id) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_common *r; + + spin_lock_irq(mlx4_tlock(dev)); + r = radix_tree_lookup(&tracker->res_tree[type], id); + SASSERT(r && (r->owner == slave)); + r->state = r->from_state; + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void res_end_move(struct mlx4_dev *dev, int slave, + enum mlx4_resource type, int id) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_common *r; + + spin_lock_irq(mlx4_tlock(dev)); + r = radix_tree_lookup(&tracker->res_tree[type], id); + SASSERT(r && (r->owner == slave)); + if (!(r && (r->owner == slave))) + mlx4_sdbg("r %p, resource %s, owner %d, id 0x%x\n", r, ResourceType(type), r->owner, id); + r->state = r->to_state; + mlx4_sdbg("%s, id 0x%x, completed move from %d to %d\n", + ResourceType(type), r->res_id, r->from_state, r->to_state); + spin_unlock_irq(mlx4_tlock(dev)); +} + +static int valid_reserved(struct mlx4_dev *dev, int slave, int qpn) +{ + return mlx4_is_qp_reserved(dev, qpn) && (dev->caps.sqp_demux || mlx4_is_guest_proxy(dev, slave, qpn)); +} + +static int fw_reserved(struct mlx4_dev *dev, int qpn) +{ + return qpn < dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW]; +} + +static int qp_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int err; + int count; + int align; + int base; + int qpn; + + switch (op) { + case RES_OP_RESERVE: + mlx4_sdbg("\n"); + count = get_param_l(&in_param); + align = get_param_h(&in_param); + err = __mlx4_qp_reserve_range(dev, count, align, &base); + if (err) { + mlx4_sdbg("failed allocating : count %d, align %d\n", count, align); + return err; + } + + err = add_res_range(dev, slave, base, count, RES_QP, 0, 0); + if (err) { + mlx4_sdbg("failed adding resource range: base 0x%x, count %d\n", base, count); + __mlx4_qp_release_range(dev, base, count); + return err; + } + set_param_l(out_param, base); + mlx4_sdbg("success adding: count %d, base 0x%x\n", count, base); + break; + case RES_OP_MAP_ICM: + mlx4_sdbg("\n"); + qpn = get_param_l(&in_param) & 0x7fffff; + + mlx4_sdbg("qpn 0x%x, orig 0x%x, valid_reserved %d\n", qpn, get_param_l(&in_param), valid_reserved(dev, slave, qpn)); + if 
(valid_reserved(dev, slave, qpn)) { + err = add_res_range(dev, slave, qpn, 1, RES_QP, 0, 0); + if (err) + return err; + } + + err = qp_res_start_move_to(dev, slave, qpn, RES_QP_MAPPED, NULL, 1); + if (err) { + mlx4_sdbg("failed moving qpn 0x%x to %s. err %d\n", + qpn, qp_states_str(RES_QP_MAPPED), err); + SASSERT(!valid_reserved(dev, slave, qpn)); + return err; + } + + if (!fw_reserved(dev, qpn)) { + err = __mlx4_qp_alloc_icm(dev, qpn); + if (err) { + res_abort_move(dev, slave, RES_QP, qpn); + return err; + } + } + + res_end_move(dev, slave, RES_QP, qpn); + break; + + default: + err = -EINVAL; + break; + } + return err; +} + +static int mtt_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int err = -EINVAL; + int base; + int order; + + if (op != RES_OP_RESERVE_AND_MAP && op != RES_OP_RESERVE) { + mlx4_sdbg("invalid opcode %d\n", op); + return err; + } + + order = get_param_l(&in_param); + base = (op == RES_OP_RESERVE) ? + __mlx4_reserve_mtt_range(dev, order) : + __mlx4_alloc_mtt_range(dev, order, MLX4_MR_FLAG_NONE); + + if (base == 0xFFFFFFFF) { + mlx4_sdbg("failed allocating order %d segments\n", order); + return -ENOMEM; + } + + err = add_res_range(dev, slave, base, 1, RES_MTT, order, 0); + if (err) { + mlx4_sdbg("mtt_alloc_res add res range failed\n"); + goto err_mtt_free; + } + + if (op == RES_OP_RESERVE_AND_MAP) { + err = mtt_res_start_move_to(dev, slave, base, + RES_MTT_ALLOCATED); + if (err) + goto err_rem_res; + + res_end_move(dev, slave, RES_MTT, base); + } + + set_param_l(out_param, base); + mlx4_sdbg("alloc mtt: base 0x%x, order %d, reserve only %d\n", base, + order, op == RES_OP_RESERVE); + + return 0; + +err_rem_res: + err = rem_res_range(dev, slave, base, 1, RES_MTT, order); +err_mtt_free: + __mlx4_free_mtt_range(dev, base, order, MLX4_MR_FLAG_NONE); + return err; +} + +static int verify_fmr_index(struct mlx4_dev *dev, int index, int slave) +{ + int size = dev->caps.fmr_num_mpts; + int base = dev->caps.fmr_dmpt_base_idx + slave * size; + + + if (index < base || index >= base + size) + return -EINVAL; + + return 0; +} + +static int mpt_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int err = -EINVAL; + int index; + int id; + struct res_mpt *mpt; + enum mlx4_mr_flags flags; + int fmr_flow; + + switch (op) { + case RES_OP_RESERVE: + mlx4_sdbg("\n"); + flags = get_param_h(&in_param); + fmr_flow = mlx4_fmr_flow(dev, flags); + if (fmr_flow) { + index = get_param_l(&in_param); + mlx4_sdbg("reserve fmr mpt index 0x%x\n", index); + if (verify_fmr_index(dev, index, slave)) { + mlx4_sdbg("verify_fmr_index failed, 0x%x\n", + index); + index = -1; + } + } else + index = __mlx4_mr_reserve(dev); + if (index == -1) { + mlx4_sdbg("failed reserving a MR index, 0x%x\n", index); + break; + } + id = index & mpt_mask(dev); + mlx4_sdbg("alloc mpt index 0x%x, id 0x%x, fmr_flow %d\n", + index, id, fmr_flow); + + err = add_res_range(dev, slave, id, 1, RES_MPT, index, flags); + if (err) { + mlx4_sdbg("failed adding MPT to tracker: id 0x%x\n", id); + if (!fmr_flow) + __mlx4_mr_release(dev, index); + break; + } + set_param_l(out_param, index); + mlx4_sdbg("allocated mpt index 0x%x, flags %d\n", index, flags); + break; + case RES_OP_MAP_ICM: + index = get_param_l(&in_param); + id = index & mpt_mask(dev); + mlx4_sdbg("mpt map index 0x%x, id 0x%x\n", index, id); + + err = mr_res_start_move_to(dev, slave, id, RES_MPT_MAPPED, &mpt); + if (err) { + mlx4_sdbg("failed moving MPT id 0x%x to RES_MPT_MAPPED. 
err %d\n", + id, err); + return err; + } + + fmr_flow = mlx4_fmr_flow(dev, mpt->flags); + mlx4_sdbg("mpt map index %d, fmr flow %d\n", index, id); + if (!fmr_flow) { + err = __mlx4_mr_alloc_icm(dev, mpt->key, + MLX4_MR_FLAG_NONE); + if (err) { + res_abort_move(dev, slave, RES_MPT, id); + return err; + } + } + + res_end_move(dev, slave, RES_MPT, id); + break; + } + return err; +} + +static int cq_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int cqn; + int err; + + switch (op) { + case RES_OP_RESERVE_AND_MAP: + err = __mlx4_cq_alloc_icm(dev, &cqn); + if (err) + break; + + err = add_res_range(dev, slave, cqn, 1, RES_CQ, 0, 0); + SASSERT(!err && err != -ENOMEM); + if (err) { + __mlx4_cq_free_icm(dev, cqn); + break; + } + + set_param_l(out_param, cqn); + break; + + default: + err = -EINVAL; + } + + return err; +} + +static int srq_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int srqn; + int err; + + mlx4_sdbg("\n"); + switch (op) { + case RES_OP_RESERVE_AND_MAP: + mlx4_sdbg("\n"); + err = __mlx4_srq_alloc_icm(dev, &srqn); + if (err) + break; + + mlx4_sdbg("srqn 0x%x\n", srqn); + err = add_res_range(dev, slave, srqn, 1, RES_SRQ, 0, 0); + SASSERT(!err || (err == -ENOMEM)); + if (err) { + __mlx4_srq_free_icm(dev, srqn); + mlx4_sdbg("srqn 0x%x\n", srqn); + break; + } + + mlx4_sdbg("srqn 0x%x allocated number and ICM mapped\n", srqn); + set_param_l(out_param, srqn); + break; + + default: + err = -EINVAL; + } + + return err; +} + +static int counter_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int index; + int err; + + if (op != RES_OP_RESERVE) + return -EINVAL; + + err = __mlx4_counter_alloc(dev, &index); + if (err) + return err; + + err = add_res_range(dev, slave, index, 1, RES_COUNTER, 0, 0); + if (err) + __mlx4_counter_free(dev, index); + else + set_param_l(out_param, index); + + mlx4_sdbg("counter index %d, err %d\n", index, err); + return err; +} + +static int xrcdn_alloc_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int xrcdn; + int err; + + if (op != RES_OP_RESERVE) + return -EINVAL; + + err = __mlx4_xrcd_alloc(dev, &xrcdn); + if (err) + return err; + + err = add_res_range(dev, slave, xrcdn, 1, RES_XRCDN, 0, 0); + if (err) + __mlx4_xrcd_free(dev, xrcdn); + else + set_param_l(out_param, xrcdn); + + return err; +} + +int mlx4_ALLOC_RES_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int alop = vhcr->op_modifier; + + switch (vhcr->in_modifier) { + case RES_QP: + err = qp_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_MTT: + err = mtt_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_MPT: + err = mpt_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_CQ: + err = cq_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_SRQ: + err = srq_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_COUNTER: + err = counter_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_XRCDN: + err = xrcdn_alloc_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, 
&vhcr->out_param); + break; + + default: + err = -EINVAL; + break; + } + + if (err) + mlx4_sdbg("resoruce %s, op %d\n", + ResourceType(vhcr->in_modifier), alop); + + return err; +} + +static int qp_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param) +{ + int err; + int count; + int base; + int qpn; + + switch (op) { + case RES_OP_RESERVE: + mlx4_sdbg("\n"); + base = get_param_l(&in_param) & 0x7fffff; + count = get_param_h(&in_param); + err = rem_res_range(dev, slave, base, count, RES_QP, 0); + if (err) { + mlx4_sdbg("failed removing resource range, base 0x%x, count %d\n", + base, count); + break; + } + __mlx4_qp_release_range(dev, base, count); + mlx4_sdbg("success removing: base 0x%x, count %d\n", base, count); + break; + case RES_OP_MAP_ICM: + mlx4_sdbg("\n"); + qpn = get_param_l(&in_param) & 0x7fffff; + err = qp_res_start_move_to(dev, slave, qpn, RES_QP_RESERVED, NULL, 0); + if (err) { + mlx4_sdbg("failed moving qpn 0x%x to %s. err %d\n", + qpn, qp_states_str(RES_QP_RESERVED), err); + return err; + } + + if (!fw_reserved(dev, qpn)) + __mlx4_qp_free_icm(dev, qpn); + + res_end_move(dev, slave, RES_QP, qpn); + + if (valid_reserved(dev, slave, qpn)) { + err = rem_res_range(dev, slave, qpn, 1, RES_QP, 0); + SASSERT(!err); + } + break; + default: + err = -EINVAL; + break; + } + return err; +} + +static int mtt_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int err = -EINVAL; + int base; + int order; + + mlx4_sdbg("\n"); + if (op != RES_OP_RESERVE_AND_MAP && op != RES_OP_RESERVE) { + mlx4_sdbg("invalid opcode %d\n", op); + return err; + } + + mlx4_sdbg("\n"); + base = get_param_l(&in_param); + order = get_param_h(&in_param); + err = rem_res_range(dev, slave, base, 1, RES_MTT, order); + if (err) + return err; + + if (op == RES_OP_RESERVE_AND_MAP) + __mlx4_free_mtt_range(dev, base, order, MLX4_MR_FLAG_NONE); + else /* op == RES_OP_RESERVE */ + __mlx4_free_mtt_reserved_range(dev, base, order); + + if (!err) + mlx4_sdbg("base 0x%x, order %d\n", base, order); + else + mlx4_sdbg("base 0x%x, order %d, err %d\n", base, order, err); + + return err; +} + +static int mpt_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param) +{ + int err = -EINVAL; + u32 index; + int id; + struct res_mpt *mpt; + enum mlx4_mr_flags flags; + int fmr_flow; + + switch (op) { + case RES_OP_RESERVE: + index = get_param_l(&in_param); + flags = get_param_h(&in_param); + fmr_flow = mlx4_fmr_flow(dev, flags); + if (fmr_flow) { + mlx4_sdbg("free fmr mpt index 0x%x\n", index); + if (verify_fmr_index(dev, index, slave)) { + mlx4_sdbg("verify_fmr_index failed, 0x%x\n", + index); + index = -1; + } + } + id = index & mpt_mask(dev); + mlx4_sdbg("free mpt index 0x%x, id 0x%x, fmr_flow %d\n", + index, id, fmr_flow); + err = get_res(dev, slave, id, RES_MPT, &mpt); + if (err) { + mlx4_sdbg("id 0x%x, err %d\n", id, err); + break; + } + index = mpt->key; + put_res(dev, slave, id, RES_MPT); + + err = rem_res_range(dev, slave, id, 1, RES_MPT, 0); + if (err) { + mlx4_sdbg("failed removing RES_MPT at id 0x%x, err %d\n", id, err); + break; + } + if (!fmr_flow) + __mlx4_mr_release(dev, index); + break; + case RES_OP_MAP_ICM: + index = get_param_l(&in_param); + mlx4_sdbg("index 0x%x\n", index); + id = index & mpt_mask(dev); + err = mr_res_start_move_to(dev, slave, id, RES_MPT_RESERVED, &mpt); + if (err) { + mlx4_sdbg("failed moving mr 0x%x to RES_MPT_RESERVED. 
err %d\n", + id, err); + return err; + } + fmr_flow = mlx4_fmr_flow(dev, mpt->flags); + if (!fmr_flow) + __mlx4_mr_free_icm(dev, mpt->key, + MLX4_MR_FLAG_NONE); + res_end_move(dev, slave, RES_MPT, id); + return err; + break; + default: + err = -EINVAL; + break; + } + return err; +} + +static int cq_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int cqn; + int err; + + switch (op) { + case RES_OP_RESERVE_AND_MAP: + mlx4_sdbg("\n"); + cqn = get_param_l(&in_param); + err = rem_res_range(dev, slave, cqn, 1, RES_CQ, 0); + if (err) + break; + + __mlx4_cq_free_icm(dev, cqn); + break; + + default: + err = -EINVAL; + break; + } + + return err; +} + +static int srq_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int srqn; + int err; + + mlx4_sdbg("\n"); + switch (op) { + case RES_OP_RESERVE_AND_MAP: + mlx4_sdbg("\n"); + srqn = get_param_l(&in_param); + err = rem_res_range(dev, slave, srqn, 1, RES_SRQ, 0); + if (err) + break; + + __mlx4_srq_free_icm(dev, srqn); + break; + + default: + err = -EINVAL; + break; + } + + return err; +} + +static int counter_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int index; + int err; + + if (op != RES_OP_RESERVE) { + mlx4_sdbg("invalid op %d\n", op); + return -EINVAL; + } + + index = get_param_l(&in_param); + err = rem_res_range(dev, slave, index, 1, RES_COUNTER, 0); + if (err) { + mlx4_sdbg("failed freeing index %d, err %d\n", index, err); + return err; + } + + mlx4_sdbg("counter index %d\n", index); + __mlx4_counter_free(dev, index); + + return err; +} + +static int xrcdn_free_res(struct mlx4_dev *dev, int slave, int op, int cmd, + u64 in_param, u64 *out_param) +{ + int xrcdn; + int err; + + if (op != RES_OP_RESERVE) { + mlx4_sdbg("invalid op %d\n", op); + return -EINVAL; + } + + xrcdn = get_param_l(&in_param); + err = rem_res_range(dev, slave, xrcdn, 1, RES_XRCDN, 0); + if (err) { + mlx4_sdbg("failed freeing xrcdn %d, err %d\n", xrcdn, err); + return err; + } + + __mlx4_xrcd_free(dev, xrcdn); + + return err; +} + +int mlx4_FREE_RES_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err = -EINVAL; + int alop = vhcr->op_modifier; + + switch (vhcr->in_modifier) { + case RES_QP: + err = qp_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param); + break; + + case RES_MTT: + err = mtt_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_MPT: + err = mpt_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param); + break; + + case RES_CQ: + err = cq_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_SRQ: + err = srq_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_COUNTER: + err = counter_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + break; + + case RES_XRCDN: + err = xrcdn_free_res(dev, slave, vhcr->op_modifier, alop, + vhcr->in_param, &vhcr->out_param); + + default: + break; + } + return err; +} + +/* ugly but other choices are uglier */ +static int mr_phys_mpt(struct mlx4_mpt_entry *mpt) +{ + return (be32_to_cpu(mpt->flags) >> 9) & 1; +} + +static int mr_get_mtt_seg(struct mlx4_mpt_entry *mpt) +{ + return (int)be64_to_cpu(mpt->mtt_seg) & 0xfffffff8; +} + +static int mr_get_mtt_size(struct 
mlx4_mpt_entry *mpt) +{ + return be32_to_cpu(mpt->mtt_sz); +} + +static int mr_get_pdn(struct mlx4_mpt_entry *mpt) +{ + return be32_to_cpu(mpt->pd_flags) & 0xffffff; +} + +static int qp_get_mtt_seg(struct mlx4_qp_context *qpc) +{ + SASSERT(!qpc->mtt_base_addr_h); + return be32_to_cpu(qpc->mtt_base_addr_l) & 0xfffffff8; +} + +static int srq_get_mtt_seg(struct mlx4_srq_context *srqc) +{ + SASSERT(!srqc->mtt_base_addr_h); + return be32_to_cpu(srqc->mtt_base_addr_l) & 0xfffffff8; +} + +static int qp_get_mtt_size(struct mlx4_qp_context *qpc) +{ + int page_shift = (qpc->log_page_size & 0x3f) + 12; + int log_sq_size = (qpc->sq_size_stride >> 3) & 0xf; + int log_sq_sride = qpc->sq_size_stride & 7; + int log_rq_size = (qpc->rq_size_stride >> 3) & 0xf; + int log_rq_stride = qpc->rq_size_stride & 7; + int srq = (be32_to_cpu(qpc->srqn) >> 24) & 1; + int rss = (be32_to_cpu(qpc->flags) >> 13) & 1; + int xrc = (be32_to_cpu(qpc->local_qpn) >> 23) & 1; + int sq_size; + int rq_size; + int total_pages; + int total_mem; + int page_offset = (be32_to_cpu(qpc->params2) >> 6 ) & 0x3f; + + sq_size = 1 << (log_sq_size + log_sq_sride + 4); + rq_size = (srq | rss | xrc) ? 0 : (1 << (log_rq_size + log_rq_stride + 4)); + total_mem = sq_size + rq_size; + total_pages = roundup_pow_of_two((total_mem + (page_offset << 6)) >> page_shift); + + return total_pages; +} + +static int qp_get_pdn(struct mlx4_qp_context *qpc) +{ + return be32_to_cpu(qpc->pd) & 0xffffff; +} + +static int pdn2slave(int pdn) +{ + return (pdn >> NOT_MASKED_PD_BITS) - 1; +} + +static int check_mtt_range(struct mlx4_dev *dev, int slave, int start, + int size, struct res_mtt *mtt) +{ + int res_start = mtt->com.res_id * dev->caps.mtts_per_seg; + int res_size = (1 << mtt->order) * dev->caps.mtts_per_seg; + + if (start < res_start || start + size > res_start + res_size) { + SASSERT(slave == mtt->com.owner); + return -EPERM; + } + + return 0; +} + +int mlx4_SW2HW_MPT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int index = vhcr->in_modifier; + struct res_mtt *mtt; + struct res_mpt *mpt; + int mtt_base = (mr_get_mtt_seg(inbox->buf) / dev->caps.mtt_entry_sz) * dev->caps.mtts_per_seg; + int phys; + int id; + int fmr_flow; + + id = index & mpt_mask(dev); + err = mr_res_start_move_to(dev, slave, id, RES_MPT_HW, &mpt); + if (err) { + mlx4_sdbg("failed moving MPT id 0x%x to RES_MPT_HW. 
err %d\n", + id, err); + return err; + } + + fmr_flow = mlx4_fmr_flow(dev, mpt->flags); + + + phys = mr_phys_mpt(inbox->buf); + if (!(phys || fmr_flow)) { + err = get_res(dev, slave, mtt_base / dev->caps.mtts_per_seg, RES_MTT, &mtt); + if (err) { + mlx4_sdbg("mlx4_SW2HW_MPT_wrapper failed\n"); + goto ex_abort; + } + + err = check_mtt_range(dev, slave, mtt_base, mr_get_mtt_size(inbox->buf), mtt); + if (err) + goto ex_put; + + mpt->mtt = mtt; + } + + if (pdn2slave(mr_get_pdn(inbox->buf)) != slave) { + err = -EPERM; + goto ex_put; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_put; + + if (!(phys || fmr_flow)) { + atomic_inc(&mtt->ref_count); + mlx4_sdbg("base 0x%x, count %d\n", mtt->com.res_id, atomic_read(&mtt->ref_count)); + put_res(dev, slave, mtt->com.res_id, RES_MTT); + } + + res_end_move(dev, slave, RES_MPT, id); + mlx4_sdbg("id 0x%x, phys %d\n", id, phys); + + return 0; + +ex_put: + if (!(phys || fmr_flow)) + put_res(dev, slave, mtt->com.res_id, RES_MTT); +ex_abort: + res_abort_move(dev, slave, RES_MPT, id); + + return err; +} + +int mlx4_HW2SW_MPT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int index = vhcr->in_modifier; + struct res_mpt *mpt; + int id; + + id = index & mpt_mask(dev); + err = mr_res_start_move_to(dev, slave, id, RES_MPT_MAPPED, &mpt); + if (err) { + mlx4_sdbg("failed moving MPT id 0x%x to RES_MPT_MAPPED. err %d\n", + id, err); + return err; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) { + mlx4_sdbg("id 0x%x, err %d\n", id, err); + goto ex_abort; + } + + if (mpt->mtt) { + atomic_dec(&mpt->mtt->ref_count); + mlx4_sdbg("base 0x%x, count %d\n", mpt->mtt->com.res_id, atomic_read(&mpt->mtt->ref_count)); + } + + res_end_move(dev, slave, RES_MPT, id); + mlx4_sdbg("id 0x%x, phys %d\n", id, !!mpt->mtt); + + return 0; + +ex_abort: + res_abort_move(dev, slave, RES_MPT, id); + + return err; +} + +int mlx4_QUERY_MPT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int index = vhcr->in_modifier; + struct res_mpt *mpt; + int id; + + id = index & mpt_mask(dev); + err = get_res(dev, slave, id, RES_MPT, &mpt); + if (err) + return err; + + if (mpt->com.from_state != RES_MPT_HW) { + err = -EBUSY; + goto out; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + +out: + put_res(dev, slave, id, RES_MPT); + return err; +} + +static int qp_get_rcqn(struct mlx4_qp_context *qpc) +{ + return be32_to_cpu(qpc->cqn_recv) & 0xffffff; +} + +static int qp_get_scqn(struct mlx4_qp_context *qpc) +{ + return be32_to_cpu(qpc->cqn_send) & 0xffffff; +} + +static u32 qp_get_srqn(struct mlx4_qp_context *qpc) +{ + return be32_to_cpu(qpc->srqn) & 0x1ffffff; +} + +static int srq_get_cqn(struct mlx4_srq_context *srqc) +{ + return be32_to_cpu(srqc->pg_offset_cqn) & 0xffffff; +} + +static void adjust_proxy_tun_qkey(struct mlx4_dev *dev, struct mlx4_vhcr *vhcr, + struct mlx4_qp_context *context) +{ + u32 qpn = vhcr->in_modifier & 0xffffff; + u32 qkey = 0; + + if (mlx4_get_parav_qkey(dev, qpn, &qkey)) + return; + + /* adjust qkey in qp context */ + context->qkey = cpu_to_be32(qkey); +} + + +int mlx4_RST2INIT_QP_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct 
mlx4_cmd_info *cmd) +{ + int err; + int qpn = vhcr->in_modifier & 0x7fffff; + struct res_mtt *mtt; + struct res_qp *qp; + struct mlx4_qp_context *qpc = inbox->buf + 8; + int mtt_base = (qp_get_mtt_seg(qpc) / dev->caps.mtt_entry_sz) * dev->caps.mtts_per_seg; + int mtt_size = qp_get_mtt_size(qpc); + struct res_cq *rcq; + struct res_cq *scq; + int rcqn = qp_get_rcqn(qpc); + int scqn = qp_get_scqn(qpc); + u32 srqn = qp_get_srqn(qpc) & 0xffffff; + int use_srq = (qp_get_srqn(qpc) >> 24) & 1; + struct res_srq *srq; + int local_qpn = be32_to_cpu(qpc->local_qpn) & 0xffffff; + + err = qp_res_start_move_to(dev, slave, qpn, RES_QP_HW, &qp, 0); + if (err) { + mlx4_sdbg("failed moving QP qpn 0x%x to RES_QP_HW. err %d\n", + qpn, err); + return err; + } + qp->local_qpn = local_qpn; + + err = get_res(dev, slave, mtt_base / dev->caps.mtts_per_seg, RES_MTT, &mtt); + if (err) { + mlx4_sdbg("base 0x%x, size %d\n", mtt_base, mtt_size); + goto ex_abort; + } + + err = check_mtt_range(dev, slave, mtt_base, mtt_size, mtt); + if (err) { + mlx4_sdbg("mtt_base 0x%x, mtt_size %d\n", mtt_base, mtt_size); + goto ex_put_mtt; + } + + if (pdn2slave(qp_get_pdn(qpc)) != slave) { + mlx4_sdbg("slave pdn 0x%x\n", pdn2slave(qp_get_pdn(qpc))); + err = -EPERM; + goto ex_put_mtt; + } + + err = get_res(dev, slave, rcqn, RES_CQ, &rcq); + if (err) { + mlx4_sdbg("cqn 0x%x\n", rcqn); + goto ex_put_mtt; + } + + if (scqn != rcqn) { + err = get_res(dev, slave, scqn, RES_CQ, &scq); + if (err) { + mlx4_sdbg("cqn 0x%x\n", scqn); + goto ex_put_rcq; + } + } else + scq = rcq; + + mlx4_sdbg("qpn 0x%x, srqn 0x%x\n", qpn, srqn); + if (use_srq) { + err = get_res(dev, slave, srqn, RES_SRQ, &srq); + if (err) { + mlx4_sdbg("srqn 0x%x, err %d\n", srqn, err); + goto ex_put_scq; + } + mlx4_sdbg("srqn 0x%x\n", srqn); + } + + adjust_proxy_tun_qkey(dev, vhcr, qpc); + update_pkey_index(dev, slave, inbox); + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) { + mlx4_sdbg("qpn 0x%x, err %d\n", qpn, err); + goto ex_put_srq; + } + mlx4_sdbg("qpn 0x%x, successfully move to INIT\n", qpn); + + atomic_inc(&mtt->ref_count); + mlx4_sdbg("base 0x%x, count %d\n", mtt->com.res_id, atomic_read(&mtt->ref_count)); + qp->mtt = mtt; + + atomic_inc(&rcq->ref_count); + mlx4_sdbg("CQ 0x%x, ref count %d\n", rcq->com.res_id, atomic_read(&rcq->ref_count)); + qp->rcq = rcq; + atomic_inc(&scq->ref_count); + mlx4_sdbg("CQ 0x%x, ref count %d\n", scq->com.res_id, atomic_read(&scq->ref_count)); + qp->scq = scq; + + if (scqn != rcqn) + put_res(dev, slave, scqn, RES_CQ); + + if (use_srq) { + atomic_inc(&srq->ref_count); + put_res(dev, slave, srqn, RES_SRQ); + qp->srq = srq; + } + put_res(dev, slave, rcqn, RES_CQ); + put_res(dev, slave, mtt_base / dev->caps.mtts_per_seg, RES_MTT); + res_end_move(dev, slave, RES_QP, qpn); + + return 0; + +ex_put_srq: + if (use_srq) + put_res(dev, slave, srqn, RES_SRQ); +ex_put_scq: + if (scqn != rcqn) + put_res(dev, slave, scqn, RES_CQ); +ex_put_rcq: + put_res(dev, slave, rcqn, RES_CQ); +ex_put_mtt: + put_res(dev, slave, mtt_base / dev->caps.mtts_per_seg, RES_MTT); +ex_abort: + res_abort_move(dev, slave, RES_QP, qpn); + + return err; +} + +static int eq_get_mtt_seg(struct mlx4_eq_context *eqc) +{ + SASSERT(!eqc->mtt_base_addr_h); + return (be32_to_cpu(eqc->mtt_base_addr_l) & 0xfffffff8); +} + +static int eq_get_mtt_size(struct mlx4_eq_context *eqc) +{ + int log_eq_size = eqc->log_eq_size & 0x1f; + int page_shift = (eqc->log_page_size & 0x3f) + 12; + + if (log_eq_size + 5 < page_shift) + return 1; + + return 1 << (log_eq_size + 5 - 
page_shift); +} + +static int cq_get_mtt_seg(struct mlx4_cq_context *cqc) +{ + return be32_to_cpu(cqc->mtt_base_addr_l) & 0xfffffff8; +} + +static int cq_get_mtt_size(struct mlx4_cq_context *cqc) +{ + int log_cq_size = (be32_to_cpu(cqc->logsize_usrpage) >> 24) & 0x1f; + int page_shift = (cqc->log_page_size & 0x3f) + 12; + + if (log_cq_size + 5 < page_shift) + return 1; + + return 1 << (log_cq_size + 5 - page_shift); +} + +int mlx4_SW2HW_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int eqn = vhcr->in_modifier; + int res_id = (slave << 8) | eqn; + struct mlx4_eq_context *eqc = inbox->buf; + int mtt_base = (eq_get_mtt_seg(eqc) / dev->caps.mtt_entry_sz) * dev->caps.mtts_per_seg; + int mtt_size = eq_get_mtt_size(eqc); + struct res_eq *eq; + struct res_mtt *mtt; + + err = add_res_range(dev, slave, res_id, 1, RES_EQ, 0, 0); + if (err) { + mlx4_sdbg("failed adding EQ to tracker: eqn 0x%x\n", eqn); + return err; + } + + mlx4_sdbg("success adding EQ 0x%x (id 0x%x) to tracker. err %d\n", + eqn, res_id, err); + + err = eq_res_start_move_to(dev, slave, res_id, RES_EQ_HW, &eq); + if (err) { + mlx4_sdbg("failed moving EQ 0x%x (id 0x%x) to RES_EQ_HW. err %d\n", + eqn, res_id, err); + goto out_add; + } + + err = get_res(dev, slave, mtt_base / dev->caps.mtts_per_seg, RES_MTT, &mtt); + if (err) { + mlx4_sdbg("mtt_base 0x%x\n", mtt_base / dev->caps.mtts_per_seg); + goto out_move; + } + + err = check_mtt_range(dev, slave, mtt_base, mtt_size, mtt); + if (err) { + mlx4_sdbg("mtt_base 0x%x, mtt_size %d\n", mtt_base, mtt_size); + goto out_put; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) { + mlx4_sdbg("failed moving EQ 0x%x to RES_EQ_HW. 
err %d\n", + eqn, err); + goto out_put; + } + + atomic_inc(&mtt->ref_count); + mlx4_sdbg("base 0x%x, count %d\n", mtt->com.res_id, atomic_read(&mtt->ref_count)); + eq->mtt = mtt; + put_res(dev, slave, mtt->com.res_id, RES_MTT); + res_end_move(dev, slave, RES_EQ, res_id); + return 0; + +out_put: + put_res(dev, slave, mtt->com.res_id, RES_MTT); +out_move: + res_abort_move(dev, slave, RES_EQ, res_id); +out_add: + rem_res_range(dev, slave, res_id, 1, RES_EQ, 0); + return err; +} + +static int get_containing_mtt(struct mlx4_dev *dev, int slave, int start, int len, struct res_mtt **res) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct res_mtt *mtt; + int err = -EINVAL; + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry(mtt, &tracker->slave_list[slave].res_list[RES_MTT], com.list) { + if (!check_mtt_range(dev, slave, start, len, mtt)) { + mlx4_sdbg("owner %d, start 0x%x, order %d\n", mtt->com.owner, mtt->com.res_id, mtt->order); + *res = mtt; + SASSERT(mtt->com.state != RES_MTT_BUSY); + mtt->com.from_state = mtt->com.state; + mtt->com.state = RES_MTT_BUSY; + err = 0; + break; + } + } + spin_unlock_irq(mlx4_tlock(dev)); + + return err; +} + +int mlx4_WRITE_MTT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_mtt mtt; + u64 *page_list = inbox->buf; + int i; + struct res_mtt *rmtt = NULL; + int start = be64_to_cpu(page_list[0]); + int npages = vhcr->in_modifier; + int err; + + err = get_containing_mtt(dev, slave, start, npages, &rmtt); + if (err) { + mlx4_sdbg("start 0x%x, npages %d\n", start, npages); + return err; + } + + /* Call the SW implementation of write_mtt: + * - Prepare a dummy mtt struct + * - Translate inbox contents to simple addresses in host endianess */ + mtt.first_seg = 0; // TBD this is broken but I don't handle it since we don't really use it + mtt.order = 0; + mtt.page_shift = 0; + for (i = 0; i < npages; ++i) + page_list[i + 2] = be64_to_cpu(page_list[i + 2]) & ~1ULL; + err = __mlx4_write_mtt(dev, &mtt, be64_to_cpu(page_list[0]), npages, + page_list + 2); + + mlx4_sdbg("err %d\n", err); + SASSERT(rmtt); + put_res(dev, slave, rmtt->com.res_id, RES_MTT); + + return err; +} + +int mlx4_HW2SW_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int eqn = vhcr->in_modifier; + int res_id = eqn | (slave << 8); + struct res_eq *eq; + int err; + + err = eq_res_start_move_to(dev, slave, res_id, RES_EQ_RESERVED, &eq); + if (err) { + mlx4_sdbg("failed moving EQ eqn 0x%x to RES_EQ_RESERVED. 
err %d\n", + eqn, err); + return err; + } + + err = get_res(dev, slave, eq->mtt->com.res_id, RES_MTT, NULL); + if (err) + goto ex_abort; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_put; + + atomic_dec(&eq->mtt->ref_count); + mlx4_sdbg("base 0x%x, count %d\n", eq->mtt->com.res_id, atomic_read(&eq->mtt->ref_count)); + + put_res(dev, slave, eq->mtt->com.res_id, RES_MTT); + res_end_move(dev, slave, RES_EQ, res_id); + rem_res_range(dev, slave, res_id, 1, RES_EQ, 0); + + return 0; + +ex_put: + put_res(dev, slave, eq->mtt->com.res_id, RES_MTT); +ex_abort: + res_abort_move(dev, slave, RES_EQ, res_id); + + return err; +} + +int mlx4_GEN_EQE(struct mlx4_dev *dev, int slave, struct mlx4_eqe *eqe) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_slave_event_eq_info *event_eq; + struct mlx4_cmd_mailbox *mailbox; + u32 in_modifier = 0; + int err; + int res_id; + struct res_eq *req; + + if (!priv->mfunc.master.slave_state) + return -EINVAL; + + event_eq = &priv->mfunc.master.slave_state[slave].event_eq; + + if (!event_eq->use_int) + return 0; + + /* Create the event only if the slave is registered */ + if ((event_eq->event_type & (1 << eqe->type)) == 0) + return 0; + + mutex_lock(&priv->mfunc.master.gen_eqe_mutex[slave]); + res_id = (slave << 8) | event_eq->eqn; + err = get_res(dev, slave, res_id, RES_EQ, &req); + if (err) + goto unlock; + + if (req->com.from_state != RES_EQ_HW) { + err = -EINVAL; + goto put; + } + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + err = PTR_ERR(mailbox); + goto put; + } + + if (eqe->type == MLX4_EVENT_TYPE_CMD) { + ++event_eq->token; + eqe->event.cmd.token = cpu_to_be16(event_eq->token); + } + + memcpy(mailbox->buf, (u8 *) eqe, 28); + + in_modifier = (slave & 0xff) | ((event_eq->eqn & 0xff) << 16); + + err = mlx4_cmd(dev, mailbox->dma, in_modifier, 0, + MLX4_CMD_GEN_EQE, MLX4_CMD_TIME_CLASS_B, 1); + + put_res(dev, slave, res_id, RES_EQ); + mutex_unlock(&priv->mfunc.master.gen_eqe_mutex[slave]); + mlx4_free_cmd_mailbox(dev, mailbox); + return err; + +put: + mlx4_sdbg("\n"); + put_res(dev, slave, res_id, RES_EQ); + +unlock: + mlx4_sdbg("\n"); + mutex_unlock(&priv->mfunc.master.gen_eqe_mutex[slave]); + return err; +} + +int mlx4_QUERY_EQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int eqn = vhcr->in_modifier; + int res_id = eqn | (slave << 8); + struct res_eq *eq; + int err; + + err = get_res(dev, slave, res_id, RES_EQ, &eq); + if (err) + return err; + + if (eq->com.from_state != RES_EQ_HW) { + err = -EINVAL; + goto ex_put; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + +ex_put: + put_res(dev, slave, res_id, RES_EQ); + return err; +} + +int mlx4_SW2HW_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int cqn = vhcr->in_modifier; + struct mlx4_cq_context *cqc = inbox->buf; + int mtt_base = (cq_get_mtt_seg(cqc) / dev->caps.mtt_entry_sz) * dev->caps.mtts_per_seg; + struct res_cq *cq; + struct res_mtt *mtt; + + err = cq_res_start_move_to(dev, slave, cqn, RES_CQ_HW, &cq); + if (err) { + mlx4_sdbg("failed moving CQ 0x%x to RES_CQ_HW. 
err %d\n", + cqn, err); + return err; + } + + err = get_res(dev, slave, mtt_base / dev->caps.mtts_per_seg, RES_MTT, &mtt); + if (err) { + mlx4_sdbg("\n"); + goto out_move; + } + + err = check_mtt_range(dev, slave, mtt_base, cq_get_mtt_size(cqc), mtt); + if (err) { + mlx4_sdbg("CQ mtt base 0x%x, CQ mtt size %d, mtt.base 0x%x, mtt.size %d\n", + mtt_base, cq_get_mtt_size(cqc), + mtt->com.res_id * dev->caps.mtts_per_seg, (1 << mtt->order) * dev->caps.mtts_per_seg); + goto out_put; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) { + mlx4_sdbg("failed moving CQ 0x%x to RES_CQ_HW. err %d\n", + cqn, err); + goto out_put; + } + + atomic_inc(&mtt->ref_count); + mlx4_sdbg("cqn 0x%x, mtt_base 0x%x, count %d\n", cqn, mtt->com.res_id, atomic_read(&mtt->ref_count)); + cq->mtt = mtt; + put_res(dev, slave, mtt->com.res_id, RES_MTT); + res_end_move(dev, slave, RES_CQ, cqn); + return 0; + +out_put: + put_res(dev, slave, mtt->com.res_id, RES_MTT); +out_move: + res_abort_move(dev, slave, RES_CQ, cqn); + return err; +} + +int mlx4_HW2SW_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int cqn = vhcr->in_modifier; + struct res_cq *cq; + + err = cq_res_start_move_to(dev, slave, cqn, RES_CQ_ALLOCATED, &cq); + if (err) { + mlx4_sdbg("failed moving CQ 0x%x to RES_CQ_ALLOCATED. err %d\n", + cqn, err); + return err; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) { + mlx4_sdbg("failed moving CQ 0x%x to RES_CQ_ALLOCATED. err %d\n", + cqn, err); + goto out_move; + } + + atomic_dec(&cq->mtt->ref_count); + mlx4_sdbg("base 0x%x, count %d\n", cq->mtt->com.res_id, atomic_read(&cq->mtt->ref_count)); + mlx4_sdbg("CQ 0x%x, ref count %d\n", cq->com.res_id, atomic_read(&cq->ref_count)); + res_end_move(dev, slave, RES_CQ, cqn); + return 0; + +out_move: + res_abort_move(dev, slave, RES_CQ, cqn); + return err; +} + +int mlx4_QUERY_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int cqn = vhcr->in_modifier; + struct res_cq *cq; + int err; + + err = get_res(dev, slave, cqn, RES_CQ, &cq); + if (err) + return err; + + if (cq->com.from_state != RES_CQ_HW) + goto ex_put; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + mlx4_sdbg("query_cq failed cqn 0x%x. 
err %d\n", + cqn, err); + +ex_put: + put_res(dev, slave, cqn, RES_CQ); + + return err; +} + +static int handle_resize(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd, + struct res_cq *cq) +{ + int cqn = vhcr->in_modifier; + int err; + struct res_mtt *orig_mtt; + struct res_mtt *mtt; + struct mlx4_cq_context *cqc = inbox->buf; + int mtt_base = (cq_get_mtt_seg(cqc) / dev->caps.mtt_entry_sz) * dev->caps.mtts_per_seg; + + err = get_res(dev, slave, cq->mtt->com.res_id, RES_MTT, &orig_mtt); + SASSERT(!err); + if (err) + return err; + + SASSERT(orig_mtt == cq->mtt); + if (orig_mtt != cq->mtt) { + err = -EINVAL; + goto ex_put; + } + + err = get_res(dev, slave, mtt_base / dev->caps.mtts_per_seg, RES_MTT, &mtt); + if (err) { + mlx4_sdbg("cqn 0x%x, mtt_base 0x%x\n", + cqn, mtt_base / dev->caps.mtts_per_seg); + goto ex_put; + } + + err = check_mtt_range(dev, slave, mtt_base, cq_get_mtt_size(cqc), mtt); + if (err) { + mlx4_sdbg("CQ mtt base 0x%x, CQ mtt size %d, mtt.base 0x%x, mtt.size %d\n", + mtt_base, cq_get_mtt_size(cqc), mtt->com.res_id * dev->caps.mtts_per_seg, + (1 << mtt->order) * dev->caps.mtts_per_seg); + goto ex_put1; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) { + mlx4_sdbg("modify cq failed cqn 0x%x. err %d\n", + cqn, err); + goto ex_put1; + } + + atomic_dec(&orig_mtt->ref_count); + put_res(dev, slave, orig_mtt->com.res_id, RES_MTT); + atomic_inc(&mtt->ref_count); + cq->mtt = mtt; + put_res(dev, slave, mtt->com.res_id, RES_MTT); + return 0; + +ex_put1: + put_res(dev, slave, mtt->com.res_id, RES_MTT); +ex_put: + put_res(dev, slave, orig_mtt->com.res_id, RES_MTT); + + return err; + +} + +int mlx4_MODIFY_CQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int cqn = vhcr->in_modifier; + struct res_cq *cq; + int err; + + err = get_res(dev, slave, cqn, RES_CQ, &cq); + if (err) + return err; + + if (cq->com.from_state != RES_CQ_HW) + goto ex_put; + + if (vhcr->op_modifier == 0) { + mlx4_sdbg("resize cqn 0x%x\n", cqn); + err = handle_resize(dev, slave, vhcr, inbox, outbox, cmd, cq); + mlx4_sdbg("resize cqn 0x%x failed\n", cqn); + goto ex_put; + } + + mlx4_sdbg("modify cqn 0x%x, opmod %d\n", cqn, vhcr->op_modifier); + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + mlx4_sdbg("modify cq failed cqn 0x%x. err %d\n", + cqn, err); + +ex_put: + put_res(dev, slave, cqn, RES_CQ); + + return err; +} + +static int srq_get_pdn(struct mlx4_srq_context *srqc) +{ + return be32_to_cpu(srqc->pd) & 0xffffff; +} + +static int srq_get_mtt_size(struct mlx4_srq_context *srqc) +{ + int log_srq_size = (be32_to_cpu(srqc->state_logsize_srqn) >> 24) & 0xf; + int log_rq_stride = srqc->logstride & 7; +/* + TBD how to use in calcualtions? 
+ int page_offset = be32_to_cpu(srqc->pg_offset_cqn) >> 26; +*/ + int page_shift = (srqc->log_page_size & 0x3f) + 12; + +/* + SASSERT(!page_offset); +*/ + + if (log_srq_size + log_rq_stride + 4 < page_shift) + return 1; + + return 1 << (log_srq_size + log_rq_stride + 4 - page_shift); + +} + +int mlx4_SW2HW_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int srqn = vhcr->in_modifier; + struct res_mtt *mtt; + struct res_srq *srq; + struct mlx4_srq_context *srqc = inbox->buf; + int mtt_base = (srq_get_mtt_seg(srqc) / dev->caps.mtt_entry_sz) * dev->caps.mtts_per_seg; + struct res_cq *cq; + int cqn = srq_get_cqn(srqc); + + mlx4_sdbg("srqn 0x%x\n", srqn); + if (srqn != (be32_to_cpu(srqc->state_logsize_srqn) & 0xffffff)) { + mlx4_sdbg("\n"); + return -EINVAL; + } + + mlx4_sdbg("srqn 0x%x\n", srqn); + err = srq_res_start_move_to(dev, slave, srqn, RES_SRQ_HW, &srq); + if (err) { + mlx4_sdbg("failed moving SRQ 0x%x to RES_SRQ_HW. err %d\n", + srqn, err); + return err; + } + + mlx4_sdbg("srqn 0x%x\n", srqn); + err = get_res(dev, slave, mtt_base / dev->caps.mtts_per_seg, RES_MTT, &mtt); + if (err) { + mlx4_sdbg("mtt_base 0x%x\n", mtt_base / dev->caps.mtts_per_seg); + goto ex_abort; + } + + err = check_mtt_range(dev, slave, mtt_base, srq_get_mtt_size(srqc), mtt); + if (err) { + mlx4_sdbg("\n"); + goto ex_put_mtt; + } + + if (pdn2slave(srq_get_pdn(srqc)) != slave) { + mlx4_sdbg("\n"); + err = -EPERM; + goto ex_put_mtt; + } + + if (cqn) { + mlx4_sdbg("srqn 0x%x used for xrc, cqn 0x%x\n", srqn, cqn); + err = get_res(dev, slave, cqn, RES_CQ, &cq); + if (err) { + mlx4_sdbg("cqn 0x%x\n", cqn); + goto ex_put_mtt; + } + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) { + mlx4_sdbg("err %d\n", err); + goto ex_put_cq; + } + + atomic_inc(&mtt->ref_count); + mlx4_sdbg("base 0x%x, count %d\n", mtt->com.res_id, atomic_read(&mtt->ref_count)); + srq->mtt = mtt; + + if (cqn) { + atomic_inc(&cq->ref_count); + mlx4_sdbg("CQ 0x%x, ref count %d\n", cq->com.res_id, atomic_read(&cq->ref_count)); + srq->cq = cq; + put_res(dev, slave, cq->com.res_id, RES_CQ); + } + + put_res(dev, slave, mtt->com.res_id, RES_MTT); + res_end_move(dev, slave, RES_SRQ, srqn); + mlx4_sdbg("srqn 0x%x\n", srqn); + + return 0; + +ex_put_cq: + if (cqn) + put_res(dev, slave, cqn, RES_CQ); +ex_put_mtt: + put_res(dev, slave, mtt->com.res_id, RES_MTT); +ex_abort: + res_abort_move(dev, slave, RES_SRQ, srqn); + + return err; +} + +int mlx4_HW2SW_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int srqn = vhcr->in_modifier; + struct res_srq *srq; + + mlx4_sdbg("srqn 0x%x\n", srqn); + err = srq_res_start_move_to(dev, slave, srqn, RES_SRQ_ALLOCATED, &srq); + if (err) { + mlx4_sdbg("failed moving SRQ 0x%x to RES_SRQ_ALLOCATED. 
err %d\n", + srqn, err); + return err; + } + + mlx4_sdbg("srqn 0x%x\n", srqn); + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) { + mlx4_sdbg("\n"); + goto ex_abort; + } + + + mlx4_sdbg("srqn 0x%x\n", srqn); + atomic_dec(&srq->mtt->ref_count); + mlx4_sdbg("base 0x%x, count %d\n", srq->mtt->com.res_id, atomic_read(&srq->mtt->ref_count)); + if (srq->cq) { + atomic_dec(&srq->cq->ref_count); + mlx4_sdbg("CQ 0x%x, ref count %d\n", srq->cq->com.res_id, atomic_read(&srq->cq->ref_count)); + } + + mlx4_sdbg("srqn 0x%x\n", srqn); + res_end_move(dev, slave, RES_SRQ, srqn); + + return 0; + +ex_abort: + res_abort_move(dev, slave, RES_SRQ, srqn); + + return err; +} + +int mlx4_QUERY_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int srqn = vhcr->in_modifier; + struct res_srq *srq; + + err = get_res(dev, slave, srqn, RES_SRQ, &srq); + if (err) { + mlx4_sdbg("fail srqn 0x%x\n", srqn); + return err; + } + + if (srq->com.from_state != RES_SRQ_HW) { + mlx4_sdbg("fail srqn 0x%x\n", srqn); + err = -EBUSY; + goto out; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + mlx4_sdbg("fail srqn 0x%x\n", srqn); + +out: + put_res(dev, slave, srqn, RES_SRQ); + return err; +} + +int mlx4_ARM_SRQ_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int srqn = vhcr->in_modifier; + struct res_srq *srq; + + err = get_res(dev, slave, srqn, RES_SRQ, &srq); + if (err) { + mlx4_sdbg("srqn 0x%x\n", srqn); + return err; + } + + if (srq->com.from_state != RES_SRQ_HW) { + mlx4_sdbg("srqn 0x%x\n", srqn); + err = -EBUSY; + goto out; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + mlx4_sdbg("srqn 0x%x\n", srqn); + +out: + put_res(dev, slave, srqn, RES_SRQ); + return err; +} + +static int gen_qp_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int qpn = vhcr->in_modifier & 0x7fffff; + struct res_qp *qp; + + mlx4_sdbg("qpn 0x%x, command 0x%x\n", qpn, vhcr->op); + err = get_res(dev, slave, qpn, RES_QP, &qp); + if (err) { + mlx4_sdbg("qpn 0x%x, command 0x%x, err %d\n", qpn, vhcr->op, err); + return err; + } + + if (qp->com.from_state != RES_QP_HW) { + err = -EBUSY; + mlx4_sdbg("qpn 0x%x inmod 0x%x, command 0x%x, err %d, state %s\n", + qpn, vhcr->in_modifier, vhcr->op, err, qp_states_str(qp->com.from_state)); + goto out; + } + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + mlx4_sdbg("qpn 0x%x, err %d\n", qpn, err); + + mlx4_sdbg("qpn 0x%x, command 0x%x\n", qpn, vhcr->op); +out: + put_res(dev, slave, qpn, RES_QP); + return err; +} + +int mlx4_INIT2RTR_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_qp_context *qpc = inbox->buf + 8; + + update_pkey_index(dev, slave, inbox); + update_ud_gid(qpc, (u8)slave); + adjust_proxy_tun_qkey(dev, vhcr, qpc); + + return gen_qp_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +int mlx4_RTR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) 
+{ + struct mlx4_qp_context *context = inbox->buf + 8; + u8 vep_num = mlx4_priv(dev)->mfunc.master.slave_state[slave].vep_num; + u8 port = ((context->pri_path.sched_queue >> 6) & 1) + 1; + + if (mlx4_priv(dev)->vep_mode[port]) + context->pri_path.sched_queue = (context->pri_path.sched_queue & 0xc3 ) | + (vep_num << 3); + + update_pkey_index(dev, slave, inbox); + adjust_proxy_tun_qkey(dev, vhcr, context); + return gen_qp_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +int mlx4_RTS2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_qp_context *context = inbox->buf + 8; + update_pkey_index(dev, slave, inbox); + adjust_proxy_tun_qkey(dev, vhcr, context); + return gen_qp_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + + +int mlx4_SQERR2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_qp_context *context = inbox->buf + 8; + adjust_proxy_tun_qkey(dev, vhcr, context); + return gen_qp_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +int mlx4_2ERR_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + return gen_qp_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +int mlx4_RTS2SQD_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + return gen_qp_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +int mlx4_SQD2SQD_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_qp_context *context = inbox->buf + 8; + adjust_proxy_tun_qkey(dev, vhcr, context); + update_pkey_index(dev, slave, inbox); + return gen_qp_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +int mlx4_SQD2RTS_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_qp_context *context = inbox->buf + 8; + adjust_proxy_tun_qkey(dev, vhcr, context); + update_pkey_index(dev, slave, inbox); + return gen_qp_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +int mlx4_2RST_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int qpn = vhcr->in_modifier & 0x7fffff; + struct res_qp *qp; + + mlx4_sdbg("qpn 0x%x\n", qpn); + err = qp_res_start_move_to(dev, slave, qpn, RES_QP_MAPPED, &qp, 0); + if (err) { + mlx4_sdbg("failed moving QP 0x%x to RES_QP_MAPPED. 
err %d, cur_state %s\n", + qpn, err, qp_states_str(qp->com.from_state)); + return err; + } + + mlx4_sdbg("qpn 0x%x\n", qpn); + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + if (err) + goto ex_abort; + + mlx4_sdbg("qpn 0x%x\n", qpn); + atomic_dec(&qp->mtt->ref_count); + mlx4_sdbg("base 0x%x, count %d\n", qp->mtt->com.res_id, atomic_read(&qp->mtt->ref_count)); + atomic_dec(&qp->rcq->ref_count); + mlx4_sdbg("CQ 0x%x, ref count %d\n", qp->rcq->com.res_id, atomic_read(&qp->rcq->ref_count)); + atomic_dec(&qp->scq->ref_count); + mlx4_sdbg("CQ 0x%x, ref count %d\n", qp->scq->com.res_id, atomic_read(&qp->scq->ref_count)); + if (qp->srq) { + atomic_dec(&qp->srq->ref_count); + mlx4_sdbg("srqn 0x%x\n", qp->srq->com.res_id); + } + res_end_move(dev, slave, RES_QP, qpn); + + mlx4_sdbg("qpn 0x%x\n", qpn); + return 0; + +ex_abort: + res_abort_move(dev, slave, RES_QP, qpn); + + return err; +} + +int mlx4_QUERY_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + return gen_qp_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +int mlx4_INIT2INIT_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_qp_context *context = inbox->buf + 8; + adjust_proxy_tun_qkey(dev, vhcr, context); + update_pkey_index(dev, slave, inbox); + return gen_qp_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +int mlx4_SUSPEND_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + return gen_qp_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +int mlx4_UNSUSPEND_QP_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + return gen_qp_wrapper(dev, slave, vhcr, inbox, outbox, cmd); +} + +static struct res_gid *find_gid(struct mlx4_dev *dev, int slave, struct res_qp *rqp, u8 *gid) +{ + struct res_gid *res; + + list_for_each_entry(res, &rqp->mcg_list, list) { + if (!memcmp(res->gid, gid, 16)) + return res; + } + return NULL; +} + +static int add_mcg_res(struct mlx4_dev *dev, int slave, struct res_qp *rqp, + u8 *gid, enum mlx4_protocol prot) +{ + struct res_gid *res; + int err; + + res = kzalloc(sizeof *res, GFP_KERNEL); + if (!res) + return -ENOMEM; + + spin_lock_irq(&rqp->mcg_spl); + if (find_gid(dev,slave,rqp,gid)) { + kfree(res); + err = -EEXIST; + } else { + memcpy(res->gid, gid, 16); + res->prot = prot; + list_add_tail(&res->list, &rqp->mcg_list); + err = 0; + } + spin_unlock_irq(&rqp->mcg_spl); + + return err; +} + +static int rem_mcg_res(struct mlx4_dev *dev, int slave, struct res_qp *rqp, u8 *gid, + enum mlx4_protocol prot) +{ + struct res_gid *res; + int err; + + spin_lock_irq(&rqp->mcg_spl); + res = find_gid(dev, slave, rqp, gid); + if (!res || res->prot != prot) + err = -EINVAL; + else { + list_del(&res->list); + kfree(res); + err = 0; + } + spin_unlock_irq(&rqp->mcg_spl); + + return err; +} + +int mlx4_MCAST_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + struct mlx4_qp qp; /* dummy for calling attach/detach */ + u8 *gid = inbox->buf; + enum mlx4_protocol prot = (vhcr->in_modifier >> 28) & 0x7; + u8 pf_num = 
mlx4_priv(dev)->mfunc.master.slave_state[slave].pf_num; + int err; + int qpn = vhcr->in_modifier & 0x7fffff; + struct res_qp *rqp; + int attach = vhcr->op_modifier; + int block_loopback = vhcr->in_modifier >> 31; + + err = get_res(dev, slave, qpn, RES_QP, &rqp); + if (err) { + mlx4_sdbg("qpn 0x%x, attach %d, block_loopback %d\n", + qpn, attach, block_loopback); + return err; + } + + if (prot == MLX4_PROT_ETH) + gid[7] |= (pf_num << 4 | MLX4_MC_STEER << 1); + + qp.qpn = qpn; + if (attach) { + err = add_mcg_res(dev, slave, rqp, gid, prot); + if (err) { + mlx4_sdbg("\n"); + goto ex_put; + } + + err = mlx4_qp_attach_common(dev, &qp, gid, + block_loopback, prot, MLX4_MC_STEER); + if (err) { + mlx4_sdbg("\n"); + goto ex_rem; + } + } else { + err = rem_mcg_res(dev, slave, rqp, gid, prot); + if (err) { + mlx4_sdbg("\n"); + goto ex_put; + } + err = mlx4_qp_detach_common(dev, &qp, gid, prot, MLX4_MC_STEER); + SASSERT(!err || err == -ENOMEM); + if (err) + mlx4_sdbg("qpn 0x%x, err %d\n", rqp->local_qpn, err); + } + + put_res(dev, slave, qpn, RES_QP); + return 0; + +ex_rem: + if (rem_mcg_res(dev, slave, rqp, gid, prot)) + SASSERT(0); +ex_put: + put_res(dev, slave, qpn, RES_QP); + + return err; +} + +enum { + BUSY_MAX_RETRIES = 10 +}; + +int mlx4_QUERY_IF_STAT_wrapper(struct mlx4_dev *dev, int slave, + struct mlx4_vhcr *vhcr, + struct mlx4_cmd_mailbox *inbox, + struct mlx4_cmd_mailbox *outbox, + struct mlx4_cmd_info *cmd) +{ + int err; + int index = vhcr->in_modifier & 0xffff; + + err = get_res(dev, slave, index, RES_COUNTER, NULL); + if (err) + return err; + + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + + put_res(dev, slave, index, RES_COUNTER); + return err; +} + +static void dettach_qp(struct mlx4_dev *dev, int slave, struct res_qp *rqp) +{ + struct res_gid *rgid; + struct res_gid *tmp; + int err; + struct mlx4_qp qp; /* dummy for calling attach/detach */ + + list_for_each_entry_safe(rgid, tmp, &rqp->mcg_list, list) { + qp.qpn = rqp->local_qpn; + err = mlx4_qp_detach_common(dev, &qp, rgid->gid, rgid->prot, MLX4_MC_STEER); + SASSERT(!err); + list_del(&rgid->list); + kfree(rgid); + } +} + +static int _move_all_busy(struct mlx4_dev *dev, int slave, + enum mlx4_resource type, int print) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *rlist = &tracker->slave_list[slave].res_list[type]; + struct res_common *r; + struct res_common *tmp; + int busy; + + busy = 0; + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(r, tmp, rlist, list) { + SASSERT(r->owner == slave); + if (r->owner == slave) { + if (!r->removing) { + if (r->state == RES_ANY_BUSY) { + if (print) + mlx4_sdbg("%s id 0x%x is busy\n", ResourceType(type), r->res_id); + ++busy; + } else { + r->from_state = r->state; + r->state = RES_ANY_BUSY; + r->removing = 1; + mlx4_sdbg("%s id 0x%x was grabbed\n", ResourceType(type), r->res_id); + } + } + } + } + spin_unlock_irq(mlx4_tlock(dev)); + + return busy; +} + +static int move_all_busy(struct mlx4_dev *dev, int slave, enum mlx4_resource type) +{ + unsigned long begin; + int busy; + + begin = jiffies; + do { + busy = _move_all_busy(dev, slave, type, 0); + if (time_after(jiffies, begin + 5 * HZ)) + break; + if (busy) + cond_resched(); + } while (busy); + + if (busy) + busy = _move_all_busy(dev, slave, type, 1); + + return busy; +} +static void rem_slave_qps(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = 
&priv->mfunc.master.res_tracker; + struct list_head *qp_list = &tracker->slave_list[slave].res_list[RES_QP]; + struct res_qp *qp; + struct res_qp *tmp; + int err; + int state; + u64 in_param; + int qpn; + + err = move_all_busy(dev, slave, RES_QP); + SASSERT(!err); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(qp, tmp, qp_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (qp->com.owner == slave) { + qpn = qp->com.res_id; + mlx4_sdbg("qpn 0x%x\n", qpn); + dettach_qp(dev, slave, qp); + + mlx4_sdbg("qpn 0x%x\n", qpn); + state = qp->com.from_state; + while (state != 0) { + switch (state) { + case RES_QP_RESERVED: + spin_lock_irq(mlx4_tlock(dev)); + radix_tree_delete(&tracker->res_tree[RES_QP], qp->com.res_id); + list_del(&qp->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + kfree(qp); + state = 0; + mlx4_sdbg("qpn 0x%x deleted\n", qpn); + break; + case RES_QP_MAPPED: + if (!valid_reserved(dev, slave, qpn)) + __mlx4_qp_free_icm(dev, qpn); + state = RES_QP_RESERVED; + mlx4_sdbg("qpn 0x%x moved to %s\n", qpn, qp_states_str(state)); + break; + case RES_QP_HW: + in_param = slave; + err = mlx4_cmd(dev, in_param, qp->local_qpn, 2, + MLX4_CMD_2RST_QP, MLX4_CMD_TIME_CLASS_A, 1); + SASSERT(!err); + atomic_dec(&qp->rcq->ref_count); + mlx4_sdbg("CQ 0x%x, ref count %d\n", qp->rcq->com.res_id, atomic_read(&qp->rcq->ref_count)); + atomic_dec(&qp->scq->ref_count); + mlx4_sdbg("CQ 0x%x, ref count %d\n", qp->scq->com.res_id, atomic_read(&qp->scq->ref_count)); + atomic_dec(&qp->mtt->ref_count); + if (qp->srq) { + atomic_dec(&qp->srq->ref_count); + mlx4_sdbg("srqn 0x%x\n", qp->srq->com.res_id); + } + state = RES_QP_MAPPED; + mlx4_sdbg("qpn 0x%x moved to %s\n", qpn, qp_states_str(state)); + break; + default: + SASSERT(0); + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_srqs(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *srq_list = &tracker->slave_list[slave].res_list[RES_SRQ]; + struct res_srq *srq; + struct res_srq *tmp; + int err; + int state; + u64 in_param; + LIST_HEAD(tlist); + int srqn; + + mlx4_sdbg("\n"); + err = move_all_busy(dev, slave, RES_SRQ); + SASSERT(!err); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(srq, tmp, srq_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (srq->com.owner == slave) { + srqn = srq->com.res_id; + mlx4_sdbg("srqn 0x%x\n", srqn); + state = srq->com.from_state; + while (state != 0) { + switch (state) { + case RES_SRQ_ALLOCATED: + __mlx4_srq_free_icm(dev, srqn); + spin_lock_irq(mlx4_tlock(dev)); + radix_tree_delete(&tracker->res_tree[RES_SRQ], srqn); + list_del(&srq->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + kfree(srq); + state = 0; + break; + + case RES_SRQ_HW: + SASSERT(!atomic_read(&srq->ref_count)); + in_param = slave; + err = mlx4_cmd(dev, in_param, srqn, 1, + MLX4_CMD_HW2SW_SRQ, MLX4_CMD_TIME_CLASS_A, 1); + SASSERT(!err); + + atomic_dec(&srq->mtt->ref_count); + if (srq->cq) { + atomic_dec(&srq->cq->ref_count); + mlx4_sdbg("CQ 0x%x, ref count %d\n", srq->cq->com.res_id, atomic_read(&srq->cq->ref_count)); + } + + state = RES_SRQ_ALLOCATED; + break; + + default: + SASSERT(0); + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_cqs(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker 
*tracker = &priv->mfunc.master.res_tracker; + struct list_head *cq_list = &tracker->slave_list[slave].res_list[RES_CQ]; + struct res_cq *cq; + struct res_cq *tmp; + int err; + int state; + u64 in_param; + LIST_HEAD(tlist); + int cqn; + + mlx4_sdbg("\n"); + err = move_all_busy(dev, slave, RES_CQ); + SASSERT(!err); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(cq, tmp, cq_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (cq->com.owner == slave) { + cqn = cq->com.res_id; + mlx4_sdbg("cqn 0x%x, ref_count %d\n", cqn, atomic_read(&cq->ref_count)); + SASSERT(!atomic_read(&cq->ref_count)); + + mlx4_sdbg("cqn 0x%x\n", cqn); + state = cq->com.from_state; + while (state != 0) { + switch (state) { + case RES_CQ_ALLOCATED: + __mlx4_cq_free_icm(dev, cqn); + spin_lock_irq(mlx4_tlock(dev)); + radix_tree_delete(&tracker->res_tree[RES_CQ], cqn); + list_del(&cq->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + kfree(cq); + state = 0; + break; + + case RES_CQ_HW: + in_param = slave; + err = mlx4_cmd(dev, in_param, cqn, 1, + MLX4_CMD_HW2SW_CQ, MLX4_CMD_TIME_CLASS_A, 1); + SASSERT(!err); + + atomic_dec(&cq->mtt->ref_count); + state = RES_CQ_ALLOCATED; + break; + + default: + SASSERT(0); + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_mrs(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *mpt_list = &tracker->slave_list[slave].res_list[RES_MPT]; + struct res_mpt *mpt; + struct res_mpt *tmp; + int err; + int state; + u64 in_param; + LIST_HEAD(tlist); + int mptn; + int fmr_flow; + + mlx4_sdbg("\n"); + err = move_all_busy(dev, slave, RES_MPT); + SASSERT(!err); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(mpt, tmp, mpt_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (mpt->com.owner == slave) { + mptn = mpt->com.res_id; + fmr_flow = mlx4_fmr_flow(dev, mpt->flags); + mlx4_sdbg("mptn 0x%x\n", mptn); + state = mpt->com.from_state; + while (state != 0) { + switch (state) { + case RES_MPT_RESERVED: + if (!fmr_flow) + __mlx4_mr_release(dev, + mpt->key); + spin_lock_irq(mlx4_tlock(dev)); + radix_tree_delete(&tracker->res_tree[RES_MPT], mptn); + list_del(&mpt->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + kfree(mpt); + state = 0; + break; + + case RES_MPT_MAPPED: + if (!fmr_flow) + __mlx4_mr_free_icm(dev, + mpt->key, + MLX4_MR_FLAG_NONE); + state = RES_MPT_RESERVED; + break; + + case RES_MPT_HW: + in_param = slave; + err = mlx4_cmd(dev, in_param, mptn, 0, + MLX4_CMD_HW2SW_MPT, MLX4_CMD_TIME_CLASS_A, 1); + SASSERT(!err); + + if (mpt->mtt) + atomic_dec(&mpt->mtt->ref_count); + state = RES_MPT_MAPPED; + break; + default: + SASSERT(0); + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_mtts(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *mtt_list = &tracker->slave_list[slave].res_list[RES_MTT]; + struct res_mtt *mtt; + struct res_mtt *tmp; + int state; + LIST_HEAD(tlist); + int base; + int err; + + mlx4_sdbg("\n"); + err = move_all_busy(dev, slave, RES_MTT); + SASSERT(!err); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(mtt, tmp, mtt_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (mtt->com.owner == slave) { + base = mtt->com.res_id; + 
mlx4_sdbg("base 0x%x, ref_count %d\n", base, atomic_read(&mtt->ref_count)); + SASSERT(!atomic_read(&mtt->ref_count)); + + state = mtt->com.from_state; + while (state != 0) { + switch (state) { + case RES_MTT_ALLOCATED: + __mlx4_free_mtt_range(dev, base, + mtt->order, MLX4_MR_FLAG_NONE); + spin_lock_irq(mlx4_tlock(dev)); + radix_tree_delete(&tracker->res_tree[RES_MTT], base); + list_del(&mtt->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + kfree(mtt); + state = 0; + break; + case RES_MTT_RESERVED: + __mlx4_free_mtt_reserved_range( + dev, base, mtt->order); + spin_lock_irq(mlx4_tlock(dev)); + radix_tree_delete( + &tracker->res_tree[RES_MTT], + base); + list_del(&mtt->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + kfree(mtt); + state = 0; + break; + default: + SASSERT(0); + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_eqs(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *eq_list = &tracker->slave_list[slave].res_list[RES_EQ]; + struct res_eq *eq; + struct res_eq *tmp; + int err; + int state; + LIST_HEAD(tlist); + int eqn; + struct mlx4_cmd_mailbox *mailbox; + + mlx4_sdbg("\n"); + err = move_all_busy(dev, slave, RES_EQ); + SASSERT(!err); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(eq, tmp, eq_list, com.list) { + spin_unlock_irq(mlx4_tlock(dev)); + if (eq->com.owner == slave) { + eqn = eq->com.res_id; + mlx4_sdbg("eqn 0x%x\n", eqn); + state = eq->com.from_state; + while (state != 0) { + switch (state) { + case RES_EQ_RESERVED: + spin_lock_irq(mlx4_tlock(dev)); + radix_tree_delete(&tracker->res_tree[RES_EQ], eqn); + list_del(&eq->com.list); + spin_unlock_irq(mlx4_tlock(dev)); + kfree(eq); + state = 0; + break; + + case RES_EQ_HW: + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + mlx4_sdbg("\n"); + cond_resched(); + continue; + } + err = mlx4_cmd_box(dev, slave, 0, eqn & 0xff, 0, + MLX4_CMD_HW2SW_EQ, MLX4_CMD_TIME_CLASS_A, 1); + SASSERT(!err); + mlx4_free_cmd_mailbox(dev, mailbox); + atomic_dec(&eq->mtt->ref_count); + state = RES_EQ_RESERVED; + break; + + default: + SASSERT(0); + state = 0; + } + } + } + spin_lock_irq(mlx4_tlock(dev)); + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_counters(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *counter_list = &tracker->slave_list[slave].res_list[RES_COUNTER]; + struct res_counter *counter; + struct res_counter *tmp; + int err; + int index; + + err = move_all_busy(dev, slave, RES_COUNTER); + SASSERT(!err); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(counter, tmp, counter_list, com.list) { + if (counter->com.owner == slave) { + index = counter->com.res_id; + radix_tree_delete(&tracker->res_tree[RES_COUNTER], index); + list_del(&counter->com.list); + kfree(counter); + __mlx4_counter_free(dev, index); + mlx4_sdbg("deleted counter index %d\n", index); + } + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +static void rem_slave_xrcdns(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker; + struct list_head *xrcdn_list = &tracker->slave_list[slave].res_list[RES_XRCDN]; + struct res_xrcdn *xrcd; + struct res_xrcdn *tmp; + int err; + int xrcdn; + + err = 
move_all_busy(dev, slave, RES_XRCDN); + SASSERT(!err); + + spin_lock_irq(mlx4_tlock(dev)); + list_for_each_entry_safe(xrcd, tmp, xrcdn_list, com.list) { + if (xrcd->com.owner == slave) { + xrcdn = xrcd->com.res_id; + radix_tree_delete(&tracker->res_tree[RES_XRCDN], xrcdn); + list_del(&xrcd->com.list); + kfree(xrcd); + __mlx4_xrcd_free(dev, xrcdn); + mlx4_sdbg("deleted xrcdn %d\n", xrcdn); + } + } + spin_unlock_irq(mlx4_tlock(dev)); +} + +void mlx4_delete_all_resources_for_slave(struct mlx4_dev *dev, int slave) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + mlx4_sdbg("\n"); + mutex_lock(&priv->mfunc.master.res_tracker.slave_list[slave].mutex); + /*VLAN*/ + /* MAC */ +// mlx4_sdbg("\n"); +// mlx4_delete_specific_res_type_for_slave(dev, slave, RES_MAC); + + mlx4_sdbg("\n"); + mlx4_fmr_master_delete_slave(dev, slave); + mlx4_sdbg("\n"); + rem_slave_qps(dev, slave); + mlx4_sdbg("\n"); + rem_slave_srqs(dev, slave); + mlx4_sdbg("\n"); + rem_slave_cqs(dev, slave); + mlx4_sdbg("\n"); + rem_slave_mrs(dev, slave); + mlx4_sdbg("\n"); + rem_slave_eqs(dev, slave); + mlx4_sdbg("\n"); + rem_slave_mtts(dev, slave); + mlx4_sdbg("\n"); + rem_slave_counters(dev, slave); + mlx4_sdbg("\n"); + rem_slave_xrcdns(dev, slave); + mlx4_sdbg("\n"); + mutex_unlock(&priv->mfunc.master.res_tracker.slave_list[slave].mutex); +} + diff --git a/drivers/net/mlx4/rt_torture.c b/drivers/net/mlx4/rt_torture.c new file mode 100644 index 0000000000000..a55acfcd64b21 --- /dev/null +++ b/drivers/net/mlx4/rt_torture.c @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2011, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include "mlx4.h" + +#if 0 +enum test_mode { + RANDOM_MTT, + TEST_CQS, + MAX_MODES +}; + +static struct workqueue_struct *wq[MAX_MODES]; + +static int allow_tests; +static int num_pending; + +struct mtt_item { + struct list_head list; + struct mlx4_mtt mtt; +}; + +struct test_work { + struct work_struct work; + struct mlx4_dev *dev; + int slave; +}; + +enum { + MAX_PENDING = 128 +}; + +static int get_npages(int max) +{ + return random32() % max; +} + +static void run_random_mtt(struct work_struct *work) +{ + struct test_work *tw = container_of(work, struct test_work, work); + int n = 1000; + int max_order = 7; + int page_shift = PAGE_SHIFT; + int err; + LIST_HEAD(mtts); + struct mtt_item *item; + struct mtt_item *tmp; + int i; + struct mlx4_dev *dev = tw->dev; + int goods = 0; + + if (!allow_tests) { + --num_pending; + kfree(tw); + return; + } + + for (i = 0; i < n; ++i) { + item = kmalloc(sizeof *item, GFP_KERNEL); + if (!item) + break; + + err = mlx4_mtt_init(dev, + get_npages(1 << max_order), + page_shift, + &item->mtt); + if (!err) { + list_add_tail(&item->list, &mtts); + ++goods; + } else + kfree(item); + } + + list_for_each_entry_safe(item, tmp, &mtts, list) { + mlx4_mtt_cleanup(dev, &item->mtt); + list_del(&item->list); + kfree(item); + } + + kfree(tw); + if (goods != n) + printk(KERN_INFO "%s, failed: ran %d cases but only %d succeeded\n", + __func__, n, goods); + else + printk(KERN_INFO "%s: test finished successfully\n", __func__); + + --num_pending; +} + + +static void fill_random(void *arg, int size) +{ + u32 *p = arg; + int i; + int n = size / 4; + + for (i = 0; i < n; ++i) + p[i] = random32(); +} + +static int cq_bad(struct mlx4_dev *dev, int slave) +{ + int err; + struct mlx4_mtt mtt; + int nent = random32(); + struct mlx4_uar uar; + struct mlx4_cq cq; + + fill_random(&mtt, sizeof mtt); + fill_random(&uar, sizeof uar); + err = mlx4_cq_alloc(dev, nent, &mtt, &uar, 0, &cq, 0, 0); + SASSERT(err); + + return err; +} + +static void run_cqs(struct work_struct *work) +{ + struct test_work *tw = container_of(work, struct test_work, work); + int slave = tw->slave; + int i; + int n = 1000; + int err; + int bads = 0; + + if (!allow_tests) { + --num_pending; + kfree(tw); + return; + } + + for (i = 0; i < n; ++i) { + err = cq_bad(tw->dev, slave); + if (err) + ++bads; + } + + kfree(tw); + + if (bads != n) + printk(KERN_INFO "%s, failed: Ran %d bad cases but only %d failed\n", + __func__, n, bads); + + printk(KERN_INFO "%s: test finished successfully\n", __func__); + --num_pending; +} + +static void run_test(struct mlx4_dev *dev, int slave, enum test_mode mode) +{ + struct test_work *tw; + + tw = kmalloc(sizeof *tw, GFP_KERNEL); + if (!tw) { + mlx4_swarn("kmalloc failed\n"); + return; + } + tw->slave = slave; + tw->dev = dev; + + switch (mode) { + case RANDOM_MTT: + ++num_pending; + INIT_WORK(&tw->work, run_random_mtt); + queue_work(wq[mode], &tw->work); + break; + + case TEST_CQS: + ++num_pending; + INIT_WORK(&tw->work, run_cqs); + queue_work(wq[mode], &tw->work); + break; + + + default: + kfree(tw); + mlx4_swarn("test mode %d not supported\n", mode); + } +} + +static ssize_t show_test(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct mlx4_priv *priv = container_of(attr, struct mlx4_priv, test_attr); + int slave = priv->dev.caps.function; + + mlx4_sdbg("\n"); + + return -ENOSYS; +} + +static ssize_t store_test(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct mlx4_priv *priv = 
container_of(attr, struct mlx4_priv, test_attr); + int slave = priv->dev.caps.function; + int err; + int mode; + + if (!allow_tests) + return -EINVAL; + + if (num_pending > MAX_PENDING) + return -ENOMEM; + + err = sscanf(buf, "%d", &mode); + if (err == 1) + run_test(&priv->dev, slave, mode); + + return count; +} + +int mlx4_rtt_init(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int slave = dev->caps.function; + int err; + int i; + + for (i = 0; i < MAX_MODES; ++i) { + wq[i] = create_singlethread_workqueue("rt_torture_wq"); + if (!wq[i]) { + mlx4_swarn("failed to create work queue\n"); + err = -ENOMEM; + goto ex_wq; + } + } + + priv->test_attr.attr.name = "test"; + priv->test_attr.attr.mode = S_IRUGO | S_IWUSR; + priv->test_attr.show = show_test; + priv->test_attr.store = store_test; + + allow_tests = 0; + num_pending = 0; + err = device_create_file(&dev->pdev->dev, &priv->test_attr); + if (err) { + mlx4_swarn("Failed to create sysfs file\n"); + goto ex_wq; + } + allow_tests = 1; + + return 0; + +ex_wq: + for (--i; i >= 0; --i) + destroy_workqueue(wq[i]); + + return err; +} + +void mlx4_rtt_cleanup(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int i; + + allow_tests = 0; + for (i = 0; i < MAX_MODES; ++i) + flush_workqueue(wq[i]); + + device_remove_file(&dev->pdev->dev, &priv->test_attr); + for (i = 0; i < MAX_MODES; ++i) + destroy_workqueue(wq[i]); +} +#else + +int mlx4_rtt_init(struct mlx4_dev *dev) +{ + return 0; +} + +void mlx4_rtt_cleanup(struct mlx4_dev *dev) +{ +} + +#endif diff --git a/drivers/net/mlx4/sense.c b/drivers/net/mlx4/sense.c index e2337a7411d94..792f6e756285b 100644 --- a/drivers/net/mlx4/sense.c +++ b/drivers/net/mlx4/sense.c @@ -38,14 +38,14 @@ #include "mlx4.h" -int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port, - enum mlx4_port_type *type) +static int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port, + enum mlx4_port_type *type) { u64 out_param; int err = 0; err = mlx4_cmd_imm(dev, 0, &out_param, port, 0, - MLX4_CMD_SENSE_PORT, MLX4_CMD_TIME_CLASS_B); + MLX4_CMD_SENSE_PORT, MLX4_CMD_TIME_CLASS_B, 0); if (err) { mlx4_err(dev, "Sense command failed for port: %d\n", port); return err; @@ -53,7 +53,7 @@ int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port, if (out_param > 2) { mlx4_err(dev, "Sense returned illegal value: 0x%llx\n", out_param); - return -EINVAL; + return EINVAL; } *type = out_param; @@ -121,8 +121,9 @@ static void mlx4_sense_port(struct work_struct *work) sense_again: mutex_unlock(&priv->port_mutex); - queue_delayed_work(mlx4_wq , &sense->sense_poll, - round_jiffies_relative(MLX4_SENSE_RANGE)); + if (sense->resched) + queue_delayed_work(sense->sense_wq , &sense->sense_poll, + round_jiffies(MLX4_SENSE_RANGE)); } void mlx4_start_sense(struct mlx4_dev *dev) @@ -133,24 +134,39 @@ void mlx4_start_sense(struct mlx4_dev *dev) if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) return; - queue_delayed_work(mlx4_wq , &sense->sense_poll, - round_jiffies_relative(MLX4_SENSE_RANGE)); + sense->resched = 1; + queue_delayed_work(sense->sense_wq , &sense->sense_poll, + round_jiffies(MLX4_SENSE_RANGE)); } + void mlx4_stop_sense(struct mlx4_dev *dev) { - cancel_delayed_work_sync(&mlx4_priv(dev)->sense.sense_poll); + mlx4_priv(dev)->sense.resched = 0; } -void mlx4_sense_init(struct mlx4_dev *dev) +int mlx4_sense_init(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_sense *sense = &priv->sense; int port; sense->dev = dev; + sense->sense_wq = create_singlethread_workqueue("mlx4_sense"); + if 
(!sense->sense_wq) + return -ENOMEM; + for (port = 1; port <= dev->caps.num_ports; port++) sense->do_sense_port[port] = 1; INIT_DELAYED_WORK_DEFERRABLE(&sense->sense_poll, mlx4_sense_port); + return 0; } + +void mlx4_sense_cleanup(struct mlx4_dev *dev) +{ + mlx4_stop_sense(dev); + cancel_delayed_work(&mlx4_priv(dev)->sense.sense_poll); + destroy_workqueue(mlx4_priv(dev)->sense.sense_wq); +} + diff --git a/drivers/net/mlx4/srq.c b/drivers/net/mlx4/srq.c index 3b07b80a0456b..d8928e78120a2 100644 --- a/drivers/net/mlx4/srq.c +++ b/drivers/net/mlx4/srq.c @@ -31,32 +31,13 @@ * SOFTWARE. */ +#include + #include -#include #include "mlx4.h" #include "icm.h" -struct mlx4_srq_context { - __be32 state_logsize_srqn; - u8 logstride; - u8 reserved1[3]; - u8 pg_offset; - u8 reserved2[3]; - u32 reserved3; - u8 log_page_size; - u8 reserved4[2]; - u8 mtt_base_addr_h; - __be32 mtt_base_addr_l; - __be32 pd; - __be16 limit_watermark; - __be16 wqe_cnt; - u16 reserved5; - __be16 wqe_counter; - u32 reserved6; - __be64 db_rec_addr; -}; - void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type) { struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; @@ -64,7 +45,8 @@ void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type) spin_lock(&srq_table->lock); - srq = radix_tree_lookup(&srq_table->tree, srqn & (dev->caps.num_srqs - 1)); + srq = radix_tree_lookup(&dev->srq_table_tree, + srqn & (dev->caps.num_srqs - 1)); if (srq) atomic_inc(&srq->refcount); @@ -84,8 +66,8 @@ void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type) static int mlx4_SW2HW_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, int srq_num) { - return mlx4_cmd(dev, mailbox->dma, srq_num, 0, MLX4_CMD_SW2HW_SRQ, - MLX4_CMD_TIME_CLASS_A); + return mlx4_cmd(dev, mailbox->dma | dev->caps.function, srq_num, 0, + MLX4_CMD_SW2HW_SRQ, MLX4_CMD_TIME_CLASS_A, 0); } static int mlx4_HW2SW_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, @@ -93,48 +75,110 @@ static int mlx4_HW2SW_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox { return mlx4_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, srq_num, mailbox ? 
0 : 1, MLX4_CMD_HW2SW_SRQ, - MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_TIME_CLASS_A, 0); } static int mlx4_ARM_SRQ(struct mlx4_dev *dev, int srq_num, int limit_watermark) { return mlx4_cmd(dev, limit_watermark, srq_num, 0, MLX4_CMD_ARM_SRQ, - MLX4_CMD_TIME_CLASS_B); + MLX4_CMD_TIME_CLASS_B, 0); } static int mlx4_QUERY_SRQ(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox, int srq_num) { return mlx4_cmd_box(dev, 0, mailbox->dma, srq_num, 0, MLX4_CMD_QUERY_SRQ, - MLX4_CMD_TIME_CLASS_A); + MLX4_CMD_TIME_CLASS_A, 0); } -int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, struct mlx4_mtt *mtt, - u64 db_rec, struct mlx4_srq *srq) +int __mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn) { struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; - struct mlx4_cmd_mailbox *mailbox; - struct mlx4_srq_context *srq_context; - u64 mtt_addr; int err; - srq->srqn = mlx4_bitmap_alloc(&srq_table->bitmap); - if (srq->srqn == -1) + + *srqn = mlx4_bitmap_alloc(&srq_table->bitmap); + if (*srqn == -1) return -ENOMEM; - err = mlx4_table_get(dev, &srq_table->table, srq->srqn); + err = mlx4_table_get(dev, &srq_table->table, *srqn, MLX4_MR_FLAG_NONE); if (err) goto err_out; - err = mlx4_table_get(dev, &srq_table->cmpt_table, srq->srqn); + err = mlx4_table_get(dev, &srq_table->cmpt_table, *srqn, + MLX4_MR_FLAG_NONE); if (err) goto err_put; + return 0; + +err_put: + mlx4_table_put(dev, &srq_table->table, *srqn, MLX4_MR_FLAG_NONE); + +err_out: + mlx4_bitmap_free(&srq_table->bitmap, *srqn); + return err; +} + +int mlx4_srq_alloc_icm(struct mlx4_dev *dev, int *srqn) +{ + u64 out_param; + int err; + + if (mlx4_is_mfunc(dev)) { + err = mlx4_cmd_imm(dev, 0, &out_param, RES_SRQ, + RES_OP_RESERVE_AND_MAP, + MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, 0); + if (!err) + *srqn = get_param_l(&out_param); + + return err; + } + return __mlx4_srq_alloc_icm(dev, srqn); +} + +void __mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn) +{ + struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; + + mlx4_table_put(dev, &srq_table->cmpt_table, srqn, MLX4_MR_FLAG_NONE); + mlx4_table_put(dev, &srq_table->table, srqn, MLX4_MR_FLAG_NONE); + mlx4_bitmap_free(&srq_table->bitmap, srqn); +} + +void mlx4_srq_free_icm(struct mlx4_dev *dev, int srqn) +{ + u64 in_param; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, srqn); + if (mlx4_cmd(dev, in_param, RES_SRQ, RES_OP_RESERVE_AND_MAP, + MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, 0)) + mlx4_warn(dev, "Failed freeing cq:%d\n", srqn); + return; + } + __mlx4_srq_free_icm(dev, srqn); +} + +int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd, + struct mlx4_mtt *mtt, u64 db_rec, struct mlx4_srq *srq) +{ + struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; + struct mlx4_cmd_mailbox *mailbox; + struct mlx4_srq_context *srq_context; + u64 mtt_addr; + int err; + + err = mlx4_srq_alloc_icm(dev, &srq->srqn); + if (err) + return err; spin_lock_irq(&srq_table->lock); - err = radix_tree_insert(&srq_table->tree, srq->srqn, srq); + err = radix_tree_insert(&dev->srq_table_tree, srq->srqn, srq); spin_unlock_irq(&srq_table->lock); if (err) - goto err_cmpt_put; + goto err_icm; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) { @@ -148,6 +192,8 @@ int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, struct mlx4_mtt *mtt, srq_context->state_logsize_srqn = cpu_to_be32((ilog2(srq->max) << 24) | srq->srqn); srq_context->logstride = srq->wqe_shift - 4; + srq_context->xrc_domain = cpu_to_be16(xrcd); + srq_context->pg_offset_cqn = cpu_to_be32(cqn & 0xffffff); 
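/*
 * Editorial sketch, not part of the merged Mellanox code: the hunks just
 * below split the old one-shot mlx4_srq_free() into mlx4_srq_invalidate(),
 * mlx4_srq_remove() and mlx4_srq_free().  A minimal illustration of the
 * order a caller is expected to follow; the wrapper name is hypothetical.
 */
static void example_destroy_srq(struct mlx4_dev *dev, struct mlx4_srq *srq)
{
	mlx4_srq_invalidate(dev, srq);	/* HW2SW_SRQ: hand the SRQ back to software ownership */
	mlx4_srq_remove(dev, srq);	/* drop it from the radix tree so events can no longer find it */
	mlx4_srq_free(dev, srq);	/* wait for the refcount to drain, then release ICM and bitmap entry */
}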
srq_context->log_page_size = mtt->page_shift - MLX4_ICM_PAGE_SHIFT; mtt_addr = mlx4_mtt_addr(dev, mtt); @@ -168,41 +214,42 @@ int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, struct mlx4_mtt *mtt, err_radix: spin_lock_irq(&srq_table->lock); - radix_tree_delete(&srq_table->tree, srq->srqn); + radix_tree_delete(&dev->srq_table_tree, srq->srqn); spin_unlock_irq(&srq_table->lock); -err_cmpt_put: - mlx4_table_put(dev, &srq_table->cmpt_table, srq->srqn); - -err_put: - mlx4_table_put(dev, &srq_table->table, srq->srqn); - -err_out: - mlx4_bitmap_free(&srq_table->bitmap, srq->srqn); - +err_icm: + mlx4_srq_free_icm(dev, srq->srqn); return err; } EXPORT_SYMBOL_GPL(mlx4_srq_alloc); -void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq) +void mlx4_srq_invalidate(struct mlx4_dev *dev, struct mlx4_srq *srq) { - struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; int err; err = mlx4_HW2SW_SRQ(dev, NULL, srq->srqn); if (err) mlx4_warn(dev, "HW2SW_SRQ failed (%d) for SRQN %06x\n", err, srq->srqn); +} +EXPORT_SYMBOL_GPL(mlx4_srq_invalidate); + +void mlx4_srq_remove(struct mlx4_dev *dev, struct mlx4_srq *srq) +{ + struct mlx4_srq_table *srq_table = &mlx4_priv(dev)->srq_table; spin_lock_irq(&srq_table->lock); - radix_tree_delete(&srq_table->tree, srq->srqn); + radix_tree_delete(&dev->srq_table_tree, srq->srqn); spin_unlock_irq(&srq_table->lock); +} +EXPORT_SYMBOL_GPL(mlx4_srq_remove); +void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq) +{ if (atomic_dec_and_test(&srq->refcount)) complete(&srq->free); wait_for_completion(&srq->free); - mlx4_table_put(dev, &srq_table->table, srq->srqn); - mlx4_bitmap_free(&srq_table->bitmap, srq->srqn); + mlx4_srq_free_icm(dev, srq->srqn); } EXPORT_SYMBOL_GPL(mlx4_srq_free); @@ -241,7 +288,9 @@ int mlx4_init_srq_table(struct mlx4_dev *dev) int err; spin_lock_init(&srq_table->lock); - INIT_RADIX_TREE(&srq_table->tree, GFP_ATOMIC); + INIT_RADIX_TREE(&dev->srq_table_tree, GFP_ATOMIC); + if (mlx4_is_mfunc(dev) && !mlx4_is_master(dev)) + return 0; err = mlx4_bitmap_init(&srq_table->bitmap, dev->caps.num_srqs, dev->caps.num_srqs - 1, dev->caps.reserved_srqs, 0); @@ -253,5 +302,7 @@ int mlx4_init_srq_table(struct mlx4_dev *dev) void mlx4_cleanup_srq_table(struct mlx4_dev *dev) { + if (mlx4_is_mfunc(dev) && !mlx4_is_master(dev)) + return; mlx4_bitmap_cleanup(&mlx4_priv(dev)->srq_table.bitmap); } diff --git a/drivers/net/mlx4/xen_fmr.h b/drivers/net/mlx4/xen_fmr.h new file mode 100644 index 0000000000000..323083e380aba --- /dev/null +++ b/drivers/net/mlx4/xen_fmr.h @@ -0,0 +1,129 @@ +#ifndef _MLX4_XEN_FMR_GEN_H_ +#define _MLX4_XEN_FMR_GEN_H_ + +#include +#include + +#include + +#include "fmr_api.h" + +#define DRV_VERSION "2.0.0" + +#if 0 + #define DPRINTK(format, args...) printk(KERN_INFO format, ##args) +#else + #define DPRINTK(format, args...) 
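/*
 * Note (editorial): with the "#if 0" above, DPRINTK() compiles to nothing;
 * switching it to "#if 1" turns the DPRINTK() calls in xen_fmr_master.c and
 * xen_fmr_slave.c, which include this header, into printk(KERN_INFO ...)
 * debug output.
 */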
+#endif + +#define XENBUS_DEVID "fmr" + +typedef u64 addr_ref_t; +typedef __be64 addr_ref_be_t; +static inline addr_ref_be_t cpu_to_be_addr_ref(addr_ref_t cpu_var) +{ + return cpu_to_be64((addr_ref_t)cpu_var); +} +static inline addr_ref_t addr_ref_be_to_cpu(addr_ref_be_t be_var) +{ + return be64_to_cpu((addr_ref_be_t)be_var); +} + +/** + * Xen VPM + */ +struct xen_vpm { + /* Base VPM */ + struct vpm vpm; + /* Used to map to page, initialize in slave */ + __be16 dom_id; + /* This field holds diffrent value dependin on + whether slave runs on domU or dom0 + In domU only 32 bits are used to hold gref of + the shared page + In dom0 the full 64 bits are used to hold the + address of the shared page */ + addr_ref_be_t addr_ref; +}; + +#define XEN_VPM_SZ (sizeof(struct xen_vpm) - sizeof(struct vpm)) + +/** + * Ref count: Structure + */ +struct ref_count { + atomic_t var; + struct completion comp; +}; + +/** + * Ref count: Initialize + */ +static inline void ref_count_init(struct ref_count *rc) +{ + atomic_set(&(rc->var), 0); + init_completion(&(rc->comp)); +} + +/** + * Ref count: Increment reference + */ +static inline void ref_count_inc(struct ref_count *rc) +{ + atomic_inc(&(rc->var)); +} + +/** + * Ref count: Decrement reference + */ +static inline void ref_count_dec(struct ref_count *rc) +{ + if (atomic_dec_and_test(&(rc->var))) + complete(&(rc->comp)); +} + +/** + * Ref count: Check if reference count is 0 + */ +static inline int ref_count_is_zero(struct ref_count *rc) +{ + return (atomic_read(&(rc->var)) == 0); +} + +/** + * Ref count: Check if reference count is not 0 + */ +static inline int ref_count_is_not_zero(struct ref_count *rc) +{ + return (atomic_read(&(rc->var)) != 0); +} + +/** + * Ref count: Decrement reference - block until 0 + */ +static inline void ref_count_wait_for_zero(struct ref_count *rc) +{ + if (ref_count_is_not_zero(rc)) + wait_for_completion(&(rc->comp)); +} + +/** + * Wrapper to xenbus_switch_state + */ +inline int mlx4_xen_fmr_switch_state(struct xenbus_device *dev, + enum xenbus_state state) +{ + int sts; + DPRINTK("xen_fmr: Switching to state %s\n", + xenbus_strstate(state)); + + sts = xenbus_switch_state(dev, state); + if (sts) + printk("xen_fmr: Fail to switch state to %s\n", + xenbus_strstate(state)); + + return sts; +} + +#endif + diff --git a/drivers/net/mlx4/xen_fmr_master.c b/drivers/net/mlx4/xen_fmr_master.c new file mode 100644 index 0000000000000..671c7fa61ba7f --- /dev/null +++ b/drivers/net/mlx4/xen_fmr_master.c @@ -0,0 +1,573 @@ +/* + * Copyright (c) 2012 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include "fmr_api.h" +#include "xen_fmr.h" + +/* #define STUB_IB_CORE_DRIVER */ + +MODULE_AUTHOR("Yuval Shaia "); +MODULE_DESCRIPTION("XEN FMR API - Backend"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +/* Flags to indicate no xenbus operation until registration completed */ +int m_registered_to_xenbus; +/* Will be use to make sure all Devices were cleaned */ +struct ref_count m_dev_ref_count; +/* Will be use to make sure all VFs were cleaned */ +struct ref_count m_vf_ref_count; +/* Will be use to make sure all mapped pages were unmapped */ +struct ref_count m_map_ref_count; + +/** + * PPF Context initiated in init(), used in map() destroyed in term() + */ +struct xen_master_ppf_ctx { + struct pci_dev *pci_dev; +} xen_master_ppf_ctx; + +/** + * reset page count in struct page + */ +void reset_grant_page(struct page *page) +{ + init_page_count(page); + reset_page_mapcount(page); +} + +/** + * VF Context initiated in add_function(), used in map() destroyed in del_function() + */ +struct xen_master_vf_ctx { + struct xen_master_ppf_ctx *ppf_ctx; +} xen_master_vf_ctx; + +/** + * VPM Context initiated in map(), used and destroyed in unmap() + */ +struct xen_master_vpm_ctx { + dma_addr_t dma_addr; + struct page *pagevec[1]; /* size is always 1 */ + grant_handle_t handle; + struct xen_master_vf_ctx *vf_ctx; +} xen_master_vpm_ctx; + +/** + * Utility to unmap and free page + */ +int mlx_xen_fmr_unmap_page(grant_handle_t handle, struct page **pagevec, + int free_page_ind) +{ + int retval; + struct gnttab_unmap_grant_ref unmap_ops; + struct page *page; + phys_addr_t kaddr; + unsigned long pfn; + + page = pagevec[0]; + pfn = page_to_pfn(page); + kaddr = (phys_addr_t)pfn_to_kaddr(pfn); + + DPRINTK("xen_fmr_master: Unmapping kaddr=%llx, handle=%d\n", + (unsigned long long)kaddr, handle); + + gnttab_set_unmap_op(&unmap_ops, kaddr, GNTMAP_host_map, handle); + retval = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, + &unmap_ops, 1); + if (retval) { + printk(KERN_ERR "xen_fmr_master: Fail to unmap err=%d\n", + retval); + return -EINVAL; + } else if (unmap_ops.status != GNTST_okay) { + printk(KERN_ERR "xen_fmr_master: Fail to unmap status=%d," + " kaddr=%llx, handle=%d\n", unmap_ops.status, + (unsigned long long)kaddr, handle); + retval = -EFAULT; + } + + reset_grant_page(page); + + if (free_page_ind) { + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + clear_bit(PG_pinned, &(page->flags)); + free_xenballooned_pages(1, pagevec); + } + + return retval; +} + +/** + * Called by each HCA device on load + */ +int mlx_xen_fmr_icm_master_init(struct pci_dev *ppf, void **ppf_ctx) +{ + struct xen_master_ppf_ctx *my_ppf_ctx; + + /* Validate ppf context */ + if (unlikely(ppf == 0)) + printk(KERN_ERR "xen_fmr_master: Warning, got empty" + " PPF in icm_master_init\n"); + + /* Create and initialize device context */ + my_ppf_ctx = (struct xen_master_ppf_ctx *) + kmalloc(sizeof(struct xen_master_ppf_ctx), GFP_KERNEL); 
+ if (my_ppf_ctx == NULL) { + printk(KERN_ERR "xen_fmr_master: Fail to allocate memory" + " for device context\n"); + return -EFAULT; + } + my_ppf_ctx->pci_dev = ppf; + *ppf_ctx = my_ppf_ctx; + + ref_count_inc(&m_dev_ref_count); + + printk(KERN_INFO "xen_fmr_master: FMR-ICM Master Initialized" + " for device %d\n", (ppf != 0) ? ppf->devfn : 0); + + return 0; +} + +/** + * Called each time a new vf registers to ppf + */ +int mlx_xen_fmr_icm_master_add_function(void *ppf_ctx, + struct pci_dev *vf, + u8 *fmr_info, + void **vf_ctx) +{ + struct xen_master_vf_ctx *my_vf_ctx; + + *vf_ctx = 0; + + /* Validate PPF context */ + if (unlikely(ppf_ctx == 0)) { + printk(KERN_ERR "xen_fmr_master: Error, Got empty PPF context" + " in add_function\n"); + return -EFAULT; + } + + /* Create and initialize VF context */ + my_vf_ctx = kmalloc(sizeof(struct xen_master_vf_ctx), GFP_KERNEL); + if (my_vf_ctx == NULL) { + printk("xen_fmr_master: Fail to allocate memory" + " for VF context\n"); + return -EFAULT; + } + my_vf_ctx->ppf_ctx = (struct xen_master_ppf_ctx *)ppf_ctx; + *vf_ctx = my_vf_ctx; + + ref_count_inc(&m_vf_ref_count); + + printk(KERN_INFO "xen_fmr_master: FMR-ICM Master Initialized" + " for virtual function %d\n", + my_vf_ctx->ppf_ctx->pci_dev->devfn); + + return 0; +} + +/** + * Called each time a vf unregisters from ppf + */ +int mlx_xen_fmr_icm_master_del_function(void *vf_ctx) +{ + struct xen_master_vf_ctx *my_vf_ctx; + + /* Validate VF context */ + if (unlikely(vf_ctx == 0)) { + printk(KERN_ERR "xen_fmr_master: Error, Got empty VF context" + " in del_function\n"); + return -EFAULT; + } + my_vf_ctx = (struct xen_master_vf_ctx *)vf_ctx; + + printk(KERN_INFO "xen_fmr_master: FMR-ICM Master terminate virtual" + " function %d\n", my_vf_ctx->ppf_ctx->pci_dev->devfn); + + kfree(vf_ctx); + + ref_count_dec(&m_vf_ref_count); + + return 0; +} + +/** + * Map pages using info from vpm and returns ctx handle + */ +dma_addr_t mlx_xen_fmr_icm_master_dma_map(void *vf_ctx, struct vpm *vpm_page, + void **vpm_ctx) +{ + struct xen_vpm *xen_vpm; + struct gnttab_map_grant_ref mops; + struct xen_master_vpm_ctx *my_vpm_ctx; + struct xen_master_vf_ctx *my_vf_ctx; + struct page *page; + dma_addr_t dma_addr; + void *kaddr; + + *vpm_ctx = NULL; + page = NULL; + + /* Validate VPM */ + if (unlikely(vpm_page == 0)) { + printk(KERN_ERR "xen_fmr_master: Error," + " Got empty VPM Page in dma_map\n"); + return 0; + } + xen_vpm = (struct xen_vpm *)(vpm_page); + DPRINTK("xen_fmr_master: Mapping to page (dom_id=%d," + " addr_ref=%ld (0x%llx))\n", + be16_to_cpu(xen_vpm->dom_id), + addr_ref_be_to_cpu(xen_vpm->addr_ref), + addr_ref_be_to_cpu(xen_vpm->addr_ref)); + + /* Validate VF context */ + if (unlikely(vf_ctx == 0)) { + printk(KERN_ERR "xen_fmr_master: Error," + " Got empty VF context in dma_map\n"); + return 0; + } + my_vf_ctx = (struct xen_master_vf_ctx *)vf_ctx; + + my_vpm_ctx = (struct xen_master_vpm_ctx *) + kzalloc(sizeof(xen_master_vpm_ctx), GFP_KERNEL); + if (!my_vpm_ctx) { + printk(KERN_ERR "xen_fmr_master: Error," + " Failed to allocate vpm context\n"); + return 0; + } + /* + * If slave run in domU then we have to map to addr_ref + * otherwize addr_ref is the page address + */ + if (xen_vpm->dom_id) { + /* Allocate virtual memory area */ + alloc_xenballooned_pages(1, my_vpm_ctx->pagevec, true); + if (my_vpm_ctx->pagevec[0] == NULL) { + printk(KERN_ERR "xen_fmr_master:" + " Fail allocate virtual area\n"); + kfree(my_vpm_ctx); + return 0; + } + page = my_vpm_ctx->pagevec[0]; + set_bit(PG_pinned, &(page->flags)); + kaddr = 
pfn_to_kaddr(page_to_pfn(page)); + + /* Map to the shared page */ + gnttab_set_map_op(&mops, + (unsigned long)kaddr, + GNTMAP_host_map, + addr_ref_be_to_cpu(xen_vpm->addr_ref), + be16_to_cpu(xen_vpm->dom_id)); + if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, + &mops, 1)) { + printk(KERN_ERR "xen_fmr_master: Fail to map to page," + " status=%d gref=0x%lx va=0x%lx\n", mops.status, + (unsigned long)addr_ref_be_to_cpu(xen_vpm->addr_ref), + (unsigned long)be64_to_cpu(xen_vpm->vpm.va)); + clear_bit(PG_pinned, &(page->flags)); + free_xenballooned_pages(1, my_vpm_ctx->pagevec); + kfree(my_vpm_ctx); + return 0; + } + DPRINTK("xen_fmr_master: kaddr=%lx, pfn=0x%lx, mfn=0x%lx," + " bus_addr=0x%lx\n", + (unsigned long)kaddr, + page_to_pfn(page), + pfn_to_mfn(page_to_pfn(page)), + (unsigned long)mops.dev_bus_addr); + set_phys_to_machine(page_to_pfn(page), + FOREIGN_FRAME(mops.dev_bus_addr >> + PAGE_SHIFT)); + + DPRINTK("xen_fmr_master: kaddr=%llx, pfn=0x%lx, mfn=0x%lx," + " bus_addr=0x%llx\n", + (unsigned long long)kaddr, + page_to_pfn(page), + pfn_to_mfn(page_to_pfn(page)), + FOREIGN_FRAME(mops.dev_bus_addr >> PAGE_SHIFT)); + /* SetPageReserved(page); */ + } else { + kaddr = (void *)addr_ref_be_to_cpu(xen_vpm->addr_ref); + mops.handle = 0; + } + +#ifndef STUB_IB_CORE_DRIVER + /* Map DMA */ + dma_addr = dma_map_single(&(my_vf_ctx->ppf_ctx->pci_dev->dev), + kaddr, + PAGE_SIZE, + DMA_BIDIRECTIONAL); + if (dma_mapping_error(&(my_vf_ctx->ppf_ctx->pci_dev->dev), dma_addr)) { + printk(KERN_ERR "xen_fmr_master: Fail in map address" + " (0x%llx) for DMA\n", + (unsigned long long)kaddr); + mlx_xen_fmr_unmap_page(mops.handle, my_vpm_ctx->pagevec, 1); + kfree(my_vpm_ctx); + return 0; + } +#else + /* This will be used only on testings where we are not + connected to real driver */ + DPRINTK("xen_fmr_master: Fake device\n"); + dma_addr = (dma_addr_t)kaddr; +#endif + + /* Save context to unmap */ + my_vpm_ctx->dma_addr = dma_addr; + my_vpm_ctx->handle = mops.handle; + my_vpm_ctx->vf_ctx = vf_ctx; + *vpm_ctx = my_vpm_ctx; + + ref_count_inc(&m_map_ref_count); + + DPRINTK("xen_fmr_master: Mapped kaddr=0x%llx, dma_addr=0x%llx," + " handle=%d\n", + (unsigned long long)kaddr, + (unsigned long long)my_vpm_ctx->dma_addr, + my_vpm_ctx->handle); + + + return dma_addr; +} + +/** + * Unmap page based on ctx + */ +int mlx_xen_fmr_icm_master_dma_unmap(void *vpm_ctx) +{ + struct xen_master_vpm_ctx *my_vpm_ctx; + int err = 0; + + /* Validate VPM context */ + if (unlikely(vpm_ctx == 0)) { + printk(KERN_ERR "xen_fmr_master: Error," + " Got empty VPMF context in dma_unmap\n"); + return -EFAULT; + } + my_vpm_ctx = (struct xen_master_vpm_ctx *)vpm_ctx; + +#ifndef STUB_IB_CORE_DRIVER + /* Unmap DMA - bus is set to zero in driver stub */ + dma_unmap_single(&(my_vpm_ctx->vf_ctx->ppf_ctx->pci_dev->dev), + my_vpm_ctx->dma_addr, + PAGE_SIZE, + DMA_BIDIRECTIONAL); +#endif + + /* Unmap the page only if we mapped it */ + if (my_vpm_ctx->handle) { + if (mlx_xen_fmr_unmap_page(my_vpm_ctx->handle, + my_vpm_ctx->pagevec, 1) != 0) { + DPRINTK("xen_fmr_master: Fail to unmap %d\n", + my_vpm_ctx->handle); + err = -EFAULT; + } + } + kfree(vpm_ctx); + + ref_count_dec(&m_map_ref_count); + + DPRINTK("xen_fmr_master: Unmapped from addr\n"); + + return err; +} + +/** + * Called by each HCA before unload + */ +void mlx_xen_fmr_icm_master_term(void *ppf_ctx) +{ + struct xen_master_ppf_ctx *my_ppf_ctx; + + /* Validate ppf context */ + if (unlikely(ppf_ctx == 0)) { + printk(KERN_ERR "xen_fmr_master: Error," + " got empty PPF in icm_master_init\n"); + 
return; + } + + my_ppf_ctx = (struct xen_master_ppf_ctx *)ppf_ctx; + + printk(KERN_INFO "xen_fmr_master: FMR-ICM Master" + " terminated for device %d\n", + my_ppf_ctx->pci_dev->devfn); + + kfree(ppf_ctx); + + ref_count_dec(&m_dev_ref_count); +} + +static struct mlx4_icm_master icm_master = { + .protocol = FMR_PROTOCOL_XEN, + .vpm_info_size = XEN_VPM_SZ, + .fmr_info_size = 0, + .log_page_size = PAGE_SHIFT, + .init = mlx_xen_fmr_icm_master_init, + .add_function = mlx_xen_fmr_icm_master_add_function, + .del_function = mlx_xen_fmr_icm_master_del_function, + .dma_map = mlx_xen_fmr_icm_master_dma_map, + .dma_unmap = mlx_xen_fmr_icm_master_dma_unmap, + .term = mlx_xen_fmr_icm_master_term +}; + +/** + * + */ +static int mlx4_xen_fmr_back_remove(struct xenbus_device *dev) +{ + if (!m_registered_to_xenbus) + return 0; + + return 0; +} + +/** + * Entry point to this code when a new device is created. Allocate the basic + * structures and switch to InitWait. + */ +static int mlx4_xen_fmr_back_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + if (!m_registered_to_xenbus) + return 0; + + return 0; +} + +/** + * + */ +static void mlx4_xen_fmr_frontend_changed(struct xenbus_device *dev, + enum xenbus_state frontend_state) +{ + if (!m_registered_to_xenbus) + return; + + DPRINTK("xen_fmr_master: Domain %d change state to %s\n", + dev->otherend_id, xenbus_strstate(frontend_state)); + DPRINTK("xen_fmr_master: My state is %s\n", + xenbus_strstate(dev->state)); + + if (xenbus_printf(XBT_NIL, dev->nodename, "dom-id", "%d", + dev->otherend_id)) + printk(KERN_ERR "xen_fmr_master: Fail to write to xenbus\n"); + + mlx4_xen_fmr_switch_state(dev, XenbusStateInitialising); + mlx4_xen_fmr_switch_state(dev, XenbusStateInitWait); +} + +/** + * + */ +static const struct xenbus_device_id mlx_xen_fmr_back_ids[] = { + { XENBUS_DEVID }, + { "" } +}; +static DEFINE_XENBUS_DRIVER(mlx_xen_fmr_back, , + .probe = mlx4_xen_fmr_back_probe, + .remove = mlx4_xen_fmr_back_remove, + .otherend_changed = mlx4_xen_fmr_frontend_changed +); + +/** + * Initialize module + */ +static int __init mlx4_xen_fmr_backend_init(void) +{ + if (!xen_domain()) + return -ENODEV; + + m_registered_to_xenbus = 0; + + DPRINTK(KERN_INFO "xen_fmr_master: Registering to XENBUS\n"); + + if (xenbus_register_backend(&mlx_xen_fmr_back_driver)) { + printk(KERN_ERR "xen_fmr_master:" + " Fail to register to XENBUS\n"); + return -ENODEV; + } + m_registered_to_xenbus = 1; + + ref_count_init(&m_dev_ref_count); + ref_count_init(&m_vf_ref_count); + ref_count_init(&m_map_ref_count); + + DPRINTK(KERN_INFO "xen_fmr_master: Registering to ICM\n"); + if (mlx4_reg_icm_master(&icm_master) != 0) { + printk(KERN_ERR "xen_fmr_master: Fail to register to ICM\n"); + return -ENODEV; + } + + printk(KERN_INFO "xen_fmr_master: Initialized\n"); + + return 0; +} + +/** + * Terminate module + */ +static void __exit mlx4_xen_fmr_backend_exit(void) +{ + DPRINTK(KERN_INFO "xen_fmr_master: Going down\n"); + + DPRINTK(KERN_INFO "xen_fmr_master: Unregistering from ICM\n"); + mlx4_unreg_icm_master(&icm_master); + + DPRINTK(KERN_INFO "xen_fmr_master: Verify sharings %d\n", + m_map_ref_count.var); + ref_count_wait_for_zero(&m_map_ref_count); + DPRINTK(KERN_INFO "xen_fmr_master: Verify VFs %d\n", + m_vf_ref_count.var); + ref_count_wait_for_zero(&m_vf_ref_count); + DPRINTK(KERN_INFO "xen_fmr_master: Verify Devices %d\n", + m_dev_ref_count.var); + ref_count_wait_for_zero(&m_dev_ref_count); + + DPRINTK(KERN_INFO "xen_fmr_master: Unregistering from XENBUS\n"); + 
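/*
 * Editorial note: module unload first waits in the ref_count_wait_for_zero()
 * calls above until every grant mapping, VF context and device context has
 * been released, and only then unregisters the xenbus backend driver below.
 */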
xenbus_unregister_driver(&mlx_xen_fmr_back_driver); + + printk(KERN_INFO "xen_fmr_master: Terminated\n"); +} + +module_init(mlx4_xen_fmr_backend_init); +module_exit(mlx4_xen_fmr_backend_exit); diff --git a/drivers/net/mlx4/xen_fmr_slave.c b/drivers/net/mlx4/xen_fmr_slave.c new file mode 100644 index 0000000000000..af6f5d9f447ff --- /dev/null +++ b/drivers/net/mlx4/xen_fmr_slave.c @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2012 Oracle. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "fmr_api.h" +#include "xen_fmr.h" + +MODULE_AUTHOR("Yuval Shaia "); +MODULE_DESCRIPTION("XEN FMR API - Frontend"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION(DRV_VERSION); + +/* Indicates if we are running in dom0 or domU */ +bool m_running_on_dom_u; +/* Save running domain ID */ +domid_t m_my_dom_id; +/* Indicate no xenbus operation until registration completed */ +int m_registered_to_xenbus; +/* Will be use to make sure all shared pages were unshared */ +struct ref_count m_shr_ref_count; +/* Will be use to make sure all VFs were cleaned */ +struct ref_count m_vf_ref_count; + +/** + * Slave Context, initiated in share(), used and destroyed in + * unshare() + */ +struct xen_slave_vpm_ctx { + addr_ref_t gref; +} xen_slave_vpm_ctx; + +/** + * Called by each FV on load + */ +int mlx_xen_fmr_icm_slave_init(struct pci_dev *vf, + u8 vpm_info_size, + u8 fmr_info_size, + u8 *fmr_info, + void **vf_ctx) +{ + DPRINTK("xen_fmr_slave: FMR-ICM Slave Initializing for device %d\n", + vf->devfn); + + /* Integration check */ + if (unlikely(vpm_info_size != XEN_VPM_SZ)) { + printk(KERN_ERR "xen_fmr_slave: Invalid vpm_info_size\n"); + return -EINVAL; + } + + /* Caller trust init when context is not NULL so we malloc + dummy buffer */ + *vf_ctx = kmalloc(sizeof(int), GFP_KERNEL); + + ref_count_inc(&m_vf_ref_count); + + printk(KERN_INFO "xen_fmr_slave: FMR-ICM Slave" + " Initialized for device %d\n", vf->devfn); + + return 0; +} + +/** + * Share pages using info from vpm and returns ctx handle + */ +int mlx_xen_fmr_icm_slave_share(void *vf_ctx, + void *virt_addr, + struct vpm *vpm_page, + void **vpm_ctx) +{ + struct xen_slave_vpm_ctx *my_vpm_ctx; + struct xen_vpm *xen_vpm; + addr_ref_t addr_ref; + int res = 0; + + DPRINTK("xen_fmr_slave: Sharing page 0x%lx\n", virt_addr); + + /* Validate VPM Page */ + if (unlikely(vpm_page == 0)) { + printk(KERN_ERR "xen_fmr_slave: Error," + " Got empty VPM address in share\n"); + return -EFAULT; + } + xen_vpm = (struct xen_vpm *)(vpm_page); + + if (m_running_on_dom_u) { + /* Make sure we set dom_id */ + if (unlikely(m_my_dom_id == 0)) { + printk("xen_fmr_slave: Domain ID is not set\n"); + return -EINVAL; + } + + /* Grant access to dom0 */ + res = gnttab_grant_foreign_access(0, virt_to_mfn(virt_addr), 0); + if (res < 0) { + printk(KERN_ERR "xen_fmr_slave: Fail to share\n"); + return res; + } + addr_ref = (addr_ref_t)res; + + xen_vpm->dom_id = cpu_to_be16(m_my_dom_id); + } else { + addr_ref = (addr_ref_t)virt_addr; + xen_vpm->dom_id = 0; + } + xen_vpm->addr_ref = cpu_to_be_addr_ref(addr_ref); + + /* Save context to be used in unshare */ + my_vpm_ctx = kzalloc(sizeof(xen_slave_vpm_ctx), GFP_KERNEL); + my_vpm_ctx->gref = addr_ref; + *vpm_ctx = my_vpm_ctx; + + DPRINTK("xen_fmr_slave: Page shared '%d %ld (0x%llx)'\n", + m_my_dom_id, addr_ref, addr_ref); + + ref_count_inc(&m_shr_ref_count); + + return 0; +} + +/** + * Release pages based on ctx handle + */ +int mlx_xen_fmr_icm_slave_unshare(void *vpm_ctx) +{ + struct xen_slave_vpm_ctx *my_vpm_ctx; + int err = 0; + + /* Validate PPF context */ + if (unlikely(vpm_ctx == 0)) { + printk(KERN_ERR "xen_fmr_slave: Error," + " Got empty VPM context in unshare\n"); + return -EFAULT; + } + my_vpm_ctx = (struct xen_slave_vpm_ctx *)vpm_ctx; + + if (m_running_on_dom_u) { + /* Unshare the page */ + DPRINTK("xen_fmr_slave: Unsharing gref %d\n", my_vpm_ctx->gref); + + if (gnttab_query_foreign_access((grant_ref_t)my_vpm_ctx->gref)) { + 
DPRINTK("xen_fmr_slave: Can't release grant, ref leak!\n"); + err = -EINVAL; + /*todo: fix grant table leak */ + } else + gnttab_end_foreign_access((grant_ref_t)my_vpm_ctx->gref, 0, 0); + } + + /* Destroy context */ + kfree(vpm_ctx); + + DPRINTK("xen_fmr_slave: Shareing ended for %ld\n", my_vpm_ctx->gref); + + ref_count_dec(&m_shr_ref_count); + + return err; +} + +/** + * Called by each VF before unload + */ +void mlx_xen_fmr_icm_slave_term(void *vf_ctx) +{ + kfree(vf_ctx); + + ref_count_dec(&m_vf_ref_count); + printk(KERN_INFO "xen_fmr_slave: FMR-ICM Slave" + " Terminated for device\n"); +} + +/** + * ICM Slave interface + */ +static struct mlx4_icm_slave icm_slave = { + .protocol = FMR_PROTOCOL_XEN, + .init = mlx_xen_fmr_icm_slave_init, + .share = mlx_xen_fmr_icm_slave_share, + .unshare = mlx_xen_fmr_icm_slave_unshare, + .term = mlx_xen_fmr_icm_slave_term +}; + +/** + * xenbus "probe" event handler + */ +static int mlx4_xen_fmr_front_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + if (xenbus_scanf(XBT_NIL, dev->otherend, "dom-id", "%d", + (int *)&m_my_dom_id)) { + DPRINTK("xen_fmr_slave: My domain ID is %d\n", + (int)m_my_dom_id); + } + + if (!m_registered_to_xenbus) + return 0; + + mlx4_xen_fmr_switch_state(dev, XenbusStateInitialising); + + return 0; +} + +/** + * xenbus "backend change state" event handler + */ +static void mlx4_xen_fmr_backend_changed(struct xenbus_device *dev, + enum xenbus_state backend_state) +{ + if (!m_registered_to_xenbus) + return; + + DPRINTK("xen_fmr_slave: Domain %d change state to %s\n", + dev->otherend_id, + xenbus_strstate(backend_state)); + DPRINTK("xen_fmr_slave: My state is %s\n", + xenbus_strstate(dev->state)); + DPRINTK("xen_fmr_slave: dev->nodename=%s\n", dev->nodename); + + if (xenbus_scanf(XBT_NIL, dev->otherend, "dom-id", "%d", + (int *)&m_my_dom_id)) { + DPRINTK("xen_fmr_slave: My domain ID is %d\n", + (int)m_my_dom_id); + } + mlx4_xen_fmr_switch_state(dev, XenbusStateInitialised); + + return; +} + +/** + * xenbus "un-probe" event handler + */ +static int mlx4_xen_fmr_front_remove(struct xenbus_device *dev) +{ + if (!m_registered_to_xenbus) + return 0; + + return 0; +} + +/** + * + */ +static const struct xenbus_device_id mlx4_xen_fmr_front_ids[] = { + { XENBUS_DEVID }, + { "" } +}; +static DEFINE_XENBUS_DRIVER(mlx4_xen_fmr_front, , + .probe = mlx4_xen_fmr_front_probe, + .remove = mlx4_xen_fmr_front_remove, + .otherend_changed = mlx4_xen_fmr_backend_changed +); + +/** + * Initialize module + */ +static int __init xen_fmr_slave_init(void) +{ + m_my_dom_id = 0; + + m_running_on_dom_u = (!xen_initial_domain()); + + if (!m_running_on_dom_u) + DPRINTK(KERN_ERR "xen_fmr_slave: Running on Dom0\n"); + else + DPRINTK(KERN_ERR "xen_fmr_slave: Running on DomU\n"); + + m_registered_to_xenbus = 0; + if (m_running_on_dom_u) { + DPRINTK(KERN_INFO "xen_fmr_slave: Registering to XENBUS\n"); + if (xenbus_register_frontend(&mlx4_xen_fmr_front_driver)) { + printk(KERN_ERR "xen_fmr_slave: Fail to register to XENBUS\n"); + return -EFAULT; + } + m_registered_to_xenbus = 1; + } + + ref_count_init(&m_vf_ref_count); + ref_count_init(&m_shr_ref_count); + + DPRINTK(KERN_INFO "xen_fmr_slave: Registering to ICM\n"); + if (mlx4_reg_icm_slave(&icm_slave) != 0) { + printk(KERN_ERR "xen_fmr_slave: Fail to register to ICM\n"); + xenbus_unregister_driver(&mlx4_xen_fmr_front_driver); + return -EFAULT; + } + + /* Let backend know we are up */ + if (m_running_on_dom_u) + xenbus_printf(XBT_NIL, "device/fmr/0", "ready", "%d", 1); + + printk(KERN_INFO 
"xen_fmr_slave: Initialized\n"); + return 0; +} + +/** + * Terminate module + */ +static void __exit xen_fmr_slave_exit(void) +{ + DPRINTK(KERN_INFO "xen_fmr_slave: Going down\n"); + + /* Let backend know we are down */ + if (m_running_on_dom_u) + xenbus_printf(XBT_NIL, "device/fmr/0", "ready", "%d", 0); + + DPRINTK(KERN_INFO "xen_fmr_slave: Unregistering from ICM\n"); + mlx4_unreg_icm_slave(&icm_slave); + + DPRINTK(KERN_INFO "xen_fmr_slave: Verify sharings\n"); + ref_count_wait_for_zero(&m_shr_ref_count); + DPRINTK(KERN_INFO "xen_fmr_slave: Verify VFs\n"); + ref_count_wait_for_zero(&m_vf_ref_count); + + if (m_running_on_dom_u) { + DPRINTK(KERN_INFO "xen_fmr_slave: Unregistering from XENBUS\n"); + xenbus_unregister_driver(&mlx4_xen_fmr_front_driver); + } + + printk(KERN_INFO "xen_fmr_slave: Terminated\n"); +} + +module_init(xen_fmr_slave_init); +module_exit(xen_fmr_slave_exit); diff --git a/drivers/net/mlx4/xrcd.c b/drivers/net/mlx4/xrcd.c new file mode 100644 index 0000000000000..b4a43d8bc2983 --- /dev/null +++ b/drivers/net/mlx4/xrcd.c @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include + +#include "mlx4.h" + +int __mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + *xrcdn = mlx4_bitmap_alloc(&priv->xrcd_bitmap); + if (*xrcdn == -1) + return -ENOMEM; + + return 0; +} + +int mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn) +{ + u64 out_param; + int err; + + if (mlx4_is_mfunc(dev)) { + err = mlx4_cmd_imm(dev, 0, &out_param, + RES_XRCDN, RES_OP_RESERVE, + MLX4_CMD_ALLOC_RES, + MLX4_CMD_TIME_CLASS_A, 0); + if (err) + return err; + + *xrcdn = get_param_l(&out_param); + return 0; + } + return __mlx4_xrcd_alloc(dev, xrcdn); +} +EXPORT_SYMBOL_GPL(mlx4_xrcd_alloc); + +void __mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn) +{ + mlx4_bitmap_free(&mlx4_priv(dev)->xrcd_bitmap, xrcdn); +} + + +void mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn) +{ + u64 in_param; + int err; + + if (mlx4_is_mfunc(dev)) { + set_param_l(&in_param, xrcdn); + err = mlx4_cmd(dev, in_param, RES_XRCDN, + RES_OP_RESERVE, MLX4_CMD_FREE_RES, + MLX4_CMD_TIME_CLASS_A, 0); + if (err) + mlx4_warn(dev, "Failed to release xrcdn %d\n", xrcdn); + } else + __mlx4_xrcd_free(dev, xrcdn); +} +EXPORT_SYMBOL_GPL(mlx4_xrcd_free); + +int mlx4_init_xrcd_table(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + return mlx4_bitmap_init(&priv->xrcd_bitmap, (1 << 16), + (1 << 16) - 1, dev->caps.reserved_xrcds + 1, 0); +} + +void mlx4_cleanup_xrcd_table(struct mlx4_dev *dev) +{ + mlx4_bitmap_cleanup(&mlx4_priv(dev)->xrcd_bitmap); +} + + diff --git a/drivers/net/mlx4_vnic/Makefile b/drivers/net/mlx4_vnic/Makefile new file mode 100644 index 0000000000000..e93d37a3dbbe3 --- /dev/null +++ b/drivers/net/mlx4_vnic/Makefile @@ -0,0 +1,14 @@ +obj-$(CONFIG_MLX4_VNIC) += mlx4_vnic.o mlx4_vnic_helper.o + +mlx4_vnic_helper-y := vnic_stats_helper.o + +mlx4_vnic-y := vnic_data_main.o vnic_data_ib.o vnic_data_netdev.o vnic_data_neigh.o \ + vnic_data_fs.o vnic_data_tx.o vnic_data_ethtool.o vnic_data_rx.o \ + vnic_fip_main.o vnic_fip_ib.o vnic_fip_discover.o vnic_fip_pkt.o \ + vnic_fip_login.o vnic_fip_vhub.o vnic_mcast.o vnic_port.o \ + vnic_param.o vnic_qp.o vnic_main.o fip_parser.o \ + vnic_data_mac.o + +EXTRA_CFLAGS += -D_BP_NO_ATT_OWNER +EXTRA_CFLAGS += -D_BP_NO_MC_LIST + diff --git a/drivers/net/mlx4_vnic/fip_parser.c b/drivers/net/mlx4_vnic/fip_parser.c new file mode 100644 index 0000000000000..e1782998467c0 --- /dev/null +++ b/drivers/net/mlx4_vnic/fip_parser.c @@ -0,0 +1,510 @@ +/* + * Copyright (c) 2010 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_fip.h" +#include "vnic_fip_pkt.h" + +static const struct subcode_rules { + u64 req_mask; + u64 opt_mask; +} subcodes_array[FIP_MAX_SUBCODES] = { + [FIP_HOST_SOL_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(ADDRESS), + .opt_mask = FIP_MASK(EXT_DESC), + }, + [FIP_GW_ADV_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(ADDRESS) | + FIP_MASK(GW_INFORMATION) | + FIP_MASK(GW_IDENTIFIER) | + FIP_MASK(KA_PARAMS), + .opt_mask = FIP_MASK(EXT_DESC), + }, + [FIP_HOST_LOGIN_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(ADDRESS) | + FIP_MASK(LOGIN) | + FIP_MASK(PARTITION), + .opt_mask = FIP_MASK(EXT_DESC), + }, + [FIP_GW_LOGIN_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(ADDRESS) | + FIP_MASK(LOGIN) | + FIP_MASK(PARTITION), + .opt_mask = FIP_MASK(EXT_DESC), + }, + [FIP_HOST_LOGOUT_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(VNIC_IDENTITY), + }, + [FIP_GW_UPDATE_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(VHUB_UPDATE), + .opt_mask = FIP_MASK(EXT_DESC), + }, + [FIP_GW_TABLE_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(VHUB_TABLE), + }, + [FIP_HOST_ALIVE_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(VNIC_IDENTITY), + }, +}; + +static int type2idx(struct fip_content *fc, struct fip_fip_type *ft) +{ + void *p = ft; + + switch (ft->type) { + case FIP_TYPE(VENDOR_ID): + fc->fvend = p; + return FIP_TYPE_IDX(VENDOR_ID); + case FIP_TYPE(ADDRESS): + fc->fa.fa[fc->fa.num++] = p; + return FIP_TYPE_IDX(ADDRESS); + case FIP_TYPE(GW_INFORMATION): + fc->fgwi = p; + return FIP_TYPE_IDX(GW_INFORMATION); + case FIP_TYPE(LOGIN): + fc->fl = p; + return FIP_TYPE_IDX(LOGIN); + case FIP_TYPE(VHUB_UPDATE): + fc->fvu = p; + return FIP_TYPE_IDX(VHUB_UPDATE); + case FIP_TYPE(VHUB_TABLE): + fc->fvt = p; + return FIP_TYPE_IDX(VHUB_TABLE); + case FIP_TYPE(VNIC_IDENTITY): + fc->fvi = p; + return FIP_TYPE_IDX(VNIC_IDENTITY); + case FIP_TYPE(PARTITION): + fc->fp = p; + return FIP_TYPE_IDX(PARTITION); + case FIP_TYPE(GW_IDENTIFIER): + fc->fgid = p; + return FIP_TYPE_IDX(GW_IDENTIFIER); + case FIP_TYPE(KA_PARAMS): + fc->fka = p; + return FIP_TYPE_IDX(KA_PARAMS); + case FIP_TYPE(EXT_DESC): + fc->fed.fed[fc->fed.num++] = p; + return FIP_TYPE_IDX(EXT_DESC); + default: + return -1; + } +} + +#ifdef CONFIG_MLX4_VNIC_DEBUG +static const char *fip_type_str(int type) +{ + switch (type) { + FIP_CASE_STR(VENDOR_ID); + FIP_CASE_STR(ADDRESS); + FIP_CASE_STR(GW_INFORMATION); + FIP_CASE_STR(LOGIN); + FIP_CASE_STR(VHUB_UPDATE); + FIP_CASE_STR(VHUB_TABLE); + FIP_CASE_STR(VNIC_IDENTITY); + FIP_CASE_STR(PARTITION); + FIP_CASE_STR(GW_IDENTIFIER); + FIP_CASE_STR(KA_PARAMS); + FIP_CASE_STR(EXT_DESC); + default: + return "Unknown"; + } +} + +static const char *fip_subcode_str(int subcode) +{ + switch (subcode) { + FIP_SUBCODE_CASE_STR(FIP_HOST_SOL_SUB_OPCODE); + FIP_SUBCODE_CASE_STR(FIP_GW_ADV_SUB_OPCODE); + FIP_SUBCODE_CASE_STR(FIP_HOST_LOGIN_SUB_OPCODE); + 
FIP_SUBCODE_CASE_STR(FIP_GW_LOGIN_SUB_OPCODE); + FIP_SUBCODE_CASE_STR(FIP_HOST_LOGOUT_SUB_OPCODE); + FIP_SUBCODE_CASE_STR(FIP_GW_UPDATE_SUB_OPCODE); + FIP_SUBCODE_CASE_STR(FIP_GW_TABLE_SUB_OPCODE); + FIP_SUBCODE_CASE_STR(FIP_HOST_ALIVE_SUB_OPCODE); + default: + return "Unknown"; + } +} +#endif + +static int verify_mlx_sig(void *p) +{ + static const char *mlx4_str = "mellanox"; + __be64 mlx_str_64 = *(__be64 *)mlx4_str; + __be64 *sig = p; + + return *sig != mlx_str_64; +} + +static int next_type(struct vnic_port *port, void *tlv, int len, + struct fip_content *fc, int *sz, int *idx) +{ + struct fip_fip_type *ft; + + if (sizeof *ft > len) { + vnic_dbg_parse(port->name, "message too short\n"); + return -1; + } + ft = tlv; + vnic_dbg_parse(port->name, "TLV: type %s(%d)\n", fip_type_str(ft->type), + ft->type); + + if (!ft->length || (ft->length << 2 > len)) { + vnic_dbg_parse(port->name, "TLV does not fit in message: %s(%d) " + "tlv->len %d, remaining %d\n", fip_type_str(ft->type), + ft->type, ft->length << 2, len); + return -1; + } + + *sz = (ft->length << 2); + + *idx = type2idx(fc, ft); + if (*idx < 0) { + vnic_dbg_parse(port->name, "unknown type %d\n", ft->type); + return -1; + } + + if (ft->type == FIP_TYPE(VENDOR_ID) && verify_mlx_sig(fc->fvend->vendor_id)) { + vnic_dbg_parse(port->name, "mellanox signature check failed\n"); + return -1; + } + + if (ft->type == FIP_TYPE(VHUB_TABLE) || ft->type == FIP_TYPE(VHUB_UPDATE)) { + int cte_list_sz; + struct context_table_entry *cte_start; + + if (ft->type == FIP_TYPE(VHUB_TABLE)) { + unsigned hdr = be16_to_cpu(fc->fvt->hdr) >> 14; + + if (hdr > FIP_TABLE_HDR_ONLY) { + vnic_dbg_parse(port->name, "invalid table header %d\n", hdr); + return -1; + } + cte_list_sz = *sz - sizeof(struct fip_vhub_table_tlv); + /* Todo, the next 2 lines are commented because the size of the tbl tlv is + miscomputed in BXM versions 1.3.6-5 and it causes tables to be discarded. + In reality the size should be used with the lines intact. 
*/ + /*if (hdr == FIP_TABLE_HDR_LAST) + cte_list_sz -= 4; + */ + + cte_start = (struct context_table_entry *)(fc->fvt + 1); + } else { + cte_list_sz = *sz - sizeof(struct fip_vhub_update_tlv); + cte_start = (struct context_table_entry *)(fc->fvu + 1); + } + + + fc->cte.num = cte_list_sz / sizeof(struct context_table_entry); + fc->cte.cte = cte_start; + } + + + return 0; +} + +static inline int check_eoib_ver(struct vnic_port *port, + struct fip_eoib_ver *eoib_ver, int sz, int *len) +{ + if (unlikely(sz < sizeof *eoib_ver)) { + vnic_dbg_parse(port->name, "message too short\n"); + *len = sz; + return -ENOMEM; + } + *len = sizeof *eoib_ver; + if (unlikely(eoib_ver->version >> 4)) { + vnic_dbg_parse(port->name, "eoib version check failed: %d\n", eoib_ver->version >> 4); + return -EINVAL; + } + return 0; +} + +static void dump_raw(struct vnic_port *port, void *buf, int len) +{ + int i; + + for (i = 0; i < len / 4; ++i) + vnic_dbg_parse(port->name, "0x%08x\n", be32_to_cpu(((__be32 *)(buf))[i])); +} + +static inline int check_fip_hdr(struct vnic_port *port, + struct fip_header_simple *fh, int sz, int *len) +{ + if (unlikely(sizeof *fh > sz)) { + vnic_dbg_parse(port->name, "message too short\n"); + return -1; + } + + if (unlikely(fh->opcode != cpu_to_be16(EOIB_FIP_OPCODE))) { + vnic_dbg_parse(port->name, "not fip opcode\n"); + return -1; + } + + if (unlikely((be16_to_cpu(fh->list_length) << 2) > (sz - sizeof *fh))) { + vnic_dbg_parse(port->name, "message too short: header length = %u, " + "left length = %lu\n", + be16_to_cpu(fh->list_length) << 2, sz - sizeof *fh); + return -1; + } + + *len = sizeof *fh; + + return 0; +} + +static int check_fip_mask(struct vnic_port *port, struct fip_content *fc) +{ + u64 req_mask = subcodes_array[fc->fh->subcode].req_mask; + u64 opt_mask = subcodes_array[fc->fh->subcode].opt_mask; + + if (((fc->mask & req_mask) != req_mask) || + ((fc->mask & ~opt_mask) & ~req_mask)) { + vnic_dbg_parse(port->name, "%s: mask check failed: mask 0x%llx," + "req_mask 0x%llx, opt_mask 0x%llx\n", + fip_subcode_str(fc->fh->subcode), fc->mask, req_mask, opt_mask); + return -1; + } + + return 0; +} + +static void dump_cte(struct vnic_port *port, struct context_table_entry *cte) +{ + vnic_dbg_parse(port->name, "CTE: V(%d) RSS(%d) type(%d) MAC(%pM) QPN(0x%06x) SL(%d) LID(0x%04x)\n", + (0x1 & (cte->v_rss_type >> 7)), + (0x1 & (cte->v_rss_type >> 6)), + (cte->v_rss_type & 0xf), + cte->mac, be32_to_cpu(cte->qpn) & 0xffffff, + (cte->sl & 0xf), be16_to_cpu(cte->lid)); +} + +static void dump_vnic_identity(struct vnic_port *port, + struct fip_vnic_identity_tlv *fvi) +{ +#define VHUB_ID be32_to_cpu(fvi->flags_vhub_id) + + vnic_dbg_parse(port->name, "%s: U(%d) R(%d) VP(%d) VHUBID(x%x) TUSN(0x%x) VNIC_ID(0x%x)" + "MAC(%pM) GUID("GUID_FORMAT") VNIC NAME (%s)\n", + fip_type_str(fvi->ft.type), (VHUB_ID >> 31), (0x01 & (VHUB_ID >> 30)), + (0x01 & (VHUB_ID >> 24)), VHUB_ID & 0xffffff, be32_to_cpu(fvi->tusn), + be16_to_cpu(fvi->vnic_id), fvi->mac, GUID_ARG(fvi->port_guid), fvi->vnic_name); +} + +static void dump_vnic_partition(struct vnic_port *port, struct fip_partition_tlv *fp) +{ + vnic_dbg_parse(port->name, "%s: PKEY(0x%x)\n", fip_type_str(fp->ft.type), + be16_to_cpu(fp->pkey)); +} + + +static void dump_gw_identifier(struct vnic_port *port, struct fip_gw_identifier_tlv *fgid) +{ + vnic_dbg_parse(port->name, "%s: SYS GUID("GUID_FORMAT") SYS NAME(%s) GW PORT NAME(%s)\n", + fip_type_str(fgid->ft.type), GUID_ARG(fgid->sys_guid), fgid->sys_name, fgid->sys_name); +} + +static void dump_ka_params(struct 
vnic_port *port, struct fip_ka_params_tlv *fka) +{ + vnic_dbg_parse(port->name, "%s: GW_ADV_PERIOD(%d) GW_KA_PERIOD(%d) VNIC_KA_PERIOD(%d)\n", + fip_type_str(fka->ft.type), be32_to_cpu(fka->adv_period), + be32_to_cpu(fka->ka_period), be32_to_cpu(fka->vnic_ka_period)); +} + +static void dump_vhub_table(struct vnic_port *port, struct fip_content *fc) +{ + int i; + + vnic_dbg_parse(port->name, "%s: VP(%d) vhub id(0x%x) TUSN(0x%x) HDR(%d) table size (%d)\n", + fip_type_str(fc->fvt->ft.type), be32_to_cpu(fc->fvt->vp_vhub_id) >> 24 & 1, + be32_to_cpu(fc->fvt->vp_vhub_id) & 0xffffff, be32_to_cpu(fc->fvt->tusn), + be16_to_cpu(fc->fvt->hdr) >> 14, be16_to_cpu(fc->fvt->table_size)); + for (i = 0; i < fc->cte.num; ++i) + dump_cte(port, &fc->cte.cte[i]); +} + +static void dump_fip_login(struct vnic_port *port, struct fip_login_tlv *p) +{ + vnic_dbg_parse(port->name, "%s: mtu(%d) vnic_id(0x%x) v_m_vp_h(0x%x) vlan(0x%x) mac(%pM)" + "mgid_prefix("MGID_PREFIX_FMT") vfields(0x%0x) syndrom(%d) QPN(0x%x)" + " vnic_name(%s)\n", fip_type_str(p->ft.type), be16_to_cpu(p->mtu), + be16_to_cpu(p->vnic_id), be16_to_cpu(p->flags_vlan) >> 12, + be16_to_cpu(p->flags_vlan) & 0xfff, p->mac, MGID_PRE_ARG(p->eth_gid_prefix), + be16_to_cpu(p->vfields), be32_to_cpu(p->syndrom_ctrl_qpn) >> 24, + be32_to_cpu(p->syndrom_ctrl_qpn) & 0xffffff, p->vnic_name); +} + +static void dump_fip_address(struct vnic_port *port, struct fip_address_tlv *fa) +{ + vnic_dbg_parse(port->name, "%s: GW_TYPE(%d) QPN(0x%x) SL(%d), GW_PORT_ID(0x%x)," + " LID(0x%x) GUID(" GUID_FORMAT ")\n", fip_type_str(fa->ft.type), + be32_to_cpu(fa->gwtype_qpn) >> 24, be32_to_cpu(fa->gwtype_qpn) & 0xffffff, + be16_to_cpu(fa->sl_gwportid) >> 12, be16_to_cpu(fa->sl_gwportid) & 0xfff, + be16_to_cpu(fa->lid), GUID_ARG(fa->guid)); +} + +static void dump_vhub_update(struct vnic_port *port, struct fip_content *fc) +{ +#define VHUB_ID_1 be32_to_cpu(fc->fvu->state_vhub_id) + int i; + + vnic_dbg_parse((port->name), "%s: eport_state(%s) vp(%d) vhub_id(0x%x) tusn(0x%x)\n", + fip_type_str(fc->fvu->ft.type), eport_state_str(VHUB_ID_1 >> 28 & 3), + VHUB_ID_1 >> 24 & 1, VHUB_ID_1 & 0xffffff, be32_to_cpu(fc->fvu->tusn)); + for (i = 0; i < fc->cte.num; ++i) + dump_cte(port, &fc->cte.cte[i]); +} + +static void dump_gateway_information(struct vnic_port *port, + struct fip_gw_information_tlv *fgwi) +{ + vnic_dbg_parse(port->name, "%s: accept host administered(%s) nmac_mgid(%d) " + "nrss_mgid(%d) ntss_qpn(%d), n_rss(%d), num_net_vnics(%d)\n", + fip_type_str(fgwi->ft.type), (fgwi->h_nmac_mgid >> 7) ? 
"Yes" : "No", + fgwi->h_nmac_mgid & 0x3f, fgwi->n_rss_mgid_tss_qpn >> 4, + fgwi->n_rss_mgid_tss_qpn & 0xf, be16_to_cpu(fgwi->n_rss_qpn_vnics) >> 12, + be16_to_cpu(fgwi->n_rss_qpn_vnics) & 0xfff); +} + +static void dump_fip_packet(struct vnic_port *port, struct fip_content *fc) +{ + int i; + + for (i = 0; i < fc->fa.num; ++i) + dump_fip_address(port, fc->fa.fa[i]); + + if (fc->fgwi) + dump_gateway_information(port, fc->fgwi); + + if (fc->fvu) + dump_vhub_update(port, fc); + + if (fc->fl) + dump_fip_login(port, fc->fl); + + if (fc->fvt) + dump_vhub_table(port, fc); + + if (fc->fvi) + dump_vnic_identity(port, fc->fvi); + + if (fc->fp) + dump_vnic_partition(port, fc->fp); + + if (fc->fgid) + dump_gw_identifier(port, fc->fgid); + + if (fc->fka) + dump_ka_params(port, fc->fka); +} + +int fip_packet_parse(struct vnic_port *port, void *packet, int pkt_size, struct fip_content *fc) +{ + void *ptr = packet; + int len; + int err; + int idx; + u16 offset = 0; + int size = pkt_size; + + vnic_dbg_parse(port->name, "size = %d\n", size); + err = check_eoib_ver(port, ptr, size, &len); + if (err) { + if (err != -EINVAL) + goto out_err; + else + vnic_dbg_parse(port->name, "version check failed\n"); + } + + fc->eoib_ver = ptr; + size -= len; + ptr += len; + offset += len; + fc->fh = ptr; + + err = check_fip_hdr(port, ptr, size, &len); + if (err) + goto out_err; + + ptr += len; + offset += len; + + fc->fa.num = 0; + fc->num = 0; + fc->mask = 0; + + /* workaround a BXM bug not reporting the correct descriptor length */ + if (fc->fh->subcode != FIP_GW_ADV_SUB_OPCODE) + size = be16_to_cpu(fc->fh->list_length) << 2; + else + size -= len; + + vnic_dbg_parse(port->name, "subcode = %s, size %d\n", + fip_subcode_str(fc->fh->subcode), size); + while (size > 0) { + err = next_type(port, ptr, size, fc, &len, &idx); + if (err) + break; + + fc->offsets[fc->num] = offset; + fc->mask |= ((u64)1 << idx); + ptr += len; + size -= len; + offset += len; + fc->num++; + } + + if (err) + goto out_err; + + err = check_fip_mask(port, fc); + if (err) { + vnic_dbg_parse(port->name, "check mask: failed\n"); + goto out_err; + } + + dump_fip_packet(port, fc); + + return 0; + +out_err: + dump_raw(port, packet, pkt_size); + return err; +} diff --git a/drivers/net/mlx4_vnic/vnic.h b/drivers/net/mlx4_vnic/vnic.h new file mode 100644 index 0000000000000..d762442d752f8 --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic.h @@ -0,0 +1,1385 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef VNIC_H +#define VNIC_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* for mlx4_ib dev attr, used also in vnic_qp.c */ +#include "../../infiniband/hw/mlx4/mlx4_ib.h" +#include "../../infiniband/hw/mlx4/user.h" + +#include "vnic_utils.h" + +/* driver info definition */ +#define DRV_NAME "mlx4_vnic" +#define DRV_VER "1.3.9" +#define DRV_LIC "Dual BSD/GPL" +#define DRV_DESC "Mellanox BridgeX Virtual NIC Driver" +#define DRV_AUTH "Ali Ayoub & Gabi Liron" + +/* externs */ +extern u32 vnic_msglvl; +extern u32 vnic_max_tx_outs; +extern u32 vnic_lro_num; +extern u32 vnic_mcast_create; +extern u32 vnic_net_admin; +extern u32 vnic_child_max; +extern u32 vnic_napi_weight; +extern u32 vnic_linear_small_pkt; +extern u32 vnic_tx_rings_num; +extern u32 vnic_rx_rings_num; +extern u32 vnic_tx_rings_len; +extern u32 vnic_rx_rings_len; +extern u32 vnic_mgid_data_type; +extern u32 vnic_encap_headroom; +extern u32 vnic_tx_polling; +extern u32 vnic_rx_linear; +extern u32 vnic_change_mac; +extern u32 vnic_learn_mac_enabled; +extern u32 vnic_synd_backlog; +extern u32 vnic_eport_state_enforce; +extern u32 vnic_src_mac_enforce; +extern u32 vnic_inline_tshold; + +#define MAX_NUM_PKEYS_DISCOVERY (24) +#define ILLEGAL_PKEY_INDEX (0xFFFF) +extern u32 vnic_discovery_pkeys[MAX_NUM_PKEYS_DISCOVERY]; +extern u32 vnic_discovery_pkeys_count; + +extern u32 vnic_gid_index; + +extern u32 no_bxm; + +extern struct workqueue_struct *port_wq; +extern struct workqueue_struct *fip_wq; +extern struct workqueue_struct *mcast_wq; +extern struct workqueue_struct *login_wq; + +extern struct ib_sa_client vnic_sa_client; + +/* definitions */ +#define VNIC_CNT_MAX 32 +#define VNIC_DESC_LEN (64 + 4) +#define VNIC_NAME_LEN 16 /* by spec, use IFNAMSIZ for OS */ +#define VNIC_SYSFS_FLEN (VNIC_NAME_LEN * 2) /* SYSFS file name len, allow pre/suffix (32)*/ +#define VNIC_SYSFS_LLEN 64 +#define VNIC_VENDOR_LEN 8 +#define GID_LEN 16 +#define GUID_LEN 8 +#define IPV4_LEN 4 +#define IPV6_LEN 16 +#define VNIC_SYSTEM_NAME_LEN 32 +#define VNIC_GW_PORT_NAME_LEN 8 +#define GID_PREFIX_LEN 5 +#define VNIC_MAX_DENTRIES 16 +#define VNIC_ID_LEN 16 +#define VNIC_CHILD_MAX 128 +#define VNIC_MAX_RETRIES 0 /* zero = unlimited */ +#define VNIC_WATCHDOG_TIMEOUT (25 * HZ) /* 25 sec */ +#define VNIC_NAPI_SCHED_TIMEOUT (5) +#define FIP_MAX_VNICS_PER_GW (1 << 9) +#define NOT_AVAILABLE_NUM (-1) +#define NOT_AVAILABLE_STRING "N/A" +#define is_valid_str(str) (strcmp(str, NOT_AVAILABLE_STRING)) +#define is_valid_num(num) (num != NOT_AVAILABLE_NUM) +#define is_valid_guid(arr) (!!(*((u64 *)(arr)))) +#define is_valid_ipv4(arr) (!!(*((u32 *)(arr)))) +#define is_mcast_promisc(login) (!(login->n_mac_mcgid)) +#define is_ucast_promisc(login) (!!(login->dev->flags & IFF_PROMISC)) +#define ARRAY_LEN(_x) (sizeof(_x)/sizeof(_x[0])) + +/* TODO: cleanup VNIC_GID_RAW_ARG and friends */ +#define VNIC_GID_RAW_ARG(gid) ((u8 *)(gid))[0], \ + ((u8 *)(gid))[1], \ + ((u8 *)(gid))[2], \ + ((u8 *)(gid))[3], \ + ((u8 *)(gid))[4], \ + ((u8 *)(gid))[5], \ + ((u8 *)(gid))[6], \ + ((u8 *)(gid))[7], \ + 
((u8 *)(gid))[8], \ + ((u8 *)(gid))[9], \ + ((u8 *)(gid))[10],\ + ((u8 *)(gid))[11],\ + ((u8 *)(gid))[12],\ + ((u8 *)(gid))[13],\ + ((u8 *)(gid))[14],\ + ((u8 *)(gid))[15] +#define VNIC_GUID_RAW_ARG(gid) ((u8 *)(gid))[0], \ + ((u8 *)(gid))[1], \ + ((u8 *)(gid))[2], \ + ((u8 *)(gid))[3], \ + ((u8 *)(gid))[4], \ + ((u8 *)(gid))[5], \ + ((u8 *)(gid))[6], \ + ((u8 *)(gid))[7] + +#define VNIC_GID_ARG(gid) VNIC_GID_RAW_ARG((gid).raw) +#define VNIC_GID_FMT "%.2x:%.2x:%.2x:%.2x:" \ + "%.2x:%.2x:%.2x:%.2x:" \ + "%.2x:%.2x:%.2x:%.2x:" \ + "%.2x:%.2x:%.2x:%.2x" +#define VNIC_GUID_FMT "%.2x:%.2x:%.2x:%.2x:" \ + "%.2x:%.2x:%.2x:%.2x" + +#define MAC_6_PRINT_FMT "%.2x:%.2x:%.2x:%.2x:" \ + "%.2x:%.2x" +#define MAC_6_PRINT_ARG(mac) (mac)[0], (mac)[1], (mac)[2], \ + (mac)[3], (mac)[4], (mac)[5] + +#define IP_4_PRINT_FMT "%d.%d.%d.%d" +#define IP_4_PRINT_ARG(ip) (ip)[0], (ip)[1], (ip)[2], (ip)[3] + +#define CREATE_VHUB_ID(be_vlan, port_id) \ + ((be16_to_cpu(be_vlan) & 0xFFF) | (((port_id) & 0xFFF) << 12)) +#define CREATE_VHUB_ID_BE(vlan, port_id) \ + cpu_to_be32(CREATE_VHUB_ID(vlan, port_id)) +#define ROUNDUP_LOG2(x) ilog2(roundup_pow_of_two(x)) + +#define VNIC_RX_COAL_TARGET 0x20000 +#define VNIC_RX_COAL_TIME 0x10 +#define VNIC_TX_COAL_PKTS 64 +#define VNIC_TX_COAL_TIME 0x80 +#define VNIC_RX_RATE_LOW 400000 +#define VNIC_RX_COAL_TIME_LOW 0 +#define VNIC_RX_RATE_HIGH 450000 +#define VNIC_RX_COAL_TIME_HIGH 128 +#define VNIC_RX_SIZE_THRESH 1024 +#define VNIC_RX_RATE_THRESH (1000000 / VNIC_RX_COAL_TIME_HIGH) +#define VNIC_SAMPLE_INTERVAL 0 +#define VNIC_AVG_PKT_SMALL 256 +#define VNIC_AUTO_CONF 0xffff +#define VNIC_MCAST_MAX_RETRY 60 +#define VNIC_MCAST_ULIMIT_RETRY 0 +#define VNIC_MCAST_BACKOF_FAC 2 +#define MLX4_DEV_CAP_FLAG_UD_SWP (1 << 28) +#define VNIC_ETHTOOL_LINE_MAX 32 +#define VNIC_ENCAP_LEN 4 +#define VNIC_MAX_TX_SIZE 2048 +#define VNIC_MAX_RX_SIZE 4096 +#define ETH_LLC_SNAP_SIZE 8 + +#define VNIC_MCAST_BACKOFF_MSEC 1000 +#define VNIC_MCAST_BACKOFF_MAX_MSEC 16000 +#define VNIC_SM_HEADSTART 1000 /* time to actually start handling SM handover event */ + +#define SYSFS_VLAN_ID_NO_VLAN (-1) + +#define VNIC_MAX_PAYLOAD_SIZE 4096 +#define VNIC_BUF_SIZE(_port) (min(_port->max_mtu_enum + \ + IB_GRH_BYTES, VNIC_MAX_PAYLOAD_SIZE)) + +#define VNIC_TX_QUEUE_LEN 1024 /* default, tuneable */ +#define VNIC_TX_QUEUE_LEN_MIN 64 +#define VNIC_TX_QUEUE_LEN_MAX (8 * 1024) + +#define VNIC_RX_QUEUE_LEN 2048 /* default, tuneable */ +#define VNIC_RX_QUEUE_LEN_MIN 64 +#define VNIC_RX_QUEUE_LEN_MAX (8 * 1024) + + +#define VNIC_MODER_DELAY (HZ / 4) +#define VNIC_STATS_DELAY VNIC_MODER_DELAY + +#define VNIC_AH_SL_DEFAULT 0x0 + +#define VNIC_DATA_QKEY 0x80020003 +#define VNIC_FIP_QKEY 0x80020002 +#define VNIC_VLAN_OFFSET(login) (login->vlan_used ? VLAN_HLEN : 0) +#define VNIC_VLAN_ENABLED(login) (login->vlan_used ? 
1 : 0) +#define VNIC_MAX_TX_CQE 32 /* default, tuneable */ +#define VNIC_MAX_RX_CQE 64 /* default, tuneable */ +#define VNIC_MAX_NUM_CPUS 32 +#define VNIC_MAX_INLINE_TSHOLD 512 + +#define VNIC_EOIB_HDR_VER 0x0 +#define VNIC_EOIB_HDR_SIG 0x3 +#define VNIC_EOIB_HDR_UDP_CHK_OK 0x2 +#define VNIC_EOIB_HDR_TCP_CHK_OK 0x1 +#define VNIC_EOIB_HDR_IP_CHK_OK 0x1 + +#define VNIC_EOIB_HDR_GET_IP_CHK(eoib_hdr) (eoib_hdr->encap_data & 0x3) +#define VNIC_EOIB_HDR_GET_TCP_UDP_CHK(eoib_hdr) ((eoib_hdr->encap_data >> 2) & 0x3) +#define VNIC_EOIB_HDR_GET_VER(eoib_hdr) ((eoib_hdr->encap_data >> 4) & 0x3) +#define VNIC_EOIB_HDR_GET_SIG(eoib_hdr) ((eoib_hdr->encap_data >> 6) & 0x3) + +#define VNIC_EOIB_HDR_SET_IP_CHK_OK(eoib_hdr) (eoib_hdr->encap_data = \ + (eoib_hdr->encap_data & 0xFC) | VNIC_EOIB_HDR_IP_CHK_OK) +#define VNIC_EOIB_HDR_SET_TCP_CHK_OK(eoib_hdr) (eoib_hdr->encap_data = \ + (eoib_hdr->encap_data & 0xF3) | (VNIC_EOIB_HDR_TCP_CHK_OK << 2)) +#define VNIC_EOIB_HDR_SET_UDP_CHK_OK(eoib_hdr) (eoib_hdr->encap_data = \ + (eoib_hdr->encap_data & 0xF3) | (VNIC_EOIB_HDR_UDP_CHK_OK << 2)) + +#define VNIC_IP_CSUM_OK(eoib_hdr) ((VNIC_EOIB_HDR_GET_IP_CHK(eoib_hdr)) == VNIC_EOIB_HDR_IP_CHK_OK) +#define VNIC_TCP_CSUM_OK(eoib_hdr) ((VNIC_EOIB_HDR_GET_TCP_UDP_CHK(eoib_hdr)) == VNIC_EOIB_HDR_TCP_CHK_OK) +#define VNIC_UDP_CSUM_OK(eoib_hdr) ((VNIC_EOIB_HDR_GET_TCP_UDP_CHK(eoib_hdr)) == VNIC_EOIB_HDR_UDP_CHK_OK) +#define VNIC_CSUM_OK(eoib_hdr) (VNIC_IP_CSUM_OK(eoib_hdr) && \ + (VNIC_TCP_CSUM_OK(eoib_hdr) || \ + VNIC_UDP_CSUM_OK(eoib_hdr))) +#define VNIC_EOIB_ZLEN_MAX (ETH_ZLEN + VNIC_ENCAP_LEN + VLAN_HLEN) + +#define VNIC_SKB_GET_HASH(_skb, _max) ((*(u32 *)(_skb->cb + sizeof _skb->cb - 4)) % _max) +#define VNIC_SKB_SET_HASH(_skb, _hash) ((*(u32 *)(_skb->cb + sizeof _skb->cb - 4)) = _hash) +#define VNIC_SKB_GET_ENCAP_CB(_skb) ((struct eoibhdr *)(_skb->cb + sizeof _skb->cb - 12)) +#define VNIC_SKB_GET_ENCAP(_skb) (vnic_encap_headroom ? (struct eoibhdr *)(_skb->data) : VNIC_SKB_GET_ENCAP_CB(_skb)) +#define VNIC_SKB_GET_ENCAP_OFFSET (vnic_encap_headroom ? VNIC_ENCAP_LEN :0) + +#define VNIC_NEIGH_GET_DQPN(_skb, _neighe) ((_neighe->rss) ? 
(_neighe->qpn + \ + VNIC_SKB_GET_HASH(_skb, _neighe->login->qps_num)) : (_neighe->qpn)) + +#define vnic_netdev_priv(netdev) (((struct vnic_login_info *)netdev_priv(netdev))->login) +#ifndef _BP_NETDEV_NO_TMQ /* >= 2.6.27 */ +#define VNIC_TXQ_GET_HASH(_skb, _max) (skb_get_queue_mapping(_skb)) +#define VNIC_TXQ_ALLOC_NETDEV(sz, nm, sp, qm) alloc_netdev_mq(sz, nm, sp, qm) +#define VNIC_TXQ_SET_ACTIVE(login, num) (login->dev->real_num_tx_queues = \ + login->real_tx_rings_num = \ + login->ndo_tx_rings_num = num) +#define VNIC_TXQ_GET_ACTIVE(login) (login->real_tx_rings_num) +#define VNIC_TXQ_GET(tx_res) netdev_get_tx_queue(tx_res->login->dev, tx_res->index) +#define VNIC_TXQ_STOP(tx_res) netif_tx_stop_queue(VNIC_TXQ_GET(tx_res)) +#define VNIC_TXQ_STOP_ALL(login) netif_tx_stop_all_queues(login->dev) +#define VNIC_TXQ_START(tx_res) netif_tx_start_queue(VNIC_TXQ_GET(tx_res)) +#define VNIC_TXQ_START_ALL(login) netif_tx_start_all_queues(login->dev) +#define VNIC_TXQ_STOPPED(tx_res) netif_tx_queue_stopped(VNIC_TXQ_GET(tx_res)) +#define VNIC_TXQ_WAKE(tx_res) netif_tx_wake_queue(VNIC_TXQ_GET(tx_res)) +#else +#define VNIC_TXQ_GET_HASH(skb, _max) VNIC_SKB_GET_HASH(skb, _max) +#define VNIC_TXQ_ALLOC_NETDEV(sz, nm, sp, qm) alloc_netdev(sz, nm, sp) +#define VNIC_TXQ_SET_ACTIVE(login, num) do { login->real_tx_rings_num = num; \ + login->ndo_tx_rings_num = 1; \ + } while (0) +#define VNIC_TXQ_GET_ACTIVE(login) (login->real_tx_rings_num) +#define VNIC_TXQ_STOP(tx_res) netif_stop_queue(tx_res->login->dev) +#define VNIC_TXQ_STOP_ALL(login) netif_stop_queue(login->dev) +#define VNIC_TXQ_START(tx_res) netif_start_queue(tx_res->login->dev) +#define VNIC_TXQ_START_ALL(login) netif_start_queue(login->dev) +#define VNIC_TXQ_STOPPED(tx_res) netif_queue_stopped(tx_res->login->dev) +#define VNIC_TXQ_WAKE(tx_res) netif_wake_queue(tx_res->login->dev) +#endif + +#define VNIC_ALLOC_ORDER 2 +#define VNIC_ALLOC_SIZE (PAGE_SIZE << VNIC_ALLOC_ORDER) +#define VNIC_MAX_LRO_AGGR 64 +#define VNIC_MAX_RX_FRAGS 4 +#define VNIC_MAX_TX_FRAGS (MAX_SKB_FRAGS + 2) +#define VNIC_MGID_PREFIX_LEN 5 + +/* TODO, when set VNIC_MAX_TX_OUTS to 16, + * noticed that the last CQE overwrites the first one + */ +#define VNIC_MAX_TX_OUTS 8 /* default, tuneable */ +#define VNIC_MAX_LRO_DESCS 32 /* default, tuneable */ +#define VNIC_EOIB_HDR_SIZE (IB_GRH_BYTES + VNIC_ENCAP_LEN) +#define SMALL_PACKET_SIZE (256 - NET_IP_ALIGN) +#define HEADER_COPY_SIZE (128 - NET_IP_ALIGN) +#define MAX_HEADER_SIZE 64 + +#define LAG_MAP_TABLE_SIZE 32 +#define MAX_LAG_MEMBERS 16 + +#define VNIC_FW_STR_MAX VNIC_ETHTOOL_LINE_MAX +#define VNIC_FW_STR(u64_fw_ver, str) \ +do { \ + snprintf(str, VNIC_FW_STR_MAX, "%d.%d.%d", \ + (int)(u64_fw_ver >> 32), \ + (int)(u64_fw_ver >> 16) & 0xffff, \ + (int)(u64_fw_ver & 0xffff)); \ +} while (0); +#define VNIC_STR_STRIP(str) \ +do { \ + int i; \ + for (i = 0; i < strlen(str); ++i) \ + str[i] = str[i] == '\n' ? 
' ' : str[i]; \ +} while (0); + +/* well known addresses */ +static const u8 ETH_BCAST_MAC[] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}; + +static const u8 ETH_ZERO_MAC[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +/* this used in no_bxm mode only */ +static const u8 NO_BXM_MGID_PREFIX[] = { + 0xff, 0x13, 0xe0, 0x1b, 0x00 +}; + +#define IS_ZERO_MAC(mac) (!memcmp((mac), ETH_ZERO_MAC, ETH_ALEN)) +#define IS_BCAST_MAC(mac) (!memcmp((mac), ETH_BCAST_MAC, ETH_ALEN)) +#define IS_MCAST_MAC(mac) (((unsigned char *)(mac))[0] & 0x01) +#define IS_UCAST_MAC(mac) (!(IS_MCAST_MAC(mac))) + +struct mcast_root { + struct rb_root mcast_tree; + spinlock_t mcast_rb_lock; + struct list_head reattach_list; +}; + +/* structs */ +struct vnic_port_stats { + unsigned long gro_held; + unsigned long gro_merged; + unsigned long gro_normal; + unsigned long gro_drop; + unsigned long lro_aggregated; + unsigned long lro_flushed; + unsigned long lro_no_desc; + unsigned long tso_packets; + unsigned long queue_stopped; + unsigned long wake_queue; + unsigned long tx_timeout; + unsigned long rx_chksum_good; + unsigned long rx_chksum_none; + unsigned long tx_chksum_offload; + unsigned long sig_ver_err; + unsigned long vlan_err; + unsigned long shared_packets; + unsigned long runt_packets; + unsigned long realloc_packets; + unsigned long gw_tx_packets; + unsigned long gw_tx_bytes; +}; + +#define VNIC_STATS_DO_ADD(var, val) ((var) += (unsigned long)(val)) +#define VNIC_STATS_DO_INC(var) (++(var)) +#ifdef VNIC_EXTRA_STATS /* for performance */ +#define VNIC_STATS_ADD(var, val) ((var) += (unsigned long)(val)) +#define VNIC_STATS_INC(var) (++(var)) +#else +#define VNIC_STATS_ADD(var, val) do { } while (0) +#define VNIC_STATS_INC(var) do { } while (0) +#endif + +enum { + MCAST_ATTACHED, + MCAST_JOINED, + MCAST_JOIN_STARTED, + MCAST_JOIN_RUNNING, + MCAST_ATTACH_RUNNING, +}; + +struct vnic_port_mcast { + struct rb_node rb_node; + struct list_head list; + union ib_gid gid; + struct vnic_port *port; + struct completion leave_complete; + struct completion join_event_complete; + struct ib_sa_multicast *sa_mcast; + struct ib_sa_mcmember_rec rec; + + atomic_t ref_cnt; + struct delayed_work join_task; + struct work_struct leave_task; + unsigned long join_task_cnt; + long int state; + spinlock_t lock; + u8 join_state; + /* IN */ + unsigned long backoff; + unsigned long backoff_init; + unsigned long backoff_factor; + unsigned long retry; + u16 pkey; + u32 qkey; + u8 create; +}; + +struct vnic_mcast { + struct vnic_port_mcast *port_mcaste; + u32 qkey; + u16 pkey; + struct ib_qp *qp; + struct vnic_port *port; + struct ib_ah *ah; + struct completion attach_complete; + struct delayed_work attach_task; + struct delayed_work detach_task; + unsigned long attach_task_cnt; + struct rb_node rb_node; + struct list_head list; /* used when delete all */ + /* IN */ + u8 mac[ETH_ALEN]; + union ib_gid gid; + union ib_gid port_gid; + unsigned long backoff; + unsigned long backoff_init; + unsigned backoff_factor; + unsigned long retry; + unsigned long state; + u8 blocking; + void *attach_cb_ctx; + void *detach_cb_ctx; + void (*attach_cb) (struct vnic_mcast *mcaste, void *ctx); + void (*detach_cb) (struct vnic_mcast *mcaste, void *ctx); + u8 create; + u8 join_state; + void *priv_data; + spinlock_t lock; + int attach_bit_nr; + unsigned long *req_attach; + unsigned long *cur_attached; + int sender_only; +}; + +struct vnic_mac { + struct rb_node rb_node; /* list or RB tree */ + struct list_head list; + u16 vnic_id; /* needed for vnic child removal */ + u8 
mac[ETH_ALEN]; /* key */ + unsigned long created; + unsigned long last_tx; // use jiffies_to_timeval +}; + +struct lag_properties { + u16 hash_mask; + u8 weights_policy; + u8 ca; /* conjestion aware */ + u8 ca_thresh; +}; + +struct vnic_neigh { + struct neighbour *neighbour; + struct ib_ah *ah; + struct vnic_login *login; + struct rb_node rb_node; + u32 qpn; + u16 lid; + u8 mac[ETH_ALEN]; + u8 rss; + u16 info; +}; + +enum lag_gw_state { + GW_MEMBER_INFO_CREATED = 1 << 0, + GW_MEMBER_INFO_EPORT_UP = 1 << 1, + GW_MEMBER_INFO_MCAST = 1 << 2, + GW_MEMBER_INFO_MAPPED = 1 << 3, +}; + +struct vnic_gw_info { + enum lag_gw_state info; + int member_id; + u16 gw_id; + struct vnic_neigh neigh; +}; + +struct vnic_sysfs_attr { + void *ctx; + struct kobject *kobj; + unsigned long data; + char name[VNIC_SYSFS_FLEN]; + struct module_attribute dentry; + struct device *dev; +}; + +enum gw_ext_lag_hash_policy { + GW_LAG_HASH_DMAC = 1 << 0, + GW_LAG_HASH_SMAC = 1 << 1, + GW_LAG_HASH_TPID = 1 << 2, /* ethertype */ + GW_LAG_HASH_VID = 1 << 3, + GW_LAG_HASH_SIP = 1 << 4, + GW_LAG_HASH_DIP = 1 << 5, + GW_LAG_HASH_IP_NEXT = 1 << 6, + GW_LAG_HASH_SPORT = 1 << 7, + GW_LAG_HASH_DPORT = 1 << 8, + GW_LAG_LAYER_2_3 = 0x1f0 +}; + +struct vnic_tx_buf { + struct sk_buff *skb; + u64 mapping[VNIC_MAX_TX_FRAGS]; + u8 ip_off; + u8 ip6_off; + u8 tcp_off; + u8 udp_off; + void *phead; + int hlen; +}; + +enum { +#if 1 + FRAG_SZ0 = 536 - NET_IP_ALIGN, /* so 1500 mtu fits in first 2 frags */ + FRAG_SZ1 = 1024, + FRAG_SZ2 = 2048, + FRAG_SZ3 = 4096 - FRAG_SZ2 - FRAG_SZ1 - FRAG_SZ0 +#else + FRAG_SZ0 = 512 - NET_IP_ALIGN, + FRAG_SZ1 = 1024, + FRAG_SZ2 = 2048, + FRAG_SZ3 = 4096 << VNIC_ALLOC_ORDER +#endif +}; + +struct vnic_frag_info { + u16 frag_size; + u16 frag_prefix_size; + u16 frag_stride; + u16 frag_align; + u16 last_offset; +}; + +struct vnic_rx_alloc { + struct page *page; + u16 offset; +}; + +struct vnic_frag_data { + struct skb_frag_struct frags[VNIC_MAX_RX_FRAGS]; + u64 dma_addr[VNIC_MAX_RX_FRAGS]; + struct sk_buff *skb; /* used only for linear buffers mode */ +}; + +struct vnic_rx_ring { + struct vnic_port *port; + int index; + struct vnic_rx_alloc page_alloc[VNIC_MAX_RX_FRAGS]; + + u32 size; /* number of RX descs */ + spinlock_t lock; + struct vnic_frag_data *rx_info; + + struct vnic_frag_info frag_info[VNIC_MAX_RX_FRAGS]; + u32 rx_skb_size; + u16 log_rx_info; + u16 num_frags; + + struct ib_recv_wr wr; + struct ib_sge sge[VNIC_MAX_RX_FRAGS]; + + struct ib_srq *srq; + struct net_device_stats stats; +}; + +/* netdevice open state, depeneds on calls to open / stop */ +enum { + VNIC_STATE_LOGIN_OFF = 0, + VNIC_STATE_LOGIN_PRECREATE_1, + VNIC_STATE_LOGIN_PRECREATE_2, + VNIC_STATE_LOGIN_CREATE_1, + VNIC_STATE_LOGIN_CREATE_2, + VNIC_STATE_LOGIN_OPEN_REQ, + VNIC_STATE_LOGIN_OPEN, + VNIC_STATE_LOGIN_CARRIER_ON = 29, + VNIC_STATE_LOGIN_NO_TX_ENABLE = 30, + VNIC_STATE_LOGIN_BCAST_ATTACH = 31, +}; + +struct vnic_rx_res { + struct vnic_login *login; + struct ib_cq *cq; + struct net_lro_mgr lro; + struct net_lro_desc lro_desc[VNIC_MAX_LRO_DESCS]; + struct ib_wc recv_wc[VNIC_MAX_RX_CQE]; + int index; + int stopped; +#ifndef _BP_NAPI_POLL + struct napi_struct napi; +#else + struct net_device *poll_dev; +#endif +}; + +struct vnic_tx_res { + struct vnic_tx_buf *tx_ring; + struct ib_sge tx_sge[VNIC_MAX_TX_FRAGS]; + struct ib_wc send_wc[VNIC_MAX_TX_CQE]; + struct ib_send_wr tx_wr; + struct vnic_login *login; + struct ib_cq *cq; + unsigned tx_head; + unsigned tx_tail; + unsigned tx_outstanding; + unsigned tx_stopped_cnt; + struct net_device_stats 
stats; + struct ib_ah_attr mcast_av; + u8 lso_hdr[VNIC_MAX_PAYLOAD_SIZE]; + int index; + int stopped; + spinlock_t lock; +}; + +#ifdef VNIC_PROFILLNG +#define VNIC_PROFILLNG_SKB_MAX 100 +struct vnic_prof_skb_entry { + struct sk_buff skb; + struct timespec tstamp; + unsigned long jiffies; + int cnt; + u8 nr_frags; +}; +#endif + +struct vnic_qp_res { + struct vnic_login *login; + struct ib_qp *qp; + struct completion last_wqe_complete; + int tx_index; + int rx_index; +}; + +/* + * Wrapper struct for vnic_login, used as netdev private data. + * some kernels (such as 2.6.18-194.26.1) doesn't allow private + * data struct longer than 64KB (NETDEV_PRIV_LEN_MAX). + * we allocate the private data separately to work-around this limit. + */ +struct vnic_login_info { + struct vnic_login *login; +}; + +struct vnic_login { + spinlock_t lock; + spinlock_t stats_lock; + + struct net_device *dev; + struct ethtool_drvinfo drvinfo; + struct vnic_port *port; + char desc[VNIC_DESC_LEN]; + struct fip_vnic_data *fip_vnic; /* for ethtool/sysfs*/ + int queue_stopped; + unsigned long state; + char name[VNIC_NAME_LEN]; + char vnic_name[VNIC_NAME_LEN]; + char vendor_id[VNIC_VENDOR_LEN]; + struct vnic_neigh *gw_neigh; + struct vnic_gw_info lag_gw_neigh[MAX_LAG_MEMBERS]; + struct lag_properties lag_prop; + int is_lag; + int lag_gw_map[LAG_MAP_TABLE_SIZE]; + int lag_member_count; + int lag_member_active_count; + union ib_gid gw_mgid; + int promisc; + union ib_gid gid; + __be16 vid; + u8 vlan_used; + u32 qkey; + u16 pkey; + u16 pkey_index; + u64 gw_guid; + u8 mgid_prefix[VNIC_MGID_PREFIX_LEN]; + u8 n_mac_mcgid; + u8 sl; + u16 gw_port_id; + u16 vnic_id; + unsigned int max_mtu; + int zlen; + int cnt; + unsigned qps_num; + u32 qp_base_num; + u8 dev_addr[ETH_ALEN]; + u8 all_vlan_gw; + + /* statistics */ + struct net_device_stats stats; + struct net_device_stats ret_stats; + struct vnic_port_stats port_stats; + + /* tasks */ + struct work_struct mcast_restart; + struct delayed_work stats_task; + struct delayed_work mcast_task; + struct delayed_work restart_task; + struct mutex moder_lock; + struct mutex state_lock; + + /* data structures */ + struct rb_root neigh_tree; + struct rb_root mac_tree; + atomic_t vnic_child_cnt; + rwlock_t mac_rwlock; + struct mcast_root mcast_tree; + struct vnic_sysfs_attr dentries[VNIC_MAX_DENTRIES]; + struct list_head list; + + /* QP resources */ + struct vnic_qp_res qp_res[VNIC_MAX_NUM_CPUS]; + + /* RX resouces */ + struct vnic_rx_res rx_res[VNIC_MAX_NUM_CPUS]; + struct ib_recv_wr rx_wr; + u32 lro_num; + unsigned lro_mng_num; + int rx_csum; + unsigned napi_num; + unsigned rx_rings_num; + + /* TX resources */ + struct vnic_tx_res tx_res[VNIC_MAX_NUM_CPUS]; + unsigned tx_rings_num; + unsigned real_tx_rings_num; + unsigned ndo_tx_rings_num; + u8 *pad_va; + u64 pad_dma; + + /* for profiling */ +#ifdef VNIC_PROFILLNG + struct vnic_prof_skb_entry prof_arr[VNIC_PROFILLNG_SKB_MAX]; + int prof_arr_it; +#endif + /* interrupt coalecence */ + u16 rx_usecs; + u16 rx_frames; + u32 pkt_rate_low; + u16 rx_usecs_low; + u32 pkt_rate_high; + u16 rx_usecs_high; + u16 sample_interval; + u16 adaptive_rx_coal; + unsigned long last_moder_packets; + unsigned long last_moder_tx_packets; + unsigned long last_moder_bytes; + unsigned long last_moder_jiffies; + unsigned long last_moder_time; + u16 tx_usecs; + u16 tx_frames; + u8 shared_vnic; + u8 shared_mac[ETH_ALEN]; +}; + +struct eoibhdr { + __u8 encap_data; + __u8 seg_off; + __be16 seg_id; +}; + +struct vnic_ib_dev { + char name[VNIC_DESC_LEN]; + struct mutex mlock; + 
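+ /* descriptive note (inferred from naming, not stated in the original): 'list' presumably chains this device on the driver's global device list, 'port_list' holds the device's vnic_port entries, and 'mlock' above is assumed to serialize access to them */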
struct list_head list; + struct list_head port_list; + struct ib_device *ca; + struct mlx4_ib_dev *mdev; + struct ib_device_attr attr; + char fw_ver_str[VNIC_FW_STR_MAX]; +}; + +struct fip_ring_entry { + void *mem; + u64 bus_addr; + int length; + int entry_posted; +}; + +struct fip_ring { + int size; + struct fip_ring_entry *ring; + unsigned long head; + unsigned long tail; + spinlock_t ring_lock; + spinlock_t head_tail_lock; +}; + +enum fip_discover_state { + FIP_DISCOVER_OFF, + FIP_DISCOVER_INIT, + FIP_DISCOVER_SOLICIT, + FIP_DISCOVER_CLEAR +}; + +#define MAX_INPUT_LEN 64 +#define MAX_INPUT_ARG 12 +struct fip_hadmin_cmd { + u8 c_name [MAX_INPUT_LEN]; + u8 c_mac [MAX_INPUT_LEN]; + u8 c_vnic_id [MAX_INPUT_LEN]; + u8 c_vid [MAX_INPUT_LEN]; + u8 c_bxname [MAX_INPUT_LEN]; + u8 c_bxguid [MAX_INPUT_LEN]; + u8 c_eport [MAX_INPUT_LEN]; + u8 c_ipv4 [MAX_INPUT_LEN]; + u8 c_ipv6 [MAX_INPUT_LEN]; + u8 c_emac [MAX_INPUT_LEN]; + u8 c_pkey [MAX_INPUT_LEN]; + u8 c_parent [MAX_INPUT_LEN]; +}; + +struct fip_hadmin_cache { + struct fip_hadmin_cmd cmd; + u8 system_guid[GUID_LEN]; + u8 system_name[VNIC_SYSTEM_NAME_LEN]; + u8 eport_name[VNIC_GW_PORT_NAME_LEN]; + u8 mac[ETH_ALEN]; + u16 vnic_id; + u16 gw_port_id; + u16 vlan; + u8 vlan_used; + u8 all_vlan_gw; + u8 interface_name[VNIC_NAME_LEN]; + u8 parent_name[VNIC_NAME_LEN]; + int parent_used; + int remove; + struct list_head next; + u32 qp_base_num; + u8 shared_vnic_ip[IPV4_LEN]; + u8 shared_vnic_mac[ETH_ALEN]; +}; + +struct pkt_rcv_list { + struct list_head list; + spinlock_t lock; +}; + +struct fip_discover { + char name[VNIC_NAME_LEN]; + struct vnic_port *port; + struct list_head discover_list; + spinlock_t lock; + struct list_head gw_list; + struct rw_semaphore l_rwsem; /* gw list rw semaphore **/ + int hadmin_update; + struct list_head hadmin_cache; + enum fip_discover_state state; + int flush; + struct completion flush_complete; + struct ib_cq *cq; + struct ib_qp *qp; + struct fip_ring rx_ring; + struct fip_ring tx_ring; + struct mcast_root mcast_tree; + struct delayed_work fsm_task; + struct delayed_work cleanup_task; + struct delayed_work hadmin_update_task; + struct work_struct pkt_rcv_task_bh; + struct pkt_rcv_list rcv_list; + + int mcast_dest_mask; + unsigned long discover_mcast_attached_jiffies; + unsigned long discover_mcast_detached_jiffies; + unsigned long discover_mcast_state; + u16 pkey; + u16 pkey_index; + unsigned long req_attach; + unsigned long cur_attached; + unsigned new_prot_gws; + unsigned old_prot_gws; +}; + +struct fip_root { + struct list_head discover_list; +}; + +struct port_fs_dentry { + struct module_attribute fs_entry; + struct vnic_port *port; +}; + +struct vnic_port { + char name[VNIC_DESC_LEN]; + u8 num; + int rx_rings_num; + int tx_rings_num; + struct vnic_ib_dev *dev; + struct mcast_root mcast_tree; + struct list_head list; + struct list_head login_list; + struct delayed_work event_task; + struct delayed_work event_task_light; + struct delayed_work gid_change_event_task; + struct delayed_work discover_restart_task; + struct ib_event_handler event_handler; + struct ib_port_attr attr; + union ib_gid gid; + int rate; + u8 rate_enum; + atomic_t vnic_child_ids; + + /* IB resources per port */ + struct vnic_rx_ring *rx_ring[VNIC_MAX_NUM_CPUS]; + struct ib_pd *pd; + struct ib_mr *mr; + + /* for FIP */ + struct mutex mlock; + struct mutex start_stop_lock; + u16 pkey_index; + u16 pkey; + int max_mtu_enum; + struct fip_root fip; + struct vnic_sysfs_attr dentries[VNIC_MAX_DENTRIES]; +}; + +enum fip_vnic_state { + FIP_VNIC_CLOSED = 
0, + FIP_VNIC_HADMIN_IDLE = 1<<0, + FIP_VNIC_LOGIN = 1<<1, + FIP_VNIC_WAIT_4_ACK = 1<<2, + FIP_VNIC_RINGS_INIT = 1<<3, /* temporary, create rings */ + FIP_VNIC_MCAST_INIT = 1<<4, /* temporary, start mcast attach */ + FIP_VNIC_MCAST_INIT_DONE= 1<<5, /* wait for mcast cb */ + FIP_VNIC_VHUB_INIT = 1<<6, + FIP_VNIC_VHUB_INIT_DONE = 1<<7, /* wait for vhub table */ + FIP_VNIC_VHUB_DONE = 1<<8, + FIP_VNIC_VHUB_WRITE = 1<<9, + FIP_VNIC_CONNECTED = 1<<10 +}; + +enum vhub_table_state { + VHUB_TBL_INIT, + VHUB_TBL_UP2DATE, + VHUB_TBL_UPDATED +}; + +struct vhub_elist { + u32 tusn; + int count; + int total_count; + struct list_head vnic_list; /* chain vnics */ +}; + +struct vnic_table_entry { + u32 qpn; + u16 lid; + u8 mac[ETH_ALEN]; + u8 sl; + + struct list_head list; + u8 rss; + u8 valid; +}; + +struct vhub_table { + enum vhub_table_state state; + u32 checksum; + u32 tusn; + struct vhub_elist main_list; + struct vhub_elist update_list; +}; + +struct fip_shared_vnic_data { + u8 ip[IPV4_LEN]; + u8 emac[ETH_ALEN]; + u8 enabled; + u8 arp_proxy; +}; + +struct lag_member { + u32 qpn; + u8 sl; + u16 gw_port_id; + u16 lid; + u8 guid[GUID_LEN]; + u8 eport_state; + u8 weight; + u8 link_utilization; +}; + +struct lag_members { + int num; + long used_bitmask; + struct lag_properties prop; + struct lag_member memb[MAX_LAG_MEMBERS]; +}; + +struct fip_login_data { + u32 qpn; + u32 ctl_qpn; + u16 port_id; /* must always be uptodate */ + u16 lid; /* must always be uptodate */ + u16 vlan; + u16 pkey; + u16 pkey_index; + u16 vnic_id; /* must always be uptodate */ + u32 vhub_id; + u16 mtu; + + u8 sl; /* service level -- 4 bits */ + u8 guid[GUID_LEN]; + u8 mac[ETH_ALEN]; + u8 mgid_prefix[VNIC_MGID_PREFIX_LEN]; + u8 vnic_name[VNIC_NAME_LEN]; + u8 vendor_id[VNIC_VENDOR_LEN]; + u8 n_mac_mcgid; + u8 n_rss_mgid; + u8 syndrome; /* must always be uptodate */ + + u8 vp; /* 1 bit: do we use vlan */ + u8 all_vlan_gw; /* 1 bit. 
+ is promisc vlan supported on this vnic */ + struct lag_members lagm; +}; + +enum fip_flush { + FIP_NO_FLUSH, + FIP_PARTIAL_FLUSH, /* use this for events caused by vnic/gw logic will */ + FIP_FULL_FLUSH /* use this for events caused by unload, host admin destroy */ +}; + +struct fip_vnic_send_info { + u32 gw_qpn; + u32 qkey; + u16 gw_lid; + u8 gw_sl; +}; + +/* + * This struct holds informative info about the GW that can change without + * implecations on GW or vnic logic (only reported to user) + */ +struct fip_gw_volatile_info { + u8 system_guid[GUID_LEN]; + u8 system_name[VNIC_SYSTEM_NAME_LEN+1]; + u8 gw_port_name[VNIC_GW_PORT_NAME_LEN+1]; +}; + +struct fip_vnic_data { + char name[VNIC_NAME_LEN]; + enum fip_vnic_state state; + enum fip_flush flush; + spinlock_t lock; + spinlock_t ka_lock; + struct vnic_sysfs_attr dentry; + + /* data structures maintenance */ + struct fip_gw_data *gw; + struct vnic_port *port; + struct list_head gw_vnics; + struct vhub_table vhub_table; + + /* execution maintenance */ + unsigned long update_jiffs; + unsigned long keep_alive_jiffs; + unsigned long detached_ka_jiffs; + unsigned long vnic_mcaste_state; + struct delayed_work vnic_task; + struct hrtimer keepalive_timer; + struct list_head timer; + struct delayed_work vnic_gw_alive_task; + struct work_struct vnic_pkt_rcv_task_bh; + struct work_struct vnic_login_destroy_task; + struct work_struct vnic_login_create_task; + struct pkt_rcv_list vnic_rcv_list; + struct fip_vnic_send_info gw_address; + + /* vnic driver API */ + struct vnic_login *login; + unsigned long login_status; + int qps_num; + u32 qp_base_num; + int parent_used; + u8 parent_name[VNIC_NAME_LEN]; + + /* rx + tx data structures */ + struct ib_cq *cq; + struct ib_qp *qp; + struct fip_ring rx_ring; + struct fip_ring tx_ring; + + /* data domain */ + union ib_gid mgid; + + /* vHub context update mcast groups */ + struct mcast_root mcast_tree; + struct fip_login_data login_data; + struct fip_shared_vnic_data shared_vnic; + u16 mlid; + /* u16 pkey_index; not used for now */ + + u16 vnic_id; /* unique id for GW */ + u16 vlan; + u8 vlan_used; + u8 all_vlan_gw; + u16 pkey; + u16 pkey_index; + u8 hadmined; /* todo, use the state for this */ + u8 interface_name[VNIC_NAME_LEN]; + u8 mac_cache[ETH_ALEN]; + atomic_t eport_state; + unsigned long last_send_jiffs; + int retry_count; + int synd_backlog; + struct fip_hadmin_cmd cmd; + struct fip_gw_volatile_info gw_info; + struct lag_members lm; + unsigned long req_attach; + unsigned long cur_attached; + union ib_gid ka_mcast_gid; +}; + +enum vhub_mgid_type { + VHUB_MGID_DATA = 0, + VHUB_MGID_UPDATE = 2, + VHUB_MGID_TABLE = 3, + VHUB_MGID_KA = 5, +}; + +enum fip_all_mgids { + FIP_MCAST_DISCOVER, + FIP_MCAST_SOLICIT, + FIP_MCAST_VHUB_DATA, + FIP_MCAST_VHUB_UPDATE, + FIP_MCAST_TABLE, + FIP_MCAST_VHUB_KA, +}; + +union vhub_mgid { + struct mgid { + u8 mgid_prefix[VNIC_MGID_PREFIX_LEN]; + u8 type; + u8 dmac[ETH_ALEN]; + u8 rss_hash; + u8 vhub_id[3]; + } mgid; + union ib_gid ib_gid; +}; + +void vnic_carrier_update(struct vnic_login *login); +int vnic_param_check(void); + +/* mac table funcs */ +void vnic_learn_mac(struct net_device *dev, u8 *mac, int remove); +void vnic_child_flush(struct vnic_login *login, int all); +int vnic_child_update(struct vnic_login *login, u8 *mac, int remove); +int vnic_mace_update(struct vnic_login *login, u8 *mac, u16 vnic_id, int remove); +int vnic_parent_update(struct vnic_port *port, char *name, u16 vnic_id, + u8 *mac, u32 *qp_base_num_ptr, char *parent_name, + int remove); + +/* mcast 
funcs */ +int vnic_mcast_init(void); +void vnic_mcast_cleanup(void); + +/* + * A helper function to prevent code duplication. Receives a multicast mac + * and a gw_id and attaches it (join + attach). The function also receives + * a default_mcaste (used for the MGID over default MLID hack) and a user list. + * Returns 0 on success and non-zero on failure. + * + * in: mmac - to be used in creation MGID address + * in: default_mcaste - mcaste entry of the default MGID. Can be NULL + * in: private_data - A user pointer that can be used to identify owner + * in: gw_id - to be used in creation MGID address + */ +int _vnic_mcast_attach_mgid(struct vnic_login *login, + char *mmac, + struct vnic_mcast *default_mcaste, + void *private_data, + u16 gw_id); + +struct vnic_mcast *vnic_mcast_alloc(struct vnic_port *port, + unsigned long *req_attach, + unsigned long *cur_attach); +/* + * A helper function to prevent code duplication. Fills vnic_mcast struct with + * common values. + * + * in: mcaste - mcaste to fill + * in: gw_id - to be used in creation MGID address + * in: mac - to be used in creation MGID address + * in: rss_hash - to be used in creation MGID address (usually 0) + * in: create - value of create field in mcaste + */ +void __vnic_mcaste_fill(struct vnic_login *login, struct vnic_mcast *mcaste, + u16 gw_id, const u8 *mac, u8 rss_hash, int create); + +void vnic_mcast_dealloc(struct vnic_mcast *mcaste); + +int vnic_mcast_attach(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste); +int vnic_mcast_detach(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste); + +/* + * This function grabs the mcast_tree->mcast_rb_lock +*/ +int vnic_mcast_add(struct mcast_root *mcast_tree, + struct vnic_mcast *mcaste); +int vnic_mcast_del_all(struct mcast_root *mcast_tree); +int vnic_mcast_del_user(struct mcast_root *mcast_tree, void *owner); + +void vnic_tree_mcast_detach(struct mcast_root *mcast_tree); +void vnic_tree_mcast_attach(struct mcast_root *mcast_tree); + +/*void vnic_port_mcast_del_all(struct mcast_root *port); */ +static inline void vnic_mcast_root_init(struct mcast_root *mcast_tree) +{ + spin_lock_init(&mcast_tree->mcast_rb_lock); + INIT_LIST_HEAD(&mcast_tree->reattach_list); +} + +/* port funcs */ +int vnic_ports_init(void); +void vnic_ports_cleanup(void); + +/* + * The caller must hold mcast_tree->mcast_rb_lock before calling +*/ +void vnic_mcast_del(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste); +struct vnic_mcast *vnic_mcast_search(struct mcast_root *mcast_tree, + union ib_gid *gid); +void port_fip_discover_restart(struct work_struct *work); +int vnic_port_fip_init(struct vnic_port *port); +void vnic_port_fip_cleanup(struct vnic_port *port, int lock); + +/* others */ +void fip_refresh_mcasts(struct fip_discover *discover); +void vnic_login_refresh_mcasts(struct vnic_port *port); + +/* There are two different create flows, for host admin and net admin. + * In net admin we always create the vnic after connecting with the GW, but we do not + * yet know the vnic details (mac, vlan, etc.). We know the ring parameters and + * will need to create the RX/TX rings (before login). + * To accomplish this we call vnic_login_pre_create_1, vnic_login_pre_create_2 + * and after login ACK we will call vnic_login_register_netdev and vnic_login_complete_ack.
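+ * A rough ordering sketch of the two flows (names abbreviated from the vnic_login_* declarations below; error handling omitted): + * net admin: pre_create_1 -> pre_create_2 -> [login ACK] -> register_netdev -> complete_ack + * host admin: pre_create_1 -> register_netdev -> [login ACK] -> pre_create_2 -> complete_ack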
+ * In Host admin, we know the vnic info but not the GW info when we create the vnic. + * So we call vnic_login_pre_create_1 and vnic_login_register_netdev, and after + * getting the login ACK we will call vnic_login_pre_create_2 and vnic_login_complete_ack. + */ +int vnic_login_register_netdev(struct fip_vnic_data *vnic, + const char *mac, + const char *name); +int vnic_login_complete_ack(struct fip_vnic_data *vnic, + struct fip_login_data *login_data, + struct fip_shared_vnic_data *shared_vnic); +int vnic_login_pre_create_1(struct vnic_port *port, + struct fip_vnic_data *vnic); +int vnic_login_pre_create_2(struct fip_vnic_data *vnic, int qps_num, int is_lag); + +/* + * Call when destroying a login to stop its login wq tasks. Do not call from + * login_wq context. +*/ +void vnic_login_destroy_stop_wq(struct fip_vnic_data *vnic, enum fip_flush flush); +/* + * Destroys the login data struct. Assumes all login wq tasks are stopped. + * Can be called from any context; might block for a few seconds. +*/ +void vnic_login_destroy_wq_stopped(struct fip_vnic_data *vnic, enum fip_flush flush); + +/* + * Destroy a login data structure. + * This function cannot be called from login_wq context. If you need to run + * from login_wq, use the split functions vnic_login_destroy_stop_wq/wq_stopped + * instead. + */ +static inline +void vnic_login_destroy(struct fip_vnic_data *vnic, enum fip_flush flush) +{ + vnic_login_destroy_stop_wq(vnic, flush); + vnic_login_destroy_wq_stopped(vnic, flush); +} + +/* add / remove member eports from a LAG GW */ +void vnic_member_prop(struct vnic_login *login, struct lag_properties *prop); +int vnic_member_add(struct vnic_login *login, int member_id, + struct lag_member *emember); +int vnic_member_remove(struct vnic_login *login, int member_id); +int vnic_member_modify(struct vnic_login *login, int member_id, + struct lag_member *emember); +void vnic_member_remove_all(struct vnic_login *login); + +int vnic_vhube_add(struct fip_vnic_data *vnic, struct vnic_table_entry *vhube); +void vnic_vhube_flush(struct fip_vnic_data *vnic); +void vnic_vhube_del(struct fip_vnic_data *vnic, u8 *mac); + +void vhub_mgid_create(const char *mgid_prefix, + const char *mmac, /* mcast mac for bcast 0xFF.. */ + u64 n_mac, /* bits to take from mmac */ + u32 vhub_id, + enum vhub_mgid_type type, + u8 rss_hash, + union vhub_mgid *mgid); +/* + * Read the state of the GW eport. Can be called from any context. +*/ +int fip_vnic_get_eport_state(struct fip_vnic_data *vnic); +/* + * get GW info funcs. +*/ +int fip_vnic_get_eport_name(struct fip_vnic_data *vnic, char *buff); +int fip_vnic_get_bx_name(struct fip_vnic_data *vnic, char *buff); +int fip_vnic_get_bx_guid(struct fip_vnic_data *vnic, char *buff); +u8 fip_vnic_get_bx_sl(struct fip_vnic_data *vnic); +int fip_vnic_get_gw_type(struct fip_vnic_data *vnic); +int fip_vnic_get_lag_eports(struct fip_vnic_data *vnic, char *buf); +int fip_vnic_get_all_vlan_mode(struct fip_vnic_data *vnic, char *buff); + + +/* + * Return a short format string of GW info. Can be called from any context. +*/ +int fip_vnic_get_short_gw_info(struct fip_vnic_data *vnic, char *buff); + +void vnic_data_cleanup(void); + +/* + * This function is called from the sysfs update callback function. + * It parses the request and adds the request to a list. It then queues a + * work request to process the list from the fip_wq context.
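+ * Deferring the actual processing to fip_wq in this way presumably keeps the sysfs store path from blocking on FIP work.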
+*/ +int fip_hadmin_sysfs_update(struct vnic_port *port, + const char *buffer, int count, int remove); +int fip_gw_sysfs_show(struct vnic_port *port, char *buffer); +int vnic_login_cmd_set(char *buf, struct fip_hadmin_cmd *cmd); +void vnic_login_cmd_init(struct fip_hadmin_cmd *cmd); + +int fip_hadmin_vnic_refresh(struct fip_vnic_data *vnic, struct fip_vnic_send_info *gw_address); +void fip_vnic_set_gw_param(struct fip_vnic_data *vnic, struct fip_vnic_send_info *gw_address); +void fip_vnic_create_gw_param(struct fip_vnic_send_info *gw_address, u32 gw_qpn, + u32 qkey, u16 gw_lid, u8 gw_sl); + +int fip_vnic_hadmin_init(struct vnic_port *port, struct fip_vnic_data *vnic); + +int port_fs_init(struct vnic_port *port); +void port_fs_exit(struct vnic_port *port); + +int vnic_port_query(struct vnic_port *port); + +#endif /* VNIC_H */ diff --git a/drivers/net/mlx4_vnic/vnic_data.h b/drivers/net/mlx4_vnic/vnic_data.h new file mode 100644 index 0000000000000..8444acac974a1 --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_data.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _VNIC_DATA_H +#define _VNIC_DATA_H + +#include "vnic.h" + +enum { + VNIC_SEND_INLINE_FLAG_POS = 63, +}; + +#define VNIC_SEND_INLINE_FLAG ((u64)1 << VNIC_SEND_INLINE_FLAG_POS) + +/* main funcs */ +int vnic_port_data_init(struct vnic_port *port); +void vnic_port_data_cleanup(struct vnic_port *port); + +/* ib funcs */ +struct sk_buff *vnic_alloc_rx_skb(struct vnic_rx_ring *ring, int buf_ind, + gfp_t gfp_flag); +int vnic_post_recv(struct vnic_rx_ring *ring, u64 wr_id); +int vnic_post_recvs(struct vnic_rx_ring *ring); +int vnic_ib_create_qp_range(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, + struct ib_udata *udata, int nqps, + int align, struct ib_qp *list[]); +int vnic_ib_destroy_qp(struct ib_qp *qp); +int vnic_ib_post_send(struct ib_qp *ibqp, + struct ib_send_wr *wr, + struct ib_send_wr **bad_wr, + u8 ip_off, u8 ip6_off, + u8 tcp_off, u8 udp_off); +struct vnic_rx_ring *vnic_create_rx_ring(struct vnic_port *port, int index); +void vnic_destroy_rx_ring(struct vnic_rx_ring *ring); +int vnic_init_qp(struct vnic_login *login, int qp_index); +int vnic_create_qp(struct vnic_login *login, int qp_index); +int vnic_create_qp_range(struct vnic_login *login); +void vnic_destroy_qp(struct vnic_login *login, int qp_index); +int vnic_create_tx_res(struct vnic_login *login, int tx_res_index); +int vnic_create_rx_res(struct vnic_login *login, int rx_res_index); +void vnic_destroy_tx_res(struct vnic_login *login, int tx_res_index); +void vnic_destroy_rx_res(struct vnic_login *login, int rx_res_index); + +int vnic_ib_up(struct net_device *dev); +int vnic_ib_down(struct net_device *dev); +int vnic_ib_open(struct net_device *dev); +int vnic_ib_stop(struct net_device *dev); + +int vnic_ib_set_moder(struct vnic_login *login, + u16 rx_usecs, u16 rx_frames, u16 tx_usecs, u16 tx_frames); +int vnic_port_ib_init(struct vnic_port *port); +void vnic_port_ib_cleanup(struct vnic_port *port); +void vnic_ib_dispatch_event(struct ib_event *event); +#ifndef _BP_NAPI_POLL +int vnic_poll_cq_rx(struct napi_struct *napi, int budget); +#else +int vnic_poll_cq_rx(struct net_device *poll_dev, int *budget); +#endif +void vnic_send(struct vnic_login *login, struct sk_buff *skb, + struct ib_ah *ah, u32 dqpn, int tx_res_index); +void vnic_ib_free_ring(struct vnic_rx_ring *ring); +int vnic_ib_init_ring(struct vnic_rx_ring *ring); + +/* netdev funcs */ +struct net_device *vnic_alloc_netdev(struct vnic_port *port); +void vnic_free_netdev(struct vnic_login *login); +int vnic_restart(struct net_device *dev); +void __bcast_attach_cb(struct vnic_mcast *mcaste, void *login_ptr); +void __bcast_detach_cb(struct vnic_mcast *mcaste, void *login_ptr); + +/* rx funcs */ +int vnic_rx(struct vnic_login *login, struct sk_buff *skb, struct ib_wc *wc); +int vnic_unmap_and_replace_rx(struct vnic_rx_ring *ring, struct ib_device *dev, + struct skb_frag_struct *skb_frags_rx, + u64 wr_id, int length); +int vnic_rx_skb(struct vnic_login *login, struct vnic_rx_ring *ring, + struct ib_wc *wc, int ip_summed, char *eth_hdr_va); + +/* tx funcs */ +int vnic_tx(struct sk_buff *skb, struct net_device *dev); + +/* sysfs funcs */ +int vnic_create_dentry(struct vnic_login *login); +void vnic_delete_dentry(struct vnic_login *login); + +/* ethtool funcs */ +void vnic_set_ethtool_ops(struct net_device *dev); + +/* neigh funcs */ +void vnic_neigh_del_all(struct vnic_login *login); +struct vnic_neigh *vnic_neighe_search(struct vnic_login *login, u8 *mac); +void vnic_neighe_dealloc(struct vnic_neigh *neighe); +struct vnic_neigh *vnic_neighe_alloc(struct 
vnic_login *login, + const u8 *mac, u16 dlid, u32 dqpn, u8 rss); +void vnic_neighe_del(struct vnic_login *login, struct vnic_neigh *neighe); +int vnic_neighe_add(struct vnic_login *login, struct vnic_neigh *neighe); + +struct ib_ah *vnic_ah_alloc(struct vnic_login *login, u16 dlid); + + +struct vnic_login *__vnic_login_create(struct vnic_port *port, int index); +u32 vnic_hash(struct net_device *dev, struct sk_buff *skb); +#endif /* _VNIC_DATA_H */ diff --git a/drivers/net/mlx4_vnic/vnic_data_ethtool.c b/drivers/net/mlx4_vnic/vnic_data_ethtool.c new file mode 100644 index 0000000000000..5f1deca6eb057 --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_data_ethtool.c @@ -0,0 +1,427 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include + +#include "vnic.h" +#include "vnic_data.h" + +static struct ethtool_ops vnic_ethtool_ops; + +static const char vnic_strings[][ETH_GSTRING_LEN] = { + /* public statistics */ + "rx_packets", "tx_packets", "rx_bytes", + "tx_bytes", "rx_errors", "tx_errors", + "rx_dropped", "tx_dropped", "multicast", + "collisions", "rx_length_errors", "rx_over_errors", + "rx_crc_errors", "rx_frame_errors", "rx_fifo_errors", + "rx_missed_errors", "tx_aborted_errors", "tx_carrier_errors", + "tx_fifo_errors", "tx_heartbeat_errors", "tx_window_errors", +#define VNIC_PUB_STATS_LEN 21 + + /* private statistics */ + "gro_held", "gro_merged", "gro_normal", "gro_drop", + "lro_aggregated", "lro_flushed", "lro_no_desc", + "tso_packets", "queue_stopped", "wake_queue", + "tx_timeout", "rx_chksum_good", "rx_chksum_none", + "tx_chksum_offload", "sig_ver_err", "vlan_err", + "shared_packets", "runt_packets", "realloc_packets", + "gw_tx_packets", "gw_tx_bytes", +#define VNIC_PORT_STATS_LEN 21 + + /* packet statistics rx_prio_X (TODO) */ +#define VNIC_PKT_STATS_LEN 0 +}; + +#define VNIC_STATS_LEN (sizeof(vnic_strings) / ETH_GSTRING_LEN) + +static void vnic_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *drvinfo) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + *drvinfo = login->drvinfo; +} + +static u32 vnic_get_msglevel(struct net_device *dev) +{ + return vnic_msglvl; +} + +static void vnic_set_msglevel(struct net_device *dev, u32 mlevel) +{ + vnic_msglvl = mlevel; +} + +static int vnic_get_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + vnic_dbg_ethtool(login->name, "get coalescing params for mtu:%d " + "rx_frames:%d rx_usecs:%d, " + "tx_frames:%d tx_usecs:%d, " + "adaptive_rx_coal:%d, " + "adaptive_tx_coal:%d\n", + login->dev->mtu, + login->rx_frames, login->rx_usecs, + login->tx_frames, login->tx_usecs, + login->adaptive_rx_coal, 0); + + coal->tx_coalesce_usecs = login->tx_usecs; + coal->tx_max_coalesced_frames = login->tx_frames; + coal->rx_coalesce_usecs = login->rx_usecs; + coal->rx_max_coalesced_frames = login->rx_frames; + + coal->pkt_rate_low = login->pkt_rate_low; + coal->rx_coalesce_usecs_low = login->rx_usecs_low; + coal->pkt_rate_high = login->pkt_rate_high; + coal->rx_coalesce_usecs_high = login->rx_usecs_high; + coal->rate_sample_interval = login->sample_interval; + coal->use_adaptive_rx_coalesce = login->adaptive_rx_coal; + + return 0; +} + +static int vnic_set_coalesce(struct net_device *dev, + struct ethtool_coalesce *coal) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + login->rx_frames = (coal->rx_max_coalesced_frames == + VNIC_AUTO_CONF) ? + VNIC_RX_COAL_TARGET / + login->dev->mtu + 1 : coal->rx_max_coalesced_frames; + login->rx_usecs = (coal->rx_coalesce_usecs == + VNIC_AUTO_CONF) ? 
+ VNIC_RX_COAL_TIME : coal->rx_coalesce_usecs; + login->tx_frames = coal->tx_max_coalesced_frames; + login->tx_usecs = coal->tx_coalesce_usecs; + + /* Set adaptive coalescing params */ + login->pkt_rate_low = coal->pkt_rate_low; + login->rx_usecs_low = coal->rx_coalesce_usecs_low; + login->pkt_rate_high = coal->pkt_rate_high; + login->rx_usecs_high = coal->rx_coalesce_usecs_high; + login->sample_interval = coal->rate_sample_interval; + login->adaptive_rx_coal = coal->use_adaptive_rx_coalesce; + login->last_moder_time = VNIC_AUTO_CONF; + + if (login->adaptive_rx_coal) + return 0; + + vnic_ib_set_moder(login, + login->rx_usecs, login->rx_frames, + login->tx_usecs, login->tx_frames); + + return 0; +} + +static u32 vnic_get_rx_csum(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + return login->rx_csum; +} + +static int vnic_set_rx_csum(struct net_device *dev, u32 data) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + login->rx_csum = (data != 0); + + return 0; +} + +static u32 vnic_get_tx_csum(struct net_device *dev) +{ + return (dev->features & NETIF_F_IP_CSUM) != 0; +} + +static int vnic_set_tx_csum(struct net_device *dev, u32 data) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + /* check capability bit of SWP */ + if (!(login->port->dev->mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_SWP)) + return -EPERM; + + if (data) + dev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; + else + dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); + + return 0; +} + +static u32 vnic_get_tso(struct net_device *dev) +{ + return (dev->features & NETIF_F_TSO) != 0; +} + +static int vnic_set_tso(struct net_device *dev, u32 data) +{ + if (data) + dev->features |= (NETIF_F_TSO | NETIF_F_TSO6); + else + dev->features &= ~(NETIF_F_TSO | NETIF_F_TSO6); + + return 0; +} + +#ifndef _BP_ETHTOOL_NO_GSFLAGS +#if !(defined(NETIF_F_GRO) && !defined(_BP_NO_GRO)) +static int vnic_set_flags(struct net_device *dev, u32 data) +{ + int rc = 0, changed = 0; + + if (data & ~ETH_FLAG_LRO) + return -EOPNOTSUPP; + + if (data & ETH_FLAG_LRO) { + if (vnic_lro_num == 0) + return -EOPNOTSUPP; + if (!(dev->features & NETIF_F_LRO)) + changed = 1; + } else if (dev->features & NETIF_F_LRO) { + changed = 1; + } + + if (changed) { + dev->features ^= NETIF_F_LRO; + /* stop/start interface to cleanup any pending LRO sessions */ + rc = vnic_restart(dev); + } + + return rc; +} +#endif +#endif + +static int vnic_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) +{ + cmd->autoneg = AUTONEG_DISABLE; + cmd->supported = SUPPORTED_10000baseT_Full; + cmd->advertising = SUPPORTED_10000baseT_Full; + if (netif_carrier_ok(dev)) { + cmd->speed = SPEED_10000; + cmd->duplex = DUPLEX_FULL; + } else { + cmd->speed = -1; + cmd->duplex = -1; + } + return 0; +} + +static int vnic_set_settings(struct net_device *dev, struct ethtool_cmd *cmd) +{ + if ((cmd->autoneg == AUTONEG_ENABLE) || + (cmd->speed != SPEED_10000) || (cmd->duplex != DUPLEX_FULL)) + return -EINVAL; + + /* Nothing to change */ + return 0; +} + +static void vnic_get_strings(struct net_device *dev, + uint32_t stringset, uint8_t *data) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int index = 0, stats_off = 0, i; + + if (stringset != ETH_SS_STATS) + return; + + /* Add main counters */ + for (i = 0; i < VNIC_PUB_STATS_LEN; i++) + strcpy(data + (index++) * ETH_GSTRING_LEN, + vnic_strings[i + stats_off]); + stats_off += VNIC_PUB_STATS_LEN; + + for (i = 0; i < VNIC_PORT_STATS_LEN; i++) + strcpy(data + (index++) * 
ETH_GSTRING_LEN, + vnic_strings[i + stats_off]); + stats_off += VNIC_PORT_STATS_LEN; + + for (i = 0; i < VNIC_PKT_STATS_LEN; i++) + strcpy(data + (index++) * ETH_GSTRING_LEN, + vnic_strings[i + stats_off]); + stats_off += VNIC_PKT_STATS_LEN; + + for (i = 0; i < login->tx_rings_num; i++) { + sprintf(data + (index++) * ETH_GSTRING_LEN, + "tx%d_packets", i); + sprintf(data + (index++) * ETH_GSTRING_LEN, + "tx%d_bytes", i); + } + for (i = 0; i < login->rx_rings_num; i++) { + sprintf(data + (index++) * ETH_GSTRING_LEN, + "rx%d_packets", i); + sprintf(data + (index++) * ETH_GSTRING_LEN, + "rx%d_bytes", i); + } +} + +static void vnic_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, uint64_t *data) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int index = 0, i; + + spin_lock_bh(&login->stats_lock); + + for (i = 0; i < VNIC_PUB_STATS_LEN; i++) + data[index++] = ((unsigned long *) &login->stats)[i]; + for (i = 0; i < VNIC_PORT_STATS_LEN; i++) + data[index++] = ((unsigned long *) &login->port_stats)[i]; + for (i = 0; i < VNIC_PKT_STATS_LEN; i++) + data[index++] = 0; + for (i = 0; i < login->tx_rings_num; i++) { + data[index++] = login->tx_res[i].stats.tx_packets; + data[index++] = login->tx_res[i].stats.tx_bytes; + } + for (i = 0; i < login->rx_rings_num; i++) { + data[index++] = login->port->rx_ring[i]->stats.rx_packets; + data[index++] = login->port->rx_ring[i]->stats.rx_bytes; + } + spin_unlock_bh(&login->stats_lock); +} + +#ifndef _BP_ETHTOOL_NO_SSETC +static int vnic_get_sset_count(struct net_device *dev, int sset) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + switch (sset) { + case ETH_SS_STATS: + return VNIC_STATS_LEN + /* static stats + stats per ring */ + (login->tx_rings_num + login->rx_rings_num) * 2; + default: + return -EOPNOTSUPP; + } +} + +#else +static int vnic_get_stats_count(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + return VNIC_STATS_LEN + + (login->tx_rings_num + login->rx_rings_num) * 2; +} +#endif + +static void vnic_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) +{ + wol->supported = wol->wolopts = 0; + + return; +} + +void vnic_get_ringparam(struct net_device *dev, struct ethtool_ringparam *param) +{ + memset(param, 0, sizeof *param); + param->rx_max_pending = VNIC_MAX_RX_SIZE; + param->tx_max_pending = VNIC_MAX_TX_SIZE; + param->rx_pending = vnic_rx_rings_len; + param->tx_pending = vnic_tx_rings_len; +} + +void vnic_set_ethtool_ops(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + struct mlx4_ib_dev *mlx4_ibdev = login->port->dev->mdev; + + ASSERT(login); + ASSERT(login->port->dev->ca); + ASSERT(login->port->dev->ca->dma_device); + + SET_ETHTOOL_OPS(dev, &vnic_ethtool_ops); + strncpy(login->drvinfo.driver, DRV_NAME, VNIC_ETHTOOL_LINE_MAX); + strncpy(login->drvinfo.version, DRV_VER, VNIC_ETHTOOL_LINE_MAX); + login->drvinfo.n_stats = 0; + login->drvinfo.regdump_len = 0; + login->drvinfo.eedump_len = 0; + + sprintf(login->drvinfo.bus_info, "%s [%s:%d]", + pci_name(to_pci_dev(login->port->dev->ca->dma_device)), + login->port->dev->ca->name, login->port->num); + sprintf(login->drvinfo.fw_version, "%s [%.*s]", + login->port->dev->fw_ver_str, MLX4_BOARD_ID_LEN, + mlx4_ibdev->dev->board_id); + vnic_dbg_ethtool(login->name, "bus %s, port %d, fw_ver %s\n", + login->drvinfo.bus_info, login->port->num, + login->drvinfo.fw_version); + + return; +} + +static struct ethtool_ops vnic_ethtool_ops = { + .get_link = ethtool_op_get_link, + .get_sg = 
ethtool_op_get_sg, + .set_sg = ethtool_op_set_sg, +#ifdef NETIF_F_TSO + .get_tso = vnic_get_tso, + .set_tso = vnic_set_tso, +#endif + .get_ufo = ethtool_op_get_ufo, + .set_ufo = ethtool_op_set_ufo, +#ifndef _BP_ETHTOOL_NO_GSFLAGS + .get_flags = ethtool_op_get_flags, +#if !(defined(NETIF_F_GRO) && !defined(_BP_NO_GRO)) + .set_flags = vnic_set_flags, +#endif +#endif + .get_tx_csum = vnic_get_tx_csum, + .set_tx_csum = vnic_set_tx_csum, + .get_rx_csum = vnic_get_rx_csum, + .set_rx_csum = vnic_set_rx_csum, + .get_drvinfo = vnic_get_drvinfo, + .get_msglevel = vnic_get_msglevel, + .set_msglevel = vnic_set_msglevel, + .get_coalesce = vnic_get_coalesce, + .set_coalesce = vnic_set_coalesce, + .get_strings = vnic_get_strings, + .get_ethtool_stats = vnic_get_ethtool_stats, +#ifndef _BP_ETHTOOL_NO_SSETC + .get_sset_count = vnic_get_sset_count, +#else + .get_stats_count = vnic_get_stats_count, +#endif + .get_settings = vnic_get_settings, + .set_settings = vnic_set_settings, + .get_wol = vnic_get_wol, + .get_ringparam = vnic_get_ringparam, + .set_ringparam = NULL, +}; + diff --git a/drivers/net/mlx4_vnic/vnic_data_fs.c b/drivers/net/mlx4_vnic/vnic_data_fs.c new file mode 100644 index 0000000000000..ca7266312aeda --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_data_fs.c @@ -0,0 +1,922 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include + +#include "vnic.h" +#include "vnic_data.h" +#include "vnic_fip_discover.h" + +#define ALL_VLAN_GW_VID "all" + +char *login_dentry_name(char *buf, struct vnic_login *login, char *str) +{ + snprintf(buf, VNIC_SYSFS_FLEN, "%s%d-%s", "vnic", + login->cnt, str); + return buf; +} + +char *port_dentry_name(char *buf, struct vnic_port *port, char *str) +{ + snprintf(buf, VNIC_SYSFS_FLEN, "%s_%s_%d", + str, port->dev->name, port->num); + return buf; +} + +char *vnic_dentry_name(char *buf, struct fip_vnic_data *vnic, char *str) +{ + snprintf(buf, VNIC_SYSFS_FLEN, "%s-%s-%s", "vnic", + vnic->interface_name, str); + return buf; +} + +#ifndef _BP_NO_ATT_OWNER +#define DENTRY_OWNER(_vdentry) \ + (_vdentry)->dentry.attr.owner = THIS_MODULE; \ + (_vdentry)->kobj = &vdentry->dentry.attr.owner->mkobj.kobj; +#else +#define DENTRY_OWNER(_vdentry) \ + (_vdentry)->kobj = &(THIS_MODULE)->mkobj.kobj; +#endif + +#define DENTRY_REMOVE(_dentry) \ +do { \ + vnic_dbg_sysfs((_dentry)->name, "deleted\n"); \ + sysfs_remove_file((_dentry)->kobj, &(_dentry)->dentry.attr); \ + (_dentry)->ctx = NULL; \ +} while (0); + +#define DENTRY_CREATE(_ctx, _dentry, _name, _show, _store) \ +do { \ + struct vnic_sysfs_attr *vdentry = _dentry; \ + vdentry->ctx = _ctx; \ + vdentry->dentry.show = _show; \ + vdentry->dentry.store = _store; \ + vdentry->dentry.attr.name = vdentry->name; \ + vdentry->dentry.attr.mode = 0; \ + DENTRY_OWNER(vdentry); \ + snprintf(vdentry->name, VNIC_SYSFS_FLEN, "%s", _name); \ + if (vdentry->dentry.store) \ + vdentry->dentry.attr.mode |= S_IWUSR; \ + if (vdentry->dentry.show) \ + vdentry->dentry.attr.mode |= S_IRUGO; \ + vnic_dbg_sysfs(_ctx->name, "creating %s\n", \ + vdentry->name); \ + if (strlen(_name) > VNIC_SYSFS_FLEN) { \ + vnic_err(_ctx->name, "name too long %d > %d\n", \ + (int)strlen(_name), VNIC_SYSFS_FLEN); \ + vdentry->ctx = NULL; \ + break; \ + } \ + if (sysfs_create_file(vdentry->kobj, &vdentry->dentry.attr)) { \ + vnic_err(_ctx->name, "failed to create %s\n", \ + vdentry->dentry.attr.name); \ + vdentry->ctx = NULL; \ + break; \ + } \ + vnic_dbg_sysfs(_ctx->name, "created %s\n", vdentry->name); \ +} while (0); + +/* helper functions */ +static const char *port_phys_state_str(enum ib_port_state pstate) +{ + switch (pstate) { + case 0: + return "no_state_change"; + case 1: + return "sleep"; + case 2: + return "polling"; + case 3: + return "disabled"; + case 4: + return "port_configuration_training"; + case 5: + return "up"; + case 6: + return "error_recovery"; + case 7: + return "phy_test"; + default: + return "invalid_state"; + } +} +static const char *port_state_str(enum ib_port_state pstate) +{ + switch (pstate) { + case IB_PORT_DOWN: + return "down"; + case IB_PORT_INIT: + return "initializing"; + case IB_PORT_ARMED: + return "armed"; + case IB_PORT_ACTIVE: + return "active"; + case IB_PORT_NOP: + return "nop"; + case IB_PORT_ACTIVE_DEFER: + return "defer"; + default: + return "invalid_state"; + } +} + +/* store/show functions */ +static ssize_t vnic_neigh_show(struct module_attribute *attr, + struct module *mod, char *buf) +{ + char *p = buf; + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_login *login = vnic_dentry->ctx; + struct vnic_neigh *neighe; + struct vnic_mcast *mcaste; + struct rb_node *n; + unsigned long flags; + + /* check if GW entry is ready */ + if (!login->gw_neigh) + goto out; + ASSERT(login->gw_neigh); + ASSERT(login->gw_neigh->ah); + + /* print GW entry */ + neighe = 
login->gw_neigh; + p += _sprintf(p, buf, "G:MAC["MAC_6_PRINT_FMT"] VID[0x%04x] " + "VID_USED[%d] QPN[0x%06x] LID[0x%04x] RSS[%d]\n", + MAC_6_PRINT_ARG(neighe->mac), + be16_to_cpu(login->vid), login->vlan_used, neighe->qpn, + neighe->lid, neighe->rss); + + /* print neigh tree entries */ + n = rb_first(&login->neigh_tree); + while (n) { + neighe = rb_entry(n, struct vnic_neigh, rb_node); + p += _sprintf(p, buf, "U:MAC["MAC_6_PRINT_FMT"] VID[0x%04x] " + "VID_USED[%d] QPN[0x%06x] LID[0x%04x] RSS[%d]\n", + MAC_6_PRINT_ARG(neighe->mac), + be16_to_cpu(login->vid), login->vlan_used, + neighe->qpn, neighe->lid, neighe->rss); + n = rb_next(n); + } + + /* print mcast tree entries */ + spin_lock_irqsave(&login->mcast_tree.mcast_rb_lock, flags); + n = rb_first(&login->mcast_tree.mcast_tree); + while (n) { + u16 lid = 0xFFFF; + mcaste = rb_entry(n, struct vnic_mcast, rb_node); + n = rb_next(n); + if (test_bit(MCAST_ATTACHED, &mcaste->state)) + lid = mcaste->port_mcaste->rec.mlid; + p += _sprintf(p, buf, "M:MAC["MAC_6_PRINT_FMT"] VID[0x%04x] " + "VID_USED[%d] QPN[0x%06x] LID[0x%04x] RSS[%d]\n", + MAC_6_PRINT_ARG(mcaste->mac), + 0, login->vlan_used, IB_MULTICAST_QPN, lid, 0); + } + spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags); + +out: + return (ssize_t)(p - buf); +} + +/* store/show functions */ +static ssize_t vnic_member_show(struct module_attribute *attr, + struct module *mod, char *buf) +{ + char *p = buf; + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_login *login = vnic_dentry->ctx; + int i; + + if (!login->is_lag) + goto out; + + netif_tx_lock_bh(login->dev); + p += _sprintf(p, buf, "GW member count=%d active count=%d hash " + "bitmask=0x%x\n", login->lag_member_count, + login->lag_member_active_count, + login->lag_prop.hash_mask); + + p += _sprintf(p, buf, "GW hash mapping table:\n"); + + for (i = 0; i < LAG_MAP_TABLE_SIZE; i += 8) { + p += _sprintf(p, buf, "%3d %3d %3d %3d %3d %3d %3d %3d\n", + login->lag_gw_map[i], login->lag_gw_map[i + 1], + login->lag_gw_map[i + 2], login->lag_gw_map[i + 3], + login->lag_gw_map[i + 4], login->lag_gw_map[i + 5], + login->lag_gw_map[i + 6], login->lag_gw_map[i + 7]); + } + + p += _sprintf(p, buf, "\nGW member state info: (0x1-created, " + "0x2-eport up, 0x4-mcast join complete, " + "0x8-member in use)\n"); + + for (i = 0; i < MAX_LAG_MEMBERS; i++) { + p += _sprintf(p, buf, "%2d GW id=%3d State=0x%3x LID=%3d " + "QPN=0x%X\n", i, + login->lag_gw_neigh[i].gw_id, + login->lag_gw_neigh[i].info, + login->lag_gw_neigh[i].neigh.lid, + login->lag_gw_neigh[i].neigh.qpn); + } + netif_tx_unlock_bh(login->dev); + +out: + return (ssize_t)(p - buf); +} + +static ssize_t vnic_login_show(struct module_attribute *attr, + struct module *mod, char *buf) +{ + char *p = buf, tmp_line[VNIC_SYSFS_LLEN]; + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_login *login = vnic_dentry->ctx; + struct fip_vnic_data *vnic_fip = login->fip_vnic; + int rc, eport_connected = test_bit(VNIC_STATE_LOGIN_CREATE_2, &login->state); + u16 pkey_used = 0; + int lag_gw; + int ret; + + ASSERT(login->dev); + ASSERT(login->port->dev->ca); + + /* NETDEV attributes */ + p += _sprintf(p, buf, "NETDEV_NAME %s\n", login->dev->name); + p += _sprintf(p, buf, "NETDEV_LINK %s\n", + netif_carrier_ok(login->dev) ? "up" : "down"); + p += _sprintf(p, buf, "NETDEV_OPEN %s\n", + (login->dev->flags & IFF_UP) ? 
"yes" : "no"); + p += _sprintf(p, buf, "NETDEV_QSTOP %s\n", + netif_queue_stopped(login->dev) ? "yes" : "no"); + p += _sprintf(p, buf, "NETDEV_MTU %d/%d\n", + (int)login->dev->mtu, + (int)login->max_mtu); + + /* IOA attributes */ + p += _sprintf(p, buf, "IOA_PORT %s:%d\n", + login->port->dev->ca->name, + login->port->num); + p += _sprintf(p, buf, "IOA_NAME %s\n", + login->desc); + p += _sprintf(p, buf, "IOA_LID 0x%04x\n", login->port->attr.lid); + p += _sprintf(p, buf, "IOA_GUID "VNIC_GUID_FMT"\n", + VNIC_GUID_RAW_ARG(login->port->gid.raw + 8)); + p += _sprintf(p, buf, "IOA_LOG_LINK %s\n", + port_phys_state_str(login->port->attr.phys_state)); + p += _sprintf(p, buf, "IOA_PHY_LINK %s\n", + port_state_str(login->port->attr.state)); + p += _sprintf(p, buf, "IOA_MTU %d\n", login->port->max_mtu_enum); + + + /* EPORT and BX attributes */ + if (no_bxm) { + p += _sprintf(p, buf, "EPORT_STATE %s\n", "bridgeless"); + } else if (vnic_fip) { + p += _sprintf(p, buf, "EPORT_STATE %s\n", + !eport_connected ? "disconnected" : + (fip_vnic_get_eport_state(vnic_fip) ? + "up" : "down")); + p += _sprintf(p, buf, "EPORT_NAME %s\n", + fip_vnic_get_eport_name(vnic_fip, tmp_line) ? + NOT_AVAILABLE_STRING : tmp_line); + p += _sprintf(p, buf, "EPORT_QPN 0x%06x\n", + login->gw_neigh ? login->gw_neigh->qpn : 0); + p += _sprintf(p, buf, "EPORT_LID 0x%04x\n", + login->gw_neigh ? login->gw_neigh->lid : 0); + p += _sprintf(p, buf, "EPORT_ID %u\n", login->gw_port_id); + + p += _sprintf(p, buf, "BX_NAME %s\n", + fip_vnic_get_bx_name(vnic_fip, tmp_line) ? + NOT_AVAILABLE_STRING : tmp_line); + fip_vnic_get_bx_guid(vnic_fip, tmp_line); + if (*((u64 *)tmp_line) == 0) + p += _sprintf(p, buf, "BX_GUID %s\n", NOT_AVAILABLE_STRING); + else + p += _sprintf(p, buf, "BX_GUID "VNIC_GUID_FMT"\n", + VNIC_GUID_RAW_ARG(tmp_line)); + + lag_gw = fip_vnic_get_gw_type(vnic_fip); + if (lag_gw) { + p += _sprintf(p, buf, "GW_TYPE LAG\n"); + ret = fip_vnic_get_lag_eports(vnic_fip, p); + p += (ret > 0) ? ret : 0; + } else + p += _sprintf(p, buf, "GW_TYPE LEGACY\n"); + + rc = fip_vnic_get_all_vlan_mode(vnic_fip, tmp_line); + p += _sprintf(p, buf, "ALL_VLAN %s\n", + rc < 0 ? NOT_AVAILABLE_STRING : tmp_line); + + } else { + p += _sprintf(p, buf, "EPORT_STATE %s\n", "error"); + } + + /* misc attributes*/ + p += _sprintf(p, buf, "SW_RSS %s\n", + !eport_connected ? NOT_AVAILABLE_STRING : + ((login->qps_num > 1) ? "yes" : "no")); + p += _sprintf(p, buf, "SW_RSS_SIZE %u\n", login->qps_num); + p += _sprintf(p, buf, "RX_RINGS_NUM %d\n", login->rx_rings_num); + p += _sprintf(p, buf, "RX_RINGS_LIN %s\n", + login->port->rx_ring[0]->log_rx_info ? "no" : "yes"); + p += _sprintf(p, buf, "TX_RINGS_NUM %d\n", login->tx_rings_num); + p += _sprintf(p, buf, "TX_RINGS_ACT %d\n", + VNIC_TXQ_GET_ACTIVE(login)); + p += _sprintf(p, buf, "NDO_TSS %s\n", + (login->ndo_tx_rings_num > 1) ? "yes" : "no"); + p += _sprintf(p, buf, "NDO_TSS_SIZE %u\n", login->ndo_tx_rings_num); + p += _sprintf(p, buf, "MCAST_PROMISC %s\n", + !eport_connected ? NOT_AVAILABLE_STRING : + (is_mcast_promisc(login) ? "yes" : "no")); + p += _sprintf(p, buf, "UCAST_PROMISC %s\n", + (is_ucast_promisc(login) ? 
"yes" : "no")); + p += _sprintf(p, buf, "MCAST_MASK %d\n", login->n_mac_mcgid); + p += _sprintf(p, buf, "CHILD_VNICS %d/%d\n", + atomic_read(&login->vnic_child_cnt), + vnic_child_max); + p += _sprintf(p, buf, "PKEY 0x%04x\n", login->pkey); + p += _sprintf(p, buf, "PKEY_INDEX 0x%04x\n", login->pkey_index); + rc = ib_query_pkey(login->port->dev->ca, login->port->num, + login->pkey_index, &pkey_used); + p += _sprintf(p, buf, "PKEY_MEMBER %s\n", + (rc || !eport_connected) ? NOT_AVAILABLE_STRING : + ((pkey_used & 0x8000) ? "full" : "partial")); + p += _sprintf(p, buf, "SL_DATA %u\n", login->sl); + p += _sprintf(p, buf, "SL_CONTROL %u\n", + vnic_fip ? fip_vnic_get_bx_sl(vnic_fip) : 0); +#if defined(NETIF_F_GRO) && !defined(_BP_NO_GRO) + p += _sprintf(p, buf, "GRO %s\n", + login->dev->features & NETIF_F_GRO ? "yes" : "no"); +#elif defined(NETIF_F_LRO) + p += _sprintf(p, buf, "LRO %s\n", + login->dev->features & NETIF_F_LRO ? "yes" : "no"); + p += _sprintf(p, buf, "LRO_NUM %d\n", login->lro_num); +#endif + p += _sprintf(p, buf, "NAPI %s\n", + login->napi_num ? "yes" : "no"); + p += _sprintf(p, buf, "NAPI_WEIGHT %u\n", + login->napi_num ? vnic_napi_weight : 0); + p += _sprintf(p, buf, "QPN 0x%x\n", + login->qp_base_num); + p += _sprintf(p, buf, "MAC "MAC_6_PRINT_FMT"\n", + MAC_6_PRINT_ARG(login->dev_addr)); + p += _sprintf(p, buf, "VNIC_ID %d\n", + vnic_fip ? vnic_fip->vnic_id : 0); + p += _sprintf(p, buf, "ADMIN_MODE %s\n", + !vnic_fip ? NOT_AVAILABLE_STRING : + (vnic_fip->hadmined ? "host" : "network")); + + if (vnic_fip && vnic_fip->vlan_used) + p += _sprintf(p, buf, "VLAN 0x%03x\n", vnic_fip->vlan); + else + p += _sprintf(p, buf, "VLAN %s\n", NOT_AVAILABLE_STRING); + + if (vnic_fip && vnic_fip->shared_vnic.enabled) { + p += _sprintf(p, buf, "SHARED_MAC "MAC_6_PRINT_FMT"\n", + MAC_6_PRINT_ARG(vnic_fip->shared_vnic.emac)); + p += _sprintf(p, buf, "SHARED_IP "IP_4_PRINT_FMT"\n", + IP_4_PRINT_ARG(vnic_fip->shared_vnic.ip)); + } else { + p += _sprintf(p, buf, "SHARED_MAC %s\n", NOT_AVAILABLE_STRING); + p += _sprintf(p, buf, "SHARED_IP %s\n", NOT_AVAILABLE_STRING); + } + + return (ssize_t)(p - buf); +} + +static ssize_t vnic_qps_show(struct module_attribute *attr, + struct module *mod, char *buf) +{ + char *p = buf; + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_login *login = vnic_dentry->ctx; + struct ib_qp *qp; + struct ib_qp_attr query_attr; + struct ib_qp_init_attr query_init_attr; + int i, mask = -1; + + for (i = 0; i < login->qps_num; ++i) { + qp = login->qp_res[i].qp; + if (ib_query_qp(qp, &query_attr, mask, &query_init_attr)) + continue; + p += _sprintf(p, buf, "QP_INDEX %d\n", i); + p += _sprintf(p, buf, "QP_NUM 0x%06x\n", qp->qp_num); + p += _sprintf(p, buf, "QP_QKEY 0x%08x\n", query_attr.qkey); + p += _sprintf(p, buf, "QP_STATE 0x%02x\n", query_attr.qp_state); + p += _sprintf(p, buf, "QP_RX_RING %d\n", i % login->rx_rings_num); + p += _sprintf(p, buf, "QP_PTR %p\n", qp); + p += _sprintf(p, buf, "QP_RX_SRQ_PTR %p\n", qp->srq); + p += _sprintf(p, buf, "QP_RX_CQ_PTR %p\n", qp->recv_cq); + p += _sprintf(p, buf, "QP_TX_CQ_PTR %p\n", qp->send_cq); + p += _sprintf(p, buf, "\n"); + } + + return (ssize_t)(p - buf); +} + +#ifdef VNIC_PROFILLNG +static ssize_t vnic_dentry_prof_skb_show(struct module_attribute *attr, + struct module *mod, char *buf) +{ + char *p = buf; + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_login *login = vnic_dentry->ctx; + struct sk_buff *skb; + int i; + 
+ for (i = 0; i < VNIC_PROFILLNG_SKB_MAX; ++i) { + if (!login->prof_arr[i].cnt) + continue; + skb = &login->prof_arr[i].skb; + p += _sprintf(p, buf, "==============\n"); + p += _sprintf(p, buf, "SKB[%d] CNT %d\n", i, login->prof_arr[i].cnt); + p += _sprintf(p, buf, "len %d\n", skb->len); + p += _sprintf(p, buf, "data_len %d\n", skb->data_len); + p += _sprintf(p, buf, "head_len %d\n", skb_headlen(skb)); + p += _sprintf(p, buf, "gso %d\n", skb_is_gso(skb)); + p += _sprintf(p, buf, "nr_frags %d\n", login->prof_arr[i].nr_frags); + p += _sprintf(p, buf, "jiffies %lu\n", login->prof_arr[i].jiffies); + p += _sprintf(p, buf, "msecs %u\n", + jiffies_to_msecs(login->prof_arr[i].jiffies)); + p += _sprintf(p, buf, "msecs_diff %u\n", + jiffies_to_msecs(login->prof_arr[i].jiffies) - + jiffies_to_msecs(login->prof_arr[i ? i -1 : 0].jiffies)); + } + + return (ssize_t)(p - buf); +} + +#endif + +static int get_guid(u8 *guid, char *s) +{ + if (sscanf(s, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", + guid + 0, guid + 1, guid + 2, guid + 3, guid + 4, + guid + 5, guid + 6, guid + 7) != 8) + return -1; + + return 0; +} + +static int get_mac(u8 *mac, char *s) +{ + if (sscanf(s, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", + mac + 0, mac + 1, mac + 2, mac + 3, mac + 4, + mac + 5) != 6) + return -1; + + return 0; +} + +static int get_ipv4(short unsigned int *ip, char *s) +{ + if (sscanf(s, "%hu.%hu.%hu.%hu", ip + 0, ip + 1, ip + 2, ip + 3) != 4) + return -1; + + return 0; +} + +static int get_parent(struct vnic_port *port, char *parent) +{ + struct net_device *parent_netdev; + + /* check parent syntax */ + if (!dev_valid_name(parent)) + return -EINVAL; + + parent_netdev = dev_get_by_name(&init_net, parent); + if (parent_netdev) + dev_put(parent_netdev); + + return parent_netdev ? 0 : -ENODATA; +} + +static struct fip_hadmin_cache *get_hadmin_entry(void) +{ + struct fip_hadmin_cache *hadmin_entry; + + hadmin_entry = kzalloc(sizeof *hadmin_entry, GFP_ATOMIC); + if (!hadmin_entry) + return NULL; + + hadmin_entry->vnic_id = NOT_AVAILABLE_NUM; + hadmin_entry->gw_port_id = NOT_AVAILABLE_NUM; + + return hadmin_entry; +} + +void vnic_login_cmd_init(struct fip_hadmin_cmd *cmd) +{ + char *buf = (char *)cmd; + u8 i; + + for (i = 0; i < MAX_INPUT_ARG; ++i) + sprintf(buf + (i * MAX_INPUT_LEN), NOT_AVAILABLE_STRING); +} + +int vnic_login_cmd_set(char *buf, struct fip_hadmin_cmd *cmd) +{ + int count; + + if (cmd) { + count = sprintf(buf, "name=%s mac=%s vnic_id=%s vid=%s " + "bxname=%s bxguid=%s eport=%s ipv4=%s ipv6=%s " + "emac=%s pkey=%s parent=%s\n", + cmd->c_name, cmd->c_mac, cmd->c_vnic_id, + cmd->c_vid, cmd->c_bxname, cmd->c_bxguid, + cmd->c_eport, cmd->c_ipv4, cmd->c_ipv6, + cmd->c_emac, cmd->c_pkey, cmd->c_parent); + vnic_dbg_sysfs((char *)(cmd->c_name), "cmd: %s", buf); + } else /* print the cmd syntax */ + count = sprintf(buf, "name=%%s mac=%%s vnic_id=%%s vid=%%s " + "bxname=%%s bxguid=%%s eport=%%s ipv4=%%s " + "ipv6=%%s emac=%%s pkey=%%s parent=%%s\n"); + + return count; +} + +/* create/destroy child vNic; syntax example: + * +00:11:22:33:44:55 + */ +static ssize_t vnic_child_write(struct module_attribute *attr, + struct module *mod, + const char *buf, size_t count) +{ + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_login *login = vnic_dentry->ctx; + char action = buf[0]; + char *buf_mac = (char *)buf + 1; + int remove = -1; + u8 mac[ETH_ALEN]; + + if (action == '-') + remove = 1; + if (action == '+') + remove = 0; + + if (remove < 0 || get_mac(mac, buf_mac) || 
!is_valid_ether_addr(mac)) + return -EINVAL; + + vnic_learn_mac(login->dev, mac, remove); + return count; +} + +int fip_hadmin_sysfs_update(struct vnic_port *port, + const char *buf, int count, int remove) +{ + struct fip_discover *discover; + struct fip_hadmin_cache *hadmin_entry, *hadmin_it; + struct fip_hadmin_cmd *cmd; + char *name = NULL; + int rc, num; + u16 pkey; + + hadmin_entry = get_hadmin_entry(); + if (!hadmin_entry) { + rc = -ENOMEM; + vnic_dbg_sysfs(port->name, "get_hadmin_entry failed\n"); + goto err; + } + + cmd = &hadmin_entry->cmd; + rc = sscanf(buf, "name=%s mac=%s vnic_id=%s vid=%s bxname=%s bxguid=%s " + "eport=%s ipv4=%s ipv6=%s emac=%s pkey=%s parent=%s", + cmd->c_name, cmd->c_mac, cmd->c_vnic_id, cmd->c_vid, + cmd->c_bxname, cmd->c_bxguid, cmd->c_eport, cmd->c_ipv4, + cmd->c_ipv6, cmd->c_emac, cmd->c_pkey, cmd->c_parent); + if (rc != MAX_INPUT_ARG) { + vnic_dbg_sysfs(port->name, "sscanf failed, rc %d\n", rc); + rc = -EINVAL; + goto err; + } else + name = (char *)(cmd->c_name); + + /* get parent name */ + if (!dev_valid_name(cmd->c_parent)) + hadmin_entry->parent_used = 0; + else if (remove || !get_parent(port, cmd->c_parent)) { + vnic_dbg_sysfs(name, "parent set %s\n", cmd->c_parent); + strncpy(hadmin_entry->parent_name, cmd->c_parent, + sizeof(hadmin_entry->parent_name)); + hadmin_entry->parent_used = 1; + } else { + vnic_warn(name, "invalid parent name %s\n", cmd->c_parent); + rc = -EINVAL; + goto err; + } + + /* get vNic ID dec (must) */ + if (sscanf(cmd->c_vnic_id, "%d", &num) != 1) { + /* abort on failure */ + vnic_warn(name, "invalid vNic ID %s\n", cmd->c_vnic_id); + rc = -EINVAL; + goto err; + } + hadmin_entry->vnic_id = (u16)num; + + /* get vNic MAC (must) */ + if (get_mac(hadmin_entry->mac, cmd->c_mac)) { + vnic_warn(name, "invalid vNic MAC %s\n", cmd->c_vnic_id); + rc = -EINVAL; + goto err; + } + + /* get interface name (must) */ + if ((!dev_valid_name(cmd->c_name) && !hadmin_entry->parent_used) || + ((strlen(cmd->c_name) > VNIC_NAME_LEN) && hadmin_entry->parent_used)) { + vnic_warn(name, "invalid vNic name %s\n", cmd->c_name); + rc = -EINVAL; + goto err; + } + + strncpy(hadmin_entry->interface_name, cmd->c_name, + sizeof(hadmin_entry->interface_name)); + + /* get BX GUID, if fails, get BX NAME */ + if (get_guid(hadmin_entry->system_guid, cmd->c_bxguid)) { + strncpy(hadmin_entry->system_name, cmd->c_bxname, + sizeof(hadmin_entry->system_name)); + vnic_dbg_sysfs(name, "use BX NAME %s\n", cmd->c_bxname); + } + + /* get shared emac/ip */ + if (!get_ipv4((short unsigned int *)hadmin_entry->shared_vnic_ip, + cmd->c_ipv4)) { + /* TODO, add IPv6 support for shared vNic */ + get_mac(hadmin_entry->shared_vnic_mac, cmd->c_emac); + vnic_dbg_sysfs(name, "use shared ip/mac\n"); + } + + /* get VLAN field (dec) */ + if ((sscanf(cmd->c_vid, "%d", &num) == 1) && + num < VLAN_N_VID && num >= 0) { + /* set other fields on success, skip on failure */ + vnic_dbg_sysfs(name, "vlan set 0x%x\n", hadmin_entry->vlan); + hadmin_entry->vlan_used = 1; + hadmin_entry->vlan = (u16)num; + } else if (!strcmp(cmd->c_vid, ALL_VLAN_GW_VID)) { + /* Dont set 'vlan_used'. 
the code counts on it being NULL for + * host admin vnics in all_vlan mode, when Vlans are used */ + hadmin_entry->vlan = 0; + hadmin_entry->all_vlan_gw = 1; + } + + /* get eport name */ + if (!strlen(cmd->c_eport)) { + vnic_warn(name, "invalid eport name %s\n", cmd->c_eport); + rc = -EINVAL; + goto err; + } + strncpy(hadmin_entry->eport_name, cmd->c_eport, + sizeof(hadmin_entry->eport_name)); + + /* set remove/add flag */ + vnic_dbg_sysfs(name, "%s hadmin vNic\n", remove ? "remove" : "add"); + hadmin_entry->remove = remove; + + /* set pkey (hex) */ + if ((sscanf(cmd->c_pkey, "%x", &num) != 1) || !num) + pkey = 0xffff; /* default */ + else + pkey = (u16)num | 0x8000; + vnic_dbg_sysfs(name, "pkey 0x%x\n", pkey); + + /* cannot sleep in this functions for child vnics flow + * (avoid schedule while atomic oops) + * TODO: check if holding start_stop_lock is needed here + */ + //mutex_lock(&port->start_stop_lock); + + list_for_each_entry(discover, &port->fip.discover_list, discover_list) { + if (discover->pkey == pkey) { + spin_lock_irq(&discover->lock); + + if (discover->flush != FIP_NO_FLUSH) { + rc = -EBUSY; + spin_unlock_irq(&discover->lock); + goto skip; + } + + /* check that this mac/vlan is not in the cache list + * (saves redundant queue_delayed_work call during + * vnic_learn_mac bursts) + */ + list_for_each_entry_reverse(hadmin_it, &discover->hadmin_cache, next) { + if (!memcmp(hadmin_entry->mac, hadmin_it->mac, ETH_ALEN) && + hadmin_entry->vlan == hadmin_it->vlan && + hadmin_entry->remove == hadmin_it->remove) { + rc = -EEXIST; + spin_unlock_irq(&discover->lock); + goto skip; + } + } + list_add_tail(&hadmin_entry->next, &discover->hadmin_cache); + /* calls fip_discover_hadmin_update() */ + queue_delayed_work(fip_wq, &discover->hadmin_update_task, HZ/10); + spin_unlock_irq(&discover->lock); + goto updated_discover; + } + } + + //mutex_unlock(&port->start_stop_lock); + vnic_dbg_sysfs(name, "Requested PKEY=0x%x is not configured\n", pkey); + goto skip; + +err: + vnic_dbg_sysfs(name, "Invalid host admin request format string. 
Request rejected\n"); +skip: + kfree(hadmin_entry); + return rc; + +updated_discover: + //mutex_unlock(&port->start_stop_lock); + return count; +} + +static ssize_t vnic_login_cmd(struct module_attribute *attr, + struct module *mod, char *buf) +{ + char *p = buf; + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct fip_vnic_data *vnic_fip = vnic_dentry->ctx; + struct fip_hadmin_cmd *cmd; + + if (!vnic_fip || !vnic_fip->hadmined) + goto out; + + cmd = &vnic_fip->cmd; + p += _sprintf(p, buf, "name=%s mac=%s vnic_id=%s vid=%s bxname=%s bxguid=%s " + "eport=%s ipv4=%s ipv6=%s emac=%s pkey=%s parent=%s ", + cmd->c_name, cmd->c_mac, cmd->c_vnic_id, cmd->c_vid, + cmd->c_bxname, cmd->c_bxguid, cmd->c_eport, cmd->c_ipv4, + cmd->c_ipv6, cmd->c_emac, cmd->c_pkey, cmd->c_parent); + p += _sprintf(p, buf, "ib_port=%s", vnic_fip->port->name); + p += _sprintf(p, buf, "\n"); + +out: + return (ssize_t)(p - buf); +} + +int vnic_create_hadmin_dentry(struct fip_vnic_data *vnic) +{ + char name[VNIC_SYSFS_FLEN]; + + DENTRY_CREATE(vnic, &vnic->dentry, + vnic_dentry_name(name, vnic, "cmd"), + vnic_login_cmd, NULL); + return 0; +} + +void vnic_delete_hadmin_dentry(struct fip_vnic_data *vnic) +{ + if (vnic->dentry.ctx) + DENTRY_REMOVE(&vnic->dentry); +} + +int vnic_create_dentry(struct vnic_login *login) +{ + int i = 0; + char name[VNIC_SYSFS_FLEN]; + + DENTRY_CREATE(login, &login->dentries[i++], + login_dentry_name(name, login, "info"), + vnic_login_show, NULL); + DENTRY_CREATE(login, &login->dentries[i++], + login_dentry_name(name, login, "child"), + NULL, vnic_child_write); + DENTRY_CREATE(login, &login->dentries[i++], + login_dentry_name(name, login, "neigh"), + vnic_neigh_show, NULL); + DENTRY_CREATE(login, &login->dentries[i++], + login_dentry_name(name, login, "qps"), + vnic_qps_show, NULL); + DENTRY_CREATE(login, &login->dentries[i++], + login_dentry_name(name, login, "member"), + vnic_member_show, NULL); + +#ifdef VNIC_PROFILLNG + DENTRY_CREATE(login, &login->dentries[i++], + login_dentry_name(name, login, "prof_skb"), + vnic_dentry_prof_skb_show, NULL); +#endif + return 0; +} + +void vnic_delete_dentry(struct vnic_login *login) +{ + int i; + + for (i = 0; i < VNIC_MAX_DENTRIES; ++i) { + if (login->dentries[i].ctx) + DENTRY_REMOVE(&login->dentries[i]); + } +} + +static ssize_t port_gw_fs_show(struct module_attribute *attr, + struct module *mod, char *buf) +{ + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_port *port = vnic_dentry->ctx; + + return fip_gw_sysfs_show(port, buf); +} + +static ssize_t port_hadmin_syntax(struct module_attribute *attr, + struct module *mod, char *buf) +{ + /* print cmd syntax only (for usage) */ + return vnic_login_cmd_set(buf, NULL); +} + +static ssize_t port_hadmin_add_write(struct module_attribute *attr, + struct module *mod, + const char *buf, size_t count) +{ + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_port *port = vnic_dentry->ctx; + + return fip_hadmin_sysfs_update(port, buf, count, 0); +} + +static ssize_t port_hadmin_del_write(struct module_attribute *attr, + struct module *mod, + const char *buf, size_t count) +{ + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_port *port = vnic_dentry->ctx; + + return fip_hadmin_sysfs_update(port, buf, count, 1); +} + +int port_fs_init(struct vnic_port *port) +{ + int i = 0; + char name[VNIC_SYSFS_FLEN]; + + 
DENTRY_CREATE(port, &port->dentries[i++], + port_dentry_name(name, port, "host_add"), + port_hadmin_syntax, port_hadmin_add_write); + + DENTRY_CREATE(port, &port->dentries[i++], + port_dentry_name(name, port, "host_del"), + port_hadmin_syntax, port_hadmin_del_write); + + DENTRY_CREATE(port, &port->dentries[i++], + port_dentry_name(name, port, "gws"), + port_gw_fs_show, NULL); + + return 0; +} + +void port_fs_exit(struct vnic_port *port) +{ + int i; + + for (i = 0; i < VNIC_MAX_DENTRIES; ++i) { + if (port->dentries[i].ctx) + DENTRY_REMOVE(&port->dentries[i]); + } +} + diff --git a/drivers/net/mlx4_vnic/vnic_data_ib.c b/drivers/net/mlx4_vnic/vnic_data_ib.c new file mode 100644 index 0000000000000..ba031d8be293f --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_data_ib.c @@ -0,0 +1,1632 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include + +#include "vnic.h" +#include "vnic_data.h" + +int vnic_post_recv(struct vnic_rx_ring *ring, u64 wr_id) +{ + struct ib_recv_wr *bad_wr; + int i, rc; + + ring->wr.wr_id = wr_id; + + for (i = 0; i < ring->num_frags; i++) + ring->sge[i].addr = ring->rx_info[wr_id].dma_addr[i]; + + rc = ib_post_srq_recv(ring->srq, &ring->wr, &bad_wr); + if (unlikely(rc)) { + /* we will not use a lock here. In the worst case we will have + * an incorrect value of need_refill. 
Not a biggie + */ + + /*ring->rx_info[wr_id].info = VNIC_FRAG_NOT_POSTED; + ring->need_refill = 1; + */ + vnic_dbg_data(ring->port->name, "receive failed for buf %llu (%d)\n", + wr_id, rc); + } + + return rc; +} + +static void vnic_dealloc_tx_skb(struct vnic_login *login, unsigned cq_index, + u64 wr_id) +{ + struct vnic_tx_res *tx_res = &login->tx_res[cq_index]; + int is_inline = !!(wr_id & VNIC_SEND_INLINE_FLAG); + struct sk_buff *skb; + u64 *mapping; + int i, off = 0; + + wr_id &= ~VNIC_SEND_INLINE_FLAG; + skb = tx_res->tx_ring[wr_id].skb; + ASSERT(skb); + mapping = tx_res->tx_ring[wr_id].mapping; + + if (!is_inline) { + if (!vnic_encap_headroom && !skb_is_gso(skb)) { + ib_dma_unmap_single(login->port->dev->ca, mapping[off], + VNIC_ENCAP_LEN, DMA_TO_DEVICE); + off++; + } + if (skb_headlen(skb)) { + ib_dma_unmap_single(login->port->dev->ca, mapping[off], + skb_headlen(skb), DMA_TO_DEVICE); + off++; + } + for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + ib_dma_unmap_page(login->port->dev->ca, + mapping[i + off], frag->size, + DMA_TO_DEVICE); + } + } + + /* dealloc skb */ + dev_kfree_skb_any(skb); + tx_res->tx_ring[wr_id].skb = NULL; +} + +static void vnic_ib_handle_tx_wc(struct vnic_login *login, + int tx_res_index, struct ib_wc *wc) +{ + struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index]; + u64 wr_id = wc->wr_id & ~VNIC_SEND_INLINE_FLAG; + + vnic_dbg_data(login->name, "send completion: wr_id %llu, status: %d " + "[head %d - tail %d]\n", wr_id, wc->status, + tx_res->tx_head, tx_res->tx_tail); + + ASSERT(wr_id < vnic_tx_rings_len); + vnic_dealloc_tx_skb(login, tx_res_index, wc->wr_id); + + ++tx_res->tx_tail; + --tx_res->tx_outstanding; + + if (unlikely(wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR)) { + vnic_warn(login->name, "failed send event " + "(status %d, wr_id %llu, vend_err 0x%x)\n", + wc->status, wr_id, wc->vendor_err); + vnic_warn(login->name, "TX CQE error, queueing rings restart\n"); + if (!login->queue_stopped) + queue_delayed_work(login_wq, &login->restart_task, HZ / 100); + } +} + +int vnic_post_recvs(struct vnic_rx_ring *ring) +{ + int i, rc; + + for (i = 0; i < ring->size; i++) { + rc = vnic_post_recv(ring, i); + if (rc) { + vnic_err(ring->port->name, "Failed post receive %d\n", rc); + return rc; + } + } + + return 0; +} + +static int vnic_vlan_is_valid(struct vnic_login *login, + struct vlan_ethhdr *veth) +{ + ASSERT(veth->h_vlan_proto == htons(ETH_P_8021Q)); + if ((be16_to_cpu(veth->h_vlan_TCI) & 0xfff) != + be16_to_cpu(login->vid)) { + vnic_dbg_data(login->name, "invalid vlan, ingress vid " + "0x%x, login: vid 0x%x vlan_used %d\n", + be16_to_cpu(veth->h_vlan_TCI), + be16_to_cpu(login->vid), + login->vlan_used); + return 0; + } + + return 1; +} + +/* If a vlan tag should exist in the eth_hdr - validate it. + is_vlan_proto is set if the vlan protocol is present in the eth header + return values: 0 - on success, 1 - on error: + for an all-vlan gateway (promisc vlan): + 0 - there is no vlan, or there is a vlan and it is valid + 1 - vlan is present and not valid. + for all other vlans: + 0 - there shouldn't be a vlan, or vlan should be present and is valid. + 1 - vlan should be present and it is not, or it is not valid. 
*/ +static int validate_vnic_vlan(struct vnic_login *login, + struct vlan_ethhdr *veth, + int *is_vlan_proto) +{ + int is_vlan = !!(veth->h_vlan_proto == htons(ETH_P_8021Q)); + + *is_vlan_proto = is_vlan; + + if (login->all_vlan_gw) + return 0; + + if (VNIC_VLAN_ENABLED(login) && login->vid && !is_vlan) { + vnic_dbg_data(login->name, "missing vlan tag\n"); + VNIC_STATS_INC(login->port_stats.vlan_err); + return 1; + } + + if (is_vlan && unlikely(!vnic_vlan_is_valid(login, veth))) { + vnic_dbg_data(login->name, "invalid vlan tag\n"); + VNIC_STATS_INC(login->port_stats.vlan_err); + return 1; + } + + return 0; +} + +static void vnic_ib_handle_rx_wc_linear(struct vnic_login *login, + struct ib_wc *wc, int rx_ring_index) +{ + struct vnic_rx_ring *ring = login->port->rx_ring[rx_ring_index]; + struct eoibhdr *eoib_hdr; + struct sk_buff *skb; + struct vlan_ethhdr *veth; + int rc, wr_id = wc->wr_id, checksum_ok, ip_summed, + buf_size = VNIC_BUF_SIZE(ring->port); + int is_vlan_proto; + u64 mapping; + u16 eth_type; + u8 *va, *eth_hdr; + + spin_lock_bh(&ring->lock); + ASSERT(wr_id < ring->size); + + skb = ring->rx_info[wr_id].skb; + mapping = ring->rx_info[wr_id].dma_addr[0]; + + /* termination with error */ + if (unlikely(wc->status != IB_WC_SUCCESS)) { + if(wc->status != IB_WC_REM_ABORT_ERR && + wc->status != IB_WC_LOC_LEN_ERR) { + vnic_dbg_data(login->name, "RX CQE error " + "(status %d, vend_err 0x%x), " + "queueing rings restart\n", + wc->status, wc->vendor_err); + if (!login->queue_stopped) + queue_delayed_work(login_wq, + &login->restart_task, + HZ / 10); + } + goto repost; + } + + ASSERT(skb); + ASSERT(mapping); + + /* If we can't allocate a new RX buffer, dump + * this packet and reuse the old buffer. + */ + if (unlikely(!vnic_alloc_rx_skb(ring, wr_id, GFP_ATOMIC))) { + VNIC_STATS_DO_INC(login->stats.rx_dropped); + goto repost; + } + + ib_dma_unmap_single(login->port->dev->ca, mapping, + buf_size, DMA_FROM_DEVICE); + skb_put(skb, wc->byte_len); + skb_pull(skb, IB_GRH_BYTES); + + /* check EoIB header signature and version */ + va = skb->data; + eoib_hdr = (struct eoibhdr *)va; + if (unlikely(VNIC_EOIB_HDR_GET_SIG(eoib_hdr) != VNIC_EOIB_HDR_SIG || + VNIC_EOIB_HDR_GET_VER(eoib_hdr) != VNIC_EOIB_HDR_VER)) { + vnic_dbg_data(login->name, "bad sig (0x%x) or ver (0x%x)\n", + VNIC_EOIB_HDR_GET_SIG(eoib_hdr), + VNIC_EOIB_HDR_GET_VER(eoib_hdr)); + VNIC_STATS_INC(login->port_stats.sig_ver_err); + goto repost; + } + + /* check EoIB CSUM */ + checksum_ok = login->rx_csum && VNIC_CSUM_OK(eoib_hdr); + ip_summed = checksum_ok ? 
CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + if (likely((checksum_ok))) + VNIC_STATS_INC(login->port_stats.rx_chksum_good); + else + VNIC_STATS_INC(login->port_stats.rx_chksum_none); + + /* Ethernet header */ + skb_pull(skb, VNIC_ENCAP_LEN); + va += VNIC_ENCAP_LEN; + veth = (struct vlan_ethhdr *)(va); + + eth_hdr = va; + eth_type = be16_to_cpu(((struct ethhdr *)(va))->h_proto); + + /* validate VLAN tag, strip it if valid */ + if (validate_vnic_vlan(login, veth, &is_vlan_proto)) + goto repost; + + /* for all_vlan_gw - we don't strip the packet but send it as is*/ + if (!login->all_vlan_gw && is_vlan_proto) { + eth_type = be16_to_cpu(veth->h_vlan_encapsulated_proto); + eth_hdr += VLAN_HLEN; + skb_pull(skb, VLAN_HLEN); + memmove(eth_hdr, va, ETH_ALEN * 2); + } + + /* update skb fields, keep this before LRO/GRO funcs */ + skb->dev = login->dev; + skb->protocol = eth_type_trans(skb, skb->dev); + skb->ip_summed = ip_summed; + +#if defined(NETIF_F_GRO) && !defined(_BP_NO_GRO) + if ((login->dev->features & NETIF_F_GRO) && checksum_ok) { + struct vnic_rx_res *rx_res = &login->rx_res[rx_ring_index]; + int ret; + + ret = napi_gro_receive(&rx_res->napi, skb); + if (ret == GRO_HELD) + VNIC_STATS_INC(login->port_stats.gro_held); + else if (ret == GRO_NORMAL) + VNIC_STATS_INC(login->port_stats.gro_normal); + else if (ret == GRO_MERGED || ret == GRO_MERGED_FREE) + VNIC_STATS_INC(login->port_stats.gro_merged); + else + VNIC_STATS_INC(login->port_stats.gro_drop); + + goto rx_repost; + } +#elif defined(NETIF_F_LRO) + if (login->dev->features & NETIF_F_LRO && checksum_ok) { + struct vnic_rx_res *rx_res = &login->rx_res[rx_ring_index]; + + /* processed for LRO */ + lro_receive_skb(&rx_res->lro, skb, NULL); + VNIC_STATS_INC(login->port_stats.lro_aggregated); + + goto rx_repost; + } +#endif + + rc = vnic_rx(login, skb, wc); + if (unlikely(rc)) { + vnic_dbg_data(login->name, "vnic_rx failed, rc %d\n", rc); + goto repost; + } + +rx_repost: + VNIC_STATS_INC(ring->stats.rx_packets); + VNIC_STATS_ADD(ring->stats.rx_bytes, wc->byte_len); + + VNIC_STATS_DO_INC(login->stats.rx_packets); + VNIC_STATS_DO_ADD(login->stats.rx_bytes, wc->byte_len); + + if (unlikely(vnic_post_recv(ring, wr_id))) + vnic_dbg_data(login->name, "failed to post RX WQE id %d\n", + (int)wr_id); + spin_unlock_bh(&ring->lock); + + return; + +repost: + login->dev->last_rx = jiffies; + if (unlikely(vnic_post_recv(ring, wr_id))) + vnic_dbg_data(login->name, "failed to post RX WQE id %d\n", + (int)wr_id); + + VNIC_STATS_INC(ring->stats.rx_dropped); + VNIC_STATS_DO_INC(login->stats.rx_dropped); + spin_unlock_bh(&ring->lock); + + return; +} + +static void vnic_ib_handle_rx_wc(struct vnic_login *login, + struct ib_wc *wc, int rx_ring_index) +{ + struct vnic_rx_ring *ring = login->port->rx_ring[rx_ring_index]; + struct ib_device *ib_device = login->port->dev->ca; + struct vnic_frag_data *frags_entry; + struct skb_frag_struct frags[VNIC_MAX_RX_FRAGS] = {}; + struct eoibhdr *eoib_hdr; + struct vlan_ethhdr *veth; + struct iphdr *ip_hdr; + u64 wr_id = wc->wr_id; + u16 eth_type; + u8 *va, *eth_hdr, ip_type; + int rc, checksum_ok, ip_offset = ETH_HLEN, + packet_length = wc->byte_len - VNIC_EOIB_HDR_SIZE, + page_offset = VNIC_EOIB_HDR_SIZE, ip_summed; + int is_vlan_proto; + + spin_lock_bh(&ring->lock); + ASSERT(wr_id < ring->size); + + /* termination with error */ + if (unlikely(wc->status != IB_WC_SUCCESS)) { + if(wc->status != IB_WC_REM_ABORT_ERR && + wc->status != IB_WC_LOC_LEN_ERR) { + vnic_dbg_data(login->name, "RX CQE error " + "(status %d, vend_err 0x%x), " + 
"queueing rings restart\n", + wc->status, wc->vendor_err); + if (!login->queue_stopped) + queue_delayed_work(login_wq, &login->restart_task, HZ / 10); + goto out; + } + goto drop_repost; + } + + frags_entry = &ring->rx_info[wr_id]; + + /* ensure cache coherency for packet headers and get vq */ + ib_dma_sync_single_for_cpu(ib_device, + ring->rx_info[wr_id].dma_addr[0] + IB_GRH_BYTES, + MAX_HEADER_SIZE, DMA_FROM_DEVICE); + va = page_address(ring->rx_info[wr_id].frags[0].page) + + ring->rx_info[wr_id].frags[0].page_offset + IB_GRH_BYTES; + + /* check EoIB header signature and version */ + eoib_hdr = (struct eoibhdr *)va; + if (unlikely(VNIC_EOIB_HDR_GET_SIG(eoib_hdr) != VNIC_EOIB_HDR_SIG || + VNIC_EOIB_HDR_GET_VER(eoib_hdr) != VNIC_EOIB_HDR_VER)) { + vnic_dbg_data(login->name, "bad sig (0x%x) or ver (0x%x)\n", + VNIC_EOIB_HDR_GET_SIG(eoib_hdr), + VNIC_EOIB_HDR_GET_VER(eoib_hdr)); + VNIC_STATS_INC(login->port_stats.sig_ver_err); + goto unmap_repost; + } + + /* check EoIB CSUM */ + checksum_ok = login->rx_csum && VNIC_CSUM_OK(eoib_hdr); + ip_summed = checksum_ok ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + if (likely((checksum_ok))) + VNIC_STATS_INC(login->port_stats.rx_chksum_good); + else + VNIC_STATS_INC(login->port_stats.rx_chksum_none); + + /* Ethernet header */ + va += VNIC_ENCAP_LEN; + veth = (struct vlan_ethhdr *)(va); + + eth_hdr = va; + eth_type = be16_to_cpu(((struct ethhdr *)(va))->h_proto); + + /* validate VLAN tag, strip it if valid + * - if VID is set and !0, then VLAN tag must exist + * note: VID zero can accept untagged packets + * - if ingress VID exists: validate it, and update the packet + * note: rx user prio is ignored + * - else; it's valid untagged packet + */ + if (validate_vnic_vlan(login, veth, &is_vlan_proto)) + goto unmap_repost; + + /* for all_vlan_gw - we don't strip the packet but send it as is*/ + if (!login->all_vlan_gw && is_vlan_proto) { + ip_offset += VLAN_HLEN; + page_offset += VLAN_HLEN; + packet_length -= VLAN_HLEN; + eth_hdr += VLAN_HLEN; + eth_type = be16_to_cpu(veth->h_vlan_encapsulated_proto); + memmove(eth_hdr, va, ETH_ALEN * 2); + } + + /* IP header */ + va += ip_offset; + ip_hdr = (struct iphdr *)va; + ip_type = ip_hdr->protocol; + + ib_dma_sync_single_for_device(ib_device, + frags_entry->dma_addr[0] + IB_GRH_BYTES, + MAX_HEADER_SIZE, DMA_FROM_DEVICE); + +#if defined(NETIF_F_GRO) && !defined(_BP_NO_GRO) + if ((login->dev->features & NETIF_F_GRO) && checksum_ok) { + struct vnic_rx_res *rx_res = &login->rx_res[rx_ring_index]; + struct sk_buff *gro_skb; + struct skb_frag_struct *gro_frags; + int nr_frags, ret; + + gro_skb = napi_get_frags(&rx_res->napi); + if (!gro_skb) + goto drop_repost; + + gro_frags = skb_shinfo(gro_skb)->frags; + nr_frags = vnic_unmap_and_replace_rx(ring, ib_device, + gro_frags, wr_id, + wc->byte_len); + if (unlikely(!nr_frags)) + goto drop_repost; + + /* disregard GRH and eoib headers */ + gro_frags[0].page_offset += page_offset; + gro_frags[0].size -= page_offset; + + skb_shinfo(gro_skb)->nr_frags = nr_frags; + gro_skb->len = packet_length; + gro_skb->data_len = packet_length; + gro_skb->truesize += packet_length; + gro_skb->ip_summed = CHECKSUM_UNNECESSARY; + + /* processed for GRO */ + skb_record_rx_queue(gro_skb, rx_res->index); + ret = napi_gro_frags(&rx_res->napi); + if (ret == GRO_HELD) + VNIC_STATS_INC(login->port_stats.gro_held); + else if (ret == GRO_NORMAL) + VNIC_STATS_INC(login->port_stats.gro_normal); + else if (ret == GRO_MERGED || ret == GRO_MERGED_FREE) + VNIC_STATS_INC(login->port_stats.gro_merged); + else + 
VNIC_STATS_INC(login->port_stats.gro_drop); + + goto rx_repost; + } +#elif defined(NETIF_F_LRO) + if (login->dev->features & NETIF_F_LRO && checksum_ok && + eth_type == ETH_P_IP && ip_type == IPPROTO_TCP) { + struct vnic_rx_res *rx_res = &login->rx_res[rx_ring_index]; + int nr_frags; + + /* unmap the needed fragments and reallocate them. + * Fragments that were not used will be reused as is. */ + nr_frags = vnic_unmap_and_replace_rx(ring, ib_device, frags, + wr_id, wc->byte_len); + if (unlikely(!nr_frags)) + goto drop_repost; + + /* disregard GRH and eoib headers */ + frags[0].page_offset += page_offset; + frags[0].size -= page_offset; + + /* processed for LRO */ + lro_receive_frags(&rx_res->lro, frags, packet_length, + packet_length, NULL, 0); + VNIC_STATS_INC(login->port_stats.lro_aggregated); + + goto rx_repost; + } +#endif + + rc = vnic_rx_skb(login, ring, wc, ip_summed, eth_hdr); + if (unlikely(rc)) { + vnic_dbg_data(login->name, "vnic_rx_skb failed, rc %d\n", rc); + goto drop_repost; + } + +rx_repost: + /* must hold lock when touching login->stats so the stats + * task won't read invalid values + */ + spin_lock(&login->stats_lock); + VNIC_STATS_INC(ring->stats.rx_packets); + VNIC_STATS_ADD(ring->stats.rx_bytes, packet_length); + + VNIC_STATS_DO_INC(login->stats.rx_packets); + VNIC_STATS_DO_ADD(login->stats.rx_bytes, packet_length); + spin_unlock(&login->stats_lock); + + login->dev->last_rx = jiffies; + if (vnic_post_recv(ring, wr_id)) + vnic_dbg_data(login->name, "vnic_post_recv failed, " + "wr_id %llu\n", wr_id); + spin_unlock_bh(&ring->lock); + + return; + +unmap_repost: + /* ignore rc of vnic_unmap_and_replace_rx() */ + vnic_unmap_and_replace_rx(ring, ib_device, frags, + wr_id, wc->byte_len); +drop_repost: + VNIC_STATS_INC(ring->stats.rx_dropped); + + spin_lock(&login->stats_lock); + VNIC_STATS_DO_INC(login->stats.rx_dropped); + spin_unlock(&login->stats_lock); + + if (vnic_post_recv(ring, wr_id)) + vnic_dbg_data(login->name, "vnic_post_recv failed, " + "wr_id %llu\n", wr_id); +out: + spin_unlock_bh(&ring->lock); + return; +} + +static inline void vnic_drain_tx_cq(struct vnic_login *login, + int tx_res_index) +{ + struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index]; + int n, i; + + do { + n = ib_poll_cq(tx_res->cq, VNIC_MAX_TX_CQE, tx_res->send_wc); + for (i = 0; i < n; ++i) + vnic_ib_handle_tx_wc(login, tx_res_index, + tx_res->send_wc + i); + } while (n == VNIC_MAX_TX_CQE); +} + +static void vnic_drain_arm_tx_cq(struct vnic_login *login, int tx_res_index) +{ + struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index]; + + ASSERT(login); + ASSERT(login->dev); + + /* drain CQ, then [arm] it */ + vnic_drain_tx_cq(login, tx_res_index); + + /* in tx interrupt mode, arm TX CQ after every interrupt */ + if (!vnic_tx_polling && ib_req_notify_cq(tx_res->cq, IB_CQ_NEXT_COMP)) + vnic_dbg(login->name, "ib_req_notify_cq failed\n"); + else if (unlikely(VNIC_TXQ_STOPPED(tx_res) && + test_bit(VNIC_STATE_LOGIN_OPEN, &login->state))) { + if ((tx_res->tx_outstanding <= vnic_tx_rings_len >> 1)) { + if (!test_bit(VNIC_STATE_LOGIN_NO_TX_ENABLE, &login->state)) { + VNIC_STATS_DO_INC(login->port_stats.wake_queue); + VNIC_TXQ_WAKE(tx_res); + } + /* make sure that after arming the cq, there is no access to + * login fields to avoid conflict with cq event handler. 
+ * i.e., ib_req_notify_cq() must come at the end of this func + */ + } else if (ib_req_notify_cq(tx_res->cq, IB_CQ_NEXT_COMP)) { + vnic_dbg(login->name, "ib_req_notify_cq failed\n"); + /* TODO: have to reset the device here */ + } + } +} + +static inline void vnic_comp_handler_tx(struct ib_cq *cq, void *ctx) +{ + struct vnic_tx_res *tx_res = ctx; + + if (!vnic_tx_polling) { + spin_lock(&tx_res->lock); + vnic_drain_arm_tx_cq(tx_res->login, tx_res->index); + spin_unlock(&tx_res->lock); + } else + vnic_drain_arm_tx_cq(tx_res->login, tx_res->index); + +} + +static int vnic_drain_rx_cq(struct vnic_login *login, int max_poll, + int rx_res_index) +{ + struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index]; + int polled, i; + + ASSERT(max_poll <= vnic_napi_weight); + polled = ib_poll_cq(rx_res->cq, max_poll, rx_res->recv_wc); + + for (i = 0; vnic_rx_linear && i < polled; ++i) + vnic_ib_handle_rx_wc_linear(login, &rx_res->recv_wc[i], + rx_res_index); + + for (i = 0; !vnic_rx_linear && i < polled; ++i) + vnic_ib_handle_rx_wc(login, &rx_res->recv_wc[i], + rx_res_index); + +#ifdef NETIF_F_LRO + /* Done CQ handling: flush all LRO sessions unconditionally */ + if (login->dev->features & NETIF_F_LRO) { + VNIC_STATS_INC(login->port_stats.lro_flushed); + lro_flush_all(&rx_res->lro); + } +#endif + + return polled; +} + +/* RX CQ polling - called by NAPI */ +#ifndef _BP_NAPI_POLL +int vnic_poll_cq_rx(struct napi_struct *napi, int budget) +{ + struct vnic_rx_res *rx_res = container_of(napi, struct vnic_rx_res, napi); + struct vnic_login *login = rx_res->login; + struct ib_cq *cq_rx = rx_res->cq; + int rx_res_index = rx_res->index, polled; + + /* shouldn't happen, since when stopped=1 NAPI is disabled */ + if (unlikely(rx_res->stopped)) { +#ifndef _BP_NAPI_NETIFRX + napi_complete(napi); +#else + netif_rx_complete(login->dev, napi); +#endif + return 0; + } + + polled = vnic_drain_rx_cq(login, min(budget, VNIC_MAX_RX_CQE), rx_res_index); + vnic_dbg_data(login->name, "after vnic_drain_rx_cq budget %d," + " done %d, index %d\n", budget, polled, rx_res_index); + + /* If we used up all the quota - we're probably not done yet... 
*/ + ASSERT(polled <= budget); + if (polled < budget) { + /* ATTENTION: ARM CQ must come after napi_complete() */ +#ifndef _BP_NAPI_NETIFRX + napi_complete(napi); +#else + netif_rx_complete(login->dev, napi); +#endif + /* Eventually calls vnic_comp_handler_rx() */ + if (ib_req_notify_cq(cq_rx, IB_CQ_NEXT_COMP)) + vnic_err(login->name, "ib_req_notify_cq failed\n"); + } + + return polled; +} +#else +int vnic_poll_cq_rx(struct net_device *poll_dev, int *budget) +{ + struct vnic_rx_res *rx_res = poll_dev->priv; + struct vnic_login *login = rx_res->login; + struct ib_cq *cq_rx = rx_res->cq; + int rx_res_index = rx_res->index, polled, max_poll = min(*budget, poll_dev->quota); + + /* shouldn't happen, since when stopped=1 NAPI is disabled */ + if (unlikely(rx_res->stopped)) { + netif_rx_complete(poll_dev); + return 0; + } + + while (max_poll >= 0) { + polled = vnic_drain_rx_cq(login, min(max_poll, VNIC_MAX_RX_CQE), rx_res_index); + if (polled <= 0) + break; + else { + poll_dev->quota -= polled; + *budget -= polled; + } + max_poll -= polled; + } + + if (!max_poll) + return 1; + + netif_rx_complete(poll_dev); + ib_req_notify_cq(cq_rx, IB_CQ_NEXT_COMP); + + return 0; +} +#endif + +static void vnic_comp_handler_rx(struct ib_cq *cq, void *rx_res_ptr) +{ + struct vnic_rx_res *rx_res = rx_res_ptr; + struct vnic_login *login = rx_res->login; + + ASSERT(rx_res->cq == cq); + ASSERT(login->dev); + + /* if this happens, we will re-arm later in vnic_open */ + if (unlikely(rx_res->stopped)) + return; + +#ifndef _BP_NAPI_POLL + /* calls vnic_poll_cq_rx() */ +#ifndef _BP_NAPI_NETIFRX + napi_schedule(&rx_res->napi); +#else + netif_rx_schedule(login->dev, &rx_res->napi); +#endif +#else + netif_rx_schedule(rx_res->poll_dev); +#endif /* _BP_NAPI_POLL */ + +} + +static void vnic_stop_qp(struct vnic_login *login, int qp_index) +{ + struct ib_qp_attr qp_attr = { .qp_state = IB_QPS_ERR }; + struct vnic_qp_res *qp_res = &login->qp_res[qp_index]; + struct vnic_rx_res *rx_res = &login->rx_res[qp_res->rx_index]; + struct vnic_tx_res *tx_res = &login->tx_res[qp_res->tx_index]; + struct vnic_rx_ring *ring = login->port->rx_ring[rx_res->index]; + unsigned long flags; + int polled, attr_mask, rc, i; + + /* move QP to ERR, wait for last WQE async event to drain the SRQ */ + rc = ib_modify_qp(qp_res->qp, &qp_attr, IB_QP_STATE); + if (rc) { + /* calls vnic_qp_event_handler() */ + vnic_warn(login->name, "failed to modify QP 0x%x to ERR state" + " (err = %d)\n", qp_res->qp->qp_num, rc); + /* continue anyway, but don't wait for completion */ + } else { + wait_for_completion(&qp_res->last_wqe_complete); + } + + /* === at this point, no NAPI/RX comps === */ + + /* drain TX CQ before moving to RESET, must hold tx_res->lock to + * protect from vnic_comp_handler_tx(); after this call, all CQEs + * are polled (either by this direct call, or by CQ handlers) + */ + spin_lock_irqsave(&tx_res->lock, flags); + vnic_drain_tx_cq(login, tx_res->index); + spin_unlock_irqrestore(&tx_res->lock, flags); + + /* drain RX CQ before moving to RESET, drop and re-post all comps */ + spin_lock_bh(&ring->lock); + do { + polled = ib_poll_cq(rx_res->cq, VNIC_MAX_RX_CQE, rx_res->recv_wc); + for (i = 0; i < polled; ++i) + if (vnic_post_recv(ring, rx_res->recv_wc[i].wr_id)) + vnic_dbg_data(login->name, "vnic_post_recv failed, " + "wr_id %llu\n", rx_res->recv_wc[i].wr_id); + } while (polled == VNIC_MAX_RX_CQE); + spin_unlock_bh(&ring->lock); + + /* move QP to RESET */ + qp_attr.qp_state = IB_QPS_RESET; + rc = ib_modify_qp(qp_res->qp, &qp_attr, IB_QP_STATE); + if (rc) + 
vnic_warn(login->name, "failed to modify QP 0x%x to RESET" + " state (err = %d)\n", qp_res->qp->qp_num, rc); + + /* move QP to INIT to avoid multicast qp cache misses */ + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qkey = login->qkey; + qp_attr.port_num = login->port->num; + qp_attr.pkey_index = login->pkey_index; + attr_mask = IB_QP_QKEY | IB_QP_PORT | IB_QP_PKEY_INDEX | IB_QP_STATE; + + rc = ib_modify_qp(qp_res->qp, &qp_attr, attr_mask); + if (rc) + vnic_warn(login->name, "failed to modify QP 0x%x to INIT state" + " (err = %d)\n", qp_res->qp->qp_num, rc); +} + +int vnic_ib_stop(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + struct vnic_tx_res *tx_res; + unsigned long begin = jiffies; + int wr_id, i; + + /* flush tx and rx comps */ + for (i = 0; i < login->qps_num; ++i) + vnic_stop_qp(login, i); + + /* check any pending tx comps */ + for (i = 0; i < login->tx_rings_num; i++) { + tx_res = &login->tx_res[i]; + /* if tx_outstanding is non-zero, give it a chance to complete */ + if (!tx_res->tx_outstanding) + continue; + msleep(10); + + /* else, drain tx cq. This is indicates that something is + * wrong, thus we won't protect vnic_comp_handler_tx() here + */ + while (tx_res->tx_outstanding && + time_before(jiffies, begin + 5 * HZ)) { + vnic_drain_tx_cq(login, i); + msleep(1); + } + + /* if they're still not complete, force skb deallocation */ + if (!tx_res->tx_outstanding) + continue; + vnic_warn(login->name, "timing out: %d sends not completed\n", + tx_res->tx_outstanding); + while (tx_res->tx_outstanding) { + wr_id = tx_res->tx_tail & (vnic_tx_rings_len - 1); + vnic_dealloc_tx_skb(login, i, wr_id); + ++tx_res->tx_tail; + --tx_res->tx_outstanding; + } + } + + return 0; +} + +int vnic_ib_open(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int i; + + /* move QP to RTS and attach to bcast group */ + for (i = 0; i < login->qps_num; ++i) { + if (vnic_init_qp(login, i)) { + vnic_err(login->name, "vnic_init_qp failed\n"); + goto stop_qps; + } + } + + return 0; + +stop_qps: + for (--i ; i >= 0; --i) + vnic_stop_qp(login, i); + + return -EINVAL; +} + +void vnic_destroy_qp(struct vnic_login *login, int qp_index) +{ + struct ib_qp *qp = login->qp_res[qp_index].qp; + + if (!qp) + return; + if (ib_destroy_qp(qp)) + vnic_warn(login->name, "ib_destroy_qp failed\n"); + return; +} + +void vnic_qp_to_reset(struct vnic_login *login, struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + int rc; + + qp_attr.qp_state = IB_QPS_RESET; + rc = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); + if (rc) + vnic_err(login->name, "ib_modify_qp 0x%06x to RESET err %d\n", + qp->qp_num, rc); +} + +int vnic_qp_to_init(struct vnic_login *login, struct ib_qp *qp, u32 qkey) +{ + struct ib_qp_attr qp_attr; + int attr_mask, rc; + + /* move QP to INIT */ + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qkey = qkey; + qp_attr.port_num = login->port->num; + /* pkey will be overwritten later by login->pkey_index */ + qp_attr.pkey_index = login->port->pkey_index; + attr_mask = IB_QP_QKEY | IB_QP_PORT | IB_QP_PKEY_INDEX | IB_QP_STATE; + + rc = ib_modify_qp(qp, &qp_attr, attr_mask); + if (rc) { + vnic_err(login->name, "ib_modify_qp 0x%06x to INIT err %d\n", + qp->qp_num, rc); + goto out_qp_reset; + } + + return 0; + +out_qp_reset: + vnic_qp_to_reset(login, qp); + return rc; +} + +int vnic_init_qp(struct vnic_login *login, int qp_index) +{ + struct ib_qp_attr qp_attr; + int attr_mask, rc, rc1; + struct ib_qp *qp = login->qp_res[qp_index].qp; + + 
init_completion(&login->qp_res[qp_index].last_wqe_complete); + /* move QP to INIT */ + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qkey = login->qkey; + qp_attr.port_num = login->port->num; + qp_attr.pkey_index = login->pkey_index; + attr_mask = IB_QP_QKEY | IB_QP_PORT | IB_QP_PKEY_INDEX | IB_QP_STATE; + + rc = ib_modify_qp(qp, &qp_attr, attr_mask); + if (rc) { + vnic_err(login->name, "ib_modify_qp to INIT err %d\n", rc); + goto out_qp_reset; + } + + /* move QP to RTR */ + qp_attr.qp_state = IB_QPS_RTR; + attr_mask &= ~IB_QP_PORT; + rc = ib_modify_qp(qp, &qp_attr, attr_mask); + if (rc) { + vnic_err(login->name, "ib_modify_qp to RTR err %d\n", rc); + goto out_qp_reset; + } + + /* move QP to RTS */ + qp_attr.qp_state = IB_QPS_RTS; + qp_attr.sq_psn = 0; + attr_mask |= IB_QP_SQ_PSN; + attr_mask &= ~IB_QP_PKEY_INDEX; + rc = ib_modify_qp(qp, &qp_attr, attr_mask); + if (rc) { + vnic_err(login->name, "ib_modify_qp to RTS err, rc %d\n", rc); + goto out_qp_reset; + } + + /* What a Good QP! */ + vnic_dbg_data(login->name, "qpn 0x%06x moved to RTS\n", + qp->qp_num); + + return 0; + +out_qp_reset: + qp_attr.qp_state = IB_QPS_RESET; + rc1 = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); + if (rc1) + vnic_err(login->name, "ib_modify_qp to RESET err %d\n", rc1); + + return rc; +} + +static void vnic_qp_event_handler(struct ib_event *event, void *ctx) +{ + struct vnic_qp_res *qp_res = ctx; + struct vnic_login *login = qp_res->login; + + ASSERT(login); + vnic_dbg_data(login->name, "[%s] qpn %d got event %d\n", + event->device->name, event->element.qp->qp_num, + event->event); + if (event->event == IB_EVENT_QP_LAST_WQE_REACHED) + complete(&qp_res->last_wqe_complete); +} + +void vnic_destroy_rx_res(struct vnic_login *login, int rx_res_index) +{ + struct ib_cq *cq = login->rx_res[rx_res_index].cq; + int rc = 0; + + if (cq) + rc = ib_destroy_cq(cq); + if (rc) + vnic_warn(login->name, "ib_destroy_cq() index %d failed\n", + rx_res_index); +} + +void vnic_destroy_tx_res(struct vnic_login *login, int tx_res_index) +{ + struct ib_cq *cq = login->tx_res[tx_res_index].cq; + struct vnic_tx_buf *tx_ring = login->tx_res[tx_res_index].tx_ring; + int rc = 0; + + if (tx_ring) + vfree(tx_ring); + if (cq) + rc = ib_destroy_cq(cq); + if (rc) + vnic_warn(login->name, "ib_destroy_cq() index %d failed\n", + tx_res_index); +} + +int vnic_create_rx_res(struct vnic_login *login, int rx_res_index) +{ + struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index]; + int comp_vector = + rx_res_index % login->port->dev->ca->num_comp_vectors; + struct ib_cq *cq = + ib_create_cq(login->port->dev->ca, + vnic_comp_handler_rx, + NULL, &login->rx_res[rx_res_index], + vnic_rx_rings_len, comp_vector); + if (IS_ERR(cq)) { + vnic_err(login->name, "ib_create_cq failed, index %d, " + "comp_vector %d, rc %d\n", + rx_res_index, comp_vector, (int)PTR_ERR(cq)); + return -EINVAL; + } + + rx_res->cq = cq; + rx_res->index = rx_res_index; + rx_res->login = login; + + return 0; +} + +int vnic_create_tx_res(struct vnic_login *login, int tx_res_index) +{ + struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index]; + struct ib_cq *cq; + struct vnic_tx_buf *tx_ring; + int i, comp_vector; + + tx_ring = vmalloc(vnic_tx_rings_len * sizeof *tx_res->tx_ring); + if (!tx_ring) { + vnic_err(login->name, "vmalloc failed to allocate %u * %lu\n", + vnic_tx_rings_len, + (long unsigned int) (sizeof *tx_res->tx_ring)); + return -ENOMEM; + } + memset(tx_ring, 0, vnic_tx_rings_len * sizeof *tx_res->tx_ring); + + /* create TX CQ and set WQE drafts */ + tx_res->tx_wr.sg_list = 
tx_res->tx_sge; + tx_res->tx_wr.send_flags = IB_SEND_SIGNALED; + tx_res->tx_wr.wr.ud.remote_qkey = login->qkey; + + for (i = 0; i < VNIC_MAX_TX_FRAGS; ++i) + tx_res->tx_sge[i].lkey = login->port->mr->lkey; + + /* set mcast av draft*/ + memset(&tx_res->mcast_av, 0, sizeof(struct ib_ah_attr)); + tx_res->mcast_av.port_num = login->port->num; + tx_res->mcast_av.ah_flags = IB_AH_GRH; + + /* create tx cq */ + comp_vector = tx_res_index % login->port->dev->ca->num_comp_vectors; + cq = ib_create_cq(login->port->dev->ca, + vnic_comp_handler_tx, + NULL, &login->tx_res[tx_res_index], + vnic_tx_rings_len, comp_vector); + if (IS_ERR(cq)) { + vnic_err(login->name, "ib_create_cq failed, index %d, " + "comp_vector %d, rc %d\n", + tx_res_index, comp_vector, (int)PTR_ERR(cq)); + vfree(tx_ring); + return -EINVAL; + } + + tx_res->tx_ring = tx_ring; + tx_res->cq = cq; + tx_res->index = tx_res_index; + tx_res->login = login; + + return 0; +} + +int vnic_create_qp_range(struct vnic_login *login) +{ + int qp_index, create_flags = 0, rc; + struct ib_qp_init_attr *attr; + struct ib_qp *qps[VNIC_MAX_NUM_CPUS]; + struct vnic_qp_res *qp_res; + + attr = kzalloc(VNIC_MAX_NUM_CPUS * sizeof *attr, GFP_KERNEL); + if (!attr) + return -ENOMEM; + + create_flags |= login->port->dev->attr.device_cap_flags & + IB_DEVICE_BLOCK_MULTICAST_LOOPBACK ? + IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK : 0; + + /* TODO: rename IB_QP_CREATE_IPOIB_UD_LSO */ + create_flags |= login->port->dev->attr.device_cap_flags & + IB_DEVICE_UD_TSO ? + IB_QP_CREATE_IPOIB_UD_LSO : 0; + + for (qp_index = 0; qp_index < login->qps_num; ++qp_index) { + qp_res = &login->qp_res[qp_index]; + qp_res->tx_index = qp_index % login->tx_rings_num; + qp_res->rx_index = qp_index % login->rx_rings_num; + + memset(&attr[qp_index], 0, sizeof(struct ib_qp_init_attr)); + attr[qp_index].cap.max_send_wr = vnic_tx_rings_len; + attr[qp_index].cap.max_send_sge = VNIC_MAX_TX_FRAGS; + attr[qp_index].cap.max_recv_wr = 0; /* we use SRQ */ + attr[qp_index].cap.max_recv_sge = 0; + attr[qp_index].sq_sig_type = IB_SIGNAL_ALL_WR; + attr[qp_index].qp_type = IB_QPT_UD; + attr[qp_index].send_cq = login->tx_res[qp_res->tx_index].cq; + attr[qp_index].recv_cq = login->rx_res[qp_res->rx_index].cq; + attr[qp_index].srq = login->port->rx_ring[qp_res->rx_index]->srq; + attr[qp_index].event_handler = vnic_qp_event_handler; + attr[qp_index].qp_context = &login->qp_res[qp_index]; + attr[qp_index].create_flags = create_flags; + attr[qp_index].cap.max_inline_data = vnic_inline_tshold; + } + + + rc = vnic_ib_create_qp_range(login->port->pd, attr, NULL, + login->qps_num, login->qps_num, qps); + if (rc) { + vnic_err(login->name, "vnic_ib_create_qp_range failed, rc %d\n", rc); + goto err; + } + + for (qp_index = 0; qp_index < login->qps_num; ++qp_index) { + qp_res = &login->qp_res[qp_index]; + qp_res->qp = qps[qp_index]; + qp_res->login = login; + } + + for (qp_index = 0; qp_index < login->qps_num; ++qp_index) { + rc = vnic_qp_to_init(login, qps[qp_index], login->qkey); + if (rc) { + vnic_err(login->name, "vnic_qp_to_init failed, rc %d\n", rc); + goto destroy_qps; + } + } + + kfree(attr); + return 0; + +destroy_qps: + for (qp_index--; qp_index>=0; qp_index--) + vnic_qp_to_reset(login, qps[qp_index]); + + for (qp_index = 0; qp_index < login->qps_num; ++qp_index) + vnic_destroy_qp(login, qp_index); + +err: + kfree(attr); + return rc; +} + +static inline int use_inline(struct sk_buff *skb) +{ + return skb->len <= vnic_inline_tshold && !skb_shinfo(skb)->nr_frags; +} + +int vnic_post_send(struct vnic_login *login, int 
tx_res_index, + u64 wr_id, struct ib_ah *ah, u32 dqpn) +{ + struct ib_send_wr *bad_wr; + struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index]; + struct vnic_qp_res *qp_res = &login->qp_res[tx_res_index % login->qps_num]; + struct vnic_tx_buf *tx_req = &tx_res->tx_ring[wr_id]; + skb_frag_t *frags = skb_shinfo(tx_req->skb)->frags; + int nr_frags = skb_shinfo(tx_req->skb)->nr_frags, i, off = 0; + + ASSERT(qp_res); + ASSERT(tx_res); + ASSERT(qp_res->tx_index == tx_res->index); + ASSERT(qp_res->qp->send_cq == tx_res->cq); + + if (!vnic_encap_headroom && !skb_is_gso(tx_req->skb)) { + tx_res->tx_sge[off].addr = tx_req->mapping[off]; + tx_res->tx_sge[off].length = VNIC_ENCAP_LEN; + off++; + } + + if (likely(skb_headlen(tx_req->skb))) { + if (vnic_encap_headroom && use_inline(tx_req->skb)) { + tx_res->tx_wr.send_flags |= IB_SEND_INLINE; + wr_id |= VNIC_SEND_INLINE_FLAG; + tx_res->tx_sge[off].addr = (unsigned long)tx_req->skb->data; + } else { + tx_res->tx_wr.send_flags &= ~IB_SEND_INLINE; + tx_res->tx_sge[off].addr = tx_req->mapping[off]; + } + tx_res->tx_sge[off].length = skb_headlen(tx_req->skb); + off++; + } + + for (i = 0; i < nr_frags; ++i) { + tx_res->tx_sge[i + off].addr = tx_req->mapping[i + off]; + tx_res->tx_sge[i + off].length = frags[i].size; + } + + /* handle runt packets using additional SG */ + if (unlikely(tx_req->skb->len < login->zlen)) { + /* Note: always extend runt packets (for both + * internal & external) for virtualization, some emulators + * drop runt packets, so we need to avoid runt packets even + * if the traffic is not passing the bridge + */ + vnic_dbg_data(login->name, "runt packet, skb %p len %d => %d\n", + tx_req->skb, tx_req->skb->len, login->zlen); + /* If there are frags, then packets is longer than 60B */ + if (use_inline(tx_req->skb)) + tx_res->tx_sge[i + off].addr = (u64)(unsigned long)login->pad_va; + else + tx_res->tx_sge[i + off].addr = login->pad_dma; + + tx_res->tx_sge[i + off].length = login->zlen - tx_req->skb->len; + ++nr_frags; + VNIC_STATS_INC(login->port_stats.runt_packets); + } + + tx_res->tx_wr.num_sge = nr_frags + off; + tx_res->tx_wr.wr_id = wr_id; + tx_res->tx_wr.wr.ud.remote_qpn = dqpn; + tx_res->tx_wr.wr.ud.ah = ah; + + /* check if we need to calc csum */ + if (tx_req->skb->ip_summed == CHECKSUM_PARTIAL) { + u16 csum_pseudo; + + /* calc pseudo header csum without the length + * and put in the transport's header checksum field. 
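+		 * (i.e. the bit-inverse of csum_tcpudp_magic()/csum_ipv6_magic()
+		 * computed with a zero length: the folded sum of the addresses
+		 * and protocol only).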
+ * The HW will calculate the rest of it (SWP) + */ + if (tx_req->ip_off) + csum_pseudo = ~csum_tcpudp_magic(ip_hdr(tx_req->skb)->saddr, + ip_hdr(tx_req->skb)->daddr, + 0, /* length */ + ip_hdr(tx_req->skb)->protocol, + 0); + else + csum_pseudo = ~csum_ipv6_magic(&ipv6_hdr(tx_req->skb)->saddr, + &ipv6_hdr(tx_req->skb)->daddr, + 0, /* length */ + ipv6_hdr(tx_req->skb)->nexthdr, + 0); + + /* place the calculated csum in the checksum field in + * tcp/udp header + */ + if (tx_req->tcp_off) + tcp_hdr(tx_req->skb)->check = csum_pseudo; + else + udp_hdr(tx_req->skb)->check = csum_pseudo; + + /* set CSUM flag in ib_send_wr */ + tx_res->tx_wr.send_flags |= IB_SEND_IP_CSUM; + } else { + /* csum already calculated in SW */ + tx_res->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; + } + + /* prepare TSO header */ + if (skb_is_gso(tx_req->skb)) { + tx_res->tx_wr.wr.ud.mss = skb_shinfo(tx_req->skb)->gso_size + tx_req->hlen; + tx_res->tx_wr.wr.ud.header = tx_req->phead; + tx_res->tx_wr.wr.ud.hlen = tx_req->hlen; + tx_res->tx_wr.opcode = IB_WR_LSO; + } else { + tx_res->tx_wr.opcode = IB_WR_SEND; + } + + vnic_dbg_data(login->name, + "skb %p wr_id %llu sqpn 0x%06x dqpn 0x%06x num_sge " + "%d phead %p was sent\n", tx_req->skb, wr_id, qp_res->qp->qp_num, + dqpn, tx_res->tx_wr.num_sge, tx_req->phead); + + /* if EoIB encap is OOB, copy LRO header to linear part */ + if (!vnic_encap_headroom && skb_is_gso(tx_req->skb)) { + memcpy(tx_res->lso_hdr, VNIC_SKB_GET_ENCAP(tx_req->skb), + VNIC_ENCAP_LEN); + memcpy((u8 *)(tx_res->lso_hdr) + VNIC_ENCAP_LEN, + tx_res->tx_wr.wr.ud.header, + tx_res->tx_wr.wr.ud.hlen); + tx_res->tx_wr.wr.ud.header = tx_res->lso_hdr; + tx_res->tx_wr.wr.ud.mss += VNIC_ENCAP_LEN; + tx_res->tx_wr.wr.ud.hlen += VNIC_ENCAP_LEN; + } + + return vnic_ib_post_send(qp_res->qp, &tx_res->tx_wr, &bad_wr, + tx_req->ip_off, + tx_req->ip6_off, + tx_req->tcp_off, + tx_req->udp_off); +} + +static int vnic_dma_map_tx(struct ib_device *ca, struct vnic_tx_buf *tx_req) +{ + struct sk_buff *skb = tx_req->skb; + struct skb_shared_info *shinfo = skb_shinfo(skb); + u64 *mapping = tx_req->mapping; + int i = 0, off = 0, headlen = skb_headlen(skb); + + if (vnic_encap_headroom && use_inline(skb)) + return 0; + + if (!vnic_encap_headroom && !skb_is_gso(tx_req->skb)) { + mapping[off] = ib_dma_map_single(ca, VNIC_SKB_GET_ENCAP(skb), + VNIC_ENCAP_LEN, DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping[off]))) + return -EIO; + off++; + } + + if (likely(headlen)) { + mapping[off] = ib_dma_map_single(ca, skb->data, + headlen, DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping[off]))) + goto partial_error; + off++; + } + + for (i = 0; i < shinfo->nr_frags; ++i) { + skb_frag_t *frag = &shinfo->frags[i]; + mapping[i + off] = ib_dma_map_page(ca, frag->page, + frag->page_offset, + frag->size, DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping[i + off]))) + goto partial_error; + } + + return 0; + +partial_error: + for (--i; i >= 0; i--) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + ib_dma_unmap_page(ca, mapping[i + off], frag->size, + DMA_TO_DEVICE); + } + + if (headlen) + ib_dma_unmap_single(ca, mapping[--off], skb_headlen(skb), + DMA_TO_DEVICE); + + if (!vnic_encap_headroom && !skb_is_gso(tx_req->skb)) + ib_dma_unmap_single(ca, mapping[--off], VNIC_ENCAP_LEN, + DMA_TO_DEVICE); + + return -EIO; +} + +void vnic_send(struct vnic_login *login, struct sk_buff *skb, + struct ib_ah *ah, u32 dqpn, int tx_res_index) +{ + struct eoibhdr *_eoib_hdr = VNIC_SKB_GET_ENCAP(skb); + struct vnic_tx_res *tx_res = 
&login->tx_res[tx_res_index]; + struct vnic_tx_buf *tx_req; + unsigned long flags = 0; + u64 wr_id; + int tx_pkt_num = 1; + u8 ip_off; + + if (!vnic_tx_polling) + spin_lock_irqsave(&tx_res->lock, flags); + + ASSERT(tx_res_index < login->tx_rings_num); + wr_id = tx_res->tx_head & (vnic_tx_rings_len - 1); + tx_req = &tx_res->tx_ring[wr_id]; + tx_req->skb = skb; + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + tx_req->ip_off = tx_req->ip6_off = tx_req->tcp_off = tx_req->udp_off = 0; + if (VNIC_IP_CSUM_OK(_eoib_hdr)) { + ip_off = vnic_encap_headroom ? + ((skb_network_header(skb) - skb->data) >> 1) : + /* skb_network_header doesn't count the encap since it's OOB */ + ((skb_network_header(skb) - skb->data + VNIC_ENCAP_LEN) >> 1); + switch (ntohs(skb->protocol)) { + case ETH_P_IP: + tx_req->ip_off = ip_off; + break; + case ETH_P_IPV6: + tx_req->ip6_off = ip_off; + } + } + if (VNIC_TCP_CSUM_OK(_eoib_hdr)) + tx_req->tcp_off = + (skb_transport_header(skb) - skb_network_header(skb)) >> 2; + else if (VNIC_UDP_CSUM_OK(_eoib_hdr)) + tx_req->udp_off = + (skb_transport_header(skb) - skb_network_header(skb)) >> 2; + ASSERT(!tx_req->udp_off || !tx_req->tcp_off); + vnic_dbg_data(login->name, "ip_off = %d, tcp_off = %d, udp_off = %d\n", + tx_req->ip_off, tx_req->tcp_off, tx_req->udp_off); + VNIC_STATS_INC(login->port_stats.tx_chksum_offload); + } + + /* TSO skb */ + if (skb_is_gso(skb)) { + tx_req->hlen = skb_transport_offset(skb) + tcp_hdrlen(skb); + tx_req->phead = skb->data; + ASSERT(skb_pull(skb, tx_req->hlen)); + VNIC_STATS_INC(login->port_stats.tso_packets); + tx_pkt_num = skb_shinfo(tx_req->skb)->gso_segs; + } + + /* map tx skb */ + if (unlikely(vnic_dma_map_tx(login->port->dev->ca, tx_req))) + goto err; + + /* send.. unmap.. free skb.. drain tx cq.. [pray] */ + if (unlikely(++tx_res->tx_outstanding == vnic_tx_rings_len)) { + if (++tx_res->tx_stopped_cnt % 100 == 0) + vnic_dbg(login->name, "tx queue %d stopped cnt %d, outs %d\n", + tx_res->index, + tx_res->tx_stopped_cnt, + tx_res->tx_outstanding); + ASSERT(!VNIC_TXQ_STOPPED(tx_res)); + VNIC_TXQ_STOP(tx_res); + /* vnic_drain_arm_tx_cq() will arm the cq OR resume the ring */ + VNIC_STATS_DO_INC(login->port_stats.queue_stopped); + } + + ASSERT(tx_res->tx_outstanding <= vnic_tx_rings_len); + + if (unlikely(vnic_post_send(login, tx_res_index, wr_id, ah, dqpn))) { + vnic_warn(login->name, "vnic_post_send failed\n"); + VNIC_STATS_DO_INC(tx_res->stats.tx_errors); + VNIC_STATS_DO_INC(tx_res->stats.tx_dropped); + --tx_res->tx_outstanding; + vnic_dealloc_tx_skb(login, tx_res->index, wr_id); + /* no need to netif_wake_queue() here, because + * vnic_comp_handler_tx() will eventually be called + * for armed cq, and it will wake-up the queue when it's ready + */ + } else { + VNIC_STATS_DO_ADD(tx_res->stats.tx_packets, tx_pkt_num); + VNIC_STATS_DO_ADD(tx_res->stats.tx_bytes, skb->len); + login->dev->trans_start = jiffies; + ++tx_res->tx_head; + + + if (vnic_tx_polling) { + if (likely(!skb_shared(skb))) + skb_orphan(skb); + else + VNIC_STATS_DO_INC(login->port_stats.shared_packets); + } + } + + /* poll every vnic_max_tx_outs packets */ + if (vnic_tx_polling) { + if (tx_res->tx_outstanding > vnic_max_tx_outs || + VNIC_TXQ_STOPPED(tx_res)) + vnic_drain_arm_tx_cq(login, tx_res_index); + } else + spin_unlock_irqrestore(&tx_res->lock, flags); + + return; + +err: + VNIC_STATS_DO_INC(tx_res->stats.tx_dropped); + VNIC_STATS_DO_INC(tx_res->stats.tx_errors); + dev_kfree_skb_any(skb); + + if (!vnic_tx_polling) + spin_unlock_irqrestore(&tx_res->lock, flags); + + return; +} + +void 
vnic_ib_free_ring(struct vnic_rx_ring *ring) +{ + ASSERT(ring->srq); + ib_destroy_srq(ring->srq); +} + +int vnic_ib_init_ring(struct vnic_rx_ring *ring) +{ + struct ib_srq_init_attr srq_attr; + struct vnic_port *port = ring->port; + int rc = 0, headroom = 10; + + /* alloc SRQ */ + memset(&srq_attr, 0, sizeof(struct ib_srq_init_attr)); + srq_attr.attr.max_sge = VNIC_MAX_RX_FRAGS; + srq_attr.attr.max_wr = vnic_rx_rings_len + headroom; + srq_attr.attr.srq_limit = vnic_rx_rings_len + headroom; + ring->srq = ib_create_srq(port->pd, &srq_attr); + if (IS_ERR(ring->srq)) { + vnic_err(ring->port->name, "ib_create_srq failed, index %d, rc %d\n", + ring->index, (int)PTR_ERR(ring->srq)); + rc = (int)PTR_ERR(ring->srq); + } + + return rc; +} + +int vnic_port_ib_init(struct vnic_port *port) +{ + int i; + + /* alloc PD */ + port->pd = ib_alloc_pd(port->dev->ca); + if (IS_ERR(port->pd)) { + vnic_err(port->name, "failed to allocate PD\n"); + goto err; + } + vnic_dbg_data(port->name, "port->pd %p\n", port); + + /* alloc MR */ + port->mr = ib_get_dma_mr(port->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(port->mr)) { + vnic_err(port->name, "failed to allocate MR\n"); + goto free_pd; + } + vnic_dbg_data(port->name, "port->mr %p\n", port->mr); + + /* alloc RX RING */ + for (i = 0; i < port->rx_rings_num; ++i) { + port->rx_ring[i] = vnic_create_rx_ring(port, i); + if (IS_ERR(port->rx_ring[i])) { + vnic_err(port->name, "failed to allocate rx_ring %d\n", i); + port->rx_ring[i] = NULL; + goto free_rx_ring; + } + } + vnic_dbg_data(port->name, "allocated %d RX ring\n", port->rx_rings_num); + + return 0; + +free_rx_ring: + for (i = 0; i < port->rx_rings_num; ++i) + vnic_destroy_rx_ring(port->rx_ring[i]); +/* free_mr: */ + ib_dereg_mr(port->mr); +free_pd: + ib_dealloc_pd(port->pd); +err: + return -EINVAL; + +} + +void vnic_port_ib_cleanup(struct vnic_port *port) +{ + int i; + + for (i = 0; i < port->rx_rings_num; ++i) + vnic_destroy_rx_ring(port->rx_ring[i]); + + ib_dereg_mr(port->mr); + ib_dealloc_pd(port->pd); + + return; +} + +void vnic_ib_dispatch_event(struct ib_event *event) +{ + return; +} + +int vnic_ib_set_moder(struct vnic_login *login, u16 rx_usecs, u16 rx_frames, + u16 tx_usecs, u16 tx_frames) +{ + int rc, i; + + vnic_dbg_moder(login->name, "set coalescing params for mtu:%d to " + "rx_frames:%d rx_usecs:%d, " + "tx_frames:%d tx_usecs:%d, " + "adaptive_rx_coal:%d, " + "adaptive_tx_coal:%d, " + "sample_interval:%d, " + "port.state: %d\n", + login->dev->mtu, + rx_frames, rx_usecs, + tx_frames, tx_usecs, + login->adaptive_rx_coal, 0, + login->sample_interval, login->port->attr.state); + + for (i = 0; i < login->tx_rings_num; ++i) { + rc = ib_modify_cq(login->tx_res[i].cq, tx_frames, tx_usecs); + if (rc && rc != -ENOSYS) { + vnic_warn(login->name, "failed modifying tx_res," + " rc %d, tx ring index %d\n", rc, i); + return rc; + } + } + + for (i = 0; i < login->rx_rings_num; ++i) { + rc = ib_modify_cq(login->rx_res[i].cq, rx_frames, rx_usecs); + if (rc && rc != -ENOSYS) { + vnic_warn(login->name, "failed modifying rx_res," + " rc %d, rx ring index %d\n", rc, i); + return rc; + } + } + + return 0; +} + +int vnic_ib_down(struct net_device *dev) +{ + return 0; +} + +int vnic_ib_up(struct net_device *dev) +{ + return 0; +} diff --git a/drivers/net/mlx4_vnic/vnic_data_mac.c b/drivers/net/mlx4_vnic/vnic_data_mac.c new file mode 100644 index 0000000000000..cb976aec27d78 --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_data_mac.c @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_data.h" +#include "vnic_fip_discover.h" + +static void vnic_mace_dealloc(struct vnic_mac *mace) +{ + ASSERT(mace); + kfree(mace); +} + +static struct vnic_mac *vnic_mace_alloc(const u8 *mac, u16 vnic_id) +{ + struct vnic_mac *mace; + + mace = kzalloc(sizeof *mace, GFP_ATOMIC); + if (!mace) + return ERR_PTR(-ENOMEM); + + /* set mac entry fields */ + memcpy(mace->mac, mac, ETH_ALEN); + mace->created = jiffies; + mace->last_tx = jiffies; + mace->vnic_id = vnic_id; + + return mace; +} + +static void vnic_mace_del(struct vnic_login *login, struct vnic_mac *mace) +{ + ASSERT(mace); + rb_erase(&mace->rb_node, &login->mac_tree); +} + +static int vnic_mace_add(struct vnic_login *login, struct vnic_mac *mace) +{ + struct rb_node **n = &login->mac_tree.rb_node, *pn = NULL; + struct vnic_mac *mace_t; + int rc; + + while (*n) { + pn = *n; + mace_t = rb_entry(pn, struct vnic_mac, rb_node); + rc = memcmp(mace->mac, mace_t->mac, ETH_ALEN); + if (rc < 0) + n = &pn->rb_left; + else if (rc > 0) + n = &pn->rb_right; + else { + rc = -EEXIST; + goto out; + } + } + + rb_link_node(&mace->rb_node, pn, n); + rb_insert_color(&mace->rb_node, &login->mac_tree); + rc = 0; + +out: + return rc; +} + +/* vnic_mace_search -- + * Return entry pointer if found, or ERR_PTR(-ENODATA) if not found. 
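+ * The tree is keyed by memcmp() order of the 6-byte MAC address.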
+ */ +static struct vnic_mac *vnic_mace_search(struct vnic_login *login, u8 *mac) +{ + struct rb_node *n = login->mac_tree.rb_node; + struct vnic_mac *mace_t; + int rc; + + ASSERT(login); + ASSERT(mac); + + while (n) { + mace_t = rb_entry(n, struct vnic_mac, rb_node); + ASSERT(mace_t); + rc = memcmp(mac, mace_t->mac, ETH_ALEN); + if (rc < 0) + n = n->rb_left; + else if (rc > 0) + n = n->rb_right; + else + goto out; + } + + mace_t = ERR_PTR(-ENODATA); + +out: + return mace_t; +} + +/* vnic_mace_update -- + * Remove: -ENODATA if not found, if removed, update ref_cnt, return 0 + * Add: -ENOMEM if no mem, -EEXIST if already exists, + * if added, update ref_cnt, return 0 + * NOTE: ref counters must be updated here, as this function is + * shared among multiple entry points + */ +int vnic_mace_update(struct vnic_login *login, u8 *mac, u16 vnic_id, int remove) +{ + struct vnic_mac *mace; + int rc; + + mace = vnic_mace_search(login, mac); + if (remove) { + if (IS_ERR(mace)) + return -ENODATA; + vnic_mace_del(login, mace); + vnic_mace_dealloc(mace); + /* update ref cnt */ + ASSERT(atomic_read(&login->vnic_child_cnt)); + atomic_dec(&login->vnic_child_cnt); + } else { + if (PTR_ERR(mace) != -ENODATA) + return -EEXIST; + + /* test ref cnt */ + if (atomic_read(&login->vnic_child_cnt) + 1 > vnic_child_max) { + vnic_warn(login->name, "too many child vNics, max %d\n", + vnic_child_max); + return -EUSERS; /* too many users */ + } + + mace = vnic_mace_alloc(mac, vnic_id); + if (!mace) + return -ENOMEM; + + rc = vnic_mace_add(login, mace); + if (rc) { + vnic_mace_dealloc(mace); + return rc; + } + /* update ref cnt */ + atomic_inc(&login->vnic_child_cnt); + vnic_dbg_mac(login->name, + "updated mac "MAC_6_PRINT_FMT" remove %d\n", + MAC_6_PRINT_ARG(mac), remove); + } + + return 0; +} + +/* this function can be called from fast data-path + * need to make sure that login instance is protected here + * likely/unlikely below were added to match the hard_start_xmit fast data flow + * + caller must hold login->mac_rwlock (read_lock is enough because we only + * queue the job here) + * + it queues a job to create a child + */ +int vnic_child_update(struct vnic_login *login, u8 *mac, int remove) +{ + struct vnic_mac *mace; + char *cmd_str; + struct fip_hadmin_cmd *cmd_hadmin; + int count, rc = -EINVAL; + u16 vnic_id = 0; + + vnic_dbg_func(login->name); + + mace = vnic_mace_search(login, mac); + + /* if asked to add, and data already exists, abort */ + if (likely(!remove && !IS_ERR(mace))) { + mace->last_tx = jiffies; + return -EEXIST; + } + + if (!remove) { + /* test if there is too many child vNics same check exist in + * vnic_mace_update(), but we have it here as well to let + * vnic_set_mac return friendly rc + */ + if (atomic_read(&login->vnic_child_cnt) + 1 > vnic_child_max) { + vnic_warn(login->name, "too many child vNics, " + "max %d\n", vnic_child_max); + return -EUSERS; /* too many users */ + } + + /* update last_tx */ + ASSERT(mace); + /* generate new vnic_id only when new child is being added */ + vnic_id = atomic_inc_return(&login->port->vnic_child_ids); + /* set bit 14 so we avoid conflict with normal host/net admin */ + vnic_id %= (1 << (VNIC_ID_LEN - 2)); + vnic_id |= (1 << (VNIC_ID_LEN - 2)); + + /* TODO: update hadmin user-script and manual to make hadmin + * vnic_id interval >= 16K (1<<14 == 16384) so bit 14 is clear + * for parent host admin. 
+ * to avoid atomic counter wrap around, move to bitmap array + */ + } else { + /* if asked to remove, and data not found, abort */ + if (IS_ERR(mace)) + return -ENODATA; + + ASSERT(mace); + vnic_id = mace->vnic_id; + } + + /* allocate cmd structs, too big to be local vars + * use GFP_ATOMIC because this func can be called from data path + */ + cmd_str = kmalloc(sizeof *cmd_str * PAGE_SIZE, GFP_ATOMIC); + if (!cmd_str) + return -ENOMEM; + + cmd_hadmin = kmalloc(sizeof *cmd_hadmin, GFP_ATOMIC); + if (!cmd_hadmin) { + kfree(cmd_str); + return -ENOMEM; + } + + /* inherit command from parent, change: + * name, parent, mac, vnic_id and source + * Note: cannot use parent login->fip_vnic->cmd here + * in order to support net-admin-vnics + */ + vnic_login_cmd_init(cmd_hadmin); + + /* child vNic name scheme: + * eth.c + * Note: avoid sysfs files conflict (that's why parent unique cnt must + * be included in the name here) + */ + snprintf(cmd_hadmin->c_name, MAX_INPUT_LEN, "%s%u.c%u", + "eth", login->cnt, vnic_id); + snprintf(cmd_hadmin->c_mac, MAX_INPUT_LEN, MAC_6_PRINT_FMT, + MAC_6_PRINT_ARG(mac)); + snprintf(cmd_hadmin->c_vnic_id, MAX_INPUT_LEN, "%u", + vnic_id); + snprintf(cmd_hadmin->c_eport, MAX_INPUT_LEN, "%s", + login->fip_vnic->gw_info.gw_port_name); + snprintf(cmd_hadmin->c_parent, MAX_INPUT_LEN, "%s", + login->dev->name); + snprintf(cmd_hadmin->c_bxname, MAX_INPUT_LEN, "%s", + login->fip_vnic->gw_info.system_name); + snprintf(cmd_hadmin->c_bxguid, MAX_INPUT_LEN, VNIC_GUID_FMT, + VNIC_GUID_RAW_ARG(login->fip_vnic->gw_info.system_guid)); + + /* all hadmin vNics must use same BX format (guid vs. name) */ + if (login->fip_vnic->hadmined) { + snprintf(cmd_hadmin->c_bxname, MAX_INPUT_LEN, "%s", + login->fip_vnic->cmd.c_bxname); + snprintf(cmd_hadmin->c_bxguid, MAX_INPUT_LEN, "%s", + login->fip_vnic->cmd.c_bxguid); + } + + /* VLAN is optional, set it only when used by parent */ + if (login->vlan_used) + snprintf(cmd_hadmin->c_vid, MAX_INPUT_LEN, "%d", + login->fip_vnic->vlan); + + /* ready to set the command */ + count = vnic_login_cmd_set(cmd_str, cmd_hadmin); + if (!count) + goto out; + + /* queue job (similar to sysfs write function, + * will eventually call fip_discover_hadmin_update_parent() -> + * vnic_mace_update() + */ + count = fip_hadmin_sysfs_update(login->port, cmd_str, count, remove); + if (count <= 0 && count != -EEXIST) + goto out; + + /* at this point, job queued, return success */ + rc = 0; + +out: + kfree(cmd_str); + kfree(cmd_hadmin); + return rc; +} + +void vnic_child_flush(struct vnic_login *login, int all) +{ + struct rb_node *n; + struct vnic_mac *mace, *mace_t; + LIST_HEAD(local_list); + + vnic_dbg_func(login->name); + + n = rb_first(&login->mac_tree); + while (n) { + mace = rb_entry(n, struct vnic_mac, rb_node); + list_add_tail(&mace->list, &local_list); + n = rb_next(n); + } + + list_for_each_entry_safe(mace, mace_t, &local_list, list) { + list_del(&mace->list); + /* if not-flush-all, and mac is dev_addr mac, skip this entry */ + if (!all && !memcmp(login->dev->dev_addr, mace->mac, ETH_ALEN)) + continue; + vnic_child_update(login, mace->mac, 1); + vnic_mace_del(login, mace); + vnic_mace_dealloc(mace); + } + + +} + +/* find parent vNic + * add the child vnic to its mac_tree + * sync child qp_base_num with parent + * for child removal, it's ok not to find the parent, or the child mac entry + */ +int vnic_parent_update(struct vnic_port *port, char *name, u16 vnic_id, + u8 *mac, u32 *qp_base_num_ptr, char *parent_name, + int remove) +{ + struct vnic_login *login; + int rc = 
-ENODATA; + + vnic_dbg_func(name); + + mutex_lock(&port->mlock); + list_for_each_entry(login, &port->login_list, list) { + vnic_dbg_mac(name, "checking parent %s for child %s (expect %s)\n", + login->dev->name, name, parent_name); + /* check if parent vnic has valid QPN and not being destroyed */ + if (!strcmp(login->dev->name, parent_name) && + test_bit(VNIC_STATE_LOGIN_PRECREATE_2, &login->state) && + !login->fip_vnic->flush) { + /* sync qp_base_num with parent */ + if (qp_base_num_ptr) + *qp_base_num_ptr = login->qp_base_num; + + /* update mac_tree and mace vnic_id */ + write_lock_bh(&login->mac_rwlock); + rc = vnic_mace_update(login, mac, vnic_id, remove); + write_unlock_bh(&login->mac_rwlock); + + break; + } + } + + mutex_unlock(&port->mlock); + + /* for vNic removal, ignore rc */ + return remove ? 0 : rc; +} diff --git a/drivers/net/mlx4_vnic/vnic_data_main.c b/drivers/net/mlx4_vnic/vnic_data_main.c new file mode 100644 index 0000000000000..c0022ccfa193b --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_data_main.c @@ -0,0 +1,1119 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "vnic.h" +#include "vnic_data.h" + +void vnic_login_refresh_mcasts(struct vnic_port *port) +{ + struct vnic_login *login; + + vnic_dbg_mark(); + mutex_lock(&port->mlock); + list_for_each_entry(login, &port->login_list, list) + vnic_tree_mcast_detach(&login->mcast_tree); + list_for_each_entry(login, &port->login_list, list) + vnic_tree_mcast_attach(&login->mcast_tree); + mutex_unlock(&port->mlock); +} + +int vnic_login_pre_create_1(struct vnic_port *port, + struct fip_vnic_data *vnic) +{ + struct vnic_login *login; + struct net_device *dev; + + /* set login to zero first (for parent_used case) */ + vnic->login = NULL; + + /* if parent_used, skip */ + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + return 0; + } else { + vnic_dbg_func(vnic->name); + } + + /* create netdev per login, vlan configuration is done from outside */ + dev = vnic_alloc_netdev(port); + if (IS_ERR(dev)) { + vnic_err(port->name, "vnic_alloc_netdev failed\n"); + goto err; + } + + login = vnic_netdev_priv(dev); + login->fip_vnic = vnic; + vnic->login = login; + + set_bit(VNIC_STATE_LOGIN_PRECREATE_1, &login->state); + + return 0; + +err: + return -ENODEV; +} + +int vnic_login_pre_create_2(struct fip_vnic_data *vnic, int qps_num, int is_lag) +{ + struct vnic_login *login = vnic->login; + int i, j; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + return 0; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + login->qps_num = qps_num; + login->qkey = VNIC_DATA_QKEY; + login->is_lag = is_lag; + VNIC_TXQ_SET_ACTIVE(login, min(login->tx_rings_num, login->qps_num)); + + /* prepare padding for runt packets */ + login->pad_va = kzalloc(VNIC_EOIB_ZLEN_MAX, GFP_KERNEL); + if (!login->pad_va) + return -ENOMEM; + + login->pad_dma = ib_dma_map_single(login->port->dev->ca, login->pad_va, + VNIC_EOIB_ZLEN_MAX, DMA_TO_DEVICE); + if (ib_dma_mapping_error(login->port->dev->ca, login->pad_dma)) + goto err; + + /* create TX resources */ + for (i = 0; i < login->tx_rings_num; ++i) { + if (vnic_create_tx_res(login, i)) { + vnic_err(login->name, "vnic_create_tx_res failed," + " index %d\n", i); + goto free_tx_res; + } + } + + /* create RX resources */ + for (j = 0; j < login->rx_rings_num; ++j) { + if (vnic_create_rx_res(login, j)) { + vnic_err(login->name, "vnic_create_rx_res failed," + " index %d\n", j); + goto free_rx_res; + } + } + + /* create QPs */ + if (vnic_create_qp_range(login)) { + vnic_err(login->name, "vnic_create_qp_range failed\n"); + goto free_rx_res; + } + + /* first QP is the base QP */ + login->qp_base_num = login->qp_res[0].qp->qp_num; + vnic->qp_base_num = login->qp_base_num; + + /* update state */ + set_bit(VNIC_STATE_LOGIN_PRECREATE_2, &login->state); + + login->queue_stopped = 0; + + /* calls vnic_do_get_stats() */ + queue_delayed_work(login_wq, &login->stats_task, VNIC_STATS_DELAY); + + return 0; + +free_rx_res: + for (--j; j >= 0; --j) + vnic_destroy_rx_res(login, j); + + i = login->tx_rings_num; +free_tx_res: + for (--i; i >= 0; --i) + vnic_destroy_tx_res(login, i); +/*free_pad:*/ + ib_dma_unmap_single(login->port->dev->ca, login->pad_dma, + VNIC_EOIB_ZLEN_MAX, DMA_TO_DEVICE); +err: + kfree(login->pad_va); + return -ENODEV; +} + +int vnic_login_register_netdev(struct fip_vnic_data *vnic, + const char *mac, + const char *name) +{ + struct vnic_login *login = vnic->login; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + vnic_info("%s created (parent %s mac "MAC_6_PRINT_FMT")\n", + name, 
vnic->parent_name, + MAC_6_PRINT_ARG(vnic->mac_cache)); + return 0; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + /* set netdev name and mac */ + if (name) + strncpy(login->dev->name, name, IFNAMSIZ); + if (mac) { + memcpy(login->dev->dev_addr, mac, ETH_ALEN); + /* save original mac */ + memcpy(login->dev_addr, mac, ETH_ALEN); + } + + /* set device features according to all_vlan mode */ + login->dev->features |= NETIF_F_HIGHDMA; + + //ronni - fixme. add comment here + if (!vnic->all_vlan_gw) { + login->dev->features |= NETIF_F_VLAN_CHALLENGED; + login->dev->features &= ~NETIF_F_HW_VLAN_FILTER; + } else + login->dev->features |= NETIF_F_HW_VLAN_FILTER; + + /* register netdev */ + if (register_netdev(login->dev)) { + vnic_err(login->name, "register_netdev failed name=%s mac=" + MAC_6_PRINT_FMT" login->dev=%p\n", + name ? name : "net_admin", + MAC_6_PRINT_ARG(login->dev->dev_addr), login->dev); + goto err; + } + + /* encode the port number in dev_id: + * This allows us to associate the net device + * with the underlying device's port. + */ + login->dev->dev_id = login->port->num - 1; + + if (vnic_create_dentry(login)) { + vnic_err(login->name, "vnic_create_dentry failed\n"); + goto err; + } + + /* print info only after register_netdev so dev->name is valid */ + sprintf(login->name, "%s", login->dev->name); + vnic_info("%s created (%s port %d)\n", + login->dev->name, + login->port->dev->ca->name, login->port->num); + + /* disable tx queues and carrier. They will be started + * after create 2 is called the mcast is attached ... + */ + netif_tx_disable(login->dev); + netif_carrier_off(login->dev); + + mutex_lock(&login->port->mlock); + vnic_dbg_mac(login->name, "added to login_list\n"); + list_add_tail(&login->list, &login->port->login_list); + mutex_unlock(&login->port->mlock); + + set_bit(VNIC_STATE_LOGIN_CREATE_1, &login->state); + + return 0; + +err: + return -EINVAL; +} + +int vnic_login_complete_ack(struct fip_vnic_data *vnic, + struct fip_login_data *login_data, + struct fip_shared_vnic_data *shared_vnic) +{ + struct vnic_mcast *mcaste, *mcaste_bcast, *mcast_shared = NULL; + struct vnic_login *login = vnic->login; + int rc; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + return 0; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + /* + * TODO, check if you need them all, check overlap with gw_neigh + * check how pkey is passed from FIP + */ + login->pkey = login_data->pkey; + login->pkey_index = login_data->pkey_index; + login->n_mac_mcgid = login_data->n_mac_mcgid; + login->gw_port_id = login_data->port_id; + login->sl = login_data->sl; + login->vnic_id = login_data->vnic_id; + + memcpy(login->mgid_prefix, login_data->mgid_prefix, VNIC_MGID_PREFIX_LEN); + memcpy(login->vnic_name, login_data->vnic_name, sizeof(login_data->vnic_name)); + memcpy(login->vendor_id, login_data->vendor_id, sizeof(login_data->vendor_id)); + + VNIC_STR_STRIP(login->vnic_name); + VNIC_STR_STRIP(login->vendor_id); /* set ZLEN (varies per VLAN support) */ + + /* set VLAN */ + login->zlen = ETH_ZLEN; + login->vlan_used = login_data->vp; + login->all_vlan_gw = login_data->all_vlan_gw; + if (VNIC_VLAN_ENABLED(login) || login->all_vlan_gw) { + login->vid = cpu_to_be16(login_data->vlan); + login->dev->hard_header_len += VLAN_HLEN; + login->zlen += VLAN_HLEN; + } + + if (vnic_encap_headroom) + login->zlen += VNIC_ENCAP_LEN; + + /* create gw_neigh (no RSS when sending to the GW) + * user zero mac to describe GW L2 address + */ + login->gw_neigh = + 
vnic_neighe_alloc(login, NULL, login_data->lid, + login_data->qpn, 0); + if (IS_ERR(login->gw_neigh)) { + vnic_err(login->name, "failed to alloc gw neigh\n"); + goto err; + } + + /* alloc mcast entries here to simplify the error flow */ + mcaste = vnic_mcast_alloc(login->port, NULL, NULL); + if (IS_ERR(mcaste)) + goto err_free_gw_ah; + mcaste_bcast = vnic_mcast_alloc(login->port, NULL, NULL); + if (IS_ERR(mcaste_bcast)) { + vnic_mcast_dealloc(mcaste); + goto err_free_gw_ah; + } + /* used by shared vnic mcast group */ + if (shared_vnic && shared_vnic->enabled) { + mcast_shared = vnic_mcast_alloc(login->port, NULL, NULL); + if (IS_ERR(mcast_shared)) { + vnic_mcast_dealloc(mcaste); + vnic_mcast_dealloc(mcaste_bcast); + goto err_free_gw_ah; + } + } + + /* attach to default mgid */ + __vnic_mcaste_fill(login, mcaste, login->gw_port_id, ETH_ZERO_MAC, 0, vnic_mcast_create); + mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC; + mcaste->retry = VNIC_MCAST_ULIMIT_RETRY; + mcaste->attach_cb = __bcast_attach_cb; + mcaste->detach_cb = __bcast_detach_cb; + mcaste->attach_cb_ctx = login; + mcaste->detach_cb_ctx = login; + rc = vnic_mcast_add(&login->mcast_tree, mcaste); + ASSERT(!rc); + rc = vnic_mcast_attach(&login->mcast_tree, mcaste); + ASSERT(!rc); + + /* attach to bcast mgid (use default mlid) */ + if (login->n_mac_mcgid || vnic_mgid_data_type) { + __vnic_mcaste_fill(login, mcaste_bcast, login->gw_port_id, ETH_BCAST_MAC, 0, 0); + mcaste_bcast->backoff_factor = VNIC_MCAST_BACKOF_FAC; + mcaste_bcast->retry = VNIC_MCAST_ULIMIT_RETRY; + /* The port gid is overun by the default gid as part of the mgid over + * same mlid hack */ + memcpy(&mcaste_bcast->port_gid, &mcaste->port_gid, GID_LEN); + rc = vnic_mcast_add(&login->mcast_tree, mcaste_bcast); + ASSERT(!rc); + rc = vnic_mcast_attach(&login->mcast_tree, mcaste_bcast); + ASSERT(!rc); + } else { + vnic_mcast_dealloc(mcaste_bcast); + } + + login->shared_vnic = 0; + /* attach to bcast mgid (use default mlid) */ + if (shared_vnic && shared_vnic->enabled) { + u8 rss_hash = shared_vnic->ip[0] ^ shared_vnic->ip[1] ^ + shared_vnic->ip[2] ^ shared_vnic->ip[3]; + + login->shared_vnic = 1; + __vnic_mcaste_fill(login, mcast_shared, login->gw_port_id, shared_vnic->emac, 0, 0); + mcast_shared->backoff_factor = VNIC_MCAST_BACKOF_FAC; + mcast_shared->retry = VNIC_MCAST_ULIMIT_RETRY; + memcpy(&mcast_shared->port_gid, &mcaste->port_gid, GID_LEN); + mcast_shared->gid.raw[12]= rss_hash; + + vnic_dbg_mcast(login->name, "vnic %s attaching shared vnic 1 " + "MGID "VNIC_GID_FMT"\n", login->name, + VNIC_GID_RAW_ARG(mcast_shared->gid.raw)); + mcaste = mcast_shared; + memcpy(mcaste->mac, ETH_BCAST_MAC, ETH_ALEN); + rc = vnic_mcast_add(&login->mcast_tree, mcaste); + ASSERT(!rc); + rc = vnic_mcast_attach(&login->mcast_tree, mcaste); + ASSERT(!rc); + } + + /* set state */ + set_bit(VNIC_STATE_LOGIN_CREATE_2, &login->state); + + /* call vnic_open() if open was called when we were not ready to handle it */ + if (test_bit(VNIC_STATE_LOGIN_OPEN_REQ, &login->state)) +#ifndef _BP_NO_NDO_OPS + login->dev->netdev_ops->ndo_open(login->dev); +#else + login->dev->open(login->dev); +#endif + + return 0; + +err_free_gw_ah: + vnic_neighe_dealloc(login->gw_neigh); +err: + return -EINVAL; +} + +/* + * When destroying login, call to stop login wq tasks. do not call from + * login_wq context. 
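+ * (the delayed works are canceled synchronously below, which could
+ * deadlock if invoked from a work item running on login_wq itself).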
+*/ +void vnic_login_destroy_stop_wq(struct fip_vnic_data *vnic, enum fip_flush flush) +{ + struct vnic_login *login = vnic->login; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + return; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + if (test_bit(VNIC_STATE_LOGIN_PRECREATE_1, &login->state)) { + /* cancel vnic_auto_moder() */ + vnic_dbg_mark(); + mutex_lock(&login->moder_lock); + login->queue_stopped = 1; + mutex_unlock(&login->moder_lock); +#ifndef _BP_WORK_SYNC + cancel_delayed_work_sync(&login->stats_task); + if (cancel_delayed_work_sync(&login->mcast_task)) + dev_put(login->dev); + cancel_delayed_work_sync(&login->restart_task); +#else + cancel_delayed_work(&login->stats_task); + if (cancel_delayed_work(&login->mcast_task)) + dev_put(login->dev); + cancel_delayed_work(&login->restart_task); + flush_workqueue(login_wq); +#endif + } +} + +/* + * When destroy login data struct. Assumes all login wq tasks are stopped. + * Can be called from any context, might block for a few secs. +*/ +void vnic_login_destroy_wq_stopped(struct fip_vnic_data *vnic, enum fip_flush flush) +{ + struct vnic_login *login = vnic->login; + unsigned long flags; + int i; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + vnic_info("%s destroyed (parent %s mac "MAC_6_PRINT_FMT")\n", + vnic->interface_name, vnic->parent_name, + MAC_6_PRINT_ARG(vnic->mac_cache)); + /* Note: vNics can be logged out by BXM (bypass sysfs calls) + * so we need to cleanup the parent here as well + * if we reach this function from sysfs calls, + * then vnic_parent_update will have no effect here (ok) + */ + vnic_parent_update(vnic->port, vnic->name, vnic->vnic_id, + vnic->mac_cache, NULL, vnic->parent_name, 1); + return; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + /* the cleanup procedure depends on our state, our vnic type + * (host/network admin), and the cleanup level required. In network admined + * vnics there is a single create state and only one cleanup level (full). + * for host admined there are two create states (init, regular) and two + * cleanup level. The flow depends on the reason for the cleanup. */ + vnic_dbg_data(login->name, "vnic_login_destroy flush=%d\n", flush); + + /* we need to change state to prevent from completion to re-open the TX + * queue once we close it. Before calling stop() function, need to make + * sure that all on-going hard_start_xmit() calls are done. 
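+	 * netif_tx_disable() takes every TX queue lock, so once it returns
+	 * no hard_start_xmit() call is still inside the driver.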
+ */ + + if (test_bit(VNIC_STATE_LOGIN_CREATE_1, &login->state)) { + set_bit(VNIC_STATE_LOGIN_NO_TX_ENABLE, &login->state); + netif_tx_disable(login->dev); + vnic_dbg_mark(); + } + + if (test_and_clear_bit(VNIC_STATE_LOGIN_CREATE_2, &login->state)) { + if (test_bit(VNIC_STATE_LOGIN_OPEN, &login->state)) { + /* calls vnic_stop() */ +#ifndef _BP_NO_NDO_OPS + login->dev->netdev_ops->ndo_stop(login->dev); +#else + login->dev->stop(login->dev); +#endif + set_bit(VNIC_STATE_LOGIN_OPEN_REQ, &login->state); + vnic_dbg_mark(); + } + vnic_mcast_del_all(&login->mcast_tree); + vnic_member_remove_all(login); + vnic_neighe_dealloc(login->gw_neigh); + vnic_dbg_mark(); + } + if (test_bit(VNIC_STATE_LOGIN_CREATE_1, &login->state)) + clear_bit(VNIC_STATE_LOGIN_NO_TX_ENABLE, &login->state); + + if (flush == FIP_FULL_FLUSH && + test_and_clear_bit(VNIC_STATE_LOGIN_CREATE_1, &login->state)) { + mutex_lock(&login->port->mlock); + vnic_dbg_mac(login->name, "delete from login_list\n"); + list_del(&login->list); + mutex_unlock(&login->port->mlock); + + /* print info if register_netdev was called before so + * dev->name is valid + */ + vnic_info("%s destroyed (%s port %d)\n", login->dev->name, + login->port->dev->ca->name, login->port->num); + + /* use irq save so caller function supports any context */ + write_lock_irqsave(&login->mac_rwlock, flags); + vnic_child_flush(login, 1); + write_unlock_irqrestore(&login->mac_rwlock, flags); + + vnic_delete_dentry(login); + unregister_netdev(login->dev); + vnic_dbg_mark(); + } + + vnic_dbg_mark(); + /* login_ctx was in pre created state [always true] */ + if (test_and_clear_bit(VNIC_STATE_LOGIN_PRECREATE_2, &login->state)) { + vnic_dbg_mark(); + /* tx queues are already stopped here */ + vnic_neigh_del_all(login); + /* take port->mlock in case of refresh event is being called vnic_refresh_mcasts */ + mutex_lock(&login->port->mlock); + vnic_mcast_del_all(&login->mcast_tree); + for (i = 0; i < login->qps_num; ++i) + vnic_destroy_qp(login, i); + mutex_unlock(&login->port->mlock); + + for (i = 0; i < login->rx_rings_num; ++i) + vnic_destroy_rx_res(login, i); + for (i = 0; i < login->tx_rings_num; ++i) + vnic_destroy_tx_res(login, i); + ib_dma_unmap_single(login->port->dev->ca, login->pad_dma, + VNIC_EOIB_ZLEN_MAX, DMA_TO_DEVICE); + kfree(login->pad_va); + } + + if (flush == FIP_FULL_FLUSH && + test_and_clear_bit(VNIC_STATE_LOGIN_PRECREATE_1, &login->state)) { + vnic_free_netdev(login); + } +} + +int vnic_vhube_add(struct fip_vnic_data *vnic, struct vnic_table_entry *vhube) +{ + struct vnic_neigh *neighe; + struct vnic_login *login = vnic->login; + int rc; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + return 0; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + vnic_dbg_data(login->name, "adding vhube lid 0x%02x qpn 0x%x, mac " + MAC_6_PRINT_FMT"\n", vhube->lid, vhube->qpn, + MAC_6_PRINT_ARG(vhube->mac)); + + neighe = vnic_neighe_alloc(login, vhube->mac, vhube->lid, + vhube->qpn, vhube->rss); + if (IS_ERR(neighe)) + return (int)PTR_ERR(neighe); + + vnic_dbg_mark(); + /* when adding new neighe, make sure that TX queues are not running. 
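+	 * netif_tx_lock_bh() below keeps hard_start_xmit() off the neigh
+	 * tree while the new entry is linked in.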
*/ + netif_tx_lock_bh(login->dev); + rc = vnic_neighe_add(login, neighe); + netif_tx_unlock_bh(login->dev); + if (rc) { + vnic_neighe_dealloc(neighe); + return rc; + } + + return 0; +} + +void vnic_vhube_flush(struct fip_vnic_data *vnic) +{ + struct vnic_login *login = vnic->login; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + return; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + /* when adding new neighe, make sure that TX queues are not running. */ + vnic_dbg_mark(); + netif_tx_lock_bh(login->dev); + vnic_neigh_del_all(login); + netif_tx_unlock_bh(login->dev); + + return; +} + +void vnic_vhube_del(struct fip_vnic_data *vnic, u8* mac) +{ + struct vnic_neigh *neighe; + struct vnic_login *login = vnic->login; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + return; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + vnic_dbg_mark(); + /* when adding new neighe, make sure that TX queues are not running. */ + netif_tx_lock_bh(login->dev); + neighe = vnic_neighe_search(login, mac); + if (IS_ERR(neighe)) { + vnic_warn(login->name, "couldn't find "MAC_6_PRINT_FMT"\n", + MAC_6_PRINT_ARG(mac)); + } else { + vnic_neighe_del(login, neighe); + vnic_neighe_dealloc(neighe); + } + netif_tx_unlock_bh(login->dev); + + return; +} + +struct fip_login_data login_data; +struct fip_vnic_data vnic; +struct vnic_login *__vnic_login_create(struct vnic_port *port, int index) +{ + struct vnic_login *login; + int rc, no_bxm_n_rss = 0x4; + int qps_num = (port->rx_rings_num > 1) ? (1 << no_bxm_n_rss) : 1; + + /* pre create vnic */ + rc = vnic_login_pre_create_1(port, &vnic); + if (rc) { + vnic_err(port->name, "vnic_login_pre_create_1 failed" + " for %s port %d index %d\n", + port->dev->ca->name, port->num, index); + goto err; + } + + login = vnic.login; + + rc = vnic_login_pre_create_2(&vnic, qps_num, 0); + if (rc) { + vnic_err(port->name, "vnic_login_pre_create_2 failed" + " for %s port %d index %d\n", + port->dev->ca->name, port->num, index); + goto create_fail; + } + + /* create vnic */ + memset(&login_data, 0, sizeof(struct fip_login_data)); + sprintf(login_data.vendor_id, "%s", NOT_AVAILABLE_STRING); + sprintf(login_data.vnic_name, "%s", NOT_AVAILABLE_STRING); + memcpy(login_data.mgid_prefix, NO_BXM_MGID_PREFIX, VNIC_MGID_PREFIX_LEN); + login_data.qpn = 0xa00000; + login_data.lid = 1; + login_data.pkey = 0xffff; + login_data.mtu = 1500; + + /* random_ether_addr(mac); */ + memcpy(login_data.mac, port->gid.raw + 10, ETH_ALEN); + login_data.mac[0] += index * 0x10; + /* mcast bit must be zero */ + login_data.mac[0] &= 0xfe; + vnic_dbg_mark(); + if (vnic_login_register_netdev(&vnic, login_data.mac, NULL)) { + vnic_err(login->name, "vnic_login_register_netdev failed\n"); + goto create_fail; + } + if (vnic_login_complete_ack(&vnic, &login_data, NULL)) { + vnic_err(login->name, "vnic_login_complete_ack failed\n"); + goto create_fail; + } + + return login; + +create_fail: + vnic_login_destroy(login->fip_vnic, FIP_FULL_FLUSH); +err: + return ERR_PTR(-ENODEV); +} + +int vnic_port_data_init(struct vnic_port *port) +{ + int i, no_bxm_vnic_per_port = 1; + + vnic_dbg_mark(); + mutex_lock(&port->start_stop_lock); + for (i = 0; i < no_bxm_vnic_per_port; ++i) { + __vnic_login_create(port, i); + } + mutex_unlock(&port->start_stop_lock); + + return 0; + /*TODO - JPM: handle vnic_login_create failure */ +} + +void vnic_port_data_cleanup(struct vnic_port *port) +{ + struct vnic_login *login, *login_t; + + vnic_dbg_mark(); + /* 
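+	 * the _safe iterator is used because each login is removed from
+	 * login_list during destroy;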
vnic_login_destroy() acquires the port->mlock, cannot hold it here */ + list_for_each_entry_safe(login, login_t, + &port->login_list, list) { + vnic_dbg_data(login->name, "login %s\n", login->name); + vnic_login_destroy(login->fip_vnic, FIP_FULL_FLUSH); + } +} + +/* ALI TODO: check if need to replace login ptr with vnic */ +void debug_dump_members(struct vnic_login *login, struct vnic_gw_info *member) +{ + int i; + + vnic_warn(login->name, "Error members_debug_dump " + "member id=%d gw id = %d active_count=%d\n", + member->member_id, member->gw_id, + login->lag_member_active_count); + + /* go over map and count how many entries are mapped to each member*/ + for (i = 0; i < MAX_LAG_MEMBERS; i++) { + vnic_warn(login->name, "%d member %d used %x gw_id %d\n", + i, login->lag_gw_neigh[i].member_id, + login->lag_gw_neigh[i].info, + login->lag_gw_neigh[i].gw_id); + } +} + +static void vnic_build_map_histogram(struct vnic_login *login, int member_id, int *hist) +{ + int i; + + memset(hist, 0, sizeof(int) * MAX_LAG_MEMBERS); + + /* go over map and count how many entries are mapped to each member */ + for (i = 0; i < LAG_MAP_TABLE_SIZE; i++) { + ASSERT(login->lag_gw_map[i] >= 0 && + login->lag_gw_map[i] < MAX_LAG_MEMBERS); + hist[login->lag_gw_map[i]]++; + } +} + +static void _vnic_remove_member_from_map(struct vnic_login *login, int member_id) +{ + int user_count[MAX_LAG_MEMBERS] = {0}, i, j, continue_flag, thresh; + + login->lag_member_active_count--; + if (login->lag_member_active_count > 0) { + /* go over map and count how many entries are mapped to each member*/ + vnic_build_map_histogram(login, member_id, user_count); + + thresh = 2; //it might be possible to find a better lower boundary + + for (i = 0; i < LAG_MAP_TABLE_SIZE; i++) { + /* entries that use the removed member must be remapped */ + if (login->lag_gw_map[i] != member_id) + continue; + + continue_flag = 1; + while (continue_flag) { + for (j = 0; j < MAX_LAG_MEMBERS; j++) { + if (j == member_id) + continue; + + /* Only use members that are connected, and are short of members */ + if (login->lag_gw_neigh[j].info & GW_MEMBER_INFO_MAPPED && + user_count[j] < thresh) { + login->lag_gw_map[i] = j; + user_count[j]++; + continue_flag = 0; + break; + } + } + if (j == MAX_LAG_MEMBERS) + thresh++; + } + } + } +} + +static void _vnic_add_member_to_map(struct vnic_login *login, int member_id) +{ + int user_count[MAX_LAG_MEMBERS] = {0}, expected, i, continue_flag, + thresh; + + /* this is the first active port use it for all maps */ + if (!login->lag_member_active_count) { + for (i = 0; i < LAG_MAP_TABLE_SIZE; i++) + login->lag_gw_map[i] = member_id; + login->lag_member_active_count++; + } else { + /* go over map and count how many entries are mapped to each member + * we will use count to reasign ports from the most heavily used members */ + vnic_build_map_histogram(login, member_id, user_count); + + /* when adding new member, make sure that TX queues are not running. 
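+	 * (callers hold netif_tx_lock_bh(), so the LAG map is not read
+	 * concurrently by the transmit path).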
*/ + login->lag_member_active_count++; + expected = LAG_MAP_TABLE_SIZE / login->lag_member_active_count; + thresh = LAG_MAP_TABLE_SIZE % login->lag_member_active_count; + continue_flag = 1; + while (continue_flag) { + for (i = 0; i < LAG_MAP_TABLE_SIZE; i++) { + if (user_count[login->lag_gw_map[i]] > expected + thresh) { + user_count[login->lag_gw_map[i]]--; + login->lag_gw_map[i] = member_id; + user_count[login->lag_gw_map[i]]++; + if (user_count[member_id] >= expected) { + continue_flag = 0; + break; + } + } + } + thresh--; + } + } +} + +void __bcast_member_attach_cb(struct vnic_mcast *mcaste, void *gw_ptr) +{ + struct vnic_gw_info *member = gw_ptr; + + /* When SA is local, mcast join works even when port is down */ + if (member->neigh.login->port->attr.state != IB_PORT_ACTIVE) + return; + + vnic_dbg_lag(member->neigh.login->name, "__bcast_member_attach_cb for member id %d and " + "gw_id=%d\n", member->member_id, member->gw_id); + + netif_tx_lock_bh(member->neigh.login->dev); + member->info |= GW_MEMBER_INFO_MCAST; + + if (member->info & GW_MEMBER_INFO_EPORT_UP && + !(member->info & GW_MEMBER_INFO_MAPPED)) { + _vnic_add_member_to_map(member->neigh.login, member->member_id); + member->info |= GW_MEMBER_INFO_MAPPED; + } + netif_tx_unlock_bh(member->neigh.login->dev); +} + +void __bcast_member_detach_cb(struct vnic_mcast *mcaste, void *gw_ptr) +{ + struct vnic_gw_info *member = gw_ptr; + + vnic_dbg_lag(member->neigh.login->name, "__bcast_member_detach_cb for member id %d and " + "gw_id=%d\n", member->member_id, member->gw_id); + + netif_tx_lock_bh(member->neigh.login->dev); + if (member->info & GW_MEMBER_INFO_MAPPED) + _vnic_remove_member_from_map(member->neigh.login, member->member_id); + + member->info &= ~(GW_MEMBER_INFO_MAPPED | GW_MEMBER_INFO_MCAST); + netif_tx_unlock_bh(member->neigh.login->dev); +} + +/* + * create MGIDs and join the default MCAST addresses. The mcaste are added to the + * list contained within member struct. If more MGIDs are used by the vnic when + * a member is added we will join those too using the members GW_ID. 
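+ * The per-MAC MGIDs come from the netdev mc_list and are joined below via
+ * _vnic_mcast_attach_mgid().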
+*/ +static int _vnic_add_member_mgid(struct vnic_login *login, + struct vnic_gw_info *member) +{ + struct vnic_mcast *mcaste, *mcaste_bcast; + int rc; +#ifndef _BP_NO_MC_LIST + struct dev_mc_list *mclist; +#else + struct netdev_hw_addr *ha; +#endif + + mcaste = vnic_mcast_alloc(login->port, NULL, NULL); + if (IS_ERR(mcaste)) + return (-ENOMEM); + + /* attach to default mgid */ + __vnic_mcaste_fill(login, mcaste, member->gw_id, ETH_ZERO_MAC, 0, vnic_mcast_create); + mcaste->attach_cb = __bcast_member_attach_cb; + mcaste->detach_cb = __bcast_member_detach_cb; + mcaste->attach_cb_ctx = member; + mcaste->detach_cb_ctx = member; + mcaste->priv_data = member; + rc = vnic_mcast_add(&login->mcast_tree, mcaste); + if (rc) { + debug_dump_members(login, member); + ASSERT(!rc); + } + + rc = vnic_mcast_attach(&login->mcast_tree, mcaste); + if (rc) { + debug_dump_members(login, member); + ASSERT(!rc); + } + + if (login->n_mac_mcgid) { + mcaste_bcast = vnic_mcast_alloc(login->port, NULL, NULL); + if (IS_ERR(mcaste_bcast)) + goto free_mcasts; + + __vnic_mcaste_fill(login, mcaste_bcast, member->gw_id, ETH_BCAST_MAC, 0, 0); + /* The port gid is overun by the default gid as part of the mgid over + * same mlid hack */ + memcpy(&mcaste_bcast->port_gid, &mcaste->port_gid, GID_LEN); + mcaste_bcast->priv_data = member; + rc = vnic_mcast_add(&login->mcast_tree, mcaste_bcast); + ASSERT(!rc); + rc = vnic_mcast_attach(&login->mcast_tree, mcaste_bcast); + ASSERT(!rc); + } + + + /* hold the tx lock so set_multicast_list() won't change mc_list */ + netif_tx_lock_bh(login->dev); +#ifndef _BP_NO_MC_LIST + for (mclist = login->dev->mc_list; mclist; mclist = mclist->next) { + u8* mmac = mclist->dmi_addr; +#else + netdev_for_each_mc_addr(ha, login->dev) { + u8* mmac = ha->addr; +#endif + /* do not add the default MGIDS because they are always used */ + if (IS_ZERO_MAC(mmac)) + continue; + if (IS_BCAST_MAC(mmac)) + continue; + + vnic_dbg_lag(login->name, "_vnic_add_member_mgid for " + MAC_6_PRINT_FMT" and member gw_id=%d\n", + MAC_6_PRINT_ARG(mcaste->mac), member->gw_id); + + if (_vnic_mcast_attach_mgid(login, mmac, mcaste, member, + member->gw_id)) + goto attach_failed; + } + netif_tx_unlock_bh(login->dev); + + return 0; + +attach_failed: + netif_tx_unlock_bh(login->dev); +free_mcasts: + vnic_mcast_del_user(&login->mcast_tree, member); + return -ENOMEM; +} + +int vnic_member_add(struct vnic_login *login, int member_id, struct lag_member *member) +{ + struct vnic_gw_info *member_e; + int ret; + + if (member_id >= MAX_LAG_MEMBERS || member_id < 0) + return -1; + + vnic_dbg_lag(login->name, "vnic_member_add for id %d and gw_id=%d\n", + member_id, member->gw_port_id); + + /* member id is already in use */ + if (login->lag_gw_neigh[member_id].info & GW_MEMBER_INFO_CREATED) + return -1; + + member_e = &login->lag_gw_neigh[member_id]; + + /* create new entry */ + member_e->member_id = member_id; + member_e->neigh.lid = member->lid; + member_e->neigh.qpn = member->qpn; + member_e->gw_id = member->gw_port_id; + member_e->neigh.login = login; + member_e->neigh.ah = vnic_ah_alloc(login, member->lid); + if (IS_ERR(member_e->neigh.ah)) + return -ENOMEM; + + /* need to add multicast code */ + ret = _vnic_add_member_mgid(login, member_e); + if (ret) + goto free_ah; + + netif_tx_lock_bh(login->dev); + member_e->info = GW_MEMBER_INFO_CREATED; + if (member->eport_state) + member_e->info |= GW_MEMBER_INFO_EPORT_UP; + login->lag_member_count++; + netif_tx_unlock_bh(login->dev); + + return 0; + +free_ah: + ib_destroy_ah(member_e->neigh.ah); + 
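+	/* mcast setup failed: the ah created above has been destroyed,
+	 * propagate the error to the caller */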
return ret; +} + +void vnic_member_remove_all(struct vnic_login *login) +{ + int i; + + if (!login->is_lag) + return; + + for (i = 0; i < MAX_LAG_MEMBERS; i++) + vnic_member_remove(login, i); +} + +int vnic_member_remove(struct vnic_login *login, int member_id) +{ + struct vnic_gw_info *member_e; + + vnic_dbg_lag(login->name, "vnic_member_remove for id %d\n", member_id); + + if (member_id >= MAX_LAG_MEMBERS || member_id < 0) + return -1; + + member_e = &login->lag_gw_neigh[member_id]; + + vnic_dbg_lag(login->name, "vnic_member_remove for id %d and gw_id=%d\n", member_id, member_e->gw_id); + + /* member id is not in use */ + if (!(member_e->info & GW_MEMBER_INFO_CREATED)) + return -1; + + netif_tx_lock_bh(login->dev); + if (member_e->info & GW_MEMBER_INFO_MAPPED) + _vnic_remove_member_from_map(login, member_e->member_id); + + member_e->info &= ~(GW_MEMBER_INFO_MAPPED); + netif_tx_unlock_bh(login->dev); + + /* modification of map will be done through mcast CB if needed */ + vnic_mcast_del_user(&login->mcast_tree, member_e); + + ib_destroy_ah(member_e->neigh.ah); + + member_e->info = 0; + login->lag_member_count--; + + return 0; +} + +void vnic_member_prop(struct vnic_login *login, struct lag_properties *prop) +{ + if (login->lag_prop.hash_mask != prop->hash_mask) { + netif_tx_lock_bh(login->dev); + memcpy(&login->lag_prop, prop, sizeof(login->lag_prop)); + netif_tx_unlock_bh(login->dev); + } +} + +/* + * modify a specific LAG eport member parameters. The parameters might not be + * "interesting" and might not effect data traffic. They might require creating + * a new ah, or might even result in a modification of the transmit hash mapping + * function. +*/ +int vnic_member_modify(struct vnic_login *login, int member_id, struct lag_member *member) +{ + struct vnic_gw_info *member_e; + struct ib_ah *ah, *ah1; + + if (member_id >= MAX_LAG_MEMBERS || member_id < 0) + return -1; + + member_e = &login->lag_gw_neigh[member_id]; + + vnic_dbg_lag(login->name, "vnic_member_modify for id %d and gw_id=%d\n", member_id, member_e->gw_id); + + /* member id is not in use */ + if (!(member_e->info & GW_MEMBER_INFO_CREATED)) + return -1; + + /* change in LID requires new ah */ + if (member_e->neigh.lid != member->lid) { + ah = member_e->neigh.ah; + ah1 = vnic_ah_alloc(login, member->lid); + if (IS_ERR(ah1)) + return -ENOMEM; + + netif_tx_lock_bh(login->dev); + member_e->neigh.lid = member->lid; + member_e->neigh.ah = ah1; + netif_tx_unlock_bh(login->dev); + ib_destroy_ah(ah); + } + + if (member_e->neigh.qpn != member->qpn) + member_e->neigh.qpn = member->qpn; + + netif_tx_lock_bh(login->dev); + /* link changed from up to down */ + if (member_e->info & GW_MEMBER_INFO_MAPPED && !member->eport_state) { + _vnic_remove_member_from_map(login, member_id); + member_e->info &= ~(GW_MEMBER_INFO_MAPPED | GW_MEMBER_INFO_EPORT_UP); + } + + /* link changed from down to up and mcast are connected */ + if (!(member_e->info & GW_MEMBER_INFO_MAPPED) && + member->eport_state) { + if (member_e->info & GW_MEMBER_INFO_MCAST) { + _vnic_add_member_to_map(login, member_id); + member_e->info |= (GW_MEMBER_INFO_MAPPED | GW_MEMBER_INFO_EPORT_UP); + } else + member_e->info |= GW_MEMBER_INFO_EPORT_UP; + } + netif_tx_unlock_bh(login->dev); + + return 0; +} + diff --git a/drivers/net/mlx4_vnic/vnic_data_neigh.c b/drivers/net/mlx4_vnic/vnic_data_neigh.c new file mode 100644 index 0000000000000..0a15fc4664f28 --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_data_neigh.c @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. 
All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_data.h" + +void vnic_neighe_dealloc(struct vnic_neigh *neighe) +{ + ASSERT(neighe); + if (neighe->ah) + ib_destroy_ah(neighe->ah); + kfree(neighe); +} + +struct ib_ah *vnic_ah_alloc(struct vnic_login *login, u16 dlid) +{ + struct ib_ah_attr av; + struct ib_ah *ah; + + memset(&av, 0, sizeof(av)); + av.dlid = dlid; + av.port_num = login->port->num; + av.sl = login->sl; + ah = ib_create_ah(login->port->pd, &av); + if (IS_ERR(ah)) { + return ERR_PTR(-ENOMEM); + } + return(ah); +} + +struct vnic_neigh *vnic_neighe_alloc(struct vnic_login *login, + const u8 *mac, + u16 dlid, u32 dqpn, u8 rss) +{ + struct vnic_neigh *neighe; + struct ib_ah *ah; + + neighe = kzalloc(sizeof *neighe, GFP_ATOMIC); + if (!neighe) + return ERR_PTR(-ENOMEM); + + ah = vnic_ah_alloc(login, dlid); + if (IS_ERR(ah)) { + kfree(neighe); + return ERR_PTR(-ENOMEM); + } + if (mac) + memcpy(neighe->mac, mac, ETH_ALEN); + neighe->rss = rss; + neighe->ah = ah; + neighe->qpn = dqpn; + neighe->lid = dlid; + neighe->login = login; + + return neighe; +} + +void vnic_neighe_del(struct vnic_login *login, struct vnic_neigh *neighe) +{ + ASSERT(neighe); + rb_erase(&neighe->rb_node, &login->neigh_tree); +} + +int vnic_neighe_add(struct vnic_login *login, struct vnic_neigh *neighe) +{ + struct rb_node **n = &login->neigh_tree.rb_node, *pn = NULL; + struct vnic_neigh *neighe_t; + int rc; + + while (*n) { + pn = *n; + neighe_t = rb_entry(pn, struct vnic_neigh, rb_node); + rc = memcmp(neighe->mac, neighe_t->mac, ETH_ALEN); + if (rc < 0) + n = &pn->rb_left; + else if (rc > 0) + n = &pn->rb_right; + else { + rc = -EEXIST; + goto out; + } + } + + rb_link_node(&neighe->rb_node, pn, n); + rb_insert_color(&neighe->rb_node, &login->neigh_tree); + rc = 0; + +out: + return rc; +} + +struct vnic_neigh *vnic_neighe_search(struct vnic_login *login, u8 *mac) +{ + struct rb_node *n = login->neigh_tree.rb_node; + struct vnic_neigh *neighe_t; + int rc; + + while (n) { + neighe_t = rb_entry(n, struct vnic_neigh, rb_node); + rc = memcmp(mac, neighe_t->mac, ETH_ALEN); + if (rc < 0) + n = n->rb_left; + else if (rc > 0) + n = n->rb_right; + else { + vnic_dbg_data(login->name, + "found: mac 
"MAC_6_PRINT_FMT" vid %d " + "qpn 0x%06x lid 0x%02x\n", + MAC_6_PRINT_ARG(neighe_t->mac), + be16_to_cpu(login->vid), neighe_t->qpn, + neighe_t->lid); + goto out; + } + } + neighe_t = ERR_PTR(-ENODATA); + +out: + return neighe_t; +} + +void vnic_neigh_del_all(struct vnic_login *login) +{ + struct rb_node *n; + struct vnic_neigh *neighe; + + ASSERT(login); + n = rb_first(&login->neigh_tree); + while (n) { + neighe = rb_entry(n, struct vnic_neigh, rb_node); + vnic_neighe_del(login, neighe); + n = rb_first(&login->neigh_tree); + vnic_neighe_dealloc(neighe); + } +} + diff --git a/drivers/net/mlx4_vnic/vnic_data_netdev.c b/drivers/net/mlx4_vnic/vnic_data_netdev.c new file mode 100644 index 0000000000000..29859c297f60b --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_data_netdev.c @@ -0,0 +1,1071 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "vnic.h" +#include "vnic_data.h" + +extern struct net_device_stats *mlx4_vnic_stats_func_container(struct net_device *n); + +static void mlx4_vnic_vlan_rx_add_vid(struct net_device *dev, unsigned short vid) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + vnic_dbg_data(login->name, "add VLAN:%d was called\n", vid); +} + +static void mlx4_vnic_vlan_rx_kill_vid(struct net_device *dev, unsigned short vid) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + vnic_dbg_data(login->name, "Kill VID:%d was called\n", vid); +} + +void vnic_carrier_update(struct vnic_login *login) +{ + int attached, eport_up, eport_enforce, carrier_ok; + + ASSERT(login); + attached = test_bit(VNIC_STATE_LOGIN_BCAST_ATTACH, &login->state); + eport_up = fip_vnic_get_eport_state(login->fip_vnic); + eport_enforce = vnic_eport_state_enforce; + carrier_ok = netif_carrier_ok(login->dev); + + /* bring carrier up */ + if (!carrier_ok && attached && (!eport_enforce || eport_up)) { + set_bit(VNIC_STATE_LOGIN_CARRIER_ON, &login->state); + netif_carrier_on(login->dev); + vnic_info("%s link is up\n", login->dev->name); + return; + } + + /* bring carrier down */ + if (carrier_ok && (!attached || (!eport_up && eport_enforce))) { + clear_bit(VNIC_STATE_LOGIN_CARRIER_ON, &login->state); + netif_carrier_off(login->dev); + vnic_info("%s link is down\n", login->dev->name); + return; + } + +} + +void __bcast_attach_cb(struct vnic_mcast *mcaste, void *login_ptr) +{ + struct vnic_login *login = login_ptr; + + /* When SA is local, mcast join works even when port is down */ + if (login->port->attr.state != IB_PORT_ACTIVE) + return; + set_bit(VNIC_STATE_LOGIN_BCAST_ATTACH, &login->state); + vnic_carrier_update(login); +} + +void __bcast_detach_cb(struct vnic_mcast *mcaste, void *login_ptr) +{ + struct vnic_login *login = login_ptr; + + clear_bit(VNIC_STATE_LOGIN_BCAST_ATTACH, &login->state); + vnic_carrier_update(login); +} + +/* this function cannot sleep, avoid any mutex() in consequent calls */ +static int vnic_set_mac(struct net_device *dev, void *_mac) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + struct sockaddr *saddr = _mac; + u8 *mac = (u8 *)(saddr->sa_data); + int rc = 0; + + vnic_dbg_func(login->name); + + vnic_dbg_mac(login->name, "mac "MAC_6_PRINT_FMT" => "MAC_6_PRINT_FMT"\n", + MAC_6_PRINT_ARG((u8 *)(dev->dev_addr)), + MAC_6_PRINT_ARG(mac)); + + /* must support child vNics for mac modification */ + if (!vnic_child_max) + return -ENOSYS; + + /* skip if invalid address */ + if (unlikely(!is_valid_ether_addr(mac))) + return -EINVAL; + + /* skip if same mac was already set */ + if (!(memcmp((u8 *)(dev->dev_addr), mac, ETH_ALEN))) + return 0; + + /* already in bh, calls vnic_child_update that queues a job, + * so read_lock is enough + */ + read_lock(&login->mac_rwlock); + + /* if mac same as original, delete child, set mac and return */ + if (!(memcmp(mac, login->dev_addr, ETH_ALEN))) + goto out; + + /* else, this is a new child vNic, + * add new child vNic + * NOTE: pay attention that the GC should not destroy a child vNic that + * is being used as mac-change even if it was created by different + * source. 
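+	 * Illustration only: a user-space 'ip link set <dev> address <mac>'
+	 * reaches this handler through the netdev set_mac_address hook and
+	 * ends up registering the new mac as a child vNic via
+	 * vnic_child_update() below.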
+ */ + rc = vnic_child_update(login, mac, 0); + if (rc && rc != -EEXIST) + goto err; + +out: + memcpy(dev->dev_addr, mac, ETH_ALEN); + vnic_child_update(login, (u8 *)(dev->dev_addr), 1); + vnic_dbg_mac(login->name, "mac changed successfully to " + MAC_6_PRINT_FMT"\n", MAC_6_PRINT_ARG(mac)); + +err: + read_unlock(&login->mac_rwlock); + return rc; +} + +static void vnic_set_multicast_list(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + vnic_dbg_func(login->name); + + /* test promisc flag changes */ + if (is_ucast_promisc(login) && !login->promisc) { + /* promisc is being set */ + if (!vnic_child_max) { + /* must support child vNics for promisc mode */ + vnic_info("%s promisc mode cannot be set " + "(vnic_child_max %u)\n", + dev->name, vnic_child_max); + } else if (vnic_src_mac_enforce) { + /* cannot support promisc if source mac is enforced + * because sender should be able to use any smac + */ + vnic_info("%s promisc mode cannot be set " + "(vnic_src_mac_enforce %u)\n", + dev->name, vnic_src_mac_enforce); + } else { + login->promisc = 1; + vnic_dbg_mac(dev->name, + "entered promiscuous mode: confirmed\n"); + } + } else if (!is_ucast_promisc(login) && login->promisc) { + /* promisc is being cleared */ + login->promisc = 0; + write_lock(&login->mac_rwlock); + vnic_child_flush(login, 0); + write_unlock(&login->mac_rwlock); + vnic_dbg_mac(dev->name, + "left promiscuous mode: confirmed\n"); + } + + /* test mcast changes */ + if (!no_bxm && !login->queue_stopped) { + dev_hold(dev); + if (!queue_delayed_work(login_wq, &login->mcast_task, HZ / 100)) + dev_put(dev); + } +} + +static void vnic_auto_moder(struct vnic_login *login) +{ + unsigned long period = + (unsigned long)(jiffies - login->last_moder_jiffies); + unsigned long packets; + unsigned long rate; + unsigned long avg_pkt_size; + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long tx_packets; + unsigned long tx_pkt_diff; + unsigned long rx_pkt_diff; + int moder_time; + + period = (unsigned long)(jiffies - login->last_moder_jiffies); +#if 0 + vnic_dbg_moder_v(login->name, "adaptive_rx_coal %d, period %d, " + "sample_interval %d, state %d\n", + login->adaptive_rx_coal, period, + login->sample_interval, login->port->attr.state); +#endif + + if (!login->adaptive_rx_coal || period < login->sample_interval * HZ) + return; + + /* TODO: when NAPI is disabled, the RX completion will be called from + * IRQ context (and not BH context) and thus spin_lock_bh should be + * replaced with spin_lock_irq + */ + spin_lock_bh(&login->stats_lock); + rx_packets = login->stats.rx_packets; + rx_bytes = login->stats.rx_bytes; + tx_packets = login->stats.tx_packets; + spin_unlock_bh(&login->stats_lock); + + if (!login->last_moder_jiffies || !period) + goto out_set; + + tx_pkt_diff = ((unsigned long)(tx_packets - + login->last_moder_tx_packets)); + rx_pkt_diff = ((unsigned long)(rx_packets - login->last_moder_packets)); + packets = max(tx_pkt_diff, rx_pkt_diff); + rate = packets * HZ / period; + avg_pkt_size = packets ? ((unsigned long)(rx_bytes - + login->last_moder_bytes)) / + packets : 0; + + if (rate > VNIC_RX_RATE_THRESH && avg_pkt_size > VNIC_AVG_PKT_SMALL) { + /* If tx and rx packet rates are not balanced, assume that + * traffic is mainly BW bound and apply maximum moderation. 
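+		 * ("Maximum" here is rx_usecs_high. The rate-based branch below
+		 * interpolates linearly, e.g. for pkt_rate_low < rate < pkt_rate_high:
+		 *   moder_time = rx_usecs_low + (rate - pkt_rate_low) *
+		 *                (rx_usecs_high - rx_usecs_low) /
+		 *                (pkt_rate_high - pkt_rate_low)
+		 * so moderation ramps from rx_usecs_low up to rx_usecs_high.)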
+ * Otherwise, moderate according to packet rate */ + if (2 * tx_pkt_diff > 3 * rx_pkt_diff || + 2 * rx_pkt_diff > 3 * tx_pkt_diff) { + moder_time = login->rx_usecs_high; + } else { + if (rate < login->pkt_rate_low) + moder_time = login->rx_usecs_low; + else if (rate > login->pkt_rate_high) + moder_time = login->rx_usecs_high; + else + moder_time = (rate - login->pkt_rate_low) * + (login->rx_usecs_high - login->rx_usecs_low) / + (login->pkt_rate_high - login->pkt_rate_low) + + login->rx_usecs_low; + } + } else { + moder_time = login->rx_usecs_low; + } + + if (moder_time != login->last_moder_time) { + vnic_dbg_moder(login->name, "tx rate:%lu rx_rate:%lu\n", + tx_pkt_diff * HZ / period, + rx_pkt_diff * HZ / period); + vnic_dbg_moder(login->name, + "Rx moder_time changed from:%lu to %d period:%lu" + " [jiff] packets:%lu avg_pkt_size:%lu rate:%lu" + " [p/s])\n", login->last_moder_time, moder_time, + period, packets, avg_pkt_size, rate); + login->last_moder_time = moder_time; + vnic_ib_set_moder(login, + login->last_moder_time, login->rx_frames, + login->tx_usecs, login->tx_frames); + } + +out_set: + login->last_moder_packets = rx_packets; + login->last_moder_tx_packets = tx_packets; + login->last_moder_bytes = rx_bytes; + login->last_moder_jiffies = jiffies; +} + +void vnic_dump_stats(struct vnic_login *login) +{ + unsigned long *stats, *login_stats = (unsigned long *)(&login->stats); + int i, j, len = sizeof(struct net_device_stats) / sizeof(unsigned long); + struct net_device_stats stats_tmp; + + spin_lock_bh(&login->stats_lock); + /* tx stats are distributed between tx_res entries */ + stats_tmp = login->stats; + memset(&login->stats, 0, sizeof(struct net_device_stats)); + for (i = 0; i < login->tx_rings_num; ++i) { + stats = (unsigned long *)(&login->tx_res[i].stats); + for (j = 0; j < len; ++j) + login_stats[j] += stats[j]; + } + + /* rx stats are in login->stats */ + login->stats.rx_bytes = stats_tmp.rx_bytes; + login->stats.rx_packets = stats_tmp.rx_packets; + login->stats.rx_errors = stats_tmp.rx_errors; + login->stats.rx_dropped = stats_tmp.rx_dropped; + spin_unlock_bh(&login->stats_lock); +} + +static void vnic_do_get_stats(struct work_struct *work) +{ + struct vnic_login *login = + container_of(work, struct vnic_login, stats_task.work); + + mutex_lock(&login->moder_lock); + vnic_dump_stats(login); + + if (login->queue_stopped) + goto out; + + if (!(test_bit(VNIC_STATE_LOGIN_OPEN, &login->state))) + goto resched; + + if (login->port->attr.state == IB_PORT_ACTIVE) + vnic_auto_moder(login); + +resched: + /* calls vnic_do_get_stats() */ + if (!login->queue_stopped) + queue_delayed_work(login_wq, &login->stats_task, VNIC_STATS_DELAY); +out: + mutex_unlock(&login->moder_lock); +} + +static void vnic_mcast_reattach(struct work_struct *work) +{ + struct vnic_mcast *mcaste, *mcaste_t; + struct rb_node *n; + unsigned long flags; + union vhub_mgid mgid; + LIST_HEAD(local_list); + int i; + struct vnic_gw_info *lag_member; + struct vnic_login *login; + struct net_device *dev; +#ifndef _BP_NO_MC_LIST + struct dev_mc_list *mclist; +#else + struct netdev_hw_addr *ha; +#endif + + login = container_of(work, struct vnic_login, mcast_task.work); + dev = login->dev; + + vnic_dbg_mcast(login->name, "set_multicast_list was notified\n"); + if (login->queue_stopped) { + dev_put(dev); + return; + } + + /* detach all mcast (except default and bcast mcasts) */ + spin_lock_irqsave(&login->mcast_tree.mcast_rb_lock, flags); + if (!list_empty(&login->mcast_tree.reattach_list)) { + /* an event is being processed */ 
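+		/* while an event still holds entries on reattach_list we must
+		 * not rebuild the tree here; drop the lock and requeue this
+		 * work through the retry label below (HZ / 100 later).
+		 */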
+ spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags); + goto retry; + } + + for (n = rb_first(&login->mcast_tree.mcast_tree); n; n = rb_next(n)) { + mcaste = rb_entry(n, struct vnic_mcast, rb_node); + if (IS_ZERO_MAC(mcaste->mac)) + continue; + if (IS_BCAST_MAC(mcaste->mac)) + continue; + list_add_tail(&mcaste->list, &local_list); + } + + list_for_each_entry(mcaste, &local_list, list) { + vnic_mcast_del(&login->mcast_tree, mcaste); + mcaste->attach_task_cnt = 0; + } + + spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags); + + vnic_dbg_mcast(login->name, "local_list is %s empty n_mac_mcgid %u\n", + (list_empty(&local_list) ? "" : "not"), + login->n_mac_mcgid); + + list_for_each_entry_safe(mcaste, mcaste_t, &local_list, list) { + list_del(&mcaste->list); + vnic_mcast_detach(&login->mcast_tree, mcaste); + vnic_mcast_dealloc(mcaste); + } + + /* attach all mcasts in mc_list */ + vhub_mgid_create(login->mgid_prefix, ETH_ZERO_MAC, login->n_mac_mcgid, + CREATE_VHUB_ID(login->vid, login->gw_port_id), + VHUB_MGID_DATA, 0, &mgid); + + spin_lock_irqsave(&login->mcast_tree.mcast_rb_lock, flags); + mcaste_t = vnic_mcast_search(&login->mcast_tree, &mgid.ib_gid); + if (IS_ERR(mcaste_t) || !test_bit(VNIC_STATE_LOGIN_BCAST_ATTACH, &login->state)) { + vnic_dbg_data(login->name, "default mgid not ready\n"); + spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags); + dev_put(dev); + return; + } + spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags); + + /* hold the tx lock so set_multicast_list() won't change mc_list */ + netif_tx_lock_bh(dev); +#ifndef _BP_NO_MC_LIST + for (mclist = login->dev->mc_list; mclist; mclist = mclist->next) { + u8* mmac = mclist->dmi_addr; +#else + netdev_for_each_mc_addr(ha, login->dev) { + u8* mmac = ha->addr; +#endif + /* do not add the default MGIDS because they are always used */ + if (IS_ZERO_MAC(mmac)) + continue; + if (IS_BCAST_MAC(mmac)) + continue; + + /* attach to the legacy GW / LAG gw id MGID */ + if (_vnic_mcast_attach_mgid(login, mmac, mcaste_t, login, + login->gw_port_id)) + goto attach_failed; + + if (!login->is_lag) + continue; + + for (i = 0; i < MAX_LAG_MEMBERS; i++) { + lag_member = &login->lag_gw_neigh[i]; + /* member id is already in use */ + if (lag_member->info & GW_MEMBER_INFO_CREATED) + /* attach to the legacy GW / LAG gw id MGID */ + if (_vnic_mcast_attach_mgid(login, mmac, + mcaste_t, + lag_member, + lag_member->gw_id)) + goto attach_failed; + } + } + netif_tx_unlock_bh(dev); + dev_put(dev); + return; + +attach_failed: + netif_tx_unlock_bh(dev); + vnic_mcast_del_all(&login->mcast_tree); + +retry: + if (!login->queue_stopped) { + if (!queue_delayed_work(login_wq, &login->mcast_task, HZ / 100)) + dev_put(dev); + } else + dev_put(dev); +} + +static int vnic_change_mtu(struct net_device *dev, int new_mtu) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + if (new_mtu > login->max_mtu) { + vnic_warn(login->name, "failed: new_mtu %d > %d\n", new_mtu, + login->max_mtu); + return -EINVAL; + } + + vnic_dbg_data(login->name, "mtu %d -> %d\n", dev->mtu, new_mtu); + dev->mtu = new_mtu; + + return 0; +} + +static void vnic_set_default_moder(struct vnic_login *login) +{ + + login->rx_frames = VNIC_RX_COAL_TARGET / login->dev->mtu + 1; + login->rx_usecs = VNIC_RX_COAL_TIME; + login->tx_frames = VNIC_TX_COAL_PKTS; + login->tx_usecs = VNIC_TX_COAL_TIME; + login->pkt_rate_low = VNIC_RX_RATE_LOW; + login->rx_usecs_low = VNIC_RX_COAL_TIME_LOW; + login->pkt_rate_high = VNIC_RX_RATE_HIGH; + login->rx_usecs_high = 
VNIC_RX_COAL_TIME_HIGH; + login->sample_interval = VNIC_SAMPLE_INTERVAL; + login->adaptive_rx_coal = 1; + login->last_moder_time = VNIC_AUTO_CONF; + login->last_moder_jiffies = 0; + login->last_moder_packets = 0; + login->last_moder_tx_packets = 0; + login->last_moder_bytes = 0; + + vnic_dbg_data(login->name, "default coalescing params for mtu:%d to " + "rx_frames:%d rx_usecs:%d " + "tx_frames:%d tx_usecs:%d\n", + login->dev->mtu, + login->rx_frames, login->rx_usecs, + login->tx_frames, login->tx_usecs); +} + +#ifndef _BP_NAPI_POLL +int vnic_napi_alloc(struct vnic_login *login, int rx_res_index) +{ + + struct napi_struct *napi = &login->rx_res[rx_res_index].napi; + + netif_napi_add(login->dev, napi, vnic_poll_cq_rx, vnic_napi_weight); + + return 0; +} + +void vnic_napi_enable(struct vnic_login *login, int rx_res_index) +{ + + struct napi_struct *napi = &login->rx_res[rx_res_index].napi; + napi_enable(napi); +} + +static void vnic_napi_disable(struct vnic_login *login, int rx_res_index) +{ + struct napi_struct *napi = &login->rx_res[rx_res_index].napi; + + if (!napi->poll) + return; + + napi_disable(napi); +} + +static void vnic_napi_dealloc(struct vnic_login *login, int rx_res_index) +{ +#ifndef _BP_NAPI_NO_DEL + struct napi_struct *napi = &login->rx_res[rx_res_index].napi; + + netif_napi_del(napi); +#else + return; +#endif +} + +#else +int vnic_napi_alloc(struct vnic_login *login, int rx_res_index) +{ + struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index]; + char name[IFNAMSIZ]; + + snprintf(name, IFNAMSIZ, "%s-N%d", login->name, rx_res_index); + rx_res->poll_dev = + alloc_netdev(0, name, ether_setup); + if (!rx_res->poll_dev) + return -ENOMEM; + + rx_res->poll_dev = rx_res->poll_dev; + rx_res->poll_dev->priv = rx_res; + rx_res->poll_dev->weight = vnic_napi_weight; + rx_res->poll_dev->poll = vnic_poll_cq_rx; + + return 0; +} + +void vnic_napi_enable(struct vnic_login *login, int rx_res_index) +{ + struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index]; + + ASSERT(rx_res->poll_dev); + set_bit(__LINK_STATE_START, &rx_res->poll_dev->state); +} + +static void vnic_napi_disable(struct vnic_login *login, int rx_res_index) +{ + struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index]; + struct net_device *poll_dev = rx_res->poll_dev; + + if (!poll_dev) + return; + + while (test_bit(__LINK_STATE_RX_SCHED, &poll_dev->state)) + msleep(VNIC_NAPI_SCHED_TIMEOUT); +} + +static void vnic_napi_dealloc(struct vnic_login *login, int rx_res_index) +{ + struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index]; + struct net_device *poll_dev = rx_res->poll_dev; + + if (!poll_dev) + return; + + free_netdev(poll_dev); + rx_res->poll_dev = NULL; +} +#endif + +static int _vnic_open(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int i; + + /* Todo add locks here */ + if (!(test_bit(VNIC_STATE_LOGIN_CREATE_2, &login->state))) { + set_bit(VNIC_STATE_LOGIN_OPEN_REQ, &login->state); + return 0; + } + + if (test_and_set_bit(VNIC_STATE_LOGIN_OPEN, &login->state)) + return 0; + + clear_bit(VNIC_STATE_LOGIN_OPEN_REQ, &login->state); + + /* ARM RX handlers */ + for (i = 0; i < login->rx_rings_num; ++i) { + login->rx_res[i].stopped = 0; + if (ib_req_notify_cq(login->rx_res[i].cq, IB_CQ_NEXT_COMP)) { + vnic_err(login->name, "ib_req_notify_cq failed\n"); + goto err; + } + } + + /* ARM TX handlers */ + for (i = 0; i < login->tx_rings_num; ++i) { + login->tx_res[i].stopped = 0; + spin_lock_init(&login->tx_res[i].lock); + if (!vnic_tx_polling && + ib_req_notify_cq(login->tx_res[i].cq, 
IB_CQ_NEXT_COMP)) { + vnic_err(login->name, "ib_req_notify_cq failed\n"); + goto err; + } + } + + /* enable napi*/ + for (i = 0; i < login->napi_num; ++i) + vnic_napi_enable(login, i); + + /* move QP to RTS, post recv skb */ + if (vnic_ib_open(dev)) + goto err_napi; + + /* dummy call */ + if (vnic_ib_up(dev)) + goto err_ib_stop; + + /* configure */ + vnic_set_default_moder(login); + if (vnic_ib_set_moder(login, login->last_moder_time, login->rx_frames, + login->tx_usecs, login->tx_frames)) + vnic_warn(login->name, "vnic_ib_set_moder failed!\n"); + + /* start interface TX queue */ + VNIC_TXQ_START_ALL(login); + + /* report and return */ + vnic_info("%s is opened\n", dev->name); + + return 0; + +err_ib_stop: + vnic_ib_stop(dev); +err_napi: + /* disable napi*/ + for (i = 0; i < login->napi_num; ++i) + vnic_napi_disable(login, i); +err: + return -EINVAL; +} + +static int vnic_open(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int ret; + + vnic_dbg_func(login->name); + + mutex_lock(&login->state_lock); + ret = _vnic_open(dev); + mutex_unlock(&login->state_lock); + return ret; +} + +static int _vnic_stop(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int i, _watchdog_timeo = dev->watchdog_timeo; + + /* check if already stopped */ + if (!(test_and_clear_bit(VNIC_STATE_LOGIN_OPEN, &login->state))) + return 0; + + /* Set trans_start to jiffies and watchdog_timeo to max + * to avoid spurious transmit timeouts in the interval between + * tx queue stopped and carrier down. + */ + dev->trans_start = jiffies; + dev->watchdog_timeo = 0x7fffffff; + + VNIC_TXQ_STOP_ALL(login); + + /* disable rx handlers */ + for (i = 0; i < login->rx_rings_num; ++i) + login->rx_res[i].stopped = 1; + + /* disable tx handlers */ + for (i = 0; i < login->tx_rings_num; ++i) + login->tx_res[i].stopped = 1; + + /* disable napi managers */ + for (i = 0; i < login->napi_num; ++i) + vnic_napi_disable(login, i); + + vnic_ib_down(dev); + vnic_ib_stop(dev); + + /* restore watchdog_timeo */ + dev->watchdog_timeo = _watchdog_timeo; + + vnic_info("%s is stopped\n", dev->name); + + return 0; +} + +static int vnic_stop(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int ret; + + vnic_dbg_func(login->name); + + mutex_lock(&login->state_lock); + ret = _vnic_stop(dev); + mutex_unlock(&login->state_lock); + + return ret; +} + +int vnic_restart(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int rc = 0; + + if (login->queue_stopped || !test_bit(VNIC_STATE_LOGIN_OPEN, &login->state)) + return rc; + + set_bit(VNIC_STATE_LOGIN_NO_TX_ENABLE, &login->state); + netif_tx_disable(login->dev); + + mutex_lock(&login->state_lock); + _vnic_stop(login->dev); + + clear_bit(VNIC_STATE_LOGIN_NO_TX_ENABLE, &login->state); + set_bit(VNIC_STATE_LOGIN_OPEN_REQ, &login->state); + + rc = _vnic_open(login->dev); + mutex_unlock(&login->state_lock); + + return rc; +} + +static void vnic_restart_task(struct work_struct *work) +{ + struct vnic_login *login = + container_of(work, struct vnic_login, restart_task.work); + + vnic_restart(login->dev); +} + +struct net_device_stats *vnic_get_stats(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + spin_lock_bh(&login->stats_lock); + memcpy(&login->ret_stats, &login->stats, sizeof(login->stats)); + spin_unlock_bh(&login->stats_lock); + + return &login->ret_stats; +} + +static void vnic_tx_timeout(struct net_device *dev) +{ + struct vnic_login *login = 
vnic_netdev_priv(dev); + + vnic_warn(login->name, "TX timeout called on port: %d, " + "latency: %d msec, stopped: %d, carrier_ok: %d," + "queue_stopped: %d, watchdog_timeo: %d msec\n", + login->port->num, + jiffies_to_msecs(jiffies - dev->trans_start), + netif_queue_stopped(dev), netif_carrier_ok(dev), + login->queue_stopped, + jiffies_to_msecs(dev->watchdog_timeo)); + + if (netif_carrier_ok(dev)) { + VNIC_STATS_DO_INC(login->port_stats.tx_timeout); + if (!login->queue_stopped) { + vnic_warn(login->name, "TX timeout, queueing rings restart\n"); + queue_delayed_work(login_wq, &login->restart_task, HZ / 100); + } + } +} + +#ifndef _BP_NETDEV_NO_TMQ +u16 vnic_select_queue(struct net_device *dev, struct sk_buff *skb) +{ + /* Notes: + * - In kernel 2.6.32 the skb->mac_header 0x1a is not set when + * select_queue() is called + * - In OVM Server 3.0, DomU tx skb network and transport + * headers are not set + */ + skb_reset_mac_header(skb); + skb_set_network_header(skb, ETH_HLEN); + skb_set_transport_header(skb, + ETH_HLEN + + (skb->protocol == htons(ETH_P_IPV6) ? + sizeof(struct ipv6hdr) : ip_hdrlen(skb))); + + return vnic_hash(dev, skb) % dev->real_num_tx_queues; +} + +#endif + +#ifndef _BP_NO_NDO_OPS +static struct net_device_ops vnic_netdev_ops = { + .ndo_open = vnic_open, + .ndo_stop = vnic_stop, + .ndo_start_xmit = vnic_tx, + .ndo_get_stats = mlx4_vnic_stats_func_container, + .ndo_set_multicast_list = vnic_set_multicast_list, + .ndo_change_mtu = vnic_change_mtu, + .ndo_tx_timeout = vnic_tx_timeout, + .ndo_set_mac_address = vnic_set_mac, + .ndo_vlan_rx_add_vid = mlx4_vnic_vlan_rx_add_vid, + .ndo_vlan_rx_kill_vid = mlx4_vnic_vlan_rx_kill_vid, +#ifndef _BP_NETDEV_NO_TMQ + .ndo_select_queue = vnic_select_queue, +#endif +}; +#endif + +static void vnic_setup(struct net_device *dev) +{ + ether_setup(dev); + + dev->hard_header_len += VNIC_SKB_GET_ENCAP_OFFSET; + dev->watchdog_timeo = VNIC_WATCHDOG_TIMEOUT; + +#ifndef _BP_NO_NDO_OPS + if (!vnic_change_mac) + vnic_netdev_ops.ndo_set_mac_address = NULL; + + dev->netdev_ops = &vnic_netdev_ops; +#else + dev->open = vnic_open; + dev->stop = vnic_stop; + dev->hard_start_xmit = vnic_tx; + dev->get_stats = mlx4_vnic_stats_func_container; + dev->set_multicast_list = vnic_set_multicast_list; + dev->change_mtu = vnic_change_mtu; + dev->tx_timeout = vnic_tx_timeout; + dev->set_mac_address = vnic_set_mac; + dev->vlan_rx_add_vid = mlx4_vnic_vlan_rx_add_vid; + dev->vlan_rx_kill_vid = mlx4_vnic_vlan_rx_kill_vid; + + if (!vnic_change_mac) + dev->set_mac_address = NULL; + +#ifndef _BP_NETDEV_NO_TMQ + dev->select_queue = vnic_select_queue; +#endif +#endif // _BP_NO_NDO_OPS +} + +static int vnic_get_frag_header(struct skb_frag_struct *frags, void **mac_hdr, + void **ip_hdr, void **tcpudp_hdr, + u64 *hdr_flags, void *priv) +{ + struct iphdr *iph; + + *mac_hdr = page_address(frags->page) + frags->page_offset; + *ip_hdr = iph = (struct iphdr *)(*mac_hdr + ETH_HLEN); + *tcpudp_hdr = (struct tcphdr *)(iph + (iph->ihl << 2)); + *hdr_flags = LRO_IPV4 | LRO_TCP; + + return 0; +} + +static int vnic_get_skb_header(struct sk_buff *skb, void **iphdr, + void **tcphdr, u64 *hdr_flags, void *priv) +{ + struct iphdr *iph; + struct tcphdr *tcph; + + if (unlikely(skb->protocol != htons(ETH_P_IP))) + return -1; + + if (unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY)) + return -1; + + iph = (struct iphdr *)(skb->data + ETH_HLEN); + if (iph->protocol != IPPROTO_TCP) + return -1; + + tcph = (struct tcphdr *)(iph + (iph->ihl << 2)); + + if (ntohs(iph->tot_len) < (iph->ihl * 4 + tcph->doff * 
4)) + return -1; + + *hdr_flags = LRO_IPV4 | LRO_TCP; + *iphdr = iph; + *tcphdr = tcph; + + return 0; +} + +static int vnic_lro_enable(struct vnic_login *login, int rx_res_index) +{ + struct net_lro_mgr *lro = &login->rx_res[rx_res_index].lro; + + lro->dev = login->dev; + lro->features = login->napi_num ? LRO_F_NAPI : 0; + lro->frag_align_pad = NET_IP_ALIGN; + lro->ip_summed = CHECKSUM_UNNECESSARY; + lro->ip_summed_aggr = CHECKSUM_UNNECESSARY; + lro->max_desc = login->lro_num; + lro->max_aggr = VNIC_MAX_LRO_AGGR; + lro->lro_arr = login->rx_res[rx_res_index].lro_desc; + + if (lro->max_aggr > MAX_SKB_FRAGS) + lro->max_aggr = MAX_SKB_FRAGS; + + if (!vnic_rx_linear) + lro->get_frag_header = vnic_get_frag_header; + else + lro->get_skb_header = vnic_get_skb_header; + + return 0; +} + +static void vnic_lro_disable(struct vnic_login *login, int rx_res_index) +{ + /* nop */ + return; +} + +struct net_device *vnic_alloc_netdev(struct vnic_port *port) +{ + struct vnic_login_info *info; + struct vnic_login *login; + struct net_device *dev; + static int vnic_cnt = 0; + int i; + + dev = VNIC_TXQ_ALLOC_NETDEV(sizeof *info, "eth%d", vnic_setup, port->tx_rings_num); + if (!dev) { + vnic_err(port->name, "VNIC_TXQ_ALLOC_NETDEV failed " + "(size %Zu, tx_rings_num %d)\n", + sizeof *info, port->tx_rings_num); + goto err; + } + + /* this is a *very* large beast... */ + login = vmalloc(sizeof *login); + if (!login) { + vnic_err(port->name, "failed to allocate login struct (%Zu)\n", + sizeof *login); + goto free_netdev; + } + + /* init fields */ + memset(login, 0, sizeof *login); + info = netdev_priv(dev); + info->login = login; + login->dev = dev; + login->port = port; + login->max_mtu = VNIC_BUF_SIZE(login->port) - IB_GRH_BYTES - + VNIC_ENCAP_LEN - ETH_HLEN - VLAN_HLEN; + login->cnt = ++vnic_cnt; + /* name will be overwritten later */ + sprintf(login->name, "%s-%d", "vnic", login->cnt); + sprintf(login->desc, "%s-P%d", + login->port->dev->ca->node_desc, port->num); + + login->rx_csum = 1; + login->rx_rings_num = port->rx_rings_num; + login->tx_rings_num = port->tx_rings_num; +#ifdef _BP_NETDEV_NO_TMQ + /* if the kernel doesn't support Multiple TX queues, + * then use only one TX queue */ + login->tx_rings_num = 1; +#endif + vnic_dbg_mark(); + spin_lock_init(&login->lock); + spin_lock_init(&login->stats_lock); + rwlock_init(&login->mac_rwlock); + atomic_set(&login->vnic_child_cnt, 0); + vnic_mcast_root_init(&login->mcast_tree); + mutex_init(&login->moder_lock); + mutex_init(&login->state_lock); + SET_NETDEV_DEV(login->dev, login->port->dev->ca->dma_device); + INIT_DELAYED_WORK(&login->stats_task, vnic_do_get_stats); + INIT_DELAYED_WORK(&login->mcast_task, vnic_mcast_reattach); + INIT_DELAYED_WORK(&login->restart_task, vnic_restart_task); + + /* init ethtool */ + vnic_set_ethtool_ops(dev); + do { + login->dev->ethtool_ops->set_rx_csum(login->dev, 1); + if (login->dev->ethtool_ops->set_tx_csum(login->dev, 1)) + break; + if (login->dev->ethtool_ops->set_sg(login->dev, 1)) + break; + if (login->dev->ethtool_ops->set_tso(login->dev, 1)) + break; + } while (0); + + /* init NAPI (must be before LRO init) */ + login->napi_num = login->rx_rings_num; + for (i = 0; i < login->napi_num; ++i) { + if (vnic_napi_alloc(login, i)) { + vnic_err(login->name, "NAPI alloc %d failed\n", i); + goto free_napi; + } + } + +#if defined(NETIF_F_GRO) && !defined(_BP_NO_GRO) + login->dev->features |= NETIF_F_GRO; +#elif defined(NETIF_F_LRO) + login->lro_num = vnic_lro_num; + login->lro_mng_num = vnic_lro_num ? 
login->rx_rings_num : 0; + login->dev->features |= vnic_lro_num ? NETIF_F_LRO : 0; +#endif + for (i = 0; i < login->lro_mng_num; ++i) { + if (vnic_lro_enable(login, i)) { + vnic_err(login->name, "vnic_lro_enable %d failed\n", i); + goto free_lro; + } + } + + return dev; + +free_lro: + for (--i; i >= 0; --i) + vnic_lro_disable(login, i); + + i = login->napi_num; +free_napi: + for (--i; i >= 0; --i) + vnic_napi_dealloc(login, i); + vfree(login); +free_netdev: + free_netdev(dev); +err: + return ERR_PTR(-ENODEV); +} + +void vnic_free_netdev(struct vnic_login *login) +{ + int i; + + vnic_dbg_func(login->name); + + for (i = 0; i < login->lro_mng_num; ++i) + vnic_lro_disable(login, i); + for (i = 0; i < login->napi_num; ++i) + vnic_napi_dealloc(login, i); + free_netdev(login->dev); + vfree(login); +} diff --git a/drivers/net/mlx4_vnic/vnic_data_rx.c b/drivers/net/mlx4_vnic/vnic_data_rx.c new file mode 100644 index 0000000000000..193c091939599 --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_data_rx.c @@ -0,0 +1,678 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "vnic.h" +#include "vnic_data.h" + +#define FREE_SINGLE_FRAG(ring, e, i) \ + do { \ + ib_dma_unmap_single(ring->port->dev->ca, \ + ring->rx_info[e].dma_addr[i], \ + ring->frag_info[i].frag_size, \ + PCI_DMA_FROMDEVICE); \ + ring->rx_info[e].dma_addr[i] = 0; \ + put_page(ring->rx_info[e].frags[i].page); \ + } while (0); + +#ifndef _BP_NETDEV_NO_TMQ +/* this functions used only in no_bxm mode, + * it's not implemented in netdevice.h so we have it here + * based on netif_tx_lock() + */ +static inline int vnic_netif_tx_trylock(struct net_device *dev) +{ + int i, cpu; + + spin_lock(&dev->tx_global_lock); + cpu = smp_processor_id(); + for (i = 0; i < dev->num_tx_queues; ++i) { + struct netdev_queue *txq = netdev_get_tx_queue(dev, i); + if (__netif_tx_trylock(txq)) { + set_bit(__QUEUE_STATE_FROZEN, &txq->state); + __netif_tx_unlock(txq); + } else { + goto unlock; + } + } + + return 1; + +unlock: + /* based on netif_tx_unlock() */ + for (--i; i >= 0; --i) { + struct netdev_queue *txq = netdev_get_tx_queue(dev, i); + clear_bit(__QUEUE_STATE_FROZEN, &txq->state); + if (!test_bit(__QUEUE_STATE_XOFF, &txq->state)) + __netif_schedule(txq->qdisc); + } + spin_unlock(&dev->tx_global_lock); + + return 0; +} +#else +#define vnic_netif_tx_trylock(dev) netif_tx_trylock(dev) +#endif + +int vnic_rx(struct vnic_login *login, struct sk_buff *skb, struct ib_wc *wc) +{ + ASSERT(skb); + vnic_dbg_skb("RX", skb, (unsigned long)-1, (unsigned long)0); + + if (no_bxm) { + /* In no_bxm mode, we update neigh table based on ARP reqlies + * QPN & LID are retrieved from the IB completion + * ATTENTION: on RSS mode, make sure that ARPs are + * sent on base QPN + */ + struct vnic_neigh *neighe; + struct ethhdr *eth_hdr = (struct ethhdr *)skb->data; + struct arphdr *arp_hdr = (struct arphdr *)(skb->data + ETH_HLEN); + u16 eth_proto = ntohs(eth_hdr->h_proto); + u16 arp_proto = ntohs(arp_hdr->ar_op); + + if (eth_proto != ETH_P_ARP) + goto out; + if (arp_proto == ARPOP_REQUEST) + vnic_dbg_data(login->name, "ARP REQUEST\n"); + else + vnic_dbg_data(login->name, "ARP REPLY\n"); + + /* don't stop TX queue, only try, this way we avoid blocking + * IRQs in TX flow (performance wise). + * other vnic_neighe_* functions are not called in parallel + * to this flow (in no_bxm mode) + */ + if (!vnic_netif_tx_trylock(login->dev)) + goto out; + + neighe = vnic_neighe_search(login, eth_hdr->h_source); + if (!IS_ERR(neighe)) { + /* if IB address didn't change, do nothing */ + if (neighe->qpn == wc->src_qp && + neighe->lid == wc->slid) + goto unlock; + /* else, del old neigh entry, and add a new one */ + vnic_neighe_del(login, neighe); + vnic_neighe_dealloc(neighe); + } + + /* RSS: assume that your neighbours are like you */ + neighe = vnic_neighe_alloc(login, eth_hdr->h_source, + wc->slid, wc->src_qp, + login->rx_rings_num > 1 ? 
1 : 0); + if (IS_ERR(neighe)) + goto unlock; + if (vnic_neighe_add(login, neighe)) + vnic_neighe_dealloc(neighe); +unlock: + netif_tx_unlock(login->dev); + } +out: + + /* shared_vnic may receive PACKET_OTHERHOST + * we 'fix' the pkt_type here so the kernel + * won't drop it + */ + if (skb->pkt_type == PACKET_OTHERHOST && login->shared_vnic) + skb->pkt_type = PACKET_HOST; + + netif_receive_skb(skb); + + return 0; + +} + +struct sk_buff *vnic_alloc_rx_skb(struct vnic_rx_ring *ring, int buf_ind, + gfp_t gfp_flag) +{ + struct ib_device *ca = ring->port->dev->ca; + struct sk_buff *skb; + u64 mapping; + int buf_size = VNIC_BUF_SIZE(ring->port); + + skb = alloc_skb(buf_size, gfp_flag); + if (!skb) { + vnic_dbg_data(ring->port->name, + "alloc_skb for size %d failed\n", buf_size); + goto err_alloc; + } + + mapping = ib_dma_map_single(ca, skb->data, buf_size, DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping))) { + vnic_dbg_data(ring->port->name, + "ib_dma_map_single len %d failed\n", buf_size); + goto err_map; + } + + ring->rx_info[buf_ind].skb = skb; + ring->rx_info[buf_ind].dma_addr[0] = mapping; + + return skb; + +err_map: + dev_kfree_skb_any(skb); +err_alloc: + return NULL; +} + +static int frag_sizes[] = { + FRAG_SZ0, + FRAG_SZ1, + FRAG_SZ2, + FRAG_SZ3 +}; + +/* Calculate the last offset position that accomodates a full fragment + * (assuming fagment size = stride-align) + */ +static int vnic_last_alloc_offset(struct vnic_rx_ring *ring, u16 stride, u16 align) +{ + u16 res = VNIC_ALLOC_SIZE % stride; + u16 offset = VNIC_ALLOC_SIZE - stride - res + align; + + vnic_dbg_data(ring->port->name, "calculated last offset for stride:%d align:%d " + "res:%d offset:%d\n", stride, align, res, offset); + return offset; +} + +static int vnic_init_allocator(struct vnic_rx_ring *ring) +{ + struct vnic_rx_alloc *page_alloc; + int i; + + if (vnic_rx_linear) + return 0; + + for (i = 0; i < ring->num_frags; i++) { + page_alloc = &ring->page_alloc[i]; + page_alloc->page = alloc_pages(GFP_ATOMIC | __GFP_COMP, VNIC_ALLOC_ORDER); + if (!page_alloc->page) + goto out; + + page_alloc->offset = ring->frag_info[i].frag_align; + vnic_dbg_data(ring->port->name, "Initialized allocator:%d with page:%p\n", + i, page_alloc->page); + } + return 0; + +out: + while (i--) { + page_alloc = &ring->page_alloc[i]; + if (page_alloc->page) { + put_page(page_alloc->page); + page_alloc->page = NULL; + } + } + return -ENOMEM; +} + +static void vnic_destroy_allocator(struct vnic_rx_ring *ring) +{ + struct vnic_rx_alloc *page_alloc; + int i; + + if (vnic_rx_linear) + return; + + for (i = 0; i < ring->num_frags; i++) { + page_alloc = &ring->page_alloc[i]; + vnic_dbg_data(ring->port->name, "Freeing allocator:%d count:%d\n", + i, page_count(page_alloc->page)); + if (page_alloc->page) { + put_page(page_alloc->page); + page_alloc->page = NULL; + } + } +} + +/* + * allocate a single fragment on a single ring entry and map it + * to HW address. 
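+ *
+ * Descriptive note on the allocator: fragments are carved from compound
+ * pages of order VNIC_ALLOC_ORDER. page_alloc[i] remembers the current
+ * page and offset for fragment slot i: while offset < last_offset the
+ * same page is reused (get_page() takes an extra reference and the offset
+ * advances by frag_stride); once exhausted, the final chunk goes out with
+ * the allocator's own page reference and a freshly allocated page takes
+ * its place, with the offset reset to frag_align.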
+ */ +static int vnic_alloc_frag(struct vnic_rx_ring *ring, + struct vnic_frag_data *frags_data, int i) +{ + struct vnic_frag_info *frag_info = &ring->frag_info[i]; + struct vnic_rx_alloc *page_alloc = &ring->page_alloc[i]; + struct skb_frag_struct *skb_frags = &frags_data->frags[i]; + struct skb_frag_struct skbf = *skb_frags; + struct page *page; + struct ib_device *ib_device = ring->port->dev->ca; + u64 dma; + int decision; + + if (vnic_rx_linear) + return 0; + + if (page_alloc->offset >= frag_info->last_offset) { + decision = 0; + /* Allocate new page */ + page = alloc_pages(GFP_ATOMIC | __GFP_COMP, VNIC_ALLOC_ORDER); + if (!page) { + /*frags_data->dma_addr[i] = NULL; + ring->rx_info[wr_id].info = VNIC_FRAG_ALLOC_FAIL; + ring->need_refill = 1; */ + return -ENOMEM; + } + skbf.page = page_alloc->page; + skbf.page_offset = page_alloc->offset; + } else { + decision = 1; + page = page_alloc->page; + get_page(page); + skbf.page = page; + skbf.page_offset = page_alloc->offset; + } + + skbf.size = frag_info->frag_size; + + dma = ib_dma_map_single(ib_device, page_address(skbf.page) + + skbf.page_offset, frag_info->frag_size, + PCI_DMA_FROMDEVICE); + if (unlikely(ib_dma_mapping_error(ib_device, dma))) { + vnic_dbg_data(ring->port->name, + "ib_dma_map_single len %d failed\n", + frag_info->frag_size); + put_page(page); + return -ENOMEM; + } + + if (!decision) { + page_alloc->page = page; + page_alloc->offset = frag_info->frag_align; + } else + page_alloc->offset += frag_info->frag_stride; + + *skb_frags = skbf; + frags_data->dma_addr[i] = dma; + + return 0; +} + +void vnic_calc_rx_buf(struct vnic_rx_ring *ring) +{ + int eff_mtu = VNIC_BUF_SIZE(ring->port), buf_size = 0, i = 0; + + if (vnic_rx_linear) { + ring->num_frags = 1; + return; + } + + while (buf_size < eff_mtu) { + ring->frag_info[i].frag_size = + (eff_mtu > buf_size + frag_sizes[i]) ? 
+ frag_sizes[i] : eff_mtu - buf_size; + ring->frag_info[i].frag_prefix_size = buf_size; + if (!i) { + ring->frag_info[i].frag_align = NET_IP_ALIGN; + ring->frag_info[i].frag_stride = + ALIGN(frag_sizes[i] + NET_IP_ALIGN, SMP_CACHE_BYTES); + } else { + ring->frag_info[i].frag_align = 0; + ring->frag_info[i].frag_stride = + ALIGN(frag_sizes[i], SMP_CACHE_BYTES); + } + ring->frag_info[i].last_offset = + vnic_last_alloc_offset(ring, + ring->frag_info[i].frag_stride, + ring->frag_info[i].frag_align); + buf_size += ring->frag_info[i].frag_size; + i++; + } + + ring->num_frags = i; + ring->rx_skb_size = eff_mtu; + ring->log_rx_info = ROUNDUP_LOG2(i * sizeof(struct skb_frag_struct)); + + vnic_dbg(ring->port->name, "Rx buffer scatter-list (ring %d effective-mtu:%d " + "num_frags:%d):\n", ring->index ,eff_mtu, ring->num_frags); + for (i = 0; i < ring->num_frags; i++) { + vnic_dbg(ring->port->name, "frag:%d - size:%d prefix:%d align:%d " + "stride:%d last_offset:%d\n", i, + ring->frag_info[i].frag_size, + ring->frag_info[i].frag_prefix_size, + ring->frag_info[i].frag_align, + ring->frag_info[i].frag_stride, + ring->frag_info[i].last_offset); + } +} + +static void vnic_empty_rx_entry(struct vnic_rx_ring *ring, int i) +{ + int frag_num, buf_size = VNIC_BUF_SIZE(ring->port); + struct ib_device *ca = ring->port->dev->ca; + struct sk_buff *skb; + u64 mapping; + + if (vnic_rx_linear) { + for (frag_num = 0; frag_num < ring->num_frags; frag_num++) { + mapping = ring->rx_info[i].dma_addr[0]; + skb = ring->rx_info[i].skb; + if (mapping) + ib_dma_unmap_single(ca, mapping, buf_size, DMA_FROM_DEVICE); + if (skb) + dev_kfree_skb_any(skb); + } + + return; + } + + /* non linear buffers */ + for (frag_num = 0; frag_num < ring->num_frags; frag_num++) + FREE_SINGLE_FRAG(ring, i, frag_num); +} + +static int vnic_fill_rx_buffer(struct vnic_rx_ring *ring) +{ + struct vnic_frag_data *frags_data = &ring->rx_info[0]; + struct sk_buff *skb; + struct ib_device *ca = ring->port->dev->ca; + int buf_ind, frag_num, buf_size = VNIC_BUF_SIZE(ring->port); + u64 mapping; + + if (vnic_rx_linear) { + for (buf_ind = 0; buf_ind < ring->size; buf_ind++) { + skb = vnic_alloc_rx_skb(ring, buf_ind, GFP_KERNEL); + if (!skb) + goto err_linear; + } + + return 0; + } + + /* non linear buffers */ + for (buf_ind = 0; buf_ind < ring->size; buf_ind++, frags_data++) { + for (frag_num = 0; frag_num < ring->num_frags; frag_num++) { + if (vnic_alloc_frag(ring, frags_data, frag_num)) + goto err_frags; + } + } + + return 0; + +err_linear: + for (buf_ind = 0; buf_ind < ring->size; buf_ind++) { + mapping = ring->rx_info[buf_ind].dma_addr[0]; + skb = ring->rx_info[buf_ind].skb; + if (mapping) + ib_dma_unmap_single(ca, mapping, buf_size, DMA_FROM_DEVICE); + if (skb) + dev_kfree_skb_any(skb); + } + + return -ENOMEM; + +err_frags: + for (--frag_num; frag_num >= 0; frag_num--) + FREE_SINGLE_FRAG(ring, buf_ind, frag_num); + + for (--buf_ind; buf_ind >= 0; buf_ind--) + vnic_empty_rx_entry(ring, buf_ind); + + return -ENOMEM; +} + +/* + * free entire ring full of fragments. 
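+ *
+ * Every ring entry is released through vnic_empty_rx_entry() (DMA unmap
+ * plus page/skb release) and ring->size is then reset to 0.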
+*/ +static void vnic_empty_rx_buffer(struct vnic_rx_ring *ring) +{ + int buf_ind; + + for (buf_ind = 0; buf_ind < ring->size; buf_ind++) + vnic_empty_rx_entry(ring, buf_ind); + + ring->size = 0; +} + +void vnic_destroy_rx_ring(struct vnic_rx_ring *ring) +{ + if (!ring) + return; + vnic_empty_rx_buffer(ring); + vnic_destroy_allocator(ring); + vfree(ring->rx_info); + vnic_ib_free_ring(ring); + kfree(ring); +} + +int vnic_unmap_and_replace_rx(struct vnic_rx_ring *ring, struct ib_device *dev, + struct skb_frag_struct *skb_frags_rx, + u64 wr_id, int length) +{ + struct vnic_frag_info *frag_info; + struct vnic_frag_data *rx_info = &ring->rx_info[wr_id]; + + int nr; + dma_addr_t dma; + + /* Collect used fragments while replacing them in the HW descriptors */ + for (nr = 0; nr < ring->num_frags; nr++) { + frag_info = &ring->frag_info[nr]; + if (length <= frag_info->frag_prefix_size) + break; + + /* Save page reference in skb */ + skb_frags_rx[nr].page = rx_info->frags[nr].page; + skb_frags_rx[nr].size = rx_info->frags[nr].size; + skb_frags_rx[nr].page_offset = rx_info->frags[nr].page_offset; + dma = rx_info->dma_addr[nr]; + + /* Allocate a replacement page */ + if (vnic_alloc_frag(ring, rx_info, nr)) + goto fail; + + /* Unmap buffer */ + ib_dma_unmap_single(dev, dma, skb_frags_rx[nr].size, + PCI_DMA_FROMDEVICE); + } + + /* Adjust size of last fragment to match actual length */ + if (nr > 0) + skb_frags_rx[nr - 1].size = length - + ring->frag_info[nr - 1].frag_prefix_size; + return nr; + +fail: + /* Drop all accumulated fragments (which have already been replaced in + * the descriptor) of this packet; remaining fragments are reused... */ + while (nr > 0) { + nr--; + put_page(skb_frags_rx[nr].page); + } + + return 0; +} + +int vnic_rx_skb(struct vnic_login *login, struct vnic_rx_ring *ring, + struct ib_wc *wc, int ip_summed, char *eth_hdr_va) +{ + u64 wr_id = (unsigned int)wc->wr_id; + struct sk_buff *skb; + int used_frags; + char *va = eth_hdr_va; + int length = wc->byte_len - VNIC_EOIB_HDR_SIZE - VNIC_VLAN_OFFSET(login), + linear_length = (length <= SMALL_PACKET_SIZE) ? + length : SMALL_PACKET_SIZE, hdr_len = min(length, HEADER_COPY_SIZE), + offest = NET_IP_ALIGN + 16; + struct ib_device *ib_dev = login->port->dev->ca; + + /* alloc a small linear SKB */ + skb = alloc_skb(linear_length + offest, GFP_ATOMIC); + if (unlikely(!skb)) + return -ENOMEM; + + skb_record_rx_queue(skb, ring->index); + skb_reserve(skb, offest); + + if (vnic_linear_small_pkt && length <= SMALL_PACKET_SIZE) { + u64 dma; + + /* We are copying all relevant data to the skb - temporarily + * synch buffers for the copy + */ + dma = ring->rx_info[wr_id].dma_addr[0] + VNIC_EOIB_HDR_SIZE + + VNIC_VLAN_OFFSET(login); + ib_dma_sync_single_for_cpu(ib_dev, dma, length, + DMA_FROM_DEVICE); + skb_copy_to_linear_data(skb, va, length); + ib_dma_sync_single_for_device(ib_dev, dma, length, + DMA_FROM_DEVICE); + skb->tail += length; + } else { + /* unmap the needed fragmentand reallocate them. Fragments that + * were not used will not be reused as is. 
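+		 * (vnic_unmap_and_replace_rx() returns the number of fragments
+		 * consumed by this packet; 0 means a replacement fragment could
+		 * not be allocated, and the skb is freed below.)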
*/ + used_frags = vnic_unmap_and_replace_rx(ring, ib_dev, + skb_shinfo(skb)->frags, + wr_id, wc->byte_len); + if (!used_frags) + goto free_and_repost; + + skb_shinfo(skb)->nr_frags = used_frags; + + /* Copy headers into the skb linear buffer */ + memcpy(skb->data, va, hdr_len); + skb->tail += hdr_len; + /* Skip headers in first fragment */ + skb_shinfo(skb)->frags[0].page_offset += + (VNIC_EOIB_HDR_SIZE + VNIC_VLAN_OFFSET(login) + + hdr_len); + + /* Adjust size of first fragment */ + skb_shinfo(skb)->frags[0].size -= + (VNIC_EOIB_HDR_SIZE + VNIC_VLAN_OFFSET(login) + + hdr_len); + skb->data_len = length - hdr_len; + } + + /* update skb fields */ + skb->len = length; + skb->truesize = length + sizeof(struct sk_buff); + skb->ip_summed = ip_summed; + skb->dev = login->dev; + skb->protocol = eth_type_trans(skb, skb->dev); + + return vnic_rx(login, skb, wc); + +free_and_repost: + dev_kfree_skb(skb); + return -ENODEV; + +} + +static void vnic_set_rx_sge(struct vnic_rx_ring *ring) +{ + int i; + + ring->wr.num_sge = ring->num_frags; + ring->wr.next = NULL; + ring->wr.sg_list = ring->sge; + for (i = 0; i < ring->num_frags; ++i) { + ring->sge[i].lkey = ring->port->mr->lkey; + ring->sge[i].length = ring->frag_info[i].frag_size; + } +} + +struct vnic_rx_ring *vnic_create_rx_ring(struct vnic_port *port, int index) +{ + int rc, rx_info, size = vnic_rx_rings_len; + struct vnic_rx_ring *ring; + + ring = kzalloc(sizeof *ring, GFP_KERNEL); + if (!ring) + return ERR_PTR(-ENOMEM); + + /* init attributes */ + ring->port = port; + ring->size = size; + ring->index = index; + spin_lock_init(&ring->lock); + + /* init rx ring IB resources */ + if (vnic_ib_init_ring(ring)) { + vnic_err(port->name, "vnic_ib_init_ring failed\n"); + goto free_ring; + } + + rx_info = size * roundup_pow_of_two(sizeof(struct vnic_frag_data)); + ring->rx_info = vmalloc(rx_info); + if (!ring->rx_info) { + vnic_err(port->name, "Failed allocating rx_info ring" + " (%d bytes)\n", rx_info); + goto free_ib; + } + memset(ring->rx_info, 0, rx_info); + + /* determine the sizes of the fragments as result of mtu */ + vnic_calc_rx_buf(ring); + + rc = vnic_init_allocator(ring); + if (rc) { + vnic_err(port->name, "Failed initializing ring" + " allocator %d\n", rc); + goto free_rxinfo; + } + + rc = vnic_fill_rx_buffer(ring); + if (rc) { + vnic_err(port->name, "vnic_fill_rx_buffer failed %d\n", rc); + goto free_allocator; + } + + /* set rx WQEs drafts */ + vnic_set_rx_sge(ring); + + /* Initailize all descriptors and post to srq */ + rc = vnic_post_recvs(ring); + if (rc) { + vnic_err(port->name, "vnic_post_recvs failed %d\n", rc); + goto free_rx_buffer; + } + + return ring; + +free_rx_buffer: + /* TODO: we are freeing posted packets need to move SRQ + * to error and free them first + */ + vnic_empty_rx_buffer(ring); +free_allocator: + vnic_destroy_allocator(ring); +free_rxinfo: + vfree(ring->rx_info); +free_ib: + vnic_ib_free_ring(ring); +free_ring: + kfree(ring); + + return ERR_PTR(-EINVAL); +} diff --git a/drivers/net/mlx4_vnic/vnic_data_tx.c b/drivers/net/mlx4_vnic/vnic_data_tx.c new file mode 100644 index 0000000000000..f6ed800669d0d --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_data_tx.c @@ -0,0 +1,476 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_data.h" + +/* Push VLAN & EoIB headers and calculate RSS hash value + * We do the RSS hash here because we already check IP|TCP|UDP + * in this function for EoIB fields, so we make use of that + * and do RSS too. + */ +static struct eoibhdr eoib_h_draft = { + .encap_data = ((VNIC_EOIB_HDR_VER << 4) | (VNIC_EOIB_HDR_SIG << 6)), + .seg_off = 0, + .seg_id = 0 +}; + +void vnic_learn_mac(struct net_device *dev, u8 *mac, int remove) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + vnic_dbg_func(login->name); + + /* skip invalid address */ + if (unlikely(!is_valid_ether_addr(mac))) + return; + + /* skip parent vNic address (original dev_addr) */ + if (!(memcmp(login->dev_addr, mac, ETH_ALEN))) + return; + + vnic_dbg_mac(login->name, "learn mac "MAC_6_PRINT_FMT"\n", + MAC_6_PRINT_ARG(mac)); + + /* update child vNic list, ignore returned code */ + read_lock_bh(&login->mac_rwlock); + vnic_child_update(login, mac, remove); + read_unlock_bh(&login->mac_rwlock); +} + +u32 vnic_hash(struct net_device *dev, struct sk_buff *skb) +{ + struct tcphdr *tr_h = tcp_hdr(skb); + struct iphdr *ip_h = ip_hdr(skb); + struct ipv6hdr *ip_h6 = (struct ipv6hdr *)ip_h; + u32 hash = 0, addrlen, i; + + /* All mcast traffic is sent and received on 1st queue + * because only the 1st QP is attached to the MGIDs + * TODO: consider distributing tx/rx mcast traffic as well + */ + if (is_multicast_ether_addr(skb_mac_header(skb))) + goto out; + + switch (ntohs(skb->protocol)) { + case ETH_P_IP: + /* In IPv4, access TCP/UDP header only when IP packet is not + * fragmented: flags == DF == 0x02. 
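+			 * (ntohs(frag_off) >> 13 keeps only the three IPv4 flag
+			 * bits; a value of 0x2 means DF is set and MF is clear,
+			 * i.e. the datagram was not fragmented.)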
+ */ + if (ntohs(ip_h->frag_off) >> 13 == 0x2 && + (ip_h->protocol == IPPROTO_TCP || + ip_h->protocol == IPPROTO_UDP)) { + hash ^= (u32)ntohl(ip_h->saddr); + hash ^= (u32)ntohl(ip_h->daddr); + hash ^= (u32)ntohs(tr_h->source); + hash ^= (u32)ntohs(tr_h->dest); + } + break; + case ETH_P_IPV6: + /* In IPv6, access TCP/UDP header only when IP packet is not + * fragmented: main header nexthdr field points to TCP/UDP + */ + if (ip_h6->nexthdr == IPPROTO_TCP || + ip_h6->nexthdr == IPPROTO_UDP) { + addrlen = ARRAY_LEN(ip_h6->saddr.in6_u.u6_addr32); + for (i = 0; i < addrlen; ++i) { + hash ^= (u32)ntohl(ip_h6->saddr.in6_u.u6_addr32[i]); + hash ^= (u32)ntohl(ip_h6->daddr.in6_u.u6_addr32[i]); + } + tr_h = (struct tcphdr *)((void *)ip_h6 + sizeof *ip_h6); + hash ^= (u32)ntohs(tr_h->source); + hash ^= (u32)ntohs(tr_h->dest); + } + } +out: + VNIC_SKB_SET_HASH(skb, hash); + return hash; +} + +u8 vnic_lag_hash(struct sk_buff *skb, u16 hash_mask, u16 vid) +{ + struct tcphdr *tr_h = tcp_hdr(skb); + struct iphdr *ip_h = ip_hdr(skb); + struct ipv6hdr *ip_h6 = (struct ipv6hdr *)ip_h; + u32 hash = 0, addrlen, i; + struct ethhdr *eth = (struct ethhdr *)skb_mac_header(skb); + u32 hash_dmac, hash_smac, hash_prot, hash_vid; + u32 hash_sip = 0, hash_dip = 0, hash_sp = 0, hash_dp = 0; + u8 res_hash; + u8 *tmp; + + hash_dmac = *(u32 *)(ð->h_dest[ETH_ALEN - sizeof hash_smac]); + hash_smac = *(u32 *)(ð->h_source[ETH_ALEN - sizeof hash_smac]); + hash_prot = (u32)ntohs(skb->protocol); + hash_vid = (u32)vid; + + if (hash_mask & GW_LAG_LAYER_2_3) { + switch (hash_prot) { + case ETH_P_IP: + /* In IPv4, access TCP/UDP header only when IP packet is not + * fragmented: flags == DF == 0x02. + */ + if (ntohs(ip_h->frag_off) >> 13 == 0x2 && + (ip_h->protocol == IPPROTO_TCP || + ip_h->protocol == IPPROTO_UDP)) { + hash_sip = (u32)(ip_h->saddr); + hash_dip = (u32)(ip_h->daddr); + hash_sp = (u32)(tr_h->source); + hash_dp = (u32)(tr_h->dest); + } + break; + case ETH_P_IPV6: + /* In IPv6, access TCP/UDP header only when IP packet is not + * fragmented: main header nexthdr field points to TCP/UDP + */ + if (ip_h6->nexthdr == IPPROTO_TCP || + ip_h6->nexthdr == IPPROTO_UDP) { + addrlen = ARRAY_LEN(ip_h6->saddr.in6_u.u6_addr32); + for (i = 0; i < addrlen; ++i) { + hash_sip ^= (u32)(ip_h6->saddr.in6_u.u6_addr32[i]); + hash_dip ^= (u32)(ip_h6->daddr.in6_u.u6_addr32[i]); + } + tr_h = (struct tcphdr *)((void *)ip_h6 + sizeof *ip_h6); + hash_sp = (u32)(tr_h->source); + hash_dp = (u32)(tr_h->dest); + } + } + } + + hash ^= (hash_mask & GW_LAG_HASH_DMAC) ? hash_dmac : 0; + hash ^= (hash_mask & GW_LAG_HASH_SMAC) ? hash_smac : 0; + hash ^= (hash_mask & GW_LAG_HASH_TPID) ? hash_prot : 0; + hash ^= (hash_mask & GW_LAG_HASH_VID) ? hash_vid : 0; + hash ^= (hash_mask & GW_LAG_HASH_SIP) ? hash_sip : 0; + hash ^= (hash_mask & GW_LAG_HASH_DIP) ? hash_dip : 0; + hash ^= (hash_mask & GW_LAG_HASH_SPORT) ? hash_sp : 0; + hash ^= (hash_mask & GW_LAG_HASH_DPORT) ? 
hash_dp : 0; + + tmp = (u8 *)&hash; + res_hash = tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3]; + + return res_hash; +} + +static inline int vnic_header_encap(struct sk_buff *skb) +{ + struct vnic_login *login = vnic_netdev_priv(skb->dev); + struct eoibhdr *eoib_h; + struct iphdr *ip_h = ip_hdr(skb); + struct ipv6hdr *ip_h6 = (struct ipv6hdr *)ip_h; + + /* push VLAN header + * TODO: when VID iz zero, push header only when prio exists, i.e.: + * if (VNIC_VLAN_ENABLED(login) && (login->vid || login->user_prio)) + */ + if (VNIC_VLAN_ENABLED(login) && login->vid) { + struct vlan_ethhdr *veth = + (struct vlan_ethhdr *)skb_push(skb, VLAN_HLEN); + ASSERT(veth); + vnic_dbg_data_v(login->name, "push vlan tag with ID %u\n", + be16_to_cpu(login->vid)); + memmove(skb->data, skb->data + VLAN_HLEN, 2 * ETH_ALEN); + veth->h_vlan_proto = htons(ETH_P_8021Q); + veth->h_vlan_TCI = login->vid; + } + + /* push EoIB header */ + if (vnic_encap_headroom) + skb_push(skb, VNIC_ENCAP_LEN); + + /* reset MAC header here, it can be changed for the following reasons: + * - vnic_encap_headroom is set, thus EoIB header is pushed + * - VLAN is enabled, thus VLAN header is pushed + * - some kernels (e.g., 2.6.18-194.el5) call dev_hard_start_xmit() + * without setting the mac header pointer + */ + skb_set_mac_header(skb, VNIC_SKB_GET_ENCAP_OFFSET); + + /* enforce source mac*/ + if (vnic_src_mac_enforce) + memcpy(skb_mac_header(skb) + ETH_ALEN, + login->dev->dev_addr, ETH_ALEN); + + /* set EoIB header VER/SIG, others set to zero */ + eoib_h = VNIC_SKB_GET_ENCAP(skb); + *eoib_h = eoib_h_draft; + + /* set EoIB header IP_CHK */ + switch (ntohs(skb->protocol)) { + case ETH_P_IP: + VNIC_EOIB_HDR_SET_IP_CHK_OK(eoib_h); + if (ip_h->protocol == IPPROTO_TCP) + VNIC_EOIB_HDR_SET_TCP_CHK_OK(eoib_h); + else if (ip_h->protocol == IPPROTO_UDP) + VNIC_EOIB_HDR_SET_UDP_CHK_OK(eoib_h); + break; + case ETH_P_IPV6: + VNIC_EOIB_HDR_SET_IP_CHK_OK(eoib_h); + if (ip_h6->nexthdr == IPPROTO_TCP) + VNIC_EOIB_HDR_SET_TCP_CHK_OK(eoib_h); + else if (ip_h6->nexthdr == IPPROTO_UDP) + VNIC_EOIB_HDR_SET_UDP_CHK_OK(eoib_h); + } + +#ifdef _BP_NETDEV_NO_TMQ + /* if TSS is enabled, use the hash value calculated by + * vnic_select_queue() otherwise call vnic_hash() + */ + vnic_hash(skb->dev, skb); +#endif + + return 0; +} + +static int vnic_ucast_send(struct vnic_login *login, struct sk_buff *skb, + struct neighbour *neighbour, int tx_res_index) +{ + struct vnic_neigh *neighe; + int hash; + + neighe = vnic_neighe_search(login, skb_mac_header(skb)); + if (IS_ERR(neighe)) { + vnic_dbg_data(login->name, "no dst_neigh and no vnic_neigh - " + "gw unicast packet\n"); + + /* for egress unicast traffic of a shared vnic, + * replace src mac by shared mac + */ + if (login->shared_vnic) + memcpy(skb_mac_header(skb) + ETH_ALEN, + login->shared_mac, ETH_ALEN); + + if (!login->is_lag) + neighe = login->gw_neigh; + else { + if (unlikely(!login->lag_member_active_count)) + return -ENOENT; + + /* use hash value precomputed and mapping to find LAG GW to send to */ + hash = vnic_lag_hash(skb, login->lag_prop.hash_mask, login->vid); + hash = hash % LAG_MAP_TABLE_SIZE; + neighe = &login->lag_gw_neigh[login->lag_gw_map[hash]].neigh; + } + + /* update GW statistics */ + VNIC_STATS_ADD(login->port_stats.gw_tx_bytes, skb->len); + VNIC_STATS_INC(login->port_stats.gw_tx_packets); + } else { + vnic_dbg_data(login->name, + "no dst_neigh but vnic_neigh exists - " + "local unicast packet\n"); + } + + /* TODO: in VNIC_NEIGH_GET_DQPN use neigh qps_num instead of login */ + vnic_dbg_data(login->name, 
"vnic_send to (base qpn 0x%06x) dqpn 0x%06x" + " dlid 0x%08x %s\n", neighe->qpn, + VNIC_NEIGH_GET_DQPN(skb, neighe), neighe->lid, + neighe == login->gw_neigh ? "[GW]" : ""); + vnic_send(login, skb, neighe->ah, VNIC_NEIGH_GET_DQPN(skb, neighe), tx_res_index); + + return 0; +} + +void vnic_mcast_send(struct vnic_login *login, struct sk_buff *skb, int tx_res_index) +{ + struct vnic_mcast *mcaste; + union vhub_mgid mgid; + struct ethhdr *eth; + struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index]; + struct ib_ah_attr *av = &tx_res->mcast_av; + struct ib_ah *ah; + u16 gw_id; + int hash; + + eth = (struct ethhdr *)skb_mac_header(skb); + + /* for LAG GW, perform hashing on mcast address */ + if (login->is_lag && login->lag_member_active_count) { + hash = vnic_lag_hash(skb, login->lag_prop.hash_mask, login->vid); + hash = hash % LAG_MAP_TABLE_SIZE; + gw_id = login->lag_gw_neigh[login->lag_gw_map[hash]].gw_id; + } + else + gw_id = login->gw_port_id; + + /* retrieve the mlid */ + vhub_mgid_create(login->mgid_prefix, ETH_ZERO_MAC, login->n_mac_mcgid, + CREATE_VHUB_ID(login->vid, gw_id), + VHUB_MGID_DATA, 0, &mgid); + + spin_lock(&login->mcast_tree.mcast_rb_lock); + mcaste = vnic_mcast_search(&login->mcast_tree, &mgid.ib_gid); + if (unlikely(IS_ERR(mcaste) || !mcaste->ah)) { + vnic_dbg_data(login->name, "couldn't find mcaste for " + MAC_6_PRINT_FMT"\n", + MAC_6_PRINT_ARG(eth->h_dest)); + spin_unlock(&login->mcast_tree.mcast_rb_lock); + goto drop; + } + + spin_lock(&mcaste->lock); + vhub_mgid_create(login->mgid_prefix, eth->h_dest, login->n_mac_mcgid, + CREATE_VHUB_ID(login->vid, gw_id), + vnic_mgid_data_type, 0, &mgid); + vnic_dbg_mcast_v(login->name, "sending to ETH "MAC_6_PRINT_FMT"-> " + "GID "VNIC_GID_FMT" (mask %d bit)\n", + MAC_6_PRINT_ARG(eth->h_dest), + VNIC_GID_ARG(mgid.ib_gid), + login->n_mac_mcgid); + + av->dlid = be16_to_cpu(mcaste->port_mcaste->rec.mlid); + av->static_rate = mcaste->port_mcaste->rec.rate; + av->sl = mcaste->port_mcaste->rec.sl; + memcpy(&av->grh.dgid, mgid.ib_gid.raw, GID_LEN); + + ah = ib_create_ah(login->port->pd, av); + spin_unlock(&mcaste->lock); + spin_unlock(&login->mcast_tree.mcast_rb_lock); + + if (!ah) + goto drop; + + vnic_send(login, skb, ah, IB_MULTICAST_QPN, tx_res_index); + ib_destroy_ah(ah); + /* used as a counter for multicast TX packets (not RX) */ + VNIC_STATS_DO_INC(tx_res->stats.multicast); + + return; + +drop: + VNIC_STATS_DO_INC(tx_res->stats.tx_dropped); + dev_kfree_skb_any(skb); +} + +int vnic_tx(struct sk_buff *skb, struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int tx_res_index = 0, headroom = dev->hard_header_len - ETH_HLEN; + struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index]; + + ASSERT(dev); + ASSERT(skb); + +#ifdef VNIC_PROFILLNG + login->prof_arr[login->prof_arr_it].cnt++; + /* copy only fields for reporting, data buffer is invalid */ + login->prof_arr[login->prof_arr_it].skb = *skb; + login->prof_arr[login->prof_arr_it].skb.data = NULL; + login->prof_arr[login->prof_arr_it].tstamp = current_kernel_time(); + login->prof_arr[login->prof_arr_it].jiffies = jiffies; + login->prof_arr[login->prof_arr_it].nr_frags = skb_shinfo(skb)->nr_frags; + login->prof_arr_it = (login->prof_arr_it + 1) % VNIC_PROFILLNG_SKB_MAX; + +#endif + + /* drop zero length skbs */ + if (unlikely(!skb->len)) + goto drop; + + /* sometimes, vnic_tx is called before carrier is up FM #100882 */ + if (unlikely(!test_bit(VNIC_STATE_LOGIN_CARRIER_ON, &login->state))) + goto drop; + + /* check headroom and reallocate skb if needed: + 
* If VLAN used: need VLAN_HLEN (4) Bytes + * If vnic_encap_headroom set: need VNIC_ENCAP_LEN (4) Bytes + * when vnic_encap_headroom is clear, we do not encap EoIB header + * into the headroom, but rather use additional SG entry to hold it + */ + if (unlikely(skb_headroom(skb) < headroom)) { + struct sk_buff *skb_new; + + skb_new = skb_realloc_headroom(skb, headroom); + if (!skb_new) + goto drop; + dev_kfree_skb(skb); + skb = skb_new; + VNIC_STATS_INC(login->port_stats.realloc_packets); + } + + /* don't use dev->header_ops, use vnic_header_encap() inline + * function instead, because when raw socket is used or BR_CTL mode + * then header_ops are not called as expected, and we'll end up sending + * the packet without EoIB header + */ + if (unlikely(vnic_header_encap(skb))) + goto drop; + + /* in promiscuous mode, learn the source mac */ + if (is_ucast_promisc(login) && vnic_learn_mac_enabled) + vnic_learn_mac(dev, skb_mac_header(skb) + ETH_ALEN, 0); + + /* get TX resource for this SKB, keep it after vnic_header_encap() + * so if we don't have kernel multiple queue support we use the + * RSS hash result for TSS + */ + tx_res_index = VNIC_TXQ_GET_HASH(skb, login->real_tx_rings_num); + ASSERT(tx_res_index <= login->tx_rings_num); + tx_res = &login->tx_res[tx_res_index]; + + + /* send ucast/mcast packet */ + vnic_dbg_skb("TX", skb, (unsigned long)(vnic_encap_headroom ? 0 : -1), + (unsigned long)(vnic_encap_headroom ? VNIC_ENCAP_LEN : 0)); +#if 0 /* neighbour caching disabled */ + if (likely(skb->dst && skb->dst->neighbour)) { + if (is_multicast_ether_addr(skb_mac_header(skb))) { + vnic_dbg_data(login->name, + "dst_neigh exists but no vnic_neigh - " + "multicast packet\n"); + vnic_mcast_send(login, skb, tx_res_index); + } else { + vnic_dbg_data(login->name, + "dst_neigh exists but no vnic_neigh - " + "unicast packet\n"); + vnic_ucast_send(login, skb, skb->dst->neighbour, tx_res_index); + } + } else +#endif + { + if (is_multicast_ether_addr(skb_mac_header(skb))) { + vnic_dbg_data(login->name, + "no dst_neigh - multicast packet\n"); + vnic_mcast_send(login, skb, tx_res_index); + } else { + vnic_dbg_data(login->name, + "no dst_neigh - unicast packet\n"); + vnic_ucast_send(login, skb, NULL, tx_res_index); + } + } + + return NETDEV_TX_OK; + +drop: + dev_kfree_skb(skb); + return NETDEV_TX_OK; +} diff --git a/drivers/net/mlx4_vnic/vnic_fip.h b/drivers/net/mlx4_vnic/vnic_fip.h new file mode 100644 index 0000000000000..cd154b51eaa78 --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_fip.h @@ -0,0 +1,1025 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _VNIC_FIP_H +#define _VNIC_FIP_H + +#include "vnic.h" + + +#define FIP_TYPE(FIPT) FIP_TYPE_##FIPT +#define FIP_TYPE_IDX(FIPT) FIP_TYPE_IDX_##FIPT + +#define FIP_CASE(FIPT) case FIP_TYPE(FIPT): return FIP_TYPE_IDX(FIPT) + +#define FIP_CASE_STR(FIPT) case FIP_TYPE(FIPT): return # FIPT +#define FIP_SUBCODE_CASE_STR(SUBCODE) case (SUBCODE): return # SUBCODE + +#define FIP_MASK(FIPT) (((u64)1) << FIP_TYPE_IDX(FIPT)) + +#define ADV_EXT_TYPE(FIPT) ADV_EXT_TYPE_##FIPT +#define ADV_EXT_IDX(FIPT) ADV_EXT_IDX_##FIPT + +#define GUID_FORMAT "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x" +#define MGID_PREFIX_FMT "%02x:%02x:%02x:%02x:%02x" +#define GUID_ARG(g) (g)[0], (g)[1], (g)[2], (g)[3], (g)[4], (g)[5], (g)[6], (g)[7] +#define MGID_PRE_ARG(g) (g)[0], (g)[1], (g)[2], (g)[3], (g)[4] + +enum { + FIP_TYPE(VENDOR_ID) = 13, + FIP_TYPE(ADDRESS) = 240, + FIP_TYPE(GW_INFORMATION)= 241, + FIP_TYPE(LOGIN) = 242, + FIP_TYPE(VHUB_UPDATE) = 243, + FIP_TYPE(VHUB_TABLE) = 244, + FIP_TYPE(VNIC_IDENTITY) = 245, + FIP_TYPE(PARTITION) = 246, + FIP_TYPE(GW_IDENTIFIER) = 248, + FIP_TYPE(KA_PARAMS) = 249, + FIP_TYPE(EXT_DESC) = 254, +}; + +enum { + FIP_TYPE_IDX(VENDOR_ID), + FIP_TYPE_IDX(ADDRESS), + FIP_TYPE_IDX(GW_INFORMATION), + FIP_TYPE_IDX(LOGIN), + FIP_TYPE_IDX(VHUB_UPDATE), + FIP_TYPE_IDX(VHUB_TABLE), + FIP_TYPE_IDX(VNIC_IDENTITY), + FIP_TYPE_IDX(PARTITION), + FIP_TYPE_IDX(GW_IDENTIFIER), + FIP_TYPE_IDX(KA_PARAMS), + FIP_TYPE_IDX(EXT_DESC), +}; + +enum { + ADV_EXT_TYPE(CAP) = 40, + ADV_EXT_TYPE(BOOT) = 18, + ADV_EXT_TYPE(LAG) = 41, + ADV_EXT_TYPE(MEMBER) = 42, + ADV_EXT_TYPE(PC_ID) = 43, /* Power Cycle ID */ + ADV_EXT_TYPE(CTRL_IPORT) = 240, +}; + +enum { + ADV_EXT_IDX(CAP), + ADV_EXT_IDX(BOOT), + ADV_EXT_IDX(LAG), + ADV_EXT_IDX(PC_ID), + ADV_EXT_IDX(CTRL_IPORT), +}; + + +enum { + EPORT_STATE_DOWN = 0, + EPORT_STATE_UP = 1, +}; + +enum fip_packet_type { + FIP_DISCOVER_UCAST = 0, + FIP_DISCOVER_MCAST = 1 +}; + +enum { + FIP_TABLE_HDR_MIDDLE = 0, + FIP_TABLE_HDR_FIRST = 1, + FIP_TABLE_HDR_LAST = 2, + FIP_TABLE_HDR_ONLY = 3 +}; + +enum { + FIP_EXT_LAG_W_POLICY_HOST = 1, + FIP_EXT_LAG_W_POLICY_UCAST = 1 << 2 +}; + +/* string "mellanox" */ +#define FIP_VENDOR_MELLANOX { 0x6d, 0x65, 0x6c, 0x6c, 0x61, 0x6e, 0x6f, 0x78 } + + +#define FIP_TEST_PKT_LENGTH(port, length, type) \ + if ((length) != sizeof(type) + IB_GRH_BYTES) { \ + vnic_dbg_fip(port->name, "Dump packet:" \ + "at %d unexpected size. length %d expected %d\n", \ + __LINE__, (int)length, \ + (int)(sizeof(type) + IB_GRH_BYTES)); \ + return -EINVAL; \ + } + +/* + * copy string b to string a and NULL termination. + * length a must be >= length b+1. 
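+ * Both a and b must be arrays (sizeof is applied to them), not pointers.
+ * Usage sketch, assuming fl points to a struct fip_login_tlv (see below):
+ *
+ *	char name[sizeof(fl->vnic_name) + 1];
+ *	TERMINATED_MEMCPY(name, fl->vnic_name);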
+ */ +#define TERMINATED_MEMCPY(a,b) \ + do { \ + ASSERT(sizeof(a)>=sizeof(b)+1); \ + memcpy((a), (b), sizeof(b)); \ + (a)[sizeof(b)] = '\0'; \ + } while (0); + + +enum { + FIP_MAX_ADDR_TLVS = 6, + FIP_MAX_TLVS = 32, + FIP_MAX_EXT_DESC = 32, +}; + +struct fip_fip_type { + u8 type; + u8 length; + u16 reserved; +}; + +struct fip_header_simple { + __be16 opcode; + u8 reserved; + u8 subcode; + __be16 list_length; + __be16 flags; +}; + +struct fip_vendor_id_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; +}; + +struct fip_address_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + __be32 gwtype_qpn; + __be16 sl_gwportid; + __be16 lid; + u8 guid[8]; +}; + +struct fip_gw_information_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + u8 h_nmac_mgid; + u8 n_rss_mgid_tss_qpn; + __be16 n_rss_qpn_vnics; +}; + +struct fip_login_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + __be16 mtu; + __be16 vnic_id; + __be16 flags_vlan; + u8 mac[6]; + u8 eth_gid_prefix[5]; + u8 antispoofing; + __be16 vfields; + __be32 syndrom_ctrl_qpn; + u8 vnic_name[16]; +}; + +struct context_table_entry { + u8 v_rss_type; + u8 reserved; + u8 mac[ETH_ALEN]; + __be32 qpn; + u8 reserved1; + u8 sl; + __be16 lid; +}; + +struct fip_vhub_update_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + __be32 state_vhub_id; + __be32 tusn; +}; + +struct fip_vhub_table_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + __be32 vp_vhub_id; + __be32 tusn; + __be16 hdr; + __be16 table_size; +}; + +struct fip_vnic_identity_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + __be32 flags_vhub_id; + __be32 tusn; + __be16 vnic_id; + u8 mac[6]; + u8 port_guid[8]; + u8 vnic_name[16]; +}; + +struct fip_partition_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + __be16 reserved; + __be16 pkey; +}; + +struct fip_gw_identifier_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + u8 sys_guid[8]; + u8 sys_name[32]; + u8 gw_port_name[8]; +}; + +struct fip_ka_params_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + __be32 adv_period; + __be32 ka_period; + __be32 vnic_ka_period; +}; + +struct fip_ext_desc_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; +}; + +struct fip_extended_type { + u8 ext_type; + u8 len; + u8 reserved; + u8 mandatory; +}; + +struct fip_ext_type_cap { + struct fip_extended_type et; + u32 reserved[4]; +}; + +struct fip_ext_type_boot { + struct fip_extended_type et; + u8 boot_prio; + u8 reserved; + __be16 discovery_timeout; +}; + +struct fip_ext_type_lag_props { + struct fip_extended_type et; + u8 gw_type; + u8 reserved; + __be16 lag_hash; + u8 weight_policy_flags; + u8 ca_threshold; + __be16 link_down_pol_thresh; + u32 reserved2[2]; +}; + +struct fip_ext_type_power_cycle_id { + struct fip_extended_type et; + __be64 power_cycle_id; + u32 reserved; +} __attribute__((packed)); + +struct fip_ext_type_hostname { + struct fip_extended_type et; + u8 hostname[32]; +}; + +struct fip_ext_type_ctrl_iport { + struct fip_extended_type et; + u8 vendor_id[8]; + __be32 gwtype_qpn; + __be16 sl_gwportid; + __be16 lid; + u8 guid[8]; +}; + +struct fip_ext_type_lag_member { + __be32 qpn; + __be16 sl_gw_portid; + __be16 lid; + u8 guid[8]; + u8 eport_state; + u8 reserved1; + u8 weight; + u8 link_utilization; + u32 reserved2; +}; + +struct fip_ext_type_lag_members { + struct fip_extended_type et; + struct fip_ext_type_lag_member lagm[0]; +}; + +struct fip_ext_group { + struct fip_ext_desc_tlv *fed[FIP_MAX_EXT_DESC]; + int num; +}; + +struct fip_address_group { + struct fip_address_tlv *fa[FIP_MAX_ADDR_TLVS]; + int num; +}; + +struct 
fip_context_group { + struct context_table_entry *cte; + int num; +}; + +struct fip_content { + struct fip_eoib_ver *eoib_ver; + struct fip_header_simple *fh; + struct fip_vendor_id_tlv *fvend; + struct fip_address_group fa; + struct fip_gw_information_tlv *fgwi; + struct fip_login_tlv *fl; + struct fip_vhub_update_tlv *fvu; + struct fip_vhub_table_tlv *fvt; + struct fip_vnic_identity_tlv *fvi; + struct fip_partition_tlv *fp; + struct fip_gw_identifier_tlv *fgid; + struct fip_ka_params_tlv *fka; + struct fip_ext_group fed; + struct fip_context_group cte; + u64 mask; + u16 offsets[FIP_MAX_TLVS]; + int num; +}; + +/**************************************************************************/ +/* packet format structs */ +/**************************************************************************/ +#define VENDOR_ID_LENGTH 8 + +struct fip_eoib_ver { + u8 version; + u8 reserved[3]; +}; + +struct fip_fip_header { + __be16 opcode; + u8 reserved; + u8 subcode; + __be16 list_length; + __be16 flags; + struct fip_fip_type type; + u8 vendor_id[VNIC_VENDOR_LEN]; +}; + +struct fip_discover_base { + struct fip_fip_type type; + u8 vendor_id[VNIC_VENDOR_LEN]; + u32 qpn; + u16 sl_port_id; + u16 lid; + u8 guid[GUID_LEN]; +}; + +struct eoib_adv_gw_info { /* Gabi */ + struct fip_fip_type type; + u8 vendor_id[VNIC_VENDOR_LEN]; + u8 system_guid[GUID_LEN]; + u8 system_name[VNIC_SYSTEM_NAME_LEN]; + u8 gw_port_name[VNIC_GW_PORT_NAME_LEN]; +}; + +/* keep alive information */ +struct eoib_adv_ka_info { /* Gabi */ + struct fip_fip_type type; + u8 vendor_id[VNIC_VENDOR_LEN]; + u32 gw_adv_period; + u32 gw_period; + u32 vnic_ka_period; +}; + +struct eoib_advertise { + struct fip_eoib_ver version; + struct fip_fip_header fip; + struct fip_discover_base base; + struct fip_fip_type type_1; + u8 vendor_id[VNIC_VENDOR_LEN]; + u8 flags; + u8 reserved; + u16 num_net_vnics; + struct eoib_adv_gw_info gw_info; /* Gabi */ + struct eoib_adv_ka_info ka_info; /* Gabi */ +}; + +struct syndrom_dword { + u8 syndrom; + u8 reserved[3]; +}; + +union syn_qp_ctrl { + struct syndrom_dword syn; + u32 ctl_qpn; +}; + +struct eoib_login { + struct fip_eoib_ver eoib_ver; + struct fip_header_simple fh; + struct fip_vendor_id_tlv fvend; + struct fip_address_tlv fa; + struct fip_login_tlv fl; +}; + +struct fip_solicit_legacy { + struct fip_eoib_ver version; + struct fip_header_simple fh; + struct fip_vendor_id_tlv fvend; + struct fip_address_tlv addr; +}; + +struct fip_solicit_new { + struct fip_eoib_ver version; + struct fip_header_simple fh; + struct fip_vendor_id_tlv fvend; + struct fip_address_tlv addr; + struct fip_ext_desc_tlv ext; + struct fip_ext_type_cap ext_cap; + struct fip_ext_type_hostname ext_hostname; +}; + +union fip_vhub_id { + struct { + u8 flags; + u8 reserved[3]; + } flags; + u32 vhub_id; +}; + +struct eoib_context_table { + struct fip_eoib_ver version; + struct fip_fip_header fip; + struct fip_fip_type type_1; + u8 vendor_id[VNIC_VENDOR_LEN]; + union fip_vhub_id vhub_id; + u32 tusn; + u8 flags; + u8 reserved; + u16 table_size; + /* here come the context entries */ +}; + +/* this is the number of DWORDS to subtract from type_1->length + * to get the size of the entries / 4. 
(size in dwords from start + * of vendor_id field until the first context entry + 1 for checksum + */ +#define FIP_TABLE_SUB_LENGTH 6 + +/* + * eoib_host_update will be used for vHub context requests, + * keep alives and logouts + */ +struct eoib_host_update { + struct fip_eoib_ver version; + struct fip_fip_header fip; + struct fip_fip_type type_1; + u8 vendor_id[VNIC_VENDOR_LEN]; + union fip_vhub_id vhub_id; + u32 tusn; + u16 vnic_id; + u8 mac[ETH_ALEN]; + u8 port_guid[GUID_LEN]; + u8 vnic_name[VNIC_NAME_LEN]; +}; + +enum fip_packet_fields { + EOIB_FIP_OPCODE = 0xFFF9, + FIP_FIP_HDR_LENGTH = 3, + FIP_FIP_HDR_TYPE = 13, + + /* keep all subcodes here */ + FIP_HOST_SOL_SUB_OPCODE = 0x1, + FIP_GW_ADV_SUB_OPCODE = 0x2, + FIP_HOST_LOGIN_SUB_OPCODE = 0x3, + FIP_GW_LOGIN_SUB_OPCODE = 0x4, + FIP_HOST_LOGOUT_SUB_OPCODE = 0x5, + FIP_GW_UPDATE_SUB_OPCODE = 0x6, + FIP_GW_TABLE_SUB_OPCODE = 0x7, + FIP_HOST_ALIVE_SUB_OPCODE = 0x8, + FIP_MAX_SUBCODES, + /* end subcodes section */ + + FIP_FIP_FCF_FLAG = 0x1, + FIP_FIP_SOLICITED_FLAG = 0x2, + FIP_FIP_ADVRTS_FLAG = 0x4, + FIP_FIP_FP_FLAG = 0x80, + FIP_FIP_SP_FLAG = 0x40, + + FIP_BASIC_LENGTH = 7, + FIP_BASIC_TYPE = 240, + + FIP_ADVERTISE_LENGTH_1 = 4, + FIP_ADVERTISE_TYPE_1 = 241, + FIP_ADVERTISE_HOST_VLANS = 0x80, + FIP_ADVERTISE_NUM_VNICS_MASK = 0x0FFF, + FIP_ADVERTISE_N_RSS_SHIFT = 12, + FIP_ADVERTISE_HOST_EN_MASK = 0x80, + FIP_ADVERTISE_ALL_VLAN_GW_MASK = 0x60, + FIP_ADVERTISE_GW_PORT_ID_MASK = 0x0FFF, + FIP_ADVERTISE_SL_SHIFT = 12, + + FIP_ADVERTISE_GW_LENGTH = 15, + FIP_ADVERTISE_GW_TYPE = 248, + + FIP_ADVERTISE_KA_LENGTH = 6, + FIP_ADVERTISE_KA_TYPE = 249, + + FIP_LOGIN_LENGTH_1 = 13, + FIP_LOGIN_TYPE_1 = 242, + FIP_LOGIN_LENGTH_2 = 4, + FIP_LOGIN_TYPE_2 = 246, + + FIP_LOGIN_V_FLAG = 0x8000, + FIP_LOGIN_M_FLAG = 0x4000, + FIP_LOGIN_VP_FLAG = 0x2000, + FIP_LOGIN_H_FLAG = 0x1000, + FIP_LOGIN_VLAN_MASK = 0x0FFF, + FIP_LOGIN_DMAC_MGID_MASK = 0x3F, + FIP_LOGIN_RSS_MGID_MASK = 0x0F, + FIP_LOGIN_RSS_MASK = 0x10, + FIP_LOGIN_RSS_SHIFT = 4, + FIP_LOGIN_CTRL_QPN_MASK = 0xFFFFFF, + FIP_LOGIN_VNIC_ID_BITS = 16, + FIP_LOGIN_ALL_VLAN_GW_FLAG = 0x0040, + + FIP_LOGOUT_LENGTH_1 = 13, + FIP_LOGOUT_TYPE_1 = 245, + + FIP_HOST_UPDATE_LENGTH = 13, + FIP_HOST_UPDATE_TYPE = 245, + FIP_HOST_VP_FLAG = 0x01, + FIP_HOST_U_FLAG = 0x80, + FIP_HOST_R_FLAG = 0x40, + + FIP_CONTEXT_UP_LENGTH = 9, + FIP_CONTEXT_UP_TYPE = 243, + FIP_CONTEXT_UP_EPORT_MASK = 0x30, + FIP_CONTEXT_UP_EPORT_SHIFT = 4, + FIP_CONTEXT_V_FLAG = 0x80, + FIP_CONTEXT_RSS_FLAG = 0x40, + FIP_CONTEXT_TYPE_MASK = 0x0F, + + FIP_CONTEXT_TBL_TYPE = 244, + FIP_CONTEXT_TBL_SEQ_MASK = 0xC0, + FIP_CONTEXT_TBL_SEQ_FIRST = 0x40, + FIP_CONTEXT_TBL_SEQ_LAST = 0x80, + + FKA_ADV_PERIOD = 8000, /* in mSecs */ + FKA_ADV_MISSES = 3 +}; + +enum fip_login_syndroms { + FIP_SYNDROM_SUCCESS = 0, + FIP_SYNDROM_HADMIN_REJECT = 1, + FIP_SYNDROM_GW_RESRC = 2, + FIP_SYNDROM_NO_NADMIN = 3, + FIP_SYNDROM_UNRECOGNISED_HOST = 4, + FIP_SYNDROM_UNSUPPORTED_PARAM = 5, + FIP_SYNDROM_GW_IS_LAG_MEMBER = 6, + FIP_SYNDROM_DUPLICATE_ADDRESS = 7, +}; + +/* + * Send a multicast or unicast solicit packet. The multicast packet is sent + * to the discover mcast group. Unicast packets are sent to the dqpn + dlid + * supplied. The dlid, dqpn, sl are ignored for multicast packets. + * functionreturns 0 on success and error code on failure +*/ +int fip_solicit_send(struct fip_discover *discover, + enum fip_packet_type multicast, u32 dqpn, + u16 dlid, u8 sl, int new_prot, unsigned char *guid); + +/* + * Send a unicast login packet. 
This function supports both host and + * network admined logins. function returns 0 on success and + * error code on failure +*/ +int fip_login_send(struct fip_vnic_data *vnic); + +int fip_logout_send(struct fip_vnic_data *vnic); + +/* + * This function creates and sends a few types of packets (all ucast): + * vHub context request - new=1, logout=0 + * vHub context update / vnic keep alive - new=0, logout=0 + * vnic logout - new=0, logout=1 +*/ +int fip_update_send(struct fip_vnic_data *vnic, int request_new, int logout); + +/* + * Check if a received packet is a FIP packet, And if so return its subtype. + * The FIP type is also returned in fip_type and can be either EOIB_FIP_OPCODE + * or FCOIB_FIP_OPCODE. If the packet is not a FIP packet -EINVAL is returned. +*/ +int fip_pkt_parse(char *buffer, int length, int *fip_type); + +/* + * Already know that this is a FIP packet, return its subtype. +*/ +int fip_pkt_get_subtype_bh(char *buffer); + +/* + * parse a packet that is suspected of being an advertise packet. The packet + * returns 0 for a valid advertise packet and an error code other wise. The + * packets "interesting" details are returned in data. +*/ +int fip_advertise_parse_bh(struct fip_discover *discover, struct fip_content *fc, + struct fip_gw_data *data); + +/* + * parse a packet that is suspected of being an login ack packet. The packet + * returns 0 for a valid login ack packet and an error code other wise. The + * packets "interesting" details are returned in data. +*/ +int fip_login_parse(struct fip_discover *discover, struct fip_content *fc, + struct fip_login_data *data); + +static inline int _map_generic_pkt(struct vnic_port *port, + struct fip_ring_entry *tx_ring_entry, + void *mem, int pkt_size) +{ + /* alloc packet to be sent */ + tx_ring_entry->mem = mem; + + /* map packet to bus */ + tx_ring_entry->bus_addr = + ib_dma_map_single(port->dev->ca, + tx_ring_entry->mem, pkt_size, DMA_TO_DEVICE); + + if (unlikely(ib_dma_mapping_error(port->dev->ca, + tx_ring_entry->bus_addr))) { + vnic_warn(port->name, + "send_generic_pkt failed to map to pci\n"); + return -ENOMEM; + } + tx_ring_entry->length = pkt_size; + + return 0; +} + +static inline int alloc_map_fip_buffer(struct ib_device *ca, + struct fip_ring_entry *me, + int size, gfp_t mask) +{ + me->mem = kmalloc(size, mask); + if (!me->mem) { + vnic_warn(ca->name, "failed to alloc memory (%d)\n", size); + return -ENOMEM; + } + + me->bus_addr = ib_dma_map_single(ca, me->mem, size, DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, me->bus_addr))) { + kfree(me->mem); + vnic_warn(ca->name, "ib_dma_mapping_error failed\n"); + return -ENOMEM; + } + me->length = size; + me->entry_posted = 0; + + return 0; +} + +#define DELAYED_WORK_CLEANUP_JIFFS 2 +#define FIP_MAX_PKT_PRINT_LENGTH 120 +#define FIP_OP_RECV (1ul << 31) + +static const char fip_discover_mgid[GID_LEN] = { + 0xFF, 0x12, 0xE0, 0x1B, + 0x00, 0x06, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00}; +static const char fip_solicit_mgid[GID_LEN] = { + 0xFF, 0x12, 0xE0, 0x1B, + 0x00, 0x07, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00}; + + +/* TODO - remove this: for initial debug only */ +void fip_dbg_dump_raw_pkt(int level, void *buff, + int length, int is_tx, char *name); +enum { + FIP_ETH_HEADER_LEN = 14, + FIP_ENCAP_LEN = 4, + FIP_PROTOCOL_RX_SIZE = 16, /* must be power of 2 */ + FIP_PROTOCOL_TX_SIZE = 64, /* must be power of 2 */ + FIP_LOGIN_RX_SIZE = 64, /* must be power of 2 */ + FIP_LOGIN_TX_SIZE = 64, /* must be power of 2 */ + + 
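+	/* The ring sizes above must be powers of 2 because ring indexes
+	 * are derived with a mask rather than a modulo, e.g. in
+	 * send_generic_ucast_pkt():
+	 *
+	 *	index = tx_ring->head & (tx_ring->size - 1);
+	 */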
/* timeout in seconds between LOGIN and ACK */ + FIP_LOGIN_TIMEOUT = 8, + FIP_RESOLICIT_TIME = 8, + + IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + FIP_ENCAP_LEN, +}; + +struct fip_rcv_pkt { + struct list_head list; + struct fip_content *fc; + int length; + void *mem; +}; + +/* + * Alloc the discover CQ, QP. Configure the QP to RTS. + * alloc the RX + TX rings and queue work for discover + * finite state machine code. If complete it set, it clears + * possible previous GW / VNIC data structs on init. + */ +int fip_discover_init(struct vnic_port *port, struct fip_discover *discover, + u16 pkey, int complete); + +/* + * free the discover TX and RX rings, QP and CQ. If complete + * is set, it clears possible previous GW / VNIC data structs + * by using a "complete" flush otherwise vnic data is preserved. +*/ +int fip_discover_cleanup(struct vnic_port *port, struct fip_discover *discover, int complete); + +/* + * send a single multicast packet. + * return 0 on success, other on failure. +*/ +int fip_mcast_send(struct vnic_port *port, struct ib_qp *qp, + unsigned int wr_id, u64 mapping, + int size, u16 pkey_index, struct vnic_mcast *mcast); +/* + * send a single unicast packet. + * return 0 on success, other on failure. +*/ +int fip_ucast_send(struct vnic_port *port, struct ib_qp *qp, + unsigned int wr_id, u64 mapping, + int size, u16 pkey_index, u32 dest_qpn, u16 dlid, + u32 qkey, u8 sl, + unsigned char *dguid); +/* + * qonfigure a newly allocated QP and move it + * from reset->init->RTR->RTS + */ +int fip_init_qp(struct vnic_port *port, struct ib_qp *qp, + u16 pkey_index, char *name); + +/* + * allocs a single rx buffer (of size size), map it to pci bus + * and post it to the qp for receive. id parameter is used + * to keep track of work request when completion is received. + * kernel and bus address are returned in mem_entry. + * returns 0 on success else failure. + * id used to identify entry in receive queue. + */ +int fip_post_receive(struct vnic_port *port, struct ib_qp *qp, int size, + int _id, struct fip_ring_entry *mem_entry, char *name); + +/* trigered by a core event */ +void fip_qp_to_reset(struct ib_qp *qp, char *name); +void fip_flush_rings(struct vnic_port *port, + struct ib_cq *cq, + struct ib_qp *qp, + struct fip_ring *rx_ring, + struct fip_ring *tx_ring, + char *name); +void fip_free_rings(struct vnic_port *port, + struct fip_ring *rx_ring, + struct fip_ring *tx_ring, + char *name); + +/* + * This function allocates the tx buffers and initializes the head and + * tail indexes. + */ +int fip_init_tx(int size, struct fip_ring *tx_ring, char *name); + +/* + * Configure the discover QP. This includes configuring rx+tx + * moving the discover QP to RTS and creating the tx and rx rings + */ +int fip_init_rx(struct vnic_port *port, int ring_size, struct ib_qp *qp, + struct fip_ring *rx_ring, char *name); + +/* + * This is a general purpose CQ completion function that handles + * completions on RX and TX rings. It can serve all users that are + * using RX and TX rings. + * RX completions are destinguished from TX comp by the MSB that is set + * for RX and clear for TX. For RX, the memory is unmapped from the PCI, + * The head is incremented. For TX the memory is unmapped and then freed. + * The function returns the number of packets received. +*/ +int fip_comp(struct vnic_port *port, + struct ib_cq *cq, + struct fip_ring *rx_ring, + struct fip_ring *tx_ring, + char *name); + +/* + * This function is the driving engine of the vnic logic. It manages the + * vnics state machines. 
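+ * (The FSM is normally driven through the vNic's vnic_task delayed work,
+ * queued on the fip_wq workqueue.)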
+ * Some of the states in the state machine could have been removed because + * they contain "actions" and not states. Still it is easier to maintaine + * the code this way and it gives an easy mechanism for exception handling + * and retries. + * Only call this function from fip_wq context. +*/ +void fip_vnic_fsm(struct work_struct *work); + +/* + * Mark the vnic for deletion and trigger a delayed call to the cleanup + * function. In the past the vnic was moved to another list but this + * might cause vnic duplication if new vnics are added to the GW. Even + * if the vnic is being flushed we need to know it is there. + * + * Note: This deletion method insures that all pending vnic work requests + * are cleared without dependency of the calling context. +*/ +void fip_vnic_close(struct fip_vnic_data *vnic, enum fip_flush flush); + +/* + * Free vnic resources. This includes closing the data vnic (data QPs etc) + * and the discovery resources. If the vnic can be totaly destroyed (no + * pending work) the vnic will be removed from the GW list and it's memory + * freed. If not the vnic will not be freed and the function will return an + * error. The caller needs to recall this unction to complete the operation. +*/ +int fip_vnic_destroy(struct fip_vnic_data *vnic); + +struct fip_vnic_data *fip_vnic_alloc(struct vnic_port *port, + struct fip_gw_data *gw, + int hadmin, + u16 vnic_id); + +/* + * Look for a vnic in the GW vnic list. The search key used is either the vnic_id + * that is unique, or the mac+vlan pair. A match on either key will result in the + * return of the vnic. both keys are nesesary because host assigned delete + * flow might not have access to the vnic_id. The search disregards vnics that + * are undergoing full flush (they will be removed soon). +*/ +struct fip_vnic_data *fip_vnic_find_in_list(struct fip_gw_data *gw, + u16 vnic_id, u8 *mac, + u16 vlan, u8 vlan_used); + +/* + * process an incoming login ack packet. The packet was already parsed and + * its data was placed in *data. The function creates RX and TX rings for the + * vnic and starts the multicast join procedure. + * This function should not be called for packets other then login ack packets. +*/ +void fip_vnic_login_ack_recv(struct fip_vnic_data *vnic, + struct fip_login_data *data); + +/* + * This function should be called when the building of a vhub context + * table is done and the vnic state should transition to CONNECTED. +*/ +int fip_vnic_tbl_done(struct fip_vnic_data *vnic); +int fip_vnic_mcast_recnct(struct fip_vnic_data *vnic); + +/* + * Init the vnic's vHub table data structures, before using them + */ +void vhub_ctx_init(struct fip_vnic_data *vnic); +void vhub_table_free(struct vhub_elist *elist); + +/* + * Clear and free the vnic's vHub context table data structures. + */ +void vhub_ctx_free(struct fip_vnic_data *vnic); + +/* + * This function handles a vhub context table packet. The table will + * be processed only if we do not have a up to date local coppy of + * our own. The table update supports multi-packet tables so care + * must be taken in building the complete table. +*/ +int vhub_handle_tbl(struct fip_vnic_data *vnic, struct fip_content *fc, + u32 vhub_id, u32 tusn); + +/* + * This function handles a vhub context update packets. There are three flows + * in handeling update packets. 
The first is before the main table is up + * to date, the second is after the table is up to date but before it was + * passed to the ownership of the data vnic (login struct) and the local + * lists are freed, and the last is when the table maintanence is done + * by the data vnic. This function handles all cases. +*/ +int vhub_handle_update(struct fip_vnic_data *vnic, + u32 vhub_id, u32 tusn, + struct vnic_table_entry *data); + +/* + * This function writes the main vhub table to the data (login) vnic. + * You should call it when the data vnic is ready for it and after the + * table is up to date (and the update list was applied to the main list) + */ +int fip_vnic_write_tbl(struct fip_vnic_data *vnic); + +/* sysfs entries for hadmin vNics*/ +int vnic_create_hadmin_dentry(struct fip_vnic_data *vnic); +void vnic_delete_hadmin_dentry(struct fip_vnic_data *vnic); +void extract_memb_extended(struct fip_ext_type_lag_members *ext_lag_membs, + int ext_length, + struct lag_members *lagm, + char *name); +int handle_member_update(struct fip_vnic_data *vnic, struct lag_members *lm); +int extract_vhub_extended(struct fip_ext_desc_tlv *fed, + struct fip_vnic_data *vnic); +static inline int send_generic_ucast_pkt(struct vnic_port *port, + struct fip_ring *tx_ring, + void *mem, int pkt_size, + struct ib_qp *qp, + int pkey_index, + u32 dst_qpn, u16 dst_lid, + u32 qkey, u8 sl, + unsigned char *dguid) +{ + int index, rc; + unsigned long flags; + unsigned long tail; + + /* + * we are only allowed to update the head at task level so no need to + * perform any locks here + */ + spin_lock_irqsave(&tx_ring->ring_lock, flags); + index = tx_ring->head & (tx_ring->size - 1); + + vnic_dbg_fip(port->name, "send ucast packet\n"); + + spin_lock(&tx_ring->head_tail_lock); + tail = tx_ring->tail; + spin_unlock(&tx_ring->head_tail_lock); + + /* ring full try again */ + if (tx_ring->head - tail >= tx_ring->size) { + vnic_warn(port->name, "send_generic_pkt ring full: QPN 0x%x: tail=%ld head=%ld diff=%ld\n", + qp->qp_num, tx_ring->tail, tx_ring->head, tx_ring->head - tx_ring->tail); + rc = -EAGAIN; + goto err; + } + + + rc = _map_generic_pkt(port, &tx_ring->ring[index], mem, pkt_size); + if (rc) + goto err; + + rc = fip_ucast_send(port, qp, index, + tx_ring->ring[index].bus_addr, + pkt_size, pkey_index, dst_qpn, dst_lid, + qkey, sl, dguid); + + if (rc) { + vnic_warn(port->name, "fip_ucast_send() failed (%d)\n", rc); + rc = -ENODEV; + goto error_unmap_dma; + } + + tx_ring->head++; + + spin_unlock_irqrestore(&tx_ring->ring_lock, flags); + return 0; + +error_unmap_dma: + ib_dma_unmap_single(port->dev->ca, + tx_ring->ring[index].bus_addr, + pkt_size, DMA_TO_DEVICE); +err: + spin_unlock_irqrestore(&tx_ring->ring_lock, flags); + return rc; +} + +static inline const char *eport_state_str(int state) +{ + switch (state) { + case EPORT_STATE_DOWN: return "Down"; + case EPORT_STATE_UP: return "Up"; + default:return "Invalid"; + } +} + +#endif /* _VNIC_FIP_H */ diff --git a/drivers/net/mlx4_vnic/vnic_fip_discover.c b/drivers/net/mlx4_vnic/vnic_fip_discover.c new file mode 100644 index 0000000000000..f9b0bf882d65a --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_fip_discover.c @@ -0,0 +1,1936 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_fip.h" +#include "vnic_fip_discover.h" +#include "vnic_fip_pkt.h" + +#define FIP_MAX_PKT_PRINT_LENGTH 120 + +static void fip_purge_gws(struct work_struct *work); +static void fip_discover_gw_fsm(struct work_struct *work); +static void fip_discover_hadmin_update(struct work_struct *work); +static void fip_discover_fsm(struct work_struct *work); + +/* TODO - remove this: for initial debug only */ +void fip_dbg_dump_raw_pkt(int level, void *buff, + int length, int is_tx, char *name) +{ + int i; + int tmp_len; + u32 *data_ptr; + unsigned char *tmp_data_ptr; + + if (!(vnic_msglvl & VNIC_DEBUG_PKT_DUMP)) + return; + + printk(KERN_DEBUG "%s %s: packet length is %d\n", + is_tx ? "TX" : "RX", name, length); + + length = (length > FIP_MAX_PKT_PRINT_LENGTH) ? + FIP_MAX_PKT_PRINT_LENGTH : length; + + tmp_len = (length >> 2) + 1; + data_ptr = (u32 *)buff; + for (i = 0; i < tmp_len; i++) { + if (!is_tx && i == IB_GRH_BYTES >> 2) + printk(KERN_DEBUG "========================\n"); + tmp_data_ptr = (unsigned char *)&data_ptr[i]; + printk(KERN_DEBUG "%02x %02x %02x %02x \n", + tmp_data_ptr[0], tmp_data_ptr[1], + tmp_data_ptr[2], tmp_data_ptr[3]); + } +} + +/* + * Configure the discover QP. 
This includes configuring rx+tx + * moving the discover QP to RTS and creating the tx and rx rings + */ +int fip_discover_start_rings(struct fip_discover *discover, + struct fip_ring *rx_ring, + struct fip_ring *tx_ring, + struct ib_cq *cq, + struct ib_qp *qp) +{ + int rc; + + rc = fip_init_tx(tx_ring->size, tx_ring, discover->name); + if (rc) { + vnic_warn(discover->name, "fip_init_tx failed rc %d\n", rc); + return rc; + } + + rc = fip_init_rx(discover->port, rx_ring->size, qp, rx_ring, discover->name); + if (rc) { + vnic_warn(discover->name, "fip_init_rx returned %d\n", rc); + goto release_queues; + } + + return 0; + +release_queues: + fip_flush_rings(discover->port, cq, qp, rx_ring, tx_ring, discover->name); + fip_free_rings(discover->port, rx_ring, tx_ring, discover->name); + + return rc; +} + +int fip_discover_init_rings(struct vnic_port *port, + struct fip_discover *discover, + struct fip_ring *rx_ring, + struct fip_ring *tx_ring, + struct ib_cq **cq, + struct ib_qp **qp, + ib_comp_handler comp_handler) +{ + struct ib_qp_init_attr qp_init_attr; + struct ib_device *ca = port->dev->ca; + + + *cq = ib_create_cq(ca, comp_handler, NULL, discover, + rx_ring->size + tx_ring->size, 0); + if (IS_ERR(*cq)) { + vnic_warn(discover->name, "failed to create CQ\n"); + goto out; + } + + memset(&qp_init_attr, 0, sizeof(qp_init_attr)); + qp_init_attr.cap.max_send_wr = tx_ring->size; + qp_init_attr.cap.max_recv_wr = rx_ring->size; + qp_init_attr.cap.max_send_sge = 1; + qp_init_attr.cap.max_recv_sge = 1; + qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.qp_type = IB_QPT_UD; + qp_init_attr.send_cq = *cq; + qp_init_attr.recv_cq = *cq; + + *qp = ib_create_qp(port->pd, &qp_init_attr); + if (IS_ERR(*qp)) { + vnic_warn(discover->name, "failed to create QP\n"); + goto error_free_cq; + } + + /* move QP to RTS */ + if (fip_init_qp(discover->port, *qp, discover->pkey_index, discover->name)) { + vnic_warn(discover->name, "fip_init_qp failed for qp\n"); + goto error_free_qp; + } + + /* init RX + TX rings */ + if (fip_discover_start_rings(discover, rx_ring, tx_ring, *cq, *qp)) { + vnic_warn(discover->name, "failed to start rings\n"); + goto error_free_qp; + } + + /* enable receiving CQ comps, triggers fip_discover_comp() */ + if (ib_req_notify_cq(*cq, IB_CQ_NEXT_COMP)) { + vnic_warn(discover->name, "ib_req_notify_cq failed for cq\n"); + goto error_release_rings; + } + + return 0; + +error_release_rings: + fip_flush_rings(discover->port, *cq, *qp, rx_ring, tx_ring, discover->name); + fip_free_rings(discover->port, rx_ring, tx_ring, discover->name); +error_free_qp: + ib_destroy_qp(*qp); +error_free_cq: + ib_destroy_cq(*cq); +out: + *qp = NULL; + *cq = NULL; + return -ENODEV; +} + +/* + * This function handles completions of both TX and RX + * packets. RX packets are unmapped lightly parsed moved to a list + * and passed to thread processing. TX packets are unmapped and freed. + * Note: this function is called from interrupt context + */ +static void fip_discover_comp(struct ib_cq *cq, void *discover_ptr) +{ + struct fip_discover *discover = discover_ptr; + + /* handle completions. On RX packets this will call discover_process_rx + * from thread context to continue processing */ + if (fip_comp(discover->port, discover->cq, + &discover->rx_ring, &discover->tx_ring, + discover->name)) + fip_discover_process_rx(discover); +} + +/* + * Alloc the discover CQ, QP. Configure the QP to RTS. + * alloc the RX + TX rings and queue work for discover + * finite state machine code. 
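+ * The CQ created here is sized for both rings (FIP_PROTOCOL_RX_SIZE +
+ * FIP_PROTOCOL_TX_SIZE entries), so a single completion handler,
+ * fip_discover_comp(), serves RX and TX completions of the discover QP.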
+ */ +int fip_discover_init(struct vnic_port *port, struct fip_discover *discover, + u16 pkey, int complete) +{ + int rc; + + discover->port = port; + discover->flush = FIP_NO_FLUSH; + discover->state = FIP_DISCOVER_INIT; + discover->rx_ring.size = FIP_PROTOCOL_RX_SIZE; + discover->tx_ring.size = FIP_PROTOCOL_TX_SIZE; + discover->new_prot_gws = 0; + discover->old_prot_gws = 0; + + /* This is in preparation for pkey discovery */ + + init_completion(&discover->flush_complete); + + INIT_DELAYED_WORK(&discover->fsm_task, fip_discover_fsm); + INIT_DELAYED_WORK(&discover->cleanup_task, fip_purge_gws); + INIT_DELAYED_WORK(&discover->hadmin_update_task, fip_discover_hadmin_update); + INIT_WORK(&discover->pkt_rcv_task_bh, fip_discover_process_rx_bh); + spin_lock_init(&discover->rcv_list.lock); + INIT_LIST_HEAD(&discover->rcv_list.list); + spin_lock_init(&discover->lock); + + + if (complete) { + discover->pkey = pkey; + INIT_LIST_HEAD(&discover->gw_list); + init_rwsem(&discover->l_rwsem); + sprintf(discover->name, "%s_P%x", port->name, discover->pkey); + } + INIT_LIST_HEAD(&discover->hadmin_cache); + vnic_mcast_root_init(&discover->mcast_tree); + + if (!ib_find_pkey(port->dev->ca, port->num, discover->pkey, &discover->pkey_index)) { + rc = fip_discover_init_rings(port, discover, &discover->rx_ring, + &discover->tx_ring, &discover->cq, + &discover->qp, fip_discover_comp); + if (rc) { + vnic_warn(discover->name, "descovered init failed rc=%d\n", rc); + return rc; + } + + /* start discover FSM code */ + /* calls fip_discover_fsm() */ + queue_delayed_work(fip_wq, &discover->fsm_task, 0); + } else { + vnic_warn(discover->name, "Configured PKEY 0x%X is not supported on port\n", discover->pkey); + discover->pkey_index = ILLEGAL_PKEY_INDEX; + } + + + return 0; +} + +void fip_recv_list_flush(struct fip_discover *discover) +{ + struct list_head discov_recv_local; + struct fip_rcv_pkt *rcv, *rcv1; + unsigned long flags; + + INIT_LIST_HEAD(&discov_recv_local); + + spin_lock_irqsave(&discover->rcv_list.lock, flags); + list_replace_init(&discover->rcv_list.list, &discov_recv_local); + spin_unlock_irqrestore(&discover->rcv_list.lock, flags); + + list_for_each_entry_safe(rcv, rcv1, &discov_recv_local, list) { + list_del(&rcv->list); + kfree(rcv); + } + return; +} + +/* + * free the discover TX and RX rings, QP and CQ. + * May not be called from fip wq context. + */ +int fip_discover_cleanup(struct vnic_port *port, struct fip_discover *discover, int complt) +{ + if (discover->state == FIP_DISCOVER_OFF) + return -EINVAL; + + /* move FSM to flush state and wait for the FSM + * to finish whatever it is doing before we continue + */ + vnic_dbg_mark(); + init_completion(&discover->flush_complete); + discover->flush = complt ? 
FIP_FULL_FLUSH : FIP_PARTIAL_FLUSH; + cancel_delayed_work(&discover->fsm_task); +#ifndef _BP_WORK_SYNC + cancel_delayed_work_sync(&discover->hadmin_update_task); +#else + cancel_delayed_work(&discover->hadmin_update_task); + flush_workqueue(fip_wq); +#endif + /* flush any hadmin entries leftovers */ + { + struct fip_hadmin_cache *hadmin, *hadmin_t; + + spin_lock_irq(&discover->lock); + list_for_each_entry_safe(hadmin, hadmin_t, + &discover->hadmin_cache, next) { + list_del(&hadmin->next); + kfree(hadmin); + } + spin_unlock_irq(&discover->lock); + } + + /* calls fip_discover_fsm() */ + queue_delayed_work(fip_wq, &discover->fsm_task, 0); + vnic_dbg_mark(); + /* calls fip_discover_fsm() */ + wait_for_completion(&discover->flush_complete); + vnic_dbg_mark(); + + /* make sure that discover FSM is idle */ +#ifndef _BP_WORK_SYNC + cancel_delayed_work_sync(&discover->fsm_task); +#else + cancel_delayed_work(&discover->fsm_task); + flush_workqueue(fip_wq); +#endif + + if (discover->pkey_index != ILLEGAL_PKEY_INDEX) { + fip_flush_rings(port, discover->cq, discover->qp, + &discover->rx_ring, &discover->tx_ring, + discover->name); + fip_free_rings(port, &discover->rx_ring, &discover->tx_ring, + discover->name); + + fip_recv_list_flush(discover); + if (discover->qp) + ib_destroy_qp(discover->qp); + discover->qp = NULL; + + if (discover->cq) + ib_destroy_cq(discover->cq); + discover->cq = NULL; + } + + return 0; +} + +/* + * This function runs in interrupt context + * It does sanity checking of the packet, moves it to a list and passes + * handling to a thread. + */ +void fip_discover_process_rx(struct fip_discover *discover) +{ + struct vnic_port *port = discover->port; + int mtu_size = FIP_UD_BUF_SIZE(port->max_mtu_enum); + int rc; + int queue_packet, one_or_more_queued = 0; + struct fip_rcv_pkt *rcv, *rcv1; + struct list_head discov_recv_local; + int index; + struct fip_content *fc; + int err; + struct fip_ring_entry *ring; + + INIT_LIST_HEAD(&discov_recv_local); + + if (discover->flush != FIP_NO_FLUSH) + return; + + while (discover->rx_ring.head != discover->rx_ring.tail) { + fc = NULL; + queue_packet = 0; + index = discover->rx_ring.tail & (discover->rx_ring.size - 1); + ring = &discover->rx_ring.ring[index]; + + if (ring->entry_posted == 1 && + discover->state == FIP_DISCOVER_SOLICIT) { + fc = kzalloc(sizeof *fc, GFP_ATOMIC); + if (likely(fc)) { + /* login is the first state we RX packets in */ + rc = fip_packet_parse(port, ring->mem + IB_GRH_BYTES, + ring->length - IB_GRH_BYTES, fc); + if (!rc) + fip_discover_rx_packet(&queue_packet, fc); + } else + vnic_warn(discover->name, "allocation failed\n"); + } + if (queue_packet) { + int length; + + length = ring->length - IB_GRH_BYTES; + rcv = kmalloc(sizeof *rcv, GFP_ATOMIC); + if (!rcv) { + vnic_dbg_fip(discover->name, "failed kmalloc\n"); + kfree(fc); + } else { + struct fip_ring_entry me; + + err = alloc_map_fip_buffer(port->dev->ca, &me, + mtu_size, GFP_ATOMIC); + if (err) { + kfree(fc); + kfree(rcv); + } else { + rcv->length = length; + rcv->fc = fc; + rcv->mem = ring->mem; + list_add_tail(&rcv->list, &discov_recv_local); + one_or_more_queued++; + ib_dma_unmap_single(port->dev->ca, + ring->bus_addr, + mtu_size, DMA_FROM_DEVICE); + *ring = me; + } + } + } else + kfree(fc); + + rc = fip_post_receive(port, discover->qp, + FIP_UD_BUF_SIZE(discover->port->max_mtu_enum), + index, ring, discover->name); + if (rc) + vnic_warn(discover->name, "fip_post_receive rc %d\n", rc); + + discover->rx_ring.tail++; + } + + if (one_or_more_queued) { + 
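+		/* take discover->lock so that checking the flush state and
+		 * queueing pkt_rcv_task_bh are atomic: once a flush has
+		 * started no new bottom-half work is queued, and the locally
+		 * collected packets are freed instead.
+		 */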
spin_lock(&discover->lock); + if (likely(discover->flush == FIP_NO_FLUSH)) { + spin_lock(&discover->rcv_list.lock); + list_splice_init(&discov_recv_local, discover->rcv_list.list.prev); + spin_unlock(&discover->rcv_list.lock); + /* calls fip_discover_process_rx_bh */ + queue_work(fip_wq, &discover->pkt_rcv_task_bh); + spin_unlock(&discover->lock); + } else { + spin_unlock(&discover->lock); + list_for_each_entry_safe(rcv, rcv1, &discov_recv_local, list) { + list_del(&rcv->list); + kfree(rcv->fc); + kfree(rcv->mem); + kfree(rcv); + } + } + } + + return; +} + +/* + * This function is the RX packet handler bottom half. It runs on the fip wq. +*/ +void fip_discover_process_rx_bh(struct work_struct *work) +{ + struct fip_discover *discover = + container_of(work, struct fip_discover, pkt_rcv_task_bh); + int rc; + struct list_head discov_recv_local; + struct fip_rcv_pkt *rcv, *rcv1; + unsigned long flags; + + INIT_LIST_HEAD(&discov_recv_local); + + /* the irqsave is needed because debug kernel above 2.6.27 complains about + * hard irq safe to hard irq unsafe on discover.lock */ + spin_lock_irqsave(&discover->rcv_list.lock, flags); + list_replace_init(&discover->rcv_list.list, &discov_recv_local); + spin_unlock_irqrestore(&discover->rcv_list.lock, flags); + + if (discover->flush != FIP_NO_FLUSH) { + list_for_each_entry_safe(rcv, rcv1, &discov_recv_local, list) { + list_del(&rcv->list); + kfree(rcv->fc); + kfree(rcv->mem); + kfree(rcv); + } + return; + } + + list_for_each_entry_safe(rcv, rcv1, &discov_recv_local, list) { + rc = fip_discover_rx_packet_bh(discover, rcv->fc); + if (rc) + vnic_warn(discover->name, "discover_rx_packet rc %d\n", rc); + + list_del(&rcv->list); + kfree(rcv->fc); + kfree(rcv->mem); + kfree(rcv); + } + return; +} + +static inline int fip_close_all_vnics(struct fip_gw_data *gw, enum fip_flush flush) +{ + struct fip_vnic_data *vnic; + int open_vnics = 0; + + vnic_dbg_func(gw->discover->name); + + list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) { + open_vnics++; + fip_vnic_close(vnic, flush); + } + return open_vnics; +} + +static int fip_gw_create_vnics(struct fip_gw_data *gw) +{ + struct fip_vnic_data *vnic; + unsigned long first_free_vnic; + struct fip_vnic_send_info gw_address; + int i; + + gw->info.gw_num_vnics = (gw->info.gw_num_vnics > FIP_MAX_VNICS_PER_GW) ? + FIP_MAX_VNICS_PER_GW : gw->info.gw_num_vnics; + + + gw->info.gw_num_vnics = vnic_net_admin ? gw->info.gw_num_vnics : 0; + + fip_vnic_create_gw_param(&gw_address, gw->info.gw_qpn, VNIC_FIP_QKEY, + gw->info.gw_lid, gw->info.sl); + + /* for host admined */ + list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) { + if (vnic->hadmined) { + if (gw->info.hadmined_en) + fip_hadmin_vnic_refresh(vnic, &gw_address); + else { + vnic_dbg_fip(gw->discover->name, + "fip_gw_create_vnics hadmin disabled, " + "close open hadmin vnics\n"); + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + } + } + } + + /* for network admined */ + for (i = gw->vnic_count; i < gw->info.gw_num_vnics; i++) { + vnic_dbg_fip(gw->discover->name, "fip_gw_create_vnics available" + " vnics %d needed %d\n", + gw->vnic_count, gw->info.gw_num_vnics); + + /* start network assigned at half array. 
leave first half to host admin */ + first_free_vnic = find_first_zero_bit(gw->n_bitmask, + FIP_MAX_VNICS_PER_GW); + if (first_free_vnic >= FIP_MAX_VNICS_PER_GW) + return -ENOMEM; + + vnic = fip_vnic_alloc(gw->discover->port, gw, 0 /* hadmin */, first_free_vnic); + if (!vnic) + return -ENOMEM; + + fip_vnic_set_gw_param(vnic, &gw_address); + set_bit(first_free_vnic, gw->n_bitmask); + list_add_tail(&vnic->gw_vnics, &gw->vnic_list); + gw->vnic_count++; + + /* calls fip_vnic_fsm() */ + cancel_delayed_work(&vnic->vnic_task); + fip_vnic_fsm(&vnic->vnic_task.work); + } + + return 0; +} + +/* + * This function goes over vnics and closes network administrated vNics + * that are not open and do not receive neighbor table info (there + * is no way for the BXM to tell the vNics to close before the + * vnic is listening to the neighbour tables). +*/ +static int fip_gw_close_nonopen_vnics(struct fip_gw_data *gw) +{ + struct fip_vnic_data *vnic; + int closed_vnics = 0; + + vnic_dbg_fip(gw->discover->name, "Try to close non open vnics\n"); + + list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) { + vnic_dbg_fip(gw->discover->name, "check vnic %s, hadmin %d state %d\n", + vnic->name, vnic->hadmined, vnic->state); + if (!vnic->hadmined && vnic->state < FIP_VNIC_VHUB_DONE) { + vnic_dbg_fip(gw->discover->name, "closing vnic %s\n", vnic->name); + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + closed_vnics++; + } + } + + return closed_vnics; +} + +/* permanently delete all vnics pending delete. The function goes over + * the list of vnics awaiting deletion and tries to delete them. If the + * vnic destructor returns an error value (currently busy) the function + * will requeue it self for another try. The function will also test if + * new vnics need to be added as a result of vnic removal. + */ +static void fip_purge_vnics(struct work_struct *work) +{ + struct fip_gw_data *curr_gw = + container_of(work,struct fip_gw_data, vnic_cleanup_task.work); + struct fip_vnic_data *vnic, *tmp_vnic; + int vnic_id, rc, del_cnt = 0, retry = 0; + unsigned long *bitmask; + + vnic_dbg_fip(curr_gw->discover->name, "fip_purge_vnics\n"); + + list_for_each_entry_safe(vnic, tmp_vnic, &curr_gw->vnic_list, gw_vnics) { + enum fip_flush f; + vnic_id = vnic->vnic_id; + bitmask = vnic->hadmined ? NULL : curr_gw->n_bitmask; + + /* If successful vnic is removed from list and destroyed */ + f = vnic->flush; + if (f != FIP_NO_FLUSH) { + rc = fip_vnic_destroy(vnic); + if (!rc) { + del_cnt++; + if (f == FIP_FULL_FLUSH && bitmask) + clear_bit(vnic_id, bitmask); + } else { + retry |= rc; + } + } + + /* limit the number of vnics to purge in each loop to let other + * tasks on same wq to run (i.e., avoid starvation). + */ + if (del_cnt > 2) { + retry = 1; + break; + } + } + + /* This means we still have vnics that refuse to close retry later */ + if (retry){ + vnic_dbg_mark(); + /* calls fip_purge_vnics() */ + queue_delayed_work(fip_wq, &curr_gw->vnic_cleanup_task, HZ / 10); + } else { + vnic_dbg_fip(curr_gw->discover->name, "fip_purge_vnics, all GW" + " vnics closed\n"); + /* test and open new vnics if vnics are missing */ + /* ALITODO: after GW timeout, a vnic is re-created! why is that? + if (fip_gw_create_vnics(curr_gw)) { + vnic_dbg_mark(); + queue_delayed_work(fip_wq, + &curr_gw->vnic_cleanup_task, HZ); + } + */ + } +} + +/* + * This function adds or removes a single host admined vnic to a GW. + * First the function searches for the vnic. The search function + * disregards vnics that are undergoing a complete flush. 
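+ * Host-admin vNic IDs are marked by setting the top bit of the
+ * VNIC_ID_LEN-bit vnic_id (see below) before the lookup is done.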
+*/ +int fip_gw_update_hadmin_gw(struct fip_gw_data *gw, + struct fip_hadmin_cache *hadmin_entry) +{ + struct fip_vnic_data *vnic; + int vnic_id = hadmin_entry->vnic_id, rc = 0; + + /* set bit 16 for hadmin vNics (by spec) */ + vnic_id |= (1 << (VNIC_ID_LEN - 1)); + + vnic = fip_vnic_find_in_list(gw, vnic_id, hadmin_entry->mac, + hadmin_entry->vlan, + hadmin_entry->vlan_used); + + /* remove: if vNic found - remove it and exit */ + if (hadmin_entry->remove) { + if (vnic) + fip_vnic_close(vnic, FIP_FULL_FLUSH); + else + vnic_dbg_fip(gw->discover->name, "vNic to remove is" + " not found (name:%s mac:"MAC_6_PRINT_FMT + " vlan:%d id:%d)\n", + hadmin_entry->interface_name, + MAC_6_PRINT_ARG(hadmin_entry->mac), + hadmin_entry->vlan, vnic_id); + goto out; + } + + /* add: if vNic found - report error, otherwise add new vNic */ + if (vnic) { + /* skip error reporting between child vNics conflict, + * as vnic_learn_mac() may learn same child while it's still + * pending. TODO: improve this to avoid such cases. + */ + if (hadmin_entry->parent_used && vnic->parent_used) + goto out; + vnic_warn(gw->discover->name, "vNic creation failed, duplicate" + " vNic detected (name:%s mac:"MAC_6_PRINT_FMT + " vlan:%d id:%d & existing name:%s mac:" + MAC_6_PRINT_FMT" vlan:%d id:%d)\n", + hadmin_entry->interface_name, + MAC_6_PRINT_ARG(hadmin_entry->mac), + hadmin_entry->vlan, vnic_id, vnic->interface_name, + MAC_6_PRINT_ARG(vnic->login_data.mac), + vnic->login_data.vlan, vnic->login_data.vnic_id); + goto out; + } + +#if 0 + /* if the GW is in all_vlan mode, + * the host can only create vlans in this mode. + * However if it is not in all_vlan mode, the host must not create + * vlans in this mode */ + if ((gw->info.all_vlan_gw && !hadmin_entry->all_vlan_gw + && hadmin_entry->vlan_used) || + (!gw->info.all_vlan_gw && hadmin_entry->all_vlan_gw)) { + vnic_warn(gw->discover->name, "vnic creation failed, all_vlan" + " gateway policy must be enforced between the gateway" + " and the host\n"); + rc = -EINVAL; + goto out; + } +#endif + + vnic = fip_vnic_alloc(gw->discover->port, gw, 1 /* hadmin */, vnic_id); + if (!vnic) { + rc = -ENOMEM; + goto out; + } + + /* hand over info from hadmin to vnic struct */ + memcpy(vnic->login_data.mac, hadmin_entry->mac, sizeof(vnic->login_data.mac)); + memcpy(vnic->interface_name, hadmin_entry->interface_name, + sizeof(vnic->interface_name)); + vnic->login_data.vlan = hadmin_entry->vlan; + vnic->login_data.vp = hadmin_entry->vlan_used; + vnic->login_data.all_vlan_gw = hadmin_entry->all_vlan_gw; + memcpy(vnic->shared_vnic.ip, hadmin_entry->shared_vnic_ip, + sizeof(vnic->shared_vnic.ip)); + memcpy(vnic->shared_vnic.emac, hadmin_entry->shared_vnic_mac, + sizeof(vnic->shared_vnic.emac)); + vnic->shared_vnic.enabled = is_valid_ipv4(hadmin_entry->shared_vnic_ip); + vnic->vnic_id = vnic_id; /* will be overwritten later */ + vnic->vlan_used = hadmin_entry->vlan_used; + vnic->parent_used = hadmin_entry->parent_used; + memcpy(vnic->parent_name, hadmin_entry->parent_name, + sizeof(vnic->parent_name)); + vnic->qp_base_num = hadmin_entry->qp_base_num; + vnic->vlan = hadmin_entry->vlan; + vnic->cmd = hadmin_entry->cmd; + vnic->all_vlan_gw = hadmin_entry->all_vlan_gw; + + /* create dentry */ + rc = vnic_create_hadmin_dentry(vnic); + if (rc) + goto init_failed; + + rc = fip_vnic_hadmin_init(gw->discover->port, vnic); + if (rc) + goto init_failed; + + list_add_tail(&vnic->gw_vnics, &gw->vnic_list); + + /* calls fip_vnic_fsm() */ + fip_vnic_fsm(&vnic->vnic_task.work); + + return 0; + +init_failed: + 
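+	/* error unwind: the vNic was not yet added to gw->vnic_list, so it
+	 * is enough to drop its sysfs dentry and free it here.
+	 */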
vnic_delete_hadmin_dentry(vnic); + kfree(vnic); +out: + return rc; +} + +/* + * Queue the GW for deletion. And trigger a delayed call to the cleanup + * function. + * Note: This deletion method insures that all pending GW work requests + * are cleared without dependency of the calling context. +*/ +void fip_close_gw(struct fip_gw_data *gw, enum fip_flush flush) +{ + enum fip_flush tmp_flush = gw->hadmin_gw ? flush : FIP_FULL_FLUSH; + + if (tmp_flush == FIP_PARTIAL_FLUSH && gw->state < FIP_GW_HOST_ADMIN) + return; + + /* close already in process, disregard*/ + if (gw->flush >= tmp_flush) + return; + + gw->flush = tmp_flush; + gw->info.gw_num_vnics = 0; + cancel_delayed_work(&gw->gw_task); + + /* This is not mandatory but will save us time because there is a + * better chance that all vnics would be destroyed before trying to + * destroy the GW */ + fip_close_all_vnics(gw, tmp_flush); + + /* calls fip_purge_gws() */ + queue_delayed_work(fip_wq, &gw->discover->cleanup_task, DELAYED_WORK_CLEANUP_JIFFS); +} + +/* + * Free GW resources. This includes destroying the vnics. If the GW can be + * totally destroyed (no pending work for the GW and all the vnics have been + * destroyed) the GW will be removed from the GWs list and it's memory + * freed. If the GW can not be closed at this time it will not be freed + * and the function will return an error. + * In this case the caller needs to recall the unction to complete the + * operation. + * Do not call this function directly use: fip_close_gw + */ +static int fip_free_gw(struct fip_discover *discover, struct fip_gw_data *gw) +{ + struct fip_vnic_data *vnic; + int vnic_close_fail = 0; + + gw->info.gw_num_vnics = 0; + + if (delayed_work_pending(&gw->gw_task)) + return -EBUSY; + + list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) + vnic_close_fail |= (vnic->flush != FIP_NO_FLUSH); + + /* true if vnics need to be closed */ + /* if some of the vnics are still open return and retry later */ + if (vnic_close_fail) + return -EBUSY; + + if (delayed_work_pending(&gw->vnic_cleanup_task)) + return -EBUSY; + + /* + * it is possible that during gw removal we added the GW again. Test GW + * list to ensure it is not in the list already before adding it again. + */ + if (gw->state > FIP_GW_HOST_ADMIN) { + if (gw->info.gw_prot_new) + discover->new_prot_gws--; + else + discover->old_prot_gws--; + } + if (gw->flush == FIP_PARTIAL_FLUSH) { + gw->state = FIP_GW_HOST_ADMIN; + gw->flush = FIP_NO_FLUSH; + } else { + list_del(&gw->list); + kfree(gw); + } + + return 0; +} + +/* + * permanently delete all GWs pending delete. The function goes over + * the list of GWs awaiting deletion and tries to delete them. If the + * GW destructor returns an error value (currently busy) the function + * will requeue it self for another try. 
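+ *
+ * For reference, the teardown path that leads here looks roughly like:
+ *
+ *	fip_close_gw()               marks gw->flush and closes the GW vnics
+ *	  queue_delayed_work(fip_wq, &discover->cleanup_task, ...)
+ *	    fip_purge_gws()          walks discover->gw_list
+ *	      fip_free_gw()          returns -EBUSY while work or vnics are
+ *	                             still pending, in which case
+ *	                             fip_purge_gws() requeues itself with
+ *	                             DELAYED_WORK_CLEANUP_JIFFS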
+ */ +static void fip_purge_gws(struct work_struct *work) +{ + struct fip_discover *discover = + container_of(work, struct fip_discover, cleanup_task.work); + struct fip_gw_data *gw, *tmp_gw; + int gw_close_fail = 0; + + down_write(&discover->l_rwsem); + list_for_each_entry_safe(gw, tmp_gw, &discover->gw_list, list) { + if (gw->flush != FIP_NO_FLUSH) { + gw_close_fail |= fip_free_gw(discover, gw); + } + } + up_write(&discover->l_rwsem); + + /* This means we still have vnics that refuse to close, retry later */ + if (gw_close_fail) { + vnic_dbg_fip(discover->name, "still have open GWs\n"); + /* calls fip_purge_gws() */ + queue_delayed_work(fip_wq, &discover->cleanup_task, + DELAYED_WORK_CLEANUP_JIFFS); + } else { + vnic_dbg_fip(discover->name, "fip_purge_gws all gws" + " closed and freed\n"); + } +} + +static int fip_free_gw_done(struct fip_discover *discover, enum fip_flush flush) +{ + struct fip_gw_data *curr_gw; + int rc; + + down_read(&discover->l_rwsem); + if (flush == FIP_FULL_FLUSH) { + rc = list_empty(&discover->gw_list); + up_read(&discover->l_rwsem); + return rc; + } + + list_for_each_entry(curr_gw, &discover->gw_list, list) { + if (curr_gw->flush != FIP_NO_FLUSH) { + up_read(&discover->l_rwsem); + return 0; + } + } + + up_read(&discover->l_rwsem); + return 1; +} + +/* + * Go over the GW list and try to close the GWs. It is possible that some + * of the GWs have pending work and therefore can not be closed. We can not + * sleep on this because we might be running on the same context as the one + * we are waiting for. The user should call this function once and then test + * if the free is done by polling (must release wq context) fip_free_gw_done + */ +static int fip_free_gw_list(struct fip_discover *discover, enum fip_flush flush) +{ + struct fip_gw_data *curr_gw; + + down_read(&discover->l_rwsem); + list_for_each_entry(curr_gw, &discover->gw_list, list) + fip_close_gw(curr_gw, flush); + up_read(&discover->l_rwsem); + + vnic_dbg_fip(discover->name, "fip_free_gw_list not done\n"); + return 0; +} + +static inline void update_gw_address(struct fip_gw_data *gw, + struct fip_gw_data_info *new_gw_data) +{ + gw->info.gw_qpn = new_gw_data->gw_qpn; + gw->info.gw_lid = new_gw_data->gw_lid; + gw->info.gw_port_id = new_gw_data->gw_port_id; + gw->info.sl = new_gw_data->sl; + memcpy(gw->info.gw_guid, new_gw_data->gw_guid, sizeof gw->info.gw_guid); + + vnic_dbg_fip(gw->discover->name, "GW address was modified. 
" + "QPN: 0x%x, LID: 0x%x, guid: " GUID_FORMAT + "port id: %d, SL: %d\n", gw->info.gw_qpn, + gw->info.gw_lid, GUID_ARG(gw->info.gw_guid), + gw->info.gw_port_id, gw->info.sl); +} + +int fip_gw_modified(struct fip_gw_data *gw, + struct fip_gw_data_info *new_gw_data) +{ + char *name = gw->discover->name; + ASSERT(new_gw_data); + + vnic_dbg_fip(name, "fip_gw_modified called, gw_num_vnics %d -> %d\n", + gw->info.gw_num_vnics, new_gw_data->gw_num_vnics); + + if (memcmp(gw->info.gw_guid, new_gw_data->gw_guid, + sizeof(gw->info.gw_guid)) || + gw->info.gw_lid != new_gw_data->gw_lid || + gw->info.gw_port_id != new_gw_data->gw_port_id || + gw->info.gw_qpn != new_gw_data->gw_qpn || + gw->info.sl != new_gw_data->sl) { + /* In this case the GW address might be modified even + in 'good flow' */ + if (gw->info.gw_type == GW_TYPE_LAG && + gw->info.ext_lag.ucast) + update_gw_address(gw, new_gw_data); + else { + vnic_dbg_fip(name, "fip_gw_modified changing " + "unsupported parameter closing GW\n"); + fip_close_gw(gw, FIP_PARTIAL_FLUSH); + } + } else if (gw->info.gw_num_vnics < new_gw_data->gw_num_vnics) { + vnic_dbg_fip(name, "fip_gw_modified changing num " + "vnics from %d to %d\n", gw->info.gw_num_vnics, + new_gw_data->gw_num_vnics); + gw->info.gw_num_vnics = new_gw_data->gw_num_vnics; + if (fip_gw_create_vnics(gw)) + vnic_err(name, "fip_gw_create_vnics failed\n"); + + } else if (gw->info.gw_num_vnics > new_gw_data->gw_num_vnics) { + gw->info.gw_num_vnics = new_gw_data->gw_num_vnics; + fip_gw_close_nonopen_vnics(gw); + if (gw->vnic_count < gw->info.gw_num_vnics) + fip_gw_create_vnics(gw); + vnic_dbg_fip(name, "fip_gw_modified changing num " + "vnics from %d to %d\n", gw->info.gw_num_vnics, + new_gw_data->gw_num_vnics); + } else if (gw->info.n_rss_qpn != new_gw_data->n_rss_qpn) { + gw->info.n_rss_qpn = new_gw_data->n_rss_qpn; + vnic_dbg_fip(name, "fip_gw_modified changing n_rss_qpn " + "from %d to %d\n", gw->info.n_rss_qpn, + new_gw_data->n_rss_qpn); + } else if (gw->info.hadmined_en != new_gw_data->hadmined_en) { + if (fip_gw_create_vnics(gw)) + vnic_err(name, "fip_gw_create_vnics failed\n"); + } + + return 0; +} + +static inline int is_none_zero_guid(u8 *guid) +{ + int i; + u8 ored = 0; + + if (!guid) + return 0; + + for (i = 0; i < 8; ++i) + ored |= guid[i]; + + return !!ored; +} + +/* + * Look for a GW in the GW list. + * The search need one identifier to identify the Box (either GUID or system name) + * and one identifier for the external port (port_id or eport_name). + * This function uses what ever data is available for the search since + * various callers do not have access to a single pair of ids. + * use NULL for unknown strings and GW_PORT_ID_UNKNOWN for unknown port_id. + * GW that are undergoing complete flush are disregarded by the search. 
+ */ +struct fip_gw_data *fip_find_gw_in_list( + struct fip_discover *discover, + int port_id, + u8 *eport_name, + u8 *gw_guid, + u8 *system_guid, + u8 *system_name) +{ + struct fip_gw_data *curr_gw; + int use_guid = is_none_zero_guid(gw_guid); + int use_system_name = system_name && strlen(system_name) > 0; + int use_system_guid = is_none_zero_guid(system_guid); + int use_eport = eport_name && strlen(eport_name) > 0; + int use_port_id = port_id >= 0; + int port_id_pass; + int eport_match; + + if(!((use_eport || use_port_id) && + (use_guid || use_system_name || use_system_guid))) { + vnic_dbg_fip_v(discover->name, + "fip_find_gw_in_list not enough param for search\n"); + return NULL; + } + + if (use_system_name) + vnic_dbg_fip_v(discover->name, "system name %s\n", system_name); + + if (use_guid) + vnic_dbg_fip_v(discover->name, "gw guid "VNIC_GUID_FMT"\n", + VNIC_GUID_RAW_ARG(gw_guid)); + + if (use_system_guid) + vnic_dbg_fip_v(discover->name, "system guid "VNIC_GUID_FMT"\n", + VNIC_GUID_RAW_ARG(system_guid)); + + if (use_eport) + vnic_dbg_fip_v(discover->name, "eport %s\n", eport_name); + + if (use_port_id) + vnic_dbg_fip_v(discover->name, "port_id 0x%x\n", port_id); + + down_read(&discover->l_rwsem); + list_for_each_entry(curr_gw, &discover->gw_list, list) { + vnic_dbg_fip_v(discover->name, "check gw on eport %s, gw_guid "VNIC_GUID_FMT" " + "system_guid "VNIC_GUID_FMT", flush %d\n", + curr_gw->info.vol_info.gw_port_name, + VNIC_GUID_RAW_ARG(curr_gw->info.gw_guid), + VNIC_GUID_RAW_ARG(curr_gw->info.vol_info.system_guid), + curr_gw->flush); + + if (curr_gw->flush == FIP_FULL_FLUSH) + continue; + /* use the eport names only if you don't have port_id indexes + * This is in order to enable port_id changes. */ + port_id_pass = use_port_id && (curr_gw->info.gw_port_id != (u16)-1); + + eport_match = (use_eport && !port_id_pass && + !strncmp(curr_gw->info.vol_info.gw_port_name, + eport_name,VNIC_GW_PORT_NAME_LEN)) || + (port_id_pass && (port_id == curr_gw->info.gw_port_id)); + + if (!eport_match) + continue; + + if (use_guid && !memcmp(curr_gw->info.gw_guid, gw_guid, GUID_LEN)) + goto found; + + if (use_system_guid && + !memcmp(curr_gw->info.vol_info.system_guid, + system_guid, GUID_LEN)) + goto found; + + if(use_system_name && + !strncmp(curr_gw->info.vol_info.system_name, system_name, + VNIC_SYSTEM_NAME_LEN)) + goto found; + } + + up_read(&discover->l_rwsem); + vnic_dbg_fip(discover->name, "gw not found!\n"); + return NULL; +found: + up_read(&discover->l_rwsem); + return curr_gw; +} + +/* + * Alloc and init a new GW struct + */ +static struct fip_gw_data *fip_discover_create_gw(struct fip_discover *discover) +{ + struct fip_gw_data *gw_data; + + gw_data = kzalloc(sizeof(struct fip_gw_data), GFP_KERNEL); + if (!gw_data) + goto out; + + INIT_DELAYED_WORK(&gw_data->gw_task, fip_discover_gw_fsm); + INIT_DELAYED_WORK(&gw_data->vnic_cleanup_task, fip_purge_vnics); + INIT_LIST_HEAD(&gw_data->vnic_list); + gw_data->discover = discover; + mutex_init(&gw_data->mlock); + +out: + return gw_data; +} + +static void fip_discover_hadmin_update(struct work_struct *work) +{ + struct fip_discover *discover = + container_of(work, struct fip_discover, + hadmin_update_task.work); + struct fip_hadmin_cache *hadmin_entry; + struct fip_hadmin_cache *hadmin_tmp; + struct fip_gw_data *curr_gw; + struct list_head hadmin_head; + char *name; + int flush, used_guid, rc; + + /* move list from hadmin_cache to a temporary list */ + spin_lock_irq(&discover->lock); + list_replace(&discover->hadmin_cache, &hadmin_head); + 
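+	/* re-init the now empty cache head so new hadmin requests can be
+	 * queued while this snapshot is processed outside the lock */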
INIT_LIST_HEAD(&discover->hadmin_cache); + flush = discover->flush; + spin_unlock_irq(&discover->lock); + + if (flush != FIP_NO_FLUSH) + goto out; + + /* process hadmin list */ + list_for_each_entry_safe(hadmin_entry, hadmin_tmp, &hadmin_head, next) { + name = (char *)(hadmin_entry->interface_name); + vnic_dbg_mac(name, "parent_used %d, remove %d\n", + hadmin_entry->parent_used, + hadmin_entry->remove); + if (hadmin_entry->parent_used) { + rc = vnic_parent_update(discover->port, hadmin_entry->interface_name, + hadmin_entry->vnic_id, hadmin_entry->mac, + &(hadmin_entry->qp_base_num), + hadmin_entry->parent_name, + hadmin_entry->remove); + if (rc) + continue; + } + + used_guid = is_valid_guid(hadmin_entry->system_guid); + curr_gw = fip_find_gw_in_list(discover, NOT_AVAILABLE_NUM, + hadmin_entry->eport_name, + NULL, + used_guid ? hadmin_entry->system_guid : NULL, + used_guid ? NULL : hadmin_entry->system_name); + if (!hadmin_entry->remove) { + /* in case no GW or GW is being removed create a new one */ + if (!curr_gw || curr_gw->flush == FIP_FULL_FLUSH) { + curr_gw = fip_discover_create_gw(discover); + if (!curr_gw) { + vnic_warn(discover->name, "failed to create hadmin GW\n"); + continue; + } else { + down_write(&discover->l_rwsem); + list_add_tail(&curr_gw->list, &discover->gw_list); + up_write(&discover->l_rwsem); + } + + memcpy(curr_gw->info.vol_info.system_guid, + hadmin_entry->system_guid, GUID_LEN); + memcpy(curr_gw->info.vol_info.gw_port_name, + hadmin_entry->eport_name, + VNIC_GW_PORT_NAME_LEN); + if (used_guid) + strcpy(curr_gw->info.vol_info.system_name, + NOT_AVAILABLE_STRING); + else + memcpy(curr_gw->info.vol_info.system_name, + hadmin_entry->system_name, + VNIC_SYSTEM_NAME_LEN); + + curr_gw->info.gw_port_id = hadmin_entry->gw_port_id; + curr_gw->state = FIP_GW_HOST_ADMIN; + } + + curr_gw->hadmin_gw = 1; + fip_gw_update_hadmin_gw(curr_gw, hadmin_entry); + } else if(curr_gw) + fip_gw_update_hadmin_gw(curr_gw, hadmin_entry); + + list_del(&hadmin_entry->next); + kfree(hadmin_entry); + } + +out: + /* flush hadmin_tmp list and exit */ + list_for_each_entry_safe(hadmin_entry, hadmin_tmp, &hadmin_head, next) + kfree(hadmin_entry); +} + +int fip_gw_sysfs_show(struct vnic_port *port, char *buf) +{ + struct fip_gw_data *gw; + char *p = buf; + struct fip_discover *discover; + + mutex_lock(&port->start_stop_lock); + list_for_each_entry(discover, &port->fip.discover_list, discover_list) { + + down_read(&discover->l_rwsem); + + list_for_each_entry(gw, &discover->gw_list, list) { + p += _sprintf(p, buf, "IOA_PORT %s:%d\n", + gw->discover->port->dev->ca->name, + gw->discover->port->num); + p += _sprintf(p, buf, "BX_NAME %s\n", + gw->info.vol_info.system_name); + if (!(*(u64 *)(gw->info.vol_info.system_guid))) + p += _sprintf(p, buf, "BX_GUID %s\n", NOT_AVAILABLE_STRING); + else + p += _sprintf(p, buf, "BX_GUID "VNIC_GUID_FMT"\n", + VNIC_GUID_RAW_ARG(gw->info.vol_info.system_guid)); + p += _sprintf(p, buf, "EPORT_NAME %s\n", gw->info.vol_info.gw_port_name); + p += _sprintf(p, buf, "EPORT_ID %u\n", gw->info.gw_port_id); + p += _sprintf(p, buf, "STATE %s\n", + gw->state == FIP_GW_CONNECTED ? + "connected" : "disconnected"); + p += _sprintf(p, buf, "GW_TYPE %s\n", gw->info.gw_type == GW_TYPE_LAG ? + "AGGREGATED" : "LEGACY"); + p += _sprintf(p, buf, "PKEY 0x%x\n", discover->pkey); + p += _sprintf(p, buf, "ALL_VLAN %s\n", + gw->state == FIP_GW_CONNECTED ? + (gw->info.all_vlan_gw ? 
"yes" : "no") : NOT_AVAILABLE_STRING); + p += _sprintf(p, buf, "\n"); + } + + up_read(&discover->l_rwsem); + } + + mutex_unlock(&port->start_stop_lock); + return (p - buf); +} + +static int fip_discover_rx_advertise_bh(struct fip_discover *discover, + struct fip_gw_data *advertise_data) +{ + struct fip_gw_data *gw_data; + int update_entry = 0; + + /* see if we received advertise packets from this GW before */ + gw_data = fip_find_gw_in_list(discover, + advertise_data->info.gw_port_id, + advertise_data->info.vol_info.gw_port_name, + advertise_data->info.gw_guid, + advertise_data->info.vol_info.system_guid, + advertise_data->info.vol_info.system_name); + + /* + * GW not found in GW list. Create a new GW structure + * and add it to the GW list. + */ + if (!gw_data) { + gw_data = fip_discover_create_gw(discover); + if (!gw_data) { + vnic_dbg_fip(discover->name, "Could not create gw\n"); + return -ENOMEM; + } + gw_data->keep_alive_jiffies = jiffies; + + down_write(&discover->l_rwsem); + list_add_tail(&gw_data->list, &discover->gw_list); + up_write(&discover->l_rwsem); + update_entry = 1; + } else { + gw_data->keep_alive_jiffies = jiffies; + vnic_dbg_fip(discover->name, "gw_data->flush %d\n", gw_data->flush); + if (gw_data->flush != FIP_NO_FLUSH) + return 0; + + if (gw_data->state <= FIP_GW_MCAST_RCVD) + update_entry = 1; + } + + /* If GW is in multicast state (based on received mcast packet), + * replace it with the newer up-to-date packet info. + */ + if (update_entry) { + if (gw_data->state < FIP_GW_MCAST_RCVD) { + down_write(&discover->l_rwsem); + if (advertise_data->info.gw_prot_new) + discover->new_prot_gws++; + else + discover->old_prot_gws++; + up_write(&discover->l_rwsem); + } + memcpy(&gw_data->info, &advertise_data->info, + sizeof(struct fip_gw_data_info)); + gw_data->state = FIP_GW_MCAST_RCVD; + } else { + /* If the pc_id in the adv doesn't match the one + saved - there was a power cycle, so we want to close + the GW */ + if (advertise_data->info.ext_pc_id.valid && + (advertise_data->info.ext_pc_id.power_cycle_id != + gw_data->info.ext_pc_id.power_cycle_id)) { + vnic_dbg_fip_p0(discover->name, "received advertisement with " + "pc_id %llu when expecting %llu. closing the GW", + advertise_data->info.ext_pc_id.power_cycle_id, + gw_data->info.ext_pc_id.power_cycle_id); + fip_close_gw(gw_data, FIP_PARTIAL_FLUSH); + goto no_repost; + } + + /* TBD: enforce discard ?? */ + if (gw_data->info.gw_type != advertise_data->info.gw_type) + vnic_dbg_fip_p0(discover->name, "gateway type must not change\n"); + + /* update GW descriptors that do not require additional processing. 
+ These will be updated as part of GW_MODIFY flow */ + mutex_lock(&gw_data->mlock); + if (advertise_data->info.ext_pc_id.valid) + memcpy(&gw_data->info.ext_pc_id, &advertise_data->info.ext_pc_id, + sizeof(gw_data->info.ext_pc_id)); + + memcpy(&gw_data->info.vol_info, &advertise_data->info.vol_info, + sizeof(gw_data->info.vol_info)); + if (gw_data->info.ext_lag.valid) { + gw_data->info.ext_lag.hash = advertise_data->info.ext_lag.hash; + gw_data->info.ext_lag.ca = advertise_data->info.ext_lag.ca; + gw_data->info.ext_lag.ca_thresh = advertise_data->info.ext_lag.ca_thresh; + gw_data->info.ext_lag.weights_policy = advertise_data->info.ext_lag.weights_policy; + } + mutex_unlock(&gw_data->mlock); + } + + /* if multicast advertisement received */ + if (advertise_data->info.flags & FIP_RCV_MULTICAST) { + vnic_dbg_fip(discover->name, "FIP_RCV_MULTICAST ADVERTISE, state %d\n", + gw_data->state); + /* we are beyond accepting mcast advertisement */ + if (gw_data->state > FIP_GW_MCAST_RCVD) + goto out; + + vnic_dbg_fip(discover->name, "received mcast advertise sending" + " ucast solicit to GW qpn %d lid %d flags 0x%x\n", + gw_data->info.gw_qpn, gw_data->info.gw_lid, + gw_data->info.flags); + } else { /* unicast advertisement received */ + int ack_received = advertise_data->info.flags & FIP_GW_AVAILABLE; + + vnic_dbg_fip(discover->name, "received ucast advertise from GW " + "qpn %d lid %d flags 0x%x, ack_received %s " + "gw_num_vnics %d gw->state=%d, " + VNIC_GUID_FMT"\n", + gw_data->info.gw_qpn, gw_data->info.gw_lid, + gw_data->info.flags, ack_received ? "yes" : "no", + gw_data->info.gw_num_vnics, gw_data->state, + VNIC_GUID_RAW_ARG(gw_data->info.gw_guid)); + + /* if this is first ACK received */ + if (ack_received && gw_data->state <= FIP_GW_MCAST_RCVD) { + /* if GW was ACKed */ + fip_gw_create_vnics(gw_data); + gw_data->state = FIP_GW_CONNECTED; + } else if (ack_received && + (gw_data->state == FIP_GW_CONNECTED)) { + /* + * received an ACK and we are connected. we need to + * check for changes in GW and apply them if needed + */ + if (!fip_gw_modified(gw_data, &advertise_data->info)) + gw_data->state = FIP_GW_CONNECTED; + goto no_repost; + + } else if (!ack_received) { + fip_close_gw(gw_data, FIP_PARTIAL_FLUSH); + goto no_repost; + } + /* + * we don't accept ACKs in transient states. + * This should not be a problem since crowded multiple ACKs + * is not an expected flow, and if the packets are similar + * (no updates) it doesn't matter anyway. + */ + } + +out: + vnic_dbg_fip(discover->name, "out gw->state=%d\n", gw_data->state); + /* + * we will call the GW FSM to hadle + */ + cancel_delayed_work(&gw_data->gw_task); + fip_discover_gw_fsm(&gw_data->gw_task.work); +no_repost: + return 0; +} + +/* + * This function handles a single received packet that are expected to be + * GW advertisements or login ACK packets. The function first parses the + * packet and decides what is the packet type and then validates the packet + * according to its type. This functions runs in ka_wq task context. 
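+ *
+ * Note that this routine only classifies the frame; parsing and GW/vNic
+ * state updates are done later by fip_discover_rx_packet_bh(). A caller is
+ * expected to do roughly the following (sketch; in the driver the _bh call
+ * is deferred to task context rather than made inline):
+ *
+ *	int queue;
+ *
+ *	fip_discover_rx_packet(&queue, fc);
+ *	if (queue)
+ *		fip_discover_rx_packet_bh(discover, fc);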
+ */ +void fip_discover_rx_packet(int *queue, struct fip_content *fc) +{ + *queue = 0; + switch (fc->fh->subcode) { + case FIP_GW_ADV_SUB_OPCODE: + case FIP_GW_LOGIN_SUB_OPCODE: + *queue = 1; + break; + default: + break; + } +} + +/* + * Print FIP syndrome number and string + */ +static void fip_print_syndrome(struct fip_vnic_data *vnic, int synd) { + char *syndstr; + + switch (synd) { + case FIP_SYNDROM_HADMIN_REJECT: + syndstr = "FIP_SYNDROM_HADMIN_REJECT"; + break; + case FIP_SYNDROM_GW_RESRC: + syndstr = "FIP_SYNDROM_GW_RESRC"; + break; + case FIP_SYNDROM_NO_NADMIN: + syndstr = "FIP_SYNDROM_NO_NADMIN"; + break; + case FIP_SYNDROM_UNRECOGNISED_HOST: + syndstr = "FIP_SYNDROM_UNRECOGNISED_HOST"; + break; + case FIP_SYNDROM_UNSUPPORTED_PARAM: + syndstr = "FIP_SYNDROM_UNSUPPORTED_PARAM"; + break; + case FIP_SYNDROM_GW_IS_LAG_MEMBER: + syndstr = "FIP_SYNDROM_GW_IS_LAG_MEMBER"; + break; + case FIP_SYNDROM_DUPLICATE_ADDRESS: + syndstr = "FIP_SYNDROM_DUPLICATE_ADDRESS"; + break; + default: + syndstr = "FIP_OTHER"; + } + + vnic_warn(vnic->name, "SYNDROME 0x%x: %s\n", + synd, syndstr); +} + +static void handle_login_packet(struct fip_discover *discover, + struct fip_login_data *login_data) +{ + struct fip_gw_data *gw; + struct fip_vnic_data *vnic; + int mac_vlan_refused = 0; + int synd; + + /* find the GW that this login belongs to */ + gw = fip_find_gw_in_list(discover, + login_data->port_id, + NULL, + login_data->guid, + NULL, NULL); + if (!gw) + return; + + vnic = fip_vnic_find_in_list(gw, login_data->vnic_id, + login_data->mac, + login_data->vlan, + login_data->vp); + if (!vnic) + return; + + /* + * For host administered vNICs we must have login and login ack + * macs equal and different than all zeros. login and and login + * ack must agree on vlan presence. And if vlan is present, vlans + * must be indentical. Otherwise, the request is rejected. + */ + if (vnic->hadmined) { + if (!IS_ZERO_MAC(vnic->login_data.mac) && + memcmp(vnic->login_data.mac, login_data->mac, ETH_ALEN)) { + vnic_dbg_fip(discover->name, "fip_discover_rx_packet" + " host admined mac refused\n"); + mac_vlan_refused = 1; + } else if (vnic->login_data.all_vlan_gw != login_data->all_vlan_gw) + vnic_dbg_fip(discover->name, + "fip_discover_rx_packet host" + " host and GW disagree on all_vlan mode\n"); + /* If the host is not working in all_vlan_gw policy - + check the requested vlan against the accepted */ + else if (!gw->info.all_vlan_gw && + (vnic->login_data.vp != login_data->vp || + (login_data->vp == 1 && + vnic->login_data.vlan != login_data->vlan))) { + vnic_dbg_fip(discover->name, + "fip_discover_rx_packet host" + " admined vlan refused\n"); + mac_vlan_refused = 1; + } + } + + /* process a login packet for the specific vnic */ + synd = (int)login_data->syndrome; + if (synd || mac_vlan_refused) { + /* print syndrome as long as backlog limit is not exceeded */ + if (vnic->synd_backlog++ >= vnic_synd_backlog) + return; + + vnic_warn(discover->name, "%s login failed " + "(mac "MAC_6_PRINT_FMT" vlan %d) " + "backlog %d/%d\n", + (vnic->hadmined ? + (char *)vnic->interface_name : (char *)vnic->name), + MAC_6_PRINT_ARG(vnic->mac_cache), + (vnic->vlan_used ? 
vnic->vlan : -1), + vnic->synd_backlog, vnic_synd_backlog); + + if (mac_vlan_refused) + vnic_warn(vnic->name, "MAC/VLAN refused\n"); + + fip_print_syndrome(vnic, synd); + } else { + vnic->all_vlan_gw = !!((!vnic->hadmined && vnic->gw->info.all_vlan_gw) || + (vnic->hadmined && vnic->login_data.all_vlan_gw)); + fip_vnic_login_ack_recv(vnic, login_data); + } +} + +/* + * This function handles a single received packet that are expected to be + * GW advertisements or login ACK packets. The function first parses the + * packet and decides what is the packet type and then processes the packet + * according to its type. This functions runs in task context. + */ +int fip_discover_rx_packet_bh(struct fip_discover *discover, struct fip_content *fc) +{ + struct fip_gw_data *advertise_data = NULL; + struct fip_login_data *login_data = NULL; + int rc; + int ret = 0; + + switch (fc->fh->subcode) { + case FIP_GW_ADV_SUB_OPCODE: + advertise_data = kzalloc(sizeof *advertise_data, GFP_KERNEL); + if (!advertise_data) { + vnic_warn(discover->name, + "Failed to allocate %Zu bytes", + sizeof *advertise_data); + return -ENOMEM; + } + + rc = fip_advertise_parse_bh(discover, fc, advertise_data); + if (!rc) + ret = fip_discover_rx_advertise_bh(discover, + advertise_data); + kfree(advertise_data); + break; + + case FIP_GW_LOGIN_SUB_OPCODE: + login_data = kzalloc(sizeof *login_data, GFP_KERNEL); + if (!login_data) { + vnic_warn(discover->name, + "Failed to allocate %Zu bytes", + sizeof *login_data); + return -ENOMEM; + } + + rc = fip_login_parse(discover, fc, login_data); + if (!rc) + handle_login_packet(discover, login_data); + + kfree(login_data); + break; + default: + break; + } + + return ret; +} + +/* + * This function is a callback called upon successful join to a + * multicast group. The function checks if we have joined + attached + * to all required mcast groups and if so moves the discovery FSM to solicit. + */ +static void fip_discover_mcast_connect_cb(struct vnic_mcast *mcaste, void *ctx) +{ + struct fip_discover *discover = mcaste->priv_data; + + if (mcaste->cur_attached && mcaste->req_attach) { + vnic_dbg_parse(discover->name, "attached mask = 0x%lx, req mask = 0x%lx\n", + *mcaste->cur_attached, *mcaste->req_attach); + if ((*mcaste->cur_attached & *mcaste->req_attach) != + *mcaste->req_attach) { + return; + } + } + + discover->discover_mcast_attached_jiffies = jiffies; + set_bit(MCAST_ATTACHED, &discover->discover_mcast_state); + /* in the case of a reconnect don't change state or send a solicit + * packet + */ + if (discover->state < FIP_DISCOVER_SOLICIT) { + vnic_dbg_fip(discover->name, "fip_multicast_connected moved" + " state to solicit\n"); + spin_lock_irq(&discover->lock); + if (discover->flush == FIP_NO_FLUSH) { + /* delay sending solicit packet by 0-100 mSec */ + int rand_delay = jiffies % 100; /*get_random_int()*/ + discover->state = FIP_DISCOVER_SOLICIT; + cancel_delayed_work(&discover->fsm_task); + /* This is really (rand_delay / 1000) * HZ*/ + /* calls fip_discover_fsm() */ + queue_delayed_work(fip_wq, &discover->fsm_task, + (rand_delay * HZ) / 1000); + } + spin_unlock_irq(&discover->lock); + } + vnic_dbg_fip(discover->name, "discover_mcast_connect_cb done\n"); +} + +/* + * This function is a callback called upon a mcast deattach event. + * This event can be triggered due to discovery teardown or due to an async + * event. Currently this code does not participate in the discovery's FSM. 
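+ *
+ * The timestamp recorded here is consumed by fip_discover_gw_fsm(), which
+ * closes a GW once the group has stayed detached for too long, roughly:
+ *
+ *	if (!test_bit(MCAST_ATTACHED,
+ *		      &curr_gw->discover->discover_mcast_state) &&
+ *	    time_after(jiffies,
+ *		       curr_gw->discover->discover_mcast_detached_jiffies
+ *		       + 60 * HZ))
+ *		fip_close_gw(curr_gw, FIP_PARTIAL_FLUSH);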
+*/ +void fip_discover_mcast_deattach_cb(struct vnic_mcast *mcast, void *ctx) +{ +// struct vnic_mcast *mcast_other = ctx; + struct fip_discover *discover = mcast->priv_data; + + discover->discover_mcast_detached_jiffies = jiffies; + clear_bit(MCAST_ATTACHED, &discover->discover_mcast_state); + + vnic_dbg_fip(NULL, "fip_discover_mcast_deattach_cb\n"); +} + +/* + * Try to connect to the relevant mcast groups. If one of the mcast failed + * The function should be recalled to try and complete the join process + * (for the mcast groups that the join process was not performed). + * Note: A successful return of vnic_mcast_join means that the mcast join + * started, not that the join completed. completion of the connection process + * is asyncronous and uses a supplyed callback. + */ +static int fip_discover_mcast_connect(struct fip_discover *discover) +{ + struct vnic_mcast *mcaste_disc, *mcaste_sol, *mcaste; + int rc; + + mcaste_disc = vnic_mcast_alloc(discover->port, &discover->req_attach, &discover->cur_attached); + if (IS_ERR(mcaste_disc)) + return -EINVAL; + + mcaste_sol = vnic_mcast_alloc(discover->port, &discover->req_attach, &discover->cur_attached); + if (IS_ERR(mcaste_sol)) { + vnic_mcast_dealloc(mcaste_disc); + return -EINVAL; + } + + set_bit(FIP_MCAST_DISCOVER, &discover->req_attach); + set_bit(FIP_MCAST_SOLICIT, &discover->req_attach); + + mcaste = mcaste_disc; + mcaste->priv_data = discover; + mcaste->attach_bit_nr = FIP_MCAST_DISCOVER; + memcpy(mcaste->mac, ETH_BCAST_MAC, ETH_ALEN); + memcpy(&mcaste->gid, fip_discover_mgid, GID_LEN); + if (discover->pkey != 0xffff) + *(u16 *)&mcaste->gid.raw[6] = htons(discover->pkey | 0x8000); + memcpy(&mcaste->port_gid, &mcaste->gid, GID_LEN); + mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC); + mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC; + mcaste->retry = VNIC_MCAST_ULIMIT_RETRY; + mcaste->attach_cb = fip_discover_mcast_connect_cb; + mcaste->detach_cb = fip_discover_mcast_deattach_cb; + mcaste->attach_cb_ctx = mcaste_sol; + mcaste->detach_cb_ctx = mcaste_sol; + mcaste->pkey = discover->pkey; + mcaste->qkey = VNIC_FIP_QKEY; + mcaste->qp = discover->qp; + mcaste->blocking = 0; + mcaste->join_state = 1; + rc = vnic_mcast_add(&discover->mcast_tree, mcaste); + ASSERT(!rc); + rc = vnic_mcast_attach(&discover->mcast_tree, mcaste); /* MCAST_RECEIVE_ONLY */ + ASSERT(!rc); + + mcaste = mcaste_sol; + mcaste->priv_data = discover; + mcaste->attach_bit_nr = FIP_MCAST_SOLICIT; + memcpy(mcaste->mac, ETH_BCAST_MAC, ETH_ALEN); + memcpy(&mcaste->gid, fip_solicit_mgid, GID_LEN); + if (discover->pkey != 0xffff) + *(u16 *)&mcaste->gid.raw[6] = htons(discover->pkey | 0x8000); + memcpy(&mcaste->port_gid, &mcaste->gid, GID_LEN); + mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC); + mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC; + mcaste->retry = VNIC_MCAST_ULIMIT_RETRY; + mcaste->attach_cb = fip_discover_mcast_connect_cb; + mcaste->detach_cb = fip_discover_mcast_deattach_cb; + mcaste->attach_cb_ctx = mcaste_disc; + mcaste->detach_cb_ctx = mcaste_disc; + mcaste->pkey = discover->pkey; + mcaste->qkey = VNIC_FIP_QKEY; + mcaste->qp = discover->qp; + mcaste->blocking = 0; + mcaste->join_state = 1; + mcaste->sender_only = 1; + rc = vnic_mcast_add(&discover->mcast_tree, mcaste); + ASSERT(!rc); + rc = vnic_mcast_attach(&discover->mcast_tree, mcaste); /* MCAST_SEND_ONLY */ + ASSERT(!rc); + + return 0; +} + +int fip_discover_mcast_reattach(struct fip_discover *discover, + struct vnic_port *port) +{ + int flush; + + spin_lock_irq(&discover->lock); + 
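+	/* sample the flush state under the lock; re-attach is skipped
+	 * while the discover instance is being torn down */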
flush = discover->flush; + spin_unlock_irq(&discover->lock); + + if (flush == FIP_NO_FLUSH && + discover->state > FIP_DISCOVER_INIT) { + vnic_tree_mcast_detach(&discover->mcast_tree); + vnic_tree_mcast_attach(&discover->mcast_tree); + } + return 0; +} + +static void fip_discover_gw_fsm(struct work_struct *work) +{ + struct fip_gw_data *curr_gw = + container_of(work, struct fip_gw_data, gw_task.work); + unsigned long next_wakeup = curr_gw->info.gw_adv_period; + unsigned long rand = jiffies % 100 + 1; + int ret; + + if (curr_gw->flush != FIP_NO_FLUSH) + return; + + if (test_bit(MCAST_ATTACHED, + &curr_gw->discover->discover_mcast_state)) { + if (time_after(jiffies, curr_gw->keep_alive_jiffies + next_wakeup)) { + if (time_after(jiffies, + curr_gw->discover->discover_mcast_attached_jiffies + + next_wakeup)) { + fip_close_gw(curr_gw, FIP_PARTIAL_FLUSH); + return; + } + } + } else { + /* close gw if 1 minute has elapsed since mcast detach */ + if (time_after(jiffies, + curr_gw->discover->discover_mcast_detached_jiffies + + 60*HZ)) { + fip_close_gw(curr_gw, FIP_PARTIAL_FLUSH); + return; + } + } + + switch (curr_gw->state) { + case FIP_GW_HOST_ADMIN: + break; + case FIP_GW_MCAST_RCVD: + vnic_dbg_fip(curr_gw->discover->name, "DISCOVER_LOGIN GW_MCAST_RCVD\n"); + vnic_dbg_parse(curr_gw->discover->name, "new protocol %d\n", curr_gw->info.gw_prot_new); + ret = fip_solicit_send(curr_gw->discover, FIP_DISCOVER_UCAST, + curr_gw->info.gw_qpn, + curr_gw->info.gw_lid, + curr_gw->info.sl, + curr_gw->info.gw_prot_new, + curr_gw->info.gw_guid); + if (ret) + next_wakeup = (100 + rand * HZ) / 200; + else + next_wakeup = (100 + rand * HZ) / 25; + break; + case FIP_GW_CONNECTED: + vnic_dbg_fip(curr_gw->discover->name, "DISCOVER_LOGIN: GW_CONNECTED!!!\n"); + /* test vnic status */ + fip_gw_create_vnics(curr_gw); + break; + default: + ASSERT(0); + break; + } + + /* go to sleep until time out. We expect that we will be awaken by + * RX packets and never get to wake up due to timeout + */ + cancel_delayed_work(&curr_gw->gw_task); + queue_delayed_work(fip_wq, &curr_gw->gw_task, next_wakeup); +} + +static int is_new_solicit_prot(struct fip_discover *discover) +{ + vnic_dbg_parse(discover->name, "new gw %d, old gw %d\n", + discover->new_prot_gws, discover->old_prot_gws); + + if (!discover->old_prot_gws) { + if (!discover->new_prot_gws) { + /* mcast solicit sent before any + * advertise packets arrive. Use old format. + */ + return 0; + } else + return 1; + } + return 0; +} + +/* + * This is the discover finite state machine that runs the + * advertise and solicit packet exchange of the discovery + * proccess. 
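+ * The normal progression is, roughly:
+ *
+ *	FIP_DISCOVER_INIT     join the discover/solicit mcast groups
+ *	FIP_DISCOVER_SOLICIT  entered from the mcast attach callback,
+ *	                      sends the multicast solicit
+ *	per-GW handling       received advertisements create fip_gw_data
+ *	                      entries driven by fip_discover_gw_fsm()
+ *	FIP_DISCOVER_CLEAR    flush requested: free the GW list, then
+ *	FIP_DISCOVER_OFF      signal flush_complete and stop
+ *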
+ * It is assumed that this function is only called from work queue + * task context (for locking) + */ +static void fip_discover_fsm(struct work_struct *work) +{ + struct fip_discover *discover = + container_of(work, struct fip_discover, fsm_task.work); + struct vnic_port *port = discover->port; + int recall_time = -1, flush = discover->flush; + + /* we got a flush request and we have not performed it yet */ + if ((flush != FIP_NO_FLUSH) && + discover->state != FIP_DISCOVER_OFF) { + vnic_dbg_fip(discover->name, "discover_fsm switching to OFF\n"); + + recall_time = DELAYED_WORK_CLEANUP_JIFFS * 2; + + + if (discover->state != FIP_DISCOVER_CLEAR) { + fip_free_gw_list(discover, flush); + discover->state = FIP_DISCOVER_CLEAR; + } + + /* if we open GWs we will test again later */ + if (!fip_free_gw_done(discover, flush)) { + vnic_dbg_fip(discover->name, "fip_free_gw_list not done, recalling \n"); + goto recall_fsm; + } + + if (delayed_work_pending(&discover->cleanup_task)) + goto recall_fsm; + + vnic_dbg_fip(discover->name, "fip_free_gw_list done \n"); + vnic_dbg_mark(); + vnic_mcast_del_all(&discover->mcast_tree); + vnic_dbg_mark(); + discover->state = FIP_DISCOVER_OFF; + + /* signal the unload to continue */ + complete(&discover->flush_complete); + return; + } + + if (discover->state == FIP_DISCOVER_OFF) + return; + + if (!port->attr.lid) { + recall_time = 1 * HZ; + goto recall_fsm; + } + + switch (discover->state) { + int new_prot; + + case FIP_DISCOVER_INIT: + vnic_dbg_fip(discover->name, "FIP_DISCOVER_INIT\n"); + /* in init try and join the discover multicast group + * This is a preliminary request for all other progress + * will eventually call fip_discover_mcast_connect_cb() + */ + if (fip_discover_mcast_connect(discover)) { + vnic_warn(discover->name, "fip_discover_mcast_connect() " + "failed\n"); + recall_time = 1 * HZ; + } + break; + + case FIP_DISCOVER_SOLICIT: + new_prot = is_new_solicit_prot(discover); + vnic_dbg_fip(discover->name, "DISCOVER_SOLICIT\n"); + + /* send multicast solicit of type fip, if send is + * successfull move to login state and await advertise + * packets. It TX fail then retry + */ + fip_solicit_send(discover, FIP_DISCOVER_MCAST, 0, 0, 0, new_prot, NULL); + recall_time = FIP_RESOLICIT_TIME * HZ; + + break; + + case FIP_DISCOVER_OFF: + default: + ASSERT(0); + break; + + } + +recall_fsm: + if (recall_time >= 0) + queue_delayed_work(fip_wq, &discover->fsm_task, recall_time); + + return; +} + diff --git a/drivers/net/mlx4_vnic/vnic_fip_discover.h b/drivers/net/mlx4_vnic/vnic_fip_discover.h new file mode 100644 index 0000000000000..424e99dcccfd0 --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_fip_discover.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _FIP_DISCOVER_H +#define _FIP_DISCOVER_H + +#include "vnic.h" +#include "vnic_fip.h" + +/* TODO - rethink this */ +#define FIP_UD_MTU(ib_mtu) (ib_mtu - FIP_ENCAP_LEN - FIP_ETH_HEADER_LEN) +#define FIP_UD_BUF_SIZE(ib_mtu) (ib_mtu + IB_GRH_BYTES) + +#define FIP_MAX_BACKOFF_SECONDS 16 +#define FIP_MAX_VNICS_PER_GW (1 << 9) + +#define FIP_TIMEOUT_FACTOR(a) ((a)*5/2) + +enum fip_gw_state { + FIP_GW_HOST_ADMIN, + FIP_GW_MCAST_RCVD, /* got mcast advertise. did not receive ucast */ + FIP_GW_CONNECTED /* we are already connected. do nothing */ +}; + + +enum { + GW_TYPE_SINGLE_EPORT = 0, + GW_TYPE_LAG = 1, +}; + +struct gw_ext_boot { + int valid; + int boot_prio; + int timeout; +}; + +struct gw_ext_lag { + int valid; + int hash; /* enum gw_ext_lag_hash_policy */ + int weights_policy; + int member_ka; + int ca; /* conjestion aware */ + int ca_thresh; + int ucast; /* gw supports unicat keep alives */ +}; + + +struct gw_ext_pc_id { + int valid; + u64 power_cycle_id; +}; + +struct fip_gw_data_info { + struct fip_gw_volatile_info vol_info; + long gw_adv_period; /* timeout in jiffies */ + long gw_period; /* timeout in jiffies */ + long vnic_ka_period; /* in jiffies */ + int flags; + u32 gw_qpn; + u16 gw_lid; + u16 gw_port_id; + u16 gw_num_vnics; + u16 n_rss_qpn; + u8 sl; + u8 hadmined_en; + u8 all_vlan_gw; + u8 gw_vendor_id[VNIC_VENDOR_LEN+1]; + u8 gw_guid[GUID_LEN]; + int gw_type; + int gw_prot_new; + int ext_mask; + struct gw_ext_boot ext_boot; + struct gw_ext_lag ext_lag; + struct gw_ext_pc_id ext_pc_id; +}; + +struct fip_gw_data { + enum fip_flush flush; + int hadmin_gw; + struct mutex mlock; + struct fip_discover *discover; + struct list_head list; + unsigned long keep_alive_jiffies; + enum fip_gw_state state; + int vnic_count; + struct list_head vnic_list; + struct delayed_work gw_task; + struct delayed_work vnic_cleanup_task; + struct fip_gw_data_info info; + unsigned long n_bitmask[(FIP_MAX_VNICS_PER_GW >> 3) / + sizeof(unsigned long)]; +}; + +enum fip_gw_data_flags { + FIP_IS_FIP = 1 << 0, /* protocol type */ + FIP_RCV_MULTICAST = 1 << 1, /* received mcast packet */ + FIP_GW_AVAILABLE = 1 << 2, /* GW available bit set in pkt */ + FIP_HADMINED_VLAN = 1 << 3, /* H bit set in advertise pkt */ +}; + +/* + * TODO - we can do a nicer job here. stage 2 + * allocates memory and post receives + */ +int fip_post_discovery_rcv(struct vnic_port *port, + int ring_size, struct ib_qp *qp, + struct fip_ring *rx_ring); + +int fip_discover_mcast_reattach(struct fip_discover *discover, + struct vnic_port *port); + +/* + * This function handles a single received packet that are expected to be + * GW advertisements or login ACK packets. The function first parses the + * packet and decides what is the packet type and then handles the packets + * specifically according to its type. 
This functions runs in task context. +*/ +void fip_discover_rx_packet(int *queue, struct fip_content *fc); +int fip_discover_rx_packet_bh(struct fip_discover *discover, struct fip_content *fc); + +/* + * This function is the RX packet handler entry point at the thread level + * (unlike the completion handler that runs from interrupt context). + * the function calls a handler function and then reallocats the ring + * entry for the next receive. +*/ +void fip_discover_process_rx(struct fip_discover *discover); +void fip_discover_process_rx_bh(struct work_struct *work); + +/* This function creates an info string from GW attributes published + * by the GW in advertisement pkts */ +int fip_get_short_gw_info(struct fip_gw_data *gw, char *buff); + + +int fip_packet_parse(struct vnic_port *port, void *packet, int size, + struct fip_content *fc); + +#endif /* _FIP_DISCOVER_H */ diff --git a/drivers/net/mlx4_vnic/vnic_fip_ib.c b/drivers/net/mlx4_vnic/vnic_fip_ib.c new file mode 100644 index 0000000000000..9538abefeedf7 --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_fip_ib.c @@ -0,0 +1,440 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_fip.h" + +#define FIP_OP_RECV (1ul << 31) +/* TODO - rethink this */ +#define FIP_UD_MTU(ib_mtu) (ib_mtu - FIP_ENCAP_LEN - FIP_ETH_HEADER_LEN) +#define FIP_UD_BUF_SIZE(ib_mtu) (ib_mtu + IB_GRH_BYTES) + +static inline void fip_wr_pepare(struct vnic_port *port, + struct ib_send_wr *tx_wr, + struct ib_sge *tx_sge, + unsigned int wr_id, u64 mapping, + int size, u16 pkey_index) +{ + /* This is a fixed part */ + memset(tx_wr, 0, sizeof(struct ib_send_wr)); + tx_wr->num_sge = 1; + tx_wr->sg_list = tx_sge; + tx_wr->opcode = IB_WR_SEND; + tx_wr->send_flags = IB_SEND_SIGNALED; + tx_wr->wr.ud.pkey_index = pkey_index; + tx_wr->wr_id = wr_id; + + memset(tx_sge, 0, sizeof(struct ib_sge)); + tx_sge->lkey = port->mr->lkey; + tx_sge->addr = mapping; + tx_sge->length = size; +} + +/* + * send a single multicast packet. + * return 0 on success, other on failure. 
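+ *
+ * The frame must already be DMA mapped; wr_id/mapping describe that TX ring
+ * entry, and the address handle and qkey are taken from the vnic_mcast
+ * entry. A typical call is roughly (index/mapping/size names illustrative):
+ *
+ *	rc = fip_mcast_send(port, discover->qp, tx_index, tx_mapping,
+ *			    pkt_size, pkey_index, mcaste);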
+*/ +int fip_mcast_send(struct vnic_port *port, + struct ib_qp *qp, + unsigned int wr_id, + u64 mapping, + int size, + u16 pkey_index, + struct vnic_mcast *mcast) +{ + struct ib_send_wr *bad_wr; + struct ib_sge tx_sge; + struct ib_send_wr tx_wr; + int ret; + + fip_wr_pepare(port, &tx_wr, &tx_sge, wr_id, mapping, size, pkey_index); + + tx_wr.wr.ud.ah = mcast->ah; + tx_wr.wr.ud.remote_qpn = 0xFFFFFFFF; /*dest_qpn; */ + tx_wr.wr.ud.remote_qkey = mcast->qkey; + + ret = ib_post_send(qp, &tx_wr, &bad_wr); + + return ret; +} + +const unsigned char gid_prefix[8]={ + 0xfe, 0x80, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00 +}; + +/* + * send a single unicast packet. + * return 0 on success, other on failure. + */ +int fip_ucast_send(struct vnic_port *port, + struct ib_qp *qp, + unsigned int wr_id, + u64 mapping, + int size, + u16 pkey_index, u32 dest_qpn, u16 dlid, + u32 qkey, u8 sl, + unsigned char *dguid) +{ + struct ib_send_wr *bad_wr; + struct ib_ah *new_ah = NULL; + struct ib_sge tx_sge; + struct ib_send_wr tx_wr; + struct ib_ah_attr ah_attr; + int ret; + + fip_wr_pepare(port, &tx_wr, &tx_sge, wr_id, mapping, size, pkey_index); + + memset(&ah_attr, 0, sizeof ah_attr); + ah_attr.dlid = dlid, + ah_attr.port_num = port->num, + ah_attr.sl = sl & 0xf, + ah_attr.ah_flags = IB_AH_GRH; + memcpy(&ah_attr.grh.dgid.raw[0], gid_prefix, 8); + memcpy(&ah_attr.grh.dgid.raw[8], dguid, 8); + + new_ah = ib_create_ah(port->pd, &ah_attr); + if (IS_ERR(new_ah)) + return -ENOMEM; + + tx_wr.wr.ud.ah = new_ah; + tx_wr.wr.ud.remote_qpn = dest_qpn; + tx_wr.wr.ud.remote_qkey = qkey; + + ret = ib_post_send(qp, &tx_wr, &bad_wr); + + ib_destroy_ah(new_ah); + + return ret; +} + +/* + * This is a general purpose CQ completion function that handles + * completions on RX and TX rings. It can serve all users that are + * using RX and TX rings. + * RX completions are destinguished from TX comp by the MSB that is set + * for RX and clear for TX. For RX, the memory is unmapped from the PCI, + * The head is incremented. For TX the memory is unmapped and then freed. + * The function returns the number of packets received. +*/ +int fip_comp(struct vnic_port *port, + struct ib_cq *cq, + struct fip_ring *rx_ring, + struct fip_ring *tx_ring, + char *name) +{ +#define FIP_DISCOVER_WC_COUNT 4 + struct ib_wc ibwc[FIP_DISCOVER_WC_COUNT]; + int wrid, n, i; + int mtu_size = FIP_UD_BUF_SIZE(port->max_mtu_enum); + int rx_count = 0; + struct ib_device *dev = port->dev->ca; + + do { + /* + * poll for up to FIP_DISCOVER_WC_COUNT in one request. + * returns the number of WC actually polled + */ + n = ib_poll_cq(cq, FIP_DISCOVER_WC_COUNT, ibwc); + for (i = 0; i < n; ++i) { + /* + * use a mask on the id to decide if this is a receive + * or transmit WC + */ + if (ibwc[i].wr_id & FIP_OP_RECV) { + wrid = ibwc[i].wr_id & ~FIP_OP_RECV; + + ib_dma_sync_single_for_cpu(dev, + rx_ring->ring[wrid].bus_addr, + mtu_size, + DMA_FROM_DEVICE); + + if (likely(ibwc[i].status == IB_WC_SUCCESS)) { + rx_ring->ring[wrid].length = + ibwc[i].byte_len; + rx_count++; + } else + rx_ring->ring[wrid].entry_posted = 0; + + rx_ring->head++; + } else { /* TX completion */ + unsigned long flags; + wrid = ibwc[i].wr_id; + + /* unmap and free transmitted packet */ + ib_dma_unmap_single(dev, + tx_ring->ring[wrid]. 
+ bus_addr, tx_ring->ring[wrid].length, + DMA_TO_DEVICE); + + kfree(tx_ring->ring[wrid].mem); + + tx_ring->ring[wrid].length = 0; + spin_lock_irqsave(&tx_ring->head_tail_lock, flags); + tx_ring->tail++; + spin_unlock_irqrestore(&tx_ring->head_tail_lock, flags); + } + } + } while (n == FIP_DISCOVER_WC_COUNT); + + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + + return rx_count; +} + +/* qonfigure a newly allocated QP and move it + * from reset->init->RTR->RTS + */ +int fip_init_qp(struct vnic_port *port, struct ib_qp *qp, u16 pkey_index, char *name) +{ + struct ib_qp_attr qp_attr; + int attr_mask; + + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qkey = VNIC_FIP_QKEY; + qp_attr.port_num = port->num; + qp_attr.pkey_index = pkey_index; + attr_mask = IB_QP_QKEY | IB_QP_PORT | IB_QP_PKEY_INDEX | IB_QP_STATE; + + if (ib_modify_qp(qp, &qp_attr, attr_mask)) + goto out_fail; + + qp_attr.qp_state = IB_QPS_RTR; + attr_mask &= ~IB_QP_PORT; + if (ib_modify_qp(qp, &qp_attr, attr_mask)) + goto out_fail; + + qp_attr.qp_state = IB_QPS_RTS; + qp_attr.sq_psn = 0; + attr_mask |= IB_QP_SQ_PSN; + attr_mask &= ~IB_QP_PKEY_INDEX; + if (ib_modify_qp(qp, &qp_attr, attr_mask)) + goto out_fail; + + return 0; + +out_fail: + qp_attr.qp_state = IB_QPS_RESET; + if (ib_modify_qp(qp, &qp_attr, IB_QP_STATE)) + vnic_warn(name, "failed to modify QP to RESET state\n"); + + return -EINVAL; +} + +void fip_qp_to_reset(struct ib_qp *qp, char *name) +{ + struct ib_qp_attr qp_attr; + + qp_attr.qp_state = IB_QPS_RESET; + if (ib_modify_qp(qp, &qp_attr, IB_QP_STATE)) + vnic_warn(name, "Failed to modify QP to RESET state\n"); + return; +} + +/* + * alloc a single buffer, map it and post it to the qp. + * id used to identify entry in receive queue. + */ +int fip_post_receive(struct vnic_port *port, struct ib_qp *qp, int size, + int _id, struct fip_ring_entry *mem_entry, char *name) +{ + struct ib_recv_wr rx_wr, *bad_wr; + struct ib_sge rx_sge; + int rc; + + rx_wr.wr_id = _id | FIP_OP_RECV; + rx_wr.next = NULL; + rx_wr.sg_list = &rx_sge; + rx_wr.num_sge = 1; + rx_sge.addr = mem_entry->bus_addr; + rx_sge.length = size; + rx_sge.lkey = port->mr->lkey; + + ib_dma_sync_single_for_device(port->dev->ca, rx_sge.addr, + FIP_UD_BUF_SIZE(port->max_mtu_enum), + DMA_FROM_DEVICE); + + rc = ib_post_recv(qp, &rx_wr, &bad_wr); + if (unlikely(rc)) { + vnic_warn(name, "post receive failed for buf rc %d (id %d)\n", _id, rc); + goto post_recv_failed; + } + mem_entry->entry_posted = 1; + return 0; + +post_recv_failed: + mem_entry->entry_posted = 0; + return -EIO; +} + +void fip_flush_rings(struct vnic_port *port, + struct ib_cq *cq, + struct ib_qp *qp, + struct fip_ring *rx_ring, + struct fip_ring *tx_ring, + char *name) +{ + vnic_dbg_fip(name, "fip_qp_to_err called\n"); + if (qp) { + fip_qp_to_reset(qp, name); + fip_comp(port, cq, rx_ring, tx_ring, name); + } +} + +void fip_free_rings(struct vnic_port *port, + struct fip_ring *rx_ring, + struct fip_ring *tx_ring, + char *name) +{ + struct ib_device *dev = port->dev->ca; + int i; + + for (i = rx_ring->size - 1; i >= 0; --i) { + if (rx_ring->ring[i].mem) { + ib_dma_unmap_single(dev, + rx_ring->ring[i].bus_addr, + FIP_UD_BUF_SIZE(port->max_mtu_enum), + DMA_FROM_DEVICE); + kfree(rx_ring->ring[i].mem); + } + } + rx_ring->size = 0; + + for (i = tx_ring->size - 1; i >= 0; --i) + if (tx_ring->ring[i].length != 0) { + ib_dma_unmap_single(dev, + tx_ring->ring[i].bus_addr, + tx_ring->ring[i].length, + DMA_TO_DEVICE); + kfree(tx_ring->ring[i].mem); + } + tx_ring->size = 0; + + vnic_dbg_fip(name, "Done cleaning RX and TX 
queues\n"); + + kfree(rx_ring->ring); + rx_ring->ring = NULL; + kfree(tx_ring->ring); + tx_ring->ring = NULL; +} + +/* + * TODO - we can do a nicer job here. stage 2 + * allocates memory and post receives + * TODO2: need to handle the bad flow to free all existing entries in the ring + */ +int fip_init_rx(struct vnic_port *port, + int ring_size, + struct ib_qp *qp, + struct fip_ring *rx_ring, + char *name) +{ + struct ib_device *dev = port->dev->ca; + int i, rc = 0, mtu_size = FIP_UD_BUF_SIZE(port->max_mtu_enum); + + rx_ring->size = ring_size; + rx_ring->ring = kzalloc(rx_ring->size * + sizeof(struct fip_ring_entry), + GFP_KERNEL); + if (!rx_ring->ring) { + vnic_warn(name, "failed to alloc fip RX ring, size %d\n", rx_ring->size); + rx_ring->size = 0; + return -ENOMEM; + } + + /* allocate the ring entries */ + for (i = 0; i < rx_ring->size; i++) { + rx_ring->ring[i].mem = kmalloc(mtu_size, GFP_KERNEL); + if (unlikely(!rx_ring->ring[i].mem)) { + rc = -ENOMEM; + goto error; + } + + rx_ring->ring[i].entry_posted = 0; + rx_ring->ring[i].length = mtu_size; + rx_ring->ring[i].bus_addr = ib_dma_map_single(dev, + rx_ring->ring[i].mem, + mtu_size, DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(dev, rx_ring->ring[i].bus_addr))) { + rc = -ENODEV; + goto dma_error; + } + + if (fip_post_receive(port, qp, FIP_UD_BUF_SIZE(port->max_mtu_enum), + i, rx_ring->ring + i, name)) { + rc = -EIO; + goto post_recv_failed; + } + } + + rx_ring->head = 0; + rx_ring->tail = 0; + spin_lock_init(&rx_ring->head_tail_lock); + spin_lock_init(&rx_ring->ring_lock); + return 0; + +post_recv_failed: + ib_dma_unmap_single(dev, rx_ring->ring[i].bus_addr, + mtu_size, DMA_FROM_DEVICE); +dma_error: + kfree(rx_ring->ring[i].mem); + rx_ring->ring[i].mem = NULL; +error: + /* previous entries need to be freed after flushing the QP */ + return rc; +} + +/* + * This function allocates the tx buffers and initializes the head and + * tail indexes. + */ +int fip_init_tx(int size, struct fip_ring *tx_ring, char *name) +{ + tx_ring->size = size; + tx_ring->ring = kzalloc(tx_ring->size * + sizeof(struct fip_ring_entry), + GFP_KERNEL); + + if (!tx_ring->ring) { + vnic_warn(name, "failed to alloc fip TX ring, size %d\n", + tx_ring->size); + tx_ring->size = 0; + return -ENOMEM; + } + + tx_ring->head = 0; + tx_ring->tail = 0; + spin_lock_init(&tx_ring->head_tail_lock); + spin_lock_init(&tx_ring->ring_lock); + return 0; +} + diff --git a/drivers/net/mlx4_vnic/vnic_fip_login.c b/drivers/net/mlx4_vnic/vnic_fip_login.c new file mode 100644 index 0000000000000..d9757c70b6710 --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_fip_login.c @@ -0,0 +1,1727 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_fip.h" +#include "vnic_fip_discover.h" +#include "vnic_fip_pkt.h" + +#ifndef work_pending /* back-port */ +#define work_pending(_work) test_bit(0, &(_work)->pending) +#endif + +enum { + VNIC_LOGIN_REG_NETDEV_PENDING, + VNIC_LOGIN_REG_NETDEV_DONE, + VNIC_LOGIN_DESTROY_PENDING, + VNIC_LOGIN_DESTROY_DONE, + VNIC_LOGIN_DESTROY_FULL +}; + +static int fip_vnic_rings_create(struct vnic_port *port, + struct fip_vnic_data *vnic); +static void fip_vnic_rings_destroy(struct fip_vnic_data *vnic); +static void fip_vnic_recv(struct fip_vnic_data *vnic); + +#ifdef _BP_HR_TIMER +int fip_vnic_keepalive(struct hrtimer * timer); +#else +enum hrtimer_restart fip_vnic_keepalive(struct hrtimer * timer); +#endif +int fip_vnic_keepalive_send(struct fip_vnic_data *vnic, int source); + + +#define QUEUE_VNIC_DWORK(vnic, task, time) \ +do { \ + unsigned long flags; \ + spin_lock_irqsave(&vnic->lock, flags); \ + if (likely(vnic->flush == FIP_NO_FLUSH)) \ + queue_delayed_work(fip_wq, task, time); \ + spin_unlock_irqrestore(&vnic->lock, flags); \ +} while(0) + +#define REQUEUE_VNIC_DWORK(vnic, task, time) \ +do { \ + cancel_delayed_work(task); \ + QUEUE_VNIC_DWORK(vnic, task, time); \ +} while(0); + + +/* + * Look for a vnic in the GW vnic list. The search key used is either the vnic_id + * that is unique, or the mac+vlan pair. A match on either key will result in the + * return of the vnic. both keys are nesesary because host assigned delete + * flow might not have access to the vnic_id. The search disregards vnics that + * are undergoing full flush (they will be removed soon). +*/ +struct fip_vnic_data *fip_vnic_find_in_list(struct fip_gw_data *gw, u16 vnic_id, + u8 *mac, u16 vlan, u8 vlan_used) +{ + struct fip_vnic_data *vnic; + int use_mac = mac ? 1 : 0; + int vlan_match; + + ASSERT(gw); + + if (list_empty(&gw->vnic_list)) + return NULL; + + /* do not use MAC 0:..:0 for vnic matches */ + if (use_mac) + use_mac = !IS_ZERO_MAC(mac); + + list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) { + if (vnic->flush == FIP_FULL_FLUSH) + continue; + + if (vnic->vnic_id == vnic_id) + return vnic; + + if (vlan_used != vnic->login_data.vp) + continue; + + vlan_match = !vlan_used || + (vlan_used && (vlan == vnic->login_data.vlan)); + + if ((use_mac && !memcmp(vnic->login_data.mac, mac, ETH_ALEN)) && + vlan_match) + return vnic; + } + return NULL; +} + +/* + * This function handles completions of both TX and RX + * packets of vnics. RX packets are unmapped lightly parsed moved to a list + * and passed to thread processing. TX packets are unmapped and freed. + * Note: this function is called from interrupt context + */ +static void fip_vnic_comp(struct ib_cq *cq, void *vnic_ptr) +{ + struct fip_vnic_data *vnic = vnic_ptr; + + /* handle completions. 
On RX packets this will call vnic_recv + * from thread context to continue processing */ + if (fip_comp(vnic->port, vnic->cq, &vnic->rx_ring, + &vnic->tx_ring, vnic->name)) + fip_vnic_recv(vnic); + + fip_vnic_keepalive_send(vnic, 0); +} + +/* + * read the state of the gw eport. This can be done from any context and therefore + * requires protection. +*/ +int fip_vnic_get_eport_state(struct fip_vnic_data *vnic) +{ + int i; + + if (no_bxm) + return 1; + + if (vnic->gw->info.gw_type == GW_TYPE_LAG) { + for (i = 0; i < MAX_LAG_MEMBERS; i++) { + if (!(vnic->lm.used_bitmask & 1 << i)) + continue; + + if (vnic->lm.memb[i].eport_state) + return 1; + } + return 0; + } else { + return atomic_read(&vnic->eport_state); + } +} + +/* + * get GW info funcs. +*/ +int fip_vnic_get_bx_name(struct fip_vnic_data *vnic, char *buff) +{ + struct fip_gw_data *gw = vnic->gw; + struct fip_gw_volatile_info tmp_info; + int rc; + + if (!gw) + return -EINVAL; + + mutex_lock(&gw->mlock); + memcpy(&tmp_info, &gw->info.vol_info, sizeof(tmp_info)); + mutex_unlock(&gw->mlock); + + rc = sprintf(buff, "%s", tmp_info.system_name); + + return rc < 0 ? rc : 0; +} + +int fip_vnic_get_bx_guid(struct fip_vnic_data *vnic, char *buff) +{ + struct fip_gw_data *gw = vnic->gw; + struct fip_gw_volatile_info tmp_info; + void *rc; + + memset(buff, 0, sizeof *buff); + + if (!gw) + return -EINVAL; + + mutex_lock(&gw->mlock); + memcpy(&tmp_info, &gw->info.vol_info, sizeof(tmp_info)); + mutex_unlock(&gw->mlock); + + rc = memcpy(buff, tmp_info.system_guid, GUID_LEN); + + return rc ? 0 : -EINVAL; +} + +int fip_vnic_get_all_vlan_mode(struct fip_vnic_data *vnic, char *buff) +{ + struct fip_gw_data *gw = vnic->gw; + int rc; + + if (!gw) + return -EINVAL; + + rc = sprintf(buff, "%s", gw->info.all_vlan_gw ? "yes" : "no"); + + return rc < 0 ? rc : 0; +} + +int fip_vnic_get_eport_name(struct fip_vnic_data *vnic, char *buff) +{ + + struct fip_gw_data *gw = vnic->gw; + struct fip_gw_volatile_info tmp_info; + int rc; + + if (!gw) + return -EINVAL; + + mutex_lock(&gw->mlock); + memcpy(&tmp_info, &gw->info.vol_info, sizeof(tmp_info)); + mutex_unlock(&gw->mlock); + + rc = sprintf(buff, "%s", tmp_info.gw_port_name); + + return rc < 0 ? rc : 0; +} + +u8 fip_vnic_get_bx_sl(struct fip_vnic_data *vnic) +{ + return vnic->gw->info.sl; +} + +/* + * get GW info funcs. +*/ +int fip_vnic_get_gw_type(struct fip_vnic_data *vnic) +{ + struct fip_gw_data *gw = vnic->gw; + int lag = 0; + + if (!gw) + return -EINVAL; + + lag = gw->info.gw_type == GW_TYPE_LAG; + + return lag; +} + +/* + * get GW info funcs. +*/ +int fip_vnic_get_lag_eports(struct fip_vnic_data *vnic, char *buf) +{ + struct fip_gw_data *gw = vnic->gw; + struct lag_member *member; + char *p = buf; + int i; + + if (!gw) + return -EINVAL; + + if (!gw || (gw->info.gw_type != GW_TYPE_LAG)) + return -EINVAL; + + p += _sprintf(p, buf, "LAG_MEMBER_INFORMATION:\n"); + for (i = 0; i < MAX_LAG_MEMBERS; i++) { + if (!(vnic->lm.used_bitmask & 1 << i)) + continue; + + member = &vnic->lm.memb[i]; + p += _sprintf(p, buf, " %.2d ID=%.3X LID=%4X QPN=%8X STATE=%s\n", + i, member->gw_port_id, member->lid, member->qpn, + member->eport_state ? "UP" : "DOWN"); + } + + return p - buf; +} + +/* + * process an incoming login ack packet. The packet was already parsed and + * its data was placed in *data. The function creates RX and TX rings for the + * vnic and starts the multicast join procedure. + * This function should not be called for packets other then login ack packets. 
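 + * On a valid ack the vNic FSM is advanced immediately via a direct call to + * fip_vnic_fsm() rather than waiting for the delayed work to fire.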
+ */ +void fip_vnic_login_ack_recv(struct fip_vnic_data *vnic, + struct fip_login_data *data) +{ + /* we allow login acks only in the wait-for-ack state; in other + * states we ignore them */ + if (vnic->state != FIP_VNIC_WAIT_4_ACK) { + vnic_dbg_fip_v(vnic->name, + "vnic_login_ack_recv in state other" + " than FIP_VNIC_WAIT_4_ACK state %d\n", + vnic->state); + return; + } + + /* For LAG vnics, process login ack member data */ + if (vnic->gw->info.gw_type == GW_TYPE_LAG) + handle_member_update(vnic, &data->lagm); + + memcpy(&vnic->login_data, data, sizeof(vnic->login_data)); + + vnic->state = FIP_VNIC_RINGS_INIT; + + /* calls fip_vnic_fsm() */ + cancel_delayed_work(&vnic->vnic_task); + fip_vnic_fsm(&vnic->vnic_task.work); + // REQUEUE_VNIC_DWORK(vnic, &vnic->vnic_task, 0); + return; +} + +/* + * This is a helper function we use in order to move the login create + * to another context so we don't block the fip thread for too long. + * The call stack triggered by this function calls register_netdev that + * might block for some time when netdevs are removed in parallel. This + * stalls the fip_wq which causes KA not to be sent. +*/ +void fip_vnic_login_create(struct work_struct *work) +{ + struct fip_vnic_data *vnic = + container_of(work, struct fip_vnic_data, vnic_login_create_task); + char *name = NULL; + int rc; + + if (vnic->hadmined) + name = vnic->interface_name; + + rc = vnic_login_register_netdev(vnic, vnic->mac_cache, name); + + spin_lock_irq(&vnic->lock); + clear_bit(VNIC_LOGIN_REG_NETDEV_PENDING, &vnic->login_status); + if (!rc) + set_bit(VNIC_LOGIN_REG_NETDEV_DONE, &vnic->login_status); + spin_unlock_irq(&vnic->lock); +} + +/* + * Test whether the create request posted earlier has completed. + * Returns 0 on success, -EAGAIN if it is still pending and + * -EINVAL if it failed. If retry is set, a new create attempt + * is requeued and the function returns -EAGAIN. +*/ +static int fip_vnic_test_login(struct fip_vnic_data *vnic, int retry) +{ + int ret = 0; + + spin_lock_irq(&vnic->lock); + + if (!test_bit(VNIC_LOGIN_REG_NETDEV_DONE, &vnic->login_status)) { + /* queue retry login create request */ + if (retry) { + if (!test_and_set_bit(VNIC_LOGIN_REG_NETDEV_PENDING, + &vnic->login_status)) { + memcpy(vnic->mac_cache, vnic->login_data.mac, ETH_ALEN); + vnic->vlan_used = vnic->login_data.vp; + vnic->vlan = vnic->login_data.vlan; + vnic->all_vlan_gw = vnic->login_data.all_vlan_gw; + + /* calls fip_vnic_login_create() */ + if (vnic->flush == FIP_NO_FLUSH) + queue_work(login_wq, &vnic->vnic_login_create_task); + } + ret = -EAGAIN; + } else { + if (test_bit(VNIC_LOGIN_REG_NETDEV_PENDING, + &vnic->login_status)) + ret = -EAGAIN; + else + ret = -EINVAL; + } + } + spin_unlock_irq(&vnic->lock); + + return ret; +} + + +/* + * This function should be called when the building of a vhub context + * table is done and the vnic state should transition to CONNECTED. + */ +int fip_vnic_tbl_done(struct fip_vnic_data *vnic) +{ + vnic->vhub_table.state = VHUB_TBL_UP2DATE; + vnic->vhub_table.tusn = vnic->vhub_table.main_list.tusn; + + if (vnic->state <= FIP_VNIC_VHUB_DONE) + vnic->state = FIP_VNIC_VHUB_DONE; + else + vnic->state = FIP_VNIC_VHUB_WRITE; + + cancel_delayed_work(&vnic->vnic_task); + fip_vnic_fsm(&vnic->vnic_task.work); + return 0; +} + +/* + * This function runs in interrupt context. + * It does sanity checking of the packet, moves it to a list and passes + * handling to a thread.
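 + * Queued packets are later handled in thread context by fip_vnic_recv_bh(), + * which runs from the vnic_pkt_rcv_task_bh work item.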
+ */ +static void fip_vnic_recv(struct fip_vnic_data *vnic) +{ + struct fip_ring *rx_ring = &vnic->rx_ring; + int ret, length; + u32 vhub_id; + void *mem; + int queue_packet = 0; + int one_or_more_queued = 0; + int index; + int err; + + while (rx_ring->head != rx_ring->tail) { + struct fip_content *fc; + + queue_packet = 0; + index = rx_ring->tail & (vnic->rx_ring.size - 1); + + if (rx_ring->ring[index].entry_posted == 0) + goto repost; + + mem = rx_ring->ring[index].mem; + length = rx_ring->ring[index].length; + + + fc = kzalloc(sizeof *fc, GFP_ATOMIC); + if (!fc) { + vnic_warn(vnic->name, "kzalloc failed\n"); + goto repost; + } + + err = fip_packet_parse(vnic->port, mem + IB_GRH_BYTES, length - IB_GRH_BYTES, fc); + if (err) { + vnic_warn(vnic->name, "packet parse failed\n"); + kfree(fc); + goto repost; + } + + switch (fc->fh->subcode) { + case FIP_GW_UPDATE_SUB_OPCODE: + if (fc->fvu) { + vhub_id = be32_to_cpu(fc->fvu->state_vhub_id) & 0xffffff; + if (vnic->login_data.vhub_id == vhub_id) + queue_packet = 1; + } + + break; + case FIP_GW_TABLE_SUB_OPCODE: + if (vnic->state >= FIP_VNIC_VHUB_INIT && + vnic->vhub_table.state == VHUB_TBL_INIT) { + /* handle vhub context table packets */ + if (fc->fvt) { + vhub_id = be32_to_cpu(fc->fvt->vp_vhub_id) & 0xffffff; + if (vnic->login_data.vhub_id == vhub_id) + queue_packet = 1; + } + } + break; + default: + vnic_dbg_fip_v(vnic->name, + "received unexpected format packet\n"); + break; + } + + if (queue_packet && (likely(vnic->flush == FIP_NO_FLUSH))) { + struct fip_rcv_pkt *rcv; + struct fip_ring_entry me; + + /* record packet time for heart beat */ + vnic->keep_alive_jiffs = jiffies; + length -= IB_GRH_BYTES; + rcv = kzalloc(sizeof *rcv, GFP_ATOMIC); + if (!rcv) { + vnic_warn(vnic->name, "failed kmalloc\n"); + kfree(fc); + goto repost; + } + + /* replace it with new entry, and queue old one */ + err = alloc_map_fip_buffer(vnic->port->dev->ca, &me, + FIP_UD_BUF_SIZE(vnic->port->max_mtu_enum), + GFP_ATOMIC); + if (err) { + vnic_warn(vnic->name, "alloc_map_fip_buffer failed\n"); + kfree(fc); + kfree(rcv); + goto repost; + } + + /* unmap old entry */ + ib_dma_unmap_single(vnic->port->dev->ca, + rx_ring->ring[index].bus_addr, + FIP_UD_BUF_SIZE(vnic->port->max_mtu_enum), + DMA_FROM_DEVICE); + + rx_ring->ring[index] = me; + rcv->fc = fc; + rcv->length = length; + rcv->mem = mem; + spin_lock(&vnic->vnic_rcv_list.lock); + list_add_tail(&rcv->list, &vnic->vnic_rcv_list.list); + spin_unlock(&vnic->vnic_rcv_list.lock); + one_or_more_queued++; + } else + kfree(fc); +repost: + ret = fip_post_receive(vnic->port, vnic->qp, + FIP_UD_BUF_SIZE(vnic->port->max_mtu_enum), + index, rx_ring->ring + index, vnic->name); + if (ret) + vnic_warn(vnic->name, "fip_post_receive ret %d\n", ret); + + rx_ring->tail++; + } + + if (one_or_more_queued && (likely(vnic->flush == FIP_NO_FLUSH))) { + /* calls fip_vnic_recv_bh() */ + queue_work(fip_wq, &vnic->vnic_pkt_rcv_task_bh); + } + + return; +} + +void fip_vnic_recv_list_flush(struct fip_vnic_data *vnic) +{ + struct list_head vnic_recv_local; + struct fip_rcv_pkt *rcv, *rcv1; + unsigned long flags; + + INIT_LIST_HEAD(&vnic_recv_local); + + spin_lock_irqsave(&vnic->vnic_rcv_list.lock, flags); + list_replace_init(&vnic->vnic_rcv_list.list, &vnic_recv_local); + spin_unlock_irqrestore(&vnic->vnic_rcv_list.lock, flags); + + list_for_each_entry_safe(rcv, rcv1, &vnic_recv_local, list) { + list_del(&rcv->list); + kfree(rcv); + } + return; +} + +void lag_ctx_clear(struct fip_vnic_data *vnic) +{ + memset(&vnic->lm, 0, sizeof (vnic->lm)); +} + +/* 
+ * Handle the GW eport member info for a LAG GW. The function compares the + * member information to previous membership information that is stored in the + * vnic. The data path info is updated only after the login ack info was + * updated to prevent race conditions. + * The vnic contains a local cache of the member info. The cache is updated + * in all cases other than when the write to the data path failed. If the write + * failed we will not update the cache and rely on periodic update packets + * for the retry. + * There are 4 possible flows per member entry: + * 1. the entry is cached in the vnic but not in the packet - remove from vnic + * 2. the entry is not cached in the vnic but is in the packet - add to vnic + * 3. entry is in vnic and in packet but with different params - modify vnic + * 4. entry is in vnic and in packet and with similar params - do nothing +*/ +int handle_member_update(struct fip_vnic_data *vnic, struct lag_members *lm) +{ + char packet_used[MAX_LAG_MEMBERS], vnic_used[MAX_LAG_MEMBERS]; + struct lag_member *vnic_mem, *pkt_mem; + int i, j, last_bit = 0, skip; + #define EMPTY_ENTRY (char)0xff + /* we only update data path with new info after certain stage */ + int write_through = !!(vnic->state >= FIP_VNIC_VHUB_WRITE); + struct lag_properties lag_prop; + struct vnic_login *login = vnic->login; + + memset(packet_used, EMPTY_ENTRY, sizeof(packet_used)); + memset(vnic_used, EMPTY_ENTRY, sizeof(vnic_used)); + + /* if LAG is not enabled, or it's a child vNic, abort */ + if (!vnic->gw->info.ext_lag.valid || vnic->parent_used) + return -EINVAL; + + mutex_lock(&vnic->gw->mlock); + lag_prop.ca = vnic->gw->info.ext_lag.ca; + lag_prop.ca_thresh = vnic->gw->info.ext_lag.ca_thresh; + lag_prop.hash_mask = vnic->gw->info.ext_lag.hash; + lag_prop.weights_policy = vnic->gw->info.ext_lag.weights_policy; + mutex_unlock(&vnic->gw->mlock); + if (write_through) + vnic_member_prop(login, &lag_prop); + + /* go over all known members, for each one search for a match in the + * packet member struct */ + for (i = 0; i < MAX_LAG_MEMBERS; i++) { + if (!(vnic->lm.used_bitmask & 1 << i)) + continue; + + vnic_mem = &vnic->lm.memb[i]; + for (j = 0; j < lm->num; j++) { + pkt_mem = &lm->memb[j]; + /* find match for member in vnic data structure */ + if (packet_used[j] == EMPTY_ENTRY && + !memcmp(vnic_mem->guid, pkt_mem->guid, GUID_LEN) && + vnic_mem->gw_port_id == pkt_mem->gw_port_id) { + /* found a match, check for change in parameters */ + if (vnic->login) { + /* check for change in member parameters */ + if (vnic_mem->lid != pkt_mem->lid || + vnic_mem->qpn != pkt_mem->qpn || + vnic_mem->eport_state != pkt_mem->eport_state || + vnic_mem->sl != pkt_mem->sl || + vnic_mem->link_utilization != pkt_mem->link_utilization) { + + vnic_dbg_lag_v(vnic->name, "handle_member_update entry %d modifying lid %d qpn %d state %d\n", + i, lm->memb[j].lid, lm->memb[j].qpn, lm->memb[j].eport_state); + /* update data path if required and store update info locally */ + if (!write_through || + (write_through && !vnic_member_modify(login, i, &lm->memb[j]))) + *vnic_mem = lm->memb[j]; + } + } + packet_used[j] = i; + vnic_used[i] = j; + break; + } + } + /* if the member no longer appears in the packet, remove it */ + if (vnic_used[i] == EMPTY_ENTRY) { + if (!write_through || + (write_through && !vnic_member_remove(login, i))) { + vnic_dbg_lag_v(vnic->name, "handle_member_update entry %d removing lid %d qpn %d state %d\n", + i, vnic_mem->lid, vnic_mem->qpn, vnic_mem->eport_state); + vnic->lm.used_bitmask &= ~(1 << i); + } + } + }
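 + /* at this point every previously cached member has been matched in place or + * removed; packet entries still marked EMPTY_ENTRY in packet_used[] are new + * members and are added below */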
+ + /* go over packet and look for any new members */ + for (j = 0; j < lm->num; j++) { + /* if entry was matched up already */ + if (packet_used[j]!= EMPTY_ENTRY) + continue; + + skip = 0; + /* verify that the same GW_ID is not in use by another port */ + for (i = 0; i < MAX_LAG_MEMBERS; i++) { + if (!(vnic->lm.used_bitmask & 1 << i)) + continue; + if (vnic->lm.memb[i].gw_port_id == lm->memb[j].gw_port_id) + skip = 1; + } + if (skip) + continue; + + /* look for an empty member id and add the member to it */ + for (i = last_bit; i < MAX_LAG_MEMBERS; i++) { + if (vnic->lm.used_bitmask & 1 << i) + continue; + + vnic_dbg_lag_v(vnic->name, "handle_member_update entry %d adding lid %d qpn %d state %d\n", + i, lm->memb[j].lid, lm->memb[j].qpn, lm->memb[j].eport_state); + if (!write_through || + (write_through && !vnic_member_add(login, i, &lm->memb[j]))) { + vnic->lm.used_bitmask |= (1 << i); + vnic->lm.memb[i] = lm->memb[j]; + } + + break; + } + last_bit = i; + } + + return 0; +} + +/* Write the initial member table to the datapath. If we fail we will + * delete the entry from the local cache and rely on periodic updates + * packets for the retry*/ +int fip_vnic_write_members(struct fip_vnic_data *vnic) +{ + int i; + struct lag_properties lag_prop; + struct vnic_login *login = vnic->login; + + /* if LAG is not enabled, or it's a child vNic, abort */ + if (!vnic->gw->info.ext_lag.valid || vnic->parent_used) + return -EINVAL; + + lag_prop.ca = vnic->gw->info.ext_lag.ca; + lag_prop.ca_thresh = vnic->gw->info.ext_lag.ca_thresh; + lag_prop.hash_mask = vnic->gw->info.ext_lag.hash; + lag_prop.weights_policy = vnic->gw->info.ext_lag.weights_policy; + vnic_member_prop(login, &lag_prop); + + /* go over all members, for each one used write it to the data path */ + for (i = 0; i < MAX_LAG_MEMBERS; i++) { + if (!(vnic->lm.used_bitmask & 1 << i)) + continue; + + /* if update failed, delete local entry we will use the + * the update packet flow for retries. + */ + if (vnic_member_add(login, i, &vnic->lm.memb[i])) + vnic->lm.used_bitmask &= ~(1 << i); + } + + return 0; +} + +/* runs in the context of vnic->vnic_pkt_rcv_task_bh */ +void fip_vnic_recv_bh(struct work_struct *work) +{ + struct fip_vnic_data *vnic = + container_of(work, struct fip_vnic_data, vnic_pkt_rcv_task_bh); + int length; + u32 vhub_id, tusn; + int eport_state; + struct vnic_table_entry *vhub_entries; + struct list_head vnic_recv_local; + struct fip_rcv_pkt *rcv, *rcv1; + unsigned long flags; + int i, __eport_state; + + INIT_LIST_HEAD(&vnic_recv_local); + + spin_lock_irqsave(&vnic->vnic_rcv_list.lock, flags); + list_replace_init(&vnic->vnic_rcv_list.list, &vnic_recv_local); + spin_unlock_irqrestore(&vnic->vnic_rcv_list.lock, flags); + + /* We Are not interested in packets prior to FIP_VNIC_VHUB_INIT */ + if (vnic->state < FIP_VNIC_VHUB_INIT || + vnic->flush != FIP_NO_FLUSH) { + list_for_each_entry_safe(rcv, rcv1, &vnic_recv_local, list) { + kfree(rcv->fc); + kfree(rcv->mem); + list_del(&rcv->list); + kfree(rcv); + } + } else { + int err; + + list_for_each_entry_safe(rcv, rcv1, &vnic_recv_local, list) { + length = rcv->length; + + switch (rcv->fc->fh->subcode) { + case FIP_GW_UPDATE_SUB_OPCODE: + /* validate vhub id before processing packet */ + vhub_id = be32_to_cpu(rcv->fc->fvu->state_vhub_id) & 0xffffff; + if(unlikely(vnic->login_data.vhub_id != vhub_id)) + break; + + eport_state = be32_to_cpu(rcv->fc->fvu->state_vhub_id) >> 27 & 3; + __eport_state = (eport_state == 0) ? 
EPORT_STATE_DOWN : EPORT_STATE_UP; + atomic_set(&vnic->eport_state, __eport_state); + + /* handle vhub context update packets */ + if (rcv->fc->fed.num) { + err = extract_vhub_extended(rcv->fc->fed.fed[0], vnic); + if (err) + vnic_warn(vnic->name, "extract_vhub_extended() failed\n"); + } + if (rcv->fc->cte.num) { + vhub_entries = kmalloc(rcv->fc->cte.num * sizeof *vhub_entries, GFP_KERNEL); + if (!vhub_entries) { + vnic_warn(vnic->port->name, "failed to allocate memory for update CTEs\n"); + goto free_entry; + } + + tusn = be32_to_cpu(rcv->fc->fvu->tusn); + for (i = 0; i < rcv->fc->cte.num; ++i) { + vhub_entries[i].lid = be16_to_cpu(rcv->fc->cte.cte[i].lid); + vhub_entries[i].qpn = be32_to_cpu(rcv->fc->cte.cte[i].qpn) & 0xffffff; + vhub_entries[i].sl = rcv->fc->cte.cte[i].sl & 0xf; + vhub_entries[i].rss = rcv->fc->cte.cte[i].v_rss_type & FIP_CONTEXT_RSS_FLAG ? 1 : 0; + vhub_entries[i].valid = rcv->fc->cte.cte[i].v_rss_type & FIP_CONTEXT_V_FLAG ? 1 : 0; + memcpy(vhub_entries[i].mac, rcv->fc->cte.cte[i].mac, sizeof(vhub_entries[i].mac)); + vhub_handle_update(vnic, vhub_id, tusn - rcv->fc->cte.num + i + 1, &vhub_entries[i]); + } + kfree(vhub_entries); + } + + /* update vnic carrier only when vnic is ready: + * not closing (non zero flush), and per-registered + */ + if (!vnic->flush && vnic->login && + test_bit(VNIC_STATE_LOGIN_CREATE_1, &vnic->login->state)) { + vnic_carrier_update(vnic->login); + } + break; + case FIP_GW_TABLE_SUB_OPCODE: + /* handle vhub context table packets */ + tusn = be32_to_cpu(rcv->fc->fvt->tusn); + vhub_id = be32_to_cpu(rcv->fc->fvt->vp_vhub_id) & 0xffffff; + vhub_handle_tbl(vnic, rcv->fc, vhub_id, tusn); + break; + + default: + break; + } +free_entry: + list_del(&rcv->list); + kfree(rcv->fc); + kfree(rcv->mem); + kfree(rcv); + } + } + return; +} + +/* + * Mark the vnic for deletion and trigger a delayed call to the cleanup + * function. In the past the vnic was moved to another list but this + * might cause vnic duplication if new vnics are added to the GW. Even + * if the vnic is being flushed we need to know it is there. + * + * Note: This deletion method insures that all pending vnic work requests + * are cleared without dependency of the calling context. + */ +void fip_vnic_close(struct fip_vnic_data *vnic, enum fip_flush flush) +{ + int tmp_flush; + + /* net admin -> full flush */ + tmp_flush = vnic->hadmined ? flush : FIP_FULL_FLUSH; + + /* child vNic -> full flush */ + tmp_flush = (!vnic->parent_used) ? 
tmp_flush : FIP_FULL_FLUSH; + + /* no need for partial cleanup in host admin idle */ + if (tmp_flush == FIP_PARTIAL_FLUSH && + vnic->state < FIP_VNIC_HADMIN_IDLE) + return; + + /* close already in progress, disregard */ + if (vnic->flush >= tmp_flush) + return; + + if (vnic->flush == FIP_NO_FLUSH && vnic->state > FIP_VNIC_WAIT_4_ACK) + fip_update_send(vnic, 0, 1 /* logout */); + + spin_lock_irq(&vnic->lock); + vnic->flush = tmp_flush; + cancel_delayed_work(&vnic->vnic_gw_alive_task); + cancel_delayed_work(&vnic->vnic_task); + spin_unlock_irq(&vnic->lock); + /* after this point we should have no work that is not already pending + * for execution, and no new work will be added + */ + + if (vnic->hadmined && tmp_flush == FIP_FULL_FLUSH) + vnic_delete_hadmin_dentry(vnic); + else if (!vnic->hadmined) + /* vnic_count is relevant for net admin only */ + vnic->gw->vnic_count--; + + vnic_dbg_mark(); + + /* calls fip_purge_vnics() */ + queue_delayed_work(fip_wq, &vnic->gw->vnic_cleanup_task, + DELAYED_WORK_CLEANUP_JIFFS); +} + +/* + * This is a helper function we use in order to move the login destroy + * to another context so we don't block the fip thread for too long. +*/ +void fip_vnic_login_destroy(struct work_struct *work) +{ + struct fip_vnic_data *vnic = + container_of(work, struct fip_vnic_data, + vnic_login_destroy_task); + int flush = vnic->flush; + + vnic_login_destroy_wq_stopped(vnic, flush); + + /* we don't want to use a lock here so we will verify that the + * flush level did not change between the request and now */ + if (flush == FIP_FULL_FLUSH) + set_bit(VNIC_LOGIN_DESTROY_FULL, &vnic->login_status); + + set_bit(VNIC_LOGIN_DESTROY_DONE, &vnic->login_status); +} + +/* + * Free vnic resources. This includes closing the data vnic (data QPs etc) + * and the discovery resources. If the vnic can be totally destroyed (no + * pending work) the vnic will be removed from the GW and its memory + * freed. If not, the vnic will not be freed and the function will return + * an error. The caller needs to call this function again to complete the + * operation. + * Note: Do not call this function to remove a vnic, use fip_vnic_close. +*/ +int fip_vnic_destroy(struct fip_vnic_data *vnic) +{ + int pending; + + vnic_dbg_func(vnic->name); + vnic_dbg_fip_p0(vnic->name, "fip_vnic_destroy called flow=%d state=%d mac" MAC_6_PRINT_FMT "\n", + vnic->flush, vnic->state, MAC_6_PRINT_ARG(vnic->login_data.mac)); + + pending = work_pending(&vnic->vnic_pkt_rcv_task_bh) || + delayed_work_pending(&vnic->vnic_gw_alive_task) || + delayed_work_pending(&vnic->vnic_task); + + /* verify no pending packets before we start tearing down the rings */ + if (pending || fip_vnic_test_login(vnic, 0) == -EAGAIN) + goto retry_later; + + if (!test_and_set_bit(VNIC_LOGIN_DESTROY_PENDING, + &vnic->login_status)) { + vnic_login_destroy_stop_wq(vnic, vnic->flush); + /* calls fip_vnic_login_destroy() */ + queue_work(login_wq, &vnic->vnic_login_destroy_task); + } + + if (!test_bit(VNIC_LOGIN_DESTROY_DONE, &vnic->login_status)) + goto retry_later; + + clear_bit(VNIC_LOGIN_DESTROY_DONE, &vnic->login_status); + clear_bit(VNIC_LOGIN_DESTROY_PENDING, &vnic->login_status); + + /* We need to test whether the destroy request was queued as a + * partial flush but has since been upgraded to a full flush.
+ * if so we need to try again */ + if (vnic->flush == FIP_FULL_FLUSH && + !test_bit(VNIC_LOGIN_DESTROY_FULL, &vnic->login_status)) + goto retry_later; + + hrtimer_cancel(&vnic->keepalive_timer); + + if (vnic->state >= FIP_VNIC_VHUB_INIT) { + lag_ctx_clear(vnic); + vhub_ctx_free(vnic); + } + + /* disconnect from mcast groups */ + if (vnic->state >= FIP_VNIC_MCAST_INIT) { + vnic_mcast_del_all(&vnic->mcast_tree); + fip_vnic_rings_destroy(vnic); + } + + if (vnic->flush == FIP_PARTIAL_FLUSH) { + vnic->state = FIP_VNIC_HADMIN_IDLE; + vnic->flush = FIP_NO_FLUSH; + vnic->last_send_jiffs = 0; + + vnic_dbg_fip_v(vnic->name, "fip_vnic_remove partial done vnic->retry_count=%d\n", vnic->retry_count); + if (!VNIC_MAX_RETRIES || ++vnic->retry_count < VNIC_MAX_RETRIES) + QUEUE_VNIC_DWORK(vnic, &vnic->vnic_task, FIP_LOGIN_TIMEOUT * HZ); + + } else { + list_del(&vnic->gw_vnics); + vnic_dbg_fip_v(vnic->name, "fip_vnic_remove full done\n"); + kfree(vnic); + } + + return 0; + +retry_later: + return -EBUSY; +} + +int fip_vnic_keepalive_send(struct fip_vnic_data *vnic, int source_timer) +{ + int update; + unsigned long flags; + int ret = 0; + + if (vnic->flush != FIP_NO_FLUSH) + return ret; + + if (vnic->last_send_jiffs > 1 && jiffies - vnic->last_send_jiffs > vnic->gw->info.vnic_ka_period * 3 / 2) + vnic_dbg_fip_p0(vnic->name, "Delaying in sending KA should be %ld actual time=%ld source=%d\n", + vnic->gw->info.vnic_ka_period, jiffies - vnic->last_send_jiffs, source_timer); + + spin_lock_irqsave(&vnic->ka_lock, flags); + if (source_timer || + (vnic->last_send_jiffs && jiffies - vnic->last_send_jiffs > + vnic->gw->info.vnic_ka_period * 6 / 5)) { + + /* we need to have mcast attached before we ask for a table */ + if (vnic->state >= FIP_VNIC_VHUB_INIT && + vnic->vhub_table.state == VHUB_TBL_INIT) + update = 1; + else + update = 0; + + /* send vnic keep alive to GW */ + ret = fip_update_send(vnic, update, 0 /*not logout */); + if (!ret) + vnic->last_send_jiffs = jiffies; + } + spin_unlock_irqrestore(&vnic->ka_lock, flags); + + return ret; + +} + +//void fip_vnic_keepalive(unsigned long data) +#ifdef _BP_HR_TIMER +int fip_vnic_keepalive(struct hrtimer * timer) +#else +enum hrtimer_restart fip_vnic_keepalive(struct hrtimer *timer) +#endif +{ +// struct fip_vnic_data *vnic = (struct fip_vnic_data *)data; + struct fip_vnic_data *vnic = (struct fip_vnic_data *) + container_of(timer, struct fip_vnic_data, keepalive_timer); + unsigned long flags; + ktime_t ktime; + enum hrtimer_restart ret = HRTIMER_NORESTART; + int flush; + + spin_lock_irqsave(&vnic->lock, flags); + flush = vnic->flush; + spin_unlock_irqrestore(&vnic->lock, flags); + + if (flush != FIP_NO_FLUSH) + return ret; + + fip_vnic_keepalive_send(vnic, 1); + + /*mod_timer(&vnic->keepalive, jiffies + time);*/ + ret = HRTIMER_RESTART; + ktime = ktime_set(0, vnic->gw->info.vnic_ka_period * (1000000000 / HZ)); + hrtimer_forward(&vnic->keepalive_timer, vnic->keepalive_timer.base->get_time(), ktime); + + + return ret; + +} + +void fip_vnic_gw_alive(struct work_struct *work) +{ + struct fip_vnic_data *vnic = + container_of(work, struct fip_vnic_data, + vnic_gw_alive_task.work); + long time_to_timeout; + + if (vnic->flush != FIP_NO_FLUSH) + return; + + if (!test_bit(MCAST_ATTACHED, &vnic->vnic_mcaste_state)) { + if (time_after(jiffies, vnic->detached_ka_jiffs + 60*HZ)) { + vnic_dbg_fip_p0(vnic->name, "No GW keep alive timeout when mcast un attached " + "QPN 0x%06x, LID 0x%04x\n", vnic->qp->qp_num, + vnic->port->attr.lid); + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + 
return; + } else { + vnic_dbg_fip_p0(vnic->name, "Got ka poll when bcast not " + "attached QPN 0x%06x, LID 0x%04x, ka=%u\n", + vnic->qp->qp_num, vnic->port->attr.lid, + jiffies_to_msecs(jiffies - vnic->detached_ka_jiffs)); + time_to_timeout = vnic->gw->info.gw_period; + } + } else { + long jiffs_from_last; + jiffs_from_last = (jiffies - vnic->keep_alive_jiffs); + time_to_timeout = vnic->gw->info.gw_period - jiffs_from_last; + } + + /* Todo, change receive of update to rearm work timer so an expiration + * indicates a truie time out */ + if (time_to_timeout <= 0) { + vnic_dbg_fip_p0(vnic->name, "GW keep alives timed out for " + "QPN 0x%06x, LID 0x%04x timeout=%ld\n", vnic->qp->qp_num, + vnic->port->attr.lid, time_to_timeout); + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + } else + QUEUE_VNIC_DWORK(vnic, &vnic->vnic_gw_alive_task, + time_to_timeout + 1); +} + +struct fip_vnic_data *fip_vnic_alloc(struct vnic_port *port, + struct fip_gw_data *gw, + int hadmin, u16 vnic_id) +{ + struct fip_vnic_data *vnic; + + vnic = kzalloc(sizeof(struct fip_vnic_data), GFP_KERNEL); + if (!vnic) { + vnic_err(port->name, "failed to alloc vnic\n"); + return NULL; + } + + vnic->state = hadmin ? FIP_VNIC_HADMIN_IDLE : FIP_VNIC_LOGIN; + vnic->vnic_id = vnic_id; + vnic->gw = gw; + vnic->gw_info = gw->info.vol_info; + vnic->port = port; + vnic->hadmined = hadmin; + vnic->flush = FIP_NO_FLUSH; + + sprintf(vnic->name, "vnic-%d", vnic_id); /* will be overwritten */ + + spin_lock_init(&vnic->lock); + spin_lock_init(&vnic->ka_lock); + INIT_DELAYED_WORK(&vnic->vnic_task, fip_vnic_fsm); + INIT_DELAYED_WORK(&vnic->vnic_gw_alive_task, fip_vnic_gw_alive); + INIT_WORK(&vnic->vnic_login_destroy_task, fip_vnic_login_destroy); + INIT_WORK(&vnic->vnic_login_create_task, fip_vnic_login_create); + + +#ifdef _BP_HR_TIMER + hrtimer_init(&vnic->keepalive_timer, CLOCK_MONOTONIC, HRTIMER_REL); +#else + hrtimer_init(&vnic->keepalive_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL ); +#endif + vnic->keepalive_timer.function = fip_vnic_keepalive; + + vnic_mcast_root_init(&vnic->mcast_tree); + atomic_set(&vnic->eport_state,EPORT_STATE_DOWN); + + return vnic; +} + +int fip_vnic_hadmin_init(struct vnic_port *port, struct fip_vnic_data *vnic) +{ + int rc; + + vnic_dbg_func(port->name); + + rc = vnic_login_pre_create_1(port, vnic); + if (rc) { + vnic_warn(port->name, "vnic_login_pre_create_1 failed, rc %d\n", rc); + goto pre_create_failed; + } + + strncpy(vnic->login_data.vnic_name, vnic->interface_name, + sizeof(vnic->interface_name)); + + /* queue login create request */ + fip_vnic_test_login(vnic, 1); + + return 0; + +pre_create_failed: + return -ENODEV; +} + +void fip_vnic_create_gw_param(struct fip_vnic_send_info *gw_address, u32 gw_qpn, + u32 qkey, u16 gw_lid, u8 gw_sl) +{ + gw_address->gw_qpn = gw_qpn; + gw_address->qkey = qkey; + gw_address->gw_lid = gw_lid; + gw_address->gw_sl = gw_sl; +} + +void fip_vnic_set_gw_param(struct fip_vnic_data *vnic, struct fip_vnic_send_info *gw_address) +{ + memcpy(&vnic->gw_address, gw_address, sizeof(vnic->gw_address)); +} + +int fip_hadmin_vnic_refresh(struct fip_vnic_data *vnic, struct fip_vnic_send_info *gw_address) +{ + vnic_dbg_fip(vnic->name, "fip_vnic_to_login host admin flow flush=%d" + " state=%d\n", vnic->flush, vnic->state); + if (likely(vnic->flush == FIP_NO_FLUSH) && + vnic->state == FIP_VNIC_HADMIN_IDLE && + (!VNIC_MAX_RETRIES || vnic->retry_count < VNIC_MAX_RETRIES)) { + fip_vnic_set_gw_param(vnic, gw_address); + vnic->state = FIP_VNIC_LOGIN; + cancel_delayed_work(&vnic->vnic_task); + 
fip_vnic_fsm(&vnic->vnic_task.work); + } + return 0; +} + +/* + * Call the data vnic precreate 1 + 2 in order to alloc and init the data vnic. + * This function updates qp numbers that the data vnic will use. These qp numbers + * are needed for the login. + * This function does not cleanup on failures. It assumes that the caller will call + * the login destoy. +*/ +static int fip_vnic_login_init(struct vnic_port *port, struct fip_vnic_data *vnic) +{ + int qps_num; + int rc; + + vnic_dbg_func(vnic->name); + + /* If the driver wants to enable RSS (vnic_rss == 1) then the + * number of QPs is what the GW advertises: 1 << n_rss_qpn + */ + qps_num = (port->rx_rings_num > 1) ? (1 << vnic->gw->info.n_rss_qpn) : 1; + qps_num = (qps_num == 0) ? 1 : qps_num; + + /* However, we don't support any qps_num, if the GW asks for more than + * VNIC_MAX_NUM_CPUS QPs, then we're not going to enable RSS + * -- qps_num == 1 means RSS is disabled, otherwise it's enabled + */ + qps_num = qps_num <= VNIC_MAX_NUM_CPUS ? qps_num : 1; + + /* set in vnic, so it can be reported back to the BXM */ + vnic->qps_num = qps_num; + + /* in host admin vnic->login should be non NULL */ + if (!vnic->hadmined) { + rc = vnic_login_pre_create_1(port, vnic); + if (rc) { + vnic_warn(vnic->name, + "vnic_login_pre_create_1 failed, " + "rc %d\n", rc); + goto failed; + } + } + + /* in host admin vnic->login should be non NULL */ + rc = vnic_login_pre_create_2(vnic, qps_num, + vnic->gw->info.gw_type == GW_TYPE_LAG); + if (rc) { + vnic_warn(port->name, "vnic_login_pre_create_2 failed\n"); + goto failed; + } + + /* if parent_used, you must already have the base QPN */ + ASSERT(!vnic->parent_used || vnic->qp_base_num); + + vhub_ctx_init(vnic); + + return 0; + +failed: + return -ENODEV; +} + +/* + * create a CQ and QP for the new vNic. Create RX and TX rings for this + * QP. Move QP to RTS and connect it to the CQ. +*/ +static int fip_vnic_rings_create(struct vnic_port *port, + struct fip_vnic_data *vnic) +{ + struct ib_qp_init_attr qp_init_attr; + int ret; + + vnic->rx_ring.size = FIP_LOGIN_RX_SIZE; + vnic->tx_ring.size = FIP_LOGIN_TX_SIZE; + + INIT_WORK(&vnic->vnic_pkt_rcv_task_bh, fip_vnic_recv_bh); + spin_lock_init(&vnic->vnic_rcv_list.lock); + INIT_LIST_HEAD(&vnic->vnic_rcv_list.list); + + if (ib_find_pkey(port->dev->ca, port->num, vnic->login_data.pkey, + &vnic->login_data.pkey_index)) { + vnic_warn(vnic->name, + "fip_vnic_rings_create PKey 0x%04x not found." 
+ " Check configuration in SM/BX\n", vnic->login_data.pkey); + goto out_w_err; + } + + vnic->pkey = vnic->login_data.pkey; + vnic->pkey_index = vnic->login_data.pkey_index; + + vnic_dbg_fip_v(vnic->name, "fip_vnic_rings_create pkey id %d " + "for pkey 0x%x\n", (int)vnic->pkey_index, + (int)vnic->pkey); + + vnic->cq = ib_create_cq(port->dev->ca, fip_vnic_comp, NULL, vnic, + vnic->rx_ring.size + vnic->tx_ring.size, 0); + if (IS_ERR(vnic->cq)) { + vnic_dbg_fip(vnic->name, "failed to create receive CQ\n"); + goto out_w_err; + } + + memset(&qp_init_attr, 0, sizeof(qp_init_attr)); + qp_init_attr.cap.max_send_wr = vnic->tx_ring.size; + qp_init_attr.cap.max_recv_wr = vnic->rx_ring.size; + qp_init_attr.cap.max_send_sge = 1; + qp_init_attr.cap.max_recv_sge = 1; + qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.qp_type = IB_QPT_UD; + qp_init_attr.send_cq = vnic->cq; + qp_init_attr.recv_cq = vnic->cq; + + vnic->qp = ib_create_qp(port->pd, &qp_init_attr); + if (IS_ERR(vnic->qp)) { + vnic_dbg_fip(vnic->name, "failed to create QP\n"); + goto error_free_cq; + } + + vnic_dbg_fip_v(vnic->name, "fip_vnic_rings_create QPN %d," + " LID %d\n", (int)vnic->qp->qp_num, (int)port->attr.lid); + + /* move QP from reset to RTS */ + if (fip_init_qp(vnic->port, vnic->qp, vnic->pkey_index, vnic->name)) { + vnic_dbg_fip(vnic->name, "fip_init_qp returned with error\n"); + goto error_free_qp; + } + + ret = fip_init_tx(vnic->tx_ring.size, &vnic->tx_ring, vnic->name); + if (ret) { + vnic_dbg_fip(vnic->name, "fip_init_tx failed ret %d\n", ret); + goto error_free_qp; + } + + ret = fip_init_rx(port, vnic->rx_ring.size, vnic->qp, + &vnic->rx_ring, vnic->name); + if (ret) { + vnic_dbg_fip(vnic->name, "fip_init_rx returned %d\n", ret); + goto error_release_rings; + } + + /* enable recieving CQ completions */ + if (ib_req_notify_cq(vnic->cq, IB_CQ_NEXT_COMP)) + goto error_release_rings; + + vnic_dbg_fip_v(vnic->name, "fip_vnic_rings_create done OK\n"); + + return 0; + +error_release_rings: + fip_flush_rings(port, vnic->cq, vnic->qp, &vnic->rx_ring, + &vnic->tx_ring, vnic->name); + fip_free_rings(port, &vnic->rx_ring, &vnic->tx_ring, vnic->name); +error_free_qp: + ib_destroy_qp(vnic->qp); +error_free_cq: + ib_destroy_cq(vnic->cq); +out_w_err: + vnic->qp = NULL; + vnic->cq = NULL; + vnic->rx_ring.size = 0; + vnic->tx_ring.size = 0; + return -ENODEV; +} + +static void fip_vnic_rings_destroy(struct fip_vnic_data *vnic) +{ + fip_flush_rings(vnic->port, vnic->cq, vnic->qp, &vnic->rx_ring, + &vnic->tx_ring, vnic->name); + fip_free_rings(vnic->port, &vnic->rx_ring, &vnic->tx_ring, vnic->name); + fip_vnic_recv_list_flush(vnic); + ib_destroy_qp(vnic->qp); + ib_destroy_cq(vnic->cq); + vnic->qp = NULL; + vnic->cq = NULL; +} + +/* + * This function is a callback called upon successful join to a + * multicast group. The function checks if we have joined + attached + * to all required mcast groups and if so moves the discovery FSM to solicit. 
+*/ +void fip_vnic_mcast_cnct_cb(struct vnic_mcast *mcast, void *ctx) +{ + struct fip_vnic_data *vnic = mcast->priv_data; + + vnic_dbg_fip(vnic->name, "fip_vnic_mcast_cnct_cb\n"); + vnic_dbg_parse(vnic->name, "attached mask = 0x%lx, req mask = 0x%lx\n", + *mcast->cur_attached, *mcast->req_attach); + + if ((*mcast->cur_attached & *mcast->req_attach) != *mcast->req_attach) + return; + + vnic->keep_alive_jiffs = jiffies; + set_bit(MCAST_ATTACHED, &vnic->vnic_mcaste_state); + /* in case of a new mcast connection switch to VHUB_INIT, for a + * reconnection stay in the current state */ + if (vnic->state < FIP_VNIC_VHUB_INIT) { + vnic_dbg_fip(vnic->name, + "fip_vnic_mcast_cnct_cb done joining mcasts\n"); + vnic->state = FIP_VNIC_VHUB_INIT; + cancel_delayed_work(&vnic->vnic_task); + REQUEUE_VNIC_DWORK(vnic, &vnic->vnic_task, 0); + } +} + +/* + * This function is a callback invoked upon a mcast detach event. + * This event can be triggered due to a vnic request or due to an async + * event. Currently this code does not participate in the vnic's FSM. +*/ +void fip_vnic_mcast_deattach_cb(struct vnic_mcast *mcast, void *ctx) +{ + struct fip_vnic_data *vnic = mcast->priv_data; + + vnic->detached_ka_jiffs = jiffies; + clear_bit(MCAST_ATTACHED, &vnic->vnic_mcaste_state); + + vnic_dbg_fip(vnic->name, "fip_vnic_mcast_deattach_cb\n"); +} + +/* + * Try to connect to the relevant mcast groups. If one of the mcast joins + * fails, the function should be called again to try to complete the join + * process (for the mcast groups whose join was not yet performed). + * Note: a successful return of vnic_mcast_join means that the mcast join + * started, not that the join completed. Completion of the connection process + * is asynchronous and uses a supplied callback. + */ +int fip_vnic_mcast_cnct(struct fip_vnic_data *vnic) +{ + struct vnic_port *port = vnic->port; + union vhub_mgid mgid; + struct vnic_mcast *mcaste, *mcaste_upd, *mcaste_tbl; + struct vnic_mcast *uninitialized_var(mcaste_ka); + int rc; + + vnic_dbg_fip(port->name, "fip_vnic_mcast_cnct called\n"); + + mcaste_upd = vnic_mcast_alloc(port, &vnic->req_attach, &vnic->cur_attached); + if (IS_ERR(mcaste_upd)) + return -EINVAL; + + mcaste_tbl = vnic_mcast_alloc(port, &vnic->req_attach, &vnic->cur_attached); + if (IS_ERR(mcaste_tbl)) { + rc = -EINVAL; + goto free_upd; + } + + set_bit(FIP_MCAST_VHUB_UPDATE, &vnic->req_attach); + set_bit(FIP_MCAST_TABLE, &vnic->req_attach); + + vnic_dbg_fip(port->name, "gw type is %d\n", vnic->gw->info.gw_type); + if (vnic->gw->info.gw_type == GW_TYPE_LAG) { + mcaste_ka = vnic_mcast_alloc(port, &vnic->req_attach, &vnic->cur_attached); + if (IS_ERR(mcaste_ka)) { + rc = -EINVAL; + goto free_tbl; + } + set_bit(FIP_MCAST_VHUB_KA, &vnic->req_attach); + } + + mcaste = mcaste_upd; + mcaste->priv_data = vnic; + mcaste->attach_bit_nr = FIP_MCAST_VHUB_UPDATE; + memset(mcaste->mac, 0, ETH_ALEN); + vhub_mgid_create(vnic->login_data.mgid_prefix, + mcaste->mac, + vnic->login_data.n_mac_mcgid, + vnic->login_data.vhub_id, VHUB_MGID_UPDATE, + 0, &mgid); + mcaste->gid = mgid.ib_gid; + mcaste->port_gid = mcaste->gid; + mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC); + mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC; + mcaste->retry = VNIC_MCAST_ULIMIT_RETRY; + mcaste->attach_cb = fip_vnic_mcast_cnct_cb; + mcaste->detach_cb = fip_vnic_mcast_deattach_cb; + mcaste->attach_cb_ctx = NULL; + mcaste->detach_cb_ctx = NULL; + mcaste->blocking = 0; + mcaste->qkey = VNIC_FIP_QKEY; + mcaste->pkey = vnic->pkey; + mcaste->qp = vnic->qp; +
mcaste->create = vnic_mcast_create; + mcaste->blocking = 0; + mcaste->join_state = 1; + rc = vnic_mcast_add(&vnic->mcast_tree, mcaste); + ASSERT(!rc); + rc = vnic_mcast_attach(&vnic->mcast_tree, mcaste); /* MCAST_RECEIVE_ONLY */ + ASSERT(!rc); + + mcaste = mcaste_tbl; + mcaste->priv_data = vnic; + mcaste->attach_bit_nr = FIP_MCAST_TABLE; + memset(mcaste->mac, 0, ETH_ALEN); + vhub_mgid_create(vnic->login_data.mgid_prefix, + mcaste->mac, + vnic->login_data.n_mac_mcgid, + vnic->login_data.vhub_id, VHUB_MGID_TABLE, + 0, &mgid); + mcaste->gid = mgid.ib_gid; + mcaste->port_gid = mcaste->gid; + mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC); + mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC; + mcaste->retry = VNIC_MCAST_ULIMIT_RETRY; + mcaste->attach_cb = fip_vnic_mcast_cnct_cb; + mcaste->detach_cb = fip_vnic_mcast_deattach_cb; + mcaste->attach_cb_ctx = NULL; + mcaste->detach_cb_ctx = NULL; + mcaste->blocking = 0; + mcaste->qkey = VNIC_FIP_QKEY; + mcaste->pkey = vnic->pkey; + mcaste->qp = vnic->qp; + mcaste->create = vnic_mcast_create; + mcaste->blocking = 0; + mcaste->join_state = 1; + rc = vnic_mcast_add(&vnic->mcast_tree, mcaste); + ASSERT(!rc); + rc = vnic_mcast_attach(&vnic->mcast_tree, mcaste); /* MCAST_RECEIVE_ONLY */ + ASSERT(!rc); + + if (vnic->gw->info.gw_type != GW_TYPE_LAG) + return 0; + + mcaste = mcaste_ka; + mcaste->priv_data = vnic; + mcaste->attach_bit_nr = FIP_MCAST_VHUB_KA; + memset(mcaste->mac, 0, ETH_ALEN); + vhub_mgid_create(vnic->login_data.mgid_prefix, + mcaste->mac, + vnic->login_data.n_mac_mcgid, + vnic->login_data.vhub_id, VHUB_MGID_KA, + 0, &mgid); + mcaste->gid = mgid.ib_gid; + mcaste->port_gid = mcaste->gid; + mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC); + mcaste->backoff_factor = 1; + mcaste->retry = VNIC_MCAST_MAX_RETRY; + mcaste->attach_cb = fip_vnic_mcast_cnct_cb; + mcaste->detach_cb = fip_vnic_mcast_deattach_cb; + mcaste->attach_cb_ctx = NULL; + mcaste->detach_cb_ctx = NULL; + mcaste->blocking = 0; + mcaste->qkey = VNIC_FIP_QKEY; + mcaste->pkey = vnic->pkey; + mcaste->qp = vnic->qp; + mcaste->create = vnic_mcast_create; + mcaste->blocking = 0; + mcaste->join_state = 1; + mcaste->sender_only = 1; + vnic->ka_mcast_gid = mcaste->gid; + rc = vnic_mcast_add(&vnic->mcast_tree, mcaste); + ASSERT(!rc); + rc = vnic_mcast_attach(&vnic->mcast_tree, mcaste); + ASSERT(!rc); + + return 0; + +free_tbl: + vnic_mcast_dealloc(mcaste_tbl); + +free_upd: + vnic_mcast_dealloc(mcaste_upd); + + return rc; +} + +/* + * This function is the driving engine of the vnic logic. It manages the + * vnics state machines. + * Some of the states in the state machine could have been removed because + * they contain "actions" and not states. Still it is easier to maintaine + * the code this way and it gives an easy mechanism for exception handling + * and retries. + * Only call this function from fip_wq context. 
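 + * Typical forward progression through the switch below: HADMIN_IDLE/LOGIN -> + * WAIT_4_ACK -> RINGS_INIT -> MCAST_INIT(_DONE) -> VHUB_INIT(_DONE) -> + * VHUB_DONE -> VHUB_WRITE -> CONNECTED.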
+*/ +void fip_vnic_fsm(struct work_struct *work) +{ + struct fip_vnic_data *vnic = + container_of(work, struct fip_vnic_data, vnic_task.work); + struct vnic_port *port = vnic->port; + int rc, recall_time = 0; + const long int msec_in_sec = 1000; + struct fip_vnic_send_info gw_address; + ktime_t ktime; + + vnic_dbg_fip(port->name, "fip_vnic_fsm called vnic %d\n", + vnic->vnic_id); + + if (vnic->flush != FIP_NO_FLUSH) + return; + + switch (vnic->state) { + case FIP_VNIC_HADMIN_IDLE: + if (vnic->gw->state < FIP_GW_CONNECTED) + break; + + fip_vnic_create_gw_param(&gw_address, vnic->gw->info.gw_qpn, VNIC_FIP_QKEY, + vnic->gw->info.gw_lid, vnic->gw->info.sl); + fip_vnic_set_gw_param(vnic, &gw_address); + /* fall through */ + + case FIP_VNIC_LOGIN: + vnic_dbg_fip(port->name, "FIP_VNIC_LOGIN vnic %d\n", + vnic->vnic_id); + /* get data QP numbers needed for login request packet. If we fail + * we will close the vnic entirely */ + rc = fip_vnic_login_init(vnic->port, vnic); + if (rc) { + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + vnic_warn(vnic->name, "fip_vnic_login_init failed, " + "closing vnic rc %d\n", rc); + break; + } + vnic->state = FIP_VNIC_WAIT_4_ACK; + /* fall through */ + + case FIP_VNIC_WAIT_4_ACK: + vnic_dbg_fip(port->name, "FIP_VNIC_WAIT_4_ACK vnic %d\n", + vnic->vnic_id); + /* resend login request every timeout */ + vnic_dbg_fip(port->name, "fip_login_send vnic %d\n",vnic->vnic_id); + rc = fip_login_send(vnic); + if (!rc) + recall_time = FIP_LOGIN_TIMEOUT * msec_in_sec; + else + recall_time = 1 * msec_in_sec; + + goto queue_vnic_work; + + case FIP_VNIC_RINGS_INIT: + /* create QP and rings */ + rc = fip_vnic_rings_create(vnic->port, vnic); + if (rc) { + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + vnic_warn(vnic->name, "fip_vnic_rings_create failed, " + "closing vnic rc=%d\n", rc); + break; + } + + vnic->last_send_jiffs = 1; /* use a non zero value to start transmition */ + { + /* start vnic UCAST KA packets, This will also cause bxm to send us the + * neighbor table */ + if (vnic->gw->info.gw_type != GW_TYPE_LAG) { + ktime = ktime_set(0, 0); +#ifdef _BP_HR_TIMER + hrtimer_start(&vnic->keepalive_timer, ktime, HRTIMER_REL ); +#else + hrtimer_start(&vnic->keepalive_timer, ktime, HRTIMER_MODE_REL ); +#endif + } + } + + vnic->state = FIP_VNIC_MCAST_INIT; + /* fall through */ + + case FIP_VNIC_MCAST_INIT: + rc = fip_vnic_mcast_cnct(vnic); + if (rc) { + vnic_warn(vnic->name, + "fip_vnic_mcast_cnct failed, rc %d\n", rc); + /* try again later */ + recall_time = 1 * msec_in_sec; + goto queue_vnic_work; + } + vnic->state = FIP_VNIC_MCAST_INIT_DONE; + /* fall through */ + + case FIP_VNIC_MCAST_INIT_DONE: + /* wait for mcast attach CB before continueing */ + break; + + case FIP_VNIC_VHUB_INIT: + + /* previous KA if sent did not request a table because MCASTs were not + * available. 
Send extra KA packet that should trigger table request in + * order to hasten things up */ + fip_vnic_keepalive_send(vnic, 1); + + if (vnic->gw->info.gw_type == GW_TYPE_LAG) { + /* start vnic MCAST KA packets, This will also cause bxm to send us the + * neighbor table */ + ktime = ktime_set(0, 0); +#ifdef _BP_HR_TIMER + hrtimer_start(&vnic->keepalive_timer, ktime, HRTIMER_REL ); +#else + hrtimer_start(&vnic->keepalive_timer, ktime, HRTIMER_MODE_REL ); +#endif + } + + /* start tracking GW keep alives, calls fip_vnic_gw_alive() */ + QUEUE_VNIC_DWORK(vnic, &vnic->vnic_gw_alive_task, + vnic->gw->info.gw_period); + + vnic->state = FIP_VNIC_VHUB_INIT_DONE; + /* fall through */ + + case FIP_VNIC_VHUB_INIT_DONE: + /* we are waiting to receive a full vhub table. The KA will handle + * retries if we do not get the table we are expecting */ + + /* queue login create request */ + if (fip_vnic_test_login(vnic, 1)) { + recall_time = 1 * msec_in_sec; + goto queue_vnic_work; + } + + break; + + case FIP_VNIC_VHUB_DONE: + if (fip_vnic_test_login(vnic, 1)) { + recall_time = 1 * msec_in_sec; + goto queue_vnic_work; + } + + if (vnic_login_complete_ack(vnic, &vnic->login_data, &vnic->shared_vnic)) { + vnic_warn(vnic->name, + "vnic_login_complete_ack failed\n"); + recall_time = 1 * msec_in_sec; + goto queue_vnic_work; + } + + /* for LAG write member info */ + fip_vnic_write_members(vnic); + + vnic->state = FIP_VNIC_VHUB_WRITE; + /* fall through */ + + case FIP_VNIC_VHUB_WRITE: + /* write the vhub table to login */ + fip_vnic_write_tbl(vnic); + vnic->state = FIP_VNIC_CONNECTED; + /* fall through */ + + case FIP_VNIC_CONNECTED: + vnic->retry_count = 0; + break; + default: + ASSERT(0); + break; + } + + vnic_dbg_fip(port->name, "state %d gw_lid %d gw_qpn %d\n", + vnic->state, vnic->gw_address.gw_lid, vnic->gw_address.gw_qpn); + return; + +queue_vnic_work: + QUEUE_VNIC_DWORK(vnic, &vnic->vnic_task, recall_time * HZ / msec_in_sec); +} diff --git a/drivers/net/mlx4_vnic/vnic_fip_main.c b/drivers/net/mlx4_vnic/vnic_fip_main.c new file mode 100644 index 0000000000000..1c175d36c42ac --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_fip_main.c @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_fip.h" +#include "vnic_fip_discover.h" + +struct workqueue_struct *fip_wq; + +void fip_refresh_mcasts(struct fip_discover *discover) +{ + struct fip_gw_data *gw; + struct fip_vnic_data *vnic; + + fip_discover_mcast_reattach(discover, discover->port); + + down_read(&discover->l_rwsem); + list_for_each_entry(gw, &discover->gw_list, list) + list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) { + if (vnic->flush != FIP_FULL_FLUSH && vnic->state > FIP_VNIC_MCAST_INIT) + vnic_tree_mcast_detach(&vnic->mcast_tree); + } + + list_for_each_entry(gw, &discover->gw_list, list) + list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) { + if (vnic->flush != FIP_FULL_FLUSH && vnic->state > FIP_VNIC_MCAST_INIT) + vnic_tree_mcast_attach(&vnic->mcast_tree); + } + up_read(&discover->l_rwsem); + +} + +void port_fip_discover_restart(struct work_struct *work) +{ + struct vnic_port *port = + container_of(work, struct vnic_port, discover_restart_task.work); + struct fip_discover *discover; + + vnic_dbg_mark(); + mutex_lock(&port->start_stop_lock); + vnic_dbg_mark(); + mutex_lock(&port->mlock); + if (vnic_port_query(port)) + vnic_warn(port->name, "vnic_port_query failed\n"); + mutex_unlock(&port->mlock); + + list_for_each_entry(discover, &port->fip.discover_list, discover_list) { + if (fip_discover_cleanup(port, discover, 0)) { + vnic_dbg(port->name, "fip_discover_cleanup flushed\n"); + goto out; + } + } + + list_for_each_entry(discover, &port->fip.discover_list, discover_list) { + if (fip_discover_init(port, discover, discover->pkey, 0)) { + vnic_warn(port->name, "failed to alloc discover resources\n"); + } + } +out: + mutex_unlock(&port->start_stop_lock); + return; +} + +void vnic_port_fip_cleanup(struct vnic_port *port, int lock) +{ + struct fip_discover *discover, *tmp_discover; + + if (lock) + mutex_lock(&port->start_stop_lock); + + list_for_each_entry_safe(discover, tmp_discover, &port->fip.discover_list, discover_list) { + vnic_dbg_fip_p0(port->name, "Discovery cleanup of PKEY=0x%x\n", discover->pkey); + + list_del(&discover->discover_list); + vnic_info("Removed fip discovery %s port %d pkey 0x%x\n", + port->dev->ca->name, port->num, discover->pkey); + fip_discover_cleanup(port, discover, 1); + kfree(discover); + } + + if (lock) + mutex_unlock(&port->start_stop_lock); +} + + +int vnic_port_fip_init(struct vnic_port *port) +{ + int rc; + struct fip_discover *discover; + int i; + + if (no_bxm) + return 0; + + vnic_discovery_pkeys_count = vnic_discovery_pkeys_count > MAX_NUM_PKEYS_DISCOVERY ? 
+ MAX_NUM_PKEYS_DISCOVERY : vnic_discovery_pkeys_count; + + if (vnic_discovery_pkeys_count == 0 || + (vnic_discovery_pkeys_count == MAX_NUM_PKEYS_DISCOVERY && + vnic_discovery_pkeys[0] == 0)) { + vnic_discovery_pkeys[0] = 0xffff; + vnic_discovery_pkeys_count = 1; + vnic_dbg_fip_p0(port->name, "Creating default PKEY for Discovery\n"); + } + + mutex_lock(&port->start_stop_lock); + + for (i = 0; i < vnic_discovery_pkeys_count; i++) { + vnic_discovery_pkeys[i] &= 0xffff; + vnic_discovery_pkeys[i] |= 0x8000; + + vnic_dbg_fip_p0(port->name, "Init Discovery=%d on PKEY=0x%x\n", i, vnic_discovery_pkeys[i]); + + discover = kzalloc(sizeof(struct fip_discover), GFP_KERNEL); + if (!discover) { + vnic_warn(port->name, "discover alloc failed\n"); + rc = -ENOMEM; + goto fail; + } + + INIT_LIST_HEAD(&discover->discover_list); + + vnic_info("Added fip discovery %s port %d PKEY 0x%x\n", + port->dev->ca->name, port->num, + vnic_discovery_pkeys[i]); + + list_add_tail(&discover->discover_list, &port->fip.discover_list); + rc = fip_discover_init(port, discover, vnic_discovery_pkeys[i], 1); + if (rc) { + vnic_warn(port->name, "fip_discover_init pkey=0x%x " + "failed\n", discover->pkey); + list_del(&discover->discover_list); + kfree(discover); + goto fail; + } + } + mutex_unlock(&port->start_stop_lock); + return 0; + +fail: + mutex_unlock(&port->start_stop_lock); + vnic_port_fip_cleanup(port, 1); + return rc; +} + diff --git a/drivers/net/mlx4_vnic/vnic_fip_pkt.c b/drivers/net/mlx4_vnic/vnic_fip_pkt.c new file mode 100644 index 0000000000000..13e5235a5ab36 --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_fip_pkt.c @@ -0,0 +1,856 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include "vnic.h" +#include "vnic_fip.h" +#include "vnic_fip_discover.h" +#include "vnic_fip_pkt.h" + +const struct eoib_host_update base_update_pkt = { + .fip.subcode = FIP_HOST_ALIVE_SUB_OPCODE, + .fip.type.type = FIP_FIP_HDR_TYPE, + .fip.type.length = FIP_FIP_HDR_LENGTH, + .fip.vendor_id = FIP_VENDOR_MELLANOX, + + .type_1.type = FIP_HOST_UPDATE_TYPE, + .type_1.length = FIP_HOST_UPDATE_LENGTH, + .vendor_id = FIP_VENDOR_MELLANOX, +}; + +const struct eoib_host_update base_logout_pkt = { + .fip.subcode = FIP_HOST_LOGOUT_SUB_OPCODE, + .fip.type.type = FIP_FIP_HDR_TYPE, + .fip.type.length = FIP_FIP_HDR_LENGTH, + .fip.vendor_id = FIP_VENDOR_MELLANOX, + + .type_1.type = FIP_LOGOUT_TYPE_1, + .type_1.length = FIP_LOGOUT_LENGTH_1, + .vendor_id = FIP_VENDOR_MELLANOX, +}; + +static int extract_adv_extended(struct fip_ext_desc_tlv *fed, + struct fip_gw_data_info *info) +{ + struct fip_ext_type_cap *extended_cap; + struct fip_ext_type_boot *extended_boot; + struct fip_ext_type_power_cycle_id *extended_pc_id; + struct fip_ext_type_lag_props *extended_lag = NULL; + struct fip_extended_type *ext_hdr; + int length_to_go, ext_length; + + vnic_dbg_parse("", "extracting extended descriptor\n"); + + length_to_go = (((int)fed->ft.length) << 2) - sizeof(*fed); + ext_hdr = (struct fip_extended_type *)(fed + 1); + + while (length_to_go > 0) { + ext_length = ((int)ext_hdr->len) << 2; + + vnic_dbg_parse(NULL, "Advertise parse, sub-tlv " + "type %d length %d address=%p\n", + ext_hdr->ext_type, ext_length, ext_hdr); + + if (ext_length < sizeof(*ext_hdr) || + ext_length > length_to_go) { + vnic_dbg_parse(NULL, "Extended length error. " + "Length=%d\n", ext_length); + return -EINVAL; + } + + if (ext_hdr->ext_type == ADV_EXT_TYPE(CAP) && + ext_length == sizeof(*extended_cap)) { /* capabilities*/ + /* do nothing */ + } else if (ext_hdr->ext_type == ADV_EXT_TYPE(LAG) && /* LAG */ + ext_length == sizeof(*extended_lag)) { + extended_lag = (struct fip_ext_type_lag_props *)ext_hdr; + info->gw_type = extended_lag->gw_type; + info->ext_lag.hash = be16_to_cpu(extended_lag->lag_hash); + info->ext_lag.weights_policy = extended_lag->weight_policy_flags >> 4; + info->ext_lag.member_ka = (extended_lag->weight_policy_flags & 0x8) >> 3; + info->ext_lag.ca = !!(extended_lag->weight_policy_flags & + FIP_EXT_LAG_W_POLICY_HOST); + info->ext_lag.ca_thresh = extended_lag->ca_threshold; + info->ext_lag.ucast = !!(extended_lag->weight_policy_flags & + FIP_EXT_LAG_W_POLICY_UCAST); + info->ext_lag.valid = 1; + } else if (ext_hdr->ext_type == ADV_EXT_TYPE(BOOT) && + ext_length == sizeof(*extended_boot)) { /* boot */ + extended_boot = (struct fip_ext_type_boot *)ext_hdr; + info->ext_boot.boot_prio = extended_boot->boot_prio; + info->ext_boot.timeout = extended_boot->discovery_timeout; + info->ext_boot.valid = 1; + } else if (ext_hdr->ext_type == ADV_EXT_TYPE(PC_ID) && + ext_length == sizeof(*extended_pc_id)) { /* Power Cycle ID */ + extended_pc_id = (struct fip_ext_type_power_cycle_id *)ext_hdr; + info->ext_pc_id.power_cycle_id = + be64_to_cpu(extended_pc_id->power_cycle_id); + info->ext_pc_id.valid = 1; + } else if (ext_hdr->mandatory & 0x01) { + vnic_dbg_parse(NULL, "Advertise parse, unknown" + " mandatory extended type %d length %d\n", + ext_hdr->ext_type, ext_length); + return -EINVAL; + } else + vnic_dbg_parse(NULL, "Advertise parse, unknown " + "non-mandatory extended. 
Skipping, type" + " %d length %d\n", + ext_hdr->ext_type, ext_length); + + ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length); + length_to_go -= ext_length; + } + + return 0; +} + +int fip_advertise_parse_bh(struct fip_discover *discover, struct fip_content *fc, + struct fip_gw_data *data) +{ + long ka_time; + int err = 0; + + /* make sure we have at least a single address descriptor */ + if (fc->fa.num < 1 || !fc->fgwi || !fc->fgid || !fc->fka) + return -EINVAL; + + data->info.flags = be16_to_cpu(fc->fh->flags) & FIP_FIP_ADVRTS_FLAG ? FIP_GW_AVAILABLE : 0; + + data->info.flags |= + (be16_to_cpu(fc->fh->flags) & FIP_FIP_SOLICITED_FLAG) ? 0 : + FIP_RCV_MULTICAST; + + data->info.flags |= FIP_IS_FIP; + data->info.flags |= (fc->fh->flags & FIP_ADVERTISE_HOST_VLANS) ? + FIP_HADMINED_VLAN : 0; + + data->info.gw_qpn = be32_to_cpu(fc->fa.fa[0]->gwtype_qpn) & 0xffffff; + data->info.gw_lid = be16_to_cpu(fc->fa.fa[0]->lid); + data->info.gw_port_id = be16_to_cpu(fc->fa.fa[0]->sl_gwportid) & + FIP_ADVERTISE_GW_PORT_ID_MASK; + data->info.sl = be16_to_cpu(fc->fa.fa[0]->sl_gwportid) >> FIP_ADVERTISE_SL_SHIFT; + memcpy(data->info.gw_guid, fc->fa.fa[0]->guid, sizeof(data->info.gw_guid)); + data->info.gw_num_vnics = be16_to_cpu(fc->fgwi->n_rss_qpn_vnics) & + FIP_ADVERTISE_NUM_VNICS_MASK; + + data->info.n_rss_qpn = be16_to_cpu(fc->fgwi->n_rss_qpn_vnics) >> + FIP_ADVERTISE_N_RSS_SHIFT; + data->info.hadmined_en = (fc->fgwi->h_nmac_mgid & FIP_ADVERTISE_HOST_EN_MASK); + data->info.all_vlan_gw = !!(fc->fgwi->h_nmac_mgid & FIP_ADVERTISE_ALL_VLAN_GW_MASK); + + TERMINATED_MEMCPY(data->info.gw_vendor_id, fc->fgwi->vendor_id); + memcpy(data->info.vol_info.system_guid, fc->fgid->sys_guid, + sizeof(data->info.vol_info.system_guid)); + TERMINATED_MEMCPY(data->info.vol_info.system_name, + fc->fgid->sys_name); + TERMINATED_MEMCPY(data->info.vol_info.gw_port_name, fc->fgid->gw_port_name); + + ka_time = be32_to_cpu(fc->fka->adv_period); + ka_time = ka_time ? ka_time : FKA_ADV_PERIOD; + /* do not let KA go under 2 secs */ + ka_time = (ka_time < 2000) ? 2000 : ka_time; + data->info.gw_adv_period = FIP_TIMEOUT_FACTOR(msecs_to_jiffies(ka_time)); + + ka_time = be32_to_cpu(fc->fka->ka_period); + ka_time = ka_time ? ka_time : FKA_ADV_PERIOD; + data->info.gw_period = FIP_TIMEOUT_FACTOR(msecs_to_jiffies(ka_time)); + + ka_time = be32_to_cpu(fc->fka->vnic_ka_period); + ka_time = ka_time ? 
ka_time : FKA_ADV_PERIOD; + data->info.vnic_ka_period = msecs_to_jiffies(ka_time); + + data->info.gw_type = GW_TYPE_SINGLE_EPORT; + if (fc->fed.num > 0) { + if (fc->fed.num == 1) { + /* new version bxm mode */ + data->info.gw_prot_new = 1; + err = extract_adv_extended(fc->fed.fed[0], &data->info); + if (err) + vnic_dbg_parse(discover->name, "invalid extended descripotr\n"); + } else { + vnic_dbg_parse(discover->name, "too many extended descripotrs\n"); + return -EINVAL; + } + } + + return err; +} + +static int send_generic_mcast_pkt(struct vnic_port *port, + struct fip_ring *tx_ring, + void *mem, int pkt_size, + struct ib_qp *qp, + int pkey_index, + struct vnic_mcast *mcast) +{ + int index, rc; + unsigned long flags; + unsigned long tail; + + /* + * we are only allowed to update the head at task level so no need to + * perform any locks here + */ + spin_lock_irqsave(&tx_ring->ring_lock, flags); + index = tx_ring->head & (tx_ring->size - 1); + vnic_dbg_fip(port->name, "mcast packet\n"); + + spin_lock(&tx_ring->head_tail_lock); + tail = tx_ring->tail; + spin_unlock(&tx_ring->head_tail_lock); + + /* ring full try again */ + if (tx_ring->head - tail >= tx_ring->size) { + vnic_warn(port->name, "send_generic_mcast_pkt ring full: QPN 0x%x: tail=%ld head=%ld diff=%ld\n", + qp->qp_num, tx_ring->tail, tx_ring->head, tx_ring->head - tx_ring->tail); + rc = -EAGAIN; + goto err; + } + + rc = _map_generic_pkt(port, &tx_ring->ring[index], mem, pkt_size); + if (rc) + goto err; + + rc = fip_mcast_send(port, qp, index, + tx_ring->ring[index].bus_addr, + pkt_size, pkey_index, mcast); + + if (rc) { + vnic_warn(port->name, + "send_generic_mcast_pkt: fip_mcast_send ret %d\n", + rc); + rc = -ENODEV; + goto error_unmap_dma; + } + + tx_ring->head++; + + spin_unlock_irqrestore(&tx_ring->ring_lock, flags); + return 0; + +error_unmap_dma: + ib_dma_unmap_single(port->dev->ca, + tx_ring->ring[index].bus_addr, + pkt_size, DMA_TO_DEVICE); + +err: + spin_unlock_irqrestore(&tx_ring->ring_lock, flags); + return rc; +} + +static void *alloc_solicit_pkt(int new_prot, char *node_desc) +{ + void *ptr; + struct fip_solicit_new *nptr; + struct fip_solicit_legacy *optr; + int size = new_prot ? 
sizeof *nptr : sizeof *optr; + + ptr = kzalloc(size, GFP_KERNEL); + if (!ptr) + return ERR_PTR(-ENOMEM); + optr = ptr; + optr->version.version = 1; + optr->fh.opcode = cpu_to_be16(EOIB_FIP_OPCODE); + optr->fh.subcode = FIP_HOST_SOL_SUB_OPCODE; + optr->fh.list_length = cpu_to_be16(size - offsetof(typeof(*optr), fvend)) / 4; + optr->fvend.ft.type = FIP_TYPE(VENDOR_ID); + optr->fvend.ft.length = sizeof optr->fvend / 4; + strncpy(optr->fvend.vendor_id, "mellanox", sizeof optr->fvend.vendor_id); + optr->addr.ft.type = FIP_TYPE(ADDRESS); + optr->addr.ft.length = sizeof optr->addr / 4; + strncpy(optr->addr.vendor_id, "mellanox", sizeof optr->addr.vendor_id); + if (new_prot) { + nptr = ptr; + nptr->ext.ft.type = 254; + nptr->ext.ft.length = sizeof nptr->ext / 4; + strncpy(nptr->ext.vendor_id, "mellanox", sizeof nptr->ext.vendor_id); + nptr->ext_cap.et.ext_type = 40; + nptr->ext_cap.et.len = sizeof nptr->ext_cap / 4; + nptr->ext_cap.et.mandatory = 1; + nptr->ext_hostname.et.ext_type = 39; + nptr->ext_hostname.et.len = sizeof nptr->ext_hostname / 4; + strncpy(nptr->ext_hostname.hostname, node_desc, sizeof nptr->ext_hostname.hostname); + } + + return ptr; +} + +int fip_solicit_send(struct fip_discover *discover, + enum fip_packet_type multicast, + u32 dqpn, u16 dlid, u8 sl, int new_prot, unsigned char *dguid) +{ + int rc = 0; + unsigned long flags, flags1; + struct fip_solicit_legacy *optr; + int size = new_prot ? sizeof(struct fip_solicit_new) : sizeof *optr; + + ASSERT(discover); + + /* alloc packet to be sent */ + optr = alloc_solicit_pkt(new_prot, discover->port->dev->ca->node_desc); + if (IS_ERR(optr)) + return PTR_ERR(optr); + + /* we set bit 24 to signify that we're a new host */ + optr->addr.gwtype_qpn = cpu_to_be32(discover->qp->qp_num | 0x1000000); + optr->addr.lid = cpu_to_be16(discover->port->attr.lid); + memcpy(optr->addr.guid, &discover->port->gid.global.interface_id, sizeof(optr->addr.guid)); + vnic_dbg_fip(discover->name, "fip_solicit_send creating multicast %d" + " solicit packet\n", multicast); + + fip_dbg_dump_raw_pkt(0, optr, size, 1, "sending solicit packet"); + + if (multicast) { + struct vnic_mcast *mcaste; + union ib_gid gid; + + memcpy(&gid, fip_solicit_mgid, GID_LEN); + spin_lock_irqsave(&discover->mcast_tree.mcast_rb_lock, flags); + mcaste = vnic_mcast_search(&discover->mcast_tree, &gid); + /* it is possible for the MCAST entry or AH to be missing in + * transient states (after events). 
This is a valid condition + * but we can't send packet + */ + if (!IS_ERR(mcaste) && mcaste->ah) { + spin_lock_irqsave(&mcaste->lock, flags1); + rc = send_generic_mcast_pkt(discover->port, &discover->tx_ring, + optr, size, discover->qp, + discover->pkey_index, + mcaste); + spin_unlock_irqrestore(&mcaste->lock, flags1); + } else + kfree(optr); + + spin_unlock_irqrestore(&discover->mcast_tree.mcast_rb_lock, flags); + } else { + rc = send_generic_ucast_pkt(discover->port, &discover->tx_ring, + optr, size, discover->qp, + discover->pkey_index, + dqpn, dlid, VNIC_FIP_QKEY, sl, dguid); + } + if (rc) + goto error_free_mem; + + return 0; + +error_free_mem: + vnic_warn(discover->name, "discover_send error ret %d\n", rc); + kfree(optr); + return -ENOMEM; +} + +static void *alloc_login_pkt(struct fip_vnic_data *vnic) +{ + struct eoib_login *ptr; + int size = sizeof *ptr; + + ptr = kzalloc(size, GFP_KERNEL); + if (!ptr) + return ERR_PTR(-ENOMEM); + + ptr->eoib_ver.version = 1; + ptr->fh.opcode = cpu_to_be16(EOIB_FIP_OPCODE); + ptr->fh.subcode = FIP_HOST_LOGIN_SUB_OPCODE; + ptr->fh.list_length = cpu_to_be16(size - offsetof(typeof(*ptr), fvend) / 4); + ptr->fvend.ft.type = FIP_TYPE(VENDOR_ID); + ptr->fvend.ft.length = sizeof ptr->fvend / 4; + strncpy(ptr->fvend.vendor_id, "mellanox", sizeof ptr->fvend.vendor_id); + ptr->fa.ft.type = FIP_TYPE(ADDRESS); + ptr->fa.ft.length = sizeof ptr->fa / 4; + strncpy(ptr->fa.vendor_id, "mellanox", sizeof ptr->fa.vendor_id); + ptr->fa.gwtype_qpn = cpu_to_be32(vnic->qp_base_num); + /* sl in vnic_login is 0. BXM will provide SL in login ack */ + ptr->fa.sl_gwportid = cpu_to_be16(vnic->gw->info.gw_port_id); + ptr->fa.lid = cpu_to_be16(vnic->port->attr.lid); + memcpy(ptr->fa.guid, &vnic->port->gid.global.interface_id, sizeof ptr->fa.guid); + ptr->fl.ft.type = FIP_TYPE(LOGIN); + ptr->fl.ft.length = sizeof ptr->fl / 4; + strncpy(ptr->fl.vendor_id, "mellanox", sizeof ptr->fl.vendor_id); + ptr->fl.vnic_id = cpu_to_be16(vnic->vnic_id); + + if (vnic->hadmined) { + int mac_valid = !IS_ZERO_MAC(vnic->login_data.mac); + u16 flags = (mac_valid ? FIP_LOGIN_M_FLAG : 0) | + FIP_LOGIN_H_FLAG | + (vnic->login_data.vp ? FIP_LOGIN_VP_FLAG | FIP_LOGIN_V_FLAG : 0); + ptr->fl.flags_vlan = cpu_to_be16(vnic->login_data.vlan | flags ); + memcpy(ptr->fl.mac, vnic->login_data.mac, sizeof ptr->fl.mac); + memcpy(ptr->fl.vnic_name, vnic->login_data.vnic_name, sizeof ptr->fl.vnic_name); + + // TODO remove this when BXM handles 0 addresses + if (!mac_valid) + ptr->fl.mac[ETH_ALEN-1] = 1; + } + + /* all_vlan mode must be enforced between the host and GW side. + For host admin vnic with VLAN we let the host choose the work mode. + If the GW isn't working in that same mode, the login will fail + and the host will enter a login-retry loop + For net admin vnic or host admin without a vlan, we work in the mode + published by the GW */ + if (vnic->gw->info.all_vlan_gw && + (!vnic->hadmined || + (vnic->hadmined && !vnic->login_data.vp))) + ptr->fl.vfields |= cpu_to_be16(FIP_LOGIN_ALL_VLAN_GW_FLAG); + + ptr->fl.syndrom_ctrl_qpn = cpu_to_be32(vnic->gw->discover->qp->qp_num); + ptr->fl.vfields |= cpu_to_be16((vnic->qps_num > 1) << 12); + + /* for child vNics, allow implicit logout */ + if (vnic->parent_used) { + ptr->fl.vfields |= cpu_to_be16(1 << 14); + ptr->fl.vfields |= cpu_to_be16(1 << 13); + } + + return ptr; +} + +/* + * Send a unicast login packet. This function supports both host and + * network admined logins. 
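For reference, the TX ring accounting used by send_generic_mcast_pkt() above works on free-running head/tail counters: the slot index is taken as head & (size - 1) and the ring is treated as full once head - tail >= size, which assumes a power-of-two ring size. A minimal stand-alone sketch of that arithmetic follows; the type and function names here are invented for illustration and are not the driver's.

#include <stdio.h>

/* Free-running head/tail counters over a power-of-two ring, mirroring the
 * head/tail checks in send_generic_mcast_pkt(): the producer advances head,
 * the completion path advances tail, and only the difference head - tail
 * is ever inspected, so counter wrap-around is harmless. */
struct ring {
	unsigned long head;   /* next slot to produce into */
	unsigned long tail;   /* next slot to be completed */
	unsigned long size;   /* must be a power of two */
};

static int ring_post(struct ring *r)
{
	if (r->head - r->tail >= r->size)
		return -1;        /* ring full; the driver returns -EAGAIN */
	printf("posting into slot %lu\n", r->head & (r->size - 1));
	r->head++;
	return 0;
}

int main(void)
{
	struct ring r = { .head = 0, .tail = 0, .size = 4 };
	int i;

	for (i = 0; i < 5; i++)           /* the fifth post must fail */
		printf("post %d -> %d\n", i, ring_post(&r));
	r.tail++;                         /* one completion frees a slot */
	printf("after completion -> %d\n", ring_post(&r));
	return 0;
}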
function returns 0 on success and + * error code on failure +*/ +int fip_login_send(struct fip_vnic_data *vnic) +{ + int ret; + struct eoib_login *ptr; + + ASSERT(vnic); + ASSERT(vnic->port); + + /* don't send packet because GW does not support this */ + if (vnic->hadmined && !vnic->gw->hadmin_gw) + return 0; + + /* alloc packet to be sent */ + ptr = alloc_login_pkt(vnic); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + fip_dbg_dump_raw_pkt(0, ptr, sizeof *ptr, 1, "sending login packet"); + + ret = send_generic_ucast_pkt(vnic->port, &vnic->gw->discover->tx_ring, + ptr, sizeof *ptr, vnic->gw->discover->qp, + vnic->gw->discover->pkey_index, + vnic->gw_address.gw_qpn, + vnic->gw_address.gw_lid, + vnic->gw_address.qkey, + vnic->gw_address.gw_sl, + vnic->gw->info.gw_guid); + if (ret) { + vnic_warn(vnic->port->name, + "fip_login_send: send_generic_ucast_pkt %d\n", ret); + goto error_free_mem; + } + + return 0; + +error_free_mem: + kfree(ptr); + return -ENOMEM; +} + +/* + * This function creates and sends a few types of packets (all ucast): + * vHub context request - new=1, logout=0 + * vHub context update / vnic keep alive - new=0, logout=0 + * vnic logout - new=0, logout=1 +*/ +int fip_update_send(struct fip_vnic_data *vnic, int request_new, int logout) +{ + struct eoib_host_update *pkt; + struct ib_qp *send_qp; + struct fip_ring *tx_ring; + int pkey_index; + int ret = 0; + + ASSERT(vnic); + ASSERT(vnic->port); + + /* alloc packet to be sent */ + pkt = kmalloc(sizeof *pkt, GFP_ATOMIC); + if (!pkt) { + vnic_warn(vnic->port->name, "fip_update_send malloc failed\n"); + return -EAGAIN; + } + + /* copy keep alive packet template */ + if (logout) + memcpy(pkt, &base_logout_pkt, sizeof(struct eoib_host_update)); + else + memcpy(pkt, &base_update_pkt, sizeof(struct eoib_host_update)); + + pkt->fip.opcode = cpu_to_be16(EOIB_FIP_OPCODE); + pkt->fip.list_length = + cpu_to_be16((sizeof(struct eoib_host_update) >> 2) - 3); + pkt->vnic_id = cpu_to_be16(vnic->vnic_id); + memcpy(pkt->mac, vnic->login_data.mac, sizeof(pkt->mac)); + memcpy(pkt->vnic_name, vnic->login_data.vnic_name, + sizeof(pkt->vnic_name)); + memcpy(pkt->port_guid, &vnic->port->gid.global.interface_id, + sizeof(pkt->port_guid)); + + pkt->vhub_id.vhub_id = cpu_to_be32(vnic->login_data.vhub_id); + + if (!logout) { + pkt->tusn = cpu_to_be32(vnic->vhub_table.main_list.tusn); + send_qp = vnic->qp; + tx_ring = &vnic->tx_ring; + pkey_index = vnic->pkey_index; + + if (vnic->login_data.vp) + pkt->vhub_id.flags.flags |= FIP_HOST_VP_FLAG; + + if (request_new) + pkt->vhub_id.flags.flags |= FIP_HOST_R_FLAG; + else + pkt->vhub_id.flags.flags |= FIP_HOST_U_FLAG; + } else { + send_qp = vnic->gw->discover->qp; + tx_ring = &vnic->gw->discover->tx_ring; + pkey_index = vnic->gw->discover->pkey_index; + } + + if (vnic->gw->info.gw_type == GW_TYPE_LAG && + !vnic->gw->info.ext_lag.ucast && !logout) { + struct vnic_mcast *mcaste; + unsigned long flags; + + spin_lock_irqsave(&vnic->mcast_tree.mcast_rb_lock, flags); + mcaste = vnic_mcast_search(&vnic->mcast_tree, &vnic->ka_mcast_gid); + if (!IS_ERR(mcaste)) { + if (mcaste->ah) { + ret = send_generic_mcast_pkt(vnic->port, &vnic->tx_ring, + pkt, sizeof *pkt, vnic->qp, + vnic->pkey_index, mcaste); + vnic_dbg_parse(vnic->name, "sent multicast keep alive\n"); + } + else { + vnic_dbg_parse(vnic->name, "mcaste %p: ah is null\n", mcaste); + kfree(pkt); + } + } else { + vnic_dbg_parse(vnic->name, "ka mcast not found\n"); + ret = -ENOMEM; + } + spin_unlock_irqrestore(&vnic->mcast_tree.mcast_rb_lock, flags); + + } else + ret = 
send_generic_ucast_pkt(vnic->port, tx_ring, pkt, sizeof *pkt, + send_qp, pkey_index, + vnic->gw_address.gw_qpn, + vnic->gw_address.gw_lid, + vnic->gw_address.qkey, + vnic->gw_address.gw_sl, + vnic->gw->info.gw_guid); + if (ret) { + vnic_warn(vnic->port->name, + "fip_update_send: ret %d\n", ret); + goto error_free_mem; + } + + return 0; + +error_free_mem: + kfree(pkt); + return -ENOMEM; +} + +static void dump_lag_member(struct lag_member *m) +{ + vnic_dbg_lag("", "QPN 0x%x, SL %d, gw_portid 0x%x, LID 0x%x, guid " GUID_FORMAT + ", eport_state %s, weight %d, link_utilization %d\n", + m->qpn, m->sl, m->gw_port_id, m->lid, GUID_ARG(m->guid), + eport_state_str(m->eport_state), m->weight, m->link_utilization); +} + +static inline int handle_lag_member(struct fip_vnic_data *vnic, + struct fip_ext_type_lag_members *ext_lag_membs, + int ext_length) +{ + struct lag_members lag_members; + + extract_memb_extended(ext_lag_membs, ext_length, &lag_members, vnic->name); + + /* propogate change in member state as needed */ + return handle_member_update(vnic, &lag_members); +} + +int extract_vhub_extended(struct fip_ext_desc_tlv *fed, + struct fip_vnic_data *vnic) +{ + struct fip_ext_type_ctrl_iport *ext_ctrl_iport; + struct fip_ext_type_lag_members *ext_lag_memb; + struct fip_extended_type *ext_hdr; + struct fip_vnic_send_info *gw_addr; + int length_to_go, ext_length; + + if (fed->ft.type != 254) + return -EINVAL; + + length_to_go = ((int)(fed->ft.length) << 2) - sizeof(*fed); + ext_hdr = (struct fip_extended_type *)(fed + 1); + + while (length_to_go > 0) { + ext_length = ((int)ext_hdr->len) << 2; + + vnic_dbg_parse(vnic->name, "Table Update parse, sub-tlv " + "type %d length %d address=%p\n", + ext_hdr->ext_type, ext_length, ext_hdr); + + if (ext_length < sizeof(*ext_hdr) || + ext_length > length_to_go) { + vnic_dbg_parse(vnic->name, "Extended length error." + " Length=%d\n", ext_length); + return -EINVAL; + } + + switch (ext_hdr->ext_type) { + case ADV_EXT_TYPE(MEMBER): + ext_lag_memb = (struct fip_ext_type_lag_members *)ext_hdr; + + if (handle_lag_member(vnic, ext_lag_memb, ext_length)) + vnic_dbg_parse(vnic->name, "handle_lag_member() failed"); + break; + case ADV_EXT_TYPE(CTRL_IPORT): + if (ext_length != sizeof(*ext_ctrl_iport)) { + vnic_dbg_parse(vnic->name, "Extended length %d is" + " different than expected\n", + ext_length); + return -EINVAL; + } + + gw_addr = &vnic->gw_address; + ext_ctrl_iport = (struct fip_ext_type_ctrl_iport *)ext_hdr; + gw_addr->gw_qpn = be32_to_cpu(ext_ctrl_iport->gwtype_qpn); + gw_addr->gw_lid = be16_to_cpu(ext_ctrl_iport->lid); + gw_addr->gw_sl = be16_to_cpu(ext_ctrl_iport->sl_gwportid) >> FIP_ADVERTISE_SL_SHIFT; + break; + default: + if (ext_hdr->mandatory & 0x01) { + vnic_dbg_parse(vnic->name, "Unknown mandatory extended type %d length %d\n", + ext_hdr->ext_type, ext_length); + return -EINVAL; + } else { + vnic_dbg_parse(vnic->name, "Unknown non-mandatory extended. 
Skipping, type %d length %d\n", + ext_hdr->ext_type, ext_length); + ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length); + length_to_go -= ext_length; + continue; + } + } + + ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length); + length_to_go -= ext_length; + } + + return 0; +} + +static int extract_login_extended(struct fip_ext_desc_tlv *fed, + struct lag_members *lagm, + char *name) +{ + struct fip_ext_type_lag_members *ext_lag_membs; + struct fip_extended_type *ext_hdr; + int length_to_go, ext_length; + + if (fed->ft.type != 254) + return -EINVAL; + + length_to_go = ((int)(fed->ft.length) << 2) - sizeof(*fed); + ext_hdr = (struct fip_extended_type *)(fed + 1); + + while (length_to_go > 0) { + ext_length = ((int)ext_hdr->len) << 2; + + vnic_dbg_parse(name, "Table Update parse, sub-tlv " + "type %d length %d address=%p\n", + ext_hdr->ext_type, ext_length, ext_hdr); + + if (ext_length < sizeof(*ext_hdr) || + ext_length > length_to_go) { + vnic_dbg_parse(name, "Extended length error." + " Length=%d\n", ext_length); + return -EINVAL; + } + + switch (ext_hdr->ext_type) { + case ADV_EXT_TYPE(MEMBER): + ext_lag_membs = (struct fip_ext_type_lag_members *)ext_hdr; + + extract_memb_extended(ext_lag_membs, ext_length, lagm, name); + + break; + default: + if (ext_hdr->mandatory & 0x01) { + vnic_dbg_parse(name, "Unknown mandatory extended type %d length %d\n", + ext_hdr->ext_type, ext_length); + return -EINVAL; + } else { + vnic_dbg_parse(name, "Unknown non-mandatory extended. Skipping, type %d length %d\n", + ext_hdr->ext_type, ext_length); + ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length); + length_to_go -= ext_length; + continue; + } + } + ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length); + length_to_go -= ext_length; + } + + return 0; +} + +void extract_memb_extended(struct fip_ext_type_lag_members *ext_lag_membs, + int ext_length, + struct lag_members *lagm, + char *name) +{ + struct lag_member *m; + struct fip_ext_type_lag_member *lm; + int nmemb = 0; + int i; + + nmemb = (ext_length - sizeof ext_lag_membs->et) / sizeof *lm; + if (nmemb > MAX_LAG_MEMBERS) { + vnic_dbg_parse(name, "recieved %d members but max supported is %d. " + "Using only %d\n", nmemb, MAX_LAG_MEMBERS, + MAX_LAG_MEMBERS); + nmemb = MAX_LAG_MEMBERS; + } + + m = lagm->memb; + lm = ext_lag_membs->lagm; + + for (i = 0; i < nmemb; ++i, ++lm, ++m) { + m->qpn = be32_to_cpu(lm->qpn) & 0xffffff; + m->sl = be16_to_cpu(lm->sl_gw_portid) >> 12; + m->gw_port_id = be16_to_cpu(lm->sl_gw_portid) & 0xfff; + m->lid = be16_to_cpu(lm->lid); + memcpy(m->guid, lm->guid, sizeof m->guid); + m->eport_state = lm->eport_state >> 6; + m->weight = lm->weight; + m->link_utilization = lm->link_utilization; + dump_lag_member(m); + } + lagm->num = nmemb; + + vnic_dbg_parse(name, "Table Update extended parse finished OK. Num members=%d\n", + lagm->num); + return; +} + +/* + * parse a packet that is suspected of being an login ack packet. The packet + * returns 0 for a valid login ack packet and an error code otherwise. The + * packets "interesting" details are returned in data. 
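The extract_adv_extended(), extract_vhub_extended() and extract_login_extended() helpers above all share one sub-TLV parsing convention: the length field counts 4-byte words (so it is shifted left by 2 before use), a length shorter than the header or longer than the remaining buffer aborts the parse, and an unrecognized type is fatal only if its mandatory bit (bit 0) is set. The small stand-alone walker below shows just that convention; the header layout is assumed for the sketch and is not the driver's struct fip_extended_type.

#include <stdio.h>
#include <stdint.h>

/* Assumed 4-byte sub-TLV header, for illustration only. */
struct ext_hdr {
	uint8_t type;
	uint8_t len;        /* length of the whole TLV in 4-byte words */
	uint8_t mandatory;  /* bit 0 set: receiver must understand this TLV */
	uint8_t reserved;
};

static int walk_ext(const uint8_t *buf, int total)
{
	while (total > 0) {
		const struct ext_hdr *h = (const struct ext_hdr *)buf;
		int len = (int)h->len << 2;       /* words -> bytes */

		if (len < (int)sizeof(*h) || len > total)
			return -1;                /* malformed length */
		if (h->type == 40)                /* pretend 40 is a known type */
			printf("known TLV type %u, %d bytes\n", h->type, len);
		else if (h->mandatory & 0x01)
			return -1;                /* unknown but mandatory: reject */
		else
			printf("skipping optional TLV type %u\n", h->type);
		buf += len;
		total -= len;
	}
	return 0;
}

int main(void)
{
	/* one known 8-byte TLV followed by one unknown optional 4-byte TLV */
	static const uint8_t pkt[] = { 40, 2, 1, 0, 0, 0, 0, 0,  39, 1, 0, 0 };

	return walk_ext(pkt, (int)sizeof(pkt));
}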
+ */ +int fip_login_parse(struct fip_discover *discover, struct fip_content *fc, + struct fip_login_data *data) +{ + u32 vfields; + int err = 0; + + data->syndrome = be32_to_cpu(fc->fl->syndrom_ctrl_qpn) >> 24; + data->vnic_id = be16_to_cpu(fc->fl->vnic_id); + data->lid = be16_to_cpu(fc->fa.fa[0]->lid); + data->port_id = be16_to_cpu(fc->fa.fa[0]->sl_gwportid) & 0xfff; + data->sl = be16_to_cpu(fc->fa.fa[0]->sl_gwportid) >> FIP_ADVERTISE_SL_SHIFT; + data->qpn = be32_to_cpu(fc->fa.fa[0]->gwtype_qpn) & 0xffffff; + memcpy(data->guid, fc->fa.fa[0]->guid, sizeof(data->guid)); + + if (be16_to_cpu(fc->fl->flags_vlan) & FIP_LOGIN_VP_FLAG) { + data->vp = 1; + data->vlan = be16_to_cpu(fc->fl->flags_vlan) & 0xfff; + } + data->all_vlan_gw = !!(be16_to_cpu(fc->fl->vfields) & FIP_LOGIN_ALL_VLAN_GW_FLAG); + + data->vhub_id = CREATE_VHUB_ID(cpu_to_be16(data->vlan), data->port_id); + + data->ctl_qpn = be32_to_cpu(fc->fl->syndrom_ctrl_qpn) & FIP_LOGIN_CTRL_QPN_MASK; + vfields = be16_to_cpu(fc->fl->vfields); + data->n_mac_mcgid = vfields & FIP_LOGIN_DMAC_MGID_MASK; + data->n_rss_mgid = vfields >> 8 & 0xf; + /* data->rss = pkt->rss & FIP_LOGIN_RSS_MASK; it's redundant in login ack */ + data->pkey = be16_to_cpu(fc->fp->pkey); + data->mtu = be16_to_cpu(fc->fl->mtu); + + memcpy(data->mac, fc->fl->mac, sizeof(data->mac)); + memcpy(data->mgid_prefix, fc->fl->eth_gid_prefix, sizeof(data->mgid_prefix)); + memcpy(data->vnic_name, fc->fl->vnic_name, sizeof(data->vnic_name)); + memcpy(data->vendor_id, fc->fl->vendor_id, sizeof(data->vendor_id)); + + if (fc->fed.num) + err = extract_login_extended(fc->fed.fed[0], &data->lagm, discover->name); + + return err; +} + +/* + * Check if a received packet is a FIP packet, And if so return its subtype. + * The FIP type is also returned in fip_type and can be either EOIB_FIP_OPCODE + * or FCOIB_FIP_OPCODE. If the packet is not a FIP packet -EINVAL is returned. +*/ +int fip_pkt_parse(char *buffer, int length, int *fip_type) +{ + struct fip_fip_header *fip_header; + u16 fip_opcode; + + fip_header = (struct fip_fip_header *) + (buffer + IB_GRH_BYTES + sizeof(struct fip_eoib_ver)); + + fip_opcode = be16_to_cpu(fip_header->opcode); + + if (fip_opcode != EOIB_FIP_OPCODE) { + *fip_type = 0; + return -EINVAL; + } + + *fip_type = fip_opcode; + + return fip_header->subcode; +} + +/* + * Already know that this is a FIP packet, return its subtype. +*/ +int fip_pkt_get_subtype_bh(char *buffer) +{ + struct fip_fip_header *fip_header; + + fip_header = (struct fip_fip_header *) + (buffer + sizeof(struct fip_eoib_ver)); + + return fip_header->subcode; +} + diff --git a/drivers/net/mlx4_vnic/vnic_fip_pkt.h b/drivers/net/mlx4_vnic/vnic_fip_pkt.h new file mode 100644 index 0000000000000..32e34fce15252 --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_fip_pkt.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _FIP_DISCOVER_PKT_H +#define _FIP_DISCOVER_PKT_H + +#include + + + +#endif /* _FIP_DISCOVER_H */ diff --git a/drivers/net/mlx4_vnic/vnic_fip_vhub.c b/drivers/net/mlx4_vnic/vnic_fip_vhub.c new file mode 100644 index 0000000000000..8bcd6d0b69801 --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_fip_vhub.c @@ -0,0 +1,635 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_fip.h" +#include "vnic_fip_discover.h" +#include "vnic_fip_pkt.h" + +/* + * construct an mgid address based on vnic login information and the type + * variable (data mcast / vhub update / vhub table). The resulting mgid + * is returned in *mgid. + */ +void vhub_mgid_create(const char *mgid_prefix, + const char *mmac, /* mcast mac for bcast 0xFF.. 
*/ + u64 n_mac, /* bits to take from mmac */ + u32 vhub_id, + enum vhub_mgid_type type, + u8 rss_hash, + union vhub_mgid *mgid) +{ + u32 vhub_id_be; + u64 mac_mask; + u64 *mac_ptr; + u64 one = 1; /* must do that for shift bitwise operation */ + + memcpy(mgid->mgid.mgid_prefix, mgid_prefix, + sizeof(mgid->mgid.mgid_prefix)); + mgid->mgid.type = (u8)type; + memcpy(mgid->mgid.dmac, mmac, sizeof(mgid->mgid.dmac)); + mac_mask = cpu_to_le64(((one << n_mac) - one) | 0xFFFF000000000000ULL); + mac_ptr = (u64*)(mgid->mgid.dmac); + *mac_ptr &= mac_mask; + mgid->mgid.rss_hash = rss_hash; + vhub_id_be = cpu_to_be32(vhub_id); + memcpy(mgid->mgid.vhub_id, ((u8 *) &vhub_id_be) + 1, + sizeof(mgid->mgid.vhub_id)); +}; + +/* + * Init the vnic's vHub table data structures, before using them + */ +void vhub_ctx_init(struct fip_vnic_data *vnic) +{ + INIT_LIST_HEAD(&vnic->vhub_table.main_list.vnic_list); + vnic->vhub_table.main_list.tusn = 0; + vnic->vhub_table.main_list.count = 0; + vnic->vhub_table.main_list.total_count = 0; + + INIT_LIST_HEAD(&vnic->vhub_table.update_list.vnic_list); + vnic->vhub_table.update_list.tusn = 0; + vnic->vhub_table.update_list.count = 0; + vnic->vhub_table.update_list.total_count = 0; + + vnic->vhub_table.checksum = 0; + vnic->vhub_table.tusn = 0; + vnic->vhub_table.state = VHUB_TBL_INIT; +} + +/* print vhub context table */ +static void vhub_ctx_prnt(struct fip_vnic_data *vnic, + struct vhub_elist *vhub_list, int level) +{ + struct vnic_table_entry *vnic_entry; + + if (!(vnic_msglvl & VNIC_DEBUG_VHUB_V)) + return; + + vnic_dbg_vhub_v(vnic->name, "Dumping context table. Count %d tusn %d\n", + vhub_list->count, vhub_list->tusn); + + list_for_each_entry(vnic_entry, &vhub_list->vnic_list, list) { + vnic_dbg_vhub_v(vnic->name, "lid 0x%04x qpn 0x%06x, mac " + MAC_6_PRINT_FMT"\n", vnic_entry->lid, + vnic_entry->qpn, + MAC_6_PRINT_ARG(vnic_entry->mac)); + } +} + +void vhub_table_free(struct vhub_elist *elist) +{ + struct vnic_table_entry *del_vnic, *tmp_vnic; + + list_for_each_entry_safe(del_vnic, tmp_vnic, &elist->vnic_list, list) { + list_del(&del_vnic->list); + kfree(del_vnic); + } +} + +/* + * Clear and free the vnic's vHub context table data structures. + */ +void vhub_ctx_free(struct fip_vnic_data *vnic) +{ + vnic_dbg_fip_v(vnic->name, "vhub_ctx_free called\n"); + + vhub_table_free(&vnic->vhub_table.main_list); + vhub_table_free(&vnic->vhub_table.update_list); + + vhub_ctx_init(vnic); +} + +static struct vnic_table_entry *vhub_find_entry(struct vhub_elist *vnic_list, + u16 lid, u32 qpn) +{ + struct vnic_table_entry *tmp_vnic; + + list_for_each_entry(tmp_vnic, &vnic_list->vnic_list, list) { + if (tmp_vnic->lid == lid && tmp_vnic->qpn == qpn) + return tmp_vnic; + } + return NULL; +} + +/* + * Move vHub context entries from the update list to the main list. The update + * list is used during the wait for the main table to be updated. Once + * the table update is completed the entries need to be moved from the update + * table to the main table. This function does this. +*/ +static int vhub_update_main(struct fip_vnic_data *vnic, + struct vhub_elist *main_list, + struct vhub_elist *update_list) +{ + struct vnic_table_entry *new_entry, *tmp_vnic, *del_vnic; + int first_tusn = (u32) update_list->tusn - (update_list->count - 1); + int extra_tusn; + + /* update list is usually empty */ + if (likely(update_list->count == 0)) + return 0; + + if (first_tusn > main_list->tusn + 1) { + vnic_warn(vnic->name, "Info, vhub_to_main_tbl sync main to" + " update list failed. 
update tusn %d update " + "first %d main %d\n", + update_list->tusn, first_tusn, main_list->tusn); + return -1; + } + + extra_tusn = main_list->tusn + 1 - first_tusn; + + /* go over update list and move / remove entries in it */ + list_for_each_entry_safe(new_entry, tmp_vnic, + &update_list->vnic_list, list) { + if (extra_tusn > 0) { + list_del(&new_entry->list); + kfree(new_entry); + extra_tusn--; + } else { + /* remove from update list and apply to main list */ + list_del(&new_entry->list); + main_list->tusn++; + + /* Check valid bit, if set add to main list */ + if (new_entry->valid) { + list_add_tail(&new_entry->list, + &main_list->vnic_list); + main_list->count++; + } else { /* remove from main list */ + del_vnic = vhub_find_entry(main_list, + new_entry->lid, + new_entry->qpn); + if (del_vnic) { + list_del(&del_vnic->list); + kfree(del_vnic); + + main_list->count--; + } + vnic_dbg_fip_v(vnic->name, + "vhub_to_main_tbl removed " + "vnic lid %d qpn 0x%x found %d\n", + (int)new_entry->lid, + (int)new_entry->qpn, + (del_vnic != 0)); + kfree(new_entry); + } + } + update_list->count--; + } + return 0; +} + +int fip_vnic_search_mac(struct fip_vnic_data *vnic, struct vhub_elist *elist) +{ + struct vnic_table_entry *vlist_entry; + + list_for_each_entry(vlist_entry, &elist->vnic_list, list) + /* find matching entry based on mac */ + if(!memcmp(vnic->login_data.mac, vlist_entry->mac, ETH_ALEN)) { + /* verify lid/qpn match */ + if (vnic->port->attr.lid == vlist_entry->lid && + vnic->qp_base_num == vlist_entry->qpn) + return 1; + else { + vnic_dbg_vhub(vnic->name, + "vnic LID=0x%x or QPN=0x%x " + "in vhub tbl is different than " + "expected LID=0x%x, QPN=0x%x\n", + vlist_entry->lid, + vlist_entry->qpn, + vnic->port->attr.lid, + vnic->qp_base_num); + break; + } + } + + return 0; +} + +/* + * This function handles a vhub context table packet. The table will + * be processed only if we do not have an up to date local copy of + * our own. The table update supports multi-packet tables so care + * must be taken in building the complete table. + */ +int vhub_handle_tbl(struct fip_vnic_data *vnic, struct fip_content *fc, + u32 vhub_id, u32 tusn) +{ + struct context_table_entry *entry; + struct vnic_table_entry *new_entry; + struct vhub_elist *table; + int i, j, count_in_pkt; + int reason = 0; + int hdr_type; + + /* we already have a table. disregard this one */ + if (vnic->vhub_table.state != VHUB_TBL_INIT) { + vnic_dbg_vhub_v(vnic->name, + "vhub_handle_tbl context not in init\n"); + return 0; + } + + /* compute the number of vnic entries in the packet. + * don't forget the checksum + */ + count_in_pkt = fc->cte.num; + table = &vnic->vhub_table.main_list; + hdr_type = be16_to_cpu(fc->fvt->hdr) >> 14; + + /* first or only packet in sequence */ + if (hdr_type == FIP_TABLE_HDR_FIRST || hdr_type == FIP_TABLE_HDR_ONLY) { + table->total_count = be16_to_cpu(fc->fvt->table_size); + table->tusn = tusn; + } + if (table->tusn != tusn) { + vnic_warn(vnic->name, "Info, vhub_handle_tbl got unexpected " + "tusn. Expect=%d received=%d\n", table->tusn, tusn); + if (!table->tusn) + goto drop_silently; + reason = 1; + goto reset_table; + } + + if ((table->count + count_in_pkt > table->total_count) || + ((table->count + count_in_pkt < table->total_count) && + (hdr_type == FIP_TABLE_HDR_LAST || hdr_type == FIP_TABLE_HDR_ONLY))) { + vnic_dbg_vhub(vnic->name, + "vhub_handle_tbl got unexpected entry count. 
" + "count %d, in packet %d total expected %d\n", + table->count, count_in_pkt, table->total_count); + reason = 2; + goto reset_table; + } + + entry = fc->cte.cte; + for (i = 0; i < count_in_pkt; ++i, ++entry) { + new_entry = kzalloc(sizeof *new_entry, GFP_KERNEL); + if (!new_entry) + goto reset_table; + + for (j = 0; j < (sizeof *entry) >> 2; ++j) + vnic->vhub_table.checksum += ((u32 *) entry)[j]; + + new_entry->lid = be16_to_cpu(entry->lid); + new_entry->qpn = be32_to_cpu(entry->qpn) & 0xffffff; + new_entry->sl = entry->sl & 0xf; + new_entry->rss = !!(entry->v_rss_type & FIP_CONTEXT_RSS_FLAG); + new_entry->valid = !!(entry->v_rss_type & FIP_CONTEXT_V_FLAG); + memcpy(new_entry->mac, entry->mac, sizeof(new_entry->mac)); + + list_add_tail(&new_entry->list, &table->vnic_list); + table->count++; + } + + /* last packet */ + if (hdr_type == FIP_TABLE_HDR_LAST || hdr_type == FIP_TABLE_HDR_ONLY) { + ASSERT(table->count == table->total_count); + if (vnic->vhub_table.checksum != be32_to_cpu(*(u32 *) entry)) { + vnic_dbg_fip_v(vnic->name, + "vhub_handle_tbl checksum mismatch. " + "expected 0x%x, in packet 0x%x\n", + vnic->vhub_table.checksum, + be32_to_cpu(*(u32 *) entry)); + /* TODO: request checksum match in final code */ + /* goto reset_table; */ + } + + if (vhub_update_main(vnic, &vnic->vhub_table.main_list, + &vnic->vhub_table.update_list)) { + vnic_dbg_fip_v(vnic->name, + "vhub_handle_tbl moving update list to main " + "list failed\n"); + reason = 3; + goto reset_table; + } + + /* we are done receiving the context table */ + vnic_dbg_fip_v(vnic->name, + "vhub_handle_tbl updated with %d entries\n", + vnic->vhub_table.main_list.count); + vhub_ctx_prnt(vnic, &vnic->vhub_table.main_list, 0); + + /* we are not in the main vHub list close ourselves */ + if (!fip_vnic_search_mac(vnic, &vnic->vhub_table.main_list)) { + vnic_dbg_fip_p0(vnic->name, "We are not in the main table close our selves\n"); + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + reason = 4; + goto reset_table; + } + + if (fip_vnic_tbl_done(vnic)) { + vnic_warn(vnic->name, "vhub_handle_tbl done failed, reseting table\n"); + reason = 5; + goto reset_table; + } + } + +drop_silently: + return 0; + +reset_table: + vnic_dbg_fip_p0(vnic->name, "We are not in the main table close our selves reason=%d\n", reason); + vhub_ctx_free(vnic); + /* TODO renable tx of update request, fip_update_send() */ + return -EINVAL; +} + +/* + * This function writes the main vhub table to the data (login) vnic. 
+ * You should call it when the data vnic is ready for it and after the + * table is up to date (and the update list was applied to the main list) + */ +int fip_vnic_write_tbl(struct fip_vnic_data *vnic) +{ + struct vnic_table_entry *vlist_entry; + int rc; + + if (vnic->login) + sprintf(vnic->name, "%s", vnic->login->name); + + /* update table in neigh tree */ + list_for_each_entry(vlist_entry, + &vnic->vhub_table.main_list.vnic_list, list) { + rc = vnic_vhube_add(vnic, vlist_entry); + if (rc) { + vnic_warn(vnic->name, "vnic_vhube_add failed for mac " + MAC_6_PRINT_FMT" (rc %d)\n", + MAC_6_PRINT_ARG(vlist_entry->mac), rc); + vhub_ctx_free(vnic); + vnic_vhube_flush(vnic); + return -1; + } + } + + vnic_dbg_fip(vnic->name, "fip_vnic_tbl_done: creation of vnic done\n"); + + vnic->vhub_table.tusn = vnic->vhub_table.main_list.tusn; + vnic->vhub_table.state = VHUB_TBL_UPDATED; + + /* free table memory */ + vhub_table_free(&vnic->vhub_table.main_list); + return 0; +} + +/* + * This function handles a vhub context update packets received AFTER + * we have a valid vhub table. For update additions the code adds an + * entry to the neighbour tree. For update removals we either remove + * the entry from the neighbour list or if the removed entry is "this vnic" + * we remove the vnic. +*/ +static int vhub_update_updated(struct fip_vnic_data *vnic, + u32 vhub_id, u32 pkt_tusn, + struct vnic_table_entry *data) +{ + int curr_tusn; + + curr_tusn = vnic->vhub_table.tusn; + + /* if vnic is being flushed, return */ + if (vnic->flush) + return 0; + + /* we got a GW keep alive packet */ + if (pkt_tusn == curr_tusn) + return 0; + + /* if we got an out of order update clear list and request new table */ + if (pkt_tusn != curr_tusn + 1) { + vnic_warn(vnic->name, "Info, vhub_update_up2date received out" + " of order update. Recvd=%d Expect=%d\n", + pkt_tusn, curr_tusn); + goto error_in_update; + } + + /* new entry added */ + if (data->valid) { + if (vnic_vhube_add(vnic, data)) { + vnic_dbg_fip(vnic->name, "vnic_vhube_add " + "failed to update vnic neigh tree\n"); + goto error_in_update; + } + } else { /* remove entry */ + /* the remove request is for this vnic :-o */ + if (!memcmp(vnic->login_data.mac, data->mac, ETH_ALEN)) { + vnic_dbg_fip_p0(vnic->name, "remove this vnic "MAC_6_PRINT_FMT"\n", + MAC_6_PRINT_ARG(vnic->login_data.mac)); + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + } else { + vnic_dbg_fip(vnic->name, "remove neigh vnic\n"); + vnic_vhube_del(vnic, data->mac); + } + } + + vnic->vhub_table.tusn = pkt_tusn; + + return 0; + +error_in_update: + vhub_ctx_free(vnic); + vnic_vhube_flush(vnic); + fip_update_send(vnic, 1 /* new */, 0 /* logout */); + return -1; +} + +/* + * This function handles a vhub context update packets received BEFORE + * we have a valid vhub table. The function adds the update request + * to an update list to be processed after the entire vhub table is received + * and processed. + */ +static int vhub_update_init(struct fip_vnic_data *vnic, + u32 vhub_id, u32 pkt_tusn, + struct vnic_table_entry *data) +{ + struct vnic_table_entry *new_vnic; + struct vhub_elist *vnic_list; + int curr_tusn; + + vnic_list = &vnic->vhub_table.update_list; + curr_tusn = vnic_list->tusn; + + /* if we got an out of order update clear list and request new table */ + if ((pkt_tusn < curr_tusn || pkt_tusn > curr_tusn + 1) + && curr_tusn != 0) { + vnic_warn(vnic->name, "Info, vhub_update_init received out of" + " order update. 
got %d my %d\n", pkt_tusn, curr_tusn); + goto error_in_update; + } + + /* we got a GW keep alive packet */ + if (pkt_tusn == curr_tusn) { + vnic_dbg_fip_v(vnic->name, "Received GW keep alive update." + " tusn %d\n", curr_tusn); + return 0; + } + + /* got remove request for this vnic don't wait */ + if (!(data->valid) && + !memcmp(vnic->login_data.mac, data->mac, ETH_ALEN)) { + vhub_ctx_free(vnic); + vnic_dbg_fip_p0(vnic->name, "got request to close vNic vhub_update_init\n"); + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + goto err; + } + + new_vnic = kzalloc(sizeof *new_vnic, GFP_KERNEL); + if (!new_vnic) + goto error_in_update; + + memcpy(new_vnic, data, sizeof *data); + list_add_tail(&new_vnic->list, &vnic_list->vnic_list); + vnic_list->count++; + vnic_list->tusn = pkt_tusn; + vhub_ctx_prnt(vnic, vnic_list, 0); + return 0; + +error_in_update: + vhub_ctx_free(vnic); + fip_update_send(vnic, 1 /* new */, 0 /* logout */); +err: + return -1; +} + +/* + * This function handles a vhub context update packets received after + * we have a valid vhub table but before it was passed to the data rbtree. + * The function applies the update request to the main vhub table. + */ +static int vhub_update_inter(struct fip_vnic_data *vnic, + u32 vhub_id, u32 pkt_tusn, + struct vnic_table_entry *data) +{ + struct vnic_table_entry *new_vnic, *del_vnic; + struct vhub_elist *vnic_list; + int curr_tusn; + + vnic_list = &vnic->vhub_table.main_list; + curr_tusn = vnic_list->tusn; + + /* if we got an out of order update clear list and request new table */ + if ((pkt_tusn < curr_tusn || pkt_tusn > curr_tusn + 1) + && curr_tusn != 0) { + vnic_warn(vnic->name, "Info, vhub_update_init received out" + " of order update. got %d my %d\n", pkt_tusn, curr_tusn); + goto error_in_update; + } + + /* we got a GW keep alive packet */ + if (pkt_tusn == curr_tusn) { + vnic_dbg_fip_v(vnic->name, "Received GW keep alive update." + " tusn %d\n", curr_tusn); + return 0; + } + + /* we got an add request */ + if (data->valid) { + new_vnic = kzalloc(sizeof *new_vnic, GFP_KERNEL); + if (!new_vnic) + goto error_in_update; + + memcpy(new_vnic, data, sizeof *data); + list_add_tail(&new_vnic->list, &vnic_list->vnic_list); + vnic_list->count++; + vnic_list->tusn = pkt_tusn; + } else { /* we got a remove request */ + /* remove is for this vnic */ + if (!memcmp(vnic->login_data.mac, data->mac, ETH_ALEN)) { + vhub_ctx_free(vnic); + vnic_dbg_fip_p0(vnic->name, "got request to close vNic vhub_update_inter\n"); + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + goto err; + } + + /* search and delete the vnic */ + del_vnic = vhub_find_entry(vnic_list, + data->lid, + data->qpn); + if (del_vnic) { + list_del(&del_vnic->list); + kfree(del_vnic); + vnic_list->count--; + } + vnic_dbg_fip_v(vnic->name, + "vhub_update_inter removed " + "vnic lid %d qpn 0x%x found %d\n", + (int)data->lid, (int)data->qpn, + (del_vnic != 0)); + } + + vhub_ctx_prnt(vnic, vnic_list, 0); + return 0; + +error_in_update: + vhub_ctx_free(vnic); + fip_update_send(vnic, 1 /* new */, 0 /* logout */); +err: + return -1; +} + +/* + * This function handles a vhub context update packets. There are three flows + * in handeling update packets. The first is before the main table is up + * to date, the second is after the table is up to date but before it was + * passed to the ownership of the data vnic (login struct) and the local + * lists are freed, and the last is when the table maintanence is done + * by the data vnic. This function handles all cases. 
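All three vhub_update_*() handlers above, dispatched by vhub_handle_update(), gate on the table update sequence number (TUSN) the same way: a packet carrying the current TUSN is only a gateway keep-alive, current + 1 is the next in-order update to apply, and anything else drops the local table and re-requests a full one (the pre-table handler additionally accepts any TUSN while its own counter is still 0). A compact sketch of that classification follows, with names invented for illustration.

#include <stdio.h>
#include <stdint.h>

enum tusn_action {
	TUSN_KEEPALIVE,   /* same TUSN: gateway keep-alive, nothing to do */
	TUSN_APPLY,       /* exactly one ahead: apply this update */
	TUSN_RESYNC,      /* gap or regression: drop table, request a new one */
};

static enum tusn_action classify_tusn(uint32_t cur, uint32_t pkt)
{
	if (pkt == cur)
		return TUSN_KEEPALIVE;
	if (pkt == cur + 1)
		return TUSN_APPLY;
	return TUSN_RESYNC;
}

int main(void)
{
	printf("%d %d %d\n",
	       classify_tusn(7, 7),     /* 0: keep-alive */
	       classify_tusn(7, 8),     /* 1: apply */
	       classify_tusn(7, 10));   /* 2: resync */
	return 0;
}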
+*/ +int vhub_handle_update(struct fip_vnic_data *vnic, + u32 vhub_id, u32 tusn, + struct vnic_table_entry *data) +{ + int ret = 0; + + /* + * if we do not have an up to date table to use the update list. + * if we have an up to date table apply the updates to the + * main table list. + */ + switch (vnic->vhub_table.state) { + case VHUB_TBL_INIT: /* No full table yet, keep updates for later */ + ret = vhub_update_init(vnic, vhub_id, tusn, data); + break; + case VHUB_TBL_UP2DATE: /* full table available, not writen to data half */ + ret = vhub_update_inter(vnic, vhub_id, tusn, data); + break; + case VHUB_TBL_UPDATED: /* full table available and writen to data half */ + ret = vhub_update_updated(vnic, vhub_id, tusn, data); + break; + default: + break; + } + + return ret; +} diff --git a/drivers/net/mlx4_vnic/vnic_main.c b/drivers/net/mlx4_vnic/vnic_main.c new file mode 100644 index 0000000000000..f9f6127528400 --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_main.c @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "vnic.h" +#include "vnic_data.h" +#include "vnic_fip.h" + +MODULE_AUTHOR(DRV_AUTH); +MODULE_DESCRIPTION(DRV_DESC); +MODULE_LICENSE(DRV_LIC); +MODULE_VERSION(DRV_VER); + +extern struct net_device_stats *vnic_get_stats(struct net_device *n); +extern int mlx4_vnic_set_stats_function(struct net_device_stats *(*func)(struct net_device *n)); + +static int __init mlx4_ib_init(void) +{ + vnic_dbg_func("module_init"); + + mlx4_vnic_set_stats_function(vnic_get_stats); + if (vnic_param_check()) + goto err; + if (vnic_mcast_init()) + goto err; + if (vnic_ports_init()) + goto free_mcast; + + return 0; + +free_mcast: + vnic_mcast_cleanup(); +err: + mlx4_vnic_set_stats_function(NULL); + return -EINVAL; +} + +static void __exit mlx4_ib_cleanup(void) +{ + int ret; + + vnic_dbg_func("module_exit"); + + do { + ret = mlx4_vnic_set_stats_function(NULL); + if (ret) + msleep(10); + } while(ret); + + vnic_ports_cleanup(); + vnic_dbg_mark(); + vnic_mcast_cleanup(); +} + +module_init(mlx4_ib_init); +module_exit(mlx4_ib_cleanup); diff --git a/drivers/net/mlx4_vnic/vnic_mcast.c b/drivers/net/mlx4_vnic/vnic_mcast.c new file mode 100644 index 0000000000000..a036b484d1461 --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_mcast.c @@ -0,0 +1,1095 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" + +struct workqueue_struct *mcast_wq; +struct ib_sa_client vnic_sa_client; + +//static void vnic_mcast_detach_task(struct work_struct *work); +static void vnic_mcast_attach_task(struct work_struct *work); +static void vnic_port_mcast_leave_task(struct work_struct *work); +static void vnic_port_mcast_join_task(struct work_struct *work); + +static void vnic_port_mcast_release(struct vnic_port_mcast *mcaste); +static struct vnic_port_mcast *vnic_port_mcast_update(struct vnic_mcast + *_mcaste); + +/* + * A helper function to prevent code duplication. Fills vnic_mcast struct with + * common values. 
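mlx4_ib_init() above follows the usual kernel error-unwind idiom: each init step that fails jumps to a label that undoes only the steps already completed, in reverse order (a vnic_ports_init() failure unwinds through free_mcast before the common error exit). A user-space sketch of the same shape follows; the step names are made up for illustration.

#include <stdio.h>

/* Stand-ins for the real init/cleanup steps (vnic_mcast_init(),
 * vnic_ports_init(), ...); each init returns 0 on success. */
static int step_a_init(void)     { printf("A init\n"); return 0; }
static void step_a_cleanup(void) { printf("A cleanup\n"); }
static int step_b_init(void)     { printf("B init\n"); return -1; /* fail */ }

static int module_init_sketch(void)
{
	if (step_a_init())
		goto err;
	if (step_b_init())
		goto free_a;          /* undo only what already succeeded */
	return 0;

free_a:
	step_a_cleanup();
err:
	return -1;
}

int main(void)
{
	return module_init_sketch() ? 1 : 0;
}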
+ * + * in: mcaste - mcaste to fill + * in: gw_id - to be used in creation MGID address + * in: mac - to be used in creation MGID address + * in: create - value of create field in mcaste + */ +void __vnic_mcaste_fill(struct vnic_login *login, struct vnic_mcast *mcaste, + u16 gw_id, const u8 *mac, u8 rss_hash, int create) +{ + union vhub_mgid mgid; + + memcpy(mcaste->mac, mac, ETH_ALEN); + vhub_mgid_create(login->mgid_prefix, mcaste->mac, + login->n_mac_mcgid, + CREATE_VHUB_ID(login->vid, gw_id), + VHUB_MGID_DATA, rss_hash, &mgid); + memcpy(&mcaste->gid, mgid.ib_gid.raw, GID_LEN); + memcpy(&mcaste->port_gid, &mcaste->gid, GID_LEN); + mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC); + mcaste->backoff_factor = 1; + mcaste->retry = VNIC_MCAST_MAX_RETRY; + mcaste->blocking = 0; + mcaste->qkey = login->qkey; + mcaste->pkey = login->pkey; + mcaste->create = create; + mcaste->qp = login->qp_res[0].qp; /* mcast/bcast is only on first QP */ + mcaste->join_state = 1; +} + +/* + * A helper function to prevent code duplication. Receives a multicast mac + * and a gw_id and attaches it (join + attach). The function also receives + * a default_mcaste (used for the MGID over default MLID hack and a user list. + * Returns 0 on success and non 0 on failure. + * + * in: mmac - to be used in creation MGID address + * in: default_mcaste - mcaste entry of the default MGID. Can be NULL + * in: user_list - A user list to hang the new mcaste on. Can be NULL + * in: gw_id - to be used in creation MGID address + */ +int _vnic_mcast_attach_mgid(struct vnic_login *login, + char *mmac, + struct vnic_mcast *default_mcaste, + void *private_data, + u16 gw_id) +{ + struct vnic_mcast *mcaste; + int rc = 0; + int rss_hash; + + mcaste = vnic_mcast_alloc(login->port, NULL, NULL); + if (IS_ERR(mcaste)) { + vnic_warn(login->name, "vnic_mcast_alloc for "MAC_6_PRINT_FMT" failed\n", + MAC_6_PRINT_ARG(mmac)); + vnic_dbg_mark(); + return -ENOMEM; + } + memcpy(mcaste->mac, mmac, ETH_ALEN); + + /* if mcast mac has mcast IP in it:*/ + rss_hash = 0; + if ((mcaste->mac[0] & 0xf0) == 0xe0 && + mcaste->mac[4] == 0x00 && + mcaste->mac[5] == 0x00) { + /* calculate mcas rss_hash on IP octets */ + rss_hash = mcaste->mac[0] ^ mcaste->mac[1] ^ + mcaste->mac[2] ^ mcaste->mac[3]; + /* and build the corresponding mcast MAC using the IEEE + * multicast OUI 01:00:5e + */ + mcaste->mac[5] = mcaste->mac[3]; + mcaste->mac[4] = mcaste->mac[2]; + mcaste->mac[3] = mcaste->mac[1] & 0x7f; + mcaste->mac[2] = 0x5e; + mcaste->mac[1] = 0x00; + mcaste->mac[0] = 0x01; + } + + __vnic_mcaste_fill(login, mcaste, gw_id, mcaste->mac, rss_hash, 0); + mcaste->priv_data = private_data; + + if (default_mcaste) + memcpy(&mcaste->port_gid, &default_mcaste->gid, GID_LEN); + + rc = vnic_mcast_add(&login->mcast_tree, mcaste); /* add holds mcast_rb_lock */ + if (!rc) { + rc = vnic_mcast_attach(&login->mcast_tree, mcaste); + ASSERT(!rc); + } else if (rc == -EEXIST){ + /* MGID may be already in the tree when n_mac_mcgid > 0 (ok)*/ + vnic_dbg_mcast(login->name, "vnic_mcast_add for " + MAC_6_PRINT_FMT" already exist, rc %d\n", + MAC_6_PRINT_ARG(mcaste->mac), rc); + vnic_mcast_dealloc(mcaste); + rc = 0; + } else { + vnic_warn(login->name, "vnic_mcast_add for " + MAC_6_PRINT_FMT" failed, rc %d\n", + MAC_6_PRINT_ARG(mcaste->mac), rc); + vnic_mcast_dealloc(mcaste); + } + return rc; +} + +struct vnic_mcast *vnic_mcast_alloc(struct vnic_port *port, + unsigned long *req_attach, + unsigned long *cur_attached) +{ + struct vnic_mcast *mcaste; + + mcaste = kzalloc(sizeof *mcaste, 
GFP_ATOMIC); + if (!mcaste) + return ERR_PTR(-ENOMEM); + /* set mcaste fields */ + init_completion(&mcaste->attach_complete); + INIT_DELAYED_WORK(&mcaste->attach_task, vnic_mcast_attach_task); + spin_lock_init(&mcaste->lock); + mcaste->port = port; + mcaste->req_attach = req_attach; + mcaste->cur_attached = cur_attached; + + return mcaste; +} + +void vnic_mcast_dealloc(struct vnic_mcast *mcaste) +{ + struct vnic_port *port; + + ASSERT(mcaste); + port = mcaste->port; + vnic_dbg_mcast_vv(port->name, "dealloc vnic_mcast: MAC "MAC_6_PRINT_FMT + " GID "VNIC_GID_FMT"\n", + MAC_6_PRINT_ARG(mcaste->mac), + VNIC_GID_ARG(mcaste->gid)); + kfree(mcaste); +} + +/* + * This function grabs the mcast_tree->mcast_rb_lock +*/ +int vnic_mcast_add(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste) +{ + struct rb_node **n = &mcast_tree->mcast_tree.rb_node, *pn = NULL; + struct vnic_mcast *mcaste_t; + unsigned long flags; + int rc; + + spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags); + while (*n) { + pn = *n; + mcaste_t = rb_entry(pn, struct vnic_mcast, rb_node); + rc = memcmp(mcaste->gid.raw, mcaste_t->gid.raw, GID_LEN); + if (rc < 0) + n = &pn->rb_left; + else if (rc > 0) + n = &pn->rb_right; + else { + rc = -EEXIST; + goto out; + } + } + + rb_link_node(&mcaste->rb_node, pn, n); + rb_insert_color(&mcaste->rb_node, &mcast_tree->mcast_tree); + + rc = 0; + +out: + vnic_dbg_mcast_v(mcaste->port->name, + "added (rc %d) vnic_mcast: MAC "MAC_6_PRINT_FMT + " GID "VNIC_GID_FMT"\n", rc, + MAC_6_PRINT_ARG(mcaste->mac), + VNIC_GID_ARG(mcaste->gid)); + + spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags); + return rc; +} + +/* + * The caller must hold the mcast_tree->mcast_rb_lock lock before calling + */ +void vnic_mcast_del(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste) +{ + rb_erase(&mcaste->rb_node, &mcast_tree->mcast_tree); +} + +/* + * The caller must hold the mcast_tree->mcast_rb_lock lock before calling +*/ +struct vnic_mcast *vnic_mcast_search(struct mcast_root *mcast_tree, + union ib_gid *gid) +{ + struct rb_node *n = mcast_tree->mcast_tree.rb_node; + struct vnic_mcast *mcaste_t; + int rc; + + while (n) { + mcaste_t = rb_entry(n, struct vnic_mcast, rb_node); + rc = memcmp(gid->raw, mcaste_t->gid.raw, GID_LEN); + if (rc < 0) + n = n->rb_left; + else if (rc > 0) + n = n->rb_right; + else { + vnic_dbg_mcast_v(mcaste_t->port->name, + "found: MAC "MAC_6_PRINT_FMT" GID " + VNIC_GID_FMT"\n", + MAC_6_PRINT_ARG(mcaste_t->mac), + VNIC_GID_ARG(mcaste_t->gid)); + goto out; + } + } + mcaste_t = ERR_PTR(-ENODATA); + +out: + return mcaste_t; +} + +static void vnic_mcast_detach_ll(struct vnic_mcast *mcaste, struct mcast_root *mcast_tree) +{ + struct vnic_port *port = mcaste->port; + struct ib_ah *tmp_ih; + unsigned long flags; + int rc; + + vnic_dbg_mcast_v(port->name, + "mcaste->attached %d for mac "MAC_6_PRINT_FMT"\n", + test_bit(MCAST_ATTACHED, &mcaste->state), + MAC_6_PRINT_ARG(mcaste->mac)); + + spin_lock_irqsave(&mcaste->lock, flags); + if (!test_and_clear_bit(MCAST_ATTACHED, &mcaste->state)) { + spin_unlock_irqrestore(&mcaste->lock, flags); + return; + } + + tmp_ih = mcaste->ah; + mcaste->ah = NULL; + spin_unlock_irqrestore(&mcaste->lock, flags); + + /* callback */ + if (mcaste->detach_cb) { + vnic_dbg_mcast(port->name, "calling detach_cb\n"); + mcaste->detach_cb(mcaste, mcaste->detach_cb_ctx); + } + + if (!mcaste->sender_only) + rc = ib_detach_mcast(mcaste->qp, &mcaste->gid, port->attr.lid); + else + rc = 0; + + ASSERT(tmp_ih); + if (ib_destroy_ah(tmp_ih)) + vnic_warn(port->name, + 
"ib_destroy_ah failed (rc %d) for mcaste mac " + MAC_6_PRINT_FMT"\n", rc, + MAC_6_PRINT_ARG(mcaste->mac)); + vnic_dbg_mcast(port->name, "GID "VNIC_GID_FMT" detached!\n", + VNIC_GID_ARG(mcaste->gid)); +} + +int vnic_mcast_detach(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste) +{ + struct vnic_port *port = mcaste->port; + unsigned long flags; + + /* must be a task, to make sure no attach task is pending */ + vnic_dbg_mcast_v(port->name, "queue delayed task (%lu) " + "vnic_mcast_detach_task\n", mcaste->backoff); + + /* cancel any pending/queued tasks. We can not use sync + * under the spinlock because it might hang. we need the + * spinlock here to ensure the requeueing is atomic + */ + vnic_dbg_mcast_v(port->name, "cancel attach_task\n"); + spin_lock_irqsave(&mcaste->lock, flags); + clear_bit(MCAST_ATTACH_RUNNING, &mcaste->state); + spin_unlock_irqrestore(&mcaste->lock, flags); +#ifndef _BP_WORK_SYNC + cancel_delayed_work_sync(&mcaste->attach_task); +#else + cancel_delayed_work(&mcaste->attach_task); + flush_workqueue(mcast_wq); +#endif + vnic_mcast_detach_ll(mcaste, mcast_tree); + + if (mcaste->port_mcaste) + vnic_port_mcast_release(mcaste->port_mcaste); + + return 0; +} + +static void vnic_mcast_attach_task(struct work_struct *work) +{ + struct ib_ah_attr av; + struct vnic_mcast *mcaste = + container_of(work, struct vnic_mcast, attach_task.work); + struct vnic_port *port = mcaste->port; + unsigned long flags; + int rc; + u16 mlid; + + if (++mcaste->attach_task_cnt > mcaste->retry && mcaste->retry) { + vnic_dbg_mcast_v(port->name, + "attach_task stopped, tried %ld times\n", + mcaste->retry); + goto out; + } + + /* update backoff time */ + mcaste->backoff = min(mcaste->backoff * mcaste->backoff_factor, + msecs_to_jiffies(VNIC_MCAST_BACKOFF_MAX_MSEC)); + + if (!test_bit(MCAST_JOINED, &mcaste->port_mcaste->state)) { + vnic_dbg_mcast_v(port->name, "joined %d, retry %ld from %ld\n", + test_bit(MCAST_JOINED, &mcaste->port_mcaste->state), + mcaste->attach_task_cnt, mcaste->retry); + goto retry; + } + + /* attach QP */ + ASSERT(mcaste); + ASSERT(mcaste->port_mcaste); + ASSERT(mcaste->port_mcaste->sa_mcast); + mlid = be16_to_cpu(mcaste->port_mcaste->rec.mlid); + vnic_dbg_mcast(port->name, "QPN 0x%06x attaching MGID "VNIC_GID_FMT + " LID 0x%04x\n", mcaste->qp->qp_num, + VNIC_GID_ARG(mcaste->gid), mlid); + if (!mcaste->sender_only) + rc = ib_attach_mcast(mcaste->qp, &mcaste->gid, mlid); + else + rc = 0; + + if (rc) { + int attach_count = atomic_read(&mcaste->port_mcaste->ref_cnt); + + vnic_err(port->name, "failed to attach (rc %d) to multicast " + "group, MGID "VNIC_GID_FMT"\n", + rc, VNIC_GID_ARG(mcaste->gid)); + + if (port->dev->attr.max_mcast_qp_attach <= attach_count) { + vnic_err(port->name, "Attach failed. Too many vnics are on the same" + " vhub on this port. 
vnics count=%d, max=%d\n",
+				 attach_count,
+				 port->dev->attr.max_mcast_qp_attach);
+		}
+
+		goto retry;
+	} else {
+		/* create mcast ah */
+		memset(&av, 0, sizeof(av));
+		av.dlid = be16_to_cpu(mcaste->port_mcaste->rec.mlid);
+		av.port_num = mcaste->port->num;
+		av.ah_flags = IB_AH_GRH;
+		av.static_rate = mcaste->port_mcaste->rec.rate;
+		av.sl = mcaste->port_mcaste->rec.sl;
+		memcpy(&av.grh.dgid, mcaste->gid.raw, GID_LEN);
+		mcaste->ah = ib_create_ah(port->pd, &av);
+		if (IS_ERR(mcaste->ah)) {
+			rc = PTR_ERR(mcaste->ah); /* read the error before clearing the pointer */
+			mcaste->ah = NULL;
+			vnic_err(port->name,
+				 "vnic_ib_create_ah failed (rc %d)\n", rc);
+			/* for such a failure, no need to retry */
+			goto out;
+		}
+		vnic_dbg_mcast(mcaste->port->name, "created mcast ah for %p\n", mcaste);
+
+		/* callback */
+		set_bit(MCAST_ATTACHED, &mcaste->state);
+		if (mcaste->cur_attached)
+			set_bit(mcaste->attach_bit_nr, mcaste->cur_attached);
+		vnic_dbg_mcast(mcaste->port->name,
+			       "attached GID "VNIC_GID_FMT"\n",
+			       VNIC_GID_ARG(mcaste->gid));
+		if (mcaste->attach_cb) {
+			vnic_dbg_mcast(mcaste->port->name,
+				       "calling attach_cb\n");
+			mcaste->attach_cb(mcaste, mcaste->attach_cb_ctx);
+		}
+	}
+
+out:
+	mcaste->attach_task_cnt = 0; /* for next time */
+	mcaste->backoff = mcaste->backoff_init;
+	complete(&mcaste->attach_complete);
+
+	return;
+
+retry:
+	spin_lock_irqsave(&mcaste->lock, flags);
+	if (test_bit(MCAST_ATTACH_RUNNING, &mcaste->state)) {
+		/* calls vnic_mcast_attach_task() */
+		queue_delayed_work(mcast_wq, &mcaste->attach_task, mcaste->backoff);
+	}
+	spin_unlock_irqrestore(&mcaste->lock, flags);
+}
+
+int vnic_mcast_attach(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste)
+{
+	struct vnic_port_mcast *pmcaste;
+	struct vnic_port *port = mcaste->port;
+	int rc = 0;
+	ASSERT(mcaste);
+
+	mcaste->backoff_init = mcaste->backoff;
+
+	pmcaste = vnic_port_mcast_update(mcaste);
+	if (IS_ERR(pmcaste)) {
+		vnic_err(port->name, "vnic_port_mcast_update failed GID "
+			 VNIC_GID_FMT"\n", VNIC_GID_ARG(mcaste->gid));
+		rc = PTR_ERR(pmcaste);
+		goto out;
+	}
+
+	mcaste->port_mcaste = pmcaste;
+
+	set_bit(MCAST_ATTACH_RUNNING, &mcaste->state);
+
+	/* must be a task, to sample the joined flag */
+	vnic_dbg_mcast_v(port->name, "queue delayed task (%lu) "
+			 "vnic_mcast_join_task\n", mcaste->backoff);
+	init_completion(&mcaste->attach_complete);
+	/* calls vnic_mcast_attach_task() */
+	queue_delayed_work(mcast_wq, &mcaste->attach_task, 0);
+	if (mcaste->blocking) {
+		wait_for_completion(&mcaste->attach_complete);
+		if (test_bit(MCAST_ATTACHED, &mcaste->state))
+			goto out;
+		vnic_mcast_detach(mcast_tree, mcaste);
+		rc = 1;
+	}
+
+out:
+	return rc;
+}
+
+#if 0
+static int vnic_mcast_attach_all(struct mcast_root *mcast_tree)
+{
+	int fails = 0;
+	struct vnic_mcast *mcaste;
+	struct rb_node *n;
+
+	n = rb_first(&mcast_tree->mcast_tree);
+	while (n) {
+		mcaste = rb_entry(n, struct vnic_mcast, rb_node);
+		n = rb_next(n);
+		/* async call */
+		if (vnic_mcast_attach(mcast_tree, mcaste))
+			fails++;
+	}
+
+	return fails;
+}
+#endif
+
+int vnic_mcast_del_all(struct mcast_root *mcast_tree)
+{
+	struct rb_node *n;
+	struct vnic_mcast *mcaste, *mcaste_t;
+	unsigned long flags;
+	int fails = 0;
+	LIST_HEAD(local_list);
+
+	spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags);
+	n = rb_first(&mcast_tree->mcast_tree);
+	while (n) {
+		mcaste = rb_entry(n, struct vnic_mcast, rb_node);
+		vnic_mcast_del(mcast_tree, mcaste);
+		list_add_tail(&mcaste->list, &local_list);
+		n = rb_first(&mcast_tree->mcast_tree);
+	}
+	spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags);
+
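+	/*
+	 * Tear the entries down outside mcast_rb_lock: vnic_mcast_detach()
+	 * cancels the attach work synchronously and may sleep, so the entries
+	 * are first unlinked into local_list under the lock and only detached
+	 * and freed below.
+	 */
+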
list_for_each_entry_safe(mcaste, mcaste_t, &local_list, list) { + list_del(&mcaste->list); + vnic_mcast_detach(mcast_tree, mcaste); + vnic_mcast_dealloc(mcaste); + } + + return fails; +} + +int vnic_mcast_del_user(struct mcast_root *mcast_tree, void *owner) +{ + struct rb_node *n; + struct vnic_mcast *mcaste, *mcaste_t; + unsigned long flags; + int fails = 0; + LIST_HEAD(local_list); + + spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags); + n = rb_first(&mcast_tree->mcast_tree); + while (n) { + mcaste = rb_entry(n, struct vnic_mcast, rb_node); + if (mcaste->priv_data == owner) { + list_add_tail(&mcaste->list, &local_list); + vnic_mcast_del(mcast_tree, mcaste); + n = rb_first(&mcast_tree->mcast_tree); + continue; + } + n = rb_next(&mcaste->rb_node); + } + spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags); + + list_for_each_entry_safe(mcaste, mcaste_t, &local_list, list) { + list_del(&mcaste->list); + vnic_mcast_detach(mcast_tree, mcaste); + vnic_mcast_dealloc(mcaste); + } + + return fails; +} + +/* PORT MCAST FUNCTIONS */ +static struct vnic_port_mcast *vnic_port_mcast_alloc(struct vnic_port *port, + union ib_gid *gid) +{ + struct vnic_port_mcast *mcaste; + + mcaste = kzalloc(sizeof *mcaste, GFP_ATOMIC); + if (!mcaste) + return ERR_PTR(-ENOMEM); + + mcaste->gid = *gid; + mcaste->port = port; + init_completion(&mcaste->leave_complete); + atomic_set(&mcaste->ref_cnt, 1); + INIT_DELAYED_WORK(&mcaste->join_task, vnic_port_mcast_join_task); + INIT_WORK(&mcaste->leave_task, vnic_port_mcast_leave_task); + mcaste->sa_mcast = ERR_PTR(-EINVAL); + memset(&mcaste->rec,0,sizeof(mcaste->rec)); + vnic_dbg_mcast_v(mcaste->port->name, "allocated port_mcast GID " + VNIC_GID_FMT"\n", VNIC_GID_ARG(mcaste->gid)); + spin_lock_init(&mcaste->lock); + set_bit(MCAST_JOIN_RUNNING, &mcaste->state); + + return mcaste; +} + +static void vnic_port_mcast_dealloc(struct vnic_port_mcast *mcaste) +{ + ASSERT(mcaste); + vnic_dbg_mcast_v(NULL, "dealloc port_mcast GID " + VNIC_GID_FMT"\n", VNIC_GID_ARG(mcaste->gid)); + kfree(mcaste); +} + +/* + * This function accesses the port mcast tree. Please make sure + * to call it only while holding the port mcast_rb_lock +*/ +static int vnic_port_mcast_add(struct vnic_port_mcast *mcaste) +{ + struct rb_node **n = &mcaste->port->mcast_tree.mcast_tree.rb_node; + struct rb_node *pn = NULL; + struct vnic_port_mcast *mcaste_t; + int rc; + + while (*n) { + pn = *n; + mcaste_t = rb_entry(pn, struct vnic_port_mcast, rb_node); + rc = memcmp(mcaste->gid.raw, mcaste_t->gid.raw, GID_LEN); + if (rc < 0) + n = &pn->rb_left; + else if (rc > 0) + n = &pn->rb_right; + else { + rc = -EEXIST; + goto out; + } + } + + rb_link_node(&mcaste->rb_node, pn, n); + rb_insert_color(&mcaste->rb_node, &mcaste->port->mcast_tree.mcast_tree); + rc = 0; + +out: + vnic_dbg_mcast_v(mcaste->port->name, "added (rc %d) port_mcast GID " + VNIC_GID_FMT"\n", rc, VNIC_GID_ARG(mcaste->gid)); + return rc; +} + +/* + * This function accesses the port mcast tree. Please make sure + * to call it only while holding the port mcast_rb_lock +*/ +static void vnic_port_mcast_del(struct vnic_port_mcast *mcaste) +{ + ASSERT(mcaste); + vnic_dbg_mcast_v(mcaste->port->name, "del port_mcast GID " + VNIC_GID_FMT"\n", VNIC_GID_ARG(mcaste->gid)); + rb_erase(&mcaste->rb_node, &mcaste->port->mcast_tree.mcast_tree); +} + +/* + * This function accesses the port mcast tree. 
Please make sure + * to call it only while holding the port mcast_rb_lock +*/ +struct vnic_port_mcast *vnic_port_mcast_search(struct vnic_port *port, + union ib_gid *gid) +{ + struct rb_node *n = port->mcast_tree.mcast_tree.rb_node; + struct vnic_port_mcast *mcaste_t; + int rc; + + while (n) { + mcaste_t = rb_entry(n, struct vnic_port_mcast, rb_node); + rc = memcmp(gid->raw, mcaste_t->gid.raw, GID_LEN); + if (rc < 0) + n = n->rb_left; + else if (rc > 0) + n = n->rb_right; + else { + vnic_dbg_mcast_v(mcaste_t->port->name, + "found: GID "VNIC_GID_FMT"\n", + VNIC_GID_ARG(mcaste_t->gid)); + goto out; + } + } + mcaste_t = ERR_PTR(-ENODATA); + +out: + return mcaste_t; +} +/* +static void vnic_port_mcast_leave_task(struct work_struct *work) +{ + struct vnic_port_mcast *mcaste = + container_of(work, struct vnic_port_mcast, leave_task.work); + + vnic_dbg_mcast_v(mcaste->port->name, "leave GID "VNIC_GID_FMT"\n", + VNIC_GID_ARG(mcaste->gid)); + + if (!IS_ERR(mcaste->sa_mcast) && test_bit(MCAST_JOINED, &mcaste->port_mcaste->state)) + vnic_dbg_mcast(mcaste->port->name, + "mcast left: GID "VNIC_GID_FMT"\n", + VNIC_GID_ARG(mcaste->gid)); + if (!IS_ERR(mcaste->sa_mcast)) + ib_sa_free_multicast(mcaste->sa_mcast); + mcaste->sa_mcast = ERR_PTR(-EINVAL); + clear_bit(MCAST_JOINED, &mcaste->port_mcaste->state); +} +*/ + +static int vnic_port_mcast_leave(struct vnic_port_mcast *mcaste, + unsigned long backoff) +{ + unsigned long flags; + + ASSERT(mcaste); + vnic_dbg_mcast(NULL, "queue delayed task (%lu) " + "vnic_mcast_leave_task\n", backoff); + + /* cancel any pending/queued tasks. We can not use sync + * under the spinlock because it might hang. we need the + * spinlock here to ensure the requeueing is atomic + */ + spin_lock_irqsave(&mcaste->lock, flags); + clear_bit(MCAST_JOIN_RUNNING, &mcaste->state); + spin_unlock_irqrestore(&mcaste->lock, flags); +#ifndef _BP_WORK_SYNC + cancel_delayed_work_sync(&mcaste->join_task); +#else + cancel_delayed_work(&mcaste->join_task); + if (delayed_work_pending(&mcaste->join_task)) { + return -EBUSY; + } +#endif + + if (test_and_clear_bit(MCAST_JOIN_STARTED, &mcaste->state) + && !IS_ERR(mcaste->sa_mcast)) { + ib_sa_free_multicast(mcaste->sa_mcast); + mcaste->sa_mcast = ERR_PTR(-EINVAL); + } + + return 0; +} + +static int vnic_port_mcast_join_comp(int status, struct ib_sa_multicast *sa_mcast) +{ + struct vnic_port_mcast *mcaste = sa_mcast->context; + unsigned long flags; + + vnic_dbg_mcast(mcaste->port->name, "join completion for GID " + VNIC_GID_FMT" (status %d)\n", + VNIC_GID_ARG(mcaste->gid), status); + + if (status == -ENETRESET) + return 0; + + if (status) + goto retry; + + /* same as mcaste->rec = mcaste->sa_mcast->rec; */ + mcaste->rec = sa_mcast->rec; + + set_bit(MCAST_JOINED, &mcaste->state); + vnic_dbg_mcast(mcaste->port->name, "joined GID "VNIC_GID_FMT"\n", + VNIC_GID_ARG(mcaste->gid)); +#if 0 + vnic_dbg_mcast_v(mcaste->port->name, "mcast record dump:\n"); + vnic_dbg_mcast_v(mcaste->port->name, "mgid "VNIC_GID_FMT"\n", + VNIC_GID_ARG(rec->mgid)); + vnic_dbg_mcast_v(mcaste->port->name, "port_gid "VNIC_GID_FMT"\n", + VNIC_GID_ARG(rec->port_gid)); + vnic_dbg_mcast_v(mcaste->port->name, "pkey 0x%x\n", rec->pkey); + vnic_dbg_mcast_v(mcaste->port->name, "qkey 0x%x\n", rec->qkey); + vnic_dbg_mcast_v(mcaste->port->name, "mtu_slct 0x%x\n", + rec->mtu_selector); + vnic_dbg_mcast_v(mcaste->port->name, "mtu 0x%x\n", rec->mtu); + vnic_dbg_mcast_v(mcaste->port->name, "rate_slct 0x%x\n", + rec->rate_selector); + vnic_dbg_mcast_v(mcaste->port->name, "rate 0x%x\n", rec->rate); + 
vnic_dbg_mcast_v(mcaste->port->name, "sl 0x%x\n", rec->sl); + vnic_dbg_mcast_v(mcaste->port->name, "flow_label 0x%x\n", + rec->flow_label); + vnic_dbg_mcast_v(mcaste->port->name, "hop_limit 0x%x\n", + rec->hop_limit); +#endif + + goto out; +retry: + /* calls vnic_port_mcast_join_task() */ + spin_lock_irqsave(&mcaste->lock, flags); + if (test_bit(MCAST_JOIN_RUNNING, &mcaste->state)) + queue_delayed_work(mcast_wq, &mcaste->join_task, mcaste->backoff); + spin_unlock_irqrestore(&mcaste->lock, flags); + +out: + /* rc is always zero so we handle ib_sa_free_multicast ourselves */ + return 0; +} + +static void vnic_port_mcast_join_task(struct work_struct *work) +{ + struct vnic_port_mcast *mcaste = + container_of(work, struct vnic_port_mcast, join_task.work); + struct ib_sa_mcmember_rec rec = { + .join_state = mcaste->join_state + }; + int rc; + ib_sa_comp_mask comp_mask; + unsigned long flags; + + if (++mcaste->join_task_cnt > mcaste->retry && mcaste->retry) { + vnic_dbg_mcast(mcaste->port->name, + "join_task stopped, tried %ld times\n", + mcaste->retry); + goto out; + } + + /* update backoff time */ + mcaste->backoff = min(mcaste->backoff * mcaste->backoff_factor, + msecs_to_jiffies(VNIC_MCAST_BACKOFF_MAX_MSEC)); + + rec.mgid.global = mcaste->gid.global; + rec.port_gid.global = mcaste->port->gid.global; + rec.pkey = cpu_to_be16(mcaste->pkey); + + comp_mask = + IB_SA_MCMEMBER_REC_MGID | + IB_SA_MCMEMBER_REC_PORT_GID | + /*IB_SA_MCMEMBER_REC_PKEY | */ + IB_SA_MCMEMBER_REC_JOIN_STATE; + + if (mcaste->create) { + comp_mask |= + IB_SA_MCMEMBER_REC_QKEY | + IB_SA_MCMEMBER_REC_MTU_SELECTOR | + IB_SA_MCMEMBER_REC_MTU | + IB_SA_MCMEMBER_REC_TRAFFIC_CLASS | + IB_SA_MCMEMBER_REC_RATE_SELECTOR | + IB_SA_MCMEMBER_REC_RATE | + IB_SA_MCMEMBER_REC_SL | + IB_SA_MCMEMBER_REC_FLOW_LABEL | + IB_SA_MCMEMBER_REC_HOP_LIMIT | + IB_SA_MCMEMBER_REC_PKEY; + + rec.qkey = cpu_to_be32(mcaste->qkey); + rec.mtu_selector = IB_SA_EQ; + rec.rate_selector = IB_SA_EQ; + /* when no_bxm is set, use min values to let everybody in */ + rec.mtu = no_bxm ? IB_MTU_2048 : mcaste->port->attr.max_mtu; + rec.rate = no_bxm ? 
IB_RATE_10_GBPS : mcaste->port->rate_enum; + rec.sl = 0; + rec.flow_label = 0; + rec.hop_limit = 0; + } + + vnic_dbg_mcast(mcaste->port->name, "joining MGID "VNIC_GID_FMT + " create %d, comp_mask %lu\n", + VNIC_GID_ARG(mcaste->gid), mcaste->create, (unsigned long)comp_mask); + + if (!IS_ERR(mcaste->sa_mcast)) + ib_sa_free_multicast(mcaste->sa_mcast); + + mcaste->sa_mcast = + ib_sa_join_multicast(&vnic_sa_client, mcaste->port->dev->ca, + mcaste->port->num, &rec, comp_mask, + GFP_KERNEL, vnic_port_mcast_join_comp, mcaste); + set_bit(MCAST_JOIN_STARTED, &mcaste->state); + + if (IS_ERR(mcaste->sa_mcast)) { + rc = PTR_ERR(mcaste->sa_mcast); + vnic_warn(mcaste->port->name, + "ib_sa_join_multicast failed, status %d\n", rc); + /* calls vnic_port_mcast_join_task() */ + spin_lock_irqsave(&mcaste->lock, flags); + if (test_bit(MCAST_JOIN_RUNNING, &mcaste->state)) + queue_delayed_work(mcast_wq, &mcaste->join_task, mcaste->backoff); + spin_unlock_irqrestore(&mcaste->lock, flags); + } + + return; + +out: + mcaste->join_task_cnt = 0; /* for next time */ + mcaste->backoff = mcaste->backoff_init; + return; +} + +static int vnic_port_mcast_join(struct vnic_port_mcast *mcaste) +{ + unsigned long flags; + + ASSERT(mcaste); + vnic_dbg_mcast_v(mcaste->port->name, "queue delayed task (%lu) " + "vnic_port_mcast_join_task\n", mcaste->backoff); + + /* calls vnic_port_mcast_join_task() */ + spin_lock_irqsave(&mcaste->lock, flags); + if (test_bit(MCAST_JOIN_RUNNING, &mcaste->state)) + queue_delayed_work(mcast_wq, &mcaste->join_task, 0); + spin_unlock_irqrestore(&mcaste->lock, flags); + + return 0; +} + +#if 0 +static int vnic_port_mcast_join_all(struct vnic_port *port) +{ + int fails = 0; + struct vnic_port_mcast *mcaste; + struct rb_node *n; + + n = rb_first(&port->mcast_tree.mcast_tree); + while (n) { + mcaste = rb_entry(n, struct vnic_port_mcast, rb_node); + n = rb_next(n); + if (vnic_port_mcast_join(mcaste)) + fails++; + } + + return fails; +} +#endif + +static void vnic_port_mcast_leave_task(struct work_struct *work) +{ + struct vnic_port_mcast *mcaste = + container_of(work, struct vnic_port_mcast, leave_task); + +#ifndef _BP_WORK_SYNC + vnic_port_mcast_leave(mcaste, 0); +#else + if (vnic_port_mcast_leave(mcaste, 0)) { + queue_work(mcast_wq, &mcaste->leave_task); + return; + } +#endif + vnic_port_mcast_dealloc(mcaste); +} + +static void vnic_port_mcast_release(struct vnic_port_mcast *mcaste) +{ + unsigned long flags; + + struct vnic_port *port = mcaste->port; + + vnic_dbg_mcast(port->name, "update mcaste->ref_cnt %d -> %d\n", + atomic_read(&mcaste->ref_cnt), + atomic_read(&mcaste->ref_cnt) - 1); + + spin_lock_irqsave(&port->mcast_tree.mcast_rb_lock, flags); + if (atomic_dec_and_test(&mcaste->ref_cnt)) { + vnic_port_mcast_del(mcaste); + spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags); + + /* we are not going to wait for the leave to terminate. + * We will just go on. 
+ * calls vnic_port_mcast_leave_task() + */ + queue_work(mcast_wq, &mcaste->leave_task); + } else + spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags); +} + +static struct vnic_port_mcast *vnic_port_mcast_update(struct vnic_mcast *_mcaste) +{ + union ib_gid *gid = &_mcaste->port_gid; + u32 qkey = _mcaste->qkey; + u16 pkey = _mcaste->pkey; + struct vnic_port *port = _mcaste->port; + struct vnic_port_mcast *mcaste; + unsigned long flags; + + spin_lock_irqsave(&port->mcast_tree.mcast_rb_lock, flags); + mcaste = vnic_port_mcast_search(port, gid); + /* entry found */ + if (PTR_ERR(mcaste) != -ENODATA) { + ASSERT(!IS_ERR(mcaste)); + atomic_inc(&mcaste->ref_cnt); + spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags); + vnic_dbg_mcast(mcaste->port->name, + "found, add GID "VNIC_GID_FMT" \n", + VNIC_GID_ARG(*gid)); + vnic_dbg_mcast(mcaste->port->name, + "update mcaste->ref_cnt %d -> %d\n", + atomic_read(&mcaste->ref_cnt), + atomic_read(&mcaste->ref_cnt) + 1); + } else { /* not found, add it */ + mcaste = vnic_port_mcast_alloc(port, gid); + if (IS_ERR(mcaste)) { + spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags); + return mcaste; + } + vnic_dbg_mcast(mcaste->port->name, + "not found, add GID "VNIC_GID_FMT" \n", + VNIC_GID_ARG(*gid)); + vnic_dbg_mcast(mcaste->port->name, + "update mcaste->ref_cnt %d -> %d\n", + atomic_read(&mcaste->ref_cnt), + atomic_read(&mcaste->ref_cnt) + 1); + mcaste->qkey = qkey; + mcaste->pkey = pkey; + mcaste->backoff_init = _mcaste->backoff_init; + mcaste->backoff = _mcaste->backoff; + mcaste->backoff_factor = _mcaste->backoff_factor; + mcaste->retry = _mcaste->retry; + mcaste->create = _mcaste->create; + mcaste->join_state = _mcaste->join_state; + vnic_port_mcast_add(mcaste); + spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags); + + vnic_port_mcast_join(mcaste); + vnic_dbg_mcast(mcaste->port->name, "added\n"); + } + + return mcaste; +} + +#if 0 +void vnic_port_mcast_del_all(struct vnic_port *port) +{ + + struct rb_node *n; + struct vnic_port_mcast *mcaste, *mcaste_t; + LIST_HEAD(local_list); + + ASSERT(port); + + n = rb_first(&port->mcast_tree.mcast_tree); + while (n) { + mcaste = rb_entry(n, struct vnic_port_mcast, rb_node); + list_add_tail(&mcaste->list, &local_list); + n = rb_next(&mcaste->rb_node); + } + + list_for_each_entry_safe(mcaste, mcaste_t, &local_list, list) { + list_del(&mcaste->list); + vnic_warn(port->name, "shouldn't find gid "VNIC_GID_FMT"\n", + VNIC_GID_ARG(mcaste->gid)); + vnic_port_mcast_release(mcaste); + } + + return; +} +#endif + +void vnic_tree_mcast_detach(struct mcast_root *mcast_tree) +{ + struct vnic_mcast *mcaste, *mcaste_t; + struct rb_node *n; + unsigned long flags; + INIT_LIST_HEAD(&mcast_tree->reattach_list); + + spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags); + n = rb_first(&mcast_tree->mcast_tree); + while (n) { + mcaste = rb_entry(n, struct vnic_mcast, rb_node); + list_add_tail(&mcaste->list, &mcast_tree->reattach_list); + vnic_mcast_del(mcast_tree, mcaste); + mcaste->attach_task_cnt = 0; + n = rb_first(&mcast_tree->mcast_tree); + } + spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags); + + list_for_each_entry_safe(mcaste, mcaste_t, &mcast_tree->reattach_list, list) { + vnic_mcast_detach(mcast_tree, mcaste); + } + + return; +} + +void vnic_tree_mcast_attach(struct mcast_root *mcast_tree) +{ + struct vnic_mcast *mcaste, *mcaste_t; + int rc; + + /* The add function grabs the mcast_rb_lock no need to take it */ + list_for_each_entry_safe(mcaste, mcaste_t, 
&mcast_tree->reattach_list, list) { + rc = vnic_mcast_add(mcast_tree, mcaste); + ASSERT(!rc); + rc = vnic_mcast_attach(mcast_tree, mcaste); + ASSERT(!rc); + list_del(&mcaste->list); + } + + return; +} + +int vnic_mcast_init() +{ + ib_sa_register_client(&vnic_sa_client); + + mcast_wq = create_singlethread_workqueue("mcast_wq"); + if (!mcast_wq) + return -ENOMEM; + + return 0; +} + +void vnic_mcast_cleanup() +{ + ASSERT(mcast_wq); + vnic_dbg_mark(); + flush_workqueue(mcast_wq); + vnic_dbg_mark(); + destroy_workqueue(mcast_wq); + vnic_dbg_mark(); + ib_sa_unregister_client(&vnic_sa_client); + + return; +} diff --git a/drivers/net/mlx4_vnic/vnic_mcast.h b/drivers/net/mlx4_vnic/vnic_mcast.h new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/drivers/net/mlx4_vnic/vnic_param.c b/drivers/net/mlx4_vnic/vnic_param.c new file mode 100644 index 0000000000000..a1955e360fc68 --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_param.c @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "vnic.h" +#include "vnic_fip.h" + +u32 vnic_lro_num = VNIC_MAX_LRO_DESCS; +u32 vnic_net_admin = 1; +u32 vnic_child_max = VNIC_CHILD_MAX; +u32 vnic_tx_rings_num = 0; +u32 vnic_rx_rings_num = 0; +u32 vnic_tx_rings_len = VNIC_TX_QUEUE_LEN; +u32 vnic_rx_rings_len = VNIC_RX_QUEUE_LEN; +u32 vnic_mgid_data_type = 0; +u32 vnic_encap_headroom = 1; +u32 vnic_tx_polling = 1; +u32 vnic_rx_linear = 0; +u32 vnic_change_mac = 0; +u32 vnic_learn_mac_enabled = 1; +u32 vnic_synd_backlog = 4; +u32 vnic_eport_state_enforce = 0; +u32 vnic_src_mac_enforce = 0; +u32 vnic_inline_tshold = 0; +u32 vnic_discovery_pkeys[MAX_NUM_PKEYS_DISCOVERY]; +u32 vnic_discovery_pkeys_count = MAX_NUM_PKEYS_DISCOVERY; +u32 vnic_gid_index = 0; + +/* these params are enbaled in debug mode */ +u32 no_bxm = 0; +u32 vnic_msglvl = 0x80000000; +u32 vnic_max_tx_outs = VNIC_MAX_TX_OUTS; +u32 vnic_linear_small_pkt = 1; +u32 vnic_mcast_create = 0; +u32 vnic_napi_weight = VNIC_MAX_RX_CQE; + +module_param_named(tx_rings_num, vnic_tx_rings_num, int, 0444); +MODULE_PARM_DESC(tx_rings_num, "Number of TX rings, use 0 for #cpus [default 0, max 32]"); + +module_param_named(tx_rings_len, vnic_tx_rings_len, int, 0444); +MODULE_PARM_DESC(tx_rings_len, "Length of TX rings, must be power of two [default 1024, max 8K]"); + +module_param_named(rx_rings_num, vnic_rx_rings_num, int, 0444); +MODULE_PARM_DESC(rx_rings_num, "Number of RX rings, use 0 for #cpus [default 0, max 32]"); + +module_param_named(rx_rings_len, vnic_rx_rings_len, int, 0444); +MODULE_PARM_DESC(rx_rings_len, "Length of RX rings, must be power of two [default 2048, max 8K]"); + +module_param_named(eport_state_enforce, vnic_eport_state_enforce, int, 0644); +MODULE_PARM_DESC(eport_state_enforce, "Bring interface up only when corresponding EPort is up [default 0]"); + +module_param_named(src_mac_enforce, vnic_src_mac_enforce, int, 0644); +MODULE_PARM_DESC(src_mac_enforce, "Enforce source MAC address [default 0]"); + +module_param_named(vnic_net_admin, vnic_net_admin, int, 0644); +MODULE_PARM_DESC(vnic_net_admin, "Enable Network Administration mode [default 1]"); + +module_param_named(vnic_child_max, vnic_child_max, int, 0644); +MODULE_PARM_DESC(vnic_child_max, "Max child vNics (per interface), use 0 to disable [default 128]"); + +module_param_named(mgid_data_type, vnic_mgid_data_type, int, 0444); +MODULE_PARM_DESC(mgid_data_type, "Set MGID data type for multicast traffic [default 0, max 1]"); + +module_param_named(encap_headroom, vnic_encap_headroom, int, 0444); +MODULE_PARM_DESC(encap_headroom, "Use SKB headroom for protocol encapsulation [default 1]"); + +module_param_named(inline_tshold, vnic_inline_tshold, int, 0444); +MODULE_PARM_DESC(inline_tshold, "Packets smaller than this threshold (in bytes) use inline & blue flame [default 0, max 512]"); + +module_param_named(tx_polling, vnic_tx_polling, int, 0444); +MODULE_PARM_DESC(tx_polling, "Enable TX polling mode [default 1]"); + +module_param_named(rx_linear, vnic_rx_linear, int, 0444); +MODULE_PARM_DESC(rx_linear, "Enable linear RX buffers [default 0]"); + +module_param_named(change_mac, vnic_change_mac, int, 0444); +MODULE_PARM_DESC(change_mac, "Enable MAC change using child vNics [default 0]"); + +module_param_named(learn_tx_mac, vnic_learn_mac_enabled, int, 0644); +MODULE_PARM_DESC(learn_tx_mac, "Enable TX MAC learning in promisc mode [default 1]"); + +module_param_named(synd_backlog, vnic_synd_backlog, int, 0644); +MODULE_PARM_DESC(synd_backlog, "Syndrome error reporting backlog limit [default 4]"); + 
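+/*
+ * Illustrative usage only (the values and the mlx4_vnic module name are
+ * assumptions, not part of this patch): ring parameters are validated by
+ * vnic_param_check(), which rounds the ring lengths down to a power of two
+ * and clamps them to the VNIC_*_QUEUE_LEN_MIN/MAX limits, e.g.:
+ *
+ *   modprobe mlx4_vnic rx_rings_num=4 tx_rings_len=2048 eport_state_enforce=1
+ */
+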
+module_param_array_named(discovery_pkeys, vnic_discovery_pkeys, int, &vnic_discovery_pkeys_count, 0444); +MODULE_PARM_DESC(discovery_pkeys, "Vector of PKeys to be used for discovery [default 0xffff, max vector length 24]"); + +#if !(defined(NETIF_F_GRO) && !defined(_BP_NO_GRO)) +module_param_named(lro_num, vnic_lro_num, int, 0444); +MODULE_PARM_DESC(lro_num, "Number of LRO sessions per ring, use 0 to disable [default 32, max 32]"); +#endif + +module_param_named(guid_index, vnic_gid_index, int, 0444); +MODULE_PARM_DESC(guid_index, "GUIDs table index to use for EoIB [default 0]"); + +#ifdef CONFIG_MLX4_VNIC_DEBUG +module_param_named(no_bxm, no_bxm, int, 0444); +MODULE_PARM_DESC(no_bxm, "Enable NO BXM mode [default 0]"); + +module_param_named(msglvl, vnic_msglvl, uint, 0644); +MODULE_PARM_DESC(msglvl, "Debug message level [default 0]"); + +module_param_named(max_tx_outs, vnic_max_tx_outs, int, 0644); +MODULE_PARM_DESC(max_tx_outs, "Max outstanding TX packets [default 16]"); + +module_param_named(linear_small_pkt, vnic_linear_small_pkt, int, 0644); +MODULE_PARM_DESC(linear_small_pkt, "Use linear buffer for small packets [default 1]"); + +module_param_named(mcast_create, vnic_mcast_create, int, 0444); +MODULE_PARM_DESC(mcast_create, "Create multicast group during join request [default 0]"); + +module_param_named(napi_weight, vnic_napi_weight, int, 0444); +MODULE_PARM_DESC(napi_weight, "NAPI weight [default 32]"); +#endif /* CONFIG_MLX4_VNIC_DEBUG */ + +int vnic_param_check(void) { +#ifdef CONFIG_MLX4_VNIC_DEBUG + vnic_info("VNIC_DEBUG flag is set\n"); +#endif + + vnic_mcast_create = vnic_mcast_create ? 1 : 0; + vnic_mcast_create = no_bxm ? 1 : vnic_mcast_create; + no_bxm = no_bxm ? 1 : 0; + + vnic_mgid_data_type = max_t(u32, vnic_mgid_data_type, 0); + vnic_mgid_data_type = min_t(u32, vnic_mgid_data_type, 1); + + vnic_rx_rings_num = max_t(u32, vnic_rx_rings_num, 0); + vnic_rx_rings_num = min_t(u32, vnic_rx_rings_num, VNIC_MAX_NUM_CPUS); + + vnic_tx_rings_num = max_t(u32, vnic_tx_rings_num, 0); + vnic_tx_rings_num = min_t(u32, vnic_tx_rings_num, VNIC_MAX_NUM_CPUS); + + vnic_tx_rings_len = rounddown_pow_of_two(vnic_tx_rings_len); + vnic_tx_rings_len = max_t(u32, vnic_tx_rings_len, VNIC_TX_QUEUE_LEN_MIN); + vnic_tx_rings_len = min_t(u32, vnic_tx_rings_len, VNIC_TX_QUEUE_LEN_MAX); + + vnic_rx_rings_len = rounddown_pow_of_two(vnic_rx_rings_len); + vnic_rx_rings_len = max_t(u32, vnic_rx_rings_len, VNIC_RX_QUEUE_LEN_MIN); + vnic_rx_rings_len = min_t(u32, vnic_rx_rings_len, VNIC_RX_QUEUE_LEN_MAX); + + vnic_max_tx_outs = min_t(u32, vnic_tx_rings_len, vnic_max_tx_outs); + + vnic_napi_weight = min_t(u32, vnic_napi_weight, VNIC_MAX_NUM_CPUS); + + vnic_lro_num = max_t(u32, vnic_lro_num, 0); + vnic_lro_num = min_t(u32, vnic_lro_num, VNIC_MAX_LRO_DESCS); + + vnic_inline_tshold = max_t(u32, vnic_inline_tshold, 0); + vnic_inline_tshold = min_t(u32, vnic_inline_tshold, VNIC_MAX_INLINE_TSHOLD); + + return 0; +} diff --git a/drivers/net/mlx4_vnic/vnic_port.c b/drivers/net/mlx4_vnic/vnic_port.c new file mode 100644 index 0000000000000..3dbfd4b8781ec --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_port.c @@ -0,0 +1,518 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_data.h" + +/* globals */ +struct workqueue_struct *port_wq; +struct workqueue_struct *login_wq; + +/* functions */ +static void vnic_port_event(struct ib_event_handler *handler, + struct ib_event *record) +{ + struct vnic_port *port = + container_of(handler, struct vnic_port, event_handler); + + if (record->element.port_num != port->num) + return; + + vnic_info("Received event 0x%x (device %s port %d)\n", + record->event, record->device->name, + record->element.port_num); + + switch (record->event) { + case IB_EVENT_SM_CHANGE: + case IB_EVENT_CLIENT_REREGISTER: + /* calls vnic_port_event_task_light() */ + queue_delayed_work(fip_wq, &port->event_task_light, msecs_to_jiffies(VNIC_SM_HEADSTART)); + break; + case IB_EVENT_PORT_ERR: + case IB_EVENT_PORT_ACTIVE: + /* calls vnic_port_event_task() */ + queue_delayed_work(fip_wq, &port->event_task, 0); + break; + case IB_EVENT_GID_CHANGE: + queue_delayed_work(port_wq, &port->gid_change_event_task, 0); + break; + case IB_EVENT_PKEY_CHANGE: + case IB_EVENT_LID_CHANGE: + /* calls port_fip_discover_restart() */ + if (no_bxm) + queue_delayed_work(fip_wq, &port->event_task, 0); + else + queue_delayed_work(port_wq, &port->discover_restart_task, 0); + break; + case IB_EVENT_SRQ_ERR: + case IB_EVENT_SRQ_LIMIT_REACHED: + case IB_EVENT_QP_LAST_WQE_REACHED: + case IB_EVENT_DEVICE_FATAL: + default: + vnic_warn(port->name, "event 0x%x unhandled\n", record->event); + break; + } + +} + +static inline u8 vnic_mcast_rate_enum(struct vnic_port *port, int rate) +{ + u8 ret; + + switch (rate) { + case 10: + ret = IB_RATE_10_GBPS; + break; + case 20: + ret = IB_RATE_20_GBPS; + break; + case 40: + ret = IB_RATE_40_GBPS; + break; + case 80: + ret = IB_RATE_80_GBPS; + break; + default: + ret = IB_RATE_10_GBPS; + } + return ret; +} + +int vnic_port_query(struct vnic_port *port) +{ + static int set_gid_idx = 1; + /* gids tbl len is a capability of mlx4_dev */ + u32 gids_tbl_len = port->dev->mdev->dev->caps.gid_table_len[port->num]; + + if (set_gid_idx && vnic_gid_index >= gids_tbl_len) { + vnic_warn(port->name, "Invalid vnic_gid_index %d. Value must be between 0 and %d. 
defaulting to 0", + vnic_gid_index, gids_tbl_len - 1); + vnic_gid_index = 0; + set_gid_idx = 0; + } + + if (ib_query_gid(port->dev->ca, port->num, vnic_gid_index, &port->gid)) { + vnic_err(port->name, "ib_query_gid of index %d failed\n", + vnic_gid_index); + return -EINVAL; + } + + if (ib_query_port(port->dev->ca, port->num, &port->attr)) { + vnic_err(port->name, "ib_query_port failed\n"); + return -EINVAL; + } + + port->max_mtu_enum = ib_mtu_enum_to_int(port->attr.max_mtu); + port->rate = ((int)port->attr.active_speed * + ib_width_enum_to_int(port->attr.active_width) * 25) / 10; + port->rate_enum = vnic_mcast_rate_enum(port, port->rate); + + if (ib_query_pkey(port->dev->ca, port->num, port->pkey_index, + &port->pkey)) { + vnic_err(port->name, "ib_query_pkey failed for index %d\n", + port->pkey_index); + return -EINVAL; + } + port->pkey |= 0x8000; + + return 0; +} + +void vnic_port_event_task(struct work_struct *work) +{ + struct vnic_port *port = + container_of(work, struct vnic_port, event_task.work); + struct fip_discover *discover; + + /* refresh port attr, TODO: check what else need to be refreshed */ + vnic_dbg_mark(); + mutex_lock(&port->mlock); + if (vnic_port_query(port)) + vnic_warn(port->name, "vnic_port_query failed\n"); + mutex_unlock(&port->mlock); + + /* refresh login mcasts */ + vnic_login_refresh_mcasts(port); + + list_for_each_entry(discover, &port->fip.discover_list, discover_list) { + /* refresh FIP mcasts */ + if ((!no_bxm) && (discover->state != FIP_DISCOVER_OFF)) + fip_refresh_mcasts(discover); + } + +} + +void vnic_port_gid_change_task(struct work_struct *work) +{ + struct vnic_port *port = + container_of(work, struct vnic_port, gid_change_event_task.work); + u8 old_gid[16]; + memcpy(old_gid, port->gid.raw, 16); + if (vnic_port_query(port)) + vnic_warn(port->name, "vnic_port_query failed\n"); + if (!memcmp(old_gid, port->gid.raw,16)) + { + vnic_dbg(NULL, "GID change event port %d: GID was not changed, ignoring...\n", port->num); + return; + } + vnic_dbg(NULL, "GID change event port %d: restarting fip discover...\n", port->num); + if (no_bxm) + queue_delayed_work(fip_wq, &port->event_task, 0); + else + queue_delayed_work(port_wq, &port->discover_restart_task, 0); +} + +void vnic_port_event_task_light(struct work_struct *work) +{ + struct vnic_port *port = + container_of(work, struct vnic_port, event_task_light.work); + unsigned long flags,mc_flags; + struct fip_discover *discover; + struct rb_node *node; + struct vnic_port_mcast *mcaste; + struct mcast_root *mcast_tree = &port->mcast_tree; + vnic_dbg_mark(); + mutex_lock(&port->mlock); + if (vnic_port_query(port)) + vnic_warn(port->name, "vnic_port_query failed\n"); + + spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags); + for (node = rb_first(&mcast_tree->mcast_tree); node; node = rb_next(node)){ + mcaste = rb_entry(node, struct vnic_port_mcast , rb_node); + clear_bit(MCAST_JOINED, &mcaste->state); + set_bit(MCAST_JOIN_RUNNING, &mcaste->state); + vnic_dbg_mcast(mcaste->port->name,"Rejoin GID="VNIC_GID_FMT"\n",VNIC_GID_ARG(mcaste->gid)); + spin_lock_irqsave(&mcaste->lock, mc_flags); + queue_delayed_work(mcast_wq, &mcaste->join_task, 0); + spin_unlock_irqrestore(&mcaste->lock, mc_flags); + } + + spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags); + mutex_unlock(&port->mlock); + + list_for_each_entry(discover, &port->fip.discover_list, discover_list) { + if ((!no_bxm) && (discover->state != FIP_DISCOVER_OFF)) + fip_refresh_mcasts(discover); + } +} + +struct vnic_port *vnic_port_alloc(struct vnic_ib_dev 
*vnic_dev, u8 num) +{ + struct vnic_port *port; + int def_rings_num; + + port = kzalloc(sizeof *port, GFP_KERNEL); + if (!port) + return ERR_PTR(-ENOMEM); + + /* pre-init fields */ + port->num = num; + port->dev = vnic_dev; + + def_rings_num = min(vnic_dev->ca->num_comp_vectors, VNIC_MAX_NUM_CPUS); + port->rx_rings_num = vnic_rx_rings_num ? vnic_rx_rings_num : def_rings_num; + port->tx_rings_num = vnic_tx_rings_num ? vnic_tx_rings_num : def_rings_num; + + sprintf(port->name, "%s:%d", port->dev->ca->name, port->num); + INIT_LIST_HEAD(&port->login_list); + INIT_LIST_HEAD(&port->fip.discover_list); + INIT_DELAYED_WORK(&port->gid_change_event_task, vnic_port_gid_change_task); + INIT_DELAYED_WORK(&port->event_task, vnic_port_event_task); + INIT_DELAYED_WORK(&port->event_task_light, vnic_port_event_task_light); + INIT_DELAYED_WORK(&port->discover_restart_task, port_fip_discover_restart); + INIT_IB_EVENT_HANDLER(&port->event_handler, vnic_dev->ca, + vnic_port_event); + mutex_init(&port->mlock); + mutex_init(&port->start_stop_lock); + vnic_mcast_root_init(&port->mcast_tree); + atomic_set(&port->vnic_child_ids, 0); + + port->pkey_index = 0; /* used by fip qps, TBD */ + + if (ib_register_event_handler(&port->event_handler)) { + vnic_err(port->name, "ib_register_event_handler failed\n"); + goto err; + } + + vnic_dbg_mark(); + mutex_lock(&port->mlock); + if (vnic_port_query(port)) { + vnic_err(port->name, "vnic_port_query failed\n"); + mutex_unlock(&port->mlock); + if (ib_unregister_event_handler(&port->event_handler)) + vnic_err(port->name, "ib_unregister_event_handler failed!\n"); + goto err; + } + mutex_unlock(&port->mlock); + + return port; +err: + kfree(port); + return ERR_PTR(-EINVAL); +} + +int vnic_port_init(struct vnic_port *port) +{ + return vnic_port_ib_init(port); +} + +void vnic_port_cleanup(struct vnic_port *port) +{ + /* should be empty list */ + vnic_port_ib_cleanup(port); + return; +} + +static void vnic_ib_dev_add_one(struct ib_device *device); +static void vnic_ib_dev_remove_one(struct ib_device *device); +static struct ib_client vnic_init_client = { + .name = DRV_NAME, + .add = vnic_ib_dev_add_one, + .remove = vnic_ib_dev_remove_one, +}; + +static void vnic_ib_dev_add_one(struct ib_device *device) +{ + struct vnic_port *ib_port; + struct vnic_ib_dev *ib_dev; + int s, e, p, rc; + + vnic_dbg(NULL, "ib_dev %s\n", device->name); + + if (memcmp(device->name, "mlx4", 4)) + return; + + if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + return; + + s = 1; + e = device->phys_port_cnt; + + /* alloc ib device */ + ib_dev = kzalloc(sizeof *ib_dev, GFP_KERNEL); + if (!ib_dev) + return; + + /* init ib dev */ + mutex_init(&ib_dev->mlock); + ib_dev->ca = device; + mutex_lock(&ib_dev->mlock); + /* TODO: remove mdev once all mlx4 caps are standard */ + ib_dev->mdev = to_mdev(device); + ASSERT(ib_dev->ca); + sprintf(ib_dev->name, "%s", device->name); + if (ib_query_device(device, &ib_dev->attr)) { + vnic_err(ib_dev->name, "ib_query_device failed on %s\n", + device->name); + goto abort; + } + + VNIC_FW_STR(ib_dev->attr.fw_ver, ib_dev->fw_ver_str); + INIT_LIST_HEAD(&ib_dev->port_list); + vnic_dbg_mark(); + for (p = s; p <= e; ++p) { + /* skip non IB link layers */ + if (rdma_port_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND) + continue; + + /* alloc IB port */ + ib_port = vnic_port_alloc(ib_dev, p); + if (IS_ERR(ib_port)) { + vnic_err(ib_dev->name, + "vnic_port_alloc failed %d from %d\n", p, e); + continue; + } + /* init IB port */ + rc = vnic_port_init(ib_port); + if (rc) { + 
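+			/*
+			 * vnic_port_init() failed: unregister the event
+			 * handler taken in vnic_port_alloc() and skip this
+			 * port; the remaining ports are still probed.
+			 */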
vnic_err(ib_port->name, + "vnic_port_init failed, rc %d\n", rc); + if (ib_unregister_event_handler(&ib_port->event_handler)) + vnic_err(ib_port->name, + "ib_unregister_event_handler failed!\n"); + kfree(ib_port); + continue; + } + if (no_bxm) { + rc = vnic_port_data_init(ib_port); + if (rc) + vnic_err(ib_port->name, + "vnic_port_data_init failed, rc %d\n", rc); + } else { + rc = vnic_port_fip_init(ib_port); + if (rc) + vnic_err(ib_port->name, + "vnic_port_fip_init failed, rc %d\n", rc); + else { + rc = port_fs_init(ib_port); + if (rc) + vnic_warn(ib_port->name, "port_fs_init sysfs:" + "entry creation failed, %d\n", rc); + } + } + if (rc) { + if (ib_unregister_event_handler(&ib_port->event_handler)) + vnic_err(ib_port->name, + "ib_unregister_event_handler failed!\n"); + vnic_port_cleanup(ib_port); + kfree(ib_port); + continue; + + } + vnic_dbg_mark(); + mutex_lock(&ib_port->start_stop_lock); + list_add_tail(&ib_port->list, &ib_dev->port_list); + mutex_unlock(&ib_port->start_stop_lock); + } + + /* set device ctx */ + ib_set_client_data(device, &vnic_init_client, ib_dev); + mutex_unlock(&ib_dev->mlock); + return; + +abort: + mutex_unlock(&ib_dev->mlock); + kfree(ib_dev); +} + +static void vnic_ib_dev_remove_one(struct ib_device *device) +{ + struct vnic_port *port, *port_t; + struct vnic_ib_dev *ib_dev = + ib_get_client_data(device, &vnic_init_client); + + vnic_dbg(NULL, "ib_dev %s\n", device->name); + + if (!ib_dev) + return; + + vnic_dbg_mark(); + mutex_lock(&ib_dev->mlock); + list_for_each_entry_safe(port, port_t, &ib_dev->port_list, list) { + vnic_dbg(port->name, "port %d\n", port->num); + if (ib_unregister_event_handler(&port->event_handler)) + vnic_err(port->name, "ib_unregister_event_handler failed!\n"); + + /* make sure we don't have any more pending events */ +#ifndef _BP_WORK_SYNC + cancel_delayed_work_sync(&port->gid_change_event_task); + cancel_delayed_work_sync(&port->event_task_light); + cancel_delayed_work_sync(&port->event_task); + cancel_delayed_work_sync(&port->discover_restart_task); +#else + cancel_delayed_work(&port->gid_change_event_task); + cancel_delayed_work(&port->event_task_light); + cancel_delayed_work(&port->event_task); + cancel_delayed_work(&port->discover_restart_task); + flush_workqueue(port_wq); + flush_workqueue(fip_wq); +#endif + /* remove sysfs entries related to FIP + * we want to do this outside the lock + */ + port_fs_exit(port); + + /* cleanup any pending vnics */ + vnic_dbg_mark(); + mutex_lock(&port->start_stop_lock); + list_del(&port->list); + if (no_bxm) + vnic_port_data_cleanup(port); + else { + vnic_port_fip_cleanup(port, 0); + } + mutex_unlock(&port->start_stop_lock); + vnic_port_cleanup(port); + kfree(port); + } + mutex_unlock(&ib_dev->mlock); + + kfree(ib_dev); +} + +int vnic_ports_init(void) +{ + int rc; + + /* create global wq */ + port_wq = create_singlethread_workqueue("port_wq"); + if (!port_wq) { + vnic_err(NULL, "create_singlethread_workqueue failed for %s\n", + "port_wq"); + return -EINVAL; + } + + login_wq = create_singlethread_workqueue("login_wq"); + if (!login_wq) { + vnic_err(NULL, "create_singlethread_workqueue failed for %s\n", + "login_wq"); + goto free_wq0; + } + + fip_wq = create_singlethread_workqueue("fip"); + if (!fip_wq) { + vnic_err(NULL, "create_singlethread_workqueue failed for %s\n", + "fip"); + goto free_wq1; + } + + + /* calls vnic_ib_dev_add_one() */ + rc = ib_register_client(&vnic_init_client); + if (rc) { + vnic_err(NULL, "ib_register_client failed %d\n", rc); + goto free_wq2; + } + + return 0; + +free_wq2: + 
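+	/* error unwind: destroy the workqueues in reverse order of creation */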
destroy_workqueue(fip_wq); +free_wq1: + destroy_workqueue(login_wq); +free_wq0: + destroy_workqueue(port_wq); + + return -EINVAL; +} + +void vnic_ports_cleanup(void) +{ + vnic_dbg(NULL, "calling ib_unregister_client\n"); + /* calls vnic_ib_dev_remove_one() */ + ib_unregister_client(&vnic_init_client); + vnic_dbg(NULL, "calling destroy_workqueue\n"); + destroy_workqueue(fip_wq); + destroy_workqueue(login_wq); + destroy_workqueue(port_wq); + vnic_dbg(NULL, "vnic_data_cleanup done\n"); +} diff --git a/drivers/net/mlx4_vnic/vnic_qp.c b/drivers/net/mlx4_vnic/vnic_qp.c new file mode 100644 index 0000000000000..e63cca28ce4c3 --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_qp.c @@ -0,0 +1,1496 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include +#include +#include + +#include +#include + +#include "vnic.h" + +/* compare with drivers/infiniband/hw/mlx4/qp.c */ +#ifdef mlx4_ib_dbg +#undef mlx4_ib_dbg +#define mlx4_ib_dbg(format, arg...) vnic_dbg(NULL, format, ## arg) +#endif + +enum { + MLX4_IB_ACK_REQ_FREQ = 8, +}; + +enum { + MLX4_IB_DEFAULT_SCHED_QUEUE = 0x83, + MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f, + MLX4_IB_LINK_TYPE_IB = 0, + MLX4_IB_LINK_TYPE_ETH = 1, +}; + +enum { + /* + * Largest possible UD header: send with GRH and immediate data. 
+ * 4 bytes added to accommodate for eth header instead of lrh + */ + MLX4_IB_UD_HEADER_SIZE = 76, + MLX4_IB_MAX_RAW_ETY_HDR_SIZE = 12 +}; + +enum { + MLX4_IBOE_ETHERTYPE = 0x8915 +}; + +struct mlx4_ib_sqp { + struct mlx4_ib_qp qp; + int pkey_index; + u32 qkey; + u32 send_psn; + struct ib_ud_header ud_header; + u8 header_buf[MLX4_IB_UD_HEADER_SIZE]; +}; + +enum { + MLX4_IB_MIN_SQ_STRIDE = 6 +}; + +static const __be32 mlx4_ib_opcode[] = { + [IB_WR_SEND] = cpu_to_be32(MLX4_OPCODE_SEND), + [IB_WR_LSO] = cpu_to_be32(MLX4_OPCODE_LSO), + [IB_WR_SEND_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_SEND_IMM), + [IB_WR_RDMA_WRITE] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE), + [IB_WR_RDMA_WRITE_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM), + [IB_WR_RDMA_READ] = cpu_to_be32(MLX4_OPCODE_RDMA_READ), + [IB_WR_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_ATOMIC_CS), + [IB_WR_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_ATOMIC_FA), + [IB_WR_SEND_WITH_INV] = cpu_to_be32(MLX4_OPCODE_SEND_INVAL), + [IB_WR_LOCAL_INV] = cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL), + [IB_WR_FAST_REG_MR] = cpu_to_be32(MLX4_OPCODE_FMR), + [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS), + [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA), +}; + +#ifndef wc_wmb + #if defined(__i386__) + #define wc_wmb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory") + #elif defined(__x86_64__) + #define wc_wmb() asm volatile("sfence" ::: "memory") + #elif defined(__ia64__) + #define wc_wmb() asm volatile("fwb" ::: "memory") + #else + #define wc_wmb() wmb() + #endif +#endif + +#if 0 +static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp) +{ + return container_of(mqp, struct mlx4_ib_sqp, qp); +} +#endif + +static void *get_wqe(struct mlx4_ib_qp *qp, int offset) +{ + return mlx4_buf_offset(&qp->buf, offset); +} + +static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift)); +} + +static void *get_send_wqe(struct mlx4_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift)); +} + +/* + * Stamp a SQ WQE so that it is invalid if prefetched by marking the + * first four bytes of every 64 byte chunk with + * 0x7FFFFFF | (invalid_ownership_value << 31). + * + * When the max work request size is less than or equal to the WQE + * basic block size, as an optimization, we can stamp all WQEs with + * 0xffffffff, and skip the very first chunk of each WQE. + */ +static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size) +{ + __be32 *wqe; + int i; + int s; + int ind; + void *buf; + __be32 stamp; + struct mlx4_wqe_ctrl_seg *ctrl; + + if (qp->sq_max_wqes_per_wr > 1) { + s = roundup(size, 1U << qp->sq.wqe_shift); + for (i = 0; i < s; i += 64) { + ind = (i >> qp->sq.wqe_shift) + n; + stamp = ind & qp->sq.wqe_cnt ? 
cpu_to_be32(0x7fffffff) : + cpu_to_be32(0xffffffff); + buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); + wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1)); + *wqe = stamp; + } + } else { + ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); + s = (ctrl->fence_size & 0x3f) << 4; + for (i = 64; i < s; i += 64) { + wqe = buf + i; + *wqe = cpu_to_be32(0xffffffff); + } + } +} + +static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size) +{ + struct mlx4_wqe_ctrl_seg *ctrl; + struct mlx4_wqe_inline_seg *inl; + void *wqe; + int s; + + ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); + s = sizeof(struct mlx4_wqe_ctrl_seg); + + if (qp->ibqp.qp_type == IB_QPT_UD) { + struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl; + struct mlx4_av *av = (struct mlx4_av *)dgram->av; + memset(dgram, 0, sizeof *dgram); + av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn); + s += sizeof(struct mlx4_wqe_datagram_seg); + } + + /* Pad the remainder of the WQE with an inline data segment. */ + if (size > s) { + inl = wqe + s; + inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl)); + } + ctrl->srcrb_flags = 0; + ctrl->fence_size = size / 16; + /* + * Make sure descriptor is fully written before setting ownership bit + * (because HW can start executing as soon as we do). + */ + wmb(); + + ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) | + (n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0); + + stamp_send_wqe(qp, n + qp->sq_spare_wqes, size); +} + +/* Post NOP WQE to prevent wrap-around in the middle of WR */ +static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind) +{ + unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1)); + if (unlikely(s < qp->sq_max_wqes_per_wr)) { + post_nop_wqe(qp, ind, s << qp->sq.wqe_shift); + ind += s; + } + return ind; +} + +static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) +{ + struct ib_event event; + struct mlx4_ib_qp *mqp = to_mibqp(qp); + struct ib_qp *ibqp = &mqp->ibqp; + + if (type == MLX4_EVENT_TYPE_PATH_MIG) + to_mibqp(qp)->port = to_mibqp(qp)->alt_port; + + if (ibqp->event_handler) { + event.device = ibqp->device; + switch (type) { + case MLX4_EVENT_TYPE_PATH_MIG: + event.event = IB_EVENT_PATH_MIG; + break; + case MLX4_EVENT_TYPE_COMM_EST: + event.event = IB_EVENT_COMM_EST; + break; + case MLX4_EVENT_TYPE_SQ_DRAINED: + event.event = IB_EVENT_SQ_DRAINED; + break; + case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE: + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + case MLX4_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_QP_FATAL; + break; + case MLX4_EVENT_TYPE_PATH_MIG_FAILED: + event.event = IB_EVENT_PATH_MIG_ERR; + break; + case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + event.event = IB_EVENT_QP_REQ_ERR; + break; + case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR: + event.event = IB_EVENT_QP_ACCESS_ERR; + break; + default: + printk(KERN_WARNING "mlx4_ib: Unexpected event type %d " + "on QP %06x\n", type, qp->qpn); + return; + } + + event.element.qp = ibqp; + ibqp->event_handler(&event, ibqp->qp_context); + } +} + +static int send_wqe_overhead(enum ib_qp_type type, u32 flags) +{ + /* + * UD WQEs must have a datagram segment. + * RC and UC WQEs might have a remote address segment. + * MLX WQEs need two extra inline data segments (for the UD + * header and space for the ICRC). + */ + switch (type) { + case IB_QPT_UD: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg) + + ((flags & MLX4_IB_QP_LSO) ? 
128 : 0); + case IB_QPT_UC: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_raddr_seg); + case IB_QPT_XRC: + case IB_QPT_RC: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_atomic_seg) + + sizeof (struct mlx4_wqe_raddr_seg); + case IB_QPT_SMI: + case IB_QPT_GSI: + return sizeof (struct mlx4_wqe_ctrl_seg) + + ALIGN(MLX4_IB_UD_HEADER_SIZE + + DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE, + MLX4_INLINE_ALIGN) * + sizeof (struct mlx4_wqe_inline_seg), + sizeof (struct mlx4_wqe_data_seg)) + + ALIGN(4 + + sizeof (struct mlx4_wqe_inline_seg), + sizeof (struct mlx4_wqe_data_seg)); + case IB_QPT_RAW_ETY: + return sizeof(struct mlx4_wqe_ctrl_seg) + + ALIGN(MLX4_IB_MAX_RAW_ETY_HDR_SIZE + + sizeof(struct mlx4_wqe_inline_seg), + sizeof(struct mlx4_wqe_data_seg)); + + default: + return sizeof (struct mlx4_wqe_ctrl_seg); + } +} + +static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, + int is_user, int has_srq_or_is_xrc, struct mlx4_ib_qp *qp) +{ + /* Sanity check RQ size before proceeding */ + if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE || + cap->max_recv_sge > + min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)) { + mlx4_ib_dbg("Requested RQ size (sge or wr) too large"); + return -EINVAL; + } + + if (has_srq_or_is_xrc) { + /* QPs attached to an SRQ should have no RQ */ + if (cap->max_recv_wr) { + mlx4_ib_dbg("non-zero RQ size for QP using SRQ"); + return -EINVAL; + } + + qp->rq.wqe_cnt = qp->rq.max_gs = 0; + } else { + /* HW requires >= 1 RQ entry with >= 1 gather entry */ + if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) { + mlx4_ib_dbg("user QP RQ has 0 wr's or 0 sge's " + "(wr: 0x%x, sge: 0x%x)", cap->max_recv_wr, + cap->max_recv_sge); + return -EINVAL; + } + + qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, cap->max_recv_wr)); + qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge)); + qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg)); + } + + /* leave userspace return values as they were, so as not to break ABI */ + if (is_user) { + cap->max_recv_wr = qp->rq.max_post = qp->rq.wqe_cnt; + cap->max_recv_sge = qp->rq.max_gs; + } else { + cap->max_recv_wr = qp->rq.max_post = + min_t(int, dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt); + cap->max_recv_sge = min(qp->rq.max_gs, + min_t(int, dev->dev->caps.max_sq_sg, + dev->dev->caps.max_rq_sg)); + } + /* We don't support inline sends for kernel QPs (yet) */ + + + return 0; +} + +static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, + enum ib_qp_type type, struct mlx4_ib_qp *qp) +{ + int s; + + /* Sanity check SQ size before proceeding */ + if (cap->max_send_wr > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) || + cap->max_send_sge > + min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) || + cap->max_inline_data + send_wqe_overhead(type, qp->flags) + + sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz) { + mlx4_ib_dbg("Requested SQ resources exceed device maxima"); + return -EINVAL; + } + + /* + * For MLX transport we need 2 extra S/G entries: + * one for the header and one for the checksum at the end + */ + if ((type == IB_QPT_SMI || type == IB_QPT_GSI) && + cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg) { + mlx4_ib_dbg("No space for SQP hdr/csum sge's"); + return -EINVAL; + } + + if (type == IB_QPT_RAW_ETY && + cap->max_send_sge + 1 > dev->dev->caps.max_sq_sg) { + mlx4_ib_dbg("No space for RAW ETY hdr"); + return -EINVAL; + } + + s = max(cap->max_send_sge * sizeof 
(struct mlx4_wqe_data_seg), + cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) + + send_wqe_overhead(type, qp->flags); + + if (s > dev->dev->caps.max_sq_desc_sz) + return -EINVAL; + + /* + * Hermon supports shrinking WQEs, such that a single work + * request can include multiple units of 1 << wqe_shift. This + * way, work requests can differ in size, and do not have to + * be a power of 2 in size, saving memory and speeding up send + * WR posting. Unfortunately, if we do this then the + * wqe_index field in CQEs can't be used to look up the WR ID + * anymore, so we do this only if selective signaling is off. + * + * Further, on 32-bit platforms, we can't use vmap() to make + * the QP buffer virtually contigious. Thus we have to use + * constant-sized WRs to make sure a WR is always fully within + * a single page-sized chunk. + * + * Finally, we use NOP work requests to pad the end of the + * work queue, to avoid wrap-around in the middle of WR. We + * set NEC bit to avoid getting completions with error for + * these NOP WRs, but since NEC is only supported starting + * with firmware 2.2.232, we use constant-sized WRs for older + * firmware. + * + * And, since MLX QPs only support SEND, we use constant-sized + * WRs in this case. + * + * We look for the smallest value of wqe_shift such that the + * resulting number of wqes does not exceed device + * capabilities. + * + * We set WQE size to at least 64 bytes, this way stamping + * invalidates each WQE. + */ + if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC && + qp->sq_signal_bits && BITS_PER_LONG == 64 && + type != IB_QPT_SMI && type != IB_QPT_GSI && type != IB_QPT_RAW_ETY) + qp->sq.wqe_shift = ilog2(64); + else + qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s)); + + for (;;) { + qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift); + + /* + * We need to leave 2 KB + 1 WR of headroom in the SQ to + * allow HW to prefetch. 
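+		 *
+		 * Worked example (illustrative numbers only): with a 64-byte
+		 * WQE stride (wqe_shift == 6) and sq_max_wqes_per_wr == 1,
+		 * this reserves (2048 >> 6) + 1 = 33 spare basic blocks.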
+ */ + qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr; + qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr * + qp->sq_max_wqes_per_wr + + qp->sq_spare_wqes); + + if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes) + break; + + if (qp->sq_max_wqes_per_wr <= 1) + return -EINVAL; + + ++qp->sq.wqe_shift; + } + + qp->sq.max_gs = (min_t(int, dev->dev->caps.max_sq_desc_sz, + (qp->sq_max_wqes_per_wr << qp->sq.wqe_shift)) - + send_wqe_overhead(type, qp->flags)) / + sizeof (struct mlx4_wqe_data_seg); + + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + if (qp->rq.wqe_shift > qp->sq.wqe_shift) { + qp->rq.offset = 0; + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + } else { + qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift; + qp->sq.offset = 0; + } + + cap->max_send_wr = qp->sq.max_post = + (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr; + cap->max_send_sge = min_t(int, qp->sq.max_gs, + min(dev->dev->caps.max_sq_sg, + dev->dev->caps.max_rq_sg)); + qp->max_inline_data = cap->max_inline_data; + + return 0; +} + +static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp) +{ + int qpn; + int err; + + mutex_init(&qp->mutex); + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + INIT_LIST_HEAD(&qp->gid_list); + + qp->state = IB_QPS_RESET; + qp->mlx4_ib_qp_type = init_attr->qp_type; + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + + err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, + !!init_attr->srq || !!init_attr->xrc_domain , qp); + if (err) + goto err; + + if (pd->uobject) { + } else { + qp->sq_no_prefetch = 0; + + if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) + qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; + + if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) + qp->flags |= MLX4_IB_QP_LSO; + + err = set_kernel_sq_size(dev, &init_attr->cap, init_attr->qp_type, qp); + if (err) + goto err; + + if (!init_attr->srq && init_attr->qp_type != IB_QPT_XRC) { + err = mlx4_db_alloc(dev->dev, &qp->db, 0); + if (err) + goto err; + + *qp->db.db = 0; + } + + if (qp->max_inline_data) { + err = mlx4_bf_alloc(dev->dev, &qp->bf); + if (err) { + mlx4_ib_dbg("failed to allocate blue flame register (%d)", err); + qp->bf.uar = &dev->priv_uar; + } + } else + qp->bf.uar = &dev->priv_uar; + + if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf)) { + err = -ENOMEM; + goto err_db; + } + + err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift, + &qp->mtt, MLX4_MR_FLAG_NONE); + if (err) { + mlx4_ib_dbg("kernel qp mlx4_mtt_init error (%d)", err); + goto err_buf; + } + + err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf); + if (err) { + mlx4_ib_dbg("mlx4_buf_write_mtt error (%d)", err); + goto err_mtt; + } + + /* these are big chunks that may fail, added __GFP_NOWARN */ + qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), + GFP_KERNEL | __GFP_NOWARN); + qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), + GFP_KERNEL | __GFP_NOWARN); + + if (!qp->sq.wrid || !qp->rq.wrid) { + printk(KERN_WARNING "%s:%d: not enough memory\n", + __func__, __LINE__); + err = -ENOMEM; + goto err_wrid; + } + } + + qpn = sqpn; + + err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp); + if (err) + goto err_qpn; + + if (init_attr->qp_type == IB_QPT_XRC) + qp->mqp.qpn |= (1 << 23); + + /* + * Hardware wants QPN written in 
big-endian order (after + * shifting) for send doorbell. Precompute this value to save + * a little bit when posting sends. + */ + qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); + + qp->mqp.event = mlx4_ib_qp_event; + + return 0; + +err_qpn: +err_wrid: + if (pd->uobject) { + } else { + kfree(qp->sq.wrid); + kfree(qp->rq.wrid); + } + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &qp->mtt, MLX4_MR_FLAG_NONE); + +err_buf: + if (pd->uobject) + ib_umem_release(qp->umem); + else + mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); + +err_db: + if (!pd->uobject && !init_attr->srq && init_attr->qp_type != IB_QPT_XRC) + mlx4_db_free(dev->dev, &qp->db); + + if (qp->max_inline_data) + mlx4_bf_free(dev->dev, &qp->bf); + +err: + return err; +} + +#if 0 +static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, + void *wqe, unsigned *mlx_seg_len) +{ + struct ib_device *ib_dev = &to_mdev(sqp->qp.ibqp.device)->ib_dev; + struct mlx4_wqe_mlx_seg *mlx = wqe; + struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; + struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); + u16 pkey; + int send_size; + int header_size; + int spc; + int i; + union ib_gid sgid; + int is_eth; + int is_grh; + int is_vlan = 0; + int err; + u16 vlan; + + send_size = 0; + for (i = 0; i < wr->num_sge; ++i) + send_size += wr->sg_list[i].length; + + is_eth = rdma_port_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET; + is_grh = mlx4_ib_ah_grh_present(ah); + err = ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &sgid); + if (err) + return err; + + if (is_eth) { + is_vlan = rdma_get_vlan_id(&sgid) < 0x1000; + vlan = rdma_get_vlan_id(&sgid); + } + + ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header); + if (!is_eth) { + sqp->ud_header.lrh.service_level = + be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; + sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid; + sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f); + } + + if (is_grh) { + sqp->ud_header.grh.traffic_class = + (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff; + sqp->ud_header.grh.flow_label = + ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff); + sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit; + ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &sqp->ud_header.grh.source_gid); + memcpy(sqp->ud_header.grh.destination_gid.raw, + ah->av.ib.dgid, 16); + } + + mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + + if (!is_eth) { + mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) | + (sqp->ud_header.lrh.destination_lid == + IB_LID_PERMISSIVE ? 
MLX4_WQE_MLX_SLR : 0) | + (sqp->ud_header.lrh.service_level << 8)); + mlx->rlid = sqp->ud_header.lrh.destination_lid; + } + + switch (wr->opcode) { + case IB_WR_SEND: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; + sqp->ud_header.immediate_present = 0; + break; + case IB_WR_SEND_WITH_IMM: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + sqp->ud_header.immediate_present = 1; + sqp->ud_header.immediate_data = wr->ex.imm_data; + break; + default: + return -EINVAL; + } + + if (is_eth) { + u8 *smac; + + memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6); + smac = to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]->dev_addr; /* fixme: cache this value */ + memcpy(sqp->ud_header.eth.smac_h, smac, 6); + if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) + mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); + if (!is_vlan) + sqp->ud_header.eth.type = cpu_to_be16(MLX4_IBOE_ETHERTYPE); + else { + u16 pcp; + + sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IBOE_ETHERTYPE); + pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 27 & 3) << 13; + sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp); + } + } else { + sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0; + if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE) + sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE; + } + sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED); + if (!sqp->qp.ibqp.qp_num) + ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey); + else + ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey); + sqp->ud_header.bth.pkey = cpu_to_be16(pkey); + sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); + sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); + sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ? + sqp->qkey : wr->wr.ud.remote_qkey); + sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num); + + header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf); + + if (0) { + printk(KERN_ERR "built UD header of size %d:\n", header_size); + for (i = 0; i < header_size / 4; ++i) { + if (i % 8 == 0) + printk(" [%02x] ", i * 4); + printk(" %08x", + be32_to_cpu(((__be32 *) sqp->header_buf)[i])); + if ((i + 1) % 8 == 0) + printk("\n"); + } + printk("\n"); + } + + /* + * Inline data segments may not cross a 64 byte boundary. If + * our UD header is bigger than the space available up to the + * next 64 byte boundary in the WQE, use two inline data + * segments to hold the UD header. + */ + spc = MLX4_INLINE_ALIGN - + ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); + if (header_size <= spc) { + inl->byte_count = cpu_to_be32(1 << 31 | header_size); + memcpy(inl + 1, sqp->header_buf, header_size); + i = 1; + } else { + inl->byte_count = cpu_to_be32(1 << 31 | spc); + memcpy(inl + 1, sqp->header_buf, spc); + + inl = (void *) (inl + 1) + spc; + memcpy(inl + 1, sqp->header_buf + spc, header_size - spc); + /* + * Need a barrier here to make sure all the data is + * visible before the byte_count field is set. + * Otherwise the HCA prefetcher could grab the 64-byte + * chunk with this inline segment and get a valid (!= + * 0xffffffff) byte count but stale data, and end up + * generating a packet with bad headers. + * + * The first inline segment's byte_count field doesn't + * need a barrier, because it comes after a + * control/MLX segment and therefore is at an offset + * of 16 mod 64. 
+ */ + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc)); + i = 2; + } + + *mlx_seg_len = + ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16); + return 0; +} +#endif + +static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq) +{ + unsigned cur; + struct mlx4_ib_cq *cq; + + cur = wq->head - wq->tail; + if (likely(cur + nreq < wq->max_post)) + return 0; + + cq = to_mcq(ib_cq); + spin_lock(&cq->lock); + cur = wq->head - wq->tail; + spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +#if 0 +static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey) +{ + iseg->flags = 0; + iseg->mem_key = cpu_to_be32(rkey); + iseg->guest_id = 0; + iseg->pa = 0; +} +#endif + +static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg, + u64 remote_addr, u32 rkey) +{ + rseg->raddr = cpu_to_be64(remote_addr); + rseg->rkey = cpu_to_be32(rkey); + rseg->reserved = 0; +} + +#if 0 +static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ib_send_wr *wr) +{ + if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap); + aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add); + } else if (wr->opcode == IB_WR_MASKED_ATOMIC_FETCH_AND_ADD) { + aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add); + aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add_mask); + } else { + aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add); + aseg->compare = 0; + } + +} +#endif + +static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, + struct ib_send_wr *wr, __be16 *vlan) +{ + memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av)); + dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn); + dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey); + dseg->vlan = to_mah(wr->wr.ud.ah)->av.eth.vlan; + memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->av.eth.mac, 6); + *vlan = dseg->vlan; +} + +#if 0 +static void set_mlx_icrc_seg(void *dseg) +{ + u32 *t = dseg; + struct mlx4_wqe_inline_seg *iseg = dseg; + + t[1] = 0; + + /* + * Need a barrier here before writing the byte_count field to + * make sure that all the data is visible before the + * byte_count field is set. Otherwise, if the segment begins + * a new cacheline, the HCA prefetcher could grab the 64-byte + * chunk and get a valid (!= * 0xffffffff) byte count but + * stale data, and end up sending the wrong data. + */ + wmb(); + + iseg->byte_count = cpu_to_be32((1 << 31) | 4); +} +#endif + +static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); + + /* + * Need a barrier here before writing the byte_count field to + * make sure that all the data is visible before the + * byte_count field is set. Otherwise, if the segment begins + * a new cacheline, the HCA prefetcher could grab the 64-byte + * chunk and get a valid (!= * 0xffffffff) byte count but + * stale data, and end up sending the wrong data. + */ + wmb(); + + dseg->byte_count = cpu_to_be32(sg->length); +} + +static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->byte_count = cpu_to_be32(sg->length); + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); +} + +static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr, + struct mlx4_ib_qp *qp, unsigned *lso_seg_len, + __be32 *lso_hdr_sz, int *blh) +{ + unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16); + + *blh = unlikely(halign > 64) ? 
1 : 0; + + if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) && + wr->num_sge > qp->sq.max_gs - (halign >> 4))) + return -EINVAL; + + memcpy(wqe->header, wr->wr.ud.header, wr->wr.ud.hlen); + + *lso_hdr_sz = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 | + wr->wr.ud.hlen); + *lso_seg_len = halign; + return 0; +} + +static __be32 send_ieth(struct ib_send_wr *wr) +{ + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + case IB_WR_RDMA_WRITE_WITH_IMM: + return wr->ex.imm_data; + + case IB_WR_SEND_WITH_INV: + return cpu_to_be32(wr->ex.invalidate_rkey); + + default: + return 0; + } +} + +int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx4_ib_qp *qp = to_mqp(ibqp); + struct mlx4_wqe_data_seg *scat; + unsigned long flags; + int err = 0; + int nreq; + int ind; + int max_gs; + int i; + + max_gs = qp->rq.max_gs; + spin_lock_irqsave(&qp->rq.lock, flags); + + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + mlx4_ib_dbg("QP 0x%x: WQE overflow", ibqp->qp_num); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + mlx4_ib_dbg("QP 0x%x: too many sg entries (%d)", + ibqp->qp_num, wr->num_sge); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + scat = get_recv_wqe(qp, ind); + + for (i = 0; i < wr->num_sge; ++i) + __set_data_seg(scat + i, wr->sg_list + i); + + if (i < max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY); + scat[i].addr = 0; + } + + qp->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + qp->rq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff); + } + + spin_unlock_irqrestore(&qp->rq.lock, flags); + + return err; +} + +static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state) +{ + switch (mlx4_state) { + case MLX4_QP_STATE_RST: return IB_QPS_RESET; + case MLX4_QP_STATE_INIT: return IB_QPS_INIT; + case MLX4_QP_STATE_RTR: return IB_QPS_RTR; + case MLX4_QP_STATE_RTS: return IB_QPS_RTS; + case MLX4_QP_STATE_SQ_DRAINING: + case MLX4_QP_STATE_SQD: return IB_QPS_SQD; + case MLX4_QP_STATE_SQER: return IB_QPS_SQE; + case MLX4_QP_STATE_ERR: return IB_QPS_ERR; + default: return -1; + } +} + +static inline enum ib_mig_state to_ib_mig_state(int mlx4_mig_state) +{ + switch (mlx4_mig_state) { + case MLX4_QP_PM_ARMED: return IB_MIG_ARMED; + case MLX4_QP_PM_REARM: return IB_MIG_REARM; + case MLX4_QP_PM_MIGRATED: return IB_MIG_MIGRATED; + default: return -1; + } +} + +static int to_ib_qp_access_flags(int mlx4_flags) +{ + int ib_flags = 0; + + if (mlx4_flags & MLX4_QP_BIT_RRE) + ib_flags |= IB_ACCESS_REMOTE_READ; + if (mlx4_flags & MLX4_QP_BIT_RWE) + ib_flags |= IB_ACCESS_REMOTE_WRITE; + if (mlx4_flags & MLX4_QP_BIT_RAE) + ib_flags |= IB_ACCESS_REMOTE_ATOMIC; + + return ib_flags; +} + +static void to_ib_ah_attr(struct mlx4_ib_dev *ib_dev, struct ib_ah_attr *ib_ah_attr, + struct mlx4_qp_path *path) +{ + struct mlx4_dev *dev = ib_dev->dev; + int is_eth; + + memset(ib_ah_attr, 0, sizeof *ib_ah_attr); + ib_ah_attr->port_num = path->sched_queue & 0x40 ? 2 : 1; + + if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports) + return; + + is_eth = rdma_port_link_layer(&ib_dev->ib_dev, ib_ah_attr->port_num) == + IB_LINK_LAYER_ETHERNET ? 
1 : 0; + if (is_eth) + ib_ah_attr->sl = ((path->sched_queue >> 3) & 0x7) | + ((path->sched_queue & 4) << 1); + else + ib_ah_attr->sl = (path->sched_queue >> 2) & 0xf; + + ib_ah_attr->dlid = be16_to_cpu(path->rlid); + + ib_ah_attr->src_path_bits = path->grh_mylmc & 0x7f; + ib_ah_attr->static_rate = path->static_rate ? path->static_rate - 5 : 0; + ib_ah_attr->ah_flags = (path->grh_mylmc & (1 << 7)) ? IB_AH_GRH : 0; + if (ib_ah_attr->ah_flags) { + ib_ah_attr->grh.sgid_index = path->mgid_index; + ib_ah_attr->grh.hop_limit = path->hop_limit; + ib_ah_attr->grh.traffic_class = + (be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff; + ib_ah_attr->grh.flow_label = + be32_to_cpu(path->tclass_flowlabel) & 0xfffff; + memcpy(ib_ah_attr->grh.dgid.raw, + path->rgid, sizeof ib_ah_attr->grh.dgid.raw); + } +} + +int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx4_ib_dev *dev = to_mdev(ibqp->device); + struct mlx4_ib_qp *qp = to_mqp(ibqp); + struct mlx4_qp_context context; + int mlx4_state; + int err = 0; + + mutex_lock(&qp->mutex); + + if (qp->state == IB_QPS_RESET) { + qp_attr->qp_state = IB_QPS_RESET; + goto done; + } + + err = mlx4_qp_query(dev->dev, &qp->mqp, &context); + if (err) { + err = -EINVAL; + goto out; + } + + mlx4_state = be32_to_cpu(context.flags) >> 28; + + qp->state = to_ib_qp_state(mlx4_state); + qp_attr->qp_state = qp->state; + qp_attr->path_mtu = context.mtu_msgmax >> 5; + qp_attr->path_mig_state = + to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3); + qp_attr->qkey = be32_to_cpu(context.qkey); + qp_attr->rq_psn = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff; + qp_attr->sq_psn = be32_to_cpu(context.next_send_psn) & 0xffffff; + qp_attr->dest_qp_num = be32_to_cpu(context.remote_qpn) & 0xffffff; + qp_attr->qp_access_flags = + to_ib_qp_access_flags(be32_to_cpu(context.params2)); + + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC || + qp->ibqp.qp_type == IB_QPT_XRC) { + to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path); + to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path); + qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f; + qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; + } + + qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f; + if (qp_attr->qp_state == IB_QPS_INIT) + qp_attr->port_num = qp->port; + else + qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 
2 : 1; + + /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ + qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING; + + qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7); + + qp_attr->max_dest_rd_atomic = + 1 << ((be32_to_cpu(context.params2) >> 21) & 0x7); + qp_attr->min_rnr_timer = + (be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f; + qp_attr->timeout = context.pri_path.ackto >> 3; + qp_attr->retry_cnt = (be32_to_cpu(context.params1) >> 16) & 0x7; + qp_attr->rnr_retry = (be32_to_cpu(context.params1) >> 13) & 0x7; + qp_attr->alt_timeout = context.alt_path.ackto >> 3; + +done: + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt; + qp_attr->cap.max_recv_sge = qp->rq.max_gs; + + if (!ibqp->uobject) { + qp_attr->cap.max_send_wr = qp->sq.wqe_cnt; + qp_attr->cap.max_send_sge = qp->sq.max_gs; + } else { + qp_attr->cap.max_send_wr = 0; + qp_attr->cap.max_send_sge = 0; + } + + /* + * We don't support inline sends for kernel QPs (yet), and we + * don't know what userspace's value should be. + */ + qp_attr->cap.max_inline_data = 0; + + qp_init_attr->cap = qp_attr->cap; + + qp_init_attr->create_flags = 0; + if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) + qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + + if (qp->flags & MLX4_IB_QP_LSO) + qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO; + +out: + mutex_unlock(&qp->mutex); + return err; +} + + +int mlx4_ib_create_xrc_rcv_qp(struct ib_qp_init_attr *init_attr, + u32 *qp_num) +{ + return -ENOSYS; +} + +int mlx4_ib_modify_xrc_rcv_qp(struct ib_xrcd *ibxrcd, u32 qp_num, + struct ib_qp_attr *attr, int attr_mask) +{ + return -ENOSYS; +} + +int mlx4_ib_query_xrc_rcv_qp(struct ib_xrcd *ibxrcd, u32 qp_num, + struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + return -ENOSYS; +} + +int mlx4_ib_reg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num) +{ + return -ENOSYS; +} + +int mlx4_ib_unreg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num) +{ + return -ENOSYS; +} + +/**** VNIC IB VERBS ****/ +int vnic_ib_post_send(struct ib_qp *ibqp, + struct ib_send_wr *wr, + struct ib_send_wr **bad_wr, + u8 ip_off, u8 ip6_off, + u8 tcp_off, u8 udp_off) +{ + struct mlx4_ib_qp *qp = to_mqp(ibqp); + void *wqe; + struct mlx4_wqe_ctrl_seg *ctrl; + struct mlx4_wqe_data_seg *dseg; + __be32 owner_opcode = 0; + int nreq; + int err = 0; + unsigned ind; + int uninitialized_var(stamp); + int uninitialized_var(size); + unsigned uninitialized_var(seglen); + __be32 dummy; + __be32 *lso_wqe; + __be32 uninitialized_var(lso_hdr_sz); + int i; + int blh = 0; + __be16 vlan = 0; + + ind = qp->sq_next_wqe; + + nreq = 0; + lso_wqe = &dummy; + + if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { + mlx4_ib_dbg("QP 0x%x: WQE overflow", ibqp->qp_num); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->sq.max_gs)) { + mlx4_ib_dbg("QP 0x%x: too many sg entries (%d)", + ibqp->qp_num, wr->num_sge); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); + *((u32 *) (&ctrl->vlan_tag)) = 0; + qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id; + + ctrl->srcrb_flags = + (wr->send_flags & IB_SEND_SIGNALED ? + cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) | + (wr->send_flags & IB_SEND_SOLICITED ? 
+ cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) | + qp->sq_signal_bits; + + ctrl->imm = send_ieth(wr); + + wqe += sizeof *ctrl; + size = sizeof *ctrl / 16; + + set_datagram_seg(wqe, wr, &vlan); + wqe += sizeof (struct mlx4_wqe_datagram_seg); + size += sizeof (struct mlx4_wqe_datagram_seg) / 16; + + if (wr->opcode == IB_WR_LSO) { + err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz, &blh); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } + lso_wqe = (__be32 *) wqe; + wqe += seglen; + size += seglen / 16; + } + dseg = wqe; + dseg += wr->num_sge - 1; + + size += wr->num_sge * (sizeof(struct mlx4_wqe_data_seg) / 16); + for (i = wr->num_sge - 1; i >= 0; --i, --dseg) + set_data_seg(dseg, wr->sg_list + i); + + wmb(); + *lso_wqe = lso_hdr_sz; + + ctrl->fence_size = size; + + /* set SWP bits based on ip/ip6/tcp/udp offsets */ + if (wr->send_flags & IB_SEND_IP_CSUM) { + /* SWP bit */ + owner_opcode |= cpu_to_be32(1 << 24); + + /* IP offset starts from the beginning of the IB packet + * (not the ETH packet), in units of 2 bytes. + * In the control segment, we use c & d: + * (a) tcp=0, ip=0 => calc TCP/UDP csum over IPv4 + * (b) tcp=0, ip=1 => calc IP csum only over IPv4 + * (c) tcp=1, ip=0 => calc TCP/UDP csum over IPv6 + * (d) tcp=1, ip=1 => calc TCP/UDP and IP csum over IPv4 + */ + if (ip_off) { + ip_off += (IB_LRH_BYTES + IB_BTH_BYTES + + IB_DETH_BYTES) >> 1; + ip_off += (to_mah(wr->wr.ud.ah)->av.ib.g_slid + & 0x80) ? (IB_GRH_BYTES >> 1) : 0; + owner_opcode |= cpu_to_be32((ip_off) << 8); + ctrl->srcrb_flags |= + cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM); + } else if (ip6_off) { + ip6_off += (IB_LRH_BYTES + IB_BTH_BYTES + + IB_DETH_BYTES) >> 1; + ip6_off += (to_mah(wr->wr.ud.ah)->av.ib.g_slid + & 0x80) ? (IB_GRH_BYTES >> 1) : 0; + owner_opcode |= cpu_to_be32((ip6_off) << 8); + } + + if (udp_off) { /* UDP offset and bit */ + owner_opcode |= cpu_to_be32(udp_off << 16); + owner_opcode |= cpu_to_be32(1 << 25); + ctrl->srcrb_flags |= + cpu_to_be32(MLX4_WQE_CTRL_TCP_UDP_CSUM); + } else if (tcp_off) { /* TCP offset */ + owner_opcode |= cpu_to_be32(tcp_off << 16); + ctrl->srcrb_flags |= + cpu_to_be32(MLX4_WQE_CTRL_TCP_UDP_CSUM); + } + } + + /* set opcode, use 0x4e for BIG_LSO */ + if (!blh) + owner_opcode |= mlx4_ib_opcode[wr->opcode]; + else + owner_opcode |= cpu_to_be32(0x4e); + + /* set ownership bit */ + owner_opcode |= (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0); + + /* Make sure descriptor is fully written */ + wmb(); + ctrl->owner_opcode = owner_opcode; + + stamp = ind + qp->sq_spare_wqes; + ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift); + + /* simulate the for loop */ + nreq++; + +out: + if (likely(nreq)) { + qp->sq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + writel(qp->doorbell_qpn, qp->bf.uar->map + MLX4_SEND_DOORBELL); + + /* + * Make sure doorbells don't leak out of SQ spinlock + * and reach the HCA out of order.
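+ * writel() alone does not guarantee this ordering on every + * architecture, which is what the mmiowb() below is for.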
+ */ + mmiowb(); + + } + + stamp_send_wqe(qp, stamp, size * 16); + + ind = pad_wraparound(qp, ind); + qp->sq_next_wqe = ind; + return err; +} + +static void destroy_qp_common_from_range(struct mlx4_ib_dev *dev, + struct mlx4_ib_qp *qp, + struct ib_qp_init_attr *init_attr) +{ + kfree(qp->sq.wrid); + kfree(qp->rq.wrid); + mlx4_mtt_cleanup(dev->dev, &qp->mtt, MLX4_MR_FLAG_NONE); + mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); + if (qp->max_inline_data) + mlx4_bf_free(dev->dev, &qp->bf); + if (!init_attr->srq) + mlx4_db_free(dev->dev, &qp->db); + mlx4_qp_free(dev->dev, &qp->mqp); +} + +int __vnic_ib_create_qp_range(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, + struct ib_udata *udata, int nqps, + int align, struct ib_qp *list[]) +{ + struct mlx4_ib_dev *dev = to_mdev(pd->device); + struct mlx4_ib_qp *qp; + int err; + int base_qpn, qpn; + int i; + + for (i = 0; i < nqps; ++i) { + if (init_attr[i].create_flags & ~(IB_QP_CREATE_IPOIB_UD_LSO | + IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)) + return -EINVAL; + if (init_attr[i].create_flags & (IB_QP_CREATE_IPOIB_UD_LSO | + IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) && + (pd->uobject || init_attr[i].qp_type != IB_QPT_UD)) + return -EINVAL; + + /* Userspace is not allowed to create special QPs: */ + if (pd->uobject && (init_attr[i].qp_type == IB_QPT_SMI || + init_attr[i].qp_type == IB_QPT_GSI)) + return -EINVAL; + + if (nqps > 1 && (init_attr[i].qp_type == IB_QPT_SMI || + init_attr[i].qp_type == IB_QPT_GSI)) + return -EINVAL; + } + + err = mlx4_qp_reserve_range(dev->dev, nqps, align, &base_qpn); + if (err) + return err; + + for (i = 0, qpn = base_qpn; i < nqps; ++i, ++qpn) { + qp = kzalloc(sizeof *qp, GFP_KERNEL); + if (!qp) { + err = -ENOMEM; + goto exit_fail; + } + + err = create_qp_common(dev, pd, init_attr + i, udata, qpn, qp); + if (err) { + kfree(qp); + err = err; + goto exit_fail; + } + + qp->ibqp.qp_num = qp->mqp.qpn; + list[i] = &qp->ibqp; + } + return 0; + +exit_fail: + for (--i; i >= 0; --i) { + destroy_qp_common_from_range(dev, to_mqp(list[i]), init_attr + i); + kfree(to_mqp(list[i])); + } + + mlx4_qp_release_range(dev->dev, base_qpn, nqps); + return err; +} + +/* compare with ib_create_qp() in infiniband/core/verbs.c */ +int vnic_ib_create_qp_range(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, + struct ib_udata *udata, int nqps, + int align, struct ib_qp *list[]) +{ + struct ib_qp *qp; + struct ib_qp_init_attr *qp_init_attr; + int rc, i; + + rc = __vnic_ib_create_qp_range(pd, init_attr, udata ,nqps, align, list); + + if (rc) + return rc; + + for (i = 0; i < nqps; ++ i) { + qp = list[i]; + qp_init_attr = &init_attr[i]; + qp->device = pd->device; + qp->pd = pd; + qp->send_cq = qp_init_attr->send_cq; + qp->recv_cq = qp_init_attr->recv_cq; + qp->srq = qp_init_attr->srq; + qp->uobject = NULL; + qp->event_handler = qp_init_attr->event_handler; + qp->qp_context = qp_init_attr->qp_context; + qp->qp_type = qp_init_attr->qp_type; + qp->xrcd = qp->qp_type == IB_QPT_XRC ? 
+ qp_init_attr->xrc_domain : NULL; + atomic_inc(&pd->usecnt); + atomic_inc(&qp_init_attr->send_cq->usecnt); + atomic_inc(&qp_init_attr->recv_cq->usecnt); + if (qp_init_attr->srq) + atomic_inc(&qp_init_attr->srq->usecnt); + if (qp->qp_type == IB_QPT_XRC) + atomic_inc(&qp->xrcd->usecnt); + } + + return 0; +} diff --git a/drivers/net/mlx4_vnic/vnic_stats_helper.c b/drivers/net/mlx4_vnic/vnic_stats_helper.c new file mode 100644 index 0000000000000..720d233ac6b50 --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_stats_helper.c @@ -0,0 +1,104 @@ +#include +#include +#include + +MODULE_AUTHOR("Eli Cohen"); +MODULE_DESCRIPTION("container for mlx4_vnic stats function"); +MODULE_LICENSE("Dual BSD/GPL"); + +DEFINE_SPINLOCK(spl); +static int busy; + +static struct net_device_stats *(*stat_func)(struct net_device *n); + +static struct module_attribute dentry; +int ref_count = 1; + +static ssize_t mlx4_vnic_reduce_ref_cnt(struct module_attribute *attr, + struct module *mod, const char *buf, size_t count) +{ + if (ref_count == 1) { + module_put(THIS_MODULE); + ref_count --; + printk("reducing ref count on module"); + } + return count; +} + +static void mlx4_vnic_create_sysfs_entry(void) +{ + dentry.show = NULL; + dentry.store = mlx4_vnic_reduce_ref_cnt; + dentry.attr.name = "enable_unload"; + dentry.attr.mode = S_IWUGO; +#ifndef _BP_NO_ATT_OWNER + dentry.attr.owner = THIS_MODULE; +#endif + if (sysfs_create_file(&(THIS_MODULE)->mkobj.kobj, &dentry.attr)) { + printk("failed to create %s\n", dentry.attr.name); + dentry.store = NULL; + } +} + + + +int mlx4_vnic_set_stats_function(struct net_device_stats *(*func)(struct net_device *n)) +{ + unsigned long flags; + + spin_lock_irqsave(&spl, flags); + if (busy) { + spin_unlock_irqrestore(&spl, flags); + return -EBUSY; + } + stat_func = func; + spin_unlock_irqrestore(&spl, flags); + + return 0; +} + +static struct net_device_stats dummy_stats = {0}; + +struct net_device_stats *mlx4_vnic_stats_func_container(struct net_device *n) +{ + unsigned long flags; + struct net_device_stats *ret_stats = &dummy_stats; + + spin_lock_irqsave(&spl, flags); + busy = 1; + spin_unlock_irqrestore(&spl, flags); + if (stat_func) + ret_stats = stat_func(n); + //else + //printk("WARNING stats requested after module unload for " + // "device %s\n", n->name); + + spin_lock_irqsave(&spl, flags); + busy = 0; + spin_unlock_irqrestore(&spl, flags); + return ret_stats; +} + +EXPORT_SYMBOL(mlx4_vnic_set_stats_function); +EXPORT_SYMBOL(mlx4_vnic_stats_func_container); + +static int __init mlx4_vnic_helper_init(void) +{ + mlx4_vnic_create_sysfs_entry(); + + if (!try_module_get(THIS_MODULE)) + return -1; + + return 0; +} + +static void __exit mlx4_vnic_helper_cleanup(void) +{ + if (dentry.store != NULL) + sysfs_remove_file(&(THIS_MODULE)->mkobj.kobj, &dentry.attr); + printk("failed to create %s\n", dentry.attr.name); +} + +module_init(mlx4_vnic_helper_init); +module_exit(mlx4_vnic_helper_cleanup); + diff --git a/drivers/net/mlx4_vnic/vnic_utils.h b/drivers/net/mlx4_vnic/vnic_utils.h new file mode 100644 index 0000000000000..56ee8cff18e12 --- /dev/null +++ b/drivers/net/mlx4_vnic/vnic_utils.h @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _VNIC_UTILS_H +#define _VNIC_UTILS_H + +/*#define CONFIG_MLX4_VNIC_DEBUG */ /* comment out in RELEASE and PERFORMANCE modes */ +/* #define VNIC_PROFILLNG */ /* comment out in RELEASE and PERFORMANCE modes */ +#define VNIC_EXTRA_STATS /* comment out in PERFORMANCE mode */ + +enum { + VNIC_DEBUG_GENERAL = 1 << 0, /* 0x1 */ + VNIC_DEBUG_MCAST = 1 << 1, /* 0x2 */ + VNIC_DEBUG_MCAST_V = 1 << 2, /* 0x4 */ + VNIC_DEBUG_DATA = 1 << 3, /* 0x8 */ + VNIC_DEBUG_DATA_V = 1 << 4, /* 0x10 */ + VNIC_DEBUG_FIP = 1 << 5, /* 0x20 */ + VNIC_DEBUG_FIP_V = 1 << 6, /* 0x40 */ + VNIC_DEBUG_SKB = 1 << 7, /* 0x80 */ + VNIC_DEBUG_SKB_V = 1 << 8, /* 0x100 */ + VNIC_DEBUG_VHUB = 1 << 9, /* 0x200 */ + VNIC_DEBUG_VHUB_V = 1 << 10, /* 0x400 */ + VNIC_DEBUG_ETHTOOL = 1 << 11, /* 0x800 */ + VNIC_DEBUG_ETHTOOL_V = 1 << 12, /* 0x1000 */ + VNIC_DEBUG_FUNC = 1 << 13, /* 0x2000 */ + VNIC_DEBUG_MARK = 1 << 14, /* 0x4000 */ + VNIC_DEBUG_MODER = 1 << 15, /* 0x8000 */ + VNIC_DEBUG_MODER_V = 1 << 16, /* 0x10000 */ + VNIC_DEBUG_PKT_DUMP = 1 << 17, /* 0x20000 */ + VNIC_DEBUG_FIP_P0 = 1 << 18, /* 0x40000 */ + VNIC_DEBUG_SYSFS = 1 << 19, /* 0x80000 */ + VNIC_DEBUG_MAC = 1 << 20, /* 0x100000 */ + VNIC_DEBUG_TSTAMP = 1 << 21, /* 0x200000 */ + VNIC_DEBUG_PARSER = 1 << 22, /* 0x400000 */ + VNIC_DEBUG_LAG = 1 << 23, /* 0x800000 */ + VNIC_DEBUG_LAG_V = 1 << 24, /* 0x1000000 */ + VNIC_DEBUG_MCAST_VV = 1 << 25, /* 0x2000000 */ + VNIC_DEBUG_DEBUG = 1 << 31, /* 0x80000000 */ +}; + +/* always defined */ +#define vnic_printk(level, prefix, format, arg...) \ + do { printk(level "T%.4ld [%s] %s:%s:%d: " format, \ + jiffies * 1000 / HZ, \ + DRV_NAME, prefix ? prefix : "", __func__, __LINE__ , \ + ## arg); \ +} while(0) + +#define vnic_info(format, arg...) \ +do { printk(KERN_INFO "[%s] " format, DRV_NAME, ## arg); } \ +while (0) + +#define vnic_warn(prefix, format, arg...) \ +do { vnic_printk(KERN_WARNING, prefix, format, ## arg); } \ +while (0) + +#define vnic_err(prefix, format, arg...) \ +do { vnic_printk(KERN_ERR, prefix, format, ## arg); } \ +while (0) + +#define _sprintf(p, buf, format, arg...) \ + (PAGE_SIZE - (int)(p - buf)) <= 0 ?
0 : \ + scnprintf(p, PAGE_SIZE - (int)(p - buf), format, ## arg) + +/* debug functions */ +#ifndef CONFIG_MLX4_VNIC_DEBUG +#define ASSERT(x) do { (void)(x); } while (0) +#define vnic_dbg_mark(void) do { } while (0) +#define vnic_dbg_func(prefix) do { } while (0) +#define vnic_dbg(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_mcast(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_mcast_v(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_mcast_vv(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_debug(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_ethtool(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_ethtool_v(prefix, format, arg...) \ + do { (void)(prefix); } while (0) +#define vnic_dbg_data(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_data_v(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_fip(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_parse(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_lag(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_lag_v(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_fip_p0(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_sysfs(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_mac(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_fip_v(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_vhub(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_vhub_v(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_moder(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_moder_v(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_printk_skb(prefix, skb, o1, o2) do { (void)(prefix); } while (0) +#define vnic_dbg_skb(prefix, skb, o1, o2) do { (void)(prefix); } while (0) +#else +#define ASSERT(x) \ +do { if (x) break; \ + printk(KERN_EMERG "### ASSERTION FAILED %s: %s: %d: %s\n", \ + __FILE__, __func__, __LINE__, #x); dump_stack(); BUG(); \ +} while (0) + +#define vnic_dbg(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_GENERAL)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_mcast(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_MCAST)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_mcast_v(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_MCAST_V)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_mcast_vv(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_MCAST_VV)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_debug(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_DEBUG)) break; \ + vnic_printk(KERN_WARNING, prefix, format, ## arg); \ +} while (0) + + +#define vnic_dbg_data(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_DATA)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_data_v(prefix, format, arg...) 
\ +do { if (!(vnic_msglvl & VNIC_DEBUG_DATA_V)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_fip_p0(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_FIP_P0)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_sysfs(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_SYSFS)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_mac(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_MAC)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_fip(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_FIP)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_parse(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_PARSER)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_lag(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_LAG)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_lag_v(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_LAG_V)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_fip_v(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_FIP_V)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_vhub(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_VHUB)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_vhub_v(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_VHUB_V)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_moder(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_MODER)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_moder_v(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_MODER_V)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_ethtool(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_ETHTOOL)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_ethtool_v(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_ETHTOOL_V)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_mark(void) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_MARK)) break; \ + vnic_printk(KERN_DEBUG, NULL, "###\n"); \ +} while (0) + +#define vnic_dbg_func(prefix) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_FUNC)) break; \ + vnic_printk(KERN_DEBUG, prefix, "function called\n"); \ +} while (0) + +#define ethp2str(p, str) \ +do { \ + switch (ntohs(p)) { \ + case ETH_P_RARP: sprintf(str, "%s", "ETH_P_RARP"); break; \ + case ETH_P_ARP: sprintf(str, "%s", "ETH_P_ARP"); break; \ + case ETH_P_IP: sprintf(str, "%s", "ETH_P_IP"); break; \ + case ETH_P_IPV6: sprintf(str, "%s", "ETH_P_IPV6"); break; \ + case ETH_P_8021Q:sprintf(str, "%s", "ETH_P_8021Q");break; \ + default: sprintf(str, "0x%x", p); break; \ + } \ +} while (0) + +#define skb_printk(prefix, format, arg...) 
\ + printk(KERN_DEBUG "[%s] " format, prefix, ## arg) + +#define vnic_dbg_skb(_prefix, skb, eoib_off, eth_off) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_SKB)) break; \ + vnic_printk_skb(_prefix, skb, eoib_off, eth_off); \ +} while (0) + +#define VNIC_SYSLOG_LLEN 64 +#define vnic_printk_skb(_prefix, skb, eoib_off, eth_off) \ +do { \ + char pr[VNIC_SYSLOG_LLEN]; \ + char h_proto_str[VNIC_SYSLOG_LLEN]; \ + struct eoibhdr *eoib_hdr = (struct eoibhdr *) \ + (skb->data + eoib_off); \ + struct ethhdr *ethh = (struct ethhdr *) \ + (skb->data + eth_off); \ + struct net_device *dev = skb->dev; \ + ASSERT(dev); \ + snprintf(pr, VNIC_SYSLOG_LLEN, "%s:skb-%s", dev->name, _prefix);\ + skb_printk(pr, "\n"); \ + skb_printk(pr, "--- skb dump ---\n"); \ + skb_printk(pr, "len : %d\n", skb->len); \ + skb_printk(pr, "data_len : %d\n", skb->data_len); \ + skb_printk(pr, "frags : %d\n", \ + skb_shinfo(skb)->nr_frags); \ + skb_printk(pr, "gso : %d\n", skb_is_gso(skb)); \ + skb_printk(pr, "head_len : %d\n", (int)skb_headlen(skb)); \ + skb_printk(pr, "data : %p\n", skb->data); \ + skb_printk(pr, "head : %p\n", skb->head); \ + skb_printk(pr, "tail : %lu\n", \ + (unsigned long)(skb->tail)); \ + skb_printk(pr, "end : %lu\n", \ + (unsigned long)(skb->end)); \ + skb_printk(pr, "eoib_off : %lu\n", eoib_off); \ + skb_printk(pr, "eth_off : %lu\n", eth_off); \ + if (eth_off < 0 || !skb_headlen(skb)) \ + break; \ + ethp2str(ethh->h_proto, h_proto_str); \ + skb_printk(pr, "eth_proto : %s\n", h_proto_str); \ + skb_printk(pr, "eth_dest : "MAC_6_PRINT_FMT"\n", \ + MAC_6_PRINT_ARG(ethh->h_dest)); \ + skb_printk(pr, "eth_source : "MAC_6_PRINT_FMT"\n", \ + MAC_6_PRINT_ARG(ethh->h_source)); \ + if (eoib_off < 0) \ + break; \ + skb_printk(pr, "eoib_seg_id : 0x%04x\n", eoib_hdr->seg_id); \ + skb_printk(pr, "eoib_seg_off : 0x%02x\n", eoib_hdr->seg_off); \ + skb_printk(pr, "eoib_ip_chk : 0x%02x\n", \ + VNIC_EOIB_HDR_GET_IP_CHK(eoib_hdr)); \ + skb_printk(pr, "eoib_tcp_chk : 0x%02x\n", \ + VNIC_EOIB_HDR_GET_TCP_UDP_CHK(eoib_hdr)); \ + skb_printk(pr, "eoib_ver : 0x%02x\n", \ + VNIC_EOIB_HDR_GET_VER(eoib_hdr)); \ + skb_printk(pr, "eoib_sig : 0x%02x\n", \ + VNIC_EOIB_HDR_GET_SIG(eoib_hdr)); \ +} while (0) + +#endif /* CONFIG_MLX4_VNIC_DEBUG */ +#endif /* _VNIC_UTILS_H */ diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index 9a18667c13cc0..535b1c769cc11 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -59,12 +59,15 @@ enum { MLX4_CMD_HW_HEALTH_CHECK = 0x50, MLX4_CMD_SET_PORT = 0xc, MLX4_CMD_SET_NODE = 0x5a, + MLX4_CMD_QUERY_FUNC = 0x56, MLX4_CMD_ACCESS_DDR = 0x2e, MLX4_CMD_MAP_ICM = 0xffa, MLX4_CMD_UNMAP_ICM = 0xff9, MLX4_CMD_MAP_ICM_AUX = 0xffc, MLX4_CMD_UNMAP_ICM_AUX = 0xffb, MLX4_CMD_SET_ICM_SIZE = 0xffd, + MLX4_CMD_INFORM_FLR_DONE = 0x5b, /*master notify fw on finish for slave's flr*/ + MLX4_CMD_MAD_DEMUX = 0x203, /* TPT commands */ MLX4_CMD_SW2HW_MPT = 0xd, @@ -119,20 +122,49 @@ enum { /* miscellaneous commands */ MLX4_CMD_DIAG_RPRT = 0x30, MLX4_CMD_NOP = 0x31, + MLX4_CMD_ACCESS_MEM = 0x2e, + MLX4_CMD_SET_VEP = 0x52, + + /* Ethernet specific commands */ + MLX4_CMD_SET_VLAN_FLTR = 0x47, + MLX4_CMD_SET_MCAST_FLTR = 0x48, + MLX4_CMD_DUMP_ETH_STATS = 0x49, + + /* Communication channel commands */ + MLX4_CMD_ARM_COMM_CHANNEL = 0x57, + MLX4_CMD_GEN_EQE = 0x58, + + /* virtual commands */ + MLX4_CMD_ALLOC_RES = 0xf00, + MLX4_CMD_FREE_RES = 0xf01, + MLX4_CMD_REPLACE_RES = 0xf02, + MLX4_CMD_GET_EVENT = 0xf03, + MLX4_CMD_QUERY_SLAVE_CAP = 0xf04, + MLX4_CMD_MCAST_ATTACH = 0xf05, + MLX4_CMD_COMM_INT = 0xf07, + 
MLX4_CMD_PROMISC = 0xf08, + MLX4_CMD_GET_PKEY_TABLE = 0xf09, + MLX4_CMD_GET_GID_MAP = 0xf10, + MLX4_CMD_ENABLE_FMR = 0xf11, /* debug commands */ MLX4_CMD_QUERY_DEBUG_MSG = 0x2a, MLX4_CMD_SET_DEBUG_MSG = 0x2b, + + /* statistics commands */ + MLX4_CMD_QUERY_IF_STAT = 0X54, + MLX4_CMD_SET_IF_STAT = 0X55, }; enum { - MLX4_CMD_TIME_CLASS_A = 10000, - MLX4_CMD_TIME_CLASS_B = 10000, - MLX4_CMD_TIME_CLASS_C = 10000, + MLX4_CMD_TIME_CLASS_A = 60000, + MLX4_CMD_TIME_CLASS_B = 60000, + MLX4_CMD_TIME_CLASS_C = 60000, }; enum { - MLX4_MAILBOX_SIZE = 4096 + MLX4_MAILBOX_SIZE = 4096, + MLX4_ACCESS_MEM_ALIGN = 256, }; enum { @@ -143,6 +175,42 @@ enum { MLX4_SET_PORT_VLAN_TABLE = 0x3, MLX4_SET_PORT_PRIO_MAP = 0x4, MLX4_SET_PORT_GID_TABLE = 0x5, + MLX4_SET_PORT_MODIFIERS +}; + +enum { + MLX4_DUMP_STATS_PORT_COUNTERS = 0x0, + MLX4_DUMP_STATS_FUNC_COUNTERS = 0x8, +}; + +enum { + MLX4_CMD_MAD_DEMUX_CONFIG = 0, + MLX4_CMD_MAD_DEMUX_QUERY_STATE = 1, + MLX4_CMD_MAD_DEMUX_QUERY_REST = 2, /* Query mad demux restrictions */ +}; + +enum { + MLX4_CMD_MAD_DEMUX_SUBN_ATTR_NOTICE = 1 << 0, + MLX4_CMD_MAD_DEMUX_SUBN_ATTR_NODE_DESC = 1 << 1, + MLX4_CMD_MAD_DEMUX_SUBN_ATTR_NODE_INFO = 1 << 2, + MLX4_CMD_MAD_DEMUX_SUBN_ATTR_SWITCH_INFO = 1 << 3, + MLX4_CMD_MAD_DEMUX_SUBN_ATTR_GUID_INFO = 1 << 4, + MLX4_CMD_MAD_DEMUX_SUBN_ATTR_PORT_INFO = 1 << 5, + MLX4_CMD_MAD_DEMUX_SUBN_ATTR_PKRY_TBL = 1 << 6, + MLX4_CMD_MAD_DEMUX_SUBN_ATTR_SL2VL_TBL = 1 << 7, + MLX4_CMD_MAD_DEMUX_SUBN_ATTR_VL_ARB_TBL = 1 << 8, + MLX4_CMD_MAD_DEMUX_SUBN_ATTR_LIN_FW_TBL = 1 << 9, + MLX4_CMD_MAD_DEMUX_SUBN_ATTR_RAND_FW_TBL = 1 << 10, + MLX4_CMD_MAD_DEMUX_SUBN_ATTR_MCAST_FW_TBL = 1 << 11, + MLX4_CMD_MAD_DEMUX_SUBN_ATTR_LINK_PAIRS_TBL = 1 << 12, + MLX4_CMD_MAD_DEMUX_SUBN_ATTR_SM_INFO = 1 << 13, + MLX4_CMD_MAD_DEMUX_SUBN_ATTR_VENDOR_DIAG = 1 << 14, + MLX4_CMD_MAD_DEMUX_SUBN_ATTR_LED_INFO = 1 << 15, +}; + +enum { + MLX4_CMD_MAD_DEMUX_SUBN_TRAP = 0x40, + MLX4_CMD_MAD_DEMUX_SUBN_TRAP_REP = 0x70, }; struct mlx4_dev; @@ -154,23 +222,23 @@ struct mlx4_cmd_mailbox { int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, int out_is_imm, u32 in_modifier, u8 op_modifier, - u16 op, unsigned long timeout); + u16 op, unsigned long timeout, int native); /* Invoke a command with no output parameter */ static inline int mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u32 in_modifier, - u8 op_modifier, u16 op, unsigned long timeout) + u8 op_modifier, u16 op, unsigned long timeout, int native) { return __mlx4_cmd(dev, in_param, NULL, 0, in_modifier, - op_modifier, op, timeout); + op_modifier, op, timeout, native); } /* Invoke a command with an output mailbox */ static inline int mlx4_cmd_box(struct mlx4_dev *dev, u64 in_param, u64 out_param, u32 in_modifier, u8 op_modifier, u16 op, - unsigned long timeout) + unsigned long timeout, int native) { return __mlx4_cmd(dev, in_param, &out_param, 0, in_modifier, - op_modifier, op, timeout); + op_modifier, op, timeout, native); } /* @@ -180,13 +248,18 @@ static inline int mlx4_cmd_box(struct mlx4_dev *dev, u64 in_param, u64 out_param */ static inline int mlx4_cmd_imm(struct mlx4_dev *dev, u64 in_param, u64 *out_param, u32 in_modifier, u8 op_modifier, u16 op, - unsigned long timeout) + unsigned long timeout, int native) { return __mlx4_cmd(dev, in_param, out_param, 1, in_modifier, - op_modifier, op, timeout); + op_modifier, op, timeout, native); } struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev); void mlx4_free_cmd_mailbox(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox); +u32 
mlx4_comm_get_version(void); + +#define MLX4_COMM_GET_VER(cmd_chan_ver) (u8)(cmd_chan_ver) +#define MLX4_COMM_GET_IF_REV(cmd_chan_ver) (u8)((cmd_chan_ver) >> 8) + #endif /* MLX4_CMD_H */ diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 1aae95fd92d17..145d1bac3926d 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -39,48 +39,59 @@ #include -#define MAX_MSIX_P_PORT 17 -#define MAX_MSIX 64 -#define MSIX_LEGACY_SZ 4 -#define MIN_MSIX_P_PORT 5 - enum { MLX4_FLAG_MSI_X = 1 << 0, MLX4_FLAG_OLD_PORT_CMDS = 1 << 1, + MLX4_FLAG_MASTER = 1 << 2, + MLX4_FLAG_MFUNC = 1 << 3, + MLX4_FLAG_SRIOV = 1 << 4, }; enum { - MLX4_MAX_PORTS = 2 + MLX4_MAX_PORTS = 2, + MLX4_MAX_PORT_PKEYS = 128, }; +/* base qkey for use in sriov tunnel-qp/proxy-cp communication. + * These qkeys must not be allowed for general use. This is a 64k range, + * and to test for violation, we use the mask (protect against future chg). + */ +#define MLX4_RESERVED_QKEY_BASE (0xFFFF0000) +#define MLX4_RESERVED_QKEY_MASK (0xFFFF0000) + enum { MLX4_BOARD_ID_LEN = 64 }; enum { - MLX4_DEV_CAP_FLAG_RC = 1LL << 0, - MLX4_DEV_CAP_FLAG_UC = 1LL << 1, - MLX4_DEV_CAP_FLAG_UD = 1LL << 2, - MLX4_DEV_CAP_FLAG_SRQ = 1LL << 6, - MLX4_DEV_CAP_FLAG_IPOIB_CSUM = 1LL << 7, - MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR = 1LL << 8, - MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR = 1LL << 9, - MLX4_DEV_CAP_FLAG_DPDP = 1LL << 12, - MLX4_DEV_CAP_FLAG_BLH = 1LL << 15, - MLX4_DEV_CAP_FLAG_MEM_WINDOW = 1LL << 16, - MLX4_DEV_CAP_FLAG_APM = 1LL << 17, - MLX4_DEV_CAP_FLAG_ATOMIC = 1LL << 18, - MLX4_DEV_CAP_FLAG_RAW_MCAST = 1LL << 19, - MLX4_DEV_CAP_FLAG_UD_AV_PORT = 1LL << 20, - MLX4_DEV_CAP_FLAG_UD_MCAST = 1LL << 21, - MLX4_DEV_CAP_FLAG_IBOE = 1LL << 30, - MLX4_DEV_CAP_FLAG_UC_LOOPBACK = 1LL << 32, - MLX4_DEV_CAP_FLAG_FCS_KEEP = 1LL << 34, - MLX4_DEV_CAP_FLAG_WOL_PORT1 = 1LL << 37, - MLX4_DEV_CAP_FLAG_WOL_PORT2 = 1LL << 38, - MLX4_DEV_CAP_FLAG_UDP_RSS = 1LL << 40, - MLX4_DEV_CAP_FLAG_VEP_UC_STEER = 1LL << 41, - MLX4_DEV_CAP_FLAG_VEP_MC_STEER = 1LL << 42 + MLX4_MAX_NUM_PF = 16, + MLX4_MAX_NUM_VF = 64, + MLX4_MFUNC_MAX = 80, + MLX4_MFUNC_EQ_NUM = 4, + MLX4_MFUNC_MAX_EQES = 8, + MLX4_MFUNC_EQE_MASK = (MLX4_MFUNC_MAX_EQES - 1) +}; + +enum { + MLX4_DEV_CAP_FLAG_RC = 1 << 0, + MLX4_DEV_CAP_FLAG_UC = 1 << 1, + MLX4_DEV_CAP_FLAG_UD = 1 << 2, + MLX4_DEV_CAP_FLAG_XRC = 1 << 3, + MLX4_DEV_CAP_FLAG_SRQ = 1 << 6, + MLX4_DEV_CAP_FLAG_IPOIB_CSUM = 1 << 7, + MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR = 1 << 8, + MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR = 1 << 9, + MLX4_DEV_CAP_FLAG_DPDP = 1 << 12, + MLX4_DEV_CAP_FLAG_RAW_ETY = 1 << 13, + MLX4_DEV_CAP_FLAG_BLH = 1 << 15, + MLX4_DEV_CAP_FLAG_MEM_WINDOW = 1 << 16, + MLX4_DEV_CAP_FLAG_APM = 1 << 17, + MLX4_DEV_CAP_FLAG_ATOMIC = 1 << 18, + MLX4_DEV_CAP_FLAG_RAW_MCAST = 1 << 19, + MLX4_DEV_CAP_FLAG_UD_AV_PORT = 1 << 20, + MLX4_DEV_CAP_FLAG_UD_MCAST = 1 << 21, + MLX4_DEV_CAP_FLAG_IBOE = 1 << 30, + MLX4_DEV_CAP_FLAG_FC_T11 = 1 << 31 }; enum { @@ -109,7 +120,24 @@ enum mlx4_event { MLX4_EVENT_TYPE_PORT_CHANGE = 0x09, MLX4_EVENT_TYPE_EQ_OVERFLOW = 0x0f, MLX4_EVENT_TYPE_ECC_DETECT = 0x0e, - MLX4_EVENT_TYPE_CMD = 0x0a + MLX4_EVENT_TYPE_CMD = 0x0a, + MLX4_EVENT_TYPE_CLIENT_REREGISTER = 0x16, + MLX4_EVENT_TYPE_VEP_UPDATE = 0x19, + MLX4_EVENT_TYPE_COMM_CHANNEL = 0x18, + MLX4_EVENT_TYPE_MAC_UPDATE = 0x20, + MLX4_EVENT_TYPE_SW_EVENT = 0x3f, /* TBD add check that future HW does not use this value */ + MLX4_EVENT_TYPE_FLR_EVENT = 0x1c, + MLX4_EVENT_TYPE_PORT_MGMT_CHANGE = 0x1d, + MLX4_EVENT_TYPE_NONE = 0xff, +}; + +enum sw_event_sub_type{ + 
PKEY_UPDATE_AVIAL = 0, + GUID_CHANGE_AVIAL = 1, + LID_CHANGE_AVIAL = 2, + CLIENT_REREGISTER_AVIAL = 3, + FUNCTION_BOOT = 4, + FUNCTION_SHUTDOWN = 5, }; enum { @@ -117,6 +145,24 @@ enum { MLX4_PORT_CHANGE_SUBTYPE_ACTIVE = 4 }; +enum slave_port_state { + SLAVE_PORT_DOWN = 0, + SLAVE_PENDING_UP, + SLAVE_PORT_UP, +}; + +enum slave_port_gen_event { + SLAVE_PORT_GEN_EVENT_DOWN = 0, + SLAVE_PORT_GEN_EVENT_UP, + SLAVE_PORT_GEN_EVENT_NONE, +}; +enum slave_port_state_event { + MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN, + MLX4_PORT_STATE_DEV_EVENT_PORT_UP, + MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID, + MLX4_PORT_STATE_IB_EVENT_GID_INVALID, +}; + enum { MLX4_PERM_LOCAL_READ = 1 << 10, MLX4_PERM_LOCAL_WRITE = 1 << 11, @@ -133,6 +179,7 @@ enum { MLX4_OPCODE_SEND = 0x0a, MLX4_OPCODE_SEND_IMM = 0x0b, MLX4_OPCODE_LSO = 0x0e, + MLX4_OPCODE_BIG_LSO = 0x2e, MLX4_OPCODE_RDMA_READ = 0x10, MLX4_OPCODE_ATOMIC_CS = 0x11, MLX4_OPCODE_ATOMIC_FA = 0x12, @@ -171,11 +218,11 @@ enum mlx4_qp_region { MLX4_QP_REGION_FW = 0, MLX4_QP_REGION_ETH_ADDR, MLX4_QP_REGION_FC_ADDR, - MLX4_QP_REGION_FC_EXCH, MLX4_NUM_QP_REGION }; enum mlx4_port_type { + MLX4_PORT_TYPE_NONE = 0, MLX4_PORT_TYPE_IB = 1, MLX4_PORT_TYPE_ETH = 2, MLX4_PORT_TYPE_AUTO = 3 @@ -186,19 +233,29 @@ enum mlx4_special_vlan_idx { MLX4_VLAN_MISS_IDX, MLX4_VLAN_REGULAR }; +#define MLX4_LEAST_ATTACHED_VECTOR 0xffffffff enum mlx4_steer_type { - MLX4_MC_STEER = 0, - MLX4_UC_STEER, + MLX4_UC_STEER = 0, + MLX4_MC_STEER, MLX4_NUM_STEERS }; enum { - MLX4_NUM_FEXCH = 64 * 1024, + MLX4_CUNTERS_DISABLED, + MLX4_CUNTERS_BASIC, + MLX4_CUNTERS_EXT }; -enum { - MLX4_MAX_FAST_REG_PAGES = 511, +enum mlx4_mr_state { + MLX4_MR_DISABLED = 0, + MLX4_MR_EN_HW, + MLX4_MR_EN_SW +}; + +enum mlx4_mr_flags { + MLX4_MR_FLAG_NONE = 0, + MLX4_MR_FLAG_FMR = 1 }; static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor) @@ -208,75 +265,102 @@ static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor) struct mlx4_caps { u64 fw_ver; - int num_ports; - int vl_cap[MLX4_MAX_PORTS + 1]; - int ib_mtu_cap[MLX4_MAX_PORTS + 1]; + u32 function; + u32 pf_num; + u32 num_ports; + u32 vl_cap[MLX4_MAX_PORTS + 1]; + u32 ib_mtu_cap[MLX4_MAX_PORTS + 1]; __be32 ib_port_def_cap[MLX4_MAX_PORTS + 1]; u64 def_mac[MLX4_MAX_PORTS + 1]; - int eth_mtu_cap[MLX4_MAX_PORTS + 1]; - int gid_table_len[MLX4_MAX_PORTS + 1]; - int pkey_table_len[MLX4_MAX_PORTS + 1]; - int trans_type[MLX4_MAX_PORTS + 1]; - int vendor_oui[MLX4_MAX_PORTS + 1]; - int wavelength[MLX4_MAX_PORTS + 1]; + u32 eth_mtu_cap[MLX4_MAX_PORTS + 1]; + u32 gid_table_len[MLX4_MAX_PORTS + 1]; + u32 pkey_table_len[MLX4_MAX_PORTS + 1]; + u32 pkey_table_max_len[MLX4_MAX_PORTS + 1]; + u32 trans_type[MLX4_MAX_PORTS + 1]; + u32 vendor_oui[MLX4_MAX_PORTS + 1]; + u32 wavelength[MLX4_MAX_PORTS + 1]; u64 trans_code[MLX4_MAX_PORTS + 1]; - int local_ca_ack_delay; - int num_uars; - int bf_reg_size; - int bf_regs_per_page; - int max_sq_sg; - int max_rq_sg; - int num_qps; - int max_wqes; - int max_sq_desc_sz; - int max_rq_desc_sz; - int max_qp_init_rdma; - int max_qp_dest_rdma; - int sqp_start; - int num_srqs; - int max_srq_wqes; - int max_srq_sge; - int reserved_srqs; - int num_cqs; - int max_cqes; - int reserved_cqs; - int num_eqs; - int reserved_eqs; - int num_comp_vectors; - int comp_pool; - int num_mpts; - int num_mtt_segs; - int mtts_per_seg; - int fmr_reserved_mtts; - int reserved_mtts; - int reserved_mrws; - int reserved_uars; - int num_mgms; - int num_amgms; - int reserved_mcgs; - int num_qp_per_mgm; - int num_pds; - int reserved_pds; - int mtt_entry_sz; + u32 
local_ca_ack_delay; + u32 num_uars; + u32 uar_page_size; + u32 bf_reg_size; + u32 bf_regs_per_page; + u32 max_sq_sg; + u32 max_rq_sg; + u32 num_qps; + u32 max_wqes; + u32 max_sq_desc_sz; + u32 max_rq_desc_sz; + u32 max_qp_init_rdma; + u32 max_qp_dest_rdma; + u32 sqp_start; + u32 tunnel_qpn; + u32 num_srqs; + u32 max_srq_wqes; + u32 max_srq_sge; + u32 reserved_srqs; + u32 num_cqs; + u32 max_cqes; + u32 reserved_cqs; + u32 num_eqs; + u32 reserved_eqs; + u32 num_comp_vectors; + u32 num_mpts; + u32 num_mtt_segs; + u32 mtts_per_seg; + u32 fmr_reserved_mtts; + u32 reserved_mtts; + u32 reserved_mrws; + u32 reserved_uars; + u32 num_mgms; + u32 num_amgms; + u32 reserved_mcgs; + u32 num_qp_per_mgm; + u32 num_pds; + u32 reserved_pds; + u32 mtt_entry_sz; + u32 reserved_xrcds; + u32 max_xrcds; u32 max_msg_sz; u32 page_size_cap; u64 flags; u32 bmme_flags; u32 reserved_lkey; u16 stat_rate_support; + u32 udp_rss; + u32 loopback_support; + u32 vep_uc_steering; + u32 vep_mc_steering; + u32 wol; u8 port_width_cap[MLX4_MAX_PORTS + 1]; - int max_gso_sz; - int reserved_qps_cnt[MLX4_NUM_QP_REGION]; - int reserved_qps; - int reserved_qps_base[MLX4_NUM_QP_REGION]; - int log_num_macs; - int log_num_vlans; - int log_num_prios; - enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1]; + u32 max_gso_sz; + u32 reserved_qps_cnt[MLX4_NUM_QP_REGION]; + u32 reserved_qps; + u32 reserved_qps_base[MLX4_NUM_QP_REGION]; + u32 log_num_macs; + u32 log_num_vlans; + u32 log_num_prios; + u32 port_type[MLX4_MAX_PORTS + 1]; u8 supported_type[MLX4_MAX_PORTS + 1]; - u32 port_mask; - enum mlx4_port_type possible_type[MLX4_MAX_PORTS + 1]; -}; + u8 sqp_demux; + u32 port_mask[MLX4_MAX_PORTS + 1]; + u32 possible_type[MLX4_MAX_PORTS + 1]; + u8 counters_mode; + u32 max_basic_counters; + u32 max_ext_counters; + u32 mc_promisc_mode; + u32 mad_demux; + u32 dmpt_entry_sz; + u64 mtt_base; + u64 dmpt_base; + u64 fmr_dmpt_base; + u32 fmr_dmpt_base_idx; + u32 fmr_num_mpts; + u64 fmr_mtt_base; + u32 fmr_mtt_base_idx; + u32 fmr_num_mtt_segs; + u8 fmr_log_page_size; +} __attribute__((packed)); struct mlx4_buf_list { void *buf; @@ -337,6 +421,7 @@ struct mlx4_mr { u32 pd; u32 access; int enabled; + enum mlx4_mr_flags flags; }; struct mlx4_fmr { @@ -440,13 +525,54 @@ union mlx4_ext_av { struct mlx4_eth_av eth; }; +struct mlx4_counters { + __be32 counter_mode; + __be32 num_ifc; + u32 reserved[2]; + __be64 rx_frames; + __be64 rx_bytes; + __be64 tx_frames; + __be64 tx_bytes; +}; + +struct mlx4_counters_ext { + __be32 counter_mode; + __be32 num_ifc; + u32 reserved[2]; + __be64 rx_uni_frames; + __be64 rx_uni_bytes; + __be64 rx_mcast_frames; + __be64 rx_mcast_bytes; + __be64 rx_bcast_frames; + __be64 rx_bcast_bytes; + __be64 rx_nobuf_frames; + __be64 rx_nobuf_bytes; + __be64 rx_err_frames; + __be64 rx_err_bytes; + __be64 tx_uni_frames; + __be64 tx_uni_bytes; + __be64 tx_mcast_frames; + __be64 tx_mcast_bytes; + __be64 tx_bcast_frames; + __be64 tx_bcast_bytes; + __be64 tx_nobuf_frames; + __be64 tx_nobuf_bytes; + __be64 tx_err_frames; + __be64 tx_err_bytes; +}; + struct mlx4_dev { struct pci_dev *pdev; unsigned long flags; + unsigned long num_slaves; struct mlx4_caps caps; struct radix_tree_root qp_table_tree; - u8 rev_id; + struct radix_tree_root srq_table_tree; + u32 rev_id; char board_id[MLX4_BOARD_ID_LEN]; + int sr_iov; + int is_internal_sma; + u8 gids_per_func; }; struct mlx4_init_port_param { @@ -463,16 +589,678 @@ struct mlx4_init_port_param { u64 si_guid; }; +static inline void mlx4_query_steer_cap(struct mlx4_dev *dev, int *log_mac, + int *log_vlan, int 
*log_prio) +{ + *log_mac = dev->caps.log_num_macs; + *log_vlan = dev->caps.log_num_vlans; + *log_prio = dev->caps.log_num_prios; +} + +struct mlx4_stat_out_mbox { + /* Received frames with a length of 64 octets */ + __be64 R64_prio_0; + __be64 R64_prio_1; + __be64 R64_prio_2; + __be64 R64_prio_3; + __be64 R64_prio_4; + __be64 R64_prio_5; + __be64 R64_prio_6; + __be64 R64_prio_7; + __be64 R64_novlan; + /* Received frames with a length of 127 octets */ + __be64 R127_prio_0; + __be64 R127_prio_1; + __be64 R127_prio_2; + __be64 R127_prio_3; + __be64 R127_prio_4; + __be64 R127_prio_5; + __be64 R127_prio_6; + __be64 R127_prio_7; + __be64 R127_novlan; + /* Received frames with a length of 255 octets */ + __be64 R255_prio_0; + __be64 R255_prio_1; + __be64 R255_prio_2; + __be64 R255_prio_3; + __be64 R255_prio_4; + __be64 R255_prio_5; + __be64 R255_prio_6; + __be64 R255_prio_7; + __be64 R255_novlan; + /* Received frames with a length of 511 octets */ + __be64 R511_prio_0; + __be64 R511_prio_1; + __be64 R511_prio_2; + __be64 R511_prio_3; + __be64 R511_prio_4; + __be64 R511_prio_5; + __be64 R511_prio_6; + __be64 R511_prio_7; + __be64 R511_novlan; + /* Received frames with a length of 1023 octets */ + __be64 R1023_prio_0; + __be64 R1023_prio_1; + __be64 R1023_prio_2; + __be64 R1023_prio_3; + __be64 R1023_prio_4; + __be64 R1023_prio_5; + __be64 R1023_prio_6; + __be64 R1023_prio_7; + __be64 R1023_novlan; + /* Received frames with a length of 1518 octets */ + __be64 R1518_prio_0; + __be64 R1518_prio_1; + __be64 R1518_prio_2; + __be64 R1518_prio_3; + __be64 R1518_prio_4; + __be64 R1518_prio_5; + __be64 R1518_prio_6; + __be64 R1518_prio_7; + __be64 R1518_novlan; + /* Received frames with a length of 1522 octets */ + __be64 R1522_prio_0; + __be64 R1522_prio_1; + __be64 R1522_prio_2; + __be64 R1522_prio_3; + __be64 R1522_prio_4; + __be64 R1522_prio_5; + __be64 R1522_prio_6; + __be64 R1522_prio_7; + __be64 R1522_novlan; + /* Received frames with a length of 1548 octets */ + __be64 R1548_prio_0; + __be64 R1548_prio_1; + __be64 R1548_prio_2; + __be64 R1548_prio_3; + __be64 R1548_prio_4; + __be64 R1548_prio_5; + __be64 R1548_prio_6; + __be64 R1548_prio_7; + __be64 R1548_novlan; + /* Received frames with a length of 1548 < octets < MTU */ + __be64 R2MTU_prio_0; + __be64 R2MTU_prio_1; + __be64 R2MTU_prio_2; + __be64 R2MTU_prio_3; + __be64 R2MTU_prio_4; + __be64 R2MTU_prio_5; + __be64 R2MTU_prio_6; + __be64 R2MTU_prio_7; + __be64 R2MTU_novlan; + /* Received frames with a length of MTU< octets and good CRC */ + __be64 RGIANT_prio_0; + __be64 RGIANT_prio_1; + __be64 RGIANT_prio_2; + __be64 RGIANT_prio_3; + __be64 RGIANT_prio_4; + __be64 RGIANT_prio_5; + __be64 RGIANT_prio_6; + __be64 RGIANT_prio_7; + __be64 RGIANT_novlan; + /* Received broadcast frames with good CRC */ + __be64 RBCAST_prio_0; + __be64 RBCAST_prio_1; + __be64 RBCAST_prio_2; + __be64 RBCAST_prio_3; + __be64 RBCAST_prio_4; + __be64 RBCAST_prio_5; + __be64 RBCAST_prio_6; + __be64 RBCAST_prio_7; + __be64 RBCAST_novlan; + /* Received multicast frames with good CRC */ + __be64 MCAST_prio_0; + __be64 MCAST_prio_1; + __be64 MCAST_prio_2; + __be64 MCAST_prio_3; + __be64 MCAST_prio_4; + __be64 MCAST_prio_5; + __be64 MCAST_prio_6; + __be64 MCAST_prio_7; + __be64 MCAST_novlan; + /* Received unicast not short or GIANT frames with good CRC */ + __be64 RTOTG_prio_0; + __be64 RTOTG_prio_1; + __be64 RTOTG_prio_2; + __be64 RTOTG_prio_3; + __be64 RTOTG_prio_4; + __be64 RTOTG_prio_5; + __be64 RTOTG_prio_6; + __be64 RTOTG_prio_7; + __be64 RTOTG_novlan; + + /* Count of 
total octets of received frames, includes framing characters */ + __be64 RTTLOCT_prio_0; + /* Count of total octets of received frames, not including framing + characters */ + __be64 RTTLOCT_NOFRM_prio_0; + /* Count of Total number of octets received + (only for frames without errors) */ + __be64 ROCT_prio_0; + + __be64 RTTLOCT_prio_1; + __be64 RTTLOCT_NOFRM_prio_1; + __be64 ROCT_prio_1; + + __be64 RTTLOCT_prio_2; + __be64 RTTLOCT_NOFRM_prio_2; + __be64 ROCT_prio_2; + + __be64 RTTLOCT_prio_3; + __be64 RTTLOCT_NOFRM_prio_3; + __be64 ROCT_prio_3; + + __be64 RTTLOCT_prio_4; + __be64 RTTLOCT_NOFRM_prio_4; + __be64 ROCT_prio_4; + + __be64 RTTLOCT_prio_5; + __be64 RTTLOCT_NOFRM_prio_5; + __be64 ROCT_prio_5; + + __be64 RTTLOCT_prio_6; + __be64 RTTLOCT_NOFRM_prio_6; + __be64 ROCT_prio_6; + + __be64 RTTLOCT_prio_7; + __be64 RTTLOCT_NOFRM_prio_7; + __be64 ROCT_prio_7; + + __be64 RTTLOCT_novlan; + __be64 RTTLOCT_NOFRM_novlan; + __be64 ROCT_novlan; + + /* Count of Total received frames including bad frames */ + __be64 RTOT_prio_0; + /* Count of Total number of received frames with 802.1Q encapsulation */ + __be64 R1Q_prio_0; + __be64 reserved1; + + __be64 RTOT_prio_1; + __be64 R1Q_prio_1; + __be64 reserved2; + + __be64 RTOT_prio_2; + __be64 R1Q_prio_2; + __be64 reserved3; + + __be64 RTOT_prio_3; + __be64 R1Q_prio_3; + __be64 reserved4; + + __be64 RTOT_prio_4; + __be64 R1Q_prio_4; + __be64 reserved5; + + __be64 RTOT_prio_5; + __be64 R1Q_prio_5; + __be64 reserved6; + + __be64 RTOT_prio_6; + __be64 R1Q_prio_6; + __be64 reserved7; + + __be64 RTOT_prio_7; + __be64 R1Q_prio_7; + __be64 reserved8; + + __be64 RTOT_novlan; + __be64 R1Q_novlan; + __be64 reserved9; + + /* Total number of Successfully Received Control Frames */ + __be64 RCNTL; + __be64 reserved10; + __be64 reserved11; + __be64 reserved12; + /* Count of received frames with a length/type field value between 46 + (42 for VLANtagged frames) and 1500 (also 1500 for VLAN-tagged frames), + inclusive */ + __be64 RInRangeLengthErr; + /* Count of received frames with length/type field between 1501 and 1535 + decimal, inclusive */ + __be64 ROutRangeLengthErr; + /* Count of received frames that are longer than max allowed size for + 802.3 frames (1518/1522) */ + __be64 RFrmTooLong; + /* Count frames received with PCS error */ + __be64 PCS; + + /* Transmit frames with a length of 64 octets */ + __be64 T64_prio_0; + __be64 T64_prio_1; + __be64 T64_prio_2; + __be64 T64_prio_3; + __be64 T64_prio_4; + __be64 T64_prio_5; + __be64 T64_prio_6; + __be64 T64_prio_7; + __be64 T64_novlan; + __be64 T64_loopbk; + /* Transmit frames with a length of 65 to 127 octets. 
*/ + __be64 T127_prio_0; + __be64 T127_prio_1; + __be64 T127_prio_2; + __be64 T127_prio_3; + __be64 T127_prio_4; + __be64 T127_prio_5; + __be64 T127_prio_6; + __be64 T127_prio_7; + __be64 T127_novlan; + __be64 T127_loopbk; + /* Transmit frames with a length of 128 to 255 octets */ + __be64 T255_prio_0; + __be64 T255_prio_1; + __be64 T255_prio_2; + __be64 T255_prio_3; + __be64 T255_prio_4; + __be64 T255_prio_5; + __be64 T255_prio_6; + __be64 T255_prio_7; + __be64 T255_novlan; + __be64 T255_loopbk; + /* Transmit frames with a length of 256 to 511 octets */ + __be64 T511_prio_0; + __be64 T511_prio_1; + __be64 T511_prio_2; + __be64 T511_prio_3; + __be64 T511_prio_4; + __be64 T511_prio_5; + __be64 T511_prio_6; + __be64 T511_prio_7; + __be64 T511_novlan; + __be64 T511_loopbk; + /* Transmit frames with a length of 512 to 1023 octets */ + __be64 T1023_prio_0; + __be64 T1023_prio_1; + __be64 T1023_prio_2; + __be64 T1023_prio_3; + __be64 T1023_prio_4; + __be64 T1023_prio_5; + __be64 T1023_prio_6; + __be64 T1023_prio_7; + __be64 T1023_novlan; + __be64 T1023_loopbk; + /* Transmit frames with a length of 1024 to 1518 octets */ + __be64 T1518_prio_0; + __be64 T1518_prio_1; + __be64 T1518_prio_2; + __be64 T1518_prio_3; + __be64 T1518_prio_4; + __be64 T1518_prio_5; + __be64 T1518_prio_6; + __be64 T1518_prio_7; + __be64 T1518_novlan; + __be64 T1518_loopbk; + /* Counts transmit frames with a length of 1519 to 1522 bytes */ + __be64 T1522_prio_0; + __be64 T1522_prio_1; + __be64 T1522_prio_2; + __be64 T1522_prio_3; + __be64 T1522_prio_4; + __be64 T1522_prio_5; + __be64 T1522_prio_6; + __be64 T1522_prio_7; + __be64 T1522_novlan; + __be64 T1522_loopbk; + /* Transmit frames with a length of 1523 to 1548 octets */ + __be64 T1548_prio_0; + __be64 T1548_prio_1; + __be64 T1548_prio_2; + __be64 T1548_prio_3; + __be64 T1548_prio_4; + __be64 T1548_prio_5; + __be64 T1548_prio_6; + __be64 T1548_prio_7; + __be64 T1548_novlan; + __be64 T1548_loopbk; + /* Counts transmit frames with a length of 1549 to MTU bytes */ + __be64 T2MTU_prio_0; + __be64 T2MTU_prio_1; + __be64 T2MTU_prio_2; + __be64 T2MTU_prio_3; + __be64 T2MTU_prio_4; + __be64 T2MTU_prio_5; + __be64 T2MTU_prio_6; + __be64 T2MTU_prio_7; + __be64 T2MTU_novlan; + __be64 T2MTU_loopbk; + /* Transmit frames with a length greater than MTU octets and a good CRC. 
*/ + __be64 TGIANT_prio_0; + __be64 TGIANT_prio_1; + __be64 TGIANT_prio_2; + __be64 TGIANT_prio_3; + __be64 TGIANT_prio_4; + __be64 TGIANT_prio_5; + __be64 TGIANT_prio_6; + __be64 TGIANT_prio_7; + __be64 TGIANT_novlan; + __be64 TGIANT_loopbk; + /* Transmit broadcast frames with a good CRC */ + __be64 TBCAST_prio_0; + __be64 TBCAST_prio_1; + __be64 TBCAST_prio_2; + __be64 TBCAST_prio_3; + __be64 TBCAST_prio_4; + __be64 TBCAST_prio_5; + __be64 TBCAST_prio_6; + __be64 TBCAST_prio_7; + __be64 TBCAST_novlan; + __be64 TBCAST_loopbk; + /* Transmit multicast frames with a good CRC */ + __be64 TMCAST_prio_0; + __be64 TMCAST_prio_1; + __be64 TMCAST_prio_2; + __be64 TMCAST_prio_3; + __be64 TMCAST_prio_4; + __be64 TMCAST_prio_5; + __be64 TMCAST_prio_6; + __be64 TMCAST_prio_7; + __be64 TMCAST_novlan; + __be64 TMCAST_loopbk; + /* Transmit good frames that are neither broadcast nor multicast */ + __be64 TTOTG_prio_0; + __be64 TTOTG_prio_1; + __be64 TTOTG_prio_2; + __be64 TTOTG_prio_3; + __be64 TTOTG_prio_4; + __be64 TTOTG_prio_5; + __be64 TTOTG_prio_6; + __be64 TTOTG_prio_7; + __be64 TTOTG_novlan; + __be64 TTOTG_loopbk; + + /* total octets of transmitted frames, including framing characters */ + __be64 TTTLOCT_prio_0; + /* total octets of transmitted frames, not including framing characters */ + __be64 TTTLOCT_NOFRM_prio_0; + /* ifOutOctets */ + __be64 TOCT_prio_0; + + __be64 TTTLOCT_prio_1; + __be64 TTTLOCT_NOFRM_prio_1; + __be64 TOCT_prio_1; + + __be64 TTTLOCT_prio_2; + __be64 TTTLOCT_NOFRM_prio_2; + __be64 TOCT_prio_2; + + __be64 TTTLOCT_prio_3; + __be64 TTTLOCT_NOFRM_prio_3; + __be64 TOCT_prio_3; + + __be64 TTTLOCT_prio_4; + __be64 TTTLOCT_NOFRM_prio_4; + __be64 TOCT_prio_4; + + __be64 TTTLOCT_prio_5; + __be64 TTTLOCT_NOFRM_prio_5; + __be64 TOCT_prio_5; + + __be64 TTTLOCT_prio_6; + __be64 TTTLOCT_NOFRM_prio_6; + __be64 TOCT_prio_6; + + __be64 TTTLOCT_prio_7; + __be64 TTTLOCT_NOFRM_prio_7; + __be64 TOCT_prio_7; + + __be64 TTTLOCT_novlan; + __be64 TTTLOCT_NOFRM_novlan; + __be64 TOCT_novlan; + + __be64 TTTLOCT_loopbk; + __be64 TTTLOCT_NOFRM_loopbk; + __be64 TOCT_loopbk; + + /* Total frames transmitted with a good CRC that are not aborted */ + __be64 TTOT_prio_0; + /* Total number of frames transmitted with 802.1Q encapsulation */ + __be64 T1Q_prio_0; + __be64 reserved13; + + __be64 TTOT_prio_1; + __be64 T1Q_prio_1; + __be64 reserved14; + + __be64 TTOT_prio_2; + __be64 T1Q_prio_2; + __be64 reserved15; + + __be64 TTOT_prio_3; + __be64 T1Q_prio_3; + __be64 reserved16; + + __be64 TTOT_prio_4; + __be64 T1Q_prio_4; + __be64 reserved17; + + __be64 TTOT_prio_5; + __be64 T1Q_prio_5; + __be64 reserved18; + + __be64 TTOT_prio_6; + __be64 T1Q_prio_6; + __be64 reserved19; + + __be64 TTOT_prio_7; + __be64 T1Q_prio_7; + __be64 reserved20; + + __be64 TTOT_novlan; + __be64 T1Q_novlan; + __be64 reserved21; + + __be64 TTOT_loopbk; + __be64 T1Q_loopbk; + __be64 reserved22; + + /* Received frames with a length greater than MTU octets and a bad CRC */ + __be32 RJBBR; + /* Received frames with a bad CRC that are not runts, jabbers, + or alignment errors */ + __be32 RCRC; + /* Received frames with SFD with a length of less than 64 octets and a + bad CRC */ + __be32 RRUNT; + /* Received frames with a length less than 64 octets and a good CRC */ + __be32 RSHORT; + /* Total Number of Received Packets Dropped */ + __be32 RDROP; + /* Drop due to overflow */ + __be32 RdropOvflw; + /* Drop due to overflow */ + __be32 RdropLength; + /* Total of good frames. 
Does not include frames received with + frame-too-long, FCS, or length errors */ + __be32 RTOTFRMS; + /* Total dropped Xmited packets */ + __be32 TDROP; +}; + +struct mlx4_func_stat_out_mbox { + __be64 etherStatsDropEvents; + __be64 etherStatsOctets; + __be64 etherStatsPkts; + __be64 etherStatsBroadcastPkts; + __be64 etherStatsMulticastPkts; + __be64 etherStatsCRCAlignErrors; + __be64 etherStatsUndersizePkts; + __be64 etherStatsOversizePkts; + __be64 etherStatsFragments; + __be64 etherStatsJabbers; + __be64 etherStatsCollisions; + __be64 etherStatsPkts64Octets; + __be64 etherStatsPkts65to127Octets; + __be64 etherStatsPkts128to255Octets; + __be64 etherStatsPkts256to511Octets; + __be64 etherStatsPkts512to1023Octets; + __be64 etherStatsPkts1024to1518Octets; +}; + +struct mlx4_mpt_entry { + __be32 flags; + __be32 qpn; + __be32 key; + __be32 pd_flags; + __be64 start; + __be64 length; + __be32 lkey; + __be32 win_cnt; + u8 reserved1; + u8 flags2; + u8 reserved2; + u8 mtt_rep; + __be64 mtt_seg; + __be32 mtt_sz; + __be32 entity_size; + __be32 first_byte_offset; +} __attribute__((packed)); + + +/* + * Must be packed because start is 64 bits but only aligned to 32 bits. + */ +struct mlx4_eq_context { + __be32 flags; + u16 reserved1[3]; + __be16 page_offset; + u8 log_eq_size; + u8 reserved2[4]; + u8 eq_period; + u8 reserved3; + u8 eq_max_count; + u8 reserved4[3]; + u8 intr; + u8 log_page_size; + u8 reserved5[2]; + u8 mtt_base_addr_h; + __be32 mtt_base_addr_l; + u32 reserved6[2]; + __be32 consumer_index; + __be32 producer_index; + u32 reserved7[4]; +}; + +struct mlx4_cq_context { + __be32 flags; + u16 reserved1[3]; + __be16 page_offset; + __be32 logsize_usrpage; + __be16 cq_period; + __be16 cq_max_count; + u8 reserved2[3]; + u8 comp_eqn; + u8 log_page_size; + u8 reserved3[2]; + u8 mtt_base_addr_h; + __be32 mtt_base_addr_l; + __be32 last_notified_index; + __be32 solicit_producer_index; + __be32 consumer_index; + __be32 producer_index; + u32 reserved4[2]; + __be64 db_rec_addr; +}; + +struct mlx4_srq_context { + __be32 state_logsize_srqn; + u8 logstride; + u8 reserved1; + __be16 xrc_domain; + __be32 pg_offset_cqn; + u32 reserved2; + u8 log_page_size; + u8 reserved3[2]; + u8 mtt_base_addr_h; + __be32 mtt_base_addr_l; + __be32 pd; + __be16 limit_watermark; + __be16 wqe_cnt; + u16 reserved4; + __be16 wqe_counter; + u32 reserved5; + __be64 db_rec_addr; +}; + +struct mlx4_eth_common_counters { + /* bad packets received */ + unsigned long rx_errors; + /* packet transmit problems */ + unsigned long tx_errors; + /* multicast packets received */ + unsigned long multicast; + unsigned long rx_length_errors; + /* receiver ring buff overflow */ + unsigned long rx_over_errors; + /* recved pkt with crc error */ + unsigned long rx_crc_errors; + /* recv'r fifo overrun */ + unsigned long rx_fifo_errors; + /* receiver missed packet */ + unsigned long rx_missed_errors; + unsigned long broadcast; +}; + +struct mlx4_wol_struct { + __be32 flags; + __be32 preserved1; +}; + +struct mlx4_enable_fmr_mbox { + /* protocol KVM = 0, XEN = 1 */ + u8 protocol; + /* size of protocol specific private info */ + u8 fmr_info_size; + /* data size added by the protocol on vpm struct */ + u8 vpm_info_size; + /* log of number of 4K pages be used in FMR ICM mappings */ + u8 log_page_size; + /* reserved */ + __be32 reserved[2]; + /* mpt index assigned by firmware for the vf */ + __be32 base_mpt_entry; /* mpts number is taken from QUERY_HCA */ + /* protocol specific private info */ + u8 fmr_info[0]; +}; + +int mlx4_DUMP_ETH_STATS(struct mlx4_dev 
*dev, u8 port, u8 reset, + struct mlx4_eth_common_counters *stats); + #define mlx4_foreach_port(port, dev, type) \ for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ - if (((type) == MLX4_PORT_TYPE_IB ? (dev)->caps.port_mask : \ - ~(dev)->caps.port_mask) & 1 << ((port) - 1)) + if ((type) == (dev)->caps.port_mask[(port)]) -#define mlx4_foreach_ib_transport_port(port, dev) \ - for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ - if (((dev)->caps.port_mask & 1 << ((port) - 1)) || \ - ((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE)) +#define mlx4_foreach_ib_transport_port(port, dev) \ + for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ + if (((dev)->caps.port_mask[port] == MLX4_PORT_TYPE_IB) || \ + ((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE)) + +static inline int mlx4_is_master(struct mlx4_dev *dev) +{ + return dev->flags & MLX4_FLAG_MASTER; +} + +static inline int mlx4_is_mfunc(struct mlx4_dev *dev) +{ + return dev->flags & (MLX4_FLAG_MFUNC | MLX4_FLAG_MASTER); +} + +static inline int mlx4_is_qp_reserved(struct mlx4_dev *dev, u32 qpn) +{ + return (qpn < dev->caps.tunnel_qpn + 8 + + 16 * MLX4_MFUNC_MAX * !!mlx4_is_master(dev)); +} +static inline int mlx4_is_guest_proxy(struct mlx4_dev *dev, int slave, u32 qpn) +{ + int base = dev->caps.tunnel_qpn + 8 + slave * 8; + + if (qpn >= base && qpn < base + 8) + return 1; + + return 0; +} int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, struct mlx4_buf *buf); @@ -486,21 +1274,41 @@ static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset) (offset & (PAGE_SIZE - 1)); } +static inline u32 key_to_hw_index(u32 key) +{ + return (key << 24) | (key >> 8); +} + +static inline u32 key_to_mpt_index(struct mlx4_dev *dev, u32 key) +{ + return key_to_hw_index(key) & (dev->caps.num_mpts - 1); +} + int mlx4_pd_alloc(struct mlx4_dev *dev, u32 *pdn); void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn); +int mlx4_xrcd_alloc(struct mlx4_dev *dev, u32 *xrcdn); +void mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn); + int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar); void mlx4_uar_free(struct mlx4_dev *dev, struct mlx4_uar *uar); int mlx4_bf_alloc(struct mlx4_dev *dev, struct mlx4_bf *bf); void mlx4_bf_free(struct mlx4_dev *dev, struct mlx4_bf *bf); int mlx4_mtt_init(struct mlx4_dev *dev, int npages, int page_shift, - struct mlx4_mtt *mtt); -void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt); + struct mlx4_mtt *mtt, enum mlx4_mr_flags flags); +void mlx4_mtt_cleanup(struct mlx4_dev *dev, struct mlx4_mtt *mtt, + enum mlx4_mr_flags flags); u64 mlx4_mtt_addr(struct mlx4_dev *dev, struct mlx4_mtt *mtt); +int mlx4_mr_reserve_range(struct mlx4_dev *dev, int cnt, int align, u32 *base_mridx); +void mlx4_mr_release_range(struct mlx4_dev *dev, u32 base_mridx, int cnt); +int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd, + u64 iova, u64 size, u32 access, int npages, + int page_shift, struct mlx4_mr *mr); int mlx4_mr_alloc(struct mlx4_dev *dev, u32 pd, u64 iova, u64 size, u32 access, int npages, int page_shift, struct mlx4_mr *mr); +void mlx4_mr_free_reserved(struct mlx4_dev *dev, struct mlx4_mr *mr); void mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr); int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr); int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, @@ -527,19 +1335,24 @@ void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt); int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp); void mlx4_qp_free(struct mlx4_dev 
*dev, struct mlx4_qp *qp); -int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, struct mlx4_mtt *mtt, - u64 db_rec, struct mlx4_srq *srq); +int mlx4_srq_alloc(struct mlx4_dev *dev, u32 pdn, u32 cqn, u16 xrcd, + struct mlx4_mtt *mtt, u64 db_rec, struct mlx4_srq *srq); void mlx4_srq_free(struct mlx4_dev *dev, struct mlx4_srq *srq); int mlx4_srq_arm(struct mlx4_dev *dev, struct mlx4_srq *srq, int limit_watermark); int mlx4_srq_query(struct mlx4_dev *dev, struct mlx4_srq *srq, int *limit_watermark); +int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, + u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx); +int mlx4_SET_PORT_qpn_calc(struct mlx4_dev *dev, u8 port, u32 base_qpn, + u8 promisc); + int mlx4_INIT_PORT(struct mlx4_dev *dev, int port); int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port); int mlx4_multicast_attach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], - int block_mcast_loopback, enum mlx4_protocol protocol); + int block_mcast_loopback, enum mlx4_protocol prot); int mlx4_multicast_detach(struct mlx4_dev *dev, struct mlx4_qp *qp, u8 gid[16], - enum mlx4_protocol protocol); + enum mlx4_protocol prot); int mlx4_multicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port); int mlx4_multicast_promisc_remove(struct mlx4_dev *dev, u32 qpn, u8 port); int mlx4_unicast_promisc_add(struct mlx4_dev *dev, u32 qpn, u8 port); @@ -548,26 +1361,57 @@ int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mo int mlx4_register_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *qpn, u8 wrap); void mlx4_unregister_mac(struct mlx4_dev *dev, u8 port, int qpn); -int mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac, u8 wrap); +int mlx4_replace_mac(struct mlx4_dev *dev, u8 port, int qpn, u64 new_mac); int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx); int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index); void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, int index); +int mlx4_map_phys_fmr_fbo(struct mlx4_dev *dev, struct mlx4_fmr *fmr, + u64 *page_list, int npages, u64 iova, u32 fbo, + u32 len, u32 *lkey, u32 *rkey, int same_key); +int mlx4_set_fmr_pd(struct mlx4_fmr *fmr, u32 pd); int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list, int npages, u64 iova, u32 *lkey, u32 *rkey); +int mlx4_fmr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd, + u32 access, int max_pages, int max_maps, + u8 page_shift, struct mlx4_fmr *fmr); int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages, int max_maps, u8 page_shift, struct mlx4_fmr *fmr); int mlx4_fmr_enable(struct mlx4_dev *dev, struct mlx4_fmr *fmr); void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u32 *lkey, u32 *rkey); +int mlx4_fmr_free_reserved(struct mlx4_dev *dev, struct mlx4_fmr *fmr); int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr); int mlx4_SYNC_TPT(struct mlx4_dev *dev); +int mlx4_query_diag_counters(struct mlx4_dev *mlx4_dev, int array_length, + u8 op_modifier, u32 in_offset[], u32 counter_out[]); int mlx4_test_interrupts(struct mlx4_dev *dev); -int mlx4_assign_eq(struct mlx4_dev *dev, char* name , int* vector); -void mlx4_release_eq(struct mlx4_dev *dev, int vec); +int mlx4_QUERY_PORT(struct mlx4_dev *dev, void *outbox, u8 port); + +void mlx4_get_fc_t11_settings(struct mlx4_dev *dev, int *enable_pre_t11, int *t11_supported); + +int __mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx); +void __mlx4_counter_free(struct mlx4_dev *dev, u32 idx); +int 
mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx); +void mlx4_counter_free(struct mlx4_dev *dev, u32 idx); + +int mlx4_wol_read(struct mlx4_dev *dev, struct mlx4_wol_struct *output, int port); +int mlx4_wol_write(struct mlx4_dev *dev, struct mlx4_wol_struct *input, int port); +int mlx4_GET_PKEY_TABLE(struct mlx4_dev *dev, u8 port, u8 table[]); +int mlx4_gen_pkey_eqe(struct mlx4_dev *dev, int slave, u8 port); + +int mlx4_is_slave_active(struct mlx4_dev *dev, int slave); +void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, int port, int i, int val); -int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port); -int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port); +int mlx4_gen_guid_change_eqe(struct mlx4_dev *dev, int slave, u8 port); +int mlx4_gen_all_sw_eqe(struct mlx4_dev *dev, u8 port, int avial); +int mlx4_get_parav_qkey(struct mlx4_dev *dev, u32 qpn, u32 *qkey); +void mlx4_gen_port_state_change_eqe(struct mlx4_dev *dev, int slave, u8 port, u8 port_subtype_change); +enum slave_port_state mlx4_get_slave_port_state(struct mlx4_dev *dev, int slave, u8 port); +int set_and_calc_slave_port_state(struct mlx4_dev *dev, int slave, u8 port, int event, enum slave_port_gen_event* gen_event); +#define MLX4_NOT_SET_GUID cpu_to_be64(0ULL) +void mlx4_slave_handle_guid(struct mlx4_dev *dev, int slave_id, u8 port_num, __be64 cur_ag); +int mlx4_gid_idx_to_slave(struct mlx4_dev *dev, int gid_index); #endif /* MLX4_DEVICE_H */ diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h index e1eebf78caba1..0b842432e2fbd 100644 --- a/include/linux/mlx4/driver.h +++ b/include/linux/mlx4/driver.h @@ -34,7 +34,6 @@ #define MLX4_DRIVER_H #include -#include struct mlx4_dev; @@ -43,21 +42,41 @@ enum mlx4_dev_event { MLX4_DEV_EVENT_PORT_UP, MLX4_DEV_EVENT_PORT_DOWN, MLX4_DEV_EVENT_PORT_REINIT, + MLX4_DEV_EVENT_PKEY_UPDATE, + MLX4_DEV_EVENT_GUID_CHANGE, + MLX4_DEV_EVENT_SLAVE_INIT, + MLX4_DEV_EVENT_SLAVE_SHUTDOWN, + MLX4_DEV_EVENT_LID_CHANGE, + MLX4_DEV_EVENT_CLIENT_REREGISTER, + MLX4_DEV_EVENT_PORT_MGMT_CHANGE +}; + +enum mlx4_query_reply { + MLX4_QUERY_NOT_MINE = -1, + MLX4_QUERY_MINE_NOPORT = 0 +}; + +enum mlx4_prot { + MLX4_PROT_IB, + MLX4_PROT_EN, }; struct mlx4_interface { void * (*add) (struct mlx4_dev *dev); void (*remove)(struct mlx4_dev *dev, void *context); void (*event) (struct mlx4_dev *dev, void *context, - enum mlx4_dev_event event, int port); - void * (*get_dev)(struct mlx4_dev *dev, void *context, u8 port); + enum mlx4_dev_event event, unsigned long param); + void * (*get_prot_dev) (struct mlx4_dev *dev, void *context, u8 port); + enum mlx4_prot protocol; + + enum mlx4_query_reply (*query) (void *context, void *); struct list_head list; - enum mlx4_protocol protocol; }; int mlx4_register_interface(struct mlx4_interface *intf); void mlx4_unregister_interface(struct mlx4_interface *intf); +void *mlx4_get_prot_dev(struct mlx4_dev *dev, enum mlx4_prot proto, int port); -void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int port); +struct mlx4_dev *mlx4_query_interface(void *, int *port); #endif /* MLX4_DRIVER_H */ diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index aa2c829d55bac..367d45b3f927f 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -54,7 +54,8 @@ enum mlx4_qp_optpar { MLX4_QP_OPTPAR_RETRY_COUNT = 1 << 12, MLX4_QP_OPTPAR_RNR_RETRY = 1 << 13, MLX4_QP_OPTPAR_ACK_TIMEOUT = 1 << 14, - MLX4_QP_OPTPAR_SCHED_QUEUE = 1 << 16 + MLX4_QP_OPTPAR_SCHED_QUEUE = 1 << 16, + MLX4_QP_OPTPAR_COUNTER_INDEX = 1 << 20 }; 
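A minimal sketch of a consumer of the reworked struct mlx4_interface above may help clarify the new callback contract: the per-port get_dev hook is replaced by get_prot_dev, the event handler now receives an opaque unsigned long param instead of an int port, and the interface is tagged with an mlx4_prot value. Everything prefixed mydrv_ below, the private context layout, and the assumption that param carries the port number for port events are illustrative guesses, not part of this patch; only the mlx4_interface fields, the mlx4_dev_event and mlx4_prot enums, and the register/unregister calls come from the hunks above.

/*
 * Illustrative sketch only -- not part of the merged OFED code.
 */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mlx4/driver.h>

struct mydrv_ctx {
        struct mlx4_dev *dev;
};

static void *mydrv_add(struct mlx4_dev *dev)
{
        struct mydrv_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

        if (!ctx)
                return NULL;
        ctx->dev = dev;
        return ctx;             /* becomes 'context' in the other callbacks */
}

static void mydrv_remove(struct mlx4_dev *dev, void *context)
{
        kfree(context);
}

static void mydrv_event(struct mlx4_dev *dev, void *context,
                        enum mlx4_dev_event event, unsigned long param)
{
        /* 'param' replaces the old 'int port' argument; for port events it
         * presumably carries the port number (assumption). */
        if (event == MLX4_DEV_EVENT_PORT_UP)
                pr_info("mydrv: port %lu is up\n", param);
}

static struct mlx4_interface mydrv_interface = {
        .add            = mydrv_add,
        .remove         = mydrv_remove,
        .event          = mydrv_event,
        .protocol       = MLX4_PROT_EN,
};

static int __init mydrv_init(void)
{
        return mlx4_register_interface(&mydrv_interface);
}
module_init(mydrv_init);

static void __exit mydrv_exit(void)
{
        mlx4_unregister_interface(&mydrv_interface);
}
module_exit(mydrv_exit);

MODULE_LICENSE("GPL");

With the protocol field set, the core's mlx4_get_prot_dev(dev, MLX4_PROT_EN, port) would presumably be routed to this interface's get_prot_dev callback when one is supplied; the sketch omits that callback (and query) for brevity.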
enum mlx4_qp_state { @@ -74,6 +75,7 @@ enum { MLX4_QP_ST_UC = 0x1, MLX4_QP_ST_RD = 0x2, MLX4_QP_ST_UD = 0x3, + MLX4_QP_ST_XRC = 0x6, MLX4_QP_ST_MLX = 0x7 }; @@ -97,9 +99,10 @@ enum { struct mlx4_qp_path { u8 fl; - u8 reserved1[2]; + u8 reserved1[1]; + u8 disable_pkey_check; u8 pkey_index; - u8 reserved2; + u8 counter_index; u8 grh_mylmc; __be16 rlid; u8 ackto; @@ -111,8 +114,7 @@ struct mlx4_qp_path { u8 sched_queue; u8 vlan_index; u8 reserved3[2]; - u8 counter_index; - u8 reserved4; + u8 reserved4[2]; u8 dmac[6]; }; @@ -137,7 +139,7 @@ struct mlx4_qp_context { __be32 ssn; __be32 params2; __be32 rnr_nextrecvpsn; - __be32 srcd; + __be32 xrcd; __be32 cqn_recv; __be64 db_rec_addr; __be32 qkey; @@ -152,7 +154,16 @@ struct mlx4_qp_context { u8 reserved4[2]; u8 mtt_base_addr_h; __be32 mtt_base_addr_l; - u32 reserved5[10]; + u8 VE; + u8 reserved5; + __be16 VFT_id_prio; + u8 reserved6; + u8 exch_size; + __be16 exch_base; + u8 VFT_hop_cnt; + u8 my_fc_id_idx; + __be16 reserved7; + u32 reserved8[7]; }; /* Which firmware version adds support for NEC (NoErrorCompletion) bit */ @@ -182,7 +193,6 @@ struct mlx4_wqe_ctrl_seg { * [4] IP checksum * [3:2] C (generate completion queue entry) * [1] SE (solicited event) - * [0] FL (force loopback) */ __be32 srcrb_flags; /* @@ -195,7 +205,8 @@ struct mlx4_wqe_ctrl_seg { enum { MLX4_WQE_MLX_VL15 = 1 << 17, - MLX4_WQE_MLX_SLR = 1 << 16 + MLX4_WQE_MLX_SLR = 1 << 16, + MLX4_WQE_MLX_ICRC = 1 << 4 }; struct mlx4_wqe_mlx_seg { @@ -329,5 +340,7 @@ static inline struct mlx4_qp *__mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn) } void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp); +int mlx4_qp_get_region(struct mlx4_dev *dev, enum mlx4_qp_region region, + int *base_qpn, int *cnt); #endif /* MLX4_QP_H */ diff --git a/include/linux/mlx4/srq.h b/include/linux/mlx4/srq.h index 799a0697a3835..5e041e5fe06f3 100644 --- a/include/linux/mlx4/srq.h +++ b/include/linux/mlx4/srq.h @@ -33,10 +33,22 @@ #ifndef MLX4_SRQ_H #define MLX4_SRQ_H +#include +#include + struct mlx4_wqe_srq_next_seg { u16 reserved1; __be16 next_wqe_index; u32 reserved2[3]; }; +void mlx4_srq_invalidate(struct mlx4_dev *dev, struct mlx4_srq *srq); +void mlx4_srq_remove(struct mlx4_dev *dev, struct mlx4_srq *srq); + +static inline struct mlx4_srq *__mlx4_srq_lookup(struct mlx4_dev *dev, u32 srqn) +{ + return radix_tree_lookup(&dev->srq_table_tree, + srqn & (dev->caps.num_srqs - 1)); +} + #endif /* MLX4_SRQ_H */ diff --git a/include/linux/rds.h b/include/linux/rds.h index 91950950aa598..ec0a19475a3b9 100644 --- a/include/linux/rds.h +++ b/include/linux/rds.h @@ -36,18 +36,36 @@ #include +/* These sparse annotated types shouldn't be in any user + * visible header file. We should clean this up rather + * than kludging around them. */ +#ifndef __KERNEL__ +#define __be16 u_int16_t +#define __be32 u_int32_t +#define __be64 u_int64_t +#endif + #define RDS_IB_ABI_VERSION 0x301 /* * setsockopt/getsockopt for SOL_RDS */ -#define RDS_CANCEL_SENT_TO 1 +#define RDS_CANCEL_SENT_TO 1 #define RDS_GET_MR 2 #define RDS_FREE_MR 3 /* deprecated: RDS_BARRIER 4 */ #define RDS_RECVERR 5 #define RDS_CONG_MONITOR 6 #define RDS_GET_MR_FOR_DEST 7 +#define RDS_CONN_RESET 8 + +/* + * ioctl commands for SOL_RDS +*/ +#define SIOCRDSSETTOS (SIOCPROTOPRIVATE) + +typedef u_int8_t rds_tos_t; + /* * Control message types for SOL_RDS. @@ -65,18 +83,19 @@ * R_Key along in an RDS extension header. * The cmsg_data is a struct rds_get_mr_args, * the same as for the GET_MR setsockopt. 
- * RDS_CMSG_RDMA_STATUS (recvmsg) - * Returns the status of a completed RDMA operation. + * RDS_CMSG_RDMA_SEND_STATUS (recvmsg) + * Returns the status of a completed RDMA/async send operation. */ #define RDS_CMSG_RDMA_ARGS 1 #define RDS_CMSG_RDMA_DEST 2 #define RDS_CMSG_RDMA_MAP 3 -#define RDS_CMSG_RDMA_STATUS 4 +#define RDS_CMSG_RDMA_SEND_STATUS 4 #define RDS_CMSG_CONG_UPDATE 5 #define RDS_CMSG_ATOMIC_FADD 6 #define RDS_CMSG_ATOMIC_CSWP 7 -#define RDS_CMSG_MASKED_ATOMIC_FADD 8 -#define RDS_CMSG_MASKED_ATOMIC_CSWP 9 +#define RDS_CMSG_MASKED_ATOMIC_FADD 8 +#define RDS_CMSG_MASKED_ATOMIC_CSWP 9 +#define RDS_CMSG_ASYNC_SEND 10 #define RDS_INFO_FIRST 10000 #define RDS_INFO_COUNTERS 10000 @@ -93,46 +112,57 @@ #define RDS_INFO_LAST 10010 struct rds_info_counter { - uint8_t name[32]; - uint64_t value; + u_int8_t name[32]; + u_int64_t value; } __attribute__((packed)); #define RDS_INFO_CONNECTION_FLAG_SENDING 0x01 #define RDS_INFO_CONNECTION_FLAG_CONNECTING 0x02 #define RDS_INFO_CONNECTION_FLAG_CONNECTED 0x04 +#define RDS_INFO_CONNECTION_FLAG_ERROR 0x08 #define TRANSNAMSIZ 16 struct rds_info_connection { - uint64_t next_tx_seq; - uint64_t next_rx_seq; + u_int64_t next_tx_seq; + u_int64_t next_rx_seq; __be32 laddr; __be32 faddr; - uint8_t transport[TRANSNAMSIZ]; /* null term ascii */ - uint8_t flags; + u_int8_t transport[TRANSNAMSIZ]; /* null term ascii */ + u_int8_t flags; + u_int8_t tos; +} __attribute__((packed)); + +struct rds_info_flow { + __be32 laddr; + __be32 faddr; + u_int32_t bytes; + __be16 lport; + __be16 fport; } __attribute__((packed)); #define RDS_INFO_MESSAGE_FLAG_ACK 0x01 #define RDS_INFO_MESSAGE_FLAG_FAST_ACK 0x02 struct rds_info_message { - uint64_t seq; - uint32_t len; + u_int64_t seq; + u_int32_t len; __be32 laddr; __be32 faddr; __be16 lport; __be16 fport; - uint8_t flags; + u_int8_t flags; + u_int8_t tos; } __attribute__((packed)); struct rds_info_socket { - uint32_t sndbuf; + u_int32_t sndbuf; __be32 bound_addr; __be32 connected_addr; __be16 bound_port; __be16 connected_port; - uint32_t rcvbuf; - uint64_t inum; + u_int32_t rcvbuf; + u_int64_t inum; } __attribute__((packed)); struct rds_info_tcp_socket { @@ -140,11 +170,11 @@ struct rds_info_tcp_socket { __be16 local_port; __be32 peer_addr; __be16 peer_port; - uint64_t hdr_rem; - uint64_t data_rem; - uint32_t last_sent_nxt; - uint32_t last_expected_una; - uint32_t last_seen_una; + u_int64_t hdr_rem; + u_int64_t data_rem; + u_int32_t last_sent_nxt; + u_int32_t last_expected_una; + u_int32_t last_seen_una; } __attribute__((packed)); #define RDS_IB_GID_LEN 16 @@ -159,6 +189,9 @@ struct rds_info_rdma_connection { uint32_t max_send_sge; uint32_t rdma_mr_max; uint32_t rdma_mr_size; + uint8_t tos; + uint8_t sl; + uint32_t cache_allocs; }; /* @@ -199,77 +232,71 @@ struct rds_info_rdma_connection { * (so that the application does not have to worry about * alignment). 
*/ -typedef uint64_t rds_rdma_cookie_t; +typedef u_int64_t rds_rdma_cookie_t; struct rds_iovec { - uint64_t addr; - uint64_t bytes; + u_int64_t addr; + u_int64_t bytes; }; struct rds_get_mr_args { struct rds_iovec vec; - uint64_t cookie_addr; + u_int64_t cookie_addr; uint64_t flags; }; struct rds_get_mr_for_dest_args { struct sockaddr_storage dest_addr; - struct rds_iovec vec; - uint64_t cookie_addr; + struct rds_iovec vec; + u_int64_t cookie_addr; uint64_t flags; }; struct rds_free_mr_args { rds_rdma_cookie_t cookie; - uint64_t flags; + u_int64_t flags; }; struct rds_rdma_args { rds_rdma_cookie_t cookie; struct rds_iovec remote_vec; - uint64_t local_vec_addr; - uint64_t nr_local; - uint64_t flags; - uint64_t user_token; + u_int64_t local_vec_addr; + u_int64_t nr_local; + u_int64_t flags; + u_int64_t user_token; }; struct rds_atomic_args { rds_rdma_cookie_t cookie; - uint64_t local_addr; - uint64_t remote_addr; - union { - struct { - uint64_t compare; - uint64_t swap; - } cswp; - struct { - uint64_t add; - } fadd; - struct { - uint64_t compare; - uint64_t swap; - uint64_t compare_mask; - uint64_t swap_mask; - } m_cswp; - struct { - uint64_t add; - uint64_t nocarry_mask; - } m_fadd; - }; - uint64_t flags; - uint64_t user_token; + uint64_t local_addr; + uint64_t remote_addr; + uint64_t swap_add; + uint64_t compare; + u_int64_t flags; + u_int64_t user_token; }; -struct rds_rdma_notify { - uint64_t user_token; +struct rds_reset { + u_int8_t tos; + struct in_addr src; + struct in_addr dst; +}; + +struct rds_asend_args { + u_int64_t user_token; + u_int64_t flags; +}; + +struct rds_rdma_send_notify { + u_int64_t user_token; int32_t status; }; -#define RDS_RDMA_SUCCESS 0 +#define RDS_RDMA_SEND_SUCCESS 0 #define RDS_RDMA_REMOTE_ERROR 1 -#define RDS_RDMA_CANCELED 2 -#define RDS_RDMA_DROPPED 3 -#define RDS_RDMA_OTHER_ERROR 4 +#define RDS_RDMA_SEND_CANCELED 2 +#define RDS_RDMA_SEND_DROPPED 3 +#define RDS_RDMA_SEND_OTHER_ERROR 4 /* * Common set of flags for all RDMA related structs @@ -281,5 +308,8 @@ struct rds_rdma_notify { #define RDS_RDMA_DONTWAIT 0x0010 /* Don't wait in SET_BARRIER */ #define RDS_RDMA_NOTIFY_ME 0x0020 /* Notify when operation completes */ #define RDS_RDMA_SILENT 0x0040 /* Do not interrupt remote */ +#define RDS_RDMA_REMOTE_COMPLETE 0x0080 /* Notify when data is available */ +#define RDS_SEND_NOTIFY_ME 0x0100 /* Notify when operation completes */ + #endif /* IB_RDS_H */ diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index ae8c68f30f1bc..d054f83b86267 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -39,9 +39,10 @@ #include #include #include -#include #include #include +#include +#include struct rdma_addr_client { atomic_t refcount; @@ -133,34 +134,31 @@ static inline int rdma_addr_gid_offset(struct rdma_dev_addr *dev_addr) static inline void iboe_mac_vlan_to_ll(union ib_gid *gid, u8 *mac, u16 vid) { memset(gid->raw, 0, 16); - *((__be32 *) gid->raw) = cpu_to_be32(0xfe800000); - if (vid < 0x1000) { + *((u32 *)gid->raw) = cpu_to_be32(0xfe800000); + if (vid) { gid->raw[12] = vid & 0xff; gid->raw[11] = vid >> 8; } else { gid->raw[12] = 0xfe; gid->raw[11] = 0xff; } + memcpy(gid->raw + 13, mac + 3, 3); memcpy(gid->raw + 8, mac, 3); gid->raw[8] ^= 2; } -static inline u16 rdma_vlan_dev_vlan_id(const struct net_device *dev) -{ - return dev->priv_flags & IFF_802_1Q_VLAN ? 
- vlan_dev_vlan_id(dev) : 0xffff; -} - static inline void iboe_addr_get_sgid(struct rdma_dev_addr *dev_addr, - union ib_gid *gid) + union ib_gid *gid) { struct net_device *dev; - u16 vid = 0xffff; + u16 vid = 0; dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); if (dev) { - vid = rdma_vlan_dev_vlan_id(dev); +#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) + vid = vlan_dev_vlan_id(dev); +#endif dev_put(dev); } @@ -217,19 +215,18 @@ static inline enum ib_mtu iboe_get_mtu(int mtu) static inline int iboe_get_rate(struct net_device *dev) { struct ethtool_cmd cmd; - u32 speed; - if (dev_ethtool_get_settings(dev, &cmd)) + if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings || + dev->ethtool_ops->get_settings(dev, &cmd)) return IB_RATE_PORT_CURRENT; - speed = ethtool_cmd_speed(&cmd); - if (speed >= 40000) + if (cmd.speed >= 40000) return IB_RATE_40_GBPS; - else if (speed >= 30000) + else if (cmd.speed >= 30000) return IB_RATE_30_GBPS; - else if (speed >= 20000) + else if (cmd.speed >= 20000) return IB_RATE_20_GBPS; - else if (speed >= 10000) + else if (cmd.speed >= 10000) return IB_RATE_10_GBPS; else return IB_RATE_PORT_CURRENT; @@ -237,7 +234,7 @@ static inline int iboe_get_rate(struct net_device *dev) static inline int rdma_link_local_addr(struct in6_addr *addr) { - if (addr->s6_addr32[0] == htonl(0xfe800000) && + if (addr->s6_addr32[0] == cpu_to_be32(0xfe800000) && addr->s6_addr32[1] == 0) return 1; @@ -253,7 +250,7 @@ static inline void rdma_get_ll_mac(struct in6_addr *addr, u8 *mac) static inline int rdma_is_multicast_addr(struct in6_addr *addr) { - return addr->s6_addr[0] == 0xff; + return addr->s6_addr[0] == 0xff ? 1 : 0; } static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac) @@ -271,13 +268,7 @@ static inline u16 rdma_get_vlan_id(union ib_gid *dgid) u16 vid; vid = dgid->raw[11] << 8 | dgid->raw[12]; - return vid < 0x1000 ? vid : 0xffff; -} - -static inline struct net_device *rdma_vlan_dev_real_dev(const struct net_device *dev) -{ - return dev->priv_flags & IFF_802_1Q_VLAN ? - vlan_dev_real_dev(dev) : 0; + return vid == 0xfffe ? 0 : vid & 0xfff; } #endif /* IB_ADDR_H */ diff --git a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h index 00a2b8ec327f7..ad9a3c280944f 100644 --- a/include/rdma/ib_cache.h +++ b/include/rdma/ib_cache.h @@ -100,6 +100,22 @@ int ib_find_cached_pkey(struct ib_device *device, u16 pkey, u16 *index); +/** + * ib_find_exact_cached_pkey - Returns the PKey table index where a specified + * PKey value occurs. Comparison uses the FULL 16 bits (incl membership bit) + * @device: The device to query. + * @port_num: The port number of the device to search for the PKey. + * @pkey: The PKey value to search for. + * @index: The index into the cached PKey table where the PKey was found. + * + * ib_find_exact_cached_pkey() searches the specified PKey table in + * the local software cache. + */ +int ib_find_exact_cached_pkey(struct ib_device *device, + u8 port_num, + u16 pkey, + u16 *index); + /** * ib_get_cached_lmc - Returns a cached lmc table entry * @device: The device to query. 
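The new ib_find_exact_cached_pkey() documented above differs from ib_find_cached_pkey() in that the comparison uses the full 16 bits of the PKey, membership bit included, so a limited-membership key and its full-membership counterpart resolve to different table slots. A short caller is sketched below under that assumption; example_resolve_pkey_index() and its fallback policy are hypothetical, only the two cache helpers come from this patch.

/*
 * Sketch, not part of the patch: resolve a PKey table index, first with
 * the exact 16-bit comparison added above, then with the classic lookup
 * that ignores the membership bit.
 */
#include <rdma/ib_cache.h>

static int example_resolve_pkey_index(struct ib_device *device, u8 port_num,
                                      u16 pkey, u16 *index)
{
        int ret;

        /* Exact match: 0x8001 (full member) and 0x0001 (limited member)
         * are treated as distinct values here. */
        ret = ib_find_exact_cached_pkey(device, port_num, pkey, index);
        if (!ret)
                return 0;

        /* Fall back to the partial-match lookup from the existing cache API. */
        return ib_find_cached_pkey(device, port_num, pkey, index);
}

Both helpers return 0 and fill *index when the key is found; code that must distinguish membership, for example when selecting a specific pkey_index to program into a QP, would be the intended user of the exact variant, while callers that only care about partition reachability can keep using ib_find_cached_pkey().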
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h index c8f94e8db69c9..a208ceac98c86 100644 --- a/include/rdma/ib_cm.h +++ b/include/rdma/ib_cm.h @@ -66,6 +66,13 @@ enum ib_cm_lap_state { IB_CM_MRA_LAP_RCVD, }; +enum ib_cm_sap_state { + IB_CM_SAP_UNINIT, + IB_CM_SAP_IDLE, + IB_CM_SAP_SENT, + IB_CM_SAP_RCVD, +}; + enum ib_cm_event_type { IB_CM_REQ_ERROR, IB_CM_REQ_RECEIVED, @@ -84,7 +91,9 @@ enum ib_cm_event_type { IB_CM_APR_RECEIVED, IB_CM_SIDR_REQ_ERROR, IB_CM_SIDR_REQ_RECEIVED, - IB_CM_SIDR_REP_RECEIVED + IB_CM_SIDR_REP_RECEIVED, + IB_CM_SAP_RECEIVED, + IB_CM_SPR_RECEIVED }; enum ib_cm_data_size { @@ -194,6 +203,10 @@ struct ib_cm_lap_event_param { struct ib_sa_path_rec *alternate_path; }; +struct ib_cm_sap_event_param { + struct ib_sa_path_rec *alternate_path; +}; + enum ib_cm_apr_status { IB_CM_APR_SUCCESS, IB_CM_APR_INVALID_COMM_ID, @@ -211,12 +224,24 @@ enum ib_cm_apr_status { IB_CM_APR_INVALID_SL }; +enum ib_cm_spr_status { + IB_CM_SPR_SUCCESS, + IB_CM_SPR_BUSY, + IB_CM_SPR_REJECT, +}; + struct ib_cm_apr_event_param { enum ib_cm_apr_status ap_status; void *apr_info; u8 info_len; }; +struct ib_cm_spr_event_param { + enum ib_cm_apr_status ap_status; + void *spr_info; + u8 info_len; +}; + struct ib_cm_sidr_req_event_param { struct ib_cm_id *listen_id; u8 port; @@ -249,7 +274,9 @@ struct ib_cm_event { struct ib_cm_rej_event_param rej_rcvd; struct ib_cm_mra_event_param mra_rcvd; struct ib_cm_lap_event_param lap_rcvd; + struct ib_cm_sap_event_param sap_rcvd; struct ib_cm_apr_event_param apr_rcvd; + struct ib_cm_spr_event_param spr_rcvd; /* No data for DREQ/DREP received events. */ struct ib_cm_sidr_req_event_param sidr_req_rcvd; struct ib_cm_sidr_rep_event_param sidr_rep_rcvd; @@ -286,9 +313,17 @@ struct ib_cm_id { __be64 service_mask; enum ib_cm_state state; /* internal CM/debug use */ enum ib_cm_lap_state lap_state; /* internal CM/debug use */ + enum ib_cm_sap_state sap_state; /* internal CM/debug use */ __be32 local_id; __be32 remote_id; u32 remote_cm_qpn; /* 1 unless redirected */ + + /* + * used by the passive side to indicate whether the active peer + * supports SAP + */ + u8 remote_sap_support; + int sap_support_disabled; }; /** @@ -482,7 +517,7 @@ int ib_send_cm_rej(struct ib_cm_id *cm_id, * message. * @cm_id: Connection identifier associated with the connection message. * @service_timeout: The lower 5-bits specify the maximum time required for - * the sender to reply to the connection message. The upper 3-bits + * the sender to reply to to the connection message. The upper 3-bits * specify additional control flags. * @private_data: Optional user-defined private data sent with the * message receipt acknowledgement. @@ -493,6 +528,40 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id, const void *private_data, u8 private_data_len); +/** + * ib_send_cm_sap - Sends a suggest alternate path request. + * @cm_id: Connection identifier associated with the suggest alternate path + * message. + * @alternate_path: A path record that identifies the suggested path. + * + * @private_data: Optional user-defined private data sent with the + * suggest alternate path message. + * @private_data_len: Size of the private data buffer, in bytes. + */ +int ib_send_cm_sap(struct ib_cm_id *cm_id, + struct ib_sa_path_rec *alternate_path, + const void *private_data, + u8 private_data_len); + +/** + * ib_send_cm_spr - Sends an suggest path response message in response to + * a suggest alternate path request. + * @cm_id: Connection identifier associated with the suggest path response. 
+ * @status: Reply status sent with the suggest path response. + * @info: Optional additional information sent with the suggest path + * response. + * @info_length: Size of the additional information, in bytes. + * @private_data: Optional user-defined private data sent with the + * suggest path response message. + * @private_data_len: Size of the private data buffer, in bytes. + */ +int ib_send_cm_spr(struct ib_cm_id *cm_id, + enum ib_cm_spr_status status, + void *info, + u8 info_length, + const void *private_data, + u8 private_data_len); + /** * ib_send_cm_lap - Sends a load alternate path request. * @cm_id: Connection identifier associated with the load alternate path diff --git a/include/rdma/ib_fmr_pool.h b/include/rdma/ib_fmr_pool.h index f62b842e65961..102ed105a10c0 100644 --- a/include/rdma/ib_fmr_pool.h +++ b/include/rdma/ib_fmr_pool.h @@ -62,6 +62,7 @@ struct ib_fmr_pool_param { void *arg); void *flush_arg; unsigned cache:1; + unsigned relaxed:1; }; struct ib_pool_fmr { @@ -72,10 +73,20 @@ struct ib_pool_fmr { int ref_count; int remap_count; u64 io_virtual_address; + struct ib_pd *pd; + int list_id; + struct scatterlist *sg; + int sg_len; int page_list_len; u64 page_list[0]; }; +struct ib_fmr_args_relaxed { + struct ib_pd *pd; + struct scatterlist *sg; + int sg_len; +}; + struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, struct ib_fmr_pool_param *params); @@ -86,8 +97,10 @@ int ib_flush_fmr_pool(struct ib_fmr_pool *pool); struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle, u64 *page_list, int list_len, - u64 io_virtual_address); + u64 io_virtual_address, + struct ib_fmr_args_relaxed *rargs); int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr); + #endif /* IB_FMR_POOL_H */ diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index d3b9401b77b02..b7e45db8d3d46 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -77,6 +77,15 @@ #define IB_MGMT_MAX_METHODS 128 +/* MAD Status field bit masks */ +#define IB_MGMT_MAD_STATUS_SUCCESS 0x0000 +#define IB_MGMT_MAD_STATUS_BUSY 0x0001 +#define IB_MGMT_MAD_STATUS_REDIRECT_REQD 0x0002 +#define IB_MGMT_MAD_STATUS_BAD_VERSION 0x0004 +#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD 0x0008 +#define IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB 0x000c +#define IB_MGMT_MAD_STATUS_INVALID_ATTRIB_VALUE 0x001c + /* RMPP information */ #define IB_MGMT_RMPP_VERSION 1 @@ -151,7 +160,7 @@ struct ib_rmpp_hdr { typedef u64 __bitwise ib_sa_comp_mask; -#define IB_SA_COMP_MASK(n) ((__force ib_sa_comp_mask) cpu_to_be64(1ull << n)) +#define IB_SA_COMP_MASK(n) ((__force ib_sa_comp_mask) cpu_to_be64(1ull << (n))) /* * ib_sa_hdr and ib_sa_mad structures must be packed because they have diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h index b37fe3b10a9da..af615a477ffde 100644 --- a/include/rdma/ib_pack.h +++ b/include/rdma/ib_pack.h @@ -228,16 +228,16 @@ struct ib_unpacked_vlan { struct ib_ud_header { int lrh_present; struct ib_unpacked_lrh lrh; - int eth_present; - struct ib_unpacked_eth eth; + int eth_present; + struct ib_unpacked_eth eth; int vlan_present; struct ib_unpacked_vlan vlan; - int grh_present; - struct ib_unpacked_grh grh; - struct ib_unpacked_bth bth; + int grh_present; + struct ib_unpacked_grh grh; + struct ib_unpacked_bth bth; struct ib_unpacked_deth deth; - int immediate_present; - __be32 immediate_data; + int immediate_present; + __be32 immediate_data; }; void ib_pack(const struct ib_field *desc, @@ -250,11 +250,11 @@ void ib_unpack(const struct ib_field *desc, void *buf, void *structure); -void 
ib_ud_header_init(int payload_bytes, +void ib_ud_header_init(int payload_bytes, int lrh_present, int eth_present, int vlan_present, - int grh_present, + int grh_present, int immediate_present, struct ib_ud_header *header); @@ -263,5 +263,7 @@ int ib_ud_header_pack(struct ib_ud_header *header, int ib_ud_header_unpack(void *buf, struct ib_ud_header *header); +int ib_lrh_header_pack(struct ib_unpacked_lrh *lrh, void *buf); +int ib_lrh_header_unpack(void *buf, struct ib_unpacked_lrh *lrh); #endif /* IB_PACK_H */ diff --git a/include/rdma/ib_pma.h b/include/rdma/ib_pma.h new file mode 100644 index 0000000000000..a5889f18807b6 --- /dev/null +++ b/include/rdma/ib_pma.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation. + * All rights reserved. + * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if !defined(IB_PMA_H) +#define IB_PMA_H + +#include + +/* + * PMA class portinfo capability mask bits + */ +#define IB_PMA_CLASS_CAP_ALLPORTSELECT cpu_to_be16(1 << 8) +#define IB_PMA_CLASS_CAP_EXT_WIDTH cpu_to_be16(1 << 9) +#define IB_PMA_CLASS_CAP_XMIT_WAIT cpu_to_be16(1 << 12) + +#define IB_PMA_CLASS_PORT_INFO cpu_to_be16(0x0001) +#define IB_PMA_PORT_SAMPLES_CONTROL cpu_to_be16(0x0010) +#define IB_PMA_PORT_SAMPLES_RESULT cpu_to_be16(0x0011) +#define IB_PMA_PORT_COUNTERS cpu_to_be16(0x0012) +#define IB_PMA_PORT_COUNTERS_EXT cpu_to_be16(0x001D) +#define IB_PMA_PORT_SAMPLES_RESULT_EXT cpu_to_be16(0x001E) + +struct ib_pma_mad { + struct ib_mad_hdr mad_hdr; + u8 reserved[40]; + u8 data[192]; +} __packed; + +struct ib_pma_portsamplescontrol { + u8 opcode; + u8 port_select; + u8 tick; + u8 counter_width; /* resv: 7:3, counter width: 2:0 */ + __be32 counter_mask0_9; /* 2, 10 3-bit fields */ + __be16 counter_mask10_14; /* 1, 5 3-bit fields */ + u8 sample_mechanisms; + u8 sample_status; /* only lower 2 bits */ + __be64 option_mask; + __be64 vendor_mask; + __be32 sample_start; + __be32 sample_interval; + __be16 tag; + __be16 counter_select[15]; + __be32 reserved1; + __be64 samples_only_option_mask; + __be32 reserved2[28]; +}; + +struct ib_pma_portsamplesresult { + __be16 tag; + __be16 sample_status; /* only lower 2 bits */ + __be32 counter[15]; +}; + +struct ib_pma_portsamplesresult_ext { + __be16 tag; + __be16 sample_status; /* only lower 2 bits */ + __be32 extended_width; /* only upper 2 bits */ + __be64 counter[15]; +}; + +struct ib_pma_portcounters { + u8 reserved; + u8 port_select; + __be16 counter_select; + __be16 symbol_error_counter; + u8 link_error_recovery_counter; + u8 link_downed_counter; + __be16 port_rcv_errors; + __be16 port_rcv_remphys_errors; + __be16 port_rcv_switch_relay_errors; + __be16 port_xmit_discards; + u8 port_xmit_constraint_errors; + u8 port_rcv_constraint_errors; + u8 reserved1; + u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */ + __be16 reserved2; + __be16 vl15_dropped; + __be32 port_xmit_data; + __be32 port_rcv_data; + __be32 port_xmit_packets; + __be32 port_rcv_packets; + __be32 port_xmit_wait; +} __packed; + + +#define IB_PMA_SEL_SYMBOL_ERROR cpu_to_be16(0x0001) +#define IB_PMA_SEL_LINK_ERROR_RECOVERY cpu_to_be16(0x0002) +#define IB_PMA_SEL_LINK_DOWNED cpu_to_be16(0x0004) +#define IB_PMA_SEL_PORT_RCV_ERRORS cpu_to_be16(0x0008) +#define IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS cpu_to_be16(0x0010) +#define IB_PMA_SEL_PORT_XMIT_DISCARDS cpu_to_be16(0x0040) +#define IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS cpu_to_be16(0x0200) +#define IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS cpu_to_be16(0x0400) +#define IB_PMA_SEL_PORT_VL15_DROPPED cpu_to_be16(0x0800) +#define IB_PMA_SEL_PORT_XMIT_DATA cpu_to_be16(0x1000) +#define IB_PMA_SEL_PORT_RCV_DATA cpu_to_be16(0x2000) +#define IB_PMA_SEL_PORT_XMIT_PACKETS cpu_to_be16(0x4000) +#define IB_PMA_SEL_PORT_RCV_PACKETS cpu_to_be16(0x8000) + +struct ib_pma_portcounters_ext { + u8 reserved; + u8 port_select; + __be16 counter_select; + __be32 reserved1; + __be64 port_xmit_data; + __be64 port_rcv_data; + __be64 port_xmit_packets; + __be64 port_rcv_packets; + __be64 port_unicast_xmit_packets; + __be64 port_unicast_rcv_packets; + __be64 port_multicast_xmit_packets; + __be64 port_multicast_rcv_packets; +} __packed; + +#define IB_PMA_SELX_PORT_XMIT_DATA cpu_to_be16(0x0001) +#define IB_PMA_SELX_PORT_RCV_DATA cpu_to_be16(0x0002) +#define IB_PMA_SELX_PORT_XMIT_PACKETS cpu_to_be16(0x0004) +#define IB_PMA_SELX_PORT_RCV_PACKETS 
cpu_to_be16(0x0008) +#define IB_PMA_SELX_PORT_UNI_XMIT_PACKETS cpu_to_be16(0x0010) +#define IB_PMA_SELX_PORT_UNI_RCV_PACKETS cpu_to_be16(0x0020) +#define IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS cpu_to_be16(0x0040) +#define IB_PMA_SELX_PORT_MULTI_RCV_PACKETS cpu_to_be16(0x0080) + +#endif /* IB_PMA_H */ diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index 1082afaed1587..c1ae436ce6632 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -156,6 +156,30 @@ struct ib_sa_path_rec { u8 preference; }; +#define IB_SA_GUIDINFO_REC_LID IB_SA_COMP_MASK(0) +#define IB_SA_GUIDINFO_REC_BLOCK_NUM IB_SA_COMP_MASK(1) +/* reserved */ +#define IB_SA_GUIDINFO_REC_RES1 IB_SA_COMP_MASK(2) +#define IB_SA_GUIDINFO_REC_RES2 IB_SA_COMP_MASK(3) +#define IB_SA_COMPMASK_GID0 IB_SA_COMP_MASK(4) +#define IB_SA_COMPMASK_GID1 IB_SA_COMP_MASK(5) +#define IB_SA_COMPMASK_GID2 IB_SA_COMP_MASK(6) +#define IB_SA_COMPMASK_GID3 IB_SA_COMP_MASK(7) +#define IB_SA_COMPMASK_GID4 IB_SA_COMP_MASK(8) +#define IB_SA_COMPMASK_GID5 IB_SA_COMP_MASK(9) +#define IB_SA_COMPMASK_GID6 IB_SA_COMP_MASK(10) +#define IB_SA_COMPMASK_GID7 IB_SA_COMP_MASK(11) + +struct ib_sa_guidinfo_rec { + __be16 lid; + u8 block_num; + /* reserved */ + u8 res1; + __be32 res2; + __be64 guid_info_list[8]; +}; + + #define IB_SA_MCMEMBER_REC_MGID IB_SA_COMP_MASK( 0) #define IB_SA_MCMEMBER_REC_PORT_GID IB_SA_COMP_MASK( 1) #define IB_SA_MCMEMBER_REC_QKEY IB_SA_COMP_MASK( 2) @@ -251,6 +275,127 @@ struct ib_sa_service_rec { u64 data64[2]; }; +enum { + IB_SA_EVENT_TYPE_FATAL = 0x0, + IB_SA_EVENT_TYPE_URGENT = 0x1, + IB_SA_EVENT_TYPE_SECURITY = 0x2, + IB_SA_EVENT_TYPE_SM = 0x3, + IB_SA_EVENT_TYPE_INFO = 0x4, + IB_SA_EVENT_TYPE_EMPTY = 0x7F, + IB_SA_EVENT_TYPE_ALL = 0xFFFF +}; + +enum { + IB_SA_EVENT_PRODUCER_TYPE_CA = 0x1, + IB_SA_EVENT_PRODUCER_TYPE_SWITCH = 0x2, + IB_SA_EVENT_PRODUCER_TYPE_ROUTER = 0x3, + IB_SA_EVENT_PRODUCER_TYPE_CLASS_MANAGER = 0x4, + IB_SA_EVENT_PRODUCER_TYPE_ALL = 0xFFFFFF +}; + +enum { + IB_SA_SM_TRAP_GID_IN_SERVICE = 64, + IB_SA_SM_TRAP_GID_OUT_OF_SERVICE = 65, + IB_SA_SM_TRAP_CREATE_MC_GROUP = 66, + IB_SA_SM_TRAP_DELETE_MC_GROUP = 67, + IB_SA_SM_TRAP_PORT_CHANGE_STATE = 128, + IB_SA_SM_TRAP_LINK_INTEGRITY = 129, + IB_SA_SM_TRAP_EXCESSIVE_BUFFER_OVERRUN = 130, + IB_SA_SM_TRAP_FLOW_CONTROL_UPDATE_EXPIRED = 131, + IB_SA_SM_TRAP_BAD_M_KEY = 256, + IB_SA_SM_TRAP_BAD_P_KEY = 257, + IB_SA_SM_TRAP_BAD_Q_KEY = 258, + IB_SA_SM_TRAP_SWITCH_BAD_P_KEY = 259, + IB_SA_SM_TRAP_ALL = 0xFFFF +}; + +struct ib_sa_inform { + union ib_gid gid; + __be16 lid_range_begin; + __be16 lid_range_end; + u8 is_generic; + u8 subscribe; + __be16 type; + union { + struct { + __be16 trap_num; + __be32 qpn; + u8 resp_time; + __be32 producer_type; + } generic; + struct { + __be16 device_id; + __be32 qpn; + u8 resp_time; + __be32 vendor_id; + } vendor; + } trap; +}; + +struct ib_sa_notice { + u8 is_generic; + u8 type; + union { + struct { + __be32 producer_type; + __be16 trap_num; + } generic; + struct { + __be32 vendor_id; + __be16 device_id; + } vendor; + } trap; + __be16 issuer_lid; + __be16 notice_count; + u8 notice_toggle; + /* + * Align data 16 bits off 64 bit field to match InformInfo definition. + * Data contained within this field will then align properly. + * See IB spec 1.2, sections 13.4.8.2 and 14.2.5.1. 
+ */ + u8 reserved[5]; + u8 data_details[54]; + union ib_gid issuer_gid; +}; + +/* + * SM notice data details for: + * + * IB_SA_SM_TRAP_GID_IN_SERVICE = 64 + * IB_SA_SM_TRAP_GID_OUT_OF_SERVICE = 65 + * IB_SA_SM_TRAP_CREATE_MC_GROUP = 66 + * IB_SA_SM_TRAP_DELETE_MC_GROUP = 67 + */ +struct ib_sa_notice_data_gid { + u8 reserved[6]; + u8 gid[16]; + u8 padding[32]; +}; + +/* + * SM notice data details for: + * + * IB_SA_SM_TRAP_PORT_CHANGE_STATE = 128 + */ +struct ib_sa_notice_data_port_change { + __be16 lid; + u8 padding[52]; +}; + +/* + * SM notice data details for: + * + * IB_SA_SM_TRAP_LINK_INTEGRITY = 129 + * IB_SA_SM_TRAP_EXCESSIVE_BUFFER_OVERRUN = 130 + * IB_SA_SM_TRAP_FLOW_CONTROL_UPDATE_EXPIRED = 131 + */ +struct ib_sa_notice_data_port_error { + u8 reserved[2]; + __be16 lid; + u8 port_num; + u8 padding[49]; +}; + struct ib_sa_client { atomic_t users; struct completion comp; @@ -377,7 +522,8 @@ int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, */ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, struct ib_sa_path_rec *rec, - struct ib_ah_attr *ah_attr); + struct ib_ah_attr *ah_attr, + int force_grh); /** * ib_sa_unpack_path - Convert a path record from MAD format to struct @@ -385,4 +531,65 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, */ void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec); +struct ib_inform_info { + void *context; + int (*callback)(int status, + struct ib_inform_info *info, + struct ib_sa_notice *notice); + u16 trap_number; +}; + +/** + * ib_sa_register_inform_info - Registers to receive notice events. + * @device: Device associated with the registration. + * @port_num: Port on the specified device to associate with the registration. + * @trap_number: InformInfo trap number to register for. + * @gfp_mask: GFP mask for memory allocations. + * @callback: User callback invoked once the registration completes and to + * report noticed events. + * @context: User specified context stored with the ib_inform_reg structure. + * + * This call initiates a registration request with the SA for the specified + * trap number. If the operation is started successfully, it returns + * an ib_inform_info structure that is used to track the registration operation. + * Users must free this structure by calling ib_unregister_inform_info, + * even if the operation later fails. (The callback status is non-zero.) + * + * If the registration fails; status will be non-zero. If the registration + * succeeds, the callback status will be zero, but the notice parameter will + * be NULL. If the notice parameter is not NULL, a trap or notice is being + * reported to the user. + * + * A status of -ENETRESET indicates that an error occurred which requires + * reregisteration. + */ +struct ib_inform_info * +ib_sa_register_inform_info(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + u16 trap_number, gfp_t gfp_mask, + int (*callback)(int status, + struct ib_inform_info *info, + struct ib_sa_notice *notice), + void *context); + +/** + * ib_sa_unregister_inform_info - Releases an InformInfo registration. + * @info: InformInfo registration tracking structure. + * + * This call blocks until the registration request is destroyed. It may + * not be called from within the registration callback. 
+ */ +void ib_sa_unregister_inform_info(struct ib_inform_info *info); + +/*Support GuidInfoRecord*/ +int ib_sa_guid_info_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + struct ib_sa_guidinfo_rec *rec, + ib_sa_comp_mask comp_mask, u8 method, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_guidinfo_rec *resp, + void *context), + void *context, + struct ib_sa_query **sa_query); #endif /* IB_SA_H */ diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 9ee0d2e51b16e..90f3712fa6bd1 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -36,6 +36,7 @@ #include #include #include +#include struct ib_ucontext; @@ -56,6 +57,7 @@ struct ib_umem_chunk { struct list_head list; int nents; int nmap; + struct dma_attrs attrs; struct scatterlist page_list[0]; }; diff --git a/include/rdma/ib_user_cm.h b/include/rdma/ib_user_cm.h index f79014aa28f99..bd3d380781e0b 100644 --- a/include/rdma/ib_user_cm.h +++ b/include/rdma/ib_user_cm.h @@ -34,7 +34,6 @@ #ifndef IB_USER_CM_H #define IB_USER_CM_H -#include #include #define IB_USER_CM_ABI_VERSION 5 diff --git a/include/rdma/ib_user_verbs.h b/include/rdma/ib_user_verbs.h index fe5b05177a2cb..10abb79aaa945 100644 --- a/include/rdma/ib_user_verbs.h +++ b/include/rdma/ib_user_verbs.h @@ -81,7 +81,21 @@ enum { IB_USER_VERBS_CMD_MODIFY_SRQ, IB_USER_VERBS_CMD_QUERY_SRQ, IB_USER_VERBS_CMD_DESTROY_SRQ, - IB_USER_VERBS_CMD_POST_SRQ_RECV + IB_USER_VERBS_CMD_POST_SRQ_RECV, + IB_USER_VERBS_CMD_CREATE_XRC_SRQ, + IB_USER_VERBS_CMD_OPEN_XRC_DOMAIN, + IB_USER_VERBS_CMD_CLOSE_XRC_DOMAIN, + IB_USER_VERBS_CMD_CREATE_XRC_RCV_QP, + IB_USER_VERBS_CMD_MODIFY_XRC_RCV_QP, + IB_USER_VERBS_CMD_QUERY_XRC_RCV_QP, + IB_USER_VERBS_CMD_REG_XRC_RCV_QP, + IB_USER_VERBS_CMD_UNREG_XRC_RCV_QP, + IB_USER_VERBS_CMD_GET_ETH_L2_ADDR, + IB_USER_VERBS_CMD_ALLOC_SHPD, + IB_USER_VERBS_CMD_SHARE_PD, + IB_USER_VERBS_CMD_REG_MR_RELAXED, + IB_USER_VERBS_CMD_DEREG_MR_RELAXED, + IB_USER_VERBS_CMD_FLUSH_RELAXED_MR, }; /* @@ -206,7 +220,8 @@ struct ib_uverbs_query_port_resp { __u8 active_speed; __u8 phys_state; __u8 link_layer; - __u8 reserved[2]; + __u8 ext_active_speed; + __u8 reserved; }; struct ib_uverbs_alloc_pd { @@ -218,6 +233,26 @@ struct ib_uverbs_alloc_pd_resp { __u32 pd_handle; }; +struct ib_uverbs_alloc_shpd { + __u64 response; + __u32 pd_handle; + __u64 share_key; +}; + +struct ib_uverbs_alloc_shpd_resp { + __u32 shpd_handle; +}; + +struct ib_uverbs_share_pd { + __u64 response; + __u32 shpd_handle; + __u64 share_key; +}; + +struct ib_uverbs_share_pd_resp { + __u32 pd_handle; +}; + struct ib_uverbs_dealloc_pd { __u32 pd_handle; }; @@ -242,6 +277,11 @@ struct ib_uverbs_dereg_mr { __u32 mr_handle; }; +struct ib_uverbs_flush_relaxed_mr { + __u32 pd_handle; +}; + + struct ib_uverbs_create_comp_channel { __u64 response; }; @@ -622,6 +662,20 @@ struct ib_uverbs_destroy_ah { __u32 ah_handle; }; +struct ib_uverbs_get_eth_l2_addr { + __u64 response; + __u32 pd_handle; + __u8 port; + __u8 sgid_idx; + __u8 reserved[2]; + __u8 gid[16]; +}; + +struct ib_uverbs_get_eth_l2_addr_resp { + __u8 mac[6]; + __u16 vlan_id; +}; + struct ib_uverbs_attach_mcast { __u8 gid[16]; __u32 qp_handle; @@ -648,6 +702,18 @@ struct ib_uverbs_create_srq { __u64 driver_data[0]; }; +struct ib_uverbs_create_xrc_srq { + __u64 response; + __u64 user_handle; + __u32 pd_handle; + __u32 max_wr; + __u32 max_sge; + __u32 srq_limit; + __u32 xrcd_handle; + __u32 xrc_cq; + __u64 driver_data[0]; +}; + struct ib_uverbs_create_srq_resp { __u32 srq_handle; __u32 
max_wr; @@ -687,4 +753,95 @@ struct ib_uverbs_destroy_srq_resp { __u32 events_reported; }; +struct ib_uverbs_open_xrc_domain { + __u64 response; + __u32 fd; + __u32 oflags; + __u64 driver_data[0]; +}; + +struct ib_uverbs_open_xrc_domain_resp { + __u32 xrcd_handle; +}; + +struct ib_uverbs_close_xrc_domain { + __u64 response; + __u32 xrcd_handle; + __u32 reserved; + __u64 driver_data[0]; +}; + +struct ib_uverbs_create_xrc_rcv_qp { + __u64 response; + __u64 user_handle; + __u32 xrc_domain_handle; + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; + __u8 sq_sig_all; + __u8 qp_type; + __u8 reserved[6]; + __u64 driver_data[0]; +}; + +struct ib_uverbs_create_xrc_rcv_qp_resp { + __u32 qpn; + __u32 reserved; +}; + +struct ib_uverbs_modify_xrc_rcv_qp { + __u32 xrc_domain_handle; + __u32 qp_num; + struct ib_uverbs_qp_dest dest; + struct ib_uverbs_qp_dest alt_dest; + __u32 attr_mask; + __u32 qkey; + __u32 rq_psn; + __u32 sq_psn; + __u32 dest_qp_num; + __u32 qp_access_flags; + __u16 pkey_index; + __u16 alt_pkey_index; + __u8 qp_state; + __u8 cur_qp_state; + __u8 path_mtu; + __u8 path_mig_state; + __u8 en_sqd_async_notify; + __u8 max_rd_atomic; + __u8 max_dest_rd_atomic; + __u8 min_rnr_timer; + __u8 port_num; + __u8 timeout; + __u8 retry_cnt; + __u8 rnr_retry; + __u8 alt_port_num; + __u8 alt_timeout; + __u8 reserved[6]; + __u64 driver_data[0]; +}; + +struct ib_uverbs_query_xrc_rcv_qp { + __u64 response; + __u32 xrc_domain_handle; + __u32 qp_num; + __u32 attr_mask; + __u32 reserved; + __u64 driver_data[0]; +}; + +struct ib_uverbs_reg_xrc_rcv_qp { + __u32 xrc_domain_handle; + __u32 qp_num; + __u64 driver_data[0]; +}; + +struct ib_uverbs_unreg_xrc_rcv_qp { + __u32 xrc_domain_handle; + __u32 qp_num; + __u64 driver_data[0]; +}; + + #endif /* IB_USER_VERBS_H */ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 55cd0a0bc9778..a6f2eaf612fff 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -47,10 +47,11 @@ #include #include #include -#include #include #include +#include +#include extern struct workqueue_struct *ib_wq; @@ -112,6 +113,7 @@ enum ib_device_cap_flags { */ IB_DEVICE_UD_IP_CSUM = (1<<18), IB_DEVICE_UD_TSO = (1<<19), + IB_DEVICE_XRC = (1<<20), IB_DEVICE_MEM_MGT_EXTENSIONS = (1<<21), IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1<<22), }; @@ -207,6 +209,7 @@ enum ib_port_cap_flags { IB_PORT_SM_DISABLED = 1 << 10, IB_PORT_SYS_IMAGE_GUID_SUP = 1 << 11, IB_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, + IB_PORT_EXTENDED_SPEEDS_SUP = 1 << 14, IB_PORT_CM_SUP = 1 << 16, IB_PORT_SNMP_TUNNEL_SUP = 1 << 17, IB_PORT_REINIT_SUP = 1 << 18, @@ -226,6 +229,15 @@ enum ib_port_width { IB_WIDTH_12X = 8 }; +static inline int ib_ext_active_speed_to_rate(u8 ext_active_speed) +{ + switch (ext_active_speed) { + case 1: return 14; + case 2: return 25; + default: return -1; + } +} + static inline int ib_width_enum_to_int(enum ib_port_width width) { switch (width) { @@ -308,6 +320,9 @@ struct ib_port_attr { u8 active_width; u8 active_speed; u8 phys_state; + enum rdma_link_layer link_layer; + u8 ext_active_speed; + u8 link_encoding; }; enum ib_device_modify_flags { @@ -350,7 +365,12 @@ enum ib_event_type { IB_EVENT_SRQ_ERR, IB_EVENT_SRQ_LIMIT_REACHED, IB_EVENT_QP_LAST_WQE_REACHED, - IB_EVENT_CLIENT_REREGISTER + IB_EVENT_CLIENT_REREGISTER, + IB_EVENT_GID_CHANGE, +}; + +enum ib_event_flags { + IB_XRC_QP_EVENT_FLAG = 0x80000000, }; struct ib_event { @@ -360,6 +380,7 @@ struct ib_event { struct ib_qp *qp; struct ib_srq *srq; u8 port_num; + u32 
xrc_qp_num; } element; enum ib_event_type event; }; @@ -414,7 +435,15 @@ enum ib_rate { IB_RATE_40_GBPS = 7, IB_RATE_60_GBPS = 8, IB_RATE_80_GBPS = 9, - IB_RATE_120_GBPS = 10 + IB_RATE_120_GBPS = 10, + IB_RATE_14_GBPS = 11, + IB_RATE_56_GBPS = 12, + IB_RATE_112_GBPS = 13, + IB_RATE_168_GBPS = 14, + IB_RATE_25_GBPS = 15, + IB_RATE_100_GBPS = 16, + IB_RATE_200_GBPS = 17, + IB_RATE_300_GBPS = 18, }; /** @@ -425,6 +454,14 @@ enum ib_rate { */ int ib_rate_to_mult(enum ib_rate rate) __attribute_const__; +/** + * ib_ext_rate_to_int - Convert the extended IB rate enum to a + * real integer value. For example, + * IB_RATE_14_GBPS will be converted to 14 + * @rate: extended rate to convert. + */ +int ib_ext_rate_to_int(enum ib_rate rate) __attribute_const__; + /** * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate * enum. @@ -563,13 +600,15 @@ enum ib_qp_type { IB_QPT_RC, IB_QPT_UC, IB_QPT_UD, + IB_QPT_XRC, IB_QPT_RAW_IPV6, - IB_QPT_RAW_ETHERTYPE + IB_QPT_RAW_ETY }; enum ib_qp_create_flags { IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0, IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1, + IB_QP_CREATE_VENDOR_SPECIFIC_0 = 1 << 31, }; struct ib_qp_init_attr { @@ -582,6 +621,7 @@ struct ib_qp_init_attr { enum ib_sig_type sq_sig_type; enum ib_qp_type qp_type; enum ib_qp_create_flags create_flags; + struct ib_xrcd *xrc_domain; /* XRC qp's only */ u8 port_num; /* special QP types only */ }; @@ -697,6 +737,7 @@ enum ib_wr_opcode { IB_WR_ATOMIC_CMP_AND_SWP, IB_WR_ATOMIC_FETCH_AND_ADD, IB_WR_LSO, + IB_WR_BIG_LSO, IB_WR_SEND_WITH_INV, IB_WR_RDMA_READ_WITH_INV, IB_WR_LOCAL_INV, @@ -768,7 +809,13 @@ struct ib_send_wr { int access_flags; u32 rkey; } fast_reg; + struct { + struct ib_unpacked_lrh *lrh; + u32 eth_type; + u8 static_rate; + } raw_ety; } wr; + u32 xrc_remote_srq_num; /* valid for XRC sends only */ }; struct ib_recv_wr { @@ -825,11 +872,13 @@ struct ib_ucontext { struct ib_device *device; struct list_head pd_list; struct list_head mr_list; + struct list_head fmr_list; struct list_head mw_list; struct list_head cq_list; struct list_head qp_list; struct list_head srq_list; struct list_head ah_list; + struct list_head xrc_domain_list; int closing; }; @@ -851,12 +900,35 @@ struct ib_udata { size_t outlen; }; +struct ib_uxrc_rcv_object { + struct list_head list; /* link to context's list */ + u32 qp_num; + u32 domain_handle; +}; + struct ib_pd { struct ib_device *device; struct ib_uobject *uobject; + struct ib_shpd *shpd; /* global uobj id if this pd is shared */ atomic_t usecnt; /* count all resources */ }; +struct ib_shpd { + struct ib_device *device; + struct ib_uobject *uobject; + atomic_t shared; /* count procs sharing the pd*/ + u64 share_key; +}; + +struct ib_xrcd { + struct ib_device *device; + struct ib_uobject *uobject; + struct inode *inode; + struct rb_node node; + atomic_t usecnt; /* count all resources */ +}; + + struct ib_ah { struct ib_device *device; struct ib_pd *pd; @@ -878,10 +950,13 @@ struct ib_cq { struct ib_srq { struct ib_device *device; struct ib_pd *pd; + struct ib_cq *xrc_cq; + struct ib_xrcd *xrcd; struct ib_uobject *uobject; void (*event_handler)(struct ib_event *, void *); void *srq_context; atomic_t usecnt; + u32 xrc_srq_num; }; struct ib_qp { @@ -895,6 +970,7 @@ struct ib_qp { void *qp_context; u32 qp_num; enum ib_qp_type qp_type; + struct ib_xrcd *xrcd; /* XRC QPs only */ }; struct ib_mr { @@ -1000,9 +1076,9 @@ struct ib_device { struct list_head event_handler_list; spinlock_t event_handler_lock; - spinlock_t client_data_lock; struct list_head core_list; struct 
list_head client_data_list; + spinlock_t client_data_lock; struct ib_cache cache; int *pkey_tbl_len; @@ -1148,6 +1224,43 @@ struct ib_device { struct ib_grh *in_grh, struct ib_mad *in_mad, struct ib_mad *out_mad); + struct ib_srq * (*create_xrc_srq)(struct ib_pd *pd, + struct ib_cq *xrc_cq, + struct ib_xrcd *xrcd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata); + struct ib_xrcd * (*alloc_xrcd)(struct ib_device *device, + struct ib_ucontext *context, + struct ib_udata *udata); + int (*dealloc_xrcd)(struct ib_xrcd *xrcd); + int (*create_xrc_rcv_qp)(struct ib_qp_init_attr *init_attr, + u32 *qp_num); + int (*modify_xrc_rcv_qp)(struct ib_xrcd *xrcd, + u32 qp_num, + struct ib_qp_attr *attr, + int attr_mask); + int (*query_xrc_rcv_qp)(struct ib_xrcd *xrcd, + u32 qp_num, + struct ib_qp_attr *attr, + int attr_mask, + struct ib_qp_init_attr *init_attr); + int (*reg_xrc_rcv_qp)(struct ib_xrcd *xrcd, + void *context, + u32 qp_num); + int (*unreg_xrc_rcv_qp)(struct ib_xrcd *xrcd, + void *context, + u32 qp_num); + int (*get_eth_l2_addr)(struct ib_device *device, u8 port, + union ib_gid *dgid, int sgid_idx, + u8 *mac, u16 *vlan_id); + struct ib_shpd *(*alloc_shpd)(struct ib_device *ibdev, + struct ib_pd *pd); + struct ib_pd *(*share_pd)(struct ib_device *ibdev, + struct ib_ucontext *context, + struct ib_udata *udata, struct ib_shpd *shpd); + int (*remove_shpd)(struct ib_device *ibdev, + struct ib_shpd *shpd, int atinit); + int (*set_fmr_pd)(struct ib_fmr *fmr, struct ib_pd *pd); struct ib_dma_mapping_ops *dma_ops; @@ -1162,14 +1275,25 @@ struct ib_device { IB_DEV_UNREGISTERED } reg_state; - int uverbs_abi_ver; u64 uverbs_cmd_mask; + int uverbs_abi_ver; char node_desc[64]; __be64 node_guid; u32 local_dma_lkey; u8 node_type; u8 phys_port_cnt; + struct rb_root ib_uverbs_xrcd_table; + struct mutex xrcd_table_mutex; + struct ib_pd *relaxed_pd; + struct list_head relaxed_pool_list; +}; + +struct ib_relaxed_pool_data { + struct ib_fmr_pool *fmr_pool; + u32 access_flags; + int max_pages; + struct list_head pool_list; }; struct ib_client { @@ -1183,9 +1307,7 @@ struct ib_client { struct ib_device *ib_alloc_device(size_t size); void ib_dealloc_device(struct ib_device *device); -int ib_register_device(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)); +int ib_register_device (struct ib_device *device); void ib_unregister_device(struct ib_device *device); int ib_register_client (struct ib_client *client); @@ -1205,6 +1327,15 @@ static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len return copy_to_user(udata->outbuf, src, len) ? 
-EFAULT : 0; } +/** + * ib_sysfs_create_port_files - iterate over port sysfs directories + * @device: the IB device + * @create: a function to create sysfs files in each port directory + */ +int ib_sysfs_create_port_files(struct ib_device *device, + int (*create)(struct ib_device *dev, u8 port_num, + struct kobject *kobj)); + /** * ib_modify_qp_is_ok - Check that the supplied attribute mask * contains all required attributes and no attributes not allowed for @@ -1233,8 +1364,8 @@ int ib_query_device(struct ib_device *device, int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr); -enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, - u8 port_num); +enum rdma_link_layer rdma_port_link_layer(struct ib_device *device, + u8 port_num); int ib_query_gid(struct ib_device *device, u8 port_num, int index, union ib_gid *gid); @@ -1335,8 +1466,28 @@ int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr); int ib_destroy_ah(struct ib_ah *ah); /** - * ib_create_srq - Creates a SRQ associated with the specified protection - * domain. + * ib_create_xrc_srq - Creates an XRC SRQ associated with the specified + * protection domain, cq, and xrc domain. + * @pd: The protection domain associated with the SRQ. + * @xrc_cq: The cq to be associated with the XRC SRQ. + * @xrcd: The XRC domain to be associated with the XRC SRQ. + * @srq_init_attr: A list of initial attributes required to create the + * XRC SRQ. If XRC SRQ creation succeeds, then the attributes are updated + * to the actual capabilities of the created XRC SRQ. + * + * srq_attr->max_wr and srq_attr->max_sge are read to determine the + * requested size of the XRC SRQ, and set to the actual values allocated + * on return. If ib_create_xrc_srq() succeeds, then max_wr and max_sge + * will always be at least as large as the requested values. + */ +struct ib_srq *ib_create_xrc_srq(struct ib_pd *pd, + struct ib_cq *xrc_cq, + struct ib_xrcd *xrcd, + struct ib_srq_init_attr *srq_init_attr); + +/** + * ib_create_srq - Creates an SRQ associated with the specified + * protection domain. * @pd: The protection domain associated with the SRQ. * @srq_init_attr: A list of initial attributes required to create the * SRQ. If SRQ creation succeeds, then the attributes are updated to @@ -1448,11 +1599,6 @@ int ib_destroy_qp(struct ib_qp *qp); * @send_wr: A list of work requests to post on the send queue. * @bad_send_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. - * - * While IBA Vol. 1 section 11.4.1.1 specifies that if an immediate - * error is returned, the QP state shall not be affected, - * ib_post_send() will return an immediate error after queueing any - * earlier work requests in the list. */ static inline int ib_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr, @@ -1476,6 +1622,13 @@ static inline int ib_post_recv(struct ib_qp *qp, return qp->device->post_recv(qp, recv_wr, bad_recv_wr); } +/* + * IB_CQ_VECTOR_LEAST_ATTACHED: The constant specifies that + * the CQ will be attached to the completion vector that has + * the least number of CQs already attached to it. + */ +#define IB_CQ_VECTOR_LEAST_ATTACHED 0xffffffff + /** * ib_create_cq - Creates a CQ on the specified device. * @device: The device on which to create the CQ. @@ -1487,7 +1640,8 @@ static inline int ib_post_recv(struct ib_qp *qp, * the associated completion and event handlers. * @cqe: The minimum size of the CQ.
* @comp_vector - Completion vector used to signal completion events. - * Must be >= 0 and < context->num_comp_vectors. + * Must be >= 0 and < context->num_comp_vectors + * or IB_CQ_VECTOR_LEAST_ATTACHED. * * Users can examine the cq structure to determine the actual CQ size. */ @@ -2011,6 +2165,13 @@ struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, int mr_access_flags, struct ib_fmr_attr *fmr_attr); +/** + * ib_set_fmr_pd - set new PD for an FMR + * @fmr: The fast memory region to associate with the pd. + * @pd: new pd. + */ +int ib_set_fmr_pd(struct ib_fmr *fmr, struct ib_pd *pd); + /** * ib_map_phys_fmr - Maps a list of physical pages to a fast memory region. * @fmr: The fast memory region to associate with the pages. @@ -2059,4 +2220,29 @@ int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); */ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); + +/** + * ib_dealloc_xrcd - Deallocates an extended reliably connected domain. + * @xrcd: The xrc domain to deallocate. + */ +int ib_dealloc_xrcd(struct ib_xrcd *xrcd); + +/** + * ib_alloc_xrcd - Allocates an extended reliably connected domain. + * @device: The device on which to allocate the xrcd. + */ +struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device); + +/** + * ib_get_eth_l2_addr - get the mac and vlan id for the specified gid + * @device: IB device used for traffic + * @port: port number used. + * @gid: gid to be resolved into mac + * @sgid_idx: index to port's gid table for the corresponding address vector + * @mac: mac of the port bearing this gid + * @vlan_id: vlan to be used to reach this gid + */ +int ib_get_eth_l2_addr(struct ib_device *device, u8 port, union ib_gid *gid, + int sgid_idx, u8 *mac, __u16 *vlan_id); + #endif /* IB_VERBS_H */ diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h index 2d0191c90f9ee..cbb822e8d7913 100644 --- a/include/rdma/iw_cm.h +++ b/include/rdma/iw_cm.h @@ -46,9 +46,18 @@ enum iw_cm_event_type { IW_CM_EVENT_CLOSE /* close complete */ }; +enum iw_cm_event_status { + IW_CM_EVENT_STATUS_OK = 0, /* request successful */ + IW_CM_EVENT_STATUS_ACCEPTED = 0, /* connect request accepted */ + IW_CM_EVENT_STATUS_REJECTED, /* connect request rejected */ + IW_CM_EVENT_STATUS_TIMEOUT, /* the operation timed out */ + IW_CM_EVENT_STATUS_RESET, /* reset from remote peer */ + IW_CM_EVENT_STATUS_EINVAL, /* asynchronous failure for bad parm */ +}; + struct iw_cm_event { enum iw_cm_event_type event; - int status; + enum iw_cm_event_status status; struct sockaddr_in local_addr; struct sockaddr_in remote_addr; void *private_data; diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 26977c149c414..4e3933e063bac 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -59,7 +59,11 @@ enum rdma_cm_event_type { RDMA_CM_EVENT_MULTICAST_JOIN, RDMA_CM_EVENT_MULTICAST_ERROR, RDMA_CM_EVENT_ADDR_CHANGE, - RDMA_CM_EVENT_TIMEWAIT_EXIT + RDMA_CM_EVENT_TIMEWAIT_EXIT, + RDMA_CM_EVENT_ALT_ROUTE_RESOLVED, + RDMA_CM_EVENT_ALT_ROUTE_ERROR, + RDMA_CM_EVENT_LOAD_ALT_PATH, + RDMA_CM_EVENT_ALT_PATH_LOADED, }; enum rdma_port_space { @@ -67,6 +71,14 @@ enum rdma_port_space { RDMA_PS_IPOIB = 0x0002, RDMA_PS_TCP = 0x0106, RDMA_PS_UDP = 0x0111, + RDMA_PS_SCTP = 0x0183 +}; + +enum alt_path_type { + RDMA_ALT_PATH_NONE, + RDMA_ALT_PATH_PORT, + RDMA_ALT_PATH_LID, + RDMA_ALT_PATH_BEST }; struct rdma_addr { @@ -100,6 +112,7 @@ struct rdma_ud_param { struct ib_ah_attr ah_attr; u32 qp_num; u32 qkey; + u8 alt_path_index; }; struct rdma_cm_event { @@ -111,20 +124,6 @@ struct rdma_cm_event { } param; }; 
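Tying together the XRC additions above (ib_alloc_xrcd(), ib_dealloc_xrcd() and the ib_create_xrc_srq() documented earlier), here is a hedged creation/teardown sketch. It is not part of the merged code; it assumes pd and xrc_cq were obtained through the usual ib_alloc_pd()/ib_create_cq() paths and that these calls follow the conventional ERR_PTR error returns. The demo_* name is hypothetical.

#include <linux/err.h>
#include <rdma/ib_verbs.h>

/* Illustrative only: allocate an XRC domain and hang an XRC SRQ off it. */
static struct ib_srq *demo_setup_xrc(struct ib_device *device,
				     struct ib_pd *pd, struct ib_cq *xrc_cq,
				     struct ib_xrcd **xrcd_out)
{
	struct ib_srq_init_attr init_attr = {
		.attr = {
			.max_wr  = 256,	/* requested; updated to actual value */
			.max_sge = 1,
		},
	};
	struct ib_xrcd *xrcd;
	struct ib_srq *srq;

	xrcd = ib_alloc_xrcd(device);
	if (IS_ERR(xrcd))
		return ERR_CAST(xrcd);

	srq = ib_create_xrc_srq(pd, xrc_cq, xrcd, &init_attr);
	if (IS_ERR(srq)) {
		ib_dealloc_xrcd(xrcd);
		return srq;
	}

	*xrcd_out = xrcd;	/* deallocate only after the SRQ is destroyed */
	return srq;
}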
-enum rdma_cm_state { - RDMA_CM_IDLE, - RDMA_CM_ADDR_QUERY, - RDMA_CM_ADDR_RESOLVED, - RDMA_CM_ROUTE_QUERY, - RDMA_CM_ROUTE_RESOLVED, - RDMA_CM_CONNECT, - RDMA_CM_DISCONNECT, - RDMA_CM_ADDR_BOUND, - RDMA_CM_LISTEN, - RDMA_CM_DEVICE_REMOVAL, - RDMA_CM_DESTROYING -}; - struct rdma_cm_id; /** @@ -144,8 +143,8 @@ struct rdma_cm_id { rdma_cm_event_handler event_handler; struct rdma_route route; enum rdma_port_space ps; - enum ib_qp_type qp_type; u8 port_num; + void *ucontext; }; /** @@ -155,11 +154,9 @@ struct rdma_cm_id { * returned rdma_id. * @context: User specified context associated with the id. * @ps: RDMA port space. - * @qp_type: type of queue pair associated with the id. */ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, - void *context, enum rdma_port_space ps, - enum ib_qp_type qp_type); + void *context, enum rdma_port_space ps); /** * rdma_destroy_id - Destroys an RDMA identifier. @@ -207,6 +204,19 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, */ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms); +/** + * rdma_enable_apm - Get ready to use APM for the given ID. + * Actual alternate path discovery and load will take place only + * after a connection has been established. + * + * Calling this function only has an effect on the connection's client side. + * It should be called after rdma_resolve_route and before rdma_connect. + * + * @id: RDMA identifier. + * @alt_type: Alternate path type to resolve. + */ +int rdma_enable_apm(struct rdma_cm_id *id, enum alt_path_type alt_type); + /** * rdma_create_qp - Allocate a QP and associate it with the specified RDMA * identifier. @@ -347,13 +357,11 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr); void rdma_set_service_type(struct rdma_cm_id *id, int tos); /** - * rdma_set_reuseaddr - Allow the reuse of local addresses when binding - * the rdma_cm_id. - * @id: Communication identifier to configure. - * @reuse: Value indicating if the bound address is reusable. - * - * Reuse must be set before an address is bound to the id. + * rdma_set_timeout - Set the QP timeout associated with a connection + * identifier. + * @id: Communication identifier associated with the connection.
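As a concrete illustration of the ordering constraint documented for rdma_enable_apm() above, a hedged fragment of a client-side rdma_cm event handler follows. It is not part of the merged code; conn_param preparation and error recovery are assumed to exist elsewhere, and the demo_* name is hypothetical.

#include <rdma/rdma_cm.h>

/* Called for RDMA_CM_EVENT_ROUTE_RESOLVED on the active (client) side. */
static int demo_on_route_resolved(struct rdma_cm_id *id,
				  struct rdma_conn_param *conn_param)
{
	int ret;

	/* Must follow rdma_resolve_route() and precede rdma_connect(). */
	ret = rdma_enable_apm(id, RDMA_ALT_PATH_BEST);
	if (ret)
		return ret;	/* or continue without an alternate path */

	return rdma_connect(id, conn_param);
}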
+ * @timeout: QP timeout */ -int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse); +void rdma_set_timeout(struct rdma_cm_id *id, int timeout); #endif /* RDMA_CM_H */ diff --git a/include/rdma/rdma_user_cm.h b/include/rdma/rdma_user_cm.h index fc82c1896f751..d86717ac63bcb 100644 --- a/include/rdma/rdma_user_cm.h +++ b/include/rdma/rdma_user_cm.h @@ -38,7 +38,7 @@ #include #include -#define RDMA_USER_CM_ABI_VERSION 4 +#define RDMA_USER_CM_ABI_VERSION 5 #define RDMA_MAX_PRIVATE_DATA 256 @@ -221,9 +221,9 @@ enum { /* Option details */ enum { - RDMA_OPTION_ID_TOS = 0, - RDMA_OPTION_ID_REUSEADDR = 1, - RDMA_OPTION_IB_PATH = 1 + RDMA_OPTION_ID_TOS = 0, + RDMA_OPTION_IB_PATH = 1, + RDMA_OPTION_IB_APM = 2, }; struct rdma_ucm_set_option { diff --git a/include/rdma/sdp_socket.h b/include/rdma/sdp_socket.h new file mode 100644 index 0000000000000..c289f99659824 --- /dev/null +++ b/include/rdma/sdp_socket.h @@ -0,0 +1,24 @@ +/* Stuff that should go into include/linux/socket.h */ + +#ifndef SDP_SOCKET_H +#define SDP_SOCKET_H + +#ifndef AF_INET_SDP +#define AF_INET_SDP 27 +#define PF_INET_SDP AF_INET_SDP +#endif + +#ifndef AF_INET6_SDP +#define AF_INET6_SDP 28 +#define PF_INET6_SDP AF_INET6_SDP +#endif + +#ifndef SDP_ZCOPY_THRESH +#define SDP_ZCOPY_THRESH 80 +#endif + +#ifndef SDP_LAST_BIND_ERR +#define SDP_LAST_BIND_ERR 81 +#endif + +#endif diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 159c50f1c6bf6..844a7a5607e3d 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -589,8 +589,7 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) return -ENOMEM; /* Create the RDMA CM ID */ - rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP, - IB_QPT_RC); + rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP); if (IS_ERR(rdma->cm_id)) goto error; diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 424ff622ab5f8..ddbf568da6d4a 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -33,21 +33,17 @@ #include #include #include -#include #include #include +#include #include #include "rds.h" - -char *rds_str_array(char **array, size_t elements, size_t index) -{ - if ((index < elements) && array[index]) - return array[index]; - else - return "unknown"; -} -EXPORT_SYMBOL(rds_str_array); +#include "tcp.h" +/* UNUSED for backwards compat only */ +static unsigned int rds_ib_retry_count = 0xdead; +module_param(rds_ib_retry_count, int, 0444); +MODULE_PARM_DESC(rds_ib_retry_count, "UNUSED, set param in rds_rdma instead"); /* this is just used for stats gathering :/ */ static DEFINE_SPINLOCK(rds_sock_lock); @@ -68,6 +64,7 @@ static int rds_release(struct socket *sock) { struct sock *sk = sock->sk; struct rds_sock *rs; + unsigned long flags; if (!sk) goto out; @@ -81,27 +78,21 @@ static int rds_release(struct socket *sock) rds_clear_recv_queue(rs); rds_cong_remove_socket(rs); - /* - * the binding lookup hash uses rcu, we need to - * make sure we sychronize_rcu before we free our - * entry - */ rds_remove_bound(rs); - synchronize_rcu(); rds_send_drop_to(rs, NULL); rds_rdma_drop_keys(rs); rds_notify_queue_get(rs, NULL); - spin_lock_bh(&rds_sock_lock); + spin_lock_irqsave(&rds_sock_lock, flags); list_del_init(&rs->rs_item); rds_sock_count--; - spin_unlock_bh(&rds_sock_lock); + spin_unlock_irqrestore(&rds_sock_lock, flags); rds_trans_put(rs->rs_transport); sock->sk = NULL; - sock_put(sk); + debug_sock_put(sk); out: return 0; } @@ -193,8 +184,8 @@ static unsigned int rds_poll(struct file *file, struct socket *sock, mask |= (POLLIN | POLLRDNORM); 
spin_unlock(&rs->rs_lock); } - if (!list_empty(&rs->rs_recv_queue) || - !list_empty(&rs->rs_notify_queue)) + if (!list_empty(&rs->rs_recv_queue) + || !list_empty(&rs->rs_notify_queue)) mask |= (POLLIN | POLLRDNORM); if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) mask |= (POLLOUT | POLLWRNORM); @@ -209,7 +200,28 @@ static unsigned int rds_poll(struct file *file, struct socket *sock, static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { - return -ENOIOCTLCMD; + struct rds_sock *rs = rds_sk_to_rs(sock->sk); + rds_tos_t tos; + unsigned long flags; + + if (get_user(tos, (rds_tos_t __user *)arg)) + return -EFAULT; + + switch (cmd) { + case SIOCRDSSETTOS: + spin_lock_irqsave(&rds_sock_lock, flags); + if (rs->rs_tos || rs->rs_conn) { + spin_unlock_irqrestore(&rds_sock_lock, flags); + return -EINVAL; + } + rs->rs_tos = tos; + spin_unlock_irqrestore(&rds_sock_lock, flags); + break; + default: + return -ENOIOCTLCMD; + } + + return 0; } static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, @@ -270,6 +282,32 @@ static int rds_cong_monitor(struct rds_sock *rs, char __user *optval, return ret; } +static int rds_user_reset(struct rds_sock *rs, char __user *optval, int optlen) +{ + struct rds_reset reset; + struct rds_connection *conn; + + if (optlen != sizeof(struct rds_reset)) + return -EINVAL; + + if (copy_from_user(&reset, (struct rds_reset __user *)optval, + sizeof(struct rds_reset))) + return -EFAULT; + + conn = rds_conn_find(reset.src.s_addr, reset.dst.s_addr, + rs->rs_transport, reset.tos); + + if (conn) { + printk(KERN_NOTICE "Resetting RDS/IB connection " + "<%u.%u.%u.%u,%u.%u.%u.%u,%d>\n", + NIPQUAD(reset.src.s_addr), + NIPQUAD(reset.dst.s_addr), conn->c_tos); + rds_conn_drop(conn); + } + + return 0; +} + static int rds_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { @@ -300,6 +338,9 @@ static int rds_setsockopt(struct socket *sock, int level, int optname, case RDS_CONG_MONITOR: ret = rds_cong_monitor(rs, optval, optlen); break; + case RDS_CONN_RESET: + ret = rds_user_reset(rs, optval, optlen); + break; default: ret = -ENOPROTOOPT; } @@ -331,8 +372,8 @@ static int rds_getsockopt(struct socket *sock, int level, int optname, if (len < sizeof(int)) ret = -EINVAL; else - if (put_user(rs->rs_recverr, (int __user *) optval) || - put_user(sizeof(int), optlen)) + if (put_user(rs->rs_recverr, (int __user *) optval) + || put_user(sizeof(int), optlen)) ret = -EFAULT; else ret = 0; @@ -385,7 +426,7 @@ static struct proto rds_proto = { .obj_size = sizeof(struct rds_sock), }; -static const struct proto_ops rds_proto_ops = { +static struct proto_ops rds_proto_ops = { .family = AF_RDS, .owner = THIS_MODULE, .release = rds_release, @@ -406,13 +447,23 @@ static const struct proto_ops rds_proto_ops = { .sendpage = sock_no_sendpage, }; +static void rds_sock_destruct(struct sock *sk) +{ + struct rds_sock *rs = rds_sk_to_rs(sk); + + BUG_ON((&rs->rs_item != rs->rs_item.next || + &rs->rs_item != rs->rs_item.prev)); +} + static int __rds_create(struct socket *sock, struct sock *sk, int protocol) { + unsigned long flags; struct rds_sock *rs; sock_init_data(sock, sk); sock->ops = &rds_proto_ops; sk->sk_protocol = protocol; + sk->sk_destruct = rds_sock_destruct; rs = rds_sk_to_rs(sk); spin_lock_init(&rs->rs_lock); @@ -423,41 +474,79 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol) INIT_LIST_HEAD(&rs->rs_cong_list); spin_lock_init(&rs->rs_rdma_lock); rs->rs_rdma_keys = RB_ROOT; + rs->poison = 0xABABABAB; 
+ rs->rs_tos = 0; + rs->rs_conn = 0; + + if (rs->rs_bound_addr) + printk(KERN_CRIT "bound addr %x at create\n", rs->rs_bound_addr); - spin_lock_bh(&rds_sock_lock); + spin_lock_irqsave(&rds_sock_lock, flags); list_add_tail(&rs->rs_item, &rds_sock_list); rds_sock_count++; - spin_unlock_bh(&rds_sock_lock); + spin_unlock_irqrestore(&rds_sock_lock, flags); return 0; } -static int rds_create(struct net *net, struct socket *sock, int protocol, - int kern) +static int rds_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (sock->type != SOCK_SEQPACKET || protocol) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto); + sk = sk_alloc(net, AF_RDS, GFP_KERNEL, &rds_proto); if (!sk) return -ENOMEM; return __rds_create(sock, sk, protocol); } +void debug_sock_hold(struct sock *sk) +{ + struct rds_sock *rs = rds_sk_to_rs(sk); + if ((atomic_read(&sk->sk_refcnt) == 0)) { + printk(KERN_CRIT "zero refcnt on sock hold\n"); + WARN_ON(1); + } + if (rs->poison != 0xABABABAB) { + printk(KERN_CRIT "bad poison on hold %x\n", rs->poison); + WARN_ON(1); + } + sock_hold(sk); +} + + void rds_sock_addref(struct rds_sock *rs) { - sock_hold(rds_rs_to_sk(rs)); + debug_sock_hold(rds_rs_to_sk(rs)); } +void debug_sock_put(struct sock *sk) +{ + if ((atomic_read(&sk->sk_refcnt) == 0)) { + printk(KERN_CRIT "zero refcnt on sock put\n"); + WARN_ON(1); + } + if (atomic_dec_and_test(&sk->sk_refcnt)) { + struct rds_sock *rs = rds_sk_to_rs(sk); + if (rs->poison != 0xABABABAB) { + printk(KERN_CRIT "bad poison on put %x\n", rs->poison); + WARN_ON(1); + } + rs->poison = 0xDEADBEEF; + sk_free(sk); + } +} + + void rds_sock_put(struct rds_sock *rs) { - sock_put(rds_rs_to_sk(rs)); + debug_sock_put(rds_rs_to_sk(rs)); } -static const struct net_proto_family rds_family_ops = { +static struct net_proto_family rds_family_ops = { .family = AF_RDS, .create = rds_create, .owner = THIS_MODULE, @@ -468,14 +557,17 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len, struct rds_info_lengths *lens) { struct rds_sock *rs; + struct sock *sk; struct rds_incoming *inc; + unsigned long flags; unsigned int total = 0; len /= sizeof(struct rds_info_message); - spin_lock_bh(&rds_sock_lock); + spin_lock_irqsave(&rds_sock_lock, flags); list_for_each_entry(rs, &rds_sock_list, rs_item) { + sk = rds_rs_to_sk(rs); read_lock(&rs->rs_recv_lock); /* XXX too lazy to maintain counts.. 
*/ @@ -489,7 +581,7 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len, read_unlock(&rs->rs_recv_lock); } - spin_unlock_bh(&rds_sock_lock); + spin_unlock_irqrestore(&rds_sock_lock, flags); lens->nr = total; lens->each = sizeof(struct rds_info_message); @@ -501,10 +593,11 @@ static void rds_sock_info(struct socket *sock, unsigned int len, { struct rds_info_socket sinfo; struct rds_sock *rs; + unsigned long flags; len /= sizeof(struct rds_info_socket); - spin_lock_bh(&rds_sock_lock); + spin_lock_irqsave(&rds_sock_lock, flags); if (len < rds_sock_count) goto out; @@ -525,7 +618,7 @@ out: lens->nr = rds_sock_count; lens->each = sizeof(struct rds_info_socket); - spin_unlock_bh(&rds_sock_lock); + spin_unlock_irqrestore(&rds_sock_lock, flags); } static void rds_exit(void) diff --git a/net/rds/bind.c b/net/rds/bind.c index 2f6b3fcc79f81..9f3a9df0dc1b9 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -39,7 +39,7 @@ #define BIND_HASH_SIZE 1024 static struct hlist_head bind_hash_table[BIND_HASH_SIZE]; -static DEFINE_SPINLOCK(rds_bind_lock); +static DEFINE_RWLOCK(rds_bind_lock); static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port) { @@ -47,6 +47,9 @@ static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port) (BIND_HASH_SIZE - 1)); } +/* + * must hold either read or write lock (write lock for insert != NULL) + */ static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, struct rds_sock *insert) { @@ -56,30 +59,26 @@ static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, u64 cmp; u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); - rcu_read_lock(); - hlist_for_each_entry_rcu(rs, node, head, rs_bound_node) { + hlist_for_each_entry(rs, node, head, rs_bound_node) { cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | be16_to_cpu(rs->rs_bound_port); if (cmp == needle) { - rcu_read_unlock(); + rds_sock_addref(rs); return rs; } } - rcu_read_unlock(); if (insert) { /* * make sure our addr and port are set before - * we are added to the list, other people - * in rcu will find us as soon as the - * hlist_add_head_rcu is done + * we are added to the list. 
*/ insert->rs_bound_addr = addr; insert->rs_bound_port = port; rds_sock_addref(insert); - hlist_add_head_rcu(&insert->rs_bound_node, head); + hlist_add_head(&insert->rs_bound_node, head); } return NULL; } @@ -93,16 +92,20 @@ static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, struct rds_sock *rds_find_bound(__be32 addr, __be16 port) { struct rds_sock *rs; + unsigned long flags; + read_lock_irqsave(&rds_bind_lock, flags); rs = rds_bind_lookup(addr, port, NULL); + read_unlock_irqrestore(&rds_bind_lock, flags); - if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) - rds_sock_addref(rs); - else + if (rs && sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) { + rds_sock_put(rs); rs = NULL; + } rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, ntohs(port)); + return rs; } @@ -121,21 +124,25 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) last = rover - 1; } - spin_lock_irqsave(&rds_bind_lock, flags); + write_lock_irqsave(&rds_bind_lock, flags); do { + struct rds_sock *rrs; if (rover == 0) rover++; - if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) { + rrs = rds_bind_lookup(addr, cpu_to_be16(rover), rs); + + if (!rrs) { *port = rs->rs_bound_port; ret = 0; rdsdebug("rs %p binding to %pI4:%d\n", rs, &addr, (int)ntohs(*port)); break; - } + } else + rds_sock_put(rrs); } while (rover++ != last); - spin_unlock_irqrestore(&rds_bind_lock, flags); + write_unlock_irqrestore(&rds_bind_lock, flags); return ret; } @@ -144,19 +151,19 @@ void rds_remove_bound(struct rds_sock *rs) { unsigned long flags; - spin_lock_irqsave(&rds_bind_lock, flags); + write_lock_irqsave(&rds_bind_lock, flags); if (rs->rs_bound_addr) { rdsdebug("rs %p unbinding from %pI4:%d\n", rs, &rs->rs_bound_addr, ntohs(rs->rs_bound_port)); - hlist_del_init_rcu(&rs->rs_bound_node); + hlist_del_init(&rs->rs_bound_node); rds_sock_put(rs); rs->rs_bound_addr = 0; } - spin_unlock_irqrestore(&rds_bind_lock, flags); + write_unlock_irqrestore(&rds_bind_lock, flags); } int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) @@ -196,9 +203,5 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) out: release_sock(sk); - - /* we might have called rds_remove_bound on error */ - if (ret) - synchronize_rcu(); return ret; } diff --git a/net/rds/cong.c b/net/rds/cong.c index 9347f21a0e225..0d5456c44a9bd 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -30,10 +30,10 @@ * SOFTWARE. 
* */ -#include #include #include -#include + +#include #include "rds.h" @@ -210,9 +210,12 @@ int rds_cong_get_maps(struct rds_connection *conn) return 0; } -void __rds_cong_queue_updates(struct rds_cong_map *map) +void rds_cong_queue_updates(struct rds_cong_map *map) { struct rds_connection *conn; + unsigned long flags; + + spin_lock_irqsave(&rds_cong_lock, flags); list_for_each_entry(conn, &map->m_conn_list, c_map_item) { if (!test_and_set_bit(0, &conn->c_map_queued)) { @@ -220,17 +223,10 @@ void __rds_cong_queue_updates(struct rds_cong_map *map) queue_delayed_work(rds_wq, &conn->c_send_w, 0); } } -} -void rds_cong_queue_updates(struct rds_cong_map *map) -{ - unsigned long flags; - spin_lock_irqsave(&rds_cong_lock, flags); - __rds_cong_queue_updates(map); spin_unlock_irqrestore(&rds_cong_lock, flags); } - void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask) { rdsdebug("waking map %p for %pI4\n", @@ -302,7 +298,7 @@ void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port) i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS; off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS; - __clear_bit_le(off, (void *)map->m_page_addrs[i]); + __set_bit_le(off, (void *)map->m_page_addrs[i]); } static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port) @@ -338,12 +334,12 @@ void rds_cong_remove_socket(struct rds_sock *rs) /* update congestion map for now-closed port */ spin_lock_irqsave(&rds_cong_lock, flags); map = rds_cong_tree_walk(rs->rs_bound_addr, NULL); + spin_unlock_irqrestore(&rds_cong_lock, flags); if (map && rds_cong_test_bit(map, rs->rs_bound_port)) { rds_cong_clear_bit(map, rs->rs_bound_port); - __rds_cong_queue_updates(map); + rds_cong_queue_updates(map); } - spin_unlock_irqrestore(&rds_cong_lock, flags); } int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, diff --git a/net/rds/connection.c b/net/rds/connection.c index 9334d892366eb..71edccf40fc51 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -32,11 +32,11 @@ */ #include #include -#include #include #include "rds.h" #include "loop.h" +#include "tcp.h" #define RDS_CONNECTION_HASH_BITS 12 #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) @@ -65,13 +65,15 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) /* rcu read lock must be held or the connection spinlock */ static struct rds_connection *rds_conn_lookup(struct hlist_head *head, __be32 laddr, __be32 faddr, - struct rds_transport *trans) + struct rds_transport *trans, + u8 tos) { struct rds_connection *conn, *ret = NULL; struct hlist_node *pos; hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) { if (conn->c_faddr == faddr && conn->c_laddr == laddr && + conn->c_tos == tos && conn->c_trans == trans) { ret = conn; break; @@ -88,7 +90,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head, * and receiving over this connection again in the future. It is up to * the transport to have serialized this call with its send and recv. 
*/ -static void rds_conn_reset(struct rds_connection *conn) +void rds_conn_reset(struct rds_connection *conn) { rdsdebug("connection %pI4 to %pI4 reset\n", &conn->c_laddr, &conn->c_faddr); @@ -113,6 +115,7 @@ static void rds_conn_reset(struct rds_connection *conn) */ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp, + u8 tos, int is_outgoing) { struct rds_connection *conn, *parent = NULL; @@ -122,9 +125,12 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, int ret; rcu_read_lock(); - conn = rds_conn_lookup(head, laddr, faddr, trans); - if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && - !is_outgoing) { + conn = rds_conn_lookup(head, laddr, faddr, trans, tos); + if (conn + && conn->c_loopback + && conn->c_trans != &rds_loop_transport + && laddr == faddr + && !is_outgoing) { /* This is a looped back IB connection, and we're * called by the code handling the incoming connect. * We need a second connection object into which we @@ -136,12 +142,14 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, if (conn) goto out; - conn = kmem_cache_zalloc(rds_conn_slab, gfp); + conn = kmem_cache_alloc(rds_conn_slab, gfp); if (!conn) { conn = ERR_PTR(-ENOMEM); goto out; } + memset(conn, 0, sizeof(*conn)); + INIT_HLIST_NODE(&conn->c_hash_node); conn->c_laddr = laddr; conn->c_faddr = faddr; @@ -152,6 +160,8 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, INIT_LIST_HEAD(&conn->c_send_queue); INIT_LIST_HEAD(&conn->c_retrans); + conn->c_tos = tos; + ret = rds_cong_get_maps(conn); if (ret) { kmem_cache_free(rds_conn_slab, conn); @@ -187,10 +197,17 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, } atomic_set(&conn->c_state, RDS_CONN_DOWN); + conn->c_send_gen = 0; conn->c_reconnect_jiffies = 0; + conn->c_reconnect_start = get_seconds(); + conn->c_reconnect_warn = 1; + conn->c_reconnect_drops = 0; + conn->c_reconnect_err = 0; + INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker); INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker); INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker); + INIT_DELAYED_WORK(&conn->c_hb_w, rds_hb_worker); INIT_WORK(&conn->c_down_w, rds_shutdown_worker); mutex_init(&conn->c_cm_lock); conn->c_flags = 0; @@ -223,7 +240,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, /* Creating normal conn */ struct rds_connection *found; - found = rds_conn_lookup(head, laddr, faddr, trans); + found = rds_conn_lookup(head, laddr, faddr, trans, tos); if (found) { trans->conn_free(conn->c_transport_data); kmem_cache_free(rds_conn_slab, conn); @@ -241,19 +258,35 @@ out: } struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, - struct rds_transport *trans, gfp_t gfp) + struct rds_transport *trans, + u8 tos, gfp_t gfp) { - return __rds_conn_create(laddr, faddr, trans, gfp, 0); + return __rds_conn_create(laddr, faddr, trans, gfp, tos, 0); } EXPORT_SYMBOL_GPL(rds_conn_create); struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, - struct rds_transport *trans, gfp_t gfp) + struct rds_transport *trans, + u8 tos, gfp_t gfp) { - return __rds_conn_create(laddr, faddr, trans, gfp, 1); + return __rds_conn_create(laddr, faddr, trans, gfp, tos, 1); } EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); +struct rds_connection *rds_conn_find(__be32 laddr, __be32 faddr, + struct rds_transport *trans, u8 tos) +{ + struct rds_connection *conn; + struct hlist_head 
*head = rds_conn_bucket(laddr, faddr); + + rcu_read_lock(); + conn = rds_conn_lookup(head, laddr, faddr, trans, tos); + rcu_read_unlock(); + + return conn; +} +EXPORT_SYMBOL_GPL(rds_conn_find); + void rds_conn_shutdown(struct rds_connection *conn) { /* shut it down unless it's down already */ @@ -277,6 +310,8 @@ void rds_conn_shutdown(struct rds_connection *conn) wait_event(conn->c_waitq, !test_bit(RDS_IN_XMIT, &conn->c_flags)); + wait_event(conn->c_waitq, + !test_bit(RDS_RECV_REFILL, &conn->c_flags)); conn->c_trans->conn_shutdown(conn); rds_conn_reset(conn); @@ -478,6 +513,7 @@ static int rds_conn_info_visitor(struct rds_connection *conn, cinfo->next_rx_seq = conn->c_next_rx_seq; cinfo->laddr = conn->c_laddr; cinfo->faddr = conn->c_faddr; + cinfo->tos = conn->c_tos; strncpy(cinfo->transport, conn->c_trans->t_name, sizeof(cinfo->transport)); cinfo->flags = 0; @@ -491,6 +527,8 @@ static int rds_conn_info_visitor(struct rds_connection *conn, rds_conn_info_set(cinfo->flags, atomic_read(&conn->c_state) == RDS_CONN_UP, CONNECTED); + rds_conn_info_set(cinfo->flags, conn->c_pending_flush, + ERROR); return 1; } @@ -540,6 +578,23 @@ void rds_conn_exit(void) */ void rds_conn_drop(struct rds_connection *conn) { + unsigned long now = get_seconds(); + + if (rds_conn_state(conn) == RDS_CONN_UP) { + conn->c_reconnect_start = now; + conn->c_reconnect_warn = 1; + conn->c_reconnect_drops = 0; + conn->c_reconnect_err = 0; + } else if ((conn->c_reconnect_warn) && + (now - conn->c_reconnect_start > 60)) { + printk(KERN_INFO "RDS/IB: re-connect to %u.%u.%u.%u is " + "stalling for more than 1 min...(drops=%u err=%d)\n", + NIPQUAD(conn->c_faddr), conn->c_reconnect_drops, + conn->c_reconnect_err); + conn->c_reconnect_warn = 0; + } + conn->c_reconnect_drops++; + atomic_set(&conn->c_state, RDS_CONN_ERROR); queue_work(rds_wq, &conn->c_down_w); } diff --git a/net/rds/ib.c b/net/rds/ib.c index 3b83086bcc304..df67d497d6d99 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -36,22 +36,52 @@ #include #include #include +#include #include -#include +#include +#include +#include +#include #include "rds.h" #include "ib.h" +#include "tcp.h" +#include -static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; -unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */ +unsigned int rds_ib_fmr_1m_pool_size = RDS_FMR_1M_POOL_SIZE; +unsigned int rds_ib_fmr_8k_pool_size = RDS_FMR_8K_POOL_SIZE; unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; - -module_param(fmr_pool_size, int, 0444); -MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA"); -module_param(fmr_message_size, int, 0444); -MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer"); +unsigned int rds_ib_apm_enabled = 0; +unsigned int rds_ib_apm_fallback = 1; +unsigned int rds_ib_haip_enabled = 0; +unsigned int rds_ib_haip_fallback = 1; +unsigned int rds_ib_haip_hca_failover_enabled = 1; +unsigned int rds_ib_apm_timeout = RDS_IB_DEFAULT_TIMEOUT; +unsigned int rds_ib_rnr_retry_count = RDS_IB_DEFAULT_RNR_RETRY_COUNT; +unsigned int rds_ib_cq_balance_enabled = 1; + +module_param(rds_ib_fmr_1m_pool_size, int, 0444); +MODULE_PARM_DESC(rds_ib_fmr_1m_pool_size, " Max number of 1m fmr per HCA"); +module_param(rds_ib_fmr_8k_pool_size, int, 0444); +MODULE_PARM_DESC(rds_ib_fmr_8k_pool_size, " Max number of 8k fmr per HCA"); module_param(rds_ib_retry_count, int, 0444); MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); +module_param(rds_ib_apm_enabled, int, 0444); +MODULE_PARM_DESC(rds_ib_apm_enabled, 
" APM Enabled"); +module_param(rds_ib_haip_enabled, int, 0444); +MODULE_PARM_DESC(rds_ib_haip_enabled, " High Availability IP enabled"); +module_param(rds_ib_apm_timeout, int, 0444); +MODULE_PARM_DESC(rds_ib_apm_timeout, " APM timeout"); +module_param(rds_ib_rnr_retry_count, int, 0444); +MODULE_PARM_DESC(rds_ib_rnr_retry_count, " QP rnr retry count"); +module_param(rds_ib_apm_fallback, int, 0444); +MODULE_PARM_DESC(rds_ib_apm_fallback, " APM failback enabled"); +module_param(rds_ib_haip_fallback, int, 0444); +MODULE_PARM_DESC(rds_ib_haip_fallback, " HAIP failback Enabled"); +module_param(rds_ib_haip_hca_failover_enabled, int, 0444); +MODULE_PARM_DESC(rds_ib_haip_hca_failover_enabled, " HAIP HCA failover Enabled"); +module_param(rds_ib_cq_balance_enabled, int, 0444); +MODULE_PARM_DESC(rds_ib_cq_balance_enabled, " CQ load balance Enabled"); /* * we have a clumsy combination of RCU and a rwsem protecting this list @@ -65,7 +95,15 @@ struct list_head rds_ib_devices; DEFINE_SPINLOCK(ib_nodev_conns_lock); LIST_HEAD(ib_nodev_conns); -static void rds_ib_nodev_connect(void) +struct workqueue_struct *rds_aux_wq; + +struct socket *rds_ib_inet_socket; + +static struct rds_ib_port *ip_config; +static u8 ip_port_cnt = 0; +static u8 ip_port_max; + +void rds_ib_nodev_connect(void) { struct rds_ib_connection *ic; @@ -75,7 +113,7 @@ static void rds_ib_nodev_connect(void) spin_unlock(&ib_nodev_conns_lock); } -static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev) +void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev) { struct rds_ib_connection *ic; unsigned long flags; @@ -96,18 +134,24 @@ static void rds_ib_dev_free(struct work_struct *work) struct rds_ib_device *rds_ibdev = container_of(work, struct rds_ib_device, free_work); - if (rds_ibdev->mr_pool) - rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); + if (rds_ibdev->mr_8k_pool) + rds_ib_destroy_mr_pool(rds_ibdev->mr_8k_pool); + if (rds_ibdev->mr_1m_pool) + rds_ib_destroy_mr_pool(rds_ibdev->mr_1m_pool); if (rds_ibdev->mr) ib_dereg_mr(rds_ibdev->mr); if (rds_ibdev->pd) ib_dealloc_pd(rds_ibdev->pd); + kfree(rds_ibdev->srq); list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { list_del(&i_ipaddr->list); kfree(i_ipaddr); } + if (rds_ibdev->vector_load) + kfree(rds_ibdev->vector_load); + kfree(rds_ibdev); } @@ -118,82 +162,6 @@ void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) queue_work(rds_wq, &rds_ibdev->free_work); } -static void rds_ib_add_one(struct ib_device *device) -{ - struct rds_ib_device *rds_ibdev; - struct ib_device_attr *dev_attr; - - /* Only handle IB (no iWARP) devices */ - if (device->node_type != RDMA_NODE_IB_CA) - return; - - dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); - if (!dev_attr) - return; - - if (ib_query_device(device, dev_attr)) { - rdsdebug("Query device failed for %s\n", device->name); - goto free_attr; - } - - rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL, - ibdev_to_node(device)); - if (!rds_ibdev) - goto free_attr; - - spin_lock_init(&rds_ibdev->spinlock); - atomic_set(&rds_ibdev->refcount, 1); - INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); - - rds_ibdev->max_wrs = dev_attr->max_qp_wr; - rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); - - rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32; - rds_ibdev->max_fmrs = dev_attr->max_fmr ? 
- min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : - fmr_pool_size; - - rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom; - rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom; - - rds_ibdev->dev = device; - rds_ibdev->pd = ib_alloc_pd(device); - if (IS_ERR(rds_ibdev->pd)) { - rds_ibdev->pd = NULL; - goto put_dev; - } - - rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(rds_ibdev->mr)) { - rds_ibdev->mr = NULL; - goto put_dev; - } - - rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); - if (IS_ERR(rds_ibdev->mr_pool)) { - rds_ibdev->mr_pool = NULL; - goto put_dev; - } - - INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); - INIT_LIST_HEAD(&rds_ibdev->conn_list); - - down_write(&rds_ib_devices_lock); - list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices); - up_write(&rds_ib_devices_lock); - atomic_inc(&rds_ibdev->refcount); - - ib_set_client_data(device, &rds_ib_client, rds_ibdev); - atomic_inc(&rds_ibdev->refcount); - - rds_ib_nodev_connect(); - -put_dev: - rds_ib_dev_put(rds_ibdev); -free_attr: - kfree(dev_attr); -} - /* * New connections use this to find the device to associate with the * connection. It's not in the fast path so we're not concerned about the @@ -229,7 +197,7 @@ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device) * * This can be called at any time and can be racing with any other RDS path. */ -static void rds_ib_remove_one(struct ib_device *device) +void rds_ib_remove_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; @@ -292,6 +260,9 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, iinfo->max_recv_wr = ic->i_recv_ring.w_nr; iinfo->max_send_sge = rds_ibdev->max_sge; rds_ib_get_mr_info(rds_ibdev, iinfo); + iinfo->tos = ic->conn->c_tos; + iinfo->sl = ic->i_sl; + iinfo->cache_allocs = atomic_read(&ic->i_cache_allocs); } return 1; } @@ -325,9 +296,9 @@ static int rds_ib_laddr_check(__be32 addr) /* Create a CMA ID and try to bind it. This catches both * IB and iWARP capable NICs. */ - cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC); + cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); if (IS_ERR(cm_id)) - return PTR_ERR(cm_id); + return -EADDRNOTAVAIL; memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; @@ -337,7 +308,7 @@ static int rds_ib_laddr_check(__be32 addr) ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); /* due to this, we will claim to support iWARP devices unless we check node_type. 
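The comment above describes the probe trick used by rds_ib_laddr_check(): bind a throw-away cm_id to the address and then check the bound device's node_type to exclude iWARP. A stand-alone hedged sketch of that pattern, including the teardown that falls outside the quoted hunk, is shown here; it is not part of the merged code and the demo_* name is hypothetical.

#include <linux/err.h>
#include <linux/in.h>
#include <linux/string.h>
#include <rdma/rdma_cm.h>

/* Returns 0 if addr maps to a local IB (non-iWARP) device, else -EADDRNOTAVAIL. */
static int demo_laddr_is_ib(__be32 addr)
{
	struct rdma_cm_id *cm_id;
	struct sockaddr_in sin;
	int ret;

	cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
	if (IS_ERR(cm_id))
		return -EADDRNOTAVAIL;

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = addr;

	/* Binding resolves addr to a local RDMA device, IB or iWARP alike. */
	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
	if (ret || !cm_id->device ||
	    cm_id->device->node_type != RDMA_NODE_IB_CA)
		ret = -EADDRNOTAVAIL;

	rdma_destroy_id(cm_id);
	return ret;
}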
*/ - if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA) + if (ret || !cm_id->device || cm_id->device->node_type != RDMA_NODE_IB_CA) ret = -EADDRNOTAVAIL; rdsdebug("addr %pI4 ret %d node type %d\n", @@ -349,6 +320,777 @@ static int rds_ib_laddr_check(__be32 addr) return ret; } +static u8 rds_ib_port_lookup(struct rds_ib_device *rds_ibdev, u8 port_num) +{ + u8 i; + + for (i = 1; i <= ip_port_cnt; i++) { + if (ip_config[i].rds_ibdev == rds_ibdev && + ip_config[i].port_num == port_num) + return i; + } + return 0; +} + +static u8 rds_ib_get_failover_port(u8 port) +{ + u8 i; + + for (i = 1; i <= ip_port_cnt; i++) { + if (i != port && + ip_config[i].rds_ibdev == ip_config[port].rds_ibdev && + ip_config[i].port_state == RDS_IB_PORT_UP) { + return i; + } + } + + if (rds_ib_haip_hca_failover_enabled) { + for (i = 1; i <= ip_port_cnt; i++) { + if (i != port && + ip_config[i].port_state == RDS_IB_PORT_UP) { + return i; + } + } + } + + return 0; +} + +static void rds_ib_send_gratuitous_arp(struct net_device *out_dev, + unsigned char *dev_addr, + __be32 ip_addr) +{ + arp_send(ARPOP_REQUEST, ETH_P_ARP, + ip_addr, out_dev, + ip_addr, NULL, + dev_addr, NULL); +} + +static int rds_ib_set_ip(struct net_device *out_dev, + unsigned char *dev_addr, + char *if_name, + __be32 addr, + __be32 bcast, + __be32 mask) +{ + struct ifreq *ir; + struct sockaddr_in *sin; + struct page *page; + int ret = 0; + + page = alloc_page(GFP_HIGHUSER); + if (!page) { + printk(KERN_ERR "RDS/IB: alloc_page failed .. NO MEM\n"); + return 1; + } + + ir = (struct ifreq *)kmap(page); + memset(ir, 0, sizeof(struct ifreq)); + sin = (struct sockaddr_in *)&ir->ifr_addr; + sin->sin_family = AF_INET; + + strcpy(ir->ifr_ifrn.ifrn_name, if_name); + + sin->sin_addr.s_addr = addr; + ret = inet_ioctl(rds_ib_inet_socket, SIOCSIFADDR, (unsigned long) ir); + if (ret && addr) { + printk(KERN_ERR + "RDS/IB: inet_ioctl(SIOCSIFADDR) on %s failed (%d)\n", + if_name, ret); + goto out; + } + + if (!addr) + goto out; + + sin->sin_addr.s_addr = bcast; + ret = inet_ioctl(rds_ib_inet_socket, SIOCSIFBRDADDR, + (unsigned long) ir); + if (ret) { + printk(KERN_ERR + "RDS/IB: inet_ioctl(SIOCSIFBRDADDR) on %s failed (%d)\n", + if_name, ret); + goto out; + } + + sin->sin_addr.s_addr = mask; + ret = inet_ioctl(rds_ib_inet_socket, SIOCSIFNETMASK, + (unsigned long) ir); + if (ret) { + printk(KERN_ERR + "RDS/IB: inet_ioctl(SIOCSIFBRDADDR) on %s failed (%d)\n", + if_name, ret); + goto out; + } + + rds_ib_send_gratuitous_arp(out_dev, dev_addr, addr); + +out: + kunmap(page); + __free_page(page); + + return ret; +} + +static int rds_ib_addr_exist(struct net_device *ndev, + __be32 addr, + char *if_name) +{ + struct in_device *in_dev; + struct in_ifaddr *ifa; + struct in_ifaddr **ifap; + int found = 0; + + in_dev = in_dev_get(ndev); + if (in_dev) { + for (ifap = &in_dev->ifa_list; (ifa = *ifap); + ifap = &ifa->ifa_next) { + if (ifa->ifa_address == addr) { + found = 1; + if (if_name) + strcpy(if_name, ifa->ifa_label); + break; + } + } + } + in_dev_put(in_dev); + + return found; +} + +static int rds_ib_move_ip(char *from_dev, + char *to_dev, + u8 from_port, + u8 to_port, + u8 arp_port, + __be32 addr, + __be32 bcast, + __be32 mask, + int failover) +{ + struct ifreq *ir; + struct sockaddr_in *sin; + struct page *page; + char from_dev2[2*IFNAMSIZ + 1]; + char to_dev2[2*IFNAMSIZ + 1]; + int ret = 0; + u8 active_port; + struct in_device *in_dev; + + page = alloc_page(GFP_HIGHUSER); + if (!page) { + printk(KERN_ERR "RDS/IB: alloc_page failed .. 
NO MEM\n"); + return 1; + } + + ir = (struct ifreq *)kmap(page); + memset(ir, 0, sizeof(struct ifreq)); + sin = (struct sockaddr_in *)&ir->ifr_addr; + sin->sin_family = AF_INET; + + /* Set the primary IP if it hasn't been set */ + if (ip_config[to_port].ip_addr) { + strcpy(ir->ifr_ifrn.ifrn_name, ip_config[to_port].dev->name); + ret = inet_ioctl(rds_ib_inet_socket, SIOCGIFADDR, + (unsigned long) ir); + if (ret == -EADDRNOTAVAIL) { + sin->sin_addr.s_addr = ip_config[to_port].ip_addr; + ret = inet_ioctl(rds_ib_inet_socket, SIOCSIFADDR, + (unsigned long) ir); + if (ret) { + printk(KERN_ERR + "RDS/IB: inet_ioctl(SIOCSIFADDR) " + "failed (%d)\n", ret); + goto out; + } + } else if (ret) { + printk(KERN_ERR + "RDS/IB: inet_ioctl(SIOCGIFADDR) " + "failed (%d)\n", ret); + goto out; + } + } + + if (failover) { + in_dev = in_dev_get(ip_config[to_port].dev); + if (in_dev && !in_dev->ifa_list) { + strcpy(to_dev2, to_dev); + } else { + strcpy(to_dev2, to_dev); + strcat(to_dev2, ":"); + strcat(to_dev2, from_dev); + to_dev2[IFNAMSIZ-1] = 0; + } + in_dev_put(in_dev); + + /* Bail if IP already exists on target port */ + if (rds_ib_addr_exist(ip_config[to_port].dev, addr, NULL)) + goto out; + + active_port = ip_config[from_port].ip_active_port; + if (active_port == from_port) { + strcpy(from_dev2, from_dev); + } else if (ip_config[active_port].port_state == + RDS_IB_PORT_UP) { + if (!rds_ib_addr_exist(ip_config[active_port].dev, + addr, from_dev2)) { + strcpy(from_dev2, + ip_config[active_port].dev->name); + strcat(from_dev2, ":"); + strcat(from_dev2, from_dev); + } + } else { + strcpy(from_dev2, from_dev); + } + } else { + if (!rds_ib_addr_exist(ip_config[from_port].dev, + addr, from_dev2)) { + strcpy(from_dev2, from_dev); + strcat(from_dev2, ":"); + strcat(from_dev2, to_dev); + from_dev2[IFNAMSIZ-1] = 0; + } + strcpy(to_dev2, to_dev); + } + + /* Clear the IP on old port */ + ret = rds_ib_set_ip(NULL, NULL, from_dev2, 0, 0, 0); + + /* Set the IP on new port */ + ret = rds_ib_set_ip(ip_config[arp_port].dev, + ip_config[to_port].dev->dev_addr, + to_dev2, addr, bcast, mask); + + if (ret) { + printk(KERN_NOTICE + "RDS/IP: failed to move IP %u.%u.%u.%u " + "from %s to %s\n", + NIPQUAD(addr), from_dev2, to_dev2); + } else { + printk(KERN_NOTICE + "RDS/IB: IP %u.%u.%u.%u migrated from %s to %s\n", + NIPQUAD(addr), from_dev2, to_dev2); + } + +out: + kunmap(page); + __free_page(page); + + return ret; +} + +static void rds_ib_init_port(struct rds_ib_device *rds_ibdev, + struct net_device *net_dev, + u8 port_num) +{ + if (ip_port_cnt++ > ip_port_max) { + printk(KERN_ERR "RDS/IB: Exceeded max ports (%d)\n", + ip_port_max); + return; + } + + ip_config[ip_port_cnt].port_num = port_num; + ip_config[ip_port_cnt].dev = net_dev; + ip_config[ip_port_cnt].rds_ibdev = rds_ibdev; + ip_config[ip_port_cnt].ip_active_port = 0; + strcpy(ip_config[ip_port_cnt].if_name, net_dev->name); + + if (net_dev->operstate == IF_OPER_UP) + ip_config[ip_port_cnt].port_state = RDS_IB_PORT_UP; + else + ip_config[ip_port_cnt].port_state = RDS_IB_PORT_DOWN; +} + +static void rds_ib_set_port(struct rds_ib_device *rds_ibdev, + struct net_device *net_dev, + char *if_name, + u8 port_num, + __be32 ip_addr, + __be32 ip_bcast, + __be32 ip_mask) +{ + unsigned int idx; + u8 port = rds_ib_port_lookup(rds_ibdev, port_num); + + if (!strcmp(net_dev->name, if_name)) { + strcpy(ip_config[port].if_name, if_name); + ip_config[port].ip_addr = ip_addr; + ip_config[port].ip_bcast = ip_bcast; + ip_config[port].ip_mask = ip_mask; + ip_config[port].ip_active_port = port; + 
} else { + idx = ip_config[port].alias_cnt++; + strcpy(ip_config[port].aliases[idx].if_name, if_name); + ip_config[port].aliases[idx].ip_addr = ip_addr; + ip_config[port].aliases[idx].ip_bcast = ip_bcast; + ip_config[port].aliases[idx].ip_mask = ip_mask; + } +} + +static void rds_ib_do_failover(u8 from_port, u8 to_port, u8 arp_port) +{ + u8 j; + int ret; + + if (!ip_config[from_port].ip_addr) + return; + + if (!to_port) + to_port = rds_ib_get_failover_port(from_port); + + if (!arp_port) + arp_port = to_port; + + if (to_port) { + if (!rds_ib_move_ip( + ip_config[from_port].if_name, + ip_config[to_port].if_name, + from_port, + to_port, + arp_port, + ip_config[from_port].ip_addr, + ip_config[from_port].ip_bcast, + ip_config[from_port].ip_mask, + 1)) { + + ip_config[from_port].ip_active_port = to_port; + for (j = 0; j < ip_config[from_port]. + alias_cnt; j++) { + + ret = rds_ib_move_ip( + ip_config[from_port]. + aliases[j].if_name, + ip_config[to_port].if_name, + from_port, + to_port, + arp_port, + ip_config[from_port]. + aliases[j].ip_addr, + ip_config[from_port]. + aliases[j].ip_bcast, + ip_config[from_port]. + aliases[j].ip_mask, + 1); + } + } + } +} + +static void rds_ib_do_failback(u8 port) +{ + u8 ip_active_port = ip_config[port].ip_active_port; + u8 j; + int ret; + + if (!ip_config[port].ip_addr) + return; + + if (port != ip_config[port].ip_active_port) { + if (!rds_ib_move_ip( + ip_config[ip_active_port].if_name, + ip_config[port].if_name, + ip_active_port, + port, + ip_active_port, + ip_config[port].ip_addr, + ip_config[port].ip_bcast, + ip_config[port].ip_mask, + 0)) { + + ip_config[port].ip_active_port = port; + for (j = 0; j < ip_config[port]. + alias_cnt; j++) { + + ret = rds_ib_move_ip( + ip_config[ip_active_port]. + if_name, + ip_config[port]. + aliases[j].if_name, + ip_active_port, + port, + ip_active_port, + ip_config[port]. + aliases[j].ip_addr, + ip_config[port]. + aliases[j].ip_bcast, + ip_config[port]. 
+ aliases[j].ip_mask, + 0); + } + } + } +} + +static void rds_ib_failover(struct work_struct *_work) +{ + struct rds_ib_port_ud_work *work = + container_of(_work, struct rds_ib_port_ud_work, work.work); + int ret; + u8 i; + + for (i = 1; i <= ip_port_cnt; i++) { + if (i != work->port && + ip_config[i].port_state == RDS_IB_PORT_DOWN && + ip_config[i].ip_active_port == work->port) { + rds_ib_do_failover(i, 0, 0); + } + } + + if (ip_config[work->port].ip_addr) + rds_ib_do_failover(work->port, 0, 0); + + if (ip_config[work->port].ip_active_port == work->port) { + ret = rds_ib_set_ip(NULL, NULL, + ip_config[work->port].if_name, + 0, 0, 0); + } + + kfree(work); +} + +static void rds_ib_failback(struct work_struct *_work) +{ + struct rds_ib_port_ud_work *work = + container_of(_work, struct rds_ib_port_ud_work, work.work); + u8 i, ip_active_port, port = work->port; + + ip_active_port = ip_config[port].ip_active_port; + + rds_ib_do_failback(port); + + for (i = 1; i <= ip_port_cnt; i++) { + if (i == port || + ip_config[i].port_state == RDS_IB_PORT_UP || + !ip_config[i].ip_addr) + continue; + + if (ip_config[i].ip_active_port == i) { + rds_ib_do_failover(i, 0, ip_active_port); + } else if (ip_config[i].ip_active_port == port) { + rds_ib_do_failover(i, port, ip_active_port); + } else if (ip_config[ip_config[i].ip_active_port].port_state == + RDS_IB_PORT_DOWN) { + rds_ib_do_failover(i, 0, ip_active_port); + } else if (ip_config[port].rds_ibdev == + ip_config[i].rds_ibdev) { + rds_ib_do_failover(i, port, ip_active_port); + } + } + + if (ip_active_port != ip_config[port].ip_active_port) { + for (i = 1; i <= ip_port_cnt; i++) { + if (ip_config[i].port_state == RDS_IB_PORT_DOWN && + i != ip_active_port && ip_config[i].ip_addr && + ip_config[i].ip_active_port == ip_active_port) { + + rds_ib_do_failover(i, ip_active_port, + ip_active_port); + } + } + } + + kfree(work); +} + +static int rds_ib_ip_config_down(void) +{ + u8 i; + + for (i = 1; i <= ip_port_cnt; i++) { + if (ip_config[i].port_state == RDS_IB_PORT_UP) + return 0; + } + + return 1; +} + +static void rds_ib_net_failback(struct work_struct *_work) +{ + struct rds_ib_port_ud_work *work = + container_of(_work, struct rds_ib_port_ud_work, work.work); + struct in_device *in_dev; + + in_dev = in_dev_get(ip_config[work->port].dev); + if (in_dev && !in_dev->ifa_list && + ip_config[work->port].ip_addr && + work->timeout > 0) { + INIT_DELAYED_WORK(&work->work, rds_ib_net_failback); + work->timeout -= msecs_to_jiffies(100); + queue_delayed_work(rds_wq, &work->work, + msecs_to_jiffies(100)); + } else { + rds_ib_failback((struct work_struct *)&work->work); + } + + in_dev_put(in_dev); +} + +static void rds_ib_event_handler(struct ib_event_handler *handler, + struct ib_event *event) +{ + struct rds_ib_device *rds_ibdev = + container_of(handler, typeof(*rds_ibdev), event_handler); + u8 port = rds_ib_port_lookup(rds_ibdev, event->element.port_num); + struct rds_ib_port_ud_work *work; + + if (!rds_ib_haip_enabled || !ip_port_cnt) + return; + + if (event->event != IB_EVENT_PORT_ACTIVE && + event->event != IB_EVENT_PORT_ERR) + return; + + printk(KERN_NOTICE "RDS/IB: %s/port_%d/%s is %s\n", + rds_ibdev->dev->name, + event->element.port_num, + ip_config[port].if_name, + (event->event == IB_EVENT_PORT_ACTIVE) ? 
+ "ACTIVE" : "ERROR"); + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) { + printk(KERN_ERR "RDS/IB: failed to allocate port work\n"); + return; + } + + work->port = port; + + if (event->event == IB_EVENT_PORT_ACTIVE) { + if (rds_ib_haip_fallback) { + INIT_DELAYED_WORK(&work->work, rds_ib_failback); + queue_delayed_work(rds_wq, &work->work, 0); + } else + kfree(work); + ip_config[port].port_state = RDS_IB_PORT_UP; + } else { + INIT_DELAYED_WORK(&work->work, rds_ib_failover); + queue_delayed_work(rds_wq, &work->work, 0); + ip_config[port].port_state = RDS_IB_PORT_DOWN; + } +} + +static void rds_ib_dump_ip_config(void) +{ + int i, j; + + if (!rds_ib_haip_enabled) + return; + + printk(KERN_ERR "RDS/IB: IP configuration ...\n"); + for (i = 1; i <= ip_port_cnt; i++) { + printk(KERN_ERR "RDS/IB: %s/port_%d/%s: " + "IP %d.%d.%d.%d/%d.%d.%d.%d/%d.%d.%d.%d " + "state %s\n", + ip_config[i].rds_ibdev->dev->name, + ip_config[i].port_num, + ip_config[i].if_name, + NIPQUAD(ip_config[i].ip_addr), + NIPQUAD(ip_config[i].ip_bcast), + NIPQUAD(ip_config[i].ip_mask), + (ip_config[i].port_state == + RDS_IB_PORT_UP ? "UP" : "DOWN")); + + for (j = 0; j < ip_config[i].alias_cnt; j++) { + printk(KERN_ERR "Alias %s " + "IP %d.%d.%d.%d/%d.%d.%d.%d/%d.%d.%d.%d\n", + ip_config[i].aliases[j].if_name, + NIPQUAD(ip_config[i]. + aliases[j].ip_addr), + NIPQUAD(ip_config[i]. + aliases[j].ip_bcast), + NIPQUAD(ip_config[i]. + aliases[j].ip_mask)); + } + } +} + +static int rds_ib_ip_config_init(void) +{ + struct net_device *dev; + struct in_ifaddr *ifa; + struct in_ifaddr **ifap; + struct in_device *in_dev; + struct rds_ib_device *rds_ibdev; + u8 port_num; + int ret = 0; + + if (!rds_ib_haip_enabled) + return 0; + + ip_port_max = 0; + rcu_read_lock(); + list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) { + ip_port_max += rds_ibdev->dev->phys_port_cnt; + } + rcu_read_unlock(); + + ip_config = kzalloc(sizeof(struct rds_ib_port) * + (ip_port_max + 1), GFP_KERNEL); + if (!ip_config) { + printk(KERN_ERR "RDS/IB: failed to allocate IP config\n"); + return 1; + } + + read_lock(&dev_base_lock); + for_each_netdev(&init_net, dev) { + in_dev = in_dev_get(dev); + if ((dev->type == ARPHRD_INFINIBAND) && + !(dev->flags & IFF_SLAVE) && + !(dev->flags & IFF_MASTER) && + in_dev) { + union ib_gid gid; + + memcpy(&gid, dev->dev_addr + 4, sizeof gid); + + rcu_read_lock(); + list_for_each_entry_rcu(rds_ibdev, + &rds_ib_devices, list) { + ret = ib_find_cached_gid(rds_ibdev->dev, + &gid, &port_num, NULL); + if (!ret) + break; + } + rcu_read_unlock(); + + if (!port_num) { + printk(KERN_ERR "RDS/IB: GID "RDS_IB_GID_FMT + " has no associated port\n", + RDS_IB_GID_ARG(gid)); + ret = 1; + goto out; + } + + rds_ib_init_port(rds_ibdev, dev, port_num); + + for (ifap = &in_dev->ifa_list; (ifa = *ifap); + ifap = &ifa->ifa_next) { + rds_ib_set_port(rds_ibdev, dev, + ifa->ifa_label, port_num, + ifa->ifa_address, + ifa->ifa_broadcast, + ifa->ifa_mask); + } + } + in_dev_put(in_dev); + } + + rds_ib_dump_ip_config(); +out: + read_unlock(&dev_base_lock); + return ret; +} + +void rds_ib_add_one(struct ib_device *device) +{ + struct rds_ib_device *rds_ibdev; + struct ib_device_attr *dev_attr; + + /* Only handle IB (no iWARP) devices */ + if (device->node_type != RDMA_NODE_IB_CA) + return; + + dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); + if (!dev_attr) + return; + + if (ib_query_device(device, dev_attr)) { + rdsdebug("Query device failed for %s\n", device->name); + goto free_attr; + } + + rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), 
GFP_KERNEL, + ibdev_to_node(device)); + if (!rds_ibdev) + goto free_attr; + + spin_lock_init(&rds_ibdev->spinlock); + atomic_set(&rds_ibdev->refcount, 1); + INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); + + rds_ibdev->max_wrs = dev_attr->max_qp_wr; + rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); + + rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32; + + rds_ibdev->max_1m_fmrs = dev_attr->max_fmr ? + min_t(unsigned int, dev_attr->max_fmr, + rds_ib_fmr_1m_pool_size) : + rds_ib_fmr_1m_pool_size; + + rds_ibdev->max_8k_fmrs = dev_attr->max_fmr ? + min_t(unsigned int, dev_attr->max_fmr, + rds_ib_fmr_8k_pool_size) : + rds_ib_fmr_8k_pool_size; + + rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom; + rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom; + + rds_ibdev->dev = device; + rds_ibdev->pd = ib_alloc_pd(device); + if (IS_ERR(rds_ibdev->pd)) { + rds_ibdev->pd = NULL; + goto put_dev; + } + + if (rds_ib_haip_enabled) { + INIT_IB_EVENT_HANDLER(&rds_ibdev->event_handler, + rds_ibdev->dev, rds_ib_event_handler); + if (ib_register_event_handler(&rds_ibdev->event_handler)) { + printk(KERN_ERR + "RDS/IB: ib_register_event_handler failed\n"); + goto put_dev; + } + } + + rds_ibdev->vector_load = kzalloc(sizeof(int) * + device->num_comp_vectors, GFP_KERNEL); + if (!rds_ibdev->vector_load) { + printk(KERN_ERR "RDS/IB: failed to allocate vector memory\n"); + goto put_dev; + } + + rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(rds_ibdev->mr)) { + rds_ibdev->mr = NULL; + goto put_dev; + } + + rds_ibdev->mr_1m_pool = + rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL); + if (IS_ERR(rds_ibdev->mr_1m_pool)) { + rds_ibdev->mr_1m_pool = NULL; + goto put_dev; + } + + rds_ibdev->mr_8k_pool = + rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_8K_POOL); + if (IS_ERR(rds_ibdev->mr_8k_pool)) { + rds_ibdev->mr_8k_pool = NULL; + goto put_dev; + } + + rds_ibdev->srq = kmalloc(sizeof(struct rds_ib_srq), GFP_KERNEL); + if (!rds_ibdev->srq) + goto free_attr; + + INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); + INIT_LIST_HEAD(&rds_ibdev->conn_list); + + down_write(&rds_ib_devices_lock); + list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices); + up_write(&rds_ib_devices_lock); + atomic_inc(&rds_ibdev->refcount); + + ib_set_client_data(device, &rds_ib_client, rds_ibdev); + atomic_inc(&rds_ibdev->refcount); + + rds_ib_nodev_connect(); +put_dev: + rds_ib_dev_put(rds_ibdev); +free_attr: + kfree(dev_attr); +} + static void rds_ib_unregister_client(void) { ib_unregister_client(&rds_ib_client); @@ -356,41 +1098,75 @@ static void rds_ib_unregister_client(void) flush_workqueue(rds_wq); } -void rds_ib_exit(void) +static int rds_ib_netdev_callback(struct notifier_block *self, unsigned long event, void *ctx) { - rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); - rds_ib_unregister_client(); - rds_ib_destroy_nodev_conns(); - rds_ib_sysctl_exit(); - rds_ib_recv_exit(); - rds_trans_unregister(&rds_ib_transport); + struct net_device *ndev = (struct net_device *)ctx; + u8 port = 0; + u8 i; + struct rds_ib_port_ud_work *work; + + if (!rds_ib_haip_enabled || !ip_port_cnt) + return NOTIFY_DONE; + + if (event != NETDEV_UP && event != NETDEV_DOWN) + return NOTIFY_DONE; + + for (i = 1; i <= ip_port_cnt; i++) { + if (!strcmp(ndev->name, ip_config[i].if_name)) { + port = i; + break; + } + } + + if (!port) + return NOTIFY_DONE; + + + printk(KERN_NOTICE "RDS/IB: %s/port_%d/%s is %s\n", + ip_config[port].rds_ibdev->dev->name, + ip_config[port].port_num,
ndev->name, + (event == NETDEV_UP) ? "UP" : "DOWN"); + + work = kzalloc(sizeof *work, GFP_KERNEL); + if (!work) { + printk(KERN_ERR "RDS/IB: failed to allocate port work\n"); + return NOTIFY_DONE; + } + + work->dev = ndev; + work->port = port; + + switch (event) { + case NETDEV_UP: + if (rds_ib_haip_fallback) { + if (rds_ib_ip_config_down()) { + INIT_DELAYED_WORK(&work->work, + rds_ib_net_failback); + work->timeout = msecs_to_jiffies(10000); + } else { + INIT_DELAYED_WORK(&work->work, + rds_ib_net_failback); + work->timeout = msecs_to_jiffies(1000); + } + queue_delayed_work(rds_wq, &work->work, + msecs_to_jiffies(100)); + } else + kfree(work); + + ip_config[port].port_state = NETDEV_UP; + break; + case NETDEV_DOWN: + INIT_DELAYED_WORK(&work->work, rds_ib_failover); + queue_delayed_work(rds_wq, &work->work, 0); + ip_config[port].port_state = RDS_IB_PORT_DOWN; + break; + } + + return NOTIFY_DONE; } -struct rds_transport rds_ib_transport = { - .laddr_check = rds_ib_laddr_check, - .xmit_complete = rds_ib_xmit_complete, - .xmit = rds_ib_xmit, - .xmit_rdma = rds_ib_xmit_rdma, - .xmit_atomic = rds_ib_xmit_atomic, - .recv = rds_ib_recv, - .conn_alloc = rds_ib_conn_alloc, - .conn_free = rds_ib_conn_free, - .conn_connect = rds_ib_conn_connect, - .conn_shutdown = rds_ib_conn_shutdown, - .inc_copy_to_user = rds_ib_inc_copy_to_user, - .inc_free = rds_ib_inc_free, - .cm_initiate_connect = rds_ib_cm_initiate_connect, - .cm_handle_connect = rds_ib_cm_handle_connect, - .cm_connect_complete = rds_ib_cm_connect_complete, - .stats_info_copy = rds_ib_stats_info_copy, - .exit = rds_ib_exit, - .get_mr = rds_ib_get_mr, - .sync_mr = rds_ib_sync_mr, - .free_mr = rds_ib_free_mr, - .flush_mrs = rds_ib_flush_mrs, - .t_owner = THIS_MODULE, - .t_name = "infiniband", - .t_type = RDS_TRANS_IB +static struct notifier_block rds_ib_nb = { + .notifier_call = rds_ib_netdev_callback }; int rds_ib_init(void) @@ -399,10 +1175,22 @@ int rds_ib_init(void) INIT_LIST_HEAD(&rds_ib_devices); - ret = ib_register_client(&rds_ib_client); + ret = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, + &rds_ib_inet_socket); + if (ret < 0) { + printk(KERN_ERR "RDS/IB: can't create TCP transport socket (%d).\n", -ret); + goto out; + } + rds_ib_inet_socket->sk->sk_net = &init_net; + + ret = rds_ib_fmr_init(); if (ret) goto out; + ret = ib_register_client(&rds_ib_client); + if (ret) + goto out_fmr_exit; + ret = rds_ib_sysctl_init(); if (ret) goto out_ibreg; @@ -411,23 +1199,94 @@ int rds_ib_init(void) if (ret) goto out_sysctl; + ret = rds_ib_srqs_init(); + if (ret) { + printk(KERN_ERR "RDS/IB: Failed to init SRQ\n"); + goto out_recv; + } + + rds_aux_wq = create_singlethread_workqueue("krdsd_aux"); + if (!rds_aux_wq) { + printk(KERN_ERR "RDS/IB: failed to create aux workqueue\n"); + goto out_srq; + } + ret = rds_trans_register(&rds_ib_transport); if (ret) - goto out_recv; + goto out_srq; rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); + ret = rds_ib_ip_config_init(); + if (ret) { + printk(KERN_ERR "RDS/IB: failed to init port\n"); + goto out_srq; + } + + register_netdevice_notifier(&rds_ib_nb); + goto out; +out_srq: + rds_ib_srqs_exit(); out_recv: rds_ib_recv_exit(); out_sysctl: rds_ib_sysctl_exit(); out_ibreg: rds_ib_unregister_client(); +out_fmr_exit: + rds_ib_fmr_exit(); out: return ret; } + +void rds_ib_exit(void) +{ + unregister_netdevice_notifier(&rds_ib_nb); + rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); + rds_ib_unregister_client(); + rds_ib_destroy_nodev_conns(); + rds_ib_sysctl_exit(); + 
rds_ib_srqs_exit(); + rds_ib_recv_exit(); + flush_workqueue(rds_aux_wq); + destroy_workqueue(rds_aux_wq); + rds_trans_unregister(&rds_ib_transport); + rds_ib_fmr_exit(); + + if (ip_config) + kfree(ip_config); +} + +struct rds_transport rds_ib_transport = { + .laddr_check = rds_ib_laddr_check, + .xmit_complete = rds_ib_xmit_complete, + .xmit = rds_ib_xmit, + .xmit_rdma = rds_ib_xmit_rdma, + .xmit_atomic = rds_ib_xmit_atomic, + .recv = rds_ib_recv, + .conn_alloc = rds_ib_conn_alloc, + .conn_free = rds_ib_conn_free, + .conn_connect = rds_ib_conn_connect, + .conn_shutdown = rds_ib_conn_shutdown, + .inc_copy_to_user = rds_ib_inc_copy_to_user, + .inc_free = rds_ib_inc_free, + .cm_initiate_connect = rds_ib_cm_initiate_connect, + .cm_handle_connect = rds_ib_cm_handle_connect, + .cm_connect_complete = rds_ib_cm_connect_complete, + .stats_info_copy = rds_ib_stats_info_copy, + .exit = rds_ib_exit, + .get_mr = rds_ib_get_mr, + .sync_mr = rds_ib_sync_mr, + .free_mr = rds_ib_free_mr, + .flush_mrs = rds_ib_flush_mrs, + .check_migration = rds_ib_check_migration, + .t_owner = THIS_MODULE, + .t_name = "infiniband", + .t_type = RDS_TRANS_IB +}; + MODULE_LICENSE("GPL"); diff --git a/net/rds/ib.h b/net/rds/ib.h index 4297d92788dc4..3c6675c6663c0 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -8,21 +8,43 @@ #include "rds.h" #include "rdma_transport.h" -#define RDS_FMR_SIZE 256 -#define RDS_FMR_POOL_SIZE 8192 +#define RDS_FMR_1M_POOL_SIZE (8192 / 2) +#define RDS_FMR_1M_MSG_SIZE 256 /* 1M */ +#define RDS_FMR_8K_MSG_SIZE 2 +#define RDS_FMR_8K_POOL_SIZE ((256 / (RDS_FMR_8K_MSG_SIZE + 1)) * (8192 / 2)) #define RDS_IB_MAX_SGE 8 -#define RDS_IB_RECV_SGE 2 +#define RDS_IB_RECV_SGE 2 #define RDS_IB_DEFAULT_RECV_WR 1024 #define RDS_IB_DEFAULT_SEND_WR 256 +#define RDS_IB_DEFAULT_SRQ_MAX_WR 4096 +#define RDS_IB_DEFAULT_SRQ_HWM_REFILL (RDS_IB_DEFAULT_SRQ_MAX_WR/2) +#define RDS_IB_DEFAULT_SRQ_LWM_REFILL (RDS_IB_DEFAULT_SRQ_MAX_WR/10) -#define RDS_IB_DEFAULT_RETRY_COUNT 2 +#define RDS_IB_DEFAULT_RETRY_COUNT 1 -#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ +#define RDS_IB_DEFAULT_RNR_RETRY_COUNT 7 + +#define RDS_IB_DEFAULT_TIMEOUT 16 /* 4.096 * 2 ^ 16 = 260 msec */ + +#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000007 /* minor versions supported */ #define RDS_IB_RECYCLE_BATCH_COUNT 32 +#define RDS_IB_SRQ_POST_BATCH_COUNT 64 + +#define RDS_IB_GID_FMT "%2.2x%2.2x:%2.2x%2.2x" + +#define RDS_IB_GID_RAW_ARG(gid) ((u8 *)(gid))[12],\ + ((u8 *)(gid))[13],\ + ((u8 *)(gid))[14],\ + ((u8 *)(gid))[15] + +#define RDS_IB_GID_ARG(gid) RDS_IB_GID_RAW_ARG((gid).raw) + +#define RDS_WC_MAX 32 + extern struct rw_semaphore rds_ib_devices_lock; extern struct list_head rds_ib_devices; @@ -64,6 +86,7 @@ struct rds_ib_connect_private { __be32 dp_reserved1; __be64 dp_ack_seq; __be32 dp_credit; /* non-zero enables flow ctl */ + u8 dp_tos; }; struct rds_ib_send_work { @@ -74,10 +97,12 @@ struct rds_ib_send_work { }; struct rds_ib_recv_work { - struct rds_ib_incoming *r_ibinc; + struct rds_ib_incoming *r_ibinc; struct rds_page_frag *r_frag; struct ib_recv_wr r_wr; struct ib_sge r_sge[2]; + struct rds_ib_connection *r_ic; + int r_posted; }; struct rds_ib_work_ring { @@ -88,8 +113,37 @@ struct rds_ib_work_ring { atomic_t w_free_ctr; }; +/* + * Rings are posted with all the allocations they'll need to queue the + * incoming message to the receiving socket so this can't fail. + * All fragments start with a header, so we can make sure we're not receiving + * garbage, and we can tell a small 8 byte fragment from an ACK frame. 
+ */ +struct rds_ib_ack_state { + u64 ack_next; + u64 ack_recv; + unsigned int ack_required:1; + unsigned int ack_next_valid:1; + unsigned int ack_recv_valid:1; +}; + struct rds_ib_device; +struct rds_ib_path { + union ib_gid p_sgid; + union ib_gid p_dgid; +}; + +struct rds_ib_destroy_id_work { + struct delayed_work work; + struct rdma_cm_id *cm_id; +}; + +struct rds_ib_migrate_work { + struct delayed_work work; + struct rds_ib_connection *ic; +}; + struct rds_ib_connection { struct list_head ib_node; @@ -100,8 +154,14 @@ struct rds_ib_connection { struct rdma_cm_id *i_cm_id; struct ib_pd *i_pd; struct ib_mr *i_mr; - struct ib_cq *i_send_cq; - struct ib_cq *i_recv_cq; + struct ib_cq *i_scq; + struct ib_cq *i_rcq; + struct ib_wc i_send_wc[RDS_WC_MAX]; + struct ib_wc i_recv_wc[RDS_WC_MAX]; + + /* interrupt handling */ + struct tasklet_struct i_stasklet; + struct tasklet_struct i_rtasklet; /* tx */ struct rds_ib_work_ring i_send_ring; @@ -153,6 +213,21 @@ struct rds_ib_connection { /* Batched completions */ unsigned int i_unsignaled_wrs; + u8 i_sl; + + atomic_t i_cache_allocs; + + struct completion i_last_wqe_complete; + + /* APM support */ + struct rds_ib_migrate_work i_migrate_w; + struct rds_ib_path i_pri_path; + struct rds_ib_path i_cur_path; + unsigned int i_alt_path_index; + unsigned int i_active_side; + + int i_scq_vector; + int i_rcq_vector; }; /* This assumes that atomic_t is at least 32 bits */ @@ -164,6 +239,61 @@ struct rds_ib_connection { struct rds_ib_ipaddr { struct list_head list; __be32 ipaddr; + struct rcu_head rcu_head; +}; + +struct rds_ib_srq { + struct rds_ib_device *rds_ibdev; + struct ib_srq *s_srq; + struct ib_event_handler s_event_handler; + struct rds_ib_recv_work *s_recvs; + u32 s_n_wr; + struct rds_header *s_recv_hdrs; + u64 s_recv_hdrs_dma; + atomic_t s_num_posted; + unsigned long s_refill_gate; + struct delayed_work s_refill_w; + struct delayed_work s_rearm_w; +}; + +struct rds_ib_alias { + char if_name[IFNAMSIZ]; + __be32 ip_addr; + __be32 ip_bcast; + __be32 ip_mask; +}; + +enum { + RDS_IB_PORT_UNKNOWN = 0, + RDS_IB_PORT_UP, + RDS_IB_PORT_DOWN, +}; + +#define RDS_IB_MAX_ALIASES 100 +struct rds_ib_port { + struct rds_ib_device *rds_ibdev; + struct net_device *dev; + unsigned int port_state; + u8 port_num; + char if_name[IFNAMSIZ]; + __be32 ip_addr; + __be32 ip_bcast; + __be32 ip_mask; + unsigned int ip_active_port; + unsigned int alias_cnt; + struct rds_ib_alias aliases[RDS_IB_MAX_ALIASES]; +}; + +struct rds_ib_port_ud_work { + struct delayed_work work; + struct net_device *dev; + unsigned int port; + int timeout; +}; + +enum { + RDS_IB_MR_8K_POOL, + RDS_IB_MR_1M_POOL, }; struct rds_ib_device { @@ -173,9 +303,11 @@ struct rds_ib_device { struct ib_device *dev; struct ib_pd *pd; struct ib_mr *mr; - struct rds_ib_mr_pool *mr_pool; + struct rds_ib_mr_pool *mr_1m_pool; + struct rds_ib_mr_pool *mr_8k_pool; unsigned int fmr_max_remaps; - unsigned int max_fmrs; + unsigned int max_8k_fmrs; + unsigned int max_1m_fmrs; int max_sge; unsigned int max_wrs; unsigned int max_initiator_depth; @@ -183,6 +315,10 @@ struct rds_ib_device { spinlock_t spinlock; /* protect the above */ atomic_t refcount; struct work_struct free_work; + struct rds_ib_srq *srq; + struct rds_ib_port *ports; + struct ib_event_handler event_handler; + int *vector_load; }; #define pcidev_to_node(pcidev) pcibus_to_node(pcidev->bus) @@ -193,39 +329,51 @@ struct rds_ib_device { #define IB_ACK_IN_FLIGHT 0 #define IB_ACK_REQUESTED 1 +#define RDS_IB_SEND_OP (1ULL << 63) /* Magic WR_ID for ACKs */ #define 
RDS_IB_ACK_WR_ID (~(u64) 0) struct rds_ib_statistics { uint64_t s_ib_connect_raced; uint64_t s_ib_listen_closed_stale; - uint64_t s_ib_tx_cq_call; + uint64_t s_ib_evt_handler_call; + uint64_t s_ib_tasklet_call; uint64_t s_ib_tx_cq_event; uint64_t s_ib_tx_ring_full; uint64_t s_ib_tx_throttle; uint64_t s_ib_tx_sg_mapping_failure; uint64_t s_ib_tx_stalled; uint64_t s_ib_tx_credit_updates; - uint64_t s_ib_rx_cq_call; uint64_t s_ib_rx_cq_event; uint64_t s_ib_rx_ring_empty; uint64_t s_ib_rx_refill_from_cq; uint64_t s_ib_rx_refill_from_thread; - uint64_t s_ib_rx_alloc_limit; + uint64_t s_ib_rx_alloc_limit; + uint64_t s_ib_rx_total_frags; + uint64_t s_ib_rx_total_incs; uint64_t s_ib_rx_credit_updates; uint64_t s_ib_ack_sent; uint64_t s_ib_ack_send_failure; uint64_t s_ib_ack_send_delayed; uint64_t s_ib_ack_send_piggybacked; uint64_t s_ib_ack_received; - uint64_t s_ib_rdma_mr_alloc; - uint64_t s_ib_rdma_mr_free; - uint64_t s_ib_rdma_mr_used; - uint64_t s_ib_rdma_mr_pool_flush; - uint64_t s_ib_rdma_mr_pool_wait; - uint64_t s_ib_rdma_mr_pool_depleted; + uint64_t s_ib_rdma_mr_8k_alloc; + uint64_t s_ib_rdma_mr_8k_free; + uint64_t s_ib_rdma_mr_8k_used; + uint64_t s_ib_rdma_mr_8k_pool_flush; + uint64_t s_ib_rdma_mr_8k_pool_wait; + uint64_t s_ib_rdma_mr_8k_pool_depleted; + uint64_t s_ib_rdma_mr_1m_alloc; + uint64_t s_ib_rdma_mr_1m_free; + uint64_t s_ib_rdma_mr_1m_used; + uint64_t s_ib_rdma_mr_1m_pool_flush; + uint64_t s_ib_rdma_mr_1m_pool_wait; + uint64_t s_ib_rdma_mr_1m_pool_depleted; uint64_t s_ib_atomic_cswp; uint64_t s_ib_atomic_fadd; + uint64_t s_ib_srq_lows; + uint64_t s_ib_srq_refills; + uint64_t s_ib_srq_empty_refills; }; extern struct workqueue_struct *rds_ib_wq; @@ -264,17 +412,31 @@ static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev, /* ib.c */ +extern struct workqueue_struct *rds_aux_wq; extern struct rds_transport rds_ib_transport; +extern void rds_ib_add_one(struct ib_device *device); +extern void rds_ib_remove_one(struct ib_device *device); struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device); void rds_ib_dev_put(struct rds_ib_device *rds_ibdev); extern struct ib_client rds_ib_client; -extern unsigned int fmr_message_size; +extern unsigned int rds_ib_fmr_1m_pool_size; +extern unsigned int rds_ib_fmr_8k_pool_size; extern unsigned int rds_ib_retry_count; +extern unsigned int rds_ib_rnr_retry_count; +extern unsigned int rds_ib_apm_enabled; +extern unsigned int rds_ib_apm_fallback; +extern unsigned int rds_ib_haip_enabled; +extern unsigned int rds_ib_haip_fallback; +extern unsigned int rds_ib_haip_failover_enabled; +extern unsigned int rds_ib_apm_timeout; +extern unsigned int rds_ib_cq_balance_enabled; extern spinlock_t ib_nodev_conns_lock; extern struct list_head ib_nodev_conns; +extern struct socket *rds_ib_inet_socket; + /* ib_cm.c */ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp); void rds_ib_conn_free(void *arg); @@ -289,6 +451,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id); void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event); +void rds_ib_check_migration(struct rds_connection *conn, + struct rdma_cm_event *event); #define rds_ib_conn_error(conn, fmt...) 
\ @@ -299,7 +463,7 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_destroy_nodev_conns(void); -struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *); +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev, int npages); void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, @@ -307,18 +471,24 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, void rds_ib_sync_mr(void *trans_private, int dir); void rds_ib_free_mr(void *trans_private, int invalidate); void rds_ib_flush_mrs(void); +int rds_ib_fmr_init(void); +void rds_ib_fmr_exit(void); /* ib_recv.c */ int rds_ib_recv_init(void); void rds_ib_recv_exit(void); +int rds_ib_srqs_init(void); +void rds_ib_srqs_exit(void); int rds_ib_recv(struct rds_connection *conn); int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic); void rds_ib_recv_free_caches(struct rds_ib_connection *ic); -void rds_ib_recv_refill(struct rds_connection *conn, int prefill); +void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp); void rds_ib_inc_free(struct rds_incoming *inc); int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, size_t size); -void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, + struct ib_wc *wc, + struct rds_ib_ack_state *state); void rds_ib_recv_tasklet_fn(unsigned long data); void rds_ib_recv_init_ring(struct rds_ib_connection *ic); void rds_ib_recv_clear_ring(struct rds_ib_connection *ic); @@ -326,6 +496,10 @@ void rds_ib_recv_init_ack(struct rds_ib_connection *ic); void rds_ib_attempt_ack(struct rds_ib_connection *ic); void rds_ib_ack_send_complete(struct rds_ib_connection *ic); u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic); +void rds_ib_srq_refill(struct work_struct *work); +void rds_ib_srq_rearm(struct work_struct *work); +void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required); + /* ib_ring.c */ void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr); @@ -340,18 +514,18 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest); extern wait_queue_head_t rds_ib_ring_empty_wait; /* ib_send.c */ -char *rds_ib_wc_status_str(enum ib_wc_status status); void rds_ib_xmit_complete(struct rds_connection *conn); int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); -void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); +void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, + struct ib_wc *wc); void rds_ib_send_init_ring(struct rds_ib_connection *ic); void rds_ib_send_clear_ring(struct rds_ib_connection *ic); int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits); void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted); int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, - u32 *adv_credits, int need_posted, int max_posted); + u32 *adv_credits, int need_posted); int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op); /* 
ib_stats.c */ @@ -360,6 +534,12 @@ DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); +/* ib_recv.c */ +extern unsigned int rds_ib_srq_max_wr; +extern unsigned int rds_ib_srq_hwm_refill; +extern unsigned int rds_ib_srq_lwm_refill; +extern unsigned int rds_ib_srq_enabled; + /* ib_sysctl.c */ int rds_ib_sysctl_init(void); void rds_ib_sysctl_exit(void); @@ -369,5 +549,6 @@ extern unsigned long rds_ib_sysctl_max_unsig_wrs; extern unsigned long rds_ib_sysctl_max_unsig_bytes; extern unsigned long rds_ib_sysctl_max_recv_allocation; extern unsigned int rds_ib_sysctl_flow_control; +extern ctl_table rds_ib_sysctl_table[]; #endif diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index fd453dd5124be..228c21138caef 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -32,15 +32,16 @@ */ #include #include -#include #include +#include +#include #include "rds.h" #include "ib.h" +#include "tcp.h" static char *rds_ib_event_type_strings[] = { -#define RDS_IB_EVENT_STRING(foo) \ - [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo) +#define RDS_IB_EVENT_STRING(foo)[IB_EVENT_##foo] = __stringify(foo) RDS_IB_EVENT_STRING(CQ_ERR), RDS_IB_EVENT_STRING(QP_FATAL), RDS_IB_EVENT_STRING(QP_REQ_ERR), @@ -64,8 +65,11 @@ static char *rds_ib_event_type_strings[] = { static char *rds_ib_event_str(enum ib_event_type type) { - return rds_str_array(rds_ib_event_type_strings, - ARRAY_SIZE(rds_ib_event_type_strings), type); + if (type < ARRAY_SIZE(rds_ib_event_type_strings) && + rds_ib_event_type_strings[type]) + return rds_ib_event_type_strings[type]; + else + return "unknown"; }; /* @@ -140,31 +144,58 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even } } - if (conn->c_version < RDS_PROTOCOL(3,1)) { - printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed," - " no longer supported\n", - &conn->c_faddr, - RDS_PROTOCOL_MAJOR(conn->c_version), - RDS_PROTOCOL_MINOR(conn->c_version)); - rds_conn_destroy(conn); - return; - } else { - printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", - &conn->c_faddr, - RDS_PROTOCOL_MAJOR(conn->c_version), - RDS_PROTOCOL_MINOR(conn->c_version), - ic->i_flowctl ? ", flow control" : ""); + if (conn->c_version < RDS_PROTOCOL(3, 2)) { + if (conn->c_version == RDS_PROTOCOL(3, 1)) { + if (conn->c_tos) { + printk(KERN_NOTICE "RDS: Connection to" + " %u.%u.%u.%u version %u.%u Tos %d" + " failed, not supporting QoS\n", + NIPQUAD(conn->c_faddr), + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version), + conn->c_tos); + rds_conn_drop(conn); + return; + } + } else { + /* + * BUG: destroying connection here can deadlock with + * the CM event handler on the c_cm_lock. + */ + printk(KERN_NOTICE "RDS/IB: Connection to" + " %u.%u.%u.%u version %u.%u failed," + " no longer supported\n", + NIPQUAD(conn->c_faddr), + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version)); + rds_conn_destroy(conn); + return; + } } + printk(KERN_NOTICE + "RDS/IB: connected to %u.%u.%u.%u version %u.%u%s Tos %d\n", + NIPQUAD(conn->c_faddr), + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version), + ic->i_flowctl ? ", flow control" : "", + conn->c_tos); + + ic->i_sl = ic->i_cm_id->route.path_rec->sl; + /* * Init rings and fill recv. this needs to wait until protocol negotiation * is complete, since ring layout is different from 3.0 to 3.1. 
*/ rds_ib_send_init_ring(ic); - rds_ib_recv_init_ring(ic); + + if (!rds_ib_srq_enabled) + rds_ib_recv_init_ring(ic); + /* Post receive buffers - as a side effect, this will update * the posted credit count. */ - rds_ib_recv_refill(conn, 1); + if (!rds_ib_srq_enabled) + rds_ib_recv_refill(conn, 1, GFP_KERNEL); /* Tune RNR behavior */ rds_ib_tune_rnr(ic, &qp_attr); @@ -185,6 +216,33 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even if (dp && dp->dp_ack_seq) rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL); + if (rds_ib_apm_enabled && !ic->conn->c_reconnect) { + memcpy(&ic->i_pri_path.p_sgid, + &ic->i_cm_id->route.path_rec[0].sgid, + sizeof(union ib_gid)); + + memcpy(&ic->i_pri_path.p_dgid, + &ic->i_cm_id->route.path_rec[0].dgid, + sizeof(union ib_gid)); + + memcpy(&ic->i_cur_path.p_sgid, + &ic->i_cm_id->route.path_rec[0].sgid, + sizeof(union ib_gid)); + + memcpy(&ic->i_cur_path.p_dgid, + &ic->i_cm_id->route.path_rec[0].dgid, + sizeof(union ib_gid)); + + printk(KERN_NOTICE "RDS/IB: connection " + "<%u.%u.%u.%u,%u.%u.%u.%u,%d> primary path " + "<"RDS_IB_GID_FMT","RDS_IB_GID_FMT">\n", + NIPQUAD(conn->c_laddr), + NIPQUAD(conn->c_faddr), + conn->c_tos, + RDS_IB_GID_ARG(ic->i_pri_path.p_sgid), + RDS_IB_GID_ARG(ic->i_pri_path.p_dgid)); + } + rds_connect_complete(conn); } @@ -204,8 +262,9 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources); conn_param->initiator_depth = min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth); - conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7); - conn_param->rnr_retry_count = 7; + conn_param->retry_count = + min_t(unsigned int, rds_ib_retry_count, rds_ib_rnr_retry_count); + conn_param->rnr_retry_count = rds_ib_rnr_retry_count; if (dp) { memset(dp, 0, sizeof(*dp)); @@ -215,6 +274,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version); dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS); dp->dp_ack_seq = rds_ib_piggyb_ack(ic); + dp->dp_tos = conn->c_tos; /* Advertise flow control */ if (ic->i_flowctl) { @@ -236,6 +296,105 @@ static void rds_ib_cq_event_handler(struct ib_event *event, void *data) event->event, rds_ib_event_str(event->event), data); } +static void rds_ib_cq_comp_handler_send(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_ib_connection *ic = conn->c_transport_data; + + rdsdebug("conn %p cq %p\n", conn, cq); + + rds_ib_stats_inc(s_ib_evt_handler_call); + + tasklet_schedule(&ic->i_stasklet); +} + +static void rds_ib_cq_comp_handler_recv(struct ib_cq *cq, void *context) +{ + struct rds_connection *conn = context; + struct rds_ib_connection *ic = conn->c_transport_data; + + rdsdebug("conn %p cq %p\n", conn, cq); + + rds_ib_stats_inc(s_ib_evt_handler_call); + + tasklet_schedule(&ic->i_rtasklet); +} + +static void poll_cq(struct rds_ib_connection *ic, struct ib_cq *cq, + struct ib_wc *wcs, + struct rds_ib_ack_state *ack_state) +{ + int nr; + int i; + struct ib_wc *wc; + + while ((nr = ib_poll_cq(cq, RDS_WC_MAX, wcs)) > 0) { + for (i = 0; i < nr; i++) { + wc = wcs + i; + rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc->wr_id, wc->status, wc->byte_len, + be32_to_cpu(wc->ex.imm_data)); + + if (wc->wr_id & RDS_IB_SEND_OP) + rds_ib_send_cqe_handler(ic, wc); + else + rds_ib_recv_cqe_handler(ic, wc, ack_state); + } + } +} + +void 
rds_ib_tasklet_fn_send(unsigned long data) +{ + struct rds_ib_connection *ic = (struct rds_ib_connection *) data; + struct rds_connection *conn = ic->conn; + struct rds_ib_ack_state ack_state; + + memset(&ack_state, 0, sizeof(ack_state)); + rds_ib_stats_inc(s_ib_tasklet_call); + + poll_cq(ic, ic->i_scq, ic->i_send_wc, &ack_state); + ib_req_notify_cq(ic->i_scq, IB_CQ_NEXT_COMP); + poll_cq(ic, ic->i_scq, ic->i_send_wc, &ack_state); + + if (rds_conn_up(conn) && + (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) || + test_bit(0, &conn->c_map_queued))) + rds_send_xmit(ic->conn); +} + +void rds_ib_tasklet_fn_recv(unsigned long data) +{ + struct rds_ib_connection *ic = (struct rds_ib_connection *) data; + struct rds_connection *conn = ic->conn; + struct rds_ib_ack_state ack_state; + struct rds_ib_device *rds_ibdev = ic->rds_ibdev; + + BUG_ON(conn->c_tos && !rds_ibdev); + + rds_ib_stats_inc(s_ib_tasklet_call); + + memset(&ack_state, 0, sizeof(ack_state)); + + poll_cq(ic, ic->i_rcq, ic->i_recv_wc, &ack_state); + ib_req_notify_cq(ic->i_rcq, IB_CQ_SOLICITED); + poll_cq(ic, ic->i_rcq, ic->i_recv_wc, &ack_state); + + if (ack_state.ack_next_valid) + rds_ib_set_ack(ic, ack_state.ack_next, ack_state.ack_required); + if (ack_state.ack_recv_valid && ack_state.ack_recv > ic->i_ack_recv) { + rds_send_drop_acked(conn, ack_state.ack_recv, NULL); + ic->i_ack_recv = ack_state.ack_recv; + } + if (rds_conn_up(conn)) + rds_ib_attempt_ack(ic); + + if (rds_ib_srq_enabled) + if ((atomic_read(&rds_ibdev->srq->s_num_posted) < + rds_ib_srq_hwm_refill) && + !test_and_set_bit(0, &rds_ibdev->srq->s_refill_gate)) + queue_delayed_work(rds_wq, &rds_ibdev->srq->s_refill_w, 0); +} + static void rds_ib_qp_event_handler(struct ib_event *event, void *data) { struct rds_connection *conn = data; @@ -248,6 +407,47 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data) case IB_EVENT_COMM_EST: rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); break; + case IB_EVENT_QP_LAST_WQE_REACHED: + complete(&ic->i_last_wqe_complete); + break; + case IB_EVENT_PATH_MIG: + memcpy(&ic->i_cur_path.p_sgid, + &ic->i_cm_id->route.path_rec[ic->i_alt_path_index].sgid, + sizeof(union ib_gid)); + + memcpy(&ic->i_cur_path.p_dgid, + &ic->i_cm_id->route.path_rec[ic->i_alt_path_index].dgid, + sizeof(union ib_gid)); + + if (!memcmp(&ic->i_pri_path.p_sgid, &ic->i_cur_path.p_sgid, + sizeof(union ib_gid)) && + !memcmp(&ic->i_pri_path.p_dgid, &ic->i_cur_path.p_dgid, + sizeof(union ib_gid))) { + printk(KERN_NOTICE + "RDS/IB: connection " + "<%u.%u.%u.%u,%u.%u.%u.%u,%d> migrated back to path " + "<"RDS_IB_GID_FMT","RDS_IB_GID_FMT">\n", + NIPQUAD(conn->c_laddr), + NIPQUAD(conn->c_faddr), + conn->c_tos, + RDS_IB_GID_ARG(ic->i_cur_path.p_sgid), + RDS_IB_GID_ARG(ic->i_cur_path.p_dgid)); + } else { + printk(KERN_NOTICE + "RDS/IB: connection " + "<%u.%u.%u.%u,%u.%u.%u.%u,%d> migrated over to path " + "<"RDS_IB_GID_FMT","RDS_IB_GID_FMT">\n", + NIPQUAD(conn->c_laddr), + NIPQUAD(conn->c_faddr), + conn->c_tos, + RDS_IB_GID_ARG(ic->i_cur_path.p_sgid), + RDS_IB_GID_ARG(ic->i_cur_path.p_dgid)); + } + + break; + case IB_EVENT_PATH_MIG_ERR: + rdsdebug("RDS: Path migration error\n"); + break; default: rdsdebug("Fatal QP Event %u (%s) " "- connection %pI4->%pI4, reconnecting\n", @@ -258,6 +458,26 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data) } } +static int rds_ib_find_least_loaded_vector(struct rds_ib_device *rds_ibdev) +{ + int i; + int index = rds_ibdev->dev->num_comp_vectors - 1; + int min = 
rds_ibdev->vector_load[rds_ibdev->dev->num_comp_vectors - 1]; + + if (!rds_ib_cq_balance_enabled) + return IB_CQ_VECTOR_LEAST_ATTACHED; + + for (i = rds_ibdev->dev->num_comp_vectors - 1; i >= 0; i--) { + if (rds_ibdev->vector_load[i] < min) { + index = i; + min = rds_ibdev->vector_load[i]; + } + } + + rds_ibdev->vector_load[index]++; + return index; +} + /* * This needs to be very careful to not leave IS_ERR pointers around for * cleanup to trip over. @@ -290,33 +510,47 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ic->i_pd = rds_ibdev->pd; ic->i_mr = rds_ibdev->mr; - ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler, - rds_ib_cq_event_handler, conn, - ic->i_send_ring.w_nr + 1, 0); - if (IS_ERR(ic->i_send_cq)) { - ret = PTR_ERR(ic->i_send_cq); - ic->i_send_cq = NULL; + ic->i_scq_vector = rds_ib_find_least_loaded_vector(rds_ibdev); + ic->i_scq = ib_create_cq(dev, rds_ib_cq_comp_handler_send, + rds_ib_cq_event_handler, conn, + ic->i_send_ring.w_nr + 1, + ic->i_scq_vector); + if (IS_ERR(ic->i_scq)) { + ret = PTR_ERR(ic->i_scq); + ic->i_scq = NULL; rdsdebug("ib_create_cq send failed: %d\n", ret); + if (ic->i_scq_vector != IB_CQ_VECTOR_LEAST_ATTACHED) + rds_ibdev->vector_load[ic->i_scq_vector]--; goto out; } - ic->i_recv_cq = ib_create_cq(dev, rds_ib_recv_cq_comp_handler, - rds_ib_cq_event_handler, conn, - ic->i_recv_ring.w_nr, 0); - if (IS_ERR(ic->i_recv_cq)) { - ret = PTR_ERR(ic->i_recv_cq); - ic->i_recv_cq = NULL; + ic->i_rcq_vector = rds_ib_find_least_loaded_vector(rds_ibdev); + if (rds_ib_srq_enabled) + ic->i_rcq = ib_create_cq(dev, rds_ib_cq_comp_handler_recv, + rds_ib_cq_event_handler, conn, + rds_ib_srq_max_wr - 1, + ic->i_rcq_vector); + else + ic->i_rcq = ib_create_cq(dev, rds_ib_cq_comp_handler_recv, + rds_ib_cq_event_handler, conn, + ic->i_recv_ring.w_nr, + ic->i_rcq_vector); + if (IS_ERR(ic->i_rcq)) { + ret = PTR_ERR(ic->i_rcq); + ic->i_rcq = NULL; rdsdebug("ib_create_cq recv failed: %d\n", ret); + if (ic->i_scq_vector != IB_CQ_VECTOR_LEAST_ATTACHED) + rds_ibdev->vector_load[ic->i_rcq_vector]--; goto out; } - ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); + ret = ib_req_notify_cq(ic->i_scq, IB_CQ_NEXT_COMP); if (ret) { rdsdebug("ib_req_notify_cq send failed: %d\n", ret); goto out; } - ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); + ret = ib_req_notify_cq(ic->i_rcq, IB_CQ_SOLICITED); if (ret) { rdsdebug("ib_req_notify_cq recv failed: %d\n", ret); goto out; @@ -333,8 +567,13 @@ static int rds_ib_setup_qp(struct rds_connection *conn) attr.cap.max_recv_sge = RDS_IB_RECV_SGE; attr.sq_sig_type = IB_SIGNAL_REQ_WR; attr.qp_type = IB_QPT_RC; - attr.send_cq = ic->i_send_cq; - attr.recv_cq = ic->i_recv_cq; + attr.send_cq = ic->i_scq; + attr.recv_cq = ic->i_rcq; + + if (rds_ib_srq_enabled) { + attr.cap.max_recv_wr = 0; + attr.srq = rds_ibdev->srq->s_srq; + } /* * XXX this can fail if max_*_wr is too large? 
Are we supposed @@ -356,14 +595,16 @@ static int rds_ib_setup_qp(struct rds_connection *conn) goto out; } - ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, - ic->i_recv_ring.w_nr * - sizeof(struct rds_header), - &ic->i_recv_hdrs_dma, GFP_KERNEL); - if (!ic->i_recv_hdrs) { - ret = -ENOMEM; - rdsdebug("ib_dma_alloc_coherent recv failed\n"); - goto out; + if (!rds_ib_srq_enabled) { + ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, + ic->i_recv_ring.w_nr * + sizeof(struct rds_header), + &ic->i_recv_hdrs_dma, GFP_KERNEL); + if (!ic->i_recv_hdrs) { + ret = -ENOMEM; + rdsdebug("ib_dma_alloc_coherent recv failed\n"); + goto out; + } } ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), @@ -383,21 +624,23 @@ static int rds_ib_setup_qp(struct rds_connection *conn) } memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); - ic->i_recvs = vmalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work), - ibdev_to_node(dev)); - if (!ic->i_recvs) { - ret = -ENOMEM; - rdsdebug("recv allocation failed\n"); - goto out; + if (!rds_ib_srq_enabled) { + ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * + sizeof(struct rds_ib_recv_work)); + if (!ic->i_recvs) { + ret = -ENOMEM; + rdsdebug("recv allocation failed\n"); + goto out; + } + memset(ic->i_recvs, 0, ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); } - memset(ic->i_recvs, 0, ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); rds_ib_recv_init_ack(ic); - rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr, - ic->i_send_cq, ic->i_recv_cq); + rdsdebug("conn %p pd %p mr %p cq %p\n", conn, ic->i_pd, ic->i_mr, ic->i_rcq); out: + conn->c_reconnect_err = ret; rds_ib_dev_put(rds_ibdev); return ret; } @@ -426,8 +669,8 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event) } /* Even if len is crap *now* I still want to check it. -ASG */ - if (event->param.conn.private_data_len < sizeof (*dp) || - dp->dp_protocol_major == 0) + if (event->param.conn.private_data_len < sizeof(*dp) + || dp->dp_protocol_major == 0) return RDS_PROTOCOL_3_0; common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS; @@ -470,7 +713,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, (unsigned long long)be64_to_cpu(fguid)); conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport, - GFP_KERNEL); + dp->dp_tos, GFP_KERNEL); if (IS_ERR(conn)) { rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); conn = NULL; @@ -486,20 +729,49 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, */ mutex_lock(&conn->c_cm_lock); if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { + /* + * in both of the cases below, the conn is half setup. + * we need to make sure the lower layers don't destroy it + */ + ic = conn->c_transport_data; + if (ic && ic->i_cm_id == cm_id) + destroy = 0; if (rds_conn_state(conn) == RDS_CONN_UP) { rdsdebug("incoming connect while connecting\n"); rds_conn_drop(conn); rds_ib_stats_inc(s_ib_listen_closed_stale); - } else - if (rds_conn_state(conn) == RDS_CONN_CONNECTING) { - /* Wait and see - our connect may still be succeeding */ - rds_ib_stats_inc(s_ib_connect_raced); + } else if (rds_conn_state(conn) == RDS_CONN_CONNECTING) { + unsigned long now = get_seconds(); + + /* + * after 15 seconds, give up on existing connection + * attempts and make them try again. 
At this point + * it's no longer a race but something has gone + * horribly wrong + */ + if (now > conn->c_connection_start && + now - conn->c_connection_start > 15) { + printk(KERN_CRIT "rds connection racing for 15s, forcing reset " + "connection %u.%u.%u.%u->%u.%u.%u.%u\n", + NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr)); + rds_conn_drop(conn); + rds_ib_stats_inc(s_ib_listen_closed_stale); + } else { + /* Wait and see - our connect may still be succeeding */ + rds_ib_stats_inc(s_ib_connect_raced); + } } goto out; } ic = conn->c_transport_data; + /* + * record the time we started trying to connect so that we can + * drop the connection if it doesn't work out after a while + */ + conn->c_connection_start = get_seconds(); + rds_ib_set_protocol(conn, version); rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit)); @@ -528,10 +800,18 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, event->param.conn.responder_resources, event->param.conn.initiator_depth); + if (rds_ib_apm_enabled) + rdma_set_timeout(cm_id, rds_ib_apm_timeout); + /* rdma_accept() calls rdma_reject() internally if it fails */ err = rdma_accept(cm_id, &conn_param); if (err) rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err); + else if (rds_ib_apm_enabled && !conn->c_loopback) { + err = rdma_enable_apm(cm_id, RDMA_ALT_PATH_BEST); + if (err) + printk(KERN_WARNING "RDS/IB: APM couldn't be enabled for passive side: %d\n", err); + } out: if (conn) @@ -550,6 +830,12 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) struct rds_ib_connect_private dp; int ret; + if (rds_ib_apm_enabled && !conn->c_loopback) { + ret = rdma_enable_apm(cm_id, RDMA_ALT_PATH_BEST); + if (ret) + printk(KERN_WARNING "RDS/IB: APM couldn't be enabled for active side: %d\n", ret); + } + /* If the peer doesn't do protocol negotiation, we must * default to RDSv3.0 */ rds_ib_set_protocol(conn, RDS_PROTOCOL_3_0); @@ -575,19 +861,129 @@ out: if (ic->i_cm_id == cm_id) ret = 0; } + + ic->i_active_side = 1; return ret; } +static void rds_ib_migrate(struct work_struct *_work) +{ + struct rds_ib_migrate_work *work = + container_of(_work, struct rds_ib_migrate_work, work.work); + struct rds_ib_connection *ic = work->ic; + struct ib_qp_attr qp_attr; + struct ib_qp_init_attr qp_init_attr; + enum ib_mig_state path_mig_state; + struct rdma_cm_id *cm_id = ic->i_cm_id; + int ret = 0; + + if (!rds_ib_apm_fallback) + return; + + if (!ic->i_active_side) { + ret = ib_query_qp(cm_id->qp, &qp_attr, IB_QP_PATH_MIG_STATE, + &qp_init_attr); + if (ret) { + printk(KERN_ERR "RDS/IB: failed to query QP\n"); + return; + } + + path_mig_state = qp_attr.path_mig_state; + if (!path_mig_state) { + printk(KERN_NOTICE + "RDS/IB: Migration in progress..skip\n"); + return; + } + + qp_attr.path_mig_state = 0; + ret = ib_modify_qp(cm_id->qp, &qp_attr, IB_QP_PATH_MIG_STATE); + if (ret) { + printk(KERN_ERR "RDS/IB: failed to modify QP from %s" + " to MIGRATED state\n", + (!path_mig_state) ? "MIGRATED" : + (path_mig_state == 1) ? "REARM" : + (path_mig_state == 2) ? 
"ARMED" : "UNKNOWN"); + } + } +} + +void rds_ib_check_migration(struct rds_connection *conn, + struct rdma_cm_event *event) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + union ib_gid sgid; + union ib_gid dgid; + struct ib_qp_init_attr qp_init_attr; + struct ib_qp_attr qp_attr; + struct rdma_cm_id *cm_id = ic->i_cm_id; + int err; + + if (!rds_ib_apm_enabled || !rds_conn_up(ic->conn)) + return ; + + ic->i_alt_path_index = event->param.ud.alt_path_index; + + memcpy(&sgid, &cm_id->route.path_rec[event->param.ud.alt_path_index]. + sgid, sizeof(union ib_gid)); + memcpy(&dgid, &cm_id->route.path_rec[event->param.ud.alt_path_index]. + dgid, sizeof(union ib_gid)); + + printk(KERN_NOTICE + "RDS/IB: connection " + "<%u.%u.%u.%u,%u.%u.%u.%u,%d> loaded alternate path " + "<"RDS_IB_GID_FMT","RDS_IB_GID_FMT">\n", + NIPQUAD(conn->c_laddr), + NIPQUAD(conn->c_faddr), + conn->c_tos, + RDS_IB_GID_ARG(sgid), RDS_IB_GID_ARG(dgid)); + + err = ib_query_qp(cm_id->qp, &qp_attr, IB_QP_ALT_PATH, &qp_init_attr); + if (err) { + printk(KERN_ERR "RDS/IB: ib_query_qp failed (%d)\n", err); + return; + } + qp_attr.alt_timeout = rds_ib_apm_timeout; + err = ib_modify_qp(cm_id->qp, &qp_attr, IB_QP_ALT_PATH); + if (err) { + printk(KERN_ERR "RDS/IB: ib_modify_qp failed (%d)\n", err); + return; + } + + if (!memcmp(&ic->i_pri_path.p_sgid, &sgid, sizeof(union ib_gid)) && + !memcmp(&ic->i_pri_path.p_dgid, &dgid, sizeof(union ib_gid))) { + if (memcmp(&ic->i_cur_path.p_sgid, &ic->i_pri_path.p_sgid, + sizeof(union ib_gid)) || + memcmp(&ic->i_cur_path.p_dgid, &ic->i_pri_path.p_dgid, + sizeof(union ib_gid))) { + + ic->i_migrate_w.ic = ic; + queue_delayed_work(rds_wq, &ic->i_migrate_w.work, 0); + } + } +} + +static void rds_ib_destroy_id(struct work_struct *_work) +{ + struct rds_ib_destroy_id_work *work = + container_of(_work, struct rds_ib_destroy_id_work, work.work); + struct rdma_cm_id *cm_id = work->cm_id; + + rdma_destroy_id(cm_id); + + kfree(work); +} + int rds_ib_conn_connect(struct rds_connection *conn) { struct rds_ib_connection *ic = conn->c_transport_data; struct sockaddr_in src, dest; + struct rds_ib_destroy_id_work *work; int ret; /* XXX I wonder what affect the port space has */ /* delegate cm event handler to rdma_transport */ ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, - RDMA_PS_TCP, IB_QPT_RC); + RDMA_PS_TCP); if (IS_ERR(ic->i_cm_id)) { ret = PTR_ERR(ic->i_cm_id); ic->i_cm_id = NULL; @@ -611,7 +1007,14 @@ int rds_ib_conn_connect(struct rds_connection *conn) if (ret) { rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id, ret); - rdma_destroy_id(ic->i_cm_id); + work = kzalloc(sizeof *work, GFP_KERNEL); + if (work) { + work->cm_id = ic->i_cm_id; + INIT_DELAYED_WORK(&work->work, rds_ib_destroy_id); + queue_delayed_work(rds_aux_wq, &work->work, 0); + } else + rdma_destroy_id(ic->i_cm_id); + ic->i_cm_id = NULL; } @@ -627,11 +1030,11 @@ out: void rds_ib_conn_shutdown(struct rds_connection *conn) { struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_destroy_id_work *work; int err = 0; - rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id, - ic->i_pd, ic->i_send_cq, ic->i_recv_cq, - ic->i_cm_id ? ic->i_cm_id->qp : NULL); + rdsdebug("cm %p pd %p cq %p qp %p\n", ic->i_cm_id, + ic->i_pd, ic->i_rcq, ic->i_cm_id ? 
ic->i_cm_id->qp : NULL); if (ic->i_cm_id) { struct ib_device *dev = ic->i_cm_id->device; @@ -644,22 +1047,41 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) */ rdsdebug("failed to disconnect, cm: %p err %d\n", ic->i_cm_id, err); + } else if (rds_ib_srq_enabled && ic->rds_ibdev) { + /* + wait for the last wqe to complete, then schedule + the recv tasklet to drain the RX CQ. + */ + wait_for_completion(&ic->i_last_wqe_complete); + tasklet_schedule(&ic->i_rtasklet); } - /* - * We want to wait for tx and rx completion to finish - * before we tear down the connection, but we have to be - * careful not to get stuck waiting on a send ring that - * only has unsignaled sends in it. We've shutdown new - * sends before getting here so by waiting for signaled - * sends to complete we're ensured that there will be no - * more tx processing. - */ + /* quiesce tx and rx completion before tearing down */ wait_event(rds_ib_ring_empty_wait, rds_ib_ring_empty(&ic->i_recv_ring) && (atomic_read(&ic->i_signaled_sends) == 0)); - tasklet_kill(&ic->i_recv_tasklet); + tasklet_kill(&ic->i_stasklet); + tasklet_kill(&ic->i_rtasklet); + /* first destroy the ib state that generates callbacks */ + if (ic->i_cm_id->qp) + rdma_destroy_qp(ic->i_cm_id); + + if (ic->i_rcq) { + if (ic->rds_ibdev && + ic->i_rcq_vector != IB_CQ_VECTOR_LEAST_ATTACHED) + ic->rds_ibdev->vector_load[ic->i_rcq_vector]--; + ib_destroy_cq(ic->i_rcq); + } + + if (ic->i_scq) { + if (ic->rds_ibdev && + ic->i_scq_vector != IB_CQ_VECTOR_LEAST_ATTACHED) + ic->rds_ibdev->vector_load[ic->i_scq_vector]--; + ib_destroy_cq(ic->i_scq); + } + + /* then free the resources that ib callbacks use */ if (ic->i_send_hdrs) ib_dma_free_coherent(dev, ic->i_send_ring.w_nr * @@ -683,13 +1105,17 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) if (ic->i_recvs) rds_ib_recv_clear_ring(ic); - if (ic->i_cm_id->qp) - rdma_destroy_qp(ic->i_cm_id); - if (ic->i_send_cq) - ib_destroy_cq(ic->i_send_cq); - if (ic->i_recv_cq) - ib_destroy_cq(ic->i_recv_cq); - rdma_destroy_id(ic->i_cm_id); + /* + * rdma_destroy_id may block so offload it to the aux + * thread for processing. + */ + work = kzalloc(sizeof *work, GFP_KERNEL); + if (work) { + work->cm_id = ic->i_cm_id; + INIT_DELAYED_WORK(&work->work, rds_ib_destroy_id); + queue_delayed_work(rds_aux_wq, &work->work, 0); + } else + rdma_destroy_id(ic->i_cm_id); /* * Move connection back to the nodev list. 
@@ -700,8 +1126,8 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) ic->i_cm_id = NULL; ic->i_pd = NULL; ic->i_mr = NULL; - ic->i_send_cq = NULL; - ic->i_recv_cq = NULL; + ic->i_scq = NULL; + ic->i_rcq = NULL; ic->i_send_hdrs = NULL; ic->i_recv_hdrs = NULL; ic->i_ack = NULL; @@ -740,8 +1166,14 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) vfree(ic->i_sends); ic->i_sends = NULL; - vfree(ic->i_recvs); + if (!rds_ib_srq_enabled) + vfree(ic->i_recvs); + ic->i_recvs = NULL; + + INIT_COMPLETION(ic->i_last_wqe_complete); + + ic->i_active_side = 0; } int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) @@ -762,8 +1194,8 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) } INIT_LIST_HEAD(&ic->ib_node); - tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn, - (unsigned long) ic); + tasklet_init(&ic->i_stasklet, rds_ib_tasklet_fn_send, (unsigned long) ic); + tasklet_init(&ic->i_rtasklet, rds_ib_tasklet_fn_recv, (unsigned long) ic); mutex_init(&ic->i_recv_mutex); #ifndef KERNEL_HAS_ATOMIC64 spin_lock_init(&ic->i_ack_lock); @@ -780,11 +1212,14 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) ic->conn = conn; conn->c_transport_data = ic; + init_completion(&ic->i_last_wqe_complete); + + INIT_DELAYED_WORK(&ic->i_migrate_w.work, rds_ib_migrate); + spin_lock_irqsave(&ib_nodev_conns_lock, flags); list_add_tail(&ic->ib_node, &ib_nodev_conns); spin_unlock_irqrestore(&ib_nodev_conns_lock, flags); - rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data); return 0; } @@ -804,6 +1239,7 @@ void rds_ib_conn_free(void *arg) * A race with shutdown() or connect() would cause problems * (since rds_ibdev would change) but that should never happen. */ + lock_ptr = ic->rds_ibdev ? &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock; spin_lock_irq(lock_ptr); diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 819c35a0d9cbf..f59272e43372e 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -31,13 +31,14 @@ * */ #include -#include #include #include "rds.h" #include "ib.h" #include "xlist.h" +struct workqueue_struct *rds_ib_fmr_wq; + static DEFINE_PER_CPU(unsigned long, clean_list_grace); #define CLEAN_LIST_BUSY_BIT 0 @@ -65,6 +66,7 @@ struct rds_ib_mr { * Our own little FMR pool */ struct rds_ib_mr_pool { + unsigned int pool_type; struct mutex flush_lock; /* serialize fmr invalidate */ struct delayed_work flush_worker; /* flush worker */ @@ -124,6 +126,12 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) return 0; } +static void ipaddr_free_cb(struct rcu_head *rp) +{ + struct rds_ib_ipaddr *ipaddr = container_of(rp, struct rds_ib_ipaddr, rcu_head); + kfree(ipaddr); +} + static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) { struct rds_ib_ipaddr *i_ipaddr; @@ -140,10 +148,8 @@ static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) } spin_unlock_irq(&rds_ibdev->spinlock); - if (to_free) { - synchronize_rcu(); - kfree(to_free); - } + if (to_free) + call_rcu(&to_free->rcu_head, ipaddr_free_cb); } int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) @@ -151,12 +157,17 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) struct rds_ib_device *rds_ibdev_old; rds_ibdev_old = rds_ib_get_device(ipaddr); - if (rds_ibdev_old) { + if (!rds_ibdev_old) + return rds_ib_add_ipaddr(rds_ibdev, ipaddr); + + if (rds_ibdev_old != rds_ibdev) { rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); rds_ib_dev_put(rds_ibdev_old); + return 
rds_ib_add_ipaddr(rds_ibdev, ipaddr); } + rds_ib_dev_put(rds_ibdev_old); - return rds_ib_add_ipaddr(rds_ibdev, ipaddr); + return 0; } void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) @@ -169,9 +180,9 @@ void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *con BUG_ON(list_empty(&ic->ib_node)); list_del(&ic->ib_node); - spin_lock(&rds_ibdev->spinlock); + spin_lock_irq(&rds_ibdev->spinlock); list_add_tail(&ic->ib_node, &rds_ibdev->conn_list); - spin_unlock(&rds_ibdev->spinlock); + spin_unlock_irq(&rds_ibdev->spinlock); spin_unlock_irq(&ib_nodev_conns_lock); ic->rds_ibdev = rds_ibdev; @@ -212,7 +223,8 @@ void rds_ib_destroy_nodev_conns(void) rds_conn_destroy(ic->conn); } -struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) +struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, + int pool_type) { struct rds_ib_mr_pool *pool; @@ -220,6 +232,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) if (!pool) return ERR_PTR(-ENOMEM); + pool->pool_type = pool_type; INIT_XLIST_HEAD(&pool->free_list); INIT_XLIST_HEAD(&pool->drop_list); INIT_XLIST_HEAD(&pool->clean_list); @@ -227,28 +240,28 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) init_waitqueue_head(&pool->flush_wait); INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); - pool->fmr_attr.max_pages = fmr_message_size; + if (pool_type == RDS_IB_MR_1M_POOL) { + pool->fmr_attr.max_pages = RDS_FMR_1M_MSG_SIZE + 1; + pool->max_items = rds_ibdev->max_1m_fmrs; + } else /* pool_type == RDS_IB_MR_8K_POOL */ { + pool->fmr_attr.max_pages = RDS_FMR_8K_MSG_SIZE + 1; + pool->max_items = rds_ibdev->max_8k_fmrs; + } + pool->max_free_pinned = + pool->max_items * pool->fmr_attr.max_pages / 4; pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; pool->fmr_attr.page_shift = PAGE_SHIFT; - pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4; - - /* We never allow more than max_items MRs to be allocated. - * When we exceed more than max_items_soft, we start freeing - * items more aggressively. 
- * Make sure that max_items > max_items_soft > max_items / 2 - */ - pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4; - pool->max_items = rds_ibdev->max_fmrs; + pool->max_items_soft = pool->max_items * 3 / 4; return pool; } void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo) { - struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; - iinfo->rdma_mr_max = pool->max_items; - iinfo->rdma_mr_size = pool->fmr_attr.max_pages; + iinfo->rdma_mr_max = pool_1m->max_items; + iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages; } void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) @@ -298,14 +311,21 @@ static inline void wait_clean_list_grace(void) } } -static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) +static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, + int npages) { - struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + struct rds_ib_mr_pool *pool; struct rds_ib_mr *ibmr = NULL; + struct rds_ib_mr *tmp_ibmr = NULL; int err = 0, iter = 0; + if (npages <= RDS_FMR_8K_MSG_SIZE) + pool = rds_ibdev->mr_8k_pool; + else + pool = rds_ibdev->mr_1m_pool; + if (atomic_read(&pool->dirty_count) >= pool->max_items / 10) - schedule_delayed_work(&pool->flush_worker, 10); + queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); while (1) { ibmr = rds_ib_reuse_fmr(pool); @@ -327,18 +347,24 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) atomic_dec(&pool->item_count); if (++iter > 2) { - rds_ib_stats_inc(s_ib_rdma_mr_pool_depleted); + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted); return ERR_PTR(-EAGAIN); } /* We do have some empty MRs. Flush them out. */ - rds_ib_stats_inc(s_ib_rdma_mr_pool_wait); + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait); rds_ib_flush_mr_pool(pool, 0, &ibmr); if (ibmr) return ibmr; } - ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev)); + ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); if (!ibmr) { err = -ENOMEM; goto out_no_cigar; @@ -354,12 +380,35 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) &pool->fmr_attr); if (IS_ERR(ibmr->fmr)) { err = PTR_ERR(ibmr->fmr); + + /* Adjust the pool size to reflect the resources available to + * the VM. 
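*
* For example (numbers purely illustrative): if ib_alloc_fmr() fails with
* -ENOMEM while item_count is 3000, max_items is clamped to
* max(3000, RDS_FMR_1M_POOL_SIZE) so the pool is no longer grown, and the
* flush below is used to hand back a reclaimed MR (tmp_ibmr) so that this
* request can still succeed.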
+ */ + if (err == -ENOMEM) { + int prev_max = pool->max_items; + + pool->max_items = max(atomic_read(&pool->item_count), + RDS_FMR_1M_POOL_SIZE); + + printk(KERN_ERR "RDS/IB: Adjusted FMR pool (%d->%ld)\n", + prev_max, pool->max_items); + + rds_ib_flush_mr_pool(pool, 0, &tmp_ibmr); + if (tmp_ibmr) { + kfree(ibmr); + return tmp_ibmr; + } + } ibmr->fmr = NULL; printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err); goto out_no_cigar; } - rds_ib_stats_inc(s_ib_rdma_mr_alloc); + ibmr->pool = pool; + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc); return ibmr; out_no_cigar: @@ -415,11 +464,10 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm } page_cnt += len >> PAGE_SHIFT; - if (page_cnt > fmr_message_size) + if (page_cnt > ibmr->pool->fmr_attr.max_pages) return -EINVAL; - dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, - rdsibdev_to_node(rds_ibdev)); + dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC); if (!dma_pages) return -ENOMEM; @@ -447,7 +495,10 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm ibmr->sg_dma_len = sg_dma_len; ibmr->remap_count++; - rds_ib_stats_inc(s_ib_rdma_mr_used); + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_used); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_used); ret = 0; out: @@ -493,7 +544,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) /* FIXME we need a way to tell a r/w MR * from a r/o MR */ - BUG_ON(irqs_disabled()); + WARN_ON_ONCE(page_mapping(page) && irqs_disabled()); set_page_dirty(page); put_page(page); } @@ -510,8 +561,7 @@ static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr) __rds_ib_teardown_mr(ibmr); if (pinned) { - struct rds_ib_device *rds_ibdev = ibmr->device; - struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + struct rds_ib_mr_pool *pool = ibmr->pool; atomic_sub(pinned, &pool->free_pinned); } @@ -578,7 +628,7 @@ static void list_append_to_xlist(struct rds_ib_mr_pool *pool, * to free as many MRs as needed to get back to this limit. 
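*
* With the split pools above, and assuming for illustration a 1M pool with
* max_items = 4096: max_items_soft is 4096 * 3 / 4 = 3072, the allocation
* path queues a flush once dirty_count reaches max_items / 10 = 409, and
* the free path below does so at max_items / 5 = 819 dirty MRs or when
* free_pinned crosses max_free_pinned = max_items * fmr_attr.max_pages / 4.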
*/ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, - int free_all, struct rds_ib_mr **ibmr_ret) + int free_all, struct rds_ib_mr **ibmr_ret) { struct rds_ib_mr *ibmr, *next; struct xlist_head clean_xlist; @@ -589,11 +639,14 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, unsigned int nfreed = 0, ncleaned = 0, free_goal; int ret = 0; - rds_ib_stats_inc(s_ib_rdma_mr_pool_flush); + if (pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_flush); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_flush); if (ibmr_ret) { DEFINE_WAIT(wait); - while(!mutex_trylock(&pool->flush_lock)) { + while (!mutex_trylock(&pool->flush_lock)) { ibmr = rds_ib_reuse_fmr(pool); if (ibmr) { *ibmr_ret = ibmr; @@ -651,7 +704,10 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, unpinned += ibmr->sg_len; __rds_ib_teardown_mr(ibmr); if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) { - rds_ib_stats_inc(s_ib_rdma_mr_free); + if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) + rds_ib_stats_inc(s_ib_rdma_mr_8k_free); + else + rds_ib_stats_inc(s_ib_rdma_mr_1m_free); list_del(&ibmr->unmap_list); ib_dealloc_fmr(ibmr->fmr); kfree(ibmr); @@ -694,6 +750,24 @@ out_nolock: return ret; } +int rds_ib_fmr_init(void) +{ + rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd"); + if (!rds_ib_fmr_wq) + return -ENOMEM; + return 0; +} + +/* + * By the time this is called all the IB devices should have been torn down and + * had their pools freed. As each pool is freed its work struct is waited on, + * so the pool flushing work queue should be idle by the time we get here. + */ +void rds_ib_fmr_exit(void) +{ + destroy_workqueue(rds_ib_fmr_wq); +} + static void rds_ib_mr_pool_flush_worker(struct work_struct *work) { struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work); @@ -705,7 +779,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate) { struct rds_ib_mr *ibmr = trans_private; struct rds_ib_device *rds_ibdev = ibmr->device; - struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + struct rds_ib_mr_pool *pool = ibmr->pool; rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); @@ -719,9 +793,9 @@ void rds_ib_free_mr(void *trans_private, int invalidate) atomic_inc(&pool->dirty_count); /* If we've pinned too many pages, request a flush */ - if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || - atomic_read(&pool->dirty_count) >= pool->max_items / 10) - schedule_delayed_work(&pool->flush_worker, 10); + if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned + || atomic_read(&pool->dirty_count) >= pool->max_items / 5) + queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); if (invalidate) { if (likely(!in_interrupt())) { @@ -729,7 +803,8 @@ void rds_ib_free_mr(void *trans_private, int invalidate) } else { /* We get here if the user created a MR marked * as use_once and invalidate at the same time. 
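* Since this can run in interrupt context and the flush itself can sleep
* (it takes pool->flush_lock and unmaps the FMRs), the work is queued on
* the dedicated rds_ib_fmr_wq rather than performed synchronously.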
*/ - schedule_delayed_work(&pool->flush_worker, 10); + queue_delayed_work(rds_ib_fmr_wq, + &pool->flush_worker, 10); } } @@ -742,10 +817,11 @@ void rds_ib_flush_mrs(void) down_read(&rds_ib_devices_lock); list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { - struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; + if (rds_ibdev->mr_8k_pool) + rds_ib_flush_mr_pool(rds_ibdev->mr_8k_pool, 0, NULL); - if (pool) - rds_ib_flush_mr_pool(pool, 0, NULL); + if (rds_ibdev->mr_1m_pool) + rds_ib_flush_mr_pool(rds_ibdev->mr_1m_pool, 0, NULL); } up_read(&rds_ib_devices_lock); } @@ -763,12 +839,12 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, goto out; } - if (!rds_ibdev->mr_pool) { + if (!rds_ibdev->mr_8k_pool || !rds_ibdev->mr_1m_pool) { ret = -ENODEV; goto out; } - ibmr = rds_ib_alloc_fmr(rds_ibdev); + ibmr = rds_ib_alloc_fmr(rds_ibdev, nents); if (IS_ERR(ibmr)) return ibmr; diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index e29e0ca32f740..d7636d671877b 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -31,7 +31,6 @@ * */ #include -#include #include #include #include @@ -39,9 +38,24 @@ #include "rds.h" #include "ib.h" +unsigned int rds_ib_srq_max_wr = RDS_IB_DEFAULT_SRQ_MAX_WR; +unsigned int rds_ib_srq_hwm_refill = RDS_IB_DEFAULT_SRQ_HWM_REFILL; +unsigned int rds_ib_srq_lwm_refill = RDS_IB_DEFAULT_SRQ_LWM_REFILL; +unsigned int rds_ib_srq_enabled = 0; + +module_param(rds_ib_srq_enabled, int, 0444); +MODULE_PARM_DESC(rds_ib_srq_enabled, "Set to enabled SRQ"); +module_param(rds_ib_srq_max_wr, int, 0444); +MODULE_PARM_DESC(rds_ib_srq_max_wr, "Max number of SRQ WRs"); +module_param(rds_ib_srq_hwm_refill, int, 0444); +MODULE_PARM_DESC(rds_ib_srq_hwm_refill, "SRQ HWM refill"); +module_param(rds_ib_srq_lwm_refill, int, 0444); +MODULE_PARM_DESC(rds_ib_srq_lwm_refill, "SRQ LWM refill"); + static struct kmem_cache *rds_ib_incoming_slab; static struct kmem_cache *rds_ib_frag_slab; static atomic_t rds_ib_allocation = ATOMIC_INIT(0); +static unsigned long rds_ib_allocation_warn = 1; void rds_ib_recv_init_ring(struct rds_ib_connection *ic) { @@ -104,7 +118,7 @@ static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache) cache->percpu = alloc_percpu(struct rds_ib_cache_head); if (!cache->percpu) - return -ENOMEM; + return -ENOMEM; for_each_possible_cpu(cpu) { head = per_cpu_ptr(cache->percpu, cpu); @@ -193,6 +207,7 @@ static void rds_ib_frag_free(struct rds_ib_connection *ic, rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg)); rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags); + atomic_inc(&ic->i_cache_allocs); } /* Recycle inc after freeing attached frags */ @@ -243,23 +258,15 @@ static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *i { struct rds_ib_incoming *ibinc; struct list_head *cache_item; - int avail_allocs; cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs); if (cache_item) { ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry); } else { - avail_allocs = atomic_add_unless(&rds_ib_allocation, - 1, rds_ib_sysctl_max_recv_allocation); - if (!avail_allocs) { - rds_ib_stats_inc(s_ib_rx_alloc_limit); - return NULL; - } ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask); - if (!ibinc) { - atomic_dec(&rds_ib_allocation); + if (!ibinc) return NULL; - } + rds_ib_stats_inc(s_ib_rx_total_incs); } INIT_LIST_HEAD(&ibinc->ii_frags); rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr); @@ -273,22 +280,40 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic struct 
rds_page_frag *frag; struct list_head *cache_item; int ret; + int avail_allocs; cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags); if (cache_item) { frag = container_of(cache_item, struct rds_page_frag, f_cache_entry); + atomic_dec(&ic->i_cache_allocs); } else { frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask); if (!frag) return NULL; + avail_allocs = atomic_add_unless(&rds_ib_allocation, + 1, rds_ib_sysctl_max_recv_allocation); + + if (!avail_allocs) { + if (test_and_clear_bit(0, &rds_ib_allocation_warn)) { + printk(KERN_NOTICE "RDS/IB: WARNING - " + "recv memory exceeded max_recv_allocation %d\n", + atomic_read(&rds_ib_allocation)); + } + rds_ib_stats_inc(s_ib_rx_alloc_limit); + kmem_cache_free(rds_ib_frag_slab, frag); + return NULL; + } + sg_init_table(&frag->f_sg, 1); ret = rds_page_remainder_alloc(&frag->f_sg, RDS_FRAG_SIZE, page_mask); if (ret) { kmem_cache_free(rds_ib_frag_slab, frag); + atomic_dec(&rds_ib_allocation); return NULL; } + rds_ib_stats_inc(s_ib_rx_total_frags); } INIT_LIST_HEAD(&frag->f_item); @@ -297,7 +322,7 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic } static int rds_ib_recv_refill_one(struct rds_connection *conn, - struct rds_ib_recv_work *recv, int prefill) + struct rds_ib_recv_work *recv, gfp_t gfp) { struct rds_ib_connection *ic = conn->c_transport_data; struct ib_sge *sge; @@ -305,7 +330,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, gfp_t slab_mask = GFP_NOWAIT; gfp_t page_mask = GFP_NOWAIT; - if (prefill) { + if (gfp & __GFP_WAIT) { slab_mask = GFP_KERNEL; page_mask = GFP_HIGHUSER; } @@ -325,7 +350,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, goto out; } - WARN_ON(recv->r_frag); /* leak! */ + WARN_ON_ONCE(recv->r_frag); /* leak! */ recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask); if (!recv->r_frag) goto out; @@ -347,6 +372,160 @@ out: return ret; } +static void rds_ib_srq_clear_one(struct rds_ib_srq *srq, + struct rds_ib_connection *ic, + struct rds_ib_recv_work *recv) +{ + if (recv->r_ibinc) { + rds_inc_put(&recv->r_ibinc->ii_inc); + recv->r_ibinc = NULL; + } + if (recv->r_frag) { + ib_dma_unmap_sg(srq->rds_ibdev->dev, &recv->r_frag->f_sg, + 1, DMA_FROM_DEVICE); + rds_ib_frag_free(ic, recv->r_frag); + recv->r_frag = NULL; + recv->r_ic = ic; + recv->r_posted = 0; + } +} + +static int rds_ib_srq_refill_one(struct rds_ib_srq *srq, + struct rds_ib_connection *ic, + struct rds_ib_recv_work *recv, gfp_t gfp) +{ + struct ib_sge *sge; + int ret = -ENOMEM; + gfp_t slab_mask = GFP_NOWAIT; + gfp_t page_mask = GFP_NOWAIT; + + if (gfp & __GFP_WAIT) { + slab_mask = GFP_KERNEL; + page_mask = GFP_HIGHUSER; + } + + if (!ic->i_cache_incs.ready) + rds_ib_cache_xfer_to_ready(&ic->i_cache_incs); + if (!ic->i_cache_frags.ready) + rds_ib_cache_xfer_to_ready(&ic->i_cache_frags); + + /* + * ibinc was taken from recv if recv contained the start of a message. + * recvs that were continuations will still have this allocated. + */ + + if (!recv->r_ibinc) { + recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask); + if (!recv->r_ibinc) + goto out; + } + + WARN_ON_ONCE(recv->r_frag); /* leak! 
*/ + recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask); + if (!recv->r_frag) + goto out; + + ret = ib_dma_map_sg(srq->rds_ibdev->dev, &recv->r_frag->f_sg, + 1, DMA_FROM_DEVICE); + + WARN_ON(ret != 1); + + sge = &recv->r_sge[0]; + + sge->addr = srq->s_recv_hdrs_dma + + (recv - srq->s_recvs) * + sizeof(struct rds_header); + + sge->length = sizeof(struct rds_header); + + sge = &recv->r_sge[1]; + sge->addr = sg_dma_address(&recv->r_frag->f_sg); + sge->length = sg_dma_len(&recv->r_frag->f_sg); + + ret = 0; +out: + return ret; +} + +static int rds_ib_srq_prefill_one(struct rds_ib_device *rds_ibdev, + struct rds_ib_recv_work *recv, int prefill) +{ + struct ib_sge *sge; + int ret = -ENOMEM; + gfp_t slab_mask = GFP_NOWAIT; + gfp_t page_mask = GFP_NOWAIT; + + if (prefill) { + slab_mask = GFP_KERNEL; + page_mask = GFP_HIGHUSER; + } + + if (!recv->r_ibinc) { + recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask); + if (!recv->r_ibinc) + goto out; + rds_ib_stats_inc(s_ib_rx_total_incs); + INIT_LIST_HEAD(&recv->r_ibinc->ii_frags); + } + + WARN_ON_ONCE(recv->r_frag); /* leak! */ + recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask); + if (!recv->r_frag) + goto out; + sg_init_table(&recv->r_frag->f_sg, 1); + ret = rds_page_remainder_alloc(&recv->r_frag->f_sg, + RDS_FRAG_SIZE, page_mask); + if (ret) { + kmem_cache_free(rds_ib_frag_slab, recv->r_frag); + goto out; + } + + rds_ib_stats_inc(s_ib_rx_total_frags); + INIT_LIST_HEAD(&recv->r_frag->f_item); + + ret = ib_dma_map_sg(rds_ibdev->dev, &recv->r_frag->f_sg, + 1, DMA_FROM_DEVICE); + WARN_ON(ret != 1); + + sge = &recv->r_sge[0]; + sge->addr = rds_ibdev->srq->s_recv_hdrs_dma + + (recv - rds_ibdev->srq->s_recvs) * + sizeof(struct rds_header); + sge->length = sizeof(struct rds_header); + sge->lkey = rds_ibdev->mr->lkey; + + sge = &recv->r_sge[1]; + sge->addr = sg_dma_address(&recv->r_frag->f_sg); + sge->length = sg_dma_len(&recv->r_frag->f_sg); + sge->lkey = rds_ibdev->mr->lkey; + + ret = 0; +out: + return ret; +} + + + +static int acquire_refill(struct rds_connection *conn) +{ + return test_and_set_bit(RDS_RECV_REFILL, &conn->c_flags) == 0; +} + +static void release_refill(struct rds_connection *conn) +{ + clear_bit(RDS_RECV_REFILL, &conn->c_flags); + smp_mb__after_clear_bit(); + /* + * We don't use wait_on_bit()/wake_up_bit() because our waking is in a + * hot path and finding waiters is very rare. We don't want to walk + * the system-wide hashed waitqueue buckets in the fast path only to + * almost never find waiters. + */ + if (waitqueue_active(&conn->c_waitq)) + wake_up_all(&conn->c_waitq); +} + + /* * This tries to allocate and post unused work requests after making sure that * they have all the allocations they need to queue received fragments into @@ -354,17 +533,45 @@ out: * * -1 is returned if posting fails due to temporary resource exhaustion. */ -void rds_ib_recv_refill(struct rds_connection *conn, int prefill) +void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) { struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_recv_work *recv; struct ib_recv_wr *failed_wr; unsigned int posted = 0; int ret = 0; + int can_wait = gfp & __GFP_WAIT; + int must_wake = 0; + int ring_low = 0; + int ring_empty = 0; u32 pos; - while ((prefill || rds_conn_up(conn)) && - rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) { + /* + * the goal here is to just make sure that someone, somewhere + * is posting buffers. 
If we can't get the refill lock, + * let them do their thing + */ + if (!acquire_refill(conn)) + return; + + ring_low = rds_ib_ring_low(&ic->i_recv_ring); + ring_empty = rds_ib_ring_empty(&ic->i_recv_ring); + + /* If we ever end up with a really empty receive ring, we're + * in deep trouble, as the sender will definitely see RNR + * timeouts. */ + if (ring_empty) + rds_ib_stats_inc(s_ib_rx_ring_empty); + + /* + * if we're called from the tasklet, can_wait will be zero. We only + * want to refill if we're getting low in this case + */ + if (!ring_low && !can_wait) + goto release_out; + + while ((prefill || rds_conn_up(conn)) + && rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) { if (pos >= ic->i_recv_ring.w_nr) { printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", pos); @@ -372,10 +579,9 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill) } recv = &ic->i_recvs[pos]; - ret = rds_ib_recv_refill_one(conn, recv, prefill); - if (ret) { + ret = rds_ib_recv_refill_one(conn, recv, gfp); + if (ret) break; - } /* XXX when can this fail? */ ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); @@ -391,14 +597,44 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill) } posted++; + + if ((posted > 128 && need_resched()) || posted > 8192) { + must_wake = 1; + break; + } } + /* read ring_low and ring_empty before we drop our lock */ + ring_low = rds_ib_ring_low(&ic->i_recv_ring); + ring_empty = rds_ib_ring_empty(&ic->i_recv_ring); + /* We're doing flow control - update the window. */ if (ic->i_flowctl && posted) rds_ib_advertise_credits(conn, posted); if (ret) rds_ib_ring_unalloc(&ic->i_recv_ring, 1); + +release_out: + release_refill(conn); + + /* if we're called from the softirq handler, we'll be GFP_NOWAIT. + * in this case the ring being low is going to lead to more interrupts + * and we can safely let the softirq code take care of it unless the + * ring is completely empty. + * + * if we're called from krdsd, we'll be GFP_KERNEL. In this case + * we might have raced with the softirq code while we had the refill + * lock held. Use rds_ib_ring_low() instead of ring_empty to decide + * if we should requeue. + */ + if (rds_conn_up(conn) && + (must_wake || (can_wait && ring_low) + || rds_ib_ring_empty(&ic->i_recv_ring))) { + queue_delayed_work(rds_wq, &conn->c_recv_w, 1); + } + if (can_wait) + cond_resched(); } /* @@ -563,7 +799,7 @@ void rds_ib_recv_init_ack(struct rds_ib_connection *ic) * wr_id and avoids working with the ring in that case. */ #ifndef KERNEL_HAS_ATOMIC64 -static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, +void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required) { unsigned long flags; @@ -589,7 +825,7 @@ static u64 rds_ib_get_ack(struct rds_ib_connection *ic) return seq; } #else -static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, +void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required) { atomic64_set(&ic->i_ack_next, seq); @@ -691,7 +927,7 @@ void rds_ib_attempt_ack(struct rds_ib_connection *ic) } /* Can we get a send credit? 
*/ - if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) { + if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0)) { rds_ib_stats_inc(s_ib_tx_throttle); clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); return; @@ -797,20 +1033,6 @@ static void rds_ib_cong_recv(struct rds_connection *conn, rds_cong_map_updated(map, uncongested); } -/* - * Rings are posted with all the allocations they'll need to queue the - * incoming message to the receiving socket so this can't fail. - * All fragments start with a header, so we can make sure we're not receiving - * garbage, and we can tell a small 8 byte fragment from an ACK frame. - */ -struct rds_ib_ack_state { - u64 ack_next; - u64 ack_recv; - unsigned int ack_required:1; - unsigned int ack_next_valid:1; - unsigned int ack_recv_valid:1; -}; - static void rds_ib_process_recv(struct rds_connection *conn, struct rds_ib_recv_work *recv, u32 data_len, struct rds_ib_ack_state *state) @@ -854,7 +1076,8 @@ static void rds_ib_process_recv(struct rds_connection *conn, if (ihdr->h_credit) rds_ib_send_add_credits(conn, ihdr->h_credit); - if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) { + if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0 && + ihdr->h_flags == 0) { /* This is an ACK-only packet. The fact that it gets * special treatment here is that historically, ACKs * were rather special beasts. @@ -896,10 +1119,10 @@ static void rds_ib_process_recv(struct rds_connection *conn, hdr = &ibinc->ii_inc.i_hdr; /* We can't just use memcmp here; fragments of a * single message may carry different ACKs */ - if (hdr->h_sequence != ihdr->h_sequence || - hdr->h_len != ihdr->h_len || - hdr->h_sport != ihdr->h_sport || - hdr->h_dport != ihdr->h_dport) { + if (hdr->h_sequence != ihdr->h_sequence + || hdr->h_len != ihdr->h_len + || hdr->h_sport != ihdr->h_sport + || hdr->h_dport != ihdr->h_dport) { rds_ib_conn_error(conn, "fragment header mismatch; forcing reconnect\n"); return; @@ -937,108 +1160,283 @@ static void rds_ib_process_recv(struct rds_connection *conn, } } -/* - * Plucking the oldest entry from the ring can be done concurrently with - * the thread refilling the ring. Each ring operation is protected by - * spinlocks and the transient state of refilling doesn't change the - * recording of which entry is oldest. - * - * This relies on IB only calling one cq comp_handler for each cq so that - * there will only be one caller of rds_recv_incoming() per RDS connection. - */ -void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context) +void rds_ib_srq_process_recv(struct rds_connection *conn, + struct rds_ib_recv_work *recv, u32 data_len, + struct rds_ib_ack_state *state) { - struct rds_connection *conn = context; struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_incoming *ibinc = ic->i_ibinc; + struct rds_header *ihdr, *hdr; - rdsdebug("conn %p cq %p\n", conn, cq); + if (data_len < sizeof(struct rds_header)) { + printk(KERN_WARNING "RDS: from %pI4 didn't inclue a " + "header, disconnecting and " + "reconnecting\n", + &conn->c_faddr); + rds_ib_frag_free(ic, recv->r_frag); + recv->r_frag = NULL; + return; + } + data_len -= sizeof(struct rds_header); + + ihdr = &ic->rds_ibdev->srq->s_recv_hdrs[recv->r_wr.wr_id]; - rds_ib_stats_inc(s_ib_rx_cq_call); + /* Validate the checksum. 
*/ + if (!rds_message_verify_checksum(ihdr)) { + printk(KERN_WARNING "RDS: from %pI4 has corrupted header - " + "forcing a reconnect\n", + &conn->c_faddr); + rds_stats_inc(s_recv_drop_bad_checksum); + rds_ib_frag_free(ic, recv->r_frag); + recv->r_frag = NULL; + return; + } - tasklet_schedule(&ic->i_recv_tasklet); + /* Process the ACK sequence which comes with every packet */ + state->ack_recv = be64_to_cpu(ihdr->h_ack); + state->ack_recv = be64_to_cpu(ihdr->h_ack); + state->ack_recv_valid = 1; + + if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && data_len == 0) { + rds_ib_stats_inc(s_ib_ack_received); + rds_ib_frag_free(ic, recv->r_frag); + recv->r_frag = NULL; + return; + } + + if (!ibinc) { + ibinc = recv->r_ibinc; + rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr); + recv->r_ibinc = NULL; + ic->i_ibinc = ibinc; + hdr = &ibinc->ii_inc.i_hdr; + memcpy(hdr, ihdr, sizeof(*hdr)); + ic->i_recv_data_rem = be32_to_cpu(hdr->h_len); + } else { + hdr = &ibinc->ii_inc.i_hdr; + if (hdr->h_sequence != ihdr->h_sequence + || hdr->h_len != ihdr->h_len + || hdr->h_sport != ihdr->h_sport + || hdr->h_dport != ihdr->h_dport) { + printk(KERN_WARNING "RDS: fragment header mismatch; " + "forcing reconnect\n"); + rds_ib_frag_free(ic, recv->r_frag); + recv->r_frag = NULL; + return; + } + } + + list_add_tail(&recv->r_frag->f_item, &ibinc->ii_frags); + + recv->r_frag = NULL; + + if (ic->i_recv_data_rem > RDS_FRAG_SIZE) + ic->i_recv_data_rem -= RDS_FRAG_SIZE; + else { + ic->i_recv_data_rem = 0; + ic->i_ibinc = NULL; + + if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) + rds_ib_cong_recv(conn, ibinc); + else { + rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr, + &ibinc->ii_inc, GFP_ATOMIC, + KM_SOFTIRQ0); + + state->ack_next = be64_to_cpu(hdr->h_sequence); + state->ack_next_valid = 1; + } + if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) { + rds_stats_inc(s_recv_ack_required); + state->ack_required = 1; + } + rds_inc_put(&ibinc->ii_inc); + } } -static inline void rds_poll_cq(struct rds_ib_connection *ic, - struct rds_ib_ack_state *state) +void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, + struct ib_wc *wc, + struct rds_ib_ack_state *state) { struct rds_connection *conn = ic->conn; - struct ib_wc wc; struct rds_ib_recv_work *recv; + struct rds_ib_device *rds_ibdev = ic->rds_ibdev; + + rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc->wr_id, wc->status, wc->byte_len, + be32_to_cpu(wc->ex.imm_data)); - while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) { - rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", - (unsigned long long)wc.wr_id, wc.status, - rds_ib_wc_status_str(wc.status), wc.byte_len, - be32_to_cpu(wc.ex.imm_data)); - rds_ib_stats_inc(s_ib_rx_cq_event); + rds_ib_stats_inc(s_ib_rx_cq_event); + if (rds_ib_srq_enabled) { + recv = &rds_ibdev->srq->s_recvs[wc->wr_id]; + atomic_dec(&rds_ibdev->srq->s_num_posted); + } else recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; - ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE); + ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE); - /* - * Also process recvs in connecting state because it is possible - * to get a recv completion _before_ the rdmacm ESTABLISHED - * event is processed. 
- */ - if (wc.status == IB_WC_SUCCESS) { - rds_ib_process_recv(conn, recv, wc.byte_len, state); - } else { - /* We expect errors as the qp is drained during shutdown */ - if (rds_conn_up(conn) || rds_conn_connecting(conn)) - rds_ib_conn_error(conn, "recv completion on %pI4 had " - "status %u (%s), disconnecting and " - "reconnecting\n", &conn->c_faddr, - wc.status, - rds_ib_wc_status_str(wc.status)); - } + if (wc->status == IB_WC_SUCCESS) { + if (rds_ib_srq_enabled) + rds_ib_srq_process_recv(conn, recv, wc->byte_len, state); + else + rds_ib_process_recv(conn, recv, wc->byte_len, state); + } else { + /* We expect errors as the qp is drained during shutdown */ + if (rds_conn_up(conn) || rds_conn_connecting(conn)) + rds_ib_conn_error(conn, "recv completion on " + "%pI4 had status %u, disconnecting and " + "reconnecting\n", &conn->c_faddr, + wc->status); + } - /* - * It's very important that we only free this ring entry if we've truly - * freed the resources allocated to the entry. The refilling path can - * leak if we don't. - */ + /* + * rds_ib_process_recv() doesn't always consume the frag, and + * we might not have called it at all if the wc didn't indicate + * success. We already unmapped the frag's pages, though, and the + * following rds_ib_ring_free() call tells the refill path that it + * will not find an allocated frag here. Make sure we keep that + * promise by freeing a frag that's still on the ring. + */ + if (recv->r_frag) { + rds_ib_frag_free(ic, recv->r_frag); + recv->r_frag = NULL; + } + + if (!rds_ib_srq_enabled) { rds_ib_ring_free(&ic->i_recv_ring, 1); + rds_ib_recv_refill(conn, 0, GFP_NOWAIT); + } else { + recv->r_ic = ic; + recv->r_posted = 0; } } -void rds_ib_recv_tasklet_fn(unsigned long data) +void rds_ib_srq_refill(struct work_struct *work) { - struct rds_ib_connection *ic = (struct rds_ib_connection *) data; - struct rds_connection *conn = ic->conn; - struct rds_ib_ack_state state = { 0, }; + struct rds_ib_srq *srq = container_of(work, struct rds_ib_srq, s_refill_w.work); + struct rds_ib_recv_work *prv = NULL, *cur = NULL, *tmp; + struct ib_recv_wr *bad_wr; + int i, refills = 0, total_refills = 0; + + if (!test_bit(0, &srq->s_refill_gate)) + return; + + rds_ib_stats_inc(s_ib_srq_refills); - rds_poll_cq(ic, &state); - ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); - rds_poll_cq(ic, &state); + for (i = 0; i < srq->s_n_wr; i++) { + tmp = &srq->s_recvs[i]; + if (tmp->r_posted) + continue; - if (state.ack_next_valid) - rds_ib_set_ack(ic, state.ack_next, state.ack_required); - if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) { - rds_send_drop_acked(conn, state.ack_recv, NULL); - ic->i_ack_recv = state.ack_recv; + if (rds_ib_srq_refill_one(srq, tmp->r_ic, tmp, GFP_NOWAIT)) { + printk(KERN_ERR "rds_ib_srq_refill_one failed\n"); + break; + } + cur = tmp; + + if (!prv) { + prv = cur; + prv->r_wr.next = NULL; + } else { + cur->r_wr.next = &prv->r_wr; + prv = cur; + } + cur->r_posted = 1; + + total_refills++; + if (++refills == RDS_IB_SRQ_POST_BATCH_COUNT) { + if (ib_post_srq_recv(srq->s_srq, &cur->r_wr, &bad_wr)) { + struct ib_recv_wr *wr; + struct rds_ib_recv_work *recv; + + for (wr = &cur->r_wr; wr; wr = wr->next) { + recv = container_of(wr, struct rds_ib_recv_work, r_wr); + rds_ib_srq_clear_one(srq, recv->r_ic, recv); + } + printk(KERN_ERR "ib_post_srq_recv failed\n"); + goto out; + } + atomic_add(refills, &srq->s_num_posted); + prv = NULL; + refills = 0; + cur = NULL; + } + } + if (cur) { + if (ib_post_srq_recv(srq->s_srq, &cur->r_wr, &bad_wr)) { + struct 
ib_recv_wr *wr; + struct rds_ib_recv_work *recv; + + for (wr = &cur->r_wr; wr; wr = wr->next) { + recv = container_of(wr, struct rds_ib_recv_work, r_wr); + rds_ib_srq_clear_one(srq, recv->r_ic, recv); + } + printk(KERN_ERR "ib_post_srq_recv failed\n"); + goto out; + } + atomic_add(refills, &srq->s_num_posted); } - if (rds_conn_up(conn)) - rds_ib_attempt_ack(ic); - /* If we ever end up with a really empty receive ring, we're - * in deep trouble, as the sender will definitely see RNR - * timeouts. */ - if (rds_ib_ring_empty(&ic->i_recv_ring)) - rds_ib_stats_inc(s_ib_rx_ring_empty); + if (!total_refills) + rds_ib_stats_inc(s_ib_srq_empty_refills); +out: + clear_bit(0, &srq->s_refill_gate); +} + +int rds_ib_srq_prefill_ring(struct rds_ib_device *rds_ibdev) +{ + struct rds_ib_recv_work *recv; + struct ib_recv_wr *bad_wr; + u32 i; + int ret; + + for (i = 0, recv = rds_ibdev->srq->s_recvs; + i < rds_ibdev->srq->s_n_wr; i++, recv++) { + recv->r_wr.next = NULL; + recv->r_wr.wr_id = i; + recv->r_wr.sg_list = recv->r_sge; + recv->r_wr.num_sge = RDS_IB_RECV_SGE; + recv->r_ibinc = NULL; + recv->r_frag = NULL; + recv->r_ic = NULL; + + if (rds_ib_srq_prefill_one(rds_ibdev, recv, 1)) + return 1; - if (rds_ib_ring_low(&ic->i_recv_ring)) - rds_ib_recv_refill(conn, 0); + ret = ib_post_srq_recv(rds_ibdev->srq->s_srq, + &recv->r_wr, &bad_wr); + if (ret) { + printk(KERN_WARNING "RDS: ib_post_srq_recv failed %d\n", ret); + return 1; + } + atomic_inc(&rds_ibdev->srq->s_num_posted); + recv->r_posted = 1; + } + return 0; } +static void rds_ib_srq_clear_ring(struct rds_ib_device *rds_ibdev) +{ + u32 i; + struct rds_ib_recv_work *recv; + + for (i = 0, recv = rds_ibdev->srq->s_recvs; + i < rds_ibdev->srq->s_n_wr; i++, recv++) + rds_ib_srq_clear_one(rds_ibdev->srq, recv->r_ic, recv); +} + + int rds_ib_recv(struct rds_connection *conn) { struct rds_ib_connection *ic = conn->c_transport_data; int ret = 0; rdsdebug("conn %p\n", conn); - if (rds_conn_up(conn)) + if (!rds_ib_srq_enabled && rds_conn_up(conn)) { rds_ib_attempt_ack(ic); + rds_ib_recv_refill(conn, 0, GFP_KERNEL); + } return ret; } @@ -1046,7 +1444,6 @@ int rds_ib_recv(struct rds_connection *conn) int rds_ib_recv_init(void) { struct sysinfo si; - int ret = -ENOMEM; /* Default to 30% of all available RAM for recv memory */ si_meminfo(&si); @@ -1054,19 +1451,19 @@ int rds_ib_recv_init(void) rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming", sizeof(struct rds_ib_incoming), - 0, SLAB_HWCACHE_ALIGN, NULL); + SLAB_HWCACHE_ALIGN, 0, NULL); if (!rds_ib_incoming_slab) - goto out; + return -ENOMEM; rds_ib_frag_slab = kmem_cache_create("rds_ib_frag", sizeof(struct rds_page_frag), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (!rds_ib_frag_slab) + SLAB_HWCACHE_ALIGN, 0, NULL); + if (!rds_ib_frag_slab) { kmem_cache_destroy(rds_ib_incoming_slab); - else - ret = 0; -out: - return ret; + rds_ib_incoming_slab = NULL; + return -ENOMEM; + } + return 0; } void rds_ib_recv_exit(void) @@ -1074,3 +1471,145 @@ void rds_ib_recv_exit(void) kmem_cache_destroy(rds_ib_incoming_slab); kmem_cache_destroy(rds_ib_frag_slab); } + +void rds_ib_srq_rearm(struct work_struct *work) +{ + struct rds_ib_srq *srq = container_of(work, struct rds_ib_srq, s_rearm_w.work); + struct ib_srq_attr srq_attr; + + srq_attr.srq_limit = rds_ib_srq_lwm_refill; + if (ib_modify_srq(srq->s_srq, &srq_attr, IB_SRQ_LIMIT)) { + printk(KERN_ERR "RDS: ib_modify_srq failed\n"); + return; + } +} + +static void rds_ib_srq_event(struct ib_event *event, + void *ctx) +{ + struct rds_ib_device *rds_ibdev = ctx; + + switch 
(event->event) { + case IB_EVENT_SRQ_ERR: + printk(KERN_ERR "RDS: event IB_EVENT_SRQ_ERR unhandled\n"); + break; + case IB_EVENT_SRQ_LIMIT_REACHED: + rds_ib_stats_inc(s_ib_srq_lows); + queue_delayed_work(rds_wq, &rds_ibdev->srq->s_rearm_w, HZ); + + if (!test_and_set_bit(0, &rds_ibdev->srq->s_refill_gate)) + queue_delayed_work(rds_wq, &rds_ibdev->srq->s_refill_w, 0); + break; + default: + break; + } +} + +/* Setup SRQ for a device */ +int rds_ib_srq_init(struct rds_ib_device *rds_ibdev) +{ + struct ib_srq_init_attr srq_init_attr = { + rds_ib_srq_event, + (void *)rds_ibdev, + .attr = { + .max_wr = rds_ib_srq_max_wr - 1, + .max_sge = rds_ibdev->max_sge + } + }; + + rds_ibdev->srq->rds_ibdev = rds_ibdev; + + rds_ibdev->srq->s_n_wr = rds_ib_srq_max_wr - 1; + rds_ibdev->srq->s_srq = ib_create_srq(rds_ibdev->pd, + &srq_init_attr); + + if (IS_ERR(rds_ibdev->srq->s_srq)) { + printk(KERN_WARNING "RDS: ib_create_srq failed %ld\n", + PTR_ERR(rds_ibdev->srq->s_srq)); + return 1; + } + + rds_ibdev->srq->s_recv_hdrs = ib_dma_alloc_coherent(rds_ibdev->dev, + rds_ibdev->srq->s_n_wr * + sizeof(struct rds_header), + &rds_ibdev->srq->s_recv_hdrs_dma, GFP_KERNEL); + if (!rds_ibdev->srq->s_recv_hdrs) { + printk(KERN_WARNING "ib_dma_alloc_coherent failed\n"); + return 1; + } + + rds_ibdev->srq->s_recvs = vmalloc(rds_ibdev->srq->s_n_wr * + sizeof(struct rds_ib_recv_work)); + + if (!rds_ibdev->srq->s_recvs) { + printk(KERN_WARNING "RDS: vmalloc failed\n"); + return 1; + } + + memset(rds_ibdev->srq->s_recvs, 0, rds_ibdev->srq->s_n_wr * + sizeof(struct rds_ib_recv_work)); + + atomic_set(&rds_ibdev->srq->s_num_posted, 0); + clear_bit(0, &rds_ibdev->srq->s_refill_gate); + + if (rds_ib_srq_prefill_ring(rds_ibdev)) + return 1; + + INIT_DELAYED_WORK(&rds_ibdev->srq->s_refill_w, rds_ib_srq_refill); + + INIT_DELAYED_WORK(&rds_ibdev->srq->s_rearm_w, rds_ib_srq_rearm); + + queue_delayed_work(rds_wq, &rds_ibdev->srq->s_rearm_w, 0); + + return 0; +} + +int rds_ib_srqs_init(void) +{ + struct rds_ib_device *rds_ibdev; + int ret; + + if (!rds_ib_srq_enabled) + return 0; + + list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { + ret = rds_ib_srq_init(rds_ibdev); + if (ret) + return ret; + } + + return 0; +} + +void rds_ib_srq_exit(struct rds_ib_device *rds_ibdev) +{ + int ret; + + ret = ib_destroy_srq(rds_ibdev->srq->s_srq); + if (ret) + printk(KERN_WARNING "RDS: ib_destroy_srq failed %d\n", ret); + rds_ibdev->srq->s_srq = NULL; + + if (rds_ibdev->srq->s_recv_hdrs) + ib_dma_free_coherent(rds_ibdev->dev, + rds_ibdev->srq->s_n_wr * + sizeof(struct rds_header), + rds_ibdev->srq->s_recv_hdrs, + rds_ibdev->srq->s_recv_hdrs_dma); + + rds_ib_srq_clear_ring(rds_ibdev); + vfree(rds_ibdev->srq->s_recvs); + rds_ibdev->srq->s_recvs = NULL; +} + +void rds_ib_srqs_exit(void) +{ + struct rds_ib_device *rds_ibdev; + + if (!rds_ib_srq_enabled) + return; + + list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { + rds_ib_srq_exit(rds_ibdev); + } +} diff --git a/net/rds/ib_ring.c b/net/rds/ib_ring.c index ff97e8eda858b..b66cd4244d307 100644 --- a/net/rds/ib_ring.c +++ b/net/rds/ib_ring.c @@ -76,7 +76,6 @@ static inline u32 __rds_ib_ring_used(struct rds_ib_work_ring *ring) /* This assumes that atomic_t has at least as many bits as u32 */ diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr); - BUG_ON(diff > ring->w_nr); return diff; } @@ -118,6 +117,7 @@ void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val) { ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr; atomic_add(val, &ring->w_free_ctr); + smp_mb(); if 
(__rds_ib_ring_empty(ring) && waitqueue_active(&rds_ib_ring_empty_wait)) @@ -137,7 +137,7 @@ int rds_ib_ring_empty(struct rds_ib_work_ring *ring) int rds_ib_ring_low(struct rds_ib_work_ring *ring) { - return __rds_ib_ring_used(ring) <= (ring->w_nr >> 1); + return __rds_ib_ring_used(ring) <= (ring->w_nr >> 2); } /* diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 7c4dce8fa5e6b..52e497cad5f97 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -37,41 +37,7 @@ #include "rds.h" #include "ib.h" - -static char *rds_ib_wc_status_strings[] = { -#define RDS_IB_WC_STATUS_STR(foo) \ - [IB_WC_##foo] = __stringify(IB_WC_##foo) - RDS_IB_WC_STATUS_STR(SUCCESS), - RDS_IB_WC_STATUS_STR(LOC_LEN_ERR), - RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR), - RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR), - RDS_IB_WC_STATUS_STR(LOC_PROT_ERR), - RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR), - RDS_IB_WC_STATUS_STR(MW_BIND_ERR), - RDS_IB_WC_STATUS_STR(BAD_RESP_ERR), - RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR), - RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR), - RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR), - RDS_IB_WC_STATUS_STR(REM_OP_ERR), - RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR), - RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR), - RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR), - RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR), - RDS_IB_WC_STATUS_STR(REM_ABORT_ERR), - RDS_IB_WC_STATUS_STR(INV_EECN_ERR), - RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR), - RDS_IB_WC_STATUS_STR(FATAL_ERR), - RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR), - RDS_IB_WC_STATUS_STR(GENERAL_ERR), -#undef RDS_IB_WC_STATUS_STR -}; - -char *rds_ib_wc_status_str(enum ib_wc_status status) -{ - return rds_str_array(rds_ib_wc_status_strings, - ARRAY_SIZE(rds_ib_wc_status_strings), status); -} - +#include "tcp.h" /* * Convert IB-specific error message to RDS error message and call core * completion handler. @@ -87,7 +53,7 @@ static void rds_ib_send_complete(struct rds_message *rm, return; case IB_WC_SUCCESS: - notify_status = RDS_RDMA_SUCCESS; + notify_status = RDS_RDMA_SEND_SUCCESS; break; case IB_WC_REM_ACCESS_ERR: @@ -95,20 +61,33 @@ static void rds_ib_send_complete(struct rds_message *rm, break; default: - notify_status = RDS_RDMA_OTHER_ERROR; + notify_status = RDS_RDMA_SEND_OTHER_ERROR; break; } complete(rm, notify_status); } +static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, + struct rm_rdma_op *op, + int wc_status); + static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, struct rm_data_op *op, int wc_status) { + struct rds_message *rm; + + rm = container_of(op, struct rds_message, data); + if (op->op_nents) ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, op->op_nents, DMA_TO_DEVICE); + + if (rm->data.op_async) + rds_ib_send_complete(rm, wc_status, rds_asend_complete); + else if (rm->rdma.op_active && rm->rdma.op_remote_complete) + rds_ib_send_unmap_rdma(ic, &rm->rdma, wc_status); } static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, @@ -229,7 +208,7 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic) send->s_op = NULL; - send->s_wr.wr_id = i; + send->s_wr.wr_id = i | RDS_IB_SEND_OP; send->s_wr.sg_list = send->s_sge; send->s_wr.ex.imm_data = 0; @@ -253,10 +232,6 @@ void rds_ib_send_clear_ring(struct rds_ib_connection *ic) } } -/* - * The only fast path caller always has a non-zero nr, so we don't - * bother testing nr before performing the atomic sub. 
- */ static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr) { if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) && @@ -271,81 +246,69 @@ static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr) * unallocs the next free entry in the ring it doesn't alter which is * the next to be freed, which is what this is concerned with. */ -void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) +void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) { - struct rds_connection *conn = context; - struct rds_ib_connection *ic = conn->c_transport_data; - struct rds_message *rm = NULL; - struct ib_wc wc; + struct rds_connection *conn = ic->conn; struct rds_ib_send_work *send; u32 completed; u32 oldest; u32 i = 0; - int ret; int nr_sig = 0; - rdsdebug("cq %p conn %p\n", cq, conn); - rds_ib_stats_inc(s_ib_tx_cq_call); - ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - if (ret) - rdsdebug("ib_req_notify_cq send failed: %d\n", ret); - - while (ib_poll_cq(cq, 1, &wc) > 0) { - rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", - (unsigned long long)wc.wr_id, wc.status, - rds_ib_wc_status_str(wc.status), wc.byte_len, - be32_to_cpu(wc.ex.imm_data)); - rds_ib_stats_inc(s_ib_tx_cq_event); - - if (wc.wr_id == RDS_IB_ACK_WR_ID) { - if (ic->i_ack_queued + HZ/2 < jiffies) - rds_ib_stats_inc(s_ib_tx_stalled); - rds_ib_ack_send_complete(ic); - continue; - } + rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", + (unsigned long long)wc->wr_id, wc->status, wc->byte_len, + be32_to_cpu(wc->ex.imm_data)); - oldest = rds_ib_ring_oldest(&ic->i_send_ring); + rds_ib_stats_inc(s_ib_tx_cq_event); - completed = rds_ib_ring_completed(&ic->i_send_ring, wc.wr_id, oldest); + if (wc->wr_id == RDS_IB_ACK_WR_ID) { + if (ic->i_ack_queued + HZ/2 < jiffies) + rds_ib_stats_inc(s_ib_tx_stalled); + rds_ib_ack_send_complete(ic); + return; + } - for (i = 0; i < completed; i++) { - send = &ic->i_sends[oldest]; - if (send->s_wr.send_flags & IB_SEND_SIGNALED) - nr_sig++; + oldest = rds_ib_ring_oldest(&ic->i_send_ring); - rm = rds_ib_send_unmap_op(ic, send, wc.status); + completed = rds_ib_ring_completed(&ic->i_send_ring, + (wc->wr_id & ~RDS_IB_SEND_OP), oldest); - if (send->s_queued + HZ/2 < jiffies) - rds_ib_stats_inc(s_ib_tx_stalled); + for (i = 0; i < completed; i++) { + struct rds_message *rm; - if (send->s_op) { - if (send->s_op == rm->m_final_op) { - /* If anyone waited for this message to get flushed out, wake - * them up now */ - rds_message_unmapped(rm); - } - rds_message_put(rm); - send->s_op = NULL; - } + send = &ic->i_sends[oldest]; + if (send->s_wr.send_flags & IB_SEND_SIGNALED) + nr_sig++; - oldest = (oldest + 1) % ic->i_send_ring.w_nr; - } + rm = rds_ib_send_unmap_op(ic, send, wc->status); - rds_ib_ring_free(&ic->i_send_ring, completed); - rds_ib_sub_signaled(ic, nr_sig); - nr_sig = 0; - - if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || - test_bit(0, &conn->c_map_queued)) - queue_delayed_work(rds_wq, &conn->c_send_w, 0); - - /* We expect errors as the qp is drained during shutdown */ - if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { - rds_ib_conn_error(conn, "send completion on %pI4 had status " - "%u (%s), disconnecting and reconnecting\n", - &conn->c_faddr, wc.status, - rds_ib_wc_status_str(wc.status)); + if (send->s_queued + HZ/2 < jiffies) + rds_ib_stats_inc(s_ib_tx_stalled); + + if (send->s_op) { + if (send->s_op == rm->m_final_op) { + /* If anyone waited for this message to get flushed out, wake + * them up now */ + 
rds_message_unmapped(rm); + } + rds_message_put(rm); } + send->s_op = NULL; + + oldest = (oldest + 1) % ic->i_send_ring.w_nr; + } + + rds_ib_ring_free(&ic->i_send_ring, completed); + rds_ib_sub_signaled(ic, nr_sig); + + clear_bit(RDS_LL_SEND_FULL, &conn->c_flags); + + /* We expect errors as the qp is drained during shutdown */ + if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { + rds_ib_conn_error(conn, + "send completion on %pI4 " + "had status %u, disconnecting and reconnecting\n", + &conn->c_faddr, wc->status); } } @@ -355,7 +318,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) * * Conceptually, we have two counters: * - send credits: this tells us how many WRs we're allowed - * to submit without overruning the receiver's queue. For + * to submit without overruning the reciever's queue. For * each SEND WR we post, we decrement this by one. * * - posted credits: this tells us how many WRs we recently @@ -394,7 +357,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) * and using atomic_cmpxchg when updating the two counters. */ int rds_ib_send_grab_credits(struct rds_ib_connection *ic, - u32 wanted, u32 *adv_credits, int need_posted, int max_posted) + u32 wanted, u32 *adv_credits, int need_posted) { unsigned int avail, posted, got = 0, advertise; long oldval, newval; @@ -434,7 +397,7 @@ try_again: * available. */ if (posted && (got || need_posted)) { - advertise = min_t(unsigned int, posted, max_posted); + advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT); newval -= IB_SET_POST_CREDITS(advertise); } @@ -551,10 +514,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, if (conn->c_loopback && rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) { rds_cong_map_updated(conn->c_fcong, ~(u64) 0); - scat = &rm->data.op_sg[sg]; - ret = sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; - ret = min_t(int, ret, scat->length - conn->c_xmit_data_off); - return ret; + return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; } /* FIXME we may overallocate here */ @@ -565,14 +525,23 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); if (work_alloc == 0) { + /* there is a window right here where someone could + * have freed up entries on the ring. Lets make + * sure it really really really is full. + */ set_bit(RDS_LL_SEND_FULL, &conn->c_flags); - rds_ib_stats_inc(s_ib_tx_ring_full); - ret = -ENOMEM; - goto out; + smp_mb(); + work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); + if (work_alloc == 0) { + rds_ib_stats_inc(s_ib_tx_ring_full); + ret = -ENOMEM; + goto out; + } + clear_bit(RDS_LL_SEND_FULL, &conn->c_flags); } if (ic->i_flowctl) { - credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); + credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0); adv_credits += posted; if (credit_alloc < work_alloc) { rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); @@ -580,7 +549,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, flow_controlled = 1; } if (work_alloc == 0) { - set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); rds_ib_stats_inc(s_ib_tx_throttle); ret = -ENOMEM; goto out; @@ -640,7 +609,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, * Update adv_credits since we reset the ACK_REQUIRED bit. 
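* The credits grabbed here are "posted credits": receive buffers the
* refill path has posted since the peer was last told about them.  They
* are advertised in the outgoing header's h_credit field (the receive
* side feeds h_credit into rds_ib_send_add_credits()), and are now capped
* internally at RDS_MAX_ADV_CREDIT since grab_credits no longer takes a
* max_posted argument.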
*/ if (ic->i_flowctl) { - rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); + rds_ib_send_grab_credits(ic, 0, &posted, 1); adv_credits += posted; BUG_ON(adv_credits > 255); } @@ -739,6 +708,12 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, if (scat == &rm->data.op_sg[rm->data.op_count]) { prev->s_op = ic->i_data_op; prev->s_wr.send_flags |= IB_SEND_SOLICITED; + if (!(prev->s_wr.send_flags & IB_SEND_SIGNALED) || + (rm->rdma.op_active && rm->rdma.op_remote_complete)) { + ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; + prev->s_wr.send_flags |= IB_SEND_SIGNALED; + nr_sig++; + } ic->i_data_op = NULL; } @@ -810,17 +785,13 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) send->s_queued = jiffies; if (op->op_type == RDS_ATOMIC_TYPE_CSWP) { - send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP; - send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare; - send->s_wr.wr.atomic.swap = op->op_m_cswp.swap; - send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask; - send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask; + send->s_wr.opcode = IB_WR_ATOMIC_CMP_AND_SWP; + send->s_wr.wr.atomic.compare_add = op->op_compare; + send->s_wr.wr.atomic.swap = op->op_swap_add; } else { /* FADD */ - send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD; - send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add; + send->s_wr.opcode = IB_WR_ATOMIC_FETCH_AND_ADD; + send->s_wr.wr.atomic.compare_add = op->op_swap_add; send->s_wr.wr.atomic.swap = 0; - send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask; - send->s_wr.wr.atomic.swap_mask = 0; } nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify); send->s_wr.num_sge = 1; @@ -857,8 +828,8 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) send, &send->s_wr, ret, failed_wr); BUG_ON(failed_wr != &send->s_wr); if (ret) { - printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 " - "returned %d\n", &conn->c_faddr, ret); + printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %u.%u.%u.%u " + "returned %d\n", NIPQUAD(conn->c_faddr), ret); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); rds_ib_sub_signaled(ic, nr_sig); goto out; @@ -934,7 +905,8 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) send->s_queued = jiffies; send->s_op = NULL; - nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify); + if (!op->op_remote_complete) + nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify); send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; send->s_wr.wr.rdma.remote_addr = remote_addr; @@ -974,17 +946,51 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) send = ic->i_sends; } + if (i < work_alloc) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); + work_alloc = i; + } + /* give a reference to the last op */ if (scat == &op->op_sg[op->op_count]) { + if (op->op_write && op->op_silent && op->op_remote_complete) { + int rcomp_alloc, rcomp_pos; + struct rds_ib_send_work *rcomp; + + rcomp_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, + &rcomp_pos); + if (rcomp_alloc != 1) { + ib_dma_unmap_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, + op->op_write ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE); + op->op_mapped = 0; + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_stats_inc(s_ib_tx_ring_full); + ret = -ENOMEM; + goto out; + } + rcomp = &ic->i_sends[rcomp_pos]; + rcomp->s_sge[0] = prev->s_sge[prev->s_wr.num_sge-1]; + rcomp->s_sge[0].addr += + (rcomp->s_sge[0].length - sizeof(u8)); + rcomp->s_sge[0].length = sizeof(u8); + + rcomp->s_wr.num_sge = 1; + rcomp->s_wr.opcode = IB_WR_RDMA_READ; + rcomp->s_wr.next = NULL; + rcomp->s_wr.wr.rdma.remote_addr = + remote_addr - sizeof(u8); + rcomp->s_wr.wr.rdma.rkey = op->op_rkey; + prev->s_wr.next = &rcomp->s_wr; + prev = rcomp; + rcomp->s_wr.send_flags = IB_SEND_SIGNALED; + nr_sig++; + } + prev->s_op = op; rds_message_addref(container_of(op, struct rds_message, rdma)); } - if (i < work_alloc) { - rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); - work_alloc = i; - } - if (nr_sig) atomic_add(nr_sig, &ic->i_signaled_sends); diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index 2d5965d6e97c0..80a4c90ac6cfd 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c @@ -37,38 +37,49 @@ #include "rds.h" #include "ib.h" -DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); +DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned; -static const char *const rds_ib_stat_names[] = { +static char *rds_ib_stat_names[] = { "ib_connect_raced", "ib_listen_closed_stale", - "ib_tx_cq_call", + "ib_evt_handler_call", + "ib_tasklet_call", "ib_tx_cq_event", "ib_tx_ring_full", "ib_tx_throttle", "ib_tx_sg_mapping_failure", "ib_tx_stalled", "ib_tx_credit_updates", - "ib_rx_cq_call", "ib_rx_cq_event", "ib_rx_ring_empty", "ib_rx_refill_from_cq", "ib_rx_refill_from_thread", "ib_rx_alloc_limit", + "ib_rx_total_frags", + "ib_rx_total_incs", "ib_rx_credit_updates", "ib_ack_sent", "ib_ack_send_failure", "ib_ack_send_delayed", "ib_ack_send_piggybacked", "ib_ack_received", - "ib_rdma_mr_alloc", - "ib_rdma_mr_free", - "ib_rdma_mr_used", - "ib_rdma_mr_pool_flush", - "ib_rdma_mr_pool_wait", - "ib_rdma_mr_pool_depleted", + "ib_rdma_mr_8k_alloc", + "ib_rdma_mr_8k_free", + "ib_rdma_mr_8k_used", + "ib_rdma_mr_8k_pool_flush", + "ib_rdma_mr_8k_pool_wait", + "ib_rdma_mr_8k_pool_depleted", + "ib_rdma_mr_1m_alloc", + "ib_rdma_mr_1m_free", + "ib_rdma_mr_1m_used", + "ib_rdma_mr_1m_pool_flush", + "ib_rdma_mr_1m_pool_wait", + "ib_rdma_mr_1m_pool_depleted", "ib_atomic_cswp", "ib_atomic_fadd", + "ib_srq_lows", + "ib_srq_refills", + "ib_srq_empty_refills", }; unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c index 1253b006efdb3..2cb2aa1439b8a 100644 --- a/net/rds/ib_sysctl.c +++ b/net/rds/ib_sysctl.c @@ -59,15 +59,16 @@ static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64; * rings from ib_cm_connect_complete() back into ib_setup_qp() * will cause credits to be added before protocol negotiation. 
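* The knob ends up at /proc/sys/net/rds/ib/flow_control (rds_ib_sysctl_path
* plus the "flow_control" entry below); it is consulted while a connection
* is being set up, so changes apply to connections established afterwards.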
*/ + unsigned int rds_ib_sysctl_flow_control = 0; -static ctl_table rds_ib_sysctl_table[] = { +ctl_table rds_ib_sysctl_table[] = { { .procname = "max_send_wr", .data = &rds_ib_sysctl_max_send_wr, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_doulongvec_minmax, + .proc_handler = &proc_doulongvec_minmax, .extra1 = &rds_ib_sysctl_max_wr_min, .extra2 = &rds_ib_sysctl_max_wr_max, }, @@ -76,7 +77,7 @@ static ctl_table rds_ib_sysctl_table[] = { .data = &rds_ib_sysctl_max_recv_wr, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_doulongvec_minmax, + .proc_handler = &proc_doulongvec_minmax, .extra1 = &rds_ib_sysctl_max_wr_min, .extra2 = &rds_ib_sysctl_max_wr_max, }, @@ -85,7 +86,7 @@ static ctl_table rds_ib_sysctl_table[] = { .data = &rds_ib_sysctl_max_unsig_wrs, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_doulongvec_minmax, + .proc_handler = &proc_doulongvec_minmax, .extra1 = &rds_ib_sysctl_max_unsig_wr_min, .extra2 = &rds_ib_sysctl_max_unsig_wr_max, }, @@ -94,14 +95,14 @@ static ctl_table rds_ib_sysctl_table[] = { .data = &rds_ib_sysctl_max_recv_allocation, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_doulongvec_minmax, + .proc_handler = &proc_doulongvec_minmax, }, { .procname = "flow_control", .data = &rds_ib_sysctl_flow_control, .maxlen = sizeof(rds_ib_sysctl_flow_control), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = &proc_dointvec, }, { } }; @@ -109,7 +110,7 @@ static ctl_table rds_ib_sysctl_table[] = { static struct ctl_path rds_ib_sysctl_path[] = { { .procname = "net", }, { .procname = "rds", }, - { .procname = "ib", }, + { .procname = "ib", }, { } }; diff --git a/net/rds/info.c b/net/rds/info.c index 4fdf1b6e84fff..3dc8410173733 100644 --- a/net/rds/info.c +++ b/net/rds/info.c @@ -32,7 +32,6 @@ */ #include #include -#include #include #include "rds.h" diff --git a/net/rds/iw.c b/net/rds/iw.c index f7474844f096e..13f4c4bc9df7e 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c @@ -37,7 +37,6 @@ #include #include #include -#include #include "rds.h" #include "iw.h" @@ -56,7 +55,7 @@ struct list_head rds_iw_devices; DEFINE_SPINLOCK(iw_nodev_conns_lock); LIST_HEAD(iw_nodev_conns); -static void rds_iw_add_one(struct ib_device *device) +void rds_iw_add_one(struct ib_device *device) { struct rds_iw_device *rds_iwdev; struct ib_device_attr *dev_attr; @@ -124,7 +123,7 @@ free_attr: kfree(dev_attr); } -static void rds_iw_remove_one(struct ib_device *device) +void rds_iw_remove_one(struct ib_device *device) { struct rds_iw_device *rds_iwdev; struct rds_iw_cm_id *i_cm_id, *next; @@ -226,9 +225,9 @@ static int rds_iw_laddr_check(__be32 addr) /* Create a CMA ID and try to bind it. This catches both * IB and iWARP capable NICs. */ - cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC); + cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); if (IS_ERR(cm_id)) - return PTR_ERR(cm_id); + return -EADDRNOTAVAIL; memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; @@ -238,7 +237,7 @@ static int rds_iw_laddr_check(__be32 addr) ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); /* due to this, we will claim to support IB devices unless we check node_type. 
*/ - if (ret || cm_id->device->node_type != RDMA_NODE_RNIC) + if (ret || !cm_id->device || cm_id->device->node_type != RDMA_NODE_RNIC) ret = -EADDRNOTAVAIL; rdsdebug("addr %pI4 ret %d node type %d\n", diff --git a/net/rds/iw.h b/net/rds/iw.h index 90151922178ca..d3a4adafcde9a 100644 --- a/net/rds/iw.h +++ b/net/rds/iw.h @@ -268,6 +268,8 @@ static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic) /* ib.c */ extern struct rds_transport rds_iw_transport; +extern void rds_iw_add_one(struct ib_device *device); +extern void rds_iw_remove_one(struct ib_device *device); extern struct ib_client rds_iw_client; extern unsigned int fastreg_pool_size; @@ -316,6 +318,7 @@ void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, void rds_iw_sync_mr(void *trans_private, int dir); void rds_iw_free_mr(void *trans_private, int invalidate); void rds_iw_flush_mrs(void); +void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); /* ib_recv.c */ int rds_iw_recv_init(void); @@ -358,7 +361,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits); void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted); int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted, - u32 *adv_credits, int need_posted, int max_posted); + u32 *adv_credits, int need_posted); /* ib_stats.c */ DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats); @@ -375,6 +378,7 @@ extern unsigned long rds_iw_sysctl_max_unsig_wrs; extern unsigned long rds_iw_sysctl_max_unsig_bytes; extern unsigned long rds_iw_sysctl_max_recv_allocation; extern unsigned int rds_iw_sysctl_flow_control; +extern ctl_table rds_iw_sysctl_table[]; /* * Helper functions for getting/setting the header and data SGEs in diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c index c12db66f24c73..f80dac1ff1b16 100644 --- a/net/rds/iw_cm.c +++ b/net/rds/iw_cm.c @@ -32,7 +32,6 @@ */ #include #include -#include #include #include "rds.h" @@ -181,7 +180,7 @@ static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr, unsigned int send_size, recv_size; int ret; - /* The offset of 1 is to accommodate the additional ACK WR. */ + /* The offset of 1 is to accomodate the additional ACK WR. 
*/ send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1); recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1); rds_iw_ring_resize(send_ring, send_size - 1); @@ -397,7 +396,7 @@ int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version)); conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport, - GFP_KERNEL); + 0, GFP_KERNEL); if (IS_ERR(conn)) { rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); conn = NULL; @@ -452,7 +451,6 @@ int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, err = rds_iw_setup_qp(conn); if (err) { rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err); - mutex_unlock(&conn->c_cm_lock); goto out; } @@ -522,7 +520,7 @@ int rds_iw_conn_connect(struct rds_connection *conn) /* XXX I wonder what affect the port space has */ /* delegate cm event handler to rdma_transport */ ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn, - RDMA_PS_TCP, IB_QPT_RC); + RDMA_PS_TCP); if (IS_ERR(ic->i_cm_id)) { ret = PTR_ERR(ic->i_cm_id); ic->i_cm_id = NULL; diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c index 6deaa77495e3f..8b69867611b59 100644 --- a/net/rds/iw_rdma.c +++ b/net/rds/iw_rdma.c @@ -31,7 +31,6 @@ * */ #include -#include #include "rds.h" #include "iw.h" @@ -122,7 +121,7 @@ static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwd #else /* FIXME - needs to compare the local and remote * ipaddr/port tuple, but the ipaddr is the only - * available information in the rds_sock (as the rest are + * available infomation in the rds_sock (as the rest are * zero'ed. It doesn't appear to be properly populated * during connection setup... */ @@ -157,8 +156,7 @@ static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id * return 0; } -static void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, - struct rdma_cm_id *cm_id) +void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id) { struct rds_iw_cm_id *i_cm_id; @@ -207,9 +205,9 @@ void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *con BUG_ON(list_empty(&ic->iw_node)); list_del(&ic->iw_node); - spin_lock(&rds_iwdev->spinlock); + spin_lock_irq(&rds_iwdev->spinlock); list_add_tail(&ic->iw_node, &rds_iwdev->conn_list); - spin_unlock(&rds_iwdev->spinlock); + spin_unlock_irq(&rds_iwdev->spinlock); spin_unlock_irq(&iw_nodev_conns_lock); ic->rds_iwdev = rds_iwdev; @@ -574,8 +572,8 @@ void rds_iw_free_mr(void *trans_private, int invalidate) rds_iw_free_fastreg(pool, ibmr); /* If we've pinned too many pages, request a flush */ - if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || - atomic_read(&pool->dirty_count) >= pool->max_items / 10) + if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned + || atomic_read(&pool->dirty_count) >= pool->max_items / 10) queue_work(rds_wq, &pool->flush_worker); if (invalidate) { diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c index 5e57347f49ff0..bedba05864c9a 100644 --- a/net/rds/iw_recv.c +++ b/net/rds/iw_recv.c @@ -31,7 +31,6 @@ * */ #include -#include #include #include #include @@ -231,8 +230,8 @@ int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, int ret = 0; u32 pos; - while ((prefill || rds_conn_up(conn)) && - rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) { + while ((prefill || rds_conn_up(conn)) + && rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) { if (pos >= ic->i_recv_ring.w_nr) { printk(KERN_NOTICE "Argh - ring alloc 
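The iw_rdma hunk only rewraps the flush trigger in rds_iw_free_mr, but the policy it keeps is worth spelling out: the pool flush worker is queued once the pinned-but-free page count reaches its cap, or the dirty MR count passes a tenth of the pool. A compact sketch of that decision, not part of the patch; the field names and numbers are illustrative:

#include <stdbool.h>
#include <stdio.h>

struct mr_pool {
        unsigned long free_pinned;      /* pages pinned by MRs already freed */
        unsigned long max_free_pinned;
        unsigned long dirty_count;      /* MRs waiting to be invalidated */
        unsigned long max_items;
};

static bool mr_pool_needs_flush(const struct mr_pool *pool)
{
        return pool->free_pinned >= pool->max_free_pinned ||
               pool->dirty_count >= pool->max_items / 10;
}

int main(void)
{
        struct mr_pool pool = {
                .free_pinned = 10, .max_free_pinned = 8192,
                .dirty_count = 70, .max_items = 512,
        };

        /* 70 >= 512/10, so the flush worker would be queued here */
        printf("flush needed: %d\n", mr_pool_needs_flush(&pool));
        return 0;
}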
returned pos=%u\n", pos); @@ -526,7 +525,7 @@ void rds_iw_attempt_ack(struct rds_iw_connection *ic) } /* Can we get a send credit? */ - if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) { + if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0)) { rds_iw_stats_inc(s_iw_tx_throttle); clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); return; @@ -731,10 +730,10 @@ static void rds_iw_process_recv(struct rds_connection *conn, hdr = &iwinc->ii_inc.i_hdr; /* We can't just use memcmp here; fragments of a * single message may carry different ACKs */ - if (hdr->h_sequence != ihdr->h_sequence || - hdr->h_len != ihdr->h_len || - hdr->h_sport != ihdr->h_sport || - hdr->h_dport != ihdr->h_dport) { + if (hdr->h_sequence != ihdr->h_sequence + || hdr->h_len != ihdr->h_len + || hdr->h_sport != ihdr->h_sport + || hdr->h_dport != ihdr->h_dport) { rds_iw_conn_error(conn, "fragment header mismatch; forcing reconnect\n"); return; diff --git a/net/rds/iw_ring.c b/net/rds/iw_ring.c index da8e3b63f6636..d422d4b5deef2 100644 --- a/net/rds/iw_ring.c +++ b/net/rds/iw_ring.c @@ -137,7 +137,7 @@ int rds_iw_ring_empty(struct rds_iw_work_ring *ring) int rds_iw_ring_low(struct rds_iw_work_ring *ring) { - return __rds_iw_ring_used(ring) <= (ring->w_nr >> 1); + return __rds_iw_ring_used(ring) <= (ring->w_nr >> 2); } diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c index 545d8ee3efb19..0055c26056d37 100644 --- a/net/rds/iw_send.c +++ b/net/rds/iw_send.c @@ -48,7 +48,7 @@ static void rds_iw_send_rdma_complete(struct rds_message *rm, return; case IB_WC_SUCCESS: - notify_status = RDS_RDMA_SUCCESS; + notify_status = RDS_RDMA_SEND_SUCCESS; break; case IB_WC_REM_ACCESS_ERR: @@ -56,7 +56,7 @@ static void rds_iw_send_rdma_complete(struct rds_message *rm, break; default: - notify_status = RDS_RDMA_OTHER_ERROR; + notify_status = RDS_RDMA_SEND_OTHER_ERROR; break; } rds_rdma_send_complete(rm, notify_status); @@ -287,8 +287,8 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) rds_iw_ring_free(&ic->i_send_ring, completed); - if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || - test_bit(0, &conn->c_map_queued)) + if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) + || test_bit(0, &conn->c_map_queued)) queue_delayed_work(rds_wq, &conn->c_send_w, 0); /* We expect errors as the qp is drained during shutdown */ @@ -307,7 +307,7 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) * * Conceptually, we have two counters: * - send credits: this tells us how many WRs we're allowed - * to submit without overruning the receiver's queue. For + * to submit without overruning the reciever's queue. For * each SEND WR we post, we decrement this by one. * * - posted credits: this tells us how many WRs we recently @@ -346,7 +346,7 @@ void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context) * and using atomic_cmpxchg when updating the two counters. */ int rds_iw_send_grab_credits(struct rds_iw_connection *ic, - u32 wanted, u32 *adv_credits, int need_posted, int max_posted) + u32 wanted, u32 *adv_credits, int need_posted) { unsigned int avail, posted, got = 0, advertise; long oldval, newval; @@ -386,7 +386,7 @@ try_again: * available. 
*/ if (posted && (got || need_posted)) { - advertise = min_t(unsigned int, posted, max_posted); + advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT); newval -= IB_SET_POST_CREDITS(advertise); } @@ -518,7 +518,8 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); /* Fastreg support */ - if (rds_rdma_cookie_key(rm->m_rdma_cookie) && !ic->i_fastreg_posted) { + if (rds_rdma_cookie_key(rm->m_rdma_cookie) + && !ic->i_fastreg_posted) { ret = -EAGAIN; goto out; } @@ -539,7 +540,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, credit_alloc = work_alloc; if (ic->i_flowctl) { - credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); + credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0); adv_credits += posted; if (credit_alloc < work_alloc) { rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); @@ -547,7 +548,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, flow_controlled++; } if (work_alloc == 0) { - set_bit(RDS_LL_SEND_FULL, &conn->c_flags); + rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); rds_iw_stats_inc(s_iw_tx_throttle); ret = -ENOMEM; goto out; @@ -614,7 +615,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, /* * Update adv_credits since we reset the ACK_REQUIRED bit. */ - rds_iw_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); + rds_iw_send_grab_credits(ic, 0, &posted, 1); adv_credits += posted; BUG_ON(adv_credits > 255); } diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c index 5fe67f6a1d806..ccc7e8f0bf0e0 100644 --- a/net/rds/iw_stats.c +++ b/net/rds/iw_stats.c @@ -37,9 +37,9 @@ #include "rds.h" #include "iw.h" -DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats); +DEFINE_PER_CPU(struct rds_iw_statistics, rds_iw_stats) ____cacheline_aligned; -static const char *const rds_iw_stat_names[] = { +static char *rds_iw_stat_names[] = { "iw_connect_raced", "iw_listen_closed_stale", "iw_tx_cq_call", diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c index e2e47176e729f..c5ed1c37a7bb1 100644 --- a/net/rds/iw_sysctl.c +++ b/net/rds/iw_sysctl.c @@ -55,13 +55,13 @@ static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL; unsigned int rds_iw_sysctl_flow_control = 1; -static ctl_table rds_iw_sysctl_table[] = { +ctl_table rds_iw_sysctl_table[] = { { .procname = "max_send_wr", .data = &rds_iw_sysctl_max_send_wr, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_doulongvec_minmax, + .proc_handler = &proc_doulongvec_minmax, .extra1 = &rds_iw_sysctl_max_wr_min, .extra2 = &rds_iw_sysctl_max_wr_max, }, @@ -70,7 +70,7 @@ static ctl_table rds_iw_sysctl_table[] = { .data = &rds_iw_sysctl_max_recv_wr, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_doulongvec_minmax, + .proc_handler = &proc_doulongvec_minmax, .extra1 = &rds_iw_sysctl_max_wr_min, .extra2 = &rds_iw_sysctl_max_wr_max, }, @@ -79,7 +79,7 @@ static ctl_table rds_iw_sysctl_table[] = { .data = &rds_iw_sysctl_max_unsig_wrs, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_doulongvec_minmax, + .proc_handler = &proc_doulongvec_minmax, .extra1 = &rds_iw_sysctl_max_unsig_wr_min, .extra2 = &rds_iw_sysctl_max_unsig_wr_max, }, @@ -88,7 +88,7 @@ static ctl_table rds_iw_sysctl_table[] = { .data = &rds_iw_sysctl_max_unsig_bytes, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_doulongvec_minmax, + 
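The grab_credits change above drops the max_posted argument and always caps the advertisement at RDS_MAX_ADV_CREDIT; the underlying scheme, described in the iw_send.c comment, keeps send credits and posted credits packed into one atomic word so both can be claimed in a single compare-and-swap. A simplified userspace sketch of that packing, not part of the patch; the 16/16 split and starting values are assumptions, not the kernel's IB_SET_*_CREDITS layout:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define SET_SEND(v)     ((uint32_t)(v))
#define SET_POST(v)     ((uint32_t)(v) << 16)
#define GET_SEND(v)     ((v) & 0xffff)
#define GET_POST(v)     ((v) >> 16)

static _Atomic uint32_t credits = SET_SEND(16) | SET_POST(4);

/* take up to 'wanted' send credits and drain the posted count for
 * advertising back to the peer, in one compare-and-swap loop */
static uint32_t grab_credits(uint32_t wanted, uint32_t *advertise)
{
        uint32_t oldval, newval, got, posted;

        oldval = atomic_load(&credits);
        do {
                got = GET_SEND(oldval) < wanted ? GET_SEND(oldval) : wanted;
                posted = GET_POST(oldval);
                newval = oldval - SET_SEND(got) - SET_POST(posted);
        } while (!atomic_compare_exchange_weak(&credits, &oldval, newval));

        *advertise = posted;
        return got;
}

int main(void)
{
        uint32_t adv, got = grab_credits(8, &adv);

        printf("got %u send credits, advertise %u posted credits\n", got, adv);
        return 0;
}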
.proc_handler = &proc_doulongvec_minmax, .extra1 = &rds_iw_sysctl_max_unsig_bytes_min, .extra2 = &rds_iw_sysctl_max_unsig_bytes_max, }, @@ -97,14 +97,14 @@ static ctl_table rds_iw_sysctl_table[] = { .data = &rds_iw_sysctl_max_recv_allocation, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = proc_doulongvec_minmax, + .proc_handler = &proc_doulongvec_minmax, }, { .procname = "flow_control", .data = &rds_iw_sysctl_flow_control, .maxlen = sizeof(rds_iw_sysctl_flow_control), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = &proc_dointvec, }, { } }; @@ -112,7 +112,7 @@ static ctl_table rds_iw_sysctl_table[] = { static struct ctl_path rds_iw_sysctl_path[] = { { .procname = "net", }, { .procname = "rds", }, - { .procname = "iw", }, + { .procname = "iw", }, { } }; diff --git a/net/rds/loop.c b/net/rds/loop.c index bca6761a3ca2f..640f669e745d6 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -31,7 +31,6 @@ * */ #include -#include #include #include "rds.h" @@ -61,15 +60,10 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off) { - struct scatterlist *sgp = &rm->data.op_sg[sg]; - int ret = sizeof(struct rds_header) + - be32_to_cpu(rm->m_inc.i_hdr.h_len); - /* Do not send cong updates to loopback */ if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) { rds_cong_map_updated(conn->c_fcong, ~(u64) 0); - ret = min_t(int, ret, sgp->length - conn->c_xmit_data_off); - goto out; + return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; } BUG_ON(hdr_off || sg || off); @@ -85,8 +79,8 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, NULL); rds_inc_put(&rm->m_inc); -out: - return ret; + + return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len); } /* @@ -139,12 +133,8 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp) static void rds_loop_conn_free(void *arg) { struct rds_loop_connection *lc = arg; - unsigned long flags; - rdsdebug("lc %p\n", lc); - spin_lock_irqsave(&loop_conns_lock, flags); list_del(&lc->loop_node); - spin_unlock_irqrestore(&loop_conns_lock, flags); kfree(lc); } diff --git a/net/rds/message.c b/net/rds/message.c index 1fd3d29023d76..4ed85e11cca89 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -31,7 +31,6 @@ * */ #include -#include #include "rds.h" @@ -82,8 +81,8 @@ void rds_message_put(struct rds_message *rm) { rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); if (atomic_read(&rm->m_refcount) == 0) { -printk(KERN_CRIT "danger refcount zero on %p\n", rm); -WARN_ON(1); + printk(KERN_CRIT "danger refcount zero on %p\n", rm); + WARN_ON(1); } if (atomic_dec_and_test(&rm->m_refcount)) { BUG_ON(!list_empty(&rm->m_sock_item)); @@ -106,8 +105,8 @@ void rds_message_populate_header(struct rds_header *hdr, __be16 sport, } EXPORT_SYMBOL_GPL(rds_message_populate_header); -int rds_message_add_extension(struct rds_header *hdr, unsigned int type, - const void *data, unsigned int len) +int rds_message_add_extension(struct rds_header *hdr, + unsigned int type, const void *data, unsigned int len) { unsigned int ext_len = sizeof(u8) + len; unsigned char *dst; @@ -116,7 +115,8 @@ int rds_message_add_extension(struct rds_header *hdr, unsigned int type, if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE) return 0; - if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type]) + if (type >= __RDS_EXTHDR_MAX + || len != rds_exthdr_size[type]) return 0; if (ext_len >= RDS_HEADER_EXT_SPACE) @@ -177,6 +177,26 @@ none: return 
RDS_EXTHDR_NONE; } +int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version) +{ + struct rds_ext_header_version ext_hdr; + + ext_hdr.h_version = cpu_to_be32(version); + return rds_message_add_extension(hdr, RDS_EXTHDR_VERSION, &ext_hdr, sizeof(ext_hdr)); +} + +int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version) +{ + struct rds_ext_header_version ext_hdr; + unsigned int pos = 0, len = sizeof(ext_hdr); + + /* We assume the version extension is the only one present */ + if (rds_message_next_extension(hdr, &pos, &ext_hdr, &len) != RDS_EXTHDR_VERSION) + return 0; + *version = be32_to_cpu(ext_hdr.h_version); + return 1; +} + int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset) { struct rds_ext_header_rdma_dest ext_hdr; @@ -224,9 +244,6 @@ struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents) WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs); WARN_ON(!nents); - if (rm->m_used_sgs + nents > rm->m_total_sgs) - return NULL; - sg_ret = &sg_first[rm->m_used_sgs]; sg_init_table(sg_ret, nents); rm->m_used_sgs += nents; @@ -249,10 +266,6 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); rm->data.op_nents = ceil(total_len, PAGE_SIZE); rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); - if (!rm->data.op_sg) { - rds_message_put(rm); - return ERR_PTR(-ENOMEM); - } for (i = 0; i < rm->data.op_nents; ++i) { sg_set_page(&rm->data.op_sg[i], diff --git a/net/rds/page.c b/net/rds/page.c index d8acdebe3c7cd..fe5adbb5aa417 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -31,7 +31,6 @@ * */ #include -#include #include "rds.h" @@ -40,8 +39,7 @@ struct rds_page_remainder { unsigned long r_offset; }; -static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, - rds_page_remainders); +DEFINE_PER_CPU(struct rds_page_remainder, rds_page_remainders) ____cacheline_aligned; /* * returns 0 on success or -errno on failure. @@ -58,17 +56,38 @@ int rds_page_copy_user(struct page *page, unsigned long offset, unsigned long ret; void *addr; - addr = kmap(page); - if (to_user) { + if (to_user) + ret = access_ok(VERIFY_WRITE, ptr, bytes); + else + ret = access_ok(VERIFY_READ, ptr, bytes); + + if (!ret) + return -EFAULT; + + if (to_user) rds_stats_add(s_copy_to_user, bytes); - ret = copy_to_user(ptr, addr + offset, bytes); - } else { + else rds_stats_add(s_copy_from_user, bytes); - ret = copy_from_user(addr + offset, ptr, bytes); + + addr = kmap_atomic(page, KM_USER0); + if (to_user) + ret = __copy_to_user_inatomic(ptr, addr + offset, bytes); + else + ret = __copy_from_user_inatomic(addr + offset, ptr, bytes); + kunmap_atomic(addr, KM_USER0); + + if (ret) { + addr = kmap(page); + if (to_user) + ret = copy_to_user(ptr, addr + offset, bytes); + else + ret = copy_from_user(addr + offset, ptr, bytes); + kunmap(page); + if (ret) + return -EFAULT; } - kunmap(page); - return ret ? 
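The new rds_message_add_version_extension()/rds_message_get_version_extension() pair rides on the existing header-extension area: a packed run of one-byte type codes, each followed by a fixed-size payload for that type, terminated by RDS_EXTHDR_NONE. A small standalone sketch of that walk, not part of the patch; the type codes, sizes and 16-byte area are invented, and the real code also byte-swaps the version with cpu_to_be32:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define EXTHDR_NONE    0
#define EXTHDR_VERSION 1
#define EXT_SPACE      16

/* fixed payload size per extension type, indexed by type code */
static const unsigned int ext_size[] = { 0, sizeof(uint32_t) };

/* append one [type][payload] record if there is room */
static int ext_add(uint8_t *area, unsigned int type, const void *data)
{
        unsigned int pos = 0;

        while (area[pos] != EXTHDR_NONE)
                pos += 1 + ext_size[area[pos]];
        if (pos + 1 + ext_size[type] >= EXT_SPACE)
                return 0;
        area[pos] = type;
        memcpy(area + pos + 1, data, ext_size[type]);
        return 1;
}

/* find the first extension of the given type */
static int ext_get(const uint8_t *area, unsigned int type, void *out)
{
        unsigned int pos = 0;

        while (area[pos] != EXTHDR_NONE && area[pos] != type)
                pos += 1 + ext_size[area[pos]];
        if (area[pos] != type)
                return 0;
        memcpy(out, area + pos + 1, ext_size[type]);
        return 1;
}

int main(void)
{
        uint8_t area[EXT_SPACE] = { 0 };
        uint32_t version = 0x0302, found = 0;

        ext_add(area, EXTHDR_VERSION, &version);
        if (ext_get(area, EXTHDR_VERSION, &found))
                printf("peer version %x\n", (unsigned)found);
        return 0;
}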
-EFAULT : 0; + return 0; } EXPORT_SYMBOL_GPL(rds_page_copy_user); diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 4e37c1cbe8b2f..6dd7bb7e9ea92 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -31,7 +31,6 @@ * */ #include -#include #include #include /* for DMA_*_DEVICE */ @@ -161,7 +160,7 @@ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, ret = get_user_pages_fast(user_addr, nr_pages, write, pages); - if (ret >= 0 && ret < nr_pages) { + if (ret >= 0 && (unsigned) ret < nr_pages) { while (ret--) put_page(pages[ret]); ret = -EFAULT; @@ -415,7 +414,6 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); if (!mr) { - printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); return; } @@ -435,9 +433,10 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) /* If the MR was marked as invalidate, this will * trigger an async flush. */ - if (zot_me) + if (zot_me) { rds_destroy_mr(mr); - rds_mr_put(mr); + rds_mr_put(mr); + } } void rds_rdma_free_op(struct rm_rdma_op *ro) @@ -451,7 +450,7 @@ void rds_rdma_free_op(struct rm_rdma_op *ro) * is the case for a RDMA_READ which copies from remote * to local memory */ if (!ro->op_write) { - BUG_ON(irqs_disabled()); + WARN_ON_ONCE(page_mapping(page) && irqs_disabled()); set_page_dirty(page); } put_page(page); @@ -479,38 +478,13 @@ void rds_atomic_free_op(struct rm_atomic_op *ao) /* - * Count the number of pages needed to describe an incoming iovec array. + * Count the number of pages needed to describe an incoming iovec. */ -static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs) -{ - int tot_pages = 0; - unsigned int nr_pages; - unsigned int i; - - /* figure out the number of pages in the vector */ - for (i = 0; i < nr_iovecs; i++) { - nr_pages = rds_pages_in_vec(&iov[i]); - if (nr_pages == 0) - return -EINVAL; - - tot_pages += nr_pages; - - /* - * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, - * so tot_pages cannot overflow without first going negative. - */ - if (tot_pages < 0) - return -EINVAL; - } - - return tot_pages; -} - -int rds_rdma_extra_size(struct rds_rdma_args *args) +static int rds_rdma_pages(struct rds_rdma_args *args) { struct rds_iovec vec; struct rds_iovec __user *local_vec; - int tot_pages = 0; + unsigned int tot_pages = 0; unsigned int nr_pages; unsigned int i; @@ -527,16 +501,14 @@ int rds_rdma_extra_size(struct rds_rdma_args *args) return -EINVAL; tot_pages += nr_pages; - - /* - * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, - * so tot_pages cannot overflow without first going negative. 
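rds_rdma_pages(), reshaped above to take the rds_rdma_args directly, still just sums rds_pages_in_vec() over the user's iovec list and rejects zero-length entries; the page count for one vector is the span of page frames it touches. A runnable sketch of that arithmetic, not part of the patch; the PAGE_SHIFT of 12 and the simplified overflow handling are assumptions:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

struct demo_iovec {
        uint64_t addr;
        uint64_t bytes;
};

/* number of pages touched by one [addr, addr+bytes) range */
static unsigned long pages_in_vec(const struct demo_iovec *vec)
{
        if (!vec->bytes)
                return 0;
        return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) -
               (vec->addr >> PAGE_SHIFT);
}

static long total_pages(const struct demo_iovec *iov, unsigned int nr)
{
        unsigned long tot = 0;

        for (unsigned int i = 0; i < nr; i++) {
                unsigned long nr_pages = pages_in_vec(&iov[i]);

                if (!nr_pages)
                        return -1;      /* zero-length vec is rejected */
                tot += nr_pages;
        }
        return tot;
}

int main(void)
{
        struct demo_iovec iov[] = {
                { .addr = 0x1ff8, .bytes = 16 },   /* straddles a page boundary */
                { .addr = 0x4000, .bytes = 4096 },
        };

        printf("pages needed: %ld\n", total_pages(iov, 2));
        return 0;
}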
- */ - if (tot_pages < 0) - return -EINVAL; } - return tot_pages * sizeof(struct scatterlist); + return tot_pages; +} + +int rds_rdma_extra_size(struct rds_rdma_args *args) +{ + return rds_rdma_pages(args) * sizeof(struct scatterlist); } /* @@ -547,12 +519,13 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { struct rds_rdma_args *args; + struct rds_iovec vec; struct rm_rdma_op *op = &rm->rdma; - int nr_pages; + unsigned int nr_pages; unsigned int nr_bytes; struct page **pages = NULL; - struct rds_iovec iovstack[UIO_FASTIOV], *iovs = iovstack; - int iov_size; + struct rds_iovec __user *local_vec; + unsigned int nr; unsigned int i, j; int ret = 0; @@ -567,31 +540,14 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, goto out; } - if (args->nr_local > UIO_MAXIOV) { + if (args->nr_local > (u64)UINT_MAX) { ret = -EMSGSIZE; goto out; } - /* Check whether to allocate the iovec area */ - iov_size = args->nr_local * sizeof(struct rds_iovec); - if (args->nr_local > UIO_FASTIOV) { - iovs = sock_kmalloc(rds_rs_to_sk(rs), iov_size, GFP_KERNEL); - if (!iovs) { - ret = -ENOMEM; - goto out; - } - } - - if (copy_from_user(iovs, (struct rds_iovec __user *)(unsigned long) args->local_vec_addr, iov_size)) { - ret = -EFAULT; + nr_pages = rds_rdma_pages(args); + if (nr_pages < 0) goto out; - } - - nr_pages = rds_rdma_pages(iovs, args->nr_local); - if (nr_pages < 0) { - ret = -EINVAL; - goto out; - } pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) { @@ -603,28 +559,25 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, op->op_fence = !!(args->flags & RDS_RDMA_FENCE); op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); op->op_silent = !!(args->flags & RDS_RDMA_SILENT); + op->op_remote_complete = !!(args->flags & RDS_RDMA_REMOTE_COMPLETE); op->op_active = 1; op->op_recverr = rs->rs_recverr; WARN_ON(!nr_pages); op->op_sg = rds_message_alloc_sgs(rm, nr_pages); - if (!op->op_sg) { - ret = -ENOMEM; - goto out; - } - if (op->op_notify || op->op_recverr) { + if (op->op_notify || op->op_recverr || rds_async_send_enabled) { /* We allocate an uninitialized notifier here, because * we don't want to do that in the completion handler. We * would have to use GFP_ATOMIC there, and don't want to deal * with failed allocations. 
*/ - op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); + op->op_notifier = kzalloc(sizeof(struct rds_notifier), GFP_KERNEL); if (!op->op_notifier) { ret = -ENOMEM; goto out; } op->op_notifier->n_user_token = args->user_token; - op->op_notifier->n_status = RDS_RDMA_SUCCESS; + op->op_notifier->n_status = RDS_RDMA_SEND_SUCCESS; } /* The cookie contains the R_Key of the remote memory region, and @@ -644,40 +597,50 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, (unsigned long long)args->remote_vec.addr, op->op_rkey); + local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; + for (i = 0; i < args->nr_local; i++) { - struct rds_iovec *iov = &iovs[i]; - /* don't need to check, rds_rdma_pages() verified nr will be +nonzero */ - unsigned int nr = rds_pages_in_vec(iov); + if (copy_from_user(&vec, &local_vec[i], + sizeof(struct rds_iovec))) { + ret = -EFAULT; + goto out; + } + + nr = rds_pages_in_vec(&vec); + if (nr == 0) { + ret = -EINVAL; + goto out; + } - rs->rs_user_addr = iov->addr; - rs->rs_user_bytes = iov->bytes; + rs->rs_user_addr = vec.addr; + rs->rs_user_bytes = vec.bytes; /* If it's a WRITE operation, we want to pin the pages for reading. * If it's a READ operation, we need to pin the pages for writing. */ - ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); + ret = rds_pin_pages(vec.addr, nr, pages, !op->op_write); if (ret < 0) goto out; - rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", - nr_bytes, nr, iov->bytes, iov->addr); + rdsdebug("RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx\n", + nr_bytes, nr, vec.bytes, vec.addr); - nr_bytes += iov->bytes; + nr_bytes += vec.bytes; for (j = 0; j < nr; j++) { - unsigned int offset = iov->addr & ~PAGE_MASK; + unsigned int offset = vec.addr & ~PAGE_MASK; struct scatterlist *sg; sg = &op->op_sg[op->op_nents + j]; sg_set_page(sg, pages[j], - min_t(unsigned int, iov->bytes, PAGE_SIZE - offset), + min_t(unsigned int, vec.bytes, PAGE_SIZE - offset), offset); - rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n", - sg->offset, sg->length, iov->addr, iov->bytes); + rdsdebug("RDS: sg->offset %x sg->len %x vec.addr %llx vec.bytes %llu\n", + sg->offset, sg->length, vec.addr, vec.bytes); - iov->addr += sg->length; - iov->bytes -= sg->length; + vec.addr += sg->length; + vec.bytes -= sg->length; } op->op_nents += nr; @@ -692,14 +655,13 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, } op->op_bytes = nr_bytes; + ret = 0; out: - if (iovs != iovstack) - sock_kfree_s(rds_rs_to_sk(rs), iovs, iov_size); kfree(pages); if (ret) rds_rdma_free_op(op); - else - rds_stats_inc(s_send_rdma); + + rds_stats_inc(s_send_rdma); return ret; } @@ -716,8 +678,8 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, u32 r_key; int err = 0; - if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) || - rm->m_rdma_cookie != 0) + if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) + || rm->m_rdma_cookie != 0) return -EINVAL; memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie)); @@ -731,9 +693,10 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); - if (!mr) + if (!mr) { + printk(KERN_ERR "rds_cmsg_rdma_dest: key %x\n", r_key); err = -EINVAL; /* invalid r_key */ - else + } else atomic_inc(&mr->r_refcount); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); @@ -753,11 +716,15 @@ int 
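The per-iovec loop above pins the user pages and then carves the byte range into scatterlist entries, one per page, where the first chunk starts at the in-page offset and every chunk is capped at what remains of that page. The same carving in a standalone form, not part of the patch; the addresses below are arbitrary examples:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096ULL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/* carve one user buffer into the per-page chunks that would each
 * become a scatterlist entry */
static void split_into_sg(uint64_t addr, uint64_t bytes)
{
        while (bytes) {
                uint64_t offset = addr & ~PAGE_MASK;
                uint64_t len = PAGE_SIZE - offset;

                if (len > bytes)
                        len = bytes;
                printf("page 0x%llx offset %llu len %llu\n",
                       (unsigned long long)(addr & PAGE_MASK),
                       (unsigned long long)offset,
                       (unsigned long long)len);
                addr += len;
                bytes -= len;
        }
}

int main(void)
{
        split_into_sg(0x10ff0, 6000);   /* spans three pages */
        return 0;
}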
rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { - if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) || - rm->m_rdma_cookie != 0) + int ret; + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) + || rm->m_rdma_cookie != 0) return -EINVAL; - return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr); + ret = __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr); + if (!ret) + rm->rdma.op_implicit_mr = 1; + return ret; } /* @@ -776,45 +743,24 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, args = CMSG_DATA(cmsg); - /* Nonmasked & masked cmsg ops converted to masked hw ops */ - switch (cmsg->cmsg_type) { - case RDS_CMSG_ATOMIC_FADD: - rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; - rm->atomic.op_m_fadd.add = args->fadd.add; - rm->atomic.op_m_fadd.nocarry_mask = 0; - break; - case RDS_CMSG_MASKED_ATOMIC_FADD: - rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; - rm->atomic.op_m_fadd.add = args->m_fadd.add; - rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask; - break; - case RDS_CMSG_ATOMIC_CSWP: - rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; - rm->atomic.op_m_cswp.compare = args->cswp.compare; - rm->atomic.op_m_cswp.swap = args->cswp.swap; - rm->atomic.op_m_cswp.compare_mask = ~0; - rm->atomic.op_m_cswp.swap_mask = ~0; - break; - case RDS_CMSG_MASKED_ATOMIC_CSWP: + if (cmsg->cmsg_type == RDS_CMSG_ATOMIC_CSWP) rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; - rm->atomic.op_m_cswp.compare = args->m_cswp.compare; - rm->atomic.op_m_cswp.swap = args->m_cswp.swap; - rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask; - rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask; - break; - default: - BUG(); /* should never happen */ + else { + rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; + /* compare field should be 0 -- unused for FADD ops */ + if (args->compare) { + ret = -EINVAL; + goto err; + } } + rm->atomic.op_swap_add = args->swap_add; + rm->atomic.op_compare = args->compare; rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT); rm->atomic.op_active = 1; rm->atomic.op_recverr = rs->rs_recverr; rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1); - if (!rm->atomic.op_sg) { - ret = -ENOMEM; - goto err; - } /* verify 8 byte-aligned */ if (args->local_addr & 0x7) { @@ -829,20 +775,20 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr)); - if (rm->atomic.op_notify || rm->atomic.op_recverr) { + if (rm->atomic.op_notify || rm->atomic.op_recverr || rds_async_send_enabled) { /* We allocate an uninitialized notifier here, because * we don't want to do that in the completion handler. We * would have to use GFP_ATOMIC there, and don't want to deal * with failed allocations. 
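rds_cmsg_atomic() above is reduced to the two unmasked operation types, RDS_ATOMIC_TYPE_CSWP and RDS_ATOMIC_TYPE_FADD, carried in op_swap_add/op_compare. Their local semantics, shown with C11 atomics purely for illustration and not as part of the patch; the real operation is executed by the HCA against remote memory:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* CSWP: swap_add is the new value, compare must match the old one.
 * FADD: swap_add is the addend, compare is unused (and must be 0). */
int main(void)
{
        _Atomic uint64_t remote = 42;
        uint64_t compare = 42, swap_add = 100;

        /* RDS_ATOMIC_TYPE_CSWP */
        if (atomic_compare_exchange_strong(&remote, &compare, swap_add))
                printf("cswp: old %llu -> %llu\n",
                       (unsigned long long)compare,
                       (unsigned long long)atomic_load(&remote));

        /* RDS_ATOMIC_TYPE_FADD */
        uint64_t old = atomic_fetch_add(&remote, 8);
        printf("fadd: old %llu new %llu\n",
               (unsigned long long)old,
               (unsigned long long)atomic_load(&remote));
        return 0;
}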
*/ - rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL); + rm->atomic.op_notifier = kzalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL); if (!rm->atomic.op_notifier) { ret = -ENOMEM; goto err; } rm->atomic.op_notifier->n_user_token = args->user_token; - rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS; + rm->atomic.op_notifier->n_status = RDS_RDMA_SEND_SUCCESS; } rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie); diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index f8760e1b6688b..b070a9cab195f 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -33,36 +33,14 @@ #include #include "rdma_transport.h" +#include "ib.h" +#include "net/arp.h" +#include "tcp.h" -static struct rdma_cm_id *rds_rdma_listen_id; - -static char *rds_cm_event_strings[] = { -#define RDS_CM_EVENT_STRING(foo) \ - [RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo) - RDS_CM_EVENT_STRING(ADDR_RESOLVED), - RDS_CM_EVENT_STRING(ADDR_ERROR), - RDS_CM_EVENT_STRING(ROUTE_RESOLVED), - RDS_CM_EVENT_STRING(ROUTE_ERROR), - RDS_CM_EVENT_STRING(CONNECT_REQUEST), - RDS_CM_EVENT_STRING(CONNECT_RESPONSE), - RDS_CM_EVENT_STRING(CONNECT_ERROR), - RDS_CM_EVENT_STRING(UNREACHABLE), - RDS_CM_EVENT_STRING(REJECTED), - RDS_CM_EVENT_STRING(ESTABLISHED), - RDS_CM_EVENT_STRING(DISCONNECTED), - RDS_CM_EVENT_STRING(DEVICE_REMOVAL), - RDS_CM_EVENT_STRING(MULTICAST_JOIN), - RDS_CM_EVENT_STRING(MULTICAST_ERROR), - RDS_CM_EVENT_STRING(ADDR_CHANGE), - RDS_CM_EVENT_STRING(TIMEWAIT_EXIT), -#undef RDS_CM_EVENT_STRING -}; - -static char *rds_cm_event_str(enum rdma_cm_event_type type) -{ - return rds_str_array(rds_cm_event_strings, - ARRAY_SIZE(rds_cm_event_strings), type); -}; +#include +#include + +static struct rdma_cm_id *rds_iw_listen_id; int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) @@ -70,10 +48,13 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, /* this can be null in the listening path */ struct rds_connection *conn = cm_id->context; struct rds_transport *trans; + struct page *page; + struct arpreq *r; + struct sockaddr_in *sin; int ret = 0; - rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id, - event->event, rds_cm_event_str(event->event)); + rdsdebug("conn %p id %p handling event %u\n", conn, cm_id, + event->event); if (cm_id->device->node_type == RDMA_NODE_RNIC) trans = &rds_iw_transport; @@ -103,9 +84,30 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, break; case RDMA_CM_EVENT_ADDR_RESOLVED: + rdma_set_service_type(cm_id, conn->c_tos); + + if (rds_ib_apm_enabled) + rdma_set_timeout(cm_id, rds_ib_apm_timeout); + /* XXX do we need to clean up if this fails? */ ret = rdma_resolve_route(cm_id, RDS_RDMA_RESOLVE_TIMEOUT_MS); + if (ret) { + /* + * The cm_id will get destroyed by addr_handler + * in RDMA CM when we return from here. 
+ */ + if (conn) { + struct rds_ib_connection *ibic; + + printk(KERN_CRIT "rds dropping connection after rdma_resolve_route failure" + "connection %u.%u.%u.%u->%u.%u.%u.%u\n", NIPQUAD(conn->c_laddr), NIPQUAD(conn->c_faddr)); + ibic = conn->c_transport_data; + if (ibic && ibic->i_cm_id == cm_id) + ibic->i_cm_id = NULL; + rds_conn_drop(conn); + } + } break; case RDMA_CM_EVENT_ROUTE_RESOLVED: @@ -113,21 +115,58 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, ret = trans->cm_initiate_connect(cm_id); break; + case RDMA_CM_EVENT_ALT_PATH_LOADED: + rdsdebug("RDS: alt path loaded\n"); + if (conn) + trans->check_migration(conn, event); + break; + + case RDMA_CM_EVENT_ALT_ROUTE_RESOLVED: + rdsdebug("RDS: alt route resolved\n"); + break; + + case RDMA_CM_EVENT_ALT_ROUTE_ERROR: + rdsdebug("RDS: alt route resolve error\n"); + break; + + case RDMA_CM_EVENT_ROUTE_ERROR: + /* IP might have been moved so flush the ARP entry and retry */ + page = alloc_page(GFP_HIGHUSER); + if (!page) { + printk(KERN_ERR "alloc_page failed .. NO MEM\n"); + ret = -ENOMEM; + } else { + r = (struct arpreq *)kmap(page); + memset(r, 0, sizeof(struct arpreq)); + sin = (struct sockaddr_in *)&r->arp_pa; + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = conn->c_faddr; + inet_ioctl(rds_ib_inet_socket, SIOCDARP, (unsigned long) r); + kunmap(page); + __free_page(page); + } + + rds_conn_drop(conn); + break; + case RDMA_CM_EVENT_ESTABLISHED: trans->cm_connect_complete(conn, event); break; case RDMA_CM_EVENT_ADDR_ERROR: - case RDMA_CM_EVENT_ROUTE_ERROR: case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_UNREACHABLE: case RDMA_CM_EVENT_REJECTED: case RDMA_CM_EVENT_DEVICE_REMOVAL: - case RDMA_CM_EVENT_ADDR_CHANGE: if (conn) rds_conn_drop(conn); break; + case RDMA_CM_EVENT_ADDR_CHANGE: + if (conn && !rds_ib_apm_enabled) + rds_conn_drop(conn); + break; + case RDMA_CM_EVENT_DISCONNECTED: rdsdebug("DISCONNECT event - dropping connection " "%pI4->%pI4\n", &conn->c_laddr, @@ -137,8 +176,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, default: /* things like device disconnect? 
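On RDMA_CM_EVENT_ROUTE_ERROR the handler now flushes the peer's ARP entry with a SIOCDARP ioctl before dropping the connection, on the theory that the IP may have moved. The same ioctl is reachable from userspace; a hedged sketch, not part of the patch, where the interface name and address are placeholders and the call needs CAP_NET_ADMIN:

#include <arpa/inet.h>
#include <net/if_arp.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct arpreq req;
        struct sockaddr_in *sin;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
                return 1;

        memset(&req, 0, sizeof(req));
        sin = (struct sockaddr_in *)&req.arp_pa;
        sin->sin_family = AF_INET;
        inet_pton(AF_INET, "192.168.1.20", &sin->sin_addr);
        strncpy(req.arp_dev, "ib0", sizeof(req.arp_dev) - 1);

        if (ioctl(fd, SIOCDARP, &req) < 0)
                perror("SIOCDARP");
        close(fd);
        return 0;
}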
*/ - printk(KERN_ERR "RDS: unknown event %u (%s)!\n", - event->event, rds_cm_event_str(event->event)); + printk(KERN_ERR "RDS: unknown event %u!\n", event->event); break; } @@ -146,8 +184,7 @@ out: if (conn) mutex_unlock(&conn->c_cm_lock); - rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event, - rds_cm_event_str(event->event), ret); + rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret); return ret; } @@ -158,8 +195,7 @@ static int rds_rdma_listen_init(void) struct rdma_cm_id *cm_id; int ret; - cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP, - IB_QPT_RC); + cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP); if (IS_ERR(cm_id)) { ret = PTR_ERR(cm_id); printk(KERN_ERR "RDS/RDMA: failed to setup listener, " @@ -167,7 +203,7 @@ static int rds_rdma_listen_init(void) return ret; } - sin.sin_family = AF_INET, + sin.sin_family = PF_INET, sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY); sin.sin_port = (__force u16)htons(RDS_PORT); @@ -191,7 +227,7 @@ static int rds_rdma_listen_init(void) rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT); - rds_rdma_listen_id = cm_id; + rds_iw_listen_id = cm_id; cm_id = NULL; out: if (cm_id) @@ -201,14 +237,14 @@ out: static void rds_rdma_listen_stop(void) { - if (rds_rdma_listen_id) { - rdsdebug("cm %p\n", rds_rdma_listen_id); - rdma_destroy_id(rds_rdma_listen_id); - rds_rdma_listen_id = NULL; + if (rds_iw_listen_id) { + rdsdebug("cm %p\n", rds_iw_listen_id); + rdma_destroy_id(rds_iw_listen_id); + rds_iw_listen_id = NULL; } } -static int rds_rdma_init(void) +int rds_rdma_init(void) { int ret; @@ -235,7 +271,7 @@ out: } module_init(rds_rdma_init); -static void rds_rdma_exit(void) +void rds_rdma_exit(void) { /* stop listening first to ensure no new connections are attempted */ rds_rdma_listen_stop(); diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h index faba4e382695e..2f2c7d976c219 100644 --- a/net/rds/rdma_transport.h +++ b/net/rds/rdma_transport.h @@ -11,6 +11,10 @@ int rds_rdma_conn_connect(struct rds_connection *conn); int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event); +/* from rdma_transport.c */ +int rds_rdma_init(void); +void rds_rdma_exit(void); + /* from ib.c */ extern struct rds_transport rds_ib_transport; int rds_ib_init(void); diff --git a/net/rds/rds.h b/net/rds/rds.h index da8adac2bf06f..038e809c19638 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -15,7 +15,8 @@ */ #define RDS_PROTOCOL_3_0 0x0300 #define RDS_PROTOCOL_3_1 0x0301 -#define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1 +#define RDS_PROTOCOL_3_2 0x0302 +#define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_2 #define RDS_PROTOCOL_MAJOR(v) ((v) >> 8) #define RDS_PROTOCOL_MINOR(v) ((v) & 255) #define RDS_PROTOCOL(maj, min) (((maj) << 8) | min) @@ -50,6 +51,7 @@ rdsdebug(char *fmt, ...) 
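The rds.h hunk bumps the default wire protocol to 3.2 while keeping the one-word major/minor packing. A tiny demonstration of the macros, not part of the patch; the "negotiate down to the lower version" step is shown only as a plausible use, not copied from the code:

#include <stdio.h>

#define RDS_PROTOCOL_MAJOR(v)  ((v) >> 8)
#define RDS_PROTOCOL_MINOR(v)  ((v) & 255)
#define RDS_PROTOCOL(maj, min) (((maj) << 8) | (min))

int main(void)
{
        unsigned int v = RDS_PROTOCOL(3, 2);      /* 0x0302, the new default */

        /* a peer that only speaks 3.1 would be negotiated down here */
        unsigned int peer = 0x0301;
        unsigned int agreed = v < peer ? v : peer;

        printf("local %u.%u agreed %u.%u\n",
               RDS_PROTOCOL_MAJOR(v), RDS_PROTOCOL_MINOR(v),
               RDS_PROTOCOL_MAJOR(agreed), RDS_PROTOCOL_MINOR(agreed));
        return 0;
}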
#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) #define RDS_CONG_MAP_BYTES (65536 / 8) +#define RDS_CONG_MAP_LONGS (RDS_CONG_MAP_BYTES / sizeof(unsigned long)) #define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE) #define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8) @@ -80,6 +82,7 @@ enum { #define RDS_LL_SEND_FULL 0 #define RDS_RECONNECT_PENDING 1 #define RDS_IN_XMIT 2 +#define RDS_RECV_REFILL 3 struct rds_connection { struct hlist_node c_hash_node; @@ -110,29 +113,51 @@ struct rds_connection { void *c_transport_data; atomic_t c_state; + unsigned long c_send_gen; unsigned long c_flags; unsigned long c_reconnect_jiffies; struct delayed_work c_send_w; struct delayed_work c_recv_w; struct delayed_work c_conn_w; + struct delayed_work c_hb_w; struct work_struct c_down_w; struct mutex c_cm_lock; /* protect conn state & cm */ wait_queue_head_t c_waitq; struct list_head c_map_item; unsigned long c_map_queued; + unsigned long c_connection_start; /* when was this connection started */ unsigned int c_unacked_packets; unsigned int c_unacked_bytes; /* Protocol version */ unsigned int c_version; + + /* Re-connect stall diagnostics */ + unsigned long c_reconnect_start; + unsigned int c_reconnect_drops; + int c_reconnect_warn; + int c_reconnect_err; + + unsigned int c_reconnect; + + /* Qos support */ + u8 c_tos; + + unsigned int c_pending_flush; + + unsigned long c_hb_start; + + unsigned int c_active_side; }; #define RDS_FLAG_CONG_BITMAP 0x01 #define RDS_FLAG_ACK_REQUIRED 0x02 #define RDS_FLAG_RETRANSMITTED 0x04 -#define RDS_MAX_ADV_CREDIT 255 +#define RDS_FLAG_HB_PING 0x08 +#define RDS_FLAG_HB_PONG 0x10 +#define RDS_MAX_ADV_CREDIT 127 /* * Maximum space available for extension headers. @@ -284,6 +309,7 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) #define RDS_MSG_RETRANSMITTED 5 #define RDS_MSG_MAPPED 6 #define RDS_MSG_PAGEVEC 7 +#define RDS_MSG_FLUSH 8 struct rds_message { atomic_t m_refcount; @@ -315,18 +341,8 @@ struct rds_message { struct { struct rm_atomic_op { int op_type; - union { - struct { - uint64_t compare; - uint64_t swap; - uint64_t compare_mask; - uint64_t swap_mask; - } op_m_cswp; - struct { - uint64_t add; - uint64_t nocarry_mask; - } op_m_fadd; - }; + uint64_t op_swap_add; + uint64_t op_compare; u32 op_rkey; u64 op_remote_addr; @@ -350,16 +366,20 @@ struct rds_message { unsigned int op_mapped:1; unsigned int op_silent:1; unsigned int op_active:1; + unsigned int op_implicit_mr:1; + unsigned int op_remote_complete:1; unsigned int op_bytes; unsigned int op_nents; unsigned int op_count; struct scatterlist *op_sg; struct rds_notifier *op_notifier; - struct rds_mr *op_rdma_mr; } rdma; struct rm_data_op { unsigned int op_active:1; + unsigned int op_notify:1; + unsigned int op_async:1; + struct rds_notifier *op_notifier; unsigned int op_nents; unsigned int op_count; struct scatterlist *op_sg; @@ -378,6 +398,7 @@ struct rds_notifier { struct list_head n_list; uint64_t n_user_token; int n_status; + struct rds_connection *n_conn; }; /** @@ -449,6 +470,8 @@ struct rds_transport { void (*sync_mr)(void *trans_private, int direction); void (*free_mr)(void *trans_private, int invalidate); void (*flush_mrs)(void); + void (*check_migration)(struct rds_connection *conn, + struct rdma_cm_event *event); }; struct rds_sock { @@ -513,6 +536,9 @@ struct rds_sock { /* Socket options - in case there will be more */ unsigned char rs_recverr, rs_cong_monitor; + int poison; + + u8 rs_tos; }; static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) @@ -575,7 
+601,6 @@ struct rds_statistics { }; /* af_rds.c */ -char *rds_str_array(char **array, size_t elements, size_t index); void rds_sock_addref(struct rds_sock *rs); void rds_sock_put(struct rds_sock *rs); void rds_wake_sk_sleep(struct rds_sock *rs); @@ -588,6 +613,8 @@ static inline void __rds_wake_sk_sleep(struct sock *sk) } extern wait_queue_head_t rds_poll_waitq; +void debug_sock_hold(struct sock *sock); +void debug_sock_put(struct sock *sock); /* bind.c */ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); @@ -613,11 +640,16 @@ struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); int rds_conn_init(void); void rds_conn_exit(void); struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, - struct rds_transport *trans, gfp_t gfp); + struct rds_transport *trans, + u8 tos, gfp_t gfp); struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, - struct rds_transport *trans, gfp_t gfp); + struct rds_transport *trans, + u8 tos, gfp_t gfp); +struct rds_connection *rds_conn_find(__be32 laddr, __be32 faddr, + struct rds_transport *trans, u8 tos); void rds_conn_shutdown(struct rds_connection *conn); void rds_conn_destroy(struct rds_connection *conn); +void rds_conn_reset(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn); void rds_conn_connect_if_down(struct rds_connection *conn); void rds_for_each_conn_info(struct socket *sock, unsigned int len, @@ -666,6 +698,8 @@ int rds_message_add_extension(struct rds_header *hdr, unsigned int type, const void *data, unsigned int len); int rds_message_next_extension(struct rds_header *hdr, unsigned int *pos, void *buf, unsigned int *buflen); +int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version); +int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version); int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, size_t size); @@ -702,6 +736,7 @@ void rds_page_exit(void); /* recv.c */ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, __be32 saddr); +void rds_inc_addref(struct rds_incoming *inc); void rds_inc_put(struct rds_incoming *inc); void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, struct rds_incoming *inc, gfp_t gfp, enum km_type km); @@ -723,10 +758,14 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); void rds_send_drop_acked(struct rds_connection *conn, u64 ack, is_acked_func is_acked); +void rds_send_remove_from_sock(struct list_head *messages, int status); int rds_send_pong(struct rds_connection *conn, __be16 dport); +int rds_send_hb(struct rds_connection *conn, int response); struct rds_message *rds_send_get_message(struct rds_connection *, struct rm_rdma_op *); +extern unsigned int rds_async_send_enabled; + /* rdma.c */ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); @@ -746,6 +785,7 @@ void rds_rdma_free_op(struct rm_rdma_op *ro); void rds_atomic_free_op(struct rm_atomic_op *ao); void rds_rdma_send_complete(struct rds_message *rm, int wc_status); void rds_atomic_send_complete(struct rds_message *rm, int wc_status); +void rds_asend_complete(struct rds_message *rm, int wc_status); int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr 
*cmsg); @@ -771,8 +811,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); int rds_stats_init(void); void rds_stats_exit(void); void rds_stats_info_copy(struct rds_info_iterator *iter, - uint64_t *values, const char *const *names, - size_t nr); + uint64_t *values, char **names, size_t nr); /* sysctl.c */ int rds_sysctl_init(void); @@ -797,6 +836,7 @@ void rds_connect_worker(struct work_struct *); void rds_shutdown_worker(struct work_struct *); void rds_send_worker(struct work_struct *); void rds_recv_worker(struct work_struct *); +void rds_hb_worker(struct work_struct *); void rds_connect_complete(struct rds_connection *conn); /* transport.c */ diff --git a/net/rds/recv.c b/net/rds/recv.c index 51a8f8ea6c6f0..a298aadc2fd45 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -31,7 +31,6 @@ * */ #include -#include #include #include @@ -48,11 +47,12 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, } EXPORT_SYMBOL_GPL(rds_inc_init); -static void rds_inc_addref(struct rds_incoming *inc) +void rds_inc_addref(struct rds_incoming *inc) { rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount)); atomic_inc(&inc->i_refcount); } +EXPORT_SYMBOL_GPL(rds_inc_addref); void rds_inc_put(struct rds_incoming *inc) { @@ -194,16 +194,23 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, * XXX we could spend more on the wire to get more robust failure * detection, arguably worth it to avoid data corruption. */ - if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq && - (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) { + + if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq + && (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) { rds_stats_inc(s_recv_drop_old_seq); goto out; } conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1; if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { - rds_stats_inc(s_recv_ping); - rds_send_pong(conn, inc->i_hdr.h_sport); + if (inc->i_hdr.h_flags & RDS_FLAG_HB_PING) { + rds_send_hb(conn, 1); + } else if (inc->i_hdr.h_flags & RDS_FLAG_HB_PONG) { + conn->c_hb_start = 0; + } else { + rds_stats_inc(s_recv_ping); + rds_send_pong(conn, inc->i_hdr.h_sport); + } goto out; } @@ -295,7 +302,7 @@ static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc, int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) { struct rds_notifier *notifier; - struct rds_rdma_notify cmsg = { 0 }; /* fill holes with zero */ + struct rds_rdma_send_notify cmsg; unsigned int count = 0, max_messages = ~0U; unsigned long flags; LIST_HEAD(copy); @@ -319,7 +326,7 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) while (!list_empty(&rs->rs_notify_queue) && count < max_messages) { notifier = list_entry(rs->rs_notify_queue.next, struct rds_notifier, n_list); - list_move(¬ifier->n_list, ©); + list_move_tail(¬ifier->n_list, ©); count++; } spin_unlock_irqrestore(&rs->rs_lock, flags); @@ -334,12 +341,24 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) cmsg.user_token = notifier->n_user_token; cmsg.status = notifier->n_status; - err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS, + err = put_cmsg(msghdr, SOL_RDS, + RDS_CMSG_RDMA_SEND_STATUS, sizeof(cmsg), &cmsg); if (err) break; } + /* If this is the last failed op, re-open the connection for + traffic */ + if (notifier->n_conn) { + spin_lock_irqsave(¬ifier->n_conn->c_lock, flags); + if (notifier->n_conn->c_pending_flush) + notifier->n_conn->c_pending_flush--; + else + printk(KERN_ERR 
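The receive path above teaches port-0 messages about the new heartbeat flags: an HB_PING is answered with a pong, an HB_PONG clears the outstanding-heartbeat timestamp, and anything else is the pre-existing ping/pong probe. A minimal sketch of that dispatch, not part of the patch; the flag values come from the rds.h hunk, the struct is invented:

#include <stdio.h>

#define RDS_FLAG_HB_PING 0x08
#define RDS_FLAG_HB_PONG 0x10

struct demo_conn {
        unsigned long hb_start;     /* nonzero while a heartbeat is outstanding */
};

/* what the receive path does for dport == 0 messages */
static void handle_port0(struct demo_conn *conn, unsigned int h_flags)
{
        if (h_flags & RDS_FLAG_HB_PING) {
                printf("got ping, sending pong\n");   /* rds_send_hb(conn, 1) */
        } else if (h_flags & RDS_FLAG_HB_PONG) {
                conn->hb_start = 0;                   /* peer is alive */
                printf("got pong, heartbeat satisfied\n");
        } else {
                printf("plain ping, sending pong\n"); /* rds_send_pong() */
        }
}

int main(void)
{
        struct demo_conn conn = { .hb_start = 12345 };

        handle_port0(&conn, RDS_FLAG_HB_PING);
        handle_port0(&conn, RDS_FLAG_HB_PONG);
        return 0;
}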
"rds_notify_queue_get: OOPS!\n"); + spin_unlock_irqrestore(¬ifier->n_conn->c_lock, flags); + } + list_del_init(¬ifier->n_list); kfree(notifier); } @@ -409,8 +428,6 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo); - msg->msg_namelen = 0; - if (msg_flags & MSG_OOB) goto out; @@ -433,9 +450,10 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, } timeo = wait_event_interruptible_timeout(*sk_sleep(sk), - (!list_empty(&rs->rs_notify_queue) || - rs->rs_cong_notify || - rds_next_incoming(rs, &inc)), timeo); + (!list_empty(&rs->rs_notify_queue) + || rs->rs_cong_notify + || rds_next_incoming(rs, &inc)), + timeo); rdsdebug("recvmsg woke inc %p timeo %ld\n", inc, timeo); if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT) @@ -486,7 +504,6 @@ int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, sin->sin_port = inc->i_hdr.h_sport; sin->sin_addr.s_addr = inc->i_saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); - msg->msg_namelen = sizeof(*sin); } break; } @@ -532,6 +549,7 @@ void rds_inc_info_copy(struct rds_incoming *inc, minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence); minfo.len = be32_to_cpu(inc->i_hdr.h_len); + minfo.tos = inc->i_conn->c_tos; if (flip) { minfo.laddr = daddr; diff --git a/net/rds/send.c b/net/rds/send.c index f6bdfb0fba182..a3b5ecc617fb4 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -31,7 +31,6 @@ * */ #include -#include #include #include #include @@ -52,7 +51,9 @@ static int send_batch_count = 64; module_param(send_batch_count, int, 0444); MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); -static void rds_send_remove_from_sock(struct list_head *messages, int status); +unsigned int rds_async_send_enabled = 0; +module_param(rds_async_send_enabled, int, 0444); +MODULE_PARM_DESC(rds_async_send_enabled, "Set to enable Async Send"); /* * Reset the send state. Callers must ensure that this doesn't race with @@ -62,6 +63,7 @@ void rds_send_reset(struct rds_connection *conn) { struct rds_message *rm, *tmp; unsigned long flags; + int failed_op = 0; if (conn->c_xmit_rm) { rm = conn->c_xmit_rm; @@ -70,8 +72,10 @@ void rds_send_reset(struct rds_connection *conn) * transport. 
This isn't entirely true (it's flushed out * independently) but as the connection is down, there's * no ongoing RDMA to/from that memory */ - rds_message_unmapped(rm); - rds_message_put(rm); + if (!rds_async_send_enabled) { + rds_message_unmapped(rm); + rds_message_put(rm); + } } conn->c_xmit_sg = 0; @@ -91,8 +95,55 @@ void rds_send_reset(struct rds_connection *conn) list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags); + + /* flush internal HB msgs */ + if ((rm->m_inc.i_hdr.h_flags == RDS_FLAG_HB_PONG) || + (rm->m_inc.i_hdr.h_flags == RDS_FLAG_HB_PING)) + set_bit(RDS_MSG_FLUSH, &rm->m_flags); + + /* check for failed op */ + if (rds_async_send_enabled && (rm->rdma.op_active || + (rm->data.op_active && rm->data.op_async))) + failed_op = 1; } list_splice_init(&conn->c_retrans, &conn->c_send_queue); + + /* if there was a failed op, flush all async ops */ + if (failed_op) { + list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, + m_conn_item) { + if (rm->rdma.op_active) { + if (rm->rdma.op_notifier) { + struct rds_notifier *notifier; + + notifier = rm->rdma.op_notifier; + notifier->n_conn = conn; + if (test_bit(RDS_MSG_RETRANSMITTED, + &rm->m_flags) && + !notifier->n_status) { + notifier->n_status = + RDS_RDMA_REMOTE_ERROR; + } + + if (!test_bit(RDS_MSG_FLUSH, + &rm->m_flags)) { + conn->c_pending_flush++; + } + } + set_bit(RDS_MSG_FLUSH, &rm->m_flags); + } + if (rm->data.op_active && rm->data.op_async) { + if (rm->data.op_notifier) { + rm->data.op_notifier->n_conn = conn; + if (!test_bit(RDS_MSG_FLUSH, + &rm->m_flags)) { + conn->c_pending_flush++; + } + } + set_bit(RDS_MSG_FLUSH, &rm->m_flags); + } + } + } spin_unlock_irqrestore(&conn->c_lock, flags); } @@ -116,13 +167,13 @@ static void release_in_xmit(struct rds_connection *conn) } /* - * We're making the conscious trade-off here to only send one message + * We're making the concious trade-off here to only send one message * down the connection at a time. * Pro: * - tx queueing is a simple fifo list - * - reassembly is optional and easily done by transports per conn + * - reassembly is optional and easily done by transports per conn * - no per flow rx lookup at all, straight to the socket - * - less per-frag memory and wire overhead + * - less per-frag memory and wire overhead * Con: * - queued acks can be delayed behind large messages * Depends: @@ -137,9 +188,14 @@ int rds_send_xmit(struct rds_connection *conn) struct scatterlist *sg; int ret = 0; LIST_HEAD(to_be_dropped); + int same_rm = 0; + int batch_count; + unsigned long send_gen = 0; restart: + batch_count = 0; + /* * sendmsg calls here after having queued its message on the send * queue. We only have one task feeding the connection at a time. If @@ -153,6 +209,17 @@ restart: goto out; } + /* + * we record the send generation after doing the xmit acquire. + * if someone else manages to jump in and do some work, we'll use + * this to avoid a goto restart farther down. + * + * we don't need a lock because the counter is only incremented + * while we have the in_xmit bit held. + */ + conn->c_send_gen++; + send_gen = conn->c_send_gen; + /* * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT, * we do the opposite to avoid races. 
@@ -174,6 +241,17 @@ restart: rm = conn->c_xmit_rm; + if (!rm) { + same_rm = 0; + } else { + same_rm++; + if ((same_rm >= 4096) && printk_ratelimit()) { + printk(KERN_ERR "RDS: Stuck rm\n"); + ret = -EAGAIN; + break; + } + } + /* * If between sending messages, we can send a pending congestion * map update. @@ -199,6 +277,16 @@ restart: if (!rm) { unsigned int len; + batch_count++; + + /* we want to process as big a batch as we can, but + * we also want to avoid softlockups. If we've been + * through a lot of messages, lets back off and see + * if anyone else jumps in + */ + if (batch_count >= 1024) + goto over_batch; + spin_lock_irqsave(&conn->c_lock, flags); if (!list_empty(&conn->c_send_queue)) { @@ -223,22 +311,27 @@ restart: * RDMA to a bad MR key is by moving the entire * queue pair to error state. We cold possibly * recover from that, but right now we drop the - * connection. - * Therefore, we never retransmit messages with RDMA ops. + * connection. Therefore, we never retransmit messages + * with RDMA ops. */ - if (rm->rdma.op_active && - test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { + + if (test_bit(RDS_MSG_FLUSH, &rm->m_flags) || + (rm->rdma.op_active && + test_bit(RDS_MSG_RETRANSMITTED, + &rm->m_flags))) { spin_lock_irqsave(&conn->c_lock, flags); - if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) - list_move(&rm->m_conn_item, &to_be_dropped); + if (test_and_clear_bit(RDS_MSG_ON_CONN, + &rm->m_flags)) + list_move_tail(&rm->m_conn_item, + &to_be_dropped); spin_unlock_irqrestore(&conn->c_lock, flags); continue; } /* Require an ACK every once in a while */ len = ntohl(rm->m_inc.i_hdr.h_len); - if (conn->c_unacked_packets == 0 || - conn->c_unacked_bytes < len) { + if (conn->c_unacked_packets == 0 + || conn->c_unacked_bytes < len) { __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags); conn->c_unacked_packets = rds_sysctl_max_unacked_packets; @@ -255,26 +348,32 @@ restart: /* The transport either sends the whole rdma or none of it */ if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) { rm->m_final_op = &rm->rdma; + /* The transport owns the mapped memory for now. + * You can't unmap it while it's on the send queue */ + set_bit(RDS_MSG_MAPPED, &rm->m_flags); ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); - if (ret) + if (ret) { + clear_bit(RDS_MSG_MAPPED, &rm->m_flags); + wake_up_interruptible(&rm->m_flush_wait); break; + } conn->c_xmit_rdma_sent = 1; - /* The transport owns the mapped memory for now. - * You can't unmap it while it's on the send queue */ - set_bit(RDS_MSG_MAPPED, &rm->m_flags); } if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) { rm->m_final_op = &rm->atomic; + /* The transport owns the mapped memory for now. + * You can't unmap it while it's on the send queue */ + set_bit(RDS_MSG_MAPPED, &rm->m_flags); ret = conn->c_trans->xmit_atomic(conn, &rm->atomic); - if (ret) + if (ret) { + clear_bit(RDS_MSG_MAPPED, &rm->m_flags); + wake_up_interruptible(&rm->m_flush_wait); break; + } conn->c_xmit_atomic_sent = 1; - /* The transport owns the mapped memory for now. - * You can't unmap it while it's on the send queue */ - set_bit(RDS_MSG_MAPPED, &rm->m_flags); } /* @@ -354,17 +453,21 @@ restart: } } +over_batch: if (conn->c_trans->xmit_complete) conn->c_trans->xmit_complete(conn); - release_in_xmit(conn); /* Nuke any messages we decided not to retransmit. 
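Two related guards appear in rds_send_xmit() above: a batch cap so one caller cannot hog the CPU indefinitely, and a send generation counter so that, after dropping RDS_IN_XMIT, the caller only loops back if nobody else has taken over in the meantime. A single-threaded userspace skeleton of that shape, not part of the patch; the queue handling and the 1024 cap are simplified stand-ins:

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag in_xmit = ATOMIC_FLAG_INIT;
static _Atomic unsigned long send_gen;
static _Atomic int queue_len = 3000;

/* cap the work done per pass and use a generation number to detect
 * whether another sender slipped in after we dropped the bit */
static void send_xmit(void)
{
restart:
        if (atomic_flag_test_and_set(&in_xmit))
                return;                          /* someone else is sending */

        unsigned long my_gen = atomic_fetch_add(&send_gen, 1) + 1;
        int batch = 0;

        while (atomic_load(&queue_len) > 0 && batch < 1024) {
                atomic_fetch_sub(&queue_len, 1); /* "transmit" one message */
                batch++;
        }

        atomic_flag_clear(&in_xmit);

        if (atomic_load(&queue_len) > 0 &&
            my_gen == atomic_load(&send_gen)) {
                if (batch < 1024)
                        goto restart;            /* raced with a new sender */
                printf("over batch limit, deferring to the workqueue\n");
        }
}

int main(void)
{
        while (atomic_load(&queue_len) > 0)
                send_xmit();                     /* stands in for the worker */
        printf("queue drained\n");
        return 0;
}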
*/ if (!list_empty(&to_be_dropped)) { /* irqs on here, so we can put(), unlike above */ - list_for_each_entry(rm, &to_be_dropped, m_conn_item) + list_for_each_entry(rm, &to_be_dropped, m_conn_item) { + if (rds_async_send_enabled && rm->rdma.op_implicit_mr) + rds_rdma_unuse(rm->m_rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1); + rds_message_unmapped(rm); rds_message_put(rm); - rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); + } + rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_SEND_DROPPED); } /* @@ -377,17 +480,26 @@ restart: * If the transport cannot continue (i.e ret != 0), then it must * call us when more room is available, such as from the tx * completion handler. + * + * We have an extra generation check here so that if someone manages + * to jump in after our release_in_xmit, we'll see that they have done + * some work and we will skip our goto */ if (ret == 0) { smp_mb(); - if (!list_empty(&conn->c_send_queue)) { + if ((test_bit(0, &conn->c_map_queued) || + !list_empty(&conn->c_send_queue)) && + send_gen == conn->c_send_gen) { rds_stats_inc(s_send_lock_queue_raced); - goto restart; + if (batch_count < 1024) + goto restart; + queue_delayed_work(rds_wq, &conn->c_send_w, 1); } } out: return ret; } +EXPORT_SYMBOL_GPL(rds_send_xmit); static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm) { @@ -410,6 +522,40 @@ static inline int rds_send_is_acked(struct rds_message *rm, u64 ack, return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack; } +void rds_asend_complete(struct rds_message *rm, int status) +{ + struct rds_sock *rs = NULL; + struct rm_data_op *so; + struct rds_notifier *notifier; + unsigned long flags; + + spin_lock_irqsave(&rm->m_rs_lock, flags); + + so = &rm->data; + if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && + so->op_active && so->op_notifier && so->op_notify) { + notifier = so->op_notifier; + rs = rm->m_rs; + debug_sock_hold(rds_rs_to_sk(rs)); + + notifier->n_status = status; + if (!status) { + spin_lock(&rs->rs_lock); + list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); + spin_unlock(&rs->rs_lock); + so->op_notifier = NULL; + } + } + + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + + if (rs) { + rds_wake_sk_sleep(rs); + debug_sock_put(rds_rs_to_sk(rs)); + } +} +EXPORT_SYMBOL_GPL(rds_asend_complete); + /* * This is pretty similar to what happens below in the ACK * handling code - except that we call here as soon as we get @@ -427,24 +573,29 @@ void rds_rdma_send_complete(struct rds_message *rm, int status) ro = &rm->rdma; if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && - ro->op_active && ro->op_notify && ro->op_notifier) { + ro->op_active && ro->op_notifier && ro->op_notify) { notifier = ro->op_notifier; rs = rm->m_rs; - sock_hold(rds_rs_to_sk(rs)); + debug_sock_hold(rds_rs_to_sk(rs)); notifier->n_status = status; - spin_lock(&rs->rs_lock); - list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); - spin_unlock(&rs->rs_lock); - ro->op_notifier = NULL; + if (!ro->op_remote_complete) { + if (rds_async_send_enabled && !status) { + spin_lock(&rs->rs_lock); + list_add_tail(¬ifier->n_list, + &rs->rs_notify_queue); + spin_unlock(&rs->rs_lock); + ro->op_notifier = NULL; + } + } } spin_unlock_irqrestore(&rm->m_rs_lock, flags); if (rs) { rds_wake_sk_sleep(rs); - sock_put(rds_rs_to_sk(rs)); + debug_sock_put(rds_rs_to_sk(rs)); } } EXPORT_SYMBOL_GPL(rds_rdma_send_complete); @@ -462,25 +613,27 @@ void rds_atomic_send_complete(struct rds_message *rm, int status) spin_lock_irqsave(&rm->m_rs_lock, flags); ao = &rm->atomic; - if 
(test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) - && ao->op_active && ao->op_notify && ao->op_notifier) { + if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && + ao->op_active && ao->op_notify && ao->op_notifier) { notifier = ao->op_notifier; rs = rm->m_rs; - sock_hold(rds_rs_to_sk(rs)); + debug_sock_hold(rds_rs_to_sk(rs)); notifier->n_status = status; - spin_lock(&rs->rs_lock); - list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); - spin_unlock(&rs->rs_lock); - - ao->op_notifier = NULL; + if (rds_async_send_enabled && !status) { + spin_lock(&rs->rs_lock); + list_add_tail(¬ifier->n_list, + &rs->rs_notify_queue); + spin_unlock(&rs->rs_lock); + ao->op_notifier = NULL; + } } spin_unlock_irqrestore(&rm->m_rs_lock, flags); if (rs) { rds_wake_sk_sleep(rs); - sock_put(rds_rs_to_sk(rs)); + debug_sock_put(rds_rs_to_sk(rs)); } } EXPORT_SYMBOL_GPL(rds_atomic_send_complete); @@ -495,6 +648,7 @@ __rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) { struct rm_rdma_op *ro; struct rm_atomic_op *ao; + struct rm_data_op *so; ro = &rm->rdma; if (ro->op_active && ro->op_notify && ro->op_notifier) { @@ -510,6 +664,13 @@ __rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) ao->op_notifier = NULL; } + so = &rm->data; + if (so->op_active && so->op_notifier) { + so->op_notifier->n_status = status; + list_add_tail(&so->op_notifier->n_list, &rs->rs_notify_queue); + so->op_notifier = NULL; + } + /* No need to wake the app - caller does this */ } @@ -557,7 +718,7 @@ EXPORT_SYMBOL_GPL(rds_send_get_message); * removing the messages from the 'messages' list regardless of if it found * the messages on the socket list or not. */ -static void rds_send_remove_from_sock(struct list_head *messages, int status) +void rds_send_remove_from_sock(struct list_head *messages, int status) { unsigned long flags; struct rds_sock *rs = NULL; @@ -580,6 +741,7 @@ static void rds_send_remove_from_sock(struct list_head *messages, int status) * while we're messing with it. It does not prevent the * message from being removed from the socket, though. 
*/ + spin_lock_irqsave(&rm->m_rs_lock, flags); if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) goto unlock_and_drop; @@ -587,29 +749,58 @@ static void rds_send_remove_from_sock(struct list_head *messages, int status) if (rs != rm->m_rs) { if (rs) { rds_wake_sk_sleep(rs); - sock_put(rds_rs_to_sk(rs)); + debug_sock_put(rds_rs_to_sk(rs)); } rs = rm->m_rs; - sock_hold(rds_rs_to_sk(rs)); + debug_sock_hold(rds_rs_to_sk(rs)); } spin_lock(&rs->rs_lock); if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { - struct rm_rdma_op *ro = &rm->rdma; - struct rds_notifier *notifier; - list_del_init(&rm->m_sock_item); rds_send_sndbuf_remove(rs, rm); - if (ro->op_active && ro->op_notifier && - (ro->op_notify || (ro->op_recverr && status))) { - notifier = ro->op_notifier; - list_add_tail(¬ifier->n_list, - &rs->rs_notify_queue); - if (!notifier->n_status) - notifier->n_status = status; + if (rm->rdma.op_active && rm->rdma.op_notifier) { + struct rm_rdma_op *ro = &rm->rdma; + struct rds_notifier *notifier; + + if (ro->op_notify || status) { + notifier = ro->op_notifier; + list_add_tail(¬ifier->n_list, + &rs->rs_notify_queue); + if (!notifier->n_status) + notifier->n_status = status; + } else + kfree(rm->rdma.op_notifier); rm->rdma.op_notifier = NULL; + } else if (rm->atomic.op_active && rm->atomic.op_notifier) { + struct rm_atomic_op *ao = &rm->atomic; + struct rds_notifier *notifier; + + if (ao->op_notify || status) { + notifier = ao->op_notifier; + list_add_tail(¬ifier->n_list, + &rs->rs_notify_queue); + if (!notifier->n_status) + notifier->n_status = status; + } else + kfree(rm->atomic.op_notifier); + rm->atomic.op_notifier = NULL; + } else if (rm->data.op_active && rm->data.op_notifier) { + struct rm_data_op *so = &rm->data; + struct rds_notifier *notifier; + + if (so->op_notify || status) { + notifier = so->op_notifier; + list_add_tail(¬ifier->n_list, + &rs->rs_notify_queue); + if (!notifier->n_status) + notifier->n_status = status; + } else + kfree(rm->data.op_notifier); + rm->data.op_notifier = NULL; } + was_on_sock = 1; rm->m_rs = NULL; } @@ -624,7 +815,7 @@ unlock_and_drop: if (rs) { rds_wake_sk_sleep(rs); - sock_put(rds_rs_to_sk(rs)); + debug_sock_put(rds_rs_to_sk(rs)); } } @@ -663,7 +854,7 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack, spin_unlock_irqrestore(&conn->c_lock, flags); /* now remove the messages from the sock list as needed */ - rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS); + rds_send_remove_from_sock(&list, RDS_RDMA_SEND_SUCCESS); } EXPORT_SYMBOL_GPL(rds_send_drop_acked); @@ -720,7 +911,7 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) spin_lock_irqsave(&rm->m_rs_lock, flags); spin_lock(&rs->rs_lock); - __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); + __rds_send_complete(rs, rm, RDS_RDMA_SEND_CANCELED); spin_unlock(&rs->rs_lock); rm->m_rs = NULL; @@ -734,8 +925,23 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) while (!list_empty(&list)) { rm = list_entry(list.next, struct rds_message, m_sock_item); list_del_init(&rm->m_sock_item); - rds_message_wait(rm); + + /* + * just in case the code above skipped this message + * because RDS_MSG_ON_CONN wasn't set, run it again here + * taking m_rs_lock is the only thing that keeps us + * from racing with ack processing. 
+ */ + spin_lock_irqsave(&rm->m_rs_lock, flags); + + spin_lock(&rs->rs_lock); + __rds_send_complete(rs, rm, RDS_RDMA_SEND_CANCELED); + spin_unlock(&rs->rs_lock); + + rm->m_rs = NULL; + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + rds_message_put(rm); } } @@ -795,6 +1001,27 @@ static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn, rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++); list_add_tail(&rm->m_conn_item, &conn->c_send_queue); set_bit(RDS_MSG_ON_CONN, &rm->m_flags); + + /* This can race with rds_send_reset. If an async op sneaked + * in after resetting the send state, flush it too. + */ + if (conn->c_pending_flush) { + if (rm->rdma.op_active) { + if (rm->rdma.op_notifier) { + rm->rdma.op_notifier->n_conn = conn; + conn->c_pending_flush++; + } + set_bit(RDS_MSG_FLUSH, &rm->m_flags); + } + if (rm->data.op_active && rm->data.op_async) { + if (rm->data.op_notifier) { + rm->data.op_notifier->n_conn = conn; + conn->c_pending_flush++; + } + set_bit(RDS_MSG_FLUSH, &rm->m_flags); + } + } + spin_unlock(&conn->c_lock); rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n", @@ -839,14 +1066,13 @@ static int rds_rm_size(struct msghdr *msg, int data_len) case RDS_CMSG_RDMA_DEST: case RDS_CMSG_RDMA_MAP: + case RDS_CMSG_ASYNC_SEND: cmsg_groups |= 2; /* these are valid but do no add any size */ break; case RDS_CMSG_ATOMIC_CSWP: case RDS_CMSG_ATOMIC_FADD: - case RDS_CMSG_MASKED_ATOMIC_CSWP: - case RDS_CMSG_MASKED_ATOMIC_FADD: cmsg_groups |= 1; size += sizeof(struct scatterlist); break; @@ -866,6 +1092,30 @@ static int rds_rm_size(struct msghdr *msg, int data_len) return size; } +static int rds_cmsg_asend(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + struct rds_asend_args *args; + + if (!rds_async_send_enabled) + return -EINVAL; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_asend_args))) + return -EINVAL; + + args = CMSG_DATA(cmsg); + rm->data.op_notifier = kzalloc(sizeof(*rm->data.op_notifier), GFP_KERNEL); + if (!rm->data.op_notifier) + return -ENOMEM; + + rm->data.op_notify = !!(args->flags & RDS_SEND_NOTIFY_ME); + rm->data.op_notifier->n_user_token = args->user_token; + rm->data.op_notifier->n_status = RDS_RDMA_SEND_SUCCESS; + rm->data.op_async = 1; + + return 0; +} + static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, struct msghdr *msg, int *allocated_mr) { @@ -898,11 +1148,13 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, break; case RDS_CMSG_ATOMIC_CSWP: case RDS_CMSG_ATOMIC_FADD: - case RDS_CMSG_MASKED_ATOMIC_CSWP: - case RDS_CMSG_MASKED_ATOMIC_FADD: ret = rds_cmsg_atomic(rs, rm, cmsg); break; + case RDS_CMSG_ASYNC_SEND: + ret = rds_cmsg_asend(rs, rm, cmsg); + break; + default: return -EINVAL; } @@ -914,6 +1166,11 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, return ret; } +struct user_hdr { + u32 seq; + u8 op; +}; + int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t payload_len) { @@ -932,6 +1189,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, /* Mirror Linux UDP mirror of BSD error message compatibility */ /* XXX: Perhaps MSG_MORE someday */ if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) { + printk(KERN_INFO "msg_flags 0x%08X\n", msg->msg_flags); ret = -EOPNOTSUPP; goto out; } @@ -971,11 +1229,9 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, /* Attach data to the rm */ if (payload_len) { + struct user_hdr *uhdr = 
msg->msg_iov->iov_base; + rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE)); - if (!rm->data.op_sg) { - ret = -ENOMEM; - goto out; - } ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len); if (ret) goto out; @@ -984,13 +1240,19 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, rm->m_daddr = daddr; + /* Parse any control messages the user may have included. */ + ret = rds_cmsg_send(rs, rm, msg, &allocated_mr); + if (ret) + goto out; + /* rds_conn_create has a spinlock that runs with IRQ off. * Caching the conn in the socket helps a lot. */ - if (rs->rs_conn && rs->rs_conn->c_faddr == daddr) + if (rs->rs_conn && rs->rs_conn->c_faddr == daddr && + rs->rs_tos == rs->rs_conn->c_tos) conn = rs->rs_conn; else { conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr, - rs->rs_transport, + rs->rs_transport, rs->rs_tos, sock->sk->sk_allocation); if (IS_ERR(conn)) { ret = PTR_ERR(conn); @@ -999,10 +1261,18 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, rs->rs_conn = conn; } - /* Parse any control messages the user may have included. */ - ret = rds_cmsg_send(rs, rm, msg, &allocated_mr); - if (ret) + /* + if (allocated_mr && conn->c_cleanup_stale_mrs) { + rds_rdma_cleanup_stale_mrs(rs, conn); + conn->c_cleanup_stale_mrs = 0; + } + */ + + /* Not accepting new sends until all the failed ops have been reaped */ + if (rds_async_send_enabled && conn->c_pending_flush) { + ret = -EAGAIN; goto out; + } if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) { if (printk_ratelimit()) @@ -1063,8 +1333,10 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, */ rds_stats_inc(s_send_queued); - if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) - rds_send_xmit(conn); + ret = rds_send_xmit(conn); + if (ret == -ENOMEM || ret == -EAGAIN) + queue_delayed_work(rds_wq, &conn->c_send_w, 1); + rds_message_put(rm); return payload_len; @@ -1120,8 +1392,9 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) rds_stats_inc(s_send_queued); rds_stats_inc(s_send_pong); - if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) - queue_delayed_work(rds_wq, &conn->c_send_w, 0); + ret = rds_send_xmit(conn); + if (ret == -ENOMEM || ret == -EAGAIN) + queue_delayed_work(rds_wq, &conn->c_send_w, 1); rds_message_put(rm); return 0; @@ -1131,3 +1404,44 @@ out: rds_message_put(rm); return ret; } + +int +rds_send_hb(struct rds_connection *conn, int response) +{ + struct rds_message *rm; + unsigned long flags; + int ret = 0; + + rm = rds_message_alloc(0, GFP_ATOMIC); + if (!rm) + return -ENOMEM; + + rm->m_daddr = conn->c_faddr; + rm->data.op_active = 1; + + spin_lock_irqsave(&conn->c_lock, flags); + list_add_tail(&rm->m_conn_item, &conn->c_send_queue); + set_bit(RDS_MSG_ON_CONN, &rm->m_flags); + rds_message_addref(rm); + rm->m_inc.i_conn = conn; + + rds_message_populate_header(&rm->m_inc.i_hdr, 0, 0, + conn->c_next_tx_seq); + + if (response) + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_HB_PONG; + else + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_HB_PING; + + rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED; + + conn->c_next_tx_seq++; + spin_unlock_irqrestore(&conn->c_lock, flags); + + ret = rds_send_xmit(conn); + if (ret == -ENOMEM || ret == -EAGAIN) + queue_delayed_work(rds_wq, &conn->c_send_w, 1); + + rds_message_put(rm); + return 0; +} diff --git a/net/rds/stats.c b/net/rds/stats.c index 10c759ccac0c7..01acc9faac24a 100644 --- a/net/rds/stats.c +++ b/net/rds/stats.c @@ -41,7 +41,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(rds_stats); /* 
:.,$s/unsigned long\>.*\ -#include #include #include @@ -41,7 +40,7 @@ /* only for info exporting */ static DEFINE_SPINLOCK(rds_tcp_tc_list_lock); static LIST_HEAD(rds_tcp_tc_list); -static unsigned int rds_tcp_tc_count; +unsigned int rds_tcp_tc_count; /* Track rds_tcp_connection structs so they can be cleaned up */ static DEFINE_SPINLOCK(rds_tcp_conn_lock); @@ -221,13 +220,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) static void rds_tcp_conn_free(void *arg) { struct rds_tcp_connection *tc = arg; - unsigned long flags; rdsdebug("freeing tc %p\n", tc); - - spin_lock_irqsave(&rds_tcp_conn_lock, flags); - list_del(&tc->t_tcp_node); - spin_unlock_irqrestore(&rds_tcp_conn_lock, flags); - kmem_cache_free(rds_tcp_conn_slab, tc); } @@ -249,7 +242,7 @@ static void rds_tcp_destroy_conns(void) } } -static void rds_tcp_exit(void) +void rds_tcp_exit(void) { rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); rds_tcp_listen_stop(); @@ -280,7 +273,7 @@ struct rds_transport rds_tcp_transport = { .t_prefer_loopback = 1, }; -static int rds_tcp_init(void) +int rds_tcp_init(void) { int ret; diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 9cf2927d00214..9421ad4b9100c 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -43,6 +43,8 @@ struct rds_tcp_statistics { }; /* tcp.c */ +int rds_tcp_init(void); +void rds_tcp_exit(void); void rds_tcp_tune(struct socket *sock); void rds_tcp_nonagle(struct socket *sock); void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn); @@ -85,4 +87,16 @@ DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats); unsigned int rds_tcp_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); +#ifndef NIPQUAD +#define NIPQUAD(addr) \ + ((unsigned char *)&(addr))[0], \ + ((unsigned char *)&(addr))[1], \ + ((unsigned char *)&(addr))[2], \ + ((unsigned char *)&(addr))[3] +#endif + +#ifndef NIPQUAD_FMT +#define NIPQUAD_FMT "%u.%u.%u.%u" +#endif + #endif diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index af95c8e058fc0..8d7e45aba44c6 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -43,7 +43,7 @@ void rds_tcp_state_change(struct sock *sk) struct rds_connection *conn; struct rds_tcp_connection *tc; - read_lock_bh(&sk->sk_callback_lock); + read_lock(&sk->sk_callback_lock); conn = sk->sk_user_data; if (!conn) { state_change = sk->sk_state_change; @@ -68,7 +68,7 @@ void rds_tcp_state_change(struct sock *sk) break; } out: - read_unlock_bh(&sk->sk_callback_lock); + read_unlock(&sk->sk_callback_lock); state_change(sk); } @@ -90,8 +90,8 @@ int rds_tcp_conn_connect(struct rds_connection *conn) ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src)); if (ret) { - rdsdebug("bind failed with %d at address %pI4\n", - ret, &conn->c_laddr); + rdsdebug("bind failed with %d at address %u.%u.%u.%u\n", + ret, NIPQUAD(conn->c_laddr)); goto out; } @@ -108,7 +108,8 @@ int rds_tcp_conn_connect(struct rds_connection *conn) O_NONBLOCK); sock = NULL; - rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret); + rdsdebug("connect to address %u.%u.%u.%u returned %d\n", + NIPQUAD(conn->c_faddr), ret); if (ret == -EINPROGRESS) ret = 0; @@ -141,7 +142,7 @@ void rds_tcp_conn_shutdown(struct rds_connection *conn) release_sock(sock->sk); sock_release(sock); - } + }; if (tc->t_tinc) { rds_inc_put(&tc->t_tinc->ti_inc); diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 8b5cc4aa88687..b61efbd1bf758 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -31,7 +31,6 @@ * */ #include 
-#include #include #include @@ -67,12 +66,12 @@ static int rds_tcp_accept_one(struct socket *sock) inet = inet_sk(new_sock->sk); - rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n", - &inet->inet_saddr, ntohs(inet->inet_sport), - &inet->inet_daddr, ntohs(inet->inet_dport)); + rdsdebug("accepted tcp %u.%u.%u.%u:%u -> %u.%u.%u.%u:%u\n", + NIPQUAD(inet->inet_saddr), ntohs(inet->inet_sport), + NIPQUAD(inet->inet_daddr), ntohs(inet->inet_dport)); - conn = rds_conn_create(inet->inet_saddr, inet->inet_daddr, - &rds_tcp_transport, GFP_KERNEL); + conn = rds_conn_create(inet->inet_saddr, inet->inet_daddr, &rds_tcp_transport, + 0, GFP_KERNEL); if (IS_ERR(conn)) { ret = PTR_ERR(conn); goto out; @@ -114,7 +113,7 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes) rdsdebug("listen data ready sk %p\n", sk); - read_lock_bh(&sk->sk_callback_lock); + read_lock(&sk->sk_callback_lock); ready = sk->sk_user_data; if (!ready) { /* check for teardown race */ ready = sk->sk_data_ready; @@ -131,7 +130,7 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes) queue_work(rds_wq, &rds_tcp_listen_work); out: - read_unlock_bh(&sk->sk_callback_lock); + read_unlock(&sk->sk_callback_lock); ready(sk, bytes); } diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index 78205e25500a5..19056f1cfbb67 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -31,7 +31,6 @@ * */ #include -#include #include #include "rds.h" @@ -272,8 +271,7 @@ out: } /* the caller has to hold the sock lock */ -static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp, - enum km_type km) +int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp, enum km_type km) { struct rds_tcp_connection *tc = conn->c_transport_data; struct socket *sock = tc->t_sock; @@ -325,7 +323,7 @@ void rds_tcp_data_ready(struct sock *sk, int bytes) rdsdebug("data ready sk %p bytes %d\n", sk, bytes); - read_lock_bh(&sk->sk_callback_lock); + read_lock(&sk->sk_callback_lock); conn = sk->sk_user_data; if (!conn) { /* check for teardown race */ ready = sk->sk_data_ready; @@ -339,7 +337,7 @@ void rds_tcp_data_ready(struct sock *sk, int bytes) if (rds_tcp_read_sock(conn, GFP_ATOMIC, KM_SOFTIRQ0) == -ENOMEM) queue_delayed_work(rds_wq, &conn->c_recv_w, 0); out: - read_unlock_bh(&sk->sk_callback_lock); + read_unlock(&sk->sk_callback_lock); ready(sk, bytes); } diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 1b4fd68f0c7c4..017aa83c14007 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -63,7 +63,7 @@ void rds_tcp_xmit_complete(struct rds_connection *conn) } /* the core send_sem serializes this with other xmit and shutdown */ -static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) +int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) { struct kvec vec = { .iov_base = data, @@ -143,9 +143,9 @@ out: rds_tcp_stats_inc(s_tcp_sndbuf_full); ret = 0; } else { - printk(KERN_WARNING "RDS/tcp: send to %pI4 " + printk(KERN_WARNING "RDS/tcp: send to %u.%u.%u.%u " "returned %d, disconnecting and reconnecting\n", - &conn->c_faddr, ret); + NIPQUAD(conn->c_faddr), ret); rds_conn_drop(conn); } } @@ -174,7 +174,7 @@ void rds_tcp_write_space(struct sock *sk) struct rds_connection *conn; struct rds_tcp_connection *tc; - read_lock_bh(&sk->sk_callback_lock); + read_lock(&sk->sk_callback_lock); conn = sk->sk_user_data; if (!conn) { write_space = sk->sk_write_space; @@ -194,7 +194,7 @@ void rds_tcp_write_space(struct sock *sk) queue_delayed_work(rds_wq, &conn->c_send_w, 0); out: - read_unlock_bh(&sk->sk_callback_lock); + 
read_unlock(&sk->sk_callback_lock); /* * write_space is only called when data leaves tcp's send queue if diff --git a/net/rds/tcp_stats.c b/net/rds/tcp_stats.c index d5898d03cd686..8ddec01b5ef85 100644 --- a/net/rds/tcp_stats.c +++ b/net/rds/tcp_stats.c @@ -40,7 +40,7 @@ DEFINE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats) ____cacheline_aligned; -static const char const *rds_tcp_stat_names[] = { +static char *rds_tcp_stat_names[] = { "tcp_data_ready_calls", "tcp_write_space_calls", "tcp_sndbuf_full", diff --git a/net/rds/threads.c b/net/rds/threads.c index 0fd90f8c5f59c..96b9c99b7aa2d 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -34,6 +34,11 @@ #include #include "rds.h" +#include "tcp.h" +static unsigned int rds_conn_hb_timeout = 0; +module_param(rds_conn_hb_timeout, int, 0444); +MODULE_PARM_DESC(rds_conn_hb_timeout, " Connection heartbeat timeout"); + /* * All of connection management is simplified by serializing it through @@ -73,8 +78,8 @@ EXPORT_SYMBOL_GPL(rds_wq); void rds_connect_complete(struct rds_connection *conn) { if (!rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_UP)) { - printk(KERN_WARNING "%s: Cannot transition to state UP, " - "current state is %d\n", + printk(KERN_WARNING "%s: Cannot transition to state UP" + ", current state is %d\n", __func__, atomic_read(&conn->c_state)); atomic_set(&conn->c_state, RDS_CONN_ERROR); @@ -89,6 +94,11 @@ void rds_connect_complete(struct rds_connection *conn) set_bit(0, &conn->c_map_queued); queue_delayed_work(rds_wq, &conn->c_send_w, 0); queue_delayed_work(rds_wq, &conn->c_recv_w, 0); + queue_delayed_work(rds_wq, &conn->c_hb_w, 0); + conn->c_hb_start = 0; + + conn->c_connection_start = get_seconds(); + conn->c_reconnect = 1; } EXPORT_SYMBOL_GPL(rds_connect_complete); @@ -143,16 +153,26 @@ void rds_connect_worker(struct work_struct *work) clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags); if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { + /* + * record the time we started trying to connect so that we can + * drop the connection if it doesn't work out after a while + */ + conn->c_connection_start = get_seconds(); + ret = conn->c_trans->conn_connect(conn); rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n", conn, &conn->c_laddr, &conn->c_faddr, ret); if (ret) { - if (rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_DOWN)) - rds_queue_reconnect(conn); - else + if (rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_DOWN)) { + if (conn->c_reconnect && conn->c_active_side) + rds_queue_reconnect(conn); + } else rds_conn_error(conn, "RDS: connect failed\n"); } + + if (!conn->c_reconnect) + conn->c_active_side = 1; } } @@ -162,7 +182,9 @@ void rds_send_worker(struct work_struct *work) int ret; if (rds_conn_state(conn) == RDS_CONN_UP) { + clear_bit(RDS_LL_SEND_FULL, &conn->c_flags); ret = rds_send_xmit(conn); + cond_resched(); rdsdebug("conn %p ret %d\n", conn, ret); switch (ret) { case -EAGAIN: @@ -200,11 +222,45 @@ void rds_recv_worker(struct work_struct *work) } } +void rds_hb_worker(struct work_struct *work) +{ + struct rds_connection *conn = container_of(work, struct rds_connection, c_hb_w.work); + unsigned long now = get_seconds(); + int ret; + + if (!rds_conn_hb_timeout || conn->c_loopback) + return; + + if (rds_conn_state(conn) == RDS_CONN_UP) { + if (!conn->c_hb_start) { + ret = rds_send_hb(conn, 0); + if (ret) { + rdsdebug("RDS/IB: rds_hb_worker: failed %d\n", ret); + return; + } + conn->c_hb_start = now; + } else if (now - conn->c_hb_start > rds_conn_hb_timeout) { + 
+			printk(KERN_NOTICE
+				"RDS/IB: connection <%u.%u.%u.%u,%u.%u.%u.%u,%d> "
+				"timed out (0x%lx,0x%lx)..disconnecting and reconnecting\n",
+				NIPQUAD(conn->c_laddr),
+				NIPQUAD(conn->c_faddr), conn->c_tos,
+				conn->c_hb_start, now);
+			rds_conn_drop(conn);
+			return;
+		}
+		queue_delayed_work(rds_wq, &conn->c_hb_w, HZ);
+	}
+}
+
 void rds_shutdown_worker(struct work_struct *work)
 {
 	struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
 
 	rds_conn_shutdown(conn);
+
+	if (!conn->c_reconnect)
+		conn->c_active_side = 0;
 }
 
 void rds_threads_exit(void)
diff --git a/net/rds/transport.c b/net/rds/transport.c
index 7f2ac4fec3678..2dd3de3e57f3a 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -119,8 +119,7 @@ unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
 	rds_info_iter_unmap(iter);
 	down_read(&rds_trans_sem);
 
-	for (i = 0; i < RDS_TRANS_COUNT; i++)
-	{
+	for (i = 0; i < RDS_TRANS_COUNT; i++) {
 		trans = transports[i];
 		if (!trans || !trans->stats_info_copy)
 			continue;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index c3c232a88d94d..6c014dd3a20bf 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -695,8 +695,7 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
 		return ERR_PTR(-ENOMEM);
 	xprt = &cma_xprt->sc_xprt;
 
-	listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP,
-				   IB_QPT_RC);
+	listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP);
 	if (IS_ERR(listen_id)) {
 		ret = PTR_ERR(listen_id);
 		dprintk("svcrdma: rdma_create_id failed = %d\n", ret);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 80f8da344df53..d4297dc43dc4c 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -387,7 +387,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
 
 	init_completion(&ia->ri_done);
 
-	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
+	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
 	if (IS_ERR(id)) {
 		rc = PTR_ERR(id);
 		dprintk("RPC: %s: rdma_create_id() failed %i\n",
-- 
2.50.1
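[Editorial aside, not part of the patch] The rds_hb_worker()/rds_send_hb()
additions above implement a simple keepalive: while the connection is up the
worker sends an RDS_FLAG_HB_PING, remembers when it left in c_hb_start, and
drops the connection if more than rds_conn_hb_timeout seconds pass without
the timer being cleared (c_hb_start is zeroed on connect; clearing it again
when the PONG arrives happens in the receive path, outside this hunk, so that
part is assumed here).  The stand-alone C sketch below shows only that
timeout policy; struct hb_state, hb_tick() and the other names are invented
for the illustration and are not part of the RDS code.

#include <stdio.h>
#include <time.h>

struct hb_state {
	time_t started;       /* 0 => no ping outstanding (cf. c_hb_start) */
	long timeout_sec;     /* 0 => heartbeat disabled (rds_conn_hb_timeout) */
};

enum hb_action { HB_NONE, HB_SEND_PING, HB_DROP_CONN };

/* Called periodically, like the worker that requeues itself every HZ. */
static enum hb_action hb_tick(struct hb_state *hb, time_t now)
{
	if (!hb->timeout_sec)
		return HB_NONE;                 /* feature disabled */

	if (!hb->started) {
		hb->started = now;              /* remember when the ping left */
		return HB_SEND_PING;
	}

	if (now - hb->started > hb->timeout_sec)
		return HB_DROP_CONN;            /* peer never answered in time */

	return HB_NONE;                         /* still waiting for the pong */
}

/* Assumed hook: the receive path would clear the timer when a PONG arrives. */
static void hb_pong_received(struct hb_state *hb)
{
	hb->started = 0;
}

int main(void)
{
	struct hb_state hb = { .started = 0, .timeout_sec = 30 };
	time_t t = time(NULL);

	if (hb_tick(&hb, t) == HB_SEND_PING)
		printf("ping sent\n");
	hb_pong_received(&hb);                  /* simulate a timely answer */
	if (hb_tick(&hb, t + 60) == HB_SEND_PING)
		printf("next ping goes out after the pong\n");
	return 0;
}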