From: Saeed Mahameed Date: Wed, 17 Apr 2013 20:21:12 +0000 (+0300) Subject: mlx4_vnic: add mlx4_vnic X-Git-Tag: v4.1.12-92~319^2^2~6^2~1 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=b6f907ed293536ef2a5b910bb8bc736e697c8e79;p=users%2Fjedix%2Flinux-maple.git mlx4_vnic: add mlx4_vnic Add mlx4_vnic code Also squash the following porting commits for compilation of the integrated commit (without squashing they won't compile): mlx4_vnic: adapt vnic to ofed2 mlx4 implementation mlx4_vnic: align with OFED2 upstream 3.7 kernel mlx4_vnic: Fix reference path to hw/mlx4 header files mlx4_vnic: remove mlx4_vnic_helper module mlx4_vnic: use ib_modify_cq() in upstream kernel We modify the code to use the upstream kernel's ib_modify_cq() (and not a modified Mellanox version) mlx4_vnic: removed reference to mlx4_ib_qp->rules_list in vnic_qp.c Remove a field introduced by the Mellanox OFED 2.4 flow steering patches, which are not in the upstream kernel. mlx4_vnic: used an older version of mlx4_qp_reserve_range() Use mlx4_qp_reserve_range() aligned with the version in Linux 3.18 (we can use the new API when it is available upstream) mlx4_vnic: port to Linux 3.18* The mlx4_vnic code is based on the original port of mlx4_vnic in UEK3. Make changes to compile on UEK4 (based on Linux 3.18): use upstream APIs (not Mellanox-specific ones) where they conflict, plus other changes needed to make it compile on Linux 3.18 Signed-off-by: Saeed Mahameed Signed-off-by: Ajaykumar Hotchandani Signed-off-by: Qing Huang (Ported from UEK3 and Mellanox OFED 2.4) Signed-off-by: Mukesh Kacker --- diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/Makefile b/drivers/net/ethernet/mellanox/mlx4_vnic/Makefile new file mode 100644 index 0000000000000..09d022a5f56a1 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/Makefile @@ -0,0 +1,9 @@ +obj-$(CONFIG_MLX4_VNIC) += mlx4_vnic.o + +mlx4_vnic-y := vnic_data_main.o vnic_data_ib.o vnic_data_netdev.o vnic_data_neigh.o \ + vnic_data_fs.o vnic_data_tx.o vnic_data_ethtool.o vnic_data_rx.o \ + vnic_fip_main.o vnic_fip_ib.o vnic_fip_discover.o vnic_fip_pkt.o \ + vnic_fip_login.o vnic_fip_vhub.o vnic_mcast.o vnic_port.o \ + vnic_param.o vnic_qp.o vnic_main.o fip_parser.o \ + vnic_data_mac.o + diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/doc/gw_state.dot b/drivers/net/ethernet/mellanox/mlx4_vnic/doc/gw_state.dot new file mode 100644 index 0000000000000..44f59565f31ba --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/doc/gw_state.dot @@ -0,0 +1,5 @@ +digraph { + FIP_GW_HOST_ADMIN; + FIP_GW_MCAST_RCVD; + FIP_GW_CONNECTED; +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/doc/regndev.dot b/drivers/net/ethernet/mellanox/mlx4_vnic/doc/regndev.dot new file mode 100644 index 0000000000000..ea10aba3add12 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/doc/regndev.dot @@ -0,0 +1,54 @@ +digraph { + + vnic_login_create_1 -> register_netdev; // + __vnic_login_create -> vnic_login_create_1; // + vnic_new_intf_store -> __vnic_login_create; // + vnic_port_data_init -> __vnic_login_create; // + vnic_ib_dev_add_one -> vnic_port_data_init; // + fip_vnic_login_create -> vnic_login_create_1; // + fip_vnic_test_login -> fip_vnic_login_create [label="login_wq", color=blue]; // + fip_vnic_destroy -> fip_vnic_test_login; // + fip_purge_vnics -> fip_vnic_destroy; // + fip_purge_vnics -> fip_purge_vnics [label="fip_wq", color=blue]; // + fip_vnic_close -> fip_purge_vnics [label="fip_wq", color=blue]; + fip_vnic_hadmin_init -> fip_vnic_test_login; // + fip_gw_update_hadmin_gw -> 
fip_vnic_hadmin_init; // + fip_discover_hadmin_update -> fip_gw_update_hadmin_gw; // + fip_hadmin_sysfs_update -> fip_discover_hadmin_update [label="fip_wq", color=blue]; // + fip_vnic_fsm -> fip_vnic_test_login; // + fip_gw_create_vnics -> fip_vnic_fsm; // + + + fip_gw_update_hadmin_gw -> fip_vnic_fsm; + fip_vnic_login_ack_recv -> fip_vnic_fsm; // + fip_discover_rx_packet_bh -> fip_vnic_login_ack_recv; + fip_vnic_tbl_done -> fip_vnic_fsm; // + vhub_handle_tbl -> fip_vnic_tbl_done; // + fip_vnic_recv_bh -> vhub_handle_tbl; // + fip_vnic_recv -> fip_vnic_recv_bh [label="fip_wq", color=blue]; // + fip_vnic_comp -> fip_vnic_recv; + + fip_discover_rx_advertise_bh -> fip_discover_gw_fsm; + + fip_hadmin_vnic_refresh -> fip_vnic_fsm; // + fip_gw_create_vnics -> fip_hadmin_vnic_refresh // + fip_gw_modified -> fip_gw_create_vnics; // + fip_discover_rx_advertise_bh -> fip_gw_modified; // + fip_discover_rx_packet_bh -> fip_discover_rx_advertise_bh; // + fip_discover_process_rx_bh -> fip_discover_rx_packet_bh; // + fip_discover_process_rx -> fip_discover_process_rx_bh [label="fip_wq", color=blue]; // + fip_discover_comp -> fip_discover_process_rx; + + + + fip_discover_rx_advertise_bh -> fip_gw_create_vnics; + fip_discover_gw_fsm -> fip_gw_create_vnics; + + vnic_login_pre_create_1 -> vnic_alloc_netdev; // + __vnic_login_create -> vnic_login_pre_create_1; + fip_vnic_hadmin_init -> vnic_login_pre_create_1; + fip_vnic_login_init -> vnic_login_pre_create_1; + fip_vnic_fsm -> fip_vnic_login_init; + + +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/doc/vnic_flush.dot b/drivers/net/ethernet/mellanox/mlx4_vnic/doc/vnic_flush.dot new file mode 100644 index 0000000000000..fc2a8fd2560c5 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/doc/vnic_flush.dot @@ -0,0 +1,5 @@ +digraph { + -> FIP_NO_FLUSH [label="fip_vnic_alloc"]; + FIP_PARTIAL_FLUSH; + FIP_FULL_FLUSH; +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/doc/vnic_state.dot b/drivers/net/ethernet/mellanox/mlx4_vnic/doc/vnic_state.dot new file mode 100644 index 0000000000000..6adcd8996e52d --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/doc/vnic_state.dot @@ -0,0 +1,15 @@ +digraph { + FIP_VNIC_CLOSED; + fip_vnic_alloc [shape=regular]; + fip_vnic_alloc -> FIP_VNIC_HADMIN_IDLE [label="hadmin"]; + fip_vnic_alloc -> FIP_VNIC_LOGIN [label="none hadmin"]; + FIP_VNIC_WAIT_4_ACK; + FIP_VNIC_RINGS_INIT; + FIP_VNIC_MCAST_INIT; + FIP_VNIC_MCAST_INIT_DONE; + FIP_VNIC_VHUB_INIT; + FIP_VNIC_VHUB_INIT_DONE; + FIP_VNIC_VHUB_DONE; + FIP_VNIC_VHUB_WRITE; + FIP_VNIC_CONNECTED; +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/fip_parser.c b/drivers/net/ethernet/mellanox/mlx4_vnic/fip_parser.c new file mode 100644 index 0000000000000..e1782998467c0 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/fip_parser.c @@ -0,0 +1,510 @@ +/* + * Copyright (c) 2010 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_fip.h" +#include "vnic_fip_pkt.h" + +static const struct subcode_rules { + u64 req_mask; + u64 opt_mask; +} subcodes_array[FIP_MAX_SUBCODES] = { + [FIP_HOST_SOL_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(ADDRESS), + .opt_mask = FIP_MASK(EXT_DESC), + }, + [FIP_GW_ADV_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(ADDRESS) | + FIP_MASK(GW_INFORMATION) | + FIP_MASK(GW_IDENTIFIER) | + FIP_MASK(KA_PARAMS), + .opt_mask = FIP_MASK(EXT_DESC), + }, + [FIP_HOST_LOGIN_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(ADDRESS) | + FIP_MASK(LOGIN) | + FIP_MASK(PARTITION), + .opt_mask = FIP_MASK(EXT_DESC), + }, + [FIP_GW_LOGIN_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(ADDRESS) | + FIP_MASK(LOGIN) | + FIP_MASK(PARTITION), + .opt_mask = FIP_MASK(EXT_DESC), + }, + [FIP_HOST_LOGOUT_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(VNIC_IDENTITY), + }, + [FIP_GW_UPDATE_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(VHUB_UPDATE), + .opt_mask = FIP_MASK(EXT_DESC), + }, + [FIP_GW_TABLE_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(VHUB_TABLE), + }, + [FIP_HOST_ALIVE_SUB_OPCODE] = { + .req_mask = FIP_MASK(VENDOR_ID) | + FIP_MASK(VNIC_IDENTITY), + }, +}; + +static int type2idx(struct fip_content *fc, struct fip_fip_type *ft) +{ + void *p = ft; + + switch (ft->type) { + case FIP_TYPE(VENDOR_ID): + fc->fvend = p; + return FIP_TYPE_IDX(VENDOR_ID); + case FIP_TYPE(ADDRESS): + fc->fa.fa[fc->fa.num++] = p; + return FIP_TYPE_IDX(ADDRESS); + case FIP_TYPE(GW_INFORMATION): + fc->fgwi = p; + return FIP_TYPE_IDX(GW_INFORMATION); + case FIP_TYPE(LOGIN): + fc->fl = p; + return FIP_TYPE_IDX(LOGIN); + case FIP_TYPE(VHUB_UPDATE): + fc->fvu = p; + return FIP_TYPE_IDX(VHUB_UPDATE); + case FIP_TYPE(VHUB_TABLE): + fc->fvt = p; + return FIP_TYPE_IDX(VHUB_TABLE); + case FIP_TYPE(VNIC_IDENTITY): + fc->fvi = p; + return FIP_TYPE_IDX(VNIC_IDENTITY); + case FIP_TYPE(PARTITION): + fc->fp = p; + return FIP_TYPE_IDX(PARTITION); + case FIP_TYPE(GW_IDENTIFIER): + fc->fgid = p; + return FIP_TYPE_IDX(GW_IDENTIFIER); + case FIP_TYPE(KA_PARAMS): + fc->fka = p; + return FIP_TYPE_IDX(KA_PARAMS); + case FIP_TYPE(EXT_DESC): + fc->fed.fed[fc->fed.num++] = p; + return FIP_TYPE_IDX(EXT_DESC); + default: + return -1; + } +} + +#ifdef CONFIG_MLX4_VNIC_DEBUG +static const char *fip_type_str(int type) +{ + switch (type) { + FIP_CASE_STR(VENDOR_ID); + FIP_CASE_STR(ADDRESS); + FIP_CASE_STR(GW_INFORMATION); + FIP_CASE_STR(LOGIN); + FIP_CASE_STR(VHUB_UPDATE); + FIP_CASE_STR(VHUB_TABLE); + FIP_CASE_STR(VNIC_IDENTITY); + FIP_CASE_STR(PARTITION); + FIP_CASE_STR(GW_IDENTIFIER); + FIP_CASE_STR(KA_PARAMS); + FIP_CASE_STR(EXT_DESC); + default: + return "Unknown"; + } +} + +static const char 
*fip_subcode_str(int subcode) +{ + switch (subcode) { + FIP_SUBCODE_CASE_STR(FIP_HOST_SOL_SUB_OPCODE); + FIP_SUBCODE_CASE_STR(FIP_GW_ADV_SUB_OPCODE); + FIP_SUBCODE_CASE_STR(FIP_HOST_LOGIN_SUB_OPCODE); + FIP_SUBCODE_CASE_STR(FIP_GW_LOGIN_SUB_OPCODE); + FIP_SUBCODE_CASE_STR(FIP_HOST_LOGOUT_SUB_OPCODE); + FIP_SUBCODE_CASE_STR(FIP_GW_UPDATE_SUB_OPCODE); + FIP_SUBCODE_CASE_STR(FIP_GW_TABLE_SUB_OPCODE); + FIP_SUBCODE_CASE_STR(FIP_HOST_ALIVE_SUB_OPCODE); + default: + return "Unknown"; + } +} +#endif + +static int verify_mlx_sig(void *p) +{ + static const char *mlx4_str = "mellanox"; + __be64 mlx_str_64 = *(__be64 *)mlx4_str; + __be64 *sig = p; + + return *sig != mlx_str_64; +} + +static int next_type(struct vnic_port *port, void *tlv, int len, + struct fip_content *fc, int *sz, int *idx) +{ + struct fip_fip_type *ft; + + if (sizeof *ft > len) { + vnic_dbg_parse(port->name, "message too short\n"); + return -1; + } + ft = tlv; + vnic_dbg_parse(port->name, "TLV: type %s(%d)\n", fip_type_str(ft->type), + ft->type); + + if (!ft->length || (ft->length << 2 > len)) { + vnic_dbg_parse(port->name, "TLV does not fit in message: %s(%d) " + "tlv->len %d, remaining %d\n", fip_type_str(ft->type), + ft->type, ft->length << 2, len); + return -1; + } + + *sz = (ft->length << 2); + + *idx = type2idx(fc, ft); + if (*idx < 0) { + vnic_dbg_parse(port->name, "unknown type %d\n", ft->type); + return -1; + } + + if (ft->type == FIP_TYPE(VENDOR_ID) && verify_mlx_sig(fc->fvend->vendor_id)) { + vnic_dbg_parse(port->name, "mellanox signature check failed\n"); + return -1; + } + + if (ft->type == FIP_TYPE(VHUB_TABLE) || ft->type == FIP_TYPE(VHUB_UPDATE)) { + int cte_list_sz; + struct context_table_entry *cte_start; + + if (ft->type == FIP_TYPE(VHUB_TABLE)) { + unsigned hdr = be16_to_cpu(fc->fvt->hdr) >> 14; + + if (hdr > FIP_TABLE_HDR_ONLY) { + vnic_dbg_parse(port->name, "invalid table header %d\n", hdr); + return -1; + } + cte_list_sz = *sz - sizeof(struct fip_vhub_table_tlv); + /* TODO: the next 2 lines are commented out because the size of the tbl tlv is + miscomputed in BXM versions 1.3.6-5 and it causes tables to be discarded. + In reality the size should be used with the lines intact. 
*/ + /*if (hdr == FIP_TABLE_HDR_LAST) + cte_list_sz -= 4; + */ + + cte_start = (struct context_table_entry *)(fc->fvt + 1); + } else { + cte_list_sz = *sz - sizeof(struct fip_vhub_update_tlv); + cte_start = (struct context_table_entry *)(fc->fvu + 1); + } + + + fc->cte.num = cte_list_sz / sizeof(struct context_table_entry); + fc->cte.cte = cte_start; + } + + + return 0; +} + +static inline int check_eoib_ver(struct vnic_port *port, + struct fip_eoib_ver *eoib_ver, int sz, int *len) +{ + if (unlikely(sz < sizeof *eoib_ver)) { + vnic_dbg_parse(port->name, "message too short\n"); + *len = sz; + return -ENOMEM; + } + *len = sizeof *eoib_ver; + if (unlikely(eoib_ver->version >> 4)) { + vnic_dbg_parse(port->name, "eoib version check failed: %d\n", eoib_ver->version >> 4); + return -EINVAL; + } + return 0; +} + +static void dump_raw(struct vnic_port *port, void *buf, int len) +{ + int i; + + for (i = 0; i < len / 4; ++i) + vnic_dbg_parse(port->name, "0x%08x\n", be32_to_cpu(((__be32 *)(buf))[i])); +} + +static inline int check_fip_hdr(struct vnic_port *port, + struct fip_header_simple *fh, int sz, int *len) +{ + if (unlikely(sizeof *fh > sz)) { + vnic_dbg_parse(port->name, "message too short\n"); + return -1; + } + + if (unlikely(fh->opcode != cpu_to_be16(EOIB_FIP_OPCODE))) { + vnic_dbg_parse(port->name, "not fip opcode\n"); + return -1; + } + + if (unlikely((be16_to_cpu(fh->list_length) << 2) > (sz - sizeof *fh))) { + vnic_dbg_parse(port->name, "message too short: header length = %u, " + "left length = %lu\n", + be16_to_cpu(fh->list_length) << 2, sz - sizeof *fh); + return -1; + } + + *len = sizeof *fh; + + return 0; +} + +static int check_fip_mask(struct vnic_port *port, struct fip_content *fc) +{ + u64 req_mask = subcodes_array[fc->fh->subcode].req_mask; + u64 opt_mask = subcodes_array[fc->fh->subcode].opt_mask; + + if (((fc->mask & req_mask) != req_mask) || + ((fc->mask & ~opt_mask) & ~req_mask)) { + vnic_dbg_parse(port->name, "%s: mask check failed: mask 0x%llx," + "req_mask 0x%llx, opt_mask 0x%llx\n", + fip_subcode_str(fc->fh->subcode), fc->mask, req_mask, opt_mask); + return -1; + } + + return 0; +} + +static void dump_cte(struct vnic_port *port, struct context_table_entry *cte) +{ + vnic_dbg_parse(port->name, "CTE: V(%d) RSS(%d) type(%d) MAC(%pM) QPN(0x%06x) SL(%d) LID(0x%04x)\n", + (0x1 & (cte->v_rss_type >> 7)), + (0x1 & (cte->v_rss_type >> 6)), + (cte->v_rss_type & 0xf), + cte->mac, be32_to_cpu(cte->qpn) & 0xffffff, + (cte->sl & 0xf), be16_to_cpu(cte->lid)); +} + +static void dump_vnic_identity(struct vnic_port *port, + struct fip_vnic_identity_tlv *fvi) +{ +#define VHUB_ID be32_to_cpu(fvi->flags_vhub_id) + + vnic_dbg_parse(port->name, "%s: U(%d) R(%d) VP(%d) VHUBID(x%x) TUSN(0x%x) VNIC_ID(0x%x)" + "MAC(%pM) GUID("GUID_FORMAT") VNIC NAME (%s)\n", + fip_type_str(fvi->ft.type), (VHUB_ID >> 31), (0x01 & (VHUB_ID >> 30)), + (0x01 & (VHUB_ID >> 24)), VHUB_ID & 0xffffff, be32_to_cpu(fvi->tusn), + be16_to_cpu(fvi->vnic_id), fvi->mac, GUID_ARG(fvi->port_guid), fvi->vnic_name); +} + +static void dump_vnic_partition(struct vnic_port *port, struct fip_partition_tlv *fp) +{ + vnic_dbg_parse(port->name, "%s: PKEY(0x%x)\n", fip_type_str(fp->ft.type), + be16_to_cpu(fp->pkey)); +} + + +static void dump_gw_identifier(struct vnic_port *port, struct fip_gw_identifier_tlv *fgid) +{ + vnic_dbg_parse(port->name, "%s: SYS GUID("GUID_FORMAT") SYS NAME(%s) GW PORT NAME(%s)\n", + fip_type_str(fgid->ft.type), GUID_ARG(fgid->sys_guid), fgid->sys_name, fgid->sys_name); +} + +static void dump_ka_params(struct 
vnic_port *port, struct fip_ka_params_tlv *fka) +{ + vnic_dbg_parse(port->name, "%s: GW_ADV_PERIOD(%d) GW_KA_PERIOD(%d) VNIC_KA_PERIOD(%d)\n", + fip_type_str(fka->ft.type), be32_to_cpu(fka->adv_period), + be32_to_cpu(fka->ka_period), be32_to_cpu(fka->vnic_ka_period)); +} + +static void dump_vhub_table(struct vnic_port *port, struct fip_content *fc) +{ + int i; + + vnic_dbg_parse(port->name, "%s: VP(%d) vhub id(0x%x) TUSN(0x%x) HDR(%d) table size (%d)\n", + fip_type_str(fc->fvt->ft.type), be32_to_cpu(fc->fvt->vp_vhub_id) >> 24 & 1, + be32_to_cpu(fc->fvt->vp_vhub_id) & 0xffffff, be32_to_cpu(fc->fvt->tusn), + be16_to_cpu(fc->fvt->hdr) >> 14, be16_to_cpu(fc->fvt->table_size)); + for (i = 0; i < fc->cte.num; ++i) + dump_cte(port, &fc->cte.cte[i]); +} + +static void dump_fip_login(struct vnic_port *port, struct fip_login_tlv *p) +{ + vnic_dbg_parse(port->name, "%s: mtu(%d) vnic_id(0x%x) v_m_vp_h(0x%x) vlan(0x%x) mac(%pM)" + "mgid_prefix("MGID_PREFIX_FMT") vfields(0x%0x) syndrom(%d) QPN(0x%x)" + " vnic_name(%s)\n", fip_type_str(p->ft.type), be16_to_cpu(p->mtu), + be16_to_cpu(p->vnic_id), be16_to_cpu(p->flags_vlan) >> 12, + be16_to_cpu(p->flags_vlan) & 0xfff, p->mac, MGID_PRE_ARG(p->eth_gid_prefix), + be16_to_cpu(p->vfields), be32_to_cpu(p->syndrom_ctrl_qpn) >> 24, + be32_to_cpu(p->syndrom_ctrl_qpn) & 0xffffff, p->vnic_name); +} + +static void dump_fip_address(struct vnic_port *port, struct fip_address_tlv *fa) +{ + vnic_dbg_parse(port->name, "%s: GW_TYPE(%d) QPN(0x%x) SL(%d), GW_PORT_ID(0x%x)," + " LID(0x%x) GUID(" GUID_FORMAT ")\n", fip_type_str(fa->ft.type), + be32_to_cpu(fa->gwtype_qpn) >> 24, be32_to_cpu(fa->gwtype_qpn) & 0xffffff, + be16_to_cpu(fa->sl_gwportid) >> 12, be16_to_cpu(fa->sl_gwportid) & 0xfff, + be16_to_cpu(fa->lid), GUID_ARG(fa->guid)); +} + +static void dump_vhub_update(struct vnic_port *port, struct fip_content *fc) +{ +#define VHUB_ID_1 be32_to_cpu(fc->fvu->state_vhub_id) + int i; + + vnic_dbg_parse((port->name), "%s: eport_state(%s) vp(%d) vhub_id(0x%x) tusn(0x%x)\n", + fip_type_str(fc->fvu->ft.type), eport_state_str(VHUB_ID_1 >> 28 & 3), + VHUB_ID_1 >> 24 & 1, VHUB_ID_1 & 0xffffff, be32_to_cpu(fc->fvu->tusn)); + for (i = 0; i < fc->cte.num; ++i) + dump_cte(port, &fc->cte.cte[i]); +} + +static void dump_gateway_information(struct vnic_port *port, + struct fip_gw_information_tlv *fgwi) +{ + vnic_dbg_parse(port->name, "%s: accept host administered(%s) nmac_mgid(%d) " + "nrss_mgid(%d) ntss_qpn(%d), n_rss(%d), num_net_vnics(%d)\n", + fip_type_str(fgwi->ft.type), (fgwi->h_nmac_mgid >> 7) ? 
"Yes" : "No", + fgwi->h_nmac_mgid & 0x3f, fgwi->n_rss_mgid_tss_qpn >> 4, + fgwi->n_rss_mgid_tss_qpn & 0xf, be16_to_cpu(fgwi->n_rss_qpn_vnics) >> 12, + be16_to_cpu(fgwi->n_rss_qpn_vnics) & 0xfff); +} + +static void dump_fip_packet(struct vnic_port *port, struct fip_content *fc) +{ + int i; + + for (i = 0; i < fc->fa.num; ++i) + dump_fip_address(port, fc->fa.fa[i]); + + if (fc->fgwi) + dump_gateway_information(port, fc->fgwi); + + if (fc->fvu) + dump_vhub_update(port, fc); + + if (fc->fl) + dump_fip_login(port, fc->fl); + + if (fc->fvt) + dump_vhub_table(port, fc); + + if (fc->fvi) + dump_vnic_identity(port, fc->fvi); + + if (fc->fp) + dump_vnic_partition(port, fc->fp); + + if (fc->fgid) + dump_gw_identifier(port, fc->fgid); + + if (fc->fka) + dump_ka_params(port, fc->fka); +} + +int fip_packet_parse(struct vnic_port *port, void *packet, int pkt_size, struct fip_content *fc) +{ + void *ptr = packet; + int len; + int err; + int idx; + u16 offset = 0; + int size = pkt_size; + + vnic_dbg_parse(port->name, "size = %d\n", size); + err = check_eoib_ver(port, ptr, size, &len); + if (err) { + if (err != -EINVAL) + goto out_err; + else + vnic_dbg_parse(port->name, "version check failed\n"); + } + + fc->eoib_ver = ptr; + size -= len; + ptr += len; + offset += len; + fc->fh = ptr; + + err = check_fip_hdr(port, ptr, size, &len); + if (err) + goto out_err; + + ptr += len; + offset += len; + + fc->fa.num = 0; + fc->num = 0; + fc->mask = 0; + + /* workaround a BXM bug not reporting the correct descriptor length */ + if (fc->fh->subcode != FIP_GW_ADV_SUB_OPCODE) + size = be16_to_cpu(fc->fh->list_length) << 2; + else + size -= len; + + vnic_dbg_parse(port->name, "subcode = %s, size %d\n", + fip_subcode_str(fc->fh->subcode), size); + while (size > 0) { + err = next_type(port, ptr, size, fc, &len, &idx); + if (err) + break; + + fc->offsets[fc->num] = offset; + fc->mask |= ((u64)1 << idx); + ptr += len; + size -= len; + offset += len; + fc->num++; + } + + if (err) + goto out_err; + + err = check_fip_mask(port, fc); + if (err) { + vnic_dbg_parse(port->name, "check mask: failed\n"); + goto out_err; + } + + dump_fip_packet(port, fc); + + return 0; + +out_err: + dump_raw(port, packet, pkt_size); + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic.h b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic.h new file mode 100644 index 0000000000000..04a5e832d4222 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic.h @@ -0,0 +1,1437 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef VNIC_H +#define VNIC_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* for mlx4_ib dev attr, used also in vnic_qp.c */ +#include "../../../../infiniband/hw/mlx4/mlx4_ib.h" +#include "../../../../infiniband/hw/mlx4/user.h" + +#include "vnic_utils.h" + +/* driver info definition */ +#define DRV_NAME "mlx4_vnic" +#define DRV_VER "1.4.0" +#define DRV_LIC "Dual BSD/GPL" +#define DRV_DESC "Mellanox BridgeX Virtual NIC Driver" +#define DRV_AUTH "Ali Ayoub & Gabi Liron" + +/* backports */ + +/* for kernel >= 3.17 */ +#define alloc_netdev_mqs(a, b, c, d, e) alloc_netdev_mqs(a, b, NET_NAME_UNKNOWN, c, d, e) + +#ifdef alloc_netdev_mq +#undef alloc_netdev_mq +#define alloc_netdev_mq(sizeof_priv, name, setup, count) \ + alloc_netdev_mqs(sizeof_priv, name, setup, count, count) +#endif + +#ifndef SET_ETHTOOL_OPS +#define SET_ETHTOOL_OPS(netdev,ops) \ + ( (netdev)->ethtool_ops = (ops) ) +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35)) +#define _BP_NO_MC_LIST + +// Not sure this should be here at least this is ok for 2.6.39 +#define _BP_NO_ATT_OWNER +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0)) +#define _BP_NO_GRO +#endif + +#ifndef NETIF_F_HW_VLAN_FILTER +#define NETIF_F_HW_VLAN_FILTER NETIF_F_HW_VLAN_CTAG_FILTER +#endif + +/* externs */ +extern u32 vnic_msglvl; +extern u32 vnic_max_tx_outs; +extern u32 vnic_lro_num; +extern u32 vnic_mcast_create; +extern u32 vnic_net_admin; +extern u32 vnic_child_max; +extern u32 vnic_napi_weight; +extern u32 vnic_linear_small_pkt; +extern u32 vnic_tx_rings_num; +extern u32 vnic_rx_rings_num; +extern u32 vnic_tx_rings_len; +extern u32 vnic_rx_rings_len; +extern u32 vnic_mgid_data_type; +extern u32 vnic_encap_headroom; +extern u32 vnic_tx_polling; +extern u32 vnic_rx_linear; +extern u32 vnic_change_mac; +extern u32 vnic_learn_mac_enabled; +extern u32 vnic_synd_backlog; +extern u32 vnic_eport_state_enforce; +extern u32 vnic_src_mac_enforce; +extern u32 vnic_inline_tshold; + +#define MAX_NUM_PKEYS_DISCOVERY (24) +#define ILLEGAL_PKEY_INDEX (0xFFFF) +extern u32 vnic_discovery_pkeys[MAX_NUM_PKEYS_DISCOVERY]; +extern u32 vnic_discovery_pkeys_count; +extern u32 vnic_sa_query; + + +extern u32 no_bxm; + +extern struct workqueue_struct *port_wq; +extern struct workqueue_struct *fip_wq; +extern struct workqueue_struct *mcast_wq; +extern struct workqueue_struct *login_wq; + +extern struct ib_sa_client vnic_sa_client; + +/* definitions */ +#define EOIB_SERVICE_ID ((0x10ULL << 56) | (0x0002C9E01B0000ULL)) +#define EOIB_CTRL_SERVICE_ID (EOIB_SERVICE_ID | 0x00FFULL) +#define VNIC_SKB_QUEUE_LEN 32 +#define VNIC_CNT_MAX 32 +#define VNIC_DESC_LEN (64 + 4) +#define VNIC_NAME_LEN 16 /* by spec, use IFNAMSIZ for OS */ +#define VNIC_SYSFS_FLEN (VNIC_NAME_LEN * 2) /* SYSFS file name len, allow pre/suffix (32)*/ +#define VNIC_SYSFS_LLEN 64 
+#define VNIC_VENDOR_LEN 8 +#define GID_LEN 16 +#define GUID_LEN 8 +#define IPV4_LEN 4 +#define IPV6_LEN 16 +#define VNIC_SYSTEM_NAME_LEN 32 +#define VNIC_GW_PORT_NAME_LEN 8 +#define GID_PREFIX_LEN 5 +#define VNIC_MAX_DENTRIES 16 +#define VNIC_ID_LEN 16 +#define VNIC_CHILD_MAX 128 +#define VNIC_MAX_RETRIES 0 /* zero = unlimited */ +#define VNIC_WATCHDOG_TIMEOUT (25 * HZ) /* 25 sec */ +#define VNIC_NAPI_SCHED_TIMEOUT (5) +#define FIP_MAX_VNICS_PER_GW (1 << 9) +#define NOT_AVAILABLE_NUM (-1) +#define NOT_AVAILABLE_STRING "N/A" +#define is_valid_str(str) (strcmp(str, NOT_AVAILABLE_STRING)) +#define is_valid_num(num) (num != NOT_AVAILABLE_NUM) +#define is_valid_guid(arr) (!!(*((u64 *)(arr)))) +#define is_valid_ipv4(arr) (!!(*((u32 *)(arr)))) +#define is_mcast_promisc(login) (!(login->n_mac_mcgid)) +#define is_ucast_promisc(login) (!!(login->dev->flags & IFF_PROMISC)) +#define ARRAY_LEN(_x) (sizeof(_x)/sizeof(_x[0])) + +/* TODO: cleanup VNIC_GID_RAW_ARG and friends */ +#define VNIC_GID_RAW_ARG(gid) ((u8 *)(gid))[0], \ + ((u8 *)(gid))[1], \ + ((u8 *)(gid))[2], \ + ((u8 *)(gid))[3], \ + ((u8 *)(gid))[4], \ + ((u8 *)(gid))[5], \ + ((u8 *)(gid))[6], \ + ((u8 *)(gid))[7], \ + ((u8 *)(gid))[8], \ + ((u8 *)(gid))[9], \ + ((u8 *)(gid))[10],\ + ((u8 *)(gid))[11],\ + ((u8 *)(gid))[12],\ + ((u8 *)(gid))[13],\ + ((u8 *)(gid))[14],\ + ((u8 *)(gid))[15] +#define VNIC_GUID_RAW_ARG(gid) ((u8 *)(gid))[0], \ + ((u8 *)(gid))[1], \ + ((u8 *)(gid))[2], \ + ((u8 *)(gid))[3], \ + ((u8 *)(gid))[4], \ + ((u8 *)(gid))[5], \ + ((u8 *)(gid))[6], \ + ((u8 *)(gid))[7] + +#define VNIC_GID_ARG(gid) VNIC_GID_RAW_ARG((gid).raw) +#define VNIC_GID_FMT "%.2x:%.2x:%.2x:%.2x:" \ + "%.2x:%.2x:%.2x:%.2x:" \ + "%.2x:%.2x:%.2x:%.2x:" \ + "%.2x:%.2x:%.2x:%.2x" +#define VNIC_GUID_FMT "%.2x:%.2x:%.2x:%.2x:" \ + "%.2x:%.2x:%.2x:%.2x" + +#define MAC_6_PRINT_FMT "%.2x:%.2x:%.2x:%.2x:" \ + "%.2x:%.2x" +#define MAC_6_PRINT_ARG(mac) (mac)[0], (mac)[1], (mac)[2], \ + (mac)[3], (mac)[4], (mac)[5] + +#define IP_4_PRINT_FMT "%d.%d.%d.%d" +#define IP_4_PRINT_ARG(ip) (ip)[0], (ip)[1], (ip)[2], (ip)[3] + +#define CREATE_VHUB_ID(be_vlan, port_id) \ + ((be16_to_cpu(be_vlan) & 0xFFF) | (((port_id) & 0xFFF) << 12)) +#define CREATE_VHUB_ID_BE(vlan, port_id) \ + cpu_to_be32(CREATE_VHUB_ID(vlan, port_id)) +#define ROUNDUP_LOG2(x) ilog2(roundup_pow_of_two(x)) + +#define VNIC_RX_COAL_TARGET 0x20000 +#define VNIC_RX_COAL_TIME 0x10 +#define VNIC_TX_COAL_PKTS 64 +#define VNIC_TX_COAL_TIME 0x80 +#define VNIC_RX_RATE_LOW 400000 +#define VNIC_RX_COAL_TIME_LOW 0 +#define VNIC_RX_RATE_HIGH 450000 +#define VNIC_RX_COAL_TIME_HIGH 128 +#define VNIC_RX_SIZE_THRESH 1024 +#define VNIC_RX_RATE_THRESH (1000000 / VNIC_RX_COAL_TIME_HIGH) +#define VNIC_SAMPLE_INTERVAL 0 +#define VNIC_AVG_PKT_SMALL 256 +#define VNIC_AUTO_CONF 0xffff +#define VNIC_MCAST_MAX_RETRY 60 +#define VNIC_MCAST_ULIMIT_RETRY 0 +#define VNIC_MCAST_BACKOF_FAC 2 +#define MLX4_DEV_CAP_FLAG_UD_SWP (1 << 28) +#define VNIC_ETHTOOL_LINE_MAX 32 +#define VNIC_ENCAP_LEN 4 +#define VNIC_MAX_TX_SIZE 2048 +#define VNIC_MAX_RX_SIZE 4096 +#define ETH_LLC_SNAP_SIZE 8 + +#define VNIC_SM_HEADSTART 250 /* msecs to actually start handling SM events */ +#define VNIC_MCAST_BACKOFF_MSEC 1000 +#define VNIC_MCAST_BACKOFF_MAX_MSEC 16000 + +#define SYSFS_VLAN_ID_NO_VLAN (-1) + +#define VNIC_MAX_PAYLOAD_SIZE 4096 +#define VNIC_BUF_SIZE(_port) (min(_port->max_mtu_enum + \ + IB_GRH_BYTES, VNIC_MAX_PAYLOAD_SIZE)) + +#define VNIC_TX_QUEUE_LEN 1024 /* default, tuneable */ +#define VNIC_TX_QUEUE_LEN_MIN 64 +#define VNIC_TX_QUEUE_LEN_MAX (8 * 
1024) + +#define VNIC_RX_QUEUE_LEN 2048 /* default, tuneable */ +#define VNIC_RX_QUEUE_LEN_MIN 64 +#define VNIC_RX_QUEUE_LEN_MAX (8 * 1024) + + +#define VNIC_MODER_DELAY (HZ / 4) +#define VNIC_STATS_DELAY VNIC_MODER_DELAY + +#define VNIC_AH_SL_DEFAULT 0x0 + +#define VNIC_DATA_QKEY 0x80020003 +#define VNIC_FIP_QKEY 0x80020002 +#define VNIC_VLAN_OFFSET(login) (login->vlan_used ? VLAN_HLEN : 0) +#define VNIC_VLAN_ENABLED(login) (login->vlan_used ? 1 : 0) +#define VNIC_MAX_TX_CQE 32 /* default, tuneable */ +#define VNIC_MAX_RX_CQE 64 /* default, tuneable */ +#define VNIC_MAX_NUM_CPUS 32 +#define VNIC_MAX_INLINE_TSHOLD 512 + +#define VNIC_EOIB_HDR_VER 0x0 +#define VNIC_EOIB_HDR_SIG 0x3 +#define VNIC_EOIB_HDR_UDP_CHK_OK 0x2 +#define VNIC_EOIB_HDR_TCP_CHK_OK 0x1 +#define VNIC_EOIB_HDR_IP_CHK_OK 0x1 + +#define VNIC_EOIB_HDR_GET_IP_CHK(eoib_hdr) (eoib_hdr->encap_data & 0x3) +#define VNIC_EOIB_HDR_GET_TCP_UDP_CHK(eoib_hdr) ((eoib_hdr->encap_data >> 2) & 0x3) +#define VNIC_EOIB_HDR_GET_VER(eoib_hdr) ((eoib_hdr->encap_data >> 4) & 0x3) +#define VNIC_EOIB_HDR_GET_SIG(eoib_hdr) ((eoib_hdr->encap_data >> 6) & 0x3) + +#define VNIC_EOIB_HDR_SET_IP_CHK_OK(eoib_hdr) (eoib_hdr->encap_data = \ + (eoib_hdr->encap_data & 0xFC) | VNIC_EOIB_HDR_IP_CHK_OK) +#define VNIC_EOIB_HDR_SET_TCP_CHK_OK(eoib_hdr) (eoib_hdr->encap_data = \ + (eoib_hdr->encap_data & 0xF3) | (VNIC_EOIB_HDR_TCP_CHK_OK << 2)) +#define VNIC_EOIB_HDR_SET_UDP_CHK_OK(eoib_hdr) (eoib_hdr->encap_data = \ + (eoib_hdr->encap_data & 0xF3) | (VNIC_EOIB_HDR_UDP_CHK_OK << 2)) + +#define VNIC_IP_CSUM_OK(eoib_hdr) ((VNIC_EOIB_HDR_GET_IP_CHK(eoib_hdr)) == VNIC_EOIB_HDR_IP_CHK_OK) +#define VNIC_TCP_CSUM_OK(eoib_hdr) ((VNIC_EOIB_HDR_GET_TCP_UDP_CHK(eoib_hdr)) == VNIC_EOIB_HDR_TCP_CHK_OK) +#define VNIC_UDP_CSUM_OK(eoib_hdr) ((VNIC_EOIB_HDR_GET_TCP_UDP_CHK(eoib_hdr)) == VNIC_EOIB_HDR_UDP_CHK_OK) +#define VNIC_CSUM_OK(eoib_hdr) (VNIC_IP_CSUM_OK(eoib_hdr) && \ + (VNIC_TCP_CSUM_OK(eoib_hdr) || \ + VNIC_UDP_CSUM_OK(eoib_hdr))) +#define VNIC_EOIB_ZLEN_MAX (ETH_ZLEN + VNIC_ENCAP_LEN + VLAN_HLEN) + +#define VNIC_SKB_GET_HASH(_skb, _max) ((*(u32 *)(_skb->cb + sizeof _skb->cb - 4)) % _max) +#define VNIC_SKB_SET_HASH(_skb, _hash) ((*(u32 *)(_skb->cb + sizeof _skb->cb - 4)) = _hash) +#define VNIC_SKB_GET_ENCAP_CB(_skb) ((struct eoibhdr *)(_skb->cb + sizeof _skb->cb - 12)) +#define VNIC_SKB_GET_ENCAP(_skb) (vnic_encap_headroom ? (struct eoibhdr *)(_skb->data) : VNIC_SKB_GET_ENCAP_CB(_skb)) +#define VNIC_SKB_GET_ENCAP_OFFSET (vnic_encap_headroom ? VNIC_ENCAP_LEN :0) + +#define VNIC_NEIGH_GET_DQPN(_skb, _neighe) ((_neighe->rss) ? 
(_neighe->qpn + \ + VNIC_SKB_GET_HASH(_skb, _neighe->login->qps_num)) : (_neighe->qpn)) + +#define vnic_netdev_priv(netdev) (((struct vnic_login_info *)netdev_priv(netdev))->login) +#ifndef _BP_NETDEV_NO_TMQ /* >= 2.6.27 */ +#define VNIC_TXQ_GET_HASH(_skb, _max) (skb_get_queue_mapping(_skb)) +#define VNIC_TXQ_ALLOC_NETDEV(sz, nm, sp, qm) alloc_netdev_mq(sz, nm, sp, qm) +#define VNIC_TXQ_SET_ACTIVE(login, num) (login->dev->real_num_tx_queues = \ + login->real_tx_rings_num = \ + login->ndo_tx_rings_num = num) +#define VNIC_TXQ_GET_ACTIVE(login) (login->real_tx_rings_num) +#define VNIC_TXQ_GET(tx_res) netdev_get_tx_queue(tx_res->login->dev, tx_res->index) +#define VNIC_TXQ_STOP(tx_res) netif_tx_stop_queue(VNIC_TXQ_GET(tx_res)) +#define VNIC_TXQ_STOP_ALL(login) netif_tx_stop_all_queues(login->dev) +#define VNIC_TXQ_START(tx_res) netif_tx_start_queue(VNIC_TXQ_GET(tx_res)) +#define VNIC_TXQ_START_ALL(login) netif_tx_start_all_queues(login->dev) +#define VNIC_TXQ_STOPPED(tx_res) netif_tx_queue_stopped(VNIC_TXQ_GET(tx_res)) +#define VNIC_TXQ_WAKE(tx_res) netif_tx_wake_queue(VNIC_TXQ_GET(tx_res)) +#else +#define VNIC_TXQ_GET_HASH(skb, _max) VNIC_SKB_GET_HASH(skb, _max) +#define VNIC_TXQ_ALLOC_NETDEV(sz, nm, sp, qm) alloc_netdev(sz, nm, sp) +#define VNIC_TXQ_SET_ACTIVE(login, num) do { login->real_tx_rings_num = num; \ + login->ndo_tx_rings_num = 1; \ + } while (0) +#define VNIC_TXQ_GET_ACTIVE(login) (login->real_tx_rings_num) +#define VNIC_TXQ_STOP(tx_res) netif_stop_queue(tx_res->login->dev) +#define VNIC_TXQ_STOP_ALL(login) netif_stop_queue(login->dev) +#define VNIC_TXQ_START(tx_res) netif_start_queue(tx_res->login->dev) +#define VNIC_TXQ_START_ALL(login) netif_start_queue(login->dev) +#define VNIC_TXQ_STOPPED(tx_res) netif_queue_stopped(tx_res->login->dev) +#define VNIC_TXQ_WAKE(tx_res) netif_wake_queue(tx_res->login->dev) +#endif + +#define VNIC_ALLOC_ORDER 2 +#define VNIC_ALLOC_SIZE (PAGE_SIZE << VNIC_ALLOC_ORDER) +#define VNIC_MAX_LRO_AGGR 64 +#define VNIC_MAX_RX_FRAGS 4 +#define VNIC_MAX_TX_FRAGS (MAX_SKB_FRAGS + 2) +#define VNIC_MGID_PREFIX_LEN 5 + +/* TODO, when set VNIC_MAX_TX_OUTS to 16, + * noticed that the last CQE overwrites the first one + */ +#define VNIC_MAX_TX_OUTS 8 /* default, tuneable */ +#define VNIC_MAX_LRO_DESCS 32 /* default, tuneable */ +#define VNIC_EOIB_HDR_SIZE (IB_GRH_BYTES + VNIC_ENCAP_LEN) +#define SMALL_PACKET_SIZE (256 - NET_IP_ALIGN) +#define HEADER_COPY_SIZE (128 - NET_IP_ALIGN) +#define MAX_HEADER_SIZE 64 + +#define LAG_MAP_TABLE_SIZE 32 +#define MAX_LAG_MEMBERS 16 + +#define VNIC_FW_STR_MAX VNIC_ETHTOOL_LINE_MAX +#define VNIC_FW_STR(u64_fw_ver, str) \ +do { \ + snprintf(str, VNIC_FW_STR_MAX, "%d.%d.%d", \ + (int)(u64_fw_ver >> 32), \ + (int)(u64_fw_ver >> 16) & 0xffff, \ + (int)(u64_fw_ver & 0xffff)); \ +} while (0); +#define VNIC_STR_STRIP(str) \ +do { \ + int i; \ + for (i = 0; i < strlen(str); ++i) \ + str[i] = str[i] == '\n' ? 
' ' : str[i]; \ +} while (0); + +/* well known addresses */ +static const u8 ETH_BCAST_MAC[] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}; + +static const u8 ETH_ZERO_MAC[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +/* this used in no_bxm mode only */ +static const u8 NO_BXM_MGID_PREFIX[] = { + 0xff, 0x13, 0xe0, 0x1b, 0x00 +}; + +#define IS_ZERO_MAC(mac) (!memcmp((mac), ETH_ZERO_MAC, ETH_ALEN)) +#define IS_BCAST_MAC(mac) (!memcmp((mac), ETH_BCAST_MAC, ETH_ALEN)) +#define IS_MCAST_MAC(mac) (((unsigned char *)(mac))[0] & 0x01) +#define IS_UCAST_MAC(mac) (!(IS_MCAST_MAC(mac))) +#define IS_NEIGH_QUERY_RUNNING(neigh) \ + (neigh->query_id >= 0 && !IS_ERR(neigh->pquery) && neigh->pquery) + +struct mcast_root { + struct rb_root mcast_tree; + spinlock_t mcast_rb_lock; + struct list_head reattach_list; +}; + +/* structs */ +struct vnic_port_stats { + unsigned long gro_held; + unsigned long gro_merged; + unsigned long gro_normal; + unsigned long gro_drop; + unsigned long lro_aggregated; + unsigned long lro_flushed; + unsigned long lro_no_desc; + unsigned long tso_packets; + unsigned long queue_stopped; + unsigned long wake_queue; + unsigned long tx_timeout; + unsigned long rx_chksum_good; + unsigned long rx_chksum_none; + unsigned long tx_chksum_offload; + unsigned long sig_ver_err; + unsigned long vlan_err; + unsigned long shared_packets; + unsigned long runt_packets; + unsigned long realloc_packets; + unsigned long gw_tx_packets; + unsigned long gw_tx_bytes; +}; + +#define VNIC_STATS_DO_ADD(var, val) ((var) += (unsigned long)(val)) +#define VNIC_STATS_DO_INC(var) (++(var)) +#ifdef VNIC_EXTRA_STATS /* for performance */ +#define VNIC_STATS_ADD(var, val) ((var) += (unsigned long)(val)) +#define VNIC_STATS_INC(var) (++(var)) +#else +#define VNIC_STATS_ADD(var, val) do { } while (0) +#define VNIC_STATS_INC(var) do { } while (0) +#endif + +enum { + MCAST_ATTACHED, + MCAST_JOINED, + MCAST_JOIN_STARTED, + MCAST_JOIN_RUNNING, + MCAST_ATTACH_RUNNING, +}; + +struct vnic_port_mcast { + struct rb_node rb_node; + struct list_head list; + union ib_gid gid; + struct vnic_port *port; + struct completion leave_complete; + struct completion join_event_complete; + struct ib_sa_multicast *sa_mcast; + struct ib_sa_mcmember_rec rec; + + atomic_t ref_cnt; + struct delayed_work join_task; + struct work_struct leave_task; + unsigned long join_task_cnt; + long int state; + spinlock_t lock; + u8 join_state; + /* IN */ + unsigned long backoff; + unsigned long backoff_init; + unsigned long backoff_factor; + unsigned long retry; + u16 pkey; + u32 qkey; + u8 create; +}; + +struct vnic_mcast { + struct vnic_port_mcast *port_mcaste; + u32 qkey; + u16 pkey; + struct ib_qp *qp; + struct vnic_port *port; + struct ib_ah *ah; + struct completion attach_complete; + struct delayed_work attach_task; + struct delayed_work detach_task; + unsigned long attach_task_cnt; + struct rb_node rb_node; + struct list_head list; /* used when delete all */ + /* IN */ + u8 mac[ETH_ALEN]; + union ib_gid gid; + union ib_gid port_gid; + unsigned long backoff; + unsigned long backoff_init; + unsigned backoff_factor; + unsigned long retry; + unsigned long state; + u8 blocking; + void *attach_cb_ctx; + void *detach_cb_ctx; + void (*attach_cb) (struct vnic_mcast *mcaste, void *ctx); + void (*detach_cb) (struct vnic_mcast *mcaste, void *ctx); + u8 create; + u8 join_state; + void *priv_data; + spinlock_t lock; + int attach_bit_nr; + unsigned long *req_attach; + unsigned long *cur_attached; + int sender_only; +}; + +struct vnic_mac { + struct rb_node rb_node; 
/* list or RB tree */ + struct list_head list; + u16 vnic_id; /* needed for vnic child removal */ + u8 mac[ETH_ALEN]; /* key */ + unsigned long created; + unsigned long last_tx; // use jiffies_to_timeval +}; + +struct lag_properties { + u16 hash_mask; + u8 weights_policy; + u8 ca; /* congestion aware */ + u8 ca_thresh; +}; + +struct vnic_neigh { + struct neighbour *neighbour; + struct ib_ah *ah; + struct vnic_login *login; + struct rb_node rb_node; + struct ib_sa_query *pquery; + struct completion query_comp; + int query_id; + struct sk_buff_head pkt_queue; + struct delayed_work destroy_task; + u8 valid; + u32 qpn; + u16 lid; + u8 sl; /* only for debug */ + u8 mac[ETH_ALEN]; + u8 rss; + u16 info; +}; + +enum lag_gw_state { + GW_MEMBER_INFO_CREATED = 1 << 0, + GW_MEMBER_INFO_EPORT_UP = 1 << 1, + GW_MEMBER_INFO_MCAST = 1 << 2, + GW_MEMBER_INFO_MAPPED = 1 << 3, +}; + +struct vnic_gw_info { + enum lag_gw_state info; + int member_id; + u16 gw_id; + struct vnic_neigh neigh; +}; + +struct vnic_sysfs_attr { + void *ctx; + struct kobject *kobj; + unsigned long data; + char name[VNIC_SYSFS_FLEN]; + struct module_attribute dentry; + struct device *dev; +}; + +enum gw_ext_lag_hash_policy { + GW_LAG_HASH_DMAC = 1 << 0, + GW_LAG_HASH_SMAC = 1 << 1, + GW_LAG_HASH_TPID = 1 << 2, /* ethertype */ + GW_LAG_HASH_VID = 1 << 3, + GW_LAG_HASH_SIP = 1 << 4, + GW_LAG_HASH_DIP = 1 << 5, + GW_LAG_HASH_IP_NEXT = 1 << 6, + GW_LAG_HASH_SPORT = 1 << 7, + GW_LAG_HASH_DPORT = 1 << 8, + GW_LAG_LAYER_2_3 = 0x1f0 +}; + +struct vnic_tx_buf { + struct sk_buff *skb; + u64 mapping[VNIC_MAX_TX_FRAGS]; + u8 ip_off; + u8 ip6_off; + u8 tcp_off; + u8 udp_off; + void *phead; + int hlen; +}; + +enum { +#if 1 + FRAG_SZ0 = 536 - NET_IP_ALIGN, /* so 1500 mtu fits in first 2 frags */ + FRAG_SZ1 = 1024, + FRAG_SZ2 = 2048, + FRAG_SZ3 = 4096 - FRAG_SZ2 - FRAG_SZ1 - FRAG_SZ0 +#else + FRAG_SZ0 = 512 - NET_IP_ALIGN, + FRAG_SZ1 = 1024, + FRAG_SZ2 = 2048, + FRAG_SZ3 = 4096 << VNIC_ALLOC_ORDER +#endif +}; + +struct vnic_frag_info { + u16 frag_size; + u16 frag_prefix_size; + u16 frag_stride; + u16 frag_align; + u16 last_offset; +}; + +struct vnic_rx_alloc { + struct page *page; + u16 offset; +}; + +struct vnic_frag_data { + struct skb_frag_struct frags[VNIC_MAX_RX_FRAGS]; + u64 dma_addr[VNIC_MAX_RX_FRAGS]; + struct sk_buff *skb; /* used only for linear buffers mode */ +}; + +struct vnic_rx_ring { + struct vnic_port *port; + int index; + struct vnic_rx_alloc page_alloc[VNIC_MAX_RX_FRAGS]; + + u32 size; /* number of RX descs */ + spinlock_t lock; + struct vnic_frag_data *rx_info; + + struct vnic_frag_info frag_info[VNIC_MAX_RX_FRAGS]; + u32 rx_skb_size; + u16 log_rx_info; + u16 num_frags; + + struct ib_recv_wr wr; + struct ib_sge sge[VNIC_MAX_RX_FRAGS]; + + struct ib_srq *srq; + struct net_device_stats stats; +}; + +/* vnic states + these values can be used only in struct fip_vnic_data.login_state */ +enum { + VNIC_STATE_LOGIN_OFF = 0, + VNIC_STATE_LOGIN_PRECREATE_1, + VNIC_STATE_LOGIN_PRECREATE_2, + VNIC_STATE_LOGIN_CREATE_1, + VNIC_STATE_LOGIN_CREATE_2, + VNIC_STATE_LOGIN_BCAST_ATTACH = 31 +}; + +/* netdevice open state, depends on calls to open/stop + these values can be used only in struct vnic_login.netdev_state */ +enum { + VNIC_STATE_NETDEV_OFF = 0, + VNIC_STATE_NETDEV_OPEN_REQ, + VNIC_STATE_NETDEV_OPEN, + VNIC_STATE_NETDEV_CARRIER_ON, + VNIC_STATE_NETDEV_NO_TX_ENABLE = 31 +}; + +struct vnic_rx_res { + struct vnic_login *login; + struct ib_cq *cq; + struct net_lro_mgr lro; + struct net_lro_desc lro_desc[VNIC_MAX_LRO_DESCS]; + struct ib_wc 
recv_wc[VNIC_MAX_RX_CQE]; + int index; + int stopped; +#ifndef _BP_NAPI_POLL + struct napi_struct napi; +#else + struct net_device *poll_dev; +#endif +}; + +struct vnic_tx_res { + struct vnic_tx_buf *tx_ring; + struct ib_sge tx_sge[VNIC_MAX_TX_FRAGS]; + struct ib_wc send_wc[VNIC_MAX_TX_CQE]; + struct ib_send_wr tx_wr; + struct vnic_login *login; + struct ib_cq *cq; + unsigned tx_head; + unsigned tx_tail; + unsigned tx_outstanding; + unsigned tx_stopped_cnt; + struct net_device_stats stats; + struct ib_ah_attr mcast_av; + u8 lso_hdr[VNIC_MAX_PAYLOAD_SIZE]; + int index; + int stopped; + spinlock_t lock; +}; + +#ifdef VNIC_PROFILLNG +#define VNIC_PROFILLNG_SKB_MAX 100 +struct vnic_prof_skb_entry { + struct sk_buff skb; + struct timespec tstamp; + unsigned long jiffies; + int cnt; + u8 nr_frags; +}; +#endif + +struct vnic_qp_res { + struct vnic_login *login; + struct ib_qp *qp; + struct completion last_wqe_complete; + int tx_index; + int rx_index; +}; + +/* + * Wrapper struct for vnic_login, used as netdev private data. + * Some kernels (such as 2.6.18-194.26.1) don't allow a private + * data struct longer than 64KB (NETDEV_PRIV_LEN_MAX), so + * we allocate the private data separately to work around this limit. + */ +struct vnic_login_info { + struct vnic_login *login; +}; + +struct vnic_login { + spinlock_t lock; + spinlock_t stats_lock; + struct net_device *dev; + struct ethtool_drvinfo drvinfo; + struct vnic_port *port; + char desc[VNIC_DESC_LEN]; + struct fip_vnic_data *fip_vnic; /* for ethtool/sysfs */ + int queue_stopped; + unsigned long netdev_state; + char name[VNIC_NAME_LEN]; + char vnic_name[VNIC_NAME_LEN]; + char vendor_id[VNIC_VENDOR_LEN]; + struct vnic_neigh *gw_neigh; + struct vnic_gw_info lag_gw_neigh[MAX_LAG_MEMBERS]; + struct lag_properties lag_prop; + int is_lag; + int lag_gw_map[LAG_MAP_TABLE_SIZE]; + int lag_member_count; + int lag_member_active_count; + union ib_gid gw_mgid; + int promisc; + union ib_gid gid; + __be16 vid; + u8 vlan_used; + u32 qkey; + u16 pkey; + u16 pkey_index; + u64 gw_guid; + u8 mgid_prefix[VNIC_MGID_PREFIX_LEN]; + u8 n_mac_mcgid; + u8 sl; + u16 gw_port_id; + u16 vnic_id; + unsigned int max_mtu; + int zlen; + int cnt; + unsigned qps_num; + u32 qp_base_num; + u8 dev_addr[ETH_ALEN]; + u8 all_vlan_gw; + + /* statistics */ + struct net_device_stats stats; + struct vnic_port_stats port_stats; + + /* tasks */ + struct work_struct mcast_restart; + struct delayed_work stats_task; + struct delayed_work mcast_task; + struct delayed_work restart_task; + struct mutex moder_lock; + struct mutex state_lock; + + /* data structures */ + struct workqueue_struct *neigh_wq; + struct rb_root neigh_tree; + struct rb_root mac_tree; + atomic_t vnic_child_cnt; + rwlock_t mac_rwlock; + struct mcast_root mcast_tree; + struct vnic_sysfs_attr dentries[VNIC_MAX_DENTRIES]; + struct list_head list; + + /* QP resources */ + struct vnic_qp_res qp_res[VNIC_MAX_NUM_CPUS]; + + /* RX resources */ + struct vnic_rx_res rx_res[VNIC_MAX_NUM_CPUS]; + struct ib_recv_wr rx_wr; + u32 lro_num; + unsigned lro_mng_num; + int rx_csum; + unsigned napi_num; + unsigned rx_rings_num; + + /* TX resources */ + struct vnic_tx_res tx_res[VNIC_MAX_NUM_CPUS]; + unsigned tx_rings_num; + unsigned real_tx_rings_num; + unsigned ndo_tx_rings_num; + u8 *pad_va; + u64 pad_dma; + + /* for profiling */ +#ifdef VNIC_PROFILLNG + struct vnic_prof_skb_entry prof_arr[VNIC_PROFILLNG_SKB_MAX]; + int prof_arr_it; +#endif + /* interrupt coalescence */ + u16 rx_usecs; + u16 rx_frames; + u32 pkt_rate_low; + u16 rx_usecs_low; + u32 
pkt_rate_high; + u16 rx_usecs_high; + u16 sample_interval; + u16 adaptive_rx_coal; + unsigned long last_moder_packets; + unsigned long last_moder_tx_packets; + unsigned long last_moder_bytes; + unsigned long last_moder_jiffies; + unsigned long last_moder_time; + u16 tx_usecs; + u16 tx_frames; + u8 shared_vnic; + u8 shared_mac[ETH_ALEN]; +}; + +struct eoibhdr { + __u8 encap_data; + __u8 seg_off; + __be16 seg_id; +}; + +struct vnic_ib_dev { + char name[VNIC_DESC_LEN]; + struct mutex mlock; + struct list_head list; + struct list_head port_list; + struct ib_device *ca; + struct mlx4_ib_dev *mdev; + struct ib_device_attr attr; + char fw_ver_str[VNIC_FW_STR_MAX]; +}; + +struct fip_ring_entry { + void *mem; + u64 bus_addr; + int length; + int entry_posted; +}; + +struct fip_ring { + int size; + struct fip_ring_entry *ring; + unsigned long head; + unsigned long tail; + spinlock_t ring_lock; + spinlock_t head_tail_lock; +}; + +enum fip_discover_state { + FIP_DISCOVER_OFF, + FIP_DISCOVER_INIT, + FIP_DISCOVER_SOLICIT, + FIP_DISCOVER_CLEAR +}; + +#define MAX_INPUT_LEN 64 +#define MAX_INPUT_ARG 12 +struct fip_hadmin_cmd { + u8 c_name [MAX_INPUT_LEN]; + u8 c_mac [MAX_INPUT_LEN]; + u8 c_vnic_id [MAX_INPUT_LEN]; + u8 c_vid [MAX_INPUT_LEN]; + u8 c_bxname [MAX_INPUT_LEN]; + u8 c_bxguid [MAX_INPUT_LEN]; + u8 c_eport [MAX_INPUT_LEN]; + u8 c_ipv4 [MAX_INPUT_LEN]; + u8 c_ipv6 [MAX_INPUT_LEN]; + u8 c_emac [MAX_INPUT_LEN]; + u8 c_pkey [MAX_INPUT_LEN]; + u8 c_parent [MAX_INPUT_LEN]; +}; + +struct fip_hadmin_cache { + struct fip_hadmin_cmd cmd; + u8 system_guid[GUID_LEN]; + u8 system_name[VNIC_SYSTEM_NAME_LEN]; + u8 eport_name[VNIC_GW_PORT_NAME_LEN]; + u8 mac[ETH_ALEN]; + u16 vnic_id; + u16 gw_port_id; + u16 vlan; + u8 vlan_used; + u8 all_vlan_gw; + u8 interface_name[VNIC_NAME_LEN]; + u8 parent_name[VNIC_NAME_LEN]; + int parent_used; + int remove; + struct list_head next; + u32 qp_base_num; + u8 shared_vnic_ip[IPV4_LEN]; + u8 shared_vnic_mac[ETH_ALEN]; +}; + +struct pkt_rcv_list { + struct list_head list; + spinlock_t lock; +}; + +struct fip_discover { + char name[VNIC_NAME_LEN]; + struct vnic_port *port; + struct list_head discover_list; + spinlock_t lock; + struct list_head gw_list; + struct rw_semaphore l_rwsem; /* gw list rw semaphore */ + int hadmin_update; + struct list_head hadmin_cache; + enum fip_discover_state state; + int flush; + struct completion flush_complete; + struct ib_cq *cq; + struct ib_qp *qp; + struct fip_ring rx_ring; + struct fip_ring tx_ring; + struct mcast_root mcast_tree; + struct delayed_work fsm_task; + struct delayed_work cleanup_task; + struct delayed_work hadmin_update_task; + struct work_struct pkt_rcv_task_bh; + struct pkt_rcv_list rcv_list; + + int mcast_dest_mask; + unsigned long discover_mcast_attached_jiffies; + unsigned long discover_mcast_detached_jiffies; + unsigned long discover_mcast_state; + u16 pkey; + u16 pkey_index; + unsigned long req_attach; + unsigned long cur_attached; + unsigned new_prot_gws; + unsigned old_prot_gws; +}; + +struct fip_root { + struct list_head discover_list; +}; + +struct port_fs_dentry { + struct module_attribute fs_entry; + struct vnic_port *port; +}; + +struct vnic_port { + char name[VNIC_DESC_LEN]; + u8 num; + int rx_rings_num; + int tx_rings_num; + struct vnic_ib_dev *dev; + struct mcast_root mcast_tree; + struct list_head list; + struct list_head login_list; + struct delayed_work event_task; + struct delayed_work event_task_light; + struct delayed_work discover_restart_task; + struct ib_event_handler event_handler; + struct ib_port_attr 
attr; + union ib_gid gid; + int rate; + u8 rate_enum; + atomic_t vnic_child_ids; + + /* IB resources per port */ + struct vnic_rx_ring *rx_ring[VNIC_MAX_NUM_CPUS]; + struct ib_pd *pd; + struct ib_mr *mr; + + /* for FIP */ + struct mutex mlock; + struct mutex start_stop_lock; + u16 pkey_index; + u16 pkey; + int max_mtu_enum; + struct fip_root fip; + struct vnic_sysfs_attr dentries[VNIC_MAX_DENTRIES]; +}; + +enum fip_vnic_state { + FIP_VNIC_CLOSED = 0, + FIP_VNIC_HADMIN_IDLE = 1<<0, + FIP_VNIC_LOGIN = 1<<1, + FIP_VNIC_WAIT_4_ACK = 1<<2, + FIP_VNIC_RINGS_INIT = 1<<3, /* temporary, create rings */ + FIP_VNIC_MCAST_INIT = 1<<4, /* temporary, start mcast attach */ + FIP_VNIC_MCAST_INIT_DONE= 1<<5, /* wait for mcast cb */ + FIP_VNIC_VHUB_INIT = 1<<6, + FIP_VNIC_VHUB_INIT_DONE = 1<<7, /* wait for vhub table */ + FIP_VNIC_VHUB_DONE = 1<<8, + FIP_VNIC_VHUB_WRITE = 1<<9, + FIP_VNIC_CONNECTED = 1<<10 +}; + +enum vhub_table_state { + VHUB_TBL_INIT, + VHUB_TBL_UP2DATE, + VHUB_TBL_UPDATED +}; + +struct vhub_elist { + u32 tusn; + int count; + int total_count; + struct list_head vnic_list; /* chain vnics */ +}; + +struct vnic_table_entry { + u32 qpn; + u16 lid; + u8 mac[ETH_ALEN]; + u8 sl; + + struct list_head list; + u8 rss; + u8 valid; +}; + +struct vhub_table { + enum vhub_table_state state; + u32 checksum; + u32 tusn; + struct vhub_elist main_list; + struct vhub_elist update_list; +}; + +struct fip_shared_vnic_data { + u8 ip[IPV4_LEN]; + u8 emac[ETH_ALEN]; + u8 enabled; + u8 arp_proxy; +}; + +struct lag_member { + u32 qpn; + u8 sl; + u16 gw_port_id; + u16 lid; + u8 guid[GUID_LEN]; + u8 eport_state; + u8 weight; + u8 link_utilization; +}; + +struct lag_members { + int num; + long used_bitmask; + struct lag_properties prop; + struct lag_member memb[MAX_LAG_MEMBERS]; +}; + +struct fip_login_data { + u32 qpn; + u32 ctl_qpn; + u16 port_id; /* must always be uptodate */ + u16 lid; /* must always be uptodate */ + u16 vlan; + u16 pkey; + u16 pkey_index; + u16 vnic_id; /* must always be uptodate */ + u32 vhub_id; + u16 mtu; + + u8 sl; /* service level -- 4 bits */ + u8 guid[GUID_LEN]; + u8 mac[ETH_ALEN]; + u8 mgid_prefix[VNIC_MGID_PREFIX_LEN]; + u8 vnic_name[VNIC_NAME_LEN]; + u8 vendor_id[VNIC_VENDOR_LEN]; + u8 n_mac_mcgid; + u8 n_rss_mgid; + u8 syndrome; /* must always be uptodate */ + + u8 vp; /* 1 bit: do we use vlan */ + u8 all_vlan_gw; /* 1 bit. 
+ is promisc vlan supported on this vnic */ + struct lag_members lagm; +}; + +enum fip_flush { + FIP_NO_FLUSH, + FIP_PARTIAL_FLUSH, /* use this for events caused by vnic/gw logic */ + FIP_FULL_FLUSH /* use this for events caused by unload, host admin destroy */ +}; + +struct fip_vnic_send_info { + u32 gw_qpn; + u32 qkey; + u16 gw_lid; + u8 gw_sl; +}; + +/* + * This struct holds informative info about the GW that can change without + * implications on GW or vnic logic (only reported to user) + */ +struct fip_gw_volatile_info { + u8 system_guid[GUID_LEN]; + u8 system_name[VNIC_SYSTEM_NAME_LEN+1]; + u8 gw_port_name[VNIC_GW_PORT_NAME_LEN+1]; +}; + +struct fip_vnic_data { + char name[VNIC_NAME_LEN]; + enum fip_vnic_state state; + enum fip_flush flush; + spinlock_t lock; + spinlock_t ka_lock; + struct vnic_sysfs_attr dentry; + unsigned long login_state; + + /* data structures maintenance */ + struct fip_gw_data *gw; + struct vnic_port *port; + struct list_head gw_vnics; + struct vhub_table vhub_table; + + /* execution maintenance */ + unsigned long update_jiffs; + unsigned long keep_alive_jiffs; + unsigned long detached_ka_jiffs; + unsigned long vnic_mcaste_state; + struct delayed_work vnic_task; + struct hrtimer keepalive_timer; + struct list_head timer; + struct delayed_work vnic_gw_alive_task; + struct work_struct vnic_pkt_rcv_task_bh; + struct work_struct vnic_login_destroy_task; + struct work_struct vnic_login_create_task; + struct pkt_rcv_list vnic_rcv_list; + struct fip_vnic_send_info gw_address; + + /* vnic driver API */ + struct vnic_login *login; + unsigned long login_status; + int qps_num; + u32 qp_base_num; + int parent_used; + u8 parent_name[VNIC_NAME_LEN]; + + /* rx + tx data structures */ + struct ib_cq *cq; + struct ib_qp *qp; + struct fip_ring rx_ring; + struct fip_ring tx_ring; + struct ib_ah *ah; + + /* data domain */ + union ib_gid mgid; + + /* vHub context update mcast groups */ + struct mcast_root mcast_tree; + struct fip_login_data login_data; + struct fip_shared_vnic_data shared_vnic; + u16 mlid; + /* u16 pkey_index; not used for now */ + + u16 vnic_id; /* unique id for GW */ + u16 vlan; + u8 vlan_used; + u8 all_vlan_gw; + u16 pkey; + u16 pkey_index; + u8 hadmined; /* todo, use the state for this */ + u8 interface_name[VNIC_NAME_LEN]; + u8 mac_cache[ETH_ALEN]; + atomic_t eport_state; + unsigned long last_send_jiffs; + int retry_count; + int synd_backlog; + struct fip_hadmin_cmd cmd; + struct fip_gw_volatile_info gw_info; + struct lag_members lm; + unsigned long req_attach; + unsigned long cur_attached; + union ib_gid ka_mcast_gid; +}; + +enum vhub_mgid_type { + VHUB_MGID_DATA = 0, + VHUB_MGID_UPDATE = 2, + VHUB_MGID_TABLE = 3, + VHUB_MGID_KA = 5, +}; + +enum fip_all_mgids { + FIP_MCAST_DISCOVER, + FIP_MCAST_SOLICIT, + FIP_MCAST_VHUB_DATA, + FIP_MCAST_VHUB_UPDATE, + FIP_MCAST_TABLE, + FIP_MCAST_VHUB_KA, +}; + +union vhub_mgid { + struct mgid { + u8 mgid_prefix[VNIC_MGID_PREFIX_LEN]; + u8 type; + u8 dmac[ETH_ALEN]; + u8 rss_hash; + u8 vhub_id[3]; + } mgid; + union ib_gid ib_gid; +}; + +void vnic_carrier_update(struct vnic_login *login); +int vnic_param_check(void); + +/* mac table funcs */ +void vnic_learn_mac(struct net_device *dev, u8 *mac, int remove); +void vnic_child_flush(struct vnic_login *login, int all); +int vnic_child_update(struct vnic_login *login, u8 *mac, int remove); +int vnic_mace_update(struct vnic_login *login, u8 *mac, u16 vnic_id, int remove); +int vnic_parent_update(struct vnic_port *port, char *name, u16 vnic_id, + u8 *mac, u32 *qp_base_num_ptr, 
char *parent_name, + int remove); + +/* mcast funcs */ +int vnic_mcast_init(void); +void vnic_mcast_cleanup(void); + +/* + * A helper function to prevent code duplication. Receives a multicast mac + * and a gw_id and attaches it (join + attach). The function also receives + * a default_mcaste (used for the MGID over default MLID hack) and a user list. + * Returns 0 on success and non-zero on failure. + * + * in: mmac - to be used in creating the MGID address + * in: default_mcaste - mcaste entry of the default MGID. Can be NULL + * in: private_data - A user pointer that can be used to identify the owner + * in: gw_id - to be used in creating the MGID address + */ +int _vnic_mcast_attach_mgid(struct vnic_login *login, + char *mmac, + struct vnic_mcast *default_mcaste, + void *private_data, + u16 gw_id); + +struct vnic_mcast *vnic_mcast_alloc(struct vnic_port *port, + unsigned long *req_attach, + unsigned long *cur_attach); +/* + * A helper function to prevent code duplication. Fills the vnic_mcast struct with + * common values. + * + * in: mcaste - mcaste to fill + * in: gw_id - to be used in creating the MGID address + * in: mac - to be used in creating the MGID address + * in: rss_hash - to be used in creating the MGID address (usually 0) + * in: create - value of the create field in mcaste + */ +void __vnic_mcaste_fill(struct vnic_login *login, struct vnic_mcast *mcaste, + u16 gw_id, const u8 *mac, u8 rss_hash, int create); + +void vnic_mcast_dealloc(struct vnic_mcast *mcaste); + +int vnic_mcast_attach(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste); +int vnic_mcast_detach(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste); + +/* + * This function grabs the mcast_tree->mcast_rb_lock +*/ +int vnic_mcast_add(struct mcast_root *mcast_tree, + struct vnic_mcast *mcaste); +int vnic_mcast_del_all(struct mcast_root *mcast_tree); +int vnic_mcast_del_user(struct mcast_root *mcast_tree, void *owner); + +void vnic_tree_mcast_detach(struct mcast_root *mcast_tree); +void vnic_tree_mcast_attach(struct mcast_root *mcast_tree); + +/*void vnic_port_mcast_del_all(struct mcast_root *port); */ +static inline void vnic_mcast_root_init(struct mcast_root *mcast_tree) +{ + spin_lock_init(&mcast_tree->mcast_rb_lock); + INIT_LIST_HEAD(&mcast_tree->reattach_list); +} + +/* port funcs */ +int vnic_ports_init(void); +void vnic_ports_cleanup(void); + +/* + * The caller must hold the mcast_tree->mcast_rb_lock lock before calling +*/ +void vnic_mcast_del(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste); +struct vnic_mcast *vnic_mcast_search(struct mcast_root *mcast_tree, + union ib_gid *gid); +void port_fip_discover_restart(struct work_struct *work); +int vnic_port_fip_init(struct vnic_port *port); +void vnic_port_fip_cleanup(struct vnic_port *port, int lock); + +/* others */ +void fip_refresh_mcasts(struct fip_discover *discover); +void vnic_login_refresh_mcasts(struct vnic_port *port); + +/* There are 2 different create flows, for host admin and net admin. + * In net admin we always create the vnic after connecting with the GW, but we do not + * yet know the vnic details (mac, vlan etc). We know the ring parameters and + * will need to create the RX/TX rings (before login). + * To accomplish this we call vnic_login_pre_create_1, vnic_login_pre_create_2 + * and after login ACK we will call vnic_login_register_netdev and vnic_login_complete_ack. + * In Host admin, we know the vnic info but not the GW info when we create the + * vnic. 
+int vnic_login_register_netdev(struct fip_vnic_data *vnic,
+			       const char *mac,
+			       const char *name);
+int vnic_login_complete_ack(struct fip_vnic_data *vnic,
+			    struct fip_login_data *login_data,
+			    struct fip_shared_vnic_data *shared_vnic);
+int vnic_login_pre_create_1(struct vnic_port *port,
+			    struct fip_vnic_data *vnic);
+int vnic_login_pre_create_2(struct fip_vnic_data *vnic, int qps_num, int is_lag);
+
+/*
+ * When destroying the login, call this to stop the login wq tasks. Do not
+ * call from login_wq context.
+ */
+void vnic_login_destroy_stop_wq(struct fip_vnic_data *vnic, enum fip_flush flush);
+/*
+ * Destroys the login data struct. Assumes all login wq tasks are stopped.
+ * Can be called from any context, might block for a few secs.
+ */
+void vnic_login_destroy_wq_stopped(struct fip_vnic_data *vnic, enum fip_flush flush);
+
+/*
+ * Destroy a login data structure.
+ * This function cannot be called from login_wq context. If you need to run
+ * from login_wq, use the split functions vnic_login_destroy_stop_wq/wq_stopped
+ * instead.
+ */
+static inline
+void vnic_login_destroy(struct fip_vnic_data *vnic, enum fip_flush flush)
+{
+	vnic_login_destroy_stop_wq(vnic, flush);
+	vnic_login_destroy_wq_stopped(vnic, flush);
+}
+
+/* add / remove member eports from a LAG GW */
+void vnic_member_prop(struct vnic_login *login, struct lag_properties *prop);
+int vnic_member_add(struct vnic_login *login, int member_id,
+		    struct lag_member *emember);
+int vnic_member_remove(struct vnic_login *login, int member_id);
+int vnic_member_modify(struct vnic_login *login, int member_id,
+		       struct lag_member *emember);
+void vnic_member_remove_all(struct vnic_login *login);
+
+int vnic_vhube_add(struct fip_vnic_data *vnic, struct vnic_table_entry *vhube);
+void vnic_vhube_flush(struct fip_vnic_data *vnic);
+void vnic_vhube_del(struct fip_vnic_data *vnic, u8 *mac);
+int vnic_neighe_path_query(struct vnic_neigh *neighe);
+
+void vhub_mgid_create(const char *mgid_prefix,
+		      const char *mmac,	/* mcast mac for bcast 0xFF.. */
+		      u64 n_mac,	/* bits to take from mmac */
+		      u32 vhub_id,
+		      enum vhub_mgid_type type,
+		      u8 rss_hash,
+		      union vhub_mgid *mgid);
+/*
+ * Read the state of the gw eport. Can be called from any context.
+ */
+int fip_vnic_get_eport_state(struct fip_vnic_data *vnic);
+/*
+ * Get GW info funcs.
+ */
+int fip_vnic_get_eport_name(struct fip_vnic_data *vnic, char *buff);
+int fip_vnic_get_bx_name(struct fip_vnic_data *vnic, char *buff);
+int fip_vnic_get_bx_guid(struct fip_vnic_data *vnic, char *buff);
+u8 fip_vnic_get_bx_sl(struct fip_vnic_data *vnic);
+int fip_vnic_get_gw_type(struct fip_vnic_data *vnic);
+int fip_vnic_get_lag_eports(struct fip_vnic_data *vnic, char *buf);
+int fip_vnic_get_all_vlan_mode(struct fip_vnic_data *vnic, char *buff);
+
+
+/*
+ * Return a short-format string of GW info. Can be called from any context.
+ */
+int fip_vnic_get_short_gw_info(struct fip_vnic_data *vnic, char *buff);
+
+void vnic_data_cleanup(void);
+
+/*
+ * This function is called from the sysfs update callback function.
+ * It parses the request and adds it to a list, then queues a
+ * work request to process the list from the fip_wq context.
+*/ +int fip_hadmin_sysfs_update(struct vnic_port *port, + const char *buffer, int count, int remove); +int fip_gw_sysfs_show(struct vnic_port *port, char *buffer); +int vnic_login_cmd_set(char *buf, struct fip_hadmin_cmd *cmd); +void vnic_login_cmd_init(struct fip_hadmin_cmd *cmd); + +int fip_hadmin_vnic_refresh(struct fip_vnic_data *vnic, struct fip_vnic_send_info *gw_address); +void fip_vnic_set_gw_param(struct fip_vnic_data *vnic, struct fip_vnic_send_info *gw_address); +void fip_vnic_create_gw_param(struct fip_vnic_send_info *gw_address, u32 gw_qpn, + u32 qkey, u16 gw_lid, u8 gw_sl); + +int fip_vnic_hadmin_init(struct vnic_port *port, struct fip_vnic_data *vnic); + +int port_fs_init(struct vnic_port *port); +void port_fs_exit(struct vnic_port *port); + +int vnic_port_query(struct vnic_port *port); + +#endif /* VNIC_H */ diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data.h b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data.h new file mode 100644 index 0000000000000..d21517f916bbd --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _VNIC_DATA_H +#define _VNIC_DATA_H + +#include "vnic.h" + +enum { + VNIC_SEND_INLINE_FLAG_POS = 63, +}; + +#define VNIC_SEND_INLINE_FLAG ((u64)1 << VNIC_SEND_INLINE_FLAG_POS) + +/* main funcs */ +int vnic_port_data_init(struct vnic_port *port); +void vnic_port_data_cleanup(struct vnic_port *port); + +/* ib funcs */ +struct sk_buff *vnic_alloc_rx_skb(struct vnic_rx_ring *ring, int buf_ind, + gfp_t gfp_flag); +int vnic_post_recv(struct vnic_rx_ring *ring, u64 wr_id); +int vnic_post_recvs(struct vnic_rx_ring *ring); +int vnic_ib_create_qp_range(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, + struct ib_udata *udata, int nqps, + int align, struct ib_qp *list[]); +int vnic_ib_destroy_qp(struct ib_qp *qp); +int vnic_ib_post_send(struct ib_qp *ibqp, + struct ib_send_wr *wr, + struct ib_send_wr **bad_wr, + u8 ip_off, u8 ip6_off, + u8 tcp_off, u8 udp_off); +struct vnic_rx_ring *vnic_create_rx_ring(struct vnic_port *port, int index); +void vnic_destroy_rx_ring(struct vnic_rx_ring *ring); +int vnic_init_qp(struct vnic_login *login, int qp_index); +int vnic_create_qp(struct vnic_login *login, int qp_index); +int vnic_create_qp_range(struct vnic_login *login); +void vnic_destroy_qp(struct vnic_login *login, int qp_index); +int vnic_create_tx_res(struct vnic_login *login, int tx_res_index); +int vnic_create_rx_res(struct vnic_login *login, int rx_res_index); +void vnic_destroy_tx_res(struct vnic_login *login, int tx_res_index); +void vnic_destroy_rx_res(struct vnic_login *login, int rx_res_index); + +int vnic_ib_up(struct net_device *dev); +int vnic_ib_down(struct net_device *dev); +int vnic_ib_open(struct net_device *dev); +int vnic_ib_stop(struct net_device *dev); + +int vnic_ib_set_moder(struct vnic_login *login, + u16 rx_usecs, u16 rx_frames, u16 tx_usecs, u16 tx_frames); +int vnic_port_ib_init(struct vnic_port *port); +void vnic_port_ib_cleanup(struct vnic_port *port); +void vnic_ib_dispatch_event(struct ib_event *event); +#ifndef _BP_NAPI_POLL +int vnic_poll_cq_rx(struct napi_struct *napi, int budget); +#else +int vnic_poll_cq_rx(struct net_device *poll_dev, int *budget); +#endif +void vnic_send(struct vnic_login *login, struct sk_buff *skb, + struct ib_ah *ah, u32 dqpn, int tx_res_index); +void vnic_ib_free_ring(struct vnic_rx_ring *ring); +int vnic_ib_init_ring(struct vnic_rx_ring *ring); + +/* netdev funcs */ +struct net_device *vnic_alloc_netdev(struct vnic_port *port); +void vnic_free_netdev(struct vnic_login *login); +int vnic_restart(struct net_device *dev); +void __bcast_attach_cb(struct vnic_mcast *mcaste, void *login_ptr); +void __bcast_detach_cb(struct vnic_mcast *mcaste, void *login_ptr); + +/* rx funcs */ +int vnic_rx(struct vnic_login *login, struct sk_buff *skb, struct ib_wc *wc); +int vnic_unmap_and_replace_rx(struct vnic_rx_ring *ring, struct ib_device *dev, + struct skb_frag_struct *skb_frags_rx, + u64 wr_id, int length); +int vnic_rx_skb(struct vnic_login *login, struct vnic_rx_ring *ring, + struct ib_wc *wc, int ip_summed, char *eth_hdr_va); + +/* tx funcs */ +int vnic_tx(struct sk_buff *skb, struct net_device *dev); + +/* sysfs funcs */ +int vnic_create_dentry(struct vnic_login *login); +void vnic_delete_dentry(struct vnic_login *login); + +/* ethtool funcs */ +void vnic_set_ethtool_ops(struct net_device *dev); + +/* neigh funcs */ +void vnic_neigh_del_all(struct vnic_login *login); +struct vnic_neigh *vnic_neighe_search(struct vnic_login *login, u8 *mac); +void vnic_neighe_dealloc_task(struct work_struct *work); +void vnic_neighe_dealloc(struct 
vnic_neigh *neighe); +struct vnic_neigh *vnic_neighe_alloc(struct vnic_login *login, + const u8 *mac, u16 dlid, u32 dqpn, u8 rss); +void vnic_neighe_del(struct vnic_login *login, struct vnic_neigh *neighe); +int vnic_neighe_add(struct vnic_login *login, struct vnic_neigh *neighe); +struct ib_ah *vnic_ah_alloc(struct vnic_login *login, u16 dlid); +void vnic_neigh_invalidate(struct vnic_login *login); + + + +struct vnic_login *__vnic_login_create(struct vnic_port *port, int index); +u32 vnic_hash(struct net_device *dev, struct sk_buff *skb); +#endif /* _VNIC_DATA_H */ diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_ethtool.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_ethtool.c new file mode 100644 index 0000000000000..16ff551dd95c3 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_ethtool.c @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+
+#include "vnic.h"
+#include "vnic_data.h"
+
+static struct ethtool_ops vnic_ethtool_ops;
+
+static const char vnic_strings[][ETH_GSTRING_LEN] = {
+	/* public statistics */
+	"rx_packets", "tx_packets", "rx_bytes",
+	"tx_bytes", "rx_errors", "tx_errors",
+	"rx_dropped", "tx_dropped", "multicast",
+	"collisions", "rx_length_errors", "rx_over_errors",
+	"rx_crc_errors", "rx_frame_errors", "rx_fifo_errors",
+	"rx_missed_errors", "tx_aborted_errors", "tx_carrier_errors",
+	"tx_fifo_errors", "tx_heartbeat_errors", "tx_window_errors",
+#define VNIC_PUB_STATS_LEN	21
+
+	/* private statistics */
+	"gro_held", "gro_merged", "gro_normal", "gro_drop",
+	"lro_aggregated", "lro_flushed", "lro_no_desc",
+	"tso_packets", "queue_stopped", "wake_queue",
+	"tx_timeout", "rx_chksum_good", "rx_chksum_none",
+	"tx_chksum_offload", "sig_ver_err", "vlan_err",
+	"shared_packets", "runt_packets", "realloc_packets",
+	"gw_tx_packets", "gw_tx_bytes",
+#define VNIC_PORT_STATS_LEN	21
+
+	/* packet statistics rx_prio_X (TODO) */
+#define VNIC_PKT_STATS_LEN	0
+};
+
+#define VNIC_STATS_LEN	(sizeof(vnic_strings) / ETH_GSTRING_LEN)
+
+static void vnic_get_drvinfo(struct net_device *dev,
+			     struct ethtool_drvinfo *drvinfo)
+{
+	struct vnic_login *login = vnic_netdev_priv(dev);
+
+	*drvinfo = login->drvinfo;
+}
+
+static u32 vnic_get_msglevel(struct net_device *dev)
+{
+	return vnic_msglvl;
+}
+
+static void vnic_set_msglevel(struct net_device *dev, u32 mlevel)
+{
+	vnic_msglvl = mlevel;
+}
+
+static int vnic_get_coalesce(struct net_device *dev,
+			     struct ethtool_coalesce *coal)
+{
+	struct vnic_login *login = vnic_netdev_priv(dev);
+
+	vnic_dbg_ethtool(login->name, "get coalescing params for mtu:%d "
+			 "rx_frames:%d rx_usecs:%d, "
+			 "tx_frames:%d tx_usecs:%d, "
+			 "adaptive_rx_coal:%d, "
+			 "adaptive_tx_coal:%d\n",
+			 login->dev->mtu,
+			 login->rx_frames, login->rx_usecs,
+			 login->tx_frames, login->tx_usecs,
+			 login->adaptive_rx_coal, 0);
+
+	coal->tx_coalesce_usecs = login->tx_usecs;
+	coal->tx_max_coalesced_frames = login->tx_frames;
+	coal->rx_coalesce_usecs = login->rx_usecs;
+	coal->rx_max_coalesced_frames = login->rx_frames;
+
+	coal->pkt_rate_low = login->pkt_rate_low;
+	coal->rx_coalesce_usecs_low = login->rx_usecs_low;
+	coal->pkt_rate_high = login->pkt_rate_high;
+	coal->rx_coalesce_usecs_high = login->rx_usecs_high;
+	coal->rate_sample_interval = login->sample_interval;
+	coal->use_adaptive_rx_coalesce = login->adaptive_rx_coal;
+
+	return 0;
+}
+
+static int vnic_set_coalesce(struct net_device *dev,
+			     struct ethtool_coalesce *coal)
+{
+	struct vnic_login *login = vnic_netdev_priv(dev);
+
+	login->rx_frames = (coal->rx_max_coalesced_frames ==
+			    VNIC_AUTO_CONF) ?
+			    VNIC_RX_COAL_TARGET /
+			    login->dev->mtu + 1 : coal->rx_max_coalesced_frames;
+	login->rx_usecs = (coal->rx_coalesce_usecs ==
+			   VNIC_AUTO_CONF) ?
+ VNIC_RX_COAL_TIME : coal->rx_coalesce_usecs; + login->tx_frames = coal->tx_max_coalesced_frames; + login->tx_usecs = coal->tx_coalesce_usecs; + + /* Set adaptive coalescing params */ + login->pkt_rate_low = coal->pkt_rate_low; + login->rx_usecs_low = coal->rx_coalesce_usecs_low; + login->pkt_rate_high = coal->pkt_rate_high; + login->rx_usecs_high = coal->rx_coalesce_usecs_high; + login->sample_interval = coal->rate_sample_interval; + login->adaptive_rx_coal = coal->use_adaptive_rx_coalesce; + login->last_moder_time = VNIC_AUTO_CONF; + + if (login->adaptive_rx_coal) + return 0; + + vnic_ib_set_moder(login, + login->rx_usecs, login->rx_frames, + login->tx_usecs, login->tx_frames); + + return 0; +} + +static int vnic_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) +{ + cmd->autoneg = AUTONEG_DISABLE; + cmd->supported = SUPPORTED_10000baseT_Full; + cmd->advertising = SUPPORTED_10000baseT_Full; + if (netif_carrier_ok(dev)) { + cmd->speed = SPEED_10000; + cmd->duplex = DUPLEX_FULL; + } else { + cmd->speed = -1; + cmd->duplex = -1; + } + return 0; +} + +static int vnic_set_settings(struct net_device *dev, struct ethtool_cmd *cmd) +{ + if ((cmd->autoneg == AUTONEG_ENABLE) || + (cmd->speed != SPEED_10000) || (cmd->duplex != DUPLEX_FULL)) + return -EINVAL; + + /* Nothing to change */ + return 0; +} + +static void vnic_get_strings(struct net_device *dev, + uint32_t stringset, uint8_t *data) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int index = 0, stats_off = 0, i; + + if (stringset != ETH_SS_STATS) + return; + + /* Add main counters */ + for (i = 0; i < VNIC_PUB_STATS_LEN; i++) + strcpy(data + (index++) * ETH_GSTRING_LEN, + vnic_strings[i + stats_off]); + stats_off += VNIC_PUB_STATS_LEN; + + for (i = 0; i < VNIC_PORT_STATS_LEN; i++) + strcpy(data + (index++) * ETH_GSTRING_LEN, + vnic_strings[i + stats_off]); + stats_off += VNIC_PORT_STATS_LEN; + + for (i = 0; i < VNIC_PKT_STATS_LEN; i++) + strcpy(data + (index++) * ETH_GSTRING_LEN, + vnic_strings[i + stats_off]); + stats_off += VNIC_PKT_STATS_LEN; + + for (i = 0; i < login->tx_rings_num; i++) { + sprintf(data + (index++) * ETH_GSTRING_LEN, + "tx%d_packets", i); + sprintf(data + (index++) * ETH_GSTRING_LEN, + "tx%d_bytes", i); + } + for (i = 0; i < login->rx_rings_num; i++) { + sprintf(data + (index++) * ETH_GSTRING_LEN, + "rx%d_packets", i); + sprintf(data + (index++) * ETH_GSTRING_LEN, + "rx%d_bytes", i); + } +} + +static void vnic_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, uint64_t *data) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int index = 0, i; + + spin_lock_bh(&login->stats_lock); + + for (i = 0; i < VNIC_PUB_STATS_LEN; i++) + data[index++] = ((unsigned long *) &login->stats)[i]; + for (i = 0; i < VNIC_PORT_STATS_LEN; i++) + data[index++] = ((unsigned long *) &login->port_stats)[i]; + for (i = 0; i < VNIC_PKT_STATS_LEN; i++) + data[index++] = 0; + for (i = 0; i < login->tx_rings_num; i++) { + data[index++] = login->tx_res[i].stats.tx_packets; + data[index++] = login->tx_res[i].stats.tx_bytes; + } + for (i = 0; i < login->rx_rings_num; i++) { + data[index++] = login->port->rx_ring[i]->stats.rx_packets; + data[index++] = login->port->rx_ring[i]->stats.rx_bytes; + } + spin_unlock_bh(&login->stats_lock); +} + +#ifndef _BP_ETHTOOL_NO_SSETC +static int vnic_get_sset_count(struct net_device *dev, int sset) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + switch (sset) { + case ETH_SS_STATS: + return VNIC_STATS_LEN + /* static stats + stats per ring */ + 
(login->tx_rings_num + login->rx_rings_num) * 2; + default: + return -EOPNOTSUPP; + } +} + +#else +static int vnic_get_stats_count(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + return VNIC_STATS_LEN + + (login->tx_rings_num + login->rx_rings_num) * 2; +} +#endif + +static void vnic_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol) +{ + wol->supported = wol->wolopts = 0; + + return; +} + +void vnic_get_ringparam(struct net_device *dev, struct ethtool_ringparam *param) +{ + memset(param, 0, sizeof *param); + param->rx_max_pending = VNIC_MAX_RX_SIZE; + param->tx_max_pending = VNIC_MAX_TX_SIZE; + param->rx_pending = vnic_rx_rings_len; + param->tx_pending = vnic_tx_rings_len; +} + +void vnic_set_ethtool_ops(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + struct mlx4_ib_dev *mlx4_ibdev = login->port->dev->mdev; + + ASSERT(login); + ASSERT(login->port->dev->ca); + ASSERT(login->port->dev->ca->dma_device); + + SET_ETHTOOL_OPS(dev, &vnic_ethtool_ops); + strncpy(login->drvinfo.driver, DRV_NAME, VNIC_ETHTOOL_LINE_MAX); + strncpy(login->drvinfo.version, DRV_VER, VNIC_ETHTOOL_LINE_MAX); + login->drvinfo.n_stats = 0; + login->drvinfo.regdump_len = 0; + login->drvinfo.eedump_len = 0; + + sprintf(login->drvinfo.bus_info, "%s [%s:%d]", + pci_name(to_pci_dev(login->port->dev->ca->dma_device)), + login->port->dev->ca->name, login->port->num); + sprintf(login->drvinfo.fw_version, "%s [%.*s]", + login->port->dev->fw_ver_str, MLX4_BOARD_ID_LEN, + mlx4_ibdev->dev->board_id); + vnic_dbg_ethtool(login->name, "bus %s, port %d, fw_ver %s\n", + login->drvinfo.bus_info, login->port->num, + login->drvinfo.fw_version); + + return; +} + +static struct ethtool_ops vnic_ethtool_ops = { + .get_link = ethtool_op_get_link, + .get_drvinfo = vnic_get_drvinfo, + .get_msglevel = vnic_get_msglevel, + .set_msglevel = vnic_set_msglevel, + .get_coalesce = vnic_get_coalesce, + .set_coalesce = vnic_set_coalesce, + .get_strings = vnic_get_strings, + .get_ethtool_stats = vnic_get_ethtool_stats, +#ifndef _BP_ETHTOOL_NO_SSETC + .get_sset_count = vnic_get_sset_count, +#else + .get_stats_count = vnic_get_stats_count, +#endif + .get_settings = vnic_get_settings, + .set_settings = vnic_set_settings, + .get_wol = vnic_get_wol, + .get_ringparam = vnic_get_ringparam, + .set_ringparam = NULL, +}; + diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_fs.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_fs.c new file mode 100644 index 0000000000000..95d7ef796fc18 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_fs.c @@ -0,0 +1,993 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/if_vlan.h>
+
+#include "vnic.h"
+#include "vnic_data.h"
+#include "vnic_fip_discover.h"
+
+#define ALL_VLAN_GW_VID "all"
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0))
+#define __MODULE_KOBJ_TYPE struct module_kobject
+#else
+#define __MODULE_KOBJ_TYPE struct module
+#endif
+
+char *login_dentry_name(char *buf, struct vnic_login *login, char *str)
+{
+	snprintf(buf, VNIC_SYSFS_FLEN, "%s%d-%s", "vnic",
+		 login->cnt, str);
+	return buf;
+}
+
+char *port_dentry_name(char *buf, struct vnic_port *port, char *str)
+{
+	snprintf(buf, VNIC_SYSFS_FLEN, "%s_%s_%d",
+		 str, port->dev->name, port->num);
+	return buf;
+}
+
+char *vnic_dentry_name(char *buf, struct fip_vnic_data *vnic, char *str)
+{
+	snprintf(buf, VNIC_SYSFS_FLEN, "%s-%s-%s", "vnic",
+		 vnic->interface_name, str);
+	return buf;
+}
+
+#ifndef _BP_NO_ATT_OWNER
+#define DENTRY_OWNER(_vdentry) \
+		(_vdentry)->dentry.attr.owner = THIS_MODULE; \
+		(_vdentry)->kobj = &vdentry->dentry.attr.owner->mkobj.kobj;
+#else
+#define DENTRY_OWNER(_vdentry) \
+		(_vdentry)->kobj = &(THIS_MODULE)->mkobj.kobj;
+#endif
+
+#define DENTRY_REMOVE(_dentry) \
+do { \
+	vnic_dbg_sysfs((_dentry)->name, "deleted\n"); \
+	sysfs_remove_file((_dentry)->kobj, &(_dentry)->dentry.attr); \
+	(_dentry)->ctx = NULL; \
+} while (0);
+
+#define DENTRY_CREATE(_ctx, _dentry, _name, _show, _store) \
+do { \
+	struct vnic_sysfs_attr *vdentry = _dentry; \
+	vdentry->ctx = _ctx; \
+	vdentry->dentry.show = _show; \
+	vdentry->dentry.store = _store; \
+	vdentry->dentry.attr.name = vdentry->name; \
+	vdentry->dentry.attr.mode = 0; \
+	DENTRY_OWNER(vdentry); \
+	snprintf(vdentry->name, VNIC_SYSFS_FLEN, "%s", _name); \
+	if (vdentry->dentry.store) \
+		vdentry->dentry.attr.mode |= S_IWUSR; \
+	if (vdentry->dentry.show) \
+		vdentry->dentry.attr.mode |= S_IRUGO; \
+	vnic_dbg_sysfs(_ctx->name, "creating %s\n", \
+		       vdentry->name); \
+	if (strlen(_name) > VNIC_SYSFS_FLEN) { \
+		vnic_err(_ctx->name, "name too long %d > %d\n", \
+			 (int)strlen(_name), VNIC_SYSFS_FLEN); \
+		vdentry->ctx = NULL; \
+		break; \
+	} \
+	if (sysfs_create_file(vdentry->kobj, &vdentry->dentry.attr)) { \
+		vnic_err(_ctx->name, "failed to create %s\n", \
+			 vdentry->dentry.attr.name); \
+		vdentry->ctx = NULL; \
+		break; \
+	} \
+	vnic_dbg_sysfs(_ctx->name, "created %s\n", vdentry->name); \
+} while (0);
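+/*
+ * Illustrative example (not part of the driver): with the helpers and macros
+ * above, a login with cnt == 3 gets its read-only "info" attribute via
+ *
+ *	DENTRY_CREATE(login, &login->dentries[0],
+ *		      login_dentry_name(name, login, "info"),
+ *		      vnic_login_show, NULL);
+ *
+ * which shows up as a file named "vnic3-info" under the module kobject
+ * (/sys/module/mlx4_vnic/ on a typical build).
+ */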
return "nop"; + case IB_PORT_ACTIVE_DEFER: + return "defer"; + default: + return "invalid_state"; + } +} + +/* store/show functions */ +static ssize_t vnic_neigh_show(struct module_attribute *attr, + __MODULE_KOBJ_TYPE *mod, char *buf) +{ + char *p = buf; + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_login *login = vnic_dentry->ctx; + struct vnic_neigh *neighe; + struct vnic_mcast *mcaste; + struct rb_node *n; + unsigned long flags; + + /* check if GW entry is ready */ + if (!login->gw_neigh) + goto out; + ASSERT(login->gw_neigh); + + /* print GW entry */ + neighe = login->gw_neigh; + p += _sprintf(p, buf, "G:MAC["MAC_6_PRINT_FMT"] VID[0x%04x] " + "VID_USED[%d] QPN[0x%06x] LID[0x%04x] RSS[%d] SL[%d] VALID[%d]\n", + MAC_6_PRINT_ARG(neighe->mac), + be16_to_cpu(login->vid), login->vlan_used, neighe->qpn, + neighe->lid, neighe->rss, neighe->sl, neighe->valid); + + /* print neigh tree entries */ + n = rb_first(&login->neigh_tree); + while (n) { + neighe = rb_entry(n, struct vnic_neigh, rb_node); + p += _sprintf(p, buf, "U:MAC["MAC_6_PRINT_FMT"] VID[0x%04x] " + "VID_USED[%d] QPN[0x%06x] LID[0x%04x] RSS[%d] SL[%d] VALID[%d]\n", + MAC_6_PRINT_ARG(neighe->mac), + be16_to_cpu(login->vid), login->vlan_used, + neighe->qpn, neighe->lid, neighe->rss, neighe->sl, neighe->valid); + n = rb_next(n); + } + + /* print mcast tree entries */ + spin_lock_irqsave(&login->mcast_tree.mcast_rb_lock, flags); + n = rb_first(&login->mcast_tree.mcast_tree); + while (n) { + u16 lid = 0xFFFF; + mcaste = rb_entry(n, struct vnic_mcast, rb_node); + n = rb_next(n); + if (test_bit(MCAST_ATTACHED, &mcaste->state)) + lid = mcaste->port_mcaste->rec.mlid; + p += _sprintf(p, buf, "M:MAC["MAC_6_PRINT_FMT"] VID[0x%04x] " + "VID_USED[%d] QPN[0x%06x] LID[0x%04x] RSS[%d] SL[%d]\n", + MAC_6_PRINT_ARG(mcaste->mac), + 0, login->vlan_used, IB_MULTICAST_QPN, lid, 0, mcaste->port_mcaste->sa_mcast->rec.sl); + } + spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags); + +out: + return (ssize_t)(p - buf); +} + +/* store/show functions */ +static ssize_t vnic_member_show(struct module_attribute *attr, + __MODULE_KOBJ_TYPE *mod, char *buf) +{ + char *p = buf; + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_login *login = vnic_dentry->ctx; + int i; + + if (!login->is_lag) + goto out; + + netif_tx_lock_bh(login->dev); + p += _sprintf(p, buf, "GW member count=%d active count=%d hash bitmask=0x%X\n", + login->lag_member_count, login->lag_member_active_count, login->lag_prop.hash_mask); + + p += _sprintf(p, buf, "GW hash mapping table:\n"); + + for (i=0; ilag_gw_map[i], login->lag_gw_map[i+1], login->lag_gw_map[i+2], login->lag_gw_map[i+3], + login->lag_gw_map[i+4], login->lag_gw_map[i+5], login->lag_gw_map[i+6], login->lag_gw_map[i+7]); + } + + p += _sprintf(p, buf, "\nGW member state info: (0x1-created, 0x2-eport up, 0x4-mcast join complete, 0x8-member in use)\n"); + + for (i=0; ilag_gw_neigh[i].gw_id, + login->lag_gw_neigh[i].info, + login->lag_gw_neigh[i].neigh.lid, + login->lag_gw_neigh[i].neigh.qpn, + login->lag_gw_neigh[i].neigh.sl, + login->lag_gw_neigh[i].neigh.valid); + } + netif_tx_unlock_bh(login->dev); + +out: + return (ssize_t)(p - buf); +} + +static ssize_t vnic_login_show(struct module_attribute *attr, + __MODULE_KOBJ_TYPE *mod, char *buf) +{ + char *p = buf, tmp_line[VNIC_SYSFS_LLEN]; + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_login *login = 
vnic_dentry->ctx; + struct fip_vnic_data *vnic_fip = login->fip_vnic; + int rc, eport_connected = test_bit(VNIC_STATE_LOGIN_CREATE_2, &vnic_fip->login_state); + u16 pkey_used = 0; + int lag_gw; + int ret; + + ASSERT(login->dev); + ASSERT(login->port->dev->ca); + + /* NETDEV attributes */ + p += _sprintf(p, buf, "NETDEV_NAME %s\n", login->dev->name); + p += _sprintf(p, buf, "NETDEV_LINK %s\n", + netif_carrier_ok(login->dev) ? "up" : "down"); + p += _sprintf(p, buf, "NETDEV_OPEN %s\n", + (login->dev->flags & IFF_UP) ? "yes" : "no"); + p += _sprintf(p, buf, "NETDEV_QSTOP %s\n", + netif_queue_stopped(login->dev) ? "yes" : "no"); + p += _sprintf(p, buf, "NETDEV_MTU %d/%d\n", + (int)login->dev->mtu, + (int)login->max_mtu); + + /* IOA attributes */ + p += _sprintf(p, buf, "IOA_PORT %s:%d\n", + login->port->dev->ca->name, + login->port->num); + p += _sprintf(p, buf, "IOA_NAME %s\n", + login->desc); + p += _sprintf(p, buf, "IOA_LID 0x%04x\n", login->port->attr.lid); + p += _sprintf(p, buf, "IOA_GUID "VNIC_GUID_FMT"\n", + VNIC_GUID_RAW_ARG(login->port->gid.raw + 8)); + p += _sprintf(p, buf, "IOA_LOG_LINK %s\n", + port_phys_state_str(login->port->attr.phys_state)); + p += _sprintf(p, buf, "IOA_PHY_LINK %s\n", + port_state_str(login->port->attr.state)); + p += _sprintf(p, buf, "IOA_MTU %d\n", login->port->max_mtu_enum); + + + /* EPORT and BX attributes */ + if (no_bxm) { + p += _sprintf(p, buf, "EPORT_STATE %s\n", "bridgeless"); + } else if (vnic_fip) { + p += _sprintf(p, buf, "EPORT_STATE %s\n", + !eport_connected ? "disconnected" : + (fip_vnic_get_eport_state(vnic_fip) ? + "up" : "down")); + p += _sprintf(p, buf, "EPORT_NAME %s\n", + fip_vnic_get_eport_name(vnic_fip, tmp_line) ? + NOT_AVAILABLE_STRING : tmp_line); + p += _sprintf(p, buf, "EPORT_QPN 0x%06x\n", + login->gw_neigh ? login->gw_neigh->qpn : 0); + p += _sprintf(p, buf, "EPORT_LID 0x%04x\n", + login->gw_neigh ? login->gw_neigh->lid : 0); + p += _sprintf(p, buf, "EPORT_ID %u\n", login->gw_port_id); + + p += _sprintf(p, buf, "BX_NAME %s\n", + fip_vnic_get_bx_name(vnic_fip, tmp_line) ? + NOT_AVAILABLE_STRING : tmp_line); + fip_vnic_get_bx_guid(vnic_fip, tmp_line); + if (*((u64 *)tmp_line) == 0) + p += _sprintf(p, buf, "BX_GUID %s\n", NOT_AVAILABLE_STRING); + else + p += _sprintf(p, buf, "BX_GUID "VNIC_GUID_FMT"\n", + VNIC_GUID_RAW_ARG(tmp_line)); + + lag_gw = fip_vnic_get_gw_type(vnic_fip); + if (lag_gw) { + p += _sprintf(p, buf, "GW_TYPE LAG\n"); + ret = fip_vnic_get_lag_eports(vnic_fip, p); + p += (ret > 0) ? ret : 0; + } else + p += _sprintf(p, buf, "GW_TYPE LEGACY\n"); + + rc = fip_vnic_get_all_vlan_mode(vnic_fip, tmp_line); + p += _sprintf(p, buf, "ALL_VLAN %s\n", + rc < 0 ? NOT_AVAILABLE_STRING : tmp_line); + + } else { + p += _sprintf(p, buf, "EPORT_STATE %s\n", "error"); + } + + /* misc attributes*/ + p += _sprintf(p, buf, "SW_RSS %s\n", + !eport_connected ? NOT_AVAILABLE_STRING : + ((login->qps_num > 1) ? "yes" : "no")); + p += _sprintf(p, buf, "SW_RSS_SIZE %u\n", login->qps_num); + p += _sprintf(p, buf, "RX_RINGS_NUM %d\n", login->rx_rings_num); + p += _sprintf(p, buf, "RX_RINGS_LIN %s\n", + login->port->rx_ring[0]->log_rx_info ? "no" : "yes"); + p += _sprintf(p, buf, "TX_RINGS_NUM %d\n", login->tx_rings_num); + p += _sprintf(p, buf, "TX_RINGS_ACT %d\n", + VNIC_TXQ_GET_ACTIVE(login)); + p += _sprintf(p, buf, "NDO_TSS %s\n", + (login->ndo_tx_rings_num > 1) ? "yes" : "no"); + p += _sprintf(p, buf, "NDO_TSS_SIZE %u\n", login->ndo_tx_rings_num); + p += _sprintf(p, buf, "MCAST_PROMISC %s\n", + !eport_connected ? 
NOT_AVAILABLE_STRING : + (is_mcast_promisc(login) ? "yes" : "no")); + p += _sprintf(p, buf, "UCAST_PROMISC %s\n", + (is_ucast_promisc(login) ? "yes" : "no")); + p += _sprintf(p, buf, "MCAST_MASK %d\n", login->n_mac_mcgid); + p += _sprintf(p, buf, "CHILD_VNICS %d/%d\n", + atomic_read(&login->vnic_child_cnt), + vnic_child_max); + p += _sprintf(p, buf, "PKEY 0x%04x\n", login->pkey); + p += _sprintf(p, buf, "PKEY_INDEX 0x%04x\n", login->pkey_index); + rc = ib_query_pkey(login->port->dev->ca, login->port->num, + login->pkey_index, &pkey_used); + p += _sprintf(p, buf, "PKEY_MEMBER %s\n", + (rc || !eport_connected) ? NOT_AVAILABLE_STRING : + ((pkey_used & 0x8000) ? "full" : "partial")); + p += _sprintf(p, buf, "SL_DATA %u\n", login->sl); + p += _sprintf(p, buf, "SL_CONTROL %u\n", + vnic_fip ? fip_vnic_get_bx_sl(vnic_fip) : 0); +#if defined(NETIF_F_GRO) && !defined(_BP_NO_GRO) + p += _sprintf(p, buf, "GRO %s\n", + login->dev->features & NETIF_F_GRO ? "yes" : "no"); +#elif defined(NETIF_F_LRO) + p += _sprintf(p, buf, "LRO %s\n", + login->dev->features & NETIF_F_LRO ? "yes" : "no"); + p += _sprintf(p, buf, "LRO_NUM %d\n", login->lro_num); +#endif + p += _sprintf(p, buf, "NAPI %s\n", + login->napi_num ? "yes" : "no"); + p += _sprintf(p, buf, "NAPI_WEIGHT %u\n", + login->napi_num ? vnic_napi_weight : 0); + p += _sprintf(p, buf, "QPN 0x%x\n", + login->qp_base_num); + p += _sprintf(p, buf, "MAC "MAC_6_PRINT_FMT"\n", + MAC_6_PRINT_ARG(login->dev_addr)); + p += _sprintf(p, buf, "VNIC_ID %d\n", + vnic_fip ? vnic_fip->vnic_id : 0); + p += _sprintf(p, buf, "ADMIN_MODE %s\n", + !vnic_fip ? NOT_AVAILABLE_STRING : + (vnic_fip->hadmined ? "host" : "network")); + + if (vnic_fip && vnic_fip->vlan_used) + p += _sprintf(p, buf, "VLAN 0x%03x\n", vnic_fip->vlan); + else + p += _sprintf(p, buf, "VLAN %s\n", NOT_AVAILABLE_STRING); + + if (vnic_fip && vnic_fip->shared_vnic.enabled) { + p += _sprintf(p, buf, "SHARED_MAC "MAC_6_PRINT_FMT"\n", + MAC_6_PRINT_ARG(vnic_fip->shared_vnic.emac)); + p += _sprintf(p, buf, "SHARED_IP "IP_4_PRINT_FMT"\n", + IP_4_PRINT_ARG(vnic_fip->shared_vnic.ip)); + } else { + p += _sprintf(p, buf, "SHARED_MAC %s\n", NOT_AVAILABLE_STRING); + p += _sprintf(p, buf, "SHARED_IP %s\n", NOT_AVAILABLE_STRING); + } + + return (ssize_t)(p - buf); +} + +static ssize_t vnic_qps_show(struct module_attribute *attr, + __MODULE_KOBJ_TYPE *mod, char *buf) +{ + char *p = buf; + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_login *login = vnic_dentry->ctx; + struct ib_qp *qp; + struct ib_qp_attr query_attr; + struct ib_qp_init_attr query_init_attr; + int i, mask = -1; + + for (i = 0; i < login->qps_num; ++i) { + qp = login->qp_res[i].qp; + if (ib_query_qp(qp, &query_attr, mask, &query_init_attr)) + continue; + p += _sprintf(p, buf, "QP_INDEX %d\n", i); + p += _sprintf(p, buf, "QP_NUM 0x%06x\n", qp->qp_num); + p += _sprintf(p, buf, "QP_QKEY 0x%08x\n", query_attr.qkey); + p += _sprintf(p, buf, "QP_STATE 0x%02x\n", query_attr.qp_state); + p += _sprintf(p, buf, "QP_RX_RING %d\n", i % login->rx_rings_num); + p += _sprintf(p, buf, "QP_PTR %p\n", qp); + p += _sprintf(p, buf, "QP_RX_SRQ_PTR %p\n", qp->srq); + p += _sprintf(p, buf, "QP_RX_CQ_PTR %p\n", qp->recv_cq); + p += _sprintf(p, buf, "QP_TX_CQ_PTR %p\n", qp->send_cq); + p += _sprintf(p, buf, "\n"); + } + + return (ssize_t)(p - buf); +} +static char* vnic_state_2str(enum fip_vnic_state state) +{ + switch(state) { + case FIP_VNIC_CLOSED: return "CLOSED"; + case FIP_VNIC_CONNECTED: return "CONNECTED"; + case 
FIP_VNIC_HADMIN_IDLE: return "HADMIN_IDLE"; + case FIP_VNIC_LOGIN: return "LOGIN"; + case FIP_VNIC_MCAST_INIT: return "MCAST_INIT"; + case FIP_VNIC_MCAST_INIT_DONE: return "MCAST_INIT_DONE"; + case FIP_VNIC_RINGS_INIT: return "RINGS_INIT"; + case FIP_VNIC_VHUB_DONE: return "VHUB_DONE"; + case FIP_VNIC_VHUB_INIT: return "VHUB_INIT"; + case FIP_VNIC_VHUB_INIT_DONE: return "VHUB_INIT_DONE"; + case FIP_VNIC_VHUB_WRITE: return "VHUB_WRITE"; + case FIP_VNIC_WAIT_4_ACK: return "WAIT_4_ACK"; + } + return "UNKNOWN"; + + +} + +int port_vnics_sysfs_show(struct vnic_port *port, char *buf) +{ + struct fip_gw_data *gw; + char *p = buf; + struct fip_discover *discover; + struct fip_vnic_data *vnic; + + mutex_lock(&port->start_stop_lock); + list_for_each_entry(discover, &port->fip.discover_list, discover_list) { + + down_read(&discover->l_rwsem); + + list_for_each_entry(gw, &discover->gw_list, list) { + list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) { + p += _sprintf(p, buf, "%-15s\t%-10s\t%10s:%d %-10s\t%.7d\t%-10s\t%s\n", + gw->info.vol_info.system_name, + gw->info.vol_info.gw_port_name, + gw->discover->port->dev->ca->name, + gw->discover->port->num, + vnic->name, + vnic->vnic_id, + vnic->hadmined?"HOSTADMIN":"NETADMIN", + vnic_state_2str(vnic->state)); + } + } + + up_read(&discover->l_rwsem); + } + + mutex_unlock(&port->start_stop_lock); + return (p - buf); +} + + +#ifdef VNIC_PROFILLNG +static ssize_t vnic_dentry_prof_skb_show(struct module_attribute *attr, + __MODULE_KOBJ_TYPE *mod, char *buf) +{ + char *p = buf; + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_login *login = vnic_dentry->ctx; + struct sk_buff *skb; + int i; + + for (i = 0; i < VNIC_PROFILLNG_SKB_MAX; ++i) { + if (!login->prof_arr[i].cnt) + continue; + skb = &login->prof_arr[i].skb; + p += _sprintf(p, buf, "==============\n"); + p += _sprintf(p, buf, "SKB[%d] CNT %d\n", i, login->prof_arr[i].cnt); + p += _sprintf(p, buf, "len %d\n", skb->len); + p += _sprintf(p, buf, "data_len %d\n", skb->data_len); + p += _sprintf(p, buf, "head_len %d\n", skb_headlen(skb)); + p += _sprintf(p, buf, "gso %d\n", skb_is_gso(skb)); + p += _sprintf(p, buf, "nr_frags %d\n", login->prof_arr[i].nr_frags); + p += _sprintf(p, buf, "jiffies %lu\n", login->prof_arr[i].jiffies); + p += _sprintf(p, buf, "msecs %u\n", + jiffies_to_msecs(login->prof_arr[i].jiffies)); + p += _sprintf(p, buf, "msecs_diff %u\n", + jiffies_to_msecs(login->prof_arr[i].jiffies) - + jiffies_to_msecs(login->prof_arr[i ? i -1 : 0].jiffies)); + } + + return (ssize_t)(p - buf); +} + +#endif + +static int get_guid(u8 *guid, char *s) +{ + if (sscanf(s, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", + guid + 0, guid + 1, guid + 2, guid + 3, guid + 4, + guid + 5, guid + 6, guid + 7) != 8) + return -1; + + return 0; +} + +static int get_mac(u8 *mac, char *s) +{ + if (sscanf(s, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", + mac + 0, mac + 1, mac + 2, mac + 3, mac + 4, + mac + 5) != 6) + return -1; + + return 0; +} + +static int get_ipv4(short unsigned int *ip, char *s) +{ + if (sscanf(s, "%hu.%hu.%hu.%hu", ip + 0, ip + 1, ip + 2, ip + 3) != 4) + return -1; + + return 0; +} + +static int get_parent(struct vnic_port *port, char *parent) +{ + struct net_device *parent_netdev; + + /* check parent syntax */ + if (!dev_valid_name(parent)) + return -EINVAL; + + parent_netdev = dev_get_by_name(&init_net, parent); + if (parent_netdev) + dev_put(parent_netdev); + + return parent_netdev ? 
0 : -ENODATA;
+}
+
+static struct fip_hadmin_cache *get_hadmin_entry(void)
+{
+	struct fip_hadmin_cache *hadmin_entry;
+
+	hadmin_entry = kzalloc(sizeof *hadmin_entry, GFP_ATOMIC);
+	if (!hadmin_entry)
+		return NULL;
+
+	hadmin_entry->vnic_id = NOT_AVAILABLE_NUM;
+	hadmin_entry->gw_port_id = NOT_AVAILABLE_NUM;
+
+	return hadmin_entry;
+}
+
+void vnic_login_cmd_init(struct fip_hadmin_cmd *cmd)
+{
+	char *buf = (char *)cmd;
+	u8 i;
+
+	for (i = 0; i < MAX_INPUT_ARG; ++i)
+		sprintf(buf + (i * MAX_INPUT_LEN), NOT_AVAILABLE_STRING);
+}
+
+int vnic_login_cmd_set(char *buf, struct fip_hadmin_cmd *cmd)
+{
+	int count;
+
+	if (cmd) {
+		count = sprintf(buf, "name=%s mac=%s vnic_id=%s vid=%s "
+				"bxname=%s bxguid=%s eport=%s ipv4=%s ipv6=%s "
+				"emac=%s pkey=%s parent=%s\n",
+				cmd->c_name, cmd->c_mac, cmd->c_vnic_id,
+				cmd->c_vid, cmd->c_bxname, cmd->c_bxguid,
+				cmd->c_eport, cmd->c_ipv4, cmd->c_ipv6,
+				cmd->c_emac, cmd->c_pkey, cmd->c_parent);
+		vnic_dbg_sysfs((char *)(cmd->c_name), "cmd: %s", buf);
+	} else /* print the cmd syntax */
+		count = sprintf(buf, "name=%%s mac=%%s vnic_id=%%s vid=%%s "
+				"bxname=%%s bxguid=%%s eport=%%s ipv4=%%s "
+				"ipv6=%%s emac=%%s pkey=%%s parent=%%s\n");
+
+	return count;
+}
+
+/* create/destroy a child vNic; syntax example ('+' adds, '-' removes):
+ * +00:11:22:33:44:55
+ */
+static ssize_t vnic_child_write(struct module_attribute *attr,
+				__MODULE_KOBJ_TYPE *mod,
+				const char *buf, size_t count)
+{
+	struct vnic_sysfs_attr *vnic_dentry =
+		container_of(attr, struct vnic_sysfs_attr, dentry);
+	struct vnic_login *login = vnic_dentry->ctx;
+	char action = buf[0];
+	char *buf_mac = (char *)buf + 1;
+	int remove = -1;
+	u8 mac[ETH_ALEN];
+
+	if (action == '-')
+		remove = 1;
+	if (action == '+')
+		remove = 0;
+
+	if (remove < 0 || get_mac(mac, buf_mac) || !is_valid_ether_addr(mac))
+		return -EINVAL;
+
+	vnic_learn_mac(login->dev, mac, remove);
+	return count;
+}
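+/*
+ * Illustrative host-admin request (not taken from the driver sources): the
+ * string written to the per-port "host_add_<ca>_<port>" sysfs file must carry
+ * all MAX_INPUT_ARG fields of the syntax printed by vnic_login_cmd_set(),
+ * e.g. (values are made up; the exact placeholder for unused fields depends
+ * on NOT_AVAILABLE_STRING):
+ *
+ *	name=evnic0 mac=00:11:22:33:44:55 vnic_id=5 vid=0 bxname=bridge1
+ *	bxguid=00:02:c9:00:00:00:00:01 eport=A1 ipv4=N/A ipv6=N/A
+ *	emac=N/A pkey=N/A parent=N/A
+ */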
vnic_warn(name, "invalid vNic MAC %s\n", cmd->c_vnic_id); + rc = -EINVAL; + goto err; + } + + /* get interface name (must) */ + if ((!dev_valid_name(cmd->c_name) && !hadmin_entry->parent_used) || + ((strlen(cmd->c_name) > VNIC_NAME_LEN) && hadmin_entry->parent_used)) { + vnic_warn(name, "invalid vNic name %s\n", cmd->c_name); + rc = -EINVAL; + goto err; + } + + strncpy(hadmin_entry->interface_name, cmd->c_name, + sizeof(hadmin_entry->interface_name)); + + /* get BX GUID, if fails, get BX NAME */ + if (get_guid(hadmin_entry->system_guid, cmd->c_bxguid)) { + strncpy(hadmin_entry->system_name, cmd->c_bxname, + sizeof(hadmin_entry->system_name)); + vnic_dbg_sysfs(name, "use BX NAME %s\n", cmd->c_bxname); + } + + /* get shared emac/ip */ + if (!get_ipv4((short unsigned int *)hadmin_entry->shared_vnic_ip, + cmd->c_ipv4)) { + /* TODO, add IPv6 support for shared vNic */ + get_mac(hadmin_entry->shared_vnic_mac, cmd->c_emac); + vnic_dbg_sysfs(name, "use shared ip/mac\n"); + } + +#ifndef VLAN_GROUP_ARRAY_LEN +#define VLAN_GROUP_ARRAY_LEN VLAN_N_VID +#endif + + /* get VLAN field (dec) */ + if ((sscanf(cmd->c_vid, "%d", &num) == 1) && + num < VLAN_GROUP_ARRAY_LEN && num >= 0) { + /* set other fields on success, skip on failure */ + vnic_dbg_sysfs(name, "vlan set 0x%x\n", hadmin_entry->vlan); + hadmin_entry->vlan_used = 1; + hadmin_entry->vlan = (u16)num; + } else if (!strcmp(cmd->c_vid, ALL_VLAN_GW_VID)) { + /* Dont set 'vlan_used'. the code counts on it being NULL for + * host admin vnics in all_vlan mode, when Vlans are used */ + hadmin_entry->vlan = 0; + hadmin_entry->all_vlan_gw = 1; + } + + /* get eport name */ + if (!strlen(cmd->c_eport)) { + vnic_warn(name, "invalid eport name %s\n", cmd->c_eport); + rc = -EINVAL; + goto err; + } + strncpy(hadmin_entry->eport_name, cmd->c_eport, + sizeof(hadmin_entry->eport_name)); + + /* set remove/add flag */ + vnic_dbg_sysfs(name, "%s hadmin vNic\n", remove ? "remove" : "add"); + hadmin_entry->remove = remove; + + /* set pkey (hex) */ + if ((sscanf(cmd->c_pkey, "%x", &num) != 1) || !num) + pkey = 0xffff; /* default */ + else + pkey = (u16)num | 0x8000; + vnic_dbg_sysfs(name, "pkey 0x%x\n", pkey); + + /* cannot sleep in this functions for child vnics flow + * (avoid schedule while atomic oops) + * TODO: check if holding start_stop_lock is needed here + */ + //mutex_lock(&port->start_stop_lock); + + list_for_each_entry(discover, &port->fip.discover_list, discover_list) { + if (discover->pkey == pkey) { + spin_lock_irq(&discover->lock); + + if (discover->flush != FIP_NO_FLUSH) { + rc = -EBUSY; + spin_unlock_irq(&discover->lock); + goto skip; + } + + /* check that this mac/vlan is not in the cache list + * (saves redundant queue_delayed_work call during + * vnic_learn_mac bursts) + */ + list_for_each_entry_reverse(hadmin_it, &discover->hadmin_cache, next) { + if (!memcmp(hadmin_entry->mac, hadmin_it->mac, ETH_ALEN) && + hadmin_entry->vlan == hadmin_it->vlan && + hadmin_entry->remove == hadmin_it->remove) { + rc = -EEXIST; + spin_unlock_irq(&discover->lock); + goto skip; + } + } + list_add_tail(&hadmin_entry->next, &discover->hadmin_cache); + /* calls fip_discover_hadmin_update() */ + queue_delayed_work(fip_wq, &discover->hadmin_update_task, HZ/10); + spin_unlock_irq(&discover->lock); + goto updated_discover; + } + } + + //mutex_unlock(&port->start_stop_lock); + vnic_dbg_sysfs(name, "Requested PKEY=0x%x is not configured\n", pkey); + goto skip; + +err: + vnic_dbg_sysfs(name, "Invalid host admin request format string. 
Request rejected\n"); +skip: + kfree(hadmin_entry); + return rc; + +updated_discover: + //mutex_unlock(&port->start_stop_lock); + return count; +} + +static ssize_t vnic_login_cmd(struct module_attribute *attr, + __MODULE_KOBJ_TYPE *mod, char *buf) +{ + char *p = buf; + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct fip_vnic_data *vnic_fip = vnic_dentry->ctx; + struct fip_hadmin_cmd *cmd; + + if (!vnic_fip || !vnic_fip->hadmined) + goto out; + + cmd = &vnic_fip->cmd; + p += _sprintf(p, buf, "name=%s mac=%s vnic_id=%s vid=%s bxname=%s bxguid=%s " + "eport=%s ipv4=%s ipv6=%s emac=%s pkey=%s parent=%s ", + cmd->c_name, cmd->c_mac, cmd->c_vnic_id, cmd->c_vid, + cmd->c_bxname, cmd->c_bxguid, cmd->c_eport, cmd->c_ipv4, + cmd->c_ipv6, cmd->c_emac, cmd->c_pkey, cmd->c_parent); + p += _sprintf(p, buf, "ib_port=%s", vnic_fip->port->name); + p += _sprintf(p, buf, "\n"); + +out: + return (ssize_t)(p - buf); +} + +int vnic_create_hadmin_dentry(struct fip_vnic_data *vnic) +{ + char name[VNIC_SYSFS_FLEN]; + + DENTRY_CREATE(vnic, &vnic->dentry, + vnic_dentry_name(name, vnic, "cmd"), + vnic_login_cmd, NULL); + return 0; +} + +void vnic_delete_hadmin_dentry(struct fip_vnic_data *vnic) +{ + if (vnic->dentry.ctx) + DENTRY_REMOVE(&vnic->dentry); +} + +int vnic_create_dentry(struct vnic_login *login) +{ + int i = 0; + char name[VNIC_SYSFS_FLEN]; + + DENTRY_CREATE(login, &login->dentries[i++], + login_dentry_name(name, login, "info"), + vnic_login_show, NULL); + DENTRY_CREATE(login, &login->dentries[i++], + login_dentry_name(name, login, "child"), + NULL, vnic_child_write); + DENTRY_CREATE(login, &login->dentries[i++], + login_dentry_name(name, login, "neigh"), + vnic_neigh_show, NULL); + DENTRY_CREATE(login, &login->dentries[i++], + login_dentry_name(name, login, "qps"), + vnic_qps_show, NULL); + DENTRY_CREATE(login, &login->dentries[i++], + login_dentry_name(name, login, "member"), + vnic_member_show, NULL); + +#ifdef VNIC_PROFILLNG + DENTRY_CREATE(login, &login->dentries[i++], + login_dentry_name(name, login, "prof_skb"), + vnic_dentry_prof_skb_show, NULL); +#endif + return 0; +} + +void vnic_delete_dentry(struct vnic_login *login) +{ + int i; + + for (i = 0; i < VNIC_MAX_DENTRIES; ++i) { + if (login->dentries[i].ctx) + DENTRY_REMOVE(&login->dentries[i]); + } +} + +static ssize_t port_gw_fs_show(struct module_attribute *attr, + __MODULE_KOBJ_TYPE *mod, char *buf) +{ + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_port *port = vnic_dentry->ctx; + + return fip_gw_sysfs_show(port, buf); +} + + +static ssize_t port_vnics_fs_show(struct module_attribute *attr, + __MODULE_KOBJ_TYPE *mod, char *buf) +{ + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_port *port = vnic_dentry->ctx; + return port_vnics_sysfs_show(port, buf); +} + +static ssize_t port_hadmin_syntax(struct module_attribute *attr, + __MODULE_KOBJ_TYPE *mod, char *buf) +{ + /* print cmd syntax only (for usage) */ + return vnic_login_cmd_set(buf, NULL); +} + +static ssize_t port_hadmin_add_write(struct module_attribute *attr, + __MODULE_KOBJ_TYPE *mod, + const char *buf, size_t count) +{ + struct vnic_sysfs_attr *vnic_dentry = + container_of(attr, struct vnic_sysfs_attr, dentry); + struct vnic_port *port = vnic_dentry->ctx; + + return fip_hadmin_sysfs_update(port, buf, count, 0); +} + +static ssize_t port_hadmin_del_write(struct module_attribute *attr, + __MODULE_KOBJ_TYPE *mod, + const 
char *buf, size_t count)
+{
+	struct vnic_sysfs_attr *vnic_dentry =
+		container_of(attr, struct vnic_sysfs_attr, dentry);
+	struct vnic_port *port = vnic_dentry->ctx;
+
+	return fip_hadmin_sysfs_update(port, buf, count, 1);
+}
+
+int port_fs_init(struct vnic_port *port)
+{
+	int i = 0;
+	char name[VNIC_SYSFS_FLEN];
+
+	DENTRY_CREATE(port, &port->dentries[i++],
+		      port_dentry_name(name, port, "host_add"),
+		      port_hadmin_syntax, port_hadmin_add_write);
+
+	DENTRY_CREATE(port, &port->dentries[i++],
+		      port_dentry_name(name, port, "host_del"),
+		      port_hadmin_syntax, port_hadmin_del_write);
+
+	DENTRY_CREATE(port, &port->dentries[i++],
+		      port_dentry_name(name, port, "gws"),
+		      port_gw_fs_show, NULL);
+
+	DENTRY_CREATE(port, &port->dentries[i++],
+		      port_dentry_name(name, port, "vnics"),
+		      port_vnics_fs_show, NULL);
+	return 0;
+}
+
+void port_fs_exit(struct vnic_port *port)
+{
+	int i;
+
+	for (i = 0; i < VNIC_MAX_DENTRIES; ++i) {
+		if (port->dentries[i].ctx)
+			DENTRY_REMOVE(&port->dentries[i]);
+	}
+}
+
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_ib.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_ib.c
new file mode 100644
index 0000000000000..ba6e93bb85ce9
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_ib.c
@@ -0,0 +1,1649 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/ip.h>
+#include <linux/if_vlan.h>
+
+#include "vnic.h"
+#include "vnic_data.h"
+
+int vnic_post_recv(struct vnic_rx_ring *ring, u64 wr_id)
+{
+	struct ib_recv_wr *bad_wr;
+	int i, rc;
+
+	ring->wr.wr_id = wr_id;
+
+	for (i = 0; i < ring->num_frags; i++)
+		ring->sge[i].addr = ring->rx_info[wr_id].dma_addr[i];
+
+	rc = ib_post_srq_recv(ring->srq, &ring->wr, &bad_wr);
+	if (unlikely(rc)) {
+		/* we will not use a lock here. In the worst case we will have
+		 * an incorrect value of need_refill. Not a biggie
+		 */
+
+		/*ring->rx_info[wr_id].info = VNIC_FRAG_NOT_POSTED;
+		ring->need_refill = 1;
+		*/
+		vnic_dbg_data(ring->port->name, "receive failed for buf %llu (%d)\n",
+			      wr_id, rc);
+	}
+
+	return rc;
+}
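+/*
+ * RX posting flow, as a sketch (the exact bring-up order is an assumption
+ * based on the declarations in vnic_data.h, not taken verbatim from the
+ * driver): a ring is created with vnic_create_rx_ring(), its buffers are
+ * allocated and DMA-mapped, and then every WQE slot is posted once via
+ * vnic_post_recvs() below; afterwards each completion reposts its own slot
+ * with vnic_post_recv(ring, wr_id).
+ */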
+static void vnic_dealloc_tx_skb(struct vnic_login *login, unsigned cq_index,
+				u64 wr_id)
+{
+	struct vnic_tx_res *tx_res = &login->tx_res[cq_index];
+	int is_inline = !!(wr_id & VNIC_SEND_INLINE_FLAG);
+	struct sk_buff *skb;
+	u64 *mapping;
+	int i, off = 0;
+
+	wr_id &= ~VNIC_SEND_INLINE_FLAG;
+	skb = tx_res->tx_ring[wr_id].skb;
+	ASSERT(skb);
+	mapping = tx_res->tx_ring[wr_id].mapping;
+
+	if (!is_inline) {
+		if (!vnic_encap_headroom && !skb_is_gso(skb)) {
+			ib_dma_unmap_single(login->port->dev->ca, mapping[off],
+					    VNIC_ENCAP_LEN, DMA_TO_DEVICE);
+			off++;
+		}
+		if (skb_headlen(skb)) {
+			ib_dma_unmap_single(login->port->dev->ca, mapping[off],
+					    skb_headlen(skb), DMA_TO_DEVICE);
+			off++;
+		}
+		for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			ib_dma_unmap_page(login->port->dev->ca,
+					  mapping[i + off], frag->size,
+					  DMA_TO_DEVICE);
+		}
+	}
+
+	/* dealloc skb */
+	dev_kfree_skb_any(skb);
+	tx_res->tx_ring[wr_id].skb = NULL;
+}
+
+static void vnic_ib_handle_tx_wc(struct vnic_login *login,
+				 int tx_res_index, struct ib_wc *wc)
+{
+	struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index];
+	u64 wr_id = wc->wr_id & ~VNIC_SEND_INLINE_FLAG;
+
+	vnic_dbg_data(login->name, "send completion: wr_id %llu, status: %d "
+		      "[head %d - tail %d]\n", wr_id, wc->status,
+		      tx_res->tx_head, tx_res->tx_tail);
+
+	ASSERT(wr_id < vnic_tx_rings_len);
+	vnic_dealloc_tx_skb(login, tx_res_index, wc->wr_id);
+
+	++tx_res->tx_tail;
+	--tx_res->tx_outstanding;
+
+	if (unlikely(wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR)) {
+		vnic_warn(login->name, "failed send event "
+			  "(status %d, wr_id %llu, vend_err 0x%x)\n",
+			  wc->status, wr_id, wc->vendor_err);
+		vnic_warn(login->name, "TX CQE error, queueing rings restart\n");
+		if (!login->queue_stopped)
+			queue_delayed_work(login_wq, &login->restart_task, HZ / 100);
+	}
+}
+
+int vnic_post_recvs(struct vnic_rx_ring *ring)
+{
+	int i, rc;
+
+	for (i = 0; i < ring->size; i++) {
+		rc = vnic_post_recv(ring, i);
+		if (rc) {
+			vnic_err(ring->port->name, "Failed post receive %d\n", rc);
+			return rc;
+		}
+	}
+
+	return 0;
+}
+
+static int vnic_vlan_is_valid(struct vnic_login *login,
+			      struct vlan_ethhdr *veth)
+{
+	ASSERT(veth->h_vlan_proto == htons(ETH_P_8021Q));
+	if ((be16_to_cpu(veth->h_vlan_TCI) & 0xfff) !=
+	    be16_to_cpu(login->vid)) {
+		vnic_dbg_data(login->name, "invalid vlan, ingress vid "
+			      "0x%x, login: vid 0x%x vlan_used %d\n",
+			      be16_to_cpu(veth->h_vlan_TCI),
+			      be16_to_cpu(login->vid),
+			      login->vlan_used);
+		return 0;
+	}
+
+	return 1;
+}
+
+/* If a vlan tag should exist in the eth_hdr - validate it.
+   is_vlan_proto is set if the vlan protocol is present in the eth header.
+   Return values: 0 on success, 1 on error.
+   For an all-VLAN gateway (promisc vlan):
+	0 - there is no vlan, or there is a vlan and it is valid.
+	1 - a vlan is present and it is not valid.
+   For all other vNics:
+	0 - there shouldn't be a vlan, or a vlan should be present and is valid.
+	1 - a vlan should be present and it is not, or it is not valid.
*/ +static int validate_vnic_vlan(struct vnic_login *login, + struct vlan_ethhdr *veth, + int *is_vlan_proto) +{ + int is_vlan = !!(veth->h_vlan_proto == htons(ETH_P_8021Q)); + + *is_vlan_proto = is_vlan; + + if (login->all_vlan_gw) + return 0; + + if (VNIC_VLAN_ENABLED(login) && login->vid && !is_vlan) { + vnic_dbg_data(login->name, "missing vlan tag\n"); + VNIC_STATS_INC(login->port_stats.vlan_err); + return 1; + } + + if (is_vlan && unlikely(!vnic_vlan_is_valid(login, veth))) { + vnic_dbg_data(login->name, "invalid vlan tag\n"); + VNIC_STATS_INC(login->port_stats.vlan_err); + return 1; + } + + return 0; +} + +static void vnic_ib_handle_rx_wc_linear(struct vnic_login *login, + struct ib_wc *wc, int rx_ring_index) +{ + struct vnic_rx_ring *ring = login->port->rx_ring[rx_ring_index]; + struct eoibhdr *eoib_hdr; + struct sk_buff *skb; + struct vlan_ethhdr *veth; + int rc, wr_id = wc->wr_id, checksum_ok, ip_summed, + buf_size = VNIC_BUF_SIZE(ring->port); + int is_vlan_proto; + u64 mapping; + u16 eth_type; + u8 *va, *eth_hdr; + + spin_lock_bh(&ring->lock); + ASSERT(wr_id < ring->size); + + skb = ring->rx_info[wr_id].skb; + mapping = ring->rx_info[wr_id].dma_addr[0]; + + /* termination with error */ + if (unlikely(wc->status != IB_WC_SUCCESS)) { + if(wc->status != IB_WC_REM_ABORT_ERR && + wc->status != IB_WC_LOC_LEN_ERR) { + vnic_dbg_data(login->name, "RX CQE error " + "(status %d, vend_err 0x%x), " + "queueing rings restart\n", + wc->status, wc->vendor_err); + if (!login->queue_stopped) + queue_delayed_work(login_wq, + &login->restart_task, + HZ / 10); + } + goto repost; + } + + ASSERT(skb); + ASSERT(mapping); + + /* If we can't allocate a new RX buffer, dump + * this packet and reuse the old buffer. + */ + if (unlikely(!vnic_alloc_rx_skb(ring, wr_id, GFP_ATOMIC))) { + VNIC_STATS_DO_INC(login->stats.rx_dropped); + goto repost; + } + + ib_dma_unmap_single(login->port->dev->ca, mapping, + buf_size, DMA_FROM_DEVICE); + skb_put(skb, wc->byte_len); + skb_pull(skb, IB_GRH_BYTES); + + /* check EoIB header signature and version */ + va = skb->data; + eoib_hdr = (struct eoibhdr *)va; + if (unlikely(VNIC_EOIB_HDR_GET_SIG(eoib_hdr) != VNIC_EOIB_HDR_SIG || + VNIC_EOIB_HDR_GET_VER(eoib_hdr) != VNIC_EOIB_HDR_VER)) { + vnic_dbg_data(login->name, "bad sig (0x%x) or ver (0x%x)\n", + VNIC_EOIB_HDR_GET_SIG(eoib_hdr), + VNIC_EOIB_HDR_GET_VER(eoib_hdr)); + VNIC_STATS_INC(login->port_stats.sig_ver_err); + goto repost; + } + + /* check EoIB CSUM */ + checksum_ok = login->rx_csum && VNIC_CSUM_OK(eoib_hdr); + ip_summed = checksum_ok ? 
CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + if (likely((checksum_ok))) + VNIC_STATS_INC(login->port_stats.rx_chksum_good); + else + VNIC_STATS_INC(login->port_stats.rx_chksum_none); + + /* Ethernet header */ + skb_pull(skb, VNIC_ENCAP_LEN); + va += VNIC_ENCAP_LEN; + veth = (struct vlan_ethhdr *)(va); + + eth_hdr = va; + eth_type = be16_to_cpu(((struct ethhdr *)(va))->h_proto); + + /* validate VLAN tag, strip it if valid */ + if (validate_vnic_vlan(login, veth, &is_vlan_proto)) + goto repost; + + /* for all_vlan_gw - we don't strip the packet but send it as is*/ + if (!login->all_vlan_gw && is_vlan_proto) { + eth_type = be16_to_cpu(veth->h_vlan_encapsulated_proto); + eth_hdr += VLAN_HLEN; + skb_pull(skb, VLAN_HLEN); + memmove(eth_hdr, va, ETH_ALEN * 2); + } + + /* update skb fields, keep this before LRO/GRO funcs */ + skb->dev = login->dev; + skb->protocol = eth_type_trans(skb, skb->dev); + skb->ip_summed = ip_summed; + +#if defined(NETIF_F_GRO) && !defined(_BP_NO_GRO) + if ((login->dev->features & NETIF_F_GRO) && checksum_ok) { + struct vnic_rx_res *rx_res = &login->rx_res[rx_ring_index]; + int ret; + + ret = napi_gro_receive(&rx_res->napi, skb); + if (ret == GRO_HELD) + VNIC_STATS_INC(login->port_stats.gro_held); + else if (ret == GRO_NORMAL) + VNIC_STATS_INC(login->port_stats.gro_normal); + else if (ret == GRO_MERGED || ret == GRO_MERGED_FREE) + VNIC_STATS_INC(login->port_stats.gro_merged); + else + VNIC_STATS_INC(login->port_stats.gro_drop); + + goto rx_repost; + } +#elif defined(NETIF_F_LRO) + if (login->dev->features & NETIF_F_LRO && checksum_ok) { + struct vnic_rx_res *rx_res = &login->rx_res[rx_ring_index]; + + /* processed for LRO */ + lro_receive_skb(&rx_res->lro, skb, NULL); + VNIC_STATS_INC(login->port_stats.lro_aggregated); + + goto rx_repost; + } +#endif + + rc = vnic_rx(login, skb, wc); + if (unlikely(rc)) { + vnic_dbg_data(login->name, "vnic_rx failed, rc %d\n", rc); + goto repost; + } + +rx_repost: + VNIC_STATS_INC(ring->stats.rx_packets); + VNIC_STATS_ADD(ring->stats.rx_bytes, wc->byte_len); + + VNIC_STATS_DO_INC(login->stats.rx_packets); + VNIC_STATS_DO_ADD(login->stats.rx_bytes, wc->byte_len); + + if (unlikely(vnic_post_recv(ring, wr_id))) + vnic_dbg_data(login->name, "failed to post RX WQE id %d\n", + (int)wr_id); + spin_unlock_bh(&ring->lock); + + return; + +repost: + login->dev->last_rx = jiffies; + if (unlikely(vnic_post_recv(ring, wr_id))) + vnic_dbg_data(login->name, "failed to post RX WQE id %d\n", + (int)wr_id); + + VNIC_STATS_INC(ring->stats.rx_dropped); + VNIC_STATS_DO_INC(login->stats.rx_dropped); + spin_unlock_bh(&ring->lock); + + return; +} + +static void vnic_ib_handle_rx_wc(struct vnic_login *login, + struct ib_wc *wc, int rx_ring_index) +{ + struct vnic_rx_ring *ring = login->port->rx_ring[rx_ring_index]; + struct ib_device *ib_device = login->port->dev->ca; + struct vnic_frag_data *frags_entry; + struct skb_frag_struct frags[VNIC_MAX_RX_FRAGS] = {}; + struct eoibhdr *eoib_hdr; + struct vlan_ethhdr *veth; + struct iphdr *ip_hdr; + u64 wr_id = wc->wr_id; + u16 eth_type; + u8 *va, *eth_hdr, ip_type; + int rc, checksum_ok, ip_offset = ETH_HLEN, + packet_length = wc->byte_len - VNIC_EOIB_HDR_SIZE, + page_offset = VNIC_EOIB_HDR_SIZE, ip_summed; + int is_vlan_proto; + + spin_lock_bh(&ring->lock); + ASSERT(wr_id < ring->size); + + /* termination with error */ + if (unlikely(wc->status != IB_WC_SUCCESS)) { + if(wc->status != IB_WC_REM_ABORT_ERR && + wc->status != IB_WC_LOC_LEN_ERR) { + vnic_dbg_data(login->name, "RX CQE error " + "(status %d, vend_err 0x%x), " + 
"queueing rings restart\n", + wc->status, wc->vendor_err); + if (!login->queue_stopped) + queue_delayed_work(login_wq, &login->restart_task, HZ / 10); + goto out; + } + goto drop_repost; + } + + frags_entry = &ring->rx_info[wr_id]; + + /* ensure cache coherency for packet headers and get vq */ + ib_dma_sync_single_for_cpu(ib_device, + ring->rx_info[wr_id].dma_addr[0] + IB_GRH_BYTES, + MAX_HEADER_SIZE, DMA_FROM_DEVICE); + + va = page_address(ring->rx_info[wr_id].frags[0].page.p) + + ring->rx_info[wr_id].frags[0].page_offset + IB_GRH_BYTES; + + /* check EoIB header signature and version */ + eoib_hdr = (struct eoibhdr *)va; + if (unlikely(VNIC_EOIB_HDR_GET_SIG(eoib_hdr) != VNIC_EOIB_HDR_SIG || + VNIC_EOIB_HDR_GET_VER(eoib_hdr) != VNIC_EOIB_HDR_VER)) { + vnic_dbg_data(login->name, "bad sig (0x%x) or ver (0x%x)\n", + VNIC_EOIB_HDR_GET_SIG(eoib_hdr), + VNIC_EOIB_HDR_GET_VER(eoib_hdr)); + VNIC_STATS_INC(login->port_stats.sig_ver_err); + goto unmap_repost; + } + + /* check EoIB CSUM */ + checksum_ok = login->rx_csum && VNIC_CSUM_OK(eoib_hdr); + ip_summed = checksum_ok ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + if (likely((checksum_ok))) + VNIC_STATS_INC(login->port_stats.rx_chksum_good); + else + VNIC_STATS_INC(login->port_stats.rx_chksum_none); + + /* Ethernet header */ + va += VNIC_ENCAP_LEN; + veth = (struct vlan_ethhdr *)(va); + + eth_hdr = va; + eth_type = be16_to_cpu(((struct ethhdr *)(va))->h_proto); + + /* validate VLAN tag, strip it if valid + * - if VID is set and !0, then VLAN tag must exist + * note: VID zero can accept untagged packets + * - if ingress VID exists: validate it, and update the packet + * note: rx user prio is ignored + * - else; it's valid untagged packet + */ + if (validate_vnic_vlan(login, veth, &is_vlan_proto)) + goto unmap_repost; + + /* for all_vlan_gw - we don't strip the packet but send it as is*/ + if (!login->all_vlan_gw && is_vlan_proto) { + ip_offset += VLAN_HLEN; + page_offset += VLAN_HLEN; + packet_length -= VLAN_HLEN; + eth_hdr += VLAN_HLEN; + eth_type = be16_to_cpu(veth->h_vlan_encapsulated_proto); + memmove(eth_hdr, va, ETH_ALEN * 2); + } + + /* IP header */ + va += ip_offset; + ip_hdr = (struct iphdr *)va; + ip_type = ip_hdr->protocol; + + ib_dma_sync_single_for_device(ib_device, + frags_entry->dma_addr[0] + IB_GRH_BYTES, + MAX_HEADER_SIZE, DMA_FROM_DEVICE); + +#if defined(NETIF_F_GRO) && !defined(_BP_NO_GRO) + if ((login->dev->features & NETIF_F_GRO) && checksum_ok) { + struct vnic_rx_res *rx_res = &login->rx_res[rx_ring_index]; + struct sk_buff *gro_skb; + struct skb_frag_struct *gro_frags; + int nr_frags, ret; + + gro_skb = napi_get_frags(&rx_res->napi); + if (!gro_skb) + goto drop_repost; + + gro_frags = skb_shinfo(gro_skb)->frags; + nr_frags = vnic_unmap_and_replace_rx(ring, ib_device, + gro_frags, wr_id, + wc->byte_len); + if (unlikely(!nr_frags)) + goto drop_repost; + + /* disregard GRH and eoib headers */ + gro_frags[0].page_offset += page_offset; + gro_frags[0].size -= page_offset; + + skb_shinfo(gro_skb)->nr_frags = nr_frags; + gro_skb->len = packet_length; + gro_skb->data_len = packet_length; + gro_skb->truesize += packet_length; + gro_skb->ip_summed = CHECKSUM_UNNECESSARY; + + /* processed for GRO */ + skb_record_rx_queue(gro_skb, rx_res->index); + ret = napi_gro_frags(&rx_res->napi); + if (ret == GRO_HELD) + VNIC_STATS_INC(login->port_stats.gro_held); + else if (ret == GRO_NORMAL) + VNIC_STATS_INC(login->port_stats.gro_normal); + else if (ret == GRO_MERGED || ret == GRO_MERGED_FREE) + VNIC_STATS_INC(login->port_stats.gro_merged); + else + 
VNIC_STATS_INC(login->port_stats.gro_drop); + + goto rx_repost; + } +#elif defined(NETIF_F_LRO) + if (login->dev->features & NETIF_F_LRO && checksum_ok && + eth_type == ETH_P_IP && ip_type == IPPROTO_TCP) { + struct vnic_rx_res *rx_res = &login->rx_res[rx_ring_index]; + int nr_frags; + + /* unmap the needed fragments and reallocate them. + * Fragments that were not used will be reused as is. */ + nr_frags = vnic_unmap_and_replace_rx(ring, ib_device, frags, + wr_id, wc->byte_len); + if (unlikely(!nr_frags)) + goto drop_repost; + + /* disregard GRH and eoib headers */ + frags[0].page_offset += page_offset; + frags[0].size -= page_offset; + + /* processed for LRO */ +#if defined(CONFIG_COMPAT_LRO_ENABLED) + lro_receive_frags(&rx_res->lro, frags, packet_length, + packet_length, NULL, 0); +#endif + VNIC_STATS_INC(login->port_stats.lro_aggregated); + + goto rx_repost; + } +#endif + + rc = vnic_rx_skb(login, ring, wc, ip_summed, eth_hdr); + if (unlikely(rc)) { + vnic_dbg_data(login->name, "vnic_rx_skb failed, rc %d\n", rc); + goto drop_repost; + } + +rx_repost: + /* must hold lock when touching login->stats so the stats + * task won't read invalid values + */ + spin_lock(&login->stats_lock); + VNIC_STATS_INC(ring->stats.rx_packets); + VNIC_STATS_ADD(ring->stats.rx_bytes, packet_length); + + VNIC_STATS_DO_INC(login->stats.rx_packets); + VNIC_STATS_DO_ADD(login->stats.rx_bytes, packet_length); + spin_unlock(&login->stats_lock); + + login->dev->last_rx = jiffies; + if (vnic_post_recv(ring, wr_id)) + vnic_dbg_data(login->name, "vnic_post_recv failed, " + "wr_id %llu\n", wr_id); + spin_unlock_bh(&ring->lock); + + return; + +unmap_repost: + /* ignore rc of vnic_unmap_and_replace_rx() */ + vnic_unmap_and_replace_rx(ring, ib_device, frags, + wr_id, wc->byte_len); +drop_repost: + VNIC_STATS_INC(ring->stats.rx_dropped); + + spin_lock(&login->stats_lock); + VNIC_STATS_DO_INC(login->stats.rx_dropped); + spin_unlock(&login->stats_lock); + + if (vnic_post_recv(ring, wr_id)) + vnic_dbg_data(login->name, "vnic_post_recv failed, " + "wr_id %llu\n", wr_id); +out: + spin_unlock_bh(&ring->lock); + return; +} + +static inline void vnic_drain_tx_cq(struct vnic_login *login, + int tx_res_index) +{ + struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index]; + int n, i; + + do { + n = ib_poll_cq(tx_res->cq, VNIC_MAX_TX_CQE, tx_res->send_wc); + for (i = 0; i < n; ++i) + vnic_ib_handle_tx_wc(login, tx_res_index, + tx_res->send_wc + i); + } while (n == VNIC_MAX_TX_CQE); +} + +static void vnic_drain_arm_tx_cq(struct vnic_login *login, int tx_res_index) +{ + struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index]; + + ASSERT(login); + ASSERT(login->dev); + + /* drain CQ then [arm] it */ + vnic_drain_tx_cq(login, tx_res_index); + + /* in tx interrupt mode, arm TX CQ after every interrupt */ + if (!vnic_tx_polling && ib_req_notify_cq(tx_res->cq, IB_CQ_NEXT_COMP)) + vnic_dbg(login->name, "ib_req_notify_cq failed\n"); + else if (unlikely(VNIC_TXQ_STOPPED(tx_res) && + test_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state))) { + if ((tx_res->tx_outstanding <= vnic_tx_rings_len >> 1)) { + if (!test_bit(VNIC_STATE_NETDEV_NO_TX_ENABLE, &login->netdev_state)) { + VNIC_STATS_DO_INC(login->port_stats.wake_queue); + VNIC_TXQ_WAKE(tx_res); + } + /* make sure that after arming the cq, there is no access to + * login fields to avoid conflict with cq event handler. 
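+ * (once the CQ is armed, vnic_comp_handler_tx() may run concurrently + * on another CPU and touch the same tx_res) 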
+ * i.e., ib_req_notify_cq() must come at the end of this func + */ + } else if (ib_req_notify_cq(tx_res->cq, IB_CQ_NEXT_COMP)) { + vnic_dbg(login->name, "ib_req_notify_cq failed\n"); + /* TODO: have to reset the device here */ + } + } +} + +static inline void vnic_comp_handler_tx(struct ib_cq *cq, void *ctx) +{ + struct vnic_tx_res *tx_res = ctx; + + if (!vnic_tx_polling) { + spin_lock(&tx_res->lock); + vnic_drain_arm_tx_cq(tx_res->login, tx_res->index); + spin_unlock(&tx_res->lock); + } else + vnic_drain_arm_tx_cq(tx_res->login, tx_res->index); + +} + +static int vnic_drain_rx_cq(struct vnic_login *login, int max_poll, + int rx_res_index) +{ + struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index]; + int polled, i; + + ASSERT(max_poll <= vnic_napi_weight); + polled = ib_poll_cq(rx_res->cq, max_poll, rx_res->recv_wc); + + for (i = 0; vnic_rx_linear && i < polled; ++i) + vnic_ib_handle_rx_wc_linear(login, &rx_res->recv_wc[i], + rx_res_index); + + for (i = 0; !vnic_rx_linear && i < polled; ++i) + vnic_ib_handle_rx_wc(login, &rx_res->recv_wc[i], + rx_res_index); + +#ifdef NETIF_F_LRO + /* Done CQ handling: flush all LRO sessions unconditionally */ + if (login->dev->features & NETIF_F_LRO) { + VNIC_STATS_INC(login->port_stats.lro_flushed); + lro_flush_all(&rx_res->lro); + } +#endif + + return polled; +} + +/* RX CQ polling - called by NAPI */ +#ifndef _BP_NAPI_POLL +int vnic_poll_cq_rx(struct napi_struct *napi, int budget) +{ + struct vnic_rx_res *rx_res = container_of(napi, struct vnic_rx_res, napi); + struct vnic_login *login = rx_res->login; + struct ib_cq *cq_rx = rx_res->cq; + int rx_res_index = rx_res->index, polled; + + /* shouldn't happen, since when stopped=1 NAPI is disabled */ + if (unlikely(rx_res->stopped)) { +#ifndef _BP_NAPI_NETIFRX + napi_complete(napi); +#else + netif_rx_complete(login->dev, napi); +#endif + return 0; + } + + polled = vnic_drain_rx_cq(login, min(budget, VNIC_MAX_RX_CQE), rx_res_index); + vnic_dbg_data(login->name, "after vnic_drain_rx_cq budget %d," + " done %d, index %d\n", budget, polled, rx_res_index); + + /* If we used up all the quota - we're probably not done yet... 
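+ NAPI contract: returning the full budget keeps this ring on the poll + list; we complete and re-arm the CQ only when it was not exhausted. 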
*/ + ASSERT(polled <= budget); + if (polled < budget) { + /* ATTENTION: ARM CQ must come after napi_complete() */ +#ifndef _BP_NAPI_NETIFRX + napi_complete(napi); +#else + netif_rx_complete(login->dev, napi); +#endif + /* Eventually calls vnic_comp_handler_rx() */ + if (ib_req_notify_cq(cq_rx, IB_CQ_NEXT_COMP)) + vnic_err(login->name, "ib_req_notify_cq failed\n"); + } + + return polled; +} +#else +int vnic_poll_cq_rx(struct net_device *poll_dev, int *budget) +{ + struct vnic_rx_res *rx_res = poll_dev->priv; + struct vnic_login *login = rx_res->login; + struct ib_cq *cq_rx = rx_res->cq; + int rx_res_index = rx_res->index, polled, max_poll = min(*budget, poll_dev->quota); + + /* shouldn't happen, since when stopped=1 NAPI is disabled */ + if (unlikely(rx_res->stopped)) { + netif_rx_complete(poll_dev); + return 0; + } + + while (max_poll >= 0) { + polled = vnic_drain_rx_cq(login, min(max_poll, VNIC_MAX_RX_CQE), rx_res_index); + if (polled <= 0) + break; + else { + poll_dev->quota -= polled; + *budget -= polled; + } + max_poll -= polled; + } + + if (!max_poll) + return 1; + + netif_rx_complete(poll_dev); + ib_req_notify_cq(cq_rx, IB_CQ_NEXT_COMP); + + return 0; +} +#endif + +static void vnic_comp_handler_rx(struct ib_cq *cq, void *rx_res_ptr) +{ + struct vnic_rx_res *rx_res = rx_res_ptr; + struct vnic_login *login = rx_res->login; + + ASSERT(rx_res->cq == cq); + ASSERT(login->dev); + + /* if this happens, we will re-arm later in vnic_open */ + if (unlikely(rx_res->stopped)) + return; + +#ifndef _BP_NAPI_POLL + /* calls vnic_poll_cq_rx() */ +#ifndef _BP_NAPI_NETIFRX + napi_schedule(&rx_res->napi); +#else + netif_rx_schedule(login->dev, &rx_res->napi); +#endif +#else + netif_rx_schedule(rx_res->poll_dev); +#endif /* _BP_NAPI_POLL */ + +} + +static void vnic_stop_qp(struct vnic_login *login, int qp_index) +{ + struct ib_qp_attr qp_attr = { .qp_state = IB_QPS_ERR }; + struct vnic_qp_res *qp_res = &login->qp_res[qp_index]; + struct vnic_rx_res *rx_res = &login->rx_res[qp_res->rx_index]; + struct vnic_tx_res *tx_res = &login->tx_res[qp_res->tx_index]; + struct vnic_rx_ring *ring = login->port->rx_ring[rx_res->index]; + unsigned long flags; + int polled, attr_mask, rc, i; + + /* move QP to ERR, wait for last WQE async event to drain the SRQ */ + rc = ib_modify_qp(qp_res->qp, &qp_attr, IB_QP_STATE); + if (rc) { + /* calls vnic_qp_event_handler() */ + vnic_warn(login->name, "failed to modify QP 0x%x to ERR state" + " (err = %d)\n", qp_res->qp->qp_num, rc); + /* continue anyway, but don't wait for completion */ + } else { + wait_for_completion(&qp_res->last_wqe_complete); + } + + /* === at this point, no NAPI/RX comps === */ + + /* drain TX CQ before moving to RESET, must hold tx_res->lock to + * protect from vnic_comp_handler_tx() after this call, all CQEs + * are polled (either by this direct call, or by CQ handlers) + */ + spin_lock_irqsave(&tx_res->lock, flags); + vnic_drain_tx_cq(login, tx_res->index); + spin_unlock_irqrestore(&tx_res->lock, flags); + + /* drain RX CQ before moving to RESET, drop and re-post all comps */ + spin_lock_bh(&ring->lock); + do { + polled = ib_poll_cq(rx_res->cq, VNIC_MAX_RX_CQE, rx_res->recv_wc); + for (i = 0; i < polled; ++i) + if (vnic_post_recv(ring, rx_res->recv_wc[i].wr_id)) + vnic_dbg_data(login->name, "vnic_post_recv failed, " + "wr_id %llu\n", rx_res->recv_wc[i].wr_id); + } while (polled == VNIC_MAX_RX_CQE); + spin_unlock_bh(&ring->lock); + + /* move QP to RESET */ + qp_attr.qp_state = IB_QPS_RESET; + rc = ib_modify_qp(qp_res->qp, &qp_attr, IB_QP_STATE); + if (rc) + 
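+ /* non-fatal: warn and keep tearing the QP down */ 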
vnic_warn(login->name, "failed to modify QP 0x%x to RESET" + " state (err = %d)\n", qp_res->qp->qp_num, rc); + + /* move QP to INIT to avoid multicast qp cache misses */ + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qkey = login->qkey; + qp_attr.port_num = login->port->num; + qp_attr.pkey_index = login->pkey_index; + attr_mask = IB_QP_QKEY | IB_QP_PORT | IB_QP_PKEY_INDEX | IB_QP_STATE; + + rc = ib_modify_qp(qp_res->qp, &qp_attr, attr_mask); + if (rc) + vnic_warn(login->name, "failed to modify QP 0x%x to INIT state" + " (err = %d)\n", qp_res->qp->qp_num, rc); +} + +int vnic_ib_stop(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + struct vnic_tx_res *tx_res; + unsigned long begin = jiffies; + int wr_id, i; + + /* flush tx and rx comps */ + for (i = 0; i < login->qps_num; ++i) + vnic_stop_qp(login, i); + + /* check any pending tx comps */ + for (i = 0; i < login->tx_rings_num; i++) { + tx_res = &login->tx_res[i]; + /* if tx_outstanding is non-zero, give it a chance to complete */ + if (!tx_res->tx_outstanding) + continue; + msleep(10); + + /* else, drain tx cq. This indicates that something is + * wrong, thus we won't protect vnic_comp_handler_tx() here + */ + while (tx_res->tx_outstanding && + time_before(jiffies, begin + 5 * HZ)) { + vnic_drain_tx_cq(login, i); + msleep(1); + } + + /* if they're still not complete, force skb deallocation */ + if (!tx_res->tx_outstanding) + continue; + vnic_warn(login->name, "timing out: %d sends not completed\n", + tx_res->tx_outstanding); + while (tx_res->tx_outstanding) { + wr_id = tx_res->tx_tail & (vnic_tx_rings_len - 1); + vnic_dealloc_tx_skb(login, i, wr_id); + ++tx_res->tx_tail; + --tx_res->tx_outstanding; + } + } + + return 0; +} + +int vnic_ib_open(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int i; + + /* move QP to RTS and attach to bcast group */ + for (i = 0; i < login->qps_num; ++i) { + if (vnic_init_qp(login, i)) { + vnic_err(login->name, "vnic_init_qp failed\n"); + goto stop_qps; + } + } + + return 0; + +stop_qps: + for (--i; i >= 0; --i) + vnic_stop_qp(login, i); + + return -EINVAL; +} + +void vnic_destroy_qp(struct vnic_login *login, int qp_index) +{ + struct ib_qp *qp = login->qp_res[qp_index].qp; + + if (!qp) + return; + if (ib_destroy_qp(qp)) + vnic_warn(login->name, "ib_destroy_qp failed\n"); + return; +} + +void vnic_qp_to_reset(struct vnic_login *login, struct ib_qp *qp) +{ + struct ib_qp_attr qp_attr; + int rc; + + qp_attr.qp_state = IB_QPS_RESET; + rc = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); + if (rc) + vnic_err(login->name, "ib_modify_qp 0x%06x to RESET err %d\n", + qp->qp_num, rc); +} + +int vnic_qp_to_init(struct vnic_login *login, struct ib_qp *qp, u32 qkey) +{ + struct ib_qp_attr qp_attr; + int attr_mask, rc; + + /* move QP to INIT */ + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qkey = qkey; + qp_attr.port_num = login->port->num; + /* pkey will be overwritten later by login->pkey_index */ + qp_attr.pkey_index = login->port->pkey_index; + attr_mask = IB_QP_QKEY | IB_QP_PORT | IB_QP_PKEY_INDEX | IB_QP_STATE; + + rc = ib_modify_qp(qp, &qp_attr, attr_mask); + if (rc) { + vnic_err(login->name, "ib_modify_qp 0x%06x to INIT err %d\n", + qp->qp_num, rc); + goto out_qp_reset; + } + + return 0; + +out_qp_reset: + vnic_qp_to_reset(login, qp); + return rc; +} + +int vnic_init_qp(struct vnic_login *login, int qp_index) +{ + struct ib_qp_attr qp_attr; + int attr_mask, rc, rc1; + struct ib_qp *qp = login->qp_res[qp_index].qp; + 
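+ /* standard UD QP bring-up: RESET -> INIT (qkey/port/pkey) -> RTR -> + * RTS (sq_psn); on any failure the QP is moved back to RESET + */ 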
init_completion(&login->qp_res[qp_index].last_wqe_complete); + /* move QP to INIT */ + qp_attr.qp_state = IB_QPS_INIT; + qp_attr.qkey = login->qkey; + qp_attr.port_num = login->port->num; + qp_attr.pkey_index = login->pkey_index; + attr_mask = IB_QP_QKEY | IB_QP_PORT | IB_QP_PKEY_INDEX | IB_QP_STATE; + + rc = ib_modify_qp(qp, &qp_attr, attr_mask); + if (rc) { + vnic_err(login->name, "ib_modify_qp to INIT err %d\n", rc); + goto out_qp_reset; + } + + /* move QP to RTR */ + qp_attr.qp_state = IB_QPS_RTR; + attr_mask &= ~IB_QP_PORT; + rc = ib_modify_qp(qp, &qp_attr, attr_mask); + if (rc) { + vnic_err(login->name, "ib_modify_qp to RTR err %d\n", rc); + goto out_qp_reset; + } + + /* move QP to RTS */ + qp_attr.qp_state = IB_QPS_RTS; + qp_attr.sq_psn = 0; + attr_mask |= IB_QP_SQ_PSN; + attr_mask &= ~IB_QP_PKEY_INDEX; + rc = ib_modify_qp(qp, &qp_attr, attr_mask); + if (rc) { + vnic_err(login->name, "ib_modify_qp to RTS err, rc %d\n", rc); + goto out_qp_reset; + } + + /* What a Good QP! */ + vnic_dbg_data(login->name, "qpn 0x%06x moved to RTS\n", + qp->qp_num); + + return 0; + +out_qp_reset: + qp_attr.qp_state = IB_QPS_RESET; + rc1 = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); + if (rc1) + vnic_err(login->name, "ib_modify_qp to RESET err %d\n", rc1); + + return rc; +} + +static void vnic_qp_event_handler(struct ib_event *event, void *ctx) +{ + struct vnic_qp_res *qp_res = ctx; + struct vnic_login *login = qp_res->login; + + ASSERT(login); + vnic_dbg_data(login->name, "[%s] qpn %d got event %d\n", + event->device->name, event->element.qp->qp_num, + event->event); + if (event->event == IB_EVENT_QP_LAST_WQE_REACHED) + complete(&qp_res->last_wqe_complete); +} + +void vnic_destroy_rx_res(struct vnic_login *login, int rx_res_index) +{ + struct ib_cq *cq = login->rx_res[rx_res_index].cq; + int rc = 0; + + if (cq) + rc = ib_destroy_cq(cq); + if (rc) + vnic_warn(login->name, "ib_destroy_cq() index %d failed\n", + rx_res_index); +} + +void vnic_destroy_tx_res(struct vnic_login *login, int tx_res_index) +{ + struct ib_cq *cq = login->tx_res[tx_res_index].cq; + struct vnic_tx_buf *tx_ring = login->tx_res[tx_res_index].tx_ring; + int rc = 0; + + if (tx_ring) + vfree(tx_ring); + if (cq) + rc = ib_destroy_cq(cq); + if (rc) + vnic_warn(login->name, "ib_destroy_cq() index %d failed\n", + tx_res_index); +} + +#if 0 +static inline int get_comp_vector(int index, struct vnic_port *port) +{ + int vector; + int num_cpus = roundup_pow_of_two(num_online_cpus()); + int port_for_eq; + + port_for_eq = (((index / port->dev->mdev->eq_per_port) % + port->dev->mdev->dev->caps.num_ports) + 1); + vector = (index % port->dev->mdev->eq_per_port) + + (port_for_eq * num_cpus); + + return vector; +} +#endif + +int vnic_create_rx_res(struct vnic_login *login, int rx_res_index) +{ + struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index]; + int comp_vector = rx_res_index % login->port->dev->ca->num_comp_vectors; + struct ib_cq *cq = + ib_create_cq(login->port->dev->ca, + vnic_comp_handler_rx, + NULL, &login->rx_res[rx_res_index], + vnic_rx_rings_len, comp_vector); + if (IS_ERR(cq)) { + vnic_err(login->name, "ib_create_cq failed, index %d, " + "comp_vector %d, rc %d\n", + rx_res_index, comp_vector, (int)PTR_ERR(cq)); + return -EINVAL; + } + + rx_res->cq = cq; + rx_res->index = rx_res_index; + rx_res->login = login; + + return 0; +} + +int vnic_create_tx_res(struct vnic_login *login, int tx_res_index) +{ + struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index]; + struct ib_cq *cq; + struct vnic_tx_buf *tx_ring; + int i, comp_vector; + + 
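+ /* the TX descriptor ring has vnic_tx_rings_len entries, which can be + * too large for kmalloc, hence vmalloc + */ 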
tx_ring = vmalloc(vnic_tx_rings_len * sizeof *tx_res->tx_ring); + if (!tx_ring) { + vnic_err(login->name, "vmalloc failed to allocate %u * %lu\n", + vnic_tx_rings_len, + (long unsigned int) (sizeof *tx_res->tx_ring)); + return -ENOMEM; + } + memset(tx_ring, 0, vnic_tx_rings_len * sizeof *tx_res->tx_ring); + + /* create TX CQ and set WQE drafts */ + tx_res->tx_wr.sg_list = tx_res->tx_sge; + tx_res->tx_wr.send_flags = IB_SEND_SIGNALED; + tx_res->tx_wr.wr.ud.remote_qkey = login->qkey; + + for (i = 0; i < VNIC_MAX_TX_FRAGS; ++i) + tx_res->tx_sge[i].lkey = login->port->mr->lkey; + + /* set mcast av draft*/ + memset(&tx_res->mcast_av, 0, sizeof(struct ib_ah_attr)); + tx_res->mcast_av.port_num = login->port->num; + tx_res->mcast_av.ah_flags = IB_AH_GRH; + + /* create tx cq */ + comp_vector = tx_res_index % login->port->dev->ca->num_comp_vectors; + cq = ib_create_cq(login->port->dev->ca, + vnic_comp_handler_tx, + NULL, &login->tx_res[tx_res_index], + vnic_tx_rings_len, comp_vector); + if (IS_ERR(cq)) { + vnic_err(login->name, "ib_create_cq failed, index %d, " + "comp_vector %d, rc %d\n", + tx_res_index, comp_vector, (int)PTR_ERR(cq)); + vfree(tx_ring); + return -EINVAL; + } + + tx_res->tx_ring = tx_ring; + tx_res->cq = cq; + tx_res->index = tx_res_index; + tx_res->login = login; + + return 0; +} + +int vnic_create_qp_range(struct vnic_login *login) +{ + int qp_index, create_flags = 0, rc; + struct ib_qp_init_attr *attr; + struct ib_qp *qps[VNIC_MAX_NUM_CPUS]; + struct vnic_qp_res *qp_res; + + attr = kzalloc(VNIC_MAX_NUM_CPUS * sizeof *attr, GFP_KERNEL); + if (!attr) + return -ENOMEM; + + create_flags |= login->port->dev->attr.device_cap_flags & + IB_DEVICE_BLOCK_MULTICAST_LOOPBACK ? + IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK : 0; + + /* TODO: rename IB_QP_CREATE_IPOIB_UD_LSO */ + create_flags |= login->port->dev->attr.device_cap_flags & + IB_DEVICE_UD_TSO ? 
+ IB_QP_CREATE_IPOIB_UD_LSO : 0; + + for (qp_index = 0; qp_index < login->qps_num; ++qp_index) { + qp_res = &login->qp_res[qp_index]; + qp_res->tx_index = qp_index % login->tx_rings_num; + qp_res->rx_index = qp_index % login->rx_rings_num; + memset(&attr[qp_index], 0, sizeof(struct ib_qp_init_attr)); + attr[qp_index].cap.max_send_wr = vnic_tx_rings_len; + attr[qp_index].cap.max_send_sge = VNIC_MAX_TX_FRAGS; + attr[qp_index].cap.max_recv_wr = 0; /* we use SRQ */ + attr[qp_index].cap.max_recv_sge = 0; + attr[qp_index].sq_sig_type = IB_SIGNAL_ALL_WR; + attr[qp_index].qp_type = IB_QPT_UD; + attr[qp_index].send_cq = login->tx_res[qp_res->tx_index].cq; + attr[qp_index].recv_cq = login->rx_res[qp_res->rx_index].cq; + attr[qp_index].srq = login->port->rx_ring[qp_res->rx_index]->srq; + attr[qp_index].event_handler = vnic_qp_event_handler; + attr[qp_index].qp_context = &login->qp_res[qp_index]; + attr[qp_index].create_flags = create_flags; + attr[qp_index].cap.max_inline_data = vnic_inline_tshold; + } + + + rc = vnic_ib_create_qp_range(login->port->pd, attr, NULL, + login->qps_num, login->qps_num, qps); + if (rc) { + vnic_err(login->name, "vnic_ib_create_qp_range failed, rc %d\n", rc); + goto err; + } + + for (qp_index = 0; qp_index < login->qps_num; ++qp_index) { + qp_res = &login->qp_res[qp_index]; + qp_res->qp = qps[qp_index]; + qp_res->login = login; + } + + for (qp_index = 0; qp_index < login->qps_num; ++qp_index) { + rc = vnic_qp_to_init(login, qps[qp_index], login->qkey); + if (rc) { + vnic_err(login->name, "vnic_qp_to_init failed, rc %d\n", rc); + goto destroy_qps; + } + } + + kfree(attr); + return 0; + +destroy_qps: + for (qp_index--; qp_index>=0; qp_index--) + vnic_qp_to_reset(login, qps[qp_index]); + + for (qp_index = 0; qp_index < login->qps_num; ++qp_index) + vnic_destroy_qp(login, qp_index); + +err: + kfree(attr); + return rc; +} + +static inline int use_inline(struct sk_buff *skb) +{ + return skb->len <= vnic_inline_tshold && !skb_shinfo(skb)->nr_frags; +} + +int vnic_post_send(struct vnic_login *login, int tx_res_index, + u64 wr_id, struct ib_ah *ah, u32 dqpn) +{ + struct ib_send_wr *bad_wr; + struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index]; + struct vnic_qp_res *qp_res = &login->qp_res[tx_res_index % login->qps_num]; + struct vnic_tx_buf *tx_req = &tx_res->tx_ring[wr_id]; + skb_frag_t *frags = skb_shinfo(tx_req->skb)->frags; + int nr_frags = skb_shinfo(tx_req->skb)->nr_frags, i, off = 0; + + ASSERT(qp_res); + ASSERT(tx_res); + ASSERT(qp_res->tx_index == tx_res->index); + ASSERT(qp_res->qp->send_cq == tx_res->cq); + + if (!vnic_encap_headroom && !skb_is_gso(tx_req->skb)) { + tx_res->tx_sge[off].addr = tx_req->mapping[off]; + tx_res->tx_sge[off].length = VNIC_ENCAP_LEN; + off++; + } + + if (likely(skb_headlen(tx_req->skb))) { + if (vnic_encap_headroom && use_inline(tx_req->skb)) { + tx_res->tx_wr.send_flags |= IB_SEND_INLINE; + wr_id |= VNIC_SEND_INLINE_FLAG; + tx_res->tx_sge[off].addr = (unsigned long)tx_req->skb->data; + } else { + tx_res->tx_wr.send_flags &= ~IB_SEND_INLINE; + tx_res->tx_sge[off].addr = tx_req->mapping[off]; + } + tx_res->tx_sge[off].length = skb_headlen(tx_req->skb); + off++; + } + + for (i = 0; i < nr_frags; ++i) { + tx_res->tx_sge[i + off].addr = tx_req->mapping[i + off]; + tx_res->tx_sge[i + off].length = frags[i].size; + } + + /* handle runt packets using additional SG */ + if (unlikely(tx_req->skb->len < login->zlen)) { + /* Note: always extend runt packets (for both + * internal & external) for virtualization, some emulators + * drop runt packets, 
so we need to avoid runt packets even + * if the traffic is not passing the bridge + */ + vnic_dbg_data(login->name, "runt packet, skb %p len %d => %d\n", + tx_req->skb, tx_req->skb->len, login->zlen); + /* If there are frags, then the packet is longer than 60B */ + if (use_inline(tx_req->skb)) + tx_res->tx_sge[i + off].addr = (u64)(unsigned long)login->pad_va; + else + tx_res->tx_sge[i + off].addr = login->pad_dma; + + tx_res->tx_sge[i + off].length = login->zlen - tx_req->skb->len; + ++nr_frags; + VNIC_STATS_INC(login->port_stats.runt_packets); + } + + tx_res->tx_wr.num_sge = nr_frags + off; + tx_res->tx_wr.wr_id = wr_id; + tx_res->tx_wr.wr.ud.remote_qpn = dqpn; + tx_res->tx_wr.wr.ud.ah = ah; + + /* check if we need to calc csum */ + if (tx_req->skb->ip_summed == CHECKSUM_PARTIAL) { + u16 csum_pseudo; + + /* calc pseudo header csum without the length + * and put in the transport's header checksum field. + * The HW will calculate the rest of it (SWP) + */ + if (tx_req->ip_off) + csum_pseudo = ~csum_tcpudp_magic(ip_hdr(tx_req->skb)->saddr, + ip_hdr(tx_req->skb)->daddr, + 0, /* length */ + ip_hdr(tx_req->skb)->protocol, + 0); + else + csum_pseudo = ~csum_ipv6_magic(&ipv6_hdr(tx_req->skb)->saddr, + &ipv6_hdr(tx_req->skb)->daddr, + 0, /* length */ + ipv6_hdr(tx_req->skb)->nexthdr, + 0); + + /* place the calculated csum in the checksum field in + * tcp/udp header + */ + if (tx_req->tcp_off) + tcp_hdr(tx_req->skb)->check = csum_pseudo; + else + udp_hdr(tx_req->skb)->check = csum_pseudo; + + /* set CSUM flag in ib_send_wr */ + tx_res->tx_wr.send_flags |= IB_SEND_IP_CSUM; + } else { + /* csum already calculated in SW */ + tx_res->tx_wr.send_flags &= ~IB_SEND_IP_CSUM; + } + + /* prepare TSO header */ + if (skb_is_gso(tx_req->skb)) { + tx_res->tx_wr.wr.ud.mss = skb_shinfo(tx_req->skb)->gso_size + tx_req->hlen; + tx_res->tx_wr.wr.ud.header = tx_req->phead; + tx_res->tx_wr.wr.ud.hlen = tx_req->hlen; + tx_res->tx_wr.opcode = IB_WR_LSO; + } else { + tx_res->tx_wr.opcode = IB_WR_SEND; + } + + vnic_dbg_data(login->name, + "skb %p wr_id %llu sqpn 0x%06x dqpn 0x%06x num_sge " + "%d phead %p was sent\n", tx_req->skb, wr_id, qp_res->qp->qp_num, + dqpn, tx_res->tx_wr.num_sge, tx_req->phead); + + /* if EoIB encap is OOB, copy the LSO header to the linear part */ + if (!vnic_encap_headroom && skb_is_gso(tx_req->skb)) { + memcpy(tx_res->lso_hdr, VNIC_SKB_GET_ENCAP(tx_req->skb), + VNIC_ENCAP_LEN); + memcpy((u8 *)(tx_res->lso_hdr) + VNIC_ENCAP_LEN, + tx_res->tx_wr.wr.ud.header, + tx_res->tx_wr.wr.ud.hlen); + tx_res->tx_wr.wr.ud.header = tx_res->lso_hdr; + tx_res->tx_wr.wr.ud.mss += VNIC_ENCAP_LEN; + tx_res->tx_wr.wr.ud.hlen += VNIC_ENCAP_LEN; + } + + return vnic_ib_post_send(qp_res->qp, &tx_res->tx_wr, &bad_wr, + tx_req->ip_off, + tx_req->ip6_off, + tx_req->tcp_off, + tx_req->udp_off); +} + +static int vnic_dma_map_tx(struct ib_device *ca, struct vnic_tx_buf *tx_req) +{ + struct sk_buff *skb = tx_req->skb; + struct skb_shared_info *shinfo = skb_shinfo(skb); + u64 *mapping = tx_req->mapping; + int i = 0, off = 0, headlen = skb_headlen(skb); + + if (vnic_encap_headroom && use_inline(skb)) + return 0; + + if (!vnic_encap_headroom && !skb_is_gso(tx_req->skb)) { + mapping[off] = ib_dma_map_single(ca, VNIC_SKB_GET_ENCAP(skb), + VNIC_ENCAP_LEN, DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping[off]))) + return -EIO; + off++; + } + + if (likely(headlen)) { + mapping[off] = ib_dma_map_single(ca, skb->data, + headlen, DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping[off]))) + goto partial_error; + off++; + 
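+ /* linear part mapped; the page fragments are mapped below */ 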
} + + for (i = 0; i < shinfo->nr_frags; ++i) { + skb_frag_t *frag = &shinfo->frags[i]; + mapping[i + off] = ib_dma_map_page(ca, frag->page.p, + frag->page_offset, + frag->size, DMA_TO_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, mapping[i + off]))) + goto partial_error; + } + + return 0; + +partial_error: + for (--i; i >= 0; i--) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + ib_dma_unmap_page(ca, mapping[i + off], frag->size, + DMA_TO_DEVICE); + } + + if (headlen) + ib_dma_unmap_single(ca, mapping[--off], skb_headlen(skb), + DMA_TO_DEVICE); + + if (!vnic_encap_headroom && !skb_is_gso(tx_req->skb)) + ib_dma_unmap_single(ca, mapping[--off], VNIC_ENCAP_LEN, + DMA_TO_DEVICE); + + return -EIO; +} + +void vnic_send(struct vnic_login *login, struct sk_buff *skb, + struct ib_ah *ah, u32 dqpn, int tx_res_index) +{ + struct eoibhdr *_eoib_hdr = VNIC_SKB_GET_ENCAP(skb); + struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index]; + struct vnic_tx_buf *tx_req; + unsigned long flags = 0; + u64 wr_id; + int tx_pkt_num = 1; + u8 ip_off; + + if (!vnic_tx_polling) + spin_lock_irqsave(&tx_res->lock, flags); + + ASSERT(tx_res_index < login->tx_rings_num); + wr_id = tx_res->tx_head & (vnic_tx_rings_len - 1); + tx_req = &tx_res->tx_ring[wr_id]; + tx_req->skb = skb; + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + tx_req->ip_off = tx_req->ip6_off = tx_req->tcp_off = tx_req->udp_off = 0; + if (VNIC_IP_CSUM_OK(_eoib_hdr)) { + ip_off = vnic_encap_headroom ? + ((skb_network_header(skb) - skb->data) >> 1) : + /* skb_network_header doesn't count the encap since it's OOB */ + ((skb_network_header(skb) - skb->data + VNIC_ENCAP_LEN) >> 1); + switch (ntohs(skb->protocol)) { + case ETH_P_IP: + tx_req->ip_off = ip_off; + break; + case ETH_P_IPV6: + tx_req->ip6_off = ip_off; + } + } + if (VNIC_TCP_CSUM_OK(_eoib_hdr)) + tx_req->tcp_off = + (skb_transport_header(skb) - skb_network_header(skb)) >> 2; + else if (VNIC_UDP_CSUM_OK(_eoib_hdr)) + tx_req->udp_off = + (skb_transport_header(skb) - skb_network_header(skb)) >> 2; + ASSERT(!tx_req->udp_off || !tx_req->tcp_off); + vnic_dbg_data(login->name, "ip_off = %d, tcp_off = %d, udp_off = %d\n", + tx_req->ip_off, tx_req->tcp_off, tx_req->udp_off); + VNIC_STATS_INC(login->port_stats.tx_chksum_offload); + } + + /* TSO skb */ + if (skb_is_gso(skb)) { + tx_req->hlen = skb_transport_offset(skb) + tcp_hdrlen(skb); + tx_req->phead = skb->data; + ASSERT(skb_pull(skb, tx_req->hlen)); + VNIC_STATS_INC(login->port_stats.tso_packets); + tx_pkt_num = skb_shinfo(tx_req->skb)->gso_segs; + } + + /* map tx skb */ + if (unlikely(vnic_dma_map_tx(login->port->dev->ca, tx_req))) + goto err; + + /* send.. unmap.. free skb.. drain tx cq.. 
[pray] */ + if (unlikely(++tx_res->tx_outstanding == vnic_tx_rings_len)) { + if (++tx_res->tx_stopped_cnt % 100 == 0) + vnic_dbg(login->name, "tx queue %d stopped cnt %d, outs %d\n", + tx_res->index, + tx_res->tx_stopped_cnt, + tx_res->tx_outstanding); + ASSERT(!VNIC_TXQ_STOPPED(tx_res)); + VNIC_TXQ_STOP(tx_res); + /* vnic_drain_arm_tx_cq() will arm the cq OR resume the ring */ + VNIC_STATS_DO_INC(login->port_stats.queue_stopped); + } + + ASSERT(tx_res->tx_outstanding <= vnic_tx_rings_len); + + if (unlikely(vnic_post_send(login, tx_res_index, wr_id, ah, dqpn))) { + vnic_warn(login->name, "vnic_post_send failed\n"); + VNIC_STATS_DO_INC(tx_res->stats.tx_errors); + VNIC_STATS_DO_INC(tx_res->stats.tx_dropped); + --tx_res->tx_outstanding; + vnic_dealloc_tx_skb(login, tx_res->index, wr_id); + /* no need to netif_wake_queue() here, because + * vnic_comp_handler_tx() will eventually be called + * for the armed cq, and it will wake up the queue when it's ready + */ + } else { + VNIC_STATS_DO_ADD(tx_res->stats.tx_packets, tx_pkt_num); + VNIC_STATS_DO_ADD(tx_res->stats.tx_bytes, skb->len); + login->dev->trans_start = jiffies; + ++tx_res->tx_head; + + + if (vnic_tx_polling) { + if (likely(!skb_shared(skb))) + skb_orphan(skb); + else + VNIC_STATS_DO_INC(login->port_stats.shared_packets); + } + } + + /* poll every vnic_max_tx_outs packets */ + if (vnic_tx_polling) { + if (tx_res->tx_outstanding > vnic_max_tx_outs || + VNIC_TXQ_STOPPED(tx_res)) + vnic_drain_arm_tx_cq(login, tx_res_index); + } else + spin_unlock_irqrestore(&tx_res->lock, flags); + + return; + +err: + VNIC_STATS_DO_INC(tx_res->stats.tx_dropped); + VNIC_STATS_DO_INC(tx_res->stats.tx_errors); + dev_kfree_skb_any(skb); + + if (!vnic_tx_polling) + spin_unlock_irqrestore(&tx_res->lock, flags); + + return; +} + +void vnic_ib_free_ring(struct vnic_rx_ring *ring) +{ + ASSERT(ring->srq); + ib_destroy_srq(ring->srq); +} + +int vnic_ib_init_ring(struct vnic_rx_ring *ring) +{ + struct ib_srq_init_attr srq_attr; + struct vnic_port *port = ring->port; + int rc = 0, headroom = 10; + + /* alloc SRQ */ + memset(&srq_attr, 0, sizeof(struct ib_srq_init_attr)); + srq_attr.attr.max_sge = VNIC_MAX_RX_FRAGS; + srq_attr.attr.max_wr = vnic_rx_rings_len + headroom; + srq_attr.attr.srq_limit = vnic_rx_rings_len + headroom; + ring->srq = ib_create_srq(port->pd, &srq_attr); + if (IS_ERR(ring->srq)) { + vnic_err(ring->port->name, "ib_create_srq failed, index %d, rc %d\n", + ring->index, (int)PTR_ERR(ring->srq)); + rc = (int)PTR_ERR(ring->srq); + } + + return rc; +} + +int vnic_port_ib_init(struct vnic_port *port) +{ + int i; + + /* alloc PD */ + port->pd = ib_alloc_pd(port->dev->ca); + if (IS_ERR(port->pd)) { + vnic_err(port->name, "failed to allocate PD\n"); + goto err; + } + vnic_dbg_data(port->name, "port->pd %p\n", port->pd); + + /* alloc MR */ + port->mr = ib_get_dma_mr(port->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(port->mr)) { + vnic_err(port->name, "failed to allocate MR\n"); + goto free_pd; + } + vnic_dbg_data(port->name, "port->mr %p\n", port->mr); + + /* alloc RX RING */ + for (i = 0; i < port->rx_rings_num; ++i) { + port->rx_ring[i] = vnic_create_rx_ring(port, i); + if (IS_ERR(port->rx_ring[i])) { + vnic_err(port->name, "failed to allocate rx_ring %d\n", i); + port->rx_ring[i] = NULL; + goto free_rx_ring; + } + } + vnic_dbg_data(port->name, "allocated %d RX rings\n", port->rx_rings_num); + + return 0; + +free_rx_ring: + for (i = 0; i < port->rx_rings_num; ++i) + vnic_destroy_rx_ring(port->rx_ring[i]); +/* free_mr: */ + ib_dereg_mr(port->mr); +free_pd: + 
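+ /* unwind in reverse order of allocation */ 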
ib_dealloc_pd(port->pd); +err: + return -EINVAL; + +} + +void vnic_port_ib_cleanup(struct vnic_port *port) +{ + int i; + + for (i = 0; i < port->rx_rings_num; ++i) + vnic_destroy_rx_ring(port->rx_ring[i]); + + ib_dereg_mr(port->mr); + ib_dealloc_pd(port->pd); + + return; +} + +void vnic_ib_dispatch_event(struct ib_event *event) +{ + return; +} + +int vnic_ib_set_moder(struct vnic_login *login, u16 rx_usecs, u16 rx_frames, + u16 tx_usecs, u16 tx_frames) +{ + int rc, i; + + vnic_dbg_moder(login->name, "set coalescing params for mtu:%d to " + "rx_frames:%d rx_usecs:%d, " + "tx_frames:%d tx_usecs:%d, " + "adaptive_rx_coal:%d, " + "adaptive_tx_coal:%d, " + "sample_interval:%d, " + "port.state: %d\n", + login->dev->mtu, + rx_frames, rx_usecs, + tx_frames, tx_usecs, + login->adaptive_rx_coal, 0, + login->sample_interval, login->port->attr.state); + + for (i = 0; i < login->tx_rings_num; ++i) { + rc = ib_modify_cq(login->tx_res[i].cq, tx_frames, tx_usecs); + if (rc && rc != -ENOSYS) { + vnic_warn(login->name, "failed modifying tx_res," + " rc %d, tx ring index %d\n", rc, i); + return rc; + } + } + + for (i = 0; i < login->rx_rings_num; ++i) { + rc = ib_modify_cq(login->rx_res[i].cq, rx_frames, rx_usecs); + if (rc && rc != -ENOSYS) { + vnic_warn(login->name, "failed modifying rx_res," + " rc %d, rx ring index %d\n", rc, i); + return rc; + } + } + + return 0; +} + +int vnic_ib_down(struct net_device *dev) +{ + return 0; +} + +int vnic_ib_up(struct net_device *dev) +{ + return 0; +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_mac.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_mac.c new file mode 100644 index 0000000000000..996d70dbbc802 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_mac.c @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "vnic.h" +#include "vnic_data.h" +#include "vnic_fip_discover.h" + +static void vnic_mace_dealloc(struct vnic_mac *mace) +{ + ASSERT(mace); + kfree(mace); +} + +static struct vnic_mac *vnic_mace_alloc(const u8 *mac, u16 vnic_id) +{ + struct vnic_mac *mace; + + mace = kzalloc(sizeof *mace, GFP_ATOMIC); + if (!mace) + return ERR_PTR(-ENOMEM); + + /* set mac entry fields */ + memcpy(mace->mac, mac, ETH_ALEN); + mace->created = jiffies; + mace->last_tx = jiffies; + mace->vnic_id = vnic_id; + + return mace; +} + +static void vnic_mace_del(struct vnic_login *login, struct vnic_mac *mace) +{ + ASSERT(mace); + rb_erase(&mace->rb_node, &login->mac_tree); +} + +static int vnic_mace_add(struct vnic_login *login, struct vnic_mac *mace) +{ + struct rb_node **n = &login->mac_tree.rb_node, *pn = NULL; + struct vnic_mac *mace_t; + int rc; + + while (*n) { + pn = *n; + mace_t = rb_entry(pn, struct vnic_mac, rb_node); + rc = memcmp(mace->mac, mace_t->mac, ETH_ALEN); + if (rc < 0) + n = &pn->rb_left; + else if (rc > 0) + n = &pn->rb_right; + else { + rc = -EEXIST; + goto out; + } + } + + rb_link_node(&mace->rb_node, pn, n); + rb_insert_color(&mace->rb_node, &login->mac_tree); + rc = 0; + +out: + return rc; +} + +/* vnic_mace_search -- + * Return entry pointer if found, or ERR_PTR(-ENODATA) if not found. + */ +static struct vnic_mac *vnic_mace_search(struct vnic_login *login, u8 *mac) +{ + struct rb_node *n = login->mac_tree.rb_node; + struct vnic_mac *mace_t; + int rc; + + ASSERT(login); + ASSERT(mac); + + while (n) { + mace_t = rb_entry(n, struct vnic_mac, rb_node); + ASSERT(mace_t); + rc = memcmp(mac, mace_t->mac, ETH_ALEN); + if (rc < 0) + n = n->rb_left; + else if (rc > 0) + n = n->rb_right; + else + goto out; + } + + mace_t = ERR_PTR(-ENODATA); + +out: + return mace_t; +} + +/* vnic_mace_update -- + * Remove: -ENODATA if not found, if removed, update ref_cnt, return 0 + * Add: -ENOMEM if no mem, -EEXIST if already exists, + * if added, update ref_cnt, return 0 + * NOTE: ref counters must be updated here, as this function is + * shared among multiple entry points + */ +int vnic_mace_update(struct vnic_login *login, u8 *mac, u16 vnic_id, int remove) +{ + struct vnic_mac *mace; + int rc; + + mace = vnic_mace_search(login, mac); + if (remove) { + if (IS_ERR(mace)) + return -ENODATA; + vnic_mace_del(login, mace); + vnic_mace_dealloc(mace); + /* update ref cnt */ + ASSERT(atomic_read(&login->vnic_child_cnt)); + atomic_dec(&login->vnic_child_cnt); + } else { + if (PTR_ERR(mace) != -ENODATA) + return -EEXIST; + + /* test ref cnt */ + if (atomic_read(&login->vnic_child_cnt) + 1 > vnic_child_max) { + vnic_warn(login->name, "too many child vNics, max %d\n", + vnic_child_max); + return -EUSERS; /* too many users */ + } + + mace = vnic_mace_alloc(mac, vnic_id); + if (IS_ERR(mace)) + return PTR_ERR(mace); + + rc = vnic_mace_add(login, mace); + if (rc) { + vnic_mace_dealloc(mace); + return rc; + } + /* update ref cnt */ + atomic_inc(&login->vnic_child_cnt); + vnic_dbg_mac(login->name, + "updated mac "MAC_6_PRINT_FMT" remove %d\n", + MAC_6_PRINT_ARG(mac), remove); + } + + return 0; +} + +/* this function can be called from fast data-path + * need to make sure that login instance is protected here + * likely/unlikely below were added to match the hard_start_xmit fast data flow + * + caller must hold login->mac_rwlock (read_lock is enough because we only + * queue the job here) + * + it queues a job to create a child + */ +int vnic_child_update(struct vnic_login *login, u8 *mac, int remove) +{ + struct vnic_mac 
*mace; + char *cmd_str; + struct fip_hadmin_cmd *cmd_hadmin; + int count, rc = -EINVAL; + u16 vnic_id = 0; + + vnic_dbg_func(login->name); + + mace = vnic_mace_search(login, mac); + + /* if asked to add, and data already exists, abort */ + if (likely(!remove && !IS_ERR(mace))) { + mace->last_tx = jiffies; + return -EEXIST; + } + + if (!remove) { + /* test if there are too many child vNics; the same check exists in + * vnic_mace_update(), but we have it here as well to let + * vnic_set_mac return a friendly rc + */ + if (atomic_read(&login->vnic_child_cnt) + 1 > vnic_child_max) { + vnic_warn(login->name, "too many child vNics, " + "max %d\n", vnic_child_max); + return -EUSERS; /* too many users */ + } + + ASSERT(mace); + /* generate new vnic_id only when new child is being added */ + vnic_id = atomic_inc_return(&login->port->vnic_child_ids); + /* set bit 14 so we avoid conflict with normal host/net admin */ + vnic_id %= (1 << (VNIC_ID_LEN - 2)); + vnic_id |= (1 << (VNIC_ID_LEN - 2)); + + /* TODO: update hadmin user-script and manual to make hadmin + * vnic_id interval >= 16K (1<<14 == 16384) so bit 14 is clear + * for parent host admin. + * to avoid atomic counter wrap around, move to bitmap array + */ + } else { + /* if asked to remove, and data not found, abort */ + if (IS_ERR(mace)) + return -ENODATA; + + ASSERT(mace); + vnic_id = mace->vnic_id; + } + + /* allocate cmd structs, too big to be local vars + * use GFP_ATOMIC because this func can be called from data path + */ + cmd_str = kmalloc(sizeof *cmd_str * PAGE_SIZE, GFP_ATOMIC); + if (!cmd_str) + return -ENOMEM; + + cmd_hadmin = kmalloc(sizeof *cmd_hadmin, GFP_ATOMIC); + if (!cmd_hadmin) { + kfree(cmd_str); + return -ENOMEM; + } + + /* inherit command from parent, change: + * name, parent, mac, vnic_id and source + * Note: cannot use parent login->fip_vnic->cmd here + * in order to support net-admin-vnics + */ + vnic_login_cmd_init(cmd_hadmin); + + /* child vNic name scheme: + * eth<parent cnt>.c<vnic_id> + * Note: avoid sysfs files conflict (that's why the parent's unique cnt + * must be included in the name here) + */ + snprintf(cmd_hadmin->c_name, MAX_INPUT_LEN, "%s%u.c%u", + "eth", login->cnt, vnic_id); + snprintf(cmd_hadmin->c_mac, MAX_INPUT_LEN, MAC_6_PRINT_FMT, + MAC_6_PRINT_ARG(mac)); + snprintf(cmd_hadmin->c_vnic_id, MAX_INPUT_LEN, "%u", + vnic_id); + snprintf(cmd_hadmin->c_eport, MAX_INPUT_LEN, "%s", + login->fip_vnic->gw_info.gw_port_name); + snprintf(cmd_hadmin->c_parent, MAX_INPUT_LEN, "%s", + login->dev->name); + snprintf(cmd_hadmin->c_bxname, MAX_INPUT_LEN, "%s", + login->fip_vnic->gw_info.system_name); + snprintf(cmd_hadmin->c_bxguid, MAX_INPUT_LEN, VNIC_GUID_FMT, + VNIC_GUID_RAW_ARG(login->fip_vnic->gw_info.system_guid)); + + /* all hadmin vNics must use same BX format (guid vs. 
name) */ + if (login->fip_vnic->hadmined) { + snprintf(cmd_hadmin->c_bxname, MAX_INPUT_LEN, "%s", + login->fip_vnic->cmd.c_bxname); + snprintf(cmd_hadmin->c_bxguid, MAX_INPUT_LEN, "%s", + login->fip_vnic->cmd.c_bxguid); + } + + /* VLAN is optional, set it only when used by parent */ + if (login->vlan_used) + snprintf(cmd_hadmin->c_vid, MAX_INPUT_LEN, "%d", + login->fip_vnic->vlan); + + /* ready to set the command */ + count = vnic_login_cmd_set(cmd_str, cmd_hadmin); + if (!count) + goto out; + + /* queue job (similar to sysfs write function, + * will eventually call fip_discover_hadmin_update_parent() -> + * vnic_mace_update() + */ + count = fip_hadmin_sysfs_update(login->port, cmd_str, count, remove); + if (count <= 0 && count != -EEXIST) + goto out; + + /* at this point, job queued, return success */ + rc = 0; + +out: + kfree(cmd_str); + kfree(cmd_hadmin); + return rc; +} + +void vnic_child_flush(struct vnic_login *login, int all) +{ + struct rb_node *n; + struct vnic_mac *mace, *mace_t; + LIST_HEAD(local_list); + + vnic_dbg_func(login->name); + + n = rb_first(&login->mac_tree); + while (n) { + mace = rb_entry(n, struct vnic_mac, rb_node); + list_add_tail(&mace->list, &local_list); + n = rb_next(n); + } + + list_for_each_entry_safe(mace, mace_t, &local_list, list) { + list_del(&mace->list); + /* if not-flush-all, and mac is dev_addr mac, skip this entry */ + if (!all && !memcmp(login->dev->dev_addr, mace->mac, ETH_ALEN)) + continue; + vnic_child_update(login, mace->mac, 1); + vnic_mace_del(login, mace); + vnic_mace_dealloc(mace); + } + + +} + +/* find parent vNic + * add the child vnic to its mac_tree + * sync child qp_base_num with parent + * for child removal, it's ok not to find the parent, or the child mac entry + */ +int vnic_parent_update(struct vnic_port *port, char *name, u16 vnic_id, + u8 *mac, u32 *qp_base_num_ptr, char *parent_name, + int remove) +{ + struct vnic_login *login; + int rc = -ENODATA; + + vnic_dbg_func(name); + + mutex_lock(&port->mlock); + list_for_each_entry(login, &port->login_list, list) { + vnic_dbg_mac(name, "checking parent %s for child %s (expect %s)\n", + login->dev->name, name, parent_name); + /* check if parent vnic has valid QPN and not being destroyed */ + if (!strcmp(login->dev->name, parent_name) && + test_bit(VNIC_STATE_LOGIN_PRECREATE_2, &login->fip_vnic->login_state) && + !login->fip_vnic->flush) { + /* sync qp_base_num with parent */ + if (qp_base_num_ptr) + *qp_base_num_ptr = login->qp_base_num; + + /* update mac_tree and mace vnic_id */ + write_lock_bh(&login->mac_rwlock); + rc = vnic_mace_update(login, mac, vnic_id, remove); + write_unlock_bh(&login->mac_rwlock); + + break; + } + } + + mutex_unlock(&port->mlock); + + /* for vNic removal, ignore rc */ + return remove ? 0 : rc; +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_main.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_main.c new file mode 100644 index 0000000000000..7e17e8de5a2ca --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_main.c @@ -0,0 +1,1179 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_data.h" + +void vnic_login_refresh_mcasts(struct vnic_port *port) +{ + struct vnic_login *login; + + vnic_dbg_mark(); + mutex_lock(&port->mlock); + list_for_each_entry(login, &port->login_list, list) + vnic_tree_mcast_detach(&login->mcast_tree); + list_for_each_entry(login, &port->login_list, list) + { + if (vnic_sa_query) { + /* take the tx lock to make sure no delete function is called at the time */ + netif_tx_lock_bh(login->dev); + vnic_neigh_invalidate(login); + netif_tx_unlock_bh(login->dev); + } + + vnic_tree_mcast_attach(&login->mcast_tree); + } + mutex_unlock(&port->mlock); +} + +int vnic_login_pre_create_1(struct vnic_port *port, + struct fip_vnic_data *vnic) +{ + struct vnic_login *login; + struct net_device *dev; + + /* set login to zero first (for parent_used case) */ + vnic->login = NULL; + + /* if parent_used, skip */ + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + return 0; + } else { + vnic_dbg_func(vnic->name); + } + + /* create netdev per login, vlan configuration is done from outside */ + dev = vnic_alloc_netdev(port); + if (IS_ERR(dev)) { + vnic_err(port->name, "vnic_alloc_netdev failed\n"); + goto err; + } + + login = vnic_netdev_priv(dev); + login->fip_vnic = vnic; + vnic->login = login; + login->vlan_used = vnic->vlan_used; + login->dev->hard_header_len += (vnic->vlan_used && vnic->hadmined)? VLAN_HLEN: 0; + vnic_dbg_fip(vnic->name,"creating vnic, hadmin=%d vlan_used=%d hard_header_len += %d\n", + vnic->hadmined, vnic->vlan_used, (vnic->vlan_used && vnic->hadmined)? 
VLAN_HLEN: 0); + set_bit(VNIC_STATE_LOGIN_PRECREATE_1, &vnic->login_state); + + return 0; + +err: + return -ENODEV; +} + +int vnic_login_pre_create_2(struct fip_vnic_data *vnic, int qps_num, int is_lag) +{ + struct vnic_login *login = vnic->login; + int i, j; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + return 0; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + login->qps_num = qps_num; + login->qkey = VNIC_DATA_QKEY; + login->is_lag = is_lag; + VNIC_TXQ_SET_ACTIVE(login, min(login->tx_rings_num, login->qps_num)); + + /* prepare padding for runt packets */ + login->pad_va = kzalloc(VNIC_EOIB_ZLEN_MAX, GFP_KERNEL); + if (!login->pad_va) + return -ENOMEM; + + login->pad_dma = ib_dma_map_single(login->port->dev->ca, login->pad_va, + VNIC_EOIB_ZLEN_MAX, DMA_TO_DEVICE); + if (ib_dma_mapping_error(login->port->dev->ca, login->pad_dma)) + goto err; + + /* create TX resources */ + for (i = 0; i < login->tx_rings_num; ++i) { + if (vnic_create_tx_res(login, i)) { + vnic_err(login->name, "vnic_create_tx_res failed," + " index %d\n", i); + goto free_tx_res; + } + } + + /* create RX resources */ + for (j = 0; j < login->rx_rings_num; ++j) { + if (vnic_create_rx_res(login, j)) { + vnic_err(login->name, "vnic_create_rx_res failed," + " index %d\n", j); + goto free_rx_res; + } + } + + /* create QPs */ + if (vnic_create_qp_range(login)) { + vnic_err(login->name, "vnic_create_qp_range failed\n"); + goto free_rx_res; + } + + /* first QP is the base QP */ + login->qp_base_num = login->qp_res[0].qp->qp_num; + vnic->qp_base_num = login->qp_base_num; + + /* update state */ + set_bit(VNIC_STATE_LOGIN_PRECREATE_2, &vnic->login_state); + + login->queue_stopped = 0; + + /* calls vnic_do_get_stats() */ + queue_delayed_work(login_wq, &login->stats_task, VNIC_STATS_DELAY); + + return 0; + +free_rx_res: + for (--j; j >= 0; --j) + vnic_destroy_rx_res(login, j); + + i = login->tx_rings_num; +free_tx_res: + for (--i; i >= 0; --i) + vnic_destroy_tx_res(login, i); +/*free_pad:*/ + ib_dma_unmap_single(login->port->dev->ca, login->pad_dma, + VNIC_EOIB_ZLEN_MAX, DMA_TO_DEVICE); +err: + kfree(login->pad_va); + return -ENODEV; +} + +int vnic_login_register_netdev(struct fip_vnic_data *vnic, + const char *mac, + const char *name) +{ + struct vnic_login *login = vnic->login; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + vnic_info("%s created (parent %s mac "MAC_6_PRINT_FMT")\n", + name, vnic->parent_name, + MAC_6_PRINT_ARG(vnic->mac_cache)); + return 0; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + /* set netdev name and mac */ + if (name) + strncpy(login->dev->name, name, IFNAMSIZ); + if (mac) { + memcpy(login->dev->dev_addr, mac, ETH_ALEN); + /* save original mac */ + memcpy(login->dev_addr, mac, ETH_ALEN); + } + + /* set device features according to all_vlan mode */ + login->dev->features |= NETIF_F_HIGHDMA; + + //ronni - fixme. add comment here + if (!vnic->all_vlan_gw) { + login->dev->features |= NETIF_F_VLAN_CHALLENGED; + login->dev->features &= ~NETIF_F_HW_VLAN_FILTER; + } else + login->dev->features |= NETIF_F_HW_VLAN_FILTER; + + /* register netdev */ + if (register_netdev(login->dev)) { + vnic_err(login->name, "register_netdev failed name=%s mac=" + MAC_6_PRINT_FMT" login->dev=%p\n", + name ? name : "net_admin", + MAC_6_PRINT_ARG(login->dev->dev_addr), login->dev); + goto err; + } + + /* encode the port number in dev_id: + * This allows us to associate the net device + * with the underlying device's port. 
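+ * (IB port numbers are 1-based while dev_id is 0-based, hence the -1) 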
+ */ + login->dev->dev_id = login->port->num - 1; + + if (vnic_create_dentry(login)) { + vnic_err(login->name, "vnic_create_dentry failed\n"); + goto err; + } + + /* print info only after register_netdev so dev->name is valid */ + sprintf(login->name, "%s", login->dev->name); + vnic_info("%s created (%s port %d)\n", + login->dev->name, + login->port->dev->ca->name, login->port->num); + + /* disable tx queues and carrier. They will be started + * after create 2 is called the mcast is attached ... + */ + netif_tx_disable(login->dev); + netif_carrier_off(login->dev); + + mutex_lock(&login->port->mlock); + vnic_dbg_mac(login->name, "added to login_list\n"); + list_add_tail(&login->list, &login->port->login_list); + mutex_unlock(&login->port->mlock); + + set_bit(VNIC_STATE_LOGIN_CREATE_1, &vnic->login_state); + + return 0; + +err: + return -EINVAL; +} + +int vnic_login_complete_ack(struct fip_vnic_data *vnic, + struct fip_login_data *login_data, + struct fip_shared_vnic_data *shared_vnic) +{ + struct vnic_mcast *mcaste, *mcaste_bcast, *mcast_shared = NULL; + struct vnic_login *login = vnic->login; + int rc; + int first_time_vlan = 0; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + return 0; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + /* + * TODO, check if you need them all, check overlap with gw_neigh + * check how pkey is passed from FIP + */ + login->pkey = login_data->pkey; + login->pkey_index = login_data->pkey_index; + login->n_mac_mcgid = login_data->n_mac_mcgid; + login->gw_port_id = login_data->port_id; + + /*GW should send the data SL from the login packet*/ + login->sl = login_data->sl; + + login->vnic_id = login_data->vnic_id; + + memcpy(login->mgid_prefix, login_data->mgid_prefix, VNIC_MGID_PREFIX_LEN); + memcpy(login->vnic_name, login_data->vnic_name, sizeof(login_data->vnic_name)); + memcpy(login->vendor_id, login_data->vendor_id, sizeof(login_data->vendor_id)); + + VNIC_STR_STRIP(login->vnic_name); + VNIC_STR_STRIP(login->vendor_id); /* set ZLEN (varies per VLAN support) */ + + /* set VLAN */ + login->zlen = ETH_ZLEN + (vnic_encap_headroom? VNIC_ENCAP_LEN: 0); + first_time_vlan = !login->vlan_used; /* always false for hadmin vnics with vlans */ + login->vlan_used = login_data->vp; + login->all_vlan_gw = login_data->all_vlan_gw; + if ((VNIC_VLAN_ENABLED(login))) { + login->vid = cpu_to_be16(login_data->vlan); + if (first_time_vlan) { + vnic_dbg_fip(login->dev->name,"Updating hard_header_len %d+%d=%d\n", + login->dev->hard_header_len, VLAN_HLEN, + login->dev->hard_header_len + VLAN_HLEN); + login->dev->hard_header_len += VLAN_HLEN; + } + login->zlen = ETH_ZLEN + VLAN_HLEN + (vnic_encap_headroom? 
VNIC_ENCAP_LEN: 0); + } + + /* create gw_neigh (no RSS when sending to the GW) + * user zero mac to describe GW L2 address + */ + login->gw_neigh = + vnic_neighe_alloc(login, NULL, login_data->lid, + login_data->qpn, 0); + if (IS_ERR(login->gw_neigh)) { + vnic_err(login->name, "failed to alloc gw neigh\n"); + goto err; + } + + /* alloc mcast entries here to simplify the error flow */ + mcaste = vnic_mcast_alloc(login->port, NULL, NULL); + if (IS_ERR(mcaste)) + goto err_free_gw_ah; + mcaste_bcast = vnic_mcast_alloc(login->port, NULL, NULL); + if (IS_ERR(mcaste_bcast)) { + vnic_mcast_dealloc(mcaste); + goto err_free_gw_ah; + } + /* used by shared vnic mcast group */ + if (shared_vnic && shared_vnic->enabled) { + mcast_shared = vnic_mcast_alloc(login->port, NULL, NULL); + if (IS_ERR(mcast_shared)) { + vnic_mcast_dealloc(mcaste); + vnic_mcast_dealloc(mcaste_bcast); + goto err_free_gw_ah; + } + } + + /* attach to default mgid */ + __vnic_mcaste_fill(login, mcaste, login->gw_port_id, ETH_ZERO_MAC, 0, vnic_mcast_create); + mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC; + mcaste->retry = VNIC_MCAST_ULIMIT_RETRY; + mcaste->attach_cb = __bcast_attach_cb; + mcaste->detach_cb = __bcast_detach_cb; + mcaste->attach_cb_ctx = login; + mcaste->detach_cb_ctx = login; + rc = vnic_mcast_add(&login->mcast_tree, mcaste); + ASSERT(!rc); + rc = vnic_mcast_attach(&login->mcast_tree, mcaste); + ASSERT(!rc); + + /* attach to bcast mgid (use default mlid) */ + if (login->n_mac_mcgid || vnic_mgid_data_type) { + __vnic_mcaste_fill(login, mcaste_bcast, login->gw_port_id, ETH_BCAST_MAC, 0, 0); + mcaste_bcast->backoff_factor = VNIC_MCAST_BACKOF_FAC; + mcaste_bcast->retry = VNIC_MCAST_ULIMIT_RETRY; + /* The port gid is overun by the default gid as part of the mgid over + * same mlid hack */ + memcpy(&mcaste_bcast->port_gid, &mcaste->port_gid, GID_LEN); + rc = vnic_mcast_add(&login->mcast_tree, mcaste_bcast); + ASSERT(!rc); + rc = vnic_mcast_attach(&login->mcast_tree, mcaste_bcast); + ASSERT(!rc); + } else { + vnic_mcast_dealloc(mcaste_bcast); + } + + login->shared_vnic = 0; + /* attach to bcast mgid (use default mlid) */ + if (shared_vnic && shared_vnic->enabled) { + u8 rss_hash = shared_vnic->ip[0] ^ shared_vnic->ip[1] ^ + shared_vnic->ip[2] ^ shared_vnic->ip[3]; + + login->shared_vnic = 1; + __vnic_mcaste_fill(login, mcast_shared, login->gw_port_id, shared_vnic->emac, 0, 0); + mcast_shared->backoff_factor = VNIC_MCAST_BACKOF_FAC; + mcast_shared->retry = VNIC_MCAST_ULIMIT_RETRY; + memcpy(&mcast_shared->port_gid, &mcaste->port_gid, GID_LEN); + mcast_shared->gid.raw[12]= rss_hash; + + vnic_dbg_mcast(login->name, "vnic %s attaching shared vnic 1 " + "MGID "VNIC_GID_FMT"\n", login->name, + VNIC_GID_RAW_ARG(mcast_shared->gid.raw)); + mcaste = mcast_shared; + memcpy(mcaste->mac, ETH_BCAST_MAC, ETH_ALEN); + rc = vnic_mcast_add(&login->mcast_tree, mcaste); + ASSERT(!rc); + rc = vnic_mcast_attach(&login->mcast_tree, mcaste); + ASSERT(!rc); + } + + /* set state */ + set_bit(VNIC_STATE_LOGIN_CREATE_2, &vnic->login_state); + + /* call vnic_open() if open was called when we were not ready to handle it */ + if (test_bit(VNIC_STATE_NETDEV_OPEN_REQ, &login->netdev_state)) +#ifndef _BP_NO_NDO_OPS + login->dev->netdev_ops->ndo_open(login->dev); +#else + login->dev->open(login->dev); +#endif + + return 0; + +err_free_gw_ah: + vnic_neighe_dealloc(login->gw_neigh); +err: + return -EINVAL; +} + +/* + * When destroying login, call to stop login wq tasks. do not call from + * login_wq context. 
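+ * (it synchronously cancels work queued on login_wq and, on older kernels,
+ * flushes login_wq, so calling it from that workqueue would deadlock)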
+*/ +void vnic_login_destroy_stop_wq(struct fip_vnic_data *vnic, enum fip_flush flush) +{ + struct vnic_login *login = vnic->login; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + return; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + if (test_bit(VNIC_STATE_LOGIN_PRECREATE_1, &vnic->login_state)) { + /* cancel vnic_auto_moder() */ + vnic_dbg_mark(); + mutex_lock(&login->moder_lock); + login->queue_stopped = 1; + mutex_unlock(&login->moder_lock); +#ifndef _BP_WORK_SYNC + cancel_delayed_work_sync(&login->stats_task); + if (cancel_delayed_work_sync(&login->mcast_task)) + dev_put(login->dev); + cancel_delayed_work_sync(&login->restart_task); +#else + cancel_delayed_work(&login->stats_task); + if (cancel_delayed_work(&login->mcast_task)) + dev_put(login->dev); + cancel_delayed_work(&login->restart_task); + flush_workqueue(login_wq); +#endif + } +} + +/* + * When destroy login data struct. Assumes all login wq tasks are stopped. + * Can be called from any context, might block for a few secs. +*/ +void vnic_login_destroy_wq_stopped(struct fip_vnic_data *vnic, enum fip_flush flush) +{ + struct vnic_login *login = vnic->login; + unsigned long flags; + int i; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + vnic_info("%s destroyed (parent %s mac "MAC_6_PRINT_FMT")\n", + vnic->interface_name, vnic->parent_name, + MAC_6_PRINT_ARG(vnic->mac_cache)); + /* Note: vNics can be logged out by BXM (bypass sysfs calls) + * so we need to cleanup the parent here as well + * if we reach this function from sysfs calls, + * then vnic_parent_update will have no effect here (ok) + */ + vnic_parent_update(vnic->port, vnic->name, vnic->vnic_id, + vnic->mac_cache, NULL, vnic->parent_name, 1); + return; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + /* the cleanup procedure depends on our state, our vnic type + * (host/network admin), and the cleanup level required. In network admined + * vnics there is a single create state and only one cleanup level (full). + * for host admined there are two create states (init, regular) and two + * cleanup level. The flow depends on the reason for the cleanup. */ + vnic_dbg_data(login->name, "vnic_login_destroy flush=%d\n", flush); + + /* we need to change state to prevent from completion to re-open the TX + * queue once we close it. Before calling stop() function, need to make + * sure that all on-going hard_start_xmit() calls are done. 
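+	 * netif_tx_disable() below does that: it takes every TX queue's xmit
+	 * lock, so it returns only once in-flight transmits have finished.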
+ */ + + if (test_bit(VNIC_STATE_LOGIN_CREATE_1, &vnic->login_state)) { + set_bit(VNIC_STATE_NETDEV_NO_TX_ENABLE, &login->netdev_state); + netif_tx_disable(login->dev); + vnic_dbg_mark(); + } + + if (test_and_clear_bit(VNIC_STATE_LOGIN_CREATE_2, &vnic->login_state)) { + if (test_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state)) { + /* calls vnic_stop() */ +#ifndef _BP_NO_NDO_OPS + login->dev->netdev_ops->ndo_stop(login->dev); +#else + login->dev->stop(login->dev); +#endif + set_bit(VNIC_STATE_NETDEV_OPEN_REQ, &login->netdev_state); + vnic_dbg_mark(); + } + vnic_mcast_del_all(&login->mcast_tree); + vnic_member_remove_all(login); + vnic_neighe_dealloc(login->gw_neigh); + vnic_dbg_mark(); + } + if (test_bit(VNIC_STATE_LOGIN_CREATE_1, &vnic->login_state)) + clear_bit(VNIC_STATE_NETDEV_NO_TX_ENABLE, &login->netdev_state); + + if (flush == FIP_FULL_FLUSH && + test_and_clear_bit(VNIC_STATE_LOGIN_CREATE_1, &vnic->login_state)) { + mutex_lock(&login->port->mlock); + vnic_dbg_mac(login->name, "delete from login_list\n"); + list_del(&login->list); + mutex_unlock(&login->port->mlock); + + /* print info if register_netdev was called before so + * dev->name is valid + */ + vnic_info("%s destroyed (%s port %d)\n", login->dev->name, + login->port->dev->ca->name, login->port->num); + + /* use irq save so caller function supports any context */ + write_lock_irqsave(&login->mac_rwlock, flags); + vnic_child_flush(login, 1); + write_unlock_irqrestore(&login->mac_rwlock, flags); + + vnic_delete_dentry(login); + unregister_netdev(login->dev); + vnic_dbg_mark(); + } + + vnic_dbg_mark(); + /* login_ctx was in pre created state [always true] */ + spin_lock_bh(&login->stats_lock); + if (test_and_clear_bit(VNIC_STATE_LOGIN_PRECREATE_2, &vnic->login_state)) { + spin_unlock_bh(&login->stats_lock); + vnic_dbg_mark(); + /* take port->mlock in case of refresh event is being called vnic_refresh_mcasts */ + mutex_lock(&login->port->mlock); + /* tx queues are already stopped here */ + vnic_neigh_del_all(login); + vnic_mcast_del_all(&login->mcast_tree); + for (i = 0; i < login->qps_num; ++i) + vnic_destroy_qp(login, i); + mutex_unlock(&login->port->mlock); + + for (i = 0; i < login->rx_rings_num; ++i) + vnic_destroy_rx_res(login, i); + for (i = 0; i < login->tx_rings_num; ++i) + vnic_destroy_tx_res(login, i); + ib_dma_unmap_single(login->port->dev->ca, login->pad_dma, + VNIC_EOIB_ZLEN_MAX, DMA_TO_DEVICE); + kfree(login->pad_va); + } else + spin_unlock_bh(&login->stats_lock); + + if (flush == FIP_FULL_FLUSH && + test_and_clear_bit(VNIC_STATE_LOGIN_PRECREATE_1, &vnic->login_state)) { + vnic_free_netdev(login); + } +} + +int vnic_vhube_add(struct fip_vnic_data *vnic, struct vnic_table_entry *vhube) +{ + struct vnic_neigh *neighe; + struct vnic_login *login = vnic->login; + int rc; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + return 0; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + vnic_dbg_data(login->name, "adding vhube lid 0x%02x qpn 0x%x, mac " + MAC_6_PRINT_FMT"\n", vhube->lid, vhube->qpn, + MAC_6_PRINT_ARG(vhube->mac)); + + neighe = vnic_neighe_alloc(login, vhube->mac, vhube->lid, + vhube->qpn, vhube->rss); + if (IS_ERR(neighe)) + return (int)PTR_ERR(neighe); + + vnic_dbg_mark(); + /* when adding new neighe, make sure that TX queues are not running. 
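+	 * (the TX path resolves neighbours from this tree under the same lock)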
*/ + netif_tx_lock_bh(login->dev); + rc = vnic_neighe_add(login, neighe); + netif_tx_unlock_bh(login->dev); + if (rc) { + vnic_neighe_dealloc(neighe); + return rc; + } + + return 0; +} + +void vnic_vhube_flush(struct fip_vnic_data *vnic) +{ + struct vnic_login *login = vnic->login; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + return; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + /* when adding new neighe, make sure that TX queues are not running. */ + vnic_dbg_mark(); + netif_tx_lock_bh(login->dev); + vnic_neigh_del_all(login); + netif_tx_unlock_bh(login->dev); + + return; +} + +void vnic_vhube_del(struct fip_vnic_data *vnic, u8* mac) +{ + struct vnic_neigh *neighe; + struct vnic_login *login = vnic->login; + + if (vnic->parent_used) { + vnic_dbg_mac(vnic->name, "function skipped\n"); + return; + } else { + ASSERT(login); + vnic_dbg_func(login->name); + } + + vnic_dbg_mark(); + /* when adding new neighe, make sure that TX queues are not running. */ + netif_tx_lock_bh(login->dev); + neighe = vnic_neighe_search(login, mac); + if (IS_ERR(neighe)) { + vnic_warn(login->name, "couldn't find "MAC_6_PRINT_FMT"\n", + MAC_6_PRINT_ARG(mac)); + } else { + vnic_neighe_del(login, neighe); + vnic_neighe_dealloc(neighe); + } + netif_tx_unlock_bh(login->dev); + return; +} + +struct fip_login_data login_data; +struct fip_vnic_data vnic; +struct vnic_login *__vnic_login_create(struct vnic_port *port, int index) +{ + struct vnic_login *login; + int rc, no_bxm_n_rss = 0x4; + int qps_num = (port->rx_rings_num > 1) ? (1 << no_bxm_n_rss) : 1; + + /* pre create vnic */ + rc = vnic_login_pre_create_1(port, &vnic); + if (rc) { + vnic_err(port->name, "vnic_login_pre_create_1 failed" + " for %s port %d index %d\n", + port->dev->ca->name, port->num, index); + goto err; + } + + login = vnic.login; + + rc = vnic_login_pre_create_2(&vnic, qps_num, 0); + if (rc) { + vnic_err(port->name, "vnic_login_pre_create_2 failed" + " for %s port %d index %d\n", + port->dev->ca->name, port->num, index); + goto create_fail; + } + + /* create vnic */ + memset(&login_data, 0, sizeof(struct fip_login_data)); + sprintf(login_data.vendor_id, "%s", NOT_AVAILABLE_STRING); + sprintf(login_data.vnic_name, "%s", NOT_AVAILABLE_STRING); + memcpy(login_data.mgid_prefix, NO_BXM_MGID_PREFIX, VNIC_MGID_PREFIX_LEN); + login_data.qpn = 0xa00000; + login_data.lid = 1; + login_data.pkey = 0xffff; + login_data.mtu = 1500; + + /* random_ether_addr(mac); */ + memcpy(login_data.mac, port->gid.raw + 10, ETH_ALEN); + login_data.mac[0] += index * 0x10; + /* mcast bit must be zero */ + login_data.mac[0] &= 0xfe; + vnic_dbg_mark(); + if (vnic_login_register_netdev(&vnic, login_data.mac, NULL)) { + vnic_err(login->name, "vnic_login_register_netdev failed\n"); + goto create_fail; + } + if (vnic_login_complete_ack(&vnic, &login_data, NULL)) { + vnic_err(login->name, "vnic_login_complete_ack failed\n"); + goto create_fail; + } + + return login; + +create_fail: + vnic_login_destroy(login->fip_vnic, FIP_FULL_FLUSH); +err: + return ERR_PTR(-ENODEV); +} + +int vnic_port_data_init(struct vnic_port *port) +{ + int i, no_bxm_vnic_per_port = 1; + + vnic_dbg_mark(); + mutex_lock(&port->start_stop_lock); + for (i = 0; i < no_bxm_vnic_per_port; ++i) { + __vnic_login_create(port, i); + } + mutex_unlock(&port->start_stop_lock); + + return 0; + /*TODO - JPM: handle vnic_login_create failure */ +} + +void vnic_port_data_cleanup(struct vnic_port *port) +{ + struct vnic_login *login, *login_t; + + vnic_dbg_mark(); + /* 
vnic_login_destroy() acquires the port->mlock, cannot hold it here */
+	list_for_each_entry_safe(login, login_t,
+				 &port->login_list, list) {
+		vnic_dbg_data(login->name, "login %s\n", login->name);
+		vnic_login_destroy(login->fip_vnic, FIP_FULL_FLUSH);
+	}
+}
+
+/* ALI TODO: check if need to replace login ptr with vnic */
+void debug_dump_members(struct vnic_login *login, struct vnic_gw_info *member)
+{
+	int i;
+
+	vnic_warn(login->name, "Error members_debug_dump "
+		  "member id=%d gw id = %d active_count=%d\n",
+		  member->member_id, member->gw_id,
+		  login->lag_member_active_count);
+
+	/* go over the member table and dump each entry's mapping state */
+	for (i = 0; i < MAX_LAG_MEMBERS; i++) {
+		vnic_warn(login->name, "%d member %d used %x gw_id %d\n",
+			  i, login->lag_gw_neigh[i].member_id,
+			  login->lag_gw_neigh[i].info,
+			  login->lag_gw_neigh[i].gw_id);
+	}
+}
+
+static void vnic_build_map_histogram(struct vnic_login *login, int member_id, int *hist)
+{
+	int i;
+
+	memset(hist, 0, sizeof(int) * MAX_LAG_MEMBERS);
+
+	/* go over map and count how many entries are mapped to each member */
+	for (i = 0; i < LAG_MAP_TABLE_SIZE; i++) {
+		ASSERT(login->lag_gw_map[i] >= 0 && login->lag_gw_map[i] < MAX_LAG_MEMBERS);
+		hist[login->lag_gw_map[i]]++;
+	}
+}
+
+static void _vnic_remove_member_from_map(struct vnic_login *login, int member_id)
+{
+	int user_count[MAX_LAG_MEMBERS] = {0};
+	int i, j;
+	int continue_flag;
+	int thresh;
+
+	login->lag_member_active_count--;
+	if (login->lag_member_active_count > 0) {
+		/* go over map and count how many entries are mapped to each member */
+		vnic_build_map_histogram(login, member_id, user_count);
+
+		thresh = 2; /* it might be possible to find a better lower boundary */
+
+		for (i = 0; i < LAG_MAP_TABLE_SIZE; i++) {
+			if (login->lag_gw_map[i] != member_id)
+				continue;
+
+			continue_flag = 1;
+			while (continue_flag) {
+				for (j = 0; j < MAX_LAG_MEMBERS; j++) {
+					if (j == member_id)
+						continue;
+
+					/* Only use members that are connected and are short of entries */
+					if (login->lag_gw_neigh[j].info & GW_MEMBER_INFO_MAPPED &&
+					    user_count[j] < thresh) {
+						login->lag_gw_map[i] = j;
+						user_count[j]++;
+						continue_flag = 0;
+						break;
+					}
+				}
+				if (j == MAX_LAG_MEMBERS)
+					thresh++;
+			}
+		}
+	}
+}
+
+static void _vnic_add_member_to_map(struct vnic_login *login, int member_id)
+{
+	int i;
+	int expected;
+	int user_count[MAX_LAG_MEMBERS] = {0};
+	int continue_flag;
+	int thresh;
+
+	/* this is the first active port, use it for all map entries */
+	if (!login->lag_member_active_count) {
+		for (i = 0; i < LAG_MAP_TABLE_SIZE; i++)
+			login->lag_gw_map[i] = member_id;
+		login->lag_member_active_count++;
+	} else {
+		/* go over map and count how many entries are mapped to each member;
+		 * we will use the count to reassign entries from the most heavily
+		 * used members */
+		vnic_build_map_histogram(login, member_id, user_count);
+
+		/* when adding new member, make sure that TX queues are not running.
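+		 * Callers hold netif_tx_lock_bh(). Rebalance sketch with
+		 * illustrative numbers only (the real bound is LAG_MAP_TABLE_SIZE):
+		 * for a 32-entry map and 3 active members, expected = 32 / 3 = 10
+		 * and thresh = 32 % 3 = 2; entries are taken from members owning
+		 * more than expected + thresh slots until the new member holds
+		 * roughly 'expected' of them, lowering thresh after each pass.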
*/ + login->lag_member_active_count++; + expected = LAG_MAP_TABLE_SIZE / login->lag_member_active_count; + thresh = LAG_MAP_TABLE_SIZE % login->lag_member_active_count; + continue_flag = 1; + while (continue_flag) { + for (i = 0; i < LAG_MAP_TABLE_SIZE; i++) { + if (user_count[login->lag_gw_map[i]] > expected + thresh) { + user_count[login->lag_gw_map[i]]--; + login->lag_gw_map[i] = member_id; + user_count[login->lag_gw_map[i]]++; + if (user_count[member_id] >= expected) { + continue_flag = 0; + break; + } + } + } + thresh--; + } + } +} + +void __bcast_member_attach_cb(struct vnic_mcast *mcaste, void *gw_ptr) +{ + struct vnic_gw_info *member = gw_ptr; + + /* When SA is local, mcast join works even when port is down */ + if (member->neigh.login->port->attr.state != IB_PORT_ACTIVE) + return; + + vnic_dbg_lag(member->neigh.login->name, "__bcast_member_attach_cb for member id %d and " + "gw_id=%d\n", member->member_id, member->gw_id); + + netif_tx_lock_bh(member->neigh.login->dev); + member->info |= GW_MEMBER_INFO_MCAST; + + if (member->info & GW_MEMBER_INFO_EPORT_UP && + !(member->info & GW_MEMBER_INFO_MAPPED)) { + _vnic_add_member_to_map(member->neigh.login, member->member_id); + member->info |= GW_MEMBER_INFO_MAPPED; + } + netif_tx_unlock_bh(member->neigh.login->dev); +} + +void __bcast_member_detach_cb(struct vnic_mcast *mcaste, void *gw_ptr) +{ + struct vnic_gw_info *member = gw_ptr; + + vnic_dbg_lag(member->neigh.login->name, "__bcast_member_detach_cb for member id %d and " + "gw_id=%d\n", member->member_id, member->gw_id); + + netif_tx_lock_bh(member->neigh.login->dev); + if (member->info & GW_MEMBER_INFO_MAPPED) + _vnic_remove_member_from_map(member->neigh.login, member->member_id); + + member->info &= ~(GW_MEMBER_INFO_MAPPED | GW_MEMBER_INFO_MCAST); + netif_tx_unlock_bh(member->neigh.login->dev); +} + +/* + * create MGIDs and join the default MCAST addresses. The mcaste are added to the + * list contained within member struct. If more MGIDs are used by the vnic when + * a member is added we will join those too using the members GW_ID. 
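+ * The per-member MGIDs are built like the default login MGIDs, except that
+ * the member's own GW_ID is encoded into them.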
+*/
+static int _vnic_add_member_mgid(struct vnic_login *login, struct vnic_gw_info *member)
+{
+	struct vnic_mcast *mcaste, *mcaste_bcast;
+	int rc;
+#ifndef _BP_NO_MC_LIST
+	struct dev_mc_list *mclist;
+#else
+	struct netdev_hw_addr *ha;
+#endif
+
+	mcaste = vnic_mcast_alloc(login->port, NULL, NULL);
+	if (IS_ERR(mcaste))
+		return -ENOMEM;
+
+	/* attach to default mgid */
+	__vnic_mcaste_fill(login, mcaste, member->gw_id, ETH_ZERO_MAC, 0, vnic_mcast_create);
+	mcaste->attach_cb = __bcast_member_attach_cb;
+	mcaste->detach_cb = __bcast_member_detach_cb;
+	mcaste->attach_cb_ctx = member;
+	mcaste->detach_cb_ctx = member;
+	mcaste->priv_data = member;
+	rc = vnic_mcast_add(&login->mcast_tree, mcaste);
+	if (rc) {
+		debug_dump_members(login, member);
+		ASSERT(!rc);
+	}
+
+	rc = vnic_mcast_attach(&login->mcast_tree, mcaste);
+	if (rc) {
+		debug_dump_members(login, member);
+		ASSERT(!rc);
+	}
+
+	if (login->n_mac_mcgid) {
+		mcaste_bcast = vnic_mcast_alloc(login->port, NULL, NULL);
+		if (IS_ERR(mcaste_bcast))
+			goto free_mcasts;
+
+		__vnic_mcaste_fill(login, mcaste_bcast, member->gw_id, ETH_BCAST_MAC, 0, 0);
+		/* The port gid is overrun by the default gid as part of the mgid
+		 * over same mlid hack */
+		memcpy(&mcaste_bcast->port_gid, &mcaste->port_gid, GID_LEN);
+		mcaste_bcast->priv_data = member;
+		rc = vnic_mcast_add(&login->mcast_tree, mcaste_bcast);
+		ASSERT(!rc);
+		rc = vnic_mcast_attach(&login->mcast_tree, mcaste_bcast);
+		ASSERT(!rc);
+	}
+
+	/* hold the tx lock so set_multicast_list() won't change mc_list */
+	netif_tx_lock_bh(login->dev);
+#ifndef _BP_NO_MC_LIST
+	for (mclist = login->dev->mc_list; mclist; mclist = mclist->next) {
+		u8 *mmac = mclist->dmi_addr;
+#else
+	netdev_for_each_mc_addr(ha, login->dev) {
+		u8 *mmac = ha->addr;
+#endif
+		/* do not add the default MGIDs because they are always used */
+		if (IS_ZERO_MAC(mmac))
+			continue;
+		if (IS_BCAST_MAC(mmac))
+			continue;
+
+		vnic_dbg_lag(login->name, "_vnic_add_member_mgid for "
+			     MAC_6_PRINT_FMT" and member gw_id=%d\n",
+			     MAC_6_PRINT_ARG(mcaste->mac), member->gw_id);
+
+		if (_vnic_mcast_attach_mgid(login, mmac, mcaste, member,
+					    member->gw_id))
+			goto attach_failed;
+	}
+	netif_tx_unlock_bh(login->dev);
+
+	return 0;
+
+attach_failed:
+	netif_tx_unlock_bh(login->dev);
+free_mcasts:
+	vnic_mcast_del_user(&login->mcast_tree, member);
+	return -ENOMEM;
+}
+
+int vnic_member_add(struct vnic_login *login, int member_id, struct lag_member *member)
+{
+	struct vnic_gw_info *member_e;
+	int ret;
+
+	if (member_id >= MAX_LAG_MEMBERS || member_id < 0)
+		return -1;
+
+	/* member_e is not initialized yet, log the values from the request */
+	vnic_dbg_lag(login->name, "vnic_member_add id:%d gw_id:%d lid:%x qpn:%x sl:%d\n",
+		     member_id, member->gw_port_id, member->lid, member->qpn, member->sl);
+	/* member id is already in use */
+	if (login->lag_gw_neigh[member_id].info & GW_MEMBER_INFO_CREATED)
+		return -1;
+
+	member_e = &login->lag_gw_neigh[member_id];
+
+	/* create new entry */
+	member_e->member_id = member_id;
+	member_e->neigh.lid = member->lid;
+	member_e->neigh.qpn = member->qpn;
+	member_e->gw_id = member->gw_port_id;
+	member_e->neigh.login = login;
+	INIT_DELAYED_WORK(&member_e->neigh.destroy_task, vnic_neighe_dealloc_task);
+	skb_queue_head_init(&member_e->neigh.pkt_queue);
+	init_completion(&member_e->neigh.query_comp);
+	complete(&member_e->neigh.query_comp); /* mark as complete since no query is running */
+	member_e->neigh.valid = 0;
+	member_e->neigh.pquery = ERR_PTR(-ENODATA);
+	member_e->neigh.query_id = -1;
+	member_e->neigh.ah = ERR_PTR(-ENODATA); /* ah query will be done via datapath */
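+	/* Lazy vs. eager address handles: with vnic_sa_query set, the AH stays
+	 * ERR_PTR(-ENODATA) and the first transmit triggers an SA path query;
+	 * otherwise it is allocated eagerly below from the member LID.
+	 */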
+	if (!vnic_sa_query) {
+		member_e->neigh.ah = vnic_ah_alloc(login, member->lid);
+		if (IS_ERR(member_e->neigh.ah))
+			return -ENOMEM;
+	}
+	/* need to add multicast code */
+	ret = _vnic_add_member_mgid(login, member_e);
+	if (ret)
+		goto free_ah;
+
+	netif_tx_lock_bh(login->dev);
+	member_e->info = GW_MEMBER_INFO_CREATED;
+	if (member->eport_state)
+		member_e->info |= GW_MEMBER_INFO_EPORT_UP;
+	login->lag_member_count++;
+	netif_tx_unlock_bh(login->dev);
+
+	return 0;
+free_ah:
+	if (!IS_ERR(member_e->neigh.ah))
+		ib_destroy_ah(member_e->neigh.ah);
+	return ret;
+}
+
+void vnic_member_remove_all(struct vnic_login *login)
+{
+	int i;
+
+	if (!login->is_lag)
+		return;
+
+	for (i = 0; i < MAX_LAG_MEMBERS; i++)
+		vnic_member_remove(login, i);
+}
+
+int vnic_member_remove(struct vnic_login *login, int member_id)
+{
+	struct vnic_gw_info *member_e;
+
+	vnic_dbg_lag(login->name, "vnic_member_remove for id %d\n", member_id);
+
+	if (member_id >= MAX_LAG_MEMBERS || member_id < 0)
+		return -1;
+
+	member_e = &login->lag_gw_neigh[member_id];
+
+	vnic_dbg_lag(login->name, "vnic_member_remove id:%d gw_id:%d lid:%x qpn:%x sl:%d\n",
+		     member_id, member_e->gw_id, member_e->neigh.lid,
+		     member_e->neigh.qpn, member_e->neigh.sl);
+
+	/* member id is not in use */
+	if (!(member_e->info & GW_MEMBER_INFO_CREATED))
+		return -1;
+
+	if (member_e->neigh.query_id >= 0 && member_e->neigh.pquery && !IS_ERR(member_e->neigh.pquery))
+		ib_sa_cancel_query(member_e->neigh.query_id, member_e->neigh.pquery);
+
+	netif_tx_lock_bh(login->dev);
+	if (member_e->info & GW_MEMBER_INFO_MAPPED)
+		_vnic_remove_member_from_map(login, member_e->member_id);
+	member_e->info &= ~(GW_MEMBER_INFO_MAPPED);
+	member_e->neigh.valid = 0;
+	netif_tx_unlock_bh(login->dev);
+
+	/* wait for completion after the entry was removed from login data path */
+	wait_for_completion(&member_e->neigh.query_comp);
+
+	/* modification of map will be done through mcast CB if needed */
+	vnic_mcast_del_user(&login->mcast_tree, member_e);
+
+	if (member_e->neigh.ah && !IS_ERR(member_e->neigh.ah))
+		ib_destroy_ah(member_e->neigh.ah);
+	member_e->neigh.ah = ERR_PTR(-ENODATA);
+	member_e->info = 0;
+	login->lag_member_count--;
+
+	return 0;
+}
+
+void vnic_member_prop(struct vnic_login *login, struct lag_properties *prop)
+{
+	if (login->lag_prop.hash_mask != prop->hash_mask) {
+		netif_tx_lock_bh(login->dev);
+		memcpy(&login->lag_prop, prop,
+		       sizeof(login->lag_prop));
+		netif_tx_unlock_bh(login->dev);
+	}
+}
+
+/*
+ * Modify a specific LAG eport member's parameters. The parameters might not be
+ * "interesting" and might not affect data traffic. They might require creating
+ * a new ah, or might even result in a modification of the transmit hash mapping
+ * function.
+ */
+int vnic_member_modify(struct vnic_login *login, int member_id, struct lag_member *member)
+{
+	struct vnic_gw_info *member_e;
+
+	if (member_id >= MAX_LAG_MEMBERS || member_id < 0)
+		return -1;
+
+	member_e = &login->lag_gw_neigh[member_id];
+
+	vnic_dbg_lag(login->name, "vnic_member_modify id:%d gw_id:%d lid:%x qpn:%x sl:%d\n",
+		     member_id, member_e->gw_id, member_e->neigh.lid,
+		     member_e->neigh.qpn, member_e->neigh.sl);
+
+	/* member id is not in use */
+	if (!(member_e->info & GW_MEMBER_INFO_CREATED))
+		return -1;
+
+	/* change in LID requires a new ah */
+	/* TODO Test this */
+	if (member_e->neigh.lid != member->lid) {
+		/* take tx lock to make sure the ah is not being used */
+		if (vnic_sa_query) {
+			/* Cancel the SA query in case one is running */
+			if (member_e->neigh.query_id >= 0 && member_e->neigh.pquery && !IS_ERR(member_e->neigh.pquery))
+				ib_sa_cancel_query(member_e->neigh.query_id, member_e->neigh.pquery);
+			netif_tx_lock_bh(login->dev);
+			member_e->neigh.lid = member->lid;
+			member_e->neigh.valid = 0;
+			if (member_e->neigh.ah && !IS_ERR(member_e->neigh.ah)) {
+				/* lid is not the same: destroy AH */
+				ib_destroy_ah(member_e->neigh.ah);
+				member_e->neigh.ah = ERR_PTR(-ENODATA);
+			}
+			netif_tx_unlock_bh(login->dev);
+		} else {
+			struct ib_ah *ah, *ah1;
+
+			ah = member_e->neigh.ah;
+			ah1 = vnic_ah_alloc(login, member->lid);
+			if (IS_ERR(ah1))
+				return -ENOMEM;
+			netif_tx_lock_bh(login->dev);
+			member_e->neigh.lid = member->lid;
+			member_e->neigh.ah = ah1;
+			netif_tx_unlock_bh(login->dev);
+			ib_destroy_ah(ah);
+		}
+	}
+
+	if (member_e->neigh.qpn != member->qpn)
+		member_e->neigh.qpn = member->qpn;
+
+	netif_tx_lock_bh(login->dev);
+	/* link changed from up to down */
+	if (member_e->info & GW_MEMBER_INFO_MAPPED && !member->eport_state) {
+		_vnic_remove_member_from_map(login, member_id);
+		member_e->info &= ~(GW_MEMBER_INFO_MAPPED | GW_MEMBER_INFO_EPORT_UP);
+	}
+
+	/* link changed from down to up and mcasts are connected */
+	if (!(member_e->info & GW_MEMBER_INFO_MAPPED) &&
+	    member->eport_state) {
+		if (member_e->info & GW_MEMBER_INFO_MCAST) {
+			_vnic_add_member_to_map(login, member_id);
+			member_e->info |= (GW_MEMBER_INFO_MAPPED | GW_MEMBER_INFO_EPORT_UP);
+		} else
+			member_e->info |= GW_MEMBER_INFO_EPORT_UP;
+	}
+	netif_tx_unlock_bh(login->dev);
+
+	return 0;
+}
+
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_neigh.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_neigh.c
new file mode 100644
index 0000000000000..a331aebbc6dc4
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_neigh.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + +#include "vnic.h" +#include "vnic_data.h" + +void vnic_neighe_dealloc_task(struct work_struct *work) +{ + struct vnic_neigh *neighe = + container_of(work, struct vnic_neigh, destroy_task.work); + if (IS_NEIGH_QUERY_RUNNING(neighe)) + ib_sa_cancel_query(neighe->query_id, neighe->pquery); + wait_for_completion(&neighe->query_comp); + if (neighe->ah && !IS_ERR(neighe->ah)) + ib_destroy_ah(neighe->ah); + kfree(neighe); +} + +void vnic_neighe_dealloc(struct vnic_neigh *neighe) +{ + ASSERT(neighe); + /* calls vnic_neighe_dealloc_task */ + queue_delayed_work(neighe->login->neigh_wq, &neighe->destroy_task, 0); +} + +struct ib_ah *vnic_ah_alloc(struct vnic_login *login, u16 dlid) +{ + struct ib_ah_attr av; + struct ib_ah *ah; + + memset(&av, 0, sizeof(av)); + av.dlid = dlid; + av.port_num = login->port->num; + av.sl = login->sl; /* PATH Query is need here to allocate the data sl*/ + ah = ib_create_ah(login->port->pd, &av); + if (IS_ERR(ah)) { + return ERR_PTR(-ENOMEM); + } + return(ah); +} + +struct vnic_neigh *vnic_neighe_alloc(struct vnic_login *login, + const u8 *mac, + u16 dlid, u32 dqpn, u8 rss) +{ + struct vnic_neigh *neighe; + neighe = kzalloc(sizeof *neighe, GFP_ATOMIC); + if (!neighe) + return ERR_PTR(-ENOMEM); + INIT_DELAYED_WORK(&neighe->destroy_task, vnic_neighe_dealloc_task); + skb_queue_head_init(&neighe->pkt_queue); + if (mac) + memcpy(neighe->mac, mac, ETH_ALEN); + neighe->rss = rss; + neighe->ah = ERR_PTR(-ENODATA); + if (!vnic_sa_query) { + neighe->ah = vnic_ah_alloc(login, dlid); + if (IS_ERR(neighe->ah)) { + kfree(neighe); + return ERR_PTR(-ENOMEM); + } + } + init_completion(&neighe->query_comp); + complete(&neighe->query_comp); /* mark as complete since no query is running */ + neighe->pquery = ERR_PTR(-ENODATA); + neighe->query_id = -1; + neighe->qpn = dqpn; + neighe->lid = dlid; + neighe->login = login; + + return neighe; +} + +void vnic_neighe_del(struct vnic_login *login, struct vnic_neigh *neighe) +{ + ASSERT(neighe); + rb_erase(&neighe->rb_node, &login->neigh_tree); +} + +int vnic_neighe_add(struct vnic_login *login, struct vnic_neigh *neighe) +{ + struct rb_node **n = &login->neigh_tree.rb_node, *pn = NULL; + struct vnic_neigh *neighe_t; + int rc; + + while (*n) { + pn = *n; + neighe_t = rb_entry(pn, struct vnic_neigh, rb_node); + rc = memcmp(neighe->mac, neighe_t->mac, ETH_ALEN); + if (rc < 0) + n = &pn->rb_left; + else if (rc > 0) + n = &pn->rb_right; + else { + rc = -EEXIST; + goto out; + } + } + + rb_link_node(&neighe->rb_node, pn, n); + rb_insert_color(&neighe->rb_node, &login->neigh_tree); + rc = 0; + +out: + return rc; +} + +struct vnic_neigh *vnic_neighe_search(struct vnic_login *login, u8 *mac) +{ + struct rb_node *n = login->neigh_tree.rb_node; + struct vnic_neigh *neighe_t; + int rc; + + while (n) { + neighe_t = rb_entry(n, struct vnic_neigh, rb_node); + rc = memcmp(mac, neighe_t->mac, ETH_ALEN); + if (rc < 0) + n = n->rb_left; + else if (rc > 0) + n = n->rb_right; + else { + vnic_dbg_data(login->name, + "found: mac "MAC_6_PRINT_FMT" vid %d " + "qpn 0x%06x lid 0x%02x\n", + MAC_6_PRINT_ARG(neighe_t->mac), + be16_to_cpu(login->vid), neighe_t->qpn, + neighe_t->lid); + goto out; + } + } + neighe_t = ERR_PTR(-ENODATA); + +out: + return neighe_t; +} + +void vnic_neigh_del_all(struct vnic_login *login) +{ + struct rb_node *n; + struct vnic_neigh *neighe; + + ASSERT(login); + n = rb_first(&login->neigh_tree); + while (n) { + neighe = rb_entry(n, struct vnic_neigh, rb_node); + vnic_neighe_del(login, neighe); + n = rb_first(&login->neigh_tree); + 
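+		/* dealloc only after the node is unlinked; the actual teardown
+		 * is deferred to a workqueue (vnic_neighe_dealloc_task) */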
vnic_neighe_dealloc(neighe);
+	}
+}
+
+void vnic_neigh_invalidate(struct vnic_login *login)
+{
+	struct vnic_neigh *neighe;
+	struct rb_node *n;
+	int i;
+
+	if (login->gw_neigh && !IS_ERR(login->gw_neigh))
+		login->gw_neigh->valid = 0;
+
+	n = rb_first(&login->neigh_tree);
+	while (n) {
+		neighe = rb_entry(n, struct vnic_neigh, rb_node);
+		neighe->valid = 0;
+		n = rb_next(n);
+	}
+
+	if (login->is_lag)
+		for (i = 0; i < MAX_LAG_MEMBERS; i++)
+			login->lag_gw_neigh[i].neigh.valid = 0;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_netdev.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_netdev.c
new file mode 100644
index 0000000000000..abfd2e237671c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_netdev.c
@@ -0,0 +1,1085 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + +#include "vnic.h" +#include "vnic_data.h" + +extern struct net_device_stats *mlx4_vnic_stats_func_container(struct net_device *n); + +static int mlx4_vnic_vlan_rx_add_vid(struct net_device *dev, __be16 proto, + unsigned short vid) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + vnic_dbg_data(login->name, "add VLAN:%d was called\n", vid); + return 0; +} + +static int mlx4_vnic_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, + unsigned short vid) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + vnic_dbg_data(login->name, "Kill VID:%d was called\n", vid); + return 0; +} + +void vnic_carrier_update(struct vnic_login *login) +{ + int attached, eport_up, eport_enforce, carrier_ok; + + ASSERT(login); + attached = test_bit(VNIC_STATE_LOGIN_BCAST_ATTACH, &login->fip_vnic->login_state); + eport_up = fip_vnic_get_eport_state(login->fip_vnic); + eport_enforce = vnic_eport_state_enforce; + carrier_ok = netif_carrier_ok(login->dev); + + /* bring carrier up */ + if (!carrier_ok && attached && (!eport_enforce || eport_up)) { + set_bit(VNIC_STATE_NETDEV_CARRIER_ON, &login->netdev_state); + netif_carrier_on(login->dev); + vnic_info("%s link is up\n", login->dev->name); + return; + } + + /* bring carrier down */ + if (carrier_ok && (!attached || (!eport_up && eport_enforce))) { + clear_bit(VNIC_STATE_NETDEV_CARRIER_ON, &login->netdev_state); + netif_carrier_off(login->dev); + vnic_info("%s link is down\n", login->dev->name); + return; + } + +} + +void __bcast_attach_cb(struct vnic_mcast *mcaste, void *login_ptr) +{ + struct vnic_login *login = login_ptr; + + /* When SA is local, mcast join works even when port is down */ + if (login->port->attr.state != IB_PORT_ACTIVE) + return; + set_bit(VNIC_STATE_LOGIN_BCAST_ATTACH, &login->fip_vnic->login_state); + vnic_carrier_update(login); +} + +void __bcast_detach_cb(struct vnic_mcast *mcaste, void *login_ptr) +{ + struct vnic_login *login = login_ptr; + + clear_bit(VNIC_STATE_LOGIN_BCAST_ATTACH, &login->fip_vnic->login_state); + vnic_carrier_update(login); +} + +/* this function cannot sleep, avoid any mutex() in consequent calls */ +static int vnic_set_mac(struct net_device *dev, void *_mac) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + struct sockaddr *saddr = _mac; + u8 *mac = (u8 *)(saddr->sa_data); + int rc = 0; + + vnic_dbg_func(login->name); + + vnic_dbg_mac(login->name, "mac "MAC_6_PRINT_FMT" => "MAC_6_PRINT_FMT"\n", + MAC_6_PRINT_ARG((u8 *)(dev->dev_addr)), + MAC_6_PRINT_ARG(mac)); + + /* must support child vNics for mac modification */ + if (!vnic_child_max) + return -ENOSYS; + + /* skip if invalid address */ + if (unlikely(!is_valid_ether_addr(mac))) + return -EINVAL; + + /* skip if same mac was already set */ + if (!(memcmp((u8 *)(dev->dev_addr), mac, ETH_ALEN))) + return 0; + + /* already in bh, calls vnic_child_update that queues a job, + * so read_lock is enough + */ + read_lock(&login->mac_rwlock); + + /* if mac same as original, delete child, set mac and return */ + if (!(memcmp(mac, login->dev_addr, ETH_ALEN))) + goto out; + + /* else, this is a new child vNic, + * add new child vNic + * NOTE: pay attention that the GC should not destroy a child vNic that + * is being used as mac-change even if it was created by different + * source. 
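+	 * (-EEXIST from vnic_child_update() below just means such a child is
+	 * already registered and is not treated as an error here)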
+ */ + rc = vnic_child_update(login, mac, 0); + if (rc && rc != -EEXIST) + goto err; + +out: + memcpy(dev->dev_addr, mac, ETH_ALEN); + vnic_child_update(login, (u8 *)(dev->dev_addr), 1); + vnic_dbg_mac(login->name, "mac changed successfully to " + MAC_6_PRINT_FMT"\n", MAC_6_PRINT_ARG(mac)); + +err: + read_unlock(&login->mac_rwlock); + return rc; +} + +static void vnic_set_multicast_list(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + vnic_dbg_func(login->name); + + /* test promisc flag changes */ + if (is_ucast_promisc(login) && !login->promisc) { + /* promisc is being set */ + if (!vnic_child_max) { + /* must support child vNics for promisc mode */ + vnic_info("%s promisc mode cannot be set " + "(vnic_child_max %u)\n", + dev->name, vnic_child_max); + } else if (vnic_src_mac_enforce) { + /* cannot support promisc if source mac is enforced + * because sender should be able to use any smac + */ + vnic_info("%s promisc mode cannot be set " + "(vnic_src_mac_enforce %u)\n", + dev->name, vnic_src_mac_enforce); + } else { + login->promisc = 1; + vnic_dbg_mac(dev->name, + "entered promiscuous mode: confirmed\n"); + } + } else if (!is_ucast_promisc(login) && login->promisc) { + /* promisc is being cleared */ + login->promisc = 0; + write_lock(&login->mac_rwlock); + vnic_child_flush(login, 0); + write_unlock(&login->mac_rwlock); + vnic_dbg_mac(dev->name, + "left promiscuous mode: confirmed\n"); + } + + /* test mcast changes */ + if (!no_bxm && !login->queue_stopped) { + dev_hold(dev); + if (!queue_delayed_work(login_wq, &login->mcast_task, HZ / 100)) + dev_put(dev); + } +} + +static void vnic_auto_moder(struct vnic_login *login) +{ + unsigned long period = + (unsigned long)(jiffies - login->last_moder_jiffies); + unsigned long packets; + unsigned long rate; + unsigned long avg_pkt_size; + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long tx_packets; + unsigned long tx_pkt_diff; + unsigned long rx_pkt_diff; + int moder_time; + + period = (unsigned long)(jiffies - login->last_moder_jiffies); +#if 0 + vnic_dbg_moder_v(login->name, "adaptive_rx_coal %d, period %d, " + "sample_interval %d, state %d\n", + login->adaptive_rx_coal, period, + login->sample_interval, login->port->attr.state); +#endif + + if (!login->adaptive_rx_coal || period < login->sample_interval * HZ) + return; + + /* TODO: when NAPI is disabled, the RX completion will be called from + * IRQ context (and not BH context) and thus spin_lock_bh should be + * replaced with spin_lock_irq + */ + spin_lock_bh(&login->stats_lock); + rx_packets = login->stats.rx_packets; + rx_bytes = login->stats.rx_bytes; + tx_packets = login->stats.tx_packets; + spin_unlock_bh(&login->stats_lock); + + if (!login->last_moder_jiffies || !period) + goto out_set; + + tx_pkt_diff = ((unsigned long)(tx_packets - + login->last_moder_tx_packets)); + rx_pkt_diff = ((unsigned long)(rx_packets - login->last_moder_packets)); + packets = max(tx_pkt_diff, rx_pkt_diff); + rate = packets * HZ / period; + avg_pkt_size = packets ? ((unsigned long)(rx_bytes - + login->last_moder_bytes)) / + packets : 0; + + if (rate > VNIC_RX_RATE_THRESH && avg_pkt_size > VNIC_AVG_PKT_SMALL) { + /* If tx and rx packet rates are not balanced, assume that + * traffic is mainly BW bound and apply maximum moderation. 
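+		 * (the 2:3 ratio tests below are what detect that imbalance).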
+ * Otherwise, moderate according to packet rate */ + if (2 * tx_pkt_diff > 3 * rx_pkt_diff || + 2 * rx_pkt_diff > 3 * tx_pkt_diff) { + moder_time = login->rx_usecs_high; + } else { + if (rate < login->pkt_rate_low) + moder_time = login->rx_usecs_low; + else if (rate > login->pkt_rate_high) + moder_time = login->rx_usecs_high; + else + moder_time = (rate - login->pkt_rate_low) * + (login->rx_usecs_high - login->rx_usecs_low) / + (login->pkt_rate_high - login->pkt_rate_low) + + login->rx_usecs_low; + } + } else { + moder_time = login->rx_usecs_low; + } + + if (moder_time != login->last_moder_time) { + vnic_dbg_moder(login->name, "tx rate:%lu rx_rate:%lu\n", + tx_pkt_diff * HZ / period, + rx_pkt_diff * HZ / period); + vnic_dbg_moder(login->name, + "Rx moder_time changed from:%lu to %d period:%lu" + " [jiff] packets:%lu avg_pkt_size:%lu rate:%lu" + " [p/s])\n", login->last_moder_time, moder_time, + period, packets, avg_pkt_size, rate); + login->last_moder_time = moder_time; + vnic_ib_set_moder(login, + login->last_moder_time, login->rx_frames, + login->tx_usecs, login->tx_frames); + } + +out_set: + login->last_moder_packets = rx_packets; + login->last_moder_tx_packets = tx_packets; + login->last_moder_bytes = rx_bytes; + login->last_moder_jiffies = jiffies; +} + +void vnic_dump_stats(struct vnic_login *login) +{ + unsigned long *stats, *login_stats = (unsigned long *)(&login->stats); + int i, j, len = sizeof(struct net_device_stats) / sizeof(unsigned long); + struct net_device_stats stats_tmp; + + spin_lock_bh(&login->stats_lock); + /* tx stats are distributed between tx_res entries */ + stats_tmp = login->stats; + memset(&login->stats, 0, sizeof(struct net_device_stats)); + for (i = 0; i < login->tx_rings_num; ++i) { + stats = (unsigned long *)(&login->tx_res[i].stats); + for (j = 0; j < len; ++j) + login_stats[j] += stats[j]; + } + + /* rx stats are in login->stats */ + login->stats.rx_bytes = stats_tmp.rx_bytes; + login->stats.rx_packets = stats_tmp.rx_packets; + login->stats.rx_errors = stats_tmp.rx_errors; + login->stats.rx_dropped = stats_tmp.rx_dropped; + spin_unlock_bh(&login->stats_lock); +} + +static void vnic_do_get_stats(struct work_struct *work) +{ + struct vnic_login *login = + container_of(work, struct vnic_login, stats_task.work); + + mutex_lock(&login->moder_lock); + vnic_dump_stats(login); + + if (login->queue_stopped) + goto out; + + if (!(test_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state))) + goto resched; + + if (login->port->attr.state == IB_PORT_ACTIVE) + vnic_auto_moder(login); + +resched: + /* calls vnic_do_get_stats() */ + if (!login->queue_stopped) + queue_delayed_work(login_wq, &login->stats_task, VNIC_STATS_DELAY); +out: + mutex_unlock(&login->moder_lock); +} + +static void vnic_mcast_reattach(struct work_struct *work) +{ + struct vnic_mcast *mcaste, *mcaste_t; + struct rb_node *n; + unsigned long flags; + union vhub_mgid mgid; + LIST_HEAD(local_list); + int i; + struct vnic_gw_info *lag_member; + struct vnic_login *login; + struct net_device *dev; +#ifndef _BP_NO_MC_LIST + struct dev_mc_list *mclist; +#else + struct netdev_hw_addr *ha; +#endif + + login = container_of(work, struct vnic_login, mcast_task.work); + dev = login->dev; + + vnic_dbg_mcast(login->name, "set_multicast_list was notified\n"); + if (login->queue_stopped) { + dev_put(dev); + return; + } + + /* detach all mcast (except default and bcast mcasts) */ + spin_lock_irqsave(&login->mcast_tree.mcast_rb_lock, flags); + if (!list_empty(&login->mcast_tree.reattach_list)) { + /* an event is being 
processed */
+		spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags);
+		goto retry;
+	}
+
+	for (n = rb_first(&login->mcast_tree.mcast_tree); n; n = rb_next(n)) {
+		mcaste = rb_entry(n, struct vnic_mcast, rb_node);
+		if (IS_ZERO_MAC(mcaste->mac))
+			continue;
+		if (IS_BCAST_MAC(mcaste->mac))
+			continue;
+		list_add_tail(&mcaste->list, &local_list);
+	}
+
+	list_for_each_entry(mcaste, &local_list, list) {
+		vnic_mcast_del(&login->mcast_tree, mcaste);
+		mcaste->attach_task_cnt = 0;
+	}
+
+	spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags);
+
+	vnic_dbg_mcast(login->name, "local_list is %s empty n_mac_mcgid %u\n",
+		       (list_empty(&local_list) ? "" : "not"),
+		       login->n_mac_mcgid);
+
+	list_for_each_entry_safe(mcaste, mcaste_t, &local_list, list) {
+		list_del(&mcaste->list);
+		vnic_mcast_detach(&login->mcast_tree, mcaste);
+		vnic_mcast_dealloc(mcaste);
+	}
+
+	/* attach all mcasts in mc_list */
+	vhub_mgid_create(login->mgid_prefix, ETH_ZERO_MAC, login->n_mac_mcgid,
+			 CREATE_VHUB_ID(login->vid, login->gw_port_id),
+			 VHUB_MGID_DATA, 0, &mgid);
+
+	spin_lock_irqsave(&login->mcast_tree.mcast_rb_lock, flags);
+	mcaste_t = vnic_mcast_search(&login->mcast_tree, &mgid.ib_gid);
+	if (IS_ERR(mcaste_t) || !test_bit(VNIC_STATE_LOGIN_BCAST_ATTACH, &login->fip_vnic->login_state)) {
+		vnic_dbg_data(login->name, "default mgid not ready\n");
+		spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags);
+		dev_put(dev);
+		return;
+	}
+	spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags);
+
+	/* hold the tx lock so set_multicast_list() won't change mc_list */
+	netif_tx_lock_bh(dev);
+#ifndef _BP_NO_MC_LIST
+	for (mclist = login->dev->mc_list; mclist; mclist = mclist->next) {
+		u8 *mmac = mclist->dmi_addr;
+#else
+	netdev_for_each_mc_addr(ha, login->dev) {
+		u8 *mmac = ha->addr;
+#endif
+		/* do not add the default MGIDs because they are always used */
+		if (IS_ZERO_MAC(mmac))
+			continue;
+		if (IS_BCAST_MAC(mmac))
+			continue;
+
+		/* attach to the legacy GW / LAG gw id MGID */
+		if (_vnic_mcast_attach_mgid(login, mmac, mcaste_t, login,
+					    login->gw_port_id))
+			goto attach_failed;
+
+		if (!login->is_lag)
+			continue;
+
+		for (i = 0; i < MAX_LAG_MEMBERS; i++) {
+			lag_member = &login->lag_gw_neigh[i];
+			/* member id is already in use */
+			if (lag_member->info & GW_MEMBER_INFO_CREATED)
+				/* attach to the legacy GW / LAG gw id MGID */
+				if (_vnic_mcast_attach_mgid(login, mmac,
+							    mcaste_t,
+							    lag_member,
+							    lag_member->gw_id))
+					goto attach_failed;
+		}
+	}
+	netif_tx_unlock_bh(dev);
+	dev_put(dev);
+	return;
+
+attach_failed:
+	netif_tx_unlock_bh(dev);
+	vnic_mcast_del_all(&login->mcast_tree);
+
+retry:
+	if (!login->queue_stopped) {
+		if (!queue_delayed_work(login_wq, &login->mcast_task, HZ / 100))
+			dev_put(dev);
+	} else
+		dev_put(dev);
+}
+
+static int vnic_change_mtu(struct net_device *dev, int new_mtu)
+{
+	struct vnic_login *login = vnic_netdev_priv(dev);
+
+	if (new_mtu > login->max_mtu) {
+		vnic_warn(login->name, "failed: new_mtu %d > %d\n", new_mtu,
+			  login->max_mtu);
+		return -EINVAL;
+	}
+
+	vnic_dbg_data(login->name, "mtu %d -> %d\n", dev->mtu, new_mtu);
+	dev->mtu = new_mtu;
+
+	return 0;
+}
+
+static void vnic_set_default_moder(struct vnic_login *login)
+{
+	login->rx_frames = VNIC_RX_COAL_TARGET / login->dev->mtu + 1;
+	login->rx_usecs = VNIC_RX_COAL_TIME;
+	login->tx_frames = VNIC_TX_COAL_PKTS;
+	login->tx_usecs = VNIC_TX_COAL_TIME;
+	login->pkt_rate_low = VNIC_RX_RATE_LOW;
+	login->rx_usecs_low = VNIC_RX_COAL_TIME_LOW;
+	login->pkt_rate_high = VNIC_RX_RATE_HIGH;
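+	/* vnic_auto_moder() interpolates linearly between these bounds when
+	 * the packet rate falls between pkt_rate_low and pkt_rate_high:
+	 *   moder = (rate - low) * (usecs_high - usecs_low) / (high - low)
+	 *           + usecs_low
+	 * so, purely as an illustration, a rate halfway between the bounds
+	 * yields the midpoint of rx_usecs_low..rx_usecs_high.
+	 */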
login->sample_interval = VNIC_SAMPLE_INTERVAL; + login->adaptive_rx_coal = 1; + login->last_moder_time = VNIC_AUTO_CONF; + login->last_moder_jiffies = 0; + login->last_moder_packets = 0; + login->last_moder_tx_packets = 0; + login->last_moder_bytes = 0; + + vnic_dbg_data(login->name, "default coalescing params for mtu:%d to " + "rx_frames:%d rx_usecs:%d " + "tx_frames:%d tx_usecs:%d\n", + login->dev->mtu, + login->rx_frames, login->rx_usecs, + login->tx_frames, login->tx_usecs); +} + +#ifndef _BP_NAPI_POLL +int vnic_napi_alloc(struct vnic_login *login, int rx_res_index) +{ + + struct napi_struct *napi = &login->rx_res[rx_res_index].napi; + + netif_napi_add(login->dev, napi, vnic_poll_cq_rx, vnic_napi_weight); + + return 0; +} + +void vnic_napi_enable(struct vnic_login *login, int rx_res_index) +{ + + struct napi_struct *napi = &login->rx_res[rx_res_index].napi; + napi_enable(napi); +} + +static void vnic_napi_disable(struct vnic_login *login, int rx_res_index) +{ + struct napi_struct *napi = &login->rx_res[rx_res_index].napi; + + if (!napi->poll) + return; + + napi_disable(napi); +} + +static void vnic_napi_dealloc(struct vnic_login *login, int rx_res_index) +{ +#ifndef _BP_NAPI_NO_DEL + struct napi_struct *napi = &login->rx_res[rx_res_index].napi; + + netif_napi_del(napi); +#else + return; +#endif +} + +#else +int vnic_napi_alloc(struct vnic_login *login, int rx_res_index) +{ + struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index]; + char name[IFNAMSIZ]; + + snprintf(name, IFNAMSIZ, "%s-N%d", login->name, rx_res_index); + rx_res->poll_dev = + alloc_netdev(0, name, ether_setup); + if (!rx_res->poll_dev) + return -ENOMEM; + + rx_res->poll_dev = rx_res->poll_dev; + rx_res->poll_dev->priv = rx_res; + rx_res->poll_dev->weight = vnic_napi_weight; + rx_res->poll_dev->poll = vnic_poll_cq_rx; + + return 0; +} + +void vnic_napi_enable(struct vnic_login *login, int rx_res_index) +{ + struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index]; + + ASSERT(rx_res->poll_dev); + set_bit(__LINK_STATE_START, &rx_res->poll_dev->state); +} + +static void vnic_napi_disable(struct vnic_login *login, int rx_res_index) +{ + struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index]; + struct net_device *poll_dev = rx_res->poll_dev; + + if (!poll_dev) + return; + + while (test_bit(__LINK_STATE_RX_SCHED, &poll_dev->state)) + msleep(VNIC_NAPI_SCHED_TIMEOUT); +} + +static void vnic_napi_dealloc(struct vnic_login *login, int rx_res_index) +{ + struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index]; + struct net_device *poll_dev = rx_res->poll_dev; + + if (!poll_dev) + return; + + free_netdev(poll_dev); + rx_res->poll_dev = NULL; +} +#endif + +static int _vnic_open(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int i; + + /* Todo add locks here */ + if (!(test_bit(VNIC_STATE_LOGIN_CREATE_2, &login->fip_vnic->login_state))) { + set_bit(VNIC_STATE_NETDEV_OPEN_REQ, &login->netdev_state); + return 0; + } + + if (test_and_set_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state)) + return 0; + + clear_bit(VNIC_STATE_NETDEV_OPEN_REQ, &login->netdev_state); + + /* ARM RX handlers */ + for (i = 0; i < login->rx_rings_num; ++i) { + login->rx_res[i].stopped = 0; + if (ib_req_notify_cq(login->rx_res[i].cq, IB_CQ_NEXT_COMP)) { + vnic_err(login->name, "ib_req_notify_cq failed\n"); + goto err; + } + } + + /* ARM TX handlers */ + for (i = 0; i < login->tx_rings_num; ++i) { + login->tx_res[i].stopped = 0; + spin_lock_init(&login->tx_res[i].lock); + if (!vnic_tx_polling && + 
ib_req_notify_cq(login->tx_res[i].cq, IB_CQ_NEXT_COMP)) { + vnic_err(login->name, "ib_req_notify_cq failed\n"); + goto err; + } + } + + /* enable napi*/ + for (i = 0; i < login->napi_num; ++i) + vnic_napi_enable(login, i); + + /* move QP to RTS, post recv skb */ + if (vnic_ib_open(dev)) + goto err_napi; + + /* dummy call */ + if (vnic_ib_up(dev)) + goto err_ib_stop; + + /* configure */ + vnic_set_default_moder(login); + if (vnic_ib_set_moder(login, login->last_moder_time, login->rx_frames, + login->tx_usecs, login->tx_frames)) + vnic_warn(login->name, "vnic_ib_set_moder failed!\n"); + + /* start interface TX queue */ + VNIC_TXQ_START_ALL(login); + + /* report and return */ + vnic_info("%s is opened\n", dev->name); + + return 0; + +err_ib_stop: + vnic_ib_stop(dev); +err_napi: + /* disable napi*/ + for (i = 0; i < login->napi_num; ++i) + vnic_napi_disable(login, i); +err: + clear_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state); + return -EINVAL; +} + +static int vnic_open(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int ret; + + vnic_dbg_func(login->name); + + mutex_lock(&login->state_lock); + ret = _vnic_open(dev); + mutex_unlock(&login->state_lock); + return ret; +} + +static int _vnic_stop(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int i, _watchdog_timeo = dev->watchdog_timeo; + + /* check if already stopped */ + if (!(test_and_clear_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state))) + return 0; + + /* Set trans_start to jiffies and watchdog_timeo to max + * to avoid spurious transmit timeouts in the interval between + * tx queue stopped and carrier down. + */ + dev->trans_start = jiffies; + dev->watchdog_timeo = 0x7fffffff; + + VNIC_TXQ_STOP_ALL(login); + + /* disable rx handlers */ + for (i = 0; i < login->rx_rings_num; ++i) + login->rx_res[i].stopped = 1; + + /* disable tx handlers */ + for (i = 0; i < login->tx_rings_num; ++i) + login->tx_res[i].stopped = 1; + + /* disable napi managers */ + for (i = 0; i < login->napi_num; ++i) + vnic_napi_disable(login, i); + + vnic_ib_down(dev); + vnic_ib_stop(dev); + + /* restore watchdog_timeo */ + dev->watchdog_timeo = _watchdog_timeo; + + vnic_info("%s is stopped\n", dev->name); + + return 0; +} + +static int vnic_stop(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int ret; + + vnic_dbg_func(login->name); + + mutex_lock(&login->state_lock); + ret = _vnic_stop(dev); + mutex_unlock(&login->state_lock); + + return ret; +} + +int vnic_restart(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int rc = 0; + + if (login->queue_stopped || !test_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state)) + return rc; + + set_bit(VNIC_STATE_NETDEV_NO_TX_ENABLE, &login->netdev_state); + netif_tx_disable(login->dev); + + mutex_lock(&login->state_lock); + _vnic_stop(login->dev); + + clear_bit(VNIC_STATE_NETDEV_NO_TX_ENABLE, &login->netdev_state); + set_bit(VNIC_STATE_NETDEV_OPEN_REQ, &login->netdev_state); + + rc = _vnic_open(login->dev); + mutex_unlock(&login->state_lock); + + return rc; +} + +static void vnic_restart_task(struct work_struct *work) +{ + struct vnic_login *login = + container_of(work, struct vnic_login, restart_task.work); + + vnic_restart(login->dev); +} + +struct net_device_stats *vnic_get_stats(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + if (dev->reg_state != NETREG_REGISTERED) + return &dev->stats; + + spin_lock_bh(&login->stats_lock); + if 
(test_bit(VNIC_STATE_LOGIN_PRECREATE_2, &login->fip_vnic->login_state)) + memcpy(&dev->stats, &login->stats, sizeof(login->stats)); + spin_unlock_bh(&login->stats_lock); + + return &dev->stats; +} + +static void vnic_tx_timeout(struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + vnic_warn(login->name, "TX timeout called on port: %d, " + "latency: %d msec, stopped: %d, carrier_ok: %d," + "queue_stopped: %d, watchdog_timeo: %d msec\n", + login->port->num, + jiffies_to_msecs(jiffies - dev->trans_start), + netif_queue_stopped(dev), netif_carrier_ok(dev), + login->queue_stopped, + jiffies_to_msecs(dev->watchdog_timeo)); + + if (netif_carrier_ok(dev)) { + VNIC_STATS_DO_INC(login->port_stats.tx_timeout); + if (!login->queue_stopped) { + vnic_warn(login->name, "TX timeout, queueing rings restart\n"); + queue_delayed_work(login_wq, &login->restart_task, HZ / 100); + } + } +} + +#ifndef _BP_NETDEV_NO_TMQ +u16 vnic_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, select_queue_fallback_t fallback) +{ + /* Notes: + * - In kernel 2.6.32 the skb->mac_header 0x1a is not set when + * select_queue() is called + * - In OVM Server 3.0, DomU tx skb network and transport + * headers are not set + */ + skb_reset_mac_header(skb); + skb_set_network_header(skb, ETH_HLEN); + skb_set_transport_header(skb, + ETH_HLEN + + (skb->protocol == htons(ETH_P_IPV6) ? + sizeof(struct ipv6hdr) : ip_hdrlen(skb))); + + return vnic_hash(dev, skb) % dev->real_num_tx_queues; +} + +#endif + +#ifndef _BP_NO_NDO_OPS +static struct net_device_ops vnic_netdev_ops = { + .ndo_open = vnic_open, + .ndo_stop = vnic_stop, + .ndo_start_xmit = vnic_tx, + .ndo_get_stats = vnic_get_stats, + .ndo_set_rx_mode = vnic_set_multicast_list, + .ndo_change_mtu = vnic_change_mtu, + .ndo_tx_timeout = vnic_tx_timeout, + .ndo_set_mac_address = vnic_set_mac, + .ndo_vlan_rx_add_vid = mlx4_vnic_vlan_rx_add_vid, + .ndo_vlan_rx_kill_vid = mlx4_vnic_vlan_rx_kill_vid, +#ifndef _BP_NETDEV_NO_TMQ + .ndo_select_queue = vnic_select_queue, +#endif +}; +#endif + +static void vnic_setup(struct net_device *dev) +{ + ether_setup(dev); + + dev->hard_header_len += VNIC_SKB_GET_ENCAP_OFFSET; + dev->watchdog_timeo = VNIC_WATCHDOG_TIMEOUT; + +#ifndef _BP_NO_NDO_OPS + if (!vnic_change_mac) + vnic_netdev_ops.ndo_set_mac_address = NULL; + + dev->netdev_ops = &vnic_netdev_ops; +#else + dev->open = vnic_open; + dev->stop = vnic_stop; + dev->hard_start_xmit = vnic_tx; + dev->get_stats = mlx4_vnic_stats_func_container; + dev->set_multicast_list = vnic_set_multicast_list; + dev->change_mtu = vnic_change_mtu; + dev->tx_timeout = vnic_tx_timeout; + dev->set_mac_address = vnic_set_mac; + dev->vlan_rx_add_vid = mlx4_vnic_vlan_rx_add_vid; + dev->vlan_rx_kill_vid = mlx4_vnic_vlan_rx_kill_vid; + + if (!vnic_change_mac) + dev->set_mac_address = NULL; + +#ifndef _BP_NETDEV_NO_TMQ + dev->select_queue = vnic_select_queue; +#endif +#endif // _BP_NO_NDO_OPS +} + +static int vnic_get_frag_header(struct skb_frag_struct *frags, void **mac_hdr, + void **ip_hdr, void **tcpudp_hdr, + u64 *hdr_flags, void *priv) +{ + struct iphdr *iph; + *mac_hdr = page_address(frags->page.p) + frags->page_offset; + *ip_hdr = iph = (struct iphdr *)(*mac_hdr + ETH_HLEN); + *tcpudp_hdr = (struct tcphdr *)(iph + (iph->ihl << 2)); + *hdr_flags = LRO_IPV4 | LRO_TCP; + + return 0; +} + +static int vnic_get_skb_header(struct sk_buff *skb, void **iphdr, + void **tcphdr, u64 *hdr_flags, void *priv) +{ + struct iphdr *iph; + struct tcphdr *tcph; + + if (unlikely(skb->protocol 
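+	/* LRO header inspection: aggregate only IPv4/TCP frames whose
+	 * checksum was already verified by HW; everything else is passed
+	 * up the stack unaggregated
+	 */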
!= htons(ETH_P_IP))) + return -1; + + if (unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY)) + return -1; + + iph = (struct iphdr *)(skb->data + ETH_HLEN); + if (iph->protocol != IPPROTO_TCP) + return -1; + + tcph = (struct tcphdr *)(iph + (iph->ihl << 2)); + + if (ntohs(iph->tot_len) < (iph->ihl * 4 + tcph->doff * 4)) + return -1; + + *hdr_flags = LRO_IPV4 | LRO_TCP; + *iphdr = iph; + *tcphdr = tcph; + + return 0; +} + +static int vnic_lro_enable(struct vnic_login *login, int rx_res_index) +{ + struct net_lro_mgr *lro = &login->rx_res[rx_res_index].lro; + + lro->dev = login->dev; + lro->features = login->napi_num ? LRO_F_NAPI : 0; + lro->frag_align_pad = NET_IP_ALIGN; + lro->ip_summed = CHECKSUM_UNNECESSARY; + lro->ip_summed_aggr = CHECKSUM_UNNECESSARY; + lro->max_desc = login->lro_num; + lro->max_aggr = VNIC_MAX_LRO_AGGR; + lro->lro_arr = login->rx_res[rx_res_index].lro_desc; + + if (lro->max_aggr > MAX_SKB_FRAGS) + lro->max_aggr = MAX_SKB_FRAGS; + + if (!vnic_rx_linear) + lro->get_frag_header = vnic_get_frag_header; + else + lro->get_skb_header = vnic_get_skb_header; + + return 0; +} + +static void vnic_lro_disable(struct vnic_login *login, int rx_res_index) +{ + /* nop */ + return; +} + +struct net_device *vnic_alloc_netdev(struct vnic_port *port) +{ + struct vnic_login_info *info; + struct vnic_login *login; + struct net_device *dev; + static int vnic_cnt = 0; + int i; + + dev = VNIC_TXQ_ALLOC_NETDEV(sizeof *info, "eth%d", vnic_setup, port->tx_rings_num); + if (!dev) { + vnic_err(port->name, "VNIC_TXQ_ALLOC_NETDEV failed " + "(size %Zu, tx_rings_num %d)\n", + sizeof *info, port->tx_rings_num); + goto err; + } + + /* this is a *very* large beast... */ + login = vmalloc(sizeof *login); + if (!login) { + vnic_err(port->name, "failed to allocate login struct (%Zu)\n", + sizeof *login); + goto free_netdev; + } + + /* init fields */ + memset(login, 0, sizeof *login); + info = netdev_priv(dev); + info->login = login; + login->dev = dev; + login->port = port; + login->max_mtu = VNIC_BUF_SIZE(login->port) - IB_GRH_BYTES - + VNIC_ENCAP_LEN - ETH_HLEN - VLAN_HLEN; + login->cnt = ++vnic_cnt; + /* name will be overwritten later */ + sprintf(login->name, "%s-%d", "vnic", login->cnt); + sprintf(login->desc, "%s-P%d", + login->port->dev->ca->node_desc, port->num); + + login->neigh_wq = create_singlethread_workqueue(login->name); + if (!login->neigh_wq) { + vnic_err(NULL, "create_singlethread_workqueue failed for %s\n", + login->name); + goto free_login; + } + + login->rx_csum = 1; + login->rx_rings_num = port->rx_rings_num; + login->tx_rings_num = port->tx_rings_num; +#ifdef _BP_NETDEV_NO_TMQ + /* if the kernel doesn't support Multiple TX queues, + * then use only one TX queue */ + login->tx_rings_num = 1; +#endif + vnic_dbg_mark(); + spin_lock_init(&login->lock); + spin_lock_init(&login->stats_lock); + rwlock_init(&login->mac_rwlock); + atomic_set(&login->vnic_child_cnt, 0); + vnic_mcast_root_init(&login->mcast_tree); + mutex_init(&login->moder_lock); + mutex_init(&login->state_lock); + SET_NETDEV_DEV(login->dev, login->port->dev->ca->dma_device); + INIT_DELAYED_WORK(&login->stats_task, vnic_do_get_stats); + INIT_DELAYED_WORK(&login->mcast_task, vnic_mcast_reattach); + INIT_DELAYED_WORK(&login->restart_task, vnic_restart_task); + + vnic_set_ethtool_ops(dev); + /* init ethtool */ + dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; + dev->hw_features |= NETIF_F_RXCSUM | NETIF_F_RXHASH; + dev->hw_features |= NETIF_F_TSO | NETIF_F_TSO6; + dev->features |= dev->hw_features; + + /* init 
NAPI (must be before LRO init) */ + login->napi_num = login->rx_rings_num; + for (i = 0; i < login->napi_num; ++i) { + if (vnic_napi_alloc(login, i)) { + vnic_err(login->name, "NAPI alloc %d failed\n", i); + goto free_napi; + } + } + +#if defined(NETIF_F_GRO) && !defined(_BP_NO_GRO) + login->dev->features |= NETIF_F_GRO; +#elif defined(NETIF_F_LRO) + login->lro_num = vnic_lro_num; + login->lro_mng_num = vnic_lro_num ? login->rx_rings_num : 0; + login->dev->features |= vnic_lro_num ? NETIF_F_LRO : 0; +#endif + for (i = 0; i < login->lro_mng_num; ++i) { + if (vnic_lro_enable(login, i)) { + vnic_err(login->name, "vnic_lro_enable %d failed\n", i); + goto free_lro; + } + } + + return dev; + +free_lro: + for (--i; i >= 0; --i) + vnic_lro_disable(login, i); + + i = login->napi_num; +free_napi: + for (--i; i >= 0; --i) + vnic_napi_dealloc(login, i); +free_login: + vfree(login); +free_netdev: + free_netdev(dev); +err: + return ERR_PTR(-ENODEV); +} + +void vnic_free_netdev(struct vnic_login *login) +{ + int i; + + vnic_dbg_func(login->name); + + for (i = 0; i < login->lro_mng_num; ++i) + vnic_lro_disable(login, i); + for (i = 0; i < login->napi_num; ++i) + vnic_napi_dealloc(login, i); + flush_workqueue(login->neigh_wq); + destroy_workqueue(login->neigh_wq); + free_netdev(login->dev); + vfree(login); +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_rx.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_rx.c new file mode 100644 index 0000000000000..0051dee4882ea --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_rx.c @@ -0,0 +1,677 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "vnic.h"
+#include "vnic_data.h"
+
+static inline void free_single_frag(struct vnic_rx_ring *ring, int e, int i)
+{
+	ib_dma_unmap_single(ring->port->dev->ca,
+			    ring->rx_info[e].dma_addr[i],
+			    ring->frag_info[i].frag_size,
+			    PCI_DMA_FROMDEVICE);
+	ring->rx_info[e].dma_addr[i] = 0;
+	put_page(ring->rx_info[e].frags[i].page.p);
+}
+
+#ifndef _BP_NETDEV_NO_TMQ
+/* this function is used only in no_bxm mode;
+ * it's not implemented in netdevice.h so we have it here,
+ * based on netif_tx_lock()
+ */
+static inline int vnic_netif_tx_trylock(struct net_device *dev)
+{
+	int i, cpu;
+
+	spin_lock(&dev->tx_global_lock);
+	cpu = smp_processor_id();
+	for (i = 0; i < dev->num_tx_queues; ++i) {
+		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+		if (__netif_tx_trylock(txq)) {
+			set_bit(__QUEUE_STATE_FROZEN, &txq->state);
+			__netif_tx_unlock(txq);
+		} else {
+			goto unlock;
+		}
+	}
+
+	return 1;
+
+unlock:
+	/* based on netif_tx_unlock() */
+	for (--i; i >= 0; --i) {
+		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+		clear_bit(__QUEUE_STATE_FROZEN, &txq->state);
+		if (!test_bit(QUEUE_STATE_ANY_XOFF, &txq->state))
+			__netif_schedule(txq->qdisc);
+	}
+	spin_unlock(&dev->tx_global_lock);
+
+	return 0;
+}
+#else
+#define vnic_netif_tx_trylock(dev) netif_tx_trylock(dev)
+#endif
+
+int vnic_rx(struct vnic_login *login, struct sk_buff *skb, struct ib_wc *wc)
+{
+	ASSERT(skb);
+	vnic_dbg_skb("RX", skb, (unsigned long)-1, (unsigned long)0);
+
+	if (no_bxm) {
+		/* In no_bxm mode, we update the neigh table based on ARP replies;
+		 * QPN & LID are retrieved from the IB completion.
+		 * ATTENTION: in RSS mode, make sure that ARPs are
+		 * sent on the base QPN
+		 */
+		struct vnic_neigh *neighe;
+		struct ethhdr *eth_hdr = (struct ethhdr *)skb->data;
+		struct arphdr *arp_hdr = (struct arphdr *)(skb->data + ETH_HLEN);
+		u16 eth_proto = ntohs(eth_hdr->h_proto);
+		u16 arp_proto = ntohs(arp_hdr->ar_op);
+
+		if (eth_proto != ETH_P_ARP)
+			goto out;
+		if (arp_proto == ARPOP_REQUEST)
+			vnic_dbg_data(login->name, "ARP REQUEST\n");
+		else
+			vnic_dbg_data(login->name, "ARP REPLY\n");
+
+		/* don't stop the TX queue, only try; this way we avoid blocking
+		 * IRQs in the TX flow (performance wise).
+		 * other vnic_neighe_* functions are not called in parallel
+		 * to this flow (in no_bxm mode)
+		 */
+		if (!vnic_netif_tx_trylock(login->dev))
+			goto out;
+
+		neighe = vnic_neighe_search(login, eth_hdr->h_source);
+		if (!IS_ERR(neighe)) {
+			/* if the IB address didn't change, do nothing */
+			if (neighe->qpn == wc->src_qp &&
+			    neighe->lid == wc->slid)
+				goto unlock;
+			/* else, del the old neigh entry, and add a new one */
+			vnic_neighe_del(login, neighe);
+			vnic_neighe_dealloc(neighe);
+		}
+
+		/* RSS: assume that your neighbours are like you */
+		neighe = vnic_neighe_alloc(login, eth_hdr->h_source,
+					   wc->slid, wc->src_qp,
+					   login->rx_rings_num > 1 ?
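+					   /* last argument presumably marks the
+					    * neighbour as RSS-capable when we
+					    * ourselves run multiple RX rings
+					    */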
+					   1 : 0);
+		if (IS_ERR(neighe))
+			goto unlock;
+		if (vnic_neighe_add(login, neighe))
+			vnic_neighe_dealloc(neighe);
+unlock:
+		netif_tx_unlock(login->dev);
+	}
+out:
+	/* shared_vnic may receive PACKET_OTHERHOST;
+	 * we 'fix' the pkt_type here so the kernel
+	 * won't drop it
+	 */
+	if (skb->pkt_type == PACKET_OTHERHOST && login->shared_vnic)
+		skb->pkt_type = PACKET_HOST;
+
+	netif_receive_skb(skb);
+
+	return 0;
+}
+
+struct sk_buff *vnic_alloc_rx_skb(struct vnic_rx_ring *ring, int buf_ind,
+				  gfp_t gfp_flag)
+{
+	struct ib_device *ca = ring->port->dev->ca;
+	struct sk_buff *skb;
+	u64 mapping;
+	int buf_size = VNIC_BUF_SIZE(ring->port);
+
+	skb = alloc_skb(buf_size, gfp_flag);
+	if (!skb) {
+		vnic_dbg_data(ring->port->name,
+			      "alloc_skb for size %d failed\n", buf_size);
+		goto err_alloc;
+	}
+
+	mapping = ib_dma_map_single(ca, skb->data, buf_size, DMA_FROM_DEVICE);
+	if (unlikely(ib_dma_mapping_error(ca, mapping))) {
+		vnic_dbg_data(ring->port->name,
+			      "ib_dma_map_single len %d failed\n", buf_size);
+		goto err_map;
+	}
+
+	ring->rx_info[buf_ind].skb = skb;
+	ring->rx_info[buf_ind].dma_addr[0] = mapping;
+
+	return skb;
+
+err_map:
+	dev_kfree_skb_any(skb);
+err_alloc:
+	return NULL;
+}
+
+static int frag_sizes[] = {
+	FRAG_SZ0,
+	FRAG_SZ1,
+	FRAG_SZ2,
+	FRAG_SZ3
+};
+
+/* Calculate the last offset position that accommodates a full fragment
+ * (assuming fragment size = stride - align)
+ */
+static int vnic_last_alloc_offset(struct vnic_rx_ring *ring, u16 stride, u16 align)
+{
+	u16 res = VNIC_ALLOC_SIZE % stride;
+	u16 offset = VNIC_ALLOC_SIZE - stride - res + align;
+
+	vnic_dbg_data(ring->port->name, "calculated last offset for stride:%d align:%d "
+		      "res:%d offset:%d\n", stride, align, res, offset);
+	return offset;
+}
+
+static int vnic_init_allocator(struct vnic_rx_ring *ring)
+{
+	struct vnic_rx_alloc *page_alloc;
+	int i;
+
+	if (vnic_rx_linear)
+		return 0;
+
+	for (i = 0; i < ring->num_frags; i++) {
+		page_alloc = &ring->page_alloc[i];
+		page_alloc->page = alloc_pages(GFP_ATOMIC | __GFP_COMP, VNIC_ALLOC_ORDER);
+		if (!page_alloc->page)
+			goto out;
+
+		page_alloc->offset = ring->frag_info[i].frag_align;
+		vnic_dbg_data(ring->port->name, "Initialized allocator:%d with page:%p\n",
+			      i, page_alloc->page);
+	}
+	return 0;
+
+out:
+	while (i--) {
+		page_alloc = &ring->page_alloc[i];
+		if (page_alloc->page) {
+			put_page(page_alloc->page);
+			page_alloc->page = NULL;
+		}
+	}
+	return -ENOMEM;
+}
+
+static void vnic_destroy_allocator(struct vnic_rx_ring *ring)
+{
+	struct vnic_rx_alloc *page_alloc;
+	int i;
+
+	if (vnic_rx_linear)
+		return;
+
+	for (i = 0; i < ring->num_frags; i++) {
+		page_alloc = &ring->page_alloc[i];
+		vnic_dbg_data(ring->port->name, "Freeing allocator:%d count:%d\n",
+			      i, page_count(page_alloc->page));
+		if (page_alloc->page) {
+			put_page(page_alloc->page);
+			page_alloc->page = NULL;
+		}
+	}
+}
+
+/*
+ * allocate a single fragment on a single ring entry and map it
+ * to HW address.
+ */ +static int vnic_alloc_frag(struct vnic_rx_ring *ring, + struct vnic_frag_data *frags_data, int i) +{ + struct vnic_frag_info *frag_info = &ring->frag_info[i]; + struct vnic_rx_alloc *page_alloc = &ring->page_alloc[i]; + struct skb_frag_struct *skb_frags = &frags_data->frags[i]; + struct skb_frag_struct skbf = *skb_frags; + struct page *page; + struct ib_device *ib_device = ring->port->dev->ca; + u64 dma; + int decision; + + if (vnic_rx_linear) + return 0; + + if (page_alloc->offset >= frag_info->last_offset) { + decision = 0; + /* Allocate new page */ + page = alloc_pages(GFP_ATOMIC | __GFP_COMP, VNIC_ALLOC_ORDER); + if (!page) { + /*frags_data->dma_addr[i] = NULL; + ring->rx_info[wr_id].info = VNIC_FRAG_ALLOC_FAIL; + ring->need_refill = 1; */ + return -ENOMEM; + } + skbf.page.p = page_alloc->page; + skbf.page_offset = page_alloc->offset; + } else { + decision = 1; + page = page_alloc->page; + get_page(page); + skbf.page.p = page; + skbf.page_offset = page_alloc->offset; + } + + skbf.size = frag_info->frag_size; + dma = ib_dma_map_single(ib_device, page_address(skbf.page.p) + + skbf.page_offset, frag_info->frag_size, + PCI_DMA_FROMDEVICE); + if (unlikely(ib_dma_mapping_error(ib_device, dma))) { + vnic_dbg_data(ring->port->name, + "ib_dma_map_single len %d failed\n", + frag_info->frag_size); + put_page(page); + return -ENOMEM; + } + + if (!decision) { + page_alloc->page = page; + page_alloc->offset = frag_info->frag_align; + } else + page_alloc->offset += frag_info->frag_stride; + + *skb_frags = skbf; + frags_data->dma_addr[i] = dma; + + return 0; +} + +void vnic_calc_rx_buf(struct vnic_rx_ring *ring) +{ + int eff_mtu = VNIC_BUF_SIZE(ring->port), buf_size = 0, i = 0; + + if (vnic_rx_linear) { + ring->num_frags = 1; + return; + } + + while (buf_size < eff_mtu) { + ring->frag_info[i].frag_size = + (eff_mtu > buf_size + frag_sizes[i]) ? 
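+			/* take a full frag_sizes[i] chunk while more MTU bytes
+			 * remain; the last fragment is trimmed to the leftover
+			 */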
+ frag_sizes[i] : eff_mtu - buf_size; + ring->frag_info[i].frag_prefix_size = buf_size; + if (!i) { + ring->frag_info[i].frag_align = NET_IP_ALIGN; + ring->frag_info[i].frag_stride = + ALIGN(frag_sizes[i] + NET_IP_ALIGN, SMP_CACHE_BYTES); + } else { + ring->frag_info[i].frag_align = 0; + ring->frag_info[i].frag_stride = + ALIGN(frag_sizes[i], SMP_CACHE_BYTES); + } + ring->frag_info[i].last_offset = + vnic_last_alloc_offset(ring, + ring->frag_info[i].frag_stride, + ring->frag_info[i].frag_align); + buf_size += ring->frag_info[i].frag_size; + i++; + } + + ring->num_frags = i; + ring->rx_skb_size = eff_mtu; + ring->log_rx_info = ROUNDUP_LOG2(i * sizeof(struct skb_frag_struct)); + + vnic_dbg(ring->port->name, "Rx buffer scatter-list (ring %d effective-mtu:%d " + "num_frags:%d):\n", ring->index ,eff_mtu, ring->num_frags); + for (i = 0; i < ring->num_frags; i++) { + vnic_dbg(ring->port->name, "frag:%d - size:%d prefix:%d align:%d " + "stride:%d last_offset:%d\n", i, + ring->frag_info[i].frag_size, + ring->frag_info[i].frag_prefix_size, + ring->frag_info[i].frag_align, + ring->frag_info[i].frag_stride, + ring->frag_info[i].last_offset); + } +} + +static void vnic_empty_rx_entry(struct vnic_rx_ring *ring, int i) +{ + int frag_num, buf_size = VNIC_BUF_SIZE(ring->port); + struct ib_device *ca = ring->port->dev->ca; + struct sk_buff *skb; + u64 mapping; + + if (vnic_rx_linear) { + for (frag_num = 0; frag_num < ring->num_frags; frag_num++) { + mapping = ring->rx_info[i].dma_addr[0]; + skb = ring->rx_info[i].skb; + if (mapping) + ib_dma_unmap_single(ca, mapping, buf_size, DMA_FROM_DEVICE); + if (skb) + dev_kfree_skb_any(skb); + } + + return; + } + + /* non linear buffers */ + for (frag_num = 0; frag_num < ring->num_frags; frag_num++) + free_single_frag(ring, i, frag_num); +} + +static int vnic_fill_rx_buffer(struct vnic_rx_ring *ring) +{ + struct vnic_frag_data *frags_data = &ring->rx_info[0]; + struct sk_buff *skb; + struct ib_device *ca = ring->port->dev->ca; + int buf_ind, frag_num, buf_size = VNIC_BUF_SIZE(ring->port); + u64 mapping; + + if (vnic_rx_linear) { + for (buf_ind = 0; buf_ind < ring->size; buf_ind++) { + skb = vnic_alloc_rx_skb(ring, buf_ind, GFP_KERNEL); + if (!skb) + goto err_linear; + } + + return 0; + } + + /* non linear buffers */ + for (buf_ind = 0; buf_ind < ring->size; buf_ind++, frags_data++) { + for (frag_num = 0; frag_num < ring->num_frags; frag_num++) { + if (vnic_alloc_frag(ring, frags_data, frag_num)) + goto err_frags; + } + } + + return 0; + +err_linear: + for (buf_ind = 0; buf_ind < ring->size; buf_ind++) { + mapping = ring->rx_info[buf_ind].dma_addr[0]; + skb = ring->rx_info[buf_ind].skb; + if (mapping) + ib_dma_unmap_single(ca, mapping, buf_size, DMA_FROM_DEVICE); + if (skb) + dev_kfree_skb_any(skb); + } + + return -ENOMEM; + +err_frags: + for (--frag_num; frag_num >= 0; frag_num--) + free_single_frag(ring, buf_ind, frag_num); + + for (--buf_ind; buf_ind >= 0; buf_ind--) + vnic_empty_rx_entry(ring, buf_ind); + + return -ENOMEM; +} + +/* + * free entire ring full of fragments. 
+ */
+static void vnic_empty_rx_buffer(struct vnic_rx_ring *ring)
+{
+	int buf_ind;
+
+	for (buf_ind = 0; buf_ind < ring->size; buf_ind++)
+		vnic_empty_rx_entry(ring, buf_ind);
+
+	ring->size = 0;
+}
+
+void vnic_destroy_rx_ring(struct vnic_rx_ring *ring)
+{
+	if (!ring)
+		return;
+	vnic_empty_rx_buffer(ring);
+	vnic_destroy_allocator(ring);
+	vfree(ring->rx_info);
+	vnic_ib_free_ring(ring);
+	kfree(ring);
+}
+
+int vnic_unmap_and_replace_rx(struct vnic_rx_ring *ring, struct ib_device *dev,
+			      struct skb_frag_struct *skb_frags_rx,
+			      u64 wr_id, int length)
+{
+	struct vnic_frag_info *frag_info;
+	struct vnic_frag_data *rx_info = &ring->rx_info[wr_id];
+	int nr;
+	dma_addr_t dma;
+
+	/* Collect used fragments while replacing them in the HW descriptors */
+	for (nr = 0; nr < ring->num_frags; nr++) {
+		frag_info = &ring->frag_info[nr];
+		if (length <= frag_info->frag_prefix_size)
+			break;
+
+		/* Save page reference in skb */
+		skb_frags_rx[nr].page = rx_info->frags[nr].page;
+		skb_frags_rx[nr].size = rx_info->frags[nr].size;
+		skb_frags_rx[nr].page_offset = rx_info->frags[nr].page_offset;
+		dma = rx_info->dma_addr[nr];
+
+		/* Allocate a replacement page */
+		if (vnic_alloc_frag(ring, rx_info, nr))
+			goto fail;
+
+		/* Unmap buffer */
+		ib_dma_unmap_single(dev, dma, skb_frags_rx[nr].size,
+				    PCI_DMA_FROMDEVICE);
+	}
+
+	/* Adjust size of last fragment to match actual length */
+	if (nr > 0)
+		skb_frags_rx[nr - 1].size = length -
+			ring->frag_info[nr - 1].frag_prefix_size;
+	return nr;
+
+fail:
+	/* Drop all accumulated fragments (which have already been replaced in
+	 * the descriptor) of this packet; remaining fragments are reused... */
+	while (nr > 0) {
+		nr--;
+		put_page(skb_frags_rx[nr].page.p);
+	}
+
+	return 0;
+}
+
+int vnic_rx_skb(struct vnic_login *login, struct vnic_rx_ring *ring,
+		struct ib_wc *wc, int ip_summed, char *eth_hdr_va)
+{
+	u64 wr_id = (unsigned int)wc->wr_id;
+	struct sk_buff *skb;
+	int used_frags;
+	char *va = eth_hdr_va;
+	int length = wc->byte_len - VNIC_EOIB_HDR_SIZE - VNIC_VLAN_OFFSET(login),
+	    linear_length = (length <= SMALL_PACKET_SIZE) ?
+		length : SMALL_PACKET_SIZE, hdr_len = min(length, HEADER_COPY_SIZE),
+	    offset = NET_IP_ALIGN + 16;
+	struct ib_device *ib_dev = login->port->dev->ca;
+
+	/* alloc a small linear SKB */
+	skb = alloc_skb(linear_length + offset, GFP_ATOMIC);
+	if (unlikely(!skb))
+		return -ENOMEM;
+
+	skb_record_rx_queue(skb, ring->index);
+	skb_reserve(skb, offset);
+
+	if (vnic_linear_small_pkt && length <= SMALL_PACKET_SIZE) {
+		u64 dma;
+
+		/* We are copying all relevant data to the skb - temporarily
+		 * synch buffers for the copy
+		 */
+		dma = ring->rx_info[wr_id].dma_addr[0] + VNIC_EOIB_HDR_SIZE +
+			VNIC_VLAN_OFFSET(login);
+		ib_dma_sync_single_for_cpu(ib_dev, dma, length,
+					   DMA_FROM_DEVICE);
+		skb_copy_to_linear_data(skb, va, length);
+		ib_dma_sync_single_for_device(ib_dev, dma, length,
+					      DMA_FROM_DEVICE);
+		skb->tail += length;
+	} else {
+		/* unmap the needed fragments and reallocate them. Fragments that
+		 * were not used will not be reused as is.
*/ + used_frags = vnic_unmap_and_replace_rx(ring, ib_dev, + skb_shinfo(skb)->frags, + wr_id, wc->byte_len); + if (!used_frags) + goto free_and_repost; + + skb_shinfo(skb)->nr_frags = used_frags; + + /* Copy headers into the skb linear buffer */ + memcpy(skb->data, va, hdr_len); + skb->tail += hdr_len; + /* Skip headers in first fragment */ + skb_shinfo(skb)->frags[0].page_offset += + (VNIC_EOIB_HDR_SIZE + VNIC_VLAN_OFFSET(login) + + hdr_len); + + /* Adjust size of first fragment */ + skb_shinfo(skb)->frags[0].size -= + (VNIC_EOIB_HDR_SIZE + VNIC_VLAN_OFFSET(login) + + hdr_len); + skb->data_len = length - hdr_len; + } + + /* update skb fields */ + skb->len = length; + skb->truesize = length + sizeof(struct sk_buff); + skb->ip_summed = ip_summed; + skb->dev = login->dev; + skb->protocol = eth_type_trans(skb, skb->dev); + + return vnic_rx(login, skb, wc); + +free_and_repost: + dev_kfree_skb(skb); + return -ENODEV; + +} + +static void vnic_set_rx_sge(struct vnic_rx_ring *ring) +{ + int i; + + ring->wr.num_sge = ring->num_frags; + ring->wr.next = NULL; + ring->wr.sg_list = ring->sge; + for (i = 0; i < ring->num_frags; ++i) { + ring->sge[i].lkey = ring->port->mr->lkey; + ring->sge[i].length = ring->frag_info[i].frag_size; + } +} + +struct vnic_rx_ring *vnic_create_rx_ring(struct vnic_port *port, int index) +{ + int rc, rx_info, size = vnic_rx_rings_len; + struct vnic_rx_ring *ring; + + ring = kzalloc(sizeof *ring, GFP_KERNEL); + if (!ring) + return ERR_PTR(-ENOMEM); + + /* init attributes */ + ring->port = port; + ring->size = size; + ring->index = index; + spin_lock_init(&ring->lock); + + /* init rx ring IB resources */ + if (vnic_ib_init_ring(ring)) { + vnic_err(port->name, "vnic_ib_init_ring failed\n"); + goto free_ring; + } + + rx_info = size * roundup_pow_of_two(sizeof(struct vnic_frag_data)); + ring->rx_info = vmalloc(rx_info); + if (!ring->rx_info) { + vnic_err(port->name, "Failed allocating rx_info ring" + " (%d bytes)\n", rx_info); + goto free_ib; + } + memset(ring->rx_info, 0, rx_info); + + /* determine the sizes of the fragments as result of mtu */ + vnic_calc_rx_buf(ring); + + rc = vnic_init_allocator(ring); + if (rc) { + vnic_err(port->name, "Failed initializing ring" + " allocator %d\n", rc); + goto free_rxinfo; + } + + rc = vnic_fill_rx_buffer(ring); + if (rc) { + vnic_err(port->name, "vnic_fill_rx_buffer failed %d\n", rc); + goto free_allocator; + } + + /* set rx WQEs drafts */ + vnic_set_rx_sge(ring); + + /* Initailize all descriptors and post to srq */ + rc = vnic_post_recvs(ring); + if (rc) { + vnic_err(port->name, "vnic_post_recvs failed %d\n", rc); + goto free_rx_buffer; + } + + return ring; + +free_rx_buffer: + /* TODO: we are freeing posted packets need to move SRQ + * to error and free them first + */ + vnic_empty_rx_buffer(ring); +free_allocator: + vnic_destroy_allocator(ring); +free_rxinfo: + vfree(ring->rx_info); +free_ib: + vnic_ib_free_ring(ring); +free_ring: + kfree(ring); + + return ERR_PTR(-EINVAL); +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_tx.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_tx.c new file mode 100644 index 0000000000000..0233d4fe7e1e4 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_tx.c @@ -0,0 +1,622 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_data.h" + +static int vnic_ucast_send(struct vnic_login *login, struct sk_buff *skb, + struct neighbour *neighbour, int tx_res_index); +/* Push VLAN & EoIB headers and calculate RSS hash value + * We do the RSS hash here because we already check IP|TCP|UDP + * in this function for EoIB fields, so we make use of that + * and do RSS too. + */ +static struct eoibhdr eoib_h_draft = { + .encap_data = ((VNIC_EOIB_HDR_VER << 4) | (VNIC_EOIB_HDR_SIG << 6)), + .seg_off = 0, + .seg_id = 0 +}; + +void vnic_learn_mac(struct net_device *dev, u8 *mac, int remove) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + + vnic_dbg_func(login->name); + + /* skip invalid address */ + if (unlikely(!is_valid_ether_addr(mac))) + return; + + /* skip parent vNic address (original dev_addr) */ + if (!(memcmp(login->dev_addr, mac, ETH_ALEN))) + return; + + vnic_dbg_mac(login->name, "learn mac "MAC_6_PRINT_FMT"\n", + MAC_6_PRINT_ARG(mac)); + + /* update child vNic list, ignore returned code */ + read_lock_bh(&login->mac_rwlock); + vnic_child_update(login, mac, remove); + read_unlock_bh(&login->mac_rwlock); +} + +u32 vnic_hash(struct net_device *dev, struct sk_buff *skb) +{ + struct tcphdr *tr_h = tcp_hdr(skb); + struct iphdr *ip_h = ip_hdr(skb); + struct ipv6hdr *ip_h6 = (struct ipv6hdr *)ip_h; + u32 hash = 0, addrlen, i; + + /* All mcast traffic is sent and received on 1st queue + * because only the 1st QP is attached to the MGIDs + * TODO: consider distributing tx/rx mcast traffic as well + */ + if (is_multicast_ether_addr(skb_mac_header(skb))) + goto out; + + switch (ntohs(skb->protocol)) { + case ETH_P_IP: + /* In IPv4, access TCP/UDP header only when IP packet is not + * fragmented: flags == DF == 0x02. 
+ */ + if (ntohs(ip_h->frag_off) >> 13 == 0x2 && + (ip_h->protocol == IPPROTO_TCP || + ip_h->protocol == IPPROTO_UDP)) { + hash ^= (u32)ntohl(ip_h->saddr); + hash ^= (u32)ntohl(ip_h->daddr); + hash ^= (u32)ntohs(tr_h->source); + hash ^= (u32)ntohs(tr_h->dest); + } + break; + case ETH_P_IPV6: + /* In IPv6, access TCP/UDP header only when IP packet is not + * fragmented: main header nexthdr field points to TCP/UDP + */ + if (ip_h6->nexthdr == IPPROTO_TCP || + ip_h6->nexthdr == IPPROTO_UDP) { + addrlen = ARRAY_LEN(ip_h6->saddr.in6_u.u6_addr32); + for (i = 0; i < addrlen; ++i) { + hash ^= (u32)ntohl(ip_h6->saddr.in6_u.u6_addr32[i]); + hash ^= (u32)ntohl(ip_h6->daddr.in6_u.u6_addr32[i]); + } + tr_h = (struct tcphdr *)((void *)ip_h6 + sizeof *ip_h6); + hash ^= (u32)ntohs(tr_h->source); + hash ^= (u32)ntohs(tr_h->dest); + } + } +out: + VNIC_SKB_SET_HASH(skb, hash); + return hash; +} + +u8 vnic_lag_hash(struct sk_buff *skb, u16 hash_mask, u16 vid) +{ + struct tcphdr *tr_h = tcp_hdr(skb); + struct iphdr *ip_h = ip_hdr(skb); + struct ipv6hdr *ip_h6 = (struct ipv6hdr *)ip_h; + u32 hash = 0, addrlen, i; + struct ethhdr *eth = (struct ethhdr *)skb_mac_header(skb); + u32 hash_dmac, hash_smac, hash_prot, hash_vid; + u32 hash_sip = 0, hash_dip = 0, hash_sp = 0, hash_dp = 0; + u8 res_hash; + u8 *tmp; + + hash_dmac = *(u32 *)(ð->h_dest[ETH_ALEN - sizeof hash_smac]); + hash_smac = *(u32 *)(ð->h_source[ETH_ALEN - sizeof hash_smac]); + hash_prot = (u32)ntohs(skb->protocol); + hash_vid = (u32)vid; + + if (hash_mask & GW_LAG_LAYER_2_3) { + switch (hash_prot) { + case ETH_P_IP: + /* In IPv4, access TCP/UDP header only when IP packet is not + * fragmented: flags == DF == 0x02. + */ + if (ntohs(ip_h->frag_off) >> 13 == 0x2 && + (ip_h->protocol == IPPROTO_TCP || + ip_h->protocol == IPPROTO_UDP)) { + hash_sip = (u32)(ip_h->saddr); + hash_dip = (u32)(ip_h->daddr); + hash_sp = (u32)(tr_h->source); + hash_dp = (u32)(tr_h->dest); + } + break; + case ETH_P_IPV6: + /* In IPv6, access TCP/UDP header only when IP packet is not + * fragmented: main header nexthdr field points to TCP/UDP + */ + if (ip_h6->nexthdr == IPPROTO_TCP || + ip_h6->nexthdr == IPPROTO_UDP) { + addrlen = ARRAY_LEN(ip_h6->saddr.in6_u.u6_addr32); + for (i = 0; i < addrlen; ++i) { + hash_sip ^= (u32)(ip_h6->saddr.in6_u.u6_addr32[i]); + hash_dip ^= (u32)(ip_h6->daddr.in6_u.u6_addr32[i]); + } + tr_h = (struct tcphdr *)((void *)ip_h6 + sizeof *ip_h6); + hash_sp = (u32)(tr_h->source); + hash_dp = (u32)(tr_h->dest); + } + } + } + + hash ^= (hash_mask & GW_LAG_HASH_DMAC) ? hash_dmac : 0; + hash ^= (hash_mask & GW_LAG_HASH_SMAC) ? hash_smac : 0; + hash ^= (hash_mask & GW_LAG_HASH_TPID) ? hash_prot : 0; + hash ^= (hash_mask & GW_LAG_HASH_VID) ? hash_vid : 0; + hash ^= (hash_mask & GW_LAG_HASH_SIP) ? hash_sip : 0; + hash ^= (hash_mask & GW_LAG_HASH_DIP) ? hash_dip : 0; + hash ^= (hash_mask & GW_LAG_HASH_SPORT) ? hash_sp : 0; + hash ^= (hash_mask & GW_LAG_HASH_DPORT) ? 
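+	/* only the fields selected by hash_mask contribute; the 32-bit
+	 * hash is folded below to a single byte by XORing its four bytes,
+	 * e.g. 0xA1B2C3D4 -> 0xA1 ^ 0xB2 ^ 0xC3 ^ 0xD4
+	 */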
hash_dp : 0; + + tmp = (u8 *)&hash; + res_hash = tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3]; + + return res_hash; +} + +static inline int vnic_header_encap(struct sk_buff *skb) +{ + struct vnic_login *login = vnic_netdev_priv(skb->dev); + struct eoibhdr *eoib_h; + struct iphdr *ip_h = ip_hdr(skb); + struct ipv6hdr *ip_h6 = (struct ipv6hdr *)ip_h; + + /* push VLAN header + * TODO: when VID iz zero, push header only when prio exists, i.e.: + * if (VNIC_VLAN_ENABLED(login) && (login->vid || login->user_prio)) + */ + if (VNIC_VLAN_ENABLED(login) && login->vid) { + struct vlan_ethhdr *veth = + (struct vlan_ethhdr *)skb_push(skb, VLAN_HLEN); + ASSERT(veth); + vnic_dbg_data_v(login->name, "push vlan tag with ID %u\n", + be16_to_cpu(login->vid)); + memmove(skb->data, skb->data + VLAN_HLEN, 2 * ETH_ALEN); + veth->h_vlan_proto = htons(ETH_P_8021Q); + veth->h_vlan_TCI = login->vid; + } + + /* push EoIB header */ + if (vnic_encap_headroom) + skb_push(skb, VNIC_ENCAP_LEN); + + /* reset MAC header here, it can be changed for the following reasons: + * - vnic_encap_headroom is set, thus EoIB header is pushed + * - VLAN is enabled, thus VLAN header is pushed + * - some kernels (e.g., 2.6.18-194.el5) call dev_hard_start_xmit() + * without setting the mac header pointer + */ + skb_set_mac_header(skb, VNIC_SKB_GET_ENCAP_OFFSET); + + /* enforce source mac*/ + if (vnic_src_mac_enforce) + memcpy(skb_mac_header(skb) + ETH_ALEN, + login->dev->dev_addr, ETH_ALEN); + + /* set EoIB header VER/SIG, others set to zero */ + eoib_h = VNIC_SKB_GET_ENCAP(skb); + *eoib_h = eoib_h_draft; + + /* set EoIB header IP_CHK */ + switch (ntohs(skb->protocol)) { + case ETH_P_IP: + VNIC_EOIB_HDR_SET_IP_CHK_OK(eoib_h); + if (ip_h->protocol == IPPROTO_TCP) + VNIC_EOIB_HDR_SET_TCP_CHK_OK(eoib_h); + else if (ip_h->protocol == IPPROTO_UDP) + VNIC_EOIB_HDR_SET_UDP_CHK_OK(eoib_h); + break; + case ETH_P_IPV6: + VNIC_EOIB_HDR_SET_IP_CHK_OK(eoib_h); + if (ip_h6->nexthdr == IPPROTO_TCP) + VNIC_EOIB_HDR_SET_TCP_CHK_OK(eoib_h); + else if (ip_h6->nexthdr == IPPROTO_UDP) + VNIC_EOIB_HDR_SET_UDP_CHK_OK(eoib_h); + } + +#ifdef _BP_NETDEV_NO_TMQ + /* if TSS is enabled, use the hash value calculated by + * vnic_select_queue() otherwise call vnic_hash() + */ + vnic_hash(skb->dev, skb); +#endif + + return 0; +} + +static void vnic_neigh_path_query_complete(int status, + struct ib_sa_path_rec *pathrec, + void *context) +{ + struct vnic_neigh *neigh = context; + struct ib_ah *old_ah, *new_ah; + struct net_device *dev = neigh->login->dev; + struct sk_buff_head skqueue; + struct vnic_login *login = neigh->login; + + if (status) { + vnic_dbg_data(neigh->login->name, "neigh %d "MAC_6_PRINT_FMT" path query complete FAILED\n", + neigh->lid, MAC_6_PRINT_ARG(neigh->mac)); + goto drop_pkts; + } else { + struct ib_ah_attr av; + struct sk_buff *skb; + vnic_dbg_data(login->name, "neigh %d "MAC_6_PRINT_FMT" path query complete sucess SL=%d\n", + neigh->lid, MAC_6_PRINT_ARG(neigh->mac), pathrec->sl); + if(ib_init_ah_from_path(login->port->dev->ca, login->port->num, pathrec, &av)){ + vnic_warn(login->name, "ib_init_ah_from_path %d "MAC_6_PRINT_FMT" failed!\n", + neigh->lid, MAC_6_PRINT_ARG(neigh->mac)); + goto drop_pkts; + } + + old_ah = neigh->ah; + new_ah = ib_create_ah(login->port->pd, &av); + if (IS_ERR(new_ah) || !new_ah) { + vnic_warn(login->name, "ib_create_ah %d "MAC_6_PRINT_FMT" failed!\n", + neigh->lid, MAC_6_PRINT_ARG(neigh->mac)); + + goto drop_pkts; + } + + neigh->sl = pathrec->sl; + skb_queue_head_init(&skqueue); + netif_tx_lock_bh(login->dev); + neigh->ah = new_ah; 
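+		/* publish the new AH, the valid flag and the cleared query id
+		 * together under the TX lock, so concurrent senders see a
+		 * consistent state: they either queue on the old one or
+		 * transmit with the fresh AH
+		 */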
+ neigh->valid = 1; + neigh->query_id = -1; + while ((skb = __skb_dequeue(&neigh->pkt_queue))) + __skb_queue_tail(&skqueue, skb); + netif_tx_unlock_bh(login->dev); + + /* retransmit all pending packets */ + while ((skb = __skb_dequeue(&skqueue))) { + /* reset skb headers */ + /* TODO ALL VLAN ?? */ + if (VNIC_VLAN_ENABLED(login) && login->vid) + skb_pull(skb, VLAN_HLEN); + if (vnic_encap_headroom) + skb_pull(skb, VNIC_ENCAP_LEN); + + skb->dev = dev; + dev_queue_xmit(skb); + } + + if (old_ah && !IS_ERR(old_ah)) + ib_destroy_ah(old_ah); + } + complete(&neigh->query_comp); + return; + +drop_pkts: + netif_tx_lock_bh(dev); + neigh->query_id = -1; /* this will cause a retry */ + while (!skb_queue_empty(&neigh->pkt_queue)) + { + struct sk_buff *skb = skb_dequeue(&neigh->pkt_queue); + int tx_res_index; + struct vnic_tx_res *tx_res; + skb->dev = dev; + tx_res_index = VNIC_TXQ_GET_HASH(skb, login->real_tx_rings_num); + ASSERT(tx_res_index <= login->tx_rings_num); + tx_res = &login->tx_res[tx_res_index]; + VNIC_STATS_DO_INC(tx_res->stats.tx_dropped); + dev_kfree_skb_any(skb); + } + netif_tx_unlock_bh(dev); + complete(&neigh->query_comp); +} + +int vnic_neighe_path_query(struct vnic_neigh *neighe) +{ + ib_sa_comp_mask comp_mask; + struct ib_sa_path_rec p_rec; + u16 slid = neighe->login->port->attr.lid; + vnic_dbg_data(neighe->login->vnic_name,"neighe SL Query slid %d dlid %d dmac:"MAC_6_PRINT_FMT"\n", + slid, neighe->lid, MAC_6_PRINT_ARG(neighe->mac)); + + comp_mask = IB_SA_PATH_REC_SERVICE_ID | + IB_SA_PATH_REC_DLID | + IB_SA_PATH_REC_SLID | + IB_SA_PATH_REC_PKEY; + + if (IS_NEIGH_QUERY_RUNNING(neighe)) + ib_sa_cancel_query(neighe->query_id, neighe->pquery); + + init_completion(&neighe->query_comp); + neighe->query_id = -1; + neighe->pquery = NULL; + + p_rec.dlid = cpu_to_be16(neighe->lid); + p_rec.slid = cpu_to_be16(slid); + p_rec.service_id = cpu_to_be64(EOIB_SERVICE_ID); + p_rec.pkey = cpu_to_be16(neighe->login->pkey); + + neighe->query_id = ib_sa_path_rec_get(&vnic_sa_client, + neighe->login->port->dev->ca, + neighe->login->port->num, + &p_rec, + comp_mask, + 1000/*TOUT*/, + GFP_ATOMIC, + vnic_neigh_path_query_complete, + neighe, + &neighe->pquery); + if (neighe->query_id < 0) { + vnic_dbg_data(neighe->login->vnic_name, "FAILED neigh SL Query slid %d dlid %d dmac:"MAC_6_PRINT_FMT"\n", + slid, neighe->lid, MAC_6_PRINT_ARG(neighe->mac)); + complete(&neighe->query_comp); + } + return neighe->query_id; +} + +static int vnic_ucast_send(struct vnic_login *login, struct sk_buff *skb, + struct neighbour *neighbour, int tx_res_index) +{ + struct vnic_neigh *neighe; + int hash; + + neighe = vnic_neighe_search(login, skb_mac_header(skb)); + if (IS_ERR(neighe)) { + vnic_dbg_data(login->name, "no dst_neigh and no vnic_neigh - " + "gw unicast packet\n"); + + /* for egress unicast traffic of a shared vnic, + * replace src mac by shared mac + */ + if (login->shared_vnic) + memcpy(skb_mac_header(skb) + ETH_ALEN, + login->shared_mac, ETH_ALEN); + + if (!login->is_lag) + neighe = login->gw_neigh; + else { + if (unlikely(!login->lag_member_active_count)) + return -ENOENT; + + /* use hash value precomputed and mapping to find LAG GW to send to */ + hash = vnic_lag_hash(skb, login->lag_prop.hash_mask, login->vid); + hash = hash % LAG_MAP_TABLE_SIZE; + neighe = &login->lag_gw_neigh[login->lag_gw_map[hash]].neigh; + } + + /* update GW statistics */ + VNIC_STATS_ADD(login->port_stats.gw_tx_bytes, skb->len); + VNIC_STATS_INC(login->port_stats.gw_tx_packets); + } else { + vnic_dbg_data(login->name, + "no dst_neigh but 
vnic_neigh exists - " + "local unicast packet\n"); + } + + /* TODO: in VNIC_NEIGH_GET_DQPN use neigh qps_num instead of login */ + vnic_dbg_data(login->name, "vnic_send to (base qpn 0x%06x) dqpn 0x%06x" + " dlid 0x%08x %s\n", neighe->qpn, + VNIC_NEIGH_GET_DQPN(skb, neighe), neighe->lid, + neighe == login->gw_neigh ? "[GW]" : ""); + + if (unlikely(vnic_sa_query && !neighe->valid)) { + /* query neigh ah*/ + vnic_dbg_data(login->name, "AH is not %s, running path query: LID=%d mac="MAC_6_PRINT_FMT"\n", + !IS_ERR(neighe->ah) && neighe->ah ? "valid":"found", + neighe->lid, MAC_6_PRINT_ARG(neighe->mac)); + + if (!IS_NEIGH_QUERY_RUNNING(neighe)) + vnic_neighe_path_query(neighe); + + if (IS_ERR(neighe->ah) || !neighe->ah) + { /* AH is not ready yet, Queue pkt */ + if (skb_queue_len(&neighe->pkt_queue) > VNIC_SKB_QUEUE_LEN || !IS_NEIGH_QUERY_RUNNING(neighe)) + return 1; /* Drop in case queue is full or no query is currently runnig*/ + __skb_queue_tail(&neighe->pkt_queue, skb); + return 0; + } + /* if ah is initialized send anyway */ + } + vnic_send(login, skb, neighe->ah, VNIC_NEIGH_GET_DQPN(skb, neighe), tx_res_index); + return 0; +} + +void vnic_mcast_send(struct vnic_login *login, struct sk_buff *skb, int tx_res_index) +{ + struct vnic_mcast *mcaste; + union vhub_mgid mgid; + struct ethhdr *eth; + struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index]; + struct ib_ah_attr *av = &tx_res->mcast_av; + struct ib_ah *ah; + u16 gw_id; + int hash; + + eth = (struct ethhdr *)skb_mac_header(skb); + + /* for LAG GW, perform hashing on mcast address */ + if (login->is_lag && login->lag_member_active_count) { + hash = vnic_lag_hash(skb, login->lag_prop.hash_mask, login->vid); + hash = hash % LAG_MAP_TABLE_SIZE; + gw_id = login->lag_gw_neigh[login->lag_gw_map[hash]].gw_id; + } + else + gw_id = login->gw_port_id; + + /* retrieve the mlid */ + vhub_mgid_create(login->mgid_prefix, ETH_ZERO_MAC, login->n_mac_mcgid, + CREATE_VHUB_ID(login->vid, gw_id), + VHUB_MGID_DATA, 0, &mgid); + + spin_lock(&login->mcast_tree.mcast_rb_lock); + mcaste = vnic_mcast_search(&login->mcast_tree, &mgid.ib_gid); + if (unlikely(IS_ERR(mcaste) || !mcaste->ah)) { + vnic_dbg_data(login->name, "couldn't find mcaste for " + MAC_6_PRINT_FMT"\n", + MAC_6_PRINT_ARG(eth->h_dest)); + spin_unlock(&login->mcast_tree.mcast_rb_lock); + goto drop; + } + + spin_lock(&mcaste->lock); + vhub_mgid_create(login->mgid_prefix, eth->h_dest, login->n_mac_mcgid, + CREATE_VHUB_ID(login->vid, gw_id), + vnic_mgid_data_type, 0, &mgid); + vnic_dbg_mcast_v(login->name, "sending to ETH "MAC_6_PRINT_FMT"-> " + "GID "VNIC_GID_FMT" (mask %d bit)\n", + MAC_6_PRINT_ARG(eth->h_dest), + VNIC_GID_ARG(mgid.ib_gid), + login->n_mac_mcgid); + + av->dlid = be16_to_cpu(mcaste->port_mcaste->rec.mlid); + av->static_rate = mcaste->port_mcaste->rec.rate; + av->sl = mcaste->port_mcaste->rec.sl; + memcpy(&av->grh.dgid, mgid.ib_gid.raw, GID_LEN); + + ah = ib_create_ah(login->port->pd, av); + spin_unlock(&mcaste->lock); + spin_unlock(&login->mcast_tree.mcast_rb_lock); + + if (!ah || IS_ERR(ah)) + goto drop; + + vnic_send(login, skb, ah, IB_MULTICAST_QPN, tx_res_index); + ib_destroy_ah(ah); + /* used as a counter for multicast TX packets (not RX) */ + VNIC_STATS_DO_INC(tx_res->stats.multicast); + + return; + +drop: + VNIC_STATS_DO_INC(tx_res->stats.tx_dropped); + dev_kfree_skb_any(skb); +} + +int vnic_tx(struct sk_buff *skb, struct net_device *dev) +{ + struct vnic_login *login = vnic_netdev_priv(dev); + int tx_res_index = 0, headroom = dev->hard_header_len - ETH_HLEN; + struct 
vnic_tx_res *tx_res = &login->tx_res[tx_res_index]; + + ASSERT(dev); + ASSERT(skb); +#ifdef VNIC_PROFILLNG + login->prof_arr[login->prof_arr_it].cnt++; + /* copy only fields for reporting, data buffer is invalid */ + login->prof_arr[login->prof_arr_it].skb = *skb; + login->prof_arr[login->prof_arr_it].skb.data = NULL; + login->prof_arr[login->prof_arr_it].tstamp = current_kernel_time(); + login->prof_arr[login->prof_arr_it].jiffies = jiffies; + login->prof_arr[login->prof_arr_it].nr_frags = skb_shinfo(skb)->nr_frags; + login->prof_arr_it = (login->prof_arr_it + 1) % VNIC_PROFILLNG_SKB_MAX; + +#endif + + /* drop zero length skbs */ + if (unlikely(!skb->len)) + goto drop; + + /* sometimes, vnic_tx is called before carrier is up FM #100882 */ + if (unlikely(!test_bit(VNIC_STATE_NETDEV_CARRIER_ON, &login->netdev_state))) + goto drop; + + /* check headroom and reallocate skb if needed: + * If VLAN used: need VLAN_HLEN (4) Bytes + * If vnic_encap_headroom set: need VNIC_ENCAP_LEN (4) Bytes + * when vnic_encap_headroom is clear, we do not encap EoIB header + * into the headroom, but rather use additional SG entry to hold it + */ + + if (unlikely(skb_headroom(skb) < headroom)) { + struct sk_buff *skb_new; + + skb_new = skb_realloc_headroom(skb, headroom); + if (!skb_new) + goto drop; + + dev_kfree_skb(skb); + skb = skb_new; + VNIC_STATS_INC(login->port_stats.realloc_packets); + } + /* don't use dev->header_ops, use vnic_header_encap() inline + * function instead, because when raw socket is used or BR_CTL mode + * then header_ops are not called as expected, and we'll end up sending + * the packet without EoIB header + */ + if (unlikely(vnic_header_encap(skb))) + goto drop; + + /* in promiscuous mode, learn the source mac */ + if (is_ucast_promisc(login) && vnic_learn_mac_enabled) + vnic_learn_mac(dev, skb_mac_header(skb) + ETH_ALEN, 0); + + /* get TX resource for this SKB, keep it after vnic_header_encap() + * so if we don't have kernel multiple queue support we use the + * RSS hash result for TSS + */ + tx_res_index = VNIC_TXQ_GET_HASH(skb, login->real_tx_rings_num); + ASSERT(tx_res_index <= login->tx_rings_num); + tx_res = &login->tx_res[tx_res_index]; + + + /* send ucast/mcast packet */ + vnic_dbg_skb("TX", skb, (unsigned long)(vnic_encap_headroom ? 0 : -1), + (unsigned long)(vnic_encap_headroom ? 
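+		     /* dump offsets differ: with vnic_encap_headroom the EoIB
+		      * header was already pushed into the skb headroom, while
+		      * otherwise it is added later as a separate SG entry
+		      */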
VNIC_ENCAP_LEN : 0)); +#if 0 /* neighbour caching disabled */ + if (likely(skb->dst && skb->dst->neighbour)) { + if (is_multicast_ether_addr(skb_mac_header(skb))) { + vnic_dbg_data(login->name, + "dst_neigh exists but no vnic_neigh - " + "multicast packet\n"); + vnic_mcast_send(login, skb, tx_res_index); + } else { + vnic_dbg_data(login->name, + "dst_neigh exists but no vnic_neigh - " + "unicast packet\n"); + vnic_ucast_send(login, skb, skb->dst->neighbour, tx_res_index); + } + } else +#endif + { + if (is_multicast_ether_addr(skb_mac_header(skb))) { + vnic_dbg_data(login->name, + "no dst_neigh - multicast packet\n"); + vnic_mcast_send(login, skb, tx_res_index); + } else { + vnic_dbg_data(login->name, + "no dst_neigh - unicast packet\n"); + if (unlikely(vnic_ucast_send(login, skb, NULL, tx_res_index))) + goto drop; + } + } + + return NETDEV_TX_OK; + +drop: + dev_kfree_skb(skb); + return NETDEV_TX_OK; +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip.h b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip.h new file mode 100644 index 0000000000000..0f77c1abde17f --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip.h @@ -0,0 +1,1025 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _VNIC_FIP_H +#define _VNIC_FIP_H + +#include "vnic.h" + + +#define FIP_TYPE(FIPT) FIP_TYPE_##FIPT +#define FIP_TYPE_IDX(FIPT) FIP_TYPE_IDX_##FIPT + +#define FIP_CASE(FIPT) case FIP_TYPE(FIPT): return FIP_TYPE_IDX(FIPT) + +#define FIP_CASE_STR(FIPT) case FIP_TYPE(FIPT): return # FIPT +#define FIP_SUBCODE_CASE_STR(SUBCODE) case (SUBCODE): return # SUBCODE + +#define FIP_MASK(FIPT) (((u64)1) << FIP_TYPE_IDX(FIPT)) + +#define ADV_EXT_TYPE(FIPT) ADV_EXT_TYPE_##FIPT +#define ADV_EXT_IDX(FIPT) ADV_EXT_IDX_##FIPT + +#define GUID_FORMAT "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x" +#define MGID_PREFIX_FMT "%02x:%02x:%02x:%02x:%02x" +#define GUID_ARG(g) (g)[0], (g)[1], (g)[2], (g)[3], (g)[4], (g)[5], (g)[6], (g)[7] +#define MGID_PRE_ARG(g) (g)[0], (g)[1], (g)[2], (g)[3], (g)[4] + +enum { + FIP_TYPE(VENDOR_ID) = 13, + FIP_TYPE(ADDRESS) = 240, + FIP_TYPE(GW_INFORMATION)= 241, + FIP_TYPE(LOGIN) = 242, + FIP_TYPE(VHUB_UPDATE) = 243, + FIP_TYPE(VHUB_TABLE) = 244, + FIP_TYPE(VNIC_IDENTITY) = 245, + FIP_TYPE(PARTITION) = 246, + FIP_TYPE(GW_IDENTIFIER) = 248, + FIP_TYPE(KA_PARAMS) = 249, + FIP_TYPE(EXT_DESC) = 254, +}; + +enum { + FIP_TYPE_IDX(VENDOR_ID), + FIP_TYPE_IDX(ADDRESS), + FIP_TYPE_IDX(GW_INFORMATION), + FIP_TYPE_IDX(LOGIN), + FIP_TYPE_IDX(VHUB_UPDATE), + FIP_TYPE_IDX(VHUB_TABLE), + FIP_TYPE_IDX(VNIC_IDENTITY), + FIP_TYPE_IDX(PARTITION), + FIP_TYPE_IDX(GW_IDENTIFIER), + FIP_TYPE_IDX(KA_PARAMS), + FIP_TYPE_IDX(EXT_DESC), +}; + +enum { + ADV_EXT_TYPE(CAP) = 40, + ADV_EXT_TYPE(BOOT) = 18, + ADV_EXT_TYPE(LAG) = 41, + ADV_EXT_TYPE(MEMBER) = 42, + ADV_EXT_TYPE(PC_ID) = 43, /* Power Cycle ID */ + ADV_EXT_TYPE(CTRL_IPORT) = 240, +}; + +enum { + ADV_EXT_IDX(CAP), + ADV_EXT_IDX(BOOT), + ADV_EXT_IDX(LAG), + ADV_EXT_IDX(PC_ID), + ADV_EXT_IDX(CTRL_IPORT), +}; + + +enum { + EPORT_STATE_DOWN = 0, + EPORT_STATE_UP = 1, +}; + +enum fip_packet_type { + FIP_DISCOVER_UCAST = 0, + FIP_DISCOVER_MCAST = 1 +}; + +enum { + FIP_TABLE_HDR_MIDDLE = 0, + FIP_TABLE_HDR_FIRST = 1, + FIP_TABLE_HDR_LAST = 2, + FIP_TABLE_HDR_ONLY = 3 +}; + +enum { + FIP_EXT_LAG_W_POLICY_HOST = 1, + FIP_EXT_LAG_W_POLICY_UCAST = 1 << 2 +}; + +/* string "mellanox" */ +#define FIP_VENDOR_MELLANOX { 0x6d, 0x65, 0x6c, 0x6c, 0x61, 0x6e, 0x6f, 0x78 } + + +#define FIP_TEST_PKT_LENGTH(port, length, type) \ + if ((length) != sizeof(type) + IB_GRH_BYTES) { \ + vnic_dbg_fip(port->name, "Dump packet:" \ + "at %d unexpected size. length %d expected %d\n", \ + __LINE__, (int)length, \ + (int)(sizeof(type) + IB_GRH_BYTES)); \ + return -EINVAL; \ + } + +/* + * copy string b to string a and NULL termination. + * length a must be >= length b+1. 
+ */ +#define TERMINATED_MEMCPY(a,b) \ + do { \ + ASSERT(sizeof(a)>=sizeof(b)+1); \ + memcpy((a), (b), sizeof(b)); \ + (a)[sizeof(b)] = '\0'; \ + } while (0); + + +enum { + FIP_MAX_ADDR_TLVS = 6, + FIP_MAX_TLVS = 32, + FIP_MAX_EXT_DESC = 32, +}; + +struct fip_fip_type { + u8 type; + u8 length; + u16 reserved; +}; + +struct fip_header_simple { + __be16 opcode; + u8 reserved; + u8 subcode; + __be16 list_length; + __be16 flags; +}; + +struct fip_vendor_id_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; +}; + +struct fip_address_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + __be32 gwtype_qpn; + __be16 sl_gwportid; + __be16 lid; + u8 guid[8]; +}; + +struct fip_gw_information_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + u8 h_nmac_mgid; + u8 n_rss_mgid_tss_qpn; + __be16 n_rss_qpn_vnics; +}; + +struct fip_login_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + __be16 mtu; + __be16 vnic_id; + __be16 flags_vlan; + u8 mac[6]; + u8 eth_gid_prefix[5]; + u8 antispoofing; + __be16 vfields; + __be32 syndrom_ctrl_qpn; + u8 vnic_name[16]; +}; + +struct context_table_entry { + u8 v_rss_type; + u8 reserved; + u8 mac[ETH_ALEN]; + __be32 qpn; + u8 reserved1; + u8 sl; + __be16 lid; +}; + +struct fip_vhub_update_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + __be32 state_vhub_id; + __be32 tusn; +}; + +struct fip_vhub_table_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + __be32 vp_vhub_id; + __be32 tusn; + __be16 hdr; + __be16 table_size; +}; + +struct fip_vnic_identity_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + __be32 flags_vhub_id; + __be32 tusn; + __be16 vnic_id; + u8 mac[6]; + u8 port_guid[8]; + u8 vnic_name[16]; +}; + +struct fip_partition_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + __be16 reserved; + __be16 pkey; +}; + +struct fip_gw_identifier_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + u8 sys_guid[8]; + u8 sys_name[32]; + u8 gw_port_name[8]; +}; + +struct fip_ka_params_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; + __be32 adv_period; + __be32 ka_period; + __be32 vnic_ka_period; +}; + +struct fip_ext_desc_tlv { + struct fip_fip_type ft; + u8 vendor_id[8]; +}; + +struct fip_extended_type { + u8 ext_type; + u8 len; + u8 reserved; + u8 mandatory; +}; + +struct fip_ext_type_cap { + struct fip_extended_type et; + u32 reserved[4]; +}; + +struct fip_ext_type_boot { + struct fip_extended_type et; + u8 boot_prio; + u8 reserved; + __be16 discovery_timeout; +}; + +struct fip_ext_type_lag_props { + struct fip_extended_type et; + u8 gw_type; + u8 reserved; + __be16 lag_hash; + u8 weight_policy_flags; + u8 ca_threshold; + __be16 link_down_pol_thresh; + u32 reserved2[2]; +}; + +struct fip_ext_type_power_cycle_id { + struct fip_extended_type et; + __be64 power_cycle_id; + u32 reserved; +} __attribute__((packed)); + +struct fip_ext_type_hostname { + struct fip_extended_type et; + u8 hostname[32]; +}; + +struct fip_ext_type_ctrl_iport { + struct fip_extended_type et; + u8 vendor_id[8]; + __be32 gwtype_qpn; + __be16 sl_gwportid; + __be16 lid; + u8 guid[8]; +}; + +struct fip_ext_type_lag_member { + __be32 qpn; + __be16 sl_gw_portid; + __be16 lid; + u8 guid[8]; + u8 eport_state; + u8 reserved1; + u8 weight; + u8 link_utilization; + u32 reserved2; +}; + +struct fip_ext_type_lag_members { + struct fip_extended_type et; + struct fip_ext_type_lag_member lagm[0]; +}; + +struct fip_ext_group { + struct fip_ext_desc_tlv *fed[FIP_MAX_EXT_DESC]; + int num; +}; + +struct fip_address_group { + struct fip_address_tlv *fa[FIP_MAX_ADDR_TLVS]; + int num; +}; + +struct 
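+/* like fip_address_group above, the next group keeps pointers into the
+ * parsed packet (here: context table entries), not copies
+ */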
fip_context_group { + struct context_table_entry *cte; + int num; +}; + +struct fip_content { + struct fip_eoib_ver *eoib_ver; + struct fip_header_simple *fh; + struct fip_vendor_id_tlv *fvend; + struct fip_address_group fa; + struct fip_gw_information_tlv *fgwi; + struct fip_login_tlv *fl; + struct fip_vhub_update_tlv *fvu; + struct fip_vhub_table_tlv *fvt; + struct fip_vnic_identity_tlv *fvi; + struct fip_partition_tlv *fp; + struct fip_gw_identifier_tlv *fgid; + struct fip_ka_params_tlv *fka; + struct fip_ext_group fed; + struct fip_context_group cte; + u64 mask; + u16 offsets[FIP_MAX_TLVS]; + int num; +}; + +/**************************************************************************/ +/* packet format structs */ +/**************************************************************************/ +#define VENDOR_ID_LENGTH 8 + +struct fip_eoib_ver { + u8 version; + u8 reserved[3]; +}; + +struct fip_fip_header { + __be16 opcode; + u8 reserved; + u8 subcode; + __be16 list_length; + __be16 flags; + struct fip_fip_type type; + u8 vendor_id[VNIC_VENDOR_LEN]; +}; + +struct fip_discover_base { + struct fip_fip_type type; + u8 vendor_id[VNIC_VENDOR_LEN]; + u32 qpn; + u16 sl_port_id; + u16 lid; + u8 guid[GUID_LEN]; +}; + +struct eoib_adv_gw_info { /* Gabi */ + struct fip_fip_type type; + u8 vendor_id[VNIC_VENDOR_LEN]; + u8 system_guid[GUID_LEN]; + u8 system_name[VNIC_SYSTEM_NAME_LEN]; + u8 gw_port_name[VNIC_GW_PORT_NAME_LEN]; +}; + +/* keep alive information */ +struct eoib_adv_ka_info { /* Gabi */ + struct fip_fip_type type; + u8 vendor_id[VNIC_VENDOR_LEN]; + u32 gw_adv_period; + u32 gw_period; + u32 vnic_ka_period; +}; + +struct eoib_advertise { + struct fip_eoib_ver version; + struct fip_fip_header fip; + struct fip_discover_base base; + struct fip_fip_type type_1; + u8 vendor_id[VNIC_VENDOR_LEN]; + u8 flags; + u8 reserved; + u16 num_net_vnics; + struct eoib_adv_gw_info gw_info; /* Gabi */ + struct eoib_adv_ka_info ka_info; /* Gabi */ +}; + +struct syndrom_dword { + u8 syndrom; + u8 reserved[3]; +}; + +union syn_qp_ctrl { + struct syndrom_dword syn; + u32 ctl_qpn; +}; + +struct eoib_login { + struct fip_eoib_ver eoib_ver; + struct fip_header_simple fh; + struct fip_vendor_id_tlv fvend; + struct fip_address_tlv fa; + struct fip_login_tlv fl; +}; + +struct fip_solicit_legacy { + struct fip_eoib_ver version; + struct fip_header_simple fh; + struct fip_vendor_id_tlv fvend; + struct fip_address_tlv addr; +}; + +struct fip_solicit_new { + struct fip_eoib_ver version; + struct fip_header_simple fh; + struct fip_vendor_id_tlv fvend; + struct fip_address_tlv addr; + struct fip_ext_desc_tlv ext; + struct fip_ext_type_cap ext_cap; + struct fip_ext_type_hostname ext_hostname; +}; + +union fip_vhub_id { + struct { + u8 flags; + u8 reserved[3]; + } flags; + u32 vhub_id; +}; + +struct eoib_context_table { + struct fip_eoib_ver version; + struct fip_fip_header fip; + struct fip_fip_type type_1; + u8 vendor_id[VNIC_VENDOR_LEN]; + union fip_vhub_id vhub_id; + u32 tusn; + u8 flags; + u8 reserved; + u16 table_size; + /* here come the context entries */ +}; + +/* this is the number of DWORDS to subtract from type_1->length + * to get the size of the entries / 4. 
(size in dwords from start + * of vendor_id field until the first context entry + 1 for checksum + */ +#define FIP_TABLE_SUB_LENGTH 6 + +/* + * eoib_host_update will be used for vHub context requests, + * keep alives and logouts + */ +struct eoib_host_update { + struct fip_eoib_ver version; + struct fip_fip_header fip; + struct fip_fip_type type_1; + u8 vendor_id[VNIC_VENDOR_LEN]; + union fip_vhub_id vhub_id; + u32 tusn; + u16 vnic_id; + u8 mac[ETH_ALEN]; + u8 port_guid[GUID_LEN]; + u8 vnic_name[VNIC_NAME_LEN]; +}; + +enum fip_packet_fields { + EOIB_FIP_OPCODE = 0xFFF9, + FIP_FIP_HDR_LENGTH = 3, + FIP_FIP_HDR_TYPE = 13, + + /* keep all subcodes here */ + FIP_HOST_SOL_SUB_OPCODE = 0x1, + FIP_GW_ADV_SUB_OPCODE = 0x2, + FIP_HOST_LOGIN_SUB_OPCODE = 0x3, + FIP_GW_LOGIN_SUB_OPCODE = 0x4, + FIP_HOST_LOGOUT_SUB_OPCODE = 0x5, + FIP_GW_UPDATE_SUB_OPCODE = 0x6, + FIP_GW_TABLE_SUB_OPCODE = 0x7, + FIP_HOST_ALIVE_SUB_OPCODE = 0x8, + FIP_MAX_SUBCODES, + /* end subcodes section */ + + FIP_FIP_FCF_FLAG = 0x1, + FIP_FIP_SOLICITED_FLAG = 0x2, + FIP_FIP_ADVRTS_FLAG = 0x4, + FIP_FIP_FP_FLAG = 0x80, + FIP_FIP_SP_FLAG = 0x40, + + FIP_BASIC_LENGTH = 7, + FIP_BASIC_TYPE = 240, + + FIP_ADVERTISE_LENGTH_1 = 4, + FIP_ADVERTISE_TYPE_1 = 241, + FIP_ADVERTISE_HOST_VLANS = 0x80, + FIP_ADVERTISE_NUM_VNICS_MASK = 0x0FFF, + FIP_ADVERTISE_N_RSS_SHIFT = 12, + FIP_ADVERTISE_HOST_EN_MASK = 0x80, + FIP_ADVERTISE_ALL_VLAN_GW_MASK = 0x60, + FIP_ADVERTISE_GW_PORT_ID_MASK = 0x0FFF, + FIP_ADVERTISE_SL_SHIFT = 12, + + FIP_ADVERTISE_GW_LENGTH = 15, + FIP_ADVERTISE_GW_TYPE = 248, + + FIP_ADVERTISE_KA_LENGTH = 6, + FIP_ADVERTISE_KA_TYPE = 249, + + FIP_LOGIN_LENGTH_1 = 13, + FIP_LOGIN_TYPE_1 = 242, + FIP_LOGIN_LENGTH_2 = 4, + FIP_LOGIN_TYPE_2 = 246, + + FIP_LOGIN_V_FLAG = 0x8000, + FIP_LOGIN_M_FLAG = 0x4000, + FIP_LOGIN_VP_FLAG = 0x2000, + FIP_LOGIN_H_FLAG = 0x1000, + FIP_LOGIN_VLAN_MASK = 0x0FFF, + FIP_LOGIN_DMAC_MGID_MASK = 0x3F, + FIP_LOGIN_RSS_MGID_MASK = 0x0F, + FIP_LOGIN_RSS_MASK = 0x10, + FIP_LOGIN_RSS_SHIFT = 4, + FIP_LOGIN_CTRL_QPN_MASK = 0xFFFFFF, + FIP_LOGIN_VNIC_ID_BITS = 16, + FIP_LOGIN_ALL_VLAN_GW_FLAG = 0x0040, + + FIP_LOGOUT_LENGTH_1 = 13, + FIP_LOGOUT_TYPE_1 = 245, + + FIP_HOST_UPDATE_LENGTH = 13, + FIP_HOST_UPDATE_TYPE = 245, + FIP_HOST_VP_FLAG = 0x01, + FIP_HOST_U_FLAG = 0x80, + FIP_HOST_R_FLAG = 0x40, + + FIP_CONTEXT_UP_LENGTH = 9, + FIP_CONTEXT_UP_TYPE = 243, + FIP_CONTEXT_UP_EPORT_MASK = 0x30, + FIP_CONTEXT_UP_EPORT_SHIFT = 4, + FIP_CONTEXT_V_FLAG = 0x80, + FIP_CONTEXT_RSS_FLAG = 0x40, + FIP_CONTEXT_TYPE_MASK = 0x0F, + + FIP_CONTEXT_TBL_TYPE = 244, + FIP_CONTEXT_TBL_SEQ_MASK = 0xC0, + FIP_CONTEXT_TBL_SEQ_FIRST = 0x40, + FIP_CONTEXT_TBL_SEQ_LAST = 0x80, + + FKA_ADV_PERIOD = 8000, /* in mSecs */ + FKA_ADV_MISSES = 3 +}; + +enum fip_login_syndroms { + FIP_SYNDROM_SUCCESS = 0, + FIP_SYNDROM_HADMIN_REJECT = 1, + FIP_SYNDROM_GW_RESRC = 2, + FIP_SYNDROM_NO_NADMIN = 3, + FIP_SYNDROM_UNRECOGNISED_HOST = 4, + FIP_SYNDROM_UNSUPPORTED_PARAM = 5, + FIP_SYNDROM_GW_IS_LAG_MEMBER = 6, + FIP_SYNDROM_DUPLICATE_ADDRESS = 7, +}; + +/* + * Send a multicast or unicast solicit packet. The multicast packet is sent + * to the discover mcast group. Unicast packets are sent to the dqpn + dlid + * supplied. The dlid, dqpn, sl are ignored for multicast packets. + * functionreturns 0 on success and error code on failure +*/ +int fip_solicit_send(struct fip_discover *discover, + enum fip_packet_type multicast, u32 dqpn, + u16 dlid, u8 sl, int new_prot); + +/* + * Send a unicast login packet. 
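+ * On the wire this is a struct eoib_login (defined above): the EoIB
+ * version dword, a simple FIP header, and the vendor-id, address and
+ * login TLVs.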
This function supports both host and + * network admined logins. function returns 0 on success and + * error code on failure +*/ +int fip_login_send(struct fip_vnic_data *vnic); + +int fip_logout_send(struct fip_vnic_data *vnic); + +/* + * This function creates and sends a few types of packets (all ucast): + * vHub context request - new=1, logout=0 + * vHub context update / vnic keep alive - new=0, logout=0 + * vnic logout - new=0, logout=1 +*/ +int fip_update_send(struct fip_vnic_data *vnic, int request_new, int logout); + +/* + * Check if a received packet is a FIP packet, And if so return its subtype. + * The FIP type is also returned in fip_type and can be either EOIB_FIP_OPCODE + * or FCOIB_FIP_OPCODE. If the packet is not a FIP packet -EINVAL is returned. +*/ +int fip_pkt_parse(char *buffer, int length, int *fip_type); + +/* + * Already know that this is a FIP packet, return its subtype. +*/ +int fip_pkt_get_subtype_bh(char *buffer); + +/* + * parse a packet that is suspected of being an advertise packet. The packet + * returns 0 for a valid advertise packet and an error code other wise. The + * packets "interesting" details are returned in data. +*/ +int fip_advertise_parse_bh(struct fip_discover *discover, struct fip_content *fc, + struct fip_gw_data *data); + +/* + * parse a packet that is suspected of being an login ack packet. The packet + * returns 0 for a valid login ack packet and an error code other wise. The + * packets "interesting" details are returned in data. +*/ +int fip_login_parse(struct fip_discover *discover, struct fip_content *fc, + struct fip_login_data *data); + +static inline int _map_generic_pkt(struct vnic_port *port, + struct fip_ring_entry *tx_ring_entry, + void *mem, int pkt_size) +{ + /* alloc packet to be sent */ + tx_ring_entry->mem = mem; + + /* map packet to bus */ + tx_ring_entry->bus_addr = + ib_dma_map_single(port->dev->ca, + tx_ring_entry->mem, pkt_size, DMA_TO_DEVICE); + + if (unlikely(ib_dma_mapping_error(port->dev->ca, + tx_ring_entry->bus_addr))) { + vnic_warn(port->name, + "send_generic_pkt failed to map to pci\n"); + return -ENOMEM; + } + tx_ring_entry->length = pkt_size; + + return 0; +} + +static inline int alloc_map_fip_buffer(struct ib_device *ca, + struct fip_ring_entry *me, + int size, gfp_t mask) +{ + me->mem = kmalloc(size, mask); + if (!me->mem) { + vnic_warn(ca->name, "failed to alloc memory (%d)\n", size); + return -ENOMEM; + } + + me->bus_addr = ib_dma_map_single(ca, me->mem, size, DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(ca, me->bus_addr))) { + kfree(me->mem); + vnic_warn(ca->name, "ib_dma_mapping_error failed\n"); + return -ENOMEM; + } + me->length = size; + me->entry_posted = 0; + + return 0; +} + +#define DELAYED_WORK_CLEANUP_JIFFS 2 +#define FIP_MAX_PKT_PRINT_LENGTH 120 +#define FIP_OP_RECV (1ul << 31) + +static const char fip_discover_mgid[GID_LEN] = { + 0xFF, 0x12, 0xE0, 0x1B, + 0x00, 0x06, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00}; +static const char fip_solicit_mgid[GID_LEN] = { + 0xFF, 0x12, 0xE0, 0x1B, + 0x00, 0x07, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00}; + + +/* TODO - remove this: for initial debug only */ +void fip_dbg_dump_raw_pkt(int level, void *buff, + int length, int is_tx, char *name); +enum { + FIP_ETH_HEADER_LEN = 14, + FIP_ENCAP_LEN = 4, + FIP_PROTOCOL_RX_SIZE = 16, /* must be power of 2 */ + FIP_PROTOCOL_TX_SIZE = 64, /* must be power of 2 */ + FIP_LOGIN_RX_SIZE = 64, /* must be power of 2 */ + FIP_LOGIN_TX_SIZE = 64, /* must be power of 2 */ + + 
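+ /*
+ * The ring sizes above must stay powers of 2: the ring code derives
+ * slot indexes by masking, e.g. index = head & (ring->size - 1) (see
+ * send_generic_ucast_pkt() below), which only equals head % size when
+ * size is a power of 2.
+ */
+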
/* timeout in seconds between LOGIN and ACK */
+ FIP_LOGIN_TIMEOUT = 8,
+ FIP_RESOLICIT_TIME = 8,
+
+ IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + FIP_ENCAP_LEN,
+};
+
+struct fip_rcv_pkt {
+ struct list_head list;
+ struct fip_content *fc;
+ int length;
+ void *mem;
+};
+
+/*
+ * Alloc the discover CQ and QP, and configure the QP to RTS.
+ * Alloc the RX + TX rings and queue work for the discover
+ * finite state machine code. If complete is set, it clears
+ * possible previous GW / VNIC data structs on init.
+ */
+int fip_discover_init(struct vnic_port *port, struct fip_discover *discover,
+ u16 pkey, int complete);
+
+/*
+ * Free the discover TX and RX rings, QP and CQ. If complete
+ * is set, it clears possible previous GW / VNIC data structs
+ * by using a "complete" flush, otherwise vnic data is preserved.
+*/
+int fip_discover_cleanup(struct vnic_port *port, struct fip_discover *discover, int complete);
+
+/*
+ * Send a single multicast packet.
+ * Returns 0 on success, non-zero on failure.
+*/
+int fip_mcast_send(struct vnic_port *port, struct ib_qp *qp,
+ unsigned int wr_id, u64 mapping,
+ int size, u16 pkey_index, struct vnic_mcast *mcast);
+/*
+ * Send a single unicast packet.
+ * Returns 0 on success, non-zero on failure.
+*/
+int fip_ucast_send(struct vnic_port *port, struct ib_ah *ah,
+ struct ib_qp *qp,
+ unsigned int wr_id, u64 mapping,
+ int size, u16 pkey_index, u32 dest_qpn, u16 dlid,
+ u32 qkey, u8 sl);
+/*
+ * Configure a newly allocated QP and move it
+ * from reset->init->RTR->RTS.
+ */
+int fip_init_qp(struct vnic_port *port, struct ib_qp *qp,
+ u16 pkey_index, char *name);
+
+/*
+ * Allocate a single RX buffer (of the given size), map it to the PCI
+ * bus and post it to the QP for receive. The _id parameter identifies
+ * the entry in the receive queue when its completion is received.
+ * Kernel and bus addresses are returned in mem_entry.
+ * Returns 0 on success, else failure.
+ */
+int fip_post_receive(struct vnic_port *port, struct ib_qp *qp, int size,
+ int _id, struct fip_ring_entry *mem_entry, char *name);
+
+/* triggered by a core event */
+void fip_qp_to_reset(struct ib_qp *qp, char *name);
+void fip_flush_rings(struct vnic_port *port,
+ struct ib_cq *cq,
+ struct ib_qp *qp,
+ struct fip_ring *rx_ring,
+ struct fip_ring *tx_ring,
+ char *name);
+void fip_free_rings(struct vnic_port *port,
+ struct fip_ring *rx_ring,
+ struct fip_ring *tx_ring,
+ char *name);
+
+/*
+ * This function allocates the tx buffers and initializes the head and
+ * tail indexes.
+ */
+int fip_init_tx(int size, struct fip_ring *tx_ring, char *name);
+
+/*
+ * Create the RX ring for the given QP and post its receive buffers
+ * (the RX half of configuring the discover QP).
+ */
+int fip_init_rx(struct vnic_port *port, int ring_size, struct ib_qp *qp,
+ struct fip_ring *rx_ring, char *name);
+
+/*
+ * This is a general purpose CQ completion function that handles
+ * completions on RX and TX rings. It can serve all users that are
+ * using RX and TX rings.
+ * RX completions are distinguished from TX completions by the MSB
+ * (see FIP_OP_RECV), which is set for RX and clear for TX. For RX, the
+ * memory is unmapped from the PCI bus and the head is incremented. For
+ * TX the memory is unmapped and then freed.
+ * The function returns the number of packets received.
+*/
+int fip_comp(struct vnic_port *port,
+ struct ib_cq *cq,
+ struct fip_ring *rx_ring,
+ struct fip_ring *tx_ring,
+ char *name);
+
+/*
+ * This function is the driving engine of the vnic logic. It manages the
+ * vNic state machines. 
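+ *
+ * A minimal sketch of the dispatch pattern such a work handler follows
+ * (illustrative only; the state names are from the FIP_VNIC_* set used
+ * by this driver):
+ *
+ *	vnic = container_of(work, struct fip_vnic_data, vnic_task.work);
+ *	switch (vnic->state) {
+ *	case FIP_VNIC_LOGIN:		/* send a login request */
+ *	case FIP_VNIC_WAIT_4_ACK:	/* wait for / verify the login ack */
+ *	...				/* rings, mcast and vhub init states */
+ *	}
+ *	if (recall)
+ *		queue_delayed_work(fip_wq, &vnic->vnic_task, recall_time);
+ *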
+ * Some of the states in the state machine could have been removed because
+ * they contain "actions" and not states. Still, it is easier to maintain
+ * the code this way and it gives an easy mechanism for exception handling
+ * and retries.
+ * Only call this function from fip_wq context.
+*/
+void fip_vnic_fsm(struct work_struct *work);
+
+/*
+ * Mark the vnic for deletion and trigger a delayed call to the cleanup
+ * function. In the past the vnic was moved to another list, but this
+ * might cause vnic duplication if new vnics are added to the GW. Even
+ * if the vnic is being flushed we need to know it is there.
+ *
+ * Note: This deletion method ensures that all pending vnic work requests
+ * are cleared without depending on the calling context.
+*/
+void fip_vnic_close(struct fip_vnic_data *vnic, enum fip_flush flush);
+
+/*
+ * Free vnic resources. This includes closing the data vnic (data QPs etc.)
+ * and the discovery resources. If the vnic can be totally destroyed (no
+ * pending work) the vnic will be removed from the GW list and its memory
+ * freed. If not, the vnic will not be freed and the function will return an
+ * error. The caller needs to call this function again to complete the
+ * operation.
+*/
+int fip_vnic_destroy(struct fip_vnic_data *vnic);
+
+struct fip_vnic_data *fip_vnic_alloc(struct vnic_port *port,
+ struct fip_gw_data *gw,
+ int hadmin,
+ u16 vnic_id);
+
+/*
+ * Look for a vnic in the GW vnic list. The search key used is either the
+ * vnic_id, which is unique, or the mac+vlan pair. A match on either key will
+ * result in the return of the vnic. Both keys are necessary because the host
+ * assigned delete flow might not have access to the vnic_id. The search
+ * disregards vnics that are undergoing full flush (they will be removed soon).
+*/
+struct fip_vnic_data *fip_vnic_find_in_list(struct fip_gw_data *gw,
+ u16 vnic_id, u8 *mac,
+ u16 vlan, u8 vlan_used);
+
+/*
+ * Process an incoming login ack packet. The packet was already parsed and
+ * its data was placed in *data. The function creates RX and TX rings for the
+ * vnic and starts the multicast join procedure.
+ * This function should not be called for packets other than login ack packets.
+*/
+void fip_vnic_login_ack_recv(struct fip_vnic_data *vnic,
+ struct fip_login_data *data);
+
+/*
+ * This function should be called when the building of a vhub context
+ * table is done and the vnic state should transition to CONNECTED.
+*/
+int fip_vnic_tbl_done(struct fip_vnic_data *vnic);
+int fip_vnic_mcast_recnct(struct fip_vnic_data *vnic);
+
+/*
+ * Init the vnic's vHub table data structures, before using them.
+ */
+void vhub_ctx_init(struct fip_vnic_data *vnic);
+void vhub_table_free(struct vhub_elist *elist);
+
+/*
+ * Clear and free the vnic's vHub context table data structures.
+ */
+void vhub_ctx_free(struct fip_vnic_data *vnic);
+
+/*
+ * This function handles a vhub context table packet. The table will
+ * be processed only if we do not have an up-to-date local copy of
+ * our own. The table update supports multi-packet tables, so care
+ * must be taken in building the complete table.
+*/
+int vhub_handle_tbl(struct fip_vnic_data *vnic, struct fip_content *fc,
+ u32 vhub_id, u32 tusn);
+
+/*
+ * This function handles vhub context update packets. There are three flows
+ * in handling update packets. 
The first is before the main table is up + * to date, the second is after the table is up to date but before it was + * passed to the ownership of the data vnic (login struct) and the local + * lists are freed, and the last is when the table maintanence is done + * by the data vnic. This function handles all cases. +*/ +int vhub_handle_update(struct fip_vnic_data *vnic, + u32 vhub_id, u32 tusn, + struct vnic_table_entry *data); + +/* + * This function writes the main vhub table to the data (login) vnic. + * You should call it when the data vnic is ready for it and after the + * table is up to date (and the update list was applied to the main list) + */ +int fip_vnic_write_tbl(struct fip_vnic_data *vnic); + +/* sysfs entries for hadmin vNics*/ +int vnic_create_hadmin_dentry(struct fip_vnic_data *vnic); +void vnic_delete_hadmin_dentry(struct fip_vnic_data *vnic); +void extract_memb_extended(struct fip_ext_type_lag_members *ext_lag_membs, + int ext_length, + struct lag_members *lagm, + char *name); +int handle_member_update(struct fip_vnic_data *vnic, struct lag_members *lm); +int extract_vhub_extended(struct fip_ext_desc_tlv *fed, + struct fip_vnic_data *vnic); +static inline int send_generic_ucast_pkt(struct vnic_port *port, + struct ib_ah *ah, + struct fip_ring *tx_ring, + void *mem, int pkt_size, + struct ib_qp *qp, + int pkey_index, + u32 dst_qpn, u16 dst_lid, + u32 qkey, u8 sl) +{ + int index, rc; + unsigned long flags; + unsigned long tail; + + /* + * we are only allowed to update the head at task level so no need to + * perform any locks here + */ + spin_lock_irqsave(&tx_ring->ring_lock, flags); + index = tx_ring->head & (tx_ring->size - 1); + + vnic_dbg_fip(port->name, "send ucast packet\n"); + + spin_lock(&tx_ring->head_tail_lock); + tail = tx_ring->tail; + spin_unlock(&tx_ring->head_tail_lock); + + /* ring full try again */ + if (tx_ring->head - tail >= tx_ring->size) { + vnic_warn(port->name, "send_generic_pkt ring full: QPN 0x%x: tail=%ld head=%ld diff=%ld\n", + qp->qp_num, tx_ring->tail, tx_ring->head, tx_ring->head - tx_ring->tail); + rc = -EAGAIN; + goto err; + } + + + rc = _map_generic_pkt(port, &tx_ring->ring[index], mem, pkt_size); + if (rc) + goto err; + + rc = fip_ucast_send(port, ah, qp, index, + tx_ring->ring[index].bus_addr, + pkt_size, pkey_index, dst_qpn, dst_lid, + qkey, sl); + + if (rc) { + vnic_warn(port->name, "fip_ucast_send() failed (%d)\n", rc); + rc = -ENODEV; + goto error_unmap_dma; + } + + tx_ring->head++; + + spin_unlock_irqrestore(&tx_ring->ring_lock, flags); + return 0; + +error_unmap_dma: + ib_dma_unmap_single(port->dev->ca, + tx_ring->ring[index].bus_addr, + pkt_size, DMA_TO_DEVICE); +err: + spin_unlock_irqrestore(&tx_ring->ring_lock, flags); + return rc; +} + +static inline const char *eport_state_str(int state) +{ + switch (state) { + case EPORT_STATE_DOWN: return "Down"; + case EPORT_STATE_UP: return "Up"; + default:return "Invalid"; + } +} + +#endif /* _VNIC_FIP_H */ diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_discover.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_discover.c new file mode 100644 index 0000000000000..71829aaa626ae --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_discover.c @@ -0,0 +1,2183 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_fip.h" +#include "vnic_fip_discover.h" +#include "vnic_fip_pkt.h" + +#define FIP_MAX_PKT_PRINT_LENGTH 120 + +static void fip_purge_gws(struct work_struct *work); +static void fip_discover_gw_fsm(struct work_struct *work); +static void fip_discover_hadmin_update(struct work_struct *work); +static void fip_discover_fsm(struct work_struct *work); +void fip_close_gw(struct fip_gw_data *gw, enum fip_flush flush); + +/* TODO - remove this: for initial debug only */ +void fip_dbg_dump_raw_pkt(int level, void *buff, + int length, int is_tx, char *name) +{ + int i; + int tmp_len; + u32 *data_ptr; + unsigned char *tmp_data_ptr; + + if (!(vnic_msglvl & VNIC_DEBUG_PKT_DUMP)) + return; + + printk(KERN_DEBUG "%s %s: packet length is %d\n", + is_tx ? "TX" : "RX", name, length); + + length = (length > FIP_MAX_PKT_PRINT_LENGTH) ? + FIP_MAX_PKT_PRINT_LENGTH : length; + + tmp_len = (length >> 2) + 1; + data_ptr = (u32 *)buff; + for (i = 0; i < tmp_len; i++) { + if (!is_tx && i == IB_GRH_BYTES >> 2) + printk(KERN_DEBUG "========================\n"); + tmp_data_ptr = (unsigned char *)&data_ptr[i]; + printk(KERN_DEBUG "%02x %02x %02x %02x \n", + tmp_data_ptr[0], tmp_data_ptr[1], + tmp_data_ptr[2], tmp_data_ptr[3]); + } +} + +/* + * Configure the discover QP. 
This includes configuring rx+tx + * moving the discover QP to RTS and creating the tx and rx rings + */ +int fip_discover_start_rings(struct fip_discover *discover, + struct fip_ring *rx_ring, + struct fip_ring *tx_ring, + struct ib_cq *cq, + struct ib_qp *qp) +{ + int rc; + + rc = fip_init_tx(tx_ring->size, tx_ring, discover->name); + if (rc) { + vnic_warn(discover->name, "fip_init_tx failed rc %d\n", rc); + /* set RX ring size to 0 as indication of the failure + so RX rings won't be freed, no need to set tx_ring->size + since fip_init_tx error flow will handle it */ + rx_ring->size = 0; + return rc; + } + + rc = fip_init_rx(discover->port, rx_ring->size, qp, rx_ring, discover->name); + if (rc) { + vnic_warn(discover->name, "fip_init_rx returned %d\n", rc); + goto release_queues; + } + + return 0; + +release_queues: + fip_flush_rings(discover->port, cq, qp, rx_ring, tx_ring, discover->name); + fip_free_rings(discover->port, rx_ring, tx_ring, discover->name); + + return rc; +} + +int fip_discover_init_rings(struct vnic_port *port, + struct fip_discover *discover, + struct fip_ring *rx_ring, + struct fip_ring *tx_ring, + struct ib_cq **cq, + struct ib_qp **qp, + ib_comp_handler comp_handler) +{ + struct ib_qp_init_attr qp_init_attr; + struct ib_device *ca = port->dev->ca; + + + *cq = ib_create_cq(ca, comp_handler, NULL, discover, + rx_ring->size + tx_ring->size, 0); + if (IS_ERR(*cq)) { + vnic_warn(discover->name, "failed to create CQ\n"); + goto out; + } + + memset(&qp_init_attr, 0, sizeof(qp_init_attr)); + qp_init_attr.cap.max_send_wr = tx_ring->size; + qp_init_attr.cap.max_recv_wr = rx_ring->size; + qp_init_attr.cap.max_send_sge = 1; + qp_init_attr.cap.max_recv_sge = 1; + qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.qp_type = IB_QPT_UD; + qp_init_attr.send_cq = *cq; + qp_init_attr.recv_cq = *cq; + + *qp = ib_create_qp(port->pd, &qp_init_attr); + if (IS_ERR(*qp)) { + vnic_warn(discover->name, "failed to create QP\n"); + goto error_free_cq; + } + + /* move QP to RTS */ + if (fip_init_qp(discover->port, *qp, discover->pkey_index, discover->name)) { + vnic_warn(discover->name, "fip_init_qp failed for qp\n"); + goto error_free_qp; + } + + /* init RX + TX rings */ + if (fip_discover_start_rings(discover, rx_ring, tx_ring, *cq, *qp)) { + vnic_warn(discover->name, "failed to start rings\n"); + goto error_free_qp; + } + + /* enable receiving CQ comps, triggers fip_discover_comp() */ + if (ib_req_notify_cq(*cq, IB_CQ_NEXT_COMP)) { + vnic_warn(discover->name, "ib_req_notify_cq failed for cq\n"); + goto error_release_rings; + } + + return 0; + +error_release_rings: + fip_flush_rings(discover->port, *cq, *qp, rx_ring, tx_ring, discover->name); + fip_free_rings(discover->port, rx_ring, tx_ring, discover->name); +error_free_qp: + ib_destroy_qp(*qp); +error_free_cq: + ib_destroy_cq(*cq); +out: + *qp = NULL; + *cq = NULL; + return -ENODEV; +} + +/* + * This function handles completions of both TX and RX + * packets. RX packets are unmapped lightly parsed moved to a list + * and passed to thread processing. TX packets are unmapped and freed. + * Note: this function is called from interrupt context + */ +static void fip_discover_comp(struct ib_cq *cq, void *discover_ptr) +{ + struct fip_discover *discover = discover_ptr; + + /* handle completions. 
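+ * This callback runs in interrupt context (the CQ was armed with
+ * IB_CQ_NEXT_COMP in fip_discover_init_rings()), so it must not
+ * block; it only drains the rings before handing off.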
On RX packets this will call discover_process_rx + * from thread context to continue processing */ + if (fip_comp(discover->port, discover->cq, + &discover->rx_ring, &discover->tx_ring, + discover->name)) + fip_discover_process_rx(discover); +} + +/* + * Alloc the discover CQ, QP. Configure the QP to RTS. + * alloc the RX + TX rings and queue work for discover + * finite state machine code. + */ +int fip_discover_init(struct vnic_port *port, struct fip_discover *discover, + u16 pkey, int complete) +{ + int rc; + + discover->port = port; + discover->flush = FIP_NO_FLUSH; + discover->state = FIP_DISCOVER_INIT; + discover->rx_ring.size = FIP_PROTOCOL_RX_SIZE; + discover->tx_ring.size = FIP_PROTOCOL_TX_SIZE; + discover->new_prot_gws = 0; + discover->old_prot_gws = 0; + + /* This is in preparation for pkey discovery */ + + init_completion(&discover->flush_complete); + + INIT_DELAYED_WORK(&discover->fsm_task, fip_discover_fsm); + INIT_DELAYED_WORK(&discover->cleanup_task, fip_purge_gws); + INIT_DELAYED_WORK(&discover->hadmin_update_task, fip_discover_hadmin_update); + INIT_WORK(&discover->pkt_rcv_task_bh, fip_discover_process_rx_bh); + spin_lock_init(&discover->rcv_list.lock); + INIT_LIST_HEAD(&discover->rcv_list.list); + spin_lock_init(&discover->lock); + + + if (complete) { + discover->pkey = pkey; + INIT_LIST_HEAD(&discover->gw_list); + init_rwsem(&discover->l_rwsem); + sprintf(discover->name, "%s_P%x", port->name, discover->pkey); + } + INIT_LIST_HEAD(&discover->hadmin_cache); + vnic_mcast_root_init(&discover->mcast_tree); + + if (!ib_find_pkey(port->dev->ca, port->num, discover->pkey, &discover->pkey_index)) { + rc = fip_discover_init_rings(port, discover, &discover->rx_ring, + &discover->tx_ring, &discover->cq, + &discover->qp, fip_discover_comp); + if (rc) { + vnic_warn(discover->name, "descovered init failed rc=%d\n", rc); + return rc; + } + + /* start discover FSM code */ + /* calls fip_discover_fsm() */ + queue_delayed_work(fip_wq, &discover->fsm_task, 0); + } else { + vnic_warn(discover->name, "Configured PKEY 0x%X is not supported on port\n", discover->pkey); + discover->pkey_index = ILLEGAL_PKEY_INDEX; + } + + + return 0; +} + +void fip_recv_list_flush(struct fip_discover *discover) +{ + struct list_head discov_recv_local; + struct fip_rcv_pkt *rcv, *rcv1; + unsigned long flags; + + INIT_LIST_HEAD(&discov_recv_local); + + spin_lock_irqsave(&discover->rcv_list.lock, flags); + list_replace_init(&discover->rcv_list.list, &discov_recv_local); + spin_unlock_irqrestore(&discover->rcv_list.lock, flags); + + list_for_each_entry_safe(rcv, rcv1, &discov_recv_local, list) { + list_del(&rcv->list); + kfree(rcv); + } + return; +} + +/* + * free the discover TX and RX rings, QP and CQ. + * May not be called from fip wq context. + */ +int fip_discover_cleanup(struct vnic_port *port, struct fip_discover *discover, int complt) +{ + if (discover->state == FIP_DISCOVER_OFF) + return -EINVAL; + + /* move FSM to flush state and wait for the FSM + * to finish whatever it is doing before we continue + */ + vnic_dbg_mark(); + init_completion(&discover->flush_complete); + discover->flush = complt ? 
FIP_FULL_FLUSH : FIP_PARTIAL_FLUSH; + cancel_delayed_work(&discover->fsm_task); +#ifndef _BP_WORK_SYNC + cancel_delayed_work_sync(&discover->hadmin_update_task); +#else + cancel_delayed_work(&discover->hadmin_update_task); + flush_workqueue(fip_wq); +#endif + /* flush any hadmin entries leftovers */ + { + struct fip_hadmin_cache *hadmin, *hadmin_t; + + spin_lock_irq(&discover->lock); + list_for_each_entry_safe(hadmin, hadmin_t, + &discover->hadmin_cache, next) { + list_del(&hadmin->next); + kfree(hadmin); + } + spin_unlock_irq(&discover->lock); + } + + /* calls fip_discover_fsm() */ + queue_delayed_work(fip_wq, &discover->fsm_task, 0); + vnic_dbg_mark(); + /* calls fip_discover_fsm() */ + wait_for_completion(&discover->flush_complete); + vnic_dbg_mark(); + + /* make sure that discover FSM is idle */ +#ifndef _BP_WORK_SYNC + cancel_delayed_work_sync(&discover->fsm_task); +#else + cancel_delayed_work(&discover->fsm_task); + flush_workqueue(fip_wq); +#endif + + if (discover->pkey_index != ILLEGAL_PKEY_INDEX) { + fip_flush_rings(port, discover->cq, discover->qp, + &discover->rx_ring, &discover->tx_ring, + discover->name); + fip_free_rings(port, &discover->rx_ring, &discover->tx_ring, + discover->name); + + fip_recv_list_flush(discover); + if (discover->qp) + ib_destroy_qp(discover->qp); + discover->qp = NULL; + + if (discover->cq) + ib_destroy_cq(discover->cq); + discover->cq = NULL; + } + + return 0; +} + +/* + * This function runs in interrupt context + * It does sanity checking of the packet, moves it to a list and passes + * handling to a thread. + */ +void fip_discover_process_rx(struct fip_discover *discover) +{ + struct vnic_port *port = discover->port; + int mtu_size = FIP_UD_BUF_SIZE(port->max_mtu_enum); + int rc; + int queue_packet, one_or_more_queued = 0; + struct fip_rcv_pkt *rcv, *rcv1; + struct list_head discov_recv_local; + int index; + struct fip_content *fc; + int err; + struct fip_ring_entry *ring; + + INIT_LIST_HEAD(&discov_recv_local); + + if (discover->flush != FIP_NO_FLUSH) + return; + + while (discover->rx_ring.head != discover->rx_ring.tail) { + fc = NULL; + queue_packet = 0; + index = discover->rx_ring.tail & (discover->rx_ring.size - 1); + ring = &discover->rx_ring.ring[index]; + + if (ring->entry_posted == 1 && + discover->state == FIP_DISCOVER_SOLICIT) { + fc = kzalloc(sizeof *fc, GFP_ATOMIC); + if (likely(fc)) { + /* login is the first state we RX packets in */ + rc = fip_packet_parse(port, ring->mem + IB_GRH_BYTES, + ring->length - IB_GRH_BYTES, fc); + if (!rc) + fip_discover_rx_packet(&queue_packet, fc); + } else + vnic_warn(discover->name, "allocation failed\n"); + } + if (queue_packet) { + int length; + + length = ring->length - IB_GRH_BYTES; + rcv = kmalloc(sizeof *rcv, GFP_ATOMIC); + if (!rcv) { + vnic_dbg_fip(discover->name, "failed kmalloc\n"); + kfree(fc); + } else { + struct fip_ring_entry me; + + err = alloc_map_fip_buffer(port->dev->ca, &me, + mtu_size, GFP_ATOMIC); + if (err) { + kfree(fc); + kfree(rcv); + } else { + rcv->length = length; + rcv->fc = fc; + rcv->mem = ring->mem; + list_add_tail(&rcv->list, &discov_recv_local); + one_or_more_queued++; + ib_dma_unmap_single(port->dev->ca, + ring->bus_addr, + mtu_size, DMA_FROM_DEVICE); + *ring = me; + } + } + } else + kfree(fc); + + rc = fip_post_receive(port, discover->qp, + FIP_UD_BUF_SIZE(discover->port->max_mtu_enum), + index, ring, discover->name); + if (rc) + vnic_warn(discover->name, "fip_post_receive rc %d\n", rc); + + discover->rx_ring.tail++; + } + + if (one_or_more_queued) { + 
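+ /*
+ * Hand the packets collected above to the fip_wq worker: take the
+ * discover lock first so a concurrent flush cannot slip in between
+ * the splice onto rcv_list and the queue_work() call; if a flush
+ * has started, the local list is freed below instead.
+ */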
spin_lock(&discover->lock); + if (likely(discover->flush == FIP_NO_FLUSH)) { + spin_lock(&discover->rcv_list.lock); + list_splice_init(&discov_recv_local, discover->rcv_list.list.prev); + spin_unlock(&discover->rcv_list.lock); + /* calls fip_discover_process_rx_bh */ + queue_work(fip_wq, &discover->pkt_rcv_task_bh); + spin_unlock(&discover->lock); + } else { + spin_unlock(&discover->lock); + list_for_each_entry_safe(rcv, rcv1, &discov_recv_local, list) { + list_del(&rcv->list); + kfree(rcv->fc); + kfree(rcv->mem); + kfree(rcv); + } + } + } + + return; +} + +/* + * This function is the RX packet handler bottom half. It runs on the fip wq. +*/ +void fip_discover_process_rx_bh(struct work_struct *work) +{ + struct fip_discover *discover = + container_of(work, struct fip_discover, pkt_rcv_task_bh); + int rc; + struct list_head discov_recv_local; + struct fip_rcv_pkt *rcv, *rcv1; + unsigned long flags; + + INIT_LIST_HEAD(&discov_recv_local); + + /* the irqsave is needed because debug kernel above 2.6.27 complains about + * hard irq safe to hard irq unsafe on discover.lock */ + spin_lock_irqsave(&discover->rcv_list.lock, flags); + list_replace_init(&discover->rcv_list.list, &discov_recv_local); + spin_unlock_irqrestore(&discover->rcv_list.lock, flags); + + if (discover->flush != FIP_NO_FLUSH) { + list_for_each_entry_safe(rcv, rcv1, &discov_recv_local, list) { + list_del(&rcv->list); + kfree(rcv->fc); + kfree(rcv->mem); + kfree(rcv); + } + return; + } + + list_for_each_entry_safe(rcv, rcv1, &discov_recv_local, list) { + rc = fip_discover_rx_packet_bh(discover, rcv->fc); + if (rc) + vnic_warn(discover->name, "discover_rx_packet rc %d\n", rc); + + list_del(&rcv->list); + kfree(rcv->fc); + kfree(rcv->mem); + kfree(rcv); + } + return; +} + +static inline int fip_close_all_vnics(struct fip_gw_data *gw, enum fip_flush flush) +{ + struct fip_vnic_data *vnic; + int open_vnics = 0; + + vnic_dbg_func(gw->discover->name); + + list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) { + open_vnics++; + fip_vnic_close(vnic, flush); + } + return open_vnics; +} + +static int fip_gw_create_vnics(struct fip_gw_data *gw) +{ + struct fip_vnic_data *vnic; + unsigned long first_free_vnic; + struct fip_vnic_send_info gw_address; + int i; + + gw->info.gw_num_vnics = (gw->info.gw_num_vnics > FIP_MAX_VNICS_PER_GW) ? + FIP_MAX_VNICS_PER_GW : gw->info.gw_num_vnics; + + + gw->info.gw_num_vnics = vnic_net_admin ? gw->info.gw_num_vnics : 0; + fip_vnic_create_gw_param(&gw_address, gw->info.gw_qpn, VNIC_FIP_QKEY, + gw->info.gw_lid, vnic_gw_ctrl_sl(gw)); + /* for host admined */ + list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) { + if (vnic->hadmined) { + if (gw->info.hadmined_en) + fip_hadmin_vnic_refresh(vnic, &gw_address); + else { + vnic_dbg_fip(gw->discover->name, + "fip_gw_create_vnics hadmin disabled, " + "close open hadmin vnics\n"); + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + } + } + } + + /* for network admined */ + for (i = gw->vnic_count; i < gw->info.gw_num_vnics; i++) { + vnic_dbg_fip(gw->discover->name, "fip_gw_create_vnics available" + " vnics %d needed %d\n", + gw->vnic_count, gw->info.gw_num_vnics); + + /* start network assigned at half array. 
leave first half to host admin */ + first_free_vnic = find_first_zero_bit(gw->n_bitmask, + FIP_MAX_VNICS_PER_GW); + if (first_free_vnic >= FIP_MAX_VNICS_PER_GW) + return -ENOMEM; + + vnic = fip_vnic_alloc(gw->discover->port, gw, 0 /* hadmin */, first_free_vnic); + if (!vnic) + return -ENOMEM; + + fip_vnic_set_gw_param(vnic, &gw_address); + set_bit(first_free_vnic, gw->n_bitmask); + list_add_tail(&vnic->gw_vnics, &gw->vnic_list); + gw->vnic_count++; + + /* calls fip_vnic_fsm() */ + cancel_delayed_work(&vnic->vnic_task); + fip_vnic_fsm(&vnic->vnic_task.work); + } + + return 0; +} + +/* + * This function goes over vnics and closes network administrated vNics + * that are not open and do not receive neighbor table info (there + * is no way for the BXM to tell the vNics to close before the + * vnic is listening to the neighbour tables). +*/ +static int fip_gw_close_nonopen_vnics(struct fip_gw_data *gw) +{ + struct fip_vnic_data *vnic; + int closed_vnics = 0; + + vnic_dbg_fip(gw->discover->name, "Try to close non open vnics\n"); + + list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) { + vnic_dbg_fip(gw->discover->name, "check vnic %s, hadmin %d state %d\n", + vnic->name, vnic->hadmined, vnic->state); + if (!vnic->hadmined && vnic->state < FIP_VNIC_VHUB_DONE) { + vnic_dbg_fip(gw->discover->name, "closing vnic %s\n", vnic->name); + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + closed_vnics++; + } + } + + return closed_vnics; +} + +/* permanently delete all vnics pending delete. The function goes over + * the list of vnics awaiting deletion and tries to delete them. If the + * vnic destructor returns an error value (currently busy) the function + * will requeue it self for another try. The function will also test if + * new vnics need to be added as a result of vnic removal. + */ +static void fip_purge_vnics(struct work_struct *work) +{ + struct fip_gw_data *curr_gw = + container_of(work,struct fip_gw_data, vnic_cleanup_task.work); + struct fip_vnic_data *vnic, *tmp_vnic; + int vnic_id, rc, del_cnt = 0, retry = 0; + unsigned long *bitmask; + + vnic_dbg_fip(curr_gw->discover->name, "fip_purge_vnics\n"); + + list_for_each_entry_safe(vnic, tmp_vnic, &curr_gw->vnic_list, gw_vnics) { + enum fip_flush f; + vnic_id = vnic->vnic_id; + bitmask = vnic->hadmined ? NULL : curr_gw->n_bitmask; + + /* If successful vnic is removed from list and destroyed */ + f = vnic->flush; + if (f != FIP_NO_FLUSH) { + rc = fip_vnic_destroy(vnic); + if (!rc) { + del_cnt++; + if (f == FIP_FULL_FLUSH && bitmask) + clear_bit(vnic_id, bitmask); + } else { + retry |= rc; + } + } + + /* limit the number of vnics to purge in each loop to let other + * tasks on same wq to run (i.e., avoid starvation). + */ + if (del_cnt > 2) { + retry = 1; + break; + } + } + + /* This means we still have vnics that refuse to close retry later */ + if (retry){ + vnic_dbg_mark(); + /* calls fip_purge_vnics() */ + queue_delayed_work(fip_wq, &curr_gw->vnic_cleanup_task, HZ / 10); + } else { + vnic_dbg_fip(curr_gw->discover->name, "fip_purge_vnics, all GW" + " vnics closed\n"); + + if (curr_gw->hadmin_gw && curr_gw->state == FIP_GW_HOST_ADMIN && list_empty(&curr_gw->vnic_list)) { + vnic_warn(curr_gw->discover->name, + "Removing Host admin GW %s with no vnics\n", + (char*)curr_gw->info.vol_info.gw_port_name); + fip_close_gw(curr_gw, FIP_FULL_FLUSH); + } + /* test and open new vnics if vnics are missing */ + /* ALITODO: after GW timeout, a vnic is re-created! why is that? 
+ if (fip_gw_create_vnics(curr_gw)) { + vnic_dbg_mark(); + queue_delayed_work(fip_wq, + &curr_gw->vnic_cleanup_task, HZ); + } + */ + } +} + +/* + * This function adds or removes a single host admined vnic to a GW. + * First the function searches for the vnic. The search function + * disregards vnics that are undergoing a complete flush. +*/ +int fip_gw_update_hadmin_gw(struct fip_gw_data *gw, + struct fip_hadmin_cache *hadmin_entry) +{ + struct fip_vnic_data *vnic; + int vnic_id = hadmin_entry->vnic_id, rc = 0; + + /* set bit 16 for hadmin vNics (by spec) */ + vnic_id |= (1 << (VNIC_ID_LEN - 1)); + + vnic = fip_vnic_find_in_list(gw, vnic_id, hadmin_entry->mac, + hadmin_entry->vlan, + hadmin_entry->vlan_used); + + /* remove: if vNic found - remove it and exit */ + if (hadmin_entry->remove) { + if (vnic) + fip_vnic_close(vnic, FIP_FULL_FLUSH); + else + vnic_dbg_fip(gw->discover->name, "vNic to remove is" + " not found (name:%s mac:"MAC_6_PRINT_FMT + " vlan:%d id:%d)\n", + hadmin_entry->interface_name, + MAC_6_PRINT_ARG(hadmin_entry->mac), + hadmin_entry->vlan, vnic_id); + goto out; + } + + /* add: if vNic found - report error, otherwise add new vNic */ + if (vnic) { + /* skip error reporting between child vNics conflict, + * as vnic_learn_mac() may learn same child while it's still + * pending. TODO: improve this to avoid such cases. + */ + if (hadmin_entry->parent_used && vnic->parent_used) + goto out; + vnic_warn(gw->discover->name, "vNic creation failed, duplicate" + " vNic detected (name:%s mac:"MAC_6_PRINT_FMT + " vlan:%d id:%d & existing name:%s mac:" + MAC_6_PRINT_FMT" vlan:%d id:%d)\n", + hadmin_entry->interface_name, + MAC_6_PRINT_ARG(hadmin_entry->mac), + hadmin_entry->vlan, vnic_id, vnic->interface_name, + MAC_6_PRINT_ARG(vnic->login_data.mac), + vnic->login_data.vlan, vnic->login_data.vnic_id); + goto out; + } + +#if 0 + /* if the GW is in all_vlan mode, + * the host can only create vlans in this mode. 
+ * However if it is not in all_vlan mode, the host must not create + * vlans in this mode */ + if ((gw->info.all_vlan_gw && !hadmin_entry->all_vlan_gw + && hadmin_entry->vlan_used) || + (!gw->info.all_vlan_gw && hadmin_entry->all_vlan_gw)) { + vnic_warn(gw->discover->name, "vnic creation failed, all_vlan" + " gateway policy must be enforced between the gateway" + " and the host\n"); + rc = -EINVAL; + goto out; + } +#endif + + vnic = fip_vnic_alloc(gw->discover->port, gw, 1 /* hadmin */, vnic_id); + if (!vnic) { + rc = -ENOMEM; + goto out; + } + + /* hand over info from hadmin to vnic struct */ + memcpy(vnic->login_data.mac, hadmin_entry->mac, sizeof(vnic->login_data.mac)); + memcpy(vnic->interface_name, hadmin_entry->interface_name, + sizeof(vnic->interface_name)); + vnic->login_data.vlan = hadmin_entry->vlan; + vnic->login_data.vp = hadmin_entry->vlan_used; + vnic->login_data.all_vlan_gw = hadmin_entry->all_vlan_gw; + memcpy(vnic->shared_vnic.ip, hadmin_entry->shared_vnic_ip, + sizeof(vnic->shared_vnic.ip)); + memcpy(vnic->shared_vnic.emac, hadmin_entry->shared_vnic_mac, + sizeof(vnic->shared_vnic.emac)); + vnic->shared_vnic.enabled = is_valid_ipv4(hadmin_entry->shared_vnic_ip); + vnic->vnic_id = vnic_id; /* will be overwritten later */ + vnic->vlan_used = hadmin_entry->vlan_used; + vnic->parent_used = hadmin_entry->parent_used; + memcpy(vnic->parent_name, hadmin_entry->parent_name, + sizeof(vnic->parent_name)); + vnic->qp_base_num = hadmin_entry->qp_base_num; + vnic->vlan = hadmin_entry->vlan; + vnic->cmd = hadmin_entry->cmd; + vnic->all_vlan_gw = hadmin_entry->all_vlan_gw; + + /* create dentry */ + rc = vnic_create_hadmin_dentry(vnic); + if (rc) + goto init_failed; + + rc = fip_vnic_hadmin_init(gw->discover->port, vnic); + if (rc) + goto init_failed; + + list_add_tail(&vnic->gw_vnics, &gw->vnic_list); + + /* calls fip_vnic_fsm() */ + fip_vnic_fsm(&vnic->vnic_task.work); + + return 0; + +init_failed: + vnic_delete_hadmin_dentry(vnic); + kfree(vnic); +out: + return rc; +} + +/* + * Queue the GW for deletion. And trigger a delayed call to the cleanup + * function. + * Note: This deletion method insures that all pending GW work requests + * are cleared without dependency of the calling context. +*/ +void fip_close_gw(struct fip_gw_data *gw, enum fip_flush flush) +{ + enum fip_flush tmp_flush = gw->hadmin_gw ? flush : FIP_FULL_FLUSH; + + if (tmp_flush == FIP_PARTIAL_FLUSH && gw->state < FIP_GW_HOST_ADMIN) + return; + + /* close already in process, disregard*/ + if (gw->flush >= tmp_flush) + return; + + gw->flush = tmp_flush; + gw->info.gw_num_vnics = 0; + cancel_delayed_work(&gw->gw_task); + + /* This is not mandatory but will save us time because there is a + * better chance that all vnics would be destroyed before trying to + * destroy the GW */ + fip_close_all_vnics(gw, tmp_flush); + + /* calls fip_purge_gws() */ + queue_delayed_work(fip_wq, &gw->discover->cleanup_task, DELAYED_WORK_CLEANUP_JIFFS); +} + +/* + * Free GW resources. This includes destroying the vnics. If the GW can be + * totally destroyed (no pending work for the GW and all the vnics have been + * destroyed) the GW will be removed from the GWs list and it's memory + * freed. If the GW can not be closed at this time it will not be freed + * and the function will return an error. + * In this case the caller needs to recall the unction to complete the + * operation. 
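+ * (fip_purge_gws() below is that caller: it requeues itself every
+ * DELAYED_WORK_CLEANUP_JIFFS jiffies for as long as fip_free_gw()
+ * keeps returning -EBUSY.)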
+ * Do not call this function directly use: fip_close_gw + */ +static int fip_free_gw(struct fip_discover *discover, struct fip_gw_data *gw) +{ + struct fip_vnic_data *vnic; + int vnic_close_fail = 0; + + gw->info.gw_num_vnics = 0; + + if (delayed_work_pending(&gw->gw_task)) + return -EBUSY; + + list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) + vnic_close_fail |= (vnic->flush != FIP_NO_FLUSH); + + /* true if vnics need to be closed */ + /* if some of the vnics are still open return and retry later */ + if (vnic_close_fail) + return -EBUSY; + + if (delayed_work_pending(&gw->vnic_cleanup_task)) + return -EBUSY; + + /* + * it is possible that during gw removal we added the GW again. Test GW + * list to ensure it is not in the list already before adding it again. + */ + if (gw->state > FIP_GW_HOST_ADMIN) { + if (gw->info.gw_prot_new) + discover->new_prot_gws--; + else + discover->old_prot_gws--; + } + if (gw->flush == FIP_PARTIAL_FLUSH) { + gw->state = FIP_GW_HOST_ADMIN; + gw->flush = FIP_NO_FLUSH; + } else { + list_del(&gw->list); + if (!IS_ERR(gw->pquery) && gw->query_id >= 0) + ib_sa_cancel_query(gw->query_id, gw->pquery); + wait_for_completion(&gw->query_comp); + kfree(gw); + } + return 0; +} + +/* + * permanently delete all GWs pending delete. The function goes over + * the list of GWs awaiting deletion and tries to delete them. If the + * GW destructor returns an error value (currently busy) the function + * will requeue it self for another try. + */ +static void fip_purge_gws(struct work_struct *work) +{ + struct fip_discover *discover = + container_of(work, struct fip_discover, cleanup_task.work); + struct fip_gw_data *gw, *tmp_gw; + int gw_close_fail = 0; + + down_write(&discover->l_rwsem); + list_for_each_entry_safe(gw, tmp_gw, &discover->gw_list, list) { + if (gw->flush != FIP_NO_FLUSH) { + gw_close_fail |= fip_free_gw(discover, gw); + } + } + up_write(&discover->l_rwsem); + + /* This means we still have vnics that refuse to close, retry later */ + if (gw_close_fail) { + vnic_dbg_fip(discover->name, "still have open GWs\n"); + /* calls fip_purge_gws() */ + queue_delayed_work(fip_wq, &discover->cleanup_task, + DELAYED_WORK_CLEANUP_JIFFS); + } else { + vnic_dbg_fip(discover->name, "fip_purge_gws all gws" + " closed and freed\n"); + } +} + +static int fip_free_gw_done(struct fip_discover *discover, enum fip_flush flush) +{ + struct fip_gw_data *curr_gw; + int rc; + + down_read(&discover->l_rwsem); + if (flush == FIP_FULL_FLUSH) { + rc = list_empty(&discover->gw_list); + up_read(&discover->l_rwsem); + return rc; + } + + list_for_each_entry(curr_gw, &discover->gw_list, list) { + if (curr_gw->flush != FIP_NO_FLUSH) { + up_read(&discover->l_rwsem); + return 0; + } + } + + up_read(&discover->l_rwsem); + return 1; +} + +/* + * Go over the GW list and try to close the GWs. It is possible that some + * of the GWs have pending work and therefore can not be closed. We can not + * sleep on this because we might be running on the same context as the one + * we are waiting for. 
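+ * Instead, freeing completes by polling, roughly as follows (a sketch;
+ * the poll interval is arbitrary):
+ *
+ *	fip_free_gw_list(discover, flush);
+ *	while (!fip_free_gw_done(discover, flush))
+ *		msleep(10);	/* must not run on fip_wq itself */
+ *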
The user should call this function once and then test + * if the free is done by polling (must release wq context) fip_free_gw_done + */ +static int fip_free_gw_list(struct fip_discover *discover, enum fip_flush flush) +{ + struct fip_gw_data *curr_gw; + + down_read(&discover->l_rwsem); + list_for_each_entry(curr_gw, &discover->gw_list, list) + fip_close_gw(curr_gw, flush); + up_read(&discover->l_rwsem); + + vnic_dbg_fip(discover->name, "fip_free_gw_list not done\n"); + return 0; +} + +static inline void update_gw_address(struct fip_gw_data *gw, + struct fip_gw_data_info *new_gw_data) +{ + gw->info.gw_qpn = new_gw_data->gw_qpn; + gw->info.gw_lid = new_gw_data->gw_lid; + gw->info.gw_port_id = new_gw_data->gw_port_id; + gw->info.gw_sl = new_gw_data->gw_sl; + memcpy(gw->info.gw_guid, new_gw_data->gw_guid, sizeof gw->info.gw_guid); + + vnic_dbg_fip(gw->discover->name, "GW address was modified. " + "QPN: 0x%x, LID: 0x%x, guid: " GUID_FORMAT + "port id: %d, SL: %d\n", gw->info.gw_qpn, + gw->info.gw_lid, GUID_ARG(gw->info.gw_guid), + gw->info.gw_port_id, gw->info.gw_sl); + /* restart fsm to path query */ + if (vnic_sa_query) + fip_discover_gw_fsm_move(gw, FIP_GW_CTRL_PATH_QUERY); +} + +int fip_gw_modified(struct fip_gw_data *gw, + struct fip_gw_data_info *new_gw_data) +{ + char *name = gw->discover->name; + ASSERT(new_gw_data); + + vnic_dbg_fip(name, "fip_gw_modified called, gw_num_vnics %d -> %d\n", + gw->info.gw_num_vnics, new_gw_data->gw_num_vnics); + + if (memcmp(gw->info.gw_guid, new_gw_data->gw_guid, + sizeof(gw->info.gw_guid)) || + gw->info.gw_lid != new_gw_data->gw_lid || + gw->info.gw_port_id != new_gw_data->gw_port_id || + gw->info.gw_qpn != new_gw_data->gw_qpn || + (!vnic_sa_query && gw->info.gw_sl != new_gw_data->gw_sl)) { + /* TODO: Make sure that the GW doesn't change the sl sent in solicitation */ + /* In this case the GW address might be modified even + in 'good flow' */ + if (gw->info.gw_type == GW_TYPE_LAG && + gw->info.ext_lag.ucast) + update_gw_address(gw, new_gw_data); + else { + vnic_dbg_fip(name, "fip_gw_modified changing " + "unsupported parameter closing GW\n"); + fip_close_gw(gw, FIP_PARTIAL_FLUSH); + } + } else if (gw->info.gw_num_vnics < new_gw_data->gw_num_vnics) { + vnic_dbg_fip(name, "fip_gw_modified changing num " + "vnics from %d to %d\n", gw->info.gw_num_vnics, + new_gw_data->gw_num_vnics); + gw->info.gw_num_vnics = new_gw_data->gw_num_vnics; + if (fip_gw_create_vnics(gw)) + vnic_err(name, "fip_gw_create_vnics failed\n"); + + } else if (gw->info.gw_num_vnics > new_gw_data->gw_num_vnics) { + gw->info.gw_num_vnics = new_gw_data->gw_num_vnics; + fip_gw_close_nonopen_vnics(gw); + if (gw->vnic_count < gw->info.gw_num_vnics) + fip_gw_create_vnics(gw); + vnic_dbg_fip(name, "fip_gw_modified changing num " + "vnics from %d to %d\n", gw->info.gw_num_vnics, + new_gw_data->gw_num_vnics); + } else if (gw->info.n_rss_qpn != new_gw_data->n_rss_qpn) { + gw->info.n_rss_qpn = new_gw_data->n_rss_qpn; + vnic_dbg_fip(name, "fip_gw_modified changing n_rss_qpn " + "from %d to %d\n", gw->info.n_rss_qpn, + new_gw_data->n_rss_qpn); + } else if (gw->info.hadmined_en != new_gw_data->hadmined_en) { + if (fip_gw_create_vnics(gw)) + vnic_err(name, "fip_gw_create_vnics failed\n"); + } + + return 0; +} + +static inline int is_none_zero_guid(u8 *guid) +{ + int i; + u8 ored = 0; + + if (!guid) + return 0; + + for (i = 0; i < 8; ++i) + ored |= guid[i]; + + return !!ored; +} + +/* + * Look for a GW in the GW list. 
+ * The search need one identifier to identify the Box (either GUID or system name) + * and one identifier for the external port (port_id or eport_name). + * This function uses what ever data is available for the search since + * various callers do not have access to a single pair of ids. + * use NULL for unknown strings and GW_PORT_ID_UNKNOWN for unknown port_id. + * GW that are undergoing complete flush are disregarded by the search. + */ +struct fip_gw_data *fip_find_gw_in_list( + struct fip_discover *discover, + int port_id, + u8 *eport_name, + u8 *gw_guid, + u8 *system_guid, + u8 *system_name, + int is_login) +{ + struct fip_gw_data *curr_gw; + int use_guid = is_none_zero_guid(gw_guid); + int use_system_name = system_name && strlen(system_name) > 0; + int use_system_guid = is_none_zero_guid(system_guid); + int use_eport = eport_name && strlen(eport_name) > 0; + int use_port_id = port_id >= 0; + int port_id_pass; + int eport_match; + + if(!((use_eport || use_port_id) && + (use_guid || use_system_name || use_system_guid))) { + vnic_dbg_fip_v(discover->name, + "fip_find_gw_in_list not enough param for search\n"); + return NULL; + } + + if (use_system_name) + vnic_dbg_fip_v(discover->name, "system name %s\n", system_name); + + if (use_guid) + vnic_dbg_fip_v(discover->name, "gw guid "VNIC_GUID_FMT"\n", + VNIC_GUID_RAW_ARG(gw_guid)); + + if (use_system_guid) + vnic_dbg_fip_v(discover->name, "system guid "VNIC_GUID_FMT"\n", + VNIC_GUID_RAW_ARG(system_guid)); + + if (use_eport) + vnic_dbg_fip_v(discover->name, "eport %s\n", eport_name); + + if (use_port_id) + vnic_dbg_fip_v(discover->name, "port_id 0x%x\n", port_id); + + down_read(&discover->l_rwsem); + list_for_each_entry(curr_gw, &discover->gw_list, list) { + vnic_dbg_fip_v(discover->name, "check gw on eport %s, gw_guid "VNIC_GUID_FMT" " + "system_guid "VNIC_GUID_FMT", flush %d\n", + curr_gw->info.vol_info.gw_port_name, + VNIC_GUID_RAW_ARG(curr_gw->info.gw_guid), + VNIC_GUID_RAW_ARG(curr_gw->info.vol_info.system_guid), + curr_gw->flush); + + if (curr_gw->flush == FIP_FULL_FLUSH) + continue; + + /* for login ack, skip non connected GWs */ + if (is_login && use_port_id && curr_gw->state == FIP_GW_HOST_ADMIN) /* skip dangling hadmined GWs */ + continue; + + /* use the eport names only if you don't have port_id indexes + * This is in order to enable port_id changes. 
+ * in case of host admin GW, ignore gw_port_id since the old GW + * will never be flushed and the new GW id can change */ + port_id_pass = use_port_id && (curr_gw->info.gw_port_id != (u16)-1) && !(curr_gw->hadmin_gw && use_eport); + eport_match = (use_eport && !port_id_pass && + !strncmp(curr_gw->info.vol_info.gw_port_name, + eport_name,VNIC_GW_PORT_NAME_LEN)) || + (port_id_pass && (port_id == curr_gw->info.gw_port_id)); + if (!eport_match) + continue; + + if (use_guid && !memcmp(curr_gw->info.gw_guid, gw_guid, GUID_LEN)) + goto found; + + if (use_system_guid && + !memcmp(curr_gw->info.vol_info.system_guid, + system_guid, GUID_LEN)) + goto found; + + if(use_system_name && + !strncmp(curr_gw->info.vol_info.system_name, system_name, + VNIC_SYSTEM_NAME_LEN)) + goto found; + } + + up_read(&discover->l_rwsem); + vnic_dbg_fip(discover->name, "gw not found!\n"); + return NULL; +found: + if (curr_gw->hadmin_gw && use_eport && use_port_id && + !strncmp(curr_gw->info.vol_info.gw_port_name,eport_name,VNIC_GW_PORT_NAME_LEN) && + curr_gw->info.gw_port_id != port_id) { + vnic_info("%s:["VNIC_GUID_FMT"] %s eport ID changed from %d to %d\n", + curr_gw->info.vol_info.system_name, + VNIC_GUID_RAW_ARG(curr_gw->info.vol_info.system_guid), + curr_gw->info.vol_info.gw_port_name, + curr_gw->info.gw_port_id, port_id); + } + + up_read(&discover->l_rwsem); + return curr_gw; +} + +/* + * Alloc and init a new GW struct + */ +static struct fip_gw_data *fip_discover_create_gw(struct fip_discover *discover) +{ + struct fip_gw_data *gw_data; + + gw_data = kzalloc(sizeof(struct fip_gw_data), GFP_KERNEL); + if (!gw_data) + goto out; + + INIT_DELAYED_WORK(&gw_data->gw_task, fip_discover_gw_fsm); + INIT_DELAYED_WORK(&gw_data->vnic_cleanup_task, fip_purge_vnics); + INIT_LIST_HEAD(&gw_data->vnic_list); + gw_data->discover = discover; + gw_data->pquery = ERR_PTR(-ENODATA); + gw_data->query_id = -1; + init_completion(&gw_data->query_comp); + complete(&gw_data->query_comp); + mutex_init(&gw_data->mlock); + +out: + return gw_data; +} + +static void fip_discover_hadmin_update(struct work_struct *work) +{ + struct fip_discover *discover = + container_of(work, struct fip_discover, + hadmin_update_task.work); + struct fip_hadmin_cache *hadmin_entry; + struct fip_hadmin_cache *hadmin_tmp; + struct fip_gw_data *curr_gw; + struct list_head hadmin_head; + char *name; + int flush, used_guid, rc; + + /* move list from hadmin_cache to a temporary list */ + spin_lock_irq(&discover->lock); + list_replace(&discover->hadmin_cache, &hadmin_head); + INIT_LIST_HEAD(&discover->hadmin_cache); + flush = discover->flush; + spin_unlock_irq(&discover->lock); + + if (flush != FIP_NO_FLUSH) + goto out; + + /* process hadmin list */ + list_for_each_entry_safe(hadmin_entry, hadmin_tmp, &hadmin_head, next) { + name = (char *)(hadmin_entry->interface_name); + vnic_dbg_mac(name, "parent_used %d, remove %d\n", + hadmin_entry->parent_used, + hadmin_entry->remove); + if (hadmin_entry->parent_used) { + rc = vnic_parent_update(discover->port, hadmin_entry->interface_name, + hadmin_entry->vnic_id, hadmin_entry->mac, + &(hadmin_entry->qp_base_num), + hadmin_entry->parent_name, + hadmin_entry->remove); + if (rc) + continue; + } + + used_guid = is_valid_guid(hadmin_entry->system_guid); + curr_gw = fip_find_gw_in_list(discover, NOT_AVAILABLE_NUM, + hadmin_entry->eport_name, + NULL, + used_guid ? hadmin_entry->system_guid : NULL, + used_guid ? 
NULL : hadmin_entry->system_name, 0/* is_login */); + if (!hadmin_entry->remove) { + /* in case no GW or GW is being removed create a new one */ + if (!curr_gw || curr_gw->flush == FIP_FULL_FLUSH) { + curr_gw = fip_discover_create_gw(discover); + if (!curr_gw) { + vnic_warn(discover->name, "failed to create hadmin GW\n"); + continue; + } else { + down_write(&discover->l_rwsem); + list_add_tail(&curr_gw->list, &discover->gw_list); + up_write(&discover->l_rwsem); + } + + memcpy(curr_gw->info.vol_info.system_guid, + hadmin_entry->system_guid, GUID_LEN); + memcpy(curr_gw->info.vol_info.gw_port_name, + hadmin_entry->eport_name, + VNIC_GW_PORT_NAME_LEN); + if (used_guid) + strcpy(curr_gw->info.vol_info.system_name, + NOT_AVAILABLE_STRING); + else + memcpy(curr_gw->info.vol_info.system_name, + hadmin_entry->system_name, + VNIC_SYSTEM_NAME_LEN); + + curr_gw->info.gw_port_id = hadmin_entry->gw_port_id; + curr_gw->state = FIP_GW_HOST_ADMIN; + } + + curr_gw->hadmin_gw = 1; + fip_gw_update_hadmin_gw(curr_gw, hadmin_entry); + } else if(curr_gw) + fip_gw_update_hadmin_gw(curr_gw, hadmin_entry); + + list_del(&hadmin_entry->next); + kfree(hadmin_entry); + } + +out: + /* flush hadmin_tmp list and exit */ + list_for_each_entry_safe(hadmin_entry, hadmin_tmp, &hadmin_head, next) + kfree(hadmin_entry); +} + +static const char *gw_state_to_str(enum fip_gw_state state) +{ + switch (state) { + case FIP_GW_CONNECTED: + return "FIP_GW_CONNECTED"; + case FIP_GW_CTRL_PATH_QUERY: + return "FIP_GW_CTRL_PATH_QUERY"; + case FIP_GW_DATA_PATH_QUERY: + return "FIP_GW_DATA_PATH_QUERY"; + case FIP_GW_HOST_ADMIN: + return "FIP_GW_HOST_ADMIN"; + case FIP_GW_SEND_SOLICIT: + return "FIP_GW_SEND_SOLICIT"; + default: + return "UNKNOWN"; + } +} + +int fip_gw_sysfs_show(struct vnic_port *port, char *buf) +{ + struct fip_gw_data *gw; + char *p = buf; + struct fip_discover *discover; + + mutex_lock(&port->start_stop_lock); + list_for_each_entry(discover, &port->fip.discover_list, discover_list) { + + down_read(&discover->l_rwsem); + + list_for_each_entry(gw, &discover->gw_list, list) { + p += _sprintf(p, buf, "IOA_PORT %s:%d\n", + gw->discover->port->dev->ca->name, + gw->discover->port->num); + p += _sprintf(p, buf, "BX_NAME %s\n", + gw->info.vol_info.system_name); + if (!(*(u64 *)(gw->info.vol_info.system_guid))) + p += _sprintf(p, buf, "BX_GUID %s\n", NOT_AVAILABLE_STRING); + else + p += _sprintf(p, buf, "BX_GUID "VNIC_GUID_FMT"\n", + VNIC_GUID_RAW_ARG(gw->info.vol_info.system_guid)); + p += _sprintf(p, buf, "EPORT_NAME %s\n", gw->info.vol_info.gw_port_name); + p += _sprintf(p, buf, "EPORT_ID %u\n", gw->info.gw_port_id); + p += _sprintf(p, buf, "STATE %s\n", gw_state_to_str(gw->state)); + p += _sprintf(p, buf, "GW_TYPE %s\n", gw->info.gw_type == GW_TYPE_LAG ? + "AGGREGATED" : "LEGACY"); + p += _sprintf(p, buf, "PKEY 0x%x\n", discover->pkey); + p += _sprintf(p, buf, "ALL_VLAN %s\n", + gw->state == FIP_GW_CONNECTED ? + (gw->info.all_vlan_gw ? 
"yes" : "no") : NOT_AVAILABLE_STRING); + p += _sprintf(p, buf, "CTRL_SL %d\n", gw->ctrl_prec.sl); + p += _sprintf(p, buf, "DATA_SL %d\n", gw->data_prec.sl); + p += _sprintf(p, buf, "\n"); + } + + up_read(&discover->l_rwsem); + } + + mutex_unlock(&port->start_stop_lock); + return (p - buf); +} + +static int fip_discover_rx_advertise_bh(struct fip_discover *discover, + struct fip_gw_data *advertise_data) +{ + struct fip_gw_data *gw_data; + int update_entry = 0; + + /* see if we received advertise packets from this GW before */ + gw_data = fip_find_gw_in_list(discover, + advertise_data->info.gw_port_id, + advertise_data->info.vol_info.gw_port_name, + advertise_data->info.gw_guid, + advertise_data->info.vol_info.system_guid, + advertise_data->info.vol_info.system_name, 0/* is_login */); + + /* + * GW not found in GW list. Create a new GW structure + * and add it to the GW list. + */ + if (!gw_data) { + gw_data = fip_discover_create_gw(discover); + if (!gw_data) { + vnic_dbg_fip(discover->name, "Could not create gw\n"); + return -ENOMEM; + } + gw_data->keep_alive_jiffies = jiffies; + + down_write(&discover->l_rwsem); + list_add_tail(&gw_data->list, &discover->gw_list); + up_write(&discover->l_rwsem); + update_entry = 1; + } else { + gw_data->keep_alive_jiffies = jiffies; + vnic_dbg_fip(discover->name, "gw_data->flush %d\n", gw_data->flush); + if (gw_data->flush != FIP_NO_FLUSH) + return 0; + + if (gw_data->state <= FIP_GW_SEND_SOLICIT) + update_entry = 1; + } + + /* If GW is in multicast state (based on received mcast packet), + * replace it with the newer up-to-date packet info. + */ + if (update_entry) { + if (gw_data->state < FIP_GW_CTRL_PATH_QUERY) { + down_write(&discover->l_rwsem); + if (advertise_data->info.gw_prot_new) + discover->new_prot_gws++; + else + discover->old_prot_gws++; + up_write(&discover->l_rwsem); + } + memcpy(&gw_data->info, &advertise_data->info, + sizeof(struct fip_gw_data_info)); + if (gw_data->state < FIP_GW_SEND_SOLICIT) + gw_data->state = vnic_sa_query? FIP_GW_CTRL_PATH_QUERY : FIP_GW_SEND_SOLICIT; + } else { + /* If the pc_id in the adv doesn't match the one + saved - there was a power cycle, so we want to close + the GW */ + if (advertise_data->info.ext_pc_id.valid && + (advertise_data->info.ext_pc_id.power_cycle_id != + gw_data->info.ext_pc_id.power_cycle_id)) { + vnic_dbg_fip_p0(discover->name, "received advertisement with " + "pc_id %llu when expecting %llu. closing the GW", + advertise_data->info.ext_pc_id.power_cycle_id, + gw_data->info.ext_pc_id.power_cycle_id); + fip_close_gw(gw_data, FIP_PARTIAL_FLUSH); + goto no_repost; + } + + /* TBD: enforce discard ?? */ + if (gw_data->info.gw_type != advertise_data->info.gw_type) + vnic_dbg_fip_p0(discover->name, "gateway type must not change\n"); + + /* update GW descriptors that do not require additional processing. 
+	   These will be updated as part of GW_MODIFY flow */
+		mutex_lock(&gw_data->mlock);
+		if (advertise_data->info.ext_pc_id.valid)
+			memcpy(&gw_data->info.ext_pc_id, &advertise_data->info.ext_pc_id,
+			       sizeof(gw_data->info.ext_pc_id));
+
+		memcpy(&gw_data->info.vol_info, &advertise_data->info.vol_info,
+		       sizeof(gw_data->info.vol_info));
+		if (gw_data->info.ext_lag.valid) {
+			gw_data->info.ext_lag.hash = advertise_data->info.ext_lag.hash;
+			gw_data->info.ext_lag.ca = advertise_data->info.ext_lag.ca;
+			gw_data->info.ext_lag.ca_thresh = advertise_data->info.ext_lag.ca_thresh;
+			gw_data->info.ext_lag.weights_policy = advertise_data->info.ext_lag.weights_policy;
+		}
+		mutex_unlock(&gw_data->mlock);
+	}
+
+	/* if multicast advertisement received */
+	if (advertise_data->info.flags & FIP_RCV_MULTICAST) {
+		vnic_dbg_fip(discover->name, "FIP_RCV_MULTICAST ADVERTISE, state %d\n",
+			     gw_data->state);
+		/* we are beyond accepting mcast advertisements */
+		if (gw_data->state > FIP_GW_SEND_SOLICIT)
+			goto out;
+
+		vnic_dbg_fip(discover->name, "received mcast advertise sending"
+			     " ucast solicit to GW qpn %d lid %d flags 0x%x\n",
+			     gw_data->info.gw_qpn, gw_data->info.gw_lid,
+			     gw_data->info.flags);
+	} else { /* unicast advertisement received */
+		int ack_received = advertise_data->info.flags & FIP_GW_AVAILABLE;
+
+		vnic_dbg_fip(discover->name, "received ucast advertise from GW "
+			     "qpn %d lid %d flags 0x%x, ack_received %s "
+			     "gw_num_vnics %d gw->state=%d, "VNIC_GUID_FMT"\n",
+			     gw_data->info.gw_qpn, gw_data->info.gw_lid,
+			     gw_data->info.flags, ack_received ? "yes" : "no",
+			     gw_data->info.gw_num_vnics, gw_data->state,
+			     VNIC_GUID_RAW_ARG(gw_data->info.gw_guid));
+
+		if (ack_received) {
+			/* if this is the first ACK received */
+			switch (gw_data->state) {
+			case FIP_GW_CTRL_PATH_QUERY:
+				/*
+				 * in case we are in FIP_GW_CTRL_PATH_QUERY we wait until it completes
+				 * to move us to FIP_GW_SEND_SOLICIT
+				 */
+				break;
+			case FIP_GW_SEND_SOLICIT:
+				/* in case we received an ack in this state we move to DATA_PATH_QUERY */
+				gw_data->state = vnic_sa_query ? FIP_GW_DATA_PATH_QUERY : FIP_GW_CONNECTED;
+				break;
+			case FIP_GW_CONNECTED:
+				/*
+				 * received an ACK and we are connected. We need to
+				 * check for changes in the GW and apply them if needed
+				 */
+				if (!fip_gw_modified(gw_data, &advertise_data->info))
+					gw_data->state = FIP_GW_CONNECTED;
+				goto no_repost;
+			default:
+				break;
+			}
+		} else /* !ack_received */ {
+			fip_close_gw(gw_data, FIP_PARTIAL_FLUSH);
+			goto no_repost;
+		}
+		/*
+		 * we don't accept ACKs in transient states.
+		 * This should not be a problem since a flood of multiple ACKs
+		 * is not an expected flow, and if the packets are similar
+		 * (no updates) it doesn't matter anyway.
+		 */
+	}
+
+out:
+	vnic_dbg_fip(discover->name, "out gw->state=%d\n", gw_data->state);
+	/*
+	 * we will call the GW FSM to handle the new state
+	 */
+	cancel_delayed_work(&gw_data->gw_task);
+	fip_discover_gw_fsm(&gw_data->gw_task.work);
+no_repost:
+	return 0;
+}
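+/*
+ * For reference, a summary of the advertise-driven state progression
+ * implemented above (vnic_sa_query is the module parameter selecting
+ * whether SA path queries are performed):
+ *
+ *	mcast advertise          -> FIP_GW_CTRL_PATH_QUERY
+ *	                            (FIP_GW_SEND_SOLICIT if !vnic_sa_query)
+ *	ucast advertise with ACK -> FIP_GW_DATA_PATH_QUERY
+ *	                            (FIP_GW_CONNECTED if !vnic_sa_query)
+ *	ucast advertise, no ACK  -> fip_close_gw(gw, FIP_PARTIAL_FLUSH)
+ *	power_cycle_id mismatch  -> fip_close_gw(gw, FIP_PARTIAL_FLUSH)
+ */
+
+/*
+ * This function handles a single received packet that is expected to be
+ * a GW advertisement or a login ACK packet. The function first parses the
+ * packet, decides what the packet type is, and then validates the packet
+ * according to its type. This function runs in ka_wq task context.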
+ */
+void fip_discover_rx_packet(int *queue, struct fip_content *fc)
+{
+	*queue = 0;
+	switch (fc->fh->subcode) {
+	case FIP_GW_ADV_SUB_OPCODE:
+	case FIP_GW_LOGIN_SUB_OPCODE:
+		*queue = 1;
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * Print FIP syndrome number and string
+ */
+static void fip_print_syndrome(struct fip_vnic_data *vnic, int synd) {
+	char *syndstr;
+
+	switch (synd) {
+	case FIP_SYNDROM_HADMIN_REJECT:
+		syndstr = "FIP_SYNDROM_HADMIN_REJECT";
+		break;
+	case FIP_SYNDROM_GW_RESRC:
+		syndstr = "FIP_SYNDROM_GW_RESRC";
+		break;
+	case FIP_SYNDROM_NO_NADMIN:
+		syndstr = "FIP_SYNDROM_NO_NADMIN";
+		break;
+	case FIP_SYNDROM_UNRECOGNISED_HOST:
+		syndstr = "FIP_SYNDROM_UNRECOGNISED_HOST";
+		break;
+	case FIP_SYNDROM_UNSUPPORTED_PARAM:
+		syndstr = "FIP_SYNDROM_UNSUPPORTED_PARAM";
+		break;
+	case FIP_SYNDROM_GW_IS_LAG_MEMBER:
+		syndstr = "FIP_SYNDROM_GW_IS_LAG_MEMBER";
+		break;
+	case FIP_SYNDROM_DUPLICATE_ADDRESS:
+		syndstr = "FIP_SYNDROM_DUPLICATE_ADDRESS";
+		break;
+	default:
+		syndstr = "FIP_OTHER";
+	}
+
+	vnic_warn(vnic->name, "SYNDROME 0x%x: %s\n",
+		  synd, syndstr);
+}
+
+static void handle_login_packet(struct fip_discover *discover,
+				struct fip_login_data *login_data)
+{
+	struct fip_gw_data *gw;
+	struct fip_vnic_data *vnic;
+	int mac_vlan_refused = 0;
+	int synd;
+
+	/* find the GW that this login belongs to */
+	gw = fip_find_gw_in_list(discover,
+				 login_data->port_id,
+				 NULL,
+				 login_data->guid,
+				 NULL, NULL, 1/* is_login */);
+
+	if (!gw) {
+		vnic_warn(discover->name, "dropping login ack with vnic_id:%d mac:"MAC_6_PRINT_FMT
+			  " BX port_id:%d GUID: "VNIC_GUID_FMT", GW not found!\n",
+			  login_data->vnic_id,
+			  MAC_6_PRINT_ARG(login_data->mac),
+			  login_data->port_id,
+			  VNIC_GUID_RAW_ARG(login_data->guid));
+		return;
+	}
+
+	vnic = fip_vnic_find_in_list(gw, login_data->vnic_id,
+				     login_data->mac,
+				     login_data->vlan,
+				     login_data->vp);
+	if (!vnic) {
+		vnic_warn(discover->name, "dropping login ack with vnic_id:%d mac:"MAC_6_PRINT_FMT
+			  " BX port_id:%d GUID: "VNIC_GUID_FMT", vnic not found!\n",
+			  login_data->vnic_id,
+			  MAC_6_PRINT_ARG(login_data->mac),
+			  login_data->port_id,
+			  VNIC_GUID_RAW_ARG(login_data->guid));
+		return;
+	}
+
+	/*
+	 * For host administered vNICs we must have login and login ack
+	 * macs equal and different from all zeros. Login and login
+	 * ack must agree on vlan presence, and if a vlan is present, the
+	 * vlans must be identical. Otherwise, the request is rejected.
+	 */
+	if (vnic->hadmined) {
+		if (!IS_ZERO_MAC(vnic->login_data.mac) &&
+		    memcmp(vnic->login_data.mac, login_data->mac, ETH_ALEN)) {
+			vnic_dbg_fip(discover->name, "fip_discover_rx_packet"
+				     " host admined mac refused\n");
+			mac_vlan_refused = 1;
+		} else if (vnic->login_data.all_vlan_gw != login_data->all_vlan_gw)
+			vnic_dbg_fip(discover->name,
+				     "fip_discover_rx_packet"
+				     " host and GW disagree on all_vlan mode\n");
+		/* If the host is not working in all_vlan_gw policy -
+		   check the requested vlan against the accepted */
+		else if (!gw->info.all_vlan_gw &&
+			 (vnic->login_data.vp != login_data->vp ||
+			  (login_data->vp == 1 &&
+			   vnic->login_data.vlan != login_data->vlan))) {
+			vnic_dbg_fip(discover->name,
+				     "fip_discover_rx_packet host"
+				     " admined vlan refused\n");
+			mac_vlan_refused = 1;
+		}
+	}
+
+	/* process a login packet for the specific vnic */
+	synd = (int)login_data->syndrome;
+	if (synd || mac_vlan_refused) {
+		char *vnic_name = vnic->hadmined ?
+			(char *)vnic->interface_name : (char *)vnic->name;
+		/* print syndrome as long as backlog limit is not exceeded */
+		if (vnic->synd_backlog++ >= vnic_synd_backlog)
+			return;
+
+		vnic_warn(discover->name, "%s login failed "
+			  "(mac "MAC_6_PRINT_FMT" vlan %d) "
+			  "backlog %d/%d\n",
+			  vnic_name,
+			  MAC_6_PRINT_ARG(vnic->mac_cache),
+			  (vnic->vlan_used ? vnic->vlan : -1),
+			  vnic->synd_backlog, vnic_synd_backlog);
+
+		if (mac_vlan_refused)
+			vnic_warn(vnic->name, "MAC/VLAN refused\n");
+
+		fip_print_syndrome(vnic, synd);
+
+		if (synd == FIP_SYNDROM_UNRECOGNISED_HOST) {
+			vnic_info("%s %s sending ucast solicit to Gateway\n",
+				  discover->name, vnic_name);
+			if (fip_solicit_send(gw->discover,
+					     FIP_DISCOVER_UCAST,
+					     gw->info.gw_qpn,
+					     gw->info.gw_lid,
+					     vnic_gw_ctrl_sl(gw),
+					     gw->info.gw_prot_new))
+				vnic_warn(discover->name, "%s failed to send ucast solicit\n", vnic_name);
+		}
+	} else {
+		vnic->all_vlan_gw = !!((!vnic->hadmined && vnic->gw->info.all_vlan_gw) ||
+				       (vnic->hadmined && vnic->login_data.all_vlan_gw));
+		fip_vnic_login_ack_recv(vnic, login_data);
+	}
+}
+
+/*
+ * This function handles a single received packet that is expected to be
+ * a GW advertisement or a login ACK packet. The function first parses the
+ * packet, decides what the packet type is, and then processes the packet
+ * according to its type. This function runs in task context.
+ */
+int fip_discover_rx_packet_bh(struct fip_discover *discover, struct fip_content *fc)
+{
+	struct fip_gw_data *advertise_data = NULL;
+	struct fip_login_data *login_data = NULL;
+	int rc;
+	int ret = 0;
+
+	switch (fc->fh->subcode) {
+	case FIP_GW_ADV_SUB_OPCODE:
+		advertise_data = kzalloc(sizeof *advertise_data, GFP_KERNEL);
+		if (!advertise_data) {
+			vnic_warn(discover->name,
+				  "Failed to allocate %zu bytes",
+				  sizeof *advertise_data);
+			return -ENOMEM;
+		}
+
+		rc = fip_advertise_parse_bh(discover, fc, advertise_data);
+		if (!rc)
+			ret = fip_discover_rx_advertise_bh(discover,
+							   advertise_data);
+		kfree(advertise_data);
+		break;
+
+	case FIP_GW_LOGIN_SUB_OPCODE:
+		login_data = kzalloc(sizeof *login_data, GFP_KERNEL);
+		if (!login_data) {
+			vnic_warn(discover->name,
+				  "Failed to allocate %zu bytes",
+				  sizeof *login_data);
+			return -ENOMEM;
+		}
+
+		rc = fip_login_parse(discover, fc, login_data);
+		if (!rc)
+			handle_login_packet(discover, login_data);
+
+		kfree(login_data);
+		break;
+	default:
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * This function is a callback called upon successful join to a
+ * multicast group. The function checks if we have joined + attached
+ * to all required mcast groups and if so moves the discovery FSM to solicit.
+ */
+static void fip_discover_mcast_connect_cb(struct vnic_mcast *mcaste, void *ctx)
+{
+	struct fip_discover *discover = mcaste->priv_data;
+
+	if (mcaste->cur_attached && mcaste->req_attach) {
+		vnic_dbg_parse(discover->name, "attached mask = 0x%lx, req mask = 0x%lx\n",
+			       *mcaste->cur_attached, *mcaste->req_attach);
+		if ((*mcaste->cur_attached & *mcaste->req_attach) !=
+		    *mcaste->req_attach) {
+			return;
+		}
+	}
+
+	discover->discover_mcast_attached_jiffies = jiffies;
+	set_bit(MCAST_ATTACHED, &discover->discover_mcast_state);
+	/* in the case of a reconnect don't change state or send a solicit
+	 * packet
+	 */
+	if (discover->state < FIP_DISCOVER_SOLICIT) {
+		vnic_dbg_fip(discover->name, "fip_multicast_connected moved"
+			     " state to solicit\n");
+		spin_lock_irq(&discover->lock);
+		if (discover->flush == FIP_NO_FLUSH) {
+			/* delay sending the solicit packet by 0-100 msec */
+			int rand_delay = jiffies % 100; /*get_random_int()*/
+			discover->state = FIP_DISCOVER_SOLICIT;
+			cancel_delayed_work(&discover->fsm_task);
+			/* This is really (rand_delay / 1000) * HZ */
+			/* calls fip_discover_fsm() */
+			queue_delayed_work(fip_wq, &discover->fsm_task,
+					   (rand_delay * HZ) / 1000);
+		}
+		spin_unlock_irq(&discover->lock);
+	}
+	vnic_dbg_fip(discover->name, "discover_mcast_connect_cb done\n");
+}
+
+/*
+ * This function is a callback called upon a mcast detach event.
+ * This event can be triggered due to discovery teardown or due to an async
+ * event. Currently this code does not participate in the discovery's FSM.
+ */
+void fip_discover_mcast_deattach_cb(struct vnic_mcast *mcast, void *ctx)
+{
+//	struct vnic_mcast *mcast_other = ctx;
+	struct fip_discover *discover = mcast->priv_data;
+
+	discover->discover_mcast_detached_jiffies = jiffies;
+	clear_bit(MCAST_ATTACHED, &discover->discover_mcast_state);
+
+	vnic_dbg_fip(NULL, "fip_discover_mcast_deattach_cb\n");
+}
+
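+/*
+ * Attach accounting, for reference: both discovery mcast groups share the
+ * discover-wide req_attach/cur_attached bitmasks (see
+ * fip_discover_mcast_connect() below), so the connect callback above fires
+ * once per join but only advances the FSM when every requested group is
+ * attached:
+ *
+ *	req_attach   = FIP_MCAST_DISCOVER | FIP_MCAST_SOLICIT
+ *	cur_attached = FIP_MCAST_DISCOVER                      -> keep waiting
+ *	cur_attached = FIP_MCAST_DISCOVER | FIP_MCAST_SOLICIT  -> move to solicit
+ */
+
+/*
+ * Try to connect to the relevant mcast groups. If one of the mcast joins
+ * failed, the function should be recalled to try and complete the join
+ * process (for the mcast groups whose join was not performed).
+ * Note: a successful return of vnic_mcast_join means that the mcast join
+ * started, not that the join completed. Completion of the connection process
+ * is asynchronous and uses a supplied callback.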
+ */ +static int fip_discover_mcast_connect(struct fip_discover *discover) +{ + struct vnic_mcast *mcaste_disc, *mcaste_sol, *mcaste; + int rc; + + mcaste_disc = vnic_mcast_alloc(discover->port, &discover->req_attach, &discover->cur_attached); + if (IS_ERR(mcaste_disc)) + return -EINVAL; + + mcaste_sol = vnic_mcast_alloc(discover->port, &discover->req_attach, &discover->cur_attached); + if (IS_ERR(mcaste_sol)) { + vnic_mcast_dealloc(mcaste_disc); + return -EINVAL; + } + + set_bit(FIP_MCAST_DISCOVER, &discover->req_attach); + set_bit(FIP_MCAST_SOLICIT, &discover->req_attach); + + mcaste = mcaste_disc; + mcaste->priv_data = discover; + mcaste->attach_bit_nr = FIP_MCAST_DISCOVER; + memcpy(mcaste->mac, ETH_BCAST_MAC, ETH_ALEN); + memcpy(&mcaste->gid, fip_discover_mgid, GID_LEN); + if (discover->pkey != 0xffff) + *(u16 *)&mcaste->gid.raw[6] = htons(discover->pkey | 0x8000); + memcpy(&mcaste->port_gid, &mcaste->gid, GID_LEN); + mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC); + mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC; + mcaste->retry = VNIC_MCAST_ULIMIT_RETRY; + mcaste->attach_cb = fip_discover_mcast_connect_cb; + mcaste->detach_cb = fip_discover_mcast_deattach_cb; + mcaste->attach_cb_ctx = mcaste_sol; + mcaste->detach_cb_ctx = mcaste_sol; + mcaste->pkey = discover->pkey; + mcaste->qkey = VNIC_FIP_QKEY; + mcaste->qp = discover->qp; + mcaste->blocking = 0; + mcaste->join_state = 1; + rc = vnic_mcast_add(&discover->mcast_tree, mcaste); + ASSERT(!rc); + rc = vnic_mcast_attach(&discover->mcast_tree, mcaste); /* MCAST_RECEIVE_ONLY */ + ASSERT(!rc); + + mcaste = mcaste_sol; + mcaste->priv_data = discover; + mcaste->attach_bit_nr = FIP_MCAST_SOLICIT; + memcpy(mcaste->mac, ETH_BCAST_MAC, ETH_ALEN); + memcpy(&mcaste->gid, fip_solicit_mgid, GID_LEN); + if (discover->pkey != 0xffff) + *(u16 *)&mcaste->gid.raw[6] = htons(discover->pkey | 0x8000); + memcpy(&mcaste->port_gid, &mcaste->gid, GID_LEN); + mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC); + mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC; + mcaste->retry = VNIC_MCAST_ULIMIT_RETRY; + mcaste->attach_cb = fip_discover_mcast_connect_cb; + mcaste->detach_cb = fip_discover_mcast_deattach_cb; + mcaste->attach_cb_ctx = mcaste_disc; + mcaste->detach_cb_ctx = mcaste_disc; + mcaste->pkey = discover->pkey; + mcaste->qkey = VNIC_FIP_QKEY; + mcaste->qp = discover->qp; + mcaste->blocking = 0; + mcaste->join_state = 1; + mcaste->sender_only = 1; + rc = vnic_mcast_add(&discover->mcast_tree, mcaste); + ASSERT(!rc); + rc = vnic_mcast_attach(&discover->mcast_tree, mcaste); /* MCAST_SEND_ONLY */ + ASSERT(!rc); + + return 0; +} + +int fip_discover_mcast_reattach(struct fip_discover *discover, + struct vnic_port *port) +{ + int flush; + + spin_lock_irq(&discover->lock); + flush = discover->flush; + spin_unlock_irq(&discover->lock); + + if (flush == FIP_NO_FLUSH && + discover->state > FIP_DISCOVER_INIT) { + vnic_tree_mcast_detach(&discover->mcast_tree); + vnic_tree_mcast_attach(&discover->mcast_tree); + } + return 0; +} + +static void fip_discover_ctrl_path_query_complete( + int status, + struct ib_sa_path_rec *pathrec, + void *context) +{ + struct fip_gw_data *gw = context; + vnic_dbg_fip_p0(gw->discover->name, "fip ctrl path query complete status=%d\n", status); + if (!status) { + vnic_dbg_fip_p0(gw->discover->name, "fip ctrl path query success srcgid:"VNIC_GUID_FMT" dgid:"VNIC_GUID_FMT"\n", + VNIC_GUID_RAW_ARG(pathrec->sgid.raw+8), + VNIC_GUID_RAW_ARG(pathrec->dgid.raw+8)); + gw->ctrl_prec = *pathrec; + fip_discover_gw_fsm_move(gw, 
FIP_GW_SEND_SOLICIT); + } else { + vnic_dbg_fip_p0(gw->discover->name, "fip ctrl path query FAILED ret=%d\n", status); + gw->query_id = -1; /* this will cause a retry */ + } + complete(&gw->query_comp); +} + +static void fip_discover_data_path_query_complete( + int status, + struct ib_sa_path_rec *pathrec, + void *context) +{ + struct fip_gw_data *gw = context; + vnic_dbg_fip_p0(gw->discover->name, "fip data path query complete status=%d\n", status); + if (!status) { + struct ib_sa_path_rec old_pathrec; + struct fip_vnic_data *vnic; + vnic_dbg_fip_p0(gw->discover->name, "fip data path query success srcgid:"VNIC_GUID_FMT" dgid:"VNIC_GUID_FMT"\n", + VNIC_GUID_RAW_ARG(pathrec->sgid.raw+8), + VNIC_GUID_RAW_ARG(pathrec->dgid.raw+8)); + old_pathrec = gw->data_prec; + gw->data_prec = *pathrec; + if (old_pathrec.sl != gw->data_prec.sl) { + /* in case of SL change close the vnic to relogin with the new SL */ + vnic_info("[%s] %s %s Data SL changed from %d to %d\n", + gw->info.vol_info.system_name, + gw->discover->port->name, + gw->info.vol_info.gw_port_name, + old_pathrec.sl, gw->data_prec.sl); + list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) { + if (vnic->flush != FIP_FULL_FLUSH && vnic->state >= FIP_VNIC_LOGIN) + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + } + } + fip_discover_gw_fsm_move(gw, FIP_GW_CONNECTED); + } else { + vnic_dbg_fip_p0(gw->discover->name, "fip data path query FAILED ret=%d\n", status); + gw->query_id = -1; /* this will cause a retry */ + } + complete(&gw->query_comp); +} + +static int fip_discover_path_query(struct fip_gw_data *gw, int is_data_sl) +{ + ib_sa_comp_mask comp_mask; + struct ib_sa_path_rec p_rec; + void(*callback)(int status, struct ib_sa_path_rec *resp, void *context); + + vnic_dbg_fip_p0(gw->discover->name, "fip path query %d of GW lid:%d sl=%d GID:"VNIC_GUID_FMT" SID=%llx data_path=%d!\n", + gw->query_path_cnt, + gw->info.gw_lid, + gw->info.gw_sl, + VNIC_GUID_RAW_ARG(gw->info.gw_guid), + is_data_sl ? EOIB_SERVICE_ID : EOIB_CTRL_SERVICE_ID, + is_data_sl); + + comp_mask = IB_SA_PATH_REC_SERVICE_ID | + IB_SA_PATH_REC_DGID | + IB_SA_PATH_REC_SGID | + IB_SA_PATH_REC_REVERSIBLE | + IB_SA_PATH_REC_PKEY; + + callback = is_data_sl ? fip_discover_data_path_query_complete : fip_discover_ctrl_path_query_complete; + memset(&p_rec, 0, sizeof(p_rec)); + + p_rec.service_id = is_data_sl ? 
cpu_to_be64(EOIB_SERVICE_ID) : cpu_to_be64(EOIB_CTRL_SERVICE_ID); + p_rec.sgid = gw->discover->port->gid; + /* copy the subnet prefix from source gid */ + memcpy(p_rec.dgid.raw, p_rec.sgid.raw, 8); + /* copy gw dgid */ + memcpy(p_rec.dgid.raw+8, gw->info.gw_guid,8); + p_rec.pkey = cpu_to_be16(gw->discover->pkey); + p_rec.reversible = cpu_to_be32(1); + + if (gw->query_id >= 0 && !IS_ERR(gw->pquery) && gw->pquery) { + ib_sa_cancel_query(gw->query_id, gw->pquery); + return -1; /* retry later */ + } + + init_completion(&gw->query_comp); + gw->query_path_cnt++; + gw->query_id = -1; + gw->pquery = ERR_PTR(-ENODATA); + + gw->query_id = + ib_sa_path_rec_get(&vnic_sa_client, + gw->discover->port->dev->ca, + gw->discover->port->num, + &p_rec, + comp_mask, + 2000 /*TOUT*/, + GFP_KERNEL, + callback, + gw, + &gw->pquery); + if (gw->query_id < 0) { + complete(&gw->query_comp); + vnic_dbg_fip_p0(gw->discover->name, "ib_sa_path_rec_get failed, error %d\n", gw->query_id); + gw->pquery = ERR_PTR(-ENODATA); + } + return gw->query_id; +} + +void fip_discover_gw_fsm_move(struct fip_gw_data *gw, enum fip_gw_state state) +{ + cancel_delayed_work(&gw->gw_task); + if (gw->pquery && !IS_ERR(gw->pquery) && gw->query_id >= 0) + ib_sa_cancel_query(gw->query_id, gw->pquery); + + gw->state = state; + gw->query_id = -1; + gw->query_path_cnt = 0; + queue_delayed_work(fip_wq, &gw->gw_task, 0); +} + + +static void fip_discover_gw_fsm(struct work_struct *work) +{ + struct fip_gw_data *curr_gw = + container_of(work, struct fip_gw_data, gw_task.work); + unsigned long next_wakeup = curr_gw->info.gw_adv_period; + unsigned long rand = jiffies % 100 + 1; + int ret; + + if (curr_gw->flush != FIP_NO_FLUSH) + return; + + if (test_bit(MCAST_ATTACHED, + &curr_gw->discover->discover_mcast_state)) { + if (time_after(jiffies, curr_gw->keep_alive_jiffies + next_wakeup)) { + if (time_after(jiffies, + curr_gw->discover->discover_mcast_attached_jiffies + + next_wakeup)) { + fip_close_gw(curr_gw, FIP_PARTIAL_FLUSH); + return; + } + } + } else { + /* close gw if 1 minute has elapsed since mcast detach */ + if (time_after(jiffies, + curr_gw->discover->discover_mcast_detached_jiffies + + 60*HZ)) { + fip_close_gw(curr_gw, FIP_PARTIAL_FLUSH); + return; + } + } + + switch (curr_gw->state) { + case FIP_GW_HOST_ADMIN: + break; + case FIP_GW_CTRL_PATH_QUERY: + if (curr_gw->query_path_cnt && curr_gw->query_id >= 0) { + /* PATH query is running */ + next_wakeup = msecs_to_jiffies(100); + break; + } + ret = fip_discover_path_query(curr_gw, 0/*ctrl SL*/); + if (ret < 0) + vnic_dbg_fip_p0(curr_gw->discover->name, "Query ctrl path Failed : retry num %d ...\n", curr_gw->query_path_cnt); + next_wakeup = msecs_to_jiffies(100); + break; + + case FIP_GW_SEND_SOLICIT: + curr_gw->query_path_cnt = 0; + curr_gw->query_id = -1; + curr_gw->pquery = ERR_PTR(-ENODATA); + vnic_dbg_fip(curr_gw->discover->name, "DISCOVER_LOGIN FIP_GW_SEND_SOLICIT\n"); + vnic_dbg_parse(curr_gw->discover->name, "new protocol %d\n", curr_gw->info.gw_prot_new); + ret = fip_solicit_send(curr_gw->discover, FIP_DISCOVER_UCAST, + curr_gw->info.gw_qpn, + curr_gw->info.gw_lid, + vnic_gw_ctrl_sl(curr_gw), + curr_gw->info.gw_prot_new); + if (ret) + next_wakeup = (100 + rand * HZ) / 200; + else + next_wakeup = (100 + rand * HZ) / 25; + break; + + case FIP_GW_DATA_PATH_QUERY: + if (curr_gw->query_path_cnt && curr_gw->query_id >= 0) { + /* PATH query is running */ + next_wakeup = msecs_to_jiffies(100); + break; + } + ret = fip_discover_path_query(curr_gw, 1/*data SL*/); + if (ret < 0) + 
vnic_dbg_fip_p0(curr_gw->discover->name, "Query data path Failed : retry num %d ...\n", curr_gw->query_path_cnt); + next_wakeup = msecs_to_jiffies(100); + break; + + case FIP_GW_CONNECTED: + vnic_dbg_fip(curr_gw->discover->name, "DISCOVER_LOGIN: GW_CONNECTED!!!\n"); + /* test vnic status */ + fip_gw_create_vnics(curr_gw); + break; + default: + ASSERT(0); + break; + } + + /* go to sleep until time out. We expect that we will be awaken by + * RX packets and never get to wake up due to timeout + */ + cancel_delayed_work(&curr_gw->gw_task); + queue_delayed_work(fip_wq, &curr_gw->gw_task, next_wakeup); +} + +static int is_new_solicit_prot(struct fip_discover *discover) +{ + vnic_dbg_parse(discover->name, "new gw %d, old gw %d\n", + discover->new_prot_gws, discover->old_prot_gws); + + if (!discover->old_prot_gws) { + if (!discover->new_prot_gws) { + /* mcast solicit sent before any + * advertise packets arrive. Use old format. + */ + return 0; + } else + return 1; + } + return 0; +} + +/* + * This is the discover finite state machine that runs the + * advertise and solicit packet exchange of the discovery + * proccess. + * It is assumed that this function is only called from work queue + * task context (for locking) + */ +static void fip_discover_fsm(struct work_struct *work) +{ + struct fip_discover *discover = + container_of(work, struct fip_discover, fsm_task.work); + struct vnic_port *port = discover->port; + int recall_time = -1, flush = discover->flush; + + /* we got a flush request and we have not performed it yet */ + if ((flush != FIP_NO_FLUSH) && + discover->state != FIP_DISCOVER_OFF) { + vnic_dbg_fip(discover->name, "discover_fsm switching to OFF\n"); + + recall_time = DELAYED_WORK_CLEANUP_JIFFS * 2; + + + if (discover->state != FIP_DISCOVER_CLEAR) { + fip_free_gw_list(discover, flush); + discover->state = FIP_DISCOVER_CLEAR; + } + + /* if we open GWs we will test again later */ + if (!fip_free_gw_done(discover, flush)) { + vnic_dbg_fip(discover->name, "fip_free_gw_list not done, recalling \n"); + goto recall_fsm; + } + + if (delayed_work_pending(&discover->cleanup_task)) + goto recall_fsm; + + vnic_dbg_fip(discover->name, "fip_free_gw_list done \n"); + vnic_dbg_mark(); + vnic_mcast_del_all(&discover->mcast_tree); + vnic_dbg_mark(); + discover->state = FIP_DISCOVER_OFF; + + /* signal the unload to continue */ + complete(&discover->flush_complete); + return; + } + + if (discover->state == FIP_DISCOVER_OFF) + return; + + if (!port->attr.lid) { + recall_time = 1 * HZ; + goto recall_fsm; + } + + switch (discover->state) { + int new_prot; + + case FIP_DISCOVER_INIT: + vnic_dbg_fip(discover->name, "FIP_DISCOVER_INIT\n"); + /* in init try and join the discover multicast group + * This is a preliminary request for all other progress + * will eventually call fip_discover_mcast_connect_cb() + */ + if (fip_discover_mcast_connect(discover)) { + vnic_warn(discover->name, "fip_discover_mcast_connect() " + "failed\n"); + recall_time = 1 * HZ; + } + break; + + case FIP_DISCOVER_SOLICIT: + new_prot = is_new_solicit_prot(discover); + vnic_dbg_fip(discover->name, "DISCOVER_SOLICIT\n"); + + /* send multicast solicit of type fip, if send is + * successfull move to login state and await advertise + * packets. 
It TX fail then retry + */ + fip_solicit_send(discover, FIP_DISCOVER_MCAST, 0, 0, 0, new_prot); + recall_time = FIP_RESOLICIT_TIME * HZ; + + break; + + case FIP_DISCOVER_OFF: + default: + ASSERT(0); + break; + + } + +recall_fsm: + if (recall_time >= 0) + queue_delayed_work(fip_wq, &discover->fsm_task, recall_time); + + return; +} + diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_discover.h b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_discover.h new file mode 100644 index 0000000000000..52e11d359a6cf --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_discover.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _FIP_DISCOVER_H +#define _FIP_DISCOVER_H + +#include "vnic.h" +#include "vnic_fip.h" + +/* TODO - rethink this */ +#define FIP_UD_MTU(ib_mtu) (ib_mtu - FIP_ENCAP_LEN - FIP_ETH_HEADER_LEN) +#define FIP_UD_BUF_SIZE(ib_mtu) (ib_mtu + IB_GRH_BYTES) + +#define FIP_MAX_BACKOFF_SECONDS 16 +#define FIP_MAX_VNICS_PER_GW (1 << 9) + +#define FIP_TIMEOUT_FACTOR(a) ((a)*5/2) + +enum fip_gw_state { + FIP_GW_HOST_ADMIN, + FIP_GW_CTRL_PATH_QUERY, + FIP_GW_SEND_SOLICIT, /* got mcast advertise & ctrl path query. sending solicit */ + FIP_GW_DATA_PATH_QUERY, + FIP_GW_CONNECTED /* we are already connected. 
do nothing */ +}; + + +enum { + GW_TYPE_SINGLE_EPORT = 0, + GW_TYPE_LAG = 1, +}; + +struct gw_ext_boot { + int valid; + int boot_prio; + int timeout; +}; + +struct gw_ext_lag { + int valid; + int hash; /* enum gw_ext_lag_hash_policy */ + int weights_policy; + int member_ka; + int ca; /* conjestion aware */ + int ca_thresh; + int ucast; /* gw supports unicat keep alives */ +}; + + +struct gw_ext_pc_id { + int valid; + u64 power_cycle_id; +}; + +struct fip_gw_data_info { + struct fip_gw_volatile_info vol_info; + long gw_adv_period; /* timeout in jiffies */ + long gw_period; /* timeout in jiffies */ + long vnic_ka_period; /* in jiffies */ + int flags; + u32 gw_qpn; + u16 gw_lid; + u16 gw_port_id; + u16 gw_num_vnics; + u16 n_rss_qpn; + u8 gw_sl; /* GW ctrl SL */ + u8 hadmined_en; + u8 all_vlan_gw; + u8 gw_vendor_id[VNIC_VENDOR_LEN+1]; + u8 gw_guid[GUID_LEN]; + int gw_type; + int gw_prot_new; + int ext_mask; + struct gw_ext_boot ext_boot; + struct gw_ext_lag ext_lag; + struct gw_ext_pc_id ext_pc_id; +}; + +struct fip_gw_data { + enum fip_flush flush; + int hadmin_gw; + struct mutex mlock; + struct fip_discover *discover; + struct list_head list; + unsigned long keep_alive_jiffies; + enum fip_gw_state state; + int vnic_count; + struct list_head vnic_list; + struct delayed_work gw_task; + struct delayed_work vnic_cleanup_task; + struct fip_gw_data_info info; + unsigned long n_bitmask[(FIP_MAX_VNICS_PER_GW >> 3) / + sizeof(unsigned long)]; + + struct ib_sa_path_rec ctrl_prec; + struct ib_sa_path_rec data_prec; + struct ib_sa_query *pquery; + int query_path_cnt; + int query_id; + struct completion query_comp; +}; + +enum fip_gw_data_flags { + FIP_IS_FIP = 1 << 0, /* protocol type */ + FIP_RCV_MULTICAST = 1 << 1, /* received mcast packet */ + FIP_GW_AVAILABLE = 1 << 2, /* GW available bit set in pkt */ + FIP_HADMINED_VLAN = 1 << 3, /* H bit set in advertise pkt */ +}; + +static inline u8 vnic_gw_ctrl_sl(struct fip_gw_data *gw) +{ + return vnic_sa_query? gw->ctrl_prec.sl : gw->info.gw_sl; +} + +/* + * TODO - we can do a nicer job here. stage 2 + * allocates memory and post receives + */ +int fip_post_discovery_rcv(struct vnic_port *port, + int ring_size, struct ib_qp *qp, + struct fip_ring *rx_ring); + +int fip_discover_mcast_reattach(struct fip_discover *discover, + struct vnic_port *port); + +/* + * This function handles a single received packet that are expected to be + * GW advertisements or login ACK packets. The function first parses the + * packet and decides what is the packet type and then handles the packets + * specifically according to its type. This functions runs in task context. +*/ +void fip_discover_rx_packet(int *queue, struct fip_content *fc); +int fip_discover_rx_packet_bh(struct fip_discover *discover, struct fip_content *fc); + +/* + * This function is the RX packet handler entry point at the thread level + * (unlike the completion handler that runs from interrupt context). + * the function calls a handler function and then reallocats the ring + * entry for the next receive. 
+*/ +void fip_discover_process_rx(struct fip_discover *discover); +void fip_discover_process_rx_bh(struct work_struct *work); +void fip_discover_gw_fsm_move(struct fip_gw_data *gw, enum fip_gw_state state); + +/* This function creates an info string from GW attributes published + * by the GW in advertisement pkts */ +int fip_get_short_gw_info(struct fip_gw_data *gw, char *buff); + + +int fip_packet_parse(struct vnic_port *port, void *packet, int size, + struct fip_content *fc); + +#endif /* _FIP_DISCOVER_H */ diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_ib.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_ib.c new file mode 100644 index 0000000000000..ba630673777a1 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_ib.c @@ -0,0 +1,437 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_fip.h" + +#define FIP_OP_RECV (1ul << 31) +/* TODO - rethink this */ +#define FIP_UD_MTU(ib_mtu) (ib_mtu - FIP_ENCAP_LEN - FIP_ETH_HEADER_LEN) +#define FIP_UD_BUF_SIZE(ib_mtu) (ib_mtu + IB_GRH_BYTES) + +static inline void fip_wr_pepare(struct vnic_port *port, + struct ib_send_wr *tx_wr, + struct ib_sge *tx_sge, + unsigned int wr_id, u64 mapping, + int size, u16 pkey_index) +{ + /* This is a fixed part */ + memset(tx_wr, 0, sizeof(struct ib_send_wr)); + tx_wr->num_sge = 1; + tx_wr->sg_list = tx_sge; + tx_wr->opcode = IB_WR_SEND; + tx_wr->send_flags = IB_SEND_SIGNALED; + tx_wr->wr.ud.pkey_index = pkey_index; + tx_wr->wr_id = wr_id; + + memset(tx_sge, 0, sizeof(struct ib_sge)); + tx_sge->lkey = port->mr->lkey; + tx_sge->addr = mapping; + tx_sge->length = size; +} + +/* + * send a single multicast packet. + * return 0 on success, other on failure. 
+*/ +int fip_mcast_send(struct vnic_port *port, + struct ib_qp *qp, + unsigned int wr_id, + u64 mapping, + int size, + u16 pkey_index, + struct vnic_mcast *mcast) +{ + struct ib_send_wr *bad_wr; + struct ib_sge tx_sge; + struct ib_send_wr tx_wr; + int ret; + + fip_wr_pepare(port, &tx_wr, &tx_sge, wr_id, mapping, size, pkey_index); + + tx_wr.wr.ud.ah = mcast->ah; + tx_wr.wr.ud.remote_qpn = 0xFFFFFFFF; /*dest_qpn; */ + tx_wr.wr.ud.remote_qkey = mcast->qkey; + + ret = ib_post_send(qp, &tx_wr, &bad_wr); + + return ret; +} + +/* + * send a single unicast packet. + * return 0 on success, other on failure. + */ +int fip_ucast_send(struct vnic_port *port, + struct ib_ah *ah, + struct ib_qp *qp, + unsigned int wr_id, + u64 mapping, + int size, + u16 pkey_index, u32 dest_qpn, u16 dlid, + u32 qkey, u8 sl) +{ + struct ib_send_wr *bad_wr; + struct ib_ah *new_ah = NULL; + struct ib_sge tx_sge; + struct ib_send_wr tx_wr; + int ret; + + fip_wr_pepare(port, &tx_wr, &tx_sge, wr_id, mapping, size, pkey_index); + + if (!ah) { + struct ib_ah_attr ah_attr = { + .dlid = dlid, + .port_num = port->num, + .sl = sl & 0xf, + }; + + new_ah = ib_create_ah(port->pd, &ah_attr); + if (IS_ERR(new_ah)) + return -1; + + tx_wr.wr.ud.ah = new_ah; + } else + tx_wr.wr.ud.ah = ah; + + tx_wr.wr.ud.remote_qpn = dest_qpn; + tx_wr.wr.ud.remote_qkey = qkey; + + ret = ib_post_send(qp, &tx_wr, &bad_wr); + + if (new_ah) + ib_destroy_ah(new_ah); + + return ret; +} + +/* + * This is a general purpose CQ completion function that handles + * completions on RX and TX rings. It can serve all users that are + * using RX and TX rings. + * RX completions are destinguished from TX comp by the MSB that is set + * for RX and clear for TX. For RX, the memory is unmapped from the PCI, + * The head is incremented. For TX the memory is unmapped and then freed. + * The function returns the number of packets received. +*/ +int fip_comp(struct vnic_port *port, + struct ib_cq *cq, + struct fip_ring *rx_ring, + struct fip_ring *tx_ring, + char *name) +{ +#define FIP_DISCOVER_WC_COUNT 4 + struct ib_wc ibwc[FIP_DISCOVER_WC_COUNT]; + int wrid, n, i; + int mtu_size = FIP_UD_BUF_SIZE(port->max_mtu_enum); + int rx_count = 0; + struct ib_device *dev = port->dev->ca; + + do { + /* + * poll for up to FIP_DISCOVER_WC_COUNT in one request. + * returns the number of WC actually polled + */ + n = ib_poll_cq(cq, FIP_DISCOVER_WC_COUNT, ibwc); + for (i = 0; i < n; ++i) { + /* + * use a mask on the id to decide if this is a receive + * or transmit WC + */ + if (ibwc[i].wr_id & FIP_OP_RECV) { + wrid = ibwc[i].wr_id & ~FIP_OP_RECV; + + ib_dma_sync_single_for_cpu(dev, + rx_ring->ring[wrid].bus_addr, + mtu_size, + DMA_FROM_DEVICE); + + if (likely(ibwc[i].status == IB_WC_SUCCESS)) { + rx_ring->ring[wrid].length = + ibwc[i].byte_len; + rx_count++; + } else + rx_ring->ring[wrid].entry_posted = 0; + + rx_ring->head++; + } else { /* TX completion */ + unsigned long flags; + wrid = ibwc[i].wr_id; + + /* unmap and free transmitted packet */ + ib_dma_unmap_single(dev, + tx_ring->ring[wrid]. 
+						    bus_addr, tx_ring->ring[wrid].length,
+						    DMA_TO_DEVICE);
+
+				kfree(tx_ring->ring[wrid].mem);
+				tx_ring->ring[wrid].mem = NULL;
+				tx_ring->ring[wrid].length = 0;
+				spin_lock_irqsave(&tx_ring->head_tail_lock, flags);
+				tx_ring->tail++;
+				spin_unlock_irqrestore(&tx_ring->head_tail_lock, flags);
+			}
+		}
+	} while (n == FIP_DISCOVER_WC_COUNT);
+
+	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+
+	return rx_count;
+}
+
+/* configure a newly allocated QP and move it
+ * from reset->init->RTR->RTS
+ */
+int fip_init_qp(struct vnic_port *port, struct ib_qp *qp, u16 pkey_index, char *name)
+{
+	struct ib_qp_attr qp_attr;
+	int attr_mask;
+
+	qp_attr.qp_state = IB_QPS_INIT;
+	qp_attr.qkey = VNIC_FIP_QKEY;
+	qp_attr.port_num = port->num;
+	qp_attr.pkey_index = pkey_index;
+	attr_mask = IB_QP_QKEY | IB_QP_PORT | IB_QP_PKEY_INDEX | IB_QP_STATE;
+
+	if (ib_modify_qp(qp, &qp_attr, attr_mask))
+		goto out_fail;
+
+	qp_attr.qp_state = IB_QPS_RTR;
+	attr_mask &= ~IB_QP_PORT;
+	if (ib_modify_qp(qp, &qp_attr, attr_mask))
+		goto out_fail;
+
+	qp_attr.qp_state = IB_QPS_RTS;
+	qp_attr.sq_psn = 0;
+	attr_mask |= IB_QP_SQ_PSN;
+	attr_mask &= ~IB_QP_PKEY_INDEX;
+	if (ib_modify_qp(qp, &qp_attr, attr_mask))
+		goto out_fail;
+
+	return 0;
+
+out_fail:
+	qp_attr.qp_state = IB_QPS_RESET;
+	if (ib_modify_qp(qp, &qp_attr, IB_QP_STATE))
+		vnic_warn(name, "failed to modify QP to RESET state\n");
+
+	return -EINVAL;
+}
+
+void fip_qp_to_reset(struct ib_qp *qp, char *name)
+{
+	struct ib_qp_attr qp_attr;
+
+	qp_attr.qp_state = IB_QPS_RESET;
+	if (ib_modify_qp(qp, &qp_attr, IB_QP_STATE))
+		vnic_warn(name, "Failed to modify QP to RESET state\n");
+	return;
+}
+
+/*
+ * alloc a single buffer, map it and post it to the qp.
+ * id used to identify entry in receive queue.
+ */
+int fip_post_receive(struct vnic_port *port, struct ib_qp *qp, int size,
+		     int _id, struct fip_ring_entry *mem_entry, char *name)
+{
+	struct ib_recv_wr rx_wr, *bad_wr;
+	struct ib_sge rx_sge;
+	int rc;
+
+	rx_wr.wr_id = _id | FIP_OP_RECV;
+	rx_wr.next = NULL;
+	rx_wr.sg_list = &rx_sge;
+	rx_wr.num_sge = 1;
+	rx_sge.addr = mem_entry->bus_addr;
+	rx_sge.length = size;
+	rx_sge.lkey = port->mr->lkey;
+
+	ib_dma_sync_single_for_device(port->dev->ca, rx_sge.addr,
+				      FIP_UD_BUF_SIZE(port->max_mtu_enum),
+				      DMA_FROM_DEVICE);
+
+	rc = ib_post_recv(qp, &rx_wr, &bad_wr);
+	if (unlikely(rc)) {
+		vnic_warn(name, "post receive failed for buf id %d (rc %d)\n", _id, rc);
+		goto post_recv_failed;
+	}
+	mem_entry->entry_posted = 1;
+	return 0;
+
+post_recv_failed:
+	mem_entry->entry_posted = 0;
+	return -EIO;
+}
+
+void fip_flush_rings(struct vnic_port *port,
+		     struct ib_cq *cq,
+		     struct ib_qp *qp,
+		     struct fip_ring *rx_ring,
+		     struct fip_ring *tx_ring,
+		     char *name)
+{
+	vnic_dbg_fip(name, "fip_flush_rings called\n");
+	if (qp) {
+		fip_qp_to_reset(qp, name);
+		fip_comp(port, cq, rx_ring, tx_ring, name);
+	}
+}
+
+void fip_free_rings(struct vnic_port *port,
+		    struct fip_ring *rx_ring,
+		    struct fip_ring *tx_ring,
+		    char *name)
+{
+	struct ib_device *dev = port->dev->ca;
+	int i;
+
+	for (i = rx_ring->size - 1; i >= 0; --i) {
+		if (rx_ring->ring[i].mem) {
+			ib_dma_unmap_single(dev,
+					    rx_ring->ring[i].bus_addr,
+					    FIP_UD_BUF_SIZE(port->max_mtu_enum),
+					    DMA_FROM_DEVICE);
+			kfree(rx_ring->ring[i].mem);
+		}
+	}
+	rx_ring->size = 0;
+
+	for (i = tx_ring->size - 1; i >= 0; --i)
+		if (tx_ring->ring[i].length != 0) {
+			ib_dma_unmap_single(dev,
+					    tx_ring->ring[i].bus_addr,
+					    tx_ring->ring[i].length,
+					    DMA_TO_DEVICE);
+			kfree(tx_ring->ring[i].mem);
+		}
+	tx_ring->size = 0;
+
"Done cleaning RX and TX queues\n"); + + kfree(rx_ring->ring); + rx_ring->ring = NULL; + kfree(tx_ring->ring); + tx_ring->ring = NULL; +} + +/* + * TODO - we can do a nicer job here. stage 2 + * allocates memory and post receives + * TODO2: need to handle the bad flow to free all existing entries in the ring + */ +int fip_init_rx(struct vnic_port *port, + int ring_size, + struct ib_qp *qp, + struct fip_ring *rx_ring, + char *name) +{ + struct ib_device *dev = port->dev->ca; + int i, rc = 0, mtu_size = FIP_UD_BUF_SIZE(port->max_mtu_enum); + + rx_ring->size = ring_size; + rx_ring->ring = kzalloc(rx_ring->size * + sizeof(struct fip_ring_entry), + GFP_KERNEL); + if (!rx_ring->ring) { + vnic_warn(name, "failed to alloc fip RX ring, size %d\n", rx_ring->size); + rx_ring->size = 0; + return -ENOMEM; + } + + /* allocate the ring entries */ + for (i = 0; i < rx_ring->size; i++) { + rx_ring->ring[i].mem = kmalloc(mtu_size, GFP_KERNEL); + if (unlikely(!rx_ring->ring[i].mem)) { + rc = -ENOMEM; + goto error; + } + + rx_ring->ring[i].entry_posted = 0; + rx_ring->ring[i].length = mtu_size; + rx_ring->ring[i].bus_addr = ib_dma_map_single(dev, + rx_ring->ring[i].mem, + mtu_size, DMA_FROM_DEVICE); + if (unlikely(ib_dma_mapping_error(dev, rx_ring->ring[i].bus_addr))) { + rc = -ENODEV; + goto dma_error; + } + + if (fip_post_receive(port, qp, FIP_UD_BUF_SIZE(port->max_mtu_enum), + i, rx_ring->ring + i, name)) { + rc = -EIO; + goto post_recv_failed; + } + } + + rx_ring->head = 0; + rx_ring->tail = 0; + spin_lock_init(&rx_ring->head_tail_lock); + spin_lock_init(&rx_ring->ring_lock); + return 0; + +post_recv_failed: + ib_dma_unmap_single(dev, rx_ring->ring[i].bus_addr, + mtu_size, DMA_FROM_DEVICE); +dma_error: + kfree(rx_ring->ring[i].mem); + rx_ring->ring[i].mem = NULL; +error: + /* previous entries need to be freed after flushing the QP */ + return rc; +} + +/* + * This function allocates the tx buffers and initializes the head and + * tail indexes. + */ +int fip_init_tx(int size, struct fip_ring *tx_ring, char *name) +{ + tx_ring->size = size; + tx_ring->ring = kzalloc(tx_ring->size * + sizeof(struct fip_ring_entry), + GFP_KERNEL); + + if (!tx_ring->ring) { + vnic_warn(name, "failed to alloc fip TX ring, size %d\n", + tx_ring->size); + tx_ring->size = 0; + return -ENOMEM; + } + + tx_ring->head = 0; + tx_ring->tail = 0; + spin_lock_init(&tx_ring->head_tail_lock); + spin_lock_init(&tx_ring->ring_lock); + return 0; +} + diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_login.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_login.c new file mode 100644 index 0000000000000..55729f2ac0254 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_login.c @@ -0,0 +1,1752 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_fip.h"
+#include "vnic_fip_discover.h"
+#include "vnic_fip_pkt.h"
+
+#ifndef work_pending /* back-port */
+#define work_pending(_work) test_bit(0, &(_work)->pending)
+#endif
+
+enum {
+	VNIC_LOGIN_REG_NETDEV_PENDING,
+	VNIC_LOGIN_REG_NETDEV_DONE,
+	VNIC_LOGIN_DESTROY_PENDING,
+	VNIC_LOGIN_DESTROY_DONE,
+	VNIC_LOGIN_DESTROY_FULL
+};
+
+static int fip_vnic_rings_create(struct vnic_port *port,
+				 struct fip_vnic_data *vnic);
+static void fip_vnic_rings_destroy(struct fip_vnic_data *vnic);
+static void fip_vnic_recv(struct fip_vnic_data *vnic);
+
+#ifdef _BP_HR_TIMER
+int fip_vnic_keepalive(struct hrtimer *timer);
+#else
+enum hrtimer_restart fip_vnic_keepalive(struct hrtimer *timer);
+#endif
+int fip_vnic_keepalive_send(struct fip_vnic_data *vnic, int source);
+
+
+#define QUEUE_VNIC_DWORK(vnic, task, time)			\
+do {								\
+	unsigned long flags;					\
+	spin_lock_irqsave(&vnic->lock, flags);			\
+	if (likely(vnic->flush == FIP_NO_FLUSH))		\
+		queue_delayed_work(fip_wq, task, time);		\
+	spin_unlock_irqrestore(&vnic->lock, flags);		\
+} while (0)
+
+#define REQUEUE_VNIC_DWORK(vnic, task, time)			\
+do {								\
+	cancel_delayed_work(task);				\
+	QUEUE_VNIC_DWORK(vnic, task, time);			\
+} while (0)
+
+
+/*
+ * Look for a vnic in the GW vnic list. The search key used is either the
+ * vnic_id, which is unique, or the mac+vlan pair. A match on either key
+ * will result in the return of the vnic. Both keys are necessary because
+ * the host-assigned delete flow might not have access to the vnic_id. The
+ * search disregards vnics that are undergoing a full flush (they will be
+ * removed soon).
+ */
+struct fip_vnic_data *fip_vnic_find_in_list(struct fip_gw_data *gw, u16 vnic_id,
+					    u8 *mac, u16 vlan, u8 vlan_used)
+{
+	struct fip_vnic_data *vnic;
+	int use_mac = mac ? 1 : 0;
+	int vlan_match;
+
+	ASSERT(gw);
+
+	if (list_empty(&gw->vnic_list))
+		return NULL;
+
+	/* do not use MAC 0:..:0 for vnic matches */
+	if (use_mac)
+		use_mac = !IS_ZERO_MAC(mac);
+
+	list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) {
+		if (vnic->flush == FIP_FULL_FLUSH)
+			continue;
+
+		if (vnic->vnic_id == vnic_id)
+			return vnic;
+
+		if (vlan_used != vnic->login_data.vp)
+			continue;
+
+		vlan_match = !vlan_used ||
+			(vlan_used && (vlan == vnic->login_data.vlan));
+
+		if ((use_mac && !memcmp(vnic->login_data.mac, mac, ETH_ALEN)) &&
+		    vlan_match)
+			return vnic;
+	}
+	return NULL;
+}
+
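+/*
+ * For reference, this is how the login ack handler in vnic_fip_discover.c
+ * (handle_login_packet) performs this lookup, passing both keys:
+ *
+ *	vnic = fip_vnic_find_in_list(gw, login_data->vnic_id,
+ *				     login_data->mac, login_data->vlan,
+ *				     login_data->vp);
+ */
+
+/*
+ * This function handles completions of both TX and RX
+ * packets of vnics. RX packets are unmapped, lightly parsed, moved to a list
+ * and passed to thread processing. TX packets are unmapped and freed.
+ * Note: this function is called from interrupt context
+ */
+static void fip_vnic_comp(struct ib_cq *cq, void *vnic_ptr)
+{
+	struct fip_vnic_data *vnic = vnic_ptr;
+
+	/* handle completions.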
On RX packets this will call vnic_recv + * from thread context to continue processing */ + if (fip_comp(vnic->port, vnic->cq, &vnic->rx_ring, + &vnic->tx_ring, vnic->name)) + fip_vnic_recv(vnic); + + fip_vnic_keepalive_send(vnic, 0); +} + +/* + * read the state of the gw eport. This can be done from any context and therefore + * requires protection. +*/ +int fip_vnic_get_eport_state(struct fip_vnic_data *vnic) +{ + int i; + + if (no_bxm) + return 1; + + if (vnic->gw->info.gw_type == GW_TYPE_LAG) { + for (i = 0; i < MAX_LAG_MEMBERS; i++) { + if (!(vnic->lm.used_bitmask & 1 << i)) + continue; + + if (vnic->lm.memb[i].eport_state) + return 1; + } + return 0; + } else { + return atomic_read(&vnic->eport_state); + } +} + +/* + * get GW info funcs. +*/ +int fip_vnic_get_bx_name(struct fip_vnic_data *vnic, char *buff) +{ + struct fip_gw_data *gw = vnic->gw; + struct fip_gw_volatile_info tmp_info; + int rc; + + if (!gw) + return -EINVAL; + + mutex_lock(&gw->mlock); + memcpy(&tmp_info, &gw->info.vol_info, sizeof(tmp_info)); + mutex_unlock(&gw->mlock); + + rc = sprintf(buff, "%s", tmp_info.system_name); + + return rc < 0 ? rc : 0; +} + +int fip_vnic_get_bx_guid(struct fip_vnic_data *vnic, char *buff) +{ + struct fip_gw_data *gw = vnic->gw; + struct fip_gw_volatile_info tmp_info; + void *rc; + + memset(buff, 0, sizeof *buff); + + if (!gw) + return -EINVAL; + + mutex_lock(&gw->mlock); + memcpy(&tmp_info, &gw->info.vol_info, sizeof(tmp_info)); + mutex_unlock(&gw->mlock); + + rc = memcpy(buff, tmp_info.system_guid, GUID_LEN); + + return rc ? 0 : -EINVAL; +} + +int fip_vnic_get_all_vlan_mode(struct fip_vnic_data *vnic, char *buff) +{ + struct fip_gw_data *gw = vnic->gw; + int rc; + + if (!gw) + return -EINVAL; + + rc = sprintf(buff, "%s", gw->info.all_vlan_gw ? "yes" : "no"); + + return rc < 0 ? rc : 0; +} + +int fip_vnic_get_eport_name(struct fip_vnic_data *vnic, char *buff) +{ + + struct fip_gw_data *gw = vnic->gw; + struct fip_gw_volatile_info tmp_info; + int rc; + + if (!gw) + return -EINVAL; + + mutex_lock(&gw->mlock); + memcpy(&tmp_info, &gw->info.vol_info, sizeof(tmp_info)); + mutex_unlock(&gw->mlock); + + rc = sprintf(buff, "%s", tmp_info.gw_port_name); + + return rc < 0 ? rc : 0; +} + +u8 fip_vnic_get_bx_sl(struct fip_vnic_data *vnic) +{ + return vnic->gw->info.gw_sl; +} + +/* + * get GW info funcs. +*/ +int fip_vnic_get_gw_type(struct fip_vnic_data *vnic) +{ + struct fip_gw_data *gw = vnic->gw; + int lag = 0; + + if (!gw) + return -EINVAL; + + lag = gw->info.gw_type == GW_TYPE_LAG; + + return lag; +} + +/* + * get GW info funcs. +*/ +int fip_vnic_get_lag_eports(struct fip_vnic_data *vnic, char *buf) +{ + struct fip_gw_data *gw = vnic->gw; + int i; + struct lag_member *member; + char *p = buf; + + if (!gw) + return -EINVAL; + + if (gw->info.gw_type != GW_TYPE_LAG) + return -EINVAL; + + p += _sprintf(p, buf, "LAG_MEMBER_INFORMATION:\n"); + for (i=0; ilm.used_bitmask & 1 << i)) + continue; + + member = &vnic->lm.memb[i]; + p += _sprintf(p, buf, " %.2d ID=%.3X LID=%4X QPN=%8X STATE=%s\n", + i, member->gw_port_id, member->lid, member->qpn, + member->eport_state ? "UP" : "DOWN"); + } + + return p - buf; +} + +/* + * process an incoming login ack packet. The packet was already parsed and + * its data was placed in *data. The function creates RX and TX rings for the + * vnic and starts the multicast join procedure. + * This function should not be called for packets other then login ack packets. 
+ */ +void fip_vnic_login_ack_recv(struct fip_vnic_data *vnic, + struct fip_login_data *data) +{ + /* we allow login acks only in wait for ack in other states + * we ignore them */ + if (vnic->state != FIP_VNIC_WAIT_4_ACK) { + vnic_dbg_fip_v(vnic->name, + "vnic_login_ack_recv in state other" + " then FIP_VNIC_WAIT_4_ACK state %d\n", + vnic->state); + return; + } + + /* For LAG vnics, process login ack member data */ + if (vnic->gw->info.gw_type == GW_TYPE_LAG) + handle_member_update(vnic, &data->lagm); + + memcpy(&vnic->login_data, data, sizeof(vnic->login_data)); + + vnic->state = FIP_VNIC_RINGS_INIT; + + /* calls fip_vnic_fsm() */ + cancel_delayed_work(&vnic->vnic_task); + fip_vnic_fsm(&vnic->vnic_task.work); + // REQUEUE_VNIC_DWORK(vnic, &vnic->vnic_task, 0); + return; +} + +/* + * This is a helper function we use in order to move the login create + * to another context so we don't block the fip thread for too long. + * The call stack triggered by this function calls register_netdev that + * might block for some time when netdev are removed in parallel. This + * stalls the fip_wq which causes KA not to be sent. +*/ +void fip_vnic_login_create(struct work_struct *work) +{ + struct fip_vnic_data *vnic = + container_of(work, struct fip_vnic_data, vnic_login_create_task); + char *name = NULL; + int rc; + + if (vnic->hadmined) + name = vnic->interface_name; + + rc = vnic_login_register_netdev(vnic, vnic->mac_cache, name); + + spin_lock_irq(&vnic->lock); + clear_bit(VNIC_LOGIN_REG_NETDEV_PENDING, &vnic->login_status); + if (!rc) + set_bit(VNIC_LOGIN_REG_NETDEV_DONE, &vnic->login_status); + spin_unlock_irq(&vnic->lock); +} + +/* + * Test if the create request posted earlier terminated or not. + * If yes and successfully returns 0, if still pending returns + * -EAGAIN , and if failed returns -EINVAL. if retry is set + * it will requeue a create attempt and try again. In this case + * the function will return -EAGAIN. +*/ +static int fip_vnic_test_login(struct fip_vnic_data *vnic, int retry) +{ + int ret = 0; + + spin_lock_irq(&vnic->lock); + + if (!test_bit(VNIC_LOGIN_REG_NETDEV_DONE, &vnic->login_status)) { + /* queue retry login create request */ + if (retry) { + if (!test_and_set_bit(VNIC_LOGIN_REG_NETDEV_PENDING, + &vnic->login_status)) { + memcpy(vnic->mac_cache, vnic->login_data.mac, ETH_ALEN); + vnic->vlan_used = vnic->login_data.vp; + vnic->vlan = vnic->login_data.vlan; + vnic->all_vlan_gw = vnic->login_data.all_vlan_gw; + + /* calls fip_vnic_login_create() */ + if (vnic->flush == FIP_NO_FLUSH) + queue_work(login_wq, &vnic->vnic_login_create_task); + } + ret = -EAGAIN; + } else { + if (test_bit(VNIC_LOGIN_REG_NETDEV_PENDING, + &vnic->login_status)) + ret = -EAGAIN; + else + ret = -EINVAL; + } + } + spin_unlock_irq(&vnic->lock); + + return ret; +} + + +/* + * This function should be called when the building of a vhub context + * table is done and the vnic state should transition to CONNECTED. + */ +int fip_vnic_tbl_done(struct fip_vnic_data *vnic) +{ + vnic->vhub_table.state = VHUB_TBL_UP2DATE; + vnic->vhub_table.tusn = vnic->vhub_table.main_list.tusn; + + if (vnic->state <= FIP_VNIC_VHUB_DONE) + vnic->state = FIP_VNIC_VHUB_DONE; + else + vnic->state = FIP_VNIC_VHUB_WRITE; + + cancel_delayed_work(&vnic->vnic_task); + fip_vnic_fsm(&vnic->vnic_task.work); + return 0; +} + +/* + * This function runs in interrupt context + * It does sanity checking of the packet, moves it to a list and passes + * handleing to a thread. 
+ */ +static void fip_vnic_recv(struct fip_vnic_data *vnic) +{ + struct fip_ring *rx_ring = &vnic->rx_ring; + int ret, length; + u32 vhub_id; + void *mem; + int queue_packet = 0; + int one_or_more_queued = 0; + int index; + int err; + + while (rx_ring->head != rx_ring->tail) { + struct fip_content *fc; + + queue_packet = 0; + index = rx_ring->tail & (vnic->rx_ring.size - 1); + + if (rx_ring->ring[index].entry_posted == 0) + goto repost; + + mem = rx_ring->ring[index].mem; + length = rx_ring->ring[index].length; + + + fc = kzalloc(sizeof *fc, GFP_ATOMIC); + if (!fc) { + vnic_warn(vnic->name, "kzalloc failed\n"); + goto repost; + } + + err = fip_packet_parse(vnic->port, mem + IB_GRH_BYTES, length - IB_GRH_BYTES, fc); + if (err) { + vnic_warn(vnic->name, "packet parse failed\n"); + kfree(fc); + goto repost; + } + + switch (fc->fh->subcode) { + case FIP_GW_UPDATE_SUB_OPCODE: + if (fc->fvu) { + vhub_id = be32_to_cpu(fc->fvu->state_vhub_id) & 0xffffff; + if (vnic->login_data.vhub_id == vhub_id) + queue_packet = 1; + } + + break; + case FIP_GW_TABLE_SUB_OPCODE: + if (vnic->state >= FIP_VNIC_VHUB_INIT && + vnic->vhub_table.state == VHUB_TBL_INIT) { + /* handle vhub context table packets */ + if (fc->fvt) { + vhub_id = be32_to_cpu(fc->fvt->vp_vhub_id) & 0xffffff; + if (vnic->login_data.vhub_id == vhub_id) + queue_packet = 1; + } + } + break; + default: + vnic_dbg_fip_v(vnic->name, + "received unexpected format packet\n"); + break; + } + + if (queue_packet && (likely(vnic->flush == FIP_NO_FLUSH))) { + struct fip_rcv_pkt *rcv; + struct fip_ring_entry me; + + /* record packet time for heart beat */ + vnic->keep_alive_jiffs = jiffies; + length -= IB_GRH_BYTES; + rcv = kzalloc(sizeof *rcv, GFP_ATOMIC); + if (!rcv) { + vnic_warn(vnic->name, "failed kmalloc\n"); + kfree(fc); + goto repost; + } + + /* replace it with new entry, and queue old one */ + err = alloc_map_fip_buffer(vnic->port->dev->ca, &me, + FIP_UD_BUF_SIZE(vnic->port->max_mtu_enum), + GFP_ATOMIC); + if (err) { + vnic_warn(vnic->name, "alloc_map_fip_buffer failed\n"); + kfree(fc); + kfree(rcv); + goto repost; + } + + /* unmap old entry */ + ib_dma_unmap_single(vnic->port->dev->ca, + rx_ring->ring[index].bus_addr, + FIP_UD_BUF_SIZE(vnic->port->max_mtu_enum), + DMA_FROM_DEVICE); + + rx_ring->ring[index] = me; + rcv->fc = fc; + rcv->length = length; + rcv->mem = mem; + spin_lock(&vnic->vnic_rcv_list.lock); + list_add_tail(&rcv->list, &vnic->vnic_rcv_list.list); + spin_unlock(&vnic->vnic_rcv_list.lock); + one_or_more_queued++; + } else + kfree(fc); +repost: + ret = fip_post_receive(vnic->port, vnic->qp, + FIP_UD_BUF_SIZE(vnic->port->max_mtu_enum), + index, rx_ring->ring + index, vnic->name); + if (ret) + vnic_warn(vnic->name, "fip_post_receive ret %d\n", ret); + + rx_ring->tail++; + } + + if (one_or_more_queued && (likely(vnic->flush == FIP_NO_FLUSH))) { + /* calls fip_vnic_recv_bh() */ + queue_work(fip_wq, &vnic->vnic_pkt_rcv_task_bh); + } + + return; +} + +void fip_vnic_recv_list_flush(struct fip_vnic_data *vnic) +{ + struct list_head vnic_recv_local; + struct fip_rcv_pkt *rcv, *rcv1; + unsigned long flags; + + INIT_LIST_HEAD(&vnic_recv_local); + + spin_lock_irqsave(&vnic->vnic_rcv_list.lock, flags); + list_replace_init(&vnic->vnic_rcv_list.list, &vnic_recv_local); + spin_unlock_irqrestore(&vnic->vnic_rcv_list.lock, flags); + + list_for_each_entry_safe(rcv, rcv1, &vnic_recv_local, list) { + list_del(&rcv->list); + kfree(rcv); + } + return; +} + +void lag_ctx_clear(struct fip_vnic_data *vnic) +{ + memset(&vnic->lm, 0, sizeof (vnic->lm)); +} + +/* 
+ * Handle the GW eport member info for a LAG GW. The function compares the
+ * member information to previous membership information that is stored in the
+ * vnic. The data path info is updated only after the login ack info was
+ * updated, to prevent race conditions.
+ * The vnic contains a local cache of the member info. The cache is updated
+ * in all cases other than if the write to the data path failed. If the write
+ * failed we will not update the cache and rely on periodic update packets
+ * for the retry.
+ * There are 4 possible flows per member entry:
+ * 1. the entry is cached in the vnic but not in the packet - remove from vnic
+ * 2. the entry is not cached in the vnic but is in the packet - add to vnic
+ * 3. entry is in vnic and in packet but with different params - modify vnic
+ * 4. entry is in vnic and in packet and with similar params - do nothing
+*/
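+/*
+ * Worked example (illustrative only, the numbers are not from any spec):
+ * suppose vnic->lm.used_bitmask == 0x3 (members 0 and 1 cached) and the
+ * packet carries lm->num == 2 entries, where packet entry 0 matches cached
+ * member 1 but with a new LID, and packet entry 1 is unknown. Then member 1
+ * is modified in place (flow 3), member 0 has no match so it is removed and
+ * bit 0 is cleared (flow 1), and the unknown entry is added at the first
+ * free index (flow 2). A matching entry with equal params is left as is
+ * (flow 4).
+ */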
+int handle_member_update(struct fip_vnic_data *vnic, struct lag_members *lm)
+{
+    int i, j;
+    char packet_used[MAX_LAG_MEMBERS];
+    char vnic_used[MAX_LAG_MEMBERS];
+    struct lag_member *vnic_mem, *pkt_mem;
+    int last_bit = 0;
+    #define EMPTY_ENTRY (char)0xff
+    /* we only update data path with new info after certain stage */
+    int write_through = !!(vnic->state >= FIP_VNIC_VHUB_WRITE);
+    int skip;
+    struct lag_properties lag_prop;
+    struct vnic_login *login = vnic->login;
+
+    memset(packet_used, EMPTY_ENTRY, sizeof(packet_used));
+    memset(vnic_used, EMPTY_ENTRY, sizeof(vnic_used));
+
+    /* if LAG is not enabled, or it's a child vNic, abort */
+    if (!vnic->gw->info.ext_lag.valid || vnic->parent_used)
+        return -EINVAL;
+
+    mutex_lock(&vnic->gw->mlock);
+    lag_prop.ca = vnic->gw->info.ext_lag.ca;
+    lag_prop.ca_thresh = vnic->gw->info.ext_lag.ca_thresh;
+    lag_prop.hash_mask = vnic->gw->info.ext_lag.hash;
+    lag_prop.weights_policy = vnic->gw->info.ext_lag.weights_policy;
+    mutex_unlock(&vnic->gw->mlock);
+    if (write_through)
+        vnic_member_prop(login, &lag_prop);
+
+    /* go over all known members, for each one search for a match in the
+     * packet member struct */
+    for (i = 0; i < MAX_LAG_MEMBERS; i++) {
+        if (!(vnic->lm.used_bitmask & 1 << i))
+            continue;
+
+        vnic_mem = &vnic->lm.memb[i];
+        for (j = 0; j < lm->num; j++) {
+            pkt_mem = &lm->memb[j];
+            /* find match for member in vnic data structure */
+            if (packet_used[j] == EMPTY_ENTRY &&
+                !memcmp(vnic_mem->guid, pkt_mem->guid, GUID_LEN) &&
+                vnic_mem->gw_port_id == pkt_mem->gw_port_id) {
+                /* found a match, check for change in parameters */
+                if (vnic->login) {
+                    /* check for change in member parameters */
+                    if (vnic_mem->lid != pkt_mem->lid ||
+                        vnic_mem->qpn != pkt_mem->qpn ||
+                        vnic_mem->eport_state != pkt_mem->eport_state ||
+                        vnic_mem->sl != pkt_mem->sl ||
+                        vnic_mem->link_utilization != pkt_mem->link_utilization) {
+                        vnic_dbg_lag_v(vnic->name, "handle_member_update entry %d modifying lid %d qpn %d state %d\n",
+                                       i, lm->memb[j].lid, lm->memb[j].qpn, lm->memb[j].eport_state);
+                        /* update data path if required and store update info locally */
+                        if (!write_through ||
+                            (write_through && !vnic_member_modify(login, i, &lm->memb[j])))
+                            *vnic_mem = lm->memb[j];
+                    }
+                }
+                packet_used[j] = i;
+                vnic_used[i] = j;
+                break;
+            }
+        }
+        /* if member was removed in last packet remove it */
+        if (vnic_used[i] == EMPTY_ENTRY) {
+            if (!write_through ||
+                (write_through && !vnic_member_remove(login, i))) {
+                vnic_dbg_lag_v(vnic->name, "handle_member_update entry %d removing lid %d qpn %d state %d\n",
+                               i, vnic->lm.memb[i].lid, vnic->lm.memb[i].qpn, vnic->lm.memb[i].eport_state);
+                vnic->lm.used_bitmask &= ~(1 << i);
+            }
+        }
+    }
+
+    /* go over packet and look for any new members */
+    for (j = 0; j < lm->num; j++) {
+        /* if entry was matched up already */
+        if (packet_used[j] != EMPTY_ENTRY)
+            continue;
+
+        skip = 0;
+        /* verify that the same GW_ID is not in use by another port */
+        for (i = 0; i < MAX_LAG_MEMBERS; i++) {
+            if (!(vnic->lm.used_bitmask & 1 << i))
+                continue;
+            if (vnic->lm.memb[i].gw_port_id == lm->memb[j].gw_port_id)
+                skip = 1;
+        }
+        if (skip)
+            continue;
+
+        /* look for an empty member id and add the member to it */
+        for (i = last_bit; i < MAX_LAG_MEMBERS; i++) {
+            if (vnic->lm.used_bitmask & 1 << i)
+                continue;
+
+            vnic_dbg_lag_v(vnic->name, "handle_member_update entry %d adding lid %d qpn %d state %d\n",
+                           i, lm->memb[j].lid, lm->memb[j].qpn, lm->memb[j].eport_state);
+            if (!write_through ||
+                (write_through && !vnic_member_add(login, i, &lm->memb[j]))) {
+                vnic->lm.used_bitmask |= (1 << i);
+                vnic->lm.memb[i] = lm->memb[j];
+            }
+
+            break;
+        }
+        last_bit = i;
+    }
+
+    return 0;
+}
+
+/* Write the initial member table to the datapath. If we fail we will
+ * delete the entry from the local cache and rely on periodic update
+ * packets for the retry */
+int fip_vnic_write_members(struct fip_vnic_data *vnic)
+{
+    int i;
+    struct lag_properties lag_prop;
+    struct vnic_login *login = vnic->login;
+
+    /* if LAG is not enabled, or it's a child vNic, abort */
+    if (!vnic->gw->info.ext_lag.valid || vnic->parent_used)
+        return -EINVAL;
+
+    lag_prop.ca = vnic->gw->info.ext_lag.ca;
+    lag_prop.ca_thresh = vnic->gw->info.ext_lag.ca_thresh;
+    lag_prop.hash_mask = vnic->gw->info.ext_lag.hash;
+    lag_prop.weights_policy = vnic->gw->info.ext_lag.weights_policy;
+    vnic_member_prop(login, &lag_prop);
+
+    /* go over all members, for each one used write it to the data path */
+    for (i = 0; i < MAX_LAG_MEMBERS; i++) {
+        if (!(vnic->lm.used_bitmask & 1 << i))
+            continue;
+
+        /* if update failed, delete the local entry; we will use the
+         * update packet flow for retries.
+         */
+        if (vnic_member_add(login, i, &vnic->lm.memb[i]))
+            vnic->lm.used_bitmask &= ~(1 << i);
+    }
+
+    return 0;
+}
+
+/* runs in the context of vnic->vnic_pkt_rcv_task_bh */
+void fip_vnic_recv_bh(struct work_struct *work)
+{
+    struct fip_vnic_data *vnic =
+        container_of(work, struct fip_vnic_data, vnic_pkt_rcv_task_bh);
+    int length;
+    u32 vhub_id, tusn;
+    int eport_state;
+    struct vnic_table_entry *vhub_entries;
+    struct list_head vnic_recv_local;
+    struct fip_rcv_pkt *rcv, *rcv1;
+    unsigned long flags;
+    int i, __eport_state;
+
+    INIT_LIST_HEAD(&vnic_recv_local);
+
+    spin_lock_irqsave(&vnic->vnic_rcv_list.lock, flags);
+    list_replace_init(&vnic->vnic_rcv_list.list, &vnic_recv_local);
+    spin_unlock_irqrestore(&vnic->vnic_rcv_list.lock, flags);
+
+    /* we are not interested in packets prior to FIP_VNIC_VHUB_INIT */
+    if (vnic->state < FIP_VNIC_VHUB_INIT ||
+        vnic->flush != FIP_NO_FLUSH) {
+        list_for_each_entry_safe(rcv, rcv1, &vnic_recv_local, list) {
+            kfree(rcv->fc);
+            kfree(rcv->mem);
+            list_del(&rcv->list);
+            kfree(rcv);
+        }
+    } else {
+        int err;
+
+        list_for_each_entry_safe(rcv, rcv1, &vnic_recv_local, list) {
+            length = rcv->length;
+
+            switch (rcv->fc->fh->subcode) {
+            case FIP_GW_UPDATE_SUB_OPCODE:
+                /* validate vhub id before processing packet */
+                vhub_id = be32_to_cpu(rcv->fc->fvu->state_vhub_id) & 0xffffff;
+                if (unlikely(vnic->login_data.vhub_id != vhub_id))
+                    break;
+
+                eport_state = be32_to_cpu(rcv->fc->fvu->state_vhub_id) >> 27 & 3;
+                __eport_state = (eport_state == 0) ?
+                    EPORT_STATE_DOWN : EPORT_STATE_UP;
+                atomic_set(&vnic->eport_state, __eport_state);
+
+                /* handle vhub context update packets */
+                if (rcv->fc->fed.num) {
+                    err = extract_vhub_extended(rcv->fc->fed.fed[0], vnic);
+                    if (err)
+                        vnic_warn(vnic->name, "extract_vhub_extended() failed\n");
+                }
+                if (rcv->fc->cte.num) {
+                    vhub_entries = kmalloc(rcv->fc->cte.num * sizeof *vhub_entries, GFP_KERNEL);
+                    if (!vhub_entries) {
+                        vnic_warn(vnic->port->name, "failed to allocate memory for update CTEs\n");
+                        goto free_entry;
+                    }
+
+                    tusn = be32_to_cpu(rcv->fc->fvu->tusn);
+                    for (i = 0; i < rcv->fc->cte.num; ++i) {
+                        vhub_entries[i].lid = be16_to_cpu(rcv->fc->cte.cte[i].lid);
+                        vhub_entries[i].qpn = be32_to_cpu(rcv->fc->cte.cte[i].qpn) & 0xffffff;
+                        vhub_entries[i].sl = rcv->fc->cte.cte[i].sl & 0xf;
+                        vhub_entries[i].rss = rcv->fc->cte.cte[i].v_rss_type & FIP_CONTEXT_RSS_FLAG ? 1 : 0;
+                        vhub_entries[i].valid = rcv->fc->cte.cte[i].v_rss_type & FIP_CONTEXT_V_FLAG ? 1 : 0;
+                        memcpy(vhub_entries[i].mac, rcv->fc->cte.cte[i].mac, sizeof(vhub_entries[i].mac));
+                        vhub_handle_update(vnic, vhub_id, tusn - rcv->fc->cte.num + i + 1, &vhub_entries[i]);
+                    }
+                    kfree(vhub_entries);
+                }
+
+                /* update vnic carrier only when vnic is ready:
+                 * not closing (non zero flush), and pre-registered
+                 */
+                if (!vnic->flush && vnic->login &&
+                    test_bit(VNIC_STATE_LOGIN_CREATE_1, &vnic->login_state)) {
+                    vnic_carrier_update(vnic->login);
+                }
+                break;
+            case FIP_GW_TABLE_SUB_OPCODE:
+                /* handle vhub context table packets */
+                tusn = be32_to_cpu(rcv->fc->fvt->tusn);
+                vhub_id = be32_to_cpu(rcv->fc->fvt->vp_vhub_id) & 0xffffff;
+                vhub_handle_tbl(vnic, rcv->fc, vhub_id, tusn);
+                break;
+
+            default:
+                break;
+            }
+free_entry:
+            list_del(&rcv->list);
+            kfree(rcv->fc);
+            kfree(rcv->mem);
+            kfree(rcv);
+        }
+    }
+    return;
+}
+
+/*
+ * Mark the vnic for deletion and trigger a delayed call to the cleanup
+ * function. In the past the vnic was moved to another list, but this
+ * might cause vnic duplication if new vnics are added to the GW. Even
+ * if the vnic is being flushed we need to know it is there.
+ *
+ * Note: This deletion method ensures that all pending vnic work requests
+ * are cleared without depending on the calling context.
+ */
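+/*
+ * Note on flush levels: FIP_PARTIAL_FLUSH tears the vnic down but leaves it
+ * on the GW vnic list so the FSM can retry the login later, while
+ * FIP_FULL_FLUSH also removes the vnic from the GW and frees it (see
+ * fip_vnic_destroy). The helper below only ever raises the flush level
+ * (a request with vnic->flush >= tmp_flush is disregarded), never lowers it.
+ */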
+void fip_vnic_close(struct fip_vnic_data *vnic, enum fip_flush flush)
+{
+    int tmp_flush;
+
+    /* net admin -> full flush */
+    tmp_flush = vnic->hadmined ? flush : FIP_FULL_FLUSH;
+
+    /* child vNic -> full flush */
+    tmp_flush = (!vnic->parent_used) ? tmp_flush : FIP_FULL_FLUSH;
+
+    /* no need for partial cleanup in host admin idle */
+    if (tmp_flush == FIP_PARTIAL_FLUSH &&
+        vnic->state < FIP_VNIC_HADMIN_IDLE)
+        return;
+
+    /* close already in process, disregard */
+    spin_lock_irq(&vnic->lock);
+    if (vnic->flush >= tmp_flush) {
+        spin_unlock_irq(&vnic->lock);
+        return;
+    }
+    if (vnic->flush == FIP_NO_FLUSH && vnic->state > FIP_VNIC_WAIT_4_ACK)
+        fip_update_send(vnic, 0, 1 /* logout */);
+
+    vnic->flush = tmp_flush;
+    cancel_delayed_work(&vnic->vnic_gw_alive_task);
+    cancel_delayed_work(&vnic->vnic_task);
+    spin_unlock_irq(&vnic->lock);
+    /* after this point we should have no work that is not already pending
+     * for execution, and no new work will be added
+     */
+
+    if (vnic->hadmined && tmp_flush == FIP_FULL_FLUSH)
+        vnic_delete_hadmin_dentry(vnic);
+    else if (!vnic->hadmined)
+        /* vnic_count is relevant for net admin only */
+        vnic->gw->vnic_count--;
+
+    vnic_dbg_mark();
+
+    /* calls fip_purge_vnics() */
+    queue_delayed_work(fip_wq, &vnic->gw->vnic_cleanup_task,
+                       DELAYED_WORK_CLEANUP_JIFFS);
+}
+
+/*
+ * This is a helper function we use in order to move the login destroy
+ * to another context so we don't block the fip thread for too long.
+ */
+void fip_vnic_login_destroy(struct work_struct *work)
+{
+    struct fip_vnic_data *vnic =
+        container_of(work, struct fip_vnic_data,
+                     vnic_login_destroy_task);
+    int flush = vnic->flush;
+
+    vnic_login_destroy_wq_stopped(vnic, flush);
+
+    /* we don't want to use a lock here, so we verify that the
+     * flush level did not change between the request and now */
+    if (flush == FIP_FULL_FLUSH)
+        set_bit(VNIC_LOGIN_DESTROY_FULL, &vnic->login_status);
+
+    set_bit(VNIC_LOGIN_DESTROY_DONE, &vnic->login_status);
+}
+
+/*
+ * Free vnic resources. This includes closing the data vnic (data QPs etc.)
+ * and the discovery resources. If the vnic can be totally destroyed (no
+ * pending work) the vnic will be removed from the GW and its memory
+ * freed. If not, the vnic will not be freed and the function will return an
+ * error. The caller needs to call this function again to complete the
+ * operation.
+ * Note: Do not call this function to remove a vnic, use fip_vnic_close.
+ */
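+/*
+ * Usage note: fip_vnic_destroy() returns -EBUSY while packets or work items
+ * are still in flight; the delayed cleanup work queued by fip_vnic_close()
+ * (fip_purge_vnics) is expected to keep calling it until it returns 0.
+ */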
+int fip_vnic_destroy(struct fip_vnic_data *vnic)
+{
+    int pending;
+
+    vnic_dbg_func(vnic->name);
+    vnic_dbg_fip_p0(vnic->name, "fip_vnic_destroy called flow=%d state=%d mac" MAC_6_PRINT_FMT "\n",
+                    vnic->flush, vnic->state, MAC_6_PRINT_ARG(vnic->login_data.mac));
+
+    pending = work_pending(&vnic->vnic_pkt_rcv_task_bh) ||
+        delayed_work_pending(&vnic->vnic_gw_alive_task) ||
+        delayed_work_pending(&vnic->vnic_task);
+
+    /* verify no pending packets before we start tearing down the rings */
+    if (pending || fip_vnic_test_login(vnic, 0) == -EAGAIN)
+        goto retry_later;
+
+    if (!test_and_set_bit(VNIC_LOGIN_DESTROY_PENDING,
+                          &vnic->login_status)) {
+        vnic_login_destroy_stop_wq(vnic, vnic->flush);
+        /* calls fip_vnic_login_destroy() */
+        queue_work(login_wq, &vnic->vnic_login_destroy_task);
+    }
+
+    if (!test_bit(VNIC_LOGIN_DESTROY_DONE, &vnic->login_status))
+        goto retry_later;
+
+    clear_bit(VNIC_LOGIN_DESTROY_DONE, &vnic->login_status);
+    clear_bit(VNIC_LOGIN_DESTROY_PENDING, &vnic->login_status);
+
+    /* We need to test if, when we queued the destroy request, it was
+     * a partial flush but this has changed to a full flush.
+     * If so we need to try again */
+    if (vnic->flush == FIP_FULL_FLUSH &&
+        !test_bit(VNIC_LOGIN_DESTROY_FULL, &vnic->login_status))
+        goto retry_later;
+
+    hrtimer_cancel(&vnic->keepalive_timer);
+
+    if (vnic->state >= FIP_VNIC_VHUB_INIT) {
+        lag_ctx_clear(vnic);
+        vhub_ctx_free(vnic);
+    }
+
+    /* disconnect from mcast groups */
+    if (vnic->state >= FIP_VNIC_MCAST_INIT) {
+        vnic_mcast_del_all(&vnic->mcast_tree);
+        fip_vnic_rings_destroy(vnic);
+    }
+
+    if (vnic->state > FIP_VNIC_LOGIN)
+        ib_destroy_ah(vnic->ah);
+
+    if (vnic->flush == FIP_PARTIAL_FLUSH) {
+        if (vnic->hadmined) /* we close host admin vnics so they won't do any login from fip_vnic_fsm */
+            vnic->state = FIP_VNIC_CLOSED;
+        else
+            vnic->state = FIP_VNIC_HADMIN_IDLE;
+
+        vnic->flush = FIP_NO_FLUSH;
+        vnic->last_send_jiffs = 0;
+
+        vnic_dbg_fip_v(vnic->name, "fip_vnic_remove partial done vnic->retry_count=%d\n", vnic->retry_count);
+        if (!VNIC_MAX_RETRIES || ++vnic->retry_count < VNIC_MAX_RETRIES)
+            QUEUE_VNIC_DWORK(vnic, &vnic->vnic_task, FIP_LOGIN_TIMEOUT * HZ);
+
+    } else {
+        list_del(&vnic->gw_vnics);
+        vnic_dbg_fip_v(vnic->name, "fip_vnic_remove full done\n");
+        kfree(vnic);
+    }
+
+    return 0;
+
+retry_later:
+    return -EBUSY;
+}
+
+int fip_vnic_keepalive_send(struct fip_vnic_data *vnic, int source_timer)
+{
+    int update;
+    unsigned long flags;
+    int ret = 0;
+
+    if (vnic->flush != FIP_NO_FLUSH)
+        return ret;
+
+    if (vnic->last_send_jiffs > 1 && jiffies - vnic->last_send_jiffs > vnic->gw->info.vnic_ka_period * 3 / 2)
+        vnic_dbg_fip_p0(vnic->name, "Delay in sending KA: should be %ld, actual time=%ld, source=%d\n",
+                        vnic->gw->info.vnic_ka_period, jiffies - vnic->last_send_jiffs, source_timer);
+
+    spin_lock_irqsave(&vnic->ka_lock, flags);
+    if (source_timer ||
+        (vnic->last_send_jiffs && jiffies - vnic->last_send_jiffs >
+         vnic->gw->info.vnic_ka_period * 6 / 5)) {
+
+        /* we need to have mcast attached before we ask for a table */
+        if (vnic->state >= FIP_VNIC_VHUB_INIT &&
+            vnic->vhub_table.state == VHUB_TBL_INIT)
+            update = 1;
+        else
+            update = 0;
+
+        /* send vnic keep alive to GW */
+        ret = fip_update_send(vnic, update, 0 /* not logout */);
+        if (!ret)
+            vnic->last_send_jiffs = jiffies;
+    }
+    spin_unlock_irqrestore(&vnic->ka_lock, flags);
+
+    return ret;
+}
+
+//void fip_vnic_keepalive(unsigned long data)
+#ifdef _BP_HR_TIMER
+int fip_vnic_keepalive(struct hrtimer *timer)
+#else
+enum hrtimer_restart fip_vnic_keepalive(struct hrtimer *timer)
+#endif
+{
+//  struct fip_vnic_data *vnic = (struct fip_vnic_data *)data;
+    struct fip_vnic_data *vnic = (struct fip_vnic_data *)
+        container_of(timer, struct fip_vnic_data, keepalive_timer);
+    unsigned long flags;
+    ktime_t ktime;
+    enum hrtimer_restart ret = HRTIMER_NORESTART;
+    int flush;
+
+    spin_lock_irqsave(&vnic->lock, flags);
+    flush = vnic->flush;
+    spin_unlock_irqrestore(&vnic->lock, flags);
+
+    if (flush != FIP_NO_FLUSH)
+        return ret;
+
+    fip_vnic_keepalive_send(vnic, 1);
+
+    /* mod_timer(&vnic->keepalive, jiffies + time); */
+    ret = HRTIMER_RESTART;
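+    /*
+     * vnic_ka_period is kept in jiffies while hrtimers take nanoseconds,
+     * so the period is scaled by (1e9 / HZ) ns per jiffy below. For
+     * example, with HZ=250 a period of 250 jiffies becomes
+     * 250 * 4000000 ns = 1 second.
+     */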
+    ktime = ktime_set(0, vnic->gw->info.vnic_ka_period * (1000000000 / HZ));
+    hrtimer_forward(&vnic->keepalive_timer, vnic->keepalive_timer.base->get_time(), ktime);
+
+    return ret;
+}
+
+void fip_vnic_gw_alive(struct work_struct *work)
+{
+    struct fip_vnic_data *vnic =
+        container_of(work, struct fip_vnic_data,
+                     vnic_gw_alive_task.work);
+    long time_to_timeout;
+
+    if (vnic->flush != FIP_NO_FLUSH)
+        return;
+
+    if (!test_bit(MCAST_ATTACHED, &vnic->vnic_mcaste_state)) {
+        if (time_after(jiffies, vnic->detached_ka_jiffs + 60*HZ)) {
+            vnic_dbg_fip_p0(vnic->name, "No GW keep alive timeout when mcast unattached "
+                            "QPN 0x%06x, LID 0x%04x\n", vnic->qp->qp_num,
+                            vnic->port->attr.lid);
+            fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+            return;
+        } else {
+            vnic_dbg_fip_p0(vnic->name, "Got ka poll when bcast not "
+                            "attached QPN 0x%06x, LID 0x%04x, ka=%u\n",
+                            vnic->qp->qp_num, vnic->port->attr.lid,
+                            jiffies_to_msecs(jiffies - vnic->detached_ka_jiffs));
+            time_to_timeout = vnic->gw->info.gw_period;
+        }
+    } else {
+        long jiffs_from_last;
+        jiffs_from_last = (jiffies - vnic->keep_alive_jiffs);
+        time_to_timeout = vnic->gw->info.gw_period - jiffs_from_last;
+    }
+
+    /* TODO: change receive of update to rearm the work timer so an
+     * expiration indicates a true timeout */
+    if (time_to_timeout <= 0) {
+        vnic_dbg_fip_p0(vnic->name, "GW keep alives timed out for "
+                        "QPN 0x%06x, LID 0x%04x timeout=%ld\n", vnic->qp->qp_num,
+                        vnic->port->attr.lid, time_to_timeout);
+        fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+    } else
+        QUEUE_VNIC_DWORK(vnic, &vnic->vnic_gw_alive_task,
+                         time_to_timeout + 1);
+}
+
+struct fip_vnic_data *fip_vnic_alloc(struct vnic_port *port,
+                                     struct fip_gw_data *gw,
+                                     int hadmin, u16 vnic_id)
+{
+    struct fip_vnic_data *vnic;
+
+    vnic = kzalloc(sizeof(struct fip_vnic_data), GFP_KERNEL);
+    if (!vnic) {
+        vnic_err(port->name, "failed to alloc vnic\n");
+        return NULL;
+    }
+
+    vnic->state = hadmin ? FIP_VNIC_HADMIN_IDLE : FIP_VNIC_LOGIN;
+    vnic->vnic_id = vnic_id;
+    vnic->gw = gw;
+    vnic->gw_info = gw->info.vol_info;
+    vnic->port = port;
+    vnic->hadmined = hadmin;
+    vnic->flush = FIP_NO_FLUSH;
+
+    sprintf(vnic->name, "vnic-%d", vnic_id); /* will be overwritten */
+
+    spin_lock_init(&vnic->lock);
+    spin_lock_init(&vnic->ka_lock);
+    INIT_DELAYED_WORK(&vnic->vnic_task, fip_vnic_fsm);
+    INIT_DELAYED_WORK(&vnic->vnic_gw_alive_task, fip_vnic_gw_alive);
+    INIT_WORK(&vnic->vnic_login_destroy_task, fip_vnic_login_destroy);
+    INIT_WORK(&vnic->vnic_login_create_task, fip_vnic_login_create);
+
+#ifdef _BP_HR_TIMER
+    hrtimer_init(&vnic->keepalive_timer, CLOCK_MONOTONIC, HRTIMER_REL);
+#else
+    hrtimer_init(&vnic->keepalive_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+#endif
+    vnic->keepalive_timer.function = fip_vnic_keepalive;
+
+    vnic_mcast_root_init(&vnic->mcast_tree);
+    atomic_set(&vnic->eport_state, EPORT_STATE_DOWN);
+
+    return vnic;
+}
+
+int fip_vnic_hadmin_init(struct vnic_port *port, struct fip_vnic_data *vnic)
+{
+    int rc;
+
+    vnic_dbg_func(port->name);
+
+    rc = vnic_login_pre_create_1(port, vnic);
+    if (rc) {
+        vnic_warn(port->name, "vnic_login_pre_create_1 failed, rc %d\n", rc);
+        goto pre_create_failed;
+    }
+
+    strncpy(vnic->login_data.vnic_name, vnic->interface_name,
+            sizeof(vnic->interface_name));
+
+    /* queue login create request */
+    fip_vnic_test_login(vnic, 1);
+
+    return 0;
+
+pre_create_failed:
+    return -ENODEV;
+}
+
+void fip_vnic_create_gw_param(struct fip_vnic_send_info *gw_address, u32 gw_qpn,
+                              u32 qkey, u16 gw_lid, u8 gw_sl)
+{
+    gw_address->gw_qpn = gw_qpn;
+    gw_address->qkey = qkey;
+    gw_address->gw_lid = gw_lid;
+    gw_address->gw_sl = gw_sl;
+}
+
+void fip_vnic_set_gw_param(struct fip_vnic_data *vnic, struct fip_vnic_send_info *gw_address)
+{
+    memcpy(&vnic->gw_address, gw_address, sizeof(vnic->gw_address));
+}
+
+int fip_hadmin_vnic_refresh(struct fip_vnic_data *vnic, struct fip_vnic_send_info *gw_address)
+{
+    vnic_dbg_fip(vnic->name, "fip_vnic_to_login host admin flow flush=%d"
+                 " state=%d\n", vnic->flush, vnic->state);
+    if (likely(vnic->flush == FIP_NO_FLUSH) &&
+        vnic->state <= FIP_VNIC_HADMIN_IDLE &&
+        (!VNIC_MAX_RETRIES || vnic->retry_count < VNIC_MAX_RETRIES)) {
+        fip_vnic_set_gw_param(vnic, gw_address);
+        cancel_delayed_work(&vnic->vnic_task);
+        vnic->state = FIP_VNIC_LOGIN;
+        fip_vnic_fsm(&vnic->vnic_task.work);
+    }
+    return 0;
+}
+
+/*
+ * Call the data vnic pre-create 1 + 2 in order to alloc and init the data
+ * vnic. This function updates the QP numbers that the data vnic will use.
+ * These QP numbers are needed for the login.
+ * This function does not clean up on failures. It assumes that the caller
+ * will call the login destroy.
+ */
+static int fip_vnic_login_init(struct vnic_port *port, struct fip_vnic_data *vnic)
+{
+    int qps_num;
+    int rc;
+
+    struct ib_ah_attr ah_attr = {
+        .dlid = vnic->gw_address.gw_lid,
+        .port_num = port->num,
+        .sl = vnic_gw_ctrl_sl(vnic->gw) & 0xf,
+    };
+
+    vnic_dbg_func(vnic->name);
+
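+    /*
+     * Illustration: with multiple RX rings and a GW advertising
+     * n_rss_qpn = 3, the vnic requests 1 << 3 = 8 data QPs; if that
+     * exceeds VNIC_MAX_NUM_CPUS the count collapses back to 1, which
+     * means RSS stays disabled for this login.
+     */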
+    /* If the driver wants to enable RSS (vnic_rss == 1) then the
+     * number of QPs is what the GW advertises: 1 << n_rss_qpn
+     */
+    qps_num = (port->rx_rings_num > 1) ? (1 << vnic->gw->info.n_rss_qpn) : 1;
+    qps_num = (qps_num == 0) ? 1 : qps_num;
+
+    /* However, we don't support just any qps_num; if the GW asks for more
+     * than VNIC_MAX_NUM_CPUS QPs, then we're not going to enable RSS
+     * -- qps_num == 1 means RSS is disabled, otherwise it's enabled
+     */
+    qps_num = qps_num <= VNIC_MAX_NUM_CPUS ? qps_num : 1;
+
+    /* set in vnic, so it can be reported back to the BXM */
+    vnic->qps_num = qps_num;
+
+    /* in host admin vnic->login should be non NULL */
+    if (!vnic->hadmined) {
+        rc = vnic_login_pre_create_1(port, vnic);
+        if (rc) {
+            vnic_warn(vnic->name,
+                      "vnic_login_pre_create_1 failed, "
+                      "rc %d\n", rc);
+            goto failed;
+        }
+    }
+
+    /* in host admin vnic->login should be non NULL */
+    rc = vnic_login_pre_create_2(vnic, qps_num,
+                                 vnic->gw->info.gw_type == GW_TYPE_LAG);
+    if (rc) {
+        vnic_warn(port->name, "vnic_login_pre_create_2 failed\n");
+        goto failed;
+    }
+
+    /* if parent_used, you must already have the base QPN */
+    ASSERT(!vnic->parent_used || vnic->qp_base_num);
+
+    vnic->ah = ib_create_ah(port->pd, &ah_attr);
+    if (IS_ERR(vnic->ah)) {
+        vnic_warn(vnic->name, "fip_vnic_login_init failed to create ah\n");
+        vnic->ah = NULL;
+        goto failed;
+    }
+
+    vhub_ctx_init(vnic);
+
+    return 0;
+
+failed:
+    return -ENODEV;
+}
+
+ " Check configuration in SM/BX\n", vnic->login_data.pkey); + goto out_w_err; + } + + vnic->pkey = vnic->login_data.pkey; + vnic->pkey_index = vnic->login_data.pkey_index; + + vnic_dbg_fip_v(vnic->name, "fip_vnic_rings_create pkey id %d " + "for pkey 0x%x\n", (int)vnic->pkey_index, + (int)vnic->pkey); + + vnic->cq = ib_create_cq(port->dev->ca, fip_vnic_comp, NULL, vnic, + vnic->rx_ring.size + vnic->tx_ring.size, 0); + if (IS_ERR(vnic->cq)) { + vnic_dbg_fip(vnic->name, "failed to create receive CQ\n"); + goto out_w_err; + } + + memset(&qp_init_attr, 0, sizeof(qp_init_attr)); + qp_init_attr.cap.max_send_wr = vnic->tx_ring.size; + qp_init_attr.cap.max_recv_wr = vnic->rx_ring.size; + qp_init_attr.cap.max_send_sge = 1; + qp_init_attr.cap.max_recv_sge = 1; + qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR; + qp_init_attr.qp_type = IB_QPT_UD; + qp_init_attr.send_cq = vnic->cq; + qp_init_attr.recv_cq = vnic->cq; + + vnic->qp = ib_create_qp(port->pd, &qp_init_attr); + if (IS_ERR(vnic->qp)) { + vnic_dbg_fip(vnic->name, "failed to create QP\n"); + goto error_free_cq; + } + + vnic_dbg_fip_v(vnic->name, "fip_vnic_rings_create QPN %d," + " LID %d\n", (int)vnic->qp->qp_num, (int)port->attr.lid); + + /* move QP from reset to RTS */ + if (fip_init_qp(vnic->port, vnic->qp, vnic->pkey_index, vnic->name)) { + vnic_dbg_fip(vnic->name, "fip_init_qp returned with error\n"); + goto error_free_qp; + } + + ret = fip_init_tx(vnic->tx_ring.size, &vnic->tx_ring, vnic->name); + if (ret) { + vnic_dbg_fip(vnic->name, "fip_init_tx failed ret %d\n", ret); + goto error_free_qp; + } + + ret = fip_init_rx(port, vnic->rx_ring.size, vnic->qp, + &vnic->rx_ring, vnic->name); + if (ret) { + vnic_dbg_fip(vnic->name, "fip_init_rx returned %d\n", ret); + goto error_release_rings; + } + + /* enable recieving CQ completions */ + if (ib_req_notify_cq(vnic->cq, IB_CQ_NEXT_COMP)) + goto error_release_rings; + vnic_dbg_fip_v(vnic->name, "fip_vnic_rings_create done OK\n"); + + return 0; + +error_release_rings: + fip_flush_rings(port, vnic->cq, vnic->qp, &vnic->rx_ring, + &vnic->tx_ring, vnic->name); + fip_free_rings(port, &vnic->rx_ring, &vnic->tx_ring, vnic->name); +error_free_qp: + ib_destroy_qp(vnic->qp); +error_free_cq: + ib_destroy_cq(vnic->cq); +out_w_err: + vnic->qp = NULL; + vnic->cq = NULL; + vnic->rx_ring.size = 0; + vnic->tx_ring.size = 0; + return -ENODEV; +} + +static void fip_vnic_rings_destroy(struct fip_vnic_data *vnic) +{ + fip_flush_rings(vnic->port, vnic->cq, vnic->qp, &vnic->rx_ring, + &vnic->tx_ring, vnic->name); + fip_free_rings(vnic->port, &vnic->rx_ring, &vnic->tx_ring, vnic->name); + fip_vnic_recv_list_flush(vnic); + ib_destroy_qp(vnic->qp); + ib_destroy_cq(vnic->cq); + vnic->qp = NULL; + vnic->cq = NULL; +} + +/* + * This function is a callback called upon successful join to a + * multicast group. The function checks if we have joined + attached + * to all required mcast groups and if so moves the discovery FSM to solicit. 
+
+static void fip_vnic_rings_destroy(struct fip_vnic_data *vnic)
+{
+    fip_flush_rings(vnic->port, vnic->cq, vnic->qp, &vnic->rx_ring,
+                    &vnic->tx_ring, vnic->name);
+    fip_free_rings(vnic->port, &vnic->rx_ring, &vnic->tx_ring, vnic->name);
+    fip_vnic_recv_list_flush(vnic);
+    ib_destroy_qp(vnic->qp);
+    ib_destroy_cq(vnic->cq);
+    vnic->qp = NULL;
+    vnic->cq = NULL;
+}
+
+/*
+ * This function is a callback called upon successful join to a
+ * multicast group. The function checks if we have joined + attached
+ * to all required mcast groups and if so moves the discovery FSM to solicit.
+ */
+void fip_vnic_mcast_cnct_cb(struct vnic_mcast *mcast, void *ctx)
+{
+    struct fip_vnic_data *vnic = mcast->priv_data;
+
+    vnic_dbg_fip(vnic->name, "fip_vnic_mcast_cnct_cb\n");
+    vnic_dbg_parse(vnic->name, "attached mask = 0x%lx, req mask = 0x%lx\n",
+                   *mcast->cur_attached, *mcast->req_attach);
+
+    if ((*mcast->cur_attached & *mcast->req_attach) != *mcast->req_attach)
+        return;
+
+    vnic->keep_alive_jiffs = jiffies;
+    set_bit(MCAST_ATTACHED, &vnic->vnic_mcaste_state);
+    /* in case of a new mcast connection switch to VHUB_INIT; for a
+     * reconnection stay in the current state */
+    if (vnic->state < FIP_VNIC_VHUB_INIT) {
+        vnic_dbg_fip(vnic->name,
+                     "fip_vnic_mcast_cnct_cb done joining mcasts\n");
+        vnic->state = FIP_VNIC_VHUB_INIT;
+        cancel_delayed_work(&vnic->vnic_task);
+        REQUEUE_VNIC_DWORK(vnic, &vnic->vnic_task, 0);
+    }
+}
+
+/*
+ * This function is a callback called upon a mcast deattach event.
+ * This event can be triggered due to a vnic request or due to an async
+ * event. Currently this code does not participate in the vnic's FSM.
+ */
+void fip_vnic_mcast_deattach_cb(struct vnic_mcast *mcast, void *ctx)
+{
+    struct fip_vnic_data *vnic = mcast->priv_data;
+
+    vnic->detached_ka_jiffs = jiffies;
+    clear_bit(MCAST_ATTACHED, &vnic->vnic_mcaste_state);
+
+    vnic_dbg_fip(vnic->name, "fip_vnic_mcast_deattach_cb\n");
+}
+
+/*
+ * Try to connect to the relevant mcast groups. If one of the mcasts failed,
+ * the function should be recalled to try and complete the join process
+ * (for the mcast groups that the join process was not performed for).
+ * Note: A successful return of vnic_mcast_join means that the mcast join
+ * started, not that the join completed. Completion of the connection process
+ * is asynchronous and uses a supplied callback.
+ */
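+/*
+ * The function below joins up to three vHub MGIDs built by
+ * vhub_mgid_create(): an UPDATE group and a TABLE group always, plus a KA
+ * group when the GW is a LAG GW; the KA group is joined as sender-only and
+ * carries the multicast vnic keep alives.
+ */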
+int fip_vnic_mcast_cnct(struct fip_vnic_data *vnic)
+{
+    struct vnic_port *port = vnic->port;
+    union vhub_mgid mgid;
+    struct vnic_mcast *mcaste, *mcaste_upd, *mcaste_tbl;
+    struct vnic_mcast *uninitialized_var(mcaste_ka);
+    int rc;
+
+    vnic_dbg_fip(port->name, "fip_vnic_mcast_cnct called\n");
+
+    mcaste_upd = vnic_mcast_alloc(port, &vnic->req_attach, &vnic->cur_attached);
+    if (IS_ERR(mcaste_upd))
+        return -EINVAL;
+
+    mcaste_tbl = vnic_mcast_alloc(port, &vnic->req_attach, &vnic->cur_attached);
+    if (IS_ERR(mcaste_tbl)) {
+        rc = -EINVAL;
+        goto free_upd;
+    }
+
+    set_bit(FIP_MCAST_VHUB_UPDATE, &vnic->req_attach);
+    set_bit(FIP_MCAST_TABLE, &vnic->req_attach);
+
+    vnic_dbg_fip(port->name, "gw type is %d\n", vnic->gw->info.gw_type);
+    if (vnic->gw->info.gw_type == GW_TYPE_LAG) {
+        mcaste_ka = vnic_mcast_alloc(port, &vnic->req_attach, &vnic->cur_attached);
+        if (IS_ERR(mcaste_ka)) {
+            rc = -EINVAL;
+            goto free_tbl;
+        }
+        set_bit(FIP_MCAST_VHUB_KA, &vnic->req_attach);
+    }
+
+    mcaste = mcaste_upd;
+    mcaste->priv_data = vnic;
+    mcaste->attach_bit_nr = FIP_MCAST_VHUB_UPDATE;
+    memset(mcaste->mac, 0, ETH_ALEN);
+    vhub_mgid_create(vnic->login_data.mgid_prefix,
+                     mcaste->mac,
+                     vnic->login_data.n_mac_mcgid,
+                     vnic->login_data.vhub_id, VHUB_MGID_UPDATE,
+                     0, &mgid);
+    mcaste->gid = mgid.ib_gid;
+    mcaste->port_gid = mcaste->gid;
+    mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC);
+    mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC;
+    mcaste->retry = VNIC_MCAST_ULIMIT_RETRY;
+    mcaste->attach_cb = fip_vnic_mcast_cnct_cb;
+    mcaste->detach_cb = fip_vnic_mcast_deattach_cb;
+    mcaste->attach_cb_ctx = NULL;
+    mcaste->detach_cb_ctx = NULL;
+    mcaste->blocking = 0;
+    mcaste->qkey = VNIC_FIP_QKEY;
+    mcaste->pkey = vnic->pkey;
+    mcaste->qp = vnic->qp;
+    mcaste->create = vnic_mcast_create;
+    mcaste->join_state = 1;
+    rc = vnic_mcast_add(&vnic->mcast_tree, mcaste);
+    ASSERT(!rc);
+    rc = vnic_mcast_attach(&vnic->mcast_tree, mcaste); /* MCAST_RECEIVE_ONLY */
+    ASSERT(!rc);
+
+    mcaste = mcaste_tbl;
+    mcaste->priv_data = vnic;
+    mcaste->attach_bit_nr = FIP_MCAST_TABLE;
+    memset(mcaste->mac, 0, ETH_ALEN);
+    vhub_mgid_create(vnic->login_data.mgid_prefix,
+                     mcaste->mac,
+                     vnic->login_data.n_mac_mcgid,
+                     vnic->login_data.vhub_id, VHUB_MGID_TABLE,
+                     0, &mgid);
+    mcaste->gid = mgid.ib_gid;
+    mcaste->port_gid = mcaste->gid;
+    mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC);
+    mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC;
+    mcaste->retry = VNIC_MCAST_ULIMIT_RETRY;
+    mcaste->attach_cb = fip_vnic_mcast_cnct_cb;
+    mcaste->detach_cb = fip_vnic_mcast_deattach_cb;
+    mcaste->attach_cb_ctx = NULL;
+    mcaste->detach_cb_ctx = NULL;
+    mcaste->blocking = 0;
+    mcaste->qkey = VNIC_FIP_QKEY;
+    mcaste->pkey = vnic->pkey;
+    mcaste->qp = vnic->qp;
+    mcaste->create = vnic_mcast_create;
+    mcaste->join_state = 1;
+    rc = vnic_mcast_add(&vnic->mcast_tree, mcaste);
+    ASSERT(!rc);
+    rc = vnic_mcast_attach(&vnic->mcast_tree, mcaste); /* MCAST_RECEIVE_ONLY */
+    ASSERT(!rc);
+
+    if (vnic->gw->info.gw_type != GW_TYPE_LAG)
+        return 0;
+
+    mcaste = mcaste_ka;
+    mcaste->priv_data = vnic;
+    mcaste->attach_bit_nr = FIP_MCAST_VHUB_KA;
+    memset(mcaste->mac, 0, ETH_ALEN);
+    vhub_mgid_create(vnic->login_data.mgid_prefix,
+                     mcaste->mac,
+                     vnic->login_data.n_mac_mcgid,
+                     vnic->login_data.vhub_id, VHUB_MGID_KA,
+                     0, &mgid);
+    mcaste->gid = mgid.ib_gid;
+    mcaste->port_gid = mcaste->gid;
+    mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC);
+    mcaste->backoff_factor = 1;
+    mcaste->retry = VNIC_MCAST_MAX_RETRY;
+    mcaste->attach_cb = fip_vnic_mcast_cnct_cb;
+    mcaste->detach_cb = fip_vnic_mcast_deattach_cb;
+    mcaste->attach_cb_ctx = NULL;
+    mcaste->detach_cb_ctx = NULL;
+    mcaste->blocking = 0;
+    mcaste->qkey = VNIC_FIP_QKEY;
+    mcaste->pkey = vnic->pkey;
+    mcaste->qp = vnic->qp;
+    mcaste->create = vnic_mcast_create;
+    mcaste->join_state = 1;
+    mcaste->sender_only = 1;
+    vnic->ka_mcast_gid = mcaste->gid;
+    rc = vnic_mcast_add(&vnic->mcast_tree, mcaste);
+    ASSERT(!rc);
+    rc = vnic_mcast_attach(&vnic->mcast_tree, mcaste);
+    ASSERT(!rc);
+
+    return 0;
+
+free_tbl:
+    vnic_mcast_dealloc(mcaste_tbl);
+
+free_upd:
+    vnic_mcast_dealloc(mcaste_upd);
+
+    return rc;
+}
+
+/*
+ * This function is the driving engine of the vnic logic. It manages the
+ * vnic's state machines.
+ * Some of the states in the state machine could have been removed because
+ * they contain "actions" and not states. Still, it is easier to maintain
+ * the code this way and it gives an easy mechanism for exception handling
+ * and retries.
+ * Only call this function from fip_wq context.
+ */
+void fip_vnic_fsm(struct work_struct *work)
+{
+    struct fip_vnic_data *vnic =
+        container_of(work, struct fip_vnic_data, vnic_task.work);
+    struct vnic_port *port = vnic->port;
+    int rc, recall_time = 0;
+    const long int msec_in_sec = 1000;
+    struct fip_vnic_send_info gw_address;
+    ktime_t ktime;
+
+    vnic_dbg_fip(port->name, "fip_vnic_fsm called vnic %d\n",
+                 vnic->vnic_id);
+
+    if (vnic->flush != FIP_NO_FLUSH)
+        return;
+
+    switch (vnic->state) {
+    case FIP_VNIC_CLOSED:
+        break;
+    case FIP_VNIC_HADMIN_IDLE:
+        if (vnic->gw->state < FIP_GW_CONNECTED)
+            break;
+        fip_vnic_create_gw_param(&gw_address, vnic->gw->info.gw_qpn, VNIC_FIP_QKEY,
+                                 vnic->gw->info.gw_lid, vnic_gw_ctrl_sl(vnic->gw));
+        fip_vnic_set_gw_param(vnic, &gw_address);
+        /* fall through */
+
+    case FIP_VNIC_LOGIN:
+        vnic_dbg_fip(port->name, "FIP_VNIC_LOGIN vnic %d\n",
+                     vnic->vnic_id);
+        /* get data QP numbers needed for the login request packet. If we
+         * fail we will close the vnic entirely */
+        rc = fip_vnic_login_init(vnic->port, vnic);
+        if (rc) {
+            fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+            vnic_warn(vnic->name, "fip_vnic_login_init failed, "
+                      "closing vnic rc %d\n", rc);
+            break;
+        }
+        vnic->state = FIP_VNIC_WAIT_4_ACK;
+        /* fall through */
+
+    case FIP_VNIC_WAIT_4_ACK:
+        vnic_dbg_fip(port->name, "FIP_VNIC_WAIT_4_ACK vnic %d\n",
+                     vnic->vnic_id);
+        /* resend login request every timeout */
+        vnic_dbg_fip(port->name, "fip_login_send vnic %d\n", vnic->vnic_id);
+        rc = fip_login_send(vnic);
+        if (!rc)
+            recall_time = FIP_LOGIN_TIMEOUT * msec_in_sec;
+        else
+            recall_time = 1 * msec_in_sec;
+
+        goto queue_vnic_work;
+
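+    /*
+     * Timing note: recall_time is in milliseconds (msec_in_sec == 1000)
+     * and is converted to jiffies at queue_vnic_work below, so a
+     * successful login send re-queues the FSM after FIP_LOGIN_TIMEOUT
+     * seconds while a failed send retries after one second.
+     */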
+    case FIP_VNIC_RINGS_INIT:
+        /* create QP and rings */
+        rc = fip_vnic_rings_create(vnic->port, vnic);
+        if (rc) {
+            fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+            vnic_warn(vnic->name, "fip_vnic_rings_create failed, "
+                      "closing vnic rc=%d\n", rc);
+            break;
+        }
+
+        vnic->last_send_jiffs = 1; /* use a non zero value to start transmission */
+        {
+            /* start vnic UCAST KA packets; this will also cause the BXM
+             * to send us the neighbor table */
+            if (vnic->gw->info.gw_type != GW_TYPE_LAG) {
+                ktime = ktime_set(0, 0);
+#ifdef _BP_HR_TIMER
+                hrtimer_start(&vnic->keepalive_timer, ktime, HRTIMER_REL);
+#else
+                hrtimer_start(&vnic->keepalive_timer, ktime, HRTIMER_MODE_REL);
+#endif
+            }
+        }
+
+        vnic->state = FIP_VNIC_MCAST_INIT;
+        /* fall through */
+
+    case FIP_VNIC_MCAST_INIT:
+        rc = fip_vnic_mcast_cnct(vnic);
+        if (rc) {
+            vnic_warn(vnic->name,
+                      "fip_vnic_mcast_cnct failed, rc %d\n", rc);
+            /* try again later */
+            recall_time = 1 * msec_in_sec;
+            goto queue_vnic_work;
+        }
+        vnic->state = FIP_VNIC_MCAST_INIT_DONE;
+        /* fall through */
+
+    case FIP_VNIC_MCAST_INIT_DONE:
+        /* wait for the mcast attach CB before continuing */
+        break;
+
+    case FIP_VNIC_VHUB_INIT:
+        /* the previous KA, if sent, did not request a table because MCASTs
+         * were not available. Send an extra KA packet that should trigger
+         * a table request in order to hasten things up */
+        fip_vnic_keepalive_send(vnic, 1);
+
+        if (vnic->gw->info.gw_type == GW_TYPE_LAG) {
+            /* start vnic MCAST KA packets; this will also cause the BXM
+             * to send us the neighbor table */
+            ktime = ktime_set(0, 0);
+#ifdef _BP_HR_TIMER
+            hrtimer_start(&vnic->keepalive_timer, ktime, HRTIMER_REL);
+#else
+            hrtimer_start(&vnic->keepalive_timer, ktime, HRTIMER_MODE_REL);
+#endif
+        }
+
+        /* start tracking GW keep alives, calls fip_vnic_gw_alive() */
+        QUEUE_VNIC_DWORK(vnic, &vnic->vnic_gw_alive_task,
+                         vnic->gw->info.gw_period);
+
+        vnic->state = FIP_VNIC_VHUB_INIT_DONE;
+        /* fall through */
+
+    case FIP_VNIC_VHUB_INIT_DONE:
+        /* we are waiting to receive a full vhub table. The KA will handle
+         * retries if we do not get the table we are expecting */
+
+        /* queue login create request */
+        if (fip_vnic_test_login(vnic, 1)) {
+            recall_time = 1 * msec_in_sec;
+            goto queue_vnic_work;
+        }
+
+        break;
+
+    case FIP_VNIC_VHUB_DONE:
+        if (fip_vnic_test_login(vnic, 1)) {
+            recall_time = 1 * msec_in_sec;
+            goto queue_vnic_work;
+        }
+
+        if (vnic_login_complete_ack(vnic, &vnic->login_data, &vnic->shared_vnic)) {
+            vnic_warn(vnic->name,
+                      "vnic_login_complete_ack failed\n");
+            recall_time = 1 * msec_in_sec;
+            goto queue_vnic_work;
+        }
+
+        /* for LAG write member info */
+        fip_vnic_write_members(vnic);
+
+        vnic->state = FIP_VNIC_VHUB_WRITE;
+        /* fall through */
+
+    case FIP_VNIC_VHUB_WRITE:
+        /* write the vhub table to login */
+        fip_vnic_write_tbl(vnic);
+        vnic->state = FIP_VNIC_CONNECTED;
+        /* fall through */
+
+    case FIP_VNIC_CONNECTED:
+        vnic->retry_count = 0;
+        break;
+    default:
+        ASSERT(0);
+        break;
+    }
+
+    vnic_dbg_fip(port->name, "state %d gw_lid %d gw_qpn %d\n",
+                 vnic->state, vnic->gw_address.gw_lid, vnic->gw_address.gw_qpn);
+    return;
+
+queue_vnic_work:
+    QUEUE_VNIC_DWORK(vnic, &vnic->vnic_task, recall_time * HZ / msec_in_sec);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_main.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_main.c
new file mode 100644
index 0000000000000..07a6f2ebe54d7
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_main.c
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_fip.h"
+#include "vnic_fip_discover.h"
+
+struct workqueue_struct *fip_wq;
+
+void fip_refresh_mcasts(struct fip_discover *discover)
+{
+    struct fip_gw_data *gw;
+    struct fip_vnic_data *vnic;
+
+    fip_discover_mcast_reattach(discover, discover->port);
+
+    down_read(&discover->l_rwsem);
+    list_for_each_entry(gw, &discover->gw_list, list)
+        list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) {
+            if (vnic->flush != FIP_FULL_FLUSH && vnic->state > FIP_VNIC_MCAST_INIT)
+                vnic_tree_mcast_detach(&vnic->mcast_tree);
+        }
+
+    list_for_each_entry(gw, &discover->gw_list, list) {
+        list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) {
+            if (vnic->flush != FIP_FULL_FLUSH && vnic->state > FIP_VNIC_MCAST_INIT)
+                vnic_tree_mcast_attach(&vnic->mcast_tree);
+        }
+        /* restart path query */
+        if (vnic_sa_query && gw->state >= FIP_GW_CTRL_PATH_QUERY && gw->flush == FIP_NO_FLUSH)
+            fip_discover_gw_fsm_move(gw, FIP_GW_CTRL_PATH_QUERY);
+    }
+    up_read(&discover->l_rwsem);
+}
+
+void port_fip_discover_restart(struct work_struct *work)
+{
+    struct vnic_port *port =
+        container_of(work, struct vnic_port, discover_restart_task.work);
+    struct fip_discover *discover;
+    struct vnic_login *login;
+
+    vnic_dbg_mark();
+    mutex_lock(&port->start_stop_lock);
+    vnic_dbg_mark();
+    mutex_lock(&port->mlock);
+    if (vnic_port_query(port))
+        vnic_warn(port->name, "vnic_port_query failed\n");
+
+    /* bring vnic links down */
+    list_for_each_entry(login, &port->login_list, list)
+        vnic_mcast_del_all(&login->mcast_tree);
+
+    mutex_unlock(&port->mlock);
+    list_for_each_entry(discover, &port->fip.discover_list, discover_list) {
+        if (fip_discover_cleanup(port, discover, 0)) {
+            vnic_dbg(port->name, "fip_discover_cleanup flushed\n");
+            goto out;
+        }
+    }
+
+    list_for_each_entry(discover, &port->fip.discover_list, discover_list) {
+        if (fip_discover_init(port, discover, discover->pkey, 0)) {
+            vnic_warn(port->name, "failed to alloc discover resources\n");
+        }
+    }
+out:
+    mutex_unlock(&port->start_stop_lock);
+    return;
+}
+
+void vnic_port_fip_cleanup(struct vnic_port *port, int lock)
+{
+    struct fip_discover *discover, *tmp_discover;
+
+    if (lock)
+        mutex_lock(&port->start_stop_lock);
+
+    list_for_each_entry_safe(discover, tmp_discover, &port->fip.discover_list, discover_list) {
+        vnic_dbg_fip_p0(port->name, "Discovery cleanup of PKEY=0x%x\n", discover->pkey);
+
+        list_del(&discover->discover_list);
+        vnic_info("Removed fip discovery %s port %d pkey 0x%x\n",
+                  port->dev->ca->name, port->num, discover->pkey);
+        fip_discover_cleanup(port, discover, 1);
+        kfree(discover);
+    }
+
+    if (lock)
+        mutex_unlock(&port->start_stop_lock);
+}
+
+int vnic_port_fip_init(struct vnic_port *port)
+{
+    int rc;
+    struct fip_discover *discover;
+    int i;
+
+    if (no_bxm)
+        return 0;
+
+    vnic_discovery_pkeys_count = vnic_discovery_pkeys_count > MAX_NUM_PKEYS_DISCOVERY ?
+        MAX_NUM_PKEYS_DISCOVERY : vnic_discovery_pkeys_count;
+
+    if (vnic_discovery_pkeys_count == 0 ||
+        (vnic_discovery_pkeys_count == MAX_NUM_PKEYS_DISCOVERY &&
+         vnic_discovery_pkeys[0] == 0)) {
+        vnic_discovery_pkeys[0] = 0xffff;
+        vnic_discovery_pkeys_count = 1;
+        vnic_dbg_fip_p0(port->name, "Creating default PKEY for Discovery\n");
+    }
+
+    mutex_lock(&port->start_stop_lock);
+
+    for (i = 0; i < vnic_discovery_pkeys_count; i++) {
+        vnic_discovery_pkeys[i] &= 0xffff;
+        vnic_discovery_pkeys[i] |= 0x8000;
+
+        vnic_dbg_fip_p0(port->name, "Init Discovery=%d on PKEY=0x%x\n", i, vnic_discovery_pkeys[i]);
+
+        discover = kzalloc(sizeof(struct fip_discover), GFP_KERNEL);
+        if (!discover) {
+            vnic_warn(port->name, "discover alloc failed\n");
+            rc = -ENOMEM;
+            goto fail;
+        }
+
+        INIT_LIST_HEAD(&discover->discover_list);
+
+        vnic_info("Added fip discovery %s port %d PKEY 0x%x\n",
+                  port->dev->ca->name, port->num,
+                  vnic_discovery_pkeys[i]);
+
+        list_add_tail(&discover->discover_list, &port->fip.discover_list);
+        rc = fip_discover_init(port, discover, vnic_discovery_pkeys[i], 1);
+        if (rc) {
+            vnic_warn(port->name, "fip_discover_init pkey=0x%x "
+                      "failed\n", discover->pkey);
+            list_del(&discover->discover_list);
+            kfree(discover);
+            goto fail;
+        }
+    }
+    mutex_unlock(&port->start_stop_lock);
+    return 0;
+
+fail:
+    mutex_unlock(&port->start_stop_lock);
+    vnic_port_fip_cleanup(port, 1);
+    return rc;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_pkt.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_pkt.c
new file mode 100644
index 0000000000000..078d4aa0ea5f2
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_pkt.c
@@ -0,0 +1,863 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include
+#include "vnic.h"
+#include "vnic_fip.h"
+#include "vnic_fip_discover.h"
+#include "vnic_fip_pkt.h"
+
+const struct eoib_host_update base_update_pkt = {
+    .fip.subcode = FIP_HOST_ALIVE_SUB_OPCODE,
+    .fip.type.type = FIP_FIP_HDR_TYPE,
+    .fip.type.length = FIP_FIP_HDR_LENGTH,
+    .fip.vendor_id = FIP_VENDOR_MELLANOX,
+
+    .type_1.type = FIP_HOST_UPDATE_TYPE,
+    .type_1.length = FIP_HOST_UPDATE_LENGTH,
+    .vendor_id = FIP_VENDOR_MELLANOX,
+};
+
+const struct eoib_host_update base_logout_pkt = {
+    .fip.subcode = FIP_HOST_LOGOUT_SUB_OPCODE,
+    .fip.type.type = FIP_FIP_HDR_TYPE,
+    .fip.type.length = FIP_FIP_HDR_LENGTH,
+    .fip.vendor_id = FIP_VENDOR_MELLANOX,
+
+    .type_1.type = FIP_LOGOUT_TYPE_1,
+    .type_1.length = FIP_LOGOUT_LENGTH_1,
+    .vendor_id = FIP_VENDOR_MELLANOX,
+};
+
+static int extract_adv_extended(struct fip_ext_desc_tlv *fed,
+                                struct fip_gw_data_info *info)
+{
+    struct fip_ext_type_cap *extended_cap;
+    struct fip_ext_type_boot *extended_boot;
+    struct fip_ext_type_power_cycle_id *extended_pc_id;
+    struct fip_ext_type_lag_props *extended_lag = NULL;
+    struct fip_extended_type *ext_hdr;
+    int length_to_go, ext_length;
+
+    vnic_dbg_parse("", "extracting extended descriptor\n");
+
+    length_to_go = (((int)fed->ft.length) << 2) - sizeof(*fed);
+    ext_hdr = (struct fip_extended_type *)(fed + 1);
+
+    while (length_to_go > 0) {
+        ext_length = ((int)ext_hdr->len) << 2;
+
+        vnic_dbg_parse(NULL, "Advertise parse, sub-tlv "
+                       "type %d length %d address=%p\n",
+                       ext_hdr->ext_type, ext_length, ext_hdr);
+
+        if (ext_length < sizeof(*ext_hdr) ||
+            ext_length > length_to_go) {
+            vnic_dbg_parse(NULL, "Extended length error. "
+                           "Length=%d\n", ext_length);
+            return -EINVAL;
+        }
+
+        if (ext_hdr->ext_type == ADV_EXT_TYPE(CAP) &&
+            ext_length == sizeof(*extended_cap)) { /* capabilities */
+            /* do nothing */
+        } else if (ext_hdr->ext_type == ADV_EXT_TYPE(LAG) && /* LAG */
+                   ext_length == sizeof(*extended_lag)) {
+            extended_lag = (struct fip_ext_type_lag_props *)ext_hdr;
+            info->gw_type = extended_lag->gw_type;
+            info->ext_lag.hash = be16_to_cpu(extended_lag->lag_hash);
+            info->ext_lag.weights_policy = extended_lag->weight_policy_flags >> 4;
+            info->ext_lag.member_ka = (extended_lag->weight_policy_flags & 0x8) >> 3;
+            info->ext_lag.ca = !!(extended_lag->weight_policy_flags &
+                                  FIP_EXT_LAG_W_POLICY_HOST);
+            info->ext_lag.ca_thresh = extended_lag->ca_threshold;
+            info->ext_lag.ucast = !!(extended_lag->weight_policy_flags &
+                                     FIP_EXT_LAG_W_POLICY_UCAST);
+            info->ext_lag.valid = 1;
+        } else if (ext_hdr->ext_type == ADV_EXT_TYPE(BOOT) &&
+                   ext_length == sizeof(*extended_boot)) { /* boot */
+            extended_boot = (struct fip_ext_type_boot *)ext_hdr;
+            info->ext_boot.boot_prio = extended_boot->boot_prio;
+            info->ext_boot.timeout = extended_boot->discovery_timeout;
+            info->ext_boot.valid = 1;
+        } else if (ext_hdr->ext_type == ADV_EXT_TYPE(PC_ID) &&
+                   ext_length == sizeof(*extended_pc_id)) { /* Power Cycle ID */
+            extended_pc_id = (struct fip_ext_type_power_cycle_id *)ext_hdr;
+            info->ext_pc_id.power_cycle_id =
+                be64_to_cpu(extended_pc_id->power_cycle_id);
+            info->ext_pc_id.valid = 1;
Skipping, type" + " %d length %d\n", + ext_hdr->ext_type, ext_length); + + ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length); + length_to_go -= ext_length; + } + + return 0; +} + +int fip_advertise_parse_bh(struct fip_discover *discover, struct fip_content *fc, + struct fip_gw_data *data) +{ + long ka_time; + int err = 0; + + /* make sure we have at least a single address descriptor */ + if (fc->fa.num < 1 || !fc->fgwi || !fc->fgid || !fc->fka) + return -EINVAL; + + data->info.flags = be16_to_cpu(fc->fh->flags) & FIP_FIP_ADVRTS_FLAG ? FIP_GW_AVAILABLE : 0; + + data->info.flags |= + (be16_to_cpu(fc->fh->flags) & FIP_FIP_SOLICITED_FLAG) ? 0 : + FIP_RCV_MULTICAST; + + data->info.flags |= FIP_IS_FIP; + data->info.flags |= (fc->fh->flags & FIP_ADVERTISE_HOST_VLANS) ? + FIP_HADMINED_VLAN : 0; + + data->info.gw_qpn = be32_to_cpu(fc->fa.fa[0]->gwtype_qpn) & 0xffffff; + data->info.gw_lid = be16_to_cpu(fc->fa.fa[0]->lid); + data->info.gw_port_id = be16_to_cpu(fc->fa.fa[0]->sl_gwportid) & + FIP_ADVERTISE_GW_PORT_ID_MASK; + data->info.gw_sl = be16_to_cpu(fc->fa.fa[0]->sl_gwportid) >> FIP_ADVERTISE_SL_SHIFT; /*ignore this value.*/ + memcpy(data->info.gw_guid, fc->fa.fa[0]->guid, sizeof(data->info.gw_guid)); + data->info.gw_num_vnics = be16_to_cpu(fc->fgwi->n_rss_qpn_vnics) & + FIP_ADVERTISE_NUM_VNICS_MASK; + + data->info.n_rss_qpn = be16_to_cpu(fc->fgwi->n_rss_qpn_vnics) >> + FIP_ADVERTISE_N_RSS_SHIFT; + data->info.hadmined_en = (fc->fgwi->h_nmac_mgid & FIP_ADVERTISE_HOST_EN_MASK); + data->info.all_vlan_gw = !!(fc->fgwi->h_nmac_mgid & FIP_ADVERTISE_ALL_VLAN_GW_MASK); + + TERMINATED_MEMCPY(data->info.gw_vendor_id, fc->fgwi->vendor_id); + memcpy(data->info.vol_info.system_guid, fc->fgid->sys_guid, + sizeof(data->info.vol_info.system_guid)); + TERMINATED_MEMCPY(data->info.vol_info.system_name, + fc->fgid->sys_name); + TERMINATED_MEMCPY(data->info.vol_info.gw_port_name, fc->fgid->gw_port_name); + + ka_time = be32_to_cpu(fc->fka->adv_period); + ka_time = ka_time ? ka_time : FKA_ADV_PERIOD; + /* do not let KA go under 2 secs */ + ka_time = (ka_time < 2000) ? 2000 : ka_time; + data->info.gw_adv_period = FIP_TIMEOUT_FACTOR(msecs_to_jiffies(ka_time)); + + ka_time = be32_to_cpu(fc->fka->ka_period); + ka_time = ka_time ? ka_time : FKA_ADV_PERIOD; + data->info.gw_period = FIP_TIMEOUT_FACTOR(msecs_to_jiffies(ka_time)); + + ka_time = be32_to_cpu(fc->fka->vnic_ka_period); + ka_time = ka_time ? 
+    data->info.vnic_ka_period = msecs_to_jiffies(ka_time);
+
+    data->info.gw_type = GW_TYPE_SINGLE_EPORT;
+    if (fc->fed.num > 0) {
+        if (fc->fed.num == 1) {
+            /* new version bxm mode */
+            data->info.gw_prot_new = 1;
+            err = extract_adv_extended(fc->fed.fed[0], &data->info);
+            if (err)
+                vnic_dbg_parse(discover->name, "invalid extended descriptor\n");
+        } else {
+            vnic_dbg_parse(discover->name, "too many extended descriptors\n");
+            return -EINVAL;
+        }
+    }
+
+    return err;
+}
+
+static int send_generic_mcast_pkt(struct vnic_port *port,
+                                  struct fip_ring *tx_ring,
+                                  void *mem, int pkt_size,
+                                  struct ib_qp *qp,
+                                  int pkey_index,
+                                  struct vnic_mcast *mcast)
+{
+    int index, rc;
+    unsigned long flags;
+    unsigned long tail;
+
+    /*
+     * we are only allowed to update the head at task level so no need to
+     * perform any locks here
+     */
+    spin_lock_irqsave(&tx_ring->ring_lock, flags);
+    index = tx_ring->head & (tx_ring->size - 1);
+    vnic_dbg_fip(port->name, "mcast packet\n");
+
+    spin_lock(&tx_ring->head_tail_lock);
+    tail = tx_ring->tail;
+    spin_unlock(&tx_ring->head_tail_lock);
+
+    /* ring full, try again */
+    if (tx_ring->head - tail >= tx_ring->size) {
+        vnic_warn(port->name, "send_generic_mcast_pkt ring full: QPN 0x%x: tail=%ld head=%ld diff=%ld\n",
+                  qp->qp_num, tx_ring->tail, tx_ring->head, tx_ring->head - tx_ring->tail);
+        rc = -EAGAIN;
+        goto err;
+    }
+
+    rc = _map_generic_pkt(port, &tx_ring->ring[index], mem, pkt_size);
+    if (rc)
+        goto err;
+
+    rc = fip_mcast_send(port, qp, index,
+                        tx_ring->ring[index].bus_addr,
+                        pkt_size, pkey_index, mcast);
+    if (rc) {
+        vnic_warn(port->name,
+                  "send_generic_mcast_pkt: fip_mcast_send ret %d\n",
+                  rc);
+        rc = -ENODEV;
+        goto error_unmap_dma;
+    }
+
+    tx_ring->head++;
+
+    spin_unlock_irqrestore(&tx_ring->ring_lock, flags);
+    return 0;
+
+error_unmap_dma:
+    ib_dma_unmap_single(port->dev->ca,
+                        tx_ring->ring[index].bus_addr,
+                        pkt_size, DMA_TO_DEVICE);
+err:
+    spin_unlock_irqrestore(&tx_ring->ring_lock, flags);
+    return rc;
+}
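+
+/*
+ * Note: the TX ring head/tail counters are free-running; an entry slot is
+ * head masked with (size - 1) and "head - tail >= size" detects a full
+ * ring without wrapping the counters (this assumes the ring size is a
+ * power of two).
+ */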
+
+static void *alloc_solicit_pkt(int new_prot, char *node_desc)
+{
+    void *ptr;
+    struct fip_solicit_new *nptr;
+    struct fip_solicit_legacy *optr;
+    int size = new_prot ? sizeof *nptr : sizeof *optr;
+
+    ptr = kzalloc(size, GFP_KERNEL);
+    if (!ptr)
+        return ERR_PTR(-ENOMEM);
+    optr = ptr;
+    optr->version.version = 1;
+    optr->fh.opcode = cpu_to_be16(EOIB_FIP_OPCODE);
+    optr->fh.subcode = FIP_HOST_SOL_SUB_OPCODE;
+    optr->fh.list_length = cpu_to_be16((size - offsetof(typeof(*optr), fvend)) / 4);
+    optr->fvend.ft.type = FIP_TYPE(VENDOR_ID);
+    optr->fvend.ft.length = sizeof optr->fvend / 4;
+    strncpy(optr->fvend.vendor_id, "mellanox", sizeof optr->fvend.vendor_id);
+    optr->addr.ft.type = FIP_TYPE(ADDRESS);
+    optr->addr.ft.length = sizeof optr->addr / 4;
+    strncpy(optr->addr.vendor_id, "mellanox", sizeof optr->addr.vendor_id);
+    if (new_prot) {
+        nptr = ptr;
+        nptr->ext.ft.type = 254;
+        nptr->ext.ft.length = sizeof nptr->ext / 4;
+        strncpy(nptr->ext.vendor_id, "mellanox", sizeof nptr->ext.vendor_id);
+        nptr->ext_cap.et.ext_type = 40;
+        nptr->ext_cap.et.len = sizeof nptr->ext_cap / 4;
+        nptr->ext_cap.et.mandatory = 1;
+        nptr->ext_hostname.et.ext_type = 39;
+        nptr->ext_hostname.et.len = sizeof nptr->ext_hostname / 4;
+        strncpy(nptr->ext_hostname.hostname, node_desc, sizeof nptr->ext_hostname.hostname);
+    }
+
+    return ptr;
+}
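+
+/*
+ * Note: FIP descriptor length fields count 4-byte words (the parser
+ * converts them back to bytes with "length << 2"), which is why each
+ * ft.length and list_length above is a byte count divided by 4.
+ */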
+
+int fip_solicit_send(struct fip_discover *discover,
+                     enum fip_packet_type multicast,
+                     u32 dqpn, u16 dlid, u8 sl, int new_prot)
+{
+    int rc = 0;
+    unsigned long flags, flags1;
+    struct fip_solicit_legacy *optr;
+    int size = new_prot ? sizeof(struct fip_solicit_new) : sizeof *optr;
+
+    ASSERT(discover);
+
+    /* alloc packet to be sent */
+    optr = alloc_solicit_pkt(new_prot, discover->port->dev->ca->node_desc);
+    if (IS_ERR(optr))
+        return PTR_ERR(optr);
+
+    /* we set bit 24 to signify that we're a new host */
+    optr->addr.gwtype_qpn = cpu_to_be32(discover->qp->qp_num | 0x1000000);
+    optr->addr.lid = cpu_to_be16(discover->port->attr.lid);
+    /* send the SL to the GW */
+    optr->addr.sl_gwportid = cpu_to_be16(sl << FIP_ADVERTISE_SL_SHIFT);
+
+    memcpy(optr->addr.guid, &discover->port->gid.global.interface_id, sizeof(optr->addr.guid));
+    vnic_dbg_fip(discover->name, "fip_solicit_send creating multicast %d"
+                 " solicit packet\n", multicast);
+
+    fip_dbg_dump_raw_pkt(0, optr, size, 1, "sending solicit packet");
+
+    if (multicast) {
+        struct vnic_mcast *mcaste;
+        union ib_gid gid;
+
+        memcpy(&gid, fip_solicit_mgid, GID_LEN);
+        spin_lock_irqsave(&discover->mcast_tree.mcast_rb_lock, flags);
+        mcaste = vnic_mcast_search(&discover->mcast_tree, &gid);
+        /* it is possible for the MCAST entry or AH to be missing in
+         * transient states (after events). This is a valid condition,
+         * but we can't send the packet
+         */
+        if (!IS_ERR(mcaste) && mcaste->ah) {
+            spin_lock_irqsave(&mcaste->lock, flags1);
+            rc = send_generic_mcast_pkt(discover->port, &discover->tx_ring,
+                                        optr, size, discover->qp,
+                                        discover->pkey_index,
+                                        mcaste);
+            spin_unlock_irqrestore(&mcaste->lock, flags1);
+        } else
+            kfree(optr);
+
+        spin_unlock_irqrestore(&discover->mcast_tree.mcast_rb_lock, flags);
+    } else {
+        rc = send_generic_ucast_pkt(discover->port, NULL, &discover->tx_ring,
+                                    optr, size, discover->qp,
+                                    discover->pkey_index,
+                                    dqpn, dlid, VNIC_FIP_QKEY, sl);
+    }
+    if (rc)
+        goto error_free_mem;
+
+    return 0;
+
+error_free_mem:
+    vnic_warn(discover->name, "discover_send error ret %d\n", rc);
+    kfree(optr);
+    return -ENOMEM;
+}
+
+static void *alloc_login_pkt(struct fip_vnic_data *vnic)
+{
+    struct eoib_login *ptr;
+    int size = sizeof *ptr;
+
+    ptr = kzalloc(size, GFP_KERNEL);
+    if (!ptr)
+        return ERR_PTR(-ENOMEM);
+
+    ptr->eoib_ver.version = 1;
+    ptr->fh.opcode = cpu_to_be16(EOIB_FIP_OPCODE);
+    ptr->fh.subcode = FIP_HOST_LOGIN_SUB_OPCODE;
+    ptr->fh.list_length = cpu_to_be16((size - offsetof(typeof(*ptr), fvend)) / 4);
+    ptr->fvend.ft.type = FIP_TYPE(VENDOR_ID);
+    ptr->fvend.ft.length = sizeof ptr->fvend / 4;
+    strncpy(ptr->fvend.vendor_id, "mellanox", sizeof ptr->fvend.vendor_id);
+    ptr->fa.ft.type = FIP_TYPE(ADDRESS);
+    ptr->fa.ft.length = sizeof ptr->fa / 4;
+    strncpy(ptr->fa.vendor_id, "mellanox", sizeof ptr->fa.vendor_id);
+    ptr->fa.gwtype_qpn = cpu_to_be32(vnic->qp_base_num);
+    ptr->fa.sl_gwportid = cpu_to_be16(vnic->gw->info.gw_port_id);
+    /* sl will be taken from the data path record query */
+    ptr->fa.sl_gwportid |= cpu_to_be16(vnic->gw->data_prec.sl << FIP_ADVERTISE_SL_SHIFT);
+    ptr->fa.lid = cpu_to_be16(vnic->port->attr.lid);
+    memcpy(ptr->fa.guid, &vnic->port->gid.global.interface_id, sizeof ptr->fa.guid);
+    ptr->fl.ft.type = FIP_TYPE(LOGIN);
+    ptr->fl.ft.length = sizeof ptr->fl / 4;
+    strncpy(ptr->fl.vendor_id, "mellanox", sizeof ptr->fl.vendor_id);
+    ptr->fl.vnic_id = cpu_to_be16(vnic->vnic_id);
+
+    if (vnic->hadmined) {
+        int mac_valid = !IS_ZERO_MAC(vnic->login_data.mac);
+        u16 flags = (mac_valid ? FIP_LOGIN_M_FLAG : 0) |
+            FIP_LOGIN_H_FLAG |
+            (vnic->login_data.vp ? FIP_LOGIN_VP_FLAG | FIP_LOGIN_V_FLAG : 0);
+        ptr->fl.flags_vlan = cpu_to_be16(vnic->login_data.vlan | flags);
+        memcpy(ptr->fl.mac, vnic->login_data.mac, sizeof ptr->fl.mac);
+        memcpy(ptr->fl.vnic_name, vnic->login_data.vnic_name, sizeof ptr->fl.vnic_name);
+
+        // TODO remove this when BXM handles 0 addresses
+        if (!mac_valid)
+            ptr->fl.mac[ETH_ALEN-1] = 1;
+    }
+
+    /* all_vlan mode must be enforced between the host and GW side.
+     * For a host admin vnic with a VLAN we let the host choose the work
+     * mode. If the GW isn't working in that same mode, the login will fail
+     * and the host will enter a login-retry loop.
+     * For a net admin vnic, or a host admin vnic without a vlan, we work
+     * in the mode published by the GW */
+    if (vnic->gw->info.all_vlan_gw &&
+        (!vnic->hadmined ||
+         (vnic->hadmined && !vnic->login_data.vp)))
+        ptr->fl.vfields |= cpu_to_be16(FIP_LOGIN_ALL_VLAN_GW_FLAG);
+
+    ptr->fl.syndrom_ctrl_qpn = cpu_to_be32(vnic->gw->discover->qp->qp_num);
+    ptr->fl.vfields |= cpu_to_be16((vnic->qps_num > 1) << 12);
+
+    /* for child vNics, allow implicit logout */
+    if (vnic->parent_used) {
+        ptr->fl.vfields |= cpu_to_be16(1 << 14);
+        ptr->fl.vfields |= cpu_to_be16(1 << 13);
+    }
+
+    return ptr;
+}
+
+/*
+ * Send a unicast login packet. This function supports both host and
+ * network admined logins. The function returns 0 on success and an
+ * error code on failure.
+ */
function returns 0 on success and + * error code on failure +*/ +int fip_login_send(struct fip_vnic_data *vnic) +{ + int ret; + struct eoib_login *ptr; + + ASSERT(vnic); + ASSERT(vnic->port); + + /* don't send packet because GW does not support this */ + if (vnic->hadmined && !vnic->gw->hadmin_gw) + return 0; + + /* alloc packet to be sent */ + ptr = alloc_login_pkt(vnic); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + fip_dbg_dump_raw_pkt(0, ptr, sizeof *ptr, 1, "sending login packet"); + + ret = send_generic_ucast_pkt(vnic->port, NULL/*ah : create a new ah inside*/, + &vnic->gw->discover->tx_ring, + ptr, sizeof *ptr, vnic->gw->discover->qp, + vnic->gw->discover->pkey_index, + vnic->gw_address.gw_qpn, + vnic->gw_address.gw_lid, + vnic->gw_address.qkey, + vnic_gw_ctrl_sl(vnic->gw)); + if (ret) { + vnic_warn(vnic->port->name, + "fip_login_send: fip_ucast_send ret %d\n", ret); + goto error_free_mem; + } + + return 0; + +error_free_mem: + kfree(ptr); + return -ENOMEM; +} + +/* + * This function creates and sends a few types of packets (all ucast): + * vHub context request - new=1, logout=0 + * vHub context update / vnic keep alive - new=0, logout=0 + * vnic logout - new=0, logout=1 +*/ +int fip_update_send(struct fip_vnic_data *vnic, int request_new, int logout) +{ + struct eoib_host_update *pkt; + struct ib_qp *send_qp; + struct fip_ring *tx_ring; + int pkey_index; + int ret = 0; + + ASSERT(vnic); + ASSERT(vnic->port); + + /* alloc packet to be sent */ + pkt = kmalloc(sizeof *pkt, GFP_ATOMIC); + if (!pkt) { + vnic_warn(vnic->port->name, "fip_update_send malloc failed\n"); + return -EAGAIN; + } + + /* copy keep alive packet template */ + if (logout) + memcpy(pkt, &base_logout_pkt, sizeof(struct eoib_host_update)); + else + memcpy(pkt, &base_update_pkt, sizeof(struct eoib_host_update)); + + pkt->fip.opcode = cpu_to_be16(EOIB_FIP_OPCODE); + pkt->fip.list_length = + cpu_to_be16((sizeof(struct eoib_host_update) >> 2) - 3); + pkt->vnic_id = cpu_to_be16(vnic->vnic_id); + memcpy(pkt->mac, vnic->login_data.mac, sizeof(pkt->mac)); + memcpy(pkt->vnic_name, vnic->login_data.vnic_name, + sizeof(pkt->vnic_name)); + memcpy(pkt->port_guid, &vnic->port->gid.global.interface_id, + sizeof(pkt->port_guid)); + + pkt->vhub_id.vhub_id = cpu_to_be32(vnic->login_data.vhub_id); + + if (!logout) { + pkt->tusn = cpu_to_be32(vnic->vhub_table.main_list.tusn); + send_qp = vnic->qp; + tx_ring = &vnic->tx_ring; + pkey_index = vnic->pkey_index; + + if (vnic->login_data.vp) + pkt->vhub_id.flags.flags |= FIP_HOST_VP_FLAG; + + if (request_new) + pkt->vhub_id.flags.flags |= FIP_HOST_R_FLAG; + else + pkt->vhub_id.flags.flags |= FIP_HOST_U_FLAG; + } else { + send_qp = vnic->gw->discover->qp; + tx_ring = &vnic->gw->discover->tx_ring; + pkey_index = vnic->gw->discover->pkey_index; + } + + if (vnic->gw->info.gw_type == GW_TYPE_LAG && + !vnic->gw->info.ext_lag.ucast && !logout) { + struct vnic_mcast *mcaste; + unsigned long flags; + + spin_lock_irqsave(&vnic->mcast_tree.mcast_rb_lock, flags); + mcaste = vnic_mcast_search(&vnic->mcast_tree, &vnic->ka_mcast_gid); + if (!IS_ERR(mcaste)) { + if (mcaste->ah) { + ret = send_generic_mcast_pkt(vnic->port, &vnic->tx_ring, + pkt, sizeof *pkt, vnic->qp, + vnic->pkey_index, mcaste); + vnic_dbg_parse(vnic->name, "sent multicast keep alive\n"); + } + else { + vnic_dbg_parse(vnic->name, "mcaste %p: ah is null\n", mcaste); + kfree(pkt); + } + } else { + vnic_dbg_parse(vnic->name, "ka mcast not found\n"); + ret = -ENOMEM; + } + spin_unlock_irqrestore(&vnic->mcast_tree.mcast_rb_lock, flags); + + } else 
+ /* For LAG gateway the ah is not up to date and therefore + should not be used */ + ret = send_generic_ucast_pkt(vnic->port, NULL/*ah : create a new ah inside*/, + tx_ring, pkt, sizeof *pkt, + send_qp, + pkey_index, + vnic->gw_address.gw_qpn, + vnic->gw_address.gw_lid, + vnic->gw_address.qkey, + vnic_gw_ctrl_sl(vnic->gw)); + if (ret) { + vnic_warn(vnic->port->name, + "fip_update_send: ret %d\n", ret); + goto error_free_mem; + } + + return 0; + +error_free_mem: + kfree(pkt); + return -ENOMEM; +} + +static void dump_lag_member(struct lag_member *m) +{ + vnic_dbg_lag("", "QPN 0x%x, SL %d, gw_portid 0x%x, LID 0x%x, guid " GUID_FORMAT + ", eport_state %s, weight %d, link_utilization %d\n", + m->qpn, m->sl, m->gw_port_id, m->lid, GUID_ARG(m->guid), + eport_state_str(m->eport_state), m->weight, m->link_utilization); +} + +static inline int handle_lag_member(struct fip_vnic_data *vnic, + struct fip_ext_type_lag_members *ext_lag_membs, + int ext_length) +{ + struct lag_members lag_members; + + extract_memb_extended(ext_lag_membs, ext_length, &lag_members, vnic->name); + + /* propogate change in member state as needed */ + return handle_member_update(vnic, &lag_members); +} + +int extract_vhub_extended(struct fip_ext_desc_tlv *fed, + struct fip_vnic_data *vnic) +{ + struct fip_ext_type_ctrl_iport *ext_ctrl_iport; + struct fip_ext_type_lag_members *ext_lag_memb; + struct fip_extended_type *ext_hdr; + struct fip_vnic_send_info *gw_addr; + int length_to_go, ext_length; + + if (fed->ft.type != 254) + return -EINVAL; + + length_to_go = ((int)(fed->ft.length) << 2) - sizeof(*fed); + ext_hdr = (struct fip_extended_type *)(fed + 1); + + while (length_to_go > 0) { + ext_length = ((int)ext_hdr->len) << 2; + + vnic_dbg_parse(vnic->name, "Table Update parse, sub-tlv " + "type %d length %d address=%p\n", + ext_hdr->ext_type, ext_length, ext_hdr); + + if (ext_length < sizeof(*ext_hdr) || + ext_length > length_to_go) { + vnic_dbg_parse(vnic->name, "Extended length error." + " Length=%d\n", ext_length); + return -EINVAL; + } + + switch (ext_hdr->ext_type) { + case ADV_EXT_TYPE(MEMBER): + ext_lag_memb = (struct fip_ext_type_lag_members *)ext_hdr; + + if (handle_lag_member(vnic, ext_lag_memb, ext_length)) + vnic_dbg_parse(vnic->name, "handle_lag_member() failed"); + break; + case ADV_EXT_TYPE(CTRL_IPORT): + if (ext_length != sizeof(*ext_ctrl_iport)) { + vnic_dbg_parse(vnic->name, "Extended length %d is" + " different than expected\n", + ext_length); + return -EINVAL; + } + + gw_addr = &vnic->gw_address; + ext_ctrl_iport = (struct fip_ext_type_ctrl_iport *)ext_hdr; + gw_addr->gw_qpn = be32_to_cpu(ext_ctrl_iport->gwtype_qpn); + gw_addr->gw_lid = be16_to_cpu(ext_ctrl_iport->lid); + gw_addr->gw_sl = be16_to_cpu(ext_ctrl_iport->sl_gwportid) >> FIP_ADVERTISE_SL_SHIFT; + break; + default: + if (ext_hdr->mandatory & 0x01) { + vnic_dbg_parse(vnic->name, "Unknown mandatory extended type %d length %d\n", + ext_hdr->ext_type, ext_length); + return -EINVAL; + } else { + vnic_dbg_parse(vnic->name, "Unknown non-mandatory extended. 
Skipping, type %d length %d\n", + ext_hdr->ext_type, ext_length); + ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length); + length_to_go -= ext_length; + continue; + } + } + + ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length); + length_to_go -= ext_length; + } + + return 0; +} + +static int extract_login_extended(struct fip_ext_desc_tlv *fed, + struct lag_members *lagm, + char *name) +{ + struct fip_ext_type_lag_members *ext_lag_membs; + struct fip_extended_type *ext_hdr; + int length_to_go, ext_length; + + if (fed->ft.type != 254) + return -EINVAL; + + length_to_go = ((int)(fed->ft.length) << 2) - sizeof(*fed); + ext_hdr = (struct fip_extended_type *)(fed + 1); + + while (length_to_go > 0) { + ext_length = ((int)ext_hdr->len) << 2; + + vnic_dbg_parse(name, "Table Update parse, sub-tlv " + "type %d length %d address=%p\n", + ext_hdr->ext_type, ext_length, ext_hdr); + + if (ext_length < sizeof(*ext_hdr) || + ext_length > length_to_go) { + vnic_dbg_parse(name, "Extended length error." + " Length=%d\n", ext_length); + return -EINVAL; + } + + switch (ext_hdr->ext_type) { + case ADV_EXT_TYPE(MEMBER): + ext_lag_membs = (struct fip_ext_type_lag_members *)ext_hdr; + + extract_memb_extended(ext_lag_membs, ext_length, lagm, name); + + break; + default: + if (ext_hdr->mandatory & 0x01) { + vnic_dbg_parse(name, "Unknown mandatory extended type %d length %d\n", + ext_hdr->ext_type, ext_length); + return -EINVAL; + } else { + vnic_dbg_parse(name, "Unknown non-mandatory extended. Skipping, type %d length %d\n", + ext_hdr->ext_type, ext_length); + ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length); + length_to_go -= ext_length; + continue; + } + } + ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length); + length_to_go -= ext_length; + } + + return 0; +} + +void extract_memb_extended(struct fip_ext_type_lag_members *ext_lag_membs, + int ext_length, + struct lag_members *lagm, + char *name) +{ + struct lag_member *m; + struct fip_ext_type_lag_member *lm; + int nmemb = 0; + int i; + + nmemb = (ext_length - sizeof ext_lag_membs->et) / sizeof *lm; + if (nmemb > MAX_LAG_MEMBERS) { + vnic_dbg_parse(name, "received %d members but max supported is %d. " + "Using only %d\n", nmemb, MAX_LAG_MEMBERS, + MAX_LAG_MEMBERS); + nmemb = MAX_LAG_MEMBERS; + } + + m = lagm->memb; + lm = ext_lag_membs->lagm; + + for (i = 0; i < nmemb; ++i, ++lm, ++m) { + m->qpn = be32_to_cpu(lm->qpn) & 0xffffff; + m->sl = be16_to_cpu(lm->sl_gw_portid) >> 12; + m->gw_port_id = be16_to_cpu(lm->sl_gw_portid) & 0xfff; + m->lid = be16_to_cpu(lm->lid); + memcpy(m->guid, lm->guid, sizeof m->guid); + m->eport_state = lm->eport_state >> 6; + m->weight = lm->weight; + m->link_utilization = lm->link_utilization; + dump_lag_member(m); + } + lagm->num = nmemb; + + vnic_dbg_parse(name, "Table Update extended parse finished OK. Num members=%d\n", + lagm->num); + return; +} + +/* + * Parse a packet that is suspected of being a login ack packet. The function + * returns 0 for a valid login ack packet and an error code otherwise. The + * packet's "interesting" details are returned in data.
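[The two walkers above share the same sub-TLV discipline: lengths are carried in 4-byte words, a malformed length aborts the parse, and unknown types are skipped unless their mandatory bit is set. A self-contained userspace sketch of that loop; the struct layout and type values here are illustrative only.]

#include <stdint.h>
#include <stdio.h>

struct ext_hdr {
	uint8_t ext_type;
	uint8_t len;		/* length in 4-byte words, header included */
	uint8_t reserved;
	uint8_t mandatory;	/* bit 0: abort on unknown type */
};

static int walk_sub_tlvs(const uint8_t *buf, int bytes)
{
	while (bytes > 0) {
		const struct ext_hdr *h = (const void *)buf;
		int len = (int)h->len << 2;

		if (len < (int)sizeof(*h) || len > bytes)
			return -1;		/* malformed length: abort */
		switch (h->ext_type) {
		/* known types would be handled here */
		default:
			if (h->mandatory & 0x01)
				return -1;	/* unknown but mandatory: abort */
			break;			/* unknown, non-mandatory: skip */
		}
		buf += len;
		bytes -= len;
	}
	return 0;
}

int main(void)
{
	uint8_t pkt[8] = { 99, 2, 0, 0 };	/* one unknown, non-mandatory TLV */
	printf("walk: %d\n", walk_sub_tlvs(pkt, sizeof(pkt)));
	return 0;
}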
+ */ +int fip_login_parse(struct fip_discover *discover, struct fip_content *fc, + struct fip_login_data *data) +{ + u32 vfields; + int err = 0; + + data->syndrome = be32_to_cpu(fc->fl->syndrom_ctrl_qpn) >> 24; + data->vnic_id = be16_to_cpu(fc->fl->vnic_id); + data->lid = be16_to_cpu(fc->fa.fa[0]->lid); + data->port_id = be16_to_cpu(fc->fa.fa[0]->sl_gwportid) & 0xfff; + data->sl = be16_to_cpu(fc->fa.fa[0]->sl_gwportid) >> FIP_ADVERTISE_SL_SHIFT; + data->qpn = be32_to_cpu(fc->fa.fa[0]->gwtype_qpn) & 0xffffff; + memcpy(data->guid, fc->fa.fa[0]->guid, sizeof(data->guid)); + + if (be16_to_cpu(fc->fl->flags_vlan) & FIP_LOGIN_VP_FLAG) { + data->vp = 1; + data->vlan = be16_to_cpu(fc->fl->flags_vlan) & 0xfff; + } + data->all_vlan_gw = !!(be16_to_cpu(fc->fl->vfields) & FIP_LOGIN_ALL_VLAN_GW_FLAG); + + data->vhub_id = CREATE_VHUB_ID(cpu_to_be16(data->vlan), data->port_id); + + data->ctl_qpn = be32_to_cpu(fc->fl->syndrom_ctrl_qpn) & FIP_LOGIN_CTRL_QPN_MASK; + vfields = be16_to_cpu(fc->fl->vfields); + data->n_mac_mcgid = vfields & FIP_LOGIN_DMAC_MGID_MASK; + data->n_rss_mgid = vfields >> 8 & 0xf; + /* data->rss = pkt->rss & FIP_LOGIN_RSS_MASK; it's redundant in login ack */ + data->pkey = be16_to_cpu(fc->fp->pkey); + data->mtu = be16_to_cpu(fc->fl->mtu); + + memcpy(data->mac, fc->fl->mac, sizeof(data->mac)); + memcpy(data->mgid_prefix, fc->fl->eth_gid_prefix, sizeof(data->mgid_prefix)); + memcpy(data->vnic_name, fc->fl->vnic_name, sizeof(data->vnic_name)); + memcpy(data->vendor_id, fc->fl->vendor_id, sizeof(data->vendor_id)); + + if (fc->fed.num) + err = extract_login_extended(fc->fed.fed[0], &data->lagm, discover->name); + + return err; +} + +/* + * Check if a received packet is a FIP packet, And if so return its subtype. + * The FIP type is also returned in fip_type and can be either EOIB_FIP_OPCODE + * or FCOIB_FIP_OPCODE. If the packet is not a FIP packet -EINVAL is returned. +*/ +int fip_pkt_parse(char *buffer, int length, int *fip_type) +{ + struct fip_fip_header *fip_header; + u16 fip_opcode; + + fip_header = (struct fip_fip_header *) + (buffer + IB_GRH_BYTES + sizeof(struct fip_eoib_ver)); + + fip_opcode = be16_to_cpu(fip_header->opcode); + + if (fip_opcode != EOIB_FIP_OPCODE) { + *fip_type = 0; + return -EINVAL; + } + + *fip_type = fip_opcode; + + return fip_header->subcode; +} + +/* + * Already know that this is a FIP packet, return its subtype. +*/ +int fip_pkt_get_subtype_bh(char *buffer) +{ + struct fip_fip_header *fip_header; + + fip_header = (struct fip_fip_header *) + (buffer + sizeof(struct fip_eoib_ver)); + + return fip_header->subcode; +} + diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_pkt.h b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_pkt.h new file mode 100644 index 0000000000000..32e34fce15252 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_pkt.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _FIP_DISCOVER_PKT_H +#define _FIP_DISCOVER_PKT_H + +#include + + + +#endif /* _FIP_DISCOVER_H */ diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_vhub.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_vhub.c new file mode 100644 index 0000000000000..8bcd6d0b69801 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_vhub.c @@ -0,0 +1,635 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_fip.h" +#include "vnic_fip_discover.h" +#include "vnic_fip_pkt.h" + +/* + * construct an mgid address based on vnic login information and the type + * variable (data mcast / vhub update / vhub table). The resulting mgid + * is returned in *mgid. + */ +void vhub_mgid_create(const char *mgid_prefix, + const char *mmac, /* mcast mac for bcast 0xFF.. 
*/ + u64 n_mac, /* bits to take from mmac */ + u32 vhub_id, + enum vhub_mgid_type type, + u8 rss_hash, + union vhub_mgid *mgid) +{ + u32 vhub_id_be; + u64 mac_mask; + u64 *mac_ptr; + u64 one = 1; /* must do that for shift bitwise operation */ + + memcpy(mgid->mgid.mgid_prefix, mgid_prefix, + sizeof(mgid->mgid.mgid_prefix)); + mgid->mgid.type = (u8)type; + memcpy(mgid->mgid.dmac, mmac, sizeof(mgid->mgid.dmac)); + mac_mask = cpu_to_le64(((one << n_mac) - one) | 0xFFFF000000000000ULL); + mac_ptr = (u64*)(mgid->mgid.dmac); + *mac_ptr &= mac_mask; + mgid->mgid.rss_hash = rss_hash; + vhub_id_be = cpu_to_be32(vhub_id); + memcpy(mgid->mgid.vhub_id, ((u8 *) &vhub_id_be) + 1, + sizeof(mgid->mgid.vhub_id)); +}; + +/* + * Init the vnic's vHub table data structures, before using them + */ +void vhub_ctx_init(struct fip_vnic_data *vnic) +{ + INIT_LIST_HEAD(&vnic->vhub_table.main_list.vnic_list); + vnic->vhub_table.main_list.tusn = 0; + vnic->vhub_table.main_list.count = 0; + vnic->vhub_table.main_list.total_count = 0; + + INIT_LIST_HEAD(&vnic->vhub_table.update_list.vnic_list); + vnic->vhub_table.update_list.tusn = 0; + vnic->vhub_table.update_list.count = 0; + vnic->vhub_table.update_list.total_count = 0; + + vnic->vhub_table.checksum = 0; + vnic->vhub_table.tusn = 0; + vnic->vhub_table.state = VHUB_TBL_INIT; +} + +/* print vhub context table */ +static void vhub_ctx_prnt(struct fip_vnic_data *vnic, + struct vhub_elist *vhub_list, int level) +{ + struct vnic_table_entry *vnic_entry; + + if (!(vnic_msglvl & VNIC_DEBUG_VHUB_V)) + return; + + vnic_dbg_vhub_v(vnic->name, "Dumping context table. Count %d tusn %d\n", + vhub_list->count, vhub_list->tusn); + + list_for_each_entry(vnic_entry, &vhub_list->vnic_list, list) { + vnic_dbg_vhub_v(vnic->name, "lid 0x%04x qpn 0x%06x, mac " + MAC_6_PRINT_FMT"\n", vnic_entry->lid, + vnic_entry->qpn, + MAC_6_PRINT_ARG(vnic_entry->mac)); + } +} + +void vhub_table_free(struct vhub_elist *elist) +{ + struct vnic_table_entry *del_vnic, *tmp_vnic; + + list_for_each_entry_safe(del_vnic, tmp_vnic, &elist->vnic_list, list) { + list_del(&del_vnic->list); + kfree(del_vnic); + } +} + +/* + * Clear and free the vnic's vHub context table data structures. + */ +void vhub_ctx_free(struct fip_vnic_data *vnic) +{ + vnic_dbg_fip_v(vnic->name, "vhub_ctx_free called\n"); + + vhub_table_free(&vnic->vhub_table.main_list); + vhub_table_free(&vnic->vhub_table.update_list); + + vhub_ctx_init(vnic); +} + +static struct vnic_table_entry *vhub_find_entry(struct vhub_elist *vnic_list, + u16 lid, u32 qpn) +{ + struct vnic_table_entry *tmp_vnic; + + list_for_each_entry(tmp_vnic, &vnic_list->vnic_list, list) { + if (tmp_vnic->lid == lid && tmp_vnic->qpn == qpn) + return tmp_vnic; + } + return NULL; +} + +/* + * Move vHub context entries from the update list to the main list. The update + * list is used during the wait for the main table to be updated. Once + * the table update is completed the entries need to be moved from the update + * table to the main table. This function does this. +*/ +static int vhub_update_main(struct fip_vnic_data *vnic, + struct vhub_elist *main_list, + struct vhub_elist *update_list) +{ + struct vnic_table_entry *new_entry, *tmp_vnic, *del_vnic; + int first_tusn = (u32) update_list->tusn - (update_list->count - 1); + int extra_tusn; + + /* update list is usually empty */ + if (likely(update_list->count == 0)) + return 0; + + if (first_tusn > main_list->tusn + 1) { + vnic_warn(vnic->name, "Info, vhub_to_main_tbl sync main to" + " update list failed. 
update tusn %d update " + "first %d main %d\n", + update_list->tusn, first_tusn, main_list->tusn); + return -1; + } + + extra_tusn = main_list->tusn + 1 - first_tusn; + + /* go over update list and move / remove entries in it */ + list_for_each_entry_safe(new_entry, tmp_vnic, + &update_list->vnic_list, list) { + if (extra_tusn > 0) { + list_del(&new_entry->list); + kfree(new_entry); + extra_tusn--; + } else { + /* remove from update list and apply to main list */ + list_del(&new_entry->list); + main_list->tusn++; + + /* Check valid bit, if set add to main list */ + if (new_entry->valid) { + list_add_tail(&new_entry->list, + &main_list->vnic_list); + main_list->count++; + } else { /* remove from main list */ + del_vnic = vhub_find_entry(main_list, + new_entry->lid, + new_entry->qpn); + if (del_vnic) { + list_del(&del_vnic->list); + kfree(del_vnic); + + main_list->count--; + } + vnic_dbg_fip_v(vnic->name, + "vhub_to_main_tbl removed " + "vnic lid %d qpn 0x%x found %d\n", + (int)new_entry->lid, + (int)new_entry->qpn, + (del_vnic != 0)); + kfree(new_entry); + } + } + update_list->count--; + } + return 0; +} + +int fip_vnic_search_mac(struct fip_vnic_data *vnic, struct vhub_elist *elist) +{ + struct vnic_table_entry *vlist_entry; + + list_for_each_entry(vlist_entry, &elist->vnic_list, list) + /* find matching entry based on mac */ + if(!memcmp(vnic->login_data.mac, vlist_entry->mac, ETH_ALEN)) { + /* verify lid/qpn match */ + if (vnic->port->attr.lid == vlist_entry->lid && + vnic->qp_base_num == vlist_entry->qpn) + return 1; + else { + vnic_dbg_vhub(vnic->name, + "vnic LID=0x%x or QPN=0x%x " + "in vhub tbl is different than " + "expected LID=0x%x, QPN=0x%x\n", + vlist_entry->lid, + vlist_entry->qpn, + vnic->port->attr.lid, + vnic->qp_base_num); + break; + } + } + + return 0; +} + +/* + * This function handles a vhub context table packet. The table will + * be processed only if we do not have an up to date local copy of + * our own. The table update supports multi-packet tables so care + * must be taken in building the complete table. + */ +int vhub_handle_tbl(struct fip_vnic_data *vnic, struct fip_content *fc, + u32 vhub_id, u32 tusn) +{ + struct context_table_entry *entry; + struct vnic_table_entry *new_entry; + struct vhub_elist *table; + int i, j, count_in_pkt; + int reason = 0; + int hdr_type; + + /* we already have a table. disregard this one */ + if (vnic->vhub_table.state != VHUB_TBL_INIT) { + vnic_dbg_vhub_v(vnic->name, + "vhub_handle_tbl context not in init\n"); + return 0; + } + + /* compute the number of vnic entries in the packet. + * don't forget the checksum + */ + count_in_pkt = fc->cte.num; + table = &vnic->vhub_table.main_list; + hdr_type = be16_to_cpu(fc->fvt->hdr) >> 14; + + /* first or only packet in sequence */ + if (hdr_type == FIP_TABLE_HDR_FIRST || hdr_type == FIP_TABLE_HDR_ONLY) { + table->total_count = be16_to_cpu(fc->fvt->table_size); + table->tusn = tusn; + } + if (table->tusn != tusn) { + vnic_warn(vnic->name, "Info, vhub_handle_tbl got unexpected " + "tusn. Expect=%d received=%d\n", table->tusn, tusn); + if (!table->tusn) + goto drop_silently; + reason = 1; + goto reset_table; + } + + if ((table->count + count_in_pkt > table->total_count) || + ((table->count + count_in_pkt < table->total_count) && + (hdr_type == FIP_TABLE_HDR_LAST || hdr_type == FIP_TABLE_HDR_ONLY))) { + vnic_dbg_vhub(vnic->name, + "vhub_handle_tbl got unexpected entry count. 
" + "count %d, in packet %d total expected %d\n", + table->count, count_in_pkt, table->total_count); + reason = 2; + goto reset_table; + } + + entry = fc->cte.cte; + for (i = 0; i < count_in_pkt; ++i, ++entry) { + new_entry = kzalloc(sizeof *new_entry, GFP_KERNEL); + if (!new_entry) + goto reset_table; + + for (j = 0; j < (sizeof *entry) >> 2; ++j) + vnic->vhub_table.checksum += ((u32 *) entry)[j]; + + new_entry->lid = be16_to_cpu(entry->lid); + new_entry->qpn = be32_to_cpu(entry->qpn) & 0xffffff; + new_entry->sl = entry->sl & 0xf; + new_entry->rss = !!(entry->v_rss_type & FIP_CONTEXT_RSS_FLAG); + new_entry->valid = !!(entry->v_rss_type & FIP_CONTEXT_V_FLAG); + memcpy(new_entry->mac, entry->mac, sizeof(new_entry->mac)); + + list_add_tail(&new_entry->list, &table->vnic_list); + table->count++; + } + + /* last packet */ + if (hdr_type == FIP_TABLE_HDR_LAST || hdr_type == FIP_TABLE_HDR_ONLY) { + ASSERT(table->count == table->total_count); + if (vnic->vhub_table.checksum != be32_to_cpu(*(u32 *) entry)) { + vnic_dbg_fip_v(vnic->name, + "vhub_handle_tbl checksum mismatch. " + "expected 0x%x, in packet 0x%x\n", + vnic->vhub_table.checksum, + be32_to_cpu(*(u32 *) entry)); + /* TODO: request checksum match in final code */ + /* goto reset_table; */ + } + + if (vhub_update_main(vnic, &vnic->vhub_table.main_list, + &vnic->vhub_table.update_list)) { + vnic_dbg_fip_v(vnic->name, + "vhub_handle_tbl moving update list to main " + "list failed\n"); + reason = 3; + goto reset_table; + } + + /* we are done receiving the context table */ + vnic_dbg_fip_v(vnic->name, + "vhub_handle_tbl updated with %d entries\n", + vnic->vhub_table.main_list.count); + vhub_ctx_prnt(vnic, &vnic->vhub_table.main_list, 0); + + /* we are not in the main vHub list close ourselves */ + if (!fip_vnic_search_mac(vnic, &vnic->vhub_table.main_list)) { + vnic_dbg_fip_p0(vnic->name, "We are not in the main table close our selves\n"); + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + reason = 4; + goto reset_table; + } + + if (fip_vnic_tbl_done(vnic)) { + vnic_warn(vnic->name, "vhub_handle_tbl done failed, reseting table\n"); + reason = 5; + goto reset_table; + } + } + +drop_silently: + return 0; + +reset_table: + vnic_dbg_fip_p0(vnic->name, "We are not in the main table close our selves reason=%d\n", reason); + vhub_ctx_free(vnic); + /* TODO renable tx of update request, fip_update_send() */ + return -EINVAL; +} + +/* + * This function writes the main vhub table to the data (login) vnic. 
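[For reference, the checksum that vhub_handle_tbl() above accumulates is a plain 32-bit sum over every word of every context-table entry; the GW terminates the table with one extra 32-bit word holding the expected total. A minimal userspace sketch, with the entry layout reduced to raw words:]

#include <stdint.h>
#include <stdio.h>

static uint32_t table_checksum(const uint32_t *words, int nwords)
{
	uint32_t sum = 0;

	while (nwords-- > 0)
		sum += *words++;	/* driver: checksum += ((u32 *)entry)[j] */
	return sum;
}

int main(void)
{
	uint32_t entries[4] = { 0x11111111, 0x22222222, 0x33333333, 0x44444444 };
	uint32_t trailer = table_checksum(entries, 4);	/* what the GW would append */

	printf("checksum ok: %d\n", table_checksum(entries, 4) == trailer);
	return 0;
}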
+ * You should call it when the data vnic is ready for it and after the + * table is up to date (and the update list was applied to the main list) + */ +int fip_vnic_write_tbl(struct fip_vnic_data *vnic) +{ + struct vnic_table_entry *vlist_entry; + int rc; + + if (vnic->login) + sprintf(vnic->name, "%s", vnic->login->name); + + /* update table in neigh tree */ + list_for_each_entry(vlist_entry, + &vnic->vhub_table.main_list.vnic_list, list) { + rc = vnic_vhube_add(vnic, vlist_entry); + if (rc) { + vnic_warn(vnic->name, "vnic_vhube_add failed for mac " + MAC_6_PRINT_FMT" (rc %d)\n", + MAC_6_PRINT_ARG(vlist_entry->mac), rc); + vhub_ctx_free(vnic); + vnic_vhube_flush(vnic); + return -1; + } + } + + vnic_dbg_fip(vnic->name, "fip_vnic_tbl_done: creation of vnic done\n"); + + vnic->vhub_table.tusn = vnic->vhub_table.main_list.tusn; + vnic->vhub_table.state = VHUB_TBL_UPDATED; + + /* free table memory */ + vhub_table_free(&vnic->vhub_table.main_list); + return 0; +} + +/* + * This function handles a vhub context update packets received AFTER + * we have a valid vhub table. For update additions the code adds an + * entry to the neighbour tree. For update removals we either remove + * the entry from the neighbour list or if the removed entry is "this vnic" + * we remove the vnic. +*/ +static int vhub_update_updated(struct fip_vnic_data *vnic, + u32 vhub_id, u32 pkt_tusn, + struct vnic_table_entry *data) +{ + int curr_tusn; + + curr_tusn = vnic->vhub_table.tusn; + + /* if vnic is being flushed, return */ + if (vnic->flush) + return 0; + + /* we got a GW keep alive packet */ + if (pkt_tusn == curr_tusn) + return 0; + + /* if we got an out of order update clear list and request new table */ + if (pkt_tusn != curr_tusn + 1) { + vnic_warn(vnic->name, "Info, vhub_update_up2date received out" + " of order update. Recvd=%d Expect=%d\n", + pkt_tusn, curr_tusn); + goto error_in_update; + } + + /* new entry added */ + if (data->valid) { + if (vnic_vhube_add(vnic, data)) { + vnic_dbg_fip(vnic->name, "vnic_vhube_add " + "failed to update vnic neigh tree\n"); + goto error_in_update; + } + } else { /* remove entry */ + /* the remove request is for this vnic :-o */ + if (!memcmp(vnic->login_data.mac, data->mac, ETH_ALEN)) { + vnic_dbg_fip_p0(vnic->name, "remove this vnic "MAC_6_PRINT_FMT"\n", + MAC_6_PRINT_ARG(vnic->login_data.mac)); + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + } else { + vnic_dbg_fip(vnic->name, "remove neigh vnic\n"); + vnic_vhube_del(vnic, data->mac); + } + } + + vnic->vhub_table.tusn = pkt_tusn; + + return 0; + +error_in_update: + vhub_ctx_free(vnic); + vnic_vhube_flush(vnic); + fip_update_send(vnic, 1 /* new */, 0 /* logout */); + return -1; +} + +/* + * This function handles a vhub context update packets received BEFORE + * we have a valid vhub table. The function adds the update request + * to an update list to be processed after the entire vhub table is received + * and processed. + */ +static int vhub_update_init(struct fip_vnic_data *vnic, + u32 vhub_id, u32 pkt_tusn, + struct vnic_table_entry *data) +{ + struct vnic_table_entry *new_vnic; + struct vhub_elist *vnic_list; + int curr_tusn; + + vnic_list = &vnic->vhub_table.update_list; + curr_tusn = vnic_list->tusn; + + /* if we got an out of order update clear list and request new table */ + if ((pkt_tusn < curr_tusn || pkt_tusn > curr_tusn + 1) + && curr_tusn != 0) { + vnic_warn(vnic->name, "Info, vhub_update_init received out of" + " order update. 
got %d my %d\n", pkt_tusn, curr_tusn); + goto error_in_update; + } + + /* we got a GW keep alive packet */ + if (pkt_tusn == curr_tusn) { + vnic_dbg_fip_v(vnic->name, "Received GW keep alive update." + " tusn %d\n", curr_tusn); + return 0; + } + + /* got remove request for this vnic don't wait */ + if (!(data->valid) && + !memcmp(vnic->login_data.mac, data->mac, ETH_ALEN)) { + vhub_ctx_free(vnic); + vnic_dbg_fip_p0(vnic->name, "got request to close vNic vhub_update_init\n"); + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + goto err; + } + + new_vnic = kzalloc(sizeof *new_vnic, GFP_KERNEL); + if (!new_vnic) + goto error_in_update; + + memcpy(new_vnic, data, sizeof *data); + list_add_tail(&new_vnic->list, &vnic_list->vnic_list); + vnic_list->count++; + vnic_list->tusn = pkt_tusn; + vhub_ctx_prnt(vnic, vnic_list, 0); + return 0; + +error_in_update: + vhub_ctx_free(vnic); + fip_update_send(vnic, 1 /* new */, 0 /* logout */); +err: + return -1; +} + +/* + * This function handles vhub context update packets received after + * we have a valid vhub table but before it was passed to the data rbtree. + * The function applies the update request to the main vhub table. + */ +static int vhub_update_inter(struct fip_vnic_data *vnic, + u32 vhub_id, u32 pkt_tusn, + struct vnic_table_entry *data) +{ + struct vnic_table_entry *new_vnic, *del_vnic; + struct vhub_elist *vnic_list; + int curr_tusn; + + vnic_list = &vnic->vhub_table.main_list; + curr_tusn = vnic_list->tusn; + + /* if we got an out of order update clear list and request new table */ + if ((pkt_tusn < curr_tusn || pkt_tusn > curr_tusn + 1) + && curr_tusn != 0) { + vnic_warn(vnic->name, "Info, vhub_update_inter received out" + " of order update. got %d my %d\n", pkt_tusn, curr_tusn); + goto error_in_update; + } + + /* we got a GW keep alive packet */ + if (pkt_tusn == curr_tusn) { + vnic_dbg_fip_v(vnic->name, "Received GW keep alive update." + " tusn %d\n", curr_tusn); + return 0; + } + + /* we got an add request */ + if (data->valid) { + new_vnic = kzalloc(sizeof *new_vnic, GFP_KERNEL); + if (!new_vnic) + goto error_in_update; + + memcpy(new_vnic, data, sizeof *data); + list_add_tail(&new_vnic->list, &vnic_list->vnic_list); + vnic_list->count++; + vnic_list->tusn = pkt_tusn; + } else { /* we got a remove request */ + /* remove is for this vnic */ + if (!memcmp(vnic->login_data.mac, data->mac, ETH_ALEN)) { + vhub_ctx_free(vnic); + vnic_dbg_fip_p0(vnic->name, "got request to close vNic vhub_update_inter\n"); + fip_vnic_close(vnic, FIP_PARTIAL_FLUSH); + goto err; + } + + /* search and delete the vnic */ + del_vnic = vhub_find_entry(vnic_list, + data->lid, + data->qpn); + if (del_vnic) { + list_del(&del_vnic->list); + kfree(del_vnic); + vnic_list->count--; + } + vnic_dbg_fip_v(vnic->name, + "vhub_update_inter removed " + "vnic lid %d qpn 0x%x found %d\n", + (int)data->lid, (int)data->qpn, + (del_vnic != 0)); + } + + vhub_ctx_prnt(vnic, vnic_list, 0); + return 0; + +error_in_update: + vhub_ctx_free(vnic); + fip_update_send(vnic, 1 /* new */, 0 /* logout */); +err: + return -1; +} + +/* + * This function handles vhub context update packets. There are three flows + * in handling update packets. The first is before the main table is up + * to date, the second is after the table is up to date but before it was + * passed to the ownership of the data vnic (login struct) and the local + * lists are freed, and the last is when the table maintenance is done + * by the data vnic. This function handles all cases. +*/
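[The dispatcher below routes each update to one of the three flows by table state. A compact userspace sketch of that dispatch, with hypothetical handler names standing in for the vhub_update_*() functions:]

#include <stdio.h>

enum vhub_tbl_state { VHUB_TBL_INIT, VHUB_TBL_UP2DATE, VHUB_TBL_UPDATED };

/* hypothetical stand-ins for vhub_update_init/inter/updated */
static int queue_on_update_list(void) { puts("no full table yet: queued"); return 0; }
static int apply_to_main_list(void)   { puts("table complete: applied to main list"); return 0; }
static int apply_to_neigh_tree(void)  { puts("data side owns table: applied to neigh tree"); return 0; }

static int dispatch_update(enum vhub_tbl_state state)
{
	switch (state) {
	case VHUB_TBL_INIT:
		return queue_on_update_list();
	case VHUB_TBL_UP2DATE:
		return apply_to_main_list();
	case VHUB_TBL_UPDATED:
		return apply_to_neigh_tree();
	default:
		return -1;
	}
}

int main(void)
{
	return dispatch_update(VHUB_TBL_INIT);
}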
+*/ +int vhub_handle_update(struct fip_vnic_data *vnic, + u32 vhub_id, u32 tusn, + struct vnic_table_entry *data) +{ + int ret = 0; + + /* + * if we do not have an up to date table to use the update list. + * if we have an up to date table apply the updates to the + * main table list. + */ + switch (vnic->vhub_table.state) { + case VHUB_TBL_INIT: /* No full table yet, keep updates for later */ + ret = vhub_update_init(vnic, vhub_id, tusn, data); + break; + case VHUB_TBL_UP2DATE: /* full table available, not writen to data half */ + ret = vhub_update_inter(vnic, vhub_id, tusn, data); + break; + case VHUB_TBL_UPDATED: /* full table available and writen to data half */ + ret = vhub_update_updated(vnic, vhub_id, tusn, data); + break; + default: + break; + } + + return ret; +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_main.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_main.c new file mode 100644 index 0000000000000..f07ee4e63fb7e --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_main.c @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_data.h" +#include "vnic_fip.h" + +MODULE_AUTHOR(DRV_AUTH); +MODULE_DESCRIPTION(DRV_DESC); +MODULE_LICENSE(DRV_LIC); +MODULE_VERSION(DRV_VER); + +static int __init mlx4_ib_init(void) +{ + vnic_dbg_func("module_init"); + + if (vnic_param_check()) + goto err; + if (vnic_mcast_init()) + goto err; + if (vnic_ports_init()) + goto free_mcast; + + return 0; + +free_mcast: + vnic_mcast_cleanup(); +err: + return -EINVAL; +} + +static void __exit mlx4_ib_cleanup(void) +{ + int ret; + + vnic_dbg_func("module_exit"); + vnic_ports_cleanup(); + vnic_dbg_mark(); + vnic_mcast_cleanup(); +} + +module_init(mlx4_ib_init); +module_exit(mlx4_ib_cleanup); diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_mcast.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_mcast.c new file mode 100644 index 0000000000000..c82190cd576be --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_mcast.c @@ -0,0 +1,1098 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" + +struct workqueue_struct *mcast_wq; +struct ib_sa_client vnic_sa_client; + +//static void vnic_mcast_detach_task(struct work_struct *work); +static void vnic_mcast_attach_task(struct work_struct *work); +static void vnic_port_mcast_leave_task(struct work_struct *work); +static void vnic_port_mcast_join_task(struct work_struct *work); + +static void vnic_port_mcast_release(struct vnic_port_mcast *mcaste); +static struct vnic_port_mcast *vnic_port_mcast_update(struct vnic_mcast + *_mcaste); + +/* + * A helper function to prevent code duplication. Fills vnic_mcast struct with + * common values. + * + * in: mcaste - mcaste to fill + * in: gw_id - to be used in creation MGID address + * in: mac - to be used in creation MGID address + * in: create - value of create field in mcaste + */ +void __vnic_mcaste_fill(struct vnic_login *login, struct vnic_mcast *mcaste, + u16 gw_id, const u8 *mac, u8 rss_hash, int create) +{ + union vhub_mgid mgid; + + memcpy(mcaste->mac, mac, ETH_ALEN); + vhub_mgid_create(login->mgid_prefix, mcaste->mac, + login->n_mac_mcgid, + CREATE_VHUB_ID(login->vid, gw_id), + VHUB_MGID_DATA, rss_hash, &mgid); + memcpy(&mcaste->gid, mgid.ib_gid.raw, GID_LEN); + memcpy(&mcaste->port_gid, &mcaste->gid, GID_LEN); + mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC); + mcaste->backoff_factor = 1; + mcaste->retry = VNIC_MCAST_MAX_RETRY; + mcaste->blocking = 0; + mcaste->qkey = login->qkey; + mcaste->pkey = login->pkey; + mcaste->create = create; + mcaste->qp = login->qp_res[0].qp; /* mcast/bcast is only on first QP */ + mcaste->join_state = 1; +} + +/* + * A helper function to prevent code duplication. Receives a multicast mac + * and a gw_id and attaches it (join + attach). The function also receives + * a default_mcaste (used for the MGID over default MLID hack and a user list. + * Returns 0 on success and non 0 on failure. + * + * in: mmac - to be used in creation MGID address + * in: default_mcaste - mcaste entry of the default MGID. Can be NULL + * in: user_list - A user list to hang the new mcaste on. 
Can be NULL + * in: gw_id - to be used in creation MGID address + */ +int _vnic_mcast_attach_mgid(struct vnic_login *login, + char *mmac, + struct vnic_mcast *default_mcaste, + void *private_data, + u16 gw_id) +{ + struct vnic_mcast *mcaste; + int rc = 0; + int rss_hash; + + mcaste = vnic_mcast_alloc(login->port, NULL, NULL); + if (IS_ERR(mcaste)) { + vnic_warn(login->name, "vnic_mcast_alloc for "MAC_6_PRINT_FMT" failed\n", + MAC_6_PRINT_ARG(mmac)); + vnic_dbg_mark(); + return -ENOMEM; + } + memcpy(mcaste->mac, mmac, ETH_ALEN); + + /* if mcast mac has mcast IP in it:*/ + rss_hash = 0; + if ((mcaste->mac[0] & 0xf0) == 0xe0 && + mcaste->mac[4] == 0x00 && + mcaste->mac[5] == 0x00) { + /* calculate mcas rss_hash on IP octets */ + rss_hash = mcaste->mac[0] ^ mcaste->mac[1] ^ + mcaste->mac[2] ^ mcaste->mac[3]; + /* and build the corresponding mcast MAC using the IEEE + * multicast OUI 01:00:5e + */ + mcaste->mac[5] = mcaste->mac[3]; + mcaste->mac[4] = mcaste->mac[2]; + mcaste->mac[3] = mcaste->mac[1] & 0x7f; + mcaste->mac[2] = 0x5e; + mcaste->mac[1] = 0x00; + mcaste->mac[0] = 0x01; + } + + __vnic_mcaste_fill(login, mcaste, gw_id, mcaste->mac, rss_hash, 0); + mcaste->priv_data = private_data; + + if (default_mcaste) + memcpy(&mcaste->port_gid, &default_mcaste->gid, GID_LEN); + + rc = vnic_mcast_add(&login->mcast_tree, mcaste); /* add holds mcast_rb_lock */ + if (!rc) { + rc = vnic_mcast_attach(&login->mcast_tree, mcaste); + ASSERT(!rc); + } else if (rc == -EEXIST){ + /* MGID may be already in the tree when n_mac_mcgid > 0 (ok)*/ + vnic_dbg_mcast(login->name, "vnic_mcast_add for " + MAC_6_PRINT_FMT" already exist, rc %d\n", + MAC_6_PRINT_ARG(mcaste->mac), rc); + vnic_mcast_dealloc(mcaste); + rc = 0; + } else { + vnic_warn(login->name, "vnic_mcast_add for " + MAC_6_PRINT_FMT" failed, rc %d\n", + MAC_6_PRINT_ARG(mcaste->mac), rc); + vnic_mcast_dealloc(mcaste); + } + return rc; +} + +struct vnic_mcast *vnic_mcast_alloc(struct vnic_port *port, + unsigned long *req_attach, + unsigned long *cur_attached) +{ + struct vnic_mcast *mcaste; + + mcaste = kzalloc(sizeof *mcaste, GFP_ATOMIC); + if (!mcaste) + return ERR_PTR(-ENOMEM); + /* set mcaste fields */ + init_completion(&mcaste->attach_complete); + INIT_DELAYED_WORK(&mcaste->attach_task, vnic_mcast_attach_task); + spin_lock_init(&mcaste->lock); + mcaste->port = port; + mcaste->req_attach = req_attach; + mcaste->cur_attached = cur_attached; + + return mcaste; +} + +void vnic_mcast_dealloc(struct vnic_mcast *mcaste) +{ + struct vnic_port *port; + + ASSERT(mcaste); + port = mcaste->port; + vnic_dbg_mcast_vv(port->name, "dealloc vnic_mcast: MAC "MAC_6_PRINT_FMT + " GID "VNIC_GID_FMT"\n", + MAC_6_PRINT_ARG(mcaste->mac), + VNIC_GID_ARG(mcaste->gid)); + kfree(mcaste); +} + +/* + * This function grabs the mcast_tree->mcast_rb_lock +*/ +int vnic_mcast_add(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste) +{ + struct rb_node **n = &mcast_tree->mcast_tree.rb_node, *pn = NULL; + struct vnic_mcast *mcaste_t; + unsigned long flags; + int rc; + + spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags); + while (*n) { + pn = *n; + mcaste_t = rb_entry(pn, struct vnic_mcast, rb_node); + rc = memcmp(mcaste->gid.raw, mcaste_t->gid.raw, GID_LEN); + if (rc < 0) + n = &pn->rb_left; + else if (rc > 0) + n = &pn->rb_right; + else { + rc = -EEXIST; + goto out; + } + } + + rb_link_node(&mcaste->rb_node, pn, n); + rb_insert_color(&mcaste->rb_node, &mcast_tree->mcast_tree); + + rc = 0; + +out: + vnic_dbg_mcast_v(mcaste->port->name, + "added (rc %d) vnic_mcast: MAC 
"MAC_6_PRINT_FMT + " GID "VNIC_GID_FMT"\n", rc, + MAC_6_PRINT_ARG(mcaste->mac), + VNIC_GID_ARG(mcaste->gid)); + + spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags); + return rc; +} + +/* + * The caller must hold the mcast_tree->mcast_rb_lock lock before calling + */ +void vnic_mcast_del(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste) +{ + rb_erase(&mcaste->rb_node, &mcast_tree->mcast_tree); +} + +/* + * The caller must hold the mcast_tree->mcast_rb_lock lock before calling +*/ +struct vnic_mcast *vnic_mcast_search(struct mcast_root *mcast_tree, + union ib_gid *gid) +{ + struct rb_node *n = mcast_tree->mcast_tree.rb_node; + struct vnic_mcast *mcaste_t; + int rc; + + while (n) { + mcaste_t = rb_entry(n, struct vnic_mcast, rb_node); + rc = memcmp(gid->raw, mcaste_t->gid.raw, GID_LEN); + if (rc < 0) + n = n->rb_left; + else if (rc > 0) + n = n->rb_right; + else { + vnic_dbg_mcast_v(mcaste_t->port->name, + "found: MAC "MAC_6_PRINT_FMT" GID " + VNIC_GID_FMT"\n", + MAC_6_PRINT_ARG(mcaste_t->mac), + VNIC_GID_ARG(mcaste_t->gid)); + goto out; + } + } + mcaste_t = ERR_PTR(-ENODATA); + +out: + return mcaste_t; +} + +static void vnic_mcast_detach_ll(struct vnic_mcast *mcaste, struct mcast_root *mcast_tree) +{ + struct vnic_port *port = mcaste->port; + struct ib_ah *tmp_ih; + unsigned long flags; + int rc; + + vnic_dbg_mcast_v(port->name, + "mcaste->attached %d for mac "MAC_6_PRINT_FMT"\n", + test_bit(MCAST_ATTACHED, &mcaste->state), + MAC_6_PRINT_ARG(mcaste->mac)); + + spin_lock_irqsave(&mcaste->lock, flags); + if (!test_and_clear_bit(MCAST_ATTACHED, &mcaste->state)) { + spin_unlock_irqrestore(&mcaste->lock, flags); + return; + } + + tmp_ih = mcaste->ah; + mcaste->ah = NULL; + spin_unlock_irqrestore(&mcaste->lock, flags); + + /* callback */ + if (mcaste->detach_cb) { + vnic_dbg_mcast(port->name, "calling detach_cb\n"); + mcaste->detach_cb(mcaste, mcaste->detach_cb_ctx); + } + + if (!mcaste->sender_only) + rc = ib_detach_mcast(mcaste->qp, &mcaste->gid, port->attr.lid); + else + rc = 0; + + ASSERT(tmp_ih); + if (ib_destroy_ah(tmp_ih)) + vnic_warn(port->name, + "ib_destroy_ah failed (rc %d) for mcaste mac " + MAC_6_PRINT_FMT"\n", rc, + MAC_6_PRINT_ARG(mcaste->mac)); + vnic_dbg_mcast(port->name, "GID "VNIC_GID_FMT" detached!\n", + VNIC_GID_ARG(mcaste->gid)); +} + +int vnic_mcast_detach(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste) +{ + struct vnic_port *port = mcaste->port; + unsigned long flags; + + /* must be a task, to make sure no attach task is pending */ + vnic_dbg_mcast_v(port->name, "queue delayed task (%lu) " + "vnic_mcast_detach_task\n", mcaste->backoff); + + /* cancel any pending/queued tasks. We can not use sync + * under the spinlock because it might hang. 
we need the + * spinlock here to ensure the requeueing is atomic + */ + vnic_dbg_mcast_v(port->name, "cancel attach_task\n"); + spin_lock_irqsave(&mcaste->lock, flags); + clear_bit(MCAST_ATTACH_RUNNING, &mcaste->state); + spin_unlock_irqrestore(&mcaste->lock, flags); +#ifndef _BP_WORK_SYNC + cancel_delayed_work_sync(&mcaste->attach_task); +#else + cancel_delayed_work(&mcaste->attach_task); + flush_workqueue(mcast_wq); +#endif + vnic_mcast_detach_ll(mcaste, mcast_tree); + + if (mcaste->port_mcaste) + vnic_port_mcast_release(mcaste->port_mcaste); + + return 0; +} + +static void vnic_mcast_attach_task(struct work_struct *work) +{ + struct ib_ah_attr av; + struct vnic_mcast *mcaste = + container_of(work, struct vnic_mcast, attach_task.work); + struct vnic_port *port = mcaste->port; + unsigned long flags; + int rc; + u16 mlid; + + if ((++mcaste->attach_task_cnt > mcaste->retry && mcaste->retry) || + !test_bit(MCAST_ATTACH_RUNNING, &mcaste->state)) { + vnic_dbg_mcast_v(port->name, + "attach_task stopped, tried %ld times\n", + mcaste->retry); + goto out; + } + + /* update backoff time */ + mcaste->backoff = min(mcaste->backoff * mcaste->backoff_factor, + msecs_to_jiffies(VNIC_MCAST_BACKOFF_MAX_MSEC)); + + if (!test_bit(MCAST_JOINED, &mcaste->port_mcaste->state)) { + vnic_dbg_mcast_v(port->name, "joined %d, retry %ld from %ld\n", + test_bit(MCAST_JOINED, &mcaste->port_mcaste->state), + mcaste->attach_task_cnt, mcaste->retry); + goto retry; + } + + /* attach QP */ + ASSERT(mcaste); + ASSERT(mcaste->port_mcaste); + ASSERT(mcaste->port_mcaste->sa_mcast); + mlid = be16_to_cpu(mcaste->port_mcaste->rec.mlid); + vnic_dbg_mcast(port->name, "QPN 0x%06x attaching MGID "VNIC_GID_FMT + " LID 0x%04x\n", mcaste->qp->qp_num, + VNIC_GID_ARG(mcaste->gid), mlid); + if (!mcaste->sender_only) + rc = ib_attach_mcast(mcaste->qp, &mcaste->gid, mlid); + else + rc = 0; + + if (rc) { + int attach_count = atomic_read(&mcaste->port_mcaste->ref_cnt); + + vnic_err(port->name, "failed to attach (rc %d) to multicast " + "group, MGID "VNIC_GID_FMT"\n", + rc, VNIC_GID_ARG(mcaste->gid)); + + if (port->dev->attr.max_mcast_qp_attach <= attach_count) { + vnic_err(port->name, "Attach failed. Too many vnics are on the same" + " vhub on this port. 
vnics count=%d, max=%d\n", + attach_count, + port->dev->attr.max_mcast_qp_attach); + } + + goto retry; + } else { + /* create mcast ah */ + memset(&av, 0, sizeof(av)); + av.dlid = be16_to_cpu(mcaste->port_mcaste->rec.mlid); + av.port_num = mcaste->port->num; + av.ah_flags = IB_AH_GRH; + av.static_rate = mcaste->port_mcaste->rec.rate; + av.sl = mcaste->port_mcaste->rec.sl; + memcpy(&av.grh.dgid, mcaste->gid.raw, GID_LEN); + spin_lock_irqsave(&mcaste->lock, flags); + mcaste->ah = ib_create_ah(port->pd, &av); + if (IS_ERR(mcaste->ah)) { + mcaste->ah = NULL; + vnic_err(port->name, + "vnic_ib_create_ah failed (rc %d)\n", + (int)PTR_ERR(mcaste->ah)); + spin_unlock_irqrestore(&mcaste->lock, flags); + /* for such a failure, no need to retry */ + goto out; + } + vnic_dbg_mcast(mcaste->port->name, "created mcast ah for %p\n", mcaste); + + /* callback */ + set_bit(MCAST_ATTACHED, &mcaste->state); + spin_unlock_irqrestore(&mcaste->lock, flags); + + if (mcaste->cur_attached) + set_bit(mcaste->attach_bit_nr, mcaste->cur_attached); + vnic_dbg_mcast(mcaste->port->name, + "attached GID "VNIC_GID_FMT"\n", + VNIC_GID_ARG(mcaste->gid)); + if (mcaste->attach_cb) { + vnic_dbg_mcast(mcaste->port->name, + "calling attach_cb\n"); + mcaste->attach_cb(mcaste, mcaste->attach_cb_ctx); + } + } + +out: + mcaste->attach_task_cnt = 0; /* for next time */ + mcaste->backoff = mcaste->backoff_init; + clear_bit(MCAST_ATTACH_RUNNING, &mcaste->state); + complete(&mcaste->attach_complete); + return; + +retry: + spin_lock_irqsave(&mcaste->lock, flags); + if (test_bit(MCAST_ATTACH_RUNNING, &mcaste->state)) { + /* calls vnic_mcast_attach_task() */ + queue_delayed_work(mcast_wq, &mcaste->attach_task, mcaste->backoff); + } + spin_unlock_irqrestore(&mcaste->lock, flags); +} + +int vnic_mcast_attach(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste) +{ + struct vnic_port_mcast *pmcaste; + struct vnic_port *port = mcaste->port; + int rc = 0; + ASSERT(mcaste); + + mcaste->backoff_init = mcaste->backoff; + + pmcaste = vnic_port_mcast_update(mcaste); + if (IS_ERR(pmcaste)) { + vnic_err(port->name, "vnic_port_mcast_update failed GID " + VNIC_GID_FMT"\n", VNIC_GID_ARG(mcaste->gid)); + rc = PTR_ERR(pmcaste); + goto out; + } + + mcaste->port_mcaste = pmcaste; + + set_bit(MCAST_ATTACH_RUNNING, &mcaste->state); + + /* must be a task, to sample the joined flag */ + vnic_dbg_mcast_v(port->name, "queue delayed task (%lu) " + "vnic_mcast_join_task\n", mcaste->backoff); + init_completion(&mcaste->attach_complete); + /* calls vnic_mcast_attach_task() */ + queue_delayed_work(mcast_wq, &mcaste->attach_task, 0); + if (mcaste->blocking) { + wait_for_completion(&mcaste->attach_complete); + if (test_bit(MCAST_ATTACHED, &mcaste->state)) + goto out; + vnic_mcast_detach(mcast_tree, mcaste); + rc = 1; + } + +out: + return rc; +} + +#if 0 +static int vnic_mcast_attach_all(struct mcast_root *mcast_tree) +{ + int fails = 0; + struct vnic_mcast *mcaste; + struct rb_node *n; + + n = rb_first(&mcast_tree->mcast_tree); + while (n) { + mcaste = rb_entry(n, struct vnic_mcast, rb_node); + n = rb_next(n); + /* async call */ + if (vnic_mcast_attach(mcast_tree, mcaste)) + fails++; + } + + return fails; +} +#endif + +int vnic_mcast_del_all(struct mcast_root *mcast_tree) +{ + struct rb_node *n; + struct vnic_mcast *mcaste, *mcaste_t; + unsigned long flags; + int fails = 0; + LIST_HEAD(local_list); + + spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags); + n = rb_first(&mcast_tree->mcast_tree); + while (n) { + mcaste = rb_entry(n, struct vnic_mcast, rb_node); + 
vnic_mcast_del(mcast_tree, mcaste); + list_add_tail(&mcaste->list, &local_list); + n = rb_first(&mcast_tree->mcast_tree); + } + spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags); + + list_for_each_entry_safe(mcaste, mcaste_t, &local_list, list) { + list_del(&mcaste->list); + vnic_mcast_detach(mcast_tree, mcaste); + vnic_mcast_dealloc(mcaste); + } + + return fails; +} + +int vnic_mcast_del_user(struct mcast_root *mcast_tree, void *owner) +{ + struct rb_node *n; + struct vnic_mcast *mcaste, *mcaste_t; + unsigned long flags; + int fails = 0; + LIST_HEAD(local_list); + + spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags); + n = rb_first(&mcast_tree->mcast_tree); + while (n) { + mcaste = rb_entry(n, struct vnic_mcast, rb_node); + n = rb_next(&mcaste->rb_node); + if (mcaste->priv_data == owner) { + list_add_tail(&mcaste->list, &local_list); + vnic_mcast_del(mcast_tree, mcaste); + } + } + spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags); + + list_for_each_entry_safe(mcaste, mcaste_t, &local_list, list) { + list_del(&mcaste->list); + vnic_mcast_detach(mcast_tree, mcaste); + vnic_mcast_dealloc(mcaste); + } + + return fails; +} + +/* PORT MCAST FUNCTIONS */ +static struct vnic_port_mcast *vnic_port_mcast_alloc(struct vnic_port *port, + union ib_gid *gid) +{ + struct vnic_port_mcast *mcaste; + + mcaste = kzalloc(sizeof *mcaste, GFP_ATOMIC); + if (!mcaste) + return ERR_PTR(-ENOMEM); + + mcaste->gid = *gid; + mcaste->port = port; + init_completion(&mcaste->leave_complete); + atomic_set(&mcaste->ref_cnt, 1); + INIT_DELAYED_WORK(&mcaste->join_task, vnic_port_mcast_join_task); + INIT_WORK(&mcaste->leave_task, vnic_port_mcast_leave_task); + mcaste->sa_mcast = ERR_PTR(-EINVAL); + memset(&mcaste->rec,0,sizeof(mcaste->rec)); + vnic_dbg_mcast_v(mcaste->port->name, "allocated port_mcast GID " + VNIC_GID_FMT"\n", VNIC_GID_ARG(mcaste->gid)); + spin_lock_init(&mcaste->lock); + set_bit(MCAST_JOIN_RUNNING, &mcaste->state); + + return mcaste; +} + +static void vnic_port_mcast_dealloc(struct vnic_port_mcast *mcaste) +{ + ASSERT(mcaste); + vnic_dbg_mcast_v(NULL, "dealloc port_mcast GID " + VNIC_GID_FMT"\n", VNIC_GID_ARG(mcaste->gid)); + kfree(mcaste); +} + +/* + * This function accesses the port mcast tree. Please make sure + * to call it only while holding the port mcast_rb_lock +*/ +static int vnic_port_mcast_add(struct vnic_port_mcast *mcaste) +{ + struct rb_node **n = &mcaste->port->mcast_tree.mcast_tree.rb_node; + struct rb_node *pn = NULL; + struct vnic_port_mcast *mcaste_t; + int rc; + + while (*n) { + pn = *n; + mcaste_t = rb_entry(pn, struct vnic_port_mcast, rb_node); + rc = memcmp(mcaste->gid.raw, mcaste_t->gid.raw, GID_LEN); + if (rc < 0) + n = &pn->rb_left; + else if (rc > 0) + n = &pn->rb_right; + else { + rc = -EEXIST; + goto out; + } + } + + rb_link_node(&mcaste->rb_node, pn, n); + rb_insert_color(&mcaste->rb_node, &mcaste->port->mcast_tree.mcast_tree); + rc = 0; + +out: + vnic_dbg_mcast_v(mcaste->port->name, "added (rc %d) port_mcast GID " + VNIC_GID_FMT"\n", rc, VNIC_GID_ARG(mcaste->gid)); + return rc; +} + +/* + * This function accesses the port mcast tree. Please make sure + * to call it only while holding the port mcast_rb_lock +*/ +static void vnic_port_mcast_del(struct vnic_port_mcast *mcaste) +{ + ASSERT(mcaste); + vnic_dbg_mcast_v(mcaste->port->name, "del port_mcast GID " + VNIC_GID_FMT"\n", VNIC_GID_ARG(mcaste->gid)); + rb_erase(&mcaste->rb_node, &mcaste->port->mcast_tree.mcast_tree); +} + +/* + * This function accesses the port mcast tree. 
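[The port-level entries allocated above are reference counted: vnic_port_mcast_release() further below drops the count under the same lock that protects the lookup tree, so an entry whose count reaches zero is unlinked before a concurrent lookup can take a new reference. A hedged userspace sketch of that pattern, using C11 atomics and a pthread mutex in place of the kernel primitives:]

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
	atomic_int ref_cnt;
	int linked;		/* stands in for rb-tree membership */
};

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

static void entry_put(struct entry *e)
{
	pthread_mutex_lock(&tree_lock);
	if (atomic_fetch_sub(&e->ref_cnt, 1) == 1) {
		e->linked = 0;	/* driver: vnic_port_mcast_del() */
		pthread_mutex_unlock(&tree_lock);
		free(e);	/* driver: deferred leave task + dealloc */
		return;
	}
	pthread_mutex_unlock(&tree_lock);
}

int main(void)
{
	struct entry *e = malloc(sizeof(*e));

	atomic_init(&e->ref_cnt, 2);
	e->linked = 1;
	entry_put(e);		/* drops to 1, entry stays linked */
	entry_put(e);		/* last reference: unlink and free */
	puts("done");
	return 0;
}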
Please make sure + * to call it only while holding the port mcast_rb_lock +*/ +struct vnic_port_mcast *vnic_port_mcast_search(struct vnic_port *port, + union ib_gid *gid) +{ + struct rb_node *n = port->mcast_tree.mcast_tree.rb_node; + struct vnic_port_mcast *mcaste_t; + int rc; + + while (n) { + mcaste_t = rb_entry(n, struct vnic_port_mcast, rb_node); + rc = memcmp(gid->raw, mcaste_t->gid.raw, GID_LEN); + if (rc < 0) + n = n->rb_left; + else if (rc > 0) + n = n->rb_right; + else { + vnic_dbg_mcast_v(mcaste_t->port->name, + "found: GID "VNIC_GID_FMT"\n", + VNIC_GID_ARG(mcaste_t->gid)); + goto out; + } + } + mcaste_t = ERR_PTR(-ENODATA); + +out: + return mcaste_t; +} +/* +static void vnic_port_mcast_leave_task(struct work_struct *work) +{ + struct vnic_port_mcast *mcaste = + container_of(work, struct vnic_port_mcast, leave_task.work); + + vnic_dbg_mcast_v(mcaste->port->name, "leave GID "VNIC_GID_FMT"\n", + VNIC_GID_ARG(mcaste->gid)); + + if (!IS_ERR(mcaste->sa_mcast) && test_bit(MCAST_JOINED, &mcaste->port_mcaste->state)) + vnic_dbg_mcast(mcaste->port->name, + "mcast left: GID "VNIC_GID_FMT"\n", + VNIC_GID_ARG(mcaste->gid)); + if (!IS_ERR(mcaste->sa_mcast)) + ib_sa_free_multicast(mcaste->sa_mcast); + mcaste->sa_mcast = ERR_PTR(-EINVAL); + clear_bit(MCAST_JOINED, &mcaste->port_mcaste->state); +} +*/ + +static int vnic_port_mcast_leave(struct vnic_port_mcast *mcaste, + unsigned long backoff) +{ + unsigned long flags; + + ASSERT(mcaste); + vnic_dbg_mcast(NULL, "queue delayed task (%lu) " + "vnic_mcast_leave_task\n", backoff); + + /* cancel any pending/queued tasks. We can not use sync + * under the spinlock because it might hang. we need the + * spinlock here to ensure the requeueing is atomic + */ + spin_lock_irqsave(&mcaste->lock, flags); + clear_bit(MCAST_JOIN_RUNNING, &mcaste->state); + spin_unlock_irqrestore(&mcaste->lock, flags); +#ifndef _BP_WORK_SYNC + cancel_delayed_work_sync(&mcaste->join_task); +#else + cancel_delayed_work(&mcaste->join_task); + if (delayed_work_pending(&mcaste->join_task)) { + return -EBUSY; + } +#endif + + if (test_and_clear_bit(MCAST_JOIN_STARTED, &mcaste->state) + && !IS_ERR(mcaste->sa_mcast)) { + ib_sa_free_multicast(mcaste->sa_mcast); + mcaste->sa_mcast = ERR_PTR(-EINVAL); + } + + return 0; +} + +static int vnic_port_mcast_join_comp(int status, struct ib_sa_multicast *sa_mcast) +{ + struct vnic_port_mcast *mcaste = sa_mcast->context; + unsigned long flags; + + vnic_dbg_mcast(mcaste->port->name, "join completion for GID " + VNIC_GID_FMT" (status %d)\n", + VNIC_GID_ARG(mcaste->gid), status); + + if (status == -ENETRESET) + return 0; + + if (status) + goto retry; + + /* same as mcaste->rec = mcaste->sa_mcast->rec; */ + mcaste->rec = sa_mcast->rec; + + set_bit(MCAST_JOINED, &mcaste->state); + vnic_dbg_mcast(mcaste->port->name, "joined GID "VNIC_GID_FMT"\n", + VNIC_GID_ARG(mcaste->gid)); +#if 0 + vnic_dbg_mcast_v(mcaste->port->name, "mcast record dump:\n"); + vnic_dbg_mcast_v(mcaste->port->name, "mgid "VNIC_GID_FMT"\n", + VNIC_GID_ARG(rec->mgid)); + vnic_dbg_mcast_v(mcaste->port->name, "port_gid "VNIC_GID_FMT"\n", + VNIC_GID_ARG(rec->port_gid)); + vnic_dbg_mcast_v(mcaste->port->name, "pkey 0x%x\n", rec->pkey); + vnic_dbg_mcast_v(mcaste->port->name, "qkey 0x%x\n", rec->qkey); + vnic_dbg_mcast_v(mcaste->port->name, "mtu_slct 0x%x\n", + rec->mtu_selector); + vnic_dbg_mcast_v(mcaste->port->name, "mtu 0x%x\n", rec->mtu); + vnic_dbg_mcast_v(mcaste->port->name, "rate_slct 0x%x\n", + rec->rate_selector); + vnic_dbg_mcast_v(mcaste->port->name, "rate 0x%x\n", rec->rate); + 
vnic_dbg_mcast_v(mcaste->port->name, "sl 0x%x\n", rec->sl); + vnic_dbg_mcast_v(mcaste->port->name, "flow_label 0x%x\n", + rec->flow_label); + vnic_dbg_mcast_v(mcaste->port->name, "hop_limit 0x%x\n", + rec->hop_limit); +#endif + + goto out; +retry: + /* calls vnic_port_mcast_join_task() */ + spin_lock_irqsave(&mcaste->lock, flags); + if (test_bit(MCAST_JOIN_RUNNING, &mcaste->state)) + queue_delayed_work(mcast_wq, &mcaste->join_task, mcaste->backoff); + spin_unlock_irqrestore(&mcaste->lock, flags); + +out: + /* rc is always zero so we handle ib_sa_free_multicast ourselves */ + return 0; +} + +static void vnic_port_mcast_join_task(struct work_struct *work) +{ + struct vnic_port_mcast *mcaste = + container_of(work, struct vnic_port_mcast, join_task.work); + struct ib_sa_mcmember_rec rec = { + .join_state = mcaste->join_state + }; + int rc; + ib_sa_comp_mask comp_mask; + unsigned long flags; + + if (++mcaste->join_task_cnt > mcaste->retry && mcaste->retry) { + vnic_dbg_mcast(mcaste->port->name, + "join_task stopped, tried %ld times\n", + mcaste->retry); + goto out; + } + + /* update backoff time */ + mcaste->backoff = min(mcaste->backoff * mcaste->backoff_factor, + msecs_to_jiffies(VNIC_MCAST_BACKOFF_MAX_MSEC)); + + rec.mgid.global = mcaste->gid.global; + rec.port_gid.global = mcaste->port->gid.global; + rec.pkey = cpu_to_be16(mcaste->pkey); + + comp_mask = + IB_SA_MCMEMBER_REC_MGID | + IB_SA_MCMEMBER_REC_PORT_GID | + /*IB_SA_MCMEMBER_REC_PKEY | */ + IB_SA_MCMEMBER_REC_JOIN_STATE; + + if (mcaste->create) { + comp_mask |= + IB_SA_MCMEMBER_REC_QKEY | + IB_SA_MCMEMBER_REC_MTU_SELECTOR | + IB_SA_MCMEMBER_REC_MTU | + IB_SA_MCMEMBER_REC_TRAFFIC_CLASS | + IB_SA_MCMEMBER_REC_RATE_SELECTOR | + IB_SA_MCMEMBER_REC_RATE | + IB_SA_MCMEMBER_REC_SL | + IB_SA_MCMEMBER_REC_FLOW_LABEL | + IB_SA_MCMEMBER_REC_HOP_LIMIT | + IB_SA_MCMEMBER_REC_PKEY; + + rec.qkey = cpu_to_be32(mcaste->qkey); + rec.mtu_selector = IB_SA_EQ; + rec.rate_selector = IB_SA_EQ; + /* when no_bxm is set, use min values to let everybody in */ + rec.mtu = no_bxm ? IB_MTU_2048 : mcaste->port->attr.max_mtu; + rec.rate = no_bxm ? 
IB_RATE_10_GBPS : mcaste->port->rate_enum; + rec.sl = 0; + rec.flow_label = 0; + rec.hop_limit = 0; + } + + vnic_dbg_mcast(mcaste->port->name, "joining MGID "VNIC_GID_FMT + " create %d, comp_mask %lu\n", + VNIC_GID_ARG(mcaste->gid), mcaste->create, (unsigned long)comp_mask); + + if (!IS_ERR(mcaste->sa_mcast)) + ib_sa_free_multicast(mcaste->sa_mcast); + + mcaste->sa_mcast = + ib_sa_join_multicast(&vnic_sa_client, mcaste->port->dev->ca, + mcaste->port->num, &rec, comp_mask, + GFP_KERNEL, vnic_port_mcast_join_comp, mcaste); + set_bit(MCAST_JOIN_STARTED, &mcaste->state); + + if (IS_ERR(mcaste->sa_mcast)) { + rc = PTR_ERR(mcaste->sa_mcast); + vnic_warn(mcaste->port->name, + "ib_sa_join_multicast failed, status %d\n", rc); + /* calls vnic_port_mcast_join_task() */ + spin_lock_irqsave(&mcaste->lock, flags); + if (test_bit(MCAST_JOIN_RUNNING, &mcaste->state)) + queue_delayed_work(mcast_wq, &mcaste->join_task, mcaste->backoff); + spin_unlock_irqrestore(&mcaste->lock, flags); + } + + return; + +out: + mcaste->join_task_cnt = 0; /* for next time */ + mcaste->backoff = mcaste->backoff_init; + return; +} + +static int vnic_port_mcast_join(struct vnic_port_mcast *mcaste) +{ + unsigned long flags; + + ASSERT(mcaste); + vnic_dbg_mcast_v(mcaste->port->name, "queue delayed task (%lu) " + "vnic_port_mcast_join_task\n", mcaste->backoff); + + /* calls vnic_port_mcast_join_task() */ + spin_lock_irqsave(&mcaste->lock, flags); + if (test_bit(MCAST_JOIN_RUNNING, &mcaste->state)) + queue_delayed_work(mcast_wq, &mcaste->join_task, 0); + spin_unlock_irqrestore(&mcaste->lock, flags); + + return 0; +} + +#if 0 +static int vnic_port_mcast_join_all(struct vnic_port *port) +{ + int fails = 0; + struct vnic_port_mcast *mcaste; + struct rb_node *n; + + n = rb_first(&port->mcast_tree.mcast_tree); + while (n) { + mcaste = rb_entry(n, struct vnic_port_mcast, rb_node); + n = rb_next(n); + if (vnic_port_mcast_join(mcaste)) + fails++; + } + + return fails; +} +#endif + +static void vnic_port_mcast_leave_task(struct work_struct *work) +{ + struct vnic_port_mcast *mcaste = + container_of(work, struct vnic_port_mcast, leave_task); + +#ifndef _BP_WORK_SYNC + vnic_port_mcast_leave(mcaste, 0); +#else + if (vnic_port_mcast_leave(mcaste, 0)) { + queue_work(mcast_wq, &mcaste->leave_task); + return; + } +#endif + vnic_port_mcast_dealloc(mcaste); +} + +static void vnic_port_mcast_release(struct vnic_port_mcast *mcaste) +{ + unsigned long flags; + + struct vnic_port *port = mcaste->port; + + vnic_dbg_mcast(port->name, "update mcaste->ref_cnt %d -> %d\n", + atomic_read(&mcaste->ref_cnt), + atomic_read(&mcaste->ref_cnt) - 1); + + spin_lock_irqsave(&port->mcast_tree.mcast_rb_lock, flags); + if (atomic_dec_and_test(&mcaste->ref_cnt)) { + vnic_port_mcast_del(mcaste); + spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags); + + /* we are not going to wait for the leave to terminate. + * We will just go on. 
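+ * The leave path blocks (cancel_delayed_work_sync(), then possibly
+ * ib_sa_free_multicast()), so it is deferred to mcast_wq rather than
+ * run under the mcast_rb_lock that was just dropped.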
+ * calls vnic_port_mcast_leave_task() + */ + queue_work(mcast_wq, &mcaste->leave_task); + } else + spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags); +} + +static struct vnic_port_mcast *vnic_port_mcast_update(struct vnic_mcast *_mcaste) +{ + union ib_gid *gid = &_mcaste->port_gid; + u32 qkey = _mcaste->qkey; + u16 pkey = _mcaste->pkey; + struct vnic_port *port = _mcaste->port; + struct vnic_port_mcast *mcaste; + unsigned long flags; + + spin_lock_irqsave(&port->mcast_tree.mcast_rb_lock, flags); + mcaste = vnic_port_mcast_search(port, gid); + /* entry found */ + if (PTR_ERR(mcaste) != -ENODATA) { + ASSERT(!IS_ERR(mcaste)); + atomic_inc(&mcaste->ref_cnt); + spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags); + vnic_dbg_mcast(mcaste->port->name, + "found, add GID "VNIC_GID_FMT" \n", + VNIC_GID_ARG(*gid)); + vnic_dbg_mcast(mcaste->port->name, + "update mcaste->ref_cnt %d -> %d\n", + atomic_read(&mcaste->ref_cnt), + atomic_read(&mcaste->ref_cnt) + 1); + } else { /* not found, add it */ + mcaste = vnic_port_mcast_alloc(port, gid); + if (IS_ERR(mcaste)) { + spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags); + return mcaste; + } + vnic_dbg_mcast(mcaste->port->name, + "not found, add GID "VNIC_GID_FMT" \n", + VNIC_GID_ARG(*gid)); + vnic_dbg_mcast(mcaste->port->name, + "update mcaste->ref_cnt %d -> %d\n", + atomic_read(&mcaste->ref_cnt), + atomic_read(&mcaste->ref_cnt) + 1); + mcaste->qkey = qkey; + mcaste->pkey = pkey; + mcaste->backoff_init = _mcaste->backoff_init; + mcaste->backoff = _mcaste->backoff; + mcaste->backoff_factor = _mcaste->backoff_factor; + mcaste->retry = _mcaste->retry; + mcaste->create = _mcaste->create; + mcaste->join_state = _mcaste->join_state; + vnic_port_mcast_add(mcaste); + spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags); + + vnic_port_mcast_join(mcaste); + vnic_dbg_mcast(mcaste->port->name, "added\n"); + } + + return mcaste; +} + +#if 0 +void vnic_port_mcast_del_all(struct vnic_port *port) +{ + + struct rb_node *n; + struct vnic_port_mcast *mcaste, *mcaste_t; + LIST_HEAD(local_list); + + ASSERT(port); + + n = rb_first(&port->mcast_tree.mcast_tree); + while (n) { + mcaste = rb_entry(n, struct vnic_port_mcast, rb_node); + list_add_tail(&mcaste->list, &local_list); + n = rb_next(&mcaste->rb_node); + } + + list_for_each_entry_safe(mcaste, mcaste_t, &local_list, list) { + list_del(&mcaste->list); + vnic_warn(port->name, "shouldn't find gid "VNIC_GID_FMT"\n", + VNIC_GID_ARG(mcaste->gid)); + vnic_port_mcast_release(mcaste); + } + + return; +} +#endif + +void vnic_tree_mcast_detach(struct mcast_root *mcast_tree) +{ + struct vnic_mcast *mcaste, *mcaste_t; + struct rb_node *n; + unsigned long flags; + INIT_LIST_HEAD(&mcast_tree->reattach_list); + + spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags); + n = rb_first(&mcast_tree->mcast_tree); + while (n) { + mcaste = rb_entry(n, struct vnic_mcast, rb_node); + list_add_tail(&mcaste->list, &mcast_tree->reattach_list); + n = rb_next(&mcaste->rb_node); + vnic_mcast_del(mcast_tree, mcaste); + mcaste->attach_task_cnt = 0; + } + spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags); + + list_for_each_entry_safe(mcaste, mcaste_t, &mcast_tree->reattach_list, list) { + vnic_mcast_detach(mcast_tree, mcaste); + } + + return; +} + +void vnic_tree_mcast_attach(struct mcast_root *mcast_tree) +{ + struct vnic_mcast *mcaste, *mcaste_t; + int rc; + + /* The add function grabs the mcast_rb_lock no need to take it */ + list_for_each_entry_safe(mcaste, mcaste_t, &mcast_tree->reattach_list, list) 
{ + rc = vnic_mcast_add(mcast_tree, mcaste); + ASSERT(!rc); + rc = vnic_mcast_attach(mcast_tree, mcaste); + ASSERT(!rc); + list_del(&mcaste->list); + } + + return; +} + +int vnic_mcast_init(void) +{ + ib_sa_register_client(&vnic_sa_client); + + mcast_wq = create_singlethread_workqueue("mcast_wq"); + if (!mcast_wq) { + /* don't leave the SA client registered on failure */ + ib_sa_unregister_client(&vnic_sa_client); + return -ENOMEM; + } + + return 0; +} + +void vnic_mcast_cleanup(void) +{ + ASSERT(mcast_wq); + vnic_dbg_mark(); + flush_workqueue(mcast_wq); + vnic_dbg_mark(); + destroy_workqueue(mcast_wq); + vnic_dbg_mark(); + ib_sa_unregister_client(&vnic_sa_client); + + return; +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_mcast.h b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_mcast.h new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_param.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_param.c new file mode 100644 index 0000000000000..56751aa752740 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_param.c @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "vnic.h" +#include "vnic_fip.h" + +u32 vnic_lro_num = VNIC_MAX_LRO_DESCS; +u32 vnic_net_admin = 1; +u32 vnic_child_max = VNIC_CHILD_MAX; +u32 vnic_tx_rings_num = 0; +u32 vnic_rx_rings_num = 0; +u32 vnic_tx_rings_len = VNIC_TX_QUEUE_LEN; +u32 vnic_rx_rings_len = VNIC_RX_QUEUE_LEN; +u32 vnic_mgid_data_type = 0; +u32 vnic_encap_headroom = 1; +u32 vnic_tx_polling = 1; +u32 vnic_rx_linear = 0; +u32 vnic_change_mac = 0; +u32 vnic_learn_mac_enabled = 1; +u32 vnic_synd_backlog = 4; +u32 vnic_eport_state_enforce = 0; +u32 vnic_src_mac_enforce = 0; +u32 vnic_inline_tshold = 0; +u32 vnic_discovery_pkeys[MAX_NUM_PKEYS_DISCOVERY]; +u32 vnic_discovery_pkeys_count = MAX_NUM_PKEYS_DISCOVERY; +u32 vnic_sa_query = 0; + +/* these params are exposed as module parameters only in debug mode (CONFIG_MLX4_VNIC_DEBUG) */ +u32 no_bxm = 0; +u32 vnic_msglvl = 0x80000000; +u32 vnic_max_tx_outs = VNIC_MAX_TX_OUTS; +u32 vnic_linear_small_pkt = 1; +u32 vnic_mcast_create = 0; +u32 vnic_napi_weight = VNIC_MAX_RX_CQE; + +module_param_named(tx_rings_num, vnic_tx_rings_num, int, 0444); +MODULE_PARM_DESC(tx_rings_num, "Number of TX rings, use 0 for #cpus [default 0, max 32]"); + +module_param_named(tx_rings_len, vnic_tx_rings_len, int, 0444); +MODULE_PARM_DESC(tx_rings_len, "Length of TX rings, must be power of two [default 1024, max 8K]"); + +module_param_named(rx_rings_num, vnic_rx_rings_num, int, 0444); +MODULE_PARM_DESC(rx_rings_num, "Number of RX rings, use 0 for #cpus [default 0, max 32]"); + +module_param_named(rx_rings_len, vnic_rx_rings_len, int, 0444); +MODULE_PARM_DESC(rx_rings_len, "Length of RX rings, must be power of two [default 2048, max 8K]"); + +module_param_named(eport_state_enforce, vnic_eport_state_enforce, int, 0644); +MODULE_PARM_DESC(eport_state_enforce, "Bring interface up only when corresponding EPort is up [default 0]"); + +module_param_named(src_mac_enforce, vnic_src_mac_enforce, int, 0644); +MODULE_PARM_DESC(src_mac_enforce, "Enforce source MAC address [default 0]"); + +module_param_named(vnic_net_admin, vnic_net_admin, int, 0644); +MODULE_PARM_DESC(vnic_net_admin, "Enable Network Administration mode [default 1]"); + +module_param_named(vnic_child_max, vnic_child_max, int, 0644); +MODULE_PARM_DESC(vnic_child_max, "Max child vNics (per interface), use 0 to disable [default 128]"); + +module_param_named(mgid_data_type, vnic_mgid_data_type, int, 0444); +MODULE_PARM_DESC(mgid_data_type, "Set MGID data type for multicast traffic [default 0, max 1]"); + +module_param_named(encap_headroom, vnic_encap_headroom, int, 0444); +MODULE_PARM_DESC(encap_headroom, "Use SKB headroom for protocol encapsulation [default 1]"); + +module_param_named(inline_tshold, vnic_inline_tshold, int, 0444); +MODULE_PARM_DESC(inline_tshold, "Packets smaller than this threshold (in bytes) use inline & blue flame [default 0, max 512]"); + +module_param_named(tx_polling, vnic_tx_polling, int, 0444); +MODULE_PARM_DESC(tx_polling, "Enable TX polling mode [default 1]"); + +module_param_named(rx_linear, vnic_rx_linear, int, 0444); +MODULE_PARM_DESC(rx_linear, "Enable linear RX buffers [default 0]"); + +module_param_named(change_mac, vnic_change_mac, int, 0444); +MODULE_PARM_DESC(change_mac, "Enable MAC change using child vNics [default 0]"); + +module_param_named(learn_tx_mac, vnic_learn_mac_enabled, int, 0644); +MODULE_PARM_DESC(learn_tx_mac, "Enable TX MAC learning in promisc mode [default 1]"); + +module_param_named(synd_backlog, vnic_synd_backlog, int, 0644); +MODULE_PARM_DESC(synd_backlog, "Syndrome error reporting backlog limit [default 4]"); + 
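+/*
+ * Usage sketch (illustrative, not part of the driver): the parameters
+ * above are read once at module load time, e.g.
+ *
+ *   modprobe mlx4_vnic tx_rings_num=4 rx_rings_len=4096 learn_tx_mac=0
+ *
+ * The values shown are hypothetical. Parameters registered with mode
+ * 0644 stay writable via /sys/module/mlx4_vnic/parameters/, while the
+ * 0444 ones can only be set at load time; vnic_param_check() below
+ * then clamps each value into its legal range.
+ */
+
+#if 0
+/* Hypothetical helper equivalent to the max_t/min_t clamp pattern
+ * used by vnic_param_check(); kept out of the build on purpose. */
+static inline u32 vnic_clamp_u32(u32 val, u32 lo, u32 hi)
+{
+	return min_t(u32, max_t(u32, val, lo), hi);
+}
+#endif
+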
+module_param_array_named(discovery_pkeys, vnic_discovery_pkeys, int, &vnic_discovery_pkeys_count, 0444); +MODULE_PARM_DESC(discovery_pkeys, "Vector of PKeys to be used for discovery [default 0xffff, max vector length 24]"); + +module_param_named(sa_query, vnic_sa_query, int, 0644); +MODULE_PARM_DESC(sa_query, "Query SA for each IB address and ignore gateway assigned SLs [default 0]"); + + +#if !(defined(NETIF_F_GRO) && !defined(_BP_NO_GRO)) +module_param_named(lro_num, vnic_lro_num, int, 0444); +MODULE_PARM_DESC(lro_num, "Number of LRO sessions per ring, use 0 to disable [default 32, max 32]"); +#endif + +#ifdef CONFIG_MLX4_VNIC_DEBUG +module_param_named(no_bxm, no_bxm, int, 0444); +MODULE_PARM_DESC(no_bxm, "Enable NO BXM mode [default 0]"); + +module_param_named(msglvl, vnic_msglvl, uint, 0644); +MODULE_PARM_DESC(msglvl, "Debug message level [default 0]"); + +module_param_named(max_tx_outs, vnic_max_tx_outs, int, 0644); +MODULE_PARM_DESC(max_tx_outs, "Max outstanding TX packets [default 16]"); + +module_param_named(linear_small_pkt, vnic_linear_small_pkt, int, 0644); +MODULE_PARM_DESC(linear_small_pkt, "Use linear buffer for small packets [default 1]"); + +module_param_named(mcast_create, vnic_mcast_create, int, 0444); +MODULE_PARM_DESC(mcast_create, "Create multicast group during join request [default 0]"); + +module_param_named(napi_weight, vnic_napi_weight, int, 0444); +MODULE_PARM_DESC(napi_weight, "NAPI weight [default 32]"); +#endif /* CONFIG_MLX4_VNIC_DEBUG */ + +int vnic_param_check(void) { +#ifdef CONFIG_MLX4_VNIC_DEBUG + vnic_info("VNIC_DEBUG flag is set\n"); +#endif + + vnic_mcast_create = vnic_mcast_create ? 1 : 0; + vnic_mcast_create = no_bxm ? 1 : vnic_mcast_create; + no_bxm = no_bxm ? 1 : 0; + vnic_sa_query = vnic_sa_query ? 1 : 0; + + vnic_mgid_data_type = max_t(u32, vnic_mgid_data_type, 0); + vnic_mgid_data_type = min_t(u32, vnic_mgid_data_type, 1); + + vnic_rx_rings_num = max_t(u32, vnic_rx_rings_num, 0); + vnic_rx_rings_num = min_t(u32, vnic_rx_rings_num, VNIC_MAX_NUM_CPUS); + + vnic_tx_rings_num = max_t(u32, vnic_tx_rings_num, 0); + vnic_tx_rings_num = min_t(u32, vnic_tx_rings_num, VNIC_MAX_NUM_CPUS); + + vnic_tx_rings_len = rounddown_pow_of_two(vnic_tx_rings_len); + vnic_tx_rings_len = max_t(u32, vnic_tx_rings_len, VNIC_TX_QUEUE_LEN_MIN); + vnic_tx_rings_len = min_t(u32, vnic_tx_rings_len, VNIC_TX_QUEUE_LEN_MAX); + + vnic_rx_rings_len = rounddown_pow_of_two(vnic_rx_rings_len); + vnic_rx_rings_len = max_t(u32, vnic_rx_rings_len, VNIC_RX_QUEUE_LEN_MIN); + vnic_rx_rings_len = min_t(u32, vnic_rx_rings_len, VNIC_RX_QUEUE_LEN_MAX); + + vnic_max_tx_outs = min_t(u32, vnic_tx_rings_len, vnic_max_tx_outs); + + vnic_napi_weight = min_t(u32, vnic_napi_weight, VNIC_MAX_NUM_CPUS); + + vnic_lro_num = max_t(u32, vnic_lro_num, 0); + vnic_lro_num = min_t(u32, vnic_lro_num, VNIC_MAX_LRO_DESCS); + + vnic_inline_tshold = max_t(u32, vnic_inline_tshold, 0); + vnic_inline_tshold = min_t(u32, vnic_inline_tshold, VNIC_MAX_INLINE_TSHOLD); + + return 0; +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_port.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_port.c new file mode 100644 index 0000000000000..a973deb7aecdf --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_port.c @@ -0,0 +1,492 @@ +/* + * Copyright (c) 2009 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "vnic.h" +#include "vnic_data.h" + +/* globals */ +struct workqueue_struct *port_wq; +struct workqueue_struct *login_wq; + +/* functions */ +static void vnic_port_event(struct ib_event_handler *handler, + struct ib_event *record) +{ + struct vnic_port *port = + container_of(handler, struct vnic_port, event_handler); + + if (record->element.port_num != port->num) + return; + + vnic_info("Received event 0x%x (device %s port %d)\n", + record->event, record->device->name, + record->element.port_num); + + switch (record->event) { + case IB_EVENT_SM_CHANGE: + case IB_EVENT_CLIENT_REREGISTER: + /* calls vnic_port_event_task_light() */ + queue_delayed_work(fip_wq, &port->event_task_light, msecs_to_jiffies(VNIC_SM_HEADSTART)); + break; + case IB_EVENT_PORT_ERR: + case IB_EVENT_PORT_ACTIVE: + /* calls vnic_port_event_task() */ + queue_delayed_work(fip_wq, &port->event_task, msecs_to_jiffies(VNIC_SM_HEADSTART)); + break; + case IB_EVENT_PKEY_CHANGE: + case IB_EVENT_LID_CHANGE: + /* calls port_fip_discover_restart() */ + if (no_bxm) + queue_delayed_work(fip_wq, &port->event_task, 0); + else + queue_delayed_work(port_wq, &port->discover_restart_task, msecs_to_jiffies(VNIC_SM_HEADSTART)); + break; + case IB_EVENT_SRQ_ERR: + case IB_EVENT_SRQ_LIMIT_REACHED: + case IB_EVENT_QP_LAST_WQE_REACHED: + case IB_EVENT_DEVICE_FATAL: + default: + vnic_warn(port->name, "event 0x%x unhandled\n", record->event); + break; + } + +} + +static inline u8 vnic_mcast_rate_enum(struct vnic_port *port, int rate) +{ + u8 ret; + + switch (rate) { + case 10: + ret = IB_RATE_10_GBPS; + break; + case 20: + ret = IB_RATE_20_GBPS; + break; + case 40: + ret = IB_RATE_40_GBPS; + break; + case 80: + ret = IB_RATE_80_GBPS; + break; + default: + ret = IB_RATE_10_GBPS; + } + return ret; +} + +int vnic_port_query(struct vnic_port *port) +{ + if (ib_query_gid(port->dev->ca, port->num, 0, &port->gid)) { + vnic_err(port->name, "ib_query_gid failed\n"); + return -EINVAL; + } + + if (ib_query_port(port->dev->ca, port->num, &port->attr)) { + vnic_err(port->name, "ib_query_port failed\n"); + return -EINVAL; + } + + port->max_mtu_enum = ib_mtu_enum_to_int(port->attr.max_mtu); + port->rate = ((int)port->attr.active_speed * + ib_width_enum_to_int(port->attr.active_width) * 25) / 10; + port->rate_enum = 
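+ /* map the computed Gb/s figure onto an ib_rate enum (falls back to 10 Gb/s) */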
vnic_mcast_rate_enum(port, port->rate); + + if (ib_query_pkey(port->dev->ca, port->num, port->pkey_index, + &port->pkey)) { + vnic_err(port->name, "ib_query_pkey failed for index %d\n", + port->pkey_index); + return -EINVAL; + } + port->pkey |= 0x8000; + + return 0; +} + +void vnic_port_event_task(struct work_struct *work) +{ + struct vnic_port *port = + container_of(work, struct vnic_port, event_task.work); + struct fip_discover *discover; + + /* refresh port attr, TODO: check what else need to be refreshed */ + vnic_dbg_mark(); + mutex_lock(&port->mlock); + if (vnic_port_query(port)) + vnic_warn(port->name, "vnic_port_query failed\n"); + mutex_unlock(&port->mlock); + + /* refresh login mcasts */ + vnic_login_refresh_mcasts(port); + + list_for_each_entry(discover, &port->fip.discover_list, discover_list) { + /* refresh FIP mcasts */ + if ((!no_bxm) && (discover->state != FIP_DISCOVER_OFF)) + fip_refresh_mcasts(discover); + } +} + +void vnic_port_event_task_light(struct work_struct *work) +{ + struct vnic_port *port = + container_of(work, struct vnic_port, event_task_light.work); + unsigned long flags,mc_flags; + struct fip_discover *discover; + struct rb_node *node; + struct vnic_port_mcast *mcaste; + struct mcast_root *mcast_tree = &port->mcast_tree; + struct vnic_login *login; + vnic_dbg_mark(); + mutex_lock(&port->mlock); + + if (vnic_port_query(port)) + vnic_warn(port->name, "vnic_port_query failed\n"); + + spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags); + for (node = rb_first(&mcast_tree->mcast_tree); node; node = rb_next(node)){ + mcaste = rb_entry(node, struct vnic_port_mcast , rb_node); + clear_bit(MCAST_JOINED, &mcaste->state); + set_bit(MCAST_JOIN_RUNNING, &mcaste->state); + vnic_dbg_mcast(mcaste->port->name,"Rejoin GID="VNIC_GID_FMT"\n",VNIC_GID_ARG(mcaste->gid)); + spin_lock_irqsave(&mcaste->lock, mc_flags); + queue_delayed_work(mcast_wq, &mcaste->join_task, 0); + spin_unlock_irqrestore(&mcaste->lock, mc_flags); + } + + spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags); + + vnic_dbg_mark(); + if (vnic_sa_query) + list_for_each_entry(login, &port->login_list, list) + { + /* take the tx lock to make sure no delete function is called at the time */ + netif_tx_lock_bh(login->dev); + vnic_neigh_invalidate(login); + netif_tx_unlock_bh(login->dev); + } + + mutex_unlock(&port->mlock); + + list_for_each_entry(discover, &port->fip.discover_list, discover_list) { + if ((!no_bxm) && (discover->state != FIP_DISCOVER_OFF)) + fip_refresh_mcasts(discover); + } +} + +struct vnic_port *vnic_port_alloc(struct vnic_ib_dev *vnic_dev, u8 num) +{ + struct vnic_port *port; + int def_rings_num; + int max_num_cpus; + + port = kzalloc(sizeof *port, GFP_KERNEL); + if (!port) + return ERR_PTR(-ENOMEM); + + /* pre-init fields */ + port->num = num; + port->dev = vnic_dev; + + max_num_cpus = min((int)num_online_cpus(), VNIC_MAX_NUM_CPUS); + def_rings_num = min(vnic_dev->ca->num_comp_vectors, max_num_cpus); + port->rx_rings_num = vnic_rx_rings_num ? vnic_rx_rings_num : def_rings_num; + port->tx_rings_num = vnic_tx_rings_num ? 
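+ /* a nonzero module param wins; 0 means one ring per CPU, capped by the HCA's completion vectors */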
vnic_tx_rings_num : def_rings_num; + + sprintf(port->name, "%s:%d", port->dev->ca->name, port->num); + INIT_LIST_HEAD(&port->login_list); + INIT_LIST_HEAD(&port->fip.discover_list); + INIT_DELAYED_WORK(&port->event_task, vnic_port_event_task); + INIT_DELAYED_WORK(&port->event_task_light, vnic_port_event_task_light); + INIT_DELAYED_WORK(&port->discover_restart_task, port_fip_discover_restart); + INIT_IB_EVENT_HANDLER(&port->event_handler, vnic_dev->ca, + vnic_port_event); + mutex_init(&port->mlock); + mutex_init(&port->start_stop_lock); + vnic_mcast_root_init(&port->mcast_tree); + atomic_set(&port->vnic_child_ids, 0); + + port->pkey_index = 0; /* used by fip qps, TBD */ + + if (ib_register_event_handler(&port->event_handler)) { + vnic_err(port->name, "ib_register_event_handler failed\n"); + goto err; + } + + vnic_dbg_mark(); + mutex_lock(&port->mlock); + if (vnic_port_query(port)) { + vnic_err(port->name, "vnic_port_query failed\n"); + mutex_unlock(&port->mlock); + if (ib_unregister_event_handler(&port->event_handler)) + vnic_err(port->name, "ib_unregister_event_handler failed!\n"); + goto err; + } + mutex_unlock(&port->mlock); + + return port; +err: + kfree(port); + return ERR_PTR(-EINVAL); +} + +int vnic_port_init(struct vnic_port *port) +{ + return vnic_port_ib_init(port); +} + +void vnic_port_cleanup(struct vnic_port *port) +{ + /* should be empty list */ + vnic_port_ib_cleanup(port); + return; +} + +static void vnic_ib_dev_add_one(struct ib_device *device); +static void vnic_ib_dev_remove_one(struct ib_device *device); +static struct ib_client vnic_init_client = { + .name = DRV_NAME, + .add = vnic_ib_dev_add_one, + .remove = vnic_ib_dev_remove_one, +}; + +static void vnic_ib_dev_add_one(struct ib_device *device) +{ + struct vnic_port *ib_port; + struct vnic_ib_dev *ib_dev; + int s, e, p, rc; + + vnic_dbg(NULL, "ib_dev %s\n", device->name); + + if (memcmp(device->name, "mlx4", 4)) + return; + + if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB) + return; + + s = 1; + e = device->phys_port_cnt; + + /* alloc ib device */ + ib_dev = kzalloc(sizeof *ib_dev, GFP_KERNEL); + if (!ib_dev) + return; + + /* init ib dev */ + mutex_init(&ib_dev->mlock); + ib_dev->ca = device; + mutex_lock(&ib_dev->mlock); + /* TODO: remove mdev once all mlx4 caps are standard */ + ib_dev->mdev = to_mdev(device); + ASSERT(ib_dev->ca); + sprintf(ib_dev->name, "%s", device->name); + if (ib_query_device(device, &ib_dev->attr)) { + vnic_err(ib_dev->name, "ib_query_device failed on %s\n", + device->name); + goto abort; + } + + VNIC_FW_STR(ib_dev->attr.fw_ver, ib_dev->fw_ver_str); + INIT_LIST_HEAD(&ib_dev->port_list); + vnic_dbg_mark(); + for (p = s; p <= e; ++p) { + /* skip non IB link layers */ + if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND) + continue; + + /* alloc IB port */ + ib_port = vnic_port_alloc(ib_dev, p); + if (IS_ERR(ib_port)) { + vnic_err(ib_dev->name, + "vnic_port_alloc failed %d from %d\n", p, e); + continue; + } + /* init IB port */ + rc = vnic_port_init(ib_port); + if (rc) { + vnic_err(ib_port->name, + "vnic_port_init failed, rc %d\n", rc); + if (ib_unregister_event_handler(&ib_port->event_handler)) + vnic_err(ib_port->name, + "ib_unregister_event_handler failed!\n"); + kfree(ib_port); + continue; + } + if (no_bxm) { + rc = vnic_port_data_init(ib_port); + if (rc) + vnic_err(ib_port->name, + "vnic_port_data_init failed, rc %d\n", rc); + } else { + rc = vnic_port_fip_init(ib_port); + if (rc) + vnic_err(ib_port->name, + "vnic_port_fip_init failed, rc %d\n", rc); + 
else { + rc = port_fs_init(ib_port); + if (rc) + vnic_warn(ib_port->name, "port_fs_init sysfs:" + "entry creation failed, %d\n", rc); + } + } + if (rc) { + if (ib_unregister_event_handler(&ib_port->event_handler)) + vnic_err(ib_port->name, + "ib_unregister_event_handler failed!\n"); + vnic_port_cleanup(ib_port); + kfree(ib_port); + continue; + + } + vnic_dbg_mark(); + mutex_lock(&ib_port->start_stop_lock); + list_add_tail(&ib_port->list, &ib_dev->port_list); + mutex_unlock(&ib_port->start_stop_lock); + } + + /* set device ctx */ + ib_set_client_data(device, &vnic_init_client, ib_dev); + mutex_unlock(&ib_dev->mlock); + return; + +abort: + mutex_unlock(&ib_dev->mlock); + kfree(ib_dev); +} + +static void vnic_ib_dev_remove_one(struct ib_device *device) +{ + struct vnic_port *port, *port_t; + struct vnic_ib_dev *ib_dev = + ib_get_client_data(device, &vnic_init_client); + + vnic_dbg(NULL, "ib_dev %s\n", device->name); + + if (!ib_dev) + return; + + vnic_dbg_mark(); + mutex_lock(&ib_dev->mlock); + list_for_each_entry_safe(port, port_t, &ib_dev->port_list, list) { + vnic_dbg(port->name, "port %d\n", port->num); + if (ib_unregister_event_handler(&port->event_handler)) + vnic_err(port->name, "ib_unregister_event_handler failed!\n"); + /* make sure we don't have any more pending events */ +#ifndef _BP_WORK_SYNC + cancel_delayed_work_sync(&port->event_task_light); + cancel_delayed_work_sync(&port->event_task); + cancel_delayed_work_sync(&port->discover_restart_task); +#else + cancel_delayed_work(&port->event_task_light); + cancel_delayed_work(&port->event_task); + cancel_delayed_work(&port->discover_restart_task); + flush_workqueue(port_wq); + flush_workqueue(fip_wq); +#endif + /* remove sysfs entries related to FIP + * we want to do this outside the lock + */ + port_fs_exit(port); + + /* cleanup any pending vnics */ + vnic_dbg_mark(); + mutex_lock(&port->start_stop_lock); + list_del(&port->list); + if (no_bxm) + vnic_port_data_cleanup(port); + else { + vnic_port_fip_cleanup(port, 0); + } + mutex_unlock(&port->start_stop_lock); + vnic_port_cleanup(port); + kfree(port); + } + mutex_unlock(&ib_dev->mlock); + + kfree(ib_dev); +} + +int vnic_ports_init(void) +{ + int rc; + + /* create global wq */ + port_wq = create_singlethread_workqueue("port_wq"); + if (!port_wq) { + vnic_err(NULL, "create_singlethread_workqueue failed for %s\n", + "port_wq"); + return -EINVAL; + } + + login_wq = create_singlethread_workqueue("login_wq"); + if (!login_wq) { + vnic_err(NULL, "create_singlethread_workqueue failed for %s\n", + "login_wq"); + goto free_wq0; + } + + fip_wq = create_singlethread_workqueue("fip"); + if (!fip_wq) { + vnic_err(NULL, "create_singlethread_workqueue failed for %s\n", + "fip"); + goto free_wq1; + } + + /* calls vnic_ib_dev_add_one() */ + rc = ib_register_client(&vnic_init_client); + if (rc) { + vnic_err(NULL, "ib_register_client failed %d\n", rc); + goto free_wq2; + } + + return 0; + +free_wq2: + destroy_workqueue(fip_wq); +free_wq1: + destroy_workqueue(login_wq); +free_wq0: + destroy_workqueue(port_wq); + + return -EINVAL; +} + +void vnic_ports_cleanup(void) +{ + vnic_dbg(NULL, "calling ib_unregister_client\n"); + /* calls vnic_ib_dev_remove_one() */ + ib_unregister_client(&vnic_init_client); + vnic_dbg(NULL, "calling destroy_workqueue\n"); + destroy_workqueue(fip_wq); + destroy_workqueue(login_wq); + destroy_workqueue(port_wq); + vnic_dbg(NULL, "vnic_data_cleanup done\n"); +} diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_qp.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_qp.c new 
file mode 100644 index 0000000000000..c8fb317a0cd43 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_qp.c @@ -0,0 +1,1636 @@ +/* + * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#include +#include +#include + +#include +#include + +#include "vnic.h" + +/* compare with drivers/infiniband/hw/mlx4/qp.c */ +#define mlx4_ib_dbg(format, arg...) vnic_dbg(NULL, format, ## arg) + +enum { + MLX4_IB_ACK_REQ_FREQ = 8, +}; + +enum { + MLX4_IB_DEFAULT_SCHED_QUEUE = 0x83, + MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f, + MLX4_IB_LINK_TYPE_IB = 0, + MLX4_IB_LINK_TYPE_ETH = 1, +}; + +enum { + /* + * Largest possible UD header: send with GRH and immediate data. 
+ * 4 bytes added to accommodate for eth header instead of lrh + */ + MLX4_IB_UD_HEADER_SIZE = 76, + MLX4_IB_MAX_RAW_ETY_HDR_SIZE = 12 +}; + +enum { + MLX4_IBOE_ETHERTYPE = 0x8915 +}; + +struct mlx4_ib_sqp { + struct mlx4_ib_qp qp; + int pkey_index; + u32 qkey; + u32 send_psn; + struct ib_ud_header ud_header; + u8 header_buf[MLX4_IB_UD_HEADER_SIZE]; +}; + +enum { + MLX4_IB_MIN_SQ_STRIDE = 6 +}; + +static const __be32 mlx4_ib_opcode[] = { + [IB_WR_SEND] = cpu_to_be32(MLX4_OPCODE_SEND), + [IB_WR_LSO] = cpu_to_be32(MLX4_OPCODE_LSO), + [IB_WR_SEND_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_SEND_IMM), + [IB_WR_RDMA_WRITE] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE), + [IB_WR_RDMA_WRITE_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM), + [IB_WR_RDMA_READ] = cpu_to_be32(MLX4_OPCODE_RDMA_READ), + [IB_WR_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_ATOMIC_CS), + [IB_WR_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_ATOMIC_FA), + [IB_WR_SEND_WITH_INV] = cpu_to_be32(MLX4_OPCODE_SEND_INVAL), + [IB_WR_LOCAL_INV] = cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL), + [IB_WR_FAST_REG_MR] = cpu_to_be32(MLX4_OPCODE_FMR), + [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS), + [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA), +}; + +#ifndef wc_wmb + #if defined(__i386__) + #define wc_wmb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory") + #elif defined(__x86_64__) + #define wc_wmb() asm volatile("sfence" ::: "memory") + #elif defined(__ia64__) + #define wc_wmb() asm volatile("fwb" ::: "memory") + #else + #define wc_wmb() wmb() + #endif +#endif + +#if 0 +static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp) +{ + return container_of(mqp, struct mlx4_ib_sqp, qp); +} +#endif + +static void *get_wqe(struct mlx4_ib_qp *qp, int offset) +{ + return mlx4_buf_offset(&qp->buf, offset); +} + +static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift)); +} + +static void *get_send_wqe(struct mlx4_ib_qp *qp, int n) +{ + return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift)); +} + +/* + * Stamp a SQ WQE so that it is invalid if prefetched by marking the + * first four bytes of every 64 byte chunk with + * 0x7FFFFFF | (invalid_ownership_value << 31). + * + * When the max work request size is less than or equal to the WQE + * basic block size, as an optimization, we can stamp all WQEs with + * 0xffffffff, and skip the very first chunk of each WQE. + */ +static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size) +{ + __be32 *wqe; + int i; + int s; + int ind; + void *buf; + __be32 stamp; + struct mlx4_wqe_ctrl_seg *ctrl; + + if (qp->sq_max_wqes_per_wr > 1) { + s = roundup(size, 1U << qp->sq.wqe_shift); + for (i = 0; i < s; i += 64) { + ind = (i >> qp->sq.wqe_shift) + n; + stamp = ind & qp->sq.wqe_cnt ? 
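+ /* bit 31 of the stamp carries the wrong owner bit for this generation, so a prefetched WQE reads as invalid */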
cpu_to_be32(0x7fffffff) : + cpu_to_be32(0xffffffff); + buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); + wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1)); + *wqe = stamp; + } + } else { + ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); + s = (ctrl->fence_size & 0x3f) << 4; + for (i = 64; i < s; i += 64) { + wqe = buf + i; + *wqe = cpu_to_be32(0xffffffff); + } + } +} + +static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size) +{ + struct mlx4_wqe_ctrl_seg *ctrl; + struct mlx4_wqe_inline_seg *inl; + void *wqe; + int s; + + ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); + s = sizeof(struct mlx4_wqe_ctrl_seg); + + if (qp->ibqp.qp_type == IB_QPT_UD) { + struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl; + struct mlx4_av *av = (struct mlx4_av *)dgram->av; + memset(dgram, 0, sizeof *dgram); + av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn); + s += sizeof(struct mlx4_wqe_datagram_seg); + } + + /* Pad the remainder of the WQE with an inline data segment. */ + if (size > s) { + inl = wqe + s; + inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl)); + } + ctrl->srcrb_flags = 0; + ctrl->fence_size = size / 16; + /* + * Make sure descriptor is fully written before setting ownership bit + * (because HW can start executing as soon as we do). + */ + wmb(); + + ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) | + (n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0); + + stamp_send_wqe(qp, n + qp->sq_spare_wqes, size); +} + +/* Post NOP WQE to prevent wrap-around in the middle of WR */ +static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind) +{ + unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1)); + if (unlikely(s < qp->sq_max_wqes_per_wr)) { + post_nop_wqe(qp, ind, s << qp->sq.wqe_shift); + ind += s; + } + return ind; +} + +static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) +{ + struct ib_event event; + struct mlx4_ib_qp *mqp = to_mibqp(qp); + struct ib_qp *ibqp = &mqp->ibqp; + + if (type == MLX4_EVENT_TYPE_PATH_MIG) + to_mibqp(qp)->port = to_mibqp(qp)->alt_port; + + if (ibqp->event_handler) { + event.device = ibqp->device; + switch (type) { + case MLX4_EVENT_TYPE_PATH_MIG: + event.event = IB_EVENT_PATH_MIG; + break; + case MLX4_EVENT_TYPE_COMM_EST: + event.event = IB_EVENT_COMM_EST; + break; + case MLX4_EVENT_TYPE_SQ_DRAINED: + event.event = IB_EVENT_SQ_DRAINED; + break; + case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE: + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + break; + case MLX4_EVENT_TYPE_WQ_CATAS_ERROR: + event.event = IB_EVENT_QP_FATAL; + break; + case MLX4_EVENT_TYPE_PATH_MIG_FAILED: + event.event = IB_EVENT_PATH_MIG_ERR; + break; + case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + event.event = IB_EVENT_QP_REQ_ERR; + break; + case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR: + event.event = IB_EVENT_QP_ACCESS_ERR; + break; + default: + printk(KERN_WARNING "mlx4_ib: Unexpected event type %d " + "on QP %06x\n", type, qp->qpn); + return; + } + + event.element.qp = ibqp; + ibqp->event_handler(&event, ibqp->qp_context); + } +} + +static int send_wqe_overhead(enum ib_qp_type type, u32 flags) +{ + /* + * UD WQEs must have a datagram segment. + * RC and UC WQEs might have a remote address segment. + * MLX WQEs need two extra inline data segments (for the UD + * header and space for the ICRC). + */ + switch (type) { + case IB_QPT_UD: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_datagram_seg) + + ((flags & MLX4_IB_QP_LSO) ? 
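+ /* extra room for the inline LSO header segment */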
128 : 0); + case IB_QPT_UC: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_raddr_seg); + case IB_QPT_XRC_TGT: + case IB_QPT_RC: + return sizeof (struct mlx4_wqe_ctrl_seg) + + sizeof (struct mlx4_wqe_atomic_seg) + + sizeof (struct mlx4_wqe_raddr_seg); + case IB_QPT_SMI: + case IB_QPT_GSI: + return sizeof (struct mlx4_wqe_ctrl_seg) + + ALIGN(MLX4_IB_UD_HEADER_SIZE + + DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE, + MLX4_INLINE_ALIGN) * + sizeof (struct mlx4_wqe_inline_seg), + sizeof (struct mlx4_wqe_data_seg)) + + ALIGN(4 + + sizeof (struct mlx4_wqe_inline_seg), + sizeof (struct mlx4_wqe_data_seg)); + case IB_QPT_RAW_ETHERTYPE: + return sizeof(struct mlx4_wqe_ctrl_seg) + + ALIGN(MLX4_IB_MAX_RAW_ETY_HDR_SIZE + + sizeof(struct mlx4_wqe_inline_seg), + sizeof(struct mlx4_wqe_data_seg)); + + default: + return sizeof (struct mlx4_wqe_ctrl_seg); + } +} + +static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, + int is_user, int has_rq, struct mlx4_ib_qp *qp) +{ + /* Sanity check RQ size before proceeding */ + if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE || + cap->max_recv_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)) + return -EINVAL; + + if (!has_rq) { + if (cap->max_recv_wr) + return -EINVAL; + + qp->rq.wqe_cnt = qp->rq.max_gs = 0; + } else { + /* HW requires >= 1 RQ entry with >= 1 gather entry */ + if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) + return -EINVAL; + + qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, cap->max_recv_wr)); + qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge)); + qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg)); + } + + /* leave userspace return values as they were, so as not to break ABI */ + if (is_user) { + cap->max_recv_wr = qp->rq.max_post = qp->rq.wqe_cnt; + cap->max_recv_sge = qp->rq.max_gs; + } else { + cap->max_recv_wr = qp->rq.max_post = + min(dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt); + cap->max_recv_sge = min(qp->rq.max_gs, + min(dev->dev->caps.max_sq_sg, + dev->dev->caps.max_rq_sg)); + } + + return 0; +} + +static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, + enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp) +{ + int s; + + /* Sanity check SQ size before proceeding */ + if (cap->max_send_wr > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) || + cap->max_send_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) || + cap->max_inline_data + send_wqe_overhead(type, qp->flags) + + sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz) + return -EINVAL; + + /* + * For MLX transport we need 2 extra S/G entries: + * one for the header and one for the checksum at the end + */ + if ((type == MLX4_IB_QPT_SMI || type == MLX4_IB_QPT_GSI || + type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) && + cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg) + return -EINVAL; + + s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg), + cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) + + send_wqe_overhead(type, qp->flags); + + if (s > dev->dev->caps.max_sq_desc_sz) + return -EINVAL; + + /* + * Hermon supports shrinking WQEs, such that a single work + * request can include multiple units of 1 << wqe_shift. This + * way, work requests can differ in size, and do not have to + * be a power of 2 in size, saving memory and speeding up send + * WR posting. 
Unfortunately, if we do this then the + * wqe_index field in CQEs can't be used to look up the WR ID + * anymore, so we do this only if selective signaling is off. + * + * Further, on 32-bit platforms, we can't use vmap() to make + * the QP buffer virtually contiguous. Thus we have to use + * constant-sized WRs to make sure a WR is always fully within + * a single page-sized chunk. + * + * Finally, we use NOP work requests to pad the end of the + * work queue, to avoid wrap-around in the middle of WR. We + * set NEC bit to avoid getting completions with error for + * these NOP WRs, but since NEC is only supported starting + * with firmware 2.2.232, we use constant-sized WRs for older + * firmware. + * + * And, since MLX QPs only support SEND, we use constant-sized + * WRs in this case. + * + * We look for the smallest value of wqe_shift such that the + * resulting number of wqes does not exceed device + * capabilities. + * + * We set WQE size to at least 64 bytes, this way stamping + * invalidates each WQE. + */ + if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC && + qp->sq_signal_bits && BITS_PER_LONG == 64 && + type != MLX4_IB_QPT_SMI && type != MLX4_IB_QPT_GSI && + !(type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | + MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) + qp->sq.wqe_shift = ilog2(64); + else + qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s)); + + for (;;) { + qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift); + + /* + * We need to leave 2 KB + 1 WR of headroom in the SQ to + * allow HW to prefetch. + */ + qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr; + qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr * + qp->sq_max_wqes_per_wr + + qp->sq_spare_wqes); + + if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes) + break; + + if (qp->sq_max_wqes_per_wr <= 1) + return -EINVAL; + + ++qp->sq.wqe_shift; + } + + qp->sq.max_gs = (min(dev->dev->caps.max_sq_desc_sz, + (qp->sq_max_wqes_per_wr << qp->sq.wqe_shift)) - + send_wqe_overhead(type, qp->flags)) / + sizeof (struct mlx4_wqe_data_seg); + + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + if (qp->rq.wqe_shift > qp->sq.wqe_shift) { + qp->rq.offset = 0; + qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; + } else { + qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift; + qp->sq.offset = 0; + } + + cap->max_send_wr = qp->sq.max_post = + (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr; + cap->max_send_sge = min(qp->sq.max_gs, + min(dev->dev->caps.max_sq_sg, + dev->dev->caps.max_rq_sg)); + qp->max_inline_data = cap->max_inline_data; + + return 0; +} + + + +static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state) +{ + switch (state) { + case IB_QPS_RESET: return MLX4_QP_STATE_RST; + case IB_QPS_INIT: return MLX4_QP_STATE_INIT; + case IB_QPS_RTR: return MLX4_QP_STATE_RTR; + case IB_QPS_RTS: return MLX4_QP_STATE_RTS; + case IB_QPS_SQD: return MLX4_QP_STATE_SQD; + case IB_QPS_SQE: return MLX4_QP_STATE_SQER; + case IB_QPS_ERR: return MLX4_QP_STATE_ERR; + default: return -1; + } +} + +static void del_gid_entries(struct mlx4_ib_qp *qp) +{ + struct mlx4_ib_gid_entry *ge, *tmp; + + list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { + list_del(&ge->list); + kfree(ge); + } +} + +static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, + struct ib_qp_init_attr *init_attr) +{ + if (qp->state != IB_QPS_RESET) + if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state), + MLX4_QP_STATE_RST, NULL, 0, 
0, &qp->mqp)) + printk(KERN_WARNING "mlx4_ib: modify QP %06x to RESET failed.\n", + qp->mqp.qpn); + + mlx4_qp_remove(dev->dev, &qp->mqp); + mlx4_qp_free(dev->dev, &qp->mqp); + mlx4_mtt_cleanup(dev->dev, &qp->mtt); + mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); + kfree(qp->sq.wrid); + kfree(qp->rq.wrid); + mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); + if (qp->max_inline_data) + mlx4_bf_free(dev->dev, &qp->bf); + if (!init_attr->srq) + mlx4_db_free(dev->dev, &qp->db); + + del_gid_entries(qp); +} + +static int qp_has_rq(struct ib_qp_init_attr *attr) +{ + if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT) + return 0; + + return !attr->srq; +} + + +static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp) +{ + int qpn; + int err; + enum mlx4_ib_qp_type qp_type = + (enum mlx4_ib_qp_type) init_attr->qp_type; + qp->mlx4_ib_qp_type = qp_type; + qp->pri.vid = qp->alt.vid = 0xFFFF; + mutex_init(&qp->mutex); + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + INIT_LIST_HEAD(&qp->gid_list); + INIT_LIST_HEAD(&qp->steering_rules); + + qp->state = IB_QPS_RESET; + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + + err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, + qp_has_rq(init_attr), qp); + if (err) + goto err; + + if (pd->uobject) { + } else { + qp->sq_no_prefetch = 0; + + if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) + qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; + + if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) + qp->flags |= MLX4_IB_QP_LSO; + + if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP && + dev->dev->caps.steering_mode == + MLX4_STEERING_MODE_DEVICE_MANAGED && + !mlx4_is_mfunc(dev->dev)) + qp->flags |= MLX4_IB_QP_NETIF; + + err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp); + if (err) + goto err; + + if (qp_has_rq(init_attr)) { + err = mlx4_db_alloc(dev->dev, &qp->db, 0, GFP_KERNEL); + if (err) + goto err; + + *qp->db.db = 0; + } + + if (qp->max_inline_data) { + err = mlx4_bf_alloc(dev->dev, &qp->bf, 0); + if (err) { + mlx4_ib_dbg("failed to allocate blue flame register (%d)", err); + qp->bf.uar = &dev->priv_uar; + } + } else + qp->bf.uar = &dev->priv_uar; + + if (mlx4_buf_alloc(dev->dev, qp->buf_size, + PAGE_SIZE * 2, &qp->buf, GFP_KERNEL)) { + err = -ENOMEM; + goto err_db; + } + + err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift, + &qp->mtt); + if (err) { + mlx4_ib_dbg("kernel qp mlx4_mtt_init error (%d)", err); + goto err_buf; + } + + err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf, GFP_KERNEL); + if (err) { + mlx4_ib_dbg("mlx4_buf_write_mtt error (%d)", err); + goto err_mtt; + } + + /* these are big chunks that may fail, added __GFP_NOWARN */ + qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), + GFP_KERNEL | __GFP_NOWARN); + qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), + GFP_KERNEL | __GFP_NOWARN); + + if (!qp->sq.wrid || !qp->rq.wrid) { + printk(KERN_WARNING "%s:%d: not enough memory\n", + __func__, __LINE__); + err = -ENOMEM; + goto err_wrid; + } + } + + qpn = sqpn; + + err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp, GFP_KERNEL); + if (err) + goto err_qpn; + + if (init_attr->qp_type == IB_QPT_XRC_TGT) + qp->mqp.qpn |= (1 << 23); + + /* + * Hardware wants QPN written in big-endian order (after + * shifting) for send doorbell. Precompute this value to save + * a little bit when posting sends. 
+ */ + qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); + + qp->mqp.event = mlx4_ib_qp_event; + + return 0; + +err_qpn: +err_wrid: + if (pd->uobject) { + } else { + kfree(qp->sq.wrid); + kfree(qp->rq.wrid); + } + +err_mtt: + mlx4_mtt_cleanup(dev->dev, &qp->mtt); + +err_buf: + if (pd->uobject) + ib_umem_release(qp->umem); + else + mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf); + +err_db: + if (!pd->uobject && !init_attr->srq + && init_attr->qp_type != IB_QPT_XRC_TGT) + mlx4_db_free(dev->dev, &qp->db); + + if (qp->max_inline_data) + mlx4_bf_free(dev->dev, &qp->bf); + +err: + return err; +} + +#if 0 +static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, + void *wqe, unsigned *mlx_seg_len) +{ + struct ib_device *ib_dev = &to_mdev(sqp->qp.ibqp.device)->ib_dev; + struct mlx4_wqe_mlx_seg *mlx = wqe; + struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx; + struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah); + u16 pkey; + int send_size; + int header_size; + int spc; + int i; + union ib_gid sgid; + int is_eth; + int is_grh; + int is_vlan = 0; + int err; + u16 vlan; + + send_size = 0; + for (i = 0; i < wr->num_sge; ++i) + send_size += wr->sg_list[i].length; + + is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET; + is_grh = mlx4_ib_ah_grh_present(ah); + err = ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &sgid); + if (err) + return err; + + if (is_eth) { + is_vlan = rdma_get_vlan_id(&sgid) < 0x1000; + vlan = rdma_get_vlan_id(&sgid); + } + + ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header); + if (!is_eth) { + sqp->ud_header.lrh.service_level = + be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28; + sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid; + sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f); + } + + if (is_grh) { + sqp->ud_header.grh.traffic_class = + (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff; + sqp->ud_header.grh.flow_label = + ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff); + sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit; + ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &sqp->ud_header.grh.source_gid); + memcpy(sqp->ud_header.grh.destination_gid.raw, + ah->av.ib.dgid, 16); + } + + mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + + if (!is_eth) { + mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) | + (sqp->ud_header.lrh.destination_lid == + IB_LID_PERMISSIVE ? 
MLX4_WQE_MLX_SLR : 0) | + (sqp->ud_header.lrh.service_level << 8)); + mlx->rlid = sqp->ud_header.lrh.destination_lid; + } + + switch (wr->opcode) { + case IB_WR_SEND: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY; + sqp->ud_header.immediate_present = 0; + break; + case IB_WR_SEND_WITH_IMM: + sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; + sqp->ud_header.immediate_present = 1; + sqp->ud_header.immediate_data = wr->ex.imm_data; + break; + default: + return -EINVAL; + } + + if (is_eth) { + u8 *smac; + + memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6); + smac = to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]->dev_addr; /* fixme: cache this value */ + memcpy(sqp->ud_header.eth.smac_h, smac, 6); + if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) + mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); + if (!is_vlan) + sqp->ud_header.eth.type = cpu_to_be16(MLX4_IBOE_ETHERTYPE); + else { + u16 pcp; + + sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IBOE_ETHERTYPE); + pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 27 & 3) << 13; + sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp); + } + } else { + sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0; + if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE) + sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE; + } + sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED); + if (!sqp->qp.ibqp.qp_num) + ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey); + else + ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey); + sqp->ud_header.bth.pkey = cpu_to_be16(pkey); + sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn); + sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1)); + sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ? + sqp->qkey : wr->wr.ud.remote_qkey); + sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num); + + header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf); + + if (0) { + printk(KERN_ERR "built UD header of size %d:\n", header_size); + for (i = 0; i < header_size / 4; ++i) { + if (i % 8 == 0) + printk(" [%02x] ", i * 4); + printk(" %08x", + be32_to_cpu(((__be32 *) sqp->header_buf)[i])); + if ((i + 1) % 8 == 0) + printk("\n"); + } + printk("\n"); + } + + /* + * Inline data segments may not cross a 64 byte boundary. If + * our UD header is bigger than the space available up to the + * next 64 byte boundary in the WQE, use two inline data + * segments to hold the UD header. + */ + spc = MLX4_INLINE_ALIGN - + ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1)); + if (header_size <= spc) { + inl->byte_count = cpu_to_be32(1 << 31 | header_size); + memcpy(inl + 1, sqp->header_buf, header_size); + i = 1; + } else { + inl->byte_count = cpu_to_be32(1 << 31 | spc); + memcpy(inl + 1, sqp->header_buf, spc); + + inl = (void *) (inl + 1) + spc; + memcpy(inl + 1, sqp->header_buf + spc, header_size - spc); + /* + * Need a barrier here to make sure all the data is + * visible before the byte_count field is set. + * Otherwise the HCA prefetcher could grab the 64-byte + * chunk with this inline segment and get a valid (!= + * 0xffffffff) byte count but stale data, and end up + * generating a packet with bad headers. + * + * The first inline segment's byte_count field doesn't + * need a barrier, because it comes after a + * control/MLX segment and therefore is at an offset + * of 16 mod 64. 
+ */ + wmb(); + inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc)); + i = 2; + } + + *mlx_seg_len = + ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16); + return 0; +} +#endif + +static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq) +{ + unsigned cur; + struct mlx4_ib_cq *cq; + + cur = wq->head - wq->tail; + if (likely(cur + nreq < wq->max_post)) + return 0; + + cq = to_mcq(ib_cq); + spin_lock(&cq->lock); + cur = wq->head - wq->tail; + spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +#if 0 +static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey) +{ + iseg->flags = 0; + iseg->mem_key = cpu_to_be32(rkey); + iseg->guest_id = 0; + iseg->pa = 0; +} +#endif + +static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg, + u64 remote_addr, u32 rkey) +{ + rseg->raddr = cpu_to_be64(remote_addr); + rseg->rkey = cpu_to_be32(rkey); + rseg->reserved = 0; +} + +#if 0 +static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ib_send_wr *wr) +{ + if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap); + aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add); + } else if (wr->opcode == IB_WR_MASKED_ATOMIC_FETCH_AND_ADD) { + aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add); + aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add_mask); + } else { + aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add); + aseg->compare = 0; + } + +} +#endif + +static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, + struct ib_send_wr *wr, __be16 *vlan) +{ + memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av)); + dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn); + dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey); + dseg->vlan = to_mah(wr->wr.ud.ah)->av.eth.vlan; + memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->av.eth.mac, 6); + *vlan = dseg->vlan; +} + +#if 0 +static void set_mlx_icrc_seg(void *dseg) +{ + u32 *t = dseg; + struct mlx4_wqe_inline_seg *iseg = dseg; + + t[1] = 0; + + /* + * Need a barrier here before writing the byte_count field to + * make sure that all the data is visible before the + * byte_count field is set. Otherwise, if the segment begins + * a new cacheline, the HCA prefetcher could grab the 64-byte + * chunk and get a valid (!= * 0xffffffff) byte count but + * stale data, and end up sending the wrong data. + */ + wmb(); + + iseg->byte_count = cpu_to_be32((1 << 31) | 4); +} +#endif + +static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); + + /* + * Need a barrier here before writing the byte_count field to + * make sure that all the data is visible before the + * byte_count field is set. Otherwise, if the segment begins + * a new cacheline, the HCA prefetcher could grab the 64-byte + * chunk and get a valid (!= * 0xffffffff) byte count but + * stale data, and end up sending the wrong data. + */ + wmb(); + + dseg->byte_count = cpu_to_be32(sg->length); +} + +static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->byte_count = cpu_to_be32(sg->length); + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); +} + +static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr, + struct mlx4_ib_qp *qp, unsigned *lso_seg_len, + __be32 *lso_hdr_sz, int *blh) +{ + unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16); + + *blh = unlikely(halign > 64) ? 
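+ /* set when the 16-byte-aligned LSO header does not fit in 64 bytes */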
1 : 0; + + if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) && + wr->num_sge > qp->sq.max_gs - (halign >> 4))) + return -EINVAL; + + memcpy(wqe->header, wr->wr.ud.header, wr->wr.ud.hlen); + + *lso_hdr_sz = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 | + wr->wr.ud.hlen); + *lso_seg_len = halign; + return 0; +} + +static __be32 send_ieth(struct ib_send_wr *wr) +{ + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + case IB_WR_RDMA_WRITE_WITH_IMM: + return wr->ex.imm_data; + + case IB_WR_SEND_WITH_INV: + return cpu_to_be32(wr->ex.invalidate_rkey); + + default: + return 0; + } +} + +static int lay_inline_data(struct mlx4_ib_qp *qp, struct ib_send_wr *wr, + void *wqe, int *sz) +{ + struct mlx4_wqe_inline_seg *seg; + void *addr; + int len, seg_len; + int num_seg; + int off, to_copy; + int i; + int inl = 0; + + seg = wqe; // current segment + wqe += sizeof *seg; // wqe pointer + off = ((unsigned long)wqe) & (unsigned long)(MLX4_INLINE_ALIGN - 1); + num_seg = 0; + seg_len = 0; + + for (i = 0; i < wr->num_sge; ++i) { + addr = (void *) (unsigned long)(wr->sg_list[i].addr); + len = wr->sg_list[i].length; + inl += len; + + if (inl > qp->max_inline_data) { + inl = 0; + return -1; + } + + while (len >= MLX4_INLINE_ALIGN - off) { + to_copy = MLX4_INLINE_ALIGN - off; + memcpy(wqe, addr, to_copy); + len -= to_copy; + wqe += to_copy; + addr += to_copy; + seg_len += to_copy; + wmb(); /* see comment below */ + seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len); + seg_len = 0; + seg = wqe; + wqe += sizeof *seg; + off = sizeof *seg; + ++num_seg; + } + + memcpy(wqe, addr, len); + wqe += len; + seg_len += len; + off += len; + } + + if (seg_len) { + ++num_seg; + /* + * Need a barrier here to make sure + * all the data is visible before the + * byte_count field is set. Otherwise + * the HCA prefetcher could grab the + * 64-byte chunk with this inline + * segment and get a valid (!= + * 0xffffffff) byte count but stale + * data, and end up sending the wrong + * data. + */ + wmb(); + seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len); + } + + *sz = (inl + num_seg * sizeof * seg + 15) / 16; + + return 0; +} + +/* + * Avoid using memcpy() to copy to BlueFlame page, since memcpy() + * implementations may use move-string-buffer assembler instructions, + * which do not guarantee order of copying. 
+ */ +static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt) +{ + __iowrite64_copy(dst, src, bytecnt / 8); +} + +int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct mlx4_ib_qp *qp = to_mqp(ibqp); + struct mlx4_wqe_data_seg *scat; + unsigned long flags; + int err = 0; + int nreq; + int ind; + int i; + + spin_lock_irqsave(&qp->rq.lock, flags); + + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + mlx4_ib_dbg("QP 0x%x: WQE overflow", ibqp->qp_num); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + mlx4_ib_dbg("QP 0x%x: too many sg entries (%d)", + ibqp->qp_num, wr->num_sge); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + scat = get_recv_wqe(qp, ind); + + for (i = 0; i < wr->num_sge; ++i) + __set_data_seg(scat + i, wr->sg_list + i); + + if (i < qp->rq.max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY); + scat[i].addr = 0; + } + + qp->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + qp->rq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff); + } + + spin_unlock_irqrestore(&qp->rq.lock, flags); + + return err; +} + +static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state) +{ + switch (mlx4_state) { + case MLX4_QP_STATE_RST: return IB_QPS_RESET; + case MLX4_QP_STATE_INIT: return IB_QPS_INIT; + case MLX4_QP_STATE_RTR: return IB_QPS_RTR; + case MLX4_QP_STATE_RTS: return IB_QPS_RTS; + case MLX4_QP_STATE_SQ_DRAINING: + case MLX4_QP_STATE_SQD: return IB_QPS_SQD; + case MLX4_QP_STATE_SQER: return IB_QPS_SQE; + case MLX4_QP_STATE_ERR: return IB_QPS_ERR; + default: return -1; + } +} + +static inline enum ib_mig_state to_ib_mig_state(int mlx4_mig_state) +{ + switch (mlx4_mig_state) { + case MLX4_QP_PM_ARMED: return IB_MIG_ARMED; + case MLX4_QP_PM_REARM: return IB_MIG_REARM; + case MLX4_QP_PM_MIGRATED: return IB_MIG_MIGRATED; + default: return -1; + } +} + +static int to_ib_qp_access_flags(int mlx4_flags) +{ + int ib_flags = 0; + + if (mlx4_flags & MLX4_QP_BIT_RRE) + ib_flags |= IB_ACCESS_REMOTE_READ; + if (mlx4_flags & MLX4_QP_BIT_RWE) + ib_flags |= IB_ACCESS_REMOTE_WRITE; + if (mlx4_flags & MLX4_QP_BIT_RAE) + ib_flags |= IB_ACCESS_REMOTE_ATOMIC; + + return ib_flags; +} + +static void to_ib_ah_attr(struct mlx4_ib_dev *ib_dev, struct ib_ah_attr *ib_ah_attr, + struct mlx4_qp_path *path) +{ + struct mlx4_dev *dev = ib_dev->dev; + int is_eth; + + memset(ib_ah_attr, 0, sizeof *ib_ah_attr); + ib_ah_attr->port_num = path->sched_queue & 0x40 ? 2 : 1; + + if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports) + return; + + is_eth = rdma_port_get_link_layer(&ib_dev->ib_dev, ib_ah_attr->port_num) == + IB_LINK_LAYER_ETHERNET; + if (is_eth) + ib_ah_attr->sl = ((path->sched_queue >> 3) & 0x7) | + ((path->sched_queue & 4) << 1); + else + ib_ah_attr->sl = (path->sched_queue >> 2) & 0xf; + + ib_ah_attr->dlid = be16_to_cpu(path->rlid); + + ib_ah_attr->src_path_bits = path->grh_mylmc & 0x7f; + ib_ah_attr->static_rate = path->static_rate ? path->static_rate - 5 : 0; + ib_ah_attr->ah_flags = (path->grh_mylmc & (1 << 7)) ? 
IB_AH_GRH : 0; + if (ib_ah_attr->ah_flags) { + ib_ah_attr->grh.sgid_index = path->mgid_index; + ib_ah_attr->grh.hop_limit = path->hop_limit; + ib_ah_attr->grh.traffic_class = + (be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff; + ib_ah_attr->grh.flow_label = + be32_to_cpu(path->tclass_flowlabel) & 0xfffff; + memcpy(ib_ah_attr->grh.dgid.raw, + path->rgid, sizeof ib_ah_attr->grh.dgid.raw); + } +} + +int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx4_ib_dev *dev = to_mdev(ibqp->device); + struct mlx4_ib_qp *qp = to_mqp(ibqp); + struct mlx4_qp_context context; + int mlx4_state; + int err = 0; + + mutex_lock(&qp->mutex); + + if (qp->state == IB_QPS_RESET) { + qp_attr->qp_state = IB_QPS_RESET; + goto done; + } + + err = mlx4_qp_query(dev->dev, &qp->mqp, &context); + if (err) { + err = -EINVAL; + goto out; + } + + mlx4_state = be32_to_cpu(context.flags) >> 28; + + qp->state = to_ib_qp_state(mlx4_state); + qp_attr->qp_state = qp->state; + qp_attr->path_mtu = context.mtu_msgmax >> 5; + qp_attr->path_mig_state = + to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3); + qp_attr->qkey = be32_to_cpu(context.qkey); + qp_attr->rq_psn = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff; + qp_attr->sq_psn = be32_to_cpu(context.next_send_psn) & 0xffffff; + qp_attr->dest_qp_num = be32_to_cpu(context.remote_qpn) & 0xffffff; + qp_attr->qp_access_flags = + to_ib_qp_access_flags(be32_to_cpu(context.params2)); + + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC || + qp->ibqp.qp_type == IB_QPT_XRC_TGT) { + to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path); + to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path); + qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f; + qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; + } + + qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f; + if (qp_attr->qp_state == IB_QPS_INIT) + qp_attr->port_num = qp->port; + else + qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 2 : 1; + + /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ + qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING; + + qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7); + + qp_attr->max_dest_rd_atomic = + 1 << ((be32_to_cpu(context.params2) >> 21) & 0x7); + qp_attr->min_rnr_timer = + (be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f; + qp_attr->timeout = context.pri_path.ackto >> 3; + qp_attr->retry_cnt = (be32_to_cpu(context.params1) >> 16) & 0x7; + qp_attr->rnr_retry = (be32_to_cpu(context.params1) >> 13) & 0x7; + qp_attr->alt_timeout = context.alt_path.ackto >> 3; + +done: + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt; + qp_attr->cap.max_recv_sge = qp->rq.max_gs; + + if (!ibqp->uobject) { + qp_attr->cap.max_send_wr = qp->sq.wqe_cnt; + qp_attr->cap.max_send_sge = qp->sq.max_gs; + } else { + qp_attr->cap.max_send_wr = 0; + qp_attr->cap.max_send_sge = 0; + } + + /* + * We don't support inline sends for kernel QPs (yet), and we + * don't know what userspace's value should be. 
+ */
+	qp_attr->cap.max_inline_data = 0;
+
+	qp_init_attr->cap = qp_attr->cap;
+
+	qp_init_attr->create_flags = 0;
+	if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)
+		qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
+
+	if (qp->flags & MLX4_IB_QP_LSO)
+		qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
+
+out:
+	mutex_unlock(&qp->mutex);
+	return err;
+}
+
+
+int mlx4_ib_create_xrc_rcv_qp(struct ib_qp_init_attr *init_attr,
+			      u32 *qp_num)
+{
+	return -ENOSYS;
+}
+
+int mlx4_ib_modify_xrc_rcv_qp(struct ib_xrcd *ibxrcd, u32 qp_num,
+			      struct ib_qp_attr *attr, int attr_mask)
+{
+	return -ENOSYS;
+}
+
+int mlx4_ib_query_xrc_rcv_qp(struct ib_xrcd *ibxrcd, u32 qp_num,
+			     struct ib_qp_attr *qp_attr, int qp_attr_mask,
+			     struct ib_qp_init_attr *qp_init_attr)
+{
+	return -ENOSYS;
+}
+
+int mlx4_ib_reg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num)
+{
+	return -ENOSYS;
+}
+
+int mlx4_ib_unreg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num)
+{
+	return -ENOSYS;
+}
+
+/**** VNIC IB VERBS ****/
+int vnic_ib_post_send(struct ib_qp *ibqp,
+		      struct ib_send_wr *wr,
+		      struct ib_send_wr **bad_wr,
+		      u8 ip_off, u8 ip6_off,
+		      u8 tcp_off, u8 udp_off)
+{
+	struct mlx4_ib_qp *qp = to_mqp(ibqp);
+	void *wqe;
+	struct mlx4_wqe_ctrl_seg *ctrl;
+	struct mlx4_wqe_data_seg *dseg;
+	__be32 owner_opcode = 0;
+	int nreq;
+	int err = 0;
+	unsigned ind;
+	int uninitialized_var(stamp);
+	int uninitialized_var(size);
+	unsigned uninitialized_var(seglen);
+	__be32 dummy;
+	__be32 *lso_wqe;
+	__be32 uninitialized_var(lso_hdr_sz);
+	int i;
+	int blh = 0;
+	__be16 vlan = 0;
+	int inl = 0;
+
+	ind = qp->sq_next_wqe;
+
+	nreq = 0;
+	lso_wqe = &dummy;
+
+	if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+		mlx4_ib_dbg("QP 0x%x: WQE overflow", ibqp->qp_num);
+		err = -ENOMEM;
+		*bad_wr = wr;
+		goto out;
+	}
+
+	if (unlikely(wr->num_sge > qp->sq.max_gs)) {
+		mlx4_ib_dbg("QP 0x%x: too many sg entries (%d)",
+			    ibqp->qp_num, wr->num_sge);
+		err = -EINVAL;
+		*bad_wr = wr;
+		goto out;
+	}
+
+	ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+	*((u32 *) (&ctrl->vlan_tag)) = 0;
+	qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+
+	ctrl->srcrb_flags =
+		(wr->send_flags & IB_SEND_SIGNALED ?
+		 cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
+		(wr->send_flags & IB_SEND_SOLICITED ?
+		 cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) |
+		qp->sq_signal_bits;
+
+	ctrl->imm = send_ieth(wr);
+
+	wqe += sizeof *ctrl;
+	size = sizeof *ctrl / 16;
+
+	set_datagram_seg(wqe, wr, &vlan);
+	wqe += sizeof (struct mlx4_wqe_datagram_seg);
+	size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+
+	if (wr->opcode == IB_WR_LSO) {
+		err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz, &blh);
+		if (unlikely(err)) {
+			*bad_wr = wr;
+			goto out;
+		}
+		lso_wqe = (__be32 *) wqe;
+		wqe += seglen;
+		size += seglen / 16;
+	}
+	dseg = wqe;
+	dseg += wr->num_sge - 1;
+
+	if (wr->send_flags & IB_SEND_INLINE && wr->num_sge) {
+		int sz;
+
+		err = lay_inline_data(qp, wr, wqe, &sz);
+		if (!err) {
+			inl = 1;
+			size += sz;
+		}
+	} else {
+		size += wr->num_sge * (sizeof(struct mlx4_wqe_data_seg) / 16);
+		for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
+			set_data_seg(dseg, wr->sg_list + i);
+	}
+
+	wmb();
+	*lso_wqe = lso_hdr_sz;
+
+	ctrl->fence_size = size;
+
+	/* set SWP bits based on ip/ip6/tcp/udp offsets */
+	if (wr->send_flags & IB_SEND_IP_CSUM) {
+		/* SWP bit */
+		owner_opcode |= cpu_to_be32(1 << 24);
+
+		/* IP offset starts from the beginning of IB packet
+		 * (and not ETH packet) in 2 bytes.
+		 * In control segment, we use c & d:
+		 * (a) tcp=0, ip=0 => calc TCP/UDP csum over IPv4
+		 * (b) tcp=0, ip=1 => calc IP csum only over IPv4
+		 * (c) tcp=1, ip=0 => calc TCP/UDP csum over IPv6
+		 * (d) tcp=1, ip=1 => calc TCP/UDP and IP csum over IPv4
+		 */
+		if (ip_off) {
+			ip_off += (IB_LRH_BYTES + IB_BTH_BYTES +
+				   IB_DETH_BYTES) >> 1;
+			ip_off += (to_mah(wr->wr.ud.ah)->av.ib.g_slid
+				   & 0x80) ? (IB_GRH_BYTES >> 1) : 0;
+			owner_opcode |= cpu_to_be32((ip_off) << 8);
+			ctrl->srcrb_flags |=
+				cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM);
+		} else if (ip6_off) {
+			ip6_off += (IB_LRH_BYTES + IB_BTH_BYTES +
+				    IB_DETH_BYTES) >> 1;
+			ip6_off += (to_mah(wr->wr.ud.ah)->av.ib.g_slid
+				    & 0x80) ? (IB_GRH_BYTES >> 1) : 0;
+			owner_opcode |= cpu_to_be32((ip6_off) << 8);
+		}
+
+		if (udp_off) { /* UDP offset and bit */
+			owner_opcode |= cpu_to_be32(udp_off << 16);
+			owner_opcode |= cpu_to_be32(1 << 25);
+			ctrl->srcrb_flags |=
+				cpu_to_be32(MLX4_WQE_CTRL_TCP_UDP_CSUM);
+		} else if (tcp_off) { /* TCP offset */
+			owner_opcode |= cpu_to_be32(tcp_off << 16);
+			ctrl->srcrb_flags |=
+				cpu_to_be32(MLX4_WQE_CTRL_TCP_UDP_CSUM);
+		}
+	}
+
+	/* set opcode, use 0x4e for BIG_LSO */
+	if (!blh)
+		owner_opcode |= mlx4_ib_opcode[wr->opcode];
+	else
+		owner_opcode |= cpu_to_be32(0x4e);
+
+	/* set ownership bit */
+	owner_opcode |= (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
+
+	/* Make sure descriptor is fully written */
+	wmb();
+	ctrl->owner_opcode = owner_opcode;
+
+	stamp = ind + qp->sq_spare_wqes;
+	ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift);
+
+	/* simulate the for loop */
+	nreq++;
+
+out:
+	if (nreq == 1 && inl && size > 1 && size < qp->bf.buf_size / 16) {
+		ctrl->owner_opcode |= htonl((qp->sq_next_wqe & 0xffff) << 8);
+		*(u32 *) (&ctrl->vlan_tag) |= qp->doorbell_qpn;
+		/*
+		 * Make sure that descriptor is written to memory
+		 * before writing to BlueFlame page.
+		 */
+		wmb();
+
+		++qp->sq.head;
+
+		mlx4_bf_copy(qp->bf.reg + qp->bf.offset, (unsigned long *) ctrl,
+			     ALIGN(size * 16, 64));
+		wc_wmb();
+
+		qp->bf.offset ^= qp->bf.buf_size;
+
+	} else if (nreq) {
+		qp->sq.head += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+
+		writel(qp->doorbell_qpn, qp->bf.uar->map + MLX4_SEND_DOORBELL);
+
+		/*
+		 * Make sure doorbells don't leak out of SQ spinlock
+		 * and reach the HCA out of order.
+		 */
+		mmiowb();
+
+	}
+
+	stamp_send_wqe(qp, stamp, size * 16);
+
+	ind = pad_wraparound(qp, ind);
+	qp->sq_next_wqe = ind;
+	return err;
+}
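+
+#if 0
+/*
+ * Illustrative caller, compiled out: the ip/ip6/tcp/udp offsets taken
+ * by vnic_ib_post_send() are in 2-byte words, measured from the start
+ * of the UD payload; the LRH/BTH/DETH (and GRH, when present) words
+ * are added above.  The 4-byte EoIB header and the option-less IPv4
+ * header used here are assumptions for the example only.
+ */
+static int example_post_tcp4_csum(struct ib_qp *ibqp, struct ib_send_wr *wr,
+				  struct ib_send_wr **bad_wr)
+{
+	u8 ip_off  = (4 + ETH_HLEN) / 2;	/* EoIB + Ethernet headers */
+	u8 tcp_off = (4 + ETH_HLEN + 20) / 2;	/* + IPv4 header, no options */
+
+	wr->send_flags |= IB_SEND_IP_CSUM;
+	return vnic_ib_post_send(ibqp, wr, bad_wr, ip_off, 0, tcp_off, 0);
+}
+#endif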
+
+int __vnic_ib_create_qp_range(struct ib_pd *pd, struct ib_qp_init_attr *init_attr,
+			      struct ib_udata *udata, int nqps,
+			      int align, struct ib_qp *list[])
+{
+	struct mlx4_ib_dev *dev = to_mdev(pd->device);
+	struct mlx4_ib_qp *qp;
+	int err;
+	int base_qpn, qpn;
+	int i;
+
+	for (i = 0; i < nqps; ++i) {
+		if (init_attr[i].create_flags & ~(IB_QP_CREATE_IPOIB_UD_LSO |
+		    IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK))
+			return -EINVAL;
+		if (init_attr[i].create_flags & (IB_QP_CREATE_IPOIB_UD_LSO |
+		    IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) &&
+		    (pd->uobject || init_attr[i].qp_type != IB_QPT_UD))
+			return -EINVAL;
+
+		/* Userspace is not allowed to create special QPs: */
+		if (pd->uobject && (init_attr[i].qp_type == IB_QPT_SMI ||
+		    init_attr[i].qp_type == IB_QPT_GSI))
+			return -EINVAL;
+
+		if (nqps > 1 && (init_attr[i].qp_type == IB_QPT_SMI ||
+		    init_attr[i].qp_type == IB_QPT_GSI))
+			return -EINVAL;
+	}
+
+	err = mlx4_qp_reserve_range(dev->dev, nqps, align, &base_qpn, 0);
+	if (err)
+		return err;
+
+	for (i = 0, qpn = base_qpn; i < nqps; ++i, ++qpn) {
+		qp = kzalloc(sizeof *qp, GFP_KERNEL);
+		if (!qp) {
+			err = -ENOMEM;
+			goto exit_fail;
+		}
+
+		err = create_qp_common(dev, pd, init_attr + i, udata, qpn, qp);
+		if (err) {
+			kfree(qp);
+			goto exit_fail;
+		}
+		qp->xrcdn = 0;
+		qp->ibqp.qp_num = qp->mqp.qpn;
+		list[i] = &qp->ibqp;
+	}
+	return 0;
+
+exit_fail:
+	for (--i; i >= 0; --i) {
+		destroy_qp_common(dev, to_mqp(list[i]), init_attr + i);
+		kfree(to_mqp(list[i]));
+	}
+
+	mlx4_qp_release_range(dev->dev, base_qpn, nqps);
+	return err;
+}
+
+/* compare with ib_create_qp() in infiniband/core/verbs.c */
+int vnic_ib_create_qp_range(struct ib_pd *pd, struct ib_qp_init_attr *init_attr,
+			    struct ib_udata *udata, int nqps,
+			    int align, struct ib_qp *list[])
+{
+	struct ib_qp *qp;
+	struct ib_qp_init_attr *qp_init_attr;
+	int rc, i;
+
+	rc = __vnic_ib_create_qp_range(pd, init_attr, udata, nqps, align, list);
+
+	if (rc)
+		return rc;
+
+	for (i = 0; i < nqps; ++i) {
+		qp = list[i];
+		qp_init_attr = &init_attr[i];
+		qp->device = pd->device;
+		qp->real_qp = qp;
+		qp->pd = pd;
+		qp->send_cq = qp_init_attr->send_cq;
+		qp->recv_cq = qp_init_attr->recv_cq;
+		qp->srq = qp_init_attr->srq;
+		qp->uobject = NULL;
+		qp->event_handler = qp_init_attr->event_handler;
+		qp->qp_context = qp_init_attr->qp_context;
+		qp->qp_type = qp_init_attr->qp_type;
+		qp->xrcd = qp->qp_type == IB_QPT_XRC_TGT ?
+			qp_init_attr->xrcd : NULL;
+		atomic_inc(&pd->usecnt);
+		atomic_inc(&qp_init_attr->send_cq->usecnt);
+		atomic_inc(&qp_init_attr->recv_cq->usecnt);
+		if (qp_init_attr->srq)
+			atomic_inc(&qp_init_attr->srq->usecnt);
+		if (qp->qp_type == IB_QPT_XRC_TGT)
+			atomic_inc(&qp->xrcd->usecnt);
+	}
+	return 0;
+}
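+
+#if 0
+/*
+ * Illustrative caller, compiled out: reserve a block of nqps UD QPs
+ * with consecutive QPNs, aligned to nqps.  On any failure
+ * __vnic_ib_create_qp_range() has already destroyed the QPs it
+ * created and released the reserved QPN range, so the caller only
+ * frees its attribute array.  Ring sizes are arbitrary example values.
+ */
+static int example_create_qp_block(struct ib_pd *pd, struct ib_cq *scq,
+				   struct ib_cq *rcq, int nqps,
+				   struct ib_qp *qps[])
+{
+	struct ib_qp_init_attr *attr;
+	int i, rc;
+
+	attr = kcalloc(nqps, sizeof(*attr), GFP_KERNEL);
+	if (!attr)
+		return -ENOMEM;
+
+	for (i = 0; i < nqps; ++i) {
+		attr[i].qp_type		 = IB_QPT_UD;
+		attr[i].send_cq		 = scq;
+		attr[i].recv_cq		 = rcq;
+		attr[i].cap.max_send_wr	 = 1024;
+		attr[i].cap.max_recv_wr	 = 1024;
+		attr[i].cap.max_send_sge = 1;
+		attr[i].cap.max_recv_sge = 1;
+	}
+
+	rc = vnic_ib_create_qp_range(pd, attr, NULL, nqps, nqps, qps);
+	kfree(attr);
+	return rc;
+}
+#endif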
+
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_utils.h b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_utils.h
new file mode 100644
index 0000000000000..56ee8cff18e12
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_utils.h
@@ -0,0 +1,317 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _VNIC_UTILS_H
+#define _VNIC_UTILS_H
+
+/*#define CONFIG_MLX4_VNIC_DEBUG */ /* comment out in RELEASE and PERFORMANCE modes */
+/* #define VNIC_PROFILLNG */ /* comment out in RELEASE and PERFORMANCE modes */
+#define VNIC_EXTRA_STATS /* comment out in PERFORMANCE mode */
+
+enum {
+	VNIC_DEBUG_GENERAL	= 1 << 0,	/* 0x1 */
+	VNIC_DEBUG_MCAST	= 1 << 1,	/* 0x2 */
+	VNIC_DEBUG_MCAST_V	= 1 << 2,	/* 0x4 */
+	VNIC_DEBUG_DATA		= 1 << 3,	/* 0x8 */
+	VNIC_DEBUG_DATA_V	= 1 << 4,	/* 0x10 */
+	VNIC_DEBUG_FIP		= 1 << 5,	/* 0x20 */
+	VNIC_DEBUG_FIP_V	= 1 << 6,	/* 0x40 */
+	VNIC_DEBUG_SKB		= 1 << 7,	/* 0x80 */
+	VNIC_DEBUG_SKB_V	= 1 << 8,	/* 0x100 */
+	VNIC_DEBUG_VHUB		= 1 << 9,	/* 0x200 */
+	VNIC_DEBUG_VHUB_V	= 1 << 10,	/* 0x400 */
+	VNIC_DEBUG_ETHTOOL	= 1 << 11,	/* 0x800 */
+	VNIC_DEBUG_ETHTOOL_V	= 1 << 12,	/* 0x1000 */
+	VNIC_DEBUG_FUNC		= 1 << 13,	/* 0x2000 */
+	VNIC_DEBUG_MARK		= 1 << 14,	/* 0x4000 */
+	VNIC_DEBUG_MODER	= 1 << 15,	/* 0x8000 */
+	VNIC_DEBUG_MODER_V	= 1 << 16,	/* 0x10000 */
+	VNIC_DEBUG_PKT_DUMP	= 1 << 17,	/* 0x20000 */
+	VNIC_DEBUG_FIP_P0	= 1 << 18,	/* 0x40000 */
+	VNIC_DEBUG_SYSFS	= 1 << 19,	/* 0x80000 */
+	VNIC_DEBUG_MAC		= 1 << 20,	/* 0x100000 */
+	VNIC_DEBUG_TSTAMP	= 1 << 21,	/* 0x200000 */
+	VNIC_DEBUG_PARSER	= 1 << 22,	/* 0x400000 */
+	VNIC_DEBUG_LAG		= 1 << 23,	/* 0x800000 */
+	VNIC_DEBUG_LAG_V	= 1 << 24,	/* 0x1000000 */
+	VNIC_DEBUG_MCAST_VV	= 1 << 25,	/* 0x2000000 */
+	VNIC_DEBUG_DEBUG	= 1 << 31,	/* 0x80000000 */
+};
+
+/* always defined */
+#define vnic_printk(level, prefix, format, arg...) \
+	do { printk(level "T%.4ld [%s] %s:%s:%d: " format, \
+		    jiffies * 1000 / HZ, \
+		    DRV_NAME, prefix ? prefix : "", __func__, __LINE__, \
+		    ## arg); \
+} while(0)
+
+#define vnic_info(format, arg...) \
+do { printk(KERN_INFO "[%s] " format, DRV_NAME, ## arg); } \
+while (0)
+
+#define vnic_warn(prefix, format, arg...) \
+do { vnic_printk(KERN_WARNING, prefix, format, ## arg); } \
+while (0)
+
+#define vnic_err(prefix, format, arg...) \
+do { vnic_printk(KERN_ERR, prefix, format, ## arg); } \
+while (0)
+
+#define _sprintf(p, buf, format, arg...) \
+	(PAGE_SIZE - (int)(p - buf)) <= 0 ?
0 : \ + scnprintf(p, PAGE_SIZE - (int)(p - buf), format, ## arg) + +/* debug functions */ +#ifndef CONFIG_MLX4_VNIC_DEBUG +#define ASSERT(x) do { (void)(x); } while (0) +#define vnic_dbg_mark(void) do { } while (0) +#define vnic_dbg_func(prefix) do { } while (0) +#define vnic_dbg(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_mcast(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_mcast_v(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_mcast_vv(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_debug(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_ethtool(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_ethtool_v(prefix, format, arg...) \ + do { (void)(prefix); } while (0) +#define vnic_dbg_data(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_data_v(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_fip(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_parse(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_lag(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_lag_v(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_fip_p0(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_sysfs(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_mac(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_fip_v(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_vhub(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_vhub_v(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_moder(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_dbg_moder_v(prefix, format, arg...) do { (void)(prefix); } while (0) +#define vnic_printk_skb(prefix, skb, o1, o2) do { (void)(prefix); } while (0) +#define vnic_dbg_skb(prefix, skb, o1, o2) do { (void)(prefix); } while (0) +#else +#define ASSERT(x) \ +do { if (x) break; \ + printk(KERN_EMERG "### ASSERTION FAILED %s: %s: %d: %s\n", \ + __FILE__, __func__, __LINE__, #x); dump_stack(); BUG(); \ +} while (0) + +#define vnic_dbg(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_GENERAL)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_mcast(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_MCAST)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_mcast_v(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_MCAST_V)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_mcast_vv(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_MCAST_VV)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_debug(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_DEBUG)) break; \ + vnic_printk(KERN_WARNING, prefix, format, ## arg); \ +} while (0) + + +#define vnic_dbg_data(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_DATA)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_data_v(prefix, format, arg...) 
\ +do { if (!(vnic_msglvl & VNIC_DEBUG_DATA_V)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_fip_p0(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_FIP_P0)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_sysfs(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_SYSFS)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_mac(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_MAC)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_fip(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_FIP)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_parse(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_PARSER)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_lag(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_LAG)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_lag_v(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_LAG_V)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_fip_v(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_FIP_V)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_vhub(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_VHUB)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_vhub_v(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_VHUB_V)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_moder(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_MODER)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_moder_v(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_MODER_V)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_ethtool(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_ETHTOOL)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_ethtool_v(prefix, format, arg...) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_ETHTOOL_V)) break; \ + vnic_printk(KERN_DEBUG, prefix, format, ## arg); \ +} while (0) + +#define vnic_dbg_mark(void) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_MARK)) break; \ + vnic_printk(KERN_DEBUG, NULL, "###\n"); \ +} while (0) + +#define vnic_dbg_func(prefix) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_FUNC)) break; \ + vnic_printk(KERN_DEBUG, prefix, "function called\n"); \ +} while (0) + +#define ethp2str(p, str) \ +do { \ + switch (ntohs(p)) { \ + case ETH_P_RARP: sprintf(str, "%s", "ETH_P_RARP"); break; \ + case ETH_P_ARP: sprintf(str, "%s", "ETH_P_ARP"); break; \ + case ETH_P_IP: sprintf(str, "%s", "ETH_P_IP"); break; \ + case ETH_P_IPV6: sprintf(str, "%s", "ETH_P_IPV6"); break; \ + case ETH_P_8021Q:sprintf(str, "%s", "ETH_P_8021Q");break; \ + default: sprintf(str, "0x%x", p); break; \ + } \ +} while (0) + +#define skb_printk(prefix, format, arg...) 
\ + printk(KERN_DEBUG "[%s] " format, prefix, ## arg) + +#define vnic_dbg_skb(_prefix, skb, eoib_off, eth_off) \ +do { if (!(vnic_msglvl & VNIC_DEBUG_SKB)) break; \ + vnic_printk_skb(_prefix, skb, eoib_off, eth_off); \ +} while (0) + +#define VNIC_SYSLOG_LLEN 64 +#define vnic_printk_skb(_prefix, skb, eoib_off, eth_off) \ +do { \ + char pr[VNIC_SYSLOG_LLEN]; \ + char h_proto_str[VNIC_SYSLOG_LLEN]; \ + struct eoibhdr *eoib_hdr = (struct eoibhdr *) \ + (skb->data + eoib_off); \ + struct ethhdr *ethh = (struct ethhdr *) \ + (skb->data + eth_off); \ + struct net_device *dev = skb->dev; \ + ASSERT(dev); \ + snprintf(pr, VNIC_SYSLOG_LLEN, "%s:skb-%s", dev->name, _prefix);\ + skb_printk(pr, "\n"); \ + skb_printk(pr, "--- skb dump ---\n"); \ + skb_printk(pr, "len : %d\n", skb->len); \ + skb_printk(pr, "data_len : %d\n", skb->data_len); \ + skb_printk(pr, "frags : %d\n", \ + skb_shinfo(skb)->nr_frags); \ + skb_printk(pr, "gso : %d\n", skb_is_gso(skb)); \ + skb_printk(pr, "head_len : %d\n", (int)skb_headlen(skb)); \ + skb_printk(pr, "data : %p\n", skb->data); \ + skb_printk(pr, "head : %p\n", skb->head); \ + skb_printk(pr, "tail : %lu\n", \ + (unsigned long)(skb->tail)); \ + skb_printk(pr, "end : %lu\n", \ + (unsigned long)(skb->end)); \ + skb_printk(pr, "eoib_off : %lu\n", eoib_off); \ + skb_printk(pr, "eth_off : %lu\n", eth_off); \ + if (eth_off < 0 || !skb_headlen(skb)) \ + break; \ + ethp2str(ethh->h_proto, h_proto_str); \ + skb_printk(pr, "eth_proto : %s\n", h_proto_str); \ + skb_printk(pr, "eth_dest : "MAC_6_PRINT_FMT"\n", \ + MAC_6_PRINT_ARG(ethh->h_dest)); \ + skb_printk(pr, "eth_source : "MAC_6_PRINT_FMT"\n", \ + MAC_6_PRINT_ARG(ethh->h_source)); \ + if (eoib_off < 0) \ + break; \ + skb_printk(pr, "eoib_seg_id : 0x%04x\n", eoib_hdr->seg_id); \ + skb_printk(pr, "eoib_seg_off : 0x%02x\n", eoib_hdr->seg_off); \ + skb_printk(pr, "eoib_ip_chk : 0x%02x\n", \ + VNIC_EOIB_HDR_GET_IP_CHK(eoib_hdr)); \ + skb_printk(pr, "eoib_tcp_chk : 0x%02x\n", \ + VNIC_EOIB_HDR_GET_TCP_UDP_CHK(eoib_hdr)); \ + skb_printk(pr, "eoib_ver : 0x%02x\n", \ + VNIC_EOIB_HDR_GET_VER(eoib_hdr)); \ + skb_printk(pr, "eoib_sig : 0x%02x\n", \ + VNIC_EOIB_HDR_GET_SIG(eoib_hdr)); \ +} while (0) + +#endif /* CONFIG_MLX4_VNIC_DEBUG */ +#endif /* _VNIC_UTILS_H */
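For reference, the 64-byte inline rule that build_mlx_header() and lay_inline_data() enforce in vnic_qp.c can be modelled in isolation. The sketch below is a user-space approximation, not driver code: it reproduces only the segmentation arithmetic (a 4-byte byte_count header per inline segment, data chopped at every MLX4_INLINE_ALIGN boundary, segment starts assumed 16-byte aligned as they are after the control and datagram segments) and counts how many segments a payload needs.

#include <assert.h>
#include <stdio.h>

#define MLX4_INLINE_ALIGN 64
#define SEG_HDR 4	/* 4-byte byte_count header per inline segment */

/* Number of inline segments needed for len payload bytes when the
 * first segment header lands at WQE offset off. */
static int inline_segs(int off, int len)
{
	int nseg = 0;

	assert(off % 16 == 0);	/* segments follow 16-byte aligned segments */
	while (len > 0) {
		int room = MLX4_INLINE_ALIGN - (off % MLX4_INLINE_ALIGN);
		int chunk = room - SEG_HDR;	/* data fitting before the boundary */

		if (chunk > len)
			chunk = len;
		len -= chunk;
		off += SEG_HDR + chunk;	/* next header here or at the boundary */
		++nseg;
	}
	return nseg;
}

int main(void)
{
	printf("%d\n", inline_segs(48, 10));	/* 1: fits below the boundary */
	printf("%d\n", inline_segs(48, 100));	/* 3: chopped at each 64-byte line */
	return 0;
}

This also shows why the UD header path in build_mlx_header() needs at most two segments: the inline segment sits at offset 16 mod 64, so for headers of UD size the remainder after the first chop always fits before the next boundary.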
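mlx4_wq_overflow() in vnic_qp.c depends on wq->head and wq->tail being free-running unsigned counters: head - tail wraps around correctly and always yields the number of posted-but-uncompleted WQEs. A minimal user-space model of that check:

#include <assert.h>

struct wq { unsigned head, tail, max_post; };

static int wq_overflow(const struct wq *wq, int nreq)
{
	unsigned cur = wq->head - wq->tail;	/* wraps to the in-flight count */

	return cur + nreq >= wq->max_post;
}

int main(void)
{
	/* counters about to wrap: 14 WQEs in flight */
	struct wq wq = { .head = 0xfffffffeu, .tail = 0xfffffff0u, .max_post = 32 };

	assert(!wq_overflow(&wq, 8));	/* 14 + 8  < 32 */
	assert(wq_overflow(&wq, 20));	/* 14 + 20 >= 32 */
	return 0;
}

The driver's version additionally re-reads the counters under the CQ lock before reporting overflow, because a completion may have advanced tail between the lock-free check and the lock.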
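The vnic_dbg_*() macros in vnic_utils.h all follow one pattern: each subsystem owns a bit in the runtime mask vnic_msglvl (a second bit where there is a verbose variant), and each message compiles down to a single test-and-branch when debugging is built in. A self-contained user-space model of the same pattern; the names below are illustrative, not the driver's:

#include <stdio.h>

enum {
	DBG_FIP   = 1 << 5,	/* mirrors VNIC_DEBUG_FIP */
	DBG_FIP_V = 1 << 6,	/* mirrors VNIC_DEBUG_FIP_V */
};

static unsigned msglvl = DBG_FIP;	/* e.g. set from a module parameter */

#define dbg_fip(fmt, ...) \
	do { if (msglvl & DBG_FIP) printf("[fip] " fmt, ##__VA_ARGS__); } while (0)
#define dbg_fip_v(fmt, ...) \
	do { if (msglvl & DBG_FIP_V) printf("[fip_v] " fmt, ##__VA_ARGS__); } while (0)

int main(void)
{
	dbg_fip("gateway discovered, vlan %d\n", 7);	/* printed */
	dbg_fip_v("raw descriptor dump\n");		/* suppressed */
	return 0;
}

Distinct bit positions per category are essential here: if two categories share a shift value, enabling one silently enables the other, which is why every VNIC_DEBUG_* flag must match the hex value noted beside it.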