#define LPFC_HB_MBOX_INTERVAL   5      /* Heart beat interval in seconds. */
 #define LPFC_HB_MBOX_TIMEOUT    30     /* Heart beat timeout  in seconds. */
 
-#define LPFC_LOOK_AHEAD_OFF    0       /* Look ahead logic is turned off */
-
 /* Error Attention event polling interval */
 #define LPFC_ERATT_POLL_INTERVAL       5 /* EATT poll interval in seconds */
 
        uint32_t cfg_fcp_imax;
        uint32_t cfg_fcp_cpu_map;
        uint32_t cfg_hdw_queue;
+       uint32_t cfg_irq_chann;
        uint32_t cfg_suppress_rsp;
        uint32_t cfg_nvme_oas;
        uint32_t cfg_nvme_embed_cmd;
        struct dentry *debug_nvmeio_trc;
        struct lpfc_debugfs_nvmeio_trc *nvmeio_trc;
        struct dentry *debug_hdwqinfo;
+#ifdef LPFC_HDWQ_LOCK_STAT
+       struct dentry *debug_lockstat;
+#endif
        atomic_t nvmeio_trc_cnt;
        uint32_t nvmeio_trc_size;
        uint32_t nvmeio_trc_output_idx;
 #define LPFC_CHECK_NVME_IO     1
 #define LPFC_CHECK_NVMET_RCV   2
 #define LPFC_CHECK_NVMET_IO    4
+#define LPFC_CHECK_SCSI_IO     8
        uint16_t ktime_on;
        uint64_t ktime_data_samples;
        uint64_t ktime_status_samples;
 
        phba->cfg_fcp_imax = (uint32_t)val;
        phba->initial_imax = phba->cfg_fcp_imax;
 
-       for (i = 0; i < phba->cfg_hdw_queue; i += LPFC_MAX_EQ_DELAY_EQID_CNT)
+       for (i = 0; i < phba->cfg_irq_chann; i += LPFC_MAX_EQ_DELAY_EQID_CNT)
                lpfc_modify_hba_eq_delay(phba, i, LPFC_MAX_EQ_DELAY_EQID_CNT,
                                         val);
 
                                phba->cfg_fcp_cpu_map,
                                phba->sli4_hba.num_online_cpu);
                break;
-       case 2:
-               len += snprintf(buf + len, PAGE_SIZE-len,
-                               "fcp_cpu_map: Driver centric mapping (%d): "
-                               "%d online CPUs\n",
-                               phba->cfg_fcp_cpu_map,
-                               phba->sli4_hba.num_online_cpu);
-               break;
        }
 
        while (phba->sli4_hba.curr_disp_cpu < phba->sli4_hba.num_present_cpu) {
                                len += snprintf(
                                        buf + len, PAGE_SIZE - len,
                                        "CPU %02d hdwq None "
-                                       "physid %d coreid %d\n",
+                                       "physid %d coreid %d ht %d\n",
                                        phba->sli4_hba.curr_disp_cpu,
                                        cpup->phys_id,
-                                       cpup->core_id);
+                                       cpup->core_id, cpup->hyper);
                        else
                                len += snprintf(
                                        buf + len, PAGE_SIZE - len,
-                                       "CPU %02d hdwq %04d "
-                                       "physid %d coreid %d\n",
+                                       "CPU %02d EQ %04d hdwq %04d "
+                                       "physid %d coreid %d ht %d\n",
                                        phba->sli4_hba.curr_disp_cpu,
-                                       cpup->hdwq, cpup->phys_id,
-                                       cpup->core_id);
+                                       cpup->eq, cpup->hdwq, cpup->phys_id,
+                                       cpup->core_id, cpup->hyper);
                } else {
                        if (cpup->hdwq == LPFC_VECTOR_MAP_EMPTY)
                                len += snprintf(
                                        buf + len, PAGE_SIZE - len,
                                        "CPU %02d hdwq None "
-                                       "physid %d coreid %d IRQ %d\n",
+                                       "physid %d coreid %d ht %d IRQ %d\n",
                                        phba->sli4_hba.curr_disp_cpu,
                                        cpup->phys_id,
-                                       cpup->core_id, cpup->irq);
+                                       cpup->core_id, cpup->hyper, cpup->irq);
                        else
                                len += snprintf(
                                        buf + len, PAGE_SIZE - len,
-                                       "CPU %02d hdwq %04d "
-                                       "physid %d coreid %d IRQ %d\n",
+                                       "CPU %02d EQ %04d hdwq %04d "
+                                       "physid %d coreid %d ht %d IRQ %d\n",
                                        phba->sli4_hba.curr_disp_cpu,
-                                       cpup->hdwq, cpup->phys_id,
-                                       cpup->core_id, cpup->irq);
+                                       cpup->eq, cpup->hdwq, cpup->phys_id,
+                                       cpup->core_id, cpup->hyper, cpup->irq);
                }
 
                phba->sli4_hba.curr_disp_cpu++;
 # lpfc_fcp_cpu_map: Defines how to map CPUs to IRQ vectors
 # for the HBA.
 #
-# Value range is [0 to 2]. Default value is LPFC_DRIVER_CPU_MAP (2).
+# Value range is [0 to 1]. Default value is LPFC_HBA_CPU_MAP (1).
 #      0 - Do not affinitize IRQ vectors
 #      1 - Affinitize HBA vectors with respect to each HBA
 #          (start with CPU0 for each HBA)
-#      2 - Affintize HBA vectors with respect to the entire driver
-#          (round robin thru all CPUs across all HBAs)
+# This also defines how Hardware Queues are mapped to specific CPUs.
 */
-static int lpfc_fcp_cpu_map = LPFC_DRIVER_CPU_MAP;
+static int lpfc_fcp_cpu_map = LPFC_HBA_CPU_MAP;
 module_param(lpfc_fcp_cpu_map, int, S_IRUGO|S_IWUSR);
 MODULE_PARM_DESC(lpfc_fcp_cpu_map,
                 "Defines how to map CPUs to IRQ vectors per HBA");
        lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
                        "3326 lpfc_fcp_cpu_map: %d out of range, using "
                        "default\n", val);
-       phba->cfg_fcp_cpu_map = LPFC_DRIVER_CPU_MAP;
+       phba->cfg_fcp_cpu_map = LPFC_HBA_CPU_MAP;
 
        return 0;
 }
  * CPU. Otherwise, the default 0 (Round Robin) scheduling of FCP/NVME I/Os
  * through WQs will be used.
  */
-LPFC_ATTR_RW(fcp_io_sched, LPFC_FCP_SCHED_BY_HDWQ,
+LPFC_ATTR_RW(fcp_io_sched, LPFC_FCP_SCHED_BY_CPU,
             LPFC_FCP_SCHED_BY_HDWQ,
             LPFC_FCP_SCHED_BY_CPU,
             "Determine scheduling algorithm for "
             "Embed NVME Command in WQE");
 
 /*
- * lpfc_hdw_queue: Set the number of IO channels the driver
+ * lpfc_hdw_queue: Set the number of Hardware Queues the driver
 * will advertise it supports to the NVME and SCSI layers. This also
- * will map to the number of EQ/CQ/WQs the driver will create.
+ * will map to the number of CQ/WQ pairs the driver will create.
  *
  * The NVME Layer will try to create this many, plus 1 administrative
 * hardware queue. The administrative queue will always map to WQ 0.
- * A hardware IO queue maps (qidx) to a specific driver WQ.
+ * A hardware IO queue maps (qidx) to a specific driver CQ/WQ.
  *
  *      0    = Configure the number of hdw queues to the number of active CPUs.
- *      1,64 = Manually specify how many hdw queues to use.
+ *      1,128 = Manually specify how many hdw queues to use.
  *
- * Value range is [0,64]. Default value is 0.
+ * Value range is [0,128]. Default value is 0.
  */
 LPFC_ATTR_R(hdw_queue,
            LPFC_HBA_HDWQ_DEF,
            LPFC_HBA_HDWQ_MIN, LPFC_HBA_HDWQ_MAX,
            "Set the number of I/O Hardware Queues");
 
+/*
+ * lpfc_irq_chann: Set the number of IRQ vectors that are available
+ * for Hardware Queues to utilize.  This also will map to the number
+ * of EQ / MSI-X vectors the driver will create. This should never be
+ * more than the number of Hardware Queues.
+ *
+ *      0     = Configure number of IRQ Channels to the number of active CPUs.
+ *      1,128 = Manually specify how many IRQ Channels to use.
+ *
+ * Value range is [0,128]. Default value is 0.
+ */
+LPFC_ATTR_R(irq_chann,
+           LPFC_HBA_HDWQ_DEF,
+           LPFC_HBA_HDWQ_MIN, LPFC_HBA_HDWQ_MAX,
+           "Set the number of I/O IRQ Channels");
+
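For illustration only (parameter names follow the LPFC_ATTR_R definitions above; the 32-CPU host is an assumption): loading with

    modprobe lpfc lpfc_hdw_queue=16 lpfc_irq_chann=0

creates 16 Hardware Queues, and cfg_irq_chann, defaulting to the active CPU count (32), is capped at cfg_hdw_queue later in this patch, leaving 16 EQ/MSI-X vectors.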
 /*
 # lpfc_enable_hba_reset: Allow or prevent HBA resets to the hardware.
 #       0  = HBA resets disabled
 */
 LPFC_ATTR_R(enable_bg, 0, 0, 1, "Enable BlockGuard Support");
 
-/*
-# lpfc_fcp_look_ahead: Look ahead for completions in FCP start routine
-#       0  = disabled (default)
-#       1  = enabled
-# Value range is [0,1]. Default value is 0.
-#
-# This feature in under investigation and may be supported in the future.
-*/
-unsigned int lpfc_fcp_look_ahead = LPFC_LOOK_AHEAD_OFF;
-
 /*
 # lpfc_prot_mask: i
 #      - Bit mask of host protection capabilities used to register with the
        &dev_attr_lpfc_fcp_imax,
        &dev_attr_lpfc_fcp_cpu_map,
        &dev_attr_lpfc_hdw_queue,
+       &dev_attr_lpfc_irq_chann,
        &dev_attr_lpfc_suppress_rsp,
        &dev_attr_lpfc_nvmet_mrq,
        &dev_attr_lpfc_nvmet_mrq_post,
        lpfc_nvme_enable_fb_init(phba, lpfc_nvme_enable_fb);
        lpfc_nvmet_fb_size_init(phba, lpfc_nvmet_fb_size);
        lpfc_hdw_queue_init(phba, lpfc_hdw_queue);
+       lpfc_irq_chann_init(phba, lpfc_irq_chann);
        lpfc_enable_bbcr_init(phba, lpfc_enable_bbcr);
        lpfc_enable_dpp_init(phba, lpfc_enable_dpp);
 
        /* A value of 0 means use the number of CPUs found in the system */
        if (phba->cfg_hdw_queue == 0)
                phba->cfg_hdw_queue = phba->sli4_hba.num_present_cpu;
+       if (phba->cfg_irq_chann == 0)
+               phba->cfg_irq_chann = phba->sli4_hba.num_present_cpu;
+       if (phba->cfg_irq_chann > phba->cfg_hdw_queue)
+               phba->cfg_irq_chann = phba->cfg_hdw_queue;
 
        phba->cfg_soft_wwnn = 0L;
        phba->cfg_soft_wwpn = 0L;
 {
        if (phba->cfg_hdw_queue > phba->sli4_hba.num_present_cpu)
                phba->cfg_hdw_queue = phba->sli4_hba.num_present_cpu;
+       if (phba->cfg_irq_chann > phba->sli4_hba.num_present_cpu)
+               phba->cfg_irq_chann = phba->sli4_hba.num_present_cpu;
+       if (phba->cfg_irq_chann > phba->cfg_hdw_queue)
+               phba->cfg_irq_chann = phba->cfg_hdw_queue;
 
        if (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME &&
            phba->nvmet_support) {
                }
 
                if (!phba->cfg_nvmet_mrq)
-                       phba->cfg_nvmet_mrq = phba->cfg_hdw_queue;
+                       phba->cfg_nvmet_mrq = phba->cfg_irq_chann;
 
                /* Adjust lpfc_nvmet_mrq to avoid running out of WQE slots */
-               if (phba->cfg_nvmet_mrq > phba->cfg_hdw_queue) {
-                       phba->cfg_nvmet_mrq = phba->cfg_hdw_queue;
+               if (phba->cfg_nvmet_mrq > phba->cfg_irq_chann) {
+                       phba->cfg_nvmet_mrq = phba->cfg_irq_chann;
                        lpfc_printf_log(phba, KERN_ERR, LOG_NVME_DISC,
                                        "6018 Adjust lpfc_nvmet_mrq to %d\n",
                                        phba->cfg_nvmet_mrq);
 
 extern int _dump_buf_done;
 extern spinlock_t pgcnt_lock;
 extern unsigned int pgcnt;
-extern unsigned int lpfc_fcp_look_ahead;
 
 /* Interface exported by fabric iocb scheduler */
 void lpfc_fabric_abort_nport(struct lpfc_nodelist *);
 
        return len;
 }
 
+static int lpfc_debugfs_last_xripool;
+
+/**
+ * lpfc_debugfs_commonxripools_data - Dump XRI buffer pool info to a buffer
+ * @phba: The HBA to gather host buffer info from.
+ * @buf: The buffer to dump log into.
+ * @size: The maximum amount of data to process.
+ *
+ * Description:
+ * This routine dumps the common XRI buffer pool info for each Hardware Queue
+ * of the @phba to @buf, up to @size bytes. One line of pool counters per hdwq
+ * entry is dumped to @buf until @size bytes have been dumped or all the hdwq
+ * info has been dumped.
+ *
+ * Notes:
+ * This routine will rotate through each configured Hardware Queue each
+ * time called.
+ *
+ * Return Value:
+ * This routine returns the amount of bytes that were dumped into @buf and will
+ * not exceed @size.
+ **/
+static int
+lpfc_debugfs_commonxripools_data(struct lpfc_hba *phba, char *buf, int size)
+{
+       struct lpfc_sli4_hdw_queue *qp;
+       int len = 0;
+       int i, out;
+       unsigned long iflag;
+
+       for (i = 0; i < phba->cfg_hdw_queue; i++) {
+               if (len > (LPFC_DUMP_MULTIXRIPOOL_SIZE - 80))
+                       break;
+               qp = &phba->sli4_hba.hdwq[lpfc_debugfs_last_xripool];
+
+               len +=  snprintf(buf + len, size - len, "HdwQ %d Info ", i);
+               spin_lock_irqsave(&qp->abts_scsi_buf_list_lock, iflag);
+               spin_lock(&qp->abts_nvme_buf_list_lock);
+               spin_lock(&qp->io_buf_list_get_lock);
+               spin_lock(&qp->io_buf_list_put_lock);
+               out = qp->total_io_bufs - (qp->get_io_bufs + qp->put_io_bufs +
+                       qp->abts_scsi_io_bufs + qp->abts_nvme_io_bufs);
+               len +=  snprintf(buf + len, size - len,
+                                "tot:%d get:%d put:%d mt:%d "
+                                "ABTS scsi:%d nvme:%d Out:%d\n",
+                       qp->total_io_bufs, qp->get_io_bufs, qp->put_io_bufs,
+                       qp->empty_io_bufs, qp->abts_scsi_io_bufs,
+                       qp->abts_nvme_io_bufs, out);
+               spin_unlock(&qp->io_buf_list_put_lock);
+               spin_unlock(&qp->io_buf_list_get_lock);
+               spin_unlock(&qp->abts_nvme_buf_list_lock);
+               spin_unlock_irqrestore(&qp->abts_scsi_buf_list_lock, iflag);
+
+               lpfc_debugfs_last_xripool++;
+               if (lpfc_debugfs_last_xripool >= phba->cfg_hdw_queue)
+                       lpfc_debugfs_last_xripool = 0;
+       }
+
+       return len;
+}
+
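Reading one output line with invented numbers: "HdwQ 3 Info tot:512 get:200 put:250 mt:0 ABTS scsi:2 nvme:0 Out:60" means 512 total IO buffers for that Hardware Queue, 200 on the get list, 250 on the put list, 2 aborted SCSI buffers pending, and Out = 512 - (200 + 250 + 2 + 0) = 60 buffers currently held by outstanding I/O, matching the "out" computation above.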
 /**
  * lpfc_debugfs_multixripools_data - Display multi-XRI pools information
  * @phba: The HBA to gather host buffer info from.
        u32 txcmplq_cnt;
        char tmp[LPFC_DEBUG_OUT_LINE_SZ] = {0};
 
+       if (phba->sli_rev != LPFC_SLI_REV4)
+               return 0;
+
+       if (!phba->sli4_hba.hdwq)
+               return 0;
+
+       if (!phba->cfg_xri_rebalancing) {
+               i = lpfc_debugfs_commonxripools_data(phba, buf, size);
+               return i;
+       }
+
        /*
         * Pbl: Current number of free XRIs in public pool
         * Pvt: Current number of free XRIs in private pool
        return strnlen(buf, size);
 }
 
-static int lpfc_debugfs_last_hdwq;
+
+#ifdef LPFC_HDWQ_LOCK_STAT
+static int lpfc_debugfs_last_lock;
 
 /**
- * lpfc_debugfs_hdwqinfo_data - Dump Hardware Queue info to a buffer
+ * lpfc_debugfs_lockstat_data - Dump Hardware Queue lock statistics to a buffer
  * @phba: The HBA to gather host buffer info from.
  * @buf: The buffer to dump log into.
  * @size: The maximum amount of data to process.
  * not exceed @size.
  **/
 static int
-lpfc_debugfs_hdwqinfo_data(struct lpfc_hba *phba, char *buf, int size)
+lpfc_debugfs_lockstat_data(struct lpfc_hba *phba, char *buf, int size)
 {
        struct lpfc_sli4_hdw_queue *qp;
        int len = 0;
-       int i, out;
-       unsigned long iflag;
+       int i;
 
        if (phba->sli_rev != LPFC_SLI_REV4)
                return 0;
                return 0;
 
        for (i = 0; i < phba->cfg_hdw_queue; i++) {
-               if (len > (LPFC_HDWQINFO_SIZE - 80))
+               if (len > (LPFC_HDWQINFO_SIZE - 100))
                        break;
-               qp = &phba->sli4_hba.hdwq[lpfc_debugfs_last_hdwq];
+               qp = &phba->sli4_hba.hdwq[lpfc_debugfs_last_lock];
 
-               len +=  snprintf(buf + len, size - len, "HdwQ %d Info ", i);
-               spin_lock_irqsave(&qp->abts_scsi_buf_list_lock, iflag);
-               spin_lock(&qp->abts_nvme_buf_list_lock);
-               spin_lock(&qp->io_buf_list_get_lock);
-               spin_lock(&qp->io_buf_list_put_lock);
-               out = qp->total_io_bufs - (qp->get_io_bufs + qp->put_io_bufs +
-                       qp->abts_scsi_io_bufs + qp->abts_nvme_io_bufs);
-               len +=  snprintf(buf + len, size - len,
-                                "tot:%d get:%d put:%d mt:%d "
-                                "ABTS scsi:%d nvme:%d Out:%d\n",
-                       qp->total_io_bufs, qp->get_io_bufs, qp->put_io_bufs,
-                       qp->empty_io_bufs, qp->abts_scsi_io_bufs,
-                       qp->abts_nvme_io_bufs, out);
-               spin_unlock(&qp->io_buf_list_put_lock);
-               spin_unlock(&qp->io_buf_list_get_lock);
-               spin_unlock(&qp->abts_nvme_buf_list_lock);
-               spin_unlock_irqrestore(&qp->abts_scsi_buf_list_lock, iflag);
+               len +=  snprintf(buf + len, size - len, "HdwQ %03d Lock ", i);
+               if (phba->cfg_xri_rebalancing) {
+                       len +=  snprintf(buf + len, size - len,
+                                        "get_pvt:%d mv_pvt:%d "
+                                        "mv2pub:%d mv2pvt:%d "
+                                        "put_pvt:%d put_pub:%d wq:%d\n",
+                                        qp->lock_conflict.alloc_pvt_pool,
+                                        qp->lock_conflict.mv_from_pvt_pool,
+                                        qp->lock_conflict.mv_to_pub_pool,
+                                        qp->lock_conflict.mv_to_pvt_pool,
+                                        qp->lock_conflict.free_pvt_pool,
+                                        qp->lock_conflict.free_pub_pool,
+                                        qp->lock_conflict.wq_access);
+               } else {
+                       len +=  snprintf(buf + len, size - len,
+                                        "get:%d put:%d free:%d wq:%d\n",
+                                        qp->lock_conflict.alloc_xri_get,
+                                        qp->lock_conflict.alloc_xri_put,
+                                        qp->lock_conflict.free_xri,
+                                        qp->lock_conflict.wq_access);
+               }
 
-               lpfc_debugfs_last_hdwq++;
-               if (lpfc_debugfs_last_hdwq >= phba->cfg_hdw_queue)
-                       lpfc_debugfs_last_hdwq = 0;
+               lpfc_debugfs_last_lock++;
+               if (lpfc_debugfs_last_lock >= phba->cfg_hdw_queue)
+                       lpfc_debugfs_last_lock = 0;
        }
 
        return len;
 }
+#endif
 
 static int lpfc_debugfs_last_hba_slim_off;
 
        struct lpfc_nvme_lport *lport;
        uint64_t data1, data2, data3;
        uint64_t tot, totin, totout;
-       int cnt, i, maxch;
+       int cnt, i;
        int len = 0;
 
        if (phba->nvmet_support) {
                                atomic_read(&lport->fc4NvmeLsRequests),
                                atomic_read(&lport->fc4NvmeLsCmpls));
 
-               if (phba->cfg_hdw_queue < LPFC_HBA_HDWQ_MAX)
-                       maxch = phba->cfg_hdw_queue;
-               else
-                       maxch = LPFC_HBA_HDWQ_MAX;
                totin = 0;
                totout = 0;
                for (i = 0; i < phba->cfg_hdw_queue; i++) {
 {
        struct lpfc_hba   *phba = vport->phba;
        struct lpfc_sli4_hdw_queue *qp;
-       int i, j;
+       int i, j, max_cnt;
        int len = 0;
        uint32_t tot_xmt;
        uint32_t tot_rcv;
        } else {
                len += snprintf(buf + len, PAGE_SIZE - len, "\n");
        }
+       max_cnt = size - LPFC_DEBUG_OUT_LINE_SZ;
 
        for (i = 0; i < phba->cfg_hdw_queue; i++) {
                qp = &phba->sli4_hba.hdwq[i];
                }
                len += snprintf(buf + len, PAGE_SIZE - len,
                                "Total: %x\n", tot_xmt);
+               if (len >= max_cnt) {
+                       len += snprintf(buf + len, PAGE_SIZE - len,
+                                       "Truncated ...\n");
+                       return len;
+               }
        }
        return len;
 }
                goto out;
        }
 
-       if (phba->cfg_xri_rebalancing)
-               debug->len = lpfc_debugfs_multixripools_data(
-                       phba, debug->buffer, LPFC_DUMP_MULTIXRIPOOL_SIZE);
-       else
-               debug->len = 0;
+       debug->len = lpfc_debugfs_multixripools_data(
+               phba, debug->buffer, LPFC_DUMP_MULTIXRIPOOL_SIZE);
 
        debug->i_private = inode->i_private;
        file->private_data = debug;
        return rc;
 }
 
+#ifdef LPFC_HDWQ_LOCK_STAT
 /**
- * lpfc_debugfs_hdwqinfo_open - Open the hdwqinfo debugfs buffer
+ * lpfc_debugfs_lockstat_open - Open the lockstat debugfs buffer
  * @inode: The inode pointer that contains a vport pointer.
  * @file: The file pointer to attach the log output.
  *
  * error value.
  **/
 static int
-lpfc_debugfs_hdwqinfo_open(struct inode *inode, struct file *file)
+lpfc_debugfs_lockstat_open(struct inode *inode, struct file *file)
 {
        struct lpfc_hba *phba = inode->i_private;
        struct lpfc_debug *debug;
                goto out;
        }
 
-       debug->len = lpfc_debugfs_hdwqinfo_data(phba, debug->buffer,
+       debug->len = lpfc_debugfs_lockstat_data(phba, debug->buffer,
                LPFC_HBQINFO_SIZE);
        file->private_data = debug;
 
        return rc;
 }
 
+static ssize_t
+lpfc_debugfs_lockstat_write(struct file *file, const char __user *buf,
+                           size_t nbytes, loff_t *ppos)
+{
+       struct lpfc_debug *debug = file->private_data;
+       struct lpfc_hba *phba = (struct lpfc_hba *)debug->i_private;
+       struct lpfc_sli4_hdw_queue *qp;
+       char mybuf[64];
+       char *pbuf;
+       int i;
+
+       /* Protect copy from user */
+       if (!access_ok(buf, nbytes))
+               return -EFAULT;
+
+       memset(mybuf, 0, sizeof(mybuf));
+
+       /* Clamp to the local buffer so a large write cannot overflow it */
+       if (copy_from_user(mybuf, buf,
+                          (nbytes > sizeof(mybuf) - 1) ?
+                          (sizeof(mybuf) - 1) : nbytes))
+               return -EFAULT;
+       pbuf = &mybuf[0];
+
+       if ((strncmp(pbuf, "reset", strlen("reset")) == 0) ||
+           (strncmp(pbuf, "zero", strlen("zero")) == 0)) {
+               for (i = 0; i < phba->cfg_hdw_queue; i++) {
+                       qp = &phba->sli4_hba.hdwq[i];
+                       qp->lock_conflict.alloc_xri_get = 0;
+                       qp->lock_conflict.alloc_xri_put = 0;
+                       qp->lock_conflict.free_xri = 0;
+                       qp->lock_conflict.wq_access = 0;
+                       qp->lock_conflict.alloc_pvt_pool = 0;
+                       qp->lock_conflict.mv_from_pvt_pool = 0;
+                       qp->lock_conflict.mv_to_pub_pool = 0;
+                       qp->lock_conflict.mv_to_pvt_pool = 0;
+                       qp->lock_conflict.free_pvt_pool = 0;
+                       qp->lock_conflict.free_pub_pool = 0;
+               }
+       }
+       return nbytes;
+}
+#endif
+
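Hedged usage note (the debugfs path is an assumption based on the usual lpfc layout, not something this patch shows): with LPFC_HDWQ_LOCK_STAT defined, reading the lockstat file cycles through the per-HDWQ lock-conflict counters, and writing "reset" or "zero" to it, for example

    echo reset > /sys/kernel/debug/lpfc/fn0/lockstat

clears the counters for every configured Hardware Queue.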
 /**
  * lpfc_debugfs_dumpHBASlim_open - Open the Dump HBA SLIM debugfs buffer
  * @inode: The inode pointer that contains a vport pointer.
        }
 
        debug->len = lpfc_debugfs_cpucheck_data(vport, debug->buffer,
-               LPFC_NVMEKTIME_SIZE);
+               LPFC_CPUCHECK_SIZE);
 
        debug->i_private = inode->i_private;
        file->private_data = debug;
        pbuf = &mybuf[0];
 
        if ((strncmp(pbuf, "on", sizeof("on") - 1) == 0)) {
+               if (phba->nvmet_support)
+                       phba->cpucheck_on |= LPFC_CHECK_NVMET_IO;
+               else
+                       phba->cpucheck_on |= (LPFC_CHECK_NVME_IO |
+                               LPFC_CHECK_SCSI_IO);
+               return strlen(pbuf);
+       } else if ((strncmp(pbuf, "nvme_on", sizeof("nvme_on") - 1) == 0)) {
                if (phba->nvmet_support)
                        phba->cpucheck_on |= LPFC_CHECK_NVMET_IO;
                else
                        phba->cpucheck_on |= LPFC_CHECK_NVME_IO;
                return strlen(pbuf);
+       } else if ((strncmp(pbuf, "scsi_on", sizeof("scsi_on") - 1) == 0)) {
+               phba->cpucheck_on |= LPFC_CHECK_SCSI_IO;
+               return strlen(pbuf);
        } else if ((strncmp(pbuf, "rcv",
                   sizeof("rcv") - 1) == 0)) {
                if (phba->nvmet_support)
                int *len, int max_cnt, int eqidx, int eq_id)
 {
        struct lpfc_queue *qp;
-       int qidx, rc;
+       int rc;
 
-       for (qidx = 0; qidx < phba->cfg_hdw_queue; qidx++) {
-               qp = phba->sli4_hba.hdwq[qidx].fcp_cq;
-               if (qp->assoc_qid != eq_id)
-                       continue;
+       qp = phba->sli4_hba.hdwq[eqidx].fcp_cq;
 
-               *len = __lpfc_idiag_print_cq(qp, "FCP", pbuffer, *len);
+       *len = __lpfc_idiag_print_cq(qp, "FCP", pbuffer, *len);
 
-               /* Reset max counter */
-               qp->CQ_max_cqe = 0;
+       /* Reset max counter */
+       qp->CQ_max_cqe = 0;
 
-               if (*len >= max_cnt)
-                       return 1;
+       if (*len >= max_cnt)
+               return 1;
 
-               rc = lpfc_idiag_wqs_for_cq(phba, "FCP", pbuffer, len,
-                               max_cnt, qp->queue_id);
-               if (rc)
-                       return 1;
-       }
+       rc = lpfc_idiag_wqs_for_cq(phba, "FCP", pbuffer, len,
+                                  max_cnt, qp->queue_id);
+       if (rc)
+               return 1;
 
        if (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME) {
-               for (qidx = 0; qidx < phba->cfg_hdw_queue; qidx++) {
-                       qp = phba->sli4_hba.hdwq[qidx].nvme_cq;
-                       if (qp->assoc_qid != eq_id)
-                               continue;
+               qp = phba->sli4_hba.hdwq[eqidx].nvme_cq;
 
-                       *len = __lpfc_idiag_print_cq(qp, "NVME", pbuffer, *len);
+               *len = __lpfc_idiag_print_cq(qp, "NVME", pbuffer, *len);
 
-                       /* Reset max counter */
-                       qp->CQ_max_cqe = 0;
+               /* Reset max counter */
+               qp->CQ_max_cqe = 0;
 
-                       if (*len >= max_cnt)
-                               return 1;
+               if (*len >= max_cnt)
+                       return 1;
 
-                       rc = lpfc_idiag_wqs_for_cq(phba, "NVME", pbuffer, len,
-                                                  max_cnt, qp->queue_id);
-                       if (rc)
-                               return 1;
-               }
+               rc = lpfc_idiag_wqs_for_cq(phba, "NVME", pbuffer, len,
+                                          max_cnt, qp->queue_id);
+               if (rc)
+                       return 1;
        }
 
        if ((eqidx < phba->cfg_nvmet_mrq) && phba->nvmet_support) {
                        (unsigned long long)qp->q_cnt_4, qp->q_mode);
        len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len,
                        "EQID[%02d], QE-CNT[%04d], QE-SZ[%04d], "
-                       "HST-IDX[%04d], PRT-IDX[%04d], PST[%03d]",
+                       "HST-IDX[%04d], PRT-IDX[%04d], PST[%03d] AFFIN[%03d]",
                        qp->queue_id, qp->entry_count, qp->entry_size,
-                       qp->host_index, qp->hba_index, qp->entry_repost);
+                       qp->host_index, qp->hba_index, qp->entry_repost,
+                       qp->chann);
        len +=  snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len, "\n");
 
        return len;
                        phba->lpfc_idiag_last_eq = 0;
 
                len += snprintf(pbuffer + len, LPFC_QUE_INFO_GET_BUF_SIZE - len,
-                                       "EQ %d out of %d HBA EQs\n",
+                                       "HDWQ %d out of %d HBA HDWQs\n",
                                        x, phba->cfg_hdw_queue);
 
                /* Fast-path EQ */
        .release =      lpfc_debugfs_release,
 };
 
-#undef lpfc_debugfs_op_hdwqinfo
-static const struct file_operations lpfc_debugfs_op_hdwqinfo = {
+#ifdef LPFC_HDWQ_LOCK_STAT
+#undef lpfc_debugfs_op_lockstat
+static const struct file_operations lpfc_debugfs_op_lockstat = {
        .owner =        THIS_MODULE,
-       .open =         lpfc_debugfs_hdwqinfo_open,
+       .open =         lpfc_debugfs_lockstat_open,
        .llseek =       lpfc_debugfs_lseek,
        .read =         lpfc_debugfs_read,
+       .write =        lpfc_debugfs_lockstat_write,
        .release =      lpfc_debugfs_release,
 };
+#endif
 
 #undef lpfc_debugfs_op_dumpHBASlim
 static const struct file_operations lpfc_debugfs_op_dumpHBASlim = {
                                            phba->hba_debugfs_root,
                                            phba, &lpfc_debugfs_op_hbqinfo);
 
-               /* Setup hdwqinfo */
-               snprintf(name, sizeof(name), "hdwqinfo");
-               phba->debug_hdwqinfo =
+#ifdef LPFC_HDWQ_LOCK_STAT
+               /* Setup lockstat */
+               snprintf(name, sizeof(name), "lockstat");
+               phba->debug_lockstat =
                        debugfs_create_file(name, S_IFREG | 0644,
                                            phba->hba_debugfs_root,
-                                           phba, &lpfc_debugfs_op_hdwqinfo);
-               if (!phba->debug_hdwqinfo) {
+                                           phba, &lpfc_debugfs_op_lockstat);
+               if (!phba->debug_lockstat) {
                        lpfc_printf_vlog(vport, KERN_ERR, LOG_INIT,
-                                        "0511 Cant create debugfs hdwqinfo\n");
+                                        "0913 Cannot create debugfs lockstat\n");
                        goto debug_failed;
                }
+#endif
 
                /* Setup dumpHBASlim */
                if (phba->sli_rev < LPFC_SLI_REV4) {
                                    vport, &lpfc_debugfs_op_scsistat);
        if (!vport->debug_scsistat) {
                lpfc_printf_vlog(vport, KERN_ERR, LOG_INIT,
-                                "0811 Cannot create debugfs scsistat\n");
+                                "0914 Cannot create debugfs scsistat\n");
                goto debug_failed;
        }
 
                debugfs_remove(phba->debug_hbqinfo); /* hbqinfo */
                phba->debug_hbqinfo = NULL;
 
-               debugfs_remove(phba->debug_hdwqinfo); /* hdwqinfo */
-               phba->debug_hdwqinfo = NULL;
-
+#ifdef LPFC_HDWQ_LOCK_STAT
+               debugfs_remove(phba->debug_lockstat); /* lockstat */
+               phba->debug_lockstat = NULL;
+#endif
                debugfs_remove(phba->debug_dumpHBASlim); /* HBASlim */
                phba->debug_dumpHBASlim = NULL;
 
 
 /* multixripool output buffer size */
 #define LPFC_DUMP_MULTIXRIPOOL_SIZE 8192
 
-/* hdwqinfo output buffer size */
-#define LPFC_HDWQINFO_SIZE 8192
-
 enum {
        DUMP_FCP,
        DUMP_NVME,
 
 #define LPFC_DEF_IMAX          150000
 
 #define LPFC_MIN_CPU_MAP       0
-#define LPFC_MAX_CPU_MAP       2
+#define LPFC_MAX_CPU_MAP       1
 #define LPFC_HBA_CPU_MAP       1
-#define LPFC_DRIVER_CPU_MAP    2  /* Default */
 
 /* PORT_CAPABILITIES constants. */
 #define LPFC_MAX_SUPPORTED_PAGES       8
 
 #include <linux/miscdevice.h>
 #include <linux/percpu.h>
 #include <linux/msi.h>
+#include <linux/irq.h>
 #include <linux/bitops.h>
 
 #include <scsi/scsi.h>
 static void lpfc_sli4_disable_intr(struct lpfc_hba *);
 static uint32_t lpfc_sli4_enable_intr(struct lpfc_hba *, uint32_t);
 static void lpfc_sli4_oas_verify(struct lpfc_hba *phba);
+static uint16_t lpfc_find_eq_handle(struct lpfc_hba *, uint16_t);
+static uint16_t lpfc_find_cpu_handle(struct lpfc_hba *, uint16_t, int);
 
 static struct scsi_transport_template *lpfc_transport_template = NULL;
 static struct scsi_transport_template *lpfc_vport_transport_template = NULL;
                }
 
                /* Interrupts per sec per EQ */
-               val = phba->cfg_fcp_imax / phba->cfg_hdw_queue;
+               val = phba->cfg_fcp_imax / phba->cfg_irq_chann;
                tick_cqe = val / CONFIG_HZ; /* Per tick per EQ */
 
                /* Assume 1 CQE/ISR, calc max CQEs allowed for time duration */
                max_cqe = time_elapsed * tick_cqe;
 
-               for (i = 0; i < phba->cfg_hdw_queue; i++) {
+               for (i = 0; i < phba->cfg_irq_chann; i++) {
                        /* Fast-path EQ */
                        qp = phba->sli4_hba.hdwq[i].hba_eq;
                        if (!qp)
                                if (val) {
                                        /* First, interrupts per sec per EQ */
                                        val = phba->cfg_fcp_imax /
-                                               phba->cfg_hdw_queue;
+                                               phba->cfg_irq_chann;
 
                                        /* us delay between each interrupt */
                                        val = LPFC_SEC_TO_USEC / val;
        shost->max_lun = vport->cfg_max_luns;
        shost->this_id = -1;
        shost->max_cmd_len = 16;
+
        if (phba->sli_rev == LPFC_SLI_REV4) {
-               shost->nr_hw_queues = phba->cfg_hdw_queue;
+               if (phba->cfg_fcp_io_sched == LPFC_FCP_SCHED_BY_HDWQ)
+                       shost->nr_hw_queues = phba->cfg_hdw_queue;
+               else
+                       shost->nr_hw_queues = phba->sli4_hba.num_present_cpu;
+
                shost->dma_boundary =
                        phba->sli4_hba.pc_sli4_params.sge_supp_len-1;
                shost->sg_tablesize = phba->cfg_scsi_seg_cnt;
                goto out_remove_rpi_hdrs;
        }
 
-       phba->sli4_hba.hba_eq_hdl = kcalloc(phba->cfg_hdw_queue,
+       phba->sli4_hba.hba_eq_hdl = kcalloc(phba->cfg_irq_chann,
                                            sizeof(struct lpfc_hba_eq_hdl),
                                            GFP_KERNEL);
        if (!phba->sli4_hba.hba_eq_hdl) {
        struct lpfc_rsrc_desc_fcfcoe *desc;
        char *pdesc_0;
        uint16_t forced_link_speed;
-       uint32_t if_type;
+       uint32_t if_type, qmin;
        int length, i, rc = 0, rc2;
 
        pmb = (LPFC_MBOXQ_t *) mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL);
                                phba->sli4_hba.max_cfg_param.max_rq);
 
                /*
-                * Calculate NVME queue resources based on how
-                * many WQ/CQs are available.
+                * Calculate queue resources based on how
+                * many WQ/CQ/EQs are available.
                 */
-               if (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME) {
-                       length = phba->sli4_hba.max_cfg_param.max_wq;
-                       if (phba->sli4_hba.max_cfg_param.max_cq <
-                           phba->sli4_hba.max_cfg_param.max_wq)
-                               length = phba->sli4_hba.max_cfg_param.max_cq;
+               qmin = phba->sli4_hba.max_cfg_param.max_wq;
+               if (phba->sli4_hba.max_cfg_param.max_cq < qmin)
+                       qmin = phba->sli4_hba.max_cfg_param.max_cq;
+               if (phba->sli4_hba.max_cfg_param.max_eq < qmin)
+                       qmin = phba->sli4_hba.max_cfg_param.max_eq;
+               /*
+                * What's left after this can go toward NVME / FCP.
+                * The minus 4 accounts for ELS, NVME LS, MBOX
+                * plus one extra. When configured for
+                * NVMET, FCP io channel WQs are not created.
+                */
+               qmin -= 4;
 
-                       /*
-                        * Whats left after this can go toward NVME.
-                        * The minus 6 accounts for ELS, NVME LS, MBOX
-                        * plus a couple extra. When configured for
-                        * NVMET, FCP io channel WQs are not created.
-                        */
-                       length -= 6;
-
-                       /* Take off FCP queues */
-                       if (!phba->nvmet_support)
-                               length -= phba->cfg_hdw_queue;
-
-                       /* Check to see if there is enough for NVME */
-                       if (phba->cfg_hdw_queue > length) {
-                               lpfc_printf_log(
-                                       phba, KERN_ERR, LOG_SLI,
-                                       "2005 Reducing NVME IO channel to %d: "
-                                       "WQ %d CQ %d CommonIO %d\n",
-                                       length,
+               /* If NVME is configured, double the number of CQ/WQs needed */
+               if ((phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME) &&
+                   !phba->nvmet_support)
+                       qmin /= 2;
+
+               /* Check to see if there is enough for NVME */
+               if ((phba->cfg_irq_chann > qmin) ||
+                   (phba->cfg_hdw_queue > qmin)) {
+                       lpfc_printf_log(phba, KERN_ERR, LOG_SLI,
+                                       "2005 Reducing Queues: "
+                                       "WQ %d CQ %d EQ %d: min %d: "
+                                       "IRQ %d HDWQ %d\n",
                                        phba->sli4_hba.max_cfg_param.max_wq,
                                        phba->sli4_hba.max_cfg_param.max_cq,
+                                       phba->sli4_hba.max_cfg_param.max_eq,
+                                       qmin, phba->cfg_irq_chann,
                                        phba->cfg_hdw_queue);
 
-                               phba->cfg_hdw_queue = length;
-                       }
+                       if (phba->cfg_irq_chann > qmin)
+                               phba->cfg_irq_chann = qmin;
+                       if (phba->cfg_hdw_queue > qmin)
+                               phba->cfg_hdw_queue = qmin;
                }
        }
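Worked example with invented firmware limits: if the port reports max_wq = 128, max_cq = 128 and max_eq = 64, then qmin = 64; subtracting 4 for ELS, NVME LS, MBOX and the one extra leaves 60; with NVME enabled (and not in NVMET mode) each Hardware Queue needs two CQ/WQ pairs, so qmin becomes 30 and both cfg_irq_chann and cfg_hdw_queue are capped at 30.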
 
         * device parameters
         */
 
-       if (phba->cfg_hdw_queue > phba->sli4_hba.max_cfg_param.max_eq) {
-               lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
-                               "2575 Reducing IO channels to match number of "
-                               "available EQs: from %d to %d\n",
-                               phba->cfg_hdw_queue,
-                               phba->sli4_hba.max_cfg_param.max_eq);
-               phba->cfg_hdw_queue = phba->sli4_hba.max_cfg_param.max_eq;
-       }
-
        if (phba->nvmet_support) {
-               if (phba->cfg_hdw_queue < phba->cfg_nvmet_mrq)
-                       phba->cfg_nvmet_mrq = phba->cfg_hdw_queue;
+               if (phba->cfg_irq_chann < phba->cfg_nvmet_mrq)
+                       phba->cfg_nvmet_mrq = phba->cfg_irq_chann;
        }
        if (phba->cfg_nvmet_mrq > LPFC_NVMET_MRQ_MAX)
                phba->cfg_nvmet_mrq = LPFC_NVMET_MRQ_MAX;
 
        lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
-                       "2574 IO channels: hdwQ %d MRQ: %d\n",
-                       phba->cfg_hdw_queue, phba->cfg_nvmet_mrq);
+                       "2574 IO channels: hdwQ %d IRQ %d MRQ: %d\n",
+                       phba->cfg_hdw_queue, phba->cfg_irq_chann,
+                       phba->cfg_nvmet_mrq);
 
        /* Get EQ depth from module parameter, fake the default for now */
        phba->sli4_hba.eq_esize = LPFC_EQE_SIZE_4B;
        }
        qdesc->qe_valid = 1;
        qdesc->hdwq = wqidx;
+       qdesc->chann = lpfc_find_cpu_handle(phba, wqidx, LPFC_FIND_BY_HDWQ);
        phba->sli4_hba.hdwq[wqidx].nvme_cq = qdesc;
 
        qdesc = lpfc_sli4_queue_alloc(phba, LPFC_EXPANDED_PAGE_SIZE,
                return 1;
        }
        qdesc->hdwq = wqidx;
+       qdesc->chann = wqidx;
        phba->sli4_hba.hdwq[wqidx].nvme_wq = qdesc;
        list_add_tail(&qdesc->wq_list, &phba->sli4_hba.lpfc_wq_list);
        return 0;
        }
        qdesc->qe_valid = 1;
        qdesc->hdwq = wqidx;
+       qdesc->chann = lpfc_find_cpu_handle(phba, wqidx, LPFC_FIND_BY_HDWQ);
        phba->sli4_hba.hdwq[wqidx].fcp_cq = qdesc;
 
        /* Create Fast Path FCP WQs */
                return 1;
        }
        qdesc->hdwq = wqidx;
+       qdesc->chann = wqidx;
        phba->sli4_hba.hdwq[wqidx].fcp_wq = qdesc;
        list_add_tail(&qdesc->wq_list, &phba->sli4_hba.lpfc_wq_list);
        return 0;
 lpfc_sli4_queue_create(struct lpfc_hba *phba)
 {
        struct lpfc_queue *qdesc;
-       int idx;
+       int idx, eqidx;
        struct lpfc_sli4_hdw_queue *qp;
 
        /*
 
        /* Create HBA Event Queues (EQs) */
        for (idx = 0; idx < phba->cfg_hdw_queue; idx++) {
-               /* Create EQs */
+               /*
+                * If there are more Hardware Queues than available
+                * IRQ vectors (EQs), multiple Hardware Queues may share
+                * a common EQ.
+                */
+               if (idx >= phba->cfg_irq_chann) {
+                       /* Share an existing EQ */
+                       eqidx = lpfc_find_eq_handle(phba, idx);
+                       phba->sli4_hba.hdwq[idx].hba_eq =
+                               phba->sli4_hba.hdwq[eqidx].hba_eq;
+                       continue;
+               }
+               /* Create an EQ */
                qdesc = lpfc_sli4_queue_alloc(phba, LPFC_DEFAULT_PAGE_SIZE,
                                              phba->sli4_hba.eq_esize,
                                              phba->sli4_hba.eq_ecount);
                }
                qdesc->qe_valid = 1;
                qdesc->hdwq = idx;
+
+               /* Save the CPU this EQ is affinitized to */
+               eqidx = lpfc_find_eq_handle(phba, idx);
+               qdesc->chann = lpfc_find_cpu_handle(phba, eqidx,
+                                                   LPFC_FIND_BY_EQ);
                phba->sli4_hba.hdwq[idx].hba_eq = qdesc;
        }
 
 
        /* Allocate SCSI SLI4 CQ/WQs */
-       for (idx = 0; idx < phba->cfg_hdw_queue; idx++)
+       for (idx = 0; idx < phba->cfg_hdw_queue; idx++) {
                if (lpfc_alloc_fcp_wq_cq(phba, idx))
                        goto out_error;
+       }
 
        /* Allocate NVME SLI4 CQ/WQs */
        if (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME) {
-               for (idx = 0; idx < phba->cfg_hdw_queue; idx++)
+               for (idx = 0; idx < phba->cfg_hdw_queue; idx++) {
                        if (lpfc_alloc_nvme_wq_cq(phba, idx))
                                goto out_error;
+               }
 
                if (phba->nvmet_support) {
                        for (idx = 0; idx < phba->cfg_nvmet_mrq; idx++) {
                                }
                                qdesc->qe_valid = 1;
                                qdesc->hdwq = idx;
+                               qdesc->chann = idx;
                                phba->sli4_hba.nvmet_cqset[idx] = qdesc;
                        }
                }
                goto out_error;
        }
        qdesc->qe_valid = 1;
+       qdesc->chann = 0;
        phba->sli4_hba.els_cq = qdesc;
 
 
                                "0505 Failed allocate slow-path MQ\n");
                goto out_error;
        }
+       qdesc->chann = 0;
        phba->sli4_hba.mbx_wq = qdesc;
 
        /*
                                "0504 Failed allocate slow-path ELS WQ\n");
                goto out_error;
        }
+       qdesc->chann = 0;
        phba->sli4_hba.els_wq = qdesc;
        list_add_tail(&qdesc->wq_list, &phba->sli4_hba.lpfc_wq_list);
 
                                        "6079 Failed allocate NVME LS CQ\n");
                        goto out_error;
                }
+               qdesc->chann = 0;
                qdesc->qe_valid = 1;
                phba->sli4_hba.nvmels_cq = qdesc;
 
                                        "6080 Failed allocate NVME LS WQ\n");
                        goto out_error;
                }
+               qdesc->chann = 0;
                phba->sli4_hba.nvmels_wq = qdesc;
                list_add_tail(&qdesc->wq_list, &phba->sli4_hba.lpfc_wq_list);
        }
 }
 
 static inline void
-lpfc_sli4_release_hdwq(struct lpfc_sli4_hdw_queue *hdwq, int max)
+lpfc_sli4_release_hdwq(struct lpfc_hba *phba)
 {
+       struct lpfc_sli4_hdw_queue *hdwq;
        uint32_t idx;
 
-       for (idx = 0; idx < max; idx++) {
-               lpfc_sli4_queue_free(hdwq[idx].hba_eq);
+       hdwq = phba->sli4_hba.hdwq;
+       for (idx = 0; idx < phba->cfg_hdw_queue; idx++) {
+               if (idx < phba->cfg_irq_chann)
+                       lpfc_sli4_queue_free(hdwq[idx].hba_eq);
+               hdwq[idx].hba_eq = NULL;
+
                lpfc_sli4_queue_free(hdwq[idx].fcp_cq);
                lpfc_sli4_queue_free(hdwq[idx].nvme_cq);
                lpfc_sli4_queue_free(hdwq[idx].fcp_wq);
                lpfc_sli4_queue_free(hdwq[idx].nvme_wq);
-               hdwq[idx].hba_eq = NULL;
                hdwq[idx].fcp_cq = NULL;
                hdwq[idx].nvme_cq = NULL;
                hdwq[idx].fcp_wq = NULL;
 {
        /* Release HBA eqs */
        if (phba->sli4_hba.hdwq)
-               lpfc_sli4_release_hdwq(phba->sli4_hba.hdwq,
-                                      phba->cfg_hdw_queue);
+               lpfc_sli4_release_hdwq(phba);
 
        if (phba->nvmet_support) {
                lpfc_sli4_release_queues(&phba->sli4_hba.nvmet_cqset,
                        qidx, (uint32_t)rc);
                return rc;
        }
-       cq->chann = qidx;
 
        if (qtype != LPFC_MBOX) {
                /* Setup cq_map for fast lookup */
                        /* no need to tear down cq - caller will do so */
                        return rc;
                }
-               wq->chann = qidx;
 
                /* Bind this CQ/WQ to the NVME ring */
                pring = wq->pring;
        return 0;
 }
 
+/**
+ * lpfc_setup_cq_lookup - Setup the CQ lookup table
+ * @phba: pointer to lpfc hba data structure.
+ *
+ * This routine populates the cq_lookup table with all
+ * available CQ queue_ids.
+ **/
+void
+lpfc_setup_cq_lookup(struct lpfc_hba *phba)
+{
+       struct lpfc_queue *eq, *childq;
+       struct lpfc_sli4_hdw_queue *qp;
+       int qidx;
+
+       qp = phba->sli4_hba.hdwq;
+       memset(phba->sli4_hba.cq_lookup, 0,
+              (sizeof(struct lpfc_queue *) * (phba->sli4_hba.cq_max + 1)));
+       for (qidx = 0; qidx < phba->cfg_irq_chann; qidx++) {
+               eq = qp[qidx].hba_eq;
+               if (!eq)
+                       continue;
+               list_for_each_entry(childq, &eq->child_list, list) {
+                       if (childq->queue_id > phba->sli4_hba.cq_max)
+                               continue;
+                       if ((childq->subtype == LPFC_FCP) ||
+                           (childq->subtype == LPFC_NVME))
+                               phba->sli4_hba.cq_lookup[childq->queue_id] =
+                                       childq;
+               }
+       }
+}
+
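The table costs one pointer per possible CQ id but lets the interrupt path resolve a completed CQ in constant time instead of walking the EQ child list. A minimal sketch of how a fast-path handler could consume it; the helper name is invented and the caller is assumed to have already extracted the CQID from the EQE:

/* Illustrative only: resolve a completed CQ by its queue_id. */
static inline struct lpfc_queue *
lpfc_example_cqid_to_cq(struct lpfc_hba *phba, uint16_t cqid)
{
        /* Fall back to a list walk if the table is absent or the CQID
         * is outside the range captured when the table was built.
         */
        if (!phba->sli4_hba.cq_lookup || cqid > phba->sli4_hba.cq_max)
                return NULL;
        return phba->sli4_hba.cq_lookup[cqid];
}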
 /**
  * lpfc_sli4_queue_setup - Set up all the SLI4 queues
  * @phba: pointer to lpfc hba data structure.
                rc = -ENOMEM;
                goto out_error;
        }
-       for (qidx = 0; qidx < phba->cfg_hdw_queue; qidx++) {
+       for (qidx = 0; qidx < phba->cfg_irq_chann; qidx++) {
                if (!qp[qidx].hba_eq) {
                        lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
                                        "0522 Fast-path EQ (%d) not "
                        phba->sli4_hba.dat_rq->queue_id,
                        phba->sli4_hba.els_cq->queue_id);
 
-       for (qidx = 0; qidx < phba->cfg_hdw_queue;
+       for (qidx = 0; qidx < phba->cfg_irq_chann;
             qidx += LPFC_MAX_EQ_DELAY_EQID_CNT)
                lpfc_modify_hba_eq_delay(phba, qidx, LPFC_MAX_EQ_DELAY_EQID_CNT,
                                         phba->cfg_fcp_imax);
 
+       if (phba->sli4_hba.cq_max) {
+               kfree(phba->sli4_hba.cq_lookup);
+               phba->sli4_hba.cq_lookup = kcalloc((phba->sli4_hba.cq_max + 1),
+                       sizeof(struct lpfc_queue *), GFP_KERNEL);
+               if (!phba->sli4_hba.cq_lookup) {
+                       lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
+                                       "0549 Failed setup of CQ Lookup table: "
+                                       "size 0x%x\n", phba->sli4_hba.cq_max);
+                       goto out_destroy;
+               }
+               lpfc_setup_cq_lookup(phba);
+       }
        return 0;
 
 out_destroy:
                        lpfc_wq_destroy(phba, qp->nvme_wq);
                        lpfc_cq_destroy(phba, qp->fcp_cq);
                        lpfc_cq_destroy(phba, qp->nvme_cq);
-                       lpfc_eq_destroy(phba, qp->hba_eq);
+                       if (qidx < phba->cfg_irq_chann)
+                               lpfc_eq_destroy(phba, qp->hba_eq);
                }
        }
+
+       kfree(phba->sli4_hba.cq_lookup);
+       phba->sli4_hba.cq_lookup = NULL;
+       phba->sli4_hba.cq_max = 0;
 }
 
 /**
        phba->sli.slistat.sli_intr = 0;
 }
 
+/**
+ * lpfc_find_cpu_handle - Find the CPU that corresponds to the specified EQ/HDWQ
+ * @phba: pointer to lpfc hba data structure.
+ * @id: EQ vector index or Hardware Queue index
+ * @match: LPFC_FIND_BY_EQ = match by EQ
+ *         LPFC_FIND_BY_HDWQ = match by Hardware Queue
+ */
+static uint16_t
+lpfc_find_cpu_handle(struct lpfc_hba *phba, uint16_t id, int match)
+{
+       struct lpfc_vector_map_info *cpup;
+       int cpu;
+
+       /* Find the CPU that matches the specified EQ or Hardware Queue */
+       cpup = phba->sli4_hba.cpu_map;
+       for (cpu = 0; cpu < phba->sli4_hba.num_present_cpu; cpu++) {
+               if ((match == LPFC_FIND_BY_EQ) &&
+                   (cpup->irq != LPFC_VECTOR_MAP_EMPTY) &&
+                   (cpup->eq == id))
+                       return cpu;
+               if ((match == LPFC_FIND_BY_HDWQ) && (cpup->hdwq == id))
+                       return cpu;
+               cpup++;
+       }
+       return 0;
+}
+
+/**
+ * lpfc_find_eq_handle - Find the EQ that corresponds to the specified
+ *                       Hardware Queue
+ * @phba: pointer to lpfc hba data structure.
+ * @hdwq: Hardware Queue index
+ */
+static uint16_t
+lpfc_find_eq_handle(struct lpfc_hba *phba, uint16_t hdwq)
+{
+       struct lpfc_vector_map_info *cpup;
+       int cpu;
+
+       /* Find the EQ associated with the specified Hardware Queue */
+       cpup = phba->sli4_hba.cpu_map;
+       for (cpu = 0; cpu < phba->sli4_hba.num_present_cpu; cpu++) {
+               if (cpup->hdwq == hdwq)
+                       return cpup->eq;
+               cpup++;
+       }
+       return 0;
+}
+
+/**
+ * lpfc_find_phys_id_eq - Find the next EQ that corresponds to the specified
+ *                        Physical Id.
+ * @phba: pointer to lpfc hba data structure.
+ * @eqidx: EQ index
+ * @phys_id: CPU package physical id
+ */
+static uint16_t
+lpfc_find_phys_id_eq(struct lpfc_hba *phba, uint16_t eqidx, uint16_t phys_id)
+{
+       struct lpfc_vector_map_info *cpup;
+       int cpu, desired_phys_id;
+
+       desired_phys_id = LPFC_VECTOR_MAP_EMPTY;
+
+       /* Find the desired phys_id for the specified EQ */
+       cpup = phba->sli4_hba.cpu_map;
+       for (cpu = 0; cpu < phba->sli4_hba.num_present_cpu; cpu++) {
+               if ((cpup->irq != LPFC_VECTOR_MAP_EMPTY) &&
+                   (cpup->eq == eqidx)) {
+                       desired_phys_id = cpup->phys_id;
+                       break;
+               }
+               cpup++;
+       }
+       if (phys_id == desired_phys_id)
+               return eqidx;
+
+       /* Find an EQ that is on the specified phys_id */
+       cpup = phba->sli4_hba.cpu_map;
+       for (cpu = 0; cpu < phba->sli4_hba.num_present_cpu; cpu++) {
+               if ((cpup->irq != LPFC_VECTOR_MAP_EMPTY) &&
+                   (cpup->phys_id == phys_id))
+                       return cpup->eq;
+               cpup++;
+       }
+       return 0;
+}
+
+/**
+ * lpfc_find_cpu_map - Find next available CPU map entry that matches the
+ *                     phys_id and core_id.
+ * @phba: pointer to lpfc hba data structure.
+ * @phys_id: CPU package physical id
+ * @core_id: CPU core id
+ * @hdwqidx: Hardware Queue index
+ * @eqidx: EQ index
+ * @isr_avail: Should an IRQ be associated with this entry
+ */
+static struct lpfc_vector_map_info *
+lpfc_find_cpu_map(struct lpfc_hba *phba, uint16_t phys_id, uint16_t core_id,
+                 uint16_t hdwqidx, uint16_t eqidx, int isr_avail)
+{
+       struct lpfc_vector_map_info *cpup;
+       int cpu;
+
+       cpup = phba->sli4_hba.cpu_map;
+       for (cpu = 0; cpu < phba->sli4_hba.num_present_cpu; cpu++) {
+               /* Does the cpup match the one we are looking for */
+               if ((cpup->phys_id == phys_id) &&
+                   (cpup->core_id == core_id)) {
+                       /* If it has been already assigned, then skip it */
+                       if (cpup->hdwq != LPFC_VECTOR_MAP_EMPTY) {
+                               cpup++;
+                               continue;
+                       }
+                       /* Ensure we are on the same phys_id as the first one */
+                       if (!isr_avail)
+                               cpup->eq = lpfc_find_phys_id_eq(phba, eqidx,
+                                                               phys_id);
+                       else
+                               cpup->eq = eqidx;
+
+                       cpup->hdwq = hdwqidx;
+                       if (isr_avail) {
+                               cpup->irq =
+                                   pci_irq_vector(phba->pcidev, eqidx);
+
+                               /* Now affinitize to the selected CPU */
+                               irq_set_affinity_hint(cpup->irq,
+                                                     get_cpu_mask(cpu));
+                               irq_set_status_flags(cpup->irq,
+                                                    IRQ_NO_BALANCING);
+
+                               lpfc_printf_log(phba, KERN_INFO, LOG_INIT,
+                                               "3330 Set Affinity: CPU %d "
+                                               "EQ %d irq %d (HDWQ %x)\n",
+                                               cpu, cpup->eq,
+                                               cpup->irq, cpup->hdwq);
+                       }
+                       return cpup;
+               }
+               cpup++;
+       }
+       return NULL;
+}
+
+#ifdef CONFIG_X86
+/**
+ * lpfc_find_hyper - Determine if the CPU map entry is hyper-threaded
+ * @phba: pointer to lpfc hba data structure.
+ * @cpu: CPU map index
+ * @phys_id: CPU package physical id
+ * @core_id: CPU core id
+ */
+static int
+lpfc_find_hyper(struct lpfc_hba *phba, int cpu,
+               uint16_t phys_id, uint16_t core_id)
+{
+       struct lpfc_vector_map_info *cpup;
+       int idx;
+
+       cpup = phba->sli4_hba.cpu_map;
+       for (idx = 0; idx < phba->sli4_hba.num_present_cpu; idx++) {
+               /* Does the cpup match the one we are looking for */
+               if ((cpup->phys_id == phys_id) &&
+                   (cpup->core_id == core_id) &&
+                   (cpu != idx)) {
+                       return 1;
+               }
+               cpup++;
+       }
+       return 0;
+}
+#endif
+
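For a concrete (hypothetical) topology: on a two-socket x86 host with hyper-threading, logical CPUs 0 and 16 may both report phys_id 0 and core_id 0. When the map entry for CPU 16 is filled in, lpfc_find_hyper() finds CPU 0 already carrying the same ids and returns 1, which is what the new "ht" column in the lpfc_fcp_cpu_map output reports for that entry.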
 /**
  * lpfc_cpu_affinity_check - Check vector CPU affinity mappings
  * @phba: pointer to lpfc hba data structure.
+ * @vectors: number of msix vectors allocated.
  *
  * The routine will figure out the CPU affinity assignment for every
- * MSI-X vector allocated for the HBA.  The hba_eq_hdl will be updated
- * with a pointer to the CPU mask that defines ALL the CPUs this vector
- * can be associated with. If the vector can be unquely associated with
- * a single CPU, that CPU will be recorded in hba_eq_hdl[index].cpu.
+ * MSI-X vector allocated for the HBA.
  * In addition, the CPU to IO channel mapping will be calculated
  * and the phba->sli4_hba.cpu_map array will reflect this.
  */
 static void
-lpfc_cpu_affinity_check(struct lpfc_hba *phba)
+lpfc_cpu_affinity_check(struct lpfc_hba *phba, int vectors)
 {
+       int i, j, idx, phys_id;
+       int max_phys_id, min_phys_id;
+       int max_core_id, min_core_id;
        struct lpfc_vector_map_info *cpup;
-       int cpu, idx;
+       int cpu, eqidx, hdwqidx, isr_avail;
 #ifdef CONFIG_X86
        struct cpuinfo_x86 *cpuinfo;
 #endif
               (sizeof(struct lpfc_vector_map_info) *
               phba->sli4_hba.num_present_cpu));
 
+       max_phys_id = 0;
+       min_phys_id = 0xffff;
+       max_core_id = 0;
+       min_core_id = 0xffff;
+       phys_id = 0;
+
        /* Update CPU map with physical id and core id of each CPU */
        cpup = phba->sli4_hba.cpu_map;
        for (cpu = 0; cpu < phba->sli4_hba.num_present_cpu; cpu++) {
                cpuinfo = &cpu_data(cpu);
                cpup->phys_id = cpuinfo->phys_proc_id;
                cpup->core_id = cpuinfo->cpu_core_id;
+               cpup->hyper = lpfc_find_hyper(phba, cpu,
+                                             cpup->phys_id, cpup->core_id);
 #else
                /* No distinction between CPUs for other platforms */
                cpup->phys_id = 0;
-               cpup->core_id = 0;
+               cpup->core_id = cpu;
+               cpup->hyper = 0;
 #endif
+
                lpfc_printf_log(phba, KERN_INFO, LOG_INIT,
                                "3328 CPU physid %d coreid %d\n",
                                cpup->phys_id, cpup->core_id);
+
+               if (cpup->phys_id > max_phys_id)
+                       max_phys_id = cpup->phys_id;
+               if (cpup->phys_id < min_phys_id)
+                       min_phys_id = cpup->phys_id;
+
+               if (cpup->core_id > max_core_id)
+                       max_core_id = cpup->core_id;
+               if (cpup->core_id < min_core_id)
+                       min_core_id = cpup->core_id;
+
                cpup++;
        }
 
-       for (idx = 0; idx <  phba->cfg_hdw_queue; idx++) {
-               cpup = &phba->sli4_hba.cpu_map[idx];
-               cpup->irq = pci_irq_vector(phba->pcidev, idx);
+       /*
+        * If the number of IRQ vectors equals the number of present CPUs,
+        * the mapping is a simple 1:1 assignment of CPU to EQ, hardware
+        * queue, and MSI-X vector.
+        * This is the desired path when NVME is enabled.
+        */
+       if (vectors == phba->sli4_hba.num_present_cpu) {
+               cpup = phba->sli4_hba.cpu_map;
+               for (idx = 0; idx < vectors; idx++) {
+                       cpup->eq = idx;
+                       cpup->hdwq = idx;
+                       cpup->irq = pci_irq_vector(phba->pcidev, idx);
+
+                       /* Now affinitize to the selected CPU */
+                       irq_set_affinity_hint(
+                               pci_irq_vector(phba->pcidev, idx),
+                               get_cpu_mask(idx));
+                       irq_set_status_flags(cpup->irq, IRQ_NO_BALANCING);
 
-               /* For now assume vector N maps to CPU N */
-               irq_set_affinity_hint(cpup->irq, get_cpu_mask(idx));
-               cpup->hdwq = idx;
+                       lpfc_printf_log(phba, KERN_INFO, LOG_INIT,
+                                       "3336 Set Affinity: CPU %d "
+                                       "EQ %d irq %d\n",
+                                       idx, cpup->eq,
+                                       pci_irq_vector(phba->pcidev, idx));
+                       cpup++;
+               }
+               return;
+       }
 
-               lpfc_printf_log(phba, KERN_INFO, LOG_INIT,
-                       "3336 Set Affinity: CPU %d "
-                       "hdwq %d irq %d\n",
-                       cpu, cpup->hdwq, cpup->irq);
+       idx = 0;
+       isr_avail = 1;
+       eqidx = 0;
+       hdwqidx = 0;
+
+       /* The mapping is more involved in this case. Hardware Queues and
+        * EQs are assigned in a "ping pong" fashion, alternating between
+        * the available phys_ids (and core_ids) until every present CPU
+        * is mapped. Once the IRQ vectors are exhausted, the remaining
+        * CPUs are assigned a hardware queue only.
+        */
+       while (idx < phba->sli4_hba.num_present_cpu) {
+               for (i = min_core_id; i <= max_core_id; i++) {
+                       for (j = min_phys_id; j <= max_phys_id; j++) {
+                               cpup = lpfc_find_cpu_map(phba, j, i, hdwqidx,
+                                                        eqidx, isr_avail);
+                               if (!cpup)
+                                       continue;
+                               idx++;
+                               hdwqidx++;
+                               if (hdwqidx >= phba->cfg_hdw_queue)
+                                       hdwqidx = 0;
+                               eqidx++;
+                               if (eqidx >= phba->cfg_irq_chann) {
+                                       isr_avail = 0;
+                                       eqidx = 0;
+                               }
+                       }
+               }
        }
        return;
 }
 
-
 /**
  * lpfc_sli4_enable_msix - Enable MSI-X interrupt mode to SLI-4 device
  * @phba: pointer to lpfc hba data structure.
        char *name;
 
        /* Set up MSI-X multi-message vectors */
-       vectors = phba->cfg_hdw_queue;
+       vectors = phba->cfg_irq_chann;
 
        rc = pci_alloc_irq_vectors(phba->pcidev,
                                (phba->nvmet_support) ? 1 : 2,
 
                phba->sli4_hba.hba_eq_hdl[index].idx = index;
                phba->sli4_hba.hba_eq_hdl[index].phba = phba;
-               atomic_set(&phba->sli4_hba.hba_eq_hdl[index].hba_eq_in_use, 1);
                rc = request_irq(pci_irq_vector(phba->pcidev, index),
                         &lpfc_sli4_hba_intr_handler, 0,
                         name,
                }
        }
 
-       if (vectors != phba->cfg_hdw_queue) {
+       if (vectors != phba->cfg_irq_chann) {
                lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
                                "3238 Reducing IO channels to match number of "
                                "MSI-X vectors, requested %d got %d\n",
-                               phba->cfg_hdw_queue, vectors);
-               if (phba->cfg_hdw_queue > vectors)
-                       phba->cfg_hdw_queue = vectors;
+                               phba->cfg_irq_chann, vectors);
+               if (phba->cfg_irq_chann > vectors)
+                       phba->cfg_irq_chann = vectors;
                if (phba->cfg_nvmet_mrq > vectors)
                        phba->cfg_nvmet_mrq = vectors;
        }
-       lpfc_cpu_affinity_check(phba);
 
        return rc;
 
                return rc;
        }
 
-       for (index = 0; index < phba->cfg_hdw_queue; index++) {
+       for (index = 0; index < phba->cfg_irq_chann; index++) {
                phba->sli4_hba.hba_eq_hdl[index].idx = index;
                phba->sli4_hba.hba_eq_hdl[index].phba = phba;
        }
                        phba->intr_type = INTx;
                        intr_mode = 0;
 
-                       for (idx = 0; idx < phba->cfg_hdw_queue; idx++) {
+                       for (idx = 0; idx < phba->cfg_irq_chann; idx++) {
                                eqhdl = &phba->sli4_hba.hba_eq_hdl[idx];
                                eqhdl->idx = idx;
                                eqhdl->phba = phba;
-                               atomic_set(&eqhdl->hba_eq_in_use, 1);
                        }
                }
        }
                int index;
 
                /* Free up MSI-X multi-message vectors */
-               for (index = 0; index < phba->cfg_hdw_queue; index++) {
+               for (index = 0; index < phba->cfg_irq_chann; index++) {
                        irq_set_affinity_hint(
                                pci_irq_vector(phba->pcidev, index),
                                NULL);
        }
        /* Default to single EQ for non-MSI-X */
        if (phba->intr_type != MSIX) {
-               phba->cfg_hdw_queue = 1;
+               phba->cfg_irq_chann = 1;
                if (phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME) {
                        if (phba->nvmet_support)
                                phba->cfg_nvmet_mrq = 1;
                }
        }
+       lpfc_cpu_affinity_check(phba, phba->cfg_irq_chann);
 
        /* Create SCSI host to the physical port */
        error = lpfc_create_shost(phba);
 
        if (qidx) {
                str = "IO ";  /* IO queue */
                qhandle->index = ((qidx - 1) %
-                       vport->phba->cfg_hdw_queue);
+                       lpfc_nvme_template.max_hw_queues);
        } else {
                str = "ADM";  /* Admin queue */
                qhandle->index = qidx;
                }
        }
 
+       /* Look up Hardware Queue index based on fcp_io_sched module parameter */
        if (phba->cfg_fcp_io_sched == LPFC_FCP_SCHED_BY_HDWQ) {
                idx = lpfc_queue_info->index;
        } else {
                cpu = smp_processor_id();
-               if (cpu < phba->cfg_hdw_queue)
-                       idx = cpu;
-               else
-                       idx = cpu % phba->cfg_hdw_queue;
+               idx = phba->sli4_hba.cpu_map[cpu].hdwq;
        }
 
        lpfc_ncmd = lpfc_get_nvme_buf(phba, ndlp, idx, expedite);
         * allocate + 3, one for cmd, one for rsp and one for this alignment
         */
        lpfc_nvme_template.max_sgl_segments = phba->cfg_nvme_seg_cnt + 1;
-       lpfc_nvme_template.max_hw_queues = phba->cfg_hdw_queue;
+
+       /* Advertise how many hw queues we support based on fcp_io_sched */
+       if (phba->cfg_fcp_io_sched == LPFC_FCP_SCHED_BY_HDWQ)
+               lpfc_nvme_template.max_hw_queues = phba->cfg_hdw_queue;
+       else
+               lpfc_nvme_template.max_hw_queues =
+                       phba->sli4_hba.num_present_cpu;
 
        /* localport is allocated from the stack, but the registration
         * call allocates heap memory as well as the private area.
         * WQEs have been removed from the txcmplqs.
         */
        for (i = 0; i < phba->cfg_hdw_queue; i++) {
+               if (!phba->sli4_hba.hdwq[i].nvme_wq)
+                       continue;
                pring = phba->sli4_hba.hdwq[i].nvme_wq->pring;
 
                if (!pring)
 
                tag = blk_mq_unique_tag(cmnd->request);
                idx = blk_mq_unique_tag_to_hwq(tag);
        } else {
-               if (cpu < phba->cfg_hdw_queue)
-                       idx = cpu;
-               else
-                       idx = cpu % phba->cfg_hdw_queue;
+               idx = phba->sli4_hba.cpu_map[cpu].hdwq;
        }
 
        lpfc_cmd = lpfc_get_io_buf(phba, ndlp, idx,
        struct Scsi_Host *shost;
        int idx;
        uint32_t logit = LOG_FCP;
+#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
+       int cpu;
+#endif
 
        /* Sanity check on return of outstanding command */
        cmd = lpfc_cmd->pCmd;
        if (phba->sli4_hba.hdwq)
                phba->sli4_hba.hdwq[idx].scsi_cstat.io_cmpls++;
 
+#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
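+       /* Count this completion against the current CPU for the debugfs
+        * cpucheck statistics.
+        */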
+       if (phba->cpucheck_on & LPFC_CHECK_SCSI_IO) {
+               cpu = smp_processor_id();
+               if (cpu < LPFC_CHECK_CPU_CNT)
+                       phba->sli4_hba.hdwq[idx].cpucheck_cmpl_io[cpu]++;
+       }
+#endif
        shost = cmd->device->host;
 
        lpfc_cmd->result = (pIocbOut->iocb.un.ulpWord[4] & IOERR_PARAM_MASK);
        struct lpfc_io_buf *lpfc_cmd;
        struct fc_rport *rport = starget_to_rport(scsi_target(cmnd->device));
        int err, idx;
+#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
+       int cpu;
+#endif
 
        rdata = lpfc_rport_data_from_scsi_device(cmnd->device);
 
 
        lpfc_scsi_prep_cmnd(vport, lpfc_cmd, ndlp);
 
+#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
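+       /* Count this submission against the current CPU for the debugfs
+        * cpucheck statistics.
+        */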
+       if (phba->cpucheck_on & LPFC_CHECK_SCSI_IO) {
+               cpu = smp_processor_id();
+               if (cpu < LPFC_CHECK_CPU_CNT) {
+                       struct lpfc_sli4_hdw_queue *hdwq =
+                                       &phba->sli4_hba.hdwq[lpfc_cmd->hdwq_no];
+                       hdwq->cpucheck_xmt_io[cpu]++;
+               }
+       }
+#endif
        err = lpfc_sli_issue_iocb(phba, LPFC_FCP_RING,
                                  &lpfc_cmd->cur_iocbq, SLI_IOCB_RET_IOCB);
        if (err) {
 
                                                LPFC_QUEUE_REARM);
                }
 
-               for (qidx = 0; qidx < phba->cfg_hdw_queue; qidx++)
+               for (qidx = 0; qidx < phba->cfg_irq_chann; qidx++)
                        sli4_hba->sli4_eq_release(qp[qidx].hba_eq,
                                                LPFC_QUEUE_REARM);
        }
        /* Find the eq associated with the mcq */
 
        if (sli4_hba->hdwq)
-               for (eqidx = 0; eqidx < phba->cfg_hdw_queue; eqidx++)
+               for (eqidx = 0; eqidx < phba->cfg_irq_chann; eqidx++)
                        if (sli4_hba->hdwq[eqidx].hba_eq->queue_id ==
                            sli4_hba->mbx_cq->assoc_qid) {
                                fpeq = sli4_hba->hdwq[eqidx].hba_eq;
 lpfc_sli_issue_iocb(struct lpfc_hba *phba, uint32_t ring_number,
                    struct lpfc_iocbq *piocb, uint32_t flag)
 {
-       struct lpfc_hba_eq_hdl *hba_eq_hdl;
        struct lpfc_sli_ring *pring;
-       struct lpfc_queue *fpeq;
-       struct lpfc_eqe *eqe;
        unsigned long iflags;
-       int rc, idx;
+       int rc;
 
        if (phba->sli_rev == LPFC_SLI_REV4) {
                pring = lpfc_sli4_calc_ring(phba, piocb);
                spin_lock_irqsave(&pring->ring_lock, iflags);
                rc = __lpfc_sli_issue_iocb(phba, ring_number, piocb, flag);
                spin_unlock_irqrestore(&pring->ring_lock, iflags);
-
-               if (lpfc_fcp_look_ahead && (piocb->iocb_flag &  LPFC_IO_FCP)) {
-                       idx = piocb->hba_wqidx;
-                       hba_eq_hdl = &phba->sli4_hba.hba_eq_hdl[idx];
-
-                       if (atomic_dec_and_test(&hba_eq_hdl->hba_eq_in_use)) {
-
-                               /* Get associated EQ with this index */
-                               fpeq = phba->sli4_hba.hdwq[idx].hba_eq;
-
-                               /* Turn off interrupts from this EQ */
-                               phba->sli4_hba.sli4_eq_clr_intr(fpeq);
-
-                               /*
-                                * Process all the events on FCP EQ
-                                */
-                               while ((eqe = lpfc_sli4_eq_get(fpeq))) {
-                                       lpfc_sli4_hba_handle_eqe(phba,
-                                               eqe, idx);
-                                       fpeq->EQ_processed++;
-                               }
-
-                               /* Always clear and re-arm the EQ */
-                               phba->sli4_hba.sli4_eq_release(fpeq,
-                                       LPFC_QUEUE_REARM);
-                       }
-                       atomic_inc(&hba_eq_hdl->hba_eq_in_use);
-               }
        } else {
                /* For now, SLI2/3 will still use hbalock */
                spin_lock_irqsave(&phba->hbalock, iflags);
        /* Save EQ associated with this CQ */
        cq->assoc_qp = speq;
 
-       if (!queue_work(phba->wq, &cq->spwork))
+       if (!queue_work_on(cq->chann, phba->wq, &cq->spwork))
                lpfc_printf_log(phba, KERN_ERR, LOG_SLI,
                                "0390 Cannot schedule soft IRQ "
                                "for CQ eqcqid=%d, cqid=%d on CPU %d\n",
        /* Get the reference to the corresponding CQ */
        cqid = bf_get_le32(lpfc_eqe_resource_id, eqe);
 
-       /* First check for NVME/SCSI completion */
-       if ((phba->cfg_enable_fc4_type & LPFC_ENABLE_NVME) &&
-           (cqid == phba->sli4_hba.hdwq[qidx].nvme_cq_map)) {
-               /* Process NVME / NVMET command completion */
-               cq = phba->sli4_hba.hdwq[qidx].nvme_cq;
-               goto  process_cq;
-       }
-
-       if (cqid == phba->sli4_hba.hdwq[qidx].fcp_cq_map) {
-               /* Process FCP command completion */
-               cq = phba->sli4_hba.hdwq[qidx].fcp_cq;
-               goto  process_cq;
+       /* Use the fast lookup method first */
+       if (cqid <= phba->sli4_hba.cq_max) {
+               cq = phba->sli4_hba.cq_lookup[cqid];
+               if (cq)
+                       goto  work_cq;
        }
 
        /* Next check for NVMET completion */
                return;
        }
 
-       /* Save EQ associated with this CQ */
-       cq->assoc_qp = phba->sli4_hba.hdwq[qidx].hba_eq;
-
+work_cq:
        if (!queue_work_on(cq->chann, phba->wq, &cq->irqwork))
                lpfc_printf_log(phba, KERN_ERR, LOG_SLI,
                                "0363 Cannot schedule soft IRQ "
        if (unlikely(!fpeq))
                return IRQ_NONE;
 
-       if (lpfc_fcp_look_ahead) {
-               if (atomic_dec_and_test(&hba_eq_hdl->hba_eq_in_use))
-                       phba->sli4_hba.sli4_eq_clr_intr(fpeq);
-               else {
-                       atomic_inc(&hba_eq_hdl->hba_eq_in_use);
-                       return IRQ_NONE;
-               }
-       }
-
        /* Check device state for handling interrupt */
        if (unlikely(lpfc_intr_state_check(phba))) {
                /* Check again for link_state with lock held */
                        /* Flush, clear interrupt, and rearm the EQ */
                        lpfc_sli4_eq_flush(phba, fpeq);
                spin_unlock_irqrestore(&phba->hbalock, iflag);
-               if (lpfc_fcp_look_ahead)
-                       atomic_inc(&hba_eq_hdl->hba_eq_in_use);
                return IRQ_NONE;
        }
 
 
        if (unlikely(ecount == 0)) {
                fpeq->EQ_no_entry++;
-
-               if (lpfc_fcp_look_ahead) {
-                       atomic_inc(&hba_eq_hdl->hba_eq_in_use);
-                       return IRQ_NONE;
-               }
-
                if (phba->intr_type == MSIX)
                        /* MSI-X treated interrupt served as no EQ share INT */
                        lpfc_printf_log(phba, KERN_WARNING, LOG_SLI,
                        return IRQ_NONE;
        }
 
-       if (lpfc_fcp_look_ahead)
-               atomic_inc(&hba_eq_hdl->hba_eq_in_use);
-
        return IRQ_HANDLED;
 } /* lpfc_sli4_fp_intr_handler */
 
        /*
         * Invoke fast-path host attention interrupt handling as appropriate.
         */
-       for (qidx = 0; qidx < phba->cfg_hdw_queue; qidx++) {
+       for (qidx = 0; qidx < phba->cfg_irq_chann; qidx++) {
                hba_irq_rc = lpfc_sli4_hba_intr_handler(irq,
                                        &phba->sli4_hba.hba_eq_hdl[qidx]);
                if (hba_irq_rc == IRQ_HANDLED)
        union lpfc_sli4_cfg_shdr *shdr;
        uint16_t dmult;
 
-       if (startq >= phba->cfg_hdw_queue)
+       if (startq >= phba->cfg_irq_chann)
                return 0;
 
        mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL);
        eq_delay = &mbox->u.mqe.un.eq_delay;
 
        /* Calculate delay multiplier from maximum interrupts per second */
-       result = imax / phba->cfg_hdw_queue;
+       result = imax / phba->cfg_irq_chann;
        if (result > LPFC_DMULT_CONST || result == 0)
                dmult = 0;
        else
                dmult = LPFC_DMULT_MAX;
 
        cnt = 0;
-       for (qidx = startq; qidx < phba->cfg_hdw_queue; qidx++) {
+       for (qidx = startq; qidx < phba->cfg_irq_chann; qidx++) {
                eq = phba->sli4_hba.hdwq[qidx].hba_eq;
                if (!eq)
                        continue;
                        val =  phba->cfg_fcp_imax;
                        if (val) {
                                /* First, interrupts per sec per EQ */
-                               val = phba->cfg_fcp_imax / phba->cfg_hdw_queue;
+                               val = phba->cfg_fcp_imax / phba->cfg_irq_chann;
 
                                /* us delay between each interrupt */
                                val = LPFC_SEC_TO_USEC / val;
        cq->subtype = subtype;
        cq->queue_id = bf_get(lpfc_mbx_cq_create_q_id, &cq_create->u.response);
        cq->assoc_qid = eq->queue_id;
+       cq->assoc_qp = eq;
        cq->host_index = 0;
        cq->hba_index = 0;
        cq->entry_repost = LPFC_CQ_REPOST;
 
+       if (cq->queue_id > phba->sli4_hba.cq_max)
+               phba->sli4_hba.cq_max = cq->queue_id;
 out:
        mempool_free(mbox, phba->mbox_mem_pool);
        return status;
                cq->type = type;
                cq->subtype = subtype;
                cq->assoc_qid = eq->queue_id;
+               cq->assoc_qp = eq;
                cq->host_index = 0;
                cq->hba_index = 0;
                cq->entry_repost = LPFC_CQ_REPOST;
        for (idx = 0; idx < numcq; idx++) {
                cq = cqp[idx];
                cq->queue_id = rc + idx;
+               if (cq->queue_id > phba->sli4_hba.cq_max)
+                       phba->sli4_hba.cq_max = cq->queue_id;
        }
 
 out:
        /* NVME_LS and NVME_LS ABTS requests. */
        if (pwqe->iocb_flag & LPFC_IO_NVME_LS) {
                pring =  phba->sli4_hba.nvmels_wq->pring;
-               spin_lock_irqsave(&pring->ring_lock, iflags);
+               lpfc_qp_spin_lock_irqsave(&pring->ring_lock, iflags,
+                                         qp, wq_access);
                sglq = __lpfc_sli_get_els_sglq(phba, pwqe);
                if (!sglq) {
                        spin_unlock_irqrestore(&pring->ring_lock, iflags);
 
                bf_set(wqe_cqid, &wqe->generic.wqe_com, qp->nvme_cq_map);
 
-               spin_lock_irqsave(&pring->ring_lock, iflags);
+               lpfc_qp_spin_lock_irqsave(&pring->ring_lock, iflags,
+                                         qp, wq_access);
                ret = lpfc_sli4_wq_put(wq, wqe);
                if (ret) {
                        spin_unlock_irqrestore(&pring->ring_lock, iflags);
                       pwqe->sli4_xritag);
                bf_set(wqe_cqid, &wqe->generic.wqe_com, qp->nvme_cq_map);
 
-               spin_lock_irqsave(&pring->ring_lock, iflags);
+               lpfc_qp_spin_lock_irqsave(&pring->ring_lock, iflags,
+                                         qp, wq_access);
                ret = lpfc_sli4_wq_put(wq, wqe);
                if (ret) {
                        spin_unlock_irqrestore(&pring->ring_lock, iflags);
 {
        struct lpfc_pbl_pool *pbl_pool;
        struct lpfc_pvt_pool *pvt_pool;
+       struct lpfc_sli4_hdw_queue *qp;
        struct lpfc_io_buf *lpfc_ncmd;
        struct lpfc_io_buf *lpfc_ncmd_next;
        unsigned long iflag;
        struct list_head tmp_list;
        u32 tmp_count;
 
-       pbl_pool = &phba->sli4_hba.hdwq[hwqid].p_multixri_pool->pbl_pool;
-       pvt_pool = &phba->sli4_hba.hdwq[hwqid].p_multixri_pool->pvt_pool;
+       qp = &phba->sli4_hba.hdwq[hwqid];
+       pbl_pool = &qp->p_multixri_pool->pbl_pool;
+       pvt_pool = &qp->p_multixri_pool->pvt_pool;
        tmp_count = 0;
 
-       spin_lock_irqsave(&pbl_pool->lock, iflag);
-       spin_lock(&pvt_pool->lock);
+       lpfc_qp_spin_lock_irqsave(&pbl_pool->lock, iflag, qp, mv_to_pub_pool);
+       lpfc_qp_spin_lock(&pvt_pool->lock, qp, mv_from_pvt_pool);
 
        if (pvt_pool->count > pvt_pool->low_watermark) {
                /* Step 1: move (all - low_watermark) from pvt_pool
  *   false - if the specified pbl_pool is empty or locked by someone else
  **/
 static bool
-_lpfc_move_xri_pbl_to_pvt(struct lpfc_hba *phba, struct lpfc_pbl_pool *pbl_pool,
+_lpfc_move_xri_pbl_to_pvt(struct lpfc_hba *phba, struct lpfc_sli4_hdw_queue *qp,
+                         struct lpfc_pbl_pool *pbl_pool,
                          struct lpfc_pvt_pool *pvt_pool, u32 count)
 {
        struct lpfc_io_buf *lpfc_ncmd;
        if (ret) {
                if (pbl_pool->count) {
                        /* Move a batch of XRIs from public to private pool */
-                       spin_lock(&pvt_pool->lock);
+                       lpfc_qp_spin_lock(&pvt_pool->lock, qp, mv_to_pvt_pool);
                        list_for_each_entry_safe(lpfc_ncmd,
                                                 lpfc_ncmd_next,
                                                 &pbl_pool->list,
        struct lpfc_multixri_pool *next_multixri_pool;
        struct lpfc_pvt_pool *pvt_pool;
        struct lpfc_pbl_pool *pbl_pool;
+       struct lpfc_sli4_hdw_queue *qp;
        u32 next_hwqid;
        u32 hwq_count;
        int ret;
 
-       multixri_pool = phba->sli4_hba.hdwq[hwqid].p_multixri_pool;
+       qp = &phba->sli4_hba.hdwq[hwqid];
+       multixri_pool = qp->p_multixri_pool;
        pvt_pool = &multixri_pool->pvt_pool;
        pbl_pool = &multixri_pool->pbl_pool;
 
        /* Check if local pbl_pool is available */
-       ret = _lpfc_move_xri_pbl_to_pvt(phba, pbl_pool, pvt_pool, count);
+       ret = _lpfc_move_xri_pbl_to_pvt(phba, qp, pbl_pool, pvt_pool, count);
        if (ret) {
 #ifdef LPFC_MXP_STAT
                multixri_pool->local_pbl_hit_count++;
 
                /* Check if the public free xri pool is available */
                ret = _lpfc_move_xri_pbl_to_pvt(
-                       phba, pbl_pool, pvt_pool, count);
+                       phba, qp, pbl_pool, pvt_pool, count);
 
                /* Exit while-loop if success or all hwqid are checked */
        } while (!ret && next_hwqid != multixri_pool->rrb_next_hwqid);
                if ((pvt_pool->count < pvt_pool->low_watermark) ||
                    (xri_owned < xri_limit &&
                     pvt_pool->count < pvt_pool->high_watermark)) {
-                       spin_lock_irqsave(&pvt_pool->lock, iflag);
+                       lpfc_qp_spin_lock_irqsave(&pvt_pool->lock, iflag,
+                                                 qp, free_pvt_pool);
                        list_add_tail(&lpfc_ncmd->list,
                                      &pvt_pool->list);
                        pvt_pool->count++;
                        spin_unlock_irqrestore(&pvt_pool->lock, iflag);
                } else {
-                       spin_lock_irqsave(&pbl_pool->lock, iflag);
+                       lpfc_qp_spin_lock_irqsave(&pbl_pool->lock, iflag,
+                                                 qp, free_pub_pool);
                        list_add_tail(&lpfc_ncmd->list,
                                      &pbl_pool->list);
                        pbl_pool->count++;
                        spin_unlock_irqrestore(&pbl_pool->lock, iflag);
                }
        } else {
-               spin_lock_irqsave(&qp->io_buf_list_put_lock, iflag);
+               lpfc_qp_spin_lock_irqsave(&qp->io_buf_list_put_lock, iflag,
+                                         qp, free_xri);
                list_add_tail(&lpfc_ncmd->list,
                              &qp->lpfc_io_buf_list_put);
                qp->put_io_bufs++;
  **/
 static struct lpfc_io_buf *
 lpfc_get_io_buf_from_private_pool(struct lpfc_hba *phba,
+                                 struct lpfc_sli4_hdw_queue *qp,
                                  struct lpfc_pvt_pool *pvt_pool,
                                  struct lpfc_nodelist *ndlp)
 {
        struct lpfc_io_buf *lpfc_ncmd_next;
        unsigned long iflag;
 
-       spin_lock_irqsave(&pvt_pool->lock, iflag);
+       lpfc_qp_spin_lock_irqsave(&pvt_pool->lock, iflag, qp, alloc_pvt_pool);
        list_for_each_entry_safe(lpfc_ncmd, lpfc_ncmd_next,
                                 &pvt_pool->list, list) {
                if (lpfc_test_rrq_active(
                lpfc_move_xri_pbl_to_pvt(phba, hwqid, XRI_BATCH);
 
        /* Get one XRI from private free xri pool */
-       lpfc_ncmd = lpfc_get_io_buf_from_private_pool(phba, pvt_pool, ndlp);
+       lpfc_ncmd = lpfc_get_io_buf_from_private_pool(phba, qp, pvt_pool, ndlp);
 
        if (lpfc_ncmd) {
                lpfc_ncmd->hdwq = qp;
                lpfc_cmd = lpfc_get_io_buf_from_multixri_pools(
                        phba, ndlp, hwqid, expedite);
        else {
-               spin_lock_irqsave(&qp->io_buf_list_get_lock, iflag);
+               lpfc_qp_spin_lock_irqsave(&qp->io_buf_list_get_lock, iflag,
+                                         qp, alloc_xri_get);
                if (qp->get_io_bufs > LPFC_NVME_EXPEDITE_XRICNT || expedite)
                        lpfc_cmd = lpfc_io_buf(phba, ndlp, hwqid);
                if (!lpfc_cmd) {
-                       spin_lock(&qp->io_buf_list_put_lock);
+                       lpfc_qp_spin_lock(&qp->io_buf_list_put_lock,
+                                         qp, alloc_xri_put);
                        list_splice(&qp->lpfc_io_buf_list_put,
                                    &qp->lpfc_io_buf_list_get);
                        qp->get_io_bufs += qp->put_io_bufs;
 
 
 /* Multi-queue arrangement for FCP EQ/CQ/WQ tuples */
 #define LPFC_HBA_HDWQ_MIN      0
-#define LPFC_HBA_HDWQ_MAX      64
+#define LPFC_HBA_HDWQ_MAX      128
 #define LPFC_HBA_HDWQ_DEF      0
 
 /* Common buffer size to accommodate SCSI and NVME IO buffers */
        uint32_t assoc_qid;     /* Queue ID associated with, for CQ/WQ/MQ */
        uint32_t host_index;    /* The host's index for putting or getting */
        uint32_t hba_index;     /* The last known hba index for get or put */
+       uint32_t q_mode;
 
        struct lpfc_sli_ring *pring; /* ptr to io ring associated with q */
        struct lpfc_rqb *rqbp;  /* ptr to RQ buffers */
 
-       uint32_t q_mode;
        uint16_t page_count;    /* Number of pages allocated for this queue */
        uint16_t page_size;     /* size of page allocated for this queue */
 #define LPFC_EXPANDED_PAGE_SIZE        16384
 #define LPFC_DEFAULT_PAGE_SIZE 4096
-       uint16_t chann;         /* IO channel this queue is associated with */
+       uint16_t chann;         /* For WQ/CQ: the Hardware Queue association */
+                               /* For EQ: the CPU affinity */
+#define LPFC_FIND_BY_EQ                0
+#define LPFC_FIND_BY_HDWQ      1
        uint8_t db_format;
 #define LPFC_DB_RING_FORMAT    0x01
 #define LPFC_DB_LIST_FORMAT    0x02
        uint32_t idx;
        char handler_name[LPFC_SLI4_HANDLER_NAME_SZ];
        struct lpfc_hba *phba;
-       atomic_t hba_eq_in_use;
-       struct cpumask *cpumask;
-       /* CPU affinitsed to or 0xffffffff if multiple */
-       uint32_t cpu;
-#define LPFC_MULTI_CPU_AFFINITY 0xffffffff
 };
 
 /*BB Credit recovery value*/
        uint16_t        phys_id;
        uint16_t        core_id;
        uint16_t        irq;
+       uint16_t        eq;     /* EQ (IRQ vector) index for this CPU */
        uint16_t        hdwq;
+       uint16_t        hyper;  /* set when another CPU shares this core (SMT) */
 };
 #define LPFC_VECTOR_MAP_EMPTY  0xffff
 
        u32 io_cmpls;
 };
 
+#ifdef LPFC_HDWQ_LOCK_STAT
+struct lpfc_lock_stat {
+       uint32_t alloc_xri_get;
+       uint32_t alloc_xri_put;
+       uint32_t free_xri;
+       uint32_t wq_access;
+       uint32_t alloc_pvt_pool;
+       uint32_t mv_from_pvt_pool;
+       uint32_t mv_to_pub_pool;
+       uint32_t mv_to_pvt_pool;
+       uint32_t free_pub_pool;
+       uint32_t free_pvt_pool;
+};
+#endif
+
 /* SLI4 HBA data structure entries */
 struct lpfc_sli4_hdw_queue {
        /* Pointers to the constructed SLI4 queues */
        /* FC-4 Stats counters */
        struct lpfc_fc4_ctrl_stat nvme_cstat;
        struct lpfc_fc4_ctrl_stat scsi_cstat;
+#ifdef LPFC_HDWQ_LOCK_STAT
+       struct lpfc_lock_stat lock_conflict;
+#endif
 
 #ifdef CONFIG_SCSI_LPFC_DEBUG_FS
 #define LPFC_CHECK_CPU_CNT    128
 #endif
 };
 
+#ifdef LPFC_HDWQ_LOCK_STAT
+/* compile time trylock stats */
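+/*
+ * Each wrapper spins on a trylock and increments the named lock_conflict
+ * counter on the hardware queue once per contended acquisition.
+ */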
+#define lpfc_qp_spin_lock_irqsave(lock, flag, qp, lstat) \
+       { \
+       int only_once = 1; \
+       while (spin_trylock_irqsave(lock, flag) == 0) { \
+               if (only_once) { \
+                       only_once = 0; \
+                       qp->lock_conflict.lstat++; \
+               } \
+       } \
+       }
+#define lpfc_qp_spin_lock(lock, qp, lstat) \
+       { \
+       int only_once = 1; \
+       while (spin_trylock(lock) == 0) { \
+               if (only_once) { \
+                       only_once = 0; \
+                       qp->lock_conflict.lstat++; \
+               } \
+       } \
+       }
+#else
+#define lpfc_qp_spin_lock_irqsave(lock, flag, qp, lstat) \
+       spin_lock_irqsave(lock, flag)
+#define lpfc_qp_spin_lock(lock, qp, lstat) spin_lock(lock)
+#endif
+
 struct lpfc_sli4_hba {
        void __iomem *conf_regs_memmap_p; /* Kernel memory mapped address for
                                           * config space registers
        uint16_t nvmet_xri_cnt;
        uint16_t nvmet_io_wait_cnt;
        uint16_t nvmet_io_wait_total;
+       uint16_t cq_max;                /* highest CQ queue_id created */
+       struct lpfc_queue **cq_lookup;  /* cqid-indexed CQ lookup table */
        struct list_head lpfc_els_sgl_list;
        struct list_head lpfc_abts_els_sgl_list;
        spinlock_t abts_scsi_buf_list_lock; /* list of aborted SCSI IOs */