if (handler == sdma_interrupt) {
                        dd_dev_info(dd, "sdma engine %d cpu %d\n",
                                sde->this_idx, sdma_cpu);
+                       sde->cpu = sdma_cpu;
                        cpumask_set_cpu(sdma_cpu, dd->msix_entries[i].mask);
                        sdma_cpu = cpumask_next(sdma_cpu, def);
                        if (sdma_cpu >= nr_cpu_ids)
 
        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                ppd = dd->pport + pidx;
                if (!ppd->hfi1_wq) {
-                       char wq_name[8]; /* 3 + 2 + 1 + 1 + 1 */
-
-                       snprintf(wq_name, sizeof(wq_name), "hfi%d_%d",
-                                dd->unit, pidx);
                        ppd->hfi1_wq =
-                               create_singlethread_workqueue(wq_name);
+                               alloc_workqueue(
+                                   "hfi%d_%d",
+                                   WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE,
+                                   dd->num_sdma,
+                                   dd->unit, pidx);
                        if (!ppd->hfi1_wq)
                                goto wq_error;
                }
        }
        return 0;
 wq_error:
-       pr_err("create_singlethread_workqueue failed for port %d\n",
-               pidx + 1);
+       pr_err("alloc_workqueue failed for port %d\n", pidx + 1);
        for (pidx = 0; pidx < dd->num_pports; ++pidx) {
                ppd = dd->pport + pidx;
                if (ppd->hfi1_wq) {
 
  * iowait_schedule() - initialize wait structure
  * @wait: wait struct to schedule
  * @wq: workqueue for schedule
+ * @cpu: cpu on which the work is queued
  */
 static inline void iowait_schedule(
        struct iowait *wait,
-       struct workqueue_struct *wq)
+       struct workqueue_struct *wq,
+       int cpu)
 {
-       queue_work(wq, &wait->iowork);
+       queue_work_on(cpu, wq, &wait->iowork); /* queue iowork on @cpu */
 }
 
 /**
 
  */
 void qp_comm_est(struct hfi1_qp *qp);
 
+/**
+ * _hfi1_schedule_send - schedule progress
+ * @qp: the QP
+ *
+ * This schedules qp progress w/o regard to the s_flags; unlike
+ * hfi1_schedule_send() it does not test hfi1_send_ok() first.
+ *
+ * It is only used in the post send, which doesn't hold
+ * the s_lock.
+ *
+ * The QP's s_iowait work is queued on the hfi1_wq workqueue of the
+ * port looked up from qp->port_num.  The target cpu is the one
+ * recorded in the QP's SDMA engine (qp->s_sde->cpu) when the QP has
+ * an engine; otherwise it is the first cpu in the cpumask of the
+ * device's assigned node (dd->assigned_node_id).
+ */
+static inline void _hfi1_schedule_send(struct hfi1_qp *qp)
+{
+       struct hfi1_ibport *ibp =
+               to_iport(qp->ibqp.device, qp->port_num);
+       struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+       struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+
+       iowait_schedule(&qp->s_iowait, ppd->hfi1_wq,
+                       qp->s_sde ?
+                       qp->s_sde->cpu :
+                       cpumask_first(cpumask_of_node(dd->assigned_node_id)));
+}
+
+
+/**
+ * hfi1_schedule_send - schedule progress
+ * @qp: the QP
+ *
+ * This schedules qp progress and caller should hold
+ * the s_lock.
+ *
+ * Work is queued, via _hfi1_schedule_send(), only when
+ * hfi1_send_ok() returns true for @qp; otherwise this is a no-op.
+ */
+static inline void hfi1_schedule_send(struct hfi1_qp *qp)
+{
+       if (hfi1_send_ok(qp))
+               _hfi1_schedule_send(qp);
+}
+
 void hfi1_migrate_qp(struct hfi1_qp *qp);
 
 #endif /* _QP_H */
 
        struct sdma_engine *rval;
 
        if (WARN_ON(vl > 8))
-               return NULL;
+               return &dd->per_sdma[0];
 
        rcu_read_lock();
        m = rcu_dereference(dd->sdma_map);
        if (unlikely(!m)) {
                rcu_read_unlock();
-               return NULL;
+               return &dd->per_sdma[0];
        }
        e = m->map[vl & m->mask];
        rval = e->sde[selector & e->mask];
        rcu_read_unlock();
 
+       rval =  !rval ? &dd->per_sdma[0] : rval;
        trace_hfi1_sdma_engine_select(dd, selector, vl, rval->this_idx);
        return rval;
 }
 }
 
 #define SDE_FMT \
-       "SDE %u STE %s C 0x%llx S 0x%016llx E 0x%llx T(HW) 0x%llx T(SW) 0x%x H(HW) 0x%llx H(SW) 0x%x H(D) 0x%llx DM 0x%llx GL 0x%llx R 0x%llx LIS 0x%llx AHGI 0x%llx TXT %u TXH %u DT %u DH %u FLNE %d DQF %u SLC 0x%llx\n"
+       "SDE %u CPU %d STE %s C 0x%llx S 0x%016llx E 0x%llx T(HW) 0x%llx T(SW) 0x%x H(HW) 0x%llx H(SW) 0x%x H(D) 0x%llx DM 0x%llx GL 0x%llx R 0x%llx LIS 0x%llx AHGI 0x%llx TXT %u TXH %u DT %u DH %u FLNE %d DQF %u SLC 0x%llx\n"
 /**
  * sdma_seqfile_dump_sde() - debugfs dump of sde
  * @s: seq file
        head = sde->descq_head & sde->sdma_mask;
        tail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
        seq_printf(s, SDE_FMT, sde->this_idx,
+               sde->cpu,
                sdma_state_name(sde->state.current_state),
                (unsigned long long)read_sde_csr(sde, SD(CTRL)),
                (unsigned long long)read_sde_csr(sde, SD(STATUS)),
 
        u64 idle_mask;
        u64 progress_mask;
        /* private: */
-       struct workqueue_struct *wq;
-       /* private: */
        volatile __le64      *head_dma; /* DMA'ed by chip */
        /* private: */
        dma_addr_t            head_phys;
        u32 sdma_mask;
        /* private */
        struct sdma_state state;
+       /* private: */
+       int cpu;
        /* private: */
        u8 sdma_shift;
        /* private: */
        struct sdma_engine *sde,
        struct iowait *wait)
 {
-       iowait_schedule(wait, sde->wq);
+       struct hfi1_pportdata *ppd = sde->dd->pport;
+
+       iowait_schedule(wait, ppd->hfi1_wq, sde->cpu);
 }
 
 /* for use by interrupt handling */
 
        return container_of(ibucontext, struct hfi1_ucontext, ibucontext);
 }
 
+static inline void _hfi1_schedule_send(struct hfi1_qp *qp);
+
 /*
  * Translate ib_wr_opcode into ib_wc_opcode.
  */
                nreq++;
        }
 bail:
-       if (nreq && !call_send)
-               hfi1_schedule_send(qp);
        spin_unlock_irqrestore(&qp->s_lock, flags);
+       if (nreq && !call_send)
+               _hfi1_schedule_send(qp);
        if (nreq && call_send)
                hfi1_do_send(&qp->s_iowait.iowork);
        return err;
        vfree(dev->lk_table.table);
 }
 
-/*
- * This must be called with s_lock held.
- */
-void hfi1_schedule_send(struct hfi1_qp *qp)
-{
-       if (hfi1_send_ok(qp)) {
-               struct hfi1_ibport *ibp =
-                       to_iport(qp->ibqp.device, qp->port_num);
-               struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
-
-               iowait_schedule(&qp->s_iowait, ppd->hfi1_wq);
-       }
-}
-
 void hfi1_cnp_rcv(struct hfi1_packet *packet)
 {
        struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data;
 
-/*
- * This must be called with s_lock held.
- */
-void hfi1_schedule_send(struct hfi1_qp *qp);
 void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
                    u32 qp1, u32 qp2, __be16 lid1, __be16 lid2);
 void hfi1_cap_mask_chg(struct hfi1_ibport *ibp);