void hisi_sas_slot_task_free(struct hisi_hba *hisi_hba, struct sas_task *task,
                             struct hisi_sas_slot *slot)
 {
-       struct hisi_sas_dq *dq = &hisi_hba->dq[slot->dlvry_queue];
        unsigned long flags;
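+       /* The slot records its owning device, whose lock guards the slot list */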
+       int device_id = slot->device_id;
+       struct hisi_sas_device *sas_dev = &hisi_hba->devices[device_id];
 
        if (task) {
                struct device *dev = hisi_hba->dev;
                }
        }
 
-
-       spin_lock_irqsave(&dq->lock, flags);
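+       /* The slot list is per-device now, so take the device lock, not dq's */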
+       spin_lock_irqsave(&sas_dev->lock, flags);
        list_del_init(&slot->entry);
-       spin_unlock_irqrestore(&dq->lock, flags);
+       spin_unlock_irqrestore(&sas_dev->lock, flags);
 
        memset(slot, 0, offsetof(struct hisi_sas_slot, buf));
 
                return -ECOMM;
        }
 
-       *dq_pointer = dq = sas_dev->dq;
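+       /*
+        * If the experimental managed-IRQ mode built a reply_map, pick the
+        * delivery queue whose completion interrupt is affine to the current
+        * CPU; otherwise keep using the device's default delivery queue.
+        */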
+       if (hisi_hba->reply_map) {
+               int cpu = raw_smp_processor_id();
+               unsigned int dq_index = hisi_hba->reply_map[cpu];
+
+               *dq_pointer = dq = &hisi_hba->dq[dq_index];
+       } else {
+               *dq_pointer = dq = sas_dev->dq;
+       }
 
        port = to_hisi_sas_port(sas_port);
        if (port && !port->port_attached) {
        }
 
        list_add_tail(&slot->delivery, &dq->list);
-       list_add_tail(&slot->entry, &sas_dev->list);
        spin_unlock_irqrestore(&dq->lock, flags);
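+       /* Track the slot on its device's list, under the per-device lock */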
+       spin_lock_irqsave(&sas_dev->lock, flags);
+       list_add_tail(&slot->entry, &sas_dev->list);
+       spin_unlock_irqrestore(&sas_dev->lock, flags);
 
        dlvry_queue = dq->id;
        dlvry_queue_slot = wr_q_index;
 
+       slot->device_id = sas_dev->device_id;
        slot->n_elem = n_elem;
        slot->n_elem_dif = n_elem_dif;
        slot->dlvry_queue = dlvry_queue;
                        sas_dev->hisi_hba = hisi_hba;
                        sas_dev->sas_device = device;
                        sas_dev->dq = dq;
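+                       /* Protects sas_dev->list, the per-device slot list */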
+                       spin_lock_init(&sas_dev->lock);
                        INIT_LIST_HEAD(&hisi_hba->devices[i].list);
                        break;
                }
        }
        list_add_tail(&slot->delivery, &dq->list);
        spin_unlock_irqrestore(&dq->lock, flags_dq);
+       spin_lock_irqsave(&sas_dev->lock, flags);
+       list_add_tail(&slot->entry, &sas_dev->list);
+       spin_unlock_irqrestore(&sas_dev->lock, flags);
 
        dlvry_queue = dq->id;
        dlvry_queue_slot = wr_q_index;
 
+       slot->device_id = sas_dev->device_id;
        slot->n_elem = n_elem;
        slot->dlvry_queue = dlvry_queue;
        slot->dlvry_queue_slot = dlvry_queue_slot;
        WRITE_ONCE(slot->ready, 1);
        /* send abort command to the chip */
        spin_lock_irqsave(&dq->lock, flags);
-       list_add_tail(&slot->entry, &sas_dev->list);
        hisi_hba->hw->start_delivery(dq);
        spin_unlock_irqrestore(&dq->lock, flags);
 
                                                     abort_flag, tag, dq);
        case HISI_SAS_INT_ABT_DEV:
                for (i = 0; i < hisi_hba->cq_nvecs; i++) {
+                       struct hisi_sas_cq *cq = &hisi_hba->cq[i];
+                       const struct cpumask *mask = cq->pci_irq_mask;
+
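+                       /*
+                        * Skip queues whose MSI affinity mask contains no
+                        * online CPU; their completions cannot be serviced.
+                        */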
+                       if (mask && !cpumask_intersects(cpu_online_mask, mask))
+                               continue;
                        dq = &hisi_hba->dq[i];
                        rc = _hisi_sas_internal_task_abort(hisi_hba, device,
                                                           abort_flag, tag,
 
 #define T10_CHK_APP_TAG_MSK (0xc << T10_CHK_MSK_OFF)
 
 #define BASE_VECTORS_V3_HW  16
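+/* Managed affinity needs at least one queue vector beyond the base vectors */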
+#define MIN_AFFINE_VECTORS_V3_HW  (BASE_VECTORS_V3_HW + 1)
 
 static bool hisi_sas_intr_conv;
 MODULE_PARM_DESC(intr_conv, "interrupt converge enable (0-1)");
 module_param(prot_mask, int, 0);
 MODULE_PARM_DESC(prot_mask, " host protection capabilities mask, def=0x0 ");
 
+static bool auto_affine_msi_experimental;
+module_param(auto_affine_msi_experimental, bool, 0444);
+MODULE_PARM_DESC(auto_affine_msi_experimental, "Enable auto-affinity of MSI IRQs as experimental:\n"
+                "default is off");
+
 static u32 hisi_sas_read32(struct hisi_hba *hisi_hba, u32 off)
 {
        void __iomem *regs = hisi_hba->regs + off;
        return IRQ_HANDLED;
 }
 
+static void setup_reply_map_v3_hw(struct hisi_hba *hisi_hba, int nvecs)
+{
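+       /*
+        * Map each CPU to the completion queue whose MSI vector is affine to
+        * it, so a per-CPU delivery queue can be chosen at command prep time.
+        * If affinity info is missing, fall back to cpu % queue_count.
+        */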
+       const struct cpumask *mask;
+       int queue, cpu;
+
+       for (queue = 0; queue < nvecs; queue++) {
+               struct hisi_sas_cq *cq = &hisi_hba->cq[queue];
+
+               mask = pci_irq_get_affinity(hisi_hba->pci_dev, queue +
+                                           BASE_VECTORS_V3_HW);
+               if (!mask)
+                       goto fallback;
+               cq->pci_irq_mask = mask;
+               for_each_cpu(cpu, mask)
+                       hisi_hba->reply_map[cpu] = queue;
+       }
+       return;
+
+fallback:
+       for_each_possible_cpu(cpu)
+               hisi_hba->reply_map[cpu] = cpu % hisi_hba->queue_count;
+       /* Deliberately leave any cq->pci_irq_mask already assigned in place */
+}
+
 static int interrupt_init_v3_hw(struct hisi_hba *hisi_hba)
 {
        struct device *dev = hisi_hba->dev;
        struct pci_dev *pdev = hisi_hba->pci_dev;
        int vectors, rc;
        int i, k;
-       int max_msi = HISI_SAS_MSI_COUNT_V3_HW;
-
-       vectors = pci_alloc_irq_vectors(hisi_hba->pci_dev, 1,
-                                       max_msi, PCI_IRQ_MSI);
-       if (vectors < max_msi) {
-               dev_err(dev, "could not allocate all msi (%d)\n", vectors);
-               return -ENOENT;
+       int max_msi = HISI_SAS_MSI_COUNT_V3_HW, min_msi;
+
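+       /*
+        * Keep the 16 base (non-queue) vectors out of affinity spreading via
+        * .pre_vectors; the PCI core then spreads the remaining queue vectors
+        * across the CPUs.
+        */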
+       if (auto_affine_msi_experimental) {
+               struct irq_affinity desc = {
+                       .pre_vectors = BASE_VECTORS_V3_HW,
+               };
+
+               min_msi = MIN_AFFINE_VECTORS_V3_HW;
+
+               hisi_hba->reply_map = devm_kcalloc(dev, nr_cpu_ids,
+                                                  sizeof(unsigned int),
+                                                  GFP_KERNEL);
+               if (!hisi_hba->reply_map)
+                       return -ENOMEM;
+               vectors = pci_alloc_irq_vectors_affinity(hisi_hba->pci_dev,
+                                                        min_msi, max_msi,
+                                                        PCI_IRQ_MSI |
+                                                        PCI_IRQ_AFFINITY,
+                                                        &desc);
+               if (vectors < 0)
+                       return -ENOENT;
+               setup_reply_map_v3_hw(hisi_hba, vectors - BASE_VECTORS_V3_HW);
+       } else {
+               min_msi = max_msi;
+               vectors = pci_alloc_irq_vectors(hisi_hba->pci_dev, min_msi,
+                                               max_msi, PCI_IRQ_MSI);
+               if (vectors < 0)
+                       return vectors;
        }
 
        hisi_hba->cq_nvecs = vectors - BASE_VECTORS_V3_HW;