]> www.infradead.org Git - qemu-nvme.git/commitdiff
hw/nvme: use KVM irqfd when available
author Jinhao Fan <fanjinhao21s@ict.ac.cn>
Sat, 27 Aug 2022 09:12:56 +0000 (17:12 +0800)
committer Klaus Jensen <k.jensen@samsung.com>
Thu, 8 Jun 2023 19:15:39 +0000 (21:15 +0200)
Use KVM's irqfd to send interrupts when possible. This approach is
thread safe. Moreover, it does not have the inter-thread communication
overhead of plain event notifiers, since handler callbacks are called
in the same system call as the irqfd write.

Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
hw/nvme/ctrl.c
hw/nvme/nvme.h
hw/nvme/trace-events

index 5c3e676cbab3084ea9ff6c89fe595cddd4d14dea..1e0ce1f719a52f4d700deedca3bd9564d9a90446 100644 (file)
 #include "qapi/error.h"
 #include "qapi/visitor.h"
 #include "sysemu/sysemu.h"
+#include "sysemu/kvm.h"
 #include "sysemu/block-backend.h"
 #include "sysemu/hostmem.h"
 #include "hw/pci/msix.h"
@@ -1553,8 +1554,115 @@ static void nvme_irq_deassert_notify(EventNotifier *e)
     }
 }
 
+/*
+ * Add a KVM MSI route for @vector of controller @n and remember the
+ * resulting virq in @cq.
+ *
+ * Returns 0 on success; otherwise the negative value returned by
+ * kvm_irqchip_add_msi_route(), in which case nothing is committed and
+ * cq->virq is left untouched.
+ */
+static int nvme_kvm_vector_use(NvmeCtrl *n, NvmeCQueue *cq, uint32_t vector)
+{
+    KVMRouteChange changes = kvm_irqchip_begin_route_changes(kvm_state);
+    int virq = kvm_irqchip_add_msi_route(&changes, vector, &n->parent_obj);
+
+    if (virq >= 0) {
+        kvm_irqchip_commit_route_changes(&changes);
+        cq->virq = virq;
+
+        return 0;
+    }
+
+    return virq;
+}
+
+/*
+ * Called when MSI-X @vector gets unmasked.
+ *
+ * For every completion queue in slots 1..max_ioqpairs that is bound to
+ * @vector: if @msg differs from the message cached in the queue, update and
+ * commit the KVM MSI route and refresh the cache; then register the queue's
+ * assert notifier as an irqfd on the queue's virq.
+ *
+ * Returns 0 on success, or the first negative value returned by a KVM
+ * call. Queues processed before the failure are not rolled back.
+ */
+static int nvme_kvm_vector_unmask(PCIDevice *pci_dev, unsigned vector,
+                                  MSIMessage msg)
+{
+    NvmeCtrl *n = NVME(pci_dev);
+    uint32_t qid;
+
+    trace_pci_nvme_irq_unmask(vector, msg.address, msg.data);
+
+    for (qid = 1; qid <= n->params.max_ioqpairs; qid++) {
+        NvmeCQueue *cq = n->cq[qid];
+        bool msg_changed;
+        int ret;
+
+        /* skip unused slots and queues bound to other vectors */
+        if (!cq || cq->vector != vector) {
+            continue;
+        }
+
+        msg_changed = cq->msg.data != msg.data ||
+                      cq->msg.address != msg.address;
+        if (msg_changed) {
+            ret = kvm_irqchip_update_msi_route(kvm_state, cq->virq, msg,
+                                               pci_dev);
+            if (ret < 0) {
+                return ret;
+            }
+
+            kvm_irqchip_commit_routes(kvm_state);
+
+            /* cache the message only once the route update went through */
+            cq->msg = msg;
+        }
+
+        ret = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
+                                                 &cq->assert_notifier,
+                                                 NULL, cq->virq);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Called when MSI-X @vector gets masked.
+ *
+ * Removes the irqfd registration of the assert notifier of every completion
+ * queue in slots 1..max_ioqpairs that is bound to @vector.
+ */
+static void nvme_kvm_vector_mask(PCIDevice *pci_dev, unsigned vector)
+{
+    NvmeCtrl *n = NVME(pci_dev);
+    uint32_t qid;
+
+    trace_pci_nvme_irq_mask(vector);
+
+    for (qid = 1; qid <= n->params.max_ioqpairs; qid++) {
+        NvmeCQueue *cq = n->cq[qid];
+
+        /* skip unused slots and queues bound to other vectors */
+        if (!cq || cq->vector != vector) {
+            continue;
+        }
+
+        kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state,
+                                              &cq->assert_notifier,
+                                              cq->virq);
+    }
+}
+
+/*
+ * MSI-X poll notifier.
+ *
+ * For every completion queue in slots 1..max_ioqpairs whose vector is
+ * currently masked and lies within [@vector_start, @vector_end], consume a
+ * pending event on the queue's assert notifier and, if there was one, mark
+ * the queue's MSI-X vector as pending.
+ *
+ * NOTE(review): @vector_end is treated as inclusive here; confirm against
+ * the range convention used by the MSI-X core when it calls this notifier.
+ */
+static void nvme_kvm_vector_poll(PCIDevice *pci_dev, unsigned int vector_start,
+                                 unsigned int vector_end)
+{
+    NvmeCtrl *n = NVME(pci_dev);
+
+    trace_pci_nvme_irq_poll(vector_start, vector_end);
+
+    for (uint32_t i = 1; i <= n->params.max_ioqpairs; i++) {
+        NvmeCQueue *cq = n->cq[i];
+
+        if (!cq) {
+            continue;
+        }
+
+        if (!msix_is_masked(pci_dev, cq->vector)) {
+            continue;
+        }
+
+        if (cq->vector >= vector_start && cq->vector <= vector_end) {
+            if (event_notifier_test_and_clear(&cq->assert_notifier)) {
+                /*
+                 * The pending bit is indexed by MSI-X vector, not by
+                 * completion queue id; the two differ whenever a queue is
+                 * created with a vector other than its own index.
+                 */
+                msix_set_pending(pci_dev, cq->vector);
+            }
+        }
+    }
+}
+
+
 static void nvme_init_irq_notifier(NvmeCtrl *n, NvmeCQueue *cq)
 {
+    bool with_irqfd = msix_enabled(&n->parent_obj) &&
+                      kvm_msi_via_irqfd_enabled();
     int ret;
 
     ret = event_notifier_init(&cq->assert_notifier, 0);
@@ -1562,11 +1670,27 @@ static void nvme_init_irq_notifier(NvmeCtrl *n, NvmeCQueue *cq)
         return;
     }
 
-    event_notifier_set_handler(&cq->assert_notifier, nvme_irq_assert_notify);
+    if (with_irqfd) {
+        ret = nvme_kvm_vector_use(n, cq, cq->vector);
+        if (ret < 0) {
+            event_notifier_cleanup(&cq->assert_notifier);
+
+            return;
+        }
+    } else {
+        event_notifier_set_handler(&cq->assert_notifier,
+                                   nvme_irq_assert_notify);
+    }
 
     if (!msix_enabled(&n->parent_obj)) {
         ret = event_notifier_init(&cq->deassert_notifier, 0);
         if (ret < 0) {
+            if (with_irqfd) {
+                kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state,
+                                                      &cq->assert_notifier,
+                                                      cq->virq);
+            }
+
             event_notifier_set_handler(&cq->assert_notifier, NULL);
             event_notifier_cleanup(&cq->assert_notifier);
 
@@ -4637,6 +4761,8 @@ static void nvme_cq_notifier(EventNotifier *e)
     NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
     bool start_sqs = nvme_cq_full(cq);
 
+    trace_pci_nvme_cq_notifier(cq->cqid);
+
     if (!event_notifier_test_and_clear(e)) {
         return;
     }
@@ -4667,6 +4793,8 @@ static void nvme_sq_notifier(EventNotifier *e)
 {
     NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
 
+    trace_pci_nvme_sq_notifier(sq->sqid);
+
     if (!event_notifier_test_and_clear(e)) {
         return;
     }
@@ -5345,6 +5473,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
 {
     PCIDevice *pci = PCI_DEVICE(n);
+    bool with_irqfd = msix_enabled(pci) && kvm_msi_via_irqfd_enabled();
     uint16_t offset = (cq->cqid << 3) + (1 << 2);
 
     n->cq[cq->cqid] = NULL;
@@ -5357,6 +5486,12 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
         event_notifier_cleanup(&cq->notifier);
     }
     if (cq->assert_notifier.initialized) {
+        if (with_irqfd) {
+            kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state,
+                                                  &cq->assert_notifier,
+                                                  cq->virq);
+            kvm_irqchip_release_virq(kvm_state, cq->virq);
+        }
         event_notifier_set_handler(&cq->assert_notifier, NULL);
         event_notifier_cleanup(&cq->assert_notifier);
     }
@@ -7346,6 +7481,9 @@ static int nvme_start_ctrl(NvmeCtrl *n)
     uint32_t page_size = 1 << page_bits;
     NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
 
+    bool with_irqfd = msix_enabled(&n->parent_obj) &&
+                      kvm_msi_via_irqfd_enabled();
+
     if (pci_is_vf(PCI_DEVICE(n)) && !sctrl->scs) {
         trace_pci_nvme_err_startfail_virt_state(le16_to_cpu(sctrl->nvi),
                                                 le16_to_cpu(sctrl->nvq));
@@ -7433,6 +7571,12 @@ static int nvme_start_ctrl(NvmeCtrl *n)
 
     nvme_select_iocs(n);
 
+    if (n->params.irq_eventfd && with_irqfd) {
+        return msix_set_vector_notifiers(PCI_DEVICE(n), nvme_kvm_vector_unmask,
+                                         nvme_kvm_vector_mask,
+                                         nvme_kvm_vector_poll);
+    }
+
     return 0;
 }
 
@@ -8547,6 +8691,7 @@ static void nvme_exit(PCIDevice *pci_dev)
         pcie_sriov_pf_exit(pci_dev);
     }
 
+    msix_unset_vector_notifiers(pci_dev);
     msix_uninit(pci_dev, &n->bar0, &n->bar0);
     memory_region_del_subregion(&n->bar0, &n->iomem);
 }
index 6f7b8c23d66ec2ad56feb795d1ea3c547e6305e7..ae08caa5ff66e96da73bb1e4613e1ba02881012e 100644 (file)
@@ -20,6 +20,7 @@
 
 #include "qemu/uuid.h"
 #include "hw/pci/pci_device.h"
+#include "hw/pci/msi.h"
 #include "hw/block/block.h"
 
 #include "block/nvme.h"
@@ -484,10 +485,12 @@ typedef struct NvmeCQueue {
     uint64_t    dma_addr;
     uint64_t    db_addr;
     uint64_t    ei_addr;
+    int         virq;
     QEMUBH      *do_irq, *post_queued_cqes;
     EventNotifier notifier;
     EventNotifier assert_notifier;
     EventNotifier deassert_notifier;
+    MSIMessage  msg;
     bool        ioeventfd_enabled;
     QTAILQ_HEAD(, NvmeSQueue) sq_list;
     QTAILQ_HEAD(, NvmeRequest) req_list;
index 18f02ea304a97d07d7465418c699a2bea99f2215..bff0bcac1a30d434e5e003dde3d2beef642b496d 100644 (file)
@@ -8,6 +8,9 @@ pci_nvme_irq_assert_notify(uint16_t cqid) "cqid %"PRIu16""
 pci_nvme_irq_deassert_notify(uint16_t cqid) "cqid %"PRIu16""
 pci_nvme_irq_do_assert(uint16_t cqid) "cqid %"PRIu16""
 pci_nvme_irq_do_deassert(uint16_t cqid) "cqid %"PRIu16""
+pci_nvme_irq_mask(uint32_t vector) "IRQ %u gets masked"
+pci_nvme_irq_unmask(uint32_t vector, uint64_t addr, uint32_t data) "IRQ %u gets unmasked, addr=0x%"PRIx64" data=0x%"PRIx32""
+pci_nvme_irq_poll(uint32_t vector_start, uint32_t vector_end) "IRQ poll, start=%"PRIu32" end=%"PRIu32""
 pci_nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64""
 pci_nvme_dbbuf_config(uint64_t dbs_addr, uint64_t eis_addr) "dbs_addr=0x%"PRIx64" eis_addr=0x%"PRIx64""
 pci_nvme_map_addr(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64""
@@ -92,6 +95,8 @@ pci_nvme_enqueue_event_noqueue(int queued) "queued %d"
 pci_nvme_enqueue_event_masked(uint8_t typ) "type 0x%"PRIx8""
 pci_nvme_no_outstanding_aers(void) "ignoring event; no outstanding AERs"
 pci_nvme_enqueue_req_completion(uint16_t cid, uint16_t cqid, uint32_t dw0, uint32_t dw1, uint16_t status) "cid %"PRIu16" cqid %"PRIu16" dw0 0x%"PRIx32" dw1 0x%"PRIx32" status 0x%"PRIx16""
+pci_nvme_cq_notifier(uint16_t cqid) "cqid %"PRIu16""
+pci_nvme_sq_notifier(uint16_t sqid) "sqid %"PRIu16""
 pci_nvme_update_cq_eventidx(uint16_t cqid, uint16_t new_eventidx) "cqid %"PRIu16" new_eventidx %"PRIu16""
 pci_nvme_update_sq_eventidx(uint16_t sqid, uint16_t new_eventidx) "sqid %"PRIu16" new_eventidx %"PRIu16""
 pci_nvme_mmio_read(uint64_t addr, unsigned size) "addr 0x%"PRIx64" size %d"