From a1d3cefb049bbd314d773a698b5f8794eab0a71d Mon Sep 17 00:00:00 2001 From: Jinhao Fan Date: Sat, 27 Aug 2022 17:12:55 +0800 Subject: [PATCH] hw/nvme: support irq(de)assertion with eventfd When the new option 'x-irq-eventfd' is turned on, the IO emulation code signals an eventfd when it wants to (de)assert an irq. The main loop eventfd handler does the actual irq (de)assertion. This paves the way for iothread support since QEMU's interrupt emulation is not thread safe. Asserting and deasserting irqs with eventfd has some performance implications. For small queue depth it increases request latency, but for large queue depth it effectively coalesces irqs. Comparison (KIOPS): QD 1 4 16 64 QEMU 38 123 210 329 irq-eventfd 32 106 240 364 Signed-off-by: Jinhao Fan Signed-off-by: Klaus Jensen --- hw/nvme/ctrl.c | 111 +++++++++++++++++++++++++++++++++++++++++-- hw/nvme/nvme.h | 3 ++ hw/nvme/trace-events | 5 ++ 3 files changed, 114 insertions(+), 5 deletions(-) diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c index 45d15224b1..5c3e676cba 100644 --- a/hw/nvme/ctrl.c +++ b/hw/nvme/ctrl.c @@ -660,11 +660,11 @@ static void nvme_irq_check_intx(NvmeCtrl *n) } } -static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq) +static void nvme_irq_do_assert(NvmeCtrl *n, NvmeCQueue *cq) { PCIDevice *pci = PCI_DEVICE(n); - assert(cq->irq_enabled); + trace_pci_nvme_irq_do_assert(cq->cqid); if (msix_enabled(pci)) { trace_pci_nvme_irq_msix(cq->vector); @@ -680,16 +680,32 @@ static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq) nvme_irq_check_intx(n); } -static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq) +static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq) +{ + trace_pci_nvme_irq_assert(cq->cqid); + + assert(cq->irq_enabled); + + if (cq->assert_notifier.initialized) { + event_notifier_set(&cq->assert_notifier); + return; + } + + nvme_irq_do_assert(n, cq); +} + +static void nvme_irq_do_deassert(NvmeCtrl *n, NvmeCQueue *cq) { PCIDevice *pci = PCI_DEVICE(n); + 
trace_pci_nvme_irq_do_deassert(cq->cqid); + assert(cq->irq_enabled); if (msix_enabled(pci)) { return; } - + assert(cq->vector < 32); if (qatomic_dec_fetch(&n->cq_pending) == 0) { @@ -699,6 +715,29 @@ static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq) nvme_irq_check_intx(n); } +static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq) +{ + PCIDevice *pci = PCI_DEVICE(n); + + assert(cq->irq_enabled); + + if (msix_enabled(pci)) { + return; + } + + if (cq->deassert_notifier.initialized) { + /* + * The deassert notifier will only be initialized when MSI-X is NOT + * in use. Therefore no need to worry about extra eventfd syscall + * for pin-based interrupts. + */ + event_notifier_set(&cq->deassert_notifier); + return; + } + + nvme_irq_do_deassert(n, cq); +} + static void nvme_req_clear(NvmeRequest *req) { req->ns = NULL; @@ -1492,6 +1531,55 @@ static void nvme_update_cq_head(NvmeCQueue *cq) trace_pci_nvme_update_cq_head(cq->cqid, cq->head); } +static void nvme_irq_assert_notify(EventNotifier *e) +{ + NvmeCQueue *cq = container_of(e, NvmeCQueue, assert_notifier); + + trace_pci_nvme_irq_assert_notify(cq->cqid); + + if (event_notifier_test_and_clear(e)) { + nvme_irq_do_assert(cq->ctrl, cq); + } +} + +static void nvme_irq_deassert_notify(EventNotifier *e) +{ + NvmeCQueue *cq = container_of(e, NvmeCQueue, deassert_notifier); + + trace_pci_nvme_irq_deassert_notify(cq->cqid); + + if (event_notifier_test_and_clear(e)) { + nvme_irq_do_deassert(cq->ctrl, cq); + } +} + +static void nvme_init_irq_notifier(NvmeCtrl *n, NvmeCQueue *cq) +{ + int ret; + + ret = event_notifier_init(&cq->assert_notifier, 0); + if (ret < 0) { + return; + } + + event_notifier_set_handler(&cq->assert_notifier, nvme_irq_assert_notify); + + if (!msix_enabled(&n->parent_obj)) { + ret = event_notifier_init(&cq->deassert_notifier, 0); + if (ret < 0) { + event_notifier_set_handler(&cq->assert_notifier, NULL); + event_notifier_cleanup(&cq->assert_notifier); + + return; + } + + 
event_notifier_set_handler(&cq->deassert_notifier, + nvme_irq_deassert_notify); + } + + return; +} + static void nvme_post_cqe(NvmeCQueue *cq, NvmeRequest *req) { NvmeCtrl *n = cq->ctrl; @@ -5268,6 +5356,14 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n) event_notifier_set_handler(&cq->notifier, NULL); event_notifier_cleanup(&cq->notifier); } + if (cq->assert_notifier.initialized) { + event_notifier_set_handler(&cq->assert_notifier, NULL); + event_notifier_cleanup(&cq->assert_notifier); + } + if (cq->deassert_notifier.initialized) { + event_notifier_set_handler(&cq->deassert_notifier, NULL); + event_notifier_cleanup(&cq->deassert_notifier); + } if (msix_enabled(pci)) { msix_vector_unuse(pci, cq->vector); } @@ -5294,7 +5390,7 @@ static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req) } if (cq->irq_enabled) { - nvme_irq_deassert(n, cq); + nvme_irq_do_deassert(n, cq); } trace_pci_nvme_del_cq(qid); @@ -5338,6 +5434,10 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr, cq->do_irq = qemu_bh_new_guarded(nvme_do_irq, cq, guard); cq->post_queued_cqes = qemu_bh_new_guarded(nvme_post_cqes, cq, guard); + + if (cqid && n->params.irq_eventfd) { + nvme_init_irq_notifier(n, cq); + } } static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req) @@ -8469,6 +8569,7 @@ static Property nvme_props[] = { DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false), DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false), DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, false), + DEFINE_PROP_BOOL("x-irq-eventfd", NvmeCtrl, params.irq_eventfd, false), DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0), DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl, params.auto_transition_zones, true), diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h index 8e4b78492b..6f7b8c23d6 100644 --- a/hw/nvme/nvme.h +++ b/hw/nvme/nvme.h @@ -486,6 +486,8 @@ typedef struct NvmeCQueue { uint64_t ei_addr; QEMUBH *do_irq, *post_queued_cqes; 
EventNotifier notifier; + EventNotifier assert_notifier; + EventNotifier deassert_notifier; bool ioeventfd_enabled; QTAILQ_HEAD(, NvmeSQueue) sq_list; QTAILQ_HEAD(, NvmeRequest) req_list; @@ -510,6 +512,7 @@ typedef struct NvmeParams { bool auto_transition_zones; bool legacy_cmb; bool ioeventfd; + bool irq_eventfd; uint8_t sriov_max_vfs; uint16_t sriov_vq_flexible; uint16_t sriov_vi_flexible; diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events index c19f756429..18f02ea304 100644 --- a/hw/nvme/trace-events +++ b/hw/nvme/trace-events @@ -3,6 +3,11 @@ pci_nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u" pci_nvme_irq_check_intx(uint32_t intms, uint32_t irq_status) "intms 0x%"PRIx32" irq_status 0x%"PRIx32"" pci_nvme_irq_pin(uint8_t assert) "assert %"PRIu8"" pci_nvme_irq_masked(void) "IRQ is masked" +pci_nvme_irq_assert(uint16_t cqid) "cqid %"PRIu16"" +pci_nvme_irq_assert_notify(uint16_t cqid) "cqid %"PRIu16"" +pci_nvme_irq_deassert_notify(uint16_t cqid) "cqid %"PRIu16"" +pci_nvme_irq_do_assert(uint16_t cqid) "cqid %"PRIu16"" +pci_nvme_irq_do_deassert(uint16_t cqid) "cqid %"PRIu16"" pci_nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64"" pci_nvme_dbbuf_config(uint64_t dbs_addr, uint64_t eis_addr) "dbs_addr=0x%"PRIx64" eis_addr=0x%"PRIx64"" pci_nvme_map_addr(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64"" -- 2.50.1