aboutsummaryrefslogtreecommitdiff
path: root/hw/nvme
diff options
context:
space:
mode:
author Jinhao Fan <fanjinhao21s@ict.ac.cn> 2022-07-05 22:24:03 +0800
committer Klaus Jensen <k.jensen@samsung.com> 2022-07-15 10:40:33 +0200
commit 2e53b0b450246044efd27418c5d05ad6919deb87 (patch)
tree 6f3af9fae825f294e1aa513fe9290175bc0c7de5 /hw/nvme
parent 43f76aac49c439ea79c125d1befd9d5d7057dbb4 (diff)
hw/nvme: Use ioeventfd to handle doorbell updates
Add property "ioeventfd" which is enabled by default. When this is
enabled, updates on the doorbell registers will cause KVM to signal
an event to the QEMU main loop to handle the doorbell updates.
Therefore, instead of letting the vcpu thread run both guest VM and
IO emulation, we now use the main loop thread to do IO emulation and
thus the vcpu thread has more cycles for the guest VM.

Since ioeventfd does not tell us the exact value that is written, it is
only useful when shadow doorbell buffer is enabled, where we check
for the value in the shadow doorbell buffer when we get the doorbell
update event.

IOPS comparison on Linux 5.19-rc2: (Unit: KIOPS)

qd           1    4   16   64
qemu        35  121  176  153
ioeventfd   41  133  258  313

Changes since v3:
 - Do not deregister ioeventfd when it was not enabled on a SQ/CQ

Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Diffstat (limited to 'hw/nvme')
-rw-r--r-- hw/nvme/ctrl.c 113
-rw-r--r-- hw/nvme/nvme.h 5
2 files changed, 117 insertions, 1 deletions
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 55cb0ba1d5..533ad14e7a 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1400,7 +1400,14 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
- timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
+
+ if (req->sq->ioeventfd_enabled) {
+ /* Post CQE directly since we are in main loop thread */
+ nvme_post_cqes(cq);
+ } else {
+ /* Schedule the timer to post CQE later since we are in vcpu thread */
+ timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
+ }
}
static void nvme_process_aers(void *opaque)
@@ -4226,10 +4233,82 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
return NVME_INVALID_OPCODE | NVME_DNR;
}
+static void nvme_cq_notifier(EventNotifier *e) /* EventNotifier handler for a completion queue; installed by nvme_init_cq_ioeventfd() */
+{
+ NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier); /* recover the NvmeCQueue that embeds this notifier */
+ NvmeCtrl *n = cq->ctrl;
+
+ event_notifier_test_and_clear(&cq->notifier); /* clear the notifier; the return value is deliberately ignored */
+
+ nvme_update_cq_head(cq); /* helper not shown here; presumably refreshes cq->head from the shadow doorbell buffer — confirm */
+
+ if (cq->tail == cq->head) { /* head has caught up with tail: drop the pending count and deassert the IRQ */
+ if (cq->irq_enabled) { /* cq_pending is only decremented for queues with interrupts enabled */
+ n->cq_pending--;
+ }
+
+ nvme_irq_deassert(n, cq);
+ }
+
+ nvme_post_cqes(cq); /* finally post any completions still queued on this CQ */
+}
+
+static int nvme_init_cq_ioeventfd(NvmeCQueue *cq) /* set up an eventfd for this CQ's doorbell; returns 0 on success or the negative event_notifier_init() result */
+{
+ NvmeCtrl *n = cq->ctrl;
+ uint16_t offset = (cq->cqid << 3) + (1 << 2); /* 8 bytes per queue id, +4 for the CQ slot. NOTE(review): exceeds uint16_t once cqid >= 8192 — confirm the queue-id bound; nvme_free_cq() computes the same value, so change both together */
+ int ret;
+
+ ret = event_notifier_init(&cq->notifier, 0);
+ if (ret < 0) { /* nothing has been registered yet, so there is nothing to undo */
+ return ret;
+ }
+
+ event_notifier_set_handler(&cq->notifier, nvme_cq_notifier); /* nvme_cq_notifier() runs when the notifier fires */
+ memory_region_add_eventfd(&n->iomem, /* 4-byte region at 0x1000 + offset (presumably the doorbell base — confirm) */
+ 0x1000 + offset, 4, false, 0, &cq->notifier); /* must mirror the memory_region_del_eventfd() arguments in nvme_free_cq() */
+
+ return 0;
+}
+
+static void nvme_sq_notifier(EventNotifier *e) /* EventNotifier handler for a submission queue; installed by nvme_init_sq_ioeventfd() */
+{
+ NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier); /* recover the NvmeSQueue that embeds this notifier */
+
+ event_notifier_test_and_clear(&sq->notifier); /* clear the notifier; the return value is deliberately ignored */
+
+ nvme_process_sq(sq); /* helper not shown here; presumably fetches and executes newly submitted commands — confirm */
+}
+
+static int nvme_init_sq_ioeventfd(NvmeSQueue *sq) /* set up an eventfd for this SQ's doorbell; returns 0 on success or the negative event_notifier_init() result */
+{
+ NvmeCtrl *n = sq->ctrl;
+ uint16_t offset = sq->sqid << 3; /* 8 bytes per queue id, SQ slot first. NOTE(review): exceeds uint16_t once sqid >= 8192 — confirm the queue-id bound; nvme_free_sq() computes the same value, so change both together */
+ int ret;
+
+ ret = event_notifier_init(&sq->notifier, 0);
+ if (ret < 0) { /* nothing has been registered yet, so there is nothing to undo */
+ return ret;
+ }
+
+ event_notifier_set_handler(&sq->notifier, nvme_sq_notifier); /* nvme_sq_notifier() runs when the notifier fires */
+ memory_region_add_eventfd(&n->iomem, /* 4-byte region at 0x1000 + offset (presumably the doorbell base — confirm) */
+ 0x1000 + offset, 4, false, 0, &sq->notifier); /* must mirror the memory_region_del_eventfd() arguments in nvme_free_sq() */
+
+ return 0;
+}
+
static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
{
+ uint16_t offset = sq->sqid << 3;
+
n->sq[sq->sqid] = NULL;
timer_free(sq->timer);
+ if (sq->ioeventfd_enabled) {
+ memory_region_del_eventfd(&n->iomem,
+ 0x1000 + offset, 4, false, 0, &sq->notifier);
+ event_notifier_cleanup(&sq->notifier);
+ }
g_free(sq->io_req);
if (sq->sqid) {
g_free(sq);
@@ -4302,6 +4381,12 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
if (n->dbbuf_enabled) {
sq->db_addr = n->dbbuf_dbs + (sqid << 3);
sq->ei_addr = n->dbbuf_eis + (sqid << 3);
+
+ if (n->params.ioeventfd && sq->sqid != 0) {
+ if (!nvme_init_sq_ioeventfd(sq)) {
+ sq->ioeventfd_enabled = true;
+ }
+ }
}
assert(n->cq[cqid]);
@@ -4605,8 +4690,15 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
{
+ uint16_t offset = (cq->cqid << 3) + (1 << 2);
+
n->cq[cq->cqid] = NULL;
timer_free(cq->timer);
+ if (cq->ioeventfd_enabled) {
+ memory_region_del_eventfd(&n->iomem,
+ 0x1000 + offset, 4, false, 0, &cq->notifier);
+ event_notifier_cleanup(&cq->notifier);
+ }
if (msix_enabled(&n->parent_obj)) {
msix_vector_unuse(&n->parent_obj, cq->vector);
}
@@ -4665,6 +4757,12 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
if (n->dbbuf_enabled) {
cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
+
+ if (n->params.ioeventfd && cqid != 0) {
+ if (!nvme_init_cq_ioeventfd(cq)) {
+ cq->ioeventfd_enabled = true;
+ }
+ }
}
n->cq[cqid] = cq;
cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
@@ -6039,6 +6137,12 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
sq->ei_addr = eis_addr + (i << 3);
pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
sizeof(sq->tail));
+
+ if (n->params.ioeventfd && sq->sqid != 0) {
+ if (!nvme_init_sq_ioeventfd(sq)) {
+ sq->ioeventfd_enabled = true;
+ }
+ }
}
if (cq) {
@@ -6047,6 +6151,12 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
sizeof(cq->head));
+
+ if (n->params.ioeventfd && cq->cqid != 0) {
+ if (!nvme_init_cq_ioeventfd(cq)) {
+ cq->ioeventfd_enabled = true;
+ }
+ }
}
}
@@ -7554,6 +7664,7 @@ static Property nvme_props[] = {
DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
+ DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, true),
DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
params.auto_transition_zones, true),
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 0711b9748c..79f5c281c2 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -376,6 +376,8 @@ typedef struct NvmeSQueue {
uint64_t db_addr;
uint64_t ei_addr;
QEMUTimer *timer;
+ EventNotifier notifier;
+ bool ioeventfd_enabled;
NvmeRequest *io_req;
QTAILQ_HEAD(, NvmeRequest) req_list;
QTAILQ_HEAD(, NvmeRequest) out_req_list;
@@ -395,6 +397,8 @@ typedef struct NvmeCQueue {
uint64_t db_addr;
uint64_t ei_addr;
QEMUTimer *timer;
+ EventNotifier notifier;
+ bool ioeventfd_enabled;
QTAILQ_HEAD(, NvmeSQueue) sq_list;
QTAILQ_HEAD(, NvmeRequest) req_list;
} NvmeCQueue;
@@ -417,6 +421,7 @@ typedef struct NvmeParams {
uint8_t zasl;
bool auto_transition_zones;
bool legacy_cmb;
+ bool ioeventfd;
uint8_t sriov_max_vfs;
uint16_t sriov_vq_flexible;
uint16_t sriov_vi_flexible;