author    | Peter Maydell <peter.maydell@linaro.org> | 2022-07-15 15:38:13 +0100
committer | Peter Maydell <peter.maydell@linaro.org> | 2022-07-15 15:38:13 +0100
commit    | 0ebf76aae58324b8f7bf6af798696687f5f4c2a9 (patch)
tree      | 25a2e0699aee01510724a6abbb3c7f5e47d402e2 /hw
parent    | 44bfcf628b1531f11ecc21ae96d025a238e1083f (diff)
parent    | 2e53b0b450246044efd27418c5d05ad6919deb87 (diff)
Merge tag 'nvme-next-pull-request' of git://git.infradead.org/qemu-nvme into staging
hw/nvme updates
performance improvements by Jinhao
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* shadow doorbells
* ioeventfd
plus some misc fixes (Darren, Niklas).
# gpg: Signature made Fri 15 Jul 2022 09:42:20 BST
# gpg: using RSA key 522833AA75E2DCE6A24766C04DE1AF316D4F0DE9
# gpg: Good signature from "Klaus Jensen <its@irrelevant.dk>" [unknown]
# gpg: aka "Klaus Jensen <k.jensen@samsung.com>" [unknown]
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg: There is no indication that the signature belongs to the owner.
# Primary key fingerprint: DDCA 4D9C 9EF9 31CC 3468 4272 63D5 6FC5 E55D A838
# Subkey fingerprint: 5228 33AA 75E2 DCE6 A247 66C0 4DE1 AF31 6D4F 0DE9
* tag 'nvme-next-pull-request' of git://git.infradead.org/qemu-nvme:
hw/nvme: Use ioeventfd to handle doorbell updates
nvme: Fix misleading macro when mixed with ternary operator
hw/nvme: force nvme-ns param 'shared' to false if no nvme-subsys node
hw/nvme: fix example serial in documentation
hw/nvme: Add trace events for shadow doorbell buffer
hw/nvme: Implement shadow doorbell buffer support
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
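
Editor's note on the shadow doorbell mechanism merged above, for readers new to it: instead of trapping into QEMU on every submission-queue tail or completion-queue head update, the guest driver writes the new value into a shared "shadow doorbell" page and only performs the expensive MMIO doorbell write when the device's EventIdx entry indicates the device wants to be notified. This merge implements the device side; below is a minimal host-side sketch, modeled on the Linux driver's DBBUF logic. All names and signatures here are illustrative, not part of this diff.

```c
#include <stdbool.h>
#include <stdint.h>

/*
 * Return true if the device's EventIdx falls in the half-open window
 * (old_idx, new_idx], i.e. the device has consumed past the point it
 * asked to be notified at. Unsigned 16-bit wraparound is intentional.
 */
static bool nvme_dbbuf_need_event(uint16_t event_idx, uint16_t new_idx,
                                  uint16_t old_idx)
{
    return (uint16_t)(new_idx - event_idx - 1) <
           (uint16_t)(new_idx - old_idx);
}

/*
 * Publish a new tail/head in the shadow doorbell entry; the caller only
 * rings the real MMIO doorbell when this returns true. Entries are 4
 * bytes wide, matching the (i << 3) / (i << 3) + (1 << 2) layout used
 * by the device-side code in this merge.
 */
static bool nvme_dbbuf_update_and_check_event(uint16_t value,
                                              volatile uint32_t *shadow_db,
                                              const volatile uint32_t *event_idx)
{
    uint16_t old_value = (uint16_t)*shadow_db;

    *shadow_db = value;
    /* A memory barrier belongs here so the device observes the shadow
     * write before the EventIdx is re-read. */
    return nvme_dbbuf_need_event((uint16_t)*event_idx, value, old_value);
}
```

The unsigned wraparound comparison is what makes the (old, new] window test correct for 16-bit queue indices that roll over.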
Diffstat (limited to 'hw')
-rw-r--r-- | hw/nvme/ctrl.c       | 233
-rw-r--r-- | hw/nvme/ns.c         |   2
-rw-r--r-- | hw/nvme/nvme.h       |  13
-rw-r--r-- | hw/nvme/trace-events |   5
4 files changed, 251 insertions, 2 deletions
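
Editor's note: the diff below repeatedly hard-codes doorbell offsets rather than consulting CAP.DSTRD, because the emulated controller reports a doorbell stride of 0. For queue pair qid, the SQ tail entry lives at byte offset qid << 3 and the CQ head entry four bytes later, at (qid << 3) + (1 << 2); the MMIO doorbells add a 0x1000 BAR offset on top, and the DBBUF shadow and EventIdx pages reuse the same per-queue offsets, which is why nvme_init_sq()/nvme_init_cq() and nvme_dbbuf_config() can share the arithmetic. A minimal restatement (helper names are ours, not QEMU's):

```c
#include <stdint.h>

/* With CAP.DSTRD == 0, doorbell entries are 4 bytes wide, packed per queue pair. */
static inline uint64_t sq_offset(uint16_t qid)
{
    return (uint64_t)qid << 3;               /* SQ tail entry of queue pair qid */
}

static inline uint64_t cq_offset(uint16_t qid)
{
    return ((uint64_t)qid << 3) + (1 << 2);  /* CQ head entry, 4 bytes after */
}

/* The MMIO doorbell itself sits at BAR0 offset 0x1000 plus the same offset;
 * this is the address the ioeventfd is registered on in the diff below. */
static inline uint64_t mmio_doorbell(uint16_t qid, int is_cq)
{
    return 0x1000 + (is_cq ? cq_offset(qid) : sq_offset(qid));
}
```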
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index ca335dd7da..533ad14e7a 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -264,6 +264,7 @@ static const uint32_t nvme_cse_acs[256] = {
     [NVME_ADM_CMD_ASYNC_EV_REQ]   = NVME_CMD_EFF_CSUPP,
     [NVME_ADM_CMD_NS_ATTACHMENT]  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
     [NVME_ADM_CMD_VIRT_MNGMT]     = NVME_CMD_EFF_CSUPP,
+    [NVME_ADM_CMD_DBBUF_CONFIG]   = NVME_CMD_EFF_CSUPP,
     [NVME_ADM_CMD_FORMAT_NVM]     = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
 };
 
@@ -1330,6 +1331,13 @@ static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
     }
 }
 
+static void nvme_update_cq_head(NvmeCQueue *cq)
+{
+    pci_dma_read(&cq->ctrl->parent_obj, cq->db_addr, &cq->head,
+                 sizeof(cq->head));
+    trace_pci_nvme_shadow_doorbell_cq(cq->cqid, cq->head);
+}
+
 static void nvme_post_cqes(void *opaque)
 {
     NvmeCQueue *cq = opaque;
@@ -1342,6 +1350,10 @@ static void nvme_post_cqes(void *opaque)
         NvmeSQueue *sq;
         hwaddr addr;
 
+        if (n->dbbuf_enabled) {
+            nvme_update_cq_head(cq);
+        }
+
         if (nvme_cq_full(cq)) {
             break;
         }
@@ -1388,7 +1400,14 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
     QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
     QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
-    timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
+
+    if (req->sq->ioeventfd_enabled) {
+        /* Post CQE directly since we are in main loop thread */
+        nvme_post_cqes(cq);
+    } else {
+        /* Schedule the timer to post CQE later since we are in vcpu thread */
+        timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
+    }
 }
 
 static void nvme_process_aers(void *opaque)
@@ -4214,10 +4233,82 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
     return NVME_INVALID_OPCODE | NVME_DNR;
 }
 
+static void nvme_cq_notifier(EventNotifier *e)
+{
+    NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
+    NvmeCtrl *n = cq->ctrl;
+
+    event_notifier_test_and_clear(&cq->notifier);
+
+    nvme_update_cq_head(cq);
+
+    if (cq->tail == cq->head) {
+        if (cq->irq_enabled) {
+            n->cq_pending--;
+        }
+
+        nvme_irq_deassert(n, cq);
+    }
+
+    nvme_post_cqes(cq);
+}
+
+static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
+{
+    NvmeCtrl *n = cq->ctrl;
+    uint16_t offset = (cq->cqid << 3) + (1 << 2);
+    int ret;
+
+    ret = event_notifier_init(&cq->notifier, 0);
+    if (ret < 0) {
+        return ret;
+    }
+
+    event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
+    memory_region_add_eventfd(&n->iomem,
+                              0x1000 + offset, 4, false, 0, &cq->notifier);
+
+    return 0;
+}
+
+static void nvme_sq_notifier(EventNotifier *e)
+{
+    NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
+
+    event_notifier_test_and_clear(&sq->notifier);
+
+    nvme_process_sq(sq);
+}
+
+static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
+{
+    NvmeCtrl *n = sq->ctrl;
+    uint16_t offset = sq->sqid << 3;
+    int ret;
+
+    ret = event_notifier_init(&sq->notifier, 0);
+    if (ret < 0) {
+        return ret;
+    }
+
+    event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
+    memory_region_add_eventfd(&n->iomem,
+                              0x1000 + offset, 4, false, 0, &sq->notifier);
+
+    return 0;
+}
+
 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
 {
+    uint16_t offset = sq->sqid << 3;
+
     n->sq[sq->sqid] = NULL;
     timer_free(sq->timer);
+    if (sq->ioeventfd_enabled) {
+        memory_region_del_eventfd(&n->iomem,
+                                  0x1000 + offset, 4, false, 0, &sq->notifier);
+        event_notifier_cleanup(&sq->notifier);
+    }
     g_free(sq->io_req);
     if (sq->sqid) {
         g_free(sq);
@@ -4287,6 +4378,17 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
     }
     sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
 
+    if (n->dbbuf_enabled) {
+        sq->db_addr = n->dbbuf_dbs + (sqid << 3);
+        sq->ei_addr = n->dbbuf_eis + (sqid << 3);
+
+        if (n->params.ioeventfd && sq->sqid != 0) {
+            if (!nvme_init_sq_ioeventfd(sq)) {
+                sq->ioeventfd_enabled = true;
+            }
+        }
+    }
+
     assert(n->cq[cqid]);
     cq = n->cq[cqid];
     QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
@@ -4588,8 +4690,15 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
 
 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
 {
+    uint16_t offset = (cq->cqid << 3) + (1 << 2);
+
     n->cq[cq->cqid] = NULL;
     timer_free(cq->timer);
+    if (cq->ioeventfd_enabled) {
+        memory_region_del_eventfd(&n->iomem,
+                                  0x1000 + offset, 4, false, 0, &cq->notifier);
+        event_notifier_cleanup(&cq->notifier);
+    }
     if (msix_enabled(&n->parent_obj)) {
         msix_vector_unuse(&n->parent_obj, cq->vector);
     }
@@ -4645,6 +4754,16 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
     cq->head = cq->tail = 0;
     QTAILQ_INIT(&cq->req_list);
     QTAILQ_INIT(&cq->sq_list);
+    if (n->dbbuf_enabled) {
+        cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
+        cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
+
+        if (n->params.ioeventfd && cqid != 0) {
+            if (!nvme_init_cq_ioeventfd(cq)) {
+                cq->ioeventfd_enabled = true;
+            }
+        }
+    }
     n->cq[cqid] = cq;
     cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
 }
@@ -5988,6 +6107,64 @@ static uint16_t nvme_virt_mngmt(NvmeCtrl *n, NvmeRequest *req)
     }
 }
 
+static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
+{
+    uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
+    uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
+    int i;
+
+    /* Address should be page aligned */
+    if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    /* Save shadow buffer base addr for use during queue creation */
+    n->dbbuf_dbs = dbs_addr;
+    n->dbbuf_eis = eis_addr;
+    n->dbbuf_enabled = true;
+
+    for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
+        NvmeSQueue *sq = n->sq[i];
+        NvmeCQueue *cq = n->cq[i];
+
+        if (sq) {
+            /*
+             * CAP.DSTRD is 0, so offset of ith sq db_addr is (i<<3)
+             * nvme_process_db() uses this hard-coded way to calculate
+             * doorbell offsets. Be consistent with that here.
+             */
+            sq->db_addr = dbs_addr + (i << 3);
+            sq->ei_addr = eis_addr + (i << 3);
+            pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
+                          sizeof(sq->tail));
+
+            if (n->params.ioeventfd && sq->sqid != 0) {
+                if (!nvme_init_sq_ioeventfd(sq)) {
+                    sq->ioeventfd_enabled = true;
+                }
+            }
+        }
+
+        if (cq) {
+            /* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */
+            cq->db_addr = dbs_addr + (i << 3) + (1 << 2);
+            cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
+            pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
+                          sizeof(cq->head));
+
+            if (n->params.ioeventfd && cq->cqid != 0) {
+                if (!nvme_init_cq_ioeventfd(cq)) {
+                    cq->ioeventfd_enabled = true;
+                }
+            }
+        }
+    }
+
+    trace_pci_nvme_dbbuf_config(dbs_addr, eis_addr);
+
+    return NVME_SUCCESS;
+}
+
 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
 {
     trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
@@ -6032,6 +6209,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
         return nvme_ns_attachment(n, req);
     case NVME_ADM_CMD_VIRT_MNGMT:
         return nvme_virt_mngmt(n, req);
+    case NVME_ADM_CMD_DBBUF_CONFIG:
+        return nvme_dbbuf_config(n, req);
     case NVME_ADM_CMD_FORMAT_NVM:
         return nvme_format(n, req);
     default:
@@ -6041,6 +6220,20 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
     return NVME_INVALID_OPCODE | NVME_DNR;
 }
 
+static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
+{
+    pci_dma_write(&sq->ctrl->parent_obj, sq->ei_addr, &sq->tail,
+                  sizeof(sq->tail));
+    trace_pci_nvme_eventidx_sq(sq->sqid, sq->tail);
+}
+
+static void nvme_update_sq_tail(NvmeSQueue *sq)
+{
+    pci_dma_read(&sq->ctrl->parent_obj, sq->db_addr, &sq->tail,
+                 sizeof(sq->tail));
+    trace_pci_nvme_shadow_doorbell_sq(sq->sqid, sq->tail);
+}
+
 static void nvme_process_sq(void *opaque)
 {
     NvmeSQueue *sq = opaque;
@@ -6052,6 +6245,10 @@ static void nvme_process_sq(void *opaque)
     NvmeCmd cmd;
     NvmeRequest *req;
 
+    if (n->dbbuf_enabled) {
+        nvme_update_sq_tail(sq);
+    }
+
     while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
         addr = sq->dma_addr + sq->head * n->sqe_size;
         if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
@@ -6075,6 +6272,11 @@ static void nvme_process_sq(void *opaque)
             req->status = status;
             nvme_enqueue_req_completion(cq, req);
         }
+
+        if (n->dbbuf_enabled) {
+            nvme_update_sq_eventidx(sq);
+            nvme_update_sq_tail(sq);
+        }
     }
 }
 
@@ -6184,6 +6386,10 @@ static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst)
     stl_le_p(&n->bar.intms, 0);
     stl_le_p(&n->bar.intmc, 0);
     stl_le_p(&n->bar.cc, 0);
+
+    n->dbbuf_dbs = 0;
+    n->dbbuf_eis = 0;
+    n->dbbuf_enabled = false;
 }
 
 static void nvme_ctrl_shutdown(NvmeCtrl *n)
@@ -6694,6 +6900,10 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
 
         start_sqs = nvme_cq_full(cq) ? 1 : 0;
         cq->head = new_head;
+        if (!qid && n->dbbuf_enabled) {
+            pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
+                          sizeof(cq->head));
+        }
         if (start_sqs) {
             NvmeSQueue *sq;
             QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
@@ -6751,6 +6961,23 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
         trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
 
         sq->tail = new_tail;
+        if (!qid && n->dbbuf_enabled) {
+            /*
+             * The spec states "the host shall also update the controller's
+             * corresponding doorbell property to match the value of that entry
+             * in the Shadow Doorbell buffer."
+             *
+             * Since this context is currently a VM trap, we can safely enforce
+             * the requirement from the device side in case the host is
+             * misbehaving.
+             *
+             * Note, we shouldn't have to do this, but various drivers
+             * including ones that run on Linux, are not updating Admin Queues,
+             * so we can't trust reading it for an appropriate sq tail.
+             */
+            pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
+                          sizeof(sq->tail));
+        }
         timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
     }
 }
@@ -7231,7 +7458,8 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     id->mdts = n->params.mdts;
     id->ver = cpu_to_le32(NVME_SPEC_VER);
-    id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
+    id->oacs =
+        cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | NVME_OACS_DBBUF);
     id->cntrltype = 0x1;
 
     /*
@@ -7436,6 +7664,7 @@ static Property nvme_props[] = {
     DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
     DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
     DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
+    DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, true),
     DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
     DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
                      params.auto_transition_zones, true),
diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
index 870c3ca1a2..62a1f97be0 100644
--- a/hw/nvme/ns.c
+++ b/hw/nvme/ns.c
@@ -546,6 +546,8 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
     int i;
 
     if (!n->subsys) {
+        /* If no subsys, the ns cannot be attached to more than one ctrl. */
+        ns->params.shared = false;
         if (ns->params.detached) {
             error_setg(errp, "detached requires that the nvme device is "
                        "linked to an nvme-subsys device");
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 99437d39bb..79f5c281c2 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -341,6 +341,7 @@ static inline const char *nvme_adm_opc_str(uint8_t opc)
     case NVME_ADM_CMD_ASYNC_EV_REQ:  return "NVME_ADM_CMD_ASYNC_EV_REQ";
     case NVME_ADM_CMD_NS_ATTACHMENT: return "NVME_ADM_CMD_NS_ATTACHMENT";
     case NVME_ADM_CMD_VIRT_MNGMT:    return "NVME_ADM_CMD_VIRT_MNGMT";
+    case NVME_ADM_CMD_DBBUF_CONFIG:  return "NVME_ADM_CMD_DBBUF_CONFIG";
     case NVME_ADM_CMD_FORMAT_NVM:    return "NVME_ADM_CMD_FORMAT_NVM";
     default:                         return "NVME_ADM_CMD_UNKNOWN";
     }
@@ -372,7 +373,11 @@ typedef struct NvmeSQueue {
     uint32_t tail;
     uint32_t size;
     uint64_t dma_addr;
+    uint64_t db_addr;
+    uint64_t ei_addr;
     QEMUTimer *timer;
+    EventNotifier notifier;
+    bool ioeventfd_enabled;
     NvmeRequest *io_req;
     QTAILQ_HEAD(, NvmeRequest) req_list;
     QTAILQ_HEAD(, NvmeRequest) out_req_list;
@@ -389,7 +394,11 @@ typedef struct NvmeCQueue {
     uint32_t vector;
     uint32_t size;
     uint64_t dma_addr;
+    uint64_t db_addr;
+    uint64_t ei_addr;
     QEMUTimer *timer;
+    EventNotifier notifier;
+    bool ioeventfd_enabled;
     QTAILQ_HEAD(, NvmeSQueue) sq_list;
     QTAILQ_HEAD(, NvmeRequest) req_list;
 } NvmeCQueue;
@@ -412,6 +421,7 @@ typedef struct NvmeParams {
     uint8_t zasl;
     bool auto_transition_zones;
     bool legacy_cmb;
+    bool ioeventfd;
     uint8_t sriov_max_vfs;
     uint16_t sriov_vq_flexible;
     uint16_t sriov_vi_flexible;
@@ -445,6 +455,9 @@ typedef struct NvmeCtrl {
     uint8_t smart_critical_warning;
     uint32_t conf_msix_qsize;
    uint32_t conf_ioqpairs;
+    uint64_t dbbuf_dbs;
+    uint64_t dbbuf_eis;
+    bool dbbuf_enabled;
 
     struct {
         MemoryRegion mem;
diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
index 065e1c891d..fccb79f489 100644
--- a/hw/nvme/trace-events
+++ b/hw/nvme/trace-events
@@ -3,6 +3,7 @@ pci_nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u"
 pci_nvme_irq_pin(void) "pulsing IRQ pin"
 pci_nvme_irq_masked(void) "IRQ is masked"
 pci_nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64""
+pci_nvme_dbbuf_config(uint64_t dbs_addr, uint64_t eis_addr) "dbs_addr=0x%"PRIx64" eis_addr=0x%"PRIx64""
 pci_nvme_map_addr(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64""
 pci_nvme_map_addr_cmb(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64""
 pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, int num_prps) "trans_len %"PRIu64" len %"PRIu32" prp1 0x%"PRIx64" prp2 0x%"PRIx64" num_prps %d"
@@ -83,6 +84,8 @@ pci_nvme_enqueue_event_noqueue(int queued) "queued %d"
 pci_nvme_enqueue_event_masked(uint8_t typ) "type 0x%"PRIx8""
 pci_nvme_no_outstanding_aers(void) "ignoring event; no outstanding AERs"
 pci_nvme_enqueue_req_completion(uint16_t cid, uint16_t cqid, uint32_t dw0, uint32_t dw1, uint16_t status) "cid %"PRIu16" cqid %"PRIu16" dw0 0x%"PRIx32" dw1 0x%"PRIx32" status 0x%"PRIx16""
+pci_nvme_eventidx_cq(uint16_t cqid, uint16_t new_eventidx) "cqid %"PRIu16" new_eventidx %"PRIu16""
+pci_nvme_eventidx_sq(uint16_t sqid, uint16_t new_eventidx) "sqid %"PRIu16" new_eventidx %"PRIu16""
 pci_nvme_mmio_read(uint64_t addr, unsigned size) "addr 0x%"PRIx64" size %d"
 pci_nvme_mmio_write(uint64_t addr, uint64_t data, unsigned size) "addr 0x%"PRIx64" data 0x%"PRIx64" size %d"
 pci_nvme_mmio_doorbell_cq(uint16_t cqid, uint16_t new_head) "cqid %"PRIu16" new_head %"PRIu16""
@@ -99,6 +102,8 @@ pci_nvme_mmio_start_success(void) "setting controller enable bit succeeded"
 pci_nvme_mmio_stopped(void) "cleared controller enable bit"
 pci_nvme_mmio_shutdown_set(void) "shutdown bit set"
 pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared"
+pci_nvme_shadow_doorbell_cq(uint16_t cqid, uint16_t new_shadow_doorbell) "cqid %"PRIu16" new_shadow_doorbell %"PRIu16""
+pci_nvme_shadow_doorbell_sq(uint16_t sqid, uint16_t new_shadow_doorbell) "sqid %"PRIu16" new_shadow_doorbell %"PRIu16""
 pci_nvme_open_zone(uint64_t slba, uint32_t zone_idx, int all) "open zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
 pci_nvme_close_zone(uint64_t slba, uint32_t zone_idx, int all) "close zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
 pci_nvme_finish_zone(uint64_t slba, uint32_t zone_idx, int all) "finish zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""