From 3f7fe8de3d49fdd2c1461fcd22fe73d84d2a9f8a Mon Sep 17 00:00:00 2001
From: Jinhao Fan
Date: Thu, 16 Jun 2022 20:34:07 +0800
Subject: hw/nvme: Implement shadow doorbell buffer support

Implement the Doorbell Buffer Config command (Section 5.7 in NVMe Spec 1.3)
and the Shadow Doorbell buffer & EventIdx buffer handling logic (Section
7.13 in NVMe Spec 1.3). For queues created before the Doorbell Buffer
Config command, the nvme_dbbuf_config function tries to associate each
existing SQ and CQ with its Shadow Doorbell buffer and EventIdx buffer
address. Queues created after the Doorbell Buffer Config command will have
the doorbell buffers associated with them when they are initialized.

In nvme_process_sq and nvme_post_cqes, proactively check for Shadow
Doorbell buffer changes instead of waiting for doorbell register changes.
This reduces the number of MMIOs.

In nvme_process_db(), update the shadow doorbell buffer value with the
doorbell register value if it is the admin queue. This is a hack, since
hosts like the Linux NVMe driver and SPDK do not use the shadow doorbell
buffer for the admin queue. Copying the doorbell register value to the
shadow doorbell buffer allows us to support these hosts as well as
spec-compliant hosts that use the shadow doorbell buffer for the admin
queue.

Signed-off-by: Jinhao Fan
Reviewed-by: Klaus Jensen
Reviewed-by: Keith Busch
[k.jensen: rebased]
Signed-off-by: Klaus Jensen
---
 hw/nvme/ctrl.c       | 115 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 hw/nvme/nvme.h       |   8 ++++
 include/block/nvme.h |   2 +
 3 files changed, 124 insertions(+), 1 deletion(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index ca335dd7da..46e8d54ef0 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -264,6 +264,7 @@ static const uint32_t nvme_cse_acs[256] = {
     [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
     [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
     [NVME_ADM_CMD_VIRT_MNGMT] = NVME_CMD_EFF_CSUPP,
+    [NVME_ADM_CMD_DBBUF_CONFIG] = NVME_CMD_EFF_CSUPP,
     [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
 };
 
@@ -1330,6 +1331,12 @@ static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
     }
 }
 
+static void nvme_update_cq_head(NvmeCQueue *cq)
+{
+    pci_dma_read(&cq->ctrl->parent_obj, cq->db_addr, &cq->head,
+                 sizeof(cq->head));
+}
+
 static void nvme_post_cqes(void *opaque)
 {
     NvmeCQueue *cq = opaque;
@@ -1342,6 +1349,10 @@ static void nvme_post_cqes(void *opaque)
         NvmeSQueue *sq;
         hwaddr addr;
 
+        if (n->dbbuf_enabled) {
+            nvme_update_cq_head(cq);
+        }
+
         if (nvme_cq_full(cq)) {
             break;
         }
@@ -4287,6 +4298,11 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
     }
     sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
 
+    if (n->dbbuf_enabled) {
+        sq->db_addr = n->dbbuf_dbs + (sqid << 3);
+        sq->ei_addr = n->dbbuf_eis + (sqid << 3);
+    }
+
     assert(n->cq[cqid]);
     cq = n->cq[cqid];
     QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
@@ -4645,6 +4661,10 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
     cq->head = cq->tail = 0;
     QTAILQ_INIT(&cq->req_list);
     QTAILQ_INIT(&cq->sq_list);
+    if (n->dbbuf_enabled) {
+        cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
+        cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
+    }
     n->cq[cqid] = cq;
     cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
 }
@@ -5988,6 +6008,50 @@ static uint16_t nvme_virt_mngmt(NvmeCtrl *n, NvmeRequest *req)
     }
 }
 
+static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
+{
+    uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
+    uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
+    int i;
+
+    /* Address should be page aligned */
+    if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    /* Save shadow buffer base addr for use during queue creation */
+    n->dbbuf_dbs = dbs_addr;
+    n->dbbuf_eis = eis_addr;
+    n->dbbuf_enabled = true;
+
+    for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
+        NvmeSQueue *sq = n->sq[i];
+        NvmeCQueue *cq = n->cq[i];
+
+        if (sq) {
+            /*
+             * CAP.DSTRD is 0, so offset of ith sq db_addr is (i<<3).
+             * nvme_process_db() uses this hard-coded way to calculate
+             * doorbell offsets. Be consistent with that here.
+             */
+            sq->db_addr = dbs_addr + (i << 3);
+            sq->ei_addr = eis_addr + (i << 3);
+            pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
+                          sizeof(sq->tail));
+        }
+
+        if (cq) {
+            /* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */
+            cq->db_addr = dbs_addr + (i << 3) + (1 << 2);
+            cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
+            pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
+                          sizeof(cq->head));
+        }
+    }
+
+    return NVME_SUCCESS;
+}
+
 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
 {
     trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
@@ -6032,6 +6096,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
         return nvme_ns_attachment(n, req);
     case NVME_ADM_CMD_VIRT_MNGMT:
         return nvme_virt_mngmt(n, req);
+    case NVME_ADM_CMD_DBBUF_CONFIG:
+        return nvme_dbbuf_config(n, req);
     case NVME_ADM_CMD_FORMAT_NVM:
         return nvme_format(n, req);
     default:
@@ -6041,6 +6107,18 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
     return NVME_INVALID_OPCODE | NVME_DNR;
 }
 
+static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
+{
+    pci_dma_write(&sq->ctrl->parent_obj, sq->ei_addr, &sq->tail,
+                  sizeof(sq->tail));
+}
+
+static void nvme_update_sq_tail(NvmeSQueue *sq)
+{
+    pci_dma_read(&sq->ctrl->parent_obj, sq->db_addr, &sq->tail,
+                 sizeof(sq->tail));
+}
+
 static void nvme_process_sq(void *opaque)
 {
     NvmeSQueue *sq = opaque;
@@ -6052,6 +6130,10 @@ static void nvme_process_sq(void *opaque)
     NvmeCmd cmd;
     NvmeRequest *req;
 
+    if (n->dbbuf_enabled) {
+        nvme_update_sq_tail(sq);
+    }
+
     while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
         addr = sq->dma_addr + sq->head * n->sqe_size;
         if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
@@ -6075,6 +6157,11 @@ static void nvme_process_sq(void *opaque)
             req->status = status;
             nvme_enqueue_req_completion(cq, req);
         }
+
+        if (n->dbbuf_enabled) {
+            nvme_update_sq_eventidx(sq);
+            nvme_update_sq_tail(sq);
+        }
     }
 }
 
@@ -6184,6 +6271,10 @@ static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst)
     stl_le_p(&n->bar.intms, 0);
     stl_le_p(&n->bar.intmc, 0);
     stl_le_p(&n->bar.cc, 0);
+
+    n->dbbuf_dbs = 0;
+    n->dbbuf_eis = 0;
+    n->dbbuf_enabled = false;
 }
 
 static void nvme_ctrl_shutdown(NvmeCtrl *n)
@@ -6694,6 +6785,10 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
         start_sqs = nvme_cq_full(cq) ? 1 : 0;
         cq->head = new_head;
+        if (!qid && n->dbbuf_enabled) {
+            pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
+                          sizeof(cq->head));
+        }
         if (start_sqs) {
             NvmeSQueue *sq;
             QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
@@ -6751,6 +6846,23 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
         trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
 
         sq->tail = new_tail;
+        if (!qid && n->dbbuf_enabled) {
+            /*
+             * The spec states "the host shall also update the controller's
+             * corresponding doorbell property to match the value of that
+             * entry in the Shadow Doorbell buffer."
+             *
+             * Since this context is currently a VM trap, we can safely
+             * enforce the requirement from the device side in case the
+             * host is misbehaving.
+             *
+             * Note, we shouldn't have to do this, but various drivers,
+             * including ones that run on Linux, are not updating Admin
+             * Queues, so we can't trust reading it for an appropriate sq
+             * tail.
+             */
+            pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
+                          sizeof(sq->tail));
+        }
+
         timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
     }
 }
@@ -7231,7 +7343,8 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     id->mdts = n->params.mdts;
     id->ver = cpu_to_le32(NVME_SPEC_VER);
 
-    id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
+    id->oacs =
+        cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | NVME_OACS_DBBUF);
     id->cntrltype = 0x1;
 
     /*
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 99437d39bb..0711b9748c 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -341,6 +341,7 @@ static inline const char *nvme_adm_opc_str(uint8_t opc)
     case NVME_ADM_CMD_ASYNC_EV_REQ: return "NVME_ADM_CMD_ASYNC_EV_REQ";
     case NVME_ADM_CMD_NS_ATTACHMENT: return "NVME_ADM_CMD_NS_ATTACHMENT";
     case NVME_ADM_CMD_VIRT_MNGMT: return "NVME_ADM_CMD_VIRT_MNGMT";
+    case NVME_ADM_CMD_DBBUF_CONFIG: return "NVME_ADM_CMD_DBBUF_CONFIG";
     case NVME_ADM_CMD_FORMAT_NVM: return "NVME_ADM_CMD_FORMAT_NVM";
     default: return "NVME_ADM_CMD_UNKNOWN";
     }
@@ -372,6 +373,8 @@ typedef struct NvmeSQueue {
     uint32_t tail;
     uint32_t size;
     uint64_t dma_addr;
+    uint64_t db_addr;
+    uint64_t ei_addr;
     QEMUTimer *timer;
     NvmeRequest *io_req;
     QTAILQ_HEAD(, NvmeRequest) req_list;
@@ -389,6 +392,8 @@ typedef struct NvmeCQueue {
     uint32_t vector;
     uint32_t size;
     uint64_t dma_addr;
+    uint64_t db_addr;
+    uint64_t ei_addr;
     QEMUTimer *timer;
     QTAILQ_HEAD(, NvmeSQueue) sq_list;
     QTAILQ_HEAD(, NvmeRequest) req_list;
@@ -445,6 +450,9 @@ typedef struct NvmeCtrl {
     uint8_t smart_critical_warning;
     uint32_t conf_msix_qsize;
     uint32_t conf_ioqpairs;
+    uint64_t dbbuf_dbs;
+    uint64_t dbbuf_eis;
+    bool dbbuf_enabled;
 
     struct {
         MemoryRegion mem;
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 373c70b5ca..351fd44ca8 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -596,6 +596,7 @@ enum NvmeAdminCommands {
     NVME_ADM_CMD_DOWNLOAD_FW = 0x11,
     NVME_ADM_CMD_NS_ATTACHMENT = 0x15,
     NVME_ADM_CMD_VIRT_MNGMT = 0x1c,
+    NVME_ADM_CMD_DBBUF_CONFIG = 0x7c,
     NVME_ADM_CMD_FORMAT_NVM = 0x80,
     NVME_ADM_CMD_SECURITY_SEND = 0x81,
     NVME_ADM_CMD_SECURITY_RECV = 0x82,
@@ -1141,6 +1142,7 @@ enum NvmeIdCtrlOacs {
     NVME_OACS_FORMAT = 1 << 1,
     NVME_OACS_FW = 1 << 2,
     NVME_OACS_NS_MGMT = 1 << 3,
+    NVME_OACS_DBBUF = 1 << 8,
 };
 
 enum NvmeIdCtrlOncs {
--
cgit v1.2.3


From 387350d5f451b9f0c804f82f64ec127d5392bb3b Mon Sep 17 00:00:00 2001
From: Jinhao Fan
Date: Thu, 16 Jun 2022 20:34:08 +0800
Subject: hw/nvme: Add trace events for shadow doorbell buffer

When the shadow doorbell buffer is enabled, doorbell registers are
lazily updated.
The actual queue head and tail pointers are stored in the Shadow
Doorbell buffers. Add trace events for updates on the Shadow Doorbell
buffers and the EventIdx buffers. Also add a trace event for the
Doorbell Buffer Config command.

Signed-off-by: Jinhao Fan
Reviewed-by: Klaus Jensen
Reviewed-by: Keith Busch
[k.jensen: rebased]
Signed-off-by: Klaus Jensen
---
 hw/nvme/ctrl.c       | 5 +++++
 hw/nvme/trace-events | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 46e8d54ef0..55cb0ba1d5 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1335,6 +1335,7 @@ static void nvme_update_cq_head(NvmeCQueue *cq)
 {
     pci_dma_read(&cq->ctrl->parent_obj, cq->db_addr, &cq->head,
                  sizeof(cq->head));
+    trace_pci_nvme_shadow_doorbell_cq(cq->cqid, cq->head);
 }
 
 static void nvme_post_cqes(void *opaque)
@@ -6049,6 +6050,8 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
         }
     }
 
+    trace_pci_nvme_dbbuf_config(dbs_addr, eis_addr);
+
     return NVME_SUCCESS;
 }
 
@@ -6111,12 +6114,14 @@ static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
 {
     pci_dma_write(&sq->ctrl->parent_obj, sq->ei_addr, &sq->tail,
                   sizeof(sq->tail));
+    trace_pci_nvme_eventidx_sq(sq->sqid, sq->tail);
 }
 
 static void nvme_update_sq_tail(NvmeSQueue *sq)
 {
     pci_dma_read(&sq->ctrl->parent_obj, sq->db_addr, &sq->tail,
                  sizeof(sq->tail));
+    trace_pci_nvme_shadow_doorbell_sq(sq->sqid, sq->tail);
 }
 
 static void nvme_process_sq(void *opaque)
diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
index 065e1c891d..fccb79f489 100644
--- a/hw/nvme/trace-events
+++ b/hw/nvme/trace-events
@@ -3,6 +3,7 @@ pci_nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u"
 pci_nvme_irq_pin(void) "pulsing IRQ pin"
 pci_nvme_irq_masked(void) "IRQ is masked"
 pci_nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64""
+pci_nvme_dbbuf_config(uint64_t dbs_addr, uint64_t eis_addr) "dbs_addr=0x%"PRIx64" eis_addr=0x%"PRIx64""
 pci_nvme_map_addr(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64""
 pci_nvme_map_addr_cmb(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64""
 pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, int num_prps) "trans_len %"PRIu64" len %"PRIu32" prp1 0x%"PRIx64" prp2 0x%"PRIx64" num_prps %d"
@@ -83,6 +84,8 @@ pci_nvme_enqueue_event_noqueue(int queued) "queued %d"
 pci_nvme_enqueue_event_masked(uint8_t typ) "type 0x%"PRIx8""
 pci_nvme_no_outstanding_aers(void) "ignoring event; no outstanding AERs"
 pci_nvme_enqueue_req_completion(uint16_t cid, uint16_t cqid, uint32_t dw0, uint32_t dw1, uint16_t status) "cid %"PRIu16" cqid %"PRIu16" dw0 0x%"PRIx32" dw1 0x%"PRIx32" status 0x%"PRIx16""
+pci_nvme_eventidx_cq(uint16_t cqid, uint16_t new_eventidx) "cqid %"PRIu16" new_eventidx %"PRIu16""
+pci_nvme_eventidx_sq(uint16_t sqid, uint16_t new_eventidx) "sqid %"PRIu16" new_eventidx %"PRIu16""
 pci_nvme_mmio_read(uint64_t addr, unsigned size) "addr 0x%"PRIx64" size %d"
 pci_nvme_mmio_write(uint64_t addr, uint64_t data, unsigned size) "addr 0x%"PRIx64" data 0x%"PRIx64" size %d"
 pci_nvme_mmio_doorbell_cq(uint16_t cqid, uint16_t new_head) "cqid %"PRIu16" new_head %"PRIu16""
@@ -99,6 +102,8 @@ pci_nvme_mmio_start_success(void) "setting controller enable bit succeeded"
 pci_nvme_mmio_stopped(void) "cleared controller enable bit"
 pci_nvme_mmio_shutdown_set(void) "shutdown bit set"
 pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared"
+pci_nvme_shadow_doorbell_cq(uint16_t cqid, uint16_t new_shadow_doorbell) "cqid %"PRIu16" new_shadow_doorbell %"PRIu16""
+pci_nvme_shadow_doorbell_sq(uint16_t sqid, uint16_t new_shadow_doorbell) "sqid %"PRIu16" new_shadow_doorbell %"PRIu16""
 pci_nvme_open_zone(uint64_t slba, uint32_t zone_idx, int all) "open zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
 pci_nvme_close_zone(uint64_t slba, uint32_t zone_idx, int all) "close zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
 pci_nvme_finish_zone(uint64_t slba, uint32_t zone_idx, int all) "finish zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
--
cgit v1.2.3


From 146b5fa505fc21baa129c7538936fbdd9875b6ed Mon Sep 17 00:00:00 2001
From: Niklas Cassel
Date: Mon, 27 Jun 2022 14:39:57 +0200
Subject: hw/nvme: fix example serial in documentation

The serial prop on the controller actually describes the nvme subsystem
serial, which has to be identical for all controllers within the same
nvme subsystem. This has been enforced since commit a859eb9f8f64
("hw/nvme: enforce common serial per subsystem").

Fix the documentation so that people copying the qemu command line
example won't get an error on qemu start.

Signed-off-by: Niklas Cassel
Signed-off-by: Klaus Jensen
---
 docs/system/devices/nvme.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/system/devices/nvme.rst b/docs/system/devices/nvme.rst
index aba253304e..30f841ef62 100644
--- a/docs/system/devices/nvme.rst
+++ b/docs/system/devices/nvme.rst
@@ -104,8 +104,8 @@ multipath I/O.
 .. code-block:: console
 
    -device nvme-subsys,id=nvme-subsys-0,nqn=subsys0
-   -device nvme,serial=a,subsys=nvme-subsys-0
-   -device nvme,serial=b,subsys=nvme-subsys-0
+   -device nvme,serial=deadbeef,subsys=nvme-subsys-0
+   -device nvme,serial=deadbeef,subsys=nvme-subsys-0
 
 This will create an NVM subsystem with two controllers. Having controllers
 linked to an ``nvme-subsys`` device allows additional ``nvme-ns`` parameters:
--
cgit v1.2.3


From dfa82ac201af28c451271d3f1dc6827b431cd827 Mon Sep 17 00:00:00 2001
From: Niklas Cassel
Date: Tue, 28 Jun 2022 14:22:09 +0200
Subject: hw/nvme: force nvme-ns param 'shared' to false if no nvme-subsys node

Since commit 916b0f0b5264 ("hw/nvme: change nvme-ns 'shared' default")
the default value of the nvme-ns param 'shared' is set to true,
regardless of whether there is an nvme-subsys node or not.

On a system without an nvme-subsys node, a namespace can never be
attached to more than one controller, so for this configuration it is
counterintuitive for this parameter to be set by default.

Force the nvme-ns param 'shared' to false for configurations where
there is no nvme-subsys node, as the namespace will never be able to
attach to more than one controller anyway.

Signed-off-by: Niklas Cassel
Reviewed-by: Klaus Jensen
Signed-off-by: Klaus Jensen
---
 hw/nvme/ns.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
index 870c3ca1a2..62a1f97be0 100644
--- a/hw/nvme/ns.c
+++ b/hw/nvme/ns.c
@@ -546,6 +546,8 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
     int i;
 
     if (!n->subsys) {
+        /* If no subsys, the ns cannot be attached to more than one ctrl. */
+        ns->params.shared = false;
         if (ns->params.detached) {
             error_setg(errp, "detached requires that the nvme device is "
                        "linked to an nvme-subsys device");
--
cgit v1.2.3


From 43f76aac49c439ea79c125d1befd9d5d7057dbb4 Mon Sep 17 00:00:00 2001
From: Darren Kenny
Date: Thu, 7 Jul 2022 13:36:21 +0000
Subject: nvme: Fix misleading macro when mixed with ternary operator

Using the Parfait source code analyser, an issue was found in
hw/nvme/ctrl.c where the macros NVME_CAP_SET_CMBS and NVME_CAP_SET_PMRS
are called with a ternary operator in the second parameter, resulting
in a potentially unexpected expansion of the form:

  x ? a : b & FLAG_TEST

which yields a different result from:

  (x ? a : b) & FLAG_TEST

The macros should wrap each of the parameters in brackets to ensure the
correct result on expansion.

Signed-off-by: Darren Kenny
Reviewed-by: Stefan Hajnoczi
Reviewed-by: Klaus Jensen
Signed-off-by: Klaus Jensen
---
 include/block/nvme.h | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/include/block/nvme.h b/include/block/nvme.h
index 351fd44ca8..8027b7126b 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -98,28 +98,28 @@ enum NvmeCapMask {
 #define NVME_CAP_PMRS(cap) (((cap) >> CAP_PMRS_SHIFT) & CAP_PMRS_MASK)
 #define NVME_CAP_CMBS(cap) (((cap) >> CAP_CMBS_SHIFT) & CAP_CMBS_MASK)
 
-#define NVME_CAP_SET_MQES(cap, val) (cap |= (uint64_t)(val & CAP_MQES_MASK) \
-                                     << CAP_MQES_SHIFT)
-#define NVME_CAP_SET_CQR(cap, val) (cap |= (uint64_t)(val & CAP_CQR_MASK) \
-                                    << CAP_CQR_SHIFT)
-#define NVME_CAP_SET_AMS(cap, val) (cap |= (uint64_t)(val & CAP_AMS_MASK) \
-                                    << CAP_AMS_SHIFT)
-#define NVME_CAP_SET_TO(cap, val) (cap |= (uint64_t)(val & CAP_TO_MASK) \
-                                   << CAP_TO_SHIFT)
-#define NVME_CAP_SET_DSTRD(cap, val) (cap |= (uint64_t)(val & CAP_DSTRD_MASK) \
-                                      << CAP_DSTRD_SHIFT)
-#define NVME_CAP_SET_NSSRS(cap, val) (cap |= (uint64_t)(val & CAP_NSSRS_MASK) \
-                                      << CAP_NSSRS_SHIFT)
-#define NVME_CAP_SET_CSS(cap, val) (cap |= (uint64_t)(val & CAP_CSS_MASK) \
-                                    << CAP_CSS_SHIFT)
-#define NVME_CAP_SET_MPSMIN(cap, val) (cap |= (uint64_t)(val & CAP_MPSMIN_MASK)\
-                                       << CAP_MPSMIN_SHIFT)
-#define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\
-                                       << CAP_MPSMAX_SHIFT)
-#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMRS_MASK) \
-                                     << CAP_PMRS_SHIFT)
-#define NVME_CAP_SET_CMBS(cap, val) (cap |= (uint64_t)(val & CAP_CMBS_MASK) \
-                                     << CAP_CMBS_SHIFT)
+#define NVME_CAP_SET_MQES(cap, val) \
+    ((cap) |= (uint64_t)((val) & CAP_MQES_MASK) << CAP_MQES_SHIFT)
+#define NVME_CAP_SET_CQR(cap, val) \
+    ((cap) |= (uint64_t)((val) & CAP_CQR_MASK) << CAP_CQR_SHIFT)
+#define NVME_CAP_SET_AMS(cap, val) \
+    ((cap) |= (uint64_t)((val) & CAP_AMS_MASK) << CAP_AMS_SHIFT)
+#define NVME_CAP_SET_TO(cap, val) \
+    ((cap) |= (uint64_t)((val) & CAP_TO_MASK) << CAP_TO_SHIFT)
+#define NVME_CAP_SET_DSTRD(cap, val) \
+    ((cap) |= (uint64_t)((val) & CAP_DSTRD_MASK) << CAP_DSTRD_SHIFT)
+#define NVME_CAP_SET_NSSRS(cap, val) \
+    ((cap) |= (uint64_t)((val) & CAP_NSSRS_MASK) << CAP_NSSRS_SHIFT)
+#define NVME_CAP_SET_CSS(cap, val) \
+    ((cap) |= (uint64_t)((val) & CAP_CSS_MASK) << CAP_CSS_SHIFT)
+#define NVME_CAP_SET_MPSMIN(cap, val) \
+    ((cap) |= (uint64_t)((val) & CAP_MPSMIN_MASK) << CAP_MPSMIN_SHIFT)
+#define NVME_CAP_SET_MPSMAX(cap, val) \
+    ((cap) |= (uint64_t)((val) & CAP_MPSMAX_MASK) << CAP_MPSMAX_SHIFT)
+#define NVME_CAP_SET_PMRS(cap, val) \
+    ((cap) |= (uint64_t)((val) & CAP_PMRS_MASK) << CAP_PMRS_SHIFT)
+#define NVME_CAP_SET_CMBS(cap, val) \
+    ((cap) |= (uint64_t)((val) & CAP_CMBS_MASK) << CAP_CMBS_SHIFT)
 
 enum NvmeCapCss {
     NVME_CAP_CSS_NVM = 1 << 0,
--
cgit v1.2.3


From 2e53b0b450246044efd27418c5d05ad6919deb87 Mon Sep 17 00:00:00 2001
From: Jinhao Fan
Date: Tue, 5 Jul 2022 22:24:03 +0800
Subject: hw/nvme: Use ioeventfd to handle doorbell updates

Add a property "ioeventfd", which is enabled by default. When this is
enabled, updates on the doorbell registers will cause KVM to signal an
event to the QEMU main loop to handle the doorbell updates. Therefore,
instead of letting the vcpu thread run both guest VM and IO emulation,
we now use the main loop thread to do IO emulation, and thus the vcpu
thread has more cycles for the guest VM.

Since ioeventfd does not tell us the exact value that is written, it is
only useful when the shadow doorbell buffer is enabled, where we check
for the value in the shadow doorbell buffer when we get the doorbell
update event.

IOPS comparison on Linux 5.19-rc2 (unit: KIOPS):

  qd         1   4  16  64
  qemu      35 121 176 153
  ioeventfd 41 133 258 313

Changes since v3:
- Do not deregister ioeventfd when it was not enabled on a SQ/CQ

Signed-off-by: Jinhao Fan
Reviewed-by: Klaus Jensen
Signed-off-by: Klaus Jensen
---
 hw/nvme/ctrl.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 hw/nvme/nvme.h |   5 +++
 2 files changed, 117 insertions(+), 1 deletion(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 55cb0ba1d5..533ad14e7a 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1400,7 +1400,14 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
 
     QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
     QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
-    timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
+
+    if (req->sq->ioeventfd_enabled) {
+        /* Post CQE directly since we are in main loop thread */
+        nvme_post_cqes(cq);
+    } else {
+        /* Schedule the timer to post CQE later since we are in vcpu thread */
+        timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
+    }
 }
 
 static void nvme_process_aers(void *opaque)
@@ -4226,10 +4233,82 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
     return NVME_INVALID_OPCODE | NVME_DNR;
 }
 
+static void nvme_cq_notifier(EventNotifier *e)
+{
+    NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
+    NvmeCtrl *n = cq->ctrl;
+
+    event_notifier_test_and_clear(&cq->notifier);
+
+    nvme_update_cq_head(cq);
+
+    if (cq->tail == cq->head) {
+        if (cq->irq_enabled) {
+            n->cq_pending--;
+        }
+
+        nvme_irq_deassert(n, cq);
+    }
+
+    nvme_post_cqes(cq);
+}
+
+static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
+{
+    NvmeCtrl *n = cq->ctrl;
+    uint16_t offset = (cq->cqid << 3) + (1 << 2);
+    int ret;
+
+    ret = event_notifier_init(&cq->notifier, 0);
+    if (ret < 0) {
+        return ret;
+    }
+
+    event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
+    memory_region_add_eventfd(&n->iomem,
+                              0x1000 + offset, 4, false, 0, &cq->notifier);
+
+    return 0;
+}
+
+static void nvme_sq_notifier(EventNotifier *e)
+{
+    NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
+
+    event_notifier_test_and_clear(&sq->notifier);
+
+    nvme_process_sq(sq);
+}
+
+static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
+{
+    NvmeCtrl *n = sq->ctrl;
+    uint16_t offset = sq->sqid << 3;
+    int ret;
+
+    ret = event_notifier_init(&sq->notifier, 0);
+    if (ret < 0) {
+        return ret;
+    }
+
+    event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
+    memory_region_add_eventfd(&n->iomem,
+                              0x1000 + offset, 4, false, 0, &sq->notifier);
+
+    return 0;
+}
+
 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
 {
+    uint16_t offset = sq->sqid << 3;
+
     n->sq[sq->sqid] = NULL;
     timer_free(sq->timer);
+    if (sq->ioeventfd_enabled) {
+        memory_region_del_eventfd(&n->iomem,
+                                  0x1000 + offset, 4, false, 0, &sq->notifier);
+        event_notifier_cleanup(&sq->notifier);
+    }
     g_free(sq->io_req);
     if (sq->sqid) {
         g_free(sq);
@@ -4302,6 +4381,12 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
     if (n->dbbuf_enabled) {
         sq->db_addr = n->dbbuf_dbs + (sqid << 3);
         sq->ei_addr = n->dbbuf_eis + (sqid << 3);
+
+        if (n->params.ioeventfd && sq->sqid != 0) {
+            if (!nvme_init_sq_ioeventfd(sq)) {
+                sq->ioeventfd_enabled = true;
+            }
+        }
     }
 
     assert(n->cq[cqid]);
@@ -4605,8 +4690,15 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
 
 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
 {
+    uint16_t offset = (cq->cqid << 3) + (1 << 2);
+
     n->cq[cq->cqid] = NULL;
     timer_free(cq->timer);
+    if (cq->ioeventfd_enabled) {
+        memory_region_del_eventfd(&n->iomem,
+                                  0x1000 + offset, 4, false, 0, &cq->notifier);
+        event_notifier_cleanup(&cq->notifier);
+    }
     if (msix_enabled(&n->parent_obj)) {
         msix_vector_unuse(&n->parent_obj, cq->vector);
     }
@@ -4665,6 +4757,12 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
     if (n->dbbuf_enabled) {
         cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
         cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
+
+        if (n->params.ioeventfd && cqid != 0) {
+            if (!nvme_init_cq_ioeventfd(cq)) {
+                cq->ioeventfd_enabled = true;
+            }
+        }
     }
     n->cq[cqid] = cq;
     cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
@@ -6039,6 +6137,12 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
             sq->ei_addr = eis_addr + (i << 3);
             pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
                           sizeof(sq->tail));
+
+            if (n->params.ioeventfd && sq->sqid != 0) {
+                if (!nvme_init_sq_ioeventfd(sq)) {
+                    sq->ioeventfd_enabled = true;
+                }
+            }
         }
 
         if (cq) {
@@ -6047,6 +6151,12 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
             cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
             pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
                           sizeof(cq->head));
+
+            if (n->params.ioeventfd && cq->cqid != 0) {
+                if (!nvme_init_cq_ioeventfd(cq)) {
+                    cq->ioeventfd_enabled = true;
+                }
+            }
         }
     }
 
@@ -7554,6 +7664,7 @@ static Property nvme_props[] = {
     DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
     DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
     DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
+    DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, true),
     DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
     DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
                      params.auto_transition_zones, true),
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 0711b9748c..79f5c281c2 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -376,6 +376,8 @@ typedef struct NvmeSQueue {
     uint64_t db_addr;
     uint64_t ei_addr;
     QEMUTimer *timer;
+    EventNotifier notifier;
+    bool ioeventfd_enabled;
     NvmeRequest *io_req;
     QTAILQ_HEAD(, NvmeRequest) req_list;
     QTAILQ_HEAD(, NvmeRequest) out_req_list;
@@ -395,6 +397,8 @@ typedef struct NvmeCQueue {
     uint64_t db_addr;
     uint64_t ei_addr;
     QEMUTimer *timer;
+    EventNotifier notifier;
+    bool ioeventfd_enabled;
     QTAILQ_HEAD(, NvmeSQueue) sq_list;
     QTAILQ_HEAD(, NvmeRequest) req_list;
 } NvmeCQueue;
@@ -417,6 +421,7 @@ typedef struct NvmeParams {
     uint8_t zasl;
     bool auto_transition_zones;
     bool legacy_cmb;
+    bool ioeventfd;
     uint8_t sriov_max_vfs;
     uint16_t sriov_vq_flexible;
     uint16_t sriov_vi_flexible;
--
cgit v1.2.3
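The EventIdx protocol that nvme_update_sq_eventidx() and
nvme_update_sq_tail() maintain on the device side has a host-side
counterpart: after writing a new tail to the shadow doorbell buffer, the
host compares the device-published EventIdx against the old and new tail
values to decide whether an MMIO write to the real doorbell register is
still required. A minimal host-side sketch of that check follows; the
helper name is illustrative (the Linux driver implements the equivalent
logic in its nvme_dbbuf_need_event helper), and the comparison uses
modulo-2^16 queue arithmetic:

    #include <stdbool.h>
    #include <stdint.h>

    /*
     * Host-side sketch: returns true if the doorbell register must still
     * be written, i.e. if the device-advertised event_idx lies in the
     * window [old_idx, new_idx) that the tail just moved across. The
     * uint16_t casts make the subtraction wrap correctly when the queue
     * indices roll over.
     */
    static inline bool shadow_db_need_event(uint16_t event_idx,
                                            uint16_t new_idx,
                                            uint16_t old_idx)
    {
        return (uint16_t)(new_idx - event_idx - 1) <
               (uint16_t)(new_idx - old_idx);
    }

When this check returns false, the host elides the MMIO doorbell write
entirely, which is why the device proactively publishes a fresh EventIdx
in nvme_process_sq() before re-reading the shadow tail.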