about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- MAINTAINERS | 2
-rw-r--r-- accel/kvm/kvm-all.c | 6
-rw-r--r-- block/nvme.c | 209
-rw-r--r-- block/trace-events | 30
-rw-r--r-- include/block/nvme.h | 18
-rw-r--r-- softmmu/memory.c | 11
-rw-r--r-- util/trace-events | 10
-rw-r--r-- util/vfio-helpers.c | 43
8 files changed, 195 insertions, 134 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 2e018a0c1d..e47bf8cf28 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1882,6 +1882,7 @@ M: Klaus Jensen <its@irrelevant.dk>
L: qemu-block@nongnu.org
S: Supported
F: hw/block/nvme*
+F: include/block/nvme.h
F: tests/qtest/nvme-test.c
F: docs/specs/nvme.txt
T: git git://git.infradead.org/qemu-nvme.git nvme-next
@@ -2972,6 +2973,7 @@ R: Fam Zheng <fam@euphon.net>
L: qemu-block@nongnu.org
S: Supported
F: block/nvme*
+F: include/block/nvme.h
T: git https://github.com/stefanha/qemu.git block
Bootdevice
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 9ef5daf4c5..baaa54249d 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -2239,8 +2239,10 @@ static int kvm_init(MachineState *ms)
kvm_memory_listener_register(s, &s->memory_listener,
&address_space_memory, 0);
- memory_listener_register(&kvm_io_listener,
- &address_space_io);
+ if (kvm_eventfds_allowed) {
+ memory_listener_register(&kvm_io_listener,
+ &address_space_io);
+ }
memory_listener_register(&kvm_coalesced_pio_listener,
&address_space_io);
diff --git a/block/nvme.c b/block/nvme.c
index 739a0a700c..a06a188d53 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -41,6 +41,16 @@
typedef struct BDRVNVMeState BDRVNVMeState;
+/* Same index is used for queues and IRQs */
+#define INDEX_ADMIN 0
+#define INDEX_IO(n) (1 + n)
+
+/* This driver shares a single MSIX IRQ for the admin and I/O queues */
+enum {
+ MSIX_SHARED_IRQ_IDX = 0,
+ MSIX_IRQ_COUNT = 1
+};
+
typedef struct {
int32_t head, tail;
uint8_t *queue;
@@ -81,18 +91,10 @@ typedef struct {
QEMUBH *completion_bh;
} NVMeQueuePair;
-#define INDEX_ADMIN 0
-#define INDEX_IO(n) (1 + n)
-
-/* This driver shares a single MSIX IRQ for the admin and I/O queues */
-enum {
- MSIX_SHARED_IRQ_IDX = 0,
- MSIX_IRQ_COUNT = 1
-};
-
struct BDRVNVMeState {
AioContext *aio_context;
QEMUVFIOState *vfio;
+ void *bar0_wo_map;
/* Memory mapped registers */
volatile struct {
uint32_t sq_tail;
@@ -103,7 +105,7 @@ struct BDRVNVMeState {
* [1..]: io queues.
*/
NVMeQueuePair **queues;
- int nr_queues;
+ unsigned queue_count;
size_t page_size;
/* How many uint32_t elements does each doorbell entry take. */
size_t doorbell_scale;
@@ -159,28 +161,32 @@ static QemuOptsList runtime_opts = {
},
};
-static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
- int nentries, int entry_bytes, Error **errp)
+/* Returns true on success, false on failure. */
+static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
+ unsigned nentries, size_t entry_bytes, Error **errp)
{
size_t bytes;
int r;
- bytes = ROUND_UP(nentries * entry_bytes, s->page_size);
+ bytes = ROUND_UP(nentries * entry_bytes, qemu_real_host_page_size);
q->head = q->tail = 0;
- q->queue = qemu_try_memalign(s->page_size, bytes);
+ q->queue = qemu_try_memalign(qemu_real_host_page_size, bytes);
if (!q->queue) {
error_setg(errp, "Cannot allocate queue");
- return;
+ return false;
}
memset(q->queue, 0, bytes);
r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova);
if (r) {
error_setg(errp, "Cannot map queue");
+ return false;
}
+ return true;
}
static void nvme_free_queue_pair(NVMeQueuePair *q)
{
+ trace_nvme_free_queue_pair(q->index, q);
if (q->completion_bh) {
qemu_bh_delete(q->completion_bh);
}
@@ -204,31 +210,33 @@ static void nvme_free_req_queue_cb(void *opaque)
static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
AioContext *aio_context,
- int idx, int size,
+ unsigned idx, size_t size,
Error **errp)
{
int i, r;
- Error *local_err = NULL;
NVMeQueuePair *q;
uint64_t prp_list_iova;
+ size_t bytes;
q = g_try_new0(NVMeQueuePair, 1);
if (!q) {
return NULL;
}
- q->prp_list_pages = qemu_try_memalign(s->page_size,
- s->page_size * NVME_NUM_REQS);
+ trace_nvme_create_queue_pair(idx, q, size, aio_context,
+ event_notifier_get_fd(s->irq_notifier));
+ bytes = QEMU_ALIGN_UP(s->page_size * NVME_NUM_REQS,
+ qemu_real_host_page_size);
+ q->prp_list_pages = qemu_try_memalign(qemu_real_host_page_size, bytes);
if (!q->prp_list_pages) {
goto fail;
}
- memset(q->prp_list_pages, 0, s->page_size * NVME_NUM_REQS);
+ memset(q->prp_list_pages, 0, bytes);
qemu_mutex_init(&q->lock);
q->s = s;
q->index = idx;
qemu_co_queue_init(&q->free_req_queue);
q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, q);
- r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
- s->page_size * NVME_NUM_REQS,
+ r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, bytes,
false, &prp_list_iova);
if (r) {
goto fail;
@@ -243,16 +251,12 @@ static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
req->prp_list_iova = prp_list_iova + i * s->page_size;
}
- nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
- if (local_err) {
- error_propagate(errp, local_err);
+ if (!nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, errp)) {
goto fail;
}
q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail;
- nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err);
- if (local_err) {
- error_propagate(errp, local_err);
+ if (!nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, errp)) {
goto fail;
}
q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head;
@@ -292,7 +296,7 @@ static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
while (q->free_req_head == -1) {
if (qemu_in_coroutine()) {
- trace_nvme_free_req_queue_wait(q);
+ trace_nvme_free_req_queue_wait(q->s, q->index);
qemu_co_queue_wait(&q->free_req_queue, &q->lock);
} else {
qemu_mutex_unlock(&q->lock);
@@ -399,8 +403,8 @@ static bool nvme_process_completion(NVMeQueuePair *q)
}
cid = le16_to_cpu(c->cid);
if (cid == 0 || cid > NVME_QUEUE_SIZE) {
- fprintf(stderr, "Unexpected CID in completion queue: %" PRIu32 "\n",
- cid);
+ warn_report("NVMe: Unexpected CID in completion queue: %"PRIu32", "
+ "queue size: %u", cid, NVME_QUEUE_SIZE);
continue;
}
trace_nvme_complete_command(s, q->index, cid);
@@ -465,7 +469,7 @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
assert(!req->cb);
req->cb = cb;
req->opaque = opaque;
- cmd->cid = cpu_to_le32(req->cid);
+ cmd->cid = cpu_to_le16(req->cid);
trace_nvme_submit_command(q->s, q->index, req->cid);
nvme_trace_command(cmd);
@@ -479,16 +483,17 @@ static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
qemu_mutex_unlock(&q->lock);
}
-static void nvme_cmd_sync_cb(void *opaque, int ret)
+static void nvme_admin_cmd_sync_cb(void *opaque, int ret)
{
int *pret = opaque;
*pret = ret;
aio_wait_kick();
}
-static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
- NvmeCmd *cmd)
+static int nvme_admin_cmd_sync(BlockDriverState *bs, NvmeCmd *cmd)
{
+ BDRVNVMeState *s = bs->opaque;
+ NVMeQueuePair *q = s->queues[INDEX_ADMIN];
AioContext *aio_context = bdrv_get_aio_context(bs);
NVMeRequest *req;
int ret = -EINPROGRESS;
@@ -496,15 +501,17 @@ static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
if (!req) {
return -EBUSY;
}
- nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret);
+ nvme_submit_command(q, req, cmd, nvme_admin_cmd_sync_cb, &ret);
AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS);
return ret;
}
-static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
+/* Returns true on success, false on failure. */
+static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
{
BDRVNVMeState *s = bs->opaque;
+ bool ret = false;
union {
NvmeIdCtrl ctrl;
NvmeIdNs ns;
@@ -517,21 +524,22 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
.opcode = NVME_ADM_CMD_IDENTIFY,
.cdw10 = cpu_to_le32(0x1),
};
+ size_t id_size = QEMU_ALIGN_UP(sizeof(*id), qemu_real_host_page_size);
- id = qemu_try_memalign(s->page_size, sizeof(*id));
+ id = qemu_try_memalign(qemu_real_host_page_size, id_size);
if (!id) {
error_setg(errp, "Cannot allocate buffer for identify response");
goto out;
}
- r = qemu_vfio_dma_map(s->vfio, id, sizeof(*id), true, &iova);
+ r = qemu_vfio_dma_map(s->vfio, id, id_size, true, &iova);
if (r) {
error_setg(errp, "Cannot map buffer for DMA");
goto out;
}
- memset(id, 0, sizeof(*id));
+ memset(id, 0, id_size);
cmd.dptr.prp1 = cpu_to_le64(iova);
- if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
+ if (nvme_admin_cmd_sync(bs, &cmd)) {
error_setg(errp, "Failed to identify controller");
goto out;
}
@@ -551,10 +559,10 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES);
s->supports_discard = !!(oncs & NVME_ONCS_DSM);
- memset(id, 0, sizeof(*id));
+ memset(id, 0, id_size);
cmd.cdw10 = 0;
cmd.nsid = cpu_to_le32(namespace);
- if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
+ if (nvme_admin_cmd_sync(bs, &cmd)) {
error_setg(errp, "Failed to identify namespace");
goto out;
}
@@ -581,10 +589,13 @@ static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
goto out;
}
+ ret = true;
s->blkshift = lbaf->ds;
out:
qemu_vfio_dma_unmap(s->vfio, id);
qemu_vfree(id);
+
+ return ret;
}
static bool nvme_poll_queue(NVMeQueuePair *q)
@@ -594,6 +605,7 @@ static bool nvme_poll_queue(NVMeQueuePair *q)
const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
+ trace_nvme_poll_queue(q->s, q->index);
/*
* Do an early check for completions. q->lock isn't needed because
* nvme_process_completion() only runs in the event loop thread and
@@ -618,7 +630,7 @@ static bool nvme_poll_queues(BDRVNVMeState *s)
bool progress = false;
int i;
- for (i = 0; i < s->nr_queues; i++) {
+ for (i = 0; i < s->queue_count; i++) {
if (nvme_poll_queue(s->queues[i])) {
progress = true;
}
@@ -639,11 +651,12 @@ static void nvme_handle_event(EventNotifier *n)
static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
{
BDRVNVMeState *s = bs->opaque;
- int n = s->nr_queues;
+ unsigned n = s->queue_count;
NVMeQueuePair *q;
NvmeCmd cmd;
- int queue_size = NVME_QUEUE_SIZE;
+ unsigned queue_size = NVME_QUEUE_SIZE;
+ assert(n <= UINT16_MAX);
q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs),
n, queue_size, errp);
if (!q) {
@@ -652,26 +665,26 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
cmd = (NvmeCmd) {
.opcode = NVME_ADM_CMD_CREATE_CQ,
.dptr.prp1 = cpu_to_le64(q->cq.iova),
- .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
- .cdw11 = cpu_to_le32(0x3),
+ .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
+ .cdw11 = cpu_to_le32(NVME_CQ_IEN | NVME_CQ_PC),
};
- if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
- error_setg(errp, "Failed to create CQ io queue [%d]", n);
+ if (nvme_admin_cmd_sync(bs, &cmd)) {
+ error_setg(errp, "Failed to create CQ io queue [%u]", n);
goto out_error;
}
cmd = (NvmeCmd) {
.opcode = NVME_ADM_CMD_CREATE_SQ,
.dptr.prp1 = cpu_to_le64(q->sq.iova),
- .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
- .cdw11 = cpu_to_le32(0x1 | (n << 16)),
+ .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
+ .cdw11 = cpu_to_le32(NVME_SQ_PC | (n << 16)),
};
- if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd)) {
- error_setg(errp, "Failed to create SQ io queue [%d]", n);
+ if (nvme_admin_cmd_sync(bs, &cmd)) {
+ error_setg(errp, "Failed to create SQ io queue [%u]", n);
goto out_error;
}
s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
s->queues[n] = q;
- s->nr_queues++;
+ s->queue_count++;
return true;
out_error:
nvme_free_queue_pair(q);
@@ -684,7 +697,6 @@ static bool nvme_poll_cb(void *opaque)
BDRVNVMeState *s = container_of(e, BDRVNVMeState,
irq_notifier[MSIX_SHARED_IRQ_IDX]);
- trace_nvme_poll_cb(s);
return nvme_poll_queues(s);
}
@@ -692,12 +704,12 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
Error **errp)
{
BDRVNVMeState *s = bs->opaque;
+ NVMeQueuePair *q;
AioContext *aio_context = bdrv_get_aio_context(bs);
int ret;
uint64_t cap;
uint64_t timeout_ms;
uint64_t deadline, now;
- Error *local_err = NULL;
volatile NvmeBar *regs = NULL;
qemu_co_mutex_init(&s->dma_map_lock);
@@ -727,15 +739,29 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
* Initialization". */
cap = le64_to_cpu(regs->cap);
+ trace_nvme_controller_capability_raw(cap);
+ trace_nvme_controller_capability("Maximum Queue Entries Supported",
+ 1 + NVME_CAP_MQES(cap));
+ trace_nvme_controller_capability("Contiguous Queues Required",
+ NVME_CAP_CQR(cap));
+ trace_nvme_controller_capability("Doorbell Stride",
+ 2 << (2 + NVME_CAP_DSTRD(cap)));
+ trace_nvme_controller_capability("Subsystem Reset Supported",
+ NVME_CAP_NSSRS(cap));
+ trace_nvme_controller_capability("Memory Page Size Minimum",
+ 1 << (12 + NVME_CAP_MPSMIN(cap)));
+ trace_nvme_controller_capability("Memory Page Size Maximum",
+ 1 << (12 + NVME_CAP_MPSMAX(cap)));
if (!NVME_CAP_CSS(cap)) {
error_setg(errp, "Device doesn't support NVMe command set");
ret = -EINVAL;
goto out;
}
- s->page_size = MAX(4096, 1 << NVME_CAP_MPSMIN(cap));
+ s->page_size = 1u << (12 + NVME_CAP_MPSMIN(cap));
s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t);
bs->bl.opt_mem_alignment = s->page_size;
+ bs->bl.request_alignment = s->page_size;
timeout_ms = MIN(500 * NVME_CAP_TO(cap), 30000);
/* Reset device to get a clean state. */
@@ -752,8 +778,10 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
}
}
- s->doorbells = qemu_vfio_pci_map_bar(s->vfio, 0, sizeof(NvmeBar),
- NVME_DOORBELL_SIZE, PROT_WRITE, errp);
+ s->bar0_wo_map = qemu_vfio_pci_map_bar(s->vfio, 0, 0,
+ sizeof(NvmeBar) + NVME_DOORBELL_SIZE,
+ PROT_WRITE, errp);
+ s->doorbells = (void *)((uintptr_t)s->bar0_wo_map + sizeof(NvmeBar));
if (!s->doorbells) {
ret = -EINVAL;
goto out;
@@ -761,19 +789,18 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
/* Set up admin queue. */
s->queues = g_new(NVMeQueuePair *, 1);
- s->queues[INDEX_ADMIN] = nvme_create_queue_pair(s, aio_context, 0,
- NVME_QUEUE_SIZE,
- errp);
- if (!s->queues[INDEX_ADMIN]) {
+ q = nvme_create_queue_pair(s, aio_context, 0, NVME_QUEUE_SIZE, errp);
+ if (!q) {
ret = -EINVAL;
goto out;
}
- s->nr_queues = 1;
- QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000);
- regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << AQA_ACQS_SHIFT) |
- (NVME_QUEUE_SIZE << AQA_ASQS_SHIFT));
- regs->asq = cpu_to_le64(s->queues[INDEX_ADMIN]->sq.iova);
- regs->acq = cpu_to_le64(s->queues[INDEX_ADMIN]->cq.iova);
+ s->queues[INDEX_ADMIN] = q;
+ s->queue_count = 1;
+ QEMU_BUILD_BUG_ON((NVME_QUEUE_SIZE - 1) & 0xF000);
+ regs->aqa = cpu_to_le32(((NVME_QUEUE_SIZE - 1) << AQA_ACQS_SHIFT) |
+ ((NVME_QUEUE_SIZE - 1) << AQA_ASQS_SHIFT));
+ regs->asq = cpu_to_le64(q->sq.iova);
+ regs->acq = cpu_to_le64(q->cq.iova);
/* After setting up all control registers we can enable device now. */
regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << CC_IOCQES_SHIFT) |
@@ -801,9 +828,7 @@ static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
&s->irq_notifier[MSIX_SHARED_IRQ_IDX],
false, nvme_handle_event, nvme_poll_cb);
- nvme_identify(bs, namespace, &local_err);
- if (local_err) {
- error_propagate(errp, local_err);
+ if (!nvme_identify(bs, namespace, errp)) {
ret = -EIO;
goto out;
}
@@ -869,7 +894,7 @@ static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
.cdw11 = cpu_to_le32(enable ? 0x01 : 0x00),
};
- ret = nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], &cmd);
+ ret = nvme_admin_cmd_sync(bs, &cmd);
if (ret) {
error_setg(errp, "Failed to configure NVMe write cache");
}
@@ -878,10 +903,9 @@ static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
static void nvme_close(BlockDriverState *bs)
{
- int i;
BDRVNVMeState *s = bs->opaque;
- for (i = 0; i < s->nr_queues; ++i) {
+ for (unsigned i = 0; i < s->queue_count; ++i) {
nvme_free_queue_pair(s->queues[i]);
}
g_free(s->queues);
@@ -889,8 +913,8 @@ static void nvme_close(BlockDriverState *bs)
&s->irq_notifier[MSIX_SHARED_IRQ_IDX],
false, NULL, NULL);
event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
- qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->doorbells,
- sizeof(NvmeBar), NVME_DOORBELL_SIZE);
+ qemu_vfio_pci_unmap_bar(s->vfio, 0, s->bar0_wo_map,
+ 0, sizeof(NvmeBar) + NVME_DOORBELL_SIZE);
qemu_vfio_close(s->vfio);
g_free(s->device);
@@ -994,11 +1018,12 @@ static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd,
for (i = 0; i < qiov->niov; ++i) {
bool retry = true;
uint64_t iova;
+ size_t len = QEMU_ALIGN_UP(qiov->iov[i].iov_len,
+ qemu_real_host_page_size);
try_map:
r = qemu_vfio_dma_map(s->vfio,
qiov->iov[i].iov_base,
- qiov->iov[i].iov_len,
- true, &iova);
+ len, true, &iova);
if (r == -ENOMEM && retry) {
retry = false;
trace_nvme_dma_flush_queue_wait(s);
@@ -1106,7 +1131,7 @@ static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
};
trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov);
- assert(s->nr_queues > 1);
+ assert(s->queue_count > 1);
req = nvme_get_free_req(ioq);
assert(req);
@@ -1142,8 +1167,9 @@ static inline bool nvme_qiov_aligned(BlockDriverState *bs,
BDRVNVMeState *s = bs->opaque;
for (i = 0; i < qiov->niov; ++i) {
- if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base, s->page_size) ||
- !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, s->page_size)) {
+ if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base,
+ qemu_real_host_page_size) ||
+ !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, qemu_real_host_page_size)) {
trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base,
qiov->iov[i].iov_len, s->page_size);
return false;
@@ -1159,7 +1185,7 @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
int r;
uint8_t *buf = NULL;
QEMUIOVector local_qiov;
-
+ size_t len = QEMU_ALIGN_UP(bytes, qemu_real_host_page_size);
assert(QEMU_IS_ALIGNED(offset, s->page_size));
assert(QEMU_IS_ALIGNED(bytes, s->page_size));
assert(bytes <= s->max_transfer);
@@ -1169,7 +1195,7 @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
}
s->stats.unaligned_accesses++;
trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
- buf = qemu_try_memalign(s->page_size, bytes);
+ buf = qemu_try_memalign(qemu_real_host_page_size, len);
if (!buf) {
return -ENOMEM;
@@ -1216,7 +1242,7 @@ static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
.ret = -EINPROGRESS,
};
- assert(s->nr_queues > 1);
+ assert(s->queue_count > 1);
req = nvme_get_free_req(ioq);
assert(req);
nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
@@ -1268,7 +1294,7 @@ static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
cmd.cdw12 = cpu_to_le32(cdw12);
trace_nvme_write_zeroes(s, offset, bytes, flags);
- assert(s->nr_queues > 1);
+ assert(s->queue_count > 1);
req = nvme_get_free_req(ioq);
assert(req);
@@ -1311,7 +1337,7 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
return -ENOTSUP;
}
- assert(s->nr_queues > 1);
+ assert(s->queue_count > 1);
buf = qemu_try_memalign(s->page_size, s->page_size);
if (!buf) {
@@ -1391,7 +1417,7 @@ static void nvme_detach_aio_context(BlockDriverState *bs)
{
BDRVNVMeState *s = bs->opaque;
- for (int i = 0; i < s->nr_queues; i++) {
+ for (unsigned i = 0; i < s->queue_count; i++) {
NVMeQueuePair *q = s->queues[i];
qemu_bh_delete(q->completion_bh);
@@ -1412,7 +1438,7 @@ static void nvme_attach_aio_context(BlockDriverState *bs,
aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
false, nvme_handle_event, nvme_poll_cb);
- for (int i = 0; i < s->nr_queues; i++) {
+ for (unsigned i = 0; i < s->queue_count; i++) {
NVMeQueuePair *q = s->queues[i];
q->completion_bh =
@@ -1429,11 +1455,10 @@ static void nvme_aio_plug(BlockDriverState *bs)
static void nvme_aio_unplug(BlockDriverState *bs)
{
- int i;
BDRVNVMeState *s = bs->opaque;
assert(s->plugged);
s->plugged = false;
- for (i = INDEX_IO(0); i < s->nr_queues; i++) {
+ for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) {
NVMeQueuePair *q = s->queues[i];
qemu_mutex_lock(&q->lock);
nvme_kick(q);
diff --git a/block/trace-events b/block/trace-events
index 0e351c3fa3..8368f4acb0 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -134,25 +134,29 @@ qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, uint64_t
qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t len) "s %p acb %p ret %d offset %"PRIu64" len %zu"
# nvme.c
-nvme_kick(void *s, int queue) "s %p queue %d"
+nvme_controller_capability_raw(uint64_t value) "0x%08"PRIx64
+nvme_controller_capability(const char *desc, uint64_t value) "%s: %"PRIu64
+nvme_kick(void *s, unsigned q_index) "s %p q #%u"
nvme_dma_flush_queue_wait(void *s) "s %p"
nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x"
-nvme_process_completion(void *s, int index, int inflight) "s %p queue %d inflight %d"
-nvme_process_completion_queue_plugged(void *s, int index) "s %p queue %d"
-nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d"
-nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d"
+nvme_process_completion(void *s, unsigned q_index, int inflight) "s %p q #%u inflight %d"
+nvme_process_completion_queue_plugged(void *s, unsigned q_index) "s %p q #%u"
+nvme_complete_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
+nvme_submit_command(void *s, unsigned q_index, int cid) "s %p q #%u cid %d"
nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x"
nvme_handle_event(void *s) "s %p"
-nvme_poll_cb(void *s) "s %p"
-nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset %"PRId64" bytes %"PRId64" flags %d niov %d"
-nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset %"PRId64" bytes %"PRId64" flags %d"
+nvme_poll_queue(void *s, unsigned q_index) "s %p q #%u"
+nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset 0x%"PRIx64" bytes %"PRId64" flags %d niov %d"
+nvme_write_zeroes(void *s, uint64_t offset, uint64_t bytes, int flags) "s %p offset 0x%"PRIx64" bytes %"PRId64" flags %d"
nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x"
-nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset %"PRId64" bytes %"PRId64" niov %d is_write %d"
-nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset %"PRId64" bytes %"PRId64" ret %d"
-nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset %"PRId64" bytes %"PRId64""
-nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset %"PRId64" bytes %"PRId64" ret %d"
+nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset 0x%"PRIx64" bytes %"PRId64" niov %d is_write %d"
+nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset 0x%"PRIx64" bytes %"PRId64" ret %d"
+nvme_dsm(void *s, uint64_t offset, uint64_t bytes) "s %p offset 0x%"PRIx64" bytes %"PRId64""
+nvme_dsm_done(void *s, uint64_t offset, uint64_t bytes, int ret) "s %p offset 0x%"PRIx64" bytes %"PRId64" ret %d"
nvme_dma_map_flush(void *s) "s %p"
-nvme_free_req_queue_wait(void *q) "q %p"
+nvme_free_req_queue_wait(void *s, unsigned q_index) "s %p q #%u"
+nvme_create_queue_pair(unsigned q_index, void *q, unsigned size, void *aio_context, int fd) "index %u q %p size %u aioctx %p fd %d"
+nvme_free_queue_pair(unsigned q_index, void *q) "index %u q %p"
nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d"
nvme_cmd_map_qiov_pages(void *s, int i, uint64_t page) "s %p page[%d] 0x%"PRIx64
nvme_cmd_map_qiov_iov(void *s, int i, void *page, int pages) "s %p iov[%d] %p pages %d"
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 8a46d9cf01..3e02d9ca98 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -501,6 +501,11 @@ typedef struct QEMU_PACKED NvmeCreateCq {
#define NVME_CQ_FLAGS_PC(cq_flags) (cq_flags & 0x1)
#define NVME_CQ_FLAGS_IEN(cq_flags) ((cq_flags >> 1) & 0x1)
+enum NvmeFlagsCq {
+ NVME_CQ_PC = 1,
+ NVME_CQ_IEN = 2,
+};
+
typedef struct QEMU_PACKED NvmeCreateSq {
uint8_t opcode;
uint8_t flags;
@@ -518,12 +523,13 @@ typedef struct QEMU_PACKED NvmeCreateSq {
#define NVME_SQ_FLAGS_PC(sq_flags) (sq_flags & 0x1)
#define NVME_SQ_FLAGS_QPRIO(sq_flags) ((sq_flags >> 1) & 0x3)
-enum NvmeQueueFlags {
- NVME_Q_PC = 1,
- NVME_Q_PRIO_URGENT = 0,
- NVME_Q_PRIO_HIGH = 1,
- NVME_Q_PRIO_NORMAL = 2,
- NVME_Q_PRIO_LOW = 3,
+enum NvmeFlagsSq {
+ NVME_SQ_PC = 1,
+
+ NVME_SQ_PRIO_URGENT = 0,
+ NVME_SQ_PRIO_HIGH = 1,
+ NVME_SQ_PRIO_NORMAL = 2,
+ NVME_SQ_PRIO_LOW = 3,
};
typedef struct QEMU_PACKED NvmeIdentify {
diff --git a/softmmu/memory.c b/softmmu/memory.c
index aa393f1bb0..11ca94d037 100644
--- a/softmmu/memory.c
+++ b/softmmu/memory.c
@@ -205,8 +205,15 @@ static bool memory_region_ioeventfd_before(MemoryRegionIoeventfd *a,
static bool memory_region_ioeventfd_equal(MemoryRegionIoeventfd *a,
MemoryRegionIoeventfd *b)
{
- return !memory_region_ioeventfd_before(a, b)
- && !memory_region_ioeventfd_before(b, a);
+ if (int128_eq(a->addr.start, b->addr.start) &&
+ (!int128_nz(a->addr.size) || !int128_nz(b->addr.size) ||
+ (int128_eq(a->addr.size, b->addr.size) &&
+ (a->match_data == b->match_data) &&
+ ((a->match_data && (a->data == b->data)) || !a->match_data) &&
+ (a->e == b->e))))
+ return true;
+
+ return false;
}
/* Range of memory in the global map. Addresses are absolute. */
diff --git a/util/trace-events b/util/trace-events
index 24c31803b0..61e0d4bcdf 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -80,8 +80,14 @@ qemu_mutex_unlock(void *mutex, const char *file, const int line) "released mutex
qemu_vfio_dma_reset_temporary(void *s) "s %p"
qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
+qemu_vfio_dump_mapping(void *host, uint64_t iova, size_t size) "vfio mapping %p to iova 0x%08" PRIx64 " size 0x%zx"
qemu_vfio_find_mapping(void *s, void *p) "s %p host %p"
qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova) "s %p host %p size 0x%zx index %d iova 0x%"PRIx64
-qemu_vfio_do_mapping(void *s, void *host, size_t size, uint64_t iova) "s %p host %p size 0x%zx iova 0x%"PRIx64
-qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d iova %p"
+qemu_vfio_do_mapping(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64 " size 0x%zx"
+qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d &iova %p"
+qemu_vfio_dma_mapped(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64" size 0x%zx"
qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
+qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
+qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
+qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_size, uint32_t cap_offset) "region '%s' addr 0x%"PRIx64" size 0x%"PRIx64" cap_ofs 0x%"PRIx32
+qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p"
diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
index 2bec48e163..97dfa3fd57 100644
--- a/util/vfio-helpers.c
+++ b/util/vfio-helpers.c
@@ -137,6 +137,7 @@ static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
{
+ g_autofree char *barname = NULL;
assert_bar_index_valid(s, index);
s->bar_region_info[index] = (struct vfio_region_info) {
.index = VFIO_PCI_BAR0_REGION_INDEX + index,
@@ -146,6 +147,10 @@ static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
error_setg_errno(errp, errno, "Failed to get BAR region info");
return -errno;
}
+ barname = g_strdup_printf("bar[%d]", index);
+ trace_qemu_vfio_region_info(barname, s->bar_region_info[index].offset,
+ s->bar_region_info[index].size,
+ s->bar_region_info[index].cap_offset);
return 0;
}
@@ -158,10 +163,13 @@ void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
Error **errp)
{
void *p;
+ assert(QEMU_IS_ALIGNED(offset, qemu_real_host_page_size));
assert_bar_index_valid(s, index);
p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
prot, MAP_SHARED,
s->device, s->bar_region_info[index].offset + offset);
+ trace_qemu_vfio_pci_map_bar(index, s->bar_region_info[index].offset ,
+ size, offset, p);
if (p == MAP_FAILED) {
error_setg_errno(errp, errno, "Failed to map BAR region");
p = NULL;
@@ -228,6 +236,10 @@ static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
{
int ret;
+ trace_qemu_vfio_pci_read_config(buf, ofs, size,
+ s->config_region_info.offset,
+ s->config_region_info.size);
+ assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
do {
ret = pread(s->device, buf, size, s->config_region_info.offset + ofs);
} while (ret == -1 && errno == EINTR);
@@ -238,6 +250,10 @@ static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int
{
int ret;
+ trace_qemu_vfio_pci_write_config(buf, ofs, size,
+ s->config_region_info.offset,
+ s->config_region_info.size);
+ assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
do {
ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs);
} while (ret == -1 && errno == EINTR);
@@ -301,7 +317,7 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
}
if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
- error_setg_errno(errp, errno, "VFIO IOMMU check failed");
+ error_setg_errno(errp, errno, "VFIO IOMMU Type1 is not supported");
ret = -EINVAL;
goto fail_container;
}
@@ -409,6 +425,9 @@ static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
ret = -errno;
goto fail;
}
+ trace_qemu_vfio_region_info("config", s->config_region_info.offset,
+ s->config_region_info.size,
+ s->config_region_info.cap_offset);
for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) {
ret = qemu_vfio_pci_init_bar(s, i, errp);
@@ -516,23 +535,12 @@ QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
return s;
}
-static void qemu_vfio_dump_mapping(IOVAMapping *m)
-{
- if (QEMU_VFIO_DEBUG) {
- printf(" vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host,
- (uint64_t)m->size, (uint64_t)m->iova);
- }
-}
-
static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
{
- int i;
-
- if (QEMU_VFIO_DEBUG) {
- printf("vfio mappings\n");
- for (i = 0; i < s->nr_mappings; ++i) {
- qemu_vfio_dump_mapping(&s->mappings[i]);
- }
+ for (int i = 0; i < s->nr_mappings; ++i) {
+ trace_qemu_vfio_dump_mapping(s->mappings[i].host,
+ s->mappings[i].iova,
+ s->mappings[i].size);
}
}
@@ -622,7 +630,7 @@ static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
.vaddr = (uintptr_t)host,
.size = size,
};
- trace_qemu_vfio_do_mapping(s, host, size, iova);
+ trace_qemu_vfio_do_mapping(s, host, iova, size);
if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
error_report("VFIO_MAP_DMA failed: %s", strerror(errno));
@@ -778,6 +786,7 @@ int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
}
}
}
+ trace_qemu_vfio_dma_mapped(s, host, iova0, size);
if (iova) {
*iova = iova0;
}