author     Peter Maydell <peter.maydell@linaro.org>  2018-02-08 14:31:51 +0000
committer  Peter Maydell <peter.maydell@linaro.org>  2018-02-08 14:31:51 +0000
commit     008a51bbb343972dd8cf09126da8c3b87f4e1c96
tree       6e5f7c697a796ef35198a65623e4dbdc76d3e6da
parent     b256b89c63c12e902645dd55e7b5362f60574742
parent     4eb995603479f0f7aff14b518f8ada16fe694ca7
Merge remote-tracking branch 'remotes/famz/tags/staging-pull-request' into staging
# gpg: Signature made Thu 08 Feb 2018 01:29:22 GMT
# gpg: using RSA key CA35624C6A9171C6
# gpg: Good signature from "Fam Zheng <famz@redhat.com>"
# Primary key fingerprint: 5003 7CB7 9706 0F76 F021 AD56 CA35 624C 6A91 71C6
* remotes/famz/tags/staging-pull-request:
docs: Add docs/devel/testing.rst
qapi: Add NVMe driver options to the schema
docs: Add section for NVMe VFIO driver
block: Move NVMe constants to a separate header
qemu-img: Map bench buffer
block/nvme: Implement .bdrv_(un)register_buf
block: Introduce buf register API
block: Add VFIO based NVMe driver
util: Introduce vfio helpers
stubs: Add stubs for ram block API
curl: convert to CoQueue
coroutine-lock: make qemu_co_enter_next thread-safe
coroutine-lock: convert CoQueue to use QemuLockable
lockable: add QemuLockable
test-coroutine: add simple CoMutex test
docker: change Fedora base image to fedora:27
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
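Several patches in this pull request rework the coroutine CoQueue primitives so that a QemuMutex (or any QemuLockable) can be handed to the wait and wake operations, as the curl and fsdev hunks below show. What follows is a minimal sketch of the resulting pattern for QEMU-tree code, not part of the merge itself; the names Resource, pool_get and pool_put are invented for illustration:

    #include "qemu/osdep.h"
    #include "qemu/coroutine.h"
    #include "qemu/thread.h"

    /* Sketch of the post-merge CoQueue + QemuMutex pattern. Initialize with
     * qemu_mutex_init(&r->lock) and qemu_co_queue_init(&r->waiters). */
    typedef struct {
        QemuMutex lock;   /* protects in_use */
        CoQueue waiters;  /* coroutines waiting for the resource */
        bool in_use;
    } Resource;

    static void coroutine_fn pool_get(Resource *r)
    {
        qemu_mutex_lock(&r->lock);
        while (r->in_use) {
            /* Drops r->lock while the coroutine sleeps and re-takes it on
             * wakeup; the QemuMutex is accepted via the new QemuLockable. */
            qemu_co_queue_wait(&r->waiters, &r->lock);
        }
        r->in_use = true;
        qemu_mutex_unlock(&r->lock);
    }

    static void pool_put(Resource *r)
    {
        qemu_mutex_lock(&r->lock);
        r->in_use = false;
        /* Wake one waiter; qemu_co_enter_next() is made thread-safe by this
         * series and may be called with the lock held, even outside
         * coroutine context (e.g. from a timer callback). */
        qemu_co_enter_next(&r->waiters, &r->lock);
        qemu_mutex_unlock(&r->lock);
    }

This is the shape of the converted call sites: block/curl.c waits with qemu_co_queue_wait(&s->free_state_waitq, &s->mutex) and wakes with qemu_co_enter_next(), while fsdev passes NULL when no lock is needed.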
33 files changed, 3580 insertions(+), 829 deletions(-)
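The merge also introduces a buffer registration API (bdrv_register_buf()/blk_register_buf(), visible in the block-backend.c and io.c hunks below) that lets a client pin a long-lived I/O buffer once, so a driver such as nvme can map it for DMA up front instead of per request; qemu-img bench is converted to it in this series. A hedged sketch of a call site, assuming an already opened BlockBackend; the helper name do_pinned_io is hypothetical:

    #include "qemu/osdep.h"
    #include "sysemu/block-backend.h"

    /* Sketch only: register a long-lived aligned buffer with every driver
     * in the graph, do I/O through it, then unregister before freeing. */
    static void do_pinned_io(BlockBackend *blk)
    {
        size_t len = 1024 * 1024;
        void *buf = blk_blockalign(blk, len); /* honors opt_mem_alignment */

        blk_register_buf(blk, buf, len);   /* nvme: qemu_vfio_dma_map() */
        /* ... issue reads/writes that use buf here ... */
        blk_unregister_buf(blk, buf);      /* nvme: qemu_vfio_dma_unmap() */
        qemu_vfree(buf);
    }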
diff --git a/MAINTAINERS b/MAINTAINERS index bbc3a617c2..301b6996e1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1888,6 +1888,12 @@ L: qemu-block@nongnu.org S: Supported F: block/null.c +NVMe Block Driver +M: Fam Zheng <famz@redhat.com> +L: qemu-block@nongnu.org +S: Supported +F: block/nvme* + Bootdevice M: Gonglei <arei.gonglei@huawei.com> S: Maintained diff --git a/block/Makefile.objs b/block/Makefile.objs index a73387f1bf..aede94f105 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -11,6 +11,7 @@ block-obj-$(CONFIG_POSIX) += file-posix.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o block-obj-y += null.o mirror.o commit.o io.o block-obj-y += throttle-groups.o +block-obj-$(CONFIG_LINUX) += nvme.o block-obj-y += nbd.o nbd-client.o sheepdog.o block-obj-$(CONFIG_LIBISCSI) += iscsi.o diff --git a/block/block-backend.c b/block/block-backend.c index baef8e7abc..f66349c2c9 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -2096,3 +2096,13 @@ static void blk_root_drained_end(BdrvChild *child) } } } + +void blk_register_buf(BlockBackend *blk, void *host, size_t size) +{ + bdrv_register_buf(blk_bs(blk), host, size); +} + +void blk_unregister_buf(BlockBackend *blk, void *host) +{ + bdrv_unregister_buf(blk_bs(blk), host); +} diff --git a/block/curl.c b/block/curl.c index 35cf417f59..cd578d3d14 100644 --- a/block/curl.c +++ b/block/curl.c @@ -101,8 +101,6 @@ typedef struct CURLAIOCB { size_t start; size_t end; - - QSIMPLEQ_ENTRY(CURLAIOCB) next; } CURLAIOCB; typedef struct CURLSocket { @@ -138,7 +136,7 @@ typedef struct BDRVCURLState { bool accept_range; AioContext *aio_context; QemuMutex mutex; - QSIMPLEQ_HEAD(, CURLAIOCB) free_state_waitq; + CoQueue free_state_waitq; char *username; char *password; char *proxyusername; @@ -538,7 +536,6 @@ static int curl_init_state(BDRVCURLState *s, CURLState *state) /* Called with s->mutex held. 
*/ static void curl_clean_state(CURLState *s) { - CURLAIOCB *next; int j; for (j = 0; j < CURL_NUM_ACB; j++) { assert(!s->acb[j]); @@ -556,13 +553,7 @@ static void curl_clean_state(CURLState *s) s->in_use = 0; - next = QSIMPLEQ_FIRST(&s->s->free_state_waitq); - if (next) { - QSIMPLEQ_REMOVE_HEAD(&s->s->free_state_waitq, next); - qemu_mutex_unlock(&s->s->mutex); - aio_co_wake(next->co); - qemu_mutex_lock(&s->s->mutex); - } + qemu_co_enter_next(&s->s->free_state_waitq, &s->s->mutex); } static void curl_parse_filename(const char *filename, QDict *options, @@ -784,7 +775,7 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags, } DPRINTF("CURL: Opening %s\n", file); - QSIMPLEQ_INIT(&s->free_state_waitq); + qemu_co_queue_init(&s->free_state_waitq); s->aio_context = bdrv_get_aio_context(bs); s->url = g_strdup(file); qemu_mutex_lock(&s->mutex); @@ -888,10 +879,7 @@ static void curl_setup_preadv(BlockDriverState *bs, CURLAIOCB *acb) if (state) { break; } - QSIMPLEQ_INSERT_TAIL(&s->free_state_waitq, acb, next); - qemu_mutex_unlock(&s->mutex); - qemu_coroutine_yield(); - qemu_mutex_lock(&s->mutex); + qemu_co_queue_wait(&s->free_state_waitq, &s->mutex); } if (curl_init_state(s, state) < 0) { diff --git a/block/io.c b/block/io.c index 7ea402352e..89d0745e95 100644 --- a/block/io.c +++ b/block/io.c @@ -2825,3 +2825,27 @@ void bdrv_io_unplug(BlockDriverState *bs) bdrv_io_unplug(child->bs); } } + +void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size) +{ + BdrvChild *child; + + if (bs->drv && bs->drv->bdrv_register_buf) { + bs->drv->bdrv_register_buf(bs, host, size); + } + QLIST_FOREACH(child, &bs->children, next) { + bdrv_register_buf(child->bs, host, size); + } +} + +void bdrv_unregister_buf(BlockDriverState *bs, void *host) +{ + BdrvChild *child; + + if (bs->drv && bs->drv->bdrv_unregister_buf) { + bs->drv->bdrv_unregister_buf(bs, host); + } + QLIST_FOREACH(child, &bs->children, next) { + bdrv_unregister_buf(child->bs, host); + } +} diff --git a/block/nvme.c b/block/nvme.c new file mode 100644 index 0000000000..e9d0e218fc --- /dev/null +++ b/block/nvme.c @@ -0,0 +1,1201 @@ +/* + * NVMe block driver based on vfio + * + * Copyright 2016 - 2018 Red Hat, Inc. + * + * Authors: + * Fam Zheng <famz@redhat.com> + * Paolo Bonzini <pbonzini@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include <linux/vfio.h> +#include "qapi/error.h" +#include "qapi/qmp/qdict.h" +#include "qapi/qmp/qstring.h" +#include "qemu/error-report.h" +#include "qemu/cutils.h" +#include "qemu/vfio-helpers.h" +#include "block/block_int.h" +#include "trace.h" + +#include "block/nvme.h" + +#define NVME_SQ_ENTRY_BYTES 64 +#define NVME_CQ_ENTRY_BYTES 16 +#define NVME_QUEUE_SIZE 128 +#define NVME_BAR_SIZE 8192 + +typedef struct { + int32_t head, tail; + uint8_t *queue; + uint64_t iova; + /* Hardware MMIO register */ + volatile uint32_t *doorbell; +} NVMeQueue; + +typedef struct { + BlockCompletionFunc *cb; + void *opaque; + int cid; + void *prp_list_page; + uint64_t prp_list_iova; + bool busy; +} NVMeRequest; + +typedef struct { + CoQueue free_req_queue; + QemuMutex lock; + + /* Fields protected by BQL */ + int index; + uint8_t *prp_list_pages; + + /* Fields protected by @lock */ + NVMeQueue sq, cq; + int cq_phase; + NVMeRequest reqs[NVME_QUEUE_SIZE]; + bool busy; + int need_kick; + int inflight; +} NVMeQueuePair; + +/* Memory mapped registers */ +typedef volatile struct { + uint64_t cap; + uint32_t vs; + uint32_t intms; + uint32_t intmc; + uint32_t cc; + uint32_t reserved0; + uint32_t csts; + uint32_t nssr; + uint32_t aqa; + uint64_t asq; + uint64_t acq; + uint32_t cmbloc; + uint32_t cmbsz; + uint8_t reserved1[0xec0]; + uint8_t cmd_set_specfic[0x100]; + uint32_t doorbells[]; +} QEMU_PACKED NVMeRegs; + +QEMU_BUILD_BUG_ON(offsetof(NVMeRegs, doorbells) != 0x1000); + +typedef struct { + AioContext *aio_context; + QEMUVFIOState *vfio; + NVMeRegs *regs; + /* The submission/completion queue pairs. + * [0]: admin queue. + * [1..]: io queues. + */ + NVMeQueuePair **queues; + int nr_queues; + size_t page_size; + /* How many uint32_t elements does each doorbell entry take. */ + size_t doorbell_scale; + bool write_cache_supported; + EventNotifier irq_notifier; + uint64_t nsze; /* Namespace size reported by identify command */ + int nsid; /* The namespace id to read/write data. 
*/ + uint64_t max_transfer; + int plugged; + + CoMutex dma_map_lock; + CoQueue dma_flush_queue; + + /* Total size of mapped qiov, accessed under dma_map_lock */ + int dma_map_count; +} BDRVNVMeState; + +#define NVME_BLOCK_OPT_DEVICE "device" +#define NVME_BLOCK_OPT_NAMESPACE "namespace" + +static QemuOptsList runtime_opts = { + .name = "nvme", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = NVME_BLOCK_OPT_DEVICE, + .type = QEMU_OPT_STRING, + .help = "NVMe PCI device address", + }, + { + .name = NVME_BLOCK_OPT_NAMESPACE, + .type = QEMU_OPT_NUMBER, + .help = "NVMe namespace", + }, + { /* end of list */ } + }, +}; + +static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q, + int nentries, int entry_bytes, Error **errp) +{ + BDRVNVMeState *s = bs->opaque; + size_t bytes; + int r; + + bytes = ROUND_UP(nentries * entry_bytes, s->page_size); + q->head = q->tail = 0; + q->queue = qemu_try_blockalign0(bs, bytes); + + if (!q->queue) { + error_setg(errp, "Cannot allocate queue"); + return; + } + r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova); + if (r) { + error_setg(errp, "Cannot map queue"); + } +} + +static void nvme_free_queue_pair(BlockDriverState *bs, NVMeQueuePair *q) +{ + qemu_vfree(q->prp_list_pages); + qemu_vfree(q->sq.queue); + qemu_vfree(q->cq.queue); + qemu_mutex_destroy(&q->lock); + g_free(q); +} + +static void nvme_free_req_queue_cb(void *opaque) +{ + NVMeQueuePair *q = opaque; + + qemu_mutex_lock(&q->lock); + while (qemu_co_enter_next(&q->free_req_queue, &q->lock)) { + /* Retry all pending requests */ + } + qemu_mutex_unlock(&q->lock); +} + +static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs, + int idx, int size, + Error **errp) +{ + int i, r; + BDRVNVMeState *s = bs->opaque; + Error *local_err = NULL; + NVMeQueuePair *q = g_new0(NVMeQueuePair, 1); + uint64_t prp_list_iova; + + qemu_mutex_init(&q->lock); + q->index = idx; + qemu_co_queue_init(&q->free_req_queue); + q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_QUEUE_SIZE); + r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, + s->page_size * NVME_QUEUE_SIZE, + false, &prp_list_iova); + if (r) { + goto fail; + } + for (i = 0; i < NVME_QUEUE_SIZE; i++) { + NVMeRequest *req = &q->reqs[i]; + req->cid = i + 1; + req->prp_list_page = q->prp_list_pages + i * s->page_size; + req->prp_list_iova = prp_list_iova + i * s->page_size; + } + nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err); + if (local_err) { + error_propagate(errp, local_err); + goto fail; + } + q->sq.doorbell = &s->regs->doorbells[idx * 2 * s->doorbell_scale]; + + nvme_init_queue(bs, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err); + if (local_err) { + error_propagate(errp, local_err); + goto fail; + } + q->cq.doorbell = &s->regs->doorbells[idx * 2 * s->doorbell_scale + 1]; + + return q; +fail: + nvme_free_queue_pair(bs, q); + return NULL; +} + +/* With q->lock */ +static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q) +{ + if (s->plugged || !q->need_kick) { + return; + } + trace_nvme_kick(s, q->index); + assert(!(q->sq.tail & 0xFF00)); + /* Fence the write to submission queue entry before notifying the device. 
*/ + smp_wmb(); + *q->sq.doorbell = cpu_to_le32(q->sq.tail); + q->inflight += q->need_kick; + q->need_kick = 0; +} + +/* Find a free request element if any, otherwise: + * a) if in coroutine context, try to wait for one to become available; + * b) if not in coroutine, return NULL; + */ +static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q) +{ + int i; + NVMeRequest *req = NULL; + + qemu_mutex_lock(&q->lock); + while (q->inflight + q->need_kick > NVME_QUEUE_SIZE - 2) { + /* We have to leave one slot empty as that is the full queue case (head + * == tail + 1). */ + if (qemu_in_coroutine()) { + trace_nvme_free_req_queue_wait(q); + qemu_co_queue_wait(&q->free_req_queue, &q->lock); + } else { + qemu_mutex_unlock(&q->lock); + return NULL; + } + } + for (i = 0; i < NVME_QUEUE_SIZE; i++) { + if (!q->reqs[i].busy) { + q->reqs[i].busy = true; + req = &q->reqs[i]; + break; + } + } + /* We have checked inflight and need_kick while holding q->lock, so one + * free req must be available. */ + assert(req); + qemu_mutex_unlock(&q->lock); + return req; +} + +static inline int nvme_translate_error(const NvmeCqe *c) +{ + uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF; + if (status) { + trace_nvme_error(le32_to_cpu(c->result), + le16_to_cpu(c->sq_head), + le16_to_cpu(c->sq_id), + le16_to_cpu(c->cid), + le16_to_cpu(status)); + } + switch (status) { + case 0: + return 0; + case 1: + return -ENOSYS; + case 2: + return -EINVAL; + default: + return -EIO; + } +} + +/* With q->lock */ +static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q) +{ + bool progress = false; + NVMeRequest *preq; + NVMeRequest req; + NvmeCqe *c; + + trace_nvme_process_completion(s, q->index, q->inflight); + if (q->busy || s->plugged) { + trace_nvme_process_completion_queue_busy(s, q->index); + return false; + } + q->busy = true; + assert(q->inflight >= 0); + while (q->inflight) { + int16_t cid; + c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES]; + if (!c->cid || (le16_to_cpu(c->status) & 0x1) == q->cq_phase) { + break; + } + q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE; + if (!q->cq.head) { + q->cq_phase = !q->cq_phase; + } + cid = le16_to_cpu(c->cid); + if (cid == 0 || cid > NVME_QUEUE_SIZE) { + fprintf(stderr, "Unexpected CID in completion queue: %" PRIu32 "\n", + cid); + continue; + } + assert(cid <= NVME_QUEUE_SIZE); + trace_nvme_complete_command(s, q->index, cid); + preq = &q->reqs[cid - 1]; + req = *preq; + assert(req.cid == cid); + assert(req.cb); + preq->busy = false; + preq->cb = preq->opaque = NULL; + qemu_mutex_unlock(&q->lock); + req.cb(req.opaque, nvme_translate_error(c)); + qemu_mutex_lock(&q->lock); + c->cid = cpu_to_le16(0); + q->inflight--; + /* Flip Phase Tag bit. */ + c->status = cpu_to_le16(le16_to_cpu(c->status) ^ 0x1); + progress = true; + } + if (progress) { + /* Notify the device so it can post more completions. 
*/ + smp_mb_release(); + *q->cq.doorbell = cpu_to_le32(q->cq.head); + if (!qemu_co_queue_empty(&q->free_req_queue)) { + aio_bh_schedule_oneshot(s->aio_context, nvme_free_req_queue_cb, q); + } + } + q->busy = false; + return progress; +} + +static void nvme_trace_command(const NvmeCmd *cmd) +{ + int i; + + for (i = 0; i < 8; ++i) { + uint8_t *cmdp = (uint8_t *)cmd + i * 8; + trace_nvme_submit_command_raw(cmdp[0], cmdp[1], cmdp[2], cmdp[3], + cmdp[4], cmdp[5], cmdp[6], cmdp[7]); + } +} + +static void nvme_submit_command(BDRVNVMeState *s, NVMeQueuePair *q, + NVMeRequest *req, + NvmeCmd *cmd, BlockCompletionFunc cb, + void *opaque) +{ + assert(!req->cb); + req->cb = cb; + req->opaque = opaque; + cmd->cid = cpu_to_le32(req->cid); + + trace_nvme_submit_command(s, q->index, req->cid); + nvme_trace_command(cmd); + qemu_mutex_lock(&q->lock); + memcpy((uint8_t *)q->sq.queue + + q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd)); + q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE; + q->need_kick++; + nvme_kick(s, q); + nvme_process_completion(s, q); + qemu_mutex_unlock(&q->lock); +} + +static void nvme_cmd_sync_cb(void *opaque, int ret) +{ + int *pret = opaque; + *pret = ret; +} + +static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q, + NvmeCmd *cmd) +{ + NVMeRequest *req; + BDRVNVMeState *s = bs->opaque; + int ret = -EINPROGRESS; + req = nvme_get_free_req(q); + if (!req) { + return -EBUSY; + } + nvme_submit_command(s, q, req, cmd, nvme_cmd_sync_cb, &ret); + + BDRV_POLL_WHILE(bs, ret == -EINPROGRESS); + return ret; +} + +static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp) +{ + BDRVNVMeState *s = bs->opaque; + NvmeIdCtrl *idctrl; + NvmeIdNs *idns; + uint8_t *resp; + int r; + uint64_t iova; + NvmeCmd cmd = { + .opcode = NVME_ADM_CMD_IDENTIFY, + .cdw10 = cpu_to_le32(0x1), + }; + + resp = qemu_try_blockalign0(bs, sizeof(NvmeIdCtrl)); + if (!resp) { + error_setg(errp, "Cannot allocate buffer for identify response"); + goto out; + } + idctrl = (NvmeIdCtrl *)resp; + idns = (NvmeIdNs *)resp; + r = qemu_vfio_dma_map(s->vfio, resp, sizeof(NvmeIdCtrl), true, &iova); + if (r) { + error_setg(errp, "Cannot map buffer for DMA"); + goto out; + } + cmd.prp1 = cpu_to_le64(iova); + + if (nvme_cmd_sync(bs, s->queues[0], &cmd)) { + error_setg(errp, "Failed to identify controller"); + goto out; + } + + if (le32_to_cpu(idctrl->nn) < namespace) { + error_setg(errp, "Invalid namespace"); + goto out; + } + s->write_cache_supported = le32_to_cpu(idctrl->vwc) & 0x1; + s->max_transfer = (idctrl->mdts ? 1 << idctrl->mdts : 0) * s->page_size; + /* For now the page list buffer per command is one page, to hold at most + * s->page_size / sizeof(uint64_t) entries. 
*/ + s->max_transfer = MIN_NON_ZERO(s->max_transfer, + s->page_size / sizeof(uint64_t) * s->page_size); + + memset(resp, 0, 4096); + + cmd.cdw10 = 0; + cmd.nsid = cpu_to_le32(namespace); + if (nvme_cmd_sync(bs, s->queues[0], &cmd)) { + error_setg(errp, "Failed to identify namespace"); + goto out; + } + + s->nsze = le64_to_cpu(idns->nsze); + +out: + qemu_vfio_dma_unmap(s->vfio, resp); + qemu_vfree(resp); +} + +static bool nvme_poll_queues(BDRVNVMeState *s) +{ + bool progress = false; + int i; + + for (i = 0; i < s->nr_queues; i++) { + NVMeQueuePair *q = s->queues[i]; + qemu_mutex_lock(&q->lock); + while (nvme_process_completion(s, q)) { + /* Keep polling */ + progress = true; + } + qemu_mutex_unlock(&q->lock); + } + return progress; +} + +static void nvme_handle_event(EventNotifier *n) +{ + BDRVNVMeState *s = container_of(n, BDRVNVMeState, irq_notifier); + + trace_nvme_handle_event(s); + aio_context_acquire(s->aio_context); + event_notifier_test_and_clear(n); + nvme_poll_queues(s); + aio_context_release(s->aio_context); +} + +static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp) +{ + BDRVNVMeState *s = bs->opaque; + int n = s->nr_queues; + NVMeQueuePair *q; + NvmeCmd cmd; + int queue_size = NVME_QUEUE_SIZE; + + q = nvme_create_queue_pair(bs, n, queue_size, errp); + if (!q) { + return false; + } + cmd = (NvmeCmd) { + .opcode = NVME_ADM_CMD_CREATE_CQ, + .prp1 = cpu_to_le64(q->cq.iova), + .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)), + .cdw11 = cpu_to_le32(0x3), + }; + if (nvme_cmd_sync(bs, s->queues[0], &cmd)) { + error_setg(errp, "Failed to create io queue [%d]", n); + nvme_free_queue_pair(bs, q); + return false; + } + cmd = (NvmeCmd) { + .opcode = NVME_ADM_CMD_CREATE_SQ, + .prp1 = cpu_to_le64(q->sq.iova), + .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)), + .cdw11 = cpu_to_le32(0x1 | (n << 16)), + }; + if (nvme_cmd_sync(bs, s->queues[0], &cmd)) { + error_setg(errp, "Failed to create io queue [%d]", n); + nvme_free_queue_pair(bs, q); + return false; + } + s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1); + s->queues[n] = q; + s->nr_queues++; + return true; +} + +static bool nvme_poll_cb(void *opaque) +{ + EventNotifier *e = opaque; + BDRVNVMeState *s = container_of(e, BDRVNVMeState, irq_notifier); + bool progress = false; + + trace_nvme_poll_cb(s); + progress = nvme_poll_queues(s); + return progress; +} + +static int nvme_init(BlockDriverState *bs, const char *device, int namespace, + Error **errp) +{ + BDRVNVMeState *s = bs->opaque; + int ret; + uint64_t cap; + uint64_t timeout_ms; + uint64_t deadline, now; + Error *local_err = NULL; + + qemu_co_mutex_init(&s->dma_map_lock); + qemu_co_queue_init(&s->dma_flush_queue); + s->nsid = namespace; + s->aio_context = bdrv_get_aio_context(bs); + ret = event_notifier_init(&s->irq_notifier, 0); + if (ret) { + error_setg(errp, "Failed to init event notifier"); + return ret; + } + + s->vfio = qemu_vfio_open_pci(device, errp); + if (!s->vfio) { + ret = -EINVAL; + goto fail; + } + + s->regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, NVME_BAR_SIZE, errp); + if (!s->regs) { + ret = -EINVAL; + goto fail; + } + + /* Perform initialize sequence as described in NVMe spec "7.6.1 + * Initialization". 
*/ + + cap = le64_to_cpu(s->regs->cap); + if (!(cap & (1ULL << 37))) { + error_setg(errp, "Device doesn't support NVMe command set"); + ret = -EINVAL; + goto fail; + } + + s->page_size = MAX(4096, 1 << (12 + ((cap >> 48) & 0xF))); + s->doorbell_scale = (4 << (((cap >> 32) & 0xF))) / sizeof(uint32_t); + bs->bl.opt_mem_alignment = s->page_size; + timeout_ms = MIN(500 * ((cap >> 24) & 0xFF), 30000); + + /* Reset device to get a clean state. */ + s->regs->cc = cpu_to_le32(le32_to_cpu(s->regs->cc) & 0xFE); + /* Wait for CSTS.RDY = 0. */ + deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * 1000000ULL; + while (le32_to_cpu(s->regs->csts) & 0x1) { + if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) { + error_setg(errp, "Timeout while waiting for device to reset (%" + PRId64 " ms)", + timeout_ms); + ret = -ETIMEDOUT; + goto fail; + } + } + + /* Set up admin queue. */ + s->queues = g_new(NVMeQueuePair *, 1); + s->nr_queues = 1; + s->queues[0] = nvme_create_queue_pair(bs, 0, NVME_QUEUE_SIZE, errp); + if (!s->queues[0]) { + ret = -EINVAL; + goto fail; + } + QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000); + s->regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << 16) | NVME_QUEUE_SIZE); + s->regs->asq = cpu_to_le64(s->queues[0]->sq.iova); + s->regs->acq = cpu_to_le64(s->queues[0]->cq.iova); + + /* After setting up all control registers we can enable device now. */ + s->regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << 20) | + (ctz32(NVME_SQ_ENTRY_BYTES) << 16) | + 0x1); + /* Wait for CSTS.RDY = 1. */ + now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + deadline = now + timeout_ms * 1000000; + while (!(le32_to_cpu(s->regs->csts) & 0x1)) { + if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) { + error_setg(errp, "Timeout while waiting for device to start (%" + PRId64 " ms)", + timeout_ms); + ret = -ETIMEDOUT; + goto fail_queue; + } + } + + ret = qemu_vfio_pci_init_irq(s->vfio, &s->irq_notifier, + VFIO_PCI_MSIX_IRQ_INDEX, errp); + if (ret) { + goto fail_queue; + } + aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier, + false, nvme_handle_event, nvme_poll_cb); + + nvme_identify(bs, namespace, errp); + if (local_err) { + error_propagate(errp, local_err); + ret = -EIO; + goto fail_handler; + } + + /* Set up command queues. */ + if (!nvme_add_io_queue(bs, errp)) { + ret = -EIO; + goto fail_handler; + } + return 0; + +fail_handler: + aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier, + false, NULL, NULL); +fail_queue: + nvme_free_queue_pair(bs, s->queues[0]); +fail: + g_free(s->queues); + qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0, NVME_BAR_SIZE); + qemu_vfio_close(s->vfio); + event_notifier_cleanup(&s->irq_notifier); + return ret; +} + +/* Parse a filename in the format of nvme://XXXX:XX:XX.X/X. Example: + * + * nvme://0000:44:00.0/1 + * + * where the "nvme://" is a fixed form of the protocol prefix, the middle part + * is the PCI address, and the last part is the namespace number starting from + * 1 according to the NVMe spec. 
*/ +static void nvme_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + int pref = strlen("nvme://"); + + if (strlen(filename) > pref && !strncmp(filename, "nvme://", pref)) { + const char *tmp = filename + pref; + char *device; + const char *namespace; + unsigned long ns; + const char *slash = strchr(tmp, '/'); + if (!slash) { + qdict_put(options, NVME_BLOCK_OPT_DEVICE, + qstring_from_str(tmp)); + return; + } + device = g_strndup(tmp, slash - tmp); + qdict_put(options, NVME_BLOCK_OPT_DEVICE, qstring_from_str(device)); + g_free(device); + namespace = slash + 1; + if (*namespace && qemu_strtoul(namespace, NULL, 10, &ns)) { + error_setg(errp, "Invalid namespace '%s', positive number expected", + namespace); + return; + } + qdict_put(options, NVME_BLOCK_OPT_NAMESPACE, + qstring_from_str(*namespace ? namespace : "1")); + } +} + +static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable, + Error **errp) +{ + int ret; + BDRVNVMeState *s = bs->opaque; + NvmeCmd cmd = { + .opcode = NVME_ADM_CMD_SET_FEATURES, + .nsid = cpu_to_le32(s->nsid), + .cdw10 = cpu_to_le32(0x06), + .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00), + }; + + ret = nvme_cmd_sync(bs, s->queues[0], &cmd); + if (ret) { + error_setg(errp, "Failed to configure NVMe write cache"); + } + return ret; +} + +static void nvme_close(BlockDriverState *bs) +{ + int i; + BDRVNVMeState *s = bs->opaque; + + for (i = 0; i < s->nr_queues; ++i) { + nvme_free_queue_pair(bs, s->queues[i]); + } + aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier, + false, NULL, NULL); + qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0, NVME_BAR_SIZE); + qemu_vfio_close(s->vfio); +} + +static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + const char *device; + QemuOpts *opts; + int namespace; + int ret; + BDRVNVMeState *s = bs->opaque; + + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &error_abort); + device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE); + if (!device) { + error_setg(errp, "'" NVME_BLOCK_OPT_DEVICE "' option is required"); + qemu_opts_del(opts); + return -EINVAL; + } + + namespace = qemu_opt_get_number(opts, NVME_BLOCK_OPT_NAMESPACE, 1); + ret = nvme_init(bs, device, namespace, errp); + qemu_opts_del(opts); + if (ret) { + goto fail; + } + if (flags & BDRV_O_NOCACHE) { + if (!s->write_cache_supported) { + error_setg(errp, + "NVMe controller doesn't support write cache configuration"); + ret = -EINVAL; + } else { + ret = nvme_enable_disable_write_cache(bs, !(flags & BDRV_O_NOCACHE), + errp); + } + if (ret) { + goto fail; + } + } + bs->supported_write_flags = BDRV_REQ_FUA; + return 0; +fail: + nvme_close(bs); + return ret; +} + +static int64_t nvme_getlength(BlockDriverState *bs) +{ + BDRVNVMeState *s = bs->opaque; + + return s->nsze << BDRV_SECTOR_BITS; +} + +/* Called with s->dma_map_lock */ +static coroutine_fn int nvme_cmd_unmap_qiov(BlockDriverState *bs, + QEMUIOVector *qiov) +{ + int r = 0; + BDRVNVMeState *s = bs->opaque; + + s->dma_map_count -= qiov->size; + if (!s->dma_map_count && !qemu_co_queue_empty(&s->dma_flush_queue)) { + r = qemu_vfio_dma_reset_temporary(s->vfio); + if (!r) { + qemu_co_queue_restart_all(&s->dma_flush_queue); + } + } + return r; +} + +/* Called with s->dma_map_lock */ +static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd, + NVMeRequest *req, QEMUIOVector *qiov) +{ + BDRVNVMeState *s = bs->opaque; + uint64_t *pagelist = req->prp_list_page; + int i, 
j, r; + int entries = 0; + + assert(qiov->size); + assert(QEMU_IS_ALIGNED(qiov->size, s->page_size)); + assert(qiov->size / s->page_size <= s->page_size / sizeof(uint64_t)); + for (i = 0; i < qiov->niov; ++i) { + bool retry = true; + uint64_t iova; +try_map: + r = qemu_vfio_dma_map(s->vfio, + qiov->iov[i].iov_base, + qiov->iov[i].iov_len, + true, &iova); + if (r == -ENOMEM && retry) { + retry = false; + trace_nvme_dma_flush_queue_wait(s); + if (s->dma_map_count) { + trace_nvme_dma_map_flush(s); + qemu_co_queue_wait(&s->dma_flush_queue, &s->dma_map_lock); + } else { + r = qemu_vfio_dma_reset_temporary(s->vfio); + if (r) { + goto fail; + } + } + goto try_map; + } + if (r) { + goto fail; + } + + for (j = 0; j < qiov->iov[i].iov_len / s->page_size; j++) { + pagelist[entries++] = iova + j * s->page_size; + } + trace_nvme_cmd_map_qiov_iov(s, i, qiov->iov[i].iov_base, + qiov->iov[i].iov_len / s->page_size); + } + + s->dma_map_count += qiov->size; + + assert(entries <= s->page_size / sizeof(uint64_t)); + switch (entries) { + case 0: + abort(); + case 1: + cmd->prp1 = cpu_to_le64(pagelist[0]); + cmd->prp2 = 0; + break; + case 2: + cmd->prp1 = cpu_to_le64(pagelist[0]); + cmd->prp2 = cpu_to_le64(pagelist[1]); + break; + default: + cmd->prp1 = cpu_to_le64(pagelist[0]); + cmd->prp2 = cpu_to_le64(req->prp_list_iova); + for (i = 0; i < entries - 1; ++i) { + pagelist[i] = cpu_to_le64(pagelist[i + 1]); + } + pagelist[entries - 1] = 0; + break; + } + trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries); + for (i = 0; i < entries; ++i) { + trace_nvme_cmd_map_qiov_pages(s, i, pagelist[i]); + } + return 0; +fail: + /* No need to unmap [0 - i) iovs even if we've failed, since we don't + * increment s->dma_map_count. This is okay for fixed mapping memory areas + * because they are already mapped before calling this function; for + * temporary mappings, a later nvme_cmd_(un)map_qiov will reclaim by + * calling qemu_vfio_dma_reset_temporary when necessary. */ + return r; +} + +typedef struct { + Coroutine *co; + int ret; + AioContext *ctx; +} NVMeCoData; + +static void nvme_rw_cb_bh(void *opaque) +{ + NVMeCoData *data = opaque; + qemu_coroutine_enter(data->co); +} + +static void nvme_rw_cb(void *opaque, int ret) +{ + NVMeCoData *data = opaque; + data->ret = ret; + if (!data->co) { + /* The rw coroutine hasn't yielded, don't try to enter. */ + return; + } + aio_bh_schedule_oneshot(data->ctx, nvme_rw_cb_bh, data); +} + +static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, + bool is_write, + int flags) +{ + int r; + BDRVNVMeState *s = bs->opaque; + NVMeQueuePair *ioq = s->queues[1]; + NVMeRequest *req; + uint32_t cdw12 = (((bytes >> BDRV_SECTOR_BITS) - 1) & 0xFFFF) | + (flags & BDRV_REQ_FUA ? 1 << 30 : 0); + NvmeCmd cmd = { + .opcode = is_write ? 
NVME_CMD_WRITE : NVME_CMD_READ, + .nsid = cpu_to_le32(s->nsid), + .cdw10 = cpu_to_le32((offset >> BDRV_SECTOR_BITS) & 0xFFFFFFFF), + .cdw11 = cpu_to_le32(((offset >> BDRV_SECTOR_BITS) >> 32) & 0xFFFFFFFF), + .cdw12 = cpu_to_le32(cdw12), + }; + NVMeCoData data = { + .ctx = bdrv_get_aio_context(bs), + .ret = -EINPROGRESS, + }; + + trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov); + assert(s->nr_queues > 1); + req = nvme_get_free_req(ioq); + assert(req); + + qemu_co_mutex_lock(&s->dma_map_lock); + r = nvme_cmd_map_qiov(bs, &cmd, req, qiov); + qemu_co_mutex_unlock(&s->dma_map_lock); + if (r) { + req->busy = false; + return r; + } + nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data); + + data.co = qemu_coroutine_self(); + while (data.ret == -EINPROGRESS) { + qemu_coroutine_yield(); + } + + qemu_co_mutex_lock(&s->dma_map_lock); + r = nvme_cmd_unmap_qiov(bs, qiov); + qemu_co_mutex_unlock(&s->dma_map_lock); + if (r) { + return r; + } + + trace_nvme_rw_done(s, is_write, offset, bytes, data.ret); + return data.ret; +} + +static inline bool nvme_qiov_aligned(BlockDriverState *bs, + const QEMUIOVector *qiov) +{ + int i; + BDRVNVMeState *s = bs->opaque; + + for (i = 0; i < qiov->niov; ++i) { + if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base, s->page_size) || + !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, s->page_size)) { + trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base, + qiov->iov[i].iov_len, s->page_size); + return false; + } + } + return true; +} + +static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, bool is_write, int flags) +{ + BDRVNVMeState *s = bs->opaque; + int r; + uint8_t *buf = NULL; + QEMUIOVector local_qiov; + + assert(QEMU_IS_ALIGNED(offset, s->page_size)); + assert(QEMU_IS_ALIGNED(bytes, s->page_size)); + assert(bytes <= s->max_transfer); + if (nvme_qiov_aligned(bs, qiov)) { + return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags); + } + trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write); + buf = qemu_try_blockalign(bs, bytes); + + if (!buf) { + return -ENOMEM; + } + qemu_iovec_init(&local_qiov, 1); + if (is_write) { + qemu_iovec_to_buf(qiov, 0, buf, bytes); + } + qemu_iovec_add(&local_qiov, buf, bytes); + r = nvme_co_prw_aligned(bs, offset, bytes, &local_qiov, is_write, flags); + qemu_iovec_destroy(&local_qiov); + if (!r && !is_write) { + qemu_iovec_from_buf(qiov, 0, buf, bytes); + } + qemu_vfree(buf); + return r; +} + +static coroutine_fn int nvme_co_preadv(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) +{ + return nvme_co_prw(bs, offset, bytes, qiov, false, flags); +} + +static coroutine_fn int nvme_co_pwritev(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) +{ + return nvme_co_prw(bs, offset, bytes, qiov, true, flags); +} + +static coroutine_fn int nvme_co_flush(BlockDriverState *bs) +{ + BDRVNVMeState *s = bs->opaque; + NVMeQueuePair *ioq = s->queues[1]; + NVMeRequest *req; + NvmeCmd cmd = { + .opcode = NVME_CMD_FLUSH, + .nsid = cpu_to_le32(s->nsid), + }; + NVMeCoData data = { + .ctx = bdrv_get_aio_context(bs), + .ret = -EINPROGRESS, + }; + + assert(s->nr_queues > 1); + req = nvme_get_free_req(ioq); + assert(req); + nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data); + + data.co = qemu_coroutine_self(); + if (data.ret == -EINPROGRESS) { + qemu_coroutine_yield(); + } + + return data.ret; +} + + +static int nvme_reopen_prepare(BDRVReopenState *reopen_state, + BlockReopenQueue *queue, Error 
**errp) +{ + return 0; +} + +static int64_t coroutine_fn nvme_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum, + BlockDriverState **file) +{ + *pnum = nb_sectors; + *file = bs; + + return BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_OFFSET_VALID | + (sector_num << BDRV_SECTOR_BITS); +} + +static void nvme_refresh_filename(BlockDriverState *bs, QDict *opts) +{ + QINCREF(opts); + qdict_del(opts, "filename"); + + if (!qdict_size(opts)) { + snprintf(bs->exact_filename, sizeof(bs->exact_filename), "%s://", + bs->drv->format_name); + } + + qdict_put(opts, "driver", qstring_from_str(bs->drv->format_name)); + bs->full_open_options = opts; +} + +static void nvme_refresh_limits(BlockDriverState *bs, Error **errp) +{ + BDRVNVMeState *s = bs->opaque; + + bs->bl.opt_mem_alignment = s->page_size; + bs->bl.request_alignment = s->page_size; + bs->bl.max_transfer = s->max_transfer; +} + +static void nvme_detach_aio_context(BlockDriverState *bs) +{ + BDRVNVMeState *s = bs->opaque; + + aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier, + false, NULL, NULL); +} + +static void nvme_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + BDRVNVMeState *s = bs->opaque; + + s->aio_context = new_context; + aio_set_event_notifier(new_context, &s->irq_notifier, + false, nvme_handle_event, nvme_poll_cb); +} + +static void nvme_aio_plug(BlockDriverState *bs) +{ + BDRVNVMeState *s = bs->opaque; + s->plugged++; +} + +static void nvme_aio_unplug(BlockDriverState *bs) +{ + int i; + BDRVNVMeState *s = bs->opaque; + assert(s->plugged); + if (!--s->plugged) { + for (i = 1; i < s->nr_queues; i++) { + NVMeQueuePair *q = s->queues[i]; + qemu_mutex_lock(&q->lock); + nvme_kick(s, q); + nvme_process_completion(s, q); + qemu_mutex_unlock(&q->lock); + } + } +} + +static void nvme_register_buf(BlockDriverState *bs, void *host, size_t size) +{ + int ret; + BDRVNVMeState *s = bs->opaque; + + ret = qemu_vfio_dma_map(s->vfio, host, size, false, NULL); + if (ret) { + /* FIXME: we may run out of IOVA addresses after repeated + * bdrv_register_buf/bdrv_unregister_buf, because nvme_vfio_dma_unmap + * doesn't reclaim addresses for fixed mappings. 
*/ + error_report("nvme_register_buf failed: %s", strerror(-ret)); + } +} + +static void nvme_unregister_buf(BlockDriverState *bs, void *host) +{ + BDRVNVMeState *s = bs->opaque; + + qemu_vfio_dma_unmap(s->vfio, host); +} + +static BlockDriver bdrv_nvme = { + .format_name = "nvme", + .protocol_name = "nvme", + .instance_size = sizeof(BDRVNVMeState), + + .bdrv_parse_filename = nvme_parse_filename, + .bdrv_file_open = nvme_file_open, + .bdrv_close = nvme_close, + .bdrv_getlength = nvme_getlength, + + .bdrv_co_preadv = nvme_co_preadv, + .bdrv_co_pwritev = nvme_co_pwritev, + .bdrv_co_flush_to_disk = nvme_co_flush, + .bdrv_reopen_prepare = nvme_reopen_prepare, + + .bdrv_co_get_block_status = nvme_co_get_block_status, + + .bdrv_refresh_filename = nvme_refresh_filename, + .bdrv_refresh_limits = nvme_refresh_limits, + + .bdrv_detach_aio_context = nvme_detach_aio_context, + .bdrv_attach_aio_context = nvme_attach_aio_context, + + .bdrv_io_plug = nvme_aio_plug, + .bdrv_io_unplug = nvme_aio_unplug, + + .bdrv_register_buf = nvme_register_buf, + .bdrv_unregister_buf = nvme_unregister_buf, +}; + +static void bdrv_nvme_init(void) +{ + bdrv_register(&bdrv_nvme); +} + +block_init(bdrv_nvme_init); diff --git a/block/trace-events b/block/trace-events index 11c8d5f590..02dd80ff0c 100644 --- a/block/trace-events +++ b/block/trace-events @@ -124,3 +124,24 @@ vxhs_open_iio_open(const char *host) "Failed to connect to storage agent on host vxhs_parse_uri_hostinfo(char *host, int port) "Host: IP %s, Port %d" vxhs_close(char *vdisk_guid) "Closing vdisk %s" vxhs_get_creds(const char *cacert, const char *client_key, const char *client_cert) "cacert %s, client_key %s, client_cert %s" + +# block/nvme.c +nvme_kick(void *s, int queue) "s %p queue %d" +nvme_dma_flush_queue_wait(void *s) "s %p" +nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x" +nvme_process_completion(void *s, int index, int inflight) "s %p queue %d inflight %d" +nvme_process_completion_queue_busy(void *s, int index) "s %p queue %d" +nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid %d" +nvme_submit_command(void *s, int index, int cid) "s %p queue %d cid %d" +nvme_submit_command_raw(int c0, int c1, int c2, int c3, int c4, int c5, int c6, int c7) "%02x %02x %02x %02x %02x %02x %02x %02x" +nvme_handle_event(void *s) "s %p" +nvme_poll_cb(void *s) "s %p" +nvme_prw_aligned(void *s, int is_write, uint64_t offset, uint64_t bytes, int flags, int niov) "s %p is_write %d offset %"PRId64" bytes %"PRId64" flags %d niov %d" +nvme_qiov_unaligned(const void *qiov, int n, void *base, size_t size, int align) "qiov %p n %d base %p size 0x%zx align 0x%x" +nvme_prw_buffered(void *s, uint64_t offset, uint64_t bytes, int niov, int is_write) "s %p offset %"PRId64" bytes %"PRId64" niov %d is_write %d" +nvme_rw_done(void *s, int is_write, uint64_t offset, uint64_t bytes, int ret) "s %p is_write %d offset %"PRId64" bytes %"PRId64" ret %d" +nvme_dma_map_flush(void *s) "s %p" +nvme_free_req_queue_wait(void *q) "q %p" +nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s %p cmd %p req %p qiov %p entries %d" +nvme_cmd_map_qiov_pages(void *s, int i, uint64_t page) "s %p page[%d] 0x%"PRIx64 +nvme_cmd_map_qiov_iov(void *s, int i, void *page, int pages) "s %p iov[%d] %p pages %d" diff --git a/docs/devel/testing.rst b/docs/devel/testing.rst new file mode 100644 index 0000000000..0ca1a2d4b5 --- /dev/null +++ b/docs/devel/testing.rst @@ -0,0 +1,486 @@ +=============== 
+Testing in QEMU +=============== + +This document describes the testing infrastructure in QEMU. + +Testing with "make check" +========================= + +The "make check" testing family includes most of the C based tests in QEMU. For +quick help, run ``make check-help`` from the source tree. + +The usual way to run these tests is: + +.. code:: + + make check + +which includes QAPI schema tests, unit tests, and QTests. Different sub-types +of "make check" tests will be explained below. + +Before running tests, it is best to build QEMU programs first. Some tests +expect the executables to exist and will fail with obscure messages if they +cannot find them. + +Unit tests +---------- + +Unit tests, which can be invoked with ``make check-unit``, are simple C tests +that typically link to individual QEMU object files and exercise them by +calling exported functions. + +If you are writing new code in QEMU, consider adding a unit test, especially +for utility modules that are relatively stateless or have few dependencies. To +add a new unit test: + +1. Create a new source file. For example, ``tests/foo-test.c``. + +2. Write the test. Normally you would include the header file which exports + the module API, then verify the interface behaves as expected from your + test. The test code should be organized with the glib testing framework. + Copying and modifying an existing test is usually a good idea. + +3. Add the test to ``tests/Makefile.include``. First, name the unit test + program and add it to ``$(check-unit-y)``; then add a rule to build the + executable. Optionally, you can add a magical variable to support ``gcov``. + For example: + +.. code:: + + check-unit-y += tests/foo-test$(EXESUF) + tests/foo-test$(EXESUF): tests/foo-test.o $(test-util-obj-y) + ... + gcov-files-foo-test-y = util/foo.c + +Since unit tests don't require environment variables, the simplest way to debug +a unit test failure is often directly invoking it or even running it under +``gdb``. However, there can still be differences in behavior between ``make`` +invocations and your manual run, due to the ``$MALLOC_PERTURB_`` environment +variable (which affects memory reclamation and catches invalid pointers better) +and gtester options. If necessary, you can run + +.. code:: + make check-unit V=1 + +and copy the actual command line which executes the unit test, then run +it from the command line. + +QTest +----- + +QTest is a device emulation testing framework. It can be very useful to test +device models; it could also control certain aspects of QEMU (such as virtual +clock stepping) using a special purpose "qtest" protocol. Refer to the +documentation in ``qtest.c`` for more details of the protocol. + +QTest cases can be executed with + +.. code:: + + make check-qtest + +The QTest library is implemented by ``tests/libqtest.c`` and the API is defined +in ``tests/libqtest.h``. + +Consider adding a new QTest case when you are introducing new virtual +hardware, or extending one if you are adding functionality to an existing +virtual device. + +On top of libqtest, a higher level library, ``libqos``, was created to +encapsulate common tasks of device drivers, such as memory management and +communicating with system buses or devices. Many virtual device tests use +libqos instead of directly calling into libqtest. + +Steps to add a new QTest case are: + +1. Create a new source file for the test. (More than one file can be added as + necessary.) For example, ``tests/test-foo-device.c``. + +2.
Write the test code with the glib and libqtest/libqos API. See also existing + tests and the library headers for reference. + +3. Register the new test in ``tests/Makefile.include``. Add the test executable + name to an appropriate ``check-qtest-*-y`` variable. For example: + + ``check-qtest-generic-y = tests/test-foo-device$(EXESUF)`` + +4. Add object dependencies of the executable in the Makefile, including the + test source file(s) and other interesting objects. For example: + + ``tests/test-foo-device$(EXESUF): tests/test-foo-device.o $(libqos-obj-y)`` + +Debugging a QTest failure is slightly harder than a unit test because the +tests look up QEMU program names in the environment variables, such as +``QTEST_QEMU_BINARY`` and ``QTEST_QEMU_IMG``, and also because it is not easy +to attach gdb to the QEMU process spawned from the test. But invoking the test +manually and running it under gdb is still simple: find out the actual command +from the output of + +.. code:: + make check-qtest V=1 + +which you can run manually. + +QAPI schema tests +----------------- + +The QAPI schema tests validate the QAPI parser used by QMP, by feeding +predefined input to the parser and comparing the result with the reference +output. + +The input/output data is managed under the ``tests/qapi-schema`` directory. +Each test case includes four files that have a common base name: + + * ``${casename}.json`` - the file contains the JSON input for feeding the + parser + * ``${casename}.out`` - the file contains the expected stdout from the parser + * ``${casename}.err`` - the file contains the expected stderr from the parser + * ``${casename}.exit`` - the expected error code + +Consider adding a new QAPI schema test when you are making a change to the QAPI +parser (either fixing a bug or extending/modifying the syntax). To do this: + +1. Add four files for the new case as explained above. For example: + + ``$EDITOR tests/qapi-schema/foo.{json,out,err,exit}``. + +2. Add the new test in ``tests/Makefile.include``. For example: + + ``qapi-schema += foo.json`` + +check-block +----------- + +``make check-block`` is a legacy command to invoke block layer iotests and is +rarely used. See the "QEMU iotests" section below for more information. + +GCC gcov support +---------------- + +``gcov`` is a GCC tool to analyze the testing coverage by instrumenting the +tested code. To use it, configure QEMU with the ``--enable-gcov`` option and build. +Then run ``make check`` as usual. There will be additional ``gcov`` output as +the testing goes on, showing the test coverage percentage numbers per analyzed +source file. More detailed reports can be obtained by running the ``gcov`` command +on the output files under ``$build_dir/tests/``; please read the ``gcov`` +documentation for more information. + +QEMU iotests +============ + +QEMU iotests, under the directory ``tests/qemu-iotests``, is the testing +framework widely used to test block layer related features. It is higher level +than "make check" tests and 99% of the code is written in bash or Python +scripts. The success criterion is golden output comparison, and the +test files are named with numbers. + +To run iotests, make sure QEMU is built successfully, then switch to the +``tests/qemu-iotests`` directory under the build directory, and run ``./check`` +with desired arguments from there. + +By default, "raw" format and "file" protocol are used; all tests will be +executed, except the unsupported ones. You can override the format and protocol +with arguments: + +.. 
code:: + + # test with qcow2 format + ./check -qcow2 + # or test a different protocol + ./check -nbd + +It's also possible to list test numbers explicitly: + +.. code:: + + # run selected cases with qcow2 format + ./check -qcow2 001 030 153 + +Cache mode can be selected with the "-c" option, which may help reveal bugs +that are specific to certain cache modes. + +More options are supported by the ``./check`` script; run ``./check -h`` for +help. + +Writing a new test case +----------------------- + +Consider writing a test case when you are making any changes to the block +layer. An iotest case is usually the choice for that. There are already many +test cases, so it is possible that extending one of them may achieve the goal +and save the boilerplate of creating one. (Unfortunately, there isn't a 100% +reliable way to find a related one out of hundreds of tests. One approach is +using ``git grep``.) + +Usually an iotest case consists of two files. One is an executable that +produces output to stdout and stderr, the other is the expected reference +output. They are given the same number in their file names, e.g. test script ``055`` +and reference output ``055.out``. + +In rare cases, when outputs differ between cache mode ``none`` and others, a +``.out.nocache`` file is added. In other cases, when outputs differ between +image formats, more than one ``.out`` file is created, ending with the +respective format names, e.g. ``178.out.qcow2`` and ``178.out.raw``. + +There isn't a hard rule about how to write a test script, but a new test is +usually a (copy and) modification of an existing case. There are a few +commonly used ways to create a test: + +* A Bash script. It will make use of several environment variables related + to the testing procedure, and could source a group of ``common.*`` libraries + for some common helper routines. + +* A Python unittest script. Import ``iotests`` and create a subclass of + ``iotests.QMPTestCase``, then call the ``iotests.main`` method. The downside of + this approach is that the output is too scarce, and the script is considered + harder to debug. + +* A simple Python script without using the unittest module. This could also import + ``iotests`` for launching QEMU and utilities etc., but it doesn't inherit + from ``iotests.QMPTestCase`` and therefore doesn't use the Python unittest + execution. This is a combination of the first two approaches. + +Pick the language per your preference since both Bash and Python have +comparable library support for invoking and interacting with QEMU programs. If +you opt for Python, it is strongly recommended to write Python 3 compatible +code. + +Docker based tests +================== + +Introduction +------------ + +The Docker testing framework in QEMU utilizes public Docker images to build and +test QEMU in predefined and widely accessible Linux environments. This makes +it possible to expand the test coverage across distros, toolchain flavors and +library versions. + +Prerequisites +------------- + +Install "docker" with the system package manager and start the Docker service +on your development machine, then make sure you have the privilege to run +Docker commands. Typically it means setting up a passwordless ``sudo docker`` +command or logging in as root. For example: + +.. code:: + + $ sudo yum install docker + $ # or `apt-get install docker` for Ubuntu, etc. + $ sudo systemctl start docker + $ sudo docker ps + +The last command should print an empty table, to verify the system is ready.
+ +An alternative method to set up permissions is by adding the current user to the +"docker" group and making the docker daemon socket file (by default +``/var/run/docker.sock``) accessible to the group: + +.. code:: + + $ sudo groupadd docker + $ sudo usermod $USER -G docker + $ sudo chown :docker /var/run/docker.sock + +Note that any one of the above configurations makes it possible for the user to +exploit the whole host with Docker bind mounting or other privileged +operations. So only do it on development machines. + +Quickstart +---------- + +From the source tree, type ``make docker`` to see the help. Testing can be started +without configuring or building QEMU (``configure`` and ``make`` are done in +the container, with parameters defined by the make target): + +.. code:: + + make docker-test-build@min-glib + +This will create a container instance using the ``min-glib`` image (the image +is downloaded and initialized automatically), in which the ``test-build`` job +is executed. + +Images +------ + +Along with many other images, the ``min-glib`` image is defined in a Dockerfile +in ``tests/docker/dockerfiles/``, called ``min-glib.docker``. The ``make docker`` +command will list all the available images. + +To add a new image, simply create a new ``.docker`` file under the +``tests/docker/dockerfiles/`` directory. + +A ``.pre`` script can be added beside the ``.docker`` file, which will be +executed before building the image under the build context directory. This is +mainly used to do necessary host side setup. One such setup is ``binfmt_misc``, +for example, to make qemu-user powered cross build containers work. + +Tests +----- + +Different tests are added to cover various configurations to build and test +QEMU. Docker tests are the executables under ``tests/docker`` named +``test-*``. They are typically shell scripts and are built on top of a shell +library, ``tests/docker/common.rc``, which provides helpers to find the QEMU +source and build it. + +The full list of tests is printed in the ``make docker`` help. + +Tools +----- + +There are executables that are created to run in a specific Docker environment. +This makes it easy to write scripts that have heavy or special dependencies, +but are still very easy to use. + +Currently the only tool is ``travis``, which mimics the Travis-CI tests in a +container. It runs in the ``travis`` image: + +.. code:: + + make docker-travis@travis + +Debugging a Docker test failure +------------------------------- + +When a CI task, a maintainer or you yourself reports a Docker test failure, follow the +steps below to debug it: + +1. Locally reproduce the failure with the reported command line. E.g. run + ``make docker-test-mingw@fedora J=8``. +2. Add "V=1" to the command line and try again to see the verbose output. +3. Further add "DEBUG=1" to the command line. This will pause at a shell prompt + in the container right before testing starts. You could either manually + build QEMU and run tests from there, or press Ctrl-D to let the Docker + testing continue. +4. If you press Ctrl-D, the same building and testing procedure will begin, and + will hopefully run into the error again. After that, you will be dropped to + the prompt to debug. + +Options +------- + +Various options can be used to affect how Docker tests are done. The full +list is in the ``make docker`` help text. The frequently used ones are: + +* ``V=1``: the same as in top level ``make``. It will be propagated to the + container and enable verbose output.
+* ``J=$N``: the number of parallel tasks in make commands in the container, + similar to the ``-j $N`` option in top level ``make``. (The ``-j`` option in + top level ``make`` will not be propagated into the container.) +* ``DEBUG=1``: enables debug. See the previous "Debugging a Docker test + failure" section. + +VM testing +========== + +This test suite contains scripts that bootstrap various guest images that have the +necessary packages to build QEMU. The basic usage is documented in the ``Makefile`` +help, which is displayed with ``make vm-test``. + +Quickstart +---------- + +Run ``make vm-test`` to list available make targets. Invoke a specific make +command to run a build test in an image. For example, ``make vm-build-freebsd`` +will build the source tree in the FreeBSD image. The command can be executed +from either the source tree or the build dir; if the former, ``./configure`` is +not needed. The command will then generate the test image in ``./tests/vm/`` +under the working directory. + +Note: images created by the scripts accept a well-known RSA key pair for SSH +access, so they SHOULD NOT be exposed to external interfaces if you are +concerned about attackers taking control of the guest and potentially +exploiting a QEMU security bug to compromise the host. + +QEMU binary +----------- + +By default, qemu-system-x86_64 is searched for in $PATH to run the guest. If there +isn't one, or if it is older than 2.10, the test won't work. In this case, +provide the QEMU binary in an environment variable: ``QEMU=/path/to/qemu-2.10+``. + +Make jobs +--------- + +The ``-j$X`` option in the make command line is not propagated into the VM; +specify ``J=$X`` to control the make jobs in the guest. + +Debugging +--------- + +Add ``DEBUG=1`` and/or ``V=1`` to the make command to allow interactive +debugging and verbose output. If this is not enough, see the next section. + +Manual invocation +----------------- + +Each guest script is an executable script with the same command line options. +For example, to work with the netbsd guest, use ``$QEMU_SRC/tests/vm/netbsd``: + +.. code:: + + $ cd $QEMU_SRC/tests/vm + + # To bootstrap the image + $ ./netbsd --build-image --image /var/tmp/netbsd.img + <...> + + # To run an arbitrary command in guest (the output will not be echoed unless + # --debug is added) + $ ./netbsd --debug --image /var/tmp/netbsd.img uname -a + + # To build QEMU in guest + $ ./netbsd --debug --image /var/tmp/netbsd.img --build-qemu $QEMU_SRC + + # To get to an interactive shell + $ ./netbsd --interactive --image /var/tmp/netbsd.img sh + +Adding new guests +----------------- + +Please look at existing guest scripts for how to add new guests. + +Most importantly, create a subclass of BaseVM and implement the ``build_image()`` +method and define ``BUILD_SCRIPT``, then finally call ``basevm.main()`` from +the script's ``main()``. + +* Usually in ``build_image()``, a template image is downloaded from a + predefined URL. ``BaseVM._download_with_cache()`` takes care of the cache and + the checksum, so consider using it.
+ +* Once the image is downloaded, users, the SSH server and QEMU build dependencies should + be set up: + + - Root password set to ``BaseVM.ROOT_PASS`` + - User ``BaseVM.GUEST_USER`` is created, and password set to + ``BaseVM.GUEST_PASS`` + - SSH service is enabled and started on boot, and + ``$QEMU_SRC/tests/keys/id_rsa.pub`` is added to ssh's ``authorized_keys`` + file of both root and the normal user + - DHCP client service is enabled and started on boot, so that it can + automatically configure the virtio-net-pci NIC and communicate with QEMU + user net (10.0.2.2) + - Necessary packages are installed to untar the source tarball and build + QEMU + +* Write a proper ``BUILD_SCRIPT`` template, which should be a shell script that + untars a raw virtio-blk block device containing the tarball of the + QEMU source tree, then configures and builds it. Running "make check" is also + recommended. + +Image fuzzer testing +==================== + +An image fuzzer was added to exercise format drivers. Currently only qcow2 is +supported. To start the fuzzer, run + +.. code:: + + tests/image-fuzzer/runner.py -c '[["qemu-img", "info", "$test_img"]]' /tmp/test qcow2 + +Alternatively, a command other than "qemu-img info" can be tested by +changing the ``-c`` option. diff --git a/docs/qemu-block-drivers.texi b/docs/qemu-block-drivers.texi index 503c1847aa..cd74767ed3 100644 --- a/docs/qemu-block-drivers.texi +++ b/docs/qemu-block-drivers.texi @@ -785,6 +785,43 @@ warning: ssh server @code{ssh.example.com:22} does not support fsync With sufficiently new versions of libssh2 and OpenSSH, @code{fsync} is supported. +@node disk_images_nvme +@subsection NVMe disk images + +NVM Express (NVMe) storage controllers can be accessed directly by a userspace +driver in QEMU. This bypasses the host kernel file system and block layers +while retaining QEMU block layer functionalities, such as block jobs, I/O +throttling, image formats, etc. Disk I/O performance is typically higher than +with @code{-drive file=/dev/sda} using either the thread pool or linux-aio. + +The controller will be exclusively used by the QEMU process once started. To be +able to share storage between multiple VMs and other applications on the host, +please use the file based protocols. + +Before starting QEMU, bind the host NVMe controller to the host vfio-pci +driver. For example: + +@example +# modprobe vfio-pci +# lspci -n -s 0000:06:0d.0 +06:0d.0 0401: 1102:0002 (rev 08) +# echo 0000:06:0d.0 > /sys/bus/pci/devices/0000:06:0d.0/driver/unbind +# echo 1102 0002 > /sys/bus/pci/drivers/vfio-pci/new_id + +# qemu-system-x86_64 -drive file=nvme://@var{host}:@var{bus}:@var{slot}.@var{func}/@var{namespace} +@end example + +Alternative syntax using properties: + +@example +qemu-system-x86_64 -drive file.driver=nvme,file.device=@var{host}:@var{bus}:@var{slot}.@var{func},file.namespace=@var{namespace} +@end example + +@var{host}:@var{bus}:@var{slot}.@var{func} is the NVMe controller's PCI device +address on the host. + +@var{namespace} is the NVMe namespace number, starting from 1. 
+ @node disk_image_locking @subsection Disk image file locking diff --git a/fsdev/qemu-fsdev-throttle.c b/fsdev/qemu-fsdev-throttle.c index 49eebb5412..1dc07fbc12 100644 --- a/fsdev/qemu-fsdev-throttle.c +++ b/fsdev/qemu-fsdev-throttle.c @@ -20,13 +20,13 @@ static void fsdev_throttle_read_timer_cb(void *opaque) { FsThrottle *fst = opaque; - qemu_co_enter_next(&fst->throttled_reqs[false]); + qemu_co_enter_next(&fst->throttled_reqs[false], NULL); } static void fsdev_throttle_write_timer_cb(void *opaque) { FsThrottle *fst = opaque; - qemu_co_enter_next(&fst->throttled_reqs[true]); + qemu_co_enter_next(&fst->throttled_reqs[true], NULL); } void fsdev_throttle_parse_opts(QemuOpts *opts, FsThrottle *fst, Error **errp) diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 7b62dad072..8f3981121d 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -1,703 +1,7 @@ #ifndef HW_NVME_H #define HW_NVME_H #include "qemu/cutils.h" - -typedef struct NvmeBar { - uint64_t cap; - uint32_t vs; - uint32_t intms; - uint32_t intmc; - uint32_t cc; - uint32_t rsvd1; - uint32_t csts; - uint32_t nssrc; - uint32_t aqa; - uint64_t asq; - uint64_t acq; - uint32_t cmbloc; - uint32_t cmbsz; -} NvmeBar; - -enum NvmeCapShift { - CAP_MQES_SHIFT = 0, - CAP_CQR_SHIFT = 16, - CAP_AMS_SHIFT = 17, - CAP_TO_SHIFT = 24, - CAP_DSTRD_SHIFT = 32, - CAP_NSSRS_SHIFT = 33, - CAP_CSS_SHIFT = 37, - CAP_MPSMIN_SHIFT = 48, - CAP_MPSMAX_SHIFT = 52, -}; - -enum NvmeCapMask { - CAP_MQES_MASK = 0xffff, - CAP_CQR_MASK = 0x1, - CAP_AMS_MASK = 0x3, - CAP_TO_MASK = 0xff, - CAP_DSTRD_MASK = 0xf, - CAP_NSSRS_MASK = 0x1, - CAP_CSS_MASK = 0xff, - CAP_MPSMIN_MASK = 0xf, - CAP_MPSMAX_MASK = 0xf, -}; - -#define NVME_CAP_MQES(cap) (((cap) >> CAP_MQES_SHIFT) & CAP_MQES_MASK) -#define NVME_CAP_CQR(cap) (((cap) >> CAP_CQR_SHIFT) & CAP_CQR_MASK) -#define NVME_CAP_AMS(cap) (((cap) >> CAP_AMS_SHIFT) & CAP_AMS_MASK) -#define NVME_CAP_TO(cap) (((cap) >> CAP_TO_SHIFT) & CAP_TO_MASK) -#define NVME_CAP_DSTRD(cap) (((cap) >> CAP_DSTRD_SHIFT) & CAP_DSTRD_MASK) -#define NVME_CAP_NSSRS(cap) (((cap) >> CAP_NSSRS_SHIFT) & CAP_NSSRS_MASK) -#define NVME_CAP_CSS(cap) (((cap) >> CAP_CSS_SHIFT) & CAP_CSS_MASK) -#define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK) -#define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK) - -#define NVME_CAP_SET_MQES(cap, val) (cap |= (uint64_t)(val & CAP_MQES_MASK) \ - << CAP_MQES_SHIFT) -#define NVME_CAP_SET_CQR(cap, val) (cap |= (uint64_t)(val & CAP_CQR_MASK) \ - << CAP_CQR_SHIFT) -#define NVME_CAP_SET_AMS(cap, val) (cap |= (uint64_t)(val & CAP_AMS_MASK) \ - << CAP_AMS_SHIFT) -#define NVME_CAP_SET_TO(cap, val) (cap |= (uint64_t)(val & CAP_TO_MASK) \ - << CAP_TO_SHIFT) -#define NVME_CAP_SET_DSTRD(cap, val) (cap |= (uint64_t)(val & CAP_DSTRD_MASK) \ - << CAP_DSTRD_SHIFT) -#define NVME_CAP_SET_NSSRS(cap, val) (cap |= (uint64_t)(val & CAP_NSSRS_MASK) \ - << CAP_NSSRS_SHIFT) -#define NVME_CAP_SET_CSS(cap, val) (cap |= (uint64_t)(val & CAP_CSS_MASK) \ - << CAP_CSS_SHIFT) -#define NVME_CAP_SET_MPSMIN(cap, val) (cap |= (uint64_t)(val & CAP_MPSMIN_MASK)\ - << CAP_MPSMIN_SHIFT) -#define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\ - << CAP_MPSMAX_SHIFT) - -enum NvmeCcShift { - CC_EN_SHIFT = 0, - CC_CSS_SHIFT = 4, - CC_MPS_SHIFT = 7, - CC_AMS_SHIFT = 11, - CC_SHN_SHIFT = 14, - CC_IOSQES_SHIFT = 16, - CC_IOCQES_SHIFT = 20, -}; - -enum NvmeCcMask { - CC_EN_MASK = 0x1, - CC_CSS_MASK = 0x7, - CC_MPS_MASK = 0xf, - CC_AMS_MASK = 0x7, - CC_SHN_MASK = 0x3, - CC_IOSQES_MASK = 0xf, - CC_IOCQES_MASK 
= 0xf, -}; - -#define NVME_CC_EN(cc) ((cc >> CC_EN_SHIFT) & CC_EN_MASK) -#define NVME_CC_CSS(cc) ((cc >> CC_CSS_SHIFT) & CC_CSS_MASK) -#define NVME_CC_MPS(cc) ((cc >> CC_MPS_SHIFT) & CC_MPS_MASK) -#define NVME_CC_AMS(cc) ((cc >> CC_AMS_SHIFT) & CC_AMS_MASK) -#define NVME_CC_SHN(cc) ((cc >> CC_SHN_SHIFT) & CC_SHN_MASK) -#define NVME_CC_IOSQES(cc) ((cc >> CC_IOSQES_SHIFT) & CC_IOSQES_MASK) -#define NVME_CC_IOCQES(cc) ((cc >> CC_IOCQES_SHIFT) & CC_IOCQES_MASK) - -enum NvmeCstsShift { - CSTS_RDY_SHIFT = 0, - CSTS_CFS_SHIFT = 1, - CSTS_SHST_SHIFT = 2, - CSTS_NSSRO_SHIFT = 4, -}; - -enum NvmeCstsMask { - CSTS_RDY_MASK = 0x1, - CSTS_CFS_MASK = 0x1, - CSTS_SHST_MASK = 0x3, - CSTS_NSSRO_MASK = 0x1, -}; - -enum NvmeCsts { - NVME_CSTS_READY = 1 << CSTS_RDY_SHIFT, - NVME_CSTS_FAILED = 1 << CSTS_CFS_SHIFT, - NVME_CSTS_SHST_NORMAL = 0 << CSTS_SHST_SHIFT, - NVME_CSTS_SHST_PROGRESS = 1 << CSTS_SHST_SHIFT, - NVME_CSTS_SHST_COMPLETE = 2 << CSTS_SHST_SHIFT, - NVME_CSTS_NSSRO = 1 << CSTS_NSSRO_SHIFT, -}; - -#define NVME_CSTS_RDY(csts) ((csts >> CSTS_RDY_SHIFT) & CSTS_RDY_MASK) -#define NVME_CSTS_CFS(csts) ((csts >> CSTS_CFS_SHIFT) & CSTS_CFS_MASK) -#define NVME_CSTS_SHST(csts) ((csts >> CSTS_SHST_SHIFT) & CSTS_SHST_MASK) -#define NVME_CSTS_NSSRO(csts) ((csts >> CSTS_NSSRO_SHIFT) & CSTS_NSSRO_MASK) - -enum NvmeAqaShift { - AQA_ASQS_SHIFT = 0, - AQA_ACQS_SHIFT = 16, -}; - -enum NvmeAqaMask { - AQA_ASQS_MASK = 0xfff, - AQA_ACQS_MASK = 0xfff, -}; - -#define NVME_AQA_ASQS(aqa) ((aqa >> AQA_ASQS_SHIFT) & AQA_ASQS_MASK) -#define NVME_AQA_ACQS(aqa) ((aqa >> AQA_ACQS_SHIFT) & AQA_ACQS_MASK) - -enum NvmeCmblocShift { - CMBLOC_BIR_SHIFT = 0, - CMBLOC_OFST_SHIFT = 12, -}; - -enum NvmeCmblocMask { - CMBLOC_BIR_MASK = 0x7, - CMBLOC_OFST_MASK = 0xfffff, -}; - -#define NVME_CMBLOC_BIR(cmbloc) ((cmbloc >> CMBLOC_BIR_SHIFT) & \ - CMBLOC_BIR_MASK) -#define NVME_CMBLOC_OFST(cmbloc)((cmbloc >> CMBLOC_OFST_SHIFT) & \ - CMBLOC_OFST_MASK) - -#define NVME_CMBLOC_SET_BIR(cmbloc, val) \ - (cmbloc |= (uint64_t)(val & CMBLOC_BIR_MASK) << CMBLOC_BIR_SHIFT) -#define NVME_CMBLOC_SET_OFST(cmbloc, val) \ - (cmbloc |= (uint64_t)(val & CMBLOC_OFST_MASK) << CMBLOC_OFST_SHIFT) - -enum NvmeCmbszShift { - CMBSZ_SQS_SHIFT = 0, - CMBSZ_CQS_SHIFT = 1, - CMBSZ_LISTS_SHIFT = 2, - CMBSZ_RDS_SHIFT = 3, - CMBSZ_WDS_SHIFT = 4, - CMBSZ_SZU_SHIFT = 8, - CMBSZ_SZ_SHIFT = 12, -}; - -enum NvmeCmbszMask { - CMBSZ_SQS_MASK = 0x1, - CMBSZ_CQS_MASK = 0x1, - CMBSZ_LISTS_MASK = 0x1, - CMBSZ_RDS_MASK = 0x1, - CMBSZ_WDS_MASK = 0x1, - CMBSZ_SZU_MASK = 0xf, - CMBSZ_SZ_MASK = 0xfffff, -}; - -#define NVME_CMBSZ_SQS(cmbsz) ((cmbsz >> CMBSZ_SQS_SHIFT) & CMBSZ_SQS_MASK) -#define NVME_CMBSZ_CQS(cmbsz) ((cmbsz >> CMBSZ_CQS_SHIFT) & CMBSZ_CQS_MASK) -#define NVME_CMBSZ_LISTS(cmbsz)((cmbsz >> CMBSZ_LISTS_SHIFT) & CMBSZ_LISTS_MASK) -#define NVME_CMBSZ_RDS(cmbsz) ((cmbsz >> CMBSZ_RDS_SHIFT) & CMBSZ_RDS_MASK) -#define NVME_CMBSZ_WDS(cmbsz) ((cmbsz >> CMBSZ_WDS_SHIFT) & CMBSZ_WDS_MASK) -#define NVME_CMBSZ_SZU(cmbsz) ((cmbsz >> CMBSZ_SZU_SHIFT) & CMBSZ_SZU_MASK) -#define NVME_CMBSZ_SZ(cmbsz) ((cmbsz >> CMBSZ_SZ_SHIFT) & CMBSZ_SZ_MASK) - -#define NVME_CMBSZ_SET_SQS(cmbsz, val) \ - (cmbsz |= (uint64_t)(val & CMBSZ_SQS_MASK) << CMBSZ_SQS_SHIFT) -#define NVME_CMBSZ_SET_CQS(cmbsz, val) \ - (cmbsz |= (uint64_t)(val & CMBSZ_CQS_MASK) << CMBSZ_CQS_SHIFT) -#define NVME_CMBSZ_SET_LISTS(cmbsz, val) \ - (cmbsz |= (uint64_t)(val & CMBSZ_LISTS_MASK) << CMBSZ_LISTS_SHIFT) -#define NVME_CMBSZ_SET_RDS(cmbsz, val) \ - (cmbsz |= (uint64_t)(val & CMBSZ_RDS_MASK) << CMBSZ_RDS_SHIFT) -#define 
NVME_CMBSZ_SET_WDS(cmbsz, val) \ - (cmbsz |= (uint64_t)(val & CMBSZ_WDS_MASK) << CMBSZ_WDS_SHIFT) -#define NVME_CMBSZ_SET_SZU(cmbsz, val) \ - (cmbsz |= (uint64_t)(val & CMBSZ_SZU_MASK) << CMBSZ_SZU_SHIFT) -#define NVME_CMBSZ_SET_SZ(cmbsz, val) \ - (cmbsz |= (uint64_t)(val & CMBSZ_SZ_MASK) << CMBSZ_SZ_SHIFT) - -#define NVME_CMBSZ_GETSIZE(cmbsz) \ - (NVME_CMBSZ_SZ(cmbsz) * (1 << (12 + 4 * NVME_CMBSZ_SZU(cmbsz)))) - -typedef struct NvmeCmd { - uint8_t opcode; - uint8_t fuse; - uint16_t cid; - uint32_t nsid; - uint64_t res1; - uint64_t mptr; - uint64_t prp1; - uint64_t prp2; - uint32_t cdw10; - uint32_t cdw11; - uint32_t cdw12; - uint32_t cdw13; - uint32_t cdw14; - uint32_t cdw15; -} NvmeCmd; - -enum NvmeAdminCommands { - NVME_ADM_CMD_DELETE_SQ = 0x00, - NVME_ADM_CMD_CREATE_SQ = 0x01, - NVME_ADM_CMD_GET_LOG_PAGE = 0x02, - NVME_ADM_CMD_DELETE_CQ = 0x04, - NVME_ADM_CMD_CREATE_CQ = 0x05, - NVME_ADM_CMD_IDENTIFY = 0x06, - NVME_ADM_CMD_ABORT = 0x08, - NVME_ADM_CMD_SET_FEATURES = 0x09, - NVME_ADM_CMD_GET_FEATURES = 0x0a, - NVME_ADM_CMD_ASYNC_EV_REQ = 0x0c, - NVME_ADM_CMD_ACTIVATE_FW = 0x10, - NVME_ADM_CMD_DOWNLOAD_FW = 0x11, - NVME_ADM_CMD_FORMAT_NVM = 0x80, - NVME_ADM_CMD_SECURITY_SEND = 0x81, - NVME_ADM_CMD_SECURITY_RECV = 0x82, -}; - -enum NvmeIoCommands { - NVME_CMD_FLUSH = 0x00, - NVME_CMD_WRITE = 0x01, - NVME_CMD_READ = 0x02, - NVME_CMD_WRITE_UNCOR = 0x04, - NVME_CMD_COMPARE = 0x05, - NVME_CMD_WRITE_ZEROS = 0x08, - NVME_CMD_DSM = 0x09, -}; - -typedef struct NvmeDeleteQ { - uint8_t opcode; - uint8_t flags; - uint16_t cid; - uint32_t rsvd1[9]; - uint16_t qid; - uint16_t rsvd10; - uint32_t rsvd11[5]; -} NvmeDeleteQ; - -typedef struct NvmeCreateCq { - uint8_t opcode; - uint8_t flags; - uint16_t cid; - uint32_t rsvd1[5]; - uint64_t prp1; - uint64_t rsvd8; - uint16_t cqid; - uint16_t qsize; - uint16_t cq_flags; - uint16_t irq_vector; - uint32_t rsvd12[4]; -} NvmeCreateCq; - -#define NVME_CQ_FLAGS_PC(cq_flags) (cq_flags & 0x1) -#define NVME_CQ_FLAGS_IEN(cq_flags) ((cq_flags >> 1) & 0x1) - -typedef struct NvmeCreateSq { - uint8_t opcode; - uint8_t flags; - uint16_t cid; - uint32_t rsvd1[5]; - uint64_t prp1; - uint64_t rsvd8; - uint16_t sqid; - uint16_t qsize; - uint16_t sq_flags; - uint16_t cqid; - uint32_t rsvd12[4]; -} NvmeCreateSq; - -#define NVME_SQ_FLAGS_PC(sq_flags) (sq_flags & 0x1) -#define NVME_SQ_FLAGS_QPRIO(sq_flags) ((sq_flags >> 1) & 0x3) - -enum NvmeQueueFlags { - NVME_Q_PC = 1, - NVME_Q_PRIO_URGENT = 0, - NVME_Q_PRIO_HIGH = 1, - NVME_Q_PRIO_NORMAL = 2, - NVME_Q_PRIO_LOW = 3, -}; - -typedef struct NvmeIdentify { - uint8_t opcode; - uint8_t flags; - uint16_t cid; - uint32_t nsid; - uint64_t rsvd2[2]; - uint64_t prp1; - uint64_t prp2; - uint32_t cns; - uint32_t rsvd11[5]; -} NvmeIdentify; - -typedef struct NvmeRwCmd { - uint8_t opcode; - uint8_t flags; - uint16_t cid; - uint32_t nsid; - uint64_t rsvd2; - uint64_t mptr; - uint64_t prp1; - uint64_t prp2; - uint64_t slba; - uint16_t nlb; - uint16_t control; - uint32_t dsmgmt; - uint32_t reftag; - uint16_t apptag; - uint16_t appmask; -} NvmeRwCmd; - -enum { - NVME_RW_LR = 1 << 15, - NVME_RW_FUA = 1 << 14, - NVME_RW_DSM_FREQ_UNSPEC = 0, - NVME_RW_DSM_FREQ_TYPICAL = 1, - NVME_RW_DSM_FREQ_RARE = 2, - NVME_RW_DSM_FREQ_READS = 3, - NVME_RW_DSM_FREQ_WRITES = 4, - NVME_RW_DSM_FREQ_RW = 5, - NVME_RW_DSM_FREQ_ONCE = 6, - NVME_RW_DSM_FREQ_PREFETCH = 7, - NVME_RW_DSM_FREQ_TEMP = 8, - NVME_RW_DSM_LATENCY_NONE = 0 << 4, - NVME_RW_DSM_LATENCY_IDLE = 1 << 4, - NVME_RW_DSM_LATENCY_NORM = 2 << 4, - NVME_RW_DSM_LATENCY_LOW = 3 << 4, - NVME_RW_DSM_SEQ_REQ = 1 << 
6, - NVME_RW_DSM_COMPRESSED = 1 << 7, - NVME_RW_PRINFO_PRACT = 1 << 13, - NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, - NVME_RW_PRINFO_PRCHK_APP = 1 << 11, - NVME_RW_PRINFO_PRCHK_REF = 1 << 10, -}; - -typedef struct NvmeDsmCmd { - uint8_t opcode; - uint8_t flags; - uint16_t cid; - uint32_t nsid; - uint64_t rsvd2[2]; - uint64_t prp1; - uint64_t prp2; - uint32_t nr; - uint32_t attributes; - uint32_t rsvd12[4]; -} NvmeDsmCmd; - -enum { - NVME_DSMGMT_IDR = 1 << 0, - NVME_DSMGMT_IDW = 1 << 1, - NVME_DSMGMT_AD = 1 << 2, -}; - -typedef struct NvmeDsmRange { - uint32_t cattr; - uint32_t nlb; - uint64_t slba; -} NvmeDsmRange; - -enum NvmeAsyncEventRequest { - NVME_AER_TYPE_ERROR = 0, - NVME_AER_TYPE_SMART = 1, - NVME_AER_TYPE_IO_SPECIFIC = 6, - NVME_AER_TYPE_VENDOR_SPECIFIC = 7, - NVME_AER_INFO_ERR_INVALID_SQ = 0, - NVME_AER_INFO_ERR_INVALID_DB = 1, - NVME_AER_INFO_ERR_DIAG_FAIL = 2, - NVME_AER_INFO_ERR_PERS_INTERNAL_ERR = 3, - NVME_AER_INFO_ERR_TRANS_INTERNAL_ERR = 4, - NVME_AER_INFO_ERR_FW_IMG_LOAD_ERR = 5, - NVME_AER_INFO_SMART_RELIABILITY = 0, - NVME_AER_INFO_SMART_TEMP_THRESH = 1, - NVME_AER_INFO_SMART_SPARE_THRESH = 2, -}; - -typedef struct NvmeAerResult { - uint8_t event_type; - uint8_t event_info; - uint8_t log_page; - uint8_t resv; -} NvmeAerResult; - -typedef struct NvmeCqe { - uint32_t result; - uint32_t rsvd; - uint16_t sq_head; - uint16_t sq_id; - uint16_t cid; - uint16_t status; -} NvmeCqe; - -enum NvmeStatusCodes { - NVME_SUCCESS = 0x0000, - NVME_INVALID_OPCODE = 0x0001, - NVME_INVALID_FIELD = 0x0002, - NVME_CID_CONFLICT = 0x0003, - NVME_DATA_TRAS_ERROR = 0x0004, - NVME_POWER_LOSS_ABORT = 0x0005, - NVME_INTERNAL_DEV_ERROR = 0x0006, - NVME_CMD_ABORT_REQ = 0x0007, - NVME_CMD_ABORT_SQ_DEL = 0x0008, - NVME_CMD_ABORT_FAILED_FUSE = 0x0009, - NVME_CMD_ABORT_MISSING_FUSE = 0x000a, - NVME_INVALID_NSID = 0x000b, - NVME_CMD_SEQ_ERROR = 0x000c, - NVME_LBA_RANGE = 0x0080, - NVME_CAP_EXCEEDED = 0x0081, - NVME_NS_NOT_READY = 0x0082, - NVME_NS_RESV_CONFLICT = 0x0083, - NVME_INVALID_CQID = 0x0100, - NVME_INVALID_QID = 0x0101, - NVME_MAX_QSIZE_EXCEEDED = 0x0102, - NVME_ACL_EXCEEDED = 0x0103, - NVME_RESERVED = 0x0104, - NVME_AER_LIMIT_EXCEEDED = 0x0105, - NVME_INVALID_FW_SLOT = 0x0106, - NVME_INVALID_FW_IMAGE = 0x0107, - NVME_INVALID_IRQ_VECTOR = 0x0108, - NVME_INVALID_LOG_ID = 0x0109, - NVME_INVALID_FORMAT = 0x010a, - NVME_FW_REQ_RESET = 0x010b, - NVME_INVALID_QUEUE_DEL = 0x010c, - NVME_FID_NOT_SAVEABLE = 0x010d, - NVME_FID_NOT_NSID_SPEC = 0x010f, - NVME_FW_REQ_SUSYSTEM_RESET = 0x0110, - NVME_CONFLICTING_ATTRS = 0x0180, - NVME_INVALID_PROT_INFO = 0x0181, - NVME_WRITE_TO_RO = 0x0182, - NVME_WRITE_FAULT = 0x0280, - NVME_UNRECOVERED_READ = 0x0281, - NVME_E2E_GUARD_ERROR = 0x0282, - NVME_E2E_APP_ERROR = 0x0283, - NVME_E2E_REF_ERROR = 0x0284, - NVME_CMP_FAILURE = 0x0285, - NVME_ACCESS_DENIED = 0x0286, - NVME_MORE = 0x2000, - NVME_DNR = 0x4000, - NVME_NO_COMPLETE = 0xffff, -}; - -typedef struct NvmeFwSlotInfoLog { - uint8_t afi; - uint8_t reserved1[7]; - uint8_t frs1[8]; - uint8_t frs2[8]; - uint8_t frs3[8]; - uint8_t frs4[8]; - uint8_t frs5[8]; - uint8_t frs6[8]; - uint8_t frs7[8]; - uint8_t reserved2[448]; -} NvmeFwSlotInfoLog; - -typedef struct NvmeErrorLog { - uint64_t error_count; - uint16_t sqid; - uint16_t cid; - uint16_t status_field; - uint16_t param_error_location; - uint64_t lba; - uint32_t nsid; - uint8_t vs; - uint8_t resv[35]; -} NvmeErrorLog; - -typedef struct NvmeSmartLog { - uint8_t critical_warning; - uint8_t temperature[2]; - uint8_t available_spare; - uint8_t available_spare_threshold; - 
uint8_t percentage_used; - uint8_t reserved1[26]; - uint64_t data_units_read[2]; - uint64_t data_units_written[2]; - uint64_t host_read_commands[2]; - uint64_t host_write_commands[2]; - uint64_t controller_busy_time[2]; - uint64_t power_cycles[2]; - uint64_t power_on_hours[2]; - uint64_t unsafe_shutdowns[2]; - uint64_t media_errors[2]; - uint64_t number_of_error_log_entries[2]; - uint8_t reserved2[320]; -} NvmeSmartLog; - -enum NvmeSmartWarn { - NVME_SMART_SPARE = 1 << 0, - NVME_SMART_TEMPERATURE = 1 << 1, - NVME_SMART_RELIABILITY = 1 << 2, - NVME_SMART_MEDIA_READ_ONLY = 1 << 3, - NVME_SMART_FAILED_VOLATILE_MEDIA = 1 << 4, -}; - -enum LogIdentifier { - NVME_LOG_ERROR_INFO = 0x01, - NVME_LOG_SMART_INFO = 0x02, - NVME_LOG_FW_SLOT_INFO = 0x03, -}; - -typedef struct NvmePSD { - uint16_t mp; - uint16_t reserved; - uint32_t enlat; - uint32_t exlat; - uint8_t rrt; - uint8_t rrl; - uint8_t rwt; - uint8_t rwl; - uint8_t resv[16]; -} NvmePSD; - -typedef struct NvmeIdCtrl { - uint16_t vid; - uint16_t ssvid; - uint8_t sn[20]; - uint8_t mn[40]; - uint8_t fr[8]; - uint8_t rab; - uint8_t ieee[3]; - uint8_t cmic; - uint8_t mdts; - uint8_t rsvd255[178]; - uint16_t oacs; - uint8_t acl; - uint8_t aerl; - uint8_t frmw; - uint8_t lpa; - uint8_t elpe; - uint8_t npss; - uint8_t rsvd511[248]; - uint8_t sqes; - uint8_t cqes; - uint16_t rsvd515; - uint32_t nn; - uint16_t oncs; - uint16_t fuses; - uint8_t fna; - uint8_t vwc; - uint16_t awun; - uint16_t awupf; - uint8_t rsvd703[174]; - uint8_t rsvd2047[1344]; - NvmePSD psd[32]; - uint8_t vs[1024]; -} NvmeIdCtrl; - -enum NvmeIdCtrlOacs { - NVME_OACS_SECURITY = 1 << 0, - NVME_OACS_FORMAT = 1 << 1, - NVME_OACS_FW = 1 << 2, -}; - -enum NvmeIdCtrlOncs { - NVME_ONCS_COMPARE = 1 << 0, - NVME_ONCS_WRITE_UNCORR = 1 << 1, - NVME_ONCS_DSM = 1 << 2, - NVME_ONCS_WRITE_ZEROS = 1 << 3, - NVME_ONCS_FEATURES = 1 << 4, - NVME_ONCS_RESRVATIONS = 1 << 5, -}; - -#define NVME_CTRL_SQES_MIN(sqes) ((sqes) & 0xf) -#define NVME_CTRL_SQES_MAX(sqes) (((sqes) >> 4) & 0xf) -#define NVME_CTRL_CQES_MIN(cqes) ((cqes) & 0xf) -#define NVME_CTRL_CQES_MAX(cqes) (((cqes) >> 4) & 0xf) - -typedef struct NvmeFeatureVal { - uint32_t arbitration; - uint32_t power_mgmt; - uint32_t temp_thresh; - uint32_t err_rec; - uint32_t volatile_wc; - uint32_t num_queues; - uint32_t int_coalescing; - uint32_t *int_vector_config; - uint32_t write_atomicity; - uint32_t async_config; - uint32_t sw_prog_marker; -} NvmeFeatureVal; - -#define NVME_ARB_AB(arb) (arb & 0x7) -#define NVME_ARB_LPW(arb) ((arb >> 8) & 0xff) -#define NVME_ARB_MPW(arb) ((arb >> 16) & 0xff) -#define NVME_ARB_HPW(arb) ((arb >> 24) & 0xff) - -#define NVME_INTC_THR(intc) (intc & 0xff) -#define NVME_INTC_TIME(intc) ((intc >> 8) & 0xff) - -enum NvmeFeatureIds { - NVME_ARBITRATION = 0x1, - NVME_POWER_MANAGEMENT = 0x2, - NVME_LBA_RANGE_TYPE = 0x3, - NVME_TEMPERATURE_THRESHOLD = 0x4, - NVME_ERROR_RECOVERY = 0x5, - NVME_VOLATILE_WRITE_CACHE = 0x6, - NVME_NUMBER_OF_QUEUES = 0x7, - NVME_INTERRUPT_COALESCING = 0x8, - NVME_INTERRUPT_VECTOR_CONF = 0x9, - NVME_WRITE_ATOMICITY = 0xa, - NVME_ASYNCHRONOUS_EVENT_CONF = 0xb, - NVME_SOFTWARE_PROGRESS_MARKER = 0x80 -}; - -typedef struct NvmeRangeType { - uint8_t type; - uint8_t attributes; - uint8_t rsvd2[14]; - uint64_t slba; - uint64_t nlb; - uint8_t guid[16]; - uint8_t rsvd48[16]; -} NvmeRangeType; - -typedef struct NvmeLBAF { - uint16_t ms; - uint8_t ds; - uint8_t rp; -} NvmeLBAF; - -typedef struct NvmeIdNs { - uint64_t nsze; - uint64_t ncap; - uint64_t nuse; - uint8_t nsfeat; - uint8_t nlbaf; - uint8_t flbas; - uint8_t 
mc; - uint8_t dpc; - uint8_t dps; - uint8_t res30[98]; - NvmeLBAF lbaf[16]; - uint8_t res192[192]; - uint8_t vs[3712]; -} NvmeIdNs; - -#define NVME_ID_NS_NSFEAT_THIN(nsfeat) ((nsfeat & 0x1)) -#define NVME_ID_NS_FLBAS_EXTENDED(flbas) ((flbas >> 4) & 0x1) -#define NVME_ID_NS_FLBAS_INDEX(flbas) ((flbas & 0xf)) -#define NVME_ID_NS_MC_SEPARATE(mc) ((mc >> 1) & 0x1) -#define NVME_ID_NS_MC_EXTENDED(mc) ((mc & 0x1)) -#define NVME_ID_NS_DPC_LAST_EIGHT(dpc) ((dpc >> 4) & 0x1) -#define NVME_ID_NS_DPC_FIRST_EIGHT(dpc) ((dpc >> 3) & 0x1) -#define NVME_ID_NS_DPC_TYPE_3(dpc) ((dpc >> 2) & 0x1) -#define NVME_ID_NS_DPC_TYPE_2(dpc) ((dpc >> 1) & 0x1) -#define NVME_ID_NS_DPC_TYPE_1(dpc) ((dpc & 0x1)) -#define NVME_ID_NS_DPC_TYPE_MASK 0x7 - -enum NvmeIdNsDps { - DPS_TYPE_NONE = 0, - DPS_TYPE_1 = 1, - DPS_TYPE_2 = 2, - DPS_TYPE_3 = 3, - DPS_TYPE_MASK = 0x7, - DPS_FIRST_EIGHT = 8, -}; - -static inline void _nvme_check_size(void) -{ - QEMU_BUILD_BUG_ON(sizeof(NvmeAerResult) != 4); - QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16); - QEMU_BUILD_BUG_ON(sizeof(NvmeDsmRange) != 16); - QEMU_BUILD_BUG_ON(sizeof(NvmeCmd) != 64); - QEMU_BUILD_BUG_ON(sizeof(NvmeDeleteQ) != 64); - QEMU_BUILD_BUG_ON(sizeof(NvmeCreateCq) != 64); - QEMU_BUILD_BUG_ON(sizeof(NvmeCreateSq) != 64); - QEMU_BUILD_BUG_ON(sizeof(NvmeIdentify) != 64); - QEMU_BUILD_BUG_ON(sizeof(NvmeRwCmd) != 64); - QEMU_BUILD_BUG_ON(sizeof(NvmeDsmCmd) != 64); - QEMU_BUILD_BUG_ON(sizeof(NvmeRangeType) != 64); - QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64); - QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512); - QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512); - QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096); - QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096); -} +#include "block/nvme.h" typedef struct NvmeAsyncEvent { QSIMPLEQ_ENTRY(NvmeAsyncEvent) entry; diff --git a/include/block/block.h b/include/block/block.h index 9b12774ddf..2025d7ed19 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -631,5 +631,14 @@ void bdrv_del_child(BlockDriverState *parent, BdrvChild *child, Error **errp); bool bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name, uint32_t granularity, Error **errp); - +/** + * + * bdrv_register_buf/bdrv_unregister_buf: + * + * Register/unregister a buffer for I/O. For example, VFIO drivers are + * interested to know the memory areas that would later be used for I/O, so + * that they can prepare IOMMU mapping etc., to get better performance. + */ +void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size); +void bdrv_unregister_buf(BlockDriverState *bs, void *host); #endif diff --git a/include/block/block_int.h b/include/block/block_int.h index 29cafa4236..99b9190627 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -446,6 +446,15 @@ struct BlockDriver { const char *name, Error **errp); + /** + * Register/unregister a buffer for I/O. For example, when the driver is + * interested to know the memory areas that will later be used in iovs, so + * that it can do IOMMU mapping with VFIO etc., in order to get better + * performance. In the case of VFIO drivers, this callback is used to do + * DMA mapping for hot buffers. 
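+     *
+     * Registration is an optimization hint rather than a hard requirement:
+     * drivers must still handle I/O to buffers that were never registered,
+     * e.g. by falling back to a temporary DMA mapping for the duration of
+     * the request (the VFIO helpers provide such temporary mappings).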
+ */ + void (*bdrv_register_buf)(BlockDriverState *bs, void *host, size_t size); + void (*bdrv_unregister_buf)(BlockDriverState *bs, void *host); QLIST_ENTRY(BlockDriver) list; }; diff --git a/include/block/nvme.h b/include/block/nvme.h new file mode 100644 index 0000000000..849a6f3fa3 --- /dev/null +++ b/include/block/nvme.h @@ -0,0 +1,700 @@ +#ifndef BLOCK_NVME_H +#define BLOCK_NVME_H + +typedef struct NvmeBar { + uint64_t cap; + uint32_t vs; + uint32_t intms; + uint32_t intmc; + uint32_t cc; + uint32_t rsvd1; + uint32_t csts; + uint32_t nssrc; + uint32_t aqa; + uint64_t asq; + uint64_t acq; + uint32_t cmbloc; + uint32_t cmbsz; +} NvmeBar; + +enum NvmeCapShift { + CAP_MQES_SHIFT = 0, + CAP_CQR_SHIFT = 16, + CAP_AMS_SHIFT = 17, + CAP_TO_SHIFT = 24, + CAP_DSTRD_SHIFT = 32, + CAP_NSSRS_SHIFT = 33, + CAP_CSS_SHIFT = 37, + CAP_MPSMIN_SHIFT = 48, + CAP_MPSMAX_SHIFT = 52, +}; + +enum NvmeCapMask { + CAP_MQES_MASK = 0xffff, + CAP_CQR_MASK = 0x1, + CAP_AMS_MASK = 0x3, + CAP_TO_MASK = 0xff, + CAP_DSTRD_MASK = 0xf, + CAP_NSSRS_MASK = 0x1, + CAP_CSS_MASK = 0xff, + CAP_MPSMIN_MASK = 0xf, + CAP_MPSMAX_MASK = 0xf, +}; + +#define NVME_CAP_MQES(cap) (((cap) >> CAP_MQES_SHIFT) & CAP_MQES_MASK) +#define NVME_CAP_CQR(cap) (((cap) >> CAP_CQR_SHIFT) & CAP_CQR_MASK) +#define NVME_CAP_AMS(cap) (((cap) >> CAP_AMS_SHIFT) & CAP_AMS_MASK) +#define NVME_CAP_TO(cap) (((cap) >> CAP_TO_SHIFT) & CAP_TO_MASK) +#define NVME_CAP_DSTRD(cap) (((cap) >> CAP_DSTRD_SHIFT) & CAP_DSTRD_MASK) +#define NVME_CAP_NSSRS(cap) (((cap) >> CAP_NSSRS_SHIFT) & CAP_NSSRS_MASK) +#define NVME_CAP_CSS(cap) (((cap) >> CAP_CSS_SHIFT) & CAP_CSS_MASK) +#define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK) +#define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK) + +#define NVME_CAP_SET_MQES(cap, val) (cap |= (uint64_t)(val & CAP_MQES_MASK) \ + << CAP_MQES_SHIFT) +#define NVME_CAP_SET_CQR(cap, val) (cap |= (uint64_t)(val & CAP_CQR_MASK) \ + << CAP_CQR_SHIFT) +#define NVME_CAP_SET_AMS(cap, val) (cap |= (uint64_t)(val & CAP_AMS_MASK) \ + << CAP_AMS_SHIFT) +#define NVME_CAP_SET_TO(cap, val) (cap |= (uint64_t)(val & CAP_TO_MASK) \ + << CAP_TO_SHIFT) +#define NVME_CAP_SET_DSTRD(cap, val) (cap |= (uint64_t)(val & CAP_DSTRD_MASK) \ + << CAP_DSTRD_SHIFT) +#define NVME_CAP_SET_NSSRS(cap, val) (cap |= (uint64_t)(val & CAP_NSSRS_MASK) \ + << CAP_NSSRS_SHIFT) +#define NVME_CAP_SET_CSS(cap, val) (cap |= (uint64_t)(val & CAP_CSS_MASK) \ + << CAP_CSS_SHIFT) +#define NVME_CAP_SET_MPSMIN(cap, val) (cap |= (uint64_t)(val & CAP_MPSMIN_MASK)\ + << CAP_MPSMIN_SHIFT) +#define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\ + << CAP_MPSMAX_SHIFT) + +enum NvmeCcShift { + CC_EN_SHIFT = 0, + CC_CSS_SHIFT = 4, + CC_MPS_SHIFT = 7, + CC_AMS_SHIFT = 11, + CC_SHN_SHIFT = 14, + CC_IOSQES_SHIFT = 16, + CC_IOCQES_SHIFT = 20, +}; + +enum NvmeCcMask { + CC_EN_MASK = 0x1, + CC_CSS_MASK = 0x7, + CC_MPS_MASK = 0xf, + CC_AMS_MASK = 0x7, + CC_SHN_MASK = 0x3, + CC_IOSQES_MASK = 0xf, + CC_IOCQES_MASK = 0xf, +}; + +#define NVME_CC_EN(cc) ((cc >> CC_EN_SHIFT) & CC_EN_MASK) +#define NVME_CC_CSS(cc) ((cc >> CC_CSS_SHIFT) & CC_CSS_MASK) +#define NVME_CC_MPS(cc) ((cc >> CC_MPS_SHIFT) & CC_MPS_MASK) +#define NVME_CC_AMS(cc) ((cc >> CC_AMS_SHIFT) & CC_AMS_MASK) +#define NVME_CC_SHN(cc) ((cc >> CC_SHN_SHIFT) & CC_SHN_MASK) +#define NVME_CC_IOSQES(cc) ((cc >> CC_IOSQES_SHIFT) & CC_IOSQES_MASK) +#define NVME_CC_IOCQES(cc) ((cc >> CC_IOCQES_SHIFT) & CC_IOCQES_MASK) + +enum NvmeCstsShift { + CSTS_RDY_SHIFT = 0, + CSTS_CFS_SHIFT = 1, + 
CSTS_SHST_SHIFT = 2, + CSTS_NSSRO_SHIFT = 4, +}; + +enum NvmeCstsMask { + CSTS_RDY_MASK = 0x1, + CSTS_CFS_MASK = 0x1, + CSTS_SHST_MASK = 0x3, + CSTS_NSSRO_MASK = 0x1, +}; + +enum NvmeCsts { + NVME_CSTS_READY = 1 << CSTS_RDY_SHIFT, + NVME_CSTS_FAILED = 1 << CSTS_CFS_SHIFT, + NVME_CSTS_SHST_NORMAL = 0 << CSTS_SHST_SHIFT, + NVME_CSTS_SHST_PROGRESS = 1 << CSTS_SHST_SHIFT, + NVME_CSTS_SHST_COMPLETE = 2 << CSTS_SHST_SHIFT, + NVME_CSTS_NSSRO = 1 << CSTS_NSSRO_SHIFT, +}; + +#define NVME_CSTS_RDY(csts) ((csts >> CSTS_RDY_SHIFT) & CSTS_RDY_MASK) +#define NVME_CSTS_CFS(csts) ((csts >> CSTS_CFS_SHIFT) & CSTS_CFS_MASK) +#define NVME_CSTS_SHST(csts) ((csts >> CSTS_SHST_SHIFT) & CSTS_SHST_MASK) +#define NVME_CSTS_NSSRO(csts) ((csts >> CSTS_NSSRO_SHIFT) & CSTS_NSSRO_MASK) + +enum NvmeAqaShift { + AQA_ASQS_SHIFT = 0, + AQA_ACQS_SHIFT = 16, +}; + +enum NvmeAqaMask { + AQA_ASQS_MASK = 0xfff, + AQA_ACQS_MASK = 0xfff, +}; + +#define NVME_AQA_ASQS(aqa) ((aqa >> AQA_ASQS_SHIFT) & AQA_ASQS_MASK) +#define NVME_AQA_ACQS(aqa) ((aqa >> AQA_ACQS_SHIFT) & AQA_ACQS_MASK) + +enum NvmeCmblocShift { + CMBLOC_BIR_SHIFT = 0, + CMBLOC_OFST_SHIFT = 12, +}; + +enum NvmeCmblocMask { + CMBLOC_BIR_MASK = 0x7, + CMBLOC_OFST_MASK = 0xfffff, +}; + +#define NVME_CMBLOC_BIR(cmbloc) ((cmbloc >> CMBLOC_BIR_SHIFT) & \ + CMBLOC_BIR_MASK) +#define NVME_CMBLOC_OFST(cmbloc)((cmbloc >> CMBLOC_OFST_SHIFT) & \ + CMBLOC_OFST_MASK) + +#define NVME_CMBLOC_SET_BIR(cmbloc, val) \ + (cmbloc |= (uint64_t)(val & CMBLOC_BIR_MASK) << CMBLOC_BIR_SHIFT) +#define NVME_CMBLOC_SET_OFST(cmbloc, val) \ + (cmbloc |= (uint64_t)(val & CMBLOC_OFST_MASK) << CMBLOC_OFST_SHIFT) + +enum NvmeCmbszShift { + CMBSZ_SQS_SHIFT = 0, + CMBSZ_CQS_SHIFT = 1, + CMBSZ_LISTS_SHIFT = 2, + CMBSZ_RDS_SHIFT = 3, + CMBSZ_WDS_SHIFT = 4, + CMBSZ_SZU_SHIFT = 8, + CMBSZ_SZ_SHIFT = 12, +}; + +enum NvmeCmbszMask { + CMBSZ_SQS_MASK = 0x1, + CMBSZ_CQS_MASK = 0x1, + CMBSZ_LISTS_MASK = 0x1, + CMBSZ_RDS_MASK = 0x1, + CMBSZ_WDS_MASK = 0x1, + CMBSZ_SZU_MASK = 0xf, + CMBSZ_SZ_MASK = 0xfffff, +}; + +#define NVME_CMBSZ_SQS(cmbsz) ((cmbsz >> CMBSZ_SQS_SHIFT) & CMBSZ_SQS_MASK) +#define NVME_CMBSZ_CQS(cmbsz) ((cmbsz >> CMBSZ_CQS_SHIFT) & CMBSZ_CQS_MASK) +#define NVME_CMBSZ_LISTS(cmbsz)((cmbsz >> CMBSZ_LISTS_SHIFT) & CMBSZ_LISTS_MASK) +#define NVME_CMBSZ_RDS(cmbsz) ((cmbsz >> CMBSZ_RDS_SHIFT) & CMBSZ_RDS_MASK) +#define NVME_CMBSZ_WDS(cmbsz) ((cmbsz >> CMBSZ_WDS_SHIFT) & CMBSZ_WDS_MASK) +#define NVME_CMBSZ_SZU(cmbsz) ((cmbsz >> CMBSZ_SZU_SHIFT) & CMBSZ_SZU_MASK) +#define NVME_CMBSZ_SZ(cmbsz) ((cmbsz >> CMBSZ_SZ_SHIFT) & CMBSZ_SZ_MASK) + +#define NVME_CMBSZ_SET_SQS(cmbsz, val) \ + (cmbsz |= (uint64_t)(val & CMBSZ_SQS_MASK) << CMBSZ_SQS_SHIFT) +#define NVME_CMBSZ_SET_CQS(cmbsz, val) \ + (cmbsz |= (uint64_t)(val & CMBSZ_CQS_MASK) << CMBSZ_CQS_SHIFT) +#define NVME_CMBSZ_SET_LISTS(cmbsz, val) \ + (cmbsz |= (uint64_t)(val & CMBSZ_LISTS_MASK) << CMBSZ_LISTS_SHIFT) +#define NVME_CMBSZ_SET_RDS(cmbsz, val) \ + (cmbsz |= (uint64_t)(val & CMBSZ_RDS_MASK) << CMBSZ_RDS_SHIFT) +#define NVME_CMBSZ_SET_WDS(cmbsz, val) \ + (cmbsz |= (uint64_t)(val & CMBSZ_WDS_MASK) << CMBSZ_WDS_SHIFT) +#define NVME_CMBSZ_SET_SZU(cmbsz, val) \ + (cmbsz |= (uint64_t)(val & CMBSZ_SZU_MASK) << CMBSZ_SZU_SHIFT) +#define NVME_CMBSZ_SET_SZ(cmbsz, val) \ + (cmbsz |= (uint64_t)(val & CMBSZ_SZ_MASK) << CMBSZ_SZ_SHIFT) + +#define NVME_CMBSZ_GETSIZE(cmbsz) \ + (NVME_CMBSZ_SZ(cmbsz) * (1 << (12 + 4 * NVME_CMBSZ_SZU(cmbsz)))) + +typedef struct NvmeCmd { + uint8_t opcode; + uint8_t fuse; + uint16_t cid; + uint32_t nsid; + uint64_t res1; + uint64_t mptr; + 
uint64_t prp1; + uint64_t prp2; + uint32_t cdw10; + uint32_t cdw11; + uint32_t cdw12; + uint32_t cdw13; + uint32_t cdw14; + uint32_t cdw15; +} NvmeCmd; + +enum NvmeAdminCommands { + NVME_ADM_CMD_DELETE_SQ = 0x00, + NVME_ADM_CMD_CREATE_SQ = 0x01, + NVME_ADM_CMD_GET_LOG_PAGE = 0x02, + NVME_ADM_CMD_DELETE_CQ = 0x04, + NVME_ADM_CMD_CREATE_CQ = 0x05, + NVME_ADM_CMD_IDENTIFY = 0x06, + NVME_ADM_CMD_ABORT = 0x08, + NVME_ADM_CMD_SET_FEATURES = 0x09, + NVME_ADM_CMD_GET_FEATURES = 0x0a, + NVME_ADM_CMD_ASYNC_EV_REQ = 0x0c, + NVME_ADM_CMD_ACTIVATE_FW = 0x10, + NVME_ADM_CMD_DOWNLOAD_FW = 0x11, + NVME_ADM_CMD_FORMAT_NVM = 0x80, + NVME_ADM_CMD_SECURITY_SEND = 0x81, + NVME_ADM_CMD_SECURITY_RECV = 0x82, +}; + +enum NvmeIoCommands { + NVME_CMD_FLUSH = 0x00, + NVME_CMD_WRITE = 0x01, + NVME_CMD_READ = 0x02, + NVME_CMD_WRITE_UNCOR = 0x04, + NVME_CMD_COMPARE = 0x05, + NVME_CMD_WRITE_ZEROS = 0x08, + NVME_CMD_DSM = 0x09, +}; + +typedef struct NvmeDeleteQ { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t rsvd1[9]; + uint16_t qid; + uint16_t rsvd10; + uint32_t rsvd11[5]; +} NvmeDeleteQ; + +typedef struct NvmeCreateCq { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t rsvd1[5]; + uint64_t prp1; + uint64_t rsvd8; + uint16_t cqid; + uint16_t qsize; + uint16_t cq_flags; + uint16_t irq_vector; + uint32_t rsvd12[4]; +} NvmeCreateCq; + +#define NVME_CQ_FLAGS_PC(cq_flags) (cq_flags & 0x1) +#define NVME_CQ_FLAGS_IEN(cq_flags) ((cq_flags >> 1) & 0x1) + +typedef struct NvmeCreateSq { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t rsvd1[5]; + uint64_t prp1; + uint64_t rsvd8; + uint16_t sqid; + uint16_t qsize; + uint16_t sq_flags; + uint16_t cqid; + uint32_t rsvd12[4]; +} NvmeCreateSq; + +#define NVME_SQ_FLAGS_PC(sq_flags) (sq_flags & 0x1) +#define NVME_SQ_FLAGS_QPRIO(sq_flags) ((sq_flags >> 1) & 0x3) + +enum NvmeQueueFlags { + NVME_Q_PC = 1, + NVME_Q_PRIO_URGENT = 0, + NVME_Q_PRIO_HIGH = 1, + NVME_Q_PRIO_NORMAL = 2, + NVME_Q_PRIO_LOW = 3, +}; + +typedef struct NvmeIdentify { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t nsid; + uint64_t rsvd2[2]; + uint64_t prp1; + uint64_t prp2; + uint32_t cns; + uint32_t rsvd11[5]; +} NvmeIdentify; + +typedef struct NvmeRwCmd { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t nsid; + uint64_t rsvd2; + uint64_t mptr; + uint64_t prp1; + uint64_t prp2; + uint64_t slba; + uint16_t nlb; + uint16_t control; + uint32_t dsmgmt; + uint32_t reftag; + uint16_t apptag; + uint16_t appmask; +} NvmeRwCmd; + +enum { + NVME_RW_LR = 1 << 15, + NVME_RW_FUA = 1 << 14, + NVME_RW_DSM_FREQ_UNSPEC = 0, + NVME_RW_DSM_FREQ_TYPICAL = 1, + NVME_RW_DSM_FREQ_RARE = 2, + NVME_RW_DSM_FREQ_READS = 3, + NVME_RW_DSM_FREQ_WRITES = 4, + NVME_RW_DSM_FREQ_RW = 5, + NVME_RW_DSM_FREQ_ONCE = 6, + NVME_RW_DSM_FREQ_PREFETCH = 7, + NVME_RW_DSM_FREQ_TEMP = 8, + NVME_RW_DSM_LATENCY_NONE = 0 << 4, + NVME_RW_DSM_LATENCY_IDLE = 1 << 4, + NVME_RW_DSM_LATENCY_NORM = 2 << 4, + NVME_RW_DSM_LATENCY_LOW = 3 << 4, + NVME_RW_DSM_SEQ_REQ = 1 << 6, + NVME_RW_DSM_COMPRESSED = 1 << 7, + NVME_RW_PRINFO_PRACT = 1 << 13, + NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, + NVME_RW_PRINFO_PRCHK_APP = 1 << 11, + NVME_RW_PRINFO_PRCHK_REF = 1 << 10, +}; + +typedef struct NvmeDsmCmd { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t nsid; + uint64_t rsvd2[2]; + uint64_t prp1; + uint64_t prp2; + uint32_t nr; + uint32_t attributes; + uint32_t rsvd12[4]; +} NvmeDsmCmd; + +enum { + NVME_DSMGMT_IDR = 1 << 0, + NVME_DSMGMT_IDW = 1 << 1, + NVME_DSMGMT_AD = 1 << 2, +}; + +typedef struct 
NvmeDsmRange { + uint32_t cattr; + uint32_t nlb; + uint64_t slba; +} NvmeDsmRange; + +enum NvmeAsyncEventRequest { + NVME_AER_TYPE_ERROR = 0, + NVME_AER_TYPE_SMART = 1, + NVME_AER_TYPE_IO_SPECIFIC = 6, + NVME_AER_TYPE_VENDOR_SPECIFIC = 7, + NVME_AER_INFO_ERR_INVALID_SQ = 0, + NVME_AER_INFO_ERR_INVALID_DB = 1, + NVME_AER_INFO_ERR_DIAG_FAIL = 2, + NVME_AER_INFO_ERR_PERS_INTERNAL_ERR = 3, + NVME_AER_INFO_ERR_TRANS_INTERNAL_ERR = 4, + NVME_AER_INFO_ERR_FW_IMG_LOAD_ERR = 5, + NVME_AER_INFO_SMART_RELIABILITY = 0, + NVME_AER_INFO_SMART_TEMP_THRESH = 1, + NVME_AER_INFO_SMART_SPARE_THRESH = 2, +}; + +typedef struct NvmeAerResult { + uint8_t event_type; + uint8_t event_info; + uint8_t log_page; + uint8_t resv; +} NvmeAerResult; + +typedef struct NvmeCqe { + uint32_t result; + uint32_t rsvd; + uint16_t sq_head; + uint16_t sq_id; + uint16_t cid; + uint16_t status; +} NvmeCqe; + +enum NvmeStatusCodes { + NVME_SUCCESS = 0x0000, + NVME_INVALID_OPCODE = 0x0001, + NVME_INVALID_FIELD = 0x0002, + NVME_CID_CONFLICT = 0x0003, + NVME_DATA_TRAS_ERROR = 0x0004, + NVME_POWER_LOSS_ABORT = 0x0005, + NVME_INTERNAL_DEV_ERROR = 0x0006, + NVME_CMD_ABORT_REQ = 0x0007, + NVME_CMD_ABORT_SQ_DEL = 0x0008, + NVME_CMD_ABORT_FAILED_FUSE = 0x0009, + NVME_CMD_ABORT_MISSING_FUSE = 0x000a, + NVME_INVALID_NSID = 0x000b, + NVME_CMD_SEQ_ERROR = 0x000c, + NVME_LBA_RANGE = 0x0080, + NVME_CAP_EXCEEDED = 0x0081, + NVME_NS_NOT_READY = 0x0082, + NVME_NS_RESV_CONFLICT = 0x0083, + NVME_INVALID_CQID = 0x0100, + NVME_INVALID_QID = 0x0101, + NVME_MAX_QSIZE_EXCEEDED = 0x0102, + NVME_ACL_EXCEEDED = 0x0103, + NVME_RESERVED = 0x0104, + NVME_AER_LIMIT_EXCEEDED = 0x0105, + NVME_INVALID_FW_SLOT = 0x0106, + NVME_INVALID_FW_IMAGE = 0x0107, + NVME_INVALID_IRQ_VECTOR = 0x0108, + NVME_INVALID_LOG_ID = 0x0109, + NVME_INVALID_FORMAT = 0x010a, + NVME_FW_REQ_RESET = 0x010b, + NVME_INVALID_QUEUE_DEL = 0x010c, + NVME_FID_NOT_SAVEABLE = 0x010d, + NVME_FID_NOT_NSID_SPEC = 0x010f, + NVME_FW_REQ_SUSYSTEM_RESET = 0x0110, + NVME_CONFLICTING_ATTRS = 0x0180, + NVME_INVALID_PROT_INFO = 0x0181, + NVME_WRITE_TO_RO = 0x0182, + NVME_WRITE_FAULT = 0x0280, + NVME_UNRECOVERED_READ = 0x0281, + NVME_E2E_GUARD_ERROR = 0x0282, + NVME_E2E_APP_ERROR = 0x0283, + NVME_E2E_REF_ERROR = 0x0284, + NVME_CMP_FAILURE = 0x0285, + NVME_ACCESS_DENIED = 0x0286, + NVME_MORE = 0x2000, + NVME_DNR = 0x4000, + NVME_NO_COMPLETE = 0xffff, +}; + +typedef struct NvmeFwSlotInfoLog { + uint8_t afi; + uint8_t reserved1[7]; + uint8_t frs1[8]; + uint8_t frs2[8]; + uint8_t frs3[8]; + uint8_t frs4[8]; + uint8_t frs5[8]; + uint8_t frs6[8]; + uint8_t frs7[8]; + uint8_t reserved2[448]; +} NvmeFwSlotInfoLog; + +typedef struct NvmeErrorLog { + uint64_t error_count; + uint16_t sqid; + uint16_t cid; + uint16_t status_field; + uint16_t param_error_location; + uint64_t lba; + uint32_t nsid; + uint8_t vs; + uint8_t resv[35]; +} NvmeErrorLog; + +typedef struct NvmeSmartLog { + uint8_t critical_warning; + uint8_t temperature[2]; + uint8_t available_spare; + uint8_t available_spare_threshold; + uint8_t percentage_used; + uint8_t reserved1[26]; + uint64_t data_units_read[2]; + uint64_t data_units_written[2]; + uint64_t host_read_commands[2]; + uint64_t host_write_commands[2]; + uint64_t controller_busy_time[2]; + uint64_t power_cycles[2]; + uint64_t power_on_hours[2]; + uint64_t unsafe_shutdowns[2]; + uint64_t media_errors[2]; + uint64_t number_of_error_log_entries[2]; + uint8_t reserved2[320]; +} NvmeSmartLog; + +enum NvmeSmartWarn { + NVME_SMART_SPARE = 1 << 0, + NVME_SMART_TEMPERATURE = 1 << 1, + NVME_SMART_RELIABILITY = 1 
<< 2, + NVME_SMART_MEDIA_READ_ONLY = 1 << 3, + NVME_SMART_FAILED_VOLATILE_MEDIA = 1 << 4, +}; + +enum LogIdentifier { + NVME_LOG_ERROR_INFO = 0x01, + NVME_LOG_SMART_INFO = 0x02, + NVME_LOG_FW_SLOT_INFO = 0x03, +}; + +typedef struct NvmePSD { + uint16_t mp; + uint16_t reserved; + uint32_t enlat; + uint32_t exlat; + uint8_t rrt; + uint8_t rrl; + uint8_t rwt; + uint8_t rwl; + uint8_t resv[16]; +} NvmePSD; + +typedef struct NvmeIdCtrl { + uint16_t vid; + uint16_t ssvid; + uint8_t sn[20]; + uint8_t mn[40]; + uint8_t fr[8]; + uint8_t rab; + uint8_t ieee[3]; + uint8_t cmic; + uint8_t mdts; + uint8_t rsvd255[178]; + uint16_t oacs; + uint8_t acl; + uint8_t aerl; + uint8_t frmw; + uint8_t lpa; + uint8_t elpe; + uint8_t npss; + uint8_t rsvd511[248]; + uint8_t sqes; + uint8_t cqes; + uint16_t rsvd515; + uint32_t nn; + uint16_t oncs; + uint16_t fuses; + uint8_t fna; + uint8_t vwc; + uint16_t awun; + uint16_t awupf; + uint8_t rsvd703[174]; + uint8_t rsvd2047[1344]; + NvmePSD psd[32]; + uint8_t vs[1024]; +} NvmeIdCtrl; + +enum NvmeIdCtrlOacs { + NVME_OACS_SECURITY = 1 << 0, + NVME_OACS_FORMAT = 1 << 1, + NVME_OACS_FW = 1 << 2, +}; + +enum NvmeIdCtrlOncs { + NVME_ONCS_COMPARE = 1 << 0, + NVME_ONCS_WRITE_UNCORR = 1 << 1, + NVME_ONCS_DSM = 1 << 2, + NVME_ONCS_WRITE_ZEROS = 1 << 3, + NVME_ONCS_FEATURES = 1 << 4, + NVME_ONCS_RESRVATIONS = 1 << 5, +}; + +#define NVME_CTRL_SQES_MIN(sqes) ((sqes) & 0xf) +#define NVME_CTRL_SQES_MAX(sqes) (((sqes) >> 4) & 0xf) +#define NVME_CTRL_CQES_MIN(cqes) ((cqes) & 0xf) +#define NVME_CTRL_CQES_MAX(cqes) (((cqes) >> 4) & 0xf) + +typedef struct NvmeFeatureVal { + uint32_t arbitration; + uint32_t power_mgmt; + uint32_t temp_thresh; + uint32_t err_rec; + uint32_t volatile_wc; + uint32_t num_queues; + uint32_t int_coalescing; + uint32_t *int_vector_config; + uint32_t write_atomicity; + uint32_t async_config; + uint32_t sw_prog_marker; +} NvmeFeatureVal; + +#define NVME_ARB_AB(arb) (arb & 0x7) +#define NVME_ARB_LPW(arb) ((arb >> 8) & 0xff) +#define NVME_ARB_MPW(arb) ((arb >> 16) & 0xff) +#define NVME_ARB_HPW(arb) ((arb >> 24) & 0xff) + +#define NVME_INTC_THR(intc) (intc & 0xff) +#define NVME_INTC_TIME(intc) ((intc >> 8) & 0xff) + +enum NvmeFeatureIds { + NVME_ARBITRATION = 0x1, + NVME_POWER_MANAGEMENT = 0x2, + NVME_LBA_RANGE_TYPE = 0x3, + NVME_TEMPERATURE_THRESHOLD = 0x4, + NVME_ERROR_RECOVERY = 0x5, + NVME_VOLATILE_WRITE_CACHE = 0x6, + NVME_NUMBER_OF_QUEUES = 0x7, + NVME_INTERRUPT_COALESCING = 0x8, + NVME_INTERRUPT_VECTOR_CONF = 0x9, + NVME_WRITE_ATOMICITY = 0xa, + NVME_ASYNCHRONOUS_EVENT_CONF = 0xb, + NVME_SOFTWARE_PROGRESS_MARKER = 0x80 +}; + +typedef struct NvmeRangeType { + uint8_t type; + uint8_t attributes; + uint8_t rsvd2[14]; + uint64_t slba; + uint64_t nlb; + uint8_t guid[16]; + uint8_t rsvd48[16]; +} NvmeRangeType; + +typedef struct NvmeLBAF { + uint16_t ms; + uint8_t ds; + uint8_t rp; +} NvmeLBAF; + +typedef struct NvmeIdNs { + uint64_t nsze; + uint64_t ncap; + uint64_t nuse; + uint8_t nsfeat; + uint8_t nlbaf; + uint8_t flbas; + uint8_t mc; + uint8_t dpc; + uint8_t dps; + uint8_t res30[98]; + NvmeLBAF lbaf[16]; + uint8_t res192[192]; + uint8_t vs[3712]; +} NvmeIdNs; + +#define NVME_ID_NS_NSFEAT_THIN(nsfeat) ((nsfeat & 0x1)) +#define NVME_ID_NS_FLBAS_EXTENDED(flbas) ((flbas >> 4) & 0x1) +#define NVME_ID_NS_FLBAS_INDEX(flbas) ((flbas & 0xf)) +#define NVME_ID_NS_MC_SEPARATE(mc) ((mc >> 1) & 0x1) +#define NVME_ID_NS_MC_EXTENDED(mc) ((mc & 0x1)) +#define NVME_ID_NS_DPC_LAST_EIGHT(dpc) ((dpc >> 4) & 0x1) +#define NVME_ID_NS_DPC_FIRST_EIGHT(dpc) ((dpc >> 3) & 0x1) +#define 
NVME_ID_NS_DPC_TYPE_3(dpc) ((dpc >> 2) & 0x1) +#define NVME_ID_NS_DPC_TYPE_2(dpc) ((dpc >> 1) & 0x1) +#define NVME_ID_NS_DPC_TYPE_1(dpc) ((dpc & 0x1)) +#define NVME_ID_NS_DPC_TYPE_MASK 0x7 + +enum NvmeIdNsDps { + DPS_TYPE_NONE = 0, + DPS_TYPE_1 = 1, + DPS_TYPE_2 = 2, + DPS_TYPE_3 = 3, + DPS_TYPE_MASK = 0x7, + DPS_FIRST_EIGHT = 8, +}; + +static inline void _nvme_check_size(void) +{ + QEMU_BUILD_BUG_ON(sizeof(NvmeAerResult) != 4); + QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16); + QEMU_BUILD_BUG_ON(sizeof(NvmeDsmRange) != 16); + QEMU_BUILD_BUG_ON(sizeof(NvmeCmd) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeDeleteQ) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeCreateCq) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeCreateSq) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdentify) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeRwCmd) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeDsmCmd) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeRangeType) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512); + QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096); +} +#endif diff --git a/include/qemu/compiler.h b/include/qemu/compiler.h index 5fcc4f7ec7..2cbe6a4f16 100644 --- a/include/qemu/compiler.h +++ b/include/qemu/compiler.h @@ -114,5 +114,44 @@ #ifndef __has_feature #define __has_feature(x) 0 /* compatibility with non-clang compilers */ #endif +/* Implement C11 _Generic via GCC builtins. Example: + * + * QEMU_GENERIC(x, (float, sinf), (long double, sinl), sin) (x) + * + * The first argument is the discriminator. The last is the default value. + * The middle ones are tuples in "(type, expansion)" format. + */ + +/* First, find out the number of generic cases. */ +#define QEMU_GENERIC(x, ...) \ + QEMU_GENERIC_(typeof(x), __VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) + +/* There will be extra arguments, but they are not used. */ +#define QEMU_GENERIC_(x, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, count, ...) \ + QEMU_GENERIC##count(x, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) + +/* Two more helper macros, this time to extract items from a parenthesized + * list. + */ +#define QEMU_FIRST_(a, b) a +#define QEMU_SECOND_(a, b) b + +/* ... and a final one for the common part of the "recursion". */ +#define QEMU_GENERIC_IF(x, type_then, else_) \ + __builtin_choose_expr(__builtin_types_compatible_p(x, \ + QEMU_FIRST_ type_then), \ + QEMU_SECOND_ type_then, else_) + +/* CPP poor man's "recursion". */ +#define QEMU_GENERIC1(x, a0, ...) (a0) +#define QEMU_GENERIC2(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC1(x, __VA_ARGS__)) +#define QEMU_GENERIC3(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC2(x, __VA_ARGS__)) +#define QEMU_GENERIC4(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC3(x, __VA_ARGS__)) +#define QEMU_GENERIC5(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC4(x, __VA_ARGS__)) +#define QEMU_GENERIC6(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC5(x, __VA_ARGS__)) +#define QEMU_GENERIC7(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC6(x, __VA_ARGS__)) +#define QEMU_GENERIC8(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC7(x, __VA_ARGS__)) +#define QEMU_GENERIC9(x, a0, ...) QEMU_GENERIC_IF(x, a0, QEMU_GENERIC8(x, __VA_ARGS__)) +#define QEMU_GENERIC10(x, a0, ...) 
QEMU_GENERIC_IF(x, a0, QEMU_GENERIC9(x, __VA_ARGS__)) #endif /* COMPILER_H */ diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h index ce2eb73670..6f8a487041 100644 --- a/include/qemu/coroutine.h +++ b/include/qemu/coroutine.h @@ -121,7 +121,7 @@ bool qemu_coroutine_entered(Coroutine *co); * Provides a mutex that can be used to synchronise coroutines */ struct CoWaitRecord; -typedef struct CoMutex { +struct CoMutex { /* Count of pending lockers; 0 for a free mutex, 1 for an * uncontended mutex. */ @@ -142,7 +142,7 @@ typedef struct CoMutex { unsigned handoff, sequence; Coroutine *holder; -} CoMutex; +}; /** * Initialises a CoMutex. This must be called before any other operation is used @@ -183,24 +183,33 @@ void qemu_co_queue_init(CoQueue *queue); * caller of the coroutine. The mutex is unlocked during the wait and * locked again afterwards. */ -void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex); +#define qemu_co_queue_wait(queue, lock) \ + qemu_co_queue_wait_impl(queue, QEMU_MAKE_LOCKABLE(lock)) +void coroutine_fn qemu_co_queue_wait_impl(CoQueue *queue, QemuLockable *lock); /** - * Restarts the next coroutine in the CoQueue and removes it from the queue. - * - * Returns true if a coroutine was restarted, false if the queue is empty. + * Removes the next coroutine from the CoQueue, and wake it up. + * Returns true if a coroutine was removed, false if the queue is empty. */ bool coroutine_fn qemu_co_queue_next(CoQueue *queue); /** - * Restarts all coroutines in the CoQueue and leaves the queue empty. + * Empties the CoQueue; all coroutines are woken up. */ void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue); /** - * Enter the next coroutine in the queue + * Removes the next coroutine from the CoQueue, and wake it up. Unlike + * qemu_co_queue_next, this function releases the lock during aio_co_wake + * because it is meant to be used outside coroutine context; in that case, the + * coroutine is entered immediately, before qemu_co_enter_next returns. + * + * If used in coroutine context, qemu_co_enter_next is equivalent to + * qemu_co_queue_next. */ -bool qemu_co_enter_next(CoQueue *queue); +#define qemu_co_enter_next(queue, lock) \ + qemu_co_enter_next_impl(queue, QEMU_MAKE_LOCKABLE(lock)) +bool qemu_co_enter_next_impl(CoQueue *queue, QemuLockable *lock); /** * Checks if the CoQueue is empty. @@ -271,4 +280,6 @@ void coroutine_fn qemu_co_sleep_ns(QEMUClockType type, int64_t ns); */ void coroutine_fn yield_until_fd_readable(int fd); +#include "qemu/lockable.h" + #endif /* QEMU_COROUTINE_H */ diff --git a/include/qemu/lockable.h b/include/qemu/lockable.h new file mode 100644 index 0000000000..b6ed6c89ec --- /dev/null +++ b/include/qemu/lockable.h @@ -0,0 +1,96 @@ +/* + * Polymorphic locking functions (aka poor man templates) + * + * Copyright Red Hat, Inc. 2017, 2018 + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#ifndef QEMU_LOCKABLE_H +#define QEMU_LOCKABLE_H + +#include "qemu/coroutine.h" +#include "qemu/thread.h" + +typedef void QemuLockUnlockFunc(void *); + +struct QemuLockable { + void *object; + QemuLockUnlockFunc *lock; + QemuLockUnlockFunc *unlock; +}; + +/* This function gives an error if an invalid, non-NULL pointer type is passed + * to QEMU_MAKE_LOCKABLE. For optimized builds, we can rely on dead-code elimination + * from the compiler, and give the errors already at link time. 
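+ * (In non-optimized builds the call is not eliminated, so a static inline
+ * variant that aborts at run time is used instead, below.)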
+ */ +#ifdef __OPTIMIZE__ +void unknown_lock_type(void *); +#else +static inline void unknown_lock_type(void *unused) +{ + abort(); +} +#endif + +static inline __attribute__((__always_inline__)) QemuLockable * +qemu_make_lockable(void *x, QemuLockable *lockable) +{ + /* We cannot test this in a macro, otherwise we get compiler + * warnings like "the address of 'm' will always evaluate as 'true'". + */ + return x ? lockable : NULL; +} + +/* Auxiliary macros to simplify QEMU_MAKE_LOCABLE. */ +#define QEMU_LOCK_FUNC(x) ((QemuLockUnlockFunc *) \ + QEMU_GENERIC(x, \ + (QemuMutex *, qemu_mutex_lock), \ + (CoMutex *, qemu_co_mutex_lock), \ + (QemuSpin *, qemu_spin_lock), \ + unknown_lock_type)) + +#define QEMU_UNLOCK_FUNC(x) ((QemuLockUnlockFunc *) \ + QEMU_GENERIC(x, \ + (QemuMutex *, qemu_mutex_unlock), \ + (CoMutex *, qemu_co_mutex_unlock), \ + (QemuSpin *, qemu_spin_unlock), \ + unknown_lock_type)) + +/* In C, compound literals have the lifetime of an automatic variable. + * In C++ it would be different, but then C++ wouldn't need QemuLockable + * either... + */ +#define QEMU_MAKE_LOCKABLE_(x) qemu_make_lockable((x), &(QemuLockable) { \ + .object = (x), \ + .lock = QEMU_LOCK_FUNC(x), \ + .unlock = QEMU_UNLOCK_FUNC(x), \ + }) + +/* QEMU_MAKE_LOCKABLE - Make a polymorphic QemuLockable + * + * @x: a lock object (currently one of QemuMutex, CoMutex, QemuSpin). + * + * Returns a QemuLockable object that can be passed around + * to a function that can operate with locks of any kind. + */ +#define QEMU_MAKE_LOCKABLE(x) \ + QEMU_GENERIC(x, \ + (QemuLockable *, (x)), \ + QEMU_MAKE_LOCKABLE_(x)) + +static inline void qemu_lockable_lock(QemuLockable *x) +{ + x->lock(x->object); +} + +static inline void qemu_lockable_unlock(QemuLockable *x) +{ + x->unlock(x->object); +} + +#endif diff --git a/include/qemu/thread.h b/include/qemu/thread.h index 9af4e945aa..ef7bd16123 100644 --- a/include/qemu/thread.h +++ b/include/qemu/thread.h @@ -4,7 +4,6 @@ #include "qemu/processor.h" #include "qemu/atomic.h" -typedef struct QemuMutex QemuMutex; typedef struct QemuCond QemuCond; typedef struct QemuSemaphore QemuSemaphore; typedef struct QemuEvent QemuEvent; @@ -97,9 +96,9 @@ struct Notifier; void qemu_thread_atexit_add(struct Notifier *notifier); void qemu_thread_atexit_remove(struct Notifier *notifier); -typedef struct QemuSpin { +struct QemuSpin { int value; -} QemuSpin; +}; static inline void qemu_spin_init(QemuSpin *spin) { diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h index 9bd7a834ba..5923849cdd 100644 --- a/include/qemu/typedefs.h +++ b/include/qemu/typedefs.h @@ -19,6 +19,7 @@ typedef struct BusClass BusClass; typedef struct BusState BusState; typedef struct Chardev Chardev; typedef struct CompatProperty CompatProperty; +typedef struct CoMutex CoMutex; typedef struct CPUAddressSpace CPUAddressSpace; typedef struct CPUState CPUState; typedef struct DeviceListener DeviceListener; @@ -86,9 +87,12 @@ typedef struct QEMUBH QEMUBH; typedef struct QemuConsole QemuConsole; typedef struct QemuDmaBuf QemuDmaBuf; typedef struct QEMUFile QEMUFile; +typedef struct QemuLockable QemuLockable; +typedef struct QemuMutex QemuMutex; typedef struct QemuOpt QemuOpt; typedef struct QemuOpts QemuOpts; typedef struct QemuOptsList QemuOptsList; +typedef struct QemuSpin QemuSpin; typedef struct QEMUSGList QEMUSGList; typedef struct QEMUTimer QEMUTimer; typedef struct QEMUTimerListGroup QEMUTimerListGroup; diff --git a/include/qemu/vfio-helpers.h b/include/qemu/vfio-helpers.h new file mode 100644 index 
0000000000..ce7e7b057f --- /dev/null +++ b/include/qemu/vfio-helpers.h @@ -0,0 +1,33 @@ +/* + * QEMU VFIO helpers + * + * Copyright 2016 - 2018 Red Hat, Inc. + * + * Authors: + * Fam Zheng <famz@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef QEMU_VFIO_HELPERS_H +#define QEMU_VFIO_HELPERS_H +#include "qemu/typedefs.h" + +typedef struct QEMUVFIOState QEMUVFIOState; + +QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp); +void qemu_vfio_close(QEMUVFIOState *s); +int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size, + bool temporary, uint64_t *iova_list); +int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s); +void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host); +void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index, + uint64_t offset, uint64_t size, + Error **errp); +void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar, + uint64_t offset, uint64_t size); +int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e, + int irq_type, Error **errp); + +#endif diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h index c4e52a5fa3..92ab624fac 100644 --- a/include/sysemu/block-backend.h +++ b/include/sysemu/block-backend.h @@ -229,4 +229,7 @@ void blk_io_limits_enable(BlockBackend *blk, const char *group); void blk_io_limits_update_group(BlockBackend *blk, const char *group); void blk_set_force_allow_inactivate(BlockBackend *blk); +void blk_register_buf(BlockBackend *blk, void *host, size_t size); +void blk_unregister_buf(BlockBackend *blk, void *host); + #endif diff --git a/qapi/block-core.json b/qapi/block-core.json index 8225308904..8046c2da23 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -2248,6 +2248,7 @@ # # @vxhs: Since 2.10 # @throttle: Since 2.11 +# @nvme: Since 2.12 # # Since: 2.9 ## @@ -2255,7 +2256,7 @@ 'data': [ 'blkdebug', 'blkverify', 'bochs', 'cloop', 'dmg', 'file', 'ftp', 'ftps', 'gluster', 'host_cdrom', 'host_device', 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs', - 'null-aio', 'null-co', 'parallels', 'qcow', 'qcow2', 'qed', + 'null-aio', 'null-co', 'nvme', 'parallels', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd', 'replication', 'sheepdog', 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] } @@ -2297,6 +2298,19 @@ 'data': { '*size': 'int', '*latency-ns': 'uint64' } } ## +# @BlockdevOptionsNVMe: +# +# Driver specific block device options for the NVMe backend. +# +# @device: controller address of the NVMe device. +# @namespace: namespace number of the device, starting from 1. +# +# Since: 2.12 +## +{ 'struct': 'BlockdevOptionsNVMe', + 'data': { 'device': 'str', 'namespace': 'int' } } + +## # @BlockdevOptionsVVFAT: # # Driver specific block device options for the vvfat protocol. @@ -3201,6 +3215,7 @@ 'nfs': 'BlockdevOptionsNfs', 'null-aio': 'BlockdevOptionsNull', 'null-co': 'BlockdevOptionsNull', + 'nvme': 'BlockdevOptionsNVMe', 'parallels': 'BlockdevOptionsGenericFormat', 'qcow2': 'BlockdevOptionsQcow2', 'qcow': 'BlockdevOptionsQcow', diff --git a/qemu-doc.texi b/qemu-doc.texi index 19a82bfea3..769968aba4 100644 --- a/qemu-doc.texi +++ b/qemu-doc.texi @@ -621,6 +621,7 @@ encrypted disk images. 
* disk_images_iscsi:: iSCSI LUNs * disk_images_gluster:: GlusterFS disk images * disk_images_ssh:: Secure Shell (ssh) disk images +* disk_images_nvme:: NVMe userspace driver * disk_image_locking:: Disk image file locking @end menu diff --git a/qemu-img.c b/qemu-img.c index 68b375f998..28d0e4e9f8 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -3862,6 +3862,7 @@ static int img_bench(int argc, char **argv) struct timeval t1, t2; int i; bool force_share = false; + size_t buf_size; for (;;) { static const struct option long_options[] = { @@ -4050,9 +4051,12 @@ static int img_bench(int argc, char **argv) printf("Sending flush every %d requests\n", flush_interval); } - data.buf = blk_blockalign(blk, data.nrreq * data.bufsize); + buf_size = data.nrreq * data.bufsize; + data.buf = blk_blockalign(blk, buf_size); memset(data.buf, pattern, data.nrreq * data.bufsize); + blk_register_buf(blk, data.buf, buf_size); + data.qiov = g_new(QEMUIOVector, data.nrreq); for (i = 0; i < data.nrreq; i++) { qemu_iovec_init(&data.qiov[i], 1); @@ -4073,6 +4077,9 @@ static int img_bench(int argc, char **argv) + ((double)(t2.tv_usec - t1.tv_usec) / 1000000)); out: + if (data.buf) { + blk_unregister_buf(blk, data.buf); + } qemu_vfree(data.buf); blk_unref(blk); diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs index 8cfe34328a..2d59d84091 100644 --- a/stubs/Makefile.objs +++ b/stubs/Makefile.objs @@ -42,3 +42,4 @@ stub-obj-y += vmgenid.o stub-obj-y += xen-common.o stub-obj-y += xen-hvm.o stub-obj-y += pci-host-piix.o +stub-obj-y += ram-block.o diff --git a/stubs/ram-block.c b/stubs/ram-block.c new file mode 100644 index 0000000000..cfa5d8678f --- /dev/null +++ b/stubs/ram-block.c @@ -0,0 +1,16 @@ +#include "qemu/osdep.h" +#include "exec/ramlist.h" +#include "exec/cpu-common.h" + +void ram_block_notifier_add(RAMBlockNotifier *n) +{ +} + +void ram_block_notifier_remove(RAMBlockNotifier *n) +{ +} + +int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque) +{ + return 0; +} diff --git a/tests/docker/dockerfiles/fedora.docker b/tests/docker/dockerfiles/fedora.docker index 26ede4f1d6..994a35a332 100644 --- a/tests/docker/dockerfiles/fedora.docker +++ b/tests/docker/dockerfiles/fedora.docker @@ -1,4 +1,4 @@ -FROM fedora:latest +FROM fedora:27 ENV PACKAGES \ ccache gettext git tar PyYAML sparse flex bison python3 bzip2 hostname \ glib2-devel pixman-devel zlib-devel SDL-devel libfdt-devel \ diff --git a/tests/test-coroutine.c b/tests/test-coroutine.c index 76c646107e..28e79b3210 100644 --- a/tests/test-coroutine.c +++ b/tests/test-coroutine.c @@ -14,6 +14,7 @@ #include "qemu/osdep.h" #include "qemu/coroutine.h" #include "qemu/coroutine_int.h" +#include "qemu/lockable.h" /* * Check that qemu_in_coroutine() works @@ -175,7 +176,7 @@ static void coroutine_fn c1_fn(void *opaque) qemu_coroutine_enter(c2); } -static void test_co_queue(void) +static void test_no_dangling_access(void) { Coroutine *c1; Coroutine *c2; @@ -195,6 +196,74 @@ static void test_co_queue(void) *c1 = tmp; } +static bool locked; +static int done; + +static void coroutine_fn mutex_fn(void *opaque) +{ + CoMutex *m = opaque; + qemu_co_mutex_lock(m); + assert(!locked); + locked = true; + qemu_coroutine_yield(); + locked = false; + qemu_co_mutex_unlock(m); + done++; +} + +static void coroutine_fn lockable_fn(void *opaque) +{ + QemuLockable *x = opaque; + qemu_lockable_lock(x); + assert(!locked); + locked = true; + qemu_coroutine_yield(); + locked = false; + qemu_lockable_unlock(x); + done++; +} + +static void do_test_co_mutex(CoroutineEntry *entry, void *opaque) 
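+/* Both coroutines contend for the same lock through @entry: the first enter
+ * takes it, the second blocks, and the remaining enters check that the lock
+ * is handed over to the queued waiter exactly once, with mutual exclusion
+ * preserved throughout. */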
+{ + Coroutine *c1 = qemu_coroutine_create(entry, opaque); + Coroutine *c2 = qemu_coroutine_create(entry, opaque); + + done = 0; + qemu_coroutine_enter(c1); + g_assert(locked); + qemu_coroutine_enter(c2); + + /* Unlock queues c2. It is then started automatically when c1 yields or + * terminates. + */ + qemu_coroutine_enter(c1); + g_assert_cmpint(done, ==, 1); + g_assert(locked); + + qemu_coroutine_enter(c2); + g_assert_cmpint(done, ==, 2); + g_assert(!locked); +} + +static void test_co_mutex(void) +{ + CoMutex m; + + qemu_co_mutex_init(&m); + do_test_co_mutex(mutex_fn, &m); +} + +static void test_co_mutex_lockable(void) +{ + CoMutex m; + CoMutex *null_pointer = NULL; + + qemu_co_mutex_init(&m); + do_test_co_mutex(lockable_fn, QEMU_MAKE_LOCKABLE(&m)); + + g_assert(QEMU_MAKE_LOCKABLE(null_pointer) == NULL); +} + /* * Check that creation, enter, and return work */ @@ -422,7 +491,7 @@ int main(int argc, char **argv) * crash, so skip it. */ if (CONFIG_COROUTINE_POOL) { - g_test_add_func("/basic/co_queue", test_co_queue); + g_test_add_func("/basic/no-dangling-access", test_no_dangling_access); } g_test_add_func("/basic/lifecycle", test_lifecycle); @@ -432,6 +501,8 @@ int main(int argc, char **argv) g_test_add_func("/basic/entered", test_entered); g_test_add_func("/basic/in_coroutine", test_in_coroutine); g_test_add_func("/basic/order", test_order); + g_test_add_func("/locking/co-mutex", test_co_mutex); + g_test_add_func("/locking/co-mutex/lockable", test_co_mutex_lockable); if (g_test_perf()) { g_test_add_func("/perf/lifecycle", perf_lifecycle); g_test_add_func("/perf/nesting", perf_nesting); diff --git a/tests/vm/README b/tests/vm/README index ae53dce6ee..f9c04cc0e7 100644 --- a/tests/vm/README +++ b/tests/vm/README @@ -1,89 +1 @@ -=== VM test suite to run build in guests === - -== Intro == - -This test suite contains scripts that bootstrap various guest images that have -necessary packages to build QEMU. The basic usage is documented in Makefile -help which is displayed with "make vm-test". - -== Quick start == - -Run "make vm-test" to list available make targets. Invoke a specific make -command to run build test in an image. For example, "make vm-build-freebsd" -will build the source tree in the FreeBSD image. The command can be executed -from either the source tree or the build dir; if the former, ./configure is not -needed. The command will then generate the test image in ./tests/vm/ under the -working directory. - -Note: images created by the scripts accept a well-known RSA key pair for SSH -access, so they SHOULD NOT be exposed to external interfaces if you are -concerned about attackers taking control of the guest and potentially -exploiting a QEMU security bug to compromise the host. - -== QEMU binary == - -By default, qemu-system-x86_64 is searched in $PATH to run the guest. If there -isn't one, or if it is older than 2.10, the test won't work. In this case, -provide the QEMU binary in env var: QEMU=/path/to/qemu-2.10+. - -== Make jobs == - -The "-j$X" option in the make command line is not propagated into the VM, -specify "J=$X" to control the make jobs in the guest. - -== Debugging == - -Add "DEBUG=1" and/or "V=1" to the make command to allow interactive debugging -and verbose output. If this is not enough, see the next section. - -== Manual invocation == - -Each guest script is an executable script with the same command line options. 
diff --git a/tests/vm/README b/tests/vm/README
index ae53dce6ee..f9c04cc0e7 100644
--- a/tests/vm/README
+++ b/tests/vm/README
@@ -1,89 +1 @@
-=== VM test suite to run build in guests ===
-
-== Intro ==
-
-This test suite contains scripts that bootstrap various guest images that have
-necessary packages to build QEMU. The basic usage is documented in Makefile
-help which is displayed with "make vm-test".
-
-== Quick start ==
-
-Run "make vm-test" to list available make targets. Invoke a specific make
-command to run build test in an image. For example, "make vm-build-freebsd"
-will build the source tree in the FreeBSD image. The command can be executed
-from either the source tree or the build dir; if the former, ./configure is not
-needed. The command will then generate the test image in ./tests/vm/ under the
-working directory.
-
-Note: images created by the scripts accept a well-known RSA key pair for SSH
-access, so they SHOULD NOT be exposed to external interfaces if you are
-concerned about attackers taking control of the guest and potentially
-exploiting a QEMU security bug to compromise the host.
-
-== QEMU binary ==
-
-By default, qemu-system-x86_64 is searched in $PATH to run the guest. If there
-isn't one, or if it is older than 2.10, the test won't work. In this case,
-provide the QEMU binary in env var: QEMU=/path/to/qemu-2.10+.
-
-== Make jobs ==
-
-The "-j$X" option in the make command line is not propagated into the VM,
-specify "J=$X" to control the make jobs in the guest.
-
-== Debugging ==
-
-Add "DEBUG=1" and/or "V=1" to the make command to allow interactive debugging
-and verbose output. If this is not enough, see the next section.
-
-== Manual invocation ==
-
-Each guest script is an executable script with the same command line options.
-For example to work with the netbsd guest, use $QEMU_SRC/tests/vm/netbsd:
-
-    $ cd $QEMU_SRC/tests/vm
-
-    # To bootstrap the image
-    $ ./netbsd --build-image --image /var/tmp/netbsd.img
-    <...>
-
-    # To run an arbitrary command in guest (the output will not be echoed unless
-    # --debug is added)
-    $ ./netbsd --debug --image /var/tmp/netbsd.img uname -a
-
-    # To build QEMU in guest
-    $ ./netbsd --debug --image /var/tmp/netbsd.img --build-qemu $QEMU_SRC
-
-    # To get to an interactive shell
-    $ ./netbsd --interactive --image /var/tmp/netbsd.img sh
-
-== Adding new guests ==
-
-Please look at existing guest scripts for how to add new guests.
-
-Most importantly, create a subclass of BaseVM and implement build_image()
-method and define BUILD_SCRIPT, then finally call basevm.main() from the
-script's main().
-
-  - Usually in build_image(), a template image is downloaded from a predefined
-    URL. BaseVM._download_with_cache() takes care of the cache and the
-    checksum, so consider using it.
-
-  - Once the image is downloaded, users, SSH server and QEMU build deps should
-    be set up:
-
-    * Root password set to BaseVM.ROOT_PASS
-    * User BaseVM.GUEST_USER is created, and password set to BaseVM.GUEST_PASS
-    * SSH service is enabled and started on boot,
-      $QEMU_SRC/tests/keys/id_rsa.pub is added to ssh's "authorized_keys" file
-      of both root and the normal user
-    * DHCP client service is enabled and started on boot, so that it can
-      automatically configure the virtio-net-pci NIC and communicate with QEMU
-      user net (10.0.2.2)
-    * Necessary packages are installed to untar the source tarball and build
-      QEMU
-
-  - Write a proper BUILD_SCRIPT template, which should be a shell script that
-    untars a raw virtio-blk block device, which is the tarball data blob of the
-    QEMU source tree, then configure/build it. Running "make check" is also
-    recommended.
+See docs/devel/testing.rst for help.
diff --git a/util/Makefile.objs b/util/Makefile.objs
index 2973b0a323..3fb611631f 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -46,3 +46,4 @@ util-obj-y += qht.o
 util-obj-y += range.o
 util-obj-y += stats64.o
 util-obj-y += systemd.o
+util-obj-$(CONFIG_LINUX) += vfio-helpers.o
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index 846ff9167f..78fb79acf8 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -40,13 +40,13 @@ void qemu_co_queue_init(CoQueue *queue)
     QSIMPLEQ_INIT(&queue->entries);
 }
 
-void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex)
+void coroutine_fn qemu_co_queue_wait_impl(CoQueue *queue, QemuLockable *lock)
 {
     Coroutine *self = qemu_coroutine_self();
     QSIMPLEQ_INSERT_TAIL(&queue->entries, self, co_queue_next);
 
-    if (mutex) {
-        qemu_co_mutex_unlock(mutex);
+    if (lock) {
+        qemu_lockable_unlock(lock);
     }
 
     /* There is no race condition here.  Other threads will call
@@ -60,9 +60,11 @@ void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex)
     /* TODO: OSv implements wait morphing here, where the wakeup
      * primitive automatically places the woken coroutine on the
      * mutex's queue.  This avoids the thundering herd effect.
+     * This could be implemented for CoMutexes, but not really for
+     * other cases of QemuLockable.
      */
-    if (mutex) {
-        qemu_co_mutex_lock(mutex);
+    if (lock) {
+        qemu_lockable_lock(lock);
     }
 }
 
@@ -130,7 +132,7 @@ void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue)
     qemu_co_queue_do_restart(queue, false);
 }
 
-bool qemu_co_enter_next(CoQueue *queue)
+bool qemu_co_enter_next_impl(CoQueue *queue, QemuLockable *lock)
 {
     Coroutine *next;
 
@@ -140,7 +142,13 @@ bool qemu_co_enter_next(CoQueue *queue)
     }
 
     QSIMPLEQ_REMOVE_HEAD(&queue->entries, co_queue_next);
-    qemu_coroutine_enter(next);
+    if (lock) {
+        qemu_lockable_unlock(lock);
+    }
+    aio_co_wake(next);
+    if (lock) {
+        qemu_lockable_lock(lock);
+    }
     return true;
 }
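These qemu-coroutine-lock.c changes are what let block/curl.c (earlier in this series) drop its hand-rolled QSIMPLEQ of sleeping requests: qemu_co_queue_wait() now accepts any QemuLockable and releases/re-takes it around the yield, while qemu_co_enter_next() takes the lock too and wakes the coroutine via aio_co_wake(), making it safe to call from another thread. A sketch of the resulting wait/wake pattern under a plain QemuMutex (the Resource type and helpers are hypothetical, mirroring the curl conversion):

#include "qemu/osdep.h"
#include "qemu/thread.h"
#include "qemu/coroutine.h"

typedef struct Resource {
    QemuMutex mutex;     /* protects in_use; init with qemu_mutex_init() */
    CoQueue free_waitq;  /* init with qemu_co_queue_init() */
    bool in_use;
} Resource;

static void coroutine_fn resource_acquire(Resource *r)
{
    qemu_mutex_lock(&r->mutex);
    while (r->in_use) {
        /* Atomically drops and re-takes r->mutex around the yield. */
        qemu_co_queue_wait(&r->free_waitq, &r->mutex);
    }
    r->in_use = true;
    qemu_mutex_unlock(&r->mutex);
}

static void resource_release(Resource *r)
{
    qemu_mutex_lock(&r->mutex);
    r->in_use = false;
    /* Thread-safe since this series: wakes one waiter, if any. */
    qemu_co_enter_next(&r->free_waitq, &r->mutex);
    qemu_mutex_unlock(&r->mutex);
}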
diff --git a/util/trace-events b/util/trace-events
index 515e6257fb..4822434c89 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -60,3 +60,14 @@ lockcnt_futex_wake(const void *lockcnt) "lockcnt %p waking up one waiter"
 qemu_mutex_lock(void *mutex, const char *file, const int line) "waiting on mutex %p (%s:%d)"
 qemu_mutex_locked(void *mutex, const char *file, const int line) "taken mutex %p (%s:%d)"
 qemu_mutex_unlock(void *mutex, const char *file, const int line) "released mutex %p (%s:%d)"
+
+# util/vfio-helpers.c
+qemu_vfio_dma_reset_temporary(void *s) "s %p"
+qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
+qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
+qemu_vfio_find_mapping(void *s, void *p) "s %p host %p"
+qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova) "s %p host %p size %zu index %d iova 0x%"PRIx64
+qemu_vfio_do_mapping(void *s, void *host, size_t size, uint64_t iova) "s %p host %p size %zu iova 0x%"PRIx64
+qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size %zu temporary %d iova %p"
+qemu_vfio_dma_map_invalid(void *s, void *mapping_host, size_t mapping_size, void *host, size_t size) "s %p mapping %p %zu requested %p %zu"
+qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
new file mode 100644
index 0000000000..f478b68400
--- /dev/null
+++ b/util/vfio-helpers.c
@@ -0,0 +1,727 @@
+/*
+ * VFIO utility
+ *
+ * Copyright 2016 - 2018 Red Hat, Inc.
+ *
+ * Authors:
+ *   Fam Zheng <famz@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include <sys/ioctl.h>
+#include <linux/vfio.h>
+#include "qapi/error.h"
+#include "exec/ramlist.h"
+#include "exec/cpu-common.h"
+#include "trace.h"
+#include "qemu/queue.h"
+#include "qemu/error-report.h"
+#include "standard-headers/linux/pci_regs.h"
+#include "qemu/event_notifier.h"
+#include "qemu/vfio-helpers.h"
+#include "trace.h"
+
+#define QEMU_VFIO_DEBUG 0
+
+#define QEMU_VFIO_IOVA_MIN 0x10000ULL
+/* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface,
+ * we can use a runtime limit; alternatively it's also possible to do platform
+ * specific detection by reading sysfs entries. Until then, 39 is a safe bet.
+ **/
+#define QEMU_VFIO_IOVA_MAX (1ULL << 39)
+
+typedef struct {
+    /* Page aligned addr. */
+    void *host;
+    size_t size;
+    uint64_t iova;
+} IOVAMapping;
+
+struct QEMUVFIOState {
+    QemuMutex lock;
+
+    /* These fields are protected by BQL */
+    int container;
+    int group;
+    int device;
+    RAMBlockNotifier ram_notifier;
+    struct vfio_region_info config_region_info, bar_region_info[6];
+
+    /* These fields are protected by @lock */
+    /* VFIO's IO virtual address space is managed by splitting into a few
+     * sections:
+     *
+     * ---------------       <= 0
+     * |xxxxxxxxxxxxx|
+     * |-------------|       <= QEMU_VFIO_IOVA_MIN
+     * |             |
+     * |    Fixed    |
+     * |             |
+     * |-------------|       <= low_water_mark
+     * |             |
+     * |    Free     |
+     * |             |
+     * |-------------|       <= high_water_mark
+     * |             |
+     * |    Temp     |
+     * |             |
+     * |-------------|       <= QEMU_VFIO_IOVA_MAX
+     * |xxxxxxxxxxxxx|
+     * |xxxxxxxxxxxxx|
+     * ---------------
+     *
+     * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
+     *
+     * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
+     *   [QEMU_VFIO_IOVA_MIN, low_water_mark).  Once allocated they will not be
+     *   reclaimed - low_water_mark never shrinks;
+     *
+     * - IOVAs in range [low_water_mark, high_water_mark) are free;
+     *
+     * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
+     *   mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area
+     *   is recycled. The caller should make sure I/O's depending on these
+     *   mappings are completed before calling.
+     **/
+    uint64_t low_water_mark;
+    uint64_t high_water_mark;
+    IOVAMapping *mappings;
+    int nr_mappings;
+};
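The two-watermark scheme in the comment above fits in a few lines; this standalone toy model (simplified: the real qemu_vfio_dma_map() also recognizes already-mapped regions and keeps a sorted mapping list) shows how the Fixed and Temp areas grow toward each other until the Free region is exhausted:

#include <stdint.h>
#include <stdbool.h>

#define IOVA_MIN 0x10000ULL
#define IOVA_MAX (1ULL << 39)

static uint64_t low_water_mark = IOVA_MIN;   /* grows upward, never shrinks */
static uint64_t high_water_mark = IOVA_MAX;  /* grows downward, resettable */

static bool alloc_iova(uint64_t size, bool temporary, uint64_t *iova)
{
    if (high_water_mark - low_water_mark < size) {
        return false;                        /* free region exhausted */
    }
    if (temporary) {
        high_water_mark -= size;
        *iova = high_water_mark;             /* reclaimed on reset */
    } else {
        *iova = low_water_mark;
        low_water_mark += size;              /* fixed, never reclaimed */
    }
    return true;
}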
+
+/**
+ * Find group file by the PCI device address specified by @device, and return
+ * the path.  The returned string is owned by the caller and should be
+ * g_free'd later.
+ */
+static char *sysfs_find_group_file(const char *device, Error **errp)
+{
+    char *sysfs_link;
+    char *sysfs_group;
+    char *p;
+    char *path = NULL;
+
+    sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device);
+    sysfs_group = g_malloc(PATH_MAX);
+    if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) {
+        error_setg_errno(errp, errno, "Failed to find iommu group sysfs path");
+        goto out;
+    }
+    p = strrchr(sysfs_group, '/');
+    if (!p) {
+        error_setg(errp, "Failed to find iommu group number");
+        goto out;
+    }
+
+    path = g_strdup_printf("/dev/vfio/%s", p + 1);
+out:
+    g_free(sysfs_link);
+    g_free(sysfs_group);
+    return path;
+}
+
+static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
+{
+    assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info));
+}
+
+static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
+{
+    assert_bar_index_valid(s, index);
+    s->bar_region_info[index] = (struct vfio_region_info) {
+        .index = VFIO_PCI_BAR0_REGION_INDEX + index,
+        .argsz = sizeof(struct vfio_region_info),
+    };
+    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) {
+        error_setg_errno(errp, errno, "Failed to get BAR region info");
+        return -errno;
+    }
+
+    return 0;
+}
+
+/**
+ * Map a PCI bar area.
+ */
+void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
+                            uint64_t offset, uint64_t size,
+                            Error **errp)
+{
+    void *p;
+    assert_bar_index_valid(s, index);
+    p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
+             PROT_READ | PROT_WRITE, MAP_SHARED,
+             s->device, s->bar_region_info[index].offset + offset);
+    if (p == MAP_FAILED) {
+        error_setg_errno(errp, errno, "Failed to map BAR region");
+        p = NULL;
+    }
+    return p;
+}
+
+/**
+ * Unmap a PCI bar area.
+ */
+void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar,
+                             uint64_t offset, uint64_t size)
+{
+    if (bar) {
+        munmap(bar, MIN(size, s->bar_region_info[index].size - offset));
+    }
+}
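As a hedged usage sketch of the two BAR helpers just defined (the 4KB size, zero offset and 64-bit register read are made up for illustration; block/nvme.c is the real consumer and uses the device's actual register layout):

#include "qemu/osdep.h"
#include "qemu/bswap.h"
#include "qapi/error.h"
#include "qemu/vfio-helpers.h"

/* Illustrative only: map the first 4KB of BAR0 and read one register. */
static uint64_t read_bar0_reg_example(QEMUVFIOState *s, Error **errp)
{
    uint64_t val = 0;
    void *regs = qemu_vfio_pci_map_bar(s, 0, 0, 0x1000, errp);

    if (regs) {
        val = le64_to_cpu(*(volatile uint64_t *)regs);
        qemu_vfio_pci_unmap_bar(s, 0, regs, 0, 0x1000);
    }
    return val;
}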
+
+/**
+ * Initialize device IRQ with @irq_type and register an event notifier.
+ */
+int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e,
+                           int irq_type, Error **errp)
+{
+    int r;
+    struct vfio_irq_set *irq_set;
+    size_t irq_set_size;
+    struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
+
+    irq_info.index = irq_type;
+    if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) {
+        error_setg_errno(errp, errno, "Failed to get device interrupt info");
+        return -errno;
+    }
+    if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
+        error_setg(errp, "Device interrupt doesn't support eventfd");
+        return -EINVAL;
+    }
+
+    irq_set_size = sizeof(*irq_set) + sizeof(int);
+    irq_set = g_malloc0(irq_set_size);
+
+    /* Get to a known IRQ state */
+    *irq_set = (struct vfio_irq_set) {
+        .argsz = irq_set_size,
+        .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
+        .index = irq_info.index,
+        .start = 0,
+        .count = 1,
+    };
+
+    *(int *)&irq_set->data = event_notifier_get_fd(e);
+    r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set);
+    g_free(irq_set);
+    if (r) {
+        error_setg_errno(errp, errno, "Failed to setup device interrupt");
+        return -errno;
+    }
+    return 0;
+}
+
+static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
+                                     int size, int ofs)
+{
+    int ret;
+
+    do {
+        ret = pread(s->device, buf, size, s->config_region_info.offset + ofs);
+    } while (ret == -1 && errno == EINTR);
+    return ret == size ? 0 : -errno;
+}
+
+static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs)
+{
+    int ret;
+
+    do {
+        ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs);
+    } while (ret == -1 && errno == EINTR);
+    return ret == size ? 0 : -errno;
+}
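A sketch of how a driver can pair qemu_vfio_pci_init_irq() with an EventNotifier (the helper is hypothetical; wiring the notifier into an AioContext handler is omitted):

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu/event_notifier.h"
#include "qemu/vfio-helpers.h"
#include <linux/vfio.h>

/* Illustrative only: route the device's MSI-X vector 0 to an eventfd. */
static int hook_irq_example(QEMUVFIOState *s, EventNotifier *e, Error **errp)
{
    int ret = event_notifier_init(e, 0);      /* create the eventfd */

    if (ret) {
        error_setg_errno(errp, -ret, "Failed to create EventNotifier");
        return ret;
    }
    return qemu_vfio_pci_init_irq(s, e, VFIO_PCI_MSIX_IRQ_INDEX, errp);
}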
+
+static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
+                              Error **errp)
+{
+    int ret;
+    int i;
+    uint16_t pci_cmd;
+    struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
+    struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
+    struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
+    char *group_file = NULL;
+
+    /* Create a new container */
+    s->container = open("/dev/vfio/vfio", O_RDWR);
+
+    if (s->container == -1) {
+        error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio");
+        return -errno;
+    }
+    if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
+        error_setg(errp, "Invalid VFIO version");
+        ret = -EINVAL;
+        goto fail_container;
+    }
+
+    if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
+        error_setg_errno(errp, errno, "VFIO IOMMU check failed");
+        ret = -EINVAL;
+        goto fail_container;
+    }
+
+    /* Open the group */
+    group_file = sysfs_find_group_file(device, errp);
+    if (!group_file) {
+        ret = -EINVAL;
+        goto fail_container;
+    }
+
+    s->group = open(group_file, O_RDWR);
+    if (s->group == -1) {
+        error_setg_errno(errp, errno, "Failed to open VFIO group file: %s",
+                         group_file);
+        g_free(group_file);
+        ret = -errno;
+        goto fail_container;
+    }
+    g_free(group_file);
+
+    /* Test the group is viable and available */
+    if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) {
+        error_setg_errno(errp, errno, "Failed to get VFIO group status");
+        ret = -errno;
+        goto fail;
+    }
+
+    if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+        error_setg(errp, "VFIO group is not viable");
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    /* Add the group to the container */
+    if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) {
+        error_setg_errno(errp, errno, "Failed to add group to VFIO container");
+        ret = -errno;
+        goto fail;
+    }
+
+    /* Enable the IOMMU model we want */
+    if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
+        error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type");
+        ret = -errno;
+        goto fail;
+    }
+
+    /* Get additional IOMMU info */
+    if (ioctl(s->container, VFIO_IOMMU_GET_INFO, &iommu_info)) {
+        error_setg_errno(errp, errno, "Failed to get IOMMU info");
+        ret = -errno;
+        goto fail;
+    }
+
+    s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
+
+    if (s->device < 0) {
+        error_setg_errno(errp, errno, "Failed to get device fd");
+        ret = -errno;
+        goto fail;
+    }
+
+    /* Test and setup the device */
+    if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) {
+        error_setg_errno(errp, errno, "Failed to get device info");
+        ret = -errno;
+        goto fail;
+    }
+
+    if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
+        error_setg(errp, "Invalid device regions");
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    s->config_region_info = (struct vfio_region_info) {
+        .index = VFIO_PCI_CONFIG_REGION_INDEX,
+        .argsz = sizeof(struct vfio_region_info),
+    };
+    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) {
+        error_setg_errno(errp, errno, "Failed to get config region info");
+        ret = -errno;
+        goto fail;
+    }
+
+    for (i = 0; i < 6; i++) {
+        ret = qemu_vfio_pci_init_bar(s, i, errp);
+        if (ret) {
+            goto fail;
+        }
+    }
+
+    /* Enable bus master */
+    ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
+    if (ret) {
+        goto fail;
+    }
+    pci_cmd |= PCI_COMMAND_MASTER;
+    ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
+    if (ret) {
+        goto fail;
+    }
+    return 0;
+fail:
+    close(s->group);
+fail_container:
+    close(s->container);
+    return ret;
+}
+
+static void qemu_vfio_ram_block_added(RAMBlockNotifier *n,
+                                      void *host, size_t size)
+{
+    QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
+    trace_qemu_vfio_ram_block_added(s, host, size);
+    qemu_vfio_dma_map(s, host, size, false, NULL);
+}
+
+static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n,
+                                        void *host, size_t size)
+{
+    QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
+    if (host) {
+        trace_qemu_vfio_ram_block_removed(s, host, size);
+        qemu_vfio_dma_unmap(s, host);
+    }
+}
+
+static int qemu_vfio_init_ramblock(const char *block_name, void *host_addr,
+                                   ram_addr_t offset, ram_addr_t length,
+                                   void *opaque)
+{
+    int ret;
+    QEMUVFIOState *s = opaque;
+
+    if (!host_addr) {
+        return 0;
+    }
+    ret = qemu_vfio_dma_map(s, host_addr, length, false, NULL);
+    if (ret) {
+        fprintf(stderr, "qemu_vfio_init_ramblock: failed %p %" PRId64 "\n",
+                host_addr, (uint64_t)length);
+    }
+    return 0;
+}
+
+static void qemu_vfio_open_common(QEMUVFIOState *s)
+{
+    s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added;
+    s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed;
+    ram_block_notifier_add(&s->ram_notifier);
+    s->low_water_mark = QEMU_VFIO_IOVA_MIN;
+    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
+    qemu_ram_foreach_block(qemu_vfio_init_ramblock, s);
+    qemu_mutex_init(&s->lock);
+}
+
+/**
+ * Open a PCI device, e.g. "0000:00:01.0".
+ */
+QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
+{
+    int r;
+    QEMUVFIOState *s = g_new0(QEMUVFIOState, 1);
+
+    r = qemu_vfio_init_pci(s, device, errp);
+    if (r) {
+        g_free(s);
+        return NULL;
+    }
+    qemu_vfio_open_common(s);
+    return s;
+}
+
+static void qemu_vfio_dump_mapping(IOVAMapping *m)
+{
+    if (QEMU_VFIO_DEBUG) {
+        printf("  vfio mapping %p %" PRIx64 " to %" PRIx64 "\n", m->host,
+               (uint64_t)m->size, (uint64_t)m->iova);
+    }
+}
+
+static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
+{
+    int i;
+
+    if (QEMU_VFIO_DEBUG) {
+        printf("vfio mappings\n");
+        for (i = 0; i < s->nr_mappings; ++i) {
+            qemu_vfio_dump_mapping(&s->mappings[i]);
+        }
+    }
+}
+
+/**
+ * Find the mapping entry that contains [host, host + size) and set @index to
+ * the position.  If no entry contains it, @index is the position _after_ which
+ * to insert the new mapping.  IOW, it is the index of the largest element that
+ * is smaller than @host, or -1 if no entry is.
+ */
+static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host,
+                                           int *index)
+{
+    IOVAMapping *p = s->mappings;
+    IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL;
+    IOVAMapping *mid;
+    trace_qemu_vfio_find_mapping(s, host);
+    if (!p) {
+        *index = -1;
+        return NULL;
+    }
+    while (true) {
+        mid = p + (q - p) / 2;
+        if (mid == p) {
+            break;
+        }
+        if (mid->host > host) {
+            q = mid;
+        } else if (mid->host < host) {
+            p = mid;
+        } else {
+            break;
+        }
+    }
+    if (mid->host > host) {
+        mid--;
+    } else if (mid < &s->mappings[s->nr_mappings - 1]
+               && (mid + 1)->host <= host) {
+        mid++;
+    }
+    *index = mid - &s->mappings[0];
+    if (mid >= &s->mappings[0] &&
+        mid->host <= host && mid->host + mid->size > host) {
+        assert(mid < &s->mappings[s->nr_mappings]);
+        return mid;
+    }
+    /* At this point *index + 1 is the right position to insert the new
+     * mapping.*/
+    return NULL;
+}
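The @index contract above is easiest to see on a concrete case; an illustrative walk-through, with invented addresses:

/* Assuming s->mappings holds hosts {0x1000, 0x3000, 0x5000}, each 0x1000
 * bytes long, qemu_vfio_find_mapping() behaves as follows:
 *
 *   host 0x3080 -> returns the 0x3000 entry, *index == 1 (it contains host)
 *   host 0x4000 -> returns NULL, *index == 1  (insert at position 2)
 *   host 0x0800 -> returns NULL, *index == -1 (insert at position 0)
 */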
+
+/**
+ * Allocate IOVA and create a new mapping record and insert it in @s.
+ */
+static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s,
+                                          void *host, size_t size,
+                                          int index, uint64_t iova)
+{
+    int shift;
+    IOVAMapping m = {.host = host, .size = size, .iova = iova};
+    IOVAMapping *insert;
+
+    assert(QEMU_IS_ALIGNED(size, getpagesize()));
+    assert(QEMU_IS_ALIGNED(s->low_water_mark, getpagesize()));
+    assert(QEMU_IS_ALIGNED(s->high_water_mark, getpagesize()));
+    trace_qemu_vfio_new_mapping(s, host, size, index, iova);
+
+    assert(index >= 0);
+    s->nr_mappings++;
+    s->mappings = g_realloc_n(s->mappings, sizeof(s->mappings[0]),
+                              s->nr_mappings);
+    insert = &s->mappings[index];
+    shift = s->nr_mappings - index - 1;
+    if (shift) {
+        memmove(insert + 1, insert, shift * sizeof(s->mappings[0]));
+    }
+    *insert = m;
+    return insert;
+}
+
+/* Do the DMA mapping with VFIO. */
+static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
+                                uint64_t iova)
+{
+    struct vfio_iommu_type1_dma_map dma_map = {
+        .argsz = sizeof(dma_map),
+        .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
+        .iova = iova,
+        .vaddr = (uintptr_t)host,
+        .size = size,
+    };
+    trace_qemu_vfio_do_mapping(s, host, size, iova);
+
+    if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
+        error_report("VFIO_MAP_DMA: %d", -errno);
+        return -errno;
+    }
+    return 0;
+}
+
+/**
+ * Undo the DMA mapping from @s with VFIO, and remove from mapping list.
+ */
+static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping,
+                                   Error **errp)
+{
+    int index;
+    struct vfio_iommu_type1_dma_unmap unmap = {
+        .argsz = sizeof(unmap),
+        .flags = 0,
+        .iova = mapping->iova,
+        .size = mapping->size,
+    };
+
+    index = mapping - s->mappings;
+    assert(mapping->size > 0);
+    assert(QEMU_IS_ALIGNED(mapping->size, getpagesize()));
+    assert(index >= 0 && index < s->nr_mappings);
+    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
+        error_setg(errp, "VFIO_UNMAP_DMA failed: %d", -errno);
+    }
+    memmove(mapping, &s->mappings[index + 1],
+            sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
+    s->nr_mappings--;
+    s->mappings = g_realloc_n(s->mappings, sizeof(s->mappings[0]),
+                              s->nr_mappings);
+}
+
+/* Check if the mapping list is (ascending) ordered. */
+static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
+{
+    int i;
+    if (QEMU_VFIO_DEBUG) {
+        for (i = 0; i < s->nr_mappings - 1; ++i) {
+            if (!(s->mappings[i].host < s->mappings[i + 1].host)) {
+                fprintf(stderr, "item %d not sorted!\n", i);
+                qemu_vfio_dump_mappings(s);
+                return false;
+            }
+            if (!(s->mappings[i].host + s->mappings[i].size <=
+                  s->mappings[i + 1].host)) {
+                fprintf(stderr, "item %d overlap with next!\n", i);
+                qemu_vfio_dump_mappings(s);
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+/* Map [host, host + size) area into a contiguous IOVA address space, and store
+ * the result in @iova if not NULL.  The caller needs to make sure the area is
+ * aligned to page size, and mustn't overlap with existing mapping areas (split
+ * mapping status within this area is not allowed).
+ */
+int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
+                      bool temporary, uint64_t *iova)
+{
+    int ret = 0;
+    int index;
+    IOVAMapping *mapping;
+    uint64_t iova0;
+
+    assert(QEMU_PTR_IS_ALIGNED(host, getpagesize()));
+    assert(QEMU_IS_ALIGNED(size, getpagesize()));
+    trace_qemu_vfio_dma_map(s, host, size, temporary, iova);
+    qemu_mutex_lock(&s->lock);
+    mapping = qemu_vfio_find_mapping(s, host, &index);
+    if (mapping) {
+        iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host);
+    } else {
+        if (s->high_water_mark - s->low_water_mark + 1 < size) {
+            ret = -ENOMEM;
+            goto out;
+        }
+        if (!temporary) {
+            iova0 = s->low_water_mark;
+            mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
+            if (!mapping) {
+                ret = -ENOMEM;
+                goto out;
+            }
+            assert(qemu_vfio_verify_mappings(s));
+            ret = qemu_vfio_do_mapping(s, host, size, iova0);
+            if (ret) {
+                qemu_vfio_undo_mapping(s, mapping, NULL);
+                goto out;
+            }
+            s->low_water_mark += size;
+            qemu_vfio_dump_mappings(s);
+        } else {
+            iova0 = s->high_water_mark - size;
+            ret = qemu_vfio_do_mapping(s, host, size, iova0);
+            if (ret) {
+                goto out;
+            }
+            s->high_water_mark -= size;
+        }
+    }
+    if (iova) {
+        *iova = iova0;
+    }
+out:
+    qemu_mutex_unlock(&s->lock);
+    return ret;
+}
+
+/* Reset the high watermark and free all "temporary" mappings. */
+int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
+{
+    struct vfio_iommu_type1_dma_unmap unmap = {
+        .argsz = sizeof(unmap),
+        .flags = 0,
+        .iova = s->high_water_mark,
+        .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark,
+    };
+    trace_qemu_vfio_dma_reset_temporary(s);
+    qemu_mutex_lock(&s->lock);
+    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
+        error_report("VFIO_UNMAP_DMA: %d", -errno);
+        qemu_mutex_unlock(&s->lock);
+        return -errno;
+    }
+    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
+    qemu_mutex_unlock(&s->lock);
+    return 0;
+}
+
+/* Unmap the whole area that was previously mapped with
+ * qemu_vfio_dma_map(). */
+void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host)
+{
+    int index = 0;
+    IOVAMapping *m;
+
+    if (!host) {
+        return;
+    }
+
+    trace_qemu_vfio_dma_unmap(s, host);
+    qemu_mutex_lock(&s->lock);
+    m = qemu_vfio_find_mapping(s, host, &index);
+    if (!m) {
+        goto out;
+    }
+    qemu_vfio_undo_mapping(s, m, NULL);
+out:
+    qemu_mutex_unlock(&s->lock);
+}
+
+static void qemu_vfio_reset(QEMUVFIOState *s)
+{
+    ioctl(s->device, VFIO_DEVICE_RESET);
+}
+
+/* Close and free the VFIO resources. */
+void qemu_vfio_close(QEMUVFIOState *s)
+{
+    int i;
+
+    if (!s) {
+        return;
+    }
+    for (i = 0; i < s->nr_mappings; ++i) {
+        qemu_vfio_undo_mapping(s, &s->mappings[i], NULL);
+    }
+    ram_block_notifier_remove(&s->ram_notifier);
+    qemu_vfio_reset(s);
+    close(s->device);
+    close(s->group);
+    close(s->container);
+}
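Taken together, the public entry points of this new file support the pattern the NVMe driver relies on: open the device, establish DMA mappings, and tear everything down. An end-to-end hedged sketch (the PCI address is an example, error handling is minimal, and the demo function is not part of the patch):

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu/vfio-helpers.h"

/* Illustrative only: open a device, map one page for DMA, clean up. */
static void vfio_helpers_demo(Error **errp)
{
    uint64_t iova;
    void *buf;
    QEMUVFIOState *s = qemu_vfio_open_pci("0000:00:01.0", errp);

    if (!s) {
        return;
    }
    buf = qemu_memalign(getpagesize(), getpagesize());
    /* false = permanent mapping; it stays until qemu_vfio_dma_unmap(). */
    if (qemu_vfio_dma_map(s, buf, getpagesize(), false, &iova) == 0) {
        /* ... hand 'iova' to the device and perform DMA into buf ... */
        qemu_vfio_dma_unmap(s, buf);
    }
    qemu_vfree(buf);
    qemu_vfio_close(s);
}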