diff options
50 files changed, 3139 insertions, 1868 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 585cd5abd7..1444b26dc0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -508,7 +508,6 @@ M: Shannon Zhao <shannon.zhao@linaro.org> L: qemu-arm@nongnu.org S: Maintained F: hw/arm/virt-acpi-build.c -F: include/hw/arm/virt-acpi-build.h STM32F205 M: Alistair Francis <alistair@alistair23.me> @@ -885,7 +884,6 @@ F: hw/acpi/* F: hw/smbios/* F: hw/i386/acpi-build.[hc] F: hw/arm/virt-acpi-build.c -F: include/hw/arm/virt-acpi-build.h ppc4xx M: Alexander Graf <agraf@suse.de> @@ -1720,9 +1718,9 @@ L: qemu-block@nongnu.org S: Supported F: block/linux-aio.c F: include/block/raw-aio.h -F: block/raw-posix.c -F: block/raw-win32.c -F: block/raw_bsd.c +F: block/raw-format.c +F: block/file-posix.c +F: block/file-win32.c F: block/win32-aio.c qcow2 @@ -149,6 +149,7 @@ dummy := $(call unnest-vars,, \ qga-obj-y \ ivshmem-client-obj-y \ ivshmem-server-obj-y \ + libvhost-user-obj-y \ qga-vss-dll-obj-y \ block-obj-y \ block-obj-m \ diff --git a/Makefile.objs b/Makefile.objs index 51c36a4d54..01cef866e4 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -115,7 +115,7 @@ qga-vss-dll-obj-y = qga/ # contrib ivshmem-client-obj-y = contrib/ivshmem-client/ ivshmem-server-obj-y = contrib/ivshmem-server/ - +libvhost-user-obj-y = contrib/libvhost-user/ ###################################################################### trace-events-y = trace-events diff --git a/block/Makefile.objs b/block/Makefile.objs index 67a036a1df..0b8fd06f27 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -1,4 +1,4 @@ -block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o bochs.o vpc.o vvfat.o dmg.o +block-obj-y += raw-format.o qcow.o vdi.o vmdk.o cloop.o bochs.o vpc.o vvfat.o dmg.o block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o block-obj-y += qed-check.o @@ -6,8 +6,8 @@ block-obj-y += vhdx.o vhdx-endian.o vhdx-log.o block-obj-y += quorum.o block-obj-y += parallels.o blkdebug.o blkverify.o blkreplay.o block-obj-y += block-backend.o snapshot.o qapi.o -block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o -block-obj-$(CONFIG_POSIX) += raw-posix.o +block-obj-$(CONFIG_WIN32) += file-win32.o win32-aio.o +block-obj-$(CONFIG_POSIX) += file-posix.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o block-obj-y += null.o mirror.o commit.o io.o block-obj-y += throttle-groups.o diff --git a/block/blkdebug.c b/block/blkdebug.c index 4127571454..acccf85666 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -58,10 +58,6 @@ typedef struct BlkdebugSuspendedReq { QLIST_ENTRY(BlkdebugSuspendedReq) next; } BlkdebugSuspendedReq; -static const AIOCBInfo blkdebug_aiocb_info = { - .aiocb_size = sizeof(BlkdebugAIOCB), -}; - enum { ACTION_INJECT_ERROR, ACTION_SET_STATE, @@ -77,7 +73,7 @@ typedef struct BlkdebugRule { int error; int immediately; int once; - int64_t sector; + int64_t offset; } inject; struct { int new_state; @@ -174,6 +170,7 @@ static int add_rule(void *opaque, QemuOpts *opts, Error **errp) const char* event_name; BlkdebugEvent event; struct BlkdebugRule *rule; + int64_t sector; /* Find the right event for the rule */ event_name = qemu_opt_get(opts, "event"); @@ -200,7 +197,9 @@ static int add_rule(void *opaque, QemuOpts *opts, Error **errp) rule->options.inject.once = qemu_opt_get_bool(opts, "once", 0); rule->options.inject.immediately = qemu_opt_get_bool(opts, "immediately", 0); - rule->options.inject.sector = qemu_opt_get_number(opts, "sector", -1); + sector = qemu_opt_get_number(opts, "sector", -1); + rule->options.inject.offset = + sector == -1 ? -1 : sector * BDRV_SECTOR_SIZE; break; case ACTION_SET_STATE: @@ -408,17 +407,14 @@ out: static void error_callback_bh(void *opaque) { - struct BlkdebugAIOCB *acb = opaque; - acb->common.cb(acb->common.opaque, acb->ret); - qemu_aio_unref(acb); + Coroutine *co = opaque; + qemu_coroutine_enter(co); } -static BlockAIOCB *inject_error(BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque, BlkdebugRule *rule) +static int inject_error(BlockDriverState *bs, BlkdebugRule *rule) { BDRVBlkdebugState *s = bs->opaque; int error = rule->options.inject.error; - struct BlkdebugAIOCB *acb; bool immediately = rule->options.inject.immediately; if (rule->options.inject.once) { @@ -426,81 +422,79 @@ static BlockAIOCB *inject_error(BlockDriverState *bs, remove_rule(rule); } - if (immediately) { - return NULL; + if (!immediately) { + aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), error_callback_bh, + qemu_coroutine_self()); + qemu_coroutine_yield(); } - acb = qemu_aio_get(&blkdebug_aiocb_info, bs, cb, opaque); - acb->ret = -error; - - aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), error_callback_bh, acb); - - return &acb->common; + return -error; } -static BlockAIOCB *blkdebug_aio_readv(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +static int coroutine_fn +blkdebug_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVBlkdebugState *s = bs->opaque; BlkdebugRule *rule = NULL; QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) { - if (rule->options.inject.sector == -1 || - (rule->options.inject.sector >= sector_num && - rule->options.inject.sector < sector_num + nb_sectors)) { + uint64_t inject_offset = rule->options.inject.offset; + + if (inject_offset == -1 || + (inject_offset >= offset && inject_offset < offset + bytes)) + { break; } } if (rule && rule->options.inject.error) { - return inject_error(bs, cb, opaque, rule); + return inject_error(bs, rule); } - return bdrv_aio_readv(bs->file, sector_num, qiov, nb_sectors, - cb, opaque); + return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags); } -static BlockAIOCB *blkdebug_aio_writev(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +static int coroutine_fn +blkdebug_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { BDRVBlkdebugState *s = bs->opaque; BlkdebugRule *rule = NULL; QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) { - if (rule->options.inject.sector == -1 || - (rule->options.inject.sector >= sector_num && - rule->options.inject.sector < sector_num + nb_sectors)) { + uint64_t inject_offset = rule->options.inject.offset; + + if (inject_offset == -1 || + (inject_offset >= offset && inject_offset < offset + bytes)) + { break; } } if (rule && rule->options.inject.error) { - return inject_error(bs, cb, opaque, rule); + return inject_error(bs, rule); } - return bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, - cb, opaque); + return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); } -static BlockAIOCB *blkdebug_aio_flush(BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque) +static int blkdebug_co_flush(BlockDriverState *bs) { BDRVBlkdebugState *s = bs->opaque; BlkdebugRule *rule = NULL; QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) { - if (rule->options.inject.sector == -1) { + if (rule->options.inject.offset == -1) { break; } } if (rule && rule->options.inject.error) { - return inject_error(bs, cb, opaque, rule); + return inject_error(bs, rule); } - return bdrv_aio_flush(bs->file->bs, cb, opaque); + return bdrv_co_flush(bs->file->bs); } @@ -752,9 +746,9 @@ static BlockDriver bdrv_blkdebug = { .bdrv_refresh_filename = blkdebug_refresh_filename, .bdrv_refresh_limits = blkdebug_refresh_limits, - .bdrv_aio_readv = blkdebug_aio_readv, - .bdrv_aio_writev = blkdebug_aio_writev, - .bdrv_aio_flush = blkdebug_aio_flush, + .bdrv_co_preadv = blkdebug_co_preadv, + .bdrv_co_pwritev = blkdebug_co_pwritev, + .bdrv_co_flush_to_disk = blkdebug_co_flush, .bdrv_debug_event = blkdebug_debug_event, .bdrv_debug_breakpoint = blkdebug_debug_breakpoint, diff --git a/block/blkverify.c b/block/blkverify.c index 28f9af6dba..43a940c2f5 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -19,38 +19,36 @@ typedef struct { BdrvChild *test_file; } BDRVBlkverifyState; -typedef struct BlkverifyAIOCB BlkverifyAIOCB; -struct BlkverifyAIOCB { - BlockAIOCB common; +typedef struct BlkverifyRequest { + Coroutine *co; + BlockDriverState *bs; /* Request metadata */ bool is_write; - int64_t sector_num; - int nb_sectors; + uint64_t offset; + uint64_t bytes; + int flags; - int ret; /* first completed request's result */ - unsigned int done; /* completion counter */ + int (*request_fn)(BdrvChild *, int64_t, unsigned int, QEMUIOVector *, + BdrvRequestFlags); - QEMUIOVector *qiov; /* user I/O vector */ - QEMUIOVector raw_qiov; /* cloned I/O vector for raw file */ - void *buf; /* buffer for raw file I/O */ + int ret; /* test image result */ + int raw_ret; /* raw image result */ - void (*verify)(BlkverifyAIOCB *acb); -}; + unsigned int done; /* completion counter */ -static const AIOCBInfo blkverify_aiocb_info = { - .aiocb_size = sizeof(BlkverifyAIOCB), -}; + QEMUIOVector *qiov; /* user I/O vector */ + QEMUIOVector *raw_qiov; /* cloned I/O vector for raw file */ +} BlkverifyRequest; -static void GCC_FMT_ATTR(2, 3) blkverify_err(BlkverifyAIOCB *acb, +static void GCC_FMT_ATTR(2, 3) blkverify_err(BlkverifyRequest *r, const char *fmt, ...) { va_list ap; va_start(ap, fmt); - fprintf(stderr, "blkverify: %s sector_num=%" PRId64 " nb_sectors=%d ", - acb->is_write ? "write" : "read", acb->sector_num, - acb->nb_sectors); + fprintf(stderr, "blkverify: %s offset=%" PRId64 " bytes=%" PRId64 " ", + r->is_write ? "write" : "read", r->offset, r->bytes); vfprintf(stderr, fmt, ap); fprintf(stderr, "\n"); va_end(ap); @@ -166,113 +164,106 @@ static int64_t blkverify_getlength(BlockDriverState *bs) return bdrv_getlength(s->test_file->bs); } -static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write, - int64_t sector_num, QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) +static void coroutine_fn blkverify_do_test_req(void *opaque) { - BlkverifyAIOCB *acb = qemu_aio_get(&blkverify_aiocb_info, bs, cb, opaque); - - acb->is_write = is_write; - acb->sector_num = sector_num; - acb->nb_sectors = nb_sectors; - acb->ret = -EINPROGRESS; - acb->done = 0; - acb->qiov = qiov; - acb->buf = NULL; - acb->verify = NULL; - return acb; + BlkverifyRequest *r = opaque; + BDRVBlkverifyState *s = r->bs->opaque; + + r->ret = r->request_fn(s->test_file, r->offset, r->bytes, r->qiov, + r->flags); + r->done++; + qemu_coroutine_enter_if_inactive(r->co); } -static void blkverify_aio_bh(void *opaque) +static void coroutine_fn blkverify_do_raw_req(void *opaque) { - BlkverifyAIOCB *acb = opaque; + BlkverifyRequest *r = opaque; - if (acb->buf) { - qemu_iovec_destroy(&acb->raw_qiov); - qemu_vfree(acb->buf); - } - acb->common.cb(acb->common.opaque, acb->ret); - qemu_aio_unref(acb); + r->raw_ret = r->request_fn(r->bs->file, r->offset, r->bytes, r->raw_qiov, + r->flags); + r->done++; + qemu_coroutine_enter_if_inactive(r->co); } -static void blkverify_aio_cb(void *opaque, int ret) +static int coroutine_fn +blkverify_co_prwv(BlockDriverState *bs, BlkverifyRequest *r, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, QEMUIOVector *raw_qiov, + int flags, bool is_write) { - BlkverifyAIOCB *acb = opaque; - - switch (++acb->done) { - case 1: - acb->ret = ret; - break; - - case 2: - if (acb->ret != ret) { - blkverify_err(acb, "return value mismatch %d != %d", acb->ret, ret); - } - - if (acb->verify) { - acb->verify(acb); - } + Coroutine *co_a, *co_b; + + *r = (BlkverifyRequest) { + .co = qemu_coroutine_self(), + .bs = bs, + .offset = offset, + .bytes = bytes, + .qiov = qiov, + .raw_qiov = raw_qiov, + .flags = flags, + .is_write = is_write, + .request_fn = is_write ? bdrv_co_pwritev : bdrv_co_preadv, + }; + + co_a = qemu_coroutine_create(blkverify_do_test_req, r); + co_b = qemu_coroutine_create(blkverify_do_raw_req, r); + + qemu_coroutine_enter(co_a); + qemu_coroutine_enter(co_b); + + while (r->done < 2) { + qemu_coroutine_yield(); + } - aio_bh_schedule_oneshot(bdrv_get_aio_context(acb->common.bs), - blkverify_aio_bh, acb); - break; + if (r->ret != r->raw_ret) { + blkverify_err(r, "return value mismatch %d != %d", r->ret, r->raw_ret); } + + return r->ret; } -static void blkverify_verify_readv(BlkverifyAIOCB *acb) +static int coroutine_fn +blkverify_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { - ssize_t offset = qemu_iovec_compare(acb->qiov, &acb->raw_qiov); - if (offset != -1) { - blkverify_err(acb, "contents mismatch in sector %" PRId64, - acb->sector_num + (int64_t)(offset / BDRV_SECTOR_SIZE)); + BlkverifyRequest r; + QEMUIOVector raw_qiov; + void *buf; + ssize_t cmp_offset; + int ret; + + buf = qemu_blockalign(bs->file->bs, qiov->size); + qemu_iovec_init(&raw_qiov, qiov->niov); + qemu_iovec_clone(&raw_qiov, qiov, buf); + + ret = blkverify_co_prwv(bs, &r, offset, bytes, qiov, &raw_qiov, flags, + false); + + cmp_offset = qemu_iovec_compare(qiov, &raw_qiov); + if (cmp_offset != -1) { + blkverify_err(&r, "contents mismatch at offset %" PRId64, + offset + cmp_offset); } -} -static BlockAIOCB *blkverify_aio_readv(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - BDRVBlkverifyState *s = bs->opaque; - BlkverifyAIOCB *acb = blkverify_aio_get(bs, false, sector_num, qiov, - nb_sectors, cb, opaque); - - acb->verify = blkverify_verify_readv; - acb->buf = qemu_blockalign(bs->file->bs, qiov->size); - qemu_iovec_init(&acb->raw_qiov, acb->qiov->niov); - qemu_iovec_clone(&acb->raw_qiov, qiov, acb->buf); - - bdrv_aio_readv(s->test_file, sector_num, qiov, nb_sectors, - blkverify_aio_cb, acb); - bdrv_aio_readv(bs->file, sector_num, &acb->raw_qiov, nb_sectors, - blkverify_aio_cb, acb); - return &acb->common; + qemu_iovec_destroy(&raw_qiov); + qemu_vfree(buf); + + return ret; } -static BlockAIOCB *blkverify_aio_writev(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) +static int coroutine_fn +blkverify_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) { - BDRVBlkverifyState *s = bs->opaque; - BlkverifyAIOCB *acb = blkverify_aio_get(bs, true, sector_num, qiov, - nb_sectors, cb, opaque); - - bdrv_aio_writev(s->test_file, sector_num, qiov, nb_sectors, - blkverify_aio_cb, acb); - bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, - blkverify_aio_cb, acb); - return &acb->common; + BlkverifyRequest r; + return blkverify_co_prwv(bs, &r, offset, bytes, qiov, qiov, flags, true); } -static BlockAIOCB *blkverify_aio_flush(BlockDriverState *bs, - BlockCompletionFunc *cb, - void *opaque) +static int blkverify_co_flush(BlockDriverState *bs) { BDRVBlkverifyState *s = bs->opaque; /* Only flush test file, the raw file is not important */ - return bdrv_aio_flush(s->test_file->bs, cb, opaque); + return bdrv_co_flush(s->test_file->bs); } static bool blkverify_recurse_is_first_non_filter(BlockDriverState *bs, @@ -332,9 +323,9 @@ static BlockDriver bdrv_blkverify = { .bdrv_getlength = blkverify_getlength, .bdrv_refresh_filename = blkverify_refresh_filename, - .bdrv_aio_readv = blkverify_aio_readv, - .bdrv_aio_writev = blkverify_aio_writev, - .bdrv_aio_flush = blkverify_aio_flush, + .bdrv_co_preadv = blkverify_co_preadv, + .bdrv_co_pwritev = blkverify_co_pwritev, + .bdrv_co_flush = blkverify_co_flush, .is_filter = true, .bdrv_recurse_is_first_non_filter = blkverify_recurse_is_first_non_filter, diff --git a/block/raw-posix.c b/block/file-posix.c index 28b47d977b..28b47d977b 100644 --- a/block/raw-posix.c +++ b/block/file-posix.c diff --git a/block/raw-win32.c b/block/file-win32.c index 800fabdd72..800fabdd72 100644 --- a/block/raw-win32.c +++ b/block/file-win32.c diff --git a/block/gluster.c b/block/gluster.c index a0a74e49fd..1a22f2982d 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -1253,7 +1253,7 @@ static int qemu_gluster_has_zero_init(BlockDriverState *bs) * If @start is in a trailing hole or beyond EOF, return -ENXIO. * If we can't find out, return a negative errno other than -ENXIO. * - * (Shamefully copied from raw-posix.c, only miniscule adaptions.) + * (Shamefully copied from file-posix.c, only miniscule adaptions.) */ static int find_allocation(BlockDriverState *bs, off_t start, off_t *data, off_t *hole) @@ -1349,7 +1349,7 @@ exit: * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes * beyond the end of the disk image it will be clamped. * - * (Based on raw_co_get_block_status() from raw-posix.c.) + * (Based on raw_co_get_block_status() from file-posix.c.) */ static int64_t coroutine_fn qemu_gluster_co_get_block_status( BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum, diff --git a/block/quorum.c b/block/quorum.c index d122299352..86e2072dce 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -97,7 +97,7 @@ typedef struct QuorumAIOCB QuorumAIOCB; * $children_count QuorumChildRequest. */ typedef struct QuorumChildRequest { - BlockAIOCB *aiocb; + BlockDriverState *bs; QEMUIOVector qiov; uint8_t *buf; int ret; @@ -110,11 +110,12 @@ typedef struct QuorumChildRequest { * used to do operations on each children and track overall progress. */ struct QuorumAIOCB { - BlockAIOCB common; + BlockDriverState *bs; + Coroutine *co; /* Request metadata */ - uint64_t sector_num; - int nb_sectors; + uint64_t offset; + uint64_t bytes; QEMUIOVector *qiov; /* calling IOV */ @@ -133,32 +134,15 @@ struct QuorumAIOCB { int children_read; /* how many children have been read from */ }; -static bool quorum_vote(QuorumAIOCB *acb); - -static void quorum_aio_cancel(BlockAIOCB *blockacb) -{ - QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common); - BDRVQuorumState *s = acb->common.bs->opaque; - int i; - - /* cancel all callbacks */ - for (i = 0; i < s->num_children; i++) { - if (acb->qcrs[i].aiocb) { - bdrv_aio_cancel_async(acb->qcrs[i].aiocb); - } - } -} - -static AIOCBInfo quorum_aiocb_info = { - .aiocb_size = sizeof(QuorumAIOCB), - .cancel_async = quorum_aio_cancel, -}; +typedef struct QuorumCo { + QuorumAIOCB *acb; + int idx; +} QuorumCo; static void quorum_aio_finalize(QuorumAIOCB *acb) { - acb->common.cb(acb->common.opaque, acb->vote_ret); g_free(acb->qcrs); - qemu_aio_unref(acb); + g_free(acb); } static bool quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *b) @@ -171,30 +155,26 @@ static bool quorum_64bits_compare(QuorumVoteValue *a, QuorumVoteValue *b) return a->l == b->l; } -static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, - BlockDriverState *bs, +static QuorumAIOCB *quorum_aio_get(BlockDriverState *bs, QEMUIOVector *qiov, - uint64_t sector_num, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) + uint64_t offset, + uint64_t bytes) { - QuorumAIOCB *acb = qemu_aio_get(&quorum_aiocb_info, bs, cb, opaque); + BDRVQuorumState *s = bs->opaque; + QuorumAIOCB *acb = g_new(QuorumAIOCB, 1); int i; - acb->common.bs->opaque = s; - acb->sector_num = sector_num; - acb->nb_sectors = nb_sectors; - acb->qiov = qiov; - acb->qcrs = g_new0(QuorumChildRequest, s->num_children); - acb->count = 0; - acb->success_count = 0; - acb->rewrite_count = 0; - acb->votes.compare = quorum_sha256_compare; - QLIST_INIT(&acb->votes.vote_list); - acb->is_read = false; - acb->vote_ret = 0; + *acb = (QuorumAIOCB) { + .co = qemu_coroutine_self(), + .bs = bs, + .offset = offset, + .bytes = bytes, + .qiov = qiov, + .votes.compare = quorum_sha256_compare, + .votes.vote_list = QLIST_HEAD_INITIALIZER(acb.votes.vote_list), + }; + acb->qcrs = g_new0(QuorumChildRequest, s->num_children); for (i = 0; i < s->num_children; i++) { acb->qcrs[i].buf = NULL; acb->qcrs[i].ret = 0; @@ -204,30 +184,37 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, return acb; } -static void quorum_report_bad(QuorumOpType type, uint64_t sector_num, - int nb_sectors, char *node_name, int ret) +static void quorum_report_bad(QuorumOpType type, uint64_t offset, + uint64_t bytes, char *node_name, int ret) { const char *msg = NULL; + int64_t start_sector = offset / BDRV_SECTOR_SIZE; + int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE); + if (ret < 0) { msg = strerror(-ret); } - qapi_event_send_quorum_report_bad(type, !!msg, msg, node_name, - sector_num, nb_sectors, &error_abort); + qapi_event_send_quorum_report_bad(type, !!msg, msg, node_name, start_sector, + end_sector - start_sector, &error_abort); } static void quorum_report_failure(QuorumAIOCB *acb) { - const char *reference = bdrv_get_device_or_node_name(acb->common.bs); - qapi_event_send_quorum_failure(reference, acb->sector_num, - acb->nb_sectors, &error_abort); + const char *reference = bdrv_get_device_or_node_name(acb->bs); + int64_t start_sector = acb->offset / BDRV_SECTOR_SIZE; + int64_t end_sector = DIV_ROUND_UP(acb->offset + acb->bytes, + BDRV_SECTOR_SIZE); + + qapi_event_send_quorum_failure(reference, start_sector, + end_sector - start_sector, &error_abort); } static int quorum_vote_error(QuorumAIOCB *acb); static bool quorum_has_too_much_io_failed(QuorumAIOCB *acb) { - BDRVQuorumState *s = acb->common.bs->opaque; + BDRVQuorumState *s = acb->bs->opaque; if (acb->success_count < s->threshold) { acb->vote_ret = quorum_vote_error(acb); @@ -238,22 +225,7 @@ static bool quorum_has_too_much_io_failed(QuorumAIOCB *acb) return false; } -static void quorum_rewrite_aio_cb(void *opaque, int ret) -{ - QuorumAIOCB *acb = opaque; - - /* one less rewrite to do */ - acb->rewrite_count--; - - /* wait until all rewrite callbacks have completed */ - if (acb->rewrite_count) { - return; - } - - quorum_aio_finalize(acb); -} - -static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb); +static int read_fifo_child(QuorumAIOCB *acb); static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) { @@ -272,70 +244,7 @@ static void quorum_report_bad_acb(QuorumChildRequest *sacb, int ret) { QuorumAIOCB *acb = sacb->parent; QuorumOpType type = acb->is_read ? QUORUM_OP_TYPE_READ : QUORUM_OP_TYPE_WRITE; - quorum_report_bad(type, acb->sector_num, acb->nb_sectors, - sacb->aiocb->bs->node_name, ret); -} - -static void quorum_fifo_aio_cb(void *opaque, int ret) -{ - QuorumChildRequest *sacb = opaque; - QuorumAIOCB *acb = sacb->parent; - BDRVQuorumState *s = acb->common.bs->opaque; - - assert(acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO); - - if (ret < 0) { - quorum_report_bad_acb(sacb, ret); - - /* We try to read next child in FIFO order if we fail to read */ - if (acb->children_read < s->num_children) { - read_fifo_child(acb); - return; - } - } - - acb->vote_ret = ret; - - /* FIXME: rewrite failed children if acb->children_read > 1? */ - quorum_aio_finalize(acb); -} - -static void quorum_aio_cb(void *opaque, int ret) -{ - QuorumChildRequest *sacb = opaque; - QuorumAIOCB *acb = sacb->parent; - BDRVQuorumState *s = acb->common.bs->opaque; - bool rewrite = false; - int i; - - sacb->ret = ret; - if (ret == 0) { - acb->success_count++; - } else { - quorum_report_bad_acb(sacb, ret); - } - acb->count++; - assert(acb->count <= s->num_children); - assert(acb->success_count <= s->num_children); - if (acb->count < s->num_children) { - return; - } - - /* Do the vote on read */ - if (acb->is_read) { - rewrite = quorum_vote(acb); - for (i = 0; i < s->num_children; i++) { - qemu_vfree(acb->qcrs[i].buf); - qemu_iovec_destroy(&acb->qcrs[i].qiov); - } - } else { - quorum_has_too_much_io_failed(acb); - } - - /* if no rewrite is done the code will finish right away */ - if (!rewrite) { - quorum_aio_finalize(acb); - } + quorum_report_bad(type, acb->offset, acb->bytes, sacb->bs->node_name, ret); } static void quorum_report_bad_versions(BDRVQuorumState *s, @@ -350,14 +259,31 @@ static void quorum_report_bad_versions(BDRVQuorumState *s, continue; } QLIST_FOREACH(item, &version->items, next) { - quorum_report_bad(QUORUM_OP_TYPE_READ, acb->sector_num, - acb->nb_sectors, + quorum_report_bad(QUORUM_OP_TYPE_READ, acb->offset, acb->bytes, s->children[item->index]->bs->node_name, 0); } } } -static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb, +static void quorum_rewrite_entry(void *opaque) +{ + QuorumCo *co = opaque; + QuorumAIOCB *acb = co->acb; + BDRVQuorumState *s = acb->bs->opaque; + + /* Ignore any errors, it's just a correction attempt for already + * corrupted data. */ + bdrv_co_pwritev(s->children[co->idx], acb->offset, acb->bytes, + acb->qiov, 0); + + /* Wake up the caller after the last rewrite */ + acb->rewrite_count--; + if (!acb->rewrite_count) { + qemu_coroutine_enter_if_inactive(acb->co); + } +} + +static bool quorum_rewrite_bad_versions(QuorumAIOCB *acb, QuorumVoteValue *value) { QuorumVoteVersion *version; @@ -376,7 +302,7 @@ static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb, } } - /* quorum_rewrite_aio_cb will count down this to zero */ + /* quorum_rewrite_entry will count down this to zero */ acb->rewrite_count = count; /* now fire the correcting rewrites */ @@ -385,9 +311,14 @@ static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb, continue; } QLIST_FOREACH(item, &version->items, next) { - bdrv_aio_writev(s->children[item->index], acb->sector_num, - acb->qiov, acb->nb_sectors, quorum_rewrite_aio_cb, - acb); + Coroutine *co; + QuorumCo data = { + .acb = acb, + .idx = item->index, + }; + + co = qemu_coroutine_create(quorum_rewrite_entry, &data); + qemu_coroutine_enter(co); } } @@ -507,8 +438,8 @@ static void GCC_FMT_ATTR(2, 3) quorum_err(QuorumAIOCB *acb, va_list ap; va_start(ap, fmt); - fprintf(stderr, "quorum: sector_num=%" PRId64 " nb_sectors=%d ", - acb->sector_num, acb->nb_sectors); + fprintf(stderr, "quorum: offset=%" PRIu64 " bytes=%" PRIu64 " ", + acb->offset, acb->bytes); vfprintf(stderr, fmt, ap); fprintf(stderr, "\n"); va_end(ap); @@ -519,16 +450,15 @@ static bool quorum_compare(QuorumAIOCB *acb, QEMUIOVector *a, QEMUIOVector *b) { - BDRVQuorumState *s = acb->common.bs->opaque; + BDRVQuorumState *s = acb->bs->opaque; ssize_t offset; /* This driver will replace blkverify in this particular case */ if (s->is_blkverify) { offset = qemu_iovec_compare(a, b); if (offset != -1) { - quorum_err(acb, "contents mismatch in sector %" PRId64, - acb->sector_num + - (uint64_t)(offset / BDRV_SECTOR_SIZE)); + quorum_err(acb, "contents mismatch at offset %" PRIu64, + acb->offset + offset); } return true; } @@ -539,7 +469,7 @@ static bool quorum_compare(QuorumAIOCB *acb, /* Do a vote to get the error code */ static int quorum_vote_error(QuorumAIOCB *acb) { - BDRVQuorumState *s = acb->common.bs->opaque; + BDRVQuorumState *s = acb->bs->opaque; QuorumVoteVersion *winner = NULL; QuorumVotes error_votes; QuorumVoteValue result_value; @@ -568,17 +498,16 @@ static int quorum_vote_error(QuorumAIOCB *acb) return ret; } -static bool quorum_vote(QuorumAIOCB *acb) +static void quorum_vote(QuorumAIOCB *acb) { bool quorum = true; - bool rewrite = false; int i, j, ret; QuorumVoteValue hash; - BDRVQuorumState *s = acb->common.bs->opaque; + BDRVQuorumState *s = acb->bs->opaque; QuorumVoteVersion *winner; if (quorum_has_too_much_io_failed(acb)) { - return false; + return; } /* get the index of the first successful read */ @@ -606,7 +535,7 @@ static bool quorum_vote(QuorumAIOCB *acb) /* Every successful read agrees */ if (quorum) { quorum_copy_qiov(acb->qiov, &acb->qcrs[i].qiov); - return false; + return; } /* compute hashes for each successful read, also store indexes */ @@ -641,19 +570,46 @@ static bool quorum_vote(QuorumAIOCB *acb) /* corruption correction is enabled */ if (s->rewrite_corrupted) { - rewrite = quorum_rewrite_bad_versions(s, acb, &winner->value); + quorum_rewrite_bad_versions(acb, &winner->value); } free_exit: /* free lists */ quorum_free_vote_list(&acb->votes); - return rewrite; } -static BlockAIOCB *read_quorum_children(QuorumAIOCB *acb) +static void read_quorum_children_entry(void *opaque) { - BDRVQuorumState *s = acb->common.bs->opaque; - int i; + QuorumCo *co = opaque; + QuorumAIOCB *acb = co->acb; + BDRVQuorumState *s = acb->bs->opaque; + int i = co->idx; + QuorumChildRequest *sacb = &acb->qcrs[i]; + + sacb->bs = s->children[i]->bs; + sacb->ret = bdrv_co_preadv(s->children[i], acb->offset, acb->bytes, + &acb->qcrs[i].qiov, 0); + + if (sacb->ret == 0) { + acb->success_count++; + } else { + quorum_report_bad_acb(sacb, sacb->ret); + } + + acb->count++; + assert(acb->count <= s->num_children); + assert(acb->success_count <= s->num_children); + + /* Wake up the caller after the last read */ + if (acb->count == s->num_children) { + qemu_coroutine_enter_if_inactive(acb->co); + } +} + +static int read_quorum_children(QuorumAIOCB *acb) +{ + BDRVQuorumState *s = acb->bs->opaque; + int i, ret; acb->children_read = s->num_children; for (i = 0; i < s->num_children; i++) { @@ -663,65 +619,131 @@ static BlockAIOCB *read_quorum_children(QuorumAIOCB *acb) } for (i = 0; i < s->num_children; i++) { - acb->qcrs[i].aiocb = bdrv_aio_readv(s->children[i], acb->sector_num, - &acb->qcrs[i].qiov, acb->nb_sectors, - quorum_aio_cb, &acb->qcrs[i]); + Coroutine *co; + QuorumCo data = { + .acb = acb, + .idx = i, + }; + + co = qemu_coroutine_create(read_quorum_children_entry, &data); + qemu_coroutine_enter(co); } - return &acb->common; + while (acb->count < s->num_children) { + qemu_coroutine_yield(); + } + + /* Do the vote on read */ + quorum_vote(acb); + for (i = 0; i < s->num_children; i++) { + qemu_vfree(acb->qcrs[i].buf); + qemu_iovec_destroy(&acb->qcrs[i].qiov); + } + + while (acb->rewrite_count) { + qemu_coroutine_yield(); + } + + ret = acb->vote_ret; + + return ret; } -static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb) +static int read_fifo_child(QuorumAIOCB *acb) { - BDRVQuorumState *s = acb->common.bs->opaque; - int n = acb->children_read++; + BDRVQuorumState *s = acb->bs->opaque; + int n, ret; + + /* We try to read the next child in FIFO order if we failed to read */ + do { + n = acb->children_read++; + acb->qcrs[n].bs = s->children[n]->bs; + ret = bdrv_co_preadv(s->children[n], acb->offset, acb->bytes, + acb->qiov, 0); + if (ret < 0) { + quorum_report_bad_acb(&acb->qcrs[n], ret); + } + } while (ret < 0 && acb->children_read < s->num_children); - acb->qcrs[n].aiocb = bdrv_aio_readv(s->children[n], acb->sector_num, - acb->qiov, acb->nb_sectors, - quorum_fifo_aio_cb, &acb->qcrs[n]); + /* FIXME: rewrite failed children if acb->children_read > 1? */ - return &acb->common; + return ret; } -static BlockAIOCB *quorum_aio_readv(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) +static int quorum_co_preadv(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, int flags) { BDRVQuorumState *s = bs->opaque; - QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, - nb_sectors, cb, opaque); + QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes); + int ret; + acb->is_read = true; acb->children_read = 0; if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) { - return read_quorum_children(acb); + ret = read_quorum_children(acb); + } else { + ret = read_fifo_child(acb); + } + quorum_aio_finalize(acb); + + return ret; +} + +static void write_quorum_entry(void *opaque) +{ + QuorumCo *co = opaque; + QuorumAIOCB *acb = co->acb; + BDRVQuorumState *s = acb->bs->opaque; + int i = co->idx; + QuorumChildRequest *sacb = &acb->qcrs[i]; + + sacb->bs = s->children[i]->bs; + sacb->ret = bdrv_co_pwritev(s->children[i], acb->offset, acb->bytes, + acb->qiov, 0); + if (sacb->ret == 0) { + acb->success_count++; + } else { + quorum_report_bad_acb(sacb, sacb->ret); } + acb->count++; + assert(acb->count <= s->num_children); + assert(acb->success_count <= s->num_children); - return read_fifo_child(acb); + /* Wake up the caller after the last write */ + if (acb->count == s->num_children) { + qemu_coroutine_enter_if_inactive(acb->co); + } } -static BlockAIOCB *quorum_aio_writev(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) +static int quorum_co_pwritev(BlockDriverState *bs, uint64_t offset, + uint64_t bytes, QEMUIOVector *qiov, int flags) { BDRVQuorumState *s = bs->opaque; - QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, nb_sectors, - cb, opaque); - int i; + QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes); + int i, ret; for (i = 0; i < s->num_children; i++) { - acb->qcrs[i].aiocb = bdrv_aio_writev(s->children[i], sector_num, - qiov, nb_sectors, &quorum_aio_cb, - &acb->qcrs[i]); + Coroutine *co; + QuorumCo data = { + .acb = acb, + .idx = i, + }; + + co = qemu_coroutine_create(write_quorum_entry, &data); + qemu_coroutine_enter(co); + } + + while (acb->count < s->num_children) { + qemu_coroutine_yield(); } - return &acb->common; + quorum_has_too_much_io_failed(acb); + + ret = acb->vote_ret; + quorum_aio_finalize(acb); + + return ret; } static int64_t quorum_getlength(BlockDriverState *bs) @@ -765,7 +787,7 @@ static coroutine_fn int quorum_co_flush(BlockDriverState *bs) result = bdrv_co_flush(s->children[i]->bs); if (result) { quorum_report_bad(QUORUM_OP_TYPE_FLUSH, 0, - bdrv_nb_sectors(s->children[i]->bs), + bdrv_getlength(s->children[i]->bs), s->children[i]->bs->node_name, result); result_value.l = result; quorum_count_vote(&error_votes, &result_value, i); @@ -1098,8 +1120,8 @@ static BlockDriver bdrv_quorum = { .bdrv_getlength = quorum_getlength, - .bdrv_aio_readv = quorum_aio_readv, - .bdrv_aio_writev = quorum_aio_writev, + .bdrv_co_preadv = quorum_co_preadv, + .bdrv_co_pwritev = quorum_co_pwritev, .bdrv_add_child = quorum_add_child, .bdrv_del_child = quorum_del_child, diff --git a/block/raw_bsd.c b/block/raw-format.c index 8a5b9b0424..8404a82e0c 100644 --- a/block/raw_bsd.c +++ b/block/raw-format.c @@ -1,4 +1,4 @@ -/* BlockDriver implementation for "raw" +/* BlockDriver implementation for "raw" format driver * * Copyright (C) 2010-2016 Red Hat, Inc. * Copyright (C) 2010, Blue Swirl <blauwirbel@gmail.com> diff --git a/block/trace-events b/block/trace-events index cfc05f2478..671a6a851c 100644 --- a/block/trace-events +++ b/block/trace-events @@ -53,8 +53,8 @@ qmp_block_job_resume(void *job) "job %p" qmp_block_job_complete(void *job) "job %p" qmp_block_stream(void *bs, void *job) "bs %p job %p" -# block/raw-win32.c -# block/raw-posix.c +# block/file-win32.c +# block/file-posix.c paio_submit_co(int64_t offset, int count, int type) "offset %"PRId64" count %d type %d" paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb %p opaque %p offset %"PRId64" count %d type %d" @@ -2750,7 +2750,7 @@ if compile_prog "" "" ; then fi ########################################## -# xfsctl() probe, used for raw-posix +# xfsctl() probe, used for file-posix.c if test "$xfs" != "no" ; then cat > $TMPC << EOF #include <stddef.h> /* NULL */ diff --git a/contrib/libvhost-user/Makefile.objs b/contrib/libvhost-user/Makefile.objs new file mode 100644 index 0000000000..cef1ad6e31 --- /dev/null +++ b/contrib/libvhost-user/Makefile.objs @@ -0,0 +1 @@ +libvhost-user-obj-y = libvhost-user.o diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c new file mode 100644 index 0000000000..af4faad60b --- /dev/null +++ b/contrib/libvhost-user/libvhost-user.c @@ -0,0 +1,1499 @@ +/* + * Vhost User library + * + * Copyright IBM, Corp. 2007 + * Copyright (c) 2016 Red Hat, Inc. + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * Marc-André Lureau <mlureau@redhat.com> + * Victor Kaplansky <victork@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#include <qemu/osdep.h> +#include <sys/eventfd.h> +#include <linux/vhost.h> + +#include "qemu/atomic.h" + +#include "libvhost-user.h" + +#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64) + +/* The version of the protocol we support */ +#define VHOST_USER_VERSION 1 +#define LIBVHOST_USER_DEBUG 0 + +#define DPRINT(...) \ + do { \ + if (LIBVHOST_USER_DEBUG) { \ + fprintf(stderr, __VA_ARGS__); \ + } \ + } while (0) + +static const char * +vu_request_to_string(int req) +{ +#define REQ(req) [req] = #req + static const char *vu_request_str[] = { + REQ(VHOST_USER_NONE), + REQ(VHOST_USER_GET_FEATURES), + REQ(VHOST_USER_SET_FEATURES), + REQ(VHOST_USER_NONE), + REQ(VHOST_USER_GET_FEATURES), + REQ(VHOST_USER_SET_FEATURES), + REQ(VHOST_USER_SET_OWNER), + REQ(VHOST_USER_RESET_OWNER), + REQ(VHOST_USER_SET_MEM_TABLE), + REQ(VHOST_USER_SET_LOG_BASE), + REQ(VHOST_USER_SET_LOG_FD), + REQ(VHOST_USER_SET_VRING_NUM), + REQ(VHOST_USER_SET_VRING_ADDR), + REQ(VHOST_USER_SET_VRING_BASE), + REQ(VHOST_USER_GET_VRING_BASE), + REQ(VHOST_USER_SET_VRING_KICK), + REQ(VHOST_USER_SET_VRING_CALL), + REQ(VHOST_USER_SET_VRING_ERR), + REQ(VHOST_USER_GET_PROTOCOL_FEATURES), + REQ(VHOST_USER_SET_PROTOCOL_FEATURES), + REQ(VHOST_USER_GET_QUEUE_NUM), + REQ(VHOST_USER_SET_VRING_ENABLE), + REQ(VHOST_USER_SEND_RARP), + REQ(VHOST_USER_INPUT_GET_CONFIG), + REQ(VHOST_USER_MAX), + }; +#undef REQ + + if (req < VHOST_USER_MAX) { + return vu_request_str[req]; + } else { + return "unknown"; + } +} + +static void +vu_panic(VuDev *dev, const char *msg, ...) +{ + char *buf = NULL; + va_list ap; + + va_start(ap, msg); + (void)vasprintf(&buf, msg, ap); + va_end(ap); + + dev->broken = true; + dev->panic(dev, buf); + free(buf); + + /* FIXME: find a way to call virtio_error? */ +} + +/* Translate guest physical address to our virtual address. */ +void * +vu_gpa_to_va(VuDev *dev, uint64_t guest_addr) +{ + int i; + + /* Find matching memory region. */ + for (i = 0; i < dev->nregions; i++) { + VuDevRegion *r = &dev->regions[i]; + + if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) { + return (void *)(uintptr_t) + guest_addr - r->gpa + r->mmap_addr + r->mmap_offset; + } + } + + return NULL; +} + +/* Translate qemu virtual address to our virtual address. */ +static void * +qva_to_va(VuDev *dev, uint64_t qemu_addr) +{ + int i; + + /* Find matching memory region. */ + for (i = 0; i < dev->nregions; i++) { + VuDevRegion *r = &dev->regions[i]; + + if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) { + return (void *)(uintptr_t) + qemu_addr - r->qva + r->mmap_addr + r->mmap_offset; + } + } + + return NULL; +} + +static void +vmsg_close_fds(VhostUserMsg *vmsg) +{ + int i; + + for (i = 0; i < vmsg->fd_num; i++) { + close(vmsg->fds[i]); + } +} + +static bool +vu_message_read(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) +{ + char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { }; + struct iovec iov = { + .iov_base = (char *)vmsg, + .iov_len = VHOST_USER_HDR_SIZE, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = control, + .msg_controllen = sizeof(control), + }; + size_t fd_size; + struct cmsghdr *cmsg; + int rc; + + do { + rc = recvmsg(conn_fd, &msg, 0); + } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); + + if (rc <= 0) { + vu_panic(dev, "Error while recvmsg: %s", strerror(errno)); + return false; + } + + vmsg->fd_num = 0; + for (cmsg = CMSG_FIRSTHDR(&msg); + cmsg != NULL; + cmsg = CMSG_NXTHDR(&msg, cmsg)) + { + if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) { + fd_size = cmsg->cmsg_len - CMSG_LEN(0); + vmsg->fd_num = fd_size / sizeof(int); + memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size); + break; + } + } + + if (vmsg->size > sizeof(vmsg->payload)) { + vu_panic(dev, + "Error: too big message request: %d, size: vmsg->size: %u, " + "while sizeof(vmsg->payload) = %zu\n", + vmsg->request, vmsg->size, sizeof(vmsg->payload)); + goto fail; + } + + if (vmsg->size) { + do { + rc = read(conn_fd, &vmsg->payload, vmsg->size); + } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); + + if (rc <= 0) { + vu_panic(dev, "Error while reading: %s", strerror(errno)); + goto fail; + } + + assert(rc == vmsg->size); + } + + return true; + +fail: + vmsg_close_fds(vmsg); + + return false; +} + +static bool +vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg) +{ + int rc; + uint8_t *p = (uint8_t *)vmsg; + + /* Set the version in the flags when sending the reply */ + vmsg->flags &= ~VHOST_USER_VERSION_MASK; + vmsg->flags |= VHOST_USER_VERSION; + vmsg->flags |= VHOST_USER_REPLY_MASK; + + do { + rc = write(conn_fd, p, VHOST_USER_HDR_SIZE); + } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); + + do { + if (vmsg->data) { + rc = write(conn_fd, vmsg->data, vmsg->size); + } else { + rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->size); + } + } while (rc < 0 && (errno == EINTR || errno == EAGAIN)); + + if (rc <= 0) { + vu_panic(dev, "Error while writing: %s", strerror(errno)); + return false; + } + + return true; +} + +/* Kick the log_call_fd if required. */ +static void +vu_log_kick(VuDev *dev) +{ + if (dev->log_call_fd != -1) { + DPRINT("Kicking the QEMU's log...\n"); + if (eventfd_write(dev->log_call_fd, 1) < 0) { + vu_panic(dev, "Error writing eventfd: %s", strerror(errno)); + } + } +} + +static void +vu_log_page(uint8_t *log_table, uint64_t page) +{ + DPRINT("Logged dirty guest page: %"PRId64"\n", page); + atomic_or(&log_table[page / 8], 1 << (page % 8)); +} + +static void +vu_log_write(VuDev *dev, uint64_t address, uint64_t length) +{ + uint64_t page; + + if (!(dev->features & (1ULL << VHOST_F_LOG_ALL)) || + !dev->log_table || !length) { + return; + } + + assert(dev->log_size > ((address + length - 1) / VHOST_LOG_PAGE / 8)); + + page = address / VHOST_LOG_PAGE; + while (page * VHOST_LOG_PAGE < address + length) { + vu_log_page(dev->log_table, page); + page += VHOST_LOG_PAGE; + } + + vu_log_kick(dev); +} + +static void +vu_kick_cb(VuDev *dev, int condition, void *data) +{ + int index = (intptr_t)data; + VuVirtq *vq = &dev->vq[index]; + int sock = vq->kick_fd; + eventfd_t kick_data; + ssize_t rc; + + rc = eventfd_read(sock, &kick_data); + if (rc == -1) { + vu_panic(dev, "kick eventfd_read(): %s", strerror(errno)); + dev->remove_watch(dev, dev->vq[index].kick_fd); + } else { + DPRINT("Got kick_data: %016"PRIx64" handler:%p idx:%d\n", + kick_data, vq->handler, index); + if (vq->handler) { + vq->handler(dev, index); + } + } +} + +static bool +vu_get_features_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + vmsg->payload.u64 = + 1ULL << VHOST_F_LOG_ALL | + 1ULL << VHOST_USER_F_PROTOCOL_FEATURES; + + if (dev->iface->get_features) { + vmsg->payload.u64 |= dev->iface->get_features(dev); + } + + vmsg->size = sizeof(vmsg->payload.u64); + + DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + + return true; +} + +static void +vu_set_enable_all_rings(VuDev *dev, bool enabled) +{ + int i; + + for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) { + dev->vq[i].enable = enabled; + } +} + +static bool +vu_set_features_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + + dev->features = vmsg->payload.u64; + + if (!(dev->features & VHOST_USER_F_PROTOCOL_FEATURES)) { + vu_set_enable_all_rings(dev, true); + } + + if (dev->iface->set_features) { + dev->iface->set_features(dev, dev->features); + } + + return false; +} + +static bool +vu_set_owner_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + return false; +} + +static void +vu_close_log(VuDev *dev) +{ + if (dev->log_table) { + if (munmap(dev->log_table, dev->log_size) != 0) { + perror("close log munmap() error"); + } + + dev->log_table = NULL; + } + if (dev->log_call_fd != -1) { + close(dev->log_call_fd); + dev->log_call_fd = -1; + } +} + +static bool +vu_reset_device_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + vu_set_enable_all_rings(dev, false); + + return false; +} + +static bool +vu_set_mem_table_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + int i; + VhostUserMemory *memory = &vmsg->payload.memory; + dev->nregions = memory->nregions; + + DPRINT("Nregions: %d\n", memory->nregions); + for (i = 0; i < dev->nregions; i++) { + void *mmap_addr; + VhostUserMemoryRegion *msg_region = &memory->regions[i]; + VuDevRegion *dev_region = &dev->regions[i]; + + DPRINT("Region %d\n", i); + DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", + msg_region->guest_phys_addr); + DPRINT(" memory_size: 0x%016"PRIx64"\n", + msg_region->memory_size); + DPRINT(" userspace_addr 0x%016"PRIx64"\n", + msg_region->userspace_addr); + DPRINT(" mmap_offset 0x%016"PRIx64"\n", + msg_region->mmap_offset); + + dev_region->gpa = msg_region->guest_phys_addr; + dev_region->size = msg_region->memory_size; + dev_region->qva = msg_region->userspace_addr; + dev_region->mmap_offset = msg_region->mmap_offset; + + /* We don't use offset argument of mmap() since the + * mapped address has to be page aligned, and we use huge + * pages. */ + mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, + PROT_READ | PROT_WRITE, MAP_SHARED, + vmsg->fds[i], 0); + + if (mmap_addr == MAP_FAILED) { + vu_panic(dev, "region mmap error: %s", strerror(errno)); + } else { + dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; + DPRINT(" mmap_addr: 0x%016"PRIx64"\n", + dev_region->mmap_addr); + } + + close(vmsg->fds[i]); + } + + return false; +} + +static bool +vu_set_log_base_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + int fd; + uint64_t log_mmap_size, log_mmap_offset; + void *rc; + + if (vmsg->fd_num != 1 || + vmsg->size != sizeof(vmsg->payload.log)) { + vu_panic(dev, "Invalid log_base message"); + return true; + } + + fd = vmsg->fds[0]; + log_mmap_offset = vmsg->payload.log.mmap_offset; + log_mmap_size = vmsg->payload.log.mmap_size; + DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset); + DPRINT("Log mmap_size: %"PRId64"\n", log_mmap_size); + + rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, + log_mmap_offset); + if (rc == MAP_FAILED) { + perror("log mmap error"); + } + dev->log_table = rc; + dev->log_size = log_mmap_size; + + vmsg->size = sizeof(vmsg->payload.u64); + + return true; +} + +static bool +vu_set_log_fd_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + if (vmsg->fd_num != 1) { + vu_panic(dev, "Invalid log_fd message"); + return false; + } + + if (dev->log_call_fd != -1) { + close(dev->log_call_fd); + } + dev->log_call_fd = vmsg->fds[0]; + DPRINT("Got log_call_fd: %d\n", vmsg->fds[0]); + + return false; +} + +static bool +vu_set_vring_num_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + unsigned int index = vmsg->payload.state.index; + unsigned int num = vmsg->payload.state.num; + + DPRINT("State.index: %d\n", index); + DPRINT("State.num: %d\n", num); + dev->vq[index].vring.num = num; + + return false; +} + +static bool +vu_set_vring_addr_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + struct vhost_vring_addr *vra = &vmsg->payload.addr; + unsigned int index = vra->index; + VuVirtq *vq = &dev->vq[index]; + + DPRINT("vhost_vring_addr:\n"); + DPRINT(" index: %d\n", vra->index); + DPRINT(" flags: %d\n", vra->flags); + DPRINT(" desc_user_addr: 0x%016llx\n", vra->desc_user_addr); + DPRINT(" used_user_addr: 0x%016llx\n", vra->used_user_addr); + DPRINT(" avail_user_addr: 0x%016llx\n", vra->avail_user_addr); + DPRINT(" log_guest_addr: 0x%016llx\n", vra->log_guest_addr); + + vq->vring.flags = vra->flags; + vq->vring.desc = qva_to_va(dev, vra->desc_user_addr); + vq->vring.used = qva_to_va(dev, vra->used_user_addr); + vq->vring.avail = qva_to_va(dev, vra->avail_user_addr); + vq->vring.log_guest_addr = vra->log_guest_addr; + + DPRINT("Setting virtq addresses:\n"); + DPRINT(" vring_desc at %p\n", vq->vring.desc); + DPRINT(" vring_used at %p\n", vq->vring.used); + DPRINT(" vring_avail at %p\n", vq->vring.avail); + + if (!(vq->vring.desc && vq->vring.used && vq->vring.avail)) { + vu_panic(dev, "Invalid vring_addr message"); + return false; + } + + vq->used_idx = vq->vring.used->idx; + + return false; +} + +static bool +vu_set_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + unsigned int index = vmsg->payload.state.index; + unsigned int num = vmsg->payload.state.num; + + DPRINT("State.index: %d\n", index); + DPRINT("State.num: %d\n", num); + dev->vq[index].shadow_avail_idx = dev->vq[index].last_avail_idx = num; + + return false; +} + +static bool +vu_get_vring_base_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + unsigned int index = vmsg->payload.state.index; + + DPRINT("State.index: %d\n", index); + vmsg->payload.state.num = dev->vq[index].last_avail_idx; + vmsg->size = sizeof(vmsg->payload.state); + + dev->vq[index].started = false; + if (dev->iface->queue_set_started) { + dev->iface->queue_set_started(dev, index, false); + } + + if (dev->vq[index].call_fd != -1) { + close(dev->vq[index].call_fd); + dev->vq[index].call_fd = -1; + } + if (dev->vq[index].kick_fd != -1) { + dev->remove_watch(dev, dev->vq[index].kick_fd); + close(dev->vq[index].kick_fd); + dev->vq[index].kick_fd = -1; + } + + return true; +} + +static bool +vu_check_queue_msg_file(VuDev *dev, VhostUserMsg *vmsg) +{ + int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + + if (index >= VHOST_MAX_NR_VIRTQUEUE) { + vmsg_close_fds(vmsg); + vu_panic(dev, "Invalid queue index: %u", index); + return false; + } + + if (vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK || + vmsg->fd_num != 1) { + vmsg_close_fds(vmsg); + vu_panic(dev, "Invalid fds in request: %d", vmsg->request); + return false; + } + + return true; +} + +static bool +vu_set_vring_kick_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + + DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + + if (!vu_check_queue_msg_file(dev, vmsg)) { + return false; + } + + if (dev->vq[index].kick_fd != -1) { + dev->remove_watch(dev, dev->vq[index].kick_fd); + close(dev->vq[index].kick_fd); + dev->vq[index].kick_fd = -1; + } + + if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) { + dev->vq[index].kick_fd = vmsg->fds[0]; + DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index); + } + + dev->vq[index].started = true; + if (dev->iface->queue_set_started) { + dev->iface->queue_set_started(dev, index, true); + } + + if (dev->vq[index].kick_fd != -1 && dev->vq[index].handler) { + dev->set_watch(dev, dev->vq[index].kick_fd, VU_WATCH_IN, + vu_kick_cb, (void *)(long)index); + + DPRINT("Waiting for kicks on fd: %d for vq: %d\n", + dev->vq[index].kick_fd, index); + } + + return false; +} + +void vu_set_queue_handler(VuDev *dev, VuVirtq *vq, + vu_queue_handler_cb handler) +{ + int qidx = vq - dev->vq; + + vq->handler = handler; + if (vq->kick_fd >= 0) { + if (handler) { + dev->set_watch(dev, vq->kick_fd, VU_WATCH_IN, + vu_kick_cb, (void *)(long)qidx); + } else { + dev->remove_watch(dev, vq->kick_fd); + } + } +} + +static bool +vu_set_vring_call_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + + DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + + if (!vu_check_queue_msg_file(dev, vmsg)) { + return false; + } + + if (dev->vq[index].call_fd != -1) { + close(dev->vq[index].call_fd); + dev->vq[index].call_fd = -1; + } + + if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) { + dev->vq[index].call_fd = vmsg->fds[0]; + } + + DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index); + + return false; +} + +static bool +vu_set_vring_err_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + int index = vmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + + DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + + if (!vu_check_queue_msg_file(dev, vmsg)) { + return false; + } + + if (dev->vq[index].err_fd != -1) { + close(dev->vq[index].err_fd); + dev->vq[index].err_fd = -1; + } + + if (!(vmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)) { + dev->vq[index].err_fd = vmsg->fds[0]; + } + + return false; +} + +static bool +vu_get_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + uint64_t features = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD; + + if (dev->iface->get_protocol_features) { + features |= dev->iface->get_protocol_features(dev); + } + + vmsg->payload.u64 = features; + vmsg->size = sizeof(vmsg->payload.u64); + + return true; +} + +static bool +vu_set_protocol_features_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + uint64_t features = vmsg->payload.u64; + + DPRINT("u64: 0x%016"PRIx64"\n", features); + + dev->protocol_features = vmsg->payload.u64; + + if (dev->iface->set_protocol_features) { + dev->iface->set_protocol_features(dev, features); + } + + return false; +} + +static bool +vu_get_queue_num_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("Function %s() not implemented yet.\n", __func__); + return false; +} + +static bool +vu_set_vring_enable_exec(VuDev *dev, VhostUserMsg *vmsg) +{ + unsigned int index = vmsg->payload.state.index; + unsigned int enable = vmsg->payload.state.num; + + DPRINT("State.index: %d\n", index); + DPRINT("State.enable: %d\n", enable); + + if (index >= VHOST_MAX_NR_VIRTQUEUE) { + vu_panic(dev, "Invalid vring_enable index: %u", index); + return false; + } + + dev->vq[index].enable = enable; + return false; +} + +static bool +vu_process_message(VuDev *dev, VhostUserMsg *vmsg) +{ + int do_reply = 0; + + /* Print out generic part of the request. */ + DPRINT("================ Vhost user message ================\n"); + DPRINT("Request: %s (%d)\n", vu_request_to_string(vmsg->request), + vmsg->request); + DPRINT("Flags: 0x%x\n", vmsg->flags); + DPRINT("Size: %d\n", vmsg->size); + + if (vmsg->fd_num) { + int i; + DPRINT("Fds:"); + for (i = 0; i < vmsg->fd_num; i++) { + DPRINT(" %d", vmsg->fds[i]); + } + DPRINT("\n"); + } + + if (dev->iface->process_msg && + dev->iface->process_msg(dev, vmsg, &do_reply)) { + return do_reply; + } + + switch (vmsg->request) { + case VHOST_USER_GET_FEATURES: + return vu_get_features_exec(dev, vmsg); + case VHOST_USER_SET_FEATURES: + return vu_set_features_exec(dev, vmsg); + case VHOST_USER_GET_PROTOCOL_FEATURES: + return vu_get_protocol_features_exec(dev, vmsg); + case VHOST_USER_SET_PROTOCOL_FEATURES: + return vu_set_protocol_features_exec(dev, vmsg); + case VHOST_USER_SET_OWNER: + return vu_set_owner_exec(dev, vmsg); + case VHOST_USER_RESET_OWNER: + return vu_reset_device_exec(dev, vmsg); + case VHOST_USER_SET_MEM_TABLE: + return vu_set_mem_table_exec(dev, vmsg); + case VHOST_USER_SET_LOG_BASE: + return vu_set_log_base_exec(dev, vmsg); + case VHOST_USER_SET_LOG_FD: + return vu_set_log_fd_exec(dev, vmsg); + case VHOST_USER_SET_VRING_NUM: + return vu_set_vring_num_exec(dev, vmsg); + case VHOST_USER_SET_VRING_ADDR: + return vu_set_vring_addr_exec(dev, vmsg); + case VHOST_USER_SET_VRING_BASE: + return vu_set_vring_base_exec(dev, vmsg); + case VHOST_USER_GET_VRING_BASE: + return vu_get_vring_base_exec(dev, vmsg); + case VHOST_USER_SET_VRING_KICK: + return vu_set_vring_kick_exec(dev, vmsg); + case VHOST_USER_SET_VRING_CALL: + return vu_set_vring_call_exec(dev, vmsg); + case VHOST_USER_SET_VRING_ERR: + return vu_set_vring_err_exec(dev, vmsg); + case VHOST_USER_GET_QUEUE_NUM: + return vu_get_queue_num_exec(dev, vmsg); + case VHOST_USER_SET_VRING_ENABLE: + return vu_set_vring_enable_exec(dev, vmsg); + default: + vmsg_close_fds(vmsg); + vu_panic(dev, "Unhandled request: %d", vmsg->request); + } + + return false; +} + +bool +vu_dispatch(VuDev *dev) +{ + VhostUserMsg vmsg = { 0, }; + int reply_requested; + bool success = false; + + if (!vu_message_read(dev, dev->sock, &vmsg)) { + goto end; + } + + reply_requested = vu_process_message(dev, &vmsg); + if (!reply_requested) { + success = true; + goto end; + } + + if (!vu_message_write(dev, dev->sock, &vmsg)) { + goto end; + } + + success = true; + +end: + g_free(vmsg.data); + return success; +} + +void +vu_deinit(VuDev *dev) +{ + int i; + + for (i = 0; i < dev->nregions; i++) { + VuDevRegion *r = &dev->regions[i]; + void *m = (void *) (uintptr_t) r->mmap_addr; + if (m != MAP_FAILED) { + munmap(m, r->size + r->mmap_offset); + } + } + dev->nregions = 0; + + for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) { + VuVirtq *vq = &dev->vq[i]; + + if (vq->call_fd != -1) { + close(vq->call_fd); + vq->call_fd = -1; + } + + if (vq->kick_fd != -1) { + close(vq->kick_fd); + vq->kick_fd = -1; + } + + if (vq->err_fd != -1) { + close(vq->err_fd); + vq->err_fd = -1; + } + } + + + vu_close_log(dev); + + if (dev->sock != -1) { + close(dev->sock); + } +} + +void +vu_init(VuDev *dev, + int socket, + vu_panic_cb panic, + vu_set_watch_cb set_watch, + vu_remove_watch_cb remove_watch, + const VuDevIface *iface) +{ + int i; + + assert(socket >= 0); + assert(set_watch); + assert(remove_watch); + assert(iface); + assert(panic); + + memset(dev, 0, sizeof(*dev)); + + dev->sock = socket; + dev->panic = panic; + dev->set_watch = set_watch; + dev->remove_watch = remove_watch; + dev->iface = iface; + dev->log_call_fd = -1; + for (i = 0; i < VHOST_MAX_NR_VIRTQUEUE; i++) { + dev->vq[i] = (VuVirtq) { + .call_fd = -1, .kick_fd = -1, .err_fd = -1, + .notification = true, + }; + } +} + +VuVirtq * +vu_get_queue(VuDev *dev, int qidx) +{ + assert(qidx < VHOST_MAX_NR_VIRTQUEUE); + return &dev->vq[qidx]; +} + +bool +vu_queue_enabled(VuDev *dev, VuVirtq *vq) +{ + return vq->enable; +} + +static inline uint16_t +vring_avail_flags(VuVirtq *vq) +{ + return vq->vring.avail->flags; +} + +static inline uint16_t +vring_avail_idx(VuVirtq *vq) +{ + vq->shadow_avail_idx = vq->vring.avail->idx; + + return vq->shadow_avail_idx; +} + +static inline uint16_t +vring_avail_ring(VuVirtq *vq, int i) +{ + return vq->vring.avail->ring[i]; +} + +static inline uint16_t +vring_get_used_event(VuVirtq *vq) +{ + return vring_avail_ring(vq, vq->vring.num); +} + +static int +virtqueue_num_heads(VuDev *dev, VuVirtq *vq, unsigned int idx) +{ + uint16_t num_heads = vring_avail_idx(vq) - idx; + + /* Check it isn't doing very strange things with descriptor numbers. */ + if (num_heads > vq->vring.num) { + vu_panic(dev, "Guest moved used index from %u to %u", + idx, vq->shadow_avail_idx); + return -1; + } + if (num_heads) { + /* On success, callers read a descriptor at vq->last_avail_idx. + * Make sure descriptor read does not bypass avail index read. */ + smp_rmb(); + } + + return num_heads; +} + +static bool +virtqueue_get_head(VuDev *dev, VuVirtq *vq, + unsigned int idx, unsigned int *head) +{ + /* Grab the next descriptor number they're advertising, and increment + * the index we've seen. */ + *head = vring_avail_ring(vq, idx % vq->vring.num); + + /* If their number is silly, that's a fatal mistake. */ + if (*head >= vq->vring.num) { + vu_panic(dev, "Guest says index %u is available", head); + return false; + } + + return true; +} + +enum { + VIRTQUEUE_READ_DESC_ERROR = -1, + VIRTQUEUE_READ_DESC_DONE = 0, /* end of chain */ + VIRTQUEUE_READ_DESC_MORE = 1, /* more buffers in chain */ +}; + +static int +virtqueue_read_next_desc(VuDev *dev, struct vring_desc *desc, + int i, unsigned int max, unsigned int *next) +{ + /* If this descriptor says it doesn't chain, we're done. */ + if (!(desc[i].flags & VRING_DESC_F_NEXT)) { + return VIRTQUEUE_READ_DESC_DONE; + } + + /* Check they're not leading us off end of descriptors. */ + *next = desc[i].next; + /* Make sure compiler knows to grab that: we don't want it changing! */ + smp_wmb(); + + if (*next >= max) { + vu_panic(dev, "Desc next is %u", next); + return VIRTQUEUE_READ_DESC_ERROR; + } + + return VIRTQUEUE_READ_DESC_MORE; +} + +void +vu_queue_get_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int *in_bytes, + unsigned int *out_bytes, + unsigned max_in_bytes, unsigned max_out_bytes) +{ + unsigned int idx; + unsigned int total_bufs, in_total, out_total; + int rc; + + idx = vq->last_avail_idx; + + total_bufs = in_total = out_total = 0; + while ((rc = virtqueue_num_heads(dev, vq, idx)) > 0) { + unsigned int max, num_bufs, indirect = 0; + struct vring_desc *desc; + unsigned int i; + + max = vq->vring.num; + num_bufs = total_bufs; + if (!virtqueue_get_head(dev, vq, idx++, &i)) { + goto err; + } + desc = vq->vring.desc; + + if (desc[i].flags & VRING_DESC_F_INDIRECT) { + if (desc[i].len % sizeof(struct vring_desc)) { + vu_panic(dev, "Invalid size for indirect buffer table"); + goto err; + } + + /* If we've got too many, that implies a descriptor loop. */ + if (num_bufs >= max) { + vu_panic(dev, "Looped descriptor"); + goto err; + } + + /* loop over the indirect descriptor table */ + indirect = 1; + max = desc[i].len / sizeof(struct vring_desc); + desc = vu_gpa_to_va(dev, desc[i].addr); + num_bufs = i = 0; + } + + do { + /* If we've got too many, that implies a descriptor loop. */ + if (++num_bufs > max) { + vu_panic(dev, "Looped descriptor"); + goto err; + } + + if (desc[i].flags & VRING_DESC_F_WRITE) { + in_total += desc[i].len; + } else { + out_total += desc[i].len; + } + if (in_total >= max_in_bytes && out_total >= max_out_bytes) { + goto done; + } + rc = virtqueue_read_next_desc(dev, desc, i, max, &i); + } while (rc == VIRTQUEUE_READ_DESC_MORE); + + if (rc == VIRTQUEUE_READ_DESC_ERROR) { + goto err; + } + + if (!indirect) { + total_bufs = num_bufs; + } else { + total_bufs++; + } + } + if (rc < 0) { + goto err; + } +done: + if (in_bytes) { + *in_bytes = in_total; + } + if (out_bytes) { + *out_bytes = out_total; + } + return; + +err: + in_total = out_total = 0; + goto done; +} + +bool +vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes, + unsigned int out_bytes) +{ + unsigned int in_total, out_total; + + vu_queue_get_avail_bytes(dev, vq, &in_total, &out_total, + in_bytes, out_bytes); + + return in_bytes <= in_total && out_bytes <= out_total; +} + +/* Fetch avail_idx from VQ memory only when we really need to know if + * guest has added some buffers. */ +int +vu_queue_empty(VuDev *dev, VuVirtq *vq) +{ + if (vq->shadow_avail_idx != vq->last_avail_idx) { + return 0; + } + + return vring_avail_idx(vq) == vq->last_avail_idx; +} + +static inline +bool has_feature(uint64_t features, unsigned int fbit) +{ + assert(fbit < 64); + return !!(features & (1ULL << fbit)); +} + +static inline +bool vu_has_feature(VuDev *dev, + unsigned int fbit) +{ + return has_feature(dev->features, fbit); +} + +static bool +vring_notify(VuDev *dev, VuVirtq *vq) +{ + uint16_t old, new; + bool v; + + /* We need to expose used array entries before checking used event. */ + smp_mb(); + + /* Always notify when queue is empty (when feature acknowledge) */ + if (vu_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) && + !vq->inuse && vu_queue_empty(dev, vq)) { + return true; + } + + if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { + return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT); + } + + v = vq->signalled_used_valid; + vq->signalled_used_valid = true; + old = vq->signalled_used; + new = vq->signalled_used = vq->used_idx; + return !v || vring_need_event(vring_get_used_event(vq), new, old); +} + +void +vu_queue_notify(VuDev *dev, VuVirtq *vq) +{ + if (unlikely(dev->broken)) { + return; + } + + if (!vring_notify(dev, vq)) { + DPRINT("skipped notify...\n"); + return; + } + + if (eventfd_write(vq->call_fd, 1) < 0) { + vu_panic(dev, "Error writing eventfd: %s", strerror(errno)); + } +} + +static inline void +vring_used_flags_set_bit(VuVirtq *vq, int mask) +{ + uint16_t *flags; + + flags = (uint16_t *)((char*)vq->vring.used + + offsetof(struct vring_used, flags)); + *flags |= mask; +} + +static inline void +vring_used_flags_unset_bit(VuVirtq *vq, int mask) +{ + uint16_t *flags; + + flags = (uint16_t *)((char*)vq->vring.used + + offsetof(struct vring_used, flags)); + *flags &= ~mask; +} + +static inline void +vring_set_avail_event(VuVirtq *vq, uint16_t val) +{ + if (!vq->notification) { + return; + } + + *((uint16_t *) &vq->vring.used->ring[vq->vring.num]) = val; +} + +void +vu_queue_set_notification(VuDev *dev, VuVirtq *vq, int enable) +{ + vq->notification = enable; + if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { + vring_set_avail_event(vq, vring_avail_idx(vq)); + } else if (enable) { + vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY); + } else { + vring_used_flags_set_bit(vq, VRING_USED_F_NO_NOTIFY); + } + if (enable) { + /* Expose avail event/used flags before caller checks the avail idx. */ + smp_mb(); + } +} + +static void +virtqueue_map_desc(VuDev *dev, + unsigned int *p_num_sg, struct iovec *iov, + unsigned int max_num_sg, bool is_write, + uint64_t pa, size_t sz) +{ + unsigned num_sg = *p_num_sg; + + assert(num_sg <= max_num_sg); + + if (!sz) { + vu_panic(dev, "virtio: zero sized buffers are not allowed"); + return; + } + + iov[num_sg].iov_base = vu_gpa_to_va(dev, pa); + iov[num_sg].iov_len = sz; + num_sg++; + + *p_num_sg = num_sg; +} + +/* Round number down to multiple */ +#define ALIGN_DOWN(n, m) ((n) / (m) * (m)) + +/* Round number up to multiple */ +#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m)) + +static void * +virtqueue_alloc_element(size_t sz, + unsigned out_num, unsigned in_num) +{ + VuVirtqElement *elem; + size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0])); + size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]); + size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]); + + assert(sz >= sizeof(VuVirtqElement)); + elem = malloc(out_sg_end); + elem->out_num = out_num; + elem->in_num = in_num; + elem->in_sg = (void *)elem + in_sg_ofs; + elem->out_sg = (void *)elem + out_sg_ofs; + return elem; +} + +void * +vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz) +{ + unsigned int i, head, max; + VuVirtqElement *elem; + unsigned out_num, in_num; + struct iovec iov[VIRTQUEUE_MAX_SIZE]; + struct vring_desc *desc; + int rc; + + if (unlikely(dev->broken)) { + return NULL; + } + + if (vu_queue_empty(dev, vq)) { + return NULL; + } + /* Needed after virtio_queue_empty(), see comment in + * virtqueue_num_heads(). */ + smp_rmb(); + + /* When we start there are none of either input nor output. */ + out_num = in_num = 0; + + max = vq->vring.num; + if (vq->inuse >= vq->vring.num) { + vu_panic(dev, "Virtqueue size exceeded"); + return NULL; + } + + if (!virtqueue_get_head(dev, vq, vq->last_avail_idx++, &head)) { + return NULL; + } + + if (vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { + vring_set_avail_event(vq, vq->last_avail_idx); + } + + i = head; + desc = vq->vring.desc; + if (desc[i].flags & VRING_DESC_F_INDIRECT) { + if (desc[i].len % sizeof(struct vring_desc)) { + vu_panic(dev, "Invalid size for indirect buffer table"); + } + + /* loop over the indirect descriptor table */ + max = desc[i].len / sizeof(struct vring_desc); + desc = vu_gpa_to_va(dev, desc[i].addr); + i = 0; + } + + /* Collect all the descriptors */ + do { + if (desc[i].flags & VRING_DESC_F_WRITE) { + virtqueue_map_desc(dev, &in_num, iov + out_num, + VIRTQUEUE_MAX_SIZE - out_num, true, + desc[i].addr, desc[i].len); + } else { + if (in_num) { + vu_panic(dev, "Incorrect order for descriptors"); + return NULL; + } + virtqueue_map_desc(dev, &out_num, iov, + VIRTQUEUE_MAX_SIZE, false, + desc[i].addr, desc[i].len); + } + + /* If we've got too many, that implies a descriptor loop. */ + if ((in_num + out_num) > max) { + vu_panic(dev, "Looped descriptor"); + } + rc = virtqueue_read_next_desc(dev, desc, i, max, &i); + } while (rc == VIRTQUEUE_READ_DESC_MORE); + + if (rc == VIRTQUEUE_READ_DESC_ERROR) { + return NULL; + } + + /* Now copy what we have collected and mapped */ + elem = virtqueue_alloc_element(sz, out_num, in_num); + elem->index = head; + for (i = 0; i < out_num; i++) { + elem->out_sg[i] = iov[i]; + } + for (i = 0; i < in_num; i++) { + elem->in_sg[i] = iov[out_num + i]; + } + + vq->inuse++; + + return elem; +} + +bool +vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num) +{ + if (num > vq->inuse) { + return false; + } + vq->last_avail_idx -= num; + vq->inuse -= num; + return true; +} + +static inline +void vring_used_write(VuDev *dev, VuVirtq *vq, + struct vring_used_elem *uelem, int i) +{ + struct vring_used *used = vq->vring.used; + + used->ring[i] = *uelem; + vu_log_write(dev, vq->vring.log_guest_addr + + offsetof(struct vring_used, ring[i]), + sizeof(used->ring[i])); +} + + +static void +vu_log_queue_fill(VuDev *dev, VuVirtq *vq, + const VuVirtqElement *elem, + unsigned int len) +{ + struct vring_desc *desc = vq->vring.desc; + unsigned int i, max, min; + unsigned num_bufs = 0; + + max = vq->vring.num; + i = elem->index; + + if (desc[i].flags & VRING_DESC_F_INDIRECT) { + if (desc[i].len % sizeof(struct vring_desc)) { + vu_panic(dev, "Invalid size for indirect buffer table"); + } + + /* loop over the indirect descriptor table */ + max = desc[i].len / sizeof(struct vring_desc); + desc = vu_gpa_to_va(dev, desc[i].addr); + i = 0; + } + + do { + if (++num_bufs > max) { + vu_panic(dev, "Looped descriptor"); + return; + } + + if (desc[i].flags & VRING_DESC_F_WRITE) { + min = MIN(desc[i].len, len); + vu_log_write(dev, desc[i].addr, min); + len -= min; + } + + } while (len > 0 && + (virtqueue_read_next_desc(dev, desc, i, max, &i) + == VIRTQUEUE_READ_DESC_MORE)); +} + +void +vu_queue_fill(VuDev *dev, VuVirtq *vq, + const VuVirtqElement *elem, + unsigned int len, unsigned int idx) +{ + struct vring_used_elem uelem; + + if (unlikely(dev->broken)) { + return; + } + + vu_log_queue_fill(dev, vq, elem, len); + + idx = (idx + vq->used_idx) % vq->vring.num; + + uelem.id = elem->index; + uelem.len = len; + vring_used_write(dev, vq, &uelem, idx); +} + +static inline +void vring_used_idx_set(VuDev *dev, VuVirtq *vq, uint16_t val) +{ + vq->vring.used->idx = val; + vu_log_write(dev, + vq->vring.log_guest_addr + offsetof(struct vring_used, idx), + sizeof(vq->vring.used->idx)); + + vq->used_idx = val; +} + +void +vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int count) +{ + uint16_t old, new; + + if (unlikely(dev->broken)) { + return; + } + + /* Make sure buffer is written before we update index. */ + smp_wmb(); + + old = vq->used_idx; + new = old + count; + vring_used_idx_set(dev, vq, new); + vq->inuse -= count; + if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) { + vq->signalled_used_valid = false; + } +} + +void +vu_queue_push(VuDev *dev, VuVirtq *vq, + const VuVirtqElement *elem, unsigned int len) +{ + vu_queue_fill(dev, vq, elem, len, 0); + vu_queue_flush(dev, vq, 1); +} diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h new file mode 100644 index 0000000000..156b50e989 --- /dev/null +++ b/contrib/libvhost-user/libvhost-user.h @@ -0,0 +1,435 @@ +/* + * Vhost User library + * + * Copyright (c) 2016 Red Hat, Inc. + * + * Authors: + * Victor Kaplansky <victork@redhat.com> + * Marc-André Lureau <mlureau@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#ifndef LIBVHOST_USER_H +#define LIBVHOST_USER_H + +#include <stdint.h> +#include <stdbool.h> +#include <stddef.h> +#include <linux/vhost.h> +#include "standard-headers/linux/virtio_ring.h" + +/* Based on qemu/hw/virtio/vhost-user.c */ +#define VHOST_USER_F_PROTOCOL_FEATURES 30 +#define VHOST_LOG_PAGE 4096 + +#define VHOST_MAX_NR_VIRTQUEUE 8 +#define VIRTQUEUE_MAX_SIZE 1024 + +#define VHOST_MEMORY_MAX_NREGIONS 8 + +enum VhostUserProtocolFeature { + VHOST_USER_PROTOCOL_F_MQ = 0, + VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1, + VHOST_USER_PROTOCOL_F_RARP = 2, + + VHOST_USER_PROTOCOL_F_MAX +}; + +#define VHOST_USER_PROTOCOL_FEATURE_MASK ((1 << VHOST_USER_PROTOCOL_F_MAX) - 1) + +typedef enum VhostUserRequest { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, + VHOST_USER_INPUT_GET_CONFIG = 20, + VHOST_USER_MAX +} VhostUserRequest; + +typedef struct VhostUserMemoryRegion { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + uint64_t mmap_offset; +} VhostUserMemoryRegion; + +typedef struct VhostUserMemory { + uint32_t nregions; + uint32_t padding; + VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; +} VhostUserMemory; + +typedef struct VhostUserLog { + uint64_t mmap_size; + uint64_t mmap_offset; +} VhostUserLog; + +#if defined(_WIN32) +# define VU_PACKED __attribute__((gcc_struct, packed)) +#else +# define VU_PACKED __attribute__((packed)) +#endif + +typedef struct VhostUserMsg { + VhostUserRequest request; + +#define VHOST_USER_VERSION_MASK (0x3) +#define VHOST_USER_REPLY_MASK (0x1 << 2) + uint32_t flags; + uint32_t size; /* the following payload size */ + + union { +#define VHOST_USER_VRING_IDX_MASK (0xff) +#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + VhostUserMemory memory; + VhostUserLog log; + } payload; + + int fds[VHOST_MEMORY_MAX_NREGIONS]; + int fd_num; + uint8_t *data; +} VU_PACKED VhostUserMsg; + +typedef struct VuDevRegion { + /* Guest Physical address. */ + uint64_t gpa; + /* Memory region size. */ + uint64_t size; + /* QEMU virtual address (userspace). */ + uint64_t qva; + /* Starting offset in our mmaped space. */ + uint64_t mmap_offset; + /* Start address of mmaped space. */ + uint64_t mmap_addr; +} VuDevRegion; + +typedef struct VuDev VuDev; + +typedef uint64_t (*vu_get_features_cb) (VuDev *dev); +typedef void (*vu_set_features_cb) (VuDev *dev, uint64_t features); +typedef int (*vu_process_msg_cb) (VuDev *dev, VhostUserMsg *vmsg, + int *do_reply); +typedef void (*vu_queue_set_started_cb) (VuDev *dev, int qidx, bool started); + +typedef struct VuDevIface { + /* called by VHOST_USER_GET_FEATURES to get the features bitmask */ + vu_get_features_cb get_features; + /* enable vhost implementation features */ + vu_set_features_cb set_features; + /* get the protocol feature bitmask from the underlying vhost + * implementation */ + vu_get_features_cb get_protocol_features; + /* enable protocol features in the underlying vhost implementation. */ + vu_set_features_cb set_protocol_features; + /* process_msg is called for each vhost-user message received */ + /* skip libvhost-user processing if return value != 0 */ + vu_process_msg_cb process_msg; + /* tells when queues can be processed */ + vu_queue_set_started_cb queue_set_started; +} VuDevIface; + +typedef void (*vu_queue_handler_cb) (VuDev *dev, int qidx); + +typedef struct VuRing { + unsigned int num; + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; + uint64_t log_guest_addr; + uint32_t flags; +} VuRing; + +typedef struct VuVirtq { + VuRing vring; + + /* Next head to pop */ + uint16_t last_avail_idx; + + /* Last avail_idx read from VQ. */ + uint16_t shadow_avail_idx; + + uint16_t used_idx; + + /* Last used index value we have signalled on */ + uint16_t signalled_used; + + /* Last used index value we have signalled on */ + bool signalled_used_valid; + + /* Notification enabled? */ + bool notification; + + int inuse; + + vu_queue_handler_cb handler; + + int call_fd; + int kick_fd; + int err_fd; + unsigned int enable; + bool started; +} VuVirtq; + +enum VuWatchCondtion { + VU_WATCH_IN = 1 << 0, + VU_WATCH_OUT = 1 << 1, + VU_WATCH_PRI = 1 << 2, + VU_WATCH_ERR = 1 << 3, + VU_WATCH_HUP = 1 << 4, +}; + +typedef void (*vu_panic_cb) (VuDev *dev, const char *err); +typedef void (*vu_watch_cb) (VuDev *dev, int condition, void *data); +typedef void (*vu_set_watch_cb) (VuDev *dev, int fd, int condition, + vu_watch_cb cb, void *data); +typedef void (*vu_remove_watch_cb) (VuDev *dev, int fd); + +struct VuDev { + int sock; + uint32_t nregions; + VuDevRegion regions[VHOST_MEMORY_MAX_NREGIONS]; + VuVirtq vq[VHOST_MAX_NR_VIRTQUEUE]; + int log_call_fd; + uint64_t log_size; + uint8_t *log_table; + uint64_t features; + uint64_t protocol_features; + bool broken; + + /* @set_watch: add or update the given fd to the watch set, + * call cb when condition is met */ + vu_set_watch_cb set_watch; + + /* @remove_watch: remove the given fd from the watch set */ + vu_remove_watch_cb remove_watch; + + /* @panic: encountered an unrecoverable error, you may try to + * re-initialize */ + vu_panic_cb panic; + const VuDevIface *iface; +}; + +typedef struct VuVirtqElement { + unsigned int index; + unsigned int out_num; + unsigned int in_num; + struct iovec *in_sg; + struct iovec *out_sg; +} VuVirtqElement; + +/** + * vu_init: + * @dev: a VuDev context + * @socket: the socket connected to vhost-user master + * @panic: a panic callback + * @set_watch: a set_watch callback + * @remove_watch: a remove_watch callback + * @iface: a VuDevIface structure with vhost-user device callbacks + * + * Intializes a VuDev vhost-user context. + **/ +void vu_init(VuDev *dev, + int socket, + vu_panic_cb panic, + vu_set_watch_cb set_watch, + vu_remove_watch_cb remove_watch, + const VuDevIface *iface); + + +/** + * vu_deinit: + * @dev: a VuDev context + * + * Cleans up the VuDev context + */ +void vu_deinit(VuDev *dev); + +/** + * vu_dispatch: + * @dev: a VuDev context + * + * Process one vhost-user message. + * + * Returns: TRUE on success, FALSE on failure. + */ +bool vu_dispatch(VuDev *dev); + +/** + * vu_gpa_to_va: + * @dev: a VuDev context + * @guest_addr: guest address + * + * Translate a guest address to a pointer. Returns NULL on failure. + */ +void *vu_gpa_to_va(VuDev *dev, uint64_t guest_addr); + +/** + * vu_get_queue: + * @dev: a VuDev context + * @qidx: queue index + * + * Returns the queue number @qidx. + */ +VuVirtq *vu_get_queue(VuDev *dev, int qidx); + +/** + * vu_set_queue_handler: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * @handler: the queue handler callback + * + * Set the queue handler. This function may be called several times + * for the same queue. If called with NULL @handler, the handler is + * removed. + */ +void vu_set_queue_handler(VuDev *dev, VuVirtq *vq, + vu_queue_handler_cb handler); + + +/** + * vu_queue_set_notification: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * @enable: state + * + * Set whether the queue notifies (via event index or interrupt) + */ +void vu_queue_set_notification(VuDev *dev, VuVirtq *vq, int enable); + +/** + * vu_queue_enabled: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * + * Returns: whether the queue is enabled. + */ +bool vu_queue_enabled(VuDev *dev, VuVirtq *vq); + +/** + * vu_queue_enabled: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * + * Returns: whether the queue is empty. + */ +int vu_queue_empty(VuDev *dev, VuVirtq *vq); + +/** + * vu_queue_notify: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * + * Request to notify the queue via callfd (skipped if unnecessary) + */ +void vu_queue_notify(VuDev *dev, VuVirtq *vq); + +/** + * vu_queue_pop: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * @sz: the size of struct to return (must be >= VuVirtqElement) + * + * Returns: a VuVirtqElement filled from the queue or NULL. + */ +void *vu_queue_pop(VuDev *dev, VuVirtq *vq, size_t sz); + +/** + * vu_queue_rewind: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * @num: number of elements to push back + * + * Pretend that elements weren't popped from the virtqueue. The next + * virtqueue_pop() will refetch the oldest element. + * + * Returns: true on success, false if @num is greater than the number of in use + * elements. + */ +bool vu_queue_rewind(VuDev *dev, VuVirtq *vq, unsigned int num); + +/** + * vu_queue_fill: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * @elem: a VuVirtqElement + * @len: length in bytes to write + * @idx: optional offset for the used ring index (0 in general) + * + * Fill the used ring with @elem element. + */ +void vu_queue_fill(VuDev *dev, VuVirtq *vq, + const VuVirtqElement *elem, + unsigned int len, unsigned int idx); + +/** + * vu_queue_push: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * @elem: a VuVirtqElement + * @len: length in bytes to write + * + * Helper that combines vu_queue_fill() with a vu_queue_flush(). + */ +void vu_queue_push(VuDev *dev, VuVirtq *vq, + const VuVirtqElement *elem, unsigned int len); + +/** + * vu_queue_flush: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * @num: number of elements to flush + * + * Mark the last number of elements as done (used.idx is updated by + * num elements). +*/ +void vu_queue_flush(VuDev *dev, VuVirtq *vq, unsigned int num); + +/** + * vu_queue_get_avail_bytes: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * @in_bytes: in bytes + * @out_bytes: out bytes + * @max_in_bytes: stop counting after max_in_bytes + * @max_out_bytes: stop counting after max_out_bytes + * + * Count the number of available bytes, up to max_in_bytes/max_out_bytes. + */ +void vu_queue_get_avail_bytes(VuDev *vdev, VuVirtq *vq, unsigned int *in_bytes, + unsigned int *out_bytes, + unsigned max_in_bytes, unsigned max_out_bytes); + +/** + * vu_queue_avail_bytes: + * @dev: a VuDev context + * @vq: a VuVirtq queue + * @in_bytes: expected in bytes + * @out_bytes: expected out bytes + * + * Returns: true if in_bytes <= in_total && out_bytes <= out_total + */ +bool vu_queue_avail_bytes(VuDev *dev, VuVirtq *vq, unsigned int in_bytes, + unsigned int out_bytes); + +#endif /* LIBVHOST_USER_H */ diff --git a/hw/arm/pxa2xx.c b/hw/arm/pxa2xx.c index bdcf6bcce7..d31b4577f0 100644 --- a/hw/arm/pxa2xx.c +++ b/hw/arm/pxa2xx.c @@ -1258,7 +1258,7 @@ static void pxa2xx_i2c_update(PXA2xxI2CState *s) } /* These are only stubs now. */ -static void pxa2xx_i2c_event(I2CSlave *i2c, enum i2c_event event) +static int pxa2xx_i2c_event(I2CSlave *i2c, enum i2c_event event) { PXA2xxI2CSlaveState *slave = PXA2XX_I2C_SLAVE(i2c); PXA2xxI2CState *s = slave->host; @@ -1280,6 +1280,8 @@ static void pxa2xx_i2c_event(I2CSlave *i2c, enum i2c_event event) break; } pxa2xx_i2c_update(s); + + return 0; } static int pxa2xx_i2c_rx(I2CSlave *i2c) diff --git a/hw/arm/tosa.c b/hw/arm/tosa.c index 39d9dbbae6..c3db996930 100644 --- a/hw/arm/tosa.c +++ b/hw/arm/tosa.c @@ -172,7 +172,7 @@ static int tosa_dac_send(I2CSlave *i2c, uint8_t data) return 0; } -static void tosa_dac_event(I2CSlave *i2c, enum i2c_event event) +static int tosa_dac_event(I2CSlave *i2c, enum i2c_event event) { TosaDACState *s = TOSA_DAC(i2c); @@ -194,6 +194,8 @@ static void tosa_dac_event(I2CSlave *i2c, enum i2c_event event) default: break; } + + return 0; } static int tosa_dac_recv(I2CSlave *s) diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index 7102686882..085a611173 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -29,7 +29,6 @@ #include "qemu/osdep.h" #include "qapi/error.h" #include "qemu-common.h" -#include "hw/arm/virt-acpi-build.h" #include "qemu/bitmap.h" #include "trace.h" #include "qom/cpu.h" @@ -43,6 +42,7 @@ #include "hw/acpi/aml-build.h" #include "hw/pci/pcie_host.h" #include "hw/pci/pci.h" +#include "hw/arm/virt.h" #include "sysemu/numa.h" #include "kvm_arm.h" @@ -384,7 +384,7 @@ build_rsdp(GArray *rsdp_table, BIOSLinker *linker, unsigned rsdt_tbl_offset) } static void -build_iort(GArray *table_data, BIOSLinker *linker, VirtGuestInfo *guest_info) +build_iort(GArray *table_data, BIOSLinker *linker) { int iort_start = table_data->len; AcpiIortIdMapping *idmap; @@ -439,11 +439,11 @@ build_iort(GArray *table_data, BIOSLinker *linker, VirtGuestInfo *guest_info) } static void -build_spcr(GArray *table_data, BIOSLinker *linker, VirtGuestInfo *guest_info) +build_spcr(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { AcpiSerialPortConsoleRedirection *spcr; - const MemMapEntry *uart_memmap = &guest_info->memmap[VIRT_UART]; - int irq = guest_info->irqmap[VIRT_UART] + ARM_SPI_BASE; + const MemMapEntry *uart_memmap = &vms->memmap[VIRT_UART]; + int irq = vms->irqmap[VIRT_UART] + ARM_SPI_BASE; spcr = acpi_data_push(table_data, sizeof(*spcr)); @@ -472,16 +472,16 @@ build_spcr(GArray *table_data, BIOSLinker *linker, VirtGuestInfo *guest_info) } static void -build_srat(GArray *table_data, BIOSLinker *linker, VirtGuestInfo *guest_info) +build_srat(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { AcpiSystemResourceAffinityTable *srat; AcpiSratProcessorGiccAffinity *core; AcpiSratMemoryAffinity *numamem; int i, j, srat_start; uint64_t mem_base; - uint32_t *cpu_node = g_malloc0(guest_info->smp_cpus * sizeof(uint32_t)); + uint32_t *cpu_node = g_malloc0(vms->smp_cpus * sizeof(uint32_t)); - for (i = 0; i < guest_info->smp_cpus; i++) { + for (i = 0; i < vms->smp_cpus; i++) { j = numa_get_node_for_cpu(i); if (j < nb_numa_nodes) { cpu_node[i] = j; @@ -492,7 +492,7 @@ build_srat(GArray *table_data, BIOSLinker *linker, VirtGuestInfo *guest_info) srat = acpi_data_push(table_data, sizeof(*srat)); srat->reserved1 = cpu_to_le32(1); - for (i = 0; i < guest_info->smp_cpus; ++i) { + for (i = 0; i < vms->smp_cpus; ++i) { core = acpi_data_push(table_data, sizeof(*core)); core->type = ACPI_SRAT_PROCESSOR_GICC; core->length = sizeof(*core); @@ -502,7 +502,7 @@ build_srat(GArray *table_data, BIOSLinker *linker, VirtGuestInfo *guest_info) } g_free(cpu_node); - mem_base = guest_info->memmap[VIRT_MEM].base; + mem_base = vms->memmap[VIRT_MEM].base; for (i = 0; i < nb_numa_nodes; ++i) { numamem = acpi_data_push(table_data, sizeof(*numamem)); build_srat_memory(numamem, mem_base, numa_info[i].node_mem, i, @@ -515,10 +515,10 @@ build_srat(GArray *table_data, BIOSLinker *linker, VirtGuestInfo *guest_info) } static void -build_mcfg(GArray *table_data, BIOSLinker *linker, VirtGuestInfo *guest_info) +build_mcfg(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { AcpiTableMcfg *mcfg; - const MemMapEntry *memmap = guest_info->memmap; + const MemMapEntry *memmap = vms->memmap; int len = sizeof(*mcfg) + sizeof(mcfg->allocation[0]); mcfg = acpi_data_push(table_data, len); @@ -535,24 +535,33 @@ build_mcfg(GArray *table_data, BIOSLinker *linker, VirtGuestInfo *guest_info) /* GTDT */ static void -build_gtdt(GArray *table_data, BIOSLinker *linker) +build_gtdt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { + VirtMachineClass *vmc = VIRT_MACHINE_GET_CLASS(vms); int gtdt_start = table_data->len; AcpiGenericTimerTable *gtdt; + uint32_t irqflags; + + if (vmc->claim_edge_triggered_timers) { + irqflags = ACPI_GTDT_INTERRUPT_MODE_EDGE; + } else { + irqflags = ACPI_GTDT_INTERRUPT_MODE_LEVEL; + } gtdt = acpi_data_push(table_data, sizeof *gtdt); /* The interrupt values are the same with the device tree when adding 16 */ - gtdt->secure_el1_interrupt = ARCH_TIMER_S_EL1_IRQ + 16; - gtdt->secure_el1_flags = ACPI_EDGE_SENSITIVE; + gtdt->secure_el1_interrupt = cpu_to_le32(ARCH_TIMER_S_EL1_IRQ + 16); + gtdt->secure_el1_flags = cpu_to_le32(irqflags); - gtdt->non_secure_el1_interrupt = ARCH_TIMER_NS_EL1_IRQ + 16; - gtdt->non_secure_el1_flags = ACPI_EDGE_SENSITIVE | ACPI_GTDT_ALWAYS_ON; + gtdt->non_secure_el1_interrupt = cpu_to_le32(ARCH_TIMER_NS_EL1_IRQ + 16); + gtdt->non_secure_el1_flags = cpu_to_le32(irqflags | + ACPI_GTDT_CAP_ALWAYS_ON); - gtdt->virtual_timer_interrupt = ARCH_TIMER_VIRT_IRQ + 16; - gtdt->virtual_timer_flags = ACPI_EDGE_SENSITIVE; + gtdt->virtual_timer_interrupt = cpu_to_le32(ARCH_TIMER_VIRT_IRQ + 16); + gtdt->virtual_timer_flags = cpu_to_le32(irqflags); - gtdt->non_secure_el2_interrupt = ARCH_TIMER_NS_EL2_IRQ + 16; - gtdt->non_secure_el2_flags = ACPI_EDGE_SENSITIVE; + gtdt->non_secure_el2_interrupt = cpu_to_le32(ARCH_TIMER_NS_EL2_IRQ + 16); + gtdt->non_secure_el2_flags = cpu_to_le32(irqflags); build_header(linker, table_data, (void *)(table_data->data + gtdt_start), "GTDT", @@ -561,11 +570,12 @@ build_gtdt(GArray *table_data, BIOSLinker *linker) /* MADT */ static void -build_madt(GArray *table_data, BIOSLinker *linker, VirtGuestInfo *guest_info) +build_madt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { + VirtMachineClass *vmc = VIRT_MACHINE_GET_CLASS(vms); int madt_start = table_data->len; - const MemMapEntry *memmap = guest_info->memmap; - const int *irqmap = guest_info->irqmap; + const MemMapEntry *memmap = vms->memmap; + const int *irqmap = vms->irqmap; AcpiMultipleApicTable *madt; AcpiMadtGenericDistributor *gicd; AcpiMadtGenericMsiFrame *gic_msi; @@ -576,30 +586,30 @@ build_madt(GArray *table_data, BIOSLinker *linker, VirtGuestInfo *guest_info) gicd = acpi_data_push(table_data, sizeof *gicd); gicd->type = ACPI_APIC_GENERIC_DISTRIBUTOR; gicd->length = sizeof(*gicd); - gicd->base_address = memmap[VIRT_GIC_DIST].base; - gicd->version = guest_info->gic_version; + gicd->base_address = cpu_to_le64(memmap[VIRT_GIC_DIST].base); + gicd->version = vms->gic_version; - for (i = 0; i < guest_info->smp_cpus; i++) { - AcpiMadtGenericInterrupt *gicc = acpi_data_push(table_data, - sizeof *gicc); + for (i = 0; i < vms->smp_cpus; i++) { + AcpiMadtGenericCpuInterface *gicc = acpi_data_push(table_data, + sizeof(*gicc)); ARMCPU *armcpu = ARM_CPU(qemu_get_cpu(i)); - gicc->type = ACPI_APIC_GENERIC_INTERRUPT; + gicc->type = ACPI_APIC_GENERIC_CPU_INTERFACE; gicc->length = sizeof(*gicc); - if (guest_info->gic_version == 2) { - gicc->base_address = memmap[VIRT_GIC_CPU].base; + if (vms->gic_version == 2) { + gicc->base_address = cpu_to_le64(memmap[VIRT_GIC_CPU].base); } - gicc->cpu_interface_number = i; - gicc->arm_mpidr = armcpu->mp_affinity; - gicc->uid = i; - gicc->flags = cpu_to_le32(ACPI_GICC_ENABLED); + gicc->cpu_interface_number = cpu_to_le32(i); + gicc->arm_mpidr = cpu_to_le64(armcpu->mp_affinity); + gicc->uid = cpu_to_le32(i); + gicc->flags = cpu_to_le32(ACPI_MADT_GICC_ENABLED); if (arm_feature(&armcpu->env, ARM_FEATURE_PMU)) { gicc->performance_interrupt = cpu_to_le32(PPI(VIRTUAL_PMU_IRQ)); } } - if (guest_info->gic_version == 3) { + if (vms->gic_version == 3) { AcpiMadtGenericTranslator *gic_its; AcpiMadtGenericRedistributor *gicr = acpi_data_push(table_data, sizeof *gicr); @@ -609,7 +619,7 @@ build_madt(GArray *table_data, BIOSLinker *linker, VirtGuestInfo *guest_info) gicr->base_address = cpu_to_le64(memmap[VIRT_GIC_REDIST].base); gicr->range_length = cpu_to_le32(memmap[VIRT_GIC_REDIST].size); - if (its_class_name() && !guest_info->no_its) { + if (its_class_name() && !vmc->no_its) { gic_its = acpi_data_push(table_data, sizeof *gic_its); gic_its->type = ACPI_APIC_GENERIC_TRANSLATOR; gic_its->length = sizeof(*gic_its); @@ -641,8 +651,8 @@ build_fadt(GArray *table_data, BIOSLinker *linker, unsigned dsdt_tbl_offset) /* Hardware Reduced = 1 and use PSCI 0.2+ and with HVC */ fadt->flags = cpu_to_le32(1 << ACPI_FADT_F_HW_REDUCED_ACPI); - fadt->arm_boot_flags = cpu_to_le16((1 << ACPI_FADT_ARM_USE_PSCI_G_0_2) | - (1 << ACPI_FADT_ARM_PSCI_USE_HVC)); + fadt->arm_boot_flags = cpu_to_le16(ACPI_FADT_ARM_PSCI_COMPLIANT | + ACPI_FADT_ARM_PSCI_USE_HVC); /* ACPI v5.1 (fadt->revision.fadt->minor_revision) */ fadt->minor_revision = 0x1; @@ -658,11 +668,11 @@ build_fadt(GArray *table_data, BIOSLinker *linker, unsigned dsdt_tbl_offset) /* DSDT */ static void -build_dsdt(GArray *table_data, BIOSLinker *linker, VirtGuestInfo *guest_info) +build_dsdt(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) { Aml *scope, *dsdt; - const MemMapEntry *memmap = guest_info->memmap; - const int *irqmap = guest_info->irqmap; + const MemMapEntry *memmap = vms->memmap; + const int *irqmap = vms->irqmap; dsdt = init_aml_allocator(); /* Reserve space for header */ @@ -674,7 +684,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, VirtGuestInfo *guest_info) * the RTC ACPI device at all when using UEFI. */ scope = aml_scope("\\_SB"); - acpi_dsdt_add_cpus(scope, guest_info->smp_cpus); + acpi_dsdt_add_cpus(scope, vms->smp_cpus); acpi_dsdt_add_uart(scope, &memmap[VIRT_UART], (irqmap[VIRT_UART] + ARM_SPI_BASE)); acpi_dsdt_add_flash(scope, &memmap[VIRT_FLASH]); @@ -682,7 +692,7 @@ build_dsdt(GArray *table_data, BIOSLinker *linker, VirtGuestInfo *guest_info) acpi_dsdt_add_virtio(scope, &memmap[VIRT_MMIO], (irqmap[VIRT_MMIO] + ARM_SPI_BASE), NUM_VIRTIO_TRANSPORTS); acpi_dsdt_add_pci(scope, memmap, (irqmap[VIRT_PCIE] + ARM_SPI_BASE), - guest_info->use_highmem); + vms->highmem); acpi_dsdt_add_gpio(scope, &memmap[VIRT_GPIO], (irqmap[VIRT_GPIO] + ARM_SPI_BASE)); acpi_dsdt_add_power_button(scope); @@ -705,12 +715,12 @@ struct AcpiBuildState { MemoryRegion *linker_mr; /* Is table patched? */ bool patched; - VirtGuestInfo *guest_info; } AcpiBuildState; static -void virt_acpi_build(VirtGuestInfo *guest_info, AcpiBuildTables *tables) +void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables) { + VirtMachineClass *vmc = VIRT_MACHINE_GET_CLASS(vms); GArray *table_offsets; unsigned dsdt, rsdt; GArray *tables_blob = tables->table_data; @@ -724,32 +734,32 @@ void virt_acpi_build(VirtGuestInfo *guest_info, AcpiBuildTables *tables) /* DSDT is pointed to by FADT */ dsdt = tables_blob->len; - build_dsdt(tables_blob, tables->linker, guest_info); + build_dsdt(tables_blob, tables->linker, vms); /* FADT MADT GTDT MCFG SPCR pointed to by RSDT */ acpi_add_table(table_offsets, tables_blob); build_fadt(tables_blob, tables->linker, dsdt); acpi_add_table(table_offsets, tables_blob); - build_madt(tables_blob, tables->linker, guest_info); + build_madt(tables_blob, tables->linker, vms); acpi_add_table(table_offsets, tables_blob); - build_gtdt(tables_blob, tables->linker); + build_gtdt(tables_blob, tables->linker, vms); acpi_add_table(table_offsets, tables_blob); - build_mcfg(tables_blob, tables->linker, guest_info); + build_mcfg(tables_blob, tables->linker, vms); acpi_add_table(table_offsets, tables_blob); - build_spcr(tables_blob, tables->linker, guest_info); + build_spcr(tables_blob, tables->linker, vms); if (nb_numa_nodes > 0) { acpi_add_table(table_offsets, tables_blob); - build_srat(tables_blob, tables->linker, guest_info); + build_srat(tables_blob, tables->linker, vms); } - if (its_class_name() && !guest_info->no_its) { + if (its_class_name() && !vmc->no_its) { acpi_add_table(table_offsets, tables_blob); - build_iort(tables_blob, tables->linker, guest_info); + build_iort(tables_blob, tables->linker); } /* RSDT is pointed to by RSDP */ @@ -788,13 +798,12 @@ static void virt_acpi_build_update(void *build_opaque) acpi_build_tables_init(&tables); - virt_acpi_build(build_state->guest_info, &tables); + virt_acpi_build(VIRT_MACHINE(qdev_get_machine()), &tables); acpi_ram_update(build_state->table_mr, tables.table_data); acpi_ram_update(build_state->rsdp_mr, tables.rsdp); acpi_ram_update(build_state->linker_mr, tables.linker->cmd_blob); - acpi_build_tables_cleanup(&tables, true); } @@ -822,12 +831,12 @@ static const VMStateDescription vmstate_virt_acpi_build = { }, }; -void virt_acpi_setup(VirtGuestInfo *guest_info) +void virt_acpi_setup(VirtMachineState *vms) { AcpiBuildTables tables; AcpiBuildState *build_state; - if (!guest_info->fw_cfg) { + if (!vms->fw_cfg) { trace_virt_acpi_setup(); return; } @@ -838,10 +847,9 @@ void virt_acpi_setup(VirtGuestInfo *guest_info) } build_state = g_malloc0(sizeof *build_state); - build_state->guest_info = guest_info; acpi_build_tables_init(&tables); - virt_acpi_build(build_state->guest_info, &tables); + virt_acpi_build(vms, &tables); /* Now expose it all to Guest */ build_state->table_mr = acpi_add_rom_blob(build_state, tables.table_data, @@ -853,8 +861,8 @@ void virt_acpi_setup(VirtGuestInfo *guest_info) acpi_add_rom_blob(build_state, tables.linker->cmd_blob, "etc/table-loader", 0); - fw_cfg_add_file(guest_info->fw_cfg, ACPI_BUILD_TPMLOG_FILE, - tables.tcpalog->data, acpi_data_len(tables.tcpalog)); + fw_cfg_add_file(vms->fw_cfg, ACPI_BUILD_TPMLOG_FILE, tables.tcpalog->data, + acpi_data_len(tables.tcpalog)); build_state->rsdp_mr = acpi_add_rom_blob(build_state, tables.rsdp, ACPI_BUILD_RSDP_FILE, 0); diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 11c53a56e0..7a03f84051 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -41,14 +41,12 @@ #include "sysemu/numa.h" #include "sysemu/sysemu.h" #include "sysemu/kvm.h" -#include "hw/boards.h" #include "hw/compat.h" #include "hw/loader.h" #include "exec/address-spaces.h" #include "qemu/bitops.h" #include "qemu/error-report.h" #include "hw/pci-host/gpex.h" -#include "hw/arm/virt-acpi-build.h" #include "hw/arm/sysbus-fdt.h" #include "hw/platform-bus.h" #include "hw/arm/fdt.h" @@ -59,51 +57,6 @@ #include "qapi/visitor.h" #include "standard-headers/linux/input.h" -/* Number of external interrupt lines to configure the GIC with */ -#define NUM_IRQS 256 - -#define PLATFORM_BUS_NUM_IRQS 64 - -static ARMPlatformBusSystemParams platform_bus_params; - -typedef struct VirtBoardInfo { - struct arm_boot_info bootinfo; - const char *cpu_model; - const MemMapEntry *memmap; - const int *irqmap; - int smp_cpus; - void *fdt; - int fdt_size; - uint32_t clock_phandle; - uint32_t gic_phandle; - uint32_t msi_phandle; - bool using_psci; -} VirtBoardInfo; - -typedef struct { - MachineClass parent; - VirtBoardInfo *daughterboard; - bool disallow_affinity_adjustment; - bool no_its; - bool no_pmu; -} VirtMachineClass; - -typedef struct { - MachineState parent; - bool secure; - bool highmem; - int32_t gic_version; -} VirtMachineState; - -#define TYPE_VIRT_MACHINE MACHINE_TYPE_NAME("virt") -#define VIRT_MACHINE(obj) \ - OBJECT_CHECK(VirtMachineState, (obj), TYPE_VIRT_MACHINE) -#define VIRT_MACHINE_GET_CLASS(obj) \ - OBJECT_GET_CLASS(VirtMachineClass, obj, TYPE_VIRT_MACHINE) -#define VIRT_MACHINE_CLASS(klass) \ - OBJECT_CLASS_CHECK(VirtMachineClass, klass, TYPE_VIRT_MACHINE) - - #define DEFINE_VIRT_MACHINE_LATEST(major, minor, latest) \ static void virt_##major##_##minor##_class_init(ObjectClass *oc, \ void *data) \ @@ -133,6 +86,13 @@ typedef struct { DEFINE_VIRT_MACHINE_LATEST(major, minor, false) +/* Number of external interrupt lines to configure the GIC with */ +#define NUM_IRQS 256 + +#define PLATFORM_BUS_NUM_IRQS 64 + +static ARMPlatformBusSystemParams platform_bus_params; + /* RAM limit in GB. Since VIRT_MEM starts at the 1GB mark, this means * RAM can go up to the 256GB mark, leaving 256GB of the physical * address space unallocated and free for future use between 256G and 512G. @@ -202,51 +162,36 @@ static const int a15irqmap[] = { [VIRT_PLATFORM_BUS] = 112, /* ...to 112 + PLATFORM_BUS_NUM_IRQS -1 */ }; -static VirtBoardInfo machines[] = { - { - .cpu_model = "cortex-a15", - .memmap = a15memmap, - .irqmap = a15irqmap, - }, - { - .cpu_model = "cortex-a53", - .memmap = a15memmap, - .irqmap = a15irqmap, - }, - { - .cpu_model = "cortex-a57", - .memmap = a15memmap, - .irqmap = a15irqmap, - }, - { - .cpu_model = "host", - .memmap = a15memmap, - .irqmap = a15irqmap, - }, +static const char *valid_cpus[] = { + "cortex-a15", + "cortex-a53", + "cortex-a57", + "host", + NULL }; -static VirtBoardInfo *find_machine_info(const char *cpu) +static bool cpuname_valid(const char *cpu) { int i; - for (i = 0; i < ARRAY_SIZE(machines); i++) { - if (strcmp(cpu, machines[i].cpu_model) == 0) { - return &machines[i]; + for (i = 0; i < ARRAY_SIZE(valid_cpus); i++) { + if (strcmp(cpu, valid_cpus[i]) == 0) { + return true; } } - return NULL; + return false; } -static void create_fdt(VirtBoardInfo *vbi) +static void create_fdt(VirtMachineState *vms) { - void *fdt = create_device_tree(&vbi->fdt_size); + void *fdt = create_device_tree(&vms->fdt_size); if (!fdt) { error_report("create_device_tree() failed"); exit(1); } - vbi->fdt = fdt; + vms->fdt = fdt; /* Header */ qemu_fdt_setprop_string(fdt, "/", "compatible", "linux,dummy-virt"); @@ -266,27 +211,27 @@ static void create_fdt(VirtBoardInfo *vbi) * optional but in practice if you omit them the kernel refuses to * probe for the device. */ - vbi->clock_phandle = qemu_fdt_alloc_phandle(fdt); + vms->clock_phandle = qemu_fdt_alloc_phandle(fdt); qemu_fdt_add_subnode(fdt, "/apb-pclk"); qemu_fdt_setprop_string(fdt, "/apb-pclk", "compatible", "fixed-clock"); qemu_fdt_setprop_cell(fdt, "/apb-pclk", "#clock-cells", 0x0); qemu_fdt_setprop_cell(fdt, "/apb-pclk", "clock-frequency", 24000000); qemu_fdt_setprop_string(fdt, "/apb-pclk", "clock-output-names", "clk24mhz"); - qemu_fdt_setprop_cell(fdt, "/apb-pclk", "phandle", vbi->clock_phandle); + qemu_fdt_setprop_cell(fdt, "/apb-pclk", "phandle", vms->clock_phandle); } -static void fdt_add_psci_node(const VirtBoardInfo *vbi) +static void fdt_add_psci_node(const VirtMachineState *vms) { uint32_t cpu_suspend_fn; uint32_t cpu_off_fn; uint32_t cpu_on_fn; uint32_t migrate_fn; - void *fdt = vbi->fdt; + void *fdt = vms->fdt; ARMCPU *armcpu = ARM_CPU(qemu_get_cpu(0)); - if (!vbi->using_psci) { + if (!vms->using_psci) { return; } @@ -327,41 +272,60 @@ static void fdt_add_psci_node(const VirtBoardInfo *vbi) qemu_fdt_setprop_cell(fdt, "/psci", "migrate", migrate_fn); } -static void fdt_add_timer_nodes(const VirtBoardInfo *vbi, int gictype) +static void fdt_add_timer_nodes(const VirtMachineState *vms) { - /* Note that on A15 h/w these interrupts are level-triggered, - * but for the GIC implementation provided by both QEMU and KVM - * they are edge-triggered. + /* On real hardware these interrupts are level-triggered. + * On KVM they were edge-triggered before host kernel version 4.4, + * and level-triggered afterwards. + * On emulated QEMU they are level-triggered. + * + * Getting the DTB info about them wrong is awkward for some + * guest kernels: + * pre-4.8 ignore the DT and leave the interrupt configured + * with whatever the GIC reset value (or the bootloader) left it at + * 4.8 before rc6 honour the incorrect data by programming it back + * into the GIC, causing problems + * 4.8rc6 and later ignore the DT and always write "level triggered" + * into the GIC + * + * For backwards-compatibility, virt-2.8 and earlier will continue + * to say these are edge-triggered, but later machines will report + * the correct information. */ ARMCPU *armcpu; - uint32_t irqflags = GIC_FDT_IRQ_FLAGS_EDGE_LO_HI; + VirtMachineClass *vmc = VIRT_MACHINE_GET_CLASS(vms); + uint32_t irqflags = GIC_FDT_IRQ_FLAGS_LEVEL_HI; + + if (vmc->claim_edge_triggered_timers) { + irqflags = GIC_FDT_IRQ_FLAGS_EDGE_LO_HI; + } - if (gictype == 2) { + if (vms->gic_version == 2) { irqflags = deposit32(irqflags, GIC_FDT_IRQ_PPI_CPU_START, GIC_FDT_IRQ_PPI_CPU_WIDTH, - (1 << vbi->smp_cpus) - 1); + (1 << vms->smp_cpus) - 1); } - qemu_fdt_add_subnode(vbi->fdt, "/timer"); + qemu_fdt_add_subnode(vms->fdt, "/timer"); armcpu = ARM_CPU(qemu_get_cpu(0)); if (arm_feature(&armcpu->env, ARM_FEATURE_V8)) { const char compat[] = "arm,armv8-timer\0arm,armv7-timer"; - qemu_fdt_setprop(vbi->fdt, "/timer", "compatible", + qemu_fdt_setprop(vms->fdt, "/timer", "compatible", compat, sizeof(compat)); } else { - qemu_fdt_setprop_string(vbi->fdt, "/timer", "compatible", + qemu_fdt_setprop_string(vms->fdt, "/timer", "compatible", "arm,armv7-timer"); } - qemu_fdt_setprop(vbi->fdt, "/timer", "always-on", NULL, 0); - qemu_fdt_setprop_cells(vbi->fdt, "/timer", "interrupts", + qemu_fdt_setprop(vms->fdt, "/timer", "always-on", NULL, 0); + qemu_fdt_setprop_cells(vms->fdt, "/timer", "interrupts", GIC_FDT_IRQ_TYPE_PPI, ARCH_TIMER_S_EL1_IRQ, irqflags, GIC_FDT_IRQ_TYPE_PPI, ARCH_TIMER_NS_EL1_IRQ, irqflags, GIC_FDT_IRQ_TYPE_PPI, ARCH_TIMER_VIRT_IRQ, irqflags, GIC_FDT_IRQ_TYPE_PPI, ARCH_TIMER_NS_EL2_IRQ, irqflags); } -static void fdt_add_cpu_nodes(const VirtBoardInfo *vbi) +static void fdt_add_cpu_nodes(const VirtMachineState *vms) { int cpu; int addr_cells = 1; @@ -380,7 +344,7 @@ static void fdt_add_cpu_nodes(const VirtBoardInfo *vbi) * The simplest way to go is to examine affinity IDs of all our CPUs. If * at least one of them has Aff3 populated, we set #address-cells to 2. */ - for (cpu = 0; cpu < vbi->smp_cpus; cpu++) { + for (cpu = 0; cpu < vms->smp_cpus; cpu++) { ARMCPU *armcpu = ARM_CPU(qemu_get_cpu(cpu)); if (armcpu->mp_affinity & ARM_AFF3_MASK) { @@ -389,101 +353,101 @@ static void fdt_add_cpu_nodes(const VirtBoardInfo *vbi) } } - qemu_fdt_add_subnode(vbi->fdt, "/cpus"); - qemu_fdt_setprop_cell(vbi->fdt, "/cpus", "#address-cells", addr_cells); - qemu_fdt_setprop_cell(vbi->fdt, "/cpus", "#size-cells", 0x0); + qemu_fdt_add_subnode(vms->fdt, "/cpus"); + qemu_fdt_setprop_cell(vms->fdt, "/cpus", "#address-cells", addr_cells); + qemu_fdt_setprop_cell(vms->fdt, "/cpus", "#size-cells", 0x0); - for (cpu = vbi->smp_cpus - 1; cpu >= 0; cpu--) { + for (cpu = vms->smp_cpus - 1; cpu >= 0; cpu--) { char *nodename = g_strdup_printf("/cpus/cpu@%d", cpu); ARMCPU *armcpu = ARM_CPU(qemu_get_cpu(cpu)); - qemu_fdt_add_subnode(vbi->fdt, nodename); - qemu_fdt_setprop_string(vbi->fdt, nodename, "device_type", "cpu"); - qemu_fdt_setprop_string(vbi->fdt, nodename, "compatible", + qemu_fdt_add_subnode(vms->fdt, nodename); + qemu_fdt_setprop_string(vms->fdt, nodename, "device_type", "cpu"); + qemu_fdt_setprop_string(vms->fdt, nodename, "compatible", armcpu->dtb_compatible); - if (vbi->using_psci && vbi->smp_cpus > 1) { - qemu_fdt_setprop_string(vbi->fdt, nodename, + if (vms->using_psci && vms->smp_cpus > 1) { + qemu_fdt_setprop_string(vms->fdt, nodename, "enable-method", "psci"); } if (addr_cells == 2) { - qemu_fdt_setprop_u64(vbi->fdt, nodename, "reg", + qemu_fdt_setprop_u64(vms->fdt, nodename, "reg", armcpu->mp_affinity); } else { - qemu_fdt_setprop_cell(vbi->fdt, nodename, "reg", + qemu_fdt_setprop_cell(vms->fdt, nodename, "reg", armcpu->mp_affinity); } i = numa_get_node_for_cpu(cpu); if (i < nb_numa_nodes) { - qemu_fdt_setprop_cell(vbi->fdt, nodename, "numa-node-id", i); + qemu_fdt_setprop_cell(vms->fdt, nodename, "numa-node-id", i); } g_free(nodename); } } -static void fdt_add_its_gic_node(VirtBoardInfo *vbi) +static void fdt_add_its_gic_node(VirtMachineState *vms) { - vbi->msi_phandle = qemu_fdt_alloc_phandle(vbi->fdt); - qemu_fdt_add_subnode(vbi->fdt, "/intc/its"); - qemu_fdt_setprop_string(vbi->fdt, "/intc/its", "compatible", + vms->msi_phandle = qemu_fdt_alloc_phandle(vms->fdt); + qemu_fdt_add_subnode(vms->fdt, "/intc/its"); + qemu_fdt_setprop_string(vms->fdt, "/intc/its", "compatible", "arm,gic-v3-its"); - qemu_fdt_setprop(vbi->fdt, "/intc/its", "msi-controller", NULL, 0); - qemu_fdt_setprop_sized_cells(vbi->fdt, "/intc/its", "reg", - 2, vbi->memmap[VIRT_GIC_ITS].base, - 2, vbi->memmap[VIRT_GIC_ITS].size); - qemu_fdt_setprop_cell(vbi->fdt, "/intc/its", "phandle", vbi->msi_phandle); + qemu_fdt_setprop(vms->fdt, "/intc/its", "msi-controller", NULL, 0); + qemu_fdt_setprop_sized_cells(vms->fdt, "/intc/its", "reg", + 2, vms->memmap[VIRT_GIC_ITS].base, + 2, vms->memmap[VIRT_GIC_ITS].size); + qemu_fdt_setprop_cell(vms->fdt, "/intc/its", "phandle", vms->msi_phandle); } -static void fdt_add_v2m_gic_node(VirtBoardInfo *vbi) +static void fdt_add_v2m_gic_node(VirtMachineState *vms) { - vbi->msi_phandle = qemu_fdt_alloc_phandle(vbi->fdt); - qemu_fdt_add_subnode(vbi->fdt, "/intc/v2m"); - qemu_fdt_setprop_string(vbi->fdt, "/intc/v2m", "compatible", + vms->msi_phandle = qemu_fdt_alloc_phandle(vms->fdt); + qemu_fdt_add_subnode(vms->fdt, "/intc/v2m"); + qemu_fdt_setprop_string(vms->fdt, "/intc/v2m", "compatible", "arm,gic-v2m-frame"); - qemu_fdt_setprop(vbi->fdt, "/intc/v2m", "msi-controller", NULL, 0); - qemu_fdt_setprop_sized_cells(vbi->fdt, "/intc/v2m", "reg", - 2, vbi->memmap[VIRT_GIC_V2M].base, - 2, vbi->memmap[VIRT_GIC_V2M].size); - qemu_fdt_setprop_cell(vbi->fdt, "/intc/v2m", "phandle", vbi->msi_phandle); + qemu_fdt_setprop(vms->fdt, "/intc/v2m", "msi-controller", NULL, 0); + qemu_fdt_setprop_sized_cells(vms->fdt, "/intc/v2m", "reg", + 2, vms->memmap[VIRT_GIC_V2M].base, + 2, vms->memmap[VIRT_GIC_V2M].size); + qemu_fdt_setprop_cell(vms->fdt, "/intc/v2m", "phandle", vms->msi_phandle); } -static void fdt_add_gic_node(VirtBoardInfo *vbi, int type) +static void fdt_add_gic_node(VirtMachineState *vms) { - vbi->gic_phandle = qemu_fdt_alloc_phandle(vbi->fdt); - qemu_fdt_setprop_cell(vbi->fdt, "/", "interrupt-parent", vbi->gic_phandle); - - qemu_fdt_add_subnode(vbi->fdt, "/intc"); - qemu_fdt_setprop_cell(vbi->fdt, "/intc", "#interrupt-cells", 3); - qemu_fdt_setprop(vbi->fdt, "/intc", "interrupt-controller", NULL, 0); - qemu_fdt_setprop_cell(vbi->fdt, "/intc", "#address-cells", 0x2); - qemu_fdt_setprop_cell(vbi->fdt, "/intc", "#size-cells", 0x2); - qemu_fdt_setprop(vbi->fdt, "/intc", "ranges", NULL, 0); - if (type == 3) { - qemu_fdt_setprop_string(vbi->fdt, "/intc", "compatible", + vms->gic_phandle = qemu_fdt_alloc_phandle(vms->fdt); + qemu_fdt_setprop_cell(vms->fdt, "/", "interrupt-parent", vms->gic_phandle); + + qemu_fdt_add_subnode(vms->fdt, "/intc"); + qemu_fdt_setprop_cell(vms->fdt, "/intc", "#interrupt-cells", 3); + qemu_fdt_setprop(vms->fdt, "/intc", "interrupt-controller", NULL, 0); + qemu_fdt_setprop_cell(vms->fdt, "/intc", "#address-cells", 0x2); + qemu_fdt_setprop_cell(vms->fdt, "/intc", "#size-cells", 0x2); + qemu_fdt_setprop(vms->fdt, "/intc", "ranges", NULL, 0); + if (vms->gic_version == 3) { + qemu_fdt_setprop_string(vms->fdt, "/intc", "compatible", "arm,gic-v3"); - qemu_fdt_setprop_sized_cells(vbi->fdt, "/intc", "reg", - 2, vbi->memmap[VIRT_GIC_DIST].base, - 2, vbi->memmap[VIRT_GIC_DIST].size, - 2, vbi->memmap[VIRT_GIC_REDIST].base, - 2, vbi->memmap[VIRT_GIC_REDIST].size); + qemu_fdt_setprop_sized_cells(vms->fdt, "/intc", "reg", + 2, vms->memmap[VIRT_GIC_DIST].base, + 2, vms->memmap[VIRT_GIC_DIST].size, + 2, vms->memmap[VIRT_GIC_REDIST].base, + 2, vms->memmap[VIRT_GIC_REDIST].size); } else { /* 'cortex-a15-gic' means 'GIC v2' */ - qemu_fdt_setprop_string(vbi->fdt, "/intc", "compatible", + qemu_fdt_setprop_string(vms->fdt, "/intc", "compatible", "arm,cortex-a15-gic"); - qemu_fdt_setprop_sized_cells(vbi->fdt, "/intc", "reg", - 2, vbi->memmap[VIRT_GIC_DIST].base, - 2, vbi->memmap[VIRT_GIC_DIST].size, - 2, vbi->memmap[VIRT_GIC_CPU].base, - 2, vbi->memmap[VIRT_GIC_CPU].size); + qemu_fdt_setprop_sized_cells(vms->fdt, "/intc", "reg", + 2, vms->memmap[VIRT_GIC_DIST].base, + 2, vms->memmap[VIRT_GIC_DIST].size, + 2, vms->memmap[VIRT_GIC_CPU].base, + 2, vms->memmap[VIRT_GIC_CPU].size); } - qemu_fdt_setprop_cell(vbi->fdt, "/intc", "phandle", vbi->gic_phandle); + qemu_fdt_setprop_cell(vms->fdt, "/intc", "phandle", vms->gic_phandle); } -static void fdt_add_pmu_nodes(const VirtBoardInfo *vbi, int gictype) +static void fdt_add_pmu_nodes(const VirtMachineState *vms) { CPUState *cpu; ARMCPU *armcpu; @@ -497,24 +461,24 @@ static void fdt_add_pmu_nodes(const VirtBoardInfo *vbi, int gictype) } } - if (gictype == 2) { + if (vms->gic_version == 2) { irqflags = deposit32(irqflags, GIC_FDT_IRQ_PPI_CPU_START, GIC_FDT_IRQ_PPI_CPU_WIDTH, - (1 << vbi->smp_cpus) - 1); + (1 << vms->smp_cpus) - 1); } armcpu = ARM_CPU(qemu_get_cpu(0)); - qemu_fdt_add_subnode(vbi->fdt, "/pmu"); + qemu_fdt_add_subnode(vms->fdt, "/pmu"); if (arm_feature(&armcpu->env, ARM_FEATURE_V8)) { const char compat[] = "arm,armv8-pmuv3"; - qemu_fdt_setprop(vbi->fdt, "/pmu", "compatible", + qemu_fdt_setprop(vms->fdt, "/pmu", "compatible", compat, sizeof(compat)); - qemu_fdt_setprop_cells(vbi->fdt, "/pmu", "interrupts", + qemu_fdt_setprop_cells(vms->fdt, "/pmu", "interrupts", GIC_FDT_IRQ_TYPE_PPI, VIRTUAL_PMU_IRQ, irqflags); } } -static void create_its(VirtBoardInfo *vbi, DeviceState *gicdev) +static void create_its(VirtMachineState *vms, DeviceState *gicdev) { const char *itsclass = its_class_name(); DeviceState *dev; @@ -529,19 +493,19 @@ static void create_its(VirtBoardInfo *vbi, DeviceState *gicdev) object_property_set_link(OBJECT(dev), OBJECT(gicdev), "parent-gicv3", &error_abort); qdev_init_nofail(dev); - sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, vbi->memmap[VIRT_GIC_ITS].base); + sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, vms->memmap[VIRT_GIC_ITS].base); - fdt_add_its_gic_node(vbi); + fdt_add_its_gic_node(vms); } -static void create_v2m(VirtBoardInfo *vbi, qemu_irq *pic) +static void create_v2m(VirtMachineState *vms, qemu_irq *pic) { int i; - int irq = vbi->irqmap[VIRT_GIC_V2M]; + int irq = vms->irqmap[VIRT_GIC_V2M]; DeviceState *dev; dev = qdev_create(NULL, "arm-gicv2m"); - sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, vbi->memmap[VIRT_GIC_V2M].base); + sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, vms->memmap[VIRT_GIC_V2M].base); qdev_prop_set_uint32(dev, "base-spi", irq); qdev_prop_set_uint32(dev, "num-spi", NUM_GICV2M_SPIS); qdev_init_nofail(dev); @@ -550,17 +514,17 @@ static void create_v2m(VirtBoardInfo *vbi, qemu_irq *pic) sysbus_connect_irq(SYS_BUS_DEVICE(dev), i, pic[irq + i]); } - fdt_add_v2m_gic_node(vbi); + fdt_add_v2m_gic_node(vms); } -static void create_gic(VirtBoardInfo *vbi, qemu_irq *pic, int type, - bool secure, bool no_its) +static void create_gic(VirtMachineState *vms, qemu_irq *pic) { /* We create a standalone GIC */ + VirtMachineClass *vmc = VIRT_MACHINE_GET_CLASS(vms); DeviceState *gicdev; SysBusDevice *gicbusdev; const char *gictype; - int i; + int type = vms->gic_version, i; gictype = (type == 3) ? gicv3_class_name() : gic_class_name(); @@ -572,15 +536,15 @@ static void create_gic(VirtBoardInfo *vbi, qemu_irq *pic, int type, */ qdev_prop_set_uint32(gicdev, "num-irq", NUM_IRQS + 32); if (!kvm_irqchip_in_kernel()) { - qdev_prop_set_bit(gicdev, "has-security-extensions", secure); + qdev_prop_set_bit(gicdev, "has-security-extensions", vms->secure); } qdev_init_nofail(gicdev); gicbusdev = SYS_BUS_DEVICE(gicdev); - sysbus_mmio_map(gicbusdev, 0, vbi->memmap[VIRT_GIC_DIST].base); + sysbus_mmio_map(gicbusdev, 0, vms->memmap[VIRT_GIC_DIST].base); if (type == 3) { - sysbus_mmio_map(gicbusdev, 1, vbi->memmap[VIRT_GIC_REDIST].base); + sysbus_mmio_map(gicbusdev, 1, vms->memmap[VIRT_GIC_REDIST].base); } else { - sysbus_mmio_map(gicbusdev, 1, vbi->memmap[VIRT_GIC_CPU].base); + sysbus_mmio_map(gicbusdev, 1, vms->memmap[VIRT_GIC_CPU].base); } /* Wire the outputs from each CPU's generic timer to the @@ -616,22 +580,22 @@ static void create_gic(VirtBoardInfo *vbi, qemu_irq *pic, int type, pic[i] = qdev_get_gpio_in(gicdev, i); } - fdt_add_gic_node(vbi, type); + fdt_add_gic_node(vms); - if (type == 3 && !no_its) { - create_its(vbi, gicdev); + if (type == 3 && !vmc->no_its) { + create_its(vms, gicdev); } else if (type == 2) { - create_v2m(vbi, pic); + create_v2m(vms, pic); } } -static void create_uart(const VirtBoardInfo *vbi, qemu_irq *pic, int uart, +static void create_uart(const VirtMachineState *vms, qemu_irq *pic, int uart, MemoryRegion *mem, CharDriverState *chr) { char *nodename; - hwaddr base = vbi->memmap[uart].base; - hwaddr size = vbi->memmap[uart].size; - int irq = vbi->irqmap[uart]; + hwaddr base = vms->memmap[uart].base; + hwaddr size = vms->memmap[uart].size; + int irq = vms->irqmap[uart]; const char compat[] = "arm,pl011\0arm,primecell"; const char clocknames[] = "uartclk\0apb_pclk"; DeviceState *dev = qdev_create(NULL, "pl011"); @@ -644,51 +608,51 @@ static void create_uart(const VirtBoardInfo *vbi, qemu_irq *pic, int uart, sysbus_connect_irq(s, 0, pic[irq]); nodename = g_strdup_printf("/pl011@%" PRIx64, base); - qemu_fdt_add_subnode(vbi->fdt, nodename); + qemu_fdt_add_subnode(vms->fdt, nodename); /* Note that we can't use setprop_string because of the embedded NUL */ - qemu_fdt_setprop(vbi->fdt, nodename, "compatible", + qemu_fdt_setprop(vms->fdt, nodename, "compatible", compat, sizeof(compat)); - qemu_fdt_setprop_sized_cells(vbi->fdt, nodename, "reg", + qemu_fdt_setprop_sized_cells(vms->fdt, nodename, "reg", 2, base, 2, size); - qemu_fdt_setprop_cells(vbi->fdt, nodename, "interrupts", + qemu_fdt_setprop_cells(vms->fdt, nodename, "interrupts", GIC_FDT_IRQ_TYPE_SPI, irq, GIC_FDT_IRQ_FLAGS_LEVEL_HI); - qemu_fdt_setprop_cells(vbi->fdt, nodename, "clocks", - vbi->clock_phandle, vbi->clock_phandle); - qemu_fdt_setprop(vbi->fdt, nodename, "clock-names", + qemu_fdt_setprop_cells(vms->fdt, nodename, "clocks", + vms->clock_phandle, vms->clock_phandle); + qemu_fdt_setprop(vms->fdt, nodename, "clock-names", clocknames, sizeof(clocknames)); if (uart == VIRT_UART) { - qemu_fdt_setprop_string(vbi->fdt, "/chosen", "stdout-path", nodename); + qemu_fdt_setprop_string(vms->fdt, "/chosen", "stdout-path", nodename); } else { /* Mark as not usable by the normal world */ - qemu_fdt_setprop_string(vbi->fdt, nodename, "status", "disabled"); - qemu_fdt_setprop_string(vbi->fdt, nodename, "secure-status", "okay"); + qemu_fdt_setprop_string(vms->fdt, nodename, "status", "disabled"); + qemu_fdt_setprop_string(vms->fdt, nodename, "secure-status", "okay"); } g_free(nodename); } -static void create_rtc(const VirtBoardInfo *vbi, qemu_irq *pic) +static void create_rtc(const VirtMachineState *vms, qemu_irq *pic) { char *nodename; - hwaddr base = vbi->memmap[VIRT_RTC].base; - hwaddr size = vbi->memmap[VIRT_RTC].size; - int irq = vbi->irqmap[VIRT_RTC]; + hwaddr base = vms->memmap[VIRT_RTC].base; + hwaddr size = vms->memmap[VIRT_RTC].size; + int irq = vms->irqmap[VIRT_RTC]; const char compat[] = "arm,pl031\0arm,primecell"; sysbus_create_simple("pl031", base, pic[irq]); nodename = g_strdup_printf("/pl031@%" PRIx64, base); - qemu_fdt_add_subnode(vbi->fdt, nodename); - qemu_fdt_setprop(vbi->fdt, nodename, "compatible", compat, sizeof(compat)); - qemu_fdt_setprop_sized_cells(vbi->fdt, nodename, "reg", + qemu_fdt_add_subnode(vms->fdt, nodename); + qemu_fdt_setprop(vms->fdt, nodename, "compatible", compat, sizeof(compat)); + qemu_fdt_setprop_sized_cells(vms->fdt, nodename, "reg", 2, base, 2, size); - qemu_fdt_setprop_cells(vbi->fdt, nodename, "interrupts", + qemu_fdt_setprop_cells(vms->fdt, nodename, "interrupts", GIC_FDT_IRQ_TYPE_SPI, irq, GIC_FDT_IRQ_FLAGS_LEVEL_HI); - qemu_fdt_setprop_cell(vbi->fdt, nodename, "clocks", vbi->clock_phandle); - qemu_fdt_setprop_string(vbi->fdt, nodename, "clock-names", "apb_pclk"); + qemu_fdt_setprop_cell(vms->fdt, nodename, "clocks", vms->clock_phandle); + qemu_fdt_setprop_string(vms->fdt, nodename, "clock-names", "apb_pclk"); g_free(nodename); } @@ -703,45 +667,45 @@ static Notifier virt_system_powerdown_notifier = { .notify = virt_powerdown_req }; -static void create_gpio(const VirtBoardInfo *vbi, qemu_irq *pic) +static void create_gpio(const VirtMachineState *vms, qemu_irq *pic) { char *nodename; DeviceState *pl061_dev; - hwaddr base = vbi->memmap[VIRT_GPIO].base; - hwaddr size = vbi->memmap[VIRT_GPIO].size; - int irq = vbi->irqmap[VIRT_GPIO]; + hwaddr base = vms->memmap[VIRT_GPIO].base; + hwaddr size = vms->memmap[VIRT_GPIO].size; + int irq = vms->irqmap[VIRT_GPIO]; const char compat[] = "arm,pl061\0arm,primecell"; pl061_dev = sysbus_create_simple("pl061", base, pic[irq]); - uint32_t phandle = qemu_fdt_alloc_phandle(vbi->fdt); + uint32_t phandle = qemu_fdt_alloc_phandle(vms->fdt); nodename = g_strdup_printf("/pl061@%" PRIx64, base); - qemu_fdt_add_subnode(vbi->fdt, nodename); - qemu_fdt_setprop_sized_cells(vbi->fdt, nodename, "reg", + qemu_fdt_add_subnode(vms->fdt, nodename); + qemu_fdt_setprop_sized_cells(vms->fdt, nodename, "reg", 2, base, 2, size); - qemu_fdt_setprop(vbi->fdt, nodename, "compatible", compat, sizeof(compat)); - qemu_fdt_setprop_cell(vbi->fdt, nodename, "#gpio-cells", 2); - qemu_fdt_setprop(vbi->fdt, nodename, "gpio-controller", NULL, 0); - qemu_fdt_setprop_cells(vbi->fdt, nodename, "interrupts", + qemu_fdt_setprop(vms->fdt, nodename, "compatible", compat, sizeof(compat)); + qemu_fdt_setprop_cell(vms->fdt, nodename, "#gpio-cells", 2); + qemu_fdt_setprop(vms->fdt, nodename, "gpio-controller", NULL, 0); + qemu_fdt_setprop_cells(vms->fdt, nodename, "interrupts", GIC_FDT_IRQ_TYPE_SPI, irq, GIC_FDT_IRQ_FLAGS_LEVEL_HI); - qemu_fdt_setprop_cell(vbi->fdt, nodename, "clocks", vbi->clock_phandle); - qemu_fdt_setprop_string(vbi->fdt, nodename, "clock-names", "apb_pclk"); - qemu_fdt_setprop_cell(vbi->fdt, nodename, "phandle", phandle); + qemu_fdt_setprop_cell(vms->fdt, nodename, "clocks", vms->clock_phandle); + qemu_fdt_setprop_string(vms->fdt, nodename, "clock-names", "apb_pclk"); + qemu_fdt_setprop_cell(vms->fdt, nodename, "phandle", phandle); gpio_key_dev = sysbus_create_simple("gpio-key", -1, qdev_get_gpio_in(pl061_dev, 3)); - qemu_fdt_add_subnode(vbi->fdt, "/gpio-keys"); - qemu_fdt_setprop_string(vbi->fdt, "/gpio-keys", "compatible", "gpio-keys"); - qemu_fdt_setprop_cell(vbi->fdt, "/gpio-keys", "#size-cells", 0); - qemu_fdt_setprop_cell(vbi->fdt, "/gpio-keys", "#address-cells", 1); + qemu_fdt_add_subnode(vms->fdt, "/gpio-keys"); + qemu_fdt_setprop_string(vms->fdt, "/gpio-keys", "compatible", "gpio-keys"); + qemu_fdt_setprop_cell(vms->fdt, "/gpio-keys", "#size-cells", 0); + qemu_fdt_setprop_cell(vms->fdt, "/gpio-keys", "#address-cells", 1); - qemu_fdt_add_subnode(vbi->fdt, "/gpio-keys/poweroff"); - qemu_fdt_setprop_string(vbi->fdt, "/gpio-keys/poweroff", + qemu_fdt_add_subnode(vms->fdt, "/gpio-keys/poweroff"); + qemu_fdt_setprop_string(vms->fdt, "/gpio-keys/poweroff", "label", "GPIO Key Poweroff"); - qemu_fdt_setprop_cell(vbi->fdt, "/gpio-keys/poweroff", "linux,code", + qemu_fdt_setprop_cell(vms->fdt, "/gpio-keys/poweroff", "linux,code", KEY_POWER); - qemu_fdt_setprop_cells(vbi->fdt, "/gpio-keys/poweroff", + qemu_fdt_setprop_cells(vms->fdt, "/gpio-keys/poweroff", "gpios", phandle, 3, 0); /* connect powerdown request */ @@ -750,10 +714,10 @@ static void create_gpio(const VirtBoardInfo *vbi, qemu_irq *pic) g_free(nodename); } -static void create_virtio_devices(const VirtBoardInfo *vbi, qemu_irq *pic) +static void create_virtio_devices(const VirtMachineState *vms, qemu_irq *pic) { int i; - hwaddr size = vbi->memmap[VIRT_MMIO].size; + hwaddr size = vms->memmap[VIRT_MMIO].size; /* We create the transports in forwards order. Since qbus_realize() * prepends (not appends) new child buses, the incrementing loop below will @@ -783,8 +747,8 @@ static void create_virtio_devices(const VirtBoardInfo *vbi, qemu_irq *pic) * of disks users must use UUIDs or similar mechanisms. */ for (i = 0; i < NUM_VIRTIO_TRANSPORTS; i++) { - int irq = vbi->irqmap[VIRT_MMIO] + i; - hwaddr base = vbi->memmap[VIRT_MMIO].base + i * size; + int irq = vms->irqmap[VIRT_MMIO] + i; + hwaddr base = vms->memmap[VIRT_MMIO].base + i * size; sysbus_create_simple("virtio-mmio", base, pic[irq]); } @@ -798,16 +762,16 @@ static void create_virtio_devices(const VirtBoardInfo *vbi, qemu_irq *pic) */ for (i = NUM_VIRTIO_TRANSPORTS - 1; i >= 0; i--) { char *nodename; - int irq = vbi->irqmap[VIRT_MMIO] + i; - hwaddr base = vbi->memmap[VIRT_MMIO].base + i * size; + int irq = vms->irqmap[VIRT_MMIO] + i; + hwaddr base = vms->memmap[VIRT_MMIO].base + i * size; nodename = g_strdup_printf("/virtio_mmio@%" PRIx64, base); - qemu_fdt_add_subnode(vbi->fdt, nodename); - qemu_fdt_setprop_string(vbi->fdt, nodename, + qemu_fdt_add_subnode(vms->fdt, nodename); + qemu_fdt_setprop_string(vms->fdt, nodename, "compatible", "virtio,mmio"); - qemu_fdt_setprop_sized_cells(vbi->fdt, nodename, "reg", + qemu_fdt_setprop_sized_cells(vms->fdt, nodename, "reg", 2, base, 2, size); - qemu_fdt_setprop_cells(vbi->fdt, nodename, "interrupts", + qemu_fdt_setprop_cells(vms->fdt, nodename, "interrupts", GIC_FDT_IRQ_TYPE_SPI, irq, GIC_FDT_IRQ_FLAGS_EDGE_LO_HI); g_free(nodename); @@ -870,7 +834,7 @@ static void create_one_flash(const char *name, hwaddr flashbase, } } -static void create_flash(const VirtBoardInfo *vbi, +static void create_flash(const VirtMachineState *vms, MemoryRegion *sysmem, MemoryRegion *secure_sysmem) { @@ -882,8 +846,8 @@ static void create_flash(const VirtBoardInfo *vbi, * If sysmem == secure_sysmem this means there is no separate Secure * address space and both flash devices are generally visible. */ - hwaddr flashsize = vbi->memmap[VIRT_FLASH].size / 2; - hwaddr flashbase = vbi->memmap[VIRT_FLASH].base; + hwaddr flashsize = vms->memmap[VIRT_FLASH].size / 2; + hwaddr flashbase = vms->memmap[VIRT_FLASH].base; char *nodename; create_one_flash("virt.flash0", flashbase, flashsize, @@ -894,41 +858,41 @@ static void create_flash(const VirtBoardInfo *vbi, if (sysmem == secure_sysmem) { /* Report both flash devices as a single node in the DT */ nodename = g_strdup_printf("/flash@%" PRIx64, flashbase); - qemu_fdt_add_subnode(vbi->fdt, nodename); - qemu_fdt_setprop_string(vbi->fdt, nodename, "compatible", "cfi-flash"); - qemu_fdt_setprop_sized_cells(vbi->fdt, nodename, "reg", + qemu_fdt_add_subnode(vms->fdt, nodename); + qemu_fdt_setprop_string(vms->fdt, nodename, "compatible", "cfi-flash"); + qemu_fdt_setprop_sized_cells(vms->fdt, nodename, "reg", 2, flashbase, 2, flashsize, 2, flashbase + flashsize, 2, flashsize); - qemu_fdt_setprop_cell(vbi->fdt, nodename, "bank-width", 4); + qemu_fdt_setprop_cell(vms->fdt, nodename, "bank-width", 4); g_free(nodename); } else { /* Report the devices as separate nodes so we can mark one as * only visible to the secure world. */ nodename = g_strdup_printf("/secflash@%" PRIx64, flashbase); - qemu_fdt_add_subnode(vbi->fdt, nodename); - qemu_fdt_setprop_string(vbi->fdt, nodename, "compatible", "cfi-flash"); - qemu_fdt_setprop_sized_cells(vbi->fdt, nodename, "reg", + qemu_fdt_add_subnode(vms->fdt, nodename); + qemu_fdt_setprop_string(vms->fdt, nodename, "compatible", "cfi-flash"); + qemu_fdt_setprop_sized_cells(vms->fdt, nodename, "reg", 2, flashbase, 2, flashsize); - qemu_fdt_setprop_cell(vbi->fdt, nodename, "bank-width", 4); - qemu_fdt_setprop_string(vbi->fdt, nodename, "status", "disabled"); - qemu_fdt_setprop_string(vbi->fdt, nodename, "secure-status", "okay"); + qemu_fdt_setprop_cell(vms->fdt, nodename, "bank-width", 4); + qemu_fdt_setprop_string(vms->fdt, nodename, "status", "disabled"); + qemu_fdt_setprop_string(vms->fdt, nodename, "secure-status", "okay"); g_free(nodename); nodename = g_strdup_printf("/flash@%" PRIx64, flashbase); - qemu_fdt_add_subnode(vbi->fdt, nodename); - qemu_fdt_setprop_string(vbi->fdt, nodename, "compatible", "cfi-flash"); - qemu_fdt_setprop_sized_cells(vbi->fdt, nodename, "reg", + qemu_fdt_add_subnode(vms->fdt, nodename); + qemu_fdt_setprop_string(vms->fdt, nodename, "compatible", "cfi-flash"); + qemu_fdt_setprop_sized_cells(vms->fdt, nodename, "reg", 2, flashbase + flashsize, 2, flashsize); - qemu_fdt_setprop_cell(vbi->fdt, nodename, "bank-width", 4); + qemu_fdt_setprop_cell(vms->fdt, nodename, "bank-width", 4); g_free(nodename); } } -static void create_fw_cfg(const VirtBoardInfo *vbi, AddressSpace *as) +static FWCfgState *create_fw_cfg(const VirtMachineState *vms, AddressSpace *as) { - hwaddr base = vbi->memmap[VIRT_FW_CFG].base; - hwaddr size = vbi->memmap[VIRT_FW_CFG].size; + hwaddr base = vms->memmap[VIRT_FW_CFG].base; + hwaddr size = vms->memmap[VIRT_FW_CFG].size; FWCfgState *fw_cfg; char *nodename; @@ -936,15 +900,17 @@ static void create_fw_cfg(const VirtBoardInfo *vbi, AddressSpace *as) fw_cfg_add_i16(fw_cfg, FW_CFG_NB_CPUS, (uint16_t)smp_cpus); nodename = g_strdup_printf("/fw-cfg@%" PRIx64, base); - qemu_fdt_add_subnode(vbi->fdt, nodename); - qemu_fdt_setprop_string(vbi->fdt, nodename, + qemu_fdt_add_subnode(vms->fdt, nodename); + qemu_fdt_setprop_string(vms->fdt, nodename, "compatible", "qemu,fw-cfg-mmio"); - qemu_fdt_setprop_sized_cells(vbi->fdt, nodename, "reg", + qemu_fdt_setprop_sized_cells(vms->fdt, nodename, "reg", 2, base, 2, size); g_free(nodename); + return fw_cfg; } -static void create_pcie_irq_map(const VirtBoardInfo *vbi, uint32_t gic_phandle, +static void create_pcie_irq_map(const VirtMachineState *vms, + uint32_t gic_phandle, int first_irq, const char *nodename) { int devfn, pin; @@ -971,28 +937,27 @@ static void create_pcie_irq_map(const VirtBoardInfo *vbi, uint32_t gic_phandle, } } - qemu_fdt_setprop(vbi->fdt, nodename, "interrupt-map", + qemu_fdt_setprop(vms->fdt, nodename, "interrupt-map", full_irq_map, sizeof(full_irq_map)); - qemu_fdt_setprop_cells(vbi->fdt, nodename, "interrupt-map-mask", + qemu_fdt_setprop_cells(vms->fdt, nodename, "interrupt-map-mask", 0x1800, 0, 0, /* devfn (PCI_SLOT(3)) */ 0x7 /* PCI irq */); } -static void create_pcie(const VirtBoardInfo *vbi, qemu_irq *pic, - bool use_highmem) +static void create_pcie(const VirtMachineState *vms, qemu_irq *pic) { - hwaddr base_mmio = vbi->memmap[VIRT_PCIE_MMIO].base; - hwaddr size_mmio = vbi->memmap[VIRT_PCIE_MMIO].size; - hwaddr base_mmio_high = vbi->memmap[VIRT_PCIE_MMIO_HIGH].base; - hwaddr size_mmio_high = vbi->memmap[VIRT_PCIE_MMIO_HIGH].size; - hwaddr base_pio = vbi->memmap[VIRT_PCIE_PIO].base; - hwaddr size_pio = vbi->memmap[VIRT_PCIE_PIO].size; - hwaddr base_ecam = vbi->memmap[VIRT_PCIE_ECAM].base; - hwaddr size_ecam = vbi->memmap[VIRT_PCIE_ECAM].size; + hwaddr base_mmio = vms->memmap[VIRT_PCIE_MMIO].base; + hwaddr size_mmio = vms->memmap[VIRT_PCIE_MMIO].size; + hwaddr base_mmio_high = vms->memmap[VIRT_PCIE_MMIO_HIGH].base; + hwaddr size_mmio_high = vms->memmap[VIRT_PCIE_MMIO_HIGH].size; + hwaddr base_pio = vms->memmap[VIRT_PCIE_PIO].base; + hwaddr size_pio = vms->memmap[VIRT_PCIE_PIO].size; + hwaddr base_ecam = vms->memmap[VIRT_PCIE_ECAM].base; + hwaddr size_ecam = vms->memmap[VIRT_PCIE_ECAM].size; hwaddr base = base_mmio; int nr_pcie_buses = size_ecam / PCIE_MMCFG_SIZE_MIN; - int irq = vbi->irqmap[VIRT_PCIE]; + int irq = vms->irqmap[VIRT_PCIE]; MemoryRegion *mmio_alias; MemoryRegion *mmio_reg; MemoryRegion *ecam_alias; @@ -1023,7 +988,7 @@ static void create_pcie(const VirtBoardInfo *vbi, qemu_irq *pic, mmio_reg, base_mmio, size_mmio); memory_region_add_subregion(get_system_memory(), base_mmio, mmio_alias); - if (use_highmem) { + if (vms->highmem) { /* Map high MMIO space */ MemoryRegion *high_mmio_alias = g_new0(MemoryRegion, 1); @@ -1054,26 +1019,26 @@ static void create_pcie(const VirtBoardInfo *vbi, qemu_irq *pic, } nodename = g_strdup_printf("/pcie@%" PRIx64, base); - qemu_fdt_add_subnode(vbi->fdt, nodename); - qemu_fdt_setprop_string(vbi->fdt, nodename, + qemu_fdt_add_subnode(vms->fdt, nodename); + qemu_fdt_setprop_string(vms->fdt, nodename, "compatible", "pci-host-ecam-generic"); - qemu_fdt_setprop_string(vbi->fdt, nodename, "device_type", "pci"); - qemu_fdt_setprop_cell(vbi->fdt, nodename, "#address-cells", 3); - qemu_fdt_setprop_cell(vbi->fdt, nodename, "#size-cells", 2); - qemu_fdt_setprop_cells(vbi->fdt, nodename, "bus-range", 0, + qemu_fdt_setprop_string(vms->fdt, nodename, "device_type", "pci"); + qemu_fdt_setprop_cell(vms->fdt, nodename, "#address-cells", 3); + qemu_fdt_setprop_cell(vms->fdt, nodename, "#size-cells", 2); + qemu_fdt_setprop_cells(vms->fdt, nodename, "bus-range", 0, nr_pcie_buses - 1); - qemu_fdt_setprop(vbi->fdt, nodename, "dma-coherent", NULL, 0); + qemu_fdt_setprop(vms->fdt, nodename, "dma-coherent", NULL, 0); - if (vbi->msi_phandle) { - qemu_fdt_setprop_cells(vbi->fdt, nodename, "msi-parent", - vbi->msi_phandle); + if (vms->msi_phandle) { + qemu_fdt_setprop_cells(vms->fdt, nodename, "msi-parent", + vms->msi_phandle); } - qemu_fdt_setprop_sized_cells(vbi->fdt, nodename, "reg", + qemu_fdt_setprop_sized_cells(vms->fdt, nodename, "reg", 2, base_ecam, 2, size_ecam); - if (use_highmem) { - qemu_fdt_setprop_sized_cells(vbi->fdt, nodename, "ranges", + if (vms->highmem) { + qemu_fdt_setprop_sized_cells(vms->fdt, nodename, "ranges", 1, FDT_PCI_RANGE_IOPORT, 2, 0, 2, base_pio, 2, size_pio, 1, FDT_PCI_RANGE_MMIO, 2, base_mmio, @@ -1082,20 +1047,20 @@ static void create_pcie(const VirtBoardInfo *vbi, qemu_irq *pic, 2, base_mmio_high, 2, base_mmio_high, 2, size_mmio_high); } else { - qemu_fdt_setprop_sized_cells(vbi->fdt, nodename, "ranges", + qemu_fdt_setprop_sized_cells(vms->fdt, nodename, "ranges", 1, FDT_PCI_RANGE_IOPORT, 2, 0, 2, base_pio, 2, size_pio, 1, FDT_PCI_RANGE_MMIO, 2, base_mmio, 2, base_mmio, 2, size_mmio); } - qemu_fdt_setprop_cell(vbi->fdt, nodename, "#interrupt-cells", 1); - create_pcie_irq_map(vbi, vbi->gic_phandle, irq, nodename); + qemu_fdt_setprop_cell(vms->fdt, nodename, "#interrupt-cells", 1); + create_pcie_irq_map(vms, vms->gic_phandle, irq, nodename); g_free(nodename); } -static void create_platform_bus(VirtBoardInfo *vbi, qemu_irq *pic) +static void create_platform_bus(VirtMachineState *vms, qemu_irq *pic) { DeviceState *dev; SysBusDevice *s; @@ -1103,13 +1068,13 @@ static void create_platform_bus(VirtBoardInfo *vbi, qemu_irq *pic) ARMPlatformBusFDTParams *fdt_params = g_new(ARMPlatformBusFDTParams, 1); MemoryRegion *sysmem = get_system_memory(); - platform_bus_params.platform_bus_base = vbi->memmap[VIRT_PLATFORM_BUS].base; - platform_bus_params.platform_bus_size = vbi->memmap[VIRT_PLATFORM_BUS].size; - platform_bus_params.platform_bus_first_irq = vbi->irqmap[VIRT_PLATFORM_BUS]; + platform_bus_params.platform_bus_base = vms->memmap[VIRT_PLATFORM_BUS].base; + platform_bus_params.platform_bus_size = vms->memmap[VIRT_PLATFORM_BUS].size; + platform_bus_params.platform_bus_first_irq = vms->irqmap[VIRT_PLATFORM_BUS]; platform_bus_params.platform_bus_num_irqs = PLATFORM_BUS_NUM_IRQS; fdt_params->system_params = &platform_bus_params; - fdt_params->binfo = &vbi->bootinfo; + fdt_params->binfo = &vms->bootinfo; fdt_params->intc = "/intc"; /* * register a machine init done notifier that creates the device tree @@ -1136,43 +1101,44 @@ static void create_platform_bus(VirtBoardInfo *vbi, qemu_irq *pic) sysbus_mmio_get_region(s, 0)); } -static void create_secure_ram(VirtBoardInfo *vbi, MemoryRegion *secure_sysmem) +static void create_secure_ram(VirtMachineState *vms, + MemoryRegion *secure_sysmem) { MemoryRegion *secram = g_new(MemoryRegion, 1); char *nodename; - hwaddr base = vbi->memmap[VIRT_SECURE_MEM].base; - hwaddr size = vbi->memmap[VIRT_SECURE_MEM].size; + hwaddr base = vms->memmap[VIRT_SECURE_MEM].base; + hwaddr size = vms->memmap[VIRT_SECURE_MEM].size; memory_region_init_ram(secram, NULL, "virt.secure-ram", size, &error_fatal); vmstate_register_ram_global(secram); memory_region_add_subregion(secure_sysmem, base, secram); nodename = g_strdup_printf("/secram@%" PRIx64, base); - qemu_fdt_add_subnode(vbi->fdt, nodename); - qemu_fdt_setprop_string(vbi->fdt, nodename, "device_type", "memory"); - qemu_fdt_setprop_sized_cells(vbi->fdt, nodename, "reg", 2, base, 2, size); - qemu_fdt_setprop_string(vbi->fdt, nodename, "status", "disabled"); - qemu_fdt_setprop_string(vbi->fdt, nodename, "secure-status", "okay"); + qemu_fdt_add_subnode(vms->fdt, nodename); + qemu_fdt_setprop_string(vms->fdt, nodename, "device_type", "memory"); + qemu_fdt_setprop_sized_cells(vms->fdt, nodename, "reg", 2, base, 2, size); + qemu_fdt_setprop_string(vms->fdt, nodename, "status", "disabled"); + qemu_fdt_setprop_string(vms->fdt, nodename, "secure-status", "okay"); g_free(nodename); } static void *machvirt_dtb(const struct arm_boot_info *binfo, int *fdt_size) { - const VirtBoardInfo *board = (const VirtBoardInfo *)binfo; + const VirtMachineState *board = container_of(binfo, VirtMachineState, + bootinfo); *fdt_size = board->fdt_size; return board->fdt; } -static void virt_build_smbios(VirtGuestInfo *guest_info) +static void virt_build_smbios(VirtMachineState *vms) { - FWCfgState *fw_cfg = guest_info->fw_cfg; uint8_t *smbios_tables, *smbios_anchor; size_t smbios_tables_len, smbios_anchor_len; const char *product = "QEMU Virtual Machine"; - if (!fw_cfg) { + if (!vms->fw_cfg) { return; } @@ -1187,20 +1153,21 @@ static void virt_build_smbios(VirtGuestInfo *guest_info) &smbios_anchor, &smbios_anchor_len); if (smbios_anchor) { - fw_cfg_add_file(fw_cfg, "etc/smbios/smbios-tables", + fw_cfg_add_file(vms->fw_cfg, "etc/smbios/smbios-tables", smbios_tables, smbios_tables_len); - fw_cfg_add_file(fw_cfg, "etc/smbios/smbios-anchor", + fw_cfg_add_file(vms->fw_cfg, "etc/smbios/smbios-anchor", smbios_anchor, smbios_anchor_len); } } static -void virt_guest_info_machine_done(Notifier *notifier, void *data) +void virt_machine_done(Notifier *notifier, void *data) { - VirtGuestInfoState *guest_info_state = container_of(notifier, - VirtGuestInfoState, machine_done); - virt_acpi_setup(&guest_info_state->info); - virt_build_smbios(&guest_info_state->info); + VirtMachineState *vms = container_of(notifier, VirtMachineState, + machine_done); + + virt_acpi_setup(vms); + virt_build_smbios(vms); } static void machvirt_init(MachineState *machine) @@ -1210,13 +1177,9 @@ static void machvirt_init(MachineState *machine) qemu_irq pic[NUM_IRQS]; MemoryRegion *sysmem = get_system_memory(); MemoryRegion *secure_sysmem = NULL; - int gic_version = vms->gic_version; int n, virt_max_cpus; MemoryRegion *ram = g_new(MemoryRegion, 1); const char *cpu_model = machine->cpu_model; - VirtBoardInfo *vbi; - VirtGuestInfoState *guest_info_state = g_malloc0(sizeof *guest_info_state); - VirtGuestInfo *guest_info = &guest_info_state->info; char **cpustr; ObjectClass *oc; const char *typename; @@ -1232,14 +1195,14 @@ static void machvirt_init(MachineState *machine) /* We can probe only here because during property set * KVM is not available yet */ - if (!gic_version) { + if (!vms->gic_version) { if (!kvm_enabled()) { error_report("gic-version=host requires KVM"); exit(1); } - gic_version = kvm_arm_vgic_probe(); - if (!gic_version) { + vms->gic_version = kvm_arm_vgic_probe(); + if (!vms->gic_version) { error_report("Unable to determine GIC version supported by host"); exit(1); } @@ -1248,9 +1211,7 @@ static void machvirt_init(MachineState *machine) /* Separate the actual CPU model name from any appended features */ cpustr = g_strsplit(cpu_model, ",", 2); - vbi = find_machine_info(cpustr[0]); - - if (!vbi) { + if (!cpuname_valid(cpustr[0])) { error_report("mach-virt: CPU %s not supported", cpustr[0]); exit(1); } @@ -1262,13 +1223,13 @@ static void machvirt_init(MachineState *machine) * let the boot ROM sort them out. * The usual case is that we do use QEMU's PSCI implementation. */ - vbi->using_psci = !(vms->secure && firmware_loaded); + vms->using_psci = !(vms->secure && firmware_loaded); /* The maximum number of CPUs depends on the GIC version, or on how * many redistributors we can fit into the memory map. */ - if (gic_version == 3) { - virt_max_cpus = vbi->memmap[VIRT_GIC_REDIST].size / 0x20000; + if (vms->gic_version == 3) { + virt_max_cpus = vms->memmap[VIRT_GIC_REDIST].size / 0x20000; clustersz = GICV3_TARGETLIST_BITS; } else { virt_max_cpus = GIC_NCPU; @@ -1282,9 +1243,9 @@ static void machvirt_init(MachineState *machine) exit(1); } - vbi->smp_cpus = smp_cpus; + vms->smp_cpus = smp_cpus; - if (machine->ram_size > vbi->memmap[VIRT_MEM].size) { + if (machine->ram_size > vms->memmap[VIRT_MEM].size) { error_report("mach-virt: cannot model more than %dGB RAM", RAMLIMIT_GB); exit(1); } @@ -1306,7 +1267,7 @@ static void machvirt_init(MachineState *machine) memory_region_add_subregion_overlap(secure_sysmem, 0, sysmem, -1); } - create_fdt(vbi); + create_fdt(vms); oc = cpu_class_by_name(TYPE_ARM_CPU, cpustr[0]); if (!oc) { @@ -1345,7 +1306,7 @@ static void machvirt_init(MachineState *machine) object_property_set_bool(cpuobj, false, "has_el3", NULL); } - if (vbi->using_psci) { + if (vms->using_psci) { object_property_set_int(cpuobj, QEMU_PSCI_CONDUIT_HVC, "psci-conduit", NULL); @@ -1361,7 +1322,7 @@ static void machvirt_init(MachineState *machine) } if (object_property_find(cpuobj, "reset-cbar", NULL)) { - object_property_set_int(cpuobj, vbi->memmap[VIRT_CPUPERIPHS].base, + object_property_set_int(cpuobj, vms->memmap[VIRT_CPUPERIPHS].base, "reset-cbar", &error_abort); } @@ -1374,62 +1335,55 @@ static void machvirt_init(MachineState *machine) object_property_set_bool(cpuobj, true, "realized", NULL); } - fdt_add_timer_nodes(vbi, gic_version); - fdt_add_cpu_nodes(vbi); - fdt_add_psci_node(vbi); + fdt_add_timer_nodes(vms); + fdt_add_cpu_nodes(vms); + fdt_add_psci_node(vms); memory_region_allocate_system_memory(ram, NULL, "mach-virt.ram", machine->ram_size); - memory_region_add_subregion(sysmem, vbi->memmap[VIRT_MEM].base, ram); + memory_region_add_subregion(sysmem, vms->memmap[VIRT_MEM].base, ram); - create_flash(vbi, sysmem, secure_sysmem ? secure_sysmem : sysmem); + create_flash(vms, sysmem, secure_sysmem ? secure_sysmem : sysmem); - create_gic(vbi, pic, gic_version, vms->secure, vmc->no_its); + create_gic(vms, pic); - fdt_add_pmu_nodes(vbi, gic_version); + fdt_add_pmu_nodes(vms); - create_uart(vbi, pic, VIRT_UART, sysmem, serial_hds[0]); + create_uart(vms, pic, VIRT_UART, sysmem, serial_hds[0]); if (vms->secure) { - create_secure_ram(vbi, secure_sysmem); - create_uart(vbi, pic, VIRT_SECURE_UART, secure_sysmem, serial_hds[1]); + create_secure_ram(vms, secure_sysmem); + create_uart(vms, pic, VIRT_SECURE_UART, secure_sysmem, serial_hds[1]); } - create_rtc(vbi, pic); + create_rtc(vms, pic); - create_pcie(vbi, pic, vms->highmem); + create_pcie(vms, pic); - create_gpio(vbi, pic); + create_gpio(vms, pic); /* Create mmio transports, so the user can create virtio backends * (which will be automatically plugged in to the transports). If * no backend is created the transport will just sit harmlessly idle. */ - create_virtio_devices(vbi, pic); - - create_fw_cfg(vbi, &address_space_memory); - rom_set_fw(fw_cfg_find()); - - guest_info->smp_cpus = smp_cpus; - guest_info->fw_cfg = fw_cfg_find(); - guest_info->memmap = vbi->memmap; - guest_info->irqmap = vbi->irqmap; - guest_info->use_highmem = vms->highmem; - guest_info->gic_version = gic_version; - guest_info->no_its = vmc->no_its; - guest_info_state->machine_done.notify = virt_guest_info_machine_done; - qemu_add_machine_init_done_notifier(&guest_info_state->machine_done); - - vbi->bootinfo.ram_size = machine->ram_size; - vbi->bootinfo.kernel_filename = machine->kernel_filename; - vbi->bootinfo.kernel_cmdline = machine->kernel_cmdline; - vbi->bootinfo.initrd_filename = machine->initrd_filename; - vbi->bootinfo.nb_cpus = smp_cpus; - vbi->bootinfo.board_id = -1; - vbi->bootinfo.loader_start = vbi->memmap[VIRT_MEM].base; - vbi->bootinfo.get_dtb = machvirt_dtb; - vbi->bootinfo.firmware_loaded = firmware_loaded; - arm_load_kernel(ARM_CPU(first_cpu), &vbi->bootinfo); + create_virtio_devices(vms, pic); + + vms->fw_cfg = create_fw_cfg(vms, &address_space_memory); + rom_set_fw(vms->fw_cfg); + + vms->machine_done.notify = virt_machine_done; + qemu_add_machine_init_done_notifier(&vms->machine_done); + + vms->bootinfo.ram_size = machine->ram_size; + vms->bootinfo.kernel_filename = machine->kernel_filename; + vms->bootinfo.kernel_cmdline = machine->kernel_cmdline; + vms->bootinfo.initrd_filename = machine->initrd_filename; + vms->bootinfo.nb_cpus = smp_cpus; + vms->bootinfo.board_id = -1; + vms->bootinfo.loader_start = vms->memmap[VIRT_MEM].base; + vms->bootinfo.get_dtb = machvirt_dtb; + vms->bootinfo.firmware_loaded = firmware_loaded; + arm_load_kernel(ARM_CPU(first_cpu), &vms->bootinfo); /* * arm_load_kernel machine init done notifier registration must @@ -1437,7 +1391,7 @@ static void machvirt_init(MachineState *machine) * another notifier is registered which adds platform bus nodes. * Notifiers are executed in registration reverse order. */ - create_platform_bus(vbi, pic); + create_platform_bus(vms, pic); } static bool virt_get_secure(Object *obj, Error **errp) @@ -1556,6 +1510,9 @@ static void virt_2_9_instance_init(Object *obj) object_property_set_description(obj, "gic-version", "Set GIC version. " "Valid values are 2, 3 and host", NULL); + + vms->memmap = a15memmap; + vms->irqmap = a15irqmap; } static void virt_machine_2_9_options(MachineClass *mc) @@ -1573,8 +1530,14 @@ static void virt_2_8_instance_init(Object *obj) static void virt_machine_2_8_options(MachineClass *mc) { + VirtMachineClass *vmc = VIRT_MACHINE_CLASS(OBJECT_CLASS(mc)); + virt_machine_2_9_options(mc); SET_MACHINE_COMPAT(mc, VIRT_COMPAT_2_8); + /* For 2.8 and earlier we falsely claimed in the DT that + * our timers were edge-triggered, not level-triggered. + */ + vmc->claim_edge_triggered_timers = true; } DEFINE_VIRT_MACHINE(2, 8) diff --git a/hw/arm/z2.c b/hw/arm/z2.c index b3a6bbd210..1607cbdb03 100644 --- a/hw/arm/z2.c +++ b/hw/arm/z2.c @@ -220,7 +220,7 @@ static int aer915_send(I2CSlave *i2c, uint8_t data) return 0; } -static void aer915_event(I2CSlave *i2c, enum i2c_event event) +static int aer915_event(I2CSlave *i2c, enum i2c_event event) { AER915State *s = AER915(i2c); @@ -238,6 +238,8 @@ static void aer915_event(I2CSlave *i2c, enum i2c_event event) default: break; } + + return 0; } static int aer915_recv(I2CSlave *slave) diff --git a/hw/audio/wm8750.c b/hw/audio/wm8750.c index 0c6500e96a..f8b5bebfc2 100644 --- a/hw/audio/wm8750.c +++ b/hw/audio/wm8750.c @@ -303,7 +303,7 @@ static void wm8750_reset(I2CSlave *i2c) s->i2c_len = 0; } -static void wm8750_event(I2CSlave *i2c, enum i2c_event event) +static int wm8750_event(I2CSlave *i2c, enum i2c_event event) { WM8750State *s = WM8750(i2c); @@ -321,6 +321,8 @@ static void wm8750_event(I2CSlave *i2c, enum i2c_event event) default: break; } + + return 0; } #define WM8750_LINVOL 0x00 diff --git a/hw/block/m25p80.c b/hw/block/m25p80.c index e3c1166ea6..4c5f8c3590 100644 --- a/hw/block/m25p80.c +++ b/hw/block/m25p80.c @@ -28,6 +28,7 @@ #include "hw/ssi/ssi.h" #include "qemu/bitops.h" #include "qemu/log.h" +#include "qemu/error-report.h" #include "qapi/error.h" #ifndef M25P80_ERR_DEBUG @@ -377,6 +378,8 @@ typedef enum { MAN_GENERIC, } Manufacturer; +#define M25P80_INTERNAL_DATA_BUFFER_SZ 16 + typedef struct Flash { SSISlave parent_obj; @@ -387,7 +390,7 @@ typedef struct Flash { int page_size; uint8_t state; - uint8_t data[16]; + uint8_t data[M25P80_INTERNAL_DATA_BUFFER_SZ]; uint32_t len; uint32_t pos; uint8_t needed_bytes; @@ -1115,6 +1118,17 @@ static uint32_t m25p80_transfer8(SSISlave *ss, uint32_t tx) case STATE_COLLECTING_DATA: case STATE_COLLECTING_VAR_LEN_DATA: + + if (s->len >= M25P80_INTERNAL_DATA_BUFFER_SZ) { + qemu_log_mask(LOG_GUEST_ERROR, + "M25P80: Write overrun internal data buffer. " + "SPI controller (QEMU emulator or guest driver) " + "is misbehaving\n"); + s->len = s->pos = 0; + s->state = STATE_IDLE; + break; + } + s->data[s->len] = (uint8_t)tx; s->len++; @@ -1124,6 +1138,17 @@ static uint32_t m25p80_transfer8(SSISlave *ss, uint32_t tx) break; case STATE_READING_DATA: + + if (s->pos >= M25P80_INTERNAL_DATA_BUFFER_SZ) { + qemu_log_mask(LOG_GUEST_ERROR, + "M25P80: Read overrun internal data buffer. " + "SPI controller (QEMU emulator or guest driver) " + "is misbehaving\n"); + s->len = s->pos = 0; + s->state = STATE_IDLE; + break; + } + r = s->data[s->pos]; s->pos++; if (s->pos == s->len) { @@ -1196,7 +1221,7 @@ static const VMStateDescription vmstate_m25p80 = { .pre_save = m25p80_pre_save, .fields = (VMStateField[]) { VMSTATE_UINT8(state, Flash), - VMSTATE_UINT8_ARRAY(data, Flash, 16), + VMSTATE_UINT8_ARRAY(data, Flash, M25P80_INTERNAL_DATA_BUFFER_SZ), VMSTATE_UINT32(len, Flash), VMSTATE_UINT32(pos, Flash), VMSTATE_UINT8(needed_bytes, Flash), diff --git a/hw/char/exynos4210_uart.c b/hw/char/exynos4210_uart.c index 571c324004..820d1abeb9 100644 --- a/hw/char/exynos4210_uart.c +++ b/hw/char/exynos4210_uart.c @@ -629,22 +629,26 @@ DeviceState *exynos4210_uart_create(hwaddr addr, return dev; } -static int exynos4210_uart_init(SysBusDevice *dev) +static void exynos4210_uart_init(Object *obj) { + SysBusDevice *dev = SYS_BUS_DEVICE(obj); Exynos4210UartState *s = EXYNOS4210_UART(dev); /* memory mapping */ - memory_region_init_io(&s->iomem, OBJECT(s), &exynos4210_uart_ops, s, + memory_region_init_io(&s->iomem, obj, &exynos4210_uart_ops, s, "exynos4210.uart", EXYNOS4210_UART_REGS_MEM_SIZE); sysbus_init_mmio(dev, &s->iomem); sysbus_init_irq(dev, &s->irq); +} + +static void exynos4210_uart_realize(DeviceState *dev, Error **errp) +{ + Exynos4210UartState *s = EXYNOS4210_UART(dev); qemu_chr_fe_set_handlers(&s->chr, exynos4210_uart_can_receive, exynos4210_uart_receive, exynos4210_uart_event, s, NULL, true); - - return 0; } static Property exynos4210_uart_properties[] = { @@ -658,9 +662,8 @@ static Property exynos4210_uart_properties[] = { static void exynos4210_uart_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); - SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass); - k->init = exynos4210_uart_init; + dc->realize = exynos4210_uart_realize; dc->reset = exynos4210_uart_reset; dc->props = exynos4210_uart_properties; dc->vmsd = &vmstate_exynos4210_uart; @@ -670,6 +673,7 @@ static const TypeInfo exynos4210_uart_info = { .name = TYPE_EXYNOS4210_UART, .parent = TYPE_SYS_BUS_DEVICE, .instance_size = sizeof(Exynos4210UartState), + .instance_init = exynos4210_uart_init, .class_init = exynos4210_uart_class_init, }; diff --git a/hw/display/ssd0303.c b/hw/display/ssd0303.c index d3017563f3..68a80b9d64 100644 --- a/hw/display/ssd0303.c +++ b/hw/display/ssd0303.c @@ -179,7 +179,7 @@ static int ssd0303_send(I2CSlave *i2c, uint8_t data) return 0; } -static void ssd0303_event(I2CSlave *i2c, enum i2c_event event) +static int ssd0303_event(I2CSlave *i2c, enum i2c_event event) { ssd0303_state *s = SSD0303(i2c); @@ -193,6 +193,8 @@ static void ssd0303_event(I2CSlave *i2c, enum i2c_event event) /* Nothing to do. */ break; } + + return 0; } static void ssd0303_update_display(void *opaque) diff --git a/hw/gpio/max7310.c b/hw/gpio/max7310.c index 1bd5eaf911..f82e3e6555 100644 --- a/hw/gpio/max7310.c +++ b/hw/gpio/max7310.c @@ -129,7 +129,7 @@ static int max7310_tx(I2CSlave *i2c, uint8_t data) return 0; } -static void max7310_event(I2CSlave *i2c, enum i2c_event event) +static int max7310_event(I2CSlave *i2c, enum i2c_event event) { MAX7310State *s = MAX7310(i2c); s->len = 0; @@ -147,6 +147,8 @@ static void max7310_event(I2CSlave *i2c, enum i2c_event event) default: break; } + + return 0; } static const VMStateDescription vmstate_max7310 = { diff --git a/hw/i2c/core.c b/hw/i2c/core.c index e40781ea3b..2c1234cdff 100644 --- a/hw/i2c/core.c +++ b/hw/i2c/core.c @@ -88,18 +88,26 @@ int i2c_bus_busy(I2CBus *bus) return !QLIST_EMPTY(&bus->current_devs); } +/* TODO: Make this handle multiple masters. */ /* - * Returns non-zero if the address is not valid. If this is called - * again without an intervening i2c_end_transfer(), like in the SMBus - * case where the operation is switched from write to read, this - * function will not rescan the bus and thus cannot fail. + * Start or continue an i2c transaction. When this is called for the + * first time or after an i2c_end_transfer(), if it returns an error + * the bus transaction is terminated (or really never started). If + * this is called after another i2c_start_transfer() without an + * intervening i2c_end_transfer(), and it returns an error, the + * transaction will not be terminated. The caller must do it. + * + * This corresponds with the way real hardware works. The SMBus + * protocol uses a start transfer to switch from write to read mode + * without releasing the bus. If that fails, the bus is still + * in a transaction. */ -/* TODO: Make this handle multiple masters. */ int i2c_start_transfer(I2CBus *bus, uint8_t address, int recv) { BusChild *kid; I2CSlaveClass *sc; I2CNode *node; + bool bus_scanned = false; if (address == I2C_BROADCAST) { /* @@ -130,6 +138,7 @@ int i2c_start_transfer(I2CBus *bus, uint8_t address, int recv) } } } + bus_scanned = true; } if (QLIST_EMPTY(&bus->current_devs)) { @@ -137,11 +146,21 @@ int i2c_start_transfer(I2CBus *bus, uint8_t address, int recv) } QLIST_FOREACH(node, &bus->current_devs, next) { + int rv; + sc = I2C_SLAVE_GET_CLASS(node->elt); /* If the bus is already busy, assume this is a repeated start condition. */ + if (sc->event) { - sc->event(node->elt, recv ? I2C_START_RECV : I2C_START_SEND); + rv = sc->event(node->elt, recv ? I2C_START_RECV : I2C_START_SEND); + if (rv && !bus->broadcast) { + if (bus_scanned) { + /* First call, terminate the transfer. */ + i2c_end_transfer(bus); + } + return rv; + } } } return 0; diff --git a/hw/i2c/i2c-ddc.c b/hw/i2c/i2c-ddc.c index 1227212934..66899d7233 100644 --- a/hw/i2c/i2c-ddc.c +++ b/hw/i2c/i2c-ddc.c @@ -230,13 +230,15 @@ static void i2c_ddc_reset(DeviceState *ds) s->reg = 0; } -static void i2c_ddc_event(I2CSlave *i2c, enum i2c_event event) +static int i2c_ddc_event(I2CSlave *i2c, enum i2c_event event) { I2CDDCState *s = I2CDDC(i2c); if (event == I2C_START_SEND) { s->firstbyte = true; } + + return 0; } static int i2c_ddc_rx(I2CSlave *i2c) diff --git a/hw/i2c/smbus.c b/hw/i2c/smbus.c index 5b4dd3eba4..2d1b79a689 100644 --- a/hw/i2c/smbus.c +++ b/hw/i2c/smbus.c @@ -67,7 +67,7 @@ static void smbus_do_write(SMBusDevice *dev) } } -static void smbus_i2c_event(I2CSlave *s, enum i2c_event event) +static int smbus_i2c_event(I2CSlave *s, enum i2c_event event) { SMBusDevice *dev = SMBUS_DEVICE(s); @@ -148,6 +148,8 @@ static void smbus_i2c_event(I2CSlave *s, enum i2c_event event) break; } } + + return 0; } static int smbus_i2c_recv(I2CSlave *s) @@ -249,7 +251,8 @@ int smbus_read_byte(I2CBus *bus, uint8_t addr, uint8_t command) } i2c_send(bus, command); if (i2c_start_transfer(bus, addr, 1)) { - assert(0); + i2c_end_transfer(bus); + return -1; } data = i2c_recv(bus); i2c_nack(bus); @@ -276,7 +279,8 @@ int smbus_read_word(I2CBus *bus, uint8_t addr, uint8_t command) } i2c_send(bus, command); if (i2c_start_transfer(bus, addr, 1)) { - assert(0); + i2c_end_transfer(bus); + return -1; } data = i2c_recv(bus); data |= i2c_recv(bus) << 8; @@ -307,7 +311,8 @@ int smbus_read_block(I2CBus *bus, uint8_t addr, uint8_t command, uint8_t *data) } i2c_send(bus, command); if (i2c_start_transfer(bus, addr, 1)) { - assert(0); + i2c_end_transfer(bus); + return -1; } len = i2c_recv(bus); if (len > 32) { diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c index 47b79d9112..e0732ccaf1 100644 --- a/hw/i386/amd_iommu.c +++ b/hw/i386/amd_iommu.c @@ -562,7 +562,7 @@ static void amdvi_mmio_trace(hwaddr addr, unsigned size) trace_amdvi_mmio_read(amdvi_mmio_high[index], addr, size, addr & ~0x07); } else { index = index >= AMDVI_MMIO_REGS_LOW ? AMDVI_MMIO_REGS_LOW : index; - trace_amdvi_mmio_read(amdvi_mmio_high[index], addr, size, addr & ~0x07); + trace_amdvi_mmio_read(amdvi_mmio_low[index], addr, size, addr & ~0x07); } } diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h index 884926e9e7..0d3dc6a9f2 100644 --- a/hw/i386/amd_iommu.h +++ b/hw/i386/amd_iommu.h @@ -49,8 +49,8 @@ #define AMDVI_CAPAB_INIT_TYPE (3 << 16) /* No. of used MMIO registers */ -#define AMDVI_MMIO_REGS_HIGH 8 -#define AMDVI_MMIO_REGS_LOW 7 +#define AMDVI_MMIO_REGS_HIGH 7 +#define AMDVI_MMIO_REGS_LOW 8 /* MMIO registers */ #define AMDVI_MMIO_DEVICE_TABLE 0x0000 diff --git a/hw/input/lm832x.c b/hw/input/lm832x.c index 539682cac8..2340523da0 100644 --- a/hw/input/lm832x.c +++ b/hw/input/lm832x.c @@ -383,7 +383,7 @@ static void lm_kbd_write(LM823KbdState *s, int reg, int byte, uint8_t value) } } -static void lm_i2c_event(I2CSlave *i2c, enum i2c_event event) +static int lm_i2c_event(I2CSlave *i2c, enum i2c_event event) { LM823KbdState *s = LM8323(i2c); @@ -397,6 +397,8 @@ static void lm_i2c_event(I2CSlave *i2c, enum i2c_event event) default: break; } + + return 0; } static int lm_i2c_rx(I2CSlave *i2c) diff --git a/hw/misc/tmp105.c b/hw/misc/tmp105.c index f5c2472b5b..04e83787d4 100644 --- a/hw/misc/tmp105.c +++ b/hw/misc/tmp105.c @@ -176,7 +176,7 @@ static int tmp105_tx(I2CSlave *i2c, uint8_t data) return 0; } -static void tmp105_event(I2CSlave *i2c, enum i2c_event event) +static int tmp105_event(I2CSlave *i2c, enum i2c_event event) { TMP105State *s = TMP105(i2c); @@ -185,6 +185,7 @@ static void tmp105_event(I2CSlave *i2c, enum i2c_event event) } s->len = 0; + return 0; } static int tmp105_post_load(void *opaque, int version_id) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 24fae1689d..637d54549e 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -982,8 +982,8 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus, pci_get_function_0(pci_dev)) { error_setg(errp, "PCI: slot %d function 0 already ocuppied by %s," " new func %s cannot be exposed to guest.", - PCI_SLOT(devfn), - bus->devices[PCI_DEVFN(PCI_SLOT(devfn), 0)]->name, + PCI_SLOT(pci_get_function_0(pci_dev)->devfn), + pci_get_function_0(pci_dev)->name, name); return NULL; diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c index f5c1d98192..07650683f7 100644 --- a/hw/s390x/virtio-ccw.c +++ b/hw/s390x/virtio-ccw.c @@ -1098,7 +1098,7 @@ static int virtio_ccw_set_guest_notifier(VirtioCcwDevice *dev, int n, * We do not support individual masking for channel devices, so we * need to manually trigger any guest masking callbacks here. */ - if (k->guest_notifier_mask) { + if (k->guest_notifier_mask && vdev->use_guest_notifier_mask) { k->guest_notifier_mask(vdev, n, false); } /* get lost events and re-inject */ @@ -1107,7 +1107,7 @@ static int virtio_ccw_set_guest_notifier(VirtioCcwDevice *dev, int n, event_notifier_set(notifier); } } else { - if (k->guest_notifier_mask) { + if (k->guest_notifier_mask && vdev->use_guest_notifier_mask) { k->guest_notifier_mask(vdev, n, true); } if (with_irqfd) { diff --git a/hw/ssi/imx_spi.c b/hw/ssi/imx_spi.c index e4e395fa67..b66505ca49 100644 --- a/hw/ssi/imx_spi.c +++ b/hw/ssi/imx_spi.c @@ -320,9 +320,6 @@ static void imx_spi_write(void *opaque, hwaddr offset, uint64_t value, TYPE_IMX_SPI, __func__); break; case ECSPI_TXDATA: - case ECSPI_MSGDATA: - /* Is there any difference between TXDATA and MSGDATA ? */ - /* I'll have to look in the linux driver */ if (!imx_spi_is_enabled(s)) { /* Ignore writes if device is disabled */ break; @@ -380,6 +377,14 @@ static void imx_spi_write(void *opaque, hwaddr offset, uint64_t value, } break; + case ECSPI_MSGDATA: + /* it is not clear from the spec what MSGDATA is for */ + /* Anyway it is not used by Linux driver */ + /* So for now we just ignore it */ + qemu_log_mask(LOG_UNIMP, + "[%s]%s: Trying to write to MSGDATA, ignoring\n", + TYPE_IMX_SPI, __func__); + break; default: s->regs[index] = value; diff --git a/hw/timer/ds1338.c b/hw/timer/ds1338.c index f5d04dd5d7..3849b74a68 100644 --- a/hw/timer/ds1338.c +++ b/hw/timer/ds1338.c @@ -94,7 +94,7 @@ static void inc_regptr(DS1338State *s) } } -static void ds1338_event(I2CSlave *i2c, enum i2c_event event) +static int ds1338_event(I2CSlave *i2c, enum i2c_event event) { DS1338State *s = DS1338(i2c); @@ -113,6 +113,8 @@ static void ds1338_event(I2CSlave *i2c, enum i2c_event event) default: break; } + + return 0; } static int ds1338_recv(I2CSlave *i2c) diff --git a/hw/timer/twl92230.c b/hw/timer/twl92230.c index 7ba4e9a7c9..b8d914e49b 100644 --- a/hw/timer/twl92230.c +++ b/hw/timer/twl92230.c @@ -713,12 +713,14 @@ static void menelaus_write(void *opaque, uint8_t addr, uint8_t value) } } -static void menelaus_event(I2CSlave *i2c, enum i2c_event event) +static int menelaus_event(I2CSlave *i2c, enum i2c_event event) { MenelausState *s = TWL92230(i2c); if (event == I2C_START_SEND) s->firstbyte = 1; + + return 0; } static int menelaus_tx(I2CSlave *i2c, uint8_t data) diff --git a/hw/virtio/virtio-mmio.c b/hw/virtio/virtio-mmio.c index 17412cb7b5..60654dc19d 100644 --- a/hw/virtio/virtio-mmio.c +++ b/hw/virtio/virtio-mmio.c @@ -402,7 +402,7 @@ static int virtio_mmio_set_guest_notifier(DeviceState *d, int n, bool assign, event_notifier_cleanup(notifier); } - if (vdc->guest_notifier_mask) { + if (vdc->guest_notifier_mask && vdev->use_guest_notifier_mask) { vdc->guest_notifier_mask(vdev, n, !assign); } diff --git a/include/block/block_int.h b/include/block/block_int.h index 83a423c580..4e4562d444 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -184,7 +184,7 @@ struct BlockDriver { /* * Flushes all data that was already written to the OS all the way down to - * the disk (for example raw-posix calls fsync()). + * the disk (for example file-posix.c calls fsync()). */ int coroutine_fn (*bdrv_co_flush_to_disk)(BlockDriverState *bs); diff --git a/include/hw/acpi/acpi-defs.h b/include/hw/acpi/acpi-defs.h index 154f3b82f6..d43ec005cb 100644 --- a/include/hw/acpi/acpi-defs.h +++ b/include/hw/acpi/acpi-defs.h @@ -191,10 +191,8 @@ struct AcpiFadtDescriptorRev5_1 { typedef struct AcpiFadtDescriptorRev5_1 AcpiFadtDescriptorRev5_1; -enum { - ACPI_FADT_ARM_USE_PSCI_G_0_2 = 0, - ACPI_FADT_ARM_PSCI_USE_HVC = 1, -}; +#define ACPI_FADT_ARM_PSCI_COMPLIANT (1 << 0) +#define ACPI_FADT_ARM_PSCI_USE_HVC (1 << 1) /* * Serial Port Console Redirection Table (SPCR), Rev. 1.02 @@ -290,7 +288,7 @@ typedef struct AcpiMultipleApicTable AcpiMultipleApicTable; #define ACPI_APIC_XRUPT_SOURCE 8 #define ACPI_APIC_LOCAL_X2APIC 9 #define ACPI_APIC_LOCAL_X2APIC_NMI 10 -#define ACPI_APIC_GENERIC_INTERRUPT 11 +#define ACPI_APIC_GENERIC_CPU_INTERFACE 11 #define ACPI_APIC_GENERIC_DISTRIBUTOR 12 #define ACPI_APIC_GENERIC_MSI_FRAME 13 #define ACPI_APIC_GENERIC_REDISTRIBUTOR 14 @@ -361,7 +359,7 @@ struct AcpiMadtLocalX2ApicNmi { } QEMU_PACKED; typedef struct AcpiMadtLocalX2ApicNmi AcpiMadtLocalX2ApicNmi; -struct AcpiMadtGenericInterrupt { +struct AcpiMadtGenericCpuInterface { ACPI_SUB_HEADER_DEF uint16_t reserved; uint32_t cpu_interface_number; @@ -378,7 +376,10 @@ struct AcpiMadtGenericInterrupt { uint64_t arm_mpidr; } QEMU_PACKED; -typedef struct AcpiMadtGenericInterrupt AcpiMadtGenericInterrupt; +typedef struct AcpiMadtGenericCpuInterface AcpiMadtGenericCpuInterface; + +/* GICC CPU Interface Flags */ +#define ACPI_MADT_GICC_ENABLED 1 struct AcpiMadtGenericDistributor { ACPI_SUB_HEADER_DEF @@ -427,21 +428,9 @@ typedef struct AcpiMadtGenericTranslator AcpiMadtGenericTranslator; /* * Generic Timer Description Table (GTDT) */ - -#define ACPI_GTDT_INTERRUPT_MODE (1 << 0) -#define ACPI_GTDT_INTERRUPT_POLARITY (1 << 1) -#define ACPI_GTDT_ALWAYS_ON (1 << 2) - -/* Triggering */ - -#define ACPI_LEVEL_SENSITIVE ((uint8_t) 0x00) -#define ACPI_EDGE_SENSITIVE ((uint8_t) 0x01) - -/* Polarity */ - -#define ACPI_ACTIVE_HIGH ((uint8_t) 0x00) -#define ACPI_ACTIVE_LOW ((uint8_t) 0x01) -#define ACPI_ACTIVE_BOTH ((uint8_t) 0x02) +#define ACPI_GTDT_INTERRUPT_MODE_LEVEL (0 << 0) +#define ACPI_GTDT_INTERRUPT_MODE_EDGE (1 << 0) +#define ACPI_GTDT_CAP_ALWAYS_ON (1 << 2) struct AcpiGenericTimerTable { ACPI_TABLE_HEADER_DEF diff --git a/include/hw/arm/virt-acpi-build.h b/include/hw/arm/virt-acpi-build.h deleted file mode 100644 index f5ec749b8f..0000000000 --- a/include/hw/arm/virt-acpi-build.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * - * Copyright (c) 2015 HUAWEI TECHNOLOGIES CO.,LTD. - * - * Author: Shannon Zhao <zhaoshenglong@huawei.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2 or later, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see <http://www.gnu.org/licenses/>. - */ - -#ifndef QEMU_VIRT_ACPI_BUILD_H -#define QEMU_VIRT_ACPI_BUILD_H - -#include "qemu-common.h" -#include "hw/arm/virt.h" -#include "qemu/notify.h" - -#define ACPI_GICC_ENABLED 1 - -typedef struct VirtGuestInfo { - int smp_cpus; - FWCfgState *fw_cfg; - const MemMapEntry *memmap; - const int *irqmap; - bool use_highmem; - int gic_version; - bool no_its; -} VirtGuestInfo; - - -typedef struct VirtGuestInfoState { - VirtGuestInfo info; - Notifier machine_done; -} VirtGuestInfoState; - -void virt_acpi_setup(VirtGuestInfo *guest_info); - -#endif diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h index 9650193253..eb1c63d688 100644 --- a/include/hw/arm/virt.h +++ b/include/hw/arm/virt.h @@ -32,6 +32,9 @@ #include "qemu-common.h" #include "exec/hwaddr.h" +#include "qemu/notify.h" +#include "hw/boards.h" +#include "hw/arm/arm.h" #define NUM_GICV2M_SPIS 64 #define NUM_VIRTIO_TRANSPORTS 32 @@ -74,5 +77,41 @@ typedef struct MemMapEntry { hwaddr size; } MemMapEntry; +typedef struct { + MachineClass parent; + bool disallow_affinity_adjustment; + bool no_its; + bool no_pmu; + bool claim_edge_triggered_timers; +} VirtMachineClass; -#endif +typedef struct { + MachineState parent; + Notifier machine_done; + FWCfgState *fw_cfg; + bool secure; + bool highmem; + int32_t gic_version; + struct arm_boot_info bootinfo; + const MemMapEntry *memmap; + const int *irqmap; + int smp_cpus; + void *fdt; + int fdt_size; + uint32_t clock_phandle; + uint32_t gic_phandle; + uint32_t msi_phandle; + bool using_psci; +} VirtMachineState; + +#define TYPE_VIRT_MACHINE MACHINE_TYPE_NAME("virt") +#define VIRT_MACHINE(obj) \ + OBJECT_CHECK(VirtMachineState, (obj), TYPE_VIRT_MACHINE) +#define VIRT_MACHINE_GET_CLASS(obj) \ + OBJECT_GET_CLASS(VirtMachineClass, obj, TYPE_VIRT_MACHINE) +#define VIRT_MACHINE_CLASS(klass) \ + OBJECT_CLASS_CHECK(VirtMachineClass, klass, TYPE_VIRT_MACHINE) + +void virt_acpi_setup(VirtMachineState *vms); + +#endif /* QEMU_ARM_VIRT_H */ diff --git a/include/hw/i2c/i2c.h b/include/hw/i2c/i2c.h index c4085aa366..2ce611d4c8 100644 --- a/include/hw/i2c/i2c.h +++ b/include/hw/i2c/i2c.h @@ -32,14 +32,22 @@ typedef struct I2CSlaveClass /* Callbacks provided by the device. */ int (*init)(I2CSlave *dev); - /* Master to slave. */ + /* Master to slave. Returns non-zero for a NAK, 0 for success. */ int (*send)(I2CSlave *s, uint8_t data); - /* Slave to master. */ + /* + * Slave to master. This cannot fail, the device should always + * return something here. Negative values probably result in 0xff + * and a possible log from the driver, and shouldn't be used. + */ int (*recv)(I2CSlave *s); - /* Notify the slave of a bus state change. */ - void (*event)(I2CSlave *s, enum i2c_event event); + /* + * Notify the slave of a bus state change. For start event, + * returns non-zero to NAK an operation. For other events the + * return code is not used and should be zero. + */ + int (*event)(I2CSlave *s, enum i2c_event event); } I2CSlaveClass; struct I2CSlave diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h index e6a60d55fd..12584ed1b7 100644 --- a/include/qemu/coroutine.h +++ b/include/qemu/coroutine.h @@ -71,6 +71,12 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque); void qemu_coroutine_enter(Coroutine *coroutine); /** + * Transfer control to a coroutine if it's not active (i.e. part of the call + * stack of the running coroutine). Otherwise, do nothing. + */ +void qemu_coroutine_enter_if_inactive(Coroutine *co); + +/** * Transfer control back to a coroutine's caller * * This function does not return until the coroutine is re-entered using diff --git a/qemu-img.c b/qemu-img.c index 6949b73ca5..5df66fe661 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -3559,20 +3559,23 @@ static void bench_cb(void *opaque, int ret) } while (b->n > b->in_flight && b->in_flight < b->nrreq) { + int64_t offset = b->offset; + /* blk_aio_* might look for completed I/Os and kick bench_cb + * again, so make sure this operation is counted by in_flight + * and b->offset is ready for the next submission. + */ + b->in_flight++; + b->offset += b->step; + b->offset %= b->image_size; if (b->write) { - acb = blk_aio_pwritev(b->blk, b->offset, b->qiov, 0, - bench_cb, b); + acb = blk_aio_pwritev(b->blk, offset, b->qiov, 0, bench_cb, b); } else { - acb = blk_aio_preadv(b->blk, b->offset, b->qiov, 0, - bench_cb, b); + acb = blk_aio_preadv(b->blk, offset, b->qiov, 0, bench_cb, b); } if (!acb) { error_report("Failed to issue request"); exit(EXIT_FAILURE); } - b->in_flight++; - b->offset += b->step; - b->offset %= b->image_size; } } diff --git a/tests/Makefile.include b/tests/Makefile.include index 4841d582a1..f776404d86 100644 --- a/tests/Makefile.include +++ b/tests/Makefile.include @@ -689,7 +689,7 @@ tests/test-filter-mirror$(EXESUF): tests/test-filter-mirror.o $(qtest-obj-y) tests/test-filter-redirector$(EXESUF): tests/test-filter-redirector.o $(qtest-obj-y) tests/test-x86-cpuid-compat$(EXESUF): tests/test-x86-cpuid-compat.o $(qtest-obj-y) tests/ivshmem-test$(EXESUF): tests/ivshmem-test.o contrib/ivshmem-server/ivshmem-server.o $(libqos-pc-obj-y) -tests/vhost-user-bridge$(EXESUF): tests/vhost-user-bridge.o +tests/vhost-user-bridge$(EXESUF): tests/vhost-user-bridge.o contrib/libvhost-user/libvhost-user.o $(test-util-obj-y) tests/test-uuid$(EXESUF): tests/test-uuid.o $(test-util-obj-y) tests/test-arm-mptimer$(EXESUF): tests/test-arm-mptimer.o diff --git a/tests/qemu-iotests/071.out b/tests/qemu-iotests/071.out index 8ff423f56b..dd879f1212 100644 --- a/tests/qemu-iotests/071.out +++ b/tests/qemu-iotests/071.out @@ -12,7 +12,7 @@ read 512/512 bytes at offset 229376 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) wrote 512/512 bytes at offset 0 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -blkverify: read sector_num=0 nb_sectors=1 contents mismatch in sector 0 +blkverify: read offset=0 bytes=512 contents mismatch at offset 0 === Testing blkverify through file blockref === @@ -26,7 +26,7 @@ read 512/512 bytes at offset 229376 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) wrote 512/512 bytes at offset 0 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -blkverify: read sector_num=0 nb_sectors=1 contents mismatch in sector 0 +blkverify: read offset=0 bytes=512 contents mismatch at offset 0 === Testing blkdebug through filename === @@ -56,7 +56,7 @@ QMP_VERSION {"return": {}} {"return": {}} {"return": {}} -blkverify: read sector_num=0 nb_sectors=1 contents mismatch in sector 0 +blkverify: read offset=0 bytes=512 contents mismatch at offset 0 === Testing blkverify on existing raw block device === @@ -66,7 +66,7 @@ QMP_VERSION {"return": {}} {"return": {}} {"return": {}} -blkverify: read sector_num=0 nb_sectors=1 contents mismatch in sector 0 +blkverify: read offset=0 bytes=512 contents mismatch at offset 0 === Testing blkdebug's set-state through QMP === diff --git a/tests/vhost-user-bridge.c b/tests/vhost-user-bridge.c index 775e031069..8618c20d53 100644 --- a/tests/vhost-user-bridge.c +++ b/tests/vhost-user-bridge.c @@ -30,17 +30,9 @@ #define _FILE_OFFSET_BITS 64 #include "qemu/osdep.h" -#include <sys/socket.h> -#include <sys/un.h> -#include <sys/unistd.h> -#include <sys/eventfd.h> -#include <arpa/inet.h> -#include <netdb.h> -#include <linux/vhost.h> - -#include "qemu/atomic.h" +#include "qemu/iov.h" #include "standard-headers/linux/virtio_net.h" -#include "standard-headers/linux/virtio_ring.h" +#include "contrib/libvhost-user/libvhost-user.h" #define VHOST_USER_BRIDGE_DEBUG 1 @@ -64,6 +56,17 @@ typedef struct Dispatcher { Event events[FD_SETSIZE]; } Dispatcher; +typedef struct VubrDev { + VuDev vudev; + Dispatcher dispatcher; + int backend_udp_sock; + struct sockaddr_in backend_udp_dest; + int hdrlen; + int sock; + int ready; + int quit; +} VubrDev; + static void vubr_die(const char *s) { @@ -101,8 +104,6 @@ dispatcher_add(Dispatcher *dispr, int sock, void *ctx, CallbackFunc cb) return 0; } -/* dispatcher_remove() is not currently in use but may be useful - * in the future. */ static int dispatcher_remove(Dispatcher *dispr, int sock) { @@ -157,1039 +158,313 @@ dispatcher_wait(Dispatcher *dispr, uint32_t timeout) return 0; } -typedef struct VubrVirtq { - int call_fd; - int kick_fd; - uint32_t size; - uint16_t last_avail_index; - uint16_t last_used_index; - struct vring_desc *desc; - struct vring_avail *avail; - struct vring_used *used; - uint64_t log_guest_addr; - int enable; -} VubrVirtq; - -/* Based on qemu/hw/virtio/vhost-user.c */ - -#define VHOST_MEMORY_MAX_NREGIONS 8 -#define VHOST_USER_F_PROTOCOL_FEATURES 30 -/* v1.0 compliant. */ -#define VIRTIO_F_VERSION_1 32 - -#define VHOST_LOG_PAGE 4096 - -enum VhostUserProtocolFeature { - VHOST_USER_PROTOCOL_F_MQ = 0, - VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1, - VHOST_USER_PROTOCOL_F_RARP = 2, - - VHOST_USER_PROTOCOL_F_MAX -}; - -#define VHOST_USER_PROTOCOL_FEATURE_MASK ((1 << VHOST_USER_PROTOCOL_F_MAX) - 1) - -typedef enum VhostUserRequest { - VHOST_USER_NONE = 0, - VHOST_USER_GET_FEATURES = 1, - VHOST_USER_SET_FEATURES = 2, - VHOST_USER_SET_OWNER = 3, - VHOST_USER_RESET_OWNER = 4, - VHOST_USER_SET_MEM_TABLE = 5, - VHOST_USER_SET_LOG_BASE = 6, - VHOST_USER_SET_LOG_FD = 7, - VHOST_USER_SET_VRING_NUM = 8, - VHOST_USER_SET_VRING_ADDR = 9, - VHOST_USER_SET_VRING_BASE = 10, - VHOST_USER_GET_VRING_BASE = 11, - VHOST_USER_SET_VRING_KICK = 12, - VHOST_USER_SET_VRING_CALL = 13, - VHOST_USER_SET_VRING_ERR = 14, - VHOST_USER_GET_PROTOCOL_FEATURES = 15, - VHOST_USER_SET_PROTOCOL_FEATURES = 16, - VHOST_USER_GET_QUEUE_NUM = 17, - VHOST_USER_SET_VRING_ENABLE = 18, - VHOST_USER_SEND_RARP = 19, - VHOST_USER_MAX -} VhostUserRequest; - -typedef struct VhostUserMemoryRegion { - uint64_t guest_phys_addr; - uint64_t memory_size; - uint64_t userspace_addr; - uint64_t mmap_offset; -} VhostUserMemoryRegion; - -typedef struct VhostUserMemory { - uint32_t nregions; - uint32_t padding; - VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; -} VhostUserMemory; - -typedef struct VhostUserLog { - uint64_t mmap_size; - uint64_t mmap_offset; -} VhostUserLog; - -typedef struct VhostUserMsg { - VhostUserRequest request; - -#define VHOST_USER_VERSION_MASK (0x3) -#define VHOST_USER_REPLY_MASK (0x1<<2) - uint32_t flags; - uint32_t size; /* the following payload size */ - union { -#define VHOST_USER_VRING_IDX_MASK (0xff) -#define VHOST_USER_VRING_NOFD_MASK (0x1<<8) - uint64_t u64; - struct vhost_vring_state state; - struct vhost_vring_addr addr; - VhostUserMemory memory; - VhostUserLog log; - } payload; - int fds[VHOST_MEMORY_MAX_NREGIONS]; - int fd_num; -} QEMU_PACKED VhostUserMsg; - -#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64) - -/* The version of the protocol we support */ -#define VHOST_USER_VERSION (0x1) - -#define MAX_NR_VIRTQUEUE (8) - -typedef struct VubrDevRegion { - /* Guest Physical address. */ - uint64_t gpa; - /* Memory region size. */ - uint64_t size; - /* QEMU virtual address (userspace). */ - uint64_t qva; - /* Starting offset in our mmaped space. */ - uint64_t mmap_offset; - /* Start address of mmaped space. */ - uint64_t mmap_addr; -} VubrDevRegion; - -typedef struct VubrDev { - int sock; - Dispatcher dispatcher; - uint32_t nregions; - VubrDevRegion regions[VHOST_MEMORY_MAX_NREGIONS]; - VubrVirtq vq[MAX_NR_VIRTQUEUE]; - int log_call_fd; - uint64_t log_size; - uint8_t *log_table; - int backend_udp_sock; - struct sockaddr_in backend_udp_dest; - int ready; - uint64_t features; - int hdrlen; -} VubrDev; - -static const char *vubr_request_str[] = { - [VHOST_USER_NONE] = "VHOST_USER_NONE", - [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", - [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES", - [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER", - [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER", - [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE", - [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE", - [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD", - [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM", - [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR", - [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE", - [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE", - [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK", - [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL", - [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR", - [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES", - [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES", - [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", - [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE", - [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP", - [VHOST_USER_MAX] = "VHOST_USER_MAX", -}; - static void -print_buffer(uint8_t *buf, size_t len) +vubr_handle_tx(VuDev *dev, int qidx) { - int i; - printf("Raw buffer:\n"); - for (i = 0; i < len; i++) { - if (i % 16 == 0) { - printf("\n"); - } - if (i % 4 == 0) { - printf(" "); - } - printf("%02x ", buf[i]); - } - printf("\n............................................................\n"); -} + VuVirtq *vq = vu_get_queue(dev, qidx); + VubrDev *vubr = container_of(dev, VubrDev, vudev); + int hdrlen = vubr->hdrlen; + VuVirtqElement *elem = NULL; -/* Translate guest physical address to our virtual address. */ -static uint64_t -gpa_to_va(VubrDev *dev, uint64_t guest_addr) -{ - int i; - - /* Find matching memory region. */ - for (i = 0; i < dev->nregions; i++) { - VubrDevRegion *r = &dev->regions[i]; - - if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) { - return guest_addr - r->gpa + r->mmap_addr + r->mmap_offset; - } - } - - assert(!"address not found in regions"); - return 0; -} - -/* Translate qemu virtual address to our virtual address. */ -static uint64_t -qva_to_va(VubrDev *dev, uint64_t qemu_addr) -{ - int i; + assert(qidx % 2); - /* Find matching memory region. */ - for (i = 0; i < dev->nregions; i++) { - VubrDevRegion *r = &dev->regions[i]; + for (;;) { + ssize_t ret; + unsigned int out_num; + struct iovec sg[VIRTQUEUE_MAX_SIZE], *out_sg; - if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) { - return qemu_addr - r->qva + r->mmap_addr + r->mmap_offset; + elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement)); + if (!elem) { + break; } - } - - assert(!"address not found in regions"); - return 0; -} -static void -vubr_message_read(int conn_fd, VhostUserMsg *vmsg) -{ - char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { }; - struct iovec iov = { - .iov_base = (char *)vmsg, - .iov_len = VHOST_USER_HDR_SIZE, - }; - struct msghdr msg = { - .msg_iov = &iov, - .msg_iovlen = 1, - .msg_control = control, - .msg_controllen = sizeof(control), - }; - size_t fd_size; - struct cmsghdr *cmsg; - int rc; - - rc = recvmsg(conn_fd, &msg, 0); - - if (rc == 0) { - vubr_die("recvmsg"); - fprintf(stderr, "Peer disconnected.\n"); - exit(1); - } - if (rc < 0) { - vubr_die("recvmsg"); - } - - vmsg->fd_num = 0; - for (cmsg = CMSG_FIRSTHDR(&msg); - cmsg != NULL; - cmsg = CMSG_NXTHDR(&msg, cmsg)) - { - if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) { - fd_size = cmsg->cmsg_len - CMSG_LEN(0); - vmsg->fd_num = fd_size / sizeof(int); - memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size); + out_num = elem->out_num; + out_sg = elem->out_sg; + if (out_num < 1) { + fprintf(stderr, "virtio-net header not in first element\n"); break; } - } - - if (vmsg->size > sizeof(vmsg->payload)) { - fprintf(stderr, - "Error: too big message request: %d, size: vmsg->size: %u, " - "while sizeof(vmsg->payload) = %zu\n", - vmsg->request, vmsg->size, sizeof(vmsg->payload)); - exit(1); - } - - if (vmsg->size) { - rc = read(conn_fd, &vmsg->payload, vmsg->size); - if (rc == 0) { - vubr_die("recvmsg"); - fprintf(stderr, "Peer disconnected.\n"); - exit(1); + if (VHOST_USER_BRIDGE_DEBUG) { + iov_hexdump(out_sg, out_num, stderr, "TX:", 1024); } - if (rc < 0) { - vubr_die("recvmsg"); + + if (hdrlen) { + unsigned sg_num = iov_copy(sg, ARRAY_SIZE(sg), + out_sg, out_num, + hdrlen, -1); + out_num = sg_num; + out_sg = sg; } - assert(rc == vmsg->size); - } -} + struct msghdr msg = { + .msg_name = (struct sockaddr *) &vubr->backend_udp_dest, + .msg_namelen = sizeof(struct sockaddr_in), + .msg_iov = out_sg, + .msg_iovlen = out_num, + }; + do { + ret = sendmsg(vubr->backend_udp_sock, &msg, 0); + } while (ret == -1 && (errno == EAGAIN || errno == EINTR)); -static void -vubr_message_write(int conn_fd, VhostUserMsg *vmsg) -{ - int rc; + if (ret == -1) { + vubr_die("sendmsg()"); + } - do { - rc = write(conn_fd, vmsg, VHOST_USER_HDR_SIZE + vmsg->size); - } while (rc < 0 && errno == EINTR); + vu_queue_push(dev, vq, elem, 0); + vu_queue_notify(dev, vq); - if (rc < 0) { - vubr_die("write"); + free(elem); + elem = NULL; } -} -static void -vubr_backend_udp_sendbuf(VubrDev *dev, uint8_t *buf, size_t len) -{ - int slen = sizeof(struct sockaddr_in); - - if (sendto(dev->backend_udp_sock, buf, len, 0, - (struct sockaddr *) &dev->backend_udp_dest, slen) == -1) { - vubr_die("sendto()"); - } + free(elem); } -static int -vubr_backend_udp_recvbuf(VubrDev *dev, uint8_t *buf, size_t buflen) +static void +iov_restore_front(struct iovec *front, struct iovec *iov, size_t bytes) { - int slen = sizeof(struct sockaddr_in); - int rc; + struct iovec *cur; - rc = recvfrom(dev->backend_udp_sock, buf, buflen, 0, - (struct sockaddr *) &dev->backend_udp_dest, - (socklen_t *)&slen); - if (rc == -1) { - vubr_die("recvfrom()"); + for (cur = front; front != iov; cur++) { + bytes -= cur->iov_len; } - return rc; + cur->iov_base -= bytes; + cur->iov_len += bytes; } static void -vubr_consume_raw_packet(VubrDev *dev, uint8_t *buf, uint32_t len) +iov_truncate(struct iovec *iov, unsigned iovc, size_t bytes) { - int hdrlen = dev->hdrlen; - DPRINT(" hdrlen = %d\n", dev->hdrlen); + unsigned i; - if (VHOST_USER_BRIDGE_DEBUG) { - print_buffer(buf, len); - } - vubr_backend_udp_sendbuf(dev, buf + hdrlen, len - hdrlen); -} + for (i = 0; i < iovc; i++, iov++) { + if (bytes < iov->iov_len) { + iov->iov_len = bytes; + return; + } -/* Kick the log_call_fd if required. */ -static void -vubr_log_kick(VubrDev *dev) -{ - if (dev->log_call_fd != -1) { - DPRINT("Kicking the QEMU's log...\n"); - eventfd_write(dev->log_call_fd, 1); + bytes -= iov->iov_len; } -} -/* Kick the guest if necessary. */ -static void -vubr_virtqueue_kick(VubrVirtq *vq) -{ - if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { - DPRINT("Kicking the guest...\n"); - eventfd_write(vq->call_fd, 1); - } + assert(!"couldn't truncate iov"); } static void -vubr_log_page(uint8_t *log_table, uint64_t page) +vubr_backend_recv_cb(int sock, void *ctx) { - DPRINT("Logged dirty guest page: %"PRId64"\n", page); - atomic_or(&log_table[page / 8], 1 << (page % 8)); -} + VubrDev *vubr = (VubrDev *) ctx; + VuDev *dev = &vubr->vudev; + VuVirtq *vq = vu_get_queue(dev, 0); + VuVirtqElement *elem = NULL; + struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE]; + struct virtio_net_hdr_mrg_rxbuf mhdr; + unsigned mhdr_cnt = 0; + int hdrlen = vubr->hdrlen; + int i = 0; + struct virtio_net_hdr hdr = { + .flags = 0, + .gso_type = VIRTIO_NET_HDR_GSO_NONE + }; -static void -vubr_log_write(VubrDev *dev, uint64_t address, uint64_t length) -{ - uint64_t page; + DPRINT("\n\n *** IN UDP RECEIVE CALLBACK ***\n\n"); + DPRINT(" hdrlen = %d\n", hdrlen); - if (!(dev->features & (1ULL << VHOST_F_LOG_ALL)) || - !dev->log_table || !length) { + if (!vu_queue_enabled(dev, vq) || + !vu_queue_avail_bytes(dev, vq, hdrlen, 0)) { + DPRINT("Got UDP packet, but no available descriptors on RX virtq.\n"); return; } - assert(dev->log_size > ((address + length - 1) / VHOST_LOG_PAGE / 8)); - - page = address / VHOST_LOG_PAGE; - while (page * VHOST_LOG_PAGE < address + length) { - vubr_log_page(dev->log_table, page); - page += VHOST_LOG_PAGE; - } - vubr_log_kick(dev); -} - -static void -vubr_post_buffer(VubrDev *dev, VubrVirtq *vq, uint8_t *buf, int32_t len) -{ - struct vring_desc *desc = vq->desc; - struct vring_avail *avail = vq->avail; - struct vring_used *used = vq->used; - uint64_t log_guest_addr = vq->log_guest_addr; - int32_t remaining_len = len; - - unsigned int size = vq->size; - - uint16_t avail_index = atomic_mb_read(&avail->idx); - - /* We check the available descriptors before posting the - * buffer, so here we assume that enough available - * descriptors. */ - assert(vq->last_avail_index != avail_index); - uint16_t a_index = vq->last_avail_index % size; - uint16_t u_index = vq->last_used_index % size; - uint16_t d_index = avail->ring[a_index]; - - int i = d_index; - uint32_t written_len = 0; - do { - DPRINT("Post packet to guest on vq:\n"); - DPRINT(" size = %d\n", vq->size); - DPRINT(" last_avail_index = %d\n", vq->last_avail_index); - DPRINT(" last_used_index = %d\n", vq->last_used_index); - DPRINT(" a_index = %d\n", a_index); - DPRINT(" u_index = %d\n", u_index); - DPRINT(" d_index = %d\n", d_index); - DPRINT(" desc[%d].addr = 0x%016"PRIx64"\n", i, desc[i].addr); - DPRINT(" desc[%d].len = %d\n", i, desc[i].len); - DPRINT(" desc[%d].flags = %d\n", i, desc[i].flags); - DPRINT(" avail->idx = %d\n", avail_index); - DPRINT(" used->idx = %d\n", used->idx); - - if (!(desc[i].flags & VRING_DESC_F_WRITE)) { - /* FIXME: we should find writable descriptor. */ - fprintf(stderr, "Error: descriptor is not writable. Exiting.\n"); - exit(1); - } - - void *chunk_start = (void *)(uintptr_t)gpa_to_va(dev, desc[i].addr); - uint32_t chunk_len = desc[i].len; - uint32_t chunk_write_len = MIN(remaining_len, chunk_len); + struct iovec *sg; + ssize_t ret, total = 0; + unsigned int num; - memcpy(chunk_start, buf + written_len, chunk_write_len); - vubr_log_write(dev, desc[i].addr, chunk_write_len); - remaining_len -= chunk_write_len; - written_len += chunk_write_len; - - if ((remaining_len == 0) || !(desc[i].flags & VRING_DESC_F_NEXT)) { + elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement)); + if (!elem) { break; } - i = desc[i].next; - } while (1); - - if (remaining_len > 0) { - fprintf(stderr, - "Too long packet for RX, remaining_len = %d, Dropping...\n", - remaining_len); - return; - } - - /* Add descriptor to the used ring. */ - used->ring[u_index].id = d_index; - used->ring[u_index].len = len; - vubr_log_write(dev, - log_guest_addr + offsetof(struct vring_used, ring[u_index]), - sizeof(used->ring[u_index])); - - vq->last_avail_index++; - vq->last_used_index++; - - atomic_mb_set(&used->idx, vq->last_used_index); - vubr_log_write(dev, - log_guest_addr + offsetof(struct vring_used, idx), - sizeof(used->idx)); - - /* Kick the guest if necessary. */ - vubr_virtqueue_kick(vq); -} - -static int -vubr_process_desc(VubrDev *dev, VubrVirtq *vq) -{ - struct vring_desc *desc = vq->desc; - struct vring_avail *avail = vq->avail; - struct vring_used *used = vq->used; - uint64_t log_guest_addr = vq->log_guest_addr; - - unsigned int size = vq->size; - - uint16_t a_index = vq->last_avail_index % size; - uint16_t u_index = vq->last_used_index % size; - uint16_t d_index = avail->ring[a_index]; - - uint32_t i, len = 0; - size_t buf_size = 4096; - uint8_t buf[4096]; - - DPRINT("Chunks: "); - i = d_index; - do { - void *chunk_start = (void *)(uintptr_t)gpa_to_va(dev, desc[i].addr); - uint32_t chunk_len = desc[i].len; - - assert(!(desc[i].flags & VRING_DESC_F_WRITE)); - - if (len + chunk_len < buf_size) { - memcpy(buf + len, chunk_start, chunk_len); - DPRINT("%d ", chunk_len); - } else { - fprintf(stderr, "Error: too long packet. Dropping...\n"); + if (elem->in_num < 1) { + fprintf(stderr, "virtio-net contains no in buffers\n"); break; } - len += chunk_len; - - if (!(desc[i].flags & VRING_DESC_F_NEXT)) { - break; + sg = elem->in_sg; + num = elem->in_num; + if (i == 0) { + if (hdrlen == 12) { + mhdr_cnt = iov_copy(mhdr_sg, ARRAY_SIZE(mhdr_sg), + sg, elem->in_num, + offsetof(typeof(mhdr), num_buffers), + sizeof(mhdr.num_buffers)); + } + iov_from_buf(sg, elem->in_num, 0, &hdr, sizeof hdr); + total += hdrlen; + assert(iov_discard_front(&sg, &num, hdrlen) == hdrlen); } - i = desc[i].next; - } while (1); - DPRINT("\n"); - - if (!len) { - return -1; - } - - /* Add descriptor to the used ring. */ - used->ring[u_index].id = d_index; - used->ring[u_index].len = len; - vubr_log_write(dev, - log_guest_addr + offsetof(struct vring_used, ring[u_index]), - sizeof(used->ring[u_index])); - - vubr_consume_raw_packet(dev, buf, len); - - return 0; -} + struct msghdr msg = { + .msg_name = (struct sockaddr *) &vubr->backend_udp_dest, + .msg_namelen = sizeof(struct sockaddr_in), + .msg_iov = sg, + .msg_iovlen = elem->in_num, + .msg_flags = MSG_DONTWAIT, + }; + do { + ret = recvmsg(vubr->backend_udp_sock, &msg, 0); + } while (ret == -1 && (errno == EINTR)); -static void -vubr_process_avail(VubrDev *dev, VubrVirtq *vq) -{ - struct vring_avail *avail = vq->avail; - struct vring_used *used = vq->used; - uint64_t log_guest_addr = vq->log_guest_addr; - - while (vq->last_avail_index != atomic_mb_read(&avail->idx)) { - vubr_process_desc(dev, vq); - vq->last_avail_index++; - vq->last_used_index++; - } + if (i == 0) { + iov_restore_front(elem->in_sg, sg, hdrlen); + } - atomic_mb_set(&used->idx, vq->last_used_index); - vubr_log_write(dev, - log_guest_addr + offsetof(struct vring_used, idx), - sizeof(used->idx)); -} + if (ret == -1) { + if (errno == EWOULDBLOCK) { + vu_queue_rewind(dev, vq, 1); + break; + } -static void -vubr_backend_recv_cb(int sock, void *ctx) -{ - VubrDev *dev = (VubrDev *) ctx; - VubrVirtq *rx_vq = &dev->vq[0]; - uint8_t buf[4096]; - struct virtio_net_hdr_v1 *hdr = (struct virtio_net_hdr_v1 *)buf; - int hdrlen = dev->hdrlen; - int buflen = sizeof(buf); - int len; - - if (!dev->ready) { - return; - } + vubr_die("recvmsg()"); + } - DPRINT("\n\n *** IN UDP RECEIVE CALLBACK ***\n\n"); - DPRINT(" hdrlen = %d\n", hdrlen); + total += ret; + iov_truncate(elem->in_sg, elem->in_num, total); + vu_queue_fill(dev, vq, elem, total, i++); - uint16_t avail_index = atomic_mb_read(&rx_vq->avail->idx); + free(elem); + elem = NULL; + } while (false); /* could loop if DONTWAIT worked? */ - /* If there is no available descriptors, just do nothing. - * The buffer will be handled by next arrived UDP packet, - * or next kick on receive virtq. */ - if (rx_vq->last_avail_index == avail_index) { - DPRINT("Got UDP packet, but no available descriptors on RX virtq.\n"); - return; + if (mhdr_cnt) { + mhdr.num_buffers = i; + iov_from_buf(mhdr_sg, mhdr_cnt, + 0, + &mhdr.num_buffers, sizeof mhdr.num_buffers); } - memset(buf, 0, hdrlen); - /* TODO: support mergeable buffers. */ - if (hdrlen == 12) - hdr->num_buffers = 1; - len = vubr_backend_udp_recvbuf(dev, buf + hdrlen, buflen - hdrlen); + vu_queue_flush(dev, vq, i); + vu_queue_notify(dev, vq); - vubr_post_buffer(dev, rx_vq, buf, len + hdrlen); + free(elem); } static void -vubr_kick_cb(int sock, void *ctx) +vubr_receive_cb(int sock, void *ctx) { - VubrDev *dev = (VubrDev *) ctx; - eventfd_t kick_data; - ssize_t rc; + VubrDev *vubr = (VubrDev *)ctx; - rc = eventfd_read(sock, &kick_data); - if (rc == -1) { - vubr_die("eventfd_read()"); - } else { - DPRINT("Got kick_data: %016"PRIx64"\n", kick_data); - vubr_process_avail(dev, &dev->vq[1]); + if (!vu_dispatch(&vubr->vudev)) { + fprintf(stderr, "Error while dispatching\n"); } } -static int -vubr_none_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - DPRINT("Function %s() not implemented yet.\n", __func__); - return 0; -} +typedef struct WatchData { + VuDev *dev; + vu_watch_cb cb; + void *data; +} WatchData; -static int -vubr_get_features_exec(VubrDev *dev, VhostUserMsg *vmsg) +static void +watch_cb(int sock, void *ctx) { - vmsg->payload.u64 = - ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | - (1ULL << VHOST_F_LOG_ALL) | - (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | - (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)); - - vmsg->size = sizeof(vmsg->payload.u64); + struct WatchData *wd = ctx; - DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64); - - /* Reply */ - return 1; + wd->cb(wd->dev, VU_WATCH_IN, wd->data); } -static int -vubr_set_features_exec(VubrDev *dev, VhostUserMsg *vmsg) +static void +vubr_set_watch(VuDev *dev, int fd, int condition, + vu_watch_cb cb, void *data) { - DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); - - dev->features = vmsg->payload.u64; - if ((dev->features & (1ULL << VIRTIO_F_VERSION_1)) || - (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))) { - dev->hdrlen = 12; - } else { - dev->hdrlen = 10; - } + VubrDev *vubr = container_of(dev, VubrDev, vudev); + static WatchData watches[FD_SETSIZE]; + struct WatchData *wd = &watches[fd]; - return 0; -} - -static int -vubr_set_owner_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - return 0; + wd->cb = cb; + wd->data = data; + wd->dev = dev; + dispatcher_add(&vubr->dispatcher, fd, wd, watch_cb); } static void -vubr_close_log(VubrDev *dev) +vubr_remove_watch(VuDev *dev, int fd) { - if (dev->log_table) { - if (munmap(dev->log_table, dev->log_size) != 0) { - vubr_die("munmap()"); - } + VubrDev *vubr = container_of(dev, VubrDev, vudev); - dev->log_table = 0; - } - if (dev->log_call_fd != -1) { - close(dev->log_call_fd); - dev->log_call_fd = -1; - } -} - -static int -vubr_reset_device_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - vubr_close_log(dev); - dev->ready = 0; - dev->features = 0; - return 0; + dispatcher_remove(&vubr->dispatcher, fd); } static int -vubr_set_mem_table_exec(VubrDev *dev, VhostUserMsg *vmsg) +vubr_send_rarp_exec(VuDev *dev, VhostUserMsg *vmsg) { - int i; - VhostUserMemory *memory = &vmsg->payload.memory; - dev->nregions = memory->nregions; - - DPRINT("Nregions: %d\n", memory->nregions); - for (i = 0; i < dev->nregions; i++) { - void *mmap_addr; - VhostUserMemoryRegion *msg_region = &memory->regions[i]; - VubrDevRegion *dev_region = &dev->regions[i]; - - DPRINT("Region %d\n", i); - DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", - msg_region->guest_phys_addr); - DPRINT(" memory_size: 0x%016"PRIx64"\n", - msg_region->memory_size); - DPRINT(" userspace_addr 0x%016"PRIx64"\n", - msg_region->userspace_addr); - DPRINT(" mmap_offset 0x%016"PRIx64"\n", - msg_region->mmap_offset); - - dev_region->gpa = msg_region->guest_phys_addr; - dev_region->size = msg_region->memory_size; - dev_region->qva = msg_region->userspace_addr; - dev_region->mmap_offset = msg_region->mmap_offset; - - /* We don't use offset argument of mmap() since the - * mapped address has to be page aligned, and we use huge - * pages. */ - mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, - PROT_READ | PROT_WRITE, MAP_SHARED, - vmsg->fds[i], 0); - - if (mmap_addr == MAP_FAILED) { - vubr_die("mmap"); - } - dev_region->mmap_addr = (uint64_t)(uintptr_t)mmap_addr; - DPRINT(" mmap_addr: 0x%016"PRIx64"\n", dev_region->mmap_addr); - - close(vmsg->fds[i]); - } - + DPRINT("Function %s() not implemented yet.\n", __func__); return 0; } static int -vubr_set_log_base_exec(VubrDev *dev, VhostUserMsg *vmsg) +vubr_process_msg(VuDev *dev, VhostUserMsg *vmsg, int *do_reply) { - int fd; - uint64_t log_mmap_size, log_mmap_offset; - void *rc; - - assert(vmsg->fd_num == 1); - fd = vmsg->fds[0]; - - assert(vmsg->size == sizeof(vmsg->payload.log)); - log_mmap_offset = vmsg->payload.log.mmap_offset; - log_mmap_size = vmsg->payload.log.mmap_size; - DPRINT("Log mmap_offset: %"PRId64"\n", log_mmap_offset); - DPRINT("Log mmap_size: %"PRId64"\n", log_mmap_size); - - rc = mmap(0, log_mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, - log_mmap_offset); - if (rc == MAP_FAILED) { - vubr_die("mmap"); + switch (vmsg->request) { + case VHOST_USER_SEND_RARP: + *do_reply = vubr_send_rarp_exec(dev, vmsg); + return 1; + default: + /* let the library handle the rest */ + return 0; } - dev->log_table = rc; - dev->log_size = log_mmap_size; - vmsg->size = sizeof(vmsg->payload.u64); - /* Reply */ - return 1; -} - -static int -vubr_set_log_fd_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - assert(vmsg->fd_num == 1); - dev->log_call_fd = vmsg->fds[0]; - DPRINT("Got log_call_fd: %d\n", vmsg->fds[0]); return 0; } -static int -vubr_set_vring_num_exec(VubrDev *dev, VhostUserMsg *vmsg) +static void +vubr_set_features(VuDev *dev, uint64_t features) { - unsigned int index = vmsg->payload.state.index; - unsigned int num = vmsg->payload.state.num; - - DPRINT("State.index: %d\n", index); - DPRINT("State.num: %d\n", num); - dev->vq[index].size = num; - return 0; -} + VubrDev *vubr = container_of(dev, VubrDev, vudev); -static int -vubr_set_vring_addr_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - struct vhost_vring_addr *vra = &vmsg->payload.addr; - unsigned int index = vra->index; - VubrVirtq *vq = &dev->vq[index]; - - DPRINT("vhost_vring_addr:\n"); - DPRINT(" index: %d\n", vra->index); - DPRINT(" flags: %d\n", vra->flags); - DPRINT(" desc_user_addr: 0x%016llx\n", vra->desc_user_addr); - DPRINT(" used_user_addr: 0x%016llx\n", vra->used_user_addr); - DPRINT(" avail_user_addr: 0x%016llx\n", vra->avail_user_addr); - DPRINT(" log_guest_addr: 0x%016llx\n", vra->log_guest_addr); - - vq->desc = (struct vring_desc *)(uintptr_t)qva_to_va(dev, vra->desc_user_addr); - vq->used = (struct vring_used *)(uintptr_t)qva_to_va(dev, vra->used_user_addr); - vq->avail = (struct vring_avail *)(uintptr_t)qva_to_va(dev, vra->avail_user_addr); - vq->log_guest_addr = vra->log_guest_addr; - - DPRINT("Setting virtq addresses:\n"); - DPRINT(" vring_desc at %p\n", vq->desc); - DPRINT(" vring_used at %p\n", vq->used); - DPRINT(" vring_avail at %p\n", vq->avail); - - vq->last_used_index = vq->used->idx; - - if (vq->last_avail_index != vq->used->idx) { - DPRINT("Last avail index != used index: %d != %d, resuming", - vq->last_avail_index, vq->used->idx); - vq->last_avail_index = vq->used->idx; + if ((features & (1ULL << VIRTIO_F_VERSION_1)) || + (features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))) { + vubr->hdrlen = 12; + } else { + vubr->hdrlen = 10; } - - return 0; } -static int -vubr_set_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - unsigned int index = vmsg->payload.state.index; - unsigned int num = vmsg->payload.state.num; - - DPRINT("State.index: %d\n", index); - DPRINT("State.num: %d\n", num); - dev->vq[index].last_avail_index = num; - - return 0; -} - -static int -vubr_get_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - unsigned int index = vmsg->payload.state.index; - - DPRINT("State.index: %d\n", index); - vmsg->payload.state.num = dev->vq[index].last_avail_index; - vmsg->size = sizeof(vmsg->payload.state); - /* FIXME: this is a work-around for a bug in QEMU enabling - * too early vrings. When protocol features are enabled, - * we have to respect * VHOST_USER_SET_VRING_ENABLE request. */ - dev->ready = 0; - - if (dev->vq[index].call_fd != -1) { - close(dev->vq[index].call_fd); - dispatcher_remove(&dev->dispatcher, dev->vq[index].call_fd); - dev->vq[index].call_fd = -1; - } - if (dev->vq[index].kick_fd != -1) { - close(dev->vq[index].kick_fd); - dispatcher_remove(&dev->dispatcher, dev->vq[index].kick_fd); - dev->vq[index].kick_fd = -1; - } - - /* Reply */ - return 1; -} - -static int -vubr_set_vring_kick_exec(VubrDev *dev, VhostUserMsg *vmsg) +static uint64_t +vubr_get_features(VuDev *dev) { - uint64_t u64_arg = vmsg->payload.u64; - int index = u64_arg & VHOST_USER_VRING_IDX_MASK; - - DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); - - assert((u64_arg & VHOST_USER_VRING_NOFD_MASK) == 0); - assert(vmsg->fd_num == 1); - - if (dev->vq[index].kick_fd != -1) { - close(dev->vq[index].kick_fd); - dispatcher_remove(&dev->dispatcher, dev->vq[index].kick_fd); - } - dev->vq[index].kick_fd = vmsg->fds[0]; - DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index); - - if (index % 2 == 1) { - /* TX queue. */ - dispatcher_add(&dev->dispatcher, dev->vq[index].kick_fd, - dev, vubr_kick_cb); - - DPRINT("Waiting for kicks on fd: %d for vq: %d\n", - dev->vq[index].kick_fd, index); - } - /* We temporarily use this hack to determine that both TX and RX - * queues are set up and ready for processing. - * FIXME: we need to rely in VHOST_USER_SET_VRING_ENABLE and - * actual kicks. */ - if (dev->vq[0].kick_fd != -1 && - dev->vq[1].kick_fd != -1) { - dev->ready = 1; - DPRINT("vhost-user-bridge is ready for processing queues.\n"); - } - return 0; - + return 1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE | + 1ULL << VIRTIO_NET_F_MRG_RXBUF; } -static int -vubr_set_vring_call_exec(VubrDev *dev, VhostUserMsg *vmsg) +static void +vubr_queue_set_started(VuDev *dev, int qidx, bool started) { - uint64_t u64_arg = vmsg->payload.u64; - int index = u64_arg & VHOST_USER_VRING_IDX_MASK; + VuVirtq *vq = vu_get_queue(dev, qidx); - DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); - assert((u64_arg & VHOST_USER_VRING_NOFD_MASK) == 0); - assert(vmsg->fd_num == 1); - - if (dev->vq[index].call_fd != -1) { - close(dev->vq[index].call_fd); - dispatcher_remove(&dev->dispatcher, dev->vq[index].call_fd); + if (qidx % 2 == 1) { + vu_set_queue_handler(dev, vq, started ? vubr_handle_tx : NULL); } - dev->vq[index].call_fd = vmsg->fds[0]; - DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index); - - return 0; -} - -static int -vubr_set_vring_err_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); - return 0; -} - -static int -vubr_get_protocol_features_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - vmsg->payload.u64 = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD; - DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); - vmsg->size = sizeof(vmsg->payload.u64); - - /* Reply */ - return 1; } -static int -vubr_set_protocol_features_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - /* FIXME: unimplented */ - DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); - return 0; -} - -static int -vubr_get_queue_num_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - DPRINT("Function %s() not implemented yet.\n", __func__); - return 0; -} - -static int -vubr_set_vring_enable_exec(VubrDev *dev, VhostUserMsg *vmsg) +static void +vubr_panic(VuDev *dev, const char *msg) { - unsigned int index = vmsg->payload.state.index; - unsigned int enable = vmsg->payload.state.num; + VubrDev *vubr = container_of(dev, VubrDev, vudev); - DPRINT("State.index: %d\n", index); - DPRINT("State.enable: %d\n", enable); - dev->vq[index].enable = enable; - return 0; -} + fprintf(stderr, "PANIC: %s\n", msg); -static int -vubr_send_rarp_exec(VubrDev *dev, VhostUserMsg *vmsg) -{ - DPRINT("Function %s() not implemented yet.\n", __func__); - return 0; + dispatcher_remove(&vubr->dispatcher, dev->sock); + vubr->quit = 1; } -static int -vubr_execute_request(VubrDev *dev, VhostUserMsg *vmsg) -{ - /* Print out generic part of the request. */ - DPRINT( - "================== Vhost user message from QEMU ==================\n"); - DPRINT("Request: %s (%d)\n", vubr_request_str[vmsg->request], - vmsg->request); - DPRINT("Flags: 0x%x\n", vmsg->flags); - DPRINT("Size: %d\n", vmsg->size); - - if (vmsg->fd_num) { - int i; - DPRINT("Fds:"); - for (i = 0; i < vmsg->fd_num; i++) { - DPRINT(" %d", vmsg->fds[i]); - } - DPRINT("\n"); - } - - switch (vmsg->request) { - case VHOST_USER_NONE: - return vubr_none_exec(dev, vmsg); - case VHOST_USER_GET_FEATURES: - return vubr_get_features_exec(dev, vmsg); - case VHOST_USER_SET_FEATURES: - return vubr_set_features_exec(dev, vmsg); - case VHOST_USER_SET_OWNER: - return vubr_set_owner_exec(dev, vmsg); - case VHOST_USER_RESET_OWNER: - return vubr_reset_device_exec(dev, vmsg); - case VHOST_USER_SET_MEM_TABLE: - return vubr_set_mem_table_exec(dev, vmsg); - case VHOST_USER_SET_LOG_BASE: - return vubr_set_log_base_exec(dev, vmsg); - case VHOST_USER_SET_LOG_FD: - return vubr_set_log_fd_exec(dev, vmsg); - case VHOST_USER_SET_VRING_NUM: - return vubr_set_vring_num_exec(dev, vmsg); - case VHOST_USER_SET_VRING_ADDR: - return vubr_set_vring_addr_exec(dev, vmsg); - case VHOST_USER_SET_VRING_BASE: - return vubr_set_vring_base_exec(dev, vmsg); - case VHOST_USER_GET_VRING_BASE: - return vubr_get_vring_base_exec(dev, vmsg); - case VHOST_USER_SET_VRING_KICK: - return vubr_set_vring_kick_exec(dev, vmsg); - case VHOST_USER_SET_VRING_CALL: - return vubr_set_vring_call_exec(dev, vmsg); - case VHOST_USER_SET_VRING_ERR: - return vubr_set_vring_err_exec(dev, vmsg); - case VHOST_USER_GET_PROTOCOL_FEATURES: - return vubr_get_protocol_features_exec(dev, vmsg); - case VHOST_USER_SET_PROTOCOL_FEATURES: - return vubr_set_protocol_features_exec(dev, vmsg); - case VHOST_USER_GET_QUEUE_NUM: - return vubr_get_queue_num_exec(dev, vmsg); - case VHOST_USER_SET_VRING_ENABLE: - return vubr_set_vring_enable_exec(dev, vmsg); - case VHOST_USER_SEND_RARP: - return vubr_send_rarp_exec(dev, vmsg); - - case VHOST_USER_MAX: - assert(vmsg->request != VHOST_USER_MAX); - } - return 0; -} - -static void -vubr_receive_cb(int sock, void *ctx) -{ - VubrDev *dev = (VubrDev *) ctx; - VhostUserMsg vmsg; - int reply_requested; - - vubr_message_read(sock, &vmsg); - reply_requested = vubr_execute_request(dev, &vmsg); - if (reply_requested) { - /* Set the version in the flags when sending the reply */ - vmsg.flags &= ~VHOST_USER_VERSION_MASK; - vmsg.flags |= VHOST_USER_VERSION; - vmsg.flags |= VHOST_USER_REPLY_MASK; - vubr_message_write(sock, &vmsg); - } -} +static const VuDevIface vuiface = { + .get_features = vubr_get_features, + .set_features = vubr_set_features, + .process_msg = vubr_process_msg, + .queue_set_started = vubr_queue_set_started, +}; static void vubr_accept_cb(int sock, void *ctx) @@ -1204,36 +479,26 @@ vubr_accept_cb(int sock, void *ctx) vubr_die("accept()"); } DPRINT("Got connection from remote peer on sock %d\n", conn_fd); + + vu_init(&dev->vudev, + conn_fd, + vubr_panic, + vubr_set_watch, + vubr_remove_watch, + &vuiface); + dispatcher_add(&dev->dispatcher, conn_fd, ctx, vubr_receive_cb); + dispatcher_remove(&dev->dispatcher, sock); } static VubrDev * vubr_new(const char *path, bool client) { VubrDev *dev = (VubrDev *) calloc(1, sizeof(VubrDev)); - dev->nregions = 0; - int i; struct sockaddr_un un; CallbackFunc cb; size_t len; - for (i = 0; i < MAX_NR_VIRTQUEUE; i++) { - dev->vq[i] = (VubrVirtq) { - .call_fd = -1, .kick_fd = -1, - .size = 0, - .last_avail_index = 0, .last_used_index = 0, - .desc = 0, .avail = 0, .used = 0, - .enable = 0, - }; - } - - /* Init log */ - dev->log_call_fd = -1; - dev->log_size = 0; - dev->log_table = 0; - dev->ready = 0; - dev->features = 0; - /* Get a UNIX socket. */ dev->sock = socket(AF_UNIX, SOCK_STREAM, 0); if (dev->sock == -1) { @@ -1261,10 +526,17 @@ vubr_new(const char *path, bool client) if (connect(dev->sock, (struct sockaddr *)&un, len) == -1) { vubr_die("connect"); } + vu_init(&dev->vudev, + dev->sock, + vubr_panic, + vubr_set_watch, + vubr_remove_watch, + &vuiface); cb = vubr_receive_cb; } dispatcher_init(&dev->dispatcher); + dispatcher_add(&dev->dispatcher, dev->sock, (void *)dev, cb); return dev; @@ -1345,7 +617,7 @@ vubr_backend_udp_setup(VubrDev *dev, static void vubr_run(VubrDev *dev) { - while (1) { + while (!dev->quit) { /* timeout 200ms */ dispatcher_wait(&dev->dispatcher, 200000); /* Here one can try polling strategy. */ @@ -1421,6 +693,9 @@ main(int argc, char *argv[]) vubr_backend_udp_setup(dev, lhost, lport, rhost, rport); vubr_run(dev); + + vu_deinit(&dev->vudev); + return 0; out: diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c index 737bffa984..a5d2f6c0c3 100644 --- a/util/qemu-coroutine.c +++ b/util/qemu-coroutine.c @@ -131,6 +131,13 @@ void qemu_coroutine_enter(Coroutine *co) } } +void qemu_coroutine_enter_if_inactive(Coroutine *co) +{ + if (!qemu_coroutine_entered(co)) { + qemu_coroutine_enter(co); + } +} + void coroutine_fn qemu_coroutine_yield(void) { Coroutine *self = qemu_coroutine_self(); |