-rw-r--r--  MAINTAINERS  11
-rw-r--r--  Makefile  16
-rw-r--r--  block/backup-top.c  6
-rw-r--r--  block/backup.c  38
-rw-r--r--  block/block-copy.c  405
-rw-r--r--  block/crypto.c  62
-rw-r--r--  block/curl.c  32
-rw-r--r--  block/qcow2-threads.c  12
-rw-r--r--  block/qcow2.c  75
-rw-r--r--  block/trace-events  1
-rw-r--r--  blockjob.c  16
-rwxr-xr-x  configure  5
-rw-r--r--  crypto/block.c  36
-rw-r--r--  docs/can.txt  2
-rw-r--r--  docs/devel/atomics.txt  2
-rw-r--r--  docs/devel/kconfig.rst  2
-rw-r--r--  docs/devel/loads-stores.rst  2
-rw-r--r--  docs/devel/multi-thread-tcg.txt  8
-rw-r--r--  docs/devel/tcg.rst  2
-rw-r--r--  docs/index.html.in  10
-rw-r--r--  docs/index.rst  8
-rw-r--r--  docs/qemu-option-trace.rst.inc  4
-rw-r--r--  docs/replay.txt  2
-rw-r--r--  docs/specs/fw_cfg.txt  2
-rw-r--r--  docs/specs/tpm.rst  6
-rw-r--r--  docs/sphinx/hxtool.py  28
-rw-r--r--  docs/system/arm/cpu-features.rst (renamed from docs/arm-cpu-features.rst)  12
-rw-r--r--  docs/system/arm/integratorcp.rst  16
-rw-r--r--  docs/system/arm/musicpal.rst  19
-rw-r--r--  docs/system/arm/nseries.rst  33
-rw-r--r--  docs/system/arm/palm.rst  23
-rw-r--r--  docs/system/arm/realview.rst  34
-rw-r--r--  docs/system/arm/stellaris.rst  26
-rw-r--r--  docs/system/arm/sx1.rst  18
-rw-r--r--  docs/system/arm/versatile.rst  29
-rw-r--r--  docs/system/arm/xscale.rst  29
-rw-r--r--  docs/system/target-arm.rst  295
-rw-r--r--  docs/user/main.rst  8
-rw-r--r--  hmp-commands-info.hx  8
-rw-r--r--  hmp-commands.hx  8
-rw-r--r--  hw/9pfs/9p-proxy.c  4
-rw-r--r--  hw/misc/mac_via.c  7
-rw-r--r--  include/block/aio.h  71
-rw-r--r--  include/block/block-copy.h  65
-rw-r--r--  include/crypto/block.h  22
-rw-r--r--  include/qemu/job.h  11
-rw-r--r--  include/qemu/progress_meter.h  58
-rw-r--r--  include/qemu/queue.h  19
-rw-r--r--  job-qmp.c  4
-rw-r--r--  job.c  6
-rw-r--r--  qemu-img.c  14
-rw-r--r--  qemu-options.hx  8
-rw-r--r--  scripts/hxtool  78
-rwxr-xr-x  tests/qemu-iotests/178  2
-rw-r--r--  tests/qemu-iotests/178.out.qcow2  8
-rw-r--r--  tests/qemu-iotests/178.out.raw  8
-rwxr-xr-x  tests/qemu-iotests/288  93
-rw-r--r--  tests/qemu-iotests/288.out  30
-rw-r--r--  tests/qemu-iotests/common.rc  22
-rw-r--r--  tests/qemu-iotests/group  1
-rw-r--r--  tests/qtest/Makefile.include  3
-rw-r--r--  util/Makefile.objs  3
-rw-r--r--  util/aio-posix.c  451
-rw-r--r--  util/aio-posix.h  81
-rw-r--r--  util/fdmon-epoll.c  155
-rw-r--r--  util/fdmon-io_uring.c  332
-rw-r--r--  util/fdmon-poll.c  107
-rw-r--r--  util/trace-events  2
68 files changed, 2042 insertions, 974 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index d881ba7d9c..2903cbe564 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -155,6 +155,7 @@ F: include/hw/cpu/a*mpcore.h
F: disas/arm.c
F: disas/arm-a64.cc
F: disas/libvixl/
+F: docs/system/target-arm.rst
ARM SMMU
M: Eric Auger <eric.auger@redhat.com>
@@ -615,6 +616,7 @@ F: hw/arm/integratorcp.c
F: hw/misc/arm_integrator_debug.c
F: include/hw/misc/arm_integrator_debug.h
F: tests/acceptance/machine_arm_integratorcp.py
+F: docs/system/arm/integratorcp.rst
MCIMX6UL EVK / i.MX6ul
M: Peter Maydell <peter.maydell@linaro.org>
@@ -673,6 +675,7 @@ M: Peter Maydell <peter.maydell@linaro.org>
L: qemu-arm@nongnu.org
S: Odd Fixes
F: hw/arm/musicpal.c
+F: docs/system/arm/musicpal.rst
nSeries
M: Andrzej Zaborowski <balrogg@gmail.com>
@@ -689,6 +692,7 @@ F: include/hw/display/blizzard.h
F: include/hw/input/tsc2xxx.h
F: include/hw/misc/cbus.h
F: tests/acceptance/machine_arm_n8x0.py
+F: docs/system/arm/nseries.rst
Palm
M: Andrzej Zaborowski <balrogg@gmail.com>
@@ -698,6 +702,7 @@ S: Odd Fixes
F: hw/arm/palm.c
F: hw/input/tsc210x.c
F: include/hw/input/tsc2xxx.h
+F: docs/system/arm/palm.rst
Raspberry Pi
M: Peter Maydell <peter.maydell@linaro.org>
@@ -719,6 +724,7 @@ F: hw/arm/realview*
F: hw/cpu/realview_mpcore.c
F: hw/intc/realview_gic.c
F: include/hw/intc/realview_gic.h
+F: docs/system/arm/realview.rst
PXA2XX
M: Andrzej Zaborowski <balrogg@gmail.com>
@@ -738,6 +744,7 @@ F: hw/misc/max111x.c
F: include/hw/arm/pxa.h
F: include/hw/arm/sharpsl.h
F: include/hw/display/tc6393xb.h
+F: docs/system/arm/xscale.rst
SABRELITE / i.MX6
M: Peter Maydell <peter.maydell@linaro.org>
@@ -773,6 +780,7 @@ L: qemu-arm@nongnu.org
S: Maintained
F: hw/*/stellaris*
F: include/hw/input/gamepad.h
+F: docs/system/arm/stellaris.rst
Versatile Express
M: Peter Maydell <peter.maydell@linaro.org>
@@ -786,6 +794,7 @@ L: qemu-arm@nongnu.org
S: Maintained
F: hw/*/versatile*
F: hw/misc/arm_sysctl.c
+F: docs/system/arm/versatile.rst
Virt
M: Peter Maydell <peter.maydell@linaro.org>
@@ -1885,6 +1894,8 @@ L: qemu-block@nongnu.org
S: Supported
F: util/async.c
F: util/aio-*.c
+F: util/aio-*.h
+F: util/fdmon-*.c
F: block/io.c
F: migration/block*
F: include/block/aio.h
diff --git a/Makefile b/Makefile
index 37aed4a244..7df22fcc5d 100644
--- a/Makefile
+++ b/Makefile
@@ -795,7 +795,7 @@ rm -f $(MANUAL_BUILDDIR)/$1/objects.inv $(MANUAL_BUILDDIR)/$1/searchindex.js $(M
endef
distclean: clean
- rm -f config-host.mak config-host.h* config-host.ld $(DOCS) qemu-options.texi qemu-monitor.texi qemu-monitor-info.texi
+ rm -f config-host.mak config-host.h* config-host.ld $(DOCS)
rm -f tests/tcg/config-*.mak
rm -f config-all-devices.mak config-all-disas.mak config.status
rm -f $(SUBDIR_DEVICES_MAK)
@@ -1078,9 +1078,10 @@ sphinxdocs: $(MANUAL_BUILDDIR)/devel/index.html \
# a single doctree: https://github.com/sphinx-doc/sphinx/issues/2946
build-manual = $(call quiet-command,CONFDIR="$(qemu_confdir)" $(SPHINX_BUILD) $(if $(V),,-q) -W -b $2 -D version=$(VERSION) -D release="$(FULL_VERSION)" -d .doctrees/$1-$2 $(SRC_PATH)/docs/$1 $(MANUAL_BUILDDIR)/$1 ,"SPHINX","$(MANUAL_BUILDDIR)/$1")
# We assume all RST files in the manual's directory are used in it
-manual-deps = $(wildcard $(SRC_PATH)/docs/$1/*.rst) \
+manual-deps = $(wildcard $(SRC_PATH)/docs/$1/*.rst $(SRC_PATH)/docs/$1/*/*.rst) \
$(SRC_PATH)/docs/defs.rst.inc \
- $(SRC_PATH)/docs/$1/conf.py $(SRC_PATH)/docs/conf.py
+ $(SRC_PATH)/docs/$1/conf.py $(SRC_PATH)/docs/conf.py \
+ $(SRC_PATH)/docs/sphinx/*.py
# Macro to write out the rule and dependencies for building manpages
# Usage: $(call define-manpage-rule,manualname,manpage1 manpage2...[,extradeps])
# 'extradeps' is optional, and specifies extra files (eg .hx files) that
@@ -1122,15 +1123,6 @@ $(MANUAL_BUILDDIR)/index.html: $(SRC_PATH)/docs/index.html.in qemu-version.h
$(call quiet-command, sed "s|@@VERSION@@|${VERSION}|g" $< >$@, \
"GEN","$@")
-qemu-options.texi: $(SRC_PATH)/qemu-options.hx $(SRC_PATH)/scripts/hxtool
- $(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"GEN","$@")
-
-qemu-monitor.texi: $(SRC_PATH)/hmp-commands.hx $(SRC_PATH)/scripts/hxtool
- $(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"GEN","$@")
-
-qemu-monitor-info.texi: $(SRC_PATH)/hmp-commands-info.hx $(SRC_PATH)/scripts/hxtool
- $(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@,"GEN","$@")
-
docs/interop/qemu-qmp-qapi.texi: qapi/qapi-doc.texi
@cp -p $< $@
diff --git a/block/backup-top.c b/block/backup-top.c
index 1bfb360bd3..3b50c06e2c 100644
--- a/block/backup-top.c
+++ b/block/backup-top.c
@@ -38,6 +38,7 @@ typedef struct BDRVBackupTopState {
BlockCopyState *bcs;
BdrvChild *target;
bool active;
+ int64_t cluster_size;
} BDRVBackupTopState;
static coroutine_fn int backup_top_co_preadv(
@@ -57,8 +58,8 @@ static coroutine_fn int backup_top_cbw(BlockDriverState *bs, uint64_t offset,
return 0;
}
- off = QEMU_ALIGN_DOWN(offset, s->bcs->cluster_size);
- end = QEMU_ALIGN_UP(offset + bytes, s->bcs->cluster_size);
+ off = QEMU_ALIGN_DOWN(offset, s->cluster_size);
+ end = QEMU_ALIGN_UP(offset + bytes, s->cluster_size);
return block_copy(s->bcs, off, end - off, NULL);
}
@@ -238,6 +239,7 @@ BlockDriverState *bdrv_backup_top_append(BlockDriverState *source,
goto fail;
}
+ state->cluster_size = cluster_size;
state->bcs = block_copy_state_new(top->backing, state->target,
cluster_size, write_flags, &local_err);
if (local_err) {
diff --git a/block/backup.c b/block/backup.c
index 1383e219f5..7430ca5883 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -57,15 +57,6 @@ static void backup_progress_bytes_callback(int64_t bytes, void *opaque)
BackupBlockJob *s = opaque;
s->bytes_read += bytes;
- job_progress_update(&s->common.job, bytes);
-}
-
-static void backup_progress_reset_callback(void *opaque)
-{
- BackupBlockJob *s = opaque;
- uint64_t estimate = bdrv_get_dirty_count(s->bcs->copy_bitmap);
-
- job_progress_set_remaining(&s->common.job, estimate);
}
static int coroutine_fn backup_do_cow(BackupBlockJob *job,
@@ -111,7 +102,7 @@ static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
if (ret < 0 && job->bitmap_mode == BITMAP_SYNC_MODE_ALWAYS) {
/* If we failed and synced, merge in the bits we didn't copy: */
- bdrv_dirty_bitmap_merge_internal(bm, job->bcs->copy_bitmap,
+ bdrv_dirty_bitmap_merge_internal(bm, block_copy_dirty_bitmap(job->bcs),
NULL, true);
}
}
@@ -154,7 +145,8 @@ void backup_do_checkpoint(BlockJob *job, Error **errp)
return;
}
- bdrv_set_dirty_bitmap(backup_job->bcs->copy_bitmap, 0, backup_job->len);
+ bdrv_set_dirty_bitmap(block_copy_dirty_bitmap(backup_job->bcs), 0,
+ backup_job->len);
}
static BlockErrorAction backup_error_action(BackupBlockJob *job,
@@ -199,7 +191,7 @@ static int coroutine_fn backup_loop(BackupBlockJob *job)
BdrvDirtyBitmapIter *bdbi;
int ret = 0;
- bdbi = bdrv_dirty_iter_new(job->bcs->copy_bitmap);
+ bdbi = bdrv_dirty_iter_new(block_copy_dirty_bitmap(job->bcs));
while ((offset = bdrv_dirty_iter_next(bdbi)) != -1) {
do {
if (yield_and_check(job)) {
@@ -219,14 +211,14 @@ static int coroutine_fn backup_loop(BackupBlockJob *job)
return ret;
}
-static void backup_init_copy_bitmap(BackupBlockJob *job)
+static void backup_init_bcs_bitmap(BackupBlockJob *job)
{
bool ret;
uint64_t estimate;
+ BdrvDirtyBitmap *bcs_bitmap = block_copy_dirty_bitmap(job->bcs);
if (job->sync_mode == MIRROR_SYNC_MODE_BITMAP) {
- ret = bdrv_dirty_bitmap_merge_internal(job->bcs->copy_bitmap,
- job->sync_bitmap,
+ ret = bdrv_dirty_bitmap_merge_internal(bcs_bitmap, job->sync_bitmap,
NULL, true);
assert(ret);
} else {
@@ -235,12 +227,12 @@ static void backup_init_copy_bitmap(BackupBlockJob *job)
* We can't hog the coroutine to initialize this thoroughly.
* Set a flag and resume work when we are able to yield safely.
*/
- job->bcs->skip_unallocated = true;
+ block_copy_set_skip_unallocated(job->bcs, true);
}
- bdrv_set_dirty_bitmap(job->bcs->copy_bitmap, 0, job->len);
+ bdrv_set_dirty_bitmap(bcs_bitmap, 0, job->len);
}
- estimate = bdrv_get_dirty_count(job->bcs->copy_bitmap);
+ estimate = bdrv_get_dirty_count(bcs_bitmap);
job_progress_set_remaining(&job->common.job, estimate);
}
@@ -249,7 +241,7 @@ static int coroutine_fn backup_run(Job *job, Error **errp)
BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
int ret = 0;
- backup_init_copy_bitmap(s);
+ backup_init_bcs_bitmap(s);
if (s->sync_mode == MIRROR_SYNC_MODE_TOP) {
int64_t offset = 0;
@@ -268,12 +260,12 @@ static int coroutine_fn backup_run(Job *job, Error **errp)
offset += count;
}
- s->bcs->skip_unallocated = false;
+ block_copy_set_skip_unallocated(s->bcs, false);
}
if (s->sync_mode == MIRROR_SYNC_MODE_NONE) {
/*
- * All bits are set in copy_bitmap to allow any cluster to be copied.
+ * All bits are set in bcs bitmap to allow any cluster to be copied.
* This does not actually require them to be copied.
*/
while (!job_is_cancelled(job)) {
@@ -464,8 +456,8 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
job->cluster_size = cluster_size;
job->len = len;
- block_copy_set_callbacks(bcs, backup_progress_bytes_callback,
- backup_progress_reset_callback, job);
+ block_copy_set_progress_callback(bcs, backup_progress_bytes_callback, job);
+ block_copy_set_progress_meter(bcs, &job->common.job.progress);
/* Required permissions are already taken by backup-top target */
block_job_add_bdrv(&job->common, "target", target, 0, BLK_PERM_ALL,
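The hunk above replaces the old reset callback with a shared ProgressMeter, so only the per-chunk byte callback stays job-specific. Below is a minimal sketch of the new wiring, using only the calls visible in this patch; the ProgressBytesCallbackFunc typedef lives in include/block/block-copy.h, whose hunks are not shown in this excerpt, so its exact shape is an assumption and wire_progress() is an illustrative name.

    /* Sketch only: assumes the callback keeps the (int64_t bytes, void *opaque)
     * shape used by backup_progress_bytes_callback() above. */
    static void bytes_cb(int64_t bytes, void *opaque)
    {
        BackupBlockJob *s = opaque;

        s->bytes_read += bytes;   /* job-local accounting, e.g. for rate limiting */
    }

    static void wire_progress(BackupBlockJob *job, BlockCopyState *bcs)
    {
        /* job-specific accounting is still a callback ... */
        block_copy_set_progress_callback(bcs, bytes_cb, job);
        /* ... but current/total progress is now driven by block-copy itself */
        block_copy_set_progress_meter(bcs, &job->common.job.progress);
    }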
diff --git a/block/block-copy.c b/block/block-copy.c
index 79798a1567..05227e18bf 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -24,37 +24,136 @@
#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
#define BLOCK_COPY_MAX_MEM (128 * MiB)
-static void coroutine_fn block_copy_wait_inflight_reqs(BlockCopyState *s,
- int64_t start,
- int64_t end)
+typedef struct BlockCopyInFlightReq {
+ int64_t offset;
+ int64_t bytes;
+ QLIST_ENTRY(BlockCopyInFlightReq) list;
+ CoQueue wait_queue; /* coroutines blocked on this request */
+} BlockCopyInFlightReq;
+
+typedef struct BlockCopyState {
+ /*
+ * BdrvChild objects are not owned or managed by block-copy. They are
+ * provided by the block-copy user, who is responsible for appropriate
+ * permissions on these children.
+ */
+ BdrvChild *source;
+ BdrvChild *target;
+ BdrvDirtyBitmap *copy_bitmap;
+ int64_t in_flight_bytes;
+ int64_t cluster_size;
+ bool use_copy_range;
+ int64_t copy_size;
+ uint64_t len;
+ QLIST_HEAD(, BlockCopyInFlightReq) inflight_reqs;
+
+ BdrvRequestFlags write_flags;
+
+ /*
+ * skip_unallocated:
+ *
+ * Used by sync=top jobs, which first scan the source node for unallocated
+ * areas and clear them in the copy_bitmap. During this process, the bitmap
+ * is thus not fully initialized: It may still have bits set for areas that
+ * are unallocated and should actually not be copied.
+ *
+ * This is indicated by skip_unallocated.
+ *
+ * In this case, block_copy() will query the source’s allocation status,
+ * skip unallocated regions, clear them in the copy_bitmap, and invoke
+ * block_copy_reset_unallocated() every time it does.
+ */
+ bool skip_unallocated;
+
+ ProgressMeter *progress;
+ /* progress_bytes_callback: called when some copying progress is done. */
+ ProgressBytesCallbackFunc progress_bytes_callback;
+ void *progress_opaque;
+
+ SharedResource *mem;
+} BlockCopyState;
+
+static BlockCopyInFlightReq *find_conflicting_inflight_req(BlockCopyState *s,
+ int64_t offset,
+ int64_t bytes)
{
BlockCopyInFlightReq *req;
- bool waited;
- do {
- waited = false;
- QLIST_FOREACH(req, &s->inflight_reqs, list) {
- if (end > req->start_byte && start < req->end_byte) {
- qemu_co_queue_wait(&req->wait_queue, NULL);
- waited = true;
- break;
- }
+ QLIST_FOREACH(req, &s->inflight_reqs, list) {
+ if (offset + bytes > req->offset && offset < req->offset + req->bytes) {
+ return req;
}
- } while (waited);
+ }
+
+ return NULL;
+}
+
+/*
+ * If there are no intersecting requests return false. Otherwise, wait for the
+ * first found intersecting request to finish and return true.
+ */
+static bool coroutine_fn block_copy_wait_one(BlockCopyState *s, int64_t offset,
+ int64_t bytes)
+{
+ BlockCopyInFlightReq *req = find_conflicting_inflight_req(s, offset, bytes);
+
+ if (!req) {
+ return false;
+ }
+
+ qemu_co_queue_wait(&req->wait_queue, NULL);
+
+ return true;
}
+/* Called only on full-dirty region */
static void block_copy_inflight_req_begin(BlockCopyState *s,
BlockCopyInFlightReq *req,
- int64_t start, int64_t end)
+ int64_t offset, int64_t bytes)
{
- req->start_byte = start;
- req->end_byte = end;
+ assert(!find_conflicting_inflight_req(s, offset, bytes));
+
+ bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
+ s->in_flight_bytes += bytes;
+
+ req->offset = offset;
+ req->bytes = bytes;
qemu_co_queue_init(&req->wait_queue);
QLIST_INSERT_HEAD(&s->inflight_reqs, req, list);
}
-static void coroutine_fn block_copy_inflight_req_end(BlockCopyInFlightReq *req)
+/*
+ * block_copy_inflight_req_shrink
+ *
+ * Drop the tail of the request to be handled later. Set the dirty bits back
+ * and wake up all requests waiting for us (some of them may no longer
+ * intersect with the shrunk request).
+ */
+static void coroutine_fn block_copy_inflight_req_shrink(BlockCopyState *s,
+ BlockCopyInFlightReq *req, int64_t new_bytes)
+{
+ if (new_bytes == req->bytes) {
+ return;
+ }
+
+ assert(new_bytes > 0 && new_bytes < req->bytes);
+
+ s->in_flight_bytes -= req->bytes - new_bytes;
+ bdrv_set_dirty_bitmap(s->copy_bitmap,
+ req->offset + new_bytes, req->bytes - new_bytes);
+
+ req->bytes = new_bytes;
+ qemu_co_queue_restart_all(&req->wait_queue);
+}
+
+static void coroutine_fn block_copy_inflight_req_end(BlockCopyState *s,
+ BlockCopyInFlightReq *req,
+ int ret)
{
+ s->in_flight_bytes -= req->bytes;
+ if (ret < 0) {
+ bdrv_set_dirty_bitmap(s->copy_bitmap, req->offset, req->bytes);
+ }
QLIST_REMOVE(req, list);
qemu_co_queue_restart_all(&req->wait_queue);
}
@@ -70,16 +169,19 @@ void block_copy_state_free(BlockCopyState *s)
g_free(s);
}
+static uint32_t block_copy_max_transfer(BdrvChild *source, BdrvChild *target)
+{
+ return MIN_NON_ZERO(INT_MAX,
+ MIN_NON_ZERO(source->bs->bl.max_transfer,
+ target->bs->bl.max_transfer));
+}
+
BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
int64_t cluster_size,
BdrvRequestFlags write_flags, Error **errp)
{
BlockCopyState *s;
BdrvDirtyBitmap *copy_bitmap;
- uint32_t max_transfer =
- MIN_NON_ZERO(INT_MAX,
- MIN_NON_ZERO(source->bs->bl.max_transfer,
- target->bs->bl.max_transfer));
copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
errp);
@@ -99,7 +201,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
.mem = shres_create(BLOCK_COPY_MAX_MEM),
};
- if (max_transfer < cluster_size) {
+ if (block_copy_max_transfer(source, target) < cluster_size) {
/*
* copy_range does not respect max_transfer. We don't want to bother
* with requests smaller than block-copy cluster size, so fallback to
@@ -114,12 +216,11 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
s->copy_size = cluster_size;
} else {
/*
- * copy_range does not respect max_transfer (it's a TODO), so we factor
- * that in here.
+ * We enable copy-range, but keep a small copy_size until the first
+ * successful copy_range (see block_copy_do_copy).
*/
s->use_copy_range = true;
- s->copy_size = MIN(MAX(cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
- QEMU_ALIGN_DOWN(max_transfer, cluster_size));
+ s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
}
QLIST_INIT(&s->inflight_reqs);
@@ -127,48 +228,83 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
return s;
}
-void block_copy_set_callbacks(
+void block_copy_set_progress_callback(
BlockCopyState *s,
ProgressBytesCallbackFunc progress_bytes_callback,
- ProgressResetCallbackFunc progress_reset_callback,
void *progress_opaque)
{
s->progress_bytes_callback = progress_bytes_callback;
- s->progress_reset_callback = progress_reset_callback;
s->progress_opaque = progress_opaque;
}
+void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm)
+{
+ s->progress = pm;
+}
+
/*
* block_copy_do_copy
*
- * Do copy of cluser-aligned chunk. @end is allowed to exceed s->len only to
- * cover last cluster when s->len is not aligned to clusters.
+ * Copy a cluster-aligned chunk. The requested region may exceed s->len
+ * only to cover the last cluster when s->len is not aligned to clusters.
*
* No sync here: nor bitmap neighter intersecting requests handling, only copy.
*
* Returns 0 on success.
*/
static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
- int64_t start, int64_t end,
- bool *error_is_read)
+ int64_t offset, int64_t bytes,
+ bool zeroes, bool *error_is_read)
{
int ret;
- int nbytes = MIN(end, s->len) - start;
+ int64_t nbytes = MIN(offset + bytes, s->len) - offset;
void *bounce_buffer = NULL;
- assert(QEMU_IS_ALIGNED(start, s->cluster_size));
- assert(QEMU_IS_ALIGNED(end, s->cluster_size));
- assert(end < s->len || end == QEMU_ALIGN_UP(s->len, s->cluster_size));
+ assert(offset >= 0 && bytes > 0 && INT64_MAX - offset >= bytes);
+ assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
+ assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));
+ assert(offset < s->len);
+ assert(offset + bytes <= s->len ||
+ offset + bytes == QEMU_ALIGN_UP(s->len, s->cluster_size));
+ assert(nbytes < INT_MAX);
+
+ if (zeroes) {
+ ret = bdrv_co_pwrite_zeroes(s->target, offset, nbytes, s->write_flags &
+ ~BDRV_REQ_WRITE_COMPRESSED);
+ if (ret < 0) {
+ trace_block_copy_write_zeroes_fail(s, offset, ret);
+ if (error_is_read) {
+ *error_is_read = false;
+ }
+ }
+ return ret;
+ }
if (s->use_copy_range) {
- ret = bdrv_co_copy_range(s->source, start, s->target, start, nbytes,
+ ret = bdrv_co_copy_range(s->source, offset, s->target, offset, nbytes,
0, s->write_flags);
if (ret < 0) {
- trace_block_copy_copy_range_fail(s, start, ret);
+ trace_block_copy_copy_range_fail(s, offset, ret);
s->use_copy_range = false;
s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
/* Fallback to read+write with allocated buffer */
} else {
+ if (s->use_copy_range) {
+ /*
+ * Successful copy-range. Now increase copy_size. copy_range
+ * does not respect max_transfer (it's a TODO), so we factor
+ * that in here.
+ *
+ * Note: we double-check s->use_copy_range for the case when
+ * parallel block-copy request unsets it during previous
+ * bdrv_co_copy_range call.
+ */
+ s->copy_size =
+ MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
+ QEMU_ALIGN_DOWN(block_copy_max_transfer(s->source,
+ s->target),
+ s->cluster_size));
+ }
goto out;
}
}
@@ -176,24 +312,27 @@ static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
/*
* In case of failed copy_range request above, we may proceed with buffered
* request larger than BLOCK_COPY_MAX_BUFFER. Still, further requests will
- * be properly limited, so don't care too much.
+ * be properly limited, so don't care too much. Moreover, the most likely
+ * case (copy_range is unsupported for the configuration, so the very first
+ * copy_range request fails) is handled by setting a large copy_size only
+ * after the first successful copy_range.
*/
bounce_buffer = qemu_blockalign(s->source->bs, nbytes);
- ret = bdrv_co_pread(s->source, start, nbytes, bounce_buffer, 0);
+ ret = bdrv_co_pread(s->source, offset, nbytes, bounce_buffer, 0);
if (ret < 0) {
- trace_block_copy_read_fail(s, start, ret);
+ trace_block_copy_read_fail(s, offset, ret);
if (error_is_read) {
*error_is_read = true;
}
goto out;
}
- ret = bdrv_co_pwrite(s->target, start, nbytes, bounce_buffer,
+ ret = bdrv_co_pwrite(s->target, offset, nbytes, bounce_buffer,
s->write_flags);
if (ret < 0) {
- trace_block_copy_write_fail(s, start, ret);
+ trace_block_copy_write_fail(s, offset, ret);
if (error_is_read) {
*error_is_read = false;
}
@@ -206,6 +345,38 @@ out:
return ret;
}
+static int block_copy_block_status(BlockCopyState *s, int64_t offset,
+ int64_t bytes, int64_t *pnum)
+{
+ int64_t num;
+ BlockDriverState *base;
+ int ret;
+
+ if (s->skip_unallocated && s->source->bs->backing) {
+ base = s->source->bs->backing->bs;
+ } else {
+ base = NULL;
+ }
+
+ ret = bdrv_block_status_above(s->source->bs, base, offset, bytes, &num,
+ NULL, NULL);
+ if (ret < 0 || num < s->cluster_size) {
+ /*
+ * On error, or if we failed to obtain a large enough chunk, just fall
+ * back to copying one cluster.
+ */
+ num = s->cluster_size;
+ ret = BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_DATA;
+ } else if (offset + num == s->len) {
+ num = QEMU_ALIGN_UP(num, s->cluster_size);
+ } else {
+ num = QEMU_ALIGN_DOWN(num, s->cluster_size);
+ }
+
+ *pnum = num;
+ return ret;
+}
+
/*
* Check if the cluster starting at offset is allocated or not.
* return via pnum the number of contiguous clusters sharing this allocation.
@@ -269,21 +440,28 @@ int64_t block_copy_reset_unallocated(BlockCopyState *s,
if (!ret) {
bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
- s->progress_reset_callback(s->progress_opaque);
+ progress_set_remaining(s->progress,
+ bdrv_get_dirty_count(s->copy_bitmap) +
+ s->in_flight_bytes);
}
*count = bytes;
return ret;
}
-int coroutine_fn block_copy(BlockCopyState *s,
- int64_t start, uint64_t bytes,
- bool *error_is_read)
+/*
+ * block_copy_dirty_clusters
+ *
+ * Copy dirty clusters in @offset/@bytes range.
+ * Returns 1 if dirty clusters were found and successfully copied, 0 if no
+ * dirty clusters were found, and -errno on failure.
+ */
+static int coroutine_fn block_copy_dirty_clusters(BlockCopyState *s,
+ int64_t offset, int64_t bytes,
+ bool *error_is_read)
{
int ret = 0;
- int64_t end = bytes + start; /* bytes */
- int64_t status_bytes;
- BlockCopyInFlightReq req;
+ bool found_dirty = false;
/*
* block_copy() user is responsible for keeping source and target in same
@@ -292,60 +470,109 @@ int coroutine_fn block_copy(BlockCopyState *s,
assert(bdrv_get_aio_context(s->source->bs) ==
bdrv_get_aio_context(s->target->bs));
- assert(QEMU_IS_ALIGNED(start, s->cluster_size));
- assert(QEMU_IS_ALIGNED(end, s->cluster_size));
-
- block_copy_wait_inflight_reqs(s, start, bytes);
- block_copy_inflight_req_begin(s, &req, start, end);
+ assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
+ assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));
- while (start < end) {
- int64_t next_zero, chunk_end;
+ while (bytes) {
+ BlockCopyInFlightReq req;
+ int64_t next_zero, cur_bytes, status_bytes;
- if (!bdrv_dirty_bitmap_get(s->copy_bitmap, start)) {
- trace_block_copy_skip(s, start);
- start += s->cluster_size;
+ if (!bdrv_dirty_bitmap_get(s->copy_bitmap, offset)) {
+ trace_block_copy_skip(s, offset);
+ offset += s->cluster_size;
+ bytes -= s->cluster_size;
continue; /* already copied */
}
- chunk_end = MIN(end, start + s->copy_size);
+ found_dirty = true;
+
+ cur_bytes = MIN(bytes, s->copy_size);
- next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, start,
- chunk_end - start);
+ next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, offset,
+ cur_bytes);
if (next_zero >= 0) {
- assert(next_zero > start); /* start is dirty */
- assert(next_zero < chunk_end); /* no need to do MIN() */
- chunk_end = next_zero;
+ assert(next_zero > offset); /* offset is dirty */
+ assert(next_zero < offset + cur_bytes); /* no need to do MIN() */
+ cur_bytes = next_zero - offset;
}
-
- if (s->skip_unallocated) {
- ret = block_copy_reset_unallocated(s, start, &status_bytes);
- if (ret == 0) {
- trace_block_copy_skip_range(s, start, status_bytes);
- start += status_bytes;
- continue;
- }
- /* Clamp to known allocated region */
- chunk_end = MIN(chunk_end, start + status_bytes);
+ block_copy_inflight_req_begin(s, &req, offset, cur_bytes);
+
+ ret = block_copy_block_status(s, offset, cur_bytes, &status_bytes);
+ assert(ret >= 0); /* never fail */
+ cur_bytes = MIN(cur_bytes, status_bytes);
+ block_copy_inflight_req_shrink(s, &req, cur_bytes);
+ if (s->skip_unallocated && !(ret & BDRV_BLOCK_ALLOCATED)) {
+ block_copy_inflight_req_end(s, &req, 0);
+ progress_set_remaining(s->progress,
+ bdrv_get_dirty_count(s->copy_bitmap) +
+ s->in_flight_bytes);
+ trace_block_copy_skip_range(s, offset, status_bytes);
+ offset += status_bytes;
+ bytes -= status_bytes;
+ continue;
}
- trace_block_copy_process(s, start);
-
- bdrv_reset_dirty_bitmap(s->copy_bitmap, start, chunk_end - start);
+ trace_block_copy_process(s, offset);
- co_get_from_shres(s->mem, chunk_end - start);
- ret = block_copy_do_copy(s, start, chunk_end, error_is_read);
- co_put_to_shres(s->mem, chunk_end - start);
+ co_get_from_shres(s->mem, cur_bytes);
+ ret = block_copy_do_copy(s, offset, cur_bytes, ret & BDRV_BLOCK_ZERO,
+ error_is_read);
+ co_put_to_shres(s->mem, cur_bytes);
+ block_copy_inflight_req_end(s, &req, ret);
if (ret < 0) {
- bdrv_set_dirty_bitmap(s->copy_bitmap, start, chunk_end - start);
- break;
+ return ret;
}
- s->progress_bytes_callback(chunk_end - start, s->progress_opaque);
- start = chunk_end;
- ret = 0;
+ progress_work_done(s->progress, cur_bytes);
+ s->progress_bytes_callback(cur_bytes, s->progress_opaque);
+ offset += cur_bytes;
+ bytes -= cur_bytes;
}
- block_copy_inflight_req_end(&req);
+ return found_dirty;
+}
+
+/*
+ * block_copy
+ *
+ * Copy the requested region according to the dirty bitmap.
+ * Collaborate with parallel block_copy requests: if they succeed, that helps
+ * us; if they fail, we retry the regions that were not copied. So, if we
+ * return an error, it means that some I/O operation failed in the context of
+ * _this_ block_copy call, not in some parallel operation.
+ */
+int coroutine_fn block_copy(BlockCopyState *s, int64_t offset, int64_t bytes,
+ bool *error_is_read)
+{
+ int ret;
+
+ do {
+ ret = block_copy_dirty_clusters(s, offset, bytes, error_is_read);
+
+ if (ret == 0) {
+ ret = block_copy_wait_one(s, offset, bytes);
+ }
+
+ /*
+ * We retry in two cases:
+ * 1. Some progress done
+ * Something was copied, which means that there were yield points
+ * and some new dirty bits may have appeared (due to failed parallel
+ * block-copy requests).
+ * 2. We have waited for some intersecting block-copy request
+ * It may have failed and produced new dirty bits.
+ */
+ } while (ret > 0);
return ret;
}
+
+BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s)
+{
+ return s->copy_bitmap;
+}
+
+void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip)
+{
+ s->skip_unallocated = skip;
+}
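For orientation, here is how a caller is expected to use the reworked block_copy() contract. This is a hedged sketch modelled on backup_top_cbw() from this series (copy_before_write is an illustrative name): any error returned comes from I/O issued by this call, while failures in parallel block-copy requests only re-dirty the bitmap and are retried transparently.

    /* Hedged sketch, modelled on backup_top_cbw(): align the region to the
     * block-copy cluster size, then let block_copy() skip already-copied
     * clusters and wait out intersecting in-flight requests. */
    static coroutine_fn int copy_before_write(BlockCopyState *bcs,
                                              int64_t cluster_size,
                                              uint64_t offset, uint64_t bytes)
    {
        int64_t off = QEMU_ALIGN_DOWN(offset, cluster_size);
        int64_t end = QEMU_ALIGN_UP(offset + bytes, cluster_size);

        return block_copy(bcs, off, end - off, NULL /* error_is_read */);
    }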
diff --git a/block/crypto.c b/block/crypto.c
index 00e8ec537d..4425ebeb47 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -485,6 +485,67 @@ static int64_t block_crypto_getlength(BlockDriverState *bs)
}
+static BlockMeasureInfo *block_crypto_measure(QemuOpts *opts,
+ BlockDriverState *in_bs,
+ Error **errp)
+{
+ g_autoptr(QCryptoBlockCreateOptions) create_opts = NULL;
+ Error *local_err = NULL;
+ BlockMeasureInfo *info;
+ uint64_t size;
+ size_t luks_payload_size;
+ QDict *cryptoopts;
+
+ /*
+ * Preallocation mode doesn't affect size requirements but we must consume
+ * the option.
+ */
+ g_free(qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC));
+
+ size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
+
+ if (in_bs) {
+ int64_t ssize = bdrv_getlength(in_bs);
+
+ if (ssize < 0) {
+ error_setg_errno(&local_err, -ssize,
+ "Unable to get image virtual_size");
+ goto err;
+ }
+
+ size = ssize;
+ }
+
+ cryptoopts = qemu_opts_to_qdict_filtered(opts, NULL,
+ &block_crypto_create_opts_luks, true);
+ qdict_put_str(cryptoopts, "format", "luks");
+ create_opts = block_crypto_create_opts_init(cryptoopts, &local_err);
+ qobject_unref(cryptoopts);
+ if (!create_opts) {
+ goto err;
+ }
+
+ if (!qcrypto_block_calculate_payload_offset(create_opts, NULL,
+ &luks_payload_size,
+ &local_err)) {
+ goto err;
+ }
+
+ /*
+ * Unallocated blocks are still encrypted so allocation status makes no
+ * difference to the file size.
+ */
+ info = g_new(BlockMeasureInfo, 1);
+ info->fully_allocated = luks_payload_size + size;
+ info->required = luks_payload_size + size;
+ return info;
+
+err:
+ error_propagate(errp, local_err);
+ return NULL;
+}
+
+
static int block_crypto_probe_luks(const uint8_t *buf,
int buf_size,
const char *filename) {
@@ -688,6 +749,7 @@ static BlockDriver bdrv_crypto_luks = {
.bdrv_co_preadv = block_crypto_co_preadv,
.bdrv_co_pwritev = block_crypto_co_pwritev,
.bdrv_getlength = block_crypto_getlength,
+ .bdrv_measure = block_crypto_measure,
.bdrv_get_info = block_crypto_get_info_luks,
.bdrv_get_specific_info = block_crypto_get_specific_info_luks,
diff --git a/block/curl.c b/block/curl.c
index f86299378e..6e325901dc 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -214,11 +214,35 @@ static size_t curl_header_cb(void *ptr, size_t size, size_t nmemb, void *opaque)
{
BDRVCURLState *s = opaque;
size_t realsize = size * nmemb;
- const char *accept_line = "Accept-Ranges: bytes";
+ const char *header = (char *)ptr;
+ const char *end = header + realsize;
+ const char *accept_ranges = "accept-ranges:";
+ const char *bytes = "bytes";
- if (realsize >= strlen(accept_line)
- && strncmp((char *)ptr, accept_line, strlen(accept_line)) == 0) {
- s->accept_range = true;
+ if (realsize >= strlen(accept_ranges)
+ && g_ascii_strncasecmp(header, accept_ranges,
+ strlen(accept_ranges)) == 0) {
+
+ char *p = strchr(header, ':') + 1;
+
+ /* Skip whitespace between the header name and value. */
+ while (p < end && *p && g_ascii_isspace(*p)) {
+ p++;
+ }
+
+ if (end - p >= strlen(bytes)
+ && strncmp(p, bytes, strlen(bytes)) == 0) {
+
+ /* Check that there is nothing but whitespace after the value. */
+ p += strlen(bytes);
+ while (p < end && *p && g_ascii_isspace(*p)) {
+ p++;
+ }
+
+ if (p == end || !*p) {
+ s->accept_range = true;
+ }
+ }
}
return realsize;
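To make the relaxed matching concrete, here is a standalone approximation of the new Accept-Ranges check; it is not the QEMU function itself, uses plain libc (strncasecmp/isspace) instead of the GLib helpers above, and accepts_ranges() is an illustrative name. The header name is matched case-insensitively, whitespace around the value is ignored, and the value must be exactly "bytes".

    #include <ctype.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>
    #include <strings.h>

    static bool accepts_ranges(const char *header, size_t len)
    {
        const char *end = header + len;
        const char *name = "accept-ranges:";
        const char *value = "bytes";
        const char *p;

        if (len < strlen(name) || strncasecmp(header, name, strlen(name)) != 0) {
            return false;
        }

        p = strchr(header, ':') + 1;
        while (p < end && *p && isspace((unsigned char)*p)) {
            p++;                                /* skip leading whitespace */
        }
        if ((size_t)(end - p) < strlen(value) ||
            strncmp(p, value, strlen(value)) != 0) {
            return false;
        }
        p += strlen(value);
        while (p < end && *p && isspace((unsigned char)*p)) {
            p++;                                /* allow trailing whitespace */
        }
        return p == end || !*p;                 /* nothing else after the value */
    }

    int main(void)
    {
        const char *yes = "accept-ranges:   bytes  \r\n";
        const char *no = "Accept-Ranges: none\r\n";

        printf("%d %d\n", accepts_ranges(yes, strlen(yes)),
               accepts_ranges(no, strlen(no)));  /* prints: 1 0 */
        return 0;
    }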
diff --git a/block/qcow2-threads.c b/block/qcow2-threads.c
index 77bb578cdf..a68126f291 100644
--- a/block/qcow2-threads.c
+++ b/block/qcow2-threads.c
@@ -128,12 +128,12 @@ static ssize_t qcow2_compress(void *dest, size_t dest_size,
* @src - source buffer, @src_size bytes
*
* Returns: 0 on success
- * -1 on fail
+ * -EIO on fail
*/
static ssize_t qcow2_decompress(void *dest, size_t dest_size,
const void *src, size_t src_size)
{
- int ret = 0;
+ int ret;
z_stream strm;
memset(&strm, 0, sizeof(strm));
@@ -144,17 +144,19 @@ static ssize_t qcow2_decompress(void *dest, size_t dest_size,
ret = inflateInit2(&strm, -12);
if (ret != Z_OK) {
- return -1;
+ return -EIO;
}
ret = inflate(&strm, Z_FINISH);
- if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || strm.avail_out != 0) {
+ if ((ret == Z_STREAM_END || ret == Z_BUF_ERROR) && strm.avail_out == 0) {
/*
* We approve Z_BUF_ERROR because we need @dest buffer to be filled, but
* @src buffer may be processed partly (because in qcow2 we know size of
* compressed data with precision of one sector)
*/
- ret = -1;
+ ret = 0;
+ } else {
+ ret = -EIO;
}
inflateEnd(&strm);
diff --git a/block/qcow2.c b/block/qcow2.c
index 3640e8c07d..d44b45633d 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2610,6 +2610,7 @@ static void qcow2_close(BlockDriverState *bs)
qcrypto_block_free(s->crypto);
s->crypto = NULL;
+ qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);
g_free(s->unknown_header_fields);
cleanup_unknown_header_ext(bs);
@@ -4608,60 +4609,6 @@ static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
return ret;
}
-static ssize_t qcow2_measure_crypto_hdr_init_func(QCryptoBlock *block,
- size_t headerlen, void *opaque, Error **errp)
-{
- size_t *headerlenp = opaque;
-
- /* Stash away the payload size */
- *headerlenp = headerlen;
- return 0;
-}
-
-static ssize_t qcow2_measure_crypto_hdr_write_func(QCryptoBlock *block,
- size_t offset, const uint8_t *buf, size_t buflen,
- void *opaque, Error **errp)
-{
- /* Discard the bytes, we're not actually writing to an image */
- return buflen;
-}
-
-/* Determine the number of bytes for the LUKS payload */
-static bool qcow2_measure_luks_headerlen(QemuOpts *opts, size_t *len,
- Error **errp)
-{
- QDict *opts_qdict;
- QDict *cryptoopts_qdict;
- QCryptoBlockCreateOptions *cryptoopts;
- QCryptoBlock *crypto;
-
- /* Extract "encrypt." options into a qdict */
- opts_qdict = qemu_opts_to_qdict(opts, NULL);
- qdict_extract_subqdict(opts_qdict, &cryptoopts_qdict, "encrypt.");
- qobject_unref(opts_qdict);
-
- /* Build QCryptoBlockCreateOptions object from qdict */
- qdict_put_str(cryptoopts_qdict, "format", "luks");
- cryptoopts = block_crypto_create_opts_init(cryptoopts_qdict, errp);
- qobject_unref(cryptoopts_qdict);
- if (!cryptoopts) {
- return false;
- }
-
- /* Fake LUKS creation in order to determine the payload size */
- crypto = qcrypto_block_create(cryptoopts, "encrypt.",
- qcow2_measure_crypto_hdr_init_func,
- qcow2_measure_crypto_hdr_write_func,
- len, errp);
- qapi_free_QCryptoBlockCreateOptions(cryptoopts);
- if (!crypto) {
- return false;
- }
-
- qcrypto_block_free(crypto);
- return true;
-}
-
static BlockMeasureInfo *qcow2_measure(QemuOpts *opts, BlockDriverState *in_bs,
Error **errp)
{
@@ -4712,9 +4659,27 @@ static BlockMeasureInfo *qcow2_measure(QemuOpts *opts, BlockDriverState *in_bs,
g_free(optstr);
if (has_luks) {
+ g_autoptr(QCryptoBlockCreateOptions) create_opts = NULL;
+ QDict *opts_qdict;
+ QDict *cryptoopts;
size_t headerlen;
- if (!qcow2_measure_luks_headerlen(opts, &headerlen, &local_err)) {
+ opts_qdict = qemu_opts_to_qdict(opts, NULL);
+ qdict_extract_subqdict(opts_qdict, &cryptoopts, "encrypt.");
+ qobject_unref(opts_qdict);
+
+ qdict_put_str(cryptoopts, "format", "luks");
+
+ create_opts = block_crypto_create_opts_init(cryptoopts, errp);
+ qobject_unref(cryptoopts);
+ if (!create_opts) {
+ goto err;
+ }
+
+ if (!qcrypto_block_calculate_payload_offset(create_opts,
+ "encrypt.",
+ &headerlen,
+ &local_err)) {
goto err;
}
diff --git a/block/trace-events b/block/trace-events
index 1a7329b736..29dff8881c 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -48,6 +48,7 @@ block_copy_process(void *bcs, int64_t start) "bcs %p start %"PRId64
block_copy_copy_range_fail(void *bcs, int64_t start, int ret) "bcs %p start %"PRId64" ret %d"
block_copy_read_fail(void *bcs, int64_t start, int ret) "bcs %p start %"PRId64" ret %d"
block_copy_write_fail(void *bcs, int64_t start, int ret) "bcs %p start %"PRId64" ret %d"
+block_copy_write_zeroes_fail(void *bcs, int64_t start, int ret) "bcs %p start %"PRId64" ret %d"
# ../blockdev.c
qmp_block_job_cancel(void *job) "job %p"
diff --git a/blockjob.c b/blockjob.c
index 5d63b1e89d..fc850312c1 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -299,8 +299,8 @@ BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
info->device = g_strdup(job->job.id);
info->busy = atomic_read(&job->job.busy);
info->paused = job->job.pause_count > 0;
- info->offset = job->job.progress_current;
- info->len = job->job.progress_total;
+ info->offset = job->job.progress.current;
+ info->len = job->job.progress.total;
info->speed = job->speed;
info->io_status = job->iostatus;
info->ready = job_is_ready(&job->job),
@@ -330,8 +330,8 @@ static void block_job_event_cancelled(Notifier *n, void *opaque)
qapi_event_send_block_job_cancelled(job_type(&job->job),
job->job.id,
- job->job.progress_total,
- job->job.progress_current,
+ job->job.progress.total,
+ job->job.progress.current,
job->speed);
}
@@ -350,8 +350,8 @@ static void block_job_event_completed(Notifier *n, void *opaque)
qapi_event_send_block_job_completed(job_type(&job->job),
job->job.id,
- job->job.progress_total,
- job->job.progress_current,
+ job->job.progress.total,
+ job->job.progress.current,
job->speed,
!!msg,
msg);
@@ -379,8 +379,8 @@ static void block_job_event_ready(Notifier *n, void *opaque)
qapi_event_send_block_job_ready(job_type(&job->job),
job->job.id,
- job->job.progress_total,
- job->job.progress_current,
+ job->job.progress.total,
+ job->job.progress.current,
job->speed);
}
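These hunks switch BlockJob queries and events from the old progress_current/progress_total fields to the shared ProgressMeter added by this series. include/qemu/progress_meter.h itself is not shown in this excerpt, so the sketch below only uses names that appear in the surrounding hunks (progress_work_done(), progress_set_remaining(), and the .current/.total fields) and should be read as an assumption about the header, not a definition of it.

    /* Sketch only: producers update the meter from wherever the work happens;
     * readers such as block_job_query() consume job->progress.current and
     * job->progress.total directly. */
    static void update_progress(Job *job, uint64_t done, uint64_t remaining)
    {
        progress_work_done(&job->progress, done);
        progress_set_remaining(&job->progress, remaining);
    }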
diff --git a/configure b/configure
index cbf864bff1..3c7470096f 100755
--- a/configure
+++ b/configure
@@ -4093,6 +4093,11 @@ if test "$linux_io_uring" != "no" ; then
linux_io_uring_cflags=$($pkg_config --cflags liburing)
linux_io_uring_libs=$($pkg_config --libs liburing)
linux_io_uring=yes
+
+ # io_uring is used in libqemuutil.a where per-file -libs variables are not
+ # seen by programs linking the archive. It's not ideal, but just add the
+ # library dependency globally.
+ LIBS="$linux_io_uring_libs $LIBS"
else
if test "$linux_io_uring" = "yes" ; then
feature_not_found "linux io_uring" "Install liburing devel"
diff --git a/crypto/block.c b/crypto/block.c
index 325752871c..6f42b32f1e 100644
--- a/crypto/block.c
+++ b/crypto/block.c
@@ -115,6 +115,42 @@ QCryptoBlock *qcrypto_block_create(QCryptoBlockCreateOptions *options,
}
+static ssize_t qcrypto_block_headerlen_hdr_init_func(QCryptoBlock *block,
+ size_t headerlen, void *opaque, Error **errp)
+{
+ size_t *headerlenp = opaque;
+
+ /* Stash away the payload size */
+ *headerlenp = headerlen;
+ return 0;
+}
+
+
+static ssize_t qcrypto_block_headerlen_hdr_write_func(QCryptoBlock *block,
+ size_t offset, const uint8_t *buf, size_t buflen,
+ void *opaque, Error **errp)
+{
+ /* Discard the bytes, we're not actually writing to an image */
+ return buflen;
+}
+
+
+bool
+qcrypto_block_calculate_payload_offset(QCryptoBlockCreateOptions *create_opts,
+ const char *optprefix,
+ size_t *len,
+ Error **errp)
+{
+ /* Fake LUKS creation in order to determine the payload size */
+ g_autoptr(QCryptoBlock) crypto =
+ qcrypto_block_create(create_opts, optprefix,
+ qcrypto_block_headerlen_hdr_init_func,
+ qcrypto_block_headerlen_hdr_write_func,
+ len, errp);
+ return crypto != NULL;
+}
+
+
QCryptoBlockInfo *qcrypto_block_get_info(QCryptoBlock *block,
Error **errp)
{
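The new helper lets callers size the LUKS header without writing anything, as the block/crypto.c and block/qcow2.c hunks earlier in this series do. A hedged sketch of the call follows; measure_luks_overhead() is an illustrative name, and the optprefix argument is NULL for the raw luks driver versus "encrypt." for qcow2.

    /* Sketch only: returns false and sets errp on failure, mirroring the
     * callers added earlier in this series. */
    static bool measure_luks_overhead(QCryptoBlockCreateOptions *create_opts,
                                      size_t *luks_payload_size, Error **errp)
    {
        return qcrypto_block_calculate_payload_offset(create_opts,
                                                      NULL /* optprefix */,
                                                      luks_payload_size, errp);
    }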
diff --git a/docs/can.txt b/docs/can.txt
index 9fa6ed51c8..11ed8f2d68 100644
--- a/docs/can.txt
+++ b/docs/can.txt
@@ -13,7 +13,7 @@ controller is implemented.
The PCI addon card hardware has been selected as the first CAN
interface to implement because such device can be easily connected
-to systems with different CPU architectures (x86, PowerPC, ARM, etc.).
+to systems with different CPU architectures (x86, PowerPC, Arm, etc.).
The project has been initially started in frame of RTEMS GSoC 2013
slot by Jin Yang under our mentoring. The initial idea was to provide generic
diff --git a/docs/devel/atomics.txt b/docs/devel/atomics.txt
index a4db3a4aaa..67bdf82628 100644
--- a/docs/devel/atomics.txt
+++ b/docs/devel/atomics.txt
@@ -87,7 +87,7 @@ Sequentially consistent loads and stores can be done using:
atomic_xchg(ptr, val) for stores
However, they are quite expensive on some platforms, notably POWER and
-ARM. Therefore, qemu/atomic.h provides two primitives with slightly
+Arm. Therefore, qemu/atomic.h provides two primitives with slightly
weaker constraints:
typeof(*ptr) atomic_mb_read(ptr)
diff --git a/docs/devel/kconfig.rst b/docs/devel/kconfig.rst
index b7bca44704..e5df72b342 100644
--- a/docs/devel/kconfig.rst
+++ b/docs/devel/kconfig.rst
@@ -8,7 +8,7 @@ time different targets can share large amounts of code. For example,
a POWER and an x86 board can run the same code to emulate a PCI network
card, even though the boards use different PCI host bridges, and they
can run the same code to emulate a SCSI disk while using different
-SCSI adapters. ARM, s390 and x86 boards can all present a virtio-blk
+SCSI adapters. Arm, s390 and x86 boards can all present a virtio-blk
disk to their guests, but with three different virtio guest interfaces.
Each QEMU target enables a subset of the boards, devices and buses that
diff --git a/docs/devel/loads-stores.rst b/docs/devel/loads-stores.rst
index 03aa9e7ff8..0d99eb24c1 100644
--- a/docs/devel/loads-stores.rst
+++ b/docs/devel/loads-stores.rst
@@ -302,7 +302,7 @@ way QEMU defines the view of memory that a device or CPU has.
or bus fabric.)
Each CPU has an AddressSpace. Some kinds of CPU have more than
-one AddressSpace (for instance ARM guest CPUs have an AddressSpace
+one AddressSpace (for instance Arm guest CPUs have an AddressSpace
for the Secure world and one for NonSecure if they implement TrustZone).
Devices which can do DMA-type operations should generally have an
AddressSpace. There is also a "system address space" which typically
diff --git a/docs/devel/multi-thread-tcg.txt b/docs/devel/multi-thread-tcg.txt
index 782bebc28b..3c85ac0eab 100644
--- a/docs/devel/multi-thread-tcg.txt
+++ b/docs/devel/multi-thread-tcg.txt
@@ -227,7 +227,7 @@ minimise contention.
(Current solution)
MMIO access automatically serialises hardware emulation by way of the
-BQL. Currently ARM targets serialise all ARM_CP_IO register accesses
+BQL. Currently Arm targets serialise all ARM_CP_IO register accesses
and also defer the reset/startup of vCPUs to the vCPU context by way
of async_run_on_cpu().
@@ -268,7 +268,7 @@ ordered backends this could become a NOP.
Aside from explicit standalone memory barrier instructions there are
also implicit memory ordering semantics which comes with each guest
memory access instruction. For example all x86 load/stores come with
-fairly strong guarantees of sequential consistency where as ARM has
+fairly strong guarantees of sequential consistency whereas Arm has
special variants of load/store instructions that imply acquire/release
semantics.
@@ -317,7 +317,7 @@ x86 cmpxchg instruction.
The second type offer a pair of load/store instructions which offer a
guarantee that a region of memory has not been touched between the
-load and store instructions. An example of this is ARM's ldrex/strex
+load and store instructions. An example of this is Arm's ldrex/strex
pair where the strex instruction will return a flag indicating a
successful store only if no other CPU has accessed the memory region
since the ldrex.
@@ -339,7 +339,7 @@ CURRENT OPEN QUESTIONS:
The TCG provides a number of atomic helpers (tcg_gen_atomic_*) which
can be used directly or combined to emulate other instructions like
-ARM's ldrex/strex instructions. While they are susceptible to the ABA
+Arm's ldrex/strex instructions. While they are susceptible to the ABA
problem so far common guests have not implemented patterns where
this may be a problem - typically presenting a locking ABI which
assumes cmpxchg like semantics.
diff --git a/docs/devel/tcg.rst b/docs/devel/tcg.rst
index 4956a30a4e..4ebde44b9d 100644
--- a/docs/devel/tcg.rst
+++ b/docs/devel/tcg.rst
@@ -83,7 +83,7 @@ memory until the end of the translation block. This is done for internal
emulation state that is rarely accessed directly by the program and/or changes
very often throughout the execution of a translation block---this includes
condition codes on x86, delay slots on SPARC, conditional execution on
-ARM, and so on. This state is stored for each target instruction, and
+Arm, and so on. This state is stored for each target instruction, and
looked up on exceptions.
MMU emulation
diff --git a/docs/index.html.in b/docs/index.html.in
index cc19aad2ec..e9a160384c 100644
--- a/docs/index.html.in
+++ b/docs/index.html.in
@@ -7,13 +7,13 @@
<body>
<h1>QEMU @@VERSION@@ Documentation</h1>
<ul>
- <li><a href="qemu-qmp-ref.html">QMP Reference Manual</a></li>
- <li><a href="qemu-ga-ref.html">Guest Agent Protocol Reference</a></li>
- <li><a href="interop/index.html">System Emulation Management and Interoperability Guide</a></li>
- <li><a href="specs/index.html">System Emulation Guest Hardware Specifications</a></li>
<li><a href="system/index.html">System Emulation User's Guide</a></li>
- <li><a href="tools/index.html">Tools Guide</a></li>
<li><a href="user/index.html">User Mode Emulation User's Guide</a></li>
+ <li><a href="tools/index.html">Tools Guide</a></li>
+ <li><a href="interop/index.html">System Emulation Management and Interoperability Guide</a></li>
+ <li><a href="specs/index.html">System Emulation Guest Hardware Specifications</a></li>
+ <li><a href="qemu-qmp-ref.html">QMP Reference Manual</a></li>
+ <li><a href="qemu-ga-ref.html">Guest Agent Protocol Reference</a></li>
</ul>
</body>
</html>
diff --git a/docs/index.rst b/docs/index.rst
index 376dab2885..763e3d0426 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -10,9 +10,9 @@ Welcome to QEMU's documentation!
:maxdepth: 2
:caption: Contents:
- interop/index
- devel/index
- specs/index
system/index
- tools/index
user/index
+ tools/index
+ interop/index
+ specs/index
+ devel/index
diff --git a/docs/qemu-option-trace.rst.inc b/docs/qemu-option-trace.rst.inc
index 23cfcb4853..7e09773a9c 100644
--- a/docs/qemu-option-trace.rst.inc
+++ b/docs/qemu-option-trace.rst.inc
@@ -1,7 +1,3 @@
-..
- The contents of this file must be kept in sync with qemu-option-trace.texi
- until all the users of the texi file have been converted to rst and
- the texi file can be removed.
Specify tracing options.
diff --git a/docs/replay.txt b/docs/replay.txt
index f4619a62a3..70c27edb36 100644
--- a/docs/replay.txt
+++ b/docs/replay.txt
@@ -19,7 +19,7 @@ Deterministic replay has the following features:
the memory, state of the hardware devices, clocks, and screen of the VM.
* Writes execution log into the file for later replaying for multiple times
on different machines.
- * Supports i386, x86_64, and ARM hardware platforms.
+ * Supports i386, x86_64, and Arm hardware platforms.
* Performs deterministic replay of all operations with keyboard and mouse
input devices.
diff --git a/docs/specs/fw_cfg.txt b/docs/specs/fw_cfg.txt
index 08c00bdf44..8f1ebc66fa 100644
--- a/docs/specs/fw_cfg.txt
+++ b/docs/specs/fw_cfg.txt
@@ -82,7 +82,7 @@ Selector Register IOport: 0x510
Data Register IOport: 0x511
DMA Address IOport: 0x514
-=== ARM Register Locations ===
+=== Arm Register Locations ===
Selector Register address: Base + 8 (2 bytes)
Data Register address: Base + 0 (8 bytes)
diff --git a/docs/specs/tpm.rst b/docs/specs/tpm.rst
index da9eb39ca9..5e61238bc5 100644
--- a/docs/specs/tpm.rst
+++ b/docs/specs/tpm.rst
@@ -25,7 +25,7 @@ QEMU files related to TPM TIS interface:
Both an ISA device and a sysbus device are available. The former is
used with pc/q35 machine while the latter can be instantiated in the
-ARM virt machine.
+Arm virt machine.
CRB interface
-------------
@@ -331,7 +331,7 @@ In case a pSeries machine is emulated, use the following command line:
-device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x3,drive=drive-virtio-disk0,id=virtio-disk0 \
-drive file=test.img,format=raw,if=none,id=drive-virtio-disk0
-In case an ARM virt machine is emulated, use the following command line:
+In case an Arm virt machine is emulated, use the following command line:
.. code-block:: console
@@ -346,7 +346,7 @@ In case an ARM virt machine is emulated, use the following command line:
-drive if=pflash,format=raw,file=flash0.img,readonly \
-drive if=pflash,format=raw,file=flash1.img
- On ARM, ACPI boot with TPM is not yet supported.
+ On Arm, ACPI boot with TPM is not yet supported.
In case SeaBIOS is used as firmware, it should show the TPM menu item
after entering the menu with 'ESC'.
diff --git a/docs/sphinx/hxtool.py b/docs/sphinx/hxtool.py
index 7dd223fe36..fb0649a3d5 100644
--- a/docs/sphinx/hxtool.py
+++ b/docs/sphinx/hxtool.py
@@ -37,13 +37,11 @@ else:
__version__ = '1.0'
-# We parse hx files with a state machine which may be in one of three
-# states: reading the C code fragment, inside a texi fragment,
-# or inside a rST fragment.
+# We parse hx files with a state machine which may be in one of two
+# states: reading the C code fragment, or inside a rST fragment.
class HxState(Enum):
CTEXT = 1
- TEXI = 2
- RST = 3
+ RST = 2
def serror(file, lnum, errtext):
"""Raise an exception giving a user-friendly syntax error message"""
@@ -110,31 +108,13 @@ class HxtoolDocDirective(Directive):
if directive == 'HXCOMM':
pass
- elif directive == 'STEXI':
- if state == HxState.RST:
- serror(hxfile, lnum, 'expected ERST, found STEXI')
- elif state == HxState.TEXI:
- serror(hxfile, lnum, 'expected ETEXI, found STEXI')
- else:
- state = HxState.TEXI
- elif directive == 'ETEXI':
- if state == HxState.RST:
- serror(hxfile, lnum, 'expected ERST, found ETEXI')
- elif state == HxState.CTEXT:
- serror(hxfile, lnum, 'expected STEXI, found ETEXI')
- else:
- state = HxState.CTEXT
elif directive == 'SRST':
if state == HxState.RST:
serror(hxfile, lnum, 'expected ERST, found SRST')
- elif state == HxState.TEXI:
- serror(hxfile, lnum, 'expected ETEXI, found SRST')
else:
state = HxState.RST
elif directive == 'ERST':
- if state == HxState.TEXI:
- serror(hxfile, lnum, 'expected ETEXI, found ERST')
- elif state == HxState.CTEXT:
+ if state == HxState.CTEXT:
serror(hxfile, lnum, 'expected SRST, found ERST')
else:
state = HxState.CTEXT
diff --git a/docs/arm-cpu-features.rst b/docs/system/arm/cpu-features.rst
index fc1623aeca..2d5c06cd01 100644
--- a/docs/arm-cpu-features.rst
+++ b/docs/system/arm/cpu-features.rst
@@ -1,19 +1,13 @@
+Arm CPU Features
================
-ARM CPU Features
-================
-
-Examples of probing and using ARM CPU features
-
-Introduction
-============
CPU features are optional features that a CPU of supporting type may
choose to implement or not. In QEMU, optional CPU features have
corresponding boolean CPU proprieties that, when enabled, indicate
that the feature is implemented, and, conversely, when disabled,
-indicate that it is not implemented. An example of an ARM CPU feature
+indicate that it is not implemented. An example of an Arm CPU feature
is the Performance Monitoring Unit (PMU). CPU types such as the
-Cortex-A15 and the Cortex-A57, which respectively implement ARM
+Cortex-A15 and the Cortex-A57, which respectively implement Arm
architecture reference manuals ARMv7-A and ARMv8-A, may both optionally
implement PMUs. For example, if a user wants to use a Cortex-A15 without
a PMU, then the `-cpu` parameter should contain `pmu=off` on the QEMU
diff --git a/docs/system/arm/integratorcp.rst b/docs/system/arm/integratorcp.rst
new file mode 100644
index 0000000000..e6f050f602
--- /dev/null
+++ b/docs/system/arm/integratorcp.rst
@@ -0,0 +1,16 @@
+Integrator/CP (``integratorcp``)
+================================
+
+The Arm Integrator/CP board is emulated with the following devices:
+
+- ARM926E, ARM1026E, ARM946E, ARM1136 or Cortex-A8 CPU
+
+- Two PL011 UARTs
+
+- SMC 91c111 Ethernet adapter
+
+- PL110 LCD controller
+
+- PL050 KMI with PS/2 keyboard and mouse.
+
+- PL181 MultiMedia Card Interface with SD card.
diff --git a/docs/system/arm/musicpal.rst b/docs/system/arm/musicpal.rst
new file mode 100644
index 0000000000..9de380edf8
--- /dev/null
+++ b/docs/system/arm/musicpal.rst
@@ -0,0 +1,19 @@
+Freecom MusicPal (``musicpal``)
+===============================
+
+The Freecom MusicPal internet radio emulation includes the following
+elements:
+
+- Marvell MV88W8618 Arm core.
+
+- 32 MB RAM, 256 KB SRAM, 8 MB flash.
+
+- Up to 2 16550 UARTs
+
+- MV88W8xx8 Ethernet controller
+
+- MV88W8618 audio controller, WM8750 CODEC and mixer
+
+- 128x64 display with brightness control
+
+- 2 buttons, 2 navigation wheels with button function
diff --git a/docs/system/arm/nseries.rst b/docs/system/arm/nseries.rst
new file mode 100644
index 0000000000..cd9edf5d88
--- /dev/null
+++ b/docs/system/arm/nseries.rst
@@ -0,0 +1,33 @@
+Nokia N800 and N810 tablets (``n800``, ``n810``)
+================================================
+
+Nokia N800 and N810 internet tablets (known also as RX-34 and RX-44 /
+48) emulation supports the following elements:
+
+- Texas Instruments OMAP2420 System-on-chip (ARM1136 core)
+
+- RAM and non-volatile OneNAND Flash memories
+
+- Display connected to EPSON remote framebuffer chip and OMAP on-chip
+ display controller and a LS041y3 MIPI DBI-C controller
+
+- TI TSC2301 (in N800) and TI TSC2005 (in N810) touchscreen
+ controllers driven through SPI bus
+
+- National Semiconductor LM8323-controlled qwerty keyboard driven
+ through |I2C| bus
+
+- Secure Digital card connected to OMAP MMC/SD host
+
+- Three OMAP on-chip UARTs and on-chip STI debugging console
+
+- Mentor Graphics \"Inventra\" dual-role USB controller embedded in a
+ TI TUSB6010 chip - only USB host mode is supported
+
+- TI TMP105 temperature sensor driven through |I2C| bus
+
+- TI TWL92230C power management companion with an RTC on
+ |I2C| bus
+
+- Nokia RETU and TAHVO multi-purpose chips with an RTC, connected
+ through CBUS
diff --git a/docs/system/arm/palm.rst b/docs/system/arm/palm.rst
new file mode 100644
index 0000000000..47ff9b36d4
--- /dev/null
+++ b/docs/system/arm/palm.rst
@@ -0,0 +1,23 @@
+Palm Tungsten|E PDA (``cheetah``)
+=================================
+
+The Palm Tungsten|E PDA (codename \"Cheetah\") emulation includes the
+following elements:
+
+- Texas Instruments OMAP310 System-on-chip (ARM925T core)
+
+- ROM and RAM memories (ROM firmware image can be loaded with
+ -option-rom)
+
+- On-chip LCD controller
+
+- On-chip Real Time Clock
+
+- TI TSC2102i touchscreen controller / analog-digital converter /
+ Audio CODEC, connected through MicroWire and |I2S| busses
+
+- GPIO-connected matrix keypad
+
+- Secure Digital card connected to OMAP MMC/SD host
+
+- Three on-chip UARTs
diff --git a/docs/system/arm/realview.rst b/docs/system/arm/realview.rst
new file mode 100644
index 0000000000..65f5be346b
--- /dev/null
+++ b/docs/system/arm/realview.rst
@@ -0,0 +1,34 @@
+Arm Realview boards (``realview-eb``, ``realview-eb-mpcore``, ``realview-pb-a8``, ``realview-pbx-a9``)
+======================================================================================================
+
+Several variants of the Arm RealView baseboard are emulated, including
+the EB, PB-A8 and PBX-A9. Due to interactions with the bootloader, only
+certain Linux kernel configurations work out of the box on these boards.
+
+Kernels for the PB-A8 board should have CONFIG_REALVIEW_HIGH_PHYS_OFFSET
+enabled in the kernel, and expect 512M RAM. Kernels for The PBX-A9 board
+should have CONFIG_SPARSEMEM enabled, CONFIG_REALVIEW_HIGH_PHYS_OFFSET
+disabled and expect 1024M RAM.
+
+The following devices are emulated:
+
+- ARM926E, ARM1136, ARM11MPCore, Cortex-A8 or Cortex-A9 MPCore CPU
+
+- Arm AMBA Generic/Distributed Interrupt Controller
+
+- Four PL011 UARTs
+
+- SMC 91c111 or SMSC LAN9118 Ethernet adapter
+
+- PL110 LCD controller
+
+- PL050 KMI with PS/2 keyboard and mouse
+
+- PCI host bridge
+
+- PCI OHCI USB controller
+
+- LSI53C895A PCI SCSI Host Bus Adapter with hard disk and CD-ROM
+ devices
+
+- PL181 MultiMedia Card Interface with SD card.
diff --git a/docs/system/arm/stellaris.rst b/docs/system/arm/stellaris.rst
new file mode 100644
index 0000000000..8af4ad79c7
--- /dev/null
+++ b/docs/system/arm/stellaris.rst
@@ -0,0 +1,26 @@
+Stellaris boards (``lm3s6965evb``, ``lm3s811evb``)
+==================================================
+
+The Luminary Micro Stellaris LM3S811EVB emulation includes the following
+devices:
+
+- Cortex-M3 CPU core.
+
+- 64k Flash and 8k SRAM.
+
+- Timers, UARTs, ADC and |I2C| interface.
+
+- OSRAM Pictiva 96x16 OLED with SSD0303 controller on
+ |I2C| bus.
+
+The Luminary Micro Stellaris LM3S6965EVB emulation includes the
+following devices:
+
+- Cortex-M3 CPU core.
+
+- 256k Flash and 64k SRAM.
+
+- Timers, UARTs, ADC, |I2C| and SSI interfaces.
+
+- OSRAM Pictiva 128x64 OLED with SSD0323 controller connected via
+ SSI.
diff --git a/docs/system/arm/sx1.rst b/docs/system/arm/sx1.rst
new file mode 100644
index 0000000000..8bce30d4b2
--- /dev/null
+++ b/docs/system/arm/sx1.rst
@@ -0,0 +1,18 @@
+Siemens SX1 (``sx1``, ``sx1-v1``)
+=================================
+
+QEMU provides basic emulation of the Siemens SX1 models v1 and v2
+(default). The emulation includes the following elements:
+
+- Texas Instruments OMAP310 System-on-chip (ARM925T core)
+
+- ROM and RAM memories (ROM firmware image can be loaded with
+  -pflash); V1 has one 16MB and one 8MB flash, V2 has one 32MB flash
+
+- On-chip LCD controller
+
+- On-chip Real Time Clock
+
+- Secure Digital card connected to OMAP MMC/SD host
+
+- Three on-chip UARTs
diff --git a/docs/system/arm/versatile.rst b/docs/system/arm/versatile.rst
new file mode 100644
index 0000000000..51221c30a4
--- /dev/null
+++ b/docs/system/arm/versatile.rst
@@ -0,0 +1,29 @@
+Arm Versatile boards (``versatileab``, ``versatilepb``)
+=======================================================
+
+The Arm Versatile baseboard is emulated with the following devices:
+
+- ARM926E, ARM1136 or Cortex-A8 CPU
+
+- PL190 Vectored Interrupt Controller
+
+- Four PL011 UARTs
+
+- SMC 91c111 Ethernet adapter
+
+- PL110 LCD controller
+
+- PL050 KMI with PS/2 keyboard and mouse.
+
+- PCI host bridge. Note the emulated PCI bridge only provides access
+ to PCI memory space. It does not provide access to PCI IO space. This
+ means some devices (eg. ne2k_pci NIC) are not usable, and others (eg.
+ rtl8139 NIC) are only usable when the guest drivers use the memory
+ mapped control registers.
+
+- PCI OHCI USB controller.
+
+- LSI53C895A PCI SCSI Host Bus Adapter with hard disk and CD-ROM
+ devices.
+
+- PL181 MultiMedia Card Interface with SD card.
diff --git a/docs/system/arm/xscale.rst b/docs/system/arm/xscale.rst
new file mode 100644
index 0000000000..89ec93e904
--- /dev/null
+++ b/docs/system/arm/xscale.rst
@@ -0,0 +1,29 @@
+Sharp XScale-based PDA models (``akita``, ``borzoi``, ``spitz``, ``terrier``)
+=============================================================================
+
+The XScale-based clamshell PDA models (\"Spitz\", \"Akita\", \"Borzoi\"
+and \"Terrier\") emulation includes the following peripherals:
+
+- Intel PXA270 System-on-chip (ARMv5TE core)
+
+- NAND Flash memory
+
+- IBM/Hitachi DSCM microdrive in a PXA PCMCIA slot - not in "Akita"
+
+- On-chip OHCI USB controller
+
+- On-chip LCD controller
+
+- On-chip Real Time Clock
+
+- TI ADS7846 touchscreen controller on SSP bus
+
+- Maxim MAX1111 analog-digital converter on |I2C| bus
+
+- GPIO-connected keyboard controller and LEDs
+
+- Secure Digital card connected to PXA MMC/SD host
+
+- Three on-chip UARTs
+
+- WM8750 audio CODEC on |I2C| and |I2S| busses
diff --git a/docs/system/target-arm.rst b/docs/system/target-arm.rst
index d2a3b44ce8..1425bd5303 100644
--- a/docs/system/target-arm.rst
+++ b/docs/system/target-arm.rst
@@ -1,217 +1,86 @@
.. _ARM-System-emulator:
-ARM System emulator
+Arm System emulator
-------------------
-Use the executable ``qemu-system-arm`` to simulate a ARM machine. The
-ARM Integrator/CP board is emulated with the following devices:
-
-- ARM926E, ARM1026E, ARM946E, ARM1136 or Cortex-A8 CPU
-
-- Two PL011 UARTs
-
-- SMC 91c111 Ethernet adapter
-
-- PL110 LCD controller
-
-- PL050 KMI with PS/2 keyboard and mouse.
-
-- PL181 MultiMedia Card Interface with SD card.
-
-The ARM Versatile baseboard is emulated with the following devices:
-
-- ARM926E, ARM1136 or Cortex-A8 CPU
-
-- PL190 Vectored Interrupt Controller
-
-- Four PL011 UARTs
-
-- SMC 91c111 Ethernet adapter
-
-- PL110 LCD controller
-
-- PL050 KMI with PS/2 keyboard and mouse.
-
-- PCI host bridge. Note the emulated PCI bridge only provides access
- to PCI memory space. It does not provide access to PCI IO space. This
- means some devices (eg. ne2k_pci NIC) are not usable, and others (eg.
- rtl8139 NIC) are only usable when the guest drivers use the memory
- mapped control registers.
-
-- PCI OHCI USB controller.
-
-- LSI53C895A PCI SCSI Host Bus Adapter with hard disk and CD-ROM
- devices.
-
-- PL181 MultiMedia Card Interface with SD card.
-
-Several variants of the ARM RealView baseboard are emulated, including
-the EB, PB-A8 and PBX-A9. Due to interactions with the bootloader, only
-certain Linux kernel configurations work out of the box on these boards.
-
-Kernels for the PB-A8 board should have CONFIG_REALVIEW_HIGH_PHYS_OFFSET
-enabled in the kernel, and expect 512M RAM. Kernels for The PBX-A9 board
-should have CONFIG_SPARSEMEM enabled, CONFIG_REALVIEW_HIGH_PHYS_OFFSET
-disabled and expect 1024M RAM.
-
-The following devices are emulated:
-
-- ARM926E, ARM1136, ARM11MPCore, Cortex-A8 or Cortex-A9 MPCore CPU
-
-- ARM AMBA Generic/Distributed Interrupt Controller
-
-- Four PL011 UARTs
-
-- SMC 91c111 or SMSC LAN9118 Ethernet adapter
-
-- PL110 LCD controller
-
-- PL050 KMI with PS/2 keyboard and mouse
-
-- PCI host bridge
-
-- PCI OHCI USB controller
-
-- LSI53C895A PCI SCSI Host Bus Adapter with hard disk and CD-ROM
- devices
-
-- PL181 MultiMedia Card Interface with SD card.
-
-The XScale-based clamshell PDA models (\"Spitz\", \"Akita\", \"Borzoi\"
-and \"Terrier\") emulation includes the following peripherals:
-
-- Intel PXA270 System-on-chip (ARM V5TE core)
-
-- NAND Flash memory
-
-- IBM/Hitachi DSCM microdrive in a PXA PCMCIA slot - not in \"Akita\"
-
-- On-chip OHCI USB controller
-
-- On-chip LCD controller
-
-- On-chip Real Time Clock
-
-- TI ADS7846 touchscreen controller on SSP bus
-
-- Maxim MAX1111 analog-digital converter on |I2C| bus
-
-- GPIO-connected keyboard controller and LEDs
-
-- Secure Digital card connected to PXA MMC/SD host
-
-- Three on-chip UARTs
-
-- WM8750 audio CODEC on |I2C| and |I2S| busses
-
-The Palm Tungsten|E PDA (codename \"Cheetah\") emulation includes the
-following elements:
-
-- Texas Instruments OMAP310 System-on-chip (ARM 925T core)
-
-- ROM and RAM memories (ROM firmware image can be loaded with
- -option-rom)
-
-- On-chip LCD controller
-
-- On-chip Real Time Clock
-
-- TI TSC2102i touchscreen controller / analog-digital converter /
- Audio CODEC, connected through MicroWire and |I2S| busses
-
-- GPIO-connected matrix keypad
-
-- Secure Digital card connected to OMAP MMC/SD host
-
-- Three on-chip UARTs
-
-Nokia N800 and N810 internet tablets (known also as RX-34 and RX-44 /
-48) emulation supports the following elements:
-
-- Texas Instruments OMAP2420 System-on-chip (ARM 1136 core)
-
-- RAM and non-volatile OneNAND Flash memories
-
-- Display connected to EPSON remote framebuffer chip and OMAP on-chip
- display controller and a LS041y3 MIPI DBI-C controller
-
-- TI TSC2301 (in N800) and TI TSC2005 (in N810) touchscreen
- controllers driven through SPI bus
-
-- National Semiconductor LM8323-controlled qwerty keyboard driven
- through |I2C| bus
-
-- Secure Digital card connected to OMAP MMC/SD host
-
-- Three OMAP on-chip UARTs and on-chip STI debugging console
-
-- Mentor Graphics \"Inventra\" dual-role USB controller embedded in a
- TI TUSB6010 chip - only USB host mode is supported
-
-- TI TMP105 temperature sensor driven through |I2C| bus
-
-- TI TWL92230C power management companion with an RTC on
- |I2C| bus
-
-- Nokia RETU and TAHVO multi-purpose chips with an RTC, connected
- through CBUS
-
-The Luminary Micro Stellaris LM3S811EVB emulation includes the following
-devices:
-
-- Cortex-M3 CPU core.
-
-- 64k Flash and 8k SRAM.
-
-- Timers, UARTs, ADC and |I2C| interface.
-
-- OSRAM Pictiva 96x16 OLED with SSD0303 controller on
- |I2C| bus.
-
-The Luminary Micro Stellaris LM3S6965EVB emulation includes the
-following devices:
-
-- Cortex-M3 CPU core.
-
-- 256k Flash and 64k SRAM.
-
-- Timers, UARTs, ADC, |I2C| and SSI interfaces.
-
-- OSRAM Pictiva 128x64 OLED with SSD0323 controller connected via
- SSI.
-
-The Freecom MusicPal internet radio emulation includes the following
-elements:
-
-- Marvell MV88W8618 ARM core.
-
-- 32 MB RAM, 256 KB SRAM, 8 MB flash.
-
-- Up to 2 16550 UARTs
-
-- MV88W8xx8 Ethernet controller
-
-- MV88W8618 audio controller, WM8750 CODEC and mixer
-
-- 128x64 display with brightness control
-
-- 2 buttons, 2 navigation wheels with button function
-
-The Siemens SX1 models v1 and v2 (default) basic emulation. The
-emulation includes the following elements:
-
-- Texas Instruments OMAP310 System-on-chip (ARM 925T core)
-
-- ROM and RAM memories (ROM firmware image can be loaded with
- -pflash) V1 1 Flash of 16MB and 1 Flash of 8MB V2 1 Flash of 32MB
-
-- On-chip LCD controller
-
-- On-chip Real Time Clock
-
-- Secure Digital card connected to OMAP MMC/SD host
-
-- Three on-chip UARTs
-
-A Linux 2.6 test image is available on the QEMU web site. More
-information is available in the QEMU mailing-list archive.
+QEMU can emulate both 32-bit and 64-bit Arm CPUs. Use the
+``qemu-system-aarch64`` executable to simulate a 64-bit Arm machine.
+You can use either ``qemu-system-arm`` or ``qemu-system-aarch64``
+to simulate a 32-bit Arm machine: in general, command lines that
+work for ``qemu-system-arm`` will behave the same when used with
+``qemu-system-aarch64``.
+
+QEMU has generally good support for Arm guests. It has support for
+nearly fifty different machines. The reason we support so many is that
+Arm hardware is much more widely varying than x86 hardware. Arm CPUs
+are generally built into "system-on-chip" (SoC) designs created by
+many different companies with different devices, and these SoCs are
+then built into machines which can vary still further even if they use
+the same SoC. Even with fifty boards QEMU does not cover more than a
+small fraction of the Arm hardware ecosystem.
+
+The situation for 64-bit Arm is fairly similar, except that we don't
+implement so many different machines.
+
+As well as the more common "A-profile" CPUs (which have MMUs and will
+run Linux) QEMU also supports "M-profile" CPUs such as the Cortex-M0,
+Cortex-M4 and Cortex-M33 (which are microcontrollers used in deeply
+embedded boards). For most boards the CPU type is fixed (matching what
+the hardware has), so typically you don't need to specify the CPU type
+by hand, except for special cases like the ``virt`` board.
+
+Choosing a board model
+======================
+
+For QEMU's Arm system emulation, you must specify which board
+model you want to use with the ``-M`` or ``--machine`` option;
+there is no default.
+
+Because Arm systems differ so much and in fundamental ways, typically
+operating system or firmware images intended to run on one machine
+will not run at all on any other. This is often surprising for new
+users who are used to the x86 world where every system looks like a
+standard PC. (Once the kernel has booted, most userspace software
+cares much less about the detail of the hardware.)
+
+If you already have a system image or a kernel that works on hardware
+and you want to boot with QEMU, check whether QEMU lists that machine
+in its ``-machine help`` output. If it is listed, then you can probably
+use that board model. If it is not listed, then unfortunately your image
+will almost certainly not boot on QEMU. (You might be able to
+extract the filesystem and use that with a different kernel which
+boots on a system that QEMU does emulate.)
+
+If you don't care about reproducing the idiosyncrasies of a particular
+bit of hardware, such as a small amount of RAM, no PCI or other hard
+disk, etc., and just want to run Linux, the best option is to use the
+``virt`` board. This is a platform which doesn't correspond to any
+real hardware and is designed for use in virtual machines. You'll
+need to compile Linux with a suitable configuration for running on
+the ``virt`` board. ``virt`` supports PCI, virtio, recent CPUs and
+large amounts of RAM. It also supports 64-bit CPUs.
+
+Board-specific documentation
+============================
+
+Unfortunately many of the Arm boards QEMU supports are currently
+undocumented; you can get a complete list by running
+``qemu-system-aarch64 --machine help``.
+
+.. toctree::
+
+ arm/integratorcp
+ arm/versatile
+ arm/realview
+ arm/xscale
+ arm/palm
+ arm/nseries
+ arm/stellaris
+ arm/musicpal
+ arm/sx1
+
+Arm CPU features
+================
+
+.. toctree::
+ arm/cpu-features
diff --git a/docs/user/main.rst b/docs/user/main.rst
index ca69f7727d..bd99b0fdbe 100644
--- a/docs/user/main.rst
+++ b/docs/user/main.rst
@@ -35,7 +35,7 @@ QEMU user space emulation has the following notable features:
On Linux, QEMU can emulate the ``clone`` syscall and create a real
host thread (with a separate virtual CPU) for each emulated thread.
Note that not all targets currently emulate atomic operations
- correctly. x86 and ARM use a global lock in order to preserve their
+ correctly. x86 and Arm use a global lock in order to preserve their
semantics.
QEMU was conceived so that ultimately it can emulate itself. Although it
@@ -173,11 +173,11 @@ Other binaries
user mode (Alpha)
``qemu-alpha`` TODO.
-user mode (ARM)
+user mode (Arm)
``qemu-armeb`` TODO.
-user mode (ARM)
-``qemu-arm`` is also capable of running ARM \"Angel\" semihosted ELF
+user mode (Arm)
+``qemu-arm`` is also capable of running Arm \"Angel\" semihosted ELF
binaries (as implemented by the arm-elf and arm-eabi Newlib/GDB
configurations), and arm-uclinux bFLT format binaries.
diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
index 499d6d54b0..ca5198438d 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -1,9 +1,9 @@
-HXCOMM Use DEFHEADING() to define headings in both help text and texi
-HXCOMM Text between STEXI and ETEXI are copied to texi version and
-HXCOMM discarded from C version
+HXCOMM Use DEFHEADING() to define headings in both help text and rST.
+HXCOMM Text between SRST and ERST is copied to the rST version and
+HXCOMM discarded from C version.
HXCOMM DEF(command, args, callback, arg_string, help) is used to construct
HXCOMM monitor info commands
-HXCOMM HXCOMM can be used for comments, discarded from both texi and C
+HXCOMM HXCOMM can be used for comments, discarded from both rST and C.
HXCOMM
HXCOMM In this file, generally SRST fragments should have two extra
HXCOMM spaces of indent, so that the documentation list item for "info foo"
diff --git a/hmp-commands.hx b/hmp-commands.hx
index f12263e071..7f0f3974ad 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1,9 +1,9 @@
-HXCOMM Use DEFHEADING() to define headings in both help text and texi
-HXCOMM Text between STEXI and ETEXI are copied to texi version and
-HXCOMM discarded from C version
+HXCOMM Use DEFHEADING() to define headings in both help text and rST.
+HXCOMM Text between SRST and ERST is copied to the rST version and
+HXCOMM discarded from C version.
HXCOMM DEF(command, args, callback, arg_string, help) is used to construct
HXCOMM monitor commands
-HXCOMM HXCOMM can be used for comments, discarded from both texi and C
+HXCOMM HXCOMM can be used for comments, discarded from both rST and C.
{
diff --git a/hw/9pfs/9p-proxy.c b/hw/9pfs/9p-proxy.c
index 8136e1342d..6f598a0f11 100644
--- a/hw/9pfs/9p-proxy.c
+++ b/hw/9pfs/9p-proxy.c
@@ -1139,10 +1139,10 @@ static int proxy_parse_opts(QemuOpts *opts, FsDriverEntry *fs, Error **errp)
}
if (socket) {
fs->path = g_strdup(socket);
- fs->export_flags = V9FS_PROXY_SOCK_NAME;
+ fs->export_flags |= V9FS_PROXY_SOCK_NAME;
} else {
fs->path = g_strdup(sock_fd);
- fs->export_flags = V9FS_PROXY_SOCK_FD;
+ fs->export_flags |= V9FS_PROXY_SOCK_FD;
}
return 0;
}
diff --git a/hw/misc/mac_via.c b/hw/misc/mac_via.c
index b7d0012794..81343301b1 100644
--- a/hw/misc/mac_via.c
+++ b/hw/misc/mac_via.c
@@ -30,6 +30,7 @@
#include "hw/qdev-properties.h"
#include "sysemu/block-backend.h"
#include "trace.h"
+#include "qemu/log.h"
/*
* VIAs: There are two in every machine,
@@ -381,8 +382,10 @@ static void via2_irq_request(void *opaque, int irq, int level)
static void pram_update(MacVIAState *m)
{
if (m->blk) {
- blk_pwrite(m->blk, 0, m->mos6522_via1.PRAM,
- sizeof(m->mos6522_via1.PRAM), 0);
+ if (blk_pwrite(m->blk, 0, m->mos6522_via1.PRAM,
+ sizeof(m->mos6522_via1.PRAM), 0) < 0) {
+ qemu_log("pram_update: cannot write to file\n");
+ }
}
}
diff --git a/include/block/aio.h b/include/block/aio.h
index 9dd61cee7e..cb1989105a 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -14,6 +14,9 @@
#ifndef QEMU_AIO_H
#define QEMU_AIO_H
+#ifdef CONFIG_LINUX_IO_URING
+#include <liburing.h>
+#endif
#include "qemu/queue.h"
#include "qemu/event_notifier.h"
#include "qemu/thread.h"
@@ -52,6 +55,56 @@ struct ThreadPool;
struct LinuxAioState;
struct LuringState;
+/* Is polling disabled? */
+bool aio_poll_disabled(AioContext *ctx);
+
+/* Callbacks for file descriptor monitoring implementations */
+typedef struct {
+ /*
+ * update:
+ * @ctx: the AioContext
+ * @old_node: the existing handler or NULL if this file descriptor is being
+ * monitored for the first time
+ * @new_node: the new handler or NULL if this file descriptor is being
+ * removed
+ *
+ * Add/remove/modify a monitored file descriptor.
+ *
+ * Called with ctx->list_lock acquired.
+ */
+ void (*update)(AioContext *ctx, AioHandler *old_node, AioHandler *new_node);
+
+ /*
+ * wait:
+ * @ctx: the AioContext
+ * @ready_list: list for handlers that become ready
+ * @timeout: maximum duration to wait, in nanoseconds
+ *
+ * Wait for file descriptors to become ready and place them on ready_list.
+ *
+ * Called with ctx->list_lock incremented but not locked.
+ *
+ * Returns: number of ready file descriptors.
+ */
+ int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout);
+
+ /*
+ * need_wait:
+ * @ctx: the AioContext
+ *
+ * Tell aio_poll() when to stop userspace polling early because ->wait()
+ * has fds ready.
+ *
+ * File descriptor monitoring implementations that cannot poll fd readiness
+ * from userspace should use aio_poll_disabled() here. This ensures that
+ * file descriptors are not starved by handlers that frequently make
+ * progress via userspace polling.
+ *
+ * Returns: true if ->wait() should be called, false otherwise.
+ */
+ bool (*need_wait)(AioContext *ctx);
+} FDMonOps;
+
/*
* Each aio_bh_poll() call carves off a slice of the BH list, so that newly
* scheduled BHs are not processed until the next aio_bh_poll() call. All
@@ -65,6 +118,8 @@ struct BHListSlice {
QSIMPLEQ_ENTRY(BHListSlice) next;
};
+typedef QSLIST_HEAD(, AioHandler) AioHandlerSList;
+
struct AioContext {
GSource source;
@@ -150,6 +205,10 @@ struct AioContext {
* locking.
*/
struct LuringState *linux_io_uring;
+
+ /* State for file descriptor monitoring using Linux io_uring */
+ struct io_uring fdmon_io_uring;
+ AioHandlerSList submit_list;
#endif
/* TimerLists for calling timers - one per clock type. Has its own
@@ -168,13 +227,21 @@ struct AioContext {
int64_t poll_grow; /* polling time growth factor */
int64_t poll_shrink; /* polling time shrink factor */
+ /*
+ * List of handlers participating in userspace polling. Protected by
+ * ctx->list_lock. Iterated and modified mostly by the event loop thread
+ * from aio_poll() with ctx->list_lock incremented. aio_set_fd_handler()
+ * only touches the list to delete nodes if ctx->list_lock's count is zero.
+ */
+ AioHandlerList poll_aio_handlers;
+
/* Are we in polling mode or monitoring file descriptors? */
bool poll_started;
/* epoll(7) state used when built with CONFIG_EPOLL */
int epollfd;
- bool epoll_enabled;
- bool epoll_available;
+
+ const FDMonOps *fdmon_ops;
};
/**
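As an aside on the FDMonOps interface documented above, the following is a minimal sketch of what a file descriptor monitoring backend could look like. The fdmon_dummy_* names are invented for illustration; only FDMonOps, aio_poll_disabled() and aio_add_ready_handler() come from this series (the real backends are util/fdmon-poll.c, util/fdmon-epoll.c and util/fdmon-io_uring.c):

    /*
     * Hypothetical fdmon backend sketch (not part of this patch).
     * A real backend lives under util/ and is selected by assigning
     * ctx->fdmon_ops.
     */
    #include "qemu/osdep.h"
    #include "block/aio.h"
    #include "aio-posix.h"   /* AioHandler, aio_add_ready_handler() */

    static void fdmon_dummy_update(AioContext *ctx,
                                   AioHandler *old_node,
                                   AioHandler *new_node)
    {
        /* Register, modify or unregister the fd with the OS here. */
    }

    static int fdmon_dummy_wait(AioContext *ctx, AioHandlerList *ready_list,
                                int64_t timeout)
    {
        /*
         * Block for up to @timeout nanoseconds, then move each ready
         * handler onto @ready_list with
         * aio_add_ready_handler(ready_list, node, revents).
         */
        return 0;   /* number of ready file descriptors */
    }

    static const FDMonOps fdmon_dummy_ops = {
        .update    = fdmon_dummy_update,
        .wait      = fdmon_dummy_wait,
        /* No userspace polling support, as recommended above */
        .need_wait = aio_poll_disabled,
    };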
diff --git a/include/block/block-copy.h b/include/block/block-copy.h
index 0a161724d7..aac85e1488 100644
--- a/include/block/block-copy.h
+++ b/include/block/block-copy.h
@@ -18,79 +18,30 @@
#include "block/block.h"
#include "qemu/co-shared-resource.h"
-typedef struct BlockCopyInFlightReq {
- int64_t start_byte;
- int64_t end_byte;
- QLIST_ENTRY(BlockCopyInFlightReq) list;
- CoQueue wait_queue; /* coroutines blocked on this request */
-} BlockCopyInFlightReq;
-
typedef void (*ProgressBytesCallbackFunc)(int64_t bytes, void *opaque);
-typedef void (*ProgressResetCallbackFunc)(void *opaque);
-typedef struct BlockCopyState {
- /*
- * BdrvChild objects are not owned or managed by block-copy. They are
- * provided by block-copy user and user is responsible for appropriate
- * permissions on these children.
- */
- BdrvChild *source;
- BdrvChild *target;
- BdrvDirtyBitmap *copy_bitmap;
- int64_t cluster_size;
- bool use_copy_range;
- int64_t copy_size;
- uint64_t len;
- QLIST_HEAD(, BlockCopyInFlightReq) inflight_reqs;
-
- BdrvRequestFlags write_flags;
-
- /*
- * skip_unallocated:
- *
- * Used by sync=top jobs, which first scan the source node for unallocated
- * areas and clear them in the copy_bitmap. During this process, the bitmap
- * is thus not fully initialized: It may still have bits set for areas that
- * are unallocated and should actually not be copied.
- *
- * This is indicated by skip_unallocated.
- *
- * In this case, block_copy() will query the source’s allocation status,
- * skip unallocated regions, clear them in the copy_bitmap, and invoke
- * block_copy_reset_unallocated() every time it does.
- */
- bool skip_unallocated;
-
- /* progress_bytes_callback: called when some copying progress is done. */
- ProgressBytesCallbackFunc progress_bytes_callback;
-
- /*
- * progress_reset_callback: called when some bytes reset from copy_bitmap
- * (see @skip_unallocated above). The callee is assumed to recalculate how
- * many bytes remain based on the dirty bit count of copy_bitmap.
- */
- ProgressResetCallbackFunc progress_reset_callback;
- void *progress_opaque;
-
- SharedResource *mem;
-} BlockCopyState;
+typedef struct BlockCopyState BlockCopyState;
BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
int64_t cluster_size,
BdrvRequestFlags write_flags,
Error **errp);
-void block_copy_set_callbacks(
+void block_copy_set_progress_callback(
BlockCopyState *s,
ProgressBytesCallbackFunc progress_bytes_callback,
- ProgressResetCallbackFunc progress_reset_callback,
void *progress_opaque);
+void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm);
+
void block_copy_state_free(BlockCopyState *s);
int64_t block_copy_reset_unallocated(BlockCopyState *s,
int64_t offset, int64_t *count);
-int coroutine_fn block_copy(BlockCopyState *s, int64_t start, uint64_t bytes,
+int coroutine_fn block_copy(BlockCopyState *s, int64_t offset, int64_t bytes,
bool *error_is_read);
+BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s);
+void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip);
+
#endif /* BLOCK_COPY_H */
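To show how the now-opaque BlockCopyState is meant to be driven, here is a usage sketch. The wrapper name copy_first_cluster(), the 64 KiB cluster size and the surrounding BdrvChild/ProgressMeter setup are assumptions for illustration; the function prototypes match the header above:

    /* Sketch only: BdrvChild setup and error propagation are elided. */
    #include "qemu/osdep.h"
    #include "qemu/progress_meter.h"
    #include "block/block-copy.h"

    static int coroutine_fn copy_first_cluster(BdrvChild *source,
                                               BdrvChild *target,
                                               ProgressMeter *pm,
                                               Error **errp)
    {
        const int64_t cluster_size = 65536;   /* assumed for illustration */
        BlockCopyState *bcs;
        bool error_is_read;
        int ret;

        bcs = block_copy_state_new(source, target, cluster_size,
                                   0 /* BdrvRequestFlags */, errp);
        if (!bcs) {
            return -EINVAL;
        }

        /* Progress is now reported through a caller-owned ProgressMeter */
        block_copy_set_progress_meter(bcs, pm);

        /* offset and bytes are both int64_t in the new prototype */
        ret = block_copy(bcs, 0, cluster_size, &error_is_read);

        block_copy_state_free(bcs);
        return ret;
    }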
diff --git a/include/crypto/block.h b/include/crypto/block.h
index d49d2c2da9..c77ccaf9c0 100644
--- a/include/crypto/block.h
+++ b/include/crypto/block.h
@@ -146,6 +146,26 @@ QCryptoBlock *qcrypto_block_create(QCryptoBlockCreateOptions *options,
/**
+ * qcrypto_block_calculate_payload_offset:
+ * @create_opts: the encryption options
+ * @optprefix: name prefix for options
+ * @len: output for number of header bytes before payload
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Calculate the number of header bytes before the payload in an encrypted
+ * storage volume. The header is an area before the payload that is reserved
+ * for encryption metadata.
+ *
+ * Returns: true on success, false on error
+ */
+bool
+qcrypto_block_calculate_payload_offset(QCryptoBlockCreateOptions *create_opts,
+ const char *optprefix,
+ size_t *len,
+ Error **errp);
+
+
+/**
* qcrypto_block_get_info:
* @block: the block encryption object
* @errp: pointer to a NULL-initialized error object
@@ -269,5 +289,7 @@ uint64_t qcrypto_block_get_sector_size(QCryptoBlock *block);
void qcrypto_block_free(QCryptoBlock *block);
G_DEFINE_AUTOPTR_CLEANUP_FUNC(QCryptoBlock, qcrypto_block_free)
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(QCryptoBlockCreateOptions,
+ qapi_free_QCryptoBlockCreateOptions)
#endif /* QCRYPTO_BLOCK_H */
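A brief usage sketch of the new qcrypto_block_calculate_payload_offset() helper follows; the wrapper name luks_payload_offset() is hypothetical, and the NULL option prefix assumes the create options are not nested under a prefix:

    #include "qemu/osdep.h"
    #include "crypto/block.h"

    /* Hypothetical wrapper; @create_opts is obtained by the caller. */
    static int64_t luks_payload_offset(QCryptoBlockCreateOptions *create_opts,
                                       Error **errp)
    {
        size_t header_len;

        if (!qcrypto_block_calculate_payload_offset(create_opts, NULL,
                                                    &header_len, errp)) {
            return -1;
        }

        /* Bytes reserved for encryption metadata before the payload */
        return header_len;
    }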
diff --git a/include/qemu/job.h b/include/qemu/job.h
index bd59cd8944..32aabb1c60 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -28,6 +28,7 @@
#include "qapi/qapi-types-job.h"
#include "qemu/queue.h"
+#include "qemu/progress_meter.h"
#include "qemu/coroutine.h"
#include "block/aio.h"
@@ -117,15 +118,7 @@ typedef struct Job {
/** True if this job should automatically dismiss itself */
bool auto_dismiss;
- /**
- * Current progress. The unit is arbitrary as long as the ratio between
- * progress_current and progress_total represents the estimated percentage
- * of work already done.
- */
- int64_t progress_current;
-
- /** Estimated progress_current value at the completion of the job */
- int64_t progress_total;
+ ProgressMeter progress;
/**
* Return code from @run and/or @prepare callback(s).
diff --git a/include/qemu/progress_meter.h b/include/qemu/progress_meter.h
new file mode 100644
index 0000000000..9a23ff071c
--- /dev/null
+++ b/include/qemu/progress_meter.h
@@ -0,0 +1,58 @@
+/*
+ * Helper functionality for tracking the progress of some process.
+ *
+ * Copyright (c) 2011 IBM Corp.
+ * Copyright (c) 2012, 2018 Red Hat, Inc.
+ * Copyright (c) 2020 Virtuozzo International GmbH
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef QEMU_PROGRESS_METER_H
+#define QEMU_PROGRESS_METER_H
+
+typedef struct ProgressMeter {
+ /**
+ * Current progress. The unit is arbitrary as long as the ratio between
+ * current and total represents the estimated percentage
+ * of work already done.
+ */
+ uint64_t current;
+
+ /** Estimated current value at the completion of the process */
+ uint64_t total;
+} ProgressMeter;
+
+static inline void progress_work_done(ProgressMeter *pm, uint64_t done)
+{
+ pm->current += done;
+}
+
+static inline void progress_set_remaining(ProgressMeter *pm, uint64_t remaining)
+{
+ pm->total = pm->current + remaining;
+}
+
+static inline void progress_increase_remaining(ProgressMeter *pm,
+ uint64_t delta)
+{
+ pm->total += delta;
+}
+
+#endif /* QEMU_PROGRESS_METER_H */
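Since the ProgressMeter helpers above are self-contained, a tiny worked example may help; the numbers are arbitrary and the function is illustrative only:

    #include "qemu/osdep.h"
    #include "qemu/progress_meter.h"

    static void progress_meter_example(void)
    {
        ProgressMeter pm = { .current = 0, .total = 0 };

        progress_set_remaining(&pm, 1000);     /* total = current + 1000 */
        progress_work_done(&pm, 250);          /* current = 250 */
        progress_increase_remaining(&pm, 50);  /* total = 1050 */

        /* current/total now estimates the fraction of work already done */
    }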
diff --git a/include/qemu/queue.h b/include/qemu/queue.h
index 294db54eb1..456a5b01ee 100644
--- a/include/qemu/queue.h
+++ b/include/qemu/queue.h
@@ -142,6 +142,8 @@ struct { \
(elm)->field.le_next->field.le_prev = \
(elm)->field.le_prev; \
*(elm)->field.le_prev = (elm)->field.le_next; \
+ (elm)->field.le_next = NULL; \
+ (elm)->field.le_prev = NULL; \
} while (/*CONSTCOND*/0)
/*
@@ -225,12 +227,15 @@ struct { \
} while (/*CONSTCOND*/0)
#define QSLIST_REMOVE_HEAD(head, field) do { \
- (head)->slh_first = (head)->slh_first->field.sle_next; \
+ typeof((head)->slh_first) elm = (head)->slh_first; \
+ (head)->slh_first = elm->field.sle_next; \
+ elm->field.sle_next = NULL; \
} while (/*CONSTCOND*/0)
#define QSLIST_REMOVE_AFTER(slistelm, field) do { \
- (slistelm)->field.sle_next = \
- QSLIST_NEXT(QSLIST_NEXT((slistelm), field), field); \
+ typeof(slistelm) next = (slistelm)->field.sle_next; \
+ (slistelm)->field.sle_next = next->field.sle_next; \
+ next->field.sle_next = NULL; \
} while (/*CONSTCOND*/0)
#define QSLIST_REMOVE(head, elm, type, field) do { \
@@ -241,6 +246,7 @@ struct { \
while (curelm->field.sle_next != (elm)) \
curelm = curelm->field.sle_next; \
curelm->field.sle_next = curelm->field.sle_next->field.sle_next; \
+ (elm)->field.sle_next = NULL; \
} \
} while (/*CONSTCOND*/0)
@@ -304,8 +310,10 @@ struct { \
} while (/*CONSTCOND*/0)
#define QSIMPLEQ_REMOVE_HEAD(head, field) do { \
- if (((head)->sqh_first = (head)->sqh_first->field.sqe_next) == NULL)\
+ typeof((head)->sqh_first) elm = (head)->sqh_first; \
+ if (((head)->sqh_first = elm->field.sqe_next) == NULL) \
(head)->sqh_last = &(head)->sqh_first; \
+ elm->field.sqe_next = NULL; \
} while (/*CONSTCOND*/0)
#define QSIMPLEQ_SPLIT_AFTER(head, elm, field, removed) do { \
@@ -329,6 +337,7 @@ struct { \
if ((curelm->field.sqe_next = \
curelm->field.sqe_next->field.sqe_next) == NULL) \
(head)->sqh_last = &(curelm)->field.sqe_next; \
+ (elm)->field.sqe_next = NULL; \
} \
} while (/*CONSTCOND*/0)
@@ -446,6 +455,8 @@ union { \
(head)->tqh_circ.tql_prev = (elm)->field.tqe_circ.tql_prev; \
(elm)->field.tqe_circ.tql_prev->tql_next = (elm)->field.tqe_next; \
(elm)->field.tqe_circ.tql_prev = NULL; \
+ (elm)->field.tqe_circ.tql_next = NULL; \
+ (elm)->field.tqe_next = NULL; \
} while (/*CONSTCOND*/0)
/* remove @left, @right and all elements in between from @head */
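The effect of the queue.h hunks above is that the plain removal macros now clear the removed element's link pointers, so an element can afterwards be recognised as "not inserted". A small sketch, relying only on the pre-existing QLIST_IS_INSERTED() and QLIST_SAFE_REMOVE() helpers:

    #include "qemu/osdep.h"
    #include "qemu/queue.h"

    typedef struct Item {
        int value;
        QLIST_ENTRY(Item) link;
    } Item;

    static QLIST_HEAD(, Item) items = QLIST_HEAD_INITIALIZER(items);

    static void queue_remove_example(void)
    {
        Item a = { .value = 1 };

        QLIST_INSERT_HEAD(&items, &a, link);
        QLIST_REMOVE(&a, link);

        /*
         * QLIST_REMOVE() now clears a.link.le_prev/le_next, so the element
         * no longer looks inserted and a later safe removal is a no-op.
         */
        assert(!QLIST_IS_INSERTED(&a, link));
        QLIST_SAFE_REMOVE(&a, link);
    }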
diff --git a/job-qmp.c b/job-qmp.c
index fbfed25a00..fecc939ebd 100644
--- a/job-qmp.c
+++ b/job-qmp.c
@@ -143,8 +143,8 @@ static JobInfo *job_query_single(Job *job, Error **errp)
.id = g_strdup(job->id),
.type = job_type(job),
.status = job->status,
- .current_progress = job->progress_current,
- .total_progress = job->progress_total,
+ .current_progress = job->progress.current,
+ .total_progress = job->progress.total,
.has_error = !!job->err,
.error = job->err ? \
g_strdup(error_get_pretty(job->err)) : NULL,
diff --git a/job.c b/job.c
index 04409b40aa..134a07b92e 100644
--- a/job.c
+++ b/job.c
@@ -369,17 +369,17 @@ void job_unref(Job *job)
void job_progress_update(Job *job, uint64_t done)
{
- job->progress_current += done;
+ progress_work_done(&job->progress, done);
}
void job_progress_set_remaining(Job *job, uint64_t remaining)
{
- job->progress_total = job->progress_current + remaining;
+ progress_set_remaining(&job->progress, remaining);
}
void job_progress_increase_remaining(Job *job, uint64_t delta)
{
- job->progress_total += delta;
+ progress_increase_remaining(&job->progress, delta);
}
void job_event_cancelled(Job *job)
diff --git a/qemu-img.c b/qemu-img.c
index 804630a368..afddf33f08 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -817,6 +817,8 @@ static int img_check(int argc, char **argv)
check->corruptions_fixed);
}
+ qapi_free_ImageCheck(check);
+ check = g_new0(ImageCheck, 1);
ret = collect_image_check(bs, check, filename, fmt, 0);
check->leaks_fixed = leaks_fixed;
@@ -882,9 +884,9 @@ static void run_block_job(BlockJob *job, Error **errp)
do {
float progress = 0.0f;
aio_poll(aio_context, true);
- if (job->job.progress_total) {
- progress = (float)job->job.progress_current /
- job->job.progress_total * 100.f;
+ if (job->job.progress.total) {
+ progress = (float)job->job.progress.current /
+ job->job.progress.total * 100.f;
}
qemu_progress_print(progress, 0);
} while (!job_is_ready(&job->job) && !job_is_completed(&job->job));
@@ -4932,10 +4934,8 @@ static int img_measure(int argc, char **argv)
filename = argv[optind];
}
- if (!filename &&
- (object_opts || image_opts || fmt || snapshot_name || sn_opts)) {
- error_report("--object, --image-opts, -f, and -l "
- "require a filename argument.");
+ if (!filename && (image_opts || fmt || snapshot_name || sn_opts)) {
+ error_report("--image-opts, -f, and -l require a filename argument.");
goto out;
}
if (filename && img_size != UINT64_MAX) {
diff --git a/qemu-options.hx b/qemu-options.hx
index f9fefd43be..1d8f852d89 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -1,10 +1,10 @@
-HXCOMM Use DEFHEADING() to define headings in both help text and texi
-HXCOMM Text between STEXI and ETEXI are copied to texi version and
-HXCOMM discarded from C version
+HXCOMM Use DEFHEADING() to define headings in both help text and rST.
+HXCOMM Text between SRST and ERST is copied to the rST version and
+HXCOMM discarded from C version.
HXCOMM DEF(option, HAS_ARG/0, opt_enum, opt_help, arch_mask) is used to
HXCOMM construct option structures, enums and help message for specified
HXCOMM architectures.
-HXCOMM HXCOMM can be used for comments, discarded from both texi and C
+HXCOMM HXCOMM can be used for comments, discarded from both rST and C.
DEFHEADING(Standard options:)
diff --git a/scripts/hxtool b/scripts/hxtool
index 0003e7b673..7b1452f3cf 100644
--- a/scripts/hxtool
+++ b/scripts/hxtool
@@ -7,7 +7,7 @@ hxtoh()
case $str in
HXCOMM*)
;;
- STEXI*|ETEXI*|SRST*|ERST*) flag=$(($flag^1))
+ SRST*|ERST*) flag=$(($flag^1))
;;
*)
test $flag -eq 1 && printf "%s\n" "$str"
@@ -16,84 +16,8 @@ hxtoh()
done
}
-print_texi_heading()
-{
- if test "$*" != ""; then
- title="$*"
- printf "@subsection %s\n" "${title%:}"
- fi
-}
-
-hxtotexi()
-{
- flag=0
- rstflag=0
- line=1
- while read -r str; do
- case "$str" in
- HXCOMM*)
- ;;
- STEXI*)
- if test $rstflag -eq 1 ; then
- printf "line %d: syntax error: expected ERST, found '%s'\n" "$line" "$str" >&2
- exit 1
- fi
- if test $flag -eq 1 ; then
- printf "line %d: syntax error: expected ETEXI, found '%s'\n" "$line" "$str" >&2
- exit 1
- fi
- flag=1
- ;;
- ETEXI*)
- if test $rstflag -eq 1 ; then
- printf "line %d: syntax error: expected ERST, found '%s'\n" "$line" "$str" >&2
- exit 1
- fi
- if test $flag -ne 1 ; then
- printf "line %d: syntax error: expected STEXI, found '%s'\n" "$line" "$str" >&2
- exit 1
- fi
- flag=0
- ;;
- SRST*)
- if test $rstflag -eq 1 ; then
- printf "line %d: syntax error: expected ERST, found '%s'\n" "$line" "$str" >&2
- exit 1
- fi
- if test $flag -eq 1 ; then
- printf "line %d: syntax error: expected ETEXI, found '%s'\n" "$line" "$str" >&2
- exit 1
- fi
- rstflag=1
- ;;
- ERST*)
- if test $flag -eq 1 ; then
- printf "line %d: syntax error: expected ETEXI, found '%s'\n" "$line" "$str" >&2
- exit 1
- fi
- if test $rstflag -ne 1 ; then
- printf "line %d: syntax error: expected SRST, found '%s'\n" "$line" "$str" >&2
- exit 1
- fi
- rstflag=0
- ;;
- DEFHEADING*)
- print_texi_heading "$(expr "$str" : "DEFHEADING(\(.*\))")"
- ;;
- ARCHHEADING*)
- print_texi_heading "$(expr "$str" : "ARCHHEADING(\(.*\),.*)")"
- ;;
- *)
- test $flag -eq 1 && printf '%s\n' "$str"
- ;;
- esac
- line=$((line+1))
- done
-}
-
case "$1" in
"-h") hxtoh ;;
-"-t") hxtotexi ;;
*) exit 1 ;;
esac
diff --git a/tests/qemu-iotests/178 b/tests/qemu-iotests/178
index 51a70fe669..7cf0e27154 100755
--- a/tests/qemu-iotests/178
+++ b/tests/qemu-iotests/178
@@ -50,7 +50,7 @@ _make_test_img 1G
$QEMU_IMG measure # missing arguments
$QEMU_IMG measure --size 2G "$TEST_IMG" # only one allowed
$QEMU_IMG measure "$TEST_IMG" a # only one filename allowed
-$QEMU_IMG measure --object secret,id=sec0,data=MTIzNDU2,format=base64 # missing filename
+$QEMU_IMG measure --object secret,id=sec0,data=MTIzNDU2,format=base64 # size or filename needed
$QEMU_IMG measure --image-opts # missing filename
$QEMU_IMG measure -f qcow2 # missing filename
$QEMU_IMG measure -l snap1 # missing filename
diff --git a/tests/qemu-iotests/178.out.qcow2 b/tests/qemu-iotests/178.out.qcow2
index 9e7d8c44df..f59bf4b2fb 100644
--- a/tests/qemu-iotests/178.out.qcow2
+++ b/tests/qemu-iotests/178.out.qcow2
@@ -5,10 +5,10 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
qemu-img: Either --size N or one filename must be specified.
qemu-img: --size N cannot be used together with a filename.
qemu-img: At most one filename argument is allowed.
-qemu-img: --object, --image-opts, -f, and -l require a filename argument.
-qemu-img: --object, --image-opts, -f, and -l require a filename argument.
-qemu-img: --object, --image-opts, -f, and -l require a filename argument.
-qemu-img: --object, --image-opts, -f, and -l require a filename argument.
+qemu-img: Either --size N or one filename must be specified.
+qemu-img: --image-opts, -f, and -l require a filename argument.
+qemu-img: --image-opts, -f, and -l require a filename argument.
+qemu-img: --image-opts, -f, and -l require a filename argument.
qemu-img: Invalid option list: ,
qemu-img: Invalid parameter 'snapshot.foo'
qemu-img: Failed in parsing snapshot param 'snapshot.foo'
diff --git a/tests/qemu-iotests/178.out.raw b/tests/qemu-iotests/178.out.raw
index 6478365905..404ca908d8 100644
--- a/tests/qemu-iotests/178.out.raw
+++ b/tests/qemu-iotests/178.out.raw
@@ -5,10 +5,10 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
qemu-img: Either --size N or one filename must be specified.
qemu-img: --size N cannot be used together with a filename.
qemu-img: At most one filename argument is allowed.
-qemu-img: --object, --image-opts, -f, and -l require a filename argument.
-qemu-img: --object, --image-opts, -f, and -l require a filename argument.
-qemu-img: --object, --image-opts, -f, and -l require a filename argument.
-qemu-img: --object, --image-opts, -f, and -l require a filename argument.
+qemu-img: Either --size N or one filename must be specified.
+qemu-img: --image-opts, -f, and -l require a filename argument.
+qemu-img: --image-opts, -f, and -l require a filename argument.
+qemu-img: --image-opts, -f, and -l require a filename argument.
qemu-img: Invalid option list: ,
qemu-img: Invalid parameter 'snapshot.foo'
qemu-img: Failed in parsing snapshot param 'snapshot.foo'
diff --git a/tests/qemu-iotests/288 b/tests/qemu-iotests/288
new file mode 100755
index 0000000000..6c62065aef
--- /dev/null
+++ b/tests/qemu-iotests/288
@@ -0,0 +1,93 @@
+#!/usr/bin/env bash
+#
+# qemu-img measure tests for LUKS images
+#
+# Copyright (C) 2020 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=stefanha@redhat.com
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+status=1 # failure is the default!
+
+_cleanup()
+{
+ _cleanup_test_img
+ rm -f "$TEST_IMG.converted"
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+. ./common.pattern
+
+_supported_fmt luks
+_supported_proto file
+_supported_os Linux
+
+SECRET=secret,id=sec0,data=passphrase
+
+echo "== measure 1G image file =="
+echo
+
+$QEMU_IMG measure --object "$SECRET" \
+ -O "$IMGFMT" \
+ -o key-secret=sec0,iter-time=10 \
+ --size 1G
+
+echo
+echo "== create 1G image file (size should be no greater than measured) =="
+echo
+
+_make_test_img 1G
+stat -c "image file size in bytes: %s" "$TEST_IMG_FILE"
+
+echo
+echo "== modified 1G image file (size should be no greater than measured) =="
+echo
+
+$QEMU_IO --object "$SECRET" --image-opts "$TEST_IMG" -c "write -P 0x51 0x10000 0x400" | _filter_qemu_io | _filter_testdir
+stat -c "image file size in bytes: %s" "$TEST_IMG_FILE"
+
+echo
+echo "== measure preallocation=falloc 1G image file =="
+echo
+
+$QEMU_IMG measure --object "$SECRET" \
+ -O "$IMGFMT" \
+ -o key-secret=sec0,iter-time=10,preallocation=falloc \
+ --size 1G
+
+echo
+echo "== measure with input image file =="
+echo
+
+IMGFMT=raw IMGKEYSECRET= IMGOPTS= _make_test_img 1G | _filter_imgfmt
+QEMU_IO_OPTIONS= IMGOPTSSYNTAX= $QEMU_IO -f raw -c "write -P 0x51 0x10000 0x400" "$TEST_IMG_FILE" | _filter_qemu_io | _filter_testdir
+$QEMU_IMG measure --object "$SECRET" \
+ -O "$IMGFMT" \
+ -o key-secret=sec0,iter-time=10 \
+ -f raw \
+ "$TEST_IMG_FILE"
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/288.out b/tests/qemu-iotests/288.out
new file mode 100644
index 0000000000..4bc593dc48
--- /dev/null
+++ b/tests/qemu-iotests/288.out
@@ -0,0 +1,30 @@
+QA output created by 288
+== measure 1G image file ==
+
+required size: 1075810304
+fully allocated size: 1075810304
+
+== create 1G image file (size should be no greater than measured) ==
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
+image file size in bytes: 1075810304
+
+== modified 1G image file (size should be no greater than measured) ==
+
+wrote 1024/1024 bytes at offset 65536
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+image file size in bytes: 1075810304
+
+== measure preallocation=falloc 1G image file ==
+
+required size: 1075810304
+fully allocated size: 1075810304
+
+== measure with input image file ==
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
+wrote 1024/1024 bytes at offset 65536
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+required size: 1075810304
+fully allocated size: 1075810304
+*** done
diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
index 8a6366c09d..4c246c0450 100644
--- a/tests/qemu-iotests/common.rc
+++ b/tests/qemu-iotests/common.rc
@@ -56,18 +56,30 @@ poke_file()
# peek_file_le 'test.img' 512 2 => 65534
peek_file_le()
{
- # Wrap in echo $() to strip spaces
- echo $(od -j"$2" -N"$3" --endian=little -An -vtu"$3" "$1")
+ local val=0 shift=0 byte
+
+ # coreutils' od --endian is not portable, so manually assemble bytes.
+ for byte in $(od -j"$2" -N"$3" -An -v -tu1 "$1"); do
+ val=$(( val | (byte << shift) ))
+ shift=$((shift + 8))
+ done
+ printf %llu $val
}
# peek_file_be 'test.img' 512 2 => 65279
peek_file_be()
{
- # Wrap in echo $() to strip spaces
- echo $(od -j"$2" -N"$3" --endian=big -An -vtu"$3" "$1")
+ local val=0 byte
+
+ # coreutils' od --endian is not portable, so manually assemble bytes.
+ for byte in $(od -j"$2" -N"$3" -An -v -tu1 "$1"); do
+ val=$(( (val << 8) | byte ))
+ done
+ printf %llu $val
}
-# peek_file_raw 'test.img' 512 2 => '\xff\xfe'
+# peek_file_raw 'test.img' 512 2 => '\xff\xfe'. Do not use if the raw data
+# is likely to contain \0 or trailing \n.
peek_file_raw()
{
dd if="$1" bs=1 skip="$2" count="$3" status=none
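For readers more comfortable in C, the byte-assembly performed by the reworked peek_file_le/peek_file_be above corresponds to the following sketch (illustration only; the iotests themselves remain shell):

    #include <stddef.h>
    #include <stdint.h>

    static uint64_t assemble_le(const uint8_t *bytes, size_t n)
    {
        uint64_t val = 0;

        for (size_t i = 0; i < n; i++) {
            val |= (uint64_t)bytes[i] << (8 * i);   /* little-endian */
        }
        return val;
    }

    static uint64_t assemble_be(const uint8_t *bytes, size_t n)
    {
        uint64_t val = 0;

        for (size_t i = 0; i < n; i++) {
            val = (val << 8) | bytes[i];            /* big-endian */
        }
        return val;
    }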
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index 3c1329b081..ec2b2302e5 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -294,3 +294,4 @@
283 auto quick
284 rw
286 rw quick
+288 quick
diff --git a/tests/qtest/Makefile.include b/tests/qtest/Makefile.include
index 76672990a7..10a28de8a3 100644
--- a/tests/qtest/Makefile.include
+++ b/tests/qtest/Makefile.include
@@ -18,7 +18,8 @@ check-qtest-pci-$(CONFIG_IVSHMEM_DEVICE) += ivshmem-test
DBUS_DAEMON := $(shell which dbus-daemon 2>/dev/null)
ifneq ($(GDBUS_CODEGEN),)
ifneq ($(DBUS_DAEMON),)
-check-qtest-pci-$(CONFIG_GIO) += dbus-vmstate-test
+# Temporarily disabled due to Patchew failures:
+#check-qtest-pci-$(CONFIG_GIO) += dbus-vmstate-test
endif
endif
diff --git a/util/Makefile.objs b/util/Makefile.objs
index 6b38b67cf1..6718a38b61 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -5,6 +5,9 @@ util-obj-y += aiocb.o async.o aio-wait.o thread-pool.o qemu-timer.o
util-obj-y += main-loop.o
util-obj-$(call lnot,$(CONFIG_ATOMIC64)) += atomic64.o
util-obj-$(CONFIG_POSIX) += aio-posix.o
+util-obj-$(CONFIG_POSIX) += fdmon-poll.o
+util-obj-$(CONFIG_EPOLL_CREATE1) += fdmon-epoll.o
+util-obj-$(CONFIG_LINUX_IO_URING) += fdmon-io_uring.o
util-obj-$(CONFIG_POSIX) += compatfd.o
util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
util-obj-$(CONFIG_POSIX) += mmap-alloc.o
diff --git a/util/aio-posix.c b/util/aio-posix.c
index 9e1befc0c0..cd6cf0a4a9 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -20,191 +20,25 @@
#include "qemu/sockets.h"
#include "qemu/cutils.h"
#include "trace.h"
-#ifdef CONFIG_EPOLL_CREATE1
-#include <sys/epoll.h>
-#endif
+#include "aio-posix.h"
-struct AioHandler
-{
- GPollFD pfd;
- IOHandler *io_read;
- IOHandler *io_write;
- AioPollFn *io_poll;
- IOHandler *io_poll_begin;
- IOHandler *io_poll_end;
- void *opaque;
- bool is_external;
- QLIST_ENTRY(AioHandler) node;
- QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
- QLIST_ENTRY(AioHandler) node_deleted;
-};
-
-/* Add a handler to a ready list */
-static void add_ready_handler(AioHandlerList *ready_list,
- AioHandler *node,
- int revents)
-{
- QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
- node->pfd.revents = revents;
- QLIST_INSERT_HEAD(ready_list, node, node_ready);
-}
-
-#ifdef CONFIG_EPOLL_CREATE1
-
-/* The fd number threshold to switch to epoll */
-#define EPOLL_ENABLE_THRESHOLD 64
-
-static void aio_epoll_disable(AioContext *ctx)
-{
- ctx->epoll_enabled = false;
- if (!ctx->epoll_available) {
- return;
- }
- ctx->epoll_available = false;
- close(ctx->epollfd);
-}
-
-static inline int epoll_events_from_pfd(int pfd_events)
-{
- return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
- (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
- (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
- (pfd_events & G_IO_ERR ? EPOLLERR : 0);
-}
-
-static bool aio_epoll_try_enable(AioContext *ctx)
-{
- AioHandler *node;
- struct epoll_event event;
-
- QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
- int r;
- if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) {
- continue;
- }
- event.events = epoll_events_from_pfd(node->pfd.events);
- event.data.ptr = node;
- r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
- if (r) {
- return false;
- }
- }
- ctx->epoll_enabled = true;
- return true;
-}
-
-static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
-{
- struct epoll_event event;
- int r;
- int ctl;
-
- if (!ctx->epoll_enabled) {
- return;
- }
- if (!node->pfd.events) {
- ctl = EPOLL_CTL_DEL;
- } else {
- event.data.ptr = node;
- event.events = epoll_events_from_pfd(node->pfd.events);
- ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
- }
-
- r = epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event);
- if (r) {
- aio_epoll_disable(ctx);
- }
-}
-
-static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list,
- int64_t timeout)
-{
- GPollFD pfd = {
- .fd = ctx->epollfd,
- .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR,
- };
- AioHandler *node;
- int i, ret = 0;
- struct epoll_event events[128];
-
- if (timeout > 0) {
- ret = qemu_poll_ns(&pfd, 1, timeout);
- if (ret > 0) {
- timeout = 0;
- }
- }
- if (timeout <= 0 || ret > 0) {
- ret = epoll_wait(ctx->epollfd, events,
- ARRAY_SIZE(events),
- timeout);
- if (ret <= 0) {
- goto out;
- }
- for (i = 0; i < ret; i++) {
- int ev = events[i].events;
- int revents = (ev & EPOLLIN ? G_IO_IN : 0) |
- (ev & EPOLLOUT ? G_IO_OUT : 0) |
- (ev & EPOLLHUP ? G_IO_HUP : 0) |
- (ev & EPOLLERR ? G_IO_ERR : 0);
-
- node = events[i].data.ptr;
- add_ready_handler(ready_list, node, revents);
- }
- }
-out:
- return ret;
-}
+/* Stop userspace polling on a handler if it isn't active for some time */
+#define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND)
-static bool aio_epoll_enabled(AioContext *ctx)
+bool aio_poll_disabled(AioContext *ctx)
{
- /* Fall back to ppoll when external clients are disabled. */
- return !aio_external_disabled(ctx) && ctx->epoll_enabled;
+ return atomic_read(&ctx->poll_disable_cnt);
}
-static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
- unsigned npfd, int64_t timeout)
+void aio_add_ready_handler(AioHandlerList *ready_list,
+ AioHandler *node,
+ int revents)
{
- if (!ctx->epoll_available) {
- return false;
- }
- if (aio_epoll_enabled(ctx)) {
- return true;
- }
- if (npfd >= EPOLL_ENABLE_THRESHOLD) {
- if (aio_epoll_try_enable(ctx)) {
- return true;
- } else {
- aio_epoll_disable(ctx);
- }
- }
- return false;
-}
-
-#else
-
-static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
-{
-}
-
-static int aio_epoll(AioContext *ctx, AioHandlerList *ready_list,
- int64_t timeout)
-{
- assert(false);
-}
-
-static bool aio_epoll_enabled(AioContext *ctx)
-{
- return false;
-}
-
-static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
- unsigned npfd, int64_t timeout)
-{
- return false;
+ QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
+ node->pfd.revents = revents;
+ QLIST_INSERT_HEAD(ready_list, node, node_ready);
}
-#endif
-
static AioHandler *find_aio_handler(AioContext *ctx, int fd)
{
AioHandler *node;
@@ -231,16 +65,23 @@ static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
g_source_remove_poll(&ctx->source, &node->pfd);
}
+ node->pfd.revents = 0;
+
+ /* If the fd monitor has already marked it deleted, leave it alone */
+ if (QLIST_IS_INSERTED(node, node_deleted)) {
+ return false;
+ }
+
/* If a read is in progress, just mark the node as deleted */
if (qemu_lockcnt_count(&ctx->list_lock)) {
QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
- node->pfd.revents = 0;
return false;
}
/* Otherwise, delete it for real. We can't just mark it as
* deleted because deleted nodes are only cleaned up while
* no one is walking the handlers list.
*/
+ QLIST_SAFE_REMOVE(node, node_poll);
QLIST_REMOVE(node, node);
return true;
}
@@ -300,9 +141,6 @@ void aio_set_fd_handler(AioContext *ctx,
QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, new_node, node);
}
- if (node) {
- deleted = aio_remove_fd_handler(ctx, node);
- }
/* No need to order poll_disable_cnt writes against other updates;
* the counter is only used to avoid wasting time and latency on
@@ -313,11 +151,9 @@ void aio_set_fd_handler(AioContext *ctx,
atomic_set(&ctx->poll_disable_cnt,
atomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
- if (new_node) {
- aio_epoll_update(ctx, new_node, is_new);
- } else if (node) {
- /* Unregister deleted fd_handler */
- aio_epoll_update(ctx, node, false);
+ ctx->fdmon_ops->update(ctx, node, new_node);
+ if (node) {
+ deleted = aio_remove_fd_handler(ctx, node);
}
qemu_lockcnt_unlock(&ctx->list_lock);
aio_notify(ctx);
@@ -361,18 +197,19 @@ void aio_set_event_notifier_poll(AioContext *ctx,
(IOHandler *)io_poll_end);
}
-static void poll_set_started(AioContext *ctx, bool started)
+static bool poll_set_started(AioContext *ctx, bool started)
{
AioHandler *node;
+ bool progress = false;
if (started == ctx->poll_started) {
- return;
+ return false;
}
ctx->poll_started = started;
qemu_lockcnt_inc(&ctx->list_lock);
- QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+ QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
IOHandler *fn;
if (QLIST_IS_INSERTED(node, node_deleted)) {
@@ -388,8 +225,15 @@ static void poll_set_started(AioContext *ctx, bool started)
if (fn) {
fn(node->opaque);
}
+
+ /* Poll one last time in case ->io_poll_end() raced with the event */
+ if (!started) {
+ progress = node->io_poll(node->opaque) || progress;
+ }
}
qemu_lockcnt_dec(&ctx->list_lock);
+
+ return progress;
}
@@ -446,6 +290,7 @@ static void aio_free_deleted_handlers(AioContext *ctx)
while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) {
QLIST_REMOVE(node, node);
QLIST_REMOVE(node, node_deleted);
+ QLIST_SAFE_REMOVE(node, node_poll);
g_free(node);
}
@@ -460,6 +305,22 @@ static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
revents = node->pfd.revents & node->pfd.events;
node->pfd.revents = 0;
+ /*
+ * Start polling AioHandlers when they become ready because activity is
+ * likely to continue. Note that starvation is theoretically possible when
+ * fdmon_supports_polling(), but only until the fd fires for the first
+ * time.
+ */
+ if (!QLIST_IS_INSERTED(node, node_deleted) &&
+ !QLIST_IS_INSERTED(node, node_poll) &&
+ node->io_poll) {
+ trace_poll_add(ctx, node, node->pfd.fd, revents);
+ if (ctx->poll_started && node->io_poll_begin) {
+ node->io_poll_begin(node->opaque);
+ }
+ QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll);
+ }
+
if (!QLIST_IS_INSERTED(node, node_deleted) &&
(revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
aio_node_check(ctx, node->is_external) &&
@@ -493,7 +354,7 @@ static bool aio_dispatch_ready_handlers(AioContext *ctx,
AioHandler *node;
while ((node = QLIST_FIRST(ready_list))) {
- QLIST_SAFE_REMOVE(node, node_ready);
+ QLIST_REMOVE(node, node_ready);
progress = aio_dispatch_handler(ctx, node) || progress;
}
@@ -524,71 +385,19 @@ void aio_dispatch(AioContext *ctx)
timerlistgroup_run_timers(&ctx->tlg);
}
-/* These thread-local variables are used only in a small part of aio_poll
- * around the call to the poll() system call. In particular they are not
- * used while aio_poll is performing callbacks, which makes it much easier
- * to think about reentrancy!
- *
- * Stack-allocated arrays would be perfect but they have size limitations;
- * heap allocation is expensive enough that we want to reuse arrays across
- * calls to aio_poll(). And because poll() has to be called without holding
- * any lock, the arrays cannot be stored in AioContext. Thread-local data
- * has none of the disadvantages of these three options.
- */
-static __thread GPollFD *pollfds;
-static __thread AioHandler **nodes;
-static __thread unsigned npfd, nalloc;
-static __thread Notifier pollfds_cleanup_notifier;
-
-static void pollfds_cleanup(Notifier *n, void *unused)
-{
- g_assert(npfd == 0);
- g_free(pollfds);
- g_free(nodes);
- nalloc = 0;
-}
-
-static void add_pollfd(AioHandler *node)
-{
- if (npfd == nalloc) {
- if (nalloc == 0) {
- pollfds_cleanup_notifier.notify = pollfds_cleanup;
- qemu_thread_atexit_add(&pollfds_cleanup_notifier);
- nalloc = 8;
- } else {
- g_assert(nalloc <= INT_MAX);
- nalloc *= 2;
- }
- pollfds = g_renew(GPollFD, pollfds, nalloc);
- nodes = g_renew(AioHandler *, nodes, nalloc);
- }
- nodes[npfd] = node;
- pollfds[npfd] = (GPollFD) {
- .fd = node->pfd.fd,
- .events = node->pfd.events,
- };
- npfd++;
-}
-
-static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
+static bool run_poll_handlers_once(AioContext *ctx,
+ int64_t now,
+ int64_t *timeout)
{
bool progress = false;
AioHandler *node;
+ AioHandler *tmp;
- /*
- * Optimization: ->io_poll() handlers often contain RCU read critical
- * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
- * -> rcu_read_lock() -> ... sequences with expensive memory
- * synchronization primitives. Make the entire polling loop an RCU
- * critical section because nested rcu_read_lock()/rcu_read_unlock() calls
- * are cheap.
- */
- RCU_READ_LOCK_GUARD();
-
- QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
- if (!QLIST_IS_INSERTED(node, node_deleted) && node->io_poll &&
- aio_node_check(ctx, node->is_external) &&
+ QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
+ if (aio_node_check(ctx, node->is_external) &&
node->io_poll(node->opaque)) {
+ node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
+
/*
* Polling was successful, exit try_poll_mode immediately
* to adjust the next polling time.
@@ -605,6 +414,50 @@ static bool run_poll_handlers_once(AioContext *ctx, int64_t *timeout)
return progress;
}
+static bool fdmon_supports_polling(AioContext *ctx)
+{
+ return ctx->fdmon_ops->need_wait != aio_poll_disabled;
+}
+
+static bool remove_idle_poll_handlers(AioContext *ctx, int64_t now)
+{
+ AioHandler *node;
+ AioHandler *tmp;
+ bool progress = false;
+
+ /*
+ * File descriptor monitoring implementations without userspace polling
+ * support suffer from starvation when a subset of handlers is polled
+ * because fds will not be processed in a timely fashion. Don't remove
+ * idle poll handlers.
+ */
+ if (!fdmon_supports_polling(ctx)) {
+ return false;
+ }
+
+ QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
+ if (node->poll_idle_timeout == 0LL) {
+ node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
+ } else if (now >= node->poll_idle_timeout) {
+ trace_poll_remove(ctx, node, node->pfd.fd);
+ node->poll_idle_timeout = 0LL;
+ QLIST_SAFE_REMOVE(node, node_poll);
+ if (ctx->poll_started && node->io_poll_end) {
+ node->io_poll_end(node->opaque);
+
+ /*
+ * Final poll in case ->io_poll_end() races with an event.
+ * Nevermind about re-adding the handler in the rare case where
+ * this causes progress.
+ */
+ progress = node->io_poll(node->opaque) || progress;
+ }
+ }
+ }
+
+ return progress;
+}
+
/* run_poll_handlers:
* @ctx: the AioContext
* @max_ns: maximum time to poll for, in nanoseconds
@@ -628,13 +481,28 @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
trace_run_poll_handlers_begin(ctx, max_ns, *timeout);
+ /*
+ * Optimization: ->io_poll() handlers often contain RCU read critical
+ * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
+ * -> rcu_read_lock() -> ... sequences with expensive memory
+ * synchronization primitives. Make the entire polling loop an RCU
+ * critical section because nested rcu_read_lock()/rcu_read_unlock() calls
+ * are cheap.
+ */
+ RCU_READ_LOCK_GUARD();
+
start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
do {
- progress = run_poll_handlers_once(ctx, timeout);
+ progress = run_poll_handlers_once(ctx, start_time, timeout);
elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
max_ns = qemu_soonest_timeout(*timeout, max_ns);
assert(!(max_ns && progress));
- } while (elapsed_time < max_ns && !atomic_read(&ctx->poll_disable_cnt));
+ } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx));
+
+ if (remove_idle_poll_handlers(ctx, start_time + elapsed_time)) {
+ *timeout = 0;
+ progress = true;
+ }
/* If time has passed with no successful polling, adjust *timeout to
* keep the same ending time.
@@ -660,9 +528,14 @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
*/
static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
{
- int64_t max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
+ int64_t max_ns;
- if (max_ns && !atomic_read(&ctx->poll_disable_cnt)) {
+ if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
+ return false;
+ }
+
+ max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
+ if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
poll_set_started(ctx, true);
if (run_poll_handlers(ctx, max_ns, timeout)) {
@@ -670,19 +543,17 @@ static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
}
}
- poll_set_started(ctx, false);
+ if (poll_set_started(ctx, false)) {
+ *timeout = 0;
+ return true;
+ }
- /* Even if we don't run busy polling, try polling once in case it can make
- * progress and the caller will be able to avoid ppoll(2)/epoll_wait(2).
- */
- return run_poll_handlers_once(ctx, timeout);
+ return false;
}
bool aio_poll(AioContext *ctx, bool blocking)
{
AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
- AioHandler *node;
- int i;
int ret = 0;
bool progress;
int64_t timeout;
@@ -714,27 +585,8 @@ bool aio_poll(AioContext *ctx, bool blocking)
/* If polling is allowed, non-blocking aio_poll does not need the
* system call---a single round of run_poll_handlers_once suffices.
*/
- if (timeout || atomic_read(&ctx->poll_disable_cnt)) {
- assert(npfd == 0);
-
- /* fill pollfds */
-
- if (!aio_epoll_enabled(ctx)) {
- QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
- if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events
- && aio_node_check(ctx, node->is_external)) {
- add_pollfd(node);
- }
- }
- }
-
- /* wait until next event */
- if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
- npfd = 0; /* pollfds[] is not being used */
- ret = aio_epoll(ctx, &ready_list, timeout);
- } else {
- ret = qemu_poll_ns(pollfds, npfd, timeout);
- }
+ if (timeout || ctx->fdmon_ops->need_wait(ctx)) {
+ ret = ctx->fdmon_ops->wait(ctx, &ready_list, timeout);
}
if (blocking) {
@@ -783,19 +635,6 @@ bool aio_poll(AioContext *ctx, bool blocking)
}
}
- /* if we have any readable fds, dispatch event */
- if (ret > 0) {
- for (i = 0; i < npfd; i++) {
- int revents = pollfds[i].revents;
-
- if (revents) {
- add_ready_handler(&ready_list, nodes[i], revents);
- }
- }
- }
-
- npfd = 0;
-
progress |= aio_bh_poll(ctx);
if (ret > 0) {
@@ -813,23 +652,21 @@ bool aio_poll(AioContext *ctx, bool blocking)
void aio_context_setup(AioContext *ctx)
{
-#ifdef CONFIG_EPOLL_CREATE1
- assert(!ctx->epollfd);
- ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
- if (ctx->epollfd == -1) {
- fprintf(stderr, "Failed to create epoll instance: %s", strerror(errno));
- ctx->epoll_available = false;
- } else {
- ctx->epoll_available = true;
+ ctx->fdmon_ops = &fdmon_poll_ops;
+ ctx->epollfd = -1;
+
+ /* Use the fastest fd monitoring implementation if available */
+ if (fdmon_io_uring_setup(ctx)) {
+ return;
}
-#endif
+
+ fdmon_epoll_setup(ctx);
}
void aio_context_destroy(AioContext *ctx)
{
-#ifdef CONFIG_EPOLL_CREATE1
- aio_epoll_disable(ctx);
-#endif
+ fdmon_io_uring_destroy(ctx);
+ fdmon_epoll_disable(ctx);
}
void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
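Editor's note: the aio-posix.c hunks above replace the hard-coded ppoll(2)/epoll(7) logic with an indirection through ctx->fdmon_ops (update/wait/need_wait). The standalone sketch below models only that dispatch idea with stand-in types; the real FDMonOps and AioContext definitions live in include/block/aio.h and are not reproduced here.

    /* Minimal model of an FDMonOps-style dispatch; stand-in types only. */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct Ctx Ctx;

    typedef struct {
        int (*wait)(Ctx *ctx, int64_t timeout);  /* block for fd events */
        bool (*need_wait)(Ctx *ctx);             /* is calling wait() mandatory? */
    } Ops;

    struct Ctx {
        const Ops *ops;                          /* current monitoring backend */
    };

    static int model_poll_wait(Ctx *ctx, int64_t timeout)
    {
        printf("poll-style wait, timeout=%lld ns\n", (long long)timeout);
        return 0;                                /* no fds ready in this model */
    }

    static bool model_poll_need_wait(Ctx *ctx)
    {
        return false;   /* like aio_poll_disabled(): userspace polling allowed */
    }

    static const Ops model_poll_ops = {
        .wait = model_poll_wait,
        .need_wait = model_poll_need_wait,
    };

    int main(void)
    {
        Ctx ctx = { .ops = &model_poll_ops };
        int64_t timeout = 1000000;

        /* Mirrors the aio_poll() hunk: only enter the syscall when needed */
        if (timeout || ctx.ops->need_wait(&ctx)) {
            ctx.ops->wait(&ctx, timeout);
        }
        return 0;
    }

Swapping ctx->fdmon_ops at runtime is also what lets the poll backend upgrade itself to epoll mid-call, as seen later in fdmon-poll.c.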
diff --git a/util/aio-posix.h b/util/aio-posix.h
new file mode 100644
index 0000000000..c80c04506a
--- /dev/null
+++ b/util/aio-posix.h
@@ -0,0 +1,81 @@
+/*
+ * AioContext POSIX event loop implementation internal APIs
+ *
+ * Copyright IBM, Corp. 2008
+ * Copyright Red Hat, Inc. 2020
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#ifndef AIO_POSIX_H
+#define AIO_POSIX_H
+
+#include "block/aio.h"
+
+struct AioHandler {
+ GPollFD pfd;
+ IOHandler *io_read;
+ IOHandler *io_write;
+ AioPollFn *io_poll;
+ IOHandler *io_poll_begin;
+ IOHandler *io_poll_end;
+ void *opaque;
+ QLIST_ENTRY(AioHandler) node;
+ QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
+ QLIST_ENTRY(AioHandler) node_deleted;
+ QLIST_ENTRY(AioHandler) node_poll;
+#ifdef CONFIG_LINUX_IO_URING
+ QSLIST_ENTRY(AioHandler) node_submitted;
+ unsigned flags; /* see fdmon-io_uring.c */
+#endif
+ int64_t poll_idle_timeout; /* when to stop userspace polling */
+ bool is_external;
+};
+
+/* Add a handler to a ready list */
+void aio_add_ready_handler(AioHandlerList *ready_list, AioHandler *node,
+ int revents);
+
+extern const FDMonOps fdmon_poll_ops;
+
+#ifdef CONFIG_EPOLL_CREATE1
+bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd);
+void fdmon_epoll_setup(AioContext *ctx);
+void fdmon_epoll_disable(AioContext *ctx);
+#else
+static inline bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
+{
+ return false;
+}
+
+static inline void fdmon_epoll_setup(AioContext *ctx)
+{
+}
+
+static inline void fdmon_epoll_disable(AioContext *ctx)
+{
+}
+#endif /* !CONFIG_EPOLL_CREATE1 */
+
+#ifdef CONFIG_LINUX_IO_URING
+bool fdmon_io_uring_setup(AioContext *ctx);
+void fdmon_io_uring_destroy(AioContext *ctx);
+#else
+static inline bool fdmon_io_uring_setup(AioContext *ctx)
+{
+ return false;
+}
+
+static inline void fdmon_io_uring_destroy(AioContext *ctx)
+{
+}
+#endif /* !CONFIG_LINUX_IO_URING */
+
+#endif /* AIO_POSIX_H */
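Editor's note: the #else branches above provide inline no-op stubs so that aio_context_setup() can try each backend unconditionally, without #ifdefs at the call site. A small sketch of the same pattern, using a made-up CONFIG_FOO feature and foo_setup() helper purely for illustration:

    #include <stdbool.h>
    #include <stdio.h>

    /* CONFIG_FOO would normally come from the build system; left undefined here */
    #ifdef CONFIG_FOO
    bool foo_setup(void);               /* real implementation compiled elsewhere */
    #else
    static inline bool foo_setup(void)
    {
        return false;                   /* feature unavailable: report failure */
    }
    #endif

    int main(void)
    {
        /* The caller needs no #ifdef: try the optional backend, then fall back */
        if (foo_setup()) {
            printf("using foo backend\n");
        } else {
            printf("falling back to the portable backend\n");
        }
        return 0;
    }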
diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c
new file mode 100644
index 0000000000..fcd989d47d
--- /dev/null
+++ b/util/fdmon-epoll.c
@@ -0,0 +1,155 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * epoll(7) file descriptor monitoring
+ */
+
+#include "qemu/osdep.h"
+#include <sys/epoll.h>
+#include "qemu/rcu_queue.h"
+#include "aio-posix.h"
+
+/* The fd number threshold to switch to epoll */
+#define EPOLL_ENABLE_THRESHOLD 64
+
+void fdmon_epoll_disable(AioContext *ctx)
+{
+ if (ctx->epollfd >= 0) {
+ close(ctx->epollfd);
+ ctx->epollfd = -1;
+ }
+
+ /* Switch back */
+ ctx->fdmon_ops = &fdmon_poll_ops;
+}
+
+static inline int epoll_events_from_pfd(int pfd_events)
+{
+ return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
+ (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
+ (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
+ (pfd_events & G_IO_ERR ? EPOLLERR : 0);
+}
+
+static void fdmon_epoll_update(AioContext *ctx,
+ AioHandler *old_node,
+ AioHandler *new_node)
+{
+ struct epoll_event event = {
+ .data.ptr = new_node,
+ .events = new_node ? epoll_events_from_pfd(new_node->pfd.events) : 0,
+ };
+ int r;
+
+ if (!new_node) {
+ r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, old_node->pfd.fd, &event);
+ } else if (!old_node) {
+ r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, new_node->pfd.fd, &event);
+ } else {
+ r = epoll_ctl(ctx->epollfd, EPOLL_CTL_MOD, new_node->pfd.fd, &event);
+ }
+
+ if (r) {
+ fdmon_epoll_disable(ctx);
+ }
+}
+
+static int fdmon_epoll_wait(AioContext *ctx, AioHandlerList *ready_list,
+ int64_t timeout)
+{
+ GPollFD pfd = {
+ .fd = ctx->epollfd,
+ .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR,
+ };
+ AioHandler *node;
+ int i, ret = 0;
+ struct epoll_event events[128];
+
+ /* Fall back while external clients are disabled */
+ if (atomic_read(&ctx->external_disable_cnt)) {
+ return fdmon_poll_ops.wait(ctx, ready_list, timeout);
+ }
+
+ if (timeout > 0) {
+ ret = qemu_poll_ns(&pfd, 1, timeout);
+ if (ret > 0) {
+ timeout = 0;
+ }
+ }
+ if (timeout <= 0 || ret > 0) {
+ ret = epoll_wait(ctx->epollfd, events,
+ ARRAY_SIZE(events),
+ timeout);
+ if (ret <= 0) {
+ goto out;
+ }
+ for (i = 0; i < ret; i++) {
+ int ev = events[i].events;
+ int revents = (ev & EPOLLIN ? G_IO_IN : 0) |
+ (ev & EPOLLOUT ? G_IO_OUT : 0) |
+ (ev & EPOLLHUP ? G_IO_HUP : 0) |
+ (ev & EPOLLERR ? G_IO_ERR : 0);
+
+ node = events[i].data.ptr;
+ aio_add_ready_handler(ready_list, node, revents);
+ }
+ }
+out:
+ return ret;
+}
+
+static const FDMonOps fdmon_epoll_ops = {
+ .update = fdmon_epoll_update,
+ .wait = fdmon_epoll_wait,
+ .need_wait = aio_poll_disabled,
+};
+
+static bool fdmon_epoll_try_enable(AioContext *ctx)
+{
+ AioHandler *node;
+ struct epoll_event event;
+
+ QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+ int r;
+ if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) {
+ continue;
+ }
+ event.events = epoll_events_from_pfd(node->pfd.events);
+ event.data.ptr = node;
+ r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
+ if (r) {
+ return false;
+ }
+ }
+
+ ctx->fdmon_ops = &fdmon_epoll_ops;
+ return true;
+}
+
+bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
+{
+ if (ctx->epollfd < 0) {
+ return false;
+ }
+
+ /* Do not upgrade while external clients are disabled */
+ if (atomic_read(&ctx->external_disable_cnt)) {
+ return false;
+ }
+
+ if (npfd >= EPOLL_ENABLE_THRESHOLD) {
+ if (fdmon_epoll_try_enable(ctx)) {
+ return true;
+ } else {
+ fdmon_epoll_disable(ctx);
+ }
+ }
+ return false;
+}
+
+void fdmon_epoll_setup(AioContext *ctx)
+{
+ ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
+ if (ctx->epollfd == -1) {
+ fprintf(stderr, "Failed to create epoll instance: %s", strerror(errno));
+ }
+}
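Editor's note: fdmon_epoll_wait() above waits in two steps because epoll_wait(2) only accepts millisecond timeouts: it first blocks in qemu_poll_ns() on the epoll fd itself (an epoll fd polls readable when it has pending events, per epoll(7)), then harvests the events with a zero timeout. A standalone, Linux-only sketch of that sequence, with most error handling omitted for brevity:

    #include <stdio.h>
    #include <unistd.h>
    #include <poll.h>
    #include <sys/epoll.h>

    int main(void)
    {
        int pipefd[2];
        struct epoll_event ev = { .events = EPOLLIN };
        struct epoll_event out[8];
        struct pollfd pfd;
        int epfd, n;

        epfd = epoll_create1(EPOLL_CLOEXEC);
        if (epfd < 0 || pipe(pipefd) != 0) {
            return 1;
        }

        ev.data.fd = pipefd[0];
        epoll_ctl(epfd, EPOLL_CTL_ADD, pipefd[0], &ev);

        if (write(pipefd[1], "x", 1) != 1) {  /* make the pipe readable */
            return 1;
        }

        /* Step 1: the epoll fd itself becomes readable when events are pending */
        pfd.fd = epfd;
        pfd.events = POLLIN;
        poll(&pfd, 1, 1000);                  /* stands in for qemu_poll_ns() */

        /* Step 2: harvest the ready events without blocking */
        n = epoll_wait(epfd, out, 8, 0);
        printf("%d fd(s) ready\n", n);        /* prints: 1 fd(s) ready */

        close(pipefd[0]);
        close(pipefd[1]);
        close(epfd);
        return 0;
    }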
diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
new file mode 100644
index 0000000000..893b79b622
--- /dev/null
+++ b/util/fdmon-io_uring.c
@@ -0,0 +1,332 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Linux io_uring file descriptor monitoring
+ *
+ * The Linux io_uring API supports file descriptor monitoring with a few
+ * advantages over existing APIs like poll(2) and epoll(7):
+ *
+ * 1. Userspace polling of events is possible because the completion queue (cq
+ * ring) is shared between the kernel and userspace. This allows
+ * applications that rely on userspace polling to also monitor file
+ * descriptors in the same userspace polling loop.
+ *
+ * 2. Submission and completion are batched and done together in a single system
+ * call. This minimizes the number of system calls.
+ *
+ * 3. File descriptor monitoring is O(1) like epoll(7) so it scales better than
+ * poll(2).
+ *
+ * 4. Nanosecond timeouts are supported, so fewer syscalls are needed than with
+ * epoll(7).
+ *
+ * This code only monitors file descriptors and does not do asynchronous disk
+ * I/O. Implementing disk I/O efficiently has other requirements and should
+ * use a separate io_uring so it does not make sense to unify the code.
+ *
+ * File descriptor monitoring is implemented using the following operations:
+ *
+ * 1. IORING_OP_POLL_ADD - adds a file descriptor to be monitored.
+ * 2. IORING_OP_POLL_REMOVE - removes a file descriptor being monitored. When
+ * the poll mask changes for a file descriptor it is first removed and then
+ * re-added with the new poll mask, so this operation is also used as part
+ * of modifying an existing monitored file descriptor.
+ * 3. IORING_OP_TIMEOUT - added every time a blocking syscall is made to wait
+ * for events. This operation self-cancels if another event completes
+ * before the timeout.
+ *
+ * io_uring calls the submission queue the "sq ring" and the completion queue
+ * the "cq ring". Ring entries are called "sqe" and "cqe", respectively.
+ *
+ * The code is structured so that sq/cq rings are only modified within
+ * fdmon_io_uring_wait(). Changes to AioHandlers are made by enqueuing them on
+ * ctx->submit_list so that fdmon_io_uring_wait() can submit IORING_OP_POLL_ADD
+ * and/or IORING_OP_POLL_REMOVE sqes for them.
+ */
+
+#include "qemu/osdep.h"
+#include <poll.h>
+#include "qemu/rcu_queue.h"
+#include "aio-posix.h"
+
+enum {
+ FDMON_IO_URING_ENTRIES = 128, /* sq/cq ring size */
+
+ /* AioHandler::flags */
+ FDMON_IO_URING_PENDING = (1 << 0),
+ FDMON_IO_URING_ADD = (1 << 1),
+ FDMON_IO_URING_REMOVE = (1 << 2),
+};
+
+static inline int poll_events_from_pfd(int pfd_events)
+{
+ return (pfd_events & G_IO_IN ? POLLIN : 0) |
+ (pfd_events & G_IO_OUT ? POLLOUT : 0) |
+ (pfd_events & G_IO_HUP ? POLLHUP : 0) |
+ (pfd_events & G_IO_ERR ? POLLERR : 0);
+}
+
+static inline int pfd_events_from_poll(int poll_events)
+{
+ return (poll_events & POLLIN ? G_IO_IN : 0) |
+ (poll_events & POLLOUT ? G_IO_OUT : 0) |
+ (poll_events & POLLHUP ? G_IO_HUP : 0) |
+ (poll_events & POLLERR ? G_IO_ERR : 0);
+}
+
+/*
+ * Returns an sqe for submitting a request. May only be called within
+ * fdmon_io_uring_wait().
+ */
+static struct io_uring_sqe *get_sqe(AioContext *ctx)
+{
+ struct io_uring *ring = &ctx->fdmon_io_uring;
+ struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
+ int ret;
+
+ if (likely(sqe)) {
+ return sqe;
+ }
+
+ /* No free sqes left, submit pending sqes first */
+ ret = io_uring_submit(ring);
+ assert(ret > 1);
+ sqe = io_uring_get_sqe(ring);
+ assert(sqe);
+ return sqe;
+}
+
+/* Atomically enqueue an AioHandler for sq ring submission */
+static void enqueue(AioHandlerSList *head, AioHandler *node, unsigned flags)
+{
+ unsigned old_flags;
+
+ old_flags = atomic_fetch_or(&node->flags, FDMON_IO_URING_PENDING | flags);
+ if (!(old_flags & FDMON_IO_URING_PENDING)) {
+ QSLIST_INSERT_HEAD_ATOMIC(head, node, node_submitted);
+ }
+}
+
+/* Dequeue an AioHandler for sq ring submission. Called by fill_sq_ring(). */
+static AioHandler *dequeue(AioHandlerSList *head, unsigned *flags)
+{
+ AioHandler *node = QSLIST_FIRST(head);
+
+ if (!node) {
+ return NULL;
+ }
+
+ /* Doesn't need to be atomic since fill_sq_ring() moves the list */
+ QSLIST_REMOVE_HEAD(head, node_submitted);
+
+ /*
+ * Don't clear FDMON_IO_URING_REMOVE. It's sticky so it can serve two
+ * purposes: telling fill_sq_ring() to submit IORING_OP_POLL_REMOVE and
+ * telling process_cqe() to delete the AioHandler when its
+ * IORING_OP_POLL_ADD completes.
+ */
+ *flags = atomic_fetch_and(&node->flags, ~(FDMON_IO_URING_PENDING |
+ FDMON_IO_URING_ADD));
+ return node;
+}
+
+static void fdmon_io_uring_update(AioContext *ctx,
+ AioHandler *old_node,
+ AioHandler *new_node)
+{
+ if (new_node) {
+ enqueue(&ctx->submit_list, new_node, FDMON_IO_URING_ADD);
+ }
+
+ if (old_node) {
+ /*
+ * Deletion is tricky because IORING_OP_POLL_ADD and
+ * IORING_OP_POLL_REMOVE are async. We need to wait for the original
+ * IORING_OP_POLL_ADD to complete before this handler can be freed
+ * safely.
+ *
+ * It's possible that the file descriptor becomes ready and the
+ * IORING_OP_POLL_ADD cqe is enqueued before IORING_OP_POLL_REMOVE is
+ * submitted, too.
+ *
+ * Mark this handler deleted right now but don't place it on
+ * ctx->deleted_aio_handlers yet. Instead, manually fudge the list
+ * entry to make QLIST_IS_INSERTED() think this handler has been
+ * inserted, so that other code recognizes this AioHandler as deleted.
+ *
+ * Once the original IORING_OP_POLL_ADD completes we enqueue the
+ * handler on the real ctx->deleted_aio_handlers list to be freed.
+ */
+ assert(!QLIST_IS_INSERTED(old_node, node_deleted));
+ old_node->node_deleted.le_prev = &old_node->node_deleted.le_next;
+
+ enqueue(&ctx->submit_list, old_node, FDMON_IO_URING_REMOVE);
+ }
+}
+
+static void add_poll_add_sqe(AioContext *ctx, AioHandler *node)
+{
+ struct io_uring_sqe *sqe = get_sqe(ctx);
+ int events = poll_events_from_pfd(node->pfd.events);
+
+ io_uring_prep_poll_add(sqe, node->pfd.fd, events);
+ io_uring_sqe_set_data(sqe, node);
+}
+
+static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node)
+{
+ struct io_uring_sqe *sqe = get_sqe(ctx);
+
+ io_uring_prep_poll_remove(sqe, node);
+}
+
+/* Add a timeout that self-cancels when another cqe becomes ready */
+static void add_timeout_sqe(AioContext *ctx, int64_t ns)
+{
+ struct io_uring_sqe *sqe;
+ struct __kernel_timespec ts = {
+ .tv_sec = ns / NANOSECONDS_PER_SECOND,
+ .tv_nsec = ns % NANOSECONDS_PER_SECOND,
+ };
+
+ sqe = get_sqe(ctx);
+ io_uring_prep_timeout(sqe, &ts, 1, 0);
+}
+
+/* Add sqes from ctx->submit_list for submission */
+static void fill_sq_ring(AioContext *ctx)
+{
+ AioHandlerSList submit_list;
+ AioHandler *node;
+ unsigned flags;
+
+ QSLIST_MOVE_ATOMIC(&submit_list, &ctx->submit_list);
+
+ while ((node = dequeue(&submit_list, &flags))) {
+ /* Order matters, just in case both flags were set */
+ if (flags & FDMON_IO_URING_ADD) {
+ add_poll_add_sqe(ctx, node);
+ }
+ if (flags & FDMON_IO_URING_REMOVE) {
+ add_poll_remove_sqe(ctx, node);
+ }
+ }
+}
+
+/* Returns true if a handler became ready */
+static bool process_cqe(AioContext *ctx,
+ AioHandlerList *ready_list,
+ struct io_uring_cqe *cqe)
+{
+ AioHandler *node = io_uring_cqe_get_data(cqe);
+ unsigned flags;
+
+ /* poll_timeout and poll_remove have a zero user_data field */
+ if (!node) {
+ return false;
+ }
+
+ /*
+ * Deletion can only happen when IORING_OP_POLL_ADD completes. If we race
+ * with enqueue() here then we can safely clear the FDMON_IO_URING_REMOVE
+ * bit before IORING_OP_POLL_REMOVE is submitted.
+ */
+ flags = atomic_fetch_and(&node->flags, ~FDMON_IO_URING_REMOVE);
+ if (flags & FDMON_IO_URING_REMOVE) {
+ QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
+ return false;
+ }
+
+ aio_add_ready_handler(ready_list, node, pfd_events_from_poll(cqe->res));
+
+ /* IORING_OP_POLL_ADD is one-shot so we must re-arm it */
+ add_poll_add_sqe(ctx, node);
+ return true;
+}
+
+static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list)
+{
+ struct io_uring *ring = &ctx->fdmon_io_uring;
+ struct io_uring_cqe *cqe;
+ unsigned num_cqes = 0;
+ unsigned num_ready = 0;
+ unsigned head;
+
+ io_uring_for_each_cqe(ring, head, cqe) {
+ if (process_cqe(ctx, ready_list, cqe)) {
+ num_ready++;
+ }
+
+ num_cqes++;
+ }
+
+ io_uring_cq_advance(ring, num_cqes);
+ return num_ready;
+}
+
+static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list,
+ int64_t timeout)
+{
+ unsigned wait_nr = 1; /* block until at least one cqe is ready */
+ int ret;
+
+ /* Fall back while external clients are disabled */
+ if (atomic_read(&ctx->external_disable_cnt)) {
+ return fdmon_poll_ops.wait(ctx, ready_list, timeout);
+ }
+
+ if (timeout == 0) {
+ wait_nr = 0; /* non-blocking */
+ } else if (timeout > 0) {
+ add_timeout_sqe(ctx, timeout);
+ }
+
+ fill_sq_ring(ctx);
+
+ ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr);
+ assert(ret >= 0);
+
+ return process_cq_ring(ctx, ready_list);
+}
+
+static bool fdmon_io_uring_need_wait(AioContext *ctx)
+{
+ return io_uring_cq_ready(&ctx->fdmon_io_uring);
+}
+
+static const FDMonOps fdmon_io_uring_ops = {
+ .update = fdmon_io_uring_update,
+ .wait = fdmon_io_uring_wait,
+ .need_wait = fdmon_io_uring_need_wait,
+};
+
+bool fdmon_io_uring_setup(AioContext *ctx)
+{
+ int ret;
+
+ ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
+ if (ret != 0) {
+ return false;
+ }
+
+ QSLIST_INIT(&ctx->submit_list);
+ ctx->fdmon_ops = &fdmon_io_uring_ops;
+ return true;
+}
+
+void fdmon_io_uring_destroy(AioContext *ctx)
+{
+ if (ctx->fdmon_ops == &fdmon_io_uring_ops) {
+ AioHandler *node;
+
+ io_uring_queue_exit(&ctx->fdmon_io_uring);
+
+ /* No need to submit these anymore, just free them. */
+ while ((node = QSLIST_FIRST_RCU(&ctx->submit_list))) {
+ QSLIST_REMOVE_HEAD_RCU(&ctx->submit_list, node_submitted);
+ QLIST_REMOVE(node, node);
+ g_free(node);
+ }
+
+ ctx->fdmon_ops = &fdmon_poll_ops;
+ }
+}
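Editor's note: enqueue() and dequeue() above coordinate through the per-handler flags word: FDMON_IO_URING_PENDING guarantees a node is inserted on ctx->submit_list at most once, ADD is consumed on dequeue, and REMOVE stays sticky until process_cqe() clears it. A standalone sketch of that flag protocol, using C11 <stdatomic.h> in place of QEMU's atomic_fetch_or()/atomic_fetch_and():

    #include <stdatomic.h>
    #include <stdio.h>

    enum {
        PENDING = 1 << 0,   /* on the submit list */
        ADD     = 1 << 1,   /* needs IORING_OP_POLL_ADD */
        REMOVE  = 1 << 2,   /* needs IORING_OP_POLL_REMOVE; sticky */
    };

    static _Atomic unsigned flags;

    /* enqueue(): set the request bits; insert only on the 0 -> PENDING transition */
    static void request(unsigned bits)
    {
        unsigned old = atomic_fetch_or(&flags, PENDING | bits);
        if (!(old & PENDING)) {
            printf("inserted on submit list\n");
        } else {
            printf("already queued, flags merged\n");
        }
    }

    /* dequeue(): clear PENDING and ADD, but leave REMOVE set (sticky) */
    static unsigned take(void)
    {
        return atomic_fetch_and(&flags, ~(unsigned)(PENDING | ADD));
    }

    int main(void)
    {
        request(ADD);
        request(REMOVE);              /* second request merges into one list entry */

        unsigned seen = take();
        printf("dequeued: add=%d remove=%d, sticky remove=%d\n",
               !!(seen & ADD), !!(seen & REMOVE), !!(flags & REMOVE));
        return 0;
    }

Running it shows the second request merging into the already-queued entry and REMOVE surviving the dequeue, which is exactly the property the comment in dequeue() relies on.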
diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c
new file mode 100644
index 0000000000..488067b679
--- /dev/null
+++ b/util/fdmon-poll.c
@@ -0,0 +1,107 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * poll(2) file descriptor monitoring
+ *
+ * Uses ppoll(2) when available, g_poll() otherwise.
+ */
+
+#include "qemu/osdep.h"
+#include "aio-posix.h"
+#include "qemu/rcu_queue.h"
+
+/*
+ * These thread-local variables are used only in fdmon_poll_wait() around the
+ * call to the poll() system call. In particular they are not used while
+ * aio_poll is performing callbacks, which makes it much easier to think about
+ * reentrancy!
+ *
+ * Stack-allocated arrays would be perfect but they have size limitations;
+ * heap allocation is expensive enough that we want to reuse arrays across
+ * calls to aio_poll(). And because poll() has to be called without holding
+ * any lock, the arrays cannot be stored in AioContext. Thread-local data
+ * has none of the disadvantages of these three options.
+ */
+static __thread GPollFD *pollfds;
+static __thread AioHandler **nodes;
+static __thread unsigned npfd, nalloc;
+static __thread Notifier pollfds_cleanup_notifier;
+
+static void pollfds_cleanup(Notifier *n, void *unused)
+{
+ g_assert(npfd == 0);
+ g_free(pollfds);
+ g_free(nodes);
+ nalloc = 0;
+}
+
+static void add_pollfd(AioHandler *node)
+{
+ if (npfd == nalloc) {
+ if (nalloc == 0) {
+ pollfds_cleanup_notifier.notify = pollfds_cleanup;
+ qemu_thread_atexit_add(&pollfds_cleanup_notifier);
+ nalloc = 8;
+ } else {
+ g_assert(nalloc <= INT_MAX);
+ nalloc *= 2;
+ }
+ pollfds = g_renew(GPollFD, pollfds, nalloc);
+ nodes = g_renew(AioHandler *, nodes, nalloc);
+ }
+ nodes[npfd] = node;
+ pollfds[npfd] = (GPollFD) {
+ .fd = node->pfd.fd,
+ .events = node->pfd.events,
+ };
+ npfd++;
+}
+
+static int fdmon_poll_wait(AioContext *ctx, AioHandlerList *ready_list,
+ int64_t timeout)
+{
+ AioHandler *node;
+ int ret;
+
+ assert(npfd == 0);
+
+ QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+ if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events
+ && aio_node_check(ctx, node->is_external)) {
+ add_pollfd(node);
+ }
+ }
+
+ /* epoll(7) is faster above a certain number of fds */
+ if (fdmon_epoll_try_upgrade(ctx, npfd)) {
+ return ctx->fdmon_ops->wait(ctx, ready_list, timeout);
+ }
+
+ ret = qemu_poll_ns(pollfds, npfd, timeout);
+ if (ret > 0) {
+ int i;
+
+ for (i = 0; i < npfd; i++) {
+ int revents = pollfds[i].revents;
+
+ if (revents) {
+ aio_add_ready_handler(ready_list, nodes[i], revents);
+ }
+ }
+ }
+
+ npfd = 0;
+ return ret;
+}
+
+static void fdmon_poll_update(AioContext *ctx,
+ AioHandler *old_node,
+ AioHandler *new_node)
+{
+ /* Do nothing, AioHandler already contains the state we'll need */
+}
+
+const FDMonOps fdmon_poll_ops = {
+ .update = fdmon_poll_update,
+ .wait = fdmon_poll_wait,
+ .need_wait = aio_poll_disabled,
+};
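Editor's note: add_pollfd() above reuses thread-local arrays across aio_poll() calls and grows them by doubling, freeing them only from a thread-exit notifier. A plain-C sketch of that growth pattern, using realloc() instead of g_renew() and skipping the cleanup notifier and allocation-failure handling:

    #include <poll.h>
    #include <stdio.h>
    #include <stdlib.h>

    static _Thread_local struct pollfd *pollfds;
    static _Thread_local unsigned npfd, nalloc;

    static void add_pollfd_sketch(int fd, short events)
    {
        if (npfd == nalloc) {
            nalloc = nalloc ? nalloc * 2 : 8;   /* start at 8, then double */
            pollfds = realloc(pollfds, nalloc * sizeof(*pollfds));
        }
        pollfds[npfd].fd = fd;
        pollfds[npfd].events = events;
        npfd++;
    }

    int main(void)
    {
        for (int fd = 0; fd < 20; fd++) {
            add_pollfd_sketch(fd, POLLIN);      /* grows the array as needed */
        }
        printf("npfd=%u nalloc=%u\n", npfd, nalloc);   /* npfd=20 nalloc=32 */

        npfd = 0;            /* arrays stay allocated for the next call */
        free(pollfds);       /* the real code frees them at thread exit */
        return 0;
    }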
diff --git a/util/trace-events b/util/trace-events
index 83b6639018..0ce42822eb 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -5,6 +5,8 @@ run_poll_handlers_begin(void *ctx, int64_t max_ns, int64_t timeout) "ctx %p max_
run_poll_handlers_end(void *ctx, bool progress, int64_t timeout) "ctx %p progress %d new timeout %"PRId64
poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
+poll_add(void *ctx, void *node, int fd, unsigned revents) "ctx %p node %p fd %d revents 0x%x"
+poll_remove(void *ctx, void *node, int fd) "ctx %p node %p fd %d"
# async.c
aio_co_schedule(void *ctx, void *co) "ctx %p co %p"
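Editor's note: the two trace-events lines added above correspond to trace_<name>() calls in aio-posix.c (trace_poll_remove() is visible in the remove_idle_poll_handlers() hunk); tracetool generates one such helper per line. A toy model in which a printf stub stands in for the generated helper, for illustration only:

    #include <stdio.h>

    /* Stand-in for the tracetool-generated helper declared from the
     * "poll_remove(void *ctx, void *node, int fd)" trace-events line. */
    static void trace_poll_remove(void *ctx, void *node, int fd)
    {
        printf("poll_remove ctx %p node %p fd %d\n", ctx, node, fd);
    }

    int main(void)
    {
        int dummy_ctx, dummy_node;

        /* Matches the call shape added in remove_idle_poll_handlers() */
        trace_poll_remove(&dummy_ctx, &dummy_node, 42);
        return 0;
    }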