5 files changed, 280 insertions, 0 deletions
diff --git a/block/Makefile.objs b/block/Makefile.objs
index 554f429d05..806e526eb4 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -12,3 +12,4 @@ block-obj-$(CONFIG_GLUSTERFS) += gluster.o
 
 common-obj-y += stream.o
 common-obj-y += commit.o
+common-obj-y += mirror.o
diff --git a/block/mirror.c b/block/mirror.c
new file mode 100644
index 0000000000..b353798e8f
--- /dev/null
+++ b/block/mirror.c
@@ -0,0 +1,235 @@
+/*
+ * Image mirroring
+ *
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ *  Paolo Bonzini  <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "trace.h"
+#include "blockjob.h"
+#include "block_int.h"
+#include "qemu/ratelimit.h"
+
+enum {
+    /*
+     * Size of the bounce buffer used to copy data from the source to the
+     * target.  It holds exactly one dirty-bitmap chunk, which is the most
+     * that a single mirror_iteration() call copies.
+     */
+    BLOCK_SIZE = 512 * BDRV_SECTORS_PER_DIRTY_CHUNK, /* in bytes */
+};
+
+#define SLICE_TIME 100000000ULL /* ns */
+
+typedef struct MirrorBlockJob {
+    BlockJob common;            /* generic job state: bs, len, offset, speed */
+    RateLimit limit;            /* configured by mirror_set_speed() */
+    BlockDriverState *target;   /* destination the dirty chunks are written to */
+    MirrorSyncMode mode;        /* how mirror_run() seeds the dirty bitmap */
+    int64_t sector_num;         /* first sector of the chunk being copied */
+    uint8_t *buf;               /* BLOCK_SIZE buffer allocated in mirror_run() */
+} MirrorBlockJob;
+
+static int coroutine_fn mirror_iteration(MirrorBlockJob *s)
+{
+    BlockDriverState *source = s->common.bs;
+    BlockDriverState *target = s->target;
+    QEMUIOVector qiov;
+    int ret, nb_sectors;
+    int64_t end;
+    struct iovec iov;
+
+    end = s->common.len >> BDRV_SECTOR_BITS;    /* job length in sectors */
+    s->sector_num = bdrv_get_next_dirty(source, s->sector_num);
+    nb_sectors = MIN(BDRV_SECTORS_PER_DIRTY_CHUNK, end - s->sector_num);
+    bdrv_reset_dirty(source, s->sector_num, nb_sectors);
+    /* The dirty bits are cleared before the copy starts.  Copy the chunk
+     * through s->buf; a failed read is returned without writing.  */
+    iov.iov_base = s->buf;
+    iov.iov_len  = nb_sectors * 512;
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    trace_mirror_one_iteration(s, s->sector_num, nb_sectors);
+    ret = bdrv_co_readv(source, s->sector_num, nb_sectors, &qiov);
+    if (ret < 0) {
+        return ret;
+    }
+    return bdrv_co_writev(target, s->sector_num, nb_sectors, &qiov);
+}
+
+/* Job coroutine: seed the dirty bitmap according to s->mode, then copy dirty
+ * chunks to the target until the job is cancelled or an error occurs.
+ */
+static void coroutine_fn mirror_run(void *opaque)
+{
+    MirrorBlockJob *s = opaque;
+    BlockDriverState *bs = s->common.bs;
+    int64_t sector_num, end;
+    int ret = 0;
+    int n;
+    bool synced = false;
+
+    if (block_job_is_cancelled(&s->common)) {
+        goto immediate_exit;
+    }
+
+    s->common.len = bdrv_getlength(bs);
+    if (s->common.len < 0) {
+        ret = s->common.len;
+        goto immediate_exit;    /* still release the target and tracking */
+    }
+
+    end = s->common.len >> BDRV_SECTOR_BITS;
+    s->buf = qemu_blockalign(bs, BLOCK_SIZE);
+
+    if (s->mode != MIRROR_SYNC_MODE_NONE) {
+        /* First part, loop on the sectors and initialize the dirty bitmap.  */
+        BlockDriverState *base;
+        base = s->mode == MIRROR_SYNC_MODE_FULL ? NULL : bs->backing_hd;
+        for (sector_num = 0; sector_num < end; ) {
+            int64_t next = (sector_num | (BDRV_SECTORS_PER_DIRTY_CHUNK - 1)) + 1;
+            ret = bdrv_co_is_allocated_above(bs, base,
+                                             sector_num, next - sector_num, &n);
+
+            if (ret < 0) {
+                goto immediate_exit;
+            }
+
+            assert(n > 0);
+            if (ret == 1) {
+                bdrv_set_dirty(bs, sector_num, n);
+                sector_num = next;
+            } else {
+                sector_num += n;
+            }
+        }
+    }
+
+    s->sector_num = -1;
+    for (;;) {
+        uint64_t delay_ns;
+        int64_t cnt;
+        bool should_complete;
+
+        cnt = bdrv_get_dirty_count(bs);
+        if (cnt != 0) {
+            ret = mirror_iteration(s);
+            if (ret < 0) {
+                goto immediate_exit;
+            }
+            cnt = bdrv_get_dirty_count(bs);
+        }
+
+        should_complete = false;
+        if (cnt == 0) {
+            trace_mirror_before_flush(s);
+            ret = bdrv_flush(s->target);
+            if (ret < 0) {
+                goto immediate_exit;
+            }
+
+            /* We're out of the streaming phase.  From now on, if the job
+             * is cancelled we will actually complete all pending I/O and
+             * report completion.  This way, block-job-cancel will leave
+             * the target in a consistent state.
+             */
+            synced = true;
+            s->common.offset = end * BDRV_SECTOR_SIZE;
+            should_complete = block_job_is_cancelled(&s->common);
+            cnt = bdrv_get_dirty_count(bs);
+        }
+
+        if (cnt == 0 && should_complete) {
+            /* The dirty bitmap is not updated while operations are pending,
+             * and the guest can submit I/O while this job runs.  Before
+             * exiting, wait for pending operations and re-read the dirty
+             * count, or we may exit while the source has data to copy!
+             */
+            trace_mirror_before_drain(s, cnt);
+            bdrv_drain_all();
+            cnt = bdrv_get_dirty_count(bs);
+        }
+
+        ret = 0;
+        trace_mirror_before_sleep(s, cnt, synced);
+        if (!synced) {
+            /* Publish progress */
+            s->common.offset = end * BDRV_SECTOR_SIZE - cnt * BLOCK_SIZE;
+
+            if (s->common.speed) {
+                delay_ns = ratelimit_calculate_delay(&s->limit, BDRV_SECTORS_PER_DIRTY_CHUNK);
+            } else {
+                delay_ns = 0;
+            }
+
+            /* Note that even when no rate limit is applied we need to yield
+             * with no pending I/O here so that qemu_aio_flush() returns.
+             */
+            block_job_sleep_ns(&s->common, rt_clock, delay_ns);
+            if (block_job_is_cancelled(&s->common)) {
+                break;
+            }
+        } else if (!should_complete) {
+            delay_ns = (cnt == 0 ? SLICE_TIME : 0);
+            block_job_sleep_ns(&s->common, rt_clock, delay_ns);
+        } else if (cnt == 0) {
+            /* The two disks are in sync.  Exit and report successful
+             * completion.
+             */
+            assert(QLIST_EMPTY(&bs->tracked_requests));
+            s->common.cancelled = false;
+            break;
+        }
+    }
+
+immediate_exit:
+    g_free(s->buf);
+    bdrv_set_dirty_tracking(bs, false);
+    bdrv_close(s->target);
+    bdrv_delete(s->target);
+    block_job_completed(&s->common, ret);
+}
+
+static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp)
+{
+    MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
+
+    if (speed >= 0) {           /* convert bytes to sectors for the limiter */
+        ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
+    } else {                    /* negative speeds are rejected */
+        error_set(errp, QERR_INVALID_PARAMETER, "speed");
+    }
+}
+
+static BlockJobType mirror_job_type = {
+    .instance_size = sizeof(MirrorBlockJob),   /* job state embeds BlockJob */
+    .job_type      = "mirror",
+    .set_speed     = mirror_set_speed,         /* rejects negative speeds */
+};
+
+void mirror_start(BlockDriverState *bs, BlockDriverState *target,
+                  int64_t speed, MirrorSyncMode mode,
+                  BlockDriverCompletionFunc *cb,
+                  void *opaque, Error **errp)
+{
+    MirrorBlockJob *s;
+
+    s = block_job_create(&mirror_job_type, bs, speed, cb, opaque, errp);
+    if (!s) {
+        return;                 /* job creation failed */
+    }
+
+    s->target = target;         /* closed and deleted by mirror_run() */
+    s->mode = mode;
+    bdrv_set_dirty_tracking(bs, true);  /* disabled again by mirror_run() */
+    bdrv_set_enable_write_cache(s->target, true);
+    s->common.co = qemu_coroutine_create(mirror_run);
+    trace_mirror_start(bs, s, s->common.co, opaque);
+    qemu_coroutine_enter(s->common.co, s);  /* s is mirror_run()'s opaque */
+}
diff --git a/block_int.h b/block_int.h
index f4bae04401..aaa46a83b0 100644
--- a/block_int.h
+++ b/block_int.h
@@ -331,4 +331,24 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,
                  BlockdevOnError on_error, BlockDriverCompletionFunc *cb,
                  void *opaque, Error **errp);
 
+/*
+ * mirror_start:
+ * @bs: Block device to operate on.
+ * @target: Block device to write to.
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @mode: Whether to collapse all images in the chain to the target.
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ * @errp: Error object.
+ *
+ * Start a mirroring operation on @bs.  Clusters that are allocated
+ * in @bs will be written to @target until the job is cancelled or
+ * manually completed.  At the end of a successful mirroring job,
+ * @bs will be switched to read from @target.
+ */
+void mirror_start(BlockDriverState *bs, BlockDriverState *target,
+                  int64_t speed, MirrorSyncMode mode,
+                  BlockDriverCompletionFunc *cb,
+                  void *opaque, Error **errp);
+
 #endif /* BLOCK_INT_H */
diff --git a/qapi-schema.json b/qapi-schema.json
index 37bbecab60..8c4b7c808b 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1166,6 +1166,23 @@
   'data': ['report', 'ignore', 'enospc', 'stop'] }
 
 ##
+# @MirrorSyncMode:
+#
+# An enumeration of possible behaviors for the initial synchronization
+# phase of storage mirroring.
+#
+# @top: copies data in the topmost image to the destination
+#
+# @full: copies data from all images to the destination
+#
+# @none: only copy data written from now on
+#
+# Since: 1.3
+##
+{ 'enum': 'MirrorSyncMode',
+  'data': ['top', 'full', 'none'] }
+
+##
 # @BlockJobInfo:
 #
 # Information about a long-running block device operation.
diff --git a/trace-events b/trace-events
index 9ab8e2781a..09b5d558c6 100644
--- a/trace-events
+++ b/trace-events
@@ -77,6 +77,13 @@ stream_start(void *bs, void *base, void *s, void *co, void *opaque) "bs %p base
 commit_one_iteration(void *s, int64_t sector_num, int nb_sectors, int is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d"
 commit_start(void *bs, void *base, void *top, void *s, void *co, void *opaque) "bs %p base %p top %p s %p co %p opaque %p"
 
+# block/mirror.c
+mirror_start(void *bs, void *s, void *co, void *opaque) "bs %p s %p co %p opaque %p"
+mirror_before_flush(void *s) "s %p"
+mirror_before_drain(void *s, int64_t cnt) "s %p dirty count %"PRId64
+mirror_before_sleep(void *s, int64_t cnt, int synced) "s %p dirty count %"PRId64" synced %d"
+mirror_one_iteration(void *s, int64_t sector_num, int nb_sectors) "s %p sector_num %"PRId64" nb_sectors %d"
+
 # blockdev.c
 qmp_block_job_cancel(void *job) "job %p"
 qmp_block_job_pause(void *job) "job %p"