23 files changed, 653 insertions, 377 deletions
diff --git a/include/block/aio.h b/include/block/aio.h
index b9fe2cb37e..c7ae27c91c 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -18,7 +18,6 @@
 #include "qemu/queue.h"
 #include "qemu/event_notifier.h"
 #include "qemu/thread.h"
-#include "qemu/rfifolock.h"
 #include "qemu/timer.h"
 
 typedef struct BlockAIOCB BlockAIOCB;
@@ -54,7 +53,7 @@ struct AioContext {
     GSource source;
 
     /* Protects all fields from multi-threaded access */
-    RFifoLock lock;
+    QemuRecMutex lock;
 
     /* The list of registered AIO handlers */
     QLIST_HEAD(, AioHandler) aio_handlers;
@@ -116,9 +115,6 @@ struct AioContext {
     bool notified;
     EventNotifier notifier;
 
-    /* Scheduling this BH forces the event loop it iterate */
-    QEMUBH *notify_dummy_bh;
-
     /* Thread pool for performing work and receiving completion callbacks */
     struct ThreadPool *thread_pool;
 
@@ -453,6 +449,24 @@ static inline bool aio_node_check(AioContext *ctx, bool is_external)
 }
 
 /**
+ * Return the AioContext whose event loop runs in the current thread.
+ *
+ * If called from an IOThread this will be the IOThread's AioContext.  If
+ * called from another thread it will be the main loop AioContext.
+ */
+AioContext *qemu_get_current_aio_context(void);
+
+/**
+ * @ctx: the aio context
+ *
+ * Return whether we are running in the I/O thread that manages @ctx.
+ */
+static inline bool aio_context_in_iothread(AioContext *ctx)
+{
+    return ctx == qemu_get_current_aio_context();
+}
+
+/**
  * aio_context_setup:
  * @ctx: the aio context
  *
diff --git a/include/block/block.h b/include/block/block.h
index 398a050176..49bb0b239a 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -7,16 +7,15 @@
 #include "qemu/coroutine.h"
 #include "block/accounting.h"
 #include "block/dirty-bitmap.h"
+#include "block/blockjob.h"
 #include "qapi/qmp/qobject.h"
 #include "qapi-types.h"
 #include "qemu/hbitmap.h"
 
 /* block.c */
 typedef struct BlockDriver BlockDriver;
-typedef struct BlockJob BlockJob;
 typedef struct BdrvChild BdrvChild;
 typedef struct BdrvChildRole BdrvChildRole;
-typedef struct BlockJobTxn BlockJobTxn;
 
 typedef struct BlockDriverInfo {
     /* in bytes, 0 if irrelevant */
@@ -218,7 +217,7 @@ BlockDriverState *bdrv_open(const char *filename, const char *reference,
 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                     BlockDriverState *bs,
                                     QDict *options, int flags);
-int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp);
+int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error **errp);
 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp);
 int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
                         BlockReopenQueue *queue, Error **errp);
@@ -332,8 +331,39 @@ int bdrv_flush_all(void);
 void bdrv_close_all(void);
 void bdrv_drain(BlockDriverState *bs);
 void coroutine_fn bdrv_co_drain(BlockDriverState *bs);
+void bdrv_drain_all_begin(void);
+void bdrv_drain_all_end(void);
 void bdrv_drain_all(void);
 
+#define BDRV_POLL_WHILE(bs, cond) ({                       \
+    bool waited_ = false;                                  \
+    BlockDriverState *bs_ = (bs);                          \
+    AioContext *ctx_ = bdrv_get_aio_context(bs_);          \
+    if (aio_context_in_iothread(ctx_)) {                   \
+        while ((cond)) {                                   \
+            aio_poll(ctx_, true);                          \
+            waited_ = true;                                \
+        }                                                  \
+    } else {                                               \
+        assert(qemu_get_current_aio_context() ==           \
+               qemu_get_aio_context());                    \
+        /* Ask bdrv_dec_in_flight to wake up the main      \
+         * QEMU AioContext.  Extra I/O threads never take  \
+         * other I/O threads' AioContexts (see for example \
+         * block_job_defer_to_main_loop for how to do it). \
+         */                                                \
+        assert(!bs_->wakeup);                              \
+        bs_->wakeup = true;                                \
+        while ((cond)) {                                   \
+            aio_context_release(ctx_);                     \
+            aio_poll(qemu_get_aio_context(), true);        \
+            aio_context_acquire(ctx_);                     \
+            waited_ = true;                                \
+        }                                                  \
+        bs_->wakeup = false;                               \
+    }                                                      \
+    waited_; })
+
 int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count);
 int bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, int count);
 int bdrv_has_zero_init_1(BlockDriverState *bs);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index e96e9ada57..b02abbd618 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -62,8 +62,6 @@
 enum BdrvTrackedRequestType {
     BDRV_TRACKED_READ,
     BDRV_TRACKED_WRITE,
-    BDRV_TRACKED_FLUSH,
-    BDRV_TRACKED_IOCTL,
     BDRV_TRACKED_DISCARD,
 };
 
@@ -445,7 +443,7 @@ struct BlockDriverState {
                          note this is a reference count */
 
     CoQueue flush_queue;            /* Serializing flush queue */
-    BdrvTrackedRequest *active_flush_req; /* Flush request in flight */
+    bool active_flush_req;          /* Flush request in flight? */
     unsigned int write_gen;         /* Current data generation */
     unsigned int flushed_gen;       /* Flushed write generation */
 
@@ -473,9 +471,12 @@ struct BlockDriverState {
     /* Callback before write request is processed */
     NotifierWithReturnList before_write_notifiers;
 
-    /* number of in-flight serialising requests */
+    /* number of in-flight requests; overall and serialising */
+    unsigned int in_flight;
     unsigned int serialising_in_flight;
 
+    bool wakeup;
+
     /* Offset after the highest byte written to */
     uint64_t wr_highest_offset;
 
@@ -634,6 +635,21 @@ void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
                                       void (*aio_context_detached)(void *),
                                       void *opaque);
 
+/**
+ * bdrv_wakeup:
+ * @bs: The BlockDriverState for which an I/O operation has been completed.
+ *
+ * Wake up the main thread if it is waiting on BDRV_POLL_WHILE.  During
+ * synchronous I/O on a BlockDriverState that is attached to another
+ * I/O thread, the main thread lets the I/O thread's event loop run,
+ * waiting for the I/O operation to complete.  A bdrv_wakeup will wake
+ * up the main thread if necessary.
+ *
+ * Manual calls to bdrv_wakeup are rarely necessary, because
+ * bdrv_dec_in_flight already calls it.
+ */
+void bdrv_wakeup(BlockDriverState *bs);
+
 #ifdef _WIN32
 int is_windows_drive(const char *filename);
 #endif
@@ -649,8 +665,6 @@ int is_windows_drive(const char *filename);
  * the new backing file if the job completes. Ignored if @base is %NULL.
  * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
  * @on_error: The action to take upon error.
- * @cb: Completion function for the job.
- * @opaque: Opaque pointer value passed to @cb.
  * @errp: Error object.
  *
  * Start a streaming operation on @bs.  Clusters that are unallocated
@@ -662,8 +676,7 @@ int is_windows_drive(const char *filename);
  */
 void stream_start(const char *job_id, BlockDriverState *bs,
                   BlockDriverState *base, const char *backing_file_str,
-                  int64_t speed, BlockdevOnError on_error,
-                  BlockCompletionFunc *cb, void *opaque, Error **errp);
+                  int64_t speed, BlockdevOnError on_error, Error **errp);
 
 /**
  * commit_start:
@@ -674,22 +687,22 @@ void stream_start(const char *job_id, BlockDriverState *bs,
  * @base: Block device that will be written into, and become the new top.
  * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
  * @on_error: The action to take upon error.
- * @cb: Completion function for the job.
- * @opaque: Opaque pointer value passed to @cb.
  * @backing_file_str: String to use as the backing file in @top's overlay
  * @errp: Error object.
  *
  */
 void commit_start(const char *job_id, BlockDriverState *bs,
                   BlockDriverState *base, BlockDriverState *top, int64_t speed,
-                  BlockdevOnError on_error, BlockCompletionFunc *cb,
-                  void *opaque, const char *backing_file_str, Error **errp);
+                  BlockdevOnError on_error, const char *backing_file_str,
+                  Error **errp);
 /**
  * commit_active_start:
  * @job_id: The id of the newly-created job, or %NULL to use the
  * device name of @bs.
  * @bs: Active block device to be committed.
  * @base: Block device that will be written into, and become the new top.
+ * @creation_flags: Flags that control the behavior of the Job lifetime.
+ *                  See @BlockJobCreateFlags
  * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
  * @on_error: The action to take upon error.
  * @cb: Completion function for the job.
@@ -699,8 +712,8 @@ void commit_start(const char *job_id, BlockDriverState *bs,
  *
  */
 void commit_active_start(const char *job_id, BlockDriverState *bs,
-                         BlockDriverState *base, int64_t speed,
-                         BlockdevOnError on_error,
+                         BlockDriverState *base, int creation_flags,
+                         int64_t speed, BlockdevOnError on_error,
                          BlockCompletionFunc *cb,
                          void *opaque, Error **errp, bool auto_complete);
 /*
@@ -719,8 +732,6 @@ void commit_active_start(const char *job_id, BlockDriverState *bs,
  * @on_source_error: The action to take upon error reading from the source.
  * @on_target_error: The action to take upon error writing to the target.
  * @unmap: Whether to unmap target where source sectors only contain zeroes.
- * @cb: Completion function for the job.
- * @opaque: Opaque pointer value passed to @cb.
  * @errp: Error object.
  *
  * Start a mirroring operation on @bs.  Clusters that are allocated
@@ -734,9 +745,7 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
                   MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
                   BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
-                  bool unmap,
-                  BlockCompletionFunc *cb,
-                  void *opaque, Error **errp);
+                  bool unmap, Error **errp);
 
 /*
  * backup_start:
@@ -749,6 +758,8 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
  * @sync_bitmap: The dirty bitmap if sync_mode is MIRROR_SYNC_MODE_INCREMENTAL.
  * @on_source_error: The action to take upon error reading from the source.
  * @on_target_error: The action to take upon error writing to the target.
+ * @creation_flags: Flags that control the behavior of the Job lifetime.
+ *                  See @BlockJobCreateFlags
  * @cb: Completion function for the job.
  * @opaque: Opaque pointer value passed to @cb.
  * @txn: Transaction that this job is part of (may be NULL).
@@ -762,6 +773,7 @@ void backup_start(const char *job_id, BlockDriverState *bs,
                   bool compress,
                   BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
+                  int creation_flags,
                   BlockCompletionFunc *cb, void *opaque,
                   BlockJobTxn *txn, Error **errp);
 
@@ -787,6 +799,9 @@ bool bdrv_requests_pending(BlockDriverState *bs);
 void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
 void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in);
 
+void bdrv_inc_in_flight(BlockDriverState *bs);
+void bdrv_dec_in_flight(BlockDriverState *bs);
+
 void blockdev_close_all_bdrv_states(void);
 
 #endif /* BLOCK_INT_H */
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index 4ddb4ae2e1..356cacf004 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -28,78 +28,15 @@
 
 #include "block/block.h"
 
-/**
- * BlockJobDriver:
- *
- * A class type for block job driver.
- */
-typedef struct BlockJobDriver {
-    /** Derived BlockJob struct size */
-    size_t instance_size;
-
-    /** String describing the operation, part of query-block-jobs QMP API */
-    BlockJobType job_type;
-
-    /** Optional callback for job types that support setting a speed limit */
-    void (*set_speed)(BlockJob *job, int64_t speed, Error **errp);
-
-    /** Optional callback for job types that need to forward I/O status reset */
-    void (*iostatus_reset)(BlockJob *job);
-
-    /**
-     * Optional callback for job types whose completion must be triggered
-     * manually.
-     */
-    void (*complete)(BlockJob *job, Error **errp);
-
-    /**
-     * If the callback is not NULL, it will be invoked when all the jobs
-     * belonging to the same transaction complete; or upon this job's
-     * completion if it is not in a transaction. Skipped if NULL.
-     *
-     * All jobs will complete with a call to either .commit() or .abort() but
-     * never both.
-     */
-    void (*commit)(BlockJob *job);
-
-    /**
-     * If the callback is not NULL, it will be invoked when any job in the
-     * same transaction fails; or upon this job's failure (due to error or
-     * cancellation) if it is not in a transaction. Skipped if NULL.
-     *
-     * All jobs will complete with a call to either .commit() or .abort() but
-     * never both.
-     */
-    void (*abort)(BlockJob *job);
-
-    /**
-     * If the callback is not NULL, it will be invoked when the job transitions
-     * into the paused state.  Paused jobs must not perform any asynchronous
-     * I/O or event loop activity.  This callback is used to quiesce jobs.
-     */
-    void coroutine_fn (*pause)(BlockJob *job);
-
-    /**
-     * If the callback is not NULL, it will be invoked when the job transitions
-     * out of the paused state.  Any asynchronous I/O or event loop activity
-     * should be restarted from this callback.
-     */
-    void coroutine_fn (*resume)(BlockJob *job);
-
-    /*
-     * If the callback is not NULL, it will be invoked before the job is
-     * resumed in a new AioContext.  This is the place to move any resources
-     * besides job->blk to the new AioContext.
-     */
-    void (*attached_aio_context)(BlockJob *job, AioContext *new_context);
-} BlockJobDriver;
+typedef struct BlockJobDriver BlockJobDriver;
+typedef struct BlockJobTxn BlockJobTxn;
 
 /**
  * BlockJob:
  *
  * Long-running operation on a BlockDriverState.
  */
-struct BlockJob {
+typedef struct BlockJob {
     /** The job type, including the job vtable.  */
     const BlockJobDriver *driver;
 
@@ -107,7 +44,7 @@ struct BlockJob {
     BlockBackend *blk;
 
     /**
-     * The ID of the block job.
+     * The ID of the block job. May be NULL for internal jobs.
      */
     char *id;
 
@@ -181,6 +118,9 @@ struct BlockJob {
     /** Block other operations when block job is running */
     Error *blocker;
 
+    /** BlockDriverStates that are involved in this block job */
+    GSList *nodes;
+
     /** The opaque value that is passed to the completion function.  */
     void *opaque;
 
@@ -198,7 +138,12 @@ struct BlockJob {
     /** Non-NULL if this job is part of a transaction */
     BlockJobTxn *txn;
     QLIST_ENTRY(BlockJob) txn_list;
-};
+} BlockJob;
+
+typedef enum BlockJobCreateFlags {
+    BLOCK_JOB_DEFAULT = 0x00,
+    BLOCK_JOB_INTERNAL = 0x01,
+} BlockJobCreateFlags;
 
 /**
  * block_job_next:
@@ -222,74 +167,15 @@ BlockJob *block_job_next(BlockJob *job);
 BlockJob *block_job_get(const char *id);
 
 /**
- * block_job_create:
- * @job_id: The id of the newly-created job, or %NULL to have one
- * generated automatically.
- * @job_type: The class object for the newly-created job.
- * @bs: The block
- * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
- * @cb: Completion function for the job.
- * @opaque: Opaque pointer value passed to @cb.
- * @errp: Error object.
- *
- * Create a new long-running block device job and return it.  The job
- * will call @cb asynchronously when the job completes.  Note that
- * @bs may have been closed at the time the @cb it is called.  If
- * this is the case, the job may be reported as either cancelled or
- * completed.
- *
- * This function is not part of the public job interface; it should be
- * called from a wrapper that is specific to the job type.
- */
-void *block_job_create(const char *job_id, const BlockJobDriver *driver,
-                       BlockDriverState *bs, int64_t speed,
-                       BlockCompletionFunc *cb, void *opaque, Error **errp);
-
-/**
- * block_job_sleep_ns:
- * @job: The job that calls the function.
- * @clock: The clock to sleep on.
- * @ns: How many nanoseconds to stop for.
- *
- * Put the job to sleep (assuming that it wasn't canceled) for @ns
- * nanoseconds.  Canceling the job will interrupt the wait immediately.
- */
-void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns);
-
-/**
- * block_job_yield:
- * @job: The job that calls the function.
- *
- * Yield the block job coroutine.
- */
-void block_job_yield(BlockJob *job);
-
-/**
- * block_job_ref:
- * @bs: The block device.
- *
- * Grab a reference to the block job. Should be paired with block_job_unref.
- */
-void block_job_ref(BlockJob *job);
-
-/**
- * block_job_unref:
- * @bs: The block device.
- *
- * Release reference to the block job and release resources if it is the last
- * reference.
- */
-void block_job_unref(BlockJob *job);
-
-/**
- * block_job_completed:
- * @job: The job being completed.
- * @ret: The status code.
+ * block_job_add_bdrv:
+ * @job: A block job
+ * @bs: A BlockDriverState that is involved in @job
  *
- * Call the completion function that was registered at creation time, and
- * free @job.
+ * Add @bs to the list of BlockDriverState that are involved in
+ * @job. This means that all operations will be blocked on @bs while
+ * @job exists.
  */
-void block_job_completed(BlockJob *job, int ret);
+void block_job_add_bdrv(BlockJob *job, BlockDriverState *bs);
 
 /**
  * block_job_set_speed:
@@ -320,29 +206,12 @@ void block_job_cancel(BlockJob *job);
 void block_job_complete(BlockJob *job, Error **errp);
 
 /**
- * block_job_is_cancelled:
- * @job: The job being queried.
- *
- * Returns whether the job is scheduled for cancellation.
- */
-bool block_job_is_cancelled(BlockJob *job);
-
-/**
  * block_job_query:
  * @job: The job to get information about.
  *
  * Return information about a job.
  */
-BlockJobInfo *block_job_query(BlockJob *job);
-
-/**
- * block_job_pause_point:
- * @job: The job that is ready to pause.
- *
- * Pause now if block_job_pause() has been called.  Block jobs that perform
- * lots of I/O must call this between requests so that the job can be paused.
- */
-void coroutine_fn block_job_pause_point(BlockJob *job);
+BlockJobInfo *block_job_query(BlockJob *job, Error **errp);
 
 /**
  * block_job_pause:
@@ -353,45 +222,38 @@ void coroutine_fn block_job_pause_point(BlockJob *job);
 void block_job_pause(BlockJob *job);
 
 /**
- * block_job_resume:
- * @job: The job to be resumed.
- *
- * Resume the specified job.  Must be paired with a preceding block_job_pause.
- */
-void block_job_resume(BlockJob *job);
-
-/**
- * block_job_enter:
- * @job: The job to enter.
+ * block_job_user_pause:
+ * @job: The job to be paused.
  *
- * Continue the specified job by entering the coroutine.
+ * Asynchronously pause the specified job.
+ * Do not allow a resume until a matching call to block_job_user_resume.
  */
-void block_job_enter(BlockJob *job);
+void block_job_user_pause(BlockJob *job);
 
 /**
- * block_job_event_cancelled:
- * @job: The job whose information is requested.
+ * block_job_user_paused:
+ * @job: The job to query.
  *
- * Send a BLOCK_JOB_CANCELLED event for the specified job.
+ * Returns true if the job is user-paused.
  */
-void block_job_event_cancelled(BlockJob *job);
+bool block_job_user_paused(BlockJob *job);
 
 /**
- * block_job_ready:
- * @job: The job which is now ready to complete.
- * @msg: Error message. Only present on failure.
+ * block_job_resume:
+ * @job: The job to be resumed.
  *
- * Send a BLOCK_JOB_COMPLETED event for the specified job.
+ * Resume the specified job.  Must be paired with a preceding block_job_pause.
  */
-void block_job_event_completed(BlockJob *job, const char *msg);
+void block_job_resume(BlockJob *job);
 
 /**
- * block_job_ready:
- * @job: The job which is now ready to complete.
+ * block_job_user_resume:
+ * @job: The job to be resumed.
  *
- * Send a BLOCK_JOB_READY event for the specified job.
+ * Resume the specified job.
+ * Must be paired with a preceding block_job_user_pause.
  */
-void block_job_event_ready(BlockJob *job);
+void block_job_user_resume(BlockJob *job);
 
 /**
  * block_job_cancel_sync:
@@ -439,37 +301,6 @@ int block_job_complete_sync(BlockJob *job, Error **errp);
 void block_job_iostatus_reset(BlockJob *job);
 
 /**
- * block_job_error_action:
- * @job: The job to signal an error for.
- * @on_err: The error action setting.
- * @is_read: Whether the operation was a read.
- * @error: The error that was reported.
- *
- * Report an I/O error for a block job and possibly stop the VM.  Return the
- * action that was selected based on @on_err and @error.
- */
-BlockErrorAction block_job_error_action(BlockJob *job, BlockdevOnError on_err,
-                                        int is_read, int error);
-
-typedef void BlockJobDeferToMainLoopFn(BlockJob *job, void *opaque);
-
-/**
- * block_job_defer_to_main_loop:
- * @job: The job
- * @fn: The function to run in the main loop
- * @opaque: The opaque value that is passed to @fn
- *
- * Execute a given function in the main loop with the BlockDriverState
- * AioContext acquired.  Block jobs must call bdrv_unref(), bdrv_close(), and
- * anything that uses bdrv_drain_all() in the main loop.
- *
- * The @job AioContext is held while @fn executes.
- */
-void block_job_defer_to_main_loop(BlockJob *job,
-                                  BlockJobDeferToMainLoopFn *fn,
-                                  void *opaque);
-
-/**
  * block_job_txn_new:
  *
  * Allocate and return a new block job transaction.  Jobs can be added to the
@@ -504,4 +335,12 @@ void block_job_txn_unref(BlockJobTxn *txn);
  */
 void block_job_txn_add_job(BlockJobTxn *txn, BlockJob *job);
 
+/**
+ * block_job_is_internal:
+ * @job: The job to determine if it is user-visible or not.
+ *
+ * Returns true if the job should not be visible to the management layer.
+ */
+bool block_job_is_internal(BlockJob *job);
+
 #endif
diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
new file mode 100644
index 0000000000..40275e4437
--- /dev/null
+++ b/include/block/blockjob_int.h
@@ -0,0 +1,239 @@
+/*
+ * Declarations for long-running block device operations
+ *
+ * Copyright (c) 2011 IBM Corp.
+ * Copyright (c) 2012 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef BLOCKJOB_INT_H
+#define BLOCKJOB_INT_H
+
+#include "block/blockjob.h"
+#include "block/block.h"
+
+/**
+ * BlockJobDriver:
+ *
+ * A class type for block job driver.
+ */
+struct BlockJobDriver {
+    /** Derived BlockJob struct size */
+    size_t instance_size;
+
+    /** String describing the operation, part of query-block-jobs QMP API */
+    BlockJobType job_type;
+
+    /** Optional callback for job types that support setting a speed limit */
+    void (*set_speed)(BlockJob *job, int64_t speed, Error **errp);
+
+    /** Optional callback for job types that need to forward I/O status reset */
+    void (*iostatus_reset)(BlockJob *job);
+
+    /**
+     * Optional callback for job types whose completion must be triggered
+     * manually.
+     */
+    void (*complete)(BlockJob *job, Error **errp);
+
+    /**
+     * If the callback is not NULL, it will be invoked when all the jobs
+     * belonging to the same transaction complete; or upon this job's
+     * completion if it is not in a transaction. Skipped if NULL.
+     *
+     * All jobs will complete with a call to either .commit() or .abort() but
+     * never both.
+     */
+    void (*commit)(BlockJob *job);
+
+    /**
+     * If the callback is not NULL, it will be invoked when any job in the
+     * same transaction fails; or upon this job's failure (due to error or
+     * cancellation) if it is not in a transaction. Skipped if NULL.
+     *
+     * All jobs will complete with a call to either .commit() or .abort() but
+     * never both.
+     */
+    void (*abort)(BlockJob *job);
+
+    /**
+     * If the callback is not NULL, it will be invoked when the job transitions
+     * into the paused state.  Paused jobs must not perform any asynchronous
+     * I/O or event loop activity.  This callback is used to quiesce jobs.
+     */
+    void coroutine_fn (*pause)(BlockJob *job);
+
+    /**
+     * If the callback is not NULL, it will be invoked when the job transitions
+     * out of the paused state.  Any asynchronous I/O or event loop activity
+     * should be restarted from this callback.
+     */
+    void coroutine_fn (*resume)(BlockJob *job);
+
+    /*
+     * If the callback is not NULL, it will be invoked before the job is
+     * resumed in a new AioContext.  This is the place to move any resources
+     * besides job->blk to the new AioContext.
+     */
+    void (*attached_aio_context)(BlockJob *job, AioContext *new_context);
+
+    /*
+     * If the callback is not NULL, it will be invoked when the job has to be
+     * synchronously cancelled or completed; it should drain BlockDriverStates
+     * as required to ensure progress.
+     */
+    void (*drain)(BlockJob *job);
+};
+
+/**
+ * block_job_create:
+ * @job_id: The id of the newly-created job, or %NULL to auto-generate one.
+ * @driver: The class object for the newly-created job.
+ * @bs: The block device.
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @flags: Creation flags for the job.  See @BlockJobCreateFlags
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ * @errp: Error object.
+ *
+ * Create a new long-running block device job and return it.  The job
+ * will call @cb asynchronously when the job completes.  Note that
+ * @bs may have been closed at the time @cb is called.  If
+ * this is the case, the job may be reported as either cancelled or
+ * completed.
+ *
+ * This function is not part of the public job interface; it should be
+ * called from a wrapper that is specific to the job type.
+ */
+void *block_job_create(const char *job_id, const BlockJobDriver *driver,
+                       BlockDriverState *bs, int64_t speed, int flags,
+                       BlockCompletionFunc *cb, void *opaque, Error **errp);
+
+/**
+ * block_job_sleep_ns:
+ * @job: The job that calls the function.
+ * @clock: The clock to sleep on.
+ * @ns: How many nanoseconds to stop for.
+ *
+ * Put the job to sleep (assuming that it wasn't canceled) for @ns
+ * nanoseconds.  Canceling the job will interrupt the wait immediately.
+ */
+void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns);
+
+/**
+ * block_job_yield:
+ * @job: The job that calls the function.
+ *
+ * Yield the block job coroutine.
+ */
+void block_job_yield(BlockJob *job);
+
+/**
+ * block_job_ref:
+ * @job: The block job.
+ *
+ * Grab a reference to the block job. Should be paired with block_job_unref.
+ */
+void block_job_ref(BlockJob *job);
+
+/**
+ * block_job_unref:
+ * @job: The block job.
+ *
+ * Release reference to the block job and release resources if it is the last
+ * reference.
+ */
+void block_job_unref(BlockJob *job);
+
+/**
+ * block_job_completed:
+ * @job: The job being completed.
+ * @ret: The status code.
+ *
+ * Call the completion function that was registered at creation time, and
+ * free @job.
+ */
+void block_job_completed(BlockJob *job, int ret);
+
+/**
+ * block_job_is_cancelled:
+ * @job: The job being queried.
+ *
+ * Returns whether the job is scheduled for cancellation.
+ */
+bool block_job_is_cancelled(BlockJob *job);
+
+/**
+ * block_job_pause_point:
+ * @job: The job that is ready to pause.
+ *
+ * Pause now if block_job_pause() has been called.  Block jobs that perform
+ * lots of I/O must call this between requests so that the job can be paused.
+ */
+void coroutine_fn block_job_pause_point(BlockJob *job);
+
+/**
+ * block_job_enter:
+ * @job: The job to enter.
+ *
+ * Continue the specified job by entering the coroutine.
+ */
+void block_job_enter(BlockJob *job);
+
+/**
+ * block_job_event_ready:
+ * @job: The job which is now ready to be completed.
+ *
+ * Send a BLOCK_JOB_READY event for the specified job.
+ */
+void block_job_event_ready(BlockJob *job);
+
+/**
+ * block_job_error_action:
+ * @job: The job to signal an error for.
+ * @on_err: The error action setting.
+ * @is_read: Whether the operation was a read.
+ * @error: The error that was reported.
+ *
+ * Report an I/O error for a block job and possibly stop the VM.  Return the
+ * action that was selected based on @on_err and @error.
+ */
+BlockErrorAction block_job_error_action(BlockJob *job, BlockdevOnError on_err,
+                                        int is_read, int error);
+
+typedef void BlockJobDeferToMainLoopFn(BlockJob *job, void *opaque);
+
+/**
+ * block_job_defer_to_main_loop:
+ * @job: The job
+ * @fn: The function to run in the main loop
+ * @opaque: The opaque value that is passed to @fn
+ *
+ * Execute a given function in the main loop with the BlockDriverState
+ * AioContext acquired.  Block jobs must call bdrv_unref(), bdrv_close(), and
+ * anything that uses bdrv_drain_all() in the main loop.
+ *
+ * The @job AioContext is held while @fn executes.
+ */
+void block_job_defer_to_main_loop(BlockJob *job,
+                                  BlockJobDeferToMainLoopFn *fn,
+                                  void *opaque);
+
+#endif
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index cb624e4acc..a8c13cee66 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -316,6 +316,7 @@ static inline void tb_set_jmp_target(TranslationBlock *tb,
 
 #endif
 
+/* Called with tb_lock held.  */
 static inline void tb_add_jump(TranslationBlock *tb, int n,
                                TranslationBlock *tb_next)
 {
@@ -369,6 +370,7 @@ void tlb_fill(CPUState *cpu, target_ulong addr, MMUAccessType access_type,
 #if defined(CONFIG_USER_ONLY)
 void mmap_lock(void);
 void mmap_unlock(void);
+bool have_mmap_lock(void);
 
 static inline tb_page_addr_t get_page_addr_code(CPUArchState *env1, target_ulong addr)
 {
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 79ccaaba1f..9728a2fb1a 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -209,7 +209,7 @@ struct MemoryRegion {
     void (*destructor)(MemoryRegion *mr);
     uint64_t align;
     bool terminates;
-    bool skip_dump;
+    bool ram_device;
     bool enabled;
     bool warning_printed; /* For reservations */
     uint8_t vga_logging_count;
@@ -449,6 +449,30 @@ void memory_region_init_ram_ptr(MemoryRegion *mr,
                                 void *ptr);
 
 /**
+ * memory_region_init_ram_device_ptr:  Initialize RAM device memory region from
+ *                                     a user-provided pointer.
+ *
+ * A RAM device represents a mapping to a physical device, such as to a PCI
+ * MMIO BAR of a vfio-pci assigned device.  The memory region may be mapped
+ * into the VM address space and access to the region will modify memory
+ * directly.  However, the memory region should not be included in a memory
+ * dump (device may not be enabled/mapped at the time of the dump), and
+ * operations incompatible with manipulating MMIO should be avoided.  Replaces
+ * skip_dump flag.
+ *
+ * @mr: the #MemoryRegion to be initialized.
+ * @owner: the object that tracks the region's reference count
+ * @name: the name of the region.
+ * @size: size of the region.
+ * @ptr: memory to be mapped; must contain at least @size bytes.
+ */
+void memory_region_init_ram_device_ptr(MemoryRegion *mr,
+                                       struct Object *owner,
+                                       const char *name,
+                                       uint64_t size,
+                                       void *ptr);
+
+/**
  * memory_region_init_alias: Initialize a memory region that aliases all or a
  *                           part of another memory region.
  *
@@ -574,22 +598,13 @@ static inline bool memory_region_is_ram(MemoryRegion *mr)
 }
 
 /**
- * memory_region_is_skip_dump: check whether a memory region should not be
- *                             dumped
- *
- * Returns %true is a memory region should not be dumped(e.g. VFIO BAR MMAP).
+ * memory_region_is_ram_device: check whether a memory region is a ram device
  *
- * @mr: the memory region being queried
- */
-bool memory_region_is_skip_dump(MemoryRegion *mr);
-
-/**
- * memory_region_set_skip_dump: Set skip_dump flag, dump will ignore this memory
- *                              region
+ * Returns %true if a memory region is a device backed ram region
  *
  * @mr: the memory region being queried
  */
-void memory_region_set_skip_dump(MemoryRegion *mr);
+bool memory_region_is_ram_device(MemoryRegion *mr);
 
 /**
  * memory_region_is_romd: check whether a memory region is in ROMD mode
@@ -1465,9 +1480,11 @@ void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr);
 static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write)
 {
     if (is_write) {
-        return memory_region_is_ram(mr) && !mr->readonly;
+        return memory_region_is_ram(mr) &&
+               !mr->readonly && !memory_region_is_ram_device(mr);
     } else {
-        return memory_region_is_ram(mr) || memory_region_is_romd(mr);
+        return (memory_region_is_ram(mr) && !memory_region_is_ram_device(mr)) ||
+               memory_region_is_romd(mr);
     }
 }
 
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 17fff80c8a..98dc7722c3 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -13,7 +13,6 @@
 #include "qemu/bitmap.h"
 #include "sysemu/sysemu.h"
 #include "hw/pci/pci.h"
-#include "hw/boards.h"
 #include "hw/compat.h"
 #include "hw/mem/pc-dimm.h"
 #include "hw/mem/nvdimm.h"
diff --git a/include/hw/xen/xen_backend.h b/include/hw/xen/xen_backend.h
index 0df282ab5f..cbda40ee53 100644
--- a/include/hw/xen/xen_backend.h
+++ b/include/hw/xen/xen_backend.h
@@ -2,60 +2,10 @@
 #define QEMU_HW_XEN_BACKEND_H
 
 #include "hw/xen/xen_common.h"
+#include "hw/xen/xen_pvdev.h"
 #include "sysemu/sysemu.h"
 #include "net/net.h"
 
-/* ------------------------------------------------------------- */
-
-#define XEN_BUFSIZE 1024
-
-struct XenDevice;
-
-/* driver uses grant tables  ->  open gntdev device (xendev->gnttabdev) */
-#define DEVOPS_FLAG_NEED_GNTDEV   1
-/* don't expect frontend doing correct state transitions (aka console quirk) */
-#define DEVOPS_FLAG_IGNORE_STATE  2
-
-struct XenDevOps {
-    size_t    size;
-    uint32_t  flags;
-    void      (*alloc)(struct XenDevice *xendev);
-    int       (*init)(struct XenDevice *xendev);
-    int       (*initialise)(struct XenDevice *xendev);
-    void      (*connected)(struct XenDevice *xendev);
-    void      (*event)(struct XenDevice *xendev);
-    void      (*disconnect)(struct XenDevice *xendev);
-    int       (*free)(struct XenDevice *xendev);
-    void      (*backend_changed)(struct XenDevice *xendev, const char *node);
-    void      (*frontend_changed)(struct XenDevice *xendev, const char *node);
-    int       (*backend_register)(void);
-};
-
-struct XenDevice {
-    const char         *type;
-    int                dom;
-    int                dev;
-    char               name[64];
-    int                debug;
-
-    enum xenbus_state  be_state;
-    enum xenbus_state  fe_state;
-    int                online;
-    char               be[XEN_BUFSIZE];
-    char               *fe;
-    char               *protocol;
-    int                remote_port;
-    int                local_port;
-
-    xenevtchn_handle   *evtchndev;
-    xengnttab_handle   *gnttabdev;
-
-    struct XenDevOps   *ops;
-    QTAILQ_ENTRY(XenDevice) next;
-};
-
-/* ------------------------------------------------------------- */
-
 /* variables */
 extern xc_interface *xen_xc;
 extern xenforeignmemory_handle *xen_fmem;
@@ -63,26 +13,20 @@ extern struct xs_handle *xenstore;
 extern const char *xen_protocol;
 extern DeviceState *xen_sysdev;
 
-/* xenstore helper functions */
 int xenstore_mkdir(char *path, int p);
-int xenstore_write_str(const char *base, const char *node, const char *val);
-int xenstore_write_int(const char *base, const char *node, int ival);
-int xenstore_write_int64(const char *base, const char *node, int64_t ival);
-char *xenstore_read_str(const char *base, const char *node);
-int xenstore_read_int(const char *base, const char *node, int *ival);
-
 int xenstore_write_be_str(struct XenDevice *xendev, const char *node, const char *val);
 int xenstore_write_be_int(struct XenDevice *xendev, const char *node, int ival);
 int xenstore_write_be_int64(struct XenDevice *xendev, const char *node, int64_t ival);
 char *xenstore_read_be_str(struct XenDevice *xendev, const char *node);
 int xenstore_read_be_int(struct XenDevice *xendev, const char *node, int *ival);
+void xenstore_update_fe(char *watch, struct XenDevice *xendev);
+void xenstore_update_be(char *watch, char *type, int dom,
+                        struct XenDevOps *ops);
 char *xenstore_read_fe_str(struct XenDevice *xendev, const char *node);
 int xenstore_read_fe_int(struct XenDevice *xendev, const char *node, int *ival);
-int xenstore_read_uint64(const char *base, const char *node, uint64_t *uval);
-int xenstore_read_fe_uint64(struct XenDevice *xendev, const char *node, uint64_t *uval);
+int xenstore_read_fe_uint64(struct XenDevice *xendev, const char *node,
+                            uint64_t *uval);
 
-const char *xenbus_strstate(enum xenbus_state state);
-struct XenDevice *xen_be_find_xendev(const char *type, int dom, int dev);
 void xen_be_check_state(struct XenDevice *xendev);
 
 /* xen backend driver bits */
@@ -91,10 +35,6 @@ void xen_be_register_common(void);
 int xen_be_register(const char *type, struct XenDevOps *ops);
 int xen_be_set_state(struct XenDevice *xendev, enum xenbus_state state);
 int xen_be_bind_evtchn(struct XenDevice *xendev);
-void xen_be_unbind_evtchn(struct XenDevice *xendev);
-int xen_be_send_notify(struct XenDevice *xendev);
-void xen_be_printf(struct XenDevice *xendev, int msg_level, const char *fmt, ...)
-    GCC_FMT_ATTR(3, 4);
 
 /* actual backend drivers */
 extern struct XenDevOps xen_console_ops;      /* xen_console.c     */
diff --git a/include/hw/xen/xen_pvdev.h b/include/hw/xen/xen_pvdev.h
new file mode 100644
index 0000000000..083f0a9cc7
--- /dev/null
+++ b/include/hw/xen/xen_pvdev.h
@@ -0,0 +1,78 @@
+#ifndef QEMU_HW_XEN_PVDEV_H
+#define QEMU_HW_XEN_PVDEV_H
+
+#include "hw/xen/xen_common.h"
+/* ------------------------------------------------------------- */
+
+#define XEN_BUFSIZE 1024
+
+struct XenDevice;
+
+/* driver uses grant tables  ->  open gntdev device (xendev->gnttabdev) */
+#define DEVOPS_FLAG_NEED_GNTDEV   1
+/* don't expect frontend doing correct state transitions (aka console quirk) */
+#define DEVOPS_FLAG_IGNORE_STATE  2
+
+struct XenDevOps {
+    size_t    size;
+    uint32_t  flags;
+    void      (*alloc)(struct XenDevice *xendev);
+    int       (*init)(struct XenDevice *xendev);
+    int       (*initialise)(struct XenDevice *xendev);
+    void      (*connected)(struct XenDevice *xendev);
+    void      (*event)(struct XenDevice *xendev);
+    void      (*disconnect)(struct XenDevice *xendev);
+    int       (*free)(struct XenDevice *xendev);
+    void      (*backend_changed)(struct XenDevice *xendev, const char *node);
+    void      (*frontend_changed)(struct XenDevice *xendev, const char *node);
+    int       (*backend_register)(void);
+};
+
+struct XenDevice {
+    const char         *type;
+    int                dom;
+    int                dev;
+    char               name[64];
+    int                debug;
+
+    enum xenbus_state  be_state;
+    enum xenbus_state  fe_state;
+    int                online;
+    char               be[XEN_BUFSIZE];
+    char               *fe;
+    char               *protocol;
+    int                remote_port;
+    int                local_port;
+
+    xenevtchn_handle   *evtchndev;
+    xengnttab_handle   *gnttabdev;
+
+    struct XenDevOps   *ops;
+    QTAILQ_ENTRY(XenDevice) next;
+};
+
+/* ------------------------------------------------------------- */
+
+/* xenstore helper functions */
+int xenstore_write_str(const char *base, const char *node, const char *val);
+int xenstore_write_int(const char *base, const char *node, int ival);
+int xenstore_write_int64(const char *base, const char *node, int64_t ival);
+char *xenstore_read_str(const char *base, const char *node);
+int xenstore_read_int(const char *base, const char *node, int *ival);
+int xenstore_read_uint64(const char *base, const char *node, uint64_t *uval);
+void xenstore_update(void *unused);
+
+const char *xenbus_strstate(enum xenbus_state state);
+
+void xen_pv_evtchn_event(void *opaque);
+void xen_pv_insert_xendev(struct XenDevice *xendev);
+void xen_pv_del_xendev(struct XenDevice *xendev);
+struct XenDevice *xen_pv_find_xendev(const char *type, int dom, int dev);
+
+void xen_pv_unbind_evtchn(struct XenDevice *xendev);
+int xen_pv_send_notify(struct XenDevice *xendev);
+
+void xen_pv_printf(struct XenDevice *xendev, int msg_level,
+                   const char *fmt, ...)  GCC_FMT_ATTR(3, 4);
+
+#endif /* QEMU_HW_XEN_PVDEV_H */
diff --git a/include/migration/colo.h b/include/migration/colo.h
new file mode 100644
index 0000000000..e32eef4763
--- /dev/null
+++ b/include/migration/colo.h
@@ -0,0 +1,38 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_COLO_H
+#define QEMU_COLO_H
+
+#include "qemu-common.h"
+#include "migration/migration.h"
+#include "qemu/coroutine_int.h"
+#include "qemu/thread.h"
+#include "qemu/main-loop.h"
+
+bool colo_supported(void);
+void colo_info_init(void);
+
+void migrate_start_colo_process(MigrationState *s);
+bool migration_in_colo_state(void);
+
+/* loadvm */
+bool migration_incoming_enable_colo(void);
+void migration_incoming_exit_colo(void);
+void *colo_process_incoming_thread(void *opaque);
+bool migration_incoming_in_colo_state(void);
+
+COLOMode get_colo_mode(void);
+
+/* failover */
+void colo_do_failover(MigrationState *s);
+#endif
diff --git a/include/migration/failover.h b/include/migration/failover.h
new file mode 100644
index 0000000000..ad91ef2381
--- /dev/null
+++ b/include/migration/failover.h
@@ -0,0 +1,26 @@
+/*
+ *  COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ *  (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO.,LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_FAILOVER_H
+#define QEMU_FAILOVER_H
+
+#include "qemu-common.h"
+#include "qapi-types.h"
+
+void failover_init_state(void);
+FailoverStatus failover_set_state(FailoverStatus old_state,
+                                     FailoverStatus new_state);
+FailoverStatus failover_get_state(void);
+void failover_request_active(Error **errp);
+bool failover_request_is_active(void);
+
+#endif
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 2791b90c00..c309d23370 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -21,6 +21,7 @@
 #include "migration/vmstate.h"
 #include "qapi-types.h"
 #include "exec/cpu-common.h"
+#include "qemu/coroutine_int.h"
 
 #define QEMU_VM_FILE_MAGIC           0x5145564d
 #define QEMU_VM_FILE_VERSION_COMPAT  0x00000002
@@ -107,6 +108,12 @@ struct MigrationIncomingState {
     QEMUBH *bh;
 
     int state;
+
+    bool have_colo_incoming_thread;
+    QemuThread colo_incoming_thread;
+    /* The coroutine we should enter (back) after failover */
+    Coroutine *migration_incoming_co;
+
     /* See savevm.c */
     LoadStateEntry_Head loadvm_handlers;
 };
@@ -298,6 +305,7 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
 
 int migrate_use_xbzrle(void);
 int64_t migrate_xbzrle_cache_size(void);
+bool migrate_colo_enabled(void);
 
 int64_t xbzrle_cache_resize(int64_t new_size);
 
diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h
index a714d8ef80..8cc532ec0e 100644
--- a/include/monitor/monitor.h
+++ b/include/monitor/monitor.h
@@ -9,7 +9,7 @@
 extern Monitor *cur_mon;
 
 /* flags for monitor_init */
-#define MONITOR_IS_DEFAULT    0x01
+/* 0x01 unused */
 #define MONITOR_USE_READLINE  0x02
 #define MONITOR_USE_CONTROL   0x04
 #define MONITOR_USE_PRETTY    0x08
diff --git a/include/qemu/log.h b/include/qemu/log.h
index 00bf37fc0f..a50e994c21 100644
--- a/include/qemu/log.h
+++ b/include/qemu/log.h
@@ -51,6 +51,22 @@ static inline bool qemu_loglevel_mask(int mask)
     return (qemu_loglevel & mask) != 0;
 }
 
+/* Lock output for a series of related logs.  Since this is not needed
+ * for a single qemu_log / qemu_log_mask / qemu_log_mask_and_addr, we
+ * assume that qemu_loglevel_mask has already been tested, and that
+ * qemu_loglevel is never set when qemu_logfile is unset.
+ */
+
+static inline void qemu_log_lock(void)
+{
+    qemu_flockfile(qemu_logfile);
+}
+
+static inline void qemu_log_unlock(void)
+{
+    qemu_funlockfile(qemu_logfile);
+}
+
 /* Logging functions: */
 
 /* main logging function
diff --git a/include/qemu/rfifolock.h b/include/qemu/rfifolock.h
deleted file mode 100644
index b23ab538a6..0000000000
--- a/include/qemu/rfifolock.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Recursive FIFO lock
- *
- * Copyright Red Hat, Inc. 2013
- *
- * Authors:
- *  Stefan Hajnoczi   <stefanha@redhat.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- *
- */
-
-#ifndef QEMU_RFIFOLOCK_H
-#define QEMU_RFIFOLOCK_H
-
-#include "qemu/thread.h"
-
-/* Recursive FIFO lock
- *
- * This lock provides more features than a plain mutex:
- *
- * 1. Fairness - enforces FIFO order.
- * 2. Nesting - can be taken recursively.
- * 3. Contention callback - optional, called when thread must wait.
- *
- * The recursive FIFO lock is heavyweight so prefer other synchronization
- * primitives if you do not need its features.
- */
-typedef struct {
-    QemuMutex lock;             /* protects all fields */
-
-    /* FIFO order */
-    unsigned int head;          /* active ticket number */
-    unsigned int tail;          /* waiting ticket number */
-    QemuCond cond;              /* used to wait for our ticket number */
-
-    /* Nesting */
-    QemuThread owner_thread;    /* thread that currently has ownership */
-    unsigned int nesting;       /* amount of nesting levels */
-
-    /* Contention callback */
-    void (*cb)(void *);         /* called when thread must wait, with ->lock
-                                 * held so it may not recursively lock/unlock
-                                 */
-    void *cb_opaque;
-} RFifoLock;
-
-void rfifolock_init(RFifoLock *r, void (*cb)(void *), void *opaque);
-void rfifolock_destroy(RFifoLock *r);
-void rfifolock_lock(RFifoLock *r);
-void rfifolock_unlock(RFifoLock *r);
-
-#endif /* QEMU_RFIFOLOCK_H */
diff --git a/include/qemu/sockets.h b/include/qemu/sockets.h
index 9eb24707df..5589e6842b 100644
--- a/include/qemu/sockets.h
+++ b/include/qemu/sockets.h
@@ -34,6 +34,8 @@ typedef void NonBlockingConnectHandler(int fd, Error *err, void *opaque);
 
 InetSocketAddress *inet_parse(const char *str, Error **errp);
 int inet_connect(const char *str, Error **errp);
+int inet_connect_saddr(InetSocketAddress *saddr, Error **errp,
+                       NonBlockingConnectHandler *callback, void *opaque);
 
 NetworkAddressFamily inet_netfamily(int family);
 
diff --git a/include/qemu/thread-posix.h b/include/qemu/thread-posix.h
index aa03567e5e..09d1e15728 100644
--- a/include/qemu/thread-posix.h
+++ b/include/qemu/thread-posix.h
@@ -4,6 +4,12 @@
 #include <pthread.h>
 #include <semaphore.h>
 
+typedef QemuMutex QemuRecMutex;
+#define qemu_rec_mutex_destroy qemu_mutex_destroy
+#define qemu_rec_mutex_lock qemu_mutex_lock
+#define qemu_rec_mutex_trylock qemu_mutex_trylock
+#define qemu_rec_mutex_unlock qemu_mutex_unlock
+
 struct QemuMutex {
     pthread_mutex_t lock;
 };
diff --git a/include/qemu/thread-win32.h b/include/qemu/thread-win32.h
index c7ce8dcd45..5fb6541ae9 100644
--- a/include/qemu/thread-win32.h
+++ b/include/qemu/thread-win32.h
@@ -8,6 +8,16 @@ struct QemuMutex {
     LONG owner;
 };
 
+typedef struct QemuRecMutex QemuRecMutex;
+struct QemuRecMutex {
+    CRITICAL_SECTION lock;
+};
+
+void qemu_rec_mutex_destroy(QemuRecMutex *mutex);
+void qemu_rec_mutex_lock(QemuRecMutex *mutex);
+int qemu_rec_mutex_trylock(QemuRecMutex *mutex);
+void qemu_rec_mutex_unlock(QemuRecMutex *mutex);
+
 struct QemuCond {
     LONG waiters, target;
     HANDLE sema;
diff --git a/include/qemu/thread.h b/include/qemu/thread.h
index 31237e93ee..e8e665f020 100644
--- a/include/qemu/thread.h
+++ b/include/qemu/thread.h
@@ -25,6 +25,9 @@ void qemu_mutex_lock(QemuMutex *mutex);
 int qemu_mutex_trylock(QemuMutex *mutex);
 void qemu_mutex_unlock(QemuMutex *mutex);
 
+/* Prototypes for other functions are in thread-posix.h/thread-win32.h.  */
+void qemu_rec_mutex_init(QemuRecMutex *mutex);
+
 void qemu_cond_init(QemuCond *cond);
 void qemu_cond_destroy(QemuCond *cond);
 
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index 633c3fc124..3f79a8e955 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -231,7 +231,25 @@ struct kvm_run;
 #define TB_JMP_CACHE_SIZE (1 << TB_JMP_CACHE_BITS)
 
 /* work queue */
-typedef void (*run_on_cpu_func)(CPUState *cpu, void *data);
+
+/* The union type allows passing of 64 bit target pointers on 32 bit
+ * hosts in a single parameter
+ */
+typedef union {
+    int           host_int;
+    unsigned long host_ulong;
+    void         *host_ptr;
+    vaddr         target_ptr;
+} run_on_cpu_data;
+
+#define RUN_ON_CPU_HOST_PTR(p)    ((run_on_cpu_data){.host_ptr = (p)})
+#define RUN_ON_CPU_HOST_INT(i)    ((run_on_cpu_data){.host_int = (i)})
+#define RUN_ON_CPU_HOST_ULONG(ul) ((run_on_cpu_data){.host_ulong = (ul)})
+#define RUN_ON_CPU_TARGET_PTR(v)  ((run_on_cpu_data){.target_ptr = (v)})
+#define RUN_ON_CPU_NULL           RUN_ON_CPU_HOST_PTR(NULL)
+
+typedef void (*run_on_cpu_func)(CPUState *cpu, run_on_cpu_data data);
+
 struct qemu_work_item;
 
 /**
@@ -319,7 +337,10 @@ struct CPUState {
     MemoryRegion *memory;
 
     void *env_ptr; /* CPUArchState */
+
+    /* Writes protected by tb_lock, reads not thread-safe  */
     struct TranslationBlock *tb_jmp_cache[TB_JMP_CACHE_SIZE];
+
     struct GDBRegisterState *gdb_regs;
     int gdb_num_regs;
     int gdb_num_g_regs;
@@ -634,7 +655,7 @@ bool cpu_is_stopped(CPUState *cpu);
  *
  * Used internally in the implementation of run_on_cpu.
  */
-void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data,
+void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data,
                    QemuMutex *mutex);
 
 /**
@@ -645,7 +666,7 @@ void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data,
  *
  * Schedules the function @func for execution on the vCPU @cpu.
  */
-void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
+void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data);
 
 /**
  * async_run_on_cpu:
@@ -655,7 +676,7 @@ void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
  *
  * Schedules the function @func for execution on the vCPU @cpu asynchronously.
  */
-void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
+void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data);
 
 /**
  * async_safe_run_on_cpu:
@@ -669,7 +690,7 @@ void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
  * Unlike run_on_cpu and async_run_on_cpu, the function is run outside the
  * BQL.
  */
-void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data);
+void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data);
 
 /**
  * qemu_get_cpu:
diff --git a/include/sysemu/os-posix.h b/include/sysemu/os-posix.h
index 3cfedbc28b..b0a6c0695b 100644
--- a/include/sysemu/os-posix.h
+++ b/include/sysemu/os-posix.h
@@ -87,4 +87,16 @@ void *qemu_alloc_stack(size_t *sz);
  */
 void qemu_free_stack(void *stack, size_t sz);
 
+/* POSIX and Mingw32 differ in the name of the stdio lock functions.  */
+
+static inline void qemu_flockfile(FILE *f)
+{
+    flockfile(f);
+}
+
+static inline void qemu_funlockfile(FILE *f)
+{
+    funlockfile(f);
+}
+
 #endif
diff --git a/include/sysemu/os-win32.h b/include/sysemu/os-win32.h
index 17aad3b20f..ff18b23db1 100644
--- a/include/sysemu/os-win32.h
+++ b/include/sysemu/os-win32.h
@@ -103,6 +103,21 @@ static inline char *realpath(const char *path, char *resolved_path)
     return resolved_path;
 }
 
+/* ??? Mingw appears to export _lock_file and _unlock_file as the functions
+ * with which to lock a stdio handle.  But something is wrong in the markup,
+ * either in the header or the library, such that we get undefined references
+ * to "_imp___lock_file" etc when linking.  Since we seem to have no other
+ * alternative, and the usage within the logging functions isn't critical,
+ * ignore FILE locking.
+ */
+
+static inline void qemu_flockfile(FILE *f)
+{
+}
+
+static inline void qemu_funlockfile(FILE *f)
+{
+}
 
 /* We wrap all the sockets functions so that we can
  * set errno based on WSAGetLastError()