From bfe24e1a26d33d57df3c75e7f44273f0b0ca5943 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Mon, 12 Mar 2012 10:28:34 +0100
Subject: trace-events: Rename 'next' argument

'next' is a systemtap keyword, so it's a bad idea to use it as an
argument name.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 trace-events | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/trace-events b/trace-events
index db2cd39950..a5f276d020 100644
--- a/trace-events
+++ b/trace-events
@@ -562,7 +562,7 @@ qemu_coroutine_terminate(void *co) "self %p"
 
 # qemu-coroutine-lock.c
 qemu_co_queue_next_bh(void) ""
-qemu_co_queue_next(void *next) "next %p"
+qemu_co_queue_next(void *nxt) "next %p"
 qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p"
 qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p"
 qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p"
-- 
cgit v1.2.3


From fa6b8733c93107d7fcd7018d1822e93ba9edaa94 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Mon, 12 Mar 2012 10:30:02 +0100
Subject: tracetool: Forbid argument name 'next'

It has happened more than once that patches that look perfectly sane
and work with simpletrace broke systemtap because they use 'next' as an
argument name for a tracing function. However, 'next' is a keyword for
systemtap, so we shouldn't use it.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 scripts/tracetool | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/scripts/tracetool b/scripts/tracetool
index 47389b62ea..7b1c142b67 100755
--- a/scripts/tracetool
+++ b/scripts/tracetool
@@ -81,6 +81,10 @@ get_args()
     args=${1#*\(}
     args=${args%%\)*}
     echo "$args"
+
+    if (echo "$args" | grep "[ *]next\($\|[, ]\)" > /dev/null 2>&1); then
+        echo -e "\n#error 'next' is a bad argument name (clash with systemtap keyword)\n "
+    fi
 }
 
 # Get the argument name list of a trace event
-- 
cgit v1.2.3


From 3948d1d4876065160583e79533bf604481063833 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Mon, 12 Mar 2012 17:46:51 +0100
Subject: qcow2: Remove unused parameter in get_cluster_table()

Since everything goes through the cache, callers don't use the L2 table
offset any more.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
 block/qcow2-cluster.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index e0fb90792f..cbd224dc46 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -466,7 +466,6 @@ out:
  */
 static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
                              uint64_t **new_l2_table,
-                             uint64_t *new_l2_offset,
                              int *new_l2_index)
 {
     BDRVQcowState *s = bs->opaque;
@@ -514,7 +513,6 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
     l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
 
     *new_l2_table = l2_table;
-    *new_l2_offset = l2_offset;
     *new_l2_index = l2_index;
 
     return 0;
@@ -539,11 +537,11 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
 {
     BDRVQcowState *s = bs->opaque;
     int l2_index, ret;
-    uint64_t l2_offset, *l2_table;
+    uint64_t *l2_table;
     int64_t cluster_offset;
     int nb_csectors;
 
-    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
+    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
     if (ret < 0) {
         return 0;
     }
@@ -588,7 +586,7 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
 {
     BDRVQcowState *s = bs->opaque;
     int i, j = 0, l2_index, ret;
-    uint64_t *old_cluster, start_sect, l2_offset, *l2_table;
+    uint64_t *old_cluster, start_sect, *l2_table;
     uint64_t cluster_offset = m->alloc_offset;
     bool cow = false;
 
@@ -633,7 +631,7 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
     }
 
     qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
-    ret = get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index);
+    ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
     if (ret < 0) {
         goto err;
     }
@@ -817,7 +815,7 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
 {
     BDRVQcowState *s = bs->opaque;
     int l2_index, ret, sectors;
-    uint64_t l2_offset, *l2_table;
+    uint64_t *l2_table;
     unsigned int nb_clusters, keep_clusters;
     uint64_t cluster_offset;
 
@@ -825,7 +823,7 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
                                       n_start, n_end);
 
     /* Find L2 entry for the first involved cluster */
-    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
+    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
     if (ret < 0) {
         return ret;
     }
@@ -1000,12 +998,12 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
     unsigned int nb_clusters)
 {
     BDRVQcowState *s = bs->opaque;
-    uint64_t l2_offset, *l2_table;
+    uint64_t *l2_table;
     int l2_index;
     int ret;
     int i;
 
-    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
+    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
     if (ret < 0) {
         return ret;
     }
-- 
cgit v1.2.3


From d7bb72c83cfbcfd0563e158800e4052819223524 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Date: Mon, 12 Mar 2012 16:36:07 +0000
Subject: qemu-io: add option to enable tracing

It can be useful to enable QEMU tracing when trying out block layer
interfaces via qemu-io.  Tracing can be enabled using the new -T FILE
option where the given file contains a list of trace events to enable
(just like the qemu --trace events=FILE option).

  $ echo qemu_vfree >my-events
  $ ./qemu-io -T my-events ...

Remember to use ./configure --enable-trace-backend=BACKEND when building
qemu-io.

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 qemu-io.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/qemu-io.c b/qemu-io.c
index 31895305f1..e6fcd7719e 100644
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -17,6 +17,7 @@
 #include "qemu-common.h"
 #include "block_int.h"
 #include "cmd.h"
+#include "trace/control.h"
 
 #define VERSION	"0.0.1"
 
@@ -1783,6 +1784,7 @@ static void usage(const char *name)
 "  -g, --growable       allow file to grow (only applies to protocols)\n"
 "  -m, --misalign       misalign allocations for O_DIRECT\n"
 "  -k, --native-aio     use kernel AIO implementation (on Linux only)\n"
+"  -T, --trace FILE     enable trace events listed in the given file\n"
 "  -h, --help           display this help and exit\n"
 "  -V, --version        output version information and exit\n"
 "\n",
@@ -1794,7 +1796,7 @@ int main(int argc, char **argv)
 {
     int readonly = 0;
     int growable = 0;
-    const char *sopt = "hVc:rsnmgk";
+    const char *sopt = "hVc:rsnmgkT:";
     const struct option lopt[] = {
         { "help", 0, NULL, 'h' },
         { "version", 0, NULL, 'V' },
@@ -1806,6 +1808,7 @@ int main(int argc, char **argv)
         { "misalign", 0, NULL, 'm' },
         { "growable", 0, NULL, 'g' },
         { "native-aio", 0, NULL, 'k' },
+        { "trace", 1, NULL, 'T' },
         { NULL, 0, NULL, 0 }
     };
     int c;
@@ -1837,6 +1840,11 @@ int main(int argc, char **argv)
         case 'k':
             flags |= BDRV_O_NATIVE_AIO;
             break;
+        case 'T':
+            if (!trace_backend_init(optarg, NULL)) {
+                exit(1); /* error message will have been printed */
+            }
+            break;
         case 'V':
             printf("%s version %s\n", progname, VERSION);
             exit(0);
-- 
cgit v1.2.3


From 29cdb2513c3f22f51d6f3585b41009576485cb35 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 12 Mar 2012 18:26:01 +0100
Subject: block: push recursive flushing up from drivers

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c          | 22 ++++++++++++++--------
 block/blkdebug.c |  7 -------
 block/cow.c      |  6 ------
 block/qcow.c     |  6 ------
 block/qcow2.c    |  6 ------
 block/qed.c      |  8 --------
 block/raw.c      |  6 ------
 block/vdi.c      |  8 --------
 block/vmdk.c     |  4 ++--
 block/vpc.c      |  6 ------
 10 files changed, 16 insertions(+), 63 deletions(-)

diff --git a/block.c b/block.c
index b88ee90817..8858be096b 100644
--- a/block.c
+++ b/block.c
@@ -2331,9 +2331,7 @@ void bdrv_flush_all(void)
     BlockDriverState *bs;
 
     QTAILQ_FOREACH(bs, &bdrv_states, list) {
-        if (!bdrv_is_read_only(bs) && bdrv_is_inserted(bs)) {
-            bdrv_flush(bs);
-        }
+        bdrv_flush(bs);
     }
 }
 
@@ -3520,7 +3518,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 {
     int ret;
 
-    if (!bs->drv) {
+    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
         return 0;
     }
 
@@ -3538,7 +3536,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
     }
 
     if (bs->drv->bdrv_co_flush_to_disk) {
-        return bs->drv->bdrv_co_flush_to_disk(bs);
+        ret = bs->drv->bdrv_co_flush_to_disk(bs);
     } else if (bs->drv->bdrv_aio_flush) {
         BlockDriverAIOCB *acb;
         CoroutineIOCompletion co = {
@@ -3547,10 +3545,10 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 
         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
         if (acb == NULL) {
-            return -EIO;
+            ret = -EIO;
         } else {
             qemu_coroutine_yield();
-            return co.ret;
+            ret = co.ret;
         }
     } else {
         /*
@@ -3564,8 +3562,16 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
          *
          * Let's hope the user knows what he's doing.
          */
-        return 0;
+        ret = 0;
     }
+    if (ret < 0) {
+        return ret;
+    }
+
+    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
+     * in the case of cache=unsafe, so there are no useless flushes.
+     */
+    return bdrv_co_flush(bs->file);
 }
 
 void bdrv_invalidate_cache(BlockDriverState *bs)
diff --git a/block/blkdebug.c b/block/blkdebug.c
index a251802ad4..e56e37da51 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -397,12 +397,6 @@ static void blkdebug_close(BlockDriverState *bs)
     }
 }
 
-static BlockDriverAIOCB *blkdebug_aio_flush(BlockDriverState *bs,
-    BlockDriverCompletionFunc *cb, void *opaque)
-{
-    return bdrv_aio_flush(bs->file, cb, opaque);
-}
-
 static void process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
     BlkdebugVars *old_vars)
 {
@@ -452,7 +446,6 @@ static BlockDriver bdrv_blkdebug = {
 
     .bdrv_aio_readv     = blkdebug_aio_readv,
     .bdrv_aio_writev    = blkdebug_aio_writev,
-    .bdrv_aio_flush     = blkdebug_aio_flush,
 
     .bdrv_debug_event   = blkdebug_debug_event,
 };
diff --git a/block/cow.c b/block/cow.c
index bb5927c6aa..8d3c9f873c 100644
--- a/block/cow.c
+++ b/block/cow.c
@@ -318,11 +318,6 @@ exit:
     return ret;
 }
 
-static coroutine_fn int cow_co_flush(BlockDriverState *bs)
-{
-    return bdrv_co_flush(bs->file);
-}
-
 static QEMUOptionParameter cow_create_options[] = {
     {
         .name = BLOCK_OPT_SIZE,
@@ -348,7 +343,6 @@ static BlockDriver bdrv_cow = {
 
     .bdrv_read              = cow_co_read,
     .bdrv_write             = cow_co_write,
-    .bdrv_co_flush_to_disk  = cow_co_flush,
     .bdrv_co_is_allocated   = cow_co_is_allocated,
 
     .create_options = cow_create_options,
diff --git a/block/qcow.c b/block/qcow.c
index b1cfe1f696..35dff497ae 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -835,11 +835,6 @@ fail:
     return ret;
 }
 
-static coroutine_fn int qcow_co_flush(BlockDriverState *bs)
-{
-    return bdrv_co_flush(bs->file);
-}
-
 static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
     BDRVQcowState *s = bs->opaque;
@@ -877,7 +872,6 @@ static BlockDriver bdrv_qcow = {
 
     .bdrv_co_readv          = qcow_co_readv,
     .bdrv_co_writev         = qcow_co_writev,
-    .bdrv_co_flush_to_disk  = qcow_co_flush,
     .bdrv_co_is_allocated   = qcow_co_is_allocated,
 
     .bdrv_set_key           = qcow_set_key,
diff --git a/block/qcow2.c b/block/qcow2.c
index 7aece65406..70d3141dd1 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1253,11 +1253,6 @@ static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
     return 0;
 }
 
-static coroutine_fn int qcow2_co_flush_to_disk(BlockDriverState *bs)
-{
-    return bdrv_co_flush(bs->file);
-}
-
 static int64_t qcow2_vm_state_offset(BDRVQcowState *s)
 {
 	return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits);
@@ -1377,7 +1372,6 @@ static BlockDriver bdrv_qcow2 = {
     .bdrv_co_readv          = qcow2_co_readv,
     .bdrv_co_writev         = qcow2_co_writev,
     .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,
-    .bdrv_co_flush_to_disk  = qcow2_co_flush_to_disk,
 
     .bdrv_co_discard        = qcow2_co_discard,
     .bdrv_truncate          = qcow2_truncate,
diff --git a/block/qed.c b/block/qed.c
index a041d31e66..d6c1580866 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -1350,13 +1350,6 @@ static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
                          opaque, QED_AIOCB_WRITE);
 }
 
-static BlockDriverAIOCB *bdrv_qed_aio_flush(BlockDriverState *bs,
-                                            BlockDriverCompletionFunc *cb,
-                                            void *opaque)
-{
-    return bdrv_aio_flush(bs->file, cb, opaque);
-}
-
 typedef struct {
     Coroutine *co;
     int ret;
@@ -1562,7 +1555,6 @@ static BlockDriver bdrv_qed = {
     .bdrv_make_empty          = bdrv_qed_make_empty,
     .bdrv_aio_readv           = bdrv_qed_aio_readv,
     .bdrv_aio_writev          = bdrv_qed_aio_writev,
-    .bdrv_aio_flush           = bdrv_qed_aio_flush,
     .bdrv_co_write_zeroes     = bdrv_qed_co_write_zeroes,
     .bdrv_truncate            = bdrv_qed_truncate,
     .bdrv_getlength           = bdrv_qed_getlength,
diff --git a/block/raw.c b/block/raw.c
index 1cdac0ccdc..7086e314a6 100644
--- a/block/raw.c
+++ b/block/raw.c
@@ -25,11 +25,6 @@ static void raw_close(BlockDriverState *bs)
 {
 }
 
-static int coroutine_fn raw_co_flush(BlockDriverState *bs)
-{
-    return bdrv_co_flush(bs->file);
-}
-
 static int64_t raw_getlength(BlockDriverState *bs)
 {
     return bdrv_getlength(bs->file);
@@ -113,7 +108,6 @@ static BlockDriver bdrv_raw = {
 
     .bdrv_co_readv          = raw_co_readv,
     .bdrv_co_writev         = raw_co_writev,
-    .bdrv_co_flush_to_disk  = raw_co_flush,
     .bdrv_co_discard        = raw_co_discard,
 
     .bdrv_probe         = raw_probe,
diff --git a/block/vdi.c b/block/vdi.c
index 6a0011fbcc..f8fa865745 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -930,13 +930,6 @@ static void vdi_close(BlockDriverState *bs)
     error_free(s->migration_blocker);
 }
 
-static coroutine_fn int vdi_co_flush(BlockDriverState *bs)
-{
-    logout("\n");
-    return bdrv_co_flush(bs->file);
-}
-
-
 static QEMUOptionParameter vdi_create_options[] = {
     {
         .name = BLOCK_OPT_SIZE,
@@ -969,7 +962,6 @@ static BlockDriver bdrv_vdi = {
     .bdrv_open = vdi_open,
     .bdrv_close = vdi_close,
     .bdrv_create = vdi_create,
-    .bdrv_co_flush_to_disk = vdi_co_flush,
     .bdrv_co_is_allocated = vdi_co_is_allocated,
     .bdrv_make_empty = vdi_make_empty,
 
diff --git a/block/vmdk.c b/block/vmdk.c
index 45c003a0f1..18e9b4caf6 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -1525,10 +1525,10 @@ static void vmdk_close(BlockDriverState *bs)
 
 static coroutine_fn int vmdk_co_flush(BlockDriverState *bs)
 {
-    int i, ret, err;
     BDRVVmdkState *s = bs->opaque;
+    int i, err;
+    int ret = 0;
 
-    ret = bdrv_co_flush(bs->file);
     for (i = 0; i < s->num_extents; i++) {
         err = bdrv_co_flush(s->extents[i].file);
         if (err < 0) {
diff --git a/block/vpc.c b/block/vpc.c
index 6b4816f563..706faf3f38 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -507,11 +507,6 @@ static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num,
     return ret;
 }
 
-static coroutine_fn int vpc_co_flush(BlockDriverState *bs)
-{
-    return bdrv_co_flush(bs->file);
-}
-
 /*
  * Calculates the number of cylinders, heads and sectors per cylinder
  * based on a given number of sectors. This is the algorithm described
@@ -789,7 +784,6 @@ static BlockDriver bdrv_vpc = {
 
     .bdrv_read              = vpc_co_read,
     .bdrv_write             = vpc_co_write,
-    .bdrv_co_flush_to_disk  = vpc_co_flush,
 
     .create_options = vpc_create_options,
 };
-- 
cgit v1.2.3


From 85e8dab1efc7228bb674e72f6fb3be78c1e883bf Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 12 Mar 2012 17:01:48 +0100
Subject: aio: move BlockDriverAIOCB to qemu-aio.h

And remove several block_int.h inclusions that should not be there.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.h         |  2 --
 block_int.h     | 18 ------------------
 dma-helpers.c   |  1 -
 hw/ide/core.c   |  1 -
 hw/lsi53c895a.c |  1 -
 linux-aio.c     |  1 -
 qemu-aio.h      | 21 +++++++++++++++++++++
 7 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/block.h b/block.h
index 415bb17c73..e670606afa 100644
--- a/block.h
+++ b/block.h
@@ -180,8 +180,6 @@ typedef struct BdrvCheckResult {
 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res);
 
 /* async block I/O */
-typedef struct BlockDriverAIOCB BlockDriverAIOCB;
-typedef void BlockDriverCompletionFunc(void *opaque, int ret);
 typedef void BlockDriverDirtyHandler(BlockDriverState *bs, int64_t sector,
                                      int sector_num);
 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
diff --git a/block_int.h b/block_int.h
index b460c369ca..22c86a53fe 100644
--- a/block_int.h
+++ b/block_int.h
@@ -53,12 +53,6 @@
 
 typedef struct BdrvTrackedRequest BdrvTrackedRequest;
 
-typedef struct AIOPool {
-    void (*cancel)(BlockDriverAIOCB *acb);
-    int aiocb_size;
-    BlockDriverAIOCB *free_aiocb;
-} AIOPool;
-
 typedef struct BlockIOLimit {
     int64_t bps[3];
     int64_t iops[3];
@@ -302,20 +296,8 @@ struct BlockDriverState {
     BlockJob *job;
 };
 
-struct BlockDriverAIOCB {
-    AIOPool *pool;
-    BlockDriverState *bs;
-    BlockDriverCompletionFunc *cb;
-    void *opaque;
-    BlockDriverAIOCB *next;
-};
-
 void get_tmp_filename(char *filename, int size);
 
-void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
-                   BlockDriverCompletionFunc *cb, void *opaque);
-void qemu_aio_release(void *p);
-
 void bdrv_set_io_limits(BlockDriverState *bs,
                         BlockIOLimit *io_limits);
 
diff --git a/dma-helpers.c b/dma-helpers.c
index c29ea6d3ab..7fcc86dca9 100644
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -8,7 +8,6 @@
  */
 
 #include "dma.h"
-#include "block_int.h"
 #include "trace.h"
 
 void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint)
diff --git a/hw/ide/core.c b/hw/ide/core.c
index 4d568acc9c..6f06d2818e 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -31,7 +31,6 @@
 #include "sysemu.h"
 #include "dma.h"
 #include "blockdev.h"
-#include "block_int.h"
 
 #include <hw/ide/internal.h>
 
diff --git a/hw/lsi53c895a.c b/hw/lsi53c895a.c
index edc09b7307..f022a02447 100644
--- a/hw/lsi53c895a.c
+++ b/hw/lsi53c895a.c
@@ -15,7 +15,6 @@
 #include "hw.h"
 #include "pci.h"
 #include "scsi.h"
-#include "block_int.h"
 #include "dma.h"
 
 //#define DEBUG_LSI
diff --git a/linux-aio.c b/linux-aio.c
index d2fc2e7d02..15261ece3d 100644
--- a/linux-aio.c
+++ b/linux-aio.c
@@ -9,7 +9,6 @@
  */
 #include "qemu-common.h"
 #include "qemu-aio.h"
-#include "block_int.h"
 #include "block/raw-posix-aio.h"
 
 #include <sys/eventfd.h>
diff --git a/qemu-aio.h b/qemu-aio.h
index 3bdd749f80..230c2f79a0 100644
--- a/qemu-aio.h
+++ b/qemu-aio.h
@@ -17,6 +17,27 @@
 #include "qemu-common.h"
 #include "qemu-char.h"
 
+typedef struct BlockDriverAIOCB BlockDriverAIOCB;
+typedef void BlockDriverCompletionFunc(void *opaque, int ret);
+
+typedef struct AIOPool {
+    void (*cancel)(BlockDriverAIOCB *acb);
+    int aiocb_size;
+    BlockDriverAIOCB *free_aiocb;
+} AIOPool;
+
+struct BlockDriverAIOCB {
+    AIOPool *pool;
+    BlockDriverState *bs;
+    BlockDriverCompletionFunc *cb;
+    void *opaque;
+    BlockDriverAIOCB *next;
+};
+
+void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
+                   BlockDriverCompletionFunc *cb, void *opaque);
+void qemu_aio_release(void *p);
+
 /* Returns 1 if there are still outstanding AIO requests; 0 otherwise */
 typedef int (AioFlushHandler)(void *opaque);
 
-- 
cgit v1.2.3


From 2844bdd99a96dc32a6ebf464f49e65be31da1c14 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 13 Mar 2012 14:44:22 +0100
Subject: ide: IDENTIFY word 86 bit 14 is reserved

Reserved bits should be cleared to zero.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
 hw/ide/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/ide/core.c b/hw/ide/core.c
index 6f06d2818e..771811c33b 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -150,7 +150,7 @@ static void ide_identify(IDEState *s)
     else
          put_le16(p + 85, (1 << 14) | 1);
     /* 13=flush_cache_ext,12=flush_cache,10=lba48 */
-    put_le16(p + 86, (1 << 14) | (1 << 13) | (1 <<12) | (1 << 10));
+    put_le16(p + 86, (1 << 13) | (1 <<12) | (1 << 10));
     /* 14=set to 1, 1=smart self test, 0=smart error logging */
     put_le16(p + 87, (1 << 14) | 0);
     put_le16(p + 88, 0x3f | (1 << 13)); /* udma5 set and supported */
-- 
cgit v1.2.3


From 27e0c9a1bbd166a67c16291016fba298a8e47140 Mon Sep 17 00:00:00 2001
From: Floris Bos <bos@je-eigen-domein.nl>
Date: Mon, 12 Mar 2012 21:05:09 +0100
Subject: ide: Add "model=s" qdev option

Allow the user to override the default disk model name "QEMU HARDDISK".

Some Linux distributions use the /dev/disk/by-id/scsi-SATA_name-of-disk-
model_serial addressing scheme when refering to partitions in /etc/fstab
and elsewhere. This causes problems when starting a disk image taken from
an existing physical server under qemu, because when running under qemu
name-of-disk-model is always "QEMU HARDDISK".

This patch introduces a model=s option which in combination with the
existing serial=s option can be used to fake the disk the operating
system was previously on, allowing the OS to boot properly.

Cc: kwolf@redhat.com
Signed-off-by: Floris Bos <dev@noc-ps.com>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/ide/core.c     | 27 ++++++++++++++++++++++-----
 hw/ide/internal.h |  4 +++-
 hw/ide/qdev.c     |  6 ++++--
 3 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/hw/ide/core.c b/hw/ide/core.c
index 771811c33b..e38cace45b 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -100,7 +100,7 @@ static void ide_identify(IDEState *s)
     put_le16(p + 21, 512); /* cache size in sectors */
     put_le16(p + 22, 4); /* ecc bytes */
     padstr((char *)(p + 23), s->version, 8); /* firmware version */
-    padstr((char *)(p + 27), "QEMU HARDDISK", 40); /* model */
+    padstr((char *)(p + 27), s->drive_model_str, 40); /* model */
 #if MAX_MULT_SECTORS > 1
     put_le16(p + 47, 0x8000 | MAX_MULT_SECTORS);
 #endif
@@ -188,7 +188,7 @@ static void ide_atapi_identify(IDEState *s)
     put_le16(p + 21, 512); /* cache size in sectors */
     put_le16(p + 22, 4); /* ecc bytes */
     padstr((char *)(p + 23), s->version, 8); /* firmware version */
-    padstr((char *)(p + 27), "QEMU DVD-ROM", 40); /* model */
+    padstr((char *)(p + 27), s->drive_model_str, 40); /* model */
     put_le16(p + 48, 1); /* dword I/O (XXX: should not be set on CDROM) */
 #ifdef USE_DMA_CDROM
     put_le16(p + 49, 1 << 9 | 1 << 8); /* DMA and LBA supported */
@@ -245,7 +245,7 @@ static void ide_cfata_identify(IDEState *s)
     padstr((char *)(p + 10), s->drive_serial_str, 20); /* serial number */
     put_le16(p + 22, 0x0004);			/* ECC bytes */
     padstr((char *) (p + 23), s->version, 8);	/* Firmware Revision */
-    padstr((char *) (p + 27), "QEMU MICRODRIVE", 40);/* Model number */
+    padstr((char *) (p + 27), s->drive_model_str, 40);/* Model number */
 #if MAX_MULT_SECTORS > 1
     put_le16(p + 47, 0x8000 | MAX_MULT_SECTORS);
 #else
@@ -1833,7 +1833,7 @@ static const BlockDevOps ide_cd_block_ops = {
 };
 
 int ide_init_drive(IDEState *s, BlockDriverState *bs, IDEDriveKind kind,
-                   const char *version, const char *serial)
+                   const char *version, const char *serial, const char *model)
 {
     int cylinders, heads, secs;
     uint64_t nb_sectors;
@@ -1884,6 +1884,22 @@ int ide_init_drive(IDEState *s, BlockDriverState *bs, IDEDriveKind kind,
         snprintf(s->drive_serial_str, sizeof(s->drive_serial_str),
                  "QM%05d", s->drive_serial);
     }
+    if (model) {
+        pstrcpy(s->drive_model_str, sizeof(s->drive_model_str), model);
+    } else {
+        switch (kind) {
+        case IDE_CD:
+            strcpy(s->drive_model_str, "QEMU DVD-ROM");
+            break;
+        case IDE_CFATA:
+            strcpy(s->drive_model_str, "QEMU MICRODRIVE");
+            break;
+        default:
+            strcpy(s->drive_model_str, "QEMU HARDDISK");
+            break;
+        }
+    }
+
     if (version) {
         pstrcpy(s->version, sizeof(s->version), version);
     } else {
@@ -1976,7 +1992,8 @@ void ide_init2_with_non_qdev_drives(IDEBus *bus, DriveInfo *hd0,
         if (dinfo) {
             if (ide_init_drive(&bus->ifs[i], dinfo->bdrv,
                                dinfo->media_cd ? IDE_CD : IDE_HD, NULL,
-                               *dinfo->serial ? dinfo->serial : NULL) < 0) {
+                               *dinfo->serial ? dinfo->serial : NULL,
+                               NULL) < 0) {
                 error_report("Can't set up IDE drive %s", dinfo->id);
                 exit(1);
             }
diff --git a/hw/ide/internal.h b/hw/ide/internal.h
index c808a0ddf8..b1319dca72 100644
--- a/hw/ide/internal.h
+++ b/hw/ide/internal.h
@@ -348,6 +348,7 @@ struct IDEState {
     uint8_t identify_data[512];
     int drive_serial;
     char drive_serial_str[21];
+    char drive_model_str[41];
     /* ide regs */
     uint8_t feature;
     uint8_t error;
@@ -468,6 +469,7 @@ struct IDEDevice {
     BlockConf conf;
     char *version;
     char *serial;
+    char *model;
 };
 
 #define BM_STATUS_DMAING 0x01
@@ -534,7 +536,7 @@ void ide_data_writel(void *opaque, uint32_t addr, uint32_t val);
 uint32_t ide_data_readl(void *opaque, uint32_t addr);
 
 int ide_init_drive(IDEState *s, BlockDriverState *bs, IDEDriveKind kind,
-                   const char *version, const char *serial);
+                   const char *version, const char *serial, const char *model);
 void ide_init2(IDEBus *bus, qemu_irq irq);
 void ide_init2_with_non_qdev_drives(IDEBus *bus, DriveInfo *hd0,
                                     DriveInfo *hd1, qemu_irq irq);
diff --git a/hw/ide/qdev.c b/hw/ide/qdev.c
index f6a48961c5..07227c7455 100644
--- a/hw/ide/qdev.c
+++ b/hw/ide/qdev.c
@@ -136,7 +136,8 @@ static int ide_dev_initfn(IDEDevice *dev, IDEDriveKind kind)
         }
     }
 
-    if (ide_init_drive(s, dev->conf.bs, kind, dev->version, serial) < 0) {
+    if (ide_init_drive(s, dev->conf.bs, kind,
+                       dev->version, serial, dev->model) < 0) {
         return -1;
     }
 
@@ -173,7 +174,8 @@ static int ide_drive_initfn(IDEDevice *dev)
 #define DEFINE_IDE_DEV_PROPERTIES()                     \
     DEFINE_BLOCK_PROPERTIES(IDEDrive, dev.conf),        \
     DEFINE_PROP_STRING("ver",  IDEDrive, dev.version),  \
-    DEFINE_PROP_STRING("serial",  IDEDrive, dev.serial)
+    DEFINE_PROP_STRING("serial",  IDEDrive, dev.serial),\
+    DEFINE_PROP_STRING("model", IDEDrive, dev.model)
 
 static Property ide_hd_properties[] = {
     DEFINE_IDE_DEV_PROPERTIES(),
-- 
cgit v1.2.3


From aa2c91bdfebbfa314e9522573a9656626e21ac67 Mon Sep 17 00:00:00 2001
From: Floris Bos <bos@je-eigen-domein.nl>
Date: Mon, 12 Mar 2012 21:05:10 +0100
Subject: ide: Change serial number strncpy() to pstrcpy()

strncpy may not null-terminate the destination string.

Cc: kwolf@redhat.com
Signed-off-by: Floris Bos <dev@noc-ps.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockdev.c    | 5 +++--
 hw/ide/core.c | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 1a500b830d..f5e7dba90c 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -532,8 +532,9 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
     dinfo->unit = unit_id;
     dinfo->opts = opts;
     dinfo->refcount = 1;
-    if (serial)
-        strncpy(dinfo->serial, serial, sizeof(dinfo->serial) - 1);
+    if (serial) {
+        pstrcpy(dinfo->serial, sizeof(dinfo->serial), serial);
+    }
     QTAILQ_INSERT_TAIL(&drives, dinfo, next);
 
     bdrv_set_on_error(dinfo->bdrv, on_read_error, on_write_error);
diff --git a/hw/ide/core.c b/hw/ide/core.c
index e38cace45b..5647694585 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -1879,7 +1879,7 @@ int ide_init_drive(IDEState *s, BlockDriverState *bs, IDEDriveKind kind,
         }
     }
     if (serial) {
-        strncpy(s->drive_serial_str, serial, sizeof(s->drive_serial_str));
+        pstrcpy(s->drive_serial_str, sizeof(s->drive_serial_str), serial);
     } else {
         snprintf(s->drive_serial_str, sizeof(s->drive_serial_str),
                  "QM%05d", s->drive_serial);
-- 
cgit v1.2.3


From 95ebda85e09ed2b7f00deb2adbdafa5ccf5db948 Mon Sep 17 00:00:00 2001
From: Floris Bos <bos@je-eigen-domein.nl>
Date: Tue, 13 Mar 2012 13:31:47 +0100
Subject: ide: Adds wwn=hex qdev option

Allow the user to specify a disk's World Wide Name.

Linux guests can address disks by their unique World Wide Name number
(e.g. /dev/disk/by-id/wwn-0x5001517959123522). This patch adds support
for assigning a World Wide Name number to a virtual IDE disk.

Cc: kwolf@redhat.com
Signed-off-by: Floris Bos <dev@noc-ps.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/ide/core.c     | 29 +++++++++++++++++++++++------
 hw/ide/internal.h |  5 ++++-
 hw/ide/qdev.c     |  3 ++-
 3 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/hw/ide/core.c b/hw/ide/core.c
index 5647694585..6e25338615 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -142,8 +142,12 @@ static void ide_identify(IDEState *s)
     put_le16(p + 82, (1 << 14) | (1 << 5) | 1);
     /* 13=flush_cache_ext,12=flush_cache,10=lba48 */
     put_le16(p + 83, (1 << 14) | (1 << 13) | (1 <<12) | (1 << 10));
-    /* 14=set to 1, 1=SMART self test, 0=SMART error logging */
-    put_le16(p + 84, (1 << 14) | 0);
+    /* 14=set to 1, 8=has WWN, 1=SMART self test, 0=SMART error logging */
+    if (s->wwn) {
+        put_le16(p + 84, (1 << 14) | (1 << 8) | 0);
+    } else {
+        put_le16(p + 84, (1 << 14) | 0);
+    }
     /* 14 = NOP supported, 5=WCACHE enabled, 0=SMART feature set enabled */
     if (bdrv_enable_write_cache(s->bs))
          put_le16(p + 85, (1 << 14) | (1 << 5) | 1);
@@ -151,8 +155,12 @@ static void ide_identify(IDEState *s)
          put_le16(p + 85, (1 << 14) | 1);
     /* 13=flush_cache_ext,12=flush_cache,10=lba48 */
     put_le16(p + 86, (1 << 13) | (1 <<12) | (1 << 10));
-    /* 14=set to 1, 1=smart self test, 0=smart error logging */
-    put_le16(p + 87, (1 << 14) | 0);
+    /* 14=set to 1, 8=has WWN, 1=SMART self test, 0=SMART error logging */
+    if (s->wwn) {
+        put_le16(p + 87, (1 << 14) | (1 << 8) | 0);
+    } else {
+        put_le16(p + 87, (1 << 14) | 0);
+    }
     put_le16(p + 88, 0x3f | (1 << 13)); /* udma5 set and supported */
     put_le16(p + 93, 1 | (1 << 14) | 0x2000);
     put_le16(p + 100, s->nb_sectors);
@@ -162,6 +170,13 @@ static void ide_identify(IDEState *s)
 
     if (dev && dev->conf.physical_block_size)
         put_le16(p + 106, 0x6000 | get_physical_block_exp(&dev->conf));
+    if (s->wwn) {
+        /* LE 16-bit words 111-108 contain 64-bit World Wide Name */
+        put_le16(p + 108, s->wwn >> 48);
+        put_le16(p + 109, s->wwn >> 32);
+        put_le16(p + 110, s->wwn >> 16);
+        put_le16(p + 111, s->wwn);
+    }
     if (dev && dev->conf.discard_granularity) {
         put_le16(p + 169, 1); /* TRIM support */
     }
@@ -1833,7 +1848,8 @@ static const BlockDevOps ide_cd_block_ops = {
 };
 
 int ide_init_drive(IDEState *s, BlockDriverState *bs, IDEDriveKind kind,
-                   const char *version, const char *serial, const char *model)
+                   const char *version, const char *serial, const char *model,
+                   uint64_t wwn)
 {
     int cylinders, heads, secs;
     uint64_t nb_sectors;
@@ -1859,6 +1875,7 @@ int ide_init_drive(IDEState *s, BlockDriverState *bs, IDEDriveKind kind,
     s->heads = heads;
     s->sectors = secs;
     s->nb_sectors = nb_sectors;
+    s->wwn = wwn;
     /* The SMART values should be preserved across power cycles
        but they aren't.  */
     s->smart_enabled = 1;
@@ -1993,7 +2010,7 @@ void ide_init2_with_non_qdev_drives(IDEBus *bus, DriveInfo *hd0,
             if (ide_init_drive(&bus->ifs[i], dinfo->bdrv,
                                dinfo->media_cd ? IDE_CD : IDE_HD, NULL,
                                *dinfo->serial ? dinfo->serial : NULL,
-                               NULL) < 0) {
+                               NULL, 0) < 0) {
                 error_report("Can't set up IDE drive %s", dinfo->id);
                 exit(1);
             }
diff --git a/hw/ide/internal.h b/hw/ide/internal.h
index b1319dca72..100efd3076 100644
--- a/hw/ide/internal.h
+++ b/hw/ide/internal.h
@@ -349,6 +349,7 @@ struct IDEState {
     int drive_serial;
     char drive_serial_str[21];
     char drive_model_str[41];
+    uint64_t wwn;
     /* ide regs */
     uint8_t feature;
     uint8_t error;
@@ -470,6 +471,7 @@ struct IDEDevice {
     char *version;
     char *serial;
     char *model;
+    uint64_t wwn;
 };
 
 #define BM_STATUS_DMAING 0x01
@@ -536,7 +538,8 @@ void ide_data_writel(void *opaque, uint32_t addr, uint32_t val);
 uint32_t ide_data_readl(void *opaque, uint32_t addr);
 
 int ide_init_drive(IDEState *s, BlockDriverState *bs, IDEDriveKind kind,
-                   const char *version, const char *serial, const char *model);
+                   const char *version, const char *serial, const char *model,
+                   uint64_t wwn);
 void ide_init2(IDEBus *bus, qemu_irq irq);
 void ide_init2_with_non_qdev_drives(IDEBus *bus, DriveInfo *hd0,
                                     DriveInfo *hd1, qemu_irq irq);
diff --git a/hw/ide/qdev.c b/hw/ide/qdev.c
index 07227c7455..a46578d685 100644
--- a/hw/ide/qdev.c
+++ b/hw/ide/qdev.c
@@ -137,7 +137,7 @@ static int ide_dev_initfn(IDEDevice *dev, IDEDriveKind kind)
     }
 
     if (ide_init_drive(s, dev->conf.bs, kind,
-                       dev->version, serial, dev->model) < 0) {
+                       dev->version, serial, dev->model, dev->wwn) < 0) {
         return -1;
     }
 
@@ -174,6 +174,7 @@ static int ide_drive_initfn(IDEDevice *dev)
 #define DEFINE_IDE_DEV_PROPERTIES()                     \
     DEFINE_BLOCK_PROPERTIES(IDEDrive, dev.conf),        \
     DEFINE_PROP_STRING("ver",  IDEDrive, dev.version),  \
+    DEFINE_PROP_HEX64("wwn",  IDEDrive, dev.wwn, 0),    \
     DEFINE_PROP_STRING("serial",  IDEDrive, dev.serial),\
     DEFINE_PROP_STRING("model", IDEDrive, dev.model)
 
-- 
cgit v1.2.3


From c088b691363070d151f80cc1fde4b7c151bdfe8f Mon Sep 17 00:00:00 2001
From: Zhang Shengju <zhangsju@gmail.com>
Date: Tue, 13 Mar 2012 22:38:13 +0800
Subject: block/vpc: write checksum back to footer after check

After validation check, the 'checksum' is not written back
to footer, which leave it with zero.

This results in errors while loadding it under Microsoft's
Hyper-V environment, and also errors from utilities like
Citrix's vhd-util.

Signed-off-by: Zhang Shengju <sean_zhang@trendmicro.com.cn>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/vpc.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/vpc.c b/block/vpc.c
index 706faf3f38..5cd13d17a1 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -189,6 +189,9 @@ static int vpc_open(BlockDriverState *bs, int flags)
         fprintf(stderr, "block-vpc: The header checksum of '%s' is "
             "incorrect.\n", bs->filename);
 
+    /* Write 'checksum' back to footer, or else will leave it with zero. */
+    footer->checksum = be32_to_cpu(checksum);
+
     // The visible size of a image in Virtual PC depends on the geometry
     // rather than on the size stored in the footer (the size in the footer
     // is too large usually)
-- 
cgit v1.2.3


From 41453412ca16ff1871ed2bd6ee83cb9a5c734823 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Date: Wed, 14 Mar 2012 15:57:04 +0000
Subject: qerror: fix QERR_PROPERTY_VALUE_OUT_OF_RANGE description

Fix a typo in the description for QERR_PROPERTY_VALUE_OUT_OF_RANGE where
"'" was used instead of ")".

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 qerror.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qerror.c b/qerror.c
index 41c729a01f..1f565fc98c 100644
--- a/qerror.c
+++ b/qerror.c
@@ -243,7 +243,7 @@ static const QErrorStringTable qerror_table[] = {
     {
         .error_fmt = QERR_PROPERTY_VALUE_OUT_OF_RANGE,
         .desc      = "Property '%(device).%(property)' doesn't take "
-                     "value %(value) (minimum: %(min), maximum: %(max)'",
+                     "value %(value) (minimum: %(min), maximum: %(max))",
     },
     {
         .error_fmt = QERR_QGA_COMMAND_FAILED,
-- 
cgit v1.2.3


From 02fda01c3024463e561820afb0ba09daba4014d9 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Date: Wed, 14 Mar 2012 15:57:05 +0000
Subject: qdev: add blocksize property type

Storage interfaces like virtio-blk can be configured with block size
information so that the guest can take advantage of efficient I/O
request sizes.

According to the SCSI Block Commands (SBC) standard a device's block
size is "almost always greater than one byte and may be a multiple of
512 bytes".  QEMU currently has a 512 byte minimum block size because
the block layer functions work at that granularity.  Furthermore, the
block size should be a power of 2 because QEMU calculates bitmasks from
the value.

Introduce a "blocksize" property type so devices can enforce these
constraints on block size values.  If the constraints are relaxed in the
future then this property can be updated.

Introduce the new PropertyValueNotPowerOf2 QError so QMP clients know
exactly why a block size value was rejected.

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/qdev-properties.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 hw/qdev.h            |  3 +++
 qerror.c             |  5 +++++
 qerror.h             |  4 ++++
 4 files changed, 58 insertions(+)

diff --git a/hw/qdev-properties.c b/hw/qdev-properties.c
index bff9152df5..98dd06aeba 100644
--- a/hw/qdev-properties.c
+++ b/hw/qdev-properties.c
@@ -877,6 +877,52 @@ PropertyInfo qdev_prop_pci_devfn = {
     .max   = 0xFFFFFFFFULL,
 };
 
+/* --- blocksize --- */
+
+static void set_blocksize(Object *obj, Visitor *v, void *opaque,
+                          const char *name, Error **errp)
+{
+    DeviceState *dev = DEVICE(obj);
+    Property *prop = opaque;
+    int16_t *ptr = qdev_get_prop_ptr(dev, prop);
+    Error *local_err = NULL;
+    int64_t value;
+
+    if (dev->state != DEV_STATE_CREATED) {
+        error_set(errp, QERR_PERMISSION_DENIED);
+        return;
+    }
+
+    visit_type_int(v, &value, name, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+    if (value < prop->info->min || value > prop->info->max) {
+        error_set(errp, QERR_PROPERTY_VALUE_OUT_OF_RANGE,
+                  dev->id?:"", name, value, prop->info->min,
+                  prop->info->max);
+        return;
+    }
+
+    /* We rely on power-of-2 blocksizes for bitmasks */
+    if ((value & (value - 1)) != 0) {
+        error_set(errp, QERR_PROPERTY_VALUE_NOT_POWER_OF_2,
+                  dev->id?:"", name, value);
+        return;
+    }
+
+    *ptr = value;
+}
+
+PropertyInfo qdev_prop_blocksize = {
+    .name  = "blocksize",
+    .get   = get_int16,
+    .set   = set_blocksize,
+    .min   = 512,
+    .max   = 65024,
+};
+
 /* --- public helpers --- */
 
 static Property *qdev_prop_walk(Property *props, const char *name)
diff --git a/hw/qdev.h b/hw/qdev.h
index a8df42f6fa..1c109607b4 100644
--- a/hw/qdev.h
+++ b/hw/qdev.h
@@ -223,6 +223,7 @@ extern PropertyInfo qdev_prop_drive;
 extern PropertyInfo qdev_prop_netdev;
 extern PropertyInfo qdev_prop_vlan;
 extern PropertyInfo qdev_prop_pci_devfn;
+extern PropertyInfo qdev_prop_blocksize;
 
 #define DEFINE_PROP(_name, _state, _field, _prop, _type) { \
         .name      = (_name),                                    \
@@ -284,6 +285,8 @@ extern PropertyInfo qdev_prop_pci_devfn;
 #define DEFINE_PROP_LOSTTICKPOLICY(_n, _s, _f, _d) \
     DEFINE_PROP_DEFAULT(_n, _s, _f, _d, qdev_prop_losttickpolicy, \
                         LostTickPolicy)
+#define DEFINE_PROP_BLOCKSIZE(_n, _s, _f, _d) \
+    DEFINE_PROP_DEFAULT(_n, _s, _f, _d, qdev_prop_blocksize, uint16_t)
 
 #define DEFINE_PROP_END_OF_LIST()               \
     {}
diff --git a/qerror.c b/qerror.c
index 1f565fc98c..96fbe71ae5 100644
--- a/qerror.c
+++ b/qerror.c
@@ -240,6 +240,11 @@ static const QErrorStringTable qerror_table[] = {
         .error_fmt = QERR_PROPERTY_VALUE_NOT_FOUND,
         .desc      = "Property '%(device).%(property)' can't find value '%(value)'",
     },
+    {
+        .error_fmt = QERR_PROPERTY_VALUE_NOT_POWER_OF_2,
+        .desc      = "Property '%(device).%(property)' doesn't take "
+                     "value '%(value)', it's not a power of 2",
+    },
     {
         .error_fmt = QERR_PROPERTY_VALUE_OUT_OF_RANGE,
         .desc      = "Property '%(device).%(property)' doesn't take "
diff --git a/qerror.h b/qerror.h
index e16f9c27e1..5c23c1ff20 100644
--- a/qerror.h
+++ b/qerror.h
@@ -202,6 +202,10 @@ QError *qobject_to_qerror(const QObject *obj);
 #define QERR_PROPERTY_VALUE_NOT_FOUND \
     "{ 'class': 'PropertyValueNotFound', 'data': { 'device': %s, 'property': %s, 'value': %s } }"
 
+#define QERR_PROPERTY_VALUE_NOT_POWER_OF_2 \
+    "{ 'class': 'PropertyValueNotPowerOf2', 'data': { " \
+    "'device': %s, 'property': %s, 'value': %"PRId64" } }"
+
 #define QERR_PROPERTY_VALUE_OUT_OF_RANGE \
     "{ 'class': 'PropertyValueOutOfRange', 'data': { 'device': %s, 'property': %s, 'value': %"PRId64", 'min': %"PRId64", 'max': %"PRId64" } }"
 
-- 
cgit v1.2.3


From 1fc86bf963a0617bc9eabfa6c14dbca76cd57976 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Date: Wed, 14 Mar 2012 15:57:06 +0000
Subject: block: enforce constraints on block size properties

Nicolae Mogoreanu <mogo@google.com> noticed that I/O requests can lead
to QEMU crashes when the logical_block_size property is smaller than 512
bytes.

Using the new "blocksize" property we can properly enforce constraints
on the block size such that QEMU's block layer is able to operate
correctly.

Reported-by: Nicolae Mogoreanu <mogo@google.com>
Reported-by: Michael Halcrow <mhalcrow@google.com>
Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/block.h b/block.h
index e670606afa..c51ab163d9 100644
--- a/block.h
+++ b/block.h
@@ -435,10 +435,10 @@ static inline unsigned int get_physical_block_exp(BlockConf *conf)
 
 #define DEFINE_BLOCK_PROPERTIES(_state, _conf)                          \
     DEFINE_PROP_DRIVE("drive", _state, _conf.bs),                       \
-    DEFINE_PROP_UINT16("logical_block_size", _state,                    \
-                       _conf.logical_block_size, 512),                  \
-    DEFINE_PROP_UINT16("physical_block_size", _state,                   \
-                       _conf.physical_block_size, 512),                 \
+    DEFINE_PROP_BLOCKSIZE("logical_block_size", _state,                 \
+                          _conf.logical_block_size, 512),               \
+    DEFINE_PROP_BLOCKSIZE("physical_block_size", _state,                \
+                          _conf.physical_block_size, 512),              \
     DEFINE_PROP_UINT16("min_io_size", _state, _conf.min_io_size, 0),  \
     DEFINE_PROP_UINT32("opt_io_size", _state, _conf.opt_io_size, 0),    \
     DEFINE_PROP_INT32("bootindex", _state, _conf.bootindex, -1),        \
-- 
cgit v1.2.3


From 3d46a75aa5031788cb283ab4429acca23cf1cf92 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 19 Mar 2012 18:07:45 +0100
Subject: vdi: basic conversion to coroutines

Even a basic conversion changing the bdrv_aio_readv/bdrv_aio_writev calls
to bdrv_co_readv/bdrv_co_writev, and callbacks to goto statements can
eliminate a lot of code.  This is because error handling is simplified
and indirections through bottom halves can go away.

After this patch, I/O to the underlying file already happens via
coroutines, but the code still looks a lot like if asynchronous I/O was
being used.

Acked-by: Stefan Weil <sw@weilnetz.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/vdi.c | 158 ++++++++++++++----------------------------------------------
 1 file changed, 37 insertions(+), 121 deletions(-)

diff --git a/block/vdi.c b/block/vdi.c
index f8fa865745..5fd870054f 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -160,7 +160,6 @@ typedef struct {
     void *orig_buf;
     bool is_write;
     int header_modified;
-    BlockDriverAIOCB *hd_aiocb;
     struct iovec hd_iov;
     QEMUIOVector hd_qiov;
     QEMUBH *bh;
@@ -489,33 +488,19 @@ static int coroutine_fn vdi_co_is_allocated(BlockDriverState *bs,
     return VDI_IS_ALLOCATED(bmap_entry);
 }
 
-static void vdi_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-    /* TODO: This code is untested. How can I get it executed? */
-    VdiAIOCB *acb = container_of(blockacb, VdiAIOCB, common);
-    logout("\n");
-    if (acb->hd_aiocb) {
-        bdrv_aio_cancel(acb->hd_aiocb);
-    }
-    qemu_aio_release(acb);
-}
-
 static AIOPool vdi_aio_pool = {
     .aiocb_size = sizeof(VdiAIOCB),
-    .cancel = vdi_aio_cancel,
 };
 
 static VdiAIOCB *vdi_aio_setup(BlockDriverState *bs, int64_t sector_num,
-        QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque, int is_write)
+        QEMUIOVector *qiov, int nb_sectors, int is_write)
 {
     VdiAIOCB *acb;
 
     logout("%p, %" PRId64 ", %p, %d, %p, %p, %d\n",
-           bs, sector_num, qiov, nb_sectors, cb, opaque, is_write);
+           bs, sector_num, qiov, nb_sectors, is_write);
 
-    acb = qemu_aio_get(&vdi_aio_pool, bs, cb, opaque);
-    acb->hd_aiocb = NULL;
+    acb = qemu_aio_get(&vdi_aio_pool, bs, NULL, NULL);
     acb->sector_num = sector_num;
     acb->qiov = qiov;
     acb->is_write = is_write;
@@ -538,42 +523,7 @@ static VdiAIOCB *vdi_aio_setup(BlockDriverState *bs, int64_t sector_num,
     return acb;
 }
 
-static int vdi_schedule_bh(QEMUBHFunc *cb, VdiAIOCB *acb)
-{
-    logout("\n");
-
-    if (acb->bh) {
-        return -EIO;
-    }
-
-    acb->bh = qemu_bh_new(cb, acb);
-    if (!acb->bh) {
-        return -EIO;
-    }
-
-    qemu_bh_schedule(acb->bh);
-
-    return 0;
-}
-
-static void vdi_aio_read_cb(void *opaque, int ret);
-static void vdi_aio_write_cb(void *opaque, int ret);
-
-static void vdi_aio_rw_bh(void *opaque)
-{
-    VdiAIOCB *acb = opaque;
-    logout("\n");
-    qemu_bh_delete(acb->bh);
-    acb->bh = NULL;
-
-    if (acb->is_write) {
-        vdi_aio_write_cb(opaque, 0);
-    } else {
-        vdi_aio_read_cb(opaque, 0);
-    }
-}
-
-static void vdi_aio_read_cb(void *opaque, int ret)
+static int vdi_aio_read_cb(void *opaque, int ret)
 {
     VdiAIOCB *acb = opaque;
     BlockDriverState *bs = acb->common.bs;
@@ -585,12 +535,7 @@ static void vdi_aio_read_cb(void *opaque, int ret)
 
     logout("%u sectors read\n", acb->n_sectors);
 
-    acb->hd_aiocb = NULL;
-
-    if (ret < 0) {
-        goto done;
-    }
-
+restart:
     acb->nb_sectors -= acb->n_sectors;
 
     if (acb->nb_sectors == 0) {
@@ -618,10 +563,7 @@ static void vdi_aio_read_cb(void *opaque, int ret)
     if (!VDI_IS_ALLOCATED(bmap_entry)) {
         /* Block not allocated, return zeros, no need to wait. */
         memset(acb->buf, 0, n_sectors * SECTOR_SIZE);
-        ret = vdi_schedule_bh(vdi_aio_rw_bh, acb);
-        if (ret < 0) {
-            goto done;
-        }
+        ret = 0;
     } else {
         uint64_t offset = s->header.offset_data / SECTOR_SIZE +
                           (uint64_t)bmap_entry * s->block_sectors +
@@ -629,41 +571,34 @@ static void vdi_aio_read_cb(void *opaque, int ret)
         acb->hd_iov.iov_base = (void *)acb->buf;
         acb->hd_iov.iov_len = n_sectors * SECTOR_SIZE;
         qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
-        acb->hd_aiocb = bdrv_aio_readv(bs->file, offset, &acb->hd_qiov,
-                                       n_sectors, vdi_aio_read_cb, acb);
+        ret = bdrv_co_readv(bs->file, offset, n_sectors, &acb->hd_qiov);
+    }
+    if (ret >= 0) {
+        goto restart;
     }
-    return;
+
 done:
     if (acb->qiov->niov > 1) {
         qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size);
         qemu_vfree(acb->orig_buf);
     }
-    acb->common.cb(acb->common.opaque, ret);
     qemu_aio_release(acb);
+    return ret;
 }
 
-static BlockDriverAIOCB *vdi_aio_readv(BlockDriverState *bs,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+static int vdi_co_readv(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 {
     VdiAIOCB *acb;
     int ret;
 
     logout("\n");
-    acb = vdi_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
-    ret = vdi_schedule_bh(vdi_aio_rw_bh, acb);
-    if (ret < 0) {
-        if (acb->qiov->niov > 1) {
-            qemu_vfree(acb->orig_buf);
-        }
-        qemu_aio_release(acb);
-        return NULL;
-    }
-
-    return &acb->common;
+    acb = vdi_aio_setup(bs, sector_num, qiov, nb_sectors, 0);
+    ret = vdi_aio_read_cb(acb, 0);
+    return ret;
 }
 
-static void vdi_aio_write_cb(void *opaque, int ret)
+static int vdi_aio_write_cb(void *opaque, int ret)
 {
     VdiAIOCB *acb = opaque;
     BlockDriverState *bs = acb->common.bs;
@@ -673,12 +608,7 @@ static void vdi_aio_write_cb(void *opaque, int ret)
     uint32_t sector_in_block;
     uint32_t n_sectors;
 
-    acb->hd_aiocb = NULL;
-
-    if (ret < 0) {
-        goto done;
-    }
-
+restart:
     acb->nb_sectors -= acb->n_sectors;
     acb->sector_num += acb->n_sectors;
     acb->buf += acb->n_sectors * SECTOR_SIZE;
@@ -686,6 +616,7 @@ static void vdi_aio_write_cb(void *opaque, int ret)
     if (acb->nb_sectors == 0) {
         logout("finished data write\n");
         acb->n_sectors = 0;
+        ret = 0;
         if (acb->header_modified) {
             VdiHeader *header = acb->block_buffer;
             logout("now writing modified header\n");
@@ -696,10 +627,9 @@ static void vdi_aio_write_cb(void *opaque, int ret)
             acb->hd_iov.iov_base = acb->block_buffer;
             acb->hd_iov.iov_len = SECTOR_SIZE;
             qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
-            acb->hd_aiocb = bdrv_aio_writev(bs->file, 0, &acb->hd_qiov, 1,
-                                            vdi_aio_write_cb, acb);
-            return;
-        } else if (VDI_IS_ALLOCATED(acb->bmap_first)) {
+            ret = bdrv_co_writev(bs->file, 0, 1, &acb->hd_qiov);
+        }
+        if (ret >= 0 && VDI_IS_ALLOCATED(acb->bmap_first)) {
             /* One or more new blocks were allocated. */
             uint64_t offset;
             uint32_t bmap_first;
@@ -722,11 +652,8 @@ static void vdi_aio_write_cb(void *opaque, int ret)
             qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
             logout("will write %u block map sectors starting from entry %u\n",
                    n_sectors, bmap_first);
-            acb->hd_aiocb = bdrv_aio_writev(bs->file, offset, &acb->hd_qiov,
-                                            n_sectors, vdi_aio_write_cb, acb);
-            return;
+            ret = bdrv_co_writev(bs->file, offset, n_sectors, &acb->hd_qiov);
         }
-        ret = 0;
         goto done;
     }
 
@@ -772,9 +699,7 @@ static void vdi_aio_write_cb(void *opaque, int ret)
         acb->hd_iov.iov_base = (void *)block;
         acb->hd_iov.iov_len = s->block_size;
         qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
-        acb->hd_aiocb = bdrv_aio_writev(bs->file, offset,
-                                        &acb->hd_qiov, s->block_sectors,
-                                        vdi_aio_write_cb, acb);
+        ret = bdrv_co_writev(bs->file, offset, s->block_sectors, &acb->hd_qiov);
     } else {
         uint64_t offset = s->header.offset_data / SECTOR_SIZE +
                           (uint64_t)bmap_entry * s->block_sectors +
@@ -782,39 +707,30 @@ static void vdi_aio_write_cb(void *opaque, int ret)
         acb->hd_iov.iov_base = (void *)acb->buf;
         acb->hd_iov.iov_len = n_sectors * SECTOR_SIZE;
         qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
-        acb->hd_aiocb = bdrv_aio_writev(bs->file, offset, &acb->hd_qiov,
-                                        n_sectors, vdi_aio_write_cb, acb);
+        ret = bdrv_co_writev(bs->file, offset, n_sectors, &acb->hd_qiov);
+    }
+    if (ret >= 0) {
+        goto restart;
     }
-
-    return;
 
 done:
     if (acb->qiov->niov > 1) {
         qemu_vfree(acb->orig_buf);
     }
-    acb->common.cb(acb->common.opaque, ret);
     qemu_aio_release(acb);
+    return ret;
 }
 
-static BlockDriverAIOCB *vdi_aio_writev(BlockDriverState *bs,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+static int vdi_co_writev(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 {
     VdiAIOCB *acb;
     int ret;
 
     logout("\n");
-    acb = vdi_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
-    ret = vdi_schedule_bh(vdi_aio_rw_bh, acb);
-    if (ret < 0) {
-        if (acb->qiov->niov > 1) {
-            qemu_vfree(acb->orig_buf);
-        }
-        qemu_aio_release(acb);
-        return NULL;
-    }
-
-    return &acb->common;
+    acb = vdi_aio_setup(bs, sector_num, qiov, nb_sectors, 1);
+    ret = vdi_aio_write_cb(acb, 0);
+    return ret;
 }
 
 static int vdi_create(const char *filename, QEMUOptionParameter *options)
@@ -965,9 +881,9 @@ static BlockDriver bdrv_vdi = {
     .bdrv_co_is_allocated = vdi_co_is_allocated,
     .bdrv_make_empty = vdi_make_empty,
 
-    .bdrv_aio_readv = vdi_aio_readv,
+    .bdrv_co_readv = vdi_co_readv,
 #if defined(CONFIG_VDI_WRITE)
-    .bdrv_aio_writev = vdi_aio_writev,
+    .bdrv_co_writev = vdi_co_writev,
 #endif
 
     .bdrv_get_info = vdi_get_info,
-- 
cgit v1.2.3


From 0c7bfc321b4d19c91a59f1ff7b31418e67b8681a Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 19 Mar 2012 18:07:46 +0100
Subject: vdi: move end-of-I/O handling at the end

The next step is to take code that only triggers after the first operation,
and move it at the end of vdi_aio_read_cb and vdi_aio_write_cb.

Acked-by: Stefan Weil <sw@weilnetz.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/vdi.c | 123 +++++++++++++++++++++++++++---------------------------------
 1 file changed, 56 insertions(+), 67 deletions(-)

diff --git a/block/vdi.c b/block/vdi.c
index 5fd870054f..df0f4316d4 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -533,20 +533,7 @@ static int vdi_aio_read_cb(void *opaque, int ret)
     uint32_t sector_in_block;
     uint32_t n_sectors;
 
-    logout("%u sectors read\n", acb->n_sectors);
-
 restart:
-    acb->nb_sectors -= acb->n_sectors;
-
-    if (acb->nb_sectors == 0) {
-        /* request completed */
-        ret = 0;
-        goto done;
-    }
-
-    acb->sector_num += acb->n_sectors;
-    acb->buf += acb->n_sectors * SECTOR_SIZE;
-
     block_index = acb->sector_num / s->block_sectors;
     sector_in_block = acb->sector_num % s->block_sectors;
     n_sectors = s->block_sectors - sector_in_block;
@@ -573,11 +560,16 @@ restart:
         qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
         ret = bdrv_co_readv(bs->file, offset, n_sectors, &acb->hd_qiov);
     }
-    if (ret >= 0) {
+    logout("%u sectors read\n", acb->n_sectors);
+
+    acb->nb_sectors -= acb->n_sectors;
+    acb->sector_num += acb->n_sectors;
+    acb->buf += acb->n_sectors * SECTOR_SIZE;
+
+    if (ret >= 0 && acb->nb_sectors > 0) {
         goto restart;
     }
 
-done:
     if (acb->qiov->niov > 1) {
         qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size);
         qemu_vfree(acb->orig_buf);
@@ -609,56 +601,6 @@ static int vdi_aio_write_cb(void *opaque, int ret)
     uint32_t n_sectors;
 
 restart:
-    acb->nb_sectors -= acb->n_sectors;
-    acb->sector_num += acb->n_sectors;
-    acb->buf += acb->n_sectors * SECTOR_SIZE;
-
-    if (acb->nb_sectors == 0) {
-        logout("finished data write\n");
-        acb->n_sectors = 0;
-        ret = 0;
-        if (acb->header_modified) {
-            VdiHeader *header = acb->block_buffer;
-            logout("now writing modified header\n");
-            assert(VDI_IS_ALLOCATED(acb->bmap_first));
-            *header = s->header;
-            vdi_header_to_le(header);
-            acb->header_modified = 0;
-            acb->hd_iov.iov_base = acb->block_buffer;
-            acb->hd_iov.iov_len = SECTOR_SIZE;
-            qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
-            ret = bdrv_co_writev(bs->file, 0, 1, &acb->hd_qiov);
-        }
-        if (ret >= 0 && VDI_IS_ALLOCATED(acb->bmap_first)) {
-            /* One or more new blocks were allocated. */
-            uint64_t offset;
-            uint32_t bmap_first;
-            uint32_t bmap_last;
-            g_free(acb->block_buffer);
-            acb->block_buffer = NULL;
-            bmap_first = acb->bmap_first;
-            bmap_last = acb->bmap_last;
-            logout("now writing modified block map entry %u...%u\n",
-                   bmap_first, bmap_last);
-            /* Write modified sectors from block map. */
-            bmap_first /= (SECTOR_SIZE / sizeof(uint32_t));
-            bmap_last /= (SECTOR_SIZE / sizeof(uint32_t));
-            n_sectors = bmap_last - bmap_first + 1;
-            offset = s->bmap_sector + bmap_first;
-            acb->bmap_first = VDI_UNALLOCATED;
-            acb->hd_iov.iov_base = (void *)((uint8_t *)&s->bmap[0] +
-                                            bmap_first * SECTOR_SIZE);
-            acb->hd_iov.iov_len = n_sectors * SECTOR_SIZE;
-            qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
-            logout("will write %u block map sectors starting from entry %u\n",
-                   n_sectors, bmap_first);
-            ret = bdrv_co_writev(bs->file, offset, n_sectors, &acb->hd_qiov);
-        }
-        goto done;
-    }
-
-    logout("%u sectors written\n", acb->n_sectors);
-
     block_index = acb->sector_num / s->block_sectors;
     sector_in_block = acb->sector_num % s->block_sectors;
     n_sectors = s->block_sectors - sector_in_block;
@@ -709,11 +651,58 @@ restart:
         qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
         ret = bdrv_co_writev(bs->file, offset, n_sectors, &acb->hd_qiov);
     }
-    if (ret >= 0) {
+
+    acb->nb_sectors -= acb->n_sectors;
+    acb->sector_num += acb->n_sectors;
+    acb->buf += acb->n_sectors * SECTOR_SIZE;
+
+    logout("%u sectors written\n", acb->n_sectors);
+    if (ret >= 0 && acb->nb_sectors > 0) {
         goto restart;
     }
 
-done:
+    logout("finished data write\n");
+    if (ret >= 0) {
+        ret = 0;
+        if (acb->header_modified) {
+            VdiHeader *header = acb->block_buffer;
+            logout("now writing modified header\n");
+            assert(VDI_IS_ALLOCATED(acb->bmap_first));
+            *header = s->header;
+            vdi_header_to_le(header);
+            acb->header_modified = 0;
+            acb->hd_iov.iov_base = acb->block_buffer;
+            acb->hd_iov.iov_len = SECTOR_SIZE;
+            qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+            ret = bdrv_co_writev(bs->file, 0, 1, &acb->hd_qiov);
+        }
+        if (ret >= 0 && VDI_IS_ALLOCATED(acb->bmap_first)) {
+            /* One or more new blocks were allocated. */
+            uint64_t offset;
+            uint32_t bmap_first;
+            uint32_t bmap_last;
+            g_free(acb->block_buffer);
+            acb->block_buffer = NULL;
+            bmap_first = acb->bmap_first;
+            bmap_last = acb->bmap_last;
+            logout("now writing modified block map entry %u...%u\n",
+                   bmap_first, bmap_last);
+            /* Write modified sectors from block map. */
+            bmap_first /= (SECTOR_SIZE / sizeof(uint32_t));
+            bmap_last /= (SECTOR_SIZE / sizeof(uint32_t));
+            n_sectors = bmap_last - bmap_first + 1;
+            offset = s->bmap_sector + bmap_first;
+            acb->bmap_first = VDI_UNALLOCATED;
+            acb->hd_iov.iov_base = (void *)((uint8_t *)&s->bmap[0] +
+                                            bmap_first * SECTOR_SIZE);
+            acb->hd_iov.iov_len = n_sectors * SECTOR_SIZE;
+            qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+            logout("will write %u block map sectors starting from entry %u\n",
+                   n_sectors, bmap_first);
+            ret = bdrv_co_writev(bs->file, offset, n_sectors, &acb->hd_qiov);
+        }
+    }
+
     if (acb->qiov->niov > 1) {
         qemu_vfree(acb->orig_buf);
     }
-- 
cgit v1.2.3


From 4de659e8eb75df995310c5e57018c5a089c8fc6f Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 19 Mar 2012 18:07:47 +0100
Subject: vdi: merge aio_read_cb and aio_write_cb into callers

Now inline the former AIO callbacks into vdi_co_readv and vdi_co_writev.
While many cleanups are possible, the code now really looks synchronous.

Acked-by: Stefan Weil <sw@weilnetz.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/vdi.c | 40 ++++++++++++----------------------------
 1 file changed, 12 insertions(+), 28 deletions(-)

diff --git a/block/vdi.c b/block/vdi.c
index df0f4316d4..407fccc9a6 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -523,15 +523,19 @@ static VdiAIOCB *vdi_aio_setup(BlockDriverState *bs, int64_t sector_num,
     return acb;
 }
 
-static int vdi_aio_read_cb(void *opaque, int ret)
+static int vdi_co_readv(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 {
-    VdiAIOCB *acb = opaque;
-    BlockDriverState *bs = acb->common.bs;
+    VdiAIOCB *acb;
     BDRVVdiState *s = bs->opaque;
     uint32_t bmap_entry;
     uint32_t block_index;
     uint32_t sector_in_block;
     uint32_t n_sectors;
+    int ret;
+
+    logout("\n");
+    acb = vdi_aio_setup(bs, sector_num, qiov, nb_sectors, 0);
 
 restart:
     block_index = acb->sector_num / s->block_sectors;
@@ -578,27 +582,19 @@ restart:
     return ret;
 }
 
-static int vdi_co_readv(BlockDriverState *bs,
+static int vdi_co_writev(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 {
     VdiAIOCB *acb;
-    int ret;
-
-    logout("\n");
-    acb = vdi_aio_setup(bs, sector_num, qiov, nb_sectors, 0);
-    ret = vdi_aio_read_cb(acb, 0);
-    return ret;
-}
-
-static int vdi_aio_write_cb(void *opaque, int ret)
-{
-    VdiAIOCB *acb = opaque;
-    BlockDriverState *bs = acb->common.bs;
     BDRVVdiState *s = bs->opaque;
     uint32_t bmap_entry;
     uint32_t block_index;
     uint32_t sector_in_block;
     uint32_t n_sectors;
+    int ret;
+
+    logout("\n");
+    acb = vdi_aio_setup(bs, sector_num, qiov, nb_sectors, 1);
 
 restart:
     block_index = acb->sector_num / s->block_sectors;
@@ -710,18 +706,6 @@ restart:
     return ret;
 }
 
-static int vdi_co_writev(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
-{
-    VdiAIOCB *acb;
-    int ret;
-
-    logout("\n");
-    acb = vdi_aio_setup(bs, sector_num, qiov, nb_sectors, 1);
-    ret = vdi_aio_write_cb(acb, 0);
-    return ret;
-}
-
 static int vdi_create(const char *filename, QEMUOptionParameter *options)
 {
     int fd;
-- 
cgit v1.2.3


From bfc45fc183619eb4f76e3979598a510f41cd88e3 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 19 Mar 2012 18:07:48 +0100
Subject: vdi: move aiocb fields to locals

Most of the AIOCB really holds local variables that need to persist
across callback invocation.  It can go away now.

Acked-by: Stefan Weil <sw@weilnetz.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/vdi.c | 163 ++++++++++++++++++++++++------------------------------------
 1 file changed, 65 insertions(+), 98 deletions(-)

diff --git a/block/vdi.c b/block/vdi.c
index 407fccc9a6..0c1cff6031 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -145,24 +145,8 @@ void uuid_unparse(const uuid_t uu, char *out)
 
 typedef struct {
     BlockDriverAIOCB common;
-    int64_t sector_num;
-    QEMUIOVector *qiov;
     uint8_t *buf;
-    /* Total number of sectors. */
-    int nb_sectors;
-    /* Number of sectors for current AIO. */
-    int n_sectors;
-    /* New allocated block map entry. */
-    uint32_t bmap_first;
-    uint32_t bmap_last;
-    /* Buffer for new allocated block. */
-    void *block_buffer;
     void *orig_buf;
-    bool is_write;
-    int header_modified;
-    struct iovec hd_iov;
-    QEMUIOVector hd_qiov;
-    QEMUBH *bh;
 } VdiAIOCB;
 
 typedef struct {
@@ -492,34 +476,20 @@ static AIOPool vdi_aio_pool = {
     .aiocb_size = sizeof(VdiAIOCB),
 };
 
-static VdiAIOCB *vdi_aio_setup(BlockDriverState *bs, int64_t sector_num,
-        QEMUIOVector *qiov, int nb_sectors, int is_write)
+static VdiAIOCB *vdi_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov)
 {
     VdiAIOCB *acb;
 
-    logout("%p, %" PRId64 ", %p, %d, %p, %p, %d\n",
-           bs, sector_num, qiov, nb_sectors, is_write);
+    logout("%p, %p\n", bs, qiov);
 
     acb = qemu_aio_get(&vdi_aio_pool, bs, NULL, NULL);
-    acb->sector_num = sector_num;
-    acb->qiov = qiov;
-    acb->is_write = is_write;
 
     if (qiov->niov > 1) {
         acb->buf = qemu_blockalign(bs, qiov->size);
         acb->orig_buf = acb->buf;
-        if (is_write) {
-            qemu_iovec_to_buffer(qiov, acb->buf);
-        }
     } else {
         acb->buf = (uint8_t *)qiov->iov->iov_base;
     }
-    acb->nb_sectors = nb_sectors;
-    acb->n_sectors = 0;
-    acb->bmap_first = VDI_UNALLOCATED;
-    acb->bmap_last = VDI_UNALLOCATED;
-    acb->block_buffer = NULL;
-    acb->header_modified = 0;
     return acb;
 }
 
@@ -532,24 +502,25 @@ static int vdi_co_readv(BlockDriverState *bs,
     uint32_t block_index;
     uint32_t sector_in_block;
     uint32_t n_sectors;
+    struct iovec hd_iov;
+    QEMUIOVector hd_qiov;
     int ret;
 
     logout("\n");
-    acb = vdi_aio_setup(bs, sector_num, qiov, nb_sectors, 0);
+    acb = vdi_aio_setup(bs, qiov);
 
 restart:
-    block_index = acb->sector_num / s->block_sectors;
-    sector_in_block = acb->sector_num % s->block_sectors;
+    block_index = sector_num / s->block_sectors;
+    sector_in_block = sector_num % s->block_sectors;
     n_sectors = s->block_sectors - sector_in_block;
-    if (n_sectors > acb->nb_sectors) {
-        n_sectors = acb->nb_sectors;
+    if (n_sectors > nb_sectors) {
+        n_sectors = nb_sectors;
     }
 
     logout("will read %u sectors starting at sector %" PRIu64 "\n",
-           n_sectors, acb->sector_num);
+           n_sectors, sector_num);
 
     /* prepare next AIO request */
-    acb->n_sectors = n_sectors;
     bmap_entry = le32_to_cpu(s->bmap[block_index]);
     if (!VDI_IS_ALLOCATED(bmap_entry)) {
         /* Block not allocated, return zeros, no need to wait. */
@@ -559,23 +530,23 @@ restart:
         uint64_t offset = s->header.offset_data / SECTOR_SIZE +
                           (uint64_t)bmap_entry * s->block_sectors +
                           sector_in_block;
-        acb->hd_iov.iov_base = (void *)acb->buf;
-        acb->hd_iov.iov_len = n_sectors * SECTOR_SIZE;
-        qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
-        ret = bdrv_co_readv(bs->file, offset, n_sectors, &acb->hd_qiov);
+        hd_iov.iov_base = (void *)acb->buf;
+        hd_iov.iov_len = n_sectors * SECTOR_SIZE;
+        qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
+        ret = bdrv_co_readv(bs->file, offset, n_sectors, &hd_qiov);
     }
-    logout("%u sectors read\n", acb->n_sectors);
+    logout("%u sectors read\n", n_sectors);
 
-    acb->nb_sectors -= acb->n_sectors;
-    acb->sector_num += acb->n_sectors;
-    acb->buf += acb->n_sectors * SECTOR_SIZE;
+    nb_sectors -= n_sectors;
+    sector_num += n_sectors;
+    acb->buf += n_sectors * SECTOR_SIZE;
 
-    if (ret >= 0 && acb->nb_sectors > 0) {
+    if (ret >= 0 && nb_sectors > 0) {
         goto restart;
     }
 
-    if (acb->qiov->niov > 1) {
-        qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size);
+    if (acb->orig_buf) {
+        qemu_iovec_from_buffer(qiov, acb->orig_buf, qiov->size);
         qemu_vfree(acb->orig_buf);
     }
     qemu_aio_release(acb);
@@ -591,96 +562,93 @@ static int vdi_co_writev(BlockDriverState *bs,
     uint32_t block_index;
     uint32_t sector_in_block;
     uint32_t n_sectors;
+    uint32_t bmap_first = VDI_UNALLOCATED;
+    uint32_t bmap_last = VDI_UNALLOCATED;
+    struct iovec hd_iov;
+    QEMUIOVector hd_qiov;
+    uint8_t *block = NULL;
     int ret;
 
     logout("\n");
-    acb = vdi_aio_setup(bs, sector_num, qiov, nb_sectors, 1);
+    acb = vdi_aio_setup(bs, qiov);
+    if (acb->orig_buf) {
+        qemu_iovec_to_buffer(qiov, acb->buf);
+    }
 
 restart:
-    block_index = acb->sector_num / s->block_sectors;
-    sector_in_block = acb->sector_num % s->block_sectors;
+    block_index = sector_num / s->block_sectors;
+    sector_in_block = sector_num % s->block_sectors;
     n_sectors = s->block_sectors - sector_in_block;
-    if (n_sectors > acb->nb_sectors) {
-        n_sectors = acb->nb_sectors;
+    if (n_sectors > nb_sectors) {
+        n_sectors = nb_sectors;
     }
 
     logout("will write %u sectors starting at sector %" PRIu64 "\n",
-           n_sectors, acb->sector_num);
+           n_sectors, sector_num);
 
     /* prepare next AIO request */
-    acb->n_sectors = n_sectors;
     bmap_entry = le32_to_cpu(s->bmap[block_index]);
     if (!VDI_IS_ALLOCATED(bmap_entry)) {
         /* Allocate new block and write to it. */
         uint64_t offset;
-        uint8_t *block;
         bmap_entry = s->header.blocks_allocated;
         s->bmap[block_index] = cpu_to_le32(bmap_entry);
         s->header.blocks_allocated++;
         offset = s->header.offset_data / SECTOR_SIZE +
                  (uint64_t)bmap_entry * s->block_sectors;
-        block = acb->block_buffer;
         if (block == NULL) {
             block = g_malloc(s->block_size);
-            acb->block_buffer = block;
-            acb->bmap_first = block_index;
-            assert(!acb->header_modified);
-            acb->header_modified = 1;
+            bmap_first = block_index;
         }
-        acb->bmap_last = block_index;
+        bmap_last = block_index;
         /* Copy data to be written to new block and zero unused parts. */
         memset(block, 0, sector_in_block * SECTOR_SIZE);
         memcpy(block + sector_in_block * SECTOR_SIZE,
                acb->buf, n_sectors * SECTOR_SIZE);
         memset(block + (sector_in_block + n_sectors) * SECTOR_SIZE, 0,
                (s->block_sectors - n_sectors - sector_in_block) * SECTOR_SIZE);
-        acb->hd_iov.iov_base = (void *)block;
-        acb->hd_iov.iov_len = s->block_size;
-        qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
-        ret = bdrv_co_writev(bs->file, offset, s->block_sectors, &acb->hd_qiov);
+        hd_iov.iov_base = (void *)block;
+        hd_iov.iov_len = s->block_size;
+        qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
+        ret = bdrv_co_writev(bs->file, offset, s->block_sectors, &hd_qiov);
     } else {
         uint64_t offset = s->header.offset_data / SECTOR_SIZE +
                           (uint64_t)bmap_entry * s->block_sectors +
                           sector_in_block;
-        acb->hd_iov.iov_base = (void *)acb->buf;
-        acb->hd_iov.iov_len = n_sectors * SECTOR_SIZE;
-        qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
-        ret = bdrv_co_writev(bs->file, offset, n_sectors, &acb->hd_qiov);
+        hd_iov.iov_base = (void *)acb->buf;
+        hd_iov.iov_len = n_sectors * SECTOR_SIZE;
+        qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
+        ret = bdrv_co_writev(bs->file, offset, n_sectors, &hd_qiov);
     }
 
-    acb->nb_sectors -= acb->n_sectors;
-    acb->sector_num += acb->n_sectors;
-    acb->buf += acb->n_sectors * SECTOR_SIZE;
+    nb_sectors -= n_sectors;
+    sector_num += n_sectors;
+    acb->buf += n_sectors * SECTOR_SIZE;
 
-    logout("%u sectors written\n", acb->n_sectors);
-    if (ret >= 0 && acb->nb_sectors > 0) {
+    logout("%u sectors written\n", n_sectors);
+    if (ret >= 0 && nb_sectors > 0) {
         goto restart;
     }
 
     logout("finished data write\n");
     if (ret >= 0) {
         ret = 0;
-        if (acb->header_modified) {
-            VdiHeader *header = acb->block_buffer;
+        if (block) {
+            VdiHeader *header = (VdiHeader *) block;
             logout("now writing modified header\n");
-            assert(VDI_IS_ALLOCATED(acb->bmap_first));
+            assert(VDI_IS_ALLOCATED(bmap_first));
             *header = s->header;
             vdi_header_to_le(header);
-            acb->header_modified = 0;
-            acb->hd_iov.iov_base = acb->block_buffer;
-            acb->hd_iov.iov_len = SECTOR_SIZE;
-            qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
-            ret = bdrv_co_writev(bs->file, 0, 1, &acb->hd_qiov);
+            hd_iov.iov_base = block;
+            hd_iov.iov_len = SECTOR_SIZE;
+            qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
+            ret = bdrv_co_writev(bs->file, 0, 1, &hd_qiov);
         }
-        if (ret >= 0 && VDI_IS_ALLOCATED(acb->bmap_first)) {
+        g_free(block);
+        block = NULL;
+        if (ret >= 0 && VDI_IS_ALLOCATED(bmap_first)) {
             /* One or more new blocks were allocated. */
             uint64_t offset;
-            uint32_t bmap_first;
-            uint32_t bmap_last;
-            g_free(acb->block_buffer);
-            acb->block_buffer = NULL;
-            bmap_first = acb->bmap_first;
-            bmap_last = acb->bmap_last;
             logout("now writing modified block map entry %u...%u\n",
                    bmap_first, bmap_last);
             /* Write modified sectors from block map. */
@@ -688,18 +656,17 @@ restart:
             bmap_last /= (SECTOR_SIZE / sizeof(uint32_t));
             n_sectors = bmap_last - bmap_first + 1;
             offset = s->bmap_sector + bmap_first;
-            acb->bmap_first = VDI_UNALLOCATED;
-            acb->hd_iov.iov_base = (void *)((uint8_t *)&s->bmap[0] +
+            hd_iov.iov_base = (void *)((uint8_t *)&s->bmap[0] +
                                             bmap_first * SECTOR_SIZE);
-            acb->hd_iov.iov_len = n_sectors * SECTOR_SIZE;
-            qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+            hd_iov.iov_len = n_sectors * SECTOR_SIZE;
+            qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
             logout("will write %u block map sectors starting from entry %u\n",
                    n_sectors, bmap_first);
-            ret = bdrv_co_writev(bs->file, offset, n_sectors, &acb->hd_qiov);
+            ret = bdrv_co_writev(bs->file, offset, n_sectors, &hd_qiov);
         }
     }
 
-    if (acb->qiov->niov > 1) {
+    if (acb->orig_buf) {
         qemu_vfree(acb->orig_buf);
     }
     qemu_aio_release(acb);
-- 
cgit v1.2.3


From a7a43aa199571a336b76a9a9310c95476c6dc5c4 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 19 Mar 2012 18:07:49 +0100
Subject: vdi: leave bounce buffering to block layer

vdi.c really works as if it implemented bdrv_read and bdrv_write.  However,
because only vector I/O is supported by the asynchronous callbacks, it
went through extra pain to bounce-buffer the I/O.  This can be handled
by the block layer now that the format is coroutine-based.

Acked-by: Stefan Weil <sw@weilnetz.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/vdi.c | 67 +++++++++++--------------------------------------------------
 1 file changed, 12 insertions(+), 55 deletions(-)

diff --git a/block/vdi.c b/block/vdi.c
index 0c1cff6031..790b61ef59 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -143,12 +143,6 @@ void uuid_unparse(const uuid_t uu, char *out)
 }
 #endif
 
-typedef struct {
-    BlockDriverAIOCB common;
-    uint8_t *buf;
-    void *orig_buf;
-} VdiAIOCB;
-
 typedef struct {
     char text[0x40];
     uint32_t signature;
@@ -472,31 +466,9 @@ static int coroutine_fn vdi_co_is_allocated(BlockDriverState *bs,
     return VDI_IS_ALLOCATED(bmap_entry);
 }
 
-static AIOPool vdi_aio_pool = {
-    .aiocb_size = sizeof(VdiAIOCB),
-};
-
-static VdiAIOCB *vdi_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov)
+static int vdi_co_read(BlockDriverState *bs,
+        int64_t sector_num, uint8_t *buf, int nb_sectors)
 {
-    VdiAIOCB *acb;
-
-    logout("%p, %p\n", bs, qiov);
-
-    acb = qemu_aio_get(&vdi_aio_pool, bs, NULL, NULL);
-
-    if (qiov->niov > 1) {
-        acb->buf = qemu_blockalign(bs, qiov->size);
-        acb->orig_buf = acb->buf;
-    } else {
-        acb->buf = (uint8_t *)qiov->iov->iov_base;
-    }
-    return acb;
-}
-
-static int vdi_co_readv(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
-{
-    VdiAIOCB *acb;
     BDRVVdiState *s = bs->opaque;
     uint32_t bmap_entry;
     uint32_t block_index;
@@ -507,7 +479,6 @@ static int vdi_co_readv(BlockDriverState *bs,
     int ret;
 
     logout("\n");
-    acb = vdi_aio_setup(bs, qiov);
 
 restart:
     block_index = sector_num / s->block_sectors;
@@ -524,13 +495,13 @@ restart:
     bmap_entry = le32_to_cpu(s->bmap[block_index]);
     if (!VDI_IS_ALLOCATED(bmap_entry)) {
         /* Block not allocated, return zeros, no need to wait. */
-        memset(acb->buf, 0, n_sectors * SECTOR_SIZE);
+        memset(buf, 0, n_sectors * SECTOR_SIZE);
         ret = 0;
     } else {
         uint64_t offset = s->header.offset_data / SECTOR_SIZE +
                           (uint64_t)bmap_entry * s->block_sectors +
                           sector_in_block;
-        hd_iov.iov_base = (void *)acb->buf;
+        hd_iov.iov_base = (void *)buf;
         hd_iov.iov_len = n_sectors * SECTOR_SIZE;
         qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
         ret = bdrv_co_readv(bs->file, offset, n_sectors, &hd_qiov);
@@ -539,24 +510,18 @@ restart:
 
     nb_sectors -= n_sectors;
     sector_num += n_sectors;
-    acb->buf += n_sectors * SECTOR_SIZE;
+    buf += n_sectors * SECTOR_SIZE;
 
     if (ret >= 0 && nb_sectors > 0) {
         goto restart;
     }
 
-    if (acb->orig_buf) {
-        qemu_iovec_from_buffer(qiov, acb->orig_buf, qiov->size);
-        qemu_vfree(acb->orig_buf);
-    }
-    qemu_aio_release(acb);
     return ret;
 }
 
-static int vdi_co_writev(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+static int vdi_co_write(BlockDriverState *bs,
+        int64_t sector_num, const uint8_t *buf, int nb_sectors)
 {
-    VdiAIOCB *acb;
     BDRVVdiState *s = bs->opaque;
     uint32_t bmap_entry;
     uint32_t block_index;
@@ -570,10 +535,6 @@ static int vdi_co_writev(BlockDriverState *bs,
     int ret;
 
     logout("\n");
-    acb = vdi_aio_setup(bs, qiov);
-    if (acb->orig_buf) {
-        qemu_iovec_to_buffer(qiov, acb->buf);
-    }
 
 restart:
     block_index = sector_num / s->block_sectors;
@@ -604,7 +565,7 @@ restart:
         /* Copy data to be written to new block and zero unused parts. */
         memset(block, 0, sector_in_block * SECTOR_SIZE);
         memcpy(block + sector_in_block * SECTOR_SIZE,
-               acb->buf, n_sectors * SECTOR_SIZE);
+               buf, n_sectors * SECTOR_SIZE);
         memset(block + (sector_in_block + n_sectors) * SECTOR_SIZE, 0,
                (s->block_sectors - n_sectors - sector_in_block) * SECTOR_SIZE);
         hd_iov.iov_base = (void *)block;
@@ -615,7 +576,7 @@ restart:
         uint64_t offset = s->header.offset_data / SECTOR_SIZE +
                           (uint64_t)bmap_entry * s->block_sectors +
                           sector_in_block;
-        hd_iov.iov_base = (void *)acb->buf;
+        hd_iov.iov_base = (void *)buf;
         hd_iov.iov_len = n_sectors * SECTOR_SIZE;
         qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
         ret = bdrv_co_writev(bs->file, offset, n_sectors, &hd_qiov);
@@ -623,7 +584,7 @@ restart:
 
     nb_sectors -= n_sectors;
     sector_num += n_sectors;
-    acb->buf += n_sectors * SECTOR_SIZE;
+    buf += n_sectors * SECTOR_SIZE;
 
     logout("%u sectors written\n", n_sectors);
     if (ret >= 0 && nb_sectors > 0) {
@@ -666,10 +627,6 @@ restart:
         }
     }
 
-    if (acb->orig_buf) {
-        qemu_vfree(acb->orig_buf);
-    }
-    qemu_aio_release(acb);
     return ret;
 }
 
@@ -821,9 +778,9 @@ static BlockDriver bdrv_vdi = {
     .bdrv_co_is_allocated = vdi_co_is_allocated,
     .bdrv_make_empty = vdi_make_empty,
 
-    .bdrv_co_readv = vdi_co_readv,
+    .bdrv_read = vdi_co_read,
 #if defined(CONFIG_VDI_WRITE)
-    .bdrv_co_writev = vdi_co_writev,
+    .bdrv_write = vdi_co_write,
 #endif
 
     .bdrv_get_info = vdi_get_info,
-- 
cgit v1.2.3


From 4eea78e6345518bfe87493d8b163e1e268e64f74 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 19 Mar 2012 18:07:50 +0100
Subject: vdi: do not create useless iovecs

Reads and writes to the underlying file can also occur with the simple
non-vectored I/O interfaces.

Acked-by: Stefan Weil <sw@weilnetz.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/vdi.c | 79 ++++++++++++++++++++++++++-----------------------------------
 1 file changed, 33 insertions(+), 46 deletions(-)

diff --git a/block/vdi.c b/block/vdi.c
index 790b61ef59..bfeafea1d8 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -474,8 +474,6 @@ static int vdi_co_read(BlockDriverState *bs,
     uint32_t block_index;
     uint32_t sector_in_block;
     uint32_t n_sectors;
-    struct iovec hd_iov;
-    QEMUIOVector hd_qiov;
     int ret;
 
     logout("\n");
@@ -501,10 +499,7 @@ restart:
         uint64_t offset = s->header.offset_data / SECTOR_SIZE +
                           (uint64_t)bmap_entry * s->block_sectors +
                           sector_in_block;
-        hd_iov.iov_base = (void *)buf;
-        hd_iov.iov_len = n_sectors * SECTOR_SIZE;
-        qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
-        ret = bdrv_co_readv(bs->file, offset, n_sectors, &hd_qiov);
+        ret = bdrv_read(bs->file, offset, buf, n_sectors);
     }
     logout("%u sectors read\n", n_sectors);
 
@@ -529,8 +524,6 @@ static int vdi_co_write(BlockDriverState *bs,
     uint32_t n_sectors;
     uint32_t bmap_first = VDI_UNALLOCATED;
     uint32_t bmap_last = VDI_UNALLOCATED;
-    struct iovec hd_iov;
-    QEMUIOVector hd_qiov;
     uint8_t *block = NULL;
     int ret;
 
@@ -568,18 +561,12 @@ restart:
                buf, n_sectors * SECTOR_SIZE);
         memset(block + (sector_in_block + n_sectors) * SECTOR_SIZE, 0,
                (s->block_sectors - n_sectors - sector_in_block) * SECTOR_SIZE);
-        hd_iov.iov_base = (void *)block;
-        hd_iov.iov_len = s->block_size;
-        qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
-        ret = bdrv_co_writev(bs->file, offset, s->block_sectors, &hd_qiov);
+        ret = bdrv_write(bs->file, offset, block, s->block_sectors);
     } else {
         uint64_t offset = s->header.offset_data / SECTOR_SIZE +
                           (uint64_t)bmap_entry * s->block_sectors +
                           sector_in_block;
-        hd_iov.iov_base = (void *)buf;
-        hd_iov.iov_len = n_sectors * SECTOR_SIZE;
-        qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
-        ret = bdrv_co_writev(bs->file, offset, n_sectors, &hd_qiov);
+        ret = bdrv_write(bs->file, offset, buf, n_sectors);
     }
 
     nb_sectors -= n_sectors;
@@ -592,39 +579,39 @@ restart:
     }
 
     logout("finished data write\n");
-    if (ret >= 0) {
-        ret = 0;
-        if (block) {
-            VdiHeader *header = (VdiHeader *) block;
-            logout("now writing modified header\n");
-            assert(VDI_IS_ALLOCATED(bmap_first));
-            *header = s->header;
-            vdi_header_to_le(header);
-            hd_iov.iov_base = block;
-            hd_iov.iov_len = SECTOR_SIZE;
-            qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
-            ret = bdrv_co_writev(bs->file, 0, 1, &hd_qiov);
-        }
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (block) {
+        /* One or more new blocks were allocated. */
+        VdiHeader *header = (VdiHeader *) block;
+        uint8_t *base;
+        uint64_t offset;
+
+        logout("now writing modified header\n");
+        assert(VDI_IS_ALLOCATED(bmap_first));
+        *header = s->header;
+        vdi_header_to_le(header);
+        ret = bdrv_write(bs->file, 0, block, 1);
         g_free(block);
         block = NULL;
-        if (ret >= 0 && VDI_IS_ALLOCATED(bmap_first)) {
-            /* One or more new blocks were allocated. */
-            uint64_t offset;
-            logout("now writing modified block map entry %u...%u\n",
-                   bmap_first, bmap_last);
-            /* Write modified sectors from block map. */
-            bmap_first /= (SECTOR_SIZE / sizeof(uint32_t));
-            bmap_last /= (SECTOR_SIZE / sizeof(uint32_t));
-            n_sectors = bmap_last - bmap_first + 1;
-            offset = s->bmap_sector + bmap_first;
-            hd_iov.iov_base = (void *)((uint8_t *)&s->bmap[0] +
-                                            bmap_first * SECTOR_SIZE);
-            hd_iov.iov_len = n_sectors * SECTOR_SIZE;
-            qemu_iovec_init_external(&hd_qiov, &hd_iov, 1);
-            logout("will write %u block map sectors starting from entry %u\n",
-                   n_sectors, bmap_first);
-            ret = bdrv_co_writev(bs->file, offset, n_sectors, &hd_qiov);
+
+        if (ret < 0) {
+            return ret;
         }
+
+        logout("now writing modified block map entry %u...%u\n",
+               bmap_first, bmap_last);
+        /* Write modified sectors from block map. */
+        bmap_first /= (SECTOR_SIZE / sizeof(uint32_t));
+        bmap_last /= (SECTOR_SIZE / sizeof(uint32_t));
+        n_sectors = bmap_last - bmap_first + 1;
+        offset = s->bmap_sector + bmap_first;
+        base = ((uint8_t *)&s->bmap[0]) + bmap_first * SECTOR_SIZE;
+        logout("will write %u block map sectors starting from entry %u\n",
+               n_sectors, bmap_first);
+        ret = bdrv_write(bs->file, offset, base, n_sectors);
     }
 
     return ret;
-- 
cgit v1.2.3


From eb9566d13e30dd7e20d978632a13915cbdb9a668 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 19 Mar 2012 18:07:51 +0100
Subject: vdi: change goto to loop

Finally reindent all code and change goto statements to a loop.

Acked-by: Stefan Weil <sw@weilnetz.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/vdi.c | 141 +++++++++++++++++++++++++++++-------------------------------
 1 file changed, 68 insertions(+), 73 deletions(-)

diff --git a/block/vdi.c b/block/vdi.c
index bfeafea1d8..119d3c74da 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -474,41 +474,38 @@ static int vdi_co_read(BlockDriverState *bs,
     uint32_t block_index;
     uint32_t sector_in_block;
     uint32_t n_sectors;
-    int ret;
+    int ret = 0;
 
     logout("\n");
 
-restart:
-    block_index = sector_num / s->block_sectors;
-    sector_in_block = sector_num % s->block_sectors;
-    n_sectors = s->block_sectors - sector_in_block;
-    if (n_sectors > nb_sectors) {
-        n_sectors = nb_sectors;
-    }
-
-    logout("will read %u sectors starting at sector %" PRIu64 "\n",
-           n_sectors, sector_num);
+    while (ret >= 0 && nb_sectors > 0) {
+        block_index = sector_num / s->block_sectors;
+        sector_in_block = sector_num % s->block_sectors;
+        n_sectors = s->block_sectors - sector_in_block;
+        if (n_sectors > nb_sectors) {
+            n_sectors = nb_sectors;
+        }
 
-    /* prepare next AIO request */
-    bmap_entry = le32_to_cpu(s->bmap[block_index]);
-    if (!VDI_IS_ALLOCATED(bmap_entry)) {
-        /* Block not allocated, return zeros, no need to wait. */
-        memset(buf, 0, n_sectors * SECTOR_SIZE);
-        ret = 0;
-    } else {
-        uint64_t offset = s->header.offset_data / SECTOR_SIZE +
-                          (uint64_t)bmap_entry * s->block_sectors +
-                          sector_in_block;
-        ret = bdrv_read(bs->file, offset, buf, n_sectors);
-    }
-    logout("%u sectors read\n", n_sectors);
+        logout("will read %u sectors starting at sector %" PRIu64 "\n",
+               n_sectors, sector_num);
 
-    nb_sectors -= n_sectors;
-    sector_num += n_sectors;
-    buf += n_sectors * SECTOR_SIZE;
+        /* prepare next AIO request */
+        bmap_entry = le32_to_cpu(s->bmap[block_index]);
+        if (!VDI_IS_ALLOCATED(bmap_entry)) {
+            /* Block not allocated, return zeros, no need to wait. */
+            memset(buf, 0, n_sectors * SECTOR_SIZE);
+            ret = 0;
+        } else {
+            uint64_t offset = s->header.offset_data / SECTOR_SIZE +
+                              (uint64_t)bmap_entry * s->block_sectors +
+                              sector_in_block;
+            ret = bdrv_read(bs->file, offset, buf, n_sectors);
+        }
+        logout("%u sectors read\n", n_sectors);
 
-    if (ret >= 0 && nb_sectors > 0) {
-        goto restart;
+        nb_sectors -= n_sectors;
+        sector_num += n_sectors;
+        buf += n_sectors * SECTOR_SIZE;
     }
 
     return ret;
@@ -525,57 +522,55 @@ static int vdi_co_write(BlockDriverState *bs,
     uint32_t bmap_first = VDI_UNALLOCATED;
     uint32_t bmap_last = VDI_UNALLOCATED;
     uint8_t *block = NULL;
-    int ret;
+    int ret = 0;
 
     logout("\n");
 
-restart:
-    block_index = sector_num / s->block_sectors;
-    sector_in_block = sector_num % s->block_sectors;
-    n_sectors = s->block_sectors - sector_in_block;
-    if (n_sectors > nb_sectors) {
-        n_sectors = nb_sectors;
-    }
-
-    logout("will write %u sectors starting at sector %" PRIu64 "\n",
-           n_sectors, sector_num);
+    while (ret >= 0 && nb_sectors > 0) {
+        block_index = sector_num / s->block_sectors;
+        sector_in_block = sector_num % s->block_sectors;
+        n_sectors = s->block_sectors - sector_in_block;
+        if (n_sectors > nb_sectors) {
+            n_sectors = nb_sectors;
+        }
 
-    /* prepare next AIO request */
-    bmap_entry = le32_to_cpu(s->bmap[block_index]);
-    if (!VDI_IS_ALLOCATED(bmap_entry)) {
-        /* Allocate new block and write to it. */
-        uint64_t offset;
-        bmap_entry = s->header.blocks_allocated;
-        s->bmap[block_index] = cpu_to_le32(bmap_entry);
-        s->header.blocks_allocated++;
-        offset = s->header.offset_data / SECTOR_SIZE +
-                 (uint64_t)bmap_entry * s->block_sectors;
-        if (block == NULL) {
-            block = g_malloc(s->block_size);
-            bmap_first = block_index;
+        logout("will write %u sectors starting at sector %" PRIu64 "\n",
+               n_sectors, sector_num);
+
+        /* prepare next AIO request */
+        bmap_entry = le32_to_cpu(s->bmap[block_index]);
+        if (!VDI_IS_ALLOCATED(bmap_entry)) {
+            /* Allocate new block and write to it. */
+            uint64_t offset;
+            bmap_entry = s->header.blocks_allocated;
+            s->bmap[block_index] = cpu_to_le32(bmap_entry);
+            s->header.blocks_allocated++;
+            offset = s->header.offset_data / SECTOR_SIZE +
+                     (uint64_t)bmap_entry * s->block_sectors;
+            if (block == NULL) {
+                block = g_malloc(s->block_size);
+                bmap_first = block_index;
+            }
+            bmap_last = block_index;
+            /* Copy data to be written to new block and zero unused parts. */
+            memset(block, 0, sector_in_block * SECTOR_SIZE);
+            memcpy(block + sector_in_block * SECTOR_SIZE,
+                   buf, n_sectors * SECTOR_SIZE);
+            memset(block + (sector_in_block + n_sectors) * SECTOR_SIZE, 0,
+                   (s->block_sectors - n_sectors - sector_in_block) * SECTOR_SIZE);
+            ret = bdrv_write(bs->file, offset, block, s->block_sectors);
+        } else {
+            uint64_t offset = s->header.offset_data / SECTOR_SIZE +
+                              (uint64_t)bmap_entry * s->block_sectors +
+                              sector_in_block;
+            ret = bdrv_write(bs->file, offset, buf, n_sectors);
         }
-        bmap_last = block_index;
-        /* Copy data to be written to new block and zero unused parts. */
-        memset(block, 0, sector_in_block * SECTOR_SIZE);
-        memcpy(block + sector_in_block * SECTOR_SIZE,
-               buf, n_sectors * SECTOR_SIZE);
-        memset(block + (sector_in_block + n_sectors) * SECTOR_SIZE, 0,
-               (s->block_sectors - n_sectors - sector_in_block) * SECTOR_SIZE);
-        ret = bdrv_write(bs->file, offset, block, s->block_sectors);
-    } else {
-        uint64_t offset = s->header.offset_data / SECTOR_SIZE +
-                          (uint64_t)bmap_entry * s->block_sectors +
-                          sector_in_block;
-        ret = bdrv_write(bs->file, offset, buf, n_sectors);
-    }
 
-    nb_sectors -= n_sectors;
-    sector_num += n_sectors;
-    buf += n_sectors * SECTOR_SIZE;
+        nb_sectors -= n_sectors;
+        sector_num += n_sectors;
+        buf += n_sectors * SECTOR_SIZE;
 
-    logout("%u sectors written\n", n_sectors);
-    if (ret >= 0 && nb_sectors > 0) {
-        goto restart;
+        logout("%u sectors written\n", n_sectors);
     }
 
     logout("finished data write\n");
-- 
cgit v1.2.3


From 43cf8ae69ba8510e45d7bd42dd67bc8ae13c48ec Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 27 Mar 2012 13:42:23 +1100
Subject: Use DMADirection type for dma_bdrv_io
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently dma_bdrv_io() takes a 'to_dev' boolean parameter to
determine the direction of DMA it is emulating.  We already have a
DMADirection enum designed specifically to encode DMA directions.
This patch uses it for dma_bdrv_io() as well.  This involves removing
the DMADirection definition from the #ifdef it was inside, but since that
only existed to protect the definition of dma_addr_t from places where
config.h is not included, there wasn't any reason for it to be there in
the first place.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Reviewed-by: Andreas Färber <afaerber@suse.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 dma-helpers.c  | 20 ++++++++++++--------
 dma.h          | 12 ++++++------
 hw/ide/core.c  |  3 ++-
 hw/ide/macio.c |  3 ++-
 4 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/dma-helpers.c b/dma-helpers.c
index 7fcc86dca9..7971a89c14 100644
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -41,7 +41,7 @@ typedef struct {
     BlockDriverAIOCB *acb;
     QEMUSGList *sg;
     uint64_t sector_num;
-    bool to_dev;
+    DMADirection dir;
     bool in_cancel;
     int sg_cur_index;
     dma_addr_t sg_cur_byte;
@@ -75,7 +75,8 @@ static void dma_bdrv_unmap(DMAAIOCB *dbs)
 
     for (i = 0; i < dbs->iov.niov; ++i) {
         cpu_physical_memory_unmap(dbs->iov.iov[i].iov_base,
-                                  dbs->iov.iov[i].iov_len, !dbs->to_dev,
+                                  dbs->iov.iov[i].iov_len,
+                                  dbs->dir != DMA_DIRECTION_TO_DEVICE,
                                   dbs->iov.iov[i].iov_len);
     }
     qemu_iovec_reset(&dbs->iov);
@@ -122,7 +123,8 @@ static void dma_bdrv_cb(void *opaque, int ret)
     while (dbs->sg_cur_index < dbs->sg->nsg) {
         cur_addr = dbs->sg->sg[dbs->sg_cur_index].base + dbs->sg_cur_byte;
         cur_len = dbs->sg->sg[dbs->sg_cur_index].len - dbs->sg_cur_byte;
-        mem = cpu_physical_memory_map(cur_addr, &cur_len, !dbs->to_dev);
+        mem = cpu_physical_memory_map(cur_addr, &cur_len,
+                                      dbs->dir != DMA_DIRECTION_TO_DEVICE);
         if (!mem)
             break;
         qemu_iovec_add(&dbs->iov, mem, cur_len);
@@ -169,11 +171,11 @@ static AIOPool dma_aio_pool = {
 BlockDriverAIOCB *dma_bdrv_io(
     BlockDriverState *bs, QEMUSGList *sg, uint64_t sector_num,
     DMAIOFunc *io_func, BlockDriverCompletionFunc *cb,
-    void *opaque, bool to_dev)
+    void *opaque, DMADirection dir)
 {
     DMAAIOCB *dbs = qemu_aio_get(&dma_aio_pool, bs, cb, opaque);
 
-    trace_dma_bdrv_io(dbs, bs, sector_num, to_dev);
+    trace_dma_bdrv_io(dbs, bs, sector_num, (dir == DMA_DIRECTION_TO_DEVICE));
 
     dbs->acb = NULL;
     dbs->bs = bs;
@@ -181,7 +183,7 @@ BlockDriverAIOCB *dma_bdrv_io(
     dbs->sector_num = sector_num;
     dbs->sg_cur_index = 0;
     dbs->sg_cur_byte = 0;
-    dbs->to_dev = to_dev;
+    dbs->dir = dir;
     dbs->io_func = io_func;
     dbs->bh = NULL;
     qemu_iovec_init(&dbs->iov, sg->nsg);
@@ -194,14 +196,16 @@ BlockDriverAIOCB *dma_bdrv_read(BlockDriverState *bs,
                                 QEMUSGList *sg, uint64_t sector,
                                 void (*cb)(void *opaque, int ret), void *opaque)
 {
-    return dma_bdrv_io(bs, sg, sector, bdrv_aio_readv, cb, opaque, false);
+    return dma_bdrv_io(bs, sg, sector, bdrv_aio_readv, cb, opaque,
+                       DMA_DIRECTION_FROM_DEVICE);
 }
 
 BlockDriverAIOCB *dma_bdrv_write(BlockDriverState *bs,
                                  QEMUSGList *sg, uint64_t sector,
                                  void (*cb)(void *opaque, int ret), void *opaque)
 {
-    return dma_bdrv_io(bs, sg, sector, bdrv_aio_writev, cb, opaque, true);
+    return dma_bdrv_io(bs, sg, sector, bdrv_aio_writev, cb, opaque,
+                       DMA_DIRECTION_TO_DEVICE);
 }
 
 
diff --git a/dma.h b/dma.h
index 20e86d28ee..05ac325e27 100644
--- a/dma.h
+++ b/dma.h
@@ -17,6 +17,11 @@
 
 typedef struct ScatterGatherEntry ScatterGatherEntry;
 
+typedef enum {
+    DMA_DIRECTION_TO_DEVICE = 0,
+    DMA_DIRECTION_FROM_DEVICE = 1,
+} DMADirection;
+
 struct QEMUSGList {
     ScatterGatherEntry *sg;
     int nsg;
@@ -29,11 +34,6 @@ typedef target_phys_addr_t dma_addr_t;
 
 #define DMA_ADDR_FMT TARGET_FMT_plx
 
-typedef enum {
-    DMA_DIRECTION_TO_DEVICE = 0,
-    DMA_DIRECTION_FROM_DEVICE = 1,
-} DMADirection;
-
 struct ScatterGatherEntry {
     dma_addr_t base;
     dma_addr_t len;
@@ -51,7 +51,7 @@ typedef BlockDriverAIOCB *DMAIOFunc(BlockDriverState *bs, int64_t sector_num,
 BlockDriverAIOCB *dma_bdrv_io(BlockDriverState *bs,
                               QEMUSGList *sg, uint64_t sector_num,
                               DMAIOFunc *io_func, BlockDriverCompletionFunc *cb,
-                              void *opaque, bool to_dev);
+                              void *opaque, DMADirection dir);
 BlockDriverAIOCB *dma_bdrv_read(BlockDriverState *bs,
                                 QEMUSGList *sg, uint64_t sector,
                                 BlockDriverCompletionFunc *cb, void *opaque);
diff --git a/hw/ide/core.c b/hw/ide/core.c
index 6e25338615..35723fd800 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -618,7 +618,8 @@ void ide_dma_cb(void *opaque, int ret)
         break;
     case IDE_DMA_TRIM:
         s->bus->dma->aiocb = dma_bdrv_io(s->bs, &s->sg, sector_num,
-                                         ide_issue_trim, ide_dma_cb, s, true);
+                                         ide_issue_trim, ide_dma_cb, s,
+                                         DMA_DIRECTION_TO_DEVICE);
         break;
     }
     return;
diff --git a/hw/ide/macio.c b/hw/ide/macio.c
index a4df24406a..7b38d9e683 100644
--- a/hw/ide/macio.c
+++ b/hw/ide/macio.c
@@ -149,7 +149,8 @@ static void pmac_ide_transfer_cb(void *opaque, int ret)
         break;
     case IDE_DMA_TRIM:
         m->aiocb = dma_bdrv_io(s->bs, &s->sg, sector_num,
-                               ide_issue_trim, pmac_ide_transfer_cb, s, true);
+                               ide_issue_trim, pmac_ide_transfer_cb, s,
+                               DMA_DIRECTION_TO_DEVICE);
         break;
     }
     return;
-- 
cgit v1.2.3


From 498e386c58497f621a37301ea6e8ca886f9f2e07 Mon Sep 17 00:00:00 2001
From: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
Date: Mon, 2 Apr 2012 18:59:34 +0800
Subject: block: disable I/O throttling on sync api

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/block.c b/block.c
index 8858be096b..0344673bd7 100644
--- a/block.c
+++ b/block.c
@@ -1463,6 +1463,17 @@ static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
 
     qemu_iovec_init_external(&qiov, &iov, 1);
 
+    /**
+     * In sync call context, when the vcpu is blocked, this throttling timer
+     * will not fire; so the I/O throttling function has to be disabled here
+     * if it has been enabled.
+     */
+    if (bs->io_limits_enabled) {
+        fprintf(stderr, "Disabling I/O throttling on '%s' due "
+                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
+        bdrv_io_limits_disable(bs);
+    }
+
     if (qemu_in_coroutine()) {
         /* Fast-path if already in coroutine context */
         bdrv_rw_co_entry(&rwco);
@@ -1969,10 +1980,19 @@ static int guess_disk_lchs(BlockDriverState *bs,
     struct partition *p;
     uint32_t nr_sects;
     uint64_t nb_sectors;
+    bool enabled;
 
     bdrv_get_geometry(bs, &nb_sectors);
 
+    /**
+     * The function will be invoked during startup not only in sync I/O mode,
+     * but also in async I/O mode. So the I/O throttling function has to
+     * be disabled temporarily here, not permanently.
+     */
+    enabled = bs->io_limits_enabled;
+    bs->io_limits_enabled = false;
     ret = bdrv_read(bs, 0, buf, 1);
+    bs->io_limits_enabled = enabled;
     if (ret < 0)
         return -1;
     /* test msdos magic */
-- 
cgit v1.2.3


From 12bde0eed6b740787bca2c998a838b20c556d0ec Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 30 Mar 2012 13:17:10 +0200
Subject: block: cancel jobs when a device is ready to go away

We do not want jobs to keep a device busy for a possibly very long
time, and management could become confused because they thought a
device was not even there anymore.  So, cancel long-running jobs
as soon as their device is going to disappear.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockdev.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/blockdev.c b/blockdev.c
index f5e7dba90c..4d17486e31 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -64,6 +64,9 @@ void blockdev_mark_auto_del(BlockDriverState *bs)
 {
     DriveInfo *dinfo = drive_get_by_blockdev(bs);
 
+    if (bs->job) {
+        block_job_cancel(bs->job);
+    }
     if (dinfo) {
         dinfo->auto_del = 1;
     }
-- 
cgit v1.2.3


From 3e914655f268f627ef004a8f1ea0355311b5aca6 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 30 Mar 2012 13:17:11 +0200
Subject: block: fix streaming/closing race

Streaming can issue I/O while qcow2_close is running.  This causes the
L2 caches to become very confused or, alternatively, could cause a
segfault when the streaming coroutine is reentered after closing its
block device.  The fix is to cancel streaming jobs when closing their
underlying device.

The cancellation must be synchronous, on the other hand qemu_aio_wait
will not restart a coroutine that is sleeping in co_sleep.  So add
a flag saying whether streaming has in-flight I/O.  If the busy flag
is false, the coroutine is quiescent and, when cancelled, will not
issue any new I/O.

This protects streaming against closing, but not against deleting.
We have a reference count protecting us against concurrent deletion,
but I still added an assertion to ensure nothing bad happens.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c        | 16 ++++++++++++++++
 block/stream.c |  6 ++++--
 block_int.h    |  2 ++
 3 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/block.c b/block.c
index 0344673bd7..16e14fa7d5 100644
--- a/block.c
+++ b/block.c
@@ -813,6 +813,9 @@ unlink_and_fail:
 void bdrv_close(BlockDriverState *bs)
 {
     if (bs->drv) {
+        if (bs->job) {
+            block_job_cancel_sync(bs->job);
+        }
         if (bs == bs_snapshots) {
             bs_snapshots = NULL;
         }
@@ -966,6 +969,8 @@ void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
 void bdrv_delete(BlockDriverState *bs)
 {
     assert(!bs->dev);
+    assert(!bs->job);
+    assert(!bs->in_use);
 
     /* remove from list, if necessary */
     bdrv_make_anon(bs);
@@ -4095,3 +4100,14 @@ bool block_job_is_cancelled(BlockJob *job)
 {
     return job->cancelled;
 }
+
+void block_job_cancel_sync(BlockJob *job)
+{
+    BlockDriverState *bs = job->bs;
+
+    assert(bs->job == job);
+    block_job_cancel(job);
+    while (bs->job != NULL && bs->job->busy) {
+        qemu_aio_wait();
+    }
+}
diff --git a/block/stream.c b/block/stream.c
index d1b3986a8a..f186bfd4f5 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -175,7 +175,7 @@ retry:
             break;
         }
 
-
+        s->common.busy = true;
         if (base) {
             ret = is_allocated_base(bs, base, sector_num,
                                     STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n);
@@ -189,6 +189,7 @@ retry:
             if (s->common.speed) {
                 uint64_t delay_ns = ratelimit_calculate_delay(&s->limit, n);
                 if (delay_ns > 0) {
+                    s->common.busy = false;
                     co_sleep_ns(rt_clock, delay_ns);
 
                     /* Recheck cancellation and that sectors are unallocated */
@@ -208,6 +209,7 @@ retry:
         /* Note that even when no rate limit is applied we need to yield
          * with no pending I/O here so that qemu_aio_flush() returns.
          */
+        s->common.busy = false;
         co_sleep_ns(rt_clock, 0);
     }
 
@@ -215,7 +217,7 @@ retry:
         bdrv_disable_copy_on_read(bs);
     }
 
-    if (sector_num == end && ret == 0) {
+    if (!block_job_is_cancelled(&s->common) && sector_num == end && ret == 0) {
         const char *base_id = NULL;
         if (base) {
             base_id = s->backing_file_id;
diff --git a/block_int.h b/block_int.h
index 22c86a53fe..a96aabdd87 100644
--- a/block_int.h
+++ b/block_int.h
@@ -83,6 +83,7 @@ struct BlockJob {
     const BlockJobType *job_type;
     BlockDriverState *bs;
     bool cancelled;
+    bool busy;
 
     /* These fields are published by the query-block-jobs QMP API */
     int64_t offset;
@@ -311,6 +312,7 @@ void block_job_complete(BlockJob *job, int ret);
 int block_job_set_speed(BlockJob *job, int64_t value);
 void block_job_cancel(BlockJob *job);
 bool block_job_is_cancelled(BlockJob *job);
+void block_job_cancel_sync(BlockJob *job);
 
 int stream_start(BlockDriverState *bs, BlockDriverState *base,
                  const char *base_id, BlockDriverCompletionFunc *cb,
-- 
cgit v1.2.3


From 9f25eccc1cdbe6ee985b7a5954fa621c2012912e Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 30 Mar 2012 13:17:12 +0200
Subject: block: set job->speed in block_set_speed

There is no need to do this in every implementation of set_speed
(even though there is only one right now).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c        | 8 +++++++-
 block/stream.c | 1 -
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/block.c b/block.c
index 16e14fa7d5..33630ebacc 100644
--- a/block.c
+++ b/block.c
@@ -4085,10 +4085,16 @@ void block_job_complete(BlockJob *job, int ret)
 
 int block_job_set_speed(BlockJob *job, int64_t value)
 {
+    int rc;
+
     if (!job->job_type->set_speed) {
         return -ENOTSUP;
     }
-    return job->job_type->set_speed(job, value);
+    rc = job->job_type->set_speed(job, value);
+    if (rc == 0) {
+        job->speed = value;
+    }
+    return rc;
 }
 
 void block_job_cancel(BlockJob *job)
diff --git a/block/stream.c b/block/stream.c
index f186bfd4f5..61ff7a236e 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -236,7 +236,6 @@ static int stream_set_speed(BlockJob *job, int64_t value)
     if (value < 0) {
         return -EINVAL;
     }
-    job->speed = value;
     ratelimit_set_speed(&s->limit, value / BDRV_SECTOR_SIZE);
     return 0;
 }
-- 
cgit v1.2.3


From dc534f8fc045a51a8eb678ab3d330fba1e9e631c Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 30 Mar 2012 13:17:13 +0200
Subject: block: document job API

I am not sure that these are really proper GtkDoc, but they follow
the existing documentation in block_int.h.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block_int.h | 115 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 112 insertions(+), 3 deletions(-)

diff --git a/block_int.h b/block_int.h
index a96aabdd87..0e5a032e77 100644
--- a/block_int.h
+++ b/block_int.h
@@ -63,8 +63,13 @@ typedef struct BlockIOBaseValue {
     uint64_t ios[2];
 } BlockIOBaseValue;
 
-typedef void BlockJobCancelFunc(void *opaque);
 typedef struct BlockJob BlockJob;
+
+/**
+ * BlockJobType:
+ *
+ * A class type for block job objects.
+ */
 typedef struct BlockJobType {
     /** Derived BlockJob struct size */
     size_t instance_size;
@@ -77,20 +82,48 @@ typedef struct BlockJobType {
 } BlockJobType;
 
 /**
- * Long-running operation on a BlockDriverState
+ * BlockJob:
+ *
+ * Long-running operation on a BlockDriverState.
  */
 struct BlockJob {
+    /** The job type, including the job vtable.  */
     const BlockJobType *job_type;
+
+    /** The block device on which the job is operating.  */
     BlockDriverState *bs;
+
+    /**
+     * Set to true if the job should cancel itself.  The flag must
+     * always be tested just before toggling the busy flag from false
+     * to true.  After a job has detected that the cancelled flag is
+     * true, it should not anymore issue any I/O operation to the
+     * block device.
+     */
     bool cancelled;
+
+    /**
+     * Set to false by the job while it is in a quiescent state, where
+     * no I/O is pending and cancellation can be processed without
+     * issuing new I/O.  The busy flag must be set to false when the
+     * job goes to sleep on any condition that is not detected by
+     * #qemu_aio_wait, such as a timer.
+     */
     bool busy;
 
-    /* These fields are published by the query-block-jobs QMP API */
+    /** Offset that is published by the query-block-jobs QMP API */
     int64_t offset;
+
+    /** Length that is published by the query-block-jobs QMP API */
     int64_t len;
+
+    /** Speed that was set with @block_job_set_speed.  */
     int64_t speed;
 
+    /** The completion function that will be called when the job completes.  */
     BlockDriverCompletionFunc *cb;
+
+    /** The opaque value that is passed to the completion function.  */
     void *opaque;
 };
 
@@ -306,14 +339,90 @@ void bdrv_set_io_limits(BlockDriverState *bs,
 int is_windows_drive(const char *filename);
 #endif
 
+/**
+ * block_job_create:
+ * @job_type: The class object for the newly-created job.
+ * @bs: The block
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ *
+ * Create a new long-running block device job and return it.  The job
+ * will call @cb asynchronously when the job completes.  Note that
+ * @bs may have been closed at the time the @cb it is called.  If
+ * this is the case, the job may be reported as either cancelled or
+ * completed.
+ *
+ * This function is not part of the public job interface; it should be
+ * called from a wrapper that is specific to the job type.
+ */
 void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
                        BlockDriverCompletionFunc *cb, void *opaque);
+
+/**
+ * block_job_complete:
+ * @job: The job being completed.
+ * @ret: The status code.
+ *
+ * Call the completion function that was registered at creation time, and
+ * free @job.
+ */
 void block_job_complete(BlockJob *job, int ret);
+
+/**
+ * block_job_set_speed:
+ * @job: The job to set the speed for.
+ * @speed: The new value
+ *
+ * Set a rate-limiting parameter for the job; the actual meaning may
+ * vary depending on the job type.
+ */
 int block_job_set_speed(BlockJob *job, int64_t value);
+
+/**
+ * block_job_cancel:
+ * @job: The job to be canceled.
+ *
+ * Asynchronously cancel the specified job.
+ */
 void block_job_cancel(BlockJob *job);
+
+/**
+ * block_job_is_cancelled:
+ * @job: The job being queried.
+ *
+ * Returns whether the job is scheduled for cancellation.
+ */
 bool block_job_is_cancelled(BlockJob *job);
+
+/**
+ * block_job_cancel:
+ * @job: The job to be canceled.
+ *
+ * Asynchronously cancel the job and wait for it to reach a quiescent
+ * state.  Note that the completion callback will still be called
+ * asynchronously, hence it is *not* valid to call #bdrv_delete
+ * immediately after #block_job_cancel_sync.  Users of block jobs
+ * will usually protect the BlockDriverState objects with a reference
+ * count, should this be a concern.
+ */
 void block_job_cancel_sync(BlockJob *job);
 
+/**
+ * stream_start:
+ * @bs: Block device to operate on.
+ * @base: Block device that will become the new base, or %NULL to
+ * flatten the whole backing file chain onto @bs.
+ * @base_id: The file name that will be written to @bs as the new
+ * backing file if the job completes.  Ignored if @base is %NULL.
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ *
+ * Start a streaming operation on @bs.  Clusters that are unallocated
+ * in @bs, but allocated in any image between @base and @bs (both
+ * exclusive) will be written to @bs.  At the end of a successful
+ * streaming job, the backing file of @bs will be changed to
+ * @base_id in the written image and to @base in the live BlockDriverState.
+ */
 int stream_start(BlockDriverState *bs, BlockDriverState *base,
                  const char *base_id, BlockDriverCompletionFunc *cb,
                  void *opaque);
-- 
cgit v1.2.3


From f8111c241afa75544032dcfa23df0699c91f9866 Mon Sep 17 00:00:00 2001
From: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
Date: Thu, 15 Mar 2012 20:13:31 +0800
Subject: qemu-img: add image fragmentation statistics

Discussion can be found at:
http://patchwork.ozlabs.org/patch/128730/

This patch add image fragmentation statistics while using qemu-img check.

Signed-off-by: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.h    | 7 +++++++
 qemu-img.c | 9 ++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/block.h b/block.h
index c51ab163d9..ea12f5df30 100644
--- a/block.h
+++ b/block.h
@@ -17,6 +17,12 @@ typedef struct BlockDriverInfo {
     int64_t vm_state_offset;
 } BlockDriverInfo;
 
+typedef struct BlockFragInfo {
+    uint64_t allocated_clusters;
+    uint64_t total_clusters;
+    uint64_t fragmented_clusters;
+} BlockFragInfo;
+
 typedef struct QEMUSnapshotInfo {
     char id_str[128]; /* unique snapshot id */
     /* the following fields are informative. They are not needed for
@@ -175,6 +181,7 @@ typedef struct BdrvCheckResult {
     int corruptions;
     int leaks;
     int check_errors;
+    BlockFragInfo bfi;
 } BdrvCheckResult;
 
 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res);
diff --git a/qemu-img.c b/qemu-img.c
index 0e48b35296..4de48bac58 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -428,6 +428,13 @@ static int img_check(int argc, char **argv)
         }
     }
 
+    if (result.bfi.total_clusters != 0 && result.bfi.allocated_clusters != 0) {
+        printf("%" PRId64 "/%" PRId64 "= %0.2f%% allocated, %0.2f%% fragmented\n",
+        result.bfi.allocated_clusters, result.bfi.total_clusters,
+        result.bfi.allocated_clusters * 100.0 / result.bfi.total_clusters,
+        result.bfi.fragmented_clusters * 100.0 / result.bfi.allocated_clusters);
+    }
+
     bdrv_delete(bs);
 
     if (ret < 0 || result.check_errors) {
@@ -716,7 +723,7 @@ static int img_convert(int argc, char **argv)
         ret = -1;
         goto out;
     }
-        
+
     qemu_progress_init(progress, 2.0);
     qemu_progress_print(0, 100);
 
-- 
cgit v1.2.3


From 11c9c615c8e776e59ad0c6c6ed00ec7d38b5d865 Mon Sep 17 00:00:00 2001
From: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
Date: Thu, 15 Mar 2012 20:13:32 +0800
Subject: qed: image fragmentation statistics

Signed-off-by: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qed-check.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/block/qed-check.c b/block/qed-check.c
index e4a49ce72c..94327ff5b3 100644
--- a/block/qed-check.c
+++ b/block/qed-check.c
@@ -68,6 +68,7 @@ static unsigned int qed_check_l2_table(QEDCheck *check, QEDTable *table)
 {
     BDRVQEDState *s = check->s;
     unsigned int i, num_invalid = 0;
+    uint64_t last_offset = 0;
 
     for (i = 0; i < s->table_nelems; i++) {
         uint64_t offset = table->offsets[i];
@@ -76,6 +77,11 @@ static unsigned int qed_check_l2_table(QEDCheck *check, QEDTable *table)
             qed_offset_is_zero_cluster(offset)) {
             continue;
         }
+        check->result->bfi.allocated_clusters++;
+        if (last_offset && (last_offset + s->header.cluster_size != offset)) {
+            check->result->bfi.fragmented_clusters++;
+        }
+        last_offset = offset;
 
         /* Detect invalid cluster offset */
         if (!qed_check_cluster_offset(s, offset)) {
@@ -200,6 +206,9 @@ int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix)
     check.used_clusters = g_malloc0(((check.nclusters + 31) / 32) *
                                        sizeof(check.used_clusters[0]));
 
+    check.result->bfi.total_clusters =
+        (s->header.image_size + s->header.cluster_size - 1) /
+            s->header.cluster_size;
     ret = qed_check_l1_table(&check, s->l1_table);
     if (ret == 0) {
         /* Only check for leaks if entire image was scanned successfully */
-- 
cgit v1.2.3


From 64c79160b456c2d6aabc63e40b32e392a9ea3c90 Mon Sep 17 00:00:00 2001
From: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
Date: Thu, 15 Mar 2012 20:13:33 +0800
Subject: qemu-img: add dirty flag status

Some block drivers can verify their image files are clean or not. So we can show
it while using "qemu-img info".

Signed-off-by: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.h    | 1 +
 qemu-img.c | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/block.h b/block.h
index ea12f5df30..4e99e1816e 100644
--- a/block.h
+++ b/block.h
@@ -15,6 +15,7 @@ typedef struct BlockDriverInfo {
     int cluster_size;
     /* offset at which the VM state can be saved (0 if not possible) */
     int64_t vm_state_offset;
+    bool is_dirty;
 } BlockDriverInfo;
 
 typedef struct BlockFragInfo {
diff --git a/qemu-img.c b/qemu-img.c
index 4de48bac58..6a61ca8d06 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1132,6 +1132,9 @@ static int img_info(int argc, char **argv)
         if (bdi.cluster_size != 0) {
             printf("cluster_size: %d\n", bdi.cluster_size);
         }
+        if (bdi.is_dirty) {
+            printf("cleanly shut down: no\n");
+        }
     }
     bdrv_get_backing_filename(bs, backing_filename, sizeof(backing_filename));
     if (backing_filename[0] != '\0') {
-- 
cgit v1.2.3


From d68dbee80e3964d9bb0ab29ad64675da148c3cac Mon Sep 17 00:00:00 2001
From: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
Date: Thu, 15 Mar 2012 20:13:34 +0800
Subject: qed: track dirty flag status

Signed-off-by: Dong Xu Wang <wdongxu@linux.vnet.ibm.com>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qed.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/qed.c b/block/qed.c
index d6c1580866..19d87f3bef 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -1434,6 +1434,7 @@ static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 
     memset(bdi, 0, sizeof(*bdi));
     bdi->cluster_size = s->header.cluster_size;
+    bdi->is_dirty = s->header.features & QED_F_NEED_CHECK;
     return 0;
 }
 
-- 
cgit v1.2.3


From f6801b83d0e77ca025867800d805ee80f6bda938 Mon Sep 17 00:00:00 2001
From: Jeff Cody <jcody@redhat.com>
Date: Tue, 27 Mar 2012 16:30:19 -0400
Subject: block: bdrv_append() fixes

A few fixups for bdrv_append():

The new bs (bs_new) passed into bdrv_append() should be anonymous.  Rather
than call bdrv_make_anon() to enforce this, use an assert to catch when a caller
is passing in a bs_new that is not anonymous.

Also, the new top layer should have its backing_format reflect the original
top's format.

And last, after the swap of bs contents, the device_name will have been copied
down. This needs to be cleared to reflect the anonymity of the bs that was
pushed down.

Signed-off-by: Jeff Cody <jcody@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/block.c b/block.c
index 33630ebacc..b3117ef3f1 100644
--- a/block.c
+++ b/block.c
@@ -892,14 +892,16 @@ void bdrv_make_anon(BlockDriverState *bs)
  * This will modify the BlockDriverState fields, and swap contents
  * between bs_new and bs_top. Both bs_new and bs_top are modified.
  *
+ * bs_new is required to be anonymous.
+ *
  * This function does not create any image files.
  */
 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
 {
     BlockDriverState tmp;
 
-    /* the new bs must not be in bdrv_states */
-    bdrv_make_anon(bs_new);
+    /* bs_new must be anonymous */
+    assert(bs_new->device_name[0] == '\0');
 
     tmp = *bs_new;
 
@@ -944,11 +946,18 @@ void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
      * swapping bs_new and bs_top contents. */
     tmp.backing_hd = bs_new;
     pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);
+    bdrv_get_format(bs_top, tmp.backing_format, sizeof(tmp.backing_format));
 
     /* swap contents of the fixed new bs and the current top */
     *bs_new = *bs_top;
     *bs_top = tmp;
 
+    /* device_name[] was carried over from the old bs_top.  bs_new
+     * shouldn't be in bdrv_states, so we need to make device_name[]
+     * reflect the anonymity of bs_new
+     */
+    bs_new->device_name[0] = '\0';
+
     /* clear the copied fields in the new backing file */
     bdrv_detach_dev(bs_new, bs_new->dev);
 
-- 
cgit v1.2.3


From 47622c44d0d9a717c1a8f7f5fec6c25ff3b30eec Mon Sep 17 00:00:00 2001
From: Liu Yuan <tailai.ly@taobao.com>
Date: Wed, 4 Apr 2012 04:03:58 +0800
Subject: sheepdog: implement SD_OP_FLUSH_VDI operation

Flush operation is supposed to flush the write-back cache of
sheepdog cluster.

By issuing flush operation, we can assure the Guest of data
reaching the sheepdog cluster storage.

Cc: Kevin Wolf <kwolf@redhat.com>
Cc: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Signed-off-by: Liu Yuan <tailai.ly@taobao.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/sheepdog.c | 142 +++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 128 insertions(+), 14 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 00276f6f46..1248534984 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -32,9 +32,11 @@
 #define SD_OP_RELEASE_VDI    0x13
 #define SD_OP_GET_VDI_INFO   0x14
 #define SD_OP_READ_VDIS      0x15
+#define SD_OP_FLUSH_VDI      0x16
 
 #define SD_FLAG_CMD_WRITE    0x01
 #define SD_FLAG_CMD_COW      0x02
+#define SD_FLAG_CMD_CACHE    0x04
 
 #define SD_RES_SUCCESS       0x00 /* Success */
 #define SD_RES_UNKNOWN       0x01 /* Unknown error */
@@ -293,10 +295,12 @@ typedef struct BDRVSheepdogState {
 
     char name[SD_MAX_VDI_LEN];
     int is_snapshot;
+    uint8_t cache_enabled;
 
     char *addr;
     char *port;
     int fd;
+    int flush_fd;
 
     CoMutex lock;
     Coroutine *co_send;
@@ -516,6 +520,23 @@ static int send_req(int sockfd, SheepdogReq *hdr, void *data,
     return ret;
 }
 
+static int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
+                       unsigned int *wlen)
+{
+    int ret;
+
+    ret = qemu_co_send(sockfd, hdr, sizeof(*hdr));
+    if (ret < sizeof(*hdr)) {
+        error_report("failed to send a req, %s", strerror(errno));
+    }
+
+    ret = qemu_co_send(sockfd, data, *wlen);
+    if (ret < *wlen) {
+        error_report("failed to send a req, %s", strerror(errno));
+    }
+
+    return ret;
+}
 static int do_req(int sockfd, SheepdogReq *hdr, void *data,
                   unsigned int *wlen, unsigned int *rlen)
 {
@@ -550,6 +571,40 @@ out:
     return ret;
 }
 
+static int do_co_req(int sockfd, SheepdogReq *hdr, void *data,
+                     unsigned int *wlen, unsigned int *rlen)
+{
+    int ret;
+
+    socket_set_block(sockfd);
+    ret = send_co_req(sockfd, hdr, data, wlen);
+    if (ret < 0) {
+        goto out;
+    }
+
+    ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
+    if (ret < sizeof(*hdr)) {
+        error_report("failed to get a rsp, %s", strerror(errno));
+        goto out;
+    }
+
+    if (*rlen > hdr->data_length) {
+        *rlen = hdr->data_length;
+    }
+
+    if (*rlen) {
+        ret = qemu_co_recv(sockfd, data, *rlen);
+        if (ret < *rlen) {
+            error_report("failed to get the data, %s", strerror(errno));
+            goto out;
+        }
+    }
+    ret = 0;
+out:
+    socket_set_nonblock(sockfd);
+    return ret;
+}
+
 static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
                            struct iovec *iov, int niov, int create,
                            enum AIOCBState aiocb_type);
@@ -900,6 +955,10 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
         hdr.flags = SD_FLAG_CMD_WRITE | flags;
     }
 
+    if (s->cache_enabled) {
+        hdr.flags |= SD_FLAG_CMD_CACHE;
+    }
+
     hdr.oid = oid;
     hdr.cow_oid = old_oid;
     hdr.copies = s->inode.nr_copies;
@@ -942,7 +1001,7 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
 
 static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
                              unsigned int datalen, uint64_t offset,
-                             int write, int create)
+                             int write, int create, uint8_t cache)
 {
     SheepdogObjReq hdr;
     SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
@@ -965,6 +1024,11 @@ static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
         rlen = datalen;
         hdr.opcode = SD_OP_READ_OBJ;
     }
+
+    if (cache) {
+        hdr.flags |= SD_FLAG_CMD_CACHE;
+    }
+
     hdr.oid = oid;
     hdr.data_length = datalen;
     hdr.offset = offset;
@@ -986,15 +1050,18 @@ static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
 }
 
 static int read_object(int fd, char *buf, uint64_t oid, int copies,
-                       unsigned int datalen, uint64_t offset)
+                       unsigned int datalen, uint64_t offset, uint8_t cache)
 {
-    return read_write_object(fd, buf, oid, copies, datalen, offset, 0, 0);
+    return read_write_object(fd, buf, oid, copies, datalen, offset, 0, 0,
+                             cache);
 }
 
 static int write_object(int fd, char *buf, uint64_t oid, int copies,
-                        unsigned int datalen, uint64_t offset, int create)
+                        unsigned int datalen, uint64_t offset, int create,
+                        uint8_t cache)
 {
-    return read_write_object(fd, buf, oid, copies, datalen, offset, 1, create);
+    return read_write_object(fd, buf, oid, copies, datalen, offset, 1, create,
+                             cache);
 }
 
 static int sd_open(BlockDriverState *bs, const char *filename, int flags)
@@ -1026,6 +1093,15 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
         goto out;
     }
 
+    if (flags & BDRV_O_CACHE_WB) {
+        s->cache_enabled = 1;
+        s->flush_fd = connect_to_sdog(s->addr, s->port);
+        if (s->flush_fd < 0) {
+            error_report("failed to connect");
+            goto out;
+        }
+    }
+
     if (snapid) {
         dprintf("%" PRIx32 " snapshot inode was open.\n", vid);
         s->is_snapshot = 1;
@@ -1038,7 +1114,8 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
     }
 
     buf = g_malloc(SD_INODE_SIZE);
-    ret = read_object(fd, buf, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE, 0);
+    ret = read_object(fd, buf, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE, 0,
+                      s->cache_enabled);
 
     closesocket(fd);
 
@@ -1272,6 +1349,9 @@ static void sd_close(BlockDriverState *bs)
 
     qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL, NULL);
     closesocket(s->fd);
+    if (s->cache_enabled) {
+        closesocket(s->flush_fd);
+    }
     g_free(s->addr);
 }
 
@@ -1305,7 +1385,7 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset)
     datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
     s->inode.vdi_size = offset;
     ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
-                       s->inode.nr_copies, datalen, 0, 0);
+                       s->inode.nr_copies, datalen, 0, 0, s->cache_enabled);
     close(fd);
 
     if (ret < 0) {
@@ -1387,7 +1467,7 @@ static int sd_create_branch(BDRVSheepdogState *s)
     }
 
     ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
-                      SD_INODE_SIZE, 0);
+                      SD_INODE_SIZE, 0, s->cache_enabled);
 
     closesocket(fd);
 
@@ -1575,6 +1655,36 @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
     return acb->ret;
 }
 
+static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
+{
+    BDRVSheepdogState *s = bs->opaque;
+    SheepdogObjReq hdr = { 0 };
+    SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
+    SheepdogInode *inode = &s->inode;
+    int ret;
+    unsigned int wlen = 0, rlen = 0;
+
+    if (!s->cache_enabled) {
+        return 0;
+    }
+
+    hdr.opcode = SD_OP_FLUSH_VDI;
+    hdr.oid = vid_to_vdi_oid(inode->vdi_id);
+
+    ret = do_co_req(s->flush_fd, (SheepdogReq *)&hdr, NULL, &wlen, &rlen);
+    if (ret) {
+        error_report("failed to send a request to the sheep");
+        return ret;
+    }
+
+    if (rsp->result != SD_RES_SUCCESS) {
+        error_report("%s", sd_strerror(rsp->result));
+        return -EIO;
+    }
+
+    return 0;
+}
+
 static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
 {
     BDRVSheepdogState *s = bs->opaque;
@@ -1610,7 +1720,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
     }
 
     ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
-                       s->inode.nr_copies, datalen, 0, 0);
+                       s->inode.nr_copies, datalen, 0, 0, s->cache_enabled);
     if (ret < 0) {
         error_report("failed to write snapshot's inode.");
         ret = -EIO;
@@ -1629,7 +1739,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
     inode = (SheepdogInode *)g_malloc(datalen);
 
     ret = read_object(fd, (char *)inode, vid_to_vdi_oid(new_vid),
-                      s->inode.nr_copies, datalen, 0);
+                      s->inode.nr_copies, datalen, 0, s->cache_enabled);
 
     if (ret < 0) {
         error_report("failed to read new inode info. %s", strerror(errno));
@@ -1684,7 +1794,7 @@ static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
 
     buf = g_malloc(SD_INODE_SIZE);
     ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
-                      SD_INODE_SIZE, 0);
+                      SD_INODE_SIZE, 0, s->cache_enabled);
 
     closesocket(fd);
 
@@ -1779,7 +1889,8 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
 
         /* we don't need to read entire object */
         ret = read_object(fd, (char *)&inode, vid_to_vdi_oid(vid),
-                          0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0);
+                          0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0,
+                          s->cache_enabled);
 
         if (ret) {
             continue;
@@ -1835,10 +1946,12 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
         create = (offset == 0);
         if (load) {
             ret = read_object(fd, (char *)data, vmstate_oid,
-                              s->inode.nr_copies, data_len, offset);
+                              s->inode.nr_copies, data_len, offset,
+                              s->cache_enabled);
         } else {
             ret = write_object(fd, (char *)data, vmstate_oid,
-                               s->inode.nr_copies, data_len, offset, create);
+                               s->inode.nr_copies, data_len, offset, create,
+                               s->cache_enabled);
         }
 
         if (ret < 0) {
@@ -1904,6 +2017,7 @@ BlockDriver bdrv_sheepdog = {
 
     .bdrv_co_readv  = sd_co_readv,
     .bdrv_co_writev = sd_co_writev,
+    .bdrv_co_flush_to_disk  = sd_co_flush_to_disk,
 
     .bdrv_snapshot_create   = sd_snapshot_create,
     .bdrv_snapshot_goto     = sd_snapshot_goto,
-- 
cgit v1.2.3


From eb09218077a495bc55d84de91f448f72fe78a60b Mon Sep 17 00:00:00 2001
From: Liu Yuan <tailai.ly@taobao.com>
Date: Tue, 3 Apr 2012 18:04:21 +0800
Subject: sheepdog: fix send req helpers

We should return if reading of the header fails.

Cc: Kevin Wolf <kwolf@redhat.com>
Cc: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Signed-off-by: Liu Yuan <tailai.ly@taobao.com>
Acked-by: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/sheepdog.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 1248534984..3eaf625e98 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -510,6 +510,7 @@ static int send_req(int sockfd, SheepdogReq *hdr, void *data,
     ret = qemu_send_full(sockfd, hdr, sizeof(*hdr), 0);
     if (ret < sizeof(*hdr)) {
         error_report("failed to send a req, %s", strerror(errno));
+        return ret;
     }
 
     ret = qemu_send_full(sockfd, data, *wlen, 0);
@@ -528,6 +529,7 @@ static int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
     ret = qemu_co_send(sockfd, hdr, sizeof(*hdr));
     if (ret < sizeof(*hdr)) {
         error_report("failed to send a req, %s", strerror(errno));
+        return ret;
     }
 
     ret = qemu_co_send(sockfd, data, *wlen);
-- 
cgit v1.2.3


From 6e19b3c4e0b40b08ccb550a0c0a65798f3a17ac8 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 15 Feb 2012 16:36:03 +0100
Subject: qemu-iotests: qcow2.py

This adds a tool that is meant to inspect and edit qcow2 files in a
low-level way, that wouldn't be possible with qemu-img/io, for example
by adding yet unknown extensions or flags. This way we can test whether
qemu deals properly with future backwards compatible extensions.

For now, let's start with the image header and header extensions.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/qcow2.py | 207 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 207 insertions(+)
 create mode 100755 tests/qemu-iotests/qcow2.py

diff --git a/tests/qemu-iotests/qcow2.py b/tests/qemu-iotests/qcow2.py
new file mode 100755
index 0000000000..bfb47e88fc
--- /dev/null
+++ b/tests/qemu-iotests/qcow2.py
@@ -0,0 +1,207 @@
+#!/usr/bin/env python
+
+import sys
+import struct
+import string
+
+class QcowHeaderExtension:
+
+    def __init__(self, magic, length, data):
+        self.magic  = magic
+        self.length = length
+        self.data   = data
+
+    @classmethod
+    def create(cls, magic, data):
+        return QcowHeaderExtension(magic, len(data), data)
+
+class QcowHeader:
+
+    uint32_t = 'I'
+    uint64_t = 'Q'
+
+    fields = [
+        # Version 2 header fields
+        [ uint32_t, '%#x',  'magic' ],
+        [ uint32_t, '%d',   'version' ],
+        [ uint64_t, '%#x',  'backing_file_offset' ],
+        [ uint32_t, '%#x',  'backing_file_size' ],
+        [ uint32_t, '%d',   'cluster_bits' ],
+        [ uint64_t, '%d',   'size' ],
+        [ uint32_t, '%d',   'crypt_method' ],
+        [ uint32_t, '%d',   'l1_size' ],
+        [ uint64_t, '%#x',  'l1_table_offset' ],
+        [ uint64_t, '%#x',  'refcount_table_offset' ],
+        [ uint32_t, '%d',   'refcount_table_clusters' ],
+        [ uint32_t, '%d',   'nb_snapshots' ],
+        [ uint64_t, '%#x',  'snapshot_offset' ],
+    ];
+
+    fmt = '>' + ''.join(field[0] for field in fields)
+
+    def __init__(self, fd):
+
+        buf_size = struct.calcsize(QcowHeader.fmt)
+
+        fd.seek(0)
+        buf = fd.read(buf_size)
+
+        header = struct.unpack(QcowHeader.fmt, buf)
+        self.__dict__ = dict((field[2], header[i])
+            for i, field in enumerate(QcowHeader.fields))
+
+        self.cluster_size = 1 << self.cluster_bits
+
+        fd.seek(self.get_header_length())
+        self.load_extensions(fd)
+
+        if self.backing_file_offset:
+            fd.seek(self.backing_file_offset)
+            self.backing_file = fd.read(self.backing_file_size)
+        else:
+            self.backing_file = None
+
+    def get_header_length(self):
+        if self.version == 2:
+            return 72
+        else:
+            raise Exception("version != 2 not supported")
+
+    def load_extensions(self, fd):
+        self.extensions = []
+
+        if self.backing_file_offset != 0:
+            end = min(self.cluster_size, self.backing_file_offset)
+        else:
+            end = self.cluster_size
+
+        while fd.tell() < end:
+            (magic, length) = struct.unpack('>II', fd.read(8))
+            if magic == 0:
+                break
+            else:
+                padded = (length + 7) & ~7
+                data = fd.read(padded)
+                self.extensions.append(QcowHeaderExtension(magic, length, data))
+
+    def update_extensions(self, fd):
+
+        fd.seek(self.get_header_length())
+        extensions = self.extensions
+        extensions.append(QcowHeaderExtension(0, 0, ""))
+        for ex in extensions:
+            buf = struct.pack('>II', ex.magic, ex.length)
+            fd.write(buf)
+            fd.write(ex.data)
+
+        if self.backing_file != None:
+            self.backing_file_offset = fd.tell()
+            fd.write(self.backing_file)
+
+        if fd.tell() > self.cluster_size:
+            raise Exception("I think I just broke the image...")
+
+
+    def update(self, fd):
+        header_bytes = self.get_header_length()
+
+        self.update_extensions(fd)
+
+        fd.seek(0)
+        header = tuple(self.__dict__[f] for t, p, f in QcowHeader.fields)
+        buf = struct.pack(QcowHeader.fmt, *header)
+        buf = buf[0:header_bytes-1]
+        fd.write(buf)
+
+    def dump(self):
+        for f in QcowHeader.fields:
+            print "%-25s" % f[2], f[1] % self.__dict__[f[2]]
+        print ""
+
+    def dump_extensions(self):
+        for ex in self.extensions:
+
+            data = ex.data[:ex.length]
+            if all(c in string.printable for c in data):
+                data = "'%s'" % data
+            else:
+                data = "<binary>"
+
+            print "Header extension:"
+            print "%-25s %#x" % ("magic", ex.magic)
+            print "%-25s %d" % ("length", ex.length)
+            print "%-25s %s" % ("data", data)
+            print ""
+
+
+def cmd_dump_header(fd):
+    h = QcowHeader(fd)
+    h.dump()
+    h.dump_extensions()
+
+def cmd_add_header_ext(fd, magic, data):
+    try:
+        magic = int(magic, 0)
+    except:
+        print "'%s' is not a valid magic number" % magic
+        sys.exit(1)
+
+    h = QcowHeader(fd)
+    h.extensions.append(QcowHeaderExtension.create(magic, data))
+    h.update(fd)
+
+def cmd_del_header_ext(fd, magic):
+    try:
+        magic = int(magic, 0)
+    except:
+        print "'%s' is not a valid magic number" % magic
+        sys.exit(1)
+
+    h = QcowHeader(fd)
+    found = False
+
+    for ex in h.extensions:
+        if ex.magic == magic:
+            found = True
+            h.extensions.remove(ex)
+
+    if not found:
+        print "No such header extension"
+        return
+
+    h.update(fd)
+
+cmds = [
+    [ 'dump-header',    cmd_dump_header,    0, 'Dump image header and header extensions' ],
+    [ 'add-header-ext', cmd_add_header_ext, 2, 'Add a header extension' ],
+    [ 'del-header-ext', cmd_del_header_ext, 1, 'Delete a header extension' ],
+]
+
+def main(filename, cmd, args):
+    fd = open(filename, "r+b")
+    try:
+        for name, handler, num_args, desc in cmds:
+            if name != cmd:
+                continue
+            elif len(args) != num_args:
+                usage()
+                return
+            else:
+                handler(fd, *args)
+                return
+        print "Unknown command '%s'" % cmd
+    finally:
+        fd.close()
+
+def usage():
+    print "Usage: %s <file> <cmd> [<arg>, ...]" % sys.argv[0]
+    print ""
+    print "Supported commands:"
+    for name, handler, num_args, desc in cmds:
+        print "    %-20s - %s" % (name, desc)
+
+if len(sys.argv) < 3:
+    usage()
+    sys.exit(1)
+
+main(sys.argv[1], sys.argv[2], sys.argv[3:])
-- 
cgit v1.2.3


From f394f1feb921d5a4b278e7ac95bdaa49e34a52f2 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Thu, 16 Feb 2012 16:55:01 +0100
Subject: qemu-iotests: Test unknown qcow2 header extensions

The immportant thing here is that header extensions don't get silently
dropped when the header is rewritten, e.g. during a rebase.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/031     | 72 +++++++++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/031.out | 76 ++++++++++++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/group   |  1 +
 3 files changed, 149 insertions(+)
 create mode 100755 tests/qemu-iotests/031
 create mode 100644 tests/qemu-iotests/031.out

diff --git a/tests/qemu-iotests/031 b/tests/qemu-iotests/031
new file mode 100755
index 0000000000..6365f287e0
--- /dev/null
+++ b/tests/qemu-iotests/031
@@ -0,0 +1,72 @@
+#!/bin/bash
+#
+# Test that all qcow2 header extensions survive a header rewrite
+#
+# Copyright (C) 2011 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=kwolf@redhat.com
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1	# failure is the default!
+
+_cleanup()
+{
+	_cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+. ./common.pattern
+
+# This tests qcow2-specific low-level functionality
+_supported_fmt qcow2
+_supported_proto generic
+_supported_os Linux
+
+CLUSTER_SIZE=65536
+echo
+echo === Create image with unknown header extension ===
+echo
+_make_test_img 64M
+./qcow2.py $TEST_IMG add-header-ext 0x12345678 "This is a test header extension"
+./qcow2.py $TEST_IMG dump-header
+_check_test_img
+
+echo
+echo === Rewrite header with no backing file ===
+echo
+$QEMU_IMG rebase -u -b "" $TEST_IMG
+./qcow2.py $TEST_IMG dump-header
+_check_test_img
+
+echo
+echo === Add a backing file and format ===
+echo
+$QEMU_IMG rebase -u -b "/some/backing/file/path" -F host_device $TEST_IMG
+./qcow2.py $TEST_IMG dump-header
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/031.out b/tests/qemu-iotests/031.out
new file mode 100644
index 0000000000..0f1bf685f3
--- /dev/null
+++ b/tests/qemu-iotests/031.out
@@ -0,0 +1,76 @@
+QA output created by 031
+
+=== Create image with unknown header extension ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 cluster_size=65536 
+magic                     0x514649fb
+version                   2
+backing_file_offset       0x0
+backing_file_size         0x0
+cluster_bits              16
+size                      67108864
+crypt_method              0
+l1_size                   1
+l1_table_offset           0x30000
+refcount_table_offset     0x10000
+refcount_table_clusters   1
+nb_snapshots              0
+snapshot_offset           0x0
+
+Header extension:
+magic                     0x12345678
+length                    31
+data                      'This is a test header extension'
+
+No errors were found on the image.
+
+=== Rewrite header with no backing file ===
+
+magic                     0x514649fb
+version                   2
+backing_file_offset       0x0
+backing_file_size         0x0
+cluster_bits              16
+size                      67108864
+crypt_method              0
+l1_size                   1
+l1_table_offset           0x30000
+refcount_table_offset     0x10000
+refcount_table_clusters   1
+nb_snapshots              0
+snapshot_offset           0x0
+
+Header extension:
+magic                     0x12345678
+length                    31
+data                      'This is a test header extension'
+
+No errors were found on the image.
+
+=== Add a backing file and format ===
+
+magic                     0x514649fb
+version                   2
+backing_file_offset       0x90
+backing_file_size         0x17
+cluster_bits              16
+size                      67108864
+crypt_method              0
+l1_size                   1
+l1_table_offset           0x30000
+refcount_table_offset     0x10000
+refcount_table_clusters   1
+nb_snapshots              0
+snapshot_offset           0x0
+
+Header extension:
+magic                     0xe2792aca
+length                    11
+data                      'host_device'
+
+Header extension:
+magic                     0x12345678
+length                    31
+data                      'This is a test header extension'
+
+*** done
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index b549f10f17..1742ede180 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -37,3 +37,4 @@
 028 rw backing auto
 029 rw auto quick
 030 rw auto
+031 rw auto quick
-- 
cgit v1.2.3


From 21af81488799e2290cb2e58eb2a91d1d8044ebc4 Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Wed, 14 Mar 2012 19:57:23 +0100
Subject: qemu-iotests: Fix call syntax for qemu-img

qemu-img requires first options, then file name, then size.

GNU getopt also allows options at the end, but POSIX getopt
doesn't. Try "export POSIXLY_CORRECT=y" to get the POSIX
behaviour with GNU getopt, too.

Cc: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Weil <sw@weilnetz.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/common.rc | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
index 26811ca660..4cb8dae8c6 100644
--- a/tests/qemu-iotests/common.rc
+++ b/tests/qemu-iotests/common.rc
@@ -57,16 +57,21 @@ _make_test_img()
 {
     # extra qemu-img options can be added by tests
     # at least one argument (the image size) needs to be added
-    local extra_img_options=$*
+    local extra_img_options=""
     local cluster_size_filter="s# cluster_size=[0-9]\\+##g"
+    local image_size=$*
 
+    if [ "$1" = "-b" ]; then
+        extra_img_options="$1 $2"
+        image_size=$3
+    fi
     if [ \( "$IMGFMT" = "qcow2" -o "$IMGFMT" = "qed" \) -a -n "$CLUSTER_SIZE" ]; then
         extra_img_options="-o cluster_size=$CLUSTER_SIZE $extra_img_options"
         cluster_size_filter=""
     fi
 
     # XXX(hch): have global image options?
-    $QEMU_IMG create -f $IMGFMT $TEST_IMG $extra_img_options | \
+    $QEMU_IMG create -f $IMGFMT $extra_img_options $TEST_IMG $image_size | \
     	sed -e "s#$IMGPROTO:$TEST_DIR#TEST_DIR#g" | \
     	sed -e "s#$TEST_DIR#TEST_DIR#g" | \
     	sed -e "s#$IMGFMT#IMGFMT#g" | \
-- 
cgit v1.2.3


From 28d3d1658a3692f9c34c3ecce14941faa1d2fe92 Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Wed, 14 Mar 2012 19:57:24 +0100
Subject: qemu-iotests: Fix call syntax for qemu-io

qemu-io requires options first, then fixed parameters.

GNU getopt also allows options at the end, but POSIX getopt
doesn't. Try "export POSIXLY_CORRECT=y" to get the POSIX
behaviour with GNU getopt, too.

Cc: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Weil <sw@weilnetz.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/009 | 4 ++--
 tests/qemu-iotests/010 | 6 +++---
 tests/qemu-iotests/011 | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/qemu-iotests/009 b/tests/qemu-iotests/009
index f7262b57bf..25368c819b 100755
--- a/tests/qemu-iotests/009
+++ b/tests/qemu-iotests/009
@@ -53,10 +53,10 @@ _make_test_img $size
 echo
 echo "creating pattern"
 $QEMU_IO \
-	-c "write 2048k 4k -P 65" \
+	-c "write -P 65 2048k 4k" \
 	-c "write 4k 4k" \
 	-c "write 9M 4k" \
-	-c "read 2044k 8k -P 65 -s 4k -l 4k" \
+	-c "read -P 65 -s 4k -l 4k 2044k 8k" \
 $TEST_IMG | _filter_qemu_io
 
 echo
diff --git a/tests/qemu-iotests/010 b/tests/qemu-iotests/010
index e3205aa206..7b5792934a 100755
--- a/tests/qemu-iotests/010
+++ b/tests/qemu-iotests/010
@@ -53,11 +53,11 @@ _make_test_img $size
 echo
 echo "creating pattern"
 $QEMU_IO \
-	-c "write 2048k 4k -P 165" \
+	-c "write -P 165 2048k 4k" \
 	-c "write 64k 4k" \
 	-c "write 9M 4k" \
-	-c "write 2044k 4k -P 165" \
-	-c "write 8M 4k -P 99" \
+	-c "write -P 165 2044k 4k" \
+	-c "write -P 99 8M 4k" \
 	-c "read -P 165 2044k 8k" \
 $TEST_IMG | _filter_qemu_io
 
diff --git a/tests/qemu-iotests/011 b/tests/qemu-iotests/011
index 59df1ae48e..b03df6887d 100755
--- a/tests/qemu-iotests/011
+++ b/tests/qemu-iotests/011
@@ -60,7 +60,7 @@ for i in `seq 1 10`; do
     # Note that we filter away the actual offset.  That's because qemu
     # may re-order the two aio requests.  We only want to make sure the
     # filesystem isn't corrupted afterwards anyway.
-    $QEMU_IO $TEST_IMG -c "aio_write $off1 1M" -c "aio_write $off2 1M" | \
+    $QEMU_IO -c "aio_write $off1 1M" -c "aio_write $off2 1M" $TEST_IMG | \
     	_filter_qemu_io | \
 	sed -e 's/bytes at offset [0-9]*/bytes at offset XXX/g'
 done
-- 
cgit v1.2.3


From 5a67a1048e1294b35940e7432d57d8cd801e6571 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Mon, 26 Mar 2012 21:22:10 -0300
Subject: block stream: close unused files and update ->backing_hd

Close the now unused images that were part of the previous backing file
chain and adjust ->backing_hd, backing_filename and backing_format
properly.

Fixes https://bugzilla.redhat.com/show_bug.cgi?id=801449

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/stream.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/block/stream.c b/block/stream.c
index 61ff7a236e..0efe1adfd5 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -76,6 +76,39 @@ static int coroutine_fn stream_populate(BlockDriverState *bs,
     return bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, &qiov);
 }
 
+static void close_unused_images(BlockDriverState *top, BlockDriverState *base,
+                                const char *base_id)
+{
+    BlockDriverState *intermediate;
+    intermediate = top->backing_hd;
+
+    while (intermediate) {
+        BlockDriverState *unused;
+
+        /* reached base */
+        if (intermediate == base) {
+            break;
+        }
+
+        unused = intermediate;
+        intermediate = intermediate->backing_hd;
+        unused->backing_hd = NULL;
+        bdrv_delete(unused);
+    }
+    top->backing_hd = base;
+
+    pstrcpy(top->backing_file, sizeof(top->backing_file), "");
+    pstrcpy(top->backing_format, sizeof(top->backing_format), "");
+    if (base_id) {
+        pstrcpy(top->backing_file, sizeof(top->backing_file), base_id);
+        if (base->drv) {
+            pstrcpy(top->backing_format, sizeof(top->backing_format),
+                    base->drv->format_name);
+        }
+    }
+
+}
+
 /*
  * Given an image chain: [BASE] -> [INTER1] -> [INTER2] -> [TOP]
  *
@@ -223,6 +256,7 @@ retry:
             base_id = s->backing_file_id;
         }
         ret = bdrv_change_backing_file(bs, base_id, NULL);
+        close_unused_images(bs, base, base_id);
     }
 
     qemu_vfree(buf);
-- 
cgit v1.2.3


From ccb1f4a7b32fbebcf0d49beab1d2614f0a657d5a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Beno=C3=AEt=20Canet?= <benoit.canet@gmail.com>
Date: Fri, 23 Mar 2012 08:36:48 +0100
Subject: block: Add new BDRV_O_INCOMING flag to notice incoming live migration

From original patch with Patchwork-id: 31110 by
Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>

"Add a flag to indicate that incoming migration is pending and care needs
to be taken for data consistency.  Block drivers should not modify the
image file before incoming migration is complete since the migration
source host is still using the image file."

The rationale for not using bdrv->read_only is the following.

"Unfortunately this is not possible because too many other places in QEMU
test bdrv_is_read_only() and use it for their own evil purposes.  For
example, ide_init_drive() will error out because read-only harddisks are
not supported.  We're mixing guest and host side read-only concepts so
this simpler alternative does not work."

Signed-off-by: Benoit Canet <benoit.canet@gmail.com>
Reviewed-by: Stefan Hajnoczi <stefanha@gmail.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block.h b/block.h
index 4e99e1816e..5151deab48 100644
--- a/block.h
+++ b/block.h
@@ -78,6 +78,7 @@ typedef struct BlockDevOps {
 #define BDRV_O_NO_BACKING  0x0100 /* don't open the backing file */
 #define BDRV_O_NO_FLUSH    0x0200 /* disable flushing on this disk */
 #define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image */
+#define BDRV_O_INCOMING    0x0800  /* consistency hint for incoming migration */
 
 #define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH)
 
-- 
cgit v1.2.3


From 077892696b900ebf88052fef917367213db80477 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Beno=C3=AEt=20Canet?= <benoit.canet@gmail.com>
Date: Fri, 23 Mar 2012 08:36:49 +0100
Subject: block: add a function to clear incoming live migration flags

This function will clear all BDRV_O_INCOMING flags.

Signed-off-by: Benoit Canet <benoit.canet@gmail.com>
Reviewed-by: Stefan Hajnoczi <stefanha@gmail.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 9 +++++++++
 block.h | 2 ++
 2 files changed, 11 insertions(+)

diff --git a/block.c b/block.c
index b3117ef3f1..c0c90f061b 100644
--- a/block.c
+++ b/block.c
@@ -3624,6 +3624,15 @@ void bdrv_invalidate_cache_all(void)
     }
 }
 
+void bdrv_clear_incoming_migration_all(void)
+{
+    BlockDriverState *bs;
+
+    QTAILQ_FOREACH(bs, &bdrv_states, list) {
+        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
+    }
+}
+
 int bdrv_flush(BlockDriverState *bs)
 {
     Coroutine *co;
diff --git a/block.h b/block.h
index 5151deab48..f163e54b5f 100644
--- a/block.h
+++ b/block.h
@@ -229,6 +229,8 @@ BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
 void bdrv_invalidate_cache(BlockDriverState *bs);
 void bdrv_invalidate_cache_all(void);
 
+void bdrv_clear_incoming_migration_all(void);
+
 /* Ensure contents are flushed to disk.  */
 int bdrv_flush(BlockDriverState *bs);
 int coroutine_fn bdrv_co_flush(BlockDriverState *bs);
-- 
cgit v1.2.3


From ed9d4205cecb2f6c06821cb9d8fe9e1ca5f9b8ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Beno=C3=AEt=20Canet?= <benoit.canet@gmail.com>
Date: Fri, 23 Mar 2012 08:36:50 +0100
Subject: blockdev: open images with BDRV_O_INCOMING on incoming live migration

Open images with BDRV_O_INCOMING in order to inform block drivers
that an incoming live migration is coming.

Signed-off-by: Benoit Canet <benoit.canet@gmail.com>
Reviewed-by: Stefan Hajnoczi <stefanha@gmail.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockdev.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/blockdev.c b/blockdev.c
index 4d17486e31..0c2440e249 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -595,6 +595,10 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
         bdrv_flags |= BDRV_O_COPY_ON_READ;
     }
 
+    if (runstate_check(RUN_STATE_INMIGRATE)) {
+        bdrv_flags |= BDRV_O_INCOMING;
+    }
+
     if (media == MEDIA_CDROM) {
         /* CDROM is fine for any interface, don't check.  */
         ro = 1;
-- 
cgit v1.2.3


From c82954e529929c2d650589d8bccaaf19dec33431 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Beno=C3=AEt=20Canet?= <benoit.canet@gmail.com>
Date: Fri, 23 Mar 2012 08:36:51 +0100
Subject: qed: add bdrv_invalidate_cache to be called after incoming live
 migration

The QED image is reopened to flush metadata and check consistency.

Signed-off-by: Benoit Canet <benoit.canet@gmail.com>
Reviewed-by: Stefan Hajnoczi <stefanha@gmail.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qed.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/block/qed.c b/block/qed.c
index 19d87f3bef..a5e9d57082 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -1510,6 +1510,15 @@ static int bdrv_qed_change_backing_file(BlockDriverState *bs,
     return ret;
 }
 
+static void bdrv_qed_invalidate_cache(BlockDriverState *bs)
+{
+    BDRVQEDState *s = bs->opaque;
+
+    bdrv_qed_close(bs);
+    memset(s, 0, sizeof(BDRVQEDState));
+    bdrv_qed_open(bs, bs->open_flags);
+}
+
 static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result)
 {
     BDRVQEDState *s = bs->opaque;
@@ -1561,6 +1570,7 @@ static BlockDriver bdrv_qed = {
     .bdrv_getlength           = bdrv_qed_getlength,
     .bdrv_get_info            = bdrv_qed_get_info,
     .bdrv_change_backing_file = bdrv_qed_change_backing_file,
+    .bdrv_invalidate_cache    = bdrv_qed_invalidate_cache,
     .bdrv_check               = bdrv_qed_check,
 };
 
-- 
cgit v1.2.3


From 901862cbf49639aaef23a575d40ae81282b6fcfc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Beno=C3=AEt=20Canet?= <benoit.canet@gmail.com>
Date: Fri, 23 Mar 2012 08:36:52 +0100
Subject: migration: clear BDRV_O_INCOMING flags on end of incoming live
 migration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Benoît Canet <benoit.canet@gmail.com>
Reviewed-by: Stefan Hajnoczi <stefanha@gmail.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 migration.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/migration.c b/migration.c
index 8c119ba8ff..94f7839e8b 100644
--- a/migration.c
+++ b/migration.c
@@ -91,6 +91,7 @@ void process_incoming_migration(QEMUFile *f)
     qemu_announce_self();
     DPRINTF("successfully loaded vm state\n");
 
+    bdrv_clear_incoming_migration_all();
     /* Make sure all file formats flush their mutable metadata */
     bdrv_invalidate_cache_all();
 
-- 
cgit v1.2.3


From 2d1f3c2360053dec7dacc0292f52cff17104feff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Beno=C3=AEt=20Canet?= <benoit.canet@gmail.com>
Date: Fri, 23 Mar 2012 08:36:53 +0100
Subject: qed: honor BDRV_O_INCOMING for incoming live migration

From original commit with Patchwork-id: 31108 by
Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>

"The QED image format includes a file header bit to mark images dirty.
QED normally checks dirty images on open and fixes inconsistent
metadata.  This is undesirable during live migration since the dirty bit
may be set if the source host is modifying the image file.  The check
should be postponed until migration completes.

Skip operations that modify the image file if the BDRV_O_INCOMING flag
is set."

Signed-off-by: Benoit Canet <benoit.canet@gmail.com>
Reviewed-by: Stefan Hajnoczi <stefanha@gmail.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qed.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index a5e9d57082..aea2772e3d 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -450,7 +450,7 @@ static int bdrv_qed_open(BlockDriverState *bs, int flags)
      * feature is no longer valid.
      */
     if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 &&
-        !bdrv_is_read_only(bs->file)) {
+        !bdrv_is_read_only(bs->file) && !(flags & BDRV_O_INCOMING)) {
         s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK;
 
         ret = qed_write_header_sync(s);
@@ -477,7 +477,8 @@ static int bdrv_qed_open(BlockDriverState *bs, int flags)
          * potentially inconsistent images to be opened read-only.  This can
          * aid data recovery from an otherwise inconsistent image.
          */
-        if (!bdrv_is_read_only(bs->file)) {
+        if (!bdrv_is_read_only(bs->file) &&
+            !(flags & BDRV_O_INCOMING)) {
             BdrvCheckResult result = {0};
 
             ret = qed_check(s, &result, true);
-- 
cgit v1.2.3


From 50d30c267563bf492fd403dd23abc7888f3e220c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Beno=C3=AEt=20Canet?= <benoit.canet@gmail.com>
Date: Fri, 23 Mar 2012 08:36:54 +0100
Subject: qed: remove incoming live migration blocker

Signed-off-by: Benoit Canet <benoit.canet@gmail.com>
Reviewed-by: Stefan Hajnoczi <stefanha@gmail.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qed.c | 9 ---------
 block/qed.h | 2 --
 2 files changed, 11 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index aea2772e3d..366cde7ad8 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -498,12 +498,6 @@ static int bdrv_qed_open(BlockDriverState *bs, int flags)
     s->need_check_timer = qemu_new_timer_ns(vm_clock,
                                             qed_need_check_timer_cb, s);
 
-    error_set(&s->migration_blocker,
-              QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
-              "qed", bs->device_name, "live migration");
-    migrate_add_blocker(s->migration_blocker);
-
-
 out:
     if (ret) {
         qed_free_l2_cache(&s->l2_cache);
@@ -516,9 +510,6 @@ static void bdrv_qed_close(BlockDriverState *bs)
 {
     BDRVQEDState *s = bs->opaque;
 
-    migrate_del_blocker(s->migration_blocker);
-    error_free(s->migration_blocker);
-
     qed_cancel_need_check_timer(s);
     qemu_free_timer(s->need_check_timer);
 
diff --git a/block/qed.h b/block/qed.h
index 62624a1f34..c716772ad7 100644
--- a/block/qed.h
+++ b/block/qed.h
@@ -169,8 +169,6 @@ typedef struct {
 
     /* Periodic flush and clear need check flag */
     QEMUTimer *need_check_timer;
-
-    Error *migration_blocker;
 } BDRVQEDState;
 
 enum {
-- 
cgit v1.2.3