diff options
author | Aurelien Jarno <aurelien@aurel32.net> | 2011-03-21 21:28:19 +0100 |
---|---|---|
committer | Aurelien Jarno <aurelien@aurel32.net> | 2011-03-21 21:28:19 +0100 |
commit | 32465727627711ff3e1cde6777a014413c3cb9ee (patch) | |
tree | 8564302dab9b2939e556ad01e369cf1712485eb7 | |
parent | cc4e8741ccdaa905017f3c7c59e14c685a239c2d (diff) | |
parent | 03feae73056ba3223151c31871860e30630645ac (diff) |
Merge branch 'for-anthony' of git://repo.or.cz/qemu/kevin
* 'for-anthony' of git://repo.or.cz/qemu/kevin:
Add qcow2 documentation
hw/xen_disk: aio_inflight not released in handling ioreq when nr_segments==0
Improve error handling in do_snapshot_blkdev()
Fix ATA SMART and CHECK POWER MODE
Don't allow multiwrites against a block device without underlying medium
tools: Use real async.c instead of stubs
Add error message for loading snapshot without VM state
block/qcow: Don't ignore immediate read/write and other failures
block/vdi: Don't ignore immediate read/write failures
-rw-r--r-- | Makefile.objs | 4 | ||||
-rw-r--r-- | block.c | 8 | ||||
-rw-r--r-- | block/qcow.c | 16 | ||||
-rw-r--r-- | block/vdi.c | 5 | ||||
-rw-r--r-- | blockdev.c | 23 | ||||
-rw-r--r-- | docs/specs/qcow2.txt | 260 | ||||
-rw-r--r-- | hw/ide/core.c | 41 | ||||
-rw-r--r-- | hw/xen_disk.c | 2 | ||||
-rw-r--r-- | qemu-tool.c | 47 | ||||
-rw-r--r-- | savevm.c | 2 |
10 files changed, 339 insertions, 69 deletions
diff --git a/Makefile.objs b/Makefile.objs index a52f42fb72..167ccc2c6c 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -13,7 +13,7 @@ oslib-obj-$(CONFIG_POSIX) += oslib-posix.o ####################################################################### # block-obj-y is code used by both qemu system emulation and qemu-img -block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o module.o +block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o module.o async.o block-obj-y += nbd.o block.o aio.o aes.o qemu-config.o block-obj-$(CONFIG_POSIX) += posix-aio-compat.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o @@ -63,7 +63,7 @@ common-obj-y = $(block-obj-y) blockdev.o common-obj-y += $(net-obj-y) common-obj-y += $(qobject-obj-y) common-obj-$(CONFIG_LINUX) += $(fsdev-obj-$(CONFIG_LINUX)) -common-obj-y += readline.o console.o cursor.o async.o qemu-error.o +common-obj-y += readline.o console.o cursor.o qemu-error.o common-obj-y += $(oslib-obj-y) common-obj-$(CONFIG_WIN32) += os-win32.o common-obj-$(CONFIG_POSIX) += os-posix.o @@ -2398,6 +2398,14 @@ int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) MultiwriteCB *mcb; int i; + /* don't submit writes if we don't have a medium */ + if (bs->drv == NULL) { + for (i = 0; i < num_reqs; i++) { + reqs[i].error = -ENOMEDIUM; + } + return -1; + } + if (num_reqs == 0) { return 0; } diff --git a/block/qcow.c b/block/qcow.c index f67d3d39f2..a26c88620f 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -589,8 +589,10 @@ static void qcow_aio_read_cb(void *opaque, int ret) qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1); acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num, &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb); - if (acb->hd_aiocb == NULL) + if (acb->hd_aiocb == NULL) { + ret = -EIO; goto done; + } } else { /* Note: in this case, no need to wait */ memset(acb->buf, 0, 512 * acb->n); @@ -598,8 +600,10 @@ static void qcow_aio_read_cb(void *opaque, int ret) } } else if 
(acb->cluster_offset & QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? */ - if (decompress_cluster(bs, acb->cluster_offset) < 0) + if (decompress_cluster(bs, acb->cluster_offset) < 0) { + ret = -EIO; goto done; + } memcpy(acb->buf, s->cluster_cache + index_in_cluster * 512, 512 * acb->n); goto redo; @@ -614,8 +618,10 @@ static void qcow_aio_read_cb(void *opaque, int ret) acb->hd_aiocb = bdrv_aio_readv(bs->file, (acb->cluster_offset >> 9) + index_in_cluster, &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb); - if (acb->hd_aiocb == NULL) + if (acb->hd_aiocb == NULL) { + ret = -EIO; goto done; + } } return; @@ -700,8 +706,10 @@ static void qcow_aio_write_cb(void *opaque, int ret) (cluster_offset >> 9) + index_in_cluster, &acb->hd_qiov, acb->n, qcow_aio_write_cb, acb); - if (acb->hd_aiocb == NULL) + if (acb->hd_aiocb == NULL) { + ret = -EIO; goto done; + } return; done: diff --git a/block/vdi.c b/block/vdi.c index 116b25bc9b..90540792d3 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -610,6 +610,7 @@ static void vdi_aio_read_cb(void *opaque, int ret) acb->hd_aiocb = bdrv_aio_readv(bs->file, offset, &acb->hd_qiov, n_sectors, vdi_aio_read_cb, acb); if (acb->hd_aiocb == NULL) { + ret = -EIO; goto done; } } @@ -673,6 +674,7 @@ static void vdi_aio_write_cb(void *opaque, int ret) acb->hd_aiocb = bdrv_aio_writev(bs->file, 0, &acb->hd_qiov, 1, vdi_aio_write_cb, acb); if (acb->hd_aiocb == NULL) { + ret = -EIO; goto done; } return; @@ -702,6 +704,7 @@ static void vdi_aio_write_cb(void *opaque, int ret) acb->hd_aiocb = bdrv_aio_writev(bs->file, offset, &acb->hd_qiov, n_sectors, vdi_aio_write_cb, acb); if (acb->hd_aiocb == NULL) { + ret = -EIO; goto done; } return; @@ -752,6 +755,7 @@ static void vdi_aio_write_cb(void *opaque, int ret) &acb->hd_qiov, s->block_sectors, vdi_aio_write_cb, acb); if (acb->hd_aiocb == NULL) { + ret = -EIO; goto done; } } else { @@ -764,6 +768,7 @@ static void vdi_aio_write_cb(void *opaque, int ret) acb->hd_aiocb = bdrv_aio_writev(bs->file, 
offset, &acb->hd_qiov, n_sectors, vdi_aio_write_cb, acb); if (acb->hd_aiocb == NULL) { + ret = -EIO; goto done; } } diff --git a/blockdev.c b/blockdev.c index 0690cc8bea..ecf2252d83 100644 --- a/blockdev.c +++ b/blockdev.c @@ -574,9 +574,10 @@ int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data) const char *filename = qdict_get_try_str(qdict, "snapshot_file"); const char *format = qdict_get_try_str(qdict, "format"); BlockDriverState *bs; - BlockDriver *drv, *proto_drv; + BlockDriver *drv, *old_drv, *proto_drv; int ret = 0; int flags; + char old_filename[1024]; if (!filename) { qerror_report(QERR_MISSING_PARAMETER, "snapshot_file"); @@ -591,6 +592,11 @@ int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data) goto out; } + pstrcpy(old_filename, sizeof(old_filename), bs->filename); + + old_drv = bs->drv; + flags = bs->open_flags; + if (!format) { format = "qcow2"; } @@ -610,7 +616,7 @@ int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data) } ret = bdrv_img_create(filename, format, bs->filename, - bs->drv->format_name, NULL, -1, bs->open_flags); + bs->drv->format_name, NULL, -1, flags); if (ret) { goto out; } @@ -618,15 +624,20 @@ int do_snapshot_blkdev(Monitor *mon, const QDict *qdict, QObject **ret_data) qemu_aio_flush(); bdrv_flush(bs); - flags = bs->open_flags; bdrv_close(bs); ret = bdrv_open(bs, filename, flags, drv); /* - * If reopening the image file we just created fails, we really - * are in trouble :( + * If reopening the image file we just created fails, fall back + * and try to re-open the original image. If that fails too, we + * are in serious trouble. 
*/ if (ret != 0) { - abort(); + ret = bdrv_open(bs, old_filename, flags, old_drv); + if (ret != 0) { + qerror_report(QERR_OPEN_FILE_FAILED, old_filename); + } else { + qerror_report(QERR_OPEN_FILE_FAILED, filename); + } } out: if (ret) { diff --git a/docs/specs/qcow2.txt b/docs/specs/qcow2.txt new file mode 100644 index 0000000000..8fc3cb2f1a --- /dev/null +++ b/docs/specs/qcow2.txt @@ -0,0 +1,260 @@ +== General == + +A qcow2 image file is organized in units of constant size, which are called +(host) clusters. A cluster is the unit in which all allocations are done, +both for actual guest data and for image metadata. + +Likewise, the virtual disk as seen by the guest is divided into (guest) +clusters of the same size. + +All numbers in qcow2 are stored in Big Endian byte order. + + +== Header == + +The first cluster of a qcow2 image contains the file header: + + Byte 0 - 3: magic + QCOW magic string ("QFI\xfb") + + 4 - 7: version + Version number (only valid value is 2) + + 8 - 15: backing_file_offset + Offset into the image file at which the backing file name + is stored (NB: The string is not null terminated). 0 if the + image doesn't have a backing file. + + 16 - 19: backing_file_size + Length of the backing file name in bytes. Must not be + longer than 1023 bytes. Undefined if the image doesn't have + a backing file. + + 20 - 23: cluster_bits + Number of bits that are used for addressing an offset + within a cluster (1 << cluster_bits is the cluster size). + Must not be less than 9 (i.e. 512 byte clusters). + + Note: qemu as of today has an implementation limit of 2 MB + as the maximum cluster size and won't be able to open images + with larger cluster sizes. + + 24 - 31: size + Virtual disk size in bytes + + 32 - 35: crypt_method + 0 for no encryption + 1 for AES encryption + + 36 - 39: l1_size + Number of entries in the active L1 table + + 40 - 47: l1_table_offset + Offset into the image file at which the active L1 table + starts. 
Must be aligned to a cluster boundary. + + 48 - 55: refcount_table_offset + Offset into the image file at which the refcount table + starts. Must be aligned to a cluster boundary. + + 56 - 59: refcount_table_clusters + Number of clusters that the refcount table occupies + + 60 - 63: nb_snapshots + Number of snapshots contained in the image + + 64 - 71: snapshots_offset + Offset into the image file at which the snapshot table + starts. Must be aligned to a cluster boundary. + +Directly after the image header, optional sections called header extensions can +be stored. Each extension has a structure like the following: + + Byte 0 - 3: Header extension type: + 0x00000000 - End of the header extension area + 0xE2792ACA - Backing file format name + other - Unknown header extension, can be safely + ignored + + 4 - 7: Length of the header extension data + + 8 - n: Header extension data + + n - m: Padding to round up the header extension size to the next + multiple of 8. + +The remaining space between the end of the header extension area and the end of +the first cluster can be used for other data. Usually, the backing file name is +stored there. + + +== Host cluster management == + +qcow2 manages the allocation of host clusters by maintaining a reference count +for each host cluster. A refcount of 0 means that the cluster is free, 1 means +that it is used, and >= 2 means that it is used and any write access must +perform a COW (copy on write) operation. + +The refcounts are managed in a two-level table. The first level is called +refcount table and has a variable size (which is stored in the header). The +refcount table can cover multiple clusters, however it needs to be contiguous +in the image file. + +It contains pointers to the second level structures which are called refcount +blocks and are exactly one cluster in size. 
+ +Given an offset into the image file, the refcount of its cluster can be obtained +as follows: + + refcount_block_entries = (cluster_size / sizeof(uint16_t)) + + refcount_block_index = (offset / cluster_size) % refcount_block_entries + refcount_table_index = (offset / cluster_size) / refcount_block_entries + + refcount_block = load_cluster(refcount_table[refcount_table_index]); + return refcount_block[refcount_block_index]; + +Refcount table entry: + + Bit 0 - 8: Reserved (set to 0) + + 9 - 63: Bits 9-63 of the offset into the image file at which the + refcount block starts. Must be aligned to a cluster + boundary. + + If this is 0, the corresponding refcount block has not yet + been allocated. All refcounts managed by this refcount block + are 0. + +Refcount block entry: + + Bit 0 - 15: Reference count of the cluster + + +== Cluster mapping == + +Just as for refcounts, qcow2 uses a two-level structure for the mapping of +guest clusters to host clusters. They are called L1 and L2 tables. + +The L1 table has a variable size (stored in the header) and may use multiple +clusters, however it must be contiguous in the image file. L2 tables are +exactly one cluster in size. + +Given an offset into the virtual disk, the offset into the image file can be +obtained as follows: + + l2_entries = (cluster_size / sizeof(uint64_t)) + + l2_index = (offset / cluster_size) % l2_entries + l1_index = (offset / cluster_size) / l2_entries + + l2_table = load_cluster(l1_table[l1_index]); + cluster_offset = l2_table[l2_index]; + + return cluster_offset + (offset % cluster_size) + +L1 table entry: + + Bit 0 - 8: Reserved (set to 0) + + 9 - 55: Bits 9-55 of the offset into the image file at which the L2 + table starts. Must be aligned to a cluster boundary. If the + offset is 0, the L2 table and all clusters described by this + L2 table are unallocated. + + 56 - 62: Reserved (set to 0) + + 63: 0 for an L2 table that is unused or requires COW, 1 if its + refcount is exactly one. 
This information is only accurate + in the active L1 table. + +L2 table entry (for normal clusters): + + Bit 0 - 8: Reserved (set to 0) + + 9 - 55: Bits 9-55 of host cluster offset. Must be aligned to a + cluster boundary. If the offset is 0, the cluster is + unallocated. + + 56 - 61: Reserved (set to 0) + + 62: 0 (this cluster is not compressed) + + 63: 0 for a cluster that is unused or requires COW, 1 if its + refcount is exactly one. This information is only accurate + in L2 tables that are reachable from the active L1 + table. + +L2 table entry (for compressed clusters; x = 62 - (cluster_bits - 8)): + + Bit 0 - x: Host cluster offset. This is usually _not_ aligned to a + cluster boundary! + + x+1 - 61: Compressed size of the images in sectors of 512 bytes + + 62: 1 (this cluster is compressed using zlib) + + 63: 0 for a cluster that is unused or requires COW, 1 if its + refcount is exactly one. This information is only accurate + in L2 tables that are reachable from the active L1 + table. + +If a cluster is unallocated, read requests shall read the data from the backing +file. If there is no backing file or the backing file is smaller than the image, +they shall read zeros for all parts that are not covered by the backing file. + + +== Snapshots == + +qcow2 supports internal snapshots. Their basic principle of operation is to +switch the active L1 table, so that a different set of host clusters are +exposed to the guest. + +When creating a snapshot, the L1 table should be copied and the refcount of all +L2 tables and clusters reachable from this L1 table must be increased, so that +a write causes a COW and isn't visible in other snapshots. + +When loading a snapshot, bit 63 of all entries in the new active L1 table and +all L2 tables referenced by it must be reconstructed from the refcount table +as it doesn't need to be accurate in inactive L1 tables. 
+ +A directory of all snapshots is stored in the snapshot table, a contiguous area +in the image file, whose starting offset and length are given by the header +fields snapshots_offset and nb_snapshots. The entries of the snapshot table +have variable length, depending on the length of ID, name and extra data. + +Snapshot table entry: + + Byte 0 - 7: Offset into the image file at which the L1 table for the + snapshot starts. Must be aligned to a cluster boundary. + + 8 - 11: Number of entries in the L1 table of the snapshots + + 12 - 13: Length of the unique ID string describing the snapshot + + 14 - 15: Length of the name of the snapshot + + 16 - 19: Time at which the snapshot was taken in seconds since the + Epoch + + 20 - 23: Subsecond part of the time at which the snapshot was taken + in nanoseconds + + 24 - 31: Time that the guest was running until the snapshot was + taken in nanoseconds + + 32 - 35: Size of the VM state in bytes. 0 if no VM state is saved. + If there is VM state, it starts at the first cluster + described by first L1 table entry that doesn't describe a + regular guest cluster (i.e. VM state is stored like guest + disk content, except that it is stored at offsets that are + larger than the virtual disk presented to the guest) + + 36 - 39: Size of extra data in the table entry (used for future + extensions of the format) + + variable: Extra data for future extensions. Must be ignored. 
+ + variable: Unique ID string for the snapshot (not null terminated) + + variable: Name of the snapshot (not null terminated) diff --git a/hw/ide/core.c b/hw/ide/core.c index 9c91a49767..1ffca56887 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -34,13 +34,26 @@ #include <hw/ide/internal.h> -static const int smart_attributes[][5] = { - /* id, flags, val, wrst, thrsh */ - { 0x01, 0x03, 0x64, 0x64, 0x06}, /* raw read */ - { 0x03, 0x03, 0x64, 0x64, 0x46}, /* spin up */ - { 0x04, 0x02, 0x64, 0x64, 0x14}, /* start stop count */ - { 0x05, 0x03, 0x64, 0x64, 0x36}, /* remapped sectors */ - { 0x00, 0x00, 0x00, 0x00, 0x00} +/* These values were based on a Seagate ST3500418AS but have been modified + to make more sense in QEMU */ +static const int smart_attributes[][12] = { + /* id, flags, hflags, val, wrst, raw (6 bytes), threshold */ + /* raw read error rate*/ + { 0x01, 0x03, 0x00, 0x64, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06}, + /* spin up */ + { 0x03, 0x03, 0x00, 0x64, 0x64, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + /* start stop count */ + { 0x04, 0x02, 0x00, 0x64, 0x64, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x14}, + /* remapped sectors */ + { 0x05, 0x03, 0x00, 0x64, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x24}, + /* power on hours */ + { 0x09, 0x03, 0x00, 0x64, 0x64, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + /* power cycle count */ + { 0x0c, 0x03, 0x00, 0x64, 0x64, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}, + /* airflow-temperature-celsius */ + { 190, 0x03, 0x00, 0x45, 0x45, 0x1f, 0x00, 0x1f, 0x1f, 0x00, 0x00, 0x32}, + /* end of list */ + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} }; /* XXX: DVDs that could fit on a CD will be reported as a CD */ @@ -1843,6 +1856,7 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val) break; case WIN_CHECKPOWERMODE1: case WIN_CHECKPOWERMODE2: + s->error = 0; s->nsector = 0xff; /* device active or idle */ s->status = READY_STAT | SEEK_STAT; ide_set_irq(s->bus); @@ -2097,7 +2111,7 @@ void 
ide_exec_cmd(IDEBus *bus, uint32_t val) if (smart_attributes[n][0] == 0) break; s->io_buffer[2+0+(n*12)] = smart_attributes[n][0]; - s->io_buffer[2+1+(n*12)] = smart_attributes[n][4]; + s->io_buffer[2+1+(n*12)] = smart_attributes[n][11]; } for (n=0; n<511; n++) /* checksum */ s->io_buffer[511] += s->io_buffer[n]; @@ -2110,12 +2124,13 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val) memset(s->io_buffer, 0, 0x200); s->io_buffer[0] = 0x01; /* smart struct version */ for (n=0; n<30; n++) { - if (smart_attributes[n][0] == 0) + if (smart_attributes[n][0] == 0) { break; - s->io_buffer[2+0+(n*12)] = smart_attributes[n][0]; - s->io_buffer[2+1+(n*12)] = smart_attributes[n][1]; - s->io_buffer[2+3+(n*12)] = smart_attributes[n][2]; - s->io_buffer[2+4+(n*12)] = smart_attributes[n][3]; + } + int i; + for(i = 0; i < 11; i++) { + s->io_buffer[2+i+(n*12)] = smart_attributes[n][i]; + } } s->io_buffer[362] = 0x02 | (s->smart_autosave?0x80:0x00); if (s->smart_selftest_count == 0) { diff --git a/hw/xen_disk.c b/hw/xen_disk.c index ed9e5eb4d7..445bf03aa0 100644 --- a/hw/xen_disk.c +++ b/hw/xen_disk.c @@ -408,9 +408,9 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq) break; case BLKIF_OP_WRITE: case BLKIF_OP_WRITE_BARRIER: - ioreq->aio_inflight++; if (!ioreq->req.nr_segments) break; + ioreq->aio_inflight++; bdrv_aio_writev(blkdev->bs, ioreq->start / BLOCK_SIZE, &ioreq->v, ioreq->v.size / BLOCK_SIZE, qemu_aio_complete, ioreq); diff --git a/qemu-tool.c b/qemu-tool.c index 392e1c9505..d45840de28 100644 --- a/qemu-tool.c +++ b/qemu-tool.c @@ -56,53 +56,10 @@ void monitor_print_filename(Monitor *mon, const char *filename) { } -void async_context_push(void) -{ -} - -void async_context_pop(void) -{ -} - -int get_async_context_id(void) -{ - return 0; -} - void monitor_protocol_event(MonitorEvent event, QObject *data) { } -QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque) -{ - QEMUBH *bh; - - bh = qemu_malloc(sizeof(*bh)); - bh->cb = cb; - bh->opaque = opaque; - - return bh; -} - -int 
qemu_bh_poll(void) -{ - return 0; -} - -void qemu_bh_schedule(QEMUBH *bh) -{ - bh->cb(bh->opaque); -} - -void qemu_bh_cancel(QEMUBH *bh) -{ -} - -void qemu_bh_delete(QEMUBH *bh) -{ - qemu_free(bh); -} - int qemu_set_fd_handler2(int fd, IOCanReadHandler *fd_read_poll, IOHandler *fd_read, @@ -111,3 +68,7 @@ int qemu_set_fd_handler2(int fd, { return 0; } + +void qemu_notify_event(void) +{ +} @@ -2021,6 +2021,8 @@ int load_vmstate(const char *name) if (ret < 0) { return ret; } else if (sn.vm_state_size == 0) { + error_report("This is a disk-only snapshot. Revert to it offline " + "using qemu-img."); return -EINVAL; } |