aboutsummaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
authorAnthony Liguori <aliguori@us.ibm.com>2009-05-09 17:14:19 -0500
committerAnthony Liguori <aliguori@us.ibm.com>2009-05-14 16:13:46 -0500
commit019d6b8ff0d495ded6977f24a4e8fd1c7fec09e0 (patch)
treeffaf507f7440b5c7d8ed8a4de193b1df41e4a2d8 /block
parent5efa9d5a8b18841c9c62208a494d7f519238979a (diff)
Move block drivers into their own directory
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
Diffstat (limited to 'block')
-rw-r--r--block/bochs.c259
-rw-r--r--block/cloop.c171
-rw-r--r--block/cow.c275
-rw-r--r--block/dmg.c301
-rw-r--r--block/nbd.c196
-rw-r--r--block/parallels.c181
-rw-r--r--block/qcow.c945
-rw-r--r--block/qcow2.c2931
-rw-r--r--block/raw-posix.c1438
-rw-r--r--block/raw-win32.c394
-rw-r--r--block/vmdk.c833
-rw-r--r--block/vpc.c606
-rw-r--r--block/vvfat.c2855
13 files changed, 11385 insertions, 0 deletions
diff --git a/block/bochs.c b/block/bochs.c
new file mode 100644
index 0000000000..bac81c42b7
--- /dev/null
+++ b/block/bochs.c
@@ -0,0 +1,259 @@
+/*
+ * Block driver for the various disk image formats used by Bochs
+ * Currently only for "growing" type in read-only mode
+ *
+ * Copyright (c) 2005 Alex Beregszaszi
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+/**************************************************************/
+
+#define HEADER_MAGIC "Bochs Virtual HD Image"
+#define HEADER_VERSION 0x00020000
+#define HEADER_V1 0x00010000
+#define HEADER_SIZE 512
+
+#define REDOLOG_TYPE "Redolog"
+#define GROWING_TYPE "Growing"
+
+// not allocated: 0xffffffff
+
+// always little-endian
+/*
+ * On-disk header of a version 1 (HEADER_V1) Bochs image.
+ *
+ * The layout is identical to struct bochs_header except that the
+ * redolog part has no "reserved" word, so the 64-bit disk size directly
+ * follows the three 32-bit fields.  The padding arithmetic below
+ * (64 bytes of strings + 8 + 20) only adds up to HEADER_SIZE when the
+ * compiler inserts no alignment gap before "disk"; the redolog struct is
+ * therefore packed, otherwise "disk" would be read four bytes too late
+ * and the whole struct would be larger than HEADER_SIZE.
+ */
+struct bochs_header_v1 {
+    char magic[32]; // "Bochs Virtual HD Image"
+    char type[16]; // "Redolog"
+    char subtype[16]; // "Undoable" / "Volatile" / "Growing"
+    uint32_t version;
+    uint32_t header; // size of header
+
+    union {
+        struct {
+            uint32_t catalog; // num of entries
+            uint32_t bitmap; // bitmap size
+            uint32_t extent; // extent size
+            uint64_t disk; // disk size
+            char padding[HEADER_SIZE - 64 - 8 - 20];
+        } __attribute__((packed)) redolog;
+        char padding[HEADER_SIZE - 64 - 8];
+    } extra;
+};
+
+// always little-endian
+/*
+ * On-disk header of a HEADER_VERSION Bochs image; bochs_open() reads it
+ * straight from the start of the file.  The three strings are compared
+ * with strcmp(), the integer fields are converted with le32_to_cpu() /
+ * le64_to_cpu() before use.  The padding members bring the struct up to
+ * HEADER_SIZE bytes.
+ */
+struct bochs_header {
+ char magic[32]; // "Bochs Virtual HD Image"
+ char type[16]; // "Redolog"
+ char subtype[16]; // "Undoable" / "Volatile" / "Growing"
+ uint32_t version;
+ uint32_t header; // size of header
+
+ union {
+ struct {
+ uint32_t catalog; // num of entries
+ uint32_t bitmap; // bitmap size
+ uint32_t extent; // extent size
+ uint32_t reserved; // for ???
+ uint64_t disk; // disk size
+ char padding[HEADER_SIZE - 64 - 8 - 24];
+ } redolog;
+ char padding[HEADER_SIZE - 64 - 8];
+ } extra;
+};
+
+/* Per-image state of the Bochs driver, filled in by bochs_open(). */
+typedef struct BDRVBochsState {
+ int fd; // descriptor of the image file
+
+ uint32_t *catalog_bitmap; // extent catalog in host byte order; 0xffffffff = not allocated
+ int catalog_size; // number of catalog entries
+
+ int data_offset; // header size plus catalog size, in bytes
+
+ int bitmap_blocks; // bitmap size in 512-byte blocks, rounded up
+ int extent_blocks; // extent size in 512-byte blocks, rounded up
+ int extent_size; // extent size in bytes
+} BDRVBochsState;
+
+/*
+ * Probe callback: score a buffer holding the beginning of an image.
+ *
+ * Returns 100 when the buffer is at least HEADER_SIZE bytes long and
+ * carries the Bochs magic, the "Redolog" type, the "Growing" subtype and
+ * one of the two supported versions; returns 0 in every other case.
+ * The filename argument is not used.
+ */
+static int bochs_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    const struct bochs_header *hdr = (const void *)buf;
+    uint32_t version;
+
+    if (buf_size < HEADER_SIZE) {
+        return 0;
+    }
+
+    if (strcmp(hdr->magic, HEADER_MAGIC) != 0) {
+        return 0;
+    }
+    if (strcmp(hdr->type, REDOLOG_TYPE) != 0) {
+        return 0;
+    }
+    if (strcmp(hdr->subtype, GROWING_TYPE) != 0) {
+        return 0;
+    }
+
+    version = le32_to_cpu(hdr->version);
+    if (version != HEADER_VERSION && version != HEADER_V1) {
+        return 0;
+    }
+
+    return 100;
+}
+
+/*
+ * Open a Bochs "growing" redolog image and load its extent catalog.
+ *
+ * The file is opened read-write if possible and read-only otherwise, but
+ * the device is always marked read-only.  The header is validated the
+ * same way bochs_probe() does, the disk size is taken from the v1 or v2
+ * layout depending on the version field, and the catalog that follows
+ * the header is read and converted to host byte order.
+ *
+ * Header values come from the image file, so they are range-checked
+ * before being stored in the int-typed state fields: the catalog must
+ * fit (together with the header size) into an int, and the extent size
+ * must be non-zero because it is later used as a divisor.
+ *
+ * Returns 0 on success and -1 on any failure; on failure the descriptor
+ * is closed and the catalog buffer is released.  The flags argument is
+ * not used.
+ */
+static int bochs_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVBochsState *s = bs->opaque;
+    int fd, i;
+    struct bochs_header bochs;
+    struct bochs_header_v1 header_v1;
+    uint32_t header_size, catalog_entries, extent_size;
+    uint64_t catalog_bytes;
+
+    fd = open(filename, O_RDWR | O_BINARY);
+    if (fd < 0) {
+        fd = open(filename, O_RDONLY | O_BINARY);
+        if (fd < 0)
+            return -1;
+    }
+
+    bs->read_only = 1; // no write support yet
+
+    s->fd = fd;
+    s->catalog_bitmap = NULL; /* lets the fail path free unconditionally */
+
+    if (read(fd, &bochs, sizeof(bochs)) != sizeof(bochs)) {
+        goto fail;
+    }
+
+    if (strcmp(bochs.magic, HEADER_MAGIC) ||
+        strcmp(bochs.type, REDOLOG_TYPE) ||
+        strcmp(bochs.subtype, GROWING_TYPE) ||
+        ((le32_to_cpu(bochs.version) != HEADER_VERSION) &&
+         (le32_to_cpu(bochs.version) != HEADER_V1))) {
+        goto fail;
+    }
+
+    if (le32_to_cpu(bochs.version) == HEADER_V1) {
+        memcpy(&header_v1, &bochs, sizeof(bochs));
+        bs->total_sectors = le64_to_cpu(header_v1.extra.redolog.disk) / 512;
+    } else {
+        bs->total_sectors = le64_to_cpu(bochs.extra.redolog.disk) / 512;
+    }
+
+    header_size = le32_to_cpu(bochs.header);
+    catalog_entries = le32_to_cpu(bochs.extra.redolog.catalog);
+    extent_size = le32_to_cpu(bochs.extra.redolog.extent);
+
+    /* extent_size is stored in an int and used as a divisor later on */
+    if (extent_size == 0 || extent_size > 0x7fffffff) {
+        goto fail;
+    }
+
+    /* data_offset (an int) is header size plus catalog size in bytes */
+    catalog_bytes = (uint64_t)catalog_entries * 4;
+    if ((uint64_t)header_size + catalog_bytes > 0x7fffffff) {
+        goto fail;
+    }
+
+    if (lseek(s->fd, header_size, SEEK_SET) < 0) {
+        goto fail;
+    }
+
+    s->catalog_size = catalog_entries;
+    s->catalog_bitmap = qemu_malloc(catalog_bytes);
+    if (read(s->fd, s->catalog_bitmap, catalog_bytes) !=
+        s->catalog_size * 4)
+        goto fail;
+    for (i = 0; i < s->catalog_size; i++)
+        le32_to_cpus(&s->catalog_bitmap[i]);
+
+    s->data_offset = header_size + (s->catalog_size * 4);
+
+    s->bitmap_blocks = 1 + (le32_to_cpu(bochs.extra.redolog.bitmap) - 1) / 512;
+    s->extent_blocks = 1 + (extent_size - 1) / 512;
+
+    s->extent_size = extent_size;
+
+    return 0;
+ fail:
+    qemu_free(s->catalog_bitmap);
+    s->catalog_bitmap = NULL;
+    close(fd);
+    return -1;
+}
+
+/*
+ * Position the image descriptor at the data of the given sector.
+ *
+ * The sector's byte offset selects an extent through the catalog; the
+ * extent starts with a bitmap (one bit per 512-byte block) followed by
+ * the block data.
+ *
+ * Returns 0 when the sector is present and s->fd has been positioned on
+ * it.  Returns -1 when the sector has no backing data: the extent index
+ * lies outside the catalog, the catalog entry is 0xffffffff, or the
+ * block's bit is clear.  -1 is also returned when seeking or reading the
+ * bitmap byte fails, because the return value offers no separate error
+ * code.
+ */
+static inline int seek_to_sector(BlockDriverState *bs, int64_t sector_num)
+{
+    BDRVBochsState *s = bs->opaque;
+    int64_t offset = sector_num * 512;
+    int64_t extent_index, extent_offset, bitmap_offset, block_offset;
+    char bitmap_entry;
+
+    if (s->extent_size <= 0)
+        return -1; // would divide by zero below
+
+    // locate the extent and the block inside it
+    extent_index = offset / s->extent_size;
+    extent_offset = (offset % s->extent_size) / 512;
+
+    // never index outside the catalog read at open time
+    if (extent_index < 0 || extent_index >= s->catalog_size)
+        return -1;
+
+    if (s->catalog_bitmap[extent_index] == 0xffffffff)
+        return -1; // not allocated
+
+    // widen before multiplying: the product does not fit in 32 bits
+    // for extents located beyond 4 GiB
+    bitmap_offset = s->data_offset +
+        (512 * (int64_t)s->catalog_bitmap[extent_index] *
+         (s->extent_blocks + s->bitmap_blocks));
+    block_offset = bitmap_offset + (512 * (s->bitmap_blocks + extent_offset));
+
+    // read in the bitmap byte covering this block
+    if (lseek(s->fd, bitmap_offset + (extent_offset / 8), SEEK_SET) < 0)
+        return -1;
+
+    if (read(s->fd, &bitmap_entry, 1) != 1)
+        return -1;
+
+    if (!((bitmap_entry >> (extent_offset % 8)) & 1))
+        return -1; // not allocated
+
+    if (lseek(s->fd, block_offset, SEEK_SET) < 0)
+        return -1;
+
+    return 0;
+}
+
+/*
+ * Read nb_sectors 512-byte sectors starting at sector_num into buf.
+ *
+ * Each sector is handled on its own: when seek_to_sector() reports it as
+ * present it is read from the image, otherwise the corresponding part of
+ * buf is filled with zeroes.  Returns 0 on success, -1 as soon as a read
+ * does not deliver a full sector.
+ */
+static int bochs_read(BlockDriverState *bs, int64_t sector_num,
+                      uint8_t *buf, int nb_sectors)
+{
+    BDRVBochsState *s = bs->opaque;
+    int remaining;
+
+    for (remaining = nb_sectors; remaining > 0;
+         remaining--, sector_num++, buf += 512) {
+        if (seek_to_sector(bs, sector_num) != 0) {
+            /* no data stored for this sector */
+            memset(buf, 0, 512);
+            continue;
+        }
+
+        if (read(s->fd, buf, 512) != 512) {
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+/* Close callback: release the catalog buffer and the image descriptor. */
+static void bochs_close(BlockDriverState *bs)
+{
+    BDRVBochsState *state = bs->opaque;
+
+    qemu_free(state->catalog_bitmap);
+    close(state->fd);
+}
+
+/* Driver description for the "bochs" format: probe, open, read and close
+ * only; no write callback is provided. */
+static BlockDriver bdrv_bochs = {
+ .format_name = "bochs",
+ .instance_size = sizeof(BDRVBochsState),
+ .bdrv_probe = bochs_probe,
+ .bdrv_open = bochs_open,
+ .bdrv_read = bochs_read,
+ .bdrv_close = bochs_close,
+};
+
+/* Registers the bochs driver; hooked up through block_init() below. */
+static void bdrv_bochs_init(void)
+{
+ bdrv_register(&bdrv_bochs);
+}
+
+block_init(bdrv_bochs_init);
diff --git a/block/cloop.c b/block/cloop.c
new file mode 100644
index 0000000000..06c687e690
--- /dev/null
+++ b/block/cloop.c
@@ -0,0 +1,171 @@
+/*
+ * QEMU Block driver for CLOOP images
+ *
+ * Copyright (c) 2004 Johannes E. Schindelin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+#include <zlib.h>
+
+/* Per-image state of the cloop driver, filled in by cloop_open(). */
+typedef struct BDRVCloopState {
+ int fd; /* descriptor of the image file */
+ uint32_t block_size; /* size of one uncompressed block in bytes */
+ uint32_t n_blocks; /* number of blocks in the image */
+ uint64_t* offsets; /* file offsets of the compressed blocks, host byte order */
+ uint32_t sectors_per_block; /* block_size / 512 */
+ uint32_t current_block; /* block held in uncompressed_block; n_blocks = none */
+ uint8_t *compressed_block; /* scratch buffer for one compressed block */
+ uint8_t *uncompressed_block; /* inflated contents of current_block */
+ z_stream zstream; /* zlib inflate state, reset for every block */
+} BDRVCloopState;
+
+/*
+ * Probe callback: recognise the shell-script preamble of a V2.0 cloop
+ * image.
+ *
+ * When the buffer is shorter than the preamble only the available bytes
+ * are compared.  An empty (or negative-sized) buffer carries no evidence
+ * at all and is rejected, instead of comparing zero bytes and reporting
+ * a match.
+ *
+ * Returns 2 on a match and 0 otherwise.  The filename is not used.
+ */
+static int cloop_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    static const char magic_version_2_0[] = "#!/bin/sh\n"
+        "#V2.0 Format\n"
+        "modprobe cloop file=$0 && mount -r -t iso9660 /dev/cloop $1\n";
+    int length = sizeof(magic_version_2_0) - 1; /* without the NUL */
+
+    if (buf_size <= 0)
+        return 0;
+    if (length > buf_size)
+        length = buf_size;
+    if (!memcmp(magic_version_2_0, buf, length))
+        return 2;
+    return 0;
+}
+
+/*
+ * Open a cloop image read-only and load its block offset table.
+ *
+ * The header at file offset 128 holds the block size and the number of
+ * blocks as big-endian 32-bit values, followed by the table of
+ * big-endian 64-bit block offsets.  The table is read with n_blocks + 1
+ * entries, because the compressed size of block i is computed as
+ * offsets[i + 1] - offsets[i], which needs an end offset for the last
+ * block as well.
+ *
+ * Values from the file are validated before use: the block size must
+ * cover at least one 512-byte sector (it is used as a divisor), the
+ * table size must not overflow, and offsets must not decrease.
+ *
+ * Returns 0 on success, -errno when the file cannot be opened and -1 on
+ * any later failure; in the latter case everything allocated here is
+ * released and the descriptor is closed.  The flags argument is not
+ * used.
+ */
+static int cloop_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVCloopState *s = bs->opaque;
+    uint32_t offsets_size, max_compressed_block_size = 1, i;
+
+    /* lets the fail path free unconditionally */
+    s->offsets = NULL;
+    s->compressed_block = NULL;
+    s->uncompressed_block = NULL;
+
+    s->fd = open(filename, O_RDONLY | O_BINARY);
+    if (s->fd < 0)
+        return -errno;
+    bs->read_only = 1;
+
+    /* read header */
+    if (lseek(s->fd, 128, SEEK_SET) < 0)
+        goto fail;
+    if (read(s->fd, &s->block_size, 4) != 4)
+        goto fail;
+    s->block_size = be32_to_cpu(s->block_size);
+    if (s->block_size < 512)
+        goto fail; /* sectors_per_block would be zero */
+    if (read(s->fd, &s->n_blocks, 4) != 4)
+        goto fail;
+    s->n_blocks = be32_to_cpu(s->n_blocks);
+
+    /* read offsets: one entry per block plus the end of the last block */
+    if (s->n_blocks > 0xffffffffU / sizeof(uint64_t) - 1)
+        goto fail; /* table size would not fit in 32 bits */
+    offsets_size = (s->n_blocks + 1) * sizeof(uint64_t);
+    s->offsets = qemu_malloc(offsets_size);
+    /* "!=" also catches -1, whatever the relative width of the types */
+    if (read(s->fd, s->offsets, offsets_size) != offsets_size)
+        goto fail;
+    for (i = 0; i < s->n_blocks + 1; i++) {
+        s->offsets[i] = be64_to_cpu(s->offsets[i]);
+        if (i > 0) {
+            uint64_t size;
+
+            if (s->offsets[i] < s->offsets[i - 1])
+                goto fail; /* corrupt table */
+            size = s->offsets[i] - s->offsets[i - 1];
+            if (size > 0xfffffffeU)
+                goto fail; /* the "+ 1" below must not wrap */
+            if (size > max_compressed_block_size)
+                max_compressed_block_size = size;
+        }
+    }
+
+    /* initialize zlib engine */
+    s->compressed_block = qemu_malloc(max_compressed_block_size + 1);
+    s->uncompressed_block = qemu_malloc(s->block_size);
+    if (inflateInit(&s->zstream) != Z_OK)
+        goto fail;
+    s->current_block = s->n_blocks; /* no block cached yet */
+
+    s->sectors_per_block = s->block_size / 512;
+    bs->total_sectors = (int64_t)s->n_blocks * s->sectors_per_block;
+    return 0;
+
+fail:
+    qemu_free(s->uncompressed_block);
+    qemu_free(s->compressed_block);
+    qemu_free(s->offsets);
+    s->uncompressed_block = NULL;
+    s->compressed_block = NULL;
+    s->offsets = NULL;
+    close(s->fd);
+    return -1;
+}
+
+/*
+ * Make sure block block_num is available, inflated, in
+ * s->uncompressed_block.
+ *
+ * Nothing is done when the block is already the cached one.  Otherwise
+ * the compressed data (offsets[block_num] up to offsets[block_num + 1])
+ * is read and inflated; the result must be exactly block_size bytes.
+ *
+ * Returns 0 on success and -1 on failure, including a block number
+ * outside 0 .. n_blocks - 1.  While a block is being decoded the cache
+ * is marked empty, so a failed decode cannot leave the previous block
+ * number attached to a clobbered buffer.
+ */
+static inline int cloop_read_block(BDRVCloopState *s,int block_num)
+{
+    /* n_blocks itself is the "nothing cached" marker, not a valid block */
+    if (block_num < 0 || (uint32_t)block_num >= s->n_blocks)
+        return -1;
+
+    if (s->current_block != block_num) {
+        int ret;
+        uint32_t bytes = s->offsets[block_num + 1] - s->offsets[block_num];
+
+        s->current_block = s->n_blocks;
+
+        if (lseek(s->fd, s->offsets[block_num], SEEK_SET) < 0)
+            return -1;
+        ret = read(s->fd, s->compressed_block, bytes);
+        if (ret < 0 || (uint32_t)ret != bytes)
+            return -1;
+
+        s->zstream.next_in = s->compressed_block;
+        s->zstream.avail_in = bytes;
+        s->zstream.next_out = s->uncompressed_block;
+        s->zstream.avail_out = s->block_size;
+        ret = inflateReset(&s->zstream);
+        if (ret != Z_OK)
+            return -1;
+        ret = inflate(&s->zstream, Z_FINISH);
+        if (ret != Z_STREAM_END || s->zstream.total_out != s->block_size)
+            return -1;
+
+        s->current_block = block_num;
+    }
+    return 0;
+}
+
+/*
+ * Read nb_sectors 512-byte sectors starting at sector_num into buf.
+ *
+ * For every sector the containing block is brought into the block cache
+ * with cloop_read_block() and the sector is copied out of it.  Returns 0
+ * on success and -1 when a block cannot be loaded.
+ */
+static int cloop_read(BlockDriverState *bs, int64_t sector_num,
+                      uint8_t *buf, int nb_sectors)
+{
+    BDRVCloopState *s = bs->opaque;
+    int done;
+
+    for (done = 0; done < nb_sectors; done++) {
+        int64_t sector = sector_num + done;
+        uint32_t block_num = sector / s->sectors_per_block;
+        uint32_t sector_offset_in_block = sector % s->sectors_per_block;
+        const uint8_t *src;
+
+        if (cloop_read_block(s, block_num) != 0) {
+            return -1;
+        }
+
+        src = s->uncompressed_block + sector_offset_in_block * 512;
+        memcpy(buf + done * 512, src, 512);
+    }
+
+    return 0;
+}
+
+/*
+ * Close callback: close the image descriptor, release the offset table
+ * (only when the image has at least one block) and both block buffers,
+ * and shut down the zlib stream.
+ */
+static void cloop_close(BlockDriverState *bs)
+{
+    BDRVCloopState *state = bs->opaque;
+
+    close(state->fd);
+
+    if (state->n_blocks > 0) {
+        free(state->offsets);
+    }
+    free(state->compressed_block);
+    free(state->uncompressed_block);
+
+    inflateEnd(&state->zstream);
+}
+
+/* Driver description for the "cloop" format: probe, open, read and close
+ * only; no write callback is provided. */
+static BlockDriver bdrv_cloop = {
+ .format_name = "cloop",
+ .instance_size = sizeof(BDRVCloopState),
+ .bdrv_probe = cloop_probe,
+ .bdrv_open = cloop_open,
+ .bdrv_read = cloop_read,
+ .bdrv_close = cloop_close,
+};
+
+/* Registers the cloop driver; hooked up through block_init() below. */
+static void bdrv_cloop_init(void)
+{
+ bdrv_register(&bdrv_cloop);
+}
+
+block_init(bdrv_cloop_init);
diff --git a/block/cow.c b/block/cow.c
new file mode 100644
index 0000000000..94b3549389
--- /dev/null
+++ b/block/cow.c
@@ -0,0 +1,275 @@
+/*
+ * Block driver for the COW format
+ *
+ * Copyright (c) 2004 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef _WIN32
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+#include <sys/mman.h>
+
+/**************************************************************/
+/* COW block driver using file system holes */
+
+/* user mode linux compatible COW file */
+#define COW_MAGIC 0x4f4f4f4d /* MOOO */
+#define COW_VERSION 2
+
+/* On-disk header of a version 2 COW file.  The integer fields are stored
+ * big-endian: the driver converts them with be32_to_cpu()/be64_to_cpu()
+ * when reading and cpu_to_be32()/cpu_to_be64() when creating. */
+struct cow_header_v2 {
+ uint32_t magic; /* COW_MAGIC */
+ uint32_t version; /* COW_VERSION */
+ char backing_file[1024]; /* name of the backing image */
+ int32_t mtime; /* mtime of the backing image, 0 if unknown */
+ uint64_t size; /* image size in bytes */
+ uint32_t sectorsize; /* cow_create() writes 512 */
+};
+
+/* Per-image state of the COW driver, filled in by cow_open(). */
+typedef struct BDRVCowState {
+ int fd; /* descriptor of the COW file */
+ uint8_t *cow_bitmap; /* if non NULL, COW mappings are used first */
+ uint8_t *cow_bitmap_addr; /* mmap address of cow_bitmap */
+ int cow_bitmap_size; /* length of the mapping: header plus bitmap, in bytes */
+ int64_t cow_sectors_offset; /* file offset of sector 0: cow_bitmap_size rounded up to 512 */
+} BDRVCowState;
+
+/*
+ * Probe callback: returns 100 when the buffer is large enough to hold a
+ * cow_header_v2 and carries the COW magic and version (both stored
+ * big-endian), and 0 otherwise.  The filename is not used.
+ */
+static int cow_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    const struct cow_header_v2 *hdr = (const void *)buf;
+
+    if (buf_size < sizeof(struct cow_header_v2)) {
+        return 0;
+    }
+    if (be32_to_cpu(hdr->magic) != COW_MAGIC) {
+        return 0;
+    }
+    if (be32_to_cpu(hdr->version) != COW_VERSION) {
+        return 0;
+    }
+
+    return 100;
+}
+
+/*
+ * Open a COW image.
+ *
+ * The file is opened read-write when possible and read-only otherwise.
+ * After the header has been validated, the image size and the backing
+ * file name are copied into bs, and the header together with the
+ * allocation bitmap is mapped shared and writable.  Sector data starts
+ * at the first 512-byte boundary after the bitmap.
+ *
+ * Returns 0 on success; on failure the descriptor is closed and -1 is
+ * returned.  The flags argument is not used.
+ */
+static int cow_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVCowState *s = bs->opaque;
+    struct cow_header_v2 hdr;
+    int64_t image_bytes;
+    int fd;
+
+    fd = open(filename, O_RDWR | O_BINARY | O_LARGEFILE);
+    if (fd < 0) {
+        /* fall back to read-only access */
+        fd = open(filename, O_RDONLY | O_BINARY | O_LARGEFILE);
+    }
+    if (fd < 0) {
+        return -1;
+    }
+    s->fd = fd;
+
+    /* see if it is a cow image */
+    if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr)) {
+        goto fail;
+    }
+    if (be32_to_cpu(hdr.magic) != COW_MAGIC) {
+        goto fail;
+    }
+    if (be32_to_cpu(hdr.version) != COW_VERSION) {
+        goto fail;
+    }
+
+    /* cow image found */
+    image_bytes = be64_to_cpu(hdr.size);
+    bs->total_sectors = image_bytes / 512;
+
+    pstrcpy(bs->backing_file, sizeof(bs->backing_file), hdr.backing_file);
+
+    /* mmap the bitmap: one bit per sector, placed right after the header */
+    s->cow_bitmap_size = ((bs->total_sectors + 7) >> 3) + sizeof(hdr);
+    s->cow_bitmap_addr = (void *)mmap(get_mmap_addr(s->cow_bitmap_size),
+                                      s->cow_bitmap_size,
+                                      PROT_READ | PROT_WRITE,
+                                      MAP_SHARED, s->fd, 0);
+    if (s->cow_bitmap_addr == MAP_FAILED) {
+        goto fail;
+    }
+
+    s->cow_bitmap = s->cow_bitmap_addr + sizeof(hdr);
+    s->cow_sectors_offset = (s->cow_bitmap_size + 511) & ~511;
+    return 0;
+
+ fail:
+    close(fd);
+    return -1;
+}
+
+/* Set bit number bitnum in bitmap; bit 0 is the least significant bit of
+ * the first byte. */
+static inline void cow_set_bit(uint8_t *bitmap, int64_t bitnum)
+{
+    uint8_t *byte = &bitmap[bitnum / 8];
+
+    *byte |= (1 << (bitnum % 8));
+}
+
+/* Return 1 when bit number bitnum is set in bitmap and 0 otherwise; bit 0
+ * is the least significant bit of the first byte. */
+static inline int is_bit_set(const uint8_t *bitmap, int64_t bitnum)
+{
+    uint8_t byte = bitmap[bitnum / 8];
+
+    return (byte >> (bitnum % 8)) & 1;
+}
+
+
+/*
+ * Tell whether the first sector of a range lives in the COW file.
+ *
+ * Returns the bitmap state of sector_num (non-zero: the current version
+ * is in the COW file) and stores in *num_same how many consecutive
+ * sectors, starting with sector_num and limited to nb_sectors, share
+ * that state.  Without a bitmap, or for an empty range, the whole range
+ * is reported as unchanged.
+ */
+static inline int is_changed(uint8_t *bitmap,
+                             int64_t sector_num, int nb_sectors,
+                             int *num_same)
+{
+    int changed;
+    int run;
+
+    if (!bitmap || nb_sectors == 0) {
+        *num_same = nb_sectors;
+        return 0;
+    }
+
+    changed = is_bit_set(bitmap, sector_num);
+
+    /* extend the run while the following sectors agree with the first */
+    run = 1;
+    while (run < nb_sectors &&
+           is_bit_set(bitmap, sector_num + run) == changed) {
+        run++;
+    }
+    *num_same = run;
+
+    return changed;
+}
+
+/* is_allocated callback: forwards to is_changed() with the image's COW
+ * bitmap, so the result and *pnum follow is_changed()'s contract. */
+static int cow_is_allocated(BlockDriverState *bs, int64_t sector_num,
+                            int nb_sectors, int *pnum)
+{
+    BDRVCowState *state = bs->opaque;
+    uint8_t *bitmap = state->cow_bitmap;
+
+    return is_changed(bitmap, sector_num, nb_sectors, pnum);
+}
+
+/*
+ * Read nb_sectors 512-byte sectors starting at sector_num into buf.
+ *
+ * The range is processed in runs of sectors with the same bitmap state:
+ * changed runs are read from the COW file, unchanged runs come from the
+ * backing image when there is one and are zero-filled otherwise.
+ *
+ * Returns 0 on success and -1 when seeking or reading the COW file, or
+ * reading the backing image, fails.  The seek is checked so that a
+ * failed positioning cannot make the following read return data from
+ * some other part of the file.
+ */
+static int cow_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVCowState *s = bs->opaque;
+    int ret, n;
+
+    while (nb_sectors > 0) {
+        if (is_changed(s->cow_bitmap, sector_num, nb_sectors, &n)) {
+            if (lseek(s->fd, s->cow_sectors_offset + sector_num * 512,
+                      SEEK_SET) < 0)
+                return -1;
+            ret = read(s->fd, buf, n * 512);
+            if (ret != n * 512)
+                return -1;
+        } else {
+            if (bs->backing_hd) {
+                /* read from the base image */
+                ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
+                if (ret < 0)
+                    return -1;
+            } else {
+                memset(buf, 0, n * 512);
+            }
+        }
+        nb_sectors -= n;
+        sector_num += n;
+        buf += n * 512;
+    }
+    return 0;
+}
+
+/*
+ * Write nb_sectors 512-byte sectors from buf at sector_num.
+ *
+ * The data goes into the COW file at the sector's position after the
+ * bitmap; once it has been written completely, the sectors are marked
+ * as changed in the bitmap.
+ *
+ * Returns 0 on success and -1 when the seek fails or the write is
+ * incomplete.  The seek is checked so that a failed positioning cannot
+ * make the write land at some other offset of the file.
+ */
+static int cow_write(BlockDriverState *bs, int64_t sector_num,
+                     const uint8_t *buf, int nb_sectors)
+{
+    BDRVCowState *s = bs->opaque;
+    int ret, i;
+
+    if (lseek(s->fd, s->cow_sectors_offset + sector_num * 512, SEEK_SET) < 0)
+        return -1;
+    ret = write(s->fd, buf, nb_sectors * 512);
+    if (ret != nb_sectors * 512)
+        return -1;
+    for (i = 0; i < nb_sectors; i++)
+        cow_set_bit(s->cow_bitmap, sector_num + i);
+    return 0;
+}
+
+/* Close callback: drop the header/bitmap mapping created by cow_open()
+ * and close the image descriptor. */
+static void cow_close(BlockDriverState *bs)
+{
+    BDRVCowState *state = bs->opaque;
+    void *mapping = (void *)state->cow_bitmap_addr;
+
+    munmap(mapping, state->cow_bitmap_size);
+    close(state->fd);
+}
+
+/*
+ * Create an empty COW image.
+ *
+ * filename        file to create (truncated if it exists)
+ * image_sectors   virtual size in 512-byte sectors
+ * image_filename  backing image name, or NULL for none
+ * flags           must be 0; anything else yields -ENOTSUP
+ *
+ * The header is written and the file is extended so that it covers the
+ * whole allocation bitmap.  When a backing image is given, its name is
+ * stored in the header together with its mtime; if the backing image
+ * cannot be opened or examined, the mtime stays 0 and creation goes on.
+ * That path must leave the new file's descriptor open, since the header
+ * is still to be written through it.
+ *
+ * Returns 0 on success and -1 when the file cannot be created, the
+ * header cannot be written completely, or the file cannot be resized.
+ */
+static int cow_create(const char *filename, int64_t image_sectors,
+                      const char *image_filename, int flags)
+{
+    int fd, cow_fd;
+    int ret = -1;
+    struct cow_header_v2 cow_header;
+    struct stat st;
+
+    if (flags)
+        return -ENOTSUP;
+
+    cow_fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
+                  0644);
+    if (cow_fd < 0)
+        return -1;
+    memset(&cow_header, 0, sizeof(cow_header));
+    cow_header.magic = cpu_to_be32(COW_MAGIC);
+    cow_header.version = cpu_to_be32(COW_VERSION);
+    if (image_filename) {
+        /* Note: if no file, we put a dummy mtime */
+        cow_header.mtime = cpu_to_be32(0);
+
+        fd = open(image_filename, O_RDONLY | O_BINARY);
+        if (fd >= 0) {
+            if (fstat(fd, &st) == 0)
+                cow_header.mtime = cpu_to_be32(st.st_mtime);
+            close(fd);
+        }
+        pstrcpy(cow_header.backing_file, sizeof(cow_header.backing_file),
+                image_filename);
+    }
+    cow_header.sectorsize = cpu_to_be32(512);
+    cow_header.size = cpu_to_be64(image_sectors * 512);
+    if (write(cow_fd, &cow_header, sizeof(cow_header)) != sizeof(cow_header))
+        goto out;
+    /* resize to include at least all the bitmap */
+    if (ftruncate(cow_fd, sizeof(cow_header) + ((image_sectors + 7) >> 3)) != 0)
+        goto out;
+    ret = 0;
+ out:
+    close(cow_fd);
+    return ret;
+}
+
+/* Flush callback: fsync() the image descriptor; the result of fsync() is
+ * not reported. */
+static void cow_flush(BlockDriverState *bs)
+{
+    BDRVCowState *state = bs->opaque;
+    int image_fd = state->fd;
+
+    fsync(image_fd);
+}
+
+/* Driver description for the "cow" format; unlike the read-only formats
+ * it also provides write, create, flush and is_allocated callbacks. */
+static BlockDriver bdrv_cow = {
+ .format_name = "cow",
+ .instance_size = sizeof(BDRVCowState),
+ .bdrv_probe = cow_probe,
+ .bdrv_open = cow_open,
+ .bdrv_read = cow_read,
+ .bdrv_write = cow_write,
+ .bdrv_close = cow_close,
+ .bdrv_create = cow_create,
+ .bdrv_flush = cow_flush,
+ .bdrv_is_allocated = cow_is_allocated,
+};
+
+/* Registers the cow driver; hooked up through block_init() below. */
+static void bdrv_cow_init(void)
+{
+ bdrv_register(&bdrv_cow);
+}
+
+block_init(bdrv_cow_init);
+#endif
diff --git a/block/dmg.c b/block/dmg.c
new file mode 100644
index 0000000000..262560ffd3
--- /dev/null
+++ b/block/dmg.c
@@ -0,0 +1,301 @@
+/*
+ * QEMU Block driver for DMG images
+ *
+ * Copyright (c) 2004 Johannes E. Schindelin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "bswap.h"
+#include "module.h"
+#include <zlib.h>
+
+/* Per-image state of the DMG driver, filled in by dmg_open(). */
+typedef struct BDRVDMGState {
+ int fd; /* descriptor of the .dmg file */
+
+ /* each chunk contains a certain number of sectors,
+ * offsets[i] is the offset in the .dmg file,
+ * lengths[i] is the length of the compressed chunk,
+ * sectors[i] is the sector beginning at offsets[i],
+ * sectorcounts[i] is the number of sectors in that chunk,
+ * the sectors array is ordered
+ * 0<=i<n_chunks */
+
+ uint32_t n_chunks; /* number of entries in the arrays below */
+ uint32_t* types; /* chunk type: 0x80000005 zlib, 1 copy, 2 zero */
+ uint64_t* offsets;
+ uint64_t* lengths;
+ uint64_t* sectors;
+ uint64_t* sectorcounts;
+ uint32_t current_chunk; /* chunk held in uncompressed_chunk; n_chunks = none */
+ uint8_t *compressed_chunk; /* scratch buffer for one compressed chunk */
+ uint8_t *uncompressed_chunk; /* decoded contents of current_chunk */
+ z_stream zstream; /* zlib inflate state, reset for every chunk */
+} BDRVDMGState;
+
+/*
+ * Probe callback: the decision is based on the file name only.  Returns
+ * 2 when the name is longer than four characters and ends in ".dmg",
+ * and 0 otherwise.  The buffer arguments are not used.
+ */
+static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    int name_len = strlen(filename);
+    const char *suffix;
+
+    if (name_len <= 4) {
+        return 0;
+    }
+
+    suffix = filename + name_len - 4;
+    return strcmp(suffix, ".dmg") == 0 ? 2 : 0;
+}
+
+/* Read a big-endian 64-bit value at the current position of fd and
+ * return it in host byte order; a short or failed read yields 0. */
+static off_t read_off(int fd)
+{
+    uint64_t raw;
+
+    if (read(fd, &raw, 8) >= 8) {
+        return be64_to_cpu(raw);
+    }
+    return 0;
+}
+
+/* Read a big-endian 32-bit value at the current position of fd and
+ * return it in host byte order; a short or failed read yields 0. */
+static off_t read_uint32(int fd)
+{
+    uint32_t raw;
+
+    if (read(fd, &raw, 4) >= 4) {
+        return be32_to_cpu(raw);
+    }
+    return 0;
+}
+
+/*
+ * Open a DMG image read-only and build its chunk tables.
+ *
+ * The location of the info blocks is read from 0x1d8 bytes before the
+ * end of the file.  Every block of type 0x6d697368 that is at least 244
+ * bytes long contributes 40-byte chunk records; only chunks of type
+ * 0x80000005 (zlib), 1 (copy) and 2 (zero) are kept.  A record of type
+ * 0xffffffff makes the following chunks relative to the end of the
+ * previously kept chunk.
+ *
+ * Whenever the file does not look like a DMG image, the descriptor is
+ * closed and the image is opened with the "raw" driver instead; the
+ * return value is then the raw driver's.  Everything allocated here is
+ * released before that hand-over so nothing is leaked.
+ *
+ * Returns 0 on success and -errno when the file cannot be opened.
+ */
+static int dmg_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVDMGState *s = bs->opaque;
+    off_t info_begin,info_end,last_in_offset,last_out_offset;
+    uint32_t count;
+    uint32_t max_compressed_size=1,max_sectors_per_chunk=1,i;
+
+    s->fd = open(filename, O_RDONLY | O_BINARY);
+    if (s->fd < 0)
+        return -errno;
+    bs->read_only = 1;
+    s->n_chunks = 0;
+    /* types is grown with qemu_realloc() like the other arrays, so it
+     * has to start out as NULL too */
+    s->types = NULL;
+    s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL;
+    s->compressed_chunk = s->uncompressed_chunk = NULL;
+
+    /* read offset of info blocks */
+    if(lseek(s->fd,-0x1d8,SEEK_END)<0)
+        goto fail;
+    info_begin=read_off(s->fd);
+    if(info_begin==0)
+        goto fail;
+    if(lseek(s->fd,info_begin,SEEK_SET)<0)
+        goto fail;
+    if(read_uint32(s->fd)!=0x100)
+        goto fail;
+    if((count = read_uint32(s->fd))==0)
+        goto fail;
+    info_end = info_begin+count;
+    if(lseek(s->fd,0xf8,SEEK_CUR)<0)
+        goto fail;
+
+    /* read offsets */
+    last_in_offset = last_out_offset = 0;
+    while(lseek(s->fd,0,SEEK_CUR)<info_end) {
+        uint32_t type;
+
+        count = read_uint32(s->fd);
+        if(count==0)
+            goto fail;
+        type = read_uint32(s->fd);
+        if(type!=0x6d697368 || count<244)
+            lseek(s->fd,count-4,SEEK_CUR);
+        else {
+            int new_size, chunk_count;
+            if(lseek(s->fd,200,SEEK_CUR)<0)
+                goto fail;
+            chunk_count = (count-204)/40;
+            new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count);
+            s->types = qemu_realloc(s->types, new_size/2);
+            s->offsets = qemu_realloc(s->offsets, new_size);
+            s->lengths = qemu_realloc(s->lengths, new_size);
+            s->sectors = qemu_realloc(s->sectors, new_size);
+            s->sectorcounts = qemu_realloc(s->sectorcounts, new_size);
+
+            for(i=s->n_chunks;i<s->n_chunks+chunk_count;i++) {
+                s->types[i] = read_uint32(s->fd);
+                if(s->types[i]!=0x80000005 && s->types[i]!=1 && s->types[i]!=2) {
+                    /* without a previous chunk there is nothing to be
+                     * relative to, and [i-1] would be out of bounds */
+                    if(s->types[i]==0xffffffff && i>0) {
+                        last_in_offset = s->offsets[i-1]+s->lengths[i-1];
+                        last_out_offset = s->sectors[i-1]+s->sectorcounts[i-1];
+                    }
+                    /* drop this record: reuse slot i for the next one */
+                    chunk_count--;
+                    i--;
+                    if(lseek(s->fd,36,SEEK_CUR)<0)
+                        goto fail;
+                    continue;
+                }
+                read_uint32(s->fd);
+                s->sectors[i] = last_out_offset+read_off(s->fd);
+                s->sectorcounts[i] = read_off(s->fd);
+                s->offsets[i] = last_in_offset+read_off(s->fd);
+                s->lengths[i] = read_off(s->fd);
+                if(s->lengths[i]>max_compressed_size)
+                    max_compressed_size = s->lengths[i];
+                if(s->sectorcounts[i]>max_sectors_per_chunk)
+                    max_sectors_per_chunk = s->sectorcounts[i];
+            }
+            s->n_chunks+=chunk_count;
+        }
+    }
+
+    /* initialize zlib engine */
+    s->compressed_chunk = qemu_malloc(max_compressed_size+1);
+    s->uncompressed_chunk = qemu_malloc(512*max_sectors_per_chunk);
+    if(inflateInit(&s->zstream) != Z_OK)
+        goto fail;
+
+    s->current_chunk = s->n_chunks; /* no chunk cached yet */
+
+    return 0;
+
+fail:
+    qemu_free(s->uncompressed_chunk);
+    qemu_free(s->compressed_chunk);
+    qemu_free(s->sectorcounts);
+    qemu_free(s->sectors);
+    qemu_free(s->lengths);
+    qemu_free(s->offsets);
+    qemu_free(s->types);
+    close(s->fd);
+    /* open raw instead */
+    bs->drv=bdrv_find_format("raw");
+    return bs->drv->bdrv_open(bs, filename, flags);
+}
+
+/*
+ * Tell whether sector_num lies inside chunk chunk_num.
+ *
+ * Returns -1 (i.e. "true") when chunk_num is a valid chunk index and the
+ * sector falls into that chunk's sector range, and 0 otherwise.
+ */
+static inline int is_sector_in_chunk(BDRVDMGState* s,
+                                     uint32_t chunk_num,int sector_num)
+{
+    if (chunk_num >= s->n_chunks) {
+        return 0;
+    }
+    /* before the first sector of the chunk */
+    if (s->sectors[chunk_num] > sector_num) {
+        return 0;
+    }
+    /* at or past the end of the chunk */
+    if (s->sectors[chunk_num] + s->sectorcounts[chunk_num] <= sector_num) {
+        return 0;
+    }
+    return -1;
+}
+
+/*
+ * Find the chunk that contains sector_num.
+ *
+ * Binary search over the half-open index range [chunk1, chunk2) of the
+ * sectors array, which is ordered.  When the probed chunk ends at or
+ * before sector_num, the search continues strictly after it; keeping the
+ * probed index in the range would never shrink a range of length one
+ * and loop forever for a sector that no chunk covers.
+ *
+ * Returns the chunk index, or s->n_chunks when no chunk contains the
+ * sector.
+ */
+static inline uint32_t search_chunk(BDRVDMGState* s,int sector_num)
+{
+    /* binary search */
+    uint32_t chunk1=0,chunk2=s->n_chunks,chunk3;
+    while(chunk1!=chunk2) {
+        chunk3 = (chunk1+chunk2)/2;
+        if(s->sectors[chunk3]>sector_num)
+            chunk2 = chunk3;
+        else if(s->sectors[chunk3]+s->sectorcounts[chunk3]>sector_num)
+            return chunk3;
+        else
+            chunk1 = chunk3+1;
+    }
+    return s->n_chunks; /* error */
+}
+
+/*
+ * Read exactly len bytes from fd into dst, retrying after EINTR and
+ * continuing after short reads.  Returns 0 on success and -1 on a read
+ * error or when the file ends early.
+ */
+static int dmg_read_fully(int fd, uint8_t *dst, uint64_t len)
+{
+    uint64_t done = 0;
+
+    while (done < len) {
+        int ret = read(fd, dst + done, len - done);
+
+        if (ret < 0) {
+            if (errno == EINTR)
+                continue;
+            return -1;
+        }
+        if (ret == 0)
+            return -1; /* unexpected end of file */
+        done += ret;
+    }
+    return 0;
+}
+
+/*
+ * Make sure the chunk containing sector_num is available, decoded, in
+ * s->uncompressed_chunk.
+ *
+ * Nothing is done when the sector lies in the cached chunk.  Otherwise
+ * the chunk is looked up and decoded according to its type: zlib chunks
+ * are read completely and inflated, "copy" chunks are read verbatim from
+ * their file offset, "zero" chunks are filled with zeroes.  The cache is
+ * marked empty while decoding, so a failure leaves no stale chunk
+ * number behind.
+ *
+ * Returns 0 on success and -1 when no chunk covers the sector, on I/O or
+ * decompression errors, or when a copy chunk is longer than the sectors
+ * it is supposed to provide.
+ */
+static inline int dmg_read_chunk(BDRVDMGState *s,int sector_num)
+{
+    if(!is_sector_in_chunk(s,s->current_chunk,sector_num)) {
+        int ret;
+        uint32_t chunk = search_chunk(s,sector_num);
+
+        if(chunk>=s->n_chunks)
+            return -1;
+
+        s->current_chunk = s->n_chunks;
+        switch(s->types[chunk]) {
+        case 0x80000005: /* zlib compressed */
+            /* compare the off_t result directly: storing it in an int
+             * would turn large offsets into bogus errors */
+            if(lseek(s->fd, s->offsets[chunk], SEEK_SET)<0)
+                return -1;
+
+            /* we need to buffer, because only the chunk as whole can be
+             * inflated. */
+            if(dmg_read_fully(s->fd, s->compressed_chunk,
+                              s->lengths[chunk]) != 0)
+                return -1;
+
+            s->zstream.next_in = s->compressed_chunk;
+            s->zstream.avail_in = s->lengths[chunk];
+            s->zstream.next_out = s->uncompressed_chunk;
+            s->zstream.avail_out = 512*s->sectorcounts[chunk];
+            ret = inflateReset(&s->zstream);
+            if(ret != Z_OK)
+                return -1;
+            ret = inflate(&s->zstream, Z_FINISH);
+            if(ret != Z_STREAM_END || s->zstream.total_out != 512*s->sectorcounts[chunk])
+                return -1;
+            break;
+        case 1: /* copy */
+            /* the data is stored as is, so it must fit the sectors of
+             * this chunk */
+            if(s->lengths[chunk] > 512*s->sectorcounts[chunk])
+                return -1;
+            /* the file position is wherever the last access left it */
+            if(lseek(s->fd, s->offsets[chunk], SEEK_SET)<0)
+                return -1;
+            if(dmg_read_fully(s->fd, s->uncompressed_chunk,
+                              s->lengths[chunk]) != 0)
+                return -1;
+            break;
+        case 2: /* zero */
+            memset(s->uncompressed_chunk, 0, 512*s->sectorcounts[chunk]);
+            break;
+        default: /* unknown type: nothing sensible to decode */
+            return -1;
+        }
+        s->current_chunk = chunk;
+    }
+    return 0;
+}
+
+/*
+ * Read nb_sectors 512-byte sectors starting at sector_num into buf.
+ *
+ * For every sector the containing chunk is brought into the chunk cache
+ * with dmg_read_chunk() and the sector is copied out of it.  Returns 0
+ * on success and -1 when a chunk cannot be loaded.
+ */
+static int dmg_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVDMGState *s = bs->opaque;
+    int done;
+
+    for (done = 0; done < nb_sectors; done++) {
+        uint32_t sector_offset_in_chunk;
+        const uint8_t *src;
+
+        if (dmg_read_chunk(s, sector_num + done) != 0) {
+            return -1;
+        }
+
+        /* position of the sector inside the chunk that is now cached */
+        sector_offset_in_chunk = sector_num + done - s->sectors[s->current_chunk];
+        src = s->uncompressed_chunk + sector_offset_in_chunk * 512;
+        memcpy(buf + done * 512, src, 512);
+    }
+
+    return 0;
+}
+
+/*
+ * Close callback: close the image descriptor, release the chunk tables
+ * (only when at least one chunk was recorded) and both chunk buffers,
+ * and shut down the zlib stream.
+ */
+static void dmg_close(BlockDriverState *bs)
+{
+    BDRVDMGState *state = bs->opaque;
+
+    close(state->fd);
+
+    if (state->n_chunks > 0) {
+        free(state->types);
+        free(state->offsets);
+        free(state->lengths);
+        free(state->sectors);
+        free(state->sectorcounts);
+    }
+
+    free(state->compressed_chunk);
+    free(state->uncompressed_chunk);
+
+    inflateEnd(&state->zstream);
+}
+
+/* Driver description for the "dmg" format: probe, open, read and close
+ * only; no write callback is provided. */
+static BlockDriver bdrv_dmg = {
+ .format_name = "dmg",
+ .instance_size = sizeof(BDRVDMGState),
+ .bdrv_probe = dmg_probe,
+ .bdrv_open = dmg_open,
+ .bdrv_read = dmg_read,
+ .bdrv_close = dmg_close,
+};
+
+/* Registers the dmg driver; hooked up through block_init() below. */
+static void bdrv_dmg_init(void)
+{
+ bdrv_register(&bdrv_dmg);
+}
+
+block_init(bdrv_dmg_init);
diff --git a/block/nbd.c b/block/nbd.c
new file mode 100644
index 0000000000..47d4778999
--- /dev/null
+++ b/block/nbd.c
@@ -0,0 +1,196 @@
+/*
+ * QEMU Block driver for NBD
+ *
+ * Copyright (C) 2008 Bull S.A.S.
+ * Author: Laurent Vivier <Laurent.Vivier@bull.net>
+ *
+ * Some parts:
+ * Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "nbd.h"
+#include "module.h"
+
+#include <sys/types.h>
+#include <unistd.h>
+
+typedef struct BDRVNBDState { /* per-device state of the NBD client driver */
+ int sock; /* connected socket returned by unix_socket_outgoing()/tcp_socket_outgoing() */
+ off_t size; /* export size reported by nbd_receive_negotiate(); returned by nbd_getlength() */
+ size_t blocksize; /* block size reported by nbd_receive_negotiate(); stored but not read in this file */
+} BDRVNBDState;
+
+/*
+ * Open an NBD export described by filename.
+ *
+ * Accepted forms:
+ *   nbd:unix:/absolute/path   - connect to a UNIX domain socket
+ *   nbd:hostname:port         - connect over TCP
+ *
+ * After connecting, the NBD negotiation is performed and the socket, export
+ * size and block size are stored in the driver state.
+ *
+ * Returns 0 on success, -EINVAL for BDRV_O_CREAT or a malformed filename,
+ * and -errno when connecting or negotiating fails.
+ */
+static int nbd_open(BlockDriverState *bs, const char* filename, int flags)
+{
+    BDRVNBDState *s = bs->opaque;
+    const char *host;
+    const char *unixpath;
+    int sock;
+    off_t size;
+    size_t blocksize;
+    int ret;
+
+    if ((flags & BDRV_O_CREAT))
+        return -EINVAL;
+
+    if (!strstart(filename, "nbd:", &host))
+        return -EINVAL;
+
+    if (strstart(host, "unix:", &unixpath)) {
+
+        /* only absolute socket paths are accepted */
+        if (unixpath[0] != '/')
+            return -EINVAL;
+
+        sock = unix_socket_outgoing(unixpath);
+
+    } else {
+        uint16_t port;
+        char *p, *r;
+        char hostname[128];
+
+        /* work on a bounded copy so the ':' separator can be overwritten */
+        pstrcpy(hostname, 128, host);
+
+        p = strchr(hostname, ':');
+        if (p == NULL)
+            return -EINVAL;
+
+        *p = '\0';
+        p++;
+
+        port = strtol(p, &r, 0);
+        if (r == p)
+            return -EINVAL;
+        sock = tcp_socket_outgoing(hostname, port);
+    }
+
+    if (sock == -1)
+        return -errno;
+
+    ret = nbd_receive_negotiate(sock, &size, &blocksize);
+    if (ret == -1) {
+        /* do not leak the connected socket; close() may clobber errno */
+        int saved_errno = errno;
+
+        close(sock);
+        return -saved_errno;
+    }
+
+    s->sock = sock;
+    s->size = size;
+    s->blocksize = blocksize;
+
+    return 0;
+}
+
+/*
+ * Read nb_sectors 512-byte sectors starting at sector_num from the NBD
+ * server into buf.
+ *
+ * Sends an NBD_CMD_READ request, waits for the reply header, then reads the
+ * payload from the socket.
+ *
+ * Returns 0 on success, -errno if sending the request or receiving the
+ * reply fails, -reply.error if the server reports an error, and -EIO on a
+ * handle mismatch or a short payload read.
+ */
+static int nbd_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVNBDState *state = bs->opaque;
+    struct nbd_request req;
+    struct nbd_reply rep;
+
+    req.type = NBD_CMD_READ;
+    /* the BlockDriverState address doubles as the request handle */
+    req.handle = (uint64_t)(intptr_t)bs;
+    req.from = sector_num * 512;
+    req.len = nb_sectors * 512;
+
+    if (nbd_send_request(state->sock, &req) == -1 ||
+        nbd_receive_reply(state->sock, &rep) == -1)
+        return -errno;
+
+    if (rep.error != 0)
+        return -rep.error;
+
+    /* the reply must belong to our request, and the payload must be complete */
+    if (rep.handle != req.handle ||
+        nbd_wr_sync(state->sock, buf, req.len, 1) != req.len)
+        return -EIO;
+
+    return 0;
+}
+
+/*
+ * Write nb_sectors 512-byte sectors from buf to the NBD server, starting at
+ * sector_num.
+ *
+ * Sends an NBD_CMD_WRITE request followed by the payload, then waits for
+ * the reply.
+ *
+ * Returns 0 on success, -errno if sending the request or receiving the
+ * reply fails, -EIO on a short payload write or a handle mismatch, and
+ * -reply.error if the server reports an error.
+ */
+static int nbd_write(BlockDriverState *bs, int64_t sector_num,
+                     const uint8_t *buf, int nb_sectors)
+{
+    BDRVNBDState *state = bs->opaque;
+    struct nbd_request req;
+    struct nbd_reply rep;
+
+    req.type = NBD_CMD_WRITE;
+    /* the BlockDriverState address doubles as the request handle */
+    req.handle = (uint64_t)(intptr_t)bs;
+    req.from = sector_num * 512;
+    req.len = nb_sectors * 512;
+
+    if (nbd_send_request(state->sock, &req) == -1)
+        return -errno;
+
+    /* the payload follows the request header on the same socket */
+    if (nbd_wr_sync(state->sock, (uint8_t*)buf, req.len, 0) != req.len)
+        return -EIO;
+
+    if (nbd_receive_reply(state->sock, &rep) == -1)
+        return -errno;
+
+    if (rep.error != 0)
+        return -rep.error;
+
+    return (rep.handle != req.handle) ? -EIO : 0;
+}
+
+/*
+ * Send an NBD_CMD_DISC request to the server (its result is not checked)
+ * and close the socket.
+ */
+static void nbd_close(BlockDriverState *bs)
+{
+    BDRVNBDState *state = bs->opaque;
+    struct nbd_request disc;
+
+    disc.type = NBD_CMD_DISC;
+    disc.handle = (uint64_t)(intptr_t)bs;
+    disc.from = 0;
+    disc.len = 0;
+
+    /* best effort: the socket is closed whether or not the send worked */
+    nbd_send_request(state->sock, &disc);
+    close(state->sock);
+}
+
+/* Return the export size recorded by nbd_open() during negotiation. */
+static int64_t nbd_getlength(BlockDriverState *bs)
+{
+    const BDRVNBDState *state = bs->opaque;
+    int64_t length = state->size;
+
+    return length;
+}
+
+static BlockDriver bdrv_nbd = { /* driver table for the NBD client; no probe callback is set */
+ .format_name = "nbd",
+ .instance_size = sizeof(BDRVNBDState), /* size of the per-device BDRVNBDState */
+ .bdrv_open = nbd_open,
+ .bdrv_read = nbd_read,
+ .bdrv_write = nbd_write,
+ .bdrv_close = nbd_close,
+ .bdrv_getlength = nbd_getlength, /* returns the size negotiated in nbd_open() */
+ .protocol_name = "nbd", /* matches the "nbd:" filename prefix that nbd_open() requires */
+};
+
+static void bdrv_nbd_init(void) /* registers the nbd driver table with bdrv_register() */
+{
+ bdrv_register(&bdrv_nbd);
+}
+
+block_init(bdrv_nbd_init); /* block_init() comes from outside this chunk (module.h is included) - presumably a module-init hook; confirm */
diff --git a/block/parallels.c b/block/parallels.c
new file mode 100644
index 0000000000..0b64a5c625
--- /dev/null
+++ b/block/parallels.c
@@ -0,0 +1,181 @@
+/*
+ * Block driver for Parallels disk image format
+ *
+ * Copyright (c) 2007 Alex Beregszaszi
+ *
+ * This code is based on comparing different disk images created by Parallels.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+/**************************************************************/
+
+#define HEADER_MAGIC "WithoutFreeSpace" // 16-byte magic compared against parallels_header.magic
+#define HEADER_VERSION 2 // the only header version probe/open accept
+#define HEADER_SIZE 64 // smallest buffer parallels_probe will inspect
+
+// on-disk image header; multi-byte fields are always little-endian
+struct parallels_header {
+ char magic[16]; // "WithoutFreeSpace"
+ uint32_t version; // must equal HEADER_VERSION
+ uint32_t heads; // not read by the code in this file
+ uint32_t cylinders; // not read by the code in this file
+ uint32_t tracks; // seek_to_sector divides sector numbers by this to pick a catalog entry
+ uint32_t catalog_entries; // number of 32-bit catalog entries read after the header
+ uint32_t nb_sectors; // becomes bs->total_sectors in parallels_open
+ char padding[24];
+} __attribute__((packed));
+
+typedef struct BDRVParallelsState { // per-device state of the Parallels driver
+ int fd; // image file descriptor opened in parallels_open
+
+ uint32_t *catalog_bitmap; // catalog entries in host byte order; an entry of 0 is treated as "not allocated"
+ int catalog_size; // number of entries in catalog_bitmap
+
+ int tracks; // header 'tracks' value; sectors covered by one catalog entry in seek_to_sector
+} BDRVParallelsState;
+
+/*
+ * Probe callback: score a buffer holding the start of an image.
+ *
+ * Returns 100 when the buffer is at least HEADER_SIZE bytes long and starts
+ * with the Parallels magic followed by HEADER_VERSION, otherwise 0.
+ * filename is not used.
+ */
+static int parallels_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    const struct parallels_header *hdr = (const void *)buf;
+
+    if (buf_size >= HEADER_SIZE &&
+        memcmp(hdr->magic, HEADER_MAGIC, 16) == 0 &&
+        le32_to_cpu(hdr->version) == HEADER_VERSION)
+        return 100;
+
+    return 0;
+}
+
+/*
+ * Open a Parallels image: validate the header and load the catalog.
+ *
+ * The file is opened read-write if possible, read-only otherwise, but the
+ * device is always marked read-only because the driver has no write path.
+ *
+ * Header values come from the image file and are validated before use:
+ *   - tracks must be non-zero, because seek_to_sector() divides by it;
+ *   - catalog_entries must be small enough that catalog_entries * 4 fits
+ *     in an int, so the allocation and read sizes cannot overflow.
+ *
+ * Returns 0 on success, -1 on any failure (the descriptor is closed and the
+ * catalog freed).
+ */
+static int parallels_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVParallelsState *s = bs->opaque;
+    int fd, i;
+    struct parallels_header ph;
+    uint32_t entries;
+
+    fd = open(filename, O_RDWR | O_BINARY | O_LARGEFILE);
+    if (fd < 0) {
+        fd = open(filename, O_RDONLY | O_BINARY | O_LARGEFILE);
+        if (fd < 0)
+            return -1;
+    }
+
+    bs->read_only = 1; // no write support yet
+
+    s->fd = fd;
+
+    if (read(fd, &ph, sizeof(ph)) != sizeof(ph))
+        goto fail;
+
+    if (memcmp(ph.magic, HEADER_MAGIC, 16) ||
+        (le32_to_cpu(ph.version) != HEADER_VERSION)) {
+        goto fail;
+    }
+
+    bs->total_sectors = le32_to_cpu(ph.nb_sectors);
+
+    /* the catalog starts right after the 64-byte header */
+    if (lseek(s->fd, 64, SEEK_SET) != 64)
+        goto fail;
+
+    s->tracks = le32_to_cpu(ph.tracks);
+    if (s->tracks == 0)
+        goto fail; /* would divide by zero in seek_to_sector() */
+
+    entries = le32_to_cpu(ph.catalog_entries);
+    if (entries > INT32_MAX / 4)
+        goto fail; /* entries * 4 must fit in an int */
+    s->catalog_size = entries;
+
+    s->catalog_bitmap = qemu_malloc(s->catalog_size * 4);
+    if (read(s->fd, s->catalog_bitmap, s->catalog_size * 4) !=
+        s->catalog_size * 4)
+        goto fail;
+    /* convert the catalog to host byte order once, at load time */
+    for (i = 0; i < s->catalog_size; i++)
+        le32_to_cpus(&s->catalog_bitmap[i]);
+
+    return 0;
+fail:
+    if (s->catalog_bitmap) {
+        qemu_free(s->catalog_bitmap);
+        s->catalog_bitmap = NULL; /* do not leave a dangling pointer behind */
+    }
+    close(fd);
+    return -1;
+}
+
+/*
+ * Position the image file descriptor at the data for sector_num.
+ *
+ * The catalog entry is selected with sector_num / tracks; the remainder is
+ * the sector offset within that entry. An entry of 0, or an index outside
+ * the catalog, means the sector is not allocated.
+ *
+ * Returns 0 when the descriptor is positioned, -1 when the sector is not
+ * allocated or lseek() fails.
+ */
+static inline int seek_to_sector(BlockDriverState *bs, int64_t sector_num)
+{
+    BDRVParallelsState *s = bs->opaque;
+    uint32_t index, offset;
+    uint64_t position;
+
+    index = sector_num / s->tracks;
+    offset = sector_num % s->tracks;
+
+    // not allocated (valid indexes are 0 .. catalog_size - 1)
+    if ((index >= s->catalog_size) || (s->catalog_bitmap[index] == 0))
+        return -1;
+
+    /* widen before scaling so byte offsets of 4 GiB and above do not wrap */
+    position = (uint64_t)(s->catalog_bitmap[index] + offset) * 512;
+
+    if (lseek(s->fd, position, SEEK_SET) != position)
+        return -1;
+
+    return 0;
+}
+
+/*
+ * Read nb_sectors 512-byte sectors starting at sector_num into buf.
+ *
+ * Each sector is located with seek_to_sector(); sectors it reports as
+ * unavailable are returned as zeroes.
+ *
+ * Returns 0 on success, -1 on a short or failed read().
+ */
+static int parallels_read(BlockDriverState *bs, int64_t sector_num,
+                          uint8_t *buf, int nb_sectors)
+{
+    BDRVParallelsState *state = bs->opaque;
+
+    for (; nb_sectors > 0; nb_sectors--, sector_num++, buf += 512) {
+        if (seek_to_sector(bs, sector_num) != 0) {
+            /* no backing data for this sector: it reads as zeroes */
+            memset(buf, 0, 512);
+            continue;
+        }
+        if (read(state->fd, buf, 512) != 512)
+            return -1;
+    }
+    return 0;
+}
+
+/* Free the in-memory catalog and close the image file descriptor. */
+static void parallels_close(BlockDriverState *bs)
+{
+    BDRVParallelsState *state = bs->opaque;
+
+    qemu_free(state->catalog_bitmap);
+    close(state->fd);
+}
+
+static BlockDriver bdrv_parallels = { /* driver table for the "parallels" format; no write callback is set */
+ .format_name = "parallels",
+ .instance_size = sizeof(BDRVParallelsState), /* size of the per-device BDRVParallelsState */
+ .bdrv_probe = parallels_probe,
+ .bdrv_open = parallels_open,
+ .bdrv_read = parallels_read, /* synchronous, sector-at-a-time read */
+ .bdrv_close = parallels_close,
+};
+
+static void bdrv_parallels_init(void) /* registers the parallels driver table with bdrv_register() */
+{
+ bdrv_register(&bdrv_parallels);
+}
+
+block_init(bdrv_parallels_init); /* block_init() comes from outside this chunk (module.h is included) - presumably a module-init hook; confirm */
diff --git a/block/qcow.c b/block/qcow.c
new file mode 100644
index 0000000000..1cf7c3be77
--- /dev/null
+++ b/block/qcow.c
@@ -0,0 +1,945 @@
+/*
+ * Block driver for the QCOW format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+#include <zlib.h>
+#include "aes.h"
+
+/**************************************************************/
+/* QEMU COW block driver with compression and encryption support */
+
+#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) /* "QFI\xfb", compared after big-endian conversion */
+#define QCOW_VERSION 1
+
+#define QCOW_CRYPT_NONE 0
+#define QCOW_CRYPT_AES 1 /* highest crypt_method value qcow_open accepts */
+
+#define QCOW_OFLAG_COMPRESSED (1LL << 63) /* top bit of an L2 entry: the cluster is stored compressed */
+
+typedef struct QCowHeader { /* image header; qcow_open converts the multi-byte fields from big-endian */
+ uint32_t magic; /* must equal QCOW_MAGIC */
+ uint32_t version; /* must equal QCOW_VERSION */
+ uint64_t backing_file_offset; /* file offset of the backing file name, 0 if there is none */
+ uint32_t backing_file_size; /* length of the backing file name in bytes */
+ uint32_t mtime; /* byte-swapped in qcow_open but not otherwise used in this file */
+ uint64_t size; /* in bytes */
+ uint8_t cluster_bits; /* log2 of the cluster size; qcow_open rejects values below 9 */
+ uint8_t l2_bits; /* log2 of the number of entries in one L2 table */
+ uint32_t crypt_method; /* QCOW_CRYPT_NONE or QCOW_CRYPT_AES */
+ uint64_t l1_table_offset; /* file offset of the L1 table */
+} QCowHeader;
+
+#define L2_CACHE_SIZE 16 /* number of L2 tables kept in l2_cache */
+
+typedef struct BDRVQcowState { /* per-device state of the qcow (version 1) driver */
+ BlockDriverState *hd; /* underlying image file opened with bdrv_file_open() */
+ int cluster_bits; /* log2 of cluster_size, from the header */
+ int cluster_size; /* 1 << cluster_bits, in bytes */
+ int cluster_sectors; /* cluster size in 512-byte sectors */
+ int l2_bits; /* log2 of l2_size, from the header */
+ int l2_size; /* entries per L2 table */
+ int l1_size; /* entries in the L1 table */
+ uint64_t cluster_offset_mask; /* masks the file offset out of a compressed L2 entry */
+ uint64_t l1_table_offset; /* file offset of the L1 table */
+ uint64_t *l1_table; /* L1 table in host byte order */
+ uint64_t *l2_cache; /* L2_CACHE_SIZE cached L2 tables; entries stay big-endian */
+ uint64_t l2_cache_offsets[L2_CACHE_SIZE]; /* file offset of the L2 table held in each cache slot */
+ uint32_t l2_cache_counts[L2_CACHE_SIZE]; /* hit counters used to choose the slot to replace */
+ uint8_t *cluster_cache; /* last decompressed cluster */
+ uint8_t *cluster_data; /* scratch buffer for compressed or encrypted cluster data */
+ uint64_t cluster_cache_offset; /* file offset of the cluster in cluster_cache; set to -1 to invalidate */
+ uint32_t crypt_method; /* current crypt method, 0 if no key yet */
+ uint32_t crypt_method_header; /* crypt method recorded in the image header */
+ AES_KEY aes_encrypt_key;
+ AES_KEY aes_decrypt_key;
+} BDRVQcowState;
+
+static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset);
+
+/*
+ * Probe callback: return 100 when buf is large enough to hold a QCowHeader
+ * and carries the qcow magic and version 1 (both big-endian), otherwise 0.
+ * filename is not used.
+ */
+static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    const QCowHeader *hdr = (const void *)buf;
+
+    if (buf_size < sizeof(QCowHeader))
+        return 0;
+    if (be32_to_cpu(hdr->magic) != QCOW_MAGIC)
+        return 0;
+    if (be32_to_cpu(hdr->version) != QCOW_VERSION)
+        return 0;
+    return 100;
+}
+
+/*
+ * Open a qcow (version 1) image.
+ *
+ * Reads and byte-swaps the header, validates it, loads the L1 table,
+ * allocates the L2 cache and the cluster buffers, and reads the backing
+ * file name into bs->backing_file when the header has one.
+ *
+ * The header comes from the image file, so its geometry is bounded before
+ * being used in shifts and allocation sizes:
+ *   - cluster_bits must be in 9..16 (512-byte to 64 KB clusters);
+ *   - l2_bits must be at most 13 (an L2 table of at most 64 KB);
+ *   - the L1 entry count must not wrap, and its byte size must fit in an
+ *     int.
+ *
+ * Returns 0 on success, the bdrv_file_open() error if the file cannot be
+ * opened, and -1 on any other failure (all allocations are released).
+ */
+static int qcow_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVQcowState *s = bs->opaque;
+    int len, i, shift, ret;
+    uint64_t l1_entries;
+    QCowHeader header;
+
+    ret = bdrv_file_open(&s->hd, filename, flags);
+    if (ret < 0)
+        return ret;
+    if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header))
+        goto fail;
+    be32_to_cpus(&header.magic);
+    be32_to_cpus(&header.version);
+    be64_to_cpus(&header.backing_file_offset);
+    be32_to_cpus(&header.backing_file_size);
+    be32_to_cpus(&header.mtime);
+    be64_to_cpus(&header.size);
+    be32_to_cpus(&header.crypt_method);
+    be64_to_cpus(&header.l1_table_offset);
+
+    if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION)
+        goto fail;
+    if (header.size <= 1 || header.cluster_bits < 9)
+        goto fail;
+    /* bound the shift counts taken from the file: larger values make the
+       shifts below undefined and overflow the allocation sizes */
+    if (header.cluster_bits > 16 || header.l2_bits > 16 - 3)
+        goto fail;
+    if (header.crypt_method > QCOW_CRYPT_AES)
+        goto fail;
+    s->crypt_method_header = header.crypt_method;
+    if (s->crypt_method_header)
+        bs->encrypted = 1;
+    s->cluster_bits = header.cluster_bits;
+    s->cluster_size = 1 << s->cluster_bits;
+    s->cluster_sectors = 1 << (s->cluster_bits - 9);
+    s->l2_bits = header.l2_bits;
+    s->l2_size = 1 << s->l2_bits;
+    bs->total_sectors = header.size / 512;
+    s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
+
+    /* read the level 1 table */
+    shift = s->cluster_bits + s->l2_bits;
+    if (header.size > UINT64_MAX - (1LL << shift))
+        goto fail; /* rounding up below would wrap around */
+    l1_entries = (header.size + (1LL << shift) - 1) >> shift;
+    if (l1_entries > INT32_MAX / sizeof(uint64_t))
+        goto fail; /* the table size in bytes must fit in an int */
+    s->l1_size = l1_entries;
+
+    s->l1_table_offset = header.l1_table_offset;
+    s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
+    if (!s->l1_table)
+        goto fail;
+    if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
+        s->l1_size * sizeof(uint64_t))
+        goto fail;
+    for(i = 0;i < s->l1_size; i++) {
+        be64_to_cpus(&s->l1_table[i]);
+    }
+    /* alloc L2 cache */
+    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+    if (!s->l2_cache)
+        goto fail;
+    s->cluster_cache = qemu_malloc(s->cluster_size);
+    if (!s->cluster_cache)
+        goto fail;
+    s->cluster_data = qemu_malloc(s->cluster_size);
+    if (!s->cluster_data)
+        goto fail;
+    s->cluster_cache_offset = -1;
+
+    /* read the backing file name */
+    if (header.backing_file_offset != 0) {
+        len = header.backing_file_size;
+        if (len > 1023)
+            len = 1023;
+        if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len)
+            goto fail;
+        bs->backing_file[len] = '\0';
+    }
+    return 0;
+
+ fail:
+    qemu_free(s->l1_table);
+    qemu_free(s->l2_cache);
+    qemu_free(s->cluster_cache);
+    qemu_free(s->cluster_data);
+    bdrv_delete(s->hd);
+    return -1;
+}
+
+/*
+ * Install the encryption key for an encrypted image.
+ *
+ * The first 16 bytes of the key string are used, zero-padded when the
+ * string is shorter, as a 128-bit AES key. The active crypt method is set
+ * from the image header before the AES key schedules are built.
+ *
+ * Returns 0 on success, -1 if either AES key setup call fails.
+ */
+static int qcow_set_key(BlockDriverState *bs, const char *key)
+{
+    BDRVQcowState *state = bs->opaque;
+    uint8_t keybuf[16];
+    int len;
+
+    len = strlen(key);
+    if (len > 16)
+        len = 16;
+    /* XXX: we could compress the chars to 7 bits to increase
+       entropy */
+    memset(keybuf, 0, sizeof(keybuf));
+    memcpy(keybuf, key, len);
+
+    state->crypt_method = state->crypt_method_header;
+
+    if (AES_set_encrypt_key(keybuf, 128, &state->aes_encrypt_key) != 0 ||
+        AES_set_decrypt_key(keybuf, 128, &state->aes_decrypt_key) != 0)
+        return -1;
+    return 0;
+}
+
+/*
+ * Encrypt (enc != 0) or decrypt (enc == 0) nb_sectors 512-byte sectors
+ * from in_buf into out_buf with AES-CBC.
+ *
+ * Each sector gets its own IV: the little-endian sector number in the low
+ * 64 bits and zero in the high 64 bits. The crypt function is compatible
+ * with the linux cryptoloop algorithm for < 4 GB images.
+ * NOTE: out_buf == in_buf is supported.
+ */
+static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
+                            uint8_t *out_buf, const uint8_t *in_buf,
+                            int nb_sectors, int enc,
+                            const AES_KEY *key)
+{
+    union {
+        uint64_t ll[2];
+        uint8_t b[16];
+    } iv;
+    int k;
+
+    for (k = 0; k < nb_sectors; k++) {
+        size_t byte_off = (size_t)k * 512;
+
+        iv.ll[0] = cpu_to_le64(sector_num + k);
+        iv.ll[1] = 0;
+        AES_cbc_encrypt(in_buf + byte_off, out_buf + byte_off, 512, key,
+                        iv.b, enc);
+    }
+}
+
+/*
+ * Look up, and optionally allocate, the cluster that holds byte 'offset'
+ * of the virtual disk.
+ *
+ * 'allocate' is:
+ *
+ * 0 to not allocate.
+ *
+ * 1 to allocate a normal cluster (for sector indexes 'n_start' to
+ * 'n_end')
+ *
+ * 2 to allocate a compressed cluster of size
+ * 'compressed_size'. 'compressed_size' must be > 0 and <
+ * cluster_size
+ *
+ * Returns the L2 entry for the cluster (file offset, plus the compressed
+ * flag and size bits for compressed clusters), or 0 if the cluster is not
+ * allocated or an I/O error occurred. Every error path returns 0: the
+ * return type is unsigned and callers test for failure with
+ * '!cluster_offset'.
+ */
+static uint64_t get_cluster_offset(BlockDriverState *bs,
+                                   uint64_t offset, int allocate,
+                                   int compressed_size,
+                                   int n_start, int n_end)
+{
+    BDRVQcowState *s = bs->opaque;
+    int min_index, i, j, l1_index, l2_index;
+    uint64_t l2_offset, *l2_table, cluster_offset, tmp;
+    uint32_t min_count;
+    int new_l2_table;
+
+    l1_index = offset >> (s->l2_bits + s->cluster_bits);
+    l2_offset = s->l1_table[l1_index];
+    new_l2_table = 0;
+    if (!l2_offset) {
+        if (!allocate)
+            return 0;
+        /* allocate a new l2 entry */
+        l2_offset = bdrv_getlength(s->hd);
+        /* round to cluster size */
+        l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1);
+        /* update the L1 entry */
+        s->l1_table[l1_index] = l2_offset;
+        tmp = cpu_to_be64(l2_offset);
+        if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp),
+                        &tmp, sizeof(tmp)) != sizeof(tmp))
+            return 0;
+        new_l2_table = 1;
+    }
+    /* look for the L2 table in the cache */
+    for(i = 0; i < L2_CACHE_SIZE; i++) {
+        if (l2_offset == s->l2_cache_offsets[i]) {
+            /* increment the hit count; halve all counts before one of
+               them can overflow */
+            if (++s->l2_cache_counts[i] == 0xffffffff) {
+                for(j = 0; j < L2_CACHE_SIZE; j++) {
+                    s->l2_cache_counts[j] >>= 1;
+                }
+            }
+            l2_table = s->l2_cache + (i << s->l2_bits);
+            goto found;
+        }
+    }
+    /* not found: load a new entry in the least used one */
+    min_index = 0;
+    min_count = 0xffffffff;
+    for(i = 0; i < L2_CACHE_SIZE; i++) {
+        if (s->l2_cache_counts[i] < min_count) {
+            min_count = s->l2_cache_counts[i];
+            min_index = i;
+        }
+    }
+    l2_table = s->l2_cache + (min_index << s->l2_bits);
+    if (new_l2_table) {
+        /* a fresh L2 table starts out all zero, on disk and in the cache */
+        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
+        if (bdrv_pwrite(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
+            s->l2_size * sizeof(uint64_t))
+            return 0;
+    } else {
+        if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
+            s->l2_size * sizeof(uint64_t))
+            return 0;
+    }
+    s->l2_cache_offsets[min_index] = l2_offset;
+    s->l2_cache_counts[min_index] = 1;
+ found:
+    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+    cluster_offset = be64_to_cpu(l2_table[l2_index]);
+    if (!cluster_offset ||
+        ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) {
+        if (!allocate)
+            return 0;
+        /* allocate a new cluster */
+        if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
+            (n_end - n_start) < s->cluster_sectors) {
+            /* if the cluster is already compressed, we must
+               decompress it in the case it is not completely
+               overwritten */
+            if (decompress_cluster(s, cluster_offset) < 0)
+                return 0;
+            cluster_offset = bdrv_getlength(s->hd);
+            cluster_offset = (cluster_offset + s->cluster_size - 1) &
+                ~(s->cluster_size - 1);
+            /* write the cluster content */
+            if (bdrv_pwrite(s->hd, cluster_offset, s->cluster_cache, s->cluster_size) !=
+                s->cluster_size)
+                return 0; /* 0, not -1: the return type is unsigned */
+        } else {
+            cluster_offset = bdrv_getlength(s->hd);
+            if (allocate == 1) {
+                /* round to cluster size */
+                cluster_offset = (cluster_offset + s->cluster_size - 1) &
+                    ~(s->cluster_size - 1);
+                bdrv_truncate(s->hd, cluster_offset + s->cluster_size);
+                /* if encrypted, we must initialize the cluster
+                   content which won't be written */
+                if (s->crypt_method &&
+                    (n_end - n_start) < s->cluster_sectors) {
+                    uint64_t start_sect;
+                    start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
+                    /* second half of cluster_data holds the zero plaintext,
+                       first half receives the ciphertext */
+                    memset(s->cluster_data + 512, 0x00, 512);
+                    for(i = 0; i < s->cluster_sectors; i++) {
+                        if (i < n_start || i >= n_end) {
+                            encrypt_sectors(s, start_sect + i,
+                                            s->cluster_data,
+                                            s->cluster_data + 512, 1, 1,
+                                            &s->aes_encrypt_key);
+                            if (bdrv_pwrite(s->hd, cluster_offset + i * 512,
+                                            s->cluster_data, 512) != 512)
+                                return 0; /* 0, not -1: the return type is unsigned */
+                        }
+                    }
+                }
+            } else if (allocate == 2) {
+                /* compressed clusters are not aligned; the size goes in
+                   the bits above the offset */
+                cluster_offset |= QCOW_OFLAG_COMPRESSED |
+                    (uint64_t)compressed_size << (63 - s->cluster_bits);
+            }
+        }
+        /* update L2 table */
+        tmp = cpu_to_be64(cluster_offset);
+        l2_table[l2_index] = tmp;
+        if (bdrv_pwrite(s->hd,
+                        l2_offset + l2_index * sizeof(tmp), &tmp, sizeof(tmp)) != sizeof(tmp))
+            return 0;
+    }
+    return cluster_offset;
+}
+
+/*
+ * Report whether the cluster containing sector_num is allocated in this
+ * image.
+ *
+ * *pnum receives the number of sectors, starting at sector_num and capped
+ * at nb_sectors, that lie in the same cluster and so share the answer.
+ * Returns non-zero if the cluster is allocated, 0 otherwise.
+ */
+static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
+                             int nb_sectors, int *pnum)
+{
+    BDRVQcowState *state = bs->opaque;
+    uint64_t cluster_offset;
+    int first, run;
+
+    cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
+
+    /* sectors remaining in this cluster from sector_num onwards */
+    first = sector_num & (state->cluster_sectors - 1);
+    run = state->cluster_sectors - first;
+    *pnum = (run > nb_sectors) ? nb_sectors : run;
+
+    return (cluster_offset != 0);
+}
+
+/*
+ * Inflate buf_size bytes of raw deflate data (no zlib header, 4 KB window)
+ * from buf into out_buf.
+ *
+ * Succeeds only when exactly out_buf_size bytes were produced and inflate()
+ * returned Z_STREAM_END or Z_BUF_ERROR.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
+                             const uint8_t *buf, int buf_size)
+{
+    z_stream zs;
+    int rc, produced;
+
+    memset(&zs, 0, sizeof(zs));
+
+    zs.next_in = (uint8_t *)buf;
+    zs.avail_in = buf_size;
+    zs.next_out = out_buf;
+    zs.avail_out = out_buf_size;
+
+    if (inflateInit2(&zs, -12) != Z_OK)
+        return -1;
+
+    rc = inflate(&zs, Z_FINISH);
+    produced = zs.next_out - out_buf;
+    inflateEnd(&zs);
+
+    if (rc != Z_STREAM_END && rc != Z_BUF_ERROR)
+        return -1;
+    if (produced != out_buf_size)
+        return -1;
+    return 0;
+}
+
+/*
+ * Make s->cluster_cache hold the decompressed contents of the compressed
+ * cluster described by the L2 entry cluster_offset.
+ *
+ * Nothing is read when the cache already holds that cluster. Otherwise the
+ * compressed bytes are read into s->cluster_data and inflated into
+ * s->cluster_cache.
+ *
+ * Returns 0 on success, -1 on a short read or a decompression failure.
+ */
+static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset)
+{
+    uint64_t file_off = cluster_offset & s->cluster_offset_mask;
+    int comp_len;
+
+    if (s->cluster_cache_offset == file_off)
+        return 0; /* already cached */
+
+    /* the compressed size sits in the bits above the file offset */
+    comp_len = cluster_offset >> (63 - s->cluster_bits);
+    comp_len &= (s->cluster_size - 1);
+
+    if (bdrv_pread(s->hd, file_off, s->cluster_data, comp_len) != comp_len)
+        return -1;
+    if (decompress_buffer(s->cluster_cache, s->cluster_size,
+                          s->cluster_data, comp_len) < 0)
+        return -1;
+
+    s->cluster_cache_offset = file_off;
+    return 0;
+}
+
+#if 0 /* synchronous read path, compiled out; the bdrv_qcow table only sets bdrv_aio_readv */
+
+static int qcow_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+{
+ BDRVQcowState *s = bs->opaque;
+ int ret, index_in_cluster, n;
+ uint64_t cluster_offset;
+
+ while (nb_sectors > 0) { /* one iteration per cluster touched by the request */
+ cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
+ index_in_cluster = sector_num & (s->cluster_sectors - 1);
+ n = s->cluster_sectors - index_in_cluster;
+ if (n > nb_sectors)
+ n = nb_sectors;
+ if (!cluster_offset) { /* unallocated: backing image if present, else zeroes */
+ if (bs->backing_hd) {
+ /* read from the base image */
+ ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
+ if (ret < 0)
+ return -1;
+ } else {
+ memset(buf, 0, 512 * n);
+ }
+ } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) { /* compressed: copy out of the decompressed cache */
+ if (decompress_cluster(s, cluster_offset) < 0)
+ return -1;
+ memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n);
+ } else { /* plain cluster: read, then decrypt in place if a key is set */
+ ret = bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
+ if (ret != n * 512)
+ return -1;
+ if (s->crypt_method) {
+ encrypt_sectors(s, sector_num, buf, buf, n, 0,
+ &s->aes_decrypt_key);
+ }
+ }
+ nb_sectors -= n;
+ sector_num += n;
+ buf += n * 512;
+ }
+ return 0;
+}
+#endif
+
+/*
+ * Synchronously write nb_sectors 512-byte sectors from buf, starting at
+ * sector_num.
+ *
+ * The request is split at cluster boundaries. For each piece the cluster is
+ * looked up (and allocated if needed); the data is encrypted into
+ * s->cluster_data first when a crypt method is active. The decompressed
+ * cluster cache is invalidated once all pieces are written.
+ *
+ * Returns 0 on success, -1 if a cluster cannot be obtained or a write is
+ * short.
+ */
+static int qcow_write(BlockDriverState *bs, int64_t sector_num,
+                      const uint8_t *buf, int nb_sectors)
+{
+    BDRVQcowState *state = bs->opaque;
+
+    while (nb_sectors > 0) {
+        const uint8_t *src = buf;
+        uint64_t cluster_offset;
+        int first, count, written;
+
+        /* clamp this piece to the end of the current cluster */
+        first = sector_num & (state->cluster_sectors - 1);
+        count = state->cluster_sectors - first;
+        if (count > nb_sectors)
+            count = nb_sectors;
+
+        cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0,
+                                            first, first + count);
+        if (!cluster_offset)
+            return -1;
+
+        if (state->crypt_method) {
+            encrypt_sectors(state, sector_num, state->cluster_data, buf,
+                            count, 1, &state->aes_encrypt_key);
+            src = state->cluster_data;
+        }
+        written = bdrv_pwrite(state->hd, cluster_offset + first * 512,
+                              src, count * 512);
+        if (written != count * 512)
+            return -1;
+
+        nb_sectors -= count;
+        sector_num += count;
+        buf += count * 512;
+    }
+    state->cluster_cache_offset = -1; /* disable compressed cache */
+    return 0;
+}
+
+typedef struct QCowAIOCB { /* state of one in-flight qcow AIO read or write request */
+ BlockDriverAIOCB common; /* first member, so a BlockDriverAIOCB pointer can be cast back (see qcow_aio_cancel) */
+ int64_t sector_num; /* next sector still to be processed */
+ QEMUIOVector *qiov; /* caller's I/O vector */
+ uint8_t *buf; /* current position in the linear data buffer */
+ void *orig_buf; /* bounce buffer, only set when qiov has more than one element */
+ int nb_sectors; /* sectors still to be processed */
+ int n; /* sectors covered by the step currently in flight */
+ uint64_t cluster_offset; /* L2 entry of the cluster for the current read step */
+ uint8_t *cluster_data; /* encryption scratch buffer, allocated on demand by the write callback */
+ struct iovec hd_iov; /* single-element iovec describing the current step */
+ QEMUIOVector hd_qiov; /* wraps hd_iov for the request passed to the lower layer */
+ BlockDriverAIOCB *hd_aiocb; /* lower-level request in flight, NULL if none */
+} QCowAIOCB;
+
+/*
+ * Completion callback and state machine for an AIO read.
+ *
+ * Called once directly from qcow_aio_readv() (with acb->n == 0) to start
+ * the request, and then each time a lower-level read completes. Each pass
+ * post-processes the step that just finished (decrypting in place if
+ * needed), advances the request, and issues the next step:
+ *   - unallocated cluster: read from the backing image, or zero-fill;
+ *   - compressed cluster: decompress synchronously and copy;
+ *   - plain cluster: asynchronous read from the image file.
+ * Steps that complete synchronously loop through 'redo'.
+ *
+ * On completion or error the caller's callback is invoked with ret (0 on
+ * success, negative on failure) and the acb is released. Failure to submit
+ * a lower-level read, or to decompress a cluster, is reported as -EIO.
+ */
+static void qcow_aio_read_cb(void *opaque, int ret)
+{
+    QCowAIOCB *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVQcowState *s = bs->opaque;
+    int index_in_cluster;
+
+    acb->hd_aiocb = NULL;
+    if (ret < 0)
+        goto done;
+
+ redo:
+    /* post process the read buffer */
+    if (!acb->cluster_offset) {
+        /* nothing to do */
+    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
+        /* nothing to do */
+    } else {
+        if (s->crypt_method) {
+            encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf,
+                            acb->n, 0,
+                            &s->aes_decrypt_key);
+        }
+    }
+
+    acb->nb_sectors -= acb->n;
+    acb->sector_num += acb->n;
+    acb->buf += acb->n * 512;
+
+    if (acb->nb_sectors == 0) {
+        /* request completed */
+        ret = 0;
+        goto done;
+    }
+
+    /* prepare next AIO request */
+    acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9,
+                                             0, 0, 0, 0);
+    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
+    acb->n = s->cluster_sectors - index_in_cluster;
+    if (acb->n > acb->nb_sectors)
+        acb->n = acb->nb_sectors;
+
+    if (!acb->cluster_offset) {
+        if (bs->backing_hd) {
+            /* read from the base image */
+            acb->hd_iov.iov_base = (void *)acb->buf;
+            acb->hd_iov.iov_len = acb->n * 512;
+            qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+            acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
+                &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
+            if (acb->hd_aiocb == NULL) {
+                /* ret is still >= 0 here; report the failed submission */
+                ret = -EIO;
+                goto done;
+            }
+        } else {
+            /* Note: in this case, no need to wait */
+            memset(acb->buf, 0, 512 * acb->n);
+            goto redo;
+        }
+    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
+        /* add AIO support for compressed blocks ? */
+        if (decompress_cluster(s, acb->cluster_offset) < 0) {
+            ret = -EIO;
+            goto done;
+        }
+        memcpy(acb->buf,
+               s->cluster_cache + index_in_cluster * 512, 512 * acb->n);
+        goto redo;
+    } else {
+        /* plain clusters must be sector aligned in the image file */
+        if ((acb->cluster_offset & 511) != 0) {
+            ret = -EIO;
+            goto done;
+        }
+        acb->hd_iov.iov_base = (void *)acb->buf;
+        acb->hd_iov.iov_len = acb->n * 512;
+        qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+        acb->hd_aiocb = bdrv_aio_readv(s->hd,
+                            (acb->cluster_offset >> 9) + index_in_cluster,
+                            &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
+        if (acb->hd_aiocb == NULL) {
+            ret = -EIO;
+            goto done;
+        }
+    }
+
+    return;
+
+done:
+    /* multi-element vectors were read through a bounce buffer */
+    if (acb->qiov->niov > 1) {
+        qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size);
+        qemu_vfree(acb->orig_buf);
+    }
+    acb->common.cb(acb->common.opaque, ret);
+    qemu_aio_release(acb);
+}
+
+/*
+ * Start an asynchronous read of nb_sectors sectors at sector_num into qiov.
+ *
+ * A vector with more than one element is read through a freshly allocated
+ * linear bounce buffer; a single-element vector is read into directly. The
+ * request is driven by qcow_aio_read_cb(), which is called once here to
+ * issue the first step.
+ *
+ * Returns the request handle, or NULL if no acb could be obtained.
+ */
+static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    QCowAIOCB *req = qemu_aio_get(bs, cb, opaque);
+
+    if (req == NULL)
+        return NULL;
+
+    req->hd_aiocb = NULL;
+    req->qiov = qiov;
+    req->sector_num = sector_num;
+    req->nb_sectors = nb_sectors;
+    /* n == 0 and cluster_offset == 0 make the first callback pass a no-op
+       for post-processing */
+    req->n = 0;
+    req->cluster_offset = 0;
+
+    if (qiov->niov > 1) {
+        req->orig_buf = qemu_blockalign(bs, qiov->size);
+        req->buf = req->orig_buf;
+    } else {
+        req->buf = (uint8_t *)qiov->iov->iov_base;
+    }
+
+    qcow_aio_read_cb(req, 0);
+    return &req->common;
+}
+
+/*
+ * Completion callback and state machine for an AIO write.
+ *
+ * Called once directly from qcow_aio_writev() (with acb->n == 0) to start
+ * the request, and then each time a lower-level write completes. Each pass
+ * advances past the step that just finished, looks up (allocating if
+ * needed) the cluster for the next step, encrypts the data into
+ * acb->cluster_data when a crypt method is active, and submits the write.
+ *
+ * On completion or error the caller's callback is invoked with ret (0 on
+ * success, negative on failure) and the acb is released. Failure to submit
+ * the lower-level write is reported as -EIO.
+ *
+ * NOTE(review): acb->cluster_data is allocated here on demand but is not
+ * freed anywhere in this function - confirm who owns it.
+ */
+static void qcow_aio_write_cb(void *opaque, int ret)
+{
+    QCowAIOCB *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVQcowState *s = bs->opaque;
+    int index_in_cluster;
+    uint64_t cluster_offset;
+    const uint8_t *src_buf;
+
+    acb->hd_aiocb = NULL;
+
+    if (ret < 0)
+        goto done;
+
+    acb->nb_sectors -= acb->n;
+    acb->sector_num += acb->n;
+    acb->buf += acb->n * 512;
+
+    if (acb->nb_sectors == 0) {
+        /* request completed */
+        ret = 0;
+        goto done;
+    }
+
+    /* clamp the next step to the end of the current cluster */
+    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
+    acb->n = s->cluster_sectors - index_in_cluster;
+    if (acb->n > acb->nb_sectors)
+        acb->n = acb->nb_sectors;
+    cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, 1, 0,
+                                        index_in_cluster,
+                                        index_in_cluster + acb->n);
+    /* 0 means failure; a plain cluster must also be sector aligned */
+    if (!cluster_offset || (cluster_offset & 511) != 0) {
+        ret = -EIO;
+        goto done;
+    }
+    if (s->crypt_method) {
+        if (!acb->cluster_data) {
+            acb->cluster_data = qemu_mallocz(s->cluster_size);
+            if (!acb->cluster_data) {
+                ret = -ENOMEM;
+                goto done;
+            }
+        }
+        encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
+                        acb->n, 1, &s->aes_encrypt_key);
+        src_buf = acb->cluster_data;
+    } else {
+        src_buf = acb->buf;
+    }
+
+    acb->hd_iov.iov_base = (void *)src_buf;
+    acb->hd_iov.iov_len = acb->n * 512;
+    qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+    acb->hd_aiocb = bdrv_aio_writev(s->hd,
+                                    (cluster_offset >> 9) + index_in_cluster,
+                                    &acb->hd_qiov, acb->n,
+                                    qcow_aio_write_cb, acb);
+    if (acb->hd_aiocb == NULL) {
+        /* ret is still >= 0 here; report the failed submission */
+        ret = -EIO;
+        goto done;
+    }
+    return;
+
+done:
+    /* multi-element vectors were written from a bounce buffer */
+    if (acb->qiov->niov > 1)
+        qemu_vfree(acb->orig_buf);
+    acb->common.cb(acb->common.opaque, ret);
+    qemu_aio_release(acb);
+}
+
+/*
+ * Start an asynchronous write of nb_sectors sectors at sector_num from
+ * qiov.
+ *
+ * The decompressed cluster cache is invalidated first. A vector with more
+ * than one element is flattened into a freshly allocated bounce buffer; a
+ * single-element vector is written from directly. The request is driven by
+ * qcow_aio_write_cb(), which is called once here to issue the first step.
+ *
+ * Returns the request handle, or NULL if no acb could be obtained.
+ */
+static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVQcowState *state = bs->opaque;
+    QCowAIOCB *req;
+
+    state->cluster_cache_offset = -1; /* disable compressed cache */
+
+    req = qemu_aio_get(bs, cb, opaque);
+    if (req == NULL)
+        return NULL;
+
+    req->hd_aiocb = NULL;
+    req->qiov = qiov;
+    req->sector_num = sector_num;
+    req->nb_sectors = nb_sectors;
+    req->n = 0;
+
+    if (qiov->niov > 1) {
+        req->orig_buf = qemu_blockalign(bs, qiov->size);
+        req->buf = req->orig_buf;
+        qemu_iovec_to_buffer(qiov, req->buf);
+    } else {
+        req->buf = (uint8_t *)qiov->iov->iov_base;
+    }
+
+    qcow_aio_write_cb(req, 0);
+    return &req->common;
+}
+
+/*
+ * Cancel an AIO request: cancel the lower-level request if one is in
+ * flight, then release the acb.
+ */
+static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    QCowAIOCB *req = (QCowAIOCB *)blockacb;
+
+    if (req->hd_aiocb != NULL) {
+        bdrv_aio_cancel(req->hd_aiocb);
+    }
+    qemu_aio_release(req);
+}
+
+/*
+ * Free the L1 table, the L2 cache and both cluster buffers, then delete
+ * the underlying image file state.
+ */
+static void qcow_close(BlockDriverState *bs)
+{
+    BDRVQcowState *state = bs->opaque;
+
+    /* lookup tables */
+    qemu_free(state->l1_table);
+    qemu_free(state->l2_cache);
+
+    /* cluster buffers */
+    qemu_free(state->cluster_cache);
+    qemu_free(state->cluster_data);
+
+    bdrv_delete(state->hd);
+}
+
+/*
+ * Create a new, empty qcow (version 1) image.
+ *
+ * total_size is given in 512-byte sectors. The file is laid out as the
+ * header, the optional backing file name, padding to an 8-byte boundary,
+ * and a zero-filled L1 table.
+ *
+ * With a backing file, 512-byte clusters are used to avoid copying
+ * unmodified sectors; otherwise 4 KB clusters. The special backing file
+ * name "fat:" (vvfat) is not recorded in the image. BLOCK_FLAG_ENCRYPT in
+ * flags selects AES encryption.
+ *
+ * Returns 0 on success, -1 if the file cannot be created or any write or
+ * seek fails.
+ */
+static int qcow_create(const char *filename, int64_t total_size,
+                       const char *backing_file, int flags)
+{
+    int fd, header_size, backing_filename_len, l1_size, i, shift;
+    QCowHeader header;
+    uint64_t tmp;
+    int ret = -1;
+
+    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
+    if (fd < 0)
+        return -1;
+    memset(&header, 0, sizeof(header));
+    header.magic = cpu_to_be32(QCOW_MAGIC);
+    header.version = cpu_to_be32(QCOW_VERSION);
+    header.size = cpu_to_be64(total_size * 512);
+    header_size = sizeof(header);
+    backing_filename_len = 0;
+    if (backing_file) {
+        if (strcmp(backing_file, "fat:")) {
+            header.backing_file_offset = cpu_to_be64(header_size);
+            backing_filename_len = strlen(backing_file);
+            header.backing_file_size = cpu_to_be32(backing_filename_len);
+            header_size += backing_filename_len;
+        } else {
+            /* special backing file for vvfat */
+            backing_file = NULL;
+        }
+        header.cluster_bits = 9; /* 512 byte cluster to avoid copying
+                                    unmodified sectors */
+        header.l2_bits = 12; /* 32 KB L2 tables */
+    } else {
+        header.cluster_bits = 12; /* 4 KB clusters */
+        header.l2_bits = 9; /* 4 KB L2 tables */
+    }
+    /* the L1 table starts at the next 8-byte boundary */
+    header_size = (header_size + 7) & ~7;
+    shift = header.cluster_bits + header.l2_bits;
+    l1_size = ((total_size * 512) + (1LL << shift) - 1) >> shift;
+
+    header.l1_table_offset = cpu_to_be64(header_size);
+    if (flags & BLOCK_FLAG_ENCRYPT) {
+        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
+    } else {
+        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
+    }
+
+    /* write all the data; any short write or failed seek aborts creation */
+    if (write(fd, &header, sizeof(header)) != sizeof(header))
+        goto out;
+    if (backing_file &&
+        write(fd, backing_file, backing_filename_len) != backing_filename_len)
+        goto out;
+    if (lseek(fd, header_size, SEEK_SET) != header_size)
+        goto out;
+    tmp = 0;
+    for(i = 0;i < l1_size; i++) {
+        if (write(fd, &tmp, sizeof(tmp)) != sizeof(tmp))
+            goto out;
+    }
+    ret = 0;
+out:
+    close(fd);
+    return ret;
+}
+
+/*
+ * Discard all allocated clusters: zero the L1 table in memory and on disk,
+ * truncate the image file to just after the L1 table, and reset the L2
+ * cache.
+ *
+ * Returns 0 on success, -1 if rewriting the L1 table fails, or the
+ * bdrv_truncate() error.
+ */
+static int qcow_make_empty(BlockDriverState *bs)
+{
+    BDRVQcowState *state = bs->opaque;
+    uint32_t l1_bytes = state->l1_size * sizeof(uint64_t);
+    int rc;
+
+    memset(state->l1_table, 0, l1_bytes);
+    if (bdrv_pwrite(state->hd, state->l1_table_offset,
+                    state->l1_table, l1_bytes) < 0)
+        return -1;
+
+    /* everything after the L1 table is dropped */
+    rc = bdrv_truncate(state->hd, state->l1_table_offset + l1_bytes);
+    if (rc < 0)
+        return rc;
+
+    /* no cached L2 table is valid any more */
+    memset(state->l2_cache, 0,
+           state->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+    memset(state->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
+    memset(state->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
+
+    return 0;
+}
+
+/*
+ * Write exactly one cluster of data in compressed form.
+ *
+ * nb_sectors must equal the cluster size in sectors. The data is deflated;
+ * if it does not shrink below one cluster it is written uncompressed with
+ * qcow_write() instead. Otherwise a compressed cluster is allocated and
+ * the deflated bytes are written at its file offset.
+ *
+ * Returns 0 on success, -EINVAL if nb_sectors is not one cluster, and -1
+ * on allocation, compression or write failure.
+ *
+ * XXX: put compressed sectors first, then all the cluster aligned
+ * tables to avoid losing bytes in alignment
+ */
+static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
+                                 const uint8_t *buf, int nb_sectors)
+{
+    BDRVQcowState *s = bs->opaque;
+    z_stream strm;
+    int ret, out_len;
+    uint8_t *out_buf;
+    uint64_t cluster_offset;
+    int result = -1;
+
+    if (nb_sectors != s->cluster_sectors)
+        return -EINVAL;
+
+    out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
+    if (!out_buf)
+        return -1;
+
+    /* default compression level, small (4 KB) window, no zlib header
+       (negative windowBits selects raw deflate) */
+    memset(&strm, 0, sizeof(strm));
+    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+                       Z_DEFLATED, -12,
+                       9, Z_DEFAULT_STRATEGY);
+    if (ret != 0)
+        goto out;
+
+    strm.avail_in = s->cluster_size;
+    strm.next_in = (uint8_t *)buf;
+    strm.avail_out = s->cluster_size;
+    strm.next_out = out_buf;
+
+    ret = deflate(&strm, Z_FINISH);
+    if (ret != Z_STREAM_END && ret != Z_OK) {
+        deflateEnd(&strm);
+        goto out;
+    }
+    out_len = strm.next_out - out_buf;
+
+    deflateEnd(&strm);
+
+    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
+        /* could not compress: write normal cluster */
+        if (qcow_write(bs, sector_num, buf, s->cluster_sectors) < 0)
+            goto out;
+    } else {
+        cluster_offset = get_cluster_offset(bs, sector_num << 9, 2,
+                                            out_len, 0, 0);
+        /* 0 means the allocation failed; writing at offset 0 would
+           overwrite the image header */
+        if (cluster_offset == 0)
+            goto out;
+        cluster_offset &= s->cluster_offset_mask;
+        if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len)
+            goto out;
+    }
+    result = 0;
+
+out:
+    qemu_free(out_buf);
+    return result;
+}
+
+/* Flush the underlying image file. */
+static void qcow_flush(BlockDriverState *bs)
+{
+    BDRVQcowState *state = bs->opaque;
+
+    bdrv_flush(state->hd);
+}
+
+/* Fill in bdi with the image's cluster size; always returns 0. */
+static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+    const BDRVQcowState *state = bs->opaque;
+
+    bdi->cluster_size = state->cluster_size;
+    return 0;
+}
+
+static BlockDriver bdrv_qcow = { /* driver table for the "qcow" (version 1) format */
+ .format_name = "qcow",
+ .instance_size = sizeof(BDRVQcowState), /* size of the per-device BDRVQcowState */
+ .bdrv_probe = qcow_probe,
+ .bdrv_open = qcow_open,
+ .bdrv_close = qcow_close,
+ .bdrv_create = qcow_create,
+ .bdrv_flush = qcow_flush,
+ .bdrv_is_allocated = qcow_is_allocated,
+ .bdrv_set_key = qcow_set_key,
+ .bdrv_make_empty = qcow_make_empty,
+ .bdrv_aio_readv = qcow_aio_readv, /* only AIO read/write entries are set; qcow_read is compiled out */
+ .bdrv_aio_writev = qcow_aio_writev,
+ .bdrv_aio_cancel = qcow_aio_cancel,
+ .aiocb_size = sizeof(QCowAIOCB), /* size of the per-request QCowAIOCB */
+ .bdrv_write_compressed = qcow_write_compressed,
+ .bdrv_get_info = qcow_get_info,
+};
+
+static void bdrv_qcow_init(void) /* registers the qcow driver table with bdrv_register() */
+{
+ bdrv_register(&bdrv_qcow);
+}
+
+block_init(bdrv_qcow_init); /* block_init() comes from outside this chunk (module.h is included) - presumably a module-init hook; confirm */
diff --git a/block/qcow2.c b/block/qcow2.c
new file mode 100644
index 0000000000..a6de9b6919
--- /dev/null
+++ b/block/qcow2.c
@@ -0,0 +1,2931 @@
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+#include <zlib.h>
+#include "aes.h"
+
+/*
+ Differences with QCOW:
+
+ - Support for multiple incremental snapshots.
+ - Memory management by reference counts.
+ - Clusters which have a reference count of one have the bit
+ QCOW_OFLAG_COPIED to optimize write performance.
+ - Size of compressed clusters is stored in sectors to reduce bit usage
+ in the cluster offsets.
+ - Support for storing additional data (such as the VM state) in the
+ snapshots.
+ - If a backing store is used, the cluster size is not constrained
+ (could be backported to QCOW).
+ - L2 tables always have a size of one cluster.
+*/
+
+//#define DEBUG_ALLOC
+//#define DEBUG_ALLOC2
+//#define DEBUG_EXT
+
+ /* File magic: qcow_probe() compares it with the big-endian 32-bit value
+  * found at the start of the image ('Q', 'F', 'I', 0xfb). */
+ #define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
+ #define QCOW_VERSION 2
+
+ /* values of the crypt_method header field */
+ #define QCOW_CRYPT_NONE 0
+ #define QCOW_CRYPT_AES 1
+
+ /* upper bound, in clusters, for one encrypted write; the cluster_data
+  * bounce buffer is sized from it */
+ #define QCOW_MAX_CRYPT_CLUSTERS 32
+
+ /*
+  * Flag bits stored in the top of L1/L2 entries. They are unsigned
+  * constants: shifting a signed 1LL into bit 63 is undefined behaviour,
+  * and the entries they are combined with are uint64_t anyway.
+  */
+ /* indicate that the refcount of the referenced cluster is exactly one. */
+ #define QCOW_OFLAG_COPIED (1ULL << 63)
+ /* indicate that the cluster is compressed (they never have the copied flag) */
+ #define QCOW_OFLAG_COMPRESSED (1ULL << 62)
+
+ #define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */
+
+ /*
+  * Image header as stored at offset 0 of the file. qcow_open() reads it
+  * with one bdrv_pread() and converts every field from big-endian to host
+  * byte order before use.
+  */
+ typedef struct QCowHeader {
+ uint32_t magic;
+ uint32_t version;
+ uint64_t backing_file_offset;
+ uint32_t backing_file_size;
+ uint32_t cluster_bits;
+ uint64_t size; /* in bytes */
+ uint32_t crypt_method;
+ uint32_t l1_size; /* XXX: save number of clusters instead ? */
+ uint64_t l1_table_offset;
+ uint64_t refcount_table_offset;
+ uint32_t refcount_table_clusters;
+ uint32_t nb_snapshots;
+ uint64_t snapshots_offset;
+ } QCowHeader;
+
+
+ /*
+  * Fixed part of one header extension record. Both fields are stored
+  * big-endian; qcow_read_extensions() expects len payload bytes after the
+  * record and then skips to the next record using len rounded up to a
+  * multiple of 8.
+  */
+ typedef struct {
+ uint32_t magic;
+ uint32_t len;
+ } QCowExtension;
+ /* magic 0 terminates the extension list */
+ #define QCOW_EXT_MAGIC_END 0
+ /* payload is the backing format name, copied into bs->backing_format */
+ #define QCOW_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
+
+
+ /*
+  * Packed fixed-size header of one snapshot record; per the trailing
+  * comments it is followed by extra data, the id string and the name.
+  * NOTE(review): presumably this is the on-disk layout parsed by
+  * qcow_read_snapshots(), which is outside this part of the file -- confirm.
+  */
+ typedef struct __attribute__((packed)) QCowSnapshotHeader {
+ /* header is 8 byte aligned */
+ uint64_t l1_table_offset;
+
+ uint32_t l1_size;
+ uint16_t id_str_size;
+ uint16_t name_size;
+
+ uint32_t date_sec;
+ uint32_t date_nsec;
+
+ uint64_t vm_clock_nsec;
+
+ uint32_t vm_state_size;
+ uint32_t extra_data_size; /* for extension */
+ /* extra data follows */
+ /* id_str follows */
+ /* name follows */
+ } QCowSnapshotHeader;
+
+ /* number of L2 tables kept in BDRVQcowState.l2_cache */
+ #define L2_CACHE_SIZE 16
+
+ /*
+  * Description of one snapshot, held in the BDRVQcowState.snapshots array.
+  * The field names mirror QCowSnapshotHeader, with the id and name kept as
+  * separate strings.
+  * NOTE(review): ownership of id_str/name is handled outside this part of
+  * the file (see qcow_free_snapshots) -- confirm before changing it.
+  */
+ typedef struct QCowSnapshot {
+ uint64_t l1_table_offset;
+ uint32_t l1_size;
+ char *id_str;
+ char *name;
+ uint32_t vm_state_size;
+ uint32_t date_sec;
+ uint32_t date_nsec;
+ uint64_t vm_clock_nsec;
+ } QCowSnapshot;
+
+ /*
+  * Per-image driver state (bs->opaque). Most geometry fields are derived
+  * from the header in qcow_open().
+  */
+ typedef struct BDRVQcowState {
+ /* file that contains the image, opened with bdrv_file_open() */
+ BlockDriverState *hd;
+ /* cluster_size == 1 << cluster_bits; cluster_sectors is the same size in
+  * 512-byte sectors */
+ int cluster_bits;
+ int cluster_size;
+ int cluster_sectors;
+ /* l2_size == 1 << l2_bits entries; l2_bits == cluster_bits - 3 so that an
+  * L2 table fills exactly one cluster */
+ int l2_bits;
+ int l2_size;
+ /* number of entries in l1_table */
+ int l1_size;
+ /* number of L1 entries needed to map header.size bytes */
+ int l1_vm_state_index;
+ /* position and mask of the sector-count field inside a compressed
+  * cluster's L2 entry */
+ int csize_shift;
+ int csize_mask;
+ /* mask selecting the file offset bits of a compressed L2 entry */
+ uint64_t cluster_offset_mask;
+ uint64_t l1_table_offset;
+ /* in-memory copy of the L1 table, in host byte order */
+ uint64_t *l1_table;
+ /* L2_CACHE_SIZE tables of l2_size entries each, kept in on-disk
+  * (big-endian) byte order */
+ uint64_t *l2_cache;
+ /* file offset of the table held in each cache slot, and its hit count */
+ uint64_t l2_cache_offsets[L2_CACHE_SIZE];
+ uint32_t l2_cache_counts[L2_CACHE_SIZE];
+ /* last decompressed cluster and the file offset it came from */
+ uint8_t *cluster_cache;
+ /* bounce buffer: QCOW_MAX_CRYPT_CLUSTERS clusters plus one sector */
+ uint8_t *cluster_data;
+ uint64_t cluster_cache_offset;
+
+ uint64_t *refcount_table;
+ uint64_t refcount_table_offset;
+ uint32_t refcount_table_size;
+ uint64_t refcount_block_cache_offset;
+ uint16_t *refcount_block_cache;
+ int64_t free_cluster_index;
+ int64_t free_byte_offset;
+
+ uint32_t crypt_method; /* current crypt method, 0 if no key yet */
+ uint32_t crypt_method_header;
+ AES_KEY aes_encrypt_key;
+ AES_KEY aes_decrypt_key;
+ uint64_t snapshots_offset;
+ int snapshots_size;
+ int nb_snapshots;
+ QCowSnapshot *snapshots;
+ } BDRVQcowState;
+
+ /*
+  * Forward declarations of helpers that are used before their definition
+  * (cluster decompression, synchronous read, snapshot handling and the
+  * reference-count / cluster allocator).
+  */
+ static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset);
+ static int qcow_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors);
+ static int qcow_read_snapshots(BlockDriverState *bs);
+ static void qcow_free_snapshots(BlockDriverState *bs);
+ static int refcount_init(BlockDriverState *bs);
+ static void refcount_close(BlockDriverState *bs);
+ static int get_refcount(BlockDriverState *bs, int64_t cluster_index);
+ static int update_cluster_refcount(BlockDriverState *bs,
+ int64_t cluster_index,
+ int addend);
+ static void update_refcount(BlockDriverState *bs,
+ int64_t offset, int64_t length,
+ int addend);
+ static int64_t alloc_clusters(BlockDriverState *bs, int64_t size);
+ static int64_t alloc_bytes(BlockDriverState *bs, int size);
+ static void free_clusters(BlockDriverState *bs,
+ int64_t offset, int64_t size);
+ static int check_refcounts(BlockDriverState *bs);
+
+ /*
+  * Format probe: score a buffer holding the first bytes of an image.
+  *
+  * Returns 100 when the buffer is large enough for a QCowHeader and both
+  * the magic and the version (stored big-endian) match qcow2, 0 otherwise.
+  * filename is not used.
+  */
+ static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
+ {
+     const QCowHeader *hdr = (const void *)buf;
+
+     if (buf_size < sizeof(QCowHeader))
+         return 0;
+     if (be32_to_cpu(hdr->magic) != QCOW_MAGIC)
+         return 0;
+     if (be32_to_cpu(hdr->version) != QCOW_VERSION)
+         return 0;
+
+     return 100;
+ }
+
+
+/*
+ * read qcow2 extension and fill bs
+ * start reading from start_offset
+ * finish reading upon magic of value 0 or when end_offset reached
+ * unknown magic is skipped (future extension this version knows nothing about)
+ * return 0 upon success, non-0 otherwise
+ */
+static int qcow_read_extensions(BlockDriverState *bs, uint64_t start_offset,
+ uint64_t end_offset)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowExtension ext;
+ uint64_t offset;
+
+#ifdef DEBUG_EXT
+ printf("qcow_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
+#endif
+ offset = start_offset;
+ while (offset < end_offset) {
+
+#ifdef DEBUG_EXT
+ /* Sanity check */
+ if (offset > s->cluster_size)
+ printf("qcow_handle_extension: suspicious offset %lu\n", offset);
+
+ printf("attemting to read extended header in offset %lu\n", offset);
+#endif
+
+ if (bdrv_pread(s->hd, offset, &ext, sizeof(ext)) != sizeof(ext)) {
+ fprintf(stderr, "qcow_handle_extension: ERROR: pread fail from offset %llu\n",
+ (unsigned long long)offset);
+ return 1;
+ }
+ be32_to_cpus(&ext.magic);
+ be32_to_cpus(&ext.len);
+ offset += sizeof(ext);
+#ifdef DEBUG_EXT
+ printf("ext.magic = 0x%x\n", ext.magic);
+#endif
+ switch (ext.magic) {
+ case QCOW_EXT_MAGIC_END:
+ return 0;
+
+ case QCOW_EXT_MAGIC_BACKING_FORMAT:
+ if (ext.len >= sizeof(bs->backing_format)) {
+ fprintf(stderr, "ERROR: ext_backing_format: len=%u too large"
+ " (>=%zu)\n",
+ ext.len, sizeof(bs->backing_format));
+ return 2;
+ }
+ if (bdrv_pread(s->hd, offset , bs->backing_format,
+ ext.len) != ext.len)
+ return 3;
+ bs->backing_format[ext.len] = '\0';
+#ifdef DEBUG_EXT
+ printf("Qcow2: Got format extension %s\n", bs->backing_format);
+#endif
+ offset += ((ext.len + 7) & ~7);
+ break;
+
+ default:
+ /* unknown magic -- just skip it */
+ offset += ((ext.len + 7) & ~7);
+ break;
+ }
+ }
+
+ return 0;
+}
+
+
+ /*
+  * Open a qcow2 image.
+  *
+  * Opens the image file, reads and validates the header, derives the
+  * cluster/L2 geometry, loads the L1 table, allocates the L2 cache and the
+  * cluster buffers, initialises the refcount code, then reads the header
+  * extensions, the backing file name and the snapshot table.
+  *
+  * bs        block device being opened; bs->opaque is the BDRVQcowState
+  * filename  path of the image file
+  * flags     BDRV_O_* open flags; BDRV_O_CACHE_DEF is replaced by
+  *           BDRV_O_CACHE_WB
+  *
+  * Returns 0 on success, the error of bdrv_file_open() if the file cannot
+  * be opened, or -1 on any later failure (everything acquired so far is
+  * released first).
+  */
+ static int qcow_open(BlockDriverState *bs, const char *filename, int flags)
+ {
+     BDRVQcowState *s = bs->opaque;
+     int len, i, shift, ret;
+     QCowHeader header;
+     uint64_t ext_end;
+
+     /* Performance is terrible right now with cache=writethrough due mainly
+      * to reference count updates.  If the user does not explicitly specify
+      * a caching type, force to writeback caching.
+      */
+     if ((flags & BDRV_O_CACHE_DEF)) {
+         flags |= BDRV_O_CACHE_WB;
+         flags &= ~BDRV_O_CACHE_DEF;
+     }
+     ret = bdrv_file_open(&s->hd, filename, flags);
+     if (ret < 0)
+         return ret;
+     if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header))
+         goto fail;
+     /* the header is stored big-endian */
+     be32_to_cpus(&header.magic);
+     be32_to_cpus(&header.version);
+     be64_to_cpus(&header.backing_file_offset);
+     be32_to_cpus(&header.backing_file_size);
+     be64_to_cpus(&header.size);
+     be32_to_cpus(&header.cluster_bits);
+     be32_to_cpus(&header.crypt_method);
+     be64_to_cpus(&header.l1_table_offset);
+     be32_to_cpus(&header.l1_size);
+     be64_to_cpus(&header.refcount_table_offset);
+     be32_to_cpus(&header.refcount_table_clusters);
+     be64_to_cpus(&header.snapshots_offset);
+     be32_to_cpus(&header.nb_snapshots);
+
+     if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION)
+         goto fail;
+     /* cluster sizes from 512 bytes to 64 KiB are accepted */
+     if (header.size <= 1 ||
+         header.cluster_bits < 9 ||
+         header.cluster_bits > 16)
+         goto fail;
+     if (header.crypt_method > QCOW_CRYPT_AES)
+         goto fail;
+     s->crypt_method_header = header.crypt_method;
+     if (s->crypt_method_header)
+         bs->encrypted = 1;
+     s->cluster_bits = header.cluster_bits;
+     s->cluster_size = 1 << s->cluster_bits;
+     s->cluster_sectors = 1 << (s->cluster_bits - 9);
+     s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
+     s->l2_size = 1 << s->l2_bits;
+     bs->total_sectors = header.size / 512;
+     s->csize_shift = (62 - (s->cluster_bits - 8));
+     s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
+     s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
+     s->refcount_table_offset = header.refcount_table_offset;
+     s->refcount_table_size =
+         header.refcount_table_clusters << (s->cluster_bits - 3);
+
+     s->snapshots_offset = header.snapshots_offset;
+     s->nb_snapshots = header.nb_snapshots;
+
+     /* read the level 1 table */
+     s->l1_size = header.l1_size;
+     shift = s->cluster_bits + s->l2_bits;
+     s->l1_vm_state_index = (header.size + (1LL << shift) - 1) >> shift;
+     /* the L1 table must contain at least enough entries to put
+        header.size bytes */
+     if (s->l1_size < s->l1_vm_state_index)
+         goto fail;
+     s->l1_table_offset = header.l1_table_offset;
+     s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
+     if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
+         s->l1_size * sizeof(uint64_t))
+         goto fail;
+     for(i = 0;i < s->l1_size; i++) {
+         be64_to_cpus(&s->l1_table[i]);
+     }
+     /* alloc L2 cache */
+     s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+     s->cluster_cache = qemu_malloc(s->cluster_size);
+     /* one more sector for decompressed data alignment */
+     s->cluster_data = qemu_malloc(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
+                                   + 512);
+     s->cluster_cache_offset = -1;
+
+     if (refcount_init(bs) < 0)
+         goto fail;
+
+     /* read qcow2 extensions: they sit between the header and either the
+        backing file name or the end of the first cluster */
+     if (header.backing_file_offset)
+         ext_end = header.backing_file_offset;
+     else
+         ext_end = s->cluster_size;
+     if (qcow_read_extensions(bs, sizeof(header), ext_end))
+         goto fail;
+
+     /* read the backing file name */
+     if (header.backing_file_offset != 0) {
+         /* Clamp on the unsigned header field: converting it to int first
+            would turn sizes >= 2^31 into a negative length that slips
+            past the limit and is then used as read size and index. */
+         if (header.backing_file_size > 1023)
+             len = 1023;
+         else
+             len = header.backing_file_size;
+         if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len)
+             goto fail;
+         bs->backing_file[len] = '\0';
+     }
+     if (qcow_read_snapshots(bs) < 0)
+         goto fail;
+
+ #ifdef DEBUG_ALLOC
+     check_refcounts(bs);
+ #endif
+     return 0;
+
+  fail:
+     qcow_free_snapshots(bs);
+     refcount_close(bs);
+     qemu_free(s->l1_table);
+     qemu_free(s->l2_cache);
+     qemu_free(s->cluster_cache);
+     qemu_free(s->cluster_data);
+     bdrv_delete(s->hd);
+     return -1;
+ }
+
+ /*
+  * Install the AES key of an encrypted image.
+  *
+  * The first 16 bytes of the key string, zero padded when it is shorter,
+  * are used directly as a 128-bit AES key. The header's crypt method
+  * becomes the active one before the key schedules are computed.
+  *
+  * Returns 0 on success, -1 if either key schedule cannot be set up.
+  */
+ static int qcow_set_key(BlockDriverState *bs, const char *key)
+ {
+ BDRVQcowState *s = bs->opaque;
+ uint8_t keybuf[16];
+ int len, i;
+
+ memset(keybuf, 0, 16);
+ len = strlen(key);
+ /* longer keys are silently truncated to 16 bytes */
+ if (len > 16)
+ len = 16;
+ /* XXX: we could compress the chars to 7 bits to increase
+ entropy */
+ for(i = 0;i < len;i++) {
+ keybuf[i] = key[i];
+ }
+ s->crypt_method = s->crypt_method_header;
+
+ if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
+ return -1;
+ if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
+ return -1;
+ #if 0
+ /* test */
+ {
+ uint8_t in[16];
+ uint8_t out[16];
+ uint8_t tmp[16];
+ for(i=0;i<16;i++)
+ in[i] = i;
+ AES_encrypt(in, tmp, &s->aes_encrypt_key);
+ AES_decrypt(tmp, out, &s->aes_decrypt_key);
+ for(i = 0; i < 16; i++)
+ printf(" %02x", tmp[i]);
+ printf("\n");
+ for(i = 0; i < 16; i++)
+ printf(" %02x", out[i]);
+ printf("\n");
+ }
+ #endif
+ return 0;
+ }
+
+ /*
+  * Encrypt (enc != 0) or decrypt (enc == 0) nb_sectors 512-byte sectors
+  * from in_buf into out_buf with AES-CBC, one sector at a time. The IV of
+  * each sector is its sector number stored little-endian in the first
+  * eight bytes, followed by eight zero bytes.
+  *
+  * The crypt function is compatible with the linux cryptoloop algorithm
+  * for < 4 GB images. NOTE: out_buf == in_buf is supported
+  */
+ static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
+                             uint8_t *out_buf, const uint8_t *in_buf,
+                             int nb_sectors, int enc,
+                             const AES_KEY *key)
+ {
+     union {
+         uint64_t words[2];
+         uint8_t bytes[16];
+     } iv;
+     int done;
+
+     for (done = 0; done < nb_sectors; done++) {
+         int pos = done * 512;
+
+         /* the IV is rebuilt for every sector */
+         iv.words[0] = cpu_to_le64(sector_num + done);
+         iv.words[1] = 0;
+         AES_cbc_encrypt(in_buf + pos, out_buf + pos, 512, key,
+                         iv.bytes, enc);
+     }
+ }
+
+ /*
+  * Copy the guest sectors [n_start, n_end) of the cluster that starts at
+  * guest sector start_sect into the cluster at file offset cluster_offset.
+  * The data is read through qcow_read(), encrypted when a key is active,
+  * and written at the same sector position inside the target cluster.
+  *
+  * Returns 0 on success (also for an empty range) or the negative result
+  * of the failing read or write.
+  */
+ static int copy_sectors(BlockDriverState *bs, uint64_t start_sect,
+                         uint64_t cluster_offset, int n_start, int n_end)
+ {
+     BDRVQcowState *s = bs->opaque;
+     int count = n_end - n_start;
+     int rc;
+
+     if (count <= 0)
+         return 0;
+
+     rc = qcow_read(bs, start_sect + n_start, s->cluster_data, count);
+     if (rc < 0)
+         return rc;
+
+     if (s->crypt_method) {
+         /* in-place encryption of the bounce buffer */
+         encrypt_sectors(s, start_sect + n_start,
+                         s->cluster_data, s->cluster_data,
+                         count, 1, &s->aes_encrypt_key);
+     }
+
+     rc = bdrv_write(s->hd, (cluster_offset >> 9) + n_start,
+                     s->cluster_data, count);
+     return rc < 0 ? rc : 0;
+ }
+
+ /* Empty the L2 cache: zero the cached tables, their offsets and hit counts. */
+ static void l2_cache_reset(BlockDriverState *bs)
+ {
+     BDRVQcowState *s = bs->opaque;
+     size_t tables_bytes = s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t);
+
+     memset(s->l2_cache, 0, tables_bytes);
+     memset(s->l2_cache_offsets, 0, sizeof(s->l2_cache_offsets));
+     memset(s->l2_cache_counts, 0, sizeof(s->l2_cache_counts));
+ }
+
+ /*
+  * Choose the L2 cache slot to (re)use: the first slot with the lowest hit
+  * count. Slot 0 is returned when no count is below 0xffffffff.
+  */
+ static inline int l2_cache_new_entry(BlockDriverState *bs)
+ {
+     BDRVQcowState *s = bs->opaque;
+     uint32_t lowest = 0xffffffff;
+     int victim = 0;
+     int slot;
+
+     for (slot = 0; slot < L2_CACHE_SIZE; slot++) {
+         uint32_t hits = s->l2_cache_counts[slot];
+
+         if (hits >= lowest)
+             continue;
+         lowest = hits;
+         victim = slot;
+     }
+
+     return victim;
+ }
+
+ /* Round offset up to the next multiple of n; the mask arithmetic relies on
+  * n being a power of two. */
+ static int64_t align_offset(int64_t offset, int n)
+ {
+     const int64_t low_bits = n - 1;
+
+     return (offset + low_bits) & ~low_bits;
+ }
+
+ /*
+  * Grow the L1 table so that it has at least min_size entries.
+  *
+  * The size is multiplied by roughly 1.5 until it is large enough. A new
+  * table is allocated in the image, the current entries are written to it
+  * in big-endian order, and the header fields l1_size and l1_table_offset
+  * are then updated with a single 12-byte write. Only after that are the
+  * old table and its clusters released.
+  *
+  * Returns 0 on success (or when the table is already large enough) and
+  * -EIO when a write fails. On failure the current table in s->l1_table
+  * stays valid and in use; only the new table and the clusters allocated
+  * for it are released.
+  */
+ static int grow_l1_table(BlockDriverState *bs, int min_size)
+ {
+     BDRVQcowState *s = bs->opaque;
+     int new_l1_size, new_l1_size2, ret, i;
+     uint64_t *new_l1_table;
+     uint64_t new_l1_table_offset;
+     uint8_t data[12];
+
+     new_l1_size = s->l1_size;
+     if (min_size <= new_l1_size)
+         return 0;
+     while (min_size > new_l1_size) {
+         new_l1_size = (new_l1_size * 3 + 1) / 2;
+     }
+ #ifdef DEBUG_ALLOC2
+     printf("grow l1_table from %d to %d\n", s->l1_size, new_l1_size);
+ #endif
+
+     new_l1_size2 = sizeof(uint64_t) * new_l1_size;
+     new_l1_table = qemu_mallocz(new_l1_size2);
+     memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
+
+     /* write new table (align to cluster) */
+     new_l1_table_offset = alloc_clusters(bs, new_l1_size2);
+
+     /* convert to on-disk byte order for the write, then back again */
+     for(i = 0; i < s->l1_size; i++)
+         new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
+     ret = bdrv_pwrite(s->hd, new_l1_table_offset, new_l1_table, new_l1_size2);
+     if (ret != new_l1_size2)
+         goto fail;
+     for(i = 0; i < s->l1_size; i++)
+         new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
+
+     /* set new table: l1_size and l1_table_offset are adjacent in the
+        header, so one write covers both */
+     cpu_to_be32w((uint32_t*)data, new_l1_size);
+     cpu_to_be64w((uint64_t*)(data + 4), new_l1_table_offset);
+     if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_size), data,
+                     sizeof(data)) != sizeof(data))
+         goto fail;
+     qemu_free(s->l1_table);
+     free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t));
+     s->l1_table_offset = new_l1_table_offset;
+     s->l1_table = new_l1_table;
+     s->l1_size = new_l1_size;
+     return 0;
+  fail:
+     /* Drop the table that was never installed. s->l1_table is still the
+        live table and must not be freed here. */
+     qemu_free(new_l1_table);
+     free_clusters(bs, new_l1_table_offset, new_l1_size2);
+     return -EIO;
+ }
+
+/*
+ * seek_l2_table
+ *
+ * seek l2_offset in the l2_cache table
+ * if not found, return NULL,
+ * if found,
+ * increments the l2 cache hit count of the entry,
+ * if counter overflow, divide by two all counters
+ * return the pointer to the l2 cache entry
+ *
+ */
+
+static uint64_t *seek_l2_table(BDRVQcowState *s, uint64_t l2_offset)
+{
+ int i, j;
+
+ for(i = 0; i < L2_CACHE_SIZE; i++) {
+ if (l2_offset == s->l2_cache_offsets[i]) {
+ /* increment the hit count */
+ if (++s->l2_cache_counts[i] == 0xffffffff) {
+ for(j = 0; j < L2_CACHE_SIZE; j++) {
+ s->l2_cache_counts[j] >>= 1;
+ }
+ }
+ return s->l2_cache + (i << s->l2_bits);
+ }
+ }
+ return NULL;
+}
+
+/*
+ * l2_load
+ *
+ * Loads a L2 table into memory. If the table is in the cache, the cache
+ * is used; otherwise the L2 table is loaded from the image file.
+ *
+ * Returns a pointer to the L2 table on success, or NULL if the read from
+ * the image file failed.
+ */
+
+static uint64_t *l2_load(BlockDriverState *bs, uint64_t l2_offset)
+{
+ BDRVQcowState *s = bs->opaque;
+ int min_index;
+ uint64_t *l2_table;
+
+ /* seek if the table for the given offset is in the cache */
+
+ l2_table = seek_l2_table(s, l2_offset);
+ if (l2_table != NULL)
+ return l2_table;
+
+ /* not found: load a new entry in the least used one */
+
+ min_index = l2_cache_new_entry(bs);
+ l2_table = s->l2_cache + (min_index << s->l2_bits);
+ if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
+ s->l2_size * sizeof(uint64_t))
+ return NULL;
+ s->l2_cache_offsets[min_index] = l2_offset;
+ s->l2_cache_counts[min_index] = 1;
+
+ return l2_table;
+}
+
+/*
+ * l2_allocate
+ *
+ * Allocate a new l2 entry in the file. If l1_index points to an already
+ * used entry in the L2 table (i.e. we are doing a copy on write for the L2
+ * table) copy the contents of the old L2 table into the newly allocated one.
+ * Otherwise the new table is initialized with zeros.
+ *
+ */
+
+static uint64_t *l2_allocate(BlockDriverState *bs, int l1_index)
+{
+ BDRVQcowState *s = bs->opaque;
+ int min_index;
+ uint64_t old_l2_offset, tmp;
+ uint64_t *l2_table, l2_offset;
+
+ old_l2_offset = s->l1_table[l1_index];
+
+ /* allocate a new l2 entry */
+
+ l2_offset = alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
+
+ /* update the L1 entry */
+
+ s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
+
+ tmp = cpu_to_be64(l2_offset | QCOW_OFLAG_COPIED);
+ if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp),
+ &tmp, sizeof(tmp)) != sizeof(tmp))
+ return NULL;
+
+ /* allocate a new entry in the l2 cache */
+
+ min_index = l2_cache_new_entry(bs);
+ l2_table = s->l2_cache + (min_index << s->l2_bits);
+
+ if (old_l2_offset == 0) {
+ /* if there was no old l2 table, clear the new table */
+ memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
+ } else {
+ /* if there was an old l2 table, read it from the disk */
+ if (bdrv_pread(s->hd, old_l2_offset,
+ l2_table, s->l2_size * sizeof(uint64_t)) !=
+ s->l2_size * sizeof(uint64_t))
+ return NULL;
+ }
+ /* write the l2 table to the file */
+ if (bdrv_pwrite(s->hd, l2_offset,
+ l2_table, s->l2_size * sizeof(uint64_t)) !=
+ s->l2_size * sizeof(uint64_t))
+ return NULL;
+
+ /* update the l2 cache entry */
+
+ s->l2_cache_offsets[min_index] = l2_offset;
+ s->l2_cache_counts[min_index] = 1;
+
+ return l2_table;
+}
+
+ /* Number of clusters needed to hold size bytes (rounded up). */
+ static int size_to_clusters(BDRVQcowState *s, int64_t size)
+ {
+     int64_t rounded = size + (s->cluster_size - 1);
+
+     return rounded >> s->cluster_bits;
+ }
+
+ /*
+  * Count how many L2 entries, starting at index start, continue the run of
+  * physically contiguous clusters that begins at l2_table[0].
+  *
+  * Entry i belongs to the run when, with the bits in mask cleared, it
+  * equals the (masked) value of l2_table[0] plus i * cluster_size. At most
+  * nb_clusters entries are examined. Entries are stored big-endian.
+  *
+  * Returns the number of matching entries, or 0 when the masked
+  * l2_table[0] is zero.
+  */
+ static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size,
+ uint64_t *l2_table, uint64_t start, uint64_t mask)
+ {
+ int i;
+ uint64_t offset = be64_to_cpu(l2_table[0]) & ~mask;
+
+ if (!offset)
+ return 0;
+
+ for (i = start; i < start + nb_clusters; i++)
+ if (offset + i * cluster_size != (be64_to_cpu(l2_table[i]) & ~mask))
+ break;
+
+ return (i - start);
+ }
+
+ /*
+  * Count the leading zero (unallocated) entries of l2_table, looking at no
+  * more than nb_clusters entries.
+  */
+ static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table)
+ {
+     int run;
+
+     for (run = 0; nb_clusters != 0 && l2_table[run] == 0; run++)
+         nb_clusters--;
+
+     return run;
+ }
+
+/*
+ * get_cluster_offset
+ *
+ * For a given offset of the disk image, return cluster offset in
+ * qcow2 file.
+ *
+ * on entry, *num is the number of contiguous clusters we'd like to
+ * access following offset.
+ *
+ * on exit, *num is the number of contiguous clusters we can read.
+ *
+ * Return 1, if the offset is found
+ * Return 0, otherwise.
+ *
+ */
+
+static uint64_t get_cluster_offset(BlockDriverState *bs,
+ uint64_t offset, int *num)
+{
+ BDRVQcowState *s = bs->opaque;
+ int l1_index, l2_index;
+ uint64_t l2_offset, *l2_table, cluster_offset;
+ int l1_bits, c;
+ int index_in_cluster, nb_available, nb_needed, nb_clusters;
+
+ index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
+ nb_needed = *num + index_in_cluster;
+
+ l1_bits = s->l2_bits + s->cluster_bits;
+
+ /* compute how many bytes there are between the offset and
+ * the end of the l1 entry
+ */
+
+ nb_available = (1 << l1_bits) - (offset & ((1 << l1_bits) - 1));
+
+ /* compute the number of available sectors */
+
+ nb_available = (nb_available >> 9) + index_in_cluster;
+
+ if (nb_needed > nb_available) {
+ nb_needed = nb_available;
+ }
+
+ cluster_offset = 0;
+
+ /* seek the the l2 offset in the l1 table */
+
+ l1_index = offset >> l1_bits;
+ if (l1_index >= s->l1_size)
+ goto out;
+
+ l2_offset = s->l1_table[l1_index];
+
+ /* seek the l2 table of the given l2 offset */
+
+ if (!l2_offset)
+ goto out;
+
+ /* load the l2 table in memory */
+
+ l2_offset &= ~QCOW_OFLAG_COPIED;
+ l2_table = l2_load(bs, l2_offset);
+ if (l2_table == NULL)
+ return 0;
+
+ /* find the cluster offset for the given disk offset */
+
+ l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+ cluster_offset = be64_to_cpu(l2_table[l2_index]);
+ nb_clusters = size_to_clusters(s, nb_needed << 9);
+
+ if (!cluster_offset) {
+ /* how many empty clusters ? */
+ c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]);
+ } else {
+ /* how many allocated clusters ? */
+ c = count_contiguous_clusters(nb_clusters, s->cluster_size,
+ &l2_table[l2_index], 0, QCOW_OFLAG_COPIED);
+ }
+
+ nb_available = (c * s->cluster_sectors);
+out:
+ if (nb_available > nb_needed)
+ nb_available = nb_needed;
+
+ *num = nb_available - index_in_cluster;
+
+ return cluster_offset & ~QCOW_OFLAG_COPIED;
+}
+
+/*
+ * free_any_clusters
+ *
+ * free clusters according to its type: compressed or not
+ *
+ */
+
+static void free_any_clusters(BlockDriverState *bs,
+ uint64_t cluster_offset, int nb_clusters)
+{
+ BDRVQcowState *s = bs->opaque;
+
+ /* free the cluster */
+
+ if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
+ int nb_csectors;
+ nb_csectors = ((cluster_offset >> s->csize_shift) &
+ s->csize_mask) + 1;
+ free_clusters(bs, (cluster_offset & s->cluster_offset_mask) & ~511,
+ nb_csectors * 512);
+ return;
+ }
+
+ free_clusters(bs, cluster_offset, nb_clusters << s->cluster_bits);
+
+ return;
+}
+
+/*
+ * get_cluster_table
+ *
+ * for a given disk offset, load (and allocate if needed)
+ * the l2 table.
+ *
+ * the l2 table offset in the qcow2 file and the cluster index
+ * in the l2 table are given to the caller.
+ *
+ */
+
+static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
+ uint64_t **new_l2_table,
+ uint64_t *new_l2_offset,
+ int *new_l2_index)
+{
+ BDRVQcowState *s = bs->opaque;
+ int l1_index, l2_index, ret;
+ uint64_t l2_offset, *l2_table;
+
+ /* seek the the l2 offset in the l1 table */
+
+ l1_index = offset >> (s->l2_bits + s->cluster_bits);
+ if (l1_index >= s->l1_size) {
+ ret = grow_l1_table(bs, l1_index + 1);
+ if (ret < 0)
+ return 0;
+ }
+ l2_offset = s->l1_table[l1_index];
+
+ /* seek the l2 table of the given l2 offset */
+
+ if (l2_offset & QCOW_OFLAG_COPIED) {
+ /* load the l2 table in memory */
+ l2_offset &= ~QCOW_OFLAG_COPIED;
+ l2_table = l2_load(bs, l2_offset);
+ if (l2_table == NULL)
+ return 0;
+ } else {
+ if (l2_offset)
+ free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t));
+ l2_table = l2_allocate(bs, l1_index);
+ if (l2_table == NULL)
+ return 0;
+ l2_offset = s->l1_table[l1_index] & ~QCOW_OFLAG_COPIED;
+ }
+
+ /* find the cluster offset for the given disk offset */
+
+ l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+
+ *new_l2_table = l2_table;
+ *new_l2_offset = l2_offset;
+ *new_l2_index = l2_index;
+
+ return 1;
+}
+
+/*
+ * alloc_compressed_cluster_offset
+ *
+ * For a given offset of the disk image, return cluster offset in
+ * qcow2 file.
+ *
+ * If the offset is not found, allocate a new compressed cluster.
+ *
+ * Return the cluster offset if successful,
+ * Return 0, otherwise.
+ *
+ */
+
+static uint64_t alloc_compressed_cluster_offset(BlockDriverState *bs,
+ uint64_t offset,
+ int compressed_size)
+{
+ BDRVQcowState *s = bs->opaque;
+ int l2_index, ret;
+ uint64_t l2_offset, *l2_table, cluster_offset;
+ int nb_csectors;
+
+ ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
+ if (ret == 0)
+ return 0;
+
+ cluster_offset = be64_to_cpu(l2_table[l2_index]);
+ if (cluster_offset & QCOW_OFLAG_COPIED)
+ return cluster_offset & ~QCOW_OFLAG_COPIED;
+
+ if (cluster_offset)
+ free_any_clusters(bs, cluster_offset, 1);
+
+ cluster_offset = alloc_bytes(bs, compressed_size);
+ nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
+ (cluster_offset >> 9);
+
+ cluster_offset |= QCOW_OFLAG_COMPRESSED |
+ ((uint64_t)nb_csectors << s->csize_shift);
+
+ /* update L2 table */
+
+ /* compressed clusters never have the copied flag */
+
+ l2_table[l2_index] = cpu_to_be64(cluster_offset);
+ if (bdrv_pwrite(s->hd,
+ l2_offset + l2_index * sizeof(uint64_t),
+ l2_table + l2_index,
+ sizeof(uint64_t)) != sizeof(uint64_t))
+ return 0;
+
+ return cluster_offset;
+}
+
+ /*
+  * Allocation details produced by alloc_cluster_offset() and consumed by
+  * alloc_cluster_link_l2() once the data has been written.
+  */
+ typedef struct QCowL2Meta
+ {
+ /* guest offset (in bytes) of the write */
+ uint64_t offset;
+ /* first sector of the write inside its first cluster */
+ int n_start;
+ /* sectors covered, counted from the start of the first cluster */
+ int nb_available;
+ /* newly allocated clusters; 0 means there is nothing to link */
+ int nb_clusters;
+ } QCowL2Meta;
+
+ /*
+  * Finish an allocating write: fill the parts of the new clusters that the
+  * write did not cover and make the L2 table point to them.
+  *
+  * cluster_offset  file offset of the first newly allocated cluster
+  * m               allocation details from alloc_cluster_offset()
+  *
+  * Returns 0 on success (also when m->nb_clusters is 0), the negative
+  * result of copy_sectors() if copying fails, or -EIO if the L2 table
+  * cannot be loaded or written.
+  */
+ static int alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset,
+ QCowL2Meta *m)
+ {
+ BDRVQcowState *s = bs->opaque;
+ int i, j = 0, l2_index, ret;
+ uint64_t *old_cluster, start_sect, l2_offset, *l2_table;
+
+ if (m->nb_clusters == 0)
+ return 0;
+
+ /* L2 entries (still big-endian) that get replaced below */
+ old_cluster = qemu_malloc(m->nb_clusters * sizeof(uint64_t));
+
+ /* copy content of unmodified sectors */
+ start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9;
+ if (m->n_start) {
+ /* head of the first cluster, before the written range */
+ ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start);
+ if (ret < 0)
+ goto err;
+ }
+
+ if (m->nb_available & (s->cluster_sectors - 1)) {
+ /* tail of the last cluster, after the written range */
+ uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1);
+ ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9),
+ m->nb_available - end, s->cluster_sectors);
+ if (ret < 0)
+ goto err;
+ }
+
+ ret = -EIO;
+ /* update L2 table */
+ if (!get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index))
+ goto err;
+
+ for (i = 0; i < m->nb_clusters; i++) {
+ /* if two concurrent writes happen to the same unallocated cluster
+ * each write allocates separate cluster and writes data concurrently.
+ * The first one to complete updates l2 table with pointer to its
+ * cluster the second one has to do RMW (which is done above by
+ * copy_sectors()), update l2 table with its cluster pointer and free
+ * old cluster. This is what this loop does */
+ if(l2_table[l2_index + i] != 0)
+ old_cluster[j++] = l2_table[l2_index + i];
+
+ l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
+ (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
+ }
+
+ if (bdrv_pwrite(s->hd, l2_offset + l2_index * sizeof(uint64_t),
+ l2_table + l2_index, m->nb_clusters * sizeof(uint64_t)) !=
+ m->nb_clusters * sizeof(uint64_t))
+ goto err;
+
+ /* the replaced clusters are released only after the new L2 entries
+ have been written */
+ for (i = 0; i < j; i++)
+ free_any_clusters(bs, be64_to_cpu(old_cluster[i]) & ~QCOW_OFLAG_COPIED,
+ 1);
+
+ ret = 0;
+ err:
+ qemu_free(old_cluster);
+ return ret;
+ }
+
+/*
+ * alloc_cluster_offset
+ *
+ * For a given offset of the disk image, return cluster offset in
+ * qcow2 file.
+ *
+ * If the offset is not found, allocate a new cluster.
+ *
+ * Return the cluster offset if successful,
+ * Return 0, otherwise.
+ *
+ */
+
+static uint64_t alloc_cluster_offset(BlockDriverState *bs,
+ uint64_t offset,
+ int n_start, int n_end,
+ int *num, QCowL2Meta *m)
+{
+ BDRVQcowState *s = bs->opaque;
+ int l2_index, ret;
+ uint64_t l2_offset, *l2_table, cluster_offset;
+ int nb_clusters, i = 0;
+
+ ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
+ if (ret == 0)
+ return 0;
+
+ nb_clusters = size_to_clusters(s, n_end << 9);
+
+ nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+
+ cluster_offset = be64_to_cpu(l2_table[l2_index]);
+
+ /* We keep all QCOW_OFLAG_COPIED clusters */
+
+ if (cluster_offset & QCOW_OFLAG_COPIED) {
+ nb_clusters = count_contiguous_clusters(nb_clusters, s->cluster_size,
+ &l2_table[l2_index], 0, 0);
+
+ cluster_offset &= ~QCOW_OFLAG_COPIED;
+ m->nb_clusters = 0;
+
+ goto out;
+ }
+
+ /* for the moment, multiple compressed clusters are not managed */
+
+ if (cluster_offset & QCOW_OFLAG_COMPRESSED)
+ nb_clusters = 1;
+
+ /* how many available clusters ? */
+
+ while (i < nb_clusters) {
+ i += count_contiguous_clusters(nb_clusters - i, s->cluster_size,
+ &l2_table[l2_index], i, 0);
+
+ if(be64_to_cpu(l2_table[l2_index + i]))
+ break;
+
+ i += count_contiguous_free_clusters(nb_clusters - i,
+ &l2_table[l2_index + i]);
+
+ cluster_offset = be64_to_cpu(l2_table[l2_index + i]);
+
+ if ((cluster_offset & QCOW_OFLAG_COPIED) ||
+ (cluster_offset & QCOW_OFLAG_COMPRESSED))
+ break;
+ }
+ nb_clusters = i;
+
+ /* allocate a new cluster */
+
+ cluster_offset = alloc_clusters(bs, nb_clusters * s->cluster_size);
+
+ /* save info needed for meta data update */
+ m->offset = offset;
+ m->n_start = n_start;
+ m->nb_clusters = nb_clusters;
+
+out:
+ m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);
+
+ *num = m->nb_available - n_start;
+
+ return cluster_offset;
+}
+
+ /*
+  * Allocation query: report whether the cluster holding sector_num is
+  * allocated in this image.
+  *
+  * *pnum receives the number of sectors, out of the nb_sectors asked for,
+  * as computed by get_cluster_offset(). Returns 1 if allocated, 0 if not.
+  */
+ static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
+                              int nb_sectors, int *pnum)
+ {
+     uint64_t mapping;
+
+     *pnum = nb_sectors;
+     mapping = get_cluster_offset(bs, sector_num << 9, pnum);
+     if (mapping == 0)
+         return 0;
+
+     return 1;
+ }
+
+ /*
+  * Inflate a raw deflate stream (window bits -12) from buf into out_buf.
+  *
+  * Succeeds only when inflate() reports Z_STREAM_END or Z_BUF_ERROR and
+  * exactly out_buf_size bytes were produced.
+  *
+  * Returns 0 on success, -1 on failure.
+  */
+ static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
+                              const uint8_t *buf, int buf_size)
+ {
+     z_stream zs;
+     int status, produced;
+
+     memset(&zs, 0, sizeof(zs));
+
+     zs.next_in = (uint8_t *)buf;
+     zs.avail_in = buf_size;
+     zs.next_out = out_buf;
+     zs.avail_out = out_buf_size;
+
+     if (inflateInit2(&zs, -12) != Z_OK)
+         return -1;
+
+     status = inflate(&zs, Z_FINISH);
+     produced = zs.next_out - out_buf;
+     inflateEnd(&zs);
+
+     if (status != Z_STREAM_END && status != Z_BUF_ERROR)
+         return -1;
+     if (produced != out_buf_size)
+         return -1;
+
+     return 0;
+ }
+
+ /*
+  * Make s->cluster_cache hold the decompressed contents of the compressed
+  * cluster described by the L2 entry cluster_offset.
+  *
+  * Nothing is done when the cache already holds that cluster. Otherwise
+  * the sectors containing the compressed data are read into
+  * s->cluster_data and inflated into the cache.
+  *
+  * Returns 0 on success, -1 if reading or decompressing fails.
+  */
+ static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset)
+ {
+     uint64_t coffset = cluster_offset & s->cluster_offset_mask;
+     int nb_csectors, sector_offset, csize;
+
+     if (s->cluster_cache_offset == coffset)
+         return 0;
+
+     /* the entry stores the number of sectors minus one */
+     nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
+     /* the data may start in the middle of its first sector */
+     sector_offset = coffset & 511;
+     csize = nb_csectors * 512 - sector_offset;
+
+     if (bdrv_read(s->hd, coffset >> 9, s->cluster_data, nb_csectors) < 0)
+         return -1;
+     if (decompress_buffer(s->cluster_cache, s->cluster_size,
+                           s->cluster_data + sector_offset, csize) < 0)
+         return -1;
+
+     s->cluster_cache_offset = coffset;
+     return 0;
+ }
+
+ /*
+  * handle reading after the end of the backing file
+  *
+  * Returns how many of the nb_sectors sectors starting at sector_num lie
+  * inside bs. When the request reaches past the end of bs, the part of buf
+  * that corresponds to the sectors beyond the end is zero filled.
+  */
+ static int backing_read1(BlockDriverState *bs,
+                          int64_t sector_num, uint8_t *buf, int nb_sectors)
+ {
+     int inside;
+
+     if ((sector_num + nb_sectors) <= bs->total_sectors)
+         return nb_sectors;
+
+     inside = (sector_num >= bs->total_sectors)
+         ? 0
+         : bs->total_sectors - sector_num;
+     memset(buf + inside * 512, 0, 512 * (nb_sectors - inside));
+
+     return inside;
+ }
+
+/*
+ * Synchronous read of nb_sectors sectors starting at sector_num into buf.
+ * The request is processed in chunks whose length is decided by
+ * get_cluster_offset().  Each chunk is served from the backing file (or
+ * zero-filled), from the decompressed-cluster cache, or from the image
+ * file itself, decrypting when s->crypt_method is set.
+ * Returns 0 on success, -1 on any failure.
+ */
+static int qcow_read(BlockDriverState *bs, int64_t sector_num,
+                     uint8_t *buf, int nb_sectors)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t cluster_offset;
+    int chunk, from_backing, index_in_cluster;
+
+    for (; nb_sectors > 0;
+         nb_sectors -= chunk, sector_num += chunk, buf += chunk * 512) {
+        chunk = nb_sectors;
+        cluster_offset = get_cluster_offset(bs, sector_num << 9, &chunk);
+        index_in_cluster = sector_num & (s->cluster_sectors - 1);
+
+        if (cluster_offset == 0) {
+            /* unallocated in this image */
+            if (!bs->backing_hd) {
+                memset(buf, 0, 512 * chunk);
+                continue;
+            }
+            /* read from the base image */
+            from_backing = backing_read1(bs->backing_hd, sector_num,
+                                         buf, chunk);
+            if (from_backing > 0 &&
+                bdrv_read(bs->backing_hd, sector_num, buf, from_backing) < 0)
+                return -1;
+            continue;
+        }
+
+        if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
+            if (decompress_cluster(s, cluster_offset) < 0)
+                return -1;
+            memcpy(buf, s->cluster_cache + index_in_cluster * 512,
+                   512 * chunk);
+            continue;
+        }
+
+        if (bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512,
+                       buf, chunk * 512) != chunk * 512)
+            return -1;
+        if (s->crypt_method) {
+            encrypt_sectors(s, sector_num, buf, buf, chunk, 0,
+                            &s->aes_decrypt_key);
+        }
+    }
+    return 0;
+}
+
+/*
+ * Synchronous write of nb_sectors sectors from buf starting at
+ * sector_num.  For every chunk a cluster range is obtained from
+ * alloc_cluster_offset(), the data is written (encrypted through
+ * s->cluster_data when s->crypt_method is set) and the range is linked
+ * with alloc_cluster_link_l2().
+ *
+ * Returns 0 on success, -1 on failure.  The compressed-cluster cache is
+ * invalidated on every exit path: when a later chunk fails, earlier
+ * chunks have already been written, so the cache must not stay valid.
+ */
+static int qcow_write(BlockDriverState *bs, int64_t sector_num,
+                      const uint8_t *buf, int nb_sectors)
+{
+    BDRVQcowState *s = bs->opaque;
+    int ret, index_in_cluster, n;
+    uint64_t cluster_offset;
+    int n_end;
+    int result = 0;
+    QCowL2Meta l2meta;
+
+    while (nb_sectors > 0) {
+        index_in_cluster = sector_num & (s->cluster_sectors - 1);
+        n_end = index_in_cluster + nb_sectors;
+        /* encrypted data goes through s->cluster_data, so cap the chunk */
+        if (s->crypt_method &&
+            n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
+            n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
+        cluster_offset = alloc_cluster_offset(bs, sector_num << 9,
+                                              index_in_cluster,
+                                              n_end, &n, &l2meta);
+        if (!cluster_offset) {
+            result = -1;
+            break;
+        }
+        if (s->crypt_method) {
+            encrypt_sectors(s, sector_num, s->cluster_data, buf, n, 1,
+                            &s->aes_encrypt_key);
+            ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512,
+                              s->cluster_data, n * 512);
+        } else {
+            ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512,
+                              buf, n * 512);
+        }
+        if (ret != n * 512 ||
+            alloc_cluster_link_l2(bs, cluster_offset, &l2meta) < 0) {
+            free_any_clusters(bs, cluster_offset, l2meta.nb_clusters);
+            result = -1;
+            break;
+        }
+        nb_sectors -= n;
+        sector_num += n;
+        buf += n * 512;
+    }
+    s->cluster_cache_offset = -1; /* disable compressed cache */
+    return result;
+}
+
+typedef struct QCowAIOCB { /* state of one asynchronous qcow2 read or write request */
+ BlockDriverAIOCB common; /* generic part; its bs, cb and opaque members are used by the callbacks */
+ int64_t sector_num; /* first sector that has not been processed yet */
+ QEMUIOVector *qiov; /* I/O vector supplied by the caller */
+ uint8_t *buf; /* current position in the linear data buffer */
+ void *orig_buf; /* bounce buffer, allocated only when qiov has more than one element */
+ int nb_sectors; /* sectors still to be processed */
+ int n; /* number of sectors covered by the current sub-request */
+ uint64_t cluster_offset; /* cluster offset looked up or allocated for the current sub-request */
+ uint8_t *cluster_data; /* lazily allocated buffer holding encrypted data for writes */
+ BlockDriverAIOCB *hd_aiocb; /* request in flight on the underlying device, NULL if none */
+ struct iovec hd_iov; /* single element describing the current sub-request buffer */
+ QEMUIOVector hd_qiov; /* vector wrapping hd_iov */
+ QEMUBH *bh; /* bottom half used to continue a read that needed no I/O */
+ QCowL2Meta l2meta; /* passed to alloc_cluster_offset() and alloc_cluster_link_l2() */
+} QCowAIOCB;
+
+static void qcow_aio_read_cb(void *opaque, int ret);
+
+/*
+ * Bottom-half handler: drops the bottom half stored in the request and
+ * re-enters the read state machine with a success status.
+ */
+static void qcow_aio_read_bh(void *opaque)
+{
+    QCowAIOCB *request = opaque;
+
+    qemu_bh_delete(request->bh);
+    request->bh = NULL;
+    qcow_aio_read_cb(request, 0);
+}
+
+/*
+ * Create and schedule a bottom half running cb(acb).  Returns 0 on
+ * success, -EIO if the request already has a bottom half or a new one
+ * cannot be created.
+ */
+static int qcow_schedule_bh(QEMUBHFunc *cb, QCowAIOCB *acb)
+{
+    QEMUBH *bh;
+
+    if (acb->bh != NULL)
+        return -EIO;
+
+    bh = qemu_bh_new(cb, acb);
+    acb->bh = bh;
+    if (bh == NULL)
+        return -EIO;
+
+    qemu_bh_schedule(bh);
+    return 0;
+}
+
+/*
+ * Completion callback and state machine of an asynchronous read.
+ *
+ * It is entered with ret == 0 to start a request (acb->n is 0 then, so
+ * the accounting below is a no-op) and again whenever a sub-request has
+ * finished.  Each pass post-processes the finished chunk (decryption),
+ * advances the request, and either completes it or issues the next
+ * sub-request: a read from the backing file, a zero fill, a copy out of
+ * the decompressed-cluster cache, or a read from the image file.
+ * Chunks that need no I/O continue through a bottom half.
+ *
+ * On completion or error the bounce buffer (if any) is copied back and
+ * freed, the caller's callback is invoked with ret, and the request is
+ * released.  Every failure path reports a negative ret.
+ */
+static void qcow_aio_read_cb(void *opaque, int ret)
+{
+    QCowAIOCB *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVQcowState *s = bs->opaque;
+    int index_in_cluster, n1;
+
+    acb->hd_aiocb = NULL;
+    if (ret < 0)
+        goto done;
+
+    /* post process the read buffer */
+    if (!acb->cluster_offset) {
+        /* nothing to do */
+    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
+        /* nothing to do */
+    } else {
+        if (s->crypt_method) {
+            encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf,
+                            acb->n, 0,
+                            &s->aes_decrypt_key);
+        }
+    }
+
+    acb->nb_sectors -= acb->n;
+    acb->sector_num += acb->n;
+    acb->buf += acb->n * 512;
+
+    if (acb->nb_sectors == 0) {
+        /* request completed */
+        ret = 0;
+        goto done;
+    }
+
+    /* prepare next AIO request */
+    acb->n = acb->nb_sectors;
+    acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, &acb->n);
+    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
+
+    if (!acb->cluster_offset) {
+        if (bs->backing_hd) {
+            /* read from the base image */
+            n1 = backing_read1(bs->backing_hd, acb->sector_num,
+                               acb->buf, acb->n);
+            if (n1 > 0) {
+                acb->hd_iov.iov_base = (void *)acb->buf;
+                acb->hd_iov.iov_len = acb->n * 512;
+                qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+                acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
+                                               &acb->hd_qiov, acb->n,
+                                               qcow_aio_read_cb, acb);
+                if (acb->hd_aiocb == NULL) {
+                    /* ret is still 0 here; do not report success */
+                    ret = -EIO;
+                    goto done;
+                }
+            } else {
+                ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
+                if (ret < 0)
+                    goto done;
+            }
+        } else {
+            /* Note: in this case, no need to wait */
+            memset(acb->buf, 0, 512 * acb->n);
+            ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
+            if (ret < 0)
+                goto done;
+        }
+    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
+        /* add AIO support for compressed blocks ? */
+        if (decompress_cluster(s, acb->cluster_offset) < 0) {
+            /* ret is still 0 here; do not report success */
+            ret = -EIO;
+            goto done;
+        }
+        memcpy(acb->buf,
+               s->cluster_cache + index_in_cluster * 512, 512 * acb->n);
+        ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
+        if (ret < 0)
+            goto done;
+    } else {
+        /* the sector-based read below needs a sector-aligned offset */
+        if ((acb->cluster_offset & 511) != 0) {
+            ret = -EIO;
+            goto done;
+        }
+
+        acb->hd_iov.iov_base = (void *)acb->buf;
+        acb->hd_iov.iov_len = acb->n * 512;
+        qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+        acb->hd_aiocb = bdrv_aio_readv(s->hd,
+                            (acb->cluster_offset >> 9) + index_in_cluster,
+                            &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
+        if (acb->hd_aiocb == NULL) {
+            /* ret is still 0 here; do not report success */
+            ret = -EIO;
+            goto done;
+        }
+    }
+
+    return;
+done:
+    if (acb->qiov->niov > 1) {
+        qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size);
+        qemu_vfree(acb->orig_buf);
+    }
+    acb->common.cb(acb->common.opaque, ret);
+    qemu_aio_release(acb);
+}
+
+/*
+ * Allocate and initialise the request state shared by asynchronous reads
+ * and writes.  A single-element qiov is used in place; otherwise a
+ * linear bounce buffer is allocated and, for writes, filled from qiov.
+ * Returns NULL when no request can be obtained from qemu_aio_get().
+ */
+static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int is_write)
+{
+    QCowAIOCB *acb = qemu_aio_get(bs, cb, opaque);
+
+    if (acb == NULL)
+        return NULL;
+
+    acb->qiov = qiov;
+    acb->sector_num = sector_num;
+    acb->nb_sectors = nb_sectors;
+    acb->hd_aiocb = NULL;
+    acb->n = 0;
+    acb->cluster_offset = 0;
+    acb->l2meta.nb_clusters = 0;
+
+    if (qiov->niov <= 1) {
+        acb->buf = (uint8_t *)qiov->iov->iov_base;
+        return acb;
+    }
+
+    acb->orig_buf = qemu_blockalign(bs, qiov->size);
+    acb->buf = acb->orig_buf;
+    if (is_write)
+        qemu_iovec_to_buffer(qiov, acb->buf);
+    return acb;
+}
+
+/*
+ * Start an asynchronous vectored read.  Returns the generic AIOCB of the
+ * new request, or NULL if the request state cannot be set up.
+ */
+static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    QCowAIOCB *acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors,
+                                    cb, opaque, 0);
+
+    if (acb == NULL)
+        return NULL;
+
+    /* enter the state machine; status 0 means "nothing failed so far" */
+    qcow_aio_read_cb(acb, 0);
+    return &acb->common;
+}
+
+/*
+ * Completion callback and state machine of an asynchronous write.
+ *
+ * It is entered with ret == 0 to start a request and again whenever a
+ * sub-request has finished.  Each pass links the clusters of the chunk
+ * just written with alloc_cluster_link_l2(), advances the request, and
+ * either completes it or allocates clusters for the next chunk and
+ * submits the write (encrypting into acb->cluster_data first when
+ * s->crypt_method is set).
+ *
+ * On completion or error the bounce buffer (if any) is freed, the
+ * caller's callback is invoked with ret, and the request is released.
+ * Every failure path reports a negative ret.
+ */
+static void qcow_aio_write_cb(void *opaque, int ret)
+{
+    QCowAIOCB *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVQcowState *s = bs->opaque;
+    int index_in_cluster;
+    const uint8_t *src_buf;
+    int n_end;
+
+    acb->hd_aiocb = NULL;
+
+    if (ret < 0)
+        goto done;
+
+    if (alloc_cluster_link_l2(bs, acb->cluster_offset, &acb->l2meta) < 0) {
+        free_any_clusters(bs, acb->cluster_offset, acb->l2meta.nb_clusters);
+        /* ret is >= 0 here; do not report success */
+        ret = -EIO;
+        goto done;
+    }
+
+    acb->nb_sectors -= acb->n;
+    acb->sector_num += acb->n;
+    acb->buf += acb->n * 512;
+
+    if (acb->nb_sectors == 0) {
+        /* request completed */
+        ret = 0;
+        goto done;
+    }
+
+    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
+    n_end = index_in_cluster + acb->nb_sectors;
+    /* encrypted data goes through acb->cluster_data, so cap the chunk */
+    if (s->crypt_method &&
+        n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
+        n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
+
+    acb->cluster_offset = alloc_cluster_offset(bs, acb->sector_num << 9,
+                                               index_in_cluster,
+                                               n_end, &acb->n, &acb->l2meta);
+    if (!acb->cluster_offset || (acb->cluster_offset & 511) != 0) {
+        ret = -EIO;
+        goto done;
+    }
+    if (s->crypt_method) {
+        if (!acb->cluster_data) {
+            acb->cluster_data = qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS *
+                                             s->cluster_size);
+        }
+        encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
+                        acb->n, 1, &s->aes_encrypt_key);
+        src_buf = acb->cluster_data;
+    } else {
+        src_buf = acb->buf;
+    }
+    acb->hd_iov.iov_base = (void *)src_buf;
+    acb->hd_iov.iov_len = acb->n * 512;
+    qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+    acb->hd_aiocb = bdrv_aio_writev(s->hd,
+                                    (acb->cluster_offset >> 9) + index_in_cluster,
+                                    &acb->hd_qiov, acb->n,
+                                    qcow_aio_write_cb, acb);
+    if (acb->hd_aiocb == NULL) {
+        /* ret is still 0 here; do not report success */
+        ret = -EIO;
+        goto done;
+    }
+
+    return;
+
+done:
+    if (acb->qiov->niov > 1)
+        qemu_vfree(acb->orig_buf);
+    acb->common.cb(acb->common.opaque, ret);
+    qemu_aio_release(acb);
+}
+
+/*
+ * Start an asynchronous vectored write.  The compressed-cluster cache is
+ * invalidated before anything else happens.  Returns the generic AIOCB
+ * of the new request, or NULL if the request state cannot be set up.
+ */
+static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowAIOCB *request;
+
+    /* disable compressed cache */
+    s->cluster_cache_offset = -1;
+
+    request = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
+    if (request == NULL)
+        return NULL;
+
+    /* enter the state machine; status 0 means "nothing failed so far" */
+    qcow_aio_write_cb(request, 0);
+    return &request->common;
+}
+
+/*
+ * Cancel a request: cancel the sub-request in flight on the underlying
+ * device, if there is one, then release the request.
+ */
+static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    QCowAIOCB *acb = (QCowAIOCB *)blockacb;
+    BlockDriverAIOCB *in_flight = acb->hd_aiocb;
+
+    if (in_flight != NULL)
+        bdrv_aio_cancel(in_flight);
+    qemu_aio_release(acb);
+}
+
+/*
+ * Release everything the driver state owns: the L1 table, the L2 cache,
+ * the compressed-cluster buffers, the refcount structures and finally
+ * the underlying image file.
+ */
+static void qcow_close(BlockDriverState *bs)
+{
+    BDRVQcowState *state = bs->opaque;
+
+    /* lookup tables */
+    qemu_free(state->l1_table);
+    qemu_free(state->l2_cache);
+
+    /* compressed-cluster buffers */
+    qemu_free(state->cluster_cache);
+    qemu_free(state->cluster_data);
+
+    refcount_close(bs);
+    bdrv_delete(state->hd);
+}
+
+/* XXX: use std qcow open function ?
+ Scratch state used while laying out a new image in qcow_create2(). */
+typedef struct QCowCreateState {
+ int cluster_size; /* cluster size in bytes (1 << cluster_bits) */
+ int cluster_bits; /* log2 of the cluster size */
+ uint16_t *refcount_block; /* in-memory refcount blocks, entries stored big-endian */
+ uint64_t *refcount_table; /* in-memory refcount table, entries stored big-endian */
+ int64_t l1_table_offset; /* file offset of the L1 table */
+ int64_t refcount_table_offset; /* file offset of the refcount table */
+ int64_t refcount_block_offset; /* file offset of the first refcount block */
+} QCowCreateState;
+
+/*
+ * Increment the in-memory refcount of every cluster touched by the byte
+ * range [offset, offset + size) of the image being created.  Entries of
+ * s->refcount_block are kept in big-endian order.
+ */
+static void create_refcount_update(QCowCreateState *s,
+                                   int64_t offset, int64_t size)
+{
+    int64_t first_cluster, last_cluster, pos;
+
+    first_cluster = offset & ~(s->cluster_size - 1);
+    last_cluster = (offset + size - 1) & ~(s->cluster_size - 1);
+
+    pos = first_cluster;
+    while (pos <= last_cluster) {
+        uint16_t *entry = &s->refcount_block[pos >> s->cluster_bits];
+        int count = be16_to_cpu(*entry);
+
+        count++;
+        *entry = cpu_to_be16(count);
+        pos += s->cluster_size;
+    }
+}
+
+/* Write exactly len bytes from buf to fd.  Returns 0 on success, -1 on
+   an error or a short write. */
+static int qcow_create_write(int fd, const void *buf, size_t len)
+{
+    if (write(fd, buf, len) != (ssize_t)len)
+        return -1;
+    return 0;
+}
+
+/*
+ * Create a new image file.
+ *
+ * filename        path of the file to create (truncated if it exists)
+ * total_size      virtual disk size in 512-byte sectors
+ * backing_file    optional backing file name stored in the header
+ * backing_format  optional backing format name, written as a header
+ *                 extension; only used when backing_file is given
+ * flags           BLOCK_FLAG_ENCRYPT selects AES encryption
+ *
+ * File layout: header (plus extension and backing file name), L1 table,
+ * one refcount table cluster, then the refcount blocks; each part after
+ * the header starts on a cluster boundary.
+ *
+ * Returns 0 on success and -1 if the file cannot be opened or any
+ * write, seek or the final close fails.
+ */
+static int qcow_create2(const char *filename, int64_t total_size,
+                        const char *backing_file, const char *backing_format,
+                        int flags)
+{
+    int fd, header_size, backing_filename_len, l1_size, i, shift, l2_bits;
+    int ref_clusters, backing_format_len = 0;
+    int ret = -1;
+    QCowHeader header;
+    uint64_t tmp, offset;
+    QCowCreateState s1, *s = &s1;
+    QCowExtension ext_bf = {0, 0};
+
+    memset(s, 0, sizeof(*s));
+
+    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
+    if (fd < 0)
+        return -1;
+    memset(&header, 0, sizeof(header));
+    header.magic = cpu_to_be32(QCOW_MAGIC);
+    header.version = cpu_to_be32(QCOW_VERSION);
+    header.size = cpu_to_be64(total_size * 512);
+    header_size = sizeof(header);
+    backing_filename_len = 0;
+    if (backing_file) {
+        if (backing_format) {
+            ext_bf.magic = QCOW_EXT_MAGIC_BACKING_FORMAT;
+            backing_format_len = strlen(backing_format);
+            /* the extension payload is padded to a multiple of 8 bytes */
+            ext_bf.len = (backing_format_len + 7) & ~7;
+            header_size += ((sizeof(ext_bf) + ext_bf.len + 7) & ~7);
+        }
+        header.backing_file_offset = cpu_to_be64(header_size);
+        backing_filename_len = strlen(backing_file);
+        header.backing_file_size = cpu_to_be32(backing_filename_len);
+        header_size += backing_filename_len;
+    }
+    s->cluster_bits = 12;  /* 4 KB clusters */
+    s->cluster_size = 1 << s->cluster_bits;
+    header.cluster_bits = cpu_to_be32(s->cluster_bits);
+    header_size = (header_size + 7) & ~7;
+    if (flags & BLOCK_FLAG_ENCRYPT) {
+        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
+    } else {
+        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
+    }
+    l2_bits = s->cluster_bits - 3;
+    shift = s->cluster_bits + l2_bits;
+    l1_size = (((total_size * 512) + (1LL << shift) - 1) >> shift);
+    offset = align_offset(header_size, s->cluster_size);
+    s->l1_table_offset = offset;
+    header.l1_table_offset = cpu_to_be64(s->l1_table_offset);
+    header.l1_size = cpu_to_be32(l1_size);
+    offset += align_offset(l1_size * sizeof(uint64_t), s->cluster_size);
+
+    s->refcount_table = qemu_mallocz(s->cluster_size);
+
+    s->refcount_table_offset = offset;
+    header.refcount_table_offset = cpu_to_be64(offset);
+    header.refcount_table_clusters = cpu_to_be32(1);
+    offset += s->cluster_size;
+    s->refcount_block_offset = offset;
+
+    /* count how many refcount blocks needed */
+    tmp = offset >> s->cluster_bits;
+    ref_clusters = (tmp >> (s->cluster_bits - REFCOUNT_SHIFT)) + 1;
+    for (i = 0; i < ref_clusters; i++) {
+        s->refcount_table[i] = cpu_to_be64(offset);
+        offset += s->cluster_size;
+    }
+
+    s->refcount_block = qemu_mallocz(ref_clusters * s->cluster_size);
+
+    /* update refcounts */
+    create_refcount_update(s, 0, header_size);
+    create_refcount_update(s, s->l1_table_offset, l1_size * sizeof(uint64_t));
+    create_refcount_update(s, s->refcount_table_offset, s->cluster_size);
+    create_refcount_update(s, s->refcount_block_offset,
+                           ref_clusters * s->cluster_size);
+
+    /* write all the data */
+    if (qcow_create_write(fd, &header, sizeof(header)) < 0)
+        goto out;
+    if (backing_file) {
+        if (backing_format_len) {
+            char zero[16];
+            int d = ext_bf.len - backing_format_len;
+
+            memset(zero, 0, sizeof(zero));
+            cpu_to_be32s(&ext_bf.magic);
+            cpu_to_be32s(&ext_bf.len);
+            if (qcow_create_write(fd, &ext_bf, sizeof(ext_bf)) < 0)
+                goto out;
+            if (qcow_create_write(fd, backing_format, backing_format_len) < 0)
+                goto out;
+            if (d > 0 && qcow_create_write(fd, zero, d) < 0)
+                goto out;
+        }
+        if (qcow_create_write(fd, backing_file, backing_filename_len) < 0)
+            goto out;
+    }
+
+    if (lseek(fd, s->l1_table_offset, SEEK_SET) < 0)
+        goto out;
+    tmp = 0;
+    for (i = 0; i < l1_size; i++) {
+        if (qcow_create_write(fd, &tmp, sizeof(tmp)) < 0)
+            goto out;
+    }
+
+    if (lseek(fd, s->refcount_table_offset, SEEK_SET) < 0)
+        goto out;
+    if (qcow_create_write(fd, s->refcount_table, s->cluster_size) < 0)
+        goto out;
+
+    if (lseek(fd, s->refcount_block_offset, SEEK_SET) < 0)
+        goto out;
+    if (qcow_create_write(fd, s->refcount_block,
+                          ref_clusters * s->cluster_size) < 0)
+        goto out;
+
+    ret = 0;
+out:
+    qemu_free(s->refcount_table);
+    qemu_free(s->refcount_block);
+    /* close() can report a deferred write error */
+    if (close(fd) < 0)
+        ret = -1;
+    return ret;
+}
+
+/* Create an image without recording a backing format; thin wrapper
+   around qcow_create2(). */
+static int qcow_create(const char *filename, int64_t total_size,
+                       const char *backing_file, int flags)
+{
+    const char *no_backing_format = NULL;
+
+    return qcow_create2(filename, total_size, backing_file,
+                        no_backing_format, flags);
+}
+
+static int qcow_make_empty(BlockDriverState *bs)
+{
+#if 0
+ /* XXX: not correct. The code below is compiled out; it would zero the
+ in-memory L1 table, write it back, truncate the image file right
+ after the L1 table and reset the L2 cache. */
+ BDRVQcowState *s = bs->opaque;
+ uint32_t l1_length = s->l1_size * sizeof(uint64_t);
+ int ret;
+
+ memset(s->l1_table, 0, l1_length);
+ if (bdrv_pwrite(s->hd, s->l1_table_offset, s->l1_table, l1_length) < 0)
+ return -1;
+ ret = bdrv_truncate(s->hd, s->l1_table_offset + l1_length);
+ if (ret < 0)
+ return ret;
+
+ l2_cache_reset(bs);
+#endif
+ return 0; /* with the block above disabled this is a no-op that reports success */
+}
+
+/* XXX: put compressed sectors first, then all the cluster aligned
+   tables to avoid losing bytes in alignment */
+/*
+ * Write one cluster of data in compressed form.
+ *
+ * nb_sectors == 0 is a special request: the image file is padded to a
+ * sector boundary and 0 is returned.  Otherwise nb_sectors must equal
+ * s->cluster_sectors (-EINVAL if not).  The cluster is deflated; when
+ * the result is not smaller than a cluster, or does not fit the output
+ * space, the data is written uncompressed through qcow_write().
+ *
+ * Returns 0 on success and -1 on failure, including a failure of the
+ * uncompressed fallback write.  The temporary output buffer is released
+ * on every path.
+ */
+static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
+                                 const uint8_t *buf, int nb_sectors)
+{
+    BDRVQcowState *s = bs->opaque;
+    z_stream strm;
+    int ret, out_len;
+    int result = -1;
+    uint8_t *out_buf;
+    uint64_t cluster_offset;
+
+    if (nb_sectors == 0) {
+        /* align end of file to a sector boundary to ease reading with
+           sector based I/Os */
+        cluster_offset = bdrv_getlength(s->hd);
+        cluster_offset = (cluster_offset + 511) & ~511;
+        bdrv_truncate(s->hd, cluster_offset);
+        return 0;
+    }
+
+    if (nb_sectors != s->cluster_sectors)
+        return -EINVAL;
+
+    out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
+
+    /* default compression level, small window, no zlib header */
+    memset(&strm, 0, sizeof(strm));
+    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+                       Z_DEFLATED, -12,
+                       9, Z_DEFAULT_STRATEGY);
+    if (ret != 0)
+        goto out;
+
+    strm.avail_in = s->cluster_size;
+    strm.next_in = (uint8_t *)buf;
+    strm.avail_out = s->cluster_size;
+    strm.next_out = out_buf;
+
+    ret = deflate(&strm, Z_FINISH);
+    out_len = strm.next_out - out_buf;
+    deflateEnd(&strm);
+    if (ret != Z_STREAM_END && ret != Z_OK)
+        goto out;
+
+    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
+        /* could not compress: write normal cluster */
+        if (qcow_write(bs, sector_num, buf, s->cluster_sectors) < 0)
+            goto out;
+    } else {
+        cluster_offset = alloc_compressed_cluster_offset(bs, sector_num << 9,
+                                                         out_len);
+        if (!cluster_offset)
+            goto out;
+        cluster_offset &= s->cluster_offset_mask;
+        if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len)
+            goto out;
+    }
+
+    result = 0;
+out:
+    qemu_free(out_buf);
+    return result;
+}
+
+/* Flush by forwarding the request to the underlying image file. */
+static void qcow_flush(BlockDriverState *bs)
+{
+    BDRVQcowState *state = bs->opaque;
+
+    bdrv_flush(state->hd);
+}
+
+/*
+ * Fill in the driver information: the cluster size and the offset of the
+ * VM state area, derived from s->l1_vm_state_index.  Always returns 0.
+ */
+static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+    BDRVQcowState *s = bs->opaque;
+    int64_t vm_state_index = (int64_t)s->l1_vm_state_index;
+
+    bdi->vm_state_offset = vm_state_index << (s->cluster_bits + s->l2_bits);
+    bdi->cluster_size = s->cluster_size;
+    return 0;
+}
+
+/*********************************************************/
+/* snapshot support */
+
+/*
+ * update the refcounts of snapshots and the copied flag
+ *
+ * Walks the L1 table stored at l1_table_offset (l1_size entries) and all
+ * L2 tables it references.  addend is applied to the refcount of every
+ * referenced data cluster and L2 table; with addend == 0 the refcounts
+ * are only read.  QCOW_OFLAG_COPIED is then set on exactly those entries
+ * whose refcount is 1, and modified tables are written back.
+ *
+ * When l1_table_offset is the active L1 table, s->l1_table is used and
+ * updated in place; it is always left in host byte order, even when
+ * writing it to disk fails.  Returns 0 on success, -EIO on an I/O error.
+ */
+static int update_snapshot_refcount(BlockDriverState *bs,
+                                    int64_t l1_table_offset,
+                                    int l1_size,
+                                    int addend)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
+    int64_t old_offset, old_l2_offset;
+    int l2_size, i, j, l1_modified, l2_modified, nb_csectors, refcount;
+    int l1_write_ok;
+
+    l2_cache_reset(bs);
+
+    l2_table = NULL;
+    l1_table = NULL;
+    l1_size2 = l1_size * sizeof(uint64_t);
+    l1_allocated = 0;
+    if (l1_table_offset != s->l1_table_offset) {
+        l1_table = qemu_malloc(l1_size2);
+        l1_allocated = 1;
+        if (bdrv_pread(s->hd, l1_table_offset,
+                       l1_table, l1_size2) != l1_size2)
+            goto fail;
+        for (i = 0; i < l1_size; i++)
+            be64_to_cpus(&l1_table[i]);
+    } else {
+        assert(l1_size == s->l1_size);
+        l1_table = s->l1_table;
+        l1_allocated = 0;
+    }
+
+    l2_size = s->l2_size * sizeof(uint64_t);
+    l2_table = qemu_malloc(l2_size);
+    l1_modified = 0;
+    for (i = 0; i < l1_size; i++) {
+        l2_offset = l1_table[i];
+        if (l2_offset) {
+            old_l2_offset = l2_offset;
+            l2_offset &= ~QCOW_OFLAG_COPIED;
+            l2_modified = 0;
+            if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
+                goto fail;
+            for (j = 0; j < s->l2_size; j++) {
+                offset = be64_to_cpu(l2_table[j]);
+                if (offset != 0) {
+                    old_offset = offset;
+                    offset &= ~QCOW_OFLAG_COPIED;
+                    if (offset & QCOW_OFLAG_COMPRESSED) {
+                        nb_csectors = ((offset >> s->csize_shift) &
+                                       s->csize_mask) + 1;
+                        if (addend != 0)
+                            update_refcount(bs, (offset & s->cluster_offset_mask) & ~511,
+                                            nb_csectors * 512, addend);
+                        /* compressed clusters are never modified */
+                        refcount = 2;
+                    } else {
+                        if (addend != 0) {
+                            refcount = update_cluster_refcount(bs, offset >> s->cluster_bits, addend);
+                        } else {
+                            refcount = get_refcount(bs, offset >> s->cluster_bits);
+                        }
+                    }
+
+                    if (refcount == 1) {
+                        offset |= QCOW_OFLAG_COPIED;
+                    }
+                    if (offset != old_offset) {
+                        l2_table[j] = cpu_to_be64(offset);
+                        l2_modified = 1;
+                    }
+                }
+            }
+            if (l2_modified) {
+                if (bdrv_pwrite(s->hd,
+                                l2_offset, l2_table, l2_size) != l2_size)
+                    goto fail;
+            }
+
+            if (addend != 0) {
+                refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend);
+            } else {
+                refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
+            }
+            if (refcount == 1) {
+                l2_offset |= QCOW_OFLAG_COPIED;
+            }
+            if (l2_offset != old_l2_offset) {
+                l1_table[i] = l2_offset;
+                l1_modified = 1;
+            }
+        }
+    }
+    if (l1_modified) {
+        for (i = 0; i < l1_size; i++)
+            cpu_to_be64s(&l1_table[i]);
+        l1_write_ok = (bdrv_pwrite(s->hd, l1_table_offset, l1_table,
+                                   l1_size2) == l1_size2);
+        /* swap back before checking the result: l1_table may be the
+           live s->l1_table and must not stay big-endian on failure */
+        for (i = 0; i < l1_size; i++)
+            be64_to_cpus(&l1_table[i]);
+        if (!l1_write_ok)
+            goto fail;
+    }
+    if (l1_allocated)
+        qemu_free(l1_table);
+    qemu_free(l2_table);
+    return 0;
+ fail:
+    if (l1_allocated)
+        qemu_free(l1_table);
+    qemu_free(l2_table);
+    return -EIO;
+}
+
+/* Free the in-memory snapshot table together with the name and id
+   strings of every entry, and reset the snapshot count to zero. */
+static void qcow_free_snapshots(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    int idx = 0;
+
+    while (idx < s->nb_snapshots) {
+        QCowSnapshot *entry = &s->snapshots[idx];
+
+        qemu_free(entry->name);
+        qemu_free(entry->id_str);
+        idx++;
+    }
+    qemu_free(s->snapshots);
+    s->snapshots = NULL;
+    s->nb_snapshots = 0;
+}
+
+/*
+ * Load the snapshot table from the image into s->snapshots.  Each record
+ * starts on an 8-byte boundary and consists of a fixed header, optional
+ * extra data (skipped), the id string and the name.  On success
+ * s->snapshots_size is set to the number of bytes the table occupies and
+ * 0 is returned; on a read error everything loaded so far is freed and
+ * -1 is returned.
+ */
+static int qcow_read_snapshots(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowSnapshotHeader hdr;
+    QCowSnapshot *snap;
+    int idx, id_len, name_len;
+    int64_t pos;
+    uint32_t extra_len;
+
+    if (s->nb_snapshots == 0) {
+        s->snapshots = NULL;
+        s->snapshots_size = 0;
+        return 0;
+    }
+
+    pos = s->snapshots_offset;
+    s->snapshots = qemu_mallocz(s->nb_snapshots * sizeof(QCowSnapshot));
+    for (idx = 0; idx < s->nb_snapshots; idx++) {
+        pos = align_offset(pos, 8);
+        if (bdrv_pread(s->hd, pos, &hdr, sizeof(hdr)) != sizeof(hdr))
+            goto fail;
+        pos += sizeof(hdr);
+
+        /* convert the fixed-size part to host byte order */
+        snap = &s->snapshots[idx];
+        snap->l1_table_offset = be64_to_cpu(hdr.l1_table_offset);
+        snap->l1_size = be32_to_cpu(hdr.l1_size);
+        snap->vm_state_size = be32_to_cpu(hdr.vm_state_size);
+        snap->date_sec = be32_to_cpu(hdr.date_sec);
+        snap->date_nsec = be32_to_cpu(hdr.date_nsec);
+        snap->vm_clock_nsec = be64_to_cpu(hdr.vm_clock_nsec);
+        extra_len = be32_to_cpu(hdr.extra_data_size);
+        id_len = be16_to_cpu(hdr.id_str_size);
+        name_len = be16_to_cpu(hdr.name_size);
+
+        /* the extra data is not interpreted */
+        pos += extra_len;
+
+        snap->id_str = qemu_malloc(id_len + 1);
+        if (bdrv_pread(s->hd, pos, snap->id_str, id_len) != id_len)
+            goto fail;
+        snap->id_str[id_len] = '\0';
+        pos += id_len;
+
+        snap->name = qemu_malloc(name_len + 1);
+        if (bdrv_pread(s->hd, pos, snap->name, name_len) != name_len)
+            goto fail;
+        snap->name[name_len] = '\0';
+        pos += name_len;
+    }
+    s->snapshots_size = pos - s->snapshots_offset;
+    return 0;
+ fail:
+    qcow_free_snapshots(bs);
+    return -1;
+}
+
+/*
+ * Write the in-memory snapshot list to newly allocated clusters, point
+ * the image header (snapshots_offset, nb_snapshots) at the new copy and
+ * free the clusters of the previous table.  Returns 0 on success, -1 if
+ * any write fails.
+ */
+static int qcow_write_snapshots(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowSnapshotHeader hdr;
+    QCowSnapshot *snap;
+    int idx, name_len, id_len, table_size;
+    uint64_t be_offset;
+    uint32_t be_count;
+    int64_t pos, table_offset;
+
+    /* compute the size of the snapshots */
+    pos = 0;
+    for (idx = 0; idx < s->nb_snapshots; idx++) {
+        snap = &s->snapshots[idx];
+        pos = align_offset(pos, 8);
+        pos += sizeof(hdr);
+        pos += strlen(snap->id_str);
+        pos += strlen(snap->name);
+    }
+    table_size = pos;
+
+    table_offset = alloc_clusters(bs, table_size);
+    pos = table_offset;
+
+    for (idx = 0; idx < s->nb_snapshots; idx++) {
+        snap = &s->snapshots[idx];
+        id_len = strlen(snap->id_str);
+        name_len = strlen(snap->name);
+
+        memset(&hdr, 0, sizeof(hdr));
+        hdr.l1_table_offset = cpu_to_be64(snap->l1_table_offset);
+        hdr.l1_size = cpu_to_be32(snap->l1_size);
+        hdr.vm_state_size = cpu_to_be32(snap->vm_state_size);
+        hdr.date_sec = cpu_to_be32(snap->date_sec);
+        hdr.date_nsec = cpu_to_be32(snap->date_nsec);
+        hdr.vm_clock_nsec = cpu_to_be64(snap->vm_clock_nsec);
+        hdr.id_str_size = cpu_to_be16(id_len);
+        hdr.name_size = cpu_to_be16(name_len);
+
+        /* record: header, id string, name; header is 8-byte aligned */
+        pos = align_offset(pos, 8);
+        if (bdrv_pwrite(s->hd, pos, &hdr, sizeof(hdr)) != sizeof(hdr))
+            return -1;
+        pos += sizeof(hdr);
+        if (bdrv_pwrite(s->hd, pos, snap->id_str, id_len) != id_len)
+            return -1;
+        pos += id_len;
+        if (bdrv_pwrite(s->hd, pos, snap->name, name_len) != name_len)
+            return -1;
+        pos += name_len;
+    }
+
+    /* update the various header fields */
+    be_offset = cpu_to_be64(table_offset);
+    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, snapshots_offset),
+                    &be_offset, sizeof(be_offset)) != sizeof(be_offset))
+        return -1;
+    be_count = cpu_to_be32(s->nb_snapshots);
+    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, nb_snapshots),
+                    &be_count, sizeof(be_count)) != sizeof(be_count))
+        return -1;
+
+    /* free the old snapshot table */
+    free_clusters(bs, s->snapshots_offset, s->snapshots_size);
+    s->snapshots_offset = table_offset;
+    s->snapshots_size = table_size;
+    return 0;
+}
+
+/*
+ * Build a fresh snapshot id: one more than the largest id found when the
+ * existing id strings are parsed as decimal numbers.  The result is
+ * written to id_str (at most id_str_size bytes).
+ */
+static void find_new_snapshot_id(BlockDriverState *bs,
+                                 char *id_str, int id_str_size)
+{
+    BDRVQcowState *s = bs->opaque;
+    int idx, highest = 0;
+
+    for (idx = 0; idx < s->nb_snapshots; idx++) {
+        int value = strtoul(s->snapshots[idx].id_str, NULL, 10);
+
+        if (value > highest)
+            highest = value;
+    }
+    snprintf(id_str, id_str_size, "%d", highest + 1);
+}
+
+/* Return the index of the snapshot whose id string equals id_str, or -1
+   when there is none. */
+static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str)
+{
+    BDRVQcowState *s = bs->opaque;
+    int idx = 0;
+
+    while (idx < s->nb_snapshots) {
+        if (strcmp(s->snapshots[idx].id_str, id_str) == 0)
+            return idx;
+        idx++;
+    }
+    return -1;
+}
+
+/* Look a snapshot up by id first and, failing that, by name.  Returns
+   its index, or -1 when neither matches. */
+static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name)
+{
+    BDRVQcowState *s = bs->opaque;
+    int idx;
+    int by_id = find_snapshot_by_id(bs, name);
+
+    if (by_id >= 0)
+        return by_id;
+
+    for (idx = 0; idx < s->nb_snapshots; idx++) {
+        if (strcmp(s->snapshots[idx].name, name) == 0)
+            return idx;
+    }
+    return -1;
+}
+
+/*
+ * Create a snapshot of the current disk state.
+ *
+ * if no id is provided, a new one is constructed (and stored back into
+ * sn_info->id_str).  The refcounts of everything reachable from the
+ * active L1 table are incremented, a copy of the L1 table is written to
+ * newly allocated clusters, and the new entry is appended to the
+ * snapshot list, which is then written to the image.
+ *
+ * Returns 0 on success, -ENOENT when the id is already in use and -1 on
+ * any other failure.  On failure the strings allocated here are freed,
+ * and an entry that was already appended is dropped again so that the
+ * list never refers to freed memory.
+ */
+static int qcow_snapshot_create(BlockDriverState *bs,
+                                QEMUSnapshotInfo *sn_info)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowSnapshot *snapshots1, sn1, *sn = &sn1;
+    int i, ret;
+    uint64_t *l1_table = NULL;
+
+    memset(sn, 0, sizeof(*sn));
+
+    if (sn_info->id_str[0] == '\0') {
+        /* compute a new id */
+        find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
+    }
+
+    /* check that the ID is unique */
+    if (find_snapshot_by_id(bs, sn_info->id_str) >= 0)
+        return -ENOENT;
+
+    sn->id_str = qemu_strdup(sn_info->id_str);
+    if (!sn->id_str)
+        goto fail;
+    sn->name = qemu_strdup(sn_info->name);
+    if (!sn->name)
+        goto fail;
+    sn->vm_state_size = sn_info->vm_state_size;
+    sn->date_sec = sn_info->date_sec;
+    sn->date_nsec = sn_info->date_nsec;
+    sn->vm_clock_nsec = sn_info->vm_clock_nsec;
+
+    ret = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1);
+    if (ret < 0)
+        goto fail;
+
+    /* create the L1 table of the snapshot */
+    sn->l1_table_offset = alloc_clusters(bs, s->l1_size * sizeof(uint64_t));
+    sn->l1_size = s->l1_size;
+
+    l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
+    for (i = 0; i < s->l1_size; i++) {
+        l1_table[i] = cpu_to_be64(s->l1_table[i]);
+    }
+    if (bdrv_pwrite(s->hd, sn->l1_table_offset,
+                    l1_table, s->l1_size * sizeof(uint64_t)) !=
+        (s->l1_size * sizeof(uint64_t)))
+        goto fail;
+    qemu_free(l1_table);
+    l1_table = NULL;
+
+    snapshots1 = qemu_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
+    if (s->snapshots) {
+        memcpy(snapshots1, s->snapshots, s->nb_snapshots * sizeof(QCowSnapshot));
+        qemu_free(s->snapshots);
+    }
+    s->snapshots = snapshots1;
+    s->snapshots[s->nb_snapshots++] = *sn;
+
+    if (qcow_write_snapshots(bs) < 0) {
+        /* the appended entry shares its strings with *sn, which are
+           freed below: take it out of the list again */
+        s->nb_snapshots--;
+        goto fail;
+    }
+#ifdef DEBUG_ALLOC
+    check_refcounts(bs);
+#endif
+    return 0;
+ fail:
+    qemu_free(sn->id_str);
+    qemu_free(sn->name);
+    qemu_free(l1_table);
+    return -1;
+}
+
+/*
+ * Make the snapshot identified by snapshot_id (id or name) the current
+ * disk state: drop the references held through the active L1 table,
+ * copy the snapshot's L1 table over the active one (on disk and in
+ * memory) and take references through the new contents.
+ * Returns 0 on success, -ENOENT for an unknown snapshot, -EIO otherwise.
+ */
+static int qcow_snapshot_goto(BlockDriverState *bs,
+                              const char *snapshot_id)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowSnapshot *target;
+    int idx, snapshot_index, l1_bytes;
+
+    snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id);
+    if (snapshot_index < 0)
+        return -ENOENT;
+    target = &s->snapshots[snapshot_index];
+
+    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, -1) < 0)
+        return -EIO;
+
+    if (grow_l1_table(bs, target->l1_size) < 0)
+        return -EIO;
+
+    s->l1_size = target->l1_size;
+    l1_bytes = s->l1_size * sizeof(uint64_t);
+
+    /* copy the snapshot l1 table to the current l1 table */
+    if (bdrv_pread(s->hd, target->l1_table_offset,
+                   s->l1_table, l1_bytes) != l1_bytes)
+        return -EIO;
+    if (bdrv_pwrite(s->hd, s->l1_table_offset,
+                    s->l1_table, l1_bytes) != l1_bytes)
+        return -EIO;
+
+    /* the table was read in on-disk byte order */
+    for (idx = 0; idx < s->l1_size; idx++)
+        be64_to_cpus(&s->l1_table[idx]);
+
+    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1) < 0)
+        return -EIO;
+
+#ifdef DEBUG_ALLOC
+    check_refcounts(bs);
+#endif
+    return 0;
+}
+
+/*
+ * Delete the snapshot identified by snapshot_id (id or name): drop the
+ * references it holds, refresh the copied flags of the active tables,
+ * free its L1 table, remove it from the in-memory list and write the
+ * list back.  Returns 0 on success, -ENOENT for an unknown snapshot, or
+ * the negative result of the step that failed.
+ */
+static int qcow_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowSnapshot *victim;
+    int pos, status;
+
+    pos = find_snapshot_by_id_or_name(bs, snapshot_id);
+    if (pos < 0)
+        return -ENOENT;
+    victim = s->snapshots + pos;
+
+    status = update_snapshot_refcount(bs, victim->l1_table_offset,
+                                      victim->l1_size, -1);
+    if (status < 0)
+        return status;
+
+    /* must update the copied flag on the current cluster offsets */
+    status = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
+    if (status < 0)
+        return status;
+
+    free_clusters(bs, victim->l1_table_offset,
+                  victim->l1_size * sizeof(uint64_t));
+
+    qemu_free(victim->id_str);
+    qemu_free(victim->name);
+    /* close the gap left by the removed entry */
+    memmove(victim, victim + 1,
+            (s->nb_snapshots - pos - 1) * sizeof(*victim));
+    s->nb_snapshots--;
+
+    status = qcow_write_snapshots(bs);
+    if (status < 0) {
+        /* XXX: restore snapshot if error ? */
+        return status;
+    }
+#ifdef DEBUG_ALLOC
+    check_refcounts(bs);
+#endif
+    return 0;
+}
+
+/*
+ * Build a newly allocated array describing all snapshots and store it in
+ * *psn_tab.  Returns the number of snapshots.
+ */
+static int qcow_snapshot_list(BlockDriverState *bs,
+                              QEMUSnapshotInfo **psn_tab)
+{
+    BDRVQcowState *s = bs->opaque;
+    QEMUSnapshotInfo *table;
+    int idx;
+
+    table = qemu_mallocz(s->nb_snapshots * sizeof(QEMUSnapshotInfo));
+    for (idx = 0; idx < s->nb_snapshots; idx++) {
+        QCowSnapshot *src = &s->snapshots[idx];
+        QEMUSnapshotInfo *dst = &table[idx];
+
+        pstrcpy(dst->id_str, sizeof(dst->id_str), src->id_str);
+        pstrcpy(dst->name, sizeof(dst->name), src->name);
+        dst->vm_state_size = src->vm_state_size;
+        dst->date_sec = src->date_sec;
+        dst->date_nsec = src->date_nsec;
+        dst->vm_clock_nsec = src->vm_clock_nsec;
+    }
+    *psn_tab = table;
+    return s->nb_snapshots;
+}
+
+/*********************************************************/
+/* refcount handling */
+
+/*
+ * Allocate the refcount block cache and load the refcount table from the
+ * image, converting its entries to host byte order.  Returns 0 on
+ * success and -ENOMEM when the table cannot be read completely.
+ */
+static int refcount_init(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    int table_bytes, nread, idx;
+
+    s->refcount_block_cache = qemu_malloc(s->cluster_size);
+    table_bytes = s->refcount_table_size * sizeof(uint64_t);
+    s->refcount_table = qemu_malloc(table_bytes);
+
+    /* nothing to load for an empty table */
+    if (s->refcount_table_size <= 0)
+        return 0;
+
+    nread = bdrv_pread(s->hd, s->refcount_table_offset,
+                       s->refcount_table, table_bytes);
+    if (nread != table_bytes)
+        return -ENOMEM;
+
+    for (idx = 0; idx < s->refcount_table_size; idx++)
+        be64_to_cpus(&s->refcount_table[idx]);
+    return 0;
+}
+
+/* Free the refcount block cache and the in-memory refcount table. */
+static void refcount_close(BlockDriverState *bs)
+{
+    BDRVQcowState *state = bs->opaque;
+
+    qemu_free(state->refcount_block_cache);
+    qemu_free(state->refcount_table);
+}
+
+
+/*
+ * Read the refcount block stored at refcount_block_offset into the
+ * single-block cache and remember which block it holds.  Returns 0 on
+ * success, -EIO when a full cluster cannot be read.
+ */
+static int load_refcount_block(BlockDriverState *bs,
+                               int64_t refcount_block_offset)
+{
+    BDRVQcowState *s = bs->opaque;
+    int nread = bdrv_pread(s->hd, refcount_block_offset,
+                           s->refcount_block_cache, s->cluster_size);
+
+    if (nread != s->cluster_size)
+        return -EIO;
+
+    s->refcount_block_cache_offset = refcount_block_offset;
+    return 0;
+}
+
+/*
+ * Return the reference count of the cluster with index cluster_index.
+ * Clusters beyond the refcount table, or whose refcount block does not
+ * exist, count as unreferenced (0).  When the refcount block cannot be
+ * loaded, 1 is returned so that the cluster is treated as allocated.
+ */
+static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
+{
+    BDRVQcowState *s = bs->opaque;
+    int table_idx, entry_idx;
+    int64_t block_offset;
+
+    table_idx = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
+    if (table_idx >= s->refcount_table_size)
+        return 0;
+
+    block_offset = s->refcount_table[table_idx];
+    if (block_offset == 0)
+        return 0;
+
+    /* better than nothing: return allocated if read error */
+    if (block_offset != s->refcount_block_cache_offset &&
+        load_refcount_block(bs, block_offset) < 0)
+        return 1;
+
+    entry_idx = cluster_index &
+        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
+    return be16_to_cpu(s->refcount_block_cache[entry_idx]);
+}
+
+/*
+ * Find a run of consecutive unreferenced clusters large enough for size
+ * bytes, advancing s->free_cluster_index past every cluster examined.
+ * No refcount is changed.  Returns the byte offset of the run.
+ * (original note: return < 0 if error)
+ */
+static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
+{
+    BDRVQcowState *s = bs->opaque;
+    int nb_clusters, run_length;
+
+    nb_clusters = size_to_clusters(s, size);
+    run_length = 0;
+    while (run_length < nb_clusters) {
+        int64_t candidate = s->free_cluster_index++;
+
+        if (get_refcount(bs, candidate) != 0)
+            run_length = 0;     /* in use: start a new run after it */
+        else
+            run_length++;
+    }
+#ifdef DEBUG_ALLOC2
+    printf("alloc_clusters: size=%lld -> %lld\n",
+            size,
+            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
+#endif
+    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
+}
+
+ /*
+  * Allocate clusters for size bytes and take one reference on each of
+  * them. Returns the byte offset of the first allocated cluster.
+  */
+ static int64_t alloc_clusters(BlockDriverState *bs, int64_t size)
+ {
+     const int64_t first = alloc_clusters_noref(bs, size);
+
+     update_refcount(bs, first, size, 1);
+     return first;
+ }
+
+ /*
+  * Sub-cluster allocator, only used to allocate compressed sectors. We try
+  * to allocate contiguous sectors. size must be > 0 and <= cluster_size
+  * (asserted).
+  *
+  * s->free_byte_offset is the next unused byte of the cluster currently
+  * being filled; 0 means no partially filled cluster is open.
+  * Returns the byte offset of the allocated range.
+  */
+ static int64_t alloc_bytes(BlockDriverState *bs, int size)
+ {
+     BDRVQcowState *s = bs->opaque;
+     int64_t offset, cluster_offset;
+     int free_in_cluster;
+
+     assert(size > 0 && size <= s->cluster_size);
+     if (s->free_byte_offset == 0) {
+         /* no open cluster: start a fresh one (refcount already 1) */
+         s->free_byte_offset = alloc_clusters(bs, s->cluster_size);
+     }
+  redo:
+     free_in_cluster = s->cluster_size -
+         (s->free_byte_offset & (s->cluster_size - 1));
+     if (size <= free_in_cluster) {
+         /* enough space in current cluster */
+         offset = s->free_byte_offset;
+         s->free_byte_offset += size;
+         free_in_cluster -= size;
+         if (free_in_cluster == 0)
+             s->free_byte_offset = 0;
+         /* an allocation that does not start the cluster adds a reference */
+         if ((offset & (s->cluster_size - 1)) != 0)
+             update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
+     } else {
+         /* request spills over: get another cluster and see where it is */
+         offset = alloc_clusters(bs, s->cluster_size);
+         cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1);
+         if ((cluster_offset + s->cluster_size) == offset) {
+             /* we are lucky: contiguous data */
+             offset = s->free_byte_offset;
+             update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
+             s->free_byte_offset += size;
+         } else {
+             /* not adjacent: abandon the old tail and retry in the new cluster */
+             s->free_byte_offset = offset;
+             goto redo;
+         }
+     }
+     return offset;
+ }
+
+ /*
+  * Drop one reference from every cluster touched by the byte range
+  * [offset, offset + size).
+  */
+ static void free_clusters(BlockDriverState *bs,
+                           int64_t offset, int64_t size)
+ {
+     update_refcount(bs, offset, size, -1);
+ }
+
+ /*
+  * Enlarge the refcount table so that it holds at least min_size entries.
+  *
+  * A new, larger table is written to freshly allocated clusters, the image
+  * header is pointed at it, and only then are the in-memory state and the
+  * refcounts of the new/old table clusters updated.
+  *
+  * Returns 0 on success (or if the table is already large enough) and
+  * -EIO if writing the new table or the header fails.
+  */
+ static int grow_refcount_table(BlockDriverState *bs, int min_size)
+ {
+     BDRVQcowState *s = bs->opaque;
+     int new_table_size, new_table_size2, refcount_table_clusters, i, ret;
+     uint64_t *new_table;
+     int64_t table_offset;
+     uint8_t data[12];
+     int old_table_size;
+     int64_t old_table_offset;
+
+     if (min_size <= s->refcount_table_size)
+         return 0;
+     /* compute new table size */
+     /* each cluster holds cluster_size / 8 table entries, hence the "- 3" */
+     refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
+     for(;;) {
+         if (refcount_table_clusters == 0) {
+             refcount_table_clusters = 1;
+         } else {
+             /* grow by roughly 1.5x per step */
+             refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
+         }
+         new_table_size = refcount_table_clusters << (s->cluster_bits - 3);
+         if (min_size <= new_table_size)
+             break;
+     }
+ #ifdef DEBUG_ALLOC2
+     printf("grow_refcount_table from %d to %d\n",
+            s->refcount_table_size,
+            new_table_size);
+ #endif
+     new_table_size2 = new_table_size * sizeof(uint64_t);
+     new_table = qemu_mallocz(new_table_size2);
+     memcpy(new_table, s->refcount_table,
+            s->refcount_table_size * sizeof(uint64_t));
+     /* convert the copied entries to big-endian for the on-disk write */
+     for(i = 0; i < s->refcount_table_size; i++)
+         cpu_to_be64s(&new_table[i]);
+     /* Note: we cannot update the refcount now to avoid recursion */
+     table_offset = alloc_clusters_noref(bs, new_table_size2);
+     ret = bdrv_pwrite(s->hd, table_offset, new_table, new_table_size2);
+     if (ret != new_table_size2)
+         goto fail;
+     /* back to host byte order for in-memory use */
+     for(i = 0; i < s->refcount_table_size; i++)
+         be64_to_cpus(&new_table[i]);
+
+     /* 12 bytes: 64-bit table offset followed by 32-bit cluster count */
+     cpu_to_be64w((uint64_t*)data, table_offset);
+     cpu_to_be32w((uint32_t*)(data + 8), refcount_table_clusters);
+     if (bdrv_pwrite(s->hd, offsetof(QCowHeader, refcount_table_offset),
+                     data, sizeof(data)) != sizeof(data))
+         goto fail;
+     qemu_free(s->refcount_table);
+     old_table_offset = s->refcount_table_offset;
+     old_table_size = s->refcount_table_size;
+     s->refcount_table = new_table;
+     s->refcount_table_size = new_table_size;
+     s->refcount_table_offset = table_offset;
+
+     /* now that the new table is live, account for it and release the old one */
+     update_refcount(bs, table_offset, new_table_size2, 1);
+     free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
+     return 0;
+  fail:
+     /*
+      * NOTE(review): the clusters at table_offset came from
+      * alloc_clusters_noref() and never had their refcount raised, so this
+      * decrement looks like it cannot succeed -- confirm whether it is
+      * intentional.
+      */
+     free_clusters(bs, table_offset, new_table_size2);
+     qemu_free(new_table);
+     return -EIO;
+ }
+
+ /*
+  * Add addend (which must be 1 or -1) to the refcount of cluster_index and
+  * write the new value through to the image.
+  *
+  * If the cluster is not yet covered, the refcount table is grown and/or a
+  * new zeroed refcount block is allocated first (increments only).
+  *
+  * Returns the new refcount, or a negative value on failure:
+  *   -EINVAL  decrement of an untracked cluster, or the result would leave
+  *            the range 0..0xffff
+  *   -EIO     reading or writing refcount metadata failed
+  */
+ /* XXX: cache several refcount block clusters ? */
+ static int update_cluster_refcount(BlockDriverState *bs,
+                                    int64_t cluster_index,
+                                    int addend)
+ {
+     BDRVQcowState *s = bs->opaque;
+     int64_t offset, refcount_block_offset;
+     int ret, refcount_table_index, block_index, refcount;
+     uint64_t data64;
+
+     refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
+     if (refcount_table_index >= s->refcount_table_size) {
+         if (addend < 0)
+             return -EINVAL;
+         ret = grow_refcount_table(bs, refcount_table_index + 1);
+         if (ret < 0)
+             return ret;
+     }
+     refcount_block_offset = s->refcount_table[refcount_table_index];
+     if (!refcount_block_offset) {
+         if (addend < 0)
+             return -EINVAL;
+         /* create a new refcount block */
+         /* Note: we cannot update the refcount now to avoid recursion */
+         offset = alloc_clusters_noref(bs, s->cluster_size);
+         /*
+          * The cache buffer is about to be overwritten, so detach it from
+          * whatever block it held. Offset 0 never names a refcount block
+          * (a zero table entry means "unallocated"), so a failure below
+          * cannot leave zeroed data tagged as a real block.
+          */
+         s->refcount_block_cache_offset = 0;
+         memset(s->refcount_block_cache, 0, s->cluster_size);
+         ret = bdrv_pwrite(s->hd, offset, s->refcount_block_cache, s->cluster_size);
+         if (ret != s->cluster_size)
+             return -EIO;
+         data64 = cpu_to_be64(offset);
+         ret = bdrv_pwrite(s->hd, s->refcount_table_offset +
+                           refcount_table_index * sizeof(uint64_t),
+                           &data64, sizeof(data64));
+         if (ret != sizeof(data64))
+             return -EIO;
+
+         /* publish in memory only once the on-disk table entry exists */
+         s->refcount_table[refcount_table_index] = offset;
+         refcount_block_offset = offset;
+         s->refcount_block_cache_offset = offset;
+         update_refcount(bs, offset, s->cluster_size, 1);
+         /*
+          * Accounting for the new block may have pulled a different
+          * refcount block into the single-entry cache; make sure the one
+          * we are about to modify is loaded again.
+          */
+         if (refcount_block_offset != s->refcount_block_cache_offset) {
+             if (load_refcount_block(bs, refcount_block_offset) < 0)
+                 return -EIO;
+         }
+     } else {
+         if (refcount_block_offset != s->refcount_block_cache_offset) {
+             if (load_refcount_block(bs, refcount_block_offset) < 0)
+                 return -EIO;
+         }
+     }
+     /* we can update the count and save it */
+     block_index = cluster_index &
+         ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
+     refcount = be16_to_cpu(s->refcount_block_cache[block_index]);
+     refcount += addend;
+     if (refcount < 0 || refcount > 0xffff)
+         return -EINVAL;
+     /* a cluster that just became free is a better allocation start */
+     if (refcount == 0 && cluster_index < s->free_cluster_index) {
+         s->free_cluster_index = cluster_index;
+     }
+     s->refcount_block_cache[block_index] = cpu_to_be16(refcount);
+     if (bdrv_pwrite(s->hd,
+                     refcount_block_offset + (block_index << REFCOUNT_SHIFT),
+                     &s->refcount_block_cache[block_index], 2) != 2)
+         return -EIO;
+     return refcount;
+ }
+
+ /*
+  * Apply addend to the refcount of every cluster touched by the byte range
+  * [offset, offset + length). Does nothing when length <= 0. Failures of
+  * the per-cluster update are not reported.
+  */
+ static void update_refcount(BlockDriverState *bs,
+                             int64_t offset, int64_t length,
+                             int addend)
+ {
+     BDRVQcowState *s = bs->opaque;
+     int64_t pos, final_pos;
+
+ #ifdef DEBUG_ALLOC2
+     printf("update_refcount: offset=%lld size=%lld addend=%d\n",
+            offset, length, addend);
+ #endif
+     if (length <= 0) {
+         return;
+     }
+     /* round both ends down to the start of their cluster */
+     pos = offset & ~(s->cluster_size - 1);
+     final_pos = (offset + length - 1) & ~(s->cluster_size - 1);
+     while (pos <= final_pos) {
+         update_cluster_refcount(bs, pos >> s->cluster_bits, addend);
+         pos += s->cluster_size;
+     }
+ }
+
+/*
+ * Increases the refcount for a range of clusters in a given refcount table.
+ * This is used to construct a temporary refcount table out of L1 and L2 tables
+ * which can be compared the the refcount table saved in the image.
+ *
+ * Returns the number of errors in the image that were found
+ */
+static int inc_refcounts(BlockDriverState *bs,
+ uint16_t *refcount_table,
+ int refcount_table_size,
+ int64_t offset, int64_t size)
+{
+ BDRVQcowState *s = bs->opaque;
+ int64_t start, last, cluster_offset;
+ int k;
+ int errors = 0;
+
+ if (size <= 0)
+ return 0;
+
+ start = offset & ~(s->cluster_size - 1);
+ last = (offset + size - 1) & ~(s->cluster_size - 1);
+ for(cluster_offset = start; cluster_offset <= last;
+ cluster_offset += s->cluster_size) {
+ k = cluster_offset >> s->cluster_bits;
+ if (k < 0 || k >= refcount_table_size) {
+ fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n",
+ cluster_offset);
+ errors++;
+ } else {
+ if (++refcount_table[k] == 0) {
+ fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
+ "\n", cluster_offset);
+ errors++;
+ }
+ }
+ }
+
+ return errors;
+}
+
+/*
+ * Increases the refcount in the given refcount table for the all clusters
+ * referenced in the L2 table. While doing so, performs some checks on L2
+ * entries.
+ *
+ * Returns the number of errors found by the checks or -errno if an internal
+ * error occurred.
+ */
+static int check_refcounts_l2(BlockDriverState *bs,
+ uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset,
+ int check_copied)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t *l2_table, offset;
+ int i, l2_size, nb_csectors, refcount;
+ int errors = 0;
+
+ /* Read L2 table from disk */
+ l2_size = s->l2_size * sizeof(uint64_t);
+ l2_table = qemu_malloc(l2_size);
+
+ if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
+ goto fail;
+
+ /* Do the actual checks */
+ for(i = 0; i < s->l2_size; i++) {
+ offset = be64_to_cpu(l2_table[i]);
+ if (offset != 0) {
+ if (offset & QCOW_OFLAG_COMPRESSED) {
+ /* Compressed clusters don't have QCOW_OFLAG_COPIED */
+ if (offset & QCOW_OFLAG_COPIED) {
+ fprintf(stderr, "ERROR: cluster %" PRId64 ": "
+ "copied flag must never be set for compressed "
+ "clusters\n", offset >> s->cluster_bits);
+ offset &= ~QCOW_OFLAG_COPIED;
+ errors++;
+ }
+
+ /* Mark cluster as used */
+ nb_csectors = ((offset >> s->csize_shift) &
+ s->csize_mask) + 1;
+ offset &= s->cluster_offset_mask;
+ errors += inc_refcounts(bs, refcount_table,
+ refcount_table_size,
+ offset & ~511, nb_csectors * 512);
+ } else {
+ /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
+ if (check_copied) {
+ uint64_t entry = offset;
+ offset &= ~QCOW_OFLAG_COPIED;
+ refcount = get_refcount(bs, offset >> s->cluster_bits);
+ if ((refcount == 1) != ((entry & QCOW_OFLAG_COPIED) != 0)) {
+ fprintf(stderr, "ERROR OFLAG_COPIED: offset=%"
+ PRIx64 " refcount=%d\n", entry, refcount);
+ errors++;
+ }
+ }
+
+ /* Mark cluster as used */
+ offset &= ~QCOW_OFLAG_COPIED;
+ errors += inc_refcounts(bs, refcount_table,
+ refcount_table_size,
+ offset, s->cluster_size);
+
+ /* Correct offsets are cluster aligned */
+ if (offset & (s->cluster_size - 1)) {
+ fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
+ "properly aligned; L2 entry corrupted.\n", offset);
+ errors++;
+ }
+ }
+ }
+ }
+
+ qemu_free(l2_table);
+ return errors;
+
+fail:
+ fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
+ qemu_free(l2_table);
+ return -EIO;
+}
+
+/*
+ * Increases the refcount for the L1 table, its L2 tables and all referenced
+ * clusters in the given refcount table. While doing so, performs some checks
+ * on L1 and L2 entries.
+ *
+ * Returns the number of errors found by the checks or -errno if an internal
+ * error occurred.
+ */
+static int check_refcounts_l1(BlockDriverState *bs,
+ uint16_t *refcount_table,
+ int refcount_table_size,
+ int64_t l1_table_offset, int l1_size,
+ int check_copied)
+{
+ BDRVQcowState *s = bs->opaque;
+ uint64_t *l1_table, l2_offset, l1_size2;
+ int i, refcount, ret;
+ int errors = 0;
+
+ l1_size2 = l1_size * sizeof(uint64_t);
+
+ /* Mark L1 table as used */
+ errors += inc_refcounts(bs, refcount_table, refcount_table_size,
+ l1_table_offset, l1_size2);
+
+ /* Read L1 table entries from disk */
+ l1_table = qemu_malloc(l1_size2);
+ if (bdrv_pread(s->hd, l1_table_offset,
+ l1_table, l1_size2) != l1_size2)
+ goto fail;
+ for(i = 0;i < l1_size; i++)
+ be64_to_cpus(&l1_table[i]);
+
+ /* Do the actual checks */
+ for(i = 0; i < l1_size; i++) {
+ l2_offset = l1_table[i];
+ if (l2_offset) {
+ /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
+ if (check_copied) {
+ refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED)
+ >> s->cluster_bits);
+ if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) {
+ fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64
+ " refcount=%d\n", l2_offset, refcount);
+ errors++;
+ }
+ }
+
+ /* Mark L2 table as used */
+ l2_offset &= ~QCOW_OFLAG_COPIED;
+ errors += inc_refcounts(bs, refcount_table,
+ refcount_table_size,
+ l2_offset,
+ s->cluster_size);
+
+ /* L2 tables are cluster aligned */
+ if (l2_offset & (s->cluster_size - 1)) {
+ fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
+ "cluster aligned; L1 entry corrupted\n", l2_offset);
+ errors++;
+ }
+
+ /* Process and check L2 entries */
+ ret = check_refcounts_l2(bs, refcount_table, refcount_table_size,
+ l2_offset, check_copied);
+ if (ret < 0) {
+ goto fail;
+ }
+ errors += ret;
+ }
+ }
+ qemu_free(l1_table);
+ return errors;
+
+fail:
+ fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
+ qemu_free(l1_table);
+ return -EIO;
+}
+
+/*
+ * Checks an image for refcount consistency.
+ *
+ * Returns 0 if no errors are found, the number of errors in case the image is
+ * detected as corrupted, and -errno when an internal error occured.
+ */
+static int check_refcounts(BlockDriverState *bs)
+{
+ BDRVQcowState *s = bs->opaque;
+ int64_t size;
+ int nb_clusters, refcount1, refcount2, i;
+ QCowSnapshot *sn;
+ uint16_t *refcount_table;
+ int ret, errors = 0;
+
+ size = bdrv_getlength(s->hd);
+ nb_clusters = size_to_clusters(s, size);
+ refcount_table = qemu_mallocz(nb_clusters * sizeof(uint16_t));
+
+ /* header */
+ errors += inc_refcounts(bs, refcount_table, nb_clusters,
+ 0, s->cluster_size);
+
+ /* current L1 table */
+ ret = check_refcounts_l1(bs, refcount_table, nb_clusters,
+ s->l1_table_offset, s->l1_size, 1);
+ if (ret < 0) {
+ return ret;
+ }
+ errors += ret;
+
+ /* snapshots */
+ for(i = 0; i < s->nb_snapshots; i++) {
+ sn = s->snapshots + i;
+ check_refcounts_l1(bs, refcount_table, nb_clusters,
+ sn->l1_table_offset, sn->l1_size, 0);
+ }
+ errors += inc_refcounts(bs, refcount_table, nb_clusters,
+ s->snapshots_offset, s->snapshots_size);
+
+ /* refcount data */
+ errors += inc_refcounts(bs, refcount_table, nb_clusters,
+ s->refcount_table_offset,
+ s->refcount_table_size * sizeof(uint64_t));
+ for(i = 0; i < s->refcount_table_size; i++) {
+ int64_t offset;
+ offset = s->refcount_table[i];
+ if (offset != 0) {
+ errors += inc_refcounts(bs, refcount_table, nb_clusters,
+ offset, s->cluster_size);
+ }
+ }
+
+ /* compare ref counts */
+ for(i = 0; i < nb_clusters; i++) {
+ refcount1 = get_refcount(bs, i);
+ refcount2 = refcount_table[i];
+ if (refcount1 != refcount2) {
+ fprintf(stderr, "ERROR cluster %d refcount=%d reference=%d\n",
+ i, refcount1, refcount2);
+ errors++;
+ }
+ }
+
+ qemu_free(refcount_table);
+
+ return errors;
+}
+
+ /*
+  * Consistency-check entry point installed as .bdrv_check in the qcow2
+  * driver table; it simply returns the result of check_refcounts().
+  */
+ static int qcow_check(BlockDriverState *bs)
+ {
+     return check_refcounts(bs);
+ }
+
+ #if 0
+ /*
+  * Debug helper (compiled out): prints the stored refcounts of the whole
+  * image file, collapsing runs of consecutive clusters that share the same
+  * refcount into one line.
+  */
+ static void dump_refcounts(BlockDriverState *bs)
+ {
+     BDRVQcowState *s = bs->opaque;
+     int64_t nb_clusters, k, k1, size;
+     int refcount;
+
+     size = bdrv_getlength(s->hd);
+     nb_clusters = size_to_clusters(s, size);
+     for(k = 0; k < nb_clusters;) {
+         /* k1 marks the first cluster of the current run */
+         k1 = k;
+         refcount = get_refcount(bs, k);
+         k++;
+         while (k < nb_clusters && get_refcount(bs, k) == refcount)
+             k++;
+         /*
+          * NOTE(review): the first value printed is k (one past the run),
+          * not k1 (its start) -- confirm which one was intended.
+          */
+         printf("%lld: refcount=%d nb=%lld\n", k, refcount, k - k1);
+     }
+ }
+ #endif
+
+ /*
+  * Write size bytes from buf at byte position pos of the qcow2 device,
+  * temporarily marking it growable so the write may extend past the
+  * current end. The previous growable setting is restored afterwards.
+  *
+  * Returns the result of bdrv_pwrite(), so a failed write is reported to
+  * the caller instead of being silently turned into success.
+  */
+ static int qcow_put_buffer(BlockDriverState *bs, const uint8_t *buf,
+                            int64_t pos, int size)
+ {
+     int growable = bs->growable;
+     int ret;
+
+     bs->growable = 1;
+     ret = bdrv_pwrite(bs, pos, buf, size);
+     bs->growable = growable;
+
+     return ret;
+ }
+
+ /*
+  * Read size bytes into buf from byte position pos of the qcow2 device.
+  * The device is marked growable for the duration of the read and the
+  * previous setting is put back afterwards.
+  *
+  * Returns whatever bdrv_pread() returned.
+  */
+ static int qcow_get_buffer(BlockDriverState *bs, uint8_t *buf,
+                            int64_t pos, int size)
+ {
+     const int saved_growable = bs->growable;
+     int nread;
+
+     bs->growable = 1;
+     nread = bdrv_pread(bs, pos, buf, size);
+     bs->growable = saved_growable;
+     return nread;
+ }
+
+ /*
+  * Driver table for the "qcow2" format: binds the generic block-layer
+  * callbacks to the qcow_* implementations in this file. It is registered
+  * by bdrv_qcow2_init().
+  */
+ static BlockDriver bdrv_qcow2 = {
+     .format_name = "qcow2",
+     .instance_size = sizeof(BDRVQcowState),
+     .bdrv_probe = qcow_probe,
+     .bdrv_open = qcow_open,
+     .bdrv_close = qcow_close,
+     .bdrv_create = qcow_create,
+     .bdrv_flush = qcow_flush,
+     .bdrv_is_allocated = qcow_is_allocated,
+     .bdrv_set_key = qcow_set_key,
+     .bdrv_make_empty = qcow_make_empty,
+
+     /* asynchronous I/O */
+     .bdrv_aio_readv = qcow_aio_readv,
+     .bdrv_aio_writev = qcow_aio_writev,
+     .bdrv_aio_cancel = qcow_aio_cancel,
+     .aiocb_size = sizeof(QCowAIOCB),
+     .bdrv_write_compressed = qcow_write_compressed,
+
+     /* internal snapshots */
+     .bdrv_snapshot_create = qcow_snapshot_create,
+     .bdrv_snapshot_goto = qcow_snapshot_goto,
+     .bdrv_snapshot_delete = qcow_snapshot_delete,
+     .bdrv_snapshot_list = qcow_snapshot_list,
+     .bdrv_get_info = qcow_get_info,
+
+     /* raw buffer access that may grow the device (see qcow_put_buffer) */
+     .bdrv_put_buffer = qcow_put_buffer,
+     .bdrv_get_buffer = qcow_get_buffer,
+
+     .bdrv_create2 = qcow_create2,
+     .bdrv_check = qcow_check,
+ };
+
+ /* Register the qcow2 driver table with the block layer. */
+ static void bdrv_qcow2_init(void)
+ {
+     bdrv_register(&bdrv_qcow2);
+ }
+
+ /* Hook the registration function into the block_init module mechanism. */
+ block_init(bdrv_qcow2_init);
diff --git a/block/raw-posix.c b/block/raw-posix.c
new file mode 100644
index 0000000000..f3a9476b44
--- /dev/null
+++ b/block/raw-posix.c
@@ -0,0 +1,1438 @@
+/*
+ * Block driver for RAW files (posix)
+ *
+ * Copyright (c) 2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "qemu-timer.h"
+#include "qemu-char.h"
+#include "block_int.h"
+#include "module.h"
+#ifdef CONFIG_AIO
+#include "posix-aio-compat.h"
+#endif
+
+#ifdef CONFIG_COCOA
+#include <paths.h>
+#include <sys/param.h>
+#include <IOKit/IOKitLib.h>
+#include <IOKit/IOBSD.h>
+#include <IOKit/storage/IOMediaBSDClient.h>
+#include <IOKit/storage/IOMedia.h>
+#include <IOKit/storage/IOCDMedia.h>
+//#include <IOKit/storage/IOCDTypes.h>
+#include <CoreFoundation/CoreFoundation.h>
+#endif
+
+#ifdef __sun__
+#define _POSIX_PTHREAD_SEMANTICS 1
+#include <signal.h>
+#include <sys/dkio.h>
+#endif
+#ifdef __linux__
+#include <sys/ioctl.h>
+#include <linux/cdrom.h>
+#include <linux/fd.h>
+#endif
+#ifdef __FreeBSD__
+#include <signal.h>
+#include <sys/disk.h>
+#include <sys/cdio.h>
+#endif
+
+#ifdef __OpenBSD__
+#include <sys/ioctl.h>
+#include <sys/disklabel.h>
+#include <sys/dkio.h>
+#endif
+
+#ifdef __DragonFly__
+#include <sys/ioctl.h>
+#include <sys/diskslice.h>
+#endif
+
+ //#define DEBUG_FLOPPY
+
+ /*
+  * Define DEBUG_BLOCK to make DEBUG_BLOCK_PRINT() write to the QEMU log
+  * (and flush it) whenever logging is enabled; otherwise the macro expands
+  * to nothing.
+  */
+ //#define DEBUG_BLOCK
+ #if defined(DEBUG_BLOCK)
+ #define DEBUG_BLOCK_PRINT(formatCstr, ...) do { if (qemu_log_enabled()) \
+ { qemu_log(formatCstr, ## __VA_ARGS__); qemu_log_flush(); } } while (0)
+ #else
+ #define DEBUG_BLOCK_PRINT(formatCstr, ...)
+ #endif
+
+ /* OS X does not have O_DSYNC */
+ #ifndef O_DSYNC
+ #define O_DSYNC O_SYNC
+ #endif
+
+ /* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
+ #ifndef O_DIRECT
+ #define O_DIRECT O_DSYNC
+ #endif
+
+ /* Values for BDRVRawState.type; raw_open() always selects FTYPE_FILE. */
+ #define FTYPE_FILE 0
+ #define FTYPE_CD 1
+ #define FTYPE_FD 2
+
+ /* Size in bytes of the bounce buffer used for O_DIRECT I/O (32 sectors). */
+ #define ALIGNED_BUFFER_SIZE (32 * 512)
+
+ /* if the FD is not accessed during that time (in ms), we try to
+    reopen it to see if the disk has been changed */
+ #define FD_OPEN_TIMEOUT 1000
+
+ /* Per-device state of the raw POSIX driver (stored in bs->opaque). */
+ typedef struct BDRVRawState {
+     int fd;                      /* host file descriptor; set to -1 by raw_close() */
+     int type;                    /* one of the FTYPE_* constants */
+     unsigned int lseek_err_cnt;  /* consecutive lseek() failures, reset on success */
+ #if defined(__linux__)
+     /* linux floppy specific */
+     /* NOTE(review): these fields are not used in the visible part of the
+        file; presumably maintained by fd_open() -- confirm there */
+     int fd_open_flags;
+     int64_t fd_open_time;
+     int64_t fd_error_time;
+     int fd_got_error;
+     int fd_media_changed;
+ #endif
+ #if defined(__FreeBSD__)
+     int cd_open_flags;           /* presumably for cd_open(); not used in visible code */
+ #endif
+     /* bounce buffer of ALIGNED_BUFFER_SIZE bytes, allocated by raw_open()
+        only when BDRV_O_NOCACHE is requested; NULL otherwise */
+     uint8_t* aligned_buf;
+ } BDRVRawState;
+
+ /*
+  * Forward declarations of helpers that are called before their
+  * definitions appear in this file.
+  */
+ static int posix_aio_init(void);
+
+ static int fd_open(BlockDriverState *bs);
+
+ #if defined(__FreeBSD__)
+ static int cd_open(BlockDriverState *bs);
+ #endif
+
+ static int raw_is_inserted(BlockDriverState *bs);
+
+ /*
+  * Open filename as a raw image.
+  *
+  * The BDRV_O_* bits in flags are translated to open(2) flags; the device
+  * is marked read-only unless read/write access was requested. With
+  * BDRV_O_NOCACHE a bounce buffer is allocated for unaligned requests.
+  *
+  * Returns 0 on success or a negative errno value (EROFS is reported as
+  * -EACCES).
+  */
+ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
+ {
+     BDRVRawState *s = bs->opaque;
+     const int want_direct = (flags & BDRV_O_NOCACHE) != 0;
+     int fd, open_flags, ret;
+
+     posix_aio_init();
+
+     s->lseek_err_cnt = 0;
+     s->type = FTYPE_FILE;
+
+     open_flags = O_BINARY;
+     if ((flags & BDRV_O_ACCESS) != O_RDWR) {
+         open_flags |= O_RDONLY;
+         bs->read_only = 1;
+     } else {
+         open_flags |= O_RDWR;
+     }
+     if (flags & BDRV_O_CREAT) {
+         open_flags |= O_CREAT | O_TRUNC;
+     }
+
+     /* Use O_DSYNC for write-through caching, no flags for write-back caching,
+      * and O_DIRECT for no caching. */
+     if (want_direct) {
+         open_flags |= O_DIRECT;
+     } else if (!(flags & BDRV_O_CACHE_WB)) {
+         open_flags |= O_DSYNC;
+     }
+
+     fd = open(filename, open_flags, 0644);
+     if (fd < 0) {
+         return (errno == EROFS) ? -EACCES : -errno;
+     }
+     s->fd = fd;
+     s->aligned_buf = NULL;
+     if (want_direct) {
+         s->aligned_buf = qemu_blockalign(bs, ALIGNED_BUFFER_SIZE);
+         if (s->aligned_buf == NULL) {
+             ret = -errno;
+             close(fd);
+             return ret;
+         }
+     }
+     return 0;
+ }
+
+/* XXX: use host sector size if necessary with:
+#ifdef DIOCGSECTORSIZE
+ {
+ unsigned int sectorsize = 512;
+ if (!ioctl(fd, DIOCGSECTORSIZE, &sectorsize) &&
+ sectorsize > bufsize)
+ bufsize = sectorsize;
+ }
+#endif
+#ifdef CONFIG_COCOA
+ u_int32_t blockSize = 512;
+ if ( !ioctl( fd, DKIOCGETBLOCKSIZE, &blockSize ) && blockSize > bufsize) {
+ bufsize = blockSize;
+ }
+#endif
+*/
+
+/*
+ * offset and count are in bytes, but must be multiples of 512 for files
+ * opened with O_DIRECT. buf must be aligned to 512 bytes then.
+ *
+ * This function may be called without alignment if the caller ensures
+ * that O_DIRECT is not in effect.
+ */
+static int raw_pread_aligned(BlockDriverState *bs, int64_t offset,
+ uint8_t *buf, int count)
+{
+ BDRVRawState *s = bs->opaque;
+ int ret;
+
+ ret = fd_open(bs);
+ if (ret < 0)
+ return ret;
+
+ if (offset >= 0 && lseek(s->fd, offset, SEEK_SET) == (off_t)-1) {
+ ++(s->lseek_err_cnt);
+ if(s->lseek_err_cnt <= 10) {
+ DEBUG_BLOCK_PRINT("raw_pread(%d:%s, %" PRId64 ", %p, %d) [%" PRId64
+ "] lseek failed : %d = %s\n",
+ s->fd, bs->filename, offset, buf, count,
+ bs->total_sectors, errno, strerror(errno));
+ }
+ return -1;
+ }
+ s->lseek_err_cnt=0;
+
+ ret = read(s->fd, buf, count);
+ if (ret == count)
+ goto label__raw_read__success;
+
+ DEBUG_BLOCK_PRINT("raw_pread(%d:%s, %" PRId64 ", %p, %d) [%" PRId64
+ "] read failed %d : %d = %s\n",
+ s->fd, bs->filename, offset, buf, count,
+ bs->total_sectors, ret, errno, strerror(errno));
+
+ /* Try harder for CDrom. */
+ if (bs->type == BDRV_TYPE_CDROM) {
+ lseek(s->fd, offset, SEEK_SET);
+ ret = read(s->fd, buf, count);
+ if (ret == count)
+ goto label__raw_read__success;
+ lseek(s->fd, offset, SEEK_SET);
+ ret = read(s->fd, buf, count);
+ if (ret == count)
+ goto label__raw_read__success;
+
+ DEBUG_BLOCK_PRINT("raw_pread(%d:%s, %" PRId64 ", %p, %d) [%" PRId64
+ "] retry read failed %d : %d = %s\n",
+ s->fd, bs->filename, offset, buf, count,
+ bs->total_sectors, ret, errno, strerror(errno));
+ }
+
+label__raw_read__success:
+
+ return ret;
+}
+
+/*
+ * offset and count are in bytes, but must be multiples of 512 for files
+ * opened with O_DIRECT. buf must be aligned to 512 bytes then.
+ *
+ * This function may be called without alignment if the caller ensures
+ * that O_DIRECT is not in effect.
+ */
+static int raw_pwrite_aligned(BlockDriverState *bs, int64_t offset,
+ const uint8_t *buf, int count)
+{
+ BDRVRawState *s = bs->opaque;
+ int ret;
+
+ ret = fd_open(bs);
+ if (ret < 0)
+ return -errno;
+
+ if (offset >= 0 && lseek(s->fd, offset, SEEK_SET) == (off_t)-1) {
+ ++(s->lseek_err_cnt);
+ if(s->lseek_err_cnt) {
+ DEBUG_BLOCK_PRINT("raw_pwrite(%d:%s, %" PRId64 ", %p, %d) [%"
+ PRId64 "] lseek failed : %d = %s\n",
+ s->fd, bs->filename, offset, buf, count,
+ bs->total_sectors, errno, strerror(errno));
+ }
+ return -EIO;
+ }
+ s->lseek_err_cnt = 0;
+
+ ret = write(s->fd, buf, count);
+ if (ret == count)
+ goto label__raw_write__success;
+
+ DEBUG_BLOCK_PRINT("raw_pwrite(%d:%s, %" PRId64 ", %p, %d) [%" PRId64
+ "] write failed %d : %d = %s\n",
+ s->fd, bs->filename, offset, buf, count,
+ bs->total_sectors, ret, errno, strerror(errno));
+
+label__raw_write__success:
+
+ return (ret < 0) ? -errno : ret;
+}
+
+
+/*
+ * offset and count are in bytes and possibly not aligned. For files opened
+ * with O_DIRECT, necessary alignments are ensured before calling
+ * raw_pread_aligned to do the actual read.
+ */
+static int raw_pread(BlockDriverState *bs, int64_t offset,
+ uint8_t *buf, int count)
+{
+ BDRVRawState *s = bs->opaque;
+ int size, ret, shift, sum;
+
+ sum = 0;
+
+ if (s->aligned_buf != NULL) {
+
+ if (offset & 0x1ff) {
+ /* align offset on a 512 bytes boundary */
+
+ shift = offset & 0x1ff;
+ size = (shift + count + 0x1ff) & ~0x1ff;
+ if (size > ALIGNED_BUFFER_SIZE)
+ size = ALIGNED_BUFFER_SIZE;
+ ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, size);
+ if (ret < 0)
+ return ret;
+
+ size = 512 - shift;
+ if (size > count)
+ size = count;
+ memcpy(buf, s->aligned_buf + shift, size);
+
+ buf += size;
+ offset += size;
+ count -= size;
+ sum += size;
+
+ if (count == 0)
+ return sum;
+ }
+ if (count & 0x1ff || (uintptr_t) buf & 0x1ff) {
+
+ /* read on aligned buffer */
+
+ while (count) {
+
+ size = (count + 0x1ff) & ~0x1ff;
+ if (size > ALIGNED_BUFFER_SIZE)
+ size = ALIGNED_BUFFER_SIZE;
+
+ ret = raw_pread_aligned(bs, offset, s->aligned_buf, size);
+ if (ret < 0)
+ return ret;
+
+ size = ret;
+ if (size > count)
+ size = count;
+
+ memcpy(buf, s->aligned_buf, size);
+
+ buf += size;
+ offset += size;
+ count -= size;
+ sum += size;
+ }
+
+ return sum;
+ }
+ }
+
+ return raw_pread_aligned(bs, offset, buf, count) + sum;
+}
+
+ /*
+  * Sector-based read: fetch nb_sectors 512-byte sectors starting at
+  * sector_num into buf. Returns 0 when every byte was read; otherwise the
+  * raw_pread() result (error code or short byte count) is passed through.
+  */
+ static int raw_read(BlockDriverState *bs, int64_t sector_num,
+                     uint8_t *buf, int nb_sectors)
+ {
+     const int wanted = nb_sectors * 512;
+     int got = raw_pread(bs, sector_num * 512, buf, wanted);
+
+     return (got == wanted) ? 0 : got;
+ }
+
+/*
+ * offset and count are in bytes and possibly not aligned. For files opened
+ * with O_DIRECT, necessary alignments are ensured before calling
+ * raw_pwrite_aligned to do the actual write.
+ */
+static int raw_pwrite(BlockDriverState *bs, int64_t offset,
+ const uint8_t *buf, int count)
+{
+ BDRVRawState *s = bs->opaque;
+ int size, ret, shift, sum;
+
+ sum = 0;
+
+ if (s->aligned_buf != NULL) {
+
+ if (offset & 0x1ff) {
+ /* align offset on a 512 bytes boundary */
+ shift = offset & 0x1ff;
+ ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, 512);
+ if (ret < 0)
+ return ret;
+
+ size = 512 - shift;
+ if (size > count)
+ size = count;
+ memcpy(s->aligned_buf + shift, buf, size);
+
+ ret = raw_pwrite_aligned(bs, offset - shift, s->aligned_buf, 512);
+ if (ret < 0)
+ return ret;
+
+ buf += size;
+ offset += size;
+ count -= size;
+ sum += size;
+
+ if (count == 0)
+ return sum;
+ }
+ if (count & 0x1ff || (uintptr_t) buf & 0x1ff) {
+
+ while ((size = (count & ~0x1ff)) != 0) {
+
+ if (size > ALIGNED_BUFFER_SIZE)
+ size = ALIGNED_BUFFER_SIZE;
+
+ memcpy(s->aligned_buf, buf, size);
+
+ ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, size);
+ if (ret < 0)
+ return ret;
+
+ buf += ret;
+ offset += ret;
+ count -= ret;
+ sum += ret;
+ }
+ /* here, count < 512 because (count & ~0x1ff) == 0 */
+ if (count) {
+ ret = raw_pread_aligned(bs, offset, s->aligned_buf, 512);
+ if (ret < 0)
+ return ret;
+ memcpy(s->aligned_buf, buf, count);
+
+ ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, 512);
+ if (ret < 0)
+ return ret;
+ if (count < ret)
+ ret = count;
+
+ sum += ret;
+ }
+ return sum;
+ }
+ }
+ return raw_pwrite_aligned(bs, offset, buf, count) + sum;
+}
+
+ /*
+  * Sector-based write: store nb_sectors 512-byte sectors from buf starting
+  * at sector_num. Returns 0 when every byte was written; otherwise the
+  * raw_pwrite() result (error code or short byte count) is passed through.
+  */
+ static int raw_write(BlockDriverState *bs, int64_t sector_num,
+                      const uint8_t *buf, int nb_sectors)
+ {
+     const int wanted = nb_sectors * 512;
+     int done = raw_pwrite(bs, sector_num * 512, buf, wanted);
+
+     return (done == wanted) ? 0 : done;
+ }
+
+#ifdef CONFIG_AIO
+/***********************************************************/
+/* Unix AIO using POSIX AIO */
+
+ /* One in-flight asynchronous request of the raw POSIX driver. */
+ typedef struct RawAIOCB {
+     BlockDriverAIOCB common;    /* generic part: completion callback + opaque */
+     struct qemu_paiocb aiocb;   /* request descriptor passed to qemu_paio_*() */
+     struct RawAIOCB *next;      /* link in PosixAioState.first_aio */
+     int ret;
+ } RawAIOCB;
+
+ /* Global AIO bookkeeping: the notification pipe and the pending list. */
+ typedef struct PosixAioState
+ {
+     int rfd, wfd;               /* read / write end of the signal pipe */
+     RawAIOCB *first_aio;        /* head of the singly linked pending list */
+ } PosixAioState;
+
+ /*
+  * Handler for the read end of the AIO notification pipe (registered in
+  * posix_aio_init()). Drains the pipe, then walks the pending list and, for
+  * every request that is no longer in progress, unlinks it, invokes its
+  * completion callback (except for cancelled requests) and releases it.
+  */
+ static void posix_aio_read(void *opaque)
+ {
+     PosixAioState *s = opaque;
+     RawAIOCB *acb, **pacb;
+     int ret;
+     ssize_t len;
+
+     /* read all bytes from signal pipe */
+     for (;;) {
+         char bytes[16];
+
+         len = read(s->rfd, bytes, sizeof(bytes));
+         if (len == -1 && errno == EINTR)
+             continue; /* try again */
+         if (len == sizeof(bytes))
+             continue; /* more to read */
+         break;
+     }
+
+     /* outer loop: rescan from the head after each completed request */
+     for(;;) {
+         pacb = &s->first_aio;
+         for(;;) {
+             acb = *pacb;
+             if (!acb)
+                 goto the_end;
+             ret = qemu_paio_error(&acb->aiocb);
+             if (ret == ECANCELED) {
+                 /* remove the request */
+                 *pacb = acb->next;
+                 qemu_aio_release(acb);
+             } else if (ret != EINPROGRESS) {
+                 /* end of aio */
+                 if (ret == 0) {
+                     /* success only if the full byte count was transferred */
+                     ret = qemu_paio_return(&acb->aiocb);
+                     if (ret == acb->aiocb.aio_nbytes)
+                         ret = 0;
+                     else
+                         ret = -EINVAL;
+                 } else {
+                     ret = -ret;
+                 }
+                 /* remove the request */
+                 *pacb = acb->next;
+                 /* call the callback */
+                 acb->common.cb(acb->common.opaque, ret);
+                 qemu_aio_release(acb);
+                 /* restart the scan from the list head after the callback */
+                 break;
+             } else {
+                 /* still in progress: leave it queued and move on */
+                 pacb = &acb->next;
+             }
+         }
+     }
+  the_end: ;
+ }
+
+ /* Report whether any asynchronous request is still pending (1) or not (0). */
+ static int posix_aio_flush(void *opaque)
+ {
+     const PosixAioState *state = opaque;
+
+     return state->first_aio != NULL;
+ }
+
+ /* Singleton AIO state; NULL until posix_aio_init() has completed. */
+ static PosixAioState *posix_aio_state;
+
+ /*
+  * Signal handler installed for SIGUSR2 by posix_aio_init(): writes one
+  * byte into the notification pipe so that posix_aio_read() gets run, then
+  * calls qemu_service_io().
+  */
+ static void aio_signal_handler(int signum)
+ {
+     if (posix_aio_state) {
+         char byte = 0;
+
+         /* NOTE(review): the write() result is ignored here */
+         write(posix_aio_state->wfd, &byte, sizeof(byte));
+     }
+
+     qemu_service_io();
+ }
+
+ /*
+  * One-time initialisation of the POSIX AIO emulation: installs the SIGUSR2
+  * handler, creates the non-blocking notification pipe, registers its read
+  * end with the AIO fd handlers and starts the qemu_paio layer.
+  *
+  * Safe to call repeatedly; later calls return immediately.
+  * Returns 0 on success or -errno if the pipe cannot be created.
+  */
+ static int posix_aio_init(void)
+ {
+     struct sigaction act;
+     PosixAioState *s;
+     int fds[2];
+     struct qemu_paioinit ai;
+
+     if (posix_aio_state)
+         return 0;
+
+     s = qemu_malloc(sizeof(PosixAioState));
+
+     sigfillset(&act.sa_mask);
+     act.sa_flags = 0; /* do not restart syscalls to interrupt select() */
+     act.sa_handler = aio_signal_handler;
+     sigaction(SIGUSR2, &act, NULL);
+
+     s->first_aio = NULL;
+     if (pipe(fds) == -1) {
+         /* capture errno before fprintf() has a chance to change it */
+         int saved_errno = errno;
+
+         fprintf(stderr, "failed to create pipe\n");
+         qemu_free(s);   /* state was never published; do not leak it */
+         return -saved_errno;
+     }
+
+     s->rfd = fds[0];
+     s->wfd = fds[1];
+
+     fcntl(s->rfd, F_SETFL, O_NONBLOCK);
+     fcntl(s->wfd, F_SETFL, O_NONBLOCK);
+
+     qemu_aio_set_fd_handler(s->rfd, posix_aio_read, NULL, posix_aio_flush, s);
+
+     memset(&ai, 0, sizeof(ai));
+     ai.aio_threads = 64;
+     ai.aio_num = 64;
+     qemu_paio_init(&ai);
+
+     /* publish last: the signal handler ignores signals until this is set */
+     posix_aio_state = s;
+
+     return 0;
+ }
+
+ /*
+  * Build an asynchronous request for nb_sectors sectors starting at
+  * sector_num using the scatter/gather list qiov, and queue it at the head
+  * of the pending list. The request is not submitted here.
+  *
+  * Returns the new request, or NULL if fd_open() fails or no AIOCB could
+  * be obtained.
+  */
+ static RawAIOCB *raw_aio_setup(BlockDriverState *bs, int64_t sector_num,
+         QEMUIOVector *qiov, int nb_sectors,
+         BlockDriverCompletionFunc *cb, void *opaque)
+ {
+     BDRVRawState *s = bs->opaque;
+     RawAIOCB *req;
+
+     if (fd_open(bs) < 0) {
+         return NULL;
+     }
+
+     req = qemu_aio_get(bs, cb, opaque);
+     if (req == NULL) {
+         return NULL;
+     }
+
+     req->aiocb.aio_fildes = s->fd;
+     req->aiocb.ev_signo = SIGUSR2;
+     req->aiocb.aio_iov = qiov->iov;
+     req->aiocb.aio_niov = qiov->niov;
+     req->aiocb.aio_nbytes = nb_sectors * 512;
+     req->aiocb.aio_offset = sector_num * 512;
+
+     /*
+      * If O_DIRECT is used the buffer needs to be aligned on a sector
+      * boundary. Tell the low level code to ensure that in case it's
+      * not done yet.
+      */
+     req->aiocb.aio_flags = s->aligned_buf ? QEMU_AIO_SECTOR_ALIGNED : 0;
+
+     /* push onto the front of the pending list */
+     req->next = posix_aio_state->first_aio;
+     posix_aio_state->first_aio = req;
+     return req;
+ }
+
+ /*
+  * Unlink acb from the pending list and release it. A request that is not
+  * on the list is reported on stderr and left untouched.
+  */
+ static void raw_aio_remove(RawAIOCB *acb)
+ {
+     RawAIOCB **link;
+
+     /* remove the callback from the queue */
+     for (link = &posix_aio_state->first_aio; *link != NULL;
+          link = &(*link)->next) {
+         if (*link == acb) {
+             *link = acb->next;
+             qemu_aio_release(acb);
+             return;
+         }
+     }
+     fprintf(stderr, "raw_aio_remove: aio request not found!\n");
+ }
+
+ /*
+  * Start an asynchronous vectored read. Returns the request handle, or
+  * NULL if the request could not be set up or submitted.
+  */
+ static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
+         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+         BlockDriverCompletionFunc *cb, void *opaque)
+ {
+     RawAIOCB *req = raw_aio_setup(bs, sector_num, qiov, nb_sectors,
+                                   cb, opaque);
+
+     if (req == NULL) {
+         return NULL;
+     }
+     if (qemu_paio_read(&req->aiocb) >= 0) {
+         return &req->common;
+     }
+     /* submission failed: take the request back off the pending list */
+     raw_aio_remove(req);
+     return NULL;
+ }
+
+ /*
+  * Start an asynchronous vectored write. Returns the request handle, or
+  * NULL if the request could not be set up or submitted.
+  */
+ static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
+         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+         BlockDriverCompletionFunc *cb, void *opaque)
+ {
+     RawAIOCB *req = raw_aio_setup(bs, sector_num, qiov, nb_sectors,
+                                   cb, opaque);
+
+     if (req == NULL) {
+         return NULL;
+     }
+     if (qemu_paio_write(&req->aiocb) >= 0) {
+         return &req->common;
+     }
+     /* submission failed: take the request back off the pending list */
+     raw_aio_remove(req);
+     return NULL;
+ }
+
+/*
+ * Cancel a pending request and remove it from the pending list.  When the
+ * request cannot be cancelled any more, busy-wait until it has finished.
+ */
+static void raw_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    RawAIOCB *req = (RawAIOCB *)blockacb;
+
+    if (qemu_paio_cancel(req->aiocb.aio_fildes, &req->aiocb)
+        == QEMU_PAIO_NOTCANCELED) {
+        /* fail safe: if the aio could not be canceled, we wait for it */
+        while (qemu_paio_error(&req->aiocb) == EINPROGRESS) {
+            /* spin */
+        }
+    }
+
+    raw_aio_remove(req);
+}
+#else /* CONFIG_AIO */
+/* Built without CONFIG_AIO: there is nothing to set up, report success. */
+static int posix_aio_init(void)
+{
+    return 0;
+}
+#endif /* CONFIG_AIO */
+
+
+/*
+ * Close the host descriptor and free the bounce buffer, if any.
+ * Nothing is done when no descriptor is currently open.
+ */
+static void raw_close(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (s->fd < 0) {
+        return;
+    }
+
+    close(s->fd);
+    s->fd = -1;
+
+    if (s->aligned_buf != NULL) {
+        qemu_free(s->aligned_buf);
+        /* do not leave a dangling pointer behind in the driver state */
+        s->aligned_buf = NULL;
+    }
+}
+
+/*
+ * Resize the image to offset bytes.  Only FTYPE_FILE images can be
+ * resized (-ENOTSUP otherwise); an ftruncate() failure yields -errno.
+ */
+static int raw_truncate(BlockDriverState *bs, int64_t offset)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (s->type == FTYPE_FILE) {
+        return ftruncate(s->fd, offset) < 0 ? -errno : 0;
+    }
+    return -ENOTSUP;
+}
+
+#ifdef __OpenBSD__
+/*
+ * OpenBSD: return the size in bytes of the image.
+ *
+ * For character and block devices the size is taken from the disklabel
+ * entry of the partition the device node refers to; for anything else
+ * st_size is used.  Returns -errno on failure.
+ */
+static int64_t raw_getlength(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    int fd = s->fd;
+    struct stat st;
+    struct disklabel dl;
+
+    if (fstat(fd, &st)) {
+        return -errno;
+    }
+    if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
+        return st.st_size;
+    }
+
+    if (ioctl(fd, DIOCGDINFO, &dl)) {
+        return -errno;
+    }
+    return (uint64_t)dl.d_secsize *
+        dl.d_partitions[DISKPART(st.st_rdev)].p_size;
+}
+#else /* !__OpenBSD__ */
+/*
+ * Return the size in bytes of the image (all hosts except OpenBSD).
+ *
+ * fd_open() is called first and its error, if any, is returned as is.
+ * Depending on the host:
+ *  - HOST_BSD: for nodes with the S_IFCHR bit set the size comes from
+ *    DIOCGMEDIASIZE or DIOCGPART when available, falling back to lseek()
+ *    (or LONG_LONG_MAX with CONFIG_COCOA).  On FreeBSD an empty CD drive
+ *    is reopened once through cd_open() and queried again.
+ *  - __sun__: DKIOCGMEDIAINFO is tried before lseek().
+ *  - otherwise: lseek(fd, 0, SEEK_END).
+ *
+ * The descriptor is read from the driver state only after fd_open(), and
+ * again after cd_open(), because both may replace s->fd.
+ */
+static int64_t raw_getlength(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    int fd;
+    int64_t size;
+#ifdef HOST_BSD
+    struct stat sb;
+#ifdef __FreeBSD__
+    int reopened = 0;
+#endif
+#endif
+#ifdef __sun__
+    struct dk_minfo minfo;
+    int rv;
+#endif
+    int ret;
+
+    ret = fd_open(bs);
+    if (ret < 0)
+        return ret;
+
+    /* fd_open() may just have (re)opened the device */
+    fd = s->fd;
+
+#ifdef HOST_BSD
+#ifdef __FreeBSD__
+again:
+#endif
+    if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
+#ifdef DIOCGMEDIASIZE
+        if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size))
+#elif defined(DIOCGPART)
+        {
+            struct partinfo pi;
+            if (ioctl(fd, DIOCGPART, &pi) == 0)
+                size = pi.media_size;
+            else
+                size = 0;
+        }
+        if (size == 0)
+#endif
+#ifdef CONFIG_COCOA
+            size = LONG_LONG_MAX;
+#else
+            size = lseek(fd, 0LL, SEEK_END);
+#endif
+#ifdef __FreeBSD__
+        switch(s->type) {
+        case FTYPE_CD:
+            /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
+            if (size == 2048LL * (unsigned)-1)
+                size = 0;
+            /* XXX no disc? maybe we need to reopen... */
+            if (size <= 0 && !reopened && cd_open(bs) >= 0) {
+                reopened = 1;
+                /* cd_open() replaced the descriptor */
+                fd = s->fd;
+                goto again;
+            }
+        }
+#endif
+    } else
+#endif
+#ifdef __sun__
+    /*
+     * use the DKIOCGMEDIAINFO ioctl to read the size.
+     */
+    rv = ioctl ( fd, DKIOCGMEDIAINFO, &minfo );
+    if ( rv != -1 ) {
+        size = minfo.dki_lbsize * minfo.dki_capacity;
+    } else /* there are reports that lseek on some devices
+              fails, but irc discussion said that contingency
+              on contingency was overkill */
+#endif
+    {
+        size = lseek(fd, 0, SEEK_END);
+    }
+    return size;
+}
+#endif
+
+/*
+ * Create (or truncate) a raw image file of total_size 512-byte sectors.
+ *
+ * Neither creation flags nor a backing file are supported (-ENOTSUP).
+ * Returns 0 on success, -EIO if the file cannot be opened, or -errno if
+ * sizing or closing the file fails.
+ */
+static int raw_create(const char *filename, int64_t total_size,
+                      const char *backing_file, int flags)
+{
+    int fd;
+    int result = 0;
+
+    if (flags || backing_file)
+        return -ENOTSUP;
+
+    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
+              0644);
+    if (fd < 0)
+        return -EIO;
+
+    /* a failed resize must not be reported as a successfully created image */
+    if (ftruncate(fd, total_size * 512) != 0)
+        result = -errno;
+    if (close(fd) != 0 && result == 0)
+        result = -errno;
+
+    return result;
+}
+
+/* Ask the host to write out cached data for the image; the result of
+ * fsync() is not reported. */
+static void raw_flush(BlockDriverState *bs)
+{
+    BDRVRawState *state = bs->opaque;
+
+    fsync(state->fd);
+}
+
+/* Driver table for the "raw" format backed by a host file. */
+static BlockDriver bdrv_raw = {
+    .format_name     = "raw",
+    .instance_size   = sizeof(BDRVRawState),
+    .bdrv_probe      = NULL, /* no probe for protocols */
+
+    /* open / create / close */
+    .bdrv_open       = raw_open,
+    .bdrv_create     = raw_create,
+    .bdrv_close      = raw_close,
+
+    /* synchronous I/O */
+    .bdrv_read       = raw_read,
+    .bdrv_write      = raw_write,
+    .bdrv_flush      = raw_flush,
+
+    /* image size */
+    .bdrv_truncate   = raw_truncate,
+    .bdrv_getlength  = raw_getlength,
+
+#ifdef CONFIG_AIO
+    /* asynchronous I/O */
+    .bdrv_aio_readv  = raw_aio_readv,
+    .bdrv_aio_writev = raw_aio_writev,
+    .bdrv_aio_cancel = raw_aio_cancel,
+    .aiocb_size      = sizeof(RawAIOCB),
+#endif
+};
+
+/***********************************************/
+/* host device */
+
+#ifdef CONFIG_COCOA
+static kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator );
+static kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize );
+
+/*
+ * Store in *mediaIterator an iterator over the IOKit services of class
+ * kIOCDMediaClass that are marked ejectable.  Failures of the individual
+ * steps are only reported on stdout; the result of
+ * IOServiceGetMatchingServices() is what gets returned.
+ */
+kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator )
+{
+    mach_port_t masterPort;
+    CFMutableDictionaryRef classesToMatch;
+    kern_return_t kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort );
+
+    if ( kernResult != KERN_SUCCESS ) {
+        printf( "IOMasterPort returned %d\n", kernResult );
+    }
+
+    classesToMatch = IOServiceMatching( kIOCDMediaClass );
+    if ( classesToMatch != NULL ) {
+        CFDictionarySetValue( classesToMatch, CFSTR( kIOMediaEjectableKey ), kCFBooleanTrue );
+    } else {
+        printf( "IOServiceMatching returned a NULL dictionary.\n" );
+    }
+
+    kernResult = IOServiceGetMatchingServices( masterPort, classesToMatch, mediaIterator );
+    if ( kernResult != KERN_SUCCESS ) {
+        printf( "IOServiceGetMatchingServices returned %d\n", kernResult );
+    }
+
+    return kernResult;
+}
+
+/*
+ * Build in bsdPath the raw device path (_PATH_DEV "r" <BSD name>) of the
+ * next media object delivered by mediaIterator.
+ *
+ * bsdPath is set to the empty string first.  Returns KERN_SUCCESS only
+ * when the BSD name could be appended; KERN_FAILURE otherwise (bsdPath
+ * may then hold just the prefix).
+ */
+kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize )
+{
+    kern_return_t kernResult = KERN_FAILURE;
+    io_object_t nextMedia;
+    CFTypeRef bsdPathAsCFString;
+    size_t devPathLength;
+
+    *bsdPath = '\0';
+
+    nextMedia = IOIteratorNext( mediaIterator );
+    if ( !nextMedia ) {
+        return kernResult;
+    }
+
+    bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 );
+    if ( bsdPathAsCFString ) {
+        strcpy( bsdPath, _PATH_DEV );
+        strcat( bsdPath, "r" );
+        devPathLength = strlen( bsdPath );
+        if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
+            kernResult = KERN_SUCCESS;
+        }
+        CFRelease( bsdPathAsCFString );
+    }
+    IOObjectRelease( nextMedia );
+
+    return kernResult;
+}
+
+#endif
+
+/*
+ * Open a host device.
+ *
+ * - With CONFIG_COCOA a name starting with "/dev/cdrom" is replaced by the
+ *   raw BSD path of the first ejectable CD media found, with "s0" (or "s1"
+ *   if "s0" cannot be opened) appended.
+ * - The open(2) flags are derived from flags: read-write or read-only
+ *   (which also sets bs->read_only), O_DIRECT for BDRV_O_NOCACHE, and
+ *   O_DSYNC unless BDRV_O_CACHE_WB is given.
+ * - On Linux "/dev/cd*" and "/dev/fd*" are opened non-blocking and typed
+ *   FTYPE_CD / FTYPE_FD; "/dev/sg*" sets bs->sg when CONFIG_AIO is on.
+ *   Floppies are closed again right away and reopened on demand by
+ *   fd_open().
+ * - On FreeBSD "/dev/cd*" and "/dev/acd*" are typed FTYPE_CD and the door
+ *   is unlocked after opening.
+ *
+ * Returns 0 on success or -errno from open(), with EROFS mapped to EACCES.
+ */
+static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVRawState *s = bs->opaque;
+    int fd, open_flags, ret;
+#ifdef CONFIG_COCOA
+    /* function scope: filename may point into this buffer until open() */
+    char bsdPath[ MAXPATHLEN ];
+#endif
+
+    posix_aio_init();
+
+#ifdef CONFIG_COCOA
+    if (strstart(filename, "/dev/cdrom", NULL)) {
+        kern_return_t kernResult;
+        io_iterator_t mediaIterator;
+        int probe_fd;
+
+        kernResult = FindEjectableCDMedia( &mediaIterator );
+        kernResult = GetBSDPath( mediaIterator, bsdPath, sizeof( bsdPath ) );
+
+        if ( bsdPath[ 0 ] != '\0' ) {
+            strcat(bsdPath,"s0");
+            /* some CDs don't have a partition 0 */
+            probe_fd = open(bsdPath, O_RDONLY | O_BINARY | O_LARGEFILE);
+            if (probe_fd < 0) {
+                bsdPath[strlen(bsdPath)-1] = '1';
+            } else {
+                close(probe_fd);
+            }
+            filename = bsdPath;
+        }
+
+        if ( mediaIterator )
+            IOObjectRelease( mediaIterator );
+    }
+#endif
+    open_flags = O_BINARY;
+    if ((flags & BDRV_O_ACCESS) == O_RDWR) {
+        open_flags |= O_RDWR;
+    } else {
+        open_flags |= O_RDONLY;
+        bs->read_only = 1;
+    }
+    /* Use O_DSYNC for write-through caching, no flags for write-back caching,
+     * and O_DIRECT for no caching. */
+    if ((flags & BDRV_O_NOCACHE))
+        open_flags |= O_DIRECT;
+    else if (!(flags & BDRV_O_CACHE_WB))
+        open_flags |= O_DSYNC;
+
+    s->type = FTYPE_FILE;
+#if defined(__linux__)
+    if (strstart(filename, "/dev/cd", NULL)) {
+        /* open will not fail even if no CD is inserted */
+        open_flags |= O_NONBLOCK;
+        s->type = FTYPE_CD;
+    } else if (strstart(filename, "/dev/fd", NULL)) {
+        s->type = FTYPE_FD;
+        /* saved before O_NONBLOCK is added; fd_open() reopens with these */
+        s->fd_open_flags = open_flags;
+        /* open will not fail even if no floppy is inserted */
+        open_flags |= O_NONBLOCK;
+#ifdef CONFIG_AIO
+    } else if (strstart(filename, "/dev/sg", NULL)) {
+        bs->sg = 1;
+#endif
+    }
+#endif
+#if defined(__FreeBSD__)
+    if (strstart(filename, "/dev/cd", NULL) ||
+        strstart(filename, "/dev/acd", NULL)) {
+        s->type = FTYPE_CD;
+        s->cd_open_flags = open_flags;
+    }
+#endif
+    s->fd = -1;
+    fd = open(filename, open_flags, 0644);
+    if (fd < 0) {
+        ret = -errno;
+        if (ret == -EROFS)
+            ret = -EACCES;
+        return ret;
+    }
+    s->fd = fd;
+#if defined(__FreeBSD__)
+    /* make sure the door isn't locked at this time */
+    if (s->type == FTYPE_CD)
+        ioctl (s->fd, CDIOCALLOW);
+#endif
+#if defined(__linux__)
+    /* close fd so that we can reopen it as needed */
+    if (s->type == FTYPE_FD) {
+        close(s->fd);
+        s->fd = -1;
+        s->fd_media_changed = 1;
+    }
+#endif
+    return 0;
+}
+
+#if defined(__linux__)
+/*
+ * Note: there is no reliable way to detect whether a floppy is present.
+ * The current method is to try to open the floppy on every I/O and to
+ * keep it open for FD_OPEN_TIMEOUT; after a failed open no new attempt is
+ * made for the same period.
+ *
+ * Returns 0 when the device is usable (always for non-floppy types) and
+ * -EIO when the floppy could not be opened.  fd_media_changed is set
+ * whenever the "present" state flips.
+ */
+static int fd_open(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    int was_open;
+
+    if (s->type != FTYPE_FD)
+        return 0;
+
+    was_open = (s->fd >= 0);
+
+    /* drop a descriptor that has been held for too long */
+    if (was_open &&
+        (qemu_get_clock(rt_clock) - s->fd_open_time) >= FD_OPEN_TIMEOUT) {
+        close(s->fd);
+        s->fd = -1;
+#ifdef DEBUG_FLOPPY
+        printf("Floppy closed\n");
+#endif
+    }
+
+    if (s->fd < 0) {
+        /* do not retry too soon after a failed open */
+        if (s->fd_got_error &&
+            (qemu_get_clock(rt_clock) - s->fd_error_time) < FD_OPEN_TIMEOUT) {
+#ifdef DEBUG_FLOPPY
+            printf("No floppy (open delayed)\n");
+#endif
+            return -EIO;
+        }
+
+        s->fd = open(bs->filename, s->fd_open_flags);
+        if (s->fd < 0) {
+            s->fd_error_time = qemu_get_clock(rt_clock);
+            s->fd_got_error = 1;
+            if (was_open)
+                s->fd_media_changed = 1;
+#ifdef DEBUG_FLOPPY
+            printf("No floppy\n");
+#endif
+            return -EIO;
+        }
+#ifdef DEBUG_FLOPPY
+        printf("Floppy opened\n");
+#endif
+    }
+
+    if (!was_open)
+        s->fd_media_changed = 1;
+    s->fd_open_time = qemu_get_clock(rt_clock);
+    s->fd_got_error = 0;
+    return 0;
+}
+
+/*
+ * Report whether a medium is present: CD drives are asked through
+ * CDROM_DRIVE_STATUS, floppies by trying fd_open(); everything else is
+ * always considered inserted.
+ */
+static int raw_is_inserted(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (s->type == FTYPE_CD) {
+        return ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT) == CDS_DISC_OK;
+    }
+    if (s->type == FTYPE_FD) {
+        return fd_open(bs) >= 0;
+    }
+    return 1;
+}
+
+/*
+ * Return and clear the floppy "media changed" flag; -ENOTSUP for any other
+ * device type.  Currently only used by fdc.c, but a CD version would be
+ * good too.
+ */
+static int raw_media_changed(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    int changed;
+
+    if (s->type != FTYPE_FD)
+        return -ENOTSUP;
+
+    /* XXX: we do not have a true media changed indication. It
+       does not work if the floppy is changed without trying
+       to read it */
+    fd_open(bs);
+    changed = s->fd_media_changed;
+    s->fd_media_changed = 0;
+#ifdef DEBUG_FLOPPY
+    printf("Floppy changed=%d\n", changed);
+#endif
+    return changed;
+}
+
+/*
+ * Eject the medium (eject_flag != 0) or close the tray (eject_flag == 0).
+ *
+ * CD: CDROMEJECT / CDROMCLOSETRAY on the open descriptor.
+ * Floppy: the cached descriptor is dropped and FDEJECT is issued on a
+ * fresh non-blocking one, whatever eject_flag is.
+ * ioctl failures are only reported through perror(); the function returns
+ * 0 for CD and floppy, and -ENOTSUP for other device types.
+ */
+static int raw_eject(BlockDriverState *bs, int eject_flag)
+{
+    BDRVRawState *s = bs->opaque;
+
+    switch(s->type) {
+    case FTYPE_CD:
+        if (eject_flag) {
+            if (ioctl (s->fd, CDROMEJECT, NULL) < 0)
+                perror("CDROMEJECT");
+        } else {
+            if (ioctl (s->fd, CDROMCLOSETRAY, NULL) < 0)
+                perror("CDROMCLOSETRAY");
+        }
+        break;
+    case FTYPE_FD:
+        {
+            int fd;
+            if (s->fd >= 0) {
+                close(s->fd);
+                s->fd = -1;
+            }
+            fd = open(bs->filename, s->fd_open_flags | O_NONBLOCK);
+            if (fd >= 0) {
+                if (ioctl(fd, FDEJECT, 0) < 0)
+                    perror("FDEJECT");
+                close(fd);
+            }
+        }
+        break;
+    default:
+        return -ENOTSUP;
+    }
+    return 0;
+}
+
+/*
+ * Lock or unlock the CD drive door.  Returns 0 for CD drives whether or
+ * not the ioctl succeeded, -ENOTSUP for every other device type.
+ */
+static int raw_set_locked(BlockDriverState *bs, int locked)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (s->type != FTYPE_CD)
+        return -ENOTSUP;
+
+    /* Note: an error can happen if the distribution automatically
+       mounts the CD-ROM, so the result is deliberately ignored */
+    (void) ioctl (s->fd, CDROM_LOCKDOOR, locked);
+    return 0;
+}
+
+/* Pass an ioctl straight through to the host descriptor and return its
+ * result unchanged. */
+static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+    BDRVRawState *state = bs->opaque;
+    int result = ioctl(state->fd, req, buf);
+
+    return result;
+}
+
+#ifdef CONFIG_AIO
+/*
+ * Queue an ioctl on the host descriptor as an asynchronous request.
+ * Returns the request handle, or NULL when fd_open() fails, no AIOCB is
+ * available, or the submission fails.
+ */
+static BlockDriverAIOCB *raw_aio_ioctl(BlockDriverState *bs,
+        unsigned long int req, void *buf,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVRawState *state = bs->opaque;
+    RawAIOCB *request;
+
+    if (fd_open(bs) < 0) {
+        return NULL;
+    }
+
+    request = qemu_aio_get(bs, cb, opaque);
+    if (request == NULL) {
+        return NULL;
+    }
+
+    request->aiocb.aio_fildes = state->fd;
+    request->aiocb.ev_signo = SIGUSR2;
+    request->aiocb.aio_offset = 0;
+    request->aiocb.aio_flags = 0;
+    request->aiocb.aio_ioctl_buf = buf;
+    request->aiocb.aio_ioctl_cmd = req;
+
+    /* link at the head of the pending list before submitting */
+    request->next = posix_aio_state->first_aio;
+    posix_aio_state->first_aio = request;
+
+    if (qemu_paio_ioctl(&request->aiocb) >= 0) {
+        return &request->common;
+    }
+
+    raw_aio_remove(request);
+    return NULL;
+}
+#endif
+
+#elif defined(__FreeBSD__)
+
+/*
+ * FreeBSD: nothing is reopened here; this only checks that s->fd is sane
+ * (it is called by the I/O paths).  Returns 0 when a descriptor is open,
+ * -EIO otherwise.
+ */
+static int fd_open(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+
+    return (s->fd >= 0) ? 0 : -EIO;
+}
+
+/*
+ * Reopen a CD device so that a changed or newly loaded disc is noticed.
+ * Does nothing for other device types.  Returns 0 on success (or when
+ * there was nothing to do) and -EIO when the device cannot be reopened,
+ * in which case s->fd is left at -1.
+ */
+static int cd_open(BlockDriverState *bs)
+{
+#if defined(__FreeBSD__)
+    BDRVRawState *s = bs->opaque;
+    int fd;
+
+    if (s->type == FTYPE_CD) {
+        /* XXX force reread of possibly changed/newly loaded disc,
+         * FreeBSD seems to not notice sometimes... */
+        if (s->fd >= 0)
+            close (s->fd);
+
+        fd = open(bs->filename, s->cd_open_flags, 0644);
+        if (fd < 0) {
+            s->fd = -1;
+            return -EIO;
+        }
+        s->fd = fd;
+
+        /* make sure the door isn't locked at this time */
+        ioctl (s->fd, CDIOCALLOW);
+    }
+#endif
+    return 0;
+}
+
+/*
+ * FreeBSD: a CD counts as inserted when raw_getlength() reports a
+ * positive size; every other type (floppies included, XXX not handled
+ * yet) is always reported as inserted.
+ */
+static int raw_is_inserted(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (s->type == FTYPE_CD) {
+        return (raw_getlength(bs) > 0);
+    }
+    return 1;
+}
+
+/* FreeBSD: media change detection is not implemented. */
+static int raw_media_changed(BlockDriverState *bs)
+{
+    return -ENOTSUP;
+}
+
+/*
+ * FreeBSD: eject the CD (eject_flag != 0) or close the tray, then reopen
+ * the device through cd_open().  Returns 0 on success; -ENOTSUP for
+ * non-CD devices (floppies are not handled yet), when no descriptor is
+ * open, or when the reopen fails.  ioctl failures are only reported
+ * through perror().
+ */
+static int raw_eject(BlockDriverState *bs, int eject_flag)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (s->type != FTYPE_CD)
+        return -ENOTSUP;
+    if (s->fd < 0)
+        return -ENOTSUP;
+
+    /* unlock the door first */
+    (void) ioctl (s->fd, CDIOCALLOW);
+
+    if (eject_flag) {
+        if (ioctl (s->fd, CDIOCEJECT) < 0)
+            perror("CDIOCEJECT");
+    } else {
+        if (ioctl (s->fd, CDIOCCLOSE) < 0)
+            perror("CDIOCCLOSE");
+    }
+
+    if (cd_open(bs) < 0)
+        return -ENOTSUP;
+    return 0;
+}
+
+/*
+ * FreeBSD: lock (CDIOCPREVENT) or unlock (CDIOCALLOW) the CD door.
+ * Returns 0 for an open CD device whether or not the ioctl succeeded,
+ * -ENOTSUP for other device types or when no descriptor is open.
+ */
+static int raw_set_locked(BlockDriverState *bs, int locked)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (s->type != FTYPE_CD)
+        return -ENOTSUP;
+    if (s->fd < 0)
+        return -ENOTSUP;
+
+    /* Note: an error can happen if the distribution automatically
+       mounts the CD-ROM, so the result is deliberately ignored */
+    (void) ioctl (s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW));
+    return 0;
+}
+
+/* FreeBSD: generic ioctl pass-through is not supported. */
+static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+    return -ENOTSUP;
+}
+
+#ifdef CONFIG_AIO
+/*
+ * FreeBSD: asynchronous ioctls are not supported either.  The stub is
+ * needed because the host_device driver table references raw_aio_ioctl
+ * whenever CONFIG_AIO is defined; NULL means "request not started".
+ */
+static BlockDriverAIOCB *raw_aio_ioctl(BlockDriverState *bs,
+        unsigned long int req, void *buf,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    return NULL;
+}
+#endif
+#else /* !linux && !FreeBSD */
+
+/* Other hosts: no on-demand reopening is done, always report success. */
+static int fd_open(BlockDriverState *bs)
+{
+    return 0;
+}
+
+/* Other hosts: the medium is always reported as present. */
+static int raw_is_inserted(BlockDriverState *bs)
+{
+    return 1;
+}
+
+/* Other hosts: media change detection is not supported. */
+static int raw_media_changed(BlockDriverState *bs)
+{
+    return -ENOTSUP;
+}
+
+/* Other hosts: ejecting is not supported. */
+static int raw_eject(BlockDriverState *bs, int eject_flag)
+{
+    return -ENOTSUP;
+}
+
+/* Other hosts: door locking is not supported. */
+static int raw_set_locked(BlockDriverState *bs, int locked)
+{
+    return -ENOTSUP;
+}
+
+/* Other hosts: ioctl pass-through is not supported. */
+static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+    return -ENOTSUP;
+}
+
+/* Other hosts: asynchronous ioctls are not supported; no request is
+ * ever started. */
+static BlockDriverAIOCB *raw_aio_ioctl(BlockDriverState *bs,
+        unsigned long int req, void *buf,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    return NULL;
+}
+#endif /* !linux && !FreeBSD */
+
+#if defined(__linux__) || defined(__FreeBSD__)
+/*
+ * "Create" an image on a host device: nothing is written, the function
+ * only verifies that filename is a block device large enough to hold
+ * total_size 512-byte sectors.
+ *
+ * Returns 0 on success, -ENOTSUP when flags or a backing file are given,
+ * -EIO when the node cannot be opened or is not a block device, and
+ * -ENOSPC when it is too small.
+ */
+static int hdev_create(const char *filename, int64_t total_size,
+                       const char *backing_file, int flags)
+{
+    struct stat stat_buf;
+    int result = 0;
+    int fd;
+
+    if (flags || backing_file)
+        return -ENOTSUP;
+
+    fd = open(filename, O_WRONLY | O_BINARY);
+    if (fd < 0)
+        return -EIO;
+
+    if (fstat(fd, &stat_buf) < 0 || !S_ISBLK(stat_buf.st_mode)) {
+        result = -EIO;
+    } else if (lseek(fd, 0, SEEK_END) < total_size * 512) {
+        result = -ENOSPC;
+    }
+
+    close(fd);
+    return result;
+}
+
+#else /* !(linux || freebsd) */
+
+/* Hosts other than Linux and FreeBSD: creating an image on a host device
+ * is not supported. */
+static int hdev_create(const char *filename, int64_t total_size,
+                       const char *backing_file, int flags)
+{
+    return -ENOTSUP;
+}
+#endif
+
+/* Driver table for the "host_device" format (host block/char devices). */
+static BlockDriver bdrv_host_device = {
+    .format_name        = "host_device",
+    .instance_size      = sizeof(BDRVRawState),
+
+    /* open / create / close */
+    .bdrv_open          = hdev_open,
+    .bdrv_create        = hdev_create,
+    .bdrv_close         = raw_close,
+
+    /* synchronous I/O */
+    .bdrv_read          = raw_read,
+    .bdrv_write         = raw_write,
+    .bdrv_flush         = raw_flush,
+    .bdrv_getlength     = raw_getlength,
+
+#ifdef CONFIG_AIO
+    /* asynchronous I/O */
+    .bdrv_aio_readv     = raw_aio_readv,
+    .bdrv_aio_writev    = raw_aio_writev,
+    .bdrv_aio_cancel    = raw_aio_cancel,
+    .aiocb_size         = sizeof(RawAIOCB),
+#endif
+
+    /* removable device support */
+    .bdrv_is_inserted   = raw_is_inserted,
+    .bdrv_media_changed = raw_media_changed,
+    .bdrv_eject         = raw_eject,
+    .bdrv_set_locked    = raw_set_locked,
+
+    /* generic scsi device */
+    .bdrv_ioctl         = raw_ioctl,
+#ifdef CONFIG_AIO
+    .bdrv_aio_ioctl     = raw_aio_ioctl,
+#endif
+};
+
+/* Register both drivers of this file, "raw" first and then
+ * "host_device"; hooked up through block_init() below. */
+static void bdrv_raw_init(void)
+{
+    bdrv_register(&bdrv_raw);
+    bdrv_register(&bdrv_host_device);
+}
+
+block_init(bdrv_raw_init);
diff --git a/block/raw-win32.c b/block/raw-win32.c
new file mode 100644
index 0000000000..ab3abd636d
--- /dev/null
+++ b/block/raw-win32.c
@@ -0,0 +1,394 @@
+/*
+ * Block driver for RAW files (win32)
+ *
+ * Copyright (c) 2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "qemu-timer.h"
+#include "block_int.h"
+#include "module.h"
+#include <windows.h>
+#include <winioctl.h>
+
+#define FTYPE_FILE 0
+#define FTYPE_CD 1
+#define FTYPE_HARDDISK 2
+
+typedef struct BDRVRawState { /* per-image state of the win32 raw/host_device drivers */
+ HANDLE hfile; /* handle returned by CreateFile() in raw_open()/hdev_open() */
+ int type; /* one of the FTYPE_* constants defined above */
+ char drive_path[16]; /* format: "d:\"; filled in by find_device_type() */
+} BDRVRawState;
+
+/*
+ * 64-bit ftruncate() replacement for win32: set the size of the file
+ * behind the C runtime descriptor fd to length bytes while keeping the
+ * current file position.
+ *
+ * Returns 0 on success and -1 on failure.  Lengths of 4 GB or more are
+ * rejected when the high bit of GetVersion() is set.
+ */
+int qemu_ftruncate64(int fd, int64_t length)
+{
+    LARGE_INTEGER li;
+    LONG high;
+    HANDLE h;
+    BOOL res;
+
+    if ((GetVersion() & 0x80000000UL) && (length >> 32) != 0)
+        return -1;
+
+    h = (HANDLE)_get_osfhandle(fd);
+
+    /* get current position, ftruncate does not change position */
+    li.HighPart = 0;
+    li.LowPart = SetFilePointer (h, 0, &li.HighPart, FILE_CURRENT);
+    if (li.LowPart == 0xffffffffUL && GetLastError() != NO_ERROR)
+        return -1;
+
+    /*
+     * SetFilePointer() returns the low 32 bits of the new position, so 0
+     * is a perfectly valid result (length 0, or any multiple of 4 GB).
+     * Failure is signalled by 0xffffffff together with a last-error code.
+     */
+    high = length >> 32;
+    if (SetFilePointer(h, (DWORD) length, &high, FILE_BEGIN) == 0xffffffffUL
+        && GetLastError() != NO_ERROR)
+        return -1;
+    res = SetEndOfFile(h);
+
+    /* back to old position */
+    SetFilePointer(h, li.LowPart, &li.HighPart, FILE_BEGIN);
+    return res ? 0 : -1;
+}
+
+/*
+ * Issue FSCTL_SET_SPARSE on the file behind the C runtime descriptor fd.
+ * Returns the BOOL result of DeviceIoControl() converted to int (non-zero
+ * on success).
+ */
+static int set_sparse(int fd)
+{
+    HANDLE handle = (HANDLE)_get_osfhandle(fd);
+    DWORD bytes_returned;
+    BOOL ok;
+
+    ok = DeviceIoControl(handle, FSCTL_SET_SPARSE,
+                         NULL, 0, NULL, 0, &bytes_returned, NULL);
+    return (int) ok;
+}
+
+/*
+ * Open a regular file as a raw image.
+ *
+ * Access is read-write when (flags & BDRV_O_ACCESS) == O_RDWR and
+ * read-only otherwise; BDRV_O_CREAT selects CREATE_ALWAYS.  Caching:
+ * BDRV_O_NOCACHE disables buffering and writes through, otherwise writes
+ * go through unless BDRV_O_CACHE_WB is set.
+ *
+ * Returns 0 on success, -EACCES on ERROR_ACCESS_DENIED and -1 on any
+ * other failure.
+ */
+static int raw_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVRawState *s = bs->opaque;
+    int access_flags = GENERIC_READ;
+    int create_flags = OPEN_EXISTING;
+    DWORD overlapped = FILE_ATTRIBUTE_NORMAL;
+
+    s->type = FTYPE_FILE;
+
+    if ((flags & BDRV_O_ACCESS) == O_RDWR)
+        access_flags |= GENERIC_WRITE;
+    if (flags & BDRV_O_CREAT)
+        create_flags = CREATE_ALWAYS;
+
+    if ((flags & BDRV_O_NOCACHE))
+        overlapped |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH;
+    else if (!(flags & BDRV_O_CACHE_WB))
+        overlapped |= FILE_FLAG_WRITE_THROUGH;
+
+    s->hfile = CreateFile(filename, access_flags,
+                          FILE_SHARE_READ, NULL,
+                          create_flags, overlapped, NULL);
+    if (s->hfile != INVALID_HANDLE_VALUE)
+        return 0;
+
+    return (GetLastError() == ERROR_ACCESS_DENIED) ? -EACCES : -1;
+}
+
+/*
+ * Read nb_sectors 512-byte sectors starting at sector_num into buf.
+ *
+ * Returns 0 when everything was read, -EIO when ReadFile() fails, and
+ * the number of bytes actually read on a short read.
+ */
+static int raw_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVRawState *s = bs->opaque;
+    OVERLAPPED ov;
+    DWORD ret_count;
+    int64_t offset = sector_num * 512;
+    int count = nb_sectors * 512;
+
+    /* the OVERLAPPED structure only carries the 64-bit file offset */
+    memset(&ov, 0, sizeof(ov));
+    ov.Offset = offset;
+    ov.OffsetHigh = offset >> 32;
+
+    /* a failed call must not be mistaken for success (ret_count == 0) */
+    if (!ReadFile(s->hfile, buf, count, &ret_count, &ov))
+        return -EIO;
+    if (ret_count == count)
+        ret_count = 0;
+    return ret_count;
+}
+
+/*
+ * Write nb_sectors 512-byte sectors from buf starting at sector_num.
+ *
+ * Returns 0 when everything was written, -EIO when WriteFile() fails,
+ * and the number of bytes actually written on a short write.
+ */
+static int raw_write(BlockDriverState *bs, int64_t sector_num,
+                     const uint8_t *buf, int nb_sectors)
+{
+    BDRVRawState *s = bs->opaque;
+    OVERLAPPED ov;
+    DWORD ret_count;
+    int64_t offset = sector_num * 512;
+    int count = nb_sectors * 512;
+
+    /* the OVERLAPPED structure only carries the 64-bit file offset */
+    memset(&ov, 0, sizeof(ov));
+    ov.Offset = offset;
+    ov.OffsetHigh = offset >> 32;
+
+    /* a failed call must not be mistaken for success (ret_count == 0) */
+    if (!WriteFile(s->hfile, buf, count, &ret_count, &ov))
+        return -EIO;
+    if (ret_count == count)
+        ret_count = 0;
+    return ret_count;
+}
+
+/* Flush the buffers of the image's file handle; the result of
+ * FlushFileBuffers() is not reported. */
+static void raw_flush(BlockDriverState *bs)
+{
+    BDRVRawState *state = bs->opaque;
+
+    FlushFileBuffers(state->hfile);
+}
+
+/* Close the image's file handle. */
+static void raw_close(BlockDriverState *bs)
+{
+    BDRVRawState *state = bs->opaque;
+
+    CloseHandle(state->hfile);
+}
+
+/*
+ * Set the size of the image file to offset bytes.
+ * Returns 0 on success and -EIO on failure.
+ */
+static int raw_truncate(BlockDriverState *bs, int64_t offset)
+{
+    BDRVRawState *s = bs->opaque;
+    LONG low, high;
+
+    low = offset;
+    high = offset >> 32;
+
+    /*
+     * SetFilePointer() returns the low 32 bits of the new position, so 0
+     * is a valid result (offset 0, or any multiple of 4 GB).  An error has
+     * occurred only if it returns 0xffffffff and GetLastError() is set.
+     */
+    if (SetFilePointer(s->hfile, low, &high, FILE_BEGIN) == 0xffffffffUL
+        && GetLastError() != NO_ERROR)
+        return -EIO;
+    if (!SetEndOfFile(s->hfile))
+        return -EIO;
+    return 0;
+}
+
+/*
+ * Return the size of the image in bytes, or -EIO on failure.
+ *
+ * FTYPE_FILE:     GetFileSize() on the open handle.
+ * FTYPE_CD:       total size reported by GetDiskFreeSpaceEx() for
+ *                 s->drive_path.
+ * FTYPE_HARDDISK: DiskSize from IOCTL_DISK_GET_DRIVE_GEOMETRY_EX.
+ */
+static int64_t raw_getlength(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    LARGE_INTEGER l;
+    ULARGE_INTEGER available, total, total_free;
+    DISK_GEOMETRY_EX dg;
+    DWORD count;
+    BOOL status;
+
+    switch(s->type) {
+    case FTYPE_FILE:
+        l.LowPart = GetFileSize(s->hfile, (PDWORD)&l.HighPart);
+        if (l.LowPart == 0xffffffffUL && GetLastError() != NO_ERROR)
+            return -EIO;
+        break;
+    case FTYPE_CD:
+        if (!GetDiskFreeSpaceEx(s->drive_path, &available, &total, &total_free))
+            return -EIO;
+        l.QuadPart = total.QuadPart;
+        break;
+    case FTYPE_HARDDISK:
+        status = DeviceIoControl(s->hfile, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX,
+                                 NULL, 0, &dg, sizeof(dg), &count, NULL);
+        /* without this check an uninitialised size would be returned */
+        if (status == 0)
+            return -EIO;
+        l = dg.DiskSize;
+        break;
+    default:
+        return -EIO;
+    }
+    return l.QuadPart;
+}
+
+/*
+ * Create (or truncate) a raw image file of total_size 512-byte sectors.
+ * The file is marked sparse on a best-effort basis before it is sized.
+ *
+ * Neither creation flags nor a backing file are supported (-ENOTSUP).
+ * Returns 0 on success and -EIO when the file cannot be opened, sized or
+ * closed.
+ */
+static int raw_create(const char *filename, int64_t total_size,
+                      const char *backing_file, int flags)
+{
+    int fd;
+    int result = 0;
+
+    if (flags || backing_file)
+        return -ENOTSUP;
+
+    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
+              0644);
+    if (fd < 0)
+        return -EIO;
+
+    /* best effort: the image still works when it cannot be made sparse */
+    set_sparse(fd);
+
+    /* a failed resize must not be reported as a successfully created image */
+    if (ftruncate(fd, total_size * 512) != 0)
+        result = -EIO;
+    if (close(fd) != 0)
+        result = -EIO;
+
+    return result;
+}
+
+/* Driver table for the "raw" format backed by a regular file (win32). */
+static BlockDriver bdrv_raw = {
+    .format_name    = "raw",
+    .instance_size  = sizeof(BDRVRawState),
+
+    /* open / create / close */
+    .bdrv_open      = raw_open,
+    .bdrv_create    = raw_create,
+    .bdrv_close     = raw_close,
+
+    /* synchronous I/O */
+    .bdrv_read      = raw_read,
+    .bdrv_write     = raw_write,
+    .bdrv_flush     = raw_flush,
+
+    /* image size */
+    .bdrv_truncate  = raw_truncate,
+    .bdrv_getlength = raw_getlength,
+};
+
+/***********************************************/
+/* host device */
+
+/*
+ * Look for the first logical drive of type DRIVE_CDROM and write its
+ * device name ("\\.\X:") into cdrom_name.
+ * Returns 0 when a CD-ROM drive was found and -1 otherwise.
+ */
+static int find_cdrom(char *cdrom_name, int cdrom_name_size)
+{
+    char drives[256];
+    char *drive;
+
+    /* zero-filled so the walk below stops even if nothing gets written */
+    memset(drives, 0, sizeof(drives));
+    GetLogicalDriveStrings(sizeof(drives), drives);
+
+    /* the buffer holds consecutive NUL-terminated drive strings */
+    for (drive = drives; drive[0] != '\0'; drive += lstrlen(drive) + 1) {
+        if (GetDriveType(drive) == DRIVE_CDROM) {
+            snprintf(cdrom_name, cdrom_name_size, "\\\\.\\%c:", drive[0]);
+            return 0;
+        }
+    }
+    return -1;
+}
+
+/*
+ * Classify filename as one of the FTYPE_* kinds.
+ *
+ * Names without a "\\.\" or "//./" prefix are plain files.  After the
+ * prefix, "PhysicalDrive..." (case-insensitive) is a hard disk; otherwise
+ * the next character is taken as a drive letter, stored as "X:\" in
+ * s->drive_path, and the result follows GetDriveType().
+ */
+static int find_device_type(BlockDriverState *bs, const char *filename)
+{
+    BDRVRawState *s = bs->opaque;
+    const char *rest;
+
+    if (!strstart(filename, "\\\\.\\", &rest) &&
+        !strstart(filename, "//./", &rest)) {
+        return FTYPE_FILE;
+    }
+
+    if (stristart(rest, "PhysicalDrive", NULL))
+        return FTYPE_HARDDISK;
+
+    snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", rest[0]);
+    switch (GetDriveType(s->drive_path)) {
+    case DRIVE_REMOVABLE:
+    case DRIVE_FIXED:
+        return FTYPE_HARDDISK;
+    case DRIVE_CDROM:
+        return FTYPE_CD;
+    default:
+        return FTYPE_FILE;
+    }
+}
+
+/*
+ * Open a host device.
+ *
+ * Names starting with "/dev/cdrom" are mapped to the first CD-ROM drive
+ * found (-ENOENT if there is none); a bare drive letter "X:" is turned
+ * into the device name "\\.\X:".  The device type is then determined by
+ * find_device_type() and the device is opened with OPEN_EXISTING, using
+ * the same access and caching rules as raw_open().
+ *
+ * Returns 0 on success, -EACCES on ERROR_ACCESS_DENIED and -1 on any
+ * other CreateFile() failure.
+ */
+static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVRawState *s = bs->opaque;
+    char device_name[64];
+    int access_flags = GENERIC_READ;
+    DWORD overlapped = FILE_ATTRIBUTE_NORMAL;
+    int is_letter;
+
+    if (strstart(filename, "/dev/cdrom", NULL)) {
+        if (find_cdrom(device_name, sizeof(device_name)) < 0)
+            return -ENOENT;
+        filename = device_name;
+    } else {
+        /* transform drive letters into device name */
+        is_letter = (filename[0] >= 'a' && filename[0] <= 'z') ||
+                    (filename[0] >= 'A' && filename[0] <= 'Z');
+        if (is_letter && filename[1] == ':' && filename[2] == '\0') {
+            snprintf(device_name, sizeof(device_name), "\\\\.\\%c:", filename[0]);
+            filename = device_name;
+        }
+    }
+    s->type = find_device_type(bs, filename);
+
+    if ((flags & BDRV_O_ACCESS) == O_RDWR)
+        access_flags |= GENERIC_WRITE;
+
+    if ((flags & BDRV_O_NOCACHE))
+        overlapped |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH;
+    else if (!(flags & BDRV_O_CACHE_WB))
+        overlapped |= FILE_FLAG_WRITE_THROUGH;
+
+    s->hfile = CreateFile(filename, access_flags,
+                          FILE_SHARE_READ, NULL,
+                          OPEN_EXISTING, overlapped, NULL);
+    if (s->hfile != INVALID_HANDLE_VALUE)
+        return 0;
+
+    return (GetLastError() == ERROR_ACCESS_DENIED) ? -EACCES : -1;
+}
+
+#if 0
+/***********************************************/
+/* removable device additional commands */
+
+/* (compiled out by the enclosing #if 0) The medium is always reported as
+ * present. */
+static int raw_is_inserted(BlockDriverState *bs)
+{
+    return 1;
+}
+
+/* (compiled out by the enclosing #if 0) Media change detection is not
+ * supported. */
+static int raw_media_changed(BlockDriverState *bs)
+{
+    return -ENOTSUP;
+}
+
+/*
+ * (compiled out by the enclosing #if 0)
+ * Eject the medium (eject_flag != 0) or load it (eject_flag == 0) through
+ * the storage ioctls.  Returns -ENOTSUP for plain files and 0 otherwise;
+ * the result of DeviceIoControl() is not reported.
+ */
+static int raw_eject(BlockDriverState *bs, int eject_flag)
+{
+    BDRVRawState *s = bs->opaque;
+    DWORD ret_count;
+
+    if (s->type == FTYPE_FILE)
+        return -ENOTSUP;
+    if (eject_flag) {
+        DeviceIoControl(s->hfile, IOCTL_STORAGE_EJECT_MEDIA,
+                        NULL, 0, NULL, 0, &ret_count, NULL);
+    } else {
+        DeviceIoControl(s->hfile, IOCTL_STORAGE_LOAD_MEDIA,
+                        NULL, 0, NULL, 0, &ret_count, NULL);
+    }
+    return 0;
+}
+
+/* (compiled out by the enclosing #if 0) Door locking is not supported. */
+static int raw_set_locked(BlockDriverState *bs, int locked)
+{
+    return -ENOTSUP;
+}
+#endif
+
+/* Driver table for the "host_device" format (win32 drives and disks). */
+static BlockDriver bdrv_host_device = {
+    .format_name    = "host_device",
+    .instance_size  = sizeof(BDRVRawState),
+
+    /* open / close */
+    .bdrv_open      = hdev_open,
+    .bdrv_close     = raw_close,
+
+    /* synchronous I/O */
+    .bdrv_read      = raw_read,
+    .bdrv_write     = raw_write,
+    .bdrv_flush     = raw_flush,
+    .bdrv_getlength = raw_getlength,
+};
+
+/* Register both drivers of this file, "raw" first and then
+ * "host_device"; hooked up through block_init() below. */
+static void bdrv_raw_init(void)
+{
+    bdrv_register(&bdrv_raw);
+    bdrv_register(&bdrv_host_device);
+    /* no value to return: the function is void */
+}
+
+block_init(bdrv_raw_init);
diff --git a/block/vmdk.c b/block/vmdk.c
new file mode 100644
index 0000000000..13866e9b06
--- /dev/null
+++ b/block/vmdk.c
@@ -0,0 +1,833 @@
+/*
+ * Block driver for the VMDK format
+ *
+ * Copyright (c) 2004 Fabrice Bellard
+ * Copyright (c) 2005 Filip Navara
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
+#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
+
+/*
+ * Header of a VMDK3 ("COWD") image as read by vmdk_open(): it is fetched
+ * from the file right after the 4-byte magic and its fields are decoded
+ * with le32_to_cpu(), i.e. they are little-endian on disk.
+ */
+typedef struct {
+ uint32_t version;
+ uint32_t flags;
+ uint32_t disk_sectors; /* becomes bs->total_sectors */
+ uint32_t granularity; /* becomes cluster_sectors (sectors per cluster) */
+ uint32_t l1dir_offset; /* L1 table position in 512-byte sectors (shifted << 9 on open) */
+ uint32_t l1dir_size;
+ uint32_t file_sectors;
+ uint32_t cylinders;
+ uint32_t heads;
+ uint32_t sectors_per_track;
+} VMDK3Header;
+
+/*
+ * Header of a VMDK4 ("KDMV") image.  The struct is packed because it is
+ * read from / written to the file verbatim, directly after the 4-byte
+ * magic.  vmdk_open() decodes the fields with le32_to_cpu()/le64_to_cpu().
+ * The *_offset fields are shifted left by 9 before being used as byte
+ * offsets, so they are counted in 512-byte sectors.
+ */
+typedef struct {
+ uint32_t version;
+ uint32_t flags;
+ int64_t capacity; /* becomes bs->total_sectors */
+ int64_t granularity; /* becomes cluster_sectors (sectors per grain) */
+ int64_t desc_offset; /* vmdk_create() stores 1 here */
+ int64_t desc_size; /* vmdk_create() stores 20 here */
+ int32_t num_gtes_per_gte; /* becomes l2_size (entries per L2 table) */
+ int64_t rgd_offset; /* used as the primary L1 table offset on open */
+ int64_t gd_offset; /* used as the backup L1 table offset on open */
+ int64_t grain_offset; /* vmdk_create() truncates the new file to this many sectors */
+ char filler[1];
+ char check_bytes[4]; /* vmdk_create() writes 0x0a 0x20 0x0d 0x0a */
+} __attribute__((packed)) VMDK4Header;
+
+#define L2_CACHE_SIZE 16
+
+/* Per-image driver state, stored in BlockDriverState.opaque. */
+typedef struct BDRVVmdkState {
+ BlockDriverState *hd; /* underlying image file, opened with bdrv_file_open() */
+ int64_t l1_table_offset; /* byte offset of the L1 table in hd */
+ int64_t l1_backup_table_offset; /* byte offset of the backup L1 table, 0 if none */
+ uint32_t *l1_table; /* in-memory L1 table, converted to host endianness */
+ uint32_t *l1_backup_table; /* in-memory backup L1 table (only when offset != 0) */
+ unsigned int l1_size; /* number of L1 entries */
+ uint32_t l1_entry_sectors; /* sectors covered by one L1 entry (l2_size * cluster_sectors) */
+
+ unsigned int l2_size; /* number of entries in one L2 table */
+ uint32_t *l2_cache; /* L2_CACHE_SIZE cached L2 tables, l2_size entries each */
+ uint32_t l2_cache_offsets[L2_CACHE_SIZE]; /* L1 entry value identifying each cached table */
+ uint32_t l2_cache_counts[L2_CACHE_SIZE]; /* hit counters used to pick the eviction victim */
+
+ unsigned int cluster_sectors; /* sectors per cluster (grain) */
+ uint32_t parent_cid; /* parentCID read from the descriptor at open time */
+ int is_parent; /* set when the image was opened as somebody's parent */
+} BDRVVmdkState;
+
+/*
+ * Describes an L2 entry that get_cluster_offset() has set up in memory and
+ * that vmdk_L2update() still has to write to the image file.
+ */
+typedef struct VmdkMetaData {
+ uint32_t offset; /* new L2 entry value, already little-endian */
+ unsigned int l1_index; /* index into the L1 (and backup L1) table */
+ unsigned int l2_index; /* index of the entry inside its L2 table */
+ unsigned int l2_offset; /* L2 table position in 512-byte sectors */
+ int valid; /* non-zero when the other fields are filled in */
+} VmdkMetaData;
+
+/*
+ * Records the image and cluster most recently allocated by
+ * get_cluster_offset(); get_whole_cluster() reads it to know where to copy
+ * a parent grain.  NOTE(review): this is a single file-scope instance, so
+ * it is shared by every open VMDK image.
+ */
+typedef struct ActiveBDRVState{
+ BlockDriverState *hd; // active image handler
+ uint64_t cluster_offset; // current write offset (in sectors; shifted << 9 by the user)
+}ActiveBDRVState;
+
+static ActiveBDRVState activeBDRV;
+
+
+/*
+ * Format probe: inspects the first four bytes of an image.
+ *
+ * buf/buf_size: leading bytes of the file.
+ * filename:     unused.
+ *
+ * Returns 100 when the big-endian 32-bit value at the start of buf equals
+ * VMDK3_MAGIC or VMDK4_MAGIC, and 0 otherwise (including buf_size < 4).
+ */
+static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    uint32_t magic;
+
+    if (buf_size < 4)
+        return 0;
+
+    /*
+     * Copy the bytes out instead of dereferencing a cast pointer: buf is a
+     * const byte buffer with no alignment guarantee for a uint32_t load.
+     */
+    memcpy(&magic, buf, sizeof(magic));
+    magic = be32_to_cpu(magic);
+
+    if (magic == VMDK3_MAGIC || magic == VMDK4_MAGIC)
+        return 100;
+    return 0;
+}
+
+#define CHECK_CID 1
+
+#define SECTOR_SIZE 512
+#define DESC_SIZE 20*SECTOR_SIZE // 20 sectors of 512 bytes each
+#define HEADER_SIZE 512 // first sector of 512 bytes
+
+/*
+ * Read a content ID from the image's embedded descriptor.
+ *
+ * bs:     VMDK image whose descriptor (DESC_SIZE bytes at offset 0x200 of
+ *         the underlying file) is parsed.
+ * parent: non-zero to read the "parentCID" value, zero to read "CID".
+ *
+ * Returns the parsed hexadecimal value, 0 if the descriptor cannot be
+ * read, and 0xffffffff if the key is missing or its value does not parse.
+ * 0xffffffff is the value the descriptor template in vmdk_create() uses
+ * for "parentCID"; previously an indeterminate value was returned here.
+ */
+static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
+{
+    BDRVVmdkState *s = bs->opaque;
+    char desc[DESC_SIZE + 1];
+    uint32_t cid = 0xffffffff;
+    const char *p_name, *cid_str;
+    size_t cid_str_size;
+
+    /* the descriptor offset = 0x200 */
+    if (bdrv_pread(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE)
+        return 0;
+    /* the on-disk descriptor need not be NUL-terminated; strstr() needs it */
+    desc[DESC_SIZE] = '\0';
+
+    if (parent) {
+        cid_str = "parentCID";
+        cid_str_size = sizeof("parentCID");
+    } else {
+        cid_str = "CID";
+        cid_str_size = sizeof("CID");
+    }
+
+    if ((p_name = strstr(desc,cid_str)) != NULL) {
+        /* sizeof() counts the NUL, which conveniently skips the '=' too */
+        p_name += cid_str_size;
+        sscanf(p_name,"%x",&cid);
+    }
+
+    return cid;
+}
+
+/*
+ * Replace the "CID" value in the image's embedded descriptor with `cid`
+ * and write the descriptor back (DESC_SIZE bytes at offset 0x200).
+ *
+ * The text from "parentCID" onwards is saved first, because writing the new
+ * value truncates the descriptor string right after the CID line; the
+ * saved tail is then appended again.
+ *
+ * Returns 0 on success and -1 if the descriptor cannot be read or written,
+ * or if it has no "parentCID" entry (this used to pass a NULL pointer to
+ * pstrcpy()).
+ */
+static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
+{
+    BDRVVmdkState *s = bs->opaque;
+    char desc[DESC_SIZE + 1], tmp_desc[DESC_SIZE + 1];
+    char *p_name, *tmp_str;
+
+    /* the descriptor offset = 0x200 */
+    if (bdrv_pread(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE)
+        return -1;
+    /* the on-disk descriptor need not be NUL-terminated; strstr() needs it */
+    desc[DESC_SIZE] = '\0';
+
+    tmp_str = strstr(desc,"parentCID");
+    if (tmp_str == NULL)
+        return -1;
+    pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str);
+
+    if ((p_name = strstr(desc,"CID")) != NULL) {
+        p_name += sizeof("CID");
+        /* limits stay at DESC_SIZE so the region written back is terminated */
+        snprintf(p_name, DESC_SIZE - (p_name - desc), "%x\n", cid);
+        pstrcat(desc, DESC_SIZE, tmp_desc);
+    }
+
+    if (bdrv_pwrite(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE)
+        return -1;
+    return 0;
+}
+
+/*
+ * Check that the parent image still has the content ID that was recorded
+ * as parentCID when this image was opened.
+ *
+ * Returns 1 when there is no parent, when the IDs match, or when CHECK_CID
+ * is not defined; returns 0 on a mismatch.
+ */
+static int vmdk_is_cid_valid(BlockDriverState *bs)
+{
+#ifdef CHECK_CID
+    BDRVVmdkState *state = bs->opaque;
+    BlockDriverState *parent = state->hd->backing_hd;
+
+    /* compare the CID recorded at open time with the parent's current one */
+    if (parent && state->parent_cid != vmdk_read_cid(parent, 0))
+        return 0;
+#endif
+    return 1;
+}
+
+/*
+ * Create the VMDK4 image `filename` as a snapshot on top of `backing_file`.
+ *
+ * The parent's 512-byte header is copied verbatim, a fresh descriptor is
+ * written at offset 0x200 (CID and parentCID both set to the parent's
+ * CID), and the parent's redundant grain directory and grain directory are
+ * copied to the same offsets in the new file.
+ *
+ * Returns 0 on success, -1 on any failure.  On failure the partially
+ * written snapshot file is left in place.
+ *
+ * NOTE(review): the header fields are used as read from disk, without an
+ * endianness conversion.
+ */
+static int vmdk_snapshot_create(const char *filename, const char *backing_file)
+{
+    int snp_fd, p_fd;
+    /*
+     * Fallback if the parent's descriptor has no parsable CID; 0xffffffff is
+     * the parentCID value used by the template in vmdk_create().
+     */
+    uint32_t p_cid = 0xffffffff;
+    char *p_name, *gd_buf, *rgd_buf;
+    const char *real_filename, *temp_str;
+    VMDK4Header header;
+    uint32_t gde_entries, gd_size;
+    int64_t gd_offset, rgd_offset, capacity, gt_size;
+    char p_desc[DESC_SIZE + 1], s_desc[DESC_SIZE], hdr[HEADER_SIZE];
+    static const char desc_template[] =
+    "# Disk DescriptorFile\n"
+    "version=1\n"
+    "CID=%x\n"
+    "parentCID=%x\n"
+    "createType=\"monolithicSparse\"\n"
+    "parentFileNameHint=\"%s\"\n"
+    "\n"
+    "# Extent description\n"
+    "RW %u SPARSE \"%s\"\n"
+    "\n"
+    "# The Disk Data Base \n"
+    "#DDB\n"
+    "\n";
+
+    snp_fd = open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, 0644);
+    if (snp_fd < 0)
+        return -1;
+    p_fd = open(backing_file, O_RDONLY | O_BINARY | O_LARGEFILE);
+    if (p_fd < 0) {
+        close(snp_fd);
+        return -1;
+    }
+
+    /* read the header */
+    if (lseek(p_fd, 0x0, SEEK_SET) == -1)
+        goto fail;
+    if (read(p_fd, hdr, HEADER_SIZE) != HEADER_SIZE)
+        goto fail;
+
+    /* write the header */
+    if (lseek(snp_fd, 0x0, SEEK_SET) == -1)
+        goto fail;
+    if (write(snp_fd, hdr, HEADER_SIZE) == -1)
+        goto fail;
+
+    memset(&header, 0, sizeof(header));
+    memcpy(&header,&hdr[4], sizeof(header)); // skip the VMDK4_MAGIC
+
+    /* reserve room for all metadata; a failure here was previously ignored */
+    if (ftruncate(snp_fd, header.grain_offset << 9) < 0)
+        goto fail;
+    /* the descriptor offset = 0x200 */
+    if (lseek(p_fd, 0x200, SEEK_SET) == -1)
+        goto fail;
+    if (read(p_fd, p_desc, DESC_SIZE) != DESC_SIZE)
+        goto fail;
+    /* the on-disk descriptor need not be NUL-terminated; strstr() needs it */
+    p_desc[DESC_SIZE] = '\0';
+
+    if ((p_name = strstr(p_desc,"CID")) != NULL) {
+        p_name += sizeof("CID");
+        sscanf(p_name,"%x",&p_cid);
+    }
+
+    /* strip any directory / drive prefix from the extent name */
+    real_filename = filename;
+    if ((temp_str = strrchr(real_filename, '\\')) != NULL)
+        real_filename = temp_str + 1;
+    if ((temp_str = strrchr(real_filename, '/')) != NULL)
+        real_filename = temp_str + 1;
+    if ((temp_str = strrchr(real_filename, ':')) != NULL)
+        real_filename = temp_str + 1;
+
+    snprintf(s_desc, sizeof(s_desc), desc_template, p_cid, p_cid, backing_file,
+             (uint32_t)header.capacity, real_filename);
+
+    /* write the descriptor */
+    if (lseek(snp_fd, 0x200, SEEK_SET) == -1)
+        goto fail;
+    if (write(snp_fd, s_desc, strlen(s_desc)) == -1)
+        goto fail;
+
+    gd_offset = header.gd_offset * SECTOR_SIZE;     // offset of GD table
+    rgd_offset = header.rgd_offset * SECTOR_SIZE;   // offset of RGD table
+    capacity = header.capacity * SECTOR_SIZE;       // Extent size
+    /*
+     * Each GDE span 32M disk, means:
+     * 512 GTE per GT, each GTE points to grain
+     */
+    gt_size = (int64_t)header.num_gtes_per_gte * header.granularity * SECTOR_SIZE;
+    if (!gt_size)
+        goto fail;
+    gde_entries = (uint32_t)(capacity / gt_size);  // number of gde/rgde
+    gd_size = gde_entries * sizeof(uint32_t);
+
+    /* write RGD */
+    rgd_buf = qemu_malloc(gd_size);
+    if (lseek(p_fd, rgd_offset, SEEK_SET) == -1)
+        goto fail_rgd;
+    if (read(p_fd, rgd_buf, gd_size) != gd_size)
+        goto fail_rgd;
+    if (lseek(snp_fd, rgd_offset, SEEK_SET) == -1)
+        goto fail_rgd;
+    if (write(snp_fd, rgd_buf, gd_size) == -1)
+        goto fail_rgd;
+    qemu_free(rgd_buf);
+
+    /* write GD */
+    gd_buf = qemu_malloc(gd_size);
+    if (lseek(p_fd, gd_offset, SEEK_SET) == -1)
+        goto fail_gd;
+    if (read(p_fd, gd_buf, gd_size) != gd_size)
+        goto fail_gd;
+    if (lseek(snp_fd, gd_offset, SEEK_SET) == -1)
+        goto fail_gd;
+    if (write(snp_fd, gd_buf, gd_size) == -1)
+        goto fail_gd;
+    qemu_free(gd_buf);
+
+    close(p_fd);
+    close(snp_fd);
+    return 0;
+
+    fail_gd:
+    qemu_free(gd_buf);
+    /* rgd_buf was already released above: do not fall into fail_rgd */
+    goto fail;
+    fail_rgd:
+    qemu_free(rgd_buf);
+    fail:
+    close(p_fd);
+    close(snp_fd);
+    return -1;
+}
+
+/* Close the backing (parent) image attached to bs, if there is one. */
+static void vmdk_parent_close(BlockDriverState *bs)
+{
+    BlockDriverState *parent = bs->backing_hd;
+
+    if (!parent)
+        return;
+    bdrv_close(parent);
+}
+
+/*
+ * Set to 1 while vmdk_parent_open() is inside bdrv_open() for a parent
+ * image; vmdk_open() checks it to force read-only mode and to mark the
+ * image as a parent.
+ */
+static int parent_open = 0;
+
+/*
+ * Open the parent image named by "parentFileNameHint" in the descriptor of
+ * bs (if that key is present) and attach it as s->hd->backing_hd.
+ *
+ * filename: path of the child image; used as the base for path_combine()
+ *           when the hint cannot be stat()ed as given.
+ *
+ * Returns 0 on success or when the descriptor has no parent hint, and -1
+ * on error.  If creating or opening the parent fails, s->hd is closed
+ * before returning.
+ */
+static int vmdk_parent_open(BlockDriverState *bs, const char * filename)
+{
+    BDRVVmdkState *s = bs->opaque;
+    char *p_name;
+    char desc[DESC_SIZE + 1];
+    char parent_img_name[1024];
+
+    /* the descriptor offset = 0x200 */
+    if (bdrv_pread(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE)
+        return -1;
+    /* the on-disk descriptor need not be NUL-terminated; strstr() needs it */
+    desc[DESC_SIZE] = '\0';
+
+    if ((p_name = strstr(desc,"parentFileNameHint")) != NULL) {
+        char *end_name;
+        struct stat file_buf;
+        int ret;
+
+        /* the skip below must not run past the end of the descriptor text */
+        if (strlen(p_name) < sizeof("parentFileNameHint") + 1)
+            return -1;
+        /* skip the key, the '=' (counted by sizeof's NUL) and the opening quote */
+        p_name += sizeof("parentFileNameHint") + 1;
+        if ((end_name = strchr(p_name,'\"')) == NULL)
+            return -1;
+        if ((end_name - p_name) > sizeof (s->hd->backing_file) - 1)
+            return -1;
+
+        pstrcpy(s->hd->backing_file, end_name - p_name + 1, p_name);
+        if (stat(s->hd->backing_file, &file_buf) != 0) {
+            path_combine(parent_img_name, sizeof(parent_img_name),
+                         filename, s->hd->backing_file);
+        } else {
+            pstrcpy(parent_img_name, sizeof(parent_img_name),
+                    s->hd->backing_file);
+        }
+
+        s->hd->backing_hd = bdrv_new("");
+        if (!s->hd->backing_hd) {
+            bdrv_close(s->hd);
+            return -1;
+        }
+
+        parent_open = 1;
+        ret = bdrv_open(s->hd->backing_hd, parent_img_name, BDRV_O_RDONLY);
+        /*
+         * Clear the flag on failure as well; it used to stay set, which made
+         * every later vmdk_open() treat its image as a read-only parent.
+         */
+        parent_open = 0;
+        if (ret < 0) {
+            bdrv_close(s->hd);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Open a VMDK3 or VMDK4 image.
+ *
+ * Opens the underlying file, identifies the format from the big-endian
+ * magic at offset 0, fills in the geometry fields of the driver state from
+ * the header, loads the L1 table (and the backup L1 table when the header
+ * provides one) and allocates the L2 cache.  For VMDK4 images the parent
+ * named in the descriptor is opened as well and the parentCID is recorded.
+ *
+ * While parent_open is set (i.e. this call comes from vmdk_parent_open())
+ * the image is forced read-only.
+ *
+ * Returns 0 on success, the bdrv_file_open() error if the file cannot be
+ * opened, and -1 on any other failure.
+ */
+static int vmdk_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVVmdkState *s = bs->opaque;
+    uint32_t magic;
+    int l1_size, i, ret;
+
+    if (parent_open)
+        // Parent must be opened as RO.
+        flags = BDRV_O_RDONLY;
+
+    ret = bdrv_file_open(&s->hd, filename, flags);
+    if (ret < 0)
+        return ret;
+    if (bdrv_pread(s->hd, 0, &magic, sizeof(magic)) != sizeof(magic))
+        goto fail;
+
+    magic = be32_to_cpu(magic);
+    if (magic == VMDK3_MAGIC) {
+        VMDK3Header header;
+
+        if (bdrv_pread(s->hd, sizeof(magic), &header, sizeof(header)) != sizeof(header))
+            goto fail;
+        s->cluster_sectors = le32_to_cpu(header.granularity);
+        s->l2_size = 1 << 9;
+        s->l1_size = 1 << 6;
+        bs->total_sectors = le32_to_cpu(header.disk_sectors);
+        s->l1_table_offset = le32_to_cpu(header.l1dir_offset) << 9;
+        s->l1_backup_table_offset = 0;
+        s->l1_entry_sectors = s->l2_size * s->cluster_sectors;
+        /*
+         * Same sanity check as the VMDK4 branch: the read/write paths divide
+         * by cluster_sectors and l1_entry_sectors, so a zero granularity in
+         * the header must be rejected here.
+         */
+        if (s->l1_entry_sectors == 0)
+            goto fail;
+    } else if (magic == VMDK4_MAGIC) {
+        VMDK4Header header;
+
+        if (bdrv_pread(s->hd, sizeof(magic), &header, sizeof(header)) != sizeof(header))
+            goto fail;
+        bs->total_sectors = le64_to_cpu(header.capacity);
+        s->cluster_sectors = le64_to_cpu(header.granularity);
+        s->l2_size = le32_to_cpu(header.num_gtes_per_gte);
+        s->l1_entry_sectors = s->l2_size * s->cluster_sectors;
+        /* l1_entry_sectors is unsigned, so only zero needs rejecting */
+        if (s->l1_entry_sectors == 0)
+            goto fail;
+        s->l1_size = (bs->total_sectors + s->l1_entry_sectors - 1)
+            / s->l1_entry_sectors;
+        s->l1_table_offset = le64_to_cpu(header.rgd_offset) << 9;
+        s->l1_backup_table_offset = le64_to_cpu(header.gd_offset) << 9;
+
+        if (parent_open)
+            s->is_parent = 1;
+        else
+            s->is_parent = 0;
+
+        // try to open parent images, if exist
+        if (vmdk_parent_open(bs, filename) != 0)
+            goto fail;
+        // write the CID once after the image creation
+        s->parent_cid = vmdk_read_cid(bs,1);
+    } else {
+        goto fail;
+    }
+
+    /* read the L1 table */
+    l1_size = s->l1_size * sizeof(uint32_t);
+    s->l1_table = qemu_malloc(l1_size);
+    if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, l1_size) != l1_size)
+        goto fail;
+    for(i = 0; i < s->l1_size; i++) {
+        le32_to_cpus(&s->l1_table[i]);
+    }
+
+    if (s->l1_backup_table_offset) {
+        s->l1_backup_table = qemu_malloc(l1_size);
+        if (bdrv_pread(s->hd, s->l1_backup_table_offset, s->l1_backup_table, l1_size) != l1_size)
+            goto fail;
+        for(i = 0; i < s->l1_size; i++) {
+            le32_to_cpus(&s->l1_backup_table[i]);
+        }
+    }
+
+    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
+    return 0;
+ fail:
+    /*
+     * NOTE(review): some of these pointers may not have been assigned yet
+     * when we get here; this presumably relies on the driver state starting
+     * out zeroed - confirm against the allocator of bs->opaque.
+     */
+    qemu_free(s->l1_backup_table);
+    qemu_free(s->l1_table);
+    qemu_free(s->l2_cache);
+    bdrv_delete(s->hd);
+    return -1;
+}
+
+static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
+ uint64_t offset, int allocate);
+
+/*
+ * Copy-on-write helper: when bs has a parent image and the parent holds
+ * data for the cluster containing byte `offset`, copy that whole cluster
+ * from the parent into the image recorded in activeBDRV, at
+ * activeBDRV.cluster_offset (in sectors).
+ *
+ * cluster_offset: not used by this function; the destination is taken
+ *                 from activeBDRV instead.
+ * allocate:       passed through to get_cluster_offset() on the parent.
+ *
+ * Returns 0 on success or when there is nothing to copy, -1 if the parent
+ * CID check fails or an I/O operation fails.
+ */
+static int get_whole_cluster(BlockDriverState *bs, uint64_t cluster_offset,
+ uint64_t offset, int allocate)
+{
+ uint64_t parent_cluster_offset;
+ BDRVVmdkState *s = bs->opaque;
+ uint8_t whole_grain[s->cluster_sectors*512]; // 128 sectors * 512 bytes each = grain size 64KB
+
+ // we will be here if it's first write on non-exist grain(cluster).
+ // try to read from parent image, if exist
+ if (s->hd->backing_hd) {
+ BDRVVmdkState *ps = s->hd->backing_hd->opaque;
+
+ if (!vmdk_is_cid_valid(bs))
+ return -1;
+
+ parent_cluster_offset = get_cluster_offset(s->hd->backing_hd, NULL, offset, allocate);
+
+ if (parent_cluster_offset) {
+ BDRVVmdkState *act_s = activeBDRV.hd->opaque;
+
+ // NOTE(review): the read length comes from the parent's cluster size while
+ // the buffer is sized from this image's; looks like the two are assumed
+ // to be equal - confirm.
+ if (bdrv_pread(ps->hd, parent_cluster_offset, whole_grain, ps->cluster_sectors*512) != ps->cluster_sectors*512)
+ return -1;
+
+ //Write grain only into the active image
+ if (bdrv_pwrite(act_s->hd, activeBDRV.cluster_offset << 9, whole_grain, sizeof(whole_grain)) != sizeof(whole_grain))
+ return -1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Write the L2 entry described by m_data to the image file, and to the
+ * backup L2 table as well when the image has a backup L1 table.
+ *
+ * Side effect: when the backup table is updated, m_data->l2_offset is
+ * overwritten with the backup table's location.
+ *
+ * Returns 0 on success, -1 if a write fails.
+ */
+static int vmdk_L2update(BlockDriverState *bs, VmdkMetaData *m_data)
+{
+    BDRVVmdkState *s = bs->opaque;
+    const size_t entry_len = sizeof(m_data->offset);
+    int64_t pos;
+
+    /* update L2 table */
+    pos = ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * entry_len);
+    if (bdrv_pwrite(s->hd, pos, &(m_data->offset), entry_len) != entry_len)
+        return -1;
+
+    if (s->l1_backup_table_offset == 0)
+        return 0;
+
+    /* update backup L2 table */
+    m_data->l2_offset = s->l1_backup_table[m_data->l1_index];
+    pos = ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * entry_len);
+    if (bdrv_pwrite(s->hd, pos, &(m_data->offset), entry_len) != entry_len)
+        return -1;
+
+    return 0;
+}
+
+/*
+ * Translate a guest byte offset into the byte offset of its cluster in the
+ * image file, optionally allocating the cluster.
+ *
+ * bs:       VMDK image.
+ * m_data:   may be NULL.  Its `valid` flag is cleared on entry; when a new
+ *           cluster is set up it is filled with the L2 entry that
+ *           vmdk_L2update() still has to write, and `valid` is set.
+ * offset:   guest byte offset.
+ * allocate: non-zero to allocate the cluster if it does not exist yet.
+ *
+ * Returns the cluster's byte offset in s->hd, or 0 when the offset lies
+ * beyond the L1 table, the L1 entry is empty, the L2 table cannot be read,
+ * the cluster is unallocated and `allocate` is 0, or get_whole_cluster()
+ * fails.  For an image marked is_parent no cluster is allocated, so 0 is
+ * returned even with `allocate` set.
+ */
+static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
+ uint64_t offset, int allocate)
+{
+ BDRVVmdkState *s = bs->opaque;
+ unsigned int l1_index, l2_offset, l2_index;
+ int min_index, i, j;
+ uint32_t min_count, *l2_table, tmp = 0;
+ uint64_t cluster_offset;
+
+ if (m_data)
+ m_data->valid = 0;
+
+ l1_index = (offset >> 9) / s->l1_entry_sectors;
+ if (l1_index >= s->l1_size)
+ return 0;
+ l2_offset = s->l1_table[l1_index];
+ if (!l2_offset)
+ return 0;
+ /* look for the L2 table in the cache; entries are keyed by l2_offset */
+ for(i = 0; i < L2_CACHE_SIZE; i++) {
+ if (l2_offset == s->l2_cache_offsets[i]) {
+ /* increment the hit count */
+ /* when a counter is about to saturate, halve them all */
+ if (++s->l2_cache_counts[i] == 0xffffffff) {
+ for(j = 0; j < L2_CACHE_SIZE; j++) {
+ s->l2_cache_counts[j] >>= 1;
+ }
+ }
+ l2_table = s->l2_cache + (i * s->l2_size);
+ goto found;
+ }
+ }
+ /* not found: load a new entry in the least used one */
+ min_index = 0;
+ min_count = 0xffffffff;
+ for(i = 0; i < L2_CACHE_SIZE; i++) {
+ if (s->l2_cache_counts[i] < min_count) {
+ min_count = s->l2_cache_counts[i];
+ min_index = i;
+ }
+ }
+ l2_table = s->l2_cache + (min_index * s->l2_size);
+ if (bdrv_pread(s->hd, (int64_t)l2_offset * 512, l2_table, s->l2_size * sizeof(uint32_t)) !=
+ s->l2_size * sizeof(uint32_t))
+ return 0;
+
+ s->l2_cache_offsets[min_index] = l2_offset;
+ s->l2_cache_counts[min_index] = 1;
+ found:
+ /* cached L2 tables stay little-endian; convert the entry on use */
+ l2_index = ((offset >> 9) / s->cluster_sectors) % s->l2_size;
+ cluster_offset = le32_to_cpu(l2_table[l2_index]);
+
+ if (!cluster_offset) {
+ if (!allocate)
+ return 0;
+ // Avoid the L2 tables update for the images that have snapshots.
+ if (!s->is_parent) {
+ /* the new cluster is appended at the current end of the file */
+ cluster_offset = bdrv_getlength(s->hd);
+ bdrv_truncate(s->hd, cluster_offset + (s->cluster_sectors << 9));
+
+ cluster_offset >>= 9;
+ tmp = cpu_to_le32(cluster_offset);
+ l2_table[l2_index] = tmp;
+ // Save the active image state
+ activeBDRV.cluster_offset = cluster_offset;
+ activeBDRV.hd = bs;
+ }
+ /* First of all we write grain itself, to avoid race condition
+ * that may corrupt the image.
+ * This problem may occur because of insufficient space on host disk
+ * or inappropriate VM shutdown.
+ */
+ if (get_whole_cluster(bs, cluster_offset, offset, allocate) == -1)
+ return 0;
+
+ /* hand the pending on-disk L2 update to the caller */
+ if (m_data) {
+ m_data->offset = tmp;
+ m_data->l1_index = l1_index;
+ m_data->l2_index = l2_index;
+ m_data->l2_offset = l2_offset;
+ m_data->valid = 1;
+ }
+ }
+ /* sectors -> bytes */
+ cluster_offset <<= 9;
+ return cluster_offset;
+}
+
+/*
+ * Report whether the cluster containing sector_num is allocated in this
+ * image.
+ *
+ * *pnum receives the number of sectors, starting at sector_num and capped
+ * at nb_sectors, that lie in the same cluster.
+ *
+ * Returns non-zero if the cluster is allocated, 0 otherwise.
+ */
+static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num,
+                             int nb_sectors, int *pnum)
+{
+    BDRVVmdkState *s = bs->opaque;
+    uint64_t cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
+    int first = sector_num % s->cluster_sectors;
+    int run = s->cluster_sectors - first;
+
+    *pnum = (run > nb_sectors) ? nb_sectors : run;
+    return (cluster_offset != 0);
+}
+
+/*
+ * Read nb_sectors sectors starting at sector_num into buf.
+ *
+ * The request is processed one cluster at a time.  Allocated clusters are
+ * read from the image file; unallocated ones are read from the parent
+ * image when there is one (after checking its CID) and are zero-filled
+ * otherwise.
+ *
+ * Returns 0 on success, -1 on a read error or a parent CID mismatch.
+ */
+static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVVmdkState *s = bs->opaque;
+
+    while (nb_sectors > 0) {
+        uint64_t cluster_offset =
+            get_cluster_offset(bs, NULL, sector_num << 9, 0);
+        int index_in_cluster = sector_num % s->cluster_sectors;
+        int n = s->cluster_sectors - index_in_cluster;
+
+        /* never go past the end of the request */
+        if (n > nb_sectors)
+            n = nb_sectors;
+
+        if (cluster_offset) {
+            if(bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512) != n * 512)
+                return -1;
+        } else if (s->hd->backing_hd) {
+            // try to read from parent image, if exist
+            if (!vmdk_is_cid_valid(bs))
+                return -1;
+            if (bdrv_read(s->hd->backing_hd, sector_num, buf, n) < 0)
+                return -1;
+        } else {
+            memset(buf, 0, 512 * n);
+        }
+
+        nb_sectors -= n;
+        sector_num += n;
+        buf += n * 512;
+    }
+    return 0;
+}
+
+/*
+ * Write nb_sectors sectors from buf, starting at sector_num.
+ *
+ * The request is processed one cluster at a time.  Each cluster is
+ * allocated on demand through get_cluster_offset(); the data is written
+ * first and the on-disk L2 tables are updated afterwards when a new
+ * cluster was set up.  After the first successful chunk the image's CID is
+ * rewritten with the current time.
+ *
+ * Returns 0 on success, -1 if sector_num lies beyond the image or on any
+ * allocation / write failure.
+ *
+ * NOTE(review): cid_update is a function-level static, so the CID is
+ * refreshed once per process rather than once per opened image, and the
+ * result of vmdk_write_cid() is ignored.
+ */
+static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
+                     const uint8_t *buf, int nb_sectors)
+{
+    BDRVVmdkState *s = bs->opaque;
+    VmdkMetaData m_data;
+    int index_in_cluster, n;
+    uint64_t cluster_offset;
+    static int cid_update = 0;
+
+    if (sector_num > bs->total_sectors) {
+        fprintf(stderr,
+                "(VMDK) Wrong offset: sector_num=0x%" PRIx64
+                " total_sectors=0x%" PRIx64 "\n",
+                sector_num, bs->total_sectors);
+        return -1;
+    }
+
+    while (nb_sectors > 0) {
+        /*
+         * Use the same modulo as vmdk_read() and vmdk_is_allocated(); the
+         * former bit mask only gave the right index for power-of-two
+         * cluster sizes.
+         */
+        index_in_cluster = sector_num % s->cluster_sectors;
+        n = s->cluster_sectors - index_in_cluster;
+        if (n > nb_sectors)
+            n = nb_sectors;
+        cluster_offset = get_cluster_offset(bs, &m_data, sector_num << 9, 1);
+        if (!cluster_offset)
+            return -1;
+
+        if (bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512) != n * 512)
+            return -1;
+        if (m_data.valid) {
+            /* update L2 tables */
+            if (vmdk_L2update(bs, &m_data) == -1)
+                return -1;
+        }
+        nb_sectors -= n;
+        sector_num += n;
+        buf += n * 512;
+
+        // update CID on the first write every time the virtual disk is opened
+        if (!cid_update) {
+            vmdk_write_cid(bs, time(NULL));
+            cid_update++;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Create a new, empty VMDK4 "monolithicSparse" image.
+ *
+ * filename:     image to create (truncated if it exists).
+ * total_size:   stored as the header capacity and in the descriptor's
+ *               extent line; the grain count is derived from it using the
+ *               128-sector granularity.
+ * backing_file: when non-NULL the work is delegated to
+ *               vmdk_snapshot_create().
+ * flags:        BLOCK_FLAG_COMPAT6 selects virtualHWVersion 6 instead of 4.
+ *
+ * File layout, in 512-byte sectors: header at 0, descriptor at 1 (20
+ * sectors), then the redundant grain directory with its grain tables, then
+ * the grain directory with its grain tables, then grain data aligned to
+ * the granularity.
+ *
+ * Returns 0 on success and -1 on failure.  Every write/seek/truncate is now
+ * checked (failures used to be ignored), and the layout is computed in host
+ * byte order and converted to little-endian only when stored, so the image
+ * no longer depends on the host's endianness.
+ */
+static int vmdk_create(const char *filename, int64_t total_size,
+                       const char *backing_file, int flags)
+{
+    int fd, i, ret = -1;
+    VMDK4Header header;
+    uint32_t tmp, le_tmp, magic, grains, gd_size, gt_size, gt_count;
+    /* host-endian copies of the header values used for layout arithmetic */
+    const int64_t granularity = 128;
+    const int32_t gtes_per_gt = 512;
+    int64_t desc_offset, desc_size, rgd_offset, gd_offset, grain_offset;
+    static const char desc_template[] =
+        "# Disk DescriptorFile\n"
+        "version=1\n"
+        "CID=%x\n"
+        "parentCID=ffffffff\n"
+        "createType=\"monolithicSparse\"\n"
+        "\n"
+        "# Extent description\n"
+        "RW %" PRId64 " SPARSE \"%s\"\n"
+        "\n"
+        "# The Disk Data Base \n"
+        "#DDB\n"
+        "\n"
+        "ddb.virtualHWVersion = \"%d\"\n"
+        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
+        "ddb.geometry.heads = \"16\"\n"
+        "ddb.geometry.sectors = \"63\"\n"
+        "ddb.adapterType = \"ide\"\n";
+    char desc[1024];
+    const char *real_filename, *temp_str;
+
+    /* XXX: add support for backing file */
+    if (backing_file) {
+        return vmdk_snapshot_create(filename, backing_file);
+    }
+
+    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
+              0644);
+    if (fd < 0)
+        return -1;
+
+    /* sizes of the metadata areas, in sectors */
+    grains = (total_size + granularity - 1) / granularity;
+    gt_size = ((gtes_per_gt * sizeof(uint32_t)) + 511) >> 9;
+    gt_count = (grains + gtes_per_gt - 1) / gtes_per_gt;
+    gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9;
+
+    desc_offset = 1;
+    desc_size = 20;
+    rgd_offset = desc_offset + desc_size;
+    gd_offset = rgd_offset + gd_size + (gt_size * gt_count);
+    grain_offset =
+        ((gd_offset + gd_size + (gt_size * gt_count) +
+          granularity - 1) / granularity) *
+        granularity;
+
+    magic = cpu_to_be32(VMDK4_MAGIC);
+    memset(&header, 0, sizeof(header));
+    header.version = cpu_to_le32(1);
+    header.flags = cpu_to_le32(3); /* ?? */
+    header.capacity = cpu_to_le64(total_size);
+    header.granularity = cpu_to_le64(granularity);
+    header.num_gtes_per_gte = cpu_to_le32(gtes_per_gt);
+    header.desc_offset = cpu_to_le64(desc_offset);
+    header.desc_size = cpu_to_le64(desc_size);
+    header.rgd_offset = cpu_to_le64(rgd_offset);
+    header.gd_offset = cpu_to_le64(gd_offset);
+    header.grain_offset = cpu_to_le64(grain_offset);
+
+    header.check_bytes[0] = 0xa;
+    header.check_bytes[1] = 0x20;
+    header.check_bytes[2] = 0xd;
+    header.check_bytes[3] = 0xa;
+
+    /* write all the data */
+    if (write(fd, &magic, sizeof(magic)) != sizeof(magic))
+        goto out;
+    if (write(fd, &header, sizeof(header)) != sizeof(header))
+        goto out;
+
+    if (ftruncate(fd, grain_offset << 9) < 0)
+        goto out;
+
+    /* write grain directory */
+    if (lseek(fd, rgd_offset << 9, SEEK_SET) == -1)
+        goto out;
+    for (i = 0, tmp = rgd_offset + gd_size;
+         i < gt_count; i++, tmp += gt_size) {
+        le_tmp = cpu_to_le32(tmp);
+        if (write(fd, &le_tmp, sizeof(le_tmp)) != sizeof(le_tmp))
+            goto out;
+    }
+
+    /* write backup grain directory */
+    if (lseek(fd, gd_offset << 9, SEEK_SET) == -1)
+        goto out;
+    for (i = 0, tmp = gd_offset + gd_size;
+         i < gt_count; i++, tmp += gt_size) {
+        le_tmp = cpu_to_le32(tmp);
+        if (write(fd, &le_tmp, sizeof(le_tmp)) != sizeof(le_tmp))
+            goto out;
+    }
+
+    /* compose the descriptor */
+    real_filename = filename;
+    if ((temp_str = strrchr(real_filename, '\\')) != NULL)
+        real_filename = temp_str + 1;
+    if ((temp_str = strrchr(real_filename, '/')) != NULL)
+        real_filename = temp_str + 1;
+    if ((temp_str = strrchr(real_filename, ':')) != NULL)
+        real_filename = temp_str + 1;
+    snprintf(desc, sizeof(desc), desc_template, (unsigned int)time(NULL),
+             total_size, real_filename,
+             (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
+             total_size / (int64_t)(63 * 16));
+
+    /* write the descriptor */
+    if (lseek(fd, desc_offset << 9, SEEK_SET) == -1)
+        goto out;
+    if (write(fd, desc, strlen(desc)) != strlen(desc))
+        goto out;
+
+    ret = 0;
+ out:
+    close(fd);
+    return ret;
+}
+
+/*
+ * Release everything vmdk_open() set up: the L1 table, the backup L1 table
+ * (previously leaked here), the L2 cache, the parent image and the
+ * underlying file.
+ */
+static void vmdk_close(BlockDriverState *bs)
+{
+    BDRVVmdkState *s = bs->opaque;
+
+    qemu_free(s->l1_table);
+    /*
+     * Only allocated when the image has a backup L1 table; vmdk_open()'s
+     * failure path frees it unconditionally in the same way.
+     */
+    qemu_free(s->l1_backup_table);
+    qemu_free(s->l2_cache);
+    // try to close parent image, if exist
+    vmdk_parent_close(s->hd);
+    bdrv_delete(s->hd);
+}
+
+/* Flush callback: forwards the flush to the underlying image file. */
+static void vmdk_flush(BlockDriverState *bs)
+{
+    BDRVVmdkState *state = bs->opaque;
+    BlockDriverState *file = state->hd;
+
+    bdrv_flush(file);
+}
+
+/* Driver table for the "vmdk" format. */
+static BlockDriver bdrv_vmdk = {
+    .format_name       = "vmdk",
+    .instance_size     = sizeof(BDRVVmdkState),
+
+    /* detection, creation and lifecycle */
+    .bdrv_probe        = vmdk_probe,
+    .bdrv_create       = vmdk_create,
+    .bdrv_open         = vmdk_open,
+    .bdrv_close        = vmdk_close,
+
+    /* data path */
+    .bdrv_read         = vmdk_read,
+    .bdrv_write        = vmdk_write,
+    .bdrv_flush        = vmdk_flush,
+    .bdrv_is_allocated = vmdk_is_allocated,
+};
+
+/* Module init hook (installed via block_init()): registers the VMDK driver. */
+static void bdrv_vmdk_init(void)
+{
+ bdrv_register(&bdrv_vmdk);
+}
+
+block_init(bdrv_vmdk_init);
diff --git a/block/vpc.c b/block/vpc.c
new file mode 100644
index 0000000000..211ae5c72f
--- /dev/null
+++ b/block/vpc.c
@@ -0,0 +1,606 @@
+/*
+ * Block driver for Conectix/Microsoft Virtual PC images
+ *
+ * Copyright (c) 2005 Alex Beregszaszi
+ * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+/**************************************************************/
+
+#define HEADER_SIZE 512
+
+//#define CACHE
+
+// Disk type values; the footer carries a 32-bit `type` field.
+// NOTE(review): presumably these are the values stored in that field -
+// nothing in the visible code compares against them yet.
+enum vhd_type {
+ VHD_FIXED = 2,
+ VHD_DYNAMIC = 3,
+ VHD_DIFFERENCING = 4,
+};
+
+// Seconds since Jan 1, 2000 0:00:00 (UTC)
+#define VHD_TIMESTAMP_BASE 946684800
+
+// always big-endian
+// On-disk footer. vpc_open() reads HEADER_SIZE bytes from offset 0 of the
+// image into BDRVVPCState.footer_buf and views them through this struct;
+// rewrite_footer() writes the same buffer back after the last data block.
+struct vhd_footer {
+ char creator[8]; // "conectix"
+ uint32_t features;
+ uint32_t version;
+
+ // Offset of next header structure, 0xFFFFFFFF if none
+ // (vpc_open() reads the dynamic disk header from this offset)
+ uint64_t data_offset;
+
+ // Seconds since Jan 1, 2000 0:00:00 (UTC)
+ uint32_t timestamp;
+
+ char creator_app[4]; // "vpc "
+ uint16_t major;
+ uint16_t minor;
+ char creator_os[4]; // "Wi2k"
+
+ uint64_t orig_size;
+ uint64_t size;
+
+ // Geometry; vpc_open() computes the visible size in sectors as
+ // cyls * heads * secs_per_cyl
+ uint16_t cyls;
+ uint8_t heads;
+ uint8_t secs_per_cyl;
+
+ uint32_t type;
+
+ // Checksum of the Hard Disk Footer ("one's complement of the sum of all
+ // the bytes in the footer without the checksum field")
+ uint32_t checksum;
+
+ // UUID used to identify a parent hard disk (backing file)
+ uint8_t uuid[16];
+
+ uint8_t in_saved_state;
+};
+
+// Dynamic disk header, located at the footer's data_offset. vpc_open()
+// reads only its first HEADER_SIZE bytes and decodes the fields it uses
+// with the be32/be64 helpers (big-endian on disk).
+struct vhd_dyndisk_header {
+ char magic[8]; // "cxsparse"
+
+ // Offset of next header structure, 0xFFFFFFFF if none
+ uint64_t data_offset;
+
+ // Offset of the Block Allocation Table (BAT)
+ uint64_t table_offset;
+
+ uint32_t version;
+ uint32_t max_table_entries; // 32bit/entry
+
+ // 2 MB by default, must be a power of two
+ uint32_t block_size;
+
+ uint32_t checksum;
+ uint8_t parent_uuid[16];
+ uint32_t parent_timestamp;
+ uint32_t reserved;
+
+ // Backing file name (in UTF-16)
+ uint8_t parent_name[512];
+
+ // Not accessed by the visible code
+ struct {
+ uint32_t platform;
+ uint32_t data_space;
+ uint32_t data_length;
+ uint32_t reserved;
+ uint64_t data_offset;
+ } parent_locator[8];
+};
+
+// Per-image driver state, stored in BlockDriverState.opaque.
+typedef struct BDRVVPCState {
+ BlockDriverState *hd; // underlying image file
+
+ uint8_t footer_buf[HEADER_SIZE]; // raw footer as read at open; rewritten when the file grows
+ uint64_t free_data_block_offset; // byte offset where the next block will be placed
+ int max_table_entries; // number of BAT entries
+ uint32_t *pagetable; // in-memory BAT, host-endian; 0xFFFFFFFF = unallocated
+ uint64_t bat_offset; // byte offset of the BAT in the file
+ uint64_t last_bitmap_offset; // bitmap most recently filled by get_sector_offset()
+
+ uint32_t block_size; // bytes of data per block
+ uint32_t bitmap_size; // bytes of bitmap preceding each block (multiple of 512)
+
+// NOTE(review): CACHE is commented out above, so these fields do not exist
+// in the build. The CACHE code in vpc_open() assigns s->last_pagetable,
+// which is not declared here.
+#ifdef CACHE
+ uint8_t *pageentry_u8;
+ uint32_t *pageentry_u32;
+ uint16_t *pageentry_u16;
+
+ uint64_t last_bitmap;
+#endif
+} BDRVVPCState;
+
+/*
+ * Compute the VHD checksum of a buffer: the one's complement of the sum of
+ * all its bytes.
+ *
+ * buf:  bytes to sum (not modified).
+ * size: number of bytes.
+ *
+ * Returns ~(sum of bytes), with the sum taken modulo 2^32.
+ */
+static uint32_t vpc_checksum(const uint8_t* buf, size_t size)
+{
+    uint32_t res = 0;
+    size_t i;   /* same type as size: no signed/unsigned comparison */
+
+    for (i = 0; i < size; i++)
+        res += buf[i];
+
+    return ~res;
+}
+
+
+/*
+ * Format probe: returns 100 when the buffer holds at least 8 bytes and
+ * starts with the "conectix" signature, 0 otherwise.  filename is unused.
+ */
+static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    if (buf_size < 8)
+        return 0;
+
+    return strncmp((char *)buf, "conectix", 8) == 0 ? 100 : 0;
+}
+
+/*
+ * Open a Virtual PC image.
+ *
+ * Reads the footer copy at offset 0, checks its "conectix" signature,
+ * warns on stderr if its checksum does not match, derives the visible size
+ * from the footer's geometry, then reads the dynamic disk header and the
+ * Block Allocation Table and works out where the next data block would go.
+ *
+ * Returns 0 on success, the bdrv_file_open() error if the file cannot be
+ * opened, and -1 on any other failure.
+ */
+static int vpc_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVVPCState *s = bs->opaque;
+    int ret, i;
+    struct vhd_footer* footer;
+    struct vhd_dyndisk_header* dyndisk_header;
+    uint8_t buf[HEADER_SIZE];
+    uint32_t checksum, stored_checksum;
+
+    ret = bdrv_file_open(&s->hd, filename, flags);
+    if (ret < 0)
+        return ret;
+
+    if (bdrv_pread(s->hd, 0, s->footer_buf, HEADER_SIZE) != HEADER_SIZE)
+        goto fail;
+
+    footer = (struct vhd_footer*) s->footer_buf;
+    if (strncmp(footer->creator, "conectix", 8))
+        goto fail;
+
+    /* the checksum is computed with the checksum field itself zeroed */
+    stored_checksum = footer->checksum;
+    checksum = be32_to_cpu(stored_checksum);
+    footer->checksum = 0;
+    if (vpc_checksum(s->footer_buf, HEADER_SIZE) != checksum)
+        fprintf(stderr, "block-vpc: The header checksum of '%s' is "
+            "incorrect.\n", filename);
+    /*
+     * Put the original value back: footer_buf is what rewrite_footer()
+     * later writes to the file, and it used to go out with a zero checksum.
+     */
+    footer->checksum = stored_checksum;
+
+    // The visible size of a image in Virtual PC depends on the geometry
+    // rather than on the size stored in the footer (the size in the footer
+    // is too large usually)
+    bs->total_sectors = (int64_t)
+        be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
+
+    if (bdrv_pread(s->hd, be64_to_cpu(footer->data_offset), buf, HEADER_SIZE)
+            != HEADER_SIZE)
+        goto fail;
+
+    dyndisk_header = (struct vhd_dyndisk_header*) buf;
+
+    if (strncmp(dyndisk_header->magic, "cxsparse", 8))
+        goto fail;
+
+
+    s->block_size = be32_to_cpu(dyndisk_header->block_size);
+    /* one bit per 512-byte sector, rounded up to a whole number of sectors */
+    s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
+
+    s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries);
+    s->pagetable = qemu_malloc(s->max_table_entries * 4);
+
+    s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);
+    if (bdrv_pread(s->hd, s->bat_offset, s->pagetable,
+            s->max_table_entries * 4) != s->max_table_entries * 4) {
+        /* the table was leaked on this path before */
+        qemu_free(s->pagetable);
+        s->pagetable = NULL;
+        goto fail;
+    }
+
+    /* first candidate: the sector boundary right after the BAT ... */
+    s->free_data_block_offset =
+        (s->bat_offset + (s->max_table_entries * 4) + 511) & ~511;
+
+    /* ... moved past the end of the furthest allocated block */
+    for (i = 0; i < s->max_table_entries; i++) {
+        be32_to_cpus(&s->pagetable[i]);
+        if (s->pagetable[i] != 0xFFFFFFFF) {
+            int64_t next = (512 * (int64_t) s->pagetable[i]) +
+                s->bitmap_size + s->block_size;
+
+            if (next> s->free_data_block_offset)
+                s->free_data_block_offset = next;
+        }
+    }
+
+    s->last_bitmap_offset = (int64_t) -1;
+
+#ifdef CACHE
+    s->pageentry_u8 = qemu_malloc(512);
+    s->pageentry_u32 = s->pageentry_u8;
+    s->pageentry_u16 = s->pageentry_u8;
+    s->last_pagetable = -1;
+#endif
+
+    return 0;
+ fail:
+    bdrv_delete(s->hd);
+    return -1;
+}
+
+/*
+ * Returns the absolute byte offset of the given sector in the image file.
+ * If the sector is not allocated, -1 is returned instead.
+ *
+ * "Not allocated" means the sector's block lies beyond the BAT or its BAT
+ * entry is 0xffffffff; the per-sector bitmap is not consulted (the code
+ * that did so is disabled below).
+ *
+ * The parameter write must be 1 if the offset will be used for a write
+ * operation (the block bitmaps is updated then), 0 otherwise.
+ */
+static inline int64_t get_sector_offset(BlockDriverState *bs,
+ int64_t sector_num, int write)
+{
+ BDRVVPCState *s = bs->opaque;
+ uint64_t offset = sector_num * 512;
+ uint64_t bitmap_offset, block_offset;
+ uint32_t pagetable_index, pageentry_index;
+
+ // pagetable_index: which block; pageentry_index: which sector inside it
+ pagetable_index = offset / s->block_size;
+ pageentry_index = (offset % s->block_size) / 512;
+
+ if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
+ return -1; // not allocated
+
+ // A BAT entry is the sector number of the block's bitmap; the data
+ // follows directly after the bitmap.
+ bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
+ block_offset = bitmap_offset + s->bitmap_size + (512 * pageentry_index);
+
+ // We must ensure that we don't write to any sectors which are marked as
+ // unused in the bitmap. We get away with setting all bits in the block
+ // bitmap each time we write to a new block. This might cause Virtual PC to
+ // miss sparse read optimization, but it's not a problem in terms of
+ // correctness.
+ if (write && (s->last_bitmap_offset != bitmap_offset)) {
+ uint8_t bitmap[s->bitmap_size];
+
+ s->last_bitmap_offset = bitmap_offset;
+ memset(bitmap, 0xff, s->bitmap_size);
+ // NOTE(review): the result of this write is not checked.
+ bdrv_pwrite(s->hd, bitmap_offset, bitmap, s->bitmap_size);
+ }
+
+// printf("sector: %" PRIx64 ", index: %x, offset: %x, bioff: %" PRIx64 ", bloff: %" PRIx64 "\n",
+// sector_num, pagetable_index, pageentry_index,
+// bitmap_offset, block_offset);
+
+// disabled by reason
+// (the code below refers to s->fd, bitmap_entry and i, none of which is
+// declared in this function)
+#if 0
+#ifdef CACHE
+ if (bitmap_offset != s->last_bitmap)
+ {
+ lseek(s->fd, bitmap_offset, SEEK_SET);
+
+ s->last_bitmap = bitmap_offset;
+
+ // Scary! Bitmap is stored as big endian 32bit entries,
+ // while we used to look it up byte by byte
+ read(s->fd, s->pageentry_u8, 512);
+ for (i = 0; i < 128; i++)
+ be32_to_cpus(&s->pageentry_u32[i]);
+ }
+
+ if ((s->pageentry_u8[pageentry_index / 8] >> (pageentry_index % 8)) & 1)
+ return -1;
+#else
+ lseek(s->fd, bitmap_offset + (pageentry_index / 8), SEEK_SET);
+
+ read(s->fd, &bitmap_entry, 1);
+
+ if ((bitmap_entry >> (pageentry_index % 8)) & 1)
+ return -1; // not allocated
+#endif
+#endif
+
+ return block_offset;
+}
+
+/*
+ * Write the cached footer (s->footer_buf) at s->free_data_block_offset,
+ * i.e. behind the last data block.  Needed whenever the file grows, since
+ * the new block overwrites the footer's previous location.
+ *
+ * Returns 0 on success and the negative bdrv_pwrite() result on error.
+ */
+static int rewrite_footer(BlockDriverState* bs)
+{
+    BDRVVPCState *s = bs->opaque;
+    int64_t footer_pos = s->free_data_block_offset;
+    int written = bdrv_pwrite(s->hd, footer_pos, s->footer_buf, HEADER_SIZE);
+
+    return (written < 0) ? written : 0;
+}
+
+/*
+ * Allocates a new block. This involves writing a new footer and updating
+ * the Block Allocation Table to use the space at the old end of the image
+ * file (overwriting the old footer)
+ *
+ * The in-memory state (BAT entry and free_data_block_offset) is only kept
+ * when every write succeeded; on any failure it is restored so that a later
+ * allocation cannot hand out the same file offset to two different blocks.
+ *
+ * Returns the sectors' offset in the image file on success and < 0 on error
+ */
+static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num)
+{
+    BDRVVPCState *s = bs->opaque;
+    int64_t bat_offset;
+    uint32_t index, bat_value;
+    int ret;
+    uint8_t bitmap[s->bitmap_size];
+
+    // Check if sector_num is valid
+    if ((sector_num < 0) || (sector_num > bs->total_sectors))
+        return -1;
+
+    // The block must have a BAT slot and must not be allocated yet
+    index = (sector_num * 512) / s->block_size;
+    if (index >= s->max_table_entries)
+        return -1;
+    if (s->pagetable[index] != 0xFFFFFFFF)
+        return -1;
+
+    // Initialize the block's bitmap (all sectors marked as used). Nothing
+    // has been changed in memory yet, so a failure needs no rollback.
+    memset(bitmap, 0xff, s->bitmap_size);
+    ret = bdrv_pwrite(s->hd, s->free_data_block_offset, bitmap,
+                      s->bitmap_size);
+    if (ret < 0)
+        return -1;
+
+    // Write entry into in-memory BAT (the BAT counts in 512 byte sectors)
+    s->pagetable[index] = s->free_data_block_offset / 512;
+
+    // Write new footer (the old one will be overwritten)
+    s->free_data_block_offset += s->block_size + s->bitmap_size;
+    ret = rewrite_footer(bs);
+    if (ret < 0)
+        goto fail;
+
+    // Write BAT entry to disk. The on-disk table is big endian; the byte
+    // swap performed by be32_to_cpu() is the same in both directions.
+    bat_offset = s->bat_offset + (4 * index);
+    bat_value = be32_to_cpu(s->pagetable[index]);
+    ret = bdrv_pwrite(s->hd, bat_offset, &bat_value, 4);
+    if (ret < 0)
+        goto fail;
+
+    return get_sector_offset(bs, sector_num, 0);
+
+fail:
+    // Undo both in-memory changes so the block counts as unallocated again
+    s->free_data_block_offset -= (s->block_size + s->bitmap_size);
+    s->pagetable[index] = 0xFFFFFFFF;
+    return -1;
+}
+
+/*
+ * Reads nb_sectors 512 byte sectors starting at sector_num into buf, one
+ * sector at a time. Sectors for which get_sector_offset() reports -1 are
+ * returned as zeroes.
+ *
+ * Returns 0 on success and -1 as soon as a read from the image comes back
+ * short or fails.
+ */
+static int vpc_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVVPCState *s = bs->opaque;
+
+    for (; nb_sectors > 0; nb_sectors--, sector_num++, buf += 512) {
+        int64_t pos = get_sector_offset(bs, sector_num, 0);
+
+        if (pos == -1) {
+            memset(buf, 0, 512);
+            continue;
+        }
+
+        if (bdrv_pread(s->hd, pos, buf, 512) != 512)
+            return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Writes nb_sectors 512 byte sectors from buf starting at sector_num, one
+ * sector at a time. A sector whose block is not allocated yet triggers
+ * alloc_block() first.
+ *
+ * Returns 0 on success and -1 if an allocation fails or a write to the
+ * image does not store the full sector.
+ */
+static int vpc_write(BlockDriverState *bs, int64_t sector_num,
+                     const uint8_t *buf, int nb_sectors)
+{
+    BDRVVPCState *s = bs->opaque;
+
+    for (; nb_sectors > 0; nb_sectors--, sector_num++, buf += 512) {
+        int64_t pos = get_sector_offset(bs, sector_num, 1);
+
+        if (pos == -1) {
+            pos = alloc_block(bs, sector_num);
+            if (pos < 0)
+                return -1;
+        }
+
+        if (bdrv_pwrite(s->hd, pos, buf, 512) != 512)
+            return -1;
+    }
+
+    return 0;
+}
+
+
+
+/*
+ * Derives a cylinders/heads/sectors-per-track geometry from a sector count,
+ * following the algorithm given in the VHD specification.
+ *
+ * The resulting geometry does not necessarily describe exactly
+ * total_sectors sectors. The outputs are only written when the function
+ * succeeds.
+ *
+ * Returns 0 on success, -EFBIG if the size is larger than 127 GB
+ */
+static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
+    uint8_t* heads, uint8_t* secs_per_cyl)
+{
+    uint32_t tracks;    /* cylinders * heads */
+    uint8_t nheads;
+    uint8_t spt;        /* sectors per track */
+
+    if (total_sectors > 65535 * 16 * 255)
+        return -EFBIG;
+
+    if (total_sectors > 65535 * 16 * 63) {
+        /* Too big for 63 sectors per track: use the maximum of 255 */
+        spt = 255;
+        nheads = 16;
+        tracks = total_sectors / spt;
+    } else {
+        /* Start with 17 sectors per track and as few heads as possible */
+        spt = 17;
+        tracks = total_sectors / spt;
+        nheads = (tracks + 1023) / 1024;
+
+        if (nheads < 4)
+            nheads = 4;
+
+        /* Step up to 31 and then 63 sectors per track while too large */
+        if (tracks >= (nheads * 1024) || nheads > 16) {
+            spt = 31;
+            nheads = 16;
+            tracks = total_sectors / spt;
+        }
+
+        if (tracks >= (nheads * 1024)) {
+            spt = 63;
+            nheads = 16;
+            tracks = total_sectors / spt;
+        }
+    }
+
+    // Note: Rounding up deviates from the Virtual PC behaviour
+    // However, we need this to avoid truncating images in qemu-img convert
+    *cyls = (tracks + nheads - 1) / nheads;
+    *heads = nheads;
+    *secs_per_cyl = spt;
+
+    return 0;
+}
+
+/*
+ * Creates an empty dynamic VHD image of (roughly) total_sectors sectors.
+ *
+ * File layout written here:
+ *   offset 0     copy of the footer (HEADER_SIZE bytes)
+ *   offset 512   dynamic disk header (1024 bytes)
+ *   offset 1536  Block Allocation Table, every entry 0xFFFFFFFF, padded to
+ *                a multiple of 512 bytes
+ *   after BAT    footer
+ *
+ * Backing files are not supported; flags is unused.
+ *
+ * Returns 0 on success, -ENOTSUP if a backing file is requested, -EFBIG if
+ * calculate_geometry() rejects the size and -EIO on any I/O error. The file
+ * descriptor is closed on every path.
+ */
+static int vpc_create(const char *filename, int64_t total_sectors,
+    const char *backing_file, int flags)
+{
+    uint8_t buf[1024];
+    struct vhd_footer* footer = (struct vhd_footer*) buf;
+    struct vhd_dyndisk_header* dyndisk_header =
+        (struct vhd_dyndisk_header*) buf;
+    int fd;
+    int ret = -EIO;
+    uint16_t cyls;
+    uint8_t heads;
+    uint8_t secs_per_cyl;
+    size_t block_size, num_bat_entries, bat_sectors, i;
+
+    if (backing_file != NULL)
+        return -ENOTSUP;
+
+    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
+    if (fd < 0)
+        return -EIO;
+
+    // Calculate matching total_size and geometry
+    if (calculate_geometry(total_sectors, &cyls, &heads, &secs_per_cyl)) {
+        ret = -EFBIG;
+        goto out;
+    }
+    total_sectors = (int64_t) cyls * heads * secs_per_cyl;
+
+    // Prepare the Hard Disk Footer
+    memset(buf, 0, 1024);
+
+    // These are fixed-width fields without a terminating NUL
+    memcpy(footer->creator, "conectix", 8);
+    // TODO Check if "qemu" creator_app is ok for VPC
+    memcpy(footer->creator_app, "qemu", 4);
+    memcpy(footer->creator_os, "Wi2k", 4);
+
+    // All multi-byte fields are stored big endian (the be*_to_cpu() byte
+    // swap is the same in both directions)
+    footer->features = be32_to_cpu(0x02);
+    footer->version = be32_to_cpu(0x00010000);
+    footer->data_offset = be64_to_cpu(HEADER_SIZE);
+    footer->timestamp = be32_to_cpu(time(NULL) - VHD_TIMESTAMP_BASE);
+
+    // Version of Virtual PC 2007
+    footer->major = be16_to_cpu(0x0005);
+    footer->minor = be16_to_cpu(0x0003);
+
+    footer->orig_size = be64_to_cpu(total_sectors * 512);
+    footer->size = be64_to_cpu(total_sectors * 512);
+
+    footer->cyls = be16_to_cpu(cyls);
+    footer->heads = heads;
+    footer->secs_per_cyl = secs_per_cyl;
+
+    footer->type = be32_to_cpu(VHD_DYNAMIC);
+
+    // TODO uuid is missing
+
+    footer->checksum = be32_to_cpu(vpc_checksum(buf, HEADER_SIZE));
+
+    // Write the footer (twice: at the beginning and at the end)
+    block_size = 0x200000;
+    num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);
+    bat_sectors = (num_bat_entries * 4 + 511) / 512;
+
+    if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE)
+        goto out;
+
+    if (lseek(fd, 1536 + bat_sectors * 512, SEEK_SET) < 0)
+        goto out;
+    if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE)
+        goto out;
+
+    // Write the initial BAT
+    if (lseek(fd, 3 * 512, SEEK_SET) < 0)
+        goto out;
+
+    memset(buf, 0xFF, 512);
+    for (i = 0; i < bat_sectors; i++)
+        if (write(fd, buf, 512) != 512)
+            goto out;
+
+    // Prepare the Dynamic Disk Header
+    memset(buf, 0, 1024);
+
+    memcpy(dyndisk_header->magic, "cxsparse", 8);
+
+    dyndisk_header->data_offset = be64_to_cpu(0xFFFFFFFF);
+    dyndisk_header->table_offset = be64_to_cpu(3 * 512);
+    dyndisk_header->version = be32_to_cpu(0x00010000);
+    dyndisk_header->block_size = be32_to_cpu(block_size);
+    dyndisk_header->max_table_entries = be32_to_cpu(num_bat_entries);
+
+    dyndisk_header->checksum = be32_to_cpu(vpc_checksum(buf, 1024));
+
+    // Write the header
+    if (lseek(fd, 512, SEEK_SET) < 0)
+        goto out;
+    if (write(fd, buf, 1024) != 1024)
+        goto out;
+
+    ret = 0;
+
+out:
+    // A failing close() can be the only report of a deferred write error
+    if (close(fd) != 0 && ret == 0)
+        ret = -EIO;
+    return ret;
+}
+
+/*
+ * Releases the per-image state: frees the in-memory page table (and, when
+ * built with CACHE, the page entry buffer) and deletes the BlockDriverState
+ * s->hd that the driver performs its reads and writes on.
+ */
+static void vpc_close(BlockDriverState *bs)
+{
+ BDRVVPCState *s = bs->opaque;
+ qemu_free(s->pagetable);
+#ifdef CACHE
+ qemu_free(s->pageentry_u8);
+#endif
+ bdrv_delete(s->hd);
+}
+
+/*
+ * Driver description for the "vpc" format: ties the probe/open/read/write/
+ * close/create callbacks of this file together. instance_size is the size
+ * of the per-image BDRVVPCState.
+ */
+static BlockDriver bdrv_vpc = {
+ .format_name = "vpc",
+ .instance_size = sizeof(BDRVVPCState),
+ .bdrv_probe = vpc_probe,
+ .bdrv_open = vpc_open,
+ .bdrv_read = vpc_read,
+ .bdrv_write = vpc_write,
+ .bdrv_close = vpc_close,
+ .bdrv_create = vpc_create,
+};
+
+/* Registers the vpc driver; hooked up through block_init() below. */
+static void bdrv_vpc_init(void)
+{
+ bdrv_register(&bdrv_vpc);
+}
+
+block_init(bdrv_vpc_init);
diff --git a/block/vvfat.c b/block/vvfat.c
new file mode 100644
index 0000000000..2a8feb38d8
--- /dev/null
+++ b/block/vvfat.c
@@ -0,0 +1,2855 @@
+/* vim:set shiftwidth=4 ts=8: */
+/*
+ * QEMU Block driver for virtual VFAT (shadows a local directory)
+ *
+ * Copyright (c) 2004,2005 Johannes E. Schindelin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <sys/stat.h>
+#include <dirent.h>
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+#ifndef S_IWGRP
+#define S_IWGRP 0
+#endif
+#ifndef S_IWOTH
+#define S_IWOTH 0
+#endif
+
+/* TODO: add ":bootsector=blabla.img:" */
+/* LATER TODO: add automatic boot sector generation from
+ BOOTEASY.ASM and Ranish Partition Manager
+ Note that DOS assumes the system files to be the first files in the
+ file system (test if the boot sector still relies on that fact)! */
+/* MAYBE TODO: write block-visofs.c */
+/* TODO: call try_commit() only after a timeout */
+
+/* #define DEBUG */
+
+#ifdef DEBUG
+
+#define DLOG(a) a
+
+#undef stderr
+#define stderr STDERR
+FILE* stderr = NULL;
+
+static void checkpoint(void);
+
+#ifdef __MINGW32__
+void nonono(const char* file, int line, const char* msg) {
+ fprintf(stderr, "Nonono! %s:%d %s\n", file, line, msg);
+ exit(-5);
+}
+#undef assert
+#define assert(a) do {if (!(a)) nonono(__FILE__, __LINE__, #a);}while(0)
+#endif
+
+#else
+
+#define DLOG(a)
+
+#endif
+
+/* dynamic array functions */
+/*
+ * Growable array of fixed-size items.
+ * pointer: start of the storage (NULL while nothing is allocated)
+ * size: number of bytes currently allocated
+ * next: number of items in use; array_get() only accepts index < next
+ * item_size: size of one item in bytes
+ */
+typedef struct array_t {
+ char* pointer;
+ unsigned int size,next,item_size;
+} array_t;
+
+/* Puts an array into the empty state for items of item_size bytes. */
+static inline void array_init(array_t* array,unsigned int item_size)
+{
+    array->item_size = item_size;
+    array->next = 0;
+    array->size = 0;
+    array->pointer = NULL;
+}
+
+/*
+ * Releases the storage of an array and returns it to the empty state.
+ * The pointer is reset as well, so that the array can be grown again (or
+ * freed a second time) without touching the released memory. item_size is
+ * kept.
+ */
+static inline void array_free(array_t* array)
+{
+    free(array->pointer);   /* free(NULL) is a no-op */
+    array->pointer = NULL;
+    array->size=array->next=0;
+}
+
+/*
+ * Returns the address of item `index`. The array is never grown here;
+ * index has to be below array->next (checked with assert).
+ */
+static inline void* array_get(array_t* array,unsigned int index) {
+    assert(index < array->next);
+    return &array->pointer[index * array->item_size];
+}
+
+/*
+ * Makes sure the storage is large enough to hold item `index`.
+ *
+ * If the buffer has to grow, it is reallocated with room for index + 32
+ * items and array->next is set to index + 1. The added bytes are not
+ * initialized here. If the buffer is already large enough, nothing is
+ * changed -- in particular array->next keeps its value, so `index` is not
+ * guaranteed to be accepted by array_get() afterwards.
+ *
+ * Returns 0 on success and -1 if qemu_realloc() returned NULL (at that
+ * point array->pointer has already been overwritten).
+ */
+static inline int array_ensure_allocated(array_t* array, int index)
+{
+ if((index + 1) * array->item_size > array->size) {
+ int new_size = (index + 32) * array->item_size;
+ array->pointer = qemu_realloc(array->pointer, new_size);
+ if (!array->pointer)
+ return -1;
+ array->size = new_size;
+ array->next = index + 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Appends one item to the array, growing the storage if necessary, and
+ * returns its address. The contents of the new item are not initialized.
+ * Returns NULL if the storage could not be grown.
+ */
+static inline void* array_get_next(array_t* array) {
+    unsigned int slot = array->next;
+
+    if (array_ensure_allocated(array, slot) < 0)
+        return NULL;
+
+    array->next = slot + 1;
+    return array_get(array, slot);
+}
+
+/*
+ * Opens a gap of `count` items in front of item `index`: the items from
+ * index up to array->next are moved up by count positions and array->next
+ * grows by count. The storage is enlarged by exactly count items when the
+ * result would not fit.
+ *
+ * Returns the address of the first (uninitialized) item of the gap, or NULL
+ * if the reallocation failed.
+ */
+static inline void* array_insert(array_t* array,unsigned int index,unsigned int count) {
+    char* gap;
+
+    if ((array->next + count) * array->item_size > array->size) {
+        int extra_bytes = count * array->item_size;
+
+        array->pointer = qemu_realloc(array->pointer, array->size + extra_bytes);
+        if (!array->pointer)
+            return NULL;
+        array->size += extra_bytes;
+    }
+
+    gap = array->pointer + index * array->item_size;
+    memmove(array->pointer + (index + count) * array->item_size,
+            gap,
+            (array->next - index) * array->item_size);
+    array->next += count;
+
+    return gap;
+}
+
+/* this performs a "roll", so that the element which was at index_from becomes
+ * index_to, but the order of all other elements is preserved.
+ *
+ * `count` consecutive elements starting at index_from are moved as one
+ * slice; afterwards they occupy index_to .. index_to+count-1.
+ *
+ * Only index_to and index_from themselves are checked against array->next.
+ * NOTE(review): for count > 1 the slice ends are not checked, so the caller
+ * has to make sure that index_from+count and index_to+count stay inside the
+ * allocated storage.
+ *
+ * Returns 0 on success (also when nothing has to be moved) and -1 for a
+ * NULL array or an index outside 0 .. array->next-1. */
+static inline int array_roll(array_t* array,int index_to,int index_from,int count)
+{
+ char* buf;
+ char* from;
+ char* to;
+ int is;
+
+ if(!array ||
+ index_to<0 || index_to>=array->next ||
+ index_from<0 || index_from>=array->next)
+ return -1;
+
+ if(index_to==index_from)
+ return 0;
+
+ is=array->item_size;
+ from=array->pointer+index_from*is;
+ to=array->pointer+index_to*is;
+ /* keep a copy of the slice while the elements in between are shifted */
+ buf=qemu_malloc(is*count);
+ memcpy(buf,from,is*count);
+
+ /* moving down: shift the elements [to, from) up behind the slice;
+ * moving up: shift the elements following the slice down onto it */
+ if(index_to<index_from)
+ memmove(to+is*count,to,from-to);
+ else
+ memmove(from,from+is*count,to-from);
+
+ memcpy(to,buf,is*count);
+
+ free(buf);
+
+ return 0;
+}
+
+/*
+ * Removes the `count` items starting at `index`; the items behind them
+ * move down to close the gap and array->next shrinks by count. The storage
+ * itself is not shrunk.
+ *
+ * This is done with a single memmove() of the tail. Rolling the slice to
+ * position array->next-1 instead would read and write count-1 items beyond
+ * array->next whenever count > 1.
+ *
+ * Always returns 0; invalid arguments are caught by the assertions.
+ */
+static inline int array_remove_slice(array_t* array,int index, int count)
+{
+    assert(index >=0);
+    assert(count > 0);
+    assert(index + count <= array->next);
+
+    memmove(array->pointer + index * array->item_size,
+            array->pointer + (index + count) * array->item_size,
+            (array->next - index - count) * array->item_size);
+    array->next -= count;
+    return 0;
+}
+
+/* Removes the single item at `index`; see array_remove_slice(). */
+static int array_remove(array_t* array,int index)
+{
+ return array_remove_slice(array, index, 1);
+}
+
+/*
+ * Translates a pointer to an item of the array back into its index. The
+ * pointer has to address the start of an item below array->next (both are
+ * checked with assert).
+ */
+static int array_index(array_t* array, void* pointer)
+{
+    size_t byte_offset = (char*)pointer - array->pointer;
+    size_t position = byte_offset / array->item_size;
+
+    assert((byte_offset % array->item_size) == 0);
+    assert(position < array->next);
+    return position;
+}
+
+/* These structures are used to fake a disk and the VFAT filesystem.
+ * For this reason we need to use __attribute__((packed)). */
+
+/*
+ * Layout of the boot sector of the faked FAT partition. The struct is
+ * packed and its members add up to 512 bytes (36 bytes of common fields,
+ * the 18 byte union, fat_type, 0x1c0 ignored bytes and the 2 magic bytes).
+ * The union holds the fields that differ between the FAT16 and the FAT32
+ * flavour of the sector.
+ */
+typedef struct bootsector_t {
+ uint8_t jump[3];
+ uint8_t name[8];
+ uint16_t sector_size;
+ uint8_t sectors_per_cluster;
+ uint16_t reserved_sectors;
+ uint8_t number_of_fats;
+ uint16_t root_entries;
+ uint16_t total_sectors16;
+ uint8_t media_type;
+ uint16_t sectors_per_fat;
+ uint16_t sectors_per_track;
+ uint16_t number_of_heads;
+ uint32_t hidden_sectors;
+ uint32_t total_sectors;
+ union {
+ struct {
+ uint8_t drive_number;
+ uint8_t current_head;
+ uint8_t signature;
+ uint32_t id;
+ uint8_t volume_label[11];
+ } __attribute__((packed)) fat16;
+ struct {
+ uint32_t sectors_per_fat;
+ uint16_t flags;
+ uint8_t major,minor;
+ uint32_t first_cluster_of_root_directory;
+ uint16_t info_sector;
+ uint16_t backup_boot_sector;
+ uint16_t ignored;
+ } __attribute__((packed)) fat32;
+ } u;
+ uint8_t fat_type[8];
+ uint8_t ignored[0x1c0];
+ uint8_t magic[2];
+} __attribute__((packed)) bootsector_t;
+
+/*
+ * Cylinder/head/sector address as stored in a partition table entry.
+ * As filled in by sector2CHS(): `sector` carries the 1-based sector number
+ * in its low bits and bits 8-9 of the cylinder in its top two bits,
+ * `cylinder` carries the low 8 bits of the cylinder.
+ */
+typedef struct {
+ uint8_t head;
+ uint8_t sector;
+ uint8_t cylinder;
+} mbr_chs_t;
+
+/*
+ * One 16 byte entry of the partition table inside mbr_t. The two
+ * *_sector_long members are written with cpu_to_le32() by init_mbr().
+ */
+typedef struct partition_t {
+ uint8_t attributes; /* 0x80 = bootable */
+ mbr_chs_t start_CHS;
+ uint8_t fs_type; /* 0x1 = FAT12, 0x6 = FAT16, 0xe = FAT16_LBA, 0xb = FAT32, 0xc = FAT32_LBA */
+ mbr_chs_t end_CHS;
+ uint32_t start_sector_long;
+ uint32_t length_sector_long;
+} __attribute__((packed)) partition_t;
+
+/*
+ * Layout of the 512 byte master boot record: 0x1b8 bytes that are not
+ * interpreted here, the NT disk signature, four partition entries and the
+ * two magic bytes (init_mbr() stores 0x55, 0xaa).
+ */
+typedef struct mbr_t {
+ uint8_t ignored[0x1b8];
+ uint32_t nt_id;
+ uint8_t ignored2[2];
+ partition_t partition[4];
+ uint8_t magic[2];
+} __attribute__((packed)) mbr_t;
+
+/*
+ * One 32 byte FAT directory entry (short name form). Long name entries
+ * reuse the same 32 bytes: create_long_filename() writes their payload by
+ * indexing past name[], and reserved[1] of a long name entry receives the
+ * checksum of the short name.
+ * begin/begin_hi are the low/high 16 bits of the first cluster (see
+ * begin_of_direntry()); multi-byte members are kept little endian.
+ */
+typedef struct direntry_t {
+ uint8_t name[8];
+ uint8_t extension[3];
+ uint8_t attributes;
+ uint8_t reserved[2];
+ uint16_t ctime;
+ uint16_t cdate;
+ uint16_t adate;
+ uint16_t begin_hi;
+ uint16_t mtime;
+ uint16_t mdate;
+ uint16_t begin;
+ uint32_t size;
+} __attribute__((packed)) direntry_t;
+
+/* this structure is used to transparently access the files */
+
+/*
+ * Connects a range of clusters of the faked file system with the host file
+ * or directory (`path`) that provides its contents, and with the directory
+ * entry (`dir_index`) that describes it.
+ */
+typedef struct mapping_t {
+ /* begin is the first cluster, end is the last+1 */
+ uint32_t begin,end;
+ /* as s->directory is growable, no pointer may be used here */
+ unsigned int dir_index;
+ /* the clusters of a file may be in any order; this points to the first */
+ int first_mapping_index;
+ union {
+ /* offset is
+ * - the offset in the file (in clusters) for a file, or
+ * - the next cluster of the directory for a directory, and
+ * - the address of the buffer for a faked entry
+ */
+ struct {
+ uint32_t offset;
+ } file;
+ /* for directories: index of the parent's mapping (-1 for the root)
+ * and index of the directory's first entry in s->directory */
+ struct {
+ int parent_mapping_index;
+ int first_dir_index;
+ } dir;
+ } info;
+ /* path contains the full path, i.e. it always starts with s->path */
+ char* path;
+
+ /* the values are distinct bits; code tests e.g. mode & MODE_DIRECTORY */
+ enum { MODE_UNDEFINED = 0, MODE_NORMAL = 1, MODE_MODIFIED = 2,
+ MODE_DIRECTORY = 4, MODE_FAKED = 8,
+ MODE_DELETED = 16, MODE_RENAMED = 32 } mode;
+ /* non-zero if the host file has no write permission bit set */
+ int read_only;
+} mapping_t;
+
+#ifdef DEBUG
+static void print_direntry(const struct direntry_t*);
+static void print_mapping(const struct mapping_t* mapping);
+#endif
+
+/* here begins the real VVFAT driver */
+
+/*
+ * Per-image state of the vvfat driver: the faked leading sectors, the
+ * in-memory FAT, directory and mapping arrays, the geometry values derived
+ * from them, the state of the cluster currently being accessed, and the
+ * members used for write support.
+ */
+typedef struct BDRVVVFATState {
+ BlockDriverState* bs; /* pointer to parent */
+ unsigned int first_sectors_number; /* 1 for a single partition, 0x40 for a disk with partition table */
+ unsigned char first_sectors[0x40*0x200];
+
+ int fat_type; /* 16 or 32 */
+ /* fat: FAT entries; directory: direntry_t items; mapping: mapping_t items */
+ array_t fat,directory,mapping;
+
+ unsigned int cluster_size; /* in bytes: sectors_per_cluster * 0x200 */
+ unsigned int sectors_per_cluster;
+ unsigned int sectors_per_fat;
+ unsigned int sectors_of_root_directory;
+ uint32_t last_cluster_of_root_directory;
+ unsigned int faked_sectors; /* how many sectors are faked before file data */
+ uint32_t sector_count; /* total number of sectors of the partition */
+ uint32_t cluster_count; /* total number of clusters of this partition */
+ uint32_t max_fat_value; /* 0xfff, 0xffff or 0x0fffffff, set by init_fat() */
+
+ int current_fd;
+ mapping_t* current_mapping;
+ unsigned char* cluster; /* points to current cluster */
+ unsigned char* cluster_buffer; /* points to a buffer to hold temp data */
+ unsigned int current_cluster;
+
+ /* write support */
+ BlockDriverState* write_target;
+ char* qcow_filename;
+ BlockDriverState* qcow;
+ void* fat2;
+ char* used_clusters;
+ array_t commits;
+ const char* path; /* path of the root mapping (the shadowed directory) */
+ int downcase_short_names;
+} BDRVVVFATState;
+
+/* Converts the linear sector position spos into a Cylinder/Head/Sector
+ * address using the geometry of bs.
+ * If the position does not fit into that geometry, all three CHS bytes are
+ * set to 0xFF and 1 is returned to signal the overflow; otherwise the
+ * address is stored in *chs and 0 is returned.
+ */
+static int sector2CHS(BlockDriverState* bs, mbr_chs_t * chs, int spos){
+    const int sector = spos % bs->secs;
+    const int track = spos / bs->secs;
+    const int head = track % bs->heads;
+    const int cylinder = track / bs->heads;
+
+    if (cylinder >= bs->cyls) {
+        /* Overflow,
+           it happens if 32bit sector positions are used, while CHS is only 24bit.
+           Windows/Dos is said to take 1023/255/63 as nonrepresentable CHS */
+        chs->head = 0xFF;
+        chs->sector = 0xFF;
+        chs->cylinder = 0xFF;
+        return 1;
+    }
+
+    /* sector numbers are 1-based; cylinder bits 8-9 go into the top two
+       bits of the sector byte */
+    chs->head = (uint8_t)head;
+    chs->sector = (uint8_t)( (sector+1) | ((cylinder>>8)<<6) );
+    chs->cylinder = (uint8_t)cylinder;
+    return 0;
+}
+
+/*
+ * Builds the master boot record in the first 512 bytes of s->first_sectors:
+ * a disk signature, one bootable partition entry and the 0x55 0xaa magic.
+ * The partition type depends on s->fat_type and on whether the partition
+ * can be addressed with CHS values.
+ */
+static void init_mbr(BDRVVVFATState* s)
+{
+    /* TODO: if the files mbr.img and bootsect.img exist, use them */
+    mbr_t* mbr = (mbr_t*)s->first_sectors;
+    partition_t* part = &(mbr->partition[0]);
+    int needs_lba;
+
+    memset(s->first_sectors,0,512);
+
+    /* Win NT Disk Signature */
+    mbr->nt_id = cpu_to_le32(0xbe1afdfa);
+
+    part->attributes = 0x80; /* bootable */
+
+    /* LBA is used when partition is outside the CHS geometry */
+    needs_lba = sector2CHS(s->bs, &part->start_CHS, s->first_sectors_number-1);
+    needs_lba |= sector2CHS(s->bs, &part->end_CHS, s->sector_count);
+
+    /*LBA partitions are identified only by start/length_sector_long not by CHS*/
+    part->start_sector_long = cpu_to_le32(s->first_sectors_number-1);
+    part->length_sector_long = cpu_to_le32(s->sector_count - s->first_sectors_number+1);
+
+    /* FAT12/FAT16/FAT32 */
+    /* DOS uses different types when partition is LBA,
+       probably to prevent older versions from using CHS on them */
+    if (s->fat_type == 12) {
+        part->fs_type = 0x1;
+    } else if (s->fat_type == 16) {
+        part->fs_type = needs_lba ? 0xe : 0x06;
+    } else {
+        /* fat_type == 32 */
+        part->fs_type = needs_lba ? 0xc : 0x0b;
+    }
+
+    mbr->magic[0] = 0x55;
+    mbr->magic[1] = 0xaa;
+}
+
+/* direntry functions */
+
+/* Expands src into the 2-bytes-per-character form used by long name
+ * entries: every source byte is followed by a 0 byte, a 2 byte zero
+ * terminator is appended, and the rest up to the next multiple of 26 bytes
+ * is filled with 0xff.
+ * At most 129 characters of src are used. The terminator and the padding
+ * can reach byte index 259 (for names of 117 or more characters), so dest
+ * has to provide 260 bytes, not 258.
+ * Returns the length of the converted name in bytes, without terminator
+ * and padding. */
+static inline int short2long_name(char* dest,const char* src)
+{
+ int i;
+ int len;
+ for(i=0;i<129 && src[i];i++) {
+ dest[2*i]=src[i];
+ dest[2*i+1]=0;
+ }
+ len=2*i;
+ dest[2*i]=dest[2*i+1]=0;
+ /* pad with 0xff up to the end of the current 26 byte chunk */
+ for(i=2*i+2;(i%26);i++)
+ dest[i]=0xff;
+ return len;
+}
+
+/*
+ * Appends the long name entries for `filename` to s->directory and returns
+ * the first of them. Every entry carries 26 bytes of the converted name;
+ * the entries are stored in reverse order (the last part of the name comes
+ * first), and the first stored entry has bit 0x40 set in its sequence byte.
+ *
+ * The returned pointer is only valid until s->directory grows again.
+ */
+static inline direntry_t* create_long_filename(BDRVVVFATState* s,const char* filename)
+{
+    /* short2long_name() converts up to 129 characters and then writes the
+     * terminator/padding, which can touch byte 259: reserve ten complete
+     * 26 byte chunks so that this cannot run past the buffer */
+    char buffer[10 * 26];
+    int length=short2long_name(buffer,filename),
+        number_of_entries=(length+25)/26,i;
+    direntry_t* entry;
+
+    for(i=0;i<number_of_entries;i++) {
+        entry=array_get_next(&(s->directory));
+        entry->attributes=0xf;
+        entry->reserved[0]=0;
+        entry->begin=0;
+        entry->name[0]=(number_of_entries-i)|(i==0?0x40:0);
+    }
+    for(i=0;i<26*number_of_entries;i++) {
+        /* the 26 name bytes of an entry live at the entry offsets
+         * 1..10, 14..25 and 28..31 */
+        int offset=(i%26);
+        if(offset<10) offset=1+offset;
+        else if(offset<22) offset=14+offset-10;
+        else offset=28+offset-22;
+        entry=array_get(&(s->directory),s->directory.next-1-(i/26));
+        entry->name[offset]=buffer[i];
+    }
+    return array_get(&(s->directory),s->directory.next-number_of_entries);
+}
+
+/* An entry is free if its first name byte is 0x00 or 0xe5. */
+static char is_free(const direntry_t* direntry)
+{
+    const uint8_t first = direntry->name[0];
+
+    return first==0xe5 || first==0x00;
+}
+
+/* True for an entry whose attributes are exactly 0x28 (archive | volume
+ * label), which is what init_directories() stores for the label. */
+static char is_volume_label(const direntry_t* direntry)
+{
+ return direntry->attributes == 0x28;
+}
+
+/* True for a long name entry, i.e. one whose attributes are exactly 0xf
+ * (the value create_long_filename() stores). */
+static char is_long_name(const direntry_t* direntry)
+{
+ return direntry->attributes == 0xf;
+}
+
+/* True for every entry that is neither a volume label, nor a long name
+ * entry, nor free. */
+static char is_short_name(const direntry_t* direntry)
+{
+    if (is_volume_label(direntry) || is_long_name(direntry))
+        return 0;
+
+    return !is_free(direntry);
+}
+
+/* True if the directory attribute bit (0x10) is set and the entry is not
+ * marked with 0xe5 in its first name byte. */
+static char is_directory(const direntry_t* direntry)
+{
+    if (!(direntry->attributes & 0x10))
+        return 0;
+
+    return direntry->name[0] != 0xe5;
+}
+
+/* True for a short name entry whose name starts with '.'. */
+static inline char is_dot(const direntry_t* direntry)
+{
+    if (!is_short_name(direntry))
+        return 0;
+
+    return direntry->name[0] == '.';
+}
+
+/* True for a short name entry that is not a directory. */
+static char is_file(const direntry_t* direntry)
+{
+    if (!is_short_name(direntry))
+        return 0;
+
+    return !is_directory(direntry);
+}
+
+/*
+ * Returns the first cluster of an entry, combined from its low (begin) and
+ * high (begin_hi) 16 bit halves. The halves are widened to uint32_t before
+ * shifting, so that a high half >= 0x8000 is not shifted into the sign bit
+ * of a promoted int.
+ */
+static inline uint32_t begin_of_direntry(const direntry_t* direntry)
+{
+    uint32_t low = le16_to_cpu(direntry->begin);
+    uint32_t high = le16_to_cpu(direntry->begin_hi);
+
+    return low | (high << 16);
+}
+
+/* Returns the size member of an entry, converted from little endian. */
+static inline uint32_t filesize_of_direntry(const direntry_t* direntry)
+{
+ return le32_to_cpu(direntry->size);
+}
+
+/* Stores the first cluster `begin` of an entry, split into its low and
+ * high 16 bit halves (both little endian). */
+static void set_begin_of_direntry(direntry_t* direntry, uint32_t begin)
+{
+    const uint32_t low_half = begin & 0xffff;
+    const uint32_t high_half = (begin >> 16) & 0xffff;
+
+    direntry->begin = cpu_to_le16(low_half);
+    direntry->begin_hi = cpu_to_le16(high_half);
+}
+
+/* fat functions */
+
+/*
+ * Computes the checksum of the 11 byte short name (8 name bytes followed by
+ * 3 extension bytes) that is stored in the long name entries belonging to
+ * it: for every byte the sum is rotated right by one bit and the byte is
+ * added.
+ */
+static inline uint8_t fat_chksum(const direntry_t* entry)
+{
+    uint8_t chksum=0;
+    int i;
+
+    for(i=0;i<11;i++) {
+        unsigned char c;
+
+        /* name[] has only 8 elements: byte 8 is extension[0] */
+        c = (i < 8) ? entry->name[i] : entry->extension[i-8];
+        chksum=(((chksum&0xfe)>>1)|((chksum&0x01)?0x80:0)) + c;
+    }
+
+    return chksum;
+}
+
+/* Packs a time_t (interpreted as local time) into the 16 bit layout used by
+ * directory entries and returns it in little endian byte order:
+ * return_time != 0 yields the time (seconds/2, minutes, hours),
+ * return_time == 0 yields the date (day, month, years since 1980). */
+static uint16_t fat_datetime(time_t time,int return_time) {
+    struct tm* t;
+    int packed;
+#ifdef _WIN32
+    t=localtime(&time); /* this is not thread safe */
+#else
+    struct tm t1;
+    t=&t1;
+    localtime_r(&time,t);
+#endif
+    if (return_time) {
+        packed = (t->tm_sec/2)|(t->tm_min<<5)|(t->tm_hour<<11);
+    } else {
+        packed = (t->tm_mday)|((t->tm_mon+1)<<5)|((t->tm_year-80)<<9);
+    }
+    return cpu_to_le16(packed);
+}
+
+/*
+ * Stores `value` as the FAT entry of `cluster`. FAT32 and FAT16 entries are
+ * little endian 32/16 bit items of s->fat; for any other fat_type the FAT12
+ * packing is used, where two 12 bit entries share three bytes.
+ */
+static inline void fat_set(BDRVVVFATState* s,unsigned int cluster,uint32_t value)
+{
+    switch (s->fat_type) {
+    case 32: {
+        uint32_t* slot32 = array_get(&(s->fat),cluster);
+        *slot32 = cpu_to_le32(value);
+        break;
+    }
+    case 16: {
+        uint16_t* slot16 = array_get(&(s->fat),cluster);
+        *slot16 = cpu_to_le16(value&0xffff);
+        break;
+    }
+    default: {
+        int byte_pos = (cluster*3/2);
+        unsigned char* p = array_get(&(s->fat), byte_pos);
+
+        if (cluster & 1) {
+            /* odd cluster: high nibble of p[0] plus all of p[1] */
+            p[0] = (p[0]&0xf) | ((value&0xf)<<4);
+            p[1] = (value>>4);
+        } else {
+            /* even cluster: all of p[0] plus low nibble of p[1] */
+            p[0] = value&0xff;
+            p[1] = (p[1]&0xf0) | ((value>>8)&0xf);
+        }
+        break;
+    }
+    }
+}
+
+/*
+ * Returns the FAT entry of `cluster`, using the same three layouts as
+ * fat_set(). The FAT12 path reads s->fat.pointer directly and therefore is
+ * not covered by the index check in array_get().
+ */
+static inline uint32_t fat_get(BDRVVVFATState* s,unsigned int cluster)
+{
+    switch (s->fat_type) {
+    case 32: {
+        uint32_t* slot32 = array_get(&(s->fat),cluster);
+        return le32_to_cpu(*slot32);
+    }
+    case 16: {
+        uint16_t* slot16 = array_get(&(s->fat),cluster);
+        return le16_to_cpu(*slot16);
+    }
+    default: {
+        const uint8_t* x=(uint8_t*)(s->fat.pointer)+cluster*3/2;
+        /* odd clusters use the upper 12 bits of the byte pair */
+        int shift = (cluster & 1) ? 4 : 0;
+        return ((x[0]|(x[1]<<8))>>shift)&0x0fff;
+    }
+    }
+}
+
+/* Returns -1 if fat_entry lies above s->max_fat_value - 8 (treated as an
+ * end-of-chain marker), 0 otherwise. */
+static inline int fat_eof(BDRVVVFATState* s,uint32_t fat_entry)
+{
+    return (fat_entry > s->max_fat_value - 8) ? -1 : 0;
+}
+
+/*
+ * Sets up s->fat for the configured fat_type: picks the item size (1 byte
+ * for FAT12, 2 for FAT16, 4 for FAT32), allocates and zeroes the table
+ * based on s->sectors_per_fat, and stores the largest entry value in
+ * s->max_fat_value (0 for an unknown fat_type).
+ */
+static inline void init_fat(BDRVVVFATState* s)
+{
+    unsigned int entry_bytes;
+    uint32_t max_value;
+
+    switch (s->fat_type) {
+    case 12:
+        entry_bytes = 1;
+        max_value = 0xfff;
+        break;
+    case 16:
+        entry_bytes = 2;
+        max_value = 0xffff;
+        break;
+    case 32:
+        entry_bytes = 4;
+        max_value = 0x0fffffff;
+        break;
+    default:
+        entry_bytes = 2;
+        max_value = 0; /* error... */
+        break;
+    }
+
+    array_init(&(s->fat), entry_bytes);
+    if (s->fat_type == 12) {
+        array_ensure_allocated(&(s->fat),
+                s->sectors_per_fat * 0x200 * 3 / 2 - 1);
+    } else {
+        array_ensure_allocated(&(s->fat),
+                s->sectors_per_fat * 0x200 / s->fat.item_size - 1);
+    }
+    memset(s->fat.pointer,0,s->fat.size);
+
+    s->max_fat_value = max_value;
+}
+
+/* TODO: in create_short_filename, 0xe5->0x05 is not yet handled! */
+/* TODO: in parse_short_filename, 0x05->0xe5 is not yet handled! */
+/*
+ * Appends the directory entries for `filename` to s->directory and returns
+ * the short (8.3) entry.
+ *
+ * With is_dot != 0 only a space padded short entry holding filename is
+ * created. Otherwise the long name entries are created first, followed by a
+ * short entry whose name is derived from filename, cleaned up, made unique
+ * among the entries from directory_start on, and whose checksum is stored
+ * in the long name entries.
+ *
+ * Only the name fields are filled in here. The returned pointer is only
+ * valid until s->directory grows again.
+ */
+static inline direntry_t* create_short_and_long_name(BDRVVVFATState* s,
+ unsigned int directory_start, const char* filename, int is_dot)
+{
+ int i,j,long_index=s->directory.next;
+ direntry_t* entry = NULL;
+ direntry_t* entry_long = NULL;
+
+ if(is_dot) {
+ entry=array_get_next(&(s->directory));
+ memset(entry->name,0x20,11);
+ memcpy(entry->name,filename,strlen(filename));
+ return entry;
+ }
+
+ entry_long=create_long_filename(s,filename);
+
+ /* j: position of the last '.' (0 if there is none or it comes first);
+ * i: number of characters taken for the name part, at most 8 */
+ i = strlen(filename);
+ for(j = i - 1; j>0 && filename[j]!='.';j--);
+ if (j > 0)
+ i = (j > 8 ? 8 : j);
+ else if (i > 8)
+ i = 8;
+
+ entry=array_get_next(&(s->directory));
+ memset(entry->name,0x20,11);
+ memcpy(entry->name, filename, i);
+
+ /* up to three characters following the last '.' form the extension */
+ if(j > 0)
+ for (i = 0; i < 3 && filename[j+1+i]; i++)
+ entry->extension[i] = filename[j+1+i];
+
+ /* upcase & remove unwanted characters */
+ /* (name[] is indexed up to 10 here, i.e. through the extension bytes;
+ * trailing spaces of the extension and of the name are skipped) */
+ for(i=10;i>=0;i--) {
+ if(i==10 || i==7) for(;i>0 && entry->name[i]==' ';i--);
+ if(entry->name[i]<=' ' || entry->name[i]>0x7f
+ || strchr(".*?<>|\":/\\[];,+='",entry->name[i]))
+ entry->name[i]='_';
+ else if(entry->name[i]>='a' && entry->name[i]<='z')
+ entry->name[i]+='A'-'a';
+ }
+
+ /* mangle duplicates */
+ while(1) {
+ direntry_t* entry1=array_get(&(s->directory),directory_start);
+ int j;
+
+ for(;entry1<entry;entry1++)
+ if(!is_long_name(entry1) && !memcmp(entry1->name,entry->name,11))
+ break; /* found dupe */
+ if(entry1==entry) /* no dupe found */
+ break;
+
+ /* use all 8 characters of name */
+ if(entry->name[7]==' ') {
+ int j;
+ for(j=6;j>0 && entry->name[j]==' ';j--)
+ entry->name[j]='~';
+ }
+
+ /* increment number */
+ for(j=7;j>0 && entry->name[j]=='9';j--)
+ entry->name[j]='0';
+ if(j>0) {
+ if(entry->name[j]<'0' || entry->name[j]>'9')
+ entry->name[j]='0';
+ else
+ entry->name[j]++;
+ }
+ }
+
+ /* calculate checksum; propagate to long name */
+ if(entry_long) {
+ uint8_t chksum=fat_chksum(entry);
+
+ /* calculate anew, because realloc could have taken place */
+ entry_long=array_get(&(s->directory),long_index);
+ while(entry_long<entry && is_long_name(entry_long)) {
+ entry_long->reserved[1]=chksum;
+ entry_long++;
+ }
+ }
+
+ return entry;
+}
+
+/*
+ * Read a directory. (the index of the corresponding mapping must be passed).
+ *
+ * For every host directory entry that can be stat()ed, the directory
+ * entries (long and short name) are appended to s->directory; for
+ * sub-directories and non-empty files a mapping is appended to s->mapping
+ * as well, which takes ownership of the allocated path string. The
+ * directory is then padded with zeroed entries up to a cluster boundary
+ * (the root directory up to ROOT_ENTRIES), and the end cluster of the
+ * mapping plus the first cluster of its directory entry are updated.
+ *
+ * Returns 0 on success, -1 if the directory cannot be opened and -2 if it
+ * contains a file larger than 2GB. The DIR handle and every path string
+ * that did not end up in a mapping are released on all paths.
+ */
+static int read_directory(BDRVVVFATState* s, int mapping_index)
+{
+    mapping_t* mapping = array_get(&(s->mapping), mapping_index);
+    direntry_t* direntry;
+    const char* dirname = mapping->path;
+    int first_cluster = mapping->begin;
+    int parent_index = mapping->info.dir.parent_mapping_index;
+    mapping_t* parent_mapping = (mapping_t*)
+        (parent_index >= 0 ? array_get(&(s->mapping), parent_index) : NULL);
+    int first_cluster_of_parent = parent_mapping ? parent_mapping->begin : -1;
+
+    DIR* dir=opendir(dirname);
+    struct dirent* entry;
+    int i;
+
+    assert(mapping->mode & MODE_DIRECTORY);
+
+    if(!dir) {
+        mapping->end = mapping->begin;
+        return -1;
+    }
+
+    i = mapping->info.dir.first_dir_index =
+        first_cluster == 0 ? 0 : s->directory.next;
+
+    /* actually read the directory, and allocate the mappings */
+    while((entry=readdir(dir))) {
+        unsigned int length=strlen(dirname)+2+strlen(entry->d_name);
+        char* buffer;
+        struct stat st;
+        int is_dot=!strcmp(entry->d_name,".");
+        int is_dotdot=!strcmp(entry->d_name,"..");
+
+        /* the root directory has no "." and ".." entries */
+        if(first_cluster == 0 && (is_dotdot || is_dot))
+            continue;
+
+        buffer=(char*)qemu_malloc(length);
+        snprintf(buffer,length,"%s/%s",dirname,entry->d_name);
+
+        if(stat(buffer,&st)<0) {
+            free(buffer);
+            continue;
+        }
+
+        /* create directory entry for this file */
+        direntry=create_short_and_long_name(s, i, entry->d_name,
+                is_dot || is_dotdot);
+        direntry->attributes=(S_ISDIR(st.st_mode)?0x10:0x20);
+        direntry->reserved[0]=direntry->reserved[1]=0;
+        direntry->ctime=fat_datetime(st.st_ctime,1);
+        direntry->cdate=fat_datetime(st.st_ctime,0);
+        direntry->adate=fat_datetime(st.st_atime,0);
+        direntry->begin_hi=0;
+        direntry->mtime=fat_datetime(st.st_mtime,1);
+        direntry->mdate=fat_datetime(st.st_mtime,0);
+        if(is_dotdot)
+            set_begin_of_direntry(direntry, first_cluster_of_parent);
+        else if(is_dot)
+            set_begin_of_direntry(direntry, first_cluster);
+        else
+            direntry->begin=0; /* do that later */
+        if (st.st_size > 0x7fffffff) {
+            fprintf(stderr, "File %s is larger than 2GB\n", buffer);
+            free(buffer);
+            closedir(dir);
+            return -2;
+        }
+        direntry->size=cpu_to_le32(S_ISDIR(st.st_mode)?0:st.st_size);
+
+        /* create mapping for this file */
+        if(!is_dot && !is_dotdot && (S_ISDIR(st.st_mode) || st.st_size)) {
+            s->current_mapping=(mapping_t*)array_get_next(&(s->mapping));
+            s->current_mapping->begin=0;
+            s->current_mapping->end=st.st_size;
+            /*
+             * we get the direntry of the most recent direntry, which
+             * contains the short name and all the relevant information.
+             */
+            s->current_mapping->dir_index=s->directory.next-1;
+            s->current_mapping->first_mapping_index = -1;
+            if (S_ISDIR(st.st_mode)) {
+                s->current_mapping->mode = MODE_DIRECTORY;
+                s->current_mapping->info.dir.parent_mapping_index =
+                    mapping_index;
+            } else {
+                s->current_mapping->mode = MODE_UNDEFINED;
+                s->current_mapping->info.file.offset = 0;
+            }
+            /* the mapping owns the path string from here on */
+            s->current_mapping->path=buffer;
+            s->current_mapping->read_only =
+                (st.st_mode & (S_IWUSR | S_IWGRP | S_IWOTH)) == 0;
+        } else {
+            /* ".", ".." and empty files get no mapping: nobody else
+             * references the path string */
+            free(buffer);
+        }
+    }
+    closedir(dir);
+
+    /* fill with zeroes up to the end of the cluster */
+    while(s->directory.next%(0x10*s->sectors_per_cluster)) {
+        direntry=array_get_next(&(s->directory));
+        memset(direntry,0,sizeof(direntry_t));
+    }
+
+/* TODO: if there are more entries, bootsector has to be adjusted! */
+#define ROOT_ENTRIES (0x02 * 0x10 * s->sectors_per_cluster)
+    if (mapping_index == 0 && s->directory.next < ROOT_ENTRIES) {
+        /* root directory */
+        int cur = s->directory.next;
+        array_ensure_allocated(&(s->directory), ROOT_ENTRIES - 1);
+        memset(array_get(&(s->directory), cur), 0,
+                (ROOT_ENTRIES - cur) * sizeof(direntry_t));
+    }
+
+    /* reget the mapping, since s->mapping was possibly realloc()ed */
+    mapping = (mapping_t*)array_get(&(s->mapping), mapping_index);
+    first_cluster += (s->directory.next - mapping->info.dir.first_dir_index)
+        * 0x20 / s->cluster_size;
+    mapping->end = first_cluster;
+
+    direntry = (direntry_t*)array_get(&(s->directory), mapping->dir_index);
+    set_begin_of_direntry(direntry, mapping->begin);
+
+    return 0;
+}
+
+/* Converts a sector number into the number of the cluster containing it;
+ * cluster 0 starts right behind the s->faked_sectors leading sectors. */
+static inline uint32_t sector2cluster(BDRVVVFATState* s,off_t sector_num)
+{
+ return (sector_num-s->faked_sectors)/s->sectors_per_cluster;
+}
+
+/* Returns the number of the first sector of a cluster (the inverse of
+ * sector2cluster() for cluster starts). */
+static inline off_t cluster2sector(BDRVVVFATState* s, uint32_t cluster_num)
+{
+ return s->faked_sectors + s->sectors_per_cluster * cluster_num;
+}
+
+/* Returns the position of a sector inside its cluster, counted in sectors.
+ * The data area is taken to start after the first sectors and two FATs. */
+static inline uint32_t sector_offset_in_cluster(BDRVVVFATState* s,off_t sector_num)
+{
+ return (sector_num-s->first_sectors_number-2*s->sectors_per_fat)%s->sectors_per_cluster;
+}
+
+#ifdef DBG
+/* Only built with DBG: returns the directory entry a mapping refers to
+ * (s->directory item number mapping->dir_index), or 0 for a mapping whose
+ * mode is still MODE_UNDEFINED. No bounds check is performed. */
+static direntry_t* get_direntry_for_mapping(BDRVVVFATState* s,mapping_t* mapping)
+{
+ if(mapping->mode==MODE_UNDEFINED)
+ return 0;
+ return (direntry_t*)(s->directory.pointer+sizeof(direntry_t)*mapping->dir_index);
+}
+#endif
+
+ /*
+  * Build the in-memory FAT image for the host directory `dirname`:
+  * compute the FAT geometry, create the volume label and the root
+  * mapping, read every directory via read_directory(), assign clusters
+  * to each mapping, chain them in the FAT and finally fill in the boot
+  * sector.
+  *
+  * Returns 0 on success, -1 if a directory could not be read and
+  * -EINVAL if the contents need more clusters than s->cluster_count.
+  */
+ static int init_directories(BDRVVVFATState* s,
+ const char* dirname)
+ {
+ bootsector_t* bootsector;
+ mapping_t* mapping;
+ unsigned int i;
+ unsigned int cluster;
+
+ memset(&(s->first_sectors[0]),0,0x40*0x200);
+
+ /* cluster size in bytes (0x200 bytes per sector) */
+ s->cluster_size=s->sectors_per_cluster*0x200;
+ s->cluster_buffer=qemu_malloc(s->cluster_size);
+
+ /*
+ * The formula: sc = spf+1+spf*spc*(512*8/fat_type),
+ * where sc is sector_count,
+ * spf is sectors_per_fat,
+ * spc is sectors_per_clusters, and
+ * fat_type = 12, 16 or 32.
+ */
+ i = 1+s->sectors_per_cluster*0x200*8/s->fat_type;
+ s->sectors_per_fat=(s->sector_count+i)/i; /* round up */
+
+ array_init(&(s->mapping),sizeof(mapping_t));
+ array_init(&(s->directory),sizeof(direntry_t));
+
+ /* add volume label */
+ {
+ direntry_t* entry=array_get_next(&(s->directory));
+ entry->attributes=0x28; /* archive | volume label */
+ snprintf((char*)entry->name,11,"QEMU VVFAT");
+ }
+
+ /* Now build FAT, and write back information into directory */
+ init_fat(s);
+
+ /* everything before the data area: leading sectors plus two FAT copies */
+ s->faked_sectors=s->first_sectors_number+s->sectors_per_fat*2;
+ s->cluster_count=sector2cluster(s, s->sector_count);
+
+ /* mapping 0 is the root directory; it owns the (trimmed) path string */
+ mapping = array_get_next(&(s->mapping));
+ mapping->begin = 0;
+ mapping->dir_index = 0;
+ mapping->info.dir.parent_mapping_index = -1;
+ mapping->first_mapping_index = -1;
+ mapping->path = strdup(dirname);
+ i = strlen(mapping->path);
+ if (i > 0 && mapping->path[i - 1] == '/')
+ mapping->path[i - 1] = '\0';
+ mapping->mode = MODE_DIRECTORY;
+ mapping->read_only = 0;
+ s->path = mapping->path;
+
+ /* s->mapping.next is re-evaluated each iteration, so mappings appended
+ * while this loop runs are visited too */
+ for (i = 0, cluster = 0; i < s->mapping.next; i++) {
+ /* MS-DOS expects the FAT to be 0 for the root directory
+ * (except for the media byte). */
+ /* LATER TODO: still true for FAT32? */
+ int fix_fat = (i != 0);
+ mapping = array_get(&(s->mapping), i);
+
+ if (mapping->mode & MODE_DIRECTORY) {
+ mapping->begin = cluster;
+ if(read_directory(s, i)) {
+ fprintf(stderr, "Could not read directory %s\n",
+ mapping->path);
+ return -1;
+ }
+ /* re-fetch the mapping after read_directory() */
+ mapping = array_get(&(s->mapping), i);
+ } else {
+ assert(mapping->mode == MODE_UNDEFINED);
+ mapping->mode=MODE_NORMAL;
+ mapping->begin = cluster;
+ /* NOTE(review): mapping->end appears to hold a byte size here and
+ * is converted to an end cluster below - confirm in read_directory() */
+ if (mapping->end > 0) {
+ direntry_t* direntry = array_get(&(s->directory),
+ mapping->dir_index);
+
+ mapping->end = cluster + 1 + (mapping->end-1)/s->cluster_size;
+ set_begin_of_direntry(direntry, mapping->begin);
+ } else {
+ mapping->end = cluster + 1;
+ fix_fat = 0;
+ }
+ }
+
+ assert(mapping->begin < mapping->end);
+
+ /* next free cluster */
+ cluster = mapping->end;
+
+ if(cluster > s->cluster_count) {
+ fprintf(stderr,"Directory does not fit in FAT%d (capacity %s)\n",
+ s->fat_type,
+ s->fat_type == 12 ? s->sector_count == 2880 ? "1.44 MB"
+ : "2.88 MB"
+ : "504MB");
+ return -EINVAL;
+ }
+
+ /* fix fat for entry */
+ if (fix_fat) {
+ int j;
+ /* chain the clusters sequentially, then terminate the chain */
+ for(j = mapping->begin; j < mapping->end - 1; j++)
+ fat_set(s, j, j+1);
+ fat_set(s, mapping->end - 1, s->max_fat_value);
+ }
+ }
+
+ mapping = array_get(&(s->mapping), 0);
+ s->sectors_of_root_directory = mapping->end * s->sectors_per_cluster;
+ s->last_cluster_of_root_directory = mapping->end;
+
+ /* the FAT signature */
+ fat_set(s,0,s->max_fat_value);
+ fat_set(s,1,s->max_fat_value);
+
+ s->current_mapping = NULL;
+
+ /* the boot sector is the last of the s->first_sectors_number leading sectors */
+ bootsector=(bootsector_t*)(s->first_sectors+(s->first_sectors_number-1)*0x200);
+ bootsector->jump[0]=0xeb;
+ bootsector->jump[1]=0x3e;
+ bootsector->jump[2]=0x90;
+ memcpy(bootsector->name,"QEMU ",8);
+ bootsector->sector_size=cpu_to_le16(0x200);
+ bootsector->sectors_per_cluster=s->sectors_per_cluster;
+ bootsector->reserved_sectors=cpu_to_le16(1);
+ bootsector->number_of_fats=0x2; /* number of FATs */
+ bootsector->root_entries=cpu_to_le16(s->sectors_of_root_directory*0x10);
+ /* the 16-bit total is used only when it fits; otherwise the 32-bit field below */
+ bootsector->total_sectors16=s->sector_count>0xffff?0:cpu_to_le16(s->sector_count);
+ bootsector->media_type=(s->fat_type!=12?0xf8:s->sector_count==5760?0xf9:0xf8); /* media descriptor */
+ s->fat.pointer[0] = bootsector->media_type;
+ bootsector->sectors_per_fat=cpu_to_le16(s->sectors_per_fat);
+ bootsector->sectors_per_track=cpu_to_le16(s->bs->secs);
+ bootsector->number_of_heads=cpu_to_le16(s->bs->heads);
+ bootsector->hidden_sectors=cpu_to_le32(s->first_sectors_number==1?0:0x3f);
+ bootsector->total_sectors=cpu_to_le32(s->sector_count>0xffff?s->sector_count:0);
+
+ /* LATER TODO: if FAT32, this is wrong */
+ bootsector->u.fat16.drive_number=s->fat_type==12?0:0x80; /* assume this is hda (TODO) */
+ bootsector->u.fat16.current_head=0;
+ bootsector->u.fat16.signature=0x29;
+ bootsector->u.fat16.id=cpu_to_le32(0xfabe1afd);
+
+ memcpy(bootsector->u.fat16.volume_label,"QEMU VVFAT ",11);
+ memcpy(bootsector->fat_type,(s->fat_type==12?"FAT12 ":s->fat_type==16?"FAT16 ":"FAT32 "),8);
+ bootsector->magic[0]=0x55; bootsector->magic[1]=0xaa;
+
+ return 0;
+ }
+
+ #ifdef DEBUG
+ /* debug builds only: set to the driver state by vvfat_open() */
+ static BDRVVVFATState *vvv = NULL;
+ #endif
+
+ /* forward declarations; both are referenced by vvfat_open() below */
+ static int enable_write_target(BDRVVVFATState *s);
+ static int is_consistent(BDRVVVFATState *s);
+
+ /*
+  * Open a virtual FAT disk backed by a host directory.
+  *
+  * `dirname` must start with "fat:".  Optional ":floppy:", ":32:", ":16:",
+  * ":12:" and ":rw:" markers inside it select geometry, FAT width and
+  * write support; the host directory is whatever follows the last ':'
+  * (with a special case for DOS drive letters).
+  *
+  * Returns 0 on success and -1 on any failure.
+  */
+ static int vvfat_open(BlockDriverState *bs, const char* dirname, int flags)
+ {
+ BDRVVVFATState *s = bs->opaque;
+ int floppy = 0;
+ int i;
+
+ #ifdef DEBUG
+ vvv = s;
+ #endif
+
+ DLOG(if (stderr == NULL) {
+ stderr = fopen("vvfat.log", "a");
+ setbuf(stderr, NULL);
+ })
+
+ s->bs = bs;
+
+ /* defaults: FAT16 hard disk */
+ s->fat_type=16;
+ /* LATER TODO: if FAT32, adjust */
+ s->sectors_per_cluster=0x10;
+ /* 504MB disk*/
+ bs->cyls=1024; bs->heads=16; bs->secs=63;
+
+ s->current_cluster=0xffffffff;
+
+ s->first_sectors_number=0x40;
+ /* read only is the default for safety */
+ bs->read_only = 1;
+ s->qcow = s->write_target = NULL;
+ s->qcow_filename = NULL;
+ s->fat2 = NULL;
+ s->downcase_short_names = 1;
+
+ if (!strstart(dirname, "fat:", NULL))
+ return -1;
+
+ /* floppy: FAT12, a single leading sector, 80/2/36 geometry */
+ if (strstr(dirname, ":floppy:")) {
+ floppy = 1;
+ s->fat_type = 12;
+ s->first_sectors_number = 1;
+ s->sectors_per_cluster=2;
+ bs->cyls = 80; bs->heads = 2; bs->secs = 36;
+ }
+
+ s->sector_count=bs->cyls*bs->heads*bs->secs;
+
+ /* an explicit FAT width overrides the default chosen above */
+ if (strstr(dirname, ":32:")) {
+ fprintf(stderr, "Big fat greek warning: FAT32 has not been tested. You are welcome to do so!\n");
+ s->fat_type = 32;
+ } else if (strstr(dirname, ":16:")) {
+ s->fat_type = 16;
+ } else if (strstr(dirname, ":12:")) {
+ s->fat_type = 12;
+ s->sector_count=2880;
+ }
+
+ if (strstr(dirname, ":rw:")) {
+ if (enable_write_target(s))
+ return -1;
+ bs->read_only = 0;
+ }
+
+ /* strip the option prefix: the path starts after the last ':'.  The
+ * "fat:" check above guarantees that strrchr() finds a colon. */
+ i = strrchr(dirname, ':') - dirname;
+ assert(i >= 3);
+ if (dirname[i-2] == ':' && qemu_isalpha(dirname[i-1]))
+ /* workaround for DOS drive names */
+ dirname += i-1;
+ else
+ dirname += i+1;
+
+ bs->total_sectors=bs->cyls*bs->heads*bs->secs;
+
+ if(init_directories(s, dirname))
+ return -1;
+
+ /* recompute the sector count from whole clusters */
+ s->sector_count = s->faked_sectors + s->sectors_per_cluster*s->cluster_count;
+
+ /* only the hard-disk layout (0x40 leading sectors) gets an MBR */
+ if(s->first_sectors_number==0x40)
+ init_mbr(s);
+
+ /* for some reason or other, MS-DOS does not like to know about CHS... */
+ if (floppy)
+ bs->heads = bs->cyls = bs->secs = 0;
+
+ // assert(is_consistent(s));
+ return 0;
+ }
+
+ /*
+  * Forget the currently cached mapping (closing its file descriptor if
+  * one is open) and invalidate the cached cluster.
+  */
+ static inline void vvfat_close_current_file(BDRVVVFATState *s)
+ {
+     if (s->current_mapping != NULL) {
+         if (s->current_fd != 0) {
+             close(s->current_fd);
+             s->current_fd = 0;
+         }
+         s->current_mapping = NULL;
+     }
+     /* always drop the cluster cache, even when no mapping was active */
+     s->current_cluster = -1;
+ }
+
+ /* mappings between index1 and index2-1 are supposed to be ordered
+ * return value is the index of the last mapping for which end>cluster_num
+ *
+ * Implemented as a bisection over [index1, index2): each round probes the
+ * midpoint index3 and moves index2 down when that mapping begins at or
+ * after cluster_num, or index1 up otherwise.  The result may equal the
+ * initial index2, so callers must bounds-check it before array_get().
+ */
+ static inline int find_mapping_for_cluster_aux(BDRVVVFATState* s,int cluster_num,int index1,int index2)
+ {
+ /* this initial value is overwritten before first use */
+ int index3=index1+1;
+ while(1) {
+ mapping_t* mapping;
+ index3=(index1+index2)/2;
+ mapping=array_get(&(s->mapping),index3);
+ assert(mapping->begin < mapping->end);
+ if(mapping->begin>=cluster_num) {
+ /* midpoint starts at/after the target: shrink the upper bound */
+ assert(index2!=index3 || index2==0);
+ if(index2==index3)
+ return index1;
+ index2=index3;
+ } else {
+ /* midpoint starts before the target; once the range cannot shrink
+ * any further, decide by whether it still covers cluster_num */
+ if(index1==index3)
+ return mapping->end<=cluster_num ? index2 : index1;
+ index1=index3;
+ }
+ assert(index1<=index2);
+ DLOG(mapping=array_get(&(s->mapping),index1);
+ assert(mapping->begin<=cluster_num);
+ assert(index2 >= s->mapping.next ||
+ ((mapping = array_get(&(s->mapping),index2)) &&
+ mapping->end>cluster_num)));
+ }
+ }
+
+ /*
+  * Return the mapping whose [begin, end) range contains cluster_num, or
+  * NULL when no mapping covers that cluster.
+  */
+ static inline mapping_t* find_mapping_for_cluster(BDRVVVFATState* s,int cluster_num)
+ {
+     mapping_t* candidate = NULL;
+     int pos = find_mapping_for_cluster_aux(s, cluster_num, 0, s->mapping.next);
+
+     if (pos < s->mapping.next) {
+         candidate = array_get(&(s->mapping), pos);
+         if (candidate->begin > cluster_num)
+             candidate = NULL;
+         else
+             assert(candidate->begin<=cluster_num && candidate->end>cluster_num);
+     }
+     return candidate;
+ }
+
+/*
+ * This function simply compares path == mapping->path. Since the mappings
+ * are sorted by cluster, this is expensive: O(n).
+ */
+static inline mapping_t* find_mapping_for_path(BDRVVVFATState* s,
+ const char* path)
+{
+ int i;
+
+ for (i = 0; i < s->mapping.next; i++) {
+ mapping_t* mapping = array_get(&(s->mapping), i);
+ if (mapping->first_mapping_index < 0 &&
+ !strcmp(path, mapping->path))
+ return mapping;
+ }
+
+ return NULL;
+}
+
+ /*
+  * Make `mapping` the current mapping, opening its host file read-only
+  * unless a mapping with the same path is already current.
+  * Returns 0 on success, -1 if mapping is NULL or open() fails (in which
+  * case the previously opened file is left untouched).
+  */
+ static int open_file(BDRVVVFATState* s,mapping_t* mapping)
+ {
+     int fd;
+
+     if (mapping == NULL)
+         return -1;
+
+     /* same path as the current mapping: nothing to do */
+     if (s->current_mapping != NULL &&
+         strcmp(s->current_mapping->path, mapping->path) == 0)
+         return 0;
+
+     /* open file */
+     fd = open(mapping->path, O_RDONLY | O_BINARY | O_LARGEFILE);
+     if (fd < 0)
+         return -1;
+
+     vvfat_close_current_file(s);
+     s->current_fd = fd;
+     s->current_mapping = mapping;
+     return 0;
+ }
+
+ /*
+  * Make s->cluster point at the contents of cluster_num.
+  *
+  * Directory clusters are served straight out of s->directory; file
+  * clusters are read from the host file into s->cluster_buffer.  Nothing
+  * is done if cluster_num is already the cached cluster.
+  *
+  * Returns 0 on success, -1 if read() fails, -2 if the backing file
+  * cannot be opened (or no mapping covers the cluster), -3 if lseek()
+  * fails.
+  */
+ static inline int read_cluster(BDRVVVFATState *s,int cluster_num)
+ {
+ if(s->current_cluster != cluster_num) {
+ int result=0;
+ off_t offset;
+ assert(!s->current_mapping || s->current_fd || (s->current_mapping->mode & MODE_DIRECTORY));
+ /* does the current mapping still cover the requested cluster? */
+ if(!s->current_mapping
+ || s->current_mapping->begin>cluster_num
+ || s->current_mapping->end<=cluster_num) {
+ /* binary search of mappings for file */
+ mapping_t* mapping=find_mapping_for_cluster(s,cluster_num);
+
+ assert(!mapping || (cluster_num>=mapping->begin && cluster_num<mapping->end));
+
+ if (mapping && mapping->mode & MODE_DIRECTORY) {
+ vvfat_close_current_file(s);
+ s->current_mapping = mapping;
+ /* also entered via goto from the else-branch below, when the
+ * current mapping is a directory */
+ read_cluster_directory:
+ offset = s->cluster_size*(cluster_num-s->current_mapping->begin);
+ /* 0x20 is the byte size of one directory entry */
+ s->cluster = (unsigned char*)s->directory.pointer+offset
+ + 0x20*s->current_mapping->info.dir.first_dir_index;
+ assert(((s->cluster-(unsigned char*)s->directory.pointer)%s->cluster_size)==0);
+ assert((char*)s->cluster+s->cluster_size <= s->directory.pointer+s->directory.next*s->directory.item_size);
+ s->current_cluster = cluster_num;
+ return 0;
+ }
+
+ /* open_file() also fails when mapping is NULL */
+ if(open_file(s,mapping))
+ return -2;
+ } else if (s->current_mapping->mode & MODE_DIRECTORY)
+ goto read_cluster_directory;
+
+ assert(s->current_fd);
+
+ offset=s->cluster_size*(cluster_num-s->current_mapping->begin)+s->current_mapping->info.file.offset;
+ if(lseek(s->current_fd, offset, SEEK_SET)!=offset)
+ return -3;
+ s->cluster=s->cluster_buffer;
+ /* NOTE(review): a short read (result < cluster_size) is accepted, leaving
+ * the tail of the buffer with whatever it held before */
+ result=read(s->current_fd,s->cluster,s->cluster_size);
+ if(result<0) {
+ s->current_cluster = -1;
+ return -1;
+ }
+ s->current_cluster = cluster_num;
+ }
+ return 0;
+ }
+
+#ifdef DEBUG
+ /*
+  * Debug helper: dump `len` bytes starting at `address` to stderr, 16
+  * bytes per line, as hex followed by a printable-character column
+  * (bytes outside ' '..0x7f are shown as '.').
+  */
+ static void hexdump(const void* address, uint32_t len)
+ {
+     const unsigned char* p = address;
+     int row, col;
+
+     for (row = 0; row < len; row += 16) {
+         /* hex column, padded so the text column lines up on a short row */
+         for (col = 0; col < 16; col++) {
+             if (row + col < len)
+                 fprintf(stderr, "%02x ", p[row + col]);
+             else
+                 fprintf(stderr, " ");
+         }
+         fprintf(stderr, " ");
+         /* text column */
+         for (col = 0; col < 16 && row + col < len; col++) {
+             if (p[row + col] < ' ' || p[row + col] > 0x7f)
+                 fprintf(stderr, "%c", '.');
+             else
+                 fprintf(stderr, "%c", p[row + col]);
+         }
+         fprintf(stderr, "\n");
+     }
+ }
+
+static void print_direntry(const direntry_t* direntry)
+{
+ int j = 0;
+ char buffer[1024];
+
+ fprintf(stderr, "direntry 0x%x: ", (int)direntry);
+ if(!direntry)
+ return;
+ if(is_long_name(direntry)) {
+ unsigned char* c=(unsigned char*)direntry;
+ int i;
+ for(i=1;i<11 && c[i] && c[i]!=0xff;i+=2)
+#define ADD_CHAR(c) {buffer[j] = (c); if (buffer[j] < ' ') buffer[j] = 0xb0; j++;}
+ ADD_CHAR(c[i]);
+ for(i=14;i<26 && c[i] && c[i]!=0xff;i+=2)
+ ADD_CHAR(c[i]);
+ for(i=28;i<32 && c[i] && c[i]!=0xff;i+=2)
+ ADD_CHAR(c[i]);
+ buffer[j] = 0;
+ fprintf(stderr, "%s\n", buffer);
+ } else {
+ int i;
+ for(i=0;i<11;i++)
+ ADD_CHAR(direntry->name[i]);
+ buffer[j] = 0;
+ fprintf(stderr,"%s attributes=0x%02x begin=%d size=%d\n",
+ buffer,
+ direntry->attributes,
+ begin_of_direntry(direntry),le32_to_cpu(direntry->size));
+ }
+}
+
+ /*
+  * Debug helper: print one mapping to stderr, followed by the
+  * directory- or file-specific part of its info union.
+  *
+  * The mapping's address is printed with %p; casting the pointer to int
+  * would truncate it on platforms where pointers are wider than int.
+  */
+ static void print_mapping(const mapping_t* mapping)
+ {
+     fprintf(stderr, "mapping (%p): begin, end = %d, %d, dir_index = %d, first_mapping_index = %d, name = %s, mode = 0x%x, " , (const void*)mapping, mapping->begin, mapping->end, mapping->dir_index, mapping->first_mapping_index, mapping->path, mapping->mode);
+     if (mapping->mode & MODE_DIRECTORY)
+         fprintf(stderr, "parent_mapping_index = %d, first_dir_index = %d\n", mapping->info.dir.parent_mapping_index, mapping->info.dir.first_dir_index);
+     else
+         fprintf(stderr, "offset = %d\n", mapping->info.file.offset);
+ }
+#endif
+
+ /*
+  * Read nb_sectors 512-byte sectors starting at sector_num into buf.
+  *
+  * For each sector, in order of precedence:
+  *  - if a qcow overlay exists and has the sector allocated, read it
+  *    (and the following allocated run) from the overlay;
+  *  - sectors below s->faked_sectors come from s->first_sectors or from
+  *    the in-memory FAT (the same table serves both FAT copies);
+  *  - everything else is a data cluster fetched with read_cluster().
+  *
+  * Returns 0 on success, -1 when reading past s->sector_count or when
+  * the overlay read fails.
+  */
+ static int vvfat_read(BlockDriverState *bs, int64_t sector_num,
+ uint8_t *buf, int nb_sectors)
+ {
+ BDRVVVFATState *s = bs->opaque;
+ int i;
+
+ for(i=0;i<nb_sectors;i++,sector_num++) {
+ if (sector_num >= s->sector_count)
+ return -1;
+ if (s->qcow) {
+ int n;
+ if (s->qcow->drv->bdrv_is_allocated(s->qcow,
+ sector_num, nb_sectors-i, &n)) {
+ DLOG(fprintf(stderr, "sectors %d+%d allocated\n", (int)sector_num, n));
+ if (s->qcow->drv->bdrv_read(s->qcow, sector_num, buf+i*0x200, n))
+ return -1;
+ /* skip the n sectors just read; the loop header adds the final +1 */
+ i += n - 1;
+ sector_num += n - 1;
+ continue;
+ }
+ DLOG(fprintf(stderr, "sector %d not allocated\n", (int)sector_num));
+ }
+ if(sector_num<s->faked_sectors) {
+ if(sector_num<s->first_sectors_number)
+ memcpy(buf+i*0x200,&(s->first_sectors[sector_num*0x200]),0x200);
+ else if(sector_num-s->first_sectors_number<s->sectors_per_fat)
+ memcpy(buf+i*0x200,&(s->fat.pointer[(sector_num-s->first_sectors_number)*0x200]),0x200);
+ else if(sector_num-s->first_sectors_number-s->sectors_per_fat<s->sectors_per_fat)
+ memcpy(buf+i*0x200,&(s->fat.pointer[(sector_num-s->first_sectors_number-s->sectors_per_fat)*0x200]),0x200);
+ } else {
+ uint32_t sector=sector_num-s->faked_sectors,
+ sector_offset_in_cluster=(sector%s->sectors_per_cluster),
+ cluster_num=sector/s->sectors_per_cluster;
+ /* clusters that cannot be read are returned as zeroes */
+ if(read_cluster(s, cluster_num) != 0) {
+ /* LATER TODO: strict: return -1; */
+ memset(buf+i*0x200,0,0x200);
+ continue;
+ }
+ memcpy(buf+i*0x200,s->cluster+sector_offset_in_cluster*0x200,0x200);
+ }
+ }
+ return 0;
+ }
+
+/* LATER TODO: statify all functions */
+
+/*
+ * Idea of the write support (use snapshot):
+ *
+ * 1. check if all data is consistent, recording renames, modifications,
+ * new files and directories (in s->commits).
+ *
+ * 2. if the data is not consistent, stop committing
+ *
+ * 3. handle renames, and create new files and directories (do not yet
+ * write their contents)
+ *
+ * 4. walk the directories, fixing the mapping and direntries, and marking
+ * the handled mappings as not deleted
+ *
+ * 5. commit the contents of the files
+ *
+ * 6. handle deleted files and directories
+ *
+ */
+
+ /*
+  * One pending action recorded while checking the modified image.
+  * `action` selects which member of `param` is valid.
+  */
+ typedef struct commit_t {
+ /* heap-allocated path, freed by clear_commits(); NULL for ACTION_WRITEOUT */
+ char* path;
+ union {
+ struct { uint32_t cluster; } rename;
+ struct { int dir_index; uint32_t modified_offset; } writeout;
+ struct { uint32_t first_cluster; } new_file;
+ struct { uint32_t cluster; } mkdir;
+ } param;
+ /* DELETEs and RMDIRs are handled differently: see handle_deletes() */
+ enum {
+ ACTION_RENAME, ACTION_WRITEOUT, ACTION_NEW_FILE, ACTION_MKDIR
+ } action;
+ } commit_t;
+
+ /*
+  * Discard all scheduled commits, freeing the path owned by every entry
+  * that has one (all actions except ACTION_WRITEOUT).
+  */
+ static void clear_commits(BDRVVVFATState* s)
+ {
+     int idx;
+ DLOG(fprintf(stderr, "clear_commits (%d commits)\n", s->commits.next));
+     for (idx = 0; idx < s->commits.next; idx++) {
+         commit_t* entry = array_get(&(s->commits), idx);
+
+         assert(entry->path || entry->action == ACTION_WRITEOUT);
+         if (entry->action == ACTION_WRITEOUT) {
+             /* writeouts never carry a path */
+             assert(entry->path == NULL);
+             continue;
+         }
+         assert(entry->path);
+         free(entry->path);
+     }
+     s->commits.next = 0;
+ }
+
+ /*
+  * Queue a rename of the entry starting at `cluster` to `new_path`.
+  * The commit stores the new_path pointer itself; clear_commits() later
+  * frees it.
+  */
+ static void schedule_rename(BDRVVVFATState* s,
+         uint32_t cluster, char* new_path)
+ {
+     commit_t* entry = array_get_next(&(s->commits));
+
+     entry->action = ACTION_RENAME;
+     entry->param.rename.cluster = cluster;
+     entry->path = new_path;
+ }
+
+ /*
+  * Queue a write-out of modified file contents for the directory entry
+  * at dir_index, recording modified_offset.  Write-outs carry no path.
+  */
+ static void schedule_writeout(BDRVVVFATState* s,
+         int dir_index, uint32_t modified_offset)
+ {
+     commit_t* entry = array_get_next(&(s->commits));
+
+     entry->action = ACTION_WRITEOUT;
+     entry->param.writeout.dir_index = dir_index;
+     entry->param.writeout.modified_offset = modified_offset;
+     entry->path = NULL;
+ }
+
+ /*
+  * Queue creation of a new file `path` whose data starts at
+  * first_cluster.  The commit stores the path pointer itself;
+  * clear_commits() later frees it.
+  */
+ static void schedule_new_file(BDRVVVFATState* s,
+         char* path, uint32_t first_cluster)
+ {
+     commit_t* entry = array_get_next(&(s->commits));
+
+     entry->action = ACTION_NEW_FILE;
+     entry->param.new_file.first_cluster = first_cluster;
+     entry->path = path;
+ }
+
+ /*
+  * Queue creation of a new directory `path` starting at `cluster`.
+  * The commit stores the path pointer itself; clear_commits() later
+  * frees it.
+  */
+ static void schedule_mkdir(BDRVVVFATState* s, uint32_t cluster, char* path)
+ {
+     commit_t* entry = array_get_next(&(s->commits));
+
+     entry->action = ACTION_MKDIR;
+     entry->param.mkdir.cluster = cluster;
+     entry->path = path;
+ }
+
+ /* Accumulator used while reassembling a name from directory entries. */
+ typedef struct {
+ /*
+ * Since the sequence number is at most 0x3f, and the filename
+ * length is at most 13 times the sequence number, the maximal
+ * filename length is 0x3f * 13 bytes.
+ */
+ unsigned char name[0x3f * 13 + 1];
+ /* checksum: 0x100 (set by lfn_init) cannot match a one-byte checksum;
+ * len: length of name in bytes */
+ int checksum, len;
+ /* sequence number of the most recently parsed long-name entry */
+ int sequence_number;
+ } long_file_name;
+
+ /*
+  * Reset a long_file_name accumulator: no name collected yet, and a
+  * checksum value (0x100) that no one-byte checksum can equal.
+  */
+ static void lfn_init(long_file_name* lfn)
+ {
+     lfn->len = 0;
+     lfn->sequence_number = 0;
+     lfn->checksum = 0x100;
+ }
+
+ /*
+  * Fold one long-file-name directory entry into `lfn`.
+  *
+  * Entries arrive highest sequence number first; the entry flagged with
+  * 0x40 starts a new name, each following entry must carry the next
+  * lower sequence number and the same checksum.
+  *
+  * Returns 0 if parsed successfully, > 0 if the entry is not a long name,
+  * and < 0 on error:
+  *   -1 unexpected sequence number, -2 checksum mismatch,
+  *   -3 reserved bytes not zero, -4 character not representable,
+  *   -5 sequence number 0 (would place the fragment before name[]).
+  */
+ static int parse_long_name(long_file_name* lfn,
+         const direntry_t* direntry)
+ {
+     int i, j, offset;
+     const unsigned char* pointer = (const unsigned char*)direntry;
+
+     if (!is_long_name(direntry))
+         return 1;
+
+     if (pointer[0] & 0x40) {
+         /* first entry of a name: (re)initialize the accumulator */
+         lfn->sequence_number = pointer[0] & 0x3f;
+         lfn->checksum = pointer[13];
+         lfn->name[0] = 0;
+         lfn->name[lfn->sequence_number * 13] = 0;
+     } else if ((pointer[0] & 0x3f) != --lfn->sequence_number)
+         return -1;
+     else if (pointer[13] != lfn->checksum)
+         return -2;
+     else if (pointer[12] || pointer[26] || pointer[27])
+         return -3;
+
+     /*
+      * The sequence number comes from guest-written data.  A value of 0
+      * would make the offset below negative (-13) and the copy loop would
+      * write in front of lfn->name[], so reject it.
+      */
+     if (lfn->sequence_number < 1)
+         return -5;
+
+     /* each entry contributes 13 characters; entry n fills slot n-1 */
+     offset = 13 * (lfn->sequence_number - 1);
+     for (i = 0, j = 1; i < 13; i++, j+=2) {
+         /* the characters are stored in three separate byte ranges */
+         if (j == 11)
+             j = 14;
+         else if (j == 26)
+             j = 28;
+
+         if (pointer[j+1] == 0)
+             lfn->name[offset + i] = pointer[j];
+         else if (pointer[j+1] != 0xff || (pointer[0] & 0x40) == 0)
+             return -4;
+         else
+             /* 0xff padding is only accepted in the first (0x40) entry */
+             lfn->name[offset + i] = 0;
+     }
+
+     if (pointer[0] & 0x40)
+         lfn->len = offset + strlen((char*)lfn->name + offset);
+
+     return 0;
+ }
+
+ /*
+  * Convert the 8.3 name of `direntry` into a NUL-terminated
+  * "name.ext" string in lfn->name (lower-cased when
+  * s->downcase_short_names is set) and store its length in lfn->len.
+  *
+  * returns 0 if successful, >0 if no short_name, and <0 on error
+  * (-1: invalid character in the name, -2: invalid character in the
+  * extension)
+  */
+ static int parse_short_name(BDRVVVFATState* s,
+ long_file_name* lfn, direntry_t* direntry)
+ {
+ int i, j;
+
+ if (!is_short_name(direntry))
+ return 1;
+
+ /* j = index of the last non-space character of the 8-byte name */
+ for (j = 7; j >= 0 && direntry->name[j] == ' '; j--);
+ for (i = 0; i <= j; i++) {
+ if (direntry->name[i] <= ' ' || direntry->name[i] > 0x7f)
+ return -1;
+ else if (s->downcase_short_names)
+ lfn->name[i] = qemu_tolower(direntry->name[i]);
+ else
+ lfn->name[i] = direntry->name[i];
+ }
+
+ /* same for the 3-byte extension; j < 0 means there is none */
+ for (j = 2; j >= 0 && direntry->extension[j] == ' '; j--);
+ if (j >= 0) {
+ lfn->name[i++] = '.';
+ lfn->name[i + j + 1] = '\0';
+ /* the extension is copied back to front */
+ for (;j >= 0; j--) {
+ if (direntry->extension[j] <= ' ' || direntry->extension[j] > 0x7f)
+ return -2;
+ else if (s->downcase_short_names)
+ lfn->name[i + j] = qemu_tolower(direntry->extension[j]);
+ else
+ lfn->name[i + j] = direntry->extension[j];
+ }
+ } else
+ /* j == -1 here, so this terminates the string right after the name */
+ lfn->name[i + j + 1] = '\0';
+
+ lfn->len = strlen((char*)lfn->name);
+
+ return 0;
+ }
+
+ /*
+  * Return the FAT entry for `cluster` as seen in the modified FAT copy
+  * (s->fat2).  Clusters below s->last_cluster_of_root_directory are
+  * reported as one sequential chain ending in s->max_fat_value instead
+  * of being looked up.
+  */
+ static inline uint32_t modified_fat_get(BDRVVVFATState* s,
+         unsigned int cluster)
+ {
+     if (cluster < s->last_cluster_of_root_directory)
+         return (cluster + 1 == s->last_cluster_of_root_directory)
+             ? s->max_fat_value : cluster + 1;
+
+     switch (s->fat_type) {
+     case 32: {
+         uint32_t* entry=((uint32_t*)s->fat2)+cluster;
+         return le32_to_cpu(*entry);
+     }
+     case 16: {
+         uint16_t* entry=((uint16_t*)s->fat2)+cluster;
+         return le16_to_cpu(*entry);
+     }
+     default: {
+         /* FAT12: two 12-bit entries share three bytes */
+         const uint8_t* x=s->fat2+cluster*3/2;
+         return ((x[0]|(x[1]<<8))>>(cluster&1?4:0))&0x0fff;
+     }
+     }
+ }
+
+ /*
+  * Report whether any sector of cluster_num is allocated in the qcow
+  * overlay.  Returns 0 when there is no overlay or no sector is
+  * allocated, otherwise the non-zero result of bdrv_is_allocated() for
+  * the first allocated sector found.
+  */
+ static inline int cluster_was_modified(BDRVVVFATState* s, uint32_t cluster_num)
+ {
+     int sector_idx, dummy;
+
+     if (s->qcow == NULL)
+         return 0;
+
+     for (sector_idx = 0; sector_idx < s->sectors_per_cluster; sector_idx++) {
+         int allocated = s->qcow->drv->bdrv_is_allocated(s->qcow,
+                 cluster2sector(s, cluster_num) + sector_idx, 1, &dummy);
+         if (allocated)
+             return allocated;
+     }
+
+     return 0;
+ }
+
+ /*
+  * Return the part of `path` after its last '/', or `path` itself when it
+  * contains no '/'.  The result points into `path`; nothing is allocated.
+  */
+ static const char* get_basename(const char* path)
+ {
+     const char* last_slash = strrchr(path, '/');
+
+     return last_slash ? last_slash + 1 /* strip '/' */ : path;
+ }
+
+/*
+ * The array s->used_clusters holds the states of the clusters. If it is
+ * part of a file, it has bit 2 set, in case of a directory, bit 1. If it
+ * was modified, bit 3 is set.
+ * If any cluster is allocated, but not part of a file or directory, this
+ * driver refuses to commit.
+ */
+typedef enum {
+ USED_DIRECTORY = 1, USED_FILE = 2, USED_ANY = 3, USED_ALLOCATED = 4
+} used_t;
+
+/*
+ * get_cluster_count_for_direntry() not only determines how many clusters
+ * are occupied by direntry, but also if it was renamed or modified.
+ *
+ * A file is thought to be renamed *only* if there already was a file with
+ * exactly the same first cluster, but a different name.
+ *
+ * Further, the files/directories handled by this function are
+ * assumed to be *not* deleted (and *only* those).
+ */
+static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s,
+ direntry_t* direntry, const char* path)
+{
+ /*
+ * This is a little bit tricky:
+ * IF the guest OS just inserts a cluster into the file chain,
+ * and leaves the rest alone, (i.e. the original file had clusters
+ * 15 -> 16, but now has 15 -> 32 -> 16), then the following happens:
+ *
+ * - do_commit will write the cluster into the file at the given
+ * offset, but
+ *
+ * - the cluster which is overwritten should be moved to a later
+ * position in the file.
+ *
+ * I am not aware that any OS does something as braindead, but this
+ * situation could happen anyway when not committing for a long time.
+ * Just to be sure that this does not bite us, detect it, and copy the
+ * contents of the clusters to-be-overwritten into the qcow.
+ */
+ int copy_it = 0;
+ int was_modified = 0;
+ int32_t ret = 0;
+
+ uint32_t cluster_num = begin_of_direntry(direntry);
+ uint32_t offset = 0;
+ int first_mapping_index = -1;
+ mapping_t* mapping = NULL;
+ const char* basename2 = NULL;
+
+ vvfat_close_current_file(s);
+
+ /* the root directory */
+ if (cluster_num == 0)
+ return 0;
+
+ /* write support */
+ if (s->qcow) {
+ basename2 = get_basename(path);
+
+ mapping = find_mapping_for_cluster(s, cluster_num);
+
+ if (mapping) {
+ const char* basename;
+
+ assert(mapping->mode & MODE_DELETED);
+ mapping->mode &= ~MODE_DELETED;
+
+ basename = get_basename(mapping->path);
+
+ assert(mapping->mode & MODE_NORMAL);
+
+ /* rename */
+ if (strcmp(basename, basename2))
+ schedule_rename(s, cluster_num, strdup(path));
+ } else if (is_file(direntry))
+ /* new file */
+ schedule_new_file(s, strdup(path), cluster_num);
+ else {
+ assert(0);
+ return 0;
+ }
+ }
+
+ while(1) {
+ if (s->qcow) {
+ if (!copy_it && cluster_was_modified(s, cluster_num)) {
+ if (mapping == NULL ||
+ mapping->begin > cluster_num ||
+ mapping->end <= cluster_num)
+ mapping = find_mapping_for_cluster(s, cluster_num);
+
+
+ if (mapping &&
+ (mapping->mode & MODE_DIRECTORY) == 0) {
+
+ /* was modified in qcow */
+ if (offset != mapping->info.file.offset + s->cluster_size
+ * (cluster_num - mapping->begin)) {
+ /* offset of this cluster in file chain has changed */
+ assert(0);
+ copy_it = 1;
+ } else if (offset == 0) {
+ const char* basename = get_basename(mapping->path);
+
+ if (strcmp(basename, basename2))
+ copy_it = 1;
+ first_mapping_index = array_index(&(s->mapping), mapping);
+ }
+
+ if (mapping->first_mapping_index != first_mapping_index
+ && mapping->info.file.offset > 0) {
+ assert(0);
+ copy_it = 1;
+ }
+
+ /* need to write out? */
+ if (!was_modified && is_file(direntry)) {
+ was_modified = 1;
+ schedule_writeout(s, mapping->dir_index, offset);
+ }
+ }
+ }
+
+ if (copy_it) {
+ int i, dummy;
+ /*
+ * This is horribly inefficient, but that is okay, since
+ * it is rarely executed, if at all.
+ */
+ int64_t offset = cluster2sector(s, cluster_num);
+
+ vvfat_close_current_file(s);
+ for (i = 0; i < s->sectors_per_cluster; i++)
+ if (!s->qcow->drv->bdrv_is_allocated(s->qcow,
+ offset + i, 1, &dummy)) {
+ if (vvfat_read(s->bs,
+ offset, s->cluster_buffer, 1))
+ return -1;
+ if (s->qcow->drv->bdrv_write(s->qcow,
+ offset, s->cluster_buffer, 1))
+ return -2;
+ }
+ }
+ }
+
+ ret++;
+ if (s->used_clusters[cluster_num] & USED_ANY)
+ return 0;
+ s->used_clusters[cluster_num] = USED_FILE;
+
+ cluster_num = modified_fat_get(s, cluster_num);
+
+ if (fat_eof(s, cluster_num))
+ return ret;
+ else if (cluster_num < 2 || cluster_num > s->max_fat_value - 16)
+ return -1;
+
+ offset += s->cluster_size;
+ }
+}
+
+/*
+ * This function looks at the modified data (qcow).
+ * It returns 0 upon inconsistency or error, and the number of clusters
+ * used by the directory, its subdirectories and their files.
+ */
+static int check_directory_consistency(BDRVVVFATState *s,
+ int cluster_num, const char* path)
+{
+ int ret = 0;
+ unsigned char* cluster = qemu_malloc(s->cluster_size);
+ direntry_t* direntries = (direntry_t*)cluster;
+ mapping_t* mapping = find_mapping_for_cluster(s, cluster_num);
+
+ long_file_name lfn;
+ int path_len = strlen(path);
+ char path2[PATH_MAX];
+
+ assert(path_len < PATH_MAX); /* len was tested before! */
+ pstrcpy(path2, sizeof(path2), path);
+ path2[path_len] = '/';
+ path2[path_len + 1] = '\0';
+
+ if (mapping) {
+ const char* basename = get_basename(mapping->path);
+ const char* basename2 = get_basename(path);
+
+ assert(mapping->mode & MODE_DIRECTORY);
+
+ assert(mapping->mode & MODE_DELETED);
+ mapping->mode &= ~MODE_DELETED;
+
+ if (strcmp(basename, basename2))
+ schedule_rename(s, cluster_num, strdup(path));
+ } else
+ /* new directory */
+ schedule_mkdir(s, cluster_num, strdup(path));
+
+ lfn_init(&lfn);
+ do {
+ int i;
+ int subret = 0;
+
+ ret++;
+
+ if (s->used_clusters[cluster_num] & USED_ANY) {
+ fprintf(stderr, "cluster %d used more than once\n", (int)cluster_num);
+ return 0;
+ }
+ s->used_clusters[cluster_num] = USED_DIRECTORY;
+
+DLOG(fprintf(stderr, "read cluster %d (sector %d)\n", (int)cluster_num, (int)cluster2sector(s, cluster_num)));
+ subret = vvfat_read(s->bs, cluster2sector(s, cluster_num), cluster,
+ s->sectors_per_cluster);
+ if (subret) {
+ fprintf(stderr, "Error fetching direntries\n");
+ fail:
+ free(cluster);
+ return 0;
+ }
+
+ for (i = 0; i < 0x10 * s->sectors_per_cluster; i++) {
+ int cluster_count = 0;
+
+DLOG(fprintf(stderr, "check direntry %d: \n", i); print_direntry(direntries + i));
+ if (is_volume_label(direntries + i) || is_dot(direntries + i) ||
+ is_free(direntries + i))
+ continue;
+
+ subret = parse_long_name(&lfn, direntries + i);
+ if (subret < 0) {
+ fprintf(stderr, "Error in long name\n");
+ goto fail;
+ }
+ if (subret == 0 || is_free(direntries + i))
+ continue;
+
+ if (fat_chksum(direntries+i) != lfn.checksum) {
+ subret = parse_short_name(s, &lfn, direntries + i);
+ if (subret < 0) {
+ fprintf(stderr, "Error in short name (%d)\n", subret);
+ goto fail;
+ }
+ if (subret > 0 || !strcmp((char*)lfn.name, ".")
+ || !strcmp((char*)lfn.name, ".."))
+ continue;
+ }
+ lfn.checksum = 0x100; /* cannot use long name twice */
+
+ if (path_len + 1 + lfn.len >= PATH_MAX) {
+ fprintf(stderr, "Name too long: %s/%s\n", path, lfn.name);
+ goto fail;
+ }
+ pstrcpy(path2 + path_len + 1, sizeof(path2) - path_len - 1,
+ (char*)lfn.name);
+
+ if (is_directory(direntries + i)) {
+ if (begin_of_direntry(direntries + i) == 0) {
+ DLOG(fprintf(stderr, "invalid begin for directory: %s\n", path2); print_direntry(direntries + i));
+ goto fail;
+ }
+ cluster_count = check_directory_consistency(s,
+ begin_of_direntry(direntries + i), path2);
+ if (cluster_count == 0) {
+ DLOG(fprintf(stderr, "problem in directory %s:\n", path2); print_direntry(direntries + i));
+ goto fail;
+ }
+ } else if (is_file(direntries + i)) {
+ /* check file size with FAT */
+ cluster_count = get_cluster_count_for_direntry(s, direntries + i, path2);
+ if (cluster_count !=
+ (le32_to_cpu(direntries[i].size) + s->cluster_size
+ - 1) / s->cluster_size) {
+ DLOG(fprintf(stderr, "Cluster count mismatch\n"));
+ goto fail;
+ }
+ } else
+ assert(0); /* cluster_count = 0; */
+
+ ret += cluster_count;
+ }
+
+ cluster_num = modified_fat_get(s, cluster_num);
+ } while(!fat_eof(s, cluster_num));
+
+ free(cluster);
+ return ret;
+}
+
+ /*
+  * Check that the modified image (FAT plus directory tree as seen through
+  * vvfat_read) is self-consistent, rebuilding s->commits on the way.
+  *
+  * Returns 0 on inconsistency or error; on success returns the (non-zero)
+  * number of used clusters.
+  */
+ static int is_consistent(BDRVVVFATState* s)
+ {
+ int i, check;
+ int used_clusters_count = 0;
+
+ DLOG(checkpoint());
+ /*
+ * - get modified FAT
+ * - compare the two FATs (TODO)
+ * - get buffer for marking used clusters
+ * - recurse direntries from root (using bs->bdrv_read to make
+ * sure to get the new data)
+ * - check that the FAT agrees with the size
+ * - count the number of clusters occupied by this directory and
+ * its files
+ * - check that the cumulative used cluster count agrees with the
+ * FAT
+ * - if all is fine, return number of used clusters
+ */
+ /* s->fat2 is allocated on first use and kept for later calls */
+ if (s->fat2 == NULL) {
+ int size = 0x200 * s->sectors_per_fat;
+ s->fat2 = qemu_malloc(size);
+ memcpy(s->fat2, s->fat.pointer, size);
+ }
+ /* read the first FAT copy through vvfat_read into s->fat2 */
+ check = vvfat_read(s->bs,
+ s->first_sectors_number, s->fat2, s->sectors_per_fat);
+ if (check) {
+ fprintf(stderr, "Could not copy fat\n");
+ return 0;
+ }
+ assert (s->used_clusters);
+ /* clear the file/directory marks but keep USED_ALLOCATED */
+ for (i = 0; i < sector2cluster(s, s->sector_count); i++)
+ s->used_clusters[i] &= ~USED_ANY;
+
+ clear_commits(s);
+
+ /* mark every mapped file/directory as deleted.
+ * (check_directory_consistency() will unmark those still present). */
+ if (s->qcow)
+ for (i = 0; i < s->mapping.next; i++) {
+ mapping_t* mapping = array_get(&(s->mapping), i);
+ if (mapping->first_mapping_index < 0)
+ mapping->mode |= MODE_DELETED;
+ }
+
+ used_clusters_count = check_directory_consistency(s, 0, s->path);
+ if (used_clusters_count <= 0) {
+ DLOG(fprintf(stderr, "problem in directory\n"));
+ return 0;
+ }
+
+ /* `check` is reused as a counter: root directory clusters plus every
+ * later cluster with a non-zero FAT entry */
+ check = s->last_cluster_of_root_directory;
+ for (i = check; i < sector2cluster(s, s->sector_count); i++) {
+ if (modified_fat_get(s, i)) {
+ if(!s->used_clusters[i]) {
+ DLOG(fprintf(stderr, "FAT was modified (%d), but cluster is not used?\n", i));
+ return 0;
+ }
+ check++;
+ }
+
+ if (s->used_clusters[i] == USED_ALLOCATED) {
+ /* allocated, but not used... */
+ DLOG(fprintf(stderr, "unused, modified cluster: %d\n", i));
+ return 0;
+ }
+ }
+
+ /* the count from the FAT must match the count from the directory walk */
+ if (check != used_clusters_count)
+ return 0;
+
+ return used_clusters_count;
+ }
+
+ /*
+  * After inserting or removing entries in s->mapping, shift every stored
+  * mapping index that is >= offset by `adjust`.  This covers
+  * first_mapping_index and, for directories, info.dir.parent_mapping_index.
+  */
+ static inline void adjust_mapping_indices(BDRVVVFATState* s,
+ int offset, int adjust)
+ {
+ int i;
+
+ for (i = 0; i < s->mapping.next; i++) {
+ mapping_t* mapping = array_get(&(s->mapping), i);
+
+ /* expands to an unbraced `if`; the macro stays defined after this function */
+ #define ADJUST_MAPPING_INDEX(name) \
+ if (mapping->name >= offset) \
+ mapping->name += adjust
+
+ ADJUST_MAPPING_INDEX(first_mapping_index);
+ /* info is a union: the dir member is only touched for directories */
+ if (mapping->mode & MODE_DIRECTORY)
+ ADJUST_MAPPING_INDEX(info.dir.parent_mapping_index);
+ }
+ }
+
+ /*
+  * insert or update mapping
+  *
+  * Ensure there is a mapping covering clusters [begin, end) and return
+  * it.  A mapping that starts before `begin` is truncated to end at
+  * `begin`; a new entry is inserted unless one already starts exactly at
+  * `begin`.  A newly inserted mapping has path == NULL.
+  */
+ static mapping_t* insert_mapping(BDRVVVFATState* s,
+ uint32_t begin, uint32_t end)
+ {
+ /*
+ * - find mapping where mapping->begin >= begin,
+ * - if mapping->begin > begin: insert
+ * - adjust all references to mappings!
+ * - else: adjust
+ * - replace name
+ */
+ int index = find_mapping_for_cluster_aux(s, begin, 0, s->mapping.next);
+ mapping_t* mapping = NULL;
+ /* remembered so a moved array (after insert) can be detected below */
+ mapping_t* first_mapping = array_get(&(s->mapping), 0);
+
+ if (index < s->mapping.next && (mapping = array_get(&(s->mapping), index))
+ && mapping->begin < begin) {
+ /* the found mapping overlaps from the left: cut it off at `begin` */
+ mapping->end = begin;
+ index++;
+ /* NOTE(review): index may now equal s->mapping.next - confirm that
+ * array_get() tolerates that */
+ mapping = array_get(&(s->mapping), index);
+ }
+ if (index >= s->mapping.next || mapping->begin > begin) {
+ mapping = array_insert(&(s->mapping), index, 1);
+ mapping->path = NULL;
+ adjust_mapping_indices(s, index, +1);
+ }
+
+ mapping->begin = begin;
+ mapping->end = end;
+
+ DLOG(mapping_t* next_mapping;
+ assert(index + 1 >= s->mapping.next ||
+ ((next_mapping = array_get(&(s->mapping), index + 1)) &&
+ next_mapping->begin >= end)));
+
+ /* if the array storage moved, re-derive s->current_mapping from its
+ * old position relative to the old base */
+ if (s->current_mapping && first_mapping != (mapping_t*)s->mapping.pointer)
+ s->current_mapping = array_get(&(s->mapping),
+ s->current_mapping - first_mapping);
+
+ return mapping;
+ }
+
+ /*
+  * Remove the mapping at mapping_index from s->mapping, freeing its path
+  * when first_mapping_index < 0, and fix up all stored mapping indices.
+  * Always returns 0.
+  */
+ static int remove_mapping(BDRVVVFATState* s, int mapping_index)
+ {
+     mapping_t* old_base = array_get(&(s->mapping), 0);
+     mapping_t* victim = array_get(&(s->mapping), mapping_index);
+
+     /* free mapping */
+     if (victim->first_mapping_index < 0) {
+         free(victim->path);
+     }
+
+     /* remove from s->mapping */
+     array_remove(&(s->mapping), mapping_index);
+
+     /* adjust all references to mappings */
+     adjust_mapping_indices(s, mapping_index, -1);
+
+     /* re-derive the cached pointer if the array storage moved */
+     if (s->current_mapping && old_base != (mapping_t*)s->mapping.pointer) {
+         s->current_mapping = array_get(&(s->mapping),
+                 s->current_mapping - old_base);
+     }
+
+     return 0;
+ }
+
+/*
+ * Shift the directory indices kept in the mappings.
+ *
+ * For every mapping, dir_index is changed by `adjust` when it is >= offset;
+ * for directory mappings the same is done to info.dir.first_dir_index.
+ * `adjust` may be negative.
+ */
+static void adjust_dirindices(BDRVVVFATState* s, int offset, int adjust)
+{
+    int idx = 0;
+
+    while (idx < s->mapping.next) {
+        mapping_t* m = array_get(&(s->mapping), idx);
+
+        if (m->dir_index >= offset) {
+            m->dir_index += adjust;
+        }
+        if (m->mode & MODE_DIRECTORY) {
+            if (m->info.dir.first_dir_index >= offset) {
+                m->info.dir.first_dir_index += adjust;
+            }
+        }
+        idx++;
+    }
+}
+
+/*
+ * Open a gap of `count` entries in s->directory at dir_index and, if that
+ * worked, move all directory indices stored in the mappings accordingly.
+ *
+ * Returns the first new entry, or NULL when array_insert() failed (in which
+ * case no index is touched).
+ */
+static direntry_t* insert_direntries(BDRVVVFATState* s,
+        int dir_index, int count)
+{
+    direntry_t* first_new = array_insert(&(s->directory), dir_index, count);
+
+    if (first_new != NULL) {
+        adjust_dirindices(s, dir_index, count);
+    }
+    return first_new;
+}
+
+/*
+ * Cut `count` entries starting at dir_index out of s->directory and pull
+ * the directory indices stored in the mappings down by the same amount.
+ *
+ * Returns 0 on success, otherwise the non-zero result of
+ * array_remove_slice() (no index is adjusted in that case).
+ */
+static int remove_direntries(BDRVVVFATState* s, int dir_index, int count)
+{
+    int err = array_remove_slice(&(s->directory), dir_index, count);
+
+    if (err == 0) {
+        adjust_dirindices(s, dir_index, -count);
+    }
+    return err;
+}
+
+/*
+ * Adapt the mappings of the cluster chain starting at first cluster
+ * (i.e. if a file starts at first_cluster, the chain is followed according
+ * to the modified fat, and the corresponding entries in s->mapping are
+ * adjusted)
+ *
+ * A mapping beginning exactly at first_cluster must already exist; it
+ * becomes the head of the chain (first_mapping_index = -1) and is tagged
+ * with dir_index and with MODE_DIRECTORY (when dir_index <= 0 or the
+ * direntry is a directory) or MODE_NORMAL.
+ *
+ * The chain is then walked run by run, a run being a stretch of
+ * consecutively numbered clusters. For each run the current mapping's end
+ * is set to one past the run's last cluster; if that lies beyond the old
+ * end, following mappings that begin before it are removed. When the chain
+ * continues at another cluster, a mapping starting there is looked up or
+ * inserted and inherits dir_index, path, mode and read_only from the
+ * current one.
+ *
+ * Always returns 0.
+ */
+static int commit_mappings(BDRVVVFATState* s,
+ uint32_t first_cluster, int dir_index)
+{
+ mapping_t* mapping = find_mapping_for_cluster(s, first_cluster);
+ direntry_t* direntry = array_get(&(s->directory), dir_index);
+ uint32_t cluster = first_cluster;
+
+ vvfat_close_current_file(s);
+
+ assert(mapping);
+ assert(mapping->begin == first_cluster);
+ mapping->first_mapping_index = -1;
+ mapping->dir_index = dir_index;
+ mapping->mode = (dir_index <= 0 || is_directory(direntry)) ?
+ MODE_DIRECTORY : MODE_NORMAL;
+
+ while (!fat_eof(s, cluster)) {
+ uint32_t c, c1;
+
+ for (c = cluster, c1 = modified_fat_get(s, c); c + 1 == c1;
+ c = c1, c1 = modified_fat_get(s, c1)); /* empty body: advance while the successor is c + 1 */
+
+ c++; /* one past the last cluster of this run */
+ if (c > mapping->end) {
+ int index = array_index(&(s->mapping), mapping);
+ int i, max_i = s->mapping.next - index;
+ for (i = 1; i < max_i && mapping[i].begin < c; i++);
+ while (--i > 0)
+ remove_mapping(s, index + 1); /* always index + 1: later entries shift down after each removal */
+ }
+ assert(mapping == array_get(&(s->mapping), s->mapping.next - 1)
+ || mapping[1].begin >= c);
+ mapping->end = c;
+
+ if (!fat_eof(s, c1)) {
+ int i = find_mapping_for_cluster_aux(s, c1, 0, s->mapping.next);
+ mapping_t* next_mapping = i >= s->mapping.next ? NULL :
+ array_get(&(s->mapping), i);
+
+ if (next_mapping == NULL || next_mapping->begin > c1) {
+ int i1 = array_index(&(s->mapping), mapping);
+
+ next_mapping = insert_mapping(s, c1, c1+1);
+
+ if (c1 < c)
+ i1++; /* presumably the insert landed below the current mapping and pushed it up one slot */
+ mapping = array_get(&(s->mapping), i1); /* re-fetch: the insert may have moved the array */
+ }
+
+ next_mapping->dir_index = mapping->dir_index;
+ next_mapping->first_mapping_index =
+ mapping->first_mapping_index < 0 ?
+ array_index(&(s->mapping), mapping) :
+ mapping->first_mapping_index;
+ next_mapping->path = mapping->path; /* shared pointer, not a copy */
+ next_mapping->mode = mapping->mode;
+ next_mapping->read_only = mapping->read_only;
+ if (mapping->mode & MODE_DIRECTORY) {
+ next_mapping->info.dir.parent_mapping_index =
+ mapping->info.dir.parent_mapping_index;
+ next_mapping->info.dir.first_dir_index =
+ mapping->info.dir.first_dir_index +
+ 0x10 * s->sectors_per_cluster *
+ (mapping->end - mapping->begin);
+ } else
+ next_mapping->info.file.offset = mapping->info.file.offset +
+ mapping->end - mapping->begin;
+
+ mapping = next_mapping;
+ }
+
+ cluster = c1;
+ }
+
+ return 0;
+}
+
+/*
+ * Re-read one directory from the modified disk image into s->directory and
+ * bring the mappings of it and of all its subdirectories up to date.
+ *
+ * dir_index             index in s->directory of the entry describing the
+ *                       directory; 0 selects the root directory
+ * parent_mapping_index  value stored in the directory mapping's
+ *                       info.dir.parent_mapping_index
+ *
+ * The directory's slice of s->directory is grown or shrunk to match the
+ * length of its cluster chain in the modified FAT, every cluster is read
+ * back through vvfat_read(), commit_mappings() is run on the chain, and the
+ * function then recurses into every directory entry for which is_dot() is
+ * false.
+ *
+ * Returns 0 on success, -1 if s->directory could not be grown, or the
+ * non-zero result of a failing vvfat_read(), commit_mappings() or recursive
+ * call.
+ */
+static int commit_direntries(BDRVVVFATState* s,
+        int dir_index, int parent_mapping_index)
+{
+    direntry_t* direntry = array_get(&(s->directory), dir_index);
+    uint32_t first_cluster;
+    mapping_t* mapping;
+
+    int factor = 0x10 * s->sectors_per_cluster;
+    int old_cluster_count, new_cluster_count;
+    int current_dir_index;
+    int first_dir_index;
+    int ret, i;
+    uint32_t c;
+
+    /*
+     * Validate first: nothing below may look through direntry or mapping
+     * before the corresponding assertion has had a chance to fire.
+     */
+    assert(direntry);
+    first_cluster = dir_index == 0 ? 0 : begin_of_direntry(direntry);
+    mapping = find_mapping_for_cluster(s, first_cluster);
+    assert(mapping);
+    assert(mapping->begin == first_cluster);
+    assert(mapping->info.dir.first_dir_index < s->directory.next);
+    assert(mapping->mode & MODE_DIRECTORY);
+    assert(dir_index == 0 || is_directory(direntry));
+
+DLOG(fprintf(stderr, "commit_direntries for %s, parent_mapping_index %d\n", mapping->path, parent_mapping_index));
+
+    current_dir_index = mapping->info.dir.first_dir_index;
+    first_dir_index = current_dir_index;
+    mapping->info.dir.parent_mapping_index = parent_mapping_index;
+
+    if (first_cluster == 0) {
+        /* the root directory has a fixed size and no FAT chain to count */
+        old_cluster_count = new_cluster_count =
+            s->last_cluster_of_root_directory;
+    } else {
+        for (old_cluster_count = 0, c = first_cluster; !fat_eof(s, c);
+                c = fat_get(s, c))
+            old_cluster_count++;
+
+        for (new_cluster_count = 0, c = first_cluster; !fat_eof(s, c);
+                c = modified_fat_get(s, c))
+            new_cluster_count++;
+    }
+
+    /* resize this directory's slice of s->directory to the new length */
+    if (new_cluster_count > old_cluster_count) {
+        if (insert_direntries(s,
+                current_dir_index + factor * old_cluster_count,
+                factor * (new_cluster_count - old_cluster_count)) == NULL)
+            return -1;
+    } else if (new_cluster_count < old_cluster_count)
+        remove_direntries(s,
+                current_dir_index + factor * new_cluster_count,
+                factor * (old_cluster_count - new_cluster_count));
+
+    /* pull the directory contents back in, one cluster at a time */
+    for (c = first_cluster; !fat_eof(s, c); c = modified_fat_get(s, c)) {
+        void* dest = array_get(&(s->directory), current_dir_index);
+
+        ret = vvfat_read(s->bs, cluster2sector(s, c), dest,
+                s->sectors_per_cluster);
+        if (ret)
+            return ret;
+        assert(!strncmp(s->directory.pointer, "QEMU", 4));
+        current_dir_index += factor;
+    }
+
+    ret = commit_mappings(s, first_cluster, dir_index);
+    if (ret)
+        return ret;
+
+    /* recurse */
+    for (i = 0; i < factor * new_cluster_count; i++) {
+        direntry = array_get(&(s->directory), first_dir_index + i);
+        if (is_directory(direntry) && !is_dot(direntry)) {
+            /* look the mapping up again: the recursion may have moved it */
+            mapping = find_mapping_for_cluster(s, first_cluster);
+            assert(mapping->mode & MODE_DIRECTORY);
+            ret = commit_direntries(s, first_dir_index + i,
+                    array_index(&(s->mapping), mapping));
+            if (ret)
+                return ret;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * commit one file (adjust contents, adjust mapping)
+ *
+ * Copies the data of the file described by s->directory[dir_index] from
+ * byte `offset` (which must be a multiple of the cluster size and smaller
+ * than the file size) up to the file size out of the virtual disk into the
+ * host file named by the mapping's path, truncates the host file to the
+ * file size and then runs commit_mappings() on the cluster chain.
+ *
+ * Returns the result of commit_mappings() on success. Errors:
+ *   -1  no mapping exists for the file's first cluster
+ *   <0  result of open() when the host file cannot be opened
+ *   -3  seeking to `offset` failed
+ *   <0  result of a failing vvfat_read()
+ *   -2  the data could not be written completely
+ *   -4  truncating the host file failed
+ * The scratch buffer and the file descriptor are released on every path.
+ */
+static int commit_one_file(BDRVVVFATState* s,
+        int dir_index, uint32_t offset)
+{
+    direntry_t* direntry = array_get(&(s->directory), dir_index);
+    uint32_t c = begin_of_direntry(direntry);
+    uint32_t first_cluster = c;
+    mapping_t* mapping = find_mapping_for_cluster(s, c);
+    uint32_t size = filesize_of_direntry(direntry);
+    char* cluster;
+    uint32_t i;
+    int fd;
+    int ret;
+
+    assert(offset < size);
+    assert((offset % s->cluster_size) == 0);
+
+    if (mapping == NULL)
+        return -1;
+
+    /*
+     * Byte `offset` lives in chain element offset / cluster_size, so that
+     * many hops are needed. Starting the counter at cluster_size (as this
+     * loop used to) stops one cluster short for every non-zero offset.
+     */
+    for (i = 0; i < offset; i += s->cluster_size)
+        c = modified_fat_get(s, c);
+
+    cluster = qemu_malloc(s->cluster_size);
+
+    fd = open(mapping->path, O_RDWR | O_CREAT | O_BINARY, 0666);
+    if (fd < 0) {
+        fprintf(stderr, "Could not open %s... (%s, %d)\n", mapping->path,
+                strerror(errno), errno);
+        ret = fd;
+        goto out_free;
+    }
+    if (offset > 0 && lseek(fd, offset, SEEK_SET) != offset) {
+        ret = -3;
+        goto out_close;
+    }
+
+    while (offset < size) {
+        uint32_t c1;
+        int rest_size = (size - offset > s->cluster_size ?
+                s->cluster_size : size - offset);
+
+        c1 = modified_fat_get(s, c);
+
+        assert((size - offset == 0 && fat_eof(s, c)) ||
+                (size > offset && c >=2 && !fat_eof(s, c)));
+
+        /* read whole sectors; only rest_size bytes of them are written */
+        ret = vvfat_read(s->bs, cluster2sector(s, c),
+                (uint8_t*)cluster, (rest_size + 0x1ff) / 0x200);
+        if (ret < 0)
+            goto out_close;
+
+        /* a short write would silently corrupt the host file */
+        if (write(fd, cluster, rest_size) != rest_size) {
+            ret = -2;
+            goto out_close;
+        }
+
+        offset += rest_size;
+        c = c1;
+    }
+
+    if (ftruncate(fd, size)) {
+        ret = -4;
+        goto out_close;
+    }
+
+    close(fd);
+    free(cluster);
+
+    return commit_mappings(s, first_cluster, dir_index);
+
+out_close:
+    close(fd);
+out_free:
+    free(cluster);
+    return ret;
+}
+
+#ifdef DEBUG
+/*
+ * test, if all mappings point to valid direntries
+ *
+ * Debug-only consistency check (compiled under DEBUG). Mappings flagged
+ * MODE_DELETED are reported on stderr and skipped. For every other mapping
+ * it asserts that dir_index lies inside s->directory, that the mapping
+ * either starts at the cluster named by that direntry or is a continuation
+ * (first_mapping_index >= 0), and, for directories, that the directory's
+ * entries fit inside s->directory and start on a cluster-sized boundary.
+ */
+static void check1(BDRVVVFATState* s)
+{
+ int i;
+ for (i = 0; i < s->mapping.next; i++) {
+ mapping_t* mapping = array_get(&(s->mapping), i);
+ if (mapping->mode & MODE_DELETED) {
+ fprintf(stderr, "deleted\n");
+ continue;
+ }
+ assert(mapping->dir_index >= 0);
+ assert(mapping->dir_index < s->directory.next);
+ direntry_t* direntry = array_get(&(s->directory), mapping->dir_index);
+ assert(mapping->begin == begin_of_direntry(direntry) || mapping->first_mapping_index >= 0);
+ if (mapping->mode & MODE_DIRECTORY) {
+ assert(mapping->info.dir.first_dir_index + 0x10 * s->sectors_per_cluster * (mapping->end - mapping->begin) <= s->directory.next);
+ assert((mapping->info.dir.first_dir_index % (0x10 * s->sectors_per_cluster)) == 0);
+ }
+ }
+}
+
+/*
+ * test, if all direntries have mappings
+ *
+ * Debug-only consistency check (compiled under DEBUG). For every short-name
+ * direntry with a non-zero start cluster a mapping must exist; unless the
+ * entry is a dot entry, that mapping must point back at the entry and begin
+ * at its start cluster. At every index that is a multiple of
+ * 0x10 * sectors_per_cluster the directory mappings are scanned: at most
+ * one of them may cover that index, continuation mappings must name the
+ * chain head seen last, and a mapping without a parent must be mapping 0.
+ *
+ * NOTE(review): `assert(++count == 1)` has a side effect inside assert();
+ * with NDEBUG defined the increment would vanish -- harmless only as long
+ * as DEBUG builds never define NDEBUG.
+ */
+static void check2(BDRVVVFATState* s)
+{
+ int i;
+ int first_mapping = -1; /* index of the chain head seen most recently, -1 if none */
+
+ for (i = 0; i < s->directory.next; i++) {
+ direntry_t* direntry = array_get(&(s->directory), i);
+
+ if (is_short_name(direntry) && begin_of_direntry(direntry)) {
+ mapping_t* mapping = find_mapping_for_cluster(s, begin_of_direntry(direntry));
+ assert(mapping);
+ assert(mapping->dir_index == i || is_dot(direntry));
+ assert(mapping->begin == begin_of_direntry(direntry) || is_dot(direntry));
+ }
+
+ if ((i % (0x10 * s->sectors_per_cluster)) == 0) {
+ /* cluster start */
+ int j, count = 0;
+
+ for (j = 0; j < s->mapping.next; j++) {
+ mapping_t* mapping = array_get(&(s->mapping), j);
+ if (mapping->mode & MODE_DELETED)
+ continue;
+ if (mapping->mode & MODE_DIRECTORY) {
+ if (mapping->info.dir.first_dir_index <= i && mapping->info.dir.first_dir_index + 0x10 * s->sectors_per_cluster > i) {
+ assert(++count == 1);
+ if (mapping->first_mapping_index == -1)
+ first_mapping = array_index(&(s->mapping), mapping);
+ else
+ assert(first_mapping == mapping->first_mapping_index);
+ if (mapping->info.dir.parent_mapping_index < 0)
+ assert(j == 0);
+ else {
+ mapping_t* parent = array_get(&(s->mapping), mapping->info.dir.parent_mapping_index);
+ assert(parent->mode & MODE_DIRECTORY);
+ assert(parent->info.dir.first_dir_index < mapping->info.dir.first_dir_index);
+ }
+ }
+ }
+ }
+ if (count == 0)
+ first_mapping = -1;
+ }
+ }
+}
+#endif
+/*
+ * Carry out the ACTION_RENAME and ACTION_MKDIR entries of s->commits on the
+ * host and drop them from the list; other entries stay where they are.
+ *
+ * ACTION_RENAME: the mapping covering the commit's cluster takes over
+ * commit->path and the host object is rename()d. For a directory, a rename
+ * is additionally scheduled (schedule_rename) for every file and non-dot
+ * directory entry in it, with the old path prefix replaced by the new one.
+ * The old path string is freed.
+ *
+ * ACTION_MKDIR: the directory is created on the host, a one-cluster
+ * directory mapping is inserted, one cluster's worth of entries is appended
+ * to s->directory, and the parent mapping is located by comparing paths.
+ *
+ * Returns 0 on success, -2 if rename() fails, -5 if mkdir() fails, -6 if
+ * insert_mapping() returns NULL.
+ *
+ * NOTE(review): the result of insert_direntries() in the mkdir branch is
+ * not checked -- confirm whether it can fail here.
+ */
+static int handle_renames_and_mkdirs(BDRVVVFATState* s)
+{
+ int i;
+
+#ifdef DEBUG
+ fprintf(stderr, "handle_renames\n");
+ for (i = 0; i < s->commits.next; i++) {
+ commit_t* commit = array_get(&(s->commits), i);
+ fprintf(stderr, "%d, %s (%d, %d)\n", i, commit->path ? commit->path : "(null)", commit->param.rename.cluster, commit->action);
+ }
+#endif
+
+ for (i = 0; i < s->commits.next;) { /* no increment: handled entries are removed in place */
+ commit_t* commit = array_get(&(s->commits), i);
+ if (commit->action == ACTION_RENAME) {
+ mapping_t* mapping = find_mapping_for_cluster(s,
+ commit->param.rename.cluster);
+ char* old_path = mapping->path;
+
+ assert(commit->path);
+ mapping->path = commit->path; /* the mapping takes over the commit's string */
+ if (rename(old_path, mapping->path))
+ return -2;
+
+ if (mapping->mode & MODE_DIRECTORY) {
+ int l1 = strlen(mapping->path);
+ int l2 = strlen(old_path);
+ int diff = l1 - l2;
+ direntry_t* direntry = array_get(&(s->directory),
+ mapping->info.dir.first_dir_index);
+ uint32_t c = mapping->begin;
+ int i = 0; /* shadows the outer commit index within this block */
+
+ /* recurse */
+ while (!fat_eof(s, c)) {
+ do {
+ direntry_t* d = direntry + i;
+
+ if (is_file(d) || (is_directory(d) && !is_dot(d))) {
+ mapping_t* m = find_mapping_for_cluster(s,
+ begin_of_direntry(d));
+ int l = strlen(m->path);
+ char* new_path = qemu_malloc(l + diff + 1);
+
+ assert(!strncmp(m->path, mapping->path, l2));
+
+ pstrcpy(new_path, l + diff + 1, mapping->path);
+ pstrcpy(new_path + l1, l + diff + 1 - l1,
+ m->path + l2); /* new prefix + old remainder */
+
+ schedule_rename(s, m->begin, new_path);
+ }
+ i++;
+ } while((i % (0x10 * s->sectors_per_cluster)) != 0);
+ c = fat_get(s, c);
+ }
+ }
+
+ free(old_path);
+ array_remove(&(s->commits), i);
+ continue;
+ } else if (commit->action == ACTION_MKDIR) {
+ mapping_t* mapping;
+ int j, parent_path_len;
+
+#ifdef __MINGW32__
+ if (mkdir(commit->path))
+ return -5;
+#else
+ if (mkdir(commit->path, 0755))
+ return -5;
+#endif
+
+ mapping = insert_mapping(s, commit->param.mkdir.cluster,
+ commit->param.mkdir.cluster + 1);
+ if (mapping == NULL)
+ return -6;
+
+ mapping->mode = MODE_DIRECTORY;
+ mapping->read_only = 0;
+ mapping->path = commit->path;
+ j = s->directory.next; /* the new entries go at the current end of s->directory */
+ assert(j);
+ insert_direntries(s, s->directory.next,
+ 0x10 * s->sectors_per_cluster);
+ mapping->info.dir.first_dir_index = j;
+
+ parent_path_len = strlen(commit->path)
+ - strlen(get_basename(commit->path)) - 1;
+ for (j = 0; j < s->mapping.next; j++) {
+ mapping_t* m = array_get(&(s->mapping), j);
+ if (m->first_mapping_index < 0 && m != mapping &&
+ !strncmp(m->path, mapping->path, parent_path_len) &&
+ strlen(m->path) == parent_path_len)
+ break;
+ }
+ assert(j < s->mapping.next);
+ mapping->info.dir.parent_mapping_index = j;
+
+ array_remove(&(s->commits), i);
+ continue;
+ }
+
+ i++;
+ }
+ return 0;
+}
+
+/*
+ * TODO: make sure that the short name is not matching *another* file
+ *
+ * Works through s->commits in order and stops at the first failure.
+ *
+ * ACTION_WRITEOUT: writes the file at commit->param.writeout.dir_index back
+ * to the host starting at modified_offset (failure code -3).
+ * ACTION_NEW_FILE: looks for the file direntry that starts at the commit's
+ * first cluster (-6 if there is none), makes sure a mapping starts exactly
+ * at that cluster, attaches commit->path to it and writes the whole file
+ * out (failure code -7).
+ * ACTION_RENAME / ACTION_MKDIR are not expected here (assert, code -2).
+ *
+ * The commits visited so far are then removed from the list.
+ *
+ * Returns 0 on success, -1 if that removal fails, otherwise the failure
+ * code described above.
+ */
+static int handle_commits(BDRVVVFATState* s)
+{
+ int i, fail = 0;
+
+ vvfat_close_current_file(s);
+
+ for (i = 0; !fail && i < s->commits.next; i++) {
+ commit_t* commit = array_get(&(s->commits), i);
+ switch(commit->action) {
+ case ACTION_RENAME: case ACTION_MKDIR:
+ assert(0);
+ fail = -2;
+ break;
+ case ACTION_WRITEOUT: {
+ direntry_t* entry = array_get(&(s->directory),
+ commit->param.writeout.dir_index);
+ uint32_t begin = begin_of_direntry(entry);
+ mapping_t* mapping = find_mapping_for_cluster(s, begin);
+
+ assert(mapping);
+ assert(mapping->begin == begin);
+ assert(commit->path == NULL);
+
+ if (commit_one_file(s, commit->param.writeout.dir_index,
+ commit->param.writeout.modified_offset))
+ fail = -3;
+
+ break;
+ }
+ case ACTION_NEW_FILE: {
+ int begin = commit->param.new_file.first_cluster;
+ mapping_t* mapping = find_mapping_for_cluster(s, begin);
+ direntry_t* entry;
+ int i; /* shadows the commit loop counter; used as directory index here */
+
+ /* find direntry */
+ for (i = 0; i < s->directory.next; i++) {
+ entry = array_get(&(s->directory), i);
+ if (is_file(entry) && begin_of_direntry(entry) == begin)
+ break;
+ }
+
+ if (i >= s->directory.next) {
+ fail = -6;
+ continue; /* continues the outer for loop, which then ends because fail is set */
+ }
+
+ /* make sure there exists an initial mapping */
+ if (mapping && mapping->begin != begin) {
+ mapping->end = begin; /* cut the covering mapping short so a new one can start at begin */
+ mapping = NULL;
+ }
+ if (mapping == NULL) {
+ mapping = insert_mapping(s, begin, begin+1);
+ }
+ /* most members will be fixed in commit_mappings() */
+ assert(commit->path);
+ mapping->path = commit->path;
+ mapping->read_only = 0;
+ mapping->mode = MODE_NORMAL;
+ mapping->info.file.offset = 0;
+
+ if (commit_one_file(s, i, 0))
+ fail = -7;
+
+ break;
+ }
+ default:
+ assert(0);
+ }
+ }
+ if (i > 0 && array_remove_slice(&(s->commits), 0, i))
+ return -1;
+ return fail;
+}
+/*
+ * Remove the host files/directories belonging to mappings flagged
+ * MODE_DELETED and drop those mappings (mapping 0 is never considered).
+ *
+ * Passes over s->mapping are repeated while the previous pass both deferred
+ * something (rmdir() failed with ENOTEMPTY) and deleted something.
+ *
+ * Per deleted mapping: if its direntry is_free() and it is a directory, the
+ * directory is rmdir()ed and its entries are cut out of s->directory; if
+ * the direntry is not free, the path is unlink()ed. Unless the directory
+ * was deferred, the mapping is then removed.
+ *
+ * Returns 0 on success, -5 if rmdir() fails for a reason other than
+ * ENOTEMPTY, -4 if unlink() fails.
+ *
+ * NOTE(review): as the braces stand, the `else` with unlink() pairs with
+ * the is_free() test, not with the MODE_DIRECTORY test, so a file mapping
+ * whose direntry is free is dropped without touching the host file --
+ * confirm the intended nesting.
+ * NOTE(review): after remove_mapping(s, i) the loop still advances i, so
+ * the entry that moved into slot i is presumably not looked at in this
+ * pass -- confirm.
+ */
+static int handle_deletes(BDRVVVFATState* s)
+{
+ int i, deferred = 1, deleted = 1;
+
+ /* delete files corresponding to mappings marked as deleted */
+ /* handle DELETEs and unused mappings (modified_fat_get(s, mapping->begin) == 0) */
+ while (deferred && deleted) {
+ deferred = 0;
+ deleted = 0;
+
+ for (i = 1; i < s->mapping.next; i++) {
+ mapping_t* mapping = array_get(&(s->mapping), i);
+ if (mapping->mode & MODE_DELETED) {
+ direntry_t* entry = array_get(&(s->directory),
+ mapping->dir_index);
+
+ if (is_free(entry)) {
+ /* remove file/directory */
+ if (mapping->mode & MODE_DIRECTORY) {
+ int j, next_dir_index = s->directory.next,
+ first_dir_index = mapping->info.dir.first_dir_index;
+
+ if (rmdir(mapping->path) < 0) {
+ if (errno == ENOTEMPTY) {
+ deferred++; /* retry in a later pass, once the contents are gone */
+ continue;
+ } else
+ return -5;
+ }
+
+ for (j = 1; j < s->mapping.next; j++) { /* find the smallest first_dir_index above ours */
+ mapping_t* m = array_get(&(s->mapping), j);
+ if (m->mode & MODE_DIRECTORY &&
+ m->info.dir.first_dir_index >
+ first_dir_index &&
+ m->info.dir.first_dir_index <
+ next_dir_index)
+ next_dir_index =
+ m->info.dir.first_dir_index;
+ }
+ remove_direntries(s, first_dir_index,
+ next_dir_index - first_dir_index);
+
+ deleted++;
+ }
+ } else {
+ if (unlink(mapping->path))
+ return -4;
+ deleted++;
+ }
+ DLOG(fprintf(stderr, "DELETE (%d)\n", i); print_mapping(mapping); print_direntry(entry));
+ remove_mapping(s, i);
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Bring the host directory in line with what has been written to the
+ * virtual disk.
+ *
+ * With an empty commit list this is a no-op. Otherwise, in this order:
+ *  - scheduled renames and mkdirs are applied
+ *  - s->fat2 is copied over s->fat
+ *  - the direntries are re-read recursively, starting at the root
+ *  - the remaining commits (new and modified files) are written out
+ *  - files belonging to mappings marked as deleted are removed
+ *  - the qcow overlay is emptied and s->used_clusters is cleared
+ *
+ * Returns 0 on success. The first failing step is reported on stderr,
+ * trips assert(0) and has its error code returned.
+ */
+static int do_commit(BDRVVVFATState* s)
+{
+    int ret;
+
+    /* the real meat are the commits. Nothing to do? Move along! */
+    if (!s->commits.next) {
+        return 0;
+    }
+
+    vvfat_close_current_file(s);
+
+    if ((ret = handle_renames_and_mkdirs(s)) != 0) {
+        fprintf(stderr, "Error handling renames (%d)\n", ret);
+        assert(0);
+        return ret;
+    }
+
+    /* copy FAT (with bdrv_read) */
+    memcpy(s->fat.pointer, s->fat2, 0x200 * s->sectors_per_fat);
+
+    /* recurse direntries from root (using bs->bdrv_read) */
+    if ((ret = commit_direntries(s, 0, -1)) != 0) {
+        fprintf(stderr, "Fatal: error while committing (%d)\n", ret);
+        assert(0);
+        return ret;
+    }
+
+    if ((ret = handle_commits(s)) != 0) {
+        fprintf(stderr, "Error handling commits (%d)\n", ret);
+        assert(0);
+        return ret;
+    }
+
+    if ((ret = handle_deletes(s)) != 0) {
+        fprintf(stderr, "Error deleting\n");
+        assert(0);
+        return ret;
+    }
+
+    /* everything is on the host now: start over with an empty overlay */
+    s->qcow->drv->bdrv_make_empty(s->qcow);
+    memset(s->used_clusters, 0, sector2cluster(s, s->sector_count));
+
+DLOG(checkpoint());
+    return 0;
+}
+
+/*
+ * Commit the pending changes if the virtual disk currently passes
+ * is_consistent().
+ *
+ * Returns -1 without committing when it does not, otherwise the result of
+ * do_commit().
+ */
+static int try_commit(BDRVVVFATState* s)
+{
+    vvfat_close_current_file(s);
+DLOG(checkpoint());
+    return is_consistent(s) ? do_commit(s) : -1;
+}
+/*
+ * Write nb_sectors sectors from buf to the virtual disk at sector_num.
+ *
+ * The write is refused (-1) when it starts below s->first_sectors_number,
+ * when it touches a cluster of a read-only mapping, or when it touches a
+ * directory cluster and either parse_long_name() rejects an entry or a
+ * short-name entry with the read-only attribute bit set differs from the
+ * entry currently stored in s->directory.
+ *
+ * Accepted data goes to the qcow overlay; the clusters touched are marked
+ * USED_ALLOCATED and a commit is attempted.
+ *
+ * Returns 0 on success (the result of try_commit() is not propagated), -1
+ * for a refused write, or the negative result of the overlay's bdrv_write.
+ */
+static int vvfat_write(BlockDriverState *bs, int64_t sector_num,
+ const uint8_t *buf, int nb_sectors)
+{
+ BDRVVVFATState *s = bs->opaque;
+ int i, ret;
+
+DLOG(checkpoint());
+
+ vvfat_close_current_file(s);
+
+ /*
+ * Some sanity checks:
+ * - do not allow writing to the boot sector
+ * - do not allow to write non-ASCII filenames
+ */
+
+ if (sector_num < s->first_sectors_number)
+ return -1;
+
+ for (i = sector2cluster(s, sector_num);
+ i <= sector2cluster(s, sector_num + nb_sectors - 1);) {
+ mapping_t* mapping = find_mapping_for_cluster(s, i);
+ if (mapping) {
+ if (mapping->read_only) {
+ fprintf(stderr, "Tried to write to write-protected file %s\n",
+ mapping->path);
+ return -1;
+ }
+
+ if (mapping->mode & MODE_DIRECTORY) {
+ int begin = cluster2sector(s, i);
+ int end = begin + s->sectors_per_cluster, k;
+ int dir_index;
+ const direntry_t* direntries;
+ long_file_name lfn;
+
+ lfn_init(&lfn);
+
+ if (begin < sector_num) /* clamp begin..end to the sectors actually being written */
+ begin = sector_num;
+ if (end > sector_num + nb_sectors)
+ end = sector_num + nb_sectors;
+ dir_index = mapping->dir_index +
+ 0x10 * (begin - mapping->begin * s->sectors_per_cluster);
+ direntries = (direntry_t*)(buf + 0x200 * (begin - sector_num));
+
+ for (k = 0; k < (end - begin) * 0x10; k++) {
+ /* do not allow non-ASCII filenames */
+ if (parse_long_name(&lfn, direntries + k) < 0) {
+ fprintf(stderr, "Warning: non-ASCII filename\n");
+ return -1;
+ }
+ /* no access to the direntry of a read-only file */
+ else if (is_short_name(direntries+k) &&
+ (direntries[k].attributes & 1)) {
+ if (memcmp(direntries + k,
+ array_get(&(s->directory), dir_index + k),
+ sizeof(direntry_t))) {
+ fprintf(stderr, "Warning: tried to write to write-protected file\n");
+ return -1;
+ }
+ }
+ }
+ }
+ i = mapping->end; /* the rest of this mapping needs no further check */
+ } else
+ i++;
+ }
+
+ /*
+ * Use qcow backend. Commit later.
+ */
+DLOG(fprintf(stderr, "Write to qcow backend: %d + %d\n", (int)sector_num, nb_sectors));
+ ret = s->qcow->drv->bdrv_write(s->qcow, sector_num, buf, nb_sectors);
+ if (ret < 0) {
+ fprintf(stderr, "Error writing to qcow backend\n");
+ return ret;
+ }
+
+ for (i = sector2cluster(s, sector_num);
+ i <= sector2cluster(s, sector_num + nb_sectors - 1); i++)
+ if (i >= 0)
+ s->used_clusters[i] |= USED_ALLOCATED;
+
+DLOG(checkpoint());
+ /* TODO: add timeout */
+ try_commit(s); /* result deliberately not propagated to the caller */
+
+DLOG(checkpoint());
+ return 0;
+}
+
+/*
+ * Report how many sectors starting at sector_num exist on the virtual disk.
+ *
+ * *n receives the number of sectors between sector_num and the end of the
+ * disk, limited to nb_sectors. Returns 0 when that number is negative
+ * (sector_num lies beyond the end), 1 otherwise.
+ */
+static int vvfat_is_allocated(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, int* n)
+{
+    BDRVVVFATState* s = bs->opaque;
+    int remaining = s->sector_count - sector_num;
+
+    if (remaining > nb_sectors) {
+        *n = nb_sectors;
+        return 1;
+    }
+
+    *n = remaining;
+    return remaining < 0 ? 0 : 1;
+}
+
+/*
+ * Callback installed in vvfat_write_target: tries to commit the pending
+ * changes of the vvfat instance stored in bs->opaque. sector_num, buffer
+ * and nb_sectors are ignored. Returns the result of try_commit().
+ */
+static int write_target_commit(BlockDriverState *bs, int64_t sector_num,
+        const uint8_t* buffer, int nb_sectors)
+{
+    BDRVVVFATState* vvfat = bs->opaque;
+    int rc = try_commit(vvfat);
+
+    return rc;
+}
+
+/*
+ * Callback installed in vvfat_write_target: deletes the qcow overlay of the
+ * vvfat instance stored in bs->opaque and frees the overlay's file name.
+ */
+static void write_target_close(BlockDriverState *bs)
+{
+    BDRVVVFATState* vvfat = bs->opaque;
+
+    bdrv_delete(vvfat->qcow);
+    free(vvfat->qcow_filename);
+}
+/*
+ * Driver table attached to the fake backing device that
+ * enable_write_target() creates. It is initialised positionally; the only
+ * callbacks supplied are write_target_commit and write_target_close.
+ * NOTE(review): which BlockDriver members these positions correspond to
+ * depends on the struct's field order, which is not visible here --
+ * presumably the write and close hooks; confirm against the BlockDriver
+ * definition.
+ */
+static BlockDriver vvfat_write_target = {
+ "vvfat_write_target", 0, NULL, NULL, NULL,
+ write_target_commit,
+ write_target_close,
+ NULL, NULL, NULL
+};
+
+/*
+ * Set up everything needed to accept writes to the virtual disk:
+ *  - the per-cluster s->used_clusters map (one byte per cluster)
+ *  - the s->commits list
+ *  - a temporary qcow image (s->qcow, named s->qcow_filename) that receives
+ *    the written sectors; on non-Windows hosts the file is unlinked as
+ *    soon as it has been opened
+ *  - a fake backing device on s->bs whose driver is vvfat_write_target
+ *
+ * Returns 0 on success and -1 on failure. On failure everything allocated
+ * here is released again and the corresponding pointers are reset to NULL.
+ */
+static int enable_write_target(BDRVVVFATState *s)
+{
+    int size = sector2cluster(s, s->sector_count);
+
+    s->used_clusters = calloc(size, 1);
+    if (s->used_clusters == NULL)
+        return -1;
+
+    array_init(&(s->commits), sizeof(commit_t));
+
+    s->qcow_filename = qemu_malloc(1024);
+    get_tmp_filename(s->qcow_filename, 1024);
+    if (bdrv_create(bdrv_find_format("qcow"),
+                s->qcow_filename, s->sector_count, "fat:", 0) < 0)
+        goto err_free_name;
+
+    s->qcow = bdrv_new("");
+    if (s->qcow == NULL)
+        goto err_free_name;
+    if (bdrv_open(s->qcow, s->qcow_filename, 0) < 0)
+        goto err_delete_qcow;
+
+#ifndef _WIN32
+    unlink(s->qcow_filename);
+#endif
+
+    s->bs->backing_hd = calloc(1, sizeof(BlockDriverState));
+    if (s->bs->backing_hd == NULL)
+        goto err_delete_qcow;
+    s->bs->backing_hd->drv = &vvfat_write_target;
+    s->bs->backing_hd->opaque = s;
+
+    return 0;
+
+err_delete_qcow:
+    bdrv_delete(s->qcow);
+    s->qcow = NULL;
+err_free_name:
+    free(s->qcow_filename);
+    s->qcow_filename = NULL;
+    free(s->used_clusters);
+    s->used_clusters = NULL;
+    return -1;
+}
+
+/*
+ * Close callback of the vvfat driver: closes the host file that is
+ * currently open and releases the FAT, directory and mapping arrays as well
+ * as the cluster buffer.
+ */
+static void vvfat_close(BlockDriverState *bs)
+{
+    BDRVVVFATState *state = bs->opaque;
+
+    vvfat_close_current_file(state);
+
+    array_free(&state->fat);
+    array_free(&state->directory);
+    array_free(&state->mapping);
+
+    /* free() accepts NULL, so no guard is needed */
+    free(state->cluster_buffer);
+}
+/*
+ * Driver description of the virtual FAT block driver: format name "vvfat",
+ * protocol name "fat", per-instance state of type BDRVVVFATState, and the
+ * open/read/write/close/is_allocated callbacks defined in this file.
+ */
+static BlockDriver bdrv_vvfat = {
+ .format_name = "vvfat",
+ .instance_size = sizeof(BDRVVVFATState),
+ .bdrv_open = vvfat_open,
+ .bdrv_read = vvfat_read,
+ .bdrv_write = vvfat_write,
+ .bdrv_close = vvfat_close,
+ .bdrv_is_allocated = vvfat_is_allocated,
+ .protocol_name = "fat",
+};
+/*
+ * Registers the vvfat driver with the block layer. The function is handed
+ * to block_init() below; presumably that macro arranges for it to run
+ * during block-layer start-up -- confirm against its definition.
+ */
+static void bdrv_vvfat_init(void)
+{
+ bdrv_register(&bdrv_vvfat);
+}
+
+block_init(bdrv_vvfat_init);
+
+#ifdef DEBUG
+static void checkpoint(void) { /*
+ * Debug-only sanity check (compiled under DEBUG) run on the instance
+ * reachable through the file-level pointer vvv: mapping 0 must end at
+ * cluster 2, check1() and check2() must pass, and a current mapping
+ * requires either an open current_fd or a directory mapping.
+ */
+ assert(((mapping_t*)array_get(&(vvv->mapping), 0))->end == 2);
+ check1(vvv);
+ check2(vvv);
+ assert(!vvv->current_mapping || vvv->current_fd || (vvv->current_mapping->mode & MODE_DIRECTORY));
+#if 0
+ if (((direntry_t*)vvv->directory.pointer)[1].attributes != 0xf)
+ fprintf(stderr, "Nonono!\n");
+ mapping_t* mapping;
+ direntry_t* direntry;
+ assert(vvv->mapping.size >= vvv->mapping.item_size * vvv->mapping.next);
+ assert(vvv->directory.size >= vvv->directory.item_size * vvv->directory.next);
+ if (vvv->mapping.next<47)
+ return;
+ assert((mapping = array_get(&(vvv->mapping), 47)));
+ assert(mapping->dir_index < vvv->directory.next);
+ direntry = array_get(&(vvv->directory), mapping->dir_index);
+ assert(!memcmp(direntry->name, "USB H ", 11) || direntry->name[0]==0);
+#endif
+ return;
+ /* avoid compiler warnings: */
+ hexdump(NULL, 100); /* never reached: these calls sit after the return above */
+ remove_mapping(vvv, NULL);
+ print_mapping(NULL);
+ print_direntry(NULL);
+}
+#endif