diff options
author | Anthony Liguori <aliguori@us.ibm.com> | 2009-05-09 17:14:19 -0500 |
---|---|---|
committer | Anthony Liguori <aliguori@us.ibm.com> | 2009-05-14 16:13:46 -0500 |
commit | 019d6b8ff0d495ded6977f24a4e8fd1c7fec09e0 (patch) | |
tree | ffaf507f7440b5c7d8ed8a4de193b1df41e4a2d8 /block | |
parent | 5efa9d5a8b18841c9c62208a494d7f519238979a (diff) |
Move block drivers into their own directory
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
Diffstat (limited to 'block')
-rw-r--r-- | block/bochs.c | 259 | ||||
-rw-r--r-- | block/cloop.c | 171 | ||||
-rw-r--r-- | block/cow.c | 275 | ||||
-rw-r--r-- | block/dmg.c | 301 | ||||
-rw-r--r-- | block/nbd.c | 196 | ||||
-rw-r--r-- | block/parallels.c | 181 | ||||
-rw-r--r-- | block/qcow.c | 945 | ||||
-rw-r--r-- | block/qcow2.c | 2931 | ||||
-rw-r--r-- | block/raw-posix.c | 1438 | ||||
-rw-r--r-- | block/raw-win32.c | 394 | ||||
-rw-r--r-- | block/vmdk.c | 833 | ||||
-rw-r--r-- | block/vpc.c | 606 | ||||
-rw-r--r-- | block/vvfat.c | 2855 |
13 files changed, 11385 insertions, 0 deletions
diff --git a/block/bochs.c b/block/bochs.c new file mode 100644 index 0000000000..bac81c42b7 --- /dev/null +++ b/block/bochs.c @@ -0,0 +1,259 @@ +/* + * Block driver for the various disk image formats used by Bochs + * Currently only for "growing" type in read-only mode + * + * Copyright (c) 2005 Alex Beregszaszi + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block_int.h" +#include "module.h" + +/**************************************************************/ + +#define HEADER_MAGIC "Bochs Virtual HD Image" +#define HEADER_VERSION 0x00020000 +#define HEADER_V1 0x00010000 +#define HEADER_SIZE 512 + +#define REDOLOG_TYPE "Redolog" +#define GROWING_TYPE "Growing" + +// not allocated: 0xffffffff + +// always little-endian +struct bochs_header_v1 { + char magic[32]; // "Bochs Virtual HD Image" + char type[16]; // "Redolog" + char subtype[16]; // "Undoable" / "Volatile" / "Growing" + uint32_t version; + uint32_t header; // size of header + + union { + struct { + uint32_t catalog; // num of entries + uint32_t bitmap; // bitmap size + uint32_t extent; // extent size + uint64_t disk; // disk size + char padding[HEADER_SIZE - 64 - 8 - 20]; + } redolog; + char padding[HEADER_SIZE - 64 - 8]; + } extra; +}; + +// always little-endian +struct bochs_header { + char magic[32]; // "Bochs Virtual HD Image" + char type[16]; // "Redolog" + char subtype[16]; // "Undoable" / "Volatile" / "Growing" + uint32_t version; + uint32_t header; // size of header + + union { + struct { + uint32_t catalog; // num of entries + uint32_t bitmap; // bitmap size + uint32_t extent; // extent size + uint32_t reserved; // for ??? + uint64_t disk; // disk size + char padding[HEADER_SIZE - 64 - 8 - 24]; + } redolog; + char padding[HEADER_SIZE - 64 - 8]; + } extra; +}; + +typedef struct BDRVBochsState { + int fd; + + uint32_t *catalog_bitmap; + int catalog_size; + + int data_offset; + + int bitmap_blocks; + int extent_blocks; + int extent_size; +} BDRVBochsState; + +static int bochs_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + const struct bochs_header *bochs = (const void *)buf; + + if (buf_size < HEADER_SIZE) + return 0; + + if (!strcmp(bochs->magic, HEADER_MAGIC) && + !strcmp(bochs->type, REDOLOG_TYPE) && + !strcmp(bochs->subtype, GROWING_TYPE) && + ((le32_to_cpu(bochs->version) == HEADER_VERSION) || + (le32_to_cpu(bochs->version) == HEADER_V1))) + return 100; + + return 0; +} + +static int bochs_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVBochsState *s = bs->opaque; + int fd, i; + struct bochs_header bochs; + struct bochs_header_v1 header_v1; + + fd = open(filename, O_RDWR | O_BINARY); + if (fd < 0) { + fd = open(filename, O_RDONLY | O_BINARY); + if (fd < 0) + return -1; + } + + bs->read_only = 1; // no write support yet + + s->fd = fd; + + if (read(fd, &bochs, sizeof(bochs)) != sizeof(bochs)) { + goto fail; + } + + if (strcmp(bochs.magic, HEADER_MAGIC) || + strcmp(bochs.type, REDOLOG_TYPE) || + strcmp(bochs.subtype, GROWING_TYPE) || + ((le32_to_cpu(bochs.version) != HEADER_VERSION) && + (le32_to_cpu(bochs.version) != HEADER_V1))) { + goto fail; + } + + if (le32_to_cpu(bochs.version) == HEADER_V1) { + memcpy(&header_v1, &bochs, sizeof(bochs)); + bs->total_sectors = le64_to_cpu(header_v1.extra.redolog.disk) / 512; + } else { + bs->total_sectors = le64_to_cpu(bochs.extra.redolog.disk) / 512; + } + + lseek(s->fd, le32_to_cpu(bochs.header), SEEK_SET); + + s->catalog_size = le32_to_cpu(bochs.extra.redolog.catalog); + s->catalog_bitmap = qemu_malloc(s->catalog_size * 4); + if (read(s->fd, s->catalog_bitmap, s->catalog_size * 4) != + s->catalog_size * 4) + goto fail; + for (i = 0; i < s->catalog_size; i++) + le32_to_cpus(&s->catalog_bitmap[i]); + + s->data_offset = le32_to_cpu(bochs.header) + (s->catalog_size * 4); + + s->bitmap_blocks = 1 + (le32_to_cpu(bochs.extra.redolog.bitmap) - 1) / 512; + s->extent_blocks = 1 + (le32_to_cpu(bochs.extra.redolog.extent) - 1) / 512; + + s->extent_size = le32_to_cpu(bochs.extra.redolog.extent); + + return 0; + fail: + close(fd); + return -1; +} + +static inline int seek_to_sector(BlockDriverState *bs, int64_t sector_num) +{ + BDRVBochsState *s = bs->opaque; + int64_t offset = sector_num * 512; + int64_t extent_index, extent_offset, bitmap_offset, block_offset; + char bitmap_entry; + + // seek to sector + extent_index = offset / s->extent_size; + extent_offset = (offset % s->extent_size) / 512; + + if (s->catalog_bitmap[extent_index] == 0xffffffff) + { +// fprintf(stderr, "page not allocated [%x - %x:%x]\n", +// sector_num, extent_index, extent_offset); + return -1; // not allocated + } + + bitmap_offset = s->data_offset + (512 * s->catalog_bitmap[extent_index] * + (s->extent_blocks + s->bitmap_blocks)); + block_offset = bitmap_offset + (512 * (s->bitmap_blocks + extent_offset)); + +// fprintf(stderr, "sect: %x [ext i: %x o: %x] -> %x bitmap: %x block: %x\n", +// sector_num, extent_index, extent_offset, +// le32_to_cpu(s->catalog_bitmap[extent_index]), +// bitmap_offset, block_offset); + + // read in bitmap for current extent + lseek(s->fd, bitmap_offset + (extent_offset / 8), SEEK_SET); + + read(s->fd, &bitmap_entry, 1); + + if (!((bitmap_entry >> (extent_offset % 8)) & 1)) + { +// fprintf(stderr, "sector (%x) in bitmap not allocated\n", +// sector_num); + return -1; // not allocated + } + + lseek(s->fd, block_offset, SEEK_SET); + + return 0; +} + +static int bochs_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVBochsState *s = bs->opaque; + int ret; + + while (nb_sectors > 0) { + if (!seek_to_sector(bs, sector_num)) + { + ret = read(s->fd, buf, 512); + if (ret != 512) + return -1; + } + else + memset(buf, 0, 512); + nb_sectors--; + sector_num++; + buf += 512; + } + return 0; +} + +static void bochs_close(BlockDriverState *bs) +{ + BDRVBochsState *s = bs->opaque; + qemu_free(s->catalog_bitmap); + close(s->fd); +} + +static BlockDriver bdrv_bochs = { + .format_name = "bochs", + .instance_size = sizeof(BDRVBochsState), + .bdrv_probe = bochs_probe, + .bdrv_open = bochs_open, + .bdrv_read = bochs_read, + .bdrv_close = bochs_close, +}; + +static void bdrv_bochs_init(void) +{ + bdrv_register(&bdrv_bochs); +} + +block_init(bdrv_bochs_init); diff --git a/block/cloop.c b/block/cloop.c new file mode 100644 index 0000000000..06c687e690 --- /dev/null +++ b/block/cloop.c @@ -0,0 +1,171 @@ +/* + * QEMU Block driver for CLOOP images + * + * Copyright (c) 2004 Johannes E. Schindelin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block_int.h" +#include "module.h" +#include <zlib.h> + +typedef struct BDRVCloopState { + int fd; + uint32_t block_size; + uint32_t n_blocks; + uint64_t* offsets; + uint32_t sectors_per_block; + uint32_t current_block; + uint8_t *compressed_block; + uint8_t *uncompressed_block; + z_stream zstream; +} BDRVCloopState; + +static int cloop_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + const char* magic_version_2_0="#!/bin/sh\n" + "#V2.0 Format\n" + "modprobe cloop file=$0 && mount -r -t iso9660 /dev/cloop $1\n"; + int length=strlen(magic_version_2_0); + if(length>buf_size) + length=buf_size; + if(!memcmp(magic_version_2_0,buf,length)) + return 2; + return 0; +} + +static int cloop_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVCloopState *s = bs->opaque; + uint32_t offsets_size,max_compressed_block_size=1,i; + + s->fd = open(filename, O_RDONLY | O_BINARY); + if (s->fd < 0) + return -errno; + bs->read_only = 1; + + /* read header */ + if(lseek(s->fd,128,SEEK_SET)<0) { +cloop_close: + close(s->fd); + return -1; + } + if(read(s->fd,&s->block_size,4)<4) + goto cloop_close; + s->block_size=be32_to_cpu(s->block_size); + if(read(s->fd,&s->n_blocks,4)<4) + goto cloop_close; + s->n_blocks=be32_to_cpu(s->n_blocks); + + /* read offsets */ + offsets_size=s->n_blocks*sizeof(uint64_t); + s->offsets=(uint64_t*)qemu_malloc(offsets_size); + if(read(s->fd,s->offsets,offsets_size)<offsets_size) + goto cloop_close; + for(i=0;i<s->n_blocks;i++) { + s->offsets[i]=be64_to_cpu(s->offsets[i]); + if(i>0) { + uint32_t size=s->offsets[i]-s->offsets[i-1]; + if(size>max_compressed_block_size) + max_compressed_block_size=size; + } + } + + /* initialize zlib engine */ + s->compressed_block = qemu_malloc(max_compressed_block_size+1); + s->uncompressed_block = qemu_malloc(s->block_size); + if(inflateInit(&s->zstream) != Z_OK) + goto cloop_close; + s->current_block=s->n_blocks; + + s->sectors_per_block = s->block_size/512; + bs->total_sectors = s->n_blocks*s->sectors_per_block; + return 0; +} + +static inline int cloop_read_block(BDRVCloopState *s,int block_num) +{ + if(s->current_block != block_num) { + int ret; + uint32_t bytes = s->offsets[block_num+1]-s->offsets[block_num]; + + lseek(s->fd, s->offsets[block_num], SEEK_SET); + ret = read(s->fd, s->compressed_block, bytes); + if (ret != bytes) + return -1; + + s->zstream.next_in = s->compressed_block; + s->zstream.avail_in = bytes; + s->zstream.next_out = s->uncompressed_block; + s->zstream.avail_out = s->block_size; + ret = inflateReset(&s->zstream); + if(ret != Z_OK) + return -1; + ret = inflate(&s->zstream, Z_FINISH); + if(ret != Z_STREAM_END || s->zstream.total_out != s->block_size) + return -1; + + s->current_block = block_num; + } + return 0; +} + +static int cloop_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVCloopState *s = bs->opaque; + int i; + + for(i=0;i<nb_sectors;i++) { + uint32_t sector_offset_in_block=((sector_num+i)%s->sectors_per_block), + block_num=(sector_num+i)/s->sectors_per_block; + if(cloop_read_block(s, block_num) != 0) + return -1; + memcpy(buf+i*512,s->uncompressed_block+sector_offset_in_block*512,512); + } + return 0; +} + +static void cloop_close(BlockDriverState *bs) +{ + BDRVCloopState *s = bs->opaque; + close(s->fd); + if(s->n_blocks>0) + free(s->offsets); + free(s->compressed_block); + free(s->uncompressed_block); + inflateEnd(&s->zstream); +} + +static BlockDriver bdrv_cloop = { + .format_name = "cloop", + .instance_size = sizeof(BDRVCloopState), + .bdrv_probe = cloop_probe, + .bdrv_open = cloop_open, + .bdrv_read = cloop_read, + .bdrv_close = cloop_close, +}; + +static void bdrv_cloop_init(void) +{ + bdrv_register(&bdrv_cloop); +} + +block_init(bdrv_cloop_init); diff --git a/block/cow.c b/block/cow.c new file mode 100644 index 0000000000..94b3549389 --- /dev/null +++ b/block/cow.c @@ -0,0 +1,275 @@ +/* + * Block driver for the COW format + * + * Copyright (c) 2004 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef _WIN32 +#include "qemu-common.h" +#include "block_int.h" +#include "module.h" +#include <sys/mman.h> + +/**************************************************************/ +/* COW block driver using file system holes */ + +/* user mode linux compatible COW file */ +#define COW_MAGIC 0x4f4f4f4d /* MOOO */ +#define COW_VERSION 2 + +struct cow_header_v2 { + uint32_t magic; + uint32_t version; + char backing_file[1024]; + int32_t mtime; + uint64_t size; + uint32_t sectorsize; +}; + +typedef struct BDRVCowState { + int fd; + uint8_t *cow_bitmap; /* if non NULL, COW mappings are used first */ + uint8_t *cow_bitmap_addr; /* mmap address of cow_bitmap */ + int cow_bitmap_size; + int64_t cow_sectors_offset; +} BDRVCowState; + +static int cow_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + const struct cow_header_v2 *cow_header = (const void *)buf; + + if (buf_size >= sizeof(struct cow_header_v2) && + be32_to_cpu(cow_header->magic) == COW_MAGIC && + be32_to_cpu(cow_header->version) == COW_VERSION) + return 100; + else + return 0; +} + +static int cow_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVCowState *s = bs->opaque; + int fd; + struct cow_header_v2 cow_header; + int64_t size; + + fd = open(filename, O_RDWR | O_BINARY | O_LARGEFILE); + if (fd < 0) { + fd = open(filename, O_RDONLY | O_BINARY | O_LARGEFILE); + if (fd < 0) + return -1; + } + s->fd = fd; + /* see if it is a cow image */ + if (read(fd, &cow_header, sizeof(cow_header)) != sizeof(cow_header)) { + goto fail; + } + + if (be32_to_cpu(cow_header.magic) != COW_MAGIC || + be32_to_cpu(cow_header.version) != COW_VERSION) { + goto fail; + } + + /* cow image found */ + size = be64_to_cpu(cow_header.size); + bs->total_sectors = size / 512; + + pstrcpy(bs->backing_file, sizeof(bs->backing_file), + cow_header.backing_file); + + /* mmap the bitmap */ + s->cow_bitmap_size = ((bs->total_sectors + 7) >> 3) + sizeof(cow_header); + s->cow_bitmap_addr = (void *)mmap(get_mmap_addr(s->cow_bitmap_size), + s->cow_bitmap_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, s->fd, 0); + if (s->cow_bitmap_addr == MAP_FAILED) + goto fail; + s->cow_bitmap = s->cow_bitmap_addr + sizeof(cow_header); + s->cow_sectors_offset = (s->cow_bitmap_size + 511) & ~511; + return 0; + fail: + close(fd); + return -1; +} + +static inline void cow_set_bit(uint8_t *bitmap, int64_t bitnum) +{ + bitmap[bitnum / 8] |= (1 << (bitnum%8)); +} + +static inline int is_bit_set(const uint8_t *bitmap, int64_t bitnum) +{ + return !!(bitmap[bitnum / 8] & (1 << (bitnum%8))); +} + + +/* Return true if first block has been changed (ie. current version is + * in COW file). Set the number of continuous blocks for which that + * is true. */ +static inline int is_changed(uint8_t *bitmap, + int64_t sector_num, int nb_sectors, + int *num_same) +{ + int changed; + + if (!bitmap || nb_sectors == 0) { + *num_same = nb_sectors; + return 0; + } + + changed = is_bit_set(bitmap, sector_num); + for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) { + if (is_bit_set(bitmap, sector_num + *num_same) != changed) + break; + } + + return changed; +} + +static int cow_is_allocated(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) +{ + BDRVCowState *s = bs->opaque; + return is_changed(s->cow_bitmap, sector_num, nb_sectors, pnum); +} + +static int cow_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVCowState *s = bs->opaque; + int ret, n; + + while (nb_sectors > 0) { + if (is_changed(s->cow_bitmap, sector_num, nb_sectors, &n)) { + lseek(s->fd, s->cow_sectors_offset + sector_num * 512, SEEK_SET); + ret = read(s->fd, buf, n * 512); + if (ret != n * 512) + return -1; + } else { + if (bs->backing_hd) { + /* read from the base image */ + ret = bdrv_read(bs->backing_hd, sector_num, buf, n); + if (ret < 0) + return -1; + } else { + memset(buf, 0, n * 512); + } + } + nb_sectors -= n; + sector_num += n; + buf += n * 512; + } + return 0; +} + +static int cow_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVCowState *s = bs->opaque; + int ret, i; + + lseek(s->fd, s->cow_sectors_offset + sector_num * 512, SEEK_SET); + ret = write(s->fd, buf, nb_sectors * 512); + if (ret != nb_sectors * 512) + return -1; + for (i = 0; i < nb_sectors; i++) + cow_set_bit(s->cow_bitmap, sector_num + i); + return 0; +} + +static void cow_close(BlockDriverState *bs) +{ + BDRVCowState *s = bs->opaque; + munmap((void *)s->cow_bitmap_addr, s->cow_bitmap_size); + close(s->fd); +} + +static int cow_create(const char *filename, int64_t image_sectors, + const char *image_filename, int flags) +{ + int fd, cow_fd; + struct cow_header_v2 cow_header; + struct stat st; + + if (flags) + return -ENOTSUP; + + cow_fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, + 0644); + if (cow_fd < 0) + return -1; + memset(&cow_header, 0, sizeof(cow_header)); + cow_header.magic = cpu_to_be32(COW_MAGIC); + cow_header.version = cpu_to_be32(COW_VERSION); + if (image_filename) { + /* Note: if no file, we put a dummy mtime */ + cow_header.mtime = cpu_to_be32(0); + + fd = open(image_filename, O_RDONLY | O_BINARY); + if (fd < 0) { + close(cow_fd); + goto mtime_fail; + } + if (fstat(fd, &st) != 0) { + close(fd); + goto mtime_fail; + } + close(fd); + cow_header.mtime = cpu_to_be32(st.st_mtime); + mtime_fail: + pstrcpy(cow_header.backing_file, sizeof(cow_header.backing_file), + image_filename); + } + cow_header.sectorsize = cpu_to_be32(512); + cow_header.size = cpu_to_be64(image_sectors * 512); + write(cow_fd, &cow_header, sizeof(cow_header)); + /* resize to include at least all the bitmap */ + ftruncate(cow_fd, sizeof(cow_header) + ((image_sectors + 7) >> 3)); + close(cow_fd); + return 0; +} + +static void cow_flush(BlockDriverState *bs) +{ + BDRVCowState *s = bs->opaque; + fsync(s->fd); +} + +static BlockDriver bdrv_cow = { + .format_name = "cow", + .instance_size = sizeof(BDRVCowState), + .bdrv_probe = cow_probe, + .bdrv_open = cow_open, + .bdrv_read = cow_read, + .bdrv_write = cow_write, + .bdrv_close = cow_close, + .bdrv_create = cow_create, + .bdrv_flush = cow_flush, + .bdrv_is_allocated = cow_is_allocated, +}; + +static void bdrv_cow_init(void) +{ + bdrv_register(&bdrv_cow); +} + +block_init(bdrv_cow_init); +#endif diff --git a/block/dmg.c b/block/dmg.c new file mode 100644 index 0000000000..262560ffd3 --- /dev/null +++ b/block/dmg.c @@ -0,0 +1,301 @@ +/* + * QEMU Block driver for DMG images + * + * Copyright (c) 2004 Johannes E. Schindelin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block_int.h" +#include "bswap.h" +#include "module.h" +#include <zlib.h> + +typedef struct BDRVDMGState { + int fd; + + /* each chunk contains a certain number of sectors, + * offsets[i] is the offset in the .dmg file, + * lengths[i] is the length of the compressed chunk, + * sectors[i] is the sector beginning at offsets[i], + * sectorcounts[i] is the number of sectors in that chunk, + * the sectors array is ordered + * 0<=i<n_chunks */ + + uint32_t n_chunks; + uint32_t* types; + uint64_t* offsets; + uint64_t* lengths; + uint64_t* sectors; + uint64_t* sectorcounts; + uint32_t current_chunk; + uint8_t *compressed_chunk; + uint8_t *uncompressed_chunk; + z_stream zstream; +} BDRVDMGState; + +static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + int len=strlen(filename); + if(len>4 && !strcmp(filename+len-4,".dmg")) + return 2; + return 0; +} + +static off_t read_off(int fd) +{ + uint64_t buffer; + if(read(fd,&buffer,8)<8) + return 0; + return be64_to_cpu(buffer); +} + +static off_t read_uint32(int fd) +{ + uint32_t buffer; + if(read(fd,&buffer,4)<4) + return 0; + return be32_to_cpu(buffer); +} + +static int dmg_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVDMGState *s = bs->opaque; + off_t info_begin,info_end,last_in_offset,last_out_offset; + uint32_t count; + uint32_t max_compressed_size=1,max_sectors_per_chunk=1,i; + + s->fd = open(filename, O_RDONLY | O_BINARY); + if (s->fd < 0) + return -errno; + bs->read_only = 1; + s->n_chunks = 0; + s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL; + + /* read offset of info blocks */ + if(lseek(s->fd,-0x1d8,SEEK_END)<0) { +dmg_close: + close(s->fd); + /* open raw instead */ + bs->drv=bdrv_find_format("raw"); + return bs->drv->bdrv_open(bs, filename, flags); + } + info_begin=read_off(s->fd); + if(info_begin==0) + goto dmg_close; + if(lseek(s->fd,info_begin,SEEK_SET)<0) + goto dmg_close; + if(read_uint32(s->fd)!=0x100) + goto dmg_close; + if((count = read_uint32(s->fd))==0) + goto dmg_close; + info_end = info_begin+count; + if(lseek(s->fd,0xf8,SEEK_CUR)<0) + goto dmg_close; + + /* read offsets */ + last_in_offset = last_out_offset = 0; + while(lseek(s->fd,0,SEEK_CUR)<info_end) { + uint32_t type; + + count = read_uint32(s->fd); + if(count==0) + goto dmg_close; + type = read_uint32(s->fd); + if(type!=0x6d697368 || count<244) + lseek(s->fd,count-4,SEEK_CUR); + else { + int new_size, chunk_count; + if(lseek(s->fd,200,SEEK_CUR)<0) + goto dmg_close; + chunk_count = (count-204)/40; + new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count); + s->types = qemu_realloc(s->types, new_size/2); + s->offsets = qemu_realloc(s->offsets, new_size); + s->lengths = qemu_realloc(s->lengths, new_size); + s->sectors = qemu_realloc(s->sectors, new_size); + s->sectorcounts = qemu_realloc(s->sectorcounts, new_size); + + for(i=s->n_chunks;i<s->n_chunks+chunk_count;i++) { + s->types[i] = read_uint32(s->fd); + if(s->types[i]!=0x80000005 && s->types[i]!=1 && s->types[i]!=2) { + if(s->types[i]==0xffffffff) { + last_in_offset = s->offsets[i-1]+s->lengths[i-1]; + last_out_offset = s->sectors[i-1]+s->sectorcounts[i-1]; + } + chunk_count--; + i--; + if(lseek(s->fd,36,SEEK_CUR)<0) + goto dmg_close; + continue; + } + read_uint32(s->fd); + s->sectors[i] = last_out_offset+read_off(s->fd); + s->sectorcounts[i] = read_off(s->fd); + s->offsets[i] = last_in_offset+read_off(s->fd); + s->lengths[i] = read_off(s->fd); + if(s->lengths[i]>max_compressed_size) + max_compressed_size = s->lengths[i]; + if(s->sectorcounts[i]>max_sectors_per_chunk) + max_sectors_per_chunk = s->sectorcounts[i]; + } + s->n_chunks+=chunk_count; + } + } + + /* initialize zlib engine */ + s->compressed_chunk = qemu_malloc(max_compressed_size+1); + s->uncompressed_chunk = qemu_malloc(512*max_sectors_per_chunk); + if(inflateInit(&s->zstream) != Z_OK) + goto dmg_close; + + s->current_chunk = s->n_chunks; + + return 0; +} + +static inline int is_sector_in_chunk(BDRVDMGState* s, + uint32_t chunk_num,int sector_num) +{ + if(chunk_num>=s->n_chunks || s->sectors[chunk_num]>sector_num || + s->sectors[chunk_num]+s->sectorcounts[chunk_num]<=sector_num) + return 0; + else + return -1; +} + +static inline uint32_t search_chunk(BDRVDMGState* s,int sector_num) +{ + /* binary search */ + uint32_t chunk1=0,chunk2=s->n_chunks,chunk3; + while(chunk1!=chunk2) { + chunk3 = (chunk1+chunk2)/2; + if(s->sectors[chunk3]>sector_num) + chunk2 = chunk3; + else if(s->sectors[chunk3]+s->sectorcounts[chunk3]>sector_num) + return chunk3; + else + chunk1 = chunk3; + } + return s->n_chunks; /* error */ +} + +static inline int dmg_read_chunk(BDRVDMGState *s,int sector_num) +{ + if(!is_sector_in_chunk(s,s->current_chunk,sector_num)) { + int ret; + uint32_t chunk = search_chunk(s,sector_num); + + if(chunk>=s->n_chunks) + return -1; + + s->current_chunk = s->n_chunks; + switch(s->types[chunk]) { + case 0x80000005: { /* zlib compressed */ + int i; + + ret = lseek(s->fd, s->offsets[chunk], SEEK_SET); + if(ret<0) + return -1; + + /* we need to buffer, because only the chunk as whole can be + * inflated. */ + i=0; + do { + ret = read(s->fd, s->compressed_chunk+i, s->lengths[chunk]-i); + if(ret<0 && errno==EINTR) + ret=0; + i+=ret; + } while(ret>=0 && ret+i<s->lengths[chunk]); + + if (ret != s->lengths[chunk]) + return -1; + + s->zstream.next_in = s->compressed_chunk; + s->zstream.avail_in = s->lengths[chunk]; + s->zstream.next_out = s->uncompressed_chunk; + s->zstream.avail_out = 512*s->sectorcounts[chunk]; + ret = inflateReset(&s->zstream); + if(ret != Z_OK) + return -1; + ret = inflate(&s->zstream, Z_FINISH); + if(ret != Z_STREAM_END || s->zstream.total_out != 512*s->sectorcounts[chunk]) + return -1; + break; } + case 1: /* copy */ + ret = read(s->fd, s->uncompressed_chunk, s->lengths[chunk]); + if (ret != s->lengths[chunk]) + return -1; + break; + case 2: /* zero */ + memset(s->uncompressed_chunk, 0, 512*s->sectorcounts[chunk]); + break; + } + s->current_chunk = chunk; + } + return 0; +} + +static int dmg_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVDMGState *s = bs->opaque; + int i; + + for(i=0;i<nb_sectors;i++) { + uint32_t sector_offset_in_chunk; + if(dmg_read_chunk(s, sector_num+i) != 0) + return -1; + sector_offset_in_chunk = sector_num+i-s->sectors[s->current_chunk]; + memcpy(buf+i*512,s->uncompressed_chunk+sector_offset_in_chunk*512,512); + } + return 0; +} + +static void dmg_close(BlockDriverState *bs) +{ + BDRVDMGState *s = bs->opaque; + close(s->fd); + if(s->n_chunks>0) { + free(s->types); + free(s->offsets); + free(s->lengths); + free(s->sectors); + free(s->sectorcounts); + } + free(s->compressed_chunk); + free(s->uncompressed_chunk); + inflateEnd(&s->zstream); +} + +static BlockDriver bdrv_dmg = { + .format_name = "dmg", + .instance_size = sizeof(BDRVDMGState), + .bdrv_probe = dmg_probe, + .bdrv_open = dmg_open, + .bdrv_read = dmg_read, + .bdrv_close = dmg_close, +}; + +static void bdrv_dmg_init(void) +{ + bdrv_register(&bdrv_dmg); +} + +block_init(bdrv_dmg_init); diff --git a/block/nbd.c b/block/nbd.c new file mode 100644 index 0000000000..47d4778999 --- /dev/null +++ b/block/nbd.c @@ -0,0 +1,196 @@ +/* + * QEMU Block driver for NBD + * + * Copyright (C) 2008 Bull S.A.S. + * Author: Laurent Vivier <Laurent.Vivier@bull.net> + * + * Some parts: + * Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu-common.h" +#include "nbd.h" +#include "module.h" + +#include <sys/types.h> +#include <unistd.h> + +typedef struct BDRVNBDState { + int sock; + off_t size; + size_t blocksize; +} BDRVNBDState; + +static int nbd_open(BlockDriverState *bs, const char* filename, int flags) +{ + BDRVNBDState *s = bs->opaque; + const char *host; + const char *unixpath; + int sock; + off_t size; + size_t blocksize; + int ret; + + if ((flags & BDRV_O_CREAT)) + return -EINVAL; + + if (!strstart(filename, "nbd:", &host)) + return -EINVAL; + + if (strstart(host, "unix:", &unixpath)) { + + if (unixpath[0] != '/') + return -EINVAL; + + sock = unix_socket_outgoing(unixpath); + + } else { + uint16_t port; + char *p, *r; + char hostname[128]; + + pstrcpy(hostname, 128, host); + + p = strchr(hostname, ':'); + if (p == NULL) + return -EINVAL; + + *p = '\0'; + p++; + + port = strtol(p, &r, 0); + if (r == p) + return -EINVAL; + sock = tcp_socket_outgoing(hostname, port); + } + + if (sock == -1) + return -errno; + + ret = nbd_receive_negotiate(sock, &size, &blocksize); + if (ret == -1) + return -errno; + + s->sock = sock; + s->size = size; + s->blocksize = blocksize; + + return 0; +} + +static int nbd_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVNBDState *s = bs->opaque; + struct nbd_request request; + struct nbd_reply reply; + + request.type = NBD_CMD_READ; + request.handle = (uint64_t)(intptr_t)bs; + request.from = sector_num * 512;; + request.len = nb_sectors * 512; + + if (nbd_send_request(s->sock, &request) == -1) + return -errno; + + if (nbd_receive_reply(s->sock, &reply) == -1) + return -errno; + + if (reply.error !=0) + return -reply.error; + + if (reply.handle != request.handle) + return -EIO; + + if (nbd_wr_sync(s->sock, buf, request.len, 1) != request.len) + return -EIO; + + return 0; +} + +static int nbd_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVNBDState *s = bs->opaque; + struct nbd_request request; + struct nbd_reply reply; + + request.type = NBD_CMD_WRITE; + request.handle = (uint64_t)(intptr_t)bs; + request.from = sector_num * 512;; + request.len = nb_sectors * 512; + + if (nbd_send_request(s->sock, &request) == -1) + return -errno; + + if (nbd_wr_sync(s->sock, (uint8_t*)buf, request.len, 0) != request.len) + return -EIO; + + if (nbd_receive_reply(s->sock, &reply) == -1) + return -errno; + + if (reply.error !=0) + return -reply.error; + + if (reply.handle != request.handle) + return -EIO; + + return 0; +} + +static void nbd_close(BlockDriverState *bs) +{ + BDRVNBDState *s = bs->opaque; + struct nbd_request request; + + request.type = NBD_CMD_DISC; + request.handle = (uint64_t)(intptr_t)bs; + request.from = 0; + request.len = 0; + nbd_send_request(s->sock, &request); + + close(s->sock); +} + +static int64_t nbd_getlength(BlockDriverState *bs) +{ + BDRVNBDState *s = bs->opaque; + + return s->size; +} + +static BlockDriver bdrv_nbd = { + .format_name = "nbd", + .instance_size = sizeof(BDRVNBDState), + .bdrv_open = nbd_open, + .bdrv_read = nbd_read, + .bdrv_write = nbd_write, + .bdrv_close = nbd_close, + .bdrv_getlength = nbd_getlength, + .protocol_name = "nbd", +}; + +static void bdrv_nbd_init(void) +{ + bdrv_register(&bdrv_nbd); +} + +block_init(bdrv_nbd_init); diff --git a/block/parallels.c b/block/parallels.c new file mode 100644 index 0000000000..0b64a5c625 --- /dev/null +++ b/block/parallels.c @@ -0,0 +1,181 @@ +/* + * Block driver for Parallels disk image format + * + * Copyright (c) 2007 Alex Beregszaszi + * + * This code is based on comparing different disk images created by Parallels. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block_int.h" +#include "module.h" + +/**************************************************************/ + +#define HEADER_MAGIC "WithoutFreeSpace" +#define HEADER_VERSION 2 +#define HEADER_SIZE 64 + +// always little-endian +struct parallels_header { + char magic[16]; // "WithoutFreeSpace" + uint32_t version; + uint32_t heads; + uint32_t cylinders; + uint32_t tracks; + uint32_t catalog_entries; + uint32_t nb_sectors; + char padding[24]; +} __attribute__((packed)); + +typedef struct BDRVParallelsState { + int fd; + + uint32_t *catalog_bitmap; + int catalog_size; + + int tracks; +} BDRVParallelsState; + +static int parallels_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + const struct parallels_header *ph = (const void *)buf; + + if (buf_size < HEADER_SIZE) + return 0; + + if (!memcmp(ph->magic, HEADER_MAGIC, 16) && + (le32_to_cpu(ph->version) == HEADER_VERSION)) + return 100; + + return 0; +} + +static int parallels_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVParallelsState *s = bs->opaque; + int fd, i; + struct parallels_header ph; + + fd = open(filename, O_RDWR | O_BINARY | O_LARGEFILE); + if (fd < 0) { + fd = open(filename, O_RDONLY | O_BINARY | O_LARGEFILE); + if (fd < 0) + return -1; + } + + bs->read_only = 1; // no write support yet + + s->fd = fd; + + if (read(fd, &ph, sizeof(ph)) != sizeof(ph)) + goto fail; + + if (memcmp(ph.magic, HEADER_MAGIC, 16) || + (le32_to_cpu(ph.version) != HEADER_VERSION)) { + goto fail; + } + + bs->total_sectors = le32_to_cpu(ph.nb_sectors); + + if (lseek(s->fd, 64, SEEK_SET) != 64) + goto fail; + + s->tracks = le32_to_cpu(ph.tracks); + + s->catalog_size = le32_to_cpu(ph.catalog_entries); + s->catalog_bitmap = qemu_malloc(s->catalog_size * 4); + if (read(s->fd, s->catalog_bitmap, s->catalog_size * 4) != + s->catalog_size * 4) + goto fail; + for (i = 0; i < s->catalog_size; i++) + le32_to_cpus(&s->catalog_bitmap[i]); + + return 0; +fail: + if (s->catalog_bitmap) + qemu_free(s->catalog_bitmap); + close(fd); + return -1; +} + +static inline int seek_to_sector(BlockDriverState *bs, int64_t sector_num) +{ + BDRVParallelsState *s = bs->opaque; + uint32_t index, offset, position; + + index = sector_num / s->tracks; + offset = sector_num % s->tracks; + + // not allocated + if ((index > s->catalog_size) || (s->catalog_bitmap[index] == 0)) + return -1; + + position = (s->catalog_bitmap[index] + offset) * 512; + +// fprintf(stderr, "sector: %llx index=%x offset=%x pointer=%x position=%x\n", +// sector_num, index, offset, s->catalog_bitmap[index], position); + + if (lseek(s->fd, position, SEEK_SET) != position) + return -1; + + return 0; +} + +static int parallels_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVParallelsState *s = bs->opaque; + + while (nb_sectors > 0) { + if (!seek_to_sector(bs, sector_num)) { + if (read(s->fd, buf, 512) != 512) + return -1; + } else + memset(buf, 0, 512); + nb_sectors--; + sector_num++; + buf += 512; + } + return 0; +} + +static void parallels_close(BlockDriverState *bs) +{ + BDRVParallelsState *s = bs->opaque; + qemu_free(s->catalog_bitmap); + close(s->fd); +} + +static BlockDriver bdrv_parallels = { + .format_name = "parallels", + .instance_size = sizeof(BDRVParallelsState), + .bdrv_probe = parallels_probe, + .bdrv_open = parallels_open, + .bdrv_read = parallels_read, + .bdrv_close = parallels_close, +}; + +static void bdrv_parallels_init(void) +{ + bdrv_register(&bdrv_parallels); +} + +block_init(bdrv_parallels_init); diff --git a/block/qcow.c b/block/qcow.c new file mode 100644 index 0000000000..1cf7c3be77 --- /dev/null +++ b/block/qcow.c @@ -0,0 +1,945 @@ +/* + * Block driver for the QCOW format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block_int.h" +#include "module.h" +#include <zlib.h> +#include "aes.h" + +/**************************************************************/ +/* QEMU COW block driver with compression and encryption support */ + +#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) +#define QCOW_VERSION 1 + +#define QCOW_CRYPT_NONE 0 +#define QCOW_CRYPT_AES 1 + +#define QCOW_OFLAG_COMPRESSED (1LL << 63) + +typedef struct QCowHeader { + uint32_t magic; + uint32_t version; + uint64_t backing_file_offset; + uint32_t backing_file_size; + uint32_t mtime; + uint64_t size; /* in bytes */ + uint8_t cluster_bits; + uint8_t l2_bits; + uint32_t crypt_method; + uint64_t l1_table_offset; +} QCowHeader; + +#define L2_CACHE_SIZE 16 + +typedef struct BDRVQcowState { + BlockDriverState *hd; + int cluster_bits; + int cluster_size; + int cluster_sectors; + int l2_bits; + int l2_size; + int l1_size; + uint64_t cluster_offset_mask; + uint64_t l1_table_offset; + uint64_t *l1_table; + uint64_t *l2_cache; + uint64_t l2_cache_offsets[L2_CACHE_SIZE]; + uint32_t l2_cache_counts[L2_CACHE_SIZE]; + uint8_t *cluster_cache; + uint8_t *cluster_data; + uint64_t cluster_cache_offset; + uint32_t crypt_method; /* current crypt method, 0 if no key yet */ + uint32_t crypt_method_header; + AES_KEY aes_encrypt_key; + AES_KEY aes_decrypt_key; +} BDRVQcowState; + +static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset); + +static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + const QCowHeader *cow_header = (const void *)buf; + + if (buf_size >= sizeof(QCowHeader) && + be32_to_cpu(cow_header->magic) == QCOW_MAGIC && + be32_to_cpu(cow_header->version) == QCOW_VERSION) + return 100; + else + return 0; +} + +static int qcow_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVQcowState *s = bs->opaque; + int len, i, shift, ret; + QCowHeader header; + + ret = bdrv_file_open(&s->hd, filename, flags); + if (ret < 0) + return ret; + if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header)) + goto fail; + be32_to_cpus(&header.magic); + be32_to_cpus(&header.version); + be64_to_cpus(&header.backing_file_offset); + be32_to_cpus(&header.backing_file_size); + be32_to_cpus(&header.mtime); + be64_to_cpus(&header.size); + be32_to_cpus(&header.crypt_method); + be64_to_cpus(&header.l1_table_offset); + + if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION) + goto fail; + if (header.size <= 1 || header.cluster_bits < 9) + goto fail; + if (header.crypt_method > QCOW_CRYPT_AES) + goto fail; + s->crypt_method_header = header.crypt_method; + if (s->crypt_method_header) + bs->encrypted = 1; + s->cluster_bits = header.cluster_bits; + s->cluster_size = 1 << s->cluster_bits; + s->cluster_sectors = 1 << (s->cluster_bits - 9); + s->l2_bits = header.l2_bits; + s->l2_size = 1 << s->l2_bits; + bs->total_sectors = header.size / 512; + s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1; + + /* read the level 1 table */ + shift = s->cluster_bits + s->l2_bits; + s->l1_size = (header.size + (1LL << shift) - 1) >> shift; + + s->l1_table_offset = header.l1_table_offset; + s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t)); + if (!s->l1_table) + goto fail; + if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) != + s->l1_size * sizeof(uint64_t)) + goto fail; + for(i = 0;i < s->l1_size; i++) { + be64_to_cpus(&s->l1_table[i]); + } + /* alloc L2 cache */ + s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); + if (!s->l2_cache) + goto fail; + s->cluster_cache = qemu_malloc(s->cluster_size); + if (!s->cluster_cache) + goto fail; + s->cluster_data = qemu_malloc(s->cluster_size); + if (!s->cluster_data) + goto fail; + s->cluster_cache_offset = -1; + + /* read the backing file name */ + if (header.backing_file_offset != 0) { + len = header.backing_file_size; + if (len > 1023) + len = 1023; + if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len) + goto fail; + bs->backing_file[len] = '\0'; + } + return 0; + + fail: + qemu_free(s->l1_table); + qemu_free(s->l2_cache); + qemu_free(s->cluster_cache); + qemu_free(s->cluster_data); + bdrv_delete(s->hd); + return -1; +} + +static int qcow_set_key(BlockDriverState *bs, const char *key) +{ + BDRVQcowState *s = bs->opaque; + uint8_t keybuf[16]; + int len, i; + + memset(keybuf, 0, 16); + len = strlen(key); + if (len > 16) + len = 16; + /* XXX: we could compress the chars to 7 bits to increase + entropy */ + for(i = 0;i < len;i++) { + keybuf[i] = key[i]; + } + s->crypt_method = s->crypt_method_header; + + if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) + return -1; + if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) + return -1; +#if 0 + /* test */ + { + uint8_t in[16]; + uint8_t out[16]; + uint8_t tmp[16]; + for(i=0;i<16;i++) + in[i] = i; + AES_encrypt(in, tmp, &s->aes_encrypt_key); + AES_decrypt(tmp, out, &s->aes_decrypt_key); + for(i = 0; i < 16; i++) + printf(" %02x", tmp[i]); + printf("\n"); + for(i = 0; i < 16; i++) + printf(" %02x", out[i]); + printf("\n"); + } +#endif + return 0; +} + +/* The crypt function is compatible with the linux cryptoloop + algorithm for < 4 GB images. NOTE: out_buf == in_buf is + supported */ +static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num, + uint8_t *out_buf, const uint8_t *in_buf, + int nb_sectors, int enc, + const AES_KEY *key) +{ + union { + uint64_t ll[2]; + uint8_t b[16]; + } ivec; + int i; + + for(i = 0; i < nb_sectors; i++) { + ivec.ll[0] = cpu_to_le64(sector_num); + ivec.ll[1] = 0; + AES_cbc_encrypt(in_buf, out_buf, 512, key, + ivec.b, enc); + sector_num++; + in_buf += 512; + out_buf += 512; + } +} + +/* 'allocate' is: + * + * 0 to not allocate. + * + * 1 to allocate a normal cluster (for sector indexes 'n_start' to + * 'n_end') + * + * 2 to allocate a compressed cluster of size + * 'compressed_size'. 'compressed_size' must be > 0 and < + * cluster_size + * + * return 0 if not allocated. + */ +static uint64_t get_cluster_offset(BlockDriverState *bs, + uint64_t offset, int allocate, + int compressed_size, + int n_start, int n_end) +{ + BDRVQcowState *s = bs->opaque; + int min_index, i, j, l1_index, l2_index; + uint64_t l2_offset, *l2_table, cluster_offset, tmp; + uint32_t min_count; + int new_l2_table; + + l1_index = offset >> (s->l2_bits + s->cluster_bits); + l2_offset = s->l1_table[l1_index]; + new_l2_table = 0; + if (!l2_offset) { + if (!allocate) + return 0; + /* allocate a new l2 entry */ + l2_offset = bdrv_getlength(s->hd); + /* round to cluster size */ + l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1); + /* update the L1 entry */ + s->l1_table[l1_index] = l2_offset; + tmp = cpu_to_be64(l2_offset); + if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp), + &tmp, sizeof(tmp)) != sizeof(tmp)) + return 0; + new_l2_table = 1; + } + for(i = 0; i < L2_CACHE_SIZE; i++) { + if (l2_offset == s->l2_cache_offsets[i]) { + /* increment the hit count */ + if (++s->l2_cache_counts[i] == 0xffffffff) { + for(j = 0; j < L2_CACHE_SIZE; j++) { + s->l2_cache_counts[j] >>= 1; + } + } + l2_table = s->l2_cache + (i << s->l2_bits); + goto found; + } + } + /* not found: load a new entry in the least used one */ + min_index = 0; + min_count = 0xffffffff; + for(i = 0; i < L2_CACHE_SIZE; i++) { + if (s->l2_cache_counts[i] < min_count) { + min_count = s->l2_cache_counts[i]; + min_index = i; + } + } + l2_table = s->l2_cache + (min_index << s->l2_bits); + if (new_l2_table) { + memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); + if (bdrv_pwrite(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) != + s->l2_size * sizeof(uint64_t)) + return 0; + } else { + if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) != + s->l2_size * sizeof(uint64_t)) + return 0; + } + s->l2_cache_offsets[min_index] = l2_offset; + s->l2_cache_counts[min_index] = 1; + found: + l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); + cluster_offset = be64_to_cpu(l2_table[l2_index]); + if (!cluster_offset || + ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) { + if (!allocate) + return 0; + /* allocate a new cluster */ + if ((cluster_offset & QCOW_OFLAG_COMPRESSED) && + (n_end - n_start) < s->cluster_sectors) { + /* if the cluster is already compressed, we must + decompress it in the case it is not completely + overwritten */ + if (decompress_cluster(s, cluster_offset) < 0) + return 0; + cluster_offset = bdrv_getlength(s->hd); + cluster_offset = (cluster_offset + s->cluster_size - 1) & + ~(s->cluster_size - 1); + /* write the cluster content */ + if (bdrv_pwrite(s->hd, cluster_offset, s->cluster_cache, s->cluster_size) != + s->cluster_size) + return -1; + } else { + cluster_offset = bdrv_getlength(s->hd); + if (allocate == 1) { + /* round to cluster size */ + cluster_offset = (cluster_offset + s->cluster_size - 1) & + ~(s->cluster_size - 1); + bdrv_truncate(s->hd, cluster_offset + s->cluster_size); + /* if encrypted, we must initialize the cluster + content which won't be written */ + if (s->crypt_method && + (n_end - n_start) < s->cluster_sectors) { + uint64_t start_sect; + start_sect = (offset & ~(s->cluster_size - 1)) >> 9; + memset(s->cluster_data + 512, 0x00, 512); + for(i = 0; i < s->cluster_sectors; i++) { + if (i < n_start || i >= n_end) { + encrypt_sectors(s, start_sect + i, + s->cluster_data, + s->cluster_data + 512, 1, 1, + &s->aes_encrypt_key); + if (bdrv_pwrite(s->hd, cluster_offset + i * 512, + s->cluster_data, 512) != 512) + return -1; + } + } + } + } else if (allocate == 2) { + cluster_offset |= QCOW_OFLAG_COMPRESSED | + (uint64_t)compressed_size << (63 - s->cluster_bits); + } + } + /* update L2 table */ + tmp = cpu_to_be64(cluster_offset); + l2_table[l2_index] = tmp; + if (bdrv_pwrite(s->hd, + l2_offset + l2_index * sizeof(tmp), &tmp, sizeof(tmp)) != sizeof(tmp)) + return 0; + } + return cluster_offset; +} + +static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) +{ + BDRVQcowState *s = bs->opaque; + int index_in_cluster, n; + uint64_t cluster_offset; + + cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0); + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) + n = nb_sectors; + *pnum = n; + return (cluster_offset != 0); +} + +static int decompress_buffer(uint8_t *out_buf, int out_buf_size, + const uint8_t *buf, int buf_size) +{ + z_stream strm1, *strm = &strm1; + int ret, out_len; + + memset(strm, 0, sizeof(*strm)); + + strm->next_in = (uint8_t *)buf; + strm->avail_in = buf_size; + strm->next_out = out_buf; + strm->avail_out = out_buf_size; + + ret = inflateInit2(strm, -12); + if (ret != Z_OK) + return -1; + ret = inflate(strm, Z_FINISH); + out_len = strm->next_out - out_buf; + if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || + out_len != out_buf_size) { + inflateEnd(strm); + return -1; + } + inflateEnd(strm); + return 0; +} + +static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset) +{ + int ret, csize; + uint64_t coffset; + + coffset = cluster_offset & s->cluster_offset_mask; + if (s->cluster_cache_offset != coffset) { + csize = cluster_offset >> (63 - s->cluster_bits); + csize &= (s->cluster_size - 1); + ret = bdrv_pread(s->hd, coffset, s->cluster_data, csize); + if (ret != csize) + return -1; + if (decompress_buffer(s->cluster_cache, s->cluster_size, + s->cluster_data, csize) < 0) { + return -1; + } + s->cluster_cache_offset = coffset; + } + return 0; +} + +#if 0 + +static int qcow_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVQcowState *s = bs->opaque; + int ret, index_in_cluster, n; + uint64_t cluster_offset; + + while (nb_sectors > 0) { + cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0); + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) + n = nb_sectors; + if (!cluster_offset) { + if (bs->backing_hd) { + /* read from the base image */ + ret = bdrv_read(bs->backing_hd, sector_num, buf, n); + if (ret < 0) + return -1; + } else { + memset(buf, 0, 512 * n); + } + } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) { + if (decompress_cluster(s, cluster_offset) < 0) + return -1; + memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n); + } else { + ret = bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512); + if (ret != n * 512) + return -1; + if (s->crypt_method) { + encrypt_sectors(s, sector_num, buf, buf, n, 0, + &s->aes_decrypt_key); + } + } + nb_sectors -= n; + sector_num += n; + buf += n * 512; + } + return 0; +} +#endif + +static int qcow_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVQcowState *s = bs->opaque; + int ret, index_in_cluster, n; + uint64_t cluster_offset; + + while (nb_sectors > 0) { + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) + n = nb_sectors; + cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0, + index_in_cluster, + index_in_cluster + n); + if (!cluster_offset) + return -1; + if (s->crypt_method) { + encrypt_sectors(s, sector_num, s->cluster_data, buf, n, 1, + &s->aes_encrypt_key); + ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, + s->cluster_data, n * 512); + } else { + ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512); + } + if (ret != n * 512) + return -1; + nb_sectors -= n; + sector_num += n; + buf += n * 512; + } + s->cluster_cache_offset = -1; /* disable compressed cache */ + return 0; +} + +typedef struct QCowAIOCB { + BlockDriverAIOCB common; + int64_t sector_num; + QEMUIOVector *qiov; + uint8_t *buf; + void *orig_buf; + int nb_sectors; + int n; + uint64_t cluster_offset; + uint8_t *cluster_data; + struct iovec hd_iov; + QEMUIOVector hd_qiov; + BlockDriverAIOCB *hd_aiocb; +} QCowAIOCB; + +static void qcow_aio_read_cb(void *opaque, int ret) +{ + QCowAIOCB *acb = opaque; + BlockDriverState *bs = acb->common.bs; + BDRVQcowState *s = bs->opaque; + int index_in_cluster; + + acb->hd_aiocb = NULL; + if (ret < 0) + goto done; + + redo: + /* post process the read buffer */ + if (!acb->cluster_offset) { + /* nothing to do */ + } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) { + /* nothing to do */ + } else { + if (s->crypt_method) { + encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf, + acb->n, 0, + &s->aes_decrypt_key); + } + } + + acb->nb_sectors -= acb->n; + acb->sector_num += acb->n; + acb->buf += acb->n * 512; + + if (acb->nb_sectors == 0) { + /* request completed */ + ret = 0; + goto done; + } + + /* prepare next AIO request */ + acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, + 0, 0, 0, 0); + index_in_cluster = acb->sector_num & (s->cluster_sectors - 1); + acb->n = s->cluster_sectors - index_in_cluster; + if (acb->n > acb->nb_sectors) + acb->n = acb->nb_sectors; + + if (!acb->cluster_offset) { + if (bs->backing_hd) { + /* read from the base image */ + acb->hd_iov.iov_base = (void *)acb->buf; + acb->hd_iov.iov_len = acb->n * 512; + qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1); + acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num, + &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb); + if (acb->hd_aiocb == NULL) + goto done; + } else { + /* Note: in this case, no need to wait */ + memset(acb->buf, 0, 512 * acb->n); + goto redo; + } + } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) { + /* add AIO support for compressed blocks ? */ + if (decompress_cluster(s, acb->cluster_offset) < 0) + goto done; + memcpy(acb->buf, + s->cluster_cache + index_in_cluster * 512, 512 * acb->n); + goto redo; + } else { + if ((acb->cluster_offset & 511) != 0) { + ret = -EIO; + goto done; + } + acb->hd_iov.iov_base = (void *)acb->buf; + acb->hd_iov.iov_len = acb->n * 512; + qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1); + acb->hd_aiocb = bdrv_aio_readv(s->hd, + (acb->cluster_offset >> 9) + index_in_cluster, + &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb); + if (acb->hd_aiocb == NULL) + goto done; + } + + return; + +done: + if (acb->qiov->niov > 1) { + qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size); + qemu_vfree(acb->orig_buf); + } + acb->common.cb(acb->common.opaque, ret); + qemu_aio_release(acb); +} + +static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + QCowAIOCB *acb; + + acb = qemu_aio_get(bs, cb, opaque); + if (!acb) + return NULL; + acb->hd_aiocb = NULL; + acb->sector_num = sector_num; + acb->qiov = qiov; + if (qiov->niov > 1) + acb->buf = acb->orig_buf = qemu_blockalign(bs, qiov->size); + else + acb->buf = (uint8_t *)qiov->iov->iov_base; + acb->nb_sectors = nb_sectors; + acb->n = 0; + acb->cluster_offset = 0; + + qcow_aio_read_cb(acb, 0); + return &acb->common; +} + +static void qcow_aio_write_cb(void *opaque, int ret) +{ + QCowAIOCB *acb = opaque; + BlockDriverState *bs = acb->common.bs; + BDRVQcowState *s = bs->opaque; + int index_in_cluster; + uint64_t cluster_offset; + const uint8_t *src_buf; + + acb->hd_aiocb = NULL; + + if (ret < 0) + goto done; + + acb->nb_sectors -= acb->n; + acb->sector_num += acb->n; + acb->buf += acb->n * 512; + + if (acb->nb_sectors == 0) { + /* request completed */ + ret = 0; + goto done; + } + + index_in_cluster = acb->sector_num & (s->cluster_sectors - 1); + acb->n = s->cluster_sectors - index_in_cluster; + if (acb->n > acb->nb_sectors) + acb->n = acb->nb_sectors; + cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, 1, 0, + index_in_cluster, + index_in_cluster + acb->n); + if (!cluster_offset || (cluster_offset & 511) != 0) { + ret = -EIO; + goto done; + } + if (s->crypt_method) { + if (!acb->cluster_data) { + acb->cluster_data = qemu_mallocz(s->cluster_size); + if (!acb->cluster_data) { + ret = -ENOMEM; + goto done; + } + } + encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf, + acb->n, 1, &s->aes_encrypt_key); + src_buf = acb->cluster_data; + } else { + src_buf = acb->buf; + } + + acb->hd_iov.iov_base = (void *)src_buf; + acb->hd_iov.iov_len = acb->n * 512; + qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1); + acb->hd_aiocb = bdrv_aio_writev(s->hd, + (cluster_offset >> 9) + index_in_cluster, + &acb->hd_qiov, acb->n, + qcow_aio_write_cb, acb); + if (acb->hd_aiocb == NULL) + goto done; + return; + +done: + if (acb->qiov->niov > 1) + qemu_vfree(acb->orig_buf); + acb->common.cb(acb->common.opaque, ret); + qemu_aio_release(acb); +} + +static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BDRVQcowState *s = bs->opaque; + QCowAIOCB *acb; + + s->cluster_cache_offset = -1; /* disable compressed cache */ + + acb = qemu_aio_get(bs, cb, opaque); + if (!acb) + return NULL; + acb->hd_aiocb = NULL; + acb->sector_num = sector_num; + acb->qiov = qiov; + if (qiov->niov > 1) { + acb->buf = acb->orig_buf = qemu_blockalign(bs, qiov->size); + qemu_iovec_to_buffer(qiov, acb->buf); + } else { + acb->buf = (uint8_t *)qiov->iov->iov_base; + } + acb->nb_sectors = nb_sectors; + acb->n = 0; + + qcow_aio_write_cb(acb, 0); + return &acb->common; +} + +static void qcow_aio_cancel(BlockDriverAIOCB *blockacb) +{ + QCowAIOCB *acb = (QCowAIOCB *)blockacb; + if (acb->hd_aiocb) + bdrv_aio_cancel(acb->hd_aiocb); + qemu_aio_release(acb); +} + +static void qcow_close(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + qemu_free(s->l1_table); + qemu_free(s->l2_cache); + qemu_free(s->cluster_cache); + qemu_free(s->cluster_data); + bdrv_delete(s->hd); +} + +static int qcow_create(const char *filename, int64_t total_size, + const char *backing_file, int flags) +{ + int fd, header_size, backing_filename_len, l1_size, i, shift; + QCowHeader header; + uint64_t tmp; + + fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); + if (fd < 0) + return -1; + memset(&header, 0, sizeof(header)); + header.magic = cpu_to_be32(QCOW_MAGIC); + header.version = cpu_to_be32(QCOW_VERSION); + header.size = cpu_to_be64(total_size * 512); + header_size = sizeof(header); + backing_filename_len = 0; + if (backing_file) { + if (strcmp(backing_file, "fat:")) { + header.backing_file_offset = cpu_to_be64(header_size); + backing_filename_len = strlen(backing_file); + header.backing_file_size = cpu_to_be32(backing_filename_len); + header_size += backing_filename_len; + } else { + /* special backing file for vvfat */ + backing_file = NULL; + } + header.cluster_bits = 9; /* 512 byte cluster to avoid copying + unmodifyed sectors */ + header.l2_bits = 12; /* 32 KB L2 tables */ + } else { + header.cluster_bits = 12; /* 4 KB clusters */ + header.l2_bits = 9; /* 4 KB L2 tables */ + } + header_size = (header_size + 7) & ~7; + shift = header.cluster_bits + header.l2_bits; + l1_size = ((total_size * 512) + (1LL << shift) - 1) >> shift; + + header.l1_table_offset = cpu_to_be64(header_size); + if (flags & BLOCK_FLAG_ENCRYPT) { + header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES); + } else { + header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); + } + + /* write all the data */ + write(fd, &header, sizeof(header)); + if (backing_file) { + write(fd, backing_file, backing_filename_len); + } + lseek(fd, header_size, SEEK_SET); + tmp = 0; + for(i = 0;i < l1_size; i++) { + write(fd, &tmp, sizeof(tmp)); + } + close(fd); + return 0; +} + +static int qcow_make_empty(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + uint32_t l1_length = s->l1_size * sizeof(uint64_t); + int ret; + + memset(s->l1_table, 0, l1_length); + if (bdrv_pwrite(s->hd, s->l1_table_offset, s->l1_table, l1_length) < 0) + return -1; + ret = bdrv_truncate(s->hd, s->l1_table_offset + l1_length); + if (ret < 0) + return ret; + + memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); + memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t)); + memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t)); + + return 0; +} + +/* XXX: put compressed sectors first, then all the cluster aligned + tables to avoid losing bytes in alignment */ +static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVQcowState *s = bs->opaque; + z_stream strm; + int ret, out_len; + uint8_t *out_buf; + uint64_t cluster_offset; + + if (nb_sectors != s->cluster_sectors) + return -EINVAL; + + out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); + if (!out_buf) + return -1; + + /* best compression, small window, no zlib header */ + memset(&strm, 0, sizeof(strm)); + ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, + Z_DEFLATED, -12, + 9, Z_DEFAULT_STRATEGY); + if (ret != 0) { + qemu_free(out_buf); + return -1; + } + + strm.avail_in = s->cluster_size; + strm.next_in = (uint8_t *)buf; + strm.avail_out = s->cluster_size; + strm.next_out = out_buf; + + ret = deflate(&strm, Z_FINISH); + if (ret != Z_STREAM_END && ret != Z_OK) { + qemu_free(out_buf); + deflateEnd(&strm); + return -1; + } + out_len = strm.next_out - out_buf; + + deflateEnd(&strm); + + if (ret != Z_STREAM_END || out_len >= s->cluster_size) { + /* could not compress: write normal cluster */ + qcow_write(bs, sector_num, buf, s->cluster_sectors); + } else { + cluster_offset = get_cluster_offset(bs, sector_num << 9, 2, + out_len, 0, 0); + cluster_offset &= s->cluster_offset_mask; + if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len) { + qemu_free(out_buf); + return -1; + } + } + + qemu_free(out_buf); + return 0; +} + +static void qcow_flush(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + bdrv_flush(s->hd); +} + +static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVQcowState *s = bs->opaque; + bdi->cluster_size = s->cluster_size; + return 0; +} + +static BlockDriver bdrv_qcow = { + .format_name = "qcow", + .instance_size = sizeof(BDRVQcowState), + .bdrv_probe = qcow_probe, + .bdrv_open = qcow_open, + .bdrv_close = qcow_close, + .bdrv_create = qcow_create, + .bdrv_flush = qcow_flush, + .bdrv_is_allocated = qcow_is_allocated, + .bdrv_set_key = qcow_set_key, + .bdrv_make_empty = qcow_make_empty, + .bdrv_aio_readv = qcow_aio_readv, + .bdrv_aio_writev = qcow_aio_writev, + .bdrv_aio_cancel = qcow_aio_cancel, + .aiocb_size = sizeof(QCowAIOCB), + .bdrv_write_compressed = qcow_write_compressed, + .bdrv_get_info = qcow_get_info, +}; + +static void bdrv_qcow_init(void) +{ + bdrv_register(&bdrv_qcow); +} + +block_init(bdrv_qcow_init); diff --git a/block/qcow2.c b/block/qcow2.c new file mode 100644 index 0000000000..a6de9b6919 --- /dev/null +++ b/block/qcow2.c @@ -0,0 +1,2931 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block_int.h" +#include "module.h" +#include <zlib.h> +#include "aes.h" + +/* + Differences with QCOW: + + - Support for multiple incremental snapshots. + - Memory management by reference counts. + - Clusters which have a reference count of one have the bit + QCOW_OFLAG_COPIED to optimize write performance. + - Size of compressed clusters is stored in sectors to reduce bit usage + in the cluster offsets. + - Support for storing additional data (such as the VM state) in the + snapshots. + - If a backing store is used, the cluster size is not constrained + (could be backported to QCOW). + - L2 tables have always a size of one cluster. +*/ + +//#define DEBUG_ALLOC +//#define DEBUG_ALLOC2 +//#define DEBUG_EXT + +#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) +#define QCOW_VERSION 2 + +#define QCOW_CRYPT_NONE 0 +#define QCOW_CRYPT_AES 1 + +#define QCOW_MAX_CRYPT_CLUSTERS 32 + +/* indicate that the refcount of the referenced cluster is exactly one. */ +#define QCOW_OFLAG_COPIED (1LL << 63) +/* indicate that the cluster is compressed (they never have the copied flag) */ +#define QCOW_OFLAG_COMPRESSED (1LL << 62) + +#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */ + +typedef struct QCowHeader { + uint32_t magic; + uint32_t version; + uint64_t backing_file_offset; + uint32_t backing_file_size; + uint32_t cluster_bits; + uint64_t size; /* in bytes */ + uint32_t crypt_method; + uint32_t l1_size; /* XXX: save number of clusters instead ? */ + uint64_t l1_table_offset; + uint64_t refcount_table_offset; + uint32_t refcount_table_clusters; + uint32_t nb_snapshots; + uint64_t snapshots_offset; +} QCowHeader; + + +typedef struct { + uint32_t magic; + uint32_t len; +} QCowExtension; +#define QCOW_EXT_MAGIC_END 0 +#define QCOW_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA + + +typedef struct __attribute__((packed)) QCowSnapshotHeader { + /* header is 8 byte aligned */ + uint64_t l1_table_offset; + + uint32_t l1_size; + uint16_t id_str_size; + uint16_t name_size; + + uint32_t date_sec; + uint32_t date_nsec; + + uint64_t vm_clock_nsec; + + uint32_t vm_state_size; + uint32_t extra_data_size; /* for extension */ + /* extra data follows */ + /* id_str follows */ + /* name follows */ +} QCowSnapshotHeader; + +#define L2_CACHE_SIZE 16 + +typedef struct QCowSnapshot { + uint64_t l1_table_offset; + uint32_t l1_size; + char *id_str; + char *name; + uint32_t vm_state_size; + uint32_t date_sec; + uint32_t date_nsec; + uint64_t vm_clock_nsec; +} QCowSnapshot; + +typedef struct BDRVQcowState { + BlockDriverState *hd; + int cluster_bits; + int cluster_size; + int cluster_sectors; + int l2_bits; + int l2_size; + int l1_size; + int l1_vm_state_index; + int csize_shift; + int csize_mask; + uint64_t cluster_offset_mask; + uint64_t l1_table_offset; + uint64_t *l1_table; + uint64_t *l2_cache; + uint64_t l2_cache_offsets[L2_CACHE_SIZE]; + uint32_t l2_cache_counts[L2_CACHE_SIZE]; + uint8_t *cluster_cache; + uint8_t *cluster_data; + uint64_t cluster_cache_offset; + + uint64_t *refcount_table; + uint64_t refcount_table_offset; + uint32_t refcount_table_size; + uint64_t refcount_block_cache_offset; + uint16_t *refcount_block_cache; + int64_t free_cluster_index; + int64_t free_byte_offset; + + uint32_t crypt_method; /* current crypt method, 0 if no key yet */ + uint32_t crypt_method_header; + AES_KEY aes_encrypt_key; + AES_KEY aes_decrypt_key; + uint64_t snapshots_offset; + int snapshots_size; + int nb_snapshots; + QCowSnapshot *snapshots; +} BDRVQcowState; + +static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset); +static int qcow_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors); +static int qcow_read_snapshots(BlockDriverState *bs); +static void qcow_free_snapshots(BlockDriverState *bs); +static int refcount_init(BlockDriverState *bs); +static void refcount_close(BlockDriverState *bs); +static int get_refcount(BlockDriverState *bs, int64_t cluster_index); +static int update_cluster_refcount(BlockDriverState *bs, + int64_t cluster_index, + int addend); +static void update_refcount(BlockDriverState *bs, + int64_t offset, int64_t length, + int addend); +static int64_t alloc_clusters(BlockDriverState *bs, int64_t size); +static int64_t alloc_bytes(BlockDriverState *bs, int size); +static void free_clusters(BlockDriverState *bs, + int64_t offset, int64_t size); +static int check_refcounts(BlockDriverState *bs); + +static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + const QCowHeader *cow_header = (const void *)buf; + + if (buf_size >= sizeof(QCowHeader) && + be32_to_cpu(cow_header->magic) == QCOW_MAGIC && + be32_to_cpu(cow_header->version) == QCOW_VERSION) + return 100; + else + return 0; +} + + +/* + * read qcow2 extension and fill bs + * start reading from start_offset + * finish reading upon magic of value 0 or when end_offset reached + * unknown magic is skipped (future extension this version knows nothing about) + * return 0 upon success, non-0 otherwise + */ +static int qcow_read_extensions(BlockDriverState *bs, uint64_t start_offset, + uint64_t end_offset) +{ + BDRVQcowState *s = bs->opaque; + QCowExtension ext; + uint64_t offset; + +#ifdef DEBUG_EXT + printf("qcow_read_extensions: start=%ld end=%ld\n", start_offset, end_offset); +#endif + offset = start_offset; + while (offset < end_offset) { + +#ifdef DEBUG_EXT + /* Sanity check */ + if (offset > s->cluster_size) + printf("qcow_handle_extension: suspicious offset %lu\n", offset); + + printf("attemting to read extended header in offset %lu\n", offset); +#endif + + if (bdrv_pread(s->hd, offset, &ext, sizeof(ext)) != sizeof(ext)) { + fprintf(stderr, "qcow_handle_extension: ERROR: pread fail from offset %llu\n", + (unsigned long long)offset); + return 1; + } + be32_to_cpus(&ext.magic); + be32_to_cpus(&ext.len); + offset += sizeof(ext); +#ifdef DEBUG_EXT + printf("ext.magic = 0x%x\n", ext.magic); +#endif + switch (ext.magic) { + case QCOW_EXT_MAGIC_END: + return 0; + + case QCOW_EXT_MAGIC_BACKING_FORMAT: + if (ext.len >= sizeof(bs->backing_format)) { + fprintf(stderr, "ERROR: ext_backing_format: len=%u too large" + " (>=%zu)\n", + ext.len, sizeof(bs->backing_format)); + return 2; + } + if (bdrv_pread(s->hd, offset , bs->backing_format, + ext.len) != ext.len) + return 3; + bs->backing_format[ext.len] = '\0'; +#ifdef DEBUG_EXT + printf("Qcow2: Got format extension %s\n", bs->backing_format); +#endif + offset += ((ext.len + 7) & ~7); + break; + + default: + /* unknown magic -- just skip it */ + offset += ((ext.len + 7) & ~7); + break; + } + } + + return 0; +} + + +static int qcow_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVQcowState *s = bs->opaque; + int len, i, shift, ret; + QCowHeader header; + uint64_t ext_end; + + /* Performance is terrible right now with cache=writethrough due mainly + * to reference count updates. If the user does not explicitly specify + * a caching type, force to writeback caching. + */ + if ((flags & BDRV_O_CACHE_DEF)) { + flags |= BDRV_O_CACHE_WB; + flags &= ~BDRV_O_CACHE_DEF; + } + ret = bdrv_file_open(&s->hd, filename, flags); + if (ret < 0) + return ret; + if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header)) + goto fail; + be32_to_cpus(&header.magic); + be32_to_cpus(&header.version); + be64_to_cpus(&header.backing_file_offset); + be32_to_cpus(&header.backing_file_size); + be64_to_cpus(&header.size); + be32_to_cpus(&header.cluster_bits); + be32_to_cpus(&header.crypt_method); + be64_to_cpus(&header.l1_table_offset); + be32_to_cpus(&header.l1_size); + be64_to_cpus(&header.refcount_table_offset); + be32_to_cpus(&header.refcount_table_clusters); + be64_to_cpus(&header.snapshots_offset); + be32_to_cpus(&header.nb_snapshots); + + if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION) + goto fail; + if (header.size <= 1 || + header.cluster_bits < 9 || + header.cluster_bits > 16) + goto fail; + if (header.crypt_method > QCOW_CRYPT_AES) + goto fail; + s->crypt_method_header = header.crypt_method; + if (s->crypt_method_header) + bs->encrypted = 1; + s->cluster_bits = header.cluster_bits; + s->cluster_size = 1 << s->cluster_bits; + s->cluster_sectors = 1 << (s->cluster_bits - 9); + s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */ + s->l2_size = 1 << s->l2_bits; + bs->total_sectors = header.size / 512; + s->csize_shift = (62 - (s->cluster_bits - 8)); + s->csize_mask = (1 << (s->cluster_bits - 8)) - 1; + s->cluster_offset_mask = (1LL << s->csize_shift) - 1; + s->refcount_table_offset = header.refcount_table_offset; + s->refcount_table_size = + header.refcount_table_clusters << (s->cluster_bits - 3); + + s->snapshots_offset = header.snapshots_offset; + s->nb_snapshots = header.nb_snapshots; + + /* read the level 1 table */ + s->l1_size = header.l1_size; + shift = s->cluster_bits + s->l2_bits; + s->l1_vm_state_index = (header.size + (1LL << shift) - 1) >> shift; + /* the L1 table must contain at least enough entries to put + header.size bytes */ + if (s->l1_size < s->l1_vm_state_index) + goto fail; + s->l1_table_offset = header.l1_table_offset; + s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t)); + if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) != + s->l1_size * sizeof(uint64_t)) + goto fail; + for(i = 0;i < s->l1_size; i++) { + be64_to_cpus(&s->l1_table[i]); + } + /* alloc L2 cache */ + s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); + s->cluster_cache = qemu_malloc(s->cluster_size); + /* one more sector for decompressed data alignment */ + s->cluster_data = qemu_malloc(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size + + 512); + s->cluster_cache_offset = -1; + + if (refcount_init(bs) < 0) + goto fail; + + /* read qcow2 extensions */ + if (header.backing_file_offset) + ext_end = header.backing_file_offset; + else + ext_end = s->cluster_size; + if (qcow_read_extensions(bs, sizeof(header), ext_end)) + goto fail; + + /* read the backing file name */ + if (header.backing_file_offset != 0) { + len = header.backing_file_size; + if (len > 1023) + len = 1023; + if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len) + goto fail; + bs->backing_file[len] = '\0'; + } + if (qcow_read_snapshots(bs) < 0) + goto fail; + +#ifdef DEBUG_ALLOC + check_refcounts(bs); +#endif + return 0; + + fail: + qcow_free_snapshots(bs); + refcount_close(bs); + qemu_free(s->l1_table); + qemu_free(s->l2_cache); + qemu_free(s->cluster_cache); + qemu_free(s->cluster_data); + bdrv_delete(s->hd); + return -1; +} + +static int qcow_set_key(BlockDriverState *bs, const char *key) +{ + BDRVQcowState *s = bs->opaque; + uint8_t keybuf[16]; + int len, i; + + memset(keybuf, 0, 16); + len = strlen(key); + if (len > 16) + len = 16; + /* XXX: we could compress the chars to 7 bits to increase + entropy */ + for(i = 0;i < len;i++) { + keybuf[i] = key[i]; + } + s->crypt_method = s->crypt_method_header; + + if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) + return -1; + if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) + return -1; +#if 0 + /* test */ + { + uint8_t in[16]; + uint8_t out[16]; + uint8_t tmp[16]; + for(i=0;i<16;i++) + in[i] = i; + AES_encrypt(in, tmp, &s->aes_encrypt_key); + AES_decrypt(tmp, out, &s->aes_decrypt_key); + for(i = 0; i < 16; i++) + printf(" %02x", tmp[i]); + printf("\n"); + for(i = 0; i < 16; i++) + printf(" %02x", out[i]); + printf("\n"); + } +#endif + return 0; +} + +/* The crypt function is compatible with the linux cryptoloop + algorithm for < 4 GB images. NOTE: out_buf == in_buf is + supported */ +static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num, + uint8_t *out_buf, const uint8_t *in_buf, + int nb_sectors, int enc, + const AES_KEY *key) +{ + union { + uint64_t ll[2]; + uint8_t b[16]; + } ivec; + int i; + + for(i = 0; i < nb_sectors; i++) { + ivec.ll[0] = cpu_to_le64(sector_num); + ivec.ll[1] = 0; + AES_cbc_encrypt(in_buf, out_buf, 512, key, + ivec.b, enc); + sector_num++; + in_buf += 512; + out_buf += 512; + } +} + +static int copy_sectors(BlockDriverState *bs, uint64_t start_sect, + uint64_t cluster_offset, int n_start, int n_end) +{ + BDRVQcowState *s = bs->opaque; + int n, ret; + + n = n_end - n_start; + if (n <= 0) + return 0; + ret = qcow_read(bs, start_sect + n_start, s->cluster_data, n); + if (ret < 0) + return ret; + if (s->crypt_method) { + encrypt_sectors(s, start_sect + n_start, + s->cluster_data, + s->cluster_data, n, 1, + &s->aes_encrypt_key); + } + ret = bdrv_write(s->hd, (cluster_offset >> 9) + n_start, + s->cluster_data, n); + if (ret < 0) + return ret; + return 0; +} + +static void l2_cache_reset(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + + memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); + memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t)); + memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t)); +} + +static inline int l2_cache_new_entry(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + uint32_t min_count; + int min_index, i; + + /* find a new entry in the least used one */ + min_index = 0; + min_count = 0xffffffff; + for(i = 0; i < L2_CACHE_SIZE; i++) { + if (s->l2_cache_counts[i] < min_count) { + min_count = s->l2_cache_counts[i]; + min_index = i; + } + } + return min_index; +} + +static int64_t align_offset(int64_t offset, int n) +{ + offset = (offset + n - 1) & ~(n - 1); + return offset; +} + +static int grow_l1_table(BlockDriverState *bs, int min_size) +{ + BDRVQcowState *s = bs->opaque; + int new_l1_size, new_l1_size2, ret, i; + uint64_t *new_l1_table; + uint64_t new_l1_table_offset; + uint8_t data[12]; + + new_l1_size = s->l1_size; + if (min_size <= new_l1_size) + return 0; + while (min_size > new_l1_size) { + new_l1_size = (new_l1_size * 3 + 1) / 2; + } +#ifdef DEBUG_ALLOC2 + printf("grow l1_table from %d to %d\n", s->l1_size, new_l1_size); +#endif + + new_l1_size2 = sizeof(uint64_t) * new_l1_size; + new_l1_table = qemu_mallocz(new_l1_size2); + memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t)); + + /* write new table (align to cluster) */ + new_l1_table_offset = alloc_clusters(bs, new_l1_size2); + + for(i = 0; i < s->l1_size; i++) + new_l1_table[i] = cpu_to_be64(new_l1_table[i]); + ret = bdrv_pwrite(s->hd, new_l1_table_offset, new_l1_table, new_l1_size2); + if (ret != new_l1_size2) + goto fail; + for(i = 0; i < s->l1_size; i++) + new_l1_table[i] = be64_to_cpu(new_l1_table[i]); + + /* set new table */ + cpu_to_be32w((uint32_t*)data, new_l1_size); + cpu_to_be64w((uint64_t*)(data + 4), new_l1_table_offset); + if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_size), data, + sizeof(data)) != sizeof(data)) + goto fail; + qemu_free(s->l1_table); + free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t)); + s->l1_table_offset = new_l1_table_offset; + s->l1_table = new_l1_table; + s->l1_size = new_l1_size; + return 0; + fail: + qemu_free(s->l1_table); + return -EIO; +} + +/* + * seek_l2_table + * + * seek l2_offset in the l2_cache table + * if not found, return NULL, + * if found, + * increments the l2 cache hit count of the entry, + * if counter overflow, divide by two all counters + * return the pointer to the l2 cache entry + * + */ + +static uint64_t *seek_l2_table(BDRVQcowState *s, uint64_t l2_offset) +{ + int i, j; + + for(i = 0; i < L2_CACHE_SIZE; i++) { + if (l2_offset == s->l2_cache_offsets[i]) { + /* increment the hit count */ + if (++s->l2_cache_counts[i] == 0xffffffff) { + for(j = 0; j < L2_CACHE_SIZE; j++) { + s->l2_cache_counts[j] >>= 1; + } + } + return s->l2_cache + (i << s->l2_bits); + } + } + return NULL; +} + +/* + * l2_load + * + * Loads a L2 table into memory. If the table is in the cache, the cache + * is used; otherwise the L2 table is loaded from the image file. + * + * Returns a pointer to the L2 table on success, or NULL if the read from + * the image file failed. + */ + +static uint64_t *l2_load(BlockDriverState *bs, uint64_t l2_offset) +{ + BDRVQcowState *s = bs->opaque; + int min_index; + uint64_t *l2_table; + + /* seek if the table for the given offset is in the cache */ + + l2_table = seek_l2_table(s, l2_offset); + if (l2_table != NULL) + return l2_table; + + /* not found: load a new entry in the least used one */ + + min_index = l2_cache_new_entry(bs); + l2_table = s->l2_cache + (min_index << s->l2_bits); + if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) != + s->l2_size * sizeof(uint64_t)) + return NULL; + s->l2_cache_offsets[min_index] = l2_offset; + s->l2_cache_counts[min_index] = 1; + + return l2_table; +} + +/* + * l2_allocate + * + * Allocate a new l2 entry in the file. If l1_index points to an already + * used entry in the L2 table (i.e. we are doing a copy on write for the L2 + * table) copy the contents of the old L2 table into the newly allocated one. + * Otherwise the new table is initialized with zeros. + * + */ + +static uint64_t *l2_allocate(BlockDriverState *bs, int l1_index) +{ + BDRVQcowState *s = bs->opaque; + int min_index; + uint64_t old_l2_offset, tmp; + uint64_t *l2_table, l2_offset; + + old_l2_offset = s->l1_table[l1_index]; + + /* allocate a new l2 entry */ + + l2_offset = alloc_clusters(bs, s->l2_size * sizeof(uint64_t)); + + /* update the L1 entry */ + + s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED; + + tmp = cpu_to_be64(l2_offset | QCOW_OFLAG_COPIED); + if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp), + &tmp, sizeof(tmp)) != sizeof(tmp)) + return NULL; + + /* allocate a new entry in the l2 cache */ + + min_index = l2_cache_new_entry(bs); + l2_table = s->l2_cache + (min_index << s->l2_bits); + + if (old_l2_offset == 0) { + /* if there was no old l2 table, clear the new table */ + memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); + } else { + /* if there was an old l2 table, read it from the disk */ + if (bdrv_pread(s->hd, old_l2_offset, + l2_table, s->l2_size * sizeof(uint64_t)) != + s->l2_size * sizeof(uint64_t)) + return NULL; + } + /* write the l2 table to the file */ + if (bdrv_pwrite(s->hd, l2_offset, + l2_table, s->l2_size * sizeof(uint64_t)) != + s->l2_size * sizeof(uint64_t)) + return NULL; + + /* update the l2 cache entry */ + + s->l2_cache_offsets[min_index] = l2_offset; + s->l2_cache_counts[min_index] = 1; + + return l2_table; +} + +static int size_to_clusters(BDRVQcowState *s, int64_t size) +{ + return (size + (s->cluster_size - 1)) >> s->cluster_bits; +} + +static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size, + uint64_t *l2_table, uint64_t start, uint64_t mask) +{ + int i; + uint64_t offset = be64_to_cpu(l2_table[0]) & ~mask; + + if (!offset) + return 0; + + for (i = start; i < start + nb_clusters; i++) + if (offset + i * cluster_size != (be64_to_cpu(l2_table[i]) & ~mask)) + break; + + return (i - start); +} + +static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table) +{ + int i = 0; + + while(nb_clusters-- && l2_table[i] == 0) + i++; + + return i; +} + +/* + * get_cluster_offset + * + * For a given offset of the disk image, return cluster offset in + * qcow2 file. + * + * on entry, *num is the number of contiguous clusters we'd like to + * access following offset. + * + * on exit, *num is the number of contiguous clusters we can read. + * + * Return 1, if the offset is found + * Return 0, otherwise. + * + */ + +static uint64_t get_cluster_offset(BlockDriverState *bs, + uint64_t offset, int *num) +{ + BDRVQcowState *s = bs->opaque; + int l1_index, l2_index; + uint64_t l2_offset, *l2_table, cluster_offset; + int l1_bits, c; + int index_in_cluster, nb_available, nb_needed, nb_clusters; + + index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1); + nb_needed = *num + index_in_cluster; + + l1_bits = s->l2_bits + s->cluster_bits; + + /* compute how many bytes there are between the offset and + * the end of the l1 entry + */ + + nb_available = (1 << l1_bits) - (offset & ((1 << l1_bits) - 1)); + + /* compute the number of available sectors */ + + nb_available = (nb_available >> 9) + index_in_cluster; + + if (nb_needed > nb_available) { + nb_needed = nb_available; + } + + cluster_offset = 0; + + /* seek the the l2 offset in the l1 table */ + + l1_index = offset >> l1_bits; + if (l1_index >= s->l1_size) + goto out; + + l2_offset = s->l1_table[l1_index]; + + /* seek the l2 table of the given l2 offset */ + + if (!l2_offset) + goto out; + + /* load the l2 table in memory */ + + l2_offset &= ~QCOW_OFLAG_COPIED; + l2_table = l2_load(bs, l2_offset); + if (l2_table == NULL) + return 0; + + /* find the cluster offset for the given disk offset */ + + l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); + cluster_offset = be64_to_cpu(l2_table[l2_index]); + nb_clusters = size_to_clusters(s, nb_needed << 9); + + if (!cluster_offset) { + /* how many empty clusters ? */ + c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]); + } else { + /* how many allocated clusters ? */ + c = count_contiguous_clusters(nb_clusters, s->cluster_size, + &l2_table[l2_index], 0, QCOW_OFLAG_COPIED); + } + + nb_available = (c * s->cluster_sectors); +out: + if (nb_available > nb_needed) + nb_available = nb_needed; + + *num = nb_available - index_in_cluster; + + return cluster_offset & ~QCOW_OFLAG_COPIED; +} + +/* + * free_any_clusters + * + * free clusters according to its type: compressed or not + * + */ + +static void free_any_clusters(BlockDriverState *bs, + uint64_t cluster_offset, int nb_clusters) +{ + BDRVQcowState *s = bs->opaque; + + /* free the cluster */ + + if (cluster_offset & QCOW_OFLAG_COMPRESSED) { + int nb_csectors; + nb_csectors = ((cluster_offset >> s->csize_shift) & + s->csize_mask) + 1; + free_clusters(bs, (cluster_offset & s->cluster_offset_mask) & ~511, + nb_csectors * 512); + return; + } + + free_clusters(bs, cluster_offset, nb_clusters << s->cluster_bits); + + return; +} + +/* + * get_cluster_table + * + * for a given disk offset, load (and allocate if needed) + * the l2 table. + * + * the l2 table offset in the qcow2 file and the cluster index + * in the l2 table are given to the caller. + * + */ + +static int get_cluster_table(BlockDriverState *bs, uint64_t offset, + uint64_t **new_l2_table, + uint64_t *new_l2_offset, + int *new_l2_index) +{ + BDRVQcowState *s = bs->opaque; + int l1_index, l2_index, ret; + uint64_t l2_offset, *l2_table; + + /* seek the the l2 offset in the l1 table */ + + l1_index = offset >> (s->l2_bits + s->cluster_bits); + if (l1_index >= s->l1_size) { + ret = grow_l1_table(bs, l1_index + 1); + if (ret < 0) + return 0; + } + l2_offset = s->l1_table[l1_index]; + + /* seek the l2 table of the given l2 offset */ + + if (l2_offset & QCOW_OFLAG_COPIED) { + /* load the l2 table in memory */ + l2_offset &= ~QCOW_OFLAG_COPIED; + l2_table = l2_load(bs, l2_offset); + if (l2_table == NULL) + return 0; + } else { + if (l2_offset) + free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t)); + l2_table = l2_allocate(bs, l1_index); + if (l2_table == NULL) + return 0; + l2_offset = s->l1_table[l1_index] & ~QCOW_OFLAG_COPIED; + } + + /* find the cluster offset for the given disk offset */ + + l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); + + *new_l2_table = l2_table; + *new_l2_offset = l2_offset; + *new_l2_index = l2_index; + + return 1; +} + +/* + * alloc_compressed_cluster_offset + * + * For a given offset of the disk image, return cluster offset in + * qcow2 file. + * + * If the offset is not found, allocate a new compressed cluster. + * + * Return the cluster offset if successful, + * Return 0, otherwise. + * + */ + +static uint64_t alloc_compressed_cluster_offset(BlockDriverState *bs, + uint64_t offset, + int compressed_size) +{ + BDRVQcowState *s = bs->opaque; + int l2_index, ret; + uint64_t l2_offset, *l2_table, cluster_offset; + int nb_csectors; + + ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index); + if (ret == 0) + return 0; + + cluster_offset = be64_to_cpu(l2_table[l2_index]); + if (cluster_offset & QCOW_OFLAG_COPIED) + return cluster_offset & ~QCOW_OFLAG_COPIED; + + if (cluster_offset) + free_any_clusters(bs, cluster_offset, 1); + + cluster_offset = alloc_bytes(bs, compressed_size); + nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) - + (cluster_offset >> 9); + + cluster_offset |= QCOW_OFLAG_COMPRESSED | + ((uint64_t)nb_csectors << s->csize_shift); + + /* update L2 table */ + + /* compressed clusters never have the copied flag */ + + l2_table[l2_index] = cpu_to_be64(cluster_offset); + if (bdrv_pwrite(s->hd, + l2_offset + l2_index * sizeof(uint64_t), + l2_table + l2_index, + sizeof(uint64_t)) != sizeof(uint64_t)) + return 0; + + return cluster_offset; +} + +typedef struct QCowL2Meta +{ + uint64_t offset; + int n_start; + int nb_available; + int nb_clusters; +} QCowL2Meta; + +static int alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset, + QCowL2Meta *m) +{ + BDRVQcowState *s = bs->opaque; + int i, j = 0, l2_index, ret; + uint64_t *old_cluster, start_sect, l2_offset, *l2_table; + + if (m->nb_clusters == 0) + return 0; + + old_cluster = qemu_malloc(m->nb_clusters * sizeof(uint64_t)); + + /* copy content of unmodified sectors */ + start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9; + if (m->n_start) { + ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start); + if (ret < 0) + goto err; + } + + if (m->nb_available & (s->cluster_sectors - 1)) { + uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1); + ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9), + m->nb_available - end, s->cluster_sectors); + if (ret < 0) + goto err; + } + + ret = -EIO; + /* update L2 table */ + if (!get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index)) + goto err; + + for (i = 0; i < m->nb_clusters; i++) { + /* if two concurrent writes happen to the same unallocated cluster + * each write allocates separate cluster and writes data concurrently. + * The first one to complete updates l2 table with pointer to its + * cluster the second one has to do RMW (which is done above by + * copy_sectors()), update l2 table with its cluster pointer and free + * old cluster. This is what this loop does */ + if(l2_table[l2_index + i] != 0) + old_cluster[j++] = l2_table[l2_index + i]; + + l2_table[l2_index + i] = cpu_to_be64((cluster_offset + + (i << s->cluster_bits)) | QCOW_OFLAG_COPIED); + } + + if (bdrv_pwrite(s->hd, l2_offset + l2_index * sizeof(uint64_t), + l2_table + l2_index, m->nb_clusters * sizeof(uint64_t)) != + m->nb_clusters * sizeof(uint64_t)) + goto err; + + for (i = 0; i < j; i++) + free_any_clusters(bs, be64_to_cpu(old_cluster[i]) & ~QCOW_OFLAG_COPIED, + 1); + + ret = 0; +err: + qemu_free(old_cluster); + return ret; + } + +/* + * alloc_cluster_offset + * + * For a given offset of the disk image, return cluster offset in + * qcow2 file. + * + * If the offset is not found, allocate a new cluster. + * + * Return the cluster offset if successful, + * Return 0, otherwise. + * + */ + +static uint64_t alloc_cluster_offset(BlockDriverState *bs, + uint64_t offset, + int n_start, int n_end, + int *num, QCowL2Meta *m) +{ + BDRVQcowState *s = bs->opaque; + int l2_index, ret; + uint64_t l2_offset, *l2_table, cluster_offset; + int nb_clusters, i = 0; + + ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index); + if (ret == 0) + return 0; + + nb_clusters = size_to_clusters(s, n_end << 9); + + nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + + cluster_offset = be64_to_cpu(l2_table[l2_index]); + + /* We keep all QCOW_OFLAG_COPIED clusters */ + + if (cluster_offset & QCOW_OFLAG_COPIED) { + nb_clusters = count_contiguous_clusters(nb_clusters, s->cluster_size, + &l2_table[l2_index], 0, 0); + + cluster_offset &= ~QCOW_OFLAG_COPIED; + m->nb_clusters = 0; + + goto out; + } + + /* for the moment, multiple compressed clusters are not managed */ + + if (cluster_offset & QCOW_OFLAG_COMPRESSED) + nb_clusters = 1; + + /* how many available clusters ? */ + + while (i < nb_clusters) { + i += count_contiguous_clusters(nb_clusters - i, s->cluster_size, + &l2_table[l2_index], i, 0); + + if(be64_to_cpu(l2_table[l2_index + i])) + break; + + i += count_contiguous_free_clusters(nb_clusters - i, + &l2_table[l2_index + i]); + + cluster_offset = be64_to_cpu(l2_table[l2_index + i]); + + if ((cluster_offset & QCOW_OFLAG_COPIED) || + (cluster_offset & QCOW_OFLAG_COMPRESSED)) + break; + } + nb_clusters = i; + + /* allocate a new cluster */ + + cluster_offset = alloc_clusters(bs, nb_clusters * s->cluster_size); + + /* save info needed for meta data update */ + m->offset = offset; + m->n_start = n_start; + m->nb_clusters = nb_clusters; + +out: + m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end); + + *num = m->nb_available - n_start; + + return cluster_offset; +} + +static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) +{ + uint64_t cluster_offset; + + *pnum = nb_sectors; + cluster_offset = get_cluster_offset(bs, sector_num << 9, pnum); + + return (cluster_offset != 0); +} + +static int decompress_buffer(uint8_t *out_buf, int out_buf_size, + const uint8_t *buf, int buf_size) +{ + z_stream strm1, *strm = &strm1; + int ret, out_len; + + memset(strm, 0, sizeof(*strm)); + + strm->next_in = (uint8_t *)buf; + strm->avail_in = buf_size; + strm->next_out = out_buf; + strm->avail_out = out_buf_size; + + ret = inflateInit2(strm, -12); + if (ret != Z_OK) + return -1; + ret = inflate(strm, Z_FINISH); + out_len = strm->next_out - out_buf; + if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || + out_len != out_buf_size) { + inflateEnd(strm); + return -1; + } + inflateEnd(strm); + return 0; +} + +static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset) +{ + int ret, csize, nb_csectors, sector_offset; + uint64_t coffset; + + coffset = cluster_offset & s->cluster_offset_mask; + if (s->cluster_cache_offset != coffset) { + nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1; + sector_offset = coffset & 511; + csize = nb_csectors * 512 - sector_offset; + ret = bdrv_read(s->hd, coffset >> 9, s->cluster_data, nb_csectors); + if (ret < 0) { + return -1; + } + if (decompress_buffer(s->cluster_cache, s->cluster_size, + s->cluster_data + sector_offset, csize) < 0) { + return -1; + } + s->cluster_cache_offset = coffset; + } + return 0; +} + +/* handle reading after the end of the backing file */ +static int backing_read1(BlockDriverState *bs, + int64_t sector_num, uint8_t *buf, int nb_sectors) +{ + int n1; + if ((sector_num + nb_sectors) <= bs->total_sectors) + return nb_sectors; + if (sector_num >= bs->total_sectors) + n1 = 0; + else + n1 = bs->total_sectors - sector_num; + memset(buf + n1 * 512, 0, 512 * (nb_sectors - n1)); + return n1; +} + +static int qcow_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVQcowState *s = bs->opaque; + int ret, index_in_cluster, n, n1; + uint64_t cluster_offset; + + while (nb_sectors > 0) { + n = nb_sectors; + cluster_offset = get_cluster_offset(bs, sector_num << 9, &n); + index_in_cluster = sector_num & (s->cluster_sectors - 1); + if (!cluster_offset) { + if (bs->backing_hd) { + /* read from the base image */ + n1 = backing_read1(bs->backing_hd, sector_num, buf, n); + if (n1 > 0) { + ret = bdrv_read(bs->backing_hd, sector_num, buf, n1); + if (ret < 0) + return -1; + } + } else { + memset(buf, 0, 512 * n); + } + } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) { + if (decompress_cluster(s, cluster_offset) < 0) + return -1; + memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n); + } else { + ret = bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512); + if (ret != n * 512) + return -1; + if (s->crypt_method) { + encrypt_sectors(s, sector_num, buf, buf, n, 0, + &s->aes_decrypt_key); + } + } + nb_sectors -= n; + sector_num += n; + buf += n * 512; + } + return 0; +} + +static int qcow_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVQcowState *s = bs->opaque; + int ret, index_in_cluster, n; + uint64_t cluster_offset; + int n_end; + QCowL2Meta l2meta; + + while (nb_sectors > 0) { + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n_end = index_in_cluster + nb_sectors; + if (s->crypt_method && + n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) + n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors; + cluster_offset = alloc_cluster_offset(bs, sector_num << 9, + index_in_cluster, + n_end, &n, &l2meta); + if (!cluster_offset) + return -1; + if (s->crypt_method) { + encrypt_sectors(s, sector_num, s->cluster_data, buf, n, 1, + &s->aes_encrypt_key); + ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, + s->cluster_data, n * 512); + } else { + ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512); + } + if (ret != n * 512 || alloc_cluster_link_l2(bs, cluster_offset, &l2meta) < 0) { + free_any_clusters(bs, cluster_offset, l2meta.nb_clusters); + return -1; + } + nb_sectors -= n; + sector_num += n; + buf += n * 512; + } + s->cluster_cache_offset = -1; /* disable compressed cache */ + return 0; +} + +typedef struct QCowAIOCB { + BlockDriverAIOCB common; + int64_t sector_num; + QEMUIOVector *qiov; + uint8_t *buf; + void *orig_buf; + int nb_sectors; + int n; + uint64_t cluster_offset; + uint8_t *cluster_data; + BlockDriverAIOCB *hd_aiocb; + struct iovec hd_iov; + QEMUIOVector hd_qiov; + QEMUBH *bh; + QCowL2Meta l2meta; +} QCowAIOCB; + +static void qcow_aio_read_cb(void *opaque, int ret); +static void qcow_aio_read_bh(void *opaque) +{ + QCowAIOCB *acb = opaque; + qemu_bh_delete(acb->bh); + acb->bh = NULL; + qcow_aio_read_cb(opaque, 0); +} + +static int qcow_schedule_bh(QEMUBHFunc *cb, QCowAIOCB *acb) +{ + if (acb->bh) + return -EIO; + + acb->bh = qemu_bh_new(cb, acb); + if (!acb->bh) + return -EIO; + + qemu_bh_schedule(acb->bh); + + return 0; +} + +static void qcow_aio_read_cb(void *opaque, int ret) +{ + QCowAIOCB *acb = opaque; + BlockDriverState *bs = acb->common.bs; + BDRVQcowState *s = bs->opaque; + int index_in_cluster, n1; + + acb->hd_aiocb = NULL; + if (ret < 0) + goto done; + + /* post process the read buffer */ + if (!acb->cluster_offset) { + /* nothing to do */ + } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) { + /* nothing to do */ + } else { + if (s->crypt_method) { + encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf, + acb->n, 0, + &s->aes_decrypt_key); + } + } + + acb->nb_sectors -= acb->n; + acb->sector_num += acb->n; + acb->buf += acb->n * 512; + + if (acb->nb_sectors == 0) { + /* request completed */ + ret = 0; + goto done; + } + + /* prepare next AIO request */ + acb->n = acb->nb_sectors; + acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, &acb->n); + index_in_cluster = acb->sector_num & (s->cluster_sectors - 1); + + if (!acb->cluster_offset) { + if (bs->backing_hd) { + /* read from the base image */ + n1 = backing_read1(bs->backing_hd, acb->sector_num, + acb->buf, acb->n); + if (n1 > 0) { + acb->hd_iov.iov_base = (void *)acb->buf; + acb->hd_iov.iov_len = acb->n * 512; + qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1); + acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num, + &acb->hd_qiov, acb->n, + qcow_aio_read_cb, acb); + if (acb->hd_aiocb == NULL) + goto done; + } else { + ret = qcow_schedule_bh(qcow_aio_read_bh, acb); + if (ret < 0) + goto done; + } + } else { + /* Note: in this case, no need to wait */ + memset(acb->buf, 0, 512 * acb->n); + ret = qcow_schedule_bh(qcow_aio_read_bh, acb); + if (ret < 0) + goto done; + } + } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) { + /* add AIO support for compressed blocks ? */ + if (decompress_cluster(s, acb->cluster_offset) < 0) + goto done; + memcpy(acb->buf, + s->cluster_cache + index_in_cluster * 512, 512 * acb->n); + ret = qcow_schedule_bh(qcow_aio_read_bh, acb); + if (ret < 0) + goto done; + } else { + if ((acb->cluster_offset & 511) != 0) { + ret = -EIO; + goto done; + } + + acb->hd_iov.iov_base = (void *)acb->buf; + acb->hd_iov.iov_len = acb->n * 512; + qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1); + acb->hd_aiocb = bdrv_aio_readv(s->hd, + (acb->cluster_offset >> 9) + index_in_cluster, + &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb); + if (acb->hd_aiocb == NULL) + goto done; + } + + return; +done: + if (acb->qiov->niov > 1) { + qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size); + qemu_vfree(acb->orig_buf); + } + acb->common.cb(acb->common.opaque, ret); + qemu_aio_release(acb); +} + +static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque, int is_write) +{ + QCowAIOCB *acb; + + acb = qemu_aio_get(bs, cb, opaque); + if (!acb) + return NULL; + acb->hd_aiocb = NULL; + acb->sector_num = sector_num; + acb->qiov = qiov; + if (qiov->niov > 1) { + acb->buf = acb->orig_buf = qemu_blockalign(bs, qiov->size); + if (is_write) + qemu_iovec_to_buffer(qiov, acb->buf); + } else { + acb->buf = (uint8_t *)qiov->iov->iov_base; + } + acb->nb_sectors = nb_sectors; + acb->n = 0; + acb->cluster_offset = 0; + acb->l2meta.nb_clusters = 0; + return acb; +} + +static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + QCowAIOCB *acb; + + acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); + if (!acb) + return NULL; + + qcow_aio_read_cb(acb, 0); + return &acb->common; +} + +static void qcow_aio_write_cb(void *opaque, int ret) +{ + QCowAIOCB *acb = opaque; + BlockDriverState *bs = acb->common.bs; + BDRVQcowState *s = bs->opaque; + int index_in_cluster; + const uint8_t *src_buf; + int n_end; + + acb->hd_aiocb = NULL; + + if (ret < 0) + goto done; + + if (alloc_cluster_link_l2(bs, acb->cluster_offset, &acb->l2meta) < 0) { + free_any_clusters(bs, acb->cluster_offset, acb->l2meta.nb_clusters); + goto done; + } + + acb->nb_sectors -= acb->n; + acb->sector_num += acb->n; + acb->buf += acb->n * 512; + + if (acb->nb_sectors == 0) { + /* request completed */ + ret = 0; + goto done; + } + + index_in_cluster = acb->sector_num & (s->cluster_sectors - 1); + n_end = index_in_cluster + acb->nb_sectors; + if (s->crypt_method && + n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) + n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors; + + acb->cluster_offset = alloc_cluster_offset(bs, acb->sector_num << 9, + index_in_cluster, + n_end, &acb->n, &acb->l2meta); + if (!acb->cluster_offset || (acb->cluster_offset & 511) != 0) { + ret = -EIO; + goto done; + } + if (s->crypt_method) { + if (!acb->cluster_data) { + acb->cluster_data = qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS * + s->cluster_size); + } + encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf, + acb->n, 1, &s->aes_encrypt_key); + src_buf = acb->cluster_data; + } else { + src_buf = acb->buf; + } + acb->hd_iov.iov_base = (void *)src_buf; + acb->hd_iov.iov_len = acb->n * 512; + qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1); + acb->hd_aiocb = bdrv_aio_writev(s->hd, + (acb->cluster_offset >> 9) + index_in_cluster, + &acb->hd_qiov, acb->n, + qcow_aio_write_cb, acb); + if (acb->hd_aiocb == NULL) + goto done; + + return; + +done: + if (acb->qiov->niov > 1) + qemu_vfree(acb->orig_buf); + acb->common.cb(acb->common.opaque, ret); + qemu_aio_release(acb); +} + +static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BDRVQcowState *s = bs->opaque; + QCowAIOCB *acb; + + s->cluster_cache_offset = -1; /* disable compressed cache */ + + acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); + if (!acb) + return NULL; + + qcow_aio_write_cb(acb, 0); + return &acb->common; +} + +static void qcow_aio_cancel(BlockDriverAIOCB *blockacb) +{ + QCowAIOCB *acb = (QCowAIOCB *)blockacb; + if (acb->hd_aiocb) + bdrv_aio_cancel(acb->hd_aiocb); + qemu_aio_release(acb); +} + +static void qcow_close(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + qemu_free(s->l1_table); + qemu_free(s->l2_cache); + qemu_free(s->cluster_cache); + qemu_free(s->cluster_data); + refcount_close(bs); + bdrv_delete(s->hd); +} + +/* XXX: use std qcow open function ? */ +typedef struct QCowCreateState { + int cluster_size; + int cluster_bits; + uint16_t *refcount_block; + uint64_t *refcount_table; + int64_t l1_table_offset; + int64_t refcount_table_offset; + int64_t refcount_block_offset; +} QCowCreateState; + +static void create_refcount_update(QCowCreateState *s, + int64_t offset, int64_t size) +{ + int refcount; + int64_t start, last, cluster_offset; + uint16_t *p; + + start = offset & ~(s->cluster_size - 1); + last = (offset + size - 1) & ~(s->cluster_size - 1); + for(cluster_offset = start; cluster_offset <= last; + cluster_offset += s->cluster_size) { + p = &s->refcount_block[cluster_offset >> s->cluster_bits]; + refcount = be16_to_cpu(*p); + refcount++; + *p = cpu_to_be16(refcount); + } +} + +static int qcow_create2(const char *filename, int64_t total_size, + const char *backing_file, const char *backing_format, + int flags) +{ + + int fd, header_size, backing_filename_len, l1_size, i, shift, l2_bits; + int ref_clusters, backing_format_len = 0; + QCowHeader header; + uint64_t tmp, offset; + QCowCreateState s1, *s = &s1; + QCowExtension ext_bf = {0, 0}; + + + memset(s, 0, sizeof(*s)); + + fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); + if (fd < 0) + return -1; + memset(&header, 0, sizeof(header)); + header.magic = cpu_to_be32(QCOW_MAGIC); + header.version = cpu_to_be32(QCOW_VERSION); + header.size = cpu_to_be64(total_size * 512); + header_size = sizeof(header); + backing_filename_len = 0; + if (backing_file) { + if (backing_format) { + ext_bf.magic = QCOW_EXT_MAGIC_BACKING_FORMAT; + backing_format_len = strlen(backing_format); + ext_bf.len = (backing_format_len + 7) & ~7; + header_size += ((sizeof(ext_bf) + ext_bf.len + 7) & ~7); + } + header.backing_file_offset = cpu_to_be64(header_size); + backing_filename_len = strlen(backing_file); + header.backing_file_size = cpu_to_be32(backing_filename_len); + header_size += backing_filename_len; + } + s->cluster_bits = 12; /* 4 KB clusters */ + s->cluster_size = 1 << s->cluster_bits; + header.cluster_bits = cpu_to_be32(s->cluster_bits); + header_size = (header_size + 7) & ~7; + if (flags & BLOCK_FLAG_ENCRYPT) { + header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES); + } else { + header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); + } + l2_bits = s->cluster_bits - 3; + shift = s->cluster_bits + l2_bits; + l1_size = (((total_size * 512) + (1LL << shift) - 1) >> shift); + offset = align_offset(header_size, s->cluster_size); + s->l1_table_offset = offset; + header.l1_table_offset = cpu_to_be64(s->l1_table_offset); + header.l1_size = cpu_to_be32(l1_size); + offset += align_offset(l1_size * sizeof(uint64_t), s->cluster_size); + + s->refcount_table = qemu_mallocz(s->cluster_size); + + s->refcount_table_offset = offset; + header.refcount_table_offset = cpu_to_be64(offset); + header.refcount_table_clusters = cpu_to_be32(1); + offset += s->cluster_size; + s->refcount_block_offset = offset; + + /* count how many refcount blocks needed */ + tmp = offset >> s->cluster_bits; + ref_clusters = (tmp >> (s->cluster_bits - REFCOUNT_SHIFT)) + 1; + for (i=0; i < ref_clusters; i++) { + s->refcount_table[i] = cpu_to_be64(offset); + offset += s->cluster_size; + } + + s->refcount_block = qemu_mallocz(ref_clusters * s->cluster_size); + + /* update refcounts */ + create_refcount_update(s, 0, header_size); + create_refcount_update(s, s->l1_table_offset, l1_size * sizeof(uint64_t)); + create_refcount_update(s, s->refcount_table_offset, s->cluster_size); + create_refcount_update(s, s->refcount_block_offset, ref_clusters * s->cluster_size); + + /* write all the data */ + write(fd, &header, sizeof(header)); + if (backing_file) { + if (backing_format_len) { + char zero[16]; + int d = ext_bf.len - backing_format_len; + + memset(zero, 0, sizeof(zero)); + cpu_to_be32s(&ext_bf.magic); + cpu_to_be32s(&ext_bf.len); + write(fd, &ext_bf, sizeof(ext_bf)); + write(fd, backing_format, backing_format_len); + if (d>0) { + write(fd, zero, d); + } + } + write(fd, backing_file, backing_filename_len); + } + lseek(fd, s->l1_table_offset, SEEK_SET); + tmp = 0; + for(i = 0;i < l1_size; i++) { + write(fd, &tmp, sizeof(tmp)); + } + lseek(fd, s->refcount_table_offset, SEEK_SET); + write(fd, s->refcount_table, s->cluster_size); + + lseek(fd, s->refcount_block_offset, SEEK_SET); + write(fd, s->refcount_block, ref_clusters * s->cluster_size); + + qemu_free(s->refcount_table); + qemu_free(s->refcount_block); + close(fd); + return 0; +} + +static int qcow_create(const char *filename, int64_t total_size, + const char *backing_file, int flags) +{ + return qcow_create2(filename, total_size, backing_file, NULL, flags); +} + +static int qcow_make_empty(BlockDriverState *bs) +{ +#if 0 + /* XXX: not correct */ + BDRVQcowState *s = bs->opaque; + uint32_t l1_length = s->l1_size * sizeof(uint64_t); + int ret; + + memset(s->l1_table, 0, l1_length); + if (bdrv_pwrite(s->hd, s->l1_table_offset, s->l1_table, l1_length) < 0) + return -1; + ret = bdrv_truncate(s->hd, s->l1_table_offset + l1_length); + if (ret < 0) + return ret; + + l2_cache_reset(bs); +#endif + return 0; +} + +/* XXX: put compressed sectors first, then all the cluster aligned + tables to avoid losing bytes in alignment */ +static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVQcowState *s = bs->opaque; + z_stream strm; + int ret, out_len; + uint8_t *out_buf; + uint64_t cluster_offset; + + if (nb_sectors == 0) { + /* align end of file to a sector boundary to ease reading with + sector based I/Os */ + cluster_offset = bdrv_getlength(s->hd); + cluster_offset = (cluster_offset + 511) & ~511; + bdrv_truncate(s->hd, cluster_offset); + return 0; + } + + if (nb_sectors != s->cluster_sectors) + return -EINVAL; + + out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); + + /* best compression, small window, no zlib header */ + memset(&strm, 0, sizeof(strm)); + ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, + Z_DEFLATED, -12, + 9, Z_DEFAULT_STRATEGY); + if (ret != 0) { + qemu_free(out_buf); + return -1; + } + + strm.avail_in = s->cluster_size; + strm.next_in = (uint8_t *)buf; + strm.avail_out = s->cluster_size; + strm.next_out = out_buf; + + ret = deflate(&strm, Z_FINISH); + if (ret != Z_STREAM_END && ret != Z_OK) { + qemu_free(out_buf); + deflateEnd(&strm); + return -1; + } + out_len = strm.next_out - out_buf; + + deflateEnd(&strm); + + if (ret != Z_STREAM_END || out_len >= s->cluster_size) { + /* could not compress: write normal cluster */ + qcow_write(bs, sector_num, buf, s->cluster_sectors); + } else { + cluster_offset = alloc_compressed_cluster_offset(bs, sector_num << 9, + out_len); + if (!cluster_offset) + return -1; + cluster_offset &= s->cluster_offset_mask; + if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len) { + qemu_free(out_buf); + return -1; + } + } + + qemu_free(out_buf); + return 0; +} + +static void qcow_flush(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + bdrv_flush(s->hd); +} + +static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVQcowState *s = bs->opaque; + bdi->cluster_size = s->cluster_size; + bdi->vm_state_offset = (int64_t)s->l1_vm_state_index << + (s->cluster_bits + s->l2_bits); + return 0; +} + +/*********************************************************/ +/* snapshot support */ + +/* update the refcounts of snapshots and the copied flag */ +static int update_snapshot_refcount(BlockDriverState *bs, + int64_t l1_table_offset, + int l1_size, + int addend) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated; + int64_t old_offset, old_l2_offset; + int l2_size, i, j, l1_modified, l2_modified, nb_csectors, refcount; + + l2_cache_reset(bs); + + l2_table = NULL; + l1_table = NULL; + l1_size2 = l1_size * sizeof(uint64_t); + l1_allocated = 0; + if (l1_table_offset != s->l1_table_offset) { + l1_table = qemu_malloc(l1_size2); + l1_allocated = 1; + if (bdrv_pread(s->hd, l1_table_offset, + l1_table, l1_size2) != l1_size2) + goto fail; + for(i = 0;i < l1_size; i++) + be64_to_cpus(&l1_table[i]); + } else { + assert(l1_size == s->l1_size); + l1_table = s->l1_table; + l1_allocated = 0; + } + + l2_size = s->l2_size * sizeof(uint64_t); + l2_table = qemu_malloc(l2_size); + l1_modified = 0; + for(i = 0; i < l1_size; i++) { + l2_offset = l1_table[i]; + if (l2_offset) { + old_l2_offset = l2_offset; + l2_offset &= ~QCOW_OFLAG_COPIED; + l2_modified = 0; + if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size) + goto fail; + for(j = 0; j < s->l2_size; j++) { + offset = be64_to_cpu(l2_table[j]); + if (offset != 0) { + old_offset = offset; + offset &= ~QCOW_OFLAG_COPIED; + if (offset & QCOW_OFLAG_COMPRESSED) { + nb_csectors = ((offset >> s->csize_shift) & + s->csize_mask) + 1; + if (addend != 0) + update_refcount(bs, (offset & s->cluster_offset_mask) & ~511, + nb_csectors * 512, addend); + /* compressed clusters are never modified */ + refcount = 2; + } else { + if (addend != 0) { + refcount = update_cluster_refcount(bs, offset >> s->cluster_bits, addend); + } else { + refcount = get_refcount(bs, offset >> s->cluster_bits); + } + } + + if (refcount == 1) { + offset |= QCOW_OFLAG_COPIED; + } + if (offset != old_offset) { + l2_table[j] = cpu_to_be64(offset); + l2_modified = 1; + } + } + } + if (l2_modified) { + if (bdrv_pwrite(s->hd, + l2_offset, l2_table, l2_size) != l2_size) + goto fail; + } + + if (addend != 0) { + refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend); + } else { + refcount = get_refcount(bs, l2_offset >> s->cluster_bits); + } + if (refcount == 1) { + l2_offset |= QCOW_OFLAG_COPIED; + } + if (l2_offset != old_l2_offset) { + l1_table[i] = l2_offset; + l1_modified = 1; + } + } + } + if (l1_modified) { + for(i = 0; i < l1_size; i++) + cpu_to_be64s(&l1_table[i]); + if (bdrv_pwrite(s->hd, l1_table_offset, l1_table, + l1_size2) != l1_size2) + goto fail; + for(i = 0; i < l1_size; i++) + be64_to_cpus(&l1_table[i]); + } + if (l1_allocated) + qemu_free(l1_table); + qemu_free(l2_table); + return 0; + fail: + if (l1_allocated) + qemu_free(l1_table); + qemu_free(l2_table); + return -EIO; +} + +static void qcow_free_snapshots(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + int i; + + for(i = 0; i < s->nb_snapshots; i++) { + qemu_free(s->snapshots[i].name); + qemu_free(s->snapshots[i].id_str); + } + qemu_free(s->snapshots); + s->snapshots = NULL; + s->nb_snapshots = 0; +} + +static int qcow_read_snapshots(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshotHeader h; + QCowSnapshot *sn; + int i, id_str_size, name_size; + int64_t offset; + uint32_t extra_data_size; + + if (!s->nb_snapshots) { + s->snapshots = NULL; + s->snapshots_size = 0; + return 0; + } + + offset = s->snapshots_offset; + s->snapshots = qemu_mallocz(s->nb_snapshots * sizeof(QCowSnapshot)); + for(i = 0; i < s->nb_snapshots; i++) { + offset = align_offset(offset, 8); + if (bdrv_pread(s->hd, offset, &h, sizeof(h)) != sizeof(h)) + goto fail; + offset += sizeof(h); + sn = s->snapshots + i; + sn->l1_table_offset = be64_to_cpu(h.l1_table_offset); + sn->l1_size = be32_to_cpu(h.l1_size); + sn->vm_state_size = be32_to_cpu(h.vm_state_size); + sn->date_sec = be32_to_cpu(h.date_sec); + sn->date_nsec = be32_to_cpu(h.date_nsec); + sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec); + extra_data_size = be32_to_cpu(h.extra_data_size); + + id_str_size = be16_to_cpu(h.id_str_size); + name_size = be16_to_cpu(h.name_size); + + offset += extra_data_size; + + sn->id_str = qemu_malloc(id_str_size + 1); + if (bdrv_pread(s->hd, offset, sn->id_str, id_str_size) != id_str_size) + goto fail; + offset += id_str_size; + sn->id_str[id_str_size] = '\0'; + + sn->name = qemu_malloc(name_size + 1); + if (bdrv_pread(s->hd, offset, sn->name, name_size) != name_size) + goto fail; + offset += name_size; + sn->name[name_size] = '\0'; + } + s->snapshots_size = offset - s->snapshots_offset; + return 0; + fail: + qcow_free_snapshots(bs); + return -1; +} + +/* add at the end of the file a new list of snapshots */ +static int qcow_write_snapshots(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshot *sn; + QCowSnapshotHeader h; + int i, name_size, id_str_size, snapshots_size; + uint64_t data64; + uint32_t data32; + int64_t offset, snapshots_offset; + + /* compute the size of the snapshots */ + offset = 0; + for(i = 0; i < s->nb_snapshots; i++) { + sn = s->snapshots + i; + offset = align_offset(offset, 8); + offset += sizeof(h); + offset += strlen(sn->id_str); + offset += strlen(sn->name); + } + snapshots_size = offset; + + snapshots_offset = alloc_clusters(bs, snapshots_size); + offset = snapshots_offset; + + for(i = 0; i < s->nb_snapshots; i++) { + sn = s->snapshots + i; + memset(&h, 0, sizeof(h)); + h.l1_table_offset = cpu_to_be64(sn->l1_table_offset); + h.l1_size = cpu_to_be32(sn->l1_size); + h.vm_state_size = cpu_to_be32(sn->vm_state_size); + h.date_sec = cpu_to_be32(sn->date_sec); + h.date_nsec = cpu_to_be32(sn->date_nsec); + h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec); + + id_str_size = strlen(sn->id_str); + name_size = strlen(sn->name); + h.id_str_size = cpu_to_be16(id_str_size); + h.name_size = cpu_to_be16(name_size); + offset = align_offset(offset, 8); + if (bdrv_pwrite(s->hd, offset, &h, sizeof(h)) != sizeof(h)) + goto fail; + offset += sizeof(h); + if (bdrv_pwrite(s->hd, offset, sn->id_str, id_str_size) != id_str_size) + goto fail; + offset += id_str_size; + if (bdrv_pwrite(s->hd, offset, sn->name, name_size) != name_size) + goto fail; + offset += name_size; + } + + /* update the various header fields */ + data64 = cpu_to_be64(snapshots_offset); + if (bdrv_pwrite(s->hd, offsetof(QCowHeader, snapshots_offset), + &data64, sizeof(data64)) != sizeof(data64)) + goto fail; + data32 = cpu_to_be32(s->nb_snapshots); + if (bdrv_pwrite(s->hd, offsetof(QCowHeader, nb_snapshots), + &data32, sizeof(data32)) != sizeof(data32)) + goto fail; + + /* free the old snapshot table */ + free_clusters(bs, s->snapshots_offset, s->snapshots_size); + s->snapshots_offset = snapshots_offset; + s->snapshots_size = snapshots_size; + return 0; + fail: + return -1; +} + +static void find_new_snapshot_id(BlockDriverState *bs, + char *id_str, int id_str_size) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshot *sn; + int i, id, id_max = 0; + + for(i = 0; i < s->nb_snapshots; i++) { + sn = s->snapshots + i; + id = strtoul(sn->id_str, NULL, 10); + if (id > id_max) + id_max = id; + } + snprintf(id_str, id_str_size, "%d", id_max + 1); +} + +static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str) +{ + BDRVQcowState *s = bs->opaque; + int i; + + for(i = 0; i < s->nb_snapshots; i++) { + if (!strcmp(s->snapshots[i].id_str, id_str)) + return i; + } + return -1; +} + +static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name) +{ + BDRVQcowState *s = bs->opaque; + int i, ret; + + ret = find_snapshot_by_id(bs, name); + if (ret >= 0) + return ret; + for(i = 0; i < s->nb_snapshots; i++) { + if (!strcmp(s->snapshots[i].name, name)) + return i; + } + return -1; +} + +/* if no id is provided, a new one is constructed */ +static int qcow_snapshot_create(BlockDriverState *bs, + QEMUSnapshotInfo *sn_info) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshot *snapshots1, sn1, *sn = &sn1; + int i, ret; + uint64_t *l1_table = NULL; + + memset(sn, 0, sizeof(*sn)); + + if (sn_info->id_str[0] == '\0') { + /* compute a new id */ + find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str)); + } + + /* check that the ID is unique */ + if (find_snapshot_by_id(bs, sn_info->id_str) >= 0) + return -ENOENT; + + sn->id_str = qemu_strdup(sn_info->id_str); + if (!sn->id_str) + goto fail; + sn->name = qemu_strdup(sn_info->name); + if (!sn->name) + goto fail; + sn->vm_state_size = sn_info->vm_state_size; + sn->date_sec = sn_info->date_sec; + sn->date_nsec = sn_info->date_nsec; + sn->vm_clock_nsec = sn_info->vm_clock_nsec; + + ret = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1); + if (ret < 0) + goto fail; + + /* create the L1 table of the snapshot */ + sn->l1_table_offset = alloc_clusters(bs, s->l1_size * sizeof(uint64_t)); + sn->l1_size = s->l1_size; + + l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t)); + for(i = 0; i < s->l1_size; i++) { + l1_table[i] = cpu_to_be64(s->l1_table[i]); + } + if (bdrv_pwrite(s->hd, sn->l1_table_offset, + l1_table, s->l1_size * sizeof(uint64_t)) != + (s->l1_size * sizeof(uint64_t))) + goto fail; + qemu_free(l1_table); + l1_table = NULL; + + snapshots1 = qemu_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot)); + if (s->snapshots) { + memcpy(snapshots1, s->snapshots, s->nb_snapshots * sizeof(QCowSnapshot)); + qemu_free(s->snapshots); + } + s->snapshots = snapshots1; + s->snapshots[s->nb_snapshots++] = *sn; + + if (qcow_write_snapshots(bs) < 0) + goto fail; +#ifdef DEBUG_ALLOC + check_refcounts(bs); +#endif + return 0; + fail: + qemu_free(sn->name); + qemu_free(l1_table); + return -1; +} + +/* copy the snapshot 'snapshot_name' into the current disk image */ +static int qcow_snapshot_goto(BlockDriverState *bs, + const char *snapshot_id) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshot *sn; + int i, snapshot_index, l1_size2; + + snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id); + if (snapshot_index < 0) + return -ENOENT; + sn = &s->snapshots[snapshot_index]; + + if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, -1) < 0) + goto fail; + + if (grow_l1_table(bs, sn->l1_size) < 0) + goto fail; + + s->l1_size = sn->l1_size; + l1_size2 = s->l1_size * sizeof(uint64_t); + /* copy the snapshot l1 table to the current l1 table */ + if (bdrv_pread(s->hd, sn->l1_table_offset, + s->l1_table, l1_size2) != l1_size2) + goto fail; + if (bdrv_pwrite(s->hd, s->l1_table_offset, + s->l1_table, l1_size2) != l1_size2) + goto fail; + for(i = 0;i < s->l1_size; i++) { + be64_to_cpus(&s->l1_table[i]); + } + + if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1) < 0) + goto fail; + +#ifdef DEBUG_ALLOC + check_refcounts(bs); +#endif + return 0; + fail: + return -EIO; +} + +static int qcow_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) +{ + BDRVQcowState *s = bs->opaque; + QCowSnapshot *sn; + int snapshot_index, ret; + + snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id); + if (snapshot_index < 0) + return -ENOENT; + sn = &s->snapshots[snapshot_index]; + + ret = update_snapshot_refcount(bs, sn->l1_table_offset, sn->l1_size, -1); + if (ret < 0) + return ret; + /* must update the copied flag on the current cluster offsets */ + ret = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); + if (ret < 0) + return ret; + free_clusters(bs, sn->l1_table_offset, sn->l1_size * sizeof(uint64_t)); + + qemu_free(sn->id_str); + qemu_free(sn->name); + memmove(sn, sn + 1, (s->nb_snapshots - snapshot_index - 1) * sizeof(*sn)); + s->nb_snapshots--; + ret = qcow_write_snapshots(bs); + if (ret < 0) { + /* XXX: restore snapshot if error ? */ + return ret; + } +#ifdef DEBUG_ALLOC + check_refcounts(bs); +#endif + return 0; +} + +static int qcow_snapshot_list(BlockDriverState *bs, + QEMUSnapshotInfo **psn_tab) +{ + BDRVQcowState *s = bs->opaque; + QEMUSnapshotInfo *sn_tab, *sn_info; + QCowSnapshot *sn; + int i; + + sn_tab = qemu_mallocz(s->nb_snapshots * sizeof(QEMUSnapshotInfo)); + for(i = 0; i < s->nb_snapshots; i++) { + sn_info = sn_tab + i; + sn = s->snapshots + i; + pstrcpy(sn_info->id_str, sizeof(sn_info->id_str), + sn->id_str); + pstrcpy(sn_info->name, sizeof(sn_info->name), + sn->name); + sn_info->vm_state_size = sn->vm_state_size; + sn_info->date_sec = sn->date_sec; + sn_info->date_nsec = sn->date_nsec; + sn_info->vm_clock_nsec = sn->vm_clock_nsec; + } + *psn_tab = sn_tab; + return s->nb_snapshots; +} + +/*********************************************************/ +/* refcount handling */ + +static int refcount_init(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + int ret, refcount_table_size2, i; + + s->refcount_block_cache = qemu_malloc(s->cluster_size); + refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t); + s->refcount_table = qemu_malloc(refcount_table_size2); + if (s->refcount_table_size > 0) { + ret = bdrv_pread(s->hd, s->refcount_table_offset, + s->refcount_table, refcount_table_size2); + if (ret != refcount_table_size2) + goto fail; + for(i = 0; i < s->refcount_table_size; i++) + be64_to_cpus(&s->refcount_table[i]); + } + return 0; + fail: + return -ENOMEM; +} + +static void refcount_close(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + qemu_free(s->refcount_block_cache); + qemu_free(s->refcount_table); +} + + +static int load_refcount_block(BlockDriverState *bs, + int64_t refcount_block_offset) +{ + BDRVQcowState *s = bs->opaque; + int ret; + ret = bdrv_pread(s->hd, refcount_block_offset, s->refcount_block_cache, + s->cluster_size); + if (ret != s->cluster_size) + return -EIO; + s->refcount_block_cache_offset = refcount_block_offset; + return 0; +} + +static int get_refcount(BlockDriverState *bs, int64_t cluster_index) +{ + BDRVQcowState *s = bs->opaque; + int refcount_table_index, block_index; + int64_t refcount_block_offset; + + refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT); + if (refcount_table_index >= s->refcount_table_size) + return 0; + refcount_block_offset = s->refcount_table[refcount_table_index]; + if (!refcount_block_offset) + return 0; + if (refcount_block_offset != s->refcount_block_cache_offset) { + /* better than nothing: return allocated if read error */ + if (load_refcount_block(bs, refcount_block_offset) < 0) + return 1; + } + block_index = cluster_index & + ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1); + return be16_to_cpu(s->refcount_block_cache[block_index]); +} + +/* return < 0 if error */ +static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size) +{ + BDRVQcowState *s = bs->opaque; + int i, nb_clusters; + + nb_clusters = size_to_clusters(s, size); +retry: + for(i = 0; i < nb_clusters; i++) { + int64_t i = s->free_cluster_index++; + if (get_refcount(bs, i) != 0) + goto retry; + } +#ifdef DEBUG_ALLOC2 + printf("alloc_clusters: size=%lld -> %lld\n", + size, + (s->free_cluster_index - nb_clusters) << s->cluster_bits); +#endif + return (s->free_cluster_index - nb_clusters) << s->cluster_bits; +} + +static int64_t alloc_clusters(BlockDriverState *bs, int64_t size) +{ + int64_t offset; + + offset = alloc_clusters_noref(bs, size); + update_refcount(bs, offset, size, 1); + return offset; +} + +/* only used to allocate compressed sectors. We try to allocate + contiguous sectors. size must be <= cluster_size */ +static int64_t alloc_bytes(BlockDriverState *bs, int size) +{ + BDRVQcowState *s = bs->opaque; + int64_t offset, cluster_offset; + int free_in_cluster; + + assert(size > 0 && size <= s->cluster_size); + if (s->free_byte_offset == 0) { + s->free_byte_offset = alloc_clusters(bs, s->cluster_size); + } + redo: + free_in_cluster = s->cluster_size - + (s->free_byte_offset & (s->cluster_size - 1)); + if (size <= free_in_cluster) { + /* enough space in current cluster */ + offset = s->free_byte_offset; + s->free_byte_offset += size; + free_in_cluster -= size; + if (free_in_cluster == 0) + s->free_byte_offset = 0; + if ((offset & (s->cluster_size - 1)) != 0) + update_cluster_refcount(bs, offset >> s->cluster_bits, 1); + } else { + offset = alloc_clusters(bs, s->cluster_size); + cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1); + if ((cluster_offset + s->cluster_size) == offset) { + /* we are lucky: contiguous data */ + offset = s->free_byte_offset; + update_cluster_refcount(bs, offset >> s->cluster_bits, 1); + s->free_byte_offset += size; + } else { + s->free_byte_offset = offset; + goto redo; + } + } + return offset; +} + +static void free_clusters(BlockDriverState *bs, + int64_t offset, int64_t size) +{ + update_refcount(bs, offset, size, -1); +} + +static int grow_refcount_table(BlockDriverState *bs, int min_size) +{ + BDRVQcowState *s = bs->opaque; + int new_table_size, new_table_size2, refcount_table_clusters, i, ret; + uint64_t *new_table; + int64_t table_offset; + uint8_t data[12]; + int old_table_size; + int64_t old_table_offset; + + if (min_size <= s->refcount_table_size) + return 0; + /* compute new table size */ + refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3); + for(;;) { + if (refcount_table_clusters == 0) { + refcount_table_clusters = 1; + } else { + refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2; + } + new_table_size = refcount_table_clusters << (s->cluster_bits - 3); + if (min_size <= new_table_size) + break; + } +#ifdef DEBUG_ALLOC2 + printf("grow_refcount_table from %d to %d\n", + s->refcount_table_size, + new_table_size); +#endif + new_table_size2 = new_table_size * sizeof(uint64_t); + new_table = qemu_mallocz(new_table_size2); + memcpy(new_table, s->refcount_table, + s->refcount_table_size * sizeof(uint64_t)); + for(i = 0; i < s->refcount_table_size; i++) + cpu_to_be64s(&new_table[i]); + /* Note: we cannot update the refcount now to avoid recursion */ + table_offset = alloc_clusters_noref(bs, new_table_size2); + ret = bdrv_pwrite(s->hd, table_offset, new_table, new_table_size2); + if (ret != new_table_size2) + goto fail; + for(i = 0; i < s->refcount_table_size; i++) + be64_to_cpus(&new_table[i]); + + cpu_to_be64w((uint64_t*)data, table_offset); + cpu_to_be32w((uint32_t*)(data + 8), refcount_table_clusters); + if (bdrv_pwrite(s->hd, offsetof(QCowHeader, refcount_table_offset), + data, sizeof(data)) != sizeof(data)) + goto fail; + qemu_free(s->refcount_table); + old_table_offset = s->refcount_table_offset; + old_table_size = s->refcount_table_size; + s->refcount_table = new_table; + s->refcount_table_size = new_table_size; + s->refcount_table_offset = table_offset; + + update_refcount(bs, table_offset, new_table_size2, 1); + free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t)); + return 0; + fail: + free_clusters(bs, table_offset, new_table_size2); + qemu_free(new_table); + return -EIO; +} + +/* addend must be 1 or -1 */ +/* XXX: cache several refcount block clusters ? */ +static int update_cluster_refcount(BlockDriverState *bs, + int64_t cluster_index, + int addend) +{ + BDRVQcowState *s = bs->opaque; + int64_t offset, refcount_block_offset; + int ret, refcount_table_index, block_index, refcount; + uint64_t data64; + + refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT); + if (refcount_table_index >= s->refcount_table_size) { + if (addend < 0) + return -EINVAL; + ret = grow_refcount_table(bs, refcount_table_index + 1); + if (ret < 0) + return ret; + } + refcount_block_offset = s->refcount_table[refcount_table_index]; + if (!refcount_block_offset) { + if (addend < 0) + return -EINVAL; + /* create a new refcount block */ + /* Note: we cannot update the refcount now to avoid recursion */ + offset = alloc_clusters_noref(bs, s->cluster_size); + memset(s->refcount_block_cache, 0, s->cluster_size); + ret = bdrv_pwrite(s->hd, offset, s->refcount_block_cache, s->cluster_size); + if (ret != s->cluster_size) + return -EINVAL; + s->refcount_table[refcount_table_index] = offset; + data64 = cpu_to_be64(offset); + ret = bdrv_pwrite(s->hd, s->refcount_table_offset + + refcount_table_index * sizeof(uint64_t), + &data64, sizeof(data64)); + if (ret != sizeof(data64)) + return -EINVAL; + + refcount_block_offset = offset; + s->refcount_block_cache_offset = offset; + update_refcount(bs, offset, s->cluster_size, 1); + } else { + if (refcount_block_offset != s->refcount_block_cache_offset) { + if (load_refcount_block(bs, refcount_block_offset) < 0) + return -EIO; + } + } + /* we can update the count and save it */ + block_index = cluster_index & + ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1); + refcount = be16_to_cpu(s->refcount_block_cache[block_index]); + refcount += addend; + if (refcount < 0 || refcount > 0xffff) + return -EINVAL; + if (refcount == 0 && cluster_index < s->free_cluster_index) { + s->free_cluster_index = cluster_index; + } + s->refcount_block_cache[block_index] = cpu_to_be16(refcount); + if (bdrv_pwrite(s->hd, + refcount_block_offset + (block_index << REFCOUNT_SHIFT), + &s->refcount_block_cache[block_index], 2) != 2) + return -EIO; + return refcount; +} + +static void update_refcount(BlockDriverState *bs, + int64_t offset, int64_t length, + int addend) +{ + BDRVQcowState *s = bs->opaque; + int64_t start, last, cluster_offset; + +#ifdef DEBUG_ALLOC2 + printf("update_refcount: offset=%lld size=%lld addend=%d\n", + offset, length, addend); +#endif + if (length <= 0) + return; + start = offset & ~(s->cluster_size - 1); + last = (offset + length - 1) & ~(s->cluster_size - 1); + for(cluster_offset = start; cluster_offset <= last; + cluster_offset += s->cluster_size) { + update_cluster_refcount(bs, cluster_offset >> s->cluster_bits, addend); + } +} + +/* + * Increases the refcount for a range of clusters in a given refcount table. + * This is used to construct a temporary refcount table out of L1 and L2 tables + * which can be compared the the refcount table saved in the image. + * + * Returns the number of errors in the image that were found + */ +static int inc_refcounts(BlockDriverState *bs, + uint16_t *refcount_table, + int refcount_table_size, + int64_t offset, int64_t size) +{ + BDRVQcowState *s = bs->opaque; + int64_t start, last, cluster_offset; + int k; + int errors = 0; + + if (size <= 0) + return 0; + + start = offset & ~(s->cluster_size - 1); + last = (offset + size - 1) & ~(s->cluster_size - 1); + for(cluster_offset = start; cluster_offset <= last; + cluster_offset += s->cluster_size) { + k = cluster_offset >> s->cluster_bits; + if (k < 0 || k >= refcount_table_size) { + fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n", + cluster_offset); + errors++; + } else { + if (++refcount_table[k] == 0) { + fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64 + "\n", cluster_offset); + errors++; + } + } + } + + return errors; +} + +/* + * Increases the refcount in the given refcount table for the all clusters + * referenced in the L2 table. While doing so, performs some checks on L2 + * entries. + * + * Returns the number of errors found by the checks or -errno if an internal + * error occurred. + */ +static int check_refcounts_l2(BlockDriverState *bs, + uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset, + int check_copied) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l2_table, offset; + int i, l2_size, nb_csectors, refcount; + int errors = 0; + + /* Read L2 table from disk */ + l2_size = s->l2_size * sizeof(uint64_t); + l2_table = qemu_malloc(l2_size); + + if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size) + goto fail; + + /* Do the actual checks */ + for(i = 0; i < s->l2_size; i++) { + offset = be64_to_cpu(l2_table[i]); + if (offset != 0) { + if (offset & QCOW_OFLAG_COMPRESSED) { + /* Compressed clusters don't have QCOW_OFLAG_COPIED */ + if (offset & QCOW_OFLAG_COPIED) { + fprintf(stderr, "ERROR: cluster %" PRId64 ": " + "copied flag must never be set for compressed " + "clusters\n", offset >> s->cluster_bits); + offset &= ~QCOW_OFLAG_COPIED; + errors++; + } + + /* Mark cluster as used */ + nb_csectors = ((offset >> s->csize_shift) & + s->csize_mask) + 1; + offset &= s->cluster_offset_mask; + errors += inc_refcounts(bs, refcount_table, + refcount_table_size, + offset & ~511, nb_csectors * 512); + } else { + /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */ + if (check_copied) { + uint64_t entry = offset; + offset &= ~QCOW_OFLAG_COPIED; + refcount = get_refcount(bs, offset >> s->cluster_bits); + if ((refcount == 1) != ((entry & QCOW_OFLAG_COPIED) != 0)) { + fprintf(stderr, "ERROR OFLAG_COPIED: offset=%" + PRIx64 " refcount=%d\n", entry, refcount); + errors++; + } + } + + /* Mark cluster as used */ + offset &= ~QCOW_OFLAG_COPIED; + errors += inc_refcounts(bs, refcount_table, + refcount_table_size, + offset, s->cluster_size); + + /* Correct offsets are cluster aligned */ + if (offset & (s->cluster_size - 1)) { + fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not " + "properly aligned; L2 entry corrupted.\n", offset); + errors++; + } + } + } + } + + qemu_free(l2_table); + return errors; + +fail: + fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); + qemu_free(l2_table); + return -EIO; +} + +/* + * Increases the refcount for the L1 table, its L2 tables and all referenced + * clusters in the given refcount table. While doing so, performs some checks + * on L1 and L2 entries. + * + * Returns the number of errors found by the checks or -errno if an internal + * error occurred. + */ +static int check_refcounts_l1(BlockDriverState *bs, + uint16_t *refcount_table, + int refcount_table_size, + int64_t l1_table_offset, int l1_size, + int check_copied) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l1_table, l2_offset, l1_size2; + int i, refcount, ret; + int errors = 0; + + l1_size2 = l1_size * sizeof(uint64_t); + + /* Mark L1 table as used */ + errors += inc_refcounts(bs, refcount_table, refcount_table_size, + l1_table_offset, l1_size2); + + /* Read L1 table entries from disk */ + l1_table = qemu_malloc(l1_size2); + if (bdrv_pread(s->hd, l1_table_offset, + l1_table, l1_size2) != l1_size2) + goto fail; + for(i = 0;i < l1_size; i++) + be64_to_cpus(&l1_table[i]); + + /* Do the actual checks */ + for(i = 0; i < l1_size; i++) { + l2_offset = l1_table[i]; + if (l2_offset) { + /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */ + if (check_copied) { + refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED) + >> s->cluster_bits); + if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) { + fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64 + " refcount=%d\n", l2_offset, refcount); + errors++; + } + } + + /* Mark L2 table as used */ + l2_offset &= ~QCOW_OFLAG_COPIED; + errors += inc_refcounts(bs, refcount_table, + refcount_table_size, + l2_offset, + s->cluster_size); + + /* L2 tables are cluster aligned */ + if (l2_offset & (s->cluster_size - 1)) { + fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not " + "cluster aligned; L1 entry corrupted\n", l2_offset); + errors++; + } + + /* Process and check L2 entries */ + ret = check_refcounts_l2(bs, refcount_table, refcount_table_size, + l2_offset, check_copied); + if (ret < 0) { + goto fail; + } + errors += ret; + } + } + qemu_free(l1_table); + return errors; + +fail: + fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); + qemu_free(l1_table); + return -EIO; +} + +/* + * Checks an image for refcount consistency. + * + * Returns 0 if no errors are found, the number of errors in case the image is + * detected as corrupted, and -errno when an internal error occured. + */ +static int check_refcounts(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + int64_t size; + int nb_clusters, refcount1, refcount2, i; + QCowSnapshot *sn; + uint16_t *refcount_table; + int ret, errors = 0; + + size = bdrv_getlength(s->hd); + nb_clusters = size_to_clusters(s, size); + refcount_table = qemu_mallocz(nb_clusters * sizeof(uint16_t)); + + /* header */ + errors += inc_refcounts(bs, refcount_table, nb_clusters, + 0, s->cluster_size); + + /* current L1 table */ + ret = check_refcounts_l1(bs, refcount_table, nb_clusters, + s->l1_table_offset, s->l1_size, 1); + if (ret < 0) { + return ret; + } + errors += ret; + + /* snapshots */ + for(i = 0; i < s->nb_snapshots; i++) { + sn = s->snapshots + i; + check_refcounts_l1(bs, refcount_table, nb_clusters, + sn->l1_table_offset, sn->l1_size, 0); + } + errors += inc_refcounts(bs, refcount_table, nb_clusters, + s->snapshots_offset, s->snapshots_size); + + /* refcount data */ + errors += inc_refcounts(bs, refcount_table, nb_clusters, + s->refcount_table_offset, + s->refcount_table_size * sizeof(uint64_t)); + for(i = 0; i < s->refcount_table_size; i++) { + int64_t offset; + offset = s->refcount_table[i]; + if (offset != 0) { + errors += inc_refcounts(bs, refcount_table, nb_clusters, + offset, s->cluster_size); + } + } + + /* compare ref counts */ + for(i = 0; i < nb_clusters; i++) { + refcount1 = get_refcount(bs, i); + refcount2 = refcount_table[i]; + if (refcount1 != refcount2) { + fprintf(stderr, "ERROR cluster %d refcount=%d reference=%d\n", + i, refcount1, refcount2); + errors++; + } + } + + qemu_free(refcount_table); + + return errors; +} + +static int qcow_check(BlockDriverState *bs) +{ + return check_refcounts(bs); +} + +#if 0 +static void dump_refcounts(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + int64_t nb_clusters, k, k1, size; + int refcount; + + size = bdrv_getlength(s->hd); + nb_clusters = size_to_clusters(s, size); + for(k = 0; k < nb_clusters;) { + k1 = k; + refcount = get_refcount(bs, k); + k++; + while (k < nb_clusters && get_refcount(bs, k) == refcount) + k++; + printf("%lld: refcount=%d nb=%lld\n", k, refcount, k - k1); + } +} +#endif + +static int qcow_put_buffer(BlockDriverState *bs, const uint8_t *buf, + int64_t pos, int size) +{ + int growable = bs->growable; + + bs->growable = 1; + bdrv_pwrite(bs, pos, buf, size); + bs->growable = growable; + + return size; +} + +static int qcow_get_buffer(BlockDriverState *bs, uint8_t *buf, + int64_t pos, int size) +{ + int growable = bs->growable; + int ret; + + bs->growable = 1; + ret = bdrv_pread(bs, pos, buf, size); + bs->growable = growable; + + return ret; +} + +static BlockDriver bdrv_qcow2 = { + .format_name = "qcow2", + .instance_size = sizeof(BDRVQcowState), + .bdrv_probe = qcow_probe, + .bdrv_open = qcow_open, + .bdrv_close = qcow_close, + .bdrv_create = qcow_create, + .bdrv_flush = qcow_flush, + .bdrv_is_allocated = qcow_is_allocated, + .bdrv_set_key = qcow_set_key, + .bdrv_make_empty = qcow_make_empty, + + .bdrv_aio_readv = qcow_aio_readv, + .bdrv_aio_writev = qcow_aio_writev, + .bdrv_aio_cancel = qcow_aio_cancel, + .aiocb_size = sizeof(QCowAIOCB), + .bdrv_write_compressed = qcow_write_compressed, + + .bdrv_snapshot_create = qcow_snapshot_create, + .bdrv_snapshot_goto = qcow_snapshot_goto, + .bdrv_snapshot_delete = qcow_snapshot_delete, + .bdrv_snapshot_list = qcow_snapshot_list, + .bdrv_get_info = qcow_get_info, + + .bdrv_put_buffer = qcow_put_buffer, + .bdrv_get_buffer = qcow_get_buffer, + + .bdrv_create2 = qcow_create2, + .bdrv_check = qcow_check, +}; + +static void bdrv_qcow2_init(void) +{ + bdrv_register(&bdrv_qcow2); +} + +block_init(bdrv_qcow2_init); diff --git a/block/raw-posix.c b/block/raw-posix.c new file mode 100644 index 0000000000..f3a9476b44 --- /dev/null +++ b/block/raw-posix.c @@ -0,0 +1,1438 @@ +/* + * Block driver for RAW files (posix) + * + * Copyright (c) 2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "qemu-timer.h" +#include "qemu-char.h" +#include "block_int.h" +#include "module.h" +#ifdef CONFIG_AIO +#include "posix-aio-compat.h" +#endif + +#ifdef CONFIG_COCOA +#include <paths.h> +#include <sys/param.h> +#include <IOKit/IOKitLib.h> +#include <IOKit/IOBSD.h> +#include <IOKit/storage/IOMediaBSDClient.h> +#include <IOKit/storage/IOMedia.h> +#include <IOKit/storage/IOCDMedia.h> +//#include <IOKit/storage/IOCDTypes.h> +#include <CoreFoundation/CoreFoundation.h> +#endif + +#ifdef __sun__ +#define _POSIX_PTHREAD_SEMANTICS 1 +#include <signal.h> +#include <sys/dkio.h> +#endif +#ifdef __linux__ +#include <sys/ioctl.h> +#include <linux/cdrom.h> +#include <linux/fd.h> +#endif +#ifdef __FreeBSD__ +#include <signal.h> +#include <sys/disk.h> +#include <sys/cdio.h> +#endif + +#ifdef __OpenBSD__ +#include <sys/ioctl.h> +#include <sys/disklabel.h> +#include <sys/dkio.h> +#endif + +#ifdef __DragonFly__ +#include <sys/ioctl.h> +#include <sys/diskslice.h> +#endif + +//#define DEBUG_FLOPPY + +//#define DEBUG_BLOCK +#if defined(DEBUG_BLOCK) +#define DEBUG_BLOCK_PRINT(formatCstr, ...) do { if (qemu_log_enabled()) \ + { qemu_log(formatCstr, ## __VA_ARGS__); qemu_log_flush(); } } while (0) +#else +#define DEBUG_BLOCK_PRINT(formatCstr, ...) +#endif + +/* OS X does not have O_DSYNC */ +#ifndef O_DSYNC +#define O_DSYNC O_SYNC +#endif + +/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */ +#ifndef O_DIRECT +#define O_DIRECT O_DSYNC +#endif + +#define FTYPE_FILE 0 +#define FTYPE_CD 1 +#define FTYPE_FD 2 + +#define ALIGNED_BUFFER_SIZE (32 * 512) + +/* if the FD is not accessed during that time (in ms), we try to + reopen it to see if the disk has been changed */ +#define FD_OPEN_TIMEOUT 1000 + +typedef struct BDRVRawState { + int fd; + int type; + unsigned int lseek_err_cnt; +#if defined(__linux__) + /* linux floppy specific */ + int fd_open_flags; + int64_t fd_open_time; + int64_t fd_error_time; + int fd_got_error; + int fd_media_changed; +#endif +#if defined(__FreeBSD__) + int cd_open_flags; +#endif + uint8_t* aligned_buf; +} BDRVRawState; + +static int posix_aio_init(void); + +static int fd_open(BlockDriverState *bs); + +#if defined(__FreeBSD__) +static int cd_open(BlockDriverState *bs); +#endif + +static int raw_is_inserted(BlockDriverState *bs); + +static int raw_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVRawState *s = bs->opaque; + int fd, open_flags, ret; + + posix_aio_init(); + + s->lseek_err_cnt = 0; + + open_flags = O_BINARY; + if ((flags & BDRV_O_ACCESS) == O_RDWR) { + open_flags |= O_RDWR; + } else { + open_flags |= O_RDONLY; + bs->read_only = 1; + } + if (flags & BDRV_O_CREAT) + open_flags |= O_CREAT | O_TRUNC; + + /* Use O_DSYNC for write-through caching, no flags for write-back caching, + * and O_DIRECT for no caching. */ + if ((flags & BDRV_O_NOCACHE)) + open_flags |= O_DIRECT; + else if (!(flags & BDRV_O_CACHE_WB)) + open_flags |= O_DSYNC; + + s->type = FTYPE_FILE; + + fd = open(filename, open_flags, 0644); + if (fd < 0) { + ret = -errno; + if (ret == -EROFS) + ret = -EACCES; + return ret; + } + s->fd = fd; + s->aligned_buf = NULL; + if ((flags & BDRV_O_NOCACHE)) { + s->aligned_buf = qemu_blockalign(bs, ALIGNED_BUFFER_SIZE); + if (s->aligned_buf == NULL) { + ret = -errno; + close(fd); + return ret; + } + } + return 0; +} + +/* XXX: use host sector size if necessary with: +#ifdef DIOCGSECTORSIZE + { + unsigned int sectorsize = 512; + if (!ioctl(fd, DIOCGSECTORSIZE, §orsize) && + sectorsize > bufsize) + bufsize = sectorsize; + } +#endif +#ifdef CONFIG_COCOA + u_int32_t blockSize = 512; + if ( !ioctl( fd, DKIOCGETBLOCKSIZE, &blockSize ) && blockSize > bufsize) { + bufsize = blockSize; + } +#endif +*/ + +/* + * offset and count are in bytes, but must be multiples of 512 for files + * opened with O_DIRECT. buf must be aligned to 512 bytes then. + * + * This function may be called without alignment if the caller ensures + * that O_DIRECT is not in effect. + */ +static int raw_pread_aligned(BlockDriverState *bs, int64_t offset, + uint8_t *buf, int count) +{ + BDRVRawState *s = bs->opaque; + int ret; + + ret = fd_open(bs); + if (ret < 0) + return ret; + + if (offset >= 0 && lseek(s->fd, offset, SEEK_SET) == (off_t)-1) { + ++(s->lseek_err_cnt); + if(s->lseek_err_cnt <= 10) { + DEBUG_BLOCK_PRINT("raw_pread(%d:%s, %" PRId64 ", %p, %d) [%" PRId64 + "] lseek failed : %d = %s\n", + s->fd, bs->filename, offset, buf, count, + bs->total_sectors, errno, strerror(errno)); + } + return -1; + } + s->lseek_err_cnt=0; + + ret = read(s->fd, buf, count); + if (ret == count) + goto label__raw_read__success; + + DEBUG_BLOCK_PRINT("raw_pread(%d:%s, %" PRId64 ", %p, %d) [%" PRId64 + "] read failed %d : %d = %s\n", + s->fd, bs->filename, offset, buf, count, + bs->total_sectors, ret, errno, strerror(errno)); + + /* Try harder for CDrom. */ + if (bs->type == BDRV_TYPE_CDROM) { + lseek(s->fd, offset, SEEK_SET); + ret = read(s->fd, buf, count); + if (ret == count) + goto label__raw_read__success; + lseek(s->fd, offset, SEEK_SET); + ret = read(s->fd, buf, count); + if (ret == count) + goto label__raw_read__success; + + DEBUG_BLOCK_PRINT("raw_pread(%d:%s, %" PRId64 ", %p, %d) [%" PRId64 + "] retry read failed %d : %d = %s\n", + s->fd, bs->filename, offset, buf, count, + bs->total_sectors, ret, errno, strerror(errno)); + } + +label__raw_read__success: + + return ret; +} + +/* + * offset and count are in bytes, but must be multiples of 512 for files + * opened with O_DIRECT. buf must be aligned to 512 bytes then. + * + * This function may be called without alignment if the caller ensures + * that O_DIRECT is not in effect. + */ +static int raw_pwrite_aligned(BlockDriverState *bs, int64_t offset, + const uint8_t *buf, int count) +{ + BDRVRawState *s = bs->opaque; + int ret; + + ret = fd_open(bs); + if (ret < 0) + return -errno; + + if (offset >= 0 && lseek(s->fd, offset, SEEK_SET) == (off_t)-1) { + ++(s->lseek_err_cnt); + if(s->lseek_err_cnt) { + DEBUG_BLOCK_PRINT("raw_pwrite(%d:%s, %" PRId64 ", %p, %d) [%" + PRId64 "] lseek failed : %d = %s\n", + s->fd, bs->filename, offset, buf, count, + bs->total_sectors, errno, strerror(errno)); + } + return -EIO; + } + s->lseek_err_cnt = 0; + + ret = write(s->fd, buf, count); + if (ret == count) + goto label__raw_write__success; + + DEBUG_BLOCK_PRINT("raw_pwrite(%d:%s, %" PRId64 ", %p, %d) [%" PRId64 + "] write failed %d : %d = %s\n", + s->fd, bs->filename, offset, buf, count, + bs->total_sectors, ret, errno, strerror(errno)); + +label__raw_write__success: + + return (ret < 0) ? -errno : ret; +} + + +/* + * offset and count are in bytes and possibly not aligned. For files opened + * with O_DIRECT, necessary alignments are ensured before calling + * raw_pread_aligned to do the actual read. + */ +static int raw_pread(BlockDriverState *bs, int64_t offset, + uint8_t *buf, int count) +{ + BDRVRawState *s = bs->opaque; + int size, ret, shift, sum; + + sum = 0; + + if (s->aligned_buf != NULL) { + + if (offset & 0x1ff) { + /* align offset on a 512 bytes boundary */ + + shift = offset & 0x1ff; + size = (shift + count + 0x1ff) & ~0x1ff; + if (size > ALIGNED_BUFFER_SIZE) + size = ALIGNED_BUFFER_SIZE; + ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, size); + if (ret < 0) + return ret; + + size = 512 - shift; + if (size > count) + size = count; + memcpy(buf, s->aligned_buf + shift, size); + + buf += size; + offset += size; + count -= size; + sum += size; + + if (count == 0) + return sum; + } + if (count & 0x1ff || (uintptr_t) buf & 0x1ff) { + + /* read on aligned buffer */ + + while (count) { + + size = (count + 0x1ff) & ~0x1ff; + if (size > ALIGNED_BUFFER_SIZE) + size = ALIGNED_BUFFER_SIZE; + + ret = raw_pread_aligned(bs, offset, s->aligned_buf, size); + if (ret < 0) + return ret; + + size = ret; + if (size > count) + size = count; + + memcpy(buf, s->aligned_buf, size); + + buf += size; + offset += size; + count -= size; + sum += size; + } + + return sum; + } + } + + return raw_pread_aligned(bs, offset, buf, count) + sum; +} + +static int raw_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + int ret; + + ret = raw_pread(bs, sector_num * 512, buf, nb_sectors * 512); + if (ret == (nb_sectors * 512)) + ret = 0; + return ret; +} + +/* + * offset and count are in bytes and possibly not aligned. For files opened + * with O_DIRECT, necessary alignments are ensured before calling + * raw_pwrite_aligned to do the actual write. + */ +static int raw_pwrite(BlockDriverState *bs, int64_t offset, + const uint8_t *buf, int count) +{ + BDRVRawState *s = bs->opaque; + int size, ret, shift, sum; + + sum = 0; + + if (s->aligned_buf != NULL) { + + if (offset & 0x1ff) { + /* align offset on a 512 bytes boundary */ + shift = offset & 0x1ff; + ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, 512); + if (ret < 0) + return ret; + + size = 512 - shift; + if (size > count) + size = count; + memcpy(s->aligned_buf + shift, buf, size); + + ret = raw_pwrite_aligned(bs, offset - shift, s->aligned_buf, 512); + if (ret < 0) + return ret; + + buf += size; + offset += size; + count -= size; + sum += size; + + if (count == 0) + return sum; + } + if (count & 0x1ff || (uintptr_t) buf & 0x1ff) { + + while ((size = (count & ~0x1ff)) != 0) { + + if (size > ALIGNED_BUFFER_SIZE) + size = ALIGNED_BUFFER_SIZE; + + memcpy(s->aligned_buf, buf, size); + + ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, size); + if (ret < 0) + return ret; + + buf += ret; + offset += ret; + count -= ret; + sum += ret; + } + /* here, count < 512 because (count & ~0x1ff) == 0 */ + if (count) { + ret = raw_pread_aligned(bs, offset, s->aligned_buf, 512); + if (ret < 0) + return ret; + memcpy(s->aligned_buf, buf, count); + + ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, 512); + if (ret < 0) + return ret; + if (count < ret) + ret = count; + + sum += ret; + } + return sum; + } + } + return raw_pwrite_aligned(bs, offset, buf, count) + sum; +} + +static int raw_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + int ret; + ret = raw_pwrite(bs, sector_num * 512, buf, nb_sectors * 512); + if (ret == (nb_sectors * 512)) + ret = 0; + return ret; +} + +#ifdef CONFIG_AIO +/***********************************************************/ +/* Unix AIO using POSIX AIO */ + +typedef struct RawAIOCB { + BlockDriverAIOCB common; + struct qemu_paiocb aiocb; + struct RawAIOCB *next; + int ret; +} RawAIOCB; + +typedef struct PosixAioState +{ + int rfd, wfd; + RawAIOCB *first_aio; +} PosixAioState; + +static void posix_aio_read(void *opaque) +{ + PosixAioState *s = opaque; + RawAIOCB *acb, **pacb; + int ret; + ssize_t len; + + /* read all bytes from signal pipe */ + for (;;) { + char bytes[16]; + + len = read(s->rfd, bytes, sizeof(bytes)); + if (len == -1 && errno == EINTR) + continue; /* try again */ + if (len == sizeof(bytes)) + continue; /* more to read */ + break; + } + + for(;;) { + pacb = &s->first_aio; + for(;;) { + acb = *pacb; + if (!acb) + goto the_end; + ret = qemu_paio_error(&acb->aiocb); + if (ret == ECANCELED) { + /* remove the request */ + *pacb = acb->next; + qemu_aio_release(acb); + } else if (ret != EINPROGRESS) { + /* end of aio */ + if (ret == 0) { + ret = qemu_paio_return(&acb->aiocb); + if (ret == acb->aiocb.aio_nbytes) + ret = 0; + else + ret = -EINVAL; + } else { + ret = -ret; + } + /* remove the request */ + *pacb = acb->next; + /* call the callback */ + acb->common.cb(acb->common.opaque, ret); + qemu_aio_release(acb); + break; + } else { + pacb = &acb->next; + } + } + } + the_end: ; +} + +static int posix_aio_flush(void *opaque) +{ + PosixAioState *s = opaque; + return !!s->first_aio; +} + +static PosixAioState *posix_aio_state; + +static void aio_signal_handler(int signum) +{ + if (posix_aio_state) { + char byte = 0; + + write(posix_aio_state->wfd, &byte, sizeof(byte)); + } + + qemu_service_io(); +} + +static int posix_aio_init(void) +{ + struct sigaction act; + PosixAioState *s; + int fds[2]; + struct qemu_paioinit ai; + + if (posix_aio_state) + return 0; + + s = qemu_malloc(sizeof(PosixAioState)); + + sigfillset(&act.sa_mask); + act.sa_flags = 0; /* do not restart syscalls to interrupt select() */ + act.sa_handler = aio_signal_handler; + sigaction(SIGUSR2, &act, NULL); + + s->first_aio = NULL; + if (pipe(fds) == -1) { + fprintf(stderr, "failed to create pipe\n"); + return -errno; + } + + s->rfd = fds[0]; + s->wfd = fds[1]; + + fcntl(s->rfd, F_SETFL, O_NONBLOCK); + fcntl(s->wfd, F_SETFL, O_NONBLOCK); + + qemu_aio_set_fd_handler(s->rfd, posix_aio_read, NULL, posix_aio_flush, s); + + memset(&ai, 0, sizeof(ai)); + ai.aio_threads = 64; + ai.aio_num = 64; + qemu_paio_init(&ai); + + posix_aio_state = s; + + return 0; +} + +static RawAIOCB *raw_aio_setup(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BDRVRawState *s = bs->opaque; + RawAIOCB *acb; + + if (fd_open(bs) < 0) + return NULL; + + acb = qemu_aio_get(bs, cb, opaque); + if (!acb) + return NULL; + acb->aiocb.aio_fildes = s->fd; + acb->aiocb.ev_signo = SIGUSR2; + acb->aiocb.aio_iov = qiov->iov; + acb->aiocb.aio_niov = qiov->niov; + acb->aiocb.aio_nbytes = nb_sectors * 512; + acb->aiocb.aio_offset = sector_num * 512; + acb->aiocb.aio_flags = 0; + + /* + * If O_DIRECT is used the buffer needs to be aligned on a sector + * boundary. Tell the low level code to ensure that in case it's + * not done yet. + */ + if (s->aligned_buf) + acb->aiocb.aio_flags |= QEMU_AIO_SECTOR_ALIGNED; + + acb->next = posix_aio_state->first_aio; + posix_aio_state->first_aio = acb; + return acb; +} + +static void raw_aio_remove(RawAIOCB *acb) +{ + RawAIOCB **pacb; + + /* remove the callback from the queue */ + pacb = &posix_aio_state->first_aio; + for(;;) { + if (*pacb == NULL) { + fprintf(stderr, "raw_aio_remove: aio request not found!\n"); + break; + } else if (*pacb == acb) { + *pacb = acb->next; + qemu_aio_release(acb); + break; + } + pacb = &(*pacb)->next; + } +} + +static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + RawAIOCB *acb; + + acb = raw_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque); + if (!acb) + return NULL; + if (qemu_paio_read(&acb->aiocb) < 0) { + raw_aio_remove(acb); + return NULL; + } + return &acb->common; +} + +static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + RawAIOCB *acb; + + acb = raw_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque); + if (!acb) + return NULL; + if (qemu_paio_write(&acb->aiocb) < 0) { + raw_aio_remove(acb); + return NULL; + } + return &acb->common; +} + +static void raw_aio_cancel(BlockDriverAIOCB *blockacb) +{ + int ret; + RawAIOCB *acb = (RawAIOCB *)blockacb; + + ret = qemu_paio_cancel(acb->aiocb.aio_fildes, &acb->aiocb); + if (ret == QEMU_PAIO_NOTCANCELED) { + /* fail safe: if the aio could not be canceled, we wait for + it */ + while (qemu_paio_error(&acb->aiocb) == EINPROGRESS); + } + + raw_aio_remove(acb); +} +#else /* CONFIG_AIO */ +static int posix_aio_init(void) +{ + return 0; +} +#endif /* CONFIG_AIO */ + + +static void raw_close(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + if (s->fd >= 0) { + close(s->fd); + s->fd = -1; + if (s->aligned_buf != NULL) + qemu_free(s->aligned_buf); + } +} + +static int raw_truncate(BlockDriverState *bs, int64_t offset) +{ + BDRVRawState *s = bs->opaque; + if (s->type != FTYPE_FILE) + return -ENOTSUP; + if (ftruncate(s->fd, offset) < 0) + return -errno; + return 0; +} + +#ifdef __OpenBSD__ +static int64_t raw_getlength(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + int fd = s->fd; + struct stat st; + + if (fstat(fd, &st)) + return -1; + if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { + struct disklabel dl; + + if (ioctl(fd, DIOCGDINFO, &dl)) + return -1; + return (uint64_t)dl.d_secsize * + dl.d_partitions[DISKPART(st.st_rdev)].p_size; + } else + return st.st_size; +} +#else /* !__OpenBSD__ */ +static int64_t raw_getlength(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + int fd = s->fd; + int64_t size; +#ifdef HOST_BSD + struct stat sb; +#ifdef __FreeBSD__ + int reopened = 0; +#endif +#endif +#ifdef __sun__ + struct dk_minfo minfo; + int rv; +#endif + int ret; + + ret = fd_open(bs); + if (ret < 0) + return ret; + +#ifdef HOST_BSD +#ifdef __FreeBSD__ +again: +#endif + if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) { +#ifdef DIOCGMEDIASIZE + if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size)) +#elif defined(DIOCGPART) + { + struct partinfo pi; + if (ioctl(fd, DIOCGPART, &pi) == 0) + size = pi.media_size; + else + size = 0; + } + if (size == 0) +#endif +#ifdef CONFIG_COCOA + size = LONG_LONG_MAX; +#else + size = lseek(fd, 0LL, SEEK_END); +#endif +#ifdef __FreeBSD__ + switch(s->type) { + case FTYPE_CD: + /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */ + if (size == 2048LL * (unsigned)-1) + size = 0; + /* XXX no disc? maybe we need to reopen... */ + if (size <= 0 && !reopened && cd_open(bs) >= 0) { + reopened = 1; + goto again; + } + } +#endif + } else +#endif +#ifdef __sun__ + /* + * use the DKIOCGMEDIAINFO ioctl to read the size. + */ + rv = ioctl ( fd, DKIOCGMEDIAINFO, &minfo ); + if ( rv != -1 ) { + size = minfo.dki_lbsize * minfo.dki_capacity; + } else /* there are reports that lseek on some devices + fails, but irc discussion said that contingency + on contingency was overkill */ +#endif + { + size = lseek(fd, 0, SEEK_END); + } + return size; +} +#endif + +static int raw_create(const char *filename, int64_t total_size, + const char *backing_file, int flags) +{ + int fd; + + if (flags || backing_file) + return -ENOTSUP; + + fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, + 0644); + if (fd < 0) + return -EIO; + ftruncate(fd, total_size * 512); + close(fd); + return 0; +} + +static void raw_flush(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + fsync(s->fd); +} + +static BlockDriver bdrv_raw = { + .format_name = "raw", + .instance_size = sizeof(BDRVRawState), + .bdrv_probe = NULL, /* no probe for protocols */ + .bdrv_open = raw_open, + .bdrv_read = raw_read, + .bdrv_write = raw_write, + .bdrv_close = raw_close, + .bdrv_create = raw_create, + .bdrv_flush = raw_flush, + +#ifdef CONFIG_AIO + .bdrv_aio_readv = raw_aio_readv, + .bdrv_aio_writev = raw_aio_writev, + .bdrv_aio_cancel = raw_aio_cancel, + .aiocb_size = sizeof(RawAIOCB), +#endif + + .bdrv_truncate = raw_truncate, + .bdrv_getlength = raw_getlength, +}; + +/***********************************************/ +/* host device */ + +#ifdef CONFIG_COCOA +static kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator ); +static kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize ); + +kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator ) +{ + kern_return_t kernResult; + mach_port_t masterPort; + CFMutableDictionaryRef classesToMatch; + + kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort ); + if ( KERN_SUCCESS != kernResult ) { + printf( "IOMasterPort returned %d\n", kernResult ); + } + + classesToMatch = IOServiceMatching( kIOCDMediaClass ); + if ( classesToMatch == NULL ) { + printf( "IOServiceMatching returned a NULL dictionary.\n" ); + } else { + CFDictionarySetValue( classesToMatch, CFSTR( kIOMediaEjectableKey ), kCFBooleanTrue ); + } + kernResult = IOServiceGetMatchingServices( masterPort, classesToMatch, mediaIterator ); + if ( KERN_SUCCESS != kernResult ) + { + printf( "IOServiceGetMatchingServices returned %d\n", kernResult ); + } + + return kernResult; +} + +kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize ) +{ + io_object_t nextMedia; + kern_return_t kernResult = KERN_FAILURE; + *bsdPath = '\0'; + nextMedia = IOIteratorNext( mediaIterator ); + if ( nextMedia ) + { + CFTypeRef bsdPathAsCFString; + bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 ); + if ( bsdPathAsCFString ) { + size_t devPathLength; + strcpy( bsdPath, _PATH_DEV ); + strcat( bsdPath, "r" ); + devPathLength = strlen( bsdPath ); + if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) { + kernResult = KERN_SUCCESS; + } + CFRelease( bsdPathAsCFString ); + } + IOObjectRelease( nextMedia ); + } + + return kernResult; +} + +#endif + +static int hdev_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVRawState *s = bs->opaque; + int fd, open_flags, ret; + + posix_aio_init(); + +#ifdef CONFIG_COCOA + if (strstart(filename, "/dev/cdrom", NULL)) { + kern_return_t kernResult; + io_iterator_t mediaIterator; + char bsdPath[ MAXPATHLEN ]; + int fd; + + kernResult = FindEjectableCDMedia( &mediaIterator ); + kernResult = GetBSDPath( mediaIterator, bsdPath, sizeof( bsdPath ) ); + + if ( bsdPath[ 0 ] != '\0' ) { + strcat(bsdPath,"s0"); + /* some CDs don't have a partition 0 */ + fd = open(bsdPath, O_RDONLY | O_BINARY | O_LARGEFILE); + if (fd < 0) { + bsdPath[strlen(bsdPath)-1] = '1'; + } else { + close(fd); + } + filename = bsdPath; + } + + if ( mediaIterator ) + IOObjectRelease( mediaIterator ); + } +#endif + open_flags = O_BINARY; + if ((flags & BDRV_O_ACCESS) == O_RDWR) { + open_flags |= O_RDWR; + } else { + open_flags |= O_RDONLY; + bs->read_only = 1; + } + /* Use O_DSYNC for write-through caching, no flags for write-back caching, + * and O_DIRECT for no caching. */ + if ((flags & BDRV_O_NOCACHE)) + open_flags |= O_DIRECT; + else if (!(flags & BDRV_O_CACHE_WB)) + open_flags |= O_DSYNC; + + s->type = FTYPE_FILE; +#if defined(__linux__) + if (strstart(filename, "/dev/cd", NULL)) { + /* open will not fail even if no CD is inserted */ + open_flags |= O_NONBLOCK; + s->type = FTYPE_CD; + } else if (strstart(filename, "/dev/fd", NULL)) { + s->type = FTYPE_FD; + s->fd_open_flags = open_flags; + /* open will not fail even if no floppy is inserted */ + open_flags |= O_NONBLOCK; +#ifdef CONFIG_AIO + } else if (strstart(filename, "/dev/sg", NULL)) { + bs->sg = 1; +#endif + } +#endif +#if defined(__FreeBSD__) + if (strstart(filename, "/dev/cd", NULL) || + strstart(filename, "/dev/acd", NULL)) { + s->type = FTYPE_CD; + s->cd_open_flags = open_flags; + } +#endif + s->fd = -1; + fd = open(filename, open_flags, 0644); + if (fd < 0) { + ret = -errno; + if (ret == -EROFS) + ret = -EACCES; + return ret; + } + s->fd = fd; +#if defined(__FreeBSD__) + /* make sure the door isnt locked at this time */ + if (s->type == FTYPE_CD) + ioctl (s->fd, CDIOCALLOW); +#endif +#if defined(__linux__) + /* close fd so that we can reopen it as needed */ + if (s->type == FTYPE_FD) { + close(s->fd); + s->fd = -1; + s->fd_media_changed = 1; + } +#endif + return 0; +} + +#if defined(__linux__) +/* Note: we do not have a reliable method to detect if the floppy is + present. The current method is to try to open the floppy at every + I/O and to keep it opened during a few hundreds of ms. */ +static int fd_open(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + int last_media_present; + + if (s->type != FTYPE_FD) + return 0; + last_media_present = (s->fd >= 0); + if (s->fd >= 0 && + (qemu_get_clock(rt_clock) - s->fd_open_time) >= FD_OPEN_TIMEOUT) { + close(s->fd); + s->fd = -1; +#ifdef DEBUG_FLOPPY + printf("Floppy closed\n"); +#endif + } + if (s->fd < 0) { + if (s->fd_got_error && + (qemu_get_clock(rt_clock) - s->fd_error_time) < FD_OPEN_TIMEOUT) { +#ifdef DEBUG_FLOPPY + printf("No floppy (open delayed)\n"); +#endif + return -EIO; + } + s->fd = open(bs->filename, s->fd_open_flags); + if (s->fd < 0) { + s->fd_error_time = qemu_get_clock(rt_clock); + s->fd_got_error = 1; + if (last_media_present) + s->fd_media_changed = 1; +#ifdef DEBUG_FLOPPY + printf("No floppy\n"); +#endif + return -EIO; + } +#ifdef DEBUG_FLOPPY + printf("Floppy opened\n"); +#endif + } + if (!last_media_present) + s->fd_media_changed = 1; + s->fd_open_time = qemu_get_clock(rt_clock); + s->fd_got_error = 0; + return 0; +} + +static int raw_is_inserted(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + int ret; + + switch(s->type) { + case FTYPE_CD: + ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); + if (ret == CDS_DISC_OK) + return 1; + else + return 0; + break; + case FTYPE_FD: + ret = fd_open(bs); + return (ret >= 0); + default: + return 1; + } +} + +/* currently only used by fdc.c, but a CD version would be good too */ +static int raw_media_changed(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + + switch(s->type) { + case FTYPE_FD: + { + int ret; + /* XXX: we do not have a true media changed indication. It + does not work if the floppy is changed without trying + to read it */ + fd_open(bs); + ret = s->fd_media_changed; + s->fd_media_changed = 0; +#ifdef DEBUG_FLOPPY + printf("Floppy changed=%d\n", ret); +#endif + return ret; + } + default: + return -ENOTSUP; + } +} + +static int raw_eject(BlockDriverState *bs, int eject_flag) +{ + BDRVRawState *s = bs->opaque; + + switch(s->type) { + case FTYPE_CD: + if (eject_flag) { + if (ioctl (s->fd, CDROMEJECT, NULL) < 0) + perror("CDROMEJECT"); + } else { + if (ioctl (s->fd, CDROMCLOSETRAY, NULL) < 0) + perror("CDROMEJECT"); + } + break; + case FTYPE_FD: + { + int fd; + if (s->fd >= 0) { + close(s->fd); + s->fd = -1; + } + fd = open(bs->filename, s->fd_open_flags | O_NONBLOCK); + if (fd >= 0) { + if (ioctl(fd, FDEJECT, 0) < 0) + perror("FDEJECT"); + close(fd); + } + } + break; + default: + return -ENOTSUP; + } + return 0; +} + +static int raw_set_locked(BlockDriverState *bs, int locked) +{ + BDRVRawState *s = bs->opaque; + + switch(s->type) { + case FTYPE_CD: + if (ioctl (s->fd, CDROM_LOCKDOOR, locked) < 0) { + /* Note: an error can happen if the distribution automatically + mounts the CD-ROM */ + // perror("CDROM_LOCKDOOR"); + } + break; + default: + return -ENOTSUP; + } + return 0; +} + +static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +{ + BDRVRawState *s = bs->opaque; + + return ioctl(s->fd, req, buf); +} + +#ifdef CONFIG_AIO +static BlockDriverAIOCB *raw_aio_ioctl(BlockDriverState *bs, + unsigned long int req, void *buf, + BlockDriverCompletionFunc *cb, void *opaque) +{ + BDRVRawState *s = bs->opaque; + RawAIOCB *acb; + + if (fd_open(bs) < 0) + return NULL; + + acb = qemu_aio_get(bs, cb, opaque); + if (!acb) + return NULL; + acb->aiocb.aio_fildes = s->fd; + acb->aiocb.ev_signo = SIGUSR2; + acb->aiocb.aio_offset = 0; + acb->aiocb.aio_flags = 0; + + acb->next = posix_aio_state->first_aio; + posix_aio_state->first_aio = acb; + + acb->aiocb.aio_ioctl_buf = buf; + acb->aiocb.aio_ioctl_cmd = req; + if (qemu_paio_ioctl(&acb->aiocb) < 0) { + raw_aio_remove(acb); + return NULL; + } + + return &acb->common; +} +#endif + +#elif defined(__FreeBSD__) + +static int fd_open(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + + /* this is just to ensure s->fd is sane (its called by io ops) */ + if (s->fd >= 0) + return 0; + return -EIO; +} + +static int cd_open(BlockDriverState *bs) +{ +#if defined(__FreeBSD__) + BDRVRawState *s = bs->opaque; + int fd; + + switch(s->type) { + case FTYPE_CD: + /* XXX force reread of possibly changed/newly loaded disc, + * FreeBSD seems to not notice sometimes... */ + if (s->fd >= 0) + close (s->fd); + fd = open(bs->filename, s->cd_open_flags, 0644); + if (fd < 0) { + s->fd = -1; + return -EIO; + } + s->fd = fd; + /* make sure the door isnt locked at this time */ + ioctl (s->fd, CDIOCALLOW); + } +#endif + return 0; +} + +static int raw_is_inserted(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + + switch(s->type) { + case FTYPE_CD: + return (raw_getlength(bs) > 0); + case FTYPE_FD: + /* XXX handle this */ + /* FALLTHRU */ + default: + return 1; + } +} + +static int raw_media_changed(BlockDriverState *bs) +{ + return -ENOTSUP; +} + +static int raw_eject(BlockDriverState *bs, int eject_flag) +{ + BDRVRawState *s = bs->opaque; + + switch(s->type) { + case FTYPE_CD: + if (s->fd < 0) + return -ENOTSUP; + (void) ioctl (s->fd, CDIOCALLOW); + if (eject_flag) { + if (ioctl (s->fd, CDIOCEJECT) < 0) + perror("CDIOCEJECT"); + } else { + if (ioctl (s->fd, CDIOCCLOSE) < 0) + perror("CDIOCCLOSE"); + } + if (cd_open(bs) < 0) + return -ENOTSUP; + break; + case FTYPE_FD: + /* XXX handle this */ + /* FALLTHRU */ + default: + return -ENOTSUP; + } + return 0; +} + +static int raw_set_locked(BlockDriverState *bs, int locked) +{ + BDRVRawState *s = bs->opaque; + + switch(s->type) { + case FTYPE_CD: + if (s->fd < 0) + return -ENOTSUP; + if (ioctl (s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) { + /* Note: an error can happen if the distribution automatically + mounts the CD-ROM */ + // perror("CDROM_LOCKDOOR"); + } + break; + default: + return -ENOTSUP; + } + return 0; +} + +static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +{ + return -ENOTSUP; +} +#else /* !linux && !FreeBSD */ + +static int fd_open(BlockDriverState *bs) +{ + return 0; +} + +static int raw_is_inserted(BlockDriverState *bs) +{ + return 1; +} + +static int raw_media_changed(BlockDriverState *bs) +{ + return -ENOTSUP; +} + +static int raw_eject(BlockDriverState *bs, int eject_flag) +{ + return -ENOTSUP; +} + +static int raw_set_locked(BlockDriverState *bs, int locked) +{ + return -ENOTSUP; +} + +static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +{ + return -ENOTSUP; +} + +static BlockDriverAIOCB *raw_aio_ioctl(BlockDriverState *bs, + unsigned long int req, void *buf, + BlockDriverCompletionFunc *cb, void *opaque) +{ + return NULL; +} +#endif /* !linux && !FreeBSD */ + +#if defined(__linux__) || defined(__FreeBSD__) +static int hdev_create(const char *filename, int64_t total_size, + const char *backing_file, int flags) +{ + int fd; + int ret = 0; + struct stat stat_buf; + + if (flags || backing_file) + return -ENOTSUP; + + fd = open(filename, O_WRONLY | O_BINARY); + if (fd < 0) + return -EIO; + + if (fstat(fd, &stat_buf) < 0) + ret = -EIO; + else if (!S_ISBLK(stat_buf.st_mode)) + ret = -EIO; + else if (lseek(fd, 0, SEEK_END) < total_size * 512) + ret = -ENOSPC; + + close(fd); + return ret; +} + +#else /* !(linux || freebsd) */ + +static int hdev_create(const char *filename, int64_t total_size, + const char *backing_file, int flags) +{ + return -ENOTSUP; +} +#endif + +static BlockDriver bdrv_host_device = { + .format_name = "host_device", + .instance_size = sizeof(BDRVRawState), + .bdrv_open = hdev_open, + .bdrv_close = raw_close, + .bdrv_create = hdev_create, + .bdrv_flush = raw_flush, + +#ifdef CONFIG_AIO + .bdrv_aio_readv = raw_aio_readv, + .bdrv_aio_writev = raw_aio_writev, + .bdrv_aio_cancel = raw_aio_cancel, + .aiocb_size = sizeof(RawAIOCB), +#endif + + .bdrv_read = raw_read, + .bdrv_write = raw_write, + .bdrv_getlength = raw_getlength, + + /* removable device support */ + .bdrv_is_inserted = raw_is_inserted, + .bdrv_media_changed = raw_media_changed, + .bdrv_eject = raw_eject, + .bdrv_set_locked = raw_set_locked, + /* generic scsi device */ + .bdrv_ioctl = raw_ioctl, +#ifdef CONFIG_AIO + .bdrv_aio_ioctl = raw_aio_ioctl, +#endif +}; + +static void bdrv_raw_init(void) +{ + bdrv_register(&bdrv_raw); + bdrv_register(&bdrv_host_device); +} + +block_init(bdrv_raw_init); diff --git a/block/raw-win32.c b/block/raw-win32.c new file mode 100644 index 0000000000..ab3abd636d --- /dev/null +++ b/block/raw-win32.c @@ -0,0 +1,394 @@ +/* + * Block driver for RAW files (win32) + * + * Copyright (c) 2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "qemu-timer.h" +#include "block_int.h" +#include "module.h" +#include <windows.h> +#include <winioctl.h> + +#define FTYPE_FILE 0 +#define FTYPE_CD 1 +#define FTYPE_HARDDISK 2 + +typedef struct BDRVRawState { + HANDLE hfile; + int type; + char drive_path[16]; /* format: "d:\" */ +} BDRVRawState; + +int qemu_ftruncate64(int fd, int64_t length) +{ + LARGE_INTEGER li; + LONG high; + HANDLE h; + BOOL res; + + if ((GetVersion() & 0x80000000UL) && (length >> 32) != 0) + return -1; + + h = (HANDLE)_get_osfhandle(fd); + + /* get current position, ftruncate do not change position */ + li.HighPart = 0; + li.LowPart = SetFilePointer (h, 0, &li.HighPart, FILE_CURRENT); + if (li.LowPart == 0xffffffffUL && GetLastError() != NO_ERROR) + return -1; + + high = length >> 32; + if (!SetFilePointer(h, (DWORD) length, &high, FILE_BEGIN)) + return -1; + res = SetEndOfFile(h); + + /* back to old position */ + SetFilePointer(h, li.LowPart, &li.HighPart, FILE_BEGIN); + return res ? 0 : -1; +} + +static int set_sparse(int fd) +{ + DWORD returned; + return (int) DeviceIoControl((HANDLE)_get_osfhandle(fd), FSCTL_SET_SPARSE, + NULL, 0, NULL, 0, &returned, NULL); +} + +static int raw_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVRawState *s = bs->opaque; + int access_flags, create_flags; + DWORD overlapped; + + s->type = FTYPE_FILE; + + if ((flags & BDRV_O_ACCESS) == O_RDWR) { + access_flags = GENERIC_READ | GENERIC_WRITE; + } else { + access_flags = GENERIC_READ; + } + if (flags & BDRV_O_CREAT) { + create_flags = CREATE_ALWAYS; + } else { + create_flags = OPEN_EXISTING; + } + overlapped = FILE_ATTRIBUTE_NORMAL; + if ((flags & BDRV_O_NOCACHE)) + overlapped |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH; + else if (!(flags & BDRV_O_CACHE_WB)) + overlapped |= FILE_FLAG_WRITE_THROUGH; + s->hfile = CreateFile(filename, access_flags, + FILE_SHARE_READ, NULL, + create_flags, overlapped, NULL); + if (s->hfile == INVALID_HANDLE_VALUE) { + int err = GetLastError(); + + if (err == ERROR_ACCESS_DENIED) + return -EACCES; + return -1; + } + return 0; +} + +static int raw_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVRawState *s = bs->opaque; + OVERLAPPED ov; + DWORD ret_count; + int ret; + int64_t offset = sector_num * 512; + int count = nb_sectors * 512; + + memset(&ov, 0, sizeof(ov)); + ov.Offset = offset; + ov.OffsetHigh = offset >> 32; + ret = ReadFile(s->hfile, buf, count, &ret_count, &ov); + if (!ret) + return ret_count; + if (ret_count == count) + ret_count = 0; + return ret_count; +} + +static int raw_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVRawState *s = bs->opaque; + OVERLAPPED ov; + DWORD ret_count; + int ret; + int64_t offset = sector_num * 512; + int count = nb_sectors * 512; + + memset(&ov, 0, sizeof(ov)); + ov.Offset = offset; + ov.OffsetHigh = offset >> 32; + ret = WriteFile(s->hfile, buf, count, &ret_count, &ov); + if (!ret) + return ret_count; + if (ret_count == count) + ret_count = 0; + return ret_count; +} + +static void raw_flush(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + FlushFileBuffers(s->hfile); +} + +static void raw_close(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + CloseHandle(s->hfile); +} + +static int raw_truncate(BlockDriverState *bs, int64_t offset) +{ + BDRVRawState *s = bs->opaque; + LONG low, high; + + low = offset; + high = offset >> 32; + if (!SetFilePointer(s->hfile, low, &high, FILE_BEGIN)) + return -EIO; + if (!SetEndOfFile(s->hfile)) + return -EIO; + return 0; +} + +static int64_t raw_getlength(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + LARGE_INTEGER l; + ULARGE_INTEGER available, total, total_free; + DISK_GEOMETRY_EX dg; + DWORD count; + BOOL status; + + switch(s->type) { + case FTYPE_FILE: + l.LowPart = GetFileSize(s->hfile, (PDWORD)&l.HighPart); + if (l.LowPart == 0xffffffffUL && GetLastError() != NO_ERROR) + return -EIO; + break; + case FTYPE_CD: + if (!GetDiskFreeSpaceEx(s->drive_path, &available, &total, &total_free)) + return -EIO; + l.QuadPart = total.QuadPart; + break; + case FTYPE_HARDDISK: + status = DeviceIoControl(s->hfile, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX, + NULL, 0, &dg, sizeof(dg), &count, NULL); + if (status != 0) { + l = dg.DiskSize; + } + break; + default: + return -EIO; + } + return l.QuadPart; +} + +static int raw_create(const char *filename, int64_t total_size, + const char *backing_file, int flags) +{ + int fd; + + if (flags || backing_file) + return -ENOTSUP; + + fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, + 0644); + if (fd < 0) + return -EIO; + set_sparse(fd); + ftruncate(fd, total_size * 512); + close(fd); + return 0; +} + +static BlockDriver bdrv_raw = { + .format_name = "raw", + .instance_size = sizeof(BDRVRawState), + .bdrv_open = raw_open, + .bdrv_close = raw_close, + .bdrv_create = raw_create, + .bdrv_flush = raw_flush, + .bdrv_read = raw_read, + .bdrv_write = raw_write, + .bdrv_truncate = raw_truncate, + .bdrv_getlength = raw_getlength, +}; + +/***********************************************/ +/* host device */ + +static int find_cdrom(char *cdrom_name, int cdrom_name_size) +{ + char drives[256], *pdrv = drives; + UINT type; + + memset(drives, 0, sizeof(drives)); + GetLogicalDriveStrings(sizeof(drives), drives); + while(pdrv[0] != '\0') { + type = GetDriveType(pdrv); + switch(type) { + case DRIVE_CDROM: + snprintf(cdrom_name, cdrom_name_size, "\\\\.\\%c:", pdrv[0]); + return 0; + break; + } + pdrv += lstrlen(pdrv) + 1; + } + return -1; +} + +static int find_device_type(BlockDriverState *bs, const char *filename) +{ + BDRVRawState *s = bs->opaque; + UINT type; + const char *p; + + if (strstart(filename, "\\\\.\\", &p) || + strstart(filename, "//./", &p)) { + if (stristart(p, "PhysicalDrive", NULL)) + return FTYPE_HARDDISK; + snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", p[0]); + type = GetDriveType(s->drive_path); + switch (type) { + case DRIVE_REMOVABLE: + case DRIVE_FIXED: + return FTYPE_HARDDISK; + case DRIVE_CDROM: + return FTYPE_CD; + default: + return FTYPE_FILE; + } + } else { + return FTYPE_FILE; + } +} + +static int hdev_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVRawState *s = bs->opaque; + int access_flags, create_flags; + DWORD overlapped; + char device_name[64]; + + if (strstart(filename, "/dev/cdrom", NULL)) { + if (find_cdrom(device_name, sizeof(device_name)) < 0) + return -ENOENT; + filename = device_name; + } else { + /* transform drive letters into device name */ + if (((filename[0] >= 'a' && filename[0] <= 'z') || + (filename[0] >= 'A' && filename[0] <= 'Z')) && + filename[1] == ':' && filename[2] == '\0') { + snprintf(device_name, sizeof(device_name), "\\\\.\\%c:", filename[0]); + filename = device_name; + } + } + s->type = find_device_type(bs, filename); + + if ((flags & BDRV_O_ACCESS) == O_RDWR) { + access_flags = GENERIC_READ | GENERIC_WRITE; + } else { + access_flags = GENERIC_READ; + } + create_flags = OPEN_EXISTING; + + overlapped = FILE_ATTRIBUTE_NORMAL; + if ((flags & BDRV_O_NOCACHE)) + overlapped |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH; + else if (!(flags & BDRV_O_CACHE_WB)) + overlapped |= FILE_FLAG_WRITE_THROUGH; + s->hfile = CreateFile(filename, access_flags, + FILE_SHARE_READ, NULL, + create_flags, overlapped, NULL); + if (s->hfile == INVALID_HANDLE_VALUE) { + int err = GetLastError(); + + if (err == ERROR_ACCESS_DENIED) + return -EACCES; + return -1; + } + return 0; +} + +#if 0 +/***********************************************/ +/* removable device additional commands */ + +static int raw_is_inserted(BlockDriverState *bs) +{ + return 1; +} + +static int raw_media_changed(BlockDriverState *bs) +{ + return -ENOTSUP; +} + +static int raw_eject(BlockDriverState *bs, int eject_flag) +{ + DWORD ret_count; + + if (s->type == FTYPE_FILE) + return -ENOTSUP; + if (eject_flag) { + DeviceIoControl(s->hfile, IOCTL_STORAGE_EJECT_MEDIA, + NULL, 0, NULL, 0, &lpBytesReturned, NULL); + } else { + DeviceIoControl(s->hfile, IOCTL_STORAGE_LOAD_MEDIA, + NULL, 0, NULL, 0, &lpBytesReturned, NULL); + } +} + +static int raw_set_locked(BlockDriverState *bs, int locked) +{ + return -ENOTSUP; +} +#endif + +static BlockDriver bdrv_host_device = { + .format_name = "host_device", + .instance_size = sizeof(BDRVRawState), + .bdrv_open = hdev_open, + .bdrv_close = raw_close, + .bdrv_flush = raw_flush, + + .bdrv_read = raw_read, + .bdrv_write = raw_write, + .bdrv_getlength = raw_getlength, +}; + +static void bdrv_raw_init(void) +{ + bdrv_register(&bdrv_raw); + bdrv_register(&bdrv_host_device); + return 0; +} + +block_init(bdrv_raw_init); diff --git a/block/vmdk.c b/block/vmdk.c new file mode 100644 index 0000000000..13866e9b06 --- /dev/null +++ b/block/vmdk.c @@ -0,0 +1,833 @@ +/* + * Block driver for the VMDK format + * + * Copyright (c) 2004 Fabrice Bellard + * Copyright (c) 2005 Filip Navara + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu-common.h" +#include "block_int.h" +#include "module.h" + +#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D') +#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V') + +typedef struct { + uint32_t version; + uint32_t flags; + uint32_t disk_sectors; + uint32_t granularity; + uint32_t l1dir_offset; + uint32_t l1dir_size; + uint32_t file_sectors; + uint32_t cylinders; + uint32_t heads; + uint32_t sectors_per_track; +} VMDK3Header; + +typedef struct { + uint32_t version; + uint32_t flags; + int64_t capacity; + int64_t granularity; + int64_t desc_offset; + int64_t desc_size; + int32_t num_gtes_per_gte; + int64_t rgd_offset; + int64_t gd_offset; + int64_t grain_offset; + char filler[1]; + char check_bytes[4]; +} __attribute__((packed)) VMDK4Header; + +#define L2_CACHE_SIZE 16 + +typedef struct BDRVVmdkState { + BlockDriverState *hd; + int64_t l1_table_offset; + int64_t l1_backup_table_offset; + uint32_t *l1_table; + uint32_t *l1_backup_table; + unsigned int l1_size; + uint32_t l1_entry_sectors; + + unsigned int l2_size; + uint32_t *l2_cache; + uint32_t l2_cache_offsets[L2_CACHE_SIZE]; + uint32_t l2_cache_counts[L2_CACHE_SIZE]; + + unsigned int cluster_sectors; + uint32_t parent_cid; + int is_parent; +} BDRVVmdkState; + +typedef struct VmdkMetaData { + uint32_t offset; + unsigned int l1_index; + unsigned int l2_index; + unsigned int l2_offset; + int valid; +} VmdkMetaData; + +typedef struct ActiveBDRVState{ + BlockDriverState *hd; // active image handler + uint64_t cluster_offset; // current write offset +}ActiveBDRVState; + +static ActiveBDRVState activeBDRV; + + +static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + uint32_t magic; + + if (buf_size < 4) + return 0; + magic = be32_to_cpu(*(uint32_t *)buf); + if (magic == VMDK3_MAGIC || + magic == VMDK4_MAGIC) + return 100; + else + return 0; +} + +#define CHECK_CID 1 + +#define SECTOR_SIZE 512 +#define DESC_SIZE 20*SECTOR_SIZE // 20 sectors of 512 bytes each +#define HEADER_SIZE 512 // first sector of 512 bytes + +static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) +{ + BDRVVmdkState *s = bs->opaque; + char desc[DESC_SIZE]; + uint32_t cid; + const char *p_name, *cid_str; + size_t cid_str_size; + + /* the descriptor offset = 0x200 */ + if (bdrv_pread(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE) + return 0; + + if (parent) { + cid_str = "parentCID"; + cid_str_size = sizeof("parentCID"); + } else { + cid_str = "CID"; + cid_str_size = sizeof("CID"); + } + + if ((p_name = strstr(desc,cid_str)) != NULL) { + p_name += cid_str_size; + sscanf(p_name,"%x",&cid); + } + + return cid; +} + +static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) +{ + BDRVVmdkState *s = bs->opaque; + char desc[DESC_SIZE], tmp_desc[DESC_SIZE]; + char *p_name, *tmp_str; + + /* the descriptor offset = 0x200 */ + if (bdrv_pread(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE) + return -1; + + tmp_str = strstr(desc,"parentCID"); + pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str); + if ((p_name = strstr(desc,"CID")) != NULL) { + p_name += sizeof("CID"); + snprintf(p_name, sizeof(desc) - (p_name - desc), "%x\n", cid); + pstrcat(desc, sizeof(desc), tmp_desc); + } + + if (bdrv_pwrite(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE) + return -1; + return 0; +} + +static int vmdk_is_cid_valid(BlockDriverState *bs) +{ +#ifdef CHECK_CID + BDRVVmdkState *s = bs->opaque; + BlockDriverState *p_bs = s->hd->backing_hd; + uint32_t cur_pcid; + + if (p_bs) { + cur_pcid = vmdk_read_cid(p_bs,0); + if (s->parent_cid != cur_pcid) + // CID not valid + return 0; + } +#endif + // CID valid + return 1; +} + +static int vmdk_snapshot_create(const char *filename, const char *backing_file) +{ + int snp_fd, p_fd; + uint32_t p_cid; + char *p_name, *gd_buf, *rgd_buf; + const char *real_filename, *temp_str; + VMDK4Header header; + uint32_t gde_entries, gd_size; + int64_t gd_offset, rgd_offset, capacity, gt_size; + char p_desc[DESC_SIZE], s_desc[DESC_SIZE], hdr[HEADER_SIZE]; + static const char desc_template[] = + "# Disk DescriptorFile\n" + "version=1\n" + "CID=%x\n" + "parentCID=%x\n" + "createType=\"monolithicSparse\"\n" + "parentFileNameHint=\"%s\"\n" + "\n" + "# Extent description\n" + "RW %u SPARSE \"%s\"\n" + "\n" + "# The Disk Data Base \n" + "#DDB\n" + "\n"; + + snp_fd = open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, 0644); + if (snp_fd < 0) + return -1; + p_fd = open(backing_file, O_RDONLY | O_BINARY | O_LARGEFILE); + if (p_fd < 0) { + close(snp_fd); + return -1; + } + + /* read the header */ + if (lseek(p_fd, 0x0, SEEK_SET) == -1) + goto fail; + if (read(p_fd, hdr, HEADER_SIZE) != HEADER_SIZE) + goto fail; + + /* write the header */ + if (lseek(snp_fd, 0x0, SEEK_SET) == -1) + goto fail; + if (write(snp_fd, hdr, HEADER_SIZE) == -1) + goto fail; + + memset(&header, 0, sizeof(header)); + memcpy(&header,&hdr[4], sizeof(header)); // skip the VMDK4_MAGIC + + ftruncate(snp_fd, header.grain_offset << 9); + /* the descriptor offset = 0x200 */ + if (lseek(p_fd, 0x200, SEEK_SET) == -1) + goto fail; + if (read(p_fd, p_desc, DESC_SIZE) != DESC_SIZE) + goto fail; + + if ((p_name = strstr(p_desc,"CID")) != NULL) { + p_name += sizeof("CID"); + sscanf(p_name,"%x",&p_cid); + } + + real_filename = filename; + if ((temp_str = strrchr(real_filename, '\\')) != NULL) + real_filename = temp_str + 1; + if ((temp_str = strrchr(real_filename, '/')) != NULL) + real_filename = temp_str + 1; + if ((temp_str = strrchr(real_filename, ':')) != NULL) + real_filename = temp_str + 1; + + snprintf(s_desc, sizeof(s_desc), desc_template, p_cid, p_cid, backing_file, + (uint32_t)header.capacity, real_filename); + + /* write the descriptor */ + if (lseek(snp_fd, 0x200, SEEK_SET) == -1) + goto fail; + if (write(snp_fd, s_desc, strlen(s_desc)) == -1) + goto fail; + + gd_offset = header.gd_offset * SECTOR_SIZE; // offset of GD table + rgd_offset = header.rgd_offset * SECTOR_SIZE; // offset of RGD table + capacity = header.capacity * SECTOR_SIZE; // Extent size + /* + * Each GDE span 32M disk, means: + * 512 GTE per GT, each GTE points to grain + */ + gt_size = (int64_t)header.num_gtes_per_gte * header.granularity * SECTOR_SIZE; + if (!gt_size) + goto fail; + gde_entries = (uint32_t)(capacity / gt_size); // number of gde/rgde + gd_size = gde_entries * sizeof(uint32_t); + + /* write RGD */ + rgd_buf = qemu_malloc(gd_size); + if (lseek(p_fd, rgd_offset, SEEK_SET) == -1) + goto fail_rgd; + if (read(p_fd, rgd_buf, gd_size) != gd_size) + goto fail_rgd; + if (lseek(snp_fd, rgd_offset, SEEK_SET) == -1) + goto fail_rgd; + if (write(snp_fd, rgd_buf, gd_size) == -1) + goto fail_rgd; + qemu_free(rgd_buf); + + /* write GD */ + gd_buf = qemu_malloc(gd_size); + if (lseek(p_fd, gd_offset, SEEK_SET) == -1) + goto fail_gd; + if (read(p_fd, gd_buf, gd_size) != gd_size) + goto fail_gd; + if (lseek(snp_fd, gd_offset, SEEK_SET) == -1) + goto fail_gd; + if (write(snp_fd, gd_buf, gd_size) == -1) + goto fail_gd; + qemu_free(gd_buf); + + close(p_fd); + close(snp_fd); + return 0; + + fail_gd: + qemu_free(gd_buf); + fail_rgd: + qemu_free(rgd_buf); + fail: + close(p_fd); + close(snp_fd); + return -1; +} + +static void vmdk_parent_close(BlockDriverState *bs) +{ + if (bs->backing_hd) + bdrv_close(bs->backing_hd); +} + +static int parent_open = 0; +static int vmdk_parent_open(BlockDriverState *bs, const char * filename) +{ + BDRVVmdkState *s = bs->opaque; + char *p_name; + char desc[DESC_SIZE]; + char parent_img_name[1024]; + + /* the descriptor offset = 0x200 */ + if (bdrv_pread(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE) + return -1; + + if ((p_name = strstr(desc,"parentFileNameHint")) != NULL) { + char *end_name; + struct stat file_buf; + + p_name += sizeof("parentFileNameHint") + 1; + if ((end_name = strchr(p_name,'\"')) == NULL) + return -1; + if ((end_name - p_name) > sizeof (s->hd->backing_file) - 1) + return -1; + + pstrcpy(s->hd->backing_file, end_name - p_name + 1, p_name); + if (stat(s->hd->backing_file, &file_buf) != 0) { + path_combine(parent_img_name, sizeof(parent_img_name), + filename, s->hd->backing_file); + } else { + pstrcpy(parent_img_name, sizeof(parent_img_name), + s->hd->backing_file); + } + + s->hd->backing_hd = bdrv_new(""); + if (!s->hd->backing_hd) { + failure: + bdrv_close(s->hd); + return -1; + } + parent_open = 1; + if (bdrv_open(s->hd->backing_hd, parent_img_name, BDRV_O_RDONLY) < 0) + goto failure; + parent_open = 0; + } + + return 0; +} + +static int vmdk_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVVmdkState *s = bs->opaque; + uint32_t magic; + int l1_size, i, ret; + + if (parent_open) + // Parent must be opened as RO. + flags = BDRV_O_RDONLY; + + ret = bdrv_file_open(&s->hd, filename, flags); + if (ret < 0) + return ret; + if (bdrv_pread(s->hd, 0, &magic, sizeof(magic)) != sizeof(magic)) + goto fail; + + magic = be32_to_cpu(magic); + if (magic == VMDK3_MAGIC) { + VMDK3Header header; + + if (bdrv_pread(s->hd, sizeof(magic), &header, sizeof(header)) != sizeof(header)) + goto fail; + s->cluster_sectors = le32_to_cpu(header.granularity); + s->l2_size = 1 << 9; + s->l1_size = 1 << 6; + bs->total_sectors = le32_to_cpu(header.disk_sectors); + s->l1_table_offset = le32_to_cpu(header.l1dir_offset) << 9; + s->l1_backup_table_offset = 0; + s->l1_entry_sectors = s->l2_size * s->cluster_sectors; + } else if (magic == VMDK4_MAGIC) { + VMDK4Header header; + + if (bdrv_pread(s->hd, sizeof(magic), &header, sizeof(header)) != sizeof(header)) + goto fail; + bs->total_sectors = le64_to_cpu(header.capacity); + s->cluster_sectors = le64_to_cpu(header.granularity); + s->l2_size = le32_to_cpu(header.num_gtes_per_gte); + s->l1_entry_sectors = s->l2_size * s->cluster_sectors; + if (s->l1_entry_sectors <= 0) + goto fail; + s->l1_size = (bs->total_sectors + s->l1_entry_sectors - 1) + / s->l1_entry_sectors; + s->l1_table_offset = le64_to_cpu(header.rgd_offset) << 9; + s->l1_backup_table_offset = le64_to_cpu(header.gd_offset) << 9; + + if (parent_open) + s->is_parent = 1; + else + s->is_parent = 0; + + // try to open parent images, if exist + if (vmdk_parent_open(bs, filename) != 0) + goto fail; + // write the CID once after the image creation + s->parent_cid = vmdk_read_cid(bs,1); + } else { + goto fail; + } + + /* read the L1 table */ + l1_size = s->l1_size * sizeof(uint32_t); + s->l1_table = qemu_malloc(l1_size); + if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, l1_size) != l1_size) + goto fail; + for(i = 0; i < s->l1_size; i++) { + le32_to_cpus(&s->l1_table[i]); + } + + if (s->l1_backup_table_offset) { + s->l1_backup_table = qemu_malloc(l1_size); + if (bdrv_pread(s->hd, s->l1_backup_table_offset, s->l1_backup_table, l1_size) != l1_size) + goto fail; + for(i = 0; i < s->l1_size; i++) { + le32_to_cpus(&s->l1_backup_table[i]); + } + } + + s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint32_t)); + return 0; + fail: + qemu_free(s->l1_backup_table); + qemu_free(s->l1_table); + qemu_free(s->l2_cache); + bdrv_delete(s->hd); + return -1; +} + +static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data, + uint64_t offset, int allocate); + +static int get_whole_cluster(BlockDriverState *bs, uint64_t cluster_offset, + uint64_t offset, int allocate) +{ + uint64_t parent_cluster_offset; + BDRVVmdkState *s = bs->opaque; + uint8_t whole_grain[s->cluster_sectors*512]; // 128 sectors * 512 bytes each = grain size 64KB + + // we will be here if it's first write on non-exist grain(cluster). + // try to read from parent image, if exist + if (s->hd->backing_hd) { + BDRVVmdkState *ps = s->hd->backing_hd->opaque; + + if (!vmdk_is_cid_valid(bs)) + return -1; + + parent_cluster_offset = get_cluster_offset(s->hd->backing_hd, NULL, offset, allocate); + + if (parent_cluster_offset) { + BDRVVmdkState *act_s = activeBDRV.hd->opaque; + + if (bdrv_pread(ps->hd, parent_cluster_offset, whole_grain, ps->cluster_sectors*512) != ps->cluster_sectors*512) + return -1; + + //Write grain only into the active image + if (bdrv_pwrite(act_s->hd, activeBDRV.cluster_offset << 9, whole_grain, sizeof(whole_grain)) != sizeof(whole_grain)) + return -1; + } + } + return 0; +} + +static int vmdk_L2update(BlockDriverState *bs, VmdkMetaData *m_data) +{ + BDRVVmdkState *s = bs->opaque; + + /* update L2 table */ + if (bdrv_pwrite(s->hd, ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * sizeof(m_data->offset)), + &(m_data->offset), sizeof(m_data->offset)) != sizeof(m_data->offset)) + return -1; + /* update backup L2 table */ + if (s->l1_backup_table_offset != 0) { + m_data->l2_offset = s->l1_backup_table[m_data->l1_index]; + if (bdrv_pwrite(s->hd, ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * sizeof(m_data->offset)), + &(m_data->offset), sizeof(m_data->offset)) != sizeof(m_data->offset)) + return -1; + } + + return 0; +} + +static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data, + uint64_t offset, int allocate) +{ + BDRVVmdkState *s = bs->opaque; + unsigned int l1_index, l2_offset, l2_index; + int min_index, i, j; + uint32_t min_count, *l2_table, tmp = 0; + uint64_t cluster_offset; + + if (m_data) + m_data->valid = 0; + + l1_index = (offset >> 9) / s->l1_entry_sectors; + if (l1_index >= s->l1_size) + return 0; + l2_offset = s->l1_table[l1_index]; + if (!l2_offset) + return 0; + for(i = 0; i < L2_CACHE_SIZE; i++) { + if (l2_offset == s->l2_cache_offsets[i]) { + /* increment the hit count */ + if (++s->l2_cache_counts[i] == 0xffffffff) { + for(j = 0; j < L2_CACHE_SIZE; j++) { + s->l2_cache_counts[j] >>= 1; + } + } + l2_table = s->l2_cache + (i * s->l2_size); + goto found; + } + } + /* not found: load a new entry in the least used one */ + min_index = 0; + min_count = 0xffffffff; + for(i = 0; i < L2_CACHE_SIZE; i++) { + if (s->l2_cache_counts[i] < min_count) { + min_count = s->l2_cache_counts[i]; + min_index = i; + } + } + l2_table = s->l2_cache + (min_index * s->l2_size); + if (bdrv_pread(s->hd, (int64_t)l2_offset * 512, l2_table, s->l2_size * sizeof(uint32_t)) != + s->l2_size * sizeof(uint32_t)) + return 0; + + s->l2_cache_offsets[min_index] = l2_offset; + s->l2_cache_counts[min_index] = 1; + found: + l2_index = ((offset >> 9) / s->cluster_sectors) % s->l2_size; + cluster_offset = le32_to_cpu(l2_table[l2_index]); + + if (!cluster_offset) { + if (!allocate) + return 0; + // Avoid the L2 tables update for the images that have snapshots. + if (!s->is_parent) { + cluster_offset = bdrv_getlength(s->hd); + bdrv_truncate(s->hd, cluster_offset + (s->cluster_sectors << 9)); + + cluster_offset >>= 9; + tmp = cpu_to_le32(cluster_offset); + l2_table[l2_index] = tmp; + // Save the active image state + activeBDRV.cluster_offset = cluster_offset; + activeBDRV.hd = bs; + } + /* First of all we write grain itself, to avoid race condition + * that may to corrupt the image. + * This problem may occur because of insufficient space on host disk + * or inappropriate VM shutdown. + */ + if (get_whole_cluster(bs, cluster_offset, offset, allocate) == -1) + return 0; + + if (m_data) { + m_data->offset = tmp; + m_data->l1_index = l1_index; + m_data->l2_index = l2_index; + m_data->l2_offset = l2_offset; + m_data->valid = 1; + } + } + cluster_offset <<= 9; + return cluster_offset; +} + +static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) +{ + BDRVVmdkState *s = bs->opaque; + int index_in_cluster, n; + uint64_t cluster_offset; + + cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0); + index_in_cluster = sector_num % s->cluster_sectors; + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) + n = nb_sectors; + *pnum = n; + return (cluster_offset != 0); +} + +static int vmdk_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVVmdkState *s = bs->opaque; + int index_in_cluster, n, ret; + uint64_t cluster_offset; + + while (nb_sectors > 0) { + cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0); + index_in_cluster = sector_num % s->cluster_sectors; + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) + n = nb_sectors; + if (!cluster_offset) { + // try to read from parent image, if exist + if (s->hd->backing_hd) { + if (!vmdk_is_cid_valid(bs)) + return -1; + ret = bdrv_read(s->hd->backing_hd, sector_num, buf, n); + if (ret < 0) + return -1; + } else { + memset(buf, 0, 512 * n); + } + } else { + if(bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512) != n * 512) + return -1; + } + nb_sectors -= n; + sector_num += n; + buf += n * 512; + } + return 0; +} + +static int vmdk_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVVmdkState *s = bs->opaque; + VmdkMetaData m_data; + int index_in_cluster, n; + uint64_t cluster_offset; + static int cid_update = 0; + + if (sector_num > bs->total_sectors) { + fprintf(stderr, + "(VMDK) Wrong offset: sector_num=0x%" PRIx64 + " total_sectors=0x%" PRIx64 "\n", + sector_num, bs->total_sectors); + return -1; + } + + while (nb_sectors > 0) { + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) + n = nb_sectors; + cluster_offset = get_cluster_offset(bs, &m_data, sector_num << 9, 1); + if (!cluster_offset) + return -1; + + if (bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512) != n * 512) + return -1; + if (m_data.valid) { + /* update L2 tables */ + if (vmdk_L2update(bs, &m_data) == -1) + return -1; + } + nb_sectors -= n; + sector_num += n; + buf += n * 512; + + // update CID on the first write every time the virtual disk is opened + if (!cid_update) { + vmdk_write_cid(bs, time(NULL)); + cid_update++; + } + } + return 0; +} + +static int vmdk_create(const char *filename, int64_t total_size, + const char *backing_file, int flags) +{ + int fd, i; + VMDK4Header header; + uint32_t tmp, magic, grains, gd_size, gt_size, gt_count; + static const char desc_template[] = + "# Disk DescriptorFile\n" + "version=1\n" + "CID=%x\n" + "parentCID=ffffffff\n" + "createType=\"monolithicSparse\"\n" + "\n" + "# Extent description\n" + "RW %" PRId64 " SPARSE \"%s\"\n" + "\n" + "# The Disk Data Base \n" + "#DDB\n" + "\n" + "ddb.virtualHWVersion = \"%d\"\n" + "ddb.geometry.cylinders = \"%" PRId64 "\"\n" + "ddb.geometry.heads = \"16\"\n" + "ddb.geometry.sectors = \"63\"\n" + "ddb.adapterType = \"ide\"\n"; + char desc[1024]; + const char *real_filename, *temp_str; + + /* XXX: add support for backing file */ + if (backing_file) { + return vmdk_snapshot_create(filename, backing_file); + } + + fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, + 0644); + if (fd < 0) + return -1; + magic = cpu_to_be32(VMDK4_MAGIC); + memset(&header, 0, sizeof(header)); + header.version = cpu_to_le32(1); + header.flags = cpu_to_le32(3); /* ?? */ + header.capacity = cpu_to_le64(total_size); + header.granularity = cpu_to_le64(128); + header.num_gtes_per_gte = cpu_to_le32(512); + + grains = (total_size + header.granularity - 1) / header.granularity; + gt_size = ((header.num_gtes_per_gte * sizeof(uint32_t)) + 511) >> 9; + gt_count = (grains + header.num_gtes_per_gte - 1) / header.num_gtes_per_gte; + gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9; + + header.desc_offset = 1; + header.desc_size = 20; + header.rgd_offset = header.desc_offset + header.desc_size; + header.gd_offset = header.rgd_offset + gd_size + (gt_size * gt_count); + header.grain_offset = + ((header.gd_offset + gd_size + (gt_size * gt_count) + + header.granularity - 1) / header.granularity) * + header.granularity; + + header.desc_offset = cpu_to_le64(header.desc_offset); + header.desc_size = cpu_to_le64(header.desc_size); + header.rgd_offset = cpu_to_le64(header.rgd_offset); + header.gd_offset = cpu_to_le64(header.gd_offset); + header.grain_offset = cpu_to_le64(header.grain_offset); + + header.check_bytes[0] = 0xa; + header.check_bytes[1] = 0x20; + header.check_bytes[2] = 0xd; + header.check_bytes[3] = 0xa; + + /* write all the data */ + write(fd, &magic, sizeof(magic)); + write(fd, &header, sizeof(header)); + + ftruncate(fd, header.grain_offset << 9); + + /* write grain directory */ + lseek(fd, le64_to_cpu(header.rgd_offset) << 9, SEEK_SET); + for (i = 0, tmp = header.rgd_offset + gd_size; + i < gt_count; i++, tmp += gt_size) + write(fd, &tmp, sizeof(tmp)); + + /* write backup grain directory */ + lseek(fd, le64_to_cpu(header.gd_offset) << 9, SEEK_SET); + for (i = 0, tmp = header.gd_offset + gd_size; + i < gt_count; i++, tmp += gt_size) + write(fd, &tmp, sizeof(tmp)); + + /* compose the descriptor */ + real_filename = filename; + if ((temp_str = strrchr(real_filename, '\\')) != NULL) + real_filename = temp_str + 1; + if ((temp_str = strrchr(real_filename, '/')) != NULL) + real_filename = temp_str + 1; + if ((temp_str = strrchr(real_filename, ':')) != NULL) + real_filename = temp_str + 1; + snprintf(desc, sizeof(desc), desc_template, (unsigned int)time(NULL), + total_size, real_filename, + (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4), + total_size / (int64_t)(63 * 16)); + + /* write the descriptor */ + lseek(fd, le64_to_cpu(header.desc_offset) << 9, SEEK_SET); + write(fd, desc, strlen(desc)); + + close(fd); + return 0; +} + +static void vmdk_close(BlockDriverState *bs) +{ + BDRVVmdkState *s = bs->opaque; + + qemu_free(s->l1_table); + qemu_free(s->l2_cache); + // try to close parent image, if exist + vmdk_parent_close(s->hd); + bdrv_delete(s->hd); +} + +static void vmdk_flush(BlockDriverState *bs) +{ + BDRVVmdkState *s = bs->opaque; + bdrv_flush(s->hd); +} + +static BlockDriver bdrv_vmdk = { + .format_name = "vmdk", + .instance_size = sizeof(BDRVVmdkState), + .bdrv_probe = vmdk_probe, + .bdrv_open = vmdk_open, + .bdrv_read = vmdk_read, + .bdrv_write = vmdk_write, + .bdrv_close = vmdk_close, + .bdrv_create = vmdk_create, + .bdrv_flush = vmdk_flush, + .bdrv_is_allocated = vmdk_is_allocated, +}; + +static void bdrv_vmdk_init(void) +{ + bdrv_register(&bdrv_vmdk); +} + +block_init(bdrv_vmdk_init); diff --git a/block/vpc.c b/block/vpc.c new file mode 100644 index 0000000000..211ae5c72f --- /dev/null +++ b/block/vpc.c @@ -0,0 +1,606 @@ +/* + * Block driver for Conectix/Microsoft Virtual PC images + * + * Copyright (c) 2005 Alex Beregszaszi + * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block_int.h" +#include "module.h" + +/**************************************************************/ + +#define HEADER_SIZE 512 + +//#define CACHE + +enum vhd_type { + VHD_FIXED = 2, + VHD_DYNAMIC = 3, + VHD_DIFFERENCING = 4, +}; + +// Seconds since Jan 1, 2000 0:00:00 (UTC) +#define VHD_TIMESTAMP_BASE 946684800 + +// always big-endian +struct vhd_footer { + char creator[8]; // "conectix" + uint32_t features; + uint32_t version; + + // Offset of next header structure, 0xFFFFFFFF if none + uint64_t data_offset; + + // Seconds since Jan 1, 2000 0:00:00 (UTC) + uint32_t timestamp; + + char creator_app[4]; // "vpc " + uint16_t major; + uint16_t minor; + char creator_os[4]; // "Wi2k" + + uint64_t orig_size; + uint64_t size; + + uint16_t cyls; + uint8_t heads; + uint8_t secs_per_cyl; + + uint32_t type; + + // Checksum of the Hard Disk Footer ("one's complement of the sum of all + // the bytes in the footer without the checksum field") + uint32_t checksum; + + // UUID used to identify a parent hard disk (backing file) + uint8_t uuid[16]; + + uint8_t in_saved_state; +}; + +struct vhd_dyndisk_header { + char magic[8]; // "cxsparse" + + // Offset of next header structure, 0xFFFFFFFF if none + uint64_t data_offset; + + // Offset of the Block Allocation Table (BAT) + uint64_t table_offset; + + uint32_t version; + uint32_t max_table_entries; // 32bit/entry + + // 2 MB by default, must be a power of two + uint32_t block_size; + + uint32_t checksum; + uint8_t parent_uuid[16]; + uint32_t parent_timestamp; + uint32_t reserved; + + // Backing file name (in UTF-16) + uint8_t parent_name[512]; + + struct { + uint32_t platform; + uint32_t data_space; + uint32_t data_length; + uint32_t reserved; + uint64_t data_offset; + } parent_locator[8]; +}; + +typedef struct BDRVVPCState { + BlockDriverState *hd; + + uint8_t footer_buf[HEADER_SIZE]; + uint64_t free_data_block_offset; + int max_table_entries; + uint32_t *pagetable; + uint64_t bat_offset; + uint64_t last_bitmap_offset; + + uint32_t block_size; + uint32_t bitmap_size; + +#ifdef CACHE + uint8_t *pageentry_u8; + uint32_t *pageentry_u32; + uint16_t *pageentry_u16; + + uint64_t last_bitmap; +#endif +} BDRVVPCState; + +static uint32_t vpc_checksum(uint8_t* buf, size_t size) +{ + uint32_t res = 0; + int i; + + for (i = 0; i < size; i++) + res += buf[i]; + + return ~res; +} + + +static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + if (buf_size >= 8 && !strncmp((char *)buf, "conectix", 8)) + return 100; + return 0; +} + +static int vpc_open(BlockDriverState *bs, const char *filename, int flags) +{ + BDRVVPCState *s = bs->opaque; + int ret, i; + struct vhd_footer* footer; + struct vhd_dyndisk_header* dyndisk_header; + uint8_t buf[HEADER_SIZE]; + uint32_t checksum; + + ret = bdrv_file_open(&s->hd, filename, flags); + if (ret < 0) + return ret; + + if (bdrv_pread(s->hd, 0, s->footer_buf, HEADER_SIZE) != HEADER_SIZE) + goto fail; + + footer = (struct vhd_footer*) s->footer_buf; + if (strncmp(footer->creator, "conectix", 8)) + goto fail; + + checksum = be32_to_cpu(footer->checksum); + footer->checksum = 0; + if (vpc_checksum(s->footer_buf, HEADER_SIZE) != checksum) + fprintf(stderr, "block-vpc: The header checksum of '%s' is " + "incorrect.\n", filename); + + // The visible size of a image in Virtual PC depends on the geometry + // rather than on the size stored in the footer (the size in the footer + // is too large usually) + bs->total_sectors = (int64_t) + be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl; + + if (bdrv_pread(s->hd, be64_to_cpu(footer->data_offset), buf, HEADER_SIZE) + != HEADER_SIZE) + goto fail; + + dyndisk_header = (struct vhd_dyndisk_header*) buf; + + if (strncmp(dyndisk_header->magic, "cxsparse", 8)) + goto fail; + + + s->block_size = be32_to_cpu(dyndisk_header->block_size); + s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511; + + s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries); + s->pagetable = qemu_malloc(s->max_table_entries * 4); + + s->bat_offset = be64_to_cpu(dyndisk_header->table_offset); + if (bdrv_pread(s->hd, s->bat_offset, s->pagetable, + s->max_table_entries * 4) != s->max_table_entries * 4) + goto fail; + + s->free_data_block_offset = + (s->bat_offset + (s->max_table_entries * 4) + 511) & ~511; + + for (i = 0; i < s->max_table_entries; i++) { + be32_to_cpus(&s->pagetable[i]); + if (s->pagetable[i] != 0xFFFFFFFF) { + int64_t next = (512 * (int64_t) s->pagetable[i]) + + s->bitmap_size + s->block_size; + + if (next> s->free_data_block_offset) + s->free_data_block_offset = next; + } + } + + s->last_bitmap_offset = (int64_t) -1; + +#ifdef CACHE + s->pageentry_u8 = qemu_malloc(512); + s->pageentry_u32 = s->pageentry_u8; + s->pageentry_u16 = s->pageentry_u8; + s->last_pagetable = -1; +#endif + + return 0; + fail: + bdrv_delete(s->hd); + return -1; +} + +/* + * Returns the absolute byte offset of the given sector in the image file. + * If the sector is not allocated, -1 is returned instead. + * + * The parameter write must be 1 if the offset will be used for a write + * operation (the block bitmaps is updated then), 0 otherwise. + */ +static inline int64_t get_sector_offset(BlockDriverState *bs, + int64_t sector_num, int write) +{ + BDRVVPCState *s = bs->opaque; + uint64_t offset = sector_num * 512; + uint64_t bitmap_offset, block_offset; + uint32_t pagetable_index, pageentry_index; + + pagetable_index = offset / s->block_size; + pageentry_index = (offset % s->block_size) / 512; + + if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff) + return -1; // not allocated + + bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index]; + block_offset = bitmap_offset + s->bitmap_size + (512 * pageentry_index); + + // We must ensure that we don't write to any sectors which are marked as + // unused in the bitmap. We get away with setting all bits in the block + // bitmap each time we write to a new block. This might cause Virtual PC to + // miss sparse read optimization, but it's not a problem in terms of + // correctness. + if (write && (s->last_bitmap_offset != bitmap_offset)) { + uint8_t bitmap[s->bitmap_size]; + + s->last_bitmap_offset = bitmap_offset; + memset(bitmap, 0xff, s->bitmap_size); + bdrv_pwrite(s->hd, bitmap_offset, bitmap, s->bitmap_size); + } + +// printf("sector: %" PRIx64 ", index: %x, offset: %x, bioff: %" PRIx64 ", bloff: %" PRIx64 "\n", +// sector_num, pagetable_index, pageentry_index, +// bitmap_offset, block_offset); + +// disabled by reason +#if 0 +#ifdef CACHE + if (bitmap_offset != s->last_bitmap) + { + lseek(s->fd, bitmap_offset, SEEK_SET); + + s->last_bitmap = bitmap_offset; + + // Scary! Bitmap is stored as big endian 32bit entries, + // while we used to look it up byte by byte + read(s->fd, s->pageentry_u8, 512); + for (i = 0; i < 128; i++) + be32_to_cpus(&s->pageentry_u32[i]); + } + + if ((s->pageentry_u8[pageentry_index / 8] >> (pageentry_index % 8)) & 1) + return -1; +#else + lseek(s->fd, bitmap_offset + (pageentry_index / 8), SEEK_SET); + + read(s->fd, &bitmap_entry, 1); + + if ((bitmap_entry >> (pageentry_index % 8)) & 1) + return -1; // not allocated +#endif +#endif + + return block_offset; +} + +/* + * Writes the footer to the end of the image file. This is needed when the + * file grows as it overwrites the old footer + * + * Returns 0 on success and < 0 on error + */ +static int rewrite_footer(BlockDriverState* bs) +{ + int ret; + BDRVVPCState *s = bs->opaque; + int64_t offset = s->free_data_block_offset; + + ret = bdrv_pwrite(s->hd, offset, s->footer_buf, HEADER_SIZE); + if (ret < 0) + return ret; + + return 0; +} + +/* + * Allocates a new block. This involves writing a new footer and updating + * the Block Allocation Table to use the space at the old end of the image + * file (overwriting the old footer) + * + * Returns the sectors' offset in the image file on success and < 0 on error + */ +static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num) +{ + BDRVVPCState *s = bs->opaque; + int64_t bat_offset; + uint32_t index, bat_value; + int ret; + uint8_t bitmap[s->bitmap_size]; + + // Check if sector_num is valid + if ((sector_num < 0) || (sector_num > bs->total_sectors)) + return -1; + + // Write entry into in-memory BAT + index = (sector_num * 512) / s->block_size; + if (s->pagetable[index] != 0xFFFFFFFF) + return -1; + + s->pagetable[index] = s->free_data_block_offset / 512; + + // Initialize the block's bitmap + memset(bitmap, 0xff, s->bitmap_size); + bdrv_pwrite(s->hd, s->free_data_block_offset, bitmap, s->bitmap_size); + + // Write new footer (the old one will be overwritten) + s->free_data_block_offset += s->block_size + s->bitmap_size; + ret = rewrite_footer(bs); + if (ret < 0) + goto fail; + + // Write BAT entry to disk + bat_offset = s->bat_offset + (4 * index); + bat_value = be32_to_cpu(s->pagetable[index]); + ret = bdrv_pwrite(s->hd, bat_offset, &bat_value, 4); + if (ret < 0) + goto fail; + + return get_sector_offset(bs, sector_num, 0); + +fail: + s->free_data_block_offset -= (s->block_size + s->bitmap_size); + return -1; +} + +static int vpc_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVVPCState *s = bs->opaque; + int ret; + int64_t offset; + + while (nb_sectors > 0) { + offset = get_sector_offset(bs, sector_num, 0); + + if (offset == -1) { + memset(buf, 0, 512); + } else { + ret = bdrv_pread(s->hd, offset, buf, 512); + if (ret != 512) + return -1; + } + + nb_sectors--; + sector_num++; + buf += 512; + } + return 0; +} + +static int vpc_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVVPCState *s = bs->opaque; + int64_t offset; + int ret; + + while (nb_sectors > 0) { + offset = get_sector_offset(bs, sector_num, 1); + + if (offset == -1) { + offset = alloc_block(bs, sector_num); + if (offset < 0) + return -1; + } + + ret = bdrv_pwrite(s->hd, offset, buf, 512); + if (ret != 512) + return -1; + + nb_sectors--; + sector_num++; + buf += 512; + } + + return 0; +} + + +/* + * Calculates the number of cylinders, heads and sectors per cylinder + * based on a given number of sectors. This is the algorithm described + * in the VHD specification. + * + * Note that the geometry doesn't always exactly match total_sectors but + * may round it down. + * + * Returns 0 on success, -EFBIG if the size is larger than 127 GB + */ +static int calculate_geometry(int64_t total_sectors, uint16_t* cyls, + uint8_t* heads, uint8_t* secs_per_cyl) +{ + uint32_t cyls_times_heads; + + if (total_sectors > 65535 * 16 * 255) + return -EFBIG; + + if (total_sectors > 65535 * 16 * 63) { + *secs_per_cyl = 255; + *heads = 16; + cyls_times_heads = total_sectors / *secs_per_cyl; + } else { + *secs_per_cyl = 17; + cyls_times_heads = total_sectors / *secs_per_cyl; + *heads = (cyls_times_heads + 1023) / 1024; + + if (*heads < 4) + *heads = 4; + + if (cyls_times_heads >= (*heads * 1024) || *heads > 16) { + *secs_per_cyl = 31; + *heads = 16; + cyls_times_heads = total_sectors / *secs_per_cyl; + } + + if (cyls_times_heads >= (*heads * 1024)) { + *secs_per_cyl = 63; + *heads = 16; + cyls_times_heads = total_sectors / *secs_per_cyl; + } + } + + // Note: Rounding up deviates from the Virtual PC behaviour + // However, we need this to avoid truncating images in qemu-img convert + *cyls = (cyls_times_heads + *heads - 1) / *heads; + + return 0; +} + +static int vpc_create(const char *filename, int64_t total_sectors, + const char *backing_file, int flags) +{ + uint8_t buf[1024]; + struct vhd_footer* footer = (struct vhd_footer*) buf; + struct vhd_dyndisk_header* dyndisk_header = + (struct vhd_dyndisk_header*) buf; + int fd, i; + uint16_t cyls; + uint8_t heads; + uint8_t secs_per_cyl; + size_t block_size, num_bat_entries; + + if (backing_file != NULL) + return -ENOTSUP; + + fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); + if (fd < 0) + return -EIO; + + // Calculate matching total_size and geometry + if (calculate_geometry(total_sectors, &cyls, &heads, &secs_per_cyl)) + return -EFBIG; + total_sectors = (int64_t) cyls * heads * secs_per_cyl; + + // Prepare the Hard Disk Footer + memset(buf, 0, 1024); + + strncpy(footer->creator, "conectix", 8); + // TODO Check if "qemu" creator_app is ok for VPC + strncpy(footer->creator_app, "qemu", 4); + strncpy(footer->creator_os, "Wi2k", 4); + + footer->features = be32_to_cpu(0x02); + footer->version = be32_to_cpu(0x00010000); + footer->data_offset = be64_to_cpu(HEADER_SIZE); + footer->timestamp = be32_to_cpu(time(NULL) - VHD_TIMESTAMP_BASE); + + // Version of Virtual PC 2007 + footer->major = be16_to_cpu(0x0005); + footer->minor =be16_to_cpu(0x0003); + + footer->orig_size = be64_to_cpu(total_sectors * 512); + footer->size = be64_to_cpu(total_sectors * 512); + + footer->cyls = be16_to_cpu(cyls); + footer->heads = heads; + footer->secs_per_cyl = secs_per_cyl; + + footer->type = be32_to_cpu(VHD_DYNAMIC); + + // TODO uuid is missing + + footer->checksum = be32_to_cpu(vpc_checksum(buf, HEADER_SIZE)); + + // Write the footer (twice: at the beginning and at the end) + block_size = 0x200000; + num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512); + + if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) + return -EIO; + + if (lseek(fd, 1536 + ((num_bat_entries * 4 + 511) & ~511), SEEK_SET) < 0) + return -EIO; + if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) + return -EIO; + + // Write the initial BAT + if (lseek(fd, 3 * 512, SEEK_SET) < 0) + return -EIO; + + memset(buf, 0xFF, 512); + for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) + if (write(fd, buf, 512) != 512) + return -EIO; + + + // Prepare the Dynamic Disk Header + memset(buf, 0, 1024); + + strncpy(dyndisk_header->magic, "cxsparse", 8); + + dyndisk_header->data_offset = be64_to_cpu(0xFFFFFFFF); + dyndisk_header->table_offset = be64_to_cpu(3 * 512); + dyndisk_header->version = be32_to_cpu(0x00010000); + dyndisk_header->block_size = be32_to_cpu(block_size); + dyndisk_header->max_table_entries = be32_to_cpu(num_bat_entries); + + dyndisk_header->checksum = be32_to_cpu(vpc_checksum(buf, 1024)); + + // Write the header + if (lseek(fd, 512, SEEK_SET) < 0) + return -EIO; + if (write(fd, buf, 1024) != 1024) + return -EIO; + + close(fd); + return 0; +} + +static void vpc_close(BlockDriverState *bs) +{ + BDRVVPCState *s = bs->opaque; + qemu_free(s->pagetable); +#ifdef CACHE + qemu_free(s->pageentry_u8); +#endif + bdrv_delete(s->hd); +} + +static BlockDriver bdrv_vpc = { + .format_name = "vpc", + .instance_size = sizeof(BDRVVPCState), + .bdrv_probe = vpc_probe, + .bdrv_open = vpc_open, + .bdrv_read = vpc_read, + .bdrv_write = vpc_write, + .bdrv_close = vpc_close, + .bdrv_create = vpc_create, +}; + +static void bdrv_vpc_init(void) +{ + bdrv_register(&bdrv_vpc); +} + +block_init(bdrv_vpc_init); diff --git a/block/vvfat.c b/block/vvfat.c new file mode 100644 index 0000000000..2a8feb38d8 --- /dev/null +++ b/block/vvfat.c @@ -0,0 +1,2855 @@ +/* vim:set shiftwidth=4 ts=8: */ +/* + * QEMU Block driver for virtual VFAT (shadows a local directory) + * + * Copyright (c) 2004,2005 Johannes E. Schindelin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include <sys/stat.h> +#include <dirent.h> +#include "qemu-common.h" +#include "block_int.h" +#include "module.h" + +#ifndef S_IWGRP +#define S_IWGRP 0 +#endif +#ifndef S_IWOTH +#define S_IWOTH 0 +#endif + +/* TODO: add ":bootsector=blabla.img:" */ +/* LATER TODO: add automatic boot sector generation from + BOOTEASY.ASM and Ranish Partition Manager + Note that DOS assumes the system files to be the first files in the + file system (test if the boot sector still relies on that fact)! */ +/* MAYBE TODO: write block-visofs.c */ +/* TODO: call try_commit() only after a timeout */ + +/* #define DEBUG */ + +#ifdef DEBUG + +#define DLOG(a) a + +#undef stderr +#define stderr STDERR +FILE* stderr = NULL; + +static void checkpoint(void); + +#ifdef __MINGW32__ +void nonono(const char* file, int line, const char* msg) { + fprintf(stderr, "Nonono! %s:%d %s\n", file, line, msg); + exit(-5); +} +#undef assert +#define assert(a) do {if (!(a)) nonono(__FILE__, __LINE__, #a);}while(0) +#endif + +#else + +#define DLOG(a) + +#endif + +/* dynamic array functions */ +typedef struct array_t { + char* pointer; + unsigned int size,next,item_size; +} array_t; + +static inline void array_init(array_t* array,unsigned int item_size) +{ + array->pointer = NULL; + array->size=0; + array->next=0; + array->item_size=item_size; +} + +static inline void array_free(array_t* array) +{ + if(array->pointer) + free(array->pointer); + array->size=array->next=0; +} + +/* does not automatically grow */ +static inline void* array_get(array_t* array,unsigned int index) { + assert(index < array->next); + return array->pointer + index * array->item_size; +} + +static inline int array_ensure_allocated(array_t* array, int index) +{ + if((index + 1) * array->item_size > array->size) { + int new_size = (index + 32) * array->item_size; + array->pointer = qemu_realloc(array->pointer, new_size); + if (!array->pointer) + return -1; + array->size = new_size; + array->next = index + 1; + } + + return 0; +} + +static inline void* array_get_next(array_t* array) { + unsigned int next = array->next; + void* result; + + if (array_ensure_allocated(array, next) < 0) + return NULL; + + array->next = next + 1; + result = array_get(array, next); + + return result; +} + +static inline void* array_insert(array_t* array,unsigned int index,unsigned int count) { + if((array->next+count)*array->item_size>array->size) { + int increment=count*array->item_size; + array->pointer=qemu_realloc(array->pointer,array->size+increment); + if(!array->pointer) + return NULL; + array->size+=increment; + } + memmove(array->pointer+(index+count)*array->item_size, + array->pointer+index*array->item_size, + (array->next-index)*array->item_size); + array->next+=count; + return array->pointer+index*array->item_size; +} + +/* this performs a "roll", so that the element which was at index_from becomes + * index_to, but the order of all other elements is preserved. */ +static inline int array_roll(array_t* array,int index_to,int index_from,int count) +{ + char* buf; + char* from; + char* to; + int is; + + if(!array || + index_to<0 || index_to>=array->next || + index_from<0 || index_from>=array->next) + return -1; + + if(index_to==index_from) + return 0; + + is=array->item_size; + from=array->pointer+index_from*is; + to=array->pointer+index_to*is; + buf=qemu_malloc(is*count); + memcpy(buf,from,is*count); + + if(index_to<index_from) + memmove(to+is*count,to,from-to); + else + memmove(from,from+is*count,to-from); + + memcpy(to,buf,is*count); + + free(buf); + + return 0; +} + +static inline int array_remove_slice(array_t* array,int index, int count) +{ + assert(index >=0); + assert(count > 0); + assert(index + count <= array->next); + if(array_roll(array,array->next-1,index,count)) + return -1; + array->next -= count; + return 0; +} + +static int array_remove(array_t* array,int index) +{ + return array_remove_slice(array, index, 1); +} + +/* return the index for a given member */ +static int array_index(array_t* array, void* pointer) +{ + size_t offset = (char*)pointer - array->pointer; + assert((offset % array->item_size) == 0); + assert(offset/array->item_size < array->next); + return offset/array->item_size; +} + +/* These structures are used to fake a disk and the VFAT filesystem. + * For this reason we need to use __attribute__((packed)). */ + +typedef struct bootsector_t { + uint8_t jump[3]; + uint8_t name[8]; + uint16_t sector_size; + uint8_t sectors_per_cluster; + uint16_t reserved_sectors; + uint8_t number_of_fats; + uint16_t root_entries; + uint16_t total_sectors16; + uint8_t media_type; + uint16_t sectors_per_fat; + uint16_t sectors_per_track; + uint16_t number_of_heads; + uint32_t hidden_sectors; + uint32_t total_sectors; + union { + struct { + uint8_t drive_number; + uint8_t current_head; + uint8_t signature; + uint32_t id; + uint8_t volume_label[11]; + } __attribute__((packed)) fat16; + struct { + uint32_t sectors_per_fat; + uint16_t flags; + uint8_t major,minor; + uint32_t first_cluster_of_root_directory; + uint16_t info_sector; + uint16_t backup_boot_sector; + uint16_t ignored; + } __attribute__((packed)) fat32; + } u; + uint8_t fat_type[8]; + uint8_t ignored[0x1c0]; + uint8_t magic[2]; +} __attribute__((packed)) bootsector_t; + +typedef struct { + uint8_t head; + uint8_t sector; + uint8_t cylinder; +} mbr_chs_t; + +typedef struct partition_t { + uint8_t attributes; /* 0x80 = bootable */ + mbr_chs_t start_CHS; + uint8_t fs_type; /* 0x1 = FAT12, 0x6 = FAT16, 0xe = FAT16_LBA, 0xb = FAT32, 0xc = FAT32_LBA */ + mbr_chs_t end_CHS; + uint32_t start_sector_long; + uint32_t length_sector_long; +} __attribute__((packed)) partition_t; + +typedef struct mbr_t { + uint8_t ignored[0x1b8]; + uint32_t nt_id; + uint8_t ignored2[2]; + partition_t partition[4]; + uint8_t magic[2]; +} __attribute__((packed)) mbr_t; + +typedef struct direntry_t { + uint8_t name[8]; + uint8_t extension[3]; + uint8_t attributes; + uint8_t reserved[2]; + uint16_t ctime; + uint16_t cdate; + uint16_t adate; + uint16_t begin_hi; + uint16_t mtime; + uint16_t mdate; + uint16_t begin; + uint32_t size; +} __attribute__((packed)) direntry_t; + +/* this structure are used to transparently access the files */ + +typedef struct mapping_t { + /* begin is the first cluster, end is the last+1 */ + uint32_t begin,end; + /* as s->directory is growable, no pointer may be used here */ + unsigned int dir_index; + /* the clusters of a file may be in any order; this points to the first */ + int first_mapping_index; + union { + /* offset is + * - the offset in the file (in clusters) for a file, or + * - the next cluster of the directory for a directory, and + * - the address of the buffer for a faked entry + */ + struct { + uint32_t offset; + } file; + struct { + int parent_mapping_index; + int first_dir_index; + } dir; + } info; + /* path contains the full path, i.e. it always starts with s->path */ + char* path; + + enum { MODE_UNDEFINED = 0, MODE_NORMAL = 1, MODE_MODIFIED = 2, + MODE_DIRECTORY = 4, MODE_FAKED = 8, + MODE_DELETED = 16, MODE_RENAMED = 32 } mode; + int read_only; +} mapping_t; + +#ifdef DEBUG +static void print_direntry(const struct direntry_t*); +static void print_mapping(const struct mapping_t* mapping); +#endif + +/* here begins the real VVFAT driver */ + +typedef struct BDRVVVFATState { + BlockDriverState* bs; /* pointer to parent */ + unsigned int first_sectors_number; /* 1 for a single partition, 0x40 for a disk with partition table */ + unsigned char first_sectors[0x40*0x200]; + + int fat_type; /* 16 or 32 */ + array_t fat,directory,mapping; + + unsigned int cluster_size; + unsigned int sectors_per_cluster; + unsigned int sectors_per_fat; + unsigned int sectors_of_root_directory; + uint32_t last_cluster_of_root_directory; + unsigned int faked_sectors; /* how many sectors are faked before file data */ + uint32_t sector_count; /* total number of sectors of the partition */ + uint32_t cluster_count; /* total number of clusters of this partition */ + uint32_t max_fat_value; + + int current_fd; + mapping_t* current_mapping; + unsigned char* cluster; /* points to current cluster */ + unsigned char* cluster_buffer; /* points to a buffer to hold temp data */ + unsigned int current_cluster; + + /* write support */ + BlockDriverState* write_target; + char* qcow_filename; + BlockDriverState* qcow; + void* fat2; + char* used_clusters; + array_t commits; + const char* path; + int downcase_short_names; +} BDRVVVFATState; + +/* take the sector position spos and convert it to Cylinder/Head/Sector position + * if the position is outside the specified geometry, fill maximum value for CHS + * and return 1 to signal overflow. + */ +static int sector2CHS(BlockDriverState* bs, mbr_chs_t * chs, int spos){ + int head,sector; + sector = spos % (bs->secs); spos/= bs->secs; + head = spos % (bs->heads); spos/= bs->heads; + if(spos >= bs->cyls){ + /* Overflow, + it happens if 32bit sector positions are used, while CHS is only 24bit. + Windows/Dos is said to take 1023/255/63 as nonrepresentable CHS */ + chs->head = 0xFF; + chs->sector = 0xFF; + chs->cylinder = 0xFF; + return 1; + } + chs->head = (uint8_t)head; + chs->sector = (uint8_t)( (sector+1) | ((spos>>8)<<6) ); + chs->cylinder = (uint8_t)spos; + return 0; +} + +static void init_mbr(BDRVVVFATState* s) +{ + /* TODO: if the files mbr.img and bootsect.img exist, use them */ + mbr_t* real_mbr=(mbr_t*)s->first_sectors; + partition_t* partition=&(real_mbr->partition[0]); + int lba; + + memset(s->first_sectors,0,512); + + /* Win NT Disk Signature */ + real_mbr->nt_id= cpu_to_le32(0xbe1afdfa); + + partition->attributes=0x80; /* bootable */ + + /* LBA is used when partition is outside the CHS geometry */ + lba = sector2CHS(s->bs, &partition->start_CHS, s->first_sectors_number-1); + lba|= sector2CHS(s->bs, &partition->end_CHS, s->sector_count); + + /*LBA partitions are identified only by start/length_sector_long not by CHS*/ + partition->start_sector_long =cpu_to_le32(s->first_sectors_number-1); + partition->length_sector_long=cpu_to_le32(s->sector_count - s->first_sectors_number+1); + + /* FAT12/FAT16/FAT32 */ + /* DOS uses different types when partition is LBA, + probably to prevent older versions from using CHS on them */ + partition->fs_type= s->fat_type==12 ? 0x1: + s->fat_type==16 ? (lba?0xe:0x06): + /*fat_tyoe==32*/ (lba?0xc:0x0b); + + real_mbr->magic[0]=0x55; real_mbr->magic[1]=0xaa; +} + +/* direntry functions */ + +/* dest is assumed to hold 258 bytes, and pads with 0xffff up to next multiple of 26 */ +static inline int short2long_name(char* dest,const char* src) +{ + int i; + int len; + for(i=0;i<129 && src[i];i++) { + dest[2*i]=src[i]; + dest[2*i+1]=0; + } + len=2*i; + dest[2*i]=dest[2*i+1]=0; + for(i=2*i+2;(i%26);i++) + dest[i]=0xff; + return len; +} + +static inline direntry_t* create_long_filename(BDRVVVFATState* s,const char* filename) +{ + char buffer[258]; + int length=short2long_name(buffer,filename), + number_of_entries=(length+25)/26,i; + direntry_t* entry; + + for(i=0;i<number_of_entries;i++) { + entry=array_get_next(&(s->directory)); + entry->attributes=0xf; + entry->reserved[0]=0; + entry->begin=0; + entry->name[0]=(number_of_entries-i)|(i==0?0x40:0); + } + for(i=0;i<26*number_of_entries;i++) { + int offset=(i%26); + if(offset<10) offset=1+offset; + else if(offset<22) offset=14+offset-10; + else offset=28+offset-22; + entry=array_get(&(s->directory),s->directory.next-1-(i/26)); + entry->name[offset]=buffer[i]; + } + return array_get(&(s->directory),s->directory.next-number_of_entries); +} + +static char is_free(const direntry_t* direntry) +{ + return direntry->name[0]==0xe5 || direntry->name[0]==0x00; +} + +static char is_volume_label(const direntry_t* direntry) +{ + return direntry->attributes == 0x28; +} + +static char is_long_name(const direntry_t* direntry) +{ + return direntry->attributes == 0xf; +} + +static char is_short_name(const direntry_t* direntry) +{ + return !is_volume_label(direntry) && !is_long_name(direntry) + && !is_free(direntry); +} + +static char is_directory(const direntry_t* direntry) +{ + return direntry->attributes & 0x10 && direntry->name[0] != 0xe5; +} + +static inline char is_dot(const direntry_t* direntry) +{ + return is_short_name(direntry) && direntry->name[0] == '.'; +} + +static char is_file(const direntry_t* direntry) +{ + return is_short_name(direntry) && !is_directory(direntry); +} + +static inline uint32_t begin_of_direntry(const direntry_t* direntry) +{ + return le16_to_cpu(direntry->begin)|(le16_to_cpu(direntry->begin_hi)<<16); +} + +static inline uint32_t filesize_of_direntry(const direntry_t* direntry) +{ + return le32_to_cpu(direntry->size); +} + +static void set_begin_of_direntry(direntry_t* direntry, uint32_t begin) +{ + direntry->begin = cpu_to_le16(begin & 0xffff); + direntry->begin_hi = cpu_to_le16((begin >> 16) & 0xffff); +} + +/* fat functions */ + +static inline uint8_t fat_chksum(const direntry_t* entry) +{ + uint8_t chksum=0; + int i; + + for(i=0;i<11;i++) { + unsigned char c; + + c = (i <= 8) ? entry->name[i] : entry->extension[i-8]; + chksum=(((chksum&0xfe)>>1)|((chksum&0x01)?0x80:0)) + c; + } + + return chksum; +} + +/* if return_time==0, this returns the fat_date, else the fat_time */ +static uint16_t fat_datetime(time_t time,int return_time) { + struct tm* t; +#ifdef _WIN32 + t=localtime(&time); /* this is not thread safe */ +#else + struct tm t1; + t=&t1; + localtime_r(&time,t); +#endif + if(return_time) + return cpu_to_le16((t->tm_sec/2)|(t->tm_min<<5)|(t->tm_hour<<11)); + return cpu_to_le16((t->tm_mday)|((t->tm_mon+1)<<5)|((t->tm_year-80)<<9)); +} + +static inline void fat_set(BDRVVVFATState* s,unsigned int cluster,uint32_t value) +{ + if(s->fat_type==32) { + uint32_t* entry=array_get(&(s->fat),cluster); + *entry=cpu_to_le32(value); + } else if(s->fat_type==16) { + uint16_t* entry=array_get(&(s->fat),cluster); + *entry=cpu_to_le16(value&0xffff); + } else { + int offset = (cluster*3/2); + unsigned char* p = array_get(&(s->fat), offset); + switch (cluster&1) { + case 0: + p[0] = value&0xff; + p[1] = (p[1]&0xf0) | ((value>>8)&0xf); + break; + case 1: + p[0] = (p[0]&0xf) | ((value&0xf)<<4); + p[1] = (value>>4); + break; + } + } +} + +static inline uint32_t fat_get(BDRVVVFATState* s,unsigned int cluster) +{ + if(s->fat_type==32) { + uint32_t* entry=array_get(&(s->fat),cluster); + return le32_to_cpu(*entry); + } else if(s->fat_type==16) { + uint16_t* entry=array_get(&(s->fat),cluster); + return le16_to_cpu(*entry); + } else { + const uint8_t* x=(uint8_t*)(s->fat.pointer)+cluster*3/2; + return ((x[0]|(x[1]<<8))>>(cluster&1?4:0))&0x0fff; + } +} + +static inline int fat_eof(BDRVVVFATState* s,uint32_t fat_entry) +{ + if(fat_entry>s->max_fat_value-8) + return -1; + return 0; +} + +static inline void init_fat(BDRVVVFATState* s) +{ + if (s->fat_type == 12) { + array_init(&(s->fat),1); + array_ensure_allocated(&(s->fat), + s->sectors_per_fat * 0x200 * 3 / 2 - 1); + } else { + array_init(&(s->fat),(s->fat_type==32?4:2)); + array_ensure_allocated(&(s->fat), + s->sectors_per_fat * 0x200 / s->fat.item_size - 1); + } + memset(s->fat.pointer,0,s->fat.size); + + switch(s->fat_type) { + case 12: s->max_fat_value=0xfff; break; + case 16: s->max_fat_value=0xffff; break; + case 32: s->max_fat_value=0x0fffffff; break; + default: s->max_fat_value=0; /* error... */ + } + +} + +/* TODO: in create_short_filename, 0xe5->0x05 is not yet handled! */ +/* TODO: in parse_short_filename, 0x05->0xe5 is not yet handled! */ +static inline direntry_t* create_short_and_long_name(BDRVVVFATState* s, + unsigned int directory_start, const char* filename, int is_dot) +{ + int i,j,long_index=s->directory.next; + direntry_t* entry = NULL; + direntry_t* entry_long = NULL; + + if(is_dot) { + entry=array_get_next(&(s->directory)); + memset(entry->name,0x20,11); + memcpy(entry->name,filename,strlen(filename)); + return entry; + } + + entry_long=create_long_filename(s,filename); + + i = strlen(filename); + for(j = i - 1; j>0 && filename[j]!='.';j--); + if (j > 0) + i = (j > 8 ? 8 : j); + else if (i > 8) + i = 8; + + entry=array_get_next(&(s->directory)); + memset(entry->name,0x20,11); + memcpy(entry->name, filename, i); + + if(j > 0) + for (i = 0; i < 3 && filename[j+1+i]; i++) + entry->extension[i] = filename[j+1+i]; + + /* upcase & remove unwanted characters */ + for(i=10;i>=0;i--) { + if(i==10 || i==7) for(;i>0 && entry->name[i]==' ';i--); + if(entry->name[i]<=' ' || entry->name[i]>0x7f + || strchr(".*?<>|\":/\\[];,+='",entry->name[i])) + entry->name[i]='_'; + else if(entry->name[i]>='a' && entry->name[i]<='z') + entry->name[i]+='A'-'a'; + } + + /* mangle duplicates */ + while(1) { + direntry_t* entry1=array_get(&(s->directory),directory_start); + int j; + + for(;entry1<entry;entry1++) + if(!is_long_name(entry1) && !memcmp(entry1->name,entry->name,11)) + break; /* found dupe */ + if(entry1==entry) /* no dupe found */ + break; + + /* use all 8 characters of name */ + if(entry->name[7]==' ') { + int j; + for(j=6;j>0 && entry->name[j]==' ';j--) + entry->name[j]='~'; + } + + /* increment number */ + for(j=7;j>0 && entry->name[j]=='9';j--) + entry->name[j]='0'; + if(j>0) { + if(entry->name[j]<'0' || entry->name[j]>'9') + entry->name[j]='0'; + else + entry->name[j]++; + } + } + + /* calculate checksum; propagate to long name */ + if(entry_long) { + uint8_t chksum=fat_chksum(entry); + + /* calculate anew, because realloc could have taken place */ + entry_long=array_get(&(s->directory),long_index); + while(entry_long<entry && is_long_name(entry_long)) { + entry_long->reserved[1]=chksum; + entry_long++; + } + } + + return entry; +} + +/* + * Read a directory. (the index of the corresponding mapping must be passed). + */ +static int read_directory(BDRVVVFATState* s, int mapping_index) +{ + mapping_t* mapping = array_get(&(s->mapping), mapping_index); + direntry_t* direntry; + const char* dirname = mapping->path; + int first_cluster = mapping->begin; + int parent_index = mapping->info.dir.parent_mapping_index; + mapping_t* parent_mapping = (mapping_t*) + (parent_index >= 0 ? array_get(&(s->mapping), parent_index) : NULL); + int first_cluster_of_parent = parent_mapping ? parent_mapping->begin : -1; + + DIR* dir=opendir(dirname); + struct dirent* entry; + int i; + + assert(mapping->mode & MODE_DIRECTORY); + + if(!dir) { + mapping->end = mapping->begin; + return -1; + } + + i = mapping->info.dir.first_dir_index = + first_cluster == 0 ? 0 : s->directory.next; + + /* actually read the directory, and allocate the mappings */ + while((entry=readdir(dir))) { + unsigned int length=strlen(dirname)+2+strlen(entry->d_name); + char* buffer; + direntry_t* direntry; + struct stat st; + int is_dot=!strcmp(entry->d_name,"."); + int is_dotdot=!strcmp(entry->d_name,".."); + + if(first_cluster == 0 && (is_dotdot || is_dot)) + continue; + + buffer=(char*)qemu_malloc(length); + snprintf(buffer,length,"%s/%s",dirname,entry->d_name); + + if(stat(buffer,&st)<0) { + free(buffer); + continue; + } + + /* create directory entry for this file */ + direntry=create_short_and_long_name(s, i, entry->d_name, + is_dot || is_dotdot); + direntry->attributes=(S_ISDIR(st.st_mode)?0x10:0x20); + direntry->reserved[0]=direntry->reserved[1]=0; + direntry->ctime=fat_datetime(st.st_ctime,1); + direntry->cdate=fat_datetime(st.st_ctime,0); + direntry->adate=fat_datetime(st.st_atime,0); + direntry->begin_hi=0; + direntry->mtime=fat_datetime(st.st_mtime,1); + direntry->mdate=fat_datetime(st.st_mtime,0); + if(is_dotdot) + set_begin_of_direntry(direntry, first_cluster_of_parent); + else if(is_dot) + set_begin_of_direntry(direntry, first_cluster); + else + direntry->begin=0; /* do that later */ + if (st.st_size > 0x7fffffff) { + fprintf(stderr, "File %s is larger than 2GB\n", buffer); + free(buffer); + return -2; + } + direntry->size=cpu_to_le32(S_ISDIR(st.st_mode)?0:st.st_size); + + /* create mapping for this file */ + if(!is_dot && !is_dotdot && (S_ISDIR(st.st_mode) || st.st_size)) { + s->current_mapping=(mapping_t*)array_get_next(&(s->mapping)); + s->current_mapping->begin=0; + s->current_mapping->end=st.st_size; + /* + * we get the direntry of the most recent direntry, which + * contains the short name and all the relevant information. + */ + s->current_mapping->dir_index=s->directory.next-1; + s->current_mapping->first_mapping_index = -1; + if (S_ISDIR(st.st_mode)) { + s->current_mapping->mode = MODE_DIRECTORY; + s->current_mapping->info.dir.parent_mapping_index = + mapping_index; + } else { + s->current_mapping->mode = MODE_UNDEFINED; + s->current_mapping->info.file.offset = 0; + } + s->current_mapping->path=buffer; + s->current_mapping->read_only = + (st.st_mode & (S_IWUSR | S_IWGRP | S_IWOTH)) == 0; + } + } + closedir(dir); + + /* fill with zeroes up to the end of the cluster */ + while(s->directory.next%(0x10*s->sectors_per_cluster)) { + direntry_t* direntry=array_get_next(&(s->directory)); + memset(direntry,0,sizeof(direntry_t)); + } + +/* TODO: if there are more entries, bootsector has to be adjusted! */ +#define ROOT_ENTRIES (0x02 * 0x10 * s->sectors_per_cluster) + if (mapping_index == 0 && s->directory.next < ROOT_ENTRIES) { + /* root directory */ + int cur = s->directory.next; + array_ensure_allocated(&(s->directory), ROOT_ENTRIES - 1); + memset(array_get(&(s->directory), cur), 0, + (ROOT_ENTRIES - cur) * sizeof(direntry_t)); + } + + /* reget the mapping, since s->mapping was possibly realloc()ed */ + mapping = (mapping_t*)array_get(&(s->mapping), mapping_index); + first_cluster += (s->directory.next - mapping->info.dir.first_dir_index) + * 0x20 / s->cluster_size; + mapping->end = first_cluster; + + direntry = (direntry_t*)array_get(&(s->directory), mapping->dir_index); + set_begin_of_direntry(direntry, mapping->begin); + + return 0; +} + +static inline uint32_t sector2cluster(BDRVVVFATState* s,off_t sector_num) +{ + return (sector_num-s->faked_sectors)/s->sectors_per_cluster; +} + +static inline off_t cluster2sector(BDRVVVFATState* s, uint32_t cluster_num) +{ + return s->faked_sectors + s->sectors_per_cluster * cluster_num; +} + +static inline uint32_t sector_offset_in_cluster(BDRVVVFATState* s,off_t sector_num) +{ + return (sector_num-s->first_sectors_number-2*s->sectors_per_fat)%s->sectors_per_cluster; +} + +#ifdef DBG +static direntry_t* get_direntry_for_mapping(BDRVVVFATState* s,mapping_t* mapping) +{ + if(mapping->mode==MODE_UNDEFINED) + return 0; + return (direntry_t*)(s->directory.pointer+sizeof(direntry_t)*mapping->dir_index); +} +#endif + +static int init_directories(BDRVVVFATState* s, + const char* dirname) +{ + bootsector_t* bootsector; + mapping_t* mapping; + unsigned int i; + unsigned int cluster; + + memset(&(s->first_sectors[0]),0,0x40*0x200); + + s->cluster_size=s->sectors_per_cluster*0x200; + s->cluster_buffer=qemu_malloc(s->cluster_size); + + /* + * The formula: sc = spf+1+spf*spc*(512*8/fat_type), + * where sc is sector_count, + * spf is sectors_per_fat, + * spc is sectors_per_clusters, and + * fat_type = 12, 16 or 32. + */ + i = 1+s->sectors_per_cluster*0x200*8/s->fat_type; + s->sectors_per_fat=(s->sector_count+i)/i; /* round up */ + + array_init(&(s->mapping),sizeof(mapping_t)); + array_init(&(s->directory),sizeof(direntry_t)); + + /* add volume label */ + { + direntry_t* entry=array_get_next(&(s->directory)); + entry->attributes=0x28; /* archive | volume label */ + snprintf((char*)entry->name,11,"QEMU VVFAT"); + } + + /* Now build FAT, and write back information into directory */ + init_fat(s); + + s->faked_sectors=s->first_sectors_number+s->sectors_per_fat*2; + s->cluster_count=sector2cluster(s, s->sector_count); + + mapping = array_get_next(&(s->mapping)); + mapping->begin = 0; + mapping->dir_index = 0; + mapping->info.dir.parent_mapping_index = -1; + mapping->first_mapping_index = -1; + mapping->path = strdup(dirname); + i = strlen(mapping->path); + if (i > 0 && mapping->path[i - 1] == '/') + mapping->path[i - 1] = '\0'; + mapping->mode = MODE_DIRECTORY; + mapping->read_only = 0; + s->path = mapping->path; + + for (i = 0, cluster = 0; i < s->mapping.next; i++) { + /* MS-DOS expects the FAT to be 0 for the root directory + * (except for the media byte). */ + /* LATER TODO: still true for FAT32? */ + int fix_fat = (i != 0); + mapping = array_get(&(s->mapping), i); + + if (mapping->mode & MODE_DIRECTORY) { + mapping->begin = cluster; + if(read_directory(s, i)) { + fprintf(stderr, "Could not read directory %s\n", + mapping->path); + return -1; + } + mapping = array_get(&(s->mapping), i); + } else { + assert(mapping->mode == MODE_UNDEFINED); + mapping->mode=MODE_NORMAL; + mapping->begin = cluster; + if (mapping->end > 0) { + direntry_t* direntry = array_get(&(s->directory), + mapping->dir_index); + + mapping->end = cluster + 1 + (mapping->end-1)/s->cluster_size; + set_begin_of_direntry(direntry, mapping->begin); + } else { + mapping->end = cluster + 1; + fix_fat = 0; + } + } + + assert(mapping->begin < mapping->end); + + /* next free cluster */ + cluster = mapping->end; + + if(cluster > s->cluster_count) { + fprintf(stderr,"Directory does not fit in FAT%d (capacity %s)\n", + s->fat_type, + s->fat_type == 12 ? s->sector_count == 2880 ? "1.44 MB" + : "2.88 MB" + : "504MB"); + return -EINVAL; + } + + /* fix fat for entry */ + if (fix_fat) { + int j; + for(j = mapping->begin; j < mapping->end - 1; j++) + fat_set(s, j, j+1); + fat_set(s, mapping->end - 1, s->max_fat_value); + } + } + + mapping = array_get(&(s->mapping), 0); + s->sectors_of_root_directory = mapping->end * s->sectors_per_cluster; + s->last_cluster_of_root_directory = mapping->end; + + /* the FAT signature */ + fat_set(s,0,s->max_fat_value); + fat_set(s,1,s->max_fat_value); + + s->current_mapping = NULL; + + bootsector=(bootsector_t*)(s->first_sectors+(s->first_sectors_number-1)*0x200); + bootsector->jump[0]=0xeb; + bootsector->jump[1]=0x3e; + bootsector->jump[2]=0x90; + memcpy(bootsector->name,"QEMU ",8); + bootsector->sector_size=cpu_to_le16(0x200); + bootsector->sectors_per_cluster=s->sectors_per_cluster; + bootsector->reserved_sectors=cpu_to_le16(1); + bootsector->number_of_fats=0x2; /* number of FATs */ + bootsector->root_entries=cpu_to_le16(s->sectors_of_root_directory*0x10); + bootsector->total_sectors16=s->sector_count>0xffff?0:cpu_to_le16(s->sector_count); + bootsector->media_type=(s->fat_type!=12?0xf8:s->sector_count==5760?0xf9:0xf8); /* media descriptor */ + s->fat.pointer[0] = bootsector->media_type; + bootsector->sectors_per_fat=cpu_to_le16(s->sectors_per_fat); + bootsector->sectors_per_track=cpu_to_le16(s->bs->secs); + bootsector->number_of_heads=cpu_to_le16(s->bs->heads); + bootsector->hidden_sectors=cpu_to_le32(s->first_sectors_number==1?0:0x3f); + bootsector->total_sectors=cpu_to_le32(s->sector_count>0xffff?s->sector_count:0); + + /* LATER TODO: if FAT32, this is wrong */ + bootsector->u.fat16.drive_number=s->fat_type==12?0:0x80; /* assume this is hda (TODO) */ + bootsector->u.fat16.current_head=0; + bootsector->u.fat16.signature=0x29; + bootsector->u.fat16.id=cpu_to_le32(0xfabe1afd); + + memcpy(bootsector->u.fat16.volume_label,"QEMU VVFAT ",11); + memcpy(bootsector->fat_type,(s->fat_type==12?"FAT12 ":s->fat_type==16?"FAT16 ":"FAT32 "),8); + bootsector->magic[0]=0x55; bootsector->magic[1]=0xaa; + + return 0; +} + +#ifdef DEBUG +static BDRVVVFATState *vvv = NULL; +#endif + +static int enable_write_target(BDRVVVFATState *s); +static int is_consistent(BDRVVVFATState *s); + +static int vvfat_open(BlockDriverState *bs, const char* dirname, int flags) +{ + BDRVVVFATState *s = bs->opaque; + int floppy = 0; + int i; + +#ifdef DEBUG + vvv = s; +#endif + +DLOG(if (stderr == NULL) { + stderr = fopen("vvfat.log", "a"); + setbuf(stderr, NULL); +}) + + s->bs = bs; + + s->fat_type=16; + /* LATER TODO: if FAT32, adjust */ + s->sectors_per_cluster=0x10; + /* 504MB disk*/ + bs->cyls=1024; bs->heads=16; bs->secs=63; + + s->current_cluster=0xffffffff; + + s->first_sectors_number=0x40; + /* read only is the default for safety */ + bs->read_only = 1; + s->qcow = s->write_target = NULL; + s->qcow_filename = NULL; + s->fat2 = NULL; + s->downcase_short_names = 1; + + if (!strstart(dirname, "fat:", NULL)) + return -1; + + if (strstr(dirname, ":floppy:")) { + floppy = 1; + s->fat_type = 12; + s->first_sectors_number = 1; + s->sectors_per_cluster=2; + bs->cyls = 80; bs->heads = 2; bs->secs = 36; + } + + s->sector_count=bs->cyls*bs->heads*bs->secs; + + if (strstr(dirname, ":32:")) { + fprintf(stderr, "Big fat greek warning: FAT32 has not been tested. You are welcome to do so!\n"); + s->fat_type = 32; + } else if (strstr(dirname, ":16:")) { + s->fat_type = 16; + } else if (strstr(dirname, ":12:")) { + s->fat_type = 12; + s->sector_count=2880; + } + + if (strstr(dirname, ":rw:")) { + if (enable_write_target(s)) + return -1; + bs->read_only = 0; + } + + i = strrchr(dirname, ':') - dirname; + assert(i >= 3); + if (dirname[i-2] == ':' && qemu_isalpha(dirname[i-1])) + /* workaround for DOS drive names */ + dirname += i-1; + else + dirname += i+1; + + bs->total_sectors=bs->cyls*bs->heads*bs->secs; + + if(init_directories(s, dirname)) + return -1; + + s->sector_count = s->faked_sectors + s->sectors_per_cluster*s->cluster_count; + + if(s->first_sectors_number==0x40) + init_mbr(s); + + /* for some reason or other, MS-DOS does not like to know about CHS... */ + if (floppy) + bs->heads = bs->cyls = bs->secs = 0; + + // assert(is_consistent(s)); + return 0; +} + +static inline void vvfat_close_current_file(BDRVVVFATState *s) +{ + if(s->current_mapping) { + s->current_mapping = NULL; + if (s->current_fd) { + close(s->current_fd); + s->current_fd = 0; + } + } + s->current_cluster = -1; +} + +/* mappings between index1 and index2-1 are supposed to be ordered + * return value is the index of the last mapping for which end>cluster_num + */ +static inline int find_mapping_for_cluster_aux(BDRVVVFATState* s,int cluster_num,int index1,int index2) +{ + int index3=index1+1; + while(1) { + mapping_t* mapping; + index3=(index1+index2)/2; + mapping=array_get(&(s->mapping),index3); + assert(mapping->begin < mapping->end); + if(mapping->begin>=cluster_num) { + assert(index2!=index3 || index2==0); + if(index2==index3) + return index1; + index2=index3; + } else { + if(index1==index3) + return mapping->end<=cluster_num ? index2 : index1; + index1=index3; + } + assert(index1<=index2); + DLOG(mapping=array_get(&(s->mapping),index1); + assert(mapping->begin<=cluster_num); + assert(index2 >= s->mapping.next || + ((mapping = array_get(&(s->mapping),index2)) && + mapping->end>cluster_num))); + } +} + +static inline mapping_t* find_mapping_for_cluster(BDRVVVFATState* s,int cluster_num) +{ + int index=find_mapping_for_cluster_aux(s,cluster_num,0,s->mapping.next); + mapping_t* mapping; + if(index>=s->mapping.next) + return NULL; + mapping=array_get(&(s->mapping),index); + if(mapping->begin>cluster_num) + return NULL; + assert(mapping->begin<=cluster_num && mapping->end>cluster_num); + return mapping; +} + +/* + * This function simply compares path == mapping->path. Since the mappings + * are sorted by cluster, this is expensive: O(n). + */ +static inline mapping_t* find_mapping_for_path(BDRVVVFATState* s, + const char* path) +{ + int i; + + for (i = 0; i < s->mapping.next; i++) { + mapping_t* mapping = array_get(&(s->mapping), i); + if (mapping->first_mapping_index < 0 && + !strcmp(path, mapping->path)) + return mapping; + } + + return NULL; +} + +static int open_file(BDRVVVFATState* s,mapping_t* mapping) +{ + if(!mapping) + return -1; + if(!s->current_mapping || + strcmp(s->current_mapping->path,mapping->path)) { + /* open file */ + int fd = open(mapping->path, O_RDONLY | O_BINARY | O_LARGEFILE); + if(fd<0) + return -1; + vvfat_close_current_file(s); + s->current_fd = fd; + s->current_mapping = mapping; + } + return 0; +} + +static inline int read_cluster(BDRVVVFATState *s,int cluster_num) +{ + if(s->current_cluster != cluster_num) { + int result=0; + off_t offset; + assert(!s->current_mapping || s->current_fd || (s->current_mapping->mode & MODE_DIRECTORY)); + if(!s->current_mapping + || s->current_mapping->begin>cluster_num + || s->current_mapping->end<=cluster_num) { + /* binary search of mappings for file */ + mapping_t* mapping=find_mapping_for_cluster(s,cluster_num); + + assert(!mapping || (cluster_num>=mapping->begin && cluster_num<mapping->end)); + + if (mapping && mapping->mode & MODE_DIRECTORY) { + vvfat_close_current_file(s); + s->current_mapping = mapping; +read_cluster_directory: + offset = s->cluster_size*(cluster_num-s->current_mapping->begin); + s->cluster = (unsigned char*)s->directory.pointer+offset + + 0x20*s->current_mapping->info.dir.first_dir_index; + assert(((s->cluster-(unsigned char*)s->directory.pointer)%s->cluster_size)==0); + assert((char*)s->cluster+s->cluster_size <= s->directory.pointer+s->directory.next*s->directory.item_size); + s->current_cluster = cluster_num; + return 0; + } + + if(open_file(s,mapping)) + return -2; + } else if (s->current_mapping->mode & MODE_DIRECTORY) + goto read_cluster_directory; + + assert(s->current_fd); + + offset=s->cluster_size*(cluster_num-s->current_mapping->begin)+s->current_mapping->info.file.offset; + if(lseek(s->current_fd, offset, SEEK_SET)!=offset) + return -3; + s->cluster=s->cluster_buffer; + result=read(s->current_fd,s->cluster,s->cluster_size); + if(result<0) { + s->current_cluster = -1; + return -1; + } + s->current_cluster = cluster_num; + } + return 0; +} + +#ifdef DEBUG +static void hexdump(const void* address, uint32_t len) +{ + const unsigned char* p = address; + int i, j; + + for (i = 0; i < len; i += 16) { + for (j = 0; j < 16 && i + j < len; j++) + fprintf(stderr, "%02x ", p[i + j]); + for (; j < 16; j++) + fprintf(stderr, " "); + fprintf(stderr, " "); + for (j = 0; j < 16 && i + j < len; j++) + fprintf(stderr, "%c", (p[i + j] < ' ' || p[i + j] > 0x7f) ? '.' : p[i + j]); + fprintf(stderr, "\n"); + } +} + +static void print_direntry(const direntry_t* direntry) +{ + int j = 0; + char buffer[1024]; + + fprintf(stderr, "direntry 0x%x: ", (int)direntry); + if(!direntry) + return; + if(is_long_name(direntry)) { + unsigned char* c=(unsigned char*)direntry; + int i; + for(i=1;i<11 && c[i] && c[i]!=0xff;i+=2) +#define ADD_CHAR(c) {buffer[j] = (c); if (buffer[j] < ' ') buffer[j] = 0xb0; j++;} + ADD_CHAR(c[i]); + for(i=14;i<26 && c[i] && c[i]!=0xff;i+=2) + ADD_CHAR(c[i]); + for(i=28;i<32 && c[i] && c[i]!=0xff;i+=2) + ADD_CHAR(c[i]); + buffer[j] = 0; + fprintf(stderr, "%s\n", buffer); + } else { + int i; + for(i=0;i<11;i++) + ADD_CHAR(direntry->name[i]); + buffer[j] = 0; + fprintf(stderr,"%s attributes=0x%02x begin=%d size=%d\n", + buffer, + direntry->attributes, + begin_of_direntry(direntry),le32_to_cpu(direntry->size)); + } +} + +static void print_mapping(const mapping_t* mapping) +{ + fprintf(stderr, "mapping (0x%x): begin, end = %d, %d, dir_index = %d, first_mapping_index = %d, name = %s, mode = 0x%x, " , (int)mapping, mapping->begin, mapping->end, mapping->dir_index, mapping->first_mapping_index, mapping->path, mapping->mode); + if (mapping->mode & MODE_DIRECTORY) + fprintf(stderr, "parent_mapping_index = %d, first_dir_index = %d\n", mapping->info.dir.parent_mapping_index, mapping->info.dir.first_dir_index); + else + fprintf(stderr, "offset = %d\n", mapping->info.file.offset); +} +#endif + +static int vvfat_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVVVFATState *s = bs->opaque; + int i; + + for(i=0;i<nb_sectors;i++,sector_num++) { + if (sector_num >= s->sector_count) + return -1; + if (s->qcow) { + int n; + if (s->qcow->drv->bdrv_is_allocated(s->qcow, + sector_num, nb_sectors-i, &n)) { +DLOG(fprintf(stderr, "sectors %d+%d allocated\n", (int)sector_num, n)); + if (s->qcow->drv->bdrv_read(s->qcow, sector_num, buf+i*0x200, n)) + return -1; + i += n - 1; + sector_num += n - 1; + continue; + } +DLOG(fprintf(stderr, "sector %d not allocated\n", (int)sector_num)); + } + if(sector_num<s->faked_sectors) { + if(sector_num<s->first_sectors_number) + memcpy(buf+i*0x200,&(s->first_sectors[sector_num*0x200]),0x200); + else if(sector_num-s->first_sectors_number<s->sectors_per_fat) + memcpy(buf+i*0x200,&(s->fat.pointer[(sector_num-s->first_sectors_number)*0x200]),0x200); + else if(sector_num-s->first_sectors_number-s->sectors_per_fat<s->sectors_per_fat) + memcpy(buf+i*0x200,&(s->fat.pointer[(sector_num-s->first_sectors_number-s->sectors_per_fat)*0x200]),0x200); + } else { + uint32_t sector=sector_num-s->faked_sectors, + sector_offset_in_cluster=(sector%s->sectors_per_cluster), + cluster_num=sector/s->sectors_per_cluster; + if(read_cluster(s, cluster_num) != 0) { + /* LATER TODO: strict: return -1; */ + memset(buf+i*0x200,0,0x200); + continue; + } + memcpy(buf+i*0x200,s->cluster+sector_offset_in_cluster*0x200,0x200); + } + } + return 0; +} + +/* LATER TODO: statify all functions */ + +/* + * Idea of the write support (use snapshot): + * + * 1. check if all data is consistent, recording renames, modifications, + * new files and directories (in s->commits). + * + * 2. if the data is not consistent, stop committing + * + * 3. handle renames, and create new files and directories (do not yet + * write their contents) + * + * 4. walk the directories, fixing the mapping and direntries, and marking + * the handled mappings as not deleted + * + * 5. commit the contents of the files + * + * 6. handle deleted files and directories + * + */ + +typedef struct commit_t { + char* path; + union { + struct { uint32_t cluster; } rename; + struct { int dir_index; uint32_t modified_offset; } writeout; + struct { uint32_t first_cluster; } new_file; + struct { uint32_t cluster; } mkdir; + } param; + /* DELETEs and RMDIRs are handled differently: see handle_deletes() */ + enum { + ACTION_RENAME, ACTION_WRITEOUT, ACTION_NEW_FILE, ACTION_MKDIR + } action; +} commit_t; + +static void clear_commits(BDRVVVFATState* s) +{ + int i; +DLOG(fprintf(stderr, "clear_commits (%d commits)\n", s->commits.next)); + for (i = 0; i < s->commits.next; i++) { + commit_t* commit = array_get(&(s->commits), i); + assert(commit->path || commit->action == ACTION_WRITEOUT); + if (commit->action != ACTION_WRITEOUT) { + assert(commit->path); + free(commit->path); + } else + assert(commit->path == NULL); + } + s->commits.next = 0; +} + +static void schedule_rename(BDRVVVFATState* s, + uint32_t cluster, char* new_path) +{ + commit_t* commit = array_get_next(&(s->commits)); + commit->path = new_path; + commit->param.rename.cluster = cluster; + commit->action = ACTION_RENAME; +} + +static void schedule_writeout(BDRVVVFATState* s, + int dir_index, uint32_t modified_offset) +{ + commit_t* commit = array_get_next(&(s->commits)); + commit->path = NULL; + commit->param.writeout.dir_index = dir_index; + commit->param.writeout.modified_offset = modified_offset; + commit->action = ACTION_WRITEOUT; +} + +static void schedule_new_file(BDRVVVFATState* s, + char* path, uint32_t first_cluster) +{ + commit_t* commit = array_get_next(&(s->commits)); + commit->path = path; + commit->param.new_file.first_cluster = first_cluster; + commit->action = ACTION_NEW_FILE; +} + +static void schedule_mkdir(BDRVVVFATState* s, uint32_t cluster, char* path) +{ + commit_t* commit = array_get_next(&(s->commits)); + commit->path = path; + commit->param.mkdir.cluster = cluster; + commit->action = ACTION_MKDIR; +} + +typedef struct { + /* + * Since the sequence number is at most 0x3f, and the filename + * length is at most 13 times the sequence number, the maximal + * filename length is 0x3f * 13 bytes. + */ + unsigned char name[0x3f * 13 + 1]; + int checksum, len; + int sequence_number; +} long_file_name; + +static void lfn_init(long_file_name* lfn) +{ + lfn->sequence_number = lfn->len = 0; + lfn->checksum = 0x100; +} + +/* return 0 if parsed successfully, > 0 if no long name, < 0 if error */ +static int parse_long_name(long_file_name* lfn, + const direntry_t* direntry) +{ + int i, j, offset; + const unsigned char* pointer = (const unsigned char*)direntry; + + if (!is_long_name(direntry)) + return 1; + + if (pointer[0] & 0x40) { + lfn->sequence_number = pointer[0] & 0x3f; + lfn->checksum = pointer[13]; + lfn->name[0] = 0; + lfn->name[lfn->sequence_number * 13] = 0; + } else if ((pointer[0] & 0x3f) != --lfn->sequence_number) + return -1; + else if (pointer[13] != lfn->checksum) + return -2; + else if (pointer[12] || pointer[26] || pointer[27]) + return -3; + + offset = 13 * (lfn->sequence_number - 1); + for (i = 0, j = 1; i < 13; i++, j+=2) { + if (j == 11) + j = 14; + else if (j == 26) + j = 28; + + if (pointer[j+1] == 0) + lfn->name[offset + i] = pointer[j]; + else if (pointer[j+1] != 0xff || (pointer[0] & 0x40) == 0) + return -4; + else + lfn->name[offset + i] = 0; + } + + if (pointer[0] & 0x40) + lfn->len = offset + strlen((char*)lfn->name + offset); + + return 0; +} + +/* returns 0 if successful, >0 if no short_name, and <0 on error */ +static int parse_short_name(BDRVVVFATState* s, + long_file_name* lfn, direntry_t* direntry) +{ + int i, j; + + if (!is_short_name(direntry)) + return 1; + + for (j = 7; j >= 0 && direntry->name[j] == ' '; j--); + for (i = 0; i <= j; i++) { + if (direntry->name[i] <= ' ' || direntry->name[i] > 0x7f) + return -1; + else if (s->downcase_short_names) + lfn->name[i] = qemu_tolower(direntry->name[i]); + else + lfn->name[i] = direntry->name[i]; + } + + for (j = 2; j >= 0 && direntry->extension[j] == ' '; j--); + if (j >= 0) { + lfn->name[i++] = '.'; + lfn->name[i + j + 1] = '\0'; + for (;j >= 0; j--) { + if (direntry->extension[j] <= ' ' || direntry->extension[j] > 0x7f) + return -2; + else if (s->downcase_short_names) + lfn->name[i + j] = qemu_tolower(direntry->extension[j]); + else + lfn->name[i + j] = direntry->extension[j]; + } + } else + lfn->name[i + j + 1] = '\0'; + + lfn->len = strlen((char*)lfn->name); + + return 0; +} + +static inline uint32_t modified_fat_get(BDRVVVFATState* s, + unsigned int cluster) +{ + if (cluster < s->last_cluster_of_root_directory) { + if (cluster + 1 == s->last_cluster_of_root_directory) + return s->max_fat_value; + else + return cluster + 1; + } + + if (s->fat_type==32) { + uint32_t* entry=((uint32_t*)s->fat2)+cluster; + return le32_to_cpu(*entry); + } else if (s->fat_type==16) { + uint16_t* entry=((uint16_t*)s->fat2)+cluster; + return le16_to_cpu(*entry); + } else { + const uint8_t* x=s->fat2+cluster*3/2; + return ((x[0]|(x[1]<<8))>>(cluster&1?4:0))&0x0fff; + } +} + +static inline int cluster_was_modified(BDRVVVFATState* s, uint32_t cluster_num) +{ + int was_modified = 0; + int i, dummy; + + if (s->qcow == NULL) + return 0; + + for (i = 0; !was_modified && i < s->sectors_per_cluster; i++) + was_modified = s->qcow->drv->bdrv_is_allocated(s->qcow, + cluster2sector(s, cluster_num) + i, 1, &dummy); + + return was_modified; +} + +static const char* get_basename(const char* path) +{ + char* basename = strrchr(path, '/'); + if (basename == NULL) + return path; + else + return basename + 1; /* strip '/' */ +} + +/* + * The array s->used_clusters holds the states of the clusters. If it is + * part of a file, it has bit 2 set, in case of a directory, bit 1. If it + * was modified, bit 3 is set. + * If any cluster is allocated, but not part of a file or directory, this + * driver refuses to commit. + */ +typedef enum { + USED_DIRECTORY = 1, USED_FILE = 2, USED_ANY = 3, USED_ALLOCATED = 4 +} used_t; + +/* + * get_cluster_count_for_direntry() not only determines how many clusters + * are occupied by direntry, but also if it was renamed or modified. + * + * A file is thought to be renamed *only* if there already was a file with + * exactly the same first cluster, but a different name. + * + * Further, the files/directories handled by this function are + * assumed to be *not* deleted (and *only* those). + */ +static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s, + direntry_t* direntry, const char* path) +{ + /* + * This is a little bit tricky: + * IF the guest OS just inserts a cluster into the file chain, + * and leaves the rest alone, (i.e. the original file had clusters + * 15 -> 16, but now has 15 -> 32 -> 16), then the following happens: + * + * - do_commit will write the cluster into the file at the given + * offset, but + * + * - the cluster which is overwritten should be moved to a later + * position in the file. + * + * I am not aware that any OS does something as braindead, but this + * situation could happen anyway when not committing for a long time. + * Just to be sure that this does not bite us, detect it, and copy the + * contents of the clusters to-be-overwritten into the qcow. + */ + int copy_it = 0; + int was_modified = 0; + int32_t ret = 0; + + uint32_t cluster_num = begin_of_direntry(direntry); + uint32_t offset = 0; + int first_mapping_index = -1; + mapping_t* mapping = NULL; + const char* basename2 = NULL; + + vvfat_close_current_file(s); + + /* the root directory */ + if (cluster_num == 0) + return 0; + + /* write support */ + if (s->qcow) { + basename2 = get_basename(path); + + mapping = find_mapping_for_cluster(s, cluster_num); + + if (mapping) { + const char* basename; + + assert(mapping->mode & MODE_DELETED); + mapping->mode &= ~MODE_DELETED; + + basename = get_basename(mapping->path); + + assert(mapping->mode & MODE_NORMAL); + + /* rename */ + if (strcmp(basename, basename2)) + schedule_rename(s, cluster_num, strdup(path)); + } else if (is_file(direntry)) + /* new file */ + schedule_new_file(s, strdup(path), cluster_num); + else { + assert(0); + return 0; + } + } + + while(1) { + if (s->qcow) { + if (!copy_it && cluster_was_modified(s, cluster_num)) { + if (mapping == NULL || + mapping->begin > cluster_num || + mapping->end <= cluster_num) + mapping = find_mapping_for_cluster(s, cluster_num); + + + if (mapping && + (mapping->mode & MODE_DIRECTORY) == 0) { + + /* was modified in qcow */ + if (offset != mapping->info.file.offset + s->cluster_size + * (cluster_num - mapping->begin)) { + /* offset of this cluster in file chain has changed */ + assert(0); + copy_it = 1; + } else if (offset == 0) { + const char* basename = get_basename(mapping->path); + + if (strcmp(basename, basename2)) + copy_it = 1; + first_mapping_index = array_index(&(s->mapping), mapping); + } + + if (mapping->first_mapping_index != first_mapping_index + && mapping->info.file.offset > 0) { + assert(0); + copy_it = 1; + } + + /* need to write out? */ + if (!was_modified && is_file(direntry)) { + was_modified = 1; + schedule_writeout(s, mapping->dir_index, offset); + } + } + } + + if (copy_it) { + int i, dummy; + /* + * This is horribly inefficient, but that is okay, since + * it is rarely executed, if at all. + */ + int64_t offset = cluster2sector(s, cluster_num); + + vvfat_close_current_file(s); + for (i = 0; i < s->sectors_per_cluster; i++) + if (!s->qcow->drv->bdrv_is_allocated(s->qcow, + offset + i, 1, &dummy)) { + if (vvfat_read(s->bs, + offset, s->cluster_buffer, 1)) + return -1; + if (s->qcow->drv->bdrv_write(s->qcow, + offset, s->cluster_buffer, 1)) + return -2; + } + } + } + + ret++; + if (s->used_clusters[cluster_num] & USED_ANY) + return 0; + s->used_clusters[cluster_num] = USED_FILE; + + cluster_num = modified_fat_get(s, cluster_num); + + if (fat_eof(s, cluster_num)) + return ret; + else if (cluster_num < 2 || cluster_num > s->max_fat_value - 16) + return -1; + + offset += s->cluster_size; + } +} + +/* + * This function looks at the modified data (qcow). + * It returns 0 upon inconsistency or error, and the number of clusters + * used by the directory, its subdirectories and their files. + */ +static int check_directory_consistency(BDRVVVFATState *s, + int cluster_num, const char* path) +{ + int ret = 0; + unsigned char* cluster = qemu_malloc(s->cluster_size); + direntry_t* direntries = (direntry_t*)cluster; + mapping_t* mapping = find_mapping_for_cluster(s, cluster_num); + + long_file_name lfn; + int path_len = strlen(path); + char path2[PATH_MAX]; + + assert(path_len < PATH_MAX); /* len was tested before! */ + pstrcpy(path2, sizeof(path2), path); + path2[path_len] = '/'; + path2[path_len + 1] = '\0'; + + if (mapping) { + const char* basename = get_basename(mapping->path); + const char* basename2 = get_basename(path); + + assert(mapping->mode & MODE_DIRECTORY); + + assert(mapping->mode & MODE_DELETED); + mapping->mode &= ~MODE_DELETED; + + if (strcmp(basename, basename2)) + schedule_rename(s, cluster_num, strdup(path)); + } else + /* new directory */ + schedule_mkdir(s, cluster_num, strdup(path)); + + lfn_init(&lfn); + do { + int i; + int subret = 0; + + ret++; + + if (s->used_clusters[cluster_num] & USED_ANY) { + fprintf(stderr, "cluster %d used more than once\n", (int)cluster_num); + return 0; + } + s->used_clusters[cluster_num] = USED_DIRECTORY; + +DLOG(fprintf(stderr, "read cluster %d (sector %d)\n", (int)cluster_num, (int)cluster2sector(s, cluster_num))); + subret = vvfat_read(s->bs, cluster2sector(s, cluster_num), cluster, + s->sectors_per_cluster); + if (subret) { + fprintf(stderr, "Error fetching direntries\n"); + fail: + free(cluster); + return 0; + } + + for (i = 0; i < 0x10 * s->sectors_per_cluster; i++) { + int cluster_count = 0; + +DLOG(fprintf(stderr, "check direntry %d: \n", i); print_direntry(direntries + i)); + if (is_volume_label(direntries + i) || is_dot(direntries + i) || + is_free(direntries + i)) + continue; + + subret = parse_long_name(&lfn, direntries + i); + if (subret < 0) { + fprintf(stderr, "Error in long name\n"); + goto fail; + } + if (subret == 0 || is_free(direntries + i)) + continue; + + if (fat_chksum(direntries+i) != lfn.checksum) { + subret = parse_short_name(s, &lfn, direntries + i); + if (subret < 0) { + fprintf(stderr, "Error in short name (%d)\n", subret); + goto fail; + } + if (subret > 0 || !strcmp((char*)lfn.name, ".") + || !strcmp((char*)lfn.name, "..")) + continue; + } + lfn.checksum = 0x100; /* cannot use long name twice */ + + if (path_len + 1 + lfn.len >= PATH_MAX) { + fprintf(stderr, "Name too long: %s/%s\n", path, lfn.name); + goto fail; + } + pstrcpy(path2 + path_len + 1, sizeof(path2) - path_len - 1, + (char*)lfn.name); + + if (is_directory(direntries + i)) { + if (begin_of_direntry(direntries + i) == 0) { + DLOG(fprintf(stderr, "invalid begin for directory: %s\n", path2); print_direntry(direntries + i)); + goto fail; + } + cluster_count = check_directory_consistency(s, + begin_of_direntry(direntries + i), path2); + if (cluster_count == 0) { + DLOG(fprintf(stderr, "problem in directory %s:\n", path2); print_direntry(direntries + i)); + goto fail; + } + } else if (is_file(direntries + i)) { + /* check file size with FAT */ + cluster_count = get_cluster_count_for_direntry(s, direntries + i, path2); + if (cluster_count != + (le32_to_cpu(direntries[i].size) + s->cluster_size + - 1) / s->cluster_size) { + DLOG(fprintf(stderr, "Cluster count mismatch\n")); + goto fail; + } + } else + assert(0); /* cluster_count = 0; */ + + ret += cluster_count; + } + + cluster_num = modified_fat_get(s, cluster_num); + } while(!fat_eof(s, cluster_num)); + + free(cluster); + return ret; +} + +/* returns 1 on success */ +static int is_consistent(BDRVVVFATState* s) +{ + int i, check; + int used_clusters_count = 0; + +DLOG(checkpoint()); + /* + * - get modified FAT + * - compare the two FATs (TODO) + * - get buffer for marking used clusters + * - recurse direntries from root (using bs->bdrv_read to make + * sure to get the new data) + * - check that the FAT agrees with the size + * - count the number of clusters occupied by this directory and + * its files + * - check that the cumulative used cluster count agrees with the + * FAT + * - if all is fine, return number of used clusters + */ + if (s->fat2 == NULL) { + int size = 0x200 * s->sectors_per_fat; + s->fat2 = qemu_malloc(size); + memcpy(s->fat2, s->fat.pointer, size); + } + check = vvfat_read(s->bs, + s->first_sectors_number, s->fat2, s->sectors_per_fat); + if (check) { + fprintf(stderr, "Could not copy fat\n"); + return 0; + } + assert (s->used_clusters); + for (i = 0; i < sector2cluster(s, s->sector_count); i++) + s->used_clusters[i] &= ~USED_ANY; + + clear_commits(s); + + /* mark every mapped file/directory as deleted. + * (check_directory_consistency() will unmark those still present). */ + if (s->qcow) + for (i = 0; i < s->mapping.next; i++) { + mapping_t* mapping = array_get(&(s->mapping), i); + if (mapping->first_mapping_index < 0) + mapping->mode |= MODE_DELETED; + } + + used_clusters_count = check_directory_consistency(s, 0, s->path); + if (used_clusters_count <= 0) { + DLOG(fprintf(stderr, "problem in directory\n")); + return 0; + } + + check = s->last_cluster_of_root_directory; + for (i = check; i < sector2cluster(s, s->sector_count); i++) { + if (modified_fat_get(s, i)) { + if(!s->used_clusters[i]) { + DLOG(fprintf(stderr, "FAT was modified (%d), but cluster is not used?\n", i)); + return 0; + } + check++; + } + + if (s->used_clusters[i] == USED_ALLOCATED) { + /* allocated, but not used... */ + DLOG(fprintf(stderr, "unused, modified cluster: %d\n", i)); + return 0; + } + } + + if (check != used_clusters_count) + return 0; + + return used_clusters_count; +} + +static inline void adjust_mapping_indices(BDRVVVFATState* s, + int offset, int adjust) +{ + int i; + + for (i = 0; i < s->mapping.next; i++) { + mapping_t* mapping = array_get(&(s->mapping), i); + +#define ADJUST_MAPPING_INDEX(name) \ + if (mapping->name >= offset) \ + mapping->name += adjust + + ADJUST_MAPPING_INDEX(first_mapping_index); + if (mapping->mode & MODE_DIRECTORY) + ADJUST_MAPPING_INDEX(info.dir.parent_mapping_index); + } +} + +/* insert or update mapping */ +static mapping_t* insert_mapping(BDRVVVFATState* s, + uint32_t begin, uint32_t end) +{ + /* + * - find mapping where mapping->begin >= begin, + * - if mapping->begin > begin: insert + * - adjust all references to mappings! + * - else: adjust + * - replace name + */ + int index = find_mapping_for_cluster_aux(s, begin, 0, s->mapping.next); + mapping_t* mapping = NULL; + mapping_t* first_mapping = array_get(&(s->mapping), 0); + + if (index < s->mapping.next && (mapping = array_get(&(s->mapping), index)) + && mapping->begin < begin) { + mapping->end = begin; + index++; + mapping = array_get(&(s->mapping), index); + } + if (index >= s->mapping.next || mapping->begin > begin) { + mapping = array_insert(&(s->mapping), index, 1); + mapping->path = NULL; + adjust_mapping_indices(s, index, +1); + } + + mapping->begin = begin; + mapping->end = end; + +DLOG(mapping_t* next_mapping; +assert(index + 1 >= s->mapping.next || +((next_mapping = array_get(&(s->mapping), index + 1)) && + next_mapping->begin >= end))); + + if (s->current_mapping && first_mapping != (mapping_t*)s->mapping.pointer) + s->current_mapping = array_get(&(s->mapping), + s->current_mapping - first_mapping); + + return mapping; +} + +static int remove_mapping(BDRVVVFATState* s, int mapping_index) +{ + mapping_t* mapping = array_get(&(s->mapping), mapping_index); + mapping_t* first_mapping = array_get(&(s->mapping), 0); + + /* free mapping */ + if (mapping->first_mapping_index < 0) + free(mapping->path); + + /* remove from s->mapping */ + array_remove(&(s->mapping), mapping_index); + + /* adjust all references to mappings */ + adjust_mapping_indices(s, mapping_index, -1); + + if (s->current_mapping && first_mapping != (mapping_t*)s->mapping.pointer) + s->current_mapping = array_get(&(s->mapping), + s->current_mapping - first_mapping); + + return 0; +} + +static void adjust_dirindices(BDRVVVFATState* s, int offset, int adjust) +{ + int i; + for (i = 0; i < s->mapping.next; i++) { + mapping_t* mapping = array_get(&(s->mapping), i); + if (mapping->dir_index >= offset) + mapping->dir_index += adjust; + if ((mapping->mode & MODE_DIRECTORY) && + mapping->info.dir.first_dir_index >= offset) + mapping->info.dir.first_dir_index += adjust; + } +} + +static direntry_t* insert_direntries(BDRVVVFATState* s, + int dir_index, int count) +{ + /* + * make room in s->directory, + * adjust_dirindices + */ + direntry_t* result = array_insert(&(s->directory), dir_index, count); + if (result == NULL) + return NULL; + adjust_dirindices(s, dir_index, count); + return result; +} + +static int remove_direntries(BDRVVVFATState* s, int dir_index, int count) +{ + int ret = array_remove_slice(&(s->directory), dir_index, count); + if (ret) + return ret; + adjust_dirindices(s, dir_index, -count); + return 0; +} + +/* + * Adapt the mappings of the cluster chain starting at first cluster + * (i.e. if a file starts at first_cluster, the chain is followed according + * to the modified fat, and the corresponding entries in s->mapping are + * adjusted) + */ +static int commit_mappings(BDRVVVFATState* s, + uint32_t first_cluster, int dir_index) +{ + mapping_t* mapping = find_mapping_for_cluster(s, first_cluster); + direntry_t* direntry = array_get(&(s->directory), dir_index); + uint32_t cluster = first_cluster; + + vvfat_close_current_file(s); + + assert(mapping); + assert(mapping->begin == first_cluster); + mapping->first_mapping_index = -1; + mapping->dir_index = dir_index; + mapping->mode = (dir_index <= 0 || is_directory(direntry)) ? + MODE_DIRECTORY : MODE_NORMAL; + + while (!fat_eof(s, cluster)) { + uint32_t c, c1; + + for (c = cluster, c1 = modified_fat_get(s, c); c + 1 == c1; + c = c1, c1 = modified_fat_get(s, c1)); + + c++; + if (c > mapping->end) { + int index = array_index(&(s->mapping), mapping); + int i, max_i = s->mapping.next - index; + for (i = 1; i < max_i && mapping[i].begin < c; i++); + while (--i > 0) + remove_mapping(s, index + 1); + } + assert(mapping == array_get(&(s->mapping), s->mapping.next - 1) + || mapping[1].begin >= c); + mapping->end = c; + + if (!fat_eof(s, c1)) { + int i = find_mapping_for_cluster_aux(s, c1, 0, s->mapping.next); + mapping_t* next_mapping = i >= s->mapping.next ? NULL : + array_get(&(s->mapping), i); + + if (next_mapping == NULL || next_mapping->begin > c1) { + int i1 = array_index(&(s->mapping), mapping); + + next_mapping = insert_mapping(s, c1, c1+1); + + if (c1 < c) + i1++; + mapping = array_get(&(s->mapping), i1); + } + + next_mapping->dir_index = mapping->dir_index; + next_mapping->first_mapping_index = + mapping->first_mapping_index < 0 ? + array_index(&(s->mapping), mapping) : + mapping->first_mapping_index; + next_mapping->path = mapping->path; + next_mapping->mode = mapping->mode; + next_mapping->read_only = mapping->read_only; + if (mapping->mode & MODE_DIRECTORY) { + next_mapping->info.dir.parent_mapping_index = + mapping->info.dir.parent_mapping_index; + next_mapping->info.dir.first_dir_index = + mapping->info.dir.first_dir_index + + 0x10 * s->sectors_per_cluster * + (mapping->end - mapping->begin); + } else + next_mapping->info.file.offset = mapping->info.file.offset + + mapping->end - mapping->begin; + + mapping = next_mapping; + } + + cluster = c1; + } + + return 0; +} + +static int commit_direntries(BDRVVVFATState* s, + int dir_index, int parent_mapping_index) +{ + direntry_t* direntry = array_get(&(s->directory), dir_index); + uint32_t first_cluster = dir_index == 0 ? 0 : begin_of_direntry(direntry); + mapping_t* mapping = find_mapping_for_cluster(s, first_cluster); + + int factor = 0x10 * s->sectors_per_cluster; + int old_cluster_count, new_cluster_count; + int current_dir_index = mapping->info.dir.first_dir_index; + int first_dir_index = current_dir_index; + int ret, i; + uint32_t c; + +DLOG(fprintf(stderr, "commit_direntries for %s, parent_mapping_index %d\n", mapping->path, parent_mapping_index)); + + assert(direntry); + assert(mapping); + assert(mapping->begin == first_cluster); + assert(mapping->info.dir.first_dir_index < s->directory.next); + assert(mapping->mode & MODE_DIRECTORY); + assert(dir_index == 0 || is_directory(direntry)); + + mapping->info.dir.parent_mapping_index = parent_mapping_index; + + if (first_cluster == 0) { + old_cluster_count = new_cluster_count = + s->last_cluster_of_root_directory; + } else { + for (old_cluster_count = 0, c = first_cluster; !fat_eof(s, c); + c = fat_get(s, c)) + old_cluster_count++; + + for (new_cluster_count = 0, c = first_cluster; !fat_eof(s, c); + c = modified_fat_get(s, c)) + new_cluster_count++; + } + + if (new_cluster_count > old_cluster_count) { + if (insert_direntries(s, + current_dir_index + factor * old_cluster_count, + factor * (new_cluster_count - old_cluster_count)) == NULL) + return -1; + } else if (new_cluster_count < old_cluster_count) + remove_direntries(s, + current_dir_index + factor * new_cluster_count, + factor * (old_cluster_count - new_cluster_count)); + + for (c = first_cluster; !fat_eof(s, c); c = modified_fat_get(s, c)) { + void* direntry = array_get(&(s->directory), current_dir_index); + int ret = vvfat_read(s->bs, cluster2sector(s, c), direntry, + s->sectors_per_cluster); + if (ret) + return ret; + assert(!strncmp(s->directory.pointer, "QEMU", 4)); + current_dir_index += factor; + } + + ret = commit_mappings(s, first_cluster, dir_index); + if (ret) + return ret; + + /* recurse */ + for (i = 0; i < factor * new_cluster_count; i++) { + direntry = array_get(&(s->directory), first_dir_index + i); + if (is_directory(direntry) && !is_dot(direntry)) { + mapping = find_mapping_for_cluster(s, first_cluster); + assert(mapping->mode & MODE_DIRECTORY); + ret = commit_direntries(s, first_dir_index + i, + array_index(&(s->mapping), mapping)); + if (ret) + return ret; + } + } + + return 0; +} + +/* commit one file (adjust contents, adjust mapping), + return first_mapping_index */ +static int commit_one_file(BDRVVVFATState* s, + int dir_index, uint32_t offset) +{ + direntry_t* direntry = array_get(&(s->directory), dir_index); + uint32_t c = begin_of_direntry(direntry); + uint32_t first_cluster = c; + mapping_t* mapping = find_mapping_for_cluster(s, c); + uint32_t size = filesize_of_direntry(direntry); + char* cluster = qemu_malloc(s->cluster_size); + uint32_t i; + int fd = 0; + + assert(offset < size); + assert((offset % s->cluster_size) == 0); + + for (i = s->cluster_size; i < offset; i += s->cluster_size) + c = modified_fat_get(s, c); + + fd = open(mapping->path, O_RDWR | O_CREAT | O_BINARY, 0666); + if (fd < 0) { + fprintf(stderr, "Could not open %s... (%s, %d)\n", mapping->path, + strerror(errno), errno); + return fd; + } + if (offset > 0) + if (lseek(fd, offset, SEEK_SET) != offset) + return -3; + + while (offset < size) { + uint32_t c1; + int rest_size = (size - offset > s->cluster_size ? + s->cluster_size : size - offset); + int ret; + + c1 = modified_fat_get(s, c); + + assert((size - offset == 0 && fat_eof(s, c)) || + (size > offset && c >=2 && !fat_eof(s, c))); + + ret = vvfat_read(s->bs, cluster2sector(s, c), + (uint8_t*)cluster, (rest_size + 0x1ff) / 0x200); + + if (ret < 0) + return ret; + + if (write(fd, cluster, rest_size) < 0) + return -2; + + offset += rest_size; + c = c1; + } + + ftruncate(fd, size); + close(fd); + + return commit_mappings(s, first_cluster, dir_index); +} + +#ifdef DEBUG +/* test, if all mappings point to valid direntries */ +static void check1(BDRVVVFATState* s) +{ + int i; + for (i = 0; i < s->mapping.next; i++) { + mapping_t* mapping = array_get(&(s->mapping), i); + if (mapping->mode & MODE_DELETED) { + fprintf(stderr, "deleted\n"); + continue; + } + assert(mapping->dir_index >= 0); + assert(mapping->dir_index < s->directory.next); + direntry_t* direntry = array_get(&(s->directory), mapping->dir_index); + assert(mapping->begin == begin_of_direntry(direntry) || mapping->first_mapping_index >= 0); + if (mapping->mode & MODE_DIRECTORY) { + assert(mapping->info.dir.first_dir_index + 0x10 * s->sectors_per_cluster * (mapping->end - mapping->begin) <= s->directory.next); + assert((mapping->info.dir.first_dir_index % (0x10 * s->sectors_per_cluster)) == 0); + } + } +} + +/* test, if all direntries have mappings */ +static void check2(BDRVVVFATState* s) +{ + int i; + int first_mapping = -1; + + for (i = 0; i < s->directory.next; i++) { + direntry_t* direntry = array_get(&(s->directory), i); + + if (is_short_name(direntry) && begin_of_direntry(direntry)) { + mapping_t* mapping = find_mapping_for_cluster(s, begin_of_direntry(direntry)); + assert(mapping); + assert(mapping->dir_index == i || is_dot(direntry)); + assert(mapping->begin == begin_of_direntry(direntry) || is_dot(direntry)); + } + + if ((i % (0x10 * s->sectors_per_cluster)) == 0) { + /* cluster start */ + int j, count = 0; + + for (j = 0; j < s->mapping.next; j++) { + mapping_t* mapping = array_get(&(s->mapping), j); + if (mapping->mode & MODE_DELETED) + continue; + if (mapping->mode & MODE_DIRECTORY) { + if (mapping->info.dir.first_dir_index <= i && mapping->info.dir.first_dir_index + 0x10 * s->sectors_per_cluster > i) { + assert(++count == 1); + if (mapping->first_mapping_index == -1) + first_mapping = array_index(&(s->mapping), mapping); + else + assert(first_mapping == mapping->first_mapping_index); + if (mapping->info.dir.parent_mapping_index < 0) + assert(j == 0); + else { + mapping_t* parent = array_get(&(s->mapping), mapping->info.dir.parent_mapping_index); + assert(parent->mode & MODE_DIRECTORY); + assert(parent->info.dir.first_dir_index < mapping->info.dir.first_dir_index); + } + } + } + } + if (count == 0) + first_mapping = -1; + } + } +} +#endif + +static int handle_renames_and_mkdirs(BDRVVVFATState* s) +{ + int i; + +#ifdef DEBUG + fprintf(stderr, "handle_renames\n"); + for (i = 0; i < s->commits.next; i++) { + commit_t* commit = array_get(&(s->commits), i); + fprintf(stderr, "%d, %s (%d, %d)\n", i, commit->path ? commit->path : "(null)", commit->param.rename.cluster, commit->action); + } +#endif + + for (i = 0; i < s->commits.next;) { + commit_t* commit = array_get(&(s->commits), i); + if (commit->action == ACTION_RENAME) { + mapping_t* mapping = find_mapping_for_cluster(s, + commit->param.rename.cluster); + char* old_path = mapping->path; + + assert(commit->path); + mapping->path = commit->path; + if (rename(old_path, mapping->path)) + return -2; + + if (mapping->mode & MODE_DIRECTORY) { + int l1 = strlen(mapping->path); + int l2 = strlen(old_path); + int diff = l1 - l2; + direntry_t* direntry = array_get(&(s->directory), + mapping->info.dir.first_dir_index); + uint32_t c = mapping->begin; + int i = 0; + + /* recurse */ + while (!fat_eof(s, c)) { + do { + direntry_t* d = direntry + i; + + if (is_file(d) || (is_directory(d) && !is_dot(d))) { + mapping_t* m = find_mapping_for_cluster(s, + begin_of_direntry(d)); + int l = strlen(m->path); + char* new_path = qemu_malloc(l + diff + 1); + + assert(!strncmp(m->path, mapping->path, l2)); + + pstrcpy(new_path, l + diff + 1, mapping->path); + pstrcpy(new_path + l1, l + diff + 1 - l1, + m->path + l2); + + schedule_rename(s, m->begin, new_path); + } + i++; + } while((i % (0x10 * s->sectors_per_cluster)) != 0); + c = fat_get(s, c); + } + } + + free(old_path); + array_remove(&(s->commits), i); + continue; + } else if (commit->action == ACTION_MKDIR) { + mapping_t* mapping; + int j, parent_path_len; + +#ifdef __MINGW32__ + if (mkdir(commit->path)) + return -5; +#else + if (mkdir(commit->path, 0755)) + return -5; +#endif + + mapping = insert_mapping(s, commit->param.mkdir.cluster, + commit->param.mkdir.cluster + 1); + if (mapping == NULL) + return -6; + + mapping->mode = MODE_DIRECTORY; + mapping->read_only = 0; + mapping->path = commit->path; + j = s->directory.next; + assert(j); + insert_direntries(s, s->directory.next, + 0x10 * s->sectors_per_cluster); + mapping->info.dir.first_dir_index = j; + + parent_path_len = strlen(commit->path) + - strlen(get_basename(commit->path)) - 1; + for (j = 0; j < s->mapping.next; j++) { + mapping_t* m = array_get(&(s->mapping), j); + if (m->first_mapping_index < 0 && m != mapping && + !strncmp(m->path, mapping->path, parent_path_len) && + strlen(m->path) == parent_path_len) + break; + } + assert(j < s->mapping.next); + mapping->info.dir.parent_mapping_index = j; + + array_remove(&(s->commits), i); + continue; + } + + i++; + } + return 0; +} + +/* + * TODO: make sure that the short name is not matching *another* file + */ +static int handle_commits(BDRVVVFATState* s) +{ + int i, fail = 0; + + vvfat_close_current_file(s); + + for (i = 0; !fail && i < s->commits.next; i++) { + commit_t* commit = array_get(&(s->commits), i); + switch(commit->action) { + case ACTION_RENAME: case ACTION_MKDIR: + assert(0); + fail = -2; + break; + case ACTION_WRITEOUT: { + direntry_t* entry = array_get(&(s->directory), + commit->param.writeout.dir_index); + uint32_t begin = begin_of_direntry(entry); + mapping_t* mapping = find_mapping_for_cluster(s, begin); + + assert(mapping); + assert(mapping->begin == begin); + assert(commit->path == NULL); + + if (commit_one_file(s, commit->param.writeout.dir_index, + commit->param.writeout.modified_offset)) + fail = -3; + + break; + } + case ACTION_NEW_FILE: { + int begin = commit->param.new_file.first_cluster; + mapping_t* mapping = find_mapping_for_cluster(s, begin); + direntry_t* entry; + int i; + + /* find direntry */ + for (i = 0; i < s->directory.next; i++) { + entry = array_get(&(s->directory), i); + if (is_file(entry) && begin_of_direntry(entry) == begin) + break; + } + + if (i >= s->directory.next) { + fail = -6; + continue; + } + + /* make sure there exists an initial mapping */ + if (mapping && mapping->begin != begin) { + mapping->end = begin; + mapping = NULL; + } + if (mapping == NULL) { + mapping = insert_mapping(s, begin, begin+1); + } + /* most members will be fixed in commit_mappings() */ + assert(commit->path); + mapping->path = commit->path; + mapping->read_only = 0; + mapping->mode = MODE_NORMAL; + mapping->info.file.offset = 0; + + if (commit_one_file(s, i, 0)) + fail = -7; + + break; + } + default: + assert(0); + } + } + if (i > 0 && array_remove_slice(&(s->commits), 0, i)) + return -1; + return fail; +} + +static int handle_deletes(BDRVVVFATState* s) +{ + int i, deferred = 1, deleted = 1; + + /* delete files corresponding to mappings marked as deleted */ + /* handle DELETEs and unused mappings (modified_fat_get(s, mapping->begin) == 0) */ + while (deferred && deleted) { + deferred = 0; + deleted = 0; + + for (i = 1; i < s->mapping.next; i++) { + mapping_t* mapping = array_get(&(s->mapping), i); + if (mapping->mode & MODE_DELETED) { + direntry_t* entry = array_get(&(s->directory), + mapping->dir_index); + + if (is_free(entry)) { + /* remove file/directory */ + if (mapping->mode & MODE_DIRECTORY) { + int j, next_dir_index = s->directory.next, + first_dir_index = mapping->info.dir.first_dir_index; + + if (rmdir(mapping->path) < 0) { + if (errno == ENOTEMPTY) { + deferred++; + continue; + } else + return -5; + } + + for (j = 1; j < s->mapping.next; j++) { + mapping_t* m = array_get(&(s->mapping), j); + if (m->mode & MODE_DIRECTORY && + m->info.dir.first_dir_index > + first_dir_index && + m->info.dir.first_dir_index < + next_dir_index) + next_dir_index = + m->info.dir.first_dir_index; + } + remove_direntries(s, first_dir_index, + next_dir_index - first_dir_index); + + deleted++; + } + } else { + if (unlink(mapping->path)) + return -4; + deleted++; + } + DLOG(fprintf(stderr, "DELETE (%d)\n", i); print_mapping(mapping); print_direntry(entry)); + remove_mapping(s, i); + } + } + } + + return 0; +} + +/* + * synchronize mapping with new state: + * + * - copy FAT (with bdrv_read) + * - mark all filenames corresponding to mappings as deleted + * - recurse direntries from root (using bs->bdrv_read) + * - delete files corresponding to mappings marked as deleted + */ +static int do_commit(BDRVVVFATState* s) +{ + int ret = 0; + + /* the real meat are the commits. Nothing to do? Move along! */ + if (s->commits.next == 0) + return 0; + + vvfat_close_current_file(s); + + ret = handle_renames_and_mkdirs(s); + if (ret) { + fprintf(stderr, "Error handling renames (%d)\n", ret); + assert(0); + return ret; + } + + /* copy FAT (with bdrv_read) */ + memcpy(s->fat.pointer, s->fat2, 0x200 * s->sectors_per_fat); + + /* recurse direntries from root (using bs->bdrv_read) */ + ret = commit_direntries(s, 0, -1); + if (ret) { + fprintf(stderr, "Fatal: error while committing (%d)\n", ret); + assert(0); + return ret; + } + + ret = handle_commits(s); + if (ret) { + fprintf(stderr, "Error handling commits (%d)\n", ret); + assert(0); + return ret; + } + + ret = handle_deletes(s); + if (ret) { + fprintf(stderr, "Error deleting\n"); + assert(0); + return ret; + } + + s->qcow->drv->bdrv_make_empty(s->qcow); + + memset(s->used_clusters, 0, sector2cluster(s, s->sector_count)); + +DLOG(checkpoint()); + return 0; +} + +static int try_commit(BDRVVVFATState* s) +{ + vvfat_close_current_file(s); +DLOG(checkpoint()); + if(!is_consistent(s)) + return -1; + return do_commit(s); +} + +static int vvfat_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVVVFATState *s = bs->opaque; + int i, ret; + +DLOG(checkpoint()); + + vvfat_close_current_file(s); + + /* + * Some sanity checks: + * - do not allow writing to the boot sector + * - do not allow to write non-ASCII filenames + */ + + if (sector_num < s->first_sectors_number) + return -1; + + for (i = sector2cluster(s, sector_num); + i <= sector2cluster(s, sector_num + nb_sectors - 1);) { + mapping_t* mapping = find_mapping_for_cluster(s, i); + if (mapping) { + if (mapping->read_only) { + fprintf(stderr, "Tried to write to write-protected file %s\n", + mapping->path); + return -1; + } + + if (mapping->mode & MODE_DIRECTORY) { + int begin = cluster2sector(s, i); + int end = begin + s->sectors_per_cluster, k; + int dir_index; + const direntry_t* direntries; + long_file_name lfn; + + lfn_init(&lfn); + + if (begin < sector_num) + begin = sector_num; + if (end > sector_num + nb_sectors) + end = sector_num + nb_sectors; + dir_index = mapping->dir_index + + 0x10 * (begin - mapping->begin * s->sectors_per_cluster); + direntries = (direntry_t*)(buf + 0x200 * (begin - sector_num)); + + for (k = 0; k < (end - begin) * 0x10; k++) { + /* do not allow non-ASCII filenames */ + if (parse_long_name(&lfn, direntries + k) < 0) { + fprintf(stderr, "Warning: non-ASCII filename\n"); + return -1; + } + /* no access to the direntry of a read-only file */ + else if (is_short_name(direntries+k) && + (direntries[k].attributes & 1)) { + if (memcmp(direntries + k, + array_get(&(s->directory), dir_index + k), + sizeof(direntry_t))) { + fprintf(stderr, "Warning: tried to write to write-protected file\n"); + return -1; + } + } + } + } + i = mapping->end; + } else + i++; + } + + /* + * Use qcow backend. Commit later. + */ +DLOG(fprintf(stderr, "Write to qcow backend: %d + %d\n", (int)sector_num, nb_sectors)); + ret = s->qcow->drv->bdrv_write(s->qcow, sector_num, buf, nb_sectors); + if (ret < 0) { + fprintf(stderr, "Error writing to qcow backend\n"); + return ret; + } + + for (i = sector2cluster(s, sector_num); + i <= sector2cluster(s, sector_num + nb_sectors - 1); i++) + if (i >= 0) + s->used_clusters[i] |= USED_ALLOCATED; + +DLOG(checkpoint()); + /* TODO: add timeout */ + try_commit(s); + +DLOG(checkpoint()); + return 0; +} + +static int vvfat_is_allocated(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int* n) +{ + BDRVVVFATState* s = bs->opaque; + *n = s->sector_count - sector_num; + if (*n > nb_sectors) + *n = nb_sectors; + else if (*n < 0) + return 0; + return 1; +} + +static int write_target_commit(BlockDriverState *bs, int64_t sector_num, + const uint8_t* buffer, int nb_sectors) { + BDRVVVFATState* s = bs->opaque; + return try_commit(s); +} + +static void write_target_close(BlockDriverState *bs) { + BDRVVVFATState* s = bs->opaque; + bdrv_delete(s->qcow); + free(s->qcow_filename); +} + +static BlockDriver vvfat_write_target = { + "vvfat_write_target", 0, NULL, NULL, NULL, + write_target_commit, + write_target_close, + NULL, NULL, NULL +}; + +static int enable_write_target(BDRVVVFATState *s) +{ + int size = sector2cluster(s, s->sector_count); + s->used_clusters = calloc(size, 1); + + array_init(&(s->commits), sizeof(commit_t)); + + s->qcow_filename = qemu_malloc(1024); + get_tmp_filename(s->qcow_filename, 1024); + if (bdrv_create(bdrv_find_format("qcow"), + s->qcow_filename, s->sector_count, "fat:", 0) < 0) + return -1; + s->qcow = bdrv_new(""); + if (s->qcow == NULL || bdrv_open(s->qcow, s->qcow_filename, 0) < 0) + return -1; + +#ifndef _WIN32 + unlink(s->qcow_filename); +#endif + + s->bs->backing_hd = calloc(sizeof(BlockDriverState), 1); + s->bs->backing_hd->drv = &vvfat_write_target; + s->bs->backing_hd->opaque = s; + + return 0; +} + +static void vvfat_close(BlockDriverState *bs) +{ + BDRVVVFATState *s = bs->opaque; + + vvfat_close_current_file(s); + array_free(&(s->fat)); + array_free(&(s->directory)); + array_free(&(s->mapping)); + if(s->cluster_buffer) + free(s->cluster_buffer); +} + +static BlockDriver bdrv_vvfat = { + .format_name = "vvfat", + .instance_size = sizeof(BDRVVVFATState), + .bdrv_open = vvfat_open, + .bdrv_read = vvfat_read, + .bdrv_write = vvfat_write, + .bdrv_close = vvfat_close, + .bdrv_is_allocated = vvfat_is_allocated, + .protocol_name = "fat", +}; + +static void bdrv_vvfat_init(void) +{ + bdrv_register(&bdrv_vvfat); +} + +block_init(bdrv_vvfat_init); + +#ifdef DEBUG +static void checkpoint(void) { + assert(((mapping_t*)array_get(&(vvv->mapping), 0))->end == 2); + check1(vvv); + check2(vvv); + assert(!vvv->current_mapping || vvv->current_fd || (vvv->current_mapping->mode & MODE_DIRECTORY)); +#if 0 + if (((direntry_t*)vvv->directory.pointer)[1].attributes != 0xf) + fprintf(stderr, "Nonono!\n"); + mapping_t* mapping; + direntry_t* direntry; + assert(vvv->mapping.size >= vvv->mapping.item_size * vvv->mapping.next); + assert(vvv->directory.size >= vvv->directory.item_size * vvv->directory.next); + if (vvv->mapping.next<47) + return; + assert((mapping = array_get(&(vvv->mapping), 47))); + assert(mapping->dir_index < vvv->directory.next); + direntry = array_get(&(vvv->directory), mapping->dir_index); + assert(!memcmp(direntry->name, "USB H ", 11) || direntry->name[0]==0); +#endif + return; + /* avoid compiler warnings: */ + hexdump(NULL, 100); + remove_mapping(vvv, NULL); + print_mapping(NULL); + print_direntry(NULL); +} +#endif |