diff options
57 files changed, 2507 insertions, 478 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 906f252477..59940f97ad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1000,3 +1000,9 @@ SSH M: Richard W.M. Jones <rjones@redhat.com> S: Supported F: block/ssh.c + +ARCHIPELAGO +M: Chrysostomos Nanakos <cnanakos@grnet.gr> +M: Chrysostomos Nanakos <chris@include.gr> +S: Maintained +F: block/archipelago.c diff --git a/block-migration.c b/block-migration.c index 73cdd07e8c..ba3ed36f77 100644 --- a/block-migration.c +++ b/block-migration.c @@ -186,7 +186,7 @@ static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector) { int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK; - if ((sector << BDRV_SECTOR_BITS) < bdrv_getlength(bmds->bs)) { + if (sector < bdrv_nb_sectors(bmds->bs)) { return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] & (1UL << (chunk % (sizeof(unsigned long) * 8)))); } else { @@ -223,8 +223,7 @@ static void alloc_aio_bitmap(BlkMigDevState *bmds) BlockDriverState *bs = bmds->bs; int64_t bitmap_size; - bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) + - BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1; + bitmap_size = bdrv_nb_sectors(bs) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1; bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8; bmds->aio_bitmap = g_malloc0(bitmap_size); @@ -350,7 +349,7 @@ static void init_blk_migration_it(void *opaque, BlockDriverState *bs) int64_t sectors; if (!bdrv_is_read_only(bs)) { - sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS; + sectors = bdrv_nb_sectors(bs); if (sectors <= 0) { return; } @@ -799,7 +798,7 @@ static int block_load(QEMUFile *f, void *opaque, int version_id) if (bs != bs_prev) { bs_prev = bs; - total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS; + total_sectors = bdrv_nb_sectors(bs); if (total_sectors <= 0) { error_report("Error getting length of block device %s", device_name); @@ -57,6 +57,8 @@ struct BdrvDirtyBitmap { #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ +#define COROUTINE_POOL_RESERVATION 64 /* number of coroutines to reserve */ + static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load); static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, @@ -701,6 +703,7 @@ static int find_image_format(BlockDriverState *bs, const char *filename, /** * Set the current 'total_sectors' value + * Return 0 on success, -errno on error. */ static int refresh_total_sectors(BlockDriverState *bs, int64_t hint) { @@ -1313,7 +1316,6 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp) error_setg_errno(errp, -total_size, "Could not get image size"); goto out; } - total_size &= BDRV_SECTOR_MASK; /* Create the temporary image */ ret = get_tmp_filename(tmp_filename, PATH_MAX + 1); @@ -2107,6 +2109,9 @@ int bdrv_attach_dev(BlockDriverState *bs, void *dev) } bs->dev = dev; bdrv_iostatus_reset(bs); + + /* We're expecting I/O from the device so bump up coroutine pool size */ + qemu_coroutine_adjust_pool_size(COROUTINE_POOL_RESERVATION); return 0; } @@ -2126,6 +2131,7 @@ void bdrv_detach_dev(BlockDriverState *bs, void *dev) bs->dev_ops = NULL; bs->dev_opaque = NULL; bs->guest_block_size = 512; + qemu_coroutine_adjust_pool_size(-COROUTINE_POOL_RESERVATION); } /* TODO change to return DeviceState * when all users are qdevified */ @@ -2203,6 +2209,9 @@ bool bdrv_dev_is_medium_locked(BlockDriverState *bs) */ int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) { + if (bs->drv == NULL) { + return -ENOMEDIUM; + } if (bs->drv->bdrv_check == NULL) { return -ENOTSUP; } @@ -2269,7 +2278,14 @@ int bdrv_commit(BlockDriverState *bs) } total_sectors = length >> BDRV_SECTOR_BITS; - buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE); + + /* qemu_try_blockalign() for bs will choose an alignment that works for + * bs->backing_hd as well, so no need to compare the alignment manually. */ + buf = qemu_try_blockalign(bs, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE); + if (buf == NULL) { + ret = -ENOMEM; + goto ro_cleanup; + } for (sector = 0; sector < total_sectors; sector += n) { ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n); @@ -2307,7 +2323,7 @@ int bdrv_commit(BlockDriverState *bs) ret = 0; ro_cleanup: - g_free(buf); + qemu_vfree(buf); if (ro) { /* ignoring error return here */ @@ -2827,18 +2843,16 @@ int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, */ int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) { - int64_t target_size; - int64_t ret, nb_sectors, sector_num = 0; + int64_t target_sectors, ret, nb_sectors, sector_num = 0; int n; - target_size = bdrv_getlength(bs); - if (target_size < 0) { - return target_size; + target_sectors = bdrv_nb_sectors(bs); + if (target_sectors < 0) { + return target_sectors; } - target_size /= BDRV_SECTOR_SIZE; for (;;) { - nb_sectors = target_size - sector_num; + nb_sectors = target_sectors - sector_num; if (nb_sectors <= 0) { return 0; } @@ -2968,7 +2982,12 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, cluster_sector_num, cluster_nb_sectors); iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; - iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len); + iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); + if (bounce_buffer == NULL) { + ret = -ENOMEM; + goto err; + } + qemu_iovec_init_external(&bounce_qiov, &iov, 1); ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, @@ -3056,15 +3075,14 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); } else { /* Read zeros after EOF of growable BDSes */ - int64_t len, total_sectors, max_nb_sectors; + int64_t total_sectors, max_nb_sectors; - len = bdrv_getlength(bs); - if (len < 0) { - ret = len; + total_sectors = bdrv_nb_sectors(bs); + if (total_sectors < 0) { + ret = total_sectors; goto out; } - total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE); max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), align >> BDRV_SECTOR_BITS); if (max_nb_sectors > 0) { @@ -3253,7 +3271,11 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, /* Fall back to bounce buffer if write zeroes is unsupported */ iov.iov_len = num * BDRV_SECTOR_SIZE; if (iov.iov_base == NULL) { - iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE); + iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); + if (iov.iov_base == NULL) { + ret = -ENOMEM; + goto fail; + } memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); } qemu_iovec_init_external(&qiov, &iov, 1); @@ -3273,6 +3295,7 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, nb_sectors -= num; } +fail: qemu_vfree(iov.iov_base); return ret; } @@ -3536,11 +3559,12 @@ int64_t bdrv_get_allocated_file_size(BlockDriverState *bs) } /** - * Length of a file in bytes. Return < 0 if error or unknown. + * Return number of sectors on success, -errno on error. */ -int64_t bdrv_getlength(BlockDriverState *bs) +int64_t bdrv_nb_sectors(BlockDriverState *bs) { BlockDriver *drv = bs->drv; + if (!drv) return -ENOMEDIUM; @@ -3550,19 +3574,26 @@ int64_t bdrv_getlength(BlockDriverState *bs) return ret; } } - return bs->total_sectors * BDRV_SECTOR_SIZE; + return bs->total_sectors; +} + +/** + * Return length in bytes on success, -errno on error. + * The length is always a multiple of BDRV_SECTOR_SIZE. + */ +int64_t bdrv_getlength(BlockDriverState *bs) +{ + int64_t ret = bdrv_nb_sectors(bs); + + return ret < 0 ? ret : ret * BDRV_SECTOR_SIZE; } /* return 0 as number of sectors if no device present or error */ void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr) { - int64_t length; - length = bdrv_getlength(bs); - if (length < 0) - length = 0; - else - length = length >> BDRV_SECTOR_BITS; - *nb_sectors_ptr = length; + int64_t nb_sectors = bdrv_nb_sectors(bs); + + *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors; } void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error, @@ -3945,21 +3976,21 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { - int64_t length; + int64_t total_sectors; int64_t n; int64_t ret, ret2; - length = bdrv_getlength(bs); - if (length < 0) { - return length; + total_sectors = bdrv_nb_sectors(bs); + if (total_sectors < 0) { + return total_sectors; } - if (sector_num >= (length >> BDRV_SECTOR_BITS)) { + if (sector_num >= total_sectors) { *pnum = 0; return 0; } - n = bs->total_sectors - sector_num; + n = total_sectors - sector_num; if (n < nb_sectors) { nb_sectors = n; } @@ -3994,8 +4025,8 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, ret |= BDRV_BLOCK_ZERO; } else if (bs->backing_hd) { BlockDriverState *bs2 = bs->backing_hd; - int64_t length2 = bdrv_getlength(bs2); - if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) { + int64_t nb_sectors2 = bdrv_nb_sectors(bs2); + if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { ret |= BDRV_BLOCK_ZERO; } } @@ -4607,8 +4638,9 @@ static void bdrv_aio_bh_cb(void *opaque) { BlockDriverAIOCBSync *acb = opaque; - if (!acb->is_write) + if (!acb->is_write && acb->ret >= 0) { qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); + } qemu_vfree(acb->bounce); acb->common.cb(acb->common.opaque, acb->ret); qemu_bh_delete(acb->bh); @@ -4630,10 +4662,12 @@ static BlockDriverAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); acb->is_write = is_write; acb->qiov = qiov; - acb->bounce = qemu_blockalign(bs, qiov->size); + acb->bounce = qemu_try_blockalign(bs, qiov->size); acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); - if (is_write) { + if (acb->bounce == NULL) { + acb->ret = -ENOMEM; + } else if (is_write) { qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); } else { @@ -5247,6 +5281,19 @@ void *qemu_blockalign(BlockDriverState *bs, size_t size) return qemu_memalign(bdrv_opt_mem_align(bs), size); } +void *qemu_try_blockalign(BlockDriverState *bs, size_t size) +{ + size_t align = bdrv_opt_mem_align(bs); + + /* Ensure that NULL is never returned on success */ + assert(align > 0); + if (size == 0) { + size = align; + } + + return qemu_try_memalign(align, size); +} + /* * Check if all memory in this vector is sector aligned. */ @@ -5277,13 +5324,12 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity, granularity >>= BDRV_SECTOR_BITS; assert(granularity); - bitmap_size = bdrv_getlength(bs); + bitmap_size = bdrv_nb_sectors(bs); if (bitmap_size < 0) { error_setg_errno(errp, -bitmap_size, "could not get length of device"); errno = -bitmap_size; return NULL; } - bitmap_size >>= BDRV_SECTOR_BITS; bitmap = g_malloc0(sizeof(BdrvDirtyBitmap)); bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1); QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); @@ -5371,6 +5417,9 @@ void bdrv_ref(BlockDriverState *bs) * deleted. */ void bdrv_unref(BlockDriverState *bs) { + if (!bs) { + return; + } assert(bs->refcnt > 0); if (--bs->refcnt == 0) { bdrv_delete(bs); @@ -5591,7 +5640,7 @@ void bdrv_img_create(const char *filename, const char *fmt, if (size == -1) { if (backing_file) { BlockDriverState *bs; - uint64_t size; + int64_t size; int back_flags; /* backing files always opened read-only */ @@ -5609,8 +5658,13 @@ void bdrv_img_create(const char *filename, const char *fmt, local_err = NULL; goto out; } - bdrv_get_geometry(bs, &size); - size *= 512; + size = bdrv_getlength(bs); + if (size < 0) { + error_setg_errno(errp, -size, "Could not get size of '%s'", + backing_file); + bdrv_unref(bs); + goto out; + } qemu_opt_set_number(opts, BLOCK_OPT_SIZE, size); diff --git a/block/Makefile.objs b/block/Makefile.objs index fd88c03ece..858d2b387b 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -17,6 +17,7 @@ block-obj-$(CONFIG_LIBNFS) += nfs.o block-obj-$(CONFIG_CURL) += curl.o block-obj-$(CONFIG_RBD) += rbd.o block-obj-$(CONFIG_GLUSTERFS) += gluster.o +block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o block-obj-$(CONFIG_LIBSSH2) += ssh.o endif @@ -35,5 +36,6 @@ gluster.o-cflags := $(GLUSTERFS_CFLAGS) gluster.o-libs := $(GLUSTERFS_LIBS) ssh.o-cflags := $(LIBSSH2_CFLAGS) ssh.o-libs := $(LIBSSH2_LIBS) +archipelago.o-libs := $(ARCHIPELAGO_LIBS) qcow.o-libs := -lz linux-aio.o-libs := -laio diff --git a/block/archipelago.c b/block/archipelago.c new file mode 100644 index 0000000000..6629d03a1d --- /dev/null +++ b/block/archipelago.c @@ -0,0 +1,1069 @@ +/* + * QEMU Block driver for Archipelago + * + * Copyright (C) 2014 Chrysostomos Nanakos <cnanakos@grnet.gr> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +/* + * VM Image on Archipelago volume is specified like this: + * + * file.driver=archipelago,file.volume=<volumename> + * [,file.mport=<mapperd_port>[,file.vport=<vlmcd_port>] + * [,file.segment=<segment_name>]] + * + * or + * + * file=archipelago:<volumename>[/mport=<mapperd_port>[:vport=<vlmcd_port>][: + * segment=<segment_name>]] + * + * 'archipelago' is the protocol. + * + * 'mport' is the port number on which mapperd is listening. This is optional + * and if not specified, QEMU will make Archipelago to use the default port. + * + * 'vport' is the port number on which vlmcd is listening. This is optional + * and if not specified, QEMU will make Archipelago to use the default port. + * + * 'segment' is the name of the shared memory segment Archipelago stack + * is using. This is optional and if not specified, QEMU will make Archipelago + * to use the default value, 'archipelago'. + * + * Examples: + * + * file.driver=archipelago,file.volume=my_vm_volume + * file.driver=archipelago,file.volume=my_vm_volume,file.mport=123 + * file.driver=archipelago,file.volume=my_vm_volume,file.mport=123, + * file.vport=1234 + * file.driver=archipelago,file.volume=my_vm_volume,file.mport=123, + * file.vport=1234,file.segment=my_segment + * + * or + * + * file=archipelago:my_vm_volume + * file=archipelago:my_vm_volume/mport=123 + * file=archipelago:my_vm_volume/mport=123:vport=1234 + * file=archipelago:my_vm_volume/mport=123:vport=1234:segment=my_segment + * + */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/error-report.h" +#include "qemu/thread.h" +#include "qapi/qmp/qint.h" +#include "qapi/qmp/qstring.h" +#include "qapi/qmp/qjson.h" + +#include <inttypes.h> +#include <xseg/xseg.h> +#include <xseg/protocol.h> + +#define ARCHIP_FD_READ 0 +#define ARCHIP_FD_WRITE 1 +#define MAX_REQUEST_SIZE 524288 + +#define ARCHIPELAGO_OPT_VOLUME "volume" +#define ARCHIPELAGO_OPT_SEGMENT "segment" +#define ARCHIPELAGO_OPT_MPORT "mport" +#define ARCHIPELAGO_OPT_VPORT "vport" +#define ARCHIPELAGO_DFL_MPORT 1001 +#define ARCHIPELAGO_DFL_VPORT 501 + +#define archipelagolog(fmt, ...) \ + do { \ + fprintf(stderr, "archipelago\t%-24s: " fmt, __func__, ##__VA_ARGS__); \ + } while (0) + +typedef enum { + ARCHIP_OP_READ, + ARCHIP_OP_WRITE, + ARCHIP_OP_FLUSH, + ARCHIP_OP_VOLINFO, +} ARCHIPCmd; + +typedef struct ArchipelagoAIOCB { + BlockDriverAIOCB common; + QEMUBH *bh; + struct BDRVArchipelagoState *s; + QEMUIOVector *qiov; + ARCHIPCmd cmd; + bool cancelled; + int status; + int64_t size; + int64_t ret; +} ArchipelagoAIOCB; + +typedef struct BDRVArchipelagoState { + ArchipelagoAIOCB *event_acb; + char *volname; + char *segment_name; + uint64_t size; + /* Archipelago specific */ + struct xseg *xseg; + struct xseg_port *port; + xport srcport; + xport sport; + xport mportno; + xport vportno; + QemuMutex archip_mutex; + QemuCond archip_cond; + bool is_signaled; + /* Request handler specific */ + QemuThread request_th; + QemuCond request_cond; + QemuMutex request_mutex; + bool th_is_signaled; + bool stopping; +} BDRVArchipelagoState; + +typedef struct ArchipelagoSegmentedRequest { + size_t count; + size_t total; + int ref; + int failed; +} ArchipelagoSegmentedRequest; + +typedef struct AIORequestData { + const char *volname; + off_t offset; + size_t size; + uint64_t bufidx; + int ret; + int op; + ArchipelagoAIOCB *aio_cb; + ArchipelagoSegmentedRequest *segreq; +} AIORequestData; + +static void qemu_archipelago_complete_aio(void *opaque); + +static void init_local_signal(struct xseg *xseg, xport sport, xport srcport) +{ + if (xseg && (sport != srcport)) { + xseg_init_local_signal(xseg, srcport); + sport = srcport; + } +} + +static void archipelago_finish_aiocb(AIORequestData *reqdata) +{ + if (reqdata->aio_cb->ret != reqdata->segreq->total) { + reqdata->aio_cb->ret = -EIO; + } else if (reqdata->aio_cb->ret == reqdata->segreq->total) { + reqdata->aio_cb->ret = 0; + } + reqdata->aio_cb->bh = aio_bh_new( + bdrv_get_aio_context(reqdata->aio_cb->common.bs), + qemu_archipelago_complete_aio, reqdata + ); + qemu_bh_schedule(reqdata->aio_cb->bh); +} + +static int wait_reply(struct xseg *xseg, xport srcport, struct xseg_port *port, + struct xseg_request *expected_req) +{ + struct xseg_request *req; + xseg_prepare_wait(xseg, srcport); + void *psd = xseg_get_signal_desc(xseg, port); + while (1) { + req = xseg_receive(xseg, srcport, X_NONBLOCK); + if (req) { + if (req != expected_req) { + archipelagolog("Unknown received request\n"); + xseg_put_request(xseg, req, srcport); + } else if (!(req->state & XS_SERVED)) { + return -1; + } else { + break; + } + } + xseg_wait_signal(xseg, psd, 100000UL); + } + xseg_cancel_wait(xseg, srcport); + return 0; +} + +static void xseg_request_handler(void *state) +{ + BDRVArchipelagoState *s = (BDRVArchipelagoState *) state; + void *psd = xseg_get_signal_desc(s->xseg, s->port); + qemu_mutex_lock(&s->request_mutex); + + while (!s->stopping) { + struct xseg_request *req; + void *data; + xseg_prepare_wait(s->xseg, s->srcport); + req = xseg_receive(s->xseg, s->srcport, X_NONBLOCK); + if (req) { + AIORequestData *reqdata; + ArchipelagoSegmentedRequest *segreq; + xseg_get_req_data(s->xseg, req, (void **)&reqdata); + + switch (reqdata->op) { + case ARCHIP_OP_READ: + data = xseg_get_data(s->xseg, req); + segreq = reqdata->segreq; + segreq->count += req->serviced; + + qemu_iovec_from_buf(reqdata->aio_cb->qiov, reqdata->bufidx, + data, + req->serviced); + + xseg_put_request(s->xseg, req, s->srcport); + + if ((__sync_add_and_fetch(&segreq->ref, -1)) == 0) { + if (!segreq->failed) { + reqdata->aio_cb->ret = segreq->count; + archipelago_finish_aiocb(reqdata); + g_free(segreq); + } else { + g_free(segreq); + g_free(reqdata); + } + } else { + g_free(reqdata); + } + break; + case ARCHIP_OP_WRITE: + case ARCHIP_OP_FLUSH: + segreq = reqdata->segreq; + segreq->count += req->serviced; + xseg_put_request(s->xseg, req, s->srcport); + + if ((__sync_add_and_fetch(&segreq->ref, -1)) == 0) { + if (!segreq->failed) { + reqdata->aio_cb->ret = segreq->count; + archipelago_finish_aiocb(reqdata); + g_free(segreq); + } else { + g_free(segreq); + g_free(reqdata); + } + } else { + g_free(reqdata); + } + break; + case ARCHIP_OP_VOLINFO: + s->is_signaled = true; + qemu_cond_signal(&s->archip_cond); + break; + } + } else { + xseg_wait_signal(s->xseg, psd, 100000UL); + } + xseg_cancel_wait(s->xseg, s->srcport); + } + + s->th_is_signaled = true; + qemu_cond_signal(&s->request_cond); + qemu_mutex_unlock(&s->request_mutex); + qemu_thread_exit(NULL); +} + +static int qemu_archipelago_xseg_init(BDRVArchipelagoState *s) +{ + if (xseg_initialize()) { + archipelagolog("Cannot initialize XSEG\n"); + goto err_exit; + } + + s->xseg = xseg_join("posix", s->segment_name, + "posixfd", NULL); + if (!s->xseg) { + archipelagolog("Cannot join XSEG shared memory segment\n"); + goto err_exit; + } + s->port = xseg_bind_dynport(s->xseg); + s->srcport = s->port->portno; + init_local_signal(s->xseg, s->sport, s->srcport); + return 0; + +err_exit: + return -1; +} + +static int qemu_archipelago_init(BDRVArchipelagoState *s) +{ + int ret; + + ret = qemu_archipelago_xseg_init(s); + if (ret < 0) { + error_report("Cannot initialize XSEG. Aborting...\n"); + goto err_exit; + } + + qemu_cond_init(&s->archip_cond); + qemu_mutex_init(&s->archip_mutex); + qemu_cond_init(&s->request_cond); + qemu_mutex_init(&s->request_mutex); + s->th_is_signaled = false; + qemu_thread_create(&s->request_th, "xseg_io_th", + (void *) xseg_request_handler, + (void *) s, QEMU_THREAD_JOINABLE); + +err_exit: + return ret; +} + +static void qemu_archipelago_complete_aio(void *opaque) +{ + AIORequestData *reqdata = (AIORequestData *) opaque; + ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) reqdata->aio_cb; + + qemu_bh_delete(aio_cb->bh); + aio_cb->common.cb(aio_cb->common.opaque, aio_cb->ret); + aio_cb->status = 0; + + if (!aio_cb->cancelled) { + qemu_aio_release(aio_cb); + } + g_free(reqdata); +} + +static void xseg_find_port(char *pstr, const char *needle, xport *aport) +{ + const char *a; + char *endptr = NULL; + unsigned long port; + if (strstart(pstr, needle, &a)) { + if (strlen(a) > 0) { + port = strtoul(a, &endptr, 10); + if (strlen(endptr)) { + *aport = -2; + return; + } + *aport = (xport) port; + } + } +} + +static void xseg_find_segment(char *pstr, const char *needle, + char **segment_name) +{ + const char *a; + if (strstart(pstr, needle, &a)) { + if (strlen(a) > 0) { + *segment_name = g_strdup(a); + } + } +} + +static void parse_filename_opts(const char *filename, Error **errp, + char **volume, char **segment_name, + xport *mport, xport *vport) +{ + const char *start; + char *tokens[4], *ds; + int idx; + xport lmport = NoPort, lvport = NoPort; + + strstart(filename, "archipelago:", &start); + + ds = g_strdup(start); + tokens[0] = strtok(ds, "/"); + tokens[1] = strtok(NULL, ":"); + tokens[2] = strtok(NULL, ":"); + tokens[3] = strtok(NULL, "\0"); + + if (!strlen(tokens[0])) { + error_setg(errp, "volume name must be specified first"); + g_free(ds); + return; + } + + for (idx = 1; idx < 4; idx++) { + if (tokens[idx] != NULL) { + if (strstart(tokens[idx], "mport=", NULL)) { + xseg_find_port(tokens[idx], "mport=", &lmport); + } + if (strstart(tokens[idx], "vport=", NULL)) { + xseg_find_port(tokens[idx], "vport=", &lvport); + } + if (strstart(tokens[idx], "segment=", NULL)) { + xseg_find_segment(tokens[idx], "segment=", segment_name); + } + } + } + + if ((lmport == -2) || (lvport == -2)) { + error_setg(errp, "mport and/or vport must be set"); + g_free(ds); + return; + } + *volume = g_strdup(tokens[0]); + *mport = lmport; + *vport = lvport; + g_free(ds); +} + +static void archipelago_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + const char *start; + char *volume = NULL, *segment_name = NULL; + xport mport = NoPort, vport = NoPort; + + if (qdict_haskey(options, ARCHIPELAGO_OPT_VOLUME) + || qdict_haskey(options, ARCHIPELAGO_OPT_SEGMENT) + || qdict_haskey(options, ARCHIPELAGO_OPT_MPORT) + || qdict_haskey(options, ARCHIPELAGO_OPT_VPORT)) { + error_setg(errp, "volume/mport/vport/segment and a file name may not" + " be specified at the same time"); + return; + } + + if (!strstart(filename, "archipelago:", &start)) { + error_setg(errp, "File name must start with 'archipelago:'"); + return; + } + + if (!strlen(start) || strstart(start, "/", NULL)) { + error_setg(errp, "volume name must be specified"); + return; + } + + parse_filename_opts(filename, errp, &volume, &segment_name, &mport, &vport); + + if (volume) { + qdict_put(options, ARCHIPELAGO_OPT_VOLUME, qstring_from_str(volume)); + g_free(volume); + } + if (segment_name) { + qdict_put(options, ARCHIPELAGO_OPT_SEGMENT, + qstring_from_str(segment_name)); + g_free(segment_name); + } + if (mport != NoPort) { + qdict_put(options, ARCHIPELAGO_OPT_MPORT, qint_from_int(mport)); + } + if (vport != NoPort) { + qdict_put(options, ARCHIPELAGO_OPT_VPORT, qint_from_int(vport)); + } +} + +static QemuOptsList archipelago_runtime_opts = { + .name = "archipelago", + .head = QTAILQ_HEAD_INITIALIZER(archipelago_runtime_opts.head), + .desc = { + { + .name = ARCHIPELAGO_OPT_VOLUME, + .type = QEMU_OPT_STRING, + .help = "Name of the volume image", + }, + { + .name = ARCHIPELAGO_OPT_SEGMENT, + .type = QEMU_OPT_STRING, + .help = "Name of the Archipelago shared memory segment", + }, + { + .name = ARCHIPELAGO_OPT_MPORT, + .type = QEMU_OPT_NUMBER, + .help = "Archipelago mapperd port number" + }, + { + .name = ARCHIPELAGO_OPT_VPORT, + .type = QEMU_OPT_NUMBER, + .help = "Archipelago vlmcd port number" + + }, + { /* end of list */ } + }, +}; + +static int qemu_archipelago_open(BlockDriverState *bs, + QDict *options, + int bdrv_flags, + Error **errp) +{ + int ret = 0; + const char *volume, *segment_name; + QemuOpts *opts; + Error *local_err = NULL; + BDRVArchipelagoState *s = bs->opaque; + + opts = qemu_opts_create(&archipelago_runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto err_exit; + } + + s->mportno = qemu_opt_get_number(opts, ARCHIPELAGO_OPT_MPORT, + ARCHIPELAGO_DFL_MPORT); + s->vportno = qemu_opt_get_number(opts, ARCHIPELAGO_OPT_VPORT, + ARCHIPELAGO_DFL_VPORT); + + segment_name = qemu_opt_get(opts, ARCHIPELAGO_OPT_SEGMENT); + if (segment_name == NULL) { + s->segment_name = g_strdup("archipelago"); + } else { + s->segment_name = g_strdup(segment_name); + } + + volume = qemu_opt_get(opts, ARCHIPELAGO_OPT_VOLUME); + if (volume == NULL) { + error_setg(errp, "archipelago block driver requires the 'volume'" + " option"); + ret = -EINVAL; + goto err_exit; + } + s->volname = g_strdup(volume); + + /* Initialize XSEG, join shared memory segment */ + ret = qemu_archipelago_init(s); + if (ret < 0) { + error_setg(errp, "cannot initialize XSEG and join shared " + "memory segment"); + goto err_exit; + } + + qemu_opts_del(opts); + return 0; + +err_exit: + g_free(s->volname); + g_free(s->segment_name); + qemu_opts_del(opts); + return ret; +} + +static void qemu_archipelago_close(BlockDriverState *bs) +{ + int r, targetlen; + char *target; + struct xseg_request *req; + BDRVArchipelagoState *s = bs->opaque; + + s->stopping = true; + + qemu_mutex_lock(&s->request_mutex); + while (!s->th_is_signaled) { + qemu_cond_wait(&s->request_cond, + &s->request_mutex); + } + qemu_mutex_unlock(&s->request_mutex); + qemu_thread_join(&s->request_th); + qemu_cond_destroy(&s->request_cond); + qemu_mutex_destroy(&s->request_mutex); + + qemu_cond_destroy(&s->archip_cond); + qemu_mutex_destroy(&s->archip_mutex); + + targetlen = strlen(s->volname); + req = xseg_get_request(s->xseg, s->srcport, s->vportno, X_ALLOC); + if (!req) { + archipelagolog("Cannot get XSEG request\n"); + goto err_exit; + } + r = xseg_prep_request(s->xseg, req, targetlen, 0); + if (r < 0) { + xseg_put_request(s->xseg, req, s->srcport); + archipelagolog("Cannot prepare XSEG close request\n"); + goto err_exit; + } + + target = xseg_get_target(s->xseg, req); + memcpy(target, s->volname, targetlen); + req->size = req->datalen; + req->offset = 0; + req->op = X_CLOSE; + + xport p = xseg_submit(s->xseg, req, s->srcport, X_ALLOC); + if (p == NoPort) { + xseg_put_request(s->xseg, req, s->srcport); + archipelagolog("Cannot submit XSEG close request\n"); + goto err_exit; + } + + xseg_signal(s->xseg, p); + wait_reply(s->xseg, s->srcport, s->port, req); + + xseg_put_request(s->xseg, req, s->srcport); + +err_exit: + g_free(s->volname); + g_free(s->segment_name); + xseg_quit_local_signal(s->xseg, s->srcport); + xseg_leave_dynport(s->xseg, s->port); + xseg_leave(s->xseg); +} + +static int qemu_archipelago_create_volume(Error **errp, const char *volname, + char *segment_name, + uint64_t size, xport mportno, + xport vportno) +{ + int ret, targetlen; + struct xseg *xseg = NULL; + struct xseg_request *req; + struct xseg_request_clone *xclone; + struct xseg_port *port; + xport srcport = NoPort, sport = NoPort; + char *target; + + /* Try default values if none has been set */ + if (mportno == (xport) -1) { + mportno = ARCHIPELAGO_DFL_MPORT; + } + + if (vportno == (xport) -1) { + vportno = ARCHIPELAGO_DFL_VPORT; + } + + if (xseg_initialize()) { + error_setg(errp, "Cannot initialize XSEG"); + return -1; + } + + xseg = xseg_join("posix", segment_name, + "posixfd", NULL); + + if (!xseg) { + error_setg(errp, "Cannot join XSEG shared memory segment"); + return -1; + } + + port = xseg_bind_dynport(xseg); + srcport = port->portno; + init_local_signal(xseg, sport, srcport); + + req = xseg_get_request(xseg, srcport, mportno, X_ALLOC); + if (!req) { + error_setg(errp, "Cannot get XSEG request"); + return -1; + } + + targetlen = strlen(volname); + ret = xseg_prep_request(xseg, req, targetlen, + sizeof(struct xseg_request_clone)); + if (ret < 0) { + error_setg(errp, "Cannot prepare XSEG request"); + goto err_exit; + } + + target = xseg_get_target(xseg, req); + if (!target) { + error_setg(errp, "Cannot get XSEG target.\n"); + goto err_exit; + } + memcpy(target, volname, targetlen); + xclone = (struct xseg_request_clone *) xseg_get_data(xseg, req); + memset(xclone->target, 0 , XSEG_MAX_TARGETLEN); + xclone->targetlen = 0; + xclone->size = size; + req->offset = 0; + req->size = req->datalen; + req->op = X_CLONE; + + xport p = xseg_submit(xseg, req, srcport, X_ALLOC); + if (p == NoPort) { + error_setg(errp, "Could not submit XSEG request"); + goto err_exit; + } + xseg_signal(xseg, p); + + ret = wait_reply(xseg, srcport, port, req); + if (ret < 0) { + error_setg(errp, "wait_reply() error."); + } + + xseg_put_request(xseg, req, srcport); + xseg_quit_local_signal(xseg, srcport); + xseg_leave_dynport(xseg, port); + xseg_leave(xseg); + return ret; + +err_exit: + xseg_put_request(xseg, req, srcport); + xseg_quit_local_signal(xseg, srcport); + xseg_leave_dynport(xseg, port); + xseg_leave(xseg); + return -1; +} + +static int qemu_archipelago_create(const char *filename, + QemuOpts *options, + Error **errp) +{ + int ret = 0; + uint64_t total_size = 0; + char *volname = NULL, *segment_name = NULL; + const char *start; + xport mport = NoPort, vport = NoPort; + + if (!strstart(filename, "archipelago:", &start)) { + error_setg(errp, "File name must start with 'archipelago:'"); + return -1; + } + + if (!strlen(start) || strstart(start, "/", NULL)) { + error_setg(errp, "volume name must be specified"); + return -1; + } + + parse_filename_opts(filename, errp, &volname, &segment_name, &mport, + &vport); + total_size = qemu_opt_get_size_del(options, BLOCK_OPT_SIZE, 0); + + if (segment_name == NULL) { + segment_name = g_strdup("archipelago"); + } + + /* Create an Archipelago volume */ + ret = qemu_archipelago_create_volume(errp, volname, segment_name, + total_size, mport, + vport); + + g_free(volname); + g_free(segment_name); + return ret; +} + +static void qemu_archipelago_aio_cancel(BlockDriverAIOCB *blockacb) +{ + ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) blockacb; + aio_cb->cancelled = true; + while (aio_cb->status == -EINPROGRESS) { + aio_poll(bdrv_get_aio_context(aio_cb->common.bs), true); + } + qemu_aio_release(aio_cb); +} + +static const AIOCBInfo archipelago_aiocb_info = { + .aiocb_size = sizeof(ArchipelagoAIOCB), + .cancel = qemu_archipelago_aio_cancel, +}; + +static int archipelago_submit_request(BDRVArchipelagoState *s, + uint64_t bufidx, + size_t count, + off_t offset, + ArchipelagoAIOCB *aio_cb, + ArchipelagoSegmentedRequest *segreq, + int op) +{ + int ret, targetlen; + char *target; + void *data = NULL; + struct xseg_request *req; + AIORequestData *reqdata = g_malloc(sizeof(AIORequestData)); + + targetlen = strlen(s->volname); + req = xseg_get_request(s->xseg, s->srcport, s->vportno, X_ALLOC); + if (!req) { + archipelagolog("Cannot get XSEG request\n"); + goto err_exit2; + } + ret = xseg_prep_request(s->xseg, req, targetlen, count); + if (ret < 0) { + archipelagolog("Cannot prepare XSEG request\n"); + goto err_exit; + } + target = xseg_get_target(s->xseg, req); + if (!target) { + archipelagolog("Cannot get XSEG target\n"); + goto err_exit; + } + memcpy(target, s->volname, targetlen); + req->size = count; + req->offset = offset; + + switch (op) { + case ARCHIP_OP_READ: + req->op = X_READ; + break; + case ARCHIP_OP_WRITE: + req->op = X_WRITE; + break; + case ARCHIP_OP_FLUSH: + req->op = X_FLUSH; + break; + } + reqdata->volname = s->volname; + reqdata->offset = offset; + reqdata->size = count; + reqdata->bufidx = bufidx; + reqdata->aio_cb = aio_cb; + reqdata->segreq = segreq; + reqdata->op = op; + + xseg_set_req_data(s->xseg, req, reqdata); + if (op == ARCHIP_OP_WRITE) { + data = xseg_get_data(s->xseg, req); + if (!data) { + archipelagolog("Cannot get XSEG data\n"); + goto err_exit; + } + qemu_iovec_to_buf(aio_cb->qiov, bufidx, data, count); + } + + xport p = xseg_submit(s->xseg, req, s->srcport, X_ALLOC); + if (p == NoPort) { + archipelagolog("Could not submit XSEG request\n"); + goto err_exit; + } + xseg_signal(s->xseg, p); + return 0; + +err_exit: + g_free(reqdata); + xseg_put_request(s->xseg, req, s->srcport); + return -EIO; +err_exit2: + g_free(reqdata); + return -EIO; +} + +static int archipelago_aio_segmented_rw(BDRVArchipelagoState *s, + size_t count, + off_t offset, + ArchipelagoAIOCB *aio_cb, + int op) +{ + int i, ret, segments_nr, last_segment_size; + ArchipelagoSegmentedRequest *segreq; + + segreq = g_malloc(sizeof(ArchipelagoSegmentedRequest)); + + if (op == ARCHIP_OP_FLUSH) { + segments_nr = 1; + segreq->ref = segments_nr; + segreq->total = count; + segreq->count = 0; + segreq->failed = 0; + ret = archipelago_submit_request(s, 0, count, offset, aio_cb, + segreq, ARCHIP_OP_FLUSH); + if (ret < 0) { + goto err_exit; + } + return 0; + } + + segments_nr = (int)(count / MAX_REQUEST_SIZE) + \ + ((count % MAX_REQUEST_SIZE) ? 1 : 0); + last_segment_size = (int)(count % MAX_REQUEST_SIZE); + + segreq->ref = segments_nr; + segreq->total = count; + segreq->count = 0; + segreq->failed = 0; + + for (i = 0; i < segments_nr - 1; i++) { + ret = archipelago_submit_request(s, i * MAX_REQUEST_SIZE, + MAX_REQUEST_SIZE, + offset + i * MAX_REQUEST_SIZE, + aio_cb, segreq, op); + + if (ret < 0) { + goto err_exit; + } + } + + if ((segments_nr > 1) && last_segment_size) { + ret = archipelago_submit_request(s, i * MAX_REQUEST_SIZE, + last_segment_size, + offset + i * MAX_REQUEST_SIZE, + aio_cb, segreq, op); + } else if ((segments_nr > 1) && !last_segment_size) { + ret = archipelago_submit_request(s, i * MAX_REQUEST_SIZE, + MAX_REQUEST_SIZE, + offset + i * MAX_REQUEST_SIZE, + aio_cb, segreq, op); + } else if (segments_nr == 1) { + ret = archipelago_submit_request(s, 0, count, offset, aio_cb, + segreq, op); + } + + if (ret < 0) { + goto err_exit; + } + + return 0; + +err_exit: + __sync_add_and_fetch(&segreq->failed, 1); + if (segments_nr == 1) { + if (__sync_add_and_fetch(&segreq->ref, -1) == 0) { + g_free(segreq); + } + } else { + if ((__sync_add_and_fetch(&segreq->ref, -segments_nr + i)) == 0) { + g_free(segreq); + } + } + + return ret; +} + +static BlockDriverAIOCB *qemu_archipelago_aio_rw(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque, + int op) +{ + ArchipelagoAIOCB *aio_cb; + BDRVArchipelagoState *s = bs->opaque; + int64_t size, off; + int ret; + + aio_cb = qemu_aio_get(&archipelago_aiocb_info, bs, cb, opaque); + aio_cb->cmd = op; + aio_cb->qiov = qiov; + + aio_cb->ret = 0; + aio_cb->s = s; + aio_cb->cancelled = false; + aio_cb->status = -EINPROGRESS; + + off = sector_num * BDRV_SECTOR_SIZE; + size = nb_sectors * BDRV_SECTOR_SIZE; + aio_cb->size = size; + + ret = archipelago_aio_segmented_rw(s, size, off, + aio_cb, op); + if (ret < 0) { + goto err_exit; + } + return &aio_cb->common; + +err_exit: + error_report("qemu_archipelago_aio_rw(): I/O Error\n"); + qemu_aio_release(aio_cb); + return NULL; +} + +static BlockDriverAIOCB *qemu_archipelago_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + return qemu_archipelago_aio_rw(bs, sector_num, qiov, nb_sectors, cb, + opaque, ARCHIP_OP_READ); +} + +static BlockDriverAIOCB *qemu_archipelago_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + return qemu_archipelago_aio_rw(bs, sector_num, qiov, nb_sectors, cb, + opaque, ARCHIP_OP_WRITE); +} + +static int64_t archipelago_volume_info(BDRVArchipelagoState *s) +{ + uint64_t size; + int ret, targetlen; + struct xseg_request *req; + struct xseg_reply_info *xinfo; + AIORequestData *reqdata = g_malloc(sizeof(AIORequestData)); + + const char *volname = s->volname; + targetlen = strlen(volname); + req = xseg_get_request(s->xseg, s->srcport, s->mportno, X_ALLOC); + if (!req) { + archipelagolog("Cannot get XSEG request\n"); + goto err_exit2; + } + ret = xseg_prep_request(s->xseg, req, targetlen, + sizeof(struct xseg_reply_info)); + if (ret < 0) { + archipelagolog("Cannot prepare XSEG request\n"); + goto err_exit; + } + char *target = xseg_get_target(s->xseg, req); + if (!target) { + archipelagolog("Cannot get XSEG target\n"); + goto err_exit; + } + memcpy(target, volname, targetlen); + req->size = req->datalen; + req->offset = 0; + req->op = X_INFO; + + reqdata->op = ARCHIP_OP_VOLINFO; + reqdata->volname = volname; + xseg_set_req_data(s->xseg, req, reqdata); + + xport p = xseg_submit(s->xseg, req, s->srcport, X_ALLOC); + if (p == NoPort) { + archipelagolog("Cannot submit XSEG request\n"); + goto err_exit; + } + xseg_signal(s->xseg, p); + qemu_mutex_lock(&s->archip_mutex); + while (!s->is_signaled) { + qemu_cond_wait(&s->archip_cond, &s->archip_mutex); + } + s->is_signaled = false; + qemu_mutex_unlock(&s->archip_mutex); + + xinfo = (struct xseg_reply_info *) xseg_get_data(s->xseg, req); + size = xinfo->size; + xseg_put_request(s->xseg, req, s->srcport); + g_free(reqdata); + s->size = size; + return size; + +err_exit: + xseg_put_request(s->xseg, req, s->srcport); +err_exit2: + g_free(reqdata); + return -EIO; +} + +static int64_t qemu_archipelago_getlength(BlockDriverState *bs) +{ + int64_t ret; + BDRVArchipelagoState *s = bs->opaque; + + ret = archipelago_volume_info(s); + return ret; +} + +static QemuOptsList qemu_archipelago_create_opts = { + .name = "archipelago-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(qemu_archipelago_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { /* end of list */ } + } +}; + +static BlockDriverAIOCB *qemu_archipelago_aio_flush(BlockDriverState *bs, + BlockDriverCompletionFunc *cb, void *opaque) +{ + return qemu_archipelago_aio_rw(bs, 0, NULL, 0, cb, opaque, + ARCHIP_OP_FLUSH); +} + +static BlockDriver bdrv_archipelago = { + .format_name = "archipelago", + .protocol_name = "archipelago", + .instance_size = sizeof(BDRVArchipelagoState), + .bdrv_parse_filename = archipelago_parse_filename, + .bdrv_file_open = qemu_archipelago_open, + .bdrv_close = qemu_archipelago_close, + .bdrv_create = qemu_archipelago_create, + .bdrv_getlength = qemu_archipelago_getlength, + .bdrv_aio_readv = qemu_archipelago_aio_readv, + .bdrv_aio_writev = qemu_archipelago_aio_writev, + .bdrv_aio_flush = qemu_archipelago_aio_flush, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + .create_opts = &qemu_archipelago_create_opts, +}; + +static void bdrv_archipelago_init(void) +{ + bdrv_register(&bdrv_archipelago); +} + +block_init(bdrv_archipelago_init); diff --git a/block/bochs.c b/block/bochs.c index eba23df335..6674b27438 100644 --- a/block/bochs.c +++ b/block/bochs.c @@ -131,7 +131,11 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags, return -EFBIG; } - s->catalog_bitmap = g_malloc(s->catalog_size * 4); + s->catalog_bitmap = g_try_malloc(s->catalog_size * 4); + if (s->catalog_size && s->catalog_bitmap == NULL) { + error_setg(errp, "Could not allocate memory for catalog"); + return -ENOMEM; + } ret = bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap, s->catalog_size * 4); diff --git a/block/cloop.c b/block/cloop.c index 8457737922..f328be06f8 100644 --- a/block/cloop.c +++ b/block/cloop.c @@ -116,7 +116,12 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags, "try increasing block size"); return -EINVAL; } - s->offsets = g_malloc(offsets_size); + + s->offsets = g_try_malloc(offsets_size); + if (s->offsets == NULL) { + error_setg(errp, "Could not allocate offsets table"); + return -ENOMEM; + } ret = bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size); if (ret < 0) { @@ -158,8 +163,20 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags, } /* initialize zlib engine */ - s->compressed_block = g_malloc(max_compressed_block_size + 1); - s->uncompressed_block = g_malloc(s->block_size); + s->compressed_block = g_try_malloc(max_compressed_block_size + 1); + if (s->compressed_block == NULL) { + error_setg(errp, "Could not allocate compressed_block"); + ret = -ENOMEM; + goto fail; + } + + s->uncompressed_block = g_try_malloc(s->block_size); + if (s->uncompressed_block == NULL) { + error_setg(errp, "Could not allocate uncompressed_block"); + ret = -ENOMEM; + goto fail; + } + if (inflateInit(&s->zstream) != Z_OK) { ret = -EINVAL; goto fail; diff --git a/block/curl.c b/block/curl.c index 79ff2f1e41..d4b85d20a5 100644 --- a/block/curl.c +++ b/block/curl.c @@ -640,7 +640,13 @@ static void curl_readv_bh_cb(void *p) state->buf_start = start; state->buf_len = acb->end + s->readahead_size; end = MIN(start + state->buf_len, s->len) - 1; - state->orig_buf = g_malloc(state->buf_len); + state->orig_buf = g_try_malloc(state->buf_len); + if (state->buf_len && state->orig_buf == NULL) { + curl_clean_state(state); + acb->common.cb(acb->common.opaque, -ENOMEM); + qemu_aio_release(acb); + return; + } state->acb[0] = acb; snprintf(state->range, 127, "%zd-%zd", start, end); diff --git a/block/dmg.c b/block/dmg.c index 1e153cd76d..e455886d2b 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -284,8 +284,15 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, } /* initialize zlib engine */ - s->compressed_chunk = g_malloc(max_compressed_size + 1); - s->uncompressed_chunk = g_malloc(512 * max_sectors_per_chunk); + s->compressed_chunk = qemu_try_blockalign(bs->file, + max_compressed_size + 1); + s->uncompressed_chunk = qemu_try_blockalign(bs->file, + 512 * max_sectors_per_chunk); + if (s->compressed_chunk == NULL || s->uncompressed_chunk == NULL) { + ret = -ENOMEM; + goto fail; + } + if (inflateInit(&s->zstream) != Z_OK) { ret = -EINVAL; goto fail; @@ -302,8 +309,8 @@ fail: g_free(s->lengths); g_free(s->sectors); g_free(s->sectorcounts); - g_free(s->compressed_chunk); - g_free(s->uncompressed_chunk); + qemu_vfree(s->compressed_chunk); + qemu_vfree(s->uncompressed_chunk); return ret; } @@ -426,8 +433,8 @@ static void dmg_close(BlockDriverState *bs) g_free(s->lengths); g_free(s->sectors); g_free(s->sectorcounts); - g_free(s->compressed_chunk); - g_free(s->uncompressed_chunk); + qemu_vfree(s->compressed_chunk); + qemu_vfree(s->uncompressed_chunk); inflateEnd(&s->zstream); } diff --git a/block/iscsi.c b/block/iscsi.c index a7bb6970ac..2c9cfc18eb 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -893,7 +893,10 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, nb_blocks = sector_qemu2lun(nb_sectors, iscsilun); if (iscsilun->zeroblock == NULL) { - iscsilun->zeroblock = g_malloc0(iscsilun->block_size); + iscsilun->zeroblock = g_try_malloc0(iscsilun->block_size); + if (iscsilun->zeroblock == NULL) { + return -ENOMEM; + } } iscsi_co_init_iscsitask(iscsilun, &iTask); diff --git a/block/mirror.c b/block/mirror.c index c7a655fc58..5e7a166b39 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -367,7 +367,12 @@ static void coroutine_fn mirror_run(void *opaque) } end = s->common.len >> BDRV_SECTOR_BITS; - s->buf = qemu_blockalign(bs, s->buf_size); + s->buf = qemu_try_blockalign(bs, s->buf_size); + if (s->buf == NULL) { + ret = -ENOMEM; + goto immediate_exit; + } + sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; mirror_free_init(s); diff --git a/block/nfs.c b/block/nfs.c index 8439e0d389..fe46c33709 100644 --- a/block/nfs.c +++ b/block/nfs.c @@ -172,7 +172,11 @@ static int coroutine_fn nfs_co_writev(BlockDriverState *bs, nfs_co_init_task(client, &task); - buf = g_malloc(nb_sectors * BDRV_SECTOR_SIZE); + buf = g_try_malloc(nb_sectors * BDRV_SECTOR_SIZE); + if (nb_sectors && buf == NULL) { + return -ENOMEM; + } + qemu_iovec_to_buf(iov, 0, buf, nb_sectors * BDRV_SECTOR_SIZE); if (nfs_pwrite_async(client->context, client->fh, diff --git a/block/parallels.c b/block/parallels.c index 1a5bd350b3..7325678a4d 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -105,7 +105,11 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags, ret = -EFBIG; goto fail; } - s->catalog_bitmap = g_malloc(s->catalog_size * 4); + s->catalog_bitmap = g_try_malloc(s->catalog_size * 4); + if (s->catalog_size && s->catalog_bitmap == NULL) { + ret = -ENOMEM; + goto fail; + } ret = bdrv_pread(bs->file, 64, s->catalog_bitmap, s->catalog_size * 4); if (ret < 0) { diff --git a/block/qapi.c b/block/qapi.c index f44f6b4012..79d1e6a9f4 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -28,6 +28,13 @@ #include "qapi-visit.h" #include "qapi/qmp-output-visitor.h" #include "qapi/qmp/types.h" +#ifdef __linux__ +#include <linux/fs.h> +#include <sys/ioctl.h> +#ifndef FS_NOCOW_FL +#define FS_NOCOW_FL 0x00800000 /* Do not cow file */ +#endif +#endif BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs) { @@ -165,19 +172,28 @@ void bdrv_query_image_info(BlockDriverState *bs, ImageInfo **p_info, Error **errp) { - uint64_t total_sectors; + int64_t size; const char *backing_filename; char backing_filename2[1024]; BlockDriverInfo bdi; int ret; Error *err = NULL; - ImageInfo *info = g_new0(ImageInfo, 1); - - bdrv_get_geometry(bs, &total_sectors); + ImageInfo *info; +#ifdef __linux__ + int fd, attr; +#endif + + size = bdrv_getlength(bs); + if (size < 0) { + error_setg_errno(errp, -size, "Can't get size of device '%s'", + bdrv_get_device_name(bs)); + return; + } + info = g_new0(ImageInfo, 1); info->filename = g_strdup(bs->filename); info->format = g_strdup(bdrv_get_format_name(bs)); - info->virtual_size = total_sectors * 512; + info->virtual_size = size; info->actual_size = bdrv_get_allocated_file_size(bs); info->has_actual_size = info->actual_size >= 0; if (bdrv_is_encrypted(bs)) { @@ -195,6 +211,18 @@ void bdrv_query_image_info(BlockDriverState *bs, info->format_specific = bdrv_get_specific_info(bs); info->has_format_specific = info->format_specific != NULL; +#ifdef __linux__ + /* get NOCOW info */ + fd = qemu_open(bs->filename, O_RDONLY | O_NONBLOCK); + if (fd >= 0) { + if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0 && (attr & FS_NOCOW_FL)) { + info->has_nocow = true; + info->nocow = true; + } + qemu_close(fd); + } +#endif + backing_filename = bs->backing_file; if (backing_filename[0] != '\0') { info->backing_filename = g_strdup(backing_filename); @@ -625,4 +653,8 @@ void bdrv_image_info_dump(fprintf_function func_fprintf, void *f, func_fprintf(f, "Format specific information:\n"); bdrv_image_info_specific_dump(func_fprintf, f, info->format_specific); } + + if (info->has_nocow && info->nocow) { + func_fprintf(f, "NOCOW flag: set\n"); + } } diff --git a/block/qcow.c b/block/qcow.c index a874056cf3..67332f03c1 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -182,7 +182,12 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, } s->l1_table_offset = header.l1_table_offset; - s->l1_table = g_malloc(s->l1_size * sizeof(uint64_t)); + s->l1_table = g_try_malloc(s->l1_size * sizeof(uint64_t)); + if (s->l1_table == NULL) { + error_setg(errp, "Could not allocate memory for L1 table"); + ret = -ENOMEM; + goto fail; + } ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)); @@ -193,8 +198,16 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, for(i = 0;i < s->l1_size; i++) { be64_to_cpus(&s->l1_table[i]); } - /* alloc L2 cache */ - s->l2_cache = g_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); + + /* alloc L2 cache (max. 64k * 16 * 8 = 8 MB) */ + s->l2_cache = + qemu_try_blockalign(bs->file, + s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); + if (s->l2_cache == NULL) { + error_setg(errp, "Could not allocate L2 table cache"); + ret = -ENOMEM; + goto fail; + } s->cluster_cache = g_malloc(s->cluster_size); s->cluster_data = g_malloc(s->cluster_size); s->cluster_cache_offset = -1; @@ -226,7 +239,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, fail: g_free(s->l1_table); - g_free(s->l2_cache); + qemu_vfree(s->l2_cache); g_free(s->cluster_cache); g_free(s->cluster_data); return ret; @@ -517,7 +530,10 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, void *orig_buf; if (qiov->niov > 1) { - buf = orig_buf = qemu_blockalign(bs, qiov->size); + buf = orig_buf = qemu_try_blockalign(bs, qiov->size); + if (buf == NULL) { + return -ENOMEM; + } } else { orig_buf = NULL; buf = (uint8_t *)qiov->iov->iov_base; @@ -619,7 +635,10 @@ static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, s->cluster_cache_offset = -1; /* disable compressed cache */ if (qiov->niov > 1) { - buf = orig_buf = qemu_blockalign(bs, qiov->size); + buf = orig_buf = qemu_try_blockalign(bs, qiov->size); + if (buf == NULL) { + return -ENOMEM; + } qemu_iovec_to_buf(qiov, 0, buf, qiov->size); } else { orig_buf = NULL; @@ -685,7 +704,7 @@ static void qcow_close(BlockDriverState *bs) BDRVQcowState *s = bs->opaque; g_free(s->l1_table); - g_free(s->l2_cache); + qemu_vfree(s->l2_cache); g_free(s->cluster_cache); g_free(s->cluster_data); diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c index 8ecbb5bc00..5353b44828 100644 --- a/block/qcow2-cache.c +++ b/block/qcow2-cache.c @@ -53,10 +53,21 @@ Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables) c->entries = g_malloc0(sizeof(*c->entries) * num_tables); for (i = 0; i < c->size; i++) { - c->entries[i].table = qemu_blockalign(bs, s->cluster_size); + c->entries[i].table = qemu_try_blockalign(bs->file, s->cluster_size); + if (c->entries[i].table == NULL) { + goto fail; + } } return c; + +fail: + for (i = 0; i < c->size; i++) { + qemu_vfree(c->entries[i].table); + } + g_free(c->entries); + g_free(c); + return NULL; } int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 4208dc08b5..5b36018b3e 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -72,14 +72,20 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, #endif new_l1_size2 = sizeof(uint64_t) * new_l1_size; - new_l1_table = g_malloc0(align_offset(new_l1_size2, 512)); + new_l1_table = qemu_try_blockalign(bs->file, + align_offset(new_l1_size2, 512)); + if (new_l1_table == NULL) { + return -ENOMEM; + } + memset(new_l1_table, 0, align_offset(new_l1_size2, 512)); + memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t)); /* write new table (align to cluster) */ BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE); new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2); if (new_l1_table_offset < 0) { - g_free(new_l1_table); + qemu_vfree(new_l1_table); return new_l1_table_offset; } @@ -113,7 +119,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, if (ret < 0) { goto fail; } - g_free(s->l1_table); + qemu_vfree(s->l1_table); old_l1_table_offset = s->l1_table_offset; s->l1_table_offset = new_l1_table_offset; s->l1_table = new_l1_table; @@ -123,7 +129,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, QCOW2_DISCARD_OTHER); return 0; fail: - g_free(new_l1_table); + qemu_vfree(new_l1_table); qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2, QCOW2_DISCARD_OTHER); return ret; @@ -372,7 +378,10 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs, } iov.iov_len = n * BDRV_SECTOR_SIZE; - iov.iov_base = qemu_blockalign(bs, iov.iov_len); + iov.iov_base = qemu_try_blockalign(bs, iov.iov_len); + if (iov.iov_base == NULL) { + return -ENOMEM; + } qemu_iovec_init_external(&qiov, &iov, 1); @@ -702,7 +711,11 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters); assert(m->nb_clusters > 0); - old_cluster = g_malloc(m->nb_clusters * sizeof(uint64_t)); + old_cluster = g_try_malloc(m->nb_clusters * sizeof(uint64_t)); + if (old_cluster == NULL) { + ret = -ENOMEM; + goto err; + } /* copy content of unmodified sectors */ ret = perform_cow(bs, m, &m->cow_start); @@ -1106,6 +1119,17 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, return 0; } + /* !*host_offset would overwrite the image header and is reserved for "no + * host offset preferred". If 0 was a valid host offset, it'd trigger the + * following overlap check; do that now to avoid having an invalid value in + * *host_offset. */ + if (!alloc_cluster_offset) { + ret = qcow2_pre_write_overlap_check(bs, 0, alloc_cluster_offset, + nb_clusters * s->cluster_size); + assert(ret < 0); + goto fail; + } + /* * Save info needed for meta data update. * @@ -1562,7 +1586,10 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, if (!is_active_l1) { /* inactive L2 tables require a buffer to be stored in when loading * them from disk */ - l2_table = qemu_blockalign(bs, s->cluster_size); + l2_table = qemu_try_blockalign(bs->file, s->cluster_size); + if (l2_table == NULL) { + return -ENOMEM; + } } for (i = 0; i < l1_size; i++) { @@ -1740,7 +1767,11 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs) nb_clusters = size_to_clusters(s, bs->file->total_sectors * BDRV_SECTOR_SIZE); - expanded_clusters = g_malloc0((nb_clusters + 7) / 8); + expanded_clusters = g_try_malloc0((nb_clusters + 7) / 8); + if (expanded_clusters == NULL) { + ret = -ENOMEM; + goto fail; + } ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size, &expanded_clusters, &nb_clusters); diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index cc6cf743d6..d60e2feb10 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -46,19 +46,25 @@ int qcow2_refcount_init(BlockDriverState *bs) assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t)); refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t); - s->refcount_table = g_malloc(refcount_table_size2); + s->refcount_table = g_try_malloc(refcount_table_size2); + if (s->refcount_table_size > 0) { + if (s->refcount_table == NULL) { + ret = -ENOMEM; + goto fail; + } BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD); ret = bdrv_pread(bs->file, s->refcount_table_offset, s->refcount_table, refcount_table_size2); - if (ret != refcount_table_size2) + if (ret < 0) { goto fail; + } for(i = 0; i < s->refcount_table_size; i++) be64_to_cpus(&s->refcount_table[i]); } return 0; fail: - return -ENOMEM; + return ret; } void qcow2_refcount_close(BlockDriverState *bs) @@ -344,8 +350,14 @@ static int alloc_refcount_block(BlockDriverState *bs, uint64_t meta_offset = (blocks_used * refcount_block_clusters) * s->cluster_size; uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size; - uint16_t *new_blocks = g_malloc0(blocks_clusters * s->cluster_size); - uint64_t *new_table = g_malloc0(table_size * sizeof(uint64_t)); + uint64_t *new_table = g_try_malloc0(table_size * sizeof(uint64_t)); + uint16_t *new_blocks = g_try_malloc0(blocks_clusters * s->cluster_size); + + assert(table_size > 0 && blocks_clusters > 0); + if (new_table == NULL || new_blocks == NULL) { + ret = -ENOMEM; + goto fail_table; + } /* Fill the new refcount table */ memcpy(new_table, s->refcount_table, @@ -424,6 +436,7 @@ static int alloc_refcount_block(BlockDriverState *bs, return -EAGAIN; fail_table: + g_free(new_blocks); g_free(new_table); fail_block: if (*refcount_block != NULL) { @@ -847,7 +860,8 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, int64_t l1_table_offset, int l1_size, int addend) { BDRVQcowState *s = bs->opaque; - uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated; + uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2; + bool l1_allocated = false; int64_t old_offset, old_l2_offset; int i, j, l1_modified = 0, nb_csectors, refcount; int ret; @@ -862,8 +876,12 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, * l1_table_offset when it is the current s->l1_table_offset! Be careful * when changing this! */ if (l1_table_offset != s->l1_table_offset) { - l1_table = g_malloc0(align_offset(l1_size2, 512)); - l1_allocated = 1; + l1_table = g_try_malloc0(align_offset(l1_size2, 512)); + if (l1_size2 && l1_table == NULL) { + ret = -ENOMEM; + goto fail; + } + l1_allocated = true; ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2); if (ret < 0) { @@ -875,7 +893,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, } else { assert(l1_size == s->l1_size); l1_table = s->l1_table; - l1_allocated = 0; + l1_allocated = false; } for(i = 0; i < l1_size; i++) { @@ -1197,7 +1215,11 @@ static int check_refcounts_l1(BlockDriverState *bs, if (l1_size2 == 0) { l1_table = NULL; } else { - l1_table = g_malloc(l1_size2); + l1_table = g_try_malloc(l1_size2); + if (l1_table == NULL) { + ret = -ENOMEM; + goto fail; + } if (bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2) != l1_size2) goto fail; @@ -1501,7 +1523,11 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, return -EFBIG; } - refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t)); + refcount_table = g_try_malloc0(nb_clusters * sizeof(uint16_t)); + if (nb_clusters && refcount_table == NULL) { + res->check_errors++; + return -ENOMEM; + } res->bfi.total_clusters = size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE); @@ -1753,9 +1779,13 @@ int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, uint64_t l1_ofs = s->snapshots[i].l1_table_offset; uint32_t l1_sz = s->snapshots[i].l1_size; uint64_t l1_sz2 = l1_sz * sizeof(uint64_t); - uint64_t *l1 = g_malloc(l1_sz2); + uint64_t *l1 = g_try_malloc(l1_sz2); int ret; + if (l1_sz2 && l1 == NULL) { + return -ENOMEM; + } + ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2); if (ret < 0) { g_free(l1); diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 0aa9defbe2..f67b47282f 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -381,7 +381,12 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) sn->l1_table_offset = l1_table_offset; sn->l1_size = s->l1_size; - l1_table = g_malloc(s->l1_size * sizeof(uint64_t)); + l1_table = g_try_malloc(s->l1_size * sizeof(uint64_t)); + if (s->l1_size && l1_table == NULL) { + ret = -ENOMEM; + goto fail; + } + for(i = 0; i < s->l1_size; i++) { l1_table[i] = cpu_to_be64(s->l1_table[i]); } @@ -499,7 +504,11 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) * Decrease the refcount referenced by the old one only when the L1 * table is overwritten. */ - sn_l1_table = g_malloc0(cur_l1_bytes); + sn_l1_table = g_try_malloc0(cur_l1_bytes); + if (cur_l1_bytes && sn_l1_table == NULL) { + ret = -ENOMEM; + goto fail; + } ret = bdrv_pread(bs->file, sn->l1_table_offset, sn_l1_table, sn_l1_bytes); if (ret < 0) { @@ -698,17 +707,21 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs, return -EFBIG; } new_l1_bytes = sn->l1_size * sizeof(uint64_t); - new_l1_table = g_malloc0(align_offset(new_l1_bytes, 512)); + new_l1_table = qemu_try_blockalign(bs->file, + align_offset(new_l1_bytes, 512)); + if (new_l1_table == NULL) { + return -ENOMEM; + } ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes); if (ret < 0) { error_setg(errp, "Failed to read l1 table for snapshot"); - g_free(new_l1_table); + qemu_vfree(new_l1_table); return ret; } /* Switch the L1 table */ - g_free(s->l1_table); + qemu_vfree(s->l1_table); s->l1_size = sn->l1_size; s->l1_table_offset = sn->l1_table_offset; diff --git a/block/qcow2.c b/block/qcow2.c index 1e3ab6bd02..435e0e11d0 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -688,8 +688,13 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, if (s->l1_size > 0) { - s->l1_table = g_malloc0( + s->l1_table = qemu_try_blockalign(bs->file, align_offset(s->l1_size * sizeof(uint64_t), 512)); + if (s->l1_table == NULL) { + error_setg(errp, "Could not allocate L1 table"); + ret = -ENOMEM; + goto fail; + } ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)); if (ret < 0) { @@ -704,11 +709,22 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, /* alloc L2 table/refcount block cache */ s->l2_table_cache = qcow2_cache_create(bs, L2_CACHE_SIZE); s->refcount_block_cache = qcow2_cache_create(bs, REFCOUNT_CACHE_SIZE); + if (s->l2_table_cache == NULL || s->refcount_block_cache == NULL) { + error_setg(errp, "Could not allocate metadata caches"); + ret = -ENOMEM; + goto fail; + } s->cluster_cache = g_malloc(s->cluster_size); /* one more sector for decompressed data alignment */ - s->cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size - + 512); + s->cluster_data = qemu_try_blockalign(bs->file, QCOW_MAX_CRYPT_CLUSTERS + * s->cluster_size + 512); + if (s->cluster_data == NULL) { + error_setg(errp, "Could not allocate temporary cluster buffer"); + ret = -ENOMEM; + goto fail; + } + s->cluster_cache_offset = -1; s->flags = flags; @@ -852,7 +868,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, cleanup_unknown_header_ext(bs); qcow2_free_snapshots(bs); qcow2_refcount_close(bs); - g_free(s->l1_table); + qemu_vfree(s->l1_table); /* else pre-write overlap checks in cache_destroy may crash */ s->l1_table = NULL; if (s->l2_table_cache) { @@ -1082,7 +1098,12 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, */ if (!cluster_data) { cluster_data = - qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); + qemu_try_blockalign(bs->file, QCOW_MAX_CRYPT_CLUSTERS + * s->cluster_size); + if (cluster_data == NULL) { + ret = -ENOMEM; + goto fail; + } } assert(cur_nr_sectors <= @@ -1182,8 +1203,13 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, if (s->crypt_method) { if (!cluster_data) { - cluster_data = qemu_blockalign(bs, QCOW_MAX_CRYPT_CLUSTERS * - s->cluster_size); + cluster_data = qemu_try_blockalign(bs->file, + QCOW_MAX_CRYPT_CLUSTERS + * s->cluster_size); + if (cluster_data == NULL) { + ret = -ENOMEM; + goto fail; + } } assert(hd_qiov.size <= @@ -1270,7 +1296,7 @@ fail: static void qcow2_close(BlockDriverState *bs) { BDRVQcowState *s = bs->opaque; - g_free(s->l1_table); + qemu_vfree(s->l1_table); /* else pre-write overlap checks in cache_destroy may crash */ s->l1_table = NULL; @@ -1557,7 +1583,7 @@ static int preallocate(BlockDriverState *bs) int ret; QCowL2Meta *meta; - nb_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS; + nb_sectors = bdrv_nb_sectors(bs); offset = 0; while (nb_sectors) { @@ -1947,7 +1973,6 @@ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, /* align end of file to a sector boundary to ease reading with sector based I/Os */ cluster_offset = bdrv_getlength(bs->file); - cluster_offset = (cluster_offset + 511) & ~511; bdrv_truncate(bs->file, cluster_offset); return 0; } diff --git a/block/qed-check.c b/block/qed-check.c index b473dcd61f..40a882cc93 100644 --- a/block/qed-check.c +++ b/block/qed-check.c @@ -227,8 +227,11 @@ int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix) }; int ret; - check.used_clusters = g_malloc0(((check.nclusters + 31) / 32) * - sizeof(check.used_clusters[0])); + check.used_clusters = g_try_malloc0(((check.nclusters + 31) / 32) * + sizeof(check.used_clusters[0])); + if (check.nclusters && check.used_clusters == NULL) { + return -ENOMEM; + } check.result->bfi.total_clusters = (s->header.image_size + s->header.cluster_size - 1) / diff --git a/block/qed.c b/block/qed.c index 7944832181..ba395af76a 100644 --- a/block/qed.c +++ b/block/qed.c @@ -1240,7 +1240,11 @@ static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) struct iovec *iov = acb->qiov->iov; if (!iov->iov_base) { - iov->iov_base = qemu_blockalign(acb->common.bs, iov->iov_len); + iov->iov_base = qemu_try_blockalign(acb->common.bs, iov->iov_len); + if (iov->iov_base == NULL) { + qed_aio_complete(acb, -ENOMEM); + return; + } memset(iov->iov_base, 0, iov->iov_len); } } diff --git a/block/raw-posix.c b/block/raw-posix.c index 8e9758e920..1194eb00ad 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -798,7 +798,11 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) * Ok, we have to do it the hard way, copy all segments into * a single aligned buffer. */ - buf = qemu_blockalign(aiocb->bs, aiocb->aio_nbytes); + buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes); + if (buf == NULL) { + return -ENOMEM; + } + if (aiocb->aio_type & QEMU_AIO_WRITE) { char *p = buf; int i; diff --git a/block/rbd.c b/block/rbd.c index 2b797d3e8b..4459102adf 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -617,7 +617,7 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs, RBDAIOCmd cmd) { RBDAIOCB *acb; - RADOSCB *rcb; + RADOSCB *rcb = NULL; rbd_completion_t c; int64_t off, size; char *buf; @@ -631,7 +631,10 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs, if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) { acb->bounce = NULL; } else { - acb->bounce = qemu_blockalign(bs, qiov->size); + acb->bounce = qemu_try_blockalign(bs, qiov->size); + if (acb->bounce == NULL) { + goto failed; + } } acb->ret = 0; acb->error = 0; diff --git a/block/vdi.c b/block/vdi.c index 197bd77c97..adc6aa9a5f 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -53,13 +53,6 @@ #include "block/block_int.h" #include "qemu/module.h" #include "migration/migration.h" -#ifdef __linux__ -#include <linux/fs.h> -#include <sys/ioctl.h> -#ifndef FS_NOCOW_FL -#define FS_NOCOW_FL 0x00800000 /* Do not cow file */ -#endif -#endif #if defined(CONFIG_UUID) #include <uuid/uuid.h> @@ -299,7 +292,12 @@ static int vdi_check(BlockDriverState *bs, BdrvCheckResult *res, return -ENOTSUP; } - bmap = g_malloc(s->header.blocks_in_image * sizeof(uint32_t)); + bmap = g_try_malloc(s->header.blocks_in_image * sizeof(uint32_t)); + if (s->header.blocks_in_image && bmap == NULL) { + res->check_errors++; + return -ENOMEM; + } + memset(bmap, 0xff, s->header.blocks_in_image * sizeof(uint32_t)); /* Check block map and value of blocks_allocated. */ @@ -357,23 +355,23 @@ static int vdi_make_empty(BlockDriverState *bs) static int vdi_probe(const uint8_t *buf, int buf_size, const char *filename) { const VdiHeader *header = (const VdiHeader *)buf; - int result = 0; + int ret = 0; logout("\n"); if (buf_size < sizeof(*header)) { /* Header too small, no VDI. */ } else if (le32_to_cpu(header->signature) == VDI_SIGNATURE) { - result = 100; + ret = 100; } - if (result == 0) { + if (ret == 0) { logout("no vdi image\n"); } else { logout("%s", header->text); } - return result; + return ret; } static int vdi_open(BlockDriverState *bs, QDict *options, int flags, @@ -478,7 +476,12 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags, bmap_size = header.blocks_in_image * sizeof(uint32_t); bmap_size = (bmap_size + SECTOR_SIZE - 1) / SECTOR_SIZE; - s->bmap = g_malloc(bmap_size * SECTOR_SIZE); + s->bmap = qemu_try_blockalign(bs->file, bmap_size * SECTOR_SIZE); + if (s->bmap == NULL) { + ret = -ENOMEM; + goto fail; + } + ret = bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap, bmap_size); if (ret < 0) { goto fail_free_bmap; @@ -493,7 +496,7 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags, return 0; fail_free_bmap: - g_free(s->bmap); + qemu_vfree(s->bmap); fail: return ret; @@ -681,8 +684,7 @@ static int vdi_co_write(BlockDriverState *bs, static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) { - int fd; - int result = 0; + int ret = 0; uint64_t bytes = 0; uint32_t blocks; size_t block_size = DEFAULT_CLUSTER_SIZE; @@ -690,7 +692,10 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) VdiHeader header; size_t i; size_t bmap_size; - bool nocow = false; + int64_t offset = 0; + Error *local_err = NULL; + BlockDriverState *bs = NULL; + uint32_t *bmap = NULL; logout("\n"); @@ -707,37 +712,25 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) image_type = VDI_TYPE_STATIC; } #endif - nocow = qemu_opt_get_bool_del(opts, BLOCK_OPT_NOCOW, false); if (bytes > VDI_DISK_SIZE_MAX) { - result = -ENOTSUP; + ret = -ENOTSUP; error_setg(errp, "Unsupported VDI image size (size is 0x%" PRIx64 ", max supported is 0x%" PRIx64 ")", bytes, VDI_DISK_SIZE_MAX); goto exit; } - fd = qemu_open(filename, - O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, - 0644); - if (fd < 0) { - result = -errno; + ret = bdrv_create_file(filename, opts, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); goto exit; } - - if (nocow) { -#ifdef __linux__ - /* Set NOCOW flag to solve performance issue on fs like btrfs. - * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value will - * be ignored since any failure of this operation should not block the - * left work. - */ - int attr; - if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) { - attr |= FS_NOCOW_FL; - ioctl(fd, FS_IOC_SETFLAGS, &attr); - } -#endif + ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + NULL, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto exit; } /* We need enough blocks to store the given disk size, @@ -769,13 +762,20 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) vdi_header_print(&header); #endif vdi_header_to_le(&header); - if (write(fd, &header, sizeof(header)) < 0) { - result = -errno; - goto close_and_exit; + ret = bdrv_pwrite_sync(bs, offset, &header, sizeof(header)); + if (ret < 0) { + error_setg(errp, "Error writing header to %s", filename); + goto exit; } + offset += sizeof(header); if (bmap_size > 0) { - uint32_t *bmap = g_malloc0(bmap_size); + bmap = g_try_malloc0(bmap_size); + if (bmap == NULL) { + ret = -ENOMEM; + error_setg(errp, "Could not allocate bmap"); + goto exit; + } for (i = 0; i < blocks; i++) { if (image_type == VDI_TYPE_STATIC) { bmap[i] = i; @@ -783,35 +783,33 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) bmap[i] = VDI_UNALLOCATED; } } - if (write(fd, bmap, bmap_size) < 0) { - result = -errno; - g_free(bmap); - goto close_and_exit; + ret = bdrv_pwrite_sync(bs, offset, bmap, bmap_size); + if (ret < 0) { + error_setg(errp, "Error writing bmap to %s", filename); + goto exit; } - g_free(bmap); + offset += bmap_size; } if (image_type == VDI_TYPE_STATIC) { - if (ftruncate(fd, sizeof(header) + bmap_size + blocks * block_size)) { - result = -errno; - goto close_and_exit; + ret = bdrv_truncate(bs, offset + blocks * block_size); + if (ret < 0) { + error_setg(errp, "Failed to statically allocate %s", filename); + goto exit; } } -close_and_exit: - if ((close(fd) < 0) && !result) { - result = -errno; - } - exit: - return result; + bdrv_unref(bs); + g_free(bmap); + return ret; } static void vdi_close(BlockDriverState *bs) { BDRVVdiState *s = bs->opaque; - g_free(s->bmap); + qemu_vfree(s->bmap); migrate_del_blocker(s->migration_blocker); error_free(s->migration_blocker); diff --git a/block/vhdx-endian.c b/block/vhdx-endian.c index fe879ed995..0640d3f4a9 100644 --- a/block/vhdx-endian.c +++ b/block/vhdx-endian.c @@ -82,8 +82,6 @@ void vhdx_log_desc_le_import(VHDXLogDescriptor *d) assert(d != NULL); le32_to_cpus(&d->signature); - le32_to_cpus(&d->trailing_bytes); - le64_to_cpus(&d->leading_bytes); le64_to_cpus(&d->file_offset); le64_to_cpus(&d->sequence_number); } @@ -99,6 +97,15 @@ void vhdx_log_desc_le_export(VHDXLogDescriptor *d) cpu_to_le64s(&d->sequence_number); } +void vhdx_log_data_le_import(VHDXLogDataSector *d) +{ + assert(d != NULL); + + le32_to_cpus(&d->data_signature); + le32_to_cpus(&d->sequence_high); + le32_to_cpus(&d->sequence_low); +} + void vhdx_log_data_le_export(VHDXLogDataSector *d) { assert(d != NULL); diff --git a/block/vhdx-log.c b/block/vhdx-log.c index a77c040ee0..eb5c7a097b 100644 --- a/block/vhdx-log.c +++ b/block/vhdx-log.c @@ -84,6 +84,7 @@ static int vhdx_log_peek_hdr(BlockDriverState *bs, VHDXLogEntries *log, if (ret < 0) { goto exit; } + vhdx_log_entry_hdr_le_import(hdr); exit: return ret; @@ -211,7 +212,7 @@ static bool vhdx_log_hdr_is_valid(VHDXLogEntries *log, VHDXLogEntryHeader *hdr, { int valid = false; - if (memcmp(&hdr->signature, "loge", 4)) { + if (hdr->signature != VHDX_LOG_SIGNATURE) { goto exit; } @@ -275,12 +276,12 @@ static bool vhdx_log_desc_is_valid(VHDXLogDescriptor *desc, goto exit; } - if (!memcmp(&desc->signature, "zero", 4)) { + if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) { if (desc->zero_length % VHDX_LOG_SECTOR_SIZE == 0) { /* valid */ ret = true; } - } else if (!memcmp(&desc->signature, "desc", 4)) { + } else if (desc->signature == VHDX_LOG_DESC_SIGNATURE) { /* valid */ ret = true; } @@ -327,13 +328,15 @@ static int vhdx_compute_desc_sectors(uint32_t desc_cnt) * passed into this function. Each descriptor will also be validated, * and error returned if any are invalid. */ static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s, - VHDXLogEntries *log, VHDXLogDescEntries **buffer) + VHDXLogEntries *log, VHDXLogDescEntries **buffer, + bool convert_endian) { int ret = 0; uint32_t desc_sectors; uint32_t sectors_read; VHDXLogEntryHeader hdr; VHDXLogDescEntries *desc_entries = NULL; + VHDXLogDescriptor desc; int i; assert(*buffer == NULL); @@ -342,14 +345,19 @@ static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s, if (ret < 0) { goto exit; } - vhdx_log_entry_hdr_le_import(&hdr); + if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) { ret = -EINVAL; goto exit; } desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count); - desc_entries = qemu_blockalign(bs, desc_sectors * VHDX_LOG_SECTOR_SIZE); + desc_entries = qemu_try_blockalign(bs->file, + desc_sectors * VHDX_LOG_SECTOR_SIZE); + if (desc_entries == NULL) { + ret = -ENOMEM; + goto exit; + } ret = vhdx_log_read_sectors(bs, log, §ors_read, desc_entries, desc_sectors, false); @@ -363,12 +371,19 @@ static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s, /* put in proper endianness, and validate each desc */ for (i = 0; i < hdr.descriptor_count; i++) { - vhdx_log_desc_le_import(&desc_entries->desc[i]); - if (vhdx_log_desc_is_valid(&desc_entries->desc[i], &hdr) == false) { + desc = desc_entries->desc[i]; + vhdx_log_desc_le_import(&desc); + if (convert_endian) { + desc_entries->desc[i] = desc; + } + if (vhdx_log_desc_is_valid(&desc, &hdr) == false) { ret = -EINVAL; goto free_and_exit; } } + if (convert_endian) { + desc_entries->hdr = hdr; + } *buffer = desc_entries; goto exit; @@ -403,7 +418,7 @@ static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc, buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); - if (!memcmp(&desc->signature, "desc", 4)) { + if (desc->signature == VHDX_LOG_DESC_SIGNATURE) { /* data sector */ if (data == NULL) { ret = -EFAULT; @@ -431,10 +446,15 @@ static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc, memcpy(buffer+offset, &desc->trailing_bytes, 4); - } else if (!memcmp(&desc->signature, "zero", 4)) { + } else if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) { /* write 'count' sectors of sector */ memset(buffer, 0, VHDX_LOG_SECTOR_SIZE); count = desc->zero_length / VHDX_LOG_SECTOR_SIZE; + } else { + error_report("Invalid VHDX log descriptor entry signature 0x%" PRIx32, + desc->signature); + ret = -EINVAL; + goto exit; } file_offset = desc->file_offset; @@ -493,13 +513,13 @@ static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s, goto exit; } - ret = vhdx_log_read_desc(bs, s, &logs->log, &desc_entries); + ret = vhdx_log_read_desc(bs, s, &logs->log, &desc_entries, true); if (ret < 0) { goto exit; } for (i = 0; i < desc_entries->hdr.descriptor_count; i++) { - if (!memcmp(&desc_entries->desc[i].signature, "desc", 4)) { + if (desc_entries->desc[i].signature == VHDX_LOG_DESC_SIGNATURE) { /* data sector, so read a sector to flush */ ret = vhdx_log_read_sectors(bs, &logs->log, §ors_read, data, 1, false); @@ -510,6 +530,7 @@ static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s, ret = -EINVAL; goto exit; } + vhdx_log_data_le_import(data); } ret = vhdx_log_flush_desc(bs, &desc_entries->desc[i], data); @@ -558,9 +579,6 @@ static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s, goto inc_and_exit; } - vhdx_log_entry_hdr_le_import(&hdr); - - if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) { goto inc_and_exit; } @@ -573,13 +591,13 @@ static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s, desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count); - /* Read desc sectors, and calculate log checksum */ + /* Read all log sectors, and calculate log checksum */ total_sectors = hdr.entry_length / VHDX_LOG_SECTOR_SIZE; /* read_desc() will increment the read idx */ - ret = vhdx_log_read_desc(bs, s, log, &desc_buffer); + ret = vhdx_log_read_desc(bs, s, log, &desc_buffer, false); if (ret < 0) { goto free_and_exit; } @@ -602,7 +620,7 @@ static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s, } } crc ^= 0xffffffff; - if (crc != desc_buffer->hdr.checksum) { + if (crc != hdr.checksum) { goto free_and_exit; } @@ -962,7 +980,6 @@ static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s, * last data sector */ vhdx_update_checksum(buffer, total_length, offsetof(VHDXLogEntryHeader, checksum)); - cpu_to_le32s((uint32_t *)(buffer + 4)); /* now write to the log */ ret = vhdx_log_write_sectors(bs, &s->log, §ors_written, buffer, diff --git a/block/vhdx.c b/block/vhdx.c index fedcf9f9ca..f666940db7 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -135,10 +135,8 @@ typedef struct VHDXSectorInfo { * buf: buffer pointer * size: size of buffer (must be > crc_offset+4) * - * Note: The resulting checksum is in the CPU endianness, not necessarily - * in the file format endianness (LE). Any header export to disk should - * make sure that vhdx_header_le_export() is used to convert to the - * correct endianness + * Note: The buffer should have all multi-byte data in little-endian format, + * and the resulting checksum is in little endian format. */ uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset) { @@ -149,6 +147,7 @@ uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset) memset(buf + crc_offset, 0, sizeof(crc)); crc = crc32c(0xffffffff, buf, size); + cpu_to_le32s(&crc); memcpy(buf + crc_offset, &crc, sizeof(crc)); return crc; @@ -300,7 +299,7 @@ static int vhdx_write_header(BlockDriverState *bs_file, VHDXHeader *hdr, { uint8_t *buffer = NULL; int ret; - VHDXHeader header_le; + VHDXHeader *header_le; assert(bs_file != NULL); assert(hdr != NULL); @@ -321,11 +320,12 @@ static int vhdx_write_header(BlockDriverState *bs_file, VHDXHeader *hdr, } /* overwrite the actual VHDXHeader portion */ - memcpy(buffer, hdr, sizeof(VHDXHeader)); - hdr->checksum = vhdx_update_checksum(buffer, VHDX_HEADER_SIZE, - offsetof(VHDXHeader, checksum)); - vhdx_header_le_export(hdr, &header_le); - ret = bdrv_pwrite_sync(bs_file, offset, &header_le, sizeof(VHDXHeader)); + header_le = (VHDXHeader *)buffer; + memcpy(header_le, hdr, sizeof(VHDXHeader)); + vhdx_header_le_export(hdr, header_le); + vhdx_update_checksum(buffer, VHDX_HEADER_SIZE, + offsetof(VHDXHeader, checksum)); + ret = bdrv_pwrite_sync(bs_file, offset, header_le, sizeof(VHDXHeader)); exit: qemu_vfree(buffer); @@ -432,13 +432,14 @@ static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s, } /* copy over just the relevant portion that we need */ memcpy(header1, buffer, sizeof(VHDXHeader)); - vhdx_header_le_import(header1); - if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) && - !memcmp(&header1->signature, "head", 4) && - header1->version == 1) { - h1_seq = header1->sequence_number; - h1_valid = true; + if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4)) { + vhdx_header_le_import(header1); + if (header1->signature == VHDX_HEADER_SIGNATURE && + header1->version == 1) { + h1_seq = header1->sequence_number; + h1_valid = true; + } } ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE); @@ -447,13 +448,14 @@ static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s, } /* copy over just the relevant portion that we need */ memcpy(header2, buffer, sizeof(VHDXHeader)); - vhdx_header_le_import(header2); - if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4) && - !memcmp(&header2->signature, "head", 4) && - header2->version == 1) { - h2_seq = header2->sequence_number; - h2_valid = true; + if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4)) { + vhdx_header_le_import(header2); + if (header2->signature == VHDX_HEADER_SIGNATURE && + header2->version == 1) { + h2_seq = header2->sequence_number; + h2_valid = true; + } } /* If there is only 1 valid header (or no valid headers), we @@ -519,15 +521,21 @@ static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s) goto fail; } memcpy(&s->rt, buffer, sizeof(s->rt)); - vhdx_region_header_le_import(&s->rt); offset += sizeof(s->rt); - if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4) || - memcmp(&s->rt.signature, "regi", 4)) { + if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4)) { ret = -EINVAL; goto fail; } + vhdx_region_header_le_import(&s->rt); + + if (s->rt.signature != VHDX_REGION_SIGNATURE) { + ret = -EINVAL; + goto fail; + } + + /* Per spec, maximum region table entry count is 2047 */ if (s->rt.entry_count > 2047) { ret = -EINVAL; @@ -630,7 +638,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) vhdx_metadata_header_le_import(&s->metadata_hdr); - if (memcmp(&s->metadata_hdr.signature, "metadata", 8)) { + if (s->metadata_hdr.signature != VHDX_METADATA_SIGNATURE) { ret = -EINVAL; goto exit; } @@ -950,7 +958,11 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, } /* s->bat is freed in vhdx_close() */ - s->bat = qemu_blockalign(bs, s->bat_rt.length); + s->bat = qemu_try_blockalign(bs->file, s->bat_rt.length); + if (s->bat == NULL) { + ret = -ENOMEM; + goto fail; + } ret = bdrv_pread(bs->file, s->bat_offset, s->bat, s->bat_rt.length); if (ret < 0) { @@ -1540,7 +1552,8 @@ exit: */ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s, uint64_t image_size, VHDXImageType type, - bool use_zero_blocks, VHDXRegionTableEntry *rt_bat) + bool use_zero_blocks, uint64_t file_offset, + uint32_t length) { int ret = 0; uint64_t data_file_offset; @@ -1555,7 +1568,7 @@ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s, /* this gives a data start after BAT/bitmap entries, and well * past any metadata entries (with a 4 MB buffer for future * expansion */ - data_file_offset = rt_bat->file_offset + rt_bat->length + 5 * MiB; + data_file_offset = file_offset + length + 5 * MiB; total_sectors = image_size >> s->logical_sector_size_bits; if (type == VHDX_TYPE_DYNAMIC) { @@ -1579,7 +1592,11 @@ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s, use_zero_blocks || bdrv_has_zero_init(bs) == 0) { /* for a fixed file, the default BAT entry is not zero */ - s->bat = g_malloc0(rt_bat->length); + s->bat = g_try_malloc0(length); + if (length && s->bat != NULL) { + ret = -ENOMEM; + goto exit; + } block_state = type == VHDX_TYPE_FIXED ? PAYLOAD_BLOCK_FULLY_PRESENT : PAYLOAD_BLOCK_NOT_PRESENT; block_state = use_zero_blocks ? PAYLOAD_BLOCK_ZERO : block_state; @@ -1594,7 +1611,7 @@ static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s, cpu_to_le64s(&s->bat[sinfo.bat_idx]); sector_num += s->sectors_per_block; } - ret = bdrv_pwrite(bs, rt_bat->file_offset, s->bat, rt_bat->length); + ret = bdrv_pwrite(bs, file_offset, s->bat, length); if (ret < 0) { goto exit; } @@ -1626,6 +1643,8 @@ static int vhdx_create_new_region_table(BlockDriverState *bs, int ret = 0; uint32_t offset = 0; void *buffer = NULL; + uint64_t bat_file_offset; + uint32_t bat_length; BDRVVHDXState *s = NULL; VHDXRegionTableHeader *region_table; VHDXRegionTableEntry *rt_bat; @@ -1674,19 +1693,26 @@ static int vhdx_create_new_region_table(BlockDriverState *bs, rt_metadata->length = 1 * MiB; /* min size, and more than enough */ *metadata_offset = rt_metadata->file_offset; + bat_file_offset = rt_bat->file_offset; + bat_length = rt_bat->length; + + vhdx_region_header_le_export(region_table); + vhdx_region_entry_le_export(rt_bat); + vhdx_region_entry_le_export(rt_metadata); + vhdx_update_checksum(buffer, VHDX_HEADER_BLOCK_SIZE, offsetof(VHDXRegionTableHeader, checksum)); /* The region table gives us the data we need to create the BAT, * so do that now */ - ret = vhdx_create_bat(bs, s, image_size, type, use_zero_blocks, rt_bat); + ret = vhdx_create_bat(bs, s, image_size, type, use_zero_blocks, + bat_file_offset, bat_length); + if (ret < 0) { + goto exit; + } /* Now write out the region headers to disk */ - vhdx_region_header_le_export(region_table); - vhdx_region_entry_le_export(rt_bat); - vhdx_region_entry_le_export(rt_metadata); - ret = bdrv_pwrite(bs, VHDX_REGION_TABLE_OFFSET, buffer, VHDX_HEADER_BLOCK_SIZE); if (ret < 0) { diff --git a/block/vhdx.h b/block/vhdx.h index 5370010c59..b4a12a081e 100644 --- a/block/vhdx.h +++ b/block/vhdx.h @@ -435,6 +435,7 @@ void vhdx_header_le_import(VHDXHeader *h); void vhdx_header_le_export(VHDXHeader *orig_h, VHDXHeader *new_h); void vhdx_log_desc_le_import(VHDXLogDescriptor *d); void vhdx_log_desc_le_export(VHDXLogDescriptor *d); +void vhdx_log_data_le_import(VHDXLogDataSector *d); void vhdx_log_data_le_export(VHDXLogDataSector *d); void vhdx_log_entry_hdr_le_import(VHDXLogEntryHeader *hdr); void vhdx_log_entry_hdr_le_export(VHDXLogEntryHeader *hdr); diff --git a/block/vmdk.c b/block/vmdk.c index 0517bbaf91..01412a8939 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -106,6 +106,7 @@ typedef struct VmdkExtent { uint32_t l2_cache_counts[L2_CACHE_SIZE]; int64_t cluster_sectors; + int64_t next_cluster_sector; char *type; } VmdkExtent; @@ -124,7 +125,6 @@ typedef struct BDRVVmdkState { } BDRVVmdkState; typedef struct VmdkMetaData { - uint32_t offset; unsigned int l1_index; unsigned int l2_index; unsigned int l2_offset; @@ -397,6 +397,7 @@ static int vmdk_add_extent(BlockDriverState *bs, { VmdkExtent *extent; BDRVVmdkState *s = bs->opaque; + int64_t length; if (cluster_sectors > 0x200000) { /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */ @@ -412,6 +413,11 @@ static int vmdk_add_extent(BlockDriverState *bs, return -EFBIG; } + length = bdrv_getlength(file); + if (length < 0) { + return length; + } + s->extents = g_realloc(s->extents, (s->num_extents + 1) * sizeof(VmdkExtent)); extent = &s->extents[s->num_extents]; @@ -427,6 +433,8 @@ static int vmdk_add_extent(BlockDriverState *bs, extent->l1_entry_sectors = l2_size * cluster_sectors; extent->l2_size = l2_size; extent->cluster_sectors = flat ? sectors : cluster_sectors; + extent->next_cluster_sector = + ROUND_UP(DIV_ROUND_UP(length, BDRV_SECTOR_SIZE), cluster_sectors); if (s->num_extents > 1) { extent->end_sector = (*(extent - 1)).end_sector + extent->sectors; @@ -448,7 +456,11 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent, /* read the L1 table */ l1_size = extent->l1_size * sizeof(uint32_t); - extent->l1_table = g_malloc(l1_size); + extent->l1_table = g_try_malloc(l1_size); + if (l1_size && extent->l1_table == NULL) { + return -ENOMEM; + } + ret = bdrv_pread(extent->file, extent->l1_table_offset, extent->l1_table, @@ -464,7 +476,11 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent, } if (extent->l1_backup_table_offset) { - extent->l1_backup_table = g_malloc(l1_size); + extent->l1_backup_table = g_try_malloc(l1_size); + if (l1_size && extent->l1_backup_table == NULL) { + ret = -ENOMEM; + goto fail_l1; + } ret = bdrv_pread(extent->file, extent->l1_backup_table_offset, extent->l1_backup_table, @@ -669,8 +685,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) { l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9; } - if (bdrv_getlength(file) < - le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE) { + if (bdrv_nb_sectors(file) < le64_to_cpu(header.grain_offset)) { error_setg(errp, "File truncated, expecting at least %" PRId64 " bytes", (int64_t)(le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE)); @@ -952,57 +967,97 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) } } +/** + * get_whole_cluster + * + * Copy backing file's cluster that covers @sector_num, otherwise write zero, + * to the cluster at @cluster_sector_num. + * + * If @skip_start_sector < @skip_end_sector, the relative range + * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave + * it for call to write user data in the request. + */ static int get_whole_cluster(BlockDriverState *bs, - VmdkExtent *extent, - uint64_t cluster_offset, - uint64_t offset, - bool allocate) + VmdkExtent *extent, + uint64_t cluster_sector_num, + uint64_t sector_num, + uint64_t skip_start_sector, + uint64_t skip_end_sector) { int ret = VMDK_OK; - uint8_t *whole_grain = NULL; + int64_t cluster_bytes; + uint8_t *whole_grain; + /* For COW, align request sector_num to cluster start */ + sector_num = QEMU_ALIGN_DOWN(sector_num, extent->cluster_sectors); + cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS; + whole_grain = qemu_blockalign(bs, cluster_bytes); + + if (!bs->backing_hd) { + memset(whole_grain, 0, skip_start_sector << BDRV_SECTOR_BITS); + memset(whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), 0, + cluster_bytes - (skip_end_sector << BDRV_SECTOR_BITS)); + } + + assert(skip_end_sector <= extent->cluster_sectors); /* we will be here if it's first write on non-exist grain(cluster). * try to read from parent image, if exist */ - if (bs->backing_hd) { - whole_grain = - qemu_blockalign(bs, extent->cluster_sectors << BDRV_SECTOR_BITS); - if (!vmdk_is_cid_valid(bs)) { - ret = VMDK_ERROR; - goto exit; - } + if (bs->backing_hd && !vmdk_is_cid_valid(bs)) { + ret = VMDK_ERROR; + goto exit; + } - /* floor offset to cluster */ - offset -= offset % (extent->cluster_sectors * 512); - ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain, - extent->cluster_sectors); + /* Read backing data before skip range */ + if (skip_start_sector > 0) { + if (bs->backing_hd) { + ret = bdrv_read(bs->backing_hd, sector_num, + whole_grain, skip_start_sector); + if (ret < 0) { + ret = VMDK_ERROR; + goto exit; + } + } + ret = bdrv_write(extent->file, cluster_sector_num, whole_grain, + skip_start_sector); if (ret < 0) { ret = VMDK_ERROR; goto exit; } - - /* Write grain only into the active image */ - ret = bdrv_write(extent->file, cluster_offset, whole_grain, - extent->cluster_sectors); + } + /* Read backing data after skip range */ + if (skip_end_sector < extent->cluster_sectors) { + if (bs->backing_hd) { + ret = bdrv_read(bs->backing_hd, sector_num + skip_end_sector, + whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), + extent->cluster_sectors - skip_end_sector); + if (ret < 0) { + ret = VMDK_ERROR; + goto exit; + } + } + ret = bdrv_write(extent->file, cluster_sector_num + skip_end_sector, + whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), + extent->cluster_sectors - skip_end_sector); if (ret < 0) { ret = VMDK_ERROR; goto exit; } } + exit: qemu_vfree(whole_grain); return ret; } -static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data) +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, + uint32_t offset) { - uint32_t offset; - QEMU_BUILD_BUG_ON(sizeof(offset) != sizeof(m_data->offset)); - offset = cpu_to_le32(m_data->offset); + offset = cpu_to_le32(offset); /* update L2 table */ if (bdrv_pwrite_sync( extent->file, ((int64_t)m_data->l2_offset * 512) - + (m_data->l2_index * sizeof(m_data->offset)), + + (m_data->l2_index * sizeof(offset)), &offset, sizeof(offset)) < 0) { return VMDK_ERROR; } @@ -1012,7 +1067,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data) if (bdrv_pwrite_sync( extent->file, ((int64_t)m_data->l2_offset * 512) - + (m_data->l2_index * sizeof(m_data->offset)), + + (m_data->l2_index * sizeof(offset)), &offset, sizeof(offset)) < 0) { return VMDK_ERROR; } @@ -1024,17 +1079,41 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data) return VMDK_OK; } +/** + * get_cluster_offset + * + * Look up cluster offset in extent file by sector number, and store in + * @cluster_offset. + * + * For flat extents, the start offset as parsed from the description file is + * returned. + * + * For sparse extents, look up in L1, L2 table. If allocate is true, return an + * offset for a new cluster and update L2 cache. If there is a backing file, + * COW is done before returning; otherwise, zeroes are written to the allocated + * cluster. Both COW and zero writing skips the sector range + * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller + * has new data to write there. + * + * Returns: VMDK_OK if cluster exists and mapped in the image. + * VMDK_UNALLOC if cluster is not mapped and @allocate is false. + * VMDK_ERROR if failed. + */ static int get_cluster_offset(BlockDriverState *bs, - VmdkExtent *extent, - VmdkMetaData *m_data, - uint64_t offset, - int allocate, - uint64_t *cluster_offset) + VmdkExtent *extent, + VmdkMetaData *m_data, + uint64_t offset, + bool allocate, + uint64_t *cluster_offset, + uint64_t skip_start_sector, + uint64_t skip_end_sector) { unsigned int l1_index, l2_offset, l2_index; int min_index, i, j; uint32_t min_count, *l2_table; bool zeroed = false; + int64_t ret; + int32_t cluster_sector; if (m_data) { m_data->valid = 0; @@ -1088,52 +1167,41 @@ static int get_cluster_offset(BlockDriverState *bs, extent->l2_cache_counts[min_index] = 1; found: l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; - *cluster_offset = le32_to_cpu(l2_table[l2_index]); + cluster_sector = le32_to_cpu(l2_table[l2_index]); if (m_data) { m_data->valid = 1; m_data->l1_index = l1_index; m_data->l2_index = l2_index; - m_data->offset = *cluster_offset; m_data->l2_offset = l2_offset; m_data->l2_cache_entry = &l2_table[l2_index]; } - if (extent->has_zero_grain && *cluster_offset == VMDK_GTE_ZEROED) { + if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { zeroed = true; } - if (!*cluster_offset || zeroed) { + if (!cluster_sector || zeroed) { if (!allocate) { return zeroed ? VMDK_ZEROED : VMDK_UNALLOC; } - /* Avoid the L2 tables update for the images that have snapshots. */ - *cluster_offset = bdrv_getlength(extent->file); - if (!extent->compressed) { - bdrv_truncate( - extent->file, - *cluster_offset + (extent->cluster_sectors << 9) - ); - } - - *cluster_offset >>= 9; - l2_table[l2_index] = cpu_to_le32(*cluster_offset); + cluster_sector = extent->next_cluster_sector; + extent->next_cluster_sector += extent->cluster_sectors; /* First of all we write grain itself, to avoid race condition * that may to corrupt the image. * This problem may occur because of insufficient space on host disk * or inappropriate VM shutdown. */ - if (get_whole_cluster( - bs, extent, *cluster_offset, offset, allocate) == -1) { - return VMDK_ERROR; - } - - if (m_data) { - m_data->offset = *cluster_offset; + ret = get_whole_cluster(bs, extent, + cluster_sector, + offset >> BDRV_SECTOR_BITS, + skip_start_sector, skip_end_sector); + if (ret) { + return ret; } } - *cluster_offset <<= 9; + *cluster_offset = cluster_sector << BDRV_SECTOR_BITS; return VMDK_OK; } @@ -1168,7 +1236,8 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, } qemu_co_mutex_lock(&s->lock); ret = get_cluster_offset(bs, extent, NULL, - sector_num * 512, 0, &offset); + sector_num * 512, false, &offset, + 0, 0); qemu_co_mutex_unlock(&s->lock); switch (ret) { @@ -1321,9 +1390,9 @@ static int vmdk_read(BlockDriverState *bs, int64_t sector_num, if (!extent) { return -EIO; } - ret = get_cluster_offset( - bs, extent, NULL, - sector_num << 9, 0, &cluster_offset); + ret = get_cluster_offset(bs, extent, NULL, + sector_num << 9, false, &cluster_offset, + 0, 0); extent_begin_sector = extent->end_sector - extent->sectors; extent_relative_sector_num = sector_num - extent_begin_sector; index_in_cluster = extent_relative_sector_num % extent->cluster_sectors; @@ -1404,12 +1473,17 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, if (!extent) { return -EIO; } - ret = get_cluster_offset( - bs, - extent, - &m_data, - sector_num << 9, !extent->compressed, - &cluster_offset); + extent_begin_sector = extent->end_sector - extent->sectors; + extent_relative_sector_num = sector_num - extent_begin_sector; + index_in_cluster = extent_relative_sector_num % extent->cluster_sectors; + n = extent->cluster_sectors - index_in_cluster; + if (n > nb_sectors) { + n = nb_sectors; + } + ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9, + !(extent->compressed || zeroed), + &cluster_offset, + index_in_cluster, index_in_cluster + n); if (extent->compressed) { if (ret == VMDK_OK) { /* Refuse write to allocated cluster for streamOptimized */ @@ -1418,24 +1492,13 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, return -EIO; } else { /* allocate */ - ret = get_cluster_offset( - bs, - extent, - &m_data, - sector_num << 9, 1, - &cluster_offset); + ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9, + true, &cluster_offset, 0, 0); } } if (ret == VMDK_ERROR) { return -EINVAL; } - extent_begin_sector = extent->end_sector - extent->sectors; - extent_relative_sector_num = sector_num - extent_begin_sector; - index_in_cluster = extent_relative_sector_num % extent->cluster_sectors; - n = extent->cluster_sectors - index_in_cluster; - if (n > nb_sectors) { - n = nb_sectors; - } if (zeroed) { /* Do zeroed write, buf is ignored */ if (extent->has_zero_grain && @@ -1443,9 +1506,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, n >= extent->cluster_sectors) { n = extent->cluster_sectors; if (!zero_dry_run) { - m_data.offset = VMDK_GTE_ZEROED; /* update L2 tables */ - if (vmdk_L2update(extent, &m_data) != VMDK_OK) { + if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED) + != VMDK_OK) { return -EIO; } } @@ -1461,7 +1524,9 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, } if (m_data.valid) { /* update L2 tables */ - if (vmdk_L2update(extent, &m_data) != VMDK_OK) { + if (vmdk_L2update(extent, &m_data, + cluster_offset >> BDRV_SECTOR_BITS) + != VMDK_OK) { return -EIO; } } @@ -1999,7 +2064,7 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, BDRVVmdkState *s = bs->opaque; VmdkExtent *extent = NULL; int64_t sector_num = 0; - int64_t total_sectors = bdrv_getlength(bs) / BDRV_SECTOR_SIZE; + int64_t total_sectors = bdrv_nb_sectors(bs); int ret; uint64_t cluster_offset; @@ -2020,7 +2085,7 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, } ret = get_cluster_offset(bs, extent, NULL, sector_num << BDRV_SECTOR_BITS, - 0, &cluster_offset); + false, &cluster_offset, 0, 0); if (ret == VMDK_ERROR) { fprintf(stderr, "ERROR: could not get cluster_offset for sector %" diff --git a/block/vpc.c b/block/vpc.c index 8b376a40be..055efc42d2 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -29,13 +29,6 @@ #if defined(CONFIG_UUID) #include <uuid/uuid.h> #endif -#ifdef __linux__ -#include <linux/fs.h> -#include <sys/ioctl.h> -#ifndef FS_NOCOW_FL -#define FS_NOCOW_FL 0x00800000 /* Do not cow file */ -#endif -#endif /**************************************************************/ @@ -276,7 +269,11 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - s->pagetable = qemu_blockalign(bs, s->max_table_entries * 4); + s->pagetable = qemu_try_blockalign(bs->file, s->max_table_entries * 4); + if (s->pagetable == NULL) { + ret = -ENOMEM; + goto fail; + } s->bat_offset = be64_to_cpu(dyndisk_header->table_offset); @@ -656,39 +653,41 @@ static int calculate_geometry(int64_t total_sectors, uint16_t* cyls, return 0; } -static int create_dynamic_disk(int fd, uint8_t *buf, int64_t total_sectors) +static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf, + int64_t total_sectors) { VHDDynDiskHeader *dyndisk_header = (VHDDynDiskHeader *) buf; size_t block_size, num_bat_entries; int i; - int ret = -EIO; + int ret; + int64_t offset = 0; // Write the footer (twice: at the beginning and at the end) block_size = 0x200000; num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512); - if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) { + ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE); + if (ret) { goto fail; } - if (lseek(fd, 1536 + ((num_bat_entries * 4 + 511) & ~511), SEEK_SET) < 0) { - goto fail; - } - if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) { + offset = 1536 + ((num_bat_entries * 4 + 511) & ~511); + ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE); + if (ret < 0) { goto fail; } // Write the initial BAT - if (lseek(fd, 3 * 512, SEEK_SET) < 0) { - goto fail; - } + offset = 3 * 512; memset(buf, 0xFF, 512); for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) { - if (write(fd, buf, 512) != 512) { + ret = bdrv_pwrite_sync(bs, offset, buf, 512); + if (ret < 0) { goto fail; } + offset += 512; } // Prepare the Dynamic Disk Header @@ -709,39 +708,35 @@ static int create_dynamic_disk(int fd, uint8_t *buf, int64_t total_sectors) dyndisk_header->checksum = be32_to_cpu(vpc_checksum(buf, 1024)); // Write the header - if (lseek(fd, 512, SEEK_SET) < 0) { - goto fail; - } + offset = 512; - if (write(fd, buf, 1024) != 1024) { + ret = bdrv_pwrite_sync(bs, offset, buf, 1024); + if (ret < 0) { goto fail; } - ret = 0; fail: return ret; } -static int create_fixed_disk(int fd, uint8_t *buf, int64_t total_size) +static int create_fixed_disk(BlockDriverState *bs, uint8_t *buf, + int64_t total_size) { - int ret = -EIO; + int ret; /* Add footer to total size */ - total_size += 512; - if (ftruncate(fd, total_size) != 0) { - ret = -errno; - goto fail; - } - if (lseek(fd, -512, SEEK_END) < 0) { - goto fail; - } - if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE) { - goto fail; + total_size += HEADER_SIZE; + + ret = bdrv_truncate(bs, total_size); + if (ret < 0) { + return ret; } - ret = 0; + ret = bdrv_pwrite_sync(bs, total_size - HEADER_SIZE, buf, HEADER_SIZE); + if (ret < 0) { + return ret; + } - fail: return ret; } @@ -750,7 +745,7 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) uint8_t buf[1024]; VHDFooter *footer = (VHDFooter *) buf; char *disk_type_param; - int fd, i; + int i; uint16_t cyls = 0; uint8_t heads = 0; uint8_t secs_per_cyl = 0; @@ -758,7 +753,8 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) int64_t total_size; int disk_type; int ret = -EIO; - bool nocow = false; + Error *local_err = NULL; + BlockDriverState *bs = NULL; /* Read out options */ total_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0); @@ -775,28 +771,17 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) } else { disk_type = VHD_DYNAMIC; } - nocow = qemu_opt_get_bool_del(opts, BLOCK_OPT_NOCOW, false); - /* Create the file */ - fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); - if (fd < 0) { - ret = -EIO; + ret = bdrv_create_file(filename, opts, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); goto out; } - - if (nocow) { -#ifdef __linux__ - /* Set NOCOW flag to solve performance issue on fs like btrfs. - * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value will - * be ignored since any failure of this operation should not block the - * left work. - */ - int attr; - if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) { - attr |= FS_NOCOW_FL; - ioctl(fd, FS_IOC_SETFLAGS, &attr); - } -#endif + ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + NULL, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto out; } /* @@ -810,7 +795,7 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) &secs_per_cyl)) { ret = -EFBIG; - goto fail; + goto out; } } @@ -856,14 +841,13 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) footer->checksum = be32_to_cpu(vpc_checksum(buf, HEADER_SIZE)); if (disk_type == VHD_DYNAMIC) { - ret = create_dynamic_disk(fd, buf, total_sectors); + ret = create_dynamic_disk(bs, buf, total_sectors); } else { - ret = create_fixed_disk(fd, buf, total_size); + ret = create_fixed_disk(bs, buf, total_size); } -fail: - qemu_close(fd); out: + bdrv_unref(bs); g_free(disk_type_param); return ret; } diff --git a/block/win32-aio.c b/block/win32-aio.c index 8e417f70ae..5030e3274d 100644 --- a/block/win32-aio.c +++ b/block/win32-aio.c @@ -139,7 +139,10 @@ BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs, waiocb->is_read = (type == QEMU_AIO_READ); if (qiov->niov > 1) { - waiocb->buf = qemu_blockalign(bs, qiov->size); + waiocb->buf = qemu_try_blockalign(bs, qiov->size); + if (waiocb->buf == NULL) { + goto out; + } if (type & QEMU_AIO_WRITE) { iov_to_buf(qiov->iov, qiov->niov, 0, waiocb->buf, qiov->size); } @@ -168,6 +171,7 @@ BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs, out_dec_count: aio->count--; +out: qemu_aio_release(waiocb); return NULL; } @@ -326,6 +326,7 @@ seccomp="" glusterfs="" glusterfs_discard="no" glusterfs_zerofill="no" +archipelago="" virtio_blk_data_plane="" gtk="" gtkabi="" @@ -1087,6 +1088,10 @@ for opt do ;; --enable-glusterfs) glusterfs="yes" ;; + --disable-archipelago) archipelago="no" + ;; + --enable-archipelago) archipelago="yes" + ;; --disable-virtio-blk-data-plane) virtio_blk_data_plane="no" ;; --enable-virtio-blk-data-plane) virtio_blk_data_plane="yes" @@ -1382,6 +1387,8 @@ Advanced options (experts only): --enable-coroutine-pool enable coroutine freelist (better performance) --enable-glusterfs enable GlusterFS backend --disable-glusterfs disable GlusterFS backend + --enable-archipelago enable Archipelago backend + --disable-archipelago disable Archipelago backend --enable-gcov enable test coverage analysis with gcov --gcov=GCOV use specified gcov [$gcov_tool] --disable-tpm disable TPM support @@ -3072,6 +3079,33 @@ EOF fi fi + +########################################## +# archipelago probe +if test "$archipelago" != "no" ; then + cat > $TMPC <<EOF +#include <stdio.h> +#include <xseg/xseg.h> +#include <xseg/protocol.h> +int main(void) { + xseg_initialize(); + return 0; +} +EOF + archipelago_libs=-lxseg + if compile_prog "" "$archipelago_libs"; then + archipelago="yes" + libs_tools="$archipelago_libs $libs_tools" + libs_softmmu="$archipelago_libs $libs_softmmu" + else + if test "$archipelago" = "yes" ; then + feature_not_found "Archipelago backend support" "Install libxseg devel" + fi + archipelago="no" + fi +fi + + ########################################## # glusterfs probe if test "$glusterfs" != "no" ; then @@ -3087,7 +3121,8 @@ if test "$glusterfs" != "no" ; then fi else if test "$glusterfs" = "yes" ; then - feature_not_found "GlusterFS backend support" "Install glusterfs-api devel" + feature_not_found "GlusterFS backend support" \ + "Install glusterfs-api devel >= 3" fi glusterfs="no" fi @@ -3532,7 +3567,8 @@ EOF spice_server_version=$($pkg_config --modversion spice-server) else if test "$spice" = "yes" ; then - feature_not_found "spice" "Install spice-server and spice-protocol devel" + feature_not_found "spice" \ + "Install spice-server(>=0.12.0) and spice-protocol(>=0.12.3) devel" fi spice="no" fi @@ -3563,7 +3599,7 @@ EOF smartcard_nss="yes" else if test "$smartcard_nss" = "yes"; then - feature_not_found "nss" + feature_not_found "nss" "Install nss devel >= 3.12.8" fi smartcard_nss="no" fi @@ -3579,7 +3615,7 @@ if test "$libusb" != "no" ; then libs_softmmu="$libs_softmmu $libusb_libs" else if test "$libusb" = "yes"; then - feature_not_found "libusb" "Install libusb devel" + feature_not_found "libusb" "Install libusb devel >= 1.0.13" fi libusb="no" fi @@ -4004,7 +4040,7 @@ if test "$libnfs" != "no" ; then LIBS="$LIBS $libnfs_libs" else if test "$libnfs" = "yes" ; then - feature_not_found "libnfs" + feature_not_found "libnfs" "Install libnfs devel >= 1.9.3" fi libnfs="no" fi @@ -4251,6 +4287,7 @@ echo "seccomp support $seccomp" echo "coroutine backend $coroutine" echo "coroutine pool $coroutine_pool" echo "GlusterFS support $glusterfs" +echo "Archipelago support $archipelago" echo "virtio-blk-data-plane $virtio_blk_data_plane" echo "gcov $gcov_tool" echo "gcov enabled $gcov" @@ -4689,6 +4726,11 @@ if test "$glusterfs_zerofill" = "yes" ; then echo "CONFIG_GLUSTERFS_ZEROFILL=y" >> $config_host_mak fi +if test "$archipelago" = "yes" ; then + echo "CONFIG_ARCHIPELAGO=m" >> $config_host_mak + echo "ARCHIPELAGO_LIBS=$archipelago_libs" >> $config_host_mak +fi + if test "$libssh2" = "yes" ; then echo "CONFIG_LIBSSH2=m" >> $config_host_mak echo "LIBSSH2_CFLAGS=$libssh2_cflags" >> $config_host_mak diff --git a/docs/multiple-iothreads.txt b/docs/multiple-iothreads.txt new file mode 100644 index 0000000000..40b8419916 --- /dev/null +++ b/docs/multiple-iothreads.txt @@ -0,0 +1,134 @@ +Copyright (c) 2014 Red Hat Inc. + +This work is licensed under the terms of the GNU GPL, version 2 or later. See +the COPYING file in the top-level directory. + + +This document explains the IOThread feature and how to write code that runs +outside the QEMU global mutex. + +The main loop and IOThreads +--------------------------- +QEMU is an event-driven program that can do several things at once using an +event loop. The VNC server and the QMP monitor are both processed from the +same event loop, which monitors their file descriptors until they become +readable and then invokes a callback. + +The default event loop is called the main loop (see main-loop.c). It is +possible to create additional event loop threads using -object +iothread,id=my-iothread. + +Side note: The main loop and IOThread are both event loops but their code is +not shared completely. Sometimes it is useful to remember that although they +are conceptually similar they are currently not interchangeable. + +Why IOThreads are useful +------------------------ +IOThreads allow the user to control the placement of work. The main loop is a +scalability bottleneck on hosts with many CPUs. Work can be spread across +several IOThreads instead of just one main loop. When set up correctly this +can improve I/O latency and reduce jitter seen by the guest. + +The main loop is also deeply associated with the QEMU global mutex, which is a +scalability bottleneck in itself. vCPU threads and the main loop use the QEMU +global mutex to serialize execution of QEMU code. This mutex is necessary +because a lot of QEMU's code historically was not thread-safe. + +The fact that all I/O processing is done in a single main loop and that the +QEMU global mutex is contended by all vCPU threads and the main loop explain +why it is desirable to place work into IOThreads. + +The experimental virtio-blk data-plane implementation has been benchmarked and +shows these effects: +ftp://public.dhe.ibm.com/linux/pdfs/KVM_Virtualized_IO_Performance_Paper.pdf + +How to program for IOThreads +---------------------------- +The main difference between legacy code and new code that can run in an +IOThread is dealing explicitly with the event loop object, AioContext +(see include/block/aio.h). Code that only works in the main loop +implicitly uses the main loop's AioContext. Code that supports running +in IOThreads must be aware of its AioContext. + +AioContext supports the following services: + * File descriptor monitoring (read/write/error on POSIX hosts) + * Event notifiers (inter-thread signalling) + * Timers + * Bottom Halves (BH) deferred callbacks + +There are several old APIs that use the main loop AioContext: + * LEGACY qemu_aio_set_fd_handler() - monitor a file descriptor + * LEGACY qemu_aio_set_event_notifier() - monitor an event notifier + * LEGACY timer_new_ms() - create a timer + * LEGACY qemu_bh_new() - create a BH + * LEGACY qemu_aio_wait() - run an event loop iteration + +Since they implicitly work on the main loop they cannot be used in code that +runs in an IOThread. They might cause a crash or deadlock if called from an +IOThread since the QEMU global mutex is not held. + +Instead, use the AioContext functions directly (see include/block/aio.h): + * aio_set_fd_handler() - monitor a file descriptor + * aio_set_event_notifier() - monitor an event notifier + * aio_timer_new() - create a timer + * aio_bh_new() - create a BH + * aio_poll() - run an event loop iteration + +The AioContext can be obtained from the IOThread using +iothread_get_aio_context() or for the main loop using qemu_get_aio_context(). +Code that takes an AioContext argument works both in IOThreads or the main +loop, depending on which AioContext instance the caller passes in. + +How to synchronize with an IOThread +----------------------------------- +AioContext is not thread-safe so some rules must be followed when using file +descriptors, event notifiers, timers, or BHs across threads: + +1. AioContext functions can be called safely from file descriptor, event +notifier, timer, or BH callbacks invoked by the AioContext. No locking is +necessary. + +2. Other threads wishing to access the AioContext must use +aio_context_acquire()/aio_context_release() for mutual exclusion. Once the +context is acquired no other thread can access it or run event loop iterations +in this AioContext. + +aio_context_acquire()/aio_context_release() calls may be nested. This +means you can call them if you're not sure whether #1 applies. + +There is currently no lock ordering rule if a thread needs to acquire multiple +AioContexts simultaneously. Therefore, it is only safe for code holding the +QEMU global mutex to acquire other AioContexts. + +Side note: the best way to schedule a function call across threads is to create +a BH in the target AioContext beforehand and then call qemu_bh_schedule(). No +acquire/release or locking is needed for the qemu_bh_schedule() call. But be +sure to acquire the AioContext for aio_bh_new() if necessary. + +The relationship between AioContext and the block layer +------------------------------------------------------- +The AioContext originates from the QEMU block layer because it provides a +scoped way of running event loop iterations until all work is done. This +feature is used to complete all in-flight block I/O requests (see +bdrv_drain_all()). Nowadays AioContext is a generic event loop that can be +used by any QEMU subsystem. + +The block layer has support for AioContext integrated. Each BlockDriverState +is associated with an AioContext using bdrv_set_aio_context() and +bdrv_get_aio_context(). This allows block layer code to process I/O inside the +right AioContext. Other subsystems may wish to follow a similar approach. + +Block layer code must therefore expect to run in an IOThread and avoid using +old APIs that implicitly use the main loop. See the "How to program for +IOThreads" above for information on how to do that. + +If main loop code such as a QMP function wishes to access a BlockDriverState it +must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure the +IOThread does not run in parallel. + +Long-running jobs (usually in the form of coroutines) are best scheduled in the +BlockDriverState's AioContext to avoid the need to acquire/release around each +bdrv_*() call. Be aware that there is currently no mechanism to get notified +when bdrv_set_aio_context() moves this BlockDriverState to a different +AioContext (see bdrv_detach_aio_context()/bdrv_attach_aio_context()), so you +may need to add this if you want to support long-running jobs. diff --git a/docs/specs/qcow2.txt b/docs/specs/qcow2.txt index 3f713a6447..cfbc8b070c 100644 --- a/docs/specs/qcow2.txt +++ b/docs/specs/qcow2.txt @@ -135,12 +135,12 @@ be stored. Each extension has a structure like the following: Unless stated otherwise, each header extension type shall appear at most once in the same image. -The remaining space between the end of the header extension area and the end of -the first cluster can be used for the backing file name. It is not allowed to -store other data here, so that an implementation can safely modify the header -and add extensions without harming data of compatible features that it -doesn't support. Compatible features that need space for additional data can -use a header extension. +If the image has a backing file then the backing file name should be stored in +the remaining space between the end of the header extension area and the end of +the first cluster. It is not allowed to store other data here, so that an +implementation can safely modify the header and add extensions without harming +data of compatible features that it doesn't support. Compatible features that +need space for additional data can use a header extension. == Feature name table == diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c index aed5b5b3e9..a221d0bfca 100644 --- a/hw/block/xen_disk.c +++ b/hw/block/xen_disk.c @@ -589,6 +589,7 @@ static int blk_send_response_one(struct ioreq *ioreq) break; default: dst = NULL; + return 0; } memcpy(dst, &resp, sizeof(resp)); blkdev->rings.common.rsp_prod_pvt++; diff --git a/include/block/block.h b/include/block/block.h index f08471d7e1..e94b701667 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -275,6 +275,7 @@ BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, const char *backing_file); int bdrv_get_backing_file_depth(BlockDriverState *bs); int bdrv_truncate(BlockDriverState *bs, int64_t offset); +int64_t bdrv_nb_sectors(BlockDriverState *bs); int64_t bdrv_getlength(BlockDriverState *bs); int64_t bdrv_get_allocated_file_size(BlockDriverState *bs); void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr); @@ -454,6 +455,7 @@ void bdrv_img_create(const char *filename, const char *fmt, size_t bdrv_opt_mem_align(BlockDriverState *bs); void bdrv_set_guest_block_size(BlockDriverState *bs, int align); void *qemu_blockalign(BlockDriverState *bs, size_t size); +void *qemu_try_blockalign(BlockDriverState *bs, size_t size); bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov); struct HBitmapIter; diff --git a/include/block/coroutine.h b/include/block/coroutine.h index b408f96b82..b9b7f488c9 100644 --- a/include/block/coroutine.h +++ b/include/block/coroutine.h @@ -223,4 +223,15 @@ void coroutine_fn co_aio_sleep_ns(AioContext *ctx, QEMUClockType type, * Note that this function clobbers the handlers for the file descriptor. */ void coroutine_fn yield_until_fd_readable(int fd); + +/** + * Add or subtract from the coroutine pool size + * + * The coroutine implementation keeps a pool of coroutines to be reused by + * qemu_coroutine_create(). This makes coroutine creation cheap. Heavy + * coroutine users should call this to reserve pool space. Call it again with + * a negative number to release pool space. + */ +void qemu_coroutine_adjust_pool_size(int n); + #endif /* QEMU_COROUTINE_H */ diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index 8480d523f0..9dd43fc2db 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -95,6 +95,7 @@ typedef signed int int_fast16_t; #define qemu_printf printf int qemu_daemon(int nochdir, int noclose); +void *qemu_try_memalign(size_t alignment, size_t size); void *qemu_memalign(size_t alignment, size_t size); void *qemu_anon_ram_alloc(size_t size); void qemu_vfree(void *ptr); diff --git a/qapi/block-core.json b/qapi/block-core.json index e378653a77..fb74c56e32 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -115,6 +115,8 @@ # @format-specific: #optional structure supplying additional format-specific # information (since 1.7) # +# @nocow: #optional info of whether NOCOW flag is set or not. (since 2.2) +# # Since: 1.3 # ## @@ -126,7 +128,8 @@ '*backing-filename': 'str', '*full-backing-filename': 'str', '*backing-filename-format': 'str', '*snapshots': ['SnapshotInfo'], '*backing-image': 'ImageInfo', - '*format-specific': 'ImageInfoSpecific' } } + '*format-specific': 'ImageInfoSpecific', + '*nocow': 'bool' } } ## # @ImageCheck: @@ -194,6 +197,7 @@ # 'file', 'file', 'ftp', 'ftps', 'host_cdrom', 'host_device', # 'host_floppy', 'http', 'https', 'nbd', 'parallels', 'qcow', # 'qcow2', 'raw', 'tftp', 'vdi', 'vmdk', 'vpc', 'vvfat' +# 2.2: 'archipelago' # # @backing_file: #optional the name of the backing file (for copy-on-write) # @@ -1143,7 +1147,7 @@ # Since: 2.0 ## { 'enum': 'BlockdevDriver', - 'data': [ 'file', 'host_device', 'host_cdrom', 'host_floppy', + 'data': [ 'archipelago', 'file', 'host_device', 'host_cdrom', 'host_floppy', 'http', 'https', 'ftp', 'ftps', 'tftp', 'vvfat', 'blkdebug', 'blkverify', 'bochs', 'cloop', 'cow', 'dmg', 'parallels', 'qcow', 'qcow2', 'qed', 'raw', 'vdi', 'vhdx', 'vmdk', 'vpc', 'quorum' ] } @@ -1273,6 +1277,37 @@ '*pass-discard-snapshot': 'bool', '*pass-discard-other': 'bool' } } + +## +# @BlockdevOptionsArchipelago +# +# Driver specific block device options for Archipelago. +# +# @volume: Name of the Archipelago volume image +# +# @mport: #optional The port number on which mapperd is +# listening. This is optional +# and if not specified, QEMU will make Archipelago +# use the default port (1001). +# +# @vport: #optional The port number on which vlmcd is +# listening. This is optional +# and if not specified, QEMU will make Archipelago +# use the default port (501). +# +# @segment: #optional The name of the shared memory segment +# Archipelago stack is using. This is optional +# and if not specified, QEMU will make Archipelago +# use the default value, 'archipelago'. +# Since: 2.2 +## +{ 'type': 'BlockdevOptionsArchipelago', + 'data': { 'volume': 'str', + '*mport': 'int', + '*vport': 'int', + '*segment': 'str' } } + + ## # @BlkdebugEvent # @@ -1416,6 +1451,7 @@ 'base': 'BlockdevOptionsBase', 'discriminator': 'driver', 'data': { + 'archipelago':'BlockdevOptionsArchipelago', 'file': 'BlockdevOptionsFile', 'host_device':'BlockdevOptionsFile', 'host_cdrom': 'BlockdevOptionsFile', diff --git a/qdev-monitor.c b/qdev-monitor.c index f87f3d89cd..5fe5e75a88 100644 --- a/qdev-monitor.c +++ b/qdev-monitor.c @@ -182,9 +182,10 @@ static const char *find_typename_by_alias(const char *alias) int qdev_device_help(QemuOpts *opts) { + Error *local_err = NULL; const char *driver; - Property *prop; - ObjectClass *klass; + DevicePropertyInfoList *prop_list; + DevicePropertyInfoList *prop; driver = qemu_opt_get(opts, "driver"); if (driver && is_help_option(driver)) { @@ -196,35 +197,28 @@ int qdev_device_help(QemuOpts *opts) return 0; } - klass = object_class_by_name(driver); - if (!klass) { + if (!object_class_by_name(driver)) { const char *typename = find_typename_by_alias(driver); if (typename) { driver = typename; - klass = object_class_by_name(driver); } } - if (!object_class_dynamic_cast(klass, TYPE_DEVICE)) { - return 0; + prop_list = qmp_device_list_properties(driver, &local_err); + if (!prop_list) { + error_printf("%s\n", error_get_pretty(local_err)); + error_free(local_err); + return 1; } - do { - for (prop = DEVICE_CLASS(klass)->props; prop && prop->name; prop++) { - /* - * TODO Properties without a parser are just for dirty hacks. - * qdev_prop_ptr is the only such PropertyInfo. It's marked - * for removal. This conditional should be removed along with - * it. - */ - if (!prop->info->set) { - continue; /* no way to set it, don't show */ - } - error_printf("%s.%s=%s\n", driver, prop->name, - prop->info->legacy_name ?: prop->info->name); - } - klass = object_class_get_parent(klass); - } while (klass != object_class_by_name(TYPE_DEVICE)); + + for (prop = prop_list; prop; prop = prop->next) { + error_printf("%s.%s=%s\n", driver, + prop->value->name, + prop->value->type); + } + + qapi_free_DevicePropertyInfoList(prop_list); return 1; } diff --git a/qemu-coroutine.c b/qemu-coroutine.c index 470852100a..bd574aa1b5 100644 --- a/qemu-coroutine.c +++ b/qemu-coroutine.c @@ -19,14 +19,14 @@ #include "block/coroutine_int.h" enum { - /* Maximum free pool size prevents holding too many freed coroutines */ - POOL_MAX_SIZE = 64, + POOL_DEFAULT_SIZE = 64, }; /** Free list to speed up creation */ static QemuMutex pool_lock; static QSLIST_HEAD(, Coroutine) pool = QSLIST_HEAD_INITIALIZER(pool); static unsigned int pool_size; +static unsigned int pool_max_size = POOL_DEFAULT_SIZE; Coroutine *qemu_coroutine_create(CoroutineEntry *entry) { @@ -55,7 +55,7 @@ static void coroutine_delete(Coroutine *co) { if (CONFIG_COROUTINE_POOL) { qemu_mutex_lock(&pool_lock); - if (pool_size < POOL_MAX_SIZE) { + if (pool_size < pool_max_size) { QSLIST_INSERT_HEAD(&pool, co, pool_next); co->caller = NULL; pool_size++; @@ -137,3 +137,23 @@ void coroutine_fn qemu_coroutine_yield(void) self->caller = NULL; coroutine_swap(self, to); } + +void qemu_coroutine_adjust_pool_size(int n) +{ + qemu_mutex_lock(&pool_lock); + + pool_max_size += n; + + /* Callers should never take away more than they added */ + assert(pool_max_size >= POOL_DEFAULT_SIZE); + + /* Trim oversized pool down to new max */ + while (pool_size > pool_max_size) { + Coroutine *co = QSLIST_FIRST(&pool); + QSLIST_REMOVE_HEAD(&pool, pool_next); + pool_size--; + qemu_coroutine_delete(co); + } + + qemu_mutex_unlock(&pool_lock); +} diff --git a/qemu-img.c b/qemu-img.c index 19495bb594..18caa4b450 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -960,7 +960,6 @@ static int img_compare(int argc, char **argv) int64_t sector_num = 0; int64_t nb_sectors; int c, pnum; - uint64_t bs_sectors; uint64_t progress_base; for (;;) { @@ -1022,10 +1021,20 @@ static int img_compare(int argc, char **argv) buf1 = qemu_blockalign(bs1, IO_BUF_SIZE); buf2 = qemu_blockalign(bs2, IO_BUF_SIZE); - bdrv_get_geometry(bs1, &bs_sectors); - total_sectors1 = bs_sectors; - bdrv_get_geometry(bs2, &bs_sectors); - total_sectors2 = bs_sectors; + total_sectors1 = bdrv_nb_sectors(bs1); + if (total_sectors1 < 0) { + error_report("Can't get size of %s: %s", + filename1, strerror(-total_sectors1)); + ret = 4; + goto out; + } + total_sectors2 = bdrv_nb_sectors(bs2); + if (total_sectors2 < 0) { + error_report("Can't get size of %s: %s", + filename2, strerror(-total_sectors2)); + ret = 4; + goto out; + } total_sectors = MIN(total_sectors1, total_sectors2); progress_base = MAX(total_sectors1, total_sectors2); @@ -1187,7 +1196,7 @@ static int img_convert(int argc, char **argv) BlockDriver *drv, *proto_drv; BlockDriverState **bs = NULL, *out_bs = NULL; int64_t total_sectors, nb_sectors, sector_num, bs_offset; - uint64_t bs_sectors; + int64_t *bs_sectors = NULL; uint8_t * buf = NULL; size_t bufsectors = IO_BUF_SIZE / BDRV_SECTOR_SIZE; const uint8_t *buf1; @@ -1328,7 +1337,8 @@ static int img_convert(int argc, char **argv) qemu_progress_print(0, 100); - bs = g_malloc0(bs_n * sizeof(BlockDriverState *)); + bs = g_new0(BlockDriverState *, bs_n); + bs_sectors = g_new(int64_t, bs_n); total_sectors = 0; for (bs_i = 0; bs_i < bs_n; bs_i++) { @@ -1342,8 +1352,14 @@ static int img_convert(int argc, char **argv) ret = -1; goto out; } - bdrv_get_geometry(bs[bs_i], &bs_sectors); - total_sectors += bs_sectors; + bs_sectors[bs_i] = bdrv_nb_sectors(bs[bs_i]); + if (bs_sectors[bs_i] < 0) { + error_report("Could not get size of %s: %s", + argv[optind + bs_i], strerror(-bs_sectors[bs_i])); + ret = -1; + goto out; + } + total_sectors += bs_sectors[bs_i]; } if (sn_opts) { @@ -1461,7 +1477,6 @@ static int img_convert(int argc, char **argv) bs_i = 0; bs_offset = 0; - bdrv_get_geometry(bs[0], &bs_sectors); /* increase bufsectors from the default 4096 (2M) if opt_transfer_length * or discard_alignment of the out_bs is greater. Limit to 32768 (16MB) @@ -1474,13 +1489,13 @@ static int img_convert(int argc, char **argv) buf = qemu_blockalign(out_bs, bufsectors * BDRV_SECTOR_SIZE); if (skip_create) { - int64_t output_length = bdrv_getlength(out_bs); - if (output_length < 0) { + int64_t output_sectors = bdrv_nb_sectors(out_bs); + if (output_sectors < 0) { error_report("unable to get output image length: %s\n", - strerror(-output_length)); + strerror(-output_sectors)); ret = -1; goto out; - } else if (output_length < total_sectors << BDRV_SECTOR_BITS) { + } else if (output_sectors < total_sectors) { error_report("output file is smaller than input file"); ret = -1; goto out; @@ -1528,19 +1543,19 @@ static int img_convert(int argc, char **argv) buf2 = buf; while (remainder > 0) { int nlow; - while (bs_num == bs_sectors) { + while (bs_num == bs_sectors[bs_i]) { + bs_offset += bs_sectors[bs_i]; bs_i++; assert (bs_i < bs_n); - bs_offset += bs_sectors; - bdrv_get_geometry(bs[bs_i], &bs_sectors); bs_num = 0; /* printf("changing part: sector_num=%" PRId64 ", " "bs_i=%d, bs_offset=%" PRId64 ", bs_sectors=%" PRId64 - "\n", sector_num, bs_i, bs_offset, bs_sectors); */ + "\n", sector_num, bs_i, bs_offset, bs_sectors[bs_i]); */ } - assert (bs_num < bs_sectors); + assert (bs_num < bs_sectors[bs_i]); - nlow = (remainder > bs_sectors - bs_num) ? bs_sectors - bs_num : remainder; + nlow = remainder > bs_sectors[bs_i] - bs_num + ? bs_sectors[bs_i] - bs_num : remainder; ret = bdrv_read(bs[bs_i], bs_num, buf2, nlow); if (ret < 0) { @@ -1601,14 +1616,13 @@ restart: break; } - while (sector_num - bs_offset >= bs_sectors) { + while (sector_num - bs_offset >= bs_sectors[bs_i]) { + bs_offset += bs_sectors[bs_i]; bs_i ++; assert (bs_i < bs_n); - bs_offset += bs_sectors; - bdrv_get_geometry(bs[bs_i], &bs_sectors); /* printf("changing part: sector_num=%" PRId64 ", bs_i=%d, " "bs_offset=%" PRId64 ", bs_sectors=%" PRId64 "\n", - sector_num, bs_i, bs_offset, bs_sectors); */ + sector_num, bs_i, bs_offset, bs_sectors[bs_i]); */ } if ((out_baseimg || has_zero_init) && @@ -1661,7 +1675,7 @@ restart: } } - n = MIN(n, bs_sectors - (sector_num - bs_offset)); + n = MIN(n, bs_sectors[bs_i] - (sector_num - bs_offset)); sectors_read += n; if (count_allocated_sectors) { @@ -1719,6 +1733,7 @@ out: } g_free(bs); } + g_free(bs_sectors); fail_getopt: g_free(options); @@ -2418,9 +2433,9 @@ static int img_rebase(int argc, char **argv) * the image is the same as the original one at any time. */ if (!unsafe) { - uint64_t num_sectors; - uint64_t old_backing_num_sectors; - uint64_t new_backing_num_sectors = 0; + int64_t num_sectors; + int64_t old_backing_num_sectors; + int64_t new_backing_num_sectors = 0; uint64_t sector; int n; uint8_t * buf_old; @@ -2430,10 +2445,31 @@ static int img_rebase(int argc, char **argv) buf_old = qemu_blockalign(bs, IO_BUF_SIZE); buf_new = qemu_blockalign(bs, IO_BUF_SIZE); - bdrv_get_geometry(bs, &num_sectors); - bdrv_get_geometry(bs_old_backing, &old_backing_num_sectors); + num_sectors = bdrv_nb_sectors(bs); + if (num_sectors < 0) { + error_report("Could not get size of '%s': %s", + filename, strerror(-num_sectors)); + ret = -1; + goto out; + } + old_backing_num_sectors = bdrv_nb_sectors(bs_old_backing); + if (old_backing_num_sectors < 0) { + char backing_name[1024]; + + bdrv_get_backing_filename(bs, backing_name, sizeof(backing_name)); + error_report("Could not get size of '%s': %s", + backing_name, strerror(-old_backing_num_sectors)); + ret = -1; + goto out; + } if (bs_new_backing) { - bdrv_get_geometry(bs_new_backing, &new_backing_num_sectors); + new_backing_num_sectors = bdrv_nb_sectors(bs_new_backing); + if (new_backing_num_sectors < 0) { + error_report("Could not get size of '%s': %s", + out_baseimg, strerror(-new_backing_num_sectors)); + ret = -1; + goto out; + } } if (num_sectors != 0) { @@ -509,6 +509,7 @@ DevicePropertyInfoList *qmp_device_list_properties(const char *typename, if (strcmp(prop->name, "type") == 0 || strcmp(prop->name, "realized") == 0 || strcmp(prop->name, "hotpluggable") == 0 || + strcmp(prop->name, "hotplugged") == 0 || strcmp(prop->name, "parent_bus") == 0) { continue; } diff --git a/tests/qemu-iotests/059 b/tests/qemu-iotests/059 index 26a2fd3e0e..3c053c29b4 100755 --- a/tests/qemu-iotests/059 +++ b/tests/qemu-iotests/059 @@ -114,6 +114,10 @@ echo echo "=== Testing version 3 ===" _use_sample_img iotest-version3.vmdk.bz2 _img_info +for i in {0..99}; do + $QEMU_IO -r -c "read -P $(( i % 10 + 0x30 )) $(( i * 64 * 1024 * 10 + i * 512 )) 512" $TEST_IMG \ + | _filter_qemu_io +done echo echo "=== Testing 4TB monolithicFlat creation and IO ===" diff --git a/tests/qemu-iotests/059.out b/tests/qemu-iotests/059.out index eba0dedda0..0dadba658f 100644 --- a/tests/qemu-iotests/059.out +++ b/tests/qemu-iotests/059.out @@ -2056,8 +2056,208 @@ wrote 512/512 bytes at offset 10240 === Testing version 3 === image: TEST_DIR/iotest-version3.IMGFMT file format: IMGFMT -virtual size: 1.0G (1073741824 bytes) +virtual size: 16G (17179869184 bytes) cluster_size: 65536 +read 512/512 bytes at offset 0 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 655872 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 1311744 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 1967616 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 2623488 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 3279360 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 3935232 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 4591104 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 5246976 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 5902848 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 6558720 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 7214592 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 7870464 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 8526336 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 9182208 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 9838080 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 10493952 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 11149824 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 11805696 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 12461568 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 13117440 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 13773312 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 14429184 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 15085056 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 15740928 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 16396800 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 17052672 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 17708544 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 18364416 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 19020288 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 19676160 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 20332032 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 20987904 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 21643776 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 22299648 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 22955520 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 23611392 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 24267264 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 24923136 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 25579008 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 26234880 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 26890752 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 27546624 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 28202496 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 28858368 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 29514240 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 30170112 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 30825984 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 31481856 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 32137728 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 32793600 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 33449472 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 34105344 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 34761216 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 35417088 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 36072960 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 36728832 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 37384704 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 38040576 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 38696448 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 39352320 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 40008192 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 40664064 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 41319936 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 41975808 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 42631680 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 43287552 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 43943424 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 44599296 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 45255168 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 45911040 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 46566912 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 47222784 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 47878656 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 48534528 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 49190400 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 49846272 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 50502144 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 51158016 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 51813888 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 52469760 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 53125632 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 53781504 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 54437376 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 55093248 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 55749120 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 56404992 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 57060864 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 57716736 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 58372608 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 59028480 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 59684352 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 60340224 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 60996096 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 61651968 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 62307840 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 62963712 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 63619584 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 64275456 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 512/512 bytes at offset 64931328 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) === Testing 4TB monolithicFlat creation and IO === Formatting 'TEST_DIR/iotest-version3.IMGFMT', fmt=IMGFMT size=4398046511104 diff --git a/tests/qemu-iotests/060 b/tests/qemu-iotests/060 index 3cffc12fec..830386fdaa 100755 --- a/tests/qemu-iotests/060 +++ b/tests/qemu-iotests/060 @@ -164,6 +164,15 @@ wait_break 0 write 64k 64k resume 0" | $QEMU_IO | _filter_qemu_io +echo +echo "=== Testing unallocated image header ===" +echo +_make_test_img 64M +# Create L1/L2 +$QEMU_IO -c "$OPEN_RW" -c "write 0 64k" | _filter_qemu_io +poke_file "$TEST_IMG" "$rb_offset" "\x00\x00" +$QEMU_IO -c "$OPEN_RW" -c "write 64k 64k" | _filter_qemu_io + # success, all done echo "*** done" rm -f $seq.full diff --git a/tests/qemu-iotests/060.out b/tests/qemu-iotests/060.out index a517948036..c27c952412 100644 --- a/tests/qemu-iotests/060.out +++ b/tests/qemu-iotests/060.out @@ -93,4 +93,12 @@ blkdebug: Suspended request '0' write failed: Input/output error blkdebug: Resuming request '0' aio_write failed: No medium found + +=== Testing unallocated image header === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +qcow2: Preventing invalid write on metadata (overlaps with qcow2_header); image marked as corrupt. +write failed: Input/output error *** done diff --git a/tests/qemu-iotests/084 b/tests/qemu-iotests/084 index cb4d7b729e..ae33c2cba4 100755 --- a/tests/qemu-iotests/084 +++ b/tests/qemu-iotests/084 @@ -1,6 +1,7 @@ #!/bin/bash # -# Test case for VDI header corruption; image too large, and too many blocks +# Test case for VDI header corruption; image too large, and too many blocks. +# Also simple test for creating dynamic and static VDI images. # # Copyright (C) 2013 Red Hat, Inc. # @@ -43,14 +44,25 @@ _supported_fmt vdi _supported_proto generic _supported_os Linux +size=64M ds_offset=368 # disk image size field offset bs_offset=376 # block size field offset bii_offset=384 # block in image field offset echo +echo "=== Statically allocated image creation ===" +echo +_make_test_img $size -o static +_img_info +stat -c"disk image file size in bytes: %s" "${TEST_IMG}" +_cleanup_test_img + +echo echo "=== Testing image size bounds ===" echo -_make_test_img 64M +_make_test_img $size +_img_info +stat -c"disk image file size in bytes: %s" "${TEST_IMG}" # check for image size too large # poke max image size, and appropriate blocks_in_image value diff --git a/tests/qemu-iotests/084.out b/tests/qemu-iotests/084.out index c7120d9b0b..ea29ae0b9d 100644 --- a/tests/qemu-iotests/084.out +++ b/tests/qemu-iotests/084.out @@ -1,8 +1,22 @@ QA output created by 084 +=== Statically allocated image creation === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +image: TEST_DIR/t.IMGFMT +file format: IMGFMT +virtual size: 64M (67108864 bytes) +cluster_size: 1048576 +disk image file size in bytes: 67109888 + === Testing image size bounds === Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 +image: TEST_DIR/t.IMGFMT +file format: IMGFMT +virtual size: 64M (67108864 bytes) +cluster_size: 1048576 +disk image file size in bytes: 1024 Test 1: Maximum size (1024 TB): qemu-img: Could not open 'TEST_DIR/t.IMGFMT': Could not open 'TEST_DIR/t.IMGFMT': Invalid argument diff --git a/tests/qemu-iotests/common b/tests/qemu-iotests/common index e4083f4212..70df659d5b 100644 --- a/tests/qemu-iotests/common +++ b/tests/qemu-iotests/common @@ -152,6 +152,7 @@ check options -nbd test nbd -ssh test ssh -nfs test nfs + -archipelago test archipelago -xdiff graphical mode diff -nocache use O_DIRECT on backing file -misalign misalign memory allocations @@ -263,6 +264,11 @@ testlist options xpand=false ;; + -archipelago) + IMGPROTO=archipelago + xpand=false + ;; + -nocache) CACHEMODE="none" CACHEMODE_IS_DEFAULT=false diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc index e0ea7e3a7d..3fd691e6cd 100644 --- a/tests/qemu-iotests/common.rc +++ b/tests/qemu-iotests/common.rc @@ -64,6 +64,8 @@ elif [ "$IMGPROTO" = "ssh" ]; then elif [ "$IMGPROTO" = "nfs" ]; then TEST_DIR="nfs://127.0.0.1/$TEST_DIR" TEST_IMG=$TEST_DIR/t.$IMGFMT +elif [ "$IMGPROTO" = "archipelago" ]; then + TEST_IMG="archipelago:at.$IMGFMT" else TEST_IMG=$IMGPROTO:$TEST_DIR/t.$IMGFMT fi @@ -163,7 +165,8 @@ _make_test_img() -e "s# lazy_refcounts=\\(on\\|off\\)##g" \ -e "s# block_size=[0-9]\\+##g" \ -e "s# block_state_zero=\\(on\\|off\\)##g" \ - -e "s# log_size=[0-9]\\+##g" + -e "s# log_size=[0-9]\\+##g" \ + -e "s/archipelago:a/TEST_DIR\//g" # Start an NBD server on the image file, which is what we'll be talking to if [ $IMGPROTO = "nbd" ]; then @@ -206,6 +209,10 @@ _cleanup_test_img() rbd --no-progress rm "$TEST_DIR/t.$IMGFMT" > /dev/null ;; + archipelago) + vlmc remove "at.$IMGFMT" > /dev/null + ;; + sheepdog) collie vdi delete "$TEST_DIR/t.$IMGFMT" ;; diff --git a/tests/qemu-iotests/sample_images/iotest-version3.vmdk.bz2 b/tests/qemu-iotests/sample_images/iotest-version3.vmdk.bz2 Binary files differindex 30abf217e7..619329a246 100644 --- a/tests/qemu-iotests/sample_images/iotest-version3.vmdk.bz2 +++ b/tests/qemu-iotests/sample_images/iotest-version3.vmdk.bz2 diff --git a/tests/test-coroutine.c b/tests/test-coroutine.c index 760636ddc4..6e634f4a78 100644 --- a/tests/test-coroutine.c +++ b/tests/test-coroutine.c @@ -288,6 +288,29 @@ static void perf_yield(void) maxcycles, duration); } +static __attribute__((noinline)) void dummy(unsigned *i) +{ + (*i)--; +} + +static void perf_baseline(void) +{ + unsigned int i, maxcycles; + double duration; + + maxcycles = 100000000; + i = maxcycles; + + g_test_timer_start(); + while (i > 0) { + dummy(&i); + } + duration = g_test_timer_elapsed(); + + g_test_message("Function call %u iterations: %f s\n", + maxcycles, duration); +} + int main(int argc, char **argv) { g_test_init(&argc, &argv, NULL); @@ -301,6 +324,7 @@ int main(int argc, char **argv) g_test_add_func("/perf/lifecycle", perf_lifecycle); g_test_add_func("/perf/nesting", perf_nesting); g_test_add_func("/perf/yield", perf_yield); + g_test_add_func("/perf/function-call", perf_baseline); } return g_test_run(); } diff --git a/thread-pool.c b/thread-pool.c index dfb699dd94..23888dcfc4 100644 --- a/thread-pool.c +++ b/thread-pool.c @@ -21,7 +21,6 @@ #include "block/coroutine.h" #include "trace.h" #include "block/block_int.h" -#include "qemu/event_notifier.h" #include "block/thread-pool.h" #include "qemu/main-loop.h" @@ -57,8 +56,8 @@ struct ThreadPoolElement { }; struct ThreadPool { - EventNotifier notifier; AioContext *ctx; + QEMUBH *completion_bh; QemuMutex lock; QemuCond check_cancel; QemuCond worker_stopped; @@ -119,7 +118,7 @@ static void *worker_thread(void *opaque) qemu_cond_broadcast(&pool->check_cancel); } - event_notifier_set(&pool->notifier); + qemu_bh_schedule(pool->completion_bh); } pool->cur_threads--; @@ -168,12 +167,11 @@ static void spawn_thread(ThreadPool *pool) } } -static void event_notifier_ready(EventNotifier *notifier) +static void thread_pool_completion_bh(void *opaque) { - ThreadPool *pool = container_of(notifier, ThreadPool, notifier); + ThreadPool *pool = opaque; ThreadPoolElement *elem, *next; - event_notifier_test_and_clear(notifier); restart: QLIST_FOREACH_SAFE(elem, &pool->head, all, next) { if (elem->state != THREAD_CANCELED && elem->state != THREAD_DONE) { @@ -187,6 +185,12 @@ restart: QLIST_REMOVE(elem, all); /* Read state before ret. */ smp_rmb(); + + /* Schedule ourselves in case elem->common.cb() calls aio_poll() to + * wait for another request that completed at the same time. + */ + qemu_bh_schedule(pool->completion_bh); + elem->common.cb(elem->common.opaque, elem->ret); qemu_aio_release(elem); goto restart; @@ -215,7 +219,7 @@ static void thread_pool_cancel(BlockDriverAIOCB *acb) qemu_sem_timedwait(&pool->sem, 0) == 0) { QTAILQ_REMOVE(&pool->request_list, elem, reqs); elem->state = THREAD_CANCELED; - event_notifier_set(&pool->notifier); + qemu_bh_schedule(pool->completion_bh); } else { pool->pending_cancellations++; while (elem->state != THREAD_CANCELED && elem->state != THREAD_DONE) { @@ -224,7 +228,7 @@ static void thread_pool_cancel(BlockDriverAIOCB *acb) pool->pending_cancellations--; } qemu_mutex_unlock(&pool->lock); - event_notifier_ready(&pool->notifier); + thread_pool_completion_bh(pool); } static const AIOCBInfo thread_pool_aiocb_info = { @@ -293,8 +297,8 @@ static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx) } memset(pool, 0, sizeof(*pool)); - event_notifier_init(&pool->notifier, false); pool->ctx = ctx; + pool->completion_bh = aio_bh_new(ctx, thread_pool_completion_bh, pool); qemu_mutex_init(&pool->lock); qemu_cond_init(&pool->check_cancel); qemu_cond_init(&pool->worker_stopped); @@ -304,8 +308,6 @@ static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx) QLIST_INIT(&pool->head); QTAILQ_INIT(&pool->request_list); - - aio_set_event_notifier(ctx, &pool->notifier, event_notifier_ready); } ThreadPool *thread_pool_new(AioContext *ctx) @@ -339,11 +341,10 @@ void thread_pool_free(ThreadPool *pool) qemu_mutex_unlock(&pool->lock); - aio_set_event_notifier(pool->ctx, &pool->notifier, NULL); + qemu_bh_delete(pool->completion_bh); qemu_sem_destroy(&pool->sem); qemu_cond_destroy(&pool->check_cancel); qemu_cond_destroy(&pool->worker_stopped); qemu_mutex_destroy(&pool->lock); - event_notifier_cleanup(&pool->notifier); g_free(pool); } diff --git a/util/oslib-posix.c b/util/oslib-posix.c index cdbfb2e270..016a047cfc 100644 --- a/util/oslib-posix.c +++ b/util/oslib-posix.c @@ -94,7 +94,7 @@ void *qemu_oom_check(void *ptr) return ptr; } -void *qemu_memalign(size_t alignment, size_t size) +void *qemu_try_memalign(size_t alignment, size_t size) { void *ptr; @@ -106,19 +106,23 @@ void *qemu_memalign(size_t alignment, size_t size) int ret; ret = posix_memalign(&ptr, alignment, size); if (ret != 0) { - fprintf(stderr, "Failed to allocate %zu B: %s\n", - size, strerror(ret)); - abort(); + errno = ret; + ptr = NULL; } #elif defined(CONFIG_BSD) - ptr = qemu_oom_check(valloc(size)); + ptr = valloc(size); #else - ptr = qemu_oom_check(memalign(alignment, size)); + ptr = memalign(alignment, size); #endif trace_qemu_memalign(alignment, size, ptr); return ptr; } +void *qemu_memalign(size_t alignment, size_t size) +{ + return qemu_oom_check(qemu_try_memalign(alignment, size)); +} + /* alloc shared memory pages */ void *qemu_anon_ram_alloc(size_t size) { diff --git a/util/oslib-win32.c b/util/oslib-win32.c index 507cedd84d..a3eab4acba 100644 --- a/util/oslib-win32.c +++ b/util/oslib-win32.c @@ -50,18 +50,23 @@ void *qemu_oom_check(void *ptr) return ptr; } -void *qemu_memalign(size_t alignment, size_t size) +void *qemu_try_memalign(size_t alignment, size_t size) { void *ptr; if (!size) { abort(); } - ptr = qemu_oom_check(VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE)); + ptr = VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE); trace_qemu_memalign(alignment, size, ptr); return ptr; } +void *qemu_memalign(size_t alignment, size_t size) +{ + return qemu_oom_check(qemu_try_memalign(alignment, size)); +} + void *qemu_anon_ram_alloc(size_t size) { void *ptr; |