path: root/block/io.c
Diffstat (limited to 'block/io.c')
-rw-r--r--  block/io.c   242
1 file changed, 141 insertions(+), 101 deletions(-)
diff --git a/block/io.c b/block/io.c
index cfda7148d8..7323f0fb7b 100644
--- a/block/io.c
+++ b/block/io.c
@@ -33,14 +33,13 @@
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
-static BlockAIOCB *bdrv_co_aio_rw_vector(BdrvChild *child,
- int64_t sector_num,
- QEMUIOVector *qiov,
- int nb_sectors,
- BdrvRequestFlags flags,
- BlockCompletionFunc *cb,
- void *opaque,
- bool is_write);
+static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
+ int64_t offset,
+ QEMUIOVector *qiov,
+ BdrvRequestFlags flags,
+ BlockCompletionFunc *cb,
+ void *opaque,
+ bool is_write);
static void coroutine_fn bdrv_co_do_rw(void *opaque);
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
int64_t offset, int count, BdrvRequestFlags flags);
@@ -971,21 +970,25 @@ err:
/*
* Forwards an already correctly aligned request to the BlockDriver. This
- * handles copy on read and zeroing after EOF; any other features must be
- * implemented by the caller.
+ * handles copy on read, zeroing after EOF, and fragmentation of large
+ * reads; any other features must be implemented by the caller.
*/
static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
int64_t align, QEMUIOVector *qiov, int flags)
{
int64_t total_bytes, max_bytes;
- int ret;
+ int ret = 0;
+ uint64_t bytes_remaining = bytes;
+ int max_transfer;
assert(is_power_of_2(align));
assert((offset & (align - 1)) == 0);
assert((bytes & (align - 1)) == 0);
assert(!qiov || bytes == qiov->size);
assert((bs->open_flags & BDRV_O_NO_IO) == 0);
+ max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
+ align);
/* TODO: We would need a per-BDS .supported_read_flags and
* potential fallback support, if we ever implement any read flags
@@ -1024,7 +1027,7 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
}
}
- /* Forward the request to the BlockDriver */
+ /* Forward the request to the BlockDriver, possibly fragmenting it */
total_bytes = bdrv_getlength(bs);
if (total_bytes < 0) {
ret = total_bytes;
@@ -1032,30 +1035,39 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
}
max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
- if (bytes <= max_bytes) {
+ if (bytes <= max_bytes && bytes <= max_transfer) {
ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
- } else if (max_bytes > 0) {
- QEMUIOVector local_qiov;
+ goto out;
+ }
- qemu_iovec_init(&local_qiov, qiov->niov);
- qemu_iovec_concat(&local_qiov, qiov, 0, max_bytes);
+ while (bytes_remaining) {
+ int num;
- ret = bdrv_driver_preadv(bs, offset, max_bytes, &local_qiov, 0);
+ if (max_bytes) {
+ QEMUIOVector local_qiov;
- qemu_iovec_destroy(&local_qiov);
- } else {
- ret = 0;
- }
+ num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
+ assert(num);
+ qemu_iovec_init(&local_qiov, qiov->niov);
+ qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
- /* Reading beyond end of file is supposed to produce zeroes */
- if (ret == 0 && total_bytes < offset + bytes) {
- uint64_t zero_offset = MAX(0, total_bytes - offset);
- uint64_t zero_bytes = offset + bytes - zero_offset;
- qemu_iovec_memset(qiov, zero_offset, 0, zero_bytes);
+ ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
+ num, &local_qiov, 0);
+ max_bytes -= num;
+ qemu_iovec_destroy(&local_qiov);
+ } else {
+ num = bytes_remaining;
+ ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
+ bytes_remaining);
+ }
+ if (ret < 0) {
+ goto out;
+ }
+ bytes_remaining -= num;
}
out:
- return ret;
+ return ret < 0 ? ret : 0;
}
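
The fragment-size arithmetic in the rewritten read loop is easier to follow with concrete numbers. Below is a minimal standalone sketch (not QEMU code; the sizes and the simplified MIN macro are made-up stand-ins) of how each fragment is bounded by both the bytes left before EOF (max_bytes) and the driver's transfer limit (max_transfer), with everything past EOF zero-filled instead of being sent to the driver:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
    uint64_t bytes = 10 * 1024 * 1024;       /* hypothetical 10 MiB read */
    uint64_t max_bytes = 6 * 1024 * 1024;    /* only 6 MiB left before EOF */
    uint64_t max_transfer = 2 * 1024 * 1024; /* hypothetical driver limit */
    uint64_t bytes_remaining = bytes;

    while (bytes_remaining) {
        uint64_t num;

        if (max_bytes) {
            /* bounded by both EOF and the driver's transfer limit */
            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
            printf("read      %8" PRIu64 " bytes at offset %" PRIu64 "\n",
                   num, bytes - bytes_remaining);
            max_bytes -= num;
        } else {
            /* everything past EOF is zero-filled, not read */
            num = bytes_remaining;
            printf("zero-fill %8" PRIu64 " bytes at offset %" PRIu64 "\n",
                   num, bytes - bytes_remaining);
        }
        bytes_remaining -= num;
    }
    return 0;
}

With these numbers the driver sees three 2 MiB reads followed by a single 4 MiB zero fill, which mirrors what the new loop does via bdrv_driver_preadv() and qemu_iovec_memset().
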
/*
@@ -1256,7 +1268,8 @@ fail:
}
/*
- * Forwards an already correctly aligned write request to the BlockDriver.
+ * Forwards an already correctly aligned write request to the BlockDriver,
+ * after possibly fragmenting it.
*/
static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
@@ -1268,6 +1281,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
int64_t start_sector = offset >> BDRV_SECTOR_BITS;
int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
+ uint64_t bytes_remaining = bytes;
+ int max_transfer;
assert(is_power_of_2(align));
assert((offset & (align - 1)) == 0);
@@ -1275,6 +1290,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
assert(!qiov || bytes == qiov->size);
assert((bs->open_flags & BDRV_O_NO_IO) == 0);
assert(!(flags & ~BDRV_REQ_MASK));
+ max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
+ align);
waited = wait_serialising_requests(req);
assert(!waited || !req->serialising);
@@ -1297,9 +1314,34 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
} else if (flags & BDRV_REQ_ZERO_WRITE) {
bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
- } else {
+ } else if (bytes <= max_transfer) {
bdrv_debug_event(bs, BLKDBG_PWRITEV);
ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
+ } else {
+ bdrv_debug_event(bs, BLKDBG_PWRITEV);
+ while (bytes_remaining) {
+ int num = MIN(bytes_remaining, max_transfer);
+ QEMUIOVector local_qiov;
+ int local_flags = flags;
+
+ assert(num);
+ if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
+ !(bs->supported_write_flags & BDRV_REQ_FUA)) {
+ /* If FUA is going to be emulated by flush, we only
+ * need to flush on the last iteration */
+ local_flags &= ~BDRV_REQ_FUA;
+ }
+ qemu_iovec_init(&local_qiov, qiov->niov);
+ qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
+
+ ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
+ num, &local_qiov, local_flags);
+ qemu_iovec_destroy(&local_qiov);
+ if (ret < 0) {
+ break;
+ }
+ bytes_remaining -= num;
+ }
}
bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
@@ -1312,6 +1354,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
if (ret >= 0) {
bs->total_sectors = MAX(bs->total_sectors, end_sector);
+ ret = 0;
}
return ret;
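
The FUA handling in the new write loop deserves a closer look. Here is a standalone sketch (not QEMU code; REQ_FUA and the sizes are hypothetical stand-ins for BDRV_REQ_FUA and the real limits) of the idea that when FUA would be emulated by a flush, only the final fragment keeps the flag, so a fragmented write still costs exactly one flush:

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MIN(a, b)  ((a) < (b) ? (a) : (b))
#define REQ_FUA    0x10    /* hypothetical stand-in for BDRV_REQ_FUA */

int main(void)
{
    uint64_t bytes = 5 * 1024 * 1024;        /* hypothetical 5 MiB write */
    uint64_t max_transfer = 2 * 1024 * 1024; /* hypothetical driver limit */
    int flags = REQ_FUA;
    bool driver_fua = false;                 /* FUA emulated by flush */
    uint64_t bytes_remaining = bytes;

    while (bytes_remaining) {
        uint64_t num = MIN(bytes_remaining, max_transfer);
        int local_flags = flags;

        /* drop FUA on every fragment except the last one */
        if (num < bytes_remaining && (flags & REQ_FUA) && !driver_fua) {
            local_flags &= ~REQ_FUA;
        }
        printf("write %" PRIu64 " bytes at offset %" PRIu64 ", FUA=%d\n",
               num, bytes - bytes_remaining, !!(local_flags & REQ_FUA));
        bytes_remaining -= num;
    }
    return 0;
}

This prints two 2 MiB writes without FUA and a final 1 MiB write with FUA set, matching the comment in the patch about only flushing on the last iteration.
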
@@ -1971,8 +2014,9 @@ BlockAIOCB *bdrv_aio_readv(BdrvChild *child, int64_t sector_num,
{
trace_bdrv_aio_readv(child->bs, sector_num, nb_sectors, opaque);
- return bdrv_co_aio_rw_vector(child, sector_num, qiov, nb_sectors, 0,
- cb, opaque, false);
+ assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
+ return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
+ 0, cb, opaque, false);
}
BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t sector_num,
@@ -1981,8 +2025,9 @@ BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t sector_num,
{
trace_bdrv_aio_writev(child->bs, sector_num, nb_sectors, opaque);
- return bdrv_co_aio_rw_vector(child, sector_num, qiov, nb_sectors, 0,
- cb, opaque, true);
+ assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
+ return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
+ 0, cb, opaque, true);
}
void bdrv_aio_cancel(BlockAIOCB *acb)
@@ -2018,8 +2063,8 @@ typedef struct BlockRequest {
union {
/* Used during read, write, trim */
struct {
- int64_t sector;
- int nb_sectors;
+ int64_t offset;
+ int bytes;
int flags;
QEMUIOVector *qiov;
};
@@ -2083,24 +2128,23 @@ static void coroutine_fn bdrv_co_do_rw(void *opaque)
BlockAIOCBCoroutine *acb = opaque;
if (!acb->is_write) {
- acb->req.error = bdrv_co_do_readv(acb->child, acb->req.sector,
- acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
+ acb->req.error = bdrv_co_preadv(acb->child, acb->req.offset,
+ acb->req.qiov->size, acb->req.qiov, acb->req.flags);
} else {
- acb->req.error = bdrv_co_do_writev(acb->child, acb->req.sector,
- acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
+ acb->req.error = bdrv_co_pwritev(acb->child, acb->req.offset,
+ acb->req.qiov->size, acb->req.qiov, acb->req.flags);
}
bdrv_co_complete(acb);
}
-static BlockAIOCB *bdrv_co_aio_rw_vector(BdrvChild *child,
- int64_t sector_num,
- QEMUIOVector *qiov,
- int nb_sectors,
- BdrvRequestFlags flags,
- BlockCompletionFunc *cb,
- void *opaque,
- bool is_write)
+static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
+ int64_t offset,
+ QEMUIOVector *qiov,
+ BdrvRequestFlags flags,
+ BlockCompletionFunc *cb,
+ void *opaque,
+ bool is_write)
{
Coroutine *co;
BlockAIOCBCoroutine *acb;
@@ -2109,8 +2153,7 @@ static BlockAIOCB *bdrv_co_aio_rw_vector(BdrvChild *child,
acb->child = child;
acb->need_bh = true;
acb->req.error = -EINPROGRESS;
- acb->req.sector = sector_num;
- acb->req.nb_sectors = nb_sectors;
+ acb->req.offset = offset;
acb->req.qiov = qiov;
acb->req.flags = flags;
acb->is_write = is_write;
@@ -2150,30 +2193,29 @@ BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
return &acb->common;
}
-static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
+static void coroutine_fn bdrv_aio_pdiscard_co_entry(void *opaque)
{
BlockAIOCBCoroutine *acb = opaque;
BlockDriverState *bs = acb->common.bs;
- acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
+ acb->req.error = bdrv_co_pdiscard(bs, acb->req.offset, acb->req.bytes);
bdrv_co_complete(acb);
}
-BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
- int64_t sector_num, int nb_sectors,
- BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *bdrv_aio_pdiscard(BlockDriverState *bs, int64_t offset, int count,
+ BlockCompletionFunc *cb, void *opaque)
{
Coroutine *co;
BlockAIOCBCoroutine *acb;
- trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
+ trace_bdrv_aio_pdiscard(bs, offset, count, opaque);
acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
acb->need_bh = true;
acb->req.error = -EINPROGRESS;
- acb->req.sector = sector_num;
- acb->req.nb_sectors = nb_sectors;
- co = qemu_coroutine_create(bdrv_aio_discard_co_entry, acb);
+ acb->req.offset = offset;
+ acb->req.bytes = count;
+ co = qemu_coroutine_create(bdrv_aio_pdiscard_co_entry, acb);
qemu_coroutine_enter(co);
bdrv_co_maybe_schedule_bh(acb);
@@ -2346,28 +2388,29 @@ int bdrv_flush(BlockDriverState *bs)
typedef struct DiscardCo {
BlockDriverState *bs;
- int64_t sector_num;
- int nb_sectors;
+ int64_t offset;
+ int count;
int ret;
} DiscardCo;
-static void coroutine_fn bdrv_discard_co_entry(void *opaque)
+static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
{
DiscardCo *rwco = opaque;
- rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
+ rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->count);
}
-int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
- int nb_sectors)
+int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
+ int count)
{
BdrvTrackedRequest req;
- int max_discard, ret;
+ int max_pdiscard, ret;
+ int head, align;
if (!bs->drv) {
return -ENOMEDIUM;
}
- ret = bdrv_check_request(bs, sector_num, nb_sectors);
+ ret = bdrv_check_byte_request(bs, offset, count);
if (ret < 0) {
return ret;
} else if (bs->read_only) {
@@ -2380,50 +2423,47 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
return 0;
}
- if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
+ if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
+ return 0;
+ }
+
+ /* Discard is advisory, so ignore any unaligned head or tail */
+ align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
+ assert(is_power_of_2(align));
+ head = MIN(count, -offset & (align - 1));
+ if (head) {
+ count -= head;
+ offset += head;
+ }
+ count = QEMU_ALIGN_DOWN(count, align);
+ if (!count) {
return 0;
}
- tracked_request_begin(&req, bs, sector_num << BDRV_SECTOR_BITS,
- nb_sectors << BDRV_SECTOR_BITS, BDRV_TRACKED_DISCARD);
+ tracked_request_begin(&req, bs, offset, count, BDRV_TRACKED_DISCARD);
ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
if (ret < 0) {
goto out;
}
- max_discard = MIN_NON_ZERO(bs->bl.max_pdiscard >> BDRV_SECTOR_BITS,
- BDRV_REQUEST_MAX_SECTORS);
- while (nb_sectors > 0) {
- int ret;
- int num = nb_sectors;
- int discard_alignment = bs->bl.pdiscard_alignment >> BDRV_SECTOR_BITS;
-
- /* align request */
- if (discard_alignment &&
- num >= discard_alignment &&
- sector_num % discard_alignment) {
- if (num > discard_alignment) {
- num = discard_alignment;
- }
- num -= sector_num % discard_alignment;
- }
+ max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
+ align);
- /* limit request size */
- if (num > max_discard) {
- num = max_discard;
- }
+ while (count > 0) {
+ int ret;
+ int num = MIN(count, max_pdiscard);
- if (bs->drv->bdrv_co_discard) {
- ret = bs->drv->bdrv_co_discard(bs, sector_num, num);
+ if (bs->drv->bdrv_co_pdiscard) {
+ ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
} else {
BlockAIOCB *acb;
CoroutineIOCompletion co = {
.coroutine = qemu_coroutine_self(),
};
- acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
- bdrv_co_io_em_complete, &co);
+ acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
+ bdrv_co_io_em_complete, &co);
if (acb == NULL) {
ret = -EIO;
goto out;
@@ -2436,8 +2476,8 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
goto out;
}
- sector_num += num;
- nb_sectors -= num;
+ offset += num;
+ count -= num;
}
ret = 0;
out:
@@ -2448,23 +2488,23 @@ out:
return ret;
}
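
The head/tail trimming in the byte-based discard path is easiest to see with concrete numbers. The following standalone sketch (not QEMU code; ALIGN_DOWN and all sizes are hypothetical stand-ins for QEMU_ALIGN_DOWN and the real limits) repeats the same arithmetic: since discard is advisory, the unaligned head is skipped and the tail rounded down so only whole aligned blocks reach the driver.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define MIN(a, b)        ((a) < (b) ? (a) : (b))
#define ALIGN_DOWN(n, m) ((n) / (m) * (m))

int main(void)
{
    int64_t offset = 4096 + 512;   /* hypothetical unaligned start */
    int64_t count  = 64 * 1024;    /* hypothetical 64 KiB discard */
    int64_t align  = 4096;         /* hypothetical discard alignment */

    /* bytes from offset up to the next alignment boundary */
    int64_t head = MIN(count, -offset & (align - 1));

    offset += head;                      /* skip the unaligned head */
    count  -= head;
    count   = ALIGN_DOWN(count, align);  /* drop the unaligned tail */

    /* prints: discard 61440 bytes at offset 8192 (3584 head bytes skipped) */
    printf("discard %" PRId64 " bytes at offset %" PRId64
           " (%" PRId64 " head bytes skipped)\n", count, offset, head);
    return 0;
}
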
-int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
+int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count)
{
Coroutine *co;
DiscardCo rwco = {
.bs = bs,
- .sector_num = sector_num,
- .nb_sectors = nb_sectors,
+ .offset = offset,
+ .count = count,
.ret = NOT_DONE,
};
if (qemu_in_coroutine()) {
/* Fast-path if already in coroutine context */
- bdrv_discard_co_entry(&rwco);
+ bdrv_pdiscard_co_entry(&rwco);
} else {
AioContext *aio_context = bdrv_get_aio_context(bs);
- co = qemu_coroutine_create(bdrv_discard_co_entry, &rwco);
+ co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
qemu_coroutine_enter(co);
while (rwco.ret == NOT_DONE) {
aio_poll(aio_context, true);