block/backup.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387

/*
 * QEMU backup
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include <stdio.h>
#include <errno.h>
#include <unistd.h>

#include "trace.h"
#include "block/block.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qemu/ratelimit.h"

#define BACKUP_CLUSTER_BITS 16
#define BACKUP_CLUSTER_SIZE (1 << BACKUP_CLUSTER_BITS)
#define BACKUP_SECTORS_PER_CLUSTER (BACKUP_CLUSTER_SIZE / BDRV_SECTOR_SIZE)

#define SLICE_TIME 100000000ULL /* ns */

typedef struct CowRequest {
    int64_t start;
    int64_t end;
    QLIST_ENTRY(CowRequest) list;
    CoQueue wait_queue; /* coroutines blocked on this request */
} CowRequest;

typedef struct BackupBlockJob {
    BlockJob common;
    BlockDriverState *target;
    MirrorSyncMode sync_mode;
    RateLimit limit;
    BlockdevOnError on_source_error;
    BlockdevOnError on_target_error;
    CoRwlock flush_rwlock;
    uint64_t sectors_read;
    HBitmap *bitmap;
    QLIST_HEAD(, CowRequest) inflight_reqs;
} BackupBlockJob;

/* See if in-flight requests overlap and wait for them to complete */
static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
                                                       int64_t start,
                                                       int64_t end)
{
    CowRequest *req;
    bool retry;

    do {
        retry = false;
        QLIST_FOREACH(req, &job->inflight_reqs, list) {
            if (end > req->start && start < req->end) {
                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/* Keep track of an in-flight request */
static void cow_request_begin(CowRequest *req, BackupBlockJob *job,
                                     int64_t start, int64_t end)
{
    req->start = start;
    req->end = end;
    qemu_co_queue_init(&req->wait_queue);
    QLIST_INSERT_HEAD(&job->inflight_reqs, req, list);
}

/* Forget about a completed request */
static void cow_request_end(CowRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

static int coroutine_fn backup_do_cow(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      bool *error_is_read)
{
    BackupBlockJob *job = (BackupBlockJob *)bs->job;
    CowRequest cow_request;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    void *bounce_buffer = NULL;
    int ret = 0;
    int64_t start, end;
    int n;

    qemu_co_rwlock_rdlock(&job->flush_rwlock);

    start = sector_num / BACKUP_SECTORS_PER_CLUSTER;
    end = DIV_ROUND_UP(sector_num + nb_sectors, BACKUP_SECTORS_PER_CLUSTER);

    trace_backup_do_cow_enter(job, start, sector_num, nb_sectors);

    wait_for_overlapping_requests(job, start, end);
    cow_request_begin(&cow_request, job, start, end);

    for (; start < end; start++) {
        if (hbitmap_get(job->bitmap, start)) {
            trace_backup_do_cow_skip(job, start);
            continue; /* already copied */
        }

        trace_backup_do_cow_process(job, start);

        n = MIN(BACKUP_SECTORS_PER_CLUSTER,
                job->common.len / BDRV_SECTOR_SIZE -
                start * BACKUP_SECTORS_PER_CLUSTER);

        if (!bounce_buffer) {
            bounce_buffer = qemu_blockalign(bs, BACKUP_CLUSTER_SIZE);
        }
        iov.iov_base = bounce_buffer;
        iov.iov_len = n * BDRV_SECTOR_SIZE;
        qemu_iovec_init_external(&bounce_qiov, &iov, 1);

        ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n,
                            &bounce_qiov);
        if (ret < 0) {
            trace_backup_do_cow_read_fail(job, start, ret);
            if (error_is_read) {
                *error_is_read = true;
            }
            goto out;
        }

        if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
            ret = bdrv_co_write_zeroes(job->target,
                                       start * BACKUP_SECTORS_PER_CLUSTER,
                                       n, 0);
        } else {
            ret = bdrv_co_writev(job->target,
                                 start * BACKUP_SECTORS_PER_CLUSTER, n,
                                 &bounce_qiov);
        }
        if (ret < 0) {
            trace_backup_do_cow_write_fail(job, start, ret);
            if (error_is_read) {
                *error_is_read = false;
            }
            goto out;
        }

        hbitmap_set(job->bitmap, start, 1);

        /* Publish progress, guest I/O counts as progress too.  Note that the
         * offset field is an opaque progress value, it is not a disk offset.
         */
        job->sectors_read += n;
        job->common.offset += n * BDRV_SECTOR_SIZE;
    }

out:
    if (bounce_buffer) {
        qemu_vfree(bounce_buffer);
    }

    cow_request_end(&cow_request);

    trace_backup_do_cow_return(job, sector_num, nb_sectors, ret);

    qemu_co_rwlock_unlock(&job->flush_rwlock);

    return ret;
}

static int coroutine_fn backup_before_write_notify(
        NotifierWithReturn *notifier,
        void *opaque)
{
    BdrvTrackedRequest *req = opaque;

    return backup_do_cow(req->bs, req->sector_num, req->nb_sectors, NULL);
}

static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);

    if (speed < 0) {
        error_set(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}

static void backup_iostatus_reset(BlockJob *job)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);

    bdrv_iostatus_reset(s->target);
}

static const BlockJobDriver backup_job_driver = {
    .instance_size  = sizeof(BackupBlockJob),
    .job_type       = BLOCK_JOB_TYPE_BACKUP,
    .set_speed      = backup_set_speed,
    .iostatus_reset = backup_iostatus_reset,
};

static BlockErrorAction backup_error_action(BackupBlockJob *job,
                                            bool read, int error)
{
    if (read) {
        return block_job_error_action(&job->common, job->common.bs,
                                      job->on_source_error, true, error);
    } else {
        return block_job_error_action(&job->common, job->target,
                                      job->on_target_error, false, error);
    }
}

static void coroutine_fn backup_run(void *opaque)
{
    BackupBlockJob *job = opaque;
    BlockDriverState *bs = job->common.bs;
    BlockDriverState *target = job->target;
    BlockdevOnError on_target_error = job->on_target_error;
    NotifierWithReturn before_write = {
        .notify = backup_before_write_notify,
    };
    int64_t start, end;
    int ret = 0;

    QLIST_INIT(&job->inflight_reqs);
    qemu_co_rwlock_init(&job->flush_rwlock);

    start = 0;
    end = DIV_ROUND_UP(job->common.len / BDRV_SECTOR_SIZE,
                       BACKUP_SECTORS_PER_CLUSTER);

    job->bitmap = hbitmap_alloc(end, 0);

    bdrv_set_enable_write_cache(target, true);
    bdrv_set_on_error(target, on_target_error, on_target_error);
    bdrv_iostatus_enable(target);

    bdrv_add_before_write_notifier(bs, &before_write);

    if (job->sync_mode == MIRROR_SYNC_MODE_NONE) {
        while (!block_job_is_cancelled(&job->common)) {
            /* Yield until the job is cancelled.  We just let our before_write
             * notify callback service CoW requests. */
            job->common.busy = false;
            qemu_coroutine_yield();
            job->common.busy = true;
        }
    } else {
        /* Both FULL and TOP SYNC_MODE's require copying.. */
        for (; start < end; start++) {
            bool error_is_read;

            if (block_job_is_cancelled(&job->common)) {
                break;
            }

            /* we need to yield so that qemu_aio_flush() returns.
             * (without, VM does not reboot)
             */
            if (job->common.speed) {
                uint64_t delay_ns = ratelimit_calculate_delay(
                        &job->limit, job->sectors_read);
                job->sectors_read = 0;
                block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns);
            } else {
                block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0);
            }

            if (block_job_is_cancelled(&job->common)) {
                break;
            }

            if (job->sync_mode == MIRROR_SYNC_MODE_TOP) {
                int i, n;
                int alloced = 0;

                /* Check to see if these blocks are already in the
                 * backing file. */

                for (i = 0; i < BACKUP_SECTORS_PER_CLUSTER;) {
                    /* bdrv_is_allocated() only returns true/false based
                     * on the first set of sectors it comes across that
                     * are are all in the same state.
                     * For that reason we must verify each sector in the
                     * backup cluster length.  We end up copying more than
                     * needed but at some point that is always the case. */
                    alloced =
                        bdrv_is_allocated(bs,
                                start * BACKUP_SECTORS_PER_CLUSTER + i,
                                BACKUP_SECTORS_PER_CLUSTER - i, &n);
                    i += n;

                    if (alloced == 1) {
                        break;
                    }
                }

                /* If the above loop never found any sectors that are in
                 * the topmost image, skip this backup. */
                if (alloced == 0) {
                    continue;
                }
            }
            /* FULL sync mode we copy the whole drive. */
            ret = backup_do_cow(bs, start * BACKUP_SECTORS_PER_CLUSTER,
                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read);
            if (ret < 0) {
                /* Depending on error action, fail now or retry cluster */
                BlockErrorAction action =
                    backup_error_action(job, error_is_read, -ret);
                if (action == BDRV_ACTION_REPORT) {
                    break;
                } else {
                    start--;
                    continue;
                }
            }
        }
    }

    notifier_with_return_remove(&before_write);

    /* wait until pending backup_do_cow() calls have completed */
    qemu_co_rwlock_wrlock(&job->flush_rwlock);
    qemu_co_rwlock_unlock(&job->flush_rwlock);

    hbitmap_free(job->bitmap);

    bdrv_iostatus_disable(target);
    bdrv_unref(target);

    block_job_completed(&job->common, ret);
}

void backup_start(BlockDriverState *bs, BlockDriverState *target,
                  int64_t speed, MirrorSyncMode sync_mode,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  BlockDriverCompletionFunc *cb, void *opaque,
                  Error **errp)
{
    int64_t len;

    assert(bs);
    assert(target);
    assert(cb);

    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
        !bdrv_iostatus_is_enabled(bs)) {
        error_set(errp, QERR_INVALID_PARAMETER, "on-source-error");
        return;
    }

    len = bdrv_getlength(bs);
    if (len < 0) {
        error_setg_errno(errp, -len, "unable to get length for '%s'",
                         bdrv_get_device_name(bs));
        return;
    }

    BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed,
                                           cb, opaque, errp);
    if (!job) {
        return;
    }

    job->on_source_error = on_source_error;
    job->on_target_error = on_target_error;
    job->target = target;
    job->sync_mode = sync_mode;
    job->common.len = len;
    job->common.co = qemu_coroutine_create(backup_run);
    qemu_coroutine_enter(job->common.co, job);
}