aboutsummaryrefslogtreecommitdiff
path: root/include/block/block.h
blob: 8d5b3ecebdbfd2edf3b572d443abad3f37c30347 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
#ifndef BLOCK_H
#define BLOCK_H

#include "block/aio.h"
#include "block/aio-wait.h"
#include "qemu/iov.h"
#include "qemu/coroutine.h"
#include "block/accounting.h"
#include "block/dirty-bitmap.h"
#include "block/blockjob.h"
#include "qemu/hbitmap.h"
#include "qemu/transactions.h"

/*
 * generated_co_wrapper
 *
 * Function specifier, which does nothing but mark functions to be
 * generated by scripts/block-coroutine-wrapper.py
 *
 * Read more in docs/devel/block-coroutine-wrapper.rst
 */
#define generated_co_wrapper

/* block.c */
typedef struct BlockDriver BlockDriver;
typedef struct BdrvChild BdrvChild;
typedef struct BdrvChildClass BdrvChildClass;

typedef struct BlockDriverInfo {
    /* in bytes, 0 if irrelevant */
    int cluster_size;
    /* offset at which the VM state can be saved (0 if not possible) */
    int64_t vm_state_offset;
    bool is_dirty;
    /*
     * True if this block driver only supports compressed writes
     */
    bool needs_compressed_writes;
} BlockDriverInfo;

typedef struct BlockFragInfo {
    uint64_t allocated_clusters;
    uint64_t total_clusters;
    uint64_t fragmented_clusters;
    uint64_t compressed_clusters;
} BlockFragInfo;

typedef enum {
    BDRV_REQ_COPY_ON_READ       = 0x1,
    BDRV_REQ_ZERO_WRITE         = 0x2,

    /*
     * The BDRV_REQ_MAY_UNMAP flag is used in write_zeroes requests to indicate
     * that the block driver should unmap (discard) blocks if it is guaranteed
     * that the result will read back as zeroes. The flag is only passed to the
     * driver if the block device is opened with BDRV_O_UNMAP.
     */
    BDRV_REQ_MAY_UNMAP          = 0x4,

    BDRV_REQ_FUA                = 0x10,
    BDRV_REQ_WRITE_COMPRESSED   = 0x20,

    /* Signifies that this write request will not change the visible disk
     * content. */
    BDRV_REQ_WRITE_UNCHANGED    = 0x40,

    /* Forces request serialisation. Use only with write requests. */
    BDRV_REQ_SERIALISING        = 0x80,

    /* Execute the request only if the operation can be offloaded or otherwise
     * be executed efficiently, but return an error instead of using a slow
     * fallback. */
    BDRV_REQ_NO_FALLBACK        = 0x100,

    /*
     * BDRV_REQ_PREFETCH makes sense only in the context of copy-on-read
     * (i.e., together with the BDRV_REQ_COPY_ON_READ flag or when a COR
     * filter is involved), in which case it signals that the COR operation
     * need not read the data into memory (qiov) but only ensure they are
     * copied to the top layer (i.e., that COR operation is done).
     */
    BDRV_REQ_PREFETCH  = 0x200,

    /*
     * If we need to wait for other requests, just fail immediately. Used
     * only together with BDRV_REQ_SERIALISING.
     */
    BDRV_REQ_NO_WAIT = 0x400,

    /* Mask of valid flags */
    BDRV_REQ_MASK               = 0x7ff,
} BdrvRequestFlags;

typedef struct BlockSizes {
    uint32_t phys;
    uint32_t log;
} BlockSizes;

typedef struct HDGeometry {
    uint32_t heads;
    uint32_t sectors;
    uint32_t cylinders;
} HDGeometry;

#define BDRV_O_RDWR        0x0002
#define BDRV_O_RESIZE      0x0004 /* request permission for resizing the node */
#define BDRV_O_SNAPSHOT    0x0008 /* open the file read only and save writes in a snapshot */
#define BDRV_O_TEMPORARY   0x0010 /* delete the file after use */
#define BDRV_O_NOCACHE     0x0020 /* do not use the host page cache */
#define BDRV_O_NATIVE_AIO  0x0080 /* use native AIO instead of the thread pool */
#define BDRV_O_NO_BACKING  0x0100 /* don't open the backing file */
#define BDRV_O_NO_FLUSH    0x0200 /* disable flushing on this disk */
#define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image */
#define BDRV_O_INACTIVE    0x0800  /* consistency hint for migration handoff */
#define BDRV_O_CHECK       0x1000  /* open solely for consistency check */
#define BDRV_O_ALLOW_RDWR  0x2000  /* allow reopen to change from r/o to r/w */
#define BDRV_O_UNMAP       0x4000  /* execute guest UNMAP/TRIM operations */
#define BDRV_O_PROTOCOL    0x8000  /* if no block driver is explicitly given:
                                      select an appropriate protocol driver,
                                      ignoring the format layer */
#define BDRV_O_NO_IO       0x10000 /* don't initialize for I/O */
#define BDRV_O_AUTO_RDONLY 0x20000 /* degrade to read-only if opening read-write fails */
#define BDRV_O_IO_URING    0x40000 /* use io_uring instead of the thread pool */

#define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH)


/* Option names of options parsed by the block layer */

#define BDRV_OPT_CACHE_WB       "cache.writeback"
#define BDRV_OPT_CACHE_DIRECT   "cache.direct"
#define BDRV_OPT_CACHE_NO_FLUSH "cache.no-flush"
#define BDRV_OPT_READ_ONLY      "read-only"
#define BDRV_OPT_AUTO_READ_ONLY "auto-read-only"
#define BDRV_OPT_DISCARD        "discard"
#define BDRV_OPT_FORCE_SHARE    "force-share"


#define BDRV_SECTOR_BITS   9
#define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)

#define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
                                           INT_MAX >> BDRV_SECTOR_BITS)
#define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)

/*
 * We want allow aligning requests and disk length up to any 32bit alignment
 * and don't afraid of overflow.
 * To achieve it, and in the same time use some pretty number as maximum disk
 * size, let's define maximum "length" (a limit for any offset/bytes request and
 * for disk size) to be the greatest power of 2 less than INT64_MAX.
 */
#define BDRV_MAX_ALIGNMENT (1L << 30)
#define BDRV_MAX_LENGTH (QEMU_ALIGN_DOWN(INT64_MAX, BDRV_MAX_ALIGNMENT))

/*
 * Allocation status flags for bdrv_block_status() and friends.
 *
 * Public flags:
 * BDRV_BLOCK_DATA: allocation for data at offset is tied to this layer
 * BDRV_BLOCK_ZERO: offset reads as zero
 * BDRV_BLOCK_OFFSET_VALID: an associated offset exists for accessing raw data
 * BDRV_BLOCK_ALLOCATED: the content of the block is determined by this
 *                       layer rather than any backing, set by block layer
 * BDRV_BLOCK_EOF: the returned pnum covers through end of file for this
 *                 layer, set by block layer
 *
 * Internal flags:
 * BDRV_BLOCK_RAW: for use by passthrough drivers, such as raw, to request
 *                 that the block layer recompute the answer from the returned
 *                 BDS; must be accompanied by just BDRV_BLOCK_OFFSET_VALID.
 * BDRV_BLOCK_RECURSE: request that the block layer will recursively search for
 *                     zeroes in file child of current block node inside
 *                     returned region. Only valid together with both
 *                     BDRV_BLOCK_DATA and BDRV_BLOCK_OFFSET_VALID. Should not
 *                     appear with BDRV_BLOCK_ZERO.
 *
 * If BDRV_BLOCK_OFFSET_VALID is set, the map parameter represents the
 * host offset within the returned BDS that is allocated for the
 * corresponding raw guest data.  However, whether that offset
 * actually contains data also depends on BDRV_BLOCK_DATA, as follows:
 *
 * DATA ZERO OFFSET_VALID
 *  t    t        t       sectors read as zero, returned file is zero at offset
 *  t    f        t       sectors read as valid from file at offset
 *  f    t        t       sectors preallocated, read as zero, returned file not
 *                        necessarily zero at offset
 *  f    f        t       sectors preallocated but read from backing_hd,
 *                        returned file contains garbage at offset
 *  t    t        f       sectors preallocated, read as zero, unknown offset
 *  t    f        f       sectors read from unknown file or offset
 *  f    t        f       not allocated or unknown offset, read as zero
 *  f    f        f       not allocated or unknown offset, read from backing_hd
 */
#define BDRV_BLOCK_DATA         0x01
#define BDRV_BLOCK_ZERO         0x02
#define BDRV_BLOCK_OFFSET_VALID 0x04
#define BDRV_BLOCK_RAW          0x08
#define BDRV_BLOCK_ALLOCATED    0x10
#define BDRV_BLOCK_EOF          0x20
#define BDRV_BLOCK_RECURSE      0x40

typedef QTAILQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue;

typedef struct BDRVReopenState {
    BlockDriverState *bs;
    int flags;
    BlockdevDetectZeroesOptions detect_zeroes;
    bool backing_missing;
    bool replace_backing_bs;  /* new_backing_bs is ignored if this is false */
    BlockDriverState *old_backing_bs; /* keep pointer for permissions update */
    QDict *options;
    QDict *explicit_options;
    void *opaque;
} BDRVReopenState;

/*
 * Block operation types
 */
typedef enum BlockOpType {
    BLOCK_OP_TYPE_BACKUP_SOURCE,
    BLOCK_OP_TYPE_BACKUP_TARGET,
    BLOCK_OP_TYPE_CHANGE,
    BLOCK_OP_TYPE_COMMIT_SOURCE,
    BLOCK_OP_TYPE_COMMIT_TARGET,
    BLOCK_OP_TYPE_DATAPLANE,
    BLOCK_OP_TYPE_DRIVE_DEL,
    BLOCK_OP_TYPE_EJECT,
    BLOCK_OP_TYPE_EXTERNAL_SNAPSHOT,
    BLOCK_OP_TYPE_INTERNAL_SNAPSHOT,
    BLOCK_OP_TYPE_INTERNAL_SNAPSHOT_DELETE,
    BLOCK_OP_TYPE_MIRROR_SOURCE,
    BLOCK_OP_TYPE_MIRROR_TARGET,
    BLOCK_OP_TYPE_RESIZE,
    BLOCK_OP_TYPE_STREAM,
    BLOCK_OP_TYPE_REPLACE,
    BLOCK_OP_TYPE_MAX,
} BlockOpType;

/* Block node permission constants */
enum {
    /**
     * A user that has the "permission" of consistent reads is guaranteed that
     * their view of the contents of the block device is complete and
     * self-consistent, representing the contents of a disk at a specific
     * point.
     *
     * For most block devices (including their backing files) this is true, but
     * the property cannot be maintained in a few situations like for
     * intermediate nodes of a commit block job.
     */
    BLK_PERM_CONSISTENT_READ    = 0x01,

    /** This permission is required to change the visible disk contents. */
    BLK_PERM_WRITE              = 0x02,

    /**
     * This permission (which is weaker than BLK_PERM_WRITE) is both enough and
     * required for writes to the block node when the caller promises that
     * the visible disk content doesn't change.
     *
     * As the BLK_PERM_WRITE permission is strictly stronger, either is
     * sufficient to perform an unchanging write.
     */
    BLK_PERM_WRITE_UNCHANGED    = 0x04,

    /** This permission is required to change the size of a block node. */
    BLK_PERM_RESIZE             = 0x08,

    /**
     * This permission is required to change the node that this BdrvChild
     * points to.
     */
    BLK_PERM_GRAPH_MOD          = 0x10,

    BLK_PERM_ALL                = 0x1f,

    DEFAULT_PERM_PASSTHROUGH    = BLK_PERM_CONSISTENT_READ
                                 | BLK_PERM_WRITE
                                 | BLK_PERM_WRITE_UNCHANGED
                                 | BLK_PERM_RESIZE,

    DEFAULT_PERM_UNCHANGED      = BLK_PERM_ALL & ~DEFAULT_PERM_PASSTHROUGH,
};

/*
 * Flags that parent nodes assign to child nodes to specify what kind of
 * role(s) they take.
 *
 * At least one of DATA, METADATA, FILTERED, or COW must be set for
 * every child.
 */
enum BdrvChildRoleBits {
    /*
     * This child stores data.
     * Any node may have an arbitrary number of such children.
     */
    BDRV_CHILD_DATA         = (1 << 0),

    /*
     * This child stores metadata.
     * Any node may have an arbitrary number of metadata-storing
     * children.
     */
    BDRV_CHILD_METADATA     = (1 << 1),

    /*
     * A child that always presents exactly the same visible data as
     * the parent, e.g. by virtue of the parent forwarding all reads
     * and writes.
     * This flag is mutually exclusive with DATA, METADATA, and COW.
     * Any node may have at most one filtered child at a time.
     */
    BDRV_CHILD_FILTERED     = (1 << 2),

    /*
     * Child from which to read all data that isn't allocated in the
     * parent (i.e., the backing child); such data is copied to the
     * parent through COW (and optionally COR).
     * This field is mutually exclusive with DATA, METADATA, and
     * FILTERED.
     * Any node may have at most one such backing child at a time.
     */
    BDRV_CHILD_COW          = (1 << 3),

    /*
     * The primary child.  For most drivers, this is the child whose
     * filename applies best to the parent node.
     * Any node may have at most one primary child at a time.
     */
    BDRV_CHILD_PRIMARY      = (1 << 4),

    /* Useful combination of flags */
    BDRV_CHILD_IMAGE        = BDRV_CHILD_DATA
                              | BDRV_CHILD_METADATA
                              | BDRV_CHILD_PRIMARY,
};

/* Mask of BdrvChildRoleBits values */
typedef unsigned int BdrvChildRole;

char *bdrv_perm_names(uint64_t perm);
uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm);

/* disk I/O throttling */
void bdrv_init(void);
void bdrv_init_with_whitelist(void);
bool bdrv_uses_whitelist(void);
int bdrv_is_whitelisted(BlockDriver *drv, bool read_only);
BlockDriver *bdrv_find_protocol(const char *filename,
                                bool allow_protocol_prefix,
                                Error **errp);
BlockDriver *bdrv_find_format(const char *format_name);
int bdrv_create(BlockDriver *drv, const char* filename,
                QemuOpts *opts, Error **errp);
int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp);

BlockDriverState *bdrv_new(void);
int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
                Error **errp);
int bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
                      Error **errp);
BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *node_options,
                                   int flags, Error **errp);
int bdrv_drop_filter(BlockDriverState *bs, Error **errp);

int bdrv_parse_aio(const char *mode, int *flags);
int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough);
int bdrv_parse_discard_flags(const char *mode, int *flags);
BdrvChild *bdrv_open_child(const char *filename,
                           QDict *options, const char *bdref_key,
                           BlockDriverState* parent,
                           const BdrvChildClass *child_class,
                           BdrvChildRole child_role,
                           bool allow_none, Error **errp);
BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp);
int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
                        Error **errp);
int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
                           const char *bdref_key, Error **errp);
BlockDriverState *bdrv_open(const char *filename, const char *reference,
                            QDict *options, int flags, Error **errp);
BlockDriverState *bdrv_new_open_driver(BlockDriver *drv, const char *node_name,
                                       int flags, Error **errp);
BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
                                    BlockDriverState *bs, QDict *options,
                                    bool keep_old_opts);
int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp);
int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only,
                              Error **errp);
int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
                       int64_t bytes, BdrvRequestFlags flags);
int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags);
int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int64_t bytes);
int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf,
                int64_t bytes);
int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
                     const void *buf, int64_t bytes);
/*
 * Efficiently zero a region of the disk image.  Note that this is a regular
 * I/O request like read or write and should have a reasonable size.  This
 * function is not suitable for zeroing the entire image in a single request
 * because it may allocate memory for the entire region.
 */
int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
                                       int64_t bytes, BdrvRequestFlags flags);
BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
    const char *backing_file);
void bdrv_refresh_filename(BlockDriverState *bs);

int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
                                  PreallocMode prealloc, BdrvRequestFlags flags,
                                  Error **errp);
int generated_co_wrapper
bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
              PreallocMode prealloc, BdrvRequestFlags flags, Error **errp);

int64_t bdrv_nb_sectors(BlockDriverState *bs);
int64_t bdrv_getlength(BlockDriverState *bs);
int64_t bdrv_get_allocated_file_size(BlockDriverState *bs);
BlockMeasureInfo *bdrv_measure(BlockDriver *drv, QemuOpts *opts,
                               BlockDriverState *in_bs, Error **errp);
void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr);
void bdrv_refresh_limits(BlockDriverState *bs, Transaction *tran, Error **errp);
int bdrv_commit(BlockDriverState *bs);
int bdrv_make_empty(BdrvChild *c, Error **errp);
int bdrv_change_backing_file(BlockDriverState *bs, const char *backing_file,
                             const char *backing_fmt, bool warn);
void bdrv_register(BlockDriver *bdrv);
int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,
                           const char *backing_file_str);
BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
                                    BlockDriverState *bs);
BlockDriverState *bdrv_find_base(BlockDriverState *bs);
bool bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base,
                                  Error **errp);
int bdrv_freeze_backing_chain(BlockDriverState *bs, BlockDriverState *base,
                              Error **errp);
void bdrv_unfreeze_backing_chain(BlockDriverState *bs, BlockDriverState *base);
int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, Error **errp);
void coroutine_fn bdrv_co_delete_file_noerr(BlockDriverState *bs);


typedef struct BdrvCheckResult {
    int corruptions;
    int leaks;
    int check_errors;
    int corruptions_fixed;
    int leaks_fixed;
    int64_t image_end_offset;
    BlockFragInfo bfi;
} BdrvCheckResult;

typedef enum {
    BDRV_FIX_LEAKS    = 1,
    BDRV_FIX_ERRORS   = 2,
} BdrvCheckMode;

int generated_co_wrapper bdrv_check(BlockDriverState *bs, BdrvCheckResult *res,
                                    BdrvCheckMode fix);

/* The units of offset and total_work_size may be chosen arbitrarily by the
 * block driver; total_work_size may change during the course of the amendment
 * operation */
typedef void BlockDriverAmendStatusCB(BlockDriverState *bs, int64_t offset,
                                      int64_t total_work_size, void *opaque);
int bdrv_amend_options(BlockDriverState *bs_new, QemuOpts *opts,
                       BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
                       bool force,
                       Error **errp);

/* check if a named node can be replaced when doing drive-mirror */
BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs,
                                        const char *node_name, Error **errp);

/* async block I/O */
void bdrv_aio_cancel(BlockAIOCB *acb);
void bdrv_aio_cancel_async(BlockAIOCB *acb);

/* sg packet commands */
int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf);

/* Invalidate any cached metadata used by image formats */
int generated_co_wrapper bdrv_invalidate_cache(BlockDriverState *bs,
                                               Error **errp);
void bdrv_invalidate_cache_all(Error **errp);
int bdrv_inactivate_all(void);

/* Ensure contents are flushed to disk.  */
int generated_co_wrapper bdrv_flush(BlockDriverState *bs);
int coroutine_fn bdrv_co_flush(BlockDriverState *bs);
int bdrv_flush_all(void);
void bdrv_close_all(void);
void bdrv_drain(BlockDriverState *bs);
void coroutine_fn bdrv_co_drain(BlockDriverState *bs);
void bdrv_drain_all_begin(void);
void bdrv_drain_all_end(void);
void bdrv_drain_all(void);

#define BDRV_POLL_WHILE(bs, cond) ({                       \
    BlockDriverState *bs_ = (bs);                          \
    AIO_WAIT_WHILE(bdrv_get_aio_context(bs_),              \
                   cond); })

int generated_co_wrapper bdrv_pdiscard(BdrvChild *child, int64_t offset,
                                       int64_t bytes);
int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
int bdrv_has_zero_init_1(BlockDriverState *bs);
int bdrv_has_zero_init(BlockDriverState *bs);
bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
int bdrv_block_status(BlockDriverState *bs, int64_t offset,
                      int64_t bytes, int64_t *pnum, int64_t *map,
                      BlockDriverState **file);
int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
                            int64_t offset, int64_t bytes, int64_t *pnum,
                            int64_t *map, BlockDriverState **file);
int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes,
                      int64_t *pnum);
int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
                            bool include_base, int64_t offset, int64_t bytes,
                            int64_t *pnum);
int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset,
                                      int64_t bytes);

bool bdrv_is_read_only(BlockDriverState *bs);
int bdrv_can_set_read_only(BlockDriverState *bs, bool read_only,
                           bool ignore_allow_rdw, Error **errp);
int bdrv_apply_auto_read_only(BlockDriverState *bs, const char *errmsg,
                              Error **errp);
bool bdrv_is_writable(BlockDriverState *bs);
bool bdrv_is_sg(BlockDriverState *bs);
bool bdrv_is_inserted(BlockDriverState *bs);
void bdrv_lock_medium(BlockDriverState *bs, bool locked);
void bdrv_eject(BlockDriverState *bs, bool eject_flag);
const char *bdrv_get_format_name(BlockDriverState *bs);
BlockDriverState *bdrv_find_node(const char *node_name);
BlockDeviceInfoList *bdrv_named_nodes_list(bool flat, Error **errp);
XDbgBlockGraph *bdrv_get_xdbg_block_graph(Error **errp);
BlockDriverState *bdrv_lookup_bs(const char *device,
                                 const char *node_name,
                                 Error **errp);
bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base);
BlockDriverState *bdrv_next_node(BlockDriverState *bs);
BlockDriverState *bdrv_next_all_states(BlockDriverState *bs);

typedef struct BdrvNextIterator {
    enum {
        BDRV_NEXT_BACKEND_ROOTS,
        BDRV_NEXT_MONITOR_OWNED,
    } phase;
    BlockBackend *blk;
    BlockDriverState *bs;
} BdrvNextIterator;

BlockDriverState *bdrv_first(BdrvNextIterator *it);
BlockDriverState *bdrv_next(BdrvNextIterator *it);
void bdrv_next_cleanup(BdrvNextIterator *it);

BlockDriverState *bdrv_next_monitor_owned(BlockDriverState *bs);
bool bdrv_supports_compressed_writes(BlockDriverState *bs);
void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                         void *opaque, bool read_only);
const char *bdrv_get_node_name(const BlockDriverState *bs);
const char *bdrv_get_device_name(const BlockDriverState *bs);
const char *bdrv_get_device_or_node_name(const BlockDriverState *bs);
int bdrv_get_flags(BlockDriverState *bs);
int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi);
ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs,
                                          Error **errp);
BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs);
void bdrv_round_to_clusters(BlockDriverState *bs,
                            int64_t offset, int64_t bytes,
                            int64_t *cluster_offset,
                            int64_t *cluster_bytes);

void bdrv_get_backing_filename(BlockDriverState *bs,
                               char *filename, int filename_size);
char *bdrv_get_full_backing_filename(BlockDriverState *bs, Error **errp);
char *bdrv_get_full_backing_filename_from_filename(const char *backed,
                                                   const char *backing,
                                                   Error **errp);
char *bdrv_dirname(BlockDriverState *bs, Error **errp);

int path_has_protocol(const char *path);
int path_is_absolute(const char *path);
char *path_combine(const char *base_path, const char *filename);

int generated_co_wrapper
bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
int generated_co_wrapper
bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
                      int64_t pos, int size);

int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
                      int64_t pos, int size);

void bdrv_img_create(const char *filename, const char *fmt,
                     const char *base_filename, const char *base_fmt,
                     char *options, uint64_t img_size, int flags,
                     bool quiet, Error **errp);

/* Returns the alignment in bytes that is required so that no bounce buffer
 * is required throughout the stack */
size_t bdrv_min_mem_align(BlockDriverState *bs);
/* Returns optimal alignment in bytes for bounce buffer */
size_t bdrv_opt_mem_align(BlockDriverState *bs);
void *qemu_blockalign(BlockDriverState *bs, size_t size);
void *qemu_blockalign0(BlockDriverState *bs, size_t size);
void *qemu_try_blockalign(BlockDriverState *bs, size_t size);
void *qemu_try_blockalign0(BlockDriverState *bs, size_t size);
bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov);

void bdrv_enable_copy_on_read(BlockDriverState *bs);
void bdrv_disable_copy_on_read(BlockDriverState *bs);

void bdrv_ref(BlockDriverState *bs);
void bdrv_unref(BlockDriverState *bs);
void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child);
BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
                             BlockDriverState *child_bs,
                             const char *child_name,
                             const BdrvChildClass *child_class,
                             BdrvChildRole child_role,
                             Error **errp);

bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp);
void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason);
void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason);
void bdrv_op_block_all(BlockDriverState *bs, Error *reason);
void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason);
bool bdrv_op_blocker_is_empty(BlockDriverState *bs);

#define BLKDBG_EVENT(child, evt) \
    do { \
        if (child) { \
            bdrv_debug_event(child->bs, evt); \
        } \
    } while (0)

void bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event);

int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
                           const char *tag);
int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag);
int bdrv_debug_resume(BlockDriverState *bs, const char *tag);
bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag);

/**
 * bdrv_get_aio_context:
 *
 * Returns: the currently bound #AioContext
 */
AioContext *bdrv_get_aio_context(BlockDriverState *bs);

/**
 * Move the current coroutine to the AioContext of @bs and return the old
 * AioContext of the coroutine. Increase bs->in_flight so that draining @bs
 * will wait for the operation to proceed until the corresponding
 * bdrv_co_leave().
 *
 * Consequently, you can't call drain inside a bdrv_co_enter/leave() section as
 * this will deadlock.
 */
AioContext *coroutine_fn bdrv_co_enter(BlockDriverState *bs);

/**
 * Ends a section started by bdrv_co_enter(). Move the current coroutine back
 * to old_ctx and decrease bs->in_flight again.
 */
void coroutine_fn bdrv_co_leave(BlockDriverState *bs, AioContext *old_ctx);

/**
 * Locks the AioContext of @bs if it's not the current AioContext. This avoids
 * double locking which could lead to deadlocks: This is a coroutine_fn, so we
 * know we already own the lock of the current AioContext.
 *
 * May only be called in the main thread.
 */
void coroutine_fn bdrv_co_lock(BlockDriverState *bs);

/**
 * Unlocks the AioContext of @bs if it's not the current AioContext.
 */
void coroutine_fn bdrv_co_unlock(BlockDriverState *bs);

/**
 * Transfer control to @co in the aio context of @bs
 */
void bdrv_coroutine_enter(BlockDriverState *bs, Coroutine *co);

void bdrv_set_aio_context_ignore(BlockDriverState *bs,
                                 AioContext *new_context, GSList **ignore);
int bdrv_try_set_aio_context(BlockDriverState *bs, AioContext *ctx,
                             Error **errp);
int bdrv_child_try_set_aio_context(BlockDriverState *bs, AioContext *ctx,
                                   BdrvChild *ignore_child, Error **errp);
bool bdrv_child_can_set_aio_context(BdrvChild *c, AioContext *ctx,
                                    GSList **ignore, Error **errp);
bool bdrv_can_set_aio_context(BlockDriverState *bs, AioContext *ctx,
                              GSList **ignore, Error **errp);
AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c);

int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz);
int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo);

void bdrv_io_plug(BlockDriverState *bs);
void bdrv_io_unplug(BlockDriverState *bs);

/**
 * bdrv_parent_drained_begin_single:
 *
 * Begin a quiesced section for the parent of @c. If @poll is true, wait for
 * any pending activity to cease.
 */
void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll);

/**
 * bdrv_parent_drained_end_single:
 *
 * End a quiesced section for the parent of @c.
 *
 * This polls @bs's AioContext until all scheduled sub-drained_ends
 * have settled, which may result in graph changes.
 */
void bdrv_parent_drained_end_single(BdrvChild *c);

/**
 * bdrv_drain_poll:
 *
 * Poll for pending requests in @bs, its parents (except for @ignore_parent),
 * and if @recursive is true its children as well (used for subtree drain).
 *
 * If @ignore_bds_parents is true, parents that are BlockDriverStates must
 * ignore the drain request because they will be drained separately (used for
 * drain_all).
 *
 * This is part of bdrv_drained_begin.
 */
bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
                     BdrvChild *ignore_parent, bool ignore_bds_parents);

/**
 * bdrv_drained_begin:
 *
 * Begin a quiesced section for exclusive access to the BDS, by disabling
 * external request sources including NBD server and device model. Note that
 * this doesn't block timers or coroutines from submitting more requests, which
 * means block_job_pause is still necessary.
 *
 * This function can be recursive.
 */
void bdrv_drained_begin(BlockDriverState *bs);

/**
 * bdrv_do_drained_begin_quiesce:
 *
 * Quiesces a BDS like bdrv_drained_begin(), but does not wait for already
 * running requests to complete.
 */
void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
                                   BdrvChild *parent, bool ignore_bds_parents);

/**
 * Like bdrv_drained_begin, but recursively begins a quiesced section for
 * exclusive access to all child nodes as well.
 */
void bdrv_subtree_drained_begin(BlockDriverState *bs);

/**
 * bdrv_drained_end:
 *
 * End a quiescent section started by bdrv_drained_begin().
 *
 * This polls @bs's AioContext until all scheduled sub-drained_ends
 * have settled.  On one hand, that may result in graph changes.  On
 * the other, this requires that the caller either runs in the main
 * loop; or that all involved nodes (@bs and all of its parents) are
 * in the caller's AioContext.
 */
void bdrv_drained_end(BlockDriverState *bs);

/**
 * bdrv_drained_end_no_poll:
 *
 * Same as bdrv_drained_end(), but do not poll for the subgraph to
 * actually become unquiesced.  Therefore, no graph changes will occur
 * with this function.
 *
 * *drained_end_counter is incremented for every background operation
 * that is scheduled, and will be decremented for every operation once
 * it settles.  The caller must poll until it reaches 0.  The counter
 * should be accessed using atomic operations only.
 */
void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter);

/**
 * End a quiescent section started by bdrv_subtree_drained_begin().
 */
void bdrv_subtree_drained_end(BlockDriverState *bs);

void bdrv_add_child(BlockDriverState *parent, BlockDriverState *child,
                    Error **errp);
void bdrv_del_child(BlockDriverState *parent, BdrvChild *child, Error **errp);

bool bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name,
                                     uint32_t granularity, Error **errp);
/**
 *
 * bdrv_register_buf/bdrv_unregister_buf:
 *
 * Register/unregister a buffer for I/O. For example, VFIO drivers are
 * interested to know the memory areas that would later be used for I/O, so
 * that they can prepare IOMMU mapping etc., to get better performance.
 */
void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size);
void bdrv_unregister_buf(BlockDriverState *bs, void *host);

/**
 *
 * bdrv_co_copy_range:
 *
 * Do offloaded copy between two children. If the operation is not implemented
 * by the driver, or if the backend storage doesn't support it, a negative
 * error code will be returned.
 *
 * Note: block layer doesn't emulate or fallback to a bounce buffer approach
 * because usually the caller shouldn't attempt offloaded copy any more (e.g.
 * calling copy_file_range(2)) after the first error, thus it should fall back
 * to a read+write path in the caller level.
 *
 * @src: Source child to copy data from
 * @src_offset: offset in @src image to read data
 * @dst: Destination child to copy data to
 * @dst_offset: offset in @dst image to write data
 * @bytes: number of bytes to copy
 * @flags: request flags. Supported flags:
 *         BDRV_REQ_ZERO_WRITE - treat the @src range as zero data and do zero
 *                               write on @dst as if bdrv_co_pwrite_zeroes is
 *                               called. Used to simplify caller code, or
 *                               during BlockDriver.bdrv_co_copy_range_from()
 *                               recursion.
 *         BDRV_REQ_NO_SERIALISING - do not serialize with other overlapping
 *                                   requests currently in flight.
 *
 * Returns: 0 if succeeded; negative error code if failed.
 **/
int coroutine_fn bdrv_co_copy_range(BdrvChild *src, int64_t src_offset,
                                    BdrvChild *dst, int64_t dst_offset,
                                    int64_t bytes, BdrvRequestFlags read_flags,
                                    BdrvRequestFlags write_flags);

void bdrv_cancel_in_flight(BlockDriverState *bs);

#endif