Merge remote-tracking branch 'remotes/mst/tags/for_upstream' into staging

pci, pc, virtio: features, fixes reconnect for vhost blk tests for UEFI misc other stuff Signed-off-by: Michael S. Tsirkin <mst@redhat.com> # gpg: Signature made Tue 21 May 2019 14:41:32 BST # gpg: using RSA key 281F0DB8D28D5469 # gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>" [full] # gpg: aka "Michael S. Tsirkin <mst@redhat.com>" [full] # Primary key fingerprint: 0270 606B 6F3C DF3D 0B17 0970 C350 3912 AFBE 8E67 # Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA 8A0D 281F 0DB8 D28D 5469 * remotes/mst/tags/for_upstream: (34 commits) tests: acpi: print error unable to dump ACPI table during rebuild tests: acpi: refactor rebuild-expected-aml.sh to dump ACPI tables for a specified list of targets tests: acpi: allow to override default accelerator tests: acpi: ignore SMBIOS tests when UEFI firmware is used tests: acpi: add a way to start tests with UEFI firmware tests: acpi: add acpi_find_rsdp_address_uefi() helper tests: acpi: move boot_sector_init() into x86 tests branch tests: acpi: skip FACS table if board uses hw reduced ACPI profile tests: acpi: fetch X_DSDT if pointer to DSDT is 0 tests: acpi: make pointer to RSDP 64bit tests: acpi: make RSDT test routine handle XSDT tests: acpi: make acpi_fetch_table() take size of fetched table pointer tests: acpi: rename acpi_parse_rsdp_table() into acpi_fetch_rsdp_table() pci: Simplify pci_bus_is_root() pcie: Remove redundant test in pcie_mmcfg_data_{read,write}() libvhost-user: fix bad vu_log_write hw/arm/virt-acpi-build: pass AcpiMcfgInfo to build_mcfg() i386, acpi: remove mcfg_ prefix in AcpiMcfgInfo members hw/arm/virt-acpi-build: remove unnecessary variable mcfg_start do not call vhost_net_cleanup() on running net from char user event ... Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
author: Peter Maydell <peter.maydell@linaro.org> 2019-05-21 14:56:57 +0100
committer: Peter Maydell <peter.maydell@linaro.org> 2019-05-21 14:56:57 +0100
commit: 247ba27c528c52e4a41c233c1c9a699f40e4d2a5 (patch)
tree: cec47b9b84e1e099b1295468f59fe31490c6e379
parent: 62516a0a18cd156d913dd625baca52c46743223b (diff)
parent: ba02ff90ee1dcaf7aa5645075217e555ae2c54ea (diff)
36 files changed, 1890 insertions, 1453 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 9424a490d6..a6948ebc63 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1484,7 +1484,7 @@ M: Michael S. Tsirkin <mst@redhat.com>
 S: Supported
 F: hw/*/*vhost*
 F: docs/interop/vhost-user.json
-F: docs/interop/vhost-user.txt
+F: docs/interop/vhost-user.rst
 F: contrib/vhost-user-*/
 F: backends/vhost-user.c
 F: include/sysemu/vhost-user-backend.h
diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index 74d42177c5..3825b1cacf 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -433,7 +433,7 @@ vu_log_write(VuDev *dev, uint64_t address, uint64_t length)
     page = address / VHOST_LOG_PAGE;
     while (page * VHOST_LOG_PAGE < address + length) {
         vu_log_page(dev->log_table, page);
-        page += VHOST_LOG_PAGE;
+        page += 1;
     }
 
     vu_log_kick(dev);
diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c
index 43583f2659..86a3987744 100644
--- a/contrib/vhost-user-blk/vhost-user-blk.c
+++ b/contrib/vhost-user-blk/vhost-user-blk.c
@@ -398,7 +398,8 @@ vub_get_features(VuDev *dev)
 static uint64_t
 vub_get_protocol_features(VuDev *dev)
 {
-    return 1ull << VHOST_USER_PROTOCOL_F_CONFIG;
+    return 1ull << VHOST_USER_PROTOCOL_F_CONFIG |
+           1ull << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD;
 }
 
 static int
diff --git a/docs/interop/index.rst b/docs/interop/index.rst
index 2df977dd52..a037bd67ec 100644
--- a/docs/interop/index.rst
+++ b/docs/interop/index.rst
@@ -15,4 +15,4 @@ Contents:
    bitmaps
    live-block-operations
    pr-helper
-
+   vhost-user
diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst
new file mode 100644
index 0000000000..7f3232c798
--- /dev/null
+++ b/docs/interop/vhost-user.rst
@@ -0,0 +1,1351 @@
+===================
+Vhost-user Protocol
+===================
+:Copyright: 2014 Virtual Open Systems Sarl.
+:Licence: This work is licensed under the terms of the GNU GPL,
+          version 2 or later. See the COPYING file in the top-level
+          directory.
+
+.. contents:: Table of Contents
+
+Introduction
+============
+
+This protocol is aiming to complement the ``ioctl`` interface used to
+control the vhost implementation in the Linux kernel. It implements
+the control plane needed to establish virtqueue sharing with a user
+space process on the same host. It uses communication over a Unix
+domain socket to share file descriptors in the ancillary data of the
+message.
+
+The protocol defines 2 sides of the communication, *master* and
+*slave*. *Master* is the application that shares its virtqueues, in
+our case QEMU. *Slave* is the consumer of the virtqueues.
+
+In the current implementation QEMU is the *master*, and the *slave* is
+the external process consuming the virtio queues, for example a
+software Ethernet switch running in user space, such as Snabbswitch,
+or a block device backend processing read & write to a virtual
+disk. In order to facilitate interoperability between various backend
+implementations, it is recommended to follow the :ref:`Backend program
+conventions <backend_conventions>`.
+
+*Master* and *slave* can be either a client (i.e. connecting) or
+server (listening) in the socket communication.
+
+Message Specification
+=====================
+
+.. Note:: All numbers are in the machine native byte order.
+
+A vhost-user message consists of 3 header fields and a payload.
+
++---------+-------+------+---------+
+| request | flags | size | payload |
++---------+-------+------+---------+
+
+Header
+------
+
+:request: 32-bit type of the request
+
+:flags: 32-bit bit field
+
+- Lower 2 bits are the version (currently 0x01)
+- Bit 2 is the reply flag - needs to be sent on each reply from the slave
+- Bit 3 is the need_reply flag - see :ref:`REPLY_ACK <reply_ack>` for
+  details.
+
+:size: 32-bit size of the payload
+
+Payload
+-------
+
+Depending on the request type, **payload** can be:
+
+A single 64-bit integer
+^^^^^^^^^^^^^^^^^^^^^^^
+
++-----+
+| u64 |
++-----+
+
+:u64: a 64-bit unsigned integer
+
+A vring state description
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
++-------+-----+
+| index | num |
++-------+-----+
+
+:index: a 32-bit index
+
+:num: a 32-bit number
+
+A vring address description
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++-------+-------+------+------------+------+-----------+-----+
+| index | flags | size | descriptor | used | available | log |
++-------+-------+------+------------+------+-----------+-----+
+
+:index: a 32-bit vring index
+
+:flags: a 32-bit vring flags
+
+:descriptor: a 64-bit ring address of the vring descriptor table
+
+:used: a 64-bit ring address of the vring used ring
+
+:available: a 64-bit ring address of the vring available ring
+
+:log: a 64-bit guest address for logging
+
+Note that a ring address is an IOVA if ``VIRTIO_F_IOMMU_PLATFORM`` has
+been negotiated. Otherwise it is a user address.
+
+Memory regions description
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++-------------+---------+---------+-----+---------+
+| num regions | padding | region0 | ... | region7 |
++-------------+---------+---------+-----+---------+
+
+:num regions: a 32-bit number of regions
+
+:padding: 32-bit
+
+A region is:
+
++---------------+------+--------------+-------------+
+| guest address | size | user address | mmap offset |
++---------------+------+--------------+-------------+
+
+:guest address: a 64-bit guest address of the region
+
+:size: a 64-bit size
+
+:user address: a 64-bit user address
+
+:mmap offset: 64-bit offset where region starts in the mapped memory
+
+Log description
+^^^^^^^^^^^^^^^
+
++----------+------------+
+| log size | log offset |
++----------+------------+
+
+:log size: size of area used for logging
+
+:log offset: offset from start of supplied file descriptor where
+             logging starts (i.e. where guest address 0 would be
+             logged)
+
+An IOTLB message
+^^^^^^^^^^^^^^^^
+
++------+------+--------------+-------------------+------+
+| iova | size | user address | permissions flags | type |
++------+------+--------------+-------------------+------+
+
+:iova: a 64-bit I/O virtual address programmed by the guest
+
+:size: a 64-bit size
+
+:user address: a 64-bit user address
+
+:permissions flags: an 8-bit value:
+  - 0: No access
+  - 1: Read access
+  - 2: Write access
+  - 3: Read/Write access
+
+:type: an 8-bit IOTLB message type:
+  - 1: IOTLB miss
+  - 2: IOTLB update
+  - 3: IOTLB invalidate
+  - 4: IOTLB access fail
+
+Virtio device config space
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
++--------+------+-------+---------+
+| offset | size | flags | payload |
++--------+------+-------+---------+
+
+:offset: a 32-bit offset of virtio device's configuration space
+
+:size: a 32-bit configuration space access size in bytes
+
+:flags: a 32-bit value:
+  - 0: Vhost master messages used for writeable fields
+  - 1: Vhost master messages used for live migration
+
+:payload: Size bytes array holding the contents of the virtio
+          device's configuration space
+
+Vring area description
+^^^^^^^^^^^^^^^^^^^^^^
+
++-----+------+--------+
+| u64 | size | offset |
++-----+------+--------+
+
+:u64: a 64-bit integer containing the vring index and flags
+
+:size: a 64-bit size of this area
+
+:offset: a 64-bit offset of this area from the start of the
+         supplied file descriptor
+
+Inflight description
+^^^^^^^^^^^^^^^^^^^^
+
++-----------+-------------+------------+------------+
+| mmap size | mmap offset | num queues | queue size |
++-----------+-------------+------------+------------+
+
+:mmap size: a 64-bit size of area to track inflight I/O
+
+:mmap offset: a 64-bit offset of this area from the start
+              of the supplied file descriptor
+
+:num queues: a 16-bit number of virtqueues
+
+:queue size: a 16-bit size of virtqueues
+
+C structure
+-----------
+
+In QEMU the vhost-user message is implemented with the following struct:
+
+.. code:: c
+
+  typedef struct VhostUserMsg {
+      VhostUserRequest request;
+      uint32_t flags;
+      uint32_t size;
+      union {
+          uint64_t u64;
+          struct vhost_vring_state state;
+          struct vhost_vring_addr addr;
+          VhostUserMemory memory;
+          VhostUserLog log;
+          struct vhost_iotlb_msg iotlb;
+          VhostUserConfig config;
+          VhostUserVringArea area;
+          VhostUserInflight inflight;
+      };
+  } QEMU_PACKED VhostUserMsg;
+
+Communication
+=============
+
+The protocol for vhost-user is based on the existing implementation of
+vhost for the Linux Kernel. Most messages that can be sent via the
+Unix domain socket implementing vhost-user have an equivalent ioctl to
+the kernel implementation.
+
+The communication consists of *master* sending message requests and
+*slave* sending message replies. Most of the requests don't require
+replies. Here is a list of the ones that do:
+
+* ``VHOST_USER_GET_FEATURES``
+* ``VHOST_USER_GET_PROTOCOL_FEATURES``
+* ``VHOST_USER_GET_VRING_BASE``
+* ``VHOST_USER_SET_LOG_BASE`` (if ``VHOST_USER_PROTOCOL_F_LOG_SHMFD``)
+* ``VHOST_USER_GET_INFLIGHT_FD`` (if ``VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD``)
+
+.. seealso::
+
+   :ref:`REPLY_ACK <reply_ack>`
+       The section on ``REPLY_ACK`` protocol extension.
+
+There are several messages that the master sends with file descriptors passed
+in the ancillary data:
+
+* ``VHOST_USER_SET_MEM_TABLE``
+* ``VHOST_USER_SET_LOG_BASE`` (if ``VHOST_USER_PROTOCOL_F_LOG_SHMFD``)
+* ``VHOST_USER_SET_LOG_FD``
+* ``VHOST_USER_SET_VRING_KICK``
+* ``VHOST_USER_SET_VRING_CALL``
+* ``VHOST_USER_SET_VRING_ERR``
+* ``VHOST_USER_SET_SLAVE_REQ_FD``
+* ``VHOST_USER_SET_INFLIGHT_FD`` (if ``VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD``)
+
+If *master* is unable to send the full message or receives a wrong
+reply it will close the connection. An optional reconnection mechanism
+can be implemented.
+
+Any protocol extensions are gated by protocol feature bits, which
+allows full backwards compatibility on both master and slave.  As
+older slaves don't support negotiating protocol features, a feature
+bit was dedicated for this purpose::
+
+  #define VHOST_USER_F_PROTOCOL_FEATURES 30
+
+Starting and stopping rings
+---------------------------
+
+Client must only process each ring when it is started.
+
+Client must only pass data between the ring and the backend, when the
+ring is enabled.
+
+If ring is started but disabled, client must process the ring without
+talking to the backend.
+
+For example, for a networking device, in the disabled state client
+must not supply any new RX packets, but must process and discard any
+TX packets.
+
+If ``VHOST_USER_F_PROTOCOL_FEATURES`` has not been negotiated, the
+ring is initialized in an enabled state.
+
+If ``VHOST_USER_F_PROTOCOL_FEATURES`` has been negotiated, the ring is
+initialized in a disabled state. Client must not pass data to/from the
+backend until ring is enabled by ``VHOST_USER_SET_VRING_ENABLE`` with
+parameter 1, or after it has been disabled by
+``VHOST_USER_SET_VRING_ENABLE`` with parameter 0.
+
+Each ring is initialized in a stopped state, client must not process
+it until ring is started, or after it has been stopped.
+
+Client must start ring upon receiving a kick (that is, detecting that
+file descriptor is readable) on the descriptor specified by
+``VHOST_USER_SET_VRING_KICK``, and stop ring upon receiving
+``VHOST_USER_GET_VRING_BASE``.
+
+While processing the rings (whether they are enabled or not), client
+must support changing some configuration aspects on the fly.
+
+Multiple queue support
+----------------------
+
+Multiple queue is treated as a protocol extension, hence the slave has
+to implement protocol features first. The multiple queues feature is
+supported only when the protocol feature ``VHOST_USER_PROTOCOL_F_MQ``
+(bit 0) is set.
+
+The max number of queue pairs the slave supports can be queried with
+message ``VHOST_USER_GET_QUEUE_NUM``. Master should stop when the
+number of requested queues is bigger than that.
+
+As all queues share one connection, the master uses a unique index for each
+queue in the sent message to identify a specified queue. One queue pair
+is enabled initially. More queues are enabled dynamically, by sending
+message ``VHOST_USER_SET_VRING_ENABLE``.
+
+Migration
+---------
+
+During live migration, the master may need to track the modifications
+the slave makes to the memory mapped regions. The client should mark
+the dirty pages in a log. Once it complies with this logging, it may
+declare the ``VHOST_F_LOG_ALL`` vhost feature.
+
+To start/stop logging of data/used ring writes, server may send
+messages ``VHOST_USER_SET_FEATURES`` with ``VHOST_F_LOG_ALL`` and
+``VHOST_USER_SET_VRING_ADDR`` with ``VHOST_VRING_F_LOG`` in ring's
+flags set to 1/0, respectively.
+
+All the modifications to memory pointed to by vring "descriptor" should
+be marked. Modifications to "used" vring should be marked if
+``VHOST_VRING_F_LOG`` is part of ring's flags.
+
+Dirty pages are of size::
+
+  #define VHOST_LOG_PAGE 0x1000
+
+The log memory fd is provided in the ancillary data of
+``VHOST_USER_SET_LOG_BASE`` message when the slave has
+``VHOST_USER_PROTOCOL_F_LOG_SHMFD`` protocol feature.
+
+The size of the log is supplied as part of ``VhostUserMsg`` which
+should be large enough to cover all known guest addresses. Log starts
+at the supplied offset in the supplied file descriptor.  The log
+covers from address 0 to the maximum of guest regions. In pseudo-code,
+to mark page at ``addr`` as dirty::
+
+  page = addr / VHOST_LOG_PAGE
+  log[page / 8] |= 1 << page % 8
+
+Where ``addr`` is the guest physical address.
+
+Use atomic operations, as the log may be concurrently manipulated.
+
+Note that when logging modifications to the used ring (when
+``VHOST_VRING_F_LOG`` is set for this ring), ``log_guest_addr`` should
+be used to calculate the log offset: the write to first byte of the
+used ring is logged at this offset from log start. Also note that this
+value might be outside the legal guest physical address range
+(i.e. does not have to be covered by the ``VhostUserMemory`` table), but
+the bit offset of the last byte of the ring must fall within the size
+supplied by ``VhostUserLog``.
+
+``VHOST_USER_SET_LOG_FD`` is an optional message with an eventfd in
+ancillary data, it may be used to inform the master that the log has
+been modified.
+
+Once the source has finished migration, rings will be stopped by the
+source. No further update must be done before rings are restarted.
+
+In postcopy migration the slave is started before all the memory has
+been received from the source host, and care must be taken to avoid
+accessing pages that have yet to be received.  The slave opens a
+'userfault'-fd and registers the memory with it; this fd is then
+passed back over to the master.  The master services requests on the
+userfaultfd for pages that are accessed and when the page is available
+it performs WAKE ioctl's on the userfaultfd to wake the stalled
+slave.  The client indicates support for this via the
+``VHOST_USER_PROTOCOL_F_PAGEFAULT`` feature.
+
+Memory access
+-------------
+
+The master sends a list of vhost memory regions to the slave using the
+``VHOST_USER_SET_MEM_TABLE`` message.  Each region has two base
+addresses: a guest address and a user address.
+
+Messages contain guest addresses and/or user addresses to reference locations
+within the shared memory.  The mapping of these addresses works as follows.
+
+User addresses map to the vhost memory region containing that user address.
+
+When the ``VIRTIO_F_IOMMU_PLATFORM`` feature has not been negotiated:
+
+* Guest addresses map to the vhost memory region containing that guest
+  address.
+
+When the ``VIRTIO_F_IOMMU_PLATFORM`` feature has been negotiated:
+
+* Guest addresses are also called I/O virtual addresses (IOVAs).  They are
+  translated to user addresses via the IOTLB.
+
+* The vhost memory region guest address is not used.
+
+IOMMU support
+-------------
+
+When the ``VIRTIO_F_IOMMU_PLATFORM`` feature has been negotiated, the
+master sends IOTLB entries update & invalidation by sending
+``VHOST_USER_IOTLB_MSG`` requests to the slave with a ``struct
+vhost_iotlb_msg`` as payload. For update events, the ``iotlb`` payload
+has to be filled with the update message type (2), the I/O virtual
+address, the size, the user virtual address, and the permissions
+flags. Addresses and size must be within vhost memory regions set via
+the ``VHOST_USER_SET_MEM_TABLE`` request. For invalidation events, the
+``iotlb`` payload has to be filled with the invalidation message type
+(3), the I/O virtual address and the size. On success, the slave is
+expected to reply with a zero payload, non-zero otherwise.
+
+The slave relies on the slave communication channel (see :ref:`Slave
+communication <slave_communication>` section below) to send IOTLB miss
+and access failure events, by sending ``VHOST_USER_SLAVE_IOTLB_MSG``
+requests to the master with a ``struct vhost_iotlb_msg`` as
+payload. For miss events, the iotlb payload has to be filled with the
+miss message type (1), the I/O virtual address and the permissions
+flags. For access failure event, the iotlb payload has to be filled
+with the access failure message type (4), the I/O virtual address and
+the permissions flags.  For synchronization purpose, the slave may
+rely on the reply-ack feature, so the master may send a reply when
+operation is completed if the reply-ack feature is negotiated and
+slave requests a reply. For miss events, completed operation means
+either master sent an update message containing the IOTLB entry
+containing requested address and permission, or master sent nothing if
+the IOTLB miss message is invalid (invalid IOVA or permission).
+
+The master isn't expected to take the initiative to send IOTLB update
+messages, as the slave sends IOTLB miss messages for the guest virtual
+memory areas it needs to access.
+
+.. _slave_communication:
+
+Slave communication
+-------------------
+
+An optional communication channel is provided if the slave declares
+``VHOST_USER_PROTOCOL_F_SLAVE_REQ`` protocol feature, to allow the
+slave to make requests to the master.
+
+The fd is provided via ``VHOST_USER_SET_SLAVE_REQ_FD`` ancillary data.
+
+A slave may then send ``VHOST_USER_SLAVE_*`` messages to the master
+using this fd communication channel.
+
+If ``VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD`` protocol feature is
+negotiated, slave can send file descriptors (at most 8 descriptors in
+each message) to master via ancillary data using this fd communication
+channel.
+
+Inflight I/O tracking
+---------------------
+
+To support reconnecting after restart or crash, slave may need to
+resubmit inflight I/Os. If virtqueue is processed in order, we can
+easily achieve that by getting the inflight descriptors from
+descriptor table (split virtqueue) or descriptor ring (packed
+virtqueue). However, it can't work when we process descriptors
+out-of-order because some entries which store the information of
+inflight descriptors in available ring (split virtqueue) or descriptor
+ring (packed virtqueue) might be overwritten by new entries. To solve
+this problem, slave needs to allocate an extra buffer to store this
+information of inflight descriptors and share it with master for
+persistence. ``VHOST_USER_GET_INFLIGHT_FD`` and
+``VHOST_USER_SET_INFLIGHT_FD`` are used to transfer this buffer
+between master and slave. And the format of this buffer is described
+below:
+
++---------------+---------------+-----+---------------+
+| queue0 region | queue1 region | ... | queueN region |
++---------------+---------------+-----+---------------+
+
+N is the number of available virtqueues. Slave could get it from num
+queues field of ``VhostUserInflight``.
+
+For split virtqueue, queue region can be implemented as:
+
+.. code:: c
+
+  typedef struct DescStateSplit {
+      /* Indicate whether this descriptor is inflight or not.
+       * Only available for head-descriptor. */
+      uint8_t inflight;
+
+      /* Padding */
+      uint8_t padding[5];
+
+      /* Maintain a list for the last batch of used descriptors.
+       * Only available when batching is used for submitting */
+      uint16_t next;
+
+      /* Used to preserve the order of fetching available descriptors.
+       * Only available for head-descriptor. */
+      uint64_t counter;
+  } DescStateSplit;
+
+  typedef struct QueueRegionSplit {
+      /* The feature flags of this region. Now it's initialized to 0. */
+      uint64_t features;
+
+      /* The version of this region. It's 1 currently.
+       * Zero value indicates an uninitialized buffer */
+      uint16_t version;
+
+      /* The size of DescStateSplit array. It's equal to the virtqueue
+       * size. Slave could get it from queue size field of VhostUserInflight. */
+      uint16_t desc_num;
+
+      /* The head of list that track the last batch of used descriptors. */
+      uint16_t last_batch_head;
+
+      /* Store the idx value of used ring */
+      uint16_t used_idx;
+
+      /* Used to track the state of each descriptor in descriptor table */
+      DescStateSplit desc[0];
+  } QueueRegionSplit;
+
+To track inflight I/O, the queue region should be processed as follows:
+
+When receiving available buffers from the driver:
+
+#. Get the next available head-descriptor index from available ring, ``i``
+
+#. Set ``desc[i].counter`` to the value of global counter
+
+#. Increase global counter by 1
+
+#. Set ``desc[i].inflight`` to 1
+
+When supplying used buffers to the driver:
+
+1. Get corresponding used head-descriptor index, i
+
+2. Set ``desc[i].next`` to ``last_batch_head``
+
+3. Set ``last_batch_head`` to ``i``
+
+#. Steps 1,2,3 may be performed repeatedly if batching is possible
+
+#. Increase the ``idx`` value of used ring by the size of the batch
+
+#. Set the ``inflight`` field of each ``DescStateSplit`` entry in the batch to 0
+
+#. Set ``used_idx`` to the ``idx`` value of used ring
+
+When reconnecting:
+
+#. If the value of ``used_idx`` does not match the ``idx`` value of
+   used ring (means the inflight field of ``DescStateSplit`` entries in
+   last batch may be incorrect),
+
+   a. Subtract the value of ``used_idx`` from the ``idx`` value of
+      used ring to get last batch size of ``DescStateSplit`` entries
+
+   #. Set the ``inflight`` field of each ``DescStateSplit`` entry to 0 in last batch
+      list which starts from ``last_batch_head``
+
+   #. Set ``used_idx`` to the ``idx`` value of used ring
+
+#. Resubmit inflight ``DescStateSplit`` entries in order of their
+   counter value
+
+For packed virtqueue, queue region can be implemented as:
+
+.. code:: c
+
+  typedef struct DescStatePacked {
+      /* Indicate whether this descriptor is inflight or not.
+       * Only available for head-descriptor. */
+      uint8_t inflight;
+
+      /* Padding */
+      uint8_t padding;
+
+      /* Link to the next free entry */
+      uint16_t next;
+
+      /* Link to the last entry of descriptor list.
+       * Only available for head-descriptor. */
+      uint16_t last;
+
+      /* The length of descriptor list.
+       * Only available for head-descriptor. */
+      uint16_t num;
+
+      /* Used to preserve the order of fetching available descriptors.
+       * Only available for head-descriptor. */
+      uint64_t counter;
+
+      /* The buffer id */
+      uint16_t id;
+
+      /* The descriptor flags */
+      uint16_t flags;
+
+      /* The buffer length */
+      uint32_t len;
+
+      /* The buffer address */
+      uint64_t addr;
+  } DescStatePacked;
+
+  typedef struct QueueRegionPacked {
+      /* The feature flags of this region. Now it's initialized to 0. */
+      uint64_t features;
+
+      /* The version of this region. It's 1 currently.
+       * Zero value indicates an uninitialized buffer */
+      uint16_t version;
+
+      /* The size of DescStatePacked array. It's equal to the virtqueue
+       * size. Slave could get it from queue size field of VhostUserInflight. */
+      uint16_t desc_num;
+
+      /* The head of free DescStatePacked entry list */
+      uint16_t free_head;
+
+      /* The old head of free DescStatePacked entry list */
+      uint16_t old_free_head;
+
+      /* The used index of descriptor ring */
+      uint16_t used_idx;
+
+      /* The old used index of descriptor ring */
+      uint16_t old_used_idx;
+
+      /* Device ring wrap counter */
+      uint8_t used_wrap_counter;
+
+      /* The old device ring wrap counter */
+      uint8_t old_used_wrap_counter;
+
+      /* Padding */
+      uint8_t padding[7];
+
+      /* Used to track the state of each descriptor fetched from descriptor ring */
+      DescStatePacked desc[0];
+  } QueueRegionPacked;
+
+To track inflight I/O, the queue region should be processed as follows:
+
+When receiving available buffers from the driver:
+
+#. Get the next available descriptor entry from descriptor ring, ``d``
+
+#. If ``d`` is head descriptor,
+
+   a. Set ``desc[old_free_head].num`` to 0
+
+   #. Set ``desc[old_free_head].counter`` to the value of global counter
+
+   #. Increase global counter by 1
+
+   #. Set ``desc[old_free_head].inflight`` to 1
+
+#. If ``d`` is last descriptor, set ``desc[old_free_head].last`` to
+   ``free_head``
+
+#. Increase ``desc[old_free_head].num`` by 1
+
+#. Set ``desc[free_head].addr``, ``desc[free_head].len``,
+   ``desc[free_head].flags``, ``desc[free_head].id`` to ``d.addr``,
+   ``d.len``, ``d.flags``, ``d.id``
+
+#. Set ``free_head`` to ``desc[free_head].next``
+
+#. If ``d`` is last descriptor, set ``old_free_head`` to ``free_head``
+
+When supplying used buffers to the driver:
+
+1. Get corresponding used head-descriptor entry from descriptor ring,
+   ``d``
+
+2. Get corresponding ``DescStatePacked`` entry, ``e``
+
+3. Set ``desc[e.last].next`` to ``free_head``
+
+4. Set ``free_head`` to the index of ``e``
+
+#. Steps 1,2,3,4 may be performed repeatedly if batching is possible
+
+#. Increase ``used_idx`` by the size of the batch and update
+   ``used_wrap_counter`` if needed
+
+#. Update ``d.flags``
+
+#. Set the ``inflight`` field of each head ``DescStatePacked`` entry
+   in the batch to 0
+
+#. Set ``old_free_head``,  ``old_used_idx``, ``old_used_wrap_counter``
+   to ``free_head``, ``used_idx``, ``used_wrap_counter``
+
+When reconnecting:
+
+#. If ``used_idx`` does not match ``old_used_idx`` (means the
+   ``inflight`` field of ``DescStatePacked`` entries in last batch may
+   be incorrect),
+
+   a. Get the next descriptor ring entry through ``old_used_idx``, ``d``
+
+   #. Use ``old_used_wrap_counter`` to calculate the available flags
+
+   #. If ``d.flags`` is not equal to the calculated flags value (means
+      slave has submitted the buffer to guest driver before crash, so
+      it has to commit the in-progress update), set ``old_free_head``,
+      ``old_used_idx``, ``old_used_wrap_counter`` to ``free_head``,
+      ``used_idx``, ``used_wrap_counter``
+
+#. Set ``free_head``, ``used_idx``, ``used_wrap_counter`` to
+   ``old_free_head``, ``old_used_idx``, ``old_used_wrap_counter``
+   (roll back any in-progress update)
+
+#. Set the ``inflight`` field of each ``DescStatePacked`` entry in
+   free list to 0
+
+#. Resubmit inflight ``DescStatePacked`` entries in order of their
+   counter value
+
+Protocol features
+-----------------
+
+.. code:: c
+
+  #define VHOST_USER_PROTOCOL_F_MQ             0
+  #define VHOST_USER_PROTOCOL_F_LOG_SHMFD      1
+  #define VHOST_USER_PROTOCOL_F_RARP           2
+  #define VHOST_USER_PROTOCOL_F_REPLY_ACK      3
+  #define VHOST_USER_PROTOCOL_F_MTU            4
+  #define VHOST_USER_PROTOCOL_F_SLAVE_REQ      5
+  #define VHOST_USER_PROTOCOL_F_CROSS_ENDIAN   6
+  #define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7
+  #define VHOST_USER_PROTOCOL_F_PAGEFAULT      8
+  #define VHOST_USER_PROTOCOL_F_CONFIG         9
+  #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD  10
+  #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER  11
+  #define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12
+
+Master message types
+--------------------
+
+``VHOST_USER_GET_FEATURES``
+  :id: 1
+  :equivalent ioctl: ``VHOST_GET_FEATURES``
+  :master payload: N/A
+  :slave payload: ``u64``
+
+  Get from the underlying vhost implementation the features bitmask.
+  Feature bit ``VHOST_USER_F_PROTOCOL_FEATURES`` signals slave support
+  for ``VHOST_USER_GET_PROTOCOL_FEATURES`` and
+  ``VHOST_USER_SET_PROTOCOL_FEATURES``.
+
+``VHOST_USER_SET_FEATURES``
+  :id: 2
+  :equivalent ioctl: ``VHOST_SET_FEATURES``
+  :master payload: ``u64``
+
+  Enable features in the underlying vhost implementation using a
+  bitmask.  Feature bit ``VHOST_USER_F_PROTOCOL_FEATURES`` signals
+  slave support for ``VHOST_USER_GET_PROTOCOL_FEATURES`` and
+  ``VHOST_USER_SET_PROTOCOL_FEATURES``.
+
+``VHOST_USER_GET_PROTOCOL_FEATURES``
+  :id: 15
+  :equivalent ioctl: ``VHOST_GET_FEATURES``
+  :master payload: N/A
+  :slave payload: ``u64``
+
+  Get the protocol feature bitmask from the underlying vhost
+  implementation.  Only legal if feature bit
+  ``VHOST_USER_F_PROTOCOL_FEATURES`` is present in
+  ``VHOST_USER_GET_FEATURES``.
+
+.. Note::
+   Slave that reported ``VHOST_USER_F_PROTOCOL_FEATURES`` must
+   support this message even before ``VHOST_USER_SET_FEATURES`` was
+   called.
+
+``VHOST_USER_SET_PROTOCOL_FEATURES``
+  :id: 16
+  :equivalent ioctl: ``VHOST_SET_FEATURES``
+  :master payload: ``u64``
+
+  Enable protocol features in the underlying vhost implementation.
+
+  Only legal if feature bit ``VHOST_USER_F_PROTOCOL_FEATURES`` is present in
+  ``VHOST_USER_GET_FEATURES``.
+
+.. Note::
+   Slave that reported ``VHOST_USER_F_PROTOCOL_FEATURES`` must support
+   this message even before ``VHOST_USER_SET_FEATURES`` was called.
+
+``VHOST_USER_SET_OWNER``
+  :id: 3
+  :equivalent ioctl: ``VHOST_SET_OWNER``
+  :master payload: N/A
+
+  Issued when a new connection is established. It sets the current
+  *master* as an owner of the session. This can be used on the *slave*
+  as a "session start" flag.
+
+``VHOST_USER_RESET_OWNER``
+  :id: 4
+  :master payload: N/A
+
+.. admonition:: Deprecated
+
+   This is no longer used. Used to be sent to request disabling all
+   rings, but some clients interpreted it to also discard connection
+   state (this interpretation would lead to bugs).  It is recommended
+   that clients either ignore this message, or use it to disable all
+   rings.
+
+``VHOST_USER_SET_MEM_TABLE``
+  :id: 5
+  :equivalent ioctl: ``VHOST_SET_MEM_TABLE``
+  :master payload: memory regions description
+  :slave payload: (postcopy only) memory regions description
+
+  Sets the memory map regions on the slave so it can translate the
+  vring addresses. In the ancillary data there is an array of file
+  descriptors for each memory mapped region. The size and ordering of
+  the fds matches the number and ordering of memory regions.
+
+  When ``VHOST_USER_POSTCOPY_LISTEN`` has been received,
+  ``SET_MEM_TABLE`` replies with the bases of the memory mapped
+  regions to the master.  The slave must have mmap'd the regions but
+  not yet accessed them and should not yet generate a userfault
+  event.
+
+.. Note::
+   ``NEED_REPLY_MASK`` is not set in this case.  QEMU will then
+   reply back to the list of mappings with an empty
+   ``VHOST_USER_SET_MEM_TABLE`` as an acknowledgement; only upon
+   reception of this message may the guest start accessing the memory
+   and generating faults.
+
+``VHOST_USER_SET_LOG_BASE``
+  :id: 6
+  :equivalent ioctl: ``VHOST_SET_LOG_BASE``
+  :master payload: ``u64``
+  :slave payload: N/A
+
+  Sets logging shared memory space.
+
+  When slave has ``VHOST_USER_PROTOCOL_F_LOG_SHMFD`` protocol feature,
+  the log memory fd is provided in the ancillary data of
+  ``VHOST_USER_SET_LOG_BASE`` message, the size and offset of shared
+  memory area provided in the message.
+
+``VHOST_USER_SET_LOG_FD``
+  :id: 7
+  :equivalent ioctl: ``VHOST_SET_LOG_FD``
+  :master payload: N/A
+
+  Sets the logging file descriptor, which is passed as ancillary data.
+
+``VHOST_USER_SET_VRING_NUM``
+  :id: 8
+  :equivalent ioctl: ``VHOST_SET_VRING_NUM``
+  :master payload: vring state description
+
+  Set the size of the queue.
+
+``VHOST_USER_SET_VRING_ADDR``
+  :id: 9
+  :equivalent ioctl: ``VHOST_SET_VRING_ADDR``
+  :master payload: vring address description
+  :slave payload: N/A
+
+  Sets the addresses of the different aspects of the vring.
+
+``VHOST_USER_SET_VRING_BASE``
+  :id: 10
+  :equivalent ioctl: ``VHOST_SET_VRING_BASE``
+  :master payload: vring state description
+
+  Sets the base offset in the available vring.
+
+``VHOST_USER_GET_VRING_BASE``
+  :id: 11
+  :equivalent ioctl: ``VHOST_GET_VRING_BASE``
+  :master payload: vring state description
+  :slave payload: vring state description
+
+  Get the available vring base offset.
+
+``VHOST_USER_SET_VRING_KICK``
+  :id: 12
+  :equivalent ioctl: ``VHOST_SET_VRING_KICK``
+  :master payload: ``u64``
+
+  Set the event file descriptor for adding buffers to the vring. It is
+  passed in the ancillary data.
+
+  Bits (0-7) of the payload contain the vring index. Bit 8 is the
+  invalid FD flag. This flag is set when there is no file descriptor
+  in the ancillary data. This signals that polling should be used
+  instead of waiting for a kick.
+
+``VHOST_USER_SET_VRING_CALL``
+  :id: 13
+  :equivalent ioctl: ``VHOST_SET_VRING_CALL``
+  :master payload: ``u64``
+
+  Set the event file descriptor to signal when buffers are used. It is
+  passed in the ancillary data.
+
+  Bits (0-7) of the payload contain the vring index. Bit 8 is the
+  invalid FD flag. This flag is set when there is no file descriptor
+  in the ancillary data. This signals that polling will be used
+  instead of waiting for the call.
+
+``VHOST_USER_SET_VRING_ERR``
+  :id: 14
+  :equivalent ioctl: ``VHOST_SET_VRING_ERR``
+  :master payload: ``u64``
+
+  Set the event file descriptor to signal when an error occurs. It is
+  passed in the ancillary data.
+
+  Bits (0-7) of the payload contain the vring index. Bit 8 is the
+  invalid FD flag. This flag is set when there is no file descriptor
+  in the ancillary data.
+
+``VHOST_USER_GET_QUEUE_NUM``
+  :id: 17
+  :equivalent ioctl: N/A
+  :master payload: N/A
+  :slave payload: ``u64``
+
+  Query how many queues the backend supports.
+
+  This request should be sent only when ``VHOST_USER_PROTOCOL_F_MQ``
+  is set in queried protocol features by
+  ``VHOST_USER_GET_PROTOCOL_FEATURES``.
+
+``VHOST_USER_SET_VRING_ENABLE``
+  :id: 18
+  :equivalent ioctl: N/A
+  :master payload: vring state description
+
+  Signal slave to enable or disable corresponding vring.
+
+  This request should be sent only when
+  ``VHOST_USER_F_PROTOCOL_FEATURES`` has been negotiated.
+
+``VHOST_USER_SEND_RARP``
+  :id: 19
+  :equivalent ioctl: N/A
+  :master payload: ``u64``
+
+  Ask the vhost-user backend to broadcast a fake RARP to notify that the
+  migration has terminated, for guests that do not support GUEST_ANNOUNCE.
+
+  Only legal if feature bit ``VHOST_USER_F_PROTOCOL_FEATURES`` is
+  present in ``VHOST_USER_GET_FEATURES`` and protocol feature bit
+  ``VHOST_USER_PROTOCOL_F_RARP`` is present in
+  ``VHOST_USER_GET_PROTOCOL_FEATURES``.  The first 6 bytes of the
+  payload contain the mac address of the guest to allow the vhost user
+  backend to construct and broadcast the fake RARP.
+
+``VHOST_USER_NET_SET_MTU``
+  :id: 20
+  :equivalent ioctl: N/A
+  :master payload: ``u64``
+
+  Set host MTU value exposed to the guest.
+
+  This request should be sent only when ``VIRTIO_NET_F_MTU`` feature
+  has been successfully negotiated, ``VHOST_USER_F_PROTOCOL_FEATURES``
+  is present in ``VHOST_USER_GET_FEATURES`` and protocol feature bit
+  ``VHOST_USER_PROTOCOL_F_NET_MTU`` is present in
+  ``VHOST_USER_GET_PROTOCOL_FEATURES``.
+
+  If ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, slave must
+  respond with zero in case the specified MTU is valid, or non-zero
+  otherwise.
+
+``VHOST_USER_SET_SLAVE_REQ_FD``
+  :id: 21
+  :equivalent ioctl: N/A
+  :master payload: N/A
+
+  Set the socket file descriptor for slave initiated requests. It is passed
+  in the ancillary data.
+
+  This request should be sent only when
+  ``VHOST_USER_F_PROTOCOL_FEATURES`` has been negotiated, and protocol
+  feature bit ``VHOST_USER_PROTOCOL_F_SLAVE_REQ`` is present in
+  ``VHOST_USER_GET_PROTOCOL_FEATURES``.  If
+  ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, slave must
+  respond with zero for success, non-zero otherwise.
+
+``VHOST_USER_IOTLB_MSG``
+  :id: 22
+  :equivalent ioctl: N/A (equivalent to ``VHOST_IOTLB_MSG`` message type)
+  :master payload: ``struct vhost_iotlb_msg``
+  :slave payload: ``u64``
+
+  Send IOTLB messages with ``struct vhost_iotlb_msg`` as payload.
+
+  Master sends such requests to update and invalidate entries in the
+  device IOTLB. The slave has to acknowledge the request with sending
+  zero as ``u64`` payload for success, non-zero otherwise.
+
+  This request should be sent only when ``VIRTIO_F_IOMMU_PLATFORM``
+  feature has been successfully negotiated.
+
+``VHOST_USER_SET_VRING_ENDIAN``
+  :id: 23
+  :equivalent ioctl: ``VHOST_SET_VRING_ENDIAN``
+  :master payload: vring state description
+
+  Set the endianness of a VQ for legacy devices. Little-endian is
+  indicated with state.num set to 0 and big-endian is indicated with
+  state.num set to 1. Other values are invalid.
+
+  This request should be sent only when
+  ``VHOST_USER_PROTOCOL_F_CROSS_ENDIAN`` has been negotiated.
+  Backends that negotiated this feature should handle both
+  endiannesses and expect this message once (per VQ) during device
+  configuration (i.e. before the master starts the VQ).
+
+``VHOST_USER_GET_CONFIG``
+  :id: 24
+  :equivalent ioctl: N/A
+  :master payload: virtio device config space
+  :slave payload: virtio device config space
+
+  When ``VHOST_USER_PROTOCOL_F_CONFIG`` is negotiated, this message is
+  submitted by the vhost-user master to fetch the contents of the
+  virtio device configuration space. The vhost-user slave's payload size
+  MUST match the master's request; the slave uses a zero-length
+  payload to indicate an error to the vhost-user master. The vhost-user
+  master may cache the contents to avoid repeated
+  ``VHOST_USER_GET_CONFIG`` calls.
+
+``VHOST_USER_SET_CONFIG``
+  :id: 25
+  :equivalent ioctl: N/A
+  :master payload: virtio device config space
+  :slave payload: N/A
+
+  When ``VHOST_USER_PROTOCOL_F_CONFIG`` is negotiated, this message is
+  submitted by the vhost-user master when the Guest changes the virtio
+  device configuration space and also can be used for live migration
+  on the destination host. The vhost-user slave must check the flags
+  field, and slaves MUST NOT accept SET_CONFIG for read-only
+  configuration space fields unless the live migration bit is set.
+
+``VHOST_USER_CREATE_CRYPTO_SESSION``
+  :id: 26
+  :equivalent ioctl: N/A
+  :master payload: crypto session description
+  :slave payload: crypto session description
+
+  Create a session for crypto operation. The server side must return
+  the session id, 0 or positive for success, negative for failure.
+  This request should be sent only when
+  ``VHOST_USER_PROTOCOL_F_CRYPTO_SESSION`` feature has been
+  successfully negotiated.  It's a required feature for crypto
+  devices.
+
+``VHOST_USER_CLOSE_CRYPTO_SESSION``
+  :id: 27
+  :equivalent ioctl: N/A
+  :master payload: ``u64``
+
+  Close a session for crypto operation which was previously
+  created by ``VHOST_USER_CREATE_CRYPTO_SESSION``.
+
+  This request should be sent only when
+  ``VHOST_USER_PROTOCOL_F_CRYPTO_SESSION`` feature has been
+  successfully negotiated.  It's a required feature for crypto
+  devices.
+
+``VHOST_USER_POSTCOPY_ADVISE``
+  :id: 28
+  :master payload: N/A
+  :slave payload: userfault fd
+
+  When ``VHOST_USER_PROTOCOL_F_PAGEFAULT`` is supported, the master
+  advises slave that a migration with postcopy enabled is underway,
+  the slave must open a userfaultfd for later use.  Note that at this
+  stage the migration is still in precopy mode.
+
+``VHOST_USER_POSTCOPY_LISTEN``
+  :id: 29
+  :master payload: N/A
+
+  Master advises slave that a transition to postcopy mode has
+  happened.  The slave must ensure that shared memory is registered
+  with userfaultfd to cause faulting of non-present pages.
+
+  This is always sent sometime after a ``VHOST_USER_POSTCOPY_ADVISE``,
+  and thus only when ``VHOST_USER_PROTOCOL_F_PAGEFAULT`` is supported.
+
+``VHOST_USER_POSTCOPY_END``
+  :id: 30
+  :slave payload: ``u64``
+
+  Master advises that postcopy migration has now completed.  The slave
+  must disable the userfaultfd. The response is an acknowledgement
+  only.
+
+  When ``VHOST_USER_PROTOCOL_F_PAGEFAULT`` is supported, this message
+  is sent at the end of the migration, after
+  ``VHOST_USER_POSTCOPY_LISTEN`` was previously sent.
+
+  The value returned is an error indication; 0 is success.
+
+``VHOST_USER_GET_INFLIGHT_FD``
+  :id: 31
+  :equivalent ioctl: N/A
+  :master payload: inflight description
+
+  When ``VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD`` protocol feature has
+  been successfully negotiated, this message is submitted by master to
+  get a shared buffer from slave. The shared buffer will be used to
+  track inflight I/O by slave. QEMU should retrieve a new one when the VM
+  is reset.
+
+``VHOST_USER_SET_INFLIGHT_FD``
+  :id: 32
+  :equivalent ioctl: N/A
+  :master payload: inflight description
+
+  When ``VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD`` protocol feature has
+  been successfully negotiated, this message is submitted by master to
+  send the shared inflight buffer back to slave so that slave could
+  get inflight I/O after a crash or restart.
+
+Slave message types
+-------------------
+
+``VHOST_USER_SLAVE_IOTLB_MSG``
+  :id: 1
+  :equivalent ioctl: N/A (equivalent to ``VHOST_IOTLB_MSG`` message type)
+  :slave payload: ``struct vhost_iotlb_msg``
+  :master payload: N/A
+
+  Send IOTLB messages with ``struct vhost_iotlb_msg`` as payload.
+  Slave sends such requests to notify of an IOTLB miss, or an IOTLB
+  access failure. If ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is
+  negotiated, and slave set the ``VHOST_USER_NEED_REPLY`` flag, master
+  must respond with zero when operation is successfully completed, or
+  non-zero otherwise.  This request should be sent only when
+  ``VIRTIO_F_IOMMU_PLATFORM`` feature has been successfully
+  negotiated.
+
+``VHOST_USER_SLAVE_CONFIG_CHANGE_MSG``
+  :id: 2
+  :equivalent ioctl: N/A
+  :slave payload: N/A
+  :master payload: N/A
+
+  When ``VHOST_USER_PROTOCOL_F_CONFIG`` is negotiated, vhost-user
+  slave sends such messages to notify that the virtio device's
+  configuration space has changed. For those host devices which can
+  support such a feature, the host driver can send a
+  ``VHOST_USER_GET_CONFIG`` message to the slave to get the latest content. If
+  ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, and slave set the
+  ``VHOST_USER_NEED_REPLY`` flag, master must respond with zero when
+  operation is successfully completed, or non-zero otherwise.
+
+``VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG``
+  :id: 3
+  :equivalent ioctl: N/A
+  :slave payload: vring area description
+  :master payload: N/A
+
+  Sets host notifier for a specified queue. The queue index is
+  contained in the ``u64`` field of the vring area description. The
+  host notifier is described by the file descriptor (typically it's a
+  VFIO device fd) which is passed as ancillary data and the size
+  (which is mmap size and should be the same as host page size) and
+  offset (which is mmap offset) carried in the vring area
+  description. QEMU can mmap the file descriptor based on the size and
+  offset to get a memory range. Registering a host notifier means
+  mapping this memory range to the VM as the specified queue's notify
+  MMIO region. Slave sends this request to tell QEMU to de-register
+  the existing notifier if any and register the new notifier if the
+  request is sent with a file descriptor.
+
+  This request should be sent only when
+  ``VHOST_USER_PROTOCOL_F_HOST_NOTIFIER`` protocol feature has been
+  successfully negotiated.
+
+.. _reply_ack:
+
+VHOST_USER_PROTOCOL_F_REPLY_ACK
+-------------------------------
+
+The original vhost-user specification only demands replies for certain
+commands. This differs from the vhost protocol implementation where
+commands are sent over an ``ioctl()`` call and block until the client
+has completed.
+
+With this protocol extension negotiated, the sender (QEMU) can set the
+``need_reply`` [Bit 3] flag to any command. This indicates that the
+client MUST respond with a Payload ``VhostUserMsg`` indicating success
+or failure. The payload should be set to zero on success or non-zero
+on failure, unless the message already has an explicit reply body.
+
+The response payload gives QEMU a deterministic indication of the result
+of the command. Today, QEMU is expected to terminate the main vhost-user
+loop upon receiving such errors. In the future, QEMU could be taught to be more
+resilient for selective requests.
+
+For the message types that already solicit a reply from the client,
+the presence of ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` or need_reply bit
+being set brings no behavioural change. (See the Communication_
+section for details.)
+
+.. _backend_conventions:
+
+Backend program conventions
+===========================
+
+vhost-user backends can provide various devices & services and may
+need to be configured manually depending on the use case. However, it
+is a good idea to follow the conventions listed here when
+possible. Users, QEMU or libvirt, can then rely on some common
+behaviour to avoid heterogeneous configuration and management of the
+backend programs and facilitate interoperability.
+
+Each backend installed on a host system should come with at least one
+JSON file that conforms to the vhost-user.json schema. Each file
+informs the management applications about the backend type, and binary
+location. In addition, it defines rules for management apps for
+picking the highest priority backend when multiple match the search
+criteria (see ``@VhostUserBackend`` documentation in the schema file).
+
+If the backend is not capable of enabling a requested feature on the
+host (such as 3D acceleration with virgl), or the initialization
+failed, the backend should fail to start early and exit with a status
+!= 0. It may also print a message to stderr for further details.
+
+The backend program must not daemonize itself, but it may be
+daemonized by the management layer. It may also have restricted
+access to the system.
+
+File descriptors 0, 1 and 2 will exist, and have regular
+stdin/stdout/stderr usage (they may have been redirected to /dev/null
+by the management layer, or to a log handler).
+
+The backend program must end (as quickly and cleanly as possible) when
+the SIGTERM signal is received. Eventually, it may receive SIGKILL by
+the management layer after a few seconds.
+
+The following command line options have an expected behaviour. They
+are mandatory, unless explicitly said differently:
+
+--socket-path=PATH
+
+  This option specifies the location of the vhost-user Unix domain socket.
+  It is incompatible with --fd.
+
+--fd=FDNUM
+
+  When this argument is given, the backend program is started with the
+  vhost-user socket as file descriptor FDNUM. It is incompatible with
+  --socket-path.
+
+--print-capabilities
+
+  Output to stdout the backend capabilities in JSON format, and then
+  exit successfully. Other options and arguments should be ignored, and
+  the backend program should not perform its normal function.  The
+  capabilities can be reported dynamically depending on the host
+  capabilities.
+
+The JSON output is described in the ``vhost-user.json`` schema, by
+``@VHostUserBackendCapabilities``.  Example:
+
+.. code:: json
+
+  {
+    "type": "foo",
+    "features": [
+      "feature-a",
+      "feature-b"
+    ]
+  }
+
+vhost-user-input
+----------------
+
+Command line options:
+
+--evdev-path=PATH
+
+  Specify the linux input device.
+
+  (optional)
+
+--no-grab
+
+  Do not request exclusive access to the input device.
+
+  (optional)
+
+vhost-user-gpu
+--------------
+
+Command line options:
+
+--render-node=PATH
+
+  Specify the GPU DRM render node.
+
+  (optional)
+
+--virgl
+
+  Enable virgl rendering support.
+
+  (optional)
diff --git a/docs/interop/vhost-user.txt b/docs/interop/vhost-user.txt
deleted file mode 100644
index 4dbd530cb9..0000000000
--- a/docs/interop/vhost-user.txt
+++ /dev/null
@@ -1,1219 +0,0 @@
-Vhost-user Protocol
-===================
-
-Copyright (c) 2014 Virtual Open Systems Sarl.
-
-This work is licensed under the terms of the GNU GPL, version 2 or later.
-See the COPYING file in the top-level directory.
-===================
-
-This protocol is aiming to complement the ioctl interface used to control the
-vhost implementation in the Linux kernel. It implements the control plane needed
-to establish virtqueue sharing with a user space process on the same host. It
-uses communication over a Unix domain socket to share file descriptors in the
-ancillary data of the message.
-
-The protocol defines 2 sides of the communication, master and slave. Master is
-the application that shares its virtqueues, in our case QEMU. Slave is the
-consumer of the virtqueues.
-
-In the current implementation QEMU is the Master, and the Slave is the
-external process consuming the virtio queues, for example a software
-Ethernet switch running in user space, such as Snabbswitch, or a block
-device backend processing read & write to a virtual disk. In order to
-facilitate interoperability between various backend implementations,
-it is recommended to follow the "Backend program conventions"
-described in this document.
-
-Master and slave can be either a client (i.e. connecting) or server (listening)
-in the socket communication.
-
-Message Specification
----------------------
-
-Note that all numbers are in the machine native byte order. A vhost-user message
-consists of 3 header fields and a payload:
-
-------------------------------------
-| request | flags | size | payload |
-------------------------------------
-
- * Request: 32-bit type of the request
- * Flags: 32-bit bit field:
-   - Lower 2 bits are the version (currently 0x01)
-   - Bit 2 is the reply flag - needs to be sent on each reply from the slave
-   - Bit 3 is the need_reply flag - see VHOST_USER_PROTOCOL_F_REPLY_ACK for
-     details.
- * Size - 32-bit size of the payload
-
-
-Depending on the request type, payload can be:
-
- * A single 64-bit integer
-   -------
-   | u64 |
-   -------
-
-   u64: a 64-bit unsigned integer
-
- * A vring state description
-   ---------------
-   | index | num |
-   ---------------
-
-   Index: a 32-bit index
-   Num: a 32-bit number
-
- * A vring address description
-   --------------------------------------------------------------
-   | index | flags | size | descriptor | used | available | log |
-   --------------------------------------------------------------
-
-   Index: a 32-bit vring index
-   Flags: a 32-bit vring flags
-   Descriptor: a 64-bit ring address of the vring descriptor table
-   Used: a 64-bit ring address of the vring used ring
-   Available: a 64-bit ring address of the vring available ring
-   Log: a 64-bit guest address for logging
-
-   Note that a ring address is an IOVA if VIRTIO_F_IOMMU_PLATFORM has been
-   negotiated.  Otherwise it is a user address.
-
- * Memory regions description
-   ---------------------------------------------------
-   | num regions | padding | region0 | ... | region7 |
-   ---------------------------------------------------
-
-   Num regions: a 32-bit number of regions
-   Padding: 32-bit
-
-   A region is:
-   -----------------------------------------------------
-   | guest address | size | user address | mmap offset |
-   -----------------------------------------------------
-
-   Guest address: a 64-bit guest address of the region
-   Size: a 64-bit size
-   User address: a 64-bit user address
-   mmap offset: 64-bit offset where region starts in the mapped memory
-
-* Log description
-   ---------------------------
-   | log size | log offset |
-   ---------------------------
-   log size: size of area used for logging
-   log offset: offset from start of supplied file descriptor
-       where logging starts (i.e. where guest address 0 would be logged)
-
- * An IOTLB message
-   ---------------------------------------------------------
-   | iova | size | user address | permissions flags | type |
-   ---------------------------------------------------------
-
-   IOVA: a 64-bit I/O virtual address programmed by the guest
-   Size: a 64-bit size
-   User address: a 64-bit user address
-   Permissions: an 8-bit value:
-    - 0: No access
-    - 1: Read access
-    - 2: Write access
-    - 3: Read/Write access
-   Type: an 8-bit IOTLB message type:
-    - 1: IOTLB miss
-    - 2: IOTLB update
-    - 3: IOTLB invalidate
-    - 4: IOTLB access fail
-
- * Virtio device config space
-   -----------------------------------
-   | offset | size | flags | payload |
-   -----------------------------------
-
-   Offset: a 32-bit offset of virtio device's configuration space
-   Size: a 32-bit configuration space access size in bytes
-   Flags: a 32-bit value:
-    - 0: Vhost master messages used for writeable fields
-    - 1: Vhost master messages used for live migration
-   Payload: Size bytes array holding the contents of the virtio
-       device's configuration space
-
- * Vring area description
-   -----------------------
-   | u64 | size | offset |
-   -----------------------
-
-   u64: a 64-bit integer contains vring index and flags
-   Size: a 64-bit size of this area
-   Offset: a 64-bit offset of this area from the start of the
-       supplied file descriptor
-
- * Inflight description
-   -----------------------------------------------------
-   | mmap size | mmap offset | num queues | queue size |
-   -----------------------------------------------------
-
-   mmap size: a 64-bit size of area to track inflight I/O
-   mmap offset: a 64-bit offset of this area from the start
-                of the supplied file descriptor
-   num queues: a 16-bit number of virtqueues
-   queue size: a 16-bit size of virtqueues
-
-In QEMU the vhost-user message is implemented with the following struct:
-
-typedef struct VhostUserMsg {
-    VhostUserRequest request;
-    uint32_t flags;
-    uint32_t size;
-    union {
-        uint64_t u64;
-        struct vhost_vring_state state;
-        struct vhost_vring_addr addr;
-        VhostUserMemory memory;
-        VhostUserLog log;
-        struct vhost_iotlb_msg iotlb;
-        VhostUserConfig config;
-        VhostUserVringArea area;
-        VhostUserInflight inflight;
-    };
-} QEMU_PACKED VhostUserMsg;
-
-Communication
--------------
-
-The protocol for vhost-user is based on the existing implementation of vhost
-for the Linux Kernel. Most messages that can be sent via the Unix domain socket
-implementing vhost-user have an equivalent ioctl to the kernel implementation.
-
-The communication consists of master sending message requests and slave sending
-message replies. Most of the requests don't require replies. Here is a list of
-the ones that do:
-
- * VHOST_USER_GET_FEATURES
- * VHOST_USER_GET_PROTOCOL_FEATURES
- * VHOST_USER_GET_VRING_BASE
- * VHOST_USER_SET_LOG_BASE (if VHOST_USER_PROTOCOL_F_LOG_SHMFD)
- * VHOST_USER_GET_INFLIGHT_FD (if VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)
-
-[ Also see the section on REPLY_ACK protocol extension. ]
-
-There are several messages that the master sends with file descriptors passed
-in the ancillary data:
-
- * VHOST_USER_SET_MEM_TABLE
- * VHOST_USER_SET_LOG_BASE (if VHOST_USER_PROTOCOL_F_LOG_SHMFD)
- * VHOST_USER_SET_LOG_FD
- * VHOST_USER_SET_VRING_KICK
- * VHOST_USER_SET_VRING_CALL
- * VHOST_USER_SET_VRING_ERR
- * VHOST_USER_SET_SLAVE_REQ_FD
- * VHOST_USER_SET_INFLIGHT_FD (if VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)
-
-If Master is unable to send the full message or receives a wrong reply it will
-close the connection. An optional reconnection mechanism can be implemented.
-
-Any protocol extensions are gated by protocol feature bits,
-which allows full backwards compatibility on both master
-and slave.
-As older slaves don't support negotiating protocol features,
-a feature bit was dedicated for this purpose:
-#define VHOST_USER_F_PROTOCOL_FEATURES 30
-
-Starting and stopping rings
-----------------------
-Client must only process each ring when it is started.
-
-Client must only pass data between the ring and the
-backend, when the ring is enabled.
-
-If ring is started but disabled, client must process the
-ring without talking to the backend.
-
-For example, for a networking device, in the disabled state
-client must not supply any new RX packets, but must process
-and discard any TX packets.
-
-If VHOST_USER_F_PROTOCOL_FEATURES has not been negotiated, the ring is initialized
-in an enabled state.
-
-If VHOST_USER_F_PROTOCOL_FEATURES has been negotiated, the ring is initialized
-in a disabled state. Client must not pass data to/from the backend until ring is enabled by
-VHOST_USER_SET_VRING_ENABLE with parameter 1, or after it has been disabled by
-VHOST_USER_SET_VRING_ENABLE with parameter 0.
-
-Each ring is initialized in a stopped state, client must not process it until
-ring is started, or after it has been stopped.
-
-Client must start ring upon receiving a kick (that is, detecting that file
-descriptor is readable) on the descriptor specified by
-VHOST_USER_SET_VRING_KICK, and stop ring upon receiving
-VHOST_USER_GET_VRING_BASE.
-
-While processing the rings (whether they are enabled or not), client must
-support changing some configuration aspects on the fly.
-
-Multiple queue support
-----------------------
-
-Multiple queue is treated as a protocol extension, hence the slave has to
-implement protocol features first. The multiple queues feature is supported
-only when the protocol feature VHOST_USER_PROTOCOL_F_MQ (bit 0) is set.
-
-The max number of queue pairs the slave supports can be queried with message
-VHOST_USER_GET_QUEUE_NUM. Master should stop when the number of
-requested queues is bigger than that.
-
-As all queues share one connection, the master uses a unique index for each
-queue in the sent message to identify a specified queue. One queue pair
-is enabled initially. More queues are enabled dynamically, by sending
-message VHOST_USER_SET_VRING_ENABLE.
-
-Migration
----------
-
-During live migration, the master may need to track the modifications
-the slave makes to the memory mapped regions. The client should mark
-the dirty pages in a log. Once it complies to this logging, it may
-declare the VHOST_F_LOG_ALL vhost feature.
-
-To start/stop logging of data/used ring writes, server may send messages
-VHOST_USER_SET_FEATURES with VHOST_F_LOG_ALL and VHOST_USER_SET_VRING_ADDR with
-VHOST_VRING_F_LOG in ring's flags set to 1/0, respectively.
-
-All the modifications to memory pointed by vring "descriptor" should
-be marked. Modifications to "used" vring should be marked if
-VHOST_VRING_F_LOG is part of ring's flags.
-
-Dirty pages are of size:
-#define VHOST_LOG_PAGE 0x1000
-
-The log memory fd is provided in the ancillary data of
-VHOST_USER_SET_LOG_BASE message when the slave has
-VHOST_USER_PROTOCOL_F_LOG_SHMFD protocol feature.
-
-The size of the log is supplied as part of VhostUserMsg
-which should be large enough to cover all known guest
-addresses. Log starts at the supplied offset in the
-supplied file descriptor.
-The log covers from address 0 to the maximum of guest
-regions. In pseudo-code, to mark page at "addr" as dirty:
-
-page = addr / VHOST_LOG_PAGE
-log[page / 8] |= 1 << page % 8
-
-Where addr is the guest physical address.
-
-Use atomic operations, as the log may be concurrently manipulated.
-
-Note that when logging modifications to the used ring (when VHOST_VRING_F_LOG
-is set for this ring), log_guest_addr should be used to calculate the log
-offset: the write to first byte of the used ring is logged at this offset from
-log start. Also note that this value might be outside the legal guest physical
-address range (i.e. does not have to be covered by the VhostUserMemory table),
-but the bit offset of the last byte of the ring must fall within
-the size supplied by VhostUserLog.
-
-VHOST_USER_SET_LOG_FD is an optional message with an eventfd in
-ancillary data, it may be used to inform the master that the log has
-been modified.
-
-Once the source has finished migration, rings will be stopped by
-the source. No further update must be done before rings are
-restarted.
-
-In postcopy migration the slave is started before all the memory has been
-received from the source host, and care must be taken to avoid accessing pages
-that have yet to be received.  The slave opens a 'userfault'-fd and registers
-the memory with it; this fd is then passed back over to the master.
-The master services requests on the userfaultfd for pages that are accessed
-and when the page is available it performs WAKE ioctl's on the userfaultfd
-to wake the stalled slave.  The client indicates support for this via the
-VHOST_USER_PROTOCOL_F_PAGEFAULT feature.
-
-Memory access
--------------
-
-The master sends a list of vhost memory regions to the slave using the
-VHOST_USER_SET_MEM_TABLE message.  Each region has two base addresses: a guest
-address and a user address.
-
-Messages contain guest addresses and/or user addresses to reference locations
-within the shared memory.  The mapping of these addresses works as follows.
-
-User addresses map to the vhost memory region containing that user address.
-
-When the VIRTIO_F_IOMMU_PLATFORM feature has not been negotiated:
-
- * Guest addresses map to the vhost memory region containing that guest
-   address.
-
-When the VIRTIO_F_IOMMU_PLATFORM feature has been negotiated:
-
- * Guest addresses are also called I/O virtual addresses (IOVAs).  They are
-   translated to user addresses via the IOTLB.
-
- * The vhost memory region guest address is not used.
-
-IOMMU support
--------------
-
-When the VIRTIO_F_IOMMU_PLATFORM feature has been negotiated, the master
-sends IOTLB entries update & invalidation by sending VHOST_USER_IOTLB_MSG
-requests to the slave with a struct vhost_iotlb_msg as payload. For update
-events, the iotlb payload has to be filled with the update message type (2),
-the I/O virtual address, the size, the user virtual address, and the
-permissions flags. Addresses and size must be within vhost memory regions set
-via the VHOST_USER_SET_MEM_TABLE request. For invalidation events, the iotlb
-payload has to be filled with the invalidation message type (3), the I/O virtual
-address and the size. On success, the slave is expected to reply with a zero
-payload, non-zero otherwise.
-
-The slave relies on the slave communication channel (see "Slave communication"
-section below) to send IOTLB miss and access failure events, by sending
-VHOST_USER_SLAVE_IOTLB_MSG requests to the master with a struct vhost_iotlb_msg
-as payload. For miss events, the iotlb payload has to be filled with the miss
-message type (1), the I/O virtual address and the permissions flags. For access
-failure event, the iotlb payload has to be filled with the access failure
-message type (4), the I/O virtual address and the permissions flags.
-For synchronization purposes, the slave may rely on the reply-ack feature,
-so the master may send a reply when the operation is completed if the reply-ack
-feature is negotiated and the slave requests a reply. For miss events, completed
-operation means either master sent an update message containing the IOTLB entry
-containing requested address and permission, or master sent nothing if the IOTLB
-miss message is invalid (invalid IOVA or permission).
-
-The master isn't expected to take the initiative to send IOTLB update messages,
-as the slave sends IOTLB miss messages for the guest virtual memory areas it
-needs to access.
-
-Slave communication
--------------------
-
-An optional communication channel is provided if the slave declares
-VHOST_USER_PROTOCOL_F_SLAVE_REQ protocol feature, to allow the slave to make
-requests to the master.
-
-The fd is provided via VHOST_USER_SET_SLAVE_REQ_FD ancillary data.
-
-A slave may then send VHOST_USER_SLAVE_* messages to the master
-using this fd communication channel.
-
-If VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD protocol feature is negotiated,
-slave can send file descriptors (at most 8 descriptors in each message)
-to master via ancillary data using this fd communication channel.
-
-Inflight I/O tracking
----------------------
-
-To support reconnecting after restart or crash, slave may need to resubmit
-inflight I/Os. If virtqueue is processed in order, we can easily achieve
-that by getting the inflight descriptors from descriptor table (split virtqueue)
-or descriptor ring (packed virtqueue). However, it can't work when we process
-descriptors out-of-order because some entries which store the information of
-inflight descriptors in available ring (split virtqueue) or descriptor
-ring (packed virtqueue) might be overwritten by new entries. To solve this
-problem, the slave needs to allocate an extra buffer to store this information of inflight
-descriptors and share it with the master for persistence. VHOST_USER_GET_INFLIGHT_FD and
-VHOST_USER_SET_INFLIGHT_FD are used to transfer this buffer between master
-and slave. And the format of this buffer is described below:
-
--------------------------------------------------------
-| queue0 region | queue1 region | ... | queueN region |
--------------------------------------------------------
-
-N is the number of available virtqueues. Slave could get it from num queues
-field of VhostUserInflight.
-
-For split virtqueue, queue region can be implemented as:
-
-typedef struct DescStateSplit {
-    /* Indicate whether this descriptor is inflight or not.
-     * Only available for head-descriptor. */
-    uint8_t inflight;
-
-    /* Padding */
-    uint8_t padding[5];
-
-    /* Maintain a list for the last batch of used descriptors.
-     * Only available when batching is used for submitting */
-    uint16_t next;
-
-    /* Used to preserve the order of fetching available descriptors.
-     * Only available for head-descriptor. */
-    uint64_t counter;
-} DescStateSplit;
-
-typedef struct QueueRegionSplit {
-    /* The feature flags of this region. Now it's initialized to 0. */
-    uint64_t features;
-
-    /* The version of this region. It's 1 currently.
-     * Zero value indicates an uninitialized buffer */
-    uint16_t version;
-
-    /* The size of DescStateSplit array. It's equal to the virtqueue
-     * size. Slave could get it from queue size field of VhostUserInflight. */
-    uint16_t desc_num;
-
-    /* The head of list that track the last batch of used descriptors. */
-    uint16_t last_batch_head;
-
-    /* Store the idx value of used ring */
-    uint16_t used_idx;
-
-    /* Used to track the state of each descriptor in descriptor table */
-    DescStateSplit desc[0];
-} QueueRegionSplit;
-
-To track inflight I/O, the queue region should be processed as follows:
-
-When receiving available buffers from the driver:
-
-    1. Get the next available head-descriptor index from available ring, i
-
-    2. Set desc[i].counter to the value of global counter
-
-    3. Increase global counter by 1
-
-    4. Set desc[i].inflight to 1
-
-When supplying used buffers to the driver:
-
-    1. Get corresponding used head-descriptor index, i
-
-    2. Set desc[i].next to last_batch_head
-
-    3. Set last_batch_head to i
-
-    4. Steps 1,2,3 may be performed repeatedly if batching is possible
-
-    5. Increase the idx value of used ring by the size of the batch
-
-    6. Set the inflight field of each DescStateSplit entry in the batch to 0
-
-    7. Set used_idx to the idx value of used ring
-
-When reconnecting:
-
-    1. If the value of used_idx does not match the idx value of used ring (means
-    the inflight field of DescStateSplit entries in last batch may be incorrect),
-
-        (a) Subtract the value of used_idx from the idx value of used ring to get
-        last batch size of DescStateSplit entries
-
-        (b) Set the inflight field of each DescStateSplit entry to 0 in last batch
-        list which starts from last_batch_head
-
-        (c) Set used_idx to the idx value of used ring
-
-    2. Resubmit inflight DescStateSplit entries in order of their counter value
-
-For packed virtqueue, queue region can be implemented as:
-
-typedef struct DescStatePacked {
-    /* Indicate whether this descriptor is inflight or not.
-     * Only available for head-descriptor. */
-    uint8_t inflight;
-
-    /* Padding */
-    uint8_t padding;
-
-    /* Link to the next free entry */
-    uint16_t next;
-
-    /* Link to the last entry of descriptor list.
-     * Only available for head-descriptor. */
-    uint16_t last;
-
-    /* The length of descriptor list.
-     * Only available for head-descriptor. */
-    uint16_t num;
-
-    /* Used to preserve the order of fetching available descriptors.
-     * Only available for head-descriptor. */
-    uint64_t counter;
-
-    /* The buffer id */
-    uint16_t id;
-
-    /* The descriptor flags */
-    uint16_t flags;
-
-    /* The buffer length */
-    uint32_t len;
-
-    /* The buffer address */
-    uint64_t addr;
-} DescStatePacked;
-
-typedef struct QueueRegionPacked {
-    /* The feature flags of this region. Now it's initialized to 0. */
-    uint64_t features;
-
-    /* The version of this region. It's 1 currently.
-     * Zero value indicates an uninitialized buffer */
-    uint16_t version;
-
-    /* The size of DescStatePacked array. It's equal to the virtqueue
-     * size. Slave could get it from queue size field of VhostUserInflight. */
-    uint16_t desc_num;
-
-    /* The head of free DescStatePacked entry list */
-    uint16_t free_head;
-
-    /* The old head of free DescStatePacked entry list */
-    uint16_t old_free_head;
-
-    /* The used index of descriptor ring */
-    uint16_t used_idx;
-
-    /* The old used index of descriptor ring */
-    uint16_t old_used_idx;
-
-    /* Device ring wrap counter */
-    uint8_t used_wrap_counter;
-
-    /* The old device ring wrap counter */
-    uint8_t old_used_wrap_counter;
-
-    /* Padding */
-    uint8_t padding[7];
-
-    /* Used to track the state of each descriptor fetched from descriptor ring */
-    DescStatePacked desc[0];
-} QueueRegionPacked;
-
-To track inflight I/O, the queue region should be processed as follows:
-
-When receiving available buffers from the driver:
-
-    1. Get the next available descriptor entry from descriptor ring, d
-
-    2. If d is head descriptor,
-
-        (a) Set desc[old_free_head].num to 0
-
-        (b) Set desc[old_free_head].counter to the value of global counter
-
-        (c) Increase global counter by 1
-
-        (d) Set desc[old_free_head].inflight to 1
-
-    3. If d is last descriptor, set desc[old_free_head].last to free_head
-
-    4. Increase desc[old_free_head].num by 1
-
-    5. Set desc[free_head].addr, desc[free_head].len, desc[free_head].flags,
-    desc[free_head].id to d.addr, d.len, d.flags, d.id
-
-    6. Set free_head to desc[free_head].next
-
-    7. If d is last descriptor, set old_free_head to free_head
-
-When supplying used buffers to the driver:
-
-    1. Get corresponding used head-descriptor entry from descriptor ring, d
-
-    2. Get corresponding DescStatePacked entry, e
-
-    3. Set desc[e.last].next to free_head
-
-    4. Set free_head to the index of e
-
-    5. Steps 1,2,3,4 may be performed repeatedly if batching is possible
-
-    6. Increase used_idx by the size of the batch and update used_wrap_counter if needed
-
-    7. Update d.flags
-
-    8. Set the inflight field of each head DescStatePacked entry in the batch to 0
-
-    9. Set old_free_head, old_used_idx, old_used_wrap_counter to free_head, used_idx,
-    used_wrap_counter
-
-When reconnecting:
-
-    1. If used_idx does not match old_used_idx (means the inflight field of DescStatePacked
-    entries in last batch may be incorrect),
-
-        (a) Get the next descriptor ring entry through old_used_idx, d
-
-        (b) Use old_used_wrap_counter to calculate the available flags
-
-        (c) If d.flags is not equal to the calculated flags value (means slave has
-        submitted the buffer to guest driver before crash, so it has to commit the
-        in-progress update), set old_free_head, old_used_idx, old_used_wrap_counter
-        to free_head, used_idx, used_wrap_counter
-
-    2. Set free_head, used_idx, used_wrap_counter to old_free_head, old_used_idx,
-    old_used_wrap_counter (roll back any in-progress update)
-
-    3. Set the inflight field of each DescStatePacked entry in free list to 0
-
-    4. Resubmit inflight DescStatePacked entries in order of their counter value
-
-Protocol features
------------------
-
-#define VHOST_USER_PROTOCOL_F_MQ             0
-#define VHOST_USER_PROTOCOL_F_LOG_SHMFD      1
-#define VHOST_USER_PROTOCOL_F_RARP           2
-#define VHOST_USER_PROTOCOL_F_REPLY_ACK      3
-#define VHOST_USER_PROTOCOL_F_MTU            4
-#define VHOST_USER_PROTOCOL_F_SLAVE_REQ      5
-#define VHOST_USER_PROTOCOL_F_CROSS_ENDIAN   6
-#define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7
-#define VHOST_USER_PROTOCOL_F_PAGEFAULT      8
-#define VHOST_USER_PROTOCOL_F_CONFIG         9
-#define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD  10
-#define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER  11
-#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12
-
-Master message types
---------------------
-
- * VHOST_USER_GET_FEATURES
-
-      Id: 1
-      Equivalent ioctl: VHOST_GET_FEATURES
-      Master payload: N/A
-      Slave payload: u64
-
-      Get from the underlying vhost implementation the features bitmask.
-      Feature bit VHOST_USER_F_PROTOCOL_FEATURES signals slave support for
-      VHOST_USER_GET_PROTOCOL_FEATURES and VHOST_USER_SET_PROTOCOL_FEATURES.
-
- * VHOST_USER_SET_FEATURES
-
-      Id: 2
-      Ioctl: VHOST_SET_FEATURES
-      Master payload: u64
-
-      Enable features in the underlying vhost implementation using a bitmask.
-      Feature bit VHOST_USER_F_PROTOCOL_FEATURES signals slave support for
-      VHOST_USER_GET_PROTOCOL_FEATURES and VHOST_USER_SET_PROTOCOL_FEATURES.
-
- * VHOST_USER_GET_PROTOCOL_FEATURES
-
-      Id: 15
-      Equivalent ioctl: VHOST_GET_FEATURES
-      Master payload: N/A
-      Slave payload: u64
-
-      Get the protocol feature bitmask from the underlying vhost implementation.
-      Only legal if feature bit VHOST_USER_F_PROTOCOL_FEATURES is present in
-      VHOST_USER_GET_FEATURES.
-      Note: slave that reported VHOST_USER_F_PROTOCOL_FEATURES must support
-      this message even before VHOST_USER_SET_FEATURES was called.
-
- * VHOST_USER_SET_PROTOCOL_FEATURES
-
-      Id: 16
-      Ioctl: VHOST_SET_FEATURES
-      Master payload: u64
-
-      Enable protocol features in the underlying vhost implementation.
-      Only legal if feature bit VHOST_USER_F_PROTOCOL_FEATURES is present in
-      VHOST_USER_GET_FEATURES.
-      Note: slave that reported VHOST_USER_F_PROTOCOL_FEATURES must support
-      this message even before VHOST_USER_SET_FEATURES was called.
-
- * VHOST_USER_SET_OWNER
-
-      Id: 3
-      Equivalent ioctl: VHOST_SET_OWNER
-      Master payload: N/A
-
-      Issued when a new connection is established. It sets the current Master
-      as an owner of the session. This can be used on the Slave as a
-      "session start" flag.
-
- * VHOST_USER_RESET_OWNER
-
-      Id: 4
-      Master payload: N/A
-
-      This is no longer used. Used to be sent to request disabling
-      all rings, but some clients interpreted it to also discard
-      connection state (this interpretation would lead to bugs).
-      It is recommended that clients either ignore this message,
-      or use it to disable all rings.
-
- * VHOST_USER_SET_MEM_TABLE
-
-      Id: 5
-      Equivalent ioctl: VHOST_SET_MEM_TABLE
-      Master payload: memory regions description
-      Slave payload: (postcopy only) memory regions description
-
-      Sets the memory map regions on the slave so it can translate the vring
-      addresses. In the ancillary data there is an array of file descriptors
-      for each memory mapped region. The size and ordering of the fds matches
-      the number and ordering of memory regions.
-
-      When VHOST_USER_POSTCOPY_LISTEN has been received, SET_MEM_TABLE replies with
-      the bases of the memory mapped regions to the master.  The slave must
-      have mmap'd the regions but not yet accessed them and should not yet generate
-      a userfault event. Note NEED_REPLY_MASK is not set in this case.
-      QEMU will then reply back to the list of mappings with an empty
-      VHOST_USER_SET_MEM_TABLE as an acknowledgment; only upon reception of this
-      message may the guest start accessing the memory and generating faults.
-
- * VHOST_USER_SET_LOG_BASE
-
-      Id: 6
-      Equivalent ioctl: VHOST_SET_LOG_BASE
-      Master payload: u64
-      Slave payload: N/A
-
-      Sets logging shared memory space.
-      When slave has VHOST_USER_PROTOCOL_F_LOG_SHMFD protocol
-      feature, the log memory fd is provided in the ancillary data of
-      VHOST_USER_SET_LOG_BASE message, the size and offset of shared
-      memory area provided in the message.
-
-
- * VHOST_USER_SET_LOG_FD
-
-      Id: 7
-      Equivalent ioctl: VHOST_SET_LOG_FD
-      Master payload: N/A
-
-      Sets the logging file descriptor, which is passed as ancillary data.
-
- * VHOST_USER_SET_VRING_NUM
-
-      Id: 8
-      Equivalent ioctl: VHOST_SET_VRING_NUM
-      Master payload: vring state description
-
-      Set the size of the queue.
-
- * VHOST_USER_SET_VRING_ADDR
-
-      Id: 9
-      Equivalent ioctl: VHOST_SET_VRING_ADDR
-      Master payload: vring address description
-      Slave payload: N/A
-
-      Sets the addresses of the different aspects of the vring.
-
- * VHOST_USER_SET_VRING_BASE
-
-      Id: 10
-      Equivalent ioctl: VHOST_SET_VRING_BASE
-      Master payload: vring state description
-
-      Sets the base offset in the available vring.
-
- * VHOST_USER_GET_VRING_BASE
-
-      Id: 11
-      Equivalent ioctl: VHOST_GET_VRING_BASE
-      Master payload: vring state description
-      Slave payload: vring state description
-
-      Get the available vring base offset.
-
- * VHOST_USER_SET_VRING_KICK
-
-      Id: 12
-      Equivalent ioctl: VHOST_SET_VRING_KICK
-      Master payload: u64
-
-      Set the event file descriptor for adding buffers to the vring. It
-      is passed in the ancillary data.
-      Bits (0-7) of the payload contain the vring index. Bit 8 is the
-      invalid FD flag. This flag is set when there is no file descriptor
-      in the ancillary data. This signals that polling should be used
-      instead of waiting for a kick.
-
- * VHOST_USER_SET_VRING_CALL
-
-      Id: 13
-      Equivalent ioctl: VHOST_SET_VRING_CALL
-      Master payload: u64
-
-      Set the event file descriptor to signal when buffers are used. It
-      is passed in the ancillary data.
-      Bits (0-7) of the payload contain the vring index. Bit 8 is the
-      invalid FD flag. This flag is set when there is no file descriptor
-      in the ancillary data. This signals that polling will be used
-      instead of waiting for the call.
-
- * VHOST_USER_SET_VRING_ERR
-
-      Id: 14
-      Equivalent ioctl: VHOST_SET_VRING_ERR
-      Master payload: u64
-
-      Set the event file descriptor to signal when error occurs. It
-      is passed in the ancillary data.
-      Bits (0-7) of the payload contain the vring index. Bit 8 is the
-      invalid FD flag. This flag is set when there is no file descriptor
-      in the ancillary data.
-
- * VHOST_USER_GET_QUEUE_NUM
-
-      Id: 17
-      Equivalent ioctl: N/A
-      Master payload: N/A
-      Slave payload: u64
-
-      Query how many queues the backend supports. This request should be
-      sent only when VHOST_USER_PROTOCOL_F_MQ is set in queried protocol
-      features by VHOST_USER_GET_PROTOCOL_FEATURES.
-
- * VHOST_USER_SET_VRING_ENABLE
-
-      Id: 18
-      Equivalent ioctl: N/A
-      Master payload: vring state description
-
-      Signal slave to enable or disable corresponding vring.
-      This request should be sent only when VHOST_USER_F_PROTOCOL_FEATURES
-      has been negotiated.
-
- * VHOST_USER_SEND_RARP
-
-      Id: 19
-      Equivalent ioctl: N/A
-      Master payload: u64
-
-      Ask vhost user backend to broadcast a fake RARP to notify the migration
-      is terminated for guest that does not support GUEST_ANNOUNCE.
-      Only legal if feature bit VHOST_USER_F_PROTOCOL_FEATURES is present in
-      VHOST_USER_GET_FEATURES and protocol feature bit VHOST_USER_PROTOCOL_F_RARP
-      is present in VHOST_USER_GET_PROTOCOL_FEATURES.
-      The first 6 bytes of the payload contain the mac address of the guest to
-      allow the vhost user backend to construct and broadcast the fake RARP.
-
- * VHOST_USER_NET_SET_MTU
-
-      Id: 20
-      Equivalent ioctl: N/A
-      Master payload: u64
-
-      Set host MTU value exposed to the guest.
-      This request should be sent only when VIRTIO_NET_F_MTU feature has been
-      successfully negotiated, VHOST_USER_F_PROTOCOL_FEATURES is present in
-      VHOST_USER_GET_FEATURES and protocol feature bit
-      VHOST_USER_PROTOCOL_F_NET_MTU is present in
-      VHOST_USER_GET_PROTOCOL_FEATURES.
-      If VHOST_USER_PROTOCOL_F_REPLY_ACK is negotiated, slave must respond
-      with zero in case the specified MTU is valid, or non-zero otherwise.
-
- * VHOST_USER_SET_SLAVE_REQ_FD
-
-      Id: 21
-      Equivalent ioctl: N/A
-      Master payload: N/A
-
-      Set the socket file descriptor for slave initiated requests. It is passed
-      in the ancillary data.
-      This request should be sent only when VHOST_USER_F_PROTOCOL_FEATURES
-      has been negotiated, and protocol feature bit VHOST_USER_PROTOCOL_F_SLAVE_REQ
-      bit is present in VHOST_USER_GET_PROTOCOL_FEATURES.
-      If VHOST_USER_PROTOCOL_F_REPLY_ACK is negotiated, slave must respond
-      with zero for success, non-zero otherwise.
-
- * VHOST_USER_IOTLB_MSG
-
-      Id: 22
-      Equivalent ioctl: N/A (equivalent to VHOST_IOTLB_MSG message type)
-      Master payload: struct vhost_iotlb_msg
-      Slave payload: u64
-
-      Send IOTLB messages with struct vhost_iotlb_msg as payload.
-      Master sends such requests to update and invalidate entries in the device
-      IOTLB. The slave has to acknowledge the request with sending zero as u64
-      payload for success, non-zero otherwise.
-      This request should be sent only when VIRTIO_F_IOMMU_PLATFORM feature
-      has been successfully negotiated.
-
- * VHOST_USER_SET_VRING_ENDIAN
-
-      Id: 23
-      Equivalent ioctl: VHOST_SET_VRING_ENDIAN
-      Master payload: vring state description
-
-      Set the endianness of a VQ for legacy devices. Little-endian is indicated
-      with state.num set to 0 and big-endian is indicated with state.num set
-      to 1. Other values are invalid.
-      This request should be sent only when VHOST_USER_PROTOCOL_F_CROSS_ENDIAN
-      has been negotiated.
-      Backends that negotiated this feature should handle both endiannesses
-      and expect this message once (per VQ) during device configuration
-      (ie. before the master starts the VQ).
-
- * VHOST_USER_GET_CONFIG
-
-      Id: 24
-      Equivalent ioctl: N/A
-      Master payload: virtio device config space
-      Slave payload: virtio device config space
-
-      When VHOST_USER_PROTOCOL_F_CONFIG is negotiated, this message is
-      submitted by the vhost-user master to fetch the contents of the virtio
-      device configuration space, vhost-user slave's payload size MUST match
-      master's request, vhost-user slave uses zero length of payload to
-      indicate an error to vhost-user master. The vhost-user master may
-      cache the contents to avoid repeated VHOST_USER_GET_CONFIG calls.
-
-* VHOST_USER_SET_CONFIG
-
-      Id: 25
-      Equivalent ioctl: N/A
-      Master payload: virtio device config space
-      Slave payload: N/A
-
-      When VHOST_USER_PROTOCOL_F_CONFIG is negotiated, this message is
-      submitted by the vhost-user master when the Guest changes the virtio
-      device configuration space and also can be used for live migration
-      on the destination host. The vhost-user slave must check the flags
-      field, and slaves MUST NOT accept SET_CONFIG for read-only
-      configuration space fields unless the live migration bit is set.
-
-* VHOST_USER_CREATE_CRYPTO_SESSION
-
-     Id: 26
-     Equivalent ioctl: N/A
-     Master payload: crypto session description
-     Slave payload: crypto session description
-
-     Create a session for crypto operation. The server side must return the
-     session id, 0 or positive for success, negative for failure.
-     This request should be sent only when VHOST_USER_PROTOCOL_F_CRYPTO_SESSION
-     feature has been successfully negotiated.
-     It's a required feature for crypto devices.
-
-* VHOST_USER_CLOSE_CRYPTO_SESSION
-
-     Id: 27
-     Equivalent ioctl: N/A
-     Master payload: u64
-
-     Close a session for crypto operation which was previously
-     created by VHOST_USER_CREATE_CRYPTO_SESSION.
-     This request should be sent only when VHOST_USER_PROTOCOL_F_CRYPTO_SESSION
-     feature has been successfully negotiated.
-     It's a required feature for crypto devices.
-
- * VHOST_USER_POSTCOPY_ADVISE
-      Id: 28
-      Master payload: N/A
-      Slave payload: userfault fd
-
-      When VHOST_USER_PROTOCOL_F_PAGEFAULT is supported, the
-      master advises slave that a migration with postcopy enabled is underway,
-      the slave must open a userfaultfd for later use.
-      Note that at this stage the migration is still in precopy mode.
-
- * VHOST_USER_POSTCOPY_LISTEN
-      Id: 29
-      Master payload: N/A
-
-      Master advises slave that a transition to postcopy mode has happened.
-      The slave must ensure that shared memory is registered with userfaultfd
-      to cause faulting of non-present pages.
-
-      This is always sent sometime after a VHOST_USER_POSTCOPY_ADVISE, and
-      thus only when VHOST_USER_PROTOCOL_F_PAGEFAULT is supported.
-
- * VHOST_USER_POSTCOPY_END
-      Id: 30
-      Slave payload: u64
-
-      Master advises that postcopy migration has now completed.  The
-      slave must disable the userfaultfd. The response is an acknowledgement
-      only.
-      When VHOST_USER_PROTOCOL_F_PAGEFAULT is supported, this message
-      is sent at the end of the migration, after VHOST_USER_POSTCOPY_LISTEN
-      was previously sent.
-      The value returned is an error indication; 0 is success.
-
- * VHOST_USER_GET_INFLIGHT_FD
-      Id: 31
-      Equivalent ioctl: N/A
-      Master payload: inflight description
-
-      When VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD protocol feature has been
-      successfully negotiated, this message is submitted by master to get
-      a shared buffer from slave. The shared buffer will be used to track
-      inflight I/O by slave. QEMU should retrieve a new one when the VM is reset.
-
- * VHOST_USER_SET_INFLIGHT_FD
-      Id: 32
-      Equivalent ioctl: N/A
-      Master payload: inflight description
-
-      When VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD protocol feature has been
-      successfully negotiated, this message is submitted by master to send
-      the shared inflight buffer back to slave so that slave could get
-      inflight I/O after a crash or restart.
-
-Slave message types
--------------------
-
- * VHOST_USER_SLAVE_IOTLB_MSG
-
-      Id: 1
-      Equivalent ioctl: N/A (equivalent to VHOST_IOTLB_MSG message type)
-      Slave payload: struct vhost_iotlb_msg
-      Master payload: N/A
-
-      Send IOTLB messages with struct vhost_iotlb_msg as payload.
-      Slave sends such requests to notify of an IOTLB miss, or an IOTLB
-      access failure. If VHOST_USER_PROTOCOL_F_REPLY_ACK is negotiated,
-      and slave set the VHOST_USER_NEED_REPLY flag, master must respond with
-      zero when operation is successfully completed, or non-zero otherwise.
-      This request should be sent only when VIRTIO_F_IOMMU_PLATFORM feature
-      has been successfully negotiated.
-
-* VHOST_USER_SLAVE_CONFIG_CHANGE_MSG
-
-     Id: 2
-     Equivalent ioctl: N/A
-     Slave payload: N/A
-     Master payload: N/A
-
-     When VHOST_USER_PROTOCOL_F_CONFIG is negotiated, vhost-user slave sends
-     such messages to notify that the virtio device's configuration space has
-     changed, for those host devices which can support such feature, host
-     driver can send VHOST_USER_GET_CONFIG message to slave to get the latest
-     content. If VHOST_USER_PROTOCOL_F_REPLY_ACK is negotiated, and slave set
-     the VHOST_USER_NEED_REPLY flag, master must respond with zero when
-     operation is successfully completed, or non-zero otherwise.
-
- * VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG
-
-      Id: 3
-      Equivalent ioctl: N/A
-      Slave payload: vring area description
-      Master payload: N/A
-
-      Sets host notifier for a specified queue. The queue index is contained
-      in the u64 field of the vring area description. The host notifier is
-      described by the file descriptor (typically it's a VFIO device fd) which
-      is passed as ancillary data and the size (which is mmap size and should
-      be the same as host page size) and offset (which is mmap offset) carried
-      in the vring area description. QEMU can mmap the file descriptor based
-      on the size and offset to get a memory range. Registering a host notifier
-      means mapping this memory range to the VM as the specified queue's notify
-      MMIO region. Slave sends this request to tell QEMU to de-register the
-      existing notifier if any and register the new notifier if the request is
-      sent with a file descriptor.
-      This request should be sent only when VHOST_USER_PROTOCOL_F_HOST_NOTIFIER
-      protocol feature has been successfully negotiated.
-
-VHOST_USER_PROTOCOL_F_REPLY_ACK:
--------------------------------
-The original vhost-user specification only demands replies for certain
-commands. This differs from the vhost protocol implementation where commands
-are sent over an ioctl() call and block until the client has completed.
-
-With this protocol extension negotiated, the sender (QEMU) can set the
-"need_reply" [Bit 3] flag to any command. This indicates that
-the client MUST respond with a Payload VhostUserMsg indicating success or
-failure. The payload should be set to zero on success or non-zero on failure,
-unless the message already has an explicit reply body.
-
-The response payload gives QEMU a deterministic indication of the result
-of the command. Today, QEMU is expected to terminate the main vhost-user
-loop upon receiving such errors. In future, qemu could be taught to be more
-resilient for selective requests.
-
-For the message types that already solicit a reply from the client, the
-presence of VHOST_USER_PROTOCOL_F_REPLY_ACK or need_reply bit being set brings
-no behavioural change. (See the 'Communication' section for details.)
-
-Backend program conventions
----------------------------
-
-vhost-user backends can provide various devices & services and may
-need to be configured manually depending on the use case. However, it
-is a good idea to follow the conventions listed here when
-possible. Users, QEMU or libvirt, can then rely on some common
-behaviour to avoid heterogeneous configuration and management of the
-backend programs and facilitate interoperability.
-
-Each backend installed on a host system should come with at least one
-JSON file that conforms to the vhost-user.json schema. Each file
-informs the management applications about the backend type, and binary
-location. In addition, it defines rules for management apps for
-picking the highest priority backend when multiple match the search
-criteria (see @VhostUserBackend documentation in the schema file).
-
-If the backend is not capable of enabling a requested feature on the
-host (such as 3D acceleration with virgl), or the initialization
-failed, the backend should fail to start early and exit with a status
-!= 0. It may also print a message to stderr for further details.
-
-The backend program must not daemonize itself, but it may be
-daemonized by the management layer. It may also have a restricted
-access to the system.
-
-File descriptors 0, 1 and 2 will exist, and have regular
-stdin/stdout/stderr usage (they may have been redirected to /dev/null
-by the management layer, or to a log handler).
-
-The backend program must end (as quickly and cleanly as possible) when
-the SIGTERM signal is received. Eventually, it may receive SIGKILL by
-the management layer after a few seconds.
-
-The following command line options have an expected behaviour. They
-are mandatory, unless explicitly said differently:
-
-* --socket-path=PATH
-
-This option specifies the location of the vhost-user Unix domain socket.
-It is incompatible with --fd.
-
-* --fd=FDNUM
-
-When this argument is given, the backend program is started with the
-vhost-user socket as file descriptor FDNUM. It is incompatible with
---socket-path.
-
-* --print-capabilities
-
-Output to stdout the backend capabilities in JSON format, and then
-exit successfully. Other options and arguments should be ignored, and
-the backend program should not perform its normal function.  The
-capabilities can be reported dynamically depending on the host
-capabilities.
-
-The JSON output is described in the vhost-user.json schema, by
-@VHostUserBackendCapabilities.  Example:
-{
-  "type": "foo",
-  "features": [
-    "feature-a",
-    "feature-b"
-  ]
-}
-
-vhost-user-input
-----------------
-
-Command line options:
-
-* --evdev-path=PATH (optional)
-
-Specify the linux input device.
-
-* --no-grab (optional)
-
-Do not request exclusive access to the input device.
-
-vhost-user-gpu
---------------
-
-Command line options:
-
-* --render-node=PATH (optional)
-
-Specify the GPU DRM render node.
-
-* --virgl (optional)
-
-Enable virgl rendering support.
diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c
index 88e4ae1bcd..613406d09b 100644
--- a/hw/acpi/pcihp.c
+++ b/hw/acpi/pcihp.c
@@ -37,14 +37,7 @@
 #include "hw/pci/pci_bus.h"
 #include "qapi/error.h"
 #include "qom/qom-qobject.h"
-
-//#define DEBUG
-
-#ifdef DEBUG
-# define ACPI_PCIHP_DPRINTF(format, ...)     printf(format, ## __VA_ARGS__)
-#else
-# define ACPI_PCIHP_DPRINTF(format, ...)     do { } while (0)
-#endif
+#include "trace.h"
 
 #define ACPI_PCIHP_ADDR 0xae00
 #define ACPI_PCIHP_SIZE 0x0014
@@ -159,6 +152,8 @@ static void acpi_pcihp_eject_slot(AcpiPciHpState *s, unsigned bsel, unsigned slo
     int slot = ctz32(slots);
     PCIBus *bus = acpi_pcihp_find_hotplug_bus(s, bsel);
 
+    trace_acpi_pci_eject_slot(bsel, slot);
+
     if (!bus) {
         return;
     }
@@ -270,6 +265,8 @@ void acpi_pcihp_device_plug_cb(HotplugHandler *hotplug_dev, AcpiPciHpState *s,
 void acpi_pcihp_device_unplug_cb(HotplugHandler *hotplug_dev, AcpiPciHpState *s,
                                  DeviceState *dev, Error **errp)
 {
+    trace_acpi_pci_unplug(acpi_pcihp_get_bsel(pci_get_bus(PCI_DEVICE(dev))),
+                          PCI_SLOT(PCI_DEVICE(dev)->devfn)); /* bsel, slot */
     object_property_set_bool(OBJECT(dev), false, "realized", NULL);
 }
 
@@ -280,6 +277,9 @@ void acpi_pcihp_device_unplug_request_cb(HotplugHandler *hotplug_dev,
     PCIDevice *pdev = PCI_DEVICE(dev);
     int slot = PCI_SLOT(pdev->devfn);
     int bsel = acpi_pcihp_get_bsel(pci_get_bus(pdev));
+
+    trace_acpi_pci_unplug_request(bsel, slot);
+
     if (bsel < 0) {
         error_setg(errp, "Unsupported bus. Bus doesn't have property '"
                    ACPI_PCIHP_PROP_BSEL "' set");
@@ -306,23 +306,23 @@ static uint64_t pci_read(void *opaque, hwaddr addr, unsigned int size)
         if (!s->legacy_piix) {
             s->acpi_pcihp_pci_status[bsel].up = 0;
         }
-        ACPI_PCIHP_DPRINTF("pci_up_read %" PRIu32 "\n", val);
+        trace_acpi_pci_up_read(val);
         break;
     case PCI_DOWN_BASE:
         val = s->acpi_pcihp_pci_status[bsel].down;
-        ACPI_PCIHP_DPRINTF("pci_down_read %" PRIu32 "\n", val);
+        trace_acpi_pci_down_read(val);
         break;
     case PCI_EJ_BASE:
         /* No feature defined yet */
-        ACPI_PCIHP_DPRINTF("pci_features_read %" PRIu32 "\n", val);
+        trace_acpi_pci_features_read(val);
         break;
     case PCI_RMV_BASE:
         val = s->acpi_pcihp_pci_status[bsel].hotplug_enable;
-        ACPI_PCIHP_DPRINTF("pci_rmv_read %" PRIu32 "\n", val);
+        trace_acpi_pci_rmv_read(val);
         break;
     case PCI_SEL_BASE:
         val = s->hotplug_select;
-        ACPI_PCIHP_DPRINTF("pci_sel_read %" PRIu32 "\n", val);
+        trace_acpi_pci_sel_read(val);
     default:
         break;
     }
@@ -340,13 +340,11 @@ static void pci_write(void *opaque, hwaddr addr, uint64_t data,
             break;
         }
         acpi_pcihp_eject_slot(s, s->hotplug_select, data);
-        ACPI_PCIHP_DPRINTF("pciej write %" HWADDR_PRIx " <== %" PRIu64 "\n",
-                      addr, data);
+        trace_acpi_pci_ej_write(addr, data);
         break;
     case PCI_SEL_BASE:
         s->hotplug_select = s->legacy_piix ? ACPI_PCIHP_BSEL_DEFAULT : data;
-        ACPI_PCIHP_DPRINTF("pcisel write %" HWADDR_PRIx " <== %" PRIu64 "\n",
-                      addr, data);
+        trace_acpi_pci_sel_write(addr, data);
     default:
         break;
     }
diff --git a/hw/acpi/piix4.c b/hw/acpi/piix4.c
index c903e65169..ec4e186cec 100644
--- a/hw/acpi/piix4.c
+++ b/hw/acpi/piix4.c
@@ -39,14 +39,7 @@
 #include "hw/acpi/acpi_dev_interface.h"
 #include "hw/xen/xen.h"
 #include "qom/cpu.h"
-
-//#define DEBUG
-
-#ifdef DEBUG
-# define PIIX4_DPRINTF(format, ...)     printf(format, ## __VA_ARGS__)
-#else
-# define PIIX4_DPRINTF(format, ...)     do { } while (0)
-#endif
+#include "trace.h"
 
 #define GPE_BASE 0xafe0
 #define GPE_LEN 4
@@ -583,7 +576,7 @@ static uint64_t gpe_readb(void *opaque, hwaddr addr, unsigned width)
     PIIX4PMState *s = opaque;
     uint32_t val = acpi_gpe_ioport_readb(&s->ar, addr);
 
-    PIIX4_DPRINTF("gpe read %" HWADDR_PRIx " == %" PRIu32 "\n", addr, val);
+    trace_piix4_gpe_readb(addr, width, val);
     return val;
 }
 
@@ -592,10 +585,9 @@ static void gpe_writeb(void *opaque, hwaddr addr, uint64_t val,
 {
     PIIX4PMState *s = opaque;
 
+    trace_piix4_gpe_writeb(addr, width, val);
     acpi_gpe_ioport_writeb(&s->ar, addr, val);
     acpi_update_sci(&s->ar, s->irq);
-
-    PIIX4_DPRINTF("gpe write %" HWADDR_PRIx " <== %" PRIu64 "\n", addr, val);
 }
 
 static const MemoryRegionOps piix4_gpe_ops = {
diff --git a/hw/acpi/trace-events b/hw/acpi/trace-events
index 6272d8a9e7..96b8273297 100644
--- a/hw/acpi/trace-events
+++ b/hw/acpi/trace-events
@@ -31,6 +31,22 @@ cpuhp_acpi_ejecting_cpu(uint32_t idx) "0x%"PRIx32
 cpuhp_acpi_write_ost_ev(uint32_t slot, uint32_t ev) "idx[0x%"PRIx32"] OST EVENT: 0x%"PRIx32
 cpuhp_acpi_write_ost_status(uint32_t slot, uint32_t st) "idx[0x%"PRIx32"] OST STATUS: 0x%"PRIx32
 
+# pcihp.c
+acpi_pci_eject_slot(unsigned bsel, unsigned slot) "bsel: %u slot: %u"
+acpi_pci_unplug(int bsel, int slot) "bsel: %d slot: %d"
+acpi_pci_unplug_request(int bsel, int slot) "bsel: %d slot: %d"
+acpi_pci_up_read(uint32_t val) "%" PRIu32
+acpi_pci_down_read(uint32_t val) "%" PRIu32
+acpi_pci_features_read(uint32_t val) "%" PRIu32
+acpi_pci_rmv_read(uint32_t val) "%" PRIu32
+acpi_pci_sel_read(uint32_t val) "%" PRIu32
+acpi_pci_ej_write(uint64_t addr, uint64_t data) "0x%" PRIx64 " <== %" PRIu64
+acpi_pci_sel_write(uint64_t addr, uint64_t data) "0x%" PRIx64 " <== %" PRIu64
+
+# piix4.c
+piix4_gpe_readb(uint64_t addr, unsigned width, uint64_t val) "addr: 0x%" PRIx64 " width: %u ==> 0x%" PRIx64
+piix4_gpe_writeb(uint64_t addr, unsigned width, uint64_t val) "addr: 0x%" PRIx64 " width: %u <== 0x%" PRIx64
+
 # tco.c
 tco_timer_reload(int ticks, int msec) "ticks=%d (%d ms)"
 tco_timer_expired(int timeouts_no, bool strap, bool no_reboot) "timeouts_no=%d no_reboot=%d/%d"
diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
index bf9c0bc2f4..e7c96d658e 100644
--- a/hw/arm/virt-acpi-build.c
+++ b/hw/arm/virt-acpi-build.c
@@ -40,6 +40,7 @@
 #include "hw/loader.h"
 #include "hw/hw.h"
 #include "hw/acpi/aml-build.h"
+#include "hw/acpi/pci.h"
 #include "hw/pci/pcie_host.h"
 #include "hw/pci/pci.h"
 #include "hw/arm/virt.h"
@@ -546,25 +547,20 @@ build_srat(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
 }
 
 static void
-build_mcfg(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms)
+build_mcfg(GArray *table_data, BIOSLinker *linker, AcpiMcfgInfo *info)
 {
     AcpiTableMcfg *mcfg;
-    const MemMapEntry *memmap = vms->memmap;
-    int ecam_id = VIRT_ECAM_ID(vms->highmem_ecam);
     int len = sizeof(*mcfg) + sizeof(mcfg->allocation[0]);
-    int mcfg_start = table_data->len;
 
     mcfg = acpi_data_push(table_data, len);
-    mcfg->allocation[0].address = cpu_to_le64(memmap[ecam_id].base);
+    mcfg->allocation[0].address = cpu_to_le64(info->base);
 
     /* Only a single allocation so no need to play with segments */
     mcfg->allocation[0].pci_segment = cpu_to_le16(0);
     mcfg->allocation[0].start_bus_number = 0;
-    mcfg->allocation[0].end_bus_number =
-        PCIE_MMCFG_BUS(memmap[ecam_id].size - 1);
+    mcfg->allocation[0].end_bus_number = PCIE_MMCFG_BUS(info->size - 1);
 
-    build_header(linker, table_data, (void *)(table_data->data + mcfg_start),
-                 "MCFG", table_data->len - mcfg_start, 1, NULL, NULL);
+    build_header(linker, table_data, (void *)mcfg, "MCFG", len, 1, NULL, NULL);
 }
 
 /* GTDT */
@@ -803,7 +799,13 @@ void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables)
     build_gtdt(tables_blob, tables->linker, vms);
 
     acpi_add_table(table_offsets, tables_blob);
-    build_mcfg(tables_blob, tables->linker, vms);
+    {
+        AcpiMcfgInfo mcfg = {
+           .base = vms->memmap[VIRT_ECAM_ID(vms->highmem_ecam)].base,
+           .size = vms->memmap[VIRT_ECAM_ID(vms->highmem_ecam)].size,
+        };
+        build_mcfg(tables_blob, tables->linker, &mcfg);
+    }
 
     acpi_add_table(table_offsets, tables_blob);
     build_spcr(tables_blob, tables->linker, vms);
diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index 28b81368f7..9cb61336a6 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -103,7 +103,7 @@ const VhostDevConfigOps blk_ops = {
     .vhost_dev_config_notifier = vhost_user_blk_handle_config_change,
 };
 
-static void vhost_user_blk_start(VirtIODevice *vdev)
+static int vhost_user_blk_start(VirtIODevice *vdev)
 {
     VHostUserBlk *s = VHOST_USER_BLK(vdev);
     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
@@ -112,13 +112,13 @@ static void vhost_user_blk_start(VirtIODevice *vdev)
 
     if (!k->set_guest_notifiers) {
         error_report("binding does not support guest notifiers");
-        return;
+        return -ENOSYS;
     }
 
     ret = vhost_dev_enable_notifiers(&s->dev, vdev);
     if (ret < 0) {
         error_report("Error enabling host notifiers: %d", -ret);
-        return;
+        return ret;
     }
 
     ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, true);
@@ -157,12 +157,13 @@ static void vhost_user_blk_start(VirtIODevice *vdev)
         vhost_virtqueue_mask(&s->dev, vdev, i, false);
     }
 
-    return;
+    return ret;
 
 err_guest_notifiers:
     k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false);
 err_host_notifiers:
     vhost_dev_disable_notifiers(&s->dev, vdev);
+    return ret;
 }
 
 static void vhost_user_blk_stop(VirtIODevice *vdev)
@@ -190,18 +191,28 @@ static void vhost_user_blk_stop(VirtIODevice *vdev)
 static void vhost_user_blk_set_status(VirtIODevice *vdev, uint8_t status)
 {
     VHostUserBlk *s = VHOST_USER_BLK(vdev);
-    bool should_start = status & VIRTIO_CONFIG_S_DRIVER_OK;
+    bool should_start = vdev->started;
+    int ret;
 
     if (!vdev->vm_running) {
         should_start = false;
     }
 
+    if (!s->connected) {
+        return;
+    }
+
     if (s->dev.started == should_start) {
         return;
     }
 
     if (should_start) {
-        vhost_user_blk_start(vdev);
+        ret = vhost_user_blk_start(vdev);
+        if (ret < 0) {
+            error_report("vhost-user-blk: vhost start failed: %s",
+                         strerror(-ret));
+            qemu_chr_fe_disconnect(&s->chardev);
+        }
     } else {
         vhost_user_blk_stop(vdev);
     }
@@ -237,10 +248,13 @@ static uint64_t vhost_user_blk_get_features(VirtIODevice *vdev,
 static void vhost_user_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
 {
     VHostUserBlk *s = VHOST_USER_BLK(vdev);
-    int i;
+    int i, ret;
 
-    if (!(virtio_host_has_feature(vdev, VIRTIO_F_VERSION_1) &&
-        !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1))) {
+    if (!vdev->start_on_kick) {
+        return;
+    }
+
+    if (!s->connected) {
         return;
     }
 
@@ -251,7 +265,13 @@ static void vhost_user_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
     /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start
      * vhost here instead of waiting for .set_status().
      */
-    vhost_user_blk_start(vdev);
+    ret = vhost_user_blk_start(vdev);
+    if (ret < 0) {
+        error_report("vhost-user-blk: vhost start failed: %s",
+                     strerror(-ret));
+        qemu_chr_fe_disconnect(&s->chardev);
+        return;
+    }
 
     /* Kick right away to begin processing requests already in vring */
     for (i = 0; i < s->dev.nvqs; i++) {
@@ -271,11 +291,103 @@ static void vhost_user_blk_reset(VirtIODevice *vdev)
     vhost_dev_free_inflight(s->inflight);
 }
 
+static int vhost_user_blk_connect(DeviceState *dev)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VHostUserBlk *s = VHOST_USER_BLK(vdev);
+    int ret = 0;
+    /* Set up vhost for a new backend connection; returns 0 or < 0. */
+    if (s->connected) {
+        return 0;
+    }
+    s->connected = true;
+
+    s->dev.nvqs = s->num_queues;
+    s->dev.vqs = s->vqs;
+    s->dev.vq_index = 0;
+    s->dev.backend_features = 0;
+
+    vhost_dev_set_config_notifier(&s->dev, &blk_ops);
+
+    ret = vhost_dev_init(&s->dev, &s->vhost_user, VHOST_BACKEND_TYPE_USER, 0);
+    if (ret < 0) {
+        error_report("vhost-user-blk: vhost initialization failed: %s",
+                     strerror(-ret));
+        return ret; /* NOTE: s->connected is still true here */
+    }
+
+    /* The virtio device is already marked started: start vhost as well. */
+    if (vdev->started) {
+        ret = vhost_user_blk_start(vdev);
+        if (ret < 0) {
+            error_report("vhost-user-blk: vhost start failed: %s",
+                         strerror(-ret));
+            return ret; /* s->connected stays true here as well */
+        }
+    }
+
+    return 0;
+}
+
+static void vhost_user_blk_disconnect(DeviceState *dev)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VHostUserBlk *s = VHOST_USER_BLK(vdev);
+    /* Tear down vhost state for a lost connection; no-op if not connected. */
+    if (!s->connected) {
+        return;
+    }
+    s->connected = false;
+
+    if (s->dev.started) { /* stop vhost first if it is running */
+        vhost_user_blk_stop(vdev);
+    }
+
+    vhost_dev_cleanup(&s->dev);
+}
+
+static gboolean vhost_user_blk_watch(GIOChannel *chan, GIOCondition cond,
+                                     void *opaque)
+{
+    DeviceState *dev = opaque;
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VHostUserBlk *s = VHOST_USER_BLK(vdev);
+    /* Registered for G_IO_HUP by vhost_user_blk_event(): drop the chardev. */
+    qemu_chr_fe_disconnect(&s->chardev);
+
+    return true; /* NOTE(review): presumably keeps the watch alive; confirm */
+}
+
+static void vhost_user_blk_event(void *opaque, int event)
+{
+    DeviceState *dev = opaque;
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VHostUserBlk *s = VHOST_USER_BLK(vdev);
+    /* Chardev event handler: connect on OPENED, tear down on CLOSED. */
+    switch (event) {
+    case CHR_EVENT_OPENED:
+        if (vhost_user_blk_connect(dev) < 0) {
+            qemu_chr_fe_disconnect(&s->chardev); /* connect failed */
+            return;
+        }
+        s->watch = qemu_chr_fe_add_watch(&s->chardev, G_IO_HUP,
+                                         vhost_user_blk_watch, dev);
+        break;
+    case CHR_EVENT_CLOSED:
+        vhost_user_blk_disconnect(dev);
+        if (s->watch) { /* 0 means no watch is installed */
+            g_source_remove(s->watch);
+            s->watch = 0;
+        }
+        break;
+    }
+}
+
 static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
 {
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
     VHostUserBlk *s = VHOST_USER_BLK(vdev);
-    struct vhost_virtqueue *vqs = NULL;
+    Error *err = NULL;
     int i, ret;
 
     if (!s->chardev.chr) {
@@ -306,27 +418,29 @@ static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
     }
 
     s->inflight = g_new0(struct vhost_inflight, 1);
+    s->vqs = g_new(struct vhost_virtqueue, s->num_queues);
+    s->watch = 0;
+    s->connected = false;
 
-    s->dev.nvqs = s->num_queues;
-    s->dev.vqs = g_new(struct vhost_virtqueue, s->dev.nvqs);
-    s->dev.vq_index = 0;
-    s->dev.backend_features = 0;
-    vqs = s->dev.vqs;
-
-    vhost_dev_set_config_notifier(&s->dev, &blk_ops);
+    qemu_chr_fe_set_handlers(&s->chardev,  NULL, NULL, vhost_user_blk_event,
+                             NULL, (void *)dev, NULL, true);
 
-    ret = vhost_dev_init(&s->dev, &s->vhost_user, VHOST_BACKEND_TYPE_USER, 0);
-    if (ret < 0) {
-        error_setg(errp, "vhost-user-blk: vhost initialization failed: %s",
-                   strerror(-ret));
+reconnect:
+    if (qemu_chr_fe_wait_connected(&s->chardev, &err) < 0) {
+        error_report_err(err);
         goto virtio_err;
     }
 
+    /* check whether vhost_user_blk_connect() failed or not */
+    if (!s->connected) {
+        goto reconnect;
+    }
+
     ret = vhost_dev_get_config(&s->dev, (uint8_t *)&s->blkcfg,
-                              sizeof(struct virtio_blk_config));
+                               sizeof(struct virtio_blk_config));
     if (ret < 0) {
-        error_setg(errp, "vhost-user-blk: get block config failed");
-        goto vhost_err;
+        error_report("vhost-user-blk: get block config failed");
+        goto reconnect;
     }
 
     if (s->blkcfg.num_queues != s->num_queues) {
@@ -335,10 +449,8 @@ static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
 
     return;
 
-vhost_err:
-    vhost_dev_cleanup(&s->dev);
 virtio_err:
-    g_free(vqs);
+    g_free(s->vqs);
     g_free(s->inflight);
     virtio_cleanup(vdev);
     vhost_user_cleanup(&s->vhost_user);
@@ -348,12 +460,13 @@ static void vhost_user_blk_device_unrealize(DeviceState *dev, Error **errp)
 {
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
     VHostUserBlk *s = VHOST_USER_BLK(dev);
-    struct vhost_virtqueue *vqs = s->dev.vqs;
 
-    vhost_user_blk_set_status(vdev, 0);
+    virtio_set_status(vdev, 0);
+    qemu_chr_fe_set_handlers(&s->chardev,  NULL, NULL, NULL,
+                             NULL, NULL, NULL, false);
     vhost_dev_cleanup(&s->dev);
     vhost_dev_free_inflight(s->inflight);
-    g_free(vqs);
+    g_free(s->vqs);
     g_free(s->inflight);
     virtio_cleanup(vdev);
     vhost_user_cleanup(&s->vhost_user);
diff --git a/hw/core/machine.c b/hw/core/machine.c
index 5d046a43e3..934c1bcceb 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -102,9 +102,26 @@ const size_t hw_compat_2_7_len = G_N_ELEMENTS(hw_compat_2_7);
 
 GlobalProperty hw_compat_2_6[] = {
     { "virtio-mmio", "format_transport_address", "off" },
-    /* Optional because not all virtio-pci devices support legacy mode */
-    { "virtio-pci", "disable-modern", "on",  .optional = true },
-    { "virtio-pci", "disable-legacy", "off", .optional = true },
+    /*
+     * don't include devices which are modern-only
+     * ie keyboard, mouse, tablet, gpu, vga & crypto
+     */
+    { "virtio-9p-pci", "disable-modern", "on" },
+    { "virtio-9p-pci", "disable-legacy", "off" },
+    { "virtio-balloon-pci", "disable-modern", "on" },
+    { "virtio-balloon-pci", "disable-legacy", "off" },
+    { "virtio-blk-pci", "disable-modern", "on" },
+    { "virtio-blk-pci", "disable-legacy", "off" },
+    { "virtio-input-host-pci", "disable-modern", "on" },
+    { "virtio-input-host-pci", "disable-legacy", "off" },
+    { "virtio-net-pci", "disable-modern", "on" },
+    { "virtio-net-pci", "disable-legacy", "off" },
+    { "virtio-rng-pci", "disable-modern", "on" },
+    { "virtio-rng-pci", "disable-legacy", "off" },
+    { "virtio-scsi-pci", "disable-modern", "on" },
+    { "virtio-scsi-pci", "disable-legacy", "off" },
+    { "virtio-serial-pci", "disable-modern", "on" },
+    { "virtio-serial-pci", "disable-legacy", "off" },
 };
 const size_t hw_compat_2_6_len = G_N_ELEMENTS(hw_compat_2_6);
 
diff --git a/hw/display/virtio-gpu-pci.c b/hw/display/virtio-gpu-pci.c
index bdcd33c925..0bc4d9d424 100644
--- a/hw/display/virtio-gpu-pci.c
+++ b/hw/display/virtio-gpu-pci.c
@@ -47,7 +47,9 @@ static void virtio_gpu_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
     Error *local_error = NULL;
 
     qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
-    virtio_pci_force_virtio_1(vpci_dev);
+    if (!virtio_pci_force_virtio_1(vpci_dev, errp)) {
+        return;
+    }
     object_property_set_bool(OBJECT(vdev), true, "realized", &local_error);
 
     if (local_error) {
diff --git a/hw/display/virtio-vga.c b/hw/display/virtio-vga.c
index a2b803b75f..5d57bf5b0c 100644
--- a/hw/display/virtio-vga.c
+++ b/hw/display/virtio-vga.c
@@ -154,7 +154,9 @@ static void virtio_vga_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
 
     /* init virtio bits */
     qdev_set_parent_bus(DEVICE(g), BUS(&vpci_dev->bus));
-    virtio_pci_force_virtio_1(vpci_dev);
+    if (!virtio_pci_force_virtio_1(vpci_dev, errp)) {
+        return;
+    }
     object_property_set_bool(OBJECT(g), true, "realized", &err);
     if (err) {
         error_propagate(errp, err);
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index b4ec14e349..0d78d73894 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -59,6 +59,7 @@
 #include "hw/i386/x86-iommu.h"
 
 #include "hw/acpi/aml-build.h"
+#include "hw/acpi/pci.h"
 
 #include "qom/qom-qobject.h"
 #include "hw/i386/amd_iommu.h"
@@ -87,11 +88,6 @@
 /* Default IOAPIC ID */
 #define ACPI_BUILD_IOAPIC_ID 0x0
 
-typedef struct AcpiMcfgInfo {
-    uint64_t mcfg_base;
-    uint32_t mcfg_size;
-} AcpiMcfgInfo;
-
 typedef struct AcpiPmInfo {
     bool s3_disabled;
     bool s4_disabled;
@@ -2413,29 +2409,16 @@ static void
 build_mcfg_q35(GArray *table_data, BIOSLinker *linker, AcpiMcfgInfo *info)
 {
     AcpiTableMcfg *mcfg;
-    const char *sig;
     int len = sizeof(*mcfg) + 1 * sizeof(mcfg->allocation[0]);
 
     mcfg = acpi_data_push(table_data, len);
-    mcfg->allocation[0].address = cpu_to_le64(info->mcfg_base);
+    mcfg->allocation[0].address = cpu_to_le64(info->base);
     /* Only a single allocation so no need to play with segments */
     mcfg->allocation[0].pci_segment = cpu_to_le16(0);
     mcfg->allocation[0].start_bus_number = 0;
-    mcfg->allocation[0].end_bus_number = PCIE_MMCFG_BUS(info->mcfg_size - 1);
+    mcfg->allocation[0].end_bus_number = PCIE_MMCFG_BUS(info->size - 1);
 
-    /* MCFG is used for ECAM which can be enabled or disabled by guest.
-     * To avoid table size changes (which create migration issues),
-     * always create the table even if there are no allocations,
-     * but set the signature to a reserved value in this case.
-     * ACPI spec requires OSPMs to ignore such tables.
-     */
-    if (info->mcfg_base == PCIE_BASE_ADDR_UNMAPPED) {
-        /* Reserved signature: ignored by OSPM */
-        sig = "QEMU";
-    } else {
-        sig = "MCFG";
-    }
-    build_header(linker, table_data, (void *)mcfg, sig, len, 1, NULL, NULL);
+    build_header(linker, table_data, (void *)mcfg, "MCFG", len, 1, NULL, NULL);
 }
 
 /*
@@ -2602,12 +2585,15 @@ static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg)
     if (!o) {
         return false;
     }
-    mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o));
+    mcfg->base = qnum_get_uint(qobject_to(QNum, o));
     qobject_unref(o);
+    if (mcfg->base == PCIE_BASE_ADDR_UNMAPPED) {
+        return false;
+    }
 
     o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_SIZE, NULL);
     assert(o);
-    mcfg->mcfg_size = qnum_get_uint(qobject_to(QNum, o));
+    mcfg->size = qnum_get_uint(qobject_to(QNum, o));
     qobject_unref(o);
     return true;
 }
diff --git a/hw/pci-bridge/pci_expander_bridge.c b/hw/pci-bridge/pci_expander_bridge.c
index e62de4218f..ca66bc721a 100644
--- a/hw/pci-bridge/pci_expander_bridge.c
+++ b/hw/pci-bridge/pci_expander_bridge.c
@@ -66,11 +66,6 @@ static int pxb_bus_num(PCIBus *bus)
     return pxb->bus_nr;
 }
 
-static bool pxb_is_root(PCIBus *bus)
-{
-    return true; /* by definition */
-}
-
 static uint16_t pxb_bus_numa_node(PCIBus *bus)
 {
     PXBDev *pxb = convert_to_pxb(bus->parent_dev);
@@ -83,7 +78,6 @@ static void pxb_bus_class_init(ObjectClass *class, void *data)
     PCIBusClass *pbc = PCI_BUS_CLASS(class);
 
     pbc->bus_num = pxb_bus_num;
-    pbc->is_root = pxb_is_root;
     pbc->numa_node = pxb_bus_numa_node;
 }
 
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index a78023f669..b386777045 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -129,14 +129,9 @@ static void pci_bus_unrealize(BusState *qbus, Error **errp)
     vmstate_unregister(NULL, &vmstate_pcibus, bus);
 }
 
-static bool pcibus_is_root(PCIBus *bus)
-{
-    return !bus->parent_dev;
-}
-
 static int pcibus_num(PCIBus *bus)
 {
-    if (pcibus_is_root(bus)) {
+    if (pci_bus_is_root(bus)) {
         return 0; /* pci host bridge */
     }
     return bus->parent_dev->config[PCI_SECONDARY_BUS];
@@ -164,7 +159,6 @@ static void pci_bus_class_init(ObjectClass *klass, void *data)
     k->unrealize = pci_bus_unrealize;
     k->reset = pcibus_reset;
 
-    pbc->is_root = pcibus_is_root;
     pbc->bus_num = pcibus_num;
     pbc->numa_node = pcibus_numa_node;
     pbc->allows_extended_config_space = pcibus_allows_extended_config_space;
@@ -398,6 +392,7 @@ static void pci_root_bus_init(PCIBus *bus, DeviceState *parent,
     bus->slot_reserved_mask = 0x0;
     bus->address_space_mem = address_space_mem;
     bus->address_space_io = address_space_io;
+    bus->flags |= PCI_BUS_IS_ROOT;
 
     /* host bridge */
     QLIST_INIT(&bus->child);
@@ -415,11 +410,6 @@ bool pci_bus_is_express(PCIBus *bus)
     return object_dynamic_cast(OBJECT(bus), TYPE_PCIE_BUS);
 }
 
-bool pci_bus_is_root(PCIBus *bus)
-{
-    return PCI_BUS_GET_CLASS(bus)->is_root(bus);
-}
-
 bool pci_bus_allows_extended_config_space(PCIBus *bus)
 {
     return PCI_BUS_GET_CLASS(bus)->allows_extended_config_space(bus);
diff --git a/hw/pci/pcie_host.c b/hw/pci/pcie_host.c
index 553db56778..1ee4945a6d 100644
--- a/hw/pci/pcie_host.c
+++ b/hw/pci/pcie_host.c
@@ -47,11 +47,6 @@ static void pcie_mmcfg_data_write(void *opaque, hwaddr mmcfg_addr,
     }
     addr = PCIE_MMCFG_CONFOFFSET(mmcfg_addr);
     limit = pci_config_size(pci_dev);
-    if (limit <= addr) {
-        /* conventional pci device can be behind pcie-to-pci bridge.
-           256 <= addr < 4K has no effects. */
-        return;
-    }
     pci_host_config_write_common(pci_dev, addr, limit, val, len);
 }
 
@@ -70,11 +65,6 @@ static uint64_t pcie_mmcfg_data_read(void *opaque,
     }
     addr = PCIE_MMCFG_CONFOFFSET(mmcfg_addr);
     limit = pci_config_size(pci_dev);
-    if (limit <= addr) {
-        /* conventional pci device can be behind pcie-to-pci bridge.
-           256 <= addr < 4K has no effects. */
-        return ~0x0;
-    }
     return pci_host_config_read_common(pci_dev, addr, limit, len);
 }
 
diff --git a/hw/virtio/virtio-crypto-pci.c b/hw/virtio/virtio-crypto-pci.c
index 90a6e0dc2e..13807e538b 100644
--- a/hw/virtio/virtio-crypto-pci.c
+++ b/hw/virtio/virtio-crypto-pci.c
@@ -51,7 +51,9 @@ static void virtio_crypto_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
     }
 
     qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
-    virtio_pci_force_virtio_1(vpci_dev);
+    if (!virtio_pci_force_virtio_1(vpci_dev, errp)) {
+        return;
+    }
     object_property_set_bool(OBJECT(vdev), true, "realized", errp);
     object_property_set_link(OBJECT(vcrypto),
                  OBJECT(vcrypto->vdev.conf.cryptodev), "cryptodev",
diff --git a/hw/virtio/virtio-input-pci.c b/hw/virtio/virtio-input-pci.c
index 2c1397842b..28477729a3 100644
--- a/hw/virtio/virtio-input-pci.c
+++ b/hw/virtio/virtio-input-pci.c
@@ -48,7 +48,9 @@ static void virtio_input_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
     DeviceState *vdev = DEVICE(&vinput->vdev);
 
     qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
-    virtio_pci_force_virtio_1(vpci_dev);
+    if (!virtio_pci_force_virtio_1(vpci_dev, errp)) {
+        return;
+    }
     object_property_set_bool(OBJECT(vdev), true, "realized", errp);
 }
 
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index cb44e19b67..9056cdfa3c 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -20,6 +20,7 @@
 #include "standard-headers/linux/virtio_pci.h"
 #include "hw/virtio/virtio.h"
 #include "hw/pci/pci.h"
+#include "hw/pci/pci_bus.h"
 #include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "hw/pci/msi.h"
@@ -1721,16 +1722,22 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp)
                        /* PCI BAR regions must be powers of 2 */
                        pow2ceil(proxy->notify.offset + proxy->notify.size));
 
-    if (proxy->disable_legacy == ON_OFF_AUTO_AUTO) {
-        proxy->disable_legacy = pcie_port ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
-    }
-
-    if (!virtio_pci_modern(proxy) && !virtio_pci_legacy(proxy)) {
-        error_setg(errp, "device cannot work as neither modern nor legacy mode"
-                   " is enabled");
-        error_append_hint(errp, "Set either disable-modern or disable-legacy"
-                          " to off\n");
-        return;
+    if ((proxy->disable_legacy == ON_OFF_AUTO_ON) ||
+        ((proxy->disable_legacy == ON_OFF_AUTO_AUTO) && pcie_port)) {
+        if (proxy->disable_modern) {
+            error_setg(errp, "device cannot work as neither modern nor "
+                       "legacy mode is enabled");
+            error_append_hint(errp, "Set either disable-modern or "
+                              "disable-legacy to off\n");
+            return;
+        }
+        proxy->mode = VIRTIO_PCI_MODE_MODERN;
+    } else {
+        if (proxy->disable_modern) {
+            proxy->mode = VIRTIO_PCI_MODE_LEGACY;
+        } else {
+            proxy->mode = VIRTIO_PCI_MODE_TRANSITIONAL;
+        }
     }
 
     if (pcie_port && pci_is_express(pci_dev)) {
diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h
index 18581854ca..bfea2892a5 100644
--- a/hw/virtio/virtio-pci.h
+++ b/hw/virtio/virtio-pci.h
@@ -15,6 +15,7 @@
 #ifndef QEMU_VIRTIO_PCI_H
 #define QEMU_VIRTIO_PCI_H
 
+#include "qapi/error.h"
 #include "hw/pci/msi.h"
 #include "hw/virtio/virtio-bus.h"
 
@@ -118,6 +119,12 @@ typedef struct VirtIOPCIQueue {
   uint32_t used[2];
 } VirtIOPCIQueue;
 
+typedef enum {
+    VIRTIO_PCI_MODE_LEGACY,       /* virtio_pci_legacy() only */
+    VIRTIO_PCI_MODE_TRANSITIONAL, /* both legacy and modern */
+    VIRTIO_PCI_MODE_MODERN,       /* virtio_pci_modern() only */
+} VirtIOPCIMode;
+
 struct VirtIOPCIProxy {
     PCIDevice pci_dev;
     MemoryRegion bar;
@@ -142,6 +149,7 @@ struct VirtIOPCIProxy {
     bool disable_modern;
     bool ignore_backend_features;
     OnOffAuto disable_legacy;
+    VirtIOPCIMode mode;
     uint32_t class_code;
     uint32_t nvectors;
     uint32_t dfselect;
@@ -156,23 +164,34 @@ struct VirtIOPCIProxy {
 
 static inline bool virtio_pci_modern(VirtIOPCIProxy *proxy)
 {
-    return !proxy->disable_modern;
+    return proxy->mode != VIRTIO_PCI_MODE_LEGACY;
 }
 
 static inline bool virtio_pci_legacy(VirtIOPCIProxy *proxy)
 {
-    return proxy->disable_legacy == ON_OFF_AUTO_OFF;
+    return proxy->mode != VIRTIO_PCI_MODE_MODERN;
 }
 
-static inline void virtio_pci_force_virtio_1(VirtIOPCIProxy *proxy)
+static inline bool virtio_pci_force_virtio_1(VirtIOPCIProxy *proxy,
+                                             Error **errp)
 {
-    proxy->disable_modern = false;
-    proxy->disable_legacy = ON_OFF_AUTO_ON;
+    if (proxy->disable_legacy == ON_OFF_AUTO_OFF) {
+        error_setg(errp, "Unable to set disable-legacy=off on a virtio-1.0 "
+                   "only device");
+        return false;
+    }
+    if (proxy->disable_modern == true) {
+        error_setg(errp, "Unable to set disable-modern=on on a virtio-1.0 "
+                   "only device");
+        return false;
+    }
+    proxy->mode = VIRTIO_PCI_MODE_MODERN;
+    return true;
 }
 
 static inline void virtio_pci_disable_modern(VirtIOPCIProxy *proxy)
 {
-    proxy->disable_modern = true;
+    proxy->mode = VIRTIO_PCI_MODE_LEGACY;
 }
 
 /*
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 28056a7ef7..4805727b53 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -1162,10 +1162,16 @@ int virtio_set_status(VirtIODevice *vdev, uint8_t val)
             }
         }
     }
+    vdev->started = val & VIRTIO_CONFIG_S_DRIVER_OK;
+    if (unlikely(vdev->start_on_kick && vdev->started)) {
+        vdev->start_on_kick = false;
+    }
+
     if (k->set_status) {
         k->set_status(vdev, val);
     }
     vdev->status = val;
+
     return 0;
 }
 
@@ -1208,6 +1214,9 @@ void virtio_reset(void *opaque)
         k->reset(vdev);
     }
 
+    vdev->start_on_kick = (virtio_host_has_feature(vdev, VIRTIO_F_VERSION_1) &&
+                          !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1));
+    vdev->started = false;
     vdev->broken = false;
     vdev->guest_features = 0;
     vdev->queue_sel = 0;
@@ -1518,14 +1527,21 @@ void virtio_queue_set_align(VirtIODevice *vdev, int n, int align)
 
 static bool virtio_queue_notify_aio_vq(VirtQueue *vq)
 {
+    bool ret = false;
+
     if (vq->vring.desc && vq->handle_aio_output) {
         VirtIODevice *vdev = vq->vdev;
 
         trace_virtio_queue_notify(vdev, vq - vdev->vq, vq);
-        return vq->handle_aio_output(vdev, vq);
+        ret = vq->handle_aio_output(vdev, vq);
+
+        if (unlikely(vdev->start_on_kick)) {
+            vdev->started = true;
+            vdev->start_on_kick = false;
+        }
     }
 
-    return false;
+    return ret;
 }
 
 static void virtio_queue_notify_vq(VirtQueue *vq)
@@ -1539,6 +1555,11 @@ static void virtio_queue_notify_vq(VirtQueue *vq)
 
         trace_virtio_queue_notify(vdev, vq - vdev->vq, vq);
         vq->handle_output(vdev, vq);
+
+        if (unlikely(vdev->start_on_kick)) {
+            vdev->started = true;
+            vdev->start_on_kick = false;
+        }
     }
 }
 
@@ -1556,6 +1577,11 @@ void virtio_queue_notify(VirtIODevice *vdev, int n)
     } else if (vq->handle_output) {
         vq->handle_output(vdev, vq);
     }
+
+    if (unlikely(vdev->start_on_kick)) {
+        vdev->started = true;
+        vdev->start_on_kick = false;
+    }
 }
 
 uint16_t virtio_queue_vector(VirtIODevice *vdev, int n)
@@ -1770,6 +1796,13 @@ static bool virtio_broken_needed(void *opaque)
     return vdev->broken;
 }
 
+static bool virtio_started_needed(void *opaque)
+{
+    VirtIODevice *vdev = opaque;
+
+    return vdev->started;
+}
+
 static const VMStateDescription vmstate_virtqueue = {
     .name = "virtqueue_state",
     .version_id = 1,
@@ -1898,6 +1931,17 @@ static const VMStateDescription vmstate_virtio_broken = {
     }
 };
 
+static const VMStateDescription vmstate_virtio_started = {
+    .name = "virtio/started",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = &virtio_started_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_BOOL(started, VirtIODevice),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static const VMStateDescription vmstate_virtio = {
     .name = "virtio",
     .version_id = 1,
@@ -1913,6 +1957,7 @@ static const VMStateDescription vmstate_virtio = {
         &vmstate_virtio_ringsize,
         &vmstate_virtio_broken,
         &vmstate_virtio_extra_state,
+        &vmstate_virtio_started,
         NULL
     }
 };
@@ -2246,7 +2291,7 @@ static void virtio_vmstate_change(void *opaque, int running, RunState state)
     VirtIODevice *vdev = opaque;
     BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
-    bool backend_run = running && (vdev->status & VIRTIO_CONFIG_S_DRIVER_OK);
+    bool backend_run = running && vdev->started;
     vdev->vm_running = running;
 
     if (backend_run) {
@@ -2286,6 +2331,9 @@ void virtio_init(VirtIODevice *vdev, const char *name,
             g_malloc0(sizeof(*vdev->vector_queues) * nvectors);
     }
 
+    vdev->start_on_kick = (virtio_host_has_feature(vdev, VIRTIO_F_VERSION_1) &&
+                          !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1));
+    vdev->started = false;
     vdev->device_id = device_id;
     vdev->status = 0;
     atomic_set(&vdev->isr, 0);
diff --git a/include/hw/acpi/pci.h b/include/hw/acpi/pci.h
new file mode 100644
index 0000000000..124af7d32a
--- /dev/null
+++ b/include/hw/acpi/pci.h
@@ -0,0 +1,33 @@
+/*
+ * Support for generating PCI related ACPI tables and passing them to Guests
+ *
+ * Copyright (C) 2006 Fabrice Bellard
+ * Copyright (C) 2008-2010  Kevin O'Connor <kevin@koconnor.net>
+ * Copyright (C) 2013-2019 Red Hat Inc
+ * Copyright (C) 2019 Intel Corporation
+ *
+ * Author: Wei Yang <richardw.yang@linux.intel.com>
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef HW_ACPI_PCI_H
+#define HW_ACPI_PCI_H
+
+typedef struct AcpiMcfgInfo {
+    uint64_t base;
+    uint32_t size;
+} AcpiMcfgInfo;
+
+#endif
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index fdd4c43d3a..edf44de21d 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -395,7 +395,6 @@ typedef PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin);
 #define TYPE_PCIE_BUS "PCIE"
 
 bool pci_bus_is_express(PCIBus *bus);
-bool pci_bus_is_root(PCIBus *bus);
 bool pci_bus_allows_extended_config_space(PCIBus *bus);
 
 void pci_root_bus_new_inplace(PCIBus *bus, size_t bus_size, DeviceState *parent,
diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h
index f6df834170..aea98d5040 100644
--- a/include/hw/pci/pci_bus.h
+++ b/include/hw/pci/pci_bus.h
@@ -15,14 +15,19 @@ typedef struct PCIBusClass {
     BusClass parent_class;
     /*< public >*/
 
-    bool (*is_root)(PCIBus *bus);
     int (*bus_num)(PCIBus *bus);
     uint16_t (*numa_node)(PCIBus *bus);
     bool (*allows_extended_config_space)(PCIBus *bus);
 } PCIBusClass;
 
+enum PCIBusFlags {
+    /* This bus is the root of a PCI domain */
+    PCI_BUS_IS_ROOT                                         = 0x0001,
+};
+
 struct PCIBus {
     BusState qbus;
+    enum PCIBusFlags flags;
     PCIIOMMUFunc iommu_fn;
     void *iommu_opaque;
     uint8_t devfn_min;
@@ -47,4 +52,9 @@ struct PCIBus {
     Notifier machine_done;
 };
 
+static inline bool pci_bus_is_root(PCIBus *bus)
+{
+    return !!(bus->flags & PCI_BUS_IS_ROOT);
+}
+
 #endif /* QEMU_PCI_BUS_H */
diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
index 33ed3b8dde..fa55dc10ae 100644
--- a/include/hw/qdev-core.h
+++ b/include/hw/qdev-core.h
@@ -251,8 +251,6 @@ struct PropertyInfo {
 /**
  * GlobalProperty:
  * @used: Set to true if property was used when initializing a device.
- * @optional: If set to true, GlobalProperty will be skipped without errors
- *            if the property doesn't exist.
  *
  * An error is fatal for non-hotplugged devices, when the global is applied.
  */
@@ -261,7 +259,6 @@ typedef struct GlobalProperty {
     const char *property;
     const char *value;
     bool used;
-    bool optional;
 } GlobalProperty;
 
 static inline void
diff --git a/include/hw/virtio/vhost-user-blk.h b/include/hw/virtio/vhost-user-blk.h
index 68634bee61..51457fb857 100644
--- a/include/hw/virtio/vhost-user-blk.h
+++ b/include/hw/virtio/vhost-user-blk.h
@@ -38,6 +38,9 @@ typedef struct VHostUserBlk {
     struct vhost_dev dev;
     struct vhost_inflight *inflight;
     VhostUserState vhost_user;
+    struct vhost_virtqueue *vqs;
+    guint watch;
+    bool connected;
 } VHostUserBlk;
 
 #endif
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 7140381e3a..27c0efc3d0 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -105,6 +105,8 @@ struct VirtIODevice
     uint16_t device_id;
     bool vm_running;
     bool broken; /* device in invalid state, needs reset */
+    bool started;
+    bool start_on_kick; /* virtio 1.0 transitional devices support that */
     VMChangeStateEntry *vmstate;
     char *bus_name;
     uint8_t device_endian;
diff --git a/net/vhost-user.c b/net/vhost-user.c
index 5a26a24708..51921de443 100644
--- a/net/vhost-user.c
+++ b/net/vhost-user.c
@@ -236,7 +236,6 @@ static void chr_closed_bh(void *opaque)
     s = DO_UPCAST(NetVhostUserState, nc, ncs[0]);
 
     qmp_set_link(name, false, &err);
-    vhost_user_stop(queues, ncs);
 
     qemu_chr_fe_set_handlers(&s->chr, NULL, NULL, net_vhost_user_event,
                              NULL, opaque, NULL, true);
diff --git a/qom/object.c b/qom/object.c
index d3412e7fdc..99c4fa707e 100644
--- a/qom/object.c
+++ b/qom/object.c
@@ -385,9 +385,6 @@ void object_apply_global_props(Object *obj, const GPtrArray *props, Error **errp
         if (object_dynamic_cast(obj, p->driver) == NULL) {
             continue;
         }
-        if (p->optional && !object_property_find(obj, p->property, NULL)) {
-            continue;
-        }
         p->used = true;
         object_property_parse(obj, p->value, p->property, &err);
         if (err != NULL) {
diff --git a/tests/acpi-utils.c b/tests/acpi-utils.c
index cc33b460ab..d2a202efca 100644
--- a/tests/acpi-utils.c
+++ b/tests/acpi-utils.c
@@ -51,19 +51,7 @@ uint32_t acpi_find_rsdp_address(QTestState *qts)
     return off;
 }
 
-uint64_t acpi_get_xsdt_address(uint8_t *rsdp_table)
-{
-    uint64_t xsdt_physical_address;
-    uint8_t revision = rsdp_table[15 /* Revision offset */];
-
-    /* We must have revision 2 if we're looking for an XSDT pointer */
-    g_assert(revision == 2);
-
-    memcpy(&xsdt_physical_address, &rsdp_table[24 /* XsdtAddress offset */], 8);
-    return le64_to_cpu(xsdt_physical_address);
-}
-
-void acpi_parse_rsdp_table(QTestState *qts, uint32_t addr, uint8_t *rsdp_table)
+void acpi_fetch_rsdp_table(QTestState *qts, uint64_t addr, uint8_t *rsdp_table)
 {
     uint8_t revision;
 
@@ -91,13 +79,15 @@ void acpi_parse_rsdp_table(QTestState *qts, uint32_t addr, uint8_t *rsdp_table)
  *  actual one.
  */
 void acpi_fetch_table(QTestState *qts, uint8_t **aml, uint32_t *aml_len,
-                      const uint8_t *addr_ptr, const char *sig,
+                      const uint8_t *addr_ptr, int addr_size, const char *sig,
                       bool verify_checksum)
 {
-    uint32_t addr, len;
+    uint32_t len;
+    uint64_t addr = 0;
 
-    memcpy(&addr, addr_ptr , sizeof(addr));
-    addr = le32_to_cpu(addr);
+    g_assert(addr_size == 4 || addr_size == 8);
+    memcpy(&addr, addr_ptr , addr_size);
+    addr = le64_to_cpu(addr);
     qtest_memread(qts, addr + 4, &len, 4); /* Length of ACPI table */
     *aml_len = le32_to_cpu(len);
     *aml = g_malloc0(*aml_len);
@@ -111,3 +101,47 @@ void acpi_fetch_table(QTestState *qts, uint8_t **aml, uint32_t *aml_len,
         g_assert(!acpi_calc_checksum(*aml, *aml_len));
     }
 }
+
+#define GUID_SIZE 16
+static const uint8_t AcpiTestSupportGuid[GUID_SIZE] = {
+       0xb1, 0xa6, 0x87, 0xab,
+       0x34, 0x20,
+       0xa0, 0xbd,
+       0x71, 0xbd, 0x37, 0x50, 0x07, 0x75, 0x77, 0x85 };
+
+typedef struct {
+    uint8_t signature_guid[GUID_SIZE];
+    uint64_t rsdp10;
+    uint64_t rsdp20;
+} __attribute__((packed)) UefiTestSupport;
+
+/* Wait at most 600 seconds (test is slow with TCG and --enable-debug) */
+#define TEST_DELAY (1 * G_USEC_PER_SEC / 10)
+#define TEST_CYCLES MAX((600 * G_USEC_PER_SEC / TEST_DELAY), 1)
+#define MB 0x100000ULL
+uint64_t acpi_find_rsdp_address_uefi(QTestState *qts, uint64_t start,
+                                     uint64_t size)
+{
+    int i, j;
+    uint8_t data[GUID_SIZE];
+
+    for (i = 0; i < TEST_CYCLES; ++i) {
+        for (j = 0; j < size / MB; j++) {
+            /* look for GUID at every 1MB block */
+            uint64_t addr = start + j * MB;
+
+            qtest_memread(qts, addr, data, sizeof(data));
+            if (!memcmp(AcpiTestSupportGuid, data, sizeof(data))) {
+                UefiTestSupport ret;
+
+                qtest_memread(qts, addr, &ret, sizeof(ret));
+                ret.rsdp10 = le64_to_cpu(ret.rsdp10);
+                ret.rsdp20 = le64_to_cpu(ret.rsdp20);
+                return ret.rsdp20 ? ret.rsdp20 : ret.rsdp10;
+            }
+        }
+        g_usleep(TEST_DELAY);
+    }
+    g_assert_not_reached();
+    return 0;
+}
diff --git a/tests/acpi-utils.h b/tests/acpi-utils.h
index 73fe24f044..0c86780689 100644
--- a/tests/acpi-utils.h
+++ b/tests/acpi-utils.h
@@ -46,10 +46,11 @@ typedef struct {
 
 uint8_t acpi_calc_checksum(const uint8_t *data, int len);
 uint32_t acpi_find_rsdp_address(QTestState *qts);
-uint64_t acpi_get_xsdt_address(uint8_t *rsdp_table);
-void acpi_parse_rsdp_table(QTestState *qts, uint32_t addr, uint8_t *rsdp_table);
+uint64_t acpi_find_rsdp_address_uefi(QTestState *qts, uint64_t start,
+                                     uint64_t size);
+void acpi_fetch_rsdp_table(QTestState *qts, uint64_t addr, uint8_t *rsdp_table);
 void acpi_fetch_table(QTestState *qts, uint8_t **aml, uint32_t *aml_len,
-                      const uint8_t *addr_ptr, const char *sig,
+                      const uint8_t *addr_ptr, int addr_size, const char *sig,
                       bool verify_checksum);
 
 #endif /* TEST_ACPI_UTILS_H */
diff --git a/tests/bios-tables-test.c b/tests/bios-tables-test.c
index a506dcbb29..11e07be093 100644
--- a/tests/bios-tables-test.c
+++ b/tests/bios-tables-test.c
@@ -24,9 +24,15 @@
 #define ACPI_REBUILD_EXPECTED_AML "TEST_ACPI_REBUILD_AML"
 
 typedef struct {
+    const char *accel;
     const char *machine;
     const char *variant;
-    uint32_t rsdp_addr;
+    const char *uefi_fl1;
+    const char *uefi_fl2;
+    const char *cd;
+    const uint64_t ram_start;
+    const uint64_t scan_len;
+    uint64_t rsdp_addr;
     uint8_t rsdp_table[36 /* ACPI 2.0+ RSDP size */];
     GArray *tables;
     uint32_t smbios_ep_addr;
@@ -77,22 +83,13 @@ static void free_test_data(test_data *data)
     g_array_free(data->tables, true);
 }
 
-static void test_acpi_rsdp_address(test_data *data)
-{
-    uint32_t off = acpi_find_rsdp_address(data->qts);
-    g_assert_cmphex(off, <, 0x100000);
-    data->rsdp_addr = off;
-}
-
 static void test_acpi_rsdp_table(test_data *data)
 {
-    uint8_t *rsdp_table = data->rsdp_table, revision;
-    uint32_t addr = data->rsdp_addr;
+    uint8_t *rsdp_table = data->rsdp_table;
 
-    acpi_parse_rsdp_table(data->qts, addr, rsdp_table);
-    revision = rsdp_table[15 /* Revision offset */];
+    acpi_fetch_rsdp_table(data->qts, data->rsdp_addr, rsdp_table);
 
-    switch (revision) {
+    switch (rsdp_table[15 /* Revision offset */]) {
     case 0: /* ACPI 1.0 RSDP */
         /* With rev 1, checksum is only for the first 20 bytes */
         g_assert(!acpi_calc_checksum(rsdp_table,  20));
@@ -107,21 +104,29 @@ static void test_acpi_rsdp_table(test_data *data)
     }
 }
 
-static void test_acpi_rsdt_table(test_data *data)
+static void test_acpi_rxsdt_table(test_data *data)
 {
+    const char *sig = "RSDT";
     AcpiSdtTable rsdt = {};
+    int entry_size = 4;
+    int addr_off = 16 /* RsdtAddress */;
     uint8_t *ent;
 
-    /* read RSDT table */
+    if (data->rsdp_table[15 /* Revision offset */] != 0) {
+        addr_off = 24 /* XsdtAddress */;
+        entry_size = 8;
+        sig = "XSDT";
+    }
+    /* read [RX]SDT table */
     acpi_fetch_table(data->qts, &rsdt.aml, &rsdt.aml_len,
-                     &data->rsdp_table[16 /* RsdtAddress */], "RSDT", true);
+                     &data->rsdp_table[addr_off], entry_size, sig, true);
 
     /* Load all tables and add to test list directly RSDT referenced tables */
-    ACPI_FOREACH_RSDT_ENTRY(rsdt.aml, rsdt.aml_len, ent, 4 /* Entry size */) {
+    ACPI_FOREACH_RSDT_ENTRY(rsdt.aml, rsdt.aml_len, ent, entry_size) {
         AcpiSdtTable ssdt_table = {};
 
         acpi_fetch_table(data->qts, &ssdt_table.aml, &ssdt_table.aml_len, ent,
-                         NULL, true);
+                         entry_size, NULL, true);
         /* Add table to ASL test tables list */
         g_array_append_val(data->tables, ssdt_table);
     }
@@ -134,16 +139,29 @@ static void test_acpi_fadt_table(test_data *data)
     AcpiSdtTable table = g_array_index(data->tables, typeof(table), 0);
     uint8_t *fadt_aml = table.aml;
     uint32_t fadt_len = table.aml_len;
+    uint32_t val;
+    int dsdt_offset = 40 /* DSDT */;
+    int dsdt_entry_size = 4;
 
     g_assert(compare_signature(&table, "FACP"));
 
     /* Since DSDT/FACS isn't in RSDT, add them to ASL test list manually */
-    acpi_fetch_table(data->qts, &table.aml, &table.aml_len,
-                     fadt_aml + 36 /* FIRMWARE_CTRL */, "FACS", false);
-    g_array_append_val(data->tables, table);
+    memcpy(&val, fadt_aml + 112 /* Flags */, 4);
+    val = le32_to_cpu(val);
+    if (!(val & 1UL << 20 /* HW_REDUCED_ACPI */)) {
+        acpi_fetch_table(data->qts, &table.aml, &table.aml_len,
+                         fadt_aml + 36 /* FIRMWARE_CTRL */, 4, "FACS", false);
+        g_array_append_val(data->tables, table);
+    }
 
+    memcpy(&val, fadt_aml + dsdt_offset, 4);
+    val = le32_to_cpu(val);
+    if (!val) {
+        dsdt_offset = 140 /* X_DSDT */;
+        dsdt_entry_size = 8;
+    }
     acpi_fetch_table(data->qts, &table.aml, &table.aml_len,
-                     fadt_aml + 40 /* DSDT */, "DSDT", true);
+                     fadt_aml + dsdt_offset, dsdt_entry_size, "DSDT", true);
     g_array_append_val(data->tables, table);
 
     memset(fadt_aml + 36, 0, 4); /* sanitize FIRMWARE_CTRL ptr */
@@ -177,11 +195,14 @@ static void dump_aml_files(test_data *data, bool rebuild)
                                        sdt->aml, ext);
             fd = g_open(aml_file, O_WRONLY|O_TRUNC|O_CREAT,
                         S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH);
+            if (fd < 0) {
+                perror(aml_file);
+            }
+            g_assert(fd >= 0);
         } else {
             fd = g_file_open_tmp("aml-XXXXXX", &sdt->aml_file, &error);
             g_assert_no_error(error);
         }
-        g_assert(fd >= 0);
 
         ret = qemu_write_full(fd, sdt->aml, sdt->aml_len);
         g_assert(ret == sdt->aml_len);
@@ -505,23 +526,44 @@ static void test_smbios_structs(test_data *data)
 static void test_acpi_one(const char *params, test_data *data)
 {
     char *args;
-
-    /* Disable kernel irqchip to be able to override apic irq0. */
-    args = g_strdup_printf("-machine %s,accel=%s,kernel-irqchip=off "
-                           "-net none -display none %s "
-                           "-drive id=hd0,if=none,file=%s,format=raw "
-                           "-device ide-hd,drive=hd0 ",
-                           data->machine, "kvm:tcg",
-                           params ? params : "", disk);
+    bool use_uefi = data->uefi_fl1 && data->uefi_fl2;
+
+    if (use_uefi) {
+        /*
+         * TODO: convert '-drive if=pflash' to new syntax (see e33763be7cd3)
+         * when arm/virt boad starts to support it.
+         */
+        args = g_strdup_printf("-machine %s,accel=%s -nodefaults -nographic "
+            "-drive if=pflash,format=raw,file=%s,readonly "
+            "-drive if=pflash,format=raw,file=%s,snapshot=on -cdrom %s %s",
+            data->machine, data->accel ? data->accel : "kvm:tcg",
+            data->uefi_fl1, data->uefi_fl2, data->cd, params ? params : "");
+
+    } else {
+        /* Disable kernel irqchip to be able to override apic irq0. */
+        args = g_strdup_printf("-machine %s,accel=%s,kernel-irqchip=off "
+            "-net none -display none %s "
+            "-drive id=hd0,if=none,file=%s,format=raw "
+            "-device ide-hd,drive=hd0 ",
+             data->machine, data->accel ? data->accel : "kvm:tcg",
+             params ? params : "", disk);
+    }
 
     data->qts = qtest_init(args);
 
-    boot_sector_test(data->qts);
+    if (use_uefi) {
+        g_assert(data->scan_len);
+        data->rsdp_addr = acpi_find_rsdp_address_uefi(data->qts,
+            data->ram_start, data->scan_len);
+    } else {
+        boot_sector_test(data->qts);
+        data->rsdp_addr = acpi_find_rsdp_address(data->qts);
+        g_assert_cmphex(data->rsdp_addr, <, 0x100000);
+    }
 
     data->tables = g_array_new(false, true, sizeof(AcpiSdtTable));
-    test_acpi_rsdp_address(data);
     test_acpi_rsdp_table(data);
-    test_acpi_rsdt_table(data);
+    test_acpi_rxsdt_table(data);
     test_acpi_fadt_table(data);
 
     if (iasl) {
@@ -532,8 +574,15 @@ static void test_acpi_one(const char *params, test_data *data)
         }
     }
 
-    test_smbios_entry_point(data);
-    test_smbios_structs(data);
+    /*
+     * TODO: make SMBIOS tests work with UEFI firmware.
+     * Bug filed against uefi-test-tools to provide the entry point:
+     * https://bugs.launchpad.net/qemu/+bug/1821884
+     */
+    if (!use_uefi) {
+        test_smbios_entry_point(data);
+        test_smbios_structs(data);
+    }
 
     assert(!global_qtest);
     qtest_quit(data->qts);
@@ -769,13 +818,14 @@ int main(int argc, char *argv[])
     const char *arch = qtest_get_arch();
     int ret;
 
-    ret = boot_sector_init(disk);
-    if(ret)
-        return ret;
-
     g_test_init(&argc, &argv, NULL);
 
     if (strcmp(arch, "i386") == 0 || strcmp(arch, "x86_64") == 0) {
+        ret = boot_sector_init(disk);
+        if (ret) {
+            return ret;
+        }
+
         qtest_add_func("acpi/piix4", test_acpi_piix4_tcg);
         qtest_add_func("acpi/piix4/bridge", test_acpi_piix4_tcg_bridge);
         qtest_add_func("acpi/q35", test_acpi_q35_tcg);
diff --git a/tests/data/acpi/rebuild-expected-aml.sh b/tests/data/acpi/rebuild-expected-aml.sh
index abdff70a0d..ff7e62249d 100755
--- a/tests/data/acpi/rebuild-expected-aml.sh
+++ b/tests/data/acpi/rebuild-expected-aml.sh
@@ -7,21 +7,12 @@
 #
 # Authors:
 #  Marcel Apfelbaum <marcel.a@redhat.com>
+#  Igor Mammedov <imammedo@redhat.com>
 #
 # This work is licensed under the terms of the GNU GPLv2.
 # See the COPYING.LIB file in the top-level directory.
 
-qemu=
-
-if [ -e x86_64-softmmu/qemu-system-x86_64 ]; then
-    qemu="x86_64-softmmu/qemu-system-x86_64"
-elif [ -e i386-softmmu/qemu-system-i386 ]; then
-    qemu="i386-softmmu/qemu-system-i386"
-else
-    echo "Run 'make' to build the qemu exectutable!"
-    echo "Run this script from the build directory."
-    exit 1;
-fi
+qemu_bins="x86_64-softmmu/qemu-system-x86_64"
 
 if [ ! -e "tests/bios-tables-test" ]; then
     echo "Test: bios-tables-test is required! Run make check before this script."
@@ -29,6 +20,14 @@ if [ ! -e "tests/bios-tables-test" ]; then
     exit 1;
 fi
 
-TEST_ACPI_REBUILD_AML=y QTEST_QEMU_BINARY=$qemu tests/bios-tables-test
+for qemu in $qemu_bins; do
+    if [ ! -e $qemu ]; then
+        echo "Run 'make' to build the following QEMU executables: $qemu_bins"
+        echo "Also, run this script from the build directory."
+        exit 1;
+    fi
+    TEST_ACPI_REBUILD_AML=y QTEST_QEMU_BINARY=$qemu tests/bios-tables-test
+done
+
 
 echo "The files were rebuilt and can be added to git."
diff --git a/tests/vmgenid-test.c b/tests/vmgenid-test.c
index ae38ee5ac0..85d8e6463e 100644
--- a/tests/vmgenid-test.c
+++ b/tests/vmgenid-test.c
@@ -40,14 +40,14 @@ static uint32_t acpi_find_vgia(QTestState *qts)
     g_assert_cmphex(rsdp_offset, <, RSDP_ADDR_INVALID);
 
 
-    acpi_parse_rsdp_table(qts, rsdp_offset, rsdp_table);
+    acpi_fetch_rsdp_table(qts, rsdp_offset, rsdp_table);
     acpi_fetch_table(qts, &rsdt, &rsdt_len, &rsdp_table[16 /* RsdtAddress */],
-                     "RSDT", true);
+                     4, "RSDT", true);
 
     ACPI_FOREACH_RSDT_ENTRY(rsdt, rsdt_len, ent, 4 /* Entry size */) {
         uint8_t *table_aml;
 
-        acpi_fetch_table(qts, &table_aml, &table_length, ent, NULL, true);
+        acpi_fetch_table(qts, &table_aml, &table_length, ent, 4, NULL, true);
         if (!memcmp(table_aml + 16 /* OEM Table ID */, "VMGENID", 7)) {
             uint32_t vgia_val;
             uint8_t *aml = &table_aml[36 /* AML byte-code start */];
author	Peter Maydell <peter.maydell@linaro.org>	2019-05-21 14:56:57 +0100
committer	Peter Maydell <peter.maydell@linaro.org>	2019-05-21 14:56:57 +0100
commit	247ba27c528c52e4a41c233c1c9a699f40e4d2a5 (patch)
tree	cec47b9b84e1e099b1295468f59fe31490c6e379
parent	62516a0a18cd156d913dd625baca52c46743223b (diff)
parent	ba02ff90ee1dcaf7aa5645075217e555ae2c54ea (diff)