diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2019-05-21 14:56:57 +0100 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2019-05-21 14:56:57 +0100 |
commit | 247ba27c528c52e4a41c233c1c9a699f40e4d2a5 (patch) | |
tree | cec47b9b84e1e099b1295468f59fe31490c6e379 | |
parent | 62516a0a18cd156d913dd625baca52c46743223b (diff) | |
parent | ba02ff90ee1dcaf7aa5645075217e555ae2c54ea (diff) |
Merge remote-tracking branch 'remotes/mst/tags/for_upstream' into staging
pci, pc, virtio: features, fixes
reconnect for vhost blk
tests for UEFI
misc other stuff
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
# gpg: Signature made Tue 21 May 2019 14:41:32 BST
# gpg: using RSA key 281F0DB8D28D5469
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>" [full]
# gpg: aka "Michael S. Tsirkin <mst@redhat.com>" [full]
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17 0970 C350 3912 AFBE 8E67
# Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA 8A0D 281F 0DB8 D28D 5469
* remotes/mst/tags/for_upstream: (34 commits)
tests: acpi: print error unable to dump ACPI table during rebuild
tests: acpi: refactor rebuild-expected-aml.sh to dump ACPI tables for a specified list of targets
tests: acpi: allow to override default accelerator
tests: acpi: ignore SMBIOS tests when UEFI firmware is used
tests: acpi: add a way to start tests with UEFI firmware
tests: acpi: add acpi_find_rsdp_address_uefi() helper
tests: acpi: move boot_sector_init() into x86 tests branch
tests: acpi: skip FACS table if board uses hw reduced ACPI profile
tests: acpi: fetch X_DSDT if pointer to DSDT is 0
tests: acpi: make pointer to RSDP 64bit
tests: acpi: make RSDT test routine handle XSDT
tests: acpi: make acpi_fetch_table() take size of fetched table pointer
tests: acpi: rename acpi_parse_rsdp_table() into acpi_fetch_rsdp_table()
pci: Simplify pci_bus_is_root()
pcie: Remove redundant test in pcie_mmcfg_data_{read,write}()
libvhost-user: fix bad vu_log_write
hw/arm/virt-acpi-build: pass AcpiMcfgInfo to build_mcfg()
i386, acpi: remove mcfg_ prefix in AcpiMcfgInfo members
hw/arm/virt-acpi-build: remove unnecessary variable mcfg_start
do not call vhost_net_cleanup() on running net from char user event
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
36 files changed, 1890 insertions, 1453 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 9424a490d6..a6948ebc63 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1484,7 +1484,7 @@ M: Michael S. Tsirkin <mst@redhat.com> S: Supported F: hw/*/*vhost* F: docs/interop/vhost-user.json -F: docs/interop/vhost-user.txt +F: docs/interop/vhost-user.rst F: contrib/vhost-user-*/ F: backends/vhost-user.c F: include/sysemu/vhost-user-backend.h diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c index 74d42177c5..3825b1cacf 100644 --- a/contrib/libvhost-user/libvhost-user.c +++ b/contrib/libvhost-user/libvhost-user.c @@ -433,7 +433,7 @@ vu_log_write(VuDev *dev, uint64_t address, uint64_t length) page = address / VHOST_LOG_PAGE; while (page * VHOST_LOG_PAGE < address + length) { vu_log_page(dev->log_table, page); - page += VHOST_LOG_PAGE; + page += 1; } vu_log_kick(dev); diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c index 43583f2659..86a3987744 100644 --- a/contrib/vhost-user-blk/vhost-user-blk.c +++ b/contrib/vhost-user-blk/vhost-user-blk.c @@ -398,7 +398,8 @@ vub_get_features(VuDev *dev) static uint64_t vub_get_protocol_features(VuDev *dev) { - return 1ull << VHOST_USER_PROTOCOL_F_CONFIG; + return 1ull << VHOST_USER_PROTOCOL_F_CONFIG | + 1ull << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD; } static int diff --git a/docs/interop/index.rst b/docs/interop/index.rst index 2df977dd52..a037bd67ec 100644 --- a/docs/interop/index.rst +++ b/docs/interop/index.rst @@ -15,4 +15,4 @@ Contents: bitmaps live-block-operations pr-helper - + vhost-user diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst new file mode 100644 index 0000000000..7f3232c798 --- /dev/null +++ b/docs/interop/vhost-user.rst @@ -0,0 +1,1351 @@ +=================== +Vhost-user Protocol +=================== +:Copyright: 2014 Virtual Open Systems Sarl. +:Licence: This work is licensed under the terms of the GNU GPL, + version 2 or later. 
See the COPYING file in the top-level + directory. + +.. contents:: Table of Contents + +Introduction +============ + +This protocol is aiming to complement the ``ioctl`` interface used to +control the vhost implementation in the Linux kernel. It implements +the control plane needed to establish virtqueue sharing with a user +space process on the same host. It uses communication over a Unix +domain socket to share file descriptors in the ancillary data of the +message. + +The protocol defines 2 sides of the communication, *master* and +*slave*. *Master* is the application that shares its virtqueues, in +our case QEMU. *Slave* is the consumer of the virtqueues. + +In the current implementation QEMU is the *master*, and the *slave* is +the external process consuming the virtio queues, for example a +software Ethernet switch running in user space, such as Snabbswitch, +or a block device backend processing read & write to a virtual +disk. In order to facilitate interoperability between various backend +implementations, it is recommended to follow the :ref:`Backend program +conventions <backend_conventions>`. + +*Master* and *slave* can be either a client (i.e. connecting) or +server (listening) in the socket communication. + +Message Specification +===================== + +.. Note:: All numbers are in the machine native byte order. + +A vhost-user message consists of 3 header fields and a payload. + ++---------+-------+------+---------+ +| request | flags | size | payload | ++---------+-------+------+---------+ + +Header +------ + +:request: 32-bit type of the request + +:flags: 32-bit bit field + +- Lower 2 bits are the version (currently 0x01) +- Bit 2 is the reply flag - needs to be sent on each reply from the slave +- Bit 3 is the need_reply flag - see :ref:`REPLY_ACK <reply_ack>` for + details. 
+ +:size: 32-bit size of the payload + +Payload +------- + +Depending on the request type, **payload** can be: + +A single 64-bit integer +^^^^^^^^^^^^^^^^^^^^^^^ + ++-----+ +| u64 | ++-----+ + +:u64: a 64-bit unsigned integer + +A vring state description +^^^^^^^^^^^^^^^^^^^^^^^^^ + ++-------+-----+ +| index | num | ++-------+-----+ + +:index: a 32-bit index + +:num: a 32-bit number + +A vring address description +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ++-------+-------+------+------------+------+-----------+-----+ +| index | flags | size | descriptor | used | available | log | ++-------+-------+------+------------+------+-----------+-----+ + +:index: a 32-bit vring index + +:flags: a 32-bit vring flags + +:descriptor: a 64-bit ring address of the vring descriptor table + +:used: a 64-bit ring address of the vring used ring + +:available: a 64-bit ring address of the vring available ring + +:log: a 64-bit guest address for logging + +Note that a ring address is an IOVA if ``VIRTIO_F_IOMMU_PLATFORM`` has +been negotiated. Otherwise it is a user address. + +Memory regions description +^^^^^^^^^^^^^^^^^^^^^^^^^^ + ++-------------+---------+---------+-----+---------+ +| num regions | padding | region0 | ... | region7 | ++-------------+---------+---------+-----+---------+ + +:num regions: a 32-bit number of regions + +:padding: 32-bit + +A region is: + ++---------------+------+--------------+-------------+ +| guest address | size | user address | mmap offset | ++---------------+------+--------------+-------------+ + +:guest address: a 64-bit guest address of the region + +:size: a 64-bit size + +:user address: a 64-bit user address + +:mmap offset: 64-bit offset where region starts in the mapped memory + +Log description +^^^^^^^^^^^^^^^ + ++----------+------------+ +| log size | log offset | ++----------+------------+ + +:log size: size of area used for logging + +:log offset: offset from start of supplied file descriptor where + logging starts (i.e. 
where guest address 0 would be + logged) + +An IOTLB message +^^^^^^^^^^^^^^^^ + ++------+------+--------------+-------------------+------+ +| iova | size | user address | permissions flags | type | ++------+------+--------------+-------------------+------+ + +:iova: a 64-bit I/O virtual address programmed by the guest + +:size: a 64-bit size + +:user address: a 64-bit user address + +:permissions flags: an 8-bit value: + - 0: No access + - 1: Read access + - 2: Write access + - 3: Read/Write access + +:type: an 8-bit IOTLB message type: + - 1: IOTLB miss + - 2: IOTLB update + - 3: IOTLB invalidate + - 4: IOTLB access fail + +Virtio device config space +^^^^^^^^^^^^^^^^^^^^^^^^^^ + ++--------+------+-------+---------+ +| offset | size | flags | payload | ++--------+------+-------+---------+ + +:offset: a 32-bit offset of virtio device's configuration space + +:size: a 32-bit configuration space access size in bytes + +:flags: a 32-bit value: + - 0: Vhost master messages used for writeable fields + - 1: Vhost master messages used for live migration + +:payload: Size bytes array holding the contents of the virtio + device's configuration space + +Vring area description +^^^^^^^^^^^^^^^^^^^^^^ + ++-----+------+--------+ +| u64 | size | offset | ++-----+------+--------+ + +:u64: a 64-bit integer contains vring index and flags + +:size: a 64-bit size of this area + +:offset: a 64-bit offset of this area from the start of the + supplied file descriptor + +Inflight description +^^^^^^^^^^^^^^^^^^^^ + ++-----------+-------------+------------+------------+ +| mmap size | mmap offset | num queues | queue size | ++-----------+-------------+------------+------------+ + +:mmap size: a 64-bit size of area to track inflight I/O + +:mmap offset: a 64-bit offset of this area from the start + of the supplied file descriptor + +:num queues: a 16-bit number of virtqueues + +:queue size: a 16-bit size of virtqueues + +C structure +----------- + +In QEMU the vhost-user message is 
implemented with the following struct: + +.. code:: c + + typedef struct VhostUserMsg { + VhostUserRequest request; + uint32_t flags; + uint32_t size; + union { + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + VhostUserMemory memory; + VhostUserLog log; + struct vhost_iotlb_msg iotlb; + VhostUserConfig config; + VhostUserVringArea area; + VhostUserInflight inflight; + }; + } QEMU_PACKED VhostUserMsg; + +Communication +============= + +The protocol for vhost-user is based on the existing implementation of +vhost for the Linux Kernel. Most messages that can be sent via the +Unix domain socket implementing vhost-user have an equivalent ioctl to +the kernel implementation. + +The communication consists of *master* sending message requests and +*slave* sending message replies. Most of the requests don't require +replies. Here is a list of the ones that do: + +* ``VHOST_USER_GET_FEATURES`` +* ``VHOST_USER_GET_PROTOCOL_FEATURES`` +* ``VHOST_USER_GET_VRING_BASE`` +* ``VHOST_USER_SET_LOG_BASE`` (if ``VHOST_USER_PROTOCOL_F_LOG_SHMFD``) +* ``VHOST_USER_GET_INFLIGHT_FD`` (if ``VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD``) + +.. seealso:: + + :ref:`REPLY_ACK <reply_ack>` + The section on ``REPLY_ACK`` protocol extension. + +There are several messages that the master sends with file descriptors passed +in the ancillary data: + +* ``VHOST_USER_SET_MEM_TABLE`` +* ``VHOST_USER_SET_LOG_BASE`` (if ``VHOST_USER_PROTOCOL_F_LOG_SHMFD``) +* ``VHOST_USER_SET_LOG_FD`` +* ``VHOST_USER_SET_VRING_KICK`` +* ``VHOST_USER_SET_VRING_CALL`` +* ``VHOST_USER_SET_VRING_ERR`` +* ``VHOST_USER_SET_SLAVE_REQ_FD`` +* ``VHOST_USER_SET_INFLIGHT_FD`` (if ``VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD``) + +If *master* is unable to send the full message or receives a wrong +reply it will close the connection. An optional reconnection mechanism +can be implemented. 
+ +Any protocol extensions are gated by protocol feature bits, which +allows full backwards compatibility on both master and slave. As +older slaves don't support negotiating protocol features, a feature +bit was dedicated for this purpose:: + + #define VHOST_USER_F_PROTOCOL_FEATURES 30 + +Starting and stopping rings +--------------------------- + +Client must only process each ring when it is started. + +Client must only pass data between the ring and the backend, when the +ring is enabled. + +If ring is started but disabled, client must process the ring without +talking to the backend. + +For example, for a networking device, in the disabled state client +must not supply any new RX packets, but must process and discard any +TX packets. + +If ``VHOST_USER_F_PROTOCOL_FEATURES`` has not been negotiated, the +ring is initialized in an enabled state. + +If ``VHOST_USER_F_PROTOCOL_FEATURES`` has been negotiated, the ring is +initialized in a disabled state. Client must not pass data to/from the +backend until ring is enabled by ``VHOST_USER_SET_VRING_ENABLE`` with +parameter 1, or after it has been disabled by +``VHOST_USER_SET_VRING_ENABLE`` with parameter 0. + +Each ring is initialized in a stopped state, client must not process +it until ring is started, or after it has been stopped. + +Client must start ring upon receiving a kick (that is, detecting that +file descriptor is readable) on the descriptor specified by +``VHOST_USER_SET_VRING_KICK``, and stop ring upon receiving +``VHOST_USER_GET_VRING_BASE``. + +While processing the rings (whether they are enabled or not), client +must support changing some configuration aspects on the fly. + +Multiple queue support +---------------------- + +Multiple queue is treated as a protocol extension, hence the slave has +to implement protocol features first. The multiple queues feature is +supported only when the protocol feature ``VHOST_USER_PROTOCOL_F_MQ`` +(bit 0) is set. 
+ +The max number of queue pairs the slave supports can be queried with +message ``VHOST_USER_GET_QUEUE_NUM``. Master should stop when the +number of requested queues is bigger than that. + +As all queues share one connection, the master uses a unique index for each +queue in the sent message to identify a specified queue. One queue pair +is enabled initially. More queues are enabled dynamically, by sending +message ``VHOST_USER_SET_VRING_ENABLE``. + +Migration +--------- + +During live migration, the master may need to track the modifications +the slave makes to the memory mapped regions. The client should mark +the dirty pages in a log. Once it complies to this logging, it may +declare the ``VHOST_F_LOG_ALL`` vhost feature. + +To start/stop logging of data/used ring writes, server may send +messages ``VHOST_USER_SET_FEATURES`` with ``VHOST_F_LOG_ALL`` and +``VHOST_USER_SET_VRING_ADDR`` with ``VHOST_VRING_F_LOG`` in ring's +flags set to 1/0, respectively. + +All the modifications to memory pointed by vring "descriptor" should +be marked. Modifications to "used" vring should be marked if +``VHOST_VRING_F_LOG`` is part of ring's flags. + +Dirty pages are of size:: + + #define VHOST_LOG_PAGE 0x1000 + +The log memory fd is provided in the ancillary data of +``VHOST_USER_SET_LOG_BASE`` message when the slave has +``VHOST_USER_PROTOCOL_F_LOG_SHMFD`` protocol feature. + +The size of the log is supplied as part of ``VhostUserMsg`` which +should be large enough to cover all known guest addresses. Log starts +at the supplied offset in the supplied file descriptor. The log +covers from address 0 to the maximum of guest regions. In pseudo-code, +to mark page at ``addr`` as dirty:: + + page = addr / VHOST_LOG_PAGE + log[page / 8] |= 1 << page % 8 + +Where ``addr`` is the guest physical address. + +Use atomic operations, as the log may be concurrently manipulated. 
+ +Note that when logging modifications to the used ring (when +``VHOST_VRING_F_LOG`` is set for this ring), ``log_guest_addr`` should +be used to calculate the log offset: the write to first byte of the +used ring is logged at this offset from log start. Also note that this +value might be outside the legal guest physical address range +(i.e. does not have to be covered by the ``VhostUserMemory`` table), but +the bit offset of the last byte of the ring must fall within the size +supplied by ``VhostUserLog``. + +``VHOST_USER_SET_LOG_FD`` is an optional message with an eventfd in +ancillary data, it may be used to inform the master that the log has +been modified. + +Once the source has finished migration, rings will be stopped by the +source. No further update must be done before rings are restarted. + +In postcopy migration the slave is started before all the memory has +been received from the source host, and care must be taken to avoid +accessing pages that have yet to be received. The slave opens a +'userfault'-fd and registers the memory with it; this fd is then +passed back over to the master. The master services requests on the +userfaultfd for pages that are accessed and when the page is available +it performs WAKE ioctl's on the userfaultfd to wake the stalled +slave. The client indicates support for this via the +``VHOST_USER_PROTOCOL_F_PAGEFAULT`` feature. + +Memory access +------------- + +The master sends a list of vhost memory regions to the slave using the +``VHOST_USER_SET_MEM_TABLE`` message. Each region has two base +addresses: a guest address and a user address. + +Messages contain guest addresses and/or user addresses to reference locations +within the shared memory. The mapping of these addresses works as follows. + +User addresses map to the vhost memory region containing that user address. 
+ +When the ``VIRTIO_F_IOMMU_PLATFORM`` feature has not been negotiated: + +* Guest addresses map to the vhost memory region containing that guest + address. + +When the ``VIRTIO_F_IOMMU_PLATFORM`` feature has been negotiated: + +* Guest addresses are also called I/O virtual addresses (IOVAs). They are + translated to user addresses via the IOTLB. + +* The vhost memory region guest address is not used. + +IOMMU support +------------- + +When the ``VIRTIO_F_IOMMU_PLATFORM`` feature has been negotiated, the +master sends IOTLB entries update & invalidation by sending +``VHOST_USER_IOTLB_MSG`` requests to the slave with a ``struct +vhost_iotlb_msg`` as payload. For update events, the ``iotlb`` payload +has to be filled with the update message type (2), the I/O virtual +address, the size, the user virtual address, and the permissions +flags. Addresses and size must be within vhost memory regions set via +the ``VHOST_USER_SET_MEM_TABLE`` request. For invalidation events, the +``iotlb`` payload has to be filled with the invalidation message type +(3), the I/O virtual address and the size. On success, the slave is +expected to reply with a zero payload, non-zero otherwise. + +The slave relies on the slave communication channel (see :ref:`Slave +communication <slave_communication>` section below) to send IOTLB miss +and access failure events, by sending ``VHOST_USER_SLAVE_IOTLB_MSG`` +requests to the master with a ``struct vhost_iotlb_msg`` as +payload. For miss events, the iotlb payload has to be filled with the +miss message type (1), the I/O virtual address and the permissions +flags. For access failure event, the iotlb payload has to be filled +with the access failure message type (4), the I/O virtual address and +the permissions flags. For synchronization purpose, the slave may +rely on the reply-ack feature, so the master may send a reply when +operation is completed if the reply-ack feature is negotiated and +slave requests a reply.
For miss events, completed operation means +either master sent an update message containing the IOTLB entry +containing requested address and permission, or master sent nothing if +the IOTLB miss message is invalid (invalid IOVA or permission). + +The master isn't expected to take the initiative to send IOTLB update +messages, as the slave sends IOTLB miss messages for the guest virtual +memory areas it needs to access. + +.. _slave_communication: + +Slave communication +------------------- + +An optional communication channel is provided if the slave declares +``VHOST_USER_PROTOCOL_F_SLAVE_REQ`` protocol feature, to allow the +slave to make requests to the master. + +The fd is provided via ``VHOST_USER_SET_SLAVE_REQ_FD`` ancillary data. + +A slave may then send ``VHOST_USER_SLAVE_*`` messages to the master +using this fd communication channel. + +If ``VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD`` protocol feature is +negotiated, slave can send file descriptors (at most 8 descriptors in +each message) to master via ancillary data using this fd communication +channel. + +Inflight I/O tracking +--------------------- + +To support reconnecting after restart or crash, slave may need to +resubmit inflight I/Os. If virtqueue is processed in order, we can +easily achieve that by getting the inflight descriptors from +descriptor table (split virtqueue) or descriptor ring (packed +virtqueue). However, it can't work when we process descriptors +out-of-order because some entries which store the information of +inflight descriptors in available ring (split virtqueue) or descriptor +ring (packed virtqueue) might be overwritten by new entries. To solve +this problem, slave needs to allocate an extra buffer to store this +information of inflight descriptors and share it with master for +persistence. ``VHOST_USER_GET_INFLIGHT_FD`` and +``VHOST_USER_SET_INFLIGHT_FD`` are used to transfer this buffer +between master and slave.
And the format of this buffer is described +below: + ++---------------+---------------+-----+---------------+ +| queue0 region | queue1 region | ... | queueN region | ++---------------+---------------+-----+---------------+ + +N is the number of available virtqueues. Slave could get it from num +queues field of ``VhostUserInflight``. + +For split virtqueue, queue region can be implemented as: + +.. code:: c + + typedef struct DescStateSplit { + /* Indicate whether this descriptor is inflight or not. + * Only available for head-descriptor. */ + uint8_t inflight; + + /* Padding */ + uint8_t padding[5]; + + /* Maintain a list for the last batch of used descriptors. + * Only available when batching is used for submitting */ + uint16_t next; + + /* Used to preserve the order of fetching available descriptors. + * Only available for head-descriptor. */ + uint64_t counter; + } DescStateSplit; + + typedef struct QueueRegionSplit { + /* The feature flags of this region. Now it's initialized to 0. */ + uint64_t features; + + /* The version of this region. It's 1 currently. + * Zero value indicates an uninitialized buffer */ + uint16_t version; + + /* The size of DescStateSplit array. It's equal to the virtqueue + * size. Slave could get it from queue size field of VhostUserInflight. */ + uint16_t desc_num; + + /* The head of list that track the last batch of used descriptors. */ + uint16_t last_batch_head; + + /* Store the idx value of used ring */ + uint16_t used_idx; + + /* Used to track the state of each descriptor in descriptor table */ + DescStateSplit desc[0]; + } QueueRegionSplit; + +To track inflight I/O, the queue region should be processed as follows: + +When receiving available buffers from the driver: + +#. Get the next available head-descriptor index from available ring, ``i`` + +#. Set ``desc[i].counter`` to the value of global counter + +#. Increase global counter by 1 + +#. Set ``desc[i].inflight`` to 1 + +When supplying used buffers to the driver: + +1. 
Get corresponding used head-descriptor index, i + +2. Set ``desc[i].next`` to ``last_batch_head`` + +3. Set ``last_batch_head`` to ``i`` + +#. Steps 1,2,3 may be performed repeatedly if batching is possible + +#. Increase the ``idx`` value of used ring by the size of the batch + +#. Set the ``inflight`` field of each ``DescStateSplit`` entry in the batch to 0 + +#. Set ``used_idx`` to the ``idx`` value of used ring + +When reconnecting: + +#. If the value of ``used_idx`` does not match the ``idx`` value of + used ring (means the inflight field of ``DescStateSplit`` entries in + last batch may be incorrect), + + a. Subtract the value of ``used_idx`` from the ``idx`` value of + used ring to get last batch size of ``DescStateSplit`` entries + + #. Set the ``inflight`` field of each ``DescStateSplit`` entry to 0 in last batch + list which starts from ``last_batch_head`` + + #. Set ``used_idx`` to the ``idx`` value of used ring + +#. Resubmit inflight ``DescStateSplit`` entries in order of their + counter value + +For packed virtqueue, queue region can be implemented as: + +.. code:: c + + typedef struct DescStatePacked { + /* Indicate whether this descriptor is inflight or not. + * Only available for head-descriptor. */ + uint8_t inflight; + + /* Padding */ + uint8_t padding; + + /* Link to the next free entry */ + uint16_t next; + + /* Link to the last entry of descriptor list. + * Only available for head-descriptor. */ + uint16_t last; + + /* The length of descriptor list. + * Only available for head-descriptor. */ + uint16_t num; + + /* Used to preserve the order of fetching available descriptors. + * Only available for head-descriptor. */ + uint64_t counter; + + /* The buffer id */ + uint16_t id; + + /* The descriptor flags */ + uint16_t flags; + + /* The buffer length */ + uint32_t len; + + /* The buffer address */ + uint64_t addr; + } DescStatePacked; + + typedef struct QueueRegionPacked { + /* The feature flags of this region. Now it's initialized to 0. 
*/ + uint64_t features; + + /* The version of this region. It's 1 currently. + * Zero value indicates an uninitialized buffer */ + uint16_t version; + + /* The size of DescStatePacked array. It's equal to the virtqueue + * size. Slave could get it from queue size field of VhostUserInflight. */ + uint16_t desc_num; + + /* The head of free DescStatePacked entry list */ + uint16_t free_head; + + /* The old head of free DescStatePacked entry list */ + uint16_t old_free_head; + + /* The used index of descriptor ring */ + uint16_t used_idx; + + /* The old used index of descriptor ring */ + uint16_t old_used_idx; + + /* Device ring wrap counter */ + uint8_t used_wrap_counter; + + /* The old device ring wrap counter */ + uint8_t old_used_wrap_counter; + + /* Padding */ + uint8_t padding[7]; + + /* Used to track the state of each descriptor fetched from descriptor ring */ + DescStatePacked desc[0]; + } QueueRegionPacked; + +To track inflight I/O, the queue region should be processed as follows: + +When receiving available buffers from the driver: + +#. Get the next available descriptor entry from descriptor ring, ``d`` + +#. If ``d`` is head descriptor, + + a. Set ``desc[old_free_head].num`` to 0 + + #. Set ``desc[old_free_head].counter`` to the value of global counter + + #. Increase global counter by 1 + + #. Set ``desc[old_free_head].inflight`` to 1 + +#. If ``d`` is last descriptor, set ``desc[old_free_head].last`` to + ``free_head`` + +#. Increase ``desc[old_free_head].num`` by 1 + +#. Set ``desc[free_head].addr``, ``desc[free_head].len``, + ``desc[free_head].flags``, ``desc[free_head].id`` to ``d.addr``, + ``d.len``, ``d.flags``, ``d.id`` + +#. Set ``free_head`` to ``desc[free_head].next`` + +#. If ``d`` is last descriptor, set ``old_free_head`` to ``free_head`` + +When supplying used buffers to the driver: + +1. Get corresponding used head-descriptor entry from descriptor ring, + ``d`` + +2. Get corresponding ``DescStatePacked`` entry, ``e`` + +3. 
Set ``desc[e.last].next`` to ``free_head`` + +4. Set ``free_head`` to the index of ``e`` + +#. Steps 1,2,3,4 may be performed repeatedly if batching is possible + +#. Increase ``used_idx`` by the size of the batch and update + ``used_wrap_counter`` if needed + +#. Update ``d.flags`` + +#. Set the ``inflight`` field of each head ``DescStatePacked`` entry + in the batch to 0 + +#. Set ``old_free_head``, ``old_used_idx``, ``old_used_wrap_counter`` + to ``free_head``, ``used_idx``, ``used_wrap_counter`` + +When reconnecting: + +#. If ``used_idx`` does not match ``old_used_idx`` (means the + ``inflight`` field of ``DescStatePacked`` entries in last batch may + be incorrect), + + a. Get the next descriptor ring entry through ``old_used_idx``, ``d`` + + #. Use ``old_used_wrap_counter`` to calculate the available flags + + #. If ``d.flags`` is not equal to the calculated flags value (means + slave has submitted the buffer to guest driver before crash, so + it has to commit the in-progress update), set ``old_free_head``, + ``old_used_idx``, ``old_used_wrap_counter`` to ``free_head``, + ``used_idx``, ``used_wrap_counter`` + +#. Set ``free_head``, ``used_idx``, ``used_wrap_counter`` to + ``old_free_head``, ``old_used_idx``, ``old_used_wrap_counter`` + (roll back any in-progress update) + +#. Set the ``inflight`` field of each ``DescStatePacked`` entry in + free list to 0 + +#. Resubmit inflight ``DescStatePacked`` entries in order of their + counter value + +Protocol features +----------------- + +..
code:: c + + #define VHOST_USER_PROTOCOL_F_MQ 0 + #define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 + #define VHOST_USER_PROTOCOL_F_RARP 2 + #define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 + #define VHOST_USER_PROTOCOL_F_MTU 4 + #define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5 + #define VHOST_USER_PROTOCOL_F_CROSS_ENDIAN 6 + #define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7 + #define VHOST_USER_PROTOCOL_F_PAGEFAULT 8 + #define VHOST_USER_PROTOCOL_F_CONFIG 9 + #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10 + #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11 + #define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12 + +Master message types +-------------------- + +``VHOST_USER_GET_FEATURES`` + :id: 1 + :equivalent ioctl: ``VHOST_GET_FEATURES`` + :master payload: N/A + :slave payload: ``u64`` + + Get from the underlying vhost implementation the features bitmask. + Feature bit ``VHOST_USER_F_PROTOCOL_FEATURES`` signals slave support + for ``VHOST_USER_GET_PROTOCOL_FEATURES`` and + ``VHOST_USER_SET_PROTOCOL_FEATURES``. + +``VHOST_USER_SET_FEATURES`` + :id: 2 + :equivalent ioctl: ``VHOST_SET_FEATURES`` + :master payload: ``u64`` + + Enable features in the underlying vhost implementation using a + bitmask. Feature bit ``VHOST_USER_F_PROTOCOL_FEATURES`` signals + slave support for ``VHOST_USER_GET_PROTOCOL_FEATURES`` and + ``VHOST_USER_SET_PROTOCOL_FEATURES``. + +``VHOST_USER_GET_PROTOCOL_FEATURES`` + :id: 15 + :equivalent ioctl: ``VHOST_GET_FEATURES`` + :master payload: N/A + :slave payload: ``u64`` + + Get the protocol feature bitmask from the underlying vhost + implementation. Only legal if feature bit + ``VHOST_USER_F_PROTOCOL_FEATURES`` is present in + ``VHOST_USER_GET_FEATURES``. + +.. Note:: + Slave that reported ``VHOST_USER_F_PROTOCOL_FEATURES`` must + support this message even before ``VHOST_USER_SET_FEATURES`` was + called. 
+ +``VHOST_USER_SET_PROTOCOL_FEATURES`` + :id: 16 + :equivalent ioctl: ``VHOST_SET_FEATURES`` + :master payload: ``u64`` + + Enable protocol features in the underlying vhost implementation. + + Only legal if feature bit ``VHOST_USER_F_PROTOCOL_FEATURES`` is present in + ``VHOST_USER_GET_FEATURES``. + +.. Note:: + Slave that reported ``VHOST_USER_F_PROTOCOL_FEATURES`` must support + this message even before ``VHOST_USER_SET_FEATURES`` was called. + +``VHOST_USER_SET_OWNER`` + :id: 3 + :equivalent ioctl: ``VHOST_SET_OWNER`` + :master payload: N/A + + Issued when a new connection is established. It sets the current + *master* as an owner of the session. This can be used on the *slave* + as a "session start" flag. + +``VHOST_USER_RESET_OWNER`` + :id: 4 + :master payload: N/A + +.. admonition:: Deprecated + + This is no longer used. Used to be sent to request disabling all + rings, but some clients interpreted it to also discard connection + state (this interpretation would lead to bugs). It is recommended + that clients either ignore this message, or use it to disable all + rings. + +``VHOST_USER_SET_MEM_TABLE`` + :id: 5 + :equivalent ioctl: ``VHOST_SET_MEM_TABLE`` + :master payload: memory regions description + :slave payload: (postcopy only) memory regions description + + Sets the memory map regions on the slave so it can translate the + vring addresses. In the ancillary data there is an array of file + descriptors for each memory mapped region. The size and ordering of + the fds matches the number and ordering of memory regions. + + When ``VHOST_USER_POSTCOPY_LISTEN`` has been received, + ``SET_MEM_TABLE`` replies with the bases of the memory mapped + regions to the master. The slave must have mmap'd the regions but + not yet accessed them and should not yet generate a userfault + event. + +.. Note:: + ``NEED_REPLY_MASK`` is not set in this case. 
QEMU will then + reply back to the list of mappings with an empty + ``VHOST_USER_SET_MEM_TABLE`` as an acknowledgement; only upon + reception of this message may the guest start accessing the memory + and generating faults. + +``VHOST_USER_SET_LOG_BASE`` + :id: 6 + :equivalent ioctl: ``VHOST_SET_LOG_BASE`` + :master payload: u64 + :slave payload: N/A + + Sets logging shared memory space. + + When slave has ``VHOST_USER_PROTOCOL_F_LOG_SHMFD`` protocol feature, + the log memory fd is provided in the ancillary data of + ``VHOST_USER_SET_LOG_BASE`` message, the size and offset of shared + memory area provided in the message. + +``VHOST_USER_SET_LOG_FD`` + :id: 7 + :equivalent ioctl: ``VHOST_SET_LOG_FD`` + :master payload: N/A + + Sets the logging file descriptor, which is passed as ancillary data. + +``VHOST_USER_SET_VRING_NUM`` + :id: 8 + :equivalent ioctl: ``VHOST_SET_VRING_NUM`` + :master payload: vring state description + + Set the size of the queue. + +``VHOST_USER_SET_VRING_ADDR`` + :id: 9 + :equivalent ioctl: ``VHOST_SET_VRING_ADDR`` + :master payload: vring address description + :slave payload: N/A + + Sets the addresses of the different aspects of the vring. + +``VHOST_USER_SET_VRING_BASE`` + :id: 10 + :equivalent ioctl: ``VHOST_SET_VRING_BASE`` + :master payload: vring state description + + Sets the base offset in the available vring. + +``VHOST_USER_GET_VRING_BASE`` + :id: 11 + :equivalent ioctl: ``VHOST_GET_VRING_BASE`` + :master payload: vring state description + :slave payload: vring state description + + Get the available vring base offset. + +``VHOST_USER_SET_VRING_KICK`` + :id: 12 + :equivalent ioctl: ``VHOST_SET_VRING_KICK`` + :master payload: ``u64`` + + Set the event file descriptor for adding buffers to the vring. It is + passed in the ancillary data. + + Bits (0-7) of the payload contain the vring index. Bit 8 is the + invalid FD flag. This flag is set when there is no file descriptor + in the ancillary data. 
This signals that polling should be used + instead of waiting for a kick. + +``VHOST_USER_SET_VRING_CALL`` + :id: 13 + :equivalent ioctl: ``VHOST_SET_VRING_CALL`` + :master payload: ``u64`` + + Set the event file descriptor to signal when buffers are used. It is + passed in the ancillary data. + + Bits (0-7) of the payload contain the vring index. Bit 8 is the + invalid FD flag. This flag is set when there is no file descriptor + in the ancillary data. This signals that polling will be used + instead of waiting for the call. + +``VHOST_USER_SET_VRING_ERR`` + :id: 14 + :equivalent ioctl: ``VHOST_SET_VRING_ERR`` + :master payload: ``u64`` + + Set the event file descriptor to signal when error occurs. It is + passed in the ancillary data. + + Bits (0-7) of the payload contain the vring index. Bit 8 is the + invalid FD flag. This flag is set when there is no file descriptor + in the ancillary data. + +``VHOST_USER_GET_QUEUE_NUM`` + :id: 17 + :equivalent ioctl: N/A + :master payload: N/A + :slave payload: u64 + + Query how many queues the backend supports. + + This request should be sent only when ``VHOST_USER_PROTOCOL_F_MQ`` + is set in queried protocol features by + ``VHOST_USER_GET_PROTOCOL_FEATURES``. + +``VHOST_USER_SET_VRING_ENABLE`` + :id: 18 + :equivalent ioctl: N/A + :master payload: vring state description + + Signal slave to enable or disable corresponding vring. + + This request should be sent only when + ``VHOST_USER_F_PROTOCOL_FEATURES`` has been negotiated. + +``VHOST_USER_SEND_RARP`` + :id: 19 + :equivalent ioctl: N/A + :master payload: ``u64`` + + Ask vhost user backend to broadcast a fake RARP to notify the migration + is terminated for guest that does not support GUEST_ANNOUNCE. + + Only legal if feature bit ``VHOST_USER_F_PROTOCOL_FEATURES`` is + present in ``VHOST_USER_GET_FEATURES`` and protocol feature bit + ``VHOST_USER_PROTOCOL_F_RARP`` is present in + ``VHOST_USER_GET_PROTOCOL_FEATURES``. 
The first 6 bytes of the + payload contain the mac address of the guest to allow the vhost user + backend to construct and broadcast the fake RARP. + +``VHOST_USER_NET_SET_MTU`` + :id: 20 + :equivalent ioctl: N/A + :master payload: ``u64`` + + Set host MTU value exposed to the guest. + + This request should be sent only when ``VIRTIO_NET_F_MTU`` feature + has been successfully negotiated, ``VHOST_USER_F_PROTOCOL_FEATURES`` + is present in ``VHOST_USER_GET_FEATURES`` and protocol feature bit + ``VHOST_USER_PROTOCOL_F_NET_MTU`` is present in + ``VHOST_USER_GET_PROTOCOL_FEATURES``. + + If ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, slave must + respond with zero in case the specified MTU is valid, or non-zero + otherwise. + +``VHOST_USER_SET_SLAVE_REQ_FD`` + :id: 21 + :equivalent ioctl: N/A + :master payload: N/A + + Set the socket file descriptor for slave initiated requests. It is passed + in the ancillary data. + + This request should be sent only when + ``VHOST_USER_F_PROTOCOL_FEATURES`` has been negotiated, and protocol + feature bit ``VHOST_USER_PROTOCOL_F_SLAVE_REQ`` is present in + ``VHOST_USER_GET_PROTOCOL_FEATURES``. If + ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, slave must + respond with zero for success, non-zero otherwise. + +``VHOST_USER_IOTLB_MSG`` + :id: 22 + :equivalent ioctl: N/A (equivalent to ``VHOST_IOTLB_MSG`` message type) + :master payload: ``struct vhost_iotlb_msg`` + :slave payload: ``u64`` + + Send IOTLB messages with ``struct vhost_iotlb_msg`` as payload. + + Master sends such requests to update and invalidate entries in the + device IOTLB. The slave has to acknowledge the request by sending + zero as ``u64`` payload for success, non-zero otherwise. + + This request should be sent only when ``VIRTIO_F_IOMMU_PLATFORM`` + feature has been successfully negotiated. 
+ +``VHOST_USER_SET_VRING_ENDIAN`` + :id: 23 + :equivalent ioctl: ``VHOST_SET_VRING_ENDIAN`` + :master payload: vring state description + + Set the endianness of a VQ for legacy devices. Little-endian is + indicated with state.num set to 0 and big-endian is indicated with + state.num set to 1. Other values are invalid. + + This request should be sent only when + ``VHOST_USER_PROTOCOL_F_CROSS_ENDIAN`` has been negotiated. + Backends that negotiated this feature should handle both + endiannesses and expect this message once (per VQ) during device + configuration (ie. before the master starts the VQ). + +``VHOST_USER_GET_CONFIG`` + :id: 24 + :equivalent ioctl: N/A + :master payload: virtio device config space + :slave payload: virtio device config space + + When ``VHOST_USER_PROTOCOL_F_CONFIG`` is negotiated, this message is + submitted by the vhost-user master to fetch the contents of the + virtio device configuration space, vhost-user slave's payload size + MUST match master's request, vhost-user slave uses zero length of + payload to indicate an error to vhost-user master. The vhost-user + master may cache the contents to avoid repeated + ``VHOST_USER_GET_CONFIG`` calls. + +``VHOST_USER_SET_CONFIG`` + :id: 25 + :equivalent ioctl: N/A + :master payload: virtio device config space + :slave payload: N/A + + When ``VHOST_USER_PROTOCOL_F_CONFIG`` is negotiated, this message is + submitted by the vhost-user master when the Guest changes the virtio + device configuration space and also can be used for live migration + on the destination host. The vhost-user slave must check the flags + field, and slaves MUST NOT accept SET_CONFIG for read-only + configuration space fields unless the live migration bit is set. + +``VHOST_USER_CREATE_CRYPTO_SESSION`` + :id: 26 + :equivalent ioctl: N/A + :master payload: crypto session description + :slave payload: crypto session description + + Create a session for crypto operation. 
The server side must return + the session id, 0 or positive for success, negative for failure. + This request should be sent only when + ``VHOST_USER_PROTOCOL_F_CRYPTO_SESSION`` feature has been + successfully negotiated. It's a required feature for crypto + devices. + +``VHOST_USER_CLOSE_CRYPTO_SESSION`` + :id: 27 + :equivalent ioctl: N/A + :master payload: ``u64`` + + Close a session for crypto operation which was previously + created by ``VHOST_USER_CREATE_CRYPTO_SESSION``. + + This request should be sent only when + ``VHOST_USER_PROTOCOL_F_CRYPTO_SESSION`` feature has been + successfully negotiated. It's a required feature for crypto + devices. + +``VHOST_USER_POSTCOPY_ADVISE`` + :id: 28 + :master payload: N/A + :slave payload: userfault fd + + When ``VHOST_USER_PROTOCOL_F_PAGEFAULT`` is supported, the master + advises slave that a migration with postcopy enabled is underway, + the slave must open a userfaultfd for later use. Note that at this + stage the migration is still in precopy mode. + +``VHOST_USER_POSTCOPY_LISTEN`` + :id: 29 + :master payload: N/A + + Master advises slave that a transition to postcopy mode has + happened. The slave must ensure that shared memory is registered + with userfaultfd to cause faulting of non-present pages. + + This is always sent sometime after a ``VHOST_USER_POSTCOPY_ADVISE``, + and thus only when ``VHOST_USER_PROTOCOL_F_PAGEFAULT`` is supported. + +``VHOST_USER_POSTCOPY_END`` + :id: 30 + :slave payload: ``u64`` + + Master advises that postcopy migration has now completed. The slave + must disable the userfaultfd. The response is an acknowledgement + only. + + When ``VHOST_USER_PROTOCOL_F_PAGEFAULT`` is supported, this message + is sent at the end of the migration, after + ``VHOST_USER_POSTCOPY_LISTEN`` was previously sent. + + The value returned is an error indication; 0 is success. 
+ +``VHOST_USER_GET_INFLIGHT_FD`` + :id: 31 + :equivalent ioctl: N/A + :master payload: inflight description + + When ``VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD`` protocol feature has + been successfully negotiated, this message is submitted by master to + get a shared buffer from slave. The shared buffer will be used to + track inflight I/O by slave. QEMU should retrieve a new one when the VM + is reset. + +``VHOST_USER_SET_INFLIGHT_FD`` + :id: 32 + :equivalent ioctl: N/A + :master payload: inflight description + + When ``VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD`` protocol feature has + been successfully negotiated, this message is submitted by master to + send the shared inflight buffer back to slave so that slave could + get inflight I/O after a crash or restart. + +Slave message types +------------------- + +``VHOST_USER_SLAVE_IOTLB_MSG`` + :id: 1 + :equivalent ioctl: N/A (equivalent to ``VHOST_IOTLB_MSG`` message type) + :slave payload: ``struct vhost_iotlb_msg`` + :master payload: N/A + + Send IOTLB messages with ``struct vhost_iotlb_msg`` as payload. + Slave sends such requests to notify of an IOTLB miss, or an IOTLB + access failure. If ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is + negotiated, and slave set the ``VHOST_USER_NEED_REPLY`` flag, master + must respond with zero when operation is successfully completed, or + non-zero otherwise. This request should be sent only when + ``VIRTIO_F_IOMMU_PLATFORM`` feature has been successfully + negotiated. + +``VHOST_USER_SLAVE_CONFIG_CHANGE_MSG`` + :id: 2 + :equivalent ioctl: N/A + :slave payload: N/A + :master payload: N/A + + When ``VHOST_USER_PROTOCOL_F_CONFIG`` is negotiated, vhost-user + slave sends such messages to notify that the virtio device's + configuration space has changed, for those host devices which can + support such feature, host driver can send ``VHOST_USER_GET_CONFIG`` + message to slave to get the latest content. 
If + ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, and slave set the + ``VHOST_USER_NEED_REPLY`` flag, master must respond with zero when + operation is successfully completed, or non-zero otherwise. + +``VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG`` + :id: 3 + :equivalent ioctl: N/A + :slave payload: vring area description + :master payload: N/A + + Sets host notifier for a specified queue. The queue index is + contained in the ``u64`` field of the vring area description. The + host notifier is described by the file descriptor (typically it's a + VFIO device fd) which is passed as ancillary data and the size + (which is mmap size and should be the same as host page size) and + offset (which is mmap offset) carried in the vring area + description. QEMU can mmap the file descriptor based on the size and + offset to get a memory range. Registering a host notifier means + mapping this memory range to the VM as the specified queue's notify + MMIO region. Slave sends this request to tell QEMU to de-register + the existing notifier if any and register the new notifier if the + request is sent with a file descriptor. + + This request should be sent only when + ``VHOST_USER_PROTOCOL_F_HOST_NOTIFIER`` protocol feature has been + successfully negotiated. + +.. _reply_ack: + +VHOST_USER_PROTOCOL_F_REPLY_ACK +------------------------------- + +The original vhost-user specification only demands replies for certain +commands. This differs from the vhost protocol implementation where +commands are sent over an ``ioctl()`` call and block until the client +has completed. + +With this protocol extension negotiated, the sender (QEMU) can set the +``need_reply`` [Bit 3] flag to any command. This indicates that the +client MUST respond with a Payload ``VhostUserMsg`` indicating success +or failure. The payload should be set to zero on success or non-zero +on failure, unless the message already has an explicit reply body. 
+ +The response payload gives QEMU a deterministic indication of the result +of the command. Today, QEMU is expected to terminate the main vhost-user +loop upon receiving such errors. In future, QEMU could be taught to be more +resilient for selective requests. + +For the message types that already solicit a reply from the client, +the presence of ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` or need_reply bit +being set brings no behavioural change. (See the Communication_ +section for details.) + +.. _backend_conventions: + +Backend program conventions +=========================== + +vhost-user backends can provide various devices & services and may +need to be configured manually depending on the use case. However, it +is a good idea to follow the conventions listed here when +possible. Users, QEMU or libvirt, can then rely on some common +behaviour to avoid heterogeneous configuration and management of the +backend programs and facilitate interoperability. + +Each backend installed on a host system should come with at least one +JSON file that conforms to the vhost-user.json schema. Each file +informs the management applications about the backend type, and binary +location. In addition, it defines rules for management apps for +picking the highest priority backend when multiple match the search +criteria (see ``@VhostUserBackend`` documentation in the schema file). + +If the backend is not capable of enabling a requested feature on the +host (such as 3D acceleration with virgl), or the initialization +failed, the backend should fail to start early and exit with a status +!= 0. It may also print a message to stderr for further details. + +The backend program must not daemonize itself, but it may be +daemonized by the management layer. It may also have a restricted +access to the system. + +File descriptors 0, 1 and 2 will exist, and have regular +stdin/stdout/stderr usage (they may have been redirected to /dev/null +by the management layer, or to a log handler). 
+ +The backend program must end (as quickly and cleanly as possible) when +the SIGTERM signal is received. Eventually, it may receive SIGKILL by +the management layer after a few seconds. + +The following command line options have an expected behaviour. They +are mandatory, unless explicitly said differently: + +--socket-path=PATH + + This option specifies the location of the vhost-user Unix domain socket. + It is incompatible with --fd. + +--fd=FDNUM + + When this argument is given, the backend program is started with the + vhost-user socket as file descriptor FDNUM. It is incompatible with + --socket-path. + +--print-capabilities + + Output to stdout the backend capabilities in JSON format, and then + exit successfully. Other options and arguments should be ignored, and + the backend program should not perform its normal function. The + capabilities can be reported dynamically depending on the host + capabilities. + +The JSON output is described in the ``vhost-user.json`` schema, by +``@VHostUserBackendCapabilities``. Example: + +.. code:: json + + { + "type": "foo", + "features": [ + "feature-a", + "feature-b" + ] + } + +vhost-user-input +---------------- + +Command line options: + +--evdev-path=PATH + + Specify the Linux input device. + + (optional) + +--no-grab + + Do not request exclusive access to the input device. + + (optional) + +vhost-user-gpu +-------------- + +Command line options: + +--render-node=PATH + + Specify the GPU DRM render node. + + (optional) + +--virgl + + Enable virgl rendering support. + + (optional) diff --git a/docs/interop/vhost-user.txt b/docs/interop/vhost-user.txt deleted file mode 100644 index 4dbd530cb9..0000000000 --- a/docs/interop/vhost-user.txt +++ /dev/null @@ -1,1219 +0,0 @@ -Vhost-user Protocol -=================== - -Copyright (c) 2014 Virtual Open Systems Sarl. - -This work is licensed under the terms of the GNU GPL, version 2 or later. -See the COPYING file in the top-level directory. 
-=================== - -This protocol is aiming to complement the ioctl interface used to control the -vhost implementation in the Linux kernel. It implements the control plane needed -to establish virtqueue sharing with a user space process on the same host. It -uses communication over a Unix domain socket to share file descriptors in the -ancillary data of the message. - -The protocol defines 2 sides of the communication, master and slave. Master is -the application that shares its virtqueues, in our case QEMU. Slave is the -consumer of the virtqueues. - -In the current implementation QEMU is the Master, and the Slave is the -external process consuming the virtio queues, for example a software -Ethernet switch running in user space, such as Snabbswitch, or a block -device backend processing read & write to a virtual disk. In order to -facilitate interoperability between various backend implementations, -it is recommended to follow the "Backend program conventions" -described in this document. - -Master and slave can be either a client (i.e. connecting) or server (listening) -in the socket communication. - -Message Specification ---------------------- - -Note that all numbers are in the machine native byte order. A vhost-user message -consists of 3 header fields and a payload: - ------------------------------------- -| request | flags | size | payload | ------------------------------------- - - * Request: 32-bit type of the request - * Flags: 32-bit bit field: - - Lower 2 bits are the version (currently 0x01) - - Bit 2 is the reply flag - needs to be sent on each reply from the slave - - Bit 3 is the need_reply flag - see VHOST_USER_PROTOCOL_F_REPLY_ACK for - details. 
- * Size - 32-bit size of the payload - - -Depending on the request type, payload can be: - - * A single 64-bit integer - ------- - | u64 | - ------- - - u64: a 64-bit unsigned integer - - * A vring state description - --------------- - | index | num | - --------------- - - Index: a 32-bit index - Num: a 32-bit number - - * A vring address description - -------------------------------------------------------------- - | index | flags | size | descriptor | used | available | log | - -------------------------------------------------------------- - - Index: a 32-bit vring index - Flags: a 32-bit vring flags - Descriptor: a 64-bit ring address of the vring descriptor table - Used: a 64-bit ring address of the vring used ring - Available: a 64-bit ring address of the vring available ring - Log: a 64-bit guest address for logging - - Note that a ring address is an IOVA if VIRTIO_F_IOMMU_PLATFORM has been - negotiated. Otherwise it is a user address. - - * Memory regions description - --------------------------------------------------- - | num regions | padding | region0 | ... | region7 | - --------------------------------------------------- - - Num regions: a 32-bit number of regions - Padding: 32-bit - - A region is: - ----------------------------------------------------- - | guest address | size | user address | mmap offset | - ----------------------------------------------------- - - Guest address: a 64-bit guest address of the region - Size: a 64-bit size - User address: a 64-bit user address - mmap offset: 64-bit offset where region starts in the mapped memory - -* Log description - --------------------------- - | log size | log offset | - --------------------------- - log size: size of area used for logging - log offset: offset from start of supplied file descriptor - where logging starts (i.e. 
where guest address 0 would be logged) - - * An IOTLB message - --------------------------------------------------------- - | iova | size | user address | permissions flags | type | - --------------------------------------------------------- - - IOVA: a 64-bit I/O virtual address programmed by the guest - Size: a 64-bit size - User address: a 64-bit user address - Permissions: an 8-bit value: - - 0: No access - - 1: Read access - - 2: Write access - - 3: Read/Write access - Type: an 8-bit IOTLB message type: - - 1: IOTLB miss - - 2: IOTLB update - - 3: IOTLB invalidate - - 4: IOTLB access fail - - * Virtio device config space - ----------------------------------- - | offset | size | flags | payload | - ----------------------------------- - - Offset: a 32-bit offset of virtio device's configuration space - Size: a 32-bit configuration space access size in bytes - Flags: a 32-bit value: - - 0: Vhost master messages used for writeable fields - - 1: Vhost master messages used for live migration - Payload: Size bytes array holding the contents of the virtio - device's configuration space - - * Vring area description - ----------------------- - | u64 | size | offset | - ----------------------- - - u64: a 64-bit integer contains vring index and flags - Size: a 64-bit size of this area - Offset: a 64-bit offset of this area from the start of the - supplied file descriptor - - * Inflight description - ----------------------------------------------------- - | mmap size | mmap offset | num queues | queue size | - ----------------------------------------------------- - - mmap size: a 64-bit size of area to track inflight I/O - mmap offset: a 64-bit offset of this area from the start - of the supplied file descriptor - num queues: a 16-bit number of virtqueues - queue size: a 16-bit size of virtqueues - -In QEMU the vhost-user message is implemented with the following struct: - -typedef struct VhostUserMsg { - VhostUserRequest request; - uint32_t flags; - uint32_t size; - union 
{ - uint64_t u64; - struct vhost_vring_state state; - struct vhost_vring_addr addr; - VhostUserMemory memory; - VhostUserLog log; - struct vhost_iotlb_msg iotlb; - VhostUserConfig config; - VhostUserVringArea area; - VhostUserInflight inflight; - }; -} QEMU_PACKED VhostUserMsg; - -Communication -------------- - -The protocol for vhost-user is based on the existing implementation of vhost -for the Linux Kernel. Most messages that can be sent via the Unix domain socket -implementing vhost-user have an equivalent ioctl to the kernel implementation. - -The communication consists of master sending message requests and slave sending -message replies. Most of the requests don't require replies. Here is a list of -the ones that do: - - * VHOST_USER_GET_FEATURES - * VHOST_USER_GET_PROTOCOL_FEATURES - * VHOST_USER_GET_VRING_BASE - * VHOST_USER_SET_LOG_BASE (if VHOST_USER_PROTOCOL_F_LOG_SHMFD) - * VHOST_USER_GET_INFLIGHT_FD (if VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD) - -[ Also see the section on REPLY_ACK protocol extension. ] - -There are several messages that the master sends with file descriptors passed -in the ancillary data: - - * VHOST_USER_SET_MEM_TABLE - * VHOST_USER_SET_LOG_BASE (if VHOST_USER_PROTOCOL_F_LOG_SHMFD) - * VHOST_USER_SET_LOG_FD - * VHOST_USER_SET_VRING_KICK - * VHOST_USER_SET_VRING_CALL - * VHOST_USER_SET_VRING_ERR - * VHOST_USER_SET_SLAVE_REQ_FD - * VHOST_USER_SET_INFLIGHT_FD (if VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD) - -If Master is unable to send the full message or receives a wrong reply it will -close the connection. An optional reconnection mechanism can be implemented. - -Any protocol extensions are gated by protocol feature bits, -which allows full backwards compatibility on both master -and slave. 
-As older slaves don't support negotiating protocol features, -a feature bit was dedicated for this purpose: -#define VHOST_USER_F_PROTOCOL_FEATURES 30 - -Starting and stopping rings ----------------------- -Client must only process each ring when it is started. - -Client must only pass data between the ring and the -backend, when the ring is enabled. - -If ring is started but disabled, client must process the -ring without talking to the backend. - -For example, for a networking device, in the disabled state -client must not supply any new RX packets, but must process -and discard any TX packets. - -If VHOST_USER_F_PROTOCOL_FEATURES has not been negotiated, the ring is initialized -in an enabled state. - -If VHOST_USER_F_PROTOCOL_FEATURES has been negotiated, the ring is initialized -in a disabled state. Client must not pass data to/from the backend until ring is enabled by -VHOST_USER_SET_VRING_ENABLE with parameter 1, or after it has been disabled by -VHOST_USER_SET_VRING_ENABLE with parameter 0. - -Each ring is initialized in a stopped state, client must not process it until -ring is started, or after it has been stopped. - -Client must start ring upon receiving a kick (that is, detecting that file -descriptor is readable) on the descriptor specified by -VHOST_USER_SET_VRING_KICK, and stop ring upon receiving -VHOST_USER_GET_VRING_BASE. - -While processing the rings (whether they are enabled or not), client must -support changing some configuration aspects on the fly. - -Multiple queue support ----------------------- - -Multiple queue is treated as a protocol extension, hence the slave has to -implement protocol features first. The multiple queues feature is supported -only when the protocol feature VHOST_USER_PROTOCOL_F_MQ (bit 0) is set. - -The max number of queue pairs the slave supports can be queried with message -VHOST_USER_GET_QUEUE_NUM. Master should stop when the number of -requested queues is bigger than that. 
- -As all queues share one connection, the master uses a unique index for each -queue in the sent message to identify a specified queue. One queue pair -is enabled initially. More queues are enabled dynamically, by sending -message VHOST_USER_SET_VRING_ENABLE. - -Migration ---------- - -During live migration, the master may need to track the modifications -the slave makes to the memory mapped regions. The client should mark -the dirty pages in a log. Once it complies to this logging, it may -declare the VHOST_F_LOG_ALL vhost feature. - -To start/stop logging of data/used ring writes, server may send messages -VHOST_USER_SET_FEATURES with VHOST_F_LOG_ALL and VHOST_USER_SET_VRING_ADDR with -VHOST_VRING_F_LOG in ring's flags set to 1/0, respectively. - -All the modifications to memory pointed by vring "descriptor" should -be marked. Modifications to "used" vring should be marked if -VHOST_VRING_F_LOG is part of ring's flags. - -Dirty pages are of size: -#define VHOST_LOG_PAGE 0x1000 - -The log memory fd is provided in the ancillary data of -VHOST_USER_SET_LOG_BASE message when the slave has -VHOST_USER_PROTOCOL_F_LOG_SHMFD protocol feature. - -The size of the log is supplied as part of VhostUserMsg -which should be large enough to cover all known guest -addresses. Log starts at the supplied offset in the -supplied file descriptor. -The log covers from address 0 to the maximum of guest -regions. In pseudo-code, to mark page at "addr" as dirty: - -page = addr / VHOST_LOG_PAGE -log[page / 8] |= 1 << page % 8 - -Where addr is the guest physical address. - -Use atomic operations, as the log may be concurrently manipulated. - -Note that when logging modifications to the used ring (when VHOST_VRING_F_LOG -is set for this ring), log_guest_addr should be used to calculate the log -offset: the write to first byte of the used ring is logged at this offset from -log start. Also note that this value might be outside the legal guest physical -address range (i.e. 
does not have to be covered by the VhostUserMemory table), -but the bit offset of the last byte of the ring must fall within -the size supplied by VhostUserLog. - -VHOST_USER_SET_LOG_FD is an optional message with an eventfd in -ancillary data, it may be used to inform the master that the log has -been modified. - -Once the source has finished migration, rings will be stopped by -the source. No further update must be done before rings are -restarted. - -In postcopy migration the slave is started before all the memory has been -received from the source host, and care must be taken to avoid accessing pages -that have yet to be received. The slave opens a 'userfault'-fd and registers -the memory with it; this fd is then passed back over to the master. -The master services requests on the userfaultfd for pages that are accessed -and when the page is available it performs WAKE ioctl's on the userfaultfd -to wake the stalled slave. The client indicates support for this via the -VHOST_USER_PROTOCOL_F_PAGEFAULT feature. - -Memory access -------------- - -The master sends a list of vhost memory regions to the slave using the -VHOST_USER_SET_MEM_TABLE message. Each region has two base addresses: a guest -address and a user address. - -Messages contain guest addresses and/or user addresses to reference locations -within the shared memory. The mapping of these addresses works as follows. - -User addresses map to the vhost memory region containing that user address. - -When the VIRTIO_F_IOMMU_PLATFORM feature has not been negotiated: - - * Guest addresses map to the vhost memory region containing that guest - address. - -When the VIRTIO_F_IOMMU_PLATFORM feature has been negotiated: - - * Guest addresses are also called I/O virtual addresses (IOVAs). They are - translated to user addresses via the IOTLB. - - * The vhost memory region guest address is not used. 
- -IOMMU support -------------- - -When the VIRTIO_F_IOMMU_PLATFORM feature has been negotiated, the master -sends IOTLB entries update & invalidation by sending VHOST_USER_IOTLB_MSG -requests to the slave with a struct vhost_iotlb_msg as payload. For update -events, the iotlb payload has to be filled with the update message type (2), -the I/O virtual address, the size, the user virtual address, and the -permissions flags. Addresses and size must be within vhost memory regions set -via the VHOST_USER_SET_MEM_TABLE request. For invalidation events, the iotlb -payload has to be filled with the invalidation message type (3), the I/O virtual -address and the size. On success, the slave is expected to reply with a zero -payload, non-zero otherwise. - -The slave relies on the slave communcation channel (see "Slave communication" -section below) to send IOTLB miss and access failure events, by sending -VHOST_USER_SLAVE_IOTLB_MSG requests to the master with a struct vhost_iotlb_msg -as payload. For miss events, the iotlb payload has to be filled with the miss -message type (1), the I/O virtual address and the permissions flags. For access -failure event, the iotlb payload has to be filled with the access failure -message type (4), the I/O virtual address and the permissions flags. -For synchronization purpose, the slave may rely on the reply-ack feature, -so the master may send a reply when operation is completed if the reply-ack -feature is negotiated and slaves requests a reply. For miss events, completed -operation means either master sent an update message containing the IOTLB entry -containing requested address and permission, or master sent nothing if the IOTLB -miss message is invalid (invalid IOVA or permission). - -The master isn't expected to take the initiative to send IOTLB update messages, -as the slave sends IOTLB miss messages for the guest virtual memory areas it -needs to access. 
- -Slave communication -------------------- - -An optional communication channel is provided if the slave declares -VHOST_USER_PROTOCOL_F_SLAVE_REQ protocol feature, to allow the slave to make -requests to the master. - -The fd is provided via VHOST_USER_SET_SLAVE_REQ_FD ancillary data. - -A slave may then send VHOST_USER_SLAVE_* messages to the master -using this fd communication channel. - -If VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD protocol feature is negotiated, -slave can send file descriptors (at most 8 descriptors in each message) -to master via ancillary data using this fd communication channel. - -Inflight I/O tracking ---------------------- - -To support reconnecting after restart or crash, slave may need to resubmit -inflight I/Os. If virtqueue is processed in order, we can easily achieve -that by getting the inflight descriptors from descriptor table (split virtqueue) -or descriptor ring (packed virtqueue). However, it can't work when we process -descriptors out-of-order because some entries which store the information of -inflight descriptors in available ring (split virtqueue) or descriptor -ring (packed virtqueue) might be overwritten by new entries. To solve this -problem, slave needs to allocate an extra buffer to store this information of inflight -descriptors and share it with master for persistence. VHOST_USER_GET_INFLIGHT_FD and -VHOST_USER_SET_INFLIGHT_FD are used to transfer this buffer between master -and slave. And the format of this buffer is described below: - -------------------------------------------------------- -| queue0 region | queue1 region | ... | queueN region | -------------------------------------------------------- - -N is the number of available virtqueues. Slave could get it from num queues -field of VhostUserInflight. - -For split virtqueue, queue region can be implemented as: - -typedef struct DescStateSplit { - /* Indicate whether this descriptor is inflight or not. - * Only available for head-descriptor. 
*/ - uint8_t inflight; - - /* Padding */ - uint8_t padding[5]; - - /* Maintain a list for the last batch of used descriptors. - * Only available when batching is used for submitting */ - uint16_t next; - - /* Used to preserve the order of fetching available descriptors. - * Only available for head-descriptor. */ - uint64_t counter; -} DescStateSplit; - -typedef struct QueueRegionSplit { - /* The feature flags of this region. Now it's initialized to 0. */ - uint64_t features; - - /* The version of this region. It's 1 currently. - * Zero value indicates an uninitialized buffer */ - uint16_t version; - - /* The size of DescStateSplit array. It's equal to the virtqueue - * size. Slave could get it from queue size field of VhostUserInflight. */ - uint16_t desc_num; - - /* The head of list that track the last batch of used descriptors. */ - uint16_t last_batch_head; - - /* Store the idx value of used ring */ - uint16_t used_idx; - - /* Used to track the state of each descriptor in descriptor table */ - DescStateSplit desc[0]; -} QueueRegionSplit; - -To track inflight I/O, the queue region should be processed as follows: - -When receiving available buffers from the driver: - - 1. Get the next available head-descriptor index from available ring, i - - 2. Set desc[i].counter to the value of global counter - - 3. Increase global counter by 1 - - 4. Set desc[i].inflight to 1 - -When supplying used buffers to the driver: - - 1. Get corresponding used head-descriptor index, i - - 2. Set desc[i].next to last_batch_head - - 3. Set last_batch_head to i - - 4. Steps 1,2,3 may be performed repeatedly if batching is possible - - 5. Increase the idx value of used ring by the size of the batch - - 6. Set the inflight field of each DescStateSplit entry in the batch to 0 - - 7. Set used_idx to the idx value of used ring - -When reconnecting: - - 1. 
If the value of used_idx does not match the idx value of used ring (means - the inflight field of DescStateSplit entries in last batch may be incorrect), - - (a) Subtract the value of used_idx from the idx value of used ring to get - last batch size of DescStateSplit entries - - (b) Set the inflight field of each DescStateSplit entry to 0 in last batch - list which starts from last_batch_head - - (c) Set used_idx to the idx value of used ring - - 2. Resubmit inflight DescStateSplit entries in order of their counter value - -For packed virtqueue, queue region can be implemented as: - -typedef struct DescStatePacked { - /* Indicate whether this descriptor is inflight or not. - * Only available for head-descriptor. */ - uint8_t inflight; - - /* Padding */ - uint8_t padding; - - /* Link to the next free entry */ - uint16_t next; - - /* Link to the last entry of descriptor list. - * Only available for head-descriptor. */ - uint16_t last; - - /* The length of descriptor list. - * Only available for head-descriptor. */ - uint16_t num; - - /* Used to preserve the order of fetching available descriptors. - * Only available for head-descriptor. */ - uint64_t counter; - - /* The buffer id */ - uint16_t id; - - /* The descriptor flags */ - uint16_t flags; - - /* The buffer length */ - uint32_t len; - - /* The buffer address */ - uint64_t addr; -} DescStatePacked; - -typedef struct QueueRegionPacked { - /* The feature flags of this region. Now it's initialized to 0. */ - uint64_t features; - - /* The version of this region. It's 1 currently. - * Zero value indicates an uninitialized buffer */ - uint16_t version; - - /* The size of DescStatePacked array. It's equal to the virtqueue - * size. Slave could get it from queue size field of VhostUserInflight. 
*/ - uint16_t desc_num; - - /* The head of free DescStatePacked entry list */ - uint16_t free_head; - - /* The old head of free DescStatePacked entry list */ - uint16_t old_free_head; - - /* The used index of descriptor ring */ - uint16_t used_idx; - - /* The old used index of descriptor ring */ - uint16_t old_used_idx; - - /* Device ring wrap counter */ - uint8_t used_wrap_counter; - - /* The old device ring wrap counter */ - uint8_t old_used_wrap_counter; - - /* Padding */ - uint8_t padding[7]; - - /* Used to track the state of each descriptor fetched from descriptor ring */ - DescStatePacked desc[0]; -} QueueRegionPacked; - -To track inflight I/O, the queue region should be processed as follows: - -When receiving available buffers from the driver: - - 1. Get the next available descriptor entry from descriptor ring, d - - 2. If d is head descriptor, - - (a) Set desc[old_free_head].num to 0 - - (b) Set desc[old_free_head].counter to the value of global counter - - (c) Increase global counter by 1 - - (d) Set desc[old_free_head].inflight to 1 - - 3. If d is last descriptor, set desc[old_free_head].last to free_head - - 4. Increase desc[old_free_head].num by 1 - - 5. Set desc[free_head].addr, desc[free_head].len, desc[free_head].flags, - desc[free_head].id to d.addr, d.len, d.flags, d.id - - 6. Set free_head to desc[free_head].next - - 7. If d is last descriptor, set old_free_head to free_head - -When supplying used buffers to the driver: - - 1. Get corresponding used head-descriptor entry from descriptor ring, d - - 2. Get corresponding DescStatePacked entry, e - - 3. Set desc[e.last].next to free_head - - 4. Set free_head to the index of e - - 5. Steps 1,2,3,4 may be performed repeatedly if batching is possible - - 6. Increase used_idx by the size of the batch and update used_wrap_counter if needed - - 7. Update d.flags - - 8. Set the inflight field of each head DescStatePacked entry in the batch to 0 - - 9. 
Set old_free_head, old_used_idx, old_used_wrap_counter to free_head, used_idx, - used_wrap_counter - -When reconnecting: - - 1. If used_idx does not match old_used_idx (means the inflight field of DescStatePacked - entries in last batch may be incorrect), - - (a) Get the next descriptor ring entry through old_used_idx, d - - (b) Use old_used_wrap_counter to calculate the available flags - - (c) If d.flags is not equal to the calculated flags value (means slave has - submitted the buffer to guest driver before crash, so it has to commit the - in-progres update), set old_free_head, old_used_idx, old_used_wrap_counter - to free_head, used_idx, used_wrap_counter - - 2. Set free_head, used_idx, used_wrap_counter to old_free_head, old_used_idx, - old_used_wrap_counter (roll back any in-progress update) - - 3. Set the inflight field of each DescStatePacked entry in free list to 0 - - 4. Resubmit inflight DescStatePacked entries in order of their counter value - -Protocol features ------------------ - -#define VHOST_USER_PROTOCOL_F_MQ 0 -#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 -#define VHOST_USER_PROTOCOL_F_RARP 2 -#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 -#define VHOST_USER_PROTOCOL_F_MTU 4 -#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5 -#define VHOST_USER_PROTOCOL_F_CROSS_ENDIAN 6 -#define VHOST_USER_PROTOCOL_F_CRYPTO_SESSION 7 -#define VHOST_USER_PROTOCOL_F_PAGEFAULT 8 -#define VHOST_USER_PROTOCOL_F_CONFIG 9 -#define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10 -#define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11 -#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12 - -Master message types --------------------- - - * VHOST_USER_GET_FEATURES - - Id: 1 - Equivalent ioctl: VHOST_GET_FEATURES - Master payload: N/A - Slave payload: u64 - - Get from the underlying vhost implementation the features bitmask. - Feature bit VHOST_USER_F_PROTOCOL_FEATURES signals slave support for - VHOST_USER_GET_PROTOCOL_FEATURES and VHOST_USER_SET_PROTOCOL_FEATURES. 
- - * VHOST_USER_SET_FEATURES - - Id: 2 - Ioctl: VHOST_SET_FEATURES - Master payload: u64 - - Enable features in the underlying vhost implementation using a bitmask. - Feature bit VHOST_USER_F_PROTOCOL_FEATURES signals slave support for - VHOST_USER_GET_PROTOCOL_FEATURES and VHOST_USER_SET_PROTOCOL_FEATURES. - - * VHOST_USER_GET_PROTOCOL_FEATURES - - Id: 15 - Equivalent ioctl: VHOST_GET_FEATURES - Master payload: N/A - Slave payload: u64 - - Get the protocol feature bitmask from the underlying vhost implementation. - Only legal if feature bit VHOST_USER_F_PROTOCOL_FEATURES is present in - VHOST_USER_GET_FEATURES. - Note: slave that reported VHOST_USER_F_PROTOCOL_FEATURES must support - this message even before VHOST_USER_SET_FEATURES was called. - - * VHOST_USER_SET_PROTOCOL_FEATURES - - Id: 16 - Ioctl: VHOST_SET_FEATURES - Master payload: u64 - - Enable protocol features in the underlying vhost implementation. - Only legal if feature bit VHOST_USER_F_PROTOCOL_FEATURES is present in - VHOST_USER_GET_FEATURES. - Note: slave that reported VHOST_USER_F_PROTOCOL_FEATURES must support - this message even before VHOST_USER_SET_FEATURES was called. - - * VHOST_USER_SET_OWNER - - Id: 3 - Equivalent ioctl: VHOST_SET_OWNER - Master payload: N/A - - Issued when a new connection is established. It sets the current Master - as an owner of the session. This can be used on the Slave as a - "session start" flag. - - * VHOST_USER_RESET_OWNER - - Id: 4 - Master payload: N/A - - This is no longer used. Used to be sent to request disabling - all rings, but some clients interpreted it to also discard - connection state (this interpretation would lead to bugs). - It is recommended that clients either ignore this message, - or use it to disable all rings. 
- - * VHOST_USER_SET_MEM_TABLE - - Id: 5 - Equivalent ioctl: VHOST_SET_MEM_TABLE - Master payload: memory regions description - Slave payload: (postcopy only) memory regions description - - Sets the memory map regions on the slave so it can translate the vring - addresses. In the ancillary data there is an array of file descriptors - for each memory mapped region. The size and ordering of the fds matches - the number and ordering of memory regions. - - When VHOST_USER_POSTCOPY_LISTEN has been received, SET_MEM_TABLE replies with - the bases of the memory mapped regions to the master. The slave must - have mmap'd the regions but not yet accessed them and should not yet generate - a userfault event. Note NEED_REPLY_MASK is not set in this case. - QEMU will then reply back to the list of mappings with an empty - VHOST_USER_SET_MEM_TABLE as an acknowledgment; only upon reception of this - message may the guest start accessing the memory and generating faults. - - * VHOST_USER_SET_LOG_BASE - - Id: 6 - Equivalent ioctl: VHOST_SET_LOG_BASE - Master payload: u64 - Slave payload: N/A - - Sets logging shared memory space. - When slave has VHOST_USER_PROTOCOL_F_LOG_SHMFD protocol - feature, the log memory fd is provided in the ancillary data of - VHOST_USER_SET_LOG_BASE message, the size and offset of shared - memory area provided in the message. - - - * VHOST_USER_SET_LOG_FD - - Id: 7 - Equivalent ioctl: VHOST_SET_LOG_FD - Master payload: N/A - - Sets the logging file descriptor, which is passed as ancillary data. - - * VHOST_USER_SET_VRING_NUM - - Id: 8 - Equivalent ioctl: VHOST_SET_VRING_NUM - Master payload: vring state description - - Set the size of the queue. - - * VHOST_USER_SET_VRING_ADDR - - Id: 9 - Equivalent ioctl: VHOST_SET_VRING_ADDR - Master payload: vring address description - Slave payload: N/A - - Sets the addresses of the different aspects of the vring. 
- - * VHOST_USER_SET_VRING_BASE - - Id: 10 - Equivalent ioctl: VHOST_SET_VRING_BASE - Master payload: vring state description - - Sets the base offset in the available vring. - - * VHOST_USER_GET_VRING_BASE - - Id: 11 - Equivalent ioctl: VHOST_USER_GET_VRING_BASE - Master payload: vring state description - Slave payload: vring state description - - Get the available vring base offset. - - * VHOST_USER_SET_VRING_KICK - - Id: 12 - Equivalent ioctl: VHOST_SET_VRING_KICK - Master payload: u64 - - Set the event file descriptor for adding buffers to the vring. It - is passed in the ancillary data. - Bits (0-7) of the payload contain the vring index. Bit 8 is the - invalid FD flag. This flag is set when there is no file descriptor - in the ancillary data. This signals that polling should be used - instead of waiting for a kick. - - * VHOST_USER_SET_VRING_CALL - - Id: 13 - Equivalent ioctl: VHOST_SET_VRING_CALL - Master payload: u64 - - Set the event file descriptor to signal when buffers are used. It - is passed in the ancillary data. - Bits (0-7) of the payload contain the vring index. Bit 8 is the - invalid FD flag. This flag is set when there is no file descriptor - in the ancillary data. This signals that polling will be used - instead of waiting for the call. - - * VHOST_USER_SET_VRING_ERR - - Id: 14 - Equivalent ioctl: VHOST_SET_VRING_ERR - Master payload: u64 - - Set the event file descriptor to signal when error occurs. It - is passed in the ancillary data. - Bits (0-7) of the payload contain the vring index. Bit 8 is the - invalid FD flag. This flag is set when there is no file descriptor - in the ancillary data. - - * VHOST_USER_GET_QUEUE_NUM - - Id: 17 - Equivalent ioctl: N/A - Master payload: N/A - Slave payload: u64 - - Query how many queues the backend supports. This request should be - sent only when VHOST_USER_PROTOCOL_F_MQ is set in queried protocol - features by VHOST_USER_GET_PROTOCOL_FEATURES. 
- - * VHOST_USER_SET_VRING_ENABLE - - Id: 18 - Equivalent ioctl: N/A - Master payload: vring state description - - Signal slave to enable or disable corresponding vring. - This request should be sent only when VHOST_USER_F_PROTOCOL_FEATURES - has been negotiated. - - * VHOST_USER_SEND_RARP - - Id: 19 - Equivalent ioctl: N/A - Master payload: u64 - - Ask vhost user backend to broadcast a fake RARP to notify the migration - is terminated for guest that does not support GUEST_ANNOUNCE. - Only legal if feature bit VHOST_USER_F_PROTOCOL_FEATURES is present in - VHOST_USER_GET_FEATURES and protocol feature bit VHOST_USER_PROTOCOL_F_RARP - is present in VHOST_USER_GET_PROTOCOL_FEATURES. - The first 6 bytes of the payload contain the mac address of the guest to - allow the vhost user backend to construct and broadcast the fake RARP. - - * VHOST_USER_NET_SET_MTU - - Id: 20 - Equivalent ioctl: N/A - Master payload: u64 - - Set host MTU value exposed to the guest. - This request should be sent only when VIRTIO_NET_F_MTU feature has been - successfully negotiated, VHOST_USER_F_PROTOCOL_FEATURES is present in - VHOST_USER_GET_FEATURES and protocol feature bit - VHOST_USER_PROTOCOL_F_NET_MTU is present in - VHOST_USER_GET_PROTOCOL_FEATURES. - If VHOST_USER_PROTOCOL_F_REPLY_ACK is negotiated, slave must respond - with zero in case the specified MTU is valid, or non-zero otherwise. - - * VHOST_USER_SET_SLAVE_REQ_FD - - Id: 21 - Equivalent ioctl: N/A - Master payload: N/A - - Set the socket file descriptor for slave initiated requests. It is passed - in the ancillary data. - This request should be sent only when VHOST_USER_F_PROTOCOL_FEATURES - has been negotiated, and protocol feature bit VHOST_USER_PROTOCOL_F_SLAVE_REQ - bit is present in VHOST_USER_GET_PROTOCOL_FEATURES. - If VHOST_USER_PROTOCOL_F_REPLY_ACK is negotiated, slave must respond - with zero for success, non-zero otherwise. 
- - * VHOST_USER_IOTLB_MSG - - Id: 22 - Equivalent ioctl: N/A (equivalent to VHOST_IOTLB_MSG message type) - Master payload: struct vhost_iotlb_msg - Slave payload: u64 - - Send IOTLB messages with struct vhost_iotlb_msg as payload. - Master sends such requests to update and invalidate entries in the device - IOTLB. The slave has to acknowledge the request by sending zero as u64 - payload for success, non-zero otherwise. - This request should be sent only when VIRTIO_F_IOMMU_PLATFORM feature - has been successfully negotiated. - - * VHOST_USER_SET_VRING_ENDIAN - - Id: 23 - Equivalent ioctl: VHOST_SET_VRING_ENDIAN - Master payload: vring state description - - Set the endianness of a VQ for legacy devices. Little-endian is indicated - with state.num set to 0 and big-endian is indicated with state.num set - to 1. Other values are invalid. - This request should be sent only when VHOST_USER_PROTOCOL_F_CROSS_ENDIAN - has been negotiated. - Backends that negotiated this feature should handle both endiannesses - and expect this message once (per VQ) during device configuration - (ie. before the master starts the VQ). - - * VHOST_USER_GET_CONFIG - - Id: 24 - Equivalent ioctl: N/A - Master payload: virtio device config space - Slave payload: virtio device config space - - When VHOST_USER_PROTOCOL_F_CONFIG is negotiated, this message is - submitted by the vhost-user master to fetch the contents of the virtio - device configuration space, vhost-user slave's payload size MUST match - master's request, vhost-user slave uses zero length of payload to - indicate an error to vhost-user master. The vhost-user master may - cache the contents to avoid repeated VHOST_USER_GET_CONFIG calls. 
- -* VHOST_USER_SET_CONFIG - - Id: 25 - Equivalent ioctl: N/A - Master payload: virtio device config space - Slave payload: N/A - - When VHOST_USER_PROTOCOL_F_CONFIG is negotiated, this message is - submitted by the vhost-user master when the Guest changes the virtio - device configuration space and also can be used for live migration - on the destination host. The vhost-user slave must check the flags - field, and slaves MUST NOT accept SET_CONFIG for read-only - configuration space fields unless the live migration bit is set. - -* VHOST_USER_CREATE_CRYPTO_SESSION - - Id: 26 - Equivalent ioctl: N/A - Master payload: crypto session description - Slave payload: crypto session description - - Create a session for crypto operation. The server side must return the - session id, 0 or positive for success, negative for failure. - This request should be sent only when VHOST_USER_PROTOCOL_F_CRYPTO_SESSION - feature has been successfully negotiated. - It's a required feature for crypto devices. - -* VHOST_USER_CLOSE_CRYPTO_SESSION - - Id: 27 - Equivalent ioctl: N/A - Master payload: u64 - - Close a session for crypto operation which was previously - created by VHOST_USER_CREATE_CRYPTO_SESSION. - This request should be sent only when VHOST_USER_PROTOCOL_F_CRYPTO_SESSION - feature has been successfully negotiated. - It's a required feature for crypto devices. - - * VHOST_USER_POSTCOPY_ADVISE - Id: 28 - Master payload: N/A - Slave payload: userfault fd - - When VHOST_USER_PROTOCOL_F_PAGEFAULT is supported, the - master advises slave that a migration with postcopy enabled is underway, - the slave must open a userfaultfd for later use. - Note that at this stage the migration is still in precopy mode. - - * VHOST_USER_POSTCOPY_LISTEN - Id: 29 - Master payload: N/A - - Master advises slave that a transition to postcopy mode has happened. - The slave must ensure that shared memory is registered with userfaultfd - to cause faulting of non-present pages. 
- - This is always sent sometime after a VHOST_USER_POSTCOPY_ADVISE, and - thus only when VHOST_USER_PROTOCOL_F_PAGEFAULT is supported. - - * VHOST_USER_POSTCOPY_END - Id: 30 - Slave payload: u64 - - Master advises that postcopy migration has now completed. The - slave must disable the userfaultfd. The response is an acknowledgement - only. - When VHOST_USER_PROTOCOL_F_PAGEFAULT is supported, this message - is sent at the end of the migration, after VHOST_USER_POSTCOPY_LISTEN - was previously sent. - The value returned is an error indication; 0 is success. - - * VHOST_USER_GET_INFLIGHT_FD - Id: 31 - Equivalent ioctl: N/A - Master payload: inflight description - - When VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD protocol feature has been - successfully negotiated, this message is submitted by master to get - a shared buffer from slave. The shared buffer will be used to track - inflight I/O by slave. QEMU should retrieve a new one when vm reset. - - * VHOST_USER_SET_INFLIGHT_FD - Id: 32 - Equivalent ioctl: N/A - Master payload: inflight description - - When VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD protocol feature has been - successfully negotiated, this message is submitted by master to send - the shared inflight buffer back to slave so that slave could get - inflight I/O after a crash or restart. - -Slave message types -------------------- - - * VHOST_USER_SLAVE_IOTLB_MSG - - Id: 1 - Equivalent ioctl: N/A (equivalent to VHOST_IOTLB_MSG message type) - Slave payload: struct vhost_iotlb_msg - Master payload: N/A - - Send IOTLB messages with struct vhost_iotlb_msg as payload. - Slave sends such requests to notify of an IOTLB miss, or an IOTLB - access failure. If VHOST_USER_PROTOCOL_F_REPLY_ACK is negotiated, - and slave set the VHOST_USER_NEED_REPLY flag, master must respond with - zero when operation is successfully completed, or non-zero otherwise. - This request should be sent only when VIRTIO_F_IOMMU_PLATFORM feature - has been successfully negotiated. 
- -* VHOST_USER_SLAVE_CONFIG_CHANGE_MSG - - Id: 2 - Equivalent ioctl: N/A - Slave payload: N/A - Master payload: N/A - - When VHOST_USER_PROTOCOL_F_CONFIG is negotiated, vhost-user slave sends - such messages to notify that the virtio device's configuration space has - changed, for those host devices which can support such feature, host - driver can send VHOST_USER_GET_CONFIG message to slave to get the latest - content. If VHOST_USER_PROTOCOL_F_REPLY_ACK is negotiated, and slave set - the VHOST_USER_NEED_REPLY flag, master must respond with zero when - operation is successfully completed, or non-zero otherwise. - - * VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG - - Id: 3 - Equivalent ioctl: N/A - Slave payload: vring area description - Master payload: N/A - - Sets host notifier for a specified queue. The queue index is contained - in the u64 field of the vring area description. The host notifier is - described by the file descriptor (typically it's a VFIO device fd) which - is passed as ancillary data and the size (which is mmap size and should - be the same as host page size) and offset (which is mmap offset) carried - in the vring area description. QEMU can mmap the file descriptor based - on the size and offset to get a memory range. Registering a host notifier - means mapping this memory range to the VM as the specified queue's notify - MMIO region. Slave sends this request to tell QEMU to de-register the - existing notifier if any and register the new notifier if the request is - sent with a file descriptor. - This request should be sent only when VHOST_USER_PROTOCOL_F_HOST_NOTIFIER - protocol feature has been successfully negotiated. - -VHOST_USER_PROTOCOL_F_REPLY_ACK: -------------------------------- -The original vhost-user specification only demands replies for certain -commands. This differs from the vhost protocol implementation where commands -are sent over an ioctl() call and block until the client has completed. 
- -With this protocol extension negotiated, the sender (QEMU) can set the -"need_reply" [Bit 3] flag to any command. This indicates that -the client MUST respond with a Payload VhostUserMsg indicating success or -failure. The payload should be set to zero on success or non-zero on failure, -unless the message already has an explicit reply body. - -The response payload gives QEMU a deterministic indication of the result -of the command. Today, QEMU is expected to terminate the main vhost-user -loop upon receiving such errors. In future, qemu could be taught to be more -resilient for selective requests. - -For the message types that already solicit a reply from the client, the -presence of VHOST_USER_PROTOCOL_F_REPLY_ACK or need_reply bit being set brings -no behavioural change. (See the 'Communication' section for details.) - -Backend program conventions ---------------------------- - -vhost-user backends can provide various devices & services and may -need to be configured manually depending on the use case. However, it -is a good idea to follow the conventions listed here when -possible. Users, QEMU or libvirt, can then rely on some common -behaviour to avoid heterogenous configuration and management of the -backend programs and facilitate interoperability. - -Each backend installed on a host system should come with at least one -JSON file that conforms to the vhost-user.json schema. Each file -informs the management applications about the backend type, and binary -location. In addition, it defines rules for management apps for -picking the highest priority backend when multiple match the search -criteria (see @VhostUserBackend documentation in the schema file). - -If the backend is not capable of enabling a requested feature on the -host (such as 3D acceleration with virgl), or the initialization -failed, the backend should fail to start early and exit with a status -!= 0. It may also print a message to stderr for further details. 
- -The backend program must not daemonize itself, but it may be -daemonized by the management layer. It may also have a restricted -access to the system. - -File descriptors 0, 1 and 2 will exist, and have regular -stdin/stdout/stderr usage (they may have been redirected to /dev/null -by the management layer, or to a log handler). - -The backend program must end (as quickly and cleanly as possible) when -the SIGTERM signal is received. Eventually, it may receive SIGKILL by -the management layer after a few seconds. - -The following command line options have an expected behaviour. They -are mandatory, unless explicitly said differently: - -* --socket-path=PATH - -This option specifies the location of the vhost-user Unix domain socket. -It is incompatible with --fd. - -* --fd=FDNUM - -When this argument is given, the backend program is started with the -vhost-user socket as file descriptor FDNUM. It is incompatible with ---socket-path. - -* --print-capabilities - -Output to stdout the backend capabilities in JSON format, and then -exit successfully. Other options and arguments should be ignored, and -the backend program should not perform its normal function. The -capabilities can be reported dynamically depending on the host -capabilities. - -The JSON output is described in the vhost-user.json schema, by -@VHostUserBackendCapabilities. Example: -{ - "type": "foo", - "features": [ - "feature-a", - "feature-b" - ] -} - -vhost-user-input ----------------- - -Command line options: - -* --evdev-path=PATH (optional) - -Specify the linux input device. - -* --no-grab (optional) - -Do not request exclusive access to the input device. - -vhost-user-gpu --------------- - -Command line options: - -* --render-node=PATH (optional) - -Specify the GPU DRM render node. - -* --virgl (optional) - -Enable virgl rendering support. 
diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c index 88e4ae1bcd..613406d09b 100644 --- a/hw/acpi/pcihp.c +++ b/hw/acpi/pcihp.c @@ -37,14 +37,7 @@ #include "hw/pci/pci_bus.h" #include "qapi/error.h" #include "qom/qom-qobject.h" - -//#define DEBUG - -#ifdef DEBUG -# define ACPI_PCIHP_DPRINTF(format, ...) printf(format, ## __VA_ARGS__) -#else -# define ACPI_PCIHP_DPRINTF(format, ...) do { } while (0) -#endif +#include "trace.h" #define ACPI_PCIHP_ADDR 0xae00 #define ACPI_PCIHP_SIZE 0x0014 @@ -159,6 +152,8 @@ static void acpi_pcihp_eject_slot(AcpiPciHpState *s, unsigned bsel, unsigned slo int slot = ctz32(slots); PCIBus *bus = acpi_pcihp_find_hotplug_bus(s, bsel); + trace_acpi_pci_eject_slot(bsel, slot); + if (!bus) { return; } @@ -270,6 +265,8 @@ void acpi_pcihp_device_plug_cb(HotplugHandler *hotplug_dev, AcpiPciHpState *s, void acpi_pcihp_device_unplug_cb(HotplugHandler *hotplug_dev, AcpiPciHpState *s, DeviceState *dev, Error **errp) { + trace_acpi_pci_unplug(PCI_SLOT(PCI_DEVICE(dev)->devfn), + acpi_pcihp_get_bsel(pci_get_bus(PCI_DEVICE(dev)))); object_property_set_bool(OBJECT(dev), false, "realized", NULL); } @@ -280,6 +277,9 @@ void acpi_pcihp_device_unplug_request_cb(HotplugHandler *hotplug_dev, PCIDevice *pdev = PCI_DEVICE(dev); int slot = PCI_SLOT(pdev->devfn); int bsel = acpi_pcihp_get_bsel(pci_get_bus(pdev)); + + trace_acpi_pci_unplug_request(bsel, slot); + if (bsel < 0) { error_setg(errp, "Unsupported bus. 
Bus doesn't have property '" ACPI_PCIHP_PROP_BSEL "' set"); @@ -306,23 +306,23 @@ static uint64_t pci_read(void *opaque, hwaddr addr, unsigned int size) if (!s->legacy_piix) { s->acpi_pcihp_pci_status[bsel].up = 0; } - ACPI_PCIHP_DPRINTF("pci_up_read %" PRIu32 "\n", val); + trace_acpi_pci_up_read(val); break; case PCI_DOWN_BASE: val = s->acpi_pcihp_pci_status[bsel].down; - ACPI_PCIHP_DPRINTF("pci_down_read %" PRIu32 "\n", val); + trace_acpi_pci_down_read(val); break; case PCI_EJ_BASE: /* No feature defined yet */ - ACPI_PCIHP_DPRINTF("pci_features_read %" PRIu32 "\n", val); + trace_acpi_pci_features_read(val); break; case PCI_RMV_BASE: val = s->acpi_pcihp_pci_status[bsel].hotplug_enable; - ACPI_PCIHP_DPRINTF("pci_rmv_read %" PRIu32 "\n", val); + trace_acpi_pci_rmv_read(val); break; case PCI_SEL_BASE: val = s->hotplug_select; - ACPI_PCIHP_DPRINTF("pci_sel_read %" PRIu32 "\n", val); + trace_acpi_pci_sel_read(val); default: break; } @@ -340,13 +340,11 @@ static void pci_write(void *opaque, hwaddr addr, uint64_t data, break; } acpi_pcihp_eject_slot(s, s->hotplug_select, data); - ACPI_PCIHP_DPRINTF("pciej write %" HWADDR_PRIx " <== %" PRIu64 "\n", - addr, data); + trace_acpi_pci_ej_write(addr, data); break; case PCI_SEL_BASE: s->hotplug_select = s->legacy_piix ? ACPI_PCIHP_BSEL_DEFAULT : data; - ACPI_PCIHP_DPRINTF("pcisel write %" HWADDR_PRIx " <== %" PRIu64 "\n", - addr, data); + trace_acpi_pci_sel_write(addr, data); default: break; } diff --git a/hw/acpi/piix4.c b/hw/acpi/piix4.c index c903e65169..ec4e186cec 100644 --- a/hw/acpi/piix4.c +++ b/hw/acpi/piix4.c @@ -39,14 +39,7 @@ #include "hw/acpi/acpi_dev_interface.h" #include "hw/xen/xen.h" #include "qom/cpu.h" - -//#define DEBUG - -#ifdef DEBUG -# define PIIX4_DPRINTF(format, ...) printf(format, ## __VA_ARGS__) -#else -# define PIIX4_DPRINTF(format, ...) 
do { } while (0) -#endif +#include "trace.h" #define GPE_BASE 0xafe0 #define GPE_LEN 4 @@ -583,7 +576,7 @@ static uint64_t gpe_readb(void *opaque, hwaddr addr, unsigned width) PIIX4PMState *s = opaque; uint32_t val = acpi_gpe_ioport_readb(&s->ar, addr); - PIIX4_DPRINTF("gpe read %" HWADDR_PRIx " == %" PRIu32 "\n", addr, val); + trace_piix4_gpe_readb(addr, width, val); return val; } @@ -592,10 +585,9 @@ static void gpe_writeb(void *opaque, hwaddr addr, uint64_t val, { PIIX4PMState *s = opaque; + trace_piix4_gpe_writeb(addr, width, val); acpi_gpe_ioport_writeb(&s->ar, addr, val); acpi_update_sci(&s->ar, s->irq); - - PIIX4_DPRINTF("gpe write %" HWADDR_PRIx " <== %" PRIu64 "\n", addr, val); } static const MemoryRegionOps piix4_gpe_ops = { diff --git a/hw/acpi/trace-events b/hw/acpi/trace-events index 6272d8a9e7..96b8273297 100644 --- a/hw/acpi/trace-events +++ b/hw/acpi/trace-events @@ -31,6 +31,22 @@ cpuhp_acpi_ejecting_cpu(uint32_t idx) "0x%"PRIx32 cpuhp_acpi_write_ost_ev(uint32_t slot, uint32_t ev) "idx[0x%"PRIx32"] OST EVENT: 0x%"PRIx32 cpuhp_acpi_write_ost_status(uint32_t slot, uint32_t st) "idx[0x%"PRIx32"] OST STATUS: 0x%"PRIx32 +# pcihp.c +acpi_pci_eject_slot(unsigned bsel, unsigned slot) "bsel: %u slot: %u" +acpi_pci_unplug(int bsel, int slot) "bsel: %d slot: %d" +acpi_pci_unplug_request(int bsel, int slot) "bsel: %d slot: %d" +acpi_pci_up_read(uint32_t val) "%" PRIu32 +acpi_pci_down_read(uint32_t val) "%" PRIu32 +acpi_pci_features_read(uint32_t val) "%" PRIu32 +acpi_pci_rmv_read(uint32_t val) "%" PRIu32 +acpi_pci_sel_read(uint32_t val) "%" PRIu32 +acpi_pci_ej_write(uint64_t addr, uint64_t data) "0x%" PRIx64 " <== %" PRIu64 +acpi_pci_sel_write(uint64_t addr, uint64_t data) "0x%" PRIx64 " <== %" PRIu64 + +# piix4.c +piix4_gpe_readb(uint64_t addr, unsigned width, uint64_t val) "addr: 0x%" PRIx64 " width: %d ==> 0x%" PRIx64 +piix4_gpe_writeb(uint64_t addr, unsigned width, uint64_t val) "addr: 0x%" PRIx64 " width: %d <== 0x%" PRIx64 + # tco.c tco_timer_reload(int 
ticks, int msec) "ticks=%d (%d ms)" tco_timer_expired(int timeouts_no, bool strap, bool no_reboot) "timeouts_no=%d no_reboot=%d/%d" diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c index bf9c0bc2f4..e7c96d658e 100644 --- a/hw/arm/virt-acpi-build.c +++ b/hw/arm/virt-acpi-build.c @@ -40,6 +40,7 @@ #include "hw/loader.h" #include "hw/hw.h" #include "hw/acpi/aml-build.h" +#include "hw/acpi/pci.h" #include "hw/pci/pcie_host.h" #include "hw/pci/pci.h" #include "hw/arm/virt.h" @@ -546,25 +547,20 @@ build_srat(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) } static void -build_mcfg(GArray *table_data, BIOSLinker *linker, VirtMachineState *vms) +build_mcfg(GArray *table_data, BIOSLinker *linker, AcpiMcfgInfo *info) { AcpiTableMcfg *mcfg; - const MemMapEntry *memmap = vms->memmap; - int ecam_id = VIRT_ECAM_ID(vms->highmem_ecam); int len = sizeof(*mcfg) + sizeof(mcfg->allocation[0]); - int mcfg_start = table_data->len; mcfg = acpi_data_push(table_data, len); - mcfg->allocation[0].address = cpu_to_le64(memmap[ecam_id].base); + mcfg->allocation[0].address = cpu_to_le64(info->base); /* Only a single allocation so no need to play with segments */ mcfg->allocation[0].pci_segment = cpu_to_le16(0); mcfg->allocation[0].start_bus_number = 0; - mcfg->allocation[0].end_bus_number = - PCIE_MMCFG_BUS(memmap[ecam_id].size - 1); + mcfg->allocation[0].end_bus_number = PCIE_MMCFG_BUS(info->size - 1); - build_header(linker, table_data, (void *)(table_data->data + mcfg_start), - "MCFG", table_data->len - mcfg_start, 1, NULL, NULL); + build_header(linker, table_data, (void *)mcfg, "MCFG", len, 1, NULL, NULL); } /* GTDT */ @@ -803,7 +799,13 @@ void virt_acpi_build(VirtMachineState *vms, AcpiBuildTables *tables) build_gtdt(tables_blob, tables->linker, vms); acpi_add_table(table_offsets, tables_blob); - build_mcfg(tables_blob, tables->linker, vms); + { + AcpiMcfgInfo mcfg = { + .base = vms->memmap[VIRT_ECAM_ID(vms->highmem_ecam)].base, + .size = 
vms->memmap[VIRT_ECAM_ID(vms->highmem_ecam)].size, + }; + build_mcfg(tables_blob, tables->linker, &mcfg); + } acpi_add_table(table_offsets, tables_blob); build_spcr(tables_blob, tables->linker, vms); diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c index 28b81368f7..9cb61336a6 100644 --- a/hw/block/vhost-user-blk.c +++ b/hw/block/vhost-user-blk.c @@ -103,7 +103,7 @@ const VhostDevConfigOps blk_ops = { .vhost_dev_config_notifier = vhost_user_blk_handle_config_change, }; -static void vhost_user_blk_start(VirtIODevice *vdev) +static int vhost_user_blk_start(VirtIODevice *vdev) { VHostUserBlk *s = VHOST_USER_BLK(vdev); BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); @@ -112,13 +112,13 @@ static void vhost_user_blk_start(VirtIODevice *vdev) if (!k->set_guest_notifiers) { error_report("binding does not support guest notifiers"); - return; + return -ENOSYS; } ret = vhost_dev_enable_notifiers(&s->dev, vdev); if (ret < 0) { error_report("Error enabling host notifiers: %d", -ret); - return; + return ret; } ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, true); @@ -157,12 +157,13 @@ static void vhost_user_blk_start(VirtIODevice *vdev) vhost_virtqueue_mask(&s->dev, vdev, i, false); } - return; + return ret; err_guest_notifiers: k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false); err_host_notifiers: vhost_dev_disable_notifiers(&s->dev, vdev); + return ret; } static void vhost_user_blk_stop(VirtIODevice *vdev) @@ -190,18 +191,28 @@ static void vhost_user_blk_stop(VirtIODevice *vdev) static void vhost_user_blk_set_status(VirtIODevice *vdev, uint8_t status) { VHostUserBlk *s = VHOST_USER_BLK(vdev); - bool should_start = status & VIRTIO_CONFIG_S_DRIVER_OK; + bool should_start = vdev->started; + int ret; if (!vdev->vm_running) { should_start = false; } + if (!s->connected) { + return; + } + if (s->dev.started == should_start) { return; } if (should_start) { - vhost_user_blk_start(vdev); + ret = vhost_user_blk_start(vdev); + if (ret < 0) { + 
error_report("vhost-user-blk: vhost start failed: %s", + strerror(-ret)); + qemu_chr_fe_disconnect(&s->chardev); + } } else { vhost_user_blk_stop(vdev); } @@ -237,10 +248,13 @@ static uint64_t vhost_user_blk_get_features(VirtIODevice *vdev, static void vhost_user_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq) { VHostUserBlk *s = VHOST_USER_BLK(vdev); - int i; + int i, ret; - if (!(virtio_host_has_feature(vdev, VIRTIO_F_VERSION_1) && - !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1))) { + if (!vdev->start_on_kick) { + return; + } + + if (!s->connected) { return; } @@ -251,7 +265,13 @@ static void vhost_user_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq) /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start * vhost here instead of waiting for .set_status(). */ - vhost_user_blk_start(vdev); + ret = vhost_user_blk_start(vdev); + if (ret < 0) { + error_report("vhost-user-blk: vhost start failed: %s", + strerror(-ret)); + qemu_chr_fe_disconnect(&s->chardev); + return; + } /* Kick right away to begin processing requests already in vring */ for (i = 0; i < s->dev.nvqs; i++) { @@ -271,11 +291,103 @@ static void vhost_user_blk_reset(VirtIODevice *vdev) vhost_dev_free_inflight(s->inflight); } +static int vhost_user_blk_connect(DeviceState *dev) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserBlk *s = VHOST_USER_BLK(vdev); + int ret = 0; + + if (s->connected) { + return 0; + } + s->connected = true; + + s->dev.nvqs = s->num_queues; + s->dev.vqs = s->vqs; + s->dev.vq_index = 0; + s->dev.backend_features = 0; + + vhost_dev_set_config_notifier(&s->dev, &blk_ops); + + ret = vhost_dev_init(&s->dev, &s->vhost_user, VHOST_BACKEND_TYPE_USER, 0); + if (ret < 0) { + error_report("vhost-user-blk: vhost initialization failed: %s", + strerror(-ret)); + return ret; + } + + /* restore vhost state */ + if (vdev->started) { + ret = vhost_user_blk_start(vdev); + if (ret < 0) { + error_report("vhost-user-blk: vhost start failed: %s", + strerror(-ret)); + 
return ret; + } + } + + return 0; +} + +static void vhost_user_blk_disconnect(DeviceState *dev) +{ + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserBlk *s = VHOST_USER_BLK(vdev); + + if (!s->connected) { + return; + } + s->connected = false; + + if (s->dev.started) { + vhost_user_blk_stop(vdev); + } + + vhost_dev_cleanup(&s->dev); +} + +static gboolean vhost_user_blk_watch(GIOChannel *chan, GIOCondition cond, + void *opaque) +{ + DeviceState *dev = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserBlk *s = VHOST_USER_BLK(vdev); + + qemu_chr_fe_disconnect(&s->chardev); + + return true; +} + +static void vhost_user_blk_event(void *opaque, int event) +{ + DeviceState *dev = opaque; + VirtIODevice *vdev = VIRTIO_DEVICE(dev); + VHostUserBlk *s = VHOST_USER_BLK(vdev); + + switch (event) { + case CHR_EVENT_OPENED: + if (vhost_user_blk_connect(dev) < 0) { + qemu_chr_fe_disconnect(&s->chardev); + return; + } + s->watch = qemu_chr_fe_add_watch(&s->chardev, G_IO_HUP, + vhost_user_blk_watch, dev); + break; + case CHR_EVENT_CLOSED: + vhost_user_blk_disconnect(dev); + if (s->watch) { + g_source_remove(s->watch); + s->watch = 0; + } + break; + } +} + static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp) { VirtIODevice *vdev = VIRTIO_DEVICE(dev); VHostUserBlk *s = VHOST_USER_BLK(vdev); - struct vhost_virtqueue *vqs = NULL; + Error *err = NULL; int i, ret; if (!s->chardev.chr) { @@ -306,27 +418,29 @@ static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp) } s->inflight = g_new0(struct vhost_inflight, 1); + s->vqs = g_new(struct vhost_virtqueue, s->num_queues); + s->watch = 0; + s->connected = false; - s->dev.nvqs = s->num_queues; - s->dev.vqs = g_new(struct vhost_virtqueue, s->dev.nvqs); - s->dev.vq_index = 0; - s->dev.backend_features = 0; - vqs = s->dev.vqs; - - vhost_dev_set_config_notifier(&s->dev, &blk_ops); + qemu_chr_fe_set_handlers(&s->chardev, NULL, NULL, vhost_user_blk_event, + NULL, (void *)dev, NULL, true); - ret = 
vhost_dev_init(&s->dev, &s->vhost_user, VHOST_BACKEND_TYPE_USER, 0); - if (ret < 0) { - error_setg(errp, "vhost-user-blk: vhost initialization failed: %s", - strerror(-ret)); +reconnect: + if (qemu_chr_fe_wait_connected(&s->chardev, &err) < 0) { + error_report_err(err); goto virtio_err; } + /* check whether vhost_user_blk_connect() failed or not */ + if (!s->connected) { + goto reconnect; + } + ret = vhost_dev_get_config(&s->dev, (uint8_t *)&s->blkcfg, - sizeof(struct virtio_blk_config)); + sizeof(struct virtio_blk_config)); if (ret < 0) { - error_setg(errp, "vhost-user-blk: get block config failed"); - goto vhost_err; + error_report("vhost-user-blk: get block config failed"); + goto reconnect; } if (s->blkcfg.num_queues != s->num_queues) { @@ -335,10 +449,8 @@ static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp) return; -vhost_err: - vhost_dev_cleanup(&s->dev); virtio_err: - g_free(vqs); + g_free(s->vqs); g_free(s->inflight); virtio_cleanup(vdev); vhost_user_cleanup(&s->vhost_user); @@ -348,12 +460,13 @@ static void vhost_user_blk_device_unrealize(DeviceState *dev, Error **errp) { VirtIODevice *vdev = VIRTIO_DEVICE(dev); VHostUserBlk *s = VHOST_USER_BLK(dev); - struct vhost_virtqueue *vqs = s->dev.vqs; - vhost_user_blk_set_status(vdev, 0); + virtio_set_status(vdev, 0); + qemu_chr_fe_set_handlers(&s->chardev, NULL, NULL, NULL, + NULL, NULL, NULL, false); vhost_dev_cleanup(&s->dev); vhost_dev_free_inflight(s->inflight); - g_free(vqs); + g_free(s->vqs); g_free(s->inflight); virtio_cleanup(vdev); vhost_user_cleanup(&s->vhost_user); diff --git a/hw/core/machine.c b/hw/core/machine.c index 5d046a43e3..934c1bcceb 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -102,9 +102,26 @@ const size_t hw_compat_2_7_len = G_N_ELEMENTS(hw_compat_2_7); GlobalProperty hw_compat_2_6[] = { { "virtio-mmio", "format_transport_address", "off" }, - /* Optional because not all virtio-pci devices support legacy mode */ - { "virtio-pci", "disable-modern", "on", 
.optional = true }, - { "virtio-pci", "disable-legacy", "off", .optional = true }, + /* + * don't include devices which are modern-only + * ie keyboard, mouse, tablet, gpu, vga & crypto + */ + { "virtio-9p-pci", "disable-modern", "on" }, + { "virtio-9p-pci", "disable-legacy", "off" }, + { "virtio-balloon-pci", "disable-modern", "on" }, + { "virtio-balloon-pci", "disable-legacy", "off" }, + { "virtio-blk-pci", "disable-modern", "on" }, + { "virtio-blk-pci", "disable-legacy", "off" }, + { "virtio-input-host-pci", "disable-modern", "on" }, + { "virtio-input-host-pci", "disable-legacy", "off" }, + { "virtio-net-pci", "disable-modern", "on" }, + { "virtio-net-pci", "disable-legacy", "off" }, + { "virtio-rng-pci", "disable-modern", "on" }, + { "virtio-rng-pci", "disable-legacy", "off" }, + { "virtio-scsi-pci", "disable-modern", "on" }, + { "virtio-scsi-pci", "disable-legacy", "off" }, + { "virtio-serial-pci", "disable-modern", "on" }, + { "virtio-serial-pci", "disable-legacy", "off" }, }; const size_t hw_compat_2_6_len = G_N_ELEMENTS(hw_compat_2_6); diff --git a/hw/display/virtio-gpu-pci.c b/hw/display/virtio-gpu-pci.c index bdcd33c925..0bc4d9d424 100644 --- a/hw/display/virtio-gpu-pci.c +++ b/hw/display/virtio-gpu-pci.c @@ -47,7 +47,9 @@ static void virtio_gpu_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) Error *local_error = NULL; qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus)); - virtio_pci_force_virtio_1(vpci_dev); + if (!virtio_pci_force_virtio_1(vpci_dev, errp)) { + return; + } object_property_set_bool(OBJECT(vdev), true, "realized", &local_error); if (local_error) { diff --git a/hw/display/virtio-vga.c b/hw/display/virtio-vga.c index a2b803b75f..5d57bf5b0c 100644 --- a/hw/display/virtio-vga.c +++ b/hw/display/virtio-vga.c @@ -154,7 +154,9 @@ static void virtio_vga_realize(VirtIOPCIProxy *vpci_dev, Error **errp) /* init virtio bits */ qdev_set_parent_bus(DEVICE(g), BUS(&vpci_dev->bus)); - virtio_pci_force_virtio_1(vpci_dev); + if 
(!virtio_pci_force_virtio_1(vpci_dev, errp)) { + return; + } object_property_set_bool(OBJECT(g), true, "realized", &err); if (err) { error_propagate(errp, err); diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c index b4ec14e349..0d78d73894 100644 --- a/hw/i386/acpi-build.c +++ b/hw/i386/acpi-build.c @@ -59,6 +59,7 @@ #include "hw/i386/x86-iommu.h" #include "hw/acpi/aml-build.h" +#include "hw/acpi/pci.h" #include "qom/qom-qobject.h" #include "hw/i386/amd_iommu.h" @@ -87,11 +88,6 @@ /* Default IOAPIC ID */ #define ACPI_BUILD_IOAPIC_ID 0x0 -typedef struct AcpiMcfgInfo { - uint64_t mcfg_base; - uint32_t mcfg_size; -} AcpiMcfgInfo; - typedef struct AcpiPmInfo { bool s3_disabled; bool s4_disabled; @@ -2413,29 +2409,16 @@ static void build_mcfg_q35(GArray *table_data, BIOSLinker *linker, AcpiMcfgInfo *info) { AcpiTableMcfg *mcfg; - const char *sig; int len = sizeof(*mcfg) + 1 * sizeof(mcfg->allocation[0]); mcfg = acpi_data_push(table_data, len); - mcfg->allocation[0].address = cpu_to_le64(info->mcfg_base); + mcfg->allocation[0].address = cpu_to_le64(info->base); /* Only a single allocation so no need to play with segments */ mcfg->allocation[0].pci_segment = cpu_to_le16(0); mcfg->allocation[0].start_bus_number = 0; - mcfg->allocation[0].end_bus_number = PCIE_MMCFG_BUS(info->mcfg_size - 1); + mcfg->allocation[0].end_bus_number = PCIE_MMCFG_BUS(info->size - 1); - /* MCFG is used for ECAM which can be enabled or disabled by guest. - * To avoid table size changes (which create migration issues), - * always create the table even if there are no allocations, - * but set the signature to a reserved value in this case. - * ACPI spec requires OSPMs to ignore such tables. 
- */ - if (info->mcfg_base == PCIE_BASE_ADDR_UNMAPPED) { - /* Reserved signature: ignored by OSPM */ - sig = "QEMU"; - } else { - sig = "MCFG"; - } - build_header(linker, table_data, (void *)mcfg, sig, len, 1, NULL, NULL); + build_header(linker, table_data, (void *)mcfg, "MCFG", len, 1, NULL, NULL); } /* @@ -2602,12 +2585,15 @@ static bool acpi_get_mcfg(AcpiMcfgInfo *mcfg) if (!o) { return false; } - mcfg->mcfg_base = qnum_get_uint(qobject_to(QNum, o)); + mcfg->base = qnum_get_uint(qobject_to(QNum, o)); qobject_unref(o); + if (mcfg->base == PCIE_BASE_ADDR_UNMAPPED) { + return false; + } o = object_property_get_qobject(pci_host, PCIE_HOST_MCFG_SIZE, NULL); assert(o); - mcfg->mcfg_size = qnum_get_uint(qobject_to(QNum, o)); + mcfg->size = qnum_get_uint(qobject_to(QNum, o)); qobject_unref(o); return true; } diff --git a/hw/pci-bridge/pci_expander_bridge.c b/hw/pci-bridge/pci_expander_bridge.c index e62de4218f..ca66bc721a 100644 --- a/hw/pci-bridge/pci_expander_bridge.c +++ b/hw/pci-bridge/pci_expander_bridge.c @@ -66,11 +66,6 @@ static int pxb_bus_num(PCIBus *bus) return pxb->bus_nr; } -static bool pxb_is_root(PCIBus *bus) -{ - return true; /* by definition */ -} - static uint16_t pxb_bus_numa_node(PCIBus *bus) { PXBDev *pxb = convert_to_pxb(bus->parent_dev); @@ -83,7 +78,6 @@ static void pxb_bus_class_init(ObjectClass *class, void *data) PCIBusClass *pbc = PCI_BUS_CLASS(class); pbc->bus_num = pxb_bus_num; - pbc->is_root = pxb_is_root; pbc->numa_node = pxb_bus_numa_node; } diff --git a/hw/pci/pci.c b/hw/pci/pci.c index a78023f669..b386777045 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -129,14 +129,9 @@ static void pci_bus_unrealize(BusState *qbus, Error **errp) vmstate_unregister(NULL, &vmstate_pcibus, bus); } -static bool pcibus_is_root(PCIBus *bus) -{ - return !bus->parent_dev; -} - static int pcibus_num(PCIBus *bus) { - if (pcibus_is_root(bus)) { + if (pci_bus_is_root(bus)) { return 0; /* pci host bridge */ } return bus->parent_dev->config[PCI_SECONDARY_BUS]; @@ 
-164,7 +159,6 @@ static void pci_bus_class_init(ObjectClass *klass, void *data) k->unrealize = pci_bus_unrealize; k->reset = pcibus_reset; - pbc->is_root = pcibus_is_root; pbc->bus_num = pcibus_num; pbc->numa_node = pcibus_numa_node; pbc->allows_extended_config_space = pcibus_allows_extended_config_space; @@ -398,6 +392,7 @@ static void pci_root_bus_init(PCIBus *bus, DeviceState *parent, bus->slot_reserved_mask = 0x0; bus->address_space_mem = address_space_mem; bus->address_space_io = address_space_io; + bus->flags |= PCI_BUS_IS_ROOT; /* host bridge */ QLIST_INIT(&bus->child); @@ -415,11 +410,6 @@ bool pci_bus_is_express(PCIBus *bus) return object_dynamic_cast(OBJECT(bus), TYPE_PCIE_BUS); } -bool pci_bus_is_root(PCIBus *bus) -{ - return PCI_BUS_GET_CLASS(bus)->is_root(bus); -} - bool pci_bus_allows_extended_config_space(PCIBus *bus) { return PCI_BUS_GET_CLASS(bus)->allows_extended_config_space(bus); diff --git a/hw/pci/pcie_host.c b/hw/pci/pcie_host.c index 553db56778..1ee4945a6d 100644 --- a/hw/pci/pcie_host.c +++ b/hw/pci/pcie_host.c @@ -47,11 +47,6 @@ static void pcie_mmcfg_data_write(void *opaque, hwaddr mmcfg_addr, } addr = PCIE_MMCFG_CONFOFFSET(mmcfg_addr); limit = pci_config_size(pci_dev); - if (limit <= addr) { - /* conventional pci device can be behind pcie-to-pci bridge. - 256 <= addr < 4K has no effects. */ - return; - } pci_host_config_write_common(pci_dev, addr, limit, val, len); } @@ -70,11 +65,6 @@ static uint64_t pcie_mmcfg_data_read(void *opaque, } addr = PCIE_MMCFG_CONFOFFSET(mmcfg_addr); limit = pci_config_size(pci_dev); - if (limit <= addr) { - /* conventional pci device can be behind pcie-to-pci bridge. - 256 <= addr < 4K has no effects. 
*/ - return ~0x0; - } return pci_host_config_read_common(pci_dev, addr, limit, len); } diff --git a/hw/virtio/virtio-crypto-pci.c b/hw/virtio/virtio-crypto-pci.c index 90a6e0dc2e..13807e538b 100644 --- a/hw/virtio/virtio-crypto-pci.c +++ b/hw/virtio/virtio-crypto-pci.c @@ -51,7 +51,9 @@ static void virtio_crypto_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) } qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus)); - virtio_pci_force_virtio_1(vpci_dev); + if (!virtio_pci_force_virtio_1(vpci_dev, errp)) { + return; + } object_property_set_bool(OBJECT(vdev), true, "realized", errp); object_property_set_link(OBJECT(vcrypto), OBJECT(vcrypto->vdev.conf.cryptodev), "cryptodev", diff --git a/hw/virtio/virtio-input-pci.c b/hw/virtio/virtio-input-pci.c index 2c1397842b..28477729a3 100644 --- a/hw/virtio/virtio-input-pci.c +++ b/hw/virtio/virtio-input-pci.c @@ -48,7 +48,9 @@ static void virtio_input_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) DeviceState *vdev = DEVICE(&vinput->vdev); qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus)); - virtio_pci_force_virtio_1(vpci_dev); + if (!virtio_pci_force_virtio_1(vpci_dev, errp)) { + return; + } object_property_set_bool(OBJECT(vdev), true, "realized", errp); } diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index cb44e19b67..9056cdfa3c 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -20,6 +20,7 @@ #include "standard-headers/linux/virtio_pci.h" #include "hw/virtio/virtio.h" #include "hw/pci/pci.h" +#include "hw/pci/pci_bus.h" #include "qapi/error.h" #include "qemu/error-report.h" #include "hw/pci/msi.h" @@ -1721,16 +1722,22 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) /* PCI BAR regions must be powers of 2 */ pow2ceil(proxy->notify.offset + proxy->notify.size)); - if (proxy->disable_legacy == ON_OFF_AUTO_AUTO) { - proxy->disable_legacy = pcie_port ? 
ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; - } - - if (!virtio_pci_modern(proxy) && !virtio_pci_legacy(proxy)) { - error_setg(errp, "device cannot work as neither modern nor legacy mode" - " is enabled"); - error_append_hint(errp, "Set either disable-modern or disable-legacy" - " to off\n"); - return; + if ((proxy->disable_legacy == ON_OFF_AUTO_ON) || + ((proxy->disable_legacy == ON_OFF_AUTO_AUTO) && pcie_port)) { + if (proxy->disable_modern) { + error_setg(errp, "device cannot work as neither modern nor " + "legacy mode is enabled"); + error_append_hint(errp, "Set either disable-modern or " + "disable-legacy to off\n"); + return; + } + proxy->mode = VIRTIO_PCI_MODE_MODERN; + } else { + if (proxy->disable_modern) { + proxy->mode = VIRTIO_PCI_MODE_LEGACY; + } else { + proxy->mode = VIRTIO_PCI_MODE_TRANSITIONAL; + } } if (pcie_port && pci_is_express(pci_dev)) { diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h index 18581854ca..bfea2892a5 100644 --- a/hw/virtio/virtio-pci.h +++ b/hw/virtio/virtio-pci.h @@ -15,6 +15,7 @@ #ifndef QEMU_VIRTIO_PCI_H #define QEMU_VIRTIO_PCI_H +#include "qapi/error.h" #include "hw/pci/msi.h" #include "hw/virtio/virtio-bus.h" @@ -118,6 +119,12 @@ typedef struct VirtIOPCIQueue { uint32_t used[2]; } VirtIOPCIQueue; +typedef enum { + VIRTIO_PCI_MODE_LEGACY, + VIRTIO_PCI_MODE_TRANSITIONAL, + VIRTIO_PCI_MODE_MODERN, +} VirtIOPCIMode; + struct VirtIOPCIProxy { PCIDevice pci_dev; MemoryRegion bar; @@ -142,6 +149,7 @@ struct VirtIOPCIProxy { bool disable_modern; bool ignore_backend_features; OnOffAuto disable_legacy; + VirtIOPCIMode mode; uint32_t class_code; uint32_t nvectors; uint32_t dfselect; @@ -156,23 +164,34 @@ struct VirtIOPCIProxy { static inline bool virtio_pci_modern(VirtIOPCIProxy *proxy) { - return !proxy->disable_modern; + return proxy->mode != VIRTIO_PCI_MODE_LEGACY; } static inline bool virtio_pci_legacy(VirtIOPCIProxy *proxy) { - return proxy->disable_legacy == ON_OFF_AUTO_OFF; + return proxy->mode != VIRTIO_PCI_MODE_MODERN; } 
-static inline void virtio_pci_force_virtio_1(VirtIOPCIProxy *proxy) +static inline bool virtio_pci_force_virtio_1(VirtIOPCIProxy *proxy, + Error **errp) { - proxy->disable_modern = false; - proxy->disable_legacy = ON_OFF_AUTO_ON; + if (proxy->disable_legacy == ON_OFF_AUTO_OFF) { + error_setg(errp, "Unable to set disable-legacy=off on a virtio-1.0 " + "only device"); + return false; + } + if (proxy->disable_modern == true) { + error_setg(errp, "Unable to set disable-modern=on on a virtio-1.0 " + "only device"); + return false; + } + proxy->mode = VIRTIO_PCI_MODE_MODERN; + return true; } static inline void virtio_pci_disable_modern(VirtIOPCIProxy *proxy) { - proxy->disable_modern = true; + proxy->mode = VIRTIO_PCI_MODE_LEGACY; } /* diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 28056a7ef7..4805727b53 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -1162,10 +1162,16 @@ int virtio_set_status(VirtIODevice *vdev, uint8_t val) } } } + vdev->started = val & VIRTIO_CONFIG_S_DRIVER_OK; + if (unlikely(vdev->start_on_kick && vdev->started)) { + vdev->start_on_kick = false; + } + if (k->set_status) { k->set_status(vdev, val); } vdev->status = val; + return 0; } @@ -1208,6 +1214,9 @@ void virtio_reset(void *opaque) k->reset(vdev); } + vdev->start_on_kick = (virtio_host_has_feature(vdev, VIRTIO_F_VERSION_1) && + !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)); + vdev->started = false; vdev->broken = false; vdev->guest_features = 0; vdev->queue_sel = 0; @@ -1518,14 +1527,21 @@ void virtio_queue_set_align(VirtIODevice *vdev, int n, int align) static bool virtio_queue_notify_aio_vq(VirtQueue *vq) { + bool ret = false; + if (vq->vring.desc && vq->handle_aio_output) { VirtIODevice *vdev = vq->vdev; trace_virtio_queue_notify(vdev, vq - vdev->vq, vq); - return vq->handle_aio_output(vdev, vq); + ret = vq->handle_aio_output(vdev, vq); + + if (unlikely(vdev->start_on_kick)) { + vdev->started = true; + vdev->start_on_kick = false; + } } - return false; + return 
ret; } static void virtio_queue_notify_vq(VirtQueue *vq) @@ -1539,6 +1555,11 @@ static void virtio_queue_notify_vq(VirtQueue *vq) trace_virtio_queue_notify(vdev, vq - vdev->vq, vq); vq->handle_output(vdev, vq); + + if (unlikely(vdev->start_on_kick)) { + vdev->started = true; + vdev->start_on_kick = false; + } } } @@ -1556,6 +1577,11 @@ void virtio_queue_notify(VirtIODevice *vdev, int n) } else if (vq->handle_output) { vq->handle_output(vdev, vq); } + + if (unlikely(vdev->start_on_kick)) { + vdev->started = true; + vdev->start_on_kick = false; + } } uint16_t virtio_queue_vector(VirtIODevice *vdev, int n) @@ -1770,6 +1796,13 @@ static bool virtio_broken_needed(void *opaque) return vdev->broken; } +static bool virtio_started_needed(void *opaque) +{ + VirtIODevice *vdev = opaque; + + return vdev->started; +} + static const VMStateDescription vmstate_virtqueue = { .name = "virtqueue_state", .version_id = 1, @@ -1898,6 +1931,17 @@ static const VMStateDescription vmstate_virtio_broken = { } }; +static const VMStateDescription vmstate_virtio_started = { + .name = "virtio/started", + .version_id = 1, + .minimum_version_id = 1, + .needed = &virtio_started_needed, + .fields = (VMStateField[]) { + VMSTATE_BOOL(started, VirtIODevice), + VMSTATE_END_OF_LIST() + } +}; + static const VMStateDescription vmstate_virtio = { .name = "virtio", .version_id = 1, @@ -1913,6 +1957,7 @@ static const VMStateDescription vmstate_virtio = { &vmstate_virtio_ringsize, &vmstate_virtio_broken, &vmstate_virtio_extra_state, + &vmstate_virtio_started, NULL } }; @@ -2246,7 +2291,7 @@ static void virtio_vmstate_change(void *opaque, int running, RunState state) VirtIODevice *vdev = opaque; BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); - bool backend_run = running && (vdev->status & VIRTIO_CONFIG_S_DRIVER_OK); + bool backend_run = running && vdev->started; vdev->vm_running = running; if (backend_run) { @@ -2286,6 +2331,9 @@ void 
virtio_init(VirtIODevice *vdev, const char *name, g_malloc0(sizeof(*vdev->vector_queues) * nvectors); } + vdev->start_on_kick = (virtio_host_has_feature(vdev, VIRTIO_F_VERSION_1) && + !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)); + vdev->started = false; vdev->device_id = device_id; vdev->status = 0; atomic_set(&vdev->isr, 0); diff --git a/include/hw/acpi/pci.h b/include/hw/acpi/pci.h new file mode 100644 index 0000000000..124af7d32a --- /dev/null +++ b/include/hw/acpi/pci.h @@ -0,0 +1,33 @@ +/* + * Support for generating PCI related ACPI tables and passing them to Guests + * + * Copyright (C) 2006 Fabrice Bellard + * Copyright (C) 2008-2010 Kevin O'Connor <kevin@koconnor.net> + * Copyright (C) 2013-2019 Red Hat Inc + * Copyright (C) 2019 Intel Corporation + * + * Author: Wei Yang <richardw.yang@linux.intel.com> + * Author: Michael S. Tsirkin <mst@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. 
+ */ +#ifndef HW_ACPI_PCI_H +#define HW_ACPI_PCI_H + +typedef struct AcpiMcfgInfo { + uint64_t base; + uint32_t size; +} AcpiMcfgInfo; + +#endif diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index fdd4c43d3a..edf44de21d 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -395,7 +395,6 @@ typedef PCIINTxRoute (*pci_route_irq_fn)(void *opaque, int pin); #define TYPE_PCIE_BUS "PCIE" bool pci_bus_is_express(PCIBus *bus); -bool pci_bus_is_root(PCIBus *bus); bool pci_bus_allows_extended_config_space(PCIBus *bus); void pci_root_bus_new_inplace(PCIBus *bus, size_t bus_size, DeviceState *parent, diff --git a/include/hw/pci/pci_bus.h b/include/hw/pci/pci_bus.h index f6df834170..aea98d5040 100644 --- a/include/hw/pci/pci_bus.h +++ b/include/hw/pci/pci_bus.h @@ -15,14 +15,19 @@ typedef struct PCIBusClass { BusClass parent_class; /*< public >*/ - bool (*is_root)(PCIBus *bus); int (*bus_num)(PCIBus *bus); uint16_t (*numa_node)(PCIBus *bus); bool (*allows_extended_config_space)(PCIBus *bus); } PCIBusClass; +enum PCIBusFlags { + /* This bus is the root of a PCI domain */ + PCI_BUS_IS_ROOT = 0x0001, +}; + struct PCIBus { BusState qbus; + enum PCIBusFlags flags; PCIIOMMUFunc iommu_fn; void *iommu_opaque; uint8_t devfn_min; @@ -47,4 +52,9 @@ struct PCIBus { Notifier machine_done; }; +static inline bool pci_bus_is_root(PCIBus *bus) +{ + return !!(bus->flags & PCI_BUS_IS_ROOT); +} + #endif /* QEMU_PCI_BUS_H */ diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h index 33ed3b8dde..fa55dc10ae 100644 --- a/include/hw/qdev-core.h +++ b/include/hw/qdev-core.h @@ -251,8 +251,6 @@ struct PropertyInfo { /** * GlobalProperty: * @used: Set to true if property was used when initializing a device. - * @optional: If set to true, GlobalProperty will be skipped without errors - * if the property doesn't exist. * * An error is fatal for non-hotplugged devices, when the global is applied. 
*/ @@ -261,7 +259,6 @@ typedef struct GlobalProperty { const char *property; const char *value; bool used; - bool optional; } GlobalProperty; static inline void diff --git a/include/hw/virtio/vhost-user-blk.h b/include/hw/virtio/vhost-user-blk.h index 68634bee61..51457fb857 100644 --- a/include/hw/virtio/vhost-user-blk.h +++ b/include/hw/virtio/vhost-user-blk.h @@ -38,6 +38,9 @@ typedef struct VHostUserBlk { struct vhost_dev dev; struct vhost_inflight *inflight; VhostUserState vhost_user; + struct vhost_virtqueue *vqs; + guint watch; + bool connected; } VHostUserBlk; #endif diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index 7140381e3a..27c0efc3d0 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -105,6 +105,8 @@ struct VirtIODevice uint16_t device_id; bool vm_running; bool broken; /* device in invalid state, needs reset */ + bool started; + bool start_on_kick; /* virtio 1.0 transitional devices support that */ VMChangeStateEntry *vmstate; char *bus_name; uint8_t device_endian; diff --git a/net/vhost-user.c b/net/vhost-user.c index 5a26a24708..51921de443 100644 --- a/net/vhost-user.c +++ b/net/vhost-user.c @@ -236,7 +236,6 @@ static void chr_closed_bh(void *opaque) s = DO_UPCAST(NetVhostUserState, nc, ncs[0]); qmp_set_link(name, false, &err); - vhost_user_stop(queues, ncs); qemu_chr_fe_set_handlers(&s->chr, NULL, NULL, net_vhost_user_event, NULL, opaque, NULL, true); diff --git a/qom/object.c b/qom/object.c index d3412e7fdc..99c4fa707e 100644 --- a/qom/object.c +++ b/qom/object.c @@ -385,9 +385,6 @@ void object_apply_global_props(Object *obj, const GPtrArray *props, Error **errp if (object_dynamic_cast(obj, p->driver) == NULL) { continue; } - if (p->optional && !object_property_find(obj, p->property, NULL)) { - continue; - } p->used = true; object_property_parse(obj, p->value, p->property, &err); if (err != NULL) { diff --git a/tests/acpi-utils.c b/tests/acpi-utils.c index cc33b460ab..d2a202efca 100644 --- 
a/tests/acpi-utils.c +++ b/tests/acpi-utils.c @@ -51,19 +51,7 @@ uint32_t acpi_find_rsdp_address(QTestState *qts) return off; } -uint64_t acpi_get_xsdt_address(uint8_t *rsdp_table) -{ - uint64_t xsdt_physical_address; - uint8_t revision = rsdp_table[15 /* Revision offset */]; - - /* We must have revision 2 if we're looking for an XSDT pointer */ - g_assert(revision == 2); - - memcpy(&xsdt_physical_address, &rsdp_table[24 /* XsdtAddress offset */], 8); - return le64_to_cpu(xsdt_physical_address); -} - -void acpi_parse_rsdp_table(QTestState *qts, uint32_t addr, uint8_t *rsdp_table) +void acpi_fetch_rsdp_table(QTestState *qts, uint64_t addr, uint8_t *rsdp_table) { uint8_t revision; @@ -91,13 +79,15 @@ void acpi_parse_rsdp_table(QTestState *qts, uint32_t addr, uint8_t *rsdp_table) * actual one. */ void acpi_fetch_table(QTestState *qts, uint8_t **aml, uint32_t *aml_len, - const uint8_t *addr_ptr, const char *sig, + const uint8_t *addr_ptr, int addr_size, const char *sig, bool verify_checksum) { - uint32_t addr, len; + uint32_t len; + uint64_t addr = 0; - memcpy(&addr, addr_ptr , sizeof(addr)); - addr = le32_to_cpu(addr); + g_assert(addr_size == 4 || addr_size == 8); + memcpy(&addr, addr_ptr , addr_size); + addr = le64_to_cpu(addr); qtest_memread(qts, addr + 4, &len, 4); /* Length of ACPI table */ *aml_len = le32_to_cpu(len); *aml = g_malloc0(*aml_len); @@ -111,3 +101,47 @@ void acpi_fetch_table(QTestState *qts, uint8_t **aml, uint32_t *aml_len, g_assert(!acpi_calc_checksum(*aml, *aml_len)); } } + +#define GUID_SIZE 16 +static const uint8_t AcpiTestSupportGuid[GUID_SIZE] = { + 0xb1, 0xa6, 0x87, 0xab, + 0x34, 0x20, + 0xa0, 0xbd, + 0x71, 0xbd, 0x37, 0x50, 0x07, 0x75, 0x77, 0x85 }; + +typedef struct { + uint8_t signature_guid[GUID_SIZE]; + uint64_t rsdp10; + uint64_t rsdp20; +} __attribute__((packed)) UefiTestSupport; + +/* Wait at most 600 seconds (test is slow with TCG and --enable-debug) */ +#define TEST_DELAY (1 * G_USEC_PER_SEC / 10) +#define TEST_CYCLES MAX((600 * 
G_USEC_PER_SEC / TEST_DELAY), 1) +#define MB 0x100000ULL +uint64_t acpi_find_rsdp_address_uefi(QTestState *qts, uint64_t start, + uint64_t size) +{ + int i, j; + uint8_t data[GUID_SIZE]; + + for (i = 0; i < TEST_CYCLES; ++i) { + for (j = 0; j < size / MB; j++) { + /* look for GUID at every 1Mb block */ + uint64_t addr = start + j * MB; + + qtest_memread(qts, addr, data, sizeof(data)); + if (!memcmp(AcpiTestSupportGuid, data, sizeof(data))) { + UefiTestSupport ret; + + qtest_memread(qts, addr, &ret, sizeof(ret)); + ret.rsdp10 = le64_to_cpu(ret.rsdp10); + ret.rsdp20 = le64_to_cpu(ret.rsdp20); + return ret.rsdp20 ? ret.rsdp20 : ret.rsdp10; + } + } + g_usleep(TEST_DELAY); + } + g_assert_not_reached(); + return 0; +} diff --git a/tests/acpi-utils.h b/tests/acpi-utils.h index 73fe24f044..0c86780689 100644 --- a/tests/acpi-utils.h +++ b/tests/acpi-utils.h @@ -46,10 +46,11 @@ typedef struct { uint8_t acpi_calc_checksum(const uint8_t *data, int len); uint32_t acpi_find_rsdp_address(QTestState *qts); -uint64_t acpi_get_xsdt_address(uint8_t *rsdp_table); -void acpi_parse_rsdp_table(QTestState *qts, uint32_t addr, uint8_t *rsdp_table); +uint64_t acpi_find_rsdp_address_uefi(QTestState *qts, uint64_t start, + uint64_t size); +void acpi_fetch_rsdp_table(QTestState *qts, uint64_t addr, uint8_t *rsdp_table); void acpi_fetch_table(QTestState *qts, uint8_t **aml, uint32_t *aml_len, - const uint8_t *addr_ptr, const char *sig, + const uint8_t *addr_ptr, int addr_size, const char *sig, bool verify_checksum); #endif /* TEST_ACPI_UTILS_H */ diff --git a/tests/bios-tables-test.c b/tests/bios-tables-test.c index a506dcbb29..11e07be093 100644 --- a/tests/bios-tables-test.c +++ b/tests/bios-tables-test.c @@ -24,9 +24,15 @@ #define ACPI_REBUILD_EXPECTED_AML "TEST_ACPI_REBUILD_AML" typedef struct { + const char *accel; const char *machine; const char *variant; - uint32_t rsdp_addr; + const char *uefi_fl1; + const char *uefi_fl2; + const char *cd; + const uint64_t ram_start; + const uint64_t 
scan_len; + uint64_t rsdp_addr; uint8_t rsdp_table[36 /* ACPI 2.0+ RSDP size */]; GArray *tables; uint32_t smbios_ep_addr; @@ -77,22 +83,13 @@ static void free_test_data(test_data *data) g_array_free(data->tables, true); } -static void test_acpi_rsdp_address(test_data *data) -{ - uint32_t off = acpi_find_rsdp_address(data->qts); - g_assert_cmphex(off, <, 0x100000); - data->rsdp_addr = off; -} - static void test_acpi_rsdp_table(test_data *data) { - uint8_t *rsdp_table = data->rsdp_table, revision; - uint32_t addr = data->rsdp_addr; + uint8_t *rsdp_table = data->rsdp_table; - acpi_parse_rsdp_table(data->qts, addr, rsdp_table); - revision = rsdp_table[15 /* Revision offset */]; + acpi_fetch_rsdp_table(data->qts, data->rsdp_addr, rsdp_table); - switch (revision) { + switch (rsdp_table[15 /* Revision offset */]) { case 0: /* ACPI 1.0 RSDP */ /* With rev 1, checksum is only for the first 20 bytes */ g_assert(!acpi_calc_checksum(rsdp_table, 20)); @@ -107,21 +104,29 @@ static void test_acpi_rsdp_table(test_data *data) } } -static void test_acpi_rsdt_table(test_data *data) +static void test_acpi_rxsdt_table(test_data *data) { + const char *sig = "RSDT"; AcpiSdtTable rsdt = {}; + int entry_size = 4; + int addr_off = 16 /* RsdtAddress */; uint8_t *ent; - /* read RSDT table */ + if (data->rsdp_table[15 /* Revision offset */] != 0) { + addr_off = 24 /* XsdtAddress */; + entry_size = 8; + sig = "XSDT"; + } + /* read [RX]SDT table */ acpi_fetch_table(data->qts, &rsdt.aml, &rsdt.aml_len, - &data->rsdp_table[16 /* RsdtAddress */], "RSDT", true); + &data->rsdp_table[addr_off], entry_size, sig, true); /* Load all tables and add to test list directly RSDT referenced tables */ - ACPI_FOREACH_RSDT_ENTRY(rsdt.aml, rsdt.aml_len, ent, 4 /* Entry size */) { + ACPI_FOREACH_RSDT_ENTRY(rsdt.aml, rsdt.aml_len, ent, entry_size) { AcpiSdtTable ssdt_table = {}; acpi_fetch_table(data->qts, &ssdt_table.aml, &ssdt_table.aml_len, ent, - NULL, true); + entry_size, NULL, true); /* Add table to ASL test 
tables list */ g_array_append_val(data->tables, ssdt_table); } @@ -134,16 +139,29 @@ static void test_acpi_fadt_table(test_data *data) AcpiSdtTable table = g_array_index(data->tables, typeof(table), 0); uint8_t *fadt_aml = table.aml; uint32_t fadt_len = table.aml_len; + uint32_t val; + int dsdt_offset = 40 /* DSDT */; + int dsdt_entry_size = 4; g_assert(compare_signature(&table, "FACP")); /* Since DSDT/FACS isn't in RSDT, add them to ASL test list manually */ - acpi_fetch_table(data->qts, &table.aml, &table.aml_len, - fadt_aml + 36 /* FIRMWARE_CTRL */, "FACS", false); - g_array_append_val(data->tables, table); + memcpy(&val, fadt_aml + 112 /* Flags */, 4); + val = le32_to_cpu(val); + if (!(val & 1UL << 20 /* HW_REDUCED_ACPI */)) { + acpi_fetch_table(data->qts, &table.aml, &table.aml_len, + fadt_aml + 36 /* FIRMWARE_CTRL */, 4, "FACS", false); + g_array_append_val(data->tables, table); + } + memcpy(&val, fadt_aml + dsdt_offset, 4); + val = le32_to_cpu(val); + if (!val) { + dsdt_offset = 140 /* X_DSDT */; + dsdt_entry_size = 8; + } acpi_fetch_table(data->qts, &table.aml, &table.aml_len, - fadt_aml + 40 /* DSDT */, "DSDT", true); + fadt_aml + dsdt_offset, dsdt_entry_size, "DSDT", true); g_array_append_val(data->tables, table); memset(fadt_aml + 36, 0, 4); /* sanitize FIRMWARE_CTRL ptr */ @@ -177,11 +195,14 @@ static void dump_aml_files(test_data *data, bool rebuild) sdt->aml, ext); fd = g_open(aml_file, O_WRONLY|O_TRUNC|O_CREAT, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH); + if (fd < 0) { + perror(aml_file); + } + g_assert(fd >= 0); } else { fd = g_file_open_tmp("aml-XXXXXX", &sdt->aml_file, &error); g_assert_no_error(error); } - g_assert(fd >= 0); ret = qemu_write_full(fd, sdt->aml, sdt->aml_len); g_assert(ret == sdt->aml_len); @@ -505,23 +526,44 @@ static void test_smbios_structs(test_data *data) static void test_acpi_one(const char *params, test_data *data) { char *args; - - /* Disable kernel irqchip to be able to override apic irq0. 
*/ - args = g_strdup_printf("-machine %s,accel=%s,kernel-irqchip=off " - "-net none -display none %s " - "-drive id=hd0,if=none,file=%s,format=raw " - "-device ide-hd,drive=hd0 ", - data->machine, "kvm:tcg", - params ? params : "", disk); + bool use_uefi = data->uefi_fl1 && data->uefi_fl2; + + if (use_uefi) { + /* + * TODO: convert '-drive if=pflash' to new syntax (see e33763be7cd3) + * when arm/virt boad starts to support it. + */ + args = g_strdup_printf("-machine %s,accel=%s -nodefaults -nographic " + "-drive if=pflash,format=raw,file=%s,readonly " + "-drive if=pflash,format=raw,file=%s,snapshot=on -cdrom %s %s", + data->machine, data->accel ? data->accel : "kvm:tcg", + data->uefi_fl1, data->uefi_fl2, data->cd, params ? params : ""); + + } else { + /* Disable kernel irqchip to be able to override apic irq0. */ + args = g_strdup_printf("-machine %s,accel=%s,kernel-irqchip=off " + "-net none -display none %s " + "-drive id=hd0,if=none,file=%s,format=raw " + "-device ide-hd,drive=hd0 ", + data->machine, data->accel ? data->accel : "kvm:tcg", + params ? 
params : "", disk); + } data->qts = qtest_init(args); - boot_sector_test(data->qts); + if (use_uefi) { + g_assert(data->scan_len); + data->rsdp_addr = acpi_find_rsdp_address_uefi(data->qts, + data->ram_start, data->scan_len); + } else { + boot_sector_test(data->qts); + data->rsdp_addr = acpi_find_rsdp_address(data->qts); + g_assert_cmphex(data->rsdp_addr, <, 0x100000); + } data->tables = g_array_new(false, true, sizeof(AcpiSdtTable)); - test_acpi_rsdp_address(data); test_acpi_rsdp_table(data); - test_acpi_rsdt_table(data); + test_acpi_rxsdt_table(data); test_acpi_fadt_table(data); if (iasl) { @@ -532,8 +574,15 @@ static void test_acpi_one(const char *params, test_data *data) } } - test_smbios_entry_point(data); - test_smbios_structs(data); + /* + * TODO: make SMBIOS tests work with UEFI firmware, + * Bug on uefi-test-tools to provide entry point: + * https://bugs.launchpad.net/qemu/+bug/1821884 + */ + if (!use_uefi) { + test_smbios_entry_point(data); + test_smbios_structs(data); + } assert(!global_qtest); qtest_quit(data->qts); @@ -769,13 +818,14 @@ int main(int argc, char *argv[]) const char *arch = qtest_get_arch(); int ret; - ret = boot_sector_init(disk); - if(ret) - return ret; - g_test_init(&argc, &argv, NULL); if (strcmp(arch, "i386") == 0 || strcmp(arch, "x86_64") == 0) { + ret = boot_sector_init(disk); + if (ret) { + return ret; + } + qtest_add_func("acpi/piix4", test_acpi_piix4_tcg); qtest_add_func("acpi/piix4/bridge", test_acpi_piix4_tcg_bridge); qtest_add_func("acpi/q35", test_acpi_q35_tcg); diff --git a/tests/data/acpi/rebuild-expected-aml.sh b/tests/data/acpi/rebuild-expected-aml.sh index abdff70a0d..ff7e62249d 100755 --- a/tests/data/acpi/rebuild-expected-aml.sh +++ b/tests/data/acpi/rebuild-expected-aml.sh @@ -7,21 +7,12 @@ # # Authors: # Marcel Apfelbaum <marcel.a@redhat.com> +# Igor Mammedov <imammedo@redhat.com> # # This work is licensed under the terms of the GNU GPLv2. # See the COPYING.LIB file in the top-level directory. 
-qemu= - -if [ -e x86_64-softmmu/qemu-system-x86_64 ]; then - qemu="x86_64-softmmu/qemu-system-x86_64" -elif [ -e i386-softmmu/qemu-system-i386 ]; then - qemu="i386-softmmu/qemu-system-i386" -else - echo "Run 'make' to build the qemu exectutable!" - echo "Run this script from the build directory." - exit 1; -fi +qemu_bins="x86_64-softmmu/qemu-system-x86_64" if [ ! -e "tests/bios-tables-test" ]; then echo "Test: bios-tables-test is required! Run make check before this script." @@ -29,6 +20,14 @@ if [ ! -e "tests/bios-tables-test" ]; then exit 1; fi -TEST_ACPI_REBUILD_AML=y QTEST_QEMU_BINARY=$qemu tests/bios-tables-test +for qemu in $qemu_bins; do + if [ ! -e $qemu ]; then + echo "Run 'make' to build the following QEMU executables: $qemu_bins" + echo "Also, run this script from the build directory." + exit 1; + fi + TEST_ACPI_REBUILD_AML=y QTEST_QEMU_BINARY=$qemu tests/bios-tables-test +done + echo "The files were rebuilt and can be added to git." diff --git a/tests/vmgenid-test.c b/tests/vmgenid-test.c index ae38ee5ac0..85d8e6463e 100644 --- a/tests/vmgenid-test.c +++ b/tests/vmgenid-test.c @@ -40,14 +40,14 @@ static uint32_t acpi_find_vgia(QTestState *qts) g_assert_cmphex(rsdp_offset, <, RSDP_ADDR_INVALID); - acpi_parse_rsdp_table(qts, rsdp_offset, rsdp_table); + acpi_fetch_rsdp_table(qts, rsdp_offset, rsdp_table); acpi_fetch_table(qts, &rsdt, &rsdt_len, &rsdp_table[16 /* RsdtAddress */], - "RSDT", true); + 4, "RSDT", true); ACPI_FOREACH_RSDT_ENTRY(rsdt, rsdt_len, ent, 4 /* Entry size */) { uint8_t *table_aml; - acpi_fetch_table(qts, &table_aml, &table_length, ent, NULL, true); + acpi_fetch_table(qts, &table_aml, &table_length, ent, 4, NULL, true); if (!memcmp(table_aml + 16 /* OEM Table ID */, "VMGENID", 7)) { uint32_t vgia_val; uint8_t *aml = &table_aml[36 /* AML byte-code start */]; |