aboutsummaryrefslogtreecommitdiff
path: root/docs
diff options
context:
space:
mode:
authorRichard Henderson <richard.henderson@linaro.org>2022-05-16 16:31:01 -0700
committerRichard Henderson <richard.henderson@linaro.org>2022-05-16 16:31:01 -0700
commiteec398119fc6911d99412c37af06a6bc27871f85 (patch)
treea315e93a78329d2d2de9b087e890024fc6b39d29 /docs
parentafdb415e67e13e8726edc21238c9883447b2c704 (diff)
parent6852c21db229c4bf4c1db772444bdfbbd027e5b8 (diff)
Merge tag 'for_upstream' of git://git.kernel.org/pub/scm/virt/kvm/mst/qemu into staging
virtio,pc,pci: fixes,cleanups,features most of CXL support fixes, cleanups all over the place Signed-off-by: Michael S. Tsirkin <mst@redhat.com> # -----BEGIN PGP SIGNATURE----- # # iQFDBAABCAAtFiEEXQn9CHHI+FuUyooNKB8NuNKNVGkFAmKCuLIPHG1zdEByZWRo # YXQuY29tAAoJECgfDbjSjVRpdDUH/12SmWaAo+0+SdIHgWFFxsmg3t/EdcO38fgi # MV+GpYdbp6TlU3jdQhrMZYmFdkVVydBdxk93ujCLbFS0ixTsKj31j0IbZMfdcGgv # SLqnV+E3JdHqnGP39q9a9rdwYWyqhkgHoldxilIFW76ngOSapaZVvnwnOMAMkf77 # 1LieL4/Xq7N9Ho86Zrs3IczQcf0czdJRDaFaSIu8GaHl8ELyuPhlSm6CSqqrEEWR # PA/COQsLDbLOMxbfCi5v88r5aaxmGNZcGbXQbiH9qVHw65nlHyLH9UkNTdJn1du1 # f2GYwwa7eekfw/LCvvVwxO1znJrj02sfFai7aAtQYbXPvjvQiqA= # =xdSk # -----END PGP SIGNATURE----- # gpg: Signature made Mon 16 May 2022 01:48:50 PM PDT # gpg: using RSA key 5D09FD0871C8F85B94CA8A0D281F0DB8D28D5469 # gpg: issuer "mst@redhat.com" # gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>" [undefined] # gpg: aka "Michael S. Tsirkin <mst@redhat.com>" [undefined] # gpg: WARNING: This key is not certified with a trusted signature! # gpg: There is no indication that the signature belongs to the owner. 
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17 0970 C350 3912 AFBE 8E67 # Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA 8A0D 281F 0DB8 D28D 5469 * tag 'for_upstream' of git://git.kernel.org/pub/scm/virt/kvm/mst/qemu: (86 commits) vhost-user-scsi: avoid unlink(NULL) with fd passing virtio-net: don't handle mq request in userspace handler for vhost-vdpa vhost-vdpa: change name and polarity for vhost_vdpa_one_time_request() vhost-vdpa: backend feature should set only once vhost-net: fix improper cleanup in vhost_net_start vhost-vdpa: fix improper cleanup in net_init_vhost_vdpa virtio-net: align ctrl_vq index for non-mq guest for vhost_vdpa virtio-net: setup vhost_dev and notifiers for cvq only when feature is negotiated hw/i386/amd_iommu: Fix IOMMU event log encoding errors hw/i386: Make pic a property of common x86 base machine type hw/i386: Make pit a property of common x86 base machine type include/hw/pci/pcie_host: Correct PCIE_MMCFG_SIZE_MAX include/hw/pci/pcie_host: Correct PCIE_MMCFG_BUS_MASK docs/vhost-user: Clarifications for VHOST_USER_ADD/REM_MEM_REG vhost-user: more master/slave things virtio: add vhost support for virtio devices virtio: drop name parameter for virtio_init() virtio/vhost-user: dynamically assign VhostUserHostNotifiers hw/virtio/vhost-user: don't suppress F_CONFIG when supported include/hw: start documenting the vhost API ... Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Diffstat (limited to 'docs')
-rw-r--r--docs/devel/index-internals.rst1
-rw-r--r--docs/devel/virtio-backends.rst214
-rw-r--r--docs/interop/vhost-user-gpu.rst10
-rw-r--r--docs/interop/vhost-user.rst579
-rw-r--r--docs/system/device-emulation.rst1
-rw-r--r--docs/system/devices/cxl.rst302
6 files changed, 839 insertions, 268 deletions
diff --git a/docs/devel/index-internals.rst b/docs/devel/index-internals.rst
index a50889c556..e1a93df263 100644
--- a/docs/devel/index-internals.rst
+++ b/docs/devel/index-internals.rst
@@ -18,3 +18,4 @@ Details about QEMU's various subsystems including how to add features to them.
tracing
vfio-migration
writing-monitor-commands
+ virtio-backends
diff --git a/docs/devel/virtio-backends.rst b/docs/devel/virtio-backends.rst
new file mode 100644
index 0000000000..9ff092e7a0
--- /dev/null
+++ b/docs/devel/virtio-backends.rst
@@ -0,0 +1,214 @@
+..
+ Copyright (c) 2022, Linaro Limited
+ Written by Alex Bennée
+
+Writing VirtIO backends for QEMU
+================================
+
+This document attempts to outline the information a developer needs to
+know to write device emulations in QEMU. It is specifically focused on
+implementing VirtIO devices. For VirtIO the frontend is the driver
+running on the guest. The backend is everything that QEMU needs to
+do to handle the emulation of the VirtIO device. This can be done
+entirely in QEMU, divided between QEMU and the kernel (vhost) or
+handled by a separate process which is configured by QEMU
+(vhost-user).
+
+VirtIO Transports
+-----------------
+
+VirtIO supports a number of different transports. While the details of
+the configuration and operation of the device will generally be the
+same QEMU represents them as different devices depending on the
+transport they use. For example -device virtio-foo represents the foo
+device using mmio and -device virtio-foo-pci is the same class of
+device using the PCI transport.
+
+Using the QEMU Object Model (QOM)
+---------------------------------
+
+Generally all devices in QEMU are subclasses of ``TYPE_DEVICE``
+however VirtIO devices should be based on ``TYPE_VIRTIO_DEVICE`` which
+itself is derived from the base class. For example:
+
+.. code:: c
+
+ static const TypeInfo virtio_blk_info = {
+ .name = TYPE_VIRTIO_BLK,
+ .parent = TYPE_VIRTIO_DEVICE,
+ .instance_size = sizeof(VirtIOBlock),
+ .instance_init = virtio_blk_instance_init,
+ .class_init = virtio_blk_class_init,
+ };
+
+The author may decide to have a more expansive class hierarchy to
+support multiple device types. For example the Virtio GPU device:
+
+.. code:: c
+
+ static const TypeInfo virtio_gpu_base_info = {
+ .name = TYPE_VIRTIO_GPU_BASE,
+ .parent = TYPE_VIRTIO_DEVICE,
+ .instance_size = sizeof(VirtIOGPUBase),
+ .class_size = sizeof(VirtIOGPUBaseClass),
+ .class_init = virtio_gpu_base_class_init,
+ .abstract = true
+ };
+
+ static const TypeInfo vhost_user_gpu_info = {
+ .name = TYPE_VHOST_USER_GPU,
+ .parent = TYPE_VIRTIO_GPU_BASE,
+ .instance_size = sizeof(VhostUserGPU),
+ .instance_init = vhost_user_gpu_instance_init,
+ .instance_finalize = vhost_user_gpu_instance_finalize,
+ .class_init = vhost_user_gpu_class_init,
+ };
+
+ static const TypeInfo virtio_gpu_info = {
+ .name = TYPE_VIRTIO_GPU,
+ .parent = TYPE_VIRTIO_GPU_BASE,
+ .instance_size = sizeof(VirtIOGPU),
+ .class_size = sizeof(VirtIOGPUClass),
+ .class_init = virtio_gpu_class_init,
+ };
+
+defines a base class for the VirtIO GPU and then specialises two
+versions, one for the internal implementation and the other for the
+vhost-user version.
+
+VirtIOPCIProxy
+^^^^^^^^^^^^^^
+
+[AJB: the following is supposition and welcomes more informed
+opinions]
+
+Probably due to legacy from the pre-QOM days PCI VirtIO devices don't
+follow the normal hierarchy. Instead a standalone object is based
+on the VirtIOPCIProxy class and the specific VirtIO instance is
+manually instantiated:
+
+.. code:: c
+
+ /*
+ * virtio-blk-pci: This extends VirtioPCIProxy.
+ */
+ #define TYPE_VIRTIO_BLK_PCI "virtio-blk-pci-base"
+ DECLARE_INSTANCE_CHECKER(VirtIOBlkPCI, VIRTIO_BLK_PCI,
+ TYPE_VIRTIO_BLK_PCI)
+
+ struct VirtIOBlkPCI {
+ VirtIOPCIProxy parent_obj;
+ VirtIOBlock vdev;
+ };
+
+ static Property virtio_blk_pci_properties[] = {
+ DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0),
+ DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags,
+ VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true),
+ DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors,
+ DEV_NVECTORS_UNSPECIFIED),
+ DEFINE_PROP_END_OF_LIST(),
+ };
+
+ static void virtio_blk_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+ {
+ VirtIOBlkPCI *dev = VIRTIO_BLK_PCI(vpci_dev);
+ DeviceState *vdev = DEVICE(&dev->vdev);
+
+ ...
+
+ qdev_realize(vdev, BUS(&vpci_dev->bus), errp);
+ }
+
+ static void virtio_blk_pci_class_init(ObjectClass *klass, void *data)
+ {
+ DeviceClass *dc = DEVICE_CLASS(klass);
+ VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+ PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+
+ set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+ device_class_set_props(dc, virtio_blk_pci_properties);
+ k->realize = virtio_blk_pci_realize;
+ pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+ pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_BLOCK;
+ pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
+ pcidev_k->class_id = PCI_CLASS_STORAGE_SCSI;
+ }
+
+ static void virtio_blk_pci_instance_init(Object *obj)
+ {
+ VirtIOBlkPCI *dev = VIRTIO_BLK_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VIRTIO_BLK);
+ object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev),
+ "bootindex");
+ }
+
+ static const VirtioPCIDeviceTypeInfo virtio_blk_pci_info = {
+ .base_name = TYPE_VIRTIO_BLK_PCI,
+ .generic_name = "virtio-blk-pci",
+ .transitional_name = "virtio-blk-pci-transitional",
+ .non_transitional_name = "virtio-blk-pci-non-transitional",
+ .instance_size = sizeof(VirtIOBlkPCI),
+ .instance_init = virtio_blk_pci_instance_init,
+ .class_init = virtio_blk_pci_class_init,
+ };
+
+Here you can see the instance_init has to manually instantiate the
+underlying ``TYPE_VIRTIO_BLK`` object and link an alias for one of
+its properties to the PCI device.
+
+
+Back End Implementations
+------------------------
+
+There are a number of places where the implementation of the backend
+can be done:
+
+* in QEMU itself
+* in the host kernel (a.k.a. vhost)
+* in a separate process (a.k.a. vhost-user)
+
+vhost_ops vs TYPE_VHOST_USER_BACKEND
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+There are two choices for how to implement vhost code. Most of the code
+which has to work with either vhost or vhost-user uses
+``vhost_dev_init()`` to instantiate the appropriate backend. This
+means including a ``struct vhost_dev`` in the main object structure.
+
+For vhost-user devices you also need to add code to track the
+initialisation of the ``chardev`` device used for the control socket
+between QEMU and the external vhost-user process.
+
+If you only need to implement a vhost-user backend the other option is
+to use a QOM-ified version of vhost-user.
+
+.. code:: c
+
+ static void
+ vhost_user_gpu_instance_init(Object *obj)
+ {
+ VhostUserGPU *g = VHOST_USER_GPU(obj);
+
+ g->vhost = VHOST_USER_BACKEND(object_new(TYPE_VHOST_USER_BACKEND));
+ object_property_add_alias(obj, "chardev",
+ OBJECT(g->vhost), "chardev");
+ }
+
+ static const TypeInfo vhost_user_gpu_info = {
+ .name = TYPE_VHOST_USER_GPU,
+ .parent = TYPE_VIRTIO_GPU_BASE,
+ .instance_size = sizeof(VhostUserGPU),
+ .instance_init = vhost_user_gpu_instance_init,
+ .instance_finalize = vhost_user_gpu_instance_finalize,
+ .class_init = vhost_user_gpu_class_init,
+ };
+
+Using it this way entails adding a ``struct VhostUserBackend`` to your
+core object structure and manually instantiating the backend. This
+sub-structure tracks both the ``vhost_dev`` and ``CharDev`` types
+needed for the connection. Instead of calling ``vhost_dev_init`` you
+would call ``vhost_user_backend_dev_init`` which does what is needed
+on your behalf.
diff --git a/docs/interop/vhost-user-gpu.rst b/docs/interop/vhost-user-gpu.rst
index 71a2c52b31..1640553729 100644
--- a/docs/interop/vhost-user-gpu.rst
+++ b/docs/interop/vhost-user-gpu.rst
@@ -13,10 +13,10 @@ Introduction
============
The vhost-user-gpu protocol is aiming at sharing the rendering result
-of a virtio-gpu, done from a vhost-user slave process to a vhost-user
-master process (such as QEMU). It bears a resemblance to a display
+of a virtio-gpu, done from a vhost-user back-end process to a vhost-user
+front-end process (such as QEMU). It bears a resemblance to a display
server protocol, if you consider QEMU as the display server and the
-slave as the client, but in a very limited way. Typically, it will
+back-end as the client, but in a very limited way. Typically, it will
work by setting a scanout/display configuration, before sending flush
events for the display updates. It will also update the cursor shape
and position.
@@ -26,8 +26,8 @@ socket ancillary data to share opened file descriptors (DMABUF fds or
shared memory). The socket is usually obtained via
``VHOST_USER_GPU_SET_SOCKET``.
-Requests are sent by the *slave*, and the optional replies by the
-*master*.
+Requests are sent by the *back-end*, and the optional replies by the
+*front-end*.
Wire format
===========
diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst
index f9e721ba5f..a99ba4433c 100644
--- a/docs/interop/vhost-user.rst
+++ b/docs/interop/vhost-user.rst
@@ -23,19 +23,19 @@ space process on the same host. It uses communication over a Unix
domain socket to share file descriptors in the ancillary data of the
message.
-The protocol defines 2 sides of the communication, *master* and
-*slave*. *Master* is the application that shares its virtqueues, in
-our case QEMU. *Slave* is the consumer of the virtqueues.
+The protocol defines 2 sides of the communication, *front-end* and
+*back-end*. The *front-end* is the application that shares its virtqueues, in
+our case QEMU. The *back-end* is the consumer of the virtqueues.
-In the current implementation QEMU is the *master*, and the *slave* is
-the external process consuming the virtio queues, for example a
+In the current implementation QEMU is the *front-end*, and the *back-end*
+is the external process consuming the virtio queues, for example a
software Ethernet switch running in user space, such as Snabbswitch,
-or a block device backend processing read & write to a virtual
-disk. In order to facilitate interoperability between various backend
+or a block device back-end processing read & write to a virtual
+disk. In order to facilitate interoperability between various back-end
implementations, it is recommended to follow the :ref:`Backend program
conventions <backend_conventions>`.
-*Master* and *slave* can be either a client (i.e. connecting) or
+The *front-end* and *back-end* can be either a client (i.e. connecting) or
server (listening) in the socket communication.
Support for platforms other than Linux
@@ -77,7 +77,7 @@ Header
:flags: 32-bit bit field
- Lower 2 bits are the version (currently 0x01)
-- Bit 2 is the reply flag - needs to be sent on each reply from the slave
+- Bit 2 is the reply flag - needs to be sent on each reply from the back-end
- Bit 3 is the need_reply flag - see :ref:`REPLY_ACK <reply_ack>` for
details.
@@ -222,8 +222,8 @@ Virtio device config space
:size: a 32-bit configuration space access size in bytes
:flags: a 32-bit value:
- - 0: Vhost master messages used for writeable fields
- - 1: Vhost master messages used for live migration
+ - 0: Vhost front-end messages used for writeable fields
+ - 1: Vhost front-end messages used for live migration
:payload: Size bytes array holding the contents of the virtio
device's configuration space
@@ -290,8 +290,8 @@ vhost for the Linux Kernel. Most messages that can be sent via the
Unix domain socket implementing vhost-user have an equivalent ioctl to
the kernel implementation.
-The communication consists of *master* sending message requests and
-*slave* sending message replies. Most of the requests don't require
+The communication consists of the *front-end* sending message requests and
+the *back-end* sending message replies. Most of the requests don't require
replies. Here is a list of the ones that do:
* ``VHOST_USER_GET_FEATURES``
@@ -305,7 +305,7 @@ replies. Here is a list of the ones that do:
:ref:`REPLY_ACK <reply_ack>`
The section on ``REPLY_ACK`` protocol extension.
-There are several messages that the master sends with file descriptors passed
+There are several messages that the front-end sends with file descriptors passed
in the ancillary data:
* ``VHOST_USER_ADD_MEM_REG``
@@ -318,100 +318,108 @@ in the ancillary data:
* ``VHOST_USER_SET_SLAVE_REQ_FD``
* ``VHOST_USER_SET_INFLIGHT_FD`` (if ``VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD``)
-If *master* is unable to send the full message or receives a wrong
+If *front-end* is unable to send the full message or receives a wrong
reply it will close the connection. An optional reconnection mechanism
can be implemented.
-If *slave* detects some error such as incompatible features, it may also
+If *back-end* detects some error such as incompatible features, it may also
close the connection. This should only happen in exceptional circumstances.
Any protocol extensions are gated by protocol feature bits, which
-allows full backwards compatibility on both master and slave. As
-older slaves don't support negotiating protocol features, a feature
+allows full backwards compatibility on both front-end and back-end. As
+older back-ends don't support negotiating protocol features, a feature
bit was dedicated for this purpose::
#define VHOST_USER_F_PROTOCOL_FEATURES 30
-Starting and stopping rings
----------------------------
+Note that VHOST_USER_F_PROTOCOL_FEATURES is the UNUSED (30) feature
+bit defined in `VIRTIO 1.1 6.3 Legacy Interface: Reserved Feature Bits
+<https://docs.oasis-open.org/virtio/virtio/v1.1/cs01/virtio-v1.1-cs01.html#x1-4130003>`_.
+VIRTIO devices do not advertise this feature bit and therefore VIRTIO
+drivers cannot negotiate it.
-Client must only process each ring when it is started.
+This reserved feature bit was reused by the vhost-user protocol to add
+vhost-user protocol feature negotiation in a backwards compatible
+fashion. Old vhost-user front-end and back-end implementations continue to
+work even though they are not aware of vhost-user protocol feature
+negotiation.
-Client must only pass data between the ring and the backend, when the
-ring is enabled.
+Ring states
+-----------
-If ring is started but disabled, client must process the ring without
-talking to the backend.
+Rings can be in one of three states:
-For example, for a networking device, in the disabled state client
-must not supply any new RX packets, but must process and discard any
-TX packets.
+* stopped: the back-end must not process the ring at all.
-If ``VHOST_USER_F_PROTOCOL_FEATURES`` has not been negotiated, the
-ring is initialized in an enabled state.
+* started but disabled: the back-end must process the ring without
+ causing any side effects. For example, for a networking device,
+ in the disabled state the back-end must not supply any new RX packets,
+ but must process and discard any TX packets.
-If ``VHOST_USER_F_PROTOCOL_FEATURES`` has been negotiated, the ring is
-initialized in a disabled state. Client must not pass data to/from the
-backend until ring is enabled by ``VHOST_USER_SET_VRING_ENABLE`` with
-parameter 1, or after it has been disabled by
-``VHOST_USER_SET_VRING_ENABLE`` with parameter 0.
+* started and enabled.
-Each ring is initialized in a stopped state, client must not process
-it until ring is started, or after it has been stopped.
+Each ring is initialized in a stopped state. The back-end must start
+ring upon receiving a kick (that is, detecting that file descriptor is
+readable) on the descriptor specified by ``VHOST_USER_SET_VRING_KICK``
+or receiving the in-band message ``VHOST_USER_VRING_KICK`` if negotiated,
+and stop ring upon receiving ``VHOST_USER_GET_VRING_BASE``.
-Client must start ring upon receiving a kick (that is, detecting that
-file descriptor is readable) on the descriptor specified by
-``VHOST_USER_SET_VRING_KICK`` or receiving the in-band message
-``VHOST_USER_VRING_KICK`` if negotiated, and stop ring upon receiving
-``VHOST_USER_GET_VRING_BASE``.
+Rings can be enabled or disabled by ``VHOST_USER_SET_VRING_ENABLE``.
-While processing the rings (whether they are enabled or not), client
+If ``VHOST_USER_F_PROTOCOL_FEATURES`` has not been negotiated, the
+ring starts directly in the enabled state.
+
+If ``VHOST_USER_F_PROTOCOL_FEATURES`` has been negotiated, the ring is
+initialized in a disabled state and is enabled by
+``VHOST_USER_SET_VRING_ENABLE`` with parameter 1.
+
+While processing the rings (whether they are enabled or not), the back-end
must support changing some configuration aspects on the fly.
Multiple queue support
----------------------
-Many devices have a fixed number of virtqueues. In this case the master
+Many devices have a fixed number of virtqueues. In this case the front-end
already knows the number of available virtqueues without communicating with the
-slave.
+back-end.
Some devices do not have a fixed number of virtqueues. Instead the maximum
-number of virtqueues is chosen by the slave. The number can depend on host
-resource availability or slave implementation details. Such devices are called
+number of virtqueues is chosen by the back-end. The number can depend on host
+resource availability or back-end implementation details. Such devices are called
multiple queue devices.
-Multiple queue support allows the slave to advertise the maximum number of
-queues. This is treated as a protocol extension, hence the slave has to
+Multiple queue support allows the back-end to advertise the maximum number of
+queues. This is treated as a protocol extension, hence the back-end has to
implement protocol features first. The multiple queues feature is supported
only when the protocol feature ``VHOST_USER_PROTOCOL_F_MQ`` (bit 0) is set.
-The max number of queues the slave supports can be queried with message
-``VHOST_USER_GET_QUEUE_NUM``. Master should stop when the number of requested
+The max number of queues the back-end supports can be queried with message
+``VHOST_USER_GET_QUEUE_NUM``. Front-end should stop when the number of requested
queues is bigger than that.
-As all queues share one connection, the master uses a unique index for each
+As all queues share one connection, the front-end uses a unique index for each
queue in the sent message to identify a specified queue.
-The master enables queues by sending message ``VHOST_USER_SET_VRING_ENABLE``.
+The front-end enables queues by sending message ``VHOST_USER_SET_VRING_ENABLE``.
vhost-user-net has historically automatically enabled the first queue pair.
-Slaves should always implement the ``VHOST_USER_PROTOCOL_F_MQ`` protocol
+Back-ends should always implement the ``VHOST_USER_PROTOCOL_F_MQ`` protocol
feature, even for devices with a fixed number of virtqueues, since it is simple
to implement and offers a degree of introspection.
-Masters must not rely on the ``VHOST_USER_PROTOCOL_F_MQ`` protocol feature for
+Front-ends must not rely on the ``VHOST_USER_PROTOCOL_F_MQ`` protocol feature for
devices with a fixed number of virtqueues. Only true multiqueue devices
require this protocol feature.
Migration
---------
-During live migration, the master may need to track the modifications
-the slave makes to the memory mapped regions. The client should mark
+During live migration, the front-end may need to track the modifications
+the back-end makes to the memory mapped regions. The back-end should mark
the dirty pages in a log. Once it complies to this logging, it may
declare the ``VHOST_F_LOG_ALL`` vhost feature.
-To start/stop logging of data/used ring writes, server may send
+To start/stop logging of data/used ring writes, the front-end may send
messages ``VHOST_USER_SET_FEATURES`` with ``VHOST_F_LOG_ALL`` and
``VHOST_USER_SET_VRING_ADDR`` with ``VHOST_VRING_F_LOG`` in ring's
flags set to 1/0, respectively.
@@ -425,7 +433,7 @@ Dirty pages are of size::
#define VHOST_LOG_PAGE 0x1000
The log memory fd is provided in the ancillary data of
-``VHOST_USER_SET_LOG_BASE`` message when the slave has
+``VHOST_USER_SET_LOG_BASE`` message when the back-end has
``VHOST_USER_PROTOCOL_F_LOG_SHMFD`` protocol feature.
The size of the log is supplied as part of ``VhostUserMsg`` which
@@ -451,26 +459,26 @@ the bit offset of the last byte of the ring must fall within the size
supplied by ``VhostUserLog``.
``VHOST_USER_SET_LOG_FD`` is an optional message with an eventfd in
-ancillary data, it may be used to inform the master that the log has
+ancillary data, it may be used to inform the front-end that the log has
been modified.
Once the source has finished migration, rings will be stopped by the
source. No further update must be done before rings are restarted.
-In postcopy migration the slave is started before all the memory has
+In postcopy migration the back-end is started before all the memory has
been received from the source host, and care must be taken to avoid
-accessing pages that have yet to be received. The slave opens a
+accessing pages that have yet to be received. The back-end opens a
'userfault'-fd and registers the memory with it; this fd is then
-passed back over to the master. The master services requests on the
+passed back over to the front-end. The front-end services requests on the
userfaultfd for pages that are accessed and when the page is available
it performs WAKE ioctl's on the userfaultfd to wake the stalled
-slave. The client indicates support for this via the
+back-end. The front-end indicates support for this via the
``VHOST_USER_PROTOCOL_F_PAGEFAULT`` feature.
Memory access
-------------
-The master sends a list of vhost memory regions to the slave using the
+The front-end sends a list of vhost memory regions to the back-end using the
``VHOST_USER_SET_MEM_TABLE`` message. Each region has two base
addresses: a guest address and a user address.
@@ -495,60 +503,60 @@ IOMMU support
-------------
When the ``VIRTIO_F_IOMMU_PLATFORM`` feature has been negotiated, the
-master sends IOTLB entries update & invalidation by sending
-``VHOST_USER_IOTLB_MSG`` requests to the slave with a ``struct
+front-end sends IOTLB entries update & invalidation by sending
+``VHOST_USER_IOTLB_MSG`` requests to the back-end with a ``struct
vhost_iotlb_msg`` as payload. For update events, the ``iotlb`` payload
has to be filled with the update message type (2), the I/O virtual
address, the size, the user virtual address, and the permissions
flags. Addresses and size must be within vhost memory regions set via
the ``VHOST_USER_SET_MEM_TABLE`` request. For invalidation events, the
``iotlb`` payload has to be filled with the invalidation message type
-(3), the I/O virtual address and the size. On success, the slave is
+(3), the I/O virtual address and the size. On success, the back-end is
expected to reply with a zero payload, non-zero otherwise.
-The slave relies on the slave communication channel (see :ref:`Slave
-communication <slave_communication>` section below) to send IOTLB miss
+The back-end relies on the back-end communication channel (see :ref:`Back-end
+communication <backend_communication>` section below) to send IOTLB miss
and access failure events, by sending ``VHOST_USER_SLAVE_IOTLB_MSG``
-requests to the master with a ``struct vhost_iotlb_msg`` as
+requests to the front-end with a ``struct vhost_iotlb_msg`` as
payload. For miss events, the iotlb payload has to be filled with the
miss message type (1), the I/O virtual address and the permissions
flags. For access failure event, the iotlb payload has to be filled
with the access failure message type (4), the I/O virtual address and
-the permissions flags. For synchronization purpose, the slave may
-rely on the reply-ack feature, so the master may send a reply when
+the permissions flags. For synchronization purpose, the back-end may
+rely on the reply-ack feature, so the front-end may send a reply when
operation is completed if the reply-ack feature is negotiated and
-slaves requests a reply. For miss events, completed operation means
-either master sent an update message containing the IOTLB entry
-containing requested address and permission, or master sent nothing if
+the back-end requests a reply. For miss events, completed operation means
+either front-end sent an update message containing the IOTLB entry
+containing requested address and permission, or front-end sent nothing if
the IOTLB miss message is invalid (invalid IOVA or permission).
-The master isn't expected to take the initiative to send IOTLB update
-messages, as the slave sends IOTLB miss messages for the guest virtual
+The front-end isn't expected to take the initiative to send IOTLB update
+messages, as the back-end sends IOTLB miss messages for the guest virtual
memory areas it needs to access.
-.. _slave_communication:
+.. _backend_communication:
-Slave communication
--------------------
+Back-end communication
+----------------------
-An optional communication channel is provided if the slave declares
+An optional communication channel is provided if the back-end declares
``VHOST_USER_PROTOCOL_F_SLAVE_REQ`` protocol feature, to allow the
-slave to make requests to the master.
+back-end to make requests to the front-end.
The fd is provided via ``VHOST_USER_SET_SLAVE_REQ_FD`` ancillary data.
-A slave may then send ``VHOST_USER_SLAVE_*`` messages to the master
+A back-end may then send ``VHOST_USER_SLAVE_*`` messages to the front-end
using this fd communication channel.
If ``VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD`` protocol feature is
-negotiated, slave can send file descriptors (at most 8 descriptors in
-each message) to master via ancillary data using this fd communication
+negotiated, back-end can send file descriptors (at most 8 descriptors in
+each message) to front-end via ancillary data using this fd communication
channel.
Inflight I/O tracking
---------------------
-To support reconnecting after restart or crash, slave may need to
+To support reconnecting after restart or crash, back-end may need to
resubmit inflight I/Os. If virtqueue is processed in order, we can
easily achieve that by getting the inflight descriptors from
descriptor table (split virtqueue) or descriptor ring (packed
@@ -556,18 +564,18 @@ virtqueue). However, it can't work when we process descriptors
out-of-order because some entries which store the information of
inflight descriptors in available ring (split virtqueue) or descriptor
ring (packed virtqueue) might be overridden by new entries. To solve
-this problem, slave need to allocate an extra buffer to store this
-information of inflight descriptors and share it with master for
+this problem, the back-end needs to allocate an extra buffer to store this
+information of inflight descriptors and share it with front-end for
persistent. ``VHOST_USER_GET_INFLIGHT_FD`` and
``VHOST_USER_SET_INFLIGHT_FD`` are used to transfer this buffer
-between master and slave. And the format of this buffer is described
+between front-end and back-end. And the format of this buffer is described
below:
+---------------+---------------+-----+---------------+
| queue0 region | queue1 region | ... | queueN region |
+---------------+---------------+-----+---------------+
-N is the number of available virtqueues. Slave could get it from num
+N is the number of available virtqueues. The back-end could get it from num
queues field of ``VhostUserInflight``.
For split virtqueue, queue region can be implemented as:
@@ -599,8 +607,8 @@ For split virtqueue, queue region can be implemented as:
* Zero value indicates an uninitialized buffer */
uint16_t version;
- /* The size of DescStateSplit array. It's equal to the virtqueue
- * size. Slave could get it from queue size field of VhostUserInflight. */
+ /* The size of DescStateSplit array. It's equal to the virtqueue size.
+ * The back-end could get it from queue size field of VhostUserInflight. */
uint16_t desc_num;
/* The head of list that track the last batch of used descriptors. */
@@ -706,8 +714,8 @@ For packed virtqueue, queue region can be implemented as:
* Zero value indicates an uninitialized buffer */
uint16_t version;
- /* The size of DescStatePacked array. It's equal to the virtqueue
- * size. Slave could get it from queue size field of VhostUserInflight. */
+ /* The size of DescStatePacked array. It's equal to the virtqueue size.
+ * The back-end could get it from queue size field of VhostUserInflight. */
uint16_t desc_num;
/* The head of free DescStatePacked entry list */
@@ -799,7 +807,7 @@ When reconnecting:
#. Use ``old_used_wrap_counter`` to calculate the available flags
#. If ``d.flags`` is not equal to the calculated flags value (means
- slave has submitted the buffer to guest driver before crash, so
+ back-end has submitted the buffer to guest driver before crash, so
it has to commit the in-progress update), set ``old_free_head``,
``old_used_idx``, ``old_used_wrap_counter`` to ``free_head``,
``used_idx``, ``used_wrap_counter``
@@ -828,11 +836,11 @@ cause the sending application(s) to block, it is not advised to use
this feature unless absolutely necessary. It is also considered an
error to negotiate this feature without also negotiating
``VHOST_USER_PROTOCOL_F_SLAVE_REQ`` and ``VHOST_USER_PROTOCOL_F_REPLY_ACK``,
-the former is necessary for getting a message channel from the slave
-to the master, while the latter needs to be used with the in-band
+the former is necessary for getting a message channel from the back-end
+to the front-end, while the latter needs to be used with the in-band
notification messages to block until they are processed, both to avoid
blocking later and for proper processing (at least in the simulation
-use case.) As it has no other way of signalling this error, the slave
+use case.) As it has no other way of signalling this error, the back-end
should close the connection as a response to a
``VHOST_USER_SET_PROTOCOL_FEATURES`` message that sets the in-band
notifications feature flag without the other two.
@@ -860,95 +868,101 @@ Protocol features
#define VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS 15
#define VHOST_USER_PROTOCOL_F_STATUS 16
-Master message types
---------------------
+Front-end message types
+-----------------------
``VHOST_USER_GET_FEATURES``
:id: 1
:equivalent ioctl: ``VHOST_GET_FEATURES``
- :master payload: N/A
- :slave payload: ``u64``
+ :request payload: N/A
+ :reply payload: ``u64``
Get from the underlying vhost implementation the features bitmask.
- Feature bit ``VHOST_USER_F_PROTOCOL_FEATURES`` signals slave support
+ Feature bit ``VHOST_USER_F_PROTOCOL_FEATURES`` signals back-end support
for ``VHOST_USER_GET_PROTOCOL_FEATURES`` and
``VHOST_USER_SET_PROTOCOL_FEATURES``.
``VHOST_USER_SET_FEATURES``
:id: 2
:equivalent ioctl: ``VHOST_SET_FEATURES``
- :master payload: ``u64``
+ :request payload: ``u64``
+ :reply payload: N/A
Enable features in the underlying vhost implementation using a
bitmask. Feature bit ``VHOST_USER_F_PROTOCOL_FEATURES`` signals
- slave support for ``VHOST_USER_GET_PROTOCOL_FEATURES`` and
+ back-end support for ``VHOST_USER_GET_PROTOCOL_FEATURES`` and
``VHOST_USER_SET_PROTOCOL_FEATURES``.
``VHOST_USER_GET_PROTOCOL_FEATURES``
:id: 15
:equivalent ioctl: ``VHOST_GET_FEATURES``
- :master payload: N/A
- :slave payload: ``u64``
+ :request payload: N/A
+ :reply payload: ``u64``
Get the protocol feature bitmask from the underlying vhost
implementation. Only legal if feature bit
``VHOST_USER_F_PROTOCOL_FEATURES`` is present in
- ``VHOST_USER_GET_FEATURES``.
+ ``VHOST_USER_GET_FEATURES``. It does not need to be acknowledged by
+ ``VHOST_USER_SET_FEATURES``.
.. Note::
- Slave that reported ``VHOST_USER_F_PROTOCOL_FEATURES`` must
+ Back-ends that report ``VHOST_USER_F_PROTOCOL_FEATURES`` must
support this message even before ``VHOST_USER_SET_FEATURES`` was
called.
``VHOST_USER_SET_PROTOCOL_FEATURES``
:id: 16
:equivalent ioctl: ``VHOST_SET_FEATURES``
- :master payload: ``u64``
+ :request payload: ``u64``
+ :reply payload: N/A
Enable protocol features in the underlying vhost implementation.
Only legal if feature bit ``VHOST_USER_F_PROTOCOL_FEATURES`` is present in
- ``VHOST_USER_GET_FEATURES``.
+ ``VHOST_USER_GET_FEATURES``. It does not need to be acknowledged by
+ ``VHOST_USER_SET_FEATURES``.
.. Note::
- Slave that reported ``VHOST_USER_F_PROTOCOL_FEATURES`` must support
+ Back-ends that report ``VHOST_USER_F_PROTOCOL_FEATURES`` must support
this message even before ``VHOST_USER_SET_FEATURES`` was called.
``VHOST_USER_SET_OWNER``
:id: 3
:equivalent ioctl: ``VHOST_SET_OWNER``
- :master payload: N/A
+ :request payload: N/A
+ :reply payload: N/A
- Issued when a new connection is established. It sets the current
- *master* as an owner of the session. This can be used on the *slave*
+ Issued when a new connection is established. It marks the sender
+ as the front-end that owns the session. This can be used on the *back-end*
as a "session start" flag.
``VHOST_USER_RESET_OWNER``
:id: 4
- :master payload: N/A
+ :request payload: N/A
+ :reply payload: N/A
.. admonition:: Deprecated
This is no longer used. Used to be sent to request disabling all
- rings, but some clients interpreted it to also discard connection
+ rings, but some back-ends interpreted it to also discard connection
state (this interpretation would lead to bugs). It is recommended
- that clients either ignore this message, or use it to disable all
+ that back-ends either ignore this message, or use it to disable all
rings.
``VHOST_USER_SET_MEM_TABLE``
:id: 5
:equivalent ioctl: ``VHOST_SET_MEM_TABLE``
- :master payload: memory regions description
- :slave payload: (postcopy only) memory regions description
+ :request payload: memory regions description
+ :reply payload: (postcopy only) memory regions description
- Sets the memory map regions on the slave so it can translate the
+ Sets the memory map regions on the back-end so it can translate the
vring addresses. In the ancillary data there is an array of file
descriptors for each memory mapped region. The size and ordering of
the fds matches the number and ordering of memory regions.
When ``VHOST_USER_POSTCOPY_LISTEN`` has been received,
``SET_MEM_TABLE`` replies with the bases of the memory mapped
- regions to the master. The slave must have mmap'd the regions but
+ regions to the front-end. The back-end must have mmap'd the regions but
not yet accessed them and should not yet generate a userfault
event.
@@ -962,12 +976,12 @@ Master message types
``VHOST_USER_SET_LOG_BASE``
:id: 6
:equivalent ioctl: ``VHOST_SET_LOG_BASE``
- :master payload: u64
- :slave payload: N/A
+ :request payload: u64
+ :reply payload: N/A
Sets logging shared memory space.
- When slave has ``VHOST_USER_PROTOCOL_F_LOG_SHMFD`` protocol feature,
+ When the back-end has ``VHOST_USER_PROTOCOL_F_LOG_SHMFD`` protocol feature,
the log memory fd is provided in the ancillary data of
``VHOST_USER_SET_LOG_BASE`` message, the size and offset of shared
memory area provided in the message.
@@ -975,44 +989,48 @@ Master message types
``VHOST_USER_SET_LOG_FD``
:id: 7
:equivalent ioctl: ``VHOST_SET_LOG_FD``
- :master payload: N/A
+ :request payload: N/A
+ :reply payload: N/A
Sets the logging file descriptor, which is passed as ancillary data.
``VHOST_USER_SET_VRING_NUM``
:id: 8
:equivalent ioctl: ``VHOST_SET_VRING_NUM``
- :master payload: vring state description
+ :request payload: vring state description
+ :reply payload: N/A
Set the size of the queue.
``VHOST_USER_SET_VRING_ADDR``
:id: 9
:equivalent ioctl: ``VHOST_SET_VRING_ADDR``
- :master payload: vring address description
- :slave payload: N/A
+ :request payload: vring address description
+ :reply payload: N/A
Sets the addresses of the different aspects of the vring.
``VHOST_USER_SET_VRING_BASE``
:id: 10
:equivalent ioctl: ``VHOST_SET_VRING_BASE``
- :master payload: vring state description
+ :request payload: vring state description
+ :reply payload: N/A
Sets the base offset in the available vring.
``VHOST_USER_GET_VRING_BASE``
:id: 11
:equivalent ioctl: ``VHOST_USER_GET_VRING_BASE``
- :master payload: vring state description
- :slave payload: vring state description
+ :request payload: vring state description
+ :reply payload: vring state description
Get the available vring base offset.
``VHOST_USER_SET_VRING_KICK``
:id: 12
:equivalent ioctl: ``VHOST_SET_VRING_KICK``
- :master payload: ``u64``
+ :request payload: ``u64``
+ :reply payload: N/A
Set the event file descriptor for adding buffers to the vring. It is
passed in the ancillary data.
@@ -1030,7 +1048,8 @@ Master message types
``VHOST_USER_SET_VRING_CALL``
:id: 13
:equivalent ioctl: ``VHOST_SET_VRING_CALL``
- :master payload: ``u64``
+ :request payload: ``u64``
+ :reply payload: N/A
Set the event file descriptor to signal when buffers are used. It is
passed in the ancillary data.
@@ -1048,7 +1067,8 @@ Master message types
``VHOST_USER_SET_VRING_ERR``
:id: 14
:equivalent ioctl: ``VHOST_SET_VRING_ERR``
- :master payload: ``u64``
+ :request payload: ``u64``
+ :reply payload: N/A
Set the event file descriptor to signal when error occurs. It is
passed in the ancillary data.
@@ -1065,10 +1085,10 @@ Master message types
``VHOST_USER_GET_QUEUE_NUM``
:id: 17
:equivalent ioctl: N/A
- :master payload: N/A
- :slave payload: u64
+ :request payload: N/A
+ :reply payload: u64
- Query how many queues the backend supports.
+ Query how many queues the back-end supports.
This request should be sent only when ``VHOST_USER_PROTOCOL_F_MQ``
is set in queried protocol features by
@@ -1077,9 +1097,10 @@ Master message types
``VHOST_USER_SET_VRING_ENABLE``
:id: 18
:equivalent ioctl: N/A
- :master payload: vring state description
+ :request payload: vring state description
+ :reply payload: N/A
- Signal slave to enable or disable corresponding vring.
+ Signal the back-end to enable or disable corresponding vring.
This request should be sent only when
``VHOST_USER_F_PROTOCOL_FEATURES`` has been negotiated.
@@ -1087,9 +1108,10 @@ Master message types
``VHOST_USER_SEND_RARP``
:id: 19
:equivalent ioctl: N/A
- :master payload: ``u64``
+ :request payload: ``u64``
+ :reply payload: N/A
- Ask vhost user backend to broadcast a fake RARP to notify the migration
+ Ask vhost user back-end to broadcast a fake RARP to notify the migration
is terminated for guest that does not support GUEST_ANNOUNCE.
Only legal if feature bit ``VHOST_USER_F_PROTOCOL_FEATURES`` is
@@ -1097,12 +1119,13 @@ Master message types
``VHOST_USER_PROTOCOL_F_RARP`` is present in
``VHOST_USER_GET_PROTOCOL_FEATURES``. The first 6 bytes of the
payload contain the mac address of the guest to allow the vhost user
- backend to construct and broadcast the fake RARP.
+ back-end to construct and broadcast the fake RARP.
``VHOST_USER_NET_SET_MTU``
:id: 20
:equivalent ioctl: N/A
- :master payload: ``u64``
+ :request payload: ``u64``
+ :reply payload: N/A
Set host MTU value exposed to the guest.
@@ -1112,35 +1135,36 @@ Master message types
``VHOST_USER_PROTOCOL_F_NET_MTU`` is present in
``VHOST_USER_GET_PROTOCOL_FEATURES``.
- If ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, slave must
+ If ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, the back-end must
respond with zero in case the specified MTU is valid, or non-zero
otherwise.
``VHOST_USER_SET_SLAVE_REQ_FD``
:id: 21
:equivalent ioctl: N/A
- :master payload: N/A
+ :request payload: N/A
+ :reply payload: N/A
- Set the socket file descriptor for slave initiated requests. It is passed
+ Set the socket file descriptor for back-end initiated requests. It is passed
in the ancillary data.
This request should be sent only when
``VHOST_USER_F_PROTOCOL_FEATURES`` has been negotiated, and protocol
feature bit ``VHOST_USER_PROTOCOL_F_SLAVE_REQ`` bit is present in
``VHOST_USER_GET_PROTOCOL_FEATURES``. If
- ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, slave must
+ ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, the back-end must
respond with zero for success, non-zero otherwise.
``VHOST_USER_IOTLB_MSG``
:id: 22
:equivalent ioctl: N/A (equivalent to ``VHOST_IOTLB_MSG`` message type)
- :master payload: ``struct vhost_iotlb_msg``
- :slave payload: ``u64``
+ :request payload: ``struct vhost_iotlb_msg``
+ :reply payload: ``u64``
Send IOTLB messages with ``struct vhost_iotlb_msg`` as payload.
- Master sends such requests to update and invalidate entries in the
- device IOTLB. The slave has to acknowledge the request with sending
+ The front-end sends such requests to update and invalidate entries in the
+ device IOTLB. The back-end has to acknowledge the request with sending
zero as ``u64`` payload for success, non-zero otherwise.
This request should be sent only when ``VIRTIO_F_IOMMU_PLATFORM``
@@ -1149,7 +1173,8 @@ Master message types
``VHOST_USER_SET_VRING_ENDIAN``
:id: 23
:equivalent ioctl: ``VHOST_SET_VRING_ENDIAN``
- :master payload: vring state description
+ :request payload: vring state description
+ :reply payload: N/A
Set the endianness of a VQ for legacy devices. Little-endian is
indicated with state.num set to 0 and big-endian is indicated with
@@ -1159,42 +1184,42 @@ Master message types
``VHOST_USER_PROTOCOL_F_CROSS_ENDIAN`` has been negotiated.
Backends that negotiated this feature should handle both
endiannesses and expect this message once (per VQ) during device
- configuration (ie. before the master starts the VQ).
+ configuration (ie. before the front-end starts the VQ).
``VHOST_USER_GET_CONFIG``
:id: 24
:equivalent ioctl: N/A
- :master payload: virtio device config space
- :slave payload: virtio device config space
+ :request payload: virtio device config space
+ :reply payload: virtio device config space
When ``VHOST_USER_PROTOCOL_F_CONFIG`` is negotiated, this message is
- submitted by the vhost-user master to fetch the contents of the
- virtio device configuration space, vhost-user slave's payload size
- MUST match master's request, vhost-user slave uses zero length of
- payload to indicate an error to vhost-user master. The vhost-user
- master may cache the contents to avoid repeated
+ submitted by the vhost-user front-end to fetch the contents of the
+ virtio device configuration space, vhost-user back-end's payload size
+ MUST match the front-end's request, vhost-user back-end uses zero length of
+ payload to indicate an error to the vhost-user front-end. The vhost-user
+ front-end may cache the contents to avoid repeated
``VHOST_USER_GET_CONFIG`` calls.
``VHOST_USER_SET_CONFIG``
:id: 25
:equivalent ioctl: N/A
- :master payload: virtio device config space
- :slave payload: N/A
+ :request payload: virtio device config space
+ :reply payload: N/A
When ``VHOST_USER_PROTOCOL_F_CONFIG`` is negotiated, this message is
- submitted by the vhost-user master when the Guest changes the virtio
+ submitted by the vhost-user front-end when the Guest changes the virtio
device configuration space and also can be used for live migration
- on the destination host. The vhost-user slave must check the flags
- field, and slaves MUST NOT accept SET_CONFIG for read-only
+ on the destination host. The vhost-user back-end must check the flags
+ field, and back-ends MUST NOT accept SET_CONFIG for read-only
configuration space fields unless the live migration bit is set.
``VHOST_USER_CREATE_CRYPTO_SESSION``
:id: 26
:equivalent ioctl: N/A
- :master payload: crypto session description
- :slave payload: crypto session description
+ :request payload: crypto session description
+ :reply payload: crypto session description
- Create a session for crypto operation. The server side must return
+ Create a session for crypto operation. The back-end must return
the session id, 0 or positive for success, negative for failure.
This request should be sent only when
``VHOST_USER_PROTOCOL_F_CRYPTO_SESSION`` feature has been
@@ -1204,7 +1229,8 @@ Master message types
``VHOST_USER_CLOSE_CRYPTO_SESSION``
:id: 27
:equivalent ioctl: N/A
- :master payload: ``u64``
+ :request payload: ``u64``
+ :reply payload: N/A
Close a session for crypto operation which was previously
created by ``VHOST_USER_CREATE_CRYPTO_SESSION``.
@@ -1216,20 +1242,21 @@ Master message types
``VHOST_USER_POSTCOPY_ADVISE``
:id: 28
- :master payload: N/A
- :slave payload: userfault fd
+ :request payload: N/A
+ :reply payload: userfault fd
- When ``VHOST_USER_PROTOCOL_F_PAGEFAULT`` is supported, the master
- advises slave that a migration with postcopy enabled is underway,
- the slave must open a userfaultfd for later use. Note that at this
+ When ``VHOST_USER_PROTOCOL_F_PAGEFAULT`` is supported, the front-end
+ advises back-end that a migration with postcopy enabled is underway,
+ the back-end must open a userfaultfd for later use. Note that at this
stage the migration is still in precopy mode.
``VHOST_USER_POSTCOPY_LISTEN``
:id: 29
- :master payload: N/A
+ :request payload: N/A
+ :reply payload: N/A
- Master advises slave that a transition to postcopy mode has
- happened. The slave must ensure that shared memory is registered
+ The front-end advises back-end that a transition to postcopy mode has
+ happened. The back-end must ensure that shared memory is registered
with userfaultfd to cause faulting of non-present pages.
This is always sent sometime after a ``VHOST_USER_POSTCOPY_ADVISE``,
@@ -1237,10 +1264,11 @@ Master message types
``VHOST_USER_POSTCOPY_END``
:id: 30
- :slave payload: ``u64``
+ :request payload: N/A
+ :reply payload: ``u64``
- Master advises that postcopy migration has now completed. The slave
- must disable the userfaultfd. The response is an acknowledgement
+ The front-end advises that postcopy migration has now completed. The back-end
+ must disable the userfaultfd. The reply is an acknowledgement
only.
When ``VHOST_USER_PROTOCOL_F_PAGEFAULT`` is supported, this message
@@ -1252,156 +1280,181 @@ Master message types
``VHOST_USER_GET_INFLIGHT_FD``
:id: 31
:equivalent ioctl: N/A
- :master payload: inflight description
+ :request payload: inflight description
+ :reply payload: N/A
When ``VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD`` protocol feature has
- been successfully negotiated, this message is submitted by master to
- get a shared buffer from slave. The shared buffer will be used to
- track inflight I/O by slave. QEMU should retrieve a new one when vm
+ been successfully negotiated, this message is submitted by the front-end to
+ get a shared buffer from back-end. The shared buffer will be used to
+ track inflight I/O by back-end. QEMU should retrieve a new one when vm
reset.
``VHOST_USER_SET_INFLIGHT_FD``
:id: 32
:equivalent ioctl: N/A
- :master payload: inflight description
+ :request payload: inflight description
+ :reply payload: N/A
When ``VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD`` protocol feature has
- been successfully negotiated, this message is submitted by master to
- send the shared inflight buffer back to slave so that slave could
- get inflight I/O after a crash or restart.
+ been successfully negotiated, this message is submitted by the front-end to
+ send the shared inflight buffer back to the back-end so that the back-end
+ could get inflight I/O after a crash or restart.
``VHOST_USER_GPU_SET_SOCKET``
:id: 33
:equivalent ioctl: N/A
- :master payload: N/A
+ :request payload: N/A
+ :reply payload: N/A
Sets the GPU protocol socket file descriptor, which is passed as
- ancillary data. The GPU protocol is used to inform the master of
+ ancillary data. The GPU protocol is used to inform the front-end of
rendering state and updates. See vhost-user-gpu.rst for details.
``VHOST_USER_RESET_DEVICE``
:id: 34
:equivalent ioctl: N/A
- :master payload: N/A
- :slave payload: N/A
+ :request payload: N/A
+ :reply payload: N/A
- Ask the vhost user backend to disable all rings and reset all
+ Ask the vhost user back-end to disable all rings and reset all
internal device state to the initial state, ready to be
- reinitialized. The backend retains ownership of the device
+ reinitialized. The back-end retains ownership of the device
throughout the reset operation.
Only valid if the ``VHOST_USER_PROTOCOL_F_RESET_DEVICE`` protocol
- feature is set by the backend.
+ feature is set by the back-end.
``VHOST_USER_VRING_KICK``
:id: 35
:equivalent ioctl: N/A
- :slave payload: vring state description
- :master payload: N/A
+ :request payload: vring state description
+ :reply payload: N/A
When the ``VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS`` protocol
feature has been successfully negotiated, this message may be
- submitted by the master to indicate that a buffer was added to
+ submitted by the front-end to indicate that a buffer was added to
the vring instead of signalling it using the vring's kick file
- descriptor or having the slave rely on polling.
+ descriptor or having the back-end rely on polling.
The state.num field is currently reserved and must be set to 0.
``VHOST_USER_GET_MAX_MEM_SLOTS``
:id: 36
:equivalent ioctl: N/A
- :slave payload: u64
+ :request payload: N/A
+ :reply payload: u64
When the ``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol
feature has been successfully negotiated, this message is submitted
- by master to the slave. The slave should return the message with a
+ by the front-end to the back-end. The back-end should return the message with a
u64 payload containing the maximum number of memory slots for
- QEMU to expose to the guest. The value returned by the backend
+ QEMU to expose to the guest. The value returned by the back-end
will be capped at the maximum number of ram slots which can be
supported by the target platform.
``VHOST_USER_ADD_MEM_REG``
:id: 37
:equivalent ioctl: N/A
- :slave payload: single memory region description
+ :request payload: N/A
+ :reply payload: single memory region description
When the ``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol
feature has been successfully negotiated, this message is submitted
- by the master to the slave. The message payload contains a memory
+ by the front-end to the back-end. The message payload contains a memory
region descriptor struct, describing a region of guest memory which
- the slave device must map in. When the
+ the back-end device must map in. When the
``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol feature has
been successfully negotiated, along with the
``VHOST_USER_REM_MEM_REG`` message, this message is used to set and
- update the memory tables of the slave device.
+ update the memory tables of the back-end device.
+
+ Exactly one file descriptor from which the memory is mapped is
+ passed in the ancillary data.
+
+ In postcopy mode (see ``VHOST_USER_POSTCOPY_LISTEN``), the back-end
+ replies with the bases of the memory mapped region to the front-end.
+ For further details on postcopy, see ``VHOST_USER_SET_MEM_TABLE``.
+ They apply to ``VHOST_USER_ADD_MEM_REG`` accordingly.
Exactly one file descriptor from which the memory is mapped is
passed in the ancillary data.
- In postcopy mode (see ``VHOST_USER_POSTCOPY_LISTEN``), the slave
- replies with the bases of the memory mapped region to the master.
+ In postcopy mode (see ``VHOST_USER_POSTCOPY_LISTEN``), the back-end
+ replies with the bases of the memory mapped region to the front-end.
For further details on postcopy, see ``VHOST_USER_SET_MEM_TABLE``.
They apply to ``VHOST_USER_ADD_MEM_REG`` accordingly.
``VHOST_USER_REM_MEM_REG``
:id: 38
:equivalent ioctl: N/A
- :slave payload: single memory region description
+ :request payload: N/A
+ :reply payload: single memory region description
When the ``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol
feature has been successfully negotiated, this message is submitted
- by the master to the slave. The message payload contains a memory
+ by the front-end to the back-end. The message payload contains a memory
region descriptor struct, describing a region of guest memory which
- the slave device must unmap. When the
+ the back-end device must unmap. When the
``VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS`` protocol feature has
been successfully negotiated, along with the
``VHOST_USER_ADD_MEM_REG`` message, this message is used to set and
- update the memory tables of the slave device.
+ update the memory tables of the back-end device.
The memory region to be removed is identified by its guest address,
user address and size. The mmap offset is ignored.
No file descriptors SHOULD be passed in the ancillary data. For
- compatibility with existing incorrect implementations, the slave MAY
+ compatibility with existing incorrect implementations, the back-end MAY
accept messages with one file descriptor. If a file descriptor is
- passed, the slave MUST close it without using it otherwise.
+ passed, the back-end MUST close it without using it otherwise.
+
+ The memory region to be removed is identified by its guest address,
+ user address and size. The mmap offset is ignored.
+
+ No file descriptors SHOULD be passed in the ancillary data. For
+ compatibility with existing incorrect implementations, the back-end MAY
+ accept messages with one file descriptor. If a file descriptor is
+ passed, the back-end MUST close it without using it otherwise.
``VHOST_USER_SET_STATUS``
:id: 39
:equivalent ioctl: VHOST_VDPA_SET_STATUS
- :slave payload: N/A
- :master payload: ``u64``
+ :request payload: ``u64``
+ :reply payload: N/A
When the ``VHOST_USER_PROTOCOL_F_STATUS`` protocol feature has been
- successfully negotiated, this message is submitted by the master to
- notify the backend with updated device status as defined in the Virtio
+ successfully negotiated, this message is submitted by the front-end to
+ notify the back-end with updated device status as defined in the Virtio
specification.
``VHOST_USER_GET_STATUS``
:id: 40
:equivalent ioctl: VHOST_VDPA_GET_STATUS
- :slave payload: ``u64``
- :master payload: N/A
+ :request payload: N/A
+ :reply payload: ``u64``
When the ``VHOST_USER_PROTOCOL_F_STATUS`` protocol feature has been
- successfully negotiated, this message is submitted by the master to
- query the backend for its device status as defined in the Virtio
+ successfully negotiated, this message is submitted by the front-end to
+ query the back-end for its device status as defined in the Virtio
specification.
-Slave message types
--------------------
+Back-end message types
+----------------------
+
+For this type of message, the request is sent by the back-end and the reply
+is sent by the front-end.
``VHOST_USER_SLAVE_IOTLB_MSG``
:id: 1
:equivalent ioctl: N/A (equivalent to ``VHOST_IOTLB_MSG`` message type)
- :slave payload: ``struct vhost_iotlb_msg``
- :master payload: N/A
+ :request payload: ``struct vhost_iotlb_msg``
+ :reply payload: N/A
Send IOTLB messages with ``struct vhost_iotlb_msg`` as payload.
- Slave sends such requests to notify of an IOTLB miss, or an IOTLB
+ The back-end sends such requests to notify of an IOTLB miss, or an IOTLB
access failure. If ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is
- negotiated, and slave set the ``VHOST_USER_NEED_REPLY`` flag, master
+ negotiated, and the back-end sets the ``VHOST_USER_NEED_REPLY`` flag, the front-end
must respond with zero when operation is successfully completed, or
non-zero otherwise. This request should be sent only when
``VIRTIO_F_IOMMU_PLATFORM`` feature has been successfully
@@ -1410,23 +1463,23 @@ Slave message types
``VHOST_USER_SLAVE_CONFIG_CHANGE_MSG``
:id: 2
:equivalent ioctl: N/A
- :slave payload: N/A
- :master payload: N/A
+ :request payload: N/A
+ :reply payload: N/A
When ``VHOST_USER_PROTOCOL_F_CONFIG`` is negotiated, vhost-user
- slave sends such messages to notify that the virtio device's
+ back-end sends such messages to notify that the virtio device's
configuration space has changed, for those host devices which can
support such feature, host driver can send ``VHOST_USER_GET_CONFIG``
- message to slave to get the latest content. If
- ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, and slave set the
- ``VHOST_USER_NEED_REPLY`` flag, master must respond with zero when
+ message to the back-end to get the latest content. If
+ ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` is negotiated, and the back-end sets the
+ ``VHOST_USER_NEED_REPLY`` flag, the front-end must respond with zero when
operation is successfully completed, or non-zero otherwise.
``VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG``
:id: 3
:equivalent ioctl: N/A
- :slave payload: vring area description
- :master payload: N/A
+ :request payload: vring area description
+ :reply payload: N/A
Sets host notifier for a specified queue. The queue index is
contained in the ``u64`` field of the vring area description. The
@@ -1437,7 +1490,7 @@ Slave message types
description. QEMU can mmap the file descriptor based on the size and
offset to get a memory range. Registering a host notifier means
mapping this memory range to the VM as the specified queue's notify
- MMIO region. Slave sends this request to tell QEMU to de-register
+ MMIO region. The back-end sends this request to tell QEMU to de-register
the existing notifier if any and register the new notifier if the
request is sent with a file descriptor.
@@ -1448,28 +1501,28 @@ Slave message types
``VHOST_USER_SLAVE_VRING_CALL``
:id: 4
:equivalent ioctl: N/A
- :slave payload: vring state description
- :master payload: N/A
+ :request payload: vring state description
+ :reply payload: N/A
When the ``VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS`` protocol
feature has been successfully negotiated, this message may be
- submitted by the slave to indicate that a buffer was used from
+ submitted by the back-end to indicate that a buffer was used from
the vring instead of signalling this using the vring's call file
- descriptor or having the master relying on polling.
+ descriptor or having the front-end rely on polling.
The state.num field is currently reserved and must be set to 0.
``VHOST_USER_SLAVE_VRING_ERR``
:id: 5
:equivalent ioctl: N/A
- :slave payload: vring state description
- :master payload: N/A
+ :request payload: vring state description
+ :reply payload: N/A
When the ``VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS`` protocol
feature has been successfully negotiated, this message may be
- submitted by the slave to indicate that an error occurred on the
+ submitted by the back-end to indicate that an error occurred on the
specific vring, instead of signalling the error file descriptor
- set by the master via ``VHOST_USER_SET_VRING_ERR``.
+ set by the front-end via ``VHOST_USER_SET_VRING_ERR``.
The state.num field is currently reserved and must be set to 0.
@@ -1480,21 +1533,21 @@ VHOST_USER_PROTOCOL_F_REPLY_ACK
The original vhost-user specification only demands replies for certain
commands. This differs from the vhost protocol implementation where
-commands are sent over an ``ioctl()`` call and block until the client
+commands are sent over an ``ioctl()`` call and block until the back-end
has completed.
With this protocol extension negotiated, the sender (QEMU) can set the
``need_reply`` [Bit 3] flag to any command. This indicates that the
-client MUST respond with a Payload ``VhostUserMsg`` indicating success
+back-end MUST respond with a Payload ``VhostUserMsg`` indicating success
or failure. The payload should be set to zero on success or non-zero
on failure, unless the message already has an explicit reply body.
-The response payload gives QEMU a deterministic indication of the result
+The reply payload gives QEMU a deterministic indication of the result
of the command. Today, QEMU is expected to terminate the main vhost-user
loop upon receiving such errors. In future, qemu could be taught to be more
resilient for selective requests.
-For the message types that already solicit a reply from the client,
+For the message types that already solicit a reply from the back-end,
the presence of ``VHOST_USER_PROTOCOL_F_REPLY_ACK`` or need_reply bit
being set brings no behavioural change. (See the Communication_
section for details.)
@@ -1504,26 +1557,26 @@ section for details.)
Backend program conventions
===========================
-vhost-user backends can provide various devices & services and may
+vhost-user back-ends can provide various devices & services and may
need to be configured manually depending on the use case. However, it
is a good idea to follow the conventions listed here when
possible. Users, QEMU or libvirt, can then rely on some common
behaviour to avoid heterogeneous configuration and management of the
-backend programs and facilitate interoperability.
+back-end programs and facilitate interoperability.
-Each backend installed on a host system should come with at least one
+Each back-end installed on a host system should come with at least one
JSON file that conforms to the vhost-user.json schema. Each file
-informs the management applications about the backend type, and binary
+informs the management applications about the back-end type, and binary
location. In addition, it defines rules for management apps for
-picking the highest priority backend when multiple match the search
+picking the highest priority back-end when multiple match the search
criteria (see ``@VhostUserBackend`` documentation in the schema file).
-If the backend is not capable of enabling a requested feature on the
+If the back-end is not capable of enabling a requested feature on the
host (such as 3D acceleration with virgl), or the initialization
-failed, the backend should fail to start early and exit with a status
+failed, the back-end should fail to start early and exit with a status
!= 0. It may also print a message to stderr for further details.
-The backend program must not daemonize itself, but it may be
+The back-end program must not daemonize itself, but it may be
daemonized by the management layer. It may also have a restricted
access to the system.
@@ -1531,7 +1584,7 @@ File descriptors 0, 1 and 2 will exist, and have regular
stdin/stdout/stderr usage (they may have been redirected to /dev/null
by the management layer, or to a log handler).
-The backend program must end (as quickly and cleanly as possible) when
+The back-end program must end (as quickly and cleanly as possible) when
the SIGTERM signal is received. Eventually, it may receive SIGKILL by
the management layer after a few seconds.
@@ -1545,15 +1598,15 @@ are mandatory, unless explicitly said differently:
--fd=FDNUM
- When this argument is given, the backend program is started with the
+ When this argument is given, the back-end program is started with the
vhost-user socket as file descriptor FDNUM. It is incompatible with
--socket-path.
--print-capabilities
- Output to stdout the backend capabilities in JSON format, and then
+ Output to stdout the back-end capabilities in JSON format, and then
exit successfully. Other options and arguments should be ignored, and
- the backend program should not perform its normal function. The
+ the back-end program should not perform its normal function. The
capabilities can be reported dynamically depending on the host
capabilities.
diff --git a/docs/system/device-emulation.rst b/docs/system/device-emulation.rst
index ae8dd233e8..3b729b920d 100644
--- a/docs/system/device-emulation.rst
+++ b/docs/system/device-emulation.rst
@@ -84,6 +84,7 @@ Emulated Devices
devices/can.rst
devices/ccid.rst
+ devices/cxl.rst
devices/ivshmem.rst
devices/net.rst
devices/nvme.rst
diff --git a/docs/system/devices/cxl.rst b/docs/system/devices/cxl.rst
new file mode 100644
index 0000000000..9293cbf01a
--- /dev/null
+++ b/docs/system/devices/cxl.rst
@@ -0,0 +1,302 @@
+Compute Express Link (CXL)
+==========================
+From the view of a single host, CXL is an interconnect standard that
+targets accelerators and memory devices attached to a CXL host.
+This description will focus on those aspects visible either to
+software running on a QEMU emulated host or to the internals of
+functional emulation. As such, it will skip over many of the
+electrical and protocol elements that would be more of interest
+for real hardware and will dominate more general introductions to CXL.
+It will also completely ignore the fabric management aspects of CXL
+by considering only a single host and a static configuration.
+
+CXL shares many concepts and much of the infrastructure of PCI Express,
+with CXL Host Bridges, which have CXL Root Ports which may be directly
+attached to CXL or PCI End Points. Alternatively there may be CXL Switches
+with CXL and PCI Endpoints attached below them. In many cases additional
+control and capabilities are exposed via PCI Express interfaces.
+This sharing of interfaces and hence emulation code is reflected
+in how the devices are emulated in QEMU. In most cases the various
+CXL elements are built upon equivalent PCIe devices.
+
+CXL devices support the following interfaces:
+
+* Most conventional PCIe interfaces
+
+ - Configuration space access
+ - BAR mapped memory accesses used for registers and mailboxes.
+ - MSI/MSI-X
+ - AER
+ - DOE mailboxes
+ - IDE
+ - Many other PCI Express defined interfaces.
+
+* Memory operations
+
+ - Equivalent of accessing DRAM / NVDIMMs. Any access / feature
+ supported by the host for normal memory should also work for
+ CXL attached memory devices.
+
+* Cache operations. These are mostly irrelevant to QEMU emulation as
+ QEMU is not emulating a coherency protocol. Any emulation related
+ to these will be device specific and is out of the scope of this
+ document.
+
+CXL 2.0 Device Types
+--------------------
+CXL 2.0 End Points are often categorized into three types.
+
+**Type 1:** These support coherent caching of host memory. An example might
+be a crypto accelerator. May also have device private memory accessible
+via means such as PCI memory reads and writes to BARs.
+
+**Type 2:** These support coherent caching of host memory and host
+managed device memory (HDM) for which the coherency protocol is managed
+by the host. This is a complex topic, so for more information on CXL
+coherency see the CXL 2.0 specification.
+
+**Type 3 Memory devices:** These devices act as a means of attaching
+additional memory (HDM) to a CXL host including both volatile and
+persistent memory. The CXL topology may support interleaving across a
+number of Type 3 memory devices using HDM Decoders in the host, host
+bridge, switch upstream port and endpoints.
+
+Scope of CXL emulation in QEMU
+------------------------------
+The focus of CXL emulation is CXL revision 2.0 and later. Earlier CXL
+revisions defined a smaller set of features, leaving much of the control
+interface as implementation defined or device specific, making generic
+emulation challenging with host specific firmware being responsible
+for setup and the Endpoints being presented to operating systems
+as Root Complex Integrated End Points. CXL rev 2.0 looks a lot
+more like PCI Express, with fully specified discoverability
+of the CXL topology.
+
+CXL System components
+----------------------
+A CXL system is made up of a Host with a number of 'standard components'
+the control and capabilities of which are discoverable by system software
+using means described in the CXL 2.0 specification.
+
+CXL Fixed Memory Windows (CFMW)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+A CFMW consists of a particular range of Host Physical Address space
+which is routed to particular CXL Host Bridges. At time of generic
+software initialization it will have a particular interleaving
+configuration and associated Quality of Service Throttling Group (QTG).
+This information is available to system software, when making
+decisions about how to configure interleave across available CXL
+memory devices. It is provided as CFMW Structures (CFMWS) in
+the CXL Early Discovery Table, an ACPI table.
+
+Note: QTG 0 is the only one currently supported in QEMU.
+
+CXL Host Bridge (CXL HB)
+~~~~~~~~~~~~~~~~~~~~~~~~
+A CXL host bridge is similar to the PCIe equivalent, but with a
+specification defined register interface called CXL Host Bridge
+Component Registers (CHBCR). The location of this CHBCR MMIO
+space is described to system software via a CXL Host Bridge
+Structure (CHBS) in the CEDT ACPI table. The actual interfaces
+are identical to those used for other parts of the CXL hierarchy
+as CXL Component Registers in PCI BARs.
+
+Interfaces provided include:
+
+* Configuration of HDM Decoders to route CXL Memory accesses with
+ a particular Host Physical Address range to the target port
+ below which the CXL device servicing that address lies. This
+ may be a mapping to a single Root Port (RP) or across a set of
+ target RPs.
+
+CXL Root Ports (CXL RP)
+~~~~~~~~~~~~~~~~~~~~~~~
+A CXL Root Port serves the same purpose as a PCIe Root Port.
+There are a number of CXL specific Designated Vendor Specific
+Extended Capabilities (DVSEC) in PCIe Configuration Space
+and associated component register access via PCI bars.
+
+CXL Switch
+~~~~~~~~~~
+Not yet implemented in QEMU.
+
+Here we consider a simple CXL switch with only a single
+virtual hierarchy. Whilst more complex devices exist, their
+visibility to a particular host is generally the same as for
+a simple switch design. Hosts often have no awareness
+of complex rerouting and device pooling, they simply see
+devices being hot added or hot removed.
+
+A CXL switch has a similar architecture to those in PCIe,
+with a single upstream port, internal PCI bus and multiple
+downstream ports.
+
+Both the CXL upstream and downstream ports have CXL specific
+DVSECs in configuration space, and component registers in PCI
+BARs. The Upstream Port has the configuration interfaces for
+the HDM decoders which route incoming memory accesses to the
+appropriate downstream port.
+
+CXL Memory Devices - Type 3
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+CXL type 3 devices use a PCI class code and are intended to be supported
+by a generic operating system driver. They have HDM decoders
+though in these EP devices, the decoder is responsible not for
+routing but for translation of the incoming host physical address (HPA)
+into a Device Physical Address (DPA).
+
+CXL Memory Interleave
+---------------------
+To understand the interaction of different CXL hardware components which
+are emulated in QEMU, let us consider a memory read in a fully configured
+CXL topology. Note that system software is responsible for configuration
+of all components with the exception of the CFMWs. System software is
+responsible for allocating appropriate ranges from within the CFMWs
+and exposing those via normal memory configurations as would be done
+for system RAM.
+
+Example system Topology. x marks the match in each decoder level::
+
+ |<------------------SYSTEM PHYSICAL ADDRESS MAP (1)----------------->|
+ | __________ __________________________________ __________ |
+ | | | | | | | |
+ | | CFMW 0 | | CXL Fixed Memory Window 1 | | CFMW 2 | |
+ | | HB0 only | | Configured to interleave memory | | HB1 only | |
+ | | | | memory accesses across HB0/HB1 | | | |
+ | |__________| |_____x____________________________| |__________| |
+ | | | |
+ | | | |
+ | | | |
+ | Interleave Decoder | |
+ | Matches this HB | |
+ \_____________| |_____________/
+ __________|__________ _____|_______________
+ | | | |
+ (2) | CXL HB 0 | | CXL HB 1 |
+ | HB IntLv Decoders | | HB IntLv Decoders |
+ | PCI/CXL Root Bus 0c | | PCI/CXL Root Bus 0d |
+ | | | |
+ |___x_________________| |_____________________|
+ | | | |
+ | | | |
+ A HB 0 HDM Decoder | | |
+ matches this Port | | |
+ | | | |
+ ___________|___ __________|__ __|_________ ___|_________
+ (3)| Root Port 0 | | Root Port 1 | | Root Port 2| | Root Port 3 |
+ | Appears in | | Appears in | | Appears in | | Appear in |
+ | PCI topology | | PCI Topology| | PCI Topo | | PCI Topo |
+ | As 0c:00.0 | | as 0c:01.0 | | as de:00.0 | | as de:01.0 |
+ |_______________| |_____________| |____________| |_____________|
+ | | | |
+ | | | |
+ _____|_________ ______|______ ______|_____ ______|_______
+ (4)| x | | | | | | |
+ | CXL Type3 0 | | CXL Type3 1 | | CXL type3 2| | CXL Type 3 3 |
+ | | | | | | | |
+ | PMEM0(Vol LSA)| | PMEM1 (...) | | PMEM2 (...)| | PMEM3 (...) |
+ | Decoder to go | | | | | | |
+ | from host PA | | PCI 0e:00.0 | | PCI df:00.0| | PCI e0:00.0 |
+ | to device PA | | | | | | |
+ | PCI as 0d:00.0| | | | | | |
+ |_______________| |_____________| |____________| |______________|
+
+Notes:
+
+(1) **3 CXL Fixed Memory Windows (CFMW)** corresponding to different
+ ranges of the system physical address map. Each CFMW has a
+ particular interleave setup across the CXL Host Bridges (HB).
+ CFMW0 provides uninterleaved access to HB0, CFMW2 provides
+ uninterleaved access to HB1. CFMW1 provides interleaved memory access
+ across HB0 and HB1.
+
+(2) **Two CXL Host Bridges**. Each of these has 2 CXL Root Ports and
+ programmable HDM decoders to route memory accesses either to
+ a single port or interleave them across multiple ports.
+ A complex configuration here, might be to use the following HDM
+ decoders in HB0. HDM0 routes CFMW0 requests to RP0 and hence
+ part of CXL Type3 0. HDM1 routes CFMW0 requests from a
+ different region of the CFMW0 PA range to RP1 and hence part
+ of CXL Type 3 1. HDM2 routes yet another PA range from within
+ CFMW0 to be interleaved across RP0 and RP1, providing 2 way
+ interleave of part of the memory provided by CXL Type3 0 and
+ CXL Type 3 1. HDM3 routes those interleaved accesses from
+ CFMW1 that target HB0 to RP 0 and another part of the memory of
+ CXL Type 3 0 (as part of a 2 way interleave at the system level
+ across for example CXL Type3 0 and CXL Type3 2).
+ HDM4 is used to enable system wide 4 way interleave across all
+ the present CXL type3 devices, by interleaving those (interleaved)
+ requests that HB0 receives from CFMW1 across RP 0 and
+ RP 1 and hence to yet more regions of the memory of the
+ attached Type3 devices. Note this is a representative subset
+ of the full range of possible HDM decoder configurations in this
+ topology.
+
+(3) **Four CXL Root Ports.** In this case the CXL Type 3 devices are
+ directly attached to these ports.
+
+(4) **Four CXL Type3 memory expansion devices.** These will each have
+ HDM decoders, but in this case rather than performing interleave
+ they will take the Host Physical Addresses of accesses and map
+ them to their own local Device Physical Address Space (DPA).
+
+Example command lines
+---------------------
+A very simple setup with just one directly attached CXL Type 3 device::
+
+ qemu-system-aarch64 -M virt,gic-version=3,cxl=on -m 4g,maxmem=8G,slots=8 -cpu max \
+ ...
+ -object memory-backend-file,id=cxl-mem1,share=on,mem-path=/tmp/cxltest.raw,size=256M \
+ -object memory-backend-file,id=cxl-lsa1,share=on,mem-path=/tmp/lsa.raw,size=256M \
+ -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \
+ -device cxl-rp,port=0,bus=cxl.1,id=root_port13,chassis=0,slot=2 \
+ -device cxl-type3,bus=root_port13,memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem0 \
+ -cxl-fixed-memory-window targets.0=cxl.1,size=4G
+
+A setup suitable for 4 way interleave. Only one fixed window provided, to enable 2 way
+interleave across 2 CXL host bridges. Each host bridge has 2 CXL Root Ports, with
+the CXL Type3 device directly attached (no switches)::
+
+ qemu-system-aarch64 -M virt,gic-version=3,cxl=on -m 4g,maxmem=8G,slots=8 -cpu max \
+ ...
+ -object memory-backend-file,id=cxl-mem1,share=on,mem-path=/tmp/cxltest.raw,size=256M \
+ -object memory-backend-file,id=cxl-mem2,share=on,mem-path=/tmp/cxltest2.raw,size=256M \
+ -object memory-backend-file,id=cxl-mem3,share=on,mem-path=/tmp/cxltest3.raw,size=256M \
+ -object memory-backend-file,id=cxl-mem4,share=on,mem-path=/tmp/cxltest4.raw,size=256M \
+ -object memory-backend-file,id=cxl-lsa1,share=on,mem-path=/tmp/lsa.raw,size=256M \
+ -object memory-backend-file,id=cxl-lsa2,share=on,mem-path=/tmp/lsa2.raw,size=256M \
+ -object memory-backend-file,id=cxl-lsa3,share=on,mem-path=/tmp/lsa3.raw,size=256M \
+ -object memory-backend-file,id=cxl-lsa4,share=on,mem-path=/tmp/lsa4.raw,size=256M \
+ -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1 \
+ -device pxb-cxl,bus_nr=222,bus=pcie.0,id=cxl.2 \
+ -device cxl-rp,port=0,bus=cxl.1,id=root_port13,chassis=0,slot=2 \
+ -device cxl-type3,bus=root_port13,memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem0 \
+ -device cxl-rp,port=1,bus=cxl.1,id=root_port14,chassis=0,slot=3 \
+ -device cxl-type3,bus=root_port14,memdev=cxl-mem2,lsa=cxl-lsa2,id=cxl-pmem1 \
+ -device cxl-rp,port=0,bus=cxl.2,id=root_port15,chassis=0,slot=5 \
+ -device cxl-type3,bus=root_port15,memdev=cxl-mem3,lsa=cxl-lsa3,id=cxl-pmem2 \
+ -device cxl-rp,port=1,bus=cxl.2,id=root_port16,chassis=0,slot=6 \
+ -device cxl-type3,bus=root_port16,memdev=cxl-mem4,lsa=cxl-lsa4,id=cxl-pmem3 \
+ -cxl-fixed-memory-window targets.0=cxl.1,targets.1=cxl.2,size=4G,interleave-granularity=8k
+
+Kernel Configuration Options
+----------------------------
+
+In Linux 5.18 the following options are necessary to make use of
+OS management of CXL memory devices as described here.
+
+* CONFIG_CXL_BUS
+* CONFIG_CXL_PCI
+* CONFIG_CXL_ACPI
+* CONFIG_CXL_PMEM
+* CONFIG_CXL_MEM
+* CONFIG_CXL_PORT
+* CONFIG_CXL_REGION
+
+References
+----------
+
+ - Consortium website for specifications etc:
+ http://www.computeexpresslink.org
+ - Compute Express Link Revision 2 specification, October 2020
+ - CEDT CFMWS & QTG _DSM ECN May 2021