Diffstat (limited to 'docs')
-rw-r--r--  docs/mach-virt-graphical.cfg   | 281
-rw-r--r--  docs/mach-virt-serial.cfg      | 243
-rw-r--r--  docs/migration.txt             |  71
-rw-r--r--  docs/multi-thread-tcg.txt      | 350
-rw-r--r--  docs/q35-chipset.cfg           | 152
-rw-r--r--  docs/q35-emulated.cfg          | 288
-rw-r--r--  docs/q35-virtio-graphical.cfg  | 248
-rw-r--r--  docs/q35-virtio-serial.cfg     | 193
-rw-r--r--  docs/qapi-code-gen.txt         |   2
-rw-r--r--  docs/replay.txt                |   7
-rw-r--r--  docs/specs/vmgenid.txt         | 245
11 files changed, 1927 insertions, 153 deletions
diff --git a/docs/mach-virt-graphical.cfg b/docs/mach-virt-graphical.cfg new file mode 100644 index 0000000000..0fdf6846dd --- /dev/null +++ b/docs/mach-virt-graphical.cfg @@ -0,0 +1,281 @@ +# mach-virt - VirtIO guest (graphical console) +# ========================================================= +# +# Usage: +# +# $ qemu-system-aarch64 \ +# -nodefaults \ +# -readconfig mach-virt-graphical.cfg \ +# -cpu host +# +# You will probably need to tweak the lines marked as +# CHANGE ME before being able to use this configuration! +# +# The guest will have a selection of VirtIO devices +# tailored towards optimal performance with modern guests, +# and will be accessed through a graphical console. +# +# --------------------------------------------------------- +# +# Using -nodefaults is required to have full control over +# the virtual hardware: when it's specified, QEMU will +# populate the board with only the builtin peripherals, +# such as the PL011 UART, plus a PCI Express Root Bus; the +# user will then have to explicitly add further devices. +# +# The PCI Express Root Bus shows up in the guest as: +# +# 00:00.0 Host bridge +# +# This configuration file adds a number of other useful +# devices, more specifically: +# +# 00:01.0 Display controller +# 00.1c.* PCI bridge (PCI Express Root Ports) +# 01:00.0 SCSI storage controller +# 02:00.0 Ethernet controller +# 03:00.0 USB controller +# +# More information about these devices is available below. + + +# Machine options +# ========================================================= +# +# We use the virt machine type and enable KVM acceleration +# for better performance. +# +# Using less than 1 GiB of memory is probably not going to +# yield good performance in the guest, and might even lead +# to obscure boot issues in some cases. +# +# Unfortunately, there is no way to configure the CPU model +# in this file, so it will have to be provided on the +# command line, but we can configure the guest to use the +# same GIC version as the host. + +[machine] + type = "virt" + accel = "kvm" + gic-version = "host" + +[memory] + size = "1024" + + +# Firmware configuration +# ========================================================= +# +# There are two parts to the firmware: a read-only image +# containing the executable code, which is shared between +# guests, and a read/write variable store that is owned +# by one specific guest, exclusively, and is used to +# record information such as the UEFI boot order. +# +# For any new guest, its permanent, private variable store +# should initially be copied from the template file +# provided along with the firmware binary. +# +# Depending on the OS distribution you're using on the +# host, the name of the package containing the firmware +# binary and variable store template, as well as the paths +# to the files themselves, will be different. 
For example: +# +# Fedora +# edk2-aarch64 (pkg) +# /usr/share/edk2/aarch64/QEMU_EFI-pflash.raw (bin) +# /usr/share/edk2/aarch64/vars-template-pflash.raw (var) +# +# RHEL +# AAVMF (pkg) +# /usr/share/AAVMF/AAVMF_CODE.fd (bin) +# /usr/share/AAVMF/AAVMF_VARS.fd (var) +# +# Debian/Ubuntu +# qemu-efi (pkg) +# /usr/share/AAVMF/AAVMF_CODE.fd (bin) +# /usr/share/AAVMF/AAVMF_VARS.fd (var) + +[drive "uefi-binary"] + file = "/usr/share/AAVMF/AAVMF_CODE.fd" # CHANGE ME + format = "raw" + if = "pflash" + unit = "0" + readonly = "on" + +[drive "uefi-varstore"] + file = "guest_VARS.fd" # CHANGE ME + format = "raw" + if = "pflash" + unit = "1" + + +# PCI bridge (PCI Express Root Ports) +# ========================================================= +# +# We create eight PCI Express Root Ports, and we plug them +# all into separate functions of the same slot. Some of +# them will be used by devices, the rest will remain +# available for hotplug. + +[device "pcie.1"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.0" + port = "1" + chassis = "1" + multifunction = "on" + +[device "pcie.2"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.1" + port = "2" + chassis = "2" + +[device "pcie.3"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.2" + port = "3" + chassis = "3" + +[device "pcie.4"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.3" + port = "4" + chassis = "4" + +[device "pcie.5"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.4" + port = "5" + chassis = "5" + +[device "pcie.6"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.5" + port = "6" + chassis = "6" + +[device "pcie.7"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.6" + port = "7" + chassis = "7" + +[device "pcie.8"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.7" + port = "8" + chassis = "8" + + +# SCSI storage controller (and storage) +# ========================================================= +# +# We use virtio-scsi here so that we can (hot)plug a large +# number of disks without running into issues; a SCSI disk, +# backed by a qcow2 disk image on the host's filesystem, is +# attached to it. +# +# We also create an optical disk, mostly for installation +# purposes: once the guest OS has been successfully +# installed, the guest will no longer boot from optical +# media. If you don't want, or no longer want, to have an +# optical disk in the guest, you can safely comment out +# all relevant sections below. + +[device "scsi"] + driver = "virtio-scsi-pci" + bus = "pcie.1" + addr = "00.0" + +[device "scsi-disk"] + driver = "scsi-hd" + bus = "scsi.0" + drive = "disk" + bootindex = "1" + +[drive "disk"] + file = "guest.qcow2" # CHANGE ME + format = "qcow2" + if = "none" + +[device "scsi-optical-disk"] + driver = "scsi-cd" + bus = "scsi.0" + drive = "optical-disk" + bootindex = "2" + +[drive "optical-disk"] + file = "install.iso" # CHANGE ME + format = "raw" + if = "none" + + +# Ethernet controller +# ========================================================= +# +# We use virtio-net for improved performance over emulated +# hardware; on the host side, we take advantage of user +# networking so that the QEMU process doesn't require any +# additional privileges.
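# If services inside the guest need to be reachable from the
# host, user networking can also forward ports; a hedged,
# commented-out sketch (the 2222->22 mapping is only an
# example) that would replace the [netdev] section below:
#
# [netdev "hostnet"]
#   type = "user"
#   hostfwd = "tcp:127.0.0.1:2222-:22"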
+ +[netdev "hostnet"] + type = "user" + +[device "net"] + driver = "virtio-net-pci" + netdev = "hostnet" + bus = "pcie.2" + addr = "00.0" + + +# USB controller (and input devices) +# ========================================================= +# +# We add a virtualization-friendly USB 3.0 controller and +# a USB keyboard / USB tablet combo so that graphical +# guests can be controlled appropriately. + +[device "usb"] + driver = "nec-usb-xhci" + bus = "pcie.3" + addr = "00.0" + +[device "keyboard"] + driver = "usb-kbd" + bus = "usb.0" + +[device "tablet"] + driver = "usb-tablet" + bus = "usb.0" + + +# Display controller +# ========================================================= +# +# We use virtio-gpu because the legacy VGA framebuffer is +# very troublesome on aarch64, and virtio-gpu is the only +# video device that doesn't implement it. +# +# If you're running the guest on a remote, potentially +# headless host, you will probably want to append something +# like +# +# -display vnc=127.0.0.1:0 +# +# to the command line in order to prevent QEMU from +# creating a graphical display window on the host and +# enable remote access instead. + +[device "video"] + driver = "virtio-gpu" + bus = "pcie.0" + addr = "01.0" diff --git a/docs/mach-virt-serial.cfg b/docs/mach-virt-serial.cfg new file mode 100644 index 0000000000..aee9f1c5a1 --- /dev/null +++ b/docs/mach-virt-serial.cfg @@ -0,0 +1,243 @@ +# mach-virt - VirtIO guest (serial console) +# ========================================================= +# +# Usage: +# +# $ qemu-system-aarch64 \ +# -nodefaults \ +# -readconfig mach-virt-serial.cfg \ +# -display none -serial mon:stdio \ +# -cpu host +# +# You will probably need to tweak the lines marked as +# CHANGE ME before being able to use this configuration! +# +# The guest will have a selection of VirtIO devices +# tailored towards optimal performance with modern guests, +# and will be accessed through the serial console. +# +# --------------------------------------------------------- +# +# Using -nodefaults is required to have full control over +# the virtual hardware: when it's specified, QEMU will +# populate the board with only the builtin peripherals, +# such as the PL011 UART, plus a PCI Express Root Bus; the +# user will then have to explicitly add further devices. +# +# The PCI Express Root Bus shows up in the guest as: +# +# 00:00.0 Host bridge +# +# This configuration file adds a number of other useful +# devices, more specifically: +# +# 00.1c.* PCI bridge (PCI Express Root Ports) +# 01:00.0 SCSI storage controller +# 02:00.0 Ethernet controller +# +# More information about these devices is available below. +# +# We use '-display none' to prevent QEMU from creating a +# graphical display window, which would serve no use in +# this specific configuration, and '-serial mon:stdio' to +# multiplex the guest's serial console and the QEMU monitor +# to the host's stdio; use 'Ctrl+A h' to learn how to +# switch between the two and more. + + +# Machine options +# ========================================================= +# +# We use the virt machine type and enable KVM acceleration +# for better performance. +# +# Using less than 1 GiB of memory is probably not going to +# yield good performance in the guest, and might even lead +# to obscure boot issues in some cases. +# +# Unfortunately, there is no way to configure the CPU model +# in this file, so it will have to be provided on the +# command line, but we can configure the guest to use the +# same GIC version as the host. 
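# Note that gic-version=host requires KVM. To run this
# configuration under TCG you would drop "-cpu host" and pick
# an explicit GIC version instead; a hedged, commented-out
# sketch of the alternative section:
#
# [machine]
#   type = "virt"
#   accel = "tcg"
#   gic-version = "2"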
+ +[machine] + type = "virt" + accel = "kvm" + gic-version = "host" + +[memory] + size = "1024" + + +# Firmware configuration +# ========================================================= +# +# There are two parts to the firmware: a read-only image +# containing the executable code, which is shared between +# guests, and a read/write variable store that is owned +# by one specific guest, exclusively, and is used to +# record information such as the UEFI boot order. +# +# For any new guest, its permanent, private variable store +# should initially be copied from the template file +# provided along with the firmware binary. +# +# Depending on the OS distribution you're using on the +# host, the name of the package containing the firmware +# binary and variable store template, as well as the paths +# to the files themselves, will be different. For example: +# +# Fedora +# edk2-aarch64 (pkg) +# /usr/share/edk2/aarch64/QEMU_EFI-pflash.raw (bin) +# /usr/share/edk2/aarch64/vars-template-pflash.raw (var) +# +# RHEL +# AAVMF (pkg) +# /usr/share/AAVMF/AAVMF_CODE.fd (bin) +# /usr/share/AAVMF/AAVMF_VARS.fd (var) +# +# Debian/Ubuntu +# qemu-efi (pkg) +# /usr/share/AAVMF/AAVMF_CODE.fd (bin) +# /usr/share/AAVMF/AAVMF_VARS.fd (var) + +[drive "uefi-binary"] + file = "/usr/share/AAVMF/AAVMF_CODE.fd" # CHANGE ME + format = "raw" + if = "pflash" + unit = "0" + readonly = "on" + +[drive "uefi-varstore"] + file = "guest_VARS.fd" # CHANGE ME + format = "raw" + if = "pflash" + unit = "1" + + +# PCI bridge (PCI Express Root Ports) +# ========================================================= +# +# We create eight PCI Express Root Ports, and we plug them +# all into separate functions of the same slot. Some of +# them will be used by devices, the rest will remain +# available for hotplug. + +[device "pcie.1"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.0" + port = "1" + chassis = "1" + multifunction = "on" + +[device "pcie.2"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.1" + port = "2" + chassis = "2" + +[device "pcie.3"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.2" + port = "3" + chassis = "3" + +[device "pcie.4"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.3" + port = "4" + chassis = "4" + +[device "pcie.5"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.4" + port = "5" + chassis = "5" + +[device "pcie.6"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.5" + port = "6" + chassis = "6" + +[device "pcie.7"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.6" + port = "7" + chassis = "7" + +[device "pcie.8"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.7" + port = "8" + chassis = "8" + + +# SCSI storage controller (and storage) +# ========================================================= +# +# We use virtio-scsi here so that we can (hot)plug a large +# number of disks without running into issues; a SCSI disk, +# backed by a qcow2 disk image on the host's filesystem, is +# attached to it. +# +# We also create an optical disk, mostly for installation +# purposes: once the guest OS has been successfully +# installed, the guest will no longer boot from optical +# media. If you don't want, or no longer want, to have an +# optical disk in the guest, you can safely comment out +# all relevant sections below.
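# Additional disks can be attached to the same controller
# simply by adding more scsi-hd/drive pairs; a hedged,
# commented-out sketch (the file name is hypothetical):
#
# [device "scsi-disk-2"]
#   driver = "scsi-hd"
#   bus = "scsi.0"
#   drive = "disk-2"
#
# [drive "disk-2"]
#   file = "data.qcow2"
#   format = "qcow2"
#   if = "none"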
+ +[device "scsi"] + driver = "virtio-scsi-pci" + bus = "pcie.1" + addr = "00.0" + +[device "scsi-disk"] + driver = "scsi-hd" + bus = "scsi.0" + drive = "disk" + bootindex = "1" + +[drive "disk"] + file = "guest.qcow2" # CHANGE ME + format = "qcow2" + if = "none" + +[device "scsi-optical-disk"] + driver = "scsi-cd" + bus = "scsi.0" + drive = "optical-disk" + bootindex = "2" + +[drive "optical-disk"] + file = "install.iso" # CHANGE ME + format = "raw" + if = "none" + + +# Ethernet controller +# ========================================================= +# +# We use virtio-net for improved performance over emulated +# hardware; on the host side, we take advantage of user +# networking so that the QEMU process doesn't require any +# additional privileges. + +[netdev "hostnet"] + type = "user" + +[device "net"] + driver = "virtio-net-pci" + netdev = "hostnet" + bus = "pcie.2" + addr = "00.0" diff --git a/docs/migration.txt b/docs/migration.txt index 6503c17685..1b940a829b 100644 --- a/docs/migration.txt +++ b/docs/migration.txt @@ -161,6 +161,11 @@ include/hw/hw.h. === More about versions === +Version numbers are intended for major incompatible changes to the +migration of a device, and using them breaks backwards-migration +compatibility; in general most changes can be made by adding Subsections +(see below) or _TEST macros (see below) which won't break compatibility. + You can see that there are several version fields: - version_id: the maximum version_id supported by VMState for that device. @@ -175,6 +180,9 @@ version_id. And the function load_state_old() (if present) is able to load state from minimum_version_id_old to minimum_version_id. This function is deprecated and will be removed when no more users are left. +Saving state will always create a section with the 'version_id' value +and thus can't be loaded by any older QEMU. + === Massaging functions === Sometimes, it is not enough to be able to save the state directly @@ -292,6 +300,56 @@ save/send this state when we are in the middle of a pio operation not enabled, the values on that fields are garbage and don't need to be sent. +Using a condition function that checks a 'property' to determine whether +to send a subsection allows backwards migration compatibility when +new subsections are added. + +For example: + a) Add a new property using DEFINE_PROP_BOOL - e.g. support-foo and + default it to true. + b) Add an entry to the HW_COMPAT_ for the previous version + that sets the property to false. + c) Add a static bool support_foo function that tests the property. + d) Add a subsection with a .needed set to the support_foo function. + e) (potentially) Add a pre_load that sets up a default value for 'foo' + to be used if the subsection isn't loaded. + +Now the subsection will not be generated when using an older +machine type, and the migration stream will be accepted by older +QEMU versions. pre_load functions can be used to initialise state +on the newer version so that fields default to suitable values +when loading streams created by older QEMU versions that do not +generate the subsection. A hedged sketch of this recipe follows.
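Putting the recipe together in C (the device type TestDevice and the
field 'foo' are hypothetical, not taken from the tree):

   static bool support_foo_needed(void *opaque)
   {
       TestDevice *d = opaque;

       /* false for machine types that predate the subsection,
        * via DEFINE_PROP_BOOL + the HW_COMPAT_ entry */
       return d->support_foo;
   }

   static const VMStateDescription vmstate_foo = {
       .name = "testdevice/foo",
       .version_id = 1,
       .minimum_version_id = 1,
       .needed = support_foo_needed,
       .fields = (VMStateField[]) {
           VMSTATE_UINT32(foo, TestDevice),
           VMSTATE_END_OF_LIST()
       }
   };

The subsection is then listed in the device's main VMStateDescription
through its .subsections array, terminated by a NULL entry.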
In some cases subsections are added for data that had been accidentally +omitted by earlier versions; if the missing data causes the migration +process to succeed but the guest to behave badly, then it may be better +to send the subsection and cause the migration to explicitly fail +with the unknown subsection error. If the bad behaviour only happens +with certain data values, making the subsection conditional on +the data value (rather than the machine type) allows migrations to succeed +in most cases. In general the preference is to tie the subsection to +the machine type, and allow reliable migrations, unless the behaviour +from omission of the subsection is really bad. + += Not sending existing elements = + +Sometimes members of the VMState are no longer needed: removing them +outright will break migration compatibility, while making them version +dependent and bumping the version will break backwards-migration +compatibility. + +The best way is to: + a) Add a new property/compatibility/function in the same way as for + subsections above. + b) replace the VMSTATE macro with the _TEST version of the macro, e.g.: + VMSTATE_UINT32(foo, barstruct) + becomes + VMSTATE_UINT32_TEST(foo, barstruct, pre_version_baz) + + Sometime in the future, when we no longer care about the ancient +versions, these can be killed off. + = Return path = In most migration scenarios there is only a single data path that runs @@ -482,3 +540,16 @@ request for a page that has already been sent is ignored. Duplicate requests such as this can happen as a page is sent at about the same time the destination accesses it. +=== Postcopy with hugepages === + +Postcopy now works with hugetlbfs-backed memory: + a) The Linux kernel on the destination must support userfault on hugepages. + b) The huge-page configuration on the source and destination VMs must be + identical; i.e. RAMBlocks on both sides must use the same page size. + c) Note that -mem-path /dev/hugepages will fall back to allocating normal + RAM if it doesn't have enough hugepages, causing (b) to fail. + Using -mem-prealloc enforces the allocation using hugepages. + d) Care should be taken with the size of hugepage used; postcopy with 2MB + hugepages works well, however 1GB hugepages are likely to be problematic + since it takes ~1 second to transfer a 1GB hugepage across a 10Gbps link, + and until the full page is transferred the destination thread is blocked. diff --git a/docs/multi-thread-tcg.txt b/docs/multi-thread-tcg.txt new file mode 100644 index 0000000000..a99b4564c6 --- /dev/null +++ b/docs/multi-thread-tcg.txt @@ -0,0 +1,350 @@ +Copyright (c) 2015-2016 Linaro Ltd. + +This work is licensed under the terms of the GNU GPL, version 2 or +later. See the COPYING file in the top-level directory. + +Introduction +============ + +This document outlines the design for multi-threaded TCG system-mode +emulation. The current user-mode emulation mirrors the thread +structure of the translated executable. Some of the work will be +applicable to both system and linux-user emulation. + +The original system-mode TCG implementation was single threaded and +dealt with multiple CPUs with simple round-robin scheduling. This +simplified a lot of things but became increasingly limited as systems +being emulated gained additional cores and per-core performance gains +for host systems started to level off. + +vCPU Scheduling +=============== + +We introduce a new running mode where each vCPU will run on its own +user-space thread. This will be enabled by default for all FE/BE +combinations that have had the required work done to support this +safely. + +In the general case of running translated code there should be no +inter-vCPU dependencies and all vCPUs should be able to run at full +speed.
Synchronisation will only be required while accessing internal +shared data structures or when the emulated architecture requires a +coherent representation of the emulated machine state. + +Shared Data Structures +====================== + +Main Run Loop +------------- + +Even when there is no code being generated there are a number of +structures associated with the hot-path through the main run-loop. +These are associated with looking up the next translation block to +execute. These include: + + tb_jmp_cache (per-vCPU, cache of recent jumps) + tb_ctx.htable (global hash table, phys address->tb lookup) + +As TB linking only occurs when blocks are in the same page this code +is critical to performance as looking up the next TB to execute is the +most common reason to exit the generated code. + +DESIGN REQUIREMENT: Make access to lookup structures safe with +multiple reader/writer threads. Minimise any lock contention to do it. + +The hot-path avoids using locks where possible. The tb_jmp_cache is +updated with atomic accesses to ensure consistent results. The fall +back QHT based hash table is also designed for lockless lookups. Locks +are only taken when code generation is required or TranslationBlocks +have their block-to-block jumps patched. + +Global TCG State +---------------- + +We need to protect the entire code generation cycle including any post +generation patching of the translated code. This also implies a shared +translation buffer which contains code running on all cores. Any +execution path that comes to the main run loop will need to hold a +mutex for code generation. This also includes times when we need flush +code or entries from any shared lookups/caches. Structures held on a +per-vCPU basis won't need locking unless other vCPUs will need to +modify them. + +DESIGN REQUIREMENT: Add locking around all code generation and TB +patching. + +(Current solution) + +Mainly as part of the linux-user work all code generation is +serialised with a tb_lock(). For the SoftMMU tb_lock() also takes the +place of mmap_lock() in linux-user. + +Translation Blocks +------------------ + +Currently the whole system shares a single code generation buffer +which when full will force a flush of all translations and start from +scratch again. Some operations also force a full flush of translations +including: + + - debugging operations (breakpoint insertion/removal) + - some CPU helper functions + +This is done with the async_safe_run_on_cpu() mechanism to ensure all +vCPUs are quiescent when changes are being made to shared global +structures. + +More granular translation invalidation events are typically due +to a change of the state of a physical page: + + - code modification (self modify code, patching code) + - page changes (new page mapping in linux-user mode) + +While setting the invalid flag in a TranslationBlock will stop it +being used when looked up in the hot-path there are a number of other +book-keeping structures that need to be safely cleared. + +Any TranslationBlocks which have been patched to jump directly to the +now invalid blocks need the jump patches reversing so they will return +to the C code. 
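A hedged sketch of the hot-path check described above (the helper and
field names here are illustrative rather than the literal QEMU code):

   /* Probe the per-vCPU jump cache; the atomic read means a
    * concurrent invalidation cannot leave us with a torn pointer,
    * and a TB marked invalid is simply not returned. */
   static TranslationBlock *tb_jmp_cache_probe(CPUState *cpu,
                                               target_ulong pc)
   {
       unsigned int h = tb_jmp_cache_hash_func(pc);
       TranslationBlock *tb = atomic_read(&cpu->tb_jmp_cache[h]);

       if (tb && tb->pc == pc && !tb->invalid) {
           return tb;
       }
       return NULL;    /* fall back to the QHT hash table lookup */
   }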
+ +There are a number of look-up caches that need to be properly updated +including the: + + - jump lookup cache + - the physical-to-tb lookup hash table + - the global page table + +The global page table (l1_map) provides a multi-level look-up +for PageDesc structures, which contain pointers to the start of a +linked list of all Translation Blocks in that page (see page_next). + +Both the jump patching and the page cache involve linked lists that +the invalidated TranslationBlock needs to be removed from. + +DESIGN REQUIREMENT: Safely handle invalidation of TBs + - safely patch/revert direct jumps + - remove central PageDesc lookup entries + - ensure lookup caches/hashes are safely updated + +(Current solution) + +The direct jumps themselves are updated atomically by the TCG +tb_set_jmp_target() code. Modifications to the linked lists that allow +searching for linked pages are done under the protection of the +tb_lock(). + +The global page table is protected by the tb_lock() in system-mode and +mmap_lock() in linux-user mode. + +The lookup caches are updated atomically and the lookup hash uses QHT, +which is designed for concurrent safe lookup. + + +Memory maps and TLBs +-------------------- + +The memory handling code is fairly critical to the speed of memory +access in the emulated system. The SoftMMU code is designed so the +hot-path can be handled entirely within translated code. This is +handled with a per-vCPU TLB structure which, once populated, will allow +a series of accesses to the page to occur without exiting the +translated code. It is possible to set flags in the TLB address which +will ensure the slow-path is taken for each access. This can be done +to support: + + - Memory regions (dividing up access to PIO, MMIO and RAM) + - Dirty page tracking (for code gen, SMC detection, migration and display) + - Virtual TLB (for translating guest address->real address) + +When a vCPU's TLB tables are updated by a thread other than its own, +we need to ensure it is done in a safe way so no inconsistent state is +seen by the vCPU thread. + +Some operations require updating a number of vCPUs' TLBs at the same +time in a synchronised manner. + +DESIGN REQUIREMENTS: + + - TLB Flush All/Page + - can be across-vCPUs + - cross vCPU TLB flush may need other vCPU brought to halt + - change may need to be visible to the calling vCPU immediately + - TLB Flag Update + - usually cross-vCPU + - want change to be visible as soon as possible + - TLB Update (update a CPUTLBEntry, via tlb_set_page_with_attrs) + - This is a per-vCPU table - by definition can't race + - updated by its own thread when the slow-path is forced + +(Current solution) + +We have updated cputlb.c to defer cross-vCPU operations with +async_run_on_cpu(), which ensures each vCPU sees a coherent state when +it next runs its work (in a few instructions' time). + +A new set of operations (tlb_flush_*_all_cpus) takes an additional flag +which, when set, will force synchronisation by setting the source vCPU's +work as "safe work" and exiting the cpu run loop. This ensures that by +the time execution restarts all flush operations have completed. + +TLB flag updates are all done atomically and are also protected by the +tb_lock(), which is used by the functions that update the TLB in bulk. + +(Known limitation) + +Not really a limitation, but the wait mechanism is overly strict for +some architectures which only need flushes completed by a barrier +instruction. This could be a future optimisation.
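A hedged sketch of the deferral pattern just described (the
async_run_on_cpu() machinery is the one named above, but the wrapper
itself is illustrative, not the cputlb.c code):

   static void flush_page_work(CPUState *cpu, run_on_cpu_data data)
   {
       tlb_flush_page(cpu, (target_ulong)data.target_ptr);
   }

   void tlb_flush_page_any_cpu(CPUState *cpu, target_ulong addr)
   {
       if (qemu_cpu_is_self(cpu)) {
           /* Our own TLB: safe to modify in place. */
           tlb_flush_page(cpu, addr);
       } else {
           /* Another vCPU's TLB: queue the flush so the owning
            * thread applies it at the next safe point. */
           async_run_on_cpu(cpu, flush_page_work,
                            RUN_ON_CPU_TARGET_PTR(addr));
       }
   }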
+ +Emulated hardware state +----------------------- + +Currently, thanks to KVM work, any access to IO memory is automatically +protected by the global iothread mutex, also known as the BQL (Big +Qemu Lock). Any IO region that doesn't use the global mutex is expected +to do its own locking. + +However IO memory isn't the only way emulated hardware state can be +modified. Some architectures have model specific registers that +trigger hardware emulation features. Generally any translation helper +that needs to update more than a single vCPU's state should take the +BQL. + +As the BQL, or global iothread mutex, is shared across the system, we +push the use of the lock as far down into the TCG code as possible to +minimise contention. + +(Current solution) + +MMIO access automatically serialises hardware emulation by way of the +BQL. Currently ARM targets serialise all ARM_CP_IO register accesses +and also defer the reset/startup of vCPUs to the vCPU context by way +of async_run_on_cpu(). + +Updates to interrupt state are also protected by the BQL, as they can +often be cross-vCPU. + +Memory Consistency +================== + +Between emulated guests and host systems there are a range of memory +consistency models. Even emulating weakly ordered systems on strongly +ordered hosts needs to ensure things like store-after-load re-ordering +can be prevented when the guest wants to. + +Memory Barriers +--------------- + +Barriers (sometimes known as fences) provide a mechanism for software +to enforce a particular ordering of memory operations from the point +of view of external observers (e.g. another processor core). They can +apply to all memory operations or to just loads or stores. + +The Linux kernel has an excellent write-up on the various forms of +memory barrier and the guarantees they can provide [1]. + +Barriers are often wrapped around synchronisation primitives to +provide explicit memory ordering semantics. However they can be used +by themselves to provide safe lockless access by ensuring, for example, +that a change to a signal flag will only be visible once the changes to +the payload are. + +DESIGN REQUIREMENT: Add a new tcg_memory_barrier op + +This would enforce a strong load/store ordering so all loads/stores +complete at the memory barrier. On single-core non-SMP strongly +ordered backends this could become a NOP. + +Aside from explicit standalone memory barrier instructions there are +also implicit memory ordering semantics which come with each guest +memory access instruction. For example all x86 load/stores come with +fairly strong guarantees of sequential consistency, whereas ARM has +special variants of load/store instructions that imply acquire/release +semantics. + +In the case of a strongly ordered guest architecture being emulated on +a weakly ordered host, the scope for a heavy performance impact is +quite high. + +DESIGN REQUIREMENTS: Be efficient with use of memory barriers + - host systems with stronger implied guarantees can skip some barriers + - merge consecutive barriers to the strongest one + +(Current solution) + +The system currently has a tcg_gen_mb() which will add memory barrier +operations if code generation is being done in a parallel context. The +tcg_optimize() function attempts to merge barriers up to their +strongest form before any load/store operations. The solution was +originally developed and tested for linux-user based systems. All +backends have been converted to emit fences when required.
So far the +following front-ends have been updated to emit fences when required: + + - target-i386 + - target-arm + - target-aarch64 + - target-alpha + - target-mips + +Memory Control and Maintenance +------------------------------ + +This includes a class of instructions for controlling system cache +behaviour. While QEMU doesn't model cache behaviour, these instructions +are often seen when code modification has taken place to ensure the +changes take effect. + +Synchronisation Primitives +-------------------------- + +There are two broad types of synchronisation primitives found in +modern ISAs: atomic instructions and exclusive regions. + +The first type offers a simple atomic instruction which will guarantee +that some sort of test and conditional store will be truly atomic w.r.t. +other cores sharing access to the memory. The classic example is the +x86 cmpxchg instruction. + +The second type offers a pair of load/store instructions which offer a +guarantee that a region of memory has not been touched between the +load and store instructions. An example of this is ARM's ldrex/strex +pair, where the strex instruction will return a flag indicating a +successful store only if no other CPU has accessed the memory region +since the ldrex. + +Traditionally TCG has generated a series of operations that work +because they are within the context of a single translation block, so +they will have completed before another CPU is scheduled. However with +the ability to have multiple threads running to emulate multiple CPUs +we will need to explicitly expose these semantics. + +DESIGN REQUIREMENTS: + - Support classic atomic instructions + - Support load/store exclusive (or load link/store conditional) pairs + - Generic enough infrastructure to support all guest architectures +CURRENT OPEN QUESTIONS: + - How problematic is the ABA problem in general? + +(Current solution) + +The TCG provides a number of atomic helpers (tcg_gen_atomic_*) which +can be used directly or combined to emulate other instructions like +ARM's ldrex/strex instructions. While they are susceptible to the ABA +problem, so far common guests have not implemented patterns where +this may be a problem - typically presenting a locking ABI which +assumes cmpxchg-like semantics. + +The code also includes a fall-back for cases where multi-threaded TCG +ops can't work (e.g. guest atomic width > host atomic width). In this +case an EXCP_ATOMIC exit occurs and the instruction is emulated with +an exclusive lock which ensures all emulation is serialised. + +While the atomic helpers look good enough for now, there may be a need +to look at solutions that can more closely model the guest +architecture's semantics. + +========== + +[1] https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/plain/Documentation/memory-barriers.txt diff --git a/docs/q35-chipset.cfg b/docs/q35-chipset.cfg deleted file mode 100644 index e4ddb7d9cc..0000000000 --- a/docs/q35-chipset.cfg +++ /dev/null @@ -1,152 +0,0 @@ -################################################################ -# -# qemu -M q35 creates a bare machine with just the very essential -# chipset devices being present: -# -# 00.0 - Host bridge -# 1f.0 - ISA bridge / LPC -# 1f.2 - SATA (AHCI) controller -# 1f.3 - SMBus controller -# -# This config file documents the other devices and how they are -# created. You can simply use "-readconfig $thisfile" to create -# them all. Here is a overview: -# -# 19.0 - Ethernet controller (not created, our e1000 emulation -# doesn't emulate the ich9 device).
-# 1a.* - USB Controller #2 (ehci + uhci companions) -# 1b.0 - HD Audio Controller -# 1c.* - PCI Express Ports -# 1d.* - USB Controller #1 (ehci + uhci companions, -# "qemu -M q35 -usb" creates these too) -# 1e.0 - PCI Bridge -# - -[device "ich9-ehci-2"] - driver = "ich9-usb-ehci2" - multifunction = "on" - bus = "pcie.0" - addr = "1a.7" - -[device "ich9-uhci-4"] - driver = "ich9-usb-uhci4" - multifunction = "on" - bus = "pcie.0" - addr = "1a.0" - masterbus = "ich9-ehci-2.0" - firstport = "0" - -[device "ich9-uhci-5"] - driver = "ich9-usb-uhci5" - multifunction = "on" - bus = "pcie.0" - addr = "1a.1" - masterbus = "ich9-ehci-2.0" - firstport = "2" - -[device "ich9-uhci-6"] - driver = "ich9-usb-uhci6" - multifunction = "on" - bus = "pcie.0" - addr = "1a.2" - masterbus = "ich9-ehci-2.0" - firstport = "4" - - -[device "ich9-hda-audio"] - driver = "ich9-intel-hda" - bus = "pcie.0" - addr = "1b.0" - - -[device "ich9-pcie-port-1"] - driver = "ioh3420" - multifunction = "on" - bus = "pcie.0" - addr = "1c.0" - port = "1" - chassis = "1" - -[device "ich9-pcie-port-2"] - driver = "ioh3420" - multifunction = "on" - bus = "pcie.0" - addr = "1c.1" - port = "2" - chassis = "2" - -[device "ich9-pcie-port-3"] - driver = "ioh3420" - multifunction = "on" - bus = "pcie.0" - addr = "1c.2" - port = "3" - chassis = "3" - -[device "ich9-pcie-port-4"] - driver = "ioh3420" - multifunction = "on" - bus = "pcie.0" - addr = "1c.3" - port = "4" - chassis = "4" - -## -# Example PCIe switch with two downstream ports -# -#[device "pcie-switch-upstream-port-1"] -# driver = "x3130-upstream" -# bus = "ich9-pcie-port-4" -# addr = "00.0" -# -#[device "pcie-switch-downstream-port-1-1"] -# driver = "xio3130-downstream" -# multifunction = "on" -# bus = "pcie-switch-upstream-port-1" -# addr = "00.0" -# port = "1" -# chassis = "5" -# -#[device "pcie-switch-downstream-port-1-2"] -# driver = "xio3130-downstream" -# multifunction = "on" -# bus = "pcie-switch-upstream-port-1" -# addr = "00.1" -# port = "1" -# chassis = "6" - -[device "ich9-ehci-1"] - driver = "ich9-usb-ehci1" - multifunction = "on" - bus = "pcie.0" - addr = "1d.7" - -[device "ich9-uhci-1"] - driver = "ich9-usb-uhci1" - multifunction = "on" - bus = "pcie.0" - addr = "1d.0" - masterbus = "ich9-ehci-1.0" - firstport = "0" - -[device "ich9-uhci-2"] - driver = "ich9-usb-uhci2" - multifunction = "on" - bus = "pcie.0" - addr = "1d.1" - masterbus = "ich9-ehci-1.0" - firstport = "2" - -[device "ich9-uhci-3"] - driver = "ich9-usb-uhci3" - multifunction = "on" - bus = "pcie.0" - addr = "1d.2" - masterbus = "ich9-ehci-1.0" - firstport = "4" - - -[device "ich9-pci-bridge"] - driver = "i82801b11-bridge" - bus = "pcie.0" - addr = "1e.0" diff --git a/docs/q35-emulated.cfg b/docs/q35-emulated.cfg new file mode 100644 index 0000000000..c6416d6545 --- /dev/null +++ b/docs/q35-emulated.cfg @@ -0,0 +1,288 @@ +# q35 - Emulated guest (graphical console) +# ========================================================= +# +# Usage: +# +# $ qemu-system-x86_64 \ +# -nodefaults \ +# -readconfig q35-emulated.cfg +# +# You will probably need to tweak the lines marked as +# CHANGE ME before being able to use this configuration! +# +# The guest will have a selection of emulated devices that +# closely resembles that of a physical machine, and will be +# accessed through a graphical console. 
+# +# --------------------------------------------------------- +# +# Using -nodefaults is required to have full control over +# the virtual hardware: when it's specified, QEMU will +# populate the board with only the builtin peripherals +# plus a small selection of core PCI devices and +# controllers; the user will then have to explicitly add +# further devices. +# +# The core PCI devices show up in the guest as: +# +# 00:00.0 Host bridge +# 00:1f.0 ISA bridge / LPC +# 00:1f.2 SATA (AHCI) controller +# 00:1f.3 SMBus controller +# +# This configuration file adds a number of devices that +# are pretty much guaranteed to be present in every single +# physical machine based on q35, more specifically: +# +# 00:01.0 VGA compatible controller +# 00:19.0 Ethernet controller +# 00:1a.* USB controller (#2) +# 00:1b.0 Audio device +# 00:1c.* PCI bridge (PCI Express Root Ports) +# 00:1d.* USB Controller (#1) +# 00:1e.0 PCI bridge (legacy PCI bridge) +# +# More information about these devices is available below. + + +# Machine options +# ========================================================= +# +# We use the q35 machine type and enable KVM acceleration +# for better performance. +# +# Using less than 1 GiB of memory is probably not going to +# yield good performance in the guest, and might even lead +# to obscure boot issues in some cases. +# +# Unfortunately, there is no way to configure the CPU model +# in this file, so it will have to be provided on the +# command line. + +[machine] + type = "q35" + accel = "kvm" + +[memory] + size = "1024" + + +# PCI bridge (PCI Express Root Ports) +# ========================================================= +# +# We add four PCI Express Root Ports, all sharing the same +# slot on the PCI Express Root Bus. These ports support +# hotplug. + +[device "ich9-pcie-port-1"] + driver = "ioh3420" + multifunction = "on" + bus = "pcie.0" + addr = "1c.0" + port = "1" + chassis = "1" + +[device "ich9-pcie-port-2"] + driver = "ioh3420" + multifunction = "on" + bus = "pcie.0" + addr = "1c.1" + port = "2" + chassis = "2" + +[device "ich9-pcie-port-3"] + driver = "ioh3420" + multifunction = "on" + bus = "pcie.0" + addr = "1c.2" + port = "3" + chassis = "3" + +[device "ich9-pcie-port-4"] + driver = "ioh3420" + multifunction = "on" + bus = "pcie.0" + addr = "1c.3" + port = "4" + chassis = "4" + + +# PCI bridge (legacy PCI bridge) +# ========================================================= +# +# This bridge can be used to build an independent topology +# for legacy PCI devices. PCI Express devices should be +# plugged into PCI Express slots instead, so ideally there +# will be no devices connected to this bridge. + +[device "ich9-pci-bridge"] + driver = "i82801b11-bridge" + bus = "pcie.0" + addr = "1e.0" + + +# SATA storage +# ========================================================= +# +# An implicit SATA controller is created automatically for +# every single q35 guest; here we create a disk, backed by +# a qcow2 disk image on the host's filesystem, and attach +# it to that controller so that the guest can use it. +# +# We also create an optical disk, mostly for installation +# purposes: once the guest OS has been succesfully +# installed, the guest will no longer boot from optical +# media. If you don't want, or no longer want, to have an +# optical disk in the guest you can safely comment out +# all relevant sections below. 
+ +[device "sata-disk"] + driver = "ide-hd" + bus = "ide.0" + drive = "disk" + bootindex = "1" + +[drive "disk"] + file = "guest.qcow2" # CHANGE ME + format = "qcow2" + if = "none" + +[device "sata-optical-disk"] + driver = "ide-cd" + bus = "ide.1" + drive = "optical-disk" + bootindex = "2" + +[drive "optical-disk"] + file = "install.iso" # CHANGE ME + format = "raw" + if = "none" + + +# USB controller (#1) +# ========================================================= +# +# EHCI controller + UHCI companion controllers. + +[device "ich9-ehci-1"] + driver = "ich9-usb-ehci1" + multifunction = "on" + bus = "pcie.0" + addr = "1d.7" + +[device "ich9-uhci-1"] + driver = "ich9-usb-uhci1" + multifunction = "on" + bus = "pcie.0" + addr = "1d.0" + masterbus = "ich9-ehci-1.0" + firstport = "0" + +[device "ich9-uhci-2"] + driver = "ich9-usb-uhci2" + multifunction = "on" + bus = "pcie.0" + addr = "1d.1" + masterbus = "ich9-ehci-1.0" + firstport = "2" + +[device "ich9-uhci-3"] + driver = "ich9-usb-uhci3" + multifunction = "on" + bus = "pcie.0" + addr = "1d.2" + masterbus = "ich9-ehci-1.0" + firstport = "4" + + +# USB controller (#2) +# ========================================================= +# +# EHCI controller + UHCI companion controllers. + +[device "ich9-ehci-2"] + driver = "ich9-usb-ehci2" + multifunction = "on" + bus = "pcie.0" + addr = "1a.7" + +[device "ich9-uhci-4"] + driver = "ich9-usb-uhci4" + multifunction = "on" + bus = "pcie.0" + addr = "1a.0" + masterbus = "ich9-ehci-2.0" + firstport = "0" + +[device "ich9-uhci-5"] + driver = "ich9-usb-uhci5" + multifunction = "on" + bus = "pcie.0" + addr = "1a.1" + masterbus = "ich9-ehci-2.0" + firstport = "2" + +[device "ich9-uhci-6"] + driver = "ich9-usb-uhci6" + multifunction = "on" + bus = "pcie.0" + addr = "1a.2" + masterbus = "ich9-ehci-2.0" + firstport = "4" + + +# Ethernet controller +# ========================================================= +# +# We add a Gigabit Ethernet interface to the guest; on the +# host side, we take advantage of user networking so that +# the QEMU process doesn't require any additional +# privileges. + +[netdev "hostnet"] + type = "user" + +[device "net"] + driver = "e1000" + netdev = "hostnet" + bus = "pcie.0" + addr = "19.0" + + +# VGA compatible controller +# ========================================================= +# +# We use stdvga instead of Cirrus as it supports more video +# modes and is closer to what actual hardware looks like. +# +# If you're running the guest on a remote, potentially +# headless host, you will probably want to append something +# like +# +# -display vnc=127.0.0.1:0 +# +# to the command line in order to prevent QEMU from +# creating a graphical display window on the host and +# enable remote access instead. + +[device "video"] + driver = "VGA" + bus = "pcie.0" + addr = "01.0" + + +# Audio device +# ========================================================= +# +# The sound card is a legacy PCI device that is plugged +# directly into the PCI Express Root Bus. 
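# Which host audio backend QEMU uses can be chosen at
# runtime through the QEMU_AUDIO_DRV environment variable,
# e.g. (the driver name is just an example):
#
#   $ QEMU_AUDIO_DRV=alsa qemu-system-x86_64 ...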
+ +[device "ich9-hda-audio"] + driver = "ich9-intel-hda" + bus = "pcie.0" + addr = "1b.0" + +[device "ich9-hda-duplex"] + driver = "hda-duplex" + bus = "ich9-hda-audio.0" + cad = "0" diff --git a/docs/q35-virtio-graphical.cfg b/docs/q35-virtio-graphical.cfg new file mode 100644 index 0000000000..28bde2fc57 --- /dev/null +++ b/docs/q35-virtio-graphical.cfg @@ -0,0 +1,248 @@ +# q35 - VirtIO guest (graphical console) +# ========================================================= +# +# Usage: +# +# $ qemu-system-x86_64 \ +# -nodefaults \ +# -readconfig q35-virtio-graphical.cfg +# +# You will probably need to tweak the lines marked as +# CHANGE ME before being able to use this configuration! +# +# The guest will have a selection of VirtIO devices +# tailored towards optimal performance with modern guests, +# and will be accessed through a graphical console. +# +# --------------------------------------------------------- +# +# Using -nodefaults is required to have full control over +# the virtual hardware: when it's specified, QEMU will +# populate the board with only the builtin peripherals +# plus a small selection of core PCI devices and +# controllers; the user will then have to explicitly add +# further devices. +# +# The core PCI devices show up in the guest as: +# +# 00:00.0 Host bridge +# 00:1f.0 ISA bridge / LPC +# 00:1f.2 SATA (AHCI) controller +# 00:1f.3 SMBus controller +# +# This configuration file adds a number of other useful +# devices, more specifically: +# +# 00:01.0 VGA compatible controller +# 00:1b.0 Audio device +# 00.1c.* PCI bridge (PCI Express Root Ports) +# 01:00.0 SCSI storage controller +# 02:00.0 Ethernet controller +# 03:00.0 USB controller +# +# More information about these devices is available below. + + +# Machine options +# ========================================================= +# +# We use the q35 machine type and enable KVM acceleration +# for better performance. +# +# Using less than 1 GiB of memory is probably not going to +# yield good performance in the guest, and might even lead +# to obscure boot issues in some cases. + +[machine] + type = "q35" + accel = "kvm" + +[memory] + size = "1024" + + +# PCI bridge (PCI Express Root Ports) +# ========================================================= +# +# We create eight PCI Express Root Ports, and we plug them +# all into separate functions of the same slot. Some of +# them will be used by devices, the rest will remain +# available for hotplug. 
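# For instance, a device can later be hotplugged into one of
# the spare ports from the QEMU monitor; a hedged example
# (the netdev would have to be created first via netdev_add):
#
#   (qemu) device_add virtio-net-pci,netdev=hostnet2,bus=pcie.4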
+ +[device "pcie.1"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.0" + port = "1" + chassis = "1" + multifunction = "on" + +[device "pcie.2"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.1" + port = "2" + chassis = "2" + +[device "pcie.3"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.2" + port = "3" + chassis = "3" + +[device "pcie.4"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.3" + port = "4" + chassis = "4" + +[device "pcie.5"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.4" + port = "5" + chassis = "5" + +[device "pcie.6"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.5" + port = "6" + chassis = "6" + +[device "pcie.7"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.6" + port = "7" + chassis = "7" + +[device "pcie.8"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.7" + port = "8" + chassis = "8" + + +# SCSI storage controller (and storage) +# ========================================================= +# +# We use virtio-scsi here so that we can (hot)plug a large +# number of disks without running into issues; a SCSI disk, +# backed by a qcow2 disk image on the host's filesystem, is +# attached to it. +# +# We also create an optical disk, mostly for installation +# purposes: once the guest OS has been successfully +# installed, the guest will no longer boot from optical +# media. If you don't want, or no longer want, to have an +# optical disk in the guest, you can safely comment out +# all relevant sections below. + +[device "scsi"] + driver = "virtio-scsi-pci" + bus = "pcie.1" + addr = "00.0" + +[device "scsi-disk"] + driver = "scsi-hd" + bus = "scsi.0" + drive = "disk" + bootindex = "1" + +[drive "disk"] + file = "guest.qcow2" # CHANGE ME + format = "qcow2" + if = "none" + +[device "scsi-optical-disk"] + driver = "scsi-cd" + bus = "scsi.0" + drive = "optical-disk" + bootindex = "2" + +[drive "optical-disk"] + file = "install.iso" # CHANGE ME + format = "raw" + if = "none" + + +# Ethernet controller +# ========================================================= +# +# We use virtio-net for improved performance over emulated +# hardware; on the host side, we take advantage of user +# networking so that the QEMU process doesn't require any +# additional privileges. + +[netdev "hostnet"] + type = "user" + +[device "net"] + driver = "virtio-net-pci" + netdev = "hostnet" + bus = "pcie.2" + addr = "00.0" + + +# USB controller (and input devices) +# ========================================================= +# +# We add a virtualization-friendly USB 3.0 controller and +# a USB tablet so that graphical guests can be controlled +# appropriately. A USB keyboard is not needed, as q35 +# guests get a PS/2 one added automatically. + +[device "usb"] + driver = "nec-usb-xhci" + bus = "pcie.3" + addr = "00.0" + +[device "tablet"] + driver = "usb-tablet" + bus = "usb.0" + + +# VGA compatible controller +# ========================================================= +# +# We plug the QXL video card directly into the PCI Express +# Root Bus as it is a legacy PCI device; this way, we can +# reduce the number of PCI Express controllers in the +# guest. +# +# If you're running the guest on a remote, potentially +# headless host, you will probably want to append something +# like +# +# -display vnc=127.0.0.1:0 +# +# to the command line in order to prevent QEMU from +# creating a graphical display window on the host and +# enable remote access instead.
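# Since the video card is QXL, a SPICE display is a natural
# alternative to VNC; a hedged example of the extra command
# line options:
#
#   -spice port=5900,disable-ticketing
#
# with a SPICE client then used to connect to the host.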
+ +[device "video"] + driver = "qxl-vga" + bus = "pcie.0" + addr = "01.0" + + +# Audio device +# ========================================================= +# +# Like the video card, the sound card is a legacy PCI +# device and as such can be plugged directly into the PCI +# Express Root Bus. + +[device "sound"] + driver = "ich9-intel-hda" + bus = "pcie.0" + addr = "1b.0" + +[device "duplex"] + driver = "hda-duplex" + bus = "sound.0" + cad = "0" diff --git a/docs/q35-virtio-serial.cfg b/docs/q35-virtio-serial.cfg new file mode 100644 index 0000000000..c33c9cc07a --- /dev/null +++ b/docs/q35-virtio-serial.cfg @@ -0,0 +1,193 @@ +# q35 - VirtIO guest (serial console) +# ========================================================= +# +# Usage: +# +# $ qemu-system-x86_64 \ +# -nodefaults \ +# -readconfig q35-virtio-serial.cfg \ +# -display none -serial mon:stdio +# +# You will probably need to tweak the lines marked as +# CHANGE ME before being able to use this configuration! +# +# The guest will have a selection of VirtIO devices +# tailored towards optimal performance with modern guests, +# and will be accessed through the serial console. +# +# --------------------------------------------------------- +# +# Using -nodefaults is required to have full control over +# the virtual hardware: when it's specified, QEMU will +# populate the board with only the builtin peripherals +# plus a small selection of core PCI devices and +# controllers; the user will then have to explicitly add +# further devices. +# +# The core PCI devices show up in the guest as: +# +# 00:00.0 Host bridge +# 00:1f.0 ISA bridge / LPC +# 00:1f.2 SATA (AHCI) controller +# 00:1f.3 SMBus controller +# +# This configuration file adds a number of other useful +# devices, more specifically: +# +# 00.1c.* PCI bridge (PCI Express Root Ports) +# 01:00.0 SCSI storage controller +# 02:00.0 Ethernet controller +# +# More information about these devices is available below. +# +# We use '-display none' to prevent QEMU from creating a +# graphical display window, which would serve no use in +# this specific configuration, and '-serial mon:stdio' to +# multiplex the guest's serial console and the QEMU monitor +# to the host's stdio; use 'Ctrl+A h' to learn how to +# switch between the two and more. + + +# Machine options +# ========================================================= +# +# We use the q35 machine type and enable KVM acceleration +# for better performance. +# +# Using less than 1 GiB of memory is probably not going to +# yield good performance in the guest, and might even lead +# to obscure boot issues in some cases. + +[machine] + type = "q35" + accel = "kvm" + +[memory] + size = "1024" + + +# PCI bridge (PCI Express Root Ports) +# ========================================================= +# +# We create eight PCI Express Root Ports, and we plug them +# all into separate functions of the same slot. Some of +# them will be used by devices, the rest will remain +# available for hotplug. 
+ +[device "pcie.1"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.0" + port = "1" + chassis = "1" + multifunction = "on" + +[device "pcie.2"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.1" + port = "2" + chassis = "2" + +[device "pcie.3"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.2" + port = "3" + chassis = "3" + +[device "pcie.4"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.3" + port = "4" + chassis = "4" + +[device "pcie.5"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.4" + port = "5" + chassis = "5" + +[device "pcie.6"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.5" + port = "6" + chassis = "6" + +[device "pcie.7"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.6" + port = "7" + chassis = "7" + +[device "pcie.8"] + driver = "pcie-root-port" + bus = "pcie.0" + addr = "1c.7" + port = "8" + chassis = "8" + + +# SCSI storage controller (and storage) +# ========================================================= +# +# We use virtio-scsi here so that we can (hot)plug a large +# number of disks without running into issues; a SCSI disk, +# backed by a qcow2 disk image on the host's filesystem, is +# attached to it. +# +# We also create an optical disk, mostly for installation +# purposes: once the guest OS has been successfully +# installed, the guest will no longer boot from optical +# media. If you don't want, or no longer want, to have an +# optical disk in the guest, you can safely comment out +# all relevant sections below. + +[device "scsi"] + driver = "virtio-scsi-pci" + bus = "pcie.1" + addr = "00.0" + +[device "scsi-disk"] + driver = "scsi-hd" + bus = "scsi.0" + drive = "disk" + bootindex = "1" + +[drive "disk"] + file = "guest.qcow2" # CHANGE ME + format = "qcow2" + if = "none" + +[device "scsi-optical-disk"] + driver = "scsi-cd" + bus = "scsi.0" + drive = "optical-disk" + bootindex = "2" + +[drive "optical-disk"] + file = "install.iso" # CHANGE ME + format = "raw" + if = "none" + + +# Ethernet controller +# ========================================================= +# +# We use virtio-net for improved performance over emulated +# hardware; on the host side, we take advantage of user +# networking so that the QEMU process doesn't require any +# additional privileges. + +[netdev "hostnet"] + type = "user" + +[device "net"] + driver = "virtio-net-pci" + netdev = "hostnet" + bus = "pcie.2" + addr = "00.0" diff --git a/docs/qapi-code-gen.txt b/docs/qapi-code-gen.txt index 7eb7be12ab..6746c1052c 100644 --- a/docs/qapi-code-gen.txt +++ b/docs/qapi-code-gen.txt @@ -1138,7 +1138,7 @@ Example: Visitor *v; UserDefOneList *arg1 = NULL; - v = qobject_input_visitor_new(QOBJECT(args), true); + v = qobject_input_visitor_new(QOBJECT(args)); visit_start_struct(v, NULL, NULL, 0, &err); if (err) { goto out; diff --git a/docs/replay.txt b/docs/replay.txt index 03e193193f..486c1e0e9d 100644 --- a/docs/replay.txt +++ b/docs/replay.txt @@ -225,3 +225,10 @@ recording the virtual machine this filter puts all packets coming from the outer world into the log. In replay mode packets from the log are injected into the network device. All interactions with network backend in replay mode are disabled. + +Audio devices +------------- + +Audio data is recorded and replayed automatically.
The command line for recording +and replaying must contain identical specifications of audio hardware, e.g.: + -soundhw ac97 diff --git a/docs/specs/vmgenid.txt b/docs/specs/vmgenid.txt new file mode 100644 index 0000000000..aa9f518676 --- /dev/null +++ b/docs/specs/vmgenid.txt @@ -0,0 +1,245 @@ +VIRTUAL MACHINE GENERATION ID +============================= + +Copyright (C) 2016 Red Hat, Inc. +Copyright (C) 2017 Skyport Systems, Inc. + +This work is licensed under the terms of the GNU GPL, version 2 or later. +See the COPYING file in the top-level directory. + +=== + +The VM generation ID (vmgenid) device is an emulated device which +exposes a 128-bit, cryptographically random, integer value identifier, +referred to as a Globally Unique Identifier, or GUID. + +This allows management applications (e.g. libvirt) to notify the guest +operating system when the virtual machine is executed with a different +configuration (e.g. snapshot execution or creation from a template). The +guest operating system notices the change, and is then able to react as +appropriate by marking its copies of distributed databases as dirty, +re-initializing its random number generator etc. + + +Requirements +------------ + +These requirements are extracted from the "How to implement virtual machine +generation ID support in a virtualization platform" section of the +specification, dated August 1, 2012. + + +The document may be found on the web at: + http://go.microsoft.com/fwlink/?LinkId=260709 + +R1a. The generation ID shall live in an 8-byte aligned buffer. + +R1b. The buffer holding the generation ID shall be in guest RAM, ROM, or device + MMIO range. + +R1c. The buffer holding the generation ID shall be kept separate from areas + used by the operating system. + +R1d. The buffer shall not be covered by an AddressRangeMemory or + AddressRangeACPI entry in the E820 or UEFI memory map. + +R1e. The generation ID shall not live in a page frame that could be mapped with + caching disabled. (In other words, regardless of whether the generation ID + lives in RAM, ROM or MMIO, it shall only be mapped as cacheable.) + +R2 to R5. [These AML requirements are isolated well enough in the Microsoft + specification for us to simply refer to them here.] + +R6. The hypervisor shall expose a _HID (hardware identifier) object in the + VMGenId device's scope that is unique to the hypervisor vendor. + + +QEMU Implementation +------------------- + +The above-mentioned specification does not dictate which ACPI descriptor table +will contain the VM Generation ID device. Other implementations (Hyper-V and +Xen) put it in the main descriptor table (Differentiated System Description +Table or DSDT). For ease of debugging and implementation, we have decided to +put it in its own Secondary System Description Table, or SSDT. 
+
+The following is a dump of the contents from a running system:
+
+# iasl -p ./SSDT -d /sys/firmware/acpi/tables/SSDT
+
+Intel ACPI Component Architecture
+ASL+ Optimizing Compiler version 20150717-64
+Copyright (c) 2000 - 2015 Intel Corporation
+
+Reading ACPI table from file /sys/firmware/acpi/tables/SSDT - Length
+00000198 (0x0000C6)
+ACPI: SSDT 0x0000000000000000 0000C6 (v01 BOCHS VMGENID 00000001 BXPC
+00000001)
+Acpi table [SSDT] successfully installed and loaded
+Pass 1 parse of [SSDT]
+Pass 2 parse of [SSDT]
+Parsing Deferred Opcodes (Methods/Buffers/Packages/Regions)
+
+Parsing completed
+Disassembly completed
+ASL Output: ./SSDT.dsl - 1631 bytes
+# cat SSDT.dsl
+/*
+ * Intel ACPI Component Architecture
+ * AML/ASL+ Disassembler version 20150717-64
+ * Copyright (c) 2000 - 2015 Intel Corporation
+ *
+ * Disassembling to symbolic ASL+ operators
+ *
+ * Disassembly of /sys/firmware/acpi/tables/SSDT, Sun Feb  5 00:19:37 2017
+ *
+ * Original Table Header:
+ *     Signature        "SSDT"
+ *     Length           0x000000CA (202)
+ *     Revision         0x01
+ *     Checksum         0x4B
+ *     OEM ID           "BOCHS "
+ *     OEM Table ID     "VMGENID"
+ *     OEM Revision     0x00000001 (1)
+ *     Compiler ID      "BXPC"
+ *     Compiler Version 0x00000001 (1)
+ */
+DefinitionBlock ("/sys/firmware/acpi/tables/SSDT.aml", "SSDT", 1, "BOCHS ",
+"VMGENID", 0x00000001)
+{
+    Name (VGIA, 0x07FFF000)
+    Scope (\_SB)
+    {
+        Device (VGEN)
+        {
+            Name (_HID, "QEMUVGID")        // _HID: Hardware ID
+            Name (_CID, "VM_Gen_Counter")  // _CID: Compatible ID
+            Name (_DDN, "VM_Gen_Counter")  // _DDN: DOS Device Name
+            Method (_STA, 0, NotSerialized)  // _STA: Status
+            {
+                Local0 = 0x0F
+                If ((VGIA == Zero))
+                {
+                    Local0 = Zero
+                }
+
+                Return (Local0)
+            }
+
+            Method (ADDR, 0, NotSerialized)
+            {
+                Local0 = Package (0x02) {}
+                Index (Local0, Zero) = (VGIA + 0x28)
+                Index (Local0, One) = Zero
+                Return (Local0)
+            }
+        }
+    }
+
+    Method (\_GPE._E05, 0, NotSerialized)  // _Exx: Edge-Triggered GPE
+    {
+        Notify (\_SB.VGEN, 0x80)  // Status Change
+    }
+}
+
+
+Design Details:
+---------------
+
+Requirements R1a through R1e dictate that the memory holding the
+VM Generation ID must be allocated and owned by the guest firmware,
+in this case BIOS or UEFI. However, to be useful, QEMU must be able to
+change the contents of the memory at runtime, specifically when starting a
+backed-up or snapshotted image. In order to do this, QEMU must know the
+address that has been allocated.
+
+The mechanism chosen for this memory sharing is writeable fw_cfg blobs.
+These are data objects that are visible to both QEMU and guests, and are
+addressable as sequential files.
+
+More information about fw_cfg can be found in "docs/specs/fw_cfg.txt".
+
+Two fw_cfg blobs are used in this case:
+
+/etc/vmgenid_guid - contains the actual VM Generation ID GUID
+                  - read-only to the guest
+/etc/vmgenid_addr - contains the address of the downloaded vmgenid blob
+                  - writeable by the guest
+
+QEMU sends the following commands to the guest at startup:
+
+1. Allocate memory for the vmgenid_guid fw_cfg blob.
+2. Write the address of vmgenid_guid into the SSDT (the VGIA ACPI variable,
+   as shown above in the iasl dump). Note that this change is not propagated
+   back to QEMU.
+3. Write the address of vmgenid_guid back to QEMU's copy of vmgenid_addr,
+   via the fw_cfg DMA interface.
+
+After step 3, QEMU is able to update the contents of vmgenid_guid at will.
+
+Since BIOS or UEFI does not necessarily run when we wish to change the GUID,
+the value of VGIA is persisted via the VMState mechanism.
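+
+To make the ADDR method shown in the dump concrete: a guest component
+evaluating it receives a two-element package and must combine the halves
+itself. The following is a minimal sketch under that reading; the function
+name and parameters are illustrative, not part of any real guest driver API.
+
+  #include <stdint.h>
+
+  /* Element 0 of the ADDR package is VGIA + 0x28, i.e. the blob's base
+   * address plus 40 bytes, which is the GUID offset in the storage
+   * format described below. Element 1 holds the high 32 bits (Zero in
+   * the dump above, since VGIA itself is a 32-bit dword). */
+  static uint64_t vmgenid_guid_phys(uint32_t addr_lo, uint32_t addr_hi)
+  {
+      return ((uint64_t)addr_hi << 32) | addr_lo;
+  }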
+
+As spelled out in the specification, any change to the GUID triggers an
+ACPI notification. The exact handler to use is not specified, so the vmgenid
+device uses the first unused one: \_GPE._E05.
+
+
+Endian-ness Considerations:
+---------------------------
+
+Although not specified in Microsoft's document, it is assumed that the
+device uses little-endian format.
+
+All GUIDs passed in via the command line or monitor are treated as
+big-endian. GUID values displayed via the monitor are shown in big-endian
+format.
+
+
+GUID Storage Format:
+--------------------
+
+In order to implement an OVMF "SDT Header Probe Suppressor", the contents of
+the vmgenid_guid fw_cfg blob are not simply a 128-bit GUID. There is also
+significant padding in order to align and fill a memory page, as shown in the
+following diagram:
+
++----------------------------------+
+| SSDT with OEM Table ID = VMGENID |
++----------------------------------+
+| ...                              |       TOP OF PAGE
+| VGIA dword object ---------------|-----> +---------------------------+
+| ...                              |       | fw-allocated array for    |
+| _STA method referring to VGIA    |       | "etc/vmgenid_guid"        |
+| ...                              |       +---------------------------+
+| ADDR method referring to VGIA    |       |  0: OVMF SDT Header probe |
+| ...                              |       |     suppressor            |
++----------------------------------+       | 36: padding for 8-byte    |
+                                           |     alignment             |
+                                           | 40: GUID                  |
+                                           | 56: padding to page size  |
+                                           +---------------------------+
+                                           END OF PAGE
+
+
+Device Usage:
+-------------
+
+The device has one property, which may only be set using the command line:
+
+  guid - sets the value of the GUID. A special value "auto" instructs
+         QEMU to generate a new random GUID.
+
+For example:
+
+  QEMU -device vmgenid,guid="324e6eaf-d1d1-4bf6-bf41-b9bb6c91fb87"
+  QEMU -device vmgenid,guid=auto
+
+The property may be queried via QMP/HMP:
+
+  (QEMU) query-vm-generation-id
+  {"return": {"guid": "324e6eaf-d1d1-4bf6-bf41-b9bb6c91fb87"}}
+
+Setting this property is intentionally left out of the QMP/HMP interfaces.
+There are no known use cases for changing the GUID once QEMU is running, and
+adding this capability would greatly increase the complexity.
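+
+For reference, the page layout described under "GUID Storage Format" above
+can also be written down as a C structure. This is a sketch rather than a
+struct taken from the QEMU sources: the field names are invented, only the
+offsets (0, 36, 40, 56) come from this document, and the 4 KiB page size is
+an assumption.
+
+  #include <stdint.h>
+
+  #define VMGENID_PAGE_SIZE 4096        /* assumed page size */
+
+  typedef struct {
+      uint8_t sdt_probe_suppressor[36]; /* offset  0: OVMF SDT Header
+                                         *            probe suppressor */
+      uint8_t pad_align[4];             /* offset 36: 8-byte alignment */
+      uint8_t guid[16];                 /* offset 40: the generation ID */
+      uint8_t pad_page[VMGENID_PAGE_SIZE - 56]; /* offset 56: to page end */
+  } VmgenidGuidBlob;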