diff options
103 files changed, 2554 insertions, 703 deletions
@@ -1271,7 +1271,7 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp) QemuOpts *opts = NULL; QDict *snapshot_options; BlockDriverState *bs_snapshot; - Error *local_err; + Error *local_err = NULL; int ret; /* if snapshot, we create a temporary backing file and open it @@ -1841,9 +1841,9 @@ void bdrv_close(BlockDriverState *bs) if (bs->job) { block_job_cancel_sync(bs->job); } - bdrv_drain_all(); /* complete I/O */ + bdrv_drain(bs); /* complete I/O */ bdrv_flush(bs); - bdrv_drain_all(); /* in case flush left pending I/O */ + bdrv_drain(bs); /* in case flush left pending I/O */ notifier_list_notify(&bs->close_notifiers, bs); if (bs->drv) { @@ -3906,7 +3906,7 @@ void bdrv_attach_aio_context(BlockDriverState *bs, void bdrv_set_aio_context(BlockDriverState *bs, AioContext *new_context) { - bdrv_drain_all(); /* ensure there are no in-flight requests */ + bdrv_drain(bs); /* ensure there are no in-flight requests */ bdrv_detach_aio_context(bs); diff --git a/block/io.c b/block/io.c index 305e0d952e..d4bc83b33b 100644 --- a/block/io.c +++ b/block/io.c @@ -236,12 +236,12 @@ static bool bdrv_requests_pending(BlockDriverState *bs) /* * Wait for pending requests to complete on a single BlockDriverState subtree * - * See the warning in bdrv_drain_all(). This function can only be called if - * you are sure nothing can generate I/O because you have op blockers - * installed. - * * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState * AioContext. + * + * Only this BlockDriverState's AioContext is run, so in-flight requests must + * not depend on events in other AioContexts. In that case, use + * bdrv_drain_all() instead. */ void bdrv_drain(BlockDriverState *bs) { @@ -260,12 +260,6 @@ void bdrv_drain(BlockDriverState *bs) * * This function does not flush data to disk, use bdrv_flush_all() for that * after calling this function. - * - * Note that completion of an asynchronous I/O operation can trigger any - * number of other I/O operations on other devices---for example a coroutine - * can be arbitrarily complex and a constant flow of I/O can come until the - * coroutine is complete. Because of this, it is not possible to have a - * function to drain a single device's I/O queue. */ void bdrv_drain_all(void) { @@ -288,6 +282,12 @@ void bdrv_drain_all(void) } } + /* Note that completion of an asynchronous I/O operation can trigger any + * number of other I/O operations on other devices---for example a + * coroutine can submit an I/O request to another device in response to + * request completion. Therefore we must keep looping until there was no + * more activity rather than simply draining each device independently. + */ while (busy) { busy = false; diff --git a/block/mirror.c b/block/mirror.c index 8888cea952..d409337c4a 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -708,6 +708,8 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); if (!s->dirty_bitmap) { + g_free(s->replaces); + block_job_release(bs); return; } bdrv_set_enable_write_cache(s->target, true); diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c index ed92a098c4..53b8afc3d3 100644 --- a/block/qcow2-cache.c +++ b/block/qcow2-cache.c @@ -281,9 +281,6 @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c, i = min_lru_index; trace_qcow2_cache_get_replace_entry(qemu_coroutine_self(), c == s->l2_table_cache, i); - if (i < 0) { - return i; - } ret = qcow2_cache_entry_flush(bs, c, i); if (ret < 0) { diff --git a/block/raw-posix.c b/block/raw-posix.c index cbe6574bf4..855febed52 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -2430,7 +2430,8 @@ static int floppy_probe_device(const char *filename) struct stat st; if (strstart(filename, "/dev/fd", NULL) && - !strstart(filename, "/dev/fdset/", NULL)) { + !strstart(filename, "/dev/fdset/", NULL) && + !strstart(filename, "/dev/fd/", NULL)) { prio = 50; } diff --git a/block/snapshot.c b/block/snapshot.c index 19395ae014..49e143e991 100644 --- a/block/snapshot.c +++ b/block/snapshot.c @@ -239,7 +239,7 @@ int bdrv_snapshot_delete(BlockDriverState *bs, } /* drain all pending i/o before deleting snapshot */ - bdrv_drain_all(); + bdrv_drain(bs); if (drv->bdrv_snapshot_delete) { return drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp); diff --git a/blockjob.c b/blockjob.c index ec46fad2f1..62bb906634 100644 --- a/blockjob.c +++ b/blockjob.c @@ -66,10 +66,7 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs, block_job_set_speed(job, speed, &local_err); if (local_err) { - bs->job = NULL; - bdrv_op_unblock_all(bs, job->blocker); - error_free(job->blocker); - g_free(job); + block_job_release(bs); error_propagate(errp, local_err); return NULL; } @@ -77,18 +74,25 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs, return job; } -void block_job_completed(BlockJob *job, int ret) +void block_job_release(BlockDriverState *bs) { - BlockDriverState *bs = job->bs; + BlockJob *job = bs->job; - assert(bs->job == job); - job->cb(job->opaque, ret); bs->job = NULL; bdrv_op_unblock_all(bs, job->blocker); error_free(job->blocker); g_free(job); } +void block_job_completed(BlockJob *job, int ret) +{ + BlockDriverState *bs = job->bs; + + assert(bs->job == job); + job->cb(job->opaque, ret); + block_job_release(bs); +} + void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp) { Error *local_err = NULL; @@ -315,6 +315,7 @@ snappy="" bzip2="" guest_agent="" guest_agent_with_vss="no" +guest_agent_ntddscsi="no" guest_agent_msi="" vss_win32_sdk="" win_sdk="no" @@ -732,7 +733,7 @@ if test "$mingw32" = "yes" ; then sysconfdir="\${prefix}" local_statedir= confsuffix="" - libs_qga="-lws2_32 -lwinmm -lpowrprof $libs_qga" + libs_qga="-lws2_32 -lwinmm -lpowrprof -liphlpapi $libs_qga" fi werror="" @@ -3820,6 +3821,26 @@ if test "$mingw32" = "yes" -a "$guest_agent" != "no" -a "$guest_agent_with_vss" fi ########################################## +# check if mingw environment provides a recent ntddscsi.h +if test "$mingw32" = "yes" -a "$guest_agent" != "no"; then + cat > $TMPC << EOF +#include <windows.h> +#include <ntddscsi.h> +int main(void) { +#if !defined(IOCTL_SCSI_GET_ADDRESS) +#error Missing required ioctl definitions +#endif + SCSI_ADDRESS addr = { .Lun = 0, .TargetId = 0, .PathId = 0 }; + return addr.Lun; +} +EOF + if compile_prog "" "" ; then + guest_agent_ntddscsi=yes + libs_qga="-lsetupapi $libs_qga" + fi +fi + +########################################## # Guest agent Window MSI package if test "$guest_agent" != yes; then @@ -4489,6 +4510,7 @@ echo "libiscsi support $libiscsi" echo "libnfs support $libnfs" echo "build guest agent $guest_agent" echo "QGA VSS support $guest_agent_with_vss" +echo "QGA w32 disk info $guest_agent_ntddscsi" echo "seccomp support $seccomp" echo "coroutine backend $coroutine" echo "coroutine pool $coroutine_pool" @@ -4566,6 +4588,9 @@ if test "$mingw32" = "yes" ; then echo "CONFIG_QGA_VSS=y" >> $config_host_mak echo "WIN_SDK=\"$win_sdk\"" >> $config_host_mak fi + if test "$guest_agent_ntddscsi" = "yes" ; then + echo "CONFIG_QGA_NTDDDISK=y" >> $config_host_mak + fi if test "$guest_agent_msi" != "no"; then echo "QEMU_GA_MSI_ENABLED=yes" >> $config_host_mak echo "QEMU_GA_MSI_MINGW_DLL_PATH=${QEMU_GA_MSI_MINGW_DLL_PATH}" >> $config_host_mak diff --git a/docs/qmp/qmp-events.txt b/docs/qmp/qmp-events.txt index 4c13d48726..d92cc4833b 100644 --- a/docs/qmp/qmp-events.txt +++ b/docs/qmp/qmp-events.txt @@ -473,6 +473,20 @@ Example: { "timestamp": {"seconds": 1290688046, "microseconds": 417172}, "event": "SPICE_MIGRATE_COMPLETED" } +MIGRATION +--------- + +Emitted when a migration event happens + +Data: None. + + - "status": migration status + See MigrationStatus in ~/qapi-schema.json for possible values + +Example: + +{"timestamp": {"seconds": 1432121972, "microseconds": 744001}, + "event": "MIGRATION", "data": {"status": "completed"}} STOP ---- diff --git a/docs/specs/ppc-spapr-hotplug.txt b/docs/specs/ppc-spapr-hotplug.txt index d35771cc2b..46e07196bb 100644 --- a/docs/specs/ppc-spapr-hotplug.txt +++ b/docs/specs/ppc-spapr-hotplug.txt @@ -284,4 +284,22 @@ struct rtas_event_log_v6_hp { } drc; } QEMU_PACKED; +== ibm,lrdr-capacity == + +ibm,lrdr-capacity is a property in the /rtas device tree node that identifies +the dynamic reconfiguration capabilities of the guest. It consists of a triple +consisting of <phys>, <size> and <maxcpus>. + + <phys>, encoded in BE format represents the maximum address in bytes and + hence the maximum memory that can be allocated to the guest. + + <size>, encoded in BE format represents the size increments in which + memory can be hot-plugged to the guest. + + <maxcpus>, a BE-encoded integer, represents the maximum number of + processors that the guest can have. + +pseries guests use this property to note the maximum allowed CPUs for the +guest. + [1] http://thread.gmane.org/gmane.linux.ports.ppc.embedded/75350/focus=106867 diff --git a/docs/specs/rocker.txt b/docs/specs/rocker.txt index 0af5c61585..1c743515c1 100644 --- a/docs/specs/rocker.txt +++ b/docs/specs/rocker.txt @@ -637,6 +637,7 @@ The TLVs for Rx descriptor buffer are: (1 << 5): TCP packet (1 << 6): UDP packet (1 << 7): TCP/UDP csum good + (1 << 8): Offload forward RX_CSUM 2 IP calculated checksum: IPv4: IP payload csum IPv6: header and payload csum @@ -645,6 +646,9 @@ The TLVs for Rx descriptor buffer are: RX_FRAG_MAX_LEN 2 Packet maximum fragment length RX_FRAG_LEN 2 Actual packet fragment length after receive +Offload forward RX_FLAG indicates the device has already forwarded the packet +so the host CPU should not also forward the packet. + Possible status return codes in descriptor on completion are: DESC_COMP_ERR reason @@ -1414,6 +1414,11 @@ static ram_addr_t ram_block_add(RAMBlock *new_block, Error **errp) } } + new_ram_size = MAX(old_ram_size, + (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS); + if (new_ram_size > old_ram_size) { + migration_bitmap_extend(old_ram_size, new_ram_size); + } /* Keep the list sorted from biggest to smallest block. Unlike QTAILQ, * QLIST (which has an RCU-friendly variant) does not have insertion at * tail, so save the last element in last_block. diff --git a/hw/char/spapr_vty.c b/hw/char/spapr_vty.c index 4e464bd15a..36b328b9af 100644 --- a/hw/char/spapr_vty.c +++ b/hw/char/spapr_vty.c @@ -74,7 +74,7 @@ static void spapr_vty_realize(VIOsPAPRDevice *sdev, Error **errp) } /* Forward declaration */ -static target_ulong h_put_term_char(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_put_term_char(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { target_ulong reg = args[0]; @@ -101,7 +101,7 @@ static target_ulong h_put_term_char(PowerPCCPU *cpu, sPAPREnvironment *spapr, return H_SUCCESS; } -static target_ulong h_get_term_char(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_get_term_char(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { target_ulong reg = args[0]; @@ -193,7 +193,7 @@ VIOsPAPRDevice *spapr_vty_get_default(VIOsPAPRBus *bus) DeviceState *iter = kid->child; /* Only look at VTY devices */ - if (!object_dynamic_cast(OBJECT(iter), "spapr-vty")) { + if (!object_dynamic_cast(OBJECT(iter), TYPE_VIO_SPAPR_VTY_DEVICE)) { continue; } @@ -214,7 +214,7 @@ VIOsPAPRDevice *spapr_vty_get_default(VIOsPAPRBus *bus) return selected; } -VIOsPAPRDevice *vty_lookup(sPAPREnvironment *spapr, target_ulong reg) +VIOsPAPRDevice *vty_lookup(sPAPRMachineState *spapr, target_ulong reg) { VIOsPAPRDevice *sdev; @@ -228,6 +228,10 @@ VIOsPAPRDevice *vty_lookup(sPAPREnvironment *spapr, target_ulong reg) return spapr_vty_get_default(spapr->vio_bus); } + if (!object_dynamic_cast(OBJECT(sdev), TYPE_VIO_SPAPR_VTY_DEVICE)) { + return NULL; + } + return sdev; } diff --git a/hw/core/sysbus.c b/hw/core/sysbus.c index 278a2d1bdd..3c58629894 100644 --- a/hw/core/sysbus.c +++ b/hw/core/sysbus.c @@ -109,7 +109,13 @@ qemu_irq sysbus_get_connected_irq(SysBusDevice *dev, int n) void sysbus_connect_irq(SysBusDevice *dev, int n, qemu_irq irq) { + SysBusDeviceClass *sbd = SYS_BUS_DEVICE_GET_CLASS(dev); + qdev_connect_gpio_out_named(DEVICE(dev), SYSBUS_DEVICE_GPIO_IRQ, n, irq); + + if (sbd->connect_irq_notifier) { + sbd->connect_irq_notifier(dev, irq); + } } /* Check whether an MMIO region exists */ diff --git a/hw/display/virtio-gpu-pci.c b/hw/display/virtio-gpu-pci.c index f0f25c7bc9..5bc62cf344 100644 --- a/hw/display/virtio-gpu-pci.c +++ b/hw/display/virtio-gpu-pci.c @@ -17,7 +17,6 @@ #include "hw/virtio/virtio-gpu.h" static Property virtio_gpu_pci_properties[] = { - DEFINE_VIRTIO_GPU_PROPERTIES(VirtIOGPUPCI, vdev.conf), DEFINE_VIRTIO_GPU_PCI_PROPERTIES(VirtIOPCIProxy), DEFINE_PROP_END_OF_LIST(), }; @@ -25,13 +24,21 @@ static Property virtio_gpu_pci_properties[] = { static void virtio_gpu_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) { VirtIOGPUPCI *vgpu = VIRTIO_GPU_PCI(vpci_dev); + VirtIOGPU *g = &vgpu->vdev; DeviceState *vdev = DEVICE(&vgpu->vdev); + int i; qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus)); /* force virtio-1.0 */ vpci_dev->flags &= ~VIRTIO_PCI_FLAG_DISABLE_MODERN; vpci_dev->flags |= VIRTIO_PCI_FLAG_DISABLE_LEGACY; object_property_set_bool(OBJECT(vdev), true, "realized", errp); + + for (i = 0; i < g->conf.max_outputs; i++) { + object_property_set_link(OBJECT(g->scanout[i].con), + OBJECT(vpci_dev), + "device", errp); + } } static void virtio_gpu_pci_class_init(ObjectClass *klass, void *data) @@ -49,8 +56,9 @@ static void virtio_gpu_pci_class_init(ObjectClass *klass, void *data) static void virtio_gpu_initfn(Object *obj) { VirtIOGPUPCI *dev = VIRTIO_GPU_PCI(obj); - object_initialize(&dev->vdev, sizeof(dev->vdev), TYPE_VIRTIO_GPU); - object_property_add_child(obj, "virtio-backend", OBJECT(&dev->vdev), NULL); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VIRTIO_GPU); } static const TypeInfo virtio_gpu_pci_info = { diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c index 8c109b79f4..990a26b850 100644 --- a/hw/display/virtio-gpu.c +++ b/hw/display/virtio-gpu.c @@ -871,7 +871,7 @@ static void virtio_gpu_reset(VirtIODevice *vdev) } static Property virtio_gpu_properties[] = { - DEFINE_VIRTIO_GPU_PROPERTIES(VirtIOGPU, conf), + DEFINE_PROP_UINT32("max_outputs", VirtIOGPU, conf.max_outputs, 1), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/display/virtio-vga.c b/hw/display/virtio-vga.c index 94f9d0eb5a..f7e539fe93 100644 --- a/hw/display/virtio-vga.c +++ b/hw/display/virtio-vga.c @@ -79,6 +79,7 @@ static void virtio_vga_realize(VirtIOPCIProxy *vpci_dev, Error **errp) VirtIOGPU *g = &vvga->vdev; VGACommonState *vga = &vvga->vga; uint32_t offset; + int i; /* init vga compat bits */ vga->vram_size_mb = 8; @@ -120,6 +121,12 @@ static void virtio_vga_realize(VirtIOPCIProxy *vpci_dev, Error **errp) vga->con = g->scanout[0].con; graphic_console_set_hwops(vga->con, &virtio_vga_ops, vvga); + + for (i = 0; i < g->conf.max_outputs; i++) { + object_property_set_link(OBJECT(g->scanout[i].con), + OBJECT(vpci_dev), + "device", errp); + } } static void virtio_vga_reset(DeviceState *dev) @@ -131,7 +138,6 @@ static void virtio_vga_reset(DeviceState *dev) } static Property virtio_vga_properties[] = { - DEFINE_VIRTIO_GPU_PROPERTIES(VirtIOVGA, vdev.conf), DEFINE_VIRTIO_GPU_PCI_PROPERTIES(VirtIOPCIProxy), DEFINE_PROP_END_OF_LIST(), }; @@ -155,8 +161,9 @@ static void virtio_vga_class_init(ObjectClass *klass, void *data) static void virtio_vga_inst_initfn(Object *obj) { VirtIOVGA *dev = VIRTIO_VGA(obj); - object_initialize(&dev->vdev, sizeof(dev->vdev), TYPE_VIRTIO_GPU); - object_property_add_child(obj, "virtio-backend", OBJECT(&dev->vdev), NULL); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VIRTIO_GPU); } static TypeInfo virtio_vga_info = { diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 0f99fdc3cc..8167b122f0 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -311,6 +311,8 @@ static void pc_compat_2_3(MachineState *machine) if (kvm_enabled()) { pcms->smm = ON_OFF_AUTO_OFF; } + global_state_set_optional(); + savevm_skip_configuration(); } static void pc_compat_2_2(MachineState *machine) diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index e9cc96a758..974aead5a9 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -294,6 +294,8 @@ static void pc_compat_2_3(MachineState *machine) if (kvm_enabled()) { pcms->smm = ON_OFF_AUTO_OFF; } + global_state_set_optional(); + savevm_skip_configuration(); } static void pc_compat_2_2(MachineState *machine) diff --git a/hw/intc/arm_gic_kvm.c b/hw/intc/arm_gic_kvm.c index 2cb7d255d2..f56bff1afb 100644 --- a/hw/intc/arm_gic_kvm.c +++ b/hw/intc/arm_gic_kvm.c @@ -570,6 +570,12 @@ static void kvm_arm_gic_realize(DeviceState *dev, Error **errp) */ i += (GIC_INTERNAL * s->num_cpu); qdev_init_gpio_in(dev, kvm_arm_gic_set_irq, i); + + for (i = 0; i < s->num_irq - GIC_INTERNAL; i++) { + qemu_irq irq = qdev_get_gpio_in(dev, i); + kvm_irqchip_set_qemuirq_gsi(kvm_state, irq, i); + } + /* We never use our outbound IRQ/FIQ lines but provide them so that * we maintain the same interface as the non-KVM GIC. */ diff --git a/hw/intc/xics.c b/hw/intc/xics.c index 0fd2a84c7b..924b1ae3c9 100644 --- a/hw/intc/xics.c +++ b/hw/intc/xics.c @@ -806,7 +806,7 @@ void xics_free(XICSState *icp, int irq, int num) * Guest interfaces */ -static target_ulong h_cppr(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_cppr(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { CPUState *cs = CPU(cpu); @@ -816,7 +816,7 @@ static target_ulong h_cppr(PowerPCCPU *cpu, sPAPREnvironment *spapr, return H_SUCCESS; } -static target_ulong h_ipi(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_ipi(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { target_ulong server = get_cpu_index_by_dt_id(args[0]); @@ -830,7 +830,7 @@ static target_ulong h_ipi(PowerPCCPU *cpu, sPAPREnvironment *spapr, return H_SUCCESS; } -static target_ulong h_xirr(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_xirr(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { CPUState *cs = CPU(cpu); @@ -840,7 +840,7 @@ static target_ulong h_xirr(PowerPCCPU *cpu, sPAPREnvironment *spapr, return H_SUCCESS; } -static target_ulong h_xirr_x(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_xirr_x(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { CPUState *cs = CPU(cpu); @@ -852,7 +852,7 @@ static target_ulong h_xirr_x(PowerPCCPU *cpu, sPAPREnvironment *spapr, return H_SUCCESS; } -static target_ulong h_eoi(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_eoi(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { CPUState *cs = CPU(cpu); @@ -862,7 +862,7 @@ static target_ulong h_eoi(PowerPCCPU *cpu, sPAPREnvironment *spapr, return H_SUCCESS; } -static target_ulong h_ipoll(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_ipoll(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { CPUState *cs = CPU(cpu); @@ -874,7 +874,7 @@ static target_ulong h_ipoll(PowerPCCPU *cpu, sPAPREnvironment *spapr, return H_SUCCESS; } -static void rtas_set_xive(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_set_xive(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -902,7 +902,7 @@ static void rtas_set_xive(PowerPCCPU *cpu, sPAPREnvironment *spapr, rtas_st(rets, 0, RTAS_OUT_SUCCESS); } -static void rtas_get_xive(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_get_xive(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -927,7 +927,7 @@ static void rtas_get_xive(PowerPCCPU *cpu, sPAPREnvironment *spapr, rtas_st(rets, 2, ics->irqs[nr - ics->offset].priority); } -static void rtas_int_off(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_int_off(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -953,7 +953,7 @@ static void rtas_int_off(PowerPCCPU *cpu, sPAPREnvironment *spapr, rtas_st(rets, 0, RTAS_OUT_SUCCESS); } -static void rtas_int_on(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_int_on(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) diff --git a/hw/intc/xics_kvm.c b/hw/intc/xics_kvm.c index c15453f26f..d58729cfae 100644 --- a/hw/intc/xics_kvm.c +++ b/hw/intc/xics_kvm.c @@ -331,6 +331,15 @@ static void xics_kvm_cpu_setup(XICSState *icp, PowerPCCPU *cpu) abort(); } + /* + * If we are reusing a parked vCPU fd corresponding to the CPU + * which was hot-removed earlier we don't have to renable + * KVM_CAP_IRQ_XICS capability again. + */ + if (ss->cap_irq_xics_enabled) { + return; + } + if (icpkvm->kernel_xics_fd != -1) { int ret; @@ -343,6 +352,7 @@ static void xics_kvm_cpu_setup(XICSState *icp, PowerPCCPU *cpu) kvm_arch_vcpu_id(cs), strerror(errno)); exit(1); } + ss->cap_irq_xics_enabled = true; } } @@ -368,7 +378,7 @@ static void xics_kvm_set_nr_servers(XICSState *icp, uint32_t nr_servers, } } -static void rtas_dummy(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_dummy(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) diff --git a/hw/misc/macio/macio.c b/hw/misc/macio/macio.c index e9037b0c39..e3c0242d41 100644 --- a/hw/misc/macio/macio.c +++ b/hw/misc/macio/macio.c @@ -132,8 +132,6 @@ static void macio_common_realize(PCIDevice *d, Error **errp) SysBusDevice *sysbus_dev; Error *err = NULL; - d->config[0x3d] = 0x01; // interrupt on pin 1 - object_property_set_bool(OBJECT(&s->cuda), true, "realized", &err); if (err) { error_propagate(errp, err); diff --git a/hw/net/e1000.c b/hw/net/e1000.c index bab8e2abfb..5c6bcd0014 100644 --- a/hw/net/e1000.c +++ b/hw/net/e1000.c @@ -185,6 +185,9 @@ e1000_link_up(E1000State *s) { s->mac_reg[STATUS] |= E1000_STATUS_LU; s->phy_reg[PHY_STATUS] |= MII_SR_LINK_STATUS; + + /* E1000_STATUS_LU is tested by e1000_can_receive() */ + qemu_flush_queued_packets(qemu_get_queue(s->nic)); } static bool diff --git a/hw/net/rocker/rocker.c b/hw/net/rocker/rocker.c index 4d25842509..47d080fd33 100644 --- a/hw/net/rocker/rocker.c +++ b/hw/net/rocker/rocker.c @@ -96,7 +96,7 @@ World *rocker_get_world(Rocker *r, enum rocker_world_type type) RockerSwitch *qmp_query_rocker(const char *name, Error **errp) { - RockerSwitch *rocker = g_malloc0(sizeof(*rocker)); + RockerSwitch *rocker; Rocker *r; r = rocker_find(name); @@ -106,6 +106,7 @@ RockerSwitch *qmp_query_rocker(const char *name, Error **errp) return NULL; } + rocker = g_new0(RockerSwitch, 1); rocker->name = g_strdup(r->name); rocker->id = r->switch_id; rocker->ports = r->fp_ports; @@ -192,11 +193,13 @@ static int tx_consume(Rocker *r, DescInfo *info) if (!tlvs[ROCKER_TLV_TX_L3_CSUM_OFF]) { return -ROCKER_EINVAL; } + break; case ROCKER_TX_OFFLOAD_TSO: if (!tlvs[ROCKER_TLV_TX_TSO_MSS] || !tlvs[ROCKER_TLV_TX_TSO_HDR_LEN]) { return -ROCKER_EINVAL; } + break; } if (tlvs[ROCKER_TLV_TX_L3_CSUM_OFF]) { @@ -600,7 +603,7 @@ static DescRing *rocker_get_rx_ring_by_pport(Rocker *r, } int rx_produce(World *world, uint32_t pport, - const struct iovec *iov, int iovcnt) + const struct iovec *iov, int iovcnt, uint8_t copy_to_cpu) { Rocker *r = world_rocker(world); PCIDevice *dev = (PCIDevice *)r; @@ -643,6 +646,10 @@ int rx_produce(World *world, uint32_t pport, goto out; } + if (copy_to_cpu) { + rx_flags |= ROCKER_RX_FLAGS_FWD_OFFLOAD; + } + /* XXX calc rx flags/csum */ tlv_size = rocker_tlv_total_size(sizeof(uint16_t)) + /* flags */ diff --git a/hw/net/rocker/rocker.h b/hw/net/rocker/rocker.h index b3310b61eb..f9c80f8013 100644 --- a/hw/net/rocker/rocker.h +++ b/hw/net/rocker/rocker.h @@ -77,7 +77,7 @@ int rocker_event_link_changed(Rocker *r, uint32_t pport, bool link_up); int rocker_event_mac_vlan_seen(Rocker *r, uint32_t pport, uint8_t *addr, uint16_t vlan_id); int rx_produce(World *world, uint32_t pport, - const struct iovec *iov, int iovcnt); + const struct iovec *iov, int iovcnt, uint8_t copy_to_cpu); int rocker_port_eg(Rocker *r, uint32_t pport, const struct iovec *iov, int iovcnt); diff --git a/hw/net/rocker/rocker_fp.c b/hw/net/rocker/rocker_fp.c index d8d934c396..c693ae5081 100644 --- a/hw/net/rocker/rocker_fp.c +++ b/hw/net/rocker/rocker_fp.c @@ -125,18 +125,21 @@ int fp_port_eg(FpPort *port, const struct iovec *iov, int iovcnt) return ROCKER_OK; } -static int fp_port_can_receive(NetClientState *nc) -{ - FpPort *port = qemu_get_nic_opaque(nc); - - return port->enabled; -} - static ssize_t fp_port_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt) { FpPort *port = qemu_get_nic_opaque(nc); + /* If the port is disabled, we want to drop this pkt + * now rather than queing it for later. We don't want + * any stale pkts getting into the device when the port + * transitions to enabled. + */ + + if (!port->enabled) { + return -1; + } + return world_ingress(port->world, port->pport, iov, iovcnt); } @@ -165,7 +168,6 @@ static void fp_port_set_link_status(NetClientState *nc) static NetClientInfo fp_port_info = { .type = NET_CLIENT_OPTIONS_KIND_NIC, .size = sizeof(NICState), - .can_receive = fp_port_can_receive, .receive = fp_port_receive, .receive_iov = fp_port_receive_iov, .cleanup = fp_port_cleanup, diff --git a/hw/net/rocker/rocker_hw.h b/hw/net/rocker/rocker_hw.h index fe639badd4..8c50830325 100644 --- a/hw/net/rocker/rocker_hw.h +++ b/hw/net/rocker/rocker_hw.h @@ -250,6 +250,7 @@ enum { #define ROCKER_RX_FLAGS_TCP (1 << 5) #define ROCKER_RX_FLAGS_UDP (1 << 6) #define ROCKER_RX_FLAGS_TCP_UDP_CSUM_GOOD (1 << 7) +#define ROCKER_RX_FLAGS_FWD_OFFLOAD (1 << 8) /* Tx msg */ enum { diff --git a/hw/net/rocker/rocker_of_dpa.c b/hw/net/rocker/rocker_of_dpa.c index b25a17d6d7..874fb01d69 100644 --- a/hw/net/rocker/rocker_of_dpa.c +++ b/hw/net/rocker/rocker_of_dpa.c @@ -825,6 +825,8 @@ static OfDpaGroup *of_dpa_group_alloc(uint32_t id) static void of_dpa_output_l2_interface(OfDpaFlowContext *fc, OfDpaGroup *group) { + uint8_t copy_to_cpu = fc->action_set.apply.copy_to_cpu; + if (group->l2_interface.pop_vlan) { of_dpa_flow_pkt_strip_vlan(fc); } @@ -837,7 +839,8 @@ static void of_dpa_output_l2_interface(OfDpaFlowContext *fc, */ if (group->l2_interface.out_pport == 0) { - rx_produce(fc->of_dpa->world, fc->in_pport, fc->iov, fc->iovcnt); + rx_produce(fc->of_dpa->world, fc->in_pport, fc->iov, fc->iovcnt, + copy_to_cpu); } else if (group->l2_interface.out_pport != fc->in_pport) { rocker_port_eg(world_rocker(fc->of_dpa->world), group->l2_interface.out_pport, @@ -2525,7 +2528,6 @@ static void of_dpa_group_fill(void *key, void *value, void *user_data) ngroup->has_set_vlan_id = true; ngroup->set_vlan_id = ntohs(group->l2_rewrite.vlan_id); } - break; if (memcmp(group->l2_rewrite.src_mac.a, zero_mac.a, ETH_ALEN)) { ngroup->has_set_eth_src = true; ngroup->set_eth_src = @@ -2536,6 +2538,7 @@ static void of_dpa_group_fill(void *key, void *value, void *user_data) ngroup->set_eth_dst = qemu_mac_strdup_printf(group->l2_rewrite.dst_mac.a); } + break; case ROCKER_OF_DPA_GROUP_TYPE_L2_FLOOD: case ROCKER_OF_DPA_GROUP_TYPE_L2_MCAST: ngroup->has_vlan_id = true; diff --git a/hw/net/rocker/rocker_world.c b/hw/net/rocker/rocker_world.c index b991e871d3..a6b18f1754 100644 --- a/hw/net/rocker/rocker_world.c +++ b/hw/net/rocker/rocker_world.c @@ -32,7 +32,7 @@ ssize_t world_ingress(World *world, uint32_t pport, return world->ops->ig(world, pport, iov, iovcnt); } - return iov_size(iov, iovcnt); + return -1; } int world_do_cmd(World *world, DescInfo *info, diff --git a/hw/net/spapr_llan.c b/hw/net/spapr_llan.c index 2dd5ec1117..1ca5e9ce6a 100644 --- a/hw/net/spapr_llan.c +++ b/hw/net/spapr_llan.c @@ -284,7 +284,7 @@ static int check_bd(VIOsPAPRVLANDevice *dev, vlan_bd_t bd, } static target_ulong h_register_logical_lan(PowerPCCPU *cpu, - sPAPREnvironment *spapr, + sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { @@ -349,7 +349,8 @@ static target_ulong h_register_logical_lan(PowerPCCPU *cpu, } -static target_ulong h_free_logical_lan(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_free_logical_lan(PowerPCCPU *cpu, + sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { target_ulong reg = args[0]; @@ -371,7 +372,7 @@ static target_ulong h_free_logical_lan(PowerPCCPU *cpu, sPAPREnvironment *spapr, } static target_ulong h_add_logical_lan_buffer(PowerPCCPU *cpu, - sPAPREnvironment *spapr, + sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { @@ -421,7 +422,8 @@ static target_ulong h_add_logical_lan_buffer(PowerPCCPU *cpu, return H_SUCCESS; } -static target_ulong h_send_logical_lan(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_send_logical_lan(PowerPCCPU *cpu, + sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { target_ulong reg = args[0]; @@ -490,7 +492,7 @@ static target_ulong h_send_logical_lan(PowerPCCPU *cpu, sPAPREnvironment *spapr, return H_SUCCESS; } -static target_ulong h_multicast_ctrl(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_multicast_ctrl(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { target_ulong reg = args[0]; diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c index 104a0f599b..706e0606a9 100644 --- a/hw/net/vmxnet3.c +++ b/hw/net/vmxnet3.c @@ -1879,6 +1879,12 @@ vmxnet3_receive(NetClientState *nc, const uint8_t *buf, size_t size) return -1; } + if (s->peer_has_vhdr) { + vmxnet_rx_pkt_set_vhdr(s->rx_pkt, (struct virtio_net_hdr *)buf); + buf += sizeof(struct virtio_net_hdr); + size -= sizeof(struct virtio_net_hdr); + } + /* Pad to minimum Ethernet frame length */ if (size < sizeof(min_buf)) { memcpy(min_buf, buf, size); @@ -1887,12 +1893,6 @@ vmxnet3_receive(NetClientState *nc, const uint8_t *buf, size_t size) size = sizeof(min_buf); } - if (s->peer_has_vhdr) { - vmxnet_rx_pkt_set_vhdr(s->rx_pkt, (struct virtio_net_hdr *)buf); - buf += sizeof(struct virtio_net_hdr); - size -= sizeof(struct virtio_net_hdr); - } - vmxnet_rx_pkt_set_packet_type(s->rx_pkt, get_eth_packet_type(PKT_GET_ETH_HDR(buf))); diff --git a/hw/nvram/spapr_nvram.c b/hw/nvram/spapr_nvram.c index 11332d14ea..fcaa77dd9a 100644 --- a/hw/nvram/spapr_nvram.c +++ b/hw/nvram/spapr_nvram.c @@ -45,7 +45,7 @@ typedef struct sPAPRNVRAM { #define DEFAULT_NVRAM_SIZE 65536 #define MAX_NVRAM_SIZE 1048576 -static void rtas_nvram_fetch(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_nvram_fetch(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -86,7 +86,7 @@ static void rtas_nvram_fetch(PowerPCCPU *cpu, sPAPREnvironment *spapr, rtas_st(rets, 1, len); } -static void rtas_nvram_store(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_nvram_store(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) diff --git a/hw/ppc/mac_newworld.c b/hw/ppc/mac_newworld.c index 0f3e34122a..77d5c819ef 100644 --- a/hw/ppc/mac_newworld.c +++ b/hw/ppc/mac_newworld.c @@ -145,7 +145,6 @@ static void ppc_core99_reset(void *opaque) static void ppc_core99_init(MachineState *machine) { ram_addr_t ram_size = machine->ram_size; - const char *cpu_model = machine->cpu_model; const char *kernel_filename = machine->kernel_filename; const char *kernel_cmdline = machine->kernel_cmdline; const char *initrd_filename = machine->initrd_filename; @@ -182,14 +181,15 @@ static void ppc_core99_init(MachineState *machine) linux_boot = (kernel_filename != NULL); /* init CPUs */ - if (cpu_model == NULL) + if (machine->cpu_model == NULL) { #ifdef TARGET_PPC64 - cpu_model = "970fx"; + machine->cpu_model = "970fx"; #else - cpu_model = "G4"; + machine->cpu_model = "G4"; #endif + } for (i = 0; i < smp_cpus; i++) { - cpu = cpu_ppc_init(cpu_model); + cpu = cpu_ppc_init(machine->cpu_model); if (cpu == NULL) { fprintf(stderr, "Unable to find PowerPC CPU definition\n"); exit(1); diff --git a/hw/ppc/mac_oldworld.c b/hw/ppc/mac_oldworld.c index 99879dd2d5..06fdbaf588 100644 --- a/hw/ppc/mac_oldworld.c +++ b/hw/ppc/mac_oldworld.c @@ -75,7 +75,6 @@ static void ppc_heathrow_reset(void *opaque) static void ppc_heathrow_init(MachineState *machine) { ram_addr_t ram_size = machine->ram_size; - const char *cpu_model = machine->cpu_model; const char *kernel_filename = machine->kernel_filename; const char *kernel_cmdline = machine->kernel_cmdline; const char *initrd_filename = machine->initrd_filename; @@ -107,10 +106,10 @@ static void ppc_heathrow_init(MachineState *machine) linux_boot = (kernel_filename != NULL); /* init CPUs */ - if (cpu_model == NULL) - cpu_model = "G3"; + if (machine->cpu_model == NULL) + machine->cpu_model = "G3"; for (i = 0; i < smp_cpus; i++) { - cpu = cpu_ppc_init(cpu_model); + cpu = cpu_ppc_init(machine->cpu_model); if (cpu == NULL) { fprintf(stderr, "Unable to find PowerPC CPU definition\n"); exit(1); diff --git a/hw/ppc/ppc440_bamboo.c b/hw/ppc/ppc440_bamboo.c index 778970aa9b..032fa803db 100644 --- a/hw/ppc/ppc440_bamboo.c +++ b/hw/ppc/ppc440_bamboo.c @@ -159,7 +159,6 @@ static void main_cpu_reset(void *opaque) static void bamboo_init(MachineState *machine) { ram_addr_t ram_size = machine->ram_size; - const char *cpu_model = machine->cpu_model; const char *kernel_filename = machine->kernel_filename; const char *kernel_cmdline = machine->kernel_cmdline; const char *initrd_filename = machine->initrd_filename; @@ -184,10 +183,10 @@ static void bamboo_init(MachineState *machine) int i; /* Setup CPU. */ - if (cpu_model == NULL) { - cpu_model = "440EP"; + if (machine->cpu_model == NULL) { + machine->cpu_model = "440EP"; } - cpu = cpu_ppc_init(cpu_model); + cpu = cpu_ppc_init(machine->cpu_model); if (cpu == NULL) { fprintf(stderr, "Unable to initialize CPU!\n"); exit(1); diff --git a/hw/ppc/prep.c b/hw/ppc/prep.c index 998ee2d16b..45b5f62d6e 100644 --- a/hw/ppc/prep.c +++ b/hw/ppc/prep.c @@ -506,7 +506,6 @@ static int PPC_NVRAM_set_params (Nvram *nvram, uint16_t NVRAM_size, static void ppc_prep_init(MachineState *machine) { ram_addr_t ram_size = machine->ram_size; - const char *cpu_model = machine->cpu_model; const char *kernel_filename = machine->kernel_filename; const char *kernel_cmdline = machine->kernel_cmdline; const char *initrd_filename = machine->initrd_filename; @@ -536,10 +535,10 @@ static void ppc_prep_init(MachineState *machine) linux_boot = (kernel_filename != NULL); /* init CPUs */ - if (cpu_model == NULL) - cpu_model = "602"; + if (machine->cpu_model == NULL) + machine->cpu_model = "602"; for (i = 0; i < smp_cpus; i++) { - cpu = cpu_ppc_init(cpu_model); + cpu = cpu_ppc_init(machine->cpu_model); if (cpu == NULL) { fprintf(stderr, "Unable to find PowerPC CPU definition\n"); exit(1); diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index f174e5a0f3..a6f19473cf 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -34,6 +34,7 @@ #include "sysemu/cpus.h" #include "sysemu/kvm.h" #include "kvm_ppc.h" +#include "migration/migration.h" #include "mmu-hash64.h" #include "qom/cpu.h" @@ -90,25 +91,6 @@ #define HTAB_SIZE(spapr) (1ULL << ((spapr)->htab_shift)) -typedef struct sPAPRMachineState sPAPRMachineState; - -#define TYPE_SPAPR_MACHINE "spapr-machine" -#define SPAPR_MACHINE(obj) \ - OBJECT_CHECK(sPAPRMachineState, (obj), TYPE_SPAPR_MACHINE) - -/** - * sPAPRMachineState: - */ -struct sPAPRMachineState { - /*< private >*/ - MachineState parent_obj; - - /*< public >*/ - char *kvm_type; -}; - -sPAPREnvironment *spapr; - static XICSState *try_create_xics(const char *type, int nr_servers, int nr_irqs, Error **errp) { @@ -184,7 +166,28 @@ static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu, return ret; } -static int spapr_fixup_cpu_dt(void *fdt, sPAPREnvironment *spapr) +static int spapr_fixup_cpu_numa_dt(void *fdt, int offset, CPUState *cs) +{ + int ret = 0; + PowerPCCPU *cpu = POWERPC_CPU(cs); + int index = ppc_get_vcpu_dt_id(cpu); + uint32_t associativity[] = {cpu_to_be32(0x5), + cpu_to_be32(0x0), + cpu_to_be32(0x0), + cpu_to_be32(0x0), + cpu_to_be32(cs->numa_node), + cpu_to_be32(index)}; + + /* Advertise NUMA via ibm,associativity */ + if (nb_numa_nodes > 1) { + ret = fdt_setprop(fdt, offset, "ibm,associativity", associativity, + sizeof(associativity)); + } + + return ret; +} + +static int spapr_fixup_cpu_dt(void *fdt, sPAPRMachineState *spapr) { int ret = 0, offset, cpus_offset; CPUState *cs; @@ -196,12 +199,6 @@ static int spapr_fixup_cpu_dt(void *fdt, sPAPREnvironment *spapr) PowerPCCPU *cpu = POWERPC_CPU(cs); DeviceClass *dc = DEVICE_GET_CLASS(cs); int index = ppc_get_vcpu_dt_id(cpu); - uint32_t associativity[] = {cpu_to_be32(0x5), - cpu_to_be32(0x0), - cpu_to_be32(0x0), - cpu_to_be32(0x0), - cpu_to_be32(cs->numa_node), - cpu_to_be32(index)}; if ((index % smt) != 0) { continue; @@ -225,20 +222,17 @@ static int spapr_fixup_cpu_dt(void *fdt, sPAPREnvironment *spapr) } } - if (nb_numa_nodes > 1) { - ret = fdt_setprop(fdt, offset, "ibm,associativity", associativity, - sizeof(associativity)); - if (ret < 0) { - return ret; - } - } - ret = fdt_setprop(fdt, offset, "ibm,pft-size", pft_size_prop, sizeof(pft_size_prop)); if (ret < 0) { return ret; } + ret = spapr_fixup_cpu_numa_dt(fdt, offset, cs); + if (ret < 0) { + return ret; + } + ret = spapr_fixup_cpu_smt_dt(fdt, offset, cpu, ppc_get_compat_smt_threads(cpu)); if (ret < 0) { @@ -284,15 +278,18 @@ static size_t create_page_sizes_prop(CPUPPCState *env, uint32_t *prop, static hwaddr spapr_node0_size(void) { + MachineState *machine = MACHINE(qdev_get_machine()); + if (nb_numa_nodes) { int i; for (i = 0; i < nb_numa_nodes; ++i) { if (numa_info[i].node_mem) { - return MIN(pow2floor(numa_info[i].node_mem), ram_size); + return MIN(pow2floor(numa_info[i].node_mem), + machine->ram_size); } } } - return ram_size; + return machine->ram_size; } #define _FDT(exp) \ @@ -318,18 +315,13 @@ static void *spapr_create_fdt_skel(hwaddr initrd_base, uint32_t epow_irq) { void *fdt; - CPUState *cs; uint32_t start_prop = cpu_to_be32(initrd_base); uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size); GString *hypertas = g_string_sized_new(256); GString *qemu_hypertas = g_string_sized_new(256); uint32_t refpoints[] = {cpu_to_be32(0x4), cpu_to_be32(0x4)}; - uint32_t interrupt_server_ranges_prop[] = {0, cpu_to_be32(smp_cpus)}; - int smt = kvmppc_smt_threads(); + uint32_t interrupt_server_ranges_prop[] = {0, cpu_to_be32(max_cpus)}; unsigned char vec5[] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x80}; - QemuOpts *opts = qemu_opts_find(qemu_find_opts("smp-opts"), NULL); - unsigned sockets = opts ? qemu_opt_get_number(opts, "sockets", 0) : 0; - uint32_t cpus_per_socket = sockets ? (smp_cpus / sockets) : 1; char *buf; add_str(hypertas, "hcall-pft"); @@ -415,107 +407,6 @@ static void *spapr_create_fdt_skel(hwaddr initrd_base, _FDT((fdt_end_node(fdt))); - /* cpus */ - _FDT((fdt_begin_node(fdt, "cpus"))); - - _FDT((fdt_property_cell(fdt, "#address-cells", 0x1))); - _FDT((fdt_property_cell(fdt, "#size-cells", 0x0))); - - CPU_FOREACH(cs) { - PowerPCCPU *cpu = POWERPC_CPU(cs); - CPUPPCState *env = &cpu->env; - DeviceClass *dc = DEVICE_GET_CLASS(cs); - PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs); - int index = ppc_get_vcpu_dt_id(cpu); - char *nodename; - uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40), - 0xffffffff, 0xffffffff}; - uint32_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq() : TIMEBASE_FREQ; - uint32_t cpufreq = kvm_enabled() ? kvmppc_get_clockfreq() : 1000000000; - uint32_t page_sizes_prop[64]; - size_t page_sizes_prop_size; - - if ((index % smt) != 0) { - continue; - } - - nodename = g_strdup_printf("%s@%x", dc->fw_name, index); - - _FDT((fdt_begin_node(fdt, nodename))); - - g_free(nodename); - - _FDT((fdt_property_cell(fdt, "reg", index))); - _FDT((fdt_property_string(fdt, "device_type", "cpu"))); - - _FDT((fdt_property_cell(fdt, "cpu-version", env->spr[SPR_PVR]))); - _FDT((fdt_property_cell(fdt, "d-cache-block-size", - env->dcache_line_size))); - _FDT((fdt_property_cell(fdt, "d-cache-line-size", - env->dcache_line_size))); - _FDT((fdt_property_cell(fdt, "i-cache-block-size", - env->icache_line_size))); - _FDT((fdt_property_cell(fdt, "i-cache-line-size", - env->icache_line_size))); - - if (pcc->l1_dcache_size) { - _FDT((fdt_property_cell(fdt, "d-cache-size", pcc->l1_dcache_size))); - } else { - fprintf(stderr, "Warning: Unknown L1 dcache size for cpu\n"); - } - if (pcc->l1_icache_size) { - _FDT((fdt_property_cell(fdt, "i-cache-size", pcc->l1_icache_size))); - } else { - fprintf(stderr, "Warning: Unknown L1 icache size for cpu\n"); - } - - _FDT((fdt_property_cell(fdt, "timebase-frequency", tbfreq))); - _FDT((fdt_property_cell(fdt, "clock-frequency", cpufreq))); - _FDT((fdt_property_cell(fdt, "ibm,slb-size", env->slb_nr))); - _FDT((fdt_property_string(fdt, "status", "okay"))); - _FDT((fdt_property(fdt, "64-bit", NULL, 0))); - - if (env->spr_cb[SPR_PURR].oea_read) { - _FDT((fdt_property(fdt, "ibm,purr", NULL, 0))); - } - - if (env->mmu_model & POWERPC_MMU_1TSEG) { - _FDT((fdt_property(fdt, "ibm,processor-segment-sizes", - segs, sizeof(segs)))); - } - - /* Advertise VMX/VSX (vector extensions) if available - * 0 / no property == no vector extensions - * 1 == VMX / Altivec available - * 2 == VSX available */ - if (env->insns_flags & PPC_ALTIVEC) { - uint32_t vmx = (env->insns_flags2 & PPC2_VSX) ? 2 : 1; - - _FDT((fdt_property_cell(fdt, "ibm,vmx", vmx))); - } - - /* Advertise DFP (Decimal Floating Point) if available - * 0 / no property == no DFP - * 1 == DFP available */ - if (env->insns_flags2 & PPC2_DFP) { - _FDT((fdt_property_cell(fdt, "ibm,dfp", 1))); - } - - page_sizes_prop_size = create_page_sizes_prop(env, page_sizes_prop, - sizeof(page_sizes_prop)); - if (page_sizes_prop_size) { - _FDT((fdt_property(fdt, "ibm,segment-page-sizes", - page_sizes_prop, page_sizes_prop_size))); - } - - _FDT((fdt_property_cell(fdt, "ibm,chip-id", - cs->cpu_index / cpus_per_socket))); - - _FDT((fdt_end_node(fdt))); - } - - _FDT((fdt_end_node(fdt))); - /* RTAS */ _FDT((fdt_begin_node(fdt, "rtas"))); @@ -604,7 +495,8 @@ static void *spapr_create_fdt_skel(hwaddr initrd_base, return fdt; } -int spapr_h_cas_compose_response(target_ulong addr, target_ulong size) +int spapr_h_cas_compose_response(sPAPRMachineState *spapr, + target_ulong addr, target_ulong size) { void *fdt, *fdt_skel; sPAPRDeviceTreeUpdateHeader hdr = { .version_id = 1 }; @@ -665,8 +557,9 @@ static void spapr_populate_memory_node(void *fdt, int nodeid, hwaddr start, sizeof(associativity)))); } -static int spapr_populate_memory(sPAPREnvironment *spapr, void *fdt) +static int spapr_populate_memory(sPAPRMachineState *spapr, void *fdt) { + MachineState *machine = MACHINE(spapr); hwaddr mem_start, node_size; int i, nb_nodes = nb_numa_nodes; NodeInfo *nodes = numa_info; @@ -675,7 +568,7 @@ static int spapr_populate_memory(sPAPREnvironment *spapr, void *fdt) /* No NUMA nodes, assume there is just one node with whole RAM */ if (!nb_numa_nodes) { nb_nodes = 1; - ramnode.node_mem = ram_size; + ramnode.node_mem = machine->ram_size; nodes = &ramnode; } @@ -683,12 +576,12 @@ static int spapr_populate_memory(sPAPREnvironment *spapr, void *fdt) if (!nodes[i].node_mem) { continue; } - if (mem_start >= ram_size) { + if (mem_start >= machine->ram_size) { node_size = 0; } else { node_size = nodes[i].node_mem; - if (node_size > ram_size - mem_start) { - node_size = ram_size - mem_start; + if (node_size > machine->ram_size - mem_start) { + node_size = machine->ram_size - mem_start; } } if (!mem_start) { @@ -714,7 +607,138 @@ static int spapr_populate_memory(sPAPREnvironment *spapr, void *fdt) return 0; } -static void spapr_finalize_fdt(sPAPREnvironment *spapr, +static void spapr_populate_cpu_dt(CPUState *cs, void *fdt, int offset, + sPAPRMachineState *spapr) +{ + PowerPCCPU *cpu = POWERPC_CPU(cs); + CPUPPCState *env = &cpu->env; + PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs); + int index = ppc_get_vcpu_dt_id(cpu); + uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40), + 0xffffffff, 0xffffffff}; + uint32_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq() : TIMEBASE_FREQ; + uint32_t cpufreq = kvm_enabled() ? kvmppc_get_clockfreq() : 1000000000; + uint32_t page_sizes_prop[64]; + size_t page_sizes_prop_size; + QemuOpts *opts = qemu_opts_find(qemu_find_opts("smp-opts"), NULL); + unsigned sockets = opts ? qemu_opt_get_number(opts, "sockets", 0) : 0; + uint32_t cpus_per_socket = sockets ? (smp_cpus / sockets) : 1; + uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)}; + + _FDT((fdt_setprop_cell(fdt, offset, "reg", index))); + _FDT((fdt_setprop_string(fdt, offset, "device_type", "cpu"))); + + _FDT((fdt_setprop_cell(fdt, offset, "cpu-version", env->spr[SPR_PVR]))); + _FDT((fdt_setprop_cell(fdt, offset, "d-cache-block-size", + env->dcache_line_size))); + _FDT((fdt_setprop_cell(fdt, offset, "d-cache-line-size", + env->dcache_line_size))); + _FDT((fdt_setprop_cell(fdt, offset, "i-cache-block-size", + env->icache_line_size))); + _FDT((fdt_setprop_cell(fdt, offset, "i-cache-line-size", + env->icache_line_size))); + + if (pcc->l1_dcache_size) { + _FDT((fdt_setprop_cell(fdt, offset, "d-cache-size", + pcc->l1_dcache_size))); + } else { + fprintf(stderr, "Warning: Unknown L1 dcache size for cpu\n"); + } + if (pcc->l1_icache_size) { + _FDT((fdt_setprop_cell(fdt, offset, "i-cache-size", + pcc->l1_icache_size))); + } else { + fprintf(stderr, "Warning: Unknown L1 icache size for cpu\n"); + } + + _FDT((fdt_setprop_cell(fdt, offset, "timebase-frequency", tbfreq))); + _FDT((fdt_setprop_cell(fdt, offset, "clock-frequency", cpufreq))); + _FDT((fdt_setprop_cell(fdt, offset, "ibm,slb-size", env->slb_nr))); + _FDT((fdt_setprop_string(fdt, offset, "status", "okay"))); + _FDT((fdt_setprop(fdt, offset, "64-bit", NULL, 0))); + + if (env->spr_cb[SPR_PURR].oea_read) { + _FDT((fdt_setprop(fdt, offset, "ibm,purr", NULL, 0))); + } + + if (env->mmu_model & POWERPC_MMU_1TSEG) { + _FDT((fdt_setprop(fdt, offset, "ibm,processor-segment-sizes", + segs, sizeof(segs)))); + } + + /* Advertise VMX/VSX (vector extensions) if available + * 0 / no property == no vector extensions + * 1 == VMX / Altivec available + * 2 == VSX available */ + if (env->insns_flags & PPC_ALTIVEC) { + uint32_t vmx = (env->insns_flags2 & PPC2_VSX) ? 2 : 1; + + _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", vmx))); + } + + /* Advertise DFP (Decimal Floating Point) if available + * 0 / no property == no DFP + * 1 == DFP available */ + if (env->insns_flags2 & PPC2_DFP) { + _FDT((fdt_setprop_cell(fdt, offset, "ibm,dfp", 1))); + } + + page_sizes_prop_size = create_page_sizes_prop(env, page_sizes_prop, + sizeof(page_sizes_prop)); + if (page_sizes_prop_size) { + _FDT((fdt_setprop(fdt, offset, "ibm,segment-page-sizes", + page_sizes_prop, page_sizes_prop_size))); + } + + _FDT((fdt_setprop_cell(fdt, offset, "ibm,chip-id", + cs->cpu_index / cpus_per_socket))); + + _FDT((fdt_setprop(fdt, offset, "ibm,pft-size", + pft_size_prop, sizeof(pft_size_prop)))); + + _FDT(spapr_fixup_cpu_numa_dt(fdt, offset, cs)); + + _FDT(spapr_fixup_cpu_smt_dt(fdt, offset, cpu, + ppc_get_compat_smt_threads(cpu))); +} + +static void spapr_populate_cpus_dt_node(void *fdt, sPAPRMachineState *spapr) +{ + CPUState *cs; + int cpus_offset; + char *nodename; + int smt = kvmppc_smt_threads(); + + cpus_offset = fdt_add_subnode(fdt, 0, "cpus"); + _FDT(cpus_offset); + _FDT((fdt_setprop_cell(fdt, cpus_offset, "#address-cells", 0x1))); + _FDT((fdt_setprop_cell(fdt, cpus_offset, "#size-cells", 0x0))); + + /* + * We walk the CPUs in reverse order to ensure that CPU DT nodes + * created by fdt_add_subnode() end up in the right order in FDT + * for the guest kernel the enumerate the CPUs correctly. + */ + CPU_FOREACH_REVERSE(cs) { + PowerPCCPU *cpu = POWERPC_CPU(cs); + int index = ppc_get_vcpu_dt_id(cpu); + DeviceClass *dc = DEVICE_GET_CLASS(cs); + int offset; + + if ((index % smt) != 0) { + continue; + } + + nodename = g_strdup_printf("%s@%x", dc->fw_name, index); + offset = fdt_add_subnode(fdt, cpus_offset, nodename); + g_free(nodename); + _FDT(offset); + spapr_populate_cpu_dt(cs, fdt, offset, spapr); + } + +} + +static void spapr_finalize_fdt(sPAPRMachineState *spapr, hwaddr fdt_addr, hwaddr rtas_addr, hwaddr rtas_size) @@ -759,11 +783,8 @@ static void spapr_finalize_fdt(sPAPREnvironment *spapr, fprintf(stderr, "Couldn't set up RTAS device tree properties\n"); } - /* Advertise NUMA via ibm,associativity */ - ret = spapr_fixup_cpu_dt(fdt, spapr); - if (ret < 0) { - fprintf(stderr, "Couldn't finalize CPU device tree properties\n"); - } + /* cpus */ + spapr_populate_cpus_dt_node(fdt, spapr); bootlist = get_boot_devices_list(&cb, true); if (cb && bootlist) { @@ -830,7 +851,7 @@ static void emulate_spapr_hypercall(PowerPCCPU *cpu) #define CLEAN_HPTE(_hpte) ((*(uint64_t *)(_hpte)) &= tswap64(~HPTE64_V_HPTE_DIRTY)) #define DIRTY_HPTE(_hpte) ((*(uint64_t *)(_hpte)) |= tswap64(HPTE64_V_HPTE_DIRTY)) -static void spapr_reset_htab(sPAPREnvironment *spapr) +static void spapr_reset_htab(sPAPRMachineState *spapr) { long shift; int index; @@ -892,7 +913,7 @@ static int find_unknown_sysbus_device(SysBusDevice *sbdev, void *opaque) * A guest reset will cause spapr->htab_fd to become stale if being used. * Reopen the file descriptor to make sure the whole HTAB is properly read. */ -static int spapr_check_htab_fd(sPAPREnvironment *spapr) +static int spapr_check_htab_fd(sPAPRMachineState *spapr) { int rc = 0; @@ -912,6 +933,7 @@ static int spapr_check_htab_fd(sPAPREnvironment *spapr) static void ppc_spapr_reset(void) { + sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); PowerPCCPU *first_ppc_cpu; uint32_t rtas_limit; @@ -945,12 +967,13 @@ static void ppc_spapr_reset(void) first_ppc_cpu->env.gpr[3] = spapr->fdt_addr; first_ppc_cpu->env.gpr[5] = 0; first_cpu->halted = 0; - first_ppc_cpu->env.nip = spapr->entry_point; + first_ppc_cpu->env.nip = SPAPR_ENTRY_POINT; } static void spapr_cpu_reset(void *opaque) { + sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); PowerPCCPU *cpu = opaque; CPUState *cs = CPU(cpu); CPUPPCState *env = &cpu->env; @@ -979,12 +1002,12 @@ static void spapr_cpu_reset(void *opaque) * We have 8 hpte per group, and each hpte is 16 bytes. * ie have 128 bytes per hpte entry. */ - env->htab_mask = (1ULL << ((spapr)->htab_shift - 7)) - 1; + env->htab_mask = (1ULL << (spapr->htab_shift - 7)) - 1; env->spr[SPR_SDR1] = (target_ulong)(uintptr_t)spapr->htab | (spapr->htab_shift - 18); } -static void spapr_create_nvram(sPAPREnvironment *spapr) +static void spapr_create_nvram(sPAPRMachineState *spapr) { DeviceState *dev = qdev_create(&spapr->vio_bus->bus, "spapr-nvram"); DriveInfo *dinfo = drive_get(IF_PFLASH, 0, 0); @@ -998,7 +1021,7 @@ static void spapr_create_nvram(sPAPREnvironment *spapr) spapr->nvram = (struct sPAPRNVRAM *)dev; } -static void spapr_rtc_create(sPAPREnvironment *spapr) +static void spapr_rtc_create(sPAPRMachineState *spapr) { DeviceState *dev = qdev_create(NULL, TYPE_SPAPR_RTC); @@ -1028,7 +1051,7 @@ static int spapr_vga_init(PCIBus *pci_bus) static int spapr_post_load(void *opaque, int version_id) { - sPAPREnvironment *spapr = (sPAPREnvironment *)opaque; + sPAPRMachineState *spapr = (sPAPRMachineState *)opaque; int err = 0; /* In earlier versions, there was no separate qdev for the PAPR @@ -1057,16 +1080,16 @@ static const VMStateDescription vmstate_spapr = { VMSTATE_UNUSED_BUFFER(version_before_3, 0, 4), /* RTC offset */ - VMSTATE_UINT64_TEST(rtc_offset, sPAPREnvironment, version_before_3), + VMSTATE_UINT64_TEST(rtc_offset, sPAPRMachineState, version_before_3), - VMSTATE_PPC_TIMEBASE_V(tb, sPAPREnvironment, 2), + VMSTATE_PPC_TIMEBASE_V(tb, sPAPRMachineState, 2), VMSTATE_END_OF_LIST() }, }; static int htab_save_setup(QEMUFile *f, void *opaque) { - sPAPREnvironment *spapr = opaque; + sPAPRMachineState *spapr = opaque; /* "Iteration" header */ qemu_put_be32(f, spapr->htab_shift); @@ -1090,7 +1113,7 @@ static int htab_save_setup(QEMUFile *f, void *opaque) return 0; } -static void htab_save_first_pass(QEMUFile *f, sPAPREnvironment *spapr, +static void htab_save_first_pass(QEMUFile *f, sPAPRMachineState *spapr, int64_t max_ns) { int htabslots = HTAB_SIZE(spapr) / HASH_PTE_SIZE_64; @@ -1140,7 +1163,7 @@ static void htab_save_first_pass(QEMUFile *f, sPAPREnvironment *spapr, spapr->htab_save_index = index; } -static int htab_save_later_pass(QEMUFile *f, sPAPREnvironment *spapr, +static int htab_save_later_pass(QEMUFile *f, sPAPRMachineState *spapr, int64_t max_ns) { bool final = max_ns < 0; @@ -1222,7 +1245,7 @@ static int htab_save_later_pass(QEMUFile *f, sPAPREnvironment *spapr, static int htab_save_iterate(QEMUFile *f, void *opaque) { - sPAPREnvironment *spapr = opaque; + sPAPRMachineState *spapr = opaque; int rc = 0; /* Iteration header */ @@ -1257,7 +1280,7 @@ static int htab_save_iterate(QEMUFile *f, void *opaque) static int htab_save_complete(QEMUFile *f, void *opaque) { - sPAPREnvironment *spapr = opaque; + sPAPRMachineState *spapr = opaque; /* Iteration header */ qemu_put_be32(f, 0); @@ -1292,7 +1315,7 @@ static int htab_save_complete(QEMUFile *f, void *opaque) static int htab_load(QEMUFile *f, void *opaque, int version_id) { - sPAPREnvironment *spapr = opaque; + sPAPRMachineState *spapr = opaque; uint32_t section_hdr; int fd = -1; @@ -1386,16 +1409,42 @@ static void spapr_boot_set(void *opaque, const char *boot_device, machine->boot_order = g_strdup(boot_device); } +static void spapr_cpu_init(sPAPRMachineState *spapr, PowerPCCPU *cpu) +{ + CPUPPCState *env = &cpu->env; + + /* Set time-base frequency to 512 MHz */ + cpu_ppc_tb_init(env, TIMEBASE_FREQ); + + /* PAPR always has exception vectors in RAM not ROM. To ensure this, + * MSR[IP] should never be set. + */ + env->msr_mask &= ~(1 << 6); + + /* Tell KVM that we're in PAPR mode */ + if (kvm_enabled()) { + kvmppc_set_papr(cpu); + } + + if (cpu->max_compat) { + if (ppc_set_compat(cpu, cpu->max_compat) < 0) { + exit(1); + } + } + + xics_cpu_setup(spapr->icp, cpu); + + qemu_register_reset(spapr_cpu_reset, cpu); +} + /* pSeries LPAR / sPAPR hardware init */ static void ppc_spapr_init(MachineState *machine) { - ram_addr_t ram_size = machine->ram_size; - const char *cpu_model = machine->cpu_model; + sPAPRMachineState *spapr = SPAPR_MACHINE(machine); const char *kernel_filename = machine->kernel_filename; const char *kernel_cmdline = machine->kernel_cmdline; const char *initrd_filename = machine->initrd_filename; PowerPCCPU *cpu; - CPUPPCState *env; PCIHostState *phb; int i; MemoryRegion *sysmem = get_system_memory(); @@ -1412,7 +1461,6 @@ static void ppc_spapr_init(MachineState *machine) msi_supported = true; - spapr = g_malloc0(sizeof(*spapr)); QLIST_INIT(&spapr->phbs); cpu_ppc_hypercall = emulate_spapr_hypercall; @@ -1459,7 +1507,7 @@ static void ppc_spapr_init(MachineState *machine) * more than needed for the Linux guests we support. */ spapr->htab_shift = 18; /* Minimum architected size */ while (spapr->htab_shift <= 46) { - if ((1ULL << (spapr->htab_shift + 7)) >= ram_size) { + if ((1ULL << (spapr->htab_shift + 7)) >= machine->ram_size) { break; } spapr->htab_shift++; @@ -1467,43 +1515,21 @@ static void ppc_spapr_init(MachineState *machine) /* Set up Interrupt Controller before we create the VCPUs */ spapr->icp = xics_system_init(machine, - smp_cpus * kvmppc_smt_threads() / smp_threads, + DIV_ROUND_UP(max_cpus * kvmppc_smt_threads(), + smp_threads), XICS_IRQS); /* init CPUs */ - if (cpu_model == NULL) { - cpu_model = kvm_enabled() ? "host" : "POWER7"; + if (machine->cpu_model == NULL) { + machine->cpu_model = kvm_enabled() ? "host" : "POWER7"; } for (i = 0; i < smp_cpus; i++) { - cpu = cpu_ppc_init(cpu_model); + cpu = cpu_ppc_init(machine->cpu_model); if (cpu == NULL) { fprintf(stderr, "Unable to find PowerPC CPU definition\n"); exit(1); } - env = &cpu->env; - - /* Set time-base frequency to 512 MHz */ - cpu_ppc_tb_init(env, TIMEBASE_FREQ); - - /* PAPR always has exception vectors in RAM not ROM. To ensure this, - * MSR[IP] should never be set. - */ - env->msr_mask &= ~(1 << 6); - - /* Tell KVM that we're in PAPR mode */ - if (kvm_enabled()) { - kvmppc_set_papr(cpu); - } - - if (cpu->max_compat) { - if (ppc_set_compat(cpu, cpu->max_compat) < 0) { - exit(1); - } - } - - xics_cpu_setup(spapr->icp, cpu); - - qemu_register_reset(spapr_cpu_reset, cpu); + spapr_cpu_init(spapr, cpu); } if (kvm_enabled()) { @@ -1512,9 +1538,8 @@ static void ppc_spapr_init(MachineState *machine) } /* allocate RAM */ - spapr->ram_limit = ram_size; memory_region_allocate_system_memory(ram, NULL, "ppc_spapr.ram", - spapr->ram_limit); + machine->ram_size); memory_region_add_subregion(sysmem, 0, ram); if (rma_alloc_size && rma) { @@ -1658,8 +1683,9 @@ static void ppc_spapr_init(MachineState *machine) } g_free(filename); - spapr->entry_point = 0x100; - + /* FIXME: Should register things through the MachineState's qdev + * interface, this is a legacy from the sPAPREnvironment structure + * which predated MachineState but had a similar function */ vmstate_register(NULL, 0, &vmstate_spapr, spapr); register_savevm_live(NULL, "spapr/htab", -1, 1, &savevm_htab_handlers, spapr); @@ -1755,17 +1781,17 @@ static char *spapr_get_fw_dev_path(FWPathProvider *p, BusState *bus, static char *spapr_get_kvm_type(Object *obj, Error **errp) { - sPAPRMachineState *sm = SPAPR_MACHINE(obj); + sPAPRMachineState *spapr = SPAPR_MACHINE(obj); - return g_strdup(sm->kvm_type); + return g_strdup(spapr->kvm_type); } static void spapr_set_kvm_type(Object *obj, const char *value, Error **errp) { - sPAPRMachineState *sm = SPAPR_MACHINE(obj); + sPAPRMachineState *spapr = SPAPR_MACHINE(obj); - g_free(sm->kvm_type); - sm->kvm_type = g_strdup(value); + g_free(spapr->kvm_type); + spapr->kvm_type = g_strdup(value); } static void spapr_machine_initfn(Object *obj) @@ -1820,6 +1846,7 @@ static const TypeInfo spapr_machine_info = { .abstract = true, .instance_size = sizeof(sPAPRMachineState), .instance_init = spapr_machine_initfn, + .class_size = sizeof(sPAPRMachineClass), .class_init = spapr_machine_class_init, .interfaces = (InterfaceInfo[]) { { TYPE_FW_PATH_PROVIDER }, @@ -1851,6 +1878,8 @@ static const TypeInfo spapr_machine_info = { static void spapr_compat_2_3(Object *obj) { + savevm_skip_section_footers(); + global_state_set_optional(); } static void spapr_compat_2_2(Object *obj) diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c index fda9e3590a..f626eb7b34 100644 --- a/hw/ppc/spapr_events.c +++ b/hw/ppc/spapr_events.c @@ -238,6 +238,7 @@ void spapr_events_fdt_skel(void *fdt, uint32_t check_exception_irq) static void rtas_event_log_queue(int log_type, void *data, bool exception) { + sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); sPAPREventLogEntry *entry = g_new(sPAPREventLogEntry, 1); g_assert(data); @@ -250,6 +251,7 @@ static void rtas_event_log_queue(int log_type, void *data, bool exception) static sPAPREventLogEntry *rtas_event_log_dequeue(uint32_t event_mask, bool exception) { + sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); sPAPREventLogEntry *entry = NULL; /* we only queue EPOW events atm. */ @@ -278,6 +280,7 @@ static sPAPREventLogEntry *rtas_event_log_dequeue(uint32_t event_mask, static bool rtas_event_log_contains(uint32_t event_mask, bool exception) { + sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); sPAPREventLogEntry *entry = NULL; /* we only queue EPOW events atm. */ @@ -314,6 +317,7 @@ static void spapr_init_v6hdr(struct rtas_event_log_v6 *v6hdr) static void spapr_init_maina(struct rtas_event_log_v6_maina *maina, int section_count) { + sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); struct tm tm; int year; @@ -336,7 +340,7 @@ static void spapr_init_maina(struct rtas_event_log_v6_maina *maina, static void spapr_powerdown_req(Notifier *n, void *opaque) { - sPAPREnvironment *spapr = container_of(n, sPAPREnvironment, epow_notifier); + sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); struct rtas_error_log *hdr; struct rtas_event_log_v6 *v6hdr; struct rtas_event_log_v6_maina *maina; @@ -384,6 +388,7 @@ static void spapr_powerdown_req(Notifier *n, void *opaque) static void spapr_hotplug_req_event(sPAPRDRConnector *drc, uint8_t hp_action) { + sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); struct hp_log_full *new_hp; struct rtas_error_log *hdr; struct rtas_event_log_v6 *v6hdr; @@ -453,7 +458,7 @@ void spapr_hotplug_req_remove_event(sPAPRDRConnector *drc) spapr_hotplug_req_event(drc, RTAS_LOG_V6_HP_ACTION_REMOVE); } -static void check_exception(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void check_exception(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -508,7 +513,7 @@ out_no_events: rtas_st(rets, 0, RTAS_OUT_NO_ERRORS_FOUND); } -static void event_scan(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void event_scan(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -548,7 +553,7 @@ out_no_events: rtas_st(rets, 0, RTAS_OUT_NO_ERRORS_FOUND); } -void spapr_events_init(sPAPREnvironment *spapr) +void spapr_events_init(sPAPRMachineState *spapr) { QTAILQ_INIT(&spapr->pending_events); spapr->check_exception_irq = xics_alloc(spapr->icp, 0, 0, false); diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c index 4f76f1cbfe..652ddf6e37 100644 --- a/hw/ppc/spapr_hcall.c +++ b/hw/ppc/spapr_hcall.c @@ -84,9 +84,10 @@ static inline bool valid_pte_index(CPUPPCState *env, target_ulong pte_index) return true; } -static target_ulong h_enter(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_enter(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { + MachineState *machine = MACHINE(spapr); CPUPPCState *env = &cpu->env; target_ulong flags = args[0]; target_ulong pte_index = args[1]; @@ -118,7 +119,7 @@ static target_ulong h_enter(PowerPCCPU *cpu, sPAPREnvironment *spapr, raddr = (ptel & HPTE64_R_RPN) & ~((1ULL << page_shift) - 1); - if (raddr < spapr->ram_limit) { + if (raddr < machine->ram_size) { /* Regular RAM - should have WIMG=0010 */ if ((ptel & HPTE64_R_WIMG) != HPTE64_R_M) { return H_PARAMETER; @@ -205,7 +206,7 @@ static RemoveResult remove_hpte(CPUPPCState *env, target_ulong ptex, return REMOVE_SUCCESS; } -static target_ulong h_remove(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_remove(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { CPUPPCState *env = &cpu->env; @@ -252,7 +253,7 @@ static target_ulong h_remove(PowerPCCPU *cpu, sPAPREnvironment *spapr, #define H_BULK_REMOVE_MAX_BATCH 4 -static target_ulong h_bulk_remove(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_bulk_remove(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { CPUPPCState *env = &cpu->env; @@ -299,7 +300,7 @@ static target_ulong h_bulk_remove(PowerPCCPU *cpu, sPAPREnvironment *spapr, return H_SUCCESS; } -static target_ulong h_protect(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_protect(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { CPUPPCState *env = &cpu->env; @@ -337,7 +338,7 @@ static target_ulong h_protect(PowerPCCPU *cpu, sPAPREnvironment *spapr, return H_SUCCESS; } -static target_ulong h_read(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_read(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { CPUPPCState *env = &cpu->env; @@ -367,7 +368,7 @@ static target_ulong h_read(PowerPCCPU *cpu, sPAPREnvironment *spapr, return H_SUCCESS; } -static target_ulong h_set_dabr(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_set_dabr(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { /* FIXME: actually implement this */ @@ -506,7 +507,7 @@ static target_ulong deregister_dtl(CPUPPCState *env, target_ulong addr) return H_SUCCESS; } -static target_ulong h_register_vpa(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_register_vpa(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { target_ulong flags = args[0]; @@ -551,7 +552,7 @@ static target_ulong h_register_vpa(PowerPCCPU *cpu, sPAPREnvironment *spapr, return ret; } -static target_ulong h_cede(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_cede(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { CPUPPCState *env = &cpu->env; @@ -567,7 +568,7 @@ static target_ulong h_cede(PowerPCCPU *cpu, sPAPREnvironment *spapr, return H_SUCCESS; } -static target_ulong h_rtas(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_rtas(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { target_ulong rtas_r3 = args[0]; @@ -579,7 +580,7 @@ static target_ulong h_rtas(PowerPCCPU *cpu, sPAPREnvironment *spapr, nret, rtas_r3 + 12 + 4*nargs); } -static target_ulong h_logical_load(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_logical_load(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { CPUState *cs = CPU(cpu); @@ -603,7 +604,7 @@ static target_ulong h_logical_load(PowerPCCPU *cpu, sPAPREnvironment *spapr, return H_PARAMETER; } -static target_ulong h_logical_store(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_logical_store(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { CPUState *cs = CPU(cpu); @@ -629,7 +630,7 @@ static target_ulong h_logical_store(PowerPCCPU *cpu, sPAPREnvironment *spapr, return H_PARAMETER; } -static target_ulong h_logical_memop(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_logical_memop(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { CPUState *cs = CPU(cpu); @@ -698,14 +699,14 @@ static target_ulong h_logical_memop(PowerPCCPU *cpu, sPAPREnvironment *spapr, return H_SUCCESS; } -static target_ulong h_logical_icbi(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_logical_icbi(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { /* Nothing to do on emulation, KVM will trap this in the kernel */ return H_SUCCESS; } -static target_ulong h_logical_dcbf(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_logical_dcbf(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { /* Nothing to do on emulation, KVM will trap this in the kernel */ @@ -788,7 +789,7 @@ static target_ulong h_set_mode_resource_addr_trans_mode(PowerPCCPU *cpu, return H_SUCCESS; } -static target_ulong h_set_mode(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_set_mode(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { target_ulong resource = args[1]; @@ -828,7 +829,7 @@ static void do_set_compat(void *arg) ((cpuver) == CPU_POWERPC_LOGICAL_2_07) ? 2070 : 0) static target_ulong h_client_architecture_support(PowerPCCPU *cpu_, - sPAPREnvironment *spapr, + sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { @@ -921,7 +922,7 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu_, return H_SUCCESS; } - if (spapr_h_cas_compose_response(args[1], args[2])) { + if (spapr_h_cas_compose_response(spapr, args[1], args[2])) { qemu_system_reset_request(); } @@ -952,6 +953,8 @@ void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn) target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode, target_ulong *args) { + sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); + if ((opcode <= MAX_HCALL_OPCODE) && ((opcode & 0x3) == 0)) { spapr_hcall_fn fn = papr_hypercall_table[opcode / 4]; diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c index 8cd9dba9ac..f61504e0c5 100644 --- a/hw/ppc/spapr_iommu.c +++ b/hw/ppc/spapr_iommu.c @@ -60,6 +60,20 @@ sPAPRTCETable *spapr_tce_find_by_liobn(target_ulong liobn) return NULL; } +static IOMMUAccessFlags spapr_tce_iommu_access_flags(uint64_t tce) +{ + switch (tce & SPAPR_TCE_RW) { + case SPAPR_TCE_FAULT: + return IOMMU_NONE; + case SPAPR_TCE_RO: + return IOMMU_RO; + case SPAPR_TCE_WO: + return IOMMU_WO; + default: /* SPAPR_TCE_RW */ + return IOMMU_RW; + } +} + /* Called from RCU critical section */ static IOMMUTLBEntry spapr_tce_translate_iommu(MemoryRegion *iommu, hwaddr addr, bool is_write) @@ -82,7 +96,7 @@ static IOMMUTLBEntry spapr_tce_translate_iommu(MemoryRegion *iommu, hwaddr addr, ret.iova = addr & page_mask; ret.translated_addr = tce & page_mask; ret.addr_mask = ~page_mask; - ret.perm = tce & IOMMU_RW; + ret.perm = spapr_tce_iommu_access_flags(tce); } trace_spapr_iommu_xlate(tcet->liobn, addr, ret.iova, ret.perm, ret.addr_mask); @@ -233,14 +247,14 @@ static target_ulong put_tce_emu(sPAPRTCETable *tcet, target_ulong ioba, entry.iova = ioba & page_mask; entry.translated_addr = tce & page_mask; entry.addr_mask = ~page_mask; - entry.perm = tce & IOMMU_RW; + entry.perm = spapr_tce_iommu_access_flags(tce); memory_region_notify_iommu(&tcet->iommu, entry); return H_SUCCESS; } static target_ulong h_put_tce_indirect(PowerPCCPU *cpu, - sPAPREnvironment *spapr, + sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { int i; @@ -267,9 +281,7 @@ static target_ulong h_put_tce_indirect(PowerPCCPU *cpu, ioba &= page_mask; for (i = 0; i < npages; ++i, ioba += page_size) { - target_ulong off = (tce_list & ~SPAPR_TCE_RW) + - i * sizeof(target_ulong); - tce = ldq_be_phys(cs->as, off); + tce = ldq_be_phys(cs->as, tce_list + i * sizeof(target_ulong)); ret = put_tce_emu(tcet, ioba, tce); if (ret) { @@ -287,7 +299,7 @@ static target_ulong h_put_tce_indirect(PowerPCCPU *cpu, return ret; } -static target_ulong h_stuff_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_stuff_tce(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { int i; @@ -326,7 +338,7 @@ static target_ulong h_stuff_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr, return ret; } -static target_ulong h_put_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_put_tce(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { target_ulong liobn = args[0]; @@ -367,7 +379,7 @@ static target_ulong get_tce_emu(sPAPRTCETable *tcet, target_ulong ioba, return H_SUCCESS; } -static target_ulong h_get_tce(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_get_tce(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { target_ulong liobn = args[0]; diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index d4a6150527..a8f79d8003 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -23,6 +23,7 @@ * THE SOFTWARE. */ #include "hw/hw.h" +#include "hw/sysbus.h" #include "hw/pci/pci.h" #include "hw/pci/msi.h" #include "hw/pci/msix.h" @@ -35,6 +36,7 @@ #include "qemu/error-report.h" #include "qapi/qmp/qerror.h" +#include "hw/pci/pci_bridge.h" #include "hw/pci/pci_bus.h" #include "hw/ppc/spapr_drc.h" #include "sysemu/device_tree.h" @@ -50,6 +52,8 @@ #define RTAS_TYPE_MSI 1 #define RTAS_TYPE_MSIX 2 +#define FDT_NAME_MAX 128 + #define _FDT(exp) \ do { \ int ret = (exp); \ @@ -58,7 +62,7 @@ } \ } while (0) -sPAPRPHBState *spapr_pci_find_phb(sPAPREnvironment *spapr, uint64_t buid) +sPAPRPHBState *spapr_pci_find_phb(sPAPRMachineState *spapr, uint64_t buid) { sPAPRPHBState *sphb; @@ -72,7 +76,7 @@ sPAPRPHBState *spapr_pci_find_phb(sPAPREnvironment *spapr, uint64_t buid) return NULL; } -PCIDevice *spapr_pci_find_dev(sPAPREnvironment *spapr, uint64_t buid, +PCIDevice *spapr_pci_find_dev(sPAPRMachineState *spapr, uint64_t buid, uint32_t config_addr) { sPAPRPHBState *sphb = spapr_pci_find_phb(spapr, buid); @@ -93,7 +97,7 @@ static uint32_t rtas_pci_cfgaddr(uint32_t arg) return ((arg >> 20) & 0xf00) | (arg & 0xff); } -static void finish_read_pci_config(sPAPREnvironment *spapr, uint64_t buid, +static void finish_read_pci_config(sPAPRMachineState *spapr, uint64_t buid, uint32_t addr, uint32_t size, target_ulong rets) { @@ -123,7 +127,7 @@ static void finish_read_pci_config(sPAPREnvironment *spapr, uint64_t buid, rtas_st(rets, 1, val); } -static void rtas_ibm_read_pci_config(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_ibm_read_pci_config(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -143,7 +147,7 @@ static void rtas_ibm_read_pci_config(PowerPCCPU *cpu, sPAPREnvironment *spapr, finish_read_pci_config(spapr, buid, addr, size, rets); } -static void rtas_read_pci_config(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_read_pci_config(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -161,7 +165,7 @@ static void rtas_read_pci_config(PowerPCCPU *cpu, sPAPREnvironment *spapr, finish_read_pci_config(spapr, 0, addr, size, rets); } -static void finish_write_pci_config(sPAPREnvironment *spapr, uint64_t buid, +static void finish_write_pci_config(sPAPRMachineState *spapr, uint64_t buid, uint32_t addr, uint32_t size, uint32_t val, target_ulong rets) { @@ -189,7 +193,7 @@ static void finish_write_pci_config(sPAPREnvironment *spapr, uint64_t buid, rtas_st(rets, 0, RTAS_OUT_SUCCESS); } -static void rtas_ibm_write_pci_config(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_ibm_write_pci_config(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -210,7 +214,7 @@ static void rtas_ibm_write_pci_config(PowerPCCPU *cpu, sPAPREnvironment *spapr, finish_write_pci_config(spapr, buid, addr, size, val, rets); } -static void rtas_write_pci_config(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_write_pci_config(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -259,7 +263,7 @@ static void spapr_msi_setmsg(PCIDevice *pdev, hwaddr addr, bool msix, } } -static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -377,7 +381,7 @@ out: } static void rtas_ibm_query_interrupt_source_number(PowerPCCPU *cpu, - sPAPREnvironment *spapr, + sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, @@ -418,13 +422,14 @@ static void rtas_ibm_query_interrupt_source_number(PowerPCCPU *cpu, } static void rtas_ibm_set_eeh_option(PowerPCCPU *cpu, - sPAPREnvironment *spapr, + sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) { sPAPRPHBState *sphb; sPAPRPHBClass *spc; + PCIDevice *pdev; uint32_t addr, option; uint64_t buid; int ret; @@ -442,6 +447,12 @@ static void rtas_ibm_set_eeh_option(PowerPCCPU *cpu, goto param_error_exit; } + pdev = pci_find_device(PCI_HOST_BRIDGE(sphb)->bus, + (addr >> 16) & 0xFF, (addr >> 8) & 0xFF); + if (!pdev || !object_dynamic_cast(OBJECT(pdev), "vfio-pci")) { + goto param_error_exit; + } + spc = SPAPR_PCI_HOST_BRIDGE_GET_CLASS(sphb); if (!spc->eeh_set_option) { goto param_error_exit; @@ -456,7 +467,7 @@ param_error_exit: } static void rtas_ibm_get_config_addr_info2(PowerPCCPU *cpu, - sPAPREnvironment *spapr, + sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -512,7 +523,7 @@ param_error_exit: } static void rtas_ibm_read_slot_reset_state2(PowerPCCPU *cpu, - sPAPREnvironment *spapr, + sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -556,7 +567,7 @@ param_error_exit: } static void rtas_ibm_set_slot_reset(PowerPCCPU *cpu, - sPAPREnvironment *spapr, + sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -592,7 +603,7 @@ param_error_exit: } static void rtas_ibm_configure_pe(PowerPCCPU *cpu, - sPAPREnvironment *spapr, + sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -627,7 +638,7 @@ param_error_exit: /* To support it later */ static void rtas_ibm_slot_error_detail(PowerPCCPU *cpu, - sPAPREnvironment *spapr, + sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -718,6 +729,7 @@ static PCIINTxRoute spapr_route_intx_pin_to_irq(void *opaque, int pin) static void spapr_msi_write(void *opaque, hwaddr addr, uint64_t data, unsigned size) { + sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); uint32_t irq = data; trace_spapr_pci_msi_write(addr, data, irq); @@ -742,6 +754,60 @@ static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn) return &phb->iommu_as; } +static char *spapr_phb_vfio_get_loc_code(sPAPRPHBState *sphb, PCIDevice *pdev) +{ + char *path = NULL, *buf = NULL, *host = NULL; + + /* Get the PCI VFIO host id */ + host = object_property_get_str(OBJECT(pdev), "host", NULL); + if (!host) { + goto err_out; + } + + /* Construct the path of the file that will give us the DT location */ + path = g_strdup_printf("/sys/bus/pci/devices/%s/devspec", host); + g_free(host); + if (!path || !g_file_get_contents(path, &buf, NULL, NULL)) { + goto err_out; + } + g_free(path); + + /* Construct and read from host device tree the loc-code */ + path = g_strdup_printf("/proc/device-tree%s/ibm,loc-code", buf); + g_free(buf); + if (!path || !g_file_get_contents(path, &buf, NULL, NULL)) { + goto err_out; + } + return buf; + +err_out: + g_free(path); + return NULL; +} + +static char *spapr_phb_get_loc_code(sPAPRPHBState *sphb, PCIDevice *pdev) +{ + char *buf; + const char *devtype = "qemu"; + uint32_t busnr = pci_bus_num(PCI_BUS(qdev_get_parent_bus(DEVICE(pdev)))); + + if (object_dynamic_cast(OBJECT(pdev), "vfio-pci")) { + buf = spapr_phb_vfio_get_loc_code(sphb, pdev); + if (buf) { + return buf; + } + devtype = "vfio"; + } + /* + * For emulated devices and VFIO-failure case, make up + * the loc-code. + */ + buf = g_strdup_printf("%s_%s:%04x:%02x:%02x.%x", + devtype, pdev->name, sphb->index, busnr, + PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); + return buf; +} + /* Macros to operate with address in OF binding to PCI */ #define b_x(x, p, l) (((x) & ((1<<(l))-1)) << (p)) #define b_n(x) b_x((x), 31, 1) /* 0 if relocatable */ @@ -786,7 +852,13 @@ typedef struct ResourceProps { * phys.hi = 0xYYXXXXZZ, where: * 0xYY = npt000ss * ||| | - * ||| +-- space code: 1 if IO region, 2 if MEM region + * ||| +-- space code + * ||| | + * ||| + 00 if configuration space + * ||| + 01 if IO region, + * ||| + 10 if 32-bit MEM region + * ||| + 11 if 64-bit MEM region + * ||| * ||+------ for non-relocatable IO: 1 if aliased * || for relocatable IO: 1 if below 64KB * || for MEM: 1 if below 1MB @@ -846,6 +918,8 @@ static void populate_resource_props(PCIDevice *d, ResourceProps *rp) reg->phys_hi = cpu_to_be32(dev_id | b_rrrrrrrr(pci_bar(d, i))); if (d->io_regions[i].type & PCI_BASE_ADDRESS_SPACE_IO) { reg->phys_hi |= cpu_to_be32(b_ss(1)); + } else if (d->io_regions[i].type & PCI_BASE_ADDRESS_MEM_TYPE_64) { + reg->phys_hi |= cpu_to_be32(b_ss(3)); } else { reg->phys_hi |= cpu_to_be32(b_ss(2)); } @@ -870,13 +944,17 @@ static void populate_resource_props(PCIDevice *d, ResourceProps *rp) rp->assigned_len = assigned_idx * sizeof(ResourceFields); } +static uint32_t spapr_phb_get_pci_drc_index(sPAPRPHBState *phb, + PCIDevice *pdev); + static int spapr_populate_pci_child_dt(PCIDevice *dev, void *fdt, int offset, - int phb_index, int drc_index, - const char *drc_name) + sPAPRPHBState *sphb) { ResourceProps rp; bool is_bridge = false; - int pci_status; + int pci_status, err; + char *buf = NULL; + uint32_t drc_index = spapr_phb_get_pci_drc_index(sphb, dev); if (pci_default_read_config(dev, PCI_HEADER_TYPE, 1) == PCI_HEADER_TYPE_BRIDGE) { @@ -891,8 +969,7 @@ static int spapr_populate_pci_child_dt(PCIDevice *dev, void *fdt, int offset, _FDT(fdt_setprop_cell(fdt, offset, "revision-id", pci_default_read_config(dev, PCI_REVISION_ID, 1))); _FDT(fdt_setprop_cell(fdt, offset, "class-code", - pci_default_read_config(dev, PCI_CLASS_DEVICE, 2) - << 8)); + pci_default_read_config(dev, PCI_CLASS_PROG, 3))); if (pci_default_read_config(dev, PCI_INTERRUPT_PIN, 1)) { _FDT(fdt_setprop_cell(fdt, offset, "interrupts", pci_default_read_config(dev, PCI_INTERRUPT_PIN, 1))); @@ -938,8 +1015,21 @@ static int spapr_populate_pci_child_dt(PCIDevice *dev, void *fdt, int offset, * processed by OF beforehand */ _FDT(fdt_setprop_string(fdt, offset, "name", "pci")); - _FDT(fdt_setprop(fdt, offset, "ibm,loc-code", drc_name, strlen(drc_name))); - _FDT(fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_index)); + buf = spapr_phb_get_loc_code(sphb, dev); + if (!buf) { + error_report("Failed setting the ibm,loc-code"); + return -1; + } + + err = fdt_setprop_string(fdt, offset, "ibm,loc-code", buf); + g_free(buf); + if (err < 0) { + return err; + } + + if (drc_index) { + _FDT(fdt_setprop_cell(fdt, offset, "ibm,my-drc-index", drc_index)); + } _FDT(fdt_setprop_cell(fdt, offset, "#address-cells", RESOURCE_CELLS_ADDRESS)); @@ -957,29 +1047,27 @@ static int spapr_populate_pci_child_dt(PCIDevice *dev, void *fdt, int offset, } /* create OF node for pci device and required OF DT properties */ -static void *spapr_create_pci_child_dt(sPAPRPHBState *phb, PCIDevice *dev, - int drc_index, const char *drc_name, - int *dt_offset) +static int spapr_create_pci_child_dt(sPAPRPHBState *phb, PCIDevice *dev, + void *fdt, int node_offset) { - void *fdt; - int offset, ret, fdt_size; + int offset, ret; int slot = PCI_SLOT(dev->devfn); int func = PCI_FUNC(dev->devfn); - char nodename[512]; + char nodename[FDT_NAME_MAX]; - fdt = create_device_tree(&fdt_size); if (func != 0) { - sprintf(nodename, "pci@%d,%d", slot, func); + snprintf(nodename, FDT_NAME_MAX, "pci@%x,%x", slot, func); } else { - sprintf(nodename, "pci@%d", slot); + snprintf(nodename, FDT_NAME_MAX, "pci@%x", slot); } - offset = fdt_add_subnode(fdt, 0, nodename); - ret = spapr_populate_pci_child_dt(dev, fdt, offset, phb->index, drc_index, - drc_name); - g_assert(!ret); + offset = fdt_add_subnode(fdt, node_offset, nodename); + ret = spapr_populate_pci_child_dt(dev, fdt, offset, phb); - *dt_offset = offset; - return fdt; + g_assert(!ret); + if (ret) { + return 0; + } + return offset; } static void spapr_phb_add_pci_device(sPAPRDRConnector *drc, @@ -989,22 +1077,21 @@ static void spapr_phb_add_pci_device(sPAPRDRConnector *drc, { sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc); DeviceState *dev = DEVICE(pdev); - int drc_index = drck->get_index(drc); - const char *drc_name = drck->get_name(drc); void *fdt = NULL; - int fdt_start_offset = 0; + int fdt_start_offset = 0, fdt_size; - /* boot-time devices get their device tree node created by SLOF, but for - * hotplugged devices we need QEMU to generate it so the guest can fetch - * it via RTAS - */ if (dev->hotplugged) { - fdt = spapr_create_pci_child_dt(phb, pdev, drc_index, drc_name, - &fdt_start_offset); + fdt = create_device_tree(&fdt_size); + fdt_start_offset = spapr_create_pci_child_dt(phb, pdev, fdt, 0); + if (!fdt_start_offset) { + error_setg(errp, "Failed to create pci child device tree node"); + goto out; + } } drck->attach(drc, DEVICE(pdev), fdt, fdt_start_offset, !dev->hotplugged, errp); +out: if (*errp) { g_free(fdt); } @@ -1046,6 +1133,20 @@ static sPAPRDRConnector *spapr_phb_get_pci_drc(sPAPRPHBState *phb, pdev->devfn); } +static uint32_t spapr_phb_get_pci_drc_index(sPAPRPHBState *phb, + PCIDevice *pdev) +{ + sPAPRDRConnector *drc = spapr_phb_get_pci_drc(phb, pdev); + sPAPRDRConnectorClass *drck; + + if (!drc) { + return 0; + } + + drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc); + return drck->get_index(drc); +} + static void spapr_phb_hot_plug_child(HotplugHandler *plug_handler, DeviceState *plugged_dev, Error **errp) { @@ -1110,6 +1211,7 @@ static void spapr_phb_hot_unplug_child(HotplugHandler *plug_handler, static void spapr_phb_realize(DeviceState *dev, Error **errp) { + sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); SysBusDevice *s = SYS_BUS_DEVICE(dev); sPAPRPHBState *sphb = SPAPR_PCI_HOST_BRIDGE(s); PCIHostState *phb = PCI_HOST_BRIDGE(s); @@ -1351,34 +1453,28 @@ static const VMStateDescription vmstate_spapr_pci_msi = { }, }; -static void spapr_pci_fill_msi_devs(gpointer key, gpointer value, - gpointer opaque) -{ - sPAPRPHBState *sphb = opaque; - - sphb->msi_devs[sphb->msi_devs_num].key = *(uint32_t *)key; - sphb->msi_devs[sphb->msi_devs_num].value = *(spapr_pci_msi *)value; - sphb->msi_devs_num++; -} - static void spapr_pci_pre_save(void *opaque) { sPAPRPHBState *sphb = opaque; - int msi_devs_num; + GHashTableIter iter; + gpointer key, value; + int i; if (sphb->msi_devs) { g_free(sphb->msi_devs); sphb->msi_devs = NULL; } - sphb->msi_devs_num = 0; - msi_devs_num = g_hash_table_size(sphb->msi); - if (!msi_devs_num) { + sphb->msi_devs_num = g_hash_table_size(sphb->msi); + if (!sphb->msi_devs_num) { return; } - sphb->msi_devs = g_malloc(msi_devs_num * sizeof(spapr_pci_msi_mig)); + sphb->msi_devs = g_malloc(sphb->msi_devs_num * sizeof(spapr_pci_msi_mig)); - g_hash_table_foreach(sphb->msi, spapr_pci_fill_msi_devs, sphb); - assert(sphb->msi_devs_num == msi_devs_num); + g_hash_table_iter_init(&iter, sphb->msi); + for (i = 0; g_hash_table_iter_next(&iter, &key, &value); ++i) { + sphb->msi_devs[i].key = *(uint32_t *) key; + sphb->msi_devs[i].value = *(spapr_pci_msi *) value; + } } static int spapr_pci_post_load(void *opaque, int version_id) @@ -1464,7 +1560,7 @@ static const TypeInfo spapr_phb_info = { } }; -PCIHostState *spapr_create_phb(sPAPREnvironment *spapr, int index) +PCIHostState *spapr_create_phb(sPAPRMachineState *spapr, int index) { DeviceState *dev; @@ -1475,12 +1571,90 @@ PCIHostState *spapr_create_phb(sPAPREnvironment *spapr, int index) return PCI_HOST_BRIDGE(dev); } +typedef struct sPAPRFDT { + void *fdt; + int node_off; + sPAPRPHBState *sphb; +} sPAPRFDT; + +static void spapr_populate_pci_devices_dt(PCIBus *bus, PCIDevice *pdev, + void *opaque) +{ + PCIBus *sec_bus; + sPAPRFDT *p = opaque; + int offset; + sPAPRFDT s_fdt; + + offset = spapr_create_pci_child_dt(p->sphb, pdev, p->fdt, p->node_off); + if (!offset) { + error_report("Failed to create pci child device tree node"); + return; + } + + if ((pci_default_read_config(pdev, PCI_HEADER_TYPE, 1) != + PCI_HEADER_TYPE_BRIDGE)) { + return; + } + + sec_bus = pci_bridge_get_sec_bus(PCI_BRIDGE(pdev)); + if (!sec_bus) { + return; + } + + s_fdt.fdt = p->fdt; + s_fdt.node_off = offset; + s_fdt.sphb = p->sphb; + pci_for_each_device(sec_bus, pci_bus_num(sec_bus), + spapr_populate_pci_devices_dt, + &s_fdt); +} + +static void spapr_phb_pci_enumerate_bridge(PCIBus *bus, PCIDevice *pdev, + void *opaque) +{ + unsigned int *bus_no = opaque; + unsigned int primary = *bus_no; + unsigned int subordinate = 0xff; + PCIBus *sec_bus = NULL; + + if ((pci_default_read_config(pdev, PCI_HEADER_TYPE, 1) != + PCI_HEADER_TYPE_BRIDGE)) { + return; + } + + (*bus_no)++; + pci_default_write_config(pdev, PCI_PRIMARY_BUS, primary, 1); + pci_default_write_config(pdev, PCI_SECONDARY_BUS, *bus_no, 1); + pci_default_write_config(pdev, PCI_SUBORDINATE_BUS, *bus_no, 1); + + sec_bus = pci_bridge_get_sec_bus(PCI_BRIDGE(pdev)); + if (!sec_bus) { + return; + } + + pci_default_write_config(pdev, PCI_SUBORDINATE_BUS, subordinate, 1); + pci_for_each_device(sec_bus, pci_bus_num(sec_bus), + spapr_phb_pci_enumerate_bridge, bus_no); + pci_default_write_config(pdev, PCI_SUBORDINATE_BUS, *bus_no, 1); +} + +static void spapr_phb_pci_enumerate(sPAPRPHBState *phb) +{ + PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus; + unsigned int bus_no = 0; + + pci_for_each_device(bus, pci_bus_num(bus), + spapr_phb_pci_enumerate_bridge, + &bus_no); + +} + int spapr_populate_pci_dt(sPAPRPHBState *phb, uint32_t xics_phandle, void *fdt) { int bus_off, i, j, ret; - char nodename[256]; + char nodename[FDT_NAME_MAX]; uint32_t bus_range[] = { cpu_to_be32(0), cpu_to_be32(0xff) }; const uint64_t mmiosize = memory_region_size(&phb->memwindow); const uint64_t w32max = (1ULL << 32) - SPAPR_PCI_MEM_WIN_BUS_OFFSET; @@ -1514,9 +1688,11 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, cpu_to_be32(b_ddddd(-1)|b_fff(0)), 0x0, 0x0, cpu_to_be32(-1)}; uint32_t interrupt_map[PCI_SLOT_MAX * PCI_NUM_PINS][7]; sPAPRTCETable *tcet; + PCIBus *bus = PCI_HOST_BRIDGE(phb)->bus; + sPAPRFDT s_fdt; /* Start populating the FDT */ - sprintf(nodename, "pci@%" PRIx64, phb->buid); + snprintf(nodename, FDT_NAME_MAX, "pci@%" PRIx64, phb->buid); bus_off = fdt_add_subnode(fdt, 0, nodename); if (bus_off < 0) { return bus_off; @@ -1563,6 +1739,18 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, tcet->liobn, tcet->bus_offset, tcet->nb_table << tcet->page_shift); + /* Walk the bridges and program the bus numbers*/ + spapr_phb_pci_enumerate(phb); + _FDT(fdt_setprop_cell(fdt, bus_off, "qemu,phb-enumerated", 0x1)); + + /* Populate tree nodes with PCI devices attached */ + s_fdt.fdt = fdt; + s_fdt.node_off = bus_off; + s_fdt.sphb = phb; + pci_for_each_device(bus, pci_bus_num(bus), + spapr_populate_pci_devices_dt, + &s_fdt); + ret = spapr_drc_populate_dt(fdt, bus_off, OBJECT(phb), SPAPR_DR_CONNECTOR_TYPE_PCI); if (ret) { @@ -1631,6 +1819,7 @@ static int spapr_switch_one_vga(DeviceState *dev, void *opaque) void spapr_pci_switch_vga(bool big_endian) { + sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); sPAPRPHBState *sphb; /* diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c index 99a1be5113..cca45ed312 100644 --- a/hw/ppc/spapr_pci_vfio.c +++ b/hw/ppc/spapr_pci_vfio.c @@ -19,6 +19,7 @@ #include "hw/ppc/spapr.h" #include "hw/pci-host/spapr.h" +#include "hw/pci/msix.h" #include "linux/vfio.h" #include "hw/vfio/vfio.h" @@ -71,9 +72,26 @@ static void spapr_phb_vfio_finish_realize(sPAPRPHBState *sphb, Error **errp) spapr_tce_get_iommu(tcet)); } +static void spapr_phb_vfio_eeh_reenable(sPAPRPHBVFIOState *svphb) +{ + struct vfio_eeh_pe_op op = { + .argsz = sizeof(op), + .op = VFIO_EEH_PE_ENABLE + }; + + vfio_container_ioctl(&svphb->phb.iommu_as, + svphb->iommugroupid, VFIO_EEH_PE_OP, &op); +} + static void spapr_phb_vfio_reset(DeviceState *qdev) { - /* Do nothing */ + /* + * The PE might be in frozen state. To reenable the EEH + * functionality on it will clean the frozen state, which + * ensures that the contained PCI devices will work properly + * after reboot. + */ + spapr_phb_vfio_eeh_reenable(SPAPR_PCI_VFIO_HOST_BRIDGE(qdev)); } static int spapr_phb_vfio_eeh_set_option(sPAPRPHBState *sphb, @@ -142,6 +160,49 @@ static int spapr_phb_vfio_eeh_get_state(sPAPRPHBState *sphb, int *state) return RTAS_OUT_SUCCESS; } +static void spapr_phb_vfio_eeh_clear_dev_msix(PCIBus *bus, + PCIDevice *pdev, + void *opaque) +{ + /* Check if the device is VFIO PCI device */ + if (!object_dynamic_cast(OBJECT(pdev), "vfio-pci")) { + return; + } + + /* + * The MSIx table will be cleaned out by reset. We need + * disable it so that it can be reenabled properly. Also, + * the cached MSIx table should be cleared as it's not + * reflecting the contents in hardware. + */ + if (msix_enabled(pdev)) { + uint16_t flags; + + flags = pci_host_config_read_common(pdev, + pdev->msix_cap + PCI_MSIX_FLAGS, + pci_config_size(pdev), 2); + flags &= ~PCI_MSIX_FLAGS_ENABLE; + pci_host_config_write_common(pdev, + pdev->msix_cap + PCI_MSIX_FLAGS, + pci_config_size(pdev), flags, 2); + } + + msix_reset(pdev); +} + +static void spapr_phb_vfio_eeh_clear_bus_msix(PCIBus *bus, void *opaque) +{ + pci_for_each_device(bus, pci_bus_num(bus), + spapr_phb_vfio_eeh_clear_dev_msix, NULL); +} + +static void spapr_phb_vfio_eeh_pre_reset(sPAPRPHBState *sphb) +{ + PCIHostState *phb = PCI_HOST_BRIDGE(sphb); + + pci_for_each_bus(phb->bus, spapr_phb_vfio_eeh_clear_bus_msix, NULL); +} + static int spapr_phb_vfio_eeh_reset(sPAPRPHBState *sphb, int option) { sPAPRPHBVFIOState *svphb = SPAPR_PCI_VFIO_HOST_BRIDGE(sphb); @@ -153,9 +214,11 @@ static int spapr_phb_vfio_eeh_reset(sPAPRPHBState *sphb, int option) op.op = VFIO_EEH_PE_RESET_DEACTIVATE; break; case RTAS_SLOT_RESET_HOT: + spapr_phb_vfio_eeh_pre_reset(sphb); op.op = VFIO_EEH_PE_RESET_HOT; break; case RTAS_SLOT_RESET_FUNDAMENTAL: + spapr_phb_vfio_eeh_pre_reset(sphb); op.op = VFIO_EEH_PE_RESET_FUNDAMENTAL; break; default: diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c index fa28d43f81..2986f94f03 100644 --- a/hw/ppc/spapr_rtas.c +++ b/hw/ppc/spapr_rtas.c @@ -29,6 +29,7 @@ #include "sysemu/char.h" #include "hw/qdev.h" #include "sysemu/device_tree.h" +#include "sysemu/cpus.h" #include "hw/ppc/spapr.h" #include "hw/ppc/spapr_vio.h" @@ -47,7 +48,7 @@ do { } while (0) #endif -static sPAPRConfigureConnectorState *spapr_ccs_find(sPAPREnvironment *spapr, +static sPAPRConfigureConnectorState *spapr_ccs_find(sPAPRMachineState *spapr, uint32_t drc_index) { sPAPRConfigureConnectorState *ccs = NULL; @@ -61,14 +62,14 @@ static sPAPRConfigureConnectorState *spapr_ccs_find(sPAPREnvironment *spapr, return ccs; } -static void spapr_ccs_add(sPAPREnvironment *spapr, +static void spapr_ccs_add(sPAPRMachineState *spapr, sPAPRConfigureConnectorState *ccs) { g_assert(!spapr_ccs_find(spapr, ccs->drc_index)); QTAILQ_INSERT_HEAD(&spapr->ccs_list, ccs, next); } -static void spapr_ccs_remove(sPAPREnvironment *spapr, +static void spapr_ccs_remove(sPAPRMachineState *spapr, sPAPRConfigureConnectorState *ccs) { QTAILQ_REMOVE(&spapr->ccs_list, ccs, next); @@ -77,7 +78,7 @@ static void spapr_ccs_remove(sPAPREnvironment *spapr, void spapr_ccs_reset_hook(void *opaque) { - sPAPREnvironment *spapr = opaque; + sPAPRMachineState *spapr = opaque; sPAPRConfigureConnectorState *ccs, *ccs_tmp; QTAILQ_FOREACH_SAFE(ccs, &spapr->ccs_list, next, ccs_tmp) { @@ -85,7 +86,7 @@ void spapr_ccs_reset_hook(void *opaque) } } -static void rtas_display_character(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_display_character(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -101,7 +102,7 @@ static void rtas_display_character(PowerPCCPU *cpu, sPAPREnvironment *spapr, } } -static void rtas_power_off(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_power_off(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) { @@ -113,7 +114,7 @@ static void rtas_power_off(PowerPCCPU *cpu, sPAPREnvironment *spapr, rtas_st(rets, 0, RTAS_OUT_SUCCESS); } -static void rtas_system_reboot(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_system_reboot(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -127,7 +128,7 @@ static void rtas_system_reboot(PowerPCCPU *cpu, sPAPREnvironment *spapr, } static void rtas_query_cpu_stopped_state(PowerPCCPU *cpu_, - sPAPREnvironment *spapr, + sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -157,7 +158,7 @@ static void rtas_query_cpu_stopped_state(PowerPCCPU *cpu_, rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); } -static void rtas_start_cpu(PowerPCCPU *cpu_, sPAPREnvironment *spapr, +static void rtas_start_cpu(PowerPCCPU *cpu_, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -204,7 +205,7 @@ static void rtas_start_cpu(PowerPCCPU *cpu_, sPAPREnvironment *spapr, rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); } -static void rtas_stop_self(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_stop_self(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -227,7 +228,7 @@ static void rtas_stop_self(PowerPCCPU *cpu, sPAPREnvironment *spapr, } static void rtas_ibm_get_system_parameter(PowerPCCPU *cpu, - sPAPREnvironment *spapr, + sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -262,7 +263,7 @@ static void rtas_ibm_get_system_parameter(PowerPCCPU *cpu, } static void rtas_ibm_set_system_parameter(PowerPCCPU *cpu, - sPAPREnvironment *spapr, + sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -282,7 +283,7 @@ static void rtas_ibm_set_system_parameter(PowerPCCPU *cpu, } static void rtas_ibm_os_term(PowerPCCPU *cpu, - sPAPREnvironment *spapr, + sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -294,7 +295,7 @@ static void rtas_ibm_os_term(PowerPCCPU *cpu, rtas_st(rets, 0, ret); } -static void rtas_set_power_level(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_set_power_level(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -319,7 +320,7 @@ static void rtas_set_power_level(PowerPCCPU *cpu, sPAPREnvironment *spapr, rtas_st(rets, 1, 100); } -static void rtas_get_power_level(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_get_power_level(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -356,7 +357,7 @@ static bool sensor_type_is_dr(uint32_t sensor_type) return false; } -static void rtas_set_indicator(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_set_indicator(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -427,7 +428,7 @@ out_unimplemented: rtas_st(rets, 0, RTAS_OUT_NOT_SUPPORTED); } -static void rtas_get_sensor_state(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_get_sensor_state(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -481,7 +482,7 @@ static void rtas_get_sensor_state(PowerPCCPU *cpu, sPAPREnvironment *spapr, #define CC_WA_LEN 4096 static void rtas_ibm_configure_connector(PowerPCCPU *cpu, - sPAPREnvironment *spapr, + sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -601,7 +602,7 @@ static struct rtas_call { spapr_rtas_fn fn; } rtas_table[RTAS_TOKEN_MAX - RTAS_TOKEN_BASE]; -target_ulong spapr_rtas_call(PowerPCCPU *cpu, sPAPREnvironment *spapr, +target_ulong spapr_rtas_call(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) { @@ -651,6 +652,8 @@ int spapr_rtas_device_tree_setup(void *fdt, hwaddr rtas_addr, { int ret; int i; + uint32_t lrdr_capacity[5]; + MachineState *machine = MACHINE(qdev_get_machine()); ret = fdt_add_mem_rsv(fdt, rtas_addr, rtas_size); if (ret < 0) { @@ -699,6 +702,19 @@ int spapr_rtas_device_tree_setup(void *fdt, hwaddr rtas_addr, } } + + lrdr_capacity[0] = cpu_to_be32(((uint64_t)machine->maxram_size) >> 32); + lrdr_capacity[1] = cpu_to_be32(machine->maxram_size & 0xffffffff); + lrdr_capacity[2] = 0; + lrdr_capacity[3] = cpu_to_be32(SPAPR_MEMORY_BLOCK_SIZE); + lrdr_capacity[4] = cpu_to_be32(max_cpus/smp_threads); + ret = qemu_fdt_setprop(fdt, "/rtas", "ibm,lrdr-capacity", lrdr_capacity, + sizeof(lrdr_capacity)); + if (ret < 0) { + fprintf(stderr, "Couldn't add ibm,lrdr-capacity rtas property\n"); + return ret; + } + return 0; } diff --git a/hw/ppc/spapr_rtc.c b/hw/ppc/spapr_rtc.c index 9da3746e7c..d20b8f270c 100644 --- a/hw/ppc/spapr_rtc.c +++ b/hw/ppc/spapr_rtc.c @@ -76,7 +76,7 @@ int spapr_rtc_import_offset(DeviceState *dev, int64_t legacy_offset) return 0; } -static void rtas_get_time_of_day(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_get_time_of_day(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -106,7 +106,7 @@ static void rtas_get_time_of_day(PowerPCCPU *cpu, sPAPREnvironment *spapr, rtas_st(rets, 7, ns); } -static void rtas_set_time_of_day(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_set_time_of_day(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) diff --git a/hw/ppc/spapr_vio.c b/hw/ppc/spapr_vio.c index 8b59b64b7e..c51eb8e244 100644 --- a/hw/ppc/spapr_vio.c +++ b/hw/ppc/spapr_vio.c @@ -160,7 +160,7 @@ static int vio_make_devnode(VIOsPAPRDevice *dev, /* * CRQ handling */ -static target_ulong h_reg_crq(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_reg_crq(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { target_ulong reg = args[0]; @@ -218,7 +218,7 @@ static target_ulong free_crq(VIOsPAPRDevice *dev) return H_SUCCESS; } -static target_ulong h_free_crq(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_free_crq(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { target_ulong reg = args[0]; @@ -232,7 +232,7 @@ static target_ulong h_free_crq(PowerPCCPU *cpu, sPAPREnvironment *spapr, return free_crq(dev); } -static target_ulong h_send_crq(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_send_crq(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { target_ulong reg = args[0]; @@ -255,7 +255,7 @@ static target_ulong h_send_crq(PowerPCCPU *cpu, sPAPREnvironment *spapr, return H_HARDWARE; } -static target_ulong h_enable_crq(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_enable_crq(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { target_ulong reg = args[0]; @@ -333,7 +333,7 @@ void spapr_vio_set_bypass(VIOsPAPRDevice *dev, bool bypass) dev->tcet->bypass = bypass; } -static void rtas_set_tce_bypass(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_set_tce_bypass(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -364,7 +364,7 @@ static void rtas_set_tce_bypass(PowerPCCPU *cpu, sPAPREnvironment *spapr, rtas_st(rets, 0, RTAS_OUT_SUCCESS); } -static void rtas_quiesce(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static void rtas_quiesce(PowerPCCPU *cpu, sPAPRMachineState *spapr, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets) @@ -426,6 +426,7 @@ static void spapr_vio_busdev_reset(DeviceState *qdev) static void spapr_vio_busdev_realize(DeviceState *qdev, Error **errp) { + sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); VIOsPAPRDevice *dev = (VIOsPAPRDevice *)qdev; VIOsPAPRDeviceClass *pc = VIO_SPAPR_DEVICE_GET_CLASS(dev); char *id; @@ -491,7 +492,7 @@ static void spapr_vio_busdev_realize(DeviceState *qdev, Error **errp) pc->realize(dev, errp); } -static target_ulong h_vio_signal(PowerPCCPU *cpu, sPAPREnvironment *spapr, +static target_ulong h_vio_signal(PowerPCCPU *cpu, sPAPRMachineState *spapr, target_ulong opcode, target_ulong *args) { diff --git a/hw/ppc/virtex_ml507.c b/hw/ppc/virtex_ml507.c index 439732f7ab..de86f7c64c 100644 --- a/hw/ppc/virtex_ml507.c +++ b/hw/ppc/virtex_ml507.c @@ -197,7 +197,6 @@ static int xilinx_load_device_tree(hwaddr addr, static void virtex_init(MachineState *machine) { ram_addr_t ram_size = machine->ram_size; - const char *cpu_model = machine->cpu_model; const char *kernel_filename = machine->kernel_filename; const char *kernel_cmdline = machine->kernel_cmdline; hwaddr initrd_base = 0; @@ -214,11 +213,11 @@ static void virtex_init(MachineState *machine) int i; /* init CPUs */ - if (cpu_model == NULL) { - cpu_model = "440-Xilinx"; + if (machine->cpu_model == NULL) { + machine->cpu_model = "440-Xilinx"; } - cpu = ppc440_init_xilinx(&ram_size, 1, cpu_model, 400000000); + cpu = ppc440_init_xilinx(&ram_size, 1, machine->cpu_model, 400000000); env = &cpu->env; qemu_register_reset(main_cpu_reset, cpu); diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c index d631337e11..e345a6e9b8 100644 --- a/hw/s390x/virtio-ccw.c +++ b/hw/s390x/virtio-ccw.c @@ -1316,8 +1316,8 @@ static int virtio_ccw_add_irqfd(VirtioCcwDevice *dev, int n) VirtQueue *vq = virtio_get_queue(vdev, n); EventNotifier *notifier = virtio_queue_get_guest_notifier(vq); - return kvm_irqchip_add_irqfd_notifier(kvm_state, notifier, NULL, - dev->routes.gsi[n]); + return kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, notifier, NULL, + dev->routes.gsi[n]); } static void virtio_ccw_remove_irqfd(VirtioCcwDevice *dev, int n) @@ -1327,8 +1327,8 @@ static void virtio_ccw_remove_irqfd(VirtioCcwDevice *dev, int n) EventNotifier *notifier = virtio_queue_get_guest_notifier(vq); int ret; - ret = kvm_irqchip_remove_irqfd_notifier(kvm_state, notifier, - dev->routes.gsi[n]); + ret = kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, notifier, + dev->routes.gsi[n]); assert(ret == 0); } diff --git a/hw/vfio/common.c b/hw/vfio/common.c index b1045da857..85ee9b005e 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -772,11 +772,19 @@ static void vfio_disconnect_container(VFIOGroup *group) if (QLIST_EMPTY(&container->group_list)) { VFIOAddressSpace *space = container->space; + VFIOGuestIOMMU *giommu, *tmp; if (container->iommu_data.release) { container->iommu_data.release(container); } QLIST_REMOVE(container, next); + + QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) { + memory_region_unregister_iommu_notifier(&giommu->n); + QLIST_REMOVE(giommu, giommu_next); + g_free(giommu); + } + trace_vfio_disconnect_container(container->fd); close(container->fd); g_free(container); diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index e0e339a534..2ed877fe9f 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -597,7 +597,7 @@ static void vfio_add_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage *msg, return; } - if (kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->kvm_interrupt, + if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, NULL, virq) < 0) { kvm_irqchip_release_virq(kvm_state, virq); event_notifier_cleanup(&vector->kvm_interrupt); @@ -609,8 +609,8 @@ static void vfio_add_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage *msg, static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector) { - kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->kvm_interrupt, - vector->virq); + kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, + vector->virq); kvm_irqchip_release_virq(kvm_state, vector->virq); vector->virq = -1; event_notifier_cleanup(&vector->kvm_interrupt); @@ -939,7 +939,7 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev) }; uint64_t size; off_t off = 0; - size_t bytes; + ssize_t bytes; if (ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_REGION_INFO, ®_info)) { error_report("vfio: Error getting ROM info: %m"); @@ -2252,6 +2252,33 @@ static int vfio_early_setup_msix(VFIOPCIDevice *vdev) vdev->msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK; vdev->msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; + /* + * Test the size of the pba_offset variable and catch if it extends outside + * of the specified BAR. If it is the case, we need to apply a hardware + * specific quirk if the device is known or we have a broken configuration. + */ + if (vdev->msix->pba_offset >= + vdev->bars[vdev->msix->pba_bar].region.size) { + + PCIDevice *pdev = &vdev->pdev; + uint16_t vendor = pci_get_word(pdev->config + PCI_VENDOR_ID); + uint16_t device = pci_get_word(pdev->config + PCI_DEVICE_ID); + + /* + * Chelsio T5 Virtual Function devices are encoded as 0x58xx for T5 + * adapters. The T5 hardware returns an incorrect value of 0x8000 for + * the VF PBA offset while the BAR itself is only 8k. The correct value + * is 0x1000, so we hard code that here. + */ + if (vendor == PCI_VENDOR_ID_CHELSIO && (device & 0xff00) == 0x5800) { + vdev->msix->pba_offset = 0x1000; + } else { + error_report("vfio: Hardware reports invalid configuration, " + "MSIX PBA outside of specified BAR"); + return -EINVAL; + } + } + trace_vfio_early_setup_msix(vdev->vbasedev.name, pos, vdev->msix->table_bar, vdev->msix->table_offset, @@ -2388,7 +2415,7 @@ static void vfio_map_bar(VFIOPCIDevice *vdev, int nr) * potentially insert a direct-mapped subregion before and after it. */ if (vdev->msix && vdev->msix->table_bar == nr) { - size = vdev->msix->table_offset & qemu_host_page_mask; + size = vdev->msix->table_offset & qemu_real_host_page_mask; } strncat(name, " mmap", sizeof(name) - strlen(name) - 1); @@ -2401,8 +2428,9 @@ static void vfio_map_bar(VFIOPCIDevice *vdev, int nr) if (vdev->msix && vdev->msix->table_bar == nr) { uint64_t start; - start = HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset + - (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE)); + start = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset + + (vdev->msix->entries * + PCI_MSIX_ENTRY_SIZE)); size = start < bar->region.size ? bar->region.size - start : 0; strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1); diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index 5c678b914e..60365d1279 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -26,6 +26,7 @@ #include "hw/sysbus.h" #include "trace.h" #include "hw/platform-bus.h" +#include "sysemu/kvm.h" /* * Functions used whatever the injection method @@ -51,6 +52,7 @@ static VFIOINTp *vfio_init_intp(VFIODevice *vbasedev, intp->pin = info.index; intp->flags = info.flags; intp->state = VFIO_IRQ_INACTIVE; + intp->kvm_accel = false; sysbus_init_irq(sbdev, &intp->qemuirq); @@ -61,6 +63,13 @@ static VFIOINTp *vfio_init_intp(VFIODevice *vbasedev, error_report("vfio: Error: trigger event_notifier_init failed "); return NULL; } + /* Get an eventfd for resample/unmask */ + ret = event_notifier_init(&intp->unmask, 0); + if (ret) { + g_free(intp); + error_report("vfio: Error: resamplefd event_notifier_init failed"); + return NULL; + } QLIST_INSERT_HEAD(&vdev->intp_list, intp, next); return intp; @@ -315,6 +324,94 @@ static int vfio_start_eventfd_injection(VFIOINTp *intp) return ret; } +/* + * Functions used for irqfd + */ + +/** + * vfio_set_resample_eventfd - sets the resamplefd for an IRQ + * @intp: the IRQ struct handle + * programs the VFIO driver to unmask this IRQ when the + * intp->unmask eventfd is triggered + */ +static int vfio_set_resample_eventfd(VFIOINTp *intp) +{ + VFIODevice *vbasedev = &intp->vdev->vbasedev; + struct vfio_irq_set *irq_set; + int argsz, ret; + int32_t *pfd; + + argsz = sizeof(*irq_set) + sizeof(*pfd); + irq_set = g_malloc0(argsz); + irq_set->argsz = argsz; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK; + irq_set->index = intp->pin; + irq_set->start = 0; + irq_set->count = 1; + pfd = (int32_t *)&irq_set->data; + *pfd = event_notifier_get_fd(&intp->unmask); + qemu_set_fd_handler(*pfd, NULL, NULL, NULL); + ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set); + g_free(irq_set); + if (ret < 0) { + error_report("vfio: Failed to set resample eventfd: %m"); + } + return ret; +} + +static void vfio_start_irqfd_injection(SysBusDevice *sbdev, qemu_irq irq) +{ + VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(sbdev); + VFIOINTp *intp; + + if (!kvm_irqfds_enabled() || !kvm_resamplefds_enabled() || + !vdev->irqfd_allowed) { + return; + } + + QLIST_FOREACH(intp, &vdev->intp_list, next) { + if (intp->qemuirq == irq) { + break; + } + } + assert(intp); + + /* Get to a known interrupt state */ + qemu_set_fd_handler(event_notifier_get_fd(&intp->interrupt), + NULL, NULL, vdev); + + vfio_mask_single_irqindex(&vdev->vbasedev, intp->pin); + qemu_set_irq(intp->qemuirq, 0); + + if (kvm_irqchip_add_irqfd_notifier(kvm_state, &intp->interrupt, + &intp->unmask, irq) < 0) { + goto fail_irqfd; + } + + if (vfio_set_trigger_eventfd(intp, NULL) < 0) { + goto fail_vfio; + } + if (vfio_set_resample_eventfd(intp) < 0) { + goto fail_vfio; + } + + /* Let's resume injection with irqfd setup */ + vfio_unmask_single_irqindex(&vdev->vbasedev, intp->pin); + + intp->kvm_accel = true; + + trace_vfio_platform_start_irqfd_injection(intp->pin, + event_notifier_get_fd(&intp->interrupt), + event_notifier_get_fd(&intp->unmask)); + return; +fail_vfio: + kvm_irqchip_remove_irqfd_notifier(kvm_state, &intp->interrupt, irq); +fail_irqfd: + vfio_start_eventfd_injection(intp); + vfio_unmask_single_irqindex(&vdev->vbasedev, intp->pin); + return; +} + /* VFIO skeleton */ static void vfio_platform_compute_needs_reset(VFIODevice *vbasedev) @@ -584,17 +681,20 @@ static Property vfio_platform_dev_properties[] = { DEFINE_PROP_BOOL("x-mmap", VFIOPlatformDevice, vbasedev.allow_mmap, true), DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice, mmap_timeout, 1100), + DEFINE_PROP_BOOL("x-irqfd", VFIOPlatformDevice, irqfd_allowed, true), DEFINE_PROP_END_OF_LIST(), }; static void vfio_platform_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); + SysBusDeviceClass *sbc = SYS_BUS_DEVICE_CLASS(klass); dc->realize = vfio_platform_realize; dc->props = vfio_platform_dev_properties; dc->vmsd = &vfio_platform_vmstate; dc->desc = "VFIO-based platform device assignment"; + sbc->connect_irq_notifier = vfio_start_irqfd_injection; set_bit(DEVICE_CATEGORY_MISC, dc->categories); } diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c index 6ca0258067..ccca2b6f3b 100644 --- a/hw/virtio/virtio-pci.c +++ b/hw/virtio/virtio-pci.c @@ -616,7 +616,7 @@ static int kvm_virtio_pci_irqfd_use(VirtIOPCIProxy *proxy, VirtQueue *vq = virtio_get_queue(vdev, queue_no); EventNotifier *n = virtio_queue_get_guest_notifier(vq); int ret; - ret = kvm_irqchip_add_irqfd_notifier(kvm_state, n, NULL, irqfd->virq); + ret = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL, irqfd->virq); return ret; } @@ -630,7 +630,7 @@ static void kvm_virtio_pci_irqfd_release(VirtIOPCIProxy *proxy, VirtIOIRQFD *irqfd = &proxy->vector_irqfd[vector]; int ret; - ret = kvm_irqchip_remove_irqfd_notifier(kvm_state, n, irqfd->virq); + ret = kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, n, irqfd->virq); assert(ret == 0); } diff --git a/include/block/blockjob.h b/include/block/blockjob.h index 57d8ef13e2..dd9d5e6aad 100644 --- a/include/block/blockjob.h +++ b/include/block/blockjob.h @@ -166,6 +166,14 @@ void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns); void block_job_yield(BlockJob *job); /** + * block_job_release: + * @bs: The block device. + * + * Release job resources when an error occurred or job completed. + */ +void block_job_release(BlockDriverState *bs); + +/** * block_job_completed: * @job: The job being completed. * @ret: The status code. diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h index 8999634981..ea6a9a667c 100644 --- a/include/exec/cpu-all.h +++ b/include/exec/cpu-all.h @@ -183,10 +183,13 @@ extern unsigned long reserved_va; /* ??? These should be the larger of uintptr_t and target_ulong. */ extern uintptr_t qemu_real_host_page_size; +extern uintptr_t qemu_real_host_page_mask; extern uintptr_t qemu_host_page_size; extern uintptr_t qemu_host_page_mask; #define HOST_PAGE_ALIGN(addr) (((addr) + qemu_host_page_size - 1) & qemu_host_page_mask) +#define REAL_HOST_PAGE_ALIGN(addr) (((addr) + qemu_real_host_page_size - 1) & \ + qemu_real_host_page_mask) /* same as PROT_xxx */ #define PAGE_READ 0x0001 diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index d678114cb2..2e74760ade 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -365,4 +365,7 @@ static inline bool cpu_can_do_io(CPUState *cpu) return cpu->can_do_io != 0; } +#if !defined(CONFIG_USER_ONLY) +void migration_bitmap_extend(ram_addr_t old, ram_addr_t new); +#endif #endif diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index 954e2681be..15e3352967 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -293,7 +293,12 @@ int e820_get_num_entries(void); bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *); #define PC_COMPAT_2_3 \ - HW_COMPAT_2_3 + HW_COMPAT_2_3 \ + {\ + .driver = TYPE_X86_CPU,\ + .property = "arat",\ + .value = "off",\ + }, #define PC_COMPAT_2_2 \ PC_COMPAT_2_3 \ diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h index 9dca38837b..5322b560eb 100644 --- a/include/hw/pci-host/spapr.h +++ b/include/hw/pci-host/spapr.h @@ -119,21 +119,23 @@ struct sPAPRPHBVFIOState { static inline qemu_irq spapr_phb_lsi_qirq(struct sPAPRPHBState *phb, int pin) { + sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); + return xics_get_qirq(spapr->icp, phb->lsi_table[pin].irq); } -PCIHostState *spapr_create_phb(sPAPREnvironment *spapr, int index); +PCIHostState *spapr_create_phb(sPAPRMachineState *spapr, int index); int spapr_populate_pci_dt(sPAPRPHBState *phb, uint32_t xics_phandle, void *fdt); -void spapr_pci_msi_init(sPAPREnvironment *spapr, hwaddr addr); +void spapr_pci_msi_init(sPAPRMachineState *spapr, hwaddr addr); void spapr_pci_rtas_init(void); -sPAPRPHBState *spapr_pci_find_phb(sPAPREnvironment *spapr, uint64_t buid); -PCIDevice *spapr_pci_find_dev(sPAPREnvironment *spapr, uint64_t buid, +sPAPRPHBState *spapr_pci_find_phb(sPAPRMachineState *spapr, uint64_t buid); +PCIDevice *spapr_pci_find_dev(sPAPRMachineState *spapr, uint64_t buid, uint32_t config_addr); #endif /* __HW_SPAPR_PCI_H__ */ diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h index 49c062b8ce..d98e6c915d 100644 --- a/include/hw/pci/pci_ids.h +++ b/include/hw/pci/pci_ids.h @@ -114,6 +114,8 @@ #define PCI_VENDOR_ID_ENSONIQ 0x1274 #define PCI_DEVICE_ID_ENSONIQ_ES1370 0x5000 +#define PCI_VENDOR_ID_CHELSIO 0x1425 + #define PCI_VENDOR_ID_FREESCALE 0x1957 #define PCI_DEVICE_ID_MPC8533E 0x0030 diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index 7b4b1bb3d7..91a61abbc4 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -2,6 +2,7 @@ #define __HW_SPAPR_H__ #include "sysemu/dma.h" +#include "hw/boards.h" #include "hw/ppc/xics.h" #include "hw/ppc/spapr_drc.h" @@ -12,15 +13,42 @@ typedef struct sPAPRConfigureConnectorState sPAPRConfigureConnectorState; typedef struct sPAPREventLogEntry sPAPREventLogEntry; #define HPTE64_V_HPTE_DIRTY 0x0000000000000040ULL +#define SPAPR_ENTRY_POINT 0x100 + +typedef struct sPAPRMachineClass sPAPRMachineClass; +typedef struct sPAPRMachineState sPAPRMachineState; + +#define TYPE_SPAPR_MACHINE "spapr-machine" +#define SPAPR_MACHINE(obj) \ + OBJECT_CHECK(sPAPRMachineState, (obj), TYPE_SPAPR_MACHINE) +#define SPAPR_MACHINE_GET_CLASS(obj) \ + OBJECT_GET_CLASS(sPAPRMachineClass, obj, TYPE_SPAPR_MACHINE) +#define SPAPR_MACHINE_CLASS(klass) \ + OBJECT_CLASS_CHECK(sPAPRMachineClass, klass, TYPE_SPAPR_MACHINE) + +/** + * sPAPRMachineClass: + */ +struct sPAPRMachineClass { + /*< private >*/ + MachineClass parent_class; + + /*< public >*/ +}; + +/** + * sPAPRMachineState: + */ +struct sPAPRMachineState { + /*< private >*/ + MachineState parent_obj; -typedef struct sPAPREnvironment { struct VIOsPAPRBus *vio_bus; QLIST_HEAD(, sPAPRPHBState) phbs; struct sPAPRNVRAM *nvram; XICSState *icp; DeviceState *rtc; - hwaddr ram_limit; void *htab; uint32_t htab_shift; hwaddr rma_size; @@ -29,7 +57,6 @@ typedef struct sPAPREnvironment { ssize_t rtas_size; void *rtas_blob; void *fdt_skel; - target_ulong entry_point; uint64_t rtc_offset; /* Now used only during incoming migration */ struct PPCTimebase tb; bool has_graphics; @@ -46,7 +73,10 @@ typedef struct sPAPREnvironment { /* RTAS state */ QTAILQ_HEAD(, sPAPRConfigureConnectorState) ccs_list; -} sPAPREnvironment; + + /*< public >*/ + char *kvm_type; +}; #define H_SUCCESS 0 #define H_BUSY 1 /* Hardware busy -- retry later */ @@ -319,8 +349,6 @@ typedef struct sPAPREnvironment { #define KVMPPC_H_CAS (KVMPPC_HCALL_BASE + 0x2) #define KVMPPC_HCALL_MAX KVMPPC_H_CAS -extern sPAPREnvironment *spapr; - typedef struct sPAPRDeviceTreeUpdateHeader { uint32_t version_id; } sPAPRDeviceTreeUpdateHeader; @@ -335,7 +363,7 @@ typedef struct sPAPRDeviceTreeUpdateHeader { do { } while (0) #endif -typedef target_ulong (*spapr_hcall_fn)(PowerPCCPU *cpu, sPAPREnvironment *spapr, +typedef target_ulong (*spapr_hcall_fn)(PowerPCCPU *cpu, sPAPRMachineState *sm, target_ulong opcode, target_ulong *args); @@ -490,12 +518,12 @@ static inline void rtas_st_buffer(target_ulong phys, target_ulong phys_len, rtas_st_buffer_direct(phys + 2, phys_len - 2, buffer, buffer_len); } -typedef void (*spapr_rtas_fn)(PowerPCCPU *cpu, sPAPREnvironment *spapr, +typedef void (*spapr_rtas_fn)(PowerPCCPU *cpu, sPAPRMachineState *sm, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets); void spapr_rtas_register(int token, const char *name, spapr_rtas_fn fn); -target_ulong spapr_rtas_call(PowerPCCPU *cpu, sPAPREnvironment *spapr, +target_ulong spapr_rtas_call(PowerPCCPU *cpu, sPAPRMachineState *sm, uint32_t token, uint32_t nargs, target_ulong args, uint32_t nret, target_ulong rets); int spapr_rtas_device_tree_setup(void *fdt, hwaddr rtas_addr, @@ -546,9 +574,10 @@ struct sPAPREventLogEntry { QTAILQ_ENTRY(sPAPREventLogEntry) next; }; -void spapr_events_init(sPAPREnvironment *spapr); +void spapr_events_init(sPAPRMachineState *sm); void spapr_events_fdt_skel(void *fdt, uint32_t epow_irq); -int spapr_h_cas_compose_response(target_ulong addr, target_ulong size); +int spapr_h_cas_compose_response(sPAPRMachineState *sm, + target_ulong addr, target_ulong size); sPAPRTCETable *spapr_tce_new_table(DeviceState *owner, uint32_t liobn, uint64_t bus_offset, uint32_t page_shift, @@ -578,4 +607,6 @@ void spapr_ccs_reset_hook(void *opaque); void spapr_rtc_read(DeviceState *dev, struct tm *tm, uint32_t *ns); int spapr_rtc_import_offset(DeviceState *dev, int64_t legacy_offset); +#define SPAPR_MEMORY_BLOCK_SIZE (1 << 28) /* 256MB */ + #endif /* !defined (__HW_SPAPR_H__) */ diff --git a/include/hw/ppc/spapr_vio.h b/include/hw/ppc/spapr_vio.h index f95016a92e..2299a5405a 100644 --- a/include/hw/ppc/spapr_vio.h +++ b/include/hw/ppc/spapr_vio.h @@ -88,6 +88,8 @@ extern int spapr_vio_signal(VIOsPAPRDevice *dev, target_ulong mode); static inline qemu_irq spapr_vio_qirq(VIOsPAPRDevice *dev) { + sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine()); + return xics_get_qirq(spapr->icp, dev->irq); } @@ -126,7 +128,7 @@ static inline int spapr_vio_dma_set(VIOsPAPRDevice *dev, uint64_t taddr, int spapr_vio_send_crq(VIOsPAPRDevice *dev, uint8_t *crq); -VIOsPAPRDevice *vty_lookup(sPAPREnvironment *spapr, target_ulong reg); +VIOsPAPRDevice *vty_lookup(sPAPRMachineState *spapr, target_ulong reg); void vty_putchars(VIOsPAPRDevice *sdev, uint8_t *buf, int len); void spapr_vty_create(VIOsPAPRBus *bus, CharDriverState *chardev); void spapr_vlan_create(VIOsPAPRBus *bus, NICInfo *nd); diff --git a/include/hw/ppc/xics.h b/include/hw/ppc/xics.h index a214dd7f28..355a96623c 100644 --- a/include/hw/ppc/xics.h +++ b/include/hw/ppc/xics.h @@ -109,6 +109,7 @@ struct ICPState { uint8_t pending_priority; uint8_t mfrr; qemu_irq output; + bool cap_irq_xics_enabled; }; #define TYPE_ICS "ics" diff --git a/include/hw/sysbus.h b/include/hw/sysbus.h index 34f93c39bf..cc1dba49bf 100644 --- a/include/hw/sysbus.h +++ b/include/hw/sysbus.h @@ -58,6 +58,7 @@ typedef struct SysBusDeviceClass { * omitted then. (This is not considered a fatal error.) */ char *(*explicit_ofw_unit_address)(const SysBusDevice *dev); + void (*connect_irq_notifier)(SysBusDevice *dev, qemu_irq irq); } SysBusDeviceClass; struct SysBusDevice { diff --git a/include/hw/vfio/vfio-platform.h b/include/hw/vfio/vfio-platform.h index 26b2ad6f4e..c5cf1d79f3 100644 --- a/include/hw/vfio/vfio-platform.h +++ b/include/hw/vfio/vfio-platform.h @@ -41,6 +41,7 @@ typedef struct VFIOINTp { int state; /* inactive, pending, active */ uint8_t pin; /* index */ uint32_t flags; /* IRQ info flags */ + bool kvm_accel; /* set when QEMU bypass through KVM enabled */ } VFIOINTp; /* function type for user side eventfd handler */ @@ -57,6 +58,7 @@ typedef struct VFIOPlatformDevice { uint32_t mmap_timeout; /* delay to re-enable mmaps after interrupt */ QEMUTimer *mmap_timer; /* allows fast-path resume after IRQ hit */ QemuMutex intp_mutex; /* protect the intp_list IRQ state */ + bool irqfd_allowed; /* debug option to force irqfd on/off */ } VFIOPlatformDevice; typedef struct VFIOPlatformDeviceClass { diff --git a/include/hw/virtio/virtio-gpu.h b/include/hw/virtio/virtio-gpu.h index b8c9244b21..889676147a 100644 --- a/include/hw/virtio/virtio-gpu.h +++ b/include/hw/virtio/virtio-gpu.h @@ -112,9 +112,6 @@ extern const GraphicHwOps virtio_gpu_ops; VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, false), \ DEFINE_PROP_UINT32("vectors", _state, nvectors, 3) -#define DEFINE_VIRTIO_GPU_PROPERTIES(_state, _conf_field) \ - DEFINE_PROP_UINT32("max_outputs", _state, _conf_field.max_outputs, 1) - #define VIRTIO_GPU_FILL_CMD(out) do { \ size_t s; \ s = iov_to_buf(cmd->elem.out_sg, cmd->elem.out_num, 0, \ diff --git a/include/migration/migration.h b/include/migration/migration.h index 9387c8c9d4..b2711ef305 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -34,6 +34,7 @@ #define QEMU_VM_SECTION_FULL 0x04 #define QEMU_VM_SUBSECTION 0x05 #define QEMU_VM_VMDESCRIPTION 0x06 +#define QEMU_VM_CONFIGURATION 0x07 #define QEMU_VM_SECTION_FOOTER 0x7e struct MigrationParams { @@ -176,10 +177,11 @@ bool migrate_use_compression(void); int migrate_compress_level(void); int migrate_compress_threads(void); int migrate_decompress_threads(void); +bool migrate_use_events(void); void ram_control_before_iterate(QEMUFile *f, uint64_t flags); void ram_control_after_iterate(QEMUFile *f, uint64_t flags); -void ram_control_load_hook(QEMUFile *f, uint64_t flags); +void ram_control_load_hook(QEMUFile *f, uint64_t flags, void *data); /* Whenever this is found in the data stream, the flags * will be passed to ram_control_load_hook in the incoming-migration @@ -197,4 +199,7 @@ size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset, void ram_mig_init(void); void savevm_skip_section_footers(void); +void register_global_state(void); +void global_state_set_optional(void); +void savevm_skip_configuration(void); #endif diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h index 4f67d79227..ea49f33fac 100644 --- a/include/migration/qemu-file.h +++ b/include/migration/qemu-file.h @@ -63,16 +63,20 @@ typedef ssize_t (QEMUFileWritevBufferFunc)(void *opaque, struct iovec *iov, /* * This function provides hooks around different * stages of RAM migration. + * 'opaque' is the backend specific data in QEMUFile + * 'data' is call specific data associated with the 'flags' value */ -typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, uint64_t flags); +typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, uint64_t flags, + void *data); /* * Constants used by ram_control_* hooks */ -#define RAM_CONTROL_SETUP 0 -#define RAM_CONTROL_ROUND 1 -#define RAM_CONTROL_HOOK 2 -#define RAM_CONTROL_FINISH 3 +#define RAM_CONTROL_SETUP 0 +#define RAM_CONTROL_ROUND 1 +#define RAM_CONTROL_HOOK 2 +#define RAM_CONTROL_FINISH 3 +#define RAM_CONTROL_BLOCK_REG 4 /* * This function allows override of where the RAM page diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h index 0695d7c3de..f51ff693e9 100644 --- a/include/migration/vmstate.h +++ b/include/migration/vmstate.h @@ -820,6 +820,8 @@ int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd, void vmstate_save_state(QEMUFile *f, const VMStateDescription *vmsd, void *opaque, QJSON *vmdesc); +bool vmstate_save_needed(const VMStateDescription *vmsd, void *opaque); + int vmstate_register_with_alias_id(DeviceState *dev, int instance_id, const VMStateDescription *vmsd, void *base, int alias_id, diff --git a/include/qom/cpu.h b/include/qom/cpu.h index 39f0f19fb0..42f42f5a2d 100644 --- a/include/qom/cpu.h +++ b/include/qom/cpu.h @@ -323,6 +323,8 @@ extern struct CPUTailQ cpus; #define CPU_FOREACH(cpu) QTAILQ_FOREACH(cpu, &cpus, node) #define CPU_FOREACH_SAFE(cpu, next_cpu) \ QTAILQ_FOREACH_SAFE(cpu, &cpus, node, next_cpu) +#define CPU_FOREACH_REVERSE(cpu) \ + QTAILQ_FOREACH_REVERSE(cpu, &cpus, CPUTailQ, node) #define first_cpu QTAILQ_FIRST(&cpus) DECLARE_TLS(CPUState *, current_cpu); diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index f459fbdbd4..983e99e1e7 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -19,6 +19,7 @@ #include "qemu/queue.h" #include "qom/cpu.h" #include "exec/memattrs.h" +#include "hw/irq.h" #ifdef CONFIG_KVM #include <linux/kvm.h> @@ -151,6 +152,7 @@ extern bool kvm_readonly_mem_allowed; #define kvm_halt_in_kernel() (false) #define kvm_eventfds_enabled() (false) #define kvm_irqfds_enabled() (false) +#define kvm_resamplefds_enabled() (false) #define kvm_msi_via_irqfd_enabled() (false) #define kvm_gsi_routing_allowed() (false) #define kvm_gsi_direct_mapping() (false) @@ -416,9 +418,15 @@ void kvm_irqchip_release_virq(KVMState *s, int virq); int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter); +int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, + EventNotifier *rn, int virq); +int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, + int virq); int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, - EventNotifier *rn, int virq); -int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, int virq); + EventNotifier *rn, qemu_irq irq); +int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, + qemu_irq irq); +void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi); void kvm_pc_gsi_handler(void *opaque, int n, int level); void kvm_pc_setup_irq_routing(bool pci_enabled); void kvm_init_irq_routing(KVMState *s); diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index df809518b4..44570d17e6 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -28,6 +28,7 @@ bool runstate_check(RunState state); void runstate_set(RunState new_state); int runstate_is_running(void); bool runstate_needs_reset(void); +bool runstate_store(char *str, size_t size); typedef struct vm_change_state_entry VMChangeStateEntry; typedef void VMChangeStateHandler(void *opaque, int running, RunState state); @@ -35,6 +35,7 @@ #include "exec/address-spaces.h" #include "qemu/event_notifier.h" #include "trace.h" +#include "hw/irq.h" #include "hw/boards.h" @@ -84,6 +85,7 @@ struct KVMState * unsigned, and treating them as signed here can break things */ unsigned irq_set_ioctl; unsigned int sigmask_len; + GHashTable *gsimap; #ifdef KVM_CAP_IRQ_ROUTING struct kvm_irq_routing *irq_routes; int nr_allocated_irq_routes; @@ -1332,19 +1334,49 @@ int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg) } #endif /* !KVM_CAP_IRQ_ROUTING */ -int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, - EventNotifier *rn, int virq) +int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, + EventNotifier *rn, int virq) { return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), rn ? event_notifier_get_fd(rn) : -1, virq, true); } -int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, int virq) +int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, + int virq) { return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), -1, virq, false); } +int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, + EventNotifier *rn, qemu_irq irq) +{ + gpointer key, gsi; + gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); + + if (!found) { + return -ENXIO; + } + return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi)); +} + +int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, + qemu_irq irq) +{ + gpointer key, gsi; + gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); + + if (!found) { + return -ENXIO; + } + return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi)); +} + +void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi) +{ + g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi)); +} + static void kvm_irqchip_create(MachineState *machine, KVMState *s) { int ret; @@ -1380,6 +1412,8 @@ static void kvm_irqchip_create(MachineState *machine, KVMState *s) kvm_halt_in_kernel_allowed = true; kvm_init_irq_routing(s); + + s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal); } /* Find number of supported CPUs using the recommended diff --git a/kvm-stub.c b/kvm-stub.c index 7ba90c546f..d9ad624eee 100644 --- a/kvm-stub.c +++ b/kvm-stub.c @@ -24,6 +24,7 @@ bool kvm_kernel_irqchip; bool kvm_async_interrupts_allowed; bool kvm_eventfds_allowed; bool kvm_irqfds_allowed; +bool kvm_resamplefds_allowed; bool kvm_msi_via_irqfd_allowed; bool kvm_gsi_routing_allowed; bool kvm_gsi_direct_mapping; @@ -137,13 +138,14 @@ int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) return -ENOSYS; } -int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, - EventNotifier *rn, int virq) +int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, + EventNotifier *rn, int virq) { return -ENOSYS; } -int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, int virq) +int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, + int virq) { return -ENOSYS; } diff --git a/linux-user/main.c b/linux-user/main.c index c855bccadc..6c5c2effbf 100644 --- a/linux-user/main.c +++ b/linux-user/main.c @@ -1424,8 +1424,7 @@ void cpu_loop (CPUSPARCState *env) #ifdef TARGET_PPC static inline uint64_t cpu_ppc_get_tb(CPUPPCState *env) { - /* TO FIX */ - return 0; + return cpu_get_real_ticks(); } uint64_t cpu_ppc_load_tbl(CPUPPCState *env) diff --git a/migration/block.c b/migration/block.c index ddb59ccf87..ed865ed23b 100644 --- a/migration/block.c +++ b/migration/block.c @@ -457,7 +457,7 @@ static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds, blk_mig_lock(); if (bmds_aio_inflight(bmds, sector)) { blk_mig_unlock(); - bdrv_drain_all(); + bdrv_drain(bmds->bs); } else { blk_mig_unlock(); } diff --git a/migration/migration.c b/migration/migration.c index c6ac08a0cb..45719a0f5f 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -26,6 +26,8 @@ #include "qemu/thread.h" #include "qmp-commands.h" #include "trace.h" +#include "qapi/util.h" +#include "qapi-event.h" #define MAX_THROTTLE (32 << 20) /* Migration speed throttling */ @@ -97,6 +99,120 @@ void migration_incoming_state_destroy(void) mis_current = NULL; } + +typedef struct { + bool optional; + uint32_t size; + uint8_t runstate[100]; +} GlobalState; + +static GlobalState global_state; + +static int global_state_store(void) +{ + if (!runstate_store((char *)global_state.runstate, + sizeof(global_state.runstate))) { + error_report("runstate name too big: %s", global_state.runstate); + trace_migrate_state_too_big(); + return -EINVAL; + } + return 0; +} + +static char *global_state_get_runstate(void) +{ + return (char *)global_state.runstate; +} + +void global_state_set_optional(void) +{ + global_state.optional = true; +} + +static bool global_state_needed(void *opaque) +{ + GlobalState *s = opaque; + char *runstate = (char *)s->runstate; + + /* If it is not optional, it is mandatory */ + + if (s->optional == false) { + return true; + } + + /* If state is running or paused, it is not needed */ + + if (strcmp(runstate, "running") == 0 || + strcmp(runstate, "paused") == 0) { + return false; + } + + /* for any other state it is needed */ + return true; +} + +static int global_state_post_load(void *opaque, int version_id) +{ + GlobalState *s = opaque; + int ret = 0; + char *runstate = (char *)s->runstate; + + trace_migrate_global_state_post_load(runstate); + + if (strcmp(runstate, "running") != 0) { + Error *local_err = NULL; + int r = qapi_enum_parse(RunState_lookup, runstate, RUN_STATE_MAX, + -1, &local_err); + + if (r == -1) { + if (local_err) { + error_report_err(local_err); + } + return -EINVAL; + } + ret = vm_stop_force_state(r); + } + + return ret; +} + +static void global_state_pre_save(void *opaque) +{ + GlobalState *s = opaque; + + trace_migrate_global_state_pre_save((char *)s->runstate); + s->size = strlen((char *)s->runstate) + 1; +} + +static const VMStateDescription vmstate_globalstate = { + .name = "globalstate", + .version_id = 1, + .minimum_version_id = 1, + .post_load = global_state_post_load, + .pre_save = global_state_pre_save, + .needed = global_state_needed, + .fields = (VMStateField[]) { + VMSTATE_UINT32(size, GlobalState), + VMSTATE_BUFFER(runstate, GlobalState), + VMSTATE_END_OF_LIST() + }, +}; + +void register_global_state(void) +{ + /* We would use it independently that we receive it */ + strcpy((char *)&global_state.runstate, ""); + vmstate_register(NULL, 0, &vmstate_globalstate, &global_state); +} + +static void migrate_generate_event(int new_state) +{ + if (migrate_use_events()) { + qapi_event_send_migration(new_state, &error_abort); + trace_migrate_set_state(new_state); + } +} + /* * Called on -incoming with a defer: uri. * The migration can be started later after any parameters have been @@ -114,6 +230,7 @@ void qemu_start_incoming_migration(const char *uri, Error **errp) { const char *p; + qapi_event_send_migration(MIGRATION_STATUS_SETUP, &error_abort); if (!strcmp(uri, "defer")) { deferred_incoming_migration(errp); } else if (strstart(uri, "tcp:", &p)) { @@ -142,7 +259,7 @@ static void process_incoming_migration_co(void *opaque) int ret; migration_incoming_state_new(f); - + migrate_generate_event(MIGRATION_STATUS_ACTIVE); ret = qemu_loadvm_state(f); qemu_fclose(f); @@ -150,10 +267,12 @@ static void process_incoming_migration_co(void *opaque) migration_incoming_state_destroy(); if (ret < 0) { + migrate_generate_event(MIGRATION_STATUS_FAILED); error_report("load of migration failed: %s", strerror(-ret)); migrate_decompress_threads_join(); exit(EXIT_FAILURE); } + migrate_generate_event(MIGRATION_STATUS_COMPLETED); qemu_announce_self(); /* Make sure all file formats flush their mutable metadata */ @@ -164,10 +283,20 @@ static void process_incoming_migration_co(void *opaque) exit(EXIT_FAILURE); } - if (autostart) { + /* runstate == "" means that we haven't received it through the + * wire, so we obey autostart. runstate == runing means that we + * need to run it, we need to make sure that we do it after + * everything else has finished. Every other state change is done + * at the post_load function */ + + if (strcmp(global_state_get_runstate(), "running") == 0) { vm_start(); - } else { - runstate_set(RUN_STATE_PAUSED); + } else if (strcmp(global_state_get_runstate(), "") == 0) { + if (autostart) { + vm_start(); + } else { + runstate_set(RUN_STATE_PAUSED); + } } migrate_decompress_threads_join(); } @@ -392,8 +521,8 @@ void qmp_migrate_set_parameters(bool has_compress_level, static void migrate_set_state(MigrationState *s, int old_state, int new_state) { - if (atomic_cmpxchg(&s->state, old_state, new_state) == new_state) { - trace_migrate_set_state(new_state); + if (atomic_cmpxchg(&s->state, old_state, new_state) == old_state) { + migrate_generate_event(new_state); } } @@ -432,8 +561,7 @@ void migrate_fd_error(MigrationState *s) { trace_migrate_fd_error(); assert(s->file == NULL); - s->state = MIGRATION_STATUS_FAILED; - trace_migrate_set_state(MIGRATION_STATUS_FAILED); + migrate_set_state(s, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_FAILED); notifier_list_notify(&migration_state_notifiers, s); } @@ -517,8 +645,7 @@ static MigrationState *migrate_init(const MigrationParams *params) s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS] = decompress_thread_count; s->bandwidth_limit = bandwidth_limit; - s->state = MIGRATION_STATUS_SETUP; - trace_migrate_set_state(MIGRATION_STATUS_SETUP); + migrate_set_state(s, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP); s->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); return s; @@ -577,7 +704,6 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, error_setg(errp, QERR_MIGRATION_ACTIVE); return; } - if (runstate_check(RUN_STATE_INMIGRATE)) { error_setg(errp, "Guest is waiting for an incoming migration"); return; @@ -592,6 +718,12 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, return; } + /* We are starting a new migration, so we want to start in a clean + state. This change is only needed if previous migration + failed/was cancelled. We don't use migrate_set_state() because + we are setting the initial state, not changing it. */ + s->state = MIGRATION_STATUS_NONE; + s = migrate_init(¶ms); if (strstart(uri, "tcp:", &p)) { @@ -611,7 +743,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, } else { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "uri", "a valid migration protocol"); - s->state = MIGRATION_STATUS_FAILED; + migrate_set_state(s, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_FAILED); return; } @@ -740,6 +872,15 @@ int migrate_decompress_threads(void) return s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS]; } +bool migrate_use_events(void) +{ + MigrationState *s; + + s = migrate_get_current(); + + return s->enabled_capabilities[MIGRATION_CAPABILITY_EVENTS]; +} + int migrate_use_xbzrle(void) { MigrationState *s; @@ -793,10 +934,13 @@ static void *migration_thread(void *opaque) qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER); old_vm_running = runstate_is_running(); - ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); - if (ret >= 0) { - qemu_file_set_rate_limit(s->file, INT64_MAX); - qemu_savevm_state_complete(s->file); + ret = global_state_store(); + if (!ret) { + ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); + if (ret >= 0) { + qemu_file_set_rate_limit(s->file, INT64_MAX); + qemu_savevm_state_complete(s->file); + } } qemu_mutex_unlock_iothread(); diff --git a/migration/qemu-file.c b/migration/qemu-file.c index 557c1c1a62..6bb3dc15cd 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -129,7 +129,7 @@ void ram_control_before_iterate(QEMUFile *f, uint64_t flags) int ret = 0; if (f->ops->before_ram_iterate) { - ret = f->ops->before_ram_iterate(f, f->opaque, flags); + ret = f->ops->before_ram_iterate(f, f->opaque, flags, NULL); if (ret < 0) { qemu_file_set_error(f, ret); } @@ -141,24 +141,30 @@ void ram_control_after_iterate(QEMUFile *f, uint64_t flags) int ret = 0; if (f->ops->after_ram_iterate) { - ret = f->ops->after_ram_iterate(f, f->opaque, flags); + ret = f->ops->after_ram_iterate(f, f->opaque, flags, NULL); if (ret < 0) { qemu_file_set_error(f, ret); } } } -void ram_control_load_hook(QEMUFile *f, uint64_t flags) +void ram_control_load_hook(QEMUFile *f, uint64_t flags, void *data) { int ret = -EINVAL; if (f->ops->hook_ram_load) { - ret = f->ops->hook_ram_load(f, f->opaque, flags); + ret = f->ops->hook_ram_load(f, f->opaque, flags, data); if (ret < 0) { qemu_file_set_error(f, ret); } } else { - qemu_file_set_error(f, ret); + /* + * Hook is a hook specifically requested by the source sending a flag + * that expects there to be a hook on the destination. + */ + if (flags == RAM_CONTROL_HOOK) { + qemu_file_set_error(f, ret); + } } } diff --git a/migration/ram.c b/migration/ram.c index 57368e1575..c696814196 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -222,6 +222,7 @@ static RAMBlock *last_seen_block; static RAMBlock *last_sent_block; static ram_addr_t last_offset; static unsigned long *migration_bitmap; +static QemuMutex migration_bitmap_mutex; static uint64_t migration_dirty_pages; static uint32_t last_version; static bool ram_bulk_stage; @@ -494,6 +495,7 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data, return 1; } +/* Called with rcu_read_lock() to protect migration_bitmap */ static inline ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr, ram_addr_t start) @@ -502,26 +504,31 @@ ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr, unsigned long nr = base + (start >> TARGET_PAGE_BITS); uint64_t mr_size = TARGET_PAGE_ALIGN(memory_region_size(mr)); unsigned long size = base + (mr_size >> TARGET_PAGE_BITS); + unsigned long *bitmap; unsigned long next; + bitmap = atomic_rcu_read(&migration_bitmap); if (ram_bulk_stage && nr > base) { next = nr + 1; } else { - next = find_next_bit(migration_bitmap, size, nr); + next = find_next_bit(bitmap, size, nr); } if (next < size) { - clear_bit(next, migration_bitmap); + clear_bit(next, bitmap); migration_dirty_pages--; } return (next - base) << TARGET_PAGE_BITS; } +/* Called with rcu_read_lock() to protect migration_bitmap */ static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length) { + unsigned long *bitmap; + bitmap = atomic_rcu_read(&migration_bitmap); migration_dirty_pages += - cpu_physical_memory_sync_dirty_bitmap(migration_bitmap, start, length); + cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length); } @@ -563,11 +570,13 @@ static void migration_bitmap_sync(void) trace_migration_bitmap_sync_start(); address_space_sync_dirty_bitmap(&address_space_memory); + qemu_mutex_lock(&migration_bitmap_mutex); rcu_read_lock(); QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { migration_bitmap_sync_range(block->mr->ram_addr, block->used_length); } rcu_read_unlock(); + qemu_mutex_unlock(&migration_bitmap_mutex); trace_migration_bitmap_sync_end(migration_dirty_pages - num_dirty_pages_init); @@ -1017,10 +1026,15 @@ void free_xbzrle_decoded_buf(void) static void migration_end(void) { - if (migration_bitmap) { + /* caller have hold iothread lock or is in a bh, so there is + * no writing race against this migration_bitmap + */ + unsigned long *bitmap = migration_bitmap; + atomic_rcu_set(&migration_bitmap, NULL); + if (bitmap) { memory_global_dirty_log_stop(); - g_free(migration_bitmap); - migration_bitmap = NULL; + synchronize_rcu(); + g_free(bitmap); } XBZRLE_cache_lock(); @@ -1051,6 +1065,30 @@ static void reset_ram_globals(void) #define MAX_WAIT 50 /* ms, half buffered_file limit */ +void migration_bitmap_extend(ram_addr_t old, ram_addr_t new) +{ + /* called in qemu main thread, so there is + * no writing race against this migration_bitmap + */ + if (migration_bitmap) { + unsigned long *old_bitmap = migration_bitmap, *bitmap; + bitmap = bitmap_new(new); + + /* prevent migration_bitmap content from being set bit + * by migration_bitmap_sync_range() at the same time. + * it is safe to migration if migration_bitmap is cleared bit + * at the same time. + */ + qemu_mutex_lock(&migration_bitmap_mutex); + bitmap_copy(bitmap, old_bitmap, old); + bitmap_set(bitmap, old, new - old); + atomic_rcu_set(&migration_bitmap, bitmap); + qemu_mutex_unlock(&migration_bitmap_mutex); + migration_dirty_pages += new - old; + synchronize_rcu(); + g_free(old_bitmap); + } +} /* Each of ram_save_setup, ram_save_iterate and ram_save_complete has * long-running RCU critical section. When rcu-reclaims in the code @@ -1067,6 +1105,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque) dirty_rate_high_cnt = 0; bitmap_sync_count = 0; migration_bitmap_sync_init(); + qemu_mutex_init(&migration_bitmap_mutex); if (migrate_use_xbzrle()) { XBZRLE_cache_lock(); @@ -1477,6 +1516,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) error_report_err(local_err); } } + ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, + block->idstr); break; } } @@ -1545,7 +1586,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) break; default: if (flags & RAM_SAVE_FLAG_HOOK) { - ram_control_load_hook(f, flags); + ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL); } else { error_report("Unknown combination of migration flags: %#x", flags); diff --git a/migration/rdma.c b/migration/rdma.c index b777273b59..f106b2a818 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -215,17 +215,19 @@ static void network_to_caps(RDMACapabilities *cap) * the information. It's small anyway, so a list is overkill. */ typedef struct RDMALocalBlock { - uint8_t *local_host_addr; /* local virtual address */ - uint64_t remote_host_addr; /* remote virtual address */ - uint64_t offset; - uint64_t length; - struct ibv_mr **pmr; /* MRs for chunk-level registration */ - struct ibv_mr *mr; /* MR for non-chunk-level registration */ - uint32_t *remote_keys; /* rkeys for chunk-level registration */ - uint32_t remote_rkey; /* rkeys for non-chunk-level registration */ - int index; /* which block are we */ - bool is_ram_block; - int nb_chunks; + char *block_name; + uint8_t *local_host_addr; /* local virtual address */ + uint64_t remote_host_addr; /* remote virtual address */ + uint64_t offset; + uint64_t length; + struct ibv_mr **pmr; /* MRs for chunk-level registration */ + struct ibv_mr *mr; /* MR for non-chunk-level registration */ + uint32_t *remote_keys; /* rkeys for chunk-level registration */ + uint32_t remote_rkey; /* rkeys for non-chunk-level registration */ + int index; /* which block are we */ + unsigned int src_index; /* (Only used on dest) */ + bool is_ram_block; + int nb_chunks; unsigned long *transit_bitmap; unsigned long *unregister_bitmap; } RDMALocalBlock; @@ -353,6 +355,9 @@ typedef struct RDMAContext { RDMALocalBlocks local_ram_blocks; RDMADestBlock *dest_blocks; + /* Index of the next RAMBlock received during block registration */ + unsigned int next_src_index; + /* * Migration on *destination* started. * Then use coroutine yield function. @@ -411,7 +416,7 @@ static void network_to_control(RDMAControlHeader *control) */ typedef struct QEMU_PACKED { union QEMU_PACKED { - uint64_t current_addr; /* offset into the ramblock of the chunk */ + uint64_t current_addr; /* offset into the ram_addr_t space */ uint64_t chunk; /* chunk to lookup if unregistering */ } key; uint32_t current_index; /* which ramblock the chunk belongs to */ @@ -419,8 +424,19 @@ typedef struct QEMU_PACKED { uint64_t chunks; /* how many sequential chunks to register */ } RDMARegister; -static void register_to_network(RDMARegister *reg) +static void register_to_network(RDMAContext *rdma, RDMARegister *reg) { + RDMALocalBlock *local_block; + local_block = &rdma->local_ram_blocks.block[reg->current_index]; + + if (local_block->is_ram_block) { + /* + * current_addr as passed in is an address in the local ram_addr_t + * space, we need to translate this for the destination + */ + reg->key.current_addr -= local_block->offset; + reg->key.current_addr += rdma->dest_blocks[reg->current_index].offset; + } reg->key.current_addr = htonll(reg->key.current_addr); reg->current_index = htonl(reg->current_index); reg->chunks = htonll(reg->chunks); @@ -436,13 +452,19 @@ static void network_to_register(RDMARegister *reg) typedef struct QEMU_PACKED { uint32_t value; /* if zero, we will madvise() */ uint32_t block_idx; /* which ram block index */ - uint64_t offset; /* where in the remote ramblock this chunk */ + uint64_t offset; /* Address in remote ram_addr_t space */ uint64_t length; /* length of the chunk */ } RDMACompress; -static void compress_to_network(RDMACompress *comp) +static void compress_to_network(RDMAContext *rdma, RDMACompress *comp) { comp->value = htonl(comp->value); + /* + * comp->offset as passed in is an address in the local ram_addr_t + * space, we need to translate this for the destination + */ + comp->offset -= rdma->local_ram_blocks.block[comp->block_idx].offset; + comp->offset += rdma->dest_blocks[comp->block_idx].offset; comp->block_idx = htonl(comp->block_idx); comp->offset = htonll(comp->offset); comp->length = htonll(comp->length); @@ -511,27 +533,27 @@ static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block, return result; } -static int rdma_add_block(RDMAContext *rdma, void *host_addr, +static int rdma_add_block(RDMAContext *rdma, const char *block_name, + void *host_addr, ram_addr_t block_offset, uint64_t length) { RDMALocalBlocks *local = &rdma->local_ram_blocks; - RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap, - (void *)(uintptr_t)block_offset); + RDMALocalBlock *block; RDMALocalBlock *old = local->block; - assert(block == NULL); - local->block = g_malloc0(sizeof(RDMALocalBlock) * (local->nb_blocks + 1)); if (local->nb_blocks) { int x; - for (x = 0; x < local->nb_blocks; x++) { - g_hash_table_remove(rdma->blockmap, - (void *)(uintptr_t)old[x].offset); - g_hash_table_insert(rdma->blockmap, - (void *)(uintptr_t)old[x].offset, - &local->block[x]); + if (rdma->blockmap) { + for (x = 0; x < local->nb_blocks; x++) { + g_hash_table_remove(rdma->blockmap, + (void *)(uintptr_t)old[x].offset); + g_hash_table_insert(rdma->blockmap, + (void *)(uintptr_t)old[x].offset, + &local->block[x]); + } } memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks); g_free(old); @@ -539,10 +561,12 @@ static int rdma_add_block(RDMAContext *rdma, void *host_addr, block = &local->block[local->nb_blocks]; + block->block_name = g_strdup(block_name); block->local_host_addr = host_addr; block->offset = block_offset; block->length = length; block->index = local->nb_blocks; + block->src_index = ~0U; /* Filled in by the receipt of the block list */ block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL; block->transit_bitmap = bitmap_new(block->nb_chunks); bitmap_clear(block->transit_bitmap, 0, block->nb_chunks); @@ -552,9 +576,12 @@ static int rdma_add_block(RDMAContext *rdma, void *host_addr, block->is_ram_block = local->init ? false : true; - g_hash_table_insert(rdma->blockmap, (void *) block_offset, block); + if (rdma->blockmap) { + g_hash_table_insert(rdma->blockmap, (void *) block_offset, block); + } - trace_rdma_add_block(local->nb_blocks, (uintptr_t) block->local_host_addr, + trace_rdma_add_block(block_name, local->nb_blocks, + (uintptr_t) block->local_host_addr, block->offset, block->length, (uintptr_t) (block->local_host_addr + block->length), BITS_TO_LONGS(block->nb_chunks) * @@ -574,7 +601,7 @@ static int rdma_add_block(RDMAContext *rdma, void *host_addr, static int qemu_rdma_init_one_block(const char *block_name, void *host_addr, ram_addr_t block_offset, ram_addr_t length, void *opaque) { - return rdma_add_block(opaque, host_addr, block_offset, length); + return rdma_add_block(opaque, block_name, host_addr, block_offset, length); } /* @@ -587,7 +614,6 @@ static int qemu_rdma_init_ram_blocks(RDMAContext *rdma) RDMALocalBlocks *local = &rdma->local_ram_blocks; assert(rdma->blockmap == NULL); - rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal); memset(local, 0, sizeof *local); qemu_ram_foreach_block(qemu_rdma_init_one_block, rdma); trace_qemu_rdma_init_ram_blocks(local->nb_blocks); @@ -597,16 +623,19 @@ static int qemu_rdma_init_ram_blocks(RDMAContext *rdma) return 0; } -static int rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset) +/* + * Note: If used outside of cleanup, the caller must ensure that the destination + * block structures are also updated + */ +static int rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block) { RDMALocalBlocks *local = &rdma->local_ram_blocks; - RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap, - (void *) block_offset); RDMALocalBlock *old = local->block; int x; - assert(block); - + if (rdma->blockmap) { + g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset); + } if (block->pmr) { int j; @@ -636,8 +665,14 @@ static int rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset) g_free(block->remote_keys); block->remote_keys = NULL; - for (x = 0; x < local->nb_blocks; x++) { - g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)old[x].offset); + g_free(block->block_name); + block->block_name = NULL; + + if (rdma->blockmap) { + for (x = 0; x < local->nb_blocks; x++) { + g_hash_table_remove(rdma->blockmap, + (void *)(uintptr_t)old[x].offset); + } } if (local->nb_blocks > 1) { @@ -659,8 +694,7 @@ static int rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset) local->block = NULL; } - trace_rdma_delete_block(local->nb_blocks, - (uintptr_t)block->local_host_addr, + trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr, block->offset, block->length, (uintptr_t)(block->local_host_addr + block->length), BITS_TO_LONGS(block->nb_chunks) * @@ -670,7 +704,7 @@ static int rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset) local->nb_blocks--; - if (local->nb_blocks) { + if (local->nb_blocks && rdma->blockmap) { for (x = 0; x < local->nb_blocks; x++) { g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)local->block[x].offset, @@ -1223,7 +1257,7 @@ const char *print_wrid(int wrid) /* * Perform a non-optimized memory unregistration after every transfer - * for demonsration purposes, only if pin-all is not requested. + * for demonstration purposes, only if pin-all is not requested. * * Potential optimizations: * 1. Start a new thread to run this function continuously @@ -1289,7 +1323,7 @@ static int qemu_rdma_unregister_waiting(RDMAContext *rdma) rdma->total_registrations--; reg.key.chunk = chunk; - register_to_network(®); + register_to_network(rdma, ®); ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) ®, &resp, NULL, NULL); if (ret < 0) { @@ -1910,7 +1944,7 @@ retry: trace_qemu_rdma_write_one_zero(chunk, sge.length, current_index, current_addr); - compress_to_network(&comp); + compress_to_network(rdma, &comp); ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &comp, NULL, NULL, NULL); @@ -1937,7 +1971,7 @@ retry: trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index, current_addr); - register_to_network(®); + register_to_network(rdma, ®); ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) ®, &resp, ®_result_idx, NULL); if (ret < 0) { @@ -2198,7 +2232,7 @@ static void qemu_rdma_cleanup(RDMAContext *rdma) if (rdma->local_ram_blocks.block) { while (rdma->local_ram_blocks.nb_blocks) { - rdma_delete_block(rdma, rdma->local_ram_blocks.block->offset); + rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]); } } @@ -2271,6 +2305,14 @@ static int qemu_rdma_source_init(RDMAContext *rdma, Error **errp, bool pin_all) goto err_rdma_source_init; } + /* Build the hash that maps from offset to RAMBlock */ + rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal); + for (idx = 0; idx < rdma->local_ram_blocks.nb_blocks; idx++) { + g_hash_table_insert(rdma->blockmap, + (void *)(uintptr_t)rdma->local_ram_blocks.block[idx].offset, + &rdma->local_ram_blocks.block[idx]); + } + for (idx = 0; idx < RDMA_WRID_MAX; idx++) { ret = qemu_rdma_reg_control(rdma, idx); if (ret) { @@ -2880,6 +2922,14 @@ err_rdma_dest_wait: return ret; } +static int dest_ram_sort_func(const void *a, const void *b) +{ + unsigned int a_index = ((const RDMALocalBlock *)a)->src_index; + unsigned int b_index = ((const RDMALocalBlock *)b)->src_index; + + return (a_index < b_index) ? -1 : (a_index != b_index); +} + /* * During each iteration of the migration, we listen for instructions * by the source VM to perform dynamic page registrations before they @@ -2889,8 +2939,7 @@ err_rdma_dest_wait: * * Keep doing this until the source tells us to stop. */ -static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque, - uint64_t flags) +static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque) { RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult), .type = RDMA_CONTROL_REGISTER_RESULT, @@ -2920,7 +2969,7 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque, CHECK_ERROR_STATE(); do { - trace_qemu_rdma_registration_handle_wait(flags); + trace_qemu_rdma_registration_handle_wait(); ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE); @@ -2943,6 +2992,13 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque, trace_qemu_rdma_registration_handle_compress(comp->length, comp->block_idx, comp->offset); + if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) { + error_report("rdma: 'compress' bad block index %u (vs %d)", + (unsigned int)comp->block_idx, + rdma->local_ram_blocks.nb_blocks); + ret = -EIO; + break; + } block = &(rdma->local_ram_blocks.block[comp->block_idx]); host_addr = block->local_host_addr + @@ -2958,6 +3014,13 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque, case RDMA_CONTROL_RAM_BLOCKS_REQUEST: trace_qemu_rdma_registration_handle_ram_blocks(); + /* Sort our local RAM Block list so it's the same as the source, + * we can do this since we've filled in a src_index in the list + * as we received the RAMBlock list earlier. + */ + qsort(rdma->local_ram_blocks.block, + rdma->local_ram_blocks.nb_blocks, + sizeof(RDMALocalBlock), dest_ram_sort_func); if (rdma->pin_all) { ret = qemu_rdma_reg_whole_ram_blocks(rdma); if (ret) { @@ -2985,6 +3048,12 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque, rdma->dest_blocks[i].length = local->block[i].length; dest_block_to_network(&rdma->dest_blocks[i]); + trace_qemu_rdma_registration_handle_ram_blocks_loop( + local->block[i].block_name, + local->block[i].offset, + local->block[i].length, + local->block[i].local_host_addr, + local->block[i].src_index); } blocks.len = rdma->local_ram_blocks.nb_blocks @@ -3018,8 +3087,23 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque, trace_qemu_rdma_registration_handle_register_loop(count, reg->current_index, reg->key.current_addr, reg->chunks); + if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) { + error_report("rdma: 'register' bad block index %u (vs %d)", + (unsigned int)reg->current_index, + rdma->local_ram_blocks.nb_blocks); + ret = -ENOENT; + break; + } block = &(rdma->local_ram_blocks.block[reg->current_index]); if (block->is_ram_block) { + if (block->offset > reg->key.current_addr) { + error_report("rdma: bad register address for block %s" + " offset: %" PRIx64 " current_addr: %" PRIx64, + block->block_name, block->offset, + reg->key.current_addr); + ret = -ERANGE; + break; + } host_addr = (block->local_host_addr + (reg->key.current_addr - block->offset)); chunk = ram_chunk_index(block->local_host_addr, @@ -3028,6 +3112,14 @@ static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque, chunk = reg->key.chunk; host_addr = block->local_host_addr + (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT)); + /* Check for particularly bad chunk value */ + if (host_addr < (void *)block->local_host_addr) { + error_report("rdma: bad chunk for block %s" + " chunk: %" PRIx64, + block->block_name, reg->key.chunk); + ret = -ERANGE; + break; + } } chunk_start = ram_chunk_start(block, chunk); chunk_end = ram_chunk_end(block, chunk + reg->chunks); @@ -3108,8 +3200,56 @@ out: return ret; } +/* Destination: + * Called via a ram_control_load_hook during the initial RAM load section which + * lists the RAMBlocks by name. This lets us know the order of the RAMBlocks + * on the source. + * We've already built our local RAMBlock list, but not yet sent the list to + * the source. + */ +static int rdma_block_notification_handle(QEMUFileRDMA *rfile, const char *name) +{ + RDMAContext *rdma = rfile->rdma; + int curr; + int found = -1; + + /* Find the matching RAMBlock in our local list */ + for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) { + if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) { + found = curr; + break; + } + } + + if (found == -1) { + error_report("RAMBlock '%s' not found on destination", name); + return -ENOENT; + } + + rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index; + trace_rdma_block_notification_handle(name, rdma->next_src_index); + rdma->next_src_index++; + + return 0; +} + +static int rdma_load_hook(QEMUFile *f, void *opaque, uint64_t flags, void *data) +{ + switch (flags) { + case RAM_CONTROL_BLOCK_REG: + return rdma_block_notification_handle(opaque, data); + + case RAM_CONTROL_HOOK: + return qemu_rdma_registration_handle(f, opaque); + + default: + /* Shouldn't be called with any other values */ + abort(); + } +} + static int qemu_rdma_registration_start(QEMUFile *f, void *opaque, - uint64_t flags) + uint64_t flags, void *data) { QEMUFileRDMA *rfile = opaque; RDMAContext *rdma = rfile->rdma; @@ -3128,7 +3268,7 @@ static int qemu_rdma_registration_start(QEMUFile *f, void *opaque, * First, flush writes, if any. */ static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque, - uint64_t flags) + uint64_t flags, void *data) { Error *local_err = NULL, **errp = &local_err; QEMUFileRDMA *rfile = opaque; @@ -3148,7 +3288,7 @@ static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque, if (flags == RAM_CONTROL_SETUP) { RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT }; RDMALocalBlocks *local = &rdma->local_ram_blocks; - int reg_result_idx, i, j, nb_dest_blocks; + int reg_result_idx, i, nb_dest_blocks; head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST; trace_qemu_rdma_registration_stop_ram(); @@ -3184,9 +3324,11 @@ static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque, */ if (local->nb_blocks != nb_dest_blocks) { - ERROR(errp, "ram blocks mismatch #1! " + ERROR(errp, "ram blocks mismatch (Number of blocks %d vs %d) " "Your QEMU command line parameters are probably " - "not identical on both the source and destination."); + "not identical on both the source and destination.", + local->nb_blocks, nb_dest_blocks); + rdma->error_state = -EINVAL; return -EINVAL; } @@ -3196,30 +3338,18 @@ static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque, for (i = 0; i < nb_dest_blocks; i++) { network_to_dest_block(&rdma->dest_blocks[i]); - /* search local ram blocks */ - for (j = 0; j < local->nb_blocks; j++) { - if (rdma->dest_blocks[i].offset != local->block[j].offset) { - continue; - } - - if (rdma->dest_blocks[i].length != local->block[j].length) { - ERROR(errp, "ram blocks mismatch #2! " - "Your QEMU command line parameters are probably " - "not identical on both the source and destination."); - return -EINVAL; - } - local->block[j].remote_host_addr = - rdma->dest_blocks[i].remote_host_addr; - local->block[j].remote_rkey = rdma->dest_blocks[i].remote_rkey; - break; - } - - if (j >= local->nb_blocks) { - ERROR(errp, "ram blocks mismatch #3! " - "Your QEMU command line parameters are probably " - "not identical on both the source and destination."); + /* We require that the blocks are in the same order */ + if (rdma->dest_blocks[i].length != local->block[i].length) { + ERROR(errp, "Block %s/%d has a different length %" PRIu64 + "vs %" PRIu64, local->block[i].block_name, i, + local->block[i].length, + rdma->dest_blocks[i].length); + rdma->error_state = -EINVAL; return -EINVAL; } + local->block[i].remote_host_addr = + rdma->dest_blocks[i].remote_host_addr; + local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey; } } @@ -3250,7 +3380,7 @@ static const QEMUFileOps rdma_read_ops = { .get_buffer = qemu_rdma_get_buffer, .get_fd = qemu_rdma_get_fd, .close = qemu_rdma_close, - .hook_ram_load = qemu_rdma_registration_handle, + .hook_ram_load = rdma_load_hook, }; static const QEMUFileOps rdma_write_ops = { @@ -3263,12 +3393,13 @@ static const QEMUFileOps rdma_write_ops = { static void *qemu_fopen_rdma(RDMAContext *rdma, const char *mode) { - QEMUFileRDMA *r = g_malloc0(sizeof(QEMUFileRDMA)); + QEMUFileRDMA *r; if (qemu_file_mode_is_not_valid(mode)) { return NULL; } + r = g_malloc0(sizeof(QEMUFileRDMA)); r->rdma = rdma; if (mode[0] == 'w') { @@ -3287,7 +3418,7 @@ static void rdma_accept_incoming_migration(void *opaque) QEMUFile *f; Error *local_err = NULL, **errp = &local_err; - trace_qemu_dma_accept_incoming_migration(); + trace_qemu_rdma_accept_incoming_migration(); ret = qemu_rdma_accept(rdma); if (ret) { @@ -3295,7 +3426,7 @@ static void rdma_accept_incoming_migration(void *opaque) return; } - trace_qemu_dma_accept_incoming_migration_accepted(); + trace_qemu_rdma_accept_incoming_migration_accepted(); f = qemu_fopen_rdma(rdma, "rb"); if (f == NULL) { diff --git a/migration/savevm.c b/migration/savevm.c index 9e0e286797..86735fc53a 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -246,11 +246,55 @@ typedef struct SaveStateEntry { typedef struct SaveState { QTAILQ_HEAD(, SaveStateEntry) handlers; int global_section_id; + bool skip_configuration; + uint32_t len; + const char *name; } SaveState; static SaveState savevm_state = { .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers), .global_section_id = 0, + .skip_configuration = false, +}; + +void savevm_skip_configuration(void) +{ + savevm_state.skip_configuration = true; +} + + +static void configuration_pre_save(void *opaque) +{ + SaveState *state = opaque; + const char *current_name = MACHINE_GET_CLASS(current_machine)->name; + + state->len = strlen(current_name); + state->name = current_name; +} + +static int configuration_post_load(void *opaque, int version_id) +{ + SaveState *state = opaque; + const char *current_name = MACHINE_GET_CLASS(current_machine)->name; + + if (strncmp(state->name, current_name, state->len) != 0) { + error_report("Machine type received is '%s' and local is '%s'", + state->name, current_name); + return -EINVAL; + } + return 0; +} + +static const VMStateDescription vmstate_configuration = { + .name = "configuration", + .version_id = 1, + .post_load = configuration_post_load, + .pre_save = configuration_pre_save, + .fields = (VMStateField[]) { + VMSTATE_UINT32(len, SaveState), + VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, 0, len), + VMSTATE_END_OF_LIST() + }, }; static void dump_vmstate_vmsd(FILE *out_file, @@ -653,41 +697,6 @@ static void save_section_footer(QEMUFile *f, SaveStateEntry *se) } } -/* - * Read a footer off the wire and check that it matches the expected section - * - * Returns: true if the footer was good - * false if there is a problem (and calls error_report to say why) - */ -static bool check_section_footer(QEMUFile *f, SaveStateEntry *se) -{ - uint8_t read_mark; - uint32_t read_section_id; - - if (skip_section_footers) { - /* No footer to check */ - return true; - } - - read_mark = qemu_get_byte(f); - - if (read_mark != QEMU_VM_SECTION_FOOTER) { - error_report("Missing section footer for %s", se->idstr); - return false; - } - - read_section_id = qemu_get_be32(f); - if (read_section_id != se->section_id) { - error_report("Mismatched section id in footer for %s -" - " read 0x%x expected 0x%x", - se->idstr, read_section_id, se->section_id); - return false; - } - - /* All good */ - return true; -} - bool qemu_savevm_state_blocked(Error **errp) { SaveStateEntry *se; @@ -723,6 +732,11 @@ void qemu_savevm_state_begin(QEMUFile *f, se->ops->set_params(params, se->opaque); } + if (!savevm_state.skip_configuration) { + qemu_put_byte(f, QEMU_VM_CONFIGURATION); + vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0); + } + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { if (!se->ops || !se->ops->save_live_setup) { continue; @@ -836,6 +850,11 @@ void qemu_savevm_state_complete(QEMUFile *f) if ((!se->ops || !se->ops->save_state) && !se->vmsd) { continue; } + if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) { + trace_savevm_section_skip(se->idstr, se->section_id); + continue; + } + trace_savevm_section_start(se->idstr, se->section_id); json_start_object(vmdesc, NULL); @@ -949,6 +968,9 @@ static int qemu_save_device_state(QEMUFile *f) if ((!se->ops || !se->ops->save_state) && !se->vmsd) { continue; } + if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) { + continue; + } save_section_header(f, se, QEMU_VM_SECTION_FULL); @@ -989,6 +1011,41 @@ struct LoadStateEntry { int version_id; }; +/* + * Read a footer off the wire and check that it matches the expected section + * + * Returns: true if the footer was good + * false if there is a problem (and calls error_report to say why) + */ +static bool check_section_footer(QEMUFile *f, LoadStateEntry *le) +{ + uint8_t read_mark; + uint32_t read_section_id; + + if (skip_section_footers) { + /* No footer to check */ + return true; + } + + read_mark = qemu_get_byte(f); + + if (read_mark != QEMU_VM_SECTION_FOOTER) { + error_report("Missing section footer for %s", le->se->idstr); + return false; + } + + read_section_id = qemu_get_be32(f); + if (read_section_id != le->section_id) { + error_report("Mismatched section id in footer for %s -" + " read 0x%x expected 0x%x", + le->se->idstr, read_section_id, le->section_id); + return false; + } + + /* All good */ + return true; +} + void loadvm_free_handlers(MigrationIncomingState *mis) { LoadStateEntry *le, *new_le; @@ -1029,6 +1086,18 @@ int qemu_loadvm_state(QEMUFile *f) return -ENOTSUP; } + if (!savevm_state.skip_configuration) { + if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) { + error_report("Configuration section missing"); + return -EINVAL; + } + ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0); + + if (ret) { + return ret; + } + } + while ((section_type = qemu_get_byte(f)) != QEMU_VM_EOF) { uint32_t instance_id, version_id, section_id; SaveStateEntry *se; @@ -1082,7 +1151,7 @@ int qemu_loadvm_state(QEMUFile *f) " device '%s'", instance_id, idstr); goto out; } - if (!check_section_footer(f, le->se)) { + if (!check_section_footer(f, le)) { ret = -EINVAL; goto out; } @@ -1109,7 +1178,7 @@ int qemu_loadvm_state(QEMUFile *f) section_id, le->se->idstr); goto out; } - if (!check_section_footer(f, le->se)) { + if (!check_section_footer(f, le)) { ret = -EINVAL; goto out; } @@ -1127,16 +1196,35 @@ int qemu_loadvm_state(QEMUFile *f) * Try to read in the VMDESC section as well, so that dumping tools that * intercept our migration stream have the chance to see it. */ - if (qemu_get_byte(f) == QEMU_VM_VMDESCRIPTION) { - uint32_t size = qemu_get_be32(f); - uint8_t *buf = g_malloc(0x1000); - - while (size > 0) { - uint32_t read_chunk = MIN(size, 0x1000); - qemu_get_buffer(f, buf, read_chunk); - size -= read_chunk; + + /* We've got to be careful; if we don't read the data and just shut the fd + * then the sender can error if we close while it's still sending. + * We also mustn't read data that isn't there; some transports (RDMA) + * will stall waiting for that data when the source has already closed. + */ + if (should_send_vmdesc()) { + uint8_t *buf; + uint32_t size; + section_type = qemu_get_byte(f); + + if (section_type != QEMU_VM_VMDESCRIPTION) { + error_report("Expected vmdescription section, but got %d", + section_type); + /* + * It doesn't seem worth failing at this point since + * we apparently have an otherwise valid VM state + */ + } else { + buf = g_malloc(0x1000); + size = qemu_get_be32(f); + + while (size > 0) { + uint32_t read_chunk = MIN(size, 0x1000); + qemu_get_buffer(f, buf, read_chunk); + size -= read_chunk; + } + g_free(buf); } - g_free(buf); } cpu_synchronize_all_post_init(); diff --git a/migration/vmstate.c b/migration/vmstate.c index 6138d1acb7..e8ccf22f67 100644 --- a/migration/vmstate.c +++ b/migration/vmstate.c @@ -276,6 +276,17 @@ static void vmsd_desc_field_end(const VMStateDescription *vmsd, QJSON *vmdesc, json_end_object(vmdesc); } + +bool vmstate_save_needed(const VMStateDescription *vmsd, void *opaque) +{ + if (vmsd->needed && !vmsd->needed(opaque)) { + /* optional section not needed */ + return false; + } + return true; +} + + void vmstate_save_state(QEMUFile *f, const VMStateDescription *vmsd, void *opaque, QJSON *vmdesc) { diff --git a/pc-bios/README b/pc-bios/README index 63e725444d..05cf0421be 100644 --- a/pc-bios/README +++ b/pc-bios/README @@ -17,7 +17,7 @@ - SLOF (Slimline Open Firmware) is a free IEEE 1275 Open Firmware implementation for certain IBM POWER hardware. The sources are at https://github.com/aik/SLOF, and the image currently in qemu is - built from git tag qemu-slof-20150313. + built from git tag qemu-slof-20150429. - sgabios (the Serial Graphics Adapter option ROM) provides a means for legacy x86 software to communicate with an attached serial console as diff --git a/pc-bios/slof.bin b/pc-bios/slof.bin Binary files differindex ab72cba80c..0398ac67bc 100644 --- a/pc-bios/slof.bin +++ b/pc-bios/slof.bin diff --git a/qapi-schema.json b/qapi-schema.json index 106008cdeb..1285b8c74a 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -523,6 +523,9 @@ # minimize migration traffic. The feature is disabled by default. # (since 2.4 ) # +# @events: generate events for each migration state change +# (since 2.4 ) +# # @auto-converge: If enabled, QEMU will automatically throttle down the guest # to speed up convergence of RAM migration. (since 1.6) # @@ -530,7 +533,7 @@ ## { 'enum': 'MigrationCapability', 'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks', - 'compress'] } + 'compress', 'events'] } ## # @MigrationCapabilityStatus diff --git a/qapi/event.json b/qapi/event.json index 378dda572a..f0cef010f0 100644 --- a/qapi/event.json +++ b/qapi/event.json @@ -243,6 +243,18 @@ { 'event': 'SPICE_MIGRATE_COMPLETED' } ## +# @MIGRATION +# +# Emitted when a migration event happens +# +# @status: @MigrationStatus describing the current migration status. +# +# Since: 2.4 +## +{ 'event': 'MIGRATION', + 'data': {'status': 'MigrationStatus'}} + +## # @ACPI_DEVICE_OST # # Emitted when guest executes ACPI _OST method. diff --git a/qga/commands-posix.c b/qga/commands-posix.c index befd00b00d..675f4b4c66 100644 --- a/qga/commands-posix.c +++ b/qga/commands-posix.c @@ -154,6 +154,8 @@ void qmp_guest_set_time(bool has_time, int64_t time_ns, Error **errp) /* If user has passed a time, validate and set it. */ if (has_time) { + GDate date = { 0, }; + /* year-2038 will overflow in case time_t is 32bit */ if (time_ns / 1000000000 != (time_t)(time_ns / 1000000000)) { error_setg(errp, "Time %" PRId64 " is too large", time_ns); @@ -162,6 +164,11 @@ void qmp_guest_set_time(bool has_time, int64_t time_ns, Error **errp) tv.tv_sec = time_ns / 1000000000; tv.tv_usec = (time_ns % 1000000000) / 1000; + g_date_set_time_t(&date, tv.tv_sec); + if (date.year < 1970 || date.year >= 2070) { + error_setg_errno(errp, errno, "Invalid time"); + return; + } ret = settimeofday(&tv, NULL); if (ret < 0) { @@ -1325,18 +1332,18 @@ static void guest_fsfreeze_cleanup(void) /* * Walk list of mounted file systems in the guest, and trim them. */ -void qmp_guest_fstrim(bool has_minimum, int64_t minimum, Error **errp) +GuestFilesystemTrimResponse * +qmp_guest_fstrim(bool has_minimum, int64_t minimum, Error **errp) { + GuestFilesystemTrimResponse *response; + GuestFilesystemTrimResultList *list; + GuestFilesystemTrimResult *result; int ret = 0; FsMountList mounts; struct FsMount *mount; int fd; Error *local_err = NULL; - struct fstrim_range r = { - .start = 0, - .len = -1, - .minlen = has_minimum ? minimum : 0, - }; + struct fstrim_range r; slog("guest-fstrim called"); @@ -1344,36 +1351,59 @@ void qmp_guest_fstrim(bool has_minimum, int64_t minimum, Error **errp) build_fs_mount_list(&mounts, &local_err); if (local_err) { error_propagate(errp, local_err); - return; + return NULL; } + response = g_malloc0(sizeof(*response)); + QTAILQ_FOREACH(mount, &mounts, next) { + result = g_malloc0(sizeof(*result)); + result->path = g_strdup(mount->dirname); + + list = g_malloc0(sizeof(*list)); + list->value = result; + list->next = response->paths; + response->paths = list; + fd = qemu_open(mount->dirname, O_RDONLY); if (fd == -1) { - error_setg_errno(errp, errno, "failed to open %s", mount->dirname); - goto error; + result->error = g_strdup_printf("failed to open: %s", + strerror(errno)); + result->has_error = true; + continue; } /* We try to cull filesytems we know won't work in advance, but other * filesytems may not implement fstrim for less obvious reasons. These - * will report EOPNOTSUPP; we simply ignore these errors. Any other - * error means an unexpected error, so return it in those cases. In - * some other cases ENOTTY will be reported (e.g. CD-ROMs). + * will report EOPNOTSUPP; while in some other cases ENOTTY will be + * reported (e.g. CD-ROMs). + * Any other error means an unexpected error. */ + r.start = 0; + r.len = -1; + r.minlen = has_minimum ? minimum : 0; ret = ioctl(fd, FITRIM, &r); if (ret == -1) { - if (errno != ENOTTY && errno != EOPNOTSUPP) { - error_setg_errno(errp, errno, "failed to trim %s", - mount->dirname); - close(fd); - goto error; + result->has_error = true; + if (errno == ENOTTY || errno == EOPNOTSUPP) { + result->error = g_strdup("trim not supported"); + } else { + result->error = g_strdup_printf("failed to trim: %s", + strerror(errno)); } + close(fd); + continue; } + + result->has_minimum = true; + result->minimum = r.minlen; + result->has_trimmed = true; + result->trimmed = r.len; close(fd); } -error: free_fs_mount_list(&mounts); + return response; } #endif /* CONFIG_FSTRIM */ @@ -2402,9 +2432,11 @@ int64_t qmp_guest_fsfreeze_thaw(Error **errp) #endif /* CONFIG_FSFREEZE */ #if !defined(CONFIG_FSTRIM) -void qmp_guest_fstrim(bool has_minimum, int64_t minimum, Error **errp) +GuestFilesystemTrimResponse * +qmp_guest_fstrim(bool has_minimum, int64_t minimum, Error **errp) { error_setg(errp, QERR_UNSUPPORTED); + return NULL; } #endif diff --git a/qga/commands-win32.c b/qga/commands-win32.c index fbddc8b1b2..a7822d5ff7 100644 --- a/qga/commands-win32.c +++ b/qga/commands-win32.c @@ -16,11 +16,22 @@ #include <powrprof.h> #include <stdio.h> #include <string.h> +#include <winsock2.h> +#include <ws2tcpip.h> +#include <iptypes.h> +#include <iphlpapi.h> +#ifdef CONFIG_QGA_NTDDSCSI +#include <winioctl.h> +#include <ntddscsi.h> +#include <setupapi.h> +#include <initguid.h> +#endif #include "qga/guest-agent-core.h" #include "qga/vss-win32.h" #include "qga-qmp-commands.h" #include "qapi/qmp/qerror.h" #include "qemu/queue.h" +#include "qemu/host-utils.h" #ifndef SHTDN_REASON_FLAG_PLANNED #define SHTDN_REASON_FLAG_PLANNED 0x80000000 @@ -382,12 +393,307 @@ static void guest_file_init(void) QTAILQ_INIT(&guest_file_state.filehandles); } -GuestFilesystemInfoList *qmp_guest_get_fsinfo(Error **errp) +#ifdef CONFIG_QGA_NTDDSCSI + +static STORAGE_BUS_TYPE win2qemu[] = { + [BusTypeUnknown] = GUEST_DISK_BUS_TYPE_UNKNOWN, + [BusTypeScsi] = GUEST_DISK_BUS_TYPE_SCSI, + [BusTypeAtapi] = GUEST_DISK_BUS_TYPE_IDE, + [BusTypeAta] = GUEST_DISK_BUS_TYPE_IDE, + [BusType1394] = GUEST_DISK_BUS_TYPE_IEEE1394, + [BusTypeSsa] = GUEST_DISK_BUS_TYPE_SSA, + [BusTypeFibre] = GUEST_DISK_BUS_TYPE_SSA, + [BusTypeUsb] = GUEST_DISK_BUS_TYPE_USB, + [BusTypeRAID] = GUEST_DISK_BUS_TYPE_RAID, +#if (_WIN32_WINNT >= 0x0600) + [BusTypeiScsi] = GUEST_DISK_BUS_TYPE_ISCSI, + [BusTypeSas] = GUEST_DISK_BUS_TYPE_SAS, + [BusTypeSata] = GUEST_DISK_BUS_TYPE_SATA, + [BusTypeSd] = GUEST_DISK_BUS_TYPE_SD, + [BusTypeMmc] = GUEST_DISK_BUS_TYPE_MMC, +#endif +#if (_WIN32_WINNT >= 0x0601) + [BusTypeVirtual] = GUEST_DISK_BUS_TYPE_VIRTUAL, + [BusTypeFileBackedVirtual] = GUEST_DISK_BUS_TYPE_FILE_BACKED_VIRTUAL, +#endif +}; + +static GuestDiskBusType find_bus_type(STORAGE_BUS_TYPE bus) +{ + if (bus > ARRAY_SIZE(win2qemu) || (int)bus < 0) { + return GUEST_DISK_BUS_TYPE_UNKNOWN; + } + return win2qemu[(int)bus]; +} + +DEFINE_GUID(GUID_DEVINTERFACE_VOLUME, + 0x53f5630dL, 0xb6bf, 0x11d0, 0x94, 0xf2, + 0x00, 0xa0, 0xc9, 0x1e, 0xfb, 0x8b); + +static GuestPCIAddress *get_pci_info(char *guid, Error **errp) +{ + HDEVINFO dev_info; + SP_DEVINFO_DATA dev_info_data; + DWORD size = 0; + int i; + char dev_name[MAX_PATH]; + char *buffer = NULL; + GuestPCIAddress *pci = NULL; + char *name = g_strdup(&guid[4]); + + if (!QueryDosDevice(name, dev_name, ARRAY_SIZE(dev_name))) { + error_setg_win32(errp, GetLastError(), "failed to get dos device name"); + goto out; + } + + dev_info = SetupDiGetClassDevs(&GUID_DEVINTERFACE_VOLUME, 0, 0, + DIGCF_PRESENT | DIGCF_DEVICEINTERFACE); + if (dev_info == INVALID_HANDLE_VALUE) { + error_setg_win32(errp, GetLastError(), "failed to get devices tree"); + goto out; + } + + dev_info_data.cbSize = sizeof(SP_DEVINFO_DATA); + for (i = 0; SetupDiEnumDeviceInfo(dev_info, i, &dev_info_data); i++) { + DWORD addr, bus, slot, func, dev, data, size2; + while (!SetupDiGetDeviceRegistryProperty(dev_info, &dev_info_data, + SPDRP_PHYSICAL_DEVICE_OBJECT_NAME, + &data, (PBYTE)buffer, size, + &size2)) { + size = MAX(size, size2); + if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) { + g_free(buffer); + /* Double the size to avoid problems on + * W2k MBCS systems per KB 888609. + * https://support.microsoft.com/en-us/kb/259695 */ + buffer = g_malloc(size * 2); + } else { + error_setg_win32(errp, GetLastError(), + "failed to get device name"); + goto out; + } + } + + if (g_strcmp0(buffer, dev_name)) { + continue; + } + + /* There is no need to allocate buffer in the next functions. The size + * is known and ULONG according to + * https://support.microsoft.com/en-us/kb/253232 + * https://msdn.microsoft.com/en-us/library/windows/hardware/ff543095(v=vs.85).aspx + */ + if (!SetupDiGetDeviceRegistryProperty(dev_info, &dev_info_data, + SPDRP_BUSNUMBER, &data, (PBYTE)&bus, size, NULL)) { + break; + } + + /* The function retrieves the device's address. This value will be + * transformed into device function and number */ + if (!SetupDiGetDeviceRegistryProperty(dev_info, &dev_info_data, + SPDRP_ADDRESS, &data, (PBYTE)&addr, size, NULL)) { + break; + } + + /* This call returns UINumber of DEVICE_CAPABILITIES structure. + * This number is typically a user-perceived slot number. */ + if (!SetupDiGetDeviceRegistryProperty(dev_info, &dev_info_data, + SPDRP_UI_NUMBER, &data, (PBYTE)&slot, size, NULL)) { + break; + } + + /* SetupApi gives us the same information as driver with + * IoGetDeviceProperty. According to Microsoft + * https://support.microsoft.com/en-us/kb/253232 + * FunctionNumber = (USHORT)((propertyAddress) & 0x0000FFFF); + * DeviceNumber = (USHORT)(((propertyAddress) >> 16) & 0x0000FFFF); + * SPDRP_ADDRESS is propertyAddress, so we do the same.*/ + + func = addr & 0x0000FFFF; + dev = (addr >> 16) & 0x0000FFFF; + pci = g_malloc0(sizeof(*pci)); + pci->domain = dev; + pci->slot = slot; + pci->function = func; + pci->bus = bus; + break; + } +out: + g_free(buffer); + g_free(name); + return pci; +} + +static int get_disk_bus_type(HANDLE vol_h, Error **errp) +{ + STORAGE_PROPERTY_QUERY query; + STORAGE_DEVICE_DESCRIPTOR *dev_desc, buf; + DWORD received; + + dev_desc = &buf; + dev_desc->Size = sizeof(buf); + query.PropertyId = StorageDeviceProperty; + query.QueryType = PropertyStandardQuery; + + if (!DeviceIoControl(vol_h, IOCTL_STORAGE_QUERY_PROPERTY, &query, + sizeof(STORAGE_PROPERTY_QUERY), dev_desc, + dev_desc->Size, &received, NULL)) { + error_setg_win32(errp, GetLastError(), "failed to get bus type"); + return -1; + } + + return dev_desc->BusType; +} + +/* VSS provider works with volumes, thus there is no difference if + * the volume consist of spanned disks. Info about the first disk in the + * volume is returned for the spanned disk group (LVM) */ +static GuestDiskAddressList *build_guest_disk_info(char *guid, Error **errp) +{ + GuestDiskAddressList *list = NULL; + GuestDiskAddress *disk; + SCSI_ADDRESS addr, *scsi_ad; + DWORD len; + int bus; + HANDLE vol_h; + + scsi_ad = &addr; + char *name = g_strndup(guid, strlen(guid)-1); + + vol_h = CreateFile(name, 0, FILE_SHARE_READ, NULL, OPEN_EXISTING, + 0, NULL); + if (vol_h == INVALID_HANDLE_VALUE) { + error_setg_win32(errp, GetLastError(), "failed to open volume"); + goto out_free; + } + + bus = get_disk_bus_type(vol_h, errp); + if (bus < 0) { + goto out_close; + } + + disk = g_malloc0(sizeof(*disk)); + disk->bus_type = find_bus_type(bus); + if (bus == BusTypeScsi || bus == BusTypeAta || bus == BusTypeRAID +#if (_WIN32_WINNT >= 0x0600) + /* This bus type is not supported before Windows Server 2003 SP1 */ + || bus == BusTypeSas +#endif + ) { + /* We are able to use the same ioctls for different bus types + * according to Microsoft docs + * https://technet.microsoft.com/en-us/library/ee851589(v=ws.10).aspx */ + if (DeviceIoControl(vol_h, IOCTL_SCSI_GET_ADDRESS, NULL, 0, scsi_ad, + sizeof(SCSI_ADDRESS), &len, NULL)) { + disk->unit = addr.Lun; + disk->target = addr.TargetId; + disk->bus = addr.PathId; + disk->pci_controller = get_pci_info(name, errp); + } + /* We do not set error in this case, because we still have enough + * information about volume. */ + } else { + disk->pci_controller = NULL; + } + + list = g_malloc0(sizeof(*list)); + list->value = disk; + list->next = NULL; +out_close: + CloseHandle(vol_h); +out_free: + g_free(name); + return list; +} + +#else + +static GuestDiskAddressList *build_guest_disk_info(char *guid, Error **errp) { - error_setg(errp, QERR_UNSUPPORTED); return NULL; } +#endif /* CONFIG_QGA_NTDDSCSI */ + +static GuestFilesystemInfo *build_guest_fsinfo(char *guid, Error **errp) +{ + DWORD info_size; + char mnt, *mnt_point; + char fs_name[32]; + char vol_info[MAX_PATH+1]; + size_t len; + GuestFilesystemInfo *fs = NULL; + + GetVolumePathNamesForVolumeName(guid, (LPCH)&mnt, 0, &info_size); + if (GetLastError() != ERROR_MORE_DATA) { + error_setg_win32(errp, GetLastError(), "failed to get volume name"); + return NULL; + } + + mnt_point = g_malloc(info_size + 1); + if (!GetVolumePathNamesForVolumeName(guid, mnt_point, info_size, + &info_size)) { + error_setg_win32(errp, GetLastError(), "failed to get volume name"); + goto free; + } + + len = strlen(mnt_point); + mnt_point[len] = '\\'; + mnt_point[len+1] = 0; + if (!GetVolumeInformation(mnt_point, vol_info, sizeof(vol_info), NULL, NULL, + NULL, (LPSTR)&fs_name, sizeof(fs_name))) { + if (GetLastError() != ERROR_NOT_READY) { + error_setg_win32(errp, GetLastError(), "failed to get volume info"); + } + goto free; + } + + fs_name[sizeof(fs_name) - 1] = 0; + fs = g_malloc(sizeof(*fs)); + fs->name = g_strdup(guid); + if (len == 0) { + fs->mountpoint = g_strdup("System Reserved"); + } else { + fs->mountpoint = g_strndup(mnt_point, len); + } + fs->type = g_strdup(fs_name); + fs->disk = build_guest_disk_info(guid, errp);; +free: + g_free(mnt_point); + return fs; +} + +GuestFilesystemInfoList *qmp_guest_get_fsinfo(Error **errp) +{ + HANDLE vol_h; + GuestFilesystemInfoList *new, *ret = NULL; + char guid[256]; + + vol_h = FindFirstVolume(guid, sizeof(guid)); + if (vol_h == INVALID_HANDLE_VALUE) { + error_setg_win32(errp, GetLastError(), "failed to find any volume"); + return NULL; + } + + do { + GuestFilesystemInfo *info = build_guest_fsinfo(guid, errp); + if (info == NULL) { + continue; + } + new = g_malloc(sizeof(*ret)); + new->value = info; + new->next = ret; + ret = new; + } while (FindNextVolume(vol_h, guid, sizeof(guid))); + + if (GetLastError() != ERROR_NO_MORE_FILES) { + error_setg_win32(errp, GetLastError(), "failed to find next volume"); + } + + FindVolumeClose(vol_h); + return ret; +} + /* * Return status of freeze/thaw */ @@ -493,9 +799,11 @@ static void guest_fsfreeze_cleanup(void) * Walk list of mounted file systems in the guest, and discard unused * areas. */ -void qmp_guest_fstrim(bool has_minimum, int64_t minimum, Error **errp) +GuestFilesystemTrimResponse * +qmp_guest_fstrim(bool has_minimum, int64_t minimum, Error **errp) { error_setg(errp, QERR_UNSUPPORTED); + return NULL; } typedef enum { @@ -589,12 +897,220 @@ void qmp_guest_suspend_hybrid(Error **errp) error_setg(errp, QERR_UNSUPPORTED); } -GuestNetworkInterfaceList *qmp_guest_network_get_interfaces(Error **errp) +static IP_ADAPTER_ADDRESSES *guest_get_adapters_addresses(Error **errp) { - error_setg(errp, QERR_UNSUPPORTED); + IP_ADAPTER_ADDRESSES *adptr_addrs = NULL; + ULONG adptr_addrs_len = 0; + DWORD ret; + + /* Call the first time to get the adptr_addrs_len. */ + GetAdaptersAddresses(AF_UNSPEC, GAA_FLAG_INCLUDE_PREFIX, + NULL, adptr_addrs, &adptr_addrs_len); + + adptr_addrs = g_malloc(adptr_addrs_len); + ret = GetAdaptersAddresses(AF_UNSPEC, GAA_FLAG_INCLUDE_PREFIX, + NULL, adptr_addrs, &adptr_addrs_len); + if (ret != ERROR_SUCCESS) { + error_setg_win32(errp, ret, "failed to get adapters addresses"); + g_free(adptr_addrs); + adptr_addrs = NULL; + } + return adptr_addrs; +} + +static char *guest_wctomb_dup(WCHAR *wstr) +{ + char *str; + size_t i; + + i = wcslen(wstr) + 1; + str = g_malloc(i); + WideCharToMultiByte(CP_ACP, WC_COMPOSITECHECK, + wstr, -1, str, i, NULL, NULL); + return str; +} + +static char *guest_addr_to_str(IP_ADAPTER_UNICAST_ADDRESS *ip_addr, + Error **errp) +{ + char addr_str[INET6_ADDRSTRLEN + INET_ADDRSTRLEN]; + DWORD len; + int ret; + + if (ip_addr->Address.lpSockaddr->sa_family == AF_INET || + ip_addr->Address.lpSockaddr->sa_family == AF_INET6) { + len = sizeof(addr_str); + ret = WSAAddressToString(ip_addr->Address.lpSockaddr, + ip_addr->Address.iSockaddrLength, + NULL, + addr_str, + &len); + if (ret != 0) { + error_setg_win32(errp, WSAGetLastError(), + "failed address presentation form conversion"); + return NULL; + } + return g_strdup(addr_str); + } return NULL; } +#if (_WIN32_WINNT >= 0x0600) +static int64_t guest_ip_prefix(IP_ADAPTER_UNICAST_ADDRESS *ip_addr) +{ + /* For Windows Vista/2008 and newer, use the OnLinkPrefixLength + * field to obtain the prefix. + */ + return ip_addr->OnLinkPrefixLength; +} +#else +/* When using the Windows XP and 2003 build environment, do the best we can to + * figure out the prefix. + */ +static IP_ADAPTER_INFO *guest_get_adapters_info(void) +{ + IP_ADAPTER_INFO *adptr_info = NULL; + ULONG adptr_info_len = 0; + DWORD ret; + + /* Call the first time to get the adptr_info_len. */ + GetAdaptersInfo(adptr_info, &adptr_info_len); + + adptr_info = g_malloc(adptr_info_len); + ret = GetAdaptersInfo(adptr_info, &adptr_info_len); + if (ret != ERROR_SUCCESS) { + g_free(adptr_info); + adptr_info = NULL; + } + return adptr_info; +} + +static int64_t guest_ip_prefix(IP_ADAPTER_UNICAST_ADDRESS *ip_addr) +{ + int64_t prefix = -1; /* Use for AF_INET6 and unknown/undetermined values. */ + IP_ADAPTER_INFO *adptr_info, *info; + IP_ADDR_STRING *ip; + struct in_addr *p; + + if (ip_addr->Address.lpSockaddr->sa_family != AF_INET) { + return prefix; + } + adptr_info = guest_get_adapters_info(); + if (adptr_info == NULL) { + return prefix; + } + + /* Match up the passed in ip_addr with one found in adaptr_info. + * The matching one in adptr_info will have the netmask. + */ + p = &((struct sockaddr_in *)ip_addr->Address.lpSockaddr)->sin_addr; + for (info = adptr_info; info; info = info->Next) { + for (ip = &info->IpAddressList; ip; ip = ip->Next) { + if (p->S_un.S_addr == inet_addr(ip->IpAddress.String)) { + prefix = ctpop32(inet_addr(ip->IpMask.String)); + goto out; + } + } + } +out: + g_free(adptr_info); + return prefix; +} +#endif + +GuestNetworkInterfaceList *qmp_guest_network_get_interfaces(Error **errp) +{ + IP_ADAPTER_ADDRESSES *adptr_addrs, *addr; + IP_ADAPTER_UNICAST_ADDRESS *ip_addr = NULL; + GuestNetworkInterfaceList *head = NULL, *cur_item = NULL; + GuestIpAddressList *head_addr, *cur_addr; + GuestNetworkInterfaceList *info; + GuestIpAddressList *address_item = NULL; + unsigned char *mac_addr; + char *addr_str; + WORD wsa_version; + WSADATA wsa_data; + int ret; + + adptr_addrs = guest_get_adapters_addresses(errp); + if (adptr_addrs == NULL) { + return NULL; + } + + /* Make WSA APIs available. */ + wsa_version = MAKEWORD(2, 2); + ret = WSAStartup(wsa_version, &wsa_data); + if (ret != 0) { + error_setg_win32(errp, ret, "failed socket startup"); + goto out; + } + + for (addr = adptr_addrs; addr; addr = addr->Next) { + info = g_malloc0(sizeof(*info)); + + if (cur_item == NULL) { + head = cur_item = info; + } else { + cur_item->next = info; + cur_item = info; + } + + info->value = g_malloc0(sizeof(*info->value)); + info->value->name = guest_wctomb_dup(addr->FriendlyName); + + if (addr->PhysicalAddressLength != 0) { + mac_addr = addr->PhysicalAddress; + + info->value->hardware_address = + g_strdup_printf("%02x:%02x:%02x:%02x:%02x:%02x", + (int) mac_addr[0], (int) mac_addr[1], + (int) mac_addr[2], (int) mac_addr[3], + (int) mac_addr[4], (int) mac_addr[5]); + + info->value->has_hardware_address = true; + } + + head_addr = NULL; + cur_addr = NULL; + for (ip_addr = addr->FirstUnicastAddress; + ip_addr; + ip_addr = ip_addr->Next) { + addr_str = guest_addr_to_str(ip_addr, errp); + if (addr_str == NULL) { + continue; + } + + address_item = g_malloc0(sizeof(*address_item)); + + if (!cur_addr) { + head_addr = cur_addr = address_item; + } else { + cur_addr->next = address_item; + cur_addr = address_item; + } + + address_item->value = g_malloc0(sizeof(*address_item->value)); + address_item->value->ip_address = addr_str; + address_item->value->prefix = guest_ip_prefix(ip_addr); + if (ip_addr->Address.lpSockaddr->sa_family == AF_INET) { + address_item->value->ip_address_type = + GUEST_IP_ADDRESS_TYPE_IPV4; + } else if (ip_addr->Address.lpSockaddr->sa_family == AF_INET6) { + address_item->value->ip_address_type = + GUEST_IP_ADDRESS_TYPE_IPV6; + } + } + if (head_addr) { + info->value->has_ip_addresses = true; + info->value->ip_addresses = head_addr; + } + } + WSACleanup(); +out: + g_free(adptr_addrs); + return head; +} + int64_t qmp_guest_get_time(Error **errp) { SYSTEMTIME ts = {0}; @@ -707,12 +1223,12 @@ GuestMemoryBlockInfo *qmp_guest_get_memory_block_info(Error **errp) GList *ga_command_blacklist_init(GList *blacklist) { const char *list_unsupported[] = { - "guest-suspend-hybrid", "guest-network-get-interfaces", + "guest-suspend-hybrid", "guest-get-vcpus", "guest-set-vcpus", "guest-set-user-password", "guest-get-memory-blocks", "guest-set-memory-blocks", "guest-get-memory-block-size", - "guest-fsfreeze-freeze-list", "guest-get-fsinfo", + "guest-fsfreeze-freeze-list", "guest-fstrim", NULL}; char **p = (char **)list_unsupported; diff --git a/qga/main.c b/qga/main.c index 23cde0104a..791982ef01 100644 --- a/qga/main.c +++ b/qga/main.c @@ -274,7 +274,7 @@ static void ga_log(const gchar *domain, GLogLevelFlags level, level &= G_LOG_LEVEL_MASK; #ifndef _WIN32 - if (domain && strcmp(domain, "syslog") == 0) { + if (g_strcmp0(domain, "syslog") == 0) { syslog(LOG_INFO, "%s: %s", level_str, msg); } else if (level & s->log_level) { #else diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json index b446dc729d..8a9b818d18 100644 --- a/qga/qapi-schema.json +++ b/qga/qapi-schema.json @@ -425,6 +425,30 @@ 'returns': 'int' } ## +# @GuestFilesystemTrimResult +# +# @path: path that was trimmed +# @error: an error message when trim failed +# @trimmed: bytes trimmed for this path +# @minimum: reported effective minimum for this path +# +# Since: 2.4 +## +{ 'struct': 'GuestFilesystemTrimResult', + 'data': {'path': 'str', + '*trimmed': 'int', '*minimum': 'int', '*error': 'str'} } + +## +# @GuestFilesystemTrimResponse +# +# @paths: list of @GuestFilesystemTrimResult per path that was trimmed +# +# Since: 2.4 +## +{ 'struct': 'GuestFilesystemTrimResponse', + 'data': {'paths': ['GuestFilesystemTrimResult']} } + +## # @guest-fstrim: # # Discard (or "trim") blocks which are not in use by the filesystem. @@ -437,12 +461,14 @@ # fragmented free space, although not all blocks will be discarded. # The default value is zero, meaning "discard every free block". # -# Returns: Nothing. +# Returns: A @GuestFilesystemTrimResponse which contains the +# status of all trimmed paths. (since 2.4) # # Since: 1.2 ## { 'command': 'guest-fstrim', - 'data': { '*minimum': 'int' } } + 'data': { '*minimum': 'int' }, + 'returns': 'GuestFilesystemTrimResponse' } ## # @guest-suspend-disk @@ -677,12 +703,24 @@ # @uml: UML disks # @sata: SATA disks # @sd: SD cards +# @unknown: Unknown bus type +# @ieee1394: Win IEEE 1394 bus type +# @ssa: Win SSA bus type +# @fibre: Win fiber channel bus type +# @raid: Win RAID bus type +# @iscsi: Win iScsi bus type +# @sas: Win serial-attaches SCSI bus type +# @mmc: Win multimedia card (MMC) bus type +# @virtual: Win virtual bus type +# @file-backed virtual: Win file-backed bus type # # Since: 2.2 ## { 'enum': 'GuestDiskBusType', 'data': [ 'ide', 'fdc', 'scsi', 'virtio', 'xen', 'usb', 'uml', 'sata', - 'sd' ] } + 'sd', 'unknown', 'ieee1394', 'ssa', 'fibre', 'raid', 'iscsi', + 'sas', 'mmc', 'virtual', 'file-backed-virtual' ] } + ## # @GuestPCIAddress: diff --git a/roms/SLOF b/roms/SLOF -Subproject c89b0df661c0a6bfa9ff0ed4a371f631f5ee38b +Subproject 7d766a3ac9b2474f6c7da0084d43590cbbf047b diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 36b07f99aa..b4f9461969 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -286,6 +286,17 @@ static const char *cpuid_xsave_feature_name[] = { NULL, NULL, NULL, NULL, }; +static const char *cpuid_6_feature_name[] = { + NULL, NULL, "arat", NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, +}; + #define I486_FEATURES (CPUID_FP87 | CPUID_VME | CPUID_PSE) #define PENTIUM_FEATURES (I486_FEATURES | CPUID_DE | CPUID_TSC | \ CPUID_MSR | CPUID_MCE | CPUID_CX8 | CPUID_MMX | CPUID_APIC) @@ -341,6 +352,7 @@ static const char *cpuid_xsave_feature_name[] = { CPUID_7_0_EBX_ERMS, CPUID_7_0_EBX_INVPCID, CPUID_7_0_EBX_RTM, CPUID_7_0_EBX_RDSEED */ #define TCG_APM_FEATURES 0 +#define TCG_6_EAX_FEATURES CPUID_6_EAX_ARAT typedef struct FeatureWordInfo { @@ -410,6 +422,11 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = { .cpuid_reg = R_EAX, .tcg_features = 0, }, + [FEAT_6_EAX] = { + .feat_names = cpuid_6_feature_name, + .cpuid_eax = 6, .cpuid_reg = R_EAX, + .tcg_features = TCG_6_EAX_FEATURES, + }, }; typedef struct X86RegisterInfo32 { @@ -1003,6 +1020,8 @@ static X86CPUDefinition builtin_x86_defs[] = { CPUID_EXT2_LM | CPUID_EXT2_SYSCALL | CPUID_EXT2_NX, .features[FEAT_8000_0001_ECX] = CPUID_EXT3_LAHF_LM, + .features[FEAT_6_EAX] = + CPUID_6_EAX_ARAT, .xlevel = 0x8000000A, .model_id = "Westmere E56xx/L56xx/X56xx (Nehalem-C)", }, @@ -1032,6 +1051,8 @@ static X86CPUDefinition builtin_x86_defs[] = { CPUID_EXT3_LAHF_LM, .features[FEAT_XSAVE] = CPUID_XSAVE_XSAVEOPT, + .features[FEAT_6_EAX] = + CPUID_6_EAX_ARAT, .xlevel = 0x8000000A, .model_id = "Intel Xeon E312xx (Sandy Bridge)", }, @@ -1064,6 +1085,8 @@ static X86CPUDefinition builtin_x86_defs[] = { CPUID_EXT3_LAHF_LM, .features[FEAT_XSAVE] = CPUID_XSAVE_XSAVEOPT, + .features[FEAT_6_EAX] = + CPUID_6_EAX_ARAT, .xlevel = 0x8000000A, .model_id = "Intel Xeon E3-12xx v2 (Ivy Bridge)", }, @@ -1098,6 +1121,8 @@ static X86CPUDefinition builtin_x86_defs[] = { CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ERMS | CPUID_7_0_EBX_INVPCID, .features[FEAT_XSAVE] = CPUID_XSAVE_XSAVEOPT, + .features[FEAT_6_EAX] = + CPUID_6_EAX_ARAT, .xlevel = 0x8000000A, .model_id = "Intel Core Processor (Haswell, no TSX)", }, { @@ -1132,6 +1157,8 @@ static X86CPUDefinition builtin_x86_defs[] = { CPUID_7_0_EBX_RTM, .features[FEAT_XSAVE] = CPUID_XSAVE_XSAVEOPT, + .features[FEAT_6_EAX] = + CPUID_6_EAX_ARAT, .xlevel = 0x8000000A, .model_id = "Intel Core Processor (Haswell)", }, @@ -1168,6 +1195,8 @@ static X86CPUDefinition builtin_x86_defs[] = { CPUID_7_0_EBX_SMAP, .features[FEAT_XSAVE] = CPUID_XSAVE_XSAVEOPT, + .features[FEAT_6_EAX] = + CPUID_6_EAX_ARAT, .xlevel = 0x8000000A, .model_id = "Intel Core Processor (Broadwell, no TSX)", }, @@ -1204,6 +1233,8 @@ static X86CPUDefinition builtin_x86_defs[] = { CPUID_7_0_EBX_SMAP, .features[FEAT_XSAVE] = CPUID_XSAVE_XSAVEOPT, + .features[FEAT_6_EAX] = + CPUID_6_EAX_ARAT, .xlevel = 0x8000000A, .model_id = "Intel Core Processor (Broadwell)", }, @@ -2359,7 +2390,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, break; case 6: /* Thermal and Power Leaf */ - *eax = 0; + *eax = env->features[FEAT_6_EAX]; *ebx = 0; *ecx = 0; *edx = 0; diff --git a/target-i386/cpu.h b/target-i386/cpu.h index ac39291b48..14dced08a7 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -415,6 +415,7 @@ typedef enum FeatureWord { FEAT_KVM, /* CPUID[4000_0001].EAX (KVM_CPUID_FEATURES) */ FEAT_SVM, /* CPUID[8000_000A].EDX */ FEAT_XSAVE, /* CPUID[EAX=0xd,ECX=1].EAX */ + FEAT_6_EAX, /* CPUID[6].EAX */ FEATURE_WORDS, } FeatureWord; @@ -580,6 +581,8 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS]; #define CPUID_XSAVE_XGETBV1 (1U << 2) #define CPUID_XSAVE_XSAVES (1U << 3) +#define CPUID_6_EAX_ARAT (1U << 2) + /* CPUID[0x80000007].EDX flags: */ #define CPUID_APM_INVTSC (1U << 8) @@ -959,7 +962,7 @@ typedef struct CPUX86State { uint8_t has_error_code; uint32_t sipi_vector; bool tsc_valid; - int tsc_khz; + int64_t tsc_khz; void *kvm_xsave_buf; uint64_t mcg_cap; diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 9038bf7077..066d03d99e 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -238,6 +238,8 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function, if (!kvm_irqchip_in_kernel()) { ret &= ~CPUID_EXT_X2APIC; } + } else if (function == 6 && reg == R_EAX) { + ret |= CPUID_6_EAX_ARAT; /* safe to allow because of emulated APIC */ } else if (function == 0x80000001 && reg == R_EDX) { /* On Intel, kvm returns cpuid according to the Intel spec, * so add missing bits according to the AMD spec: diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c index ddf469fe09..110436d088 100644 --- a/target-ppc/kvm.c +++ b/target-ppc/kvm.c @@ -40,6 +40,7 @@ #include "trace.h" #include "exec/gdbstub.h" #include "exec/memattrs.h" +#include "sysemu/hostmem.h" //#define DEBUG_KVM @@ -303,16 +304,11 @@ static void kvm_get_smmu_info(PowerPCCPU *cpu, struct kvm_ppc_smmu_info *info) kvm_get_fallback_smmu_info(cpu, info); } -static long getrampagesize(void) +static long gethugepagesize(const char *mem_path) { struct statfs fs; int ret; - if (!mem_path) { - /* guest RAM is backed by normal anonymous pages */ - return getpagesize(); - } - do { ret = statfs(mem_path, &fs); } while (ret != 0 && errno == EINTR); @@ -334,6 +330,55 @@ static long getrampagesize(void) return fs.f_bsize; } +static int find_max_supported_pagesize(Object *obj, void *opaque) +{ + char *mem_path; + long *hpsize_min = opaque; + + if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) { + mem_path = object_property_get_str(obj, "mem-path", NULL); + if (mem_path) { + long hpsize = gethugepagesize(mem_path); + if (hpsize < *hpsize_min) { + *hpsize_min = hpsize; + } + } else { + *hpsize_min = getpagesize(); + } + } + + return 0; +} + +static long getrampagesize(void) +{ + long hpsize = LONG_MAX; + Object *memdev_root; + + if (mem_path) { + return gethugepagesize(mem_path); + } + + /* it's possible we have memory-backend objects with + * hugepage-backed RAM. these may get mapped into system + * address space via -numa parameters or memory hotplug + * hooks. we want to take these into account, but we + * also want to make sure these supported hugepage + * sizes are applicable across the entire range of memory + * we may boot from, so we take the min across all + * backends, and assume normal pages in cases where a + * backend isn't backed by hugepages. + */ + memdev_root = object_resolve_path("/objects", NULL); + if (!memdev_root) { + return getpagesize(); + } + + object_child_foreach(memdev_root, find_max_supported_pagesize, &hpsize); + + return (hpsize == LONG_MAX) ? getpagesize() : hpsize; +} + static bool kvm_valid_page_size(uint32_t flags, long rampgsize, uint32_t shift) { if (!(flags & KVM_PPC_PAGE_SIZES_REAL)) { diff --git a/target-s390x/int_helper.c b/target-s390x/int_helper.c index 2c2b3f622c..a46c736d67 100644 --- a/target-s390x/int_helper.c +++ b/target-s390x/int_helper.c @@ -121,11 +121,12 @@ uint64_t HELPER(clz)(uint64_t v) return clz64(v); } -uint64_t HELPER(cvd)(int32_t bin) +uint64_t HELPER(cvd)(int32_t reg) { /* positive 0 */ uint64_t dec = 0x0c; - int shift = 4; + int64_t bin = reg; + int shift; if (bin < 0) { bin = -bin; @@ -133,9 +134,7 @@ uint64_t HELPER(cvd)(int32_t bin) } for (shift = 4; (shift < 64) && bin; shift += 4) { - int current_number = bin % 10; - - dec |= (current_number) << shift; + dec |= (bin % 10) << shift; bin /= 10; } diff --git a/target-s390x/mem_helper.c b/target-s390x/mem_helper.c index 3ccbeb99e4..6f8bd796ad 100644 --- a/target-s390x/mem_helper.c +++ b/target-s390x/mem_helper.c @@ -482,6 +482,7 @@ uint32_t HELPER(ex)(CPUS390XState *env, uint32_t cc, uint64_t v1, case 0xc00: helper_tr(env, l, get_address(env, 0, b1, d1), get_address(env, 0, b2, d2)); + break; case 0xd00: cc = helper_trt(env, l, get_address(env, 0, b1, d1), get_address(env, 0, b2, d2)); @@ -550,7 +551,7 @@ uint32_t HELPER(mvcl)(CPUS390XState *env, uint32_t r1, uint32_t r2) uint64_t dest = get_address_31fix(env, r1); uint64_t srclen = env->regs[r2 + 1] & 0xffffff; uint64_t src = get_address_31fix(env, r2); - uint8_t pad = src >> 24; + uint8_t pad = env->regs[r2 + 1] >> 24; uint8_t v; uint32_t cc; diff --git a/tcg/s390/tcg-target.c b/tcg/s390/tcg-target.c index 669fafe24f..921991ebfa 100644 --- a/tcg/s390/tcg-target.c +++ b/tcg/s390/tcg-target.c @@ -1643,8 +1643,10 @@ static void tcg_out_qemu_ld(TCGContext* s, TCGReg data_reg, TCGReg addr_reg, base_reg = tcg_out_tlb_read(s, addr_reg, opc, mem_index, 1); - label_ptr = s->code_ptr + 1; - tcg_out_insn(s, RI, BRC, S390_CC_NE, 0); + /* We need to keep the offset unchanged for retranslation. */ + tcg_out16(s, RI_BRC | (S390_CC_NE << 4)); + label_ptr = s->code_ptr; + s->code_ptr += 1; tcg_out_qemu_ld_direct(s, opc, data_reg, base_reg, TCG_REG_R2, 0); @@ -1669,8 +1671,10 @@ static void tcg_out_qemu_st(TCGContext* s, TCGReg data_reg, TCGReg addr_reg, base_reg = tcg_out_tlb_read(s, addr_reg, opc, mem_index, 0); - label_ptr = s->code_ptr + 1; - tcg_out_insn(s, RI, BRC, S390_CC_NE, 0); + /* We need to keep the offset unchanged for retranslation. */ + tcg_out16(s, RI_BRC | (S390_CC_NE << 4)); + label_ptr = s->code_ptr; + s->code_ptr += 1; tcg_out_qemu_st_direct(s, opc, data_reg, base_reg, TCG_REG_R2, 0); diff --git a/tests/Makefile b/tests/Makefile index ef1e981b39..e843a7e229 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -135,6 +135,9 @@ check-qtest-pci-y += tests/display-vga-test$(EXESUF) gcov-files-pci-y += hw/display/vga.c gcov-files-pci-y += hw/display/cirrus_vga.c gcov-files-pci-y += hw/display/vga-pci.c +gcov-files-pci-y += hw/display/virtio-gpu.c +gcov-files-pci-y += hw/display/virtio-gpu-pci.c +gcov-files-pci-$(CONFIG_VIRTIO_VGA) += hw/display/virtio-vga.c check-qtest-pci-y += tests/intel-hda-test$(EXESUF) gcov-files-pci-y += hw/audio/intel-hda.c hw/audio/hda-codec.c diff --git a/tests/display-vga-test.c b/tests/display-vga-test.c index 17f59101e8..7694344eaf 100644 --- a/tests/display-vga-test.c +++ b/tests/display-vga-test.c @@ -36,6 +36,20 @@ static void pci_multihead(void) qtest_end(); } +static void pci_virtio_gpu(void) +{ + qtest_start("-vga none -device virtio-gpu-pci"); + qtest_end(); +} + +#ifdef CONFIG_VIRTIO_VGA +static void pci_virtio_vga(void) +{ + qtest_start("-vga none -device virtio-vga"); + qtest_end(); +} +#endif + int main(int argc, char **argv) { int ret; @@ -46,6 +60,10 @@ int main(int argc, char **argv) qtest_add_func("/display/pci/stdvga", pci_stdvga); qtest_add_func("/display/pci/secondary", pci_secondary); qtest_add_func("/display/pci/multihead", pci_multihead); + qtest_add_func("/display/pci/virtio-gpu", pci_virtio_gpu); +#ifdef CONFIG_VIRTIO_VGA + qtest_add_func("/display/pci/virtio-vga", pci_virtio_vga); +#endif ret = g_test_run(); return ret; diff --git a/tests/rocker/bridge-vlan b/tests/rocker/bridge-vlan index ef9e5f53bb..897d82c5c7 100755 --- a/tests/rocker/bridge-vlan +++ b/tests/rocker/bridge-vlan @@ -20,8 +20,8 @@ simp ssh tut sw1 --cmd "echo 1 | sudo dd of=/sys/class/net/br0/bridge/vlan_filte # add both ports to VLAN 57 -simp ssh tut sw1 --cmd "sudo /sbin/bridge vlan add vid 57 dev sw1p1 master self" -simp ssh tut sw1 --cmd "sudo /sbin/bridge vlan add vid 57 dev sw1p2 master self" +simp ssh tut sw1 --cmd "sudo /sbin/bridge vlan add vid 57 dev sw1p1" +simp ssh tut sw1 --cmd "sudo /sbin/bridge vlan add vid 57 dev sw1p2" # turn off learning and flooding in SW diff --git a/tests/rocker/bridge-vlan-stp b/tests/rocker/bridge-vlan-stp index c660312bc6..85d2646820 100755 --- a/tests/rocker/bridge-vlan-stp +++ b/tests/rocker/bridge-vlan-stp @@ -21,8 +21,8 @@ simp ssh tut sw1 --cmd "echo 1 | sudo dd of=/sys/class/net/br0/bridge/vlan_filte # add both ports to VLAN 57 -simp ssh tut sw1 --cmd "sudo /sbin/bridge vlan add vid 57 dev sw1p1 master self" -simp ssh tut sw1 --cmd "sudo /sbin/bridge vlan add vid 57 dev sw1p2 master self" +simp ssh tut sw1 --cmd "sudo /sbin/bridge vlan add vid 57 dev sw1p1" +simp ssh tut sw1 --cmd "sudo /sbin/bridge vlan add vid 57 dev sw1p2" # turn off learning and flooding in SW diff --git a/trace-events b/trace-events index 52b7efa9a4..2d395c59a9 100644 --- a/trace-events +++ b/trace-events @@ -1191,6 +1191,7 @@ qemu_loadvm_state_section_partend(uint32_t section_id) "%u" qemu_loadvm_state_section_startfull(uint32_t section_id, const char *idstr, uint32_t instance_id, uint32_t version_id) "%u(%s) %u %u" savevm_section_start(const char *id, unsigned int section_id) "%s, section_id %u" savevm_section_end(const char *id, unsigned int section_id, int ret) "%s, section_id %u -> %d" +savevm_section_skip(const char *id, unsigned int section_id) "%s, section_id %u" savevm_state_begin(void) "" savevm_state_header(void) "" savevm_state_iterate(void) "" @@ -1403,10 +1404,13 @@ migrate_fd_error(void) "" migrate_fd_cancel(void) "" migrate_pending(uint64_t size, uint64_t max) "pending size %" PRIu64 " max %" PRIu64 migrate_transferred(uint64_t tranferred, uint64_t time_spent, double bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %g max_size %" PRId64 +migrate_state_too_big(void) "" +migrate_global_state_post_load(const char *state) "loaded state: %s" +migrate_global_state_pre_save(const char *state) "saved state: %s" # migration/rdma.c -qemu_dma_accept_incoming_migration(void) "" -qemu_dma_accept_incoming_migration_accepted(void) "" +qemu_rdma_accept_incoming_migration(void) "" +qemu_rdma_accept_incoming_migration_accepted(void) "" qemu_rdma_accept_pin_state(bool pin) "%d" qemu_rdma_accept_pin_verbsc(void *verbs) "Verbs context after listen: %p" qemu_rdma_block_for_wrid_miss(const char *wcompstr, int wcomp, const char *gcompstr, uint64_t req) "A Wanted wrid %s (%d) but got %s (%" PRIu64 ")" @@ -1433,13 +1437,14 @@ qemu_rdma_register_and_get_keys(uint64_t len, void *start) "Registering %" PRIu6 qemu_rdma_registration_handle_compress(int64_t length, int index, int64_t offset) "Zapping zero chunk: %" PRId64 " bytes, index %d, offset %" PRId64 qemu_rdma_registration_handle_finished(void) "" qemu_rdma_registration_handle_ram_blocks(void) "" +qemu_rdma_registration_handle_ram_blocks_loop(const char *name, uint64_t offset, uint64_t length, void *local_host_addr, unsigned int src_index) "%s: @%" PRIx64 "/%" PRIu64 " host:@%p src_index: %u" qemu_rdma_registration_handle_register(int requests) "%d requests" qemu_rdma_registration_handle_register_loop(int req, int index, uint64_t addr, uint64_t chunks) "Registration request (%d): index %d, current_addr %" PRIu64 " chunks: %" PRIu64 qemu_rdma_registration_handle_register_rkey(int rkey) "%x" qemu_rdma_registration_handle_unregister(int requests) "%d requests" qemu_rdma_registration_handle_unregister_loop(int count, int index, uint64_t chunk) "Unregistration request (%d): index %d, chunk %" PRIu64 qemu_rdma_registration_handle_unregister_success(uint64_t chunk) "%" PRIu64 -qemu_rdma_registration_handle_wait(uint64_t flags) "Waiting for next request %" PRIu64 +qemu_rdma_registration_handle_wait(void) "" qemu_rdma_registration_start(uint64_t flags) "%" PRIu64 qemu_rdma_registration_stop(uint64_t flags) "%" PRIu64 qemu_rdma_registration_stop_ram(void) "" @@ -1458,8 +1463,9 @@ qemu_rdma_write_one_recvregres(int mykey, int theirkey, uint64_t chunk) "Receive qemu_rdma_write_one_sendreg(uint64_t chunk, int len, int index, int64_t offset) "Sending registration request chunk %" PRIu64 " for %d bytes, index: %d, offset: %" PRId64 qemu_rdma_write_one_top(uint64_t chunks, uint64_t size) "Writing %" PRIu64 " chunks, (%" PRIu64 " MB)" qemu_rdma_write_one_zero(uint64_t chunk, int len, int index, int64_t offset) "Entire chunk is zero, sending compress: %" PRIu64 " for %d bytes, index: %d, offset: %" PRId64 -rdma_add_block(int block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Added Block: %d, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d" -rdma_delete_block(int block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Deleted Block: %d, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d" +rdma_add_block(const char *block_name, int block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Added Block: '%s':%d, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d" +rdma_block_notification_handle(const char *name, int index) "%s at %d" +rdma_delete_block(void *block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Deleted Block: %p, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d" rdma_start_incoming_migration(void) "" rdma_start_incoming_migration_after_dest_init(void) "" rdma_start_incoming_migration_after_rdma_listen(void) "" @@ -1594,6 +1600,7 @@ vfio_platform_intp_interrupt(int pin, int fd) "Inject IRQ #%d (fd = %d)" vfio_platform_intp_inject_pending_lockheld(int pin, int fd) "Inject pending IRQ #%d (fd = %d)" vfio_platform_populate_interrupts(int pin, int count, int flags) "- IRQ index %d: count %d, flags=0x%x" vfio_intp_interrupt_set_pending(int index) "irq %d is set PENDING" +vfio_platform_start_irqfd_injection(int index, int fd, int resamplefd) "IRQ index=%d, fd = %d, resamplefd = %d" #hw/acpi/memory_hotplug.c mhp_acpi_invalid_slot_selected(uint32_t slot) "0x%"PRIx32 diff --git a/translate-all.c b/translate-all.c index 412bc9005f..50d53fdac0 100644 --- a/translate-all.c +++ b/translate-all.c @@ -118,6 +118,7 @@ typedef struct PageDesc { #define V_L1_SHIFT (L1_MAP_ADDR_SPACE_BITS - TARGET_PAGE_BITS - V_L1_BITS) uintptr_t qemu_real_host_page_size; +uintptr_t qemu_real_host_page_mask; uintptr_t qemu_host_page_size; uintptr_t qemu_host_page_mask; @@ -307,6 +308,7 @@ void page_size_init(void) /* NOTE: we can always suppose that qemu_host_page_size >= TARGET_PAGE_SIZE */ qemu_real_host_page_size = getpagesize(); + qemu_real_host_page_mask = ~(qemu_real_host_page_size - 1); if (qemu_host_page_size == 0) { qemu_host_page_size = qemu_real_host_page_size; } @@ -573,8 +573,14 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_DEBUG, RUN_STATE_RUNNING }, { RUN_STATE_DEBUG, RUN_STATE_FINISH_MIGRATE }, - { RUN_STATE_INMIGRATE, RUN_STATE_RUNNING }, + { RUN_STATE_INMIGRATE, RUN_STATE_INTERNAL_ERROR }, + { RUN_STATE_INMIGRATE, RUN_STATE_IO_ERROR }, { RUN_STATE_INMIGRATE, RUN_STATE_PAUSED }, + { RUN_STATE_INMIGRATE, RUN_STATE_RUNNING }, + { RUN_STATE_INMIGRATE, RUN_STATE_SHUTDOWN }, + { RUN_STATE_INMIGRATE, RUN_STATE_SUSPENDED }, + { RUN_STATE_INMIGRATE, RUN_STATE_WATCHDOG }, + { RUN_STATE_INMIGRATE, RUN_STATE_GUEST_PANICKED }, { RUN_STATE_INTERNAL_ERROR, RUN_STATE_PAUSED }, { RUN_STATE_INTERNAL_ERROR, RUN_STATE_FINISH_MIGRATE }, @@ -634,6 +640,18 @@ bool runstate_check(RunState state) return current_run_state == state; } +bool runstate_store(char *str, size_t size) +{ + const char *state = RunState_lookup[current_run_state]; + size_t len = strlen(state) + 1; + + if (len > size) { + return false; + } + memcpy(str, state, len); + return true; +} + static void runstate_init(void) { const RunStateTransition *p; @@ -4606,6 +4624,7 @@ int main(int argc, char **argv, char **envp) return 0; } + register_global_state(); if (incoming) { Error *local_err = NULL; qemu_start_incoming_migration(incoming, &local_err); |