diff options
Diffstat (limited to 'hw')
-rw-r--r-- | hw/hyperv/vmbus.c | 20 | ||||
-rw-r--r-- | hw/ppc/Kconfig | 5 | ||||
-rw-r--r-- | hw/ppc/meson.build | 3 | ||||
-rw-r--r-- | hw/ppc/pegasos2.c | 789 | ||||
-rw-r--r-- | hw/ppc/spapr.c | 77 | ||||
-rw-r--r-- | hw/ppc/spapr_caps.c | 41 | ||||
-rw-r--r-- | hw/ppc/spapr_hcall.c | 24 | ||||
-rw-r--r-- | hw/ppc/spapr_vof.c | 167 | ||||
-rw-r--r-- | hw/ppc/trace-events | 24 | ||||
-rw-r--r-- | hw/ppc/vof.c | 1053 | ||||
-rw-r--r-- | hw/vfio/common.c | 315 | ||||
-rw-r--r-- | hw/virtio/vhost-user.c | 3 | ||||
-rw-r--r-- | hw/virtio/virtio-mem.c | 391 |
13 files changed, 2813 insertions, 99 deletions
diff --git a/hw/hyperv/vmbus.c b/hw/hyperv/vmbus.c index 984caf898d..c9887d5a7b 100644 --- a/hw/hyperv/vmbus.c +++ b/hw/hyperv/vmbus.c @@ -2372,6 +2372,14 @@ static void vmbus_dev_realize(DeviceState *dev, Error **errp) assert(!qemu_uuid_is_null(&vdev->instanceid)); + if (!qemu_uuid_is_null(&vdc->instanceid)) { + /* Class wants to only have a single instance with a fixed UUID */ + if (!qemu_uuid_is_equal(&vdev->instanceid, &vdc->instanceid)) { + error_setg(&err, "instance id can't be changed"); + goto error_out; + } + } + /* Check for instance id collision for this class id */ QTAILQ_FOREACH(child, &BUS(vmbus)->children, sibling) { VMBusDevice *child_dev = VMBUS_DEVICE(child->child); @@ -2438,18 +2446,22 @@ static void vmbus_dev_unrealize(DeviceState *dev) free_channels(vdev); } +static Property vmbus_dev_props[] = { + DEFINE_PROP_UUID("instanceid", VMBusDevice, instanceid), + DEFINE_PROP_END_OF_LIST() +}; + + static void vmbus_dev_class_init(ObjectClass *klass, void *data) { DeviceClass *kdev = DEVICE_CLASS(klass); + device_class_set_props(kdev, vmbus_dev_props); kdev->bus_type = TYPE_VMBUS; kdev->realize = vmbus_dev_realize; kdev->unrealize = vmbus_dev_unrealize; kdev->reset = vmbus_dev_reset; } -static Property vmbus_dev_instanceid = - DEFINE_PROP_UUID("instanceid", VMBusDevice, instanceid); - static void vmbus_dev_instance_init(Object *obj) { VMBusDevice *vdev = VMBUS_DEVICE(obj); @@ -2458,8 +2470,6 @@ static void vmbus_dev_instance_init(Object *obj) if (!qemu_uuid_is_null(&vdc->instanceid)) { /* Class wants to only have a single instance with a fixed UUID */ vdev->instanceid = vdc->instanceid; - } else { - qdev_property_add_static(DEVICE(vdev), &vmbus_dev_instanceid); } } diff --git a/hw/ppc/Kconfig b/hw/ppc/Kconfig index 66e0b15d9e..7fcafec60a 100644 --- a/hw/ppc/Kconfig +++ b/hw/ppc/Kconfig @@ -13,6 +13,7 @@ config PSERIES select MSI_NONBROKEN select FDT_PPC select CHRP_NVRAM + select VOF config SPAPR_RNG bool @@ -75,6 +76,7 @@ config PEGASOS2 select VT82C686 select IDE_VIA select SMBUS_EEPROM + select VOF # This should come with VT82C686 select ACPI_X86 @@ -144,3 +146,6 @@ config FW_CFG_PPC config FDT_PPC bool + +config VOF + bool diff --git a/hw/ppc/meson.build b/hw/ppc/meson.build index 597d974dd4..aa4c8e6a2e 100644 --- a/hw/ppc/meson.build +++ b/hw/ppc/meson.build @@ -84,4 +84,7 @@ ppc_ss.add(when: 'CONFIG_VIRTEX', if_true: files('virtex_ml507.c')) # Pegasos2 ppc_ss.add(when: 'CONFIG_PEGASOS2', if_true: files('pegasos2.c')) +ppc_ss.add(when: 'CONFIG_VOF', if_true: files('vof.c')) +ppc_ss.add(when: ['CONFIG_VOF', 'CONFIG_PSERIES'], if_true: files('spapr_vof.c')) + hw_arch += {'ppc': ppc_ss} diff --git a/hw/ppc/pegasos2.c b/hw/ppc/pegasos2.c index 0bfd0928aa..9a6ae867e4 100644 --- a/hw/ppc/pegasos2.c +++ b/hw/ppc/pegasos2.c @@ -1,7 +1,7 @@ /* * QEMU PowerPC CHRP (Genesi/bPlan Pegasos II) hardware System Emulator * - * Copyright (c) 2018-2020 BALATON Zoltan + * Copyright (c) 2018-2021 BALATON Zoltan * * This work is licensed under the GNU GPL license version 2 or later. * @@ -34,26 +34,68 @@ #include "trace.h" #include "qemu/datadir.h" #include "sysemu/device_tree.h" +#include "hw/ppc/vof.h" -#define PROM_FILENAME "pegasos2.rom" +#include <libfdt.h> + +#define PROM_FILENAME "vof.bin" #define PROM_ADDR 0xfff00000 #define PROM_SIZE 0x80000 +#define KVMPPC_HCALL_BASE 0xf000 +#define KVMPPC_H_RTAS (KVMPPC_HCALL_BASE + 0x0) +#define KVMPPC_H_VOF_CLIENT (KVMPPC_HCALL_BASE + 0x5) + +#define H_SUCCESS 0 +#define H_PRIVILEGE -3 /* Caller not privileged */ +#define H_PARAMETER -4 /* Parameter invalid, out-of-range or conflicting */ + #define BUS_FREQ_HZ 133333333 +#define PCI0_MEM_BASE 0xc0000000 +#define PCI0_MEM_SIZE 0x20000000 +#define PCI0_IO_BASE 0xf8000000 +#define PCI0_IO_SIZE 0x10000 + +#define PCI1_MEM_BASE 0x80000000 +#define PCI1_MEM_SIZE 0x40000000 +#define PCI1_IO_BASE 0xfe000000 +#define PCI1_IO_SIZE 0x10000 + +#define TYPE_PEGASOS2_MACHINE MACHINE_TYPE_NAME("pegasos2") +OBJECT_DECLARE_TYPE(Pegasos2MachineState, MachineClass, PEGASOS2_MACHINE) + +struct Pegasos2MachineState { + MachineState parent_obj; + PowerPCCPU *cpu; + DeviceState *mv; + Vof *vof; + void *fdt_blob; + uint64_t kernel_addr; + uint64_t kernel_entry; + uint64_t kernel_size; +}; + +static void *build_fdt(MachineState *machine, int *fdt_size); + static void pegasos2_cpu_reset(void *opaque) { PowerPCCPU *cpu = opaque; + Pegasos2MachineState *pm = PEGASOS2_MACHINE(current_machine); cpu_reset(CPU(cpu)); cpu->env.spr[SPR_HID1] = 7ULL << 28; + if (pm->vof) { + cpu->env.gpr[1] = 2 * VOF_STACK_SIZE - 0x20; + cpu->env.nip = 0x100; + } } static void pegasos2_init(MachineState *machine) { - PowerPCCPU *cpu = NULL; + Pegasos2MachineState *pm = PEGASOS2_MACHINE(machine); + CPUPPCState *env; MemoryRegion *rom = g_new(MemoryRegion, 1); - DeviceState *mv; PCIBus *pci_bus; PCIDevice *dev; I2CBus *i2c_bus; @@ -63,15 +105,16 @@ static void pegasos2_init(MachineState *machine) uint8_t *spd_data; /* init CPU */ - cpu = POWERPC_CPU(cpu_create(machine->cpu_type)); - if (PPC_INPUT(&cpu->env) != PPC_FLAGS_INPUT_6xx) { + pm->cpu = POWERPC_CPU(cpu_create(machine->cpu_type)); + env = &pm->cpu->env; + if (PPC_INPUT(env) != PPC_FLAGS_INPUT_6xx) { error_report("Incompatible CPU, only 6xx bus supported"); exit(1); } /* Set time-base frequency */ - cpu_ppc_tb_init(&cpu->env, BUS_FREQ_HZ / 4); - qemu_register_reset(pegasos2_cpu_reset, cpu); + cpu_ppc_tb_init(env, BUS_FREQ_HZ / 4); + qemu_register_reset(pegasos2_cpu_reset, pm->cpu); /* RAM */ memory_region_add_subregion(get_system_memory(), 0, machine->ram); @@ -82,30 +125,36 @@ static void pegasos2_init(MachineState *machine) error_report("Could not find firmware '%s'", fwname); exit(1); } + if (!machine->firmware && !pm->vof) { + pm->vof = g_malloc0(sizeof(*pm->vof)); + } memory_region_init_rom(rom, NULL, "pegasos2.rom", PROM_SIZE, &error_fatal); memory_region_add_subregion(get_system_memory(), PROM_ADDR, rom); sz = load_elf(filename, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 1, PPC_ELF_MACHINE, 0, 0); if (sz <= 0) { - sz = load_image_targphys(filename, PROM_ADDR, PROM_SIZE); + sz = load_image_targphys(filename, pm->vof ? 0 : PROM_ADDR, PROM_SIZE); } if (sz <= 0 || sz > PROM_SIZE) { error_report("Could not load firmware '%s'", filename); exit(1); } g_free(filename); + if (pm->vof) { + pm->vof->fw_size = sz; + } /* Marvell Discovery II system controller */ - mv = DEVICE(sysbus_create_simple(TYPE_MV64361, -1, - ((qemu_irq *)cpu->env.irq_inputs)[PPC6xx_INPUT_INT])); - pci_bus = mv64361_get_pci_bus(mv, 1); + pm->mv = DEVICE(sysbus_create_simple(TYPE_MV64361, -1, + ((qemu_irq *)env->irq_inputs)[PPC6xx_INPUT_INT])); + pci_bus = mv64361_get_pci_bus(pm->mv, 1); /* VIA VT8231 South Bridge (multifunction PCI device) */ /* VT8231 function 0: PCI-to-ISA Bridge */ dev = pci_create_simple_multifunction(pci_bus, PCI_DEVFN(12, 0), true, TYPE_VT8231_ISA); qdev_connect_gpio_out(DEVICE(dev), 0, - qdev_get_gpio_in_named(mv, "gpp", 31)); + qdev_get_gpio_in_named(pm->mv, "gpp", 31)); /* VT8231 function 1: IDE Controller */ dev = pci_create_simple(pci_bus, PCI_DEVFN(12, 1), "via-ide"); @@ -127,18 +176,728 @@ static void pegasos2_init(MachineState *machine) /* other PC hardware */ pci_vga_init(pci_bus); + + if (machine->kernel_filename) { + sz = load_elf(machine->kernel_filename, NULL, NULL, NULL, + &pm->kernel_entry, &pm->kernel_addr, NULL, NULL, 1, + PPC_ELF_MACHINE, 0, 0); + if (sz <= 0) { + error_report("Could not load kernel '%s'", + machine->kernel_filename); + exit(1); + } + pm->kernel_size = sz; + if (!pm->vof) { + warn_report("Option -kernel may be ineffective with -bios."); + } + } + if (machine->kernel_cmdline && !pm->vof) { + warn_report("Option -append may be ineffective with -bios."); + } +} + +static uint32_t pegasos2_pci_config_read(AddressSpace *as, int bus, + uint32_t addr, uint32_t len) +{ + hwaddr pcicfg = (bus ? 0xf1000c78 : 0xf1000cf8); + uint32_t val = 0xffffffff; + + stl_le_phys(as, pcicfg, addr | BIT(31)); + switch (len) { + case 4: + val = ldl_le_phys(as, pcicfg + 4); + break; + case 2: + val = lduw_le_phys(as, pcicfg + 4); + break; + case 1: + val = ldub_phys(as, pcicfg + 4); + break; + default: + qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid length\n", __func__); + break; + } + return val; +} + +static void pegasos2_pci_config_write(AddressSpace *as, int bus, uint32_t addr, + uint32_t len, uint32_t val) +{ + hwaddr pcicfg = (bus ? 0xf1000c78 : 0xf1000cf8); + + stl_le_phys(as, pcicfg, addr | BIT(31)); + switch (len) { + case 4: + stl_le_phys(as, pcicfg + 4, val); + break; + case 2: + stw_le_phys(as, pcicfg + 4, val); + break; + case 1: + stb_phys(as, pcicfg + 4, val); + break; + default: + qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid length\n", __func__); + break; + } +} + +static void pegasos2_machine_reset(MachineState *machine) +{ + Pegasos2MachineState *pm = PEGASOS2_MACHINE(machine); + AddressSpace *as = CPU(pm->cpu)->as; + void *fdt; + uint64_t d[2]; + int sz; + + qemu_devices_reset(); + if (!pm->vof) { + return; /* Firmware should set up machine so nothing to do */ + } + + /* Otherwise, set up devices that board firmware would normally do */ + stl_le_phys(as, 0xf1000000, 0x28020ff); + stl_le_phys(as, 0xf1000278, 0xa31fc); + stl_le_phys(as, 0xf100f300, 0x11ff0400); + stl_le_phys(as, 0xf100f10c, 0x80000000); + stl_le_phys(as, 0xf100001c, 0x8000000); + pegasos2_pci_config_write(as, 0, PCI_COMMAND, 2, PCI_COMMAND_IO | + PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER); + pegasos2_pci_config_write(as, 1, PCI_COMMAND, 2, PCI_COMMAND_IO | + PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER); + + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 0) << 8) | + PCI_INTERRUPT_LINE, 2, 0x9); + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 0) << 8) | + 0x50, 1, 0x2); + + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 1) << 8) | + PCI_INTERRUPT_LINE, 2, 0x109); + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 1) << 8) | + PCI_CLASS_PROG, 1, 0xf); + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 1) << 8) | + 0x40, 1, 0xb); + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 1) << 8) | + 0x50, 4, 0x17171717); + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 1) << 8) | + PCI_COMMAND, 2, 0x87); + + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 2) << 8) | + PCI_INTERRUPT_LINE, 2, 0x409); + + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 3) << 8) | + PCI_INTERRUPT_LINE, 2, 0x409); + + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 4) << 8) | + PCI_INTERRUPT_LINE, 2, 0x9); + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 4) << 8) | + 0x48, 4, 0xf00); + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 4) << 8) | + 0x40, 4, 0x558020); + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 4) << 8) | + 0x90, 4, 0xd00); + + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 5) << 8) | + PCI_INTERRUPT_LINE, 2, 0x309); + + pegasos2_pci_config_write(as, 1, (PCI_DEVFN(12, 6) << 8) | + PCI_INTERRUPT_LINE, 2, 0x309); + + /* Device tree and VOF set up */ + vof_init(pm->vof, machine->ram_size, &error_fatal); + if (vof_claim(pm->vof, 0, VOF_STACK_SIZE, VOF_STACK_SIZE) == -1) { + error_report("Memory allocation for stack failed"); + exit(1); + } + if (pm->kernel_size && + vof_claim(pm->vof, pm->kernel_addr, pm->kernel_size, 0) == -1) { + error_report("Memory for kernel is in use"); + exit(1); + } + fdt = build_fdt(machine, &sz); + /* FIXME: VOF assumes entry is same as load address */ + d[0] = cpu_to_be64(pm->kernel_entry); + d[1] = cpu_to_be64(pm->kernel_size - (pm->kernel_entry - pm->kernel_addr)); + qemu_fdt_setprop(fdt, "/chosen", "qemu,boot-kernel", d, sizeof(d)); + + qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt)); + g_free(pm->fdt_blob); + pm->fdt_blob = fdt; + + vof_build_dt(fdt, pm->vof); + vof_client_open_store(fdt, pm->vof, "/chosen", "stdout", "/failsafe"); + pm->cpu->vhyp = PPC_VIRTUAL_HYPERVISOR(machine); } -static void pegasos2_machine(MachineClass *mc) +enum pegasos2_rtas_tokens { + RTAS_RESTART_RTAS = 0, + RTAS_NVRAM_FETCH = 1, + RTAS_NVRAM_STORE = 2, + RTAS_GET_TIME_OF_DAY = 3, + RTAS_SET_TIME_OF_DAY = 4, + RTAS_EVENT_SCAN = 6, + RTAS_CHECK_EXCEPTION = 7, + RTAS_READ_PCI_CONFIG = 8, + RTAS_WRITE_PCI_CONFIG = 9, + RTAS_DISPLAY_CHARACTER = 10, + RTAS_SET_INDICATOR = 11, + RTAS_POWER_OFF = 17, + RTAS_SUSPEND = 18, + RTAS_HIBERNATE = 19, + RTAS_SYSTEM_REBOOT = 20, +}; + +static target_ulong pegasos2_rtas(PowerPCCPU *cpu, Pegasos2MachineState *pm, + target_ulong args_real) { + AddressSpace *as = CPU(cpu)->as; + uint32_t token = ldl_be_phys(as, args_real); + uint32_t nargs = ldl_be_phys(as, args_real + 4); + uint32_t nrets = ldl_be_phys(as, args_real + 8); + uint32_t args = args_real + 12; + uint32_t rets = args_real + 12 + nargs * 4; + + if (nrets < 1) { + qemu_log_mask(LOG_GUEST_ERROR, "Too few return values in RTAS call\n"); + return H_PARAMETER; + } + switch (token) { + case RTAS_READ_PCI_CONFIG: + { + uint32_t addr, len, val; + + if (nargs != 2 || nrets != 2) { + stl_be_phys(as, rets, -1); + return H_PARAMETER; + } + addr = ldl_be_phys(as, args); + len = ldl_be_phys(as, args + 4); + val = pegasos2_pci_config_read(as, !(addr >> 24), + addr & 0x0fffffff, len); + stl_be_phys(as, rets, 0); + stl_be_phys(as, rets + 4, val); + return H_SUCCESS; + } + case RTAS_WRITE_PCI_CONFIG: + { + uint32_t addr, len, val; + + if (nargs != 3 || nrets != 1) { + stl_be_phys(as, rets, -1); + return H_PARAMETER; + } + addr = ldl_be_phys(as, args); + len = ldl_be_phys(as, args + 4); + val = ldl_be_phys(as, args + 8); + pegasos2_pci_config_write(as, !(addr >> 24), + addr & 0x0fffffff, len, val); + stl_be_phys(as, rets, 0); + return H_SUCCESS; + } + case RTAS_DISPLAY_CHARACTER: + if (nargs != 1 || nrets != 1) { + stl_be_phys(as, rets, -1); + return H_PARAMETER; + } + qemu_log_mask(LOG_UNIMP, "%c", ldl_be_phys(as, args)); + stl_be_phys(as, rets, 0); + return H_SUCCESS; + default: + qemu_log_mask(LOG_UNIMP, "Unknown RTAS token %u (args=%u, rets=%u)\n", + token, nargs, nrets); + stl_be_phys(as, rets, 0); + return H_SUCCESS; + } +} + +static void pegasos2_hypercall(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu) +{ + Pegasos2MachineState *pm = PEGASOS2_MACHINE(vhyp); + CPUPPCState *env = &cpu->env; + + /* The TCG path should also be holding the BQL at this point */ + g_assert(qemu_mutex_iothread_locked()); + + if (msr_pr) { + qemu_log_mask(LOG_GUEST_ERROR, "Hypercall made with MSR[PR]=1\n"); + env->gpr[3] = H_PRIVILEGE; + } else if (env->gpr[3] == KVMPPC_H_RTAS) { + env->gpr[3] = pegasos2_rtas(cpu, pm, env->gpr[4]); + } else if (env->gpr[3] == KVMPPC_H_VOF_CLIENT) { + int ret = vof_client_call(MACHINE(pm), pm->vof, pm->fdt_blob, + env->gpr[4]); + env->gpr[3] = (ret ? H_PARAMETER : H_SUCCESS); + } else { + qemu_log_mask(LOG_GUEST_ERROR, "Unsupported hypercall " TARGET_FMT_lx + "\n", env->gpr[3]); + env->gpr[3] = -1; + } +} + +static void vhyp_nop(PPCVirtualHypervisor *vhyp, PowerPCCPU *cpu) +{ +} + +static target_ulong vhyp_encode_hpt_for_kvm_pr(PPCVirtualHypervisor *vhyp) +{ + return POWERPC_CPU(current_cpu)->env.spr[SPR_SDR1]; +} + +static void pegasos2_machine_class_init(ObjectClass *oc, void *data) +{ + MachineClass *mc = MACHINE_CLASS(oc); + PPCVirtualHypervisorClass *vhc = PPC_VIRTUAL_HYPERVISOR_CLASS(oc); + mc->desc = "Genesi/bPlan Pegasos II"; mc->init = pegasos2_init; + mc->reset = pegasos2_machine_reset; mc->block_default_type = IF_IDE; mc->default_boot_order = "cd"; mc->default_display = "std"; mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("7400_v2.9"); mc->default_ram_id = "pegasos2.ram"; mc->default_ram_size = 512 * MiB; + + vhc->hypercall = pegasos2_hypercall; + vhc->cpu_exec_enter = vhyp_nop; + vhc->cpu_exec_exit = vhyp_nop; + vhc->encode_hpt_for_kvm_pr = vhyp_encode_hpt_for_kvm_pr; +} + +static const TypeInfo pegasos2_machine_info = { + .name = TYPE_PEGASOS2_MACHINE, + .parent = TYPE_MACHINE, + .class_init = pegasos2_machine_class_init, + .instance_size = sizeof(Pegasos2MachineState), + .interfaces = (InterfaceInfo[]) { + { TYPE_PPC_VIRTUAL_HYPERVISOR }, + { } + }, +}; + +static void pegasos2_machine_register_types(void) +{ + type_register_static(&pegasos2_machine_info); +} + +type_init(pegasos2_machine_register_types) + +/* FDT creation for passing to firmware */ + +typedef struct { + void *fdt; + const char *path; +} FDTInfo; + +/* We do everything in reverse order so it comes out right in the tree */ + +static void dt_ide(PCIBus *bus, PCIDevice *d, FDTInfo *fi) +{ + qemu_fdt_setprop_string(fi->fdt, fi->path, "device_type", "spi"); } -DEFINE_MACHINE("pegasos2", pegasos2_machine) +static void dt_usb(PCIBus *bus, PCIDevice *d, FDTInfo *fi) +{ + qemu_fdt_setprop_cell(fi->fdt, fi->path, "#size-cells", 0); + qemu_fdt_setprop_cell(fi->fdt, fi->path, "#address-cells", 1); + qemu_fdt_setprop_string(fi->fdt, fi->path, "device_type", "usb"); +} + +static void dt_isa(PCIBus *bus, PCIDevice *d, FDTInfo *fi) +{ + GString *name = g_string_sized_new(64); + uint32_t cells[3]; + + qemu_fdt_setprop_cell(fi->fdt, fi->path, "#size-cells", 1); + qemu_fdt_setprop_cell(fi->fdt, fi->path, "#address-cells", 2); + qemu_fdt_setprop_string(fi->fdt, fi->path, "device_type", "isa"); + qemu_fdt_setprop_string(fi->fdt, fi->path, "name", "isa"); + + /* addional devices */ + g_string_printf(name, "%s/lpt@i3bc", fi->path); + qemu_fdt_add_subnode(fi->fdt, name->str); + qemu_fdt_setprop_cell(fi->fdt, name->str, "clock-frequency", 0); + cells[0] = cpu_to_be32(7); + cells[1] = 0; + qemu_fdt_setprop(fi->fdt, name->str, "interrupts", + cells, 2 * sizeof(cells[0])); + cells[0] = cpu_to_be32(1); + cells[1] = cpu_to_be32(0x3bc); + cells[2] = cpu_to_be32(8); + qemu_fdt_setprop(fi->fdt, name->str, "reg", cells, 3 * sizeof(cells[0])); + qemu_fdt_setprop_string(fi->fdt, name->str, "device_type", "lpt"); + qemu_fdt_setprop_string(fi->fdt, name->str, "name", "lpt"); + + g_string_printf(name, "%s/fdc@i3f0", fi->path); + qemu_fdt_add_subnode(fi->fdt, name->str); + qemu_fdt_setprop_cell(fi->fdt, name->str, "clock-frequency", 0); + cells[0] = cpu_to_be32(6); + cells[1] = 0; + qemu_fdt_setprop(fi->fdt, name->str, "interrupts", + cells, 2 * sizeof(cells[0])); + cells[0] = cpu_to_be32(1); + cells[1] = cpu_to_be32(0x3f0); + cells[2] = cpu_to_be32(8); + qemu_fdt_setprop(fi->fdt, name->str, "reg", cells, 3 * sizeof(cells[0])); + qemu_fdt_setprop_string(fi->fdt, name->str, "device_type", "fdc"); + qemu_fdt_setprop_string(fi->fdt, name->str, "name", "fdc"); + + g_string_printf(name, "%s/timer@i40", fi->path); + qemu_fdt_add_subnode(fi->fdt, name->str); + qemu_fdt_setprop_cell(fi->fdt, name->str, "clock-frequency", 0); + cells[0] = cpu_to_be32(1); + cells[1] = cpu_to_be32(0x40); + cells[2] = cpu_to_be32(8); + qemu_fdt_setprop(fi->fdt, name->str, "reg", cells, 3 * sizeof(cells[0])); + qemu_fdt_setprop_string(fi->fdt, name->str, "device_type", "timer"); + qemu_fdt_setprop_string(fi->fdt, name->str, "name", "timer"); + + g_string_printf(name, "%s/rtc@i70", fi->path); + qemu_fdt_add_subnode(fi->fdt, name->str); + qemu_fdt_setprop_string(fi->fdt, name->str, "compatible", "ds1385-rtc"); + qemu_fdt_setprop_cell(fi->fdt, name->str, "clock-frequency", 0); + cells[0] = cpu_to_be32(8); + cells[1] = 0; + qemu_fdt_setprop(fi->fdt, name->str, "interrupts", + cells, 2 * sizeof(cells[0])); + cells[0] = cpu_to_be32(1); + cells[1] = cpu_to_be32(0x70); + cells[2] = cpu_to_be32(2); + qemu_fdt_setprop(fi->fdt, name->str, "reg", cells, 3 * sizeof(cells[0])); + qemu_fdt_setprop_string(fi->fdt, name->str, "device_type", "rtc"); + qemu_fdt_setprop_string(fi->fdt, name->str, "name", "rtc"); + + g_string_printf(name, "%s/keyboard@i60", fi->path); + qemu_fdt_add_subnode(fi->fdt, name->str); + cells[0] = cpu_to_be32(1); + cells[1] = 0; + qemu_fdt_setprop(fi->fdt, name->str, "interrupts", + cells, 2 * sizeof(cells[0])); + cells[0] = cpu_to_be32(1); + cells[1] = cpu_to_be32(0x60); + cells[2] = cpu_to_be32(5); + qemu_fdt_setprop(fi->fdt, name->str, "reg", cells, 3 * sizeof(cells[0])); + qemu_fdt_setprop_string(fi->fdt, name->str, "device_type", "keyboard"); + qemu_fdt_setprop_string(fi->fdt, name->str, "name", "keyboard"); + + g_string_printf(name, "%s/8042@i60", fi->path); + qemu_fdt_add_subnode(fi->fdt, name->str); + qemu_fdt_setprop_cell(fi->fdt, name->str, "#interrupt-cells", 2); + qemu_fdt_setprop_cell(fi->fdt, name->str, "#size-cells", 0); + qemu_fdt_setprop_cell(fi->fdt, name->str, "#address-cells", 1); + qemu_fdt_setprop_string(fi->fdt, name->str, "interrupt-controller", ""); + qemu_fdt_setprop_cell(fi->fdt, name->str, "clock-frequency", 0); + cells[0] = cpu_to_be32(1); + cells[1] = cpu_to_be32(0x60); + cells[2] = cpu_to_be32(5); + qemu_fdt_setprop(fi->fdt, name->str, "reg", cells, 3 * sizeof(cells[0])); + qemu_fdt_setprop_string(fi->fdt, name->str, "device_type", ""); + qemu_fdt_setprop_string(fi->fdt, name->str, "name", "8042"); + + g_string_printf(name, "%s/serial@i2f8", fi->path); + qemu_fdt_add_subnode(fi->fdt, name->str); + qemu_fdt_setprop_cell(fi->fdt, name->str, "clock-frequency", 0); + cells[0] = cpu_to_be32(3); + cells[1] = 0; + qemu_fdt_setprop(fi->fdt, name->str, "interrupts", + cells, 2 * sizeof(cells[0])); + cells[0] = cpu_to_be32(1); + cells[1] = cpu_to_be32(0x2f8); + cells[2] = cpu_to_be32(8); + qemu_fdt_setprop(fi->fdt, name->str, "reg", cells, 3 * sizeof(cells[0])); + qemu_fdt_setprop_string(fi->fdt, name->str, "device_type", "serial"); + qemu_fdt_setprop_string(fi->fdt, name->str, "name", "serial"); + + g_string_free(name, TRUE); +} + +static struct { + const char *id; + const char *name; + void (*dtf)(PCIBus *bus, PCIDevice *d, FDTInfo *fi); +} device_map[] = { + { "pci11ab,6460", "host", NULL }, + { "pci1106,8231", "isa", dt_isa }, + { "pci1106,571", "ide", dt_ide }, + { "pci1106,3044", "firewire", NULL }, + { "pci1106,3038", "usb", dt_usb }, + { "pci1106,8235", "other", NULL }, + { "pci1106,3058", "sound", NULL }, + { NULL, NULL } +}; + +static void add_pci_device(PCIBus *bus, PCIDevice *d, void *opaque) +{ + FDTInfo *fi = opaque; + GString *node = g_string_new(NULL); + uint32_t cells[(PCI_NUM_REGIONS + 1) * 5]; + int i, j; + const char *name = NULL; + g_autofree const gchar *pn = g_strdup_printf("pci%x,%x", + pci_get_word(&d->config[PCI_VENDOR_ID]), + pci_get_word(&d->config[PCI_DEVICE_ID])); + + for (i = 0; device_map[i].id; i++) { + if (!strcmp(pn, device_map[i].id)) { + name = device_map[i].name; + break; + } + } + g_string_printf(node, "%s/%s@%x", fi->path, (name ?: pn), + PCI_SLOT(d->devfn)); + if (PCI_FUNC(d->devfn)) { + g_string_append_printf(node, ",%x", PCI_FUNC(d->devfn)); + } + + qemu_fdt_add_subnode(fi->fdt, node->str); + if (device_map[i].dtf) { + FDTInfo cfi = { fi->fdt, node->str }; + device_map[i].dtf(bus, d, &cfi); + } + cells[0] = cpu_to_be32(d->devfn << 8); + cells[1] = 0; + cells[2] = 0; + cells[3] = 0; + cells[4] = 0; + j = 5; + for (i = 0; i < PCI_NUM_REGIONS; i++) { + if (!d->io_regions[i].size) { + continue; + } + cells[j] = cpu_to_be32(d->devfn << 8 | (PCI_BASE_ADDRESS_0 + i * 4)); + if (d->io_regions[i].type & PCI_BASE_ADDRESS_SPACE_IO) { + cells[j] |= cpu_to_be32(1 << 24); + } else { + cells[j] |= cpu_to_be32(2 << 24); + if (d->io_regions[i].type & PCI_BASE_ADDRESS_MEM_PREFETCH) { + cells[j] |= cpu_to_be32(4 << 28); + } + } + cells[j + 1] = 0; + cells[j + 2] = 0; + cells[j + 3] = cpu_to_be32(d->io_regions[i].size >> 32); + cells[j + 4] = cpu_to_be32(d->io_regions[i].size); + j += 5; + } + qemu_fdt_setprop(fi->fdt, node->str, "reg", cells, j * sizeof(cells[0])); + qemu_fdt_setprop_string(fi->fdt, node->str, "name", name ?: pn); + if (pci_get_byte(&d->config[PCI_INTERRUPT_PIN])) { + qemu_fdt_setprop_cell(fi->fdt, node->str, "interrupts", + pci_get_byte(&d->config[PCI_INTERRUPT_PIN])); + } + /* Pegasos2 firmware has subsystem-id amd subsystem-vendor-id swapped */ + qemu_fdt_setprop_cell(fi->fdt, node->str, "subsystem-vendor-id", + pci_get_word(&d->config[PCI_SUBSYSTEM_ID])); + qemu_fdt_setprop_cell(fi->fdt, node->str, "subsystem-id", + pci_get_word(&d->config[PCI_SUBSYSTEM_VENDOR_ID])); + cells[0] = pci_get_long(&d->config[PCI_CLASS_REVISION]); + qemu_fdt_setprop_cell(fi->fdt, node->str, "class-code", cells[0] >> 8); + qemu_fdt_setprop_cell(fi->fdt, node->str, "revision-id", cells[0] & 0xff); + qemu_fdt_setprop_cell(fi->fdt, node->str, "device-id", + pci_get_word(&d->config[PCI_DEVICE_ID])); + qemu_fdt_setprop_cell(fi->fdt, node->str, "vendor-id", + pci_get_word(&d->config[PCI_VENDOR_ID])); + + g_string_free(node, TRUE); +} + +static void *build_fdt(MachineState *machine, int *fdt_size) +{ + Pegasos2MachineState *pm = PEGASOS2_MACHINE(machine); + PowerPCCPU *cpu = pm->cpu; + PCIBus *pci_bus; + FDTInfo fi; + uint32_t cells[16]; + void *fdt = create_device_tree(fdt_size); + + fi.fdt = fdt; + + /* root node */ + qemu_fdt_setprop_string(fdt, "/", "CODEGEN,description", + "Pegasos CHRP PowerPC System"); + qemu_fdt_setprop_string(fdt, "/", "CODEGEN,board", "Pegasos2"); + qemu_fdt_setprop_string(fdt, "/", "CODEGEN,vendor", "bplan GmbH"); + qemu_fdt_setprop_string(fdt, "/", "revision", "2B"); + qemu_fdt_setprop_string(fdt, "/", "model", "Pegasos2"); + qemu_fdt_setprop_string(fdt, "/", "device_type", "chrp"); + qemu_fdt_setprop_cell(fdt, "/", "#address-cells", 1); + qemu_fdt_setprop_string(fdt, "/", "name", "bplan,Pegasos2"); + + /* pci@c0000000 */ + qemu_fdt_add_subnode(fdt, "/pci@c0000000"); + cells[0] = 0; + cells[1] = 0; + qemu_fdt_setprop(fdt, "/pci@c0000000", "bus-range", + cells, 2 * sizeof(cells[0])); + qemu_fdt_setprop_cell(fdt, "/pci@c0000000", "pci-bridge-number", 1); + cells[0] = cpu_to_be32(PCI0_MEM_BASE); + cells[1] = cpu_to_be32(PCI0_MEM_SIZE); + qemu_fdt_setprop(fdt, "/pci@c0000000", "reg", cells, 2 * sizeof(cells[0])); + cells[0] = cpu_to_be32(0x01000000); + cells[1] = 0; + cells[2] = 0; + cells[3] = cpu_to_be32(PCI0_IO_BASE); + cells[4] = 0; + cells[5] = cpu_to_be32(PCI0_IO_SIZE); + cells[6] = cpu_to_be32(0x02000000); + cells[7] = 0; + cells[8] = cpu_to_be32(PCI0_MEM_BASE); + cells[9] = cpu_to_be32(PCI0_MEM_BASE); + cells[10] = 0; + cells[11] = cpu_to_be32(PCI0_MEM_SIZE); + qemu_fdt_setprop(fdt, "/pci@c0000000", "ranges", + cells, 12 * sizeof(cells[0])); + qemu_fdt_setprop_cell(fdt, "/pci@c0000000", "#size-cells", 2); + qemu_fdt_setprop_cell(fdt, "/pci@c0000000", "#address-cells", 3); + qemu_fdt_setprop_string(fdt, "/pci@c0000000", "device_type", "pci"); + qemu_fdt_setprop_string(fdt, "/pci@c0000000", "name", "pci"); + + fi.path = "/pci@c0000000"; + pci_bus = mv64361_get_pci_bus(pm->mv, 0); + pci_for_each_device_reverse(pci_bus, 0, add_pci_device, &fi); + + /* pci@80000000 */ + qemu_fdt_add_subnode(fdt, "/pci@80000000"); + cells[0] = 0; + cells[1] = 0; + qemu_fdt_setprop(fdt, "/pci@80000000", "bus-range", + cells, 2 * sizeof(cells[0])); + qemu_fdt_setprop_cell(fdt, "/pci@80000000", "pci-bridge-number", 0); + cells[0] = cpu_to_be32(PCI1_MEM_BASE); + cells[1] = cpu_to_be32(PCI1_MEM_SIZE); + qemu_fdt_setprop(fdt, "/pci@80000000", "reg", cells, 2 * sizeof(cells[0])); + qemu_fdt_setprop_cell(fdt, "/pci@80000000", "8259-interrupt-acknowledge", + 0xf1000cb4); + cells[0] = cpu_to_be32(0x01000000); + cells[1] = 0; + cells[2] = 0; + cells[3] = cpu_to_be32(PCI1_IO_BASE); + cells[4] = 0; + cells[5] = cpu_to_be32(PCI1_IO_SIZE); + cells[6] = cpu_to_be32(0x02000000); + cells[7] = 0; + cells[8] = cpu_to_be32(PCI1_MEM_BASE); + cells[9] = cpu_to_be32(PCI1_MEM_BASE); + cells[10] = 0; + cells[11] = cpu_to_be32(PCI1_MEM_SIZE); + qemu_fdt_setprop(fdt, "/pci@80000000", "ranges", + cells, 12 * sizeof(cells[0])); + qemu_fdt_setprop_cell(fdt, "/pci@80000000", "#size-cells", 2); + qemu_fdt_setprop_cell(fdt, "/pci@80000000", "#address-cells", 3); + qemu_fdt_setprop_string(fdt, "/pci@80000000", "device_type", "pci"); + qemu_fdt_setprop_string(fdt, "/pci@80000000", "name", "pci"); + + fi.path = "/pci@80000000"; + pci_bus = mv64361_get_pci_bus(pm->mv, 1); + pci_for_each_device_reverse(pci_bus, 0, add_pci_device, &fi); + + qemu_fdt_add_subnode(fdt, "/failsafe"); + qemu_fdt_setprop_string(fdt, "/failsafe", "device_type", "serial"); + qemu_fdt_setprop_string(fdt, "/failsafe", "name", "failsafe"); + + qemu_fdt_add_subnode(fdt, "/rtas"); + qemu_fdt_setprop_cell(fdt, "/rtas", "system-reboot", RTAS_SYSTEM_REBOOT); + qemu_fdt_setprop_cell(fdt, "/rtas", "hibernate", RTAS_HIBERNATE); + qemu_fdt_setprop_cell(fdt, "/rtas", "suspend", RTAS_SUSPEND); + qemu_fdt_setprop_cell(fdt, "/rtas", "power-off", RTAS_POWER_OFF); + qemu_fdt_setprop_cell(fdt, "/rtas", "set-indicator", RTAS_SET_INDICATOR); + qemu_fdt_setprop_cell(fdt, "/rtas", "display-character", + RTAS_DISPLAY_CHARACTER); + qemu_fdt_setprop_cell(fdt, "/rtas", "write-pci-config", + RTAS_WRITE_PCI_CONFIG); + qemu_fdt_setprop_cell(fdt, "/rtas", "read-pci-config", + RTAS_READ_PCI_CONFIG); + /* Pegasos2 firmware misspells check-exception and guests use that */ + qemu_fdt_setprop_cell(fdt, "/rtas", "check-execption", + RTAS_CHECK_EXCEPTION); + qemu_fdt_setprop_cell(fdt, "/rtas", "event-scan", RTAS_EVENT_SCAN); + qemu_fdt_setprop_cell(fdt, "/rtas", "set-time-of-day", + RTAS_SET_TIME_OF_DAY); + qemu_fdt_setprop_cell(fdt, "/rtas", "get-time-of-day", + RTAS_GET_TIME_OF_DAY); + qemu_fdt_setprop_cell(fdt, "/rtas", "nvram-store", RTAS_NVRAM_STORE); + qemu_fdt_setprop_cell(fdt, "/rtas", "nvram-fetch", RTAS_NVRAM_FETCH); + qemu_fdt_setprop_cell(fdt, "/rtas", "restart-rtas", RTAS_RESTART_RTAS); + qemu_fdt_setprop_cell(fdt, "/rtas", "rtas-error-log-max", 0); + qemu_fdt_setprop_cell(fdt, "/rtas", "rtas-event-scan-rate", 0); + qemu_fdt_setprop_cell(fdt, "/rtas", "rtas-display-device", 0); + qemu_fdt_setprop_cell(fdt, "/rtas", "rtas-size", 20); + qemu_fdt_setprop_cell(fdt, "/rtas", "rtas-version", 1); + + /* cpus */ + qemu_fdt_add_subnode(fdt, "/cpus"); + qemu_fdt_setprop_cell(fdt, "/cpus", "#cpus", 1); + qemu_fdt_setprop_cell(fdt, "/cpus", "#address-cells", 1); + qemu_fdt_setprop_cell(fdt, "/cpus", "#size-cells", 0); + qemu_fdt_setprop_string(fdt, "/cpus", "name", "cpus"); + + /* FIXME Get CPU name from CPU object */ + const char *cp = "/cpus/PowerPC,G4"; + qemu_fdt_add_subnode(fdt, cp); + qemu_fdt_setprop_cell(fdt, cp, "l2cr", 0); + qemu_fdt_setprop_cell(fdt, cp, "d-cache-size", 0x8000); + qemu_fdt_setprop_cell(fdt, cp, "d-cache-block-size", + cpu->env.dcache_line_size); + qemu_fdt_setprop_cell(fdt, cp, "d-cache-line-size", + cpu->env.dcache_line_size); + qemu_fdt_setprop_cell(fdt, cp, "i-cache-size", 0x8000); + qemu_fdt_setprop_cell(fdt, cp, "i-cache-block-size", + cpu->env.icache_line_size); + qemu_fdt_setprop_cell(fdt, cp, "i-cache-line-size", + cpu->env.icache_line_size); + if (cpu->env.id_tlbs) { + qemu_fdt_setprop_cell(fdt, cp, "i-tlb-sets", cpu->env.nb_ways); + qemu_fdt_setprop_cell(fdt, cp, "i-tlb-size", cpu->env.tlb_per_way); + qemu_fdt_setprop_cell(fdt, cp, "d-tlb-sets", cpu->env.nb_ways); + qemu_fdt_setprop_cell(fdt, cp, "d-tlb-size", cpu->env.tlb_per_way); + qemu_fdt_setprop_string(fdt, cp, "tlb-split", ""); + } + qemu_fdt_setprop_cell(fdt, cp, "tlb-sets", cpu->env.nb_ways); + qemu_fdt_setprop_cell(fdt, cp, "tlb-size", cpu->env.nb_tlb); + qemu_fdt_setprop_string(fdt, cp, "state", "running"); + if (cpu->env.insns_flags & PPC_ALTIVEC) { + qemu_fdt_setprop_string(fdt, cp, "altivec", ""); + qemu_fdt_setprop_string(fdt, cp, "data-streams", ""); + } + /* + * FIXME What flags do data-streams, external-control and + * performance-monitor depend on? + */ + qemu_fdt_setprop_string(fdt, cp, "external-control", ""); + if (cpu->env.insns_flags & PPC_FLOAT_FSQRT) { + qemu_fdt_setprop_string(fdt, cp, "general-purpose", ""); + } + qemu_fdt_setprop_string(fdt, cp, "performance-monitor", ""); + if (cpu->env.insns_flags & PPC_FLOAT_FRES) { + qemu_fdt_setprop_string(fdt, cp, "graphics", ""); + } + qemu_fdt_setprop_cell(fdt, cp, "reservation-granule-size", 4); + qemu_fdt_setprop_cell(fdt, cp, "timebase-frequency", + cpu->env.tb_env->tb_freq); + qemu_fdt_setprop_cell(fdt, cp, "bus-frequency", BUS_FREQ_HZ); + qemu_fdt_setprop_cell(fdt, cp, "clock-frequency", BUS_FREQ_HZ * 7.5); + qemu_fdt_setprop_cell(fdt, cp, "cpu-version", cpu->env.spr[SPR_PVR]); + cells[0] = 0; + cells[1] = 0; + qemu_fdt_setprop(fdt, cp, "reg", cells, 2 * sizeof(cells[0])); + qemu_fdt_setprop_string(fdt, cp, "device_type", "cpu"); + qemu_fdt_setprop_string(fdt, cp, "name", strrchr(cp, '/') + 1); + + /* memory */ + qemu_fdt_add_subnode(fdt, "/memory@0"); + cells[0] = 0; + cells[1] = cpu_to_be32(machine->ram_size); + qemu_fdt_setprop(fdt, "/memory@0", "reg", cells, 2 * sizeof(cells[0])); + qemu_fdt_setprop_string(fdt, "/memory@0", "device_type", "memory"); + qemu_fdt_setprop_string(fdt, "/memory@0", "name", "memory"); + + qemu_fdt_add_subnode(fdt, "/chosen"); + qemu_fdt_setprop_string(fdt, "/chosen", "bootargs", + machine->kernel_cmdline ?: ""); + qemu_fdt_setprop_string(fdt, "/chosen", "name", "chosen"); + + qemu_fdt_add_subnode(fdt, "/openprom"); + qemu_fdt_setprop_string(fdt, "/openprom", "model", "Pegasos2,1.1"); + + return fdt; +} diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 4dd90b75cc..a007be471e 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -101,6 +101,7 @@ #define FDT_MAX_ADDR 0x80000000 /* FDT must stay below that */ #define FW_MAX_SIZE 0x400000 #define FW_FILE_NAME "slof.bin" +#define FW_FILE_NAME_VOF "vof.bin" #define FW_OVERHEAD 0x2800000 #define KERNEL_LOAD_ADDR FW_MAX_SIZE @@ -880,6 +881,10 @@ static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt) add_str(hypertas, "hcall-copy"); add_str(hypertas, "hcall-debug"); add_str(hypertas, "hcall-vphn"); + if (spapr_get_cap(spapr, SPAPR_CAP_RPT_INVALIDATE) == SPAPR_CAP_ON) { + add_str(hypertas, "hcall-rpt-invalidate"); + } + add_str(qemu_hypertas, "hcall-memop1"); if (!kvm_enabled() || kvmppc_spapr_use_multitce()) { @@ -919,9 +924,13 @@ static void spapr_dt_rtas(SpaprMachineState *spapr, void *fdt) * * The extra 8 bytes is required because Linux's FWNMI error log check * is off-by-one. + * + * RTAS_MIN_SIZE is required for the RTAS blob itself. */ - _FDT(fdt_setprop_cell(fdt, rtas, "rtas-size", RTAS_ERROR_LOG_MAX + - ms->smp.max_cpus * sizeof(uint64_t)*2 + sizeof(uint64_t))); + _FDT(fdt_setprop_cell(fdt, rtas, "rtas-size", RTAS_MIN_SIZE + + RTAS_ERROR_LOG_MAX + + ms->smp.max_cpus * sizeof(uint64_t) * 2 + + sizeof(uint64_t))); _FDT(fdt_setprop_cell(fdt, rtas, "rtas-error-log-max", RTAS_ERROR_LOG_MAX)); _FDT(fdt_setprop_cell(fdt, rtas, "rtas-event-scan-rate", @@ -1639,22 +1648,29 @@ static void spapr_machine_reset(MachineState *machine) fdt_addr = MIN(spapr->rma_size, FDT_MAX_ADDR) - FDT_MAX_SIZE; fdt = spapr_build_fdt(spapr, true, FDT_MAX_SIZE); + if (spapr->vof) { + spapr_vof_reset(spapr, fdt, &error_fatal); + /* + * Do not pack the FDT as the client may change properties. + * VOF client does not expect the FDT so we do not load it to the VM. + */ + } else { + rc = fdt_pack(fdt); + /* Should only fail if we've built a corrupted tree */ + assert(rc == 0); - rc = fdt_pack(fdt); - - /* Should only fail if we've built a corrupted tree */ - assert(rc == 0); - - /* Load the fdt */ + spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, + 0, fdt_addr, 0); + cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt)); + } qemu_fdt_dumpdtb(fdt, fdt_totalsize(fdt)); - cpu_physical_memory_write(fdt_addr, fdt, fdt_totalsize(fdt)); + g_free(spapr->fdt_blob); spapr->fdt_size = fdt_totalsize(fdt); spapr->fdt_initial_size = spapr->fdt_size; spapr->fdt_blob = fdt; /* Set up the entry state */ - spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, 0, fdt_addr, 0); first_ppc_cpu->env.gpr[5] = 0; spapr->fwnmi_system_reset_addr = -1; @@ -2018,6 +2034,7 @@ static const VMStateDescription vmstate_spapr = { &vmstate_spapr_cap_ccf_assist, &vmstate_spapr_cap_fwnmi, &vmstate_spapr_fwnmi, + &vmstate_spapr_cap_rpt_invalidate, NULL } }; @@ -2657,7 +2674,8 @@ static void spapr_machine_init(MachineState *machine) SpaprMachineState *spapr = SPAPR_MACHINE(machine); SpaprMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine); MachineClass *mc = MACHINE_GET_CLASS(machine); - const char *bios_name = machine->firmware ?: FW_FILE_NAME; + const char *bios_default = spapr->vof ? FW_FILE_NAME_VOF : FW_FILE_NAME; + const char *bios_name = machine->firmware ?: bios_default; const char *kernel_filename = machine->kernel_filename; const char *initrd_filename = machine->initrd_filename; PCIHostState *phb; @@ -3014,6 +3032,10 @@ static void spapr_machine_init(MachineState *machine) } qemu_cond_init(&spapr->fwnmi_machine_check_interlock_cond); + if (spapr->vof) { + spapr->vof->fw_size = fw_size; /* for claim() on itself */ + spapr_register_hypercall(KVMPPC_H_VOF_CLIENT, spapr_h_vof_client); + } } #define DEFAULT_KVM_TYPE "auto" @@ -3204,6 +3226,28 @@ static void spapr_set_resize_hpt(Object *obj, const char *value, Error **errp) } } +static bool spapr_get_vof(Object *obj, Error **errp) +{ + SpaprMachineState *spapr = SPAPR_MACHINE(obj); + + return spapr->vof != NULL; +} + +static void spapr_set_vof(Object *obj, bool value, Error **errp) +{ + SpaprMachineState *spapr = SPAPR_MACHINE(obj); + + if (spapr->vof) { + vof_cleanup(spapr->vof); + g_free(spapr->vof); + spapr->vof = NULL; + } + if (!value) { + return; + } + spapr->vof = g_malloc0(sizeof(*spapr->vof)); +} + static char *spapr_get_ic_mode(Object *obj, Error **errp) { SpaprMachineState *spapr = SPAPR_MACHINE(obj); @@ -3329,6 +3373,11 @@ static void spapr_instance_init(Object *obj) stringify(KERNEL_LOAD_ADDR) " for -kernel is the default"); spapr->kernel_addr = KERNEL_LOAD_ADDR; + + object_property_add_bool(obj, "x-vof", spapr_get_vof, spapr_set_vof); + object_property_set_description(obj, "x-vof", + "Enable Virtual Open Firmware (experimental)"); + /* The machine class defines the default interrupt controller mode */ spapr->irq = smc->irq; object_property_add_str(obj, "ic-mode", spapr_get_ic_mode, @@ -4492,6 +4541,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data) XICSFabricClass *xic = XICS_FABRIC_CLASS(oc); InterruptStatsProviderClass *ispc = INTERRUPT_STATS_PROVIDER_CLASS(oc); XiveFabricClass *xfc = XIVE_FABRIC_CLASS(oc); + VofMachineIfClass *vmc = VOF_MACHINE_CLASS(oc); mc->desc = "pSeries Logical Partition (PAPR compliant)"; mc->ignore_boot_device_suffixes = true; @@ -4573,6 +4623,7 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data) smc->default_caps.caps[SPAPR_CAP_LARGE_DECREMENTER] = SPAPR_CAP_ON; smc->default_caps.caps[SPAPR_CAP_CCF_ASSIST] = SPAPR_CAP_ON; smc->default_caps.caps[SPAPR_CAP_FWNMI] = SPAPR_CAP_ON; + smc->default_caps.caps[SPAPR_CAP_RPT_INVALIDATE] = SPAPR_CAP_OFF; spapr_caps_add_properties(smc); smc->irq = &spapr_irq_dual; smc->dr_phb_enabled = true; @@ -4580,6 +4631,9 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data) smc->smp_threads_vsmt = true; smc->nr_xirqs = SPAPR_NR_XIRQS; xfc->match_nvt = spapr_match_nvt; + vmc->client_architecture_support = spapr_vof_client_architecture_support; + vmc->quiesce = spapr_vof_quiesce; + vmc->setprop = spapr_vof_setprop; } static const TypeInfo spapr_machine_info = { @@ -4599,6 +4653,7 @@ static const TypeInfo spapr_machine_info = { { TYPE_XICS_FABRIC }, { TYPE_INTERRUPT_STATS_PROVIDER }, { TYPE_XIVE_FABRIC }, + { TYPE_VOF_MACHINE_IF }, { } }, }; diff --git a/hw/ppc/spapr_caps.c b/hw/ppc/spapr_caps.c index d0c419b392..ed7c077a0d 100644 --- a/hw/ppc/spapr_caps.c +++ b/hw/ppc/spapr_caps.c @@ -582,6 +582,37 @@ static void cap_fwnmi_apply(SpaprMachineState *spapr, uint8_t val, } } +static void cap_rpt_invalidate_apply(SpaprMachineState *spapr, + uint8_t val, Error **errp) +{ + ERRP_GUARD(); + + if (!val) { + /* capability disabled by default */ + return; + } + + if (tcg_enabled()) { + error_setg(errp, "No H_RPT_INVALIDATE support in TCG"); + error_append_hint(errp, + "Try appending -machine cap-rpt-invalidate=off\n"); + } else if (kvm_enabled()) { + if (!kvmppc_has_cap_mmu_radix()) { + error_setg(errp, "H_RPT_INVALIDATE only supported on Radix"); + return; + } + + if (!kvmppc_has_cap_rpt_invalidate()) { + error_setg(errp, + "KVM implementation does not support H_RPT_INVALIDATE"); + error_append_hint(errp, + "Try appending -machine cap-rpt-invalidate=off\n"); + } else { + kvmppc_enable_h_rpt_invalidate(); + } + } +} + SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = { [SPAPR_CAP_HTM] = { .name = "htm", @@ -690,6 +721,15 @@ SpaprCapabilityInfo capability_table[SPAPR_CAP_NUM] = { .type = "bool", .apply = cap_fwnmi_apply, }, + [SPAPR_CAP_RPT_INVALIDATE] = { + .name = "rpt-invalidate", + .description = "Allow H_RPT_INVALIDATE", + .index = SPAPR_CAP_RPT_INVALIDATE, + .get = spapr_cap_get_bool, + .set = spapr_cap_set_bool, + .type = "bool", + .apply = cap_rpt_invalidate_apply, + }, }; static SpaprCapabilities default_caps_with_cpu(SpaprMachineState *spapr, @@ -830,6 +870,7 @@ SPAPR_CAP_MIG_STATE(nested_kvm_hv, SPAPR_CAP_NESTED_KVM_HV); SPAPR_CAP_MIG_STATE(large_decr, SPAPR_CAP_LARGE_DECREMENTER); SPAPR_CAP_MIG_STATE(ccf_assist, SPAPR_CAP_CCF_ASSIST); SPAPR_CAP_MIG_STATE(fwnmi, SPAPR_CAP_FWNMI); +SPAPR_CAP_MIG_STATE(rpt_invalidate, SPAPR_CAP_RPT_INVALIDATE); void spapr_caps_init(SpaprMachineState *spapr) { diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c index f25014afda..0e9a5b2e40 100644 --- a/hw/ppc/spapr_hcall.c +++ b/hw/ppc/spapr_hcall.c @@ -1233,8 +1233,7 @@ target_ulong do_client_architecture_support(PowerPCCPU *cpu, spapr_setup_hpt(spapr); } - fdt = spapr_build_fdt(spapr, false, fdt_bufsize); - + fdt = spapr_build_fdt(spapr, spapr->vof != NULL, fdt_bufsize); g_free(spapr->fdt_blob); spapr->fdt_size = fdt_totalsize(fdt); spapr->fdt_initial_size = spapr->fdt_size; @@ -1277,6 +1276,25 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu, return ret; } +target_ulong spapr_vof_client_architecture_support(MachineState *ms, + CPUState *cs, + target_ulong ovec_addr) +{ + SpaprMachineState *spapr = SPAPR_MACHINE(ms); + + target_ulong ret = do_client_architecture_support(POWERPC_CPU(cs), spapr, + ovec_addr, FDT_MAX_SIZE); + + /* + * This adds stdout and generates phandles for boottime and CAS FDTs. + * It is alright to update the FDT here as do_client_architecture_support() + * does not pack it. + */ + spapr_vof_client_dt_finalize(spapr, spapr->fdt_blob); + + return ret; +} + static target_ulong h_get_cpu_characteristics(PowerPCCPU *cpu, SpaprMachineState *spapr, target_ulong opcode, @@ -1299,6 +1317,8 @@ static target_ulong h_get_cpu_characteristics(PowerPCCPU *cpu, behaviour |= H_CPU_BEHAV_L1D_FLUSH_PR; break; case SPAPR_CAP_FIXED: + behaviour |= H_CPU_BEHAV_NO_L1D_FLUSH_ENTRY; + behaviour |= H_CPU_BEHAV_NO_L1D_FLUSH_UACCESS; break; default: /* broken */ assert(safe_cache == SPAPR_CAP_BROKEN); diff --git a/hw/ppc/spapr_vof.c b/hw/ppc/spapr_vof.c new file mode 100644 index 0000000000..40ce8fe003 --- /dev/null +++ b/hw/ppc/spapr_vof.c @@ -0,0 +1,167 @@ +/* + * SPAPR machine hooks to Virtual Open Firmware, + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "qapi/error.h" +#include "hw/ppc/spapr.h" +#include "hw/ppc/spapr_vio.h" +#include "hw/ppc/spapr_cpu_core.h" +#include "hw/ppc/fdt.h" +#include "hw/ppc/vof.h" +#include "sysemu/sysemu.h" +#include "qom/qom-qobject.h" +#include "trace.h" + +target_ulong spapr_h_vof_client(PowerPCCPU *cpu, SpaprMachineState *spapr, + target_ulong opcode, target_ulong *_args) +{ + int ret = vof_client_call(MACHINE(spapr), spapr->vof, spapr->fdt_blob, + ppc64_phys_to_real(_args[0])); + + if (ret) { + return H_PARAMETER; + } + return H_SUCCESS; +} + +void spapr_vof_client_dt_finalize(SpaprMachineState *spapr, void *fdt) +{ + char *stdout_path = spapr_vio_stdout_path(spapr->vio_bus); + + vof_build_dt(fdt, spapr->vof); + + if (spapr->vof->bootargs) { + int chosen; + + _FDT(chosen = fdt_path_offset(fdt, "/chosen")); + /* + * If the client did not change "bootargs", spapr_dt_chosen() must have + * stored machine->kernel_cmdline in it before getting here. + */ + _FDT(fdt_setprop_string(fdt, chosen, "bootargs", spapr->vof->bootargs)); + } + + /* + * SLOF-less setup requires an open instance of stdout for early + * kernel printk. By now all phandles are settled so we can open + * the default serial console. + */ + if (stdout_path) { + _FDT(vof_client_open_store(fdt, spapr->vof, "/chosen", "stdout", + stdout_path)); + } +} + +void spapr_vof_reset(SpaprMachineState *spapr, void *fdt, Error **errp) +{ + target_ulong stack_ptr; + Vof *vof = spapr->vof; + PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu); + + vof_init(vof, spapr->rma_size, errp); + + stack_ptr = vof_claim(vof, 0, VOF_STACK_SIZE, VOF_STACK_SIZE); + if (stack_ptr == -1) { + error_setg(errp, "Memory allocation for stack failed"); + return; + } + /* Stack grows downwards plus reserve space for the minimum stack frame */ + stack_ptr += VOF_STACK_SIZE - 0x20; + + if (spapr->kernel_size && + vof_claim(vof, spapr->kernel_addr, spapr->kernel_size, 0) == -1) { + error_setg(errp, "Memory for kernel is in use"); + return; + } + + if (spapr->initrd_size && + vof_claim(vof, spapr->initrd_base, spapr->initrd_size, 0) == -1) { + error_setg(errp, "Memory for initramdisk is in use"); + return; + } + + spapr_vof_client_dt_finalize(spapr, fdt); + + spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, + stack_ptr, spapr->initrd_base, + spapr->initrd_size); + /* VOF is 32bit BE so enforce MSR here */ + first_ppc_cpu->env.msr &= ~((1ULL << MSR_SF) | (1ULL << MSR_LE)); + + /* + * At this point the expected allocation map is: + * + * 0..c38 - the initial firmware + * 8000..10000 - stack + * 400000.. - kernel + * 3ea0000.. - initramdisk + * + * We skip writing FDT as nothing expects it; OF client interface is + * going to be used for reading the device tree. + */ +} + +void spapr_vof_quiesce(MachineState *ms) +{ + SpaprMachineState *spapr = SPAPR_MACHINE(ms); + + spapr->fdt_size = fdt_totalsize(spapr->fdt_blob); + spapr->fdt_initial_size = spapr->fdt_size; +} + +bool spapr_vof_setprop(MachineState *ms, const char *path, const char *propname, + void *val, int vallen) +{ + SpaprMachineState *spapr = SPAPR_MACHINE(ms); + + /* + * We only allow changing properties which we know how to update in QEMU + * OR + * the ones which we know that they need to survive during "quiesce". + */ + + if (strcmp(path, "/rtas") == 0) { + if (strcmp(propname, "linux,rtas-base") == 0 || + strcmp(propname, "linux,rtas-entry") == 0) { + /* These need to survive quiesce so let them store in the FDT */ + return true; + } + } + + if (strcmp(path, "/chosen") == 0) { + if (strcmp(propname, "bootargs") == 0) { + Vof *vof = spapr->vof; + + g_free(vof->bootargs); + vof->bootargs = g_strndup(val, vallen); + return true; + } + if (strcmp(propname, "linux,initrd-start") == 0) { + if (vallen == sizeof(uint32_t)) { + spapr->initrd_base = ldl_be_p(val); + return true; + } + if (vallen == sizeof(uint64_t)) { + spapr->initrd_base = ldq_be_p(val); + return true; + } + return false; + } + if (strcmp(propname, "linux,initrd-end") == 0) { + if (vallen == sizeof(uint32_t)) { + spapr->initrd_size = ldl_be_p(val) - spapr->initrd_base; + return true; + } + if (vallen == sizeof(uint64_t)) { + spapr->initrd_size = ldq_be_p(val) - spapr->initrd_base; + return true; + } + return false; + } + } + + return true; +} diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events index 0ba3e40353..6e90a01072 100644 --- a/hw/ppc/trace-events +++ b/hw/ppc/trace-events @@ -71,6 +71,30 @@ spapr_rtas_ibm_configure_connector_invalid(uint32_t index) "DRC index: 0x%"PRIx3 spapr_vio_h_reg_crq(uint64_t reg, uint64_t queue_addr, uint64_t queue_len) "CRQ for dev 0x%" PRIx64 " registered at 0x%" PRIx64 "/0x%" PRIx64 spapr_vio_free_crq(uint32_t reg) "CRQ for dev 0x%" PRIx32 " freed" +# vof.c +vof_error_str_truncated(const char *s, int len) "%s truncated to %d" +vof_error_param(const char *method, int nargscheck, int nretcheck, int nargs, int nret) "%s takes/returns %d/%d, not %d/%d" +vof_error_unknown_service(const char *service, int nargs, int nret) "\"%s\" args=%d rets=%d" +vof_error_unknown_method(const char *method) "\"%s\"" +vof_error_unknown_ihandle_close(uint32_t ih) "ih=0x%x" +vof_error_unknown_path(const char *path) "\"%s\"" +vof_error_write(uint32_t ih) "ih=0x%x" +vof_finddevice(const char *path, uint32_t ph) "\"%s\" => ph=0x%x" +vof_claim(uint32_t virt, uint32_t size, uint32_t align, uint32_t ret) "virt=0x%x size=0x%x align=0x%x => 0x%x" +vof_release(uint32_t virt, uint32_t size, uint32_t ret) "virt=0x%x size=0x%x => 0x%x" +vof_method(uint32_t ihandle, const char *method, uint32_t param, uint32_t ret, uint32_t ret2) "ih=0x%x \"%s\"(0x%x) => 0x%x 0x%x" +vof_getprop(uint32_t ph, const char *prop, uint32_t ret, const char *val) "ph=0x%x \"%s\" => len=%d [%s]" +vof_getproplen(uint32_t ph, const char *prop, uint32_t ret) "ph=0x%x \"%s\" => len=%d" +vof_setprop(uint32_t ph, const char *prop, const char *val, uint32_t vallen, uint32_t ret) "ph=0x%x \"%s\" [%s] len=%d => ret=%d" +vof_open(const char *path, uint32_t ph, uint32_t ih) "%s ph=0x%x => ih=0x%x" +vof_interpret(const char *cmd, uint32_t param1, uint32_t param2, uint32_t ret, uint32_t ret2) "[%s] 0x%x 0x%x => 0x%x 0x%x" +vof_package_to_path(uint32_t ph, const char *tmp, uint32_t ret) "ph=0x%x => %s len=%d" +vof_instance_to_path(uint32_t ih, uint32_t ph, const char *tmp, uint32_t ret) "ih=0x%x ph=0x%x => %s len=%d" +vof_instance_to_package(uint32_t ih, uint32_t ph) "ih=0x%x => ph=0x%x" +vof_write(uint32_t ih, unsigned cb, const char *msg) "ih=0x%x [%u] \"%s\"" +vof_avail(uint64_t start, uint64_t end, uint64_t size) "0x%"PRIx64"..0x%"PRIx64" size=0x%"PRIx64 +vof_claimed(uint64_t start, uint64_t end, uint64_t size) "0x%"PRIx64"..0x%"PRIx64" size=0x%"PRIx64 + # ppc.c ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff, int64_t seconds) "adjusted from 0x%"PRIx64" to 0x%"PRIx64", diff %"PRId64" (%"PRId64"s)" diff --git a/hw/ppc/vof.c b/hw/ppc/vof.c new file mode 100644 index 0000000000..81f6596215 --- /dev/null +++ b/hw/ppc/vof.c @@ -0,0 +1,1053 @@ +/* + * QEMU PowerPC Virtual Open Firmware. + * + * This implements client interface from OpenFirmware IEEE1275 on the QEMU + * side to leave only a very basic firmware in the VM. + * + * Copyright (c) 2021 IBM Corporation. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "qemu/timer.h" +#include "qemu/range.h" +#include "qemu/units.h" +#include "qemu/log.h" +#include "qapi/error.h" +#include "exec/ram_addr.h" +#include "exec/address-spaces.h" +#include "hw/ppc/vof.h" +#include "hw/ppc/fdt.h" +#include "sysemu/runstate.h" +#include "qom/qom-qobject.h" +#include "trace.h" + +#include <libfdt.h> + +/* + * OF 1275 "nextprop" description suggests is it 32 bytes max but + * LoPAPR defines "ibm,query-interrupt-source-number" which is 33 chars long. + */ +#define OF_PROPNAME_LEN_MAX 64 + +#define VOF_MAX_PATH 256 +#define VOF_MAX_SETPROPLEN 2048 +#define VOF_MAX_METHODLEN 256 +#define VOF_MAX_FORTHCODE 256 +#define VOF_VTY_BUF_SIZE 256 + +typedef struct { + uint64_t start; + uint64_t size; +} OfClaimed; + +typedef struct { + char *path; /* the path used to open the instance */ + uint32_t phandle; +} OfInstance; + +static int readstr(hwaddr pa, char *buf, int size) +{ + if (VOF_MEM_READ(pa, buf, size) != MEMTX_OK) { + return -1; + } + if (strnlen(buf, size) == size) { + buf[size - 1] = '\0'; + trace_vof_error_str_truncated(buf, size); + return -1; + } + return 0; +} + +static bool cmpservice(const char *s, unsigned nargs, unsigned nret, + const char *s1, unsigned nargscheck, unsigned nretcheck) +{ + if (strcmp(s, s1)) { + return false; + } + if ((nargscheck && (nargs != nargscheck)) || + (nretcheck && (nret != nretcheck))) { + trace_vof_error_param(s, nargscheck, nretcheck, nargs, nret); + return false; + } + + return true; +} + +static void prop_format(char *tval, int tlen, const void *prop, int len) +{ + int i; + const unsigned char *c; + char *t; + const char bin[] = "..."; + + for (i = 0, c = prop; i < len; ++i, ++c) { + if (*c == '\0' && i == len - 1) { + strncpy(tval, prop, tlen - 1); + return; + } + if (*c < 0x20 || *c >= 0x80) { + break; + } + } + + for (i = 0, c = prop, t = tval; i < len; ++i, ++c) { + if (t >= tval + tlen - sizeof(bin) - 1 - 2 - 1) { + strcpy(t, bin); + return; + } + if (i && i % 4 == 0 && i != len - 1) { + strcat(t, " "); + ++t; + } + t += sprintf(t, "%02X", *c & 0xFF); + } +} + +static int get_path(const void *fdt, int offset, char *buf, int len) +{ + int ret; + + ret = fdt_get_path(fdt, offset, buf, len - 1); + if (ret < 0) { + return ret; + } + + buf[len - 1] = '\0'; + + return strlen(buf) + 1; +} + +static int phandle_to_path(const void *fdt, uint32_t ph, char *buf, int len) +{ + int ret; + + ret = fdt_node_offset_by_phandle(fdt, ph); + if (ret < 0) { + return ret; + } + + return get_path(fdt, ret, buf, len); +} + +static int path_offset(const void *fdt, const char *path) +{ + g_autofree char *p = NULL; + char *at; + + /* + * https://www.devicetree.org/open-firmware/bindings/ppc/release/ppc-2_1.html#HDR16 + * + * "Conversion from numeric representation to text representation shall use + * the lower case forms of the hexadecimal digits in the range a..f, + * suppressing leading zeros". + */ + p = g_strdup(path); + for (at = strchr(p, '@'); at && *at; ) { + if (*at == '/') { + at = strchr(at, '@'); + } else { + *at = tolower(*at); + ++at; + } + } + + return fdt_path_offset(fdt, p); +} + +static uint32_t vof_finddevice(const void *fdt, uint32_t nodeaddr) +{ + char fullnode[VOF_MAX_PATH]; + uint32_t ret = -1; + int offset; + + if (readstr(nodeaddr, fullnode, sizeof(fullnode))) { + return (uint32_t) ret; + } + + offset = path_offset(fdt, fullnode); + if (offset >= 0) { + ret = fdt_get_phandle(fdt, offset); + } + trace_vof_finddevice(fullnode, ret); + return (uint32_t) ret; +} + +static const void *getprop(const void *fdt, int nodeoff, const char *propname, + int *proplen, bool *write0) +{ + const char *unit, *prop; + const void *ret = fdt_getprop(fdt, nodeoff, propname, proplen); + + if (ret) { + if (write0) { + *write0 = false; + } + return ret; + } + + if (strcmp(propname, "name")) { + return NULL; + } + /* + * We return a value for "name" from path if queried but property does not + * exist. @proplen does not include the unit part in this case. + */ + prop = fdt_get_name(fdt, nodeoff, proplen); + if (!prop) { + *proplen = 0; + return NULL; + } + + unit = memchr(prop, '@', *proplen); + if (unit) { + *proplen = unit - prop; + } + *proplen += 1; + + /* + * Since it might be cut at "@" and there will be no trailing zero + * in the prop buffer, tell the caller to write zero at the end. + */ + if (write0) { + *write0 = true; + } + return prop; +} + +static uint32_t vof_getprop(const void *fdt, uint32_t nodeph, uint32_t pname, + uint32_t valaddr, uint32_t vallen) +{ + char propname[OF_PROPNAME_LEN_MAX + 1]; + uint32_t ret = 0; + int proplen = 0; + const void *prop; + char trval[64] = ""; + int nodeoff = fdt_node_offset_by_phandle(fdt, nodeph); + bool write0; + + if (nodeoff < 0) { + return -1; + } + if (readstr(pname, propname, sizeof(propname))) { + return -1; + } + prop = getprop(fdt, nodeoff, propname, &proplen, &write0); + if (prop) { + const char zero = 0; + int cb = MIN(proplen, vallen); + + if (VOF_MEM_WRITE(valaddr, prop, cb) != MEMTX_OK || + /* if that was "name" with a unit address, overwrite '@' with '0' */ + (write0 && + cb == proplen && + VOF_MEM_WRITE(valaddr + cb - 1, &zero, 1) != MEMTX_OK)) { + ret = -1; + } else { + /* + * OF1275 says: + * "Size is either the actual size of the property, or -1 if name + * does not exist", hence returning proplen instead of cb. + */ + ret = proplen; + /* Do not format a value if tracepoint is silent, for performance */ + if (trace_event_get_state(TRACE_VOF_GETPROP) && + qemu_loglevel_mask(LOG_TRACE)) { + prop_format(trval, sizeof(trval), prop, ret); + } + } + } else { + ret = -1; + } + trace_vof_getprop(nodeph, propname, ret, trval); + + return ret; +} + +static uint32_t vof_getproplen(const void *fdt, uint32_t nodeph, uint32_t pname) +{ + char propname[OF_PROPNAME_LEN_MAX + 1]; + uint32_t ret = 0; + int proplen = 0; + const void *prop; + int nodeoff = fdt_node_offset_by_phandle(fdt, nodeph); + + if (nodeoff < 0) { + return -1; + } + if (readstr(pname, propname, sizeof(propname))) { + return -1; + } + prop = getprop(fdt, nodeoff, propname, &proplen, NULL); + if (prop) { + ret = proplen; + } else { + ret = -1; + } + trace_vof_getproplen(nodeph, propname, ret); + + return ret; +} + +static uint32_t vof_setprop(MachineState *ms, void *fdt, Vof *vof, + uint32_t nodeph, uint32_t pname, + uint32_t valaddr, uint32_t vallen) +{ + char propname[OF_PROPNAME_LEN_MAX + 1]; + uint32_t ret = -1; + int offset; + char trval[64] = ""; + char nodepath[VOF_MAX_PATH] = ""; + Object *vmo = object_dynamic_cast(OBJECT(ms), TYPE_VOF_MACHINE_IF); + VofMachineIfClass *vmc; + g_autofree char *val = NULL; + + if (vallen > VOF_MAX_SETPROPLEN) { + goto trace_exit; + } + if (readstr(pname, propname, sizeof(propname))) { + goto trace_exit; + } + offset = fdt_node_offset_by_phandle(fdt, nodeph); + if (offset < 0) { + goto trace_exit; + } + ret = get_path(fdt, offset, nodepath, sizeof(nodepath)); + if (ret <= 0) { + goto trace_exit; + } + + val = g_malloc0(vallen); + if (VOF_MEM_READ(valaddr, val, vallen) != MEMTX_OK) { + goto trace_exit; + } + + if (!vmo) { + goto trace_exit; + } + + vmc = VOF_MACHINE_GET_CLASS(vmo); + if (!vmc->setprop || !vmc->setprop(ms, nodepath, propname, val, vallen)) { + goto trace_exit; + } + + ret = fdt_setprop(fdt, offset, propname, val, vallen); + if (ret) { + goto trace_exit; + } + + if (trace_event_get_state(TRACE_VOF_SETPROP) && + qemu_loglevel_mask(LOG_TRACE)) { + prop_format(trval, sizeof(trval), val, vallen); + } + ret = vallen; + +trace_exit: + trace_vof_setprop(nodeph, propname, trval, vallen, ret); + + return ret; +} + +static uint32_t vof_nextprop(const void *fdt, uint32_t phandle, + uint32_t prevaddr, uint32_t nameaddr) +{ + int offset, nodeoff = fdt_node_offset_by_phandle(fdt, phandle); + char prev[OF_PROPNAME_LEN_MAX + 1]; + const char *tmp; + + if (readstr(prevaddr, prev, sizeof(prev))) { + return -1; + } + + fdt_for_each_property_offset(offset, fdt, nodeoff) { + if (!fdt_getprop_by_offset(fdt, offset, &tmp, NULL)) { + return 0; + } + if (prev[0] == '\0' || strcmp(prev, tmp) == 0) { + if (prev[0] != '\0') { + offset = fdt_next_property_offset(fdt, offset); + if (offset < 0) { + return 0; + } + } + if (!fdt_getprop_by_offset(fdt, offset, &tmp, NULL)) { + return 0; + } + + if (VOF_MEM_WRITE(nameaddr, tmp, strlen(tmp) + 1) != MEMTX_OK) { + return -1; + } + return 1; + } + } + + return 0; +} + +static uint32_t vof_peer(const void *fdt, uint32_t phandle) +{ + int ret; + + if (phandle == 0) { + ret = fdt_path_offset(fdt, "/"); + } else { + ret = fdt_next_subnode(fdt, fdt_node_offset_by_phandle(fdt, phandle)); + } + + if (ret < 0) { + ret = 0; + } else { + ret = fdt_get_phandle(fdt, ret); + } + + return ret; +} + +static uint32_t vof_child(const void *fdt, uint32_t phandle) +{ + int ret = fdt_first_subnode(fdt, fdt_node_offset_by_phandle(fdt, phandle)); + + if (ret < 0) { + ret = 0; + } else { + ret = fdt_get_phandle(fdt, ret); + } + + return ret; +} + +static uint32_t vof_parent(const void *fdt, uint32_t phandle) +{ + int ret = fdt_parent_offset(fdt, fdt_node_offset_by_phandle(fdt, phandle)); + + if (ret < 0) { + ret = 0; + } else { + ret = fdt_get_phandle(fdt, ret); + } + + return ret; +} + +static uint32_t vof_do_open(void *fdt, Vof *vof, int offset, const char *path) +{ + uint32_t ret = -1; + OfInstance *inst = NULL; + + if (vof->of_instance_last == 0xFFFFFFFF) { + /* We do not recycle ihandles yet */ + goto trace_exit; + } + + inst = g_new0(OfInstance, 1); + inst->phandle = fdt_get_phandle(fdt, offset); + g_assert(inst->phandle); + ++vof->of_instance_last; + + inst->path = g_strdup(path); + g_hash_table_insert(vof->of_instances, + GINT_TO_POINTER(vof->of_instance_last), + inst); + ret = vof->of_instance_last; + +trace_exit: + trace_vof_open(path, inst ? inst->phandle : 0, ret); + + return ret; +} + +uint32_t vof_client_open_store(void *fdt, Vof *vof, const char *nodename, + const char *prop, const char *path) +{ + int node = fdt_path_offset(fdt, nodename); + int inst, offset; + + offset = fdt_path_offset(fdt, path); + if (offset < 0) { + trace_vof_error_unknown_path(path); + return offset; + } + + inst = vof_do_open(fdt, vof, offset, path); + + return fdt_setprop_cell(fdt, node, prop, inst); +} + +static uint32_t vof_open(void *fdt, Vof *vof, uint32_t pathaddr) +{ + char path[VOF_MAX_PATH]; + int offset; + + if (readstr(pathaddr, path, sizeof(path))) { + return -1; + } + + offset = path_offset(fdt, path); + if (offset < 0) { + trace_vof_error_unknown_path(path); + return offset; + } + + return vof_do_open(fdt, vof, offset, path); +} + +static void vof_close(Vof *vof, uint32_t ihandle) +{ + if (!g_hash_table_remove(vof->of_instances, GINT_TO_POINTER(ihandle))) { + trace_vof_error_unknown_ihandle_close(ihandle); + } +} + +static uint32_t vof_instance_to_package(Vof *vof, uint32_t ihandle) +{ + gpointer instp = g_hash_table_lookup(vof->of_instances, + GINT_TO_POINTER(ihandle)); + uint32_t ret = -1; + + if (instp) { + ret = ((OfInstance *)instp)->phandle; + } + trace_vof_instance_to_package(ihandle, ret); + + return ret; +} + +static uint32_t vof_package_to_path(const void *fdt, uint32_t phandle, + uint32_t buf, uint32_t len) +{ + uint32_t ret = -1; + char tmp[VOF_MAX_PATH] = ""; + + ret = phandle_to_path(fdt, phandle, tmp, sizeof(tmp)); + if (ret > 0) { + if (VOF_MEM_WRITE(buf, tmp, ret) != MEMTX_OK) { + ret = -1; + } + } + + trace_vof_package_to_path(phandle, tmp, ret); + + return ret; +} + +static uint32_t vof_instance_to_path(void *fdt, Vof *vof, uint32_t ihandle, + uint32_t buf, uint32_t len) +{ + uint32_t ret = -1; + uint32_t phandle = vof_instance_to_package(vof, ihandle); + char tmp[VOF_MAX_PATH] = ""; + + if (phandle != -1) { + ret = phandle_to_path(fdt, phandle, tmp, sizeof(tmp)); + if (ret > 0) { + if (VOF_MEM_WRITE(buf, tmp, ret) != MEMTX_OK) { + ret = -1; + } + } + } + trace_vof_instance_to_path(ihandle, phandle, tmp, ret); + + return ret; +} + +static uint32_t vof_write(Vof *vof, uint32_t ihandle, uint32_t buf, + uint32_t len) +{ + char tmp[VOF_VTY_BUF_SIZE]; + unsigned cb; + OfInstance *inst = (OfInstance *) + g_hash_table_lookup(vof->of_instances, GINT_TO_POINTER(ihandle)); + + if (!inst) { + trace_vof_error_write(ihandle); + return -1; + } + + for ( ; len > 0; len -= cb) { + cb = MIN(len, sizeof(tmp) - 1); + if (VOF_MEM_READ(buf, tmp, cb) != MEMTX_OK) { + return -1; + } + + /* FIXME: there is no backend(s) yet so just call a trace */ + if (trace_event_get_state(TRACE_VOF_WRITE) && + qemu_loglevel_mask(LOG_TRACE)) { + tmp[cb] = '\0'; + trace_vof_write(ihandle, cb, tmp); + } + } + + return len; +} + +static void vof_claimed_dump(GArray *claimed) +{ + int i; + OfClaimed c; + + if (trace_event_get_state(TRACE_VOF_CLAIMED) && + qemu_loglevel_mask(LOG_TRACE)) { + + for (i = 0; i < claimed->len; ++i) { + c = g_array_index(claimed, OfClaimed, i); + trace_vof_claimed(c.start, c.start + c.size, c.size); + } + } +} + +static bool vof_claim_avail(GArray *claimed, uint64_t virt, uint64_t size) +{ + int i; + OfClaimed c; + + for (i = 0; i < claimed->len; ++i) { + c = g_array_index(claimed, OfClaimed, i); + if (ranges_overlap(c.start, c.size, virt, size)) { + return false; + } + } + + return true; +} + +static void vof_claim_add(GArray *claimed, uint64_t virt, uint64_t size) +{ + OfClaimed newclaim; + + newclaim.start = virt; + newclaim.size = size; + g_array_append_val(claimed, newclaim); +} + +static gint of_claimed_compare_func(gconstpointer a, gconstpointer b) +{ + return ((OfClaimed *)a)->start - ((OfClaimed *)b)->start; +} + +static void vof_dt_memory_available(void *fdt, GArray *claimed, uint64_t base) +{ + int i, n, offset, proplen = 0, sc, ac; + target_ulong mem0_end; + const uint8_t *mem0_reg; + g_autofree uint8_t *avail = NULL; + uint8_t *availcur; + + if (!fdt || !claimed) { + return; + } + + offset = fdt_path_offset(fdt, "/"); + _FDT(offset); + ac = fdt_address_cells(fdt, offset); + g_assert(ac == 1 || ac == 2); + sc = fdt_size_cells(fdt, offset); + g_assert(sc == 1 || sc == 2); + + offset = fdt_path_offset(fdt, "/memory@0"); + _FDT(offset); + + mem0_reg = fdt_getprop(fdt, offset, "reg", &proplen); + g_assert(mem0_reg && proplen == sizeof(uint32_t) * (ac + sc)); + if (sc == 2) { + mem0_end = be64_to_cpu(*(uint64_t *)(mem0_reg + sizeof(uint32_t) * ac)); + } else { + mem0_end = be32_to_cpu(*(uint32_t *)(mem0_reg + sizeof(uint32_t) * ac)); + } + + g_array_sort(claimed, of_claimed_compare_func); + vof_claimed_dump(claimed); + + /* + * VOF resides in the first page so we do not need to check if there is + * available memory before the first claimed block + */ + g_assert(claimed->len && (g_array_index(claimed, OfClaimed, 0).start == 0)); + + avail = g_malloc0(sizeof(uint32_t) * (ac + sc) * claimed->len); + for (i = 0, n = 0, availcur = avail; i < claimed->len; ++i) { + OfClaimed c = g_array_index(claimed, OfClaimed, i); + uint64_t start, size; + + start = c.start + c.size; + if (i < claimed->len - 1) { + OfClaimed cn = g_array_index(claimed, OfClaimed, i + 1); + + size = cn.start - start; + } else { + size = mem0_end - start; + } + + if (ac == 2) { + *(uint64_t *) availcur = cpu_to_be64(start); + } else { + *(uint32_t *) availcur = cpu_to_be32(start); + } + availcur += sizeof(uint32_t) * ac; + if (sc == 2) { + *(uint64_t *) availcur = cpu_to_be64(size); + } else { + *(uint32_t *) availcur = cpu_to_be32(size); + } + availcur += sizeof(uint32_t) * sc; + + if (size) { + trace_vof_avail(c.start + c.size, c.start + c.size + size, size); + ++n; + } + } + _FDT((fdt_setprop(fdt, offset, "available", avail, availcur - avail))); +} + +/* + * OF1275: + * "Allocates size bytes of memory. If align is zero, the allocated range + * begins at the virtual address virt. Otherwise, an aligned address is + * automatically chosen and the input argument virt is ignored". + * + * In other words, exactly one of @virt and @align is non-zero. + */ +uint64_t vof_claim(Vof *vof, uint64_t virt, uint64_t size, + uint64_t align) +{ + uint64_t ret; + + if (size == 0) { + ret = -1; + } else if (align == 0) { + if (!vof_claim_avail(vof->claimed, virt, size)) { + ret = -1; + } else { + ret = virt; + } + } else { + vof->claimed_base = QEMU_ALIGN_UP(vof->claimed_base, align); + while (1) { + if (vof->claimed_base >= vof->top_addr) { + error_report("Out of RMA memory for the OF client"); + return -1; + } + if (vof_claim_avail(vof->claimed, vof->claimed_base, size)) { + break; + } + vof->claimed_base += size; + } + ret = vof->claimed_base; + } + + if (ret != -1) { + vof->claimed_base = MAX(vof->claimed_base, ret + size); + vof_claim_add(vof->claimed, ret, size); + } + trace_vof_claim(virt, size, align, ret); + + return ret; +} + +static uint32_t vof_release(Vof *vof, uint64_t virt, uint64_t size) +{ + uint32_t ret = -1; + int i; + GArray *claimed = vof->claimed; + OfClaimed c; + + for (i = 0; i < claimed->len; ++i) { + c = g_array_index(claimed, OfClaimed, i); + if (c.start == virt && c.size == size) { + g_array_remove_index(claimed, i); + ret = 0; + break; + } + } + + trace_vof_release(virt, size, ret); + + return ret; +} + +static void vof_instantiate_rtas(Error **errp) +{ + error_setg(errp, "The firmware should have instantiated RTAS"); +} + +static uint32_t vof_call_method(MachineState *ms, Vof *vof, uint32_t methodaddr, + uint32_t ihandle, uint32_t param1, + uint32_t param2, uint32_t param3, + uint32_t param4, uint32_t *ret2) +{ + uint32_t ret = -1; + char method[VOF_MAX_METHODLEN] = ""; + OfInstance *inst; + + if (!ihandle) { + goto trace_exit; + } + + inst = (OfInstance *)g_hash_table_lookup(vof->of_instances, + GINT_TO_POINTER(ihandle)); + if (!inst) { + goto trace_exit; + } + + if (readstr(methodaddr, method, sizeof(method))) { + goto trace_exit; + } + + if (strcmp(inst->path, "/") == 0) { + if (strcmp(method, "ibm,client-architecture-support") == 0) { + Object *vmo = object_dynamic_cast(OBJECT(ms), TYPE_VOF_MACHINE_IF); + + if (vmo) { + VofMachineIfClass *vmc = VOF_MACHINE_GET_CLASS(vmo); + + g_assert(vmc->client_architecture_support); + ret = vmc->client_architecture_support(ms, first_cpu, param1); + } + + *ret2 = 0; + } + } else if (strcmp(inst->path, "/rtas") == 0) { + if (strcmp(method, "instantiate-rtas") == 0) { + vof_instantiate_rtas(&error_fatal); + ret = 0; + *ret2 = param1; /* rtas-base */ + } + } else { + trace_vof_error_unknown_method(method); + } + +trace_exit: + trace_vof_method(ihandle, method, param1, ret, *ret2); + + return ret; +} + +static uint32_t vof_call_interpret(uint32_t cmdaddr, uint32_t param1, + uint32_t param2, uint32_t *ret2) +{ + uint32_t ret = -1; + char cmd[VOF_MAX_FORTHCODE] = ""; + + /* No interpret implemented so just call a trace */ + readstr(cmdaddr, cmd, sizeof(cmd)); + trace_vof_interpret(cmd, param1, param2, ret, *ret2); + + return ret; +} + +static void vof_quiesce(MachineState *ms, void *fdt, Vof *vof) +{ + Object *vmo = object_dynamic_cast(OBJECT(ms), TYPE_VOF_MACHINE_IF); + /* After "quiesce", no change is expected to the FDT, pack FDT to ensure */ + int rc = fdt_pack(fdt); + + assert(rc == 0); + + if (vmo) { + VofMachineIfClass *vmc = VOF_MACHINE_GET_CLASS(vmo); + + if (vmc->quiesce) { + vmc->quiesce(ms); + } + } + + vof_claimed_dump(vof->claimed); +} + +static uint32_t vof_client_handle(MachineState *ms, void *fdt, Vof *vof, + const char *service, + uint32_t *args, unsigned nargs, + uint32_t *rets, unsigned nrets) +{ + uint32_t ret = 0; + + /* @nrets includes the value which this function returns */ +#define cmpserv(s, a, r) \ + cmpservice(service, nargs, nrets, (s), (a), (r)) + + if (cmpserv("finddevice", 1, 1)) { + ret = vof_finddevice(fdt, args[0]); + } else if (cmpserv("getprop", 4, 1)) { + ret = vof_getprop(fdt, args[0], args[1], args[2], args[3]); + } else if (cmpserv("getproplen", 2, 1)) { + ret = vof_getproplen(fdt, args[0], args[1]); + } else if (cmpserv("setprop", 4, 1)) { + ret = vof_setprop(ms, fdt, vof, args[0], args[1], args[2], args[3]); + } else if (cmpserv("nextprop", 3, 1)) { + ret = vof_nextprop(fdt, args[0], args[1], args[2]); + } else if (cmpserv("peer", 1, 1)) { + ret = vof_peer(fdt, args[0]); + } else if (cmpserv("child", 1, 1)) { + ret = vof_child(fdt, args[0]); + } else if (cmpserv("parent", 1, 1)) { + ret = vof_parent(fdt, args[0]); + } else if (cmpserv("open", 1, 1)) { + ret = vof_open(fdt, vof, args[0]); + } else if (cmpserv("close", 1, 0)) { + vof_close(vof, args[0]); + } else if (cmpserv("instance-to-package", 1, 1)) { + ret = vof_instance_to_package(vof, args[0]); + } else if (cmpserv("package-to-path", 3, 1)) { + ret = vof_package_to_path(fdt, args[0], args[1], args[2]); + } else if (cmpserv("instance-to-path", 3, 1)) { + ret = vof_instance_to_path(fdt, vof, args[0], args[1], args[2]); + } else if (cmpserv("write", 3, 1)) { + ret = vof_write(vof, args[0], args[1], args[2]); + } else if (cmpserv("claim", 3, 1)) { + ret = vof_claim(vof, args[0], args[1], args[2]); + if (ret != -1) { + vof_dt_memory_available(fdt, vof->claimed, vof->claimed_base); + } + } else if (cmpserv("release", 2, 0)) { + ret = vof_release(vof, args[0], args[1]); + if (ret != -1) { + vof_dt_memory_available(fdt, vof->claimed, vof->claimed_base); + } + } else if (cmpserv("call-method", 0, 0)) { + ret = vof_call_method(ms, vof, args[0], args[1], args[2], args[3], + args[4], args[5], rets); + } else if (cmpserv("interpret", 0, 0)) { + ret = vof_call_interpret(args[0], args[1], args[2], rets); + } else if (cmpserv("milliseconds", 0, 1)) { + ret = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL); + } else if (cmpserv("quiesce", 0, 0)) { + vof_quiesce(ms, fdt, vof); + } else if (cmpserv("exit", 0, 0)) { + error_report("Stopped as the VM requested \"exit\""); + vm_stop(RUN_STATE_PAUSED); + } else { + trace_vof_error_unknown_service(service, nargs, nrets); + ret = -1; + } + +#undef cmpserv + + return ret; +} + +/* Defined as Big Endian */ +struct prom_args { + uint32_t service; + uint32_t nargs; + uint32_t nret; + uint32_t args[10]; +} QEMU_PACKED; + +int vof_client_call(MachineState *ms, Vof *vof, void *fdt, + target_ulong args_real) +{ + struct prom_args args_be; + uint32_t args[ARRAY_SIZE(args_be.args)]; + uint32_t rets[ARRAY_SIZE(args_be.args)] = { 0 }, ret; + char service[64]; + unsigned nargs, nret, i; + + if (VOF_MEM_READ(args_real, &args_be, sizeof(args_be)) != MEMTX_OK) { + return -EINVAL; + } + nargs = be32_to_cpu(args_be.nargs); + if (nargs >= ARRAY_SIZE(args_be.args)) { + return -EINVAL; + } + + if (VOF_MEM_READ(be32_to_cpu(args_be.service), service, sizeof(service)) != + MEMTX_OK) { + return -EINVAL; + } + if (strnlen(service, sizeof(service)) == sizeof(service)) { + /* Too long service name */ + return -EINVAL; + } + + for (i = 0; i < nargs; ++i) { + args[i] = be32_to_cpu(args_be.args[i]); + } + + nret = be32_to_cpu(args_be.nret); + ret = vof_client_handle(ms, fdt, vof, service, args, nargs, rets, nret); + if (!nret) { + return 0; + } + + args_be.args[nargs] = cpu_to_be32(ret); + for (i = 1; i < nret; ++i) { + args_be.args[nargs + i] = cpu_to_be32(rets[i - 1]); + } + + if (VOF_MEM_WRITE(args_real + offsetof(struct prom_args, args[nargs]), + args_be.args + nargs, sizeof(args_be.args[0]) * nret) != + MEMTX_OK) { + return -EINVAL; + } + + return 0; +} + +static void vof_instance_free(gpointer data) +{ + OfInstance *inst = (OfInstance *)data; + + g_free(inst->path); + g_free(inst); +} + +void vof_init(Vof *vof, uint64_t top_addr, Error **errp) +{ + vof_cleanup(vof); + + vof->of_instances = g_hash_table_new_full(g_direct_hash, g_direct_equal, + NULL, vof_instance_free); + vof->claimed = g_array_new(false, false, sizeof(OfClaimed)); + + /* Keep allocations in 32bit as CLI ABI can only return cells==32bit */ + vof->top_addr = MIN(top_addr, 4 * GiB); + if (vof_claim(vof, 0, vof->fw_size, 0) == -1) { + error_setg(errp, "Memory for firmware is in use"); + } +} + +void vof_cleanup(Vof *vof) +{ + if (vof->claimed) { + g_array_unref(vof->claimed); + } + if (vof->of_instances) { + g_hash_table_unref(vof->of_instances); + } + vof->claimed = NULL; + vof->of_instances = NULL; +} + +void vof_build_dt(void *fdt, Vof *vof) +{ + uint32_t phandle = fdt_get_max_phandle(fdt); + int offset, proplen = 0; + const void *prop; + + /* Assign phandles to nodes without predefined phandles (like XICS/XIVE) */ + for (offset = fdt_next_node(fdt, -1, NULL); + offset >= 0; + offset = fdt_next_node(fdt, offset, NULL)) { + prop = fdt_getprop(fdt, offset, "phandle", &proplen); + if (prop) { + continue; + } + ++phandle; + _FDT(fdt_setprop_cell(fdt, offset, "phandle", phandle)); + } + + vof_dt_memory_available(fdt, vof->claimed, vof->claimed_base); +} + +static const TypeInfo vof_machine_if_info = { + .name = TYPE_VOF_MACHINE_IF, + .parent = TYPE_INTERFACE, + .class_size = sizeof(VofMachineIfClass), +}; + +static void vof_machine_if_register_types(void) +{ + type_register_static(&vof_machine_if_info); +} +type_init(vof_machine_if_register_types) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index ae5654fcdb..3f0d111360 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -36,6 +36,7 @@ #include "qemu/range.h" #include "sysemu/kvm.h" #include "sysemu/reset.h" +#include "sysemu/runstate.h" #include "trace.h" #include "qapi/error.h" #include "migration/migration.h" @@ -134,6 +135,29 @@ static const char *index_to_str(VFIODevice *vbasedev, int index) } } +static int vfio_ram_block_discard_disable(VFIOContainer *container, bool state) +{ + switch (container->iommu_type) { + case VFIO_TYPE1v2_IOMMU: + case VFIO_TYPE1_IOMMU: + /* + * We support coordinated discarding of RAM via the RamDiscardManager. + */ + return ram_block_uncoordinated_discard_disable(state); + default: + /* + * VFIO_SPAPR_TCE_IOMMU most probably works just fine with + * RamDiscardManager, however, it is completely untested. + * + * VFIO_SPAPR_TCE_v2_IOMMU with "DMA memory preregistering" does + * completely the opposite of managing mapping/pinning dynamically as + * required by RamDiscardManager. We would have to special-case sections + * with a RamDiscardManager. + */ + return ram_block_discard_disable(state); + } +} + int vfio_set_irq_signaling(VFIODevice *vbasedev, int index, int subindex, int action, int fd, Error **errp) { @@ -569,6 +593,44 @@ static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr, error_report("iommu map to non memory area %"HWADDR_PRIx"", xlat); return false; + } else if (memory_region_has_ram_discard_manager(mr)) { + RamDiscardManager *rdm = memory_region_get_ram_discard_manager(mr); + MemoryRegionSection tmp = { + .mr = mr, + .offset_within_region = xlat, + .size = int128_make64(len), + }; + + /* + * Malicious VMs can map memory into the IOMMU, which is expected + * to remain discarded. vfio will pin all pages, populating memory. + * Disallow that. vmstate priorities make sure any RamDiscardManager + * were already restored before IOMMUs are restored. + */ + if (!ram_discard_manager_is_populated(rdm, &tmp)) { + error_report("iommu map to discarded memory (e.g., unplugged via" + " virtio-mem): %"HWADDR_PRIx"", + iotlb->translated_addr); + return false; + } + + /* + * Malicious VMs might trigger discarding of IOMMU-mapped memory. The + * pages will remain pinned inside vfio until unmapped, resulting in a + * higher memory consumption than expected. If memory would get + * populated again later, there would be an inconsistency between pages + * pinned by vfio and pages seen by QEMU. This is the case until + * unmapped from the IOMMU (e.g., during device reset). + * + * With malicious guests, we really only care about pinning more memory + * than expected. RLIMIT_MEMLOCK set for the user/process can never be + * exceeded and can be used to mitigate this problem. + */ + warn_report_once("Using vfio with vIOMMUs and coordinated discarding of" + " RAM (e.g., virtio-mem) works, however, malicious" + " guests can trigger pinning of more memory than" + " intended via an IOMMU. It's possible to mitigate " + " by setting/adjusting RLIMIT_MEMLOCK."); } /* @@ -649,6 +711,153 @@ out: rcu_read_unlock(); } +static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, + MemoryRegionSection *section) +{ + VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, + listener); + const hwaddr size = int128_get64(section->size); + const hwaddr iova = section->offset_within_address_space; + int ret; + + /* Unmap with a single call. */ + ret = vfio_dma_unmap(vrdl->container, iova, size , NULL); + if (ret) { + error_report("%s: vfio_dma_unmap() failed: %s", __func__, + strerror(-ret)); + } +} + +static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl, + MemoryRegionSection *section) +{ + VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener, + listener); + const hwaddr end = section->offset_within_region + + int128_get64(section->size); + hwaddr start, next, iova; + void *vaddr; + int ret; + + /* + * Map in (aligned within memory region) minimum granularity, so we can + * unmap in minimum granularity later. + */ + for (start = section->offset_within_region; start < end; start = next) { + next = ROUND_UP(start + 1, vrdl->granularity); + next = MIN(next, end); + + iova = start - section->offset_within_region + + section->offset_within_address_space; + vaddr = memory_region_get_ram_ptr(section->mr) + start; + + ret = vfio_dma_map(vrdl->container, iova, next - start, + vaddr, section->readonly); + if (ret) { + /* Rollback */ + vfio_ram_discard_notify_discard(rdl, section); + return ret; + } + } + return 0; +} + +static void vfio_register_ram_discard_listener(VFIOContainer *container, + MemoryRegionSection *section) +{ + RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); + VFIORamDiscardListener *vrdl; + + /* Ignore some corner cases not relevant in practice. */ + g_assert(QEMU_IS_ALIGNED(section->offset_within_region, TARGET_PAGE_SIZE)); + g_assert(QEMU_IS_ALIGNED(section->offset_within_address_space, + TARGET_PAGE_SIZE)); + g_assert(QEMU_IS_ALIGNED(int128_get64(section->size), TARGET_PAGE_SIZE)); + + vrdl = g_new0(VFIORamDiscardListener, 1); + vrdl->container = container; + vrdl->mr = section->mr; + vrdl->offset_within_address_space = section->offset_within_address_space; + vrdl->size = int128_get64(section->size); + vrdl->granularity = ram_discard_manager_get_min_granularity(rdm, + section->mr); + + g_assert(vrdl->granularity && is_power_of_2(vrdl->granularity)); + g_assert(vrdl->granularity >= 1 << ctz64(container->pgsizes)); + + ram_discard_listener_init(&vrdl->listener, + vfio_ram_discard_notify_populate, + vfio_ram_discard_notify_discard, true); + ram_discard_manager_register_listener(rdm, &vrdl->listener, section); + QLIST_INSERT_HEAD(&container->vrdl_list, vrdl, next); + + /* + * Sanity-check if we have a theoretically problematic setup where we could + * exceed the maximum number of possible DMA mappings over time. We assume + * that each mapped section in the same address space as a RamDiscardManager + * section consumes exactly one DMA mapping, with the exception of + * RamDiscardManager sections; i.e., we don't expect to have gIOMMU sections + * in the same address space as RamDiscardManager sections. + * + * We assume that each section in the address space consumes one memslot. + * We take the number of KVM memory slots as a best guess for the maximum + * number of sections in the address space we could have over time, + * also consuming DMA mappings. + */ + if (container->dma_max_mappings) { + unsigned int vrdl_count = 0, vrdl_mappings = 0, max_memslots = 512; + +#ifdef CONFIG_KVM + if (kvm_enabled()) { + max_memslots = kvm_get_max_memslots(); + } +#endif + + QLIST_FOREACH(vrdl, &container->vrdl_list, next) { + hwaddr start, end; + + start = QEMU_ALIGN_DOWN(vrdl->offset_within_address_space, + vrdl->granularity); + end = ROUND_UP(vrdl->offset_within_address_space + vrdl->size, + vrdl->granularity); + vrdl_mappings += (end - start) / vrdl->granularity; + vrdl_count++; + } + + if (vrdl_mappings + max_memslots - vrdl_count > + container->dma_max_mappings) { + warn_report("%s: possibly running out of DMA mappings. E.g., try" + " increasing the 'block-size' of virtio-mem devies." + " Maximum possible DMA mappings: %d, Maximum possible" + " memslots: %d", __func__, container->dma_max_mappings, + max_memslots); + } + } +} + +static void vfio_unregister_ram_discard_listener(VFIOContainer *container, + MemoryRegionSection *section) +{ + RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); + VFIORamDiscardListener *vrdl = NULL; + + QLIST_FOREACH(vrdl, &container->vrdl_list, next) { + if (vrdl->mr == section->mr && + vrdl->offset_within_address_space == + section->offset_within_address_space) { + break; + } + } + + if (!vrdl) { + hw_error("vfio: Trying to unregister missing RAM discard listener"); + } + + ram_discard_manager_unregister_listener(rdm, &vrdl->listener); + QLIST_REMOVE(vrdl, next); + g_free(vrdl); +} + static void vfio_listener_region_add(MemoryListener *listener, MemoryRegionSection *section) { @@ -810,6 +1019,16 @@ static void vfio_listener_region_add(MemoryListener *listener, /* Here we assume that memory_region_is_ram(section->mr)==true */ + /* + * For RAM memory regions with a RamDiscardManager, we only want to map the + * actually populated parts - and update the mapping whenever we're notified + * about changes. + */ + if (memory_region_has_ram_discard_manager(section->mr)) { + vfio_register_ram_discard_listener(container, section); + return; + } + vaddr = memory_region_get_ram_ptr(section->mr) + section->offset_within_region + (iova - section->offset_within_address_space); @@ -947,6 +1166,10 @@ static void vfio_listener_region_del(MemoryListener *listener, pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1; try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask)); + } else if (memory_region_has_ram_discard_manager(section->mr)) { + vfio_unregister_ram_discard_listener(container, section); + /* Unregistering will trigger an unmap. */ + try_unmap = false; } if (try_unmap) { @@ -1108,6 +1331,49 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) rcu_read_unlock(); } +static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section, + void *opaque) +{ + const hwaddr size = int128_get64(section->size); + const hwaddr iova = section->offset_within_address_space; + const ram_addr_t ram_addr = memory_region_get_ram_addr(section->mr) + + section->offset_within_region; + VFIORamDiscardListener *vrdl = opaque; + + /* + * Sync the whole mapped region (spanning multiple individual mappings) + * in one go. + */ + return vfio_get_dirty_bitmap(vrdl->container, iova, size, ram_addr); +} + +static int vfio_sync_ram_discard_listener_dirty_bitmap(VFIOContainer *container, + MemoryRegionSection *section) +{ + RamDiscardManager *rdm = memory_region_get_ram_discard_manager(section->mr); + VFIORamDiscardListener *vrdl = NULL; + + QLIST_FOREACH(vrdl, &container->vrdl_list, next) { + if (vrdl->mr == section->mr && + vrdl->offset_within_address_space == + section->offset_within_address_space) { + break; + } + } + + if (!vrdl) { + hw_error("vfio: Trying to sync missing RAM discard listener"); + } + + /* + * We only want/can synchronize the bitmap for actually mapped parts - + * which correspond to populated parts. Replay all populated parts. + */ + return ram_discard_manager_replay_populated(rdm, section, + vfio_ram_discard_get_dirty_bitmap, + &vrdl); +} + static int vfio_sync_dirty_bitmap(VFIOContainer *container, MemoryRegionSection *section) { @@ -1139,6 +1405,8 @@ static int vfio_sync_dirty_bitmap(VFIOContainer *container, } } return 0; + } else if (memory_region_has_ram_discard_manager(section->mr)) { + return vfio_sync_ram_discard_listener_dirty_bitmap(container, section); } ram_addr = memory_region_get_ram_addr(section->mr) + @@ -1732,15 +2000,25 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, * new memory, it will not yet set ram_block_discard_set_required() and * therefore, neither stops us here or deals with the sudden memory * consumption of inflated memory. + * + * We do support discarding of memory coordinated via the RamDiscardManager + * with some IOMMU types. vfio_ram_block_discard_disable() handles the + * details once we know which type of IOMMU we are using. */ - ret = ram_block_discard_disable(true); - if (ret) { - error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken"); - return ret; - } QLIST_FOREACH(container, &space->containers, next) { if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { + ret = vfio_ram_block_discard_disable(container, true); + if (ret) { + error_setg_errno(errp, -ret, + "Cannot set discarding of RAM broken"); + if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, + &container->fd)) { + error_report("vfio: error disconnecting group %d from" + " container", group->groupid); + } + return ret; + } group->container = container; QLIST_INSERT_HEAD(&container->group_list, group, container_next); vfio_kvm_device_add_group(group); @@ -1768,14 +2046,22 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, container->fd = fd; container->error = NULL; container->dirty_pages_supported = false; + container->dma_max_mappings = 0; QLIST_INIT(&container->giommu_list); QLIST_INIT(&container->hostwin_list); + QLIST_INIT(&container->vrdl_list); ret = vfio_init_container(container, group->fd, errp); if (ret) { goto free_container_exit; } + ret = vfio_ram_block_discard_disable(container, true); + if (ret) { + error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken"); + goto free_container_exit; + } + switch (container->iommu_type) { case VFIO_TYPE1v2_IOMMU: case VFIO_TYPE1_IOMMU: @@ -1798,7 +2084,10 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, vfio_host_win_add(container, 0, (hwaddr)-1, info->iova_pgsizes); container->pgsizes = info->iova_pgsizes; + /* The default in the kernel ("dma_entry_limit") is 65535. */ + container->dma_max_mappings = 65535; if (!ret) { + vfio_get_info_dma_avail(info, &container->dma_max_mappings); vfio_get_iommu_info_migration(container, info); } g_free(info); @@ -1820,7 +2109,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, if (ret) { error_setg_errno(errp, errno, "failed to enable container"); ret = -errno; - goto free_container_exit; + goto enable_discards_exit; } } else { container->prereg_listener = vfio_prereg_listener; @@ -1832,7 +2121,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, ret = -1; error_propagate_prepend(errp, container->error, "RAM memory listener initialization failed: "); - goto free_container_exit; + goto enable_discards_exit; } } @@ -1845,7 +2134,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, if (v2) { memory_listener_unregister(&container->prereg_listener); } - goto free_container_exit; + goto enable_discards_exit; } if (v2) { @@ -1860,7 +2149,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as, if (ret) { error_setg_errno(errp, -ret, "failed to remove existing window"); - goto free_container_exit; + goto enable_discards_exit; } } else { /* The default table uses 4K pages */ @@ -1901,6 +2190,9 @@ listener_release_exit: vfio_kvm_device_del_group(group); vfio_listener_release(container); +enable_discards_exit: + vfio_ram_block_discard_disable(container, false); + free_container_exit: g_free(container); @@ -1908,7 +2200,6 @@ close_fd_exit: close(fd); put_space_exit: - ram_block_discard_disable(false); vfio_put_address_space(space); return ret; @@ -2030,7 +2321,7 @@ void vfio_put_group(VFIOGroup *group) } if (!group->ram_block_discard_allowed) { - ram_block_discard_disable(false); + vfio_ram_block_discard_disable(group->container, false); } vfio_kvm_device_del_group(group); vfio_disconnect_container(group); @@ -2084,7 +2375,7 @@ int vfio_get_device(VFIOGroup *group, const char *name, if (!group->ram_block_discard_allowed) { group->ram_block_discard_allowed = true; - ram_block_discard_disable(false); + vfio_ram_block_discard_disable(group->container, false); } } diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 1ac4a2ebec..29ea2b4fce 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -1913,7 +1913,10 @@ static int vhost_user_backend_init(struct vhost_dev *dev, void *opaque, if (err < 0) { return -EPROTO; } + } else { + dev->max_queues = 1; } + if (dev->num_queues && dev->max_queues < dev->num_queues) { error_setg(errp, "The maximum number of queues supported by the " "backend is %" PRIu64, dev->max_queues); diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c index 75aa7d6f1b..df91e454b2 100644 --- a/hw/virtio/virtio-mem.c +++ b/hw/virtio/virtio-mem.c @@ -145,7 +145,173 @@ static bool virtio_mem_is_busy(void) return migration_in_incoming_postcopy() || !migration_is_idle(); } -static bool virtio_mem_test_bitmap(VirtIOMEM *vmem, uint64_t start_gpa, +typedef int (*virtio_mem_range_cb)(const VirtIOMEM *vmem, void *arg, + uint64_t offset, uint64_t size); + +static int virtio_mem_for_each_unplugged_range(const VirtIOMEM *vmem, void *arg, + virtio_mem_range_cb cb) +{ + unsigned long first_zero_bit, last_zero_bit; + uint64_t offset, size; + int ret = 0; + + first_zero_bit = find_first_zero_bit(vmem->bitmap, vmem->bitmap_size); + while (first_zero_bit < vmem->bitmap_size) { + offset = first_zero_bit * vmem->block_size; + last_zero_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, + first_zero_bit + 1) - 1; + size = (last_zero_bit - first_zero_bit + 1) * vmem->block_size; + + ret = cb(vmem, arg, offset, size); + if (ret) { + break; + } + first_zero_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, + last_zero_bit + 2); + } + return ret; +} + +/* + * Adjust the memory section to cover the intersection with the given range. + * + * Returns false if the intersection is empty, otherwise returns true. + */ +static bool virito_mem_intersect_memory_section(MemoryRegionSection *s, + uint64_t offset, uint64_t size) +{ + uint64_t start = MAX(s->offset_within_region, offset); + uint64_t end = MIN(s->offset_within_region + int128_get64(s->size), + offset + size); + + if (end <= start) { + return false; + } + + s->offset_within_address_space += start - s->offset_within_region; + s->offset_within_region = start; + s->size = int128_make64(end - start); + return true; +} + +typedef int (*virtio_mem_section_cb)(MemoryRegionSection *s, void *arg); + +static int virtio_mem_for_each_plugged_section(const VirtIOMEM *vmem, + MemoryRegionSection *s, + void *arg, + virtio_mem_section_cb cb) +{ + unsigned long first_bit, last_bit; + uint64_t offset, size; + int ret = 0; + + first_bit = s->offset_within_region / vmem->bitmap_size; + first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, first_bit); + while (first_bit < vmem->bitmap_size) { + MemoryRegionSection tmp = *s; + + offset = first_bit * vmem->block_size; + last_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, + first_bit + 1) - 1; + size = (last_bit - first_bit + 1) * vmem->block_size; + + if (!virito_mem_intersect_memory_section(&tmp, offset, size)) { + break; + } + ret = cb(&tmp, arg); + if (ret) { + break; + } + first_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, + last_bit + 2); + } + return ret; +} + +static int virtio_mem_notify_populate_cb(MemoryRegionSection *s, void *arg) +{ + RamDiscardListener *rdl = arg; + + return rdl->notify_populate(rdl, s); +} + +static int virtio_mem_notify_discard_cb(MemoryRegionSection *s, void *arg) +{ + RamDiscardListener *rdl = arg; + + rdl->notify_discard(rdl, s); + return 0; +} + +static void virtio_mem_notify_unplug(VirtIOMEM *vmem, uint64_t offset, + uint64_t size) +{ + RamDiscardListener *rdl; + + QLIST_FOREACH(rdl, &vmem->rdl_list, next) { + MemoryRegionSection tmp = *rdl->section; + + if (!virito_mem_intersect_memory_section(&tmp, offset, size)) { + continue; + } + rdl->notify_discard(rdl, &tmp); + } +} + +static int virtio_mem_notify_plug(VirtIOMEM *vmem, uint64_t offset, + uint64_t size) +{ + RamDiscardListener *rdl, *rdl2; + int ret = 0; + + QLIST_FOREACH(rdl, &vmem->rdl_list, next) { + MemoryRegionSection tmp = *rdl->section; + + if (!virito_mem_intersect_memory_section(&tmp, offset, size)) { + continue; + } + ret = rdl->notify_populate(rdl, &tmp); + if (ret) { + break; + } + } + + if (ret) { + /* Notify all already-notified listeners. */ + QLIST_FOREACH(rdl2, &vmem->rdl_list, next) { + MemoryRegionSection tmp = *rdl->section; + + if (rdl2 == rdl) { + break; + } + if (!virito_mem_intersect_memory_section(&tmp, offset, size)) { + continue; + } + rdl2->notify_discard(rdl2, &tmp); + } + } + return ret; +} + +static void virtio_mem_notify_unplug_all(VirtIOMEM *vmem) +{ + RamDiscardListener *rdl; + + if (!vmem->size) { + return; + } + + QLIST_FOREACH(rdl, &vmem->rdl_list, next) { + if (rdl->double_discard_supported) { + rdl->notify_discard(rdl, rdl->section); + } else { + virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl, + virtio_mem_notify_discard_cb); + } + } +} + +static bool virtio_mem_test_bitmap(const VirtIOMEM *vmem, uint64_t start_gpa, uint64_t size, bool plugged) { const unsigned long first_bit = (start_gpa - vmem->addr) / vmem->block_size; @@ -198,7 +364,8 @@ static void virtio_mem_send_response_simple(VirtIOMEM *vmem, virtio_mem_send_response(vmem, elem, &resp); } -static bool virtio_mem_valid_range(VirtIOMEM *vmem, uint64_t gpa, uint64_t size) +static bool virtio_mem_valid_range(const VirtIOMEM *vmem, uint64_t gpa, + uint64_t size) { if (!QEMU_IS_ALIGNED(gpa, vmem->block_size)) { return false; @@ -219,19 +386,21 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa, uint64_t size, bool plug) { const uint64_t offset = start_gpa - vmem->addr; - int ret; + RAMBlock *rb = vmem->memdev->mr.ram_block; if (virtio_mem_is_busy()) { return -EBUSY; } if (!plug) { - ret = ram_block_discard_range(vmem->memdev->mr.ram_block, offset, size); - if (ret) { - error_report("Unexpected error discarding RAM: %s", - strerror(-ret)); + if (ram_block_discard_range(rb, offset, size)) { return -EBUSY; } + virtio_mem_notify_unplug(vmem, offset, size); + } else if (virtio_mem_notify_plug(vmem, offset, size)) { + /* Could be a mapping attempt resulted in memory getting populated. */ + ram_block_discard_range(vmem->memdev->mr.ram_block, offset, size); + return -EBUSY; } virtio_mem_set_bitmap(vmem, start_gpa, size, plug); return 0; @@ -318,17 +487,16 @@ static void virtio_mem_resize_usable_region(VirtIOMEM *vmem, static int virtio_mem_unplug_all(VirtIOMEM *vmem) { RAMBlock *rb = vmem->memdev->mr.ram_block; - int ret; if (virtio_mem_is_busy()) { return -EBUSY; } - ret = ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb)); - if (ret) { - error_report("Unexpected error discarding RAM: %s", strerror(-ret)); + if (ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb))) { return -EBUSY; } + virtio_mem_notify_unplug_all(vmem); + bitmap_clear(vmem->bitmap, 0, vmem->bitmap_size); if (vmem->size) { vmem->size = 0; @@ -551,7 +719,7 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp) return; } - if (ram_block_discard_require(true)) { + if (ram_block_coordinated_discard_require(true)) { error_setg(errp, "Discarding RAM is disabled"); return; } @@ -559,7 +727,7 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp) ret = ram_block_discard_range(rb, 0, qemu_ram_get_used_length(rb)); if (ret) { error_setg_errno(errp, -ret, "Unexpected error discarding RAM"); - ram_block_discard_require(false); + ram_block_coordinated_discard_require(false); return; } @@ -577,6 +745,13 @@ static void virtio_mem_device_realize(DeviceState *dev, Error **errp) vmstate_register_ram(&vmem->memdev->mr, DEVICE(vmem)); qemu_register_reset(virtio_mem_system_reset, vmem); precopy_add_notifier(&vmem->precopy_notifier); + + /* + * Set ourselves as RamDiscardManager before the plug handler maps the + * memory region and exposes it via an address space. + */ + memory_region_set_ram_discard_manager(&vmem->memdev->mr, + RAM_DISCARD_MANAGER(vmem)); } static void virtio_mem_device_unrealize(DeviceState *dev) @@ -584,6 +759,11 @@ static void virtio_mem_device_unrealize(DeviceState *dev) VirtIODevice *vdev = VIRTIO_DEVICE(dev); VirtIOMEM *vmem = VIRTIO_MEM(dev); + /* + * The unplug handler unmapped the memory region, it cannot be + * found via an address space anymore. Unset ourselves. + */ + memory_region_set_ram_discard_manager(&vmem->memdev->mr, NULL); precopy_remove_notifier(&vmem->precopy_notifier); qemu_unregister_reset(virtio_mem_system_reset, vmem); vmstate_unregister_ram(&vmem->memdev->mr, DEVICE(vmem)); @@ -591,43 +771,47 @@ static void virtio_mem_device_unrealize(DeviceState *dev) virtio_del_queue(vdev, 0); virtio_cleanup(vdev); g_free(vmem->bitmap); - ram_block_discard_require(false); + ram_block_coordinated_discard_require(false); } -static int virtio_mem_restore_unplugged(VirtIOMEM *vmem) +static int virtio_mem_discard_range_cb(const VirtIOMEM *vmem, void *arg, + uint64_t offset, uint64_t size) { RAMBlock *rb = vmem->memdev->mr.ram_block; - unsigned long first_zero_bit, last_zero_bit; - uint64_t offset, length; - int ret; - /* Find consecutive unplugged blocks and discard the consecutive range. */ - first_zero_bit = find_first_zero_bit(vmem->bitmap, vmem->bitmap_size); - while (first_zero_bit < vmem->bitmap_size) { - offset = first_zero_bit * vmem->block_size; - last_zero_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, - first_zero_bit + 1) - 1; - length = (last_zero_bit - first_zero_bit + 1) * vmem->block_size; + return ram_block_discard_range(rb, offset, size) ? -EINVAL : 0; +} - ret = ram_block_discard_range(rb, offset, length); - if (ret) { - error_report("Unexpected error discarding RAM: %s", - strerror(-ret)); - return -EINVAL; - } - first_zero_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, - last_zero_bit + 2); - } - return 0; +static int virtio_mem_restore_unplugged(VirtIOMEM *vmem) +{ + /* Make sure all memory is really discarded after migration. */ + return virtio_mem_for_each_unplugged_range(vmem, NULL, + virtio_mem_discard_range_cb); } static int virtio_mem_post_load(void *opaque, int version_id) { + VirtIOMEM *vmem = VIRTIO_MEM(opaque); + RamDiscardListener *rdl; + int ret; + + /* + * We started out with all memory discarded and our memory region is mapped + * into an address space. Replay, now that we updated the bitmap. + */ + QLIST_FOREACH(rdl, &vmem->rdl_list, next) { + ret = virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl, + virtio_mem_notify_populate_cb); + if (ret) { + return ret; + } + } + if (migration_in_incoming_postcopy()) { return 0; } - return virtio_mem_restore_unplugged(VIRTIO_MEM(opaque)); + return virtio_mem_restore_unplugged(vmem); } typedef struct VirtIOMEMMigSanityChecks { @@ -702,6 +886,7 @@ static const VMStateDescription vmstate_virtio_mem_device = { .name = "virtio-mem-device", .minimum_version_id = 1, .version_id = 1, + .priority = MIG_PRI_VIRTIO_MEM, .post_load = virtio_mem_post_load, .fields = (VMStateField[]) { VMSTATE_WITH_TMP(VirtIOMEM, VirtIOMEMMigSanityChecks, @@ -872,28 +1057,19 @@ static void virtio_mem_set_block_size(Object *obj, Visitor *v, const char *name, vmem->block_size = value; } -static void virtio_mem_precopy_exclude_unplugged(VirtIOMEM *vmem) +static int virtio_mem_precopy_exclude_range_cb(const VirtIOMEM *vmem, void *arg, + uint64_t offset, uint64_t size) { void * const host = qemu_ram_get_host_addr(vmem->memdev->mr.ram_block); - unsigned long first_zero_bit, last_zero_bit; - uint64_t offset, length; - /* - * Find consecutive unplugged blocks and exclude them from migration. - * - * Note: Blocks cannot get (un)plugged during precopy, no locking needed. - */ - first_zero_bit = find_first_zero_bit(vmem->bitmap, vmem->bitmap_size); - while (first_zero_bit < vmem->bitmap_size) { - offset = first_zero_bit * vmem->block_size; - last_zero_bit = find_next_bit(vmem->bitmap, vmem->bitmap_size, - first_zero_bit + 1) - 1; - length = (last_zero_bit - first_zero_bit + 1) * vmem->block_size; + qemu_guest_free_page_hint(host + offset, size); + return 0; +} - qemu_guest_free_page_hint(host + offset, length); - first_zero_bit = find_next_zero_bit(vmem->bitmap, vmem->bitmap_size, - last_zero_bit + 2); - } +static void virtio_mem_precopy_exclude_unplugged(VirtIOMEM *vmem) +{ + virtio_mem_for_each_unplugged_range(vmem, NULL, + virtio_mem_precopy_exclude_range_cb); } static int virtio_mem_precopy_notify(NotifierWithReturn *n, void *data) @@ -918,6 +1094,7 @@ static void virtio_mem_instance_init(Object *obj) notifier_list_init(&vmem->size_change_notifiers); vmem->precopy_notifier.notify = virtio_mem_precopy_notify; + QLIST_INIT(&vmem->rdl_list); object_property_add(obj, VIRTIO_MEM_SIZE_PROP, "size", virtio_mem_get_size, NULL, NULL, NULL); @@ -937,11 +1114,107 @@ static Property virtio_mem_properties[] = { DEFINE_PROP_END_OF_LIST(), }; +static uint64_t virtio_mem_rdm_get_min_granularity(const RamDiscardManager *rdm, + const MemoryRegion *mr) +{ + const VirtIOMEM *vmem = VIRTIO_MEM(rdm); + + g_assert(mr == &vmem->memdev->mr); + return vmem->block_size; +} + +static bool virtio_mem_rdm_is_populated(const RamDiscardManager *rdm, + const MemoryRegionSection *s) +{ + const VirtIOMEM *vmem = VIRTIO_MEM(rdm); + uint64_t start_gpa = vmem->addr + s->offset_within_region; + uint64_t end_gpa = start_gpa + int128_get64(s->size); + + g_assert(s->mr == &vmem->memdev->mr); + + start_gpa = QEMU_ALIGN_DOWN(start_gpa, vmem->block_size); + end_gpa = QEMU_ALIGN_UP(end_gpa, vmem->block_size); + + if (!virtio_mem_valid_range(vmem, start_gpa, end_gpa - start_gpa)) { + return false; + } + + return virtio_mem_test_bitmap(vmem, start_gpa, end_gpa - start_gpa, true); +} + +struct VirtIOMEMReplayData { + void *fn; + void *opaque; +}; + +static int virtio_mem_rdm_replay_populated_cb(MemoryRegionSection *s, void *arg) +{ + struct VirtIOMEMReplayData *data = arg; + + return ((ReplayRamPopulate)data->fn)(s, data->opaque); +} + +static int virtio_mem_rdm_replay_populated(const RamDiscardManager *rdm, + MemoryRegionSection *s, + ReplayRamPopulate replay_fn, + void *opaque) +{ + const VirtIOMEM *vmem = VIRTIO_MEM(rdm); + struct VirtIOMEMReplayData data = { + .fn = replay_fn, + .opaque = opaque, + }; + + g_assert(s->mr == &vmem->memdev->mr); + return virtio_mem_for_each_plugged_section(vmem, s, &data, + virtio_mem_rdm_replay_populated_cb); +} + +static void virtio_mem_rdm_register_listener(RamDiscardManager *rdm, + RamDiscardListener *rdl, + MemoryRegionSection *s) +{ + VirtIOMEM *vmem = VIRTIO_MEM(rdm); + int ret; + + g_assert(s->mr == &vmem->memdev->mr); + rdl->section = memory_region_section_new_copy(s); + + QLIST_INSERT_HEAD(&vmem->rdl_list, rdl, next); + ret = virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl, + virtio_mem_notify_populate_cb); + if (ret) { + error_report("%s: Replaying plugged ranges failed: %s", __func__, + strerror(-ret)); + } +} + +static void virtio_mem_rdm_unregister_listener(RamDiscardManager *rdm, + RamDiscardListener *rdl) +{ + VirtIOMEM *vmem = VIRTIO_MEM(rdm); + + g_assert(rdl->section->mr == &vmem->memdev->mr); + if (vmem->size) { + if (rdl->double_discard_supported) { + rdl->notify_discard(rdl, rdl->section); + } else { + virtio_mem_for_each_plugged_section(vmem, rdl->section, rdl, + virtio_mem_notify_discard_cb); + } + } + + memory_region_section_free_copy(rdl->section); + rdl->section = NULL; + QLIST_REMOVE(rdl, next); +} + static void virtio_mem_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass); VirtIOMEMClass *vmc = VIRTIO_MEM_CLASS(klass); + RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_CLASS(klass); device_class_set_props(dc, virtio_mem_properties); dc->vmsd = &vmstate_virtio_mem; @@ -957,6 +1230,12 @@ static void virtio_mem_class_init(ObjectClass *klass, void *data) vmc->get_memory_region = virtio_mem_get_memory_region; vmc->add_size_change_notifier = virtio_mem_add_size_change_notifier; vmc->remove_size_change_notifier = virtio_mem_remove_size_change_notifier; + + rdmc->get_min_granularity = virtio_mem_rdm_get_min_granularity; + rdmc->is_populated = virtio_mem_rdm_is_populated; + rdmc->replay_populated = virtio_mem_rdm_replay_populated; + rdmc->register_listener = virtio_mem_rdm_register_listener; + rdmc->unregister_listener = virtio_mem_rdm_unregister_listener; } static const TypeInfo virtio_mem_info = { @@ -966,6 +1245,10 @@ static const TypeInfo virtio_mem_info = { .instance_init = virtio_mem_instance_init, .class_init = virtio_mem_class_init, .class_size = sizeof(VirtIOMEMClass), + .interfaces = (InterfaceInfo[]) { + { TYPE_RAM_DISCARD_MANAGER }, + { } + }, }; static void virtio_register_types(void) |