Diffstat (limited to 'hw/ppc/spapr.c')
-rw-r--r-- | hw/ppc/spapr.c | 219 |
1 file changed, 86 insertions, 133 deletions
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 08a2a5a770..514a17ae74 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -81,6 +81,8 @@
 #include "hw/mem/memory-device.h"
 #include "hw/ppc/spapr_tpm_proxy.h"
 
+#include "monitor/monitor.h"
+
 #include <libfdt.h>
 
 /* SLOF memory layout:
@@ -94,7 +96,6 @@
  * We load our kernel at 4M, leaving space for SLOF initial image
  */
 #define FDT_MAX_SIZE            0x100000
-#define RTAS_MAX_SIZE           0x10000
 #define RTAS_MAX_ADDR           0x80000000 /* RTAS must stay below that */
 #define FW_MAX_SIZE             0x400000
 #define FW_FILE_NAME            "slof.bin"
@@ -218,8 +219,7 @@ static int spapr_fixup_cpu_numa_dt(void *fdt, int offset, PowerPCCPU *cpu)
 /* Populate the "ibm,pa-features" property */
 static void spapr_populate_pa_features(SpaprMachineState *spapr,
                                        PowerPCCPU *cpu,
-                                       void *fdt, int offset,
-                                       bool legacy_guest)
+                                       void *fdt, int offset)
 {
     uint8_t pa_features_206[] = { 6, 0,
         0xf6, 0x1f, 0xc7, 0x00, 0x80, 0xc0 };
@@ -285,7 +285,7 @@ static void spapr_populate_pa_features(SpaprMachineState *spapr,
     if ((spapr_get_cap(spapr, SPAPR_CAP_HTM) != 0) && pa_size > 24) {
         pa_features[24] |= 0x80; /* Transactional memory support */
     }
-    if (legacy_guest && pa_size > 40) {
+    if (spapr->cas_pre_isa3_guest && pa_size > 40) {
         /* Workaround for broken kernels that attempt (guest) radix
          * mode when they can't handle it, if they see the radix bit set
          * in pa-features. So hide it from them. */
@@ -295,65 +295,6 @@ static void spapr_populate_pa_features(SpaprMachineState *spapr,
     _FDT((fdt_setprop(fdt, offset, "ibm,pa-features", pa_features, pa_size)));
 }
 
-static int spapr_fixup_cpu_dt(void *fdt, SpaprMachineState *spapr)
-{
-    MachineState *ms = MACHINE(spapr);
-    int ret = 0, offset, cpus_offset;
-    CPUState *cs;
-    char cpu_model[32];
-    uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
-
-    CPU_FOREACH(cs) {
-        PowerPCCPU *cpu = POWERPC_CPU(cs);
-        DeviceClass *dc = DEVICE_GET_CLASS(cs);
-        int index = spapr_get_vcpu_id(cpu);
-        int compat_smt = MIN(ms->smp.threads, ppc_compat_max_vthreads(cpu));
-
-        if (!spapr_is_thread0_in_vcore(spapr, cpu)) {
-            continue;
-        }
-
-        snprintf(cpu_model, 32, "%s@%x", dc->fw_name, index);
-
-        cpus_offset = fdt_path_offset(fdt, "/cpus");
-        if (cpus_offset < 0) {
-            cpus_offset = fdt_add_subnode(fdt, 0, "cpus");
-            if (cpus_offset < 0) {
-                return cpus_offset;
-            }
-        }
-        offset = fdt_subnode_offset(fdt, cpus_offset, cpu_model);
-        if (offset < 0) {
-            offset = fdt_add_subnode(fdt, cpus_offset, cpu_model);
-            if (offset < 0) {
-                return offset;
-            }
-        }
-
-        ret = fdt_setprop(fdt, offset, "ibm,pft-size",
-                          pft_size_prop, sizeof(pft_size_prop));
-        if (ret < 0) {
-            return ret;
-        }
-
-        if (ms->numa_state->num_nodes > 1) {
-            ret = spapr_fixup_cpu_numa_dt(fdt, offset, cpu);
-            if (ret < 0) {
-                return ret;
-            }
-        }
-
-        ret = spapr_fixup_cpu_smt_dt(fdt, offset, cpu, compat_smt);
-        if (ret < 0) {
-            return ret;
-        }
-
-        spapr_populate_pa_features(spapr, cpu, fdt, offset,
-                                   spapr->cas_legacy_guest_workaround);
-    }
-    return ret;
-}
-
 static hwaddr spapr_node0_size(MachineState *machine)
 {
     if (machine->numa_state->num_nodes) {
@@ -388,7 +329,7 @@ static int spapr_populate_memory_node(void *fdt, int nodeid, hwaddr start,
     mem_reg_property[0] = cpu_to_be64(start);
     mem_reg_property[1] = cpu_to_be64(size);
 
-    sprintf(mem_name, "memory@" TARGET_FMT_lx, start);
+    sprintf(mem_name, "memory@%" HWADDR_PRIx, start);
     off = fdt_add_subnode(fdt, 0, mem_name);
     _FDT(off);
     _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
@@ -551,7 +492,7 @@ static void spapr_populate_cpu_dt(CPUState *cs, void *fdt, int offset,
                           page_sizes_prop, page_sizes_prop_size)));
     }
 
-    spapr_populate_pa_features(spapr, cpu, fdt, offset, false);
+    spapr_populate_pa_features(spapr, cpu, fdt, offset);
 
     _FDT((fdt_setprop_cell(fdt, offset, "ibm,chip-id",
                            cs->cpu_index / vcpus_per_socket)));
@@ -984,11 +925,13 @@ static bool spapr_hotplugged_dev_before_cas(void)
     return false;
 }
 
+static void *spapr_build_fdt(SpaprMachineState *spapr);
+
 int spapr_h_cas_compose_response(SpaprMachineState *spapr,
                                  target_ulong addr, target_ulong size,
                                  SpaprOptionVector *ov5_updates)
 {
-    void *fdt, *fdt_skel;
+    void *fdt;
     SpaprDeviceTreeUpdateHeader hdr = { .version_id = 1 };
 
     if (spapr_hotplugged_dev_before_cas()) {
@@ -1004,28 +947,11 @@ int spapr_h_cas_compose_response(SpaprMachineState *spapr,
 
     size -= sizeof(hdr);
 
-    /* Create skeleton */
-    fdt_skel = g_malloc0(size);
-    _FDT((fdt_create(fdt_skel, size)));
-    _FDT((fdt_finish_reservemap(fdt_skel)));
-    _FDT((fdt_begin_node(fdt_skel, "")));
-    _FDT((fdt_end_node(fdt_skel)));
-    _FDT((fdt_finish(fdt_skel)));
-    fdt = g_malloc0(size);
-    _FDT((fdt_open_into(fdt_skel, fdt, size)));
-    g_free(fdt_skel);
-
-    /* Fixup cpu nodes */
-    _FDT((spapr_fixup_cpu_dt(fdt, spapr)));
-
-    if (spapr_dt_cas_updates(spapr, fdt, ov5_updates)) {
-        return -1;
-    }
-
-    /* Pack resulting tree */
+    fdt = spapr_build_fdt(spapr);
     _FDT((fdt_pack(fdt)));
 
     if (fdt_totalsize(fdt) + sizeof(hdr) > size) {
+        g_free(fdt);
         trace_spapr_cas_failed(size);
         return -1;
     }
@@ -1033,7 +959,11 @@ int spapr_h_cas_compose_response(SpaprMachineState *spapr,
     cpu_physical_memory_write(addr, &hdr, sizeof(hdr));
     cpu_physical_memory_write(addr + sizeof(hdr), fdt, fdt_totalsize(fdt));
     trace_spapr_cas_continue(fdt_totalsize(fdt) + sizeof(hdr));
-    g_free(fdt);
+
+    g_free(spapr->fdt_blob);
+    spapr->fdt_size = fdt_totalsize(fdt);
+    spapr->fdt_initial_size = spapr->fdt_size;
+    spapr->fdt_blob = fdt;
 
     return 0;
 }
@@ -1136,19 +1066,28 @@ static void spapr_dt_ov5_platform_support(SpaprMachineState *spapr, void *fdt,
     PowerPCCPU *first_ppc_cpu = POWERPC_CPU(first_cpu);
 
     char val[2 * 4] = {
-        23, spapr->irq->ov5, /* Xive mode. */
+        23, 0x00, /* XICS / XIVE mode */
         24, 0x00, /* Hash/Radix, filled in below. */
         25, 0x00, /* Hash options: Segment Tables == no, GTSE == no. */
        26, 0x40, /* Radix options: GTSE == yes. */
     };
 
+    if (spapr->irq->xics && spapr->irq->xive) {
+        val[1] = SPAPR_OV5_XIVE_BOTH;
+    } else if (spapr->irq->xive) {
+        val[1] = SPAPR_OV5_XIVE_EXPLOIT;
+    } else {
+        assert(spapr->irq->xics);
+        val[1] = SPAPR_OV5_XIVE_LEGACY;
+    }
+
     if (!ppc_check_compat(first_ppc_cpu, CPU_POWERPC_LOGICAL_3_00, 0,
                           first_ppc_cpu->compat_pvr)) {
         /*
          * If we're in a pre POWER9 compat mode then the guest should
          * do hash and use the legacy interrupt mode
          */
-        val[1] = 0x00; /* XICS */
+        val[1] = SPAPR_OV5_XIVE_LEGACY; /* XICS */
         val[3] = 0x00; /* Hash */
     } else if (kvm_enabled()) {
         if (kvmppc_has_cap_mmu_radix() && kvmppc_has_cap_mmu_hash_v3()) {
@@ -1178,11 +1117,16 @@ static void spapr_dt_chosen(SpaprMachineState *spapr, void *fdt)
 
     _FDT(chosen = fdt_add_subnode(fdt, 0, "chosen"));
 
-    _FDT(fdt_setprop_string(fdt, chosen, "bootargs", machine->kernel_cmdline));
-    _FDT(fdt_setprop_cell(fdt, chosen, "linux,initrd-start",
-                          spapr->initrd_base));
-    _FDT(fdt_setprop_cell(fdt, chosen, "linux,initrd-end",
-                          spapr->initrd_base + spapr->initrd_size));
+    if (machine->kernel_cmdline && machine->kernel_cmdline[0]) {
+        _FDT(fdt_setprop_string(fdt, chosen, "bootargs",
+                                machine->kernel_cmdline));
+    }
+    if (spapr->initrd_size) {
+        _FDT(fdt_setprop_cell(fdt, chosen, "linux,initrd-start",
+                              spapr->initrd_base));
+        _FDT(fdt_setprop_cell(fdt, chosen, "linux,initrd-end",
+                              spapr->initrd_base + spapr->initrd_size));
+    }
 
     if (spapr->kernel_size) {
         uint64_t kprop[2] = { cpu_to_be64(KERNEL_LOAD_ADDR),
@@ -1717,8 +1661,7 @@ static void spapr_machine_reset(MachineState *machine)
 {
     SpaprMachineState *spapr = SPAPR_MACHINE(machine);
     PowerPCCPU *first_ppc_cpu;
-    uint32_t rtas_limit;
-    hwaddr rtas_addr, fdt_addr;
+    hwaddr fdt_addr;
     void *fdt;
     int rc;
 
@@ -1739,16 +1682,6 @@ static void spapr_machine_reset(MachineState *machine)
         spapr_setup_hpt_and_vrma(spapr);
     }
 
-    /*
-     * NVLink2-connected GPU RAM needs to be placed on a separate NUMA node.
-     * We assign a new numa ID per GPU in spapr_pci_collect_nvgpu() which is
-     * called from vPHB reset handler so we initialize the counter here.
-     * If no NUMA is configured from the QEMU side, we start from 1 as GPU RAM
-     * must be equally distant from any other node.
-     * The final value of spapr->gpu_numa_id is going to be written to
-     * max-associativity-domains in spapr_build_fdt().
-     */
-    spapr->gpu_numa_id = MAX(1, machine->numa_state->num_nodes);
     qemu_devices_reset();
 
     /*
@@ -1792,14 +1725,10 @@ static void spapr_machine_reset(MachineState *machine)
      * or just below 2GB, whichever is lower, so that it can be
      * processed with 32-bit real mode code if necessary
      */
-    rtas_limit = MIN(spapr->rma_size, RTAS_MAX_ADDR);
-    rtas_addr = rtas_limit - RTAS_MAX_SIZE;
-    fdt_addr = rtas_addr - FDT_MAX_SIZE;
+    fdt_addr = MIN(spapr->rma_size, RTAS_MAX_ADDR) - FDT_MAX_SIZE;
 
     fdt = spapr_build_fdt(spapr);
 
-    spapr_load_rtas(spapr, fdt, rtas_addr);
-
     rc = fdt_pack(fdt);
 
     /* Should only fail if we've built a corrupted tree */
@@ -2847,13 +2776,57 @@ static void spapr_machine_init(MachineState *machine)
     spapr_ovec_set(spapr->ov5, OV5_DRMEM_V2);
 
     /* advertise XIVE on POWER9 machines */
-    if (spapr->irq->ov5 & (SPAPR_OV5_XIVE_EXPLOIT | SPAPR_OV5_XIVE_BOTH)) {
+    if (spapr->irq->xive) {
         spapr_ovec_set(spapr->ov5, OV5_XIVE_EXPLOIT);
     }
 
     /* init CPUs */
     spapr_init_cpus(spapr);
 
+
+    /*
+     * check we don't have a memory-less/cpu-less NUMA node
+     * Firmware relies on the existing memory/cpu topology to provide the
+     * NUMA topology to the kernel.
+     * And the linux kernel needs to know the NUMA topology at start
+     * to be able to hotplug CPUs later.
+     */
+    if (machine->numa_state->num_nodes) {
+        for (i = 0; i < machine->numa_state->num_nodes; ++i) {
+            /* check for memory-less node */
+            if (machine->numa_state->nodes[i].node_mem == 0) {
+                CPUState *cs;
+                int found = 0;
+                /* check for cpu-less node */
+                CPU_FOREACH(cs) {
+                    PowerPCCPU *cpu = POWERPC_CPU(cs);
+                    if (cpu->node_id == i) {
+                        found = 1;
+                        break;
+                    }
+                }
+                /* memory-less and cpu-less node */
+                if (!found) {
+                    error_report(
+                        "Memory-less/cpu-less nodes are not supported (node %d)",
+                        i);
+                    exit(1);
+                }
+            }
+        }
+
+    }
+
+    /*
+     * NVLink2-connected GPU RAM needs to be placed on a separate NUMA node.
+     * We assign a new numa ID per GPU in spapr_pci_collect_nvgpu() which is
+     * called from vPHB reset handler so we initialize the counter here.
+     * If no NUMA is configured from the QEMU side, we start from 1 as GPU RAM
+     * must be equally distant from any other node.
+     * The final value of spapr->gpu_numa_id is going to be written to
+     * max-associativity-domains in spapr_build_fdt().
+     */
+    spapr->gpu_numa_id = MAX(1, machine->numa_state->num_nodes);
+
     if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) &&
         ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
                               spapr->max_compat_pvr)) {
@@ -2915,28 +2888,6 @@ static void spapr_machine_init(MachineState *machine)
         spapr_create_lmb_dr_connectors(spapr);
     }
 
-    filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, "spapr-rtas.bin");
-    if (!filename) {
-        error_report("Could not find LPAR rtas '%s'", "spapr-rtas.bin");
-        exit(1);
-    }
-    spapr->rtas_size = get_image_size(filename);
-    if (spapr->rtas_size < 0) {
-        error_report("Could not get size of LPAR rtas '%s'", filename);
-        exit(1);
-    }
-    spapr->rtas_blob = g_malloc(spapr->rtas_size);
-    if (load_image_size(filename, spapr->rtas_blob, spapr->rtas_size) < 0) {
-        error_report("Could not load LPAR rtas '%s'", filename);
-        exit(1);
-    }
-    if (spapr->rtas_size > RTAS_MAX_SIZE) {
-        error_report("RTAS too big ! 0x%zx bytes (max is 0x%x)",
-                     (size_t)spapr->rtas_size, RTAS_MAX_SIZE);
-        exit(1);
-    }
-    g_free(filename);
-
     /* Set up RTAS event infrastructure */
     spapr_events_init(spapr);
 
@@ -4321,6 +4272,8 @@ static void spapr_pic_print_info(InterruptStatsProvider *obj,
     SpaprMachineState *spapr = SPAPR_MACHINE(obj);
 
     spapr->irq->print_info(spapr, mon);
+    monitor_printf(mon, "irqchip: %s\n",
+                   kvm_irqchip_in_kernel() ? "in-kernel" : "emulated");
 }
 
 int spapr_get_vcpu_id(PowerPCCPU *cpu)