author     Peter Maydell <peter.maydell@linaro.org>  2014-03-18 14:31:42 +0000
committer  Peter Maydell <peter.maydell@linaro.org>  2014-03-18 14:31:42 +0000
commit     2dda43bacc79f8e283702614745cd700c637de64
tree       091425c65ad18714e967d6376cada6a6a6ae9db6
parent     315b59344126beab85a62b53582794b14436a5a4
parent     1ed27a17cd9d9ebec8963bc358d74060b1dd6127
Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20140317' into staging
target-arm queue:
* more A64 Neon instructions
* fixes to reset CBAR values for A9 and A15 boards
* fix accesses to PMCR register in -icount mode
# gpg: Signature made Mon 17 Mar 2014 22:04:52 GMT using RSA key ID 14360CDE
# gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>"
* remotes/pmaydell/tags/pull-target-arm-20140317: (30 commits)
scripts/qemu-binfmt-conf.sh: Add AArch64 registration
target-arm: A64: Add [UF]RSQRTE (reciprocal root estimate)
target-arm: A64: Implement FCVTXN
target-arm: A64: Implement scalar saturating narrow ops
target-arm: A64: Move handle_2misc_narrow function
target-arm: A64: Implement AdvSIMD reciprocal estimate insns URECPE, FRECPE
softfloat: export squash_input_denormal functions
target-arm: A64: Implement FCVTZS, FCVTZU in the shift-imm categories
target-arm: A64: Handle saturating left shifts SQSHL, SQSHLU, UQSHL
exec-all.h: Increase MAX_OP_PER_INSTR for ARM A64 decoder
target-arm: A64: Implement FRINT*
target-arm: A64: Implement SRI
target-arm: A64: Add FRECPX (reciprocal exponent)
target-arm: A64: List unsupported shift-imm opcodes
target-arm: A64: Implement FCVTL
target-arm: A64: Implement FCVTN
target-arm: A64: Implement FCVT[NMAPZ][SU] SIMD instructions
target-arm: A64: Implement SHLL, SHLL2
target-arm: A64: Implement SADDLP, UADDLP, SADALP, UADALP
target-arm: A64: Saturating and narrowing shift ops
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
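Of the new instructions, FCVTXN needs a rounding mode that softfloat does not provide directly: round-to-odd ("Von Neumann" rounding). The fcvtx_f64_to_f32 helper added in this series builds it out of round-to-zero plus forcing the result's low bit whenever the conversion raised Inexact. Below is a minimal standalone sketch of the same scheme using the C99 fenv.h interface instead of QEMU's softfloat (the function name and test value are illustrative, not from the patch; NaN silencing and flag merging are omitted):

#include <fenv.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#pragma STDC FENV_ACCESS ON

/* Round-to-odd double->float: truncate, then set the significand's
 * LSB if the conversion was inexact. */
static float cvt_round_to_odd(double a)
{
    int saved = fegetround();
    uint32_t bits;
    float r;

    fesetround(FE_TOWARDZERO);
    feclearexcept(FE_INEXACT);
    r = (float)a;
    if (fetestexcept(FE_INEXACT)) {
        memcpy(&bits, &r, sizeof(bits));
        bits |= 1;
        memcpy(&r, &bits, sizeof(r));
    }
    fesetround(saved);
    return r;
}

int main(void)
{
    /* 1 + 2^-40 is exact as a double but inexact as a float,
     * so the result's LSB is forced on: 0x1.000002p+0 */
    printf("%a\n", cvt_round_to_odd(1.0 + 0x1p-40));
    return 0;
}

The real helper additionally silences signaling NaNs and merges the exception flags it accumulated back into the live float_status.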
 fpu/softfloat.c             |    4
 hw/arm/exynos4210.c         |   16
 hw/arm/realview.c           |   39
 hw/arm/vexpress.c           |  123
 hw/arm/virt.c               |    6
 include/exec/exec-all.h     |    2
 include/fpu/softfloat.h     |    7
 scripts/qemu-binfmt-conf.sh |    3
 target-arm/helper-a64.c     |  178
 target-arm/helper-a64.h     |   10
 target-arm/helper.c         |  332
 target-arm/helper.h         |   10
 target-arm/translate-a64.c  | 1357
 target-arm/translate.c      |   25
 target-arm/translate.h      |    6
 15 files changed, 1855 insertions(+), 263 deletions(-)
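Several of the new Neon helpers in target-arm/helper-a64.c work SWAR-style (SIMD within a register) instead of looping over lanes. neon_addlp_s8, for example, sign-extends all eight packed bytes to 16-bit fields in parallel and then adds the fields with the sign column patched up by hand, so that no carry or borrow can cross a lane boundary. A self-contained C check of that trick against a naive per-lane reference (test vectors chosen arbitrarily):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Naive reference: pairwise long add of signed bytes, lane by lane. */
static uint64_t addlp_s8_ref(uint64_t a)
{
    uint64_t res = 0;
    int i;

    for (i = 0; i < 4; i++) {
        int8_t lo = (int8_t)(a >> (16 * i));
        int8_t hi = (int8_t)(a >> (16 * i + 8));
        res |= (uint64_t)(uint16_t)(lo + hi) << (16 * i);
    }
    return res;
}

/* SWAR version, same structure as the new neon_addlp_s8 helper. */
static uint64_t addlp_s8_swar(uint64_t a)
{
    const uint64_t nsign = 0x0080008000800080ULL; /* 8-bit sign bits */
    const uint64_t wsign = 0x8000800080008000ULL; /* 16-bit sign bits */
    const uint64_t emask = 0x00ff00ff00ff00ffULL;
    uint64_t t1, t2, sign;

    /* Sign-extend each byte to 16 bits: bias by the narrow sign bit,
     * pin bit 15 so the subtract below cannot borrow across lanes,
     * then subtract the bias back out. */
    t1 = ((a & emask) ^ nsign) | wsign;
    t1 = (t1 - nsign) ^ wsign;
    t2 = (((a >> 8) & emask) ^ nsign) | wsign;
    t2 = (t2 - nsign) ^ wsign;

    /* Add bits 0..14 of every lane, then fix up bit 15 separately so
     * the addition cannot overflow out of its 16-bit field. */
    sign = (t1 ^ t2) & wsign;
    return ((t1 & ~wsign) + (t2 & ~wsign)) ^ sign;
}

int main(void)
{
    const uint64_t tests[] = {
        0, 0xffffffffffffffffULL, 0x7f80ff01807f0180ULL,
        0x0123456789abcdefULL
    };
    int i;

    for (i = 0; i < 4; i++) {
        assert(addlp_s8_ref(tests[i]) == addlp_s8_swar(tests[i]));
    }
    puts("ok");
    return 0;
}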
diff --git a/fpu/softfloat.c b/fpu/softfloat.c index fc0b179df4..5f02c16d8d 100644 --- a/fpu/softfloat.c +++ b/fpu/softfloat.c @@ -288,7 +288,7 @@ INLINE flag extractFloat32Sign( float32 a ) | If `a' is denormal and we are in flush-to-zero mode then set the | input-denormal exception and return zero. Otherwise just return the value. *----------------------------------------------------------------------------*/ -static float32 float32_squash_input_denormal(float32 a STATUS_PARAM) +float32 float32_squash_input_denormal(float32 a STATUS_PARAM) { if (STATUS(flush_inputs_to_zero)) { if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) { @@ -473,7 +473,7 @@ INLINE flag extractFloat64Sign( float64 a ) | If `a' is denormal and we are in flush-to-zero mode then set the | input-denormal exception and return zero. Otherwise just return the value. *----------------------------------------------------------------------------*/ -static float64 float64_squash_input_denormal(float64 a STATUS_PARAM) +float64 float64_squash_input_denormal(float64 a STATUS_PARAM) { if (STATUS(flush_inputs_to_zero)) { if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) { diff --git a/hw/arm/exynos4210.c b/hw/arm/exynos4210.c index 9f137e9acd..6426d168d2 100644 --- a/hw/arm/exynos4210.c +++ b/hw/arm/exynos4210.c @@ -143,11 +143,21 @@ Exynos4210State *exynos4210_init(MemoryRegion *system_mem, unsigned long mem_size; DeviceState *dev; SysBusDevice *busdev; + ObjectClass *cpu_oc; + + cpu_oc = cpu_class_by_name(TYPE_ARM_CPU, "cortex-a9"); + assert(cpu_oc); for (n = 0; n < EXYNOS4210_NCPUS; n++) { - s->cpu[n] = cpu_arm_init("cortex-a9"); - if (!s->cpu[n]) { - fprintf(stderr, "Unable to find CPU %d definition\n", n); + Object *cpuobj = object_new(object_class_get_name(cpu_oc)); + Error *err = NULL; + + s->cpu[n] = ARM_CPU(cpuobj); + object_property_set_int(cpuobj, EXYNOS4210_SMP_PRIVATE_BASE_ADDR, + "reset-cbar", &error_abort); + object_property_set_bool(cpuobj, true, "realized", &err); + if (err) { + error_report("%s", error_get_pretty(err)); exit(1); } } diff --git a/hw/arm/realview.c b/hw/arm/realview.c index 6ef7646002..7e04e507f9 100644 --- a/hw/arm/realview.c +++ b/hw/arm/realview.c @@ -18,6 +18,7 @@ #include "hw/i2c/i2c.h" #include "sysemu/blockdev.h" #include "exec/address-spaces.h" +#include "qemu/error-report.h" #define SMP_BOOT_ADDR 0xe0000000 #define SMP_BOOTREG_ADDR 0x10000030 @@ -49,6 +50,7 @@ static void realview_init(QEMUMachineInitArgs *args, { ARMCPU *cpu = NULL; CPUARMState *env; + ObjectClass *cpu_oc; MemoryRegion *sysmem = get_system_memory(); MemoryRegion *ram_lo = g_new(MemoryRegion, 1); MemoryRegion *ram_hi = g_new(MemoryRegion, 1); @@ -70,12 +72,14 @@ static void realview_init(QEMUMachineInitArgs *args, uint32_t sys_id; ram_addr_t low_ram_size; ram_addr_t ram_size = args->ram_size; + hwaddr periphbase = 0; switch (board_type) { case BOARD_EB: break; case BOARD_EB_MPCORE: is_mpcore = 1; + periphbase = 0x10100000; break; case BOARD_PB_A8: is_pb = 1; @@ -83,16 +87,37 @@ static void realview_init(QEMUMachineInitArgs *args, case BOARD_PBX_A9: is_mpcore = 1; is_pb = 1; + periphbase = 0x1f000000; break; } + + cpu_oc = cpu_class_by_name(TYPE_ARM_CPU, args->cpu_model); + if (!cpu_oc) { + fprintf(stderr, "Unable to find CPU definition\n"); + exit(1); + } + for (n = 0; n < smp_cpus; n++) { - cpu = cpu_arm_init(args->cpu_model); - if (!cpu) { - fprintf(stderr, "Unable to find CPU definition\n"); + Object *cpuobj = object_new(object_class_get_name(cpu_oc)); + Error *err = NULL; + + if (is_pb && 
is_mpcore) { + object_property_set_int(cpuobj, periphbase, "reset-cbar", &err); + if (err) { + error_report("%s", error_get_pretty(err)); + exit(1); + } + } + + object_property_set_bool(cpuobj, true, "realized", &err); + if (err) { + error_report("%s", error_get_pretty(err)); exit(1); } - cpu_irq[n] = qdev_get_gpio_in(DEVICE(cpu), ARM_CPU_IRQ); + + cpu_irq[n] = qdev_get_gpio_in(DEVICE(cpuobj), ARM_CPU_IRQ); } + cpu = ARM_CPU(first_cpu); env = &cpu->env; if (arm_feature(env, ARM_FEATURE_V7)) { if (is_mpcore) { @@ -141,16 +166,10 @@ static void realview_init(QEMUMachineInitArgs *args, sysbus_mmio_map(SYS_BUS_DEVICE(sysctl), 0, 0x10000000); if (is_mpcore) { - hwaddr periphbase; dev = qdev_create(NULL, is_pb ? "a9mpcore_priv": "realview_mpcore"); qdev_prop_set_uint32(dev, "num-cpu", smp_cpus); qdev_init_nofail(dev); busdev = SYS_BUS_DEVICE(dev); - if (is_pb) { - periphbase = 0x1f000000; - } else { - periphbase = 0x10100000; - } sysbus_mmio_map(busdev, 0, periphbase); for (n = 0; n < smp_cpus; n++) { sysbus_connect_irq(busdev, n, cpu_irq[n]); diff --git a/hw/arm/vexpress.c b/hw/arm/vexpress.c index ef1707aef0..67628af588 100644 --- a/hw/arm/vexpress.c +++ b/hw/arm/vexpress.c @@ -32,6 +32,7 @@ #include "sysemu/blockdev.h" #include "hw/block/flash.h" #include "sysemu/device_tree.h" +#include "qemu/error-report.h" #include <libfdt.h> #define VEXPRESS_BOARD_ID 0x8e0 @@ -173,6 +174,64 @@ struct VEDBoardInfo { DBoardInitFn *init; }; +static void init_cpus(const char *cpu_model, const char *privdev, + hwaddr periphbase, qemu_irq *pic) +{ + ObjectClass *cpu_oc = cpu_class_by_name(TYPE_ARM_CPU, cpu_model); + DeviceState *dev; + SysBusDevice *busdev; + int n; + + if (!cpu_oc) { + fprintf(stderr, "Unable to find CPU definition\n"); + exit(1); + } + + /* Create the actual CPUs */ + for (n = 0; n < smp_cpus; n++) { + Object *cpuobj = object_new(object_class_get_name(cpu_oc)); + Error *err = NULL; + + object_property_set_int(cpuobj, periphbase, "reset-cbar", &err); + if (err) { + error_report("%s", error_get_pretty(err)); + exit(1); + } + object_property_set_bool(cpuobj, true, "realized", &err); + if (err) { + error_report("%s", error_get_pretty(err)); + exit(1); + } + } + + /* Create the private peripheral devices (including the GIC); + * this must happen after the CPUs are created because a15mpcore_priv + * wires itself up to the CPU's generic_timer gpio out lines. + */ + dev = qdev_create(NULL, privdev); + qdev_prop_set_uint32(dev, "num-cpu", smp_cpus); + qdev_init_nofail(dev); + busdev = SYS_BUS_DEVICE(dev); + sysbus_mmio_map(busdev, 0, periphbase); + + /* Interrupts [42:0] are from the motherboard; + * [47:43] are reserved; [63:48] are daughterboard + * peripherals. Note that some documentation numbers + * external interrupts starting from 32 (because there + * are internal interrupts 0..31). 
+ */ + for (n = 0; n < 64; n++) { + pic[n] = qdev_get_gpio_in(dev, n); + } + + /* Connect the CPUs to the GIC */ + for (n = 0; n < smp_cpus; n++) { + DeviceState *cpudev = DEVICE(qemu_get_cpu(n)); + + sysbus_connect_irq(busdev, n, qdev_get_gpio_in(cpudev, ARM_CPU_IRQ)); + } +} + static void a9_daughterboard_init(const VEDBoardInfo *daughterboard, ram_addr_t ram_size, const char *cpu_model, @@ -181,25 +240,12 @@ static void a9_daughterboard_init(const VEDBoardInfo *daughterboard, MemoryRegion *sysmem = get_system_memory(); MemoryRegion *ram = g_new(MemoryRegion, 1); MemoryRegion *lowram = g_new(MemoryRegion, 1); - DeviceState *dev; - SysBusDevice *busdev; - int n; - qemu_irq cpu_irq[4]; ram_addr_t low_ram_size; if (!cpu_model) { cpu_model = "cortex-a9"; } - for (n = 0; n < smp_cpus; n++) { - ARMCPU *cpu = cpu_arm_init(cpu_model); - if (!cpu) { - fprintf(stderr, "Unable to find CPU definition\n"); - exit(1); - } - cpu_irq[n] = qdev_get_gpio_in(DEVICE(cpu), ARM_CPU_IRQ); - } - if (ram_size > 0x40000000) { /* 1GB is the maximum the address space permits */ fprintf(stderr, "vexpress-a9: cannot model more than 1GB RAM\n"); @@ -221,23 +267,7 @@ static void a9_daughterboard_init(const VEDBoardInfo *daughterboard, memory_region_add_subregion(sysmem, 0x60000000, ram); /* 0x1e000000 A9MPCore (SCU) private memory region */ - dev = qdev_create(NULL, "a9mpcore_priv"); - qdev_prop_set_uint32(dev, "num-cpu", smp_cpus); - qdev_init_nofail(dev); - busdev = SYS_BUS_DEVICE(dev); - sysbus_mmio_map(busdev, 0, 0x1e000000); - for (n = 0; n < smp_cpus; n++) { - sysbus_connect_irq(busdev, n, cpu_irq[n]); - } - /* Interrupts [42:0] are from the motherboard; - * [47:43] are reserved; [63:48] are daughterboard - * peripherals. Note that some documentation numbers - * external interrupts starting from 32 (because the - * A9MP has internal interrupts 0..31). - */ - for (n = 0; n < 64; n++) { - pic[n] = qdev_get_gpio_in(dev, n); - } + init_cpus(cpu_model, "a9mpcore_priv", 0x1e000000, pic); /* Daughterboard peripherals : 0x10020000 .. 0x20000000 */ @@ -296,29 +326,14 @@ static void a15_daughterboard_init(const VEDBoardInfo *daughterboard, const char *cpu_model, qemu_irq *pic) { - int n; MemoryRegion *sysmem = get_system_memory(); MemoryRegion *ram = g_new(MemoryRegion, 1); MemoryRegion *sram = g_new(MemoryRegion, 1); - qemu_irq cpu_irq[4]; - DeviceState *dev; - SysBusDevice *busdev; if (!cpu_model) { cpu_model = "cortex-a15"; } - for (n = 0; n < smp_cpus; n++) { - ARMCPU *cpu; - - cpu = cpu_arm_init(cpu_model); - if (!cpu) { - fprintf(stderr, "Unable to find CPU definition\n"); - exit(1); - } - cpu_irq[n] = qdev_get_gpio_in(DEVICE(cpu), ARM_CPU_IRQ); - } - { /* We have to use a separate 64 bit variable here to avoid the gcc * "comparison is always false due to limited range of data type" @@ -337,23 +352,7 @@ static void a15_daughterboard_init(const VEDBoardInfo *daughterboard, memory_region_add_subregion(sysmem, 0x80000000, ram); /* 0x2c000000 A15MPCore private memory region (GIC) */ - dev = qdev_create(NULL, "a15mpcore_priv"); - qdev_prop_set_uint32(dev, "num-cpu", smp_cpus); - qdev_init_nofail(dev); - busdev = SYS_BUS_DEVICE(dev); - sysbus_mmio_map(busdev, 0, 0x2c000000); - for (n = 0; n < smp_cpus; n++) { - sysbus_connect_irq(busdev, n, cpu_irq[n]); - } - /* Interrupts [42:0] are from the motherboard; - * [47:43] are reserved; [63:48] are daughterboard - * peripherals. Note that some documentation numbers - * external interrupts starting from 32 (because there - * are internal interrupts 0..31). 
- */ - for (n = 0; n < 64; n++) { - pic[n] = qdev_get_gpio_in(dev, n); - } + init_cpus(cpu_model, "a15mpcore_priv", 0x2c000000, pic); /* A15 daughterboard peripherals: */ diff --git a/hw/arm/virt.c b/hw/arm/virt.c index 517f2fe30f..2bbc9313d2 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -390,6 +390,12 @@ static void machvirt_init(QEMUMachineInitArgs *args) if (n > 0) { object_property_set_bool(cpuobj, true, "start-powered-off", NULL); } + + if (object_property_find(cpuobj, "reset-cbar", NULL)) { + object_property_set_int(cpuobj, vbi->memmap[VIRT_CPUPERIPHS].base, + "reset-cbar", &error_abort); + } + object_property_set_bool(cpuobj, true, "realized", NULL); } fdt_add_cpu_nodes(vbi); diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index 502b7aa084..f9ac332f9d 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -44,7 +44,7 @@ struct TranslationBlock; typedef struct TranslationBlock TranslationBlock; /* XXX: make safe guess about sizes */ -#define MAX_OP_PER_INSTR 208 +#define MAX_OP_PER_INSTR 266 #if HOST_LONG_BITS == 32 #define MAX_OPC_PARAM_PER_ARG 2 diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h index 4b4df88527..db878c1313 100644 --- a/include/fpu/softfloat.h +++ b/include/fpu/softfloat.h @@ -245,6 +245,13 @@ INLINE flag get_default_nan_mode(float_status *status) void float_raise( int8 flags STATUS_PARAM); /*---------------------------------------------------------------------------- +| If `a' is denormal and we are in flush-to-zero mode then set the +| input-denormal exception and return zero. Otherwise just return the value. +*----------------------------------------------------------------------------*/ +float32 float32_squash_input_denormal(float32 a STATUS_PARAM); +float64 float64_squash_input_denormal(float64 a STATUS_PARAM); + +/*---------------------------------------------------------------------------- | Options to indicate which negations to perform in float*_muladd() | Using these differs from negating an input or output before calling | the muladd function in that this means that a NaN doesn't have its diff --git a/scripts/qemu-binfmt-conf.sh b/scripts/qemu-binfmt-conf.sh index 0da2618883..289b1a3963 100644 --- a/scripts/qemu-binfmt-conf.sh +++ b/scripts/qemu-binfmt-conf.sh @@ -41,6 +41,9 @@ if [ $cpu != "arm" ] ; then echo ':arm:M::\x7fELF\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x28\x00:\xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\xff:/usr/local/bin/qemu-arm:' > /proc/sys/fs/binfmt_misc/register echo ':armeb:M::\x7fELF\x01\x02\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x28:\xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff:/usr/local/bin/qemu-armeb:' > /proc/sys/fs/binfmt_misc/register fi +if [ $cpu != "aarch64" ] ; then + echo ':aarch64:M::\x7fELF\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\xb7\x00:\xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\xff:/usr/local/bin/qemu-aarch64:' > /proc/sys/fs/binfmt_misc/register +fi if [ $cpu != "sparc" ] ; then echo ':sparc:M::\x7fELF\x01\x02\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x02:\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff:/usr/local/bin/qemu-sparc:' > /proc/sys/fs/binfmt_misc/register fi diff --git a/target-arm/helper-a64.c b/target-arm/helper-a64.c index c2ce33ee88..ec0258295f 100644 --- a/target-arm/helper-a64.c +++ b/target-arm/helper-a64.c @@ -60,6 +60,11 @@ uint32_t HELPER(cls32)(uint32_t x) return 
clrsb32(x); } +uint32_t HELPER(clz32)(uint32_t x) +{ + return clz32(x); +} + uint64_t HELPER(rbit64)(uint64_t x) { /* assign the correct byte position */ @@ -180,6 +185,36 @@ uint64_t HELPER(simd_tbl)(CPUARMState *env, uint64_t result, uint64_t indices, return result; } +/* Helper function for 64 bit polynomial multiply case: + * perform PolynomialMult(op1, op2) and return either the top or + * bottom half of the 128 bit result. + */ +uint64_t HELPER(neon_pmull_64_lo)(uint64_t op1, uint64_t op2) +{ + int bitnum; + uint64_t res = 0; + + for (bitnum = 0; bitnum < 64; bitnum++) { + if (op1 & (1ULL << bitnum)) { + res ^= op2 << bitnum; + } + } + return res; +} +uint64_t HELPER(neon_pmull_64_hi)(uint64_t op1, uint64_t op2) +{ + int bitnum; + uint64_t res = 0; + + /* bit 0 of op1 can't influence the high 64 bits at all */ + for (bitnum = 1; bitnum < 64; bitnum++) { + if (op1 & (1ULL << bitnum)) { + res ^= op2 >> (64 - bitnum); + } + } + return res; +} + /* 64bit/double versions of the neon float compare functions */ uint64_t HELPER(neon_ceq_f64)(float64 a, float64 b, void *fpstp) { @@ -258,3 +293,146 @@ float64 HELPER(rsqrtsf_f64)(float64 a, float64 b, void *fpstp) } return float64_muladd(a, b, float64_three, float_muladd_halve_result, fpst); } + +/* Pairwise long add: add pairs of adjacent elements into + * double-width elements in the result (eg _s8 is an 8x8->16 op) + */ +uint64_t HELPER(neon_addlp_s8)(uint64_t a) +{ + uint64_t nsignmask = 0x0080008000800080ULL; + uint64_t wsignmask = 0x8000800080008000ULL; + uint64_t elementmask = 0x00ff00ff00ff00ffULL; + uint64_t tmp1, tmp2; + uint64_t res, signres; + + /* Extract odd elements, sign extend each to a 16 bit field */ + tmp1 = a & elementmask; + tmp1 ^= nsignmask; + tmp1 |= wsignmask; + tmp1 = (tmp1 - nsignmask) ^ wsignmask; + /* Ditto for the even elements */ + tmp2 = (a >> 8) & elementmask; + tmp2 ^= nsignmask; + tmp2 |= wsignmask; + tmp2 = (tmp2 - nsignmask) ^ wsignmask; + + /* calculate the result by summing bits 0..14, 16..22, etc, + * and then adjusting the sign bits 15, 23, etc manually. + * This ensures the addition can't overflow the 16 bit field. 
+ */ + signres = (tmp1 ^ tmp2) & wsignmask; + res = (tmp1 & ~wsignmask) + (tmp2 & ~wsignmask); + res ^= signres; + + return res; +} + +uint64_t HELPER(neon_addlp_u8)(uint64_t a) +{ + uint64_t tmp; + + tmp = a & 0x00ff00ff00ff00ffULL; + tmp += (a >> 8) & 0x00ff00ff00ff00ffULL; + return tmp; +} + +uint64_t HELPER(neon_addlp_s16)(uint64_t a) +{ + int32_t reslo, reshi; + + reslo = (int32_t)(int16_t)a + (int32_t)(int16_t)(a >> 16); + reshi = (int32_t)(int16_t)(a >> 32) + (int32_t)(int16_t)(a >> 48); + + return (uint32_t)reslo | (((uint64_t)reshi) << 32); +} + +uint64_t HELPER(neon_addlp_u16)(uint64_t a) +{ + uint64_t tmp; + + tmp = a & 0x0000ffff0000ffffULL; + tmp += (a >> 16) & 0x0000ffff0000ffffULL; + return tmp; +} + +/* Floating-point reciprocal exponent - see FPRecpX in ARM ARM */ +float32 HELPER(frecpx_f32)(float32 a, void *fpstp) +{ + float_status *fpst = fpstp; + uint32_t val32, sbit; + int32_t exp; + + if (float32_is_any_nan(a)) { + float32 nan = a; + if (float32_is_signaling_nan(a)) { + float_raise(float_flag_invalid, fpst); + nan = float32_maybe_silence_nan(a); + } + if (fpst->default_nan_mode) { + nan = float32_default_nan; + } + return nan; + } + + val32 = float32_val(a); + sbit = 0x80000000ULL & val32; + exp = extract32(val32, 23, 8); + + if (exp == 0) { + return make_float32(sbit | (0xfe << 23)); + } else { + return make_float32(sbit | (~exp & 0xff) << 23); + } +} + +float64 HELPER(frecpx_f64)(float64 a, void *fpstp) +{ + float_status *fpst = fpstp; + uint64_t val64, sbit; + int64_t exp; + + if (float64_is_any_nan(a)) { + float64 nan = a; + if (float64_is_signaling_nan(a)) { + float_raise(float_flag_invalid, fpst); + nan = float64_maybe_silence_nan(a); + } + if (fpst->default_nan_mode) { + nan = float64_default_nan; + } + return nan; + } + + val64 = float64_val(a); + sbit = 0x8000000000000000ULL & val64; + exp = extract64(float64_val(a), 52, 11); + + if (exp == 0) { + return make_float64(sbit | (0x7feULL << 52)); + } else { + return make_float64(sbit | (~exp & 0x7ffULL) << 52); + } +} + +float32 HELPER(fcvtx_f64_to_f32)(float64 a, CPUARMState *env) +{ + /* Von Neumann rounding is implemented by using round-to-zero + * and then setting the LSB of the result if Inexact was raised. 
+ */ + float32 r; + float_status *fpst = &env->vfp.fp_status; + float_status tstat = *fpst; + int exflags; + + set_float_rounding_mode(float_round_to_zero, &tstat); + set_float_exception_flags(0, &tstat); + r = float64_to_float32(a, &tstat); + r = float32_maybe_silence_nan(r); + exflags = get_float_exception_flags(&tstat); + if (exflags & float_flag_inexact) { + r = make_float32(float32_val(r) | 1); + } + exflags |= get_float_exception_flags(fpst); + set_float_exception_flags(exflags, fpst); + return r; +} diff --git a/target-arm/helper-a64.h b/target-arm/helper-a64.h index ab9933cab0..3f05bedcca 100644 --- a/target-arm/helper-a64.h +++ b/target-arm/helper-a64.h @@ -21,12 +21,15 @@ DEF_HELPER_FLAGS_2(sdiv64, TCG_CALL_NO_RWG_SE, s64, s64, s64) DEF_HELPER_FLAGS_1(clz64, TCG_CALL_NO_RWG_SE, i64, i64) DEF_HELPER_FLAGS_1(cls64, TCG_CALL_NO_RWG_SE, i64, i64) DEF_HELPER_FLAGS_1(cls32, TCG_CALL_NO_RWG_SE, i32, i32) +DEF_HELPER_FLAGS_1(clz32, TCG_CALL_NO_RWG_SE, i32, i32) DEF_HELPER_FLAGS_1(rbit64, TCG_CALL_NO_RWG_SE, i64, i64) DEF_HELPER_3(vfp_cmps_a64, i64, f32, f32, ptr) DEF_HELPER_3(vfp_cmpes_a64, i64, f32, f32, ptr) DEF_HELPER_3(vfp_cmpd_a64, i64, f64, f64, ptr) DEF_HELPER_3(vfp_cmped_a64, i64, f64, f64, ptr) DEF_HELPER_FLAGS_5(simd_tbl, TCG_CALL_NO_RWG_SE, i64, env, i64, i64, i32, i32) +DEF_HELPER_FLAGS_2(neon_pmull_64_lo, TCG_CALL_NO_RWG_SE, i64, i64, i64) +DEF_HELPER_FLAGS_2(neon_pmull_64_hi, TCG_CALL_NO_RWG_SE, i64, i64, i64) DEF_HELPER_FLAGS_3(vfp_mulxs, TCG_CALL_NO_RWG, f32, f32, f32, ptr) DEF_HELPER_FLAGS_3(vfp_mulxd, TCG_CALL_NO_RWG, f64, f64, f64, ptr) DEF_HELPER_FLAGS_3(neon_ceq_f64, TCG_CALL_NO_RWG, i64, i64, i64, ptr) @@ -36,3 +39,10 @@ DEF_HELPER_FLAGS_3(recpsf_f32, TCG_CALL_NO_RWG, f32, f32, f32, ptr) DEF_HELPER_FLAGS_3(recpsf_f64, TCG_CALL_NO_RWG, f64, f64, f64, ptr) DEF_HELPER_FLAGS_3(rsqrtsf_f32, TCG_CALL_NO_RWG, f32, f32, f32, ptr) DEF_HELPER_FLAGS_3(rsqrtsf_f64, TCG_CALL_NO_RWG, f64, f64, f64, ptr) +DEF_HELPER_FLAGS_1(neon_addlp_s8, TCG_CALL_NO_RWG_SE, i64, i64) +DEF_HELPER_FLAGS_1(neon_addlp_u8, TCG_CALL_NO_RWG_SE, i64, i64) +DEF_HELPER_FLAGS_1(neon_addlp_s16, TCG_CALL_NO_RWG_SE, i64, i64) +DEF_HELPER_FLAGS_1(neon_addlp_u16, TCG_CALL_NO_RWG_SE, i64, i64) +DEF_HELPER_FLAGS_2(frecpx_f64, TCG_CALL_NO_RWG, f64, f64, ptr) +DEF_HELPER_FLAGS_2(frecpx_f32, TCG_CALL_NO_RWG, f32, f32, ptr) +DEF_HELPER_FLAGS_2(fcvtx_f64_to_f32, TCG_CALL_NO_RWG, f32, f64, env) diff --git a/target-arm/helper.c b/target-arm/helper.c index f0a1fd48e6..55077ed1b6 100644 --- a/target-arm/helper.c +++ b/target-arm/helper.c @@ -1983,6 +1983,7 @@ void register_cp_regs_for_features(ARMCPU *cpu) ARMCPRegInfo pmcr = { .name = "PMCR", .cp = 15, .crn = 9, .crm = 12, .opc1 = 0, .opc2 = 0, .access = PL0_RW, .resetvalue = cpu->midr & 0xff000000, + .type = ARM_CP_IO, .fieldoffset = offsetof(CPUARMState, cp15.c9_pmcr), .accessfn = pmreg_access, .writefn = pmcr_write, .raw_writefn = raw_write, @@ -4519,16 +4520,21 @@ float32 HELPER(rsqrts_f32)(float32 a, float32 b, CPUARMState *env) * int->float conversions at run-time. */ #define float64_256 make_float64(0x4070000000000000LL) #define float64_512 make_float64(0x4080000000000000LL) +#define float32_maxnorm make_float32(0x7f7fffff) +#define float64_maxnorm make_float64(0x7fefffffffffffffLL) -/* The algorithm that must be used to calculate the estimate - * is specified by the ARM ARM. 
+/* Reciprocal functions + * + * The algorithm that must be used to calculate the estimate + * is specified by the ARM ARM, see FPRecipEstimate() */ -static float64 recip_estimate(float64 a, CPUARMState *env) + +static float64 recip_estimate(float64 a, float_status *real_fp_status) { /* These calculations mustn't set any fp exception flags, * so we use a local copy of the fp_status. */ - float_status dummy_status = env->vfp.standard_fp_status; + float_status dummy_status = *real_fp_status; float_status *s = &dummy_status; /* q = (int)(a * 512.0) */ float64 q = float64_mul(float64_512, a, s); @@ -4549,56 +4555,178 @@ static float64 recip_estimate(float64 a, CPUARMState *env) return float64_div(int64_to_float64(q_int, s), float64_256, s); } -float32 HELPER(recpe_f32)(float32 a, CPUARMState *env) +/* Common wrapper to call recip_estimate */ +static float64 call_recip_estimate(float64 num, int off, float_status *fpst) { - float_status *s = &env->vfp.standard_fp_status; - float64 f64; - uint32_t val32 = float32_val(a); + uint64_t val64 = float64_val(num); + uint64_t frac = extract64(val64, 0, 52); + int64_t exp = extract64(val64, 52, 11); + uint64_t sbit; + float64 scaled, estimate; - int result_exp; - int a_exp = (val32 & 0x7f800000) >> 23; - int sign = val32 & 0x80000000; + /* Generate the scaled number for the estimate function */ + if (exp == 0) { + if (extract64(frac, 51, 1) == 0) { + exp = -1; + frac = extract64(frac, 0, 50) << 2; + } else { + frac = extract64(frac, 0, 51) << 1; + } + } - if (float32_is_any_nan(a)) { - if (float32_is_signaling_nan(a)) { - float_raise(float_flag_invalid, s); + /* scaled = '0' : '01111111110' : fraction<51:44> : Zeros(44); */ + scaled = make_float64((0x3feULL << 52) + | extract64(frac, 44, 8) << 44); + + estimate = recip_estimate(scaled, fpst); + + /* Build new result */ + val64 = float64_val(estimate); + sbit = 0x8000000000000000ULL & val64; + exp = off - exp; + frac = extract64(val64, 0, 52); + + if (exp == 0) { + frac = 1ULL << 51 | extract64(frac, 1, 51); + } else if (exp == -1) { + frac = 1ULL << 50 | extract64(frac, 2, 50); + exp = 0; + } + + return make_float64(sbit | (exp << 52) | frac); +} + +static bool round_to_inf(float_status *fpst, bool sign_bit) +{ + switch (fpst->float_rounding_mode) { + case float_round_nearest_even: /* Round to Nearest */ + return true; + case float_round_up: /* Round to +Inf */ + return !sign_bit; + case float_round_down: /* Round to -Inf */ + return sign_bit; + case float_round_to_zero: /* Round to Zero */ + return false; + } + + g_assert_not_reached(); +} + +float32 HELPER(recpe_f32)(float32 input, void *fpstp) +{ + float_status *fpst = fpstp; + float32 f32 = float32_squash_input_denormal(input, fpst); + uint32_t f32_val = float32_val(f32); + uint32_t f32_sbit = 0x80000000ULL & f32_val; + int32_t f32_exp = extract32(f32_val, 23, 8); + uint32_t f32_frac = extract32(f32_val, 0, 23); + float64 f64, r64; + uint64_t r64_val; + int64_t r64_exp; + uint64_t r64_frac; + + if (float32_is_any_nan(f32)) { + float32 nan = f32; + if (float32_is_signaling_nan(f32)) { + float_raise(float_flag_invalid, fpst); + nan = float32_maybe_silence_nan(f32); } - return float32_default_nan; - } else if (float32_is_infinity(a)) { - return float32_set_sign(float32_zero, float32_is_neg(a)); - } else if (float32_is_zero_or_denormal(a)) { - if (!float32_is_zero(a)) { - float_raise(float_flag_input_denormal, s); + if (fpst->default_nan_mode) { + nan = float32_default_nan; } - float_raise(float_flag_divbyzero, s); - return 
float32_set_sign(float32_infinity, float32_is_neg(a)); - } else if (a_exp >= 253) { - float_raise(float_flag_underflow, s); - return float32_set_sign(float32_zero, float32_is_neg(a)); + return nan; + } else if (float32_is_infinity(f32)) { + return float32_set_sign(float32_zero, float32_is_neg(f32)); + } else if (float32_is_zero(f32)) { + float_raise(float_flag_divbyzero, fpst); + return float32_set_sign(float32_infinity, float32_is_neg(f32)); + } else if ((f32_val & ~(1ULL << 31)) < (1ULL << 21)) { + /* Abs(value) < 2.0^-128 */ + float_raise(float_flag_overflow | float_flag_inexact, fpst); + if (round_to_inf(fpst, f32_sbit)) { + return float32_set_sign(float32_infinity, float32_is_neg(f32)); + } else { + return float32_set_sign(float32_maxnorm, float32_is_neg(f32)); + } + } else if (f32_exp >= 253 && fpst->flush_to_zero) { + float_raise(float_flag_underflow, fpst); + return float32_set_sign(float32_zero, float32_is_neg(f32)); } - f64 = make_float64((0x3feULL << 52) - | ((int64_t)(val32 & 0x7fffff) << 29)); - result_exp = 253 - a_exp; + f64 = make_float64(((int64_t)(f32_exp) << 52) | (int64_t)(f32_frac) << 29); + r64 = call_recip_estimate(f64, 253, fpst); + r64_val = float64_val(r64); + r64_exp = extract64(r64_val, 52, 11); + r64_frac = extract64(r64_val, 0, 52); - f64 = recip_estimate(f64, env); + /* result = sign : result_exp<7:0> : fraction<51:29>; */ + return make_float32(f32_sbit | + (r64_exp & 0xff) << 23 | + extract64(r64_frac, 29, 24)); +} - val32 = sign - | ((result_exp & 0xff) << 23) - | ((float64_val(f64) >> 29) & 0x7fffff); - return make_float32(val32); +float64 HELPER(recpe_f64)(float64 input, void *fpstp) +{ + float_status *fpst = fpstp; + float64 f64 = float64_squash_input_denormal(input, fpst); + uint64_t f64_val = float64_val(f64); + uint64_t f64_sbit = 0x8000000000000000ULL & f64_val; + int64_t f64_exp = extract64(f64_val, 52, 11); + float64 r64; + uint64_t r64_val; + int64_t r64_exp; + uint64_t r64_frac; + + /* Deal with any special cases */ + if (float64_is_any_nan(f64)) { + float64 nan = f64; + if (float64_is_signaling_nan(f64)) { + float_raise(float_flag_invalid, fpst); + nan = float64_maybe_silence_nan(f64); + } + if (fpst->default_nan_mode) { + nan = float64_default_nan; + } + return nan; + } else if (float64_is_infinity(f64)) { + return float64_set_sign(float64_zero, float64_is_neg(f64)); + } else if (float64_is_zero(f64)) { + float_raise(float_flag_divbyzero, fpst); + return float64_set_sign(float64_infinity, float64_is_neg(f64)); + } else if ((f64_val & ~(1ULL << 63)) < (1ULL << 50)) { + /* Abs(value) < 2.0^-1024 */ + float_raise(float_flag_overflow | float_flag_inexact, fpst); + if (round_to_inf(fpst, f64_sbit)) { + return float64_set_sign(float64_infinity, float64_is_neg(f64)); + } else { + return float64_set_sign(float64_maxnorm, float64_is_neg(f64)); + } + } else if (f64_exp >= 1023 && fpst->flush_to_zero) { + float_raise(float_flag_underflow, fpst); + return float64_set_sign(float64_zero, float64_is_neg(f64)); + } + + r64 = call_recip_estimate(f64, 2045, fpst); + r64_val = float64_val(r64); + r64_exp = extract64(r64_val, 52, 11); + r64_frac = extract64(r64_val, 0, 52); + + /* result = sign : result_exp<10:0> : fraction<51:0> */ + return make_float64(f64_sbit | + ((r64_exp & 0x7ff) << 52) | + r64_frac); } /* The algorithm that must be used to calculate the estimate * is specified by the ARM ARM. 
*/ -static float64 recip_sqrt_estimate(float64 a, CPUARMState *env) +static float64 recip_sqrt_estimate(float64 a, float_status *real_fp_status) { /* These calculations mustn't set any fp exception flags, * so we use a local copy of the fp_status. */ - float_status dummy_status = env->vfp.standard_fp_status; + float_status dummy_status = *real_fp_status; float_status *s = &dummy_status; float64 q; int64_t q_int; @@ -4645,49 +4773,64 @@ static float64 recip_sqrt_estimate(float64 a, CPUARMState *env) return float64_div(int64_to_float64(q_int, s), float64_256, s); } -float32 HELPER(rsqrte_f32)(float32 a, CPUARMState *env) +float32 HELPER(rsqrte_f32)(float32 input, void *fpstp) { - float_status *s = &env->vfp.standard_fp_status; + float_status *s = fpstp; + float32 f32 = float32_squash_input_denormal(input, s); + uint32_t val = float32_val(f32); + uint32_t f32_sbit = 0x80000000 & val; + int32_t f32_exp = extract32(val, 23, 8); + uint32_t f32_frac = extract32(val, 0, 23); + uint64_t f64_frac; + uint64_t val64; int result_exp; float64 f64; - uint32_t val; - uint64_t val64; - - val = float32_val(a); - if (float32_is_any_nan(a)) { - if (float32_is_signaling_nan(a)) { + if (float32_is_any_nan(f32)) { + float32 nan = f32; + if (float32_is_signaling_nan(f32)) { float_raise(float_flag_invalid, s); + nan = float32_maybe_silence_nan(f32); } - return float32_default_nan; - } else if (float32_is_zero_or_denormal(a)) { - if (!float32_is_zero(a)) { - float_raise(float_flag_input_denormal, s); + if (s->default_nan_mode) { + nan = float32_default_nan; } + return nan; + } else if (float32_is_zero(f32)) { float_raise(float_flag_divbyzero, s); - return float32_set_sign(float32_infinity, float32_is_neg(a)); - } else if (float32_is_neg(a)) { + return float32_set_sign(float32_infinity, float32_is_neg(f32)); + } else if (float32_is_neg(f32)) { float_raise(float_flag_invalid, s); return float32_default_nan; - } else if (float32_is_infinity(a)) { + } else if (float32_is_infinity(f32)) { return float32_zero; } - /* Normalize to a double-precision value between 0.25 and 1.0, + /* Scale and normalize to a double-precision value between 0.25 and 1.0, * preserving the parity of the exponent. 
*/ - if ((val & 0x800000) == 0) { - f64 = make_float64(((uint64_t)(val & 0x80000000) << 32) + + f64_frac = ((uint64_t) f32_frac) << 29; + if (f32_exp == 0) { + while (extract64(f64_frac, 51, 1) == 0) { + f64_frac = f64_frac << 1; + f32_exp = f32_exp-1; + } + f64_frac = extract64(f64_frac, 0, 51) << 1; + } + + if (extract64(f32_exp, 0, 1) == 0) { + f64 = make_float64(((uint64_t) f32_sbit) << 32 | (0x3feULL << 52) - | ((uint64_t)(val & 0x7fffff) << 29)); + | f64_frac); } else { - f64 = make_float64(((uint64_t)(val & 0x80000000) << 32) + f64 = make_float64(((uint64_t) f32_sbit) << 32 | (0x3fdULL << 52) - | ((uint64_t)(val & 0x7fffff) << 29)); + | f64_frac); } - result_exp = (380 - ((val & 0x7f800000) >> 23)) / 2; + result_exp = (380 - f32_exp) / 2; - f64 = recip_sqrt_estimate(f64, env); + f64 = recip_sqrt_estimate(f64, s); val64 = float64_val(f64); @@ -4696,8 +4839,72 @@ float32 HELPER(rsqrte_f32)(float32 a, CPUARMState *env) return make_float32(val); } -uint32_t HELPER(recpe_u32)(uint32_t a, CPUARMState *env) +float64 HELPER(rsqrte_f64)(float64 input, void *fpstp) +{ + float_status *s = fpstp; + float64 f64 = float64_squash_input_denormal(input, s); + uint64_t val = float64_val(f64); + uint64_t f64_sbit = 0x8000000000000000ULL & val; + int64_t f64_exp = extract64(val, 52, 11); + uint64_t f64_frac = extract64(val, 0, 52); + int64_t result_exp; + uint64_t result_frac; + + if (float64_is_any_nan(f64)) { + float64 nan = f64; + if (float64_is_signaling_nan(f64)) { + float_raise(float_flag_invalid, s); + nan = float64_maybe_silence_nan(f64); + } + if (s->default_nan_mode) { + nan = float64_default_nan; + } + return nan; + } else if (float64_is_zero(f64)) { + float_raise(float_flag_divbyzero, s); + return float64_set_sign(float64_infinity, float64_is_neg(f64)); + } else if (float64_is_neg(f64)) { + float_raise(float_flag_invalid, s); + return float64_default_nan; + } else if (float64_is_infinity(f64)) { + return float64_zero; + } + + /* Scale and normalize to a double-precision value between 0.25 and 1.0, + * preserving the parity of the exponent. 
*/ + + if (f64_exp == 0) { + while (extract64(f64_frac, 51, 1) == 0) { + f64_frac = f64_frac << 1; + f64_exp = f64_exp - 1; + } + f64_frac = extract64(f64_frac, 0, 51) << 1; + } + + if (extract64(f64_exp, 0, 1) == 0) { + f64 = make_float64(f64_sbit + | (0x3feULL << 52) + | f64_frac); + } else { + f64 = make_float64(f64_sbit + | (0x3fdULL << 52) + | f64_frac); + } + + result_exp = (3068 - f64_exp) / 2; + + f64 = recip_sqrt_estimate(f64, s); + + result_frac = extract64(float64_val(f64), 0, 52); + + return make_float64(f64_sbit | + ((result_exp & 0x7ff) << 52) | + result_frac); +} + +uint32_t HELPER(recpe_u32)(uint32_t a, void *fpstp) { + float_status *s = fpstp; float64 f64; if ((a & 0x80000000) == 0) { @@ -4707,13 +4914,14 @@ uint32_t HELPER(recpe_u32)(uint32_t a, CPUARMState *env) f64 = make_float64((0x3feULL << 52) | ((int64_t)(a & 0x7fffffff) << 21)); - f64 = recip_estimate (f64, env); + f64 = recip_estimate(f64, s); return 0x80000000 | ((float64_val(f64) >> 21) & 0x7fffffff); } -uint32_t HELPER(rsqrte_u32)(uint32_t a, CPUARMState *env) +uint32_t HELPER(rsqrte_u32)(uint32_t a, void *fpstp) { + float_status *fpst = fpstp; float64 f64; if ((a & 0xc0000000) == 0) { @@ -4728,7 +4936,7 @@ uint32_t HELPER(rsqrte_u32)(uint32_t a, CPUARMState *env) | ((uint64_t)(a & 0x3fffffff) << 22)); } - f64 = recip_sqrt_estimate(f64, env); + f64 = recip_sqrt_estimate(f64, fpst); return 0x80000000 | ((float64_val(f64) >> 21) & 0x7fffffff); } diff --git a/target-arm/helper.h b/target-arm/helper.h index 8923f8ae71..a3d6f32b06 100644 --- a/target-arm/helper.h +++ b/target-arm/helper.h @@ -167,10 +167,12 @@ DEF_HELPER_4(vfp_muladds, f32, f32, f32, f32, ptr) DEF_HELPER_3(recps_f32, f32, f32, f32, env) DEF_HELPER_3(rsqrts_f32, f32, f32, f32, env) -DEF_HELPER_2(recpe_f32, f32, f32, env) -DEF_HELPER_2(rsqrte_f32, f32, f32, env) -DEF_HELPER_2(recpe_u32, i32, i32, env) -DEF_HELPER_2(rsqrte_u32, i32, i32, env) +DEF_HELPER_FLAGS_2(recpe_f32, TCG_CALL_NO_RWG, f32, f32, ptr) +DEF_HELPER_FLAGS_2(recpe_f64, TCG_CALL_NO_RWG, f64, f64, ptr) +DEF_HELPER_FLAGS_2(rsqrte_f32, TCG_CALL_NO_RWG, f32, f32, ptr) +DEF_HELPER_FLAGS_2(rsqrte_f64, TCG_CALL_NO_RWG, f64, f64, ptr) +DEF_HELPER_2(recpe_u32, i32, i32, ptr) +DEF_HELPER_FLAGS_2(rsqrte_u32, TCG_CALL_NO_RWG, i32, i32, ptr) DEF_HELPER_5(neon_tbl, i32, env, i32, i32, i32, i32) DEF_HELPER_3(shl_cc, i32, env, i32, i32) diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c index 2fd9113628..befffac2e3 100644 --- a/target-arm/translate-a64.c +++ b/target-arm/translate-a64.c @@ -76,11 +76,13 @@ typedef struct AArch64DecodeTable { typedef void NeonGenTwoOpFn(TCGv_i32, TCGv_i32, TCGv_i32); typedef void NeonGenTwoOpEnvFn(TCGv_i32, TCGv_ptr, TCGv_i32, TCGv_i32); typedef void NeonGenTwo64OpFn(TCGv_i64, TCGv_i64, TCGv_i64); +typedef void NeonGenTwo64OpEnvFn(TCGv_i64, TCGv_ptr, TCGv_i64, TCGv_i64); typedef void NeonGenNarrowFn(TCGv_i32, TCGv_i64); typedef void NeonGenNarrowEnvFn(TCGv_i32, TCGv_ptr, TCGv_i64); typedef void NeonGenWidenFn(TCGv_i64, TCGv_i32); typedef void NeonGenTwoSingleOPFn(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_ptr); typedef void NeonGenTwoDoubleOPFn(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_ptr); +typedef void NeonGenOneOpFn(TCGv_i64, TCGv_i64); /* initialize TCG globals. 
*/ void a64_translate_init(void) @@ -3096,12 +3098,11 @@ static void disas_add_sub_ext_reg(DisasContext *s, uint32_t insn) /* non-flag setting ops may use SP */ if (!setflags) { - tcg_rn = read_cpu_reg_sp(s, rn, sf); tcg_rd = cpu_reg_sp(s, rd); } else { - tcg_rn = read_cpu_reg(s, rn, sf); tcg_rd = cpu_reg(s, rd); } + tcg_rn = read_cpu_reg_sp(s, rn, sf); tcg_rm = read_cpu_reg(s, rm, sf); ext_and_shift_reg(tcg_rm, tcg_rm, option, imm3); @@ -5828,6 +5829,21 @@ static void handle_shli_with_ins(TCGv_i64 tcg_res, TCGv_i64 tcg_src, } } +/* SRI: shift right with insert */ +static void handle_shri_with_ins(TCGv_i64 tcg_res, TCGv_i64 tcg_src, + int size, int shift) +{ + int esize = 8 << size; + + /* shift count same as element size is valid but does nothing; + * special case to avoid potential shift by 64. + */ + if (shift != esize) { + tcg_gen_shri_i64(tcg_src, tcg_src, shift); + tcg_gen_deposit_i64(tcg_res, tcg_res, tcg_src, 0, esize - shift); + } +} + /* SSHR[RA]/USHR[RA] - Scalar shift right (optional rounding/accumulate) */ static void handle_scalar_simd_shri(DisasContext *s, bool is_u, int immh, int immb, @@ -5838,6 +5854,7 @@ static void handle_scalar_simd_shri(DisasContext *s, int shift = 2 * (8 << size) - immhb; bool accumulate = false; bool round = false; + bool insert = false; TCGv_i64 tcg_rn; TCGv_i64 tcg_rd; TCGv_i64 tcg_round; @@ -5857,6 +5874,9 @@ static void handle_scalar_simd_shri(DisasContext *s, case 0x06: /* SRSRA / URSRA (accum + rounding) */ accumulate = round = true; break; + case 0x08: /* SRI */ + insert = true; + break; } if (round) { @@ -5867,10 +5887,14 @@ static void handle_scalar_simd_shri(DisasContext *s, } tcg_rn = read_fp_dreg(s, rn); - tcg_rd = accumulate ? read_fp_dreg(s, rd) : tcg_temp_new_i64(); + tcg_rd = (accumulate || insert) ? read_fp_dreg(s, rd) : tcg_temp_new_i64(); - handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, - accumulate, is_u, size, shift); + if (insert) { + handle_shri_with_ins(tcg_rd, tcg_rn, size, shift); + } else { + handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, + accumulate, is_u, size, shift); + } write_fp_dreg(s, rd, tcg_rd); @@ -5908,6 +5932,374 @@ static void handle_scalar_simd_shli(DisasContext *s, bool insert, tcg_temp_free_i64(tcg_rd); } +/* SQSHRN/SQSHRUN - Saturating (signed/unsigned) shift right with + * (signed/unsigned) narrowing */ +static void handle_vec_simd_sqshrn(DisasContext *s, bool is_scalar, bool is_q, + bool is_u_shift, bool is_u_narrow, + int immh, int immb, int opcode, + int rn, int rd) +{ + int immhb = immh << 3 | immb; + int size = 32 - clz32(immh) - 1; + int esize = 8 << size; + int shift = (2 * esize) - immhb; + int elements = is_scalar ? 1 : (64 / esize); + bool round = extract32(opcode, 0, 1); + TCGMemOp ldop = (size + 1) | (is_u_shift ? 
0 : MO_SIGN); + TCGv_i64 tcg_rn, tcg_rd, tcg_round; + TCGv_i32 tcg_rd_narrowed; + TCGv_i64 tcg_final; + + static NeonGenNarrowEnvFn * const signed_narrow_fns[4][2] = { + { gen_helper_neon_narrow_sat_s8, + gen_helper_neon_unarrow_sat8 }, + { gen_helper_neon_narrow_sat_s16, + gen_helper_neon_unarrow_sat16 }, + { gen_helper_neon_narrow_sat_s32, + gen_helper_neon_unarrow_sat32 }, + { NULL, NULL }, + }; + static NeonGenNarrowEnvFn * const unsigned_narrow_fns[4] = { + gen_helper_neon_narrow_sat_u8, + gen_helper_neon_narrow_sat_u16, + gen_helper_neon_narrow_sat_u32, + NULL + }; + NeonGenNarrowEnvFn *narrowfn; + + int i; + + assert(size < 4); + + if (extract32(immh, 3, 1)) { + unallocated_encoding(s); + return; + } + + if (is_u_shift) { + narrowfn = unsigned_narrow_fns[size]; + } else { + narrowfn = signed_narrow_fns[size][is_u_narrow ? 1 : 0]; + } + + tcg_rn = tcg_temp_new_i64(); + tcg_rd = tcg_temp_new_i64(); + tcg_rd_narrowed = tcg_temp_new_i32(); + tcg_final = tcg_const_i64(0); + + if (round) { + uint64_t round_const = 1ULL << (shift - 1); + tcg_round = tcg_const_i64(round_const); + } else { + TCGV_UNUSED_I64(tcg_round); + } + + for (i = 0; i < elements; i++) { + read_vec_element(s, tcg_rn, rn, i, ldop); + handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, + false, is_u_shift, size+1, shift); + narrowfn(tcg_rd_narrowed, cpu_env, tcg_rd); + tcg_gen_extu_i32_i64(tcg_rd, tcg_rd_narrowed); + tcg_gen_deposit_i64(tcg_final, tcg_final, tcg_rd, esize * i, esize); + } + + if (!is_q) { + clear_vec_high(s, rd); + write_vec_element(s, tcg_final, rd, 0, MO_64); + } else { + write_vec_element(s, tcg_final, rd, 1, MO_64); + } + + if (round) { + tcg_temp_free_i64(tcg_round); + } + tcg_temp_free_i64(tcg_rn); + tcg_temp_free_i64(tcg_rd); + tcg_temp_free_i32(tcg_rd_narrowed); + tcg_temp_free_i64(tcg_final); + return; +} + +/* SQSHLU, UQSHL, SQSHL: saturating left shifts */ +static void handle_simd_qshl(DisasContext *s, bool scalar, bool is_q, + bool src_unsigned, bool dst_unsigned, + int immh, int immb, int rn, int rd) +{ + int immhb = immh << 3 | immb; + int size = 32 - clz32(immh) - 1; + int shift = immhb - (8 << size); + int pass; + + assert(immh != 0); + assert(!(scalar && is_q)); + + if (!scalar) { + if (!is_q && extract32(immh, 3, 1)) { + unallocated_encoding(s); + return; + } + + /* Since we use the variable-shift helpers we must + * replicate the shift count into each element of + * the tcg_shift value. + */ + switch (size) { + case 0: + shift |= shift << 8; + /* fall through */ + case 1: + shift |= shift << 16; + break; + case 2: + case 3: + break; + default: + g_assert_not_reached(); + } + } + + if (size == 3) { + TCGv_i64 tcg_shift = tcg_const_i64(shift); + static NeonGenTwo64OpEnvFn * const fns[2][2] = { + { gen_helper_neon_qshl_s64, gen_helper_neon_qshlu_s64 }, + { NULL, gen_helper_neon_qshl_u64 }, + }; + NeonGenTwo64OpEnvFn *genfn = fns[src_unsigned][dst_unsigned]; + int maxpass = is_q ? 
2 : 1; + + for (pass = 0; pass < maxpass; pass++) { + TCGv_i64 tcg_op = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op, rn, pass, MO_64); + genfn(tcg_op, cpu_env, tcg_op, tcg_shift); + write_vec_element(s, tcg_op, rd, pass, MO_64); + + tcg_temp_free_i64(tcg_op); + } + tcg_temp_free_i64(tcg_shift); + + if (!is_q) { + clear_vec_high(s, rd); + } + } else { + TCGv_i32 tcg_shift = tcg_const_i32(shift); + static NeonGenTwoOpEnvFn * const fns[2][2][3] = { + { + { gen_helper_neon_qshl_s8, + gen_helper_neon_qshl_s16, + gen_helper_neon_qshl_s32 }, + { gen_helper_neon_qshlu_s8, + gen_helper_neon_qshlu_s16, + gen_helper_neon_qshlu_s32 } + }, { + { NULL, NULL, NULL }, + { gen_helper_neon_qshl_u8, + gen_helper_neon_qshl_u16, + gen_helper_neon_qshl_u32 } + } + }; + NeonGenTwoOpEnvFn *genfn = fns[src_unsigned][dst_unsigned][size]; + TCGMemOp memop = scalar ? size : MO_32; + int maxpass = scalar ? 1 : is_q ? 4 : 2; + + for (pass = 0; pass < maxpass; pass++) { + TCGv_i32 tcg_op = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_op, rn, pass, memop); + genfn(tcg_op, cpu_env, tcg_op, tcg_shift); + if (scalar) { + switch (size) { + case 0: + tcg_gen_ext8u_i32(tcg_op, tcg_op); + break; + case 1: + tcg_gen_ext16u_i32(tcg_op, tcg_op); + break; + case 2: + break; + default: + g_assert_not_reached(); + } + write_fp_sreg(s, rd, tcg_op); + } else { + write_vec_element_i32(s, tcg_op, rd, pass, MO_32); + } + + tcg_temp_free_i32(tcg_op); + } + tcg_temp_free_i32(tcg_shift); + + if (!is_q && !scalar) { + clear_vec_high(s, rd); + } + } +} + +/* Common vector code for handling integer to FP conversion */ +static void handle_simd_intfp_conv(DisasContext *s, int rd, int rn, + int elements, int is_signed, + int fracbits, int size) +{ + bool is_double = size == 3 ? true : false; + TCGv_ptr tcg_fpst = get_fpstatus_ptr(); + TCGv_i32 tcg_shift = tcg_const_i32(fracbits); + TCGv_i64 tcg_int = tcg_temp_new_i64(); + TCGMemOp mop = size | (is_signed ? MO_SIGN : 0); + int pass; + + for (pass = 0; pass < elements; pass++) { + read_vec_element(s, tcg_int, rn, pass, mop); + + if (is_double) { + TCGv_i64 tcg_double = tcg_temp_new_i64(); + if (is_signed) { + gen_helper_vfp_sqtod(tcg_double, tcg_int, + tcg_shift, tcg_fpst); + } else { + gen_helper_vfp_uqtod(tcg_double, tcg_int, + tcg_shift, tcg_fpst); + } + if (elements == 1) { + write_fp_dreg(s, rd, tcg_double); + } else { + write_vec_element(s, tcg_double, rd, pass, MO_64); + } + tcg_temp_free_i64(tcg_double); + } else { + TCGv_i32 tcg_single = tcg_temp_new_i32(); + if (is_signed) { + gen_helper_vfp_sqtos(tcg_single, tcg_int, + tcg_shift, tcg_fpst); + } else { + gen_helper_vfp_uqtos(tcg_single, tcg_int, + tcg_shift, tcg_fpst); + } + if (elements == 1) { + write_fp_sreg(s, rd, tcg_single); + } else { + write_vec_element_i32(s, tcg_single, rd, pass, MO_32); + } + tcg_temp_free_i32(tcg_single); + } + } + + if (!is_double && elements == 2) { + clear_vec_high(s, rd); + } + + tcg_temp_free_i64(tcg_int); + tcg_temp_free_ptr(tcg_fpst); + tcg_temp_free_i32(tcg_shift); +} + +/* UCVTF/SCVTF - Integer to FP conversion */ +static void handle_simd_shift_intfp_conv(DisasContext *s, bool is_scalar, + bool is_q, bool is_u, + int immh, int immb, int opcode, + int rn, int rd) +{ + bool is_double = extract32(immh, 3, 1); + int size = is_double ? MO_64 : MO_32; + int elements; + int immhb = immh << 3 | immb; + int fracbits = (is_double ? 128 : 64) - immhb; + + if (!extract32(immh, 2, 2)) { + unallocated_encoding(s); + return; + } + + if (is_scalar) { + elements = 1; + } else { + elements = is_double ? 
2 : is_q ? 4 : 2; + if (is_double && !is_q) { + unallocated_encoding(s); + return; + } + } + /* immh == 0 would be a failure of the decode logic */ + g_assert(immh); + + handle_simd_intfp_conv(s, rd, rn, elements, !is_u, fracbits, size); +} + +/* FCVTZS, FVCVTZU - FP to fixedpoint conversion */ +static void handle_simd_shift_fpint_conv(DisasContext *s, bool is_scalar, + bool is_q, bool is_u, + int immh, int immb, int rn, int rd) +{ + bool is_double = extract32(immh, 3, 1); + int immhb = immh << 3 | immb; + int fracbits = (is_double ? 128 : 64) - immhb; + int pass; + TCGv_ptr tcg_fpstatus; + TCGv_i32 tcg_rmode, tcg_shift; + + if (!extract32(immh, 2, 2)) { + unallocated_encoding(s); + return; + } + + if (!is_scalar && !is_q && is_double) { + unallocated_encoding(s); + return; + } + + assert(!(is_scalar && is_q)); + + tcg_rmode = tcg_const_i32(arm_rmode_to_sf(FPROUNDING_ZERO)); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env); + tcg_fpstatus = get_fpstatus_ptr(); + tcg_shift = tcg_const_i32(fracbits); + + if (is_double) { + int maxpass = is_scalar ? 1 : is_q ? 2 : 1; + + for (pass = 0; pass < maxpass; pass++) { + TCGv_i64 tcg_op = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op, rn, pass, MO_64); + if (is_u) { + gen_helper_vfp_touqd(tcg_op, tcg_op, tcg_shift, tcg_fpstatus); + } else { + gen_helper_vfp_tosqd(tcg_op, tcg_op, tcg_shift, tcg_fpstatus); + } + write_vec_element(s, tcg_op, rd, pass, MO_64); + tcg_temp_free_i64(tcg_op); + } + if (!is_q) { + clear_vec_high(s, rd); + } + } else { + int maxpass = is_scalar ? 1 : is_q ? 4 : 2; + for (pass = 0; pass < maxpass; pass++) { + TCGv_i32 tcg_op = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_op, rn, pass, MO_32); + if (is_u) { + gen_helper_vfp_touls(tcg_op, tcg_op, tcg_shift, tcg_fpstatus); + } else { + gen_helper_vfp_tosls(tcg_op, tcg_op, tcg_shift, tcg_fpstatus); + } + if (is_scalar) { + write_fp_sreg(s, rd, tcg_op); + } else { + write_vec_element_i32(s, tcg_op, rd, pass, MO_32); + } + tcg_temp_free_i32(tcg_op); + } + if (!is_q && !is_scalar) { + clear_vec_high(s, rd); + } + } + + tcg_temp_free_ptr(tcg_fpstatus); + tcg_temp_free_i32(tcg_shift); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env); + tcg_temp_free_i32(tcg_rmode); +} + /* C3.6.9 AdvSIMD scalar shift by immediate * 31 30 29 28 23 22 19 18 16 15 11 10 9 5 4 0 * +-----+---+-------------+------+------+--------+---+------+------+ @@ -5925,7 +6317,18 @@ static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn) int immh = extract32(insn, 19, 4); bool is_u = extract32(insn, 29, 1); + if (immh == 0) { + unallocated_encoding(s); + return; + } + switch (opcode) { + case 0x08: /* SRI */ + if (!is_u) { + unallocated_encoding(s); + return; + } + /* fall through */ case 0x00: /* SSHR / USHR */ case 0x02: /* SSRA / USRA */ case 0x04: /* SRSHR / URSHR */ @@ -5935,8 +6338,39 @@ static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn) case 0x0a: /* SHL / SLI */ handle_scalar_simd_shli(s, is_u, immh, immb, opcode, rn, rd); break; + case 0x1c: /* SCVTF, UCVTF */ + handle_simd_shift_intfp_conv(s, true, false, is_u, immh, immb, + opcode, rn, rd); + break; + case 0x10: /* SQSHRUN, SQSHRUN2 */ + case 0x11: /* SQRSHRUN, SQRSHRUN2 */ + if (!is_u) { + unallocated_encoding(s); + return; + } + handle_vec_simd_sqshrn(s, true, false, false, true, + immh, immb, opcode, rn, rd); + break; + case 0x12: /* SQSHRN, SQSHRN2, UQSHRN */ + case 0x13: /* SQRSHRN, SQRSHRN2, UQRSHRN, UQRSHRN2 */ + handle_vec_simd_sqshrn(s, true, false, is_u, is_u, + immh, immb, opcode, rn, rd); + 
break; + case 0xc: /* SQSHLU */ + if (!is_u) { + unallocated_encoding(s); + return; + } + handle_simd_qshl(s, true, false, false, true, immh, immb, rn, rd); + break; + case 0xe: /* SQSHL, UQSHL */ + handle_simd_qshl(s, true, false, is_u, is_u, immh, immb, rn, rd); + break; + case 0x1f: /* FCVTZS, FCVTZU */ + handle_simd_shift_fpint_conv(s, true, false, is_u, immh, immb, rn, rd); + break; default: - unsupported_encoding(s, insn); + unallocated_encoding(s); break; } } @@ -6483,15 +6917,25 @@ static void disas_simd_scalar_three_reg_same(DisasContext *s, uint32_t insn) } static void handle_2misc_64(DisasContext *s, int opcode, bool u, - TCGv_i64 tcg_rd, TCGv_i64 tcg_rn) + TCGv_i64 tcg_rd, TCGv_i64 tcg_rn, + TCGv_i32 tcg_rmode, TCGv_ptr tcg_fpstatus) { /* Handle 64->64 opcodes which are shared between the scalar and * vector 2-reg-misc groups. We cover every integer opcode where size == 3 * is valid in either group and also the double-precision fp ops. + * The caller only need provide tcg_rmode and tcg_fpstatus if the op + * requires them. */ TCGCond cond; switch (opcode) { + case 0x4: /* CLS, CLZ */ + if (u) { + gen_helper_clz64(tcg_rd, tcg_rn); + } else { + gen_helper_cls64(tcg_rd, tcg_rn); + } + break; case 0x5: /* NOT */ /* This opcode is shared with CNT and RBIT but we have earlier * enforced that size == 3 if and only if this is the NOT insn. @@ -6531,6 +6975,42 @@ static void handle_2misc_64(DisasContext *s, int opcode, bool u, case 0x6f: /* FNEG */ gen_helper_vfp_negd(tcg_rd, tcg_rn); break; + case 0x7f: /* FSQRT */ + gen_helper_vfp_sqrtd(tcg_rd, tcg_rn, cpu_env); + break; + case 0x1a: /* FCVTNS */ + case 0x1b: /* FCVTMS */ + case 0x1c: /* FCVTAS */ + case 0x3a: /* FCVTPS */ + case 0x3b: /* FCVTZS */ + { + TCGv_i32 tcg_shift = tcg_const_i32(0); + gen_helper_vfp_tosqd(tcg_rd, tcg_rn, tcg_shift, tcg_fpstatus); + tcg_temp_free_i32(tcg_shift); + break; + } + case 0x5a: /* FCVTNU */ + case 0x5b: /* FCVTMU */ + case 0x5c: /* FCVTAU */ + case 0x7a: /* FCVTPU */ + case 0x7b: /* FCVTZU */ + { + TCGv_i32 tcg_shift = tcg_const_i32(0); + gen_helper_vfp_touqd(tcg_rd, tcg_rn, tcg_shift, tcg_fpstatus); + tcg_temp_free_i32(tcg_shift); + break; + } + case 0x18: /* FRINTN */ + case 0x19: /* FRINTM */ + case 0x38: /* FRINTP */ + case 0x39: /* FRINTZ */ + case 0x58: /* FRINTA */ + case 0x79: /* FRINTI */ + gen_helper_rintd(tcg_rd, tcg_rn, tcg_fpstatus); + break; + case 0x59: /* FRINTX */ + gen_helper_rintd_exact(tcg_rd, tcg_rn, tcg_fpstatus); + break; default: g_assert_not_reached(); } @@ -6645,6 +7125,194 @@ static void handle_2misc_fcmp_zero(DisasContext *s, int opcode, tcg_temp_free_ptr(fpst); } +static void handle_2misc_reciprocal(DisasContext *s, int opcode, + bool is_scalar, bool is_u, bool is_q, + int size, int rn, int rd) +{ + bool is_double = (size == 3); + TCGv_ptr fpst = get_fpstatus_ptr(); + + if (is_double) { + TCGv_i64 tcg_op = tcg_temp_new_i64(); + TCGv_i64 tcg_res = tcg_temp_new_i64(); + int pass; + + for (pass = 0; pass < (is_scalar ? 
1 : 2); pass++) { + read_vec_element(s, tcg_op, rn, pass, MO_64); + switch (opcode) { + case 0x3d: /* FRECPE */ + gen_helper_recpe_f64(tcg_res, tcg_op, fpst); + break; + case 0x3f: /* FRECPX */ + gen_helper_frecpx_f64(tcg_res, tcg_op, fpst); + break; + case 0x7d: /* FRSQRTE */ + gen_helper_rsqrte_f64(tcg_res, tcg_op, fpst); + break; + default: + g_assert_not_reached(); + } + write_vec_element(s, tcg_res, rd, pass, MO_64); + } + if (is_scalar) { + clear_vec_high(s, rd); + } + + tcg_temp_free_i64(tcg_res); + tcg_temp_free_i64(tcg_op); + } else { + TCGv_i32 tcg_op = tcg_temp_new_i32(); + TCGv_i32 tcg_res = tcg_temp_new_i32(); + int pass, maxpasses; + + if (is_scalar) { + maxpasses = 1; + } else { + maxpasses = is_q ? 4 : 2; + } + + for (pass = 0; pass < maxpasses; pass++) { + read_vec_element_i32(s, tcg_op, rn, pass, MO_32); + + switch (opcode) { + case 0x3c: /* URECPE */ + gen_helper_recpe_u32(tcg_res, tcg_op, fpst); + break; + case 0x3d: /* FRECPE */ + gen_helper_recpe_f32(tcg_res, tcg_op, fpst); + break; + case 0x3f: /* FRECPX */ + gen_helper_frecpx_f32(tcg_res, tcg_op, fpst); + break; + case 0x7d: /* FRSQRTE */ + gen_helper_rsqrte_f32(tcg_res, tcg_op, fpst); + break; + default: + g_assert_not_reached(); + } + + if (is_scalar) { + write_fp_sreg(s, rd, tcg_res); + } else { + write_vec_element_i32(s, tcg_res, rd, pass, MO_32); + } + } + tcg_temp_free_i32(tcg_res); + tcg_temp_free_i32(tcg_op); + if (!is_q && !is_scalar) { + clear_vec_high(s, rd); + } + } + tcg_temp_free_ptr(fpst); +} + +static void handle_2misc_narrow(DisasContext *s, bool scalar, + int opcode, bool u, bool is_q, + int size, int rn, int rd) +{ + /* Handle 2-reg-misc ops which are narrowing (so each 2*size element + * in the source becomes a size element in the destination). + */ + int pass; + TCGv_i32 tcg_res[2]; + int destelt = is_q ? 2 : 0; + int passes = scalar ? 
1 : 2; + + if (scalar) { + tcg_res[1] = tcg_const_i32(0); + } + + for (pass = 0; pass < passes; pass++) { + TCGv_i64 tcg_op = tcg_temp_new_i64(); + NeonGenNarrowFn *genfn = NULL; + NeonGenNarrowEnvFn *genenvfn = NULL; + + if (scalar) { + read_vec_element(s, tcg_op, rn, pass, size + 1); + } else { + read_vec_element(s, tcg_op, rn, pass, MO_64); + } + tcg_res[pass] = tcg_temp_new_i32(); + + switch (opcode) { + case 0x12: /* XTN, SQXTUN */ + { + static NeonGenNarrowFn * const xtnfns[3] = { + gen_helper_neon_narrow_u8, + gen_helper_neon_narrow_u16, + tcg_gen_trunc_i64_i32, + }; + static NeonGenNarrowEnvFn * const sqxtunfns[3] = { + gen_helper_neon_unarrow_sat8, + gen_helper_neon_unarrow_sat16, + gen_helper_neon_unarrow_sat32, + }; + if (u) { + genenvfn = sqxtunfns[size]; + } else { + genfn = xtnfns[size]; + } + break; + } + case 0x14: /* SQXTN, UQXTN */ + { + static NeonGenNarrowEnvFn * const fns[3][2] = { + { gen_helper_neon_narrow_sat_s8, + gen_helper_neon_narrow_sat_u8 }, + { gen_helper_neon_narrow_sat_s16, + gen_helper_neon_narrow_sat_u16 }, + { gen_helper_neon_narrow_sat_s32, + gen_helper_neon_narrow_sat_u32 }, + }; + genenvfn = fns[size][u]; + break; + } + case 0x16: /* FCVTN, FCVTN2 */ + /* 32 bit to 16 bit or 64 bit to 32 bit float conversion */ + if (size == 2) { + gen_helper_vfp_fcvtsd(tcg_res[pass], tcg_op, cpu_env); + } else { + TCGv_i32 tcg_lo = tcg_temp_new_i32(); + TCGv_i32 tcg_hi = tcg_temp_new_i32(); + tcg_gen_trunc_i64_i32(tcg_lo, tcg_op); + gen_helper_vfp_fcvt_f32_to_f16(tcg_lo, tcg_lo, cpu_env); + tcg_gen_shri_i64(tcg_op, tcg_op, 32); + tcg_gen_trunc_i64_i32(tcg_hi, tcg_op); + gen_helper_vfp_fcvt_f32_to_f16(tcg_hi, tcg_hi, cpu_env); + tcg_gen_deposit_i32(tcg_res[pass], tcg_lo, tcg_hi, 16, 16); + tcg_temp_free_i32(tcg_lo); + tcg_temp_free_i32(tcg_hi); + } + break; + case 0x56: /* FCVTXN, FCVTXN2 */ + /* 64 bit to 32 bit float conversion + * with von Neumann rounding (round to odd) + */ + assert(size == 2); + gen_helper_fcvtx_f64_to_f32(tcg_res[pass], tcg_op, cpu_env); + break; + default: + g_assert_not_reached(); + } + + if (genfn) { + genfn(tcg_res[pass], tcg_op); + } else if (genenvfn) { + genenvfn(tcg_res[pass], cpu_env, tcg_op); + } + + tcg_temp_free_i64(tcg_op); + } + + for (pass = 0; pass < 2; pass++) { + write_vec_element_i32(s, tcg_res[pass], rd, destelt + pass, MO_32); + tcg_temp_free_i32(tcg_res[pass]); + } + if (!is_q) { + clear_vec_high(s, rd); + } +} + /* C3.6.12 AdvSIMD scalar two reg misc * 31 30 29 28 24 23 22 21 17 16 12 11 10 9 5 4 0 * +-----+---+-----------+------+-----------+--------+-----+------+------+ @@ -6658,6 +7326,10 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn) int opcode = extract32(insn, 12, 5); int size = extract32(insn, 22, 2); bool u = extract32(insn, 29, 1); + bool is_fcvt = false; + int rmode; + TCGv_i32 tcg_rmode; + TCGv_ptr tcg_fpstatus; switch (opcode) { case 0xa: /* CMLT */ @@ -6674,6 +7346,19 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn) return; } break; + case 0x12: /* SQXTUN */ + if (u) { + unallocated_encoding(s); + return; + } + /* fall through */ + case 0x14: /* SQXTN, UQXTN */ + if (size == 3) { + unallocated_encoding(s); + return; + } + handle_2misc_narrow(s, true, opcode, u, false, size, rn, rd); + return; case 0xc ... 0xf: case 0x16 ... 
0x1d: case 0x1f: @@ -6690,23 +7375,41 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn) case 0x6d: /* FCMLE (zero) */ handle_2misc_fcmp_zero(s, opcode, true, u, true, size, rn, rd); return; + case 0x1d: /* SCVTF */ + case 0x5d: /* UCVTF */ + { + bool is_signed = (opcode == 0x1d); + handle_simd_intfp_conv(s, rd, rn, 1, is_signed, 0, size); + return; + } + case 0x3d: /* FRECPE */ + case 0x3f: /* FRECPX */ + case 0x7d: /* FRSQRTE */ + handle_2misc_reciprocal(s, opcode, true, u, true, size, rn, rd); + return; case 0x1a: /* FCVTNS */ case 0x1b: /* FCVTMS */ - case 0x1c: /* FCVTAS */ - case 0x1d: /* SCVTF */ case 0x3a: /* FCVTPS */ case 0x3b: /* FCVTZS */ - case 0x3d: /* FRECPE */ - case 0x3f: /* FRECPX */ - case 0x56: /* FCVTXN, FCVTXN2 */ case 0x5a: /* FCVTNU */ case 0x5b: /* FCVTMU */ - case 0x5c: /* FCVTAU */ - case 0x5d: /* UCVTF */ case 0x7a: /* FCVTPU */ case 0x7b: /* FCVTZU */ - case 0x7d: /* FRSQRTE */ - unsupported_encoding(s, insn); + is_fcvt = true; + rmode = extract32(opcode, 5, 1) | (extract32(opcode, 0, 1) << 1); + break; + case 0x1c: /* FCVTAS */ + case 0x5c: /* FCVTAU */ + /* TIEAWAY doesn't fit in the usual rounding mode encoding */ + is_fcvt = true; + rmode = FPROUNDING_TIEAWAY; + break; + case 0x56: /* FCVTXN, FCVTXN2 */ + if (size == 2) { + unallocated_encoding(s); + return; + } + handle_2misc_narrow(s, true, opcode, u, false, size - 1, rn, rd); return; default: unallocated_encoding(s); @@ -6716,25 +7419,71 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn) default: /* Other categories of encoding in this class: * + SUQADD/USQADD/SQABS/SQNEG : size 8, 16, 32 or 64 - * + SQXTN/SQXTN2/SQXTUN/SQXTUN2/UQXTN/UQXTN2: - * narrowing saturate ops: size 64/32/16 -> 32/16/8 */ unsupported_encoding(s, insn); return; } + if (is_fcvt) { + tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode)); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env); + tcg_fpstatus = get_fpstatus_ptr(); + } else { + TCGV_UNUSED_I32(tcg_rmode); + TCGV_UNUSED_PTR(tcg_fpstatus); + } + if (size == 3) { TCGv_i64 tcg_rn = read_fp_dreg(s, rn); TCGv_i64 tcg_rd = tcg_temp_new_i64(); - handle_2misc_64(s, opcode, u, tcg_rd, tcg_rn); + handle_2misc_64(s, opcode, u, tcg_rd, tcg_rn, tcg_rmode, tcg_fpstatus); write_fp_dreg(s, rd, tcg_rd); tcg_temp_free_i64(tcg_rd); tcg_temp_free_i64(tcg_rn); + } else if (size == 2) { + TCGv_i32 tcg_rn = read_fp_sreg(s, rn); + TCGv_i32 tcg_rd = tcg_temp_new_i32(); + + switch (opcode) { + case 0x1a: /* FCVTNS */ + case 0x1b: /* FCVTMS */ + case 0x1c: /* FCVTAS */ + case 0x3a: /* FCVTPS */ + case 0x3b: /* FCVTZS */ + { + TCGv_i32 tcg_shift = tcg_const_i32(0); + gen_helper_vfp_tosls(tcg_rd, tcg_rn, tcg_shift, tcg_fpstatus); + tcg_temp_free_i32(tcg_shift); + break; + } + case 0x5a: /* FCVTNU */ + case 0x5b: /* FCVTMU */ + case 0x5c: /* FCVTAU */ + case 0x7a: /* FCVTPU */ + case 0x7b: /* FCVTZU */ + { + TCGv_i32 tcg_shift = tcg_const_i32(0); + gen_helper_vfp_touls(tcg_rd, tcg_rn, tcg_shift, tcg_fpstatus); + tcg_temp_free_i32(tcg_shift); + break; + } + default: + g_assert_not_reached(); + } + + write_fp_sreg(s, rd, tcg_rd); + tcg_temp_free_i32(tcg_rd); + tcg_temp_free_i32(tcg_rn); } else { - /* the 'size might not be 64' ops aren't implemented yet */ g_assert_not_reached(); } + + if (is_fcvt) { + gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env); + tcg_temp_free_i32(tcg_rmode); + tcg_temp_free_ptr(tcg_fpstatus); + } } /* SSHR[RA]/USHR[RA] - Vector shift right (optional rounding/accumulate) */ @@ -6746,6 +7495,7 @@ static void 
handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u, int shift = 2 * (8 << size) - immhb; bool accumulate = false; bool round = false; + bool insert = false; int dsize = is_q ? 128 : 64; int esize = 8 << size; int elements = dsize/esize; @@ -6775,6 +7525,9 @@ static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u, case 0x06: /* SRSRA / URSRA (accum + rounding) */ accumulate = round = true; break; + case 0x08: /* SRI */ + insert = true; + break; } if (round) { @@ -6786,12 +7539,16 @@ static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u, for (i = 0; i < elements; i++) { read_vec_element(s, tcg_rn, rn, i, memop); - if (accumulate) { + if (accumulate || insert) { read_vec_element(s, tcg_rd, rd, i, memop); } - handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, - accumulate, is_u, size, shift); + if (insert) { + handle_shri_with_ins(tcg_rd, tcg_rn, size, shift); + } else { + handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, + accumulate, is_u, size, shift); + } write_vec_element(s, tcg_rd, rd, i, size); } @@ -6878,6 +7635,62 @@ static void handle_vec_simd_wshli(DisasContext *s, bool is_q, bool is_u, } } +/* SHRN/RSHRN - Shift right with narrowing (and potential rounding) */ +static void handle_vec_simd_shrn(DisasContext *s, bool is_q, + int immh, int immb, int opcode, int rn, int rd) +{ + int immhb = immh << 3 | immb; + int size = 32 - clz32(immh) - 1; + int dsize = 64; + int esize = 8 << size; + int elements = dsize/esize; + int shift = (2 * esize) - immhb; + bool round = extract32(opcode, 0, 1); + TCGv_i64 tcg_rn, tcg_rd, tcg_final; + TCGv_i64 tcg_round; + int i; + + if (extract32(immh, 3, 1)) { + unallocated_encoding(s); + return; + } + + tcg_rn = tcg_temp_new_i64(); + tcg_rd = tcg_temp_new_i64(); + tcg_final = tcg_temp_new_i64(); + read_vec_element(s, tcg_final, rd, is_q ? 
1 : 0, MO_64); + + if (round) { + uint64_t round_const = 1ULL << (shift - 1); + tcg_round = tcg_const_i64(round_const); + } else { + TCGV_UNUSED_I64(tcg_round); + } + + for (i = 0; i < elements; i++) { + read_vec_element(s, tcg_rn, rn, i, size+1); + handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, + false, true, size+1, shift); + + tcg_gen_deposit_i64(tcg_final, tcg_final, tcg_rd, esize * i, esize); + } + + if (!is_q) { + clear_vec_high(s, rd); + write_vec_element(s, tcg_final, rd, 0, MO_64); + } else { + write_vec_element(s, tcg_final, rd, 1, MO_64); + } + + if (round) { + tcg_temp_free_i64(tcg_round); + } + tcg_temp_free_i64(tcg_rn); + tcg_temp_free_i64(tcg_rd); + tcg_temp_free_i64(tcg_final); + return; +} + /* C3.6.14 AdvSIMD shift by immediate * 31 30 29 28 23 22 19 18 16 15 11 10 9 5 4 0 @@ -6896,6 +7709,12 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn) bool is_q = extract32(insn, 30, 1); switch (opcode) { + case 0x08: /* SRI */ + if (!is_u) { + unallocated_encoding(s); + return; + } + /* fall through */ case 0x00: /* SSHR / USHR */ case 0x02: /* SSRA / USRA (accumulate) */ case 0x04: /* SRSHR / URSHR (rounding) */ @@ -6905,15 +7724,42 @@ static void disas_simd_shift_imm(DisasContext *s, uint32_t insn) case 0x0a: /* SHL / SLI */ handle_vec_simd_shli(s, is_q, is_u, immh, immb, opcode, rn, rd); break; + case 0x10: /* SHRN */ + case 0x11: /* RSHRN / SQRSHRUN */ + if (is_u) { + handle_vec_simd_sqshrn(s, false, is_q, false, true, immh, immb, + opcode, rn, rd); + } else { + handle_vec_simd_shrn(s, is_q, immh, immb, opcode, rn, rd); + } + break; + case 0x12: /* SQSHRN / UQSHRN */ + case 0x13: /* SQRSHRN / UQRSHRN */ + handle_vec_simd_sqshrn(s, false, is_q, is_u, is_u, immh, immb, + opcode, rn, rd); + break; case 0x14: /* SSHLL / USHLL */ handle_vec_simd_wshli(s, is_q, is_u, immh, immb, opcode, rn, rd); break; + case 0x1c: /* SCVTF / UCVTF */ + handle_simd_shift_intfp_conv(s, false, is_q, is_u, immh, immb, + opcode, rn, rd); + break; + case 0xc: /* SQSHLU */ + if (!is_u) { + unallocated_encoding(s); + return; + } + handle_simd_qshl(s, false, is_q, false, true, immh, immb, rn, rd); + break; + case 0xe: /* SQSHL, UQSHL */ + handle_simd_qshl(s, false, is_q, is_u, is_u, immh, immb, rn, rd); + break; + case 0x1f: /* FCVTZS/ FCVTZU */ + handle_simd_shift_fpint_conv(s, false, is_q, is_u, immh, immb, rn, rd); + return; default: - /* We don't currently implement any of the Narrow or saturating shifts; - * nor do we implement the fixed-point conversions in this - * encoding group (SCVTF, FCVTZS, UCVTF, FCVTZU). - */ - unsupported_encoding(s, insn); + unallocated_encoding(s); return; } } @@ -7124,6 +7970,10 @@ static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size, gen_helper_neon_addl_saturate_s32(tcg_passres, cpu_env, tcg_passres, tcg_passres); break; + case 14: /* PMULL */ + assert(size == 0); + gen_helper_neon_mull_p8(tcg_passres, tcg_op1, tcg_op2); + break; default: g_assert_not_reached(); } @@ -7243,6 +8093,30 @@ static void handle_3rd_narrowing(DisasContext *s, int is_q, int is_u, int size, } } +static void handle_pmull_64(DisasContext *s, int is_q, int rd, int rn, int rm) +{ + /* PMULL of 64 x 64 -> 128 is an odd special case because it + * is the only three-reg-diff instruction which produces a + * 128-bit wide result from a single operation. However since + * it's possible to calculate the two halves more or less + * separately we just use two helper calls. 
+ */ + TCGv_i64 tcg_op1 = tcg_temp_new_i64(); + TCGv_i64 tcg_op2 = tcg_temp_new_i64(); + TCGv_i64 tcg_res = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op1, rn, is_q, MO_64); + read_vec_element(s, tcg_op2, rm, is_q, MO_64); + gen_helper_neon_pmull_64_lo(tcg_res, tcg_op1, tcg_op2); + write_vec_element(s, tcg_res, rd, 0, MO_64); + gen_helper_neon_pmull_64_hi(tcg_res, tcg_op1, tcg_op2); + write_vec_element(s, tcg_res, rd, 1, MO_64); + + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + tcg_temp_free_i64(tcg_res); +} + /* C3.6.15 AdvSIMD three different * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 0 * +---+---+---+-----------+------+---+------+--------+-----+------+------+ @@ -7293,8 +8167,15 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn) unallocated_encoding(s); return; } - unsupported_encoding(s, insn); - break; + if (size == 3) { + if (!arm_dc_feature(s, ARM_FEATURE_V8_AES)) { + unallocated_encoding(s); + return; + } + handle_pmull_64(s, is_q, rd, rn, rm); + return; + } + goto is_widening; case 9: /* SQDMLAL, SQDMLAL2 */ case 11: /* SQDMLSL, SQDMLSL2 */ case 13: /* SQDMULL, SQDMULL2 */ @@ -7315,6 +8196,7 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn) unallocated_encoding(s); return; } + is_widening: handle_3rd_widening(s, is_q, is_u, size, opcode, rd, rn, rm); break; default: @@ -7991,76 +8873,48 @@ static void disas_simd_three_reg_same(DisasContext *s, uint32_t insn) } } -static void handle_2misc_narrow(DisasContext *s, int opcode, bool u, bool is_q, - int size, int rn, int rd) +static void handle_2misc_widening(DisasContext *s, int opcode, bool is_q, + int size, int rn, int rd) { - /* Handle 2-reg-misc ops which are narrowing (so each 2*size element - * in the source becomes a size element in the destination). + /* Handle 2-reg-misc ops which are widening (so each size element + * in the source becomes a 2*size element in the destination. + * The only instruction like this is FCVTL. */ int pass; - TCGv_i32 tcg_res[2]; - int destelt = is_q ? 2 : 0; - for (pass = 0; pass < 2; pass++) { - TCGv_i64 tcg_op = tcg_temp_new_i64(); - NeonGenNarrowFn *genfn = NULL; - NeonGenNarrowEnvFn *genenvfn = NULL; + if (size == 3) { + /* 32 -> 64 bit fp conversion */ + TCGv_i64 tcg_res[2]; + int srcelt = is_q ? 
2 : 0; - read_vec_element(s, tcg_op, rn, pass, MO_64); - tcg_res[pass] = tcg_temp_new_i32(); + for (pass = 0; pass < 2; pass++) { + TCGv_i32 tcg_op = tcg_temp_new_i32(); + tcg_res[pass] = tcg_temp_new_i64(); - switch (opcode) { - case 0x12: /* XTN, SQXTUN */ - { - static NeonGenNarrowFn * const xtnfns[3] = { - gen_helper_neon_narrow_u8, - gen_helper_neon_narrow_u16, - tcg_gen_trunc_i64_i32, - }; - static NeonGenNarrowEnvFn * const sqxtunfns[3] = { - gen_helper_neon_unarrow_sat8, - gen_helper_neon_unarrow_sat16, - gen_helper_neon_unarrow_sat32, - }; - if (u) { - genenvfn = sqxtunfns[size]; - } else { - genfn = xtnfns[size]; - } - break; - } - case 0x14: /* SQXTN, UQXTN */ - { - static NeonGenNarrowEnvFn * const fns[3][2] = { - { gen_helper_neon_narrow_sat_s8, - gen_helper_neon_narrow_sat_u8 }, - { gen_helper_neon_narrow_sat_s16, - gen_helper_neon_narrow_sat_u16 }, - { gen_helper_neon_narrow_sat_s32, - gen_helper_neon_narrow_sat_u32 }, - }; - genenvfn = fns[size][u]; - break; - } - default: - g_assert_not_reached(); + read_vec_element_i32(s, tcg_op, rn, srcelt + pass, MO_32); + gen_helper_vfp_fcvtds(tcg_res[pass], tcg_op, cpu_env); + tcg_temp_free_i32(tcg_op); } - - if (genfn) { - genfn(tcg_res[pass], tcg_op); - } else { - genenvfn(tcg_res[pass], cpu_env, tcg_op); + for (pass = 0; pass < 2; pass++) { + write_vec_element(s, tcg_res[pass], rd, pass, MO_64); + tcg_temp_free_i64(tcg_res[pass]); } + } else { + /* 16 -> 32 bit fp conversion */ + int srcelt = is_q ? 4 : 0; + TCGv_i32 tcg_res[4]; - tcg_temp_free_i64(tcg_op); - } + for (pass = 0; pass < 4; pass++) { + tcg_res[pass] = tcg_temp_new_i32(); - for (pass = 0; pass < 2; pass++) { - write_vec_element_i32(s, tcg_res[pass], rd, destelt + pass, MO_32); - tcg_temp_free_i32(tcg_res[pass]); - } - if (!is_q) { - clear_vec_high(s, rd); + read_vec_element_i32(s, tcg_res[pass], rn, srcelt + pass, MO_16); + gen_helper_vfp_fcvt_f16_to_f32(tcg_res[pass], tcg_res[pass], + cpu_env); + } + for (pass = 0; pass < 4; pass++) { + write_vec_element_i32(s, tcg_res[pass], rd, pass, MO_32); + tcg_temp_free_i32(tcg_res[pass]); + } } } @@ -8133,6 +8987,108 @@ static void handle_rev(DisasContext *s, int opcode, bool u, } } +static void handle_2misc_pairwise(DisasContext *s, int opcode, bool u, + bool is_q, int size, int rn, int rd) +{ + /* Implement the pairwise operations from 2-misc: + * SADDLP, UADDLP, SADALP, UADALP. + * These all add pairs of elements in the input to produce a + * double-width result element in the output (possibly accumulating). + */ + bool accum = (opcode == 0x6); + int maxpass = is_q ? 2 : 1; + int pass; + TCGv_i64 tcg_res[2]; + + if (size == 2) { + /* 32 + 32 -> 64 op */ + TCGMemOp memop = size + (u ? 
0 : MO_SIGN); + + for (pass = 0; pass < maxpass; pass++) { + TCGv_i64 tcg_op1 = tcg_temp_new_i64(); + TCGv_i64 tcg_op2 = tcg_temp_new_i64(); + + tcg_res[pass] = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op1, rn, pass * 2, memop); + read_vec_element(s, tcg_op2, rn, pass * 2 + 1, memop); + tcg_gen_add_i64(tcg_res[pass], tcg_op1, tcg_op2); + if (accum) { + read_vec_element(s, tcg_op1, rd, pass, MO_64); + tcg_gen_add_i64(tcg_res[pass], tcg_res[pass], tcg_op1); + } + + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + } + } else { + for (pass = 0; pass < maxpass; pass++) { + TCGv_i64 tcg_op = tcg_temp_new_i64(); + NeonGenOneOpFn *genfn; + static NeonGenOneOpFn * const fns[2][2] = { + { gen_helper_neon_addlp_s8, gen_helper_neon_addlp_u8 }, + { gen_helper_neon_addlp_s16, gen_helper_neon_addlp_u16 }, + }; + + genfn = fns[size][u]; + + tcg_res[pass] = tcg_temp_new_i64(); + + read_vec_element(s, tcg_op, rn, pass, MO_64); + genfn(tcg_res[pass], tcg_op); + + if (accum) { + read_vec_element(s, tcg_op, rd, pass, MO_64); + if (size == 0) { + gen_helper_neon_addl_u16(tcg_res[pass], + tcg_res[pass], tcg_op); + } else { + gen_helper_neon_addl_u32(tcg_res[pass], + tcg_res[pass], tcg_op); + } + } + tcg_temp_free_i64(tcg_op); + } + } + if (!is_q) { + tcg_res[1] = tcg_const_i64(0); + } + for (pass = 0; pass < 2; pass++) { + write_vec_element(s, tcg_res[pass], rd, pass, MO_64); + tcg_temp_free_i64(tcg_res[pass]); + } +} + +static void handle_shll(DisasContext *s, bool is_q, int size, int rn, int rd) +{ + /* Implement SHLL and SHLL2 */ + int pass; + int part = is_q ? 2 : 0; + TCGv_i64 tcg_res[2]; + + for (pass = 0; pass < 2; pass++) { + static NeonGenWidenFn * const widenfns[3] = { + gen_helper_neon_widen_u8, + gen_helper_neon_widen_u16, + tcg_gen_extu_i32_i64, + }; + NeonGenWidenFn *widenfn = widenfns[size]; + TCGv_i32 tcg_op = tcg_temp_new_i32(); + + read_vec_element_i32(s, tcg_op, rn, part + pass, MO_32); + tcg_res[pass] = tcg_temp_new_i64(); + widenfn(tcg_res[pass], tcg_op); + tcg_gen_shli_i64(tcg_res[pass], tcg_res[pass], 8 << size); + + tcg_temp_free_i32(tcg_op); + } + + for (pass = 0; pass < 2; pass++) { + write_vec_element(s, tcg_res[pass], rd, pass, MO_64); + tcg_temp_free_i64(tcg_res[pass]); + } +} + /* C3.6.17 AdvSIMD two reg misc * 31 30 29 28 24 23 22 21 17 16 12 11 10 9 5 4 0 * +---+---+---+-----------+------+-----------+--------+-----+------+------+ @@ -8147,6 +9103,11 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn) bool is_q = extract32(insn, 30, 1); int rn = extract32(insn, 5, 5); int rd = extract32(insn, 0, 5); + bool need_fpstatus = false; + bool need_rmode = false; + int rmode = -1; + TCGv_i32 tcg_rmode; + TCGv_ptr tcg_fpstatus; switch (opcode) { case 0x0: /* REV64, REV32 */ @@ -8173,23 +9134,28 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn) unallocated_encoding(s); return; } - handle_2misc_narrow(s, opcode, u, is_q, size, rn, rd); + handle_2misc_narrow(s, false, opcode, u, is_q, size, rn, rd); return; - case 0x2: /* SADDLP, UADDLP */ case 0x4: /* CLS, CLZ */ + if (size == 3) { + unallocated_encoding(s); + return; + } + break; + case 0x2: /* SADDLP, UADDLP */ case 0x6: /* SADALP, UADALP */ if (size == 3) { unallocated_encoding(s); return; } - unsupported_encoding(s, insn); + handle_2misc_pairwise(s, opcode, u, is_q, size, rn, rd); return; case 0x13: /* SHLL, SHLL2 */ if (u == 0 || size == 3) { unallocated_encoding(s); return; } - unsupported_encoding(s, insn); + handle_shll(s, is_q, size, rn, rd); return; case 0xa: /* CMLT */ if (u 
== 1) { @@ -8220,8 +9186,9 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn) /* Floating point: U, size[1] and opcode indicate operation; * size[0] indicates single or double precision. */ + int is_double = extract32(size, 0, 1); opcode |= (extract32(size, 1, 1) << 5) | (u << 6); - size = extract32(size, 0, 1) ? 3 : 2; + size = is_double ? 3 : 2; switch (opcode) { case 0x2f: /* FABS */ case 0x6f: /* FNEG */ @@ -8230,6 +9197,18 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn) return; } break; + case 0x1d: /* SCVTF */ + case 0x5d: /* UCVTF */ + { + bool is_signed = (opcode == 0x1d) ? true : false; + int elements = is_double ? 2 : is_q ? 4 : 2; + if (is_double && !is_q) { + unallocated_encoding(s); + return; + } + handle_simd_intfp_conv(s, rd, rn, elements, is_signed, 0, size); + return; + } case 0x2c: /* FCMGT (zero) */ case 0x2d: /* FCMEQ (zero) */ case 0x2e: /* FCMLT (zero) */ @@ -8241,35 +9220,98 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn) } handle_2misc_fcmp_zero(s, opcode, false, u, is_q, size, rn, rd); return; - case 0x16: /* FCVTN, FCVTN2 */ - case 0x17: /* FCVTL, FCVTL2 */ - case 0x18: /* FRINTN */ - case 0x19: /* FRINTM */ + case 0x7f: /* FSQRT */ + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + break; case 0x1a: /* FCVTNS */ case 0x1b: /* FCVTMS */ - case 0x1c: /* FCVTAS */ - case 0x1d: /* SCVTF */ - case 0x38: /* FRINTP */ - case 0x39: /* FRINTZ */ case 0x3a: /* FCVTPS */ case 0x3b: /* FCVTZS */ - case 0x3c: /* URECPE */ - case 0x3d: /* FRECPE */ - case 0x56: /* FCVTXN, FCVTXN2 */ - case 0x58: /* FRINTA */ - case 0x59: /* FRINTX */ case 0x5a: /* FCVTNU */ case 0x5b: /* FCVTMU */ - case 0x5c: /* FCVTAU */ - case 0x5d: /* UCVTF */ - case 0x79: /* FRINTI */ case 0x7a: /* FCVTPU */ case 0x7b: /* FCVTZU */ - case 0x7c: /* URSQRTE */ + need_fpstatus = true; + need_rmode = true; + rmode = extract32(opcode, 5, 1) | (extract32(opcode, 0, 1) << 1); + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + break; + case 0x5c: /* FCVTAU */ + case 0x1c: /* FCVTAS */ + need_fpstatus = true; + need_rmode = true; + rmode = FPROUNDING_TIEAWAY; + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + break; + case 0x3c: /* URECPE */ + if (size == 3) { + unallocated_encoding(s); + return; + } + /* fall through */ + case 0x3d: /* FRECPE */ case 0x7d: /* FRSQRTE */ - case 0x7f: /* FSQRT */ - unsupported_encoding(s, insn); + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + handle_2misc_reciprocal(s, opcode, false, u, is_q, size, rn, rd); + return; + case 0x56: /* FCVTXN, FCVTXN2 */ + if (size == 2) { + unallocated_encoding(s); + return; + } + /* fall through */ + case 0x16: /* FCVTN, FCVTN2 */ + /* handle_2misc_narrow does a 2*size -> size operation, but these + * instructions encode the source size rather than dest size. 
+ */ + handle_2misc_narrow(s, false, opcode, 0, is_q, size - 1, rn, rd); + return; + case 0x17: /* FCVTL, FCVTL2 */ + handle_2misc_widening(s, opcode, is_q, size, rn, rd); return; + case 0x18: /* FRINTN */ + case 0x19: /* FRINTM */ + case 0x38: /* FRINTP */ + case 0x39: /* FRINTZ */ + need_rmode = true; + rmode = extract32(opcode, 5, 1) | (extract32(opcode, 0, 1) << 1); + /* fall through */ + case 0x59: /* FRINTX */ + case 0x79: /* FRINTI */ + need_fpstatus = true; + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + break; + case 0x58: /* FRINTA */ + need_rmode = true; + rmode = FPROUNDING_TIEAWAY; + need_fpstatus = true; + if (size == 3 && !is_q) { + unallocated_encoding(s); + return; + } + break; + case 0x7c: /* URSQRTE */ + if (size == 3) { + unallocated_encoding(s); + return; + } + need_fpstatus = true; + break; default: unallocated_encoding(s); return; @@ -8281,6 +9323,18 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn) return; } + if (need_fpstatus) { + tcg_fpstatus = get_fpstatus_ptr(); + } else { + TCGV_UNUSED_PTR(tcg_fpstatus); + } + if (need_rmode) { + tcg_rmode = tcg_const_i32(arm_rmode_to_sf(rmode)); + gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env); + } else { + TCGV_UNUSED_I32(tcg_rmode); + } + if (size == 3) { /* All 64-bit element operations can be shared with scalar 2misc */ int pass; @@ -8291,7 +9345,8 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn) read_vec_element(s, tcg_op, rn, pass, MO_64); - handle_2misc_64(s, opcode, u, tcg_res, tcg_op); + handle_2misc_64(s, opcode, u, tcg_res, tcg_op, + tcg_rmode, tcg_fpstatus); write_vec_element(s, tcg_res, rd, pass, MO_64); @@ -8327,6 +9382,13 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn) case 0x9: /* CMEQ, CMLE */ cond = u ? 
TCG_COND_LE : TCG_COND_EQ; goto do_cmop; + case 0x4: /* CLS */ + if (u) { + gen_helper_clz32(tcg_res, tcg_op); + } else { + gen_helper_cls32(tcg_res, tcg_op); + } + break; case 0xb: /* ABS, NEG */ if (u) { tcg_gen_neg_i32(tcg_res, tcg_op); @@ -8344,6 +9406,47 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn) case 0x6f: /* FNEG */ gen_helper_vfp_negs(tcg_res, tcg_op); break; + case 0x7f: /* FSQRT */ + gen_helper_vfp_sqrts(tcg_res, tcg_op, cpu_env); + break; + case 0x1a: /* FCVTNS */ + case 0x1b: /* FCVTMS */ + case 0x1c: /* FCVTAS */ + case 0x3a: /* FCVTPS */ + case 0x3b: /* FCVTZS */ + { + TCGv_i32 tcg_shift = tcg_const_i32(0); + gen_helper_vfp_tosls(tcg_res, tcg_op, + tcg_shift, tcg_fpstatus); + tcg_temp_free_i32(tcg_shift); + break; + } + case 0x5a: /* FCVTNU */ + case 0x5b: /* FCVTMU */ + case 0x5c: /* FCVTAU */ + case 0x7a: /* FCVTPU */ + case 0x7b: /* FCVTZU */ + { + TCGv_i32 tcg_shift = tcg_const_i32(0); + gen_helper_vfp_touls(tcg_res, tcg_op, + tcg_shift, tcg_fpstatus); + tcg_temp_free_i32(tcg_shift); + break; + } + case 0x18: /* FRINTN */ + case 0x19: /* FRINTM */ + case 0x38: /* FRINTP */ + case 0x39: /* FRINTZ */ + case 0x58: /* FRINTA */ + case 0x79: /* FRINTI */ + gen_helper_rints(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x59: /* FRINTX */ + gen_helper_rints_exact(tcg_res, tcg_op, tcg_fpstatus); + break; + case 0x7c: /* URSQRTE */ + gen_helper_rsqrte_u32(tcg_res, tcg_op, tcg_fpstatus); + break; default: g_assert_not_reached(); } @@ -8407,6 +9510,21 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn) } } break; + case 0x4: /* CLS, CLZ */ + if (u) { + if (size == 0) { + gen_helper_neon_clz_u8(tcg_res, tcg_op); + } else { + gen_helper_neon_clz_u16(tcg_res, tcg_op); + } + } else { + if (size == 0) { + gen_helper_neon_cls_s8(tcg_res, tcg_op); + } else { + gen_helper_neon_cls_s16(tcg_res, tcg_op); + } + } + break; default: g_assert_not_reached(); } @@ -8421,6 +9539,14 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn) if (!is_q) { clear_vec_high(s, rd); } + + if (need_rmode) { + gen_helper_set_rmode(tcg_rmode, tcg_rmode, cpu_env); + tcg_temp_free_i32(tcg_rmode); + } + if (need_fpstatus) { + tcg_temp_free_ptr(tcg_fpstatus); + } } /* C3.6.13 AdvSIMD scalar x indexed element @@ -9045,6 +10171,7 @@ void gen_intermediate_code_internal_a64(ARMCPU *cpu, dc->vec_stride = 0; dc->cp_regs = cpu->cp_regs; dc->current_pl = arm_current_pl(env); + dc->features = env->features; init_tmp_a64_array(dc); diff --git a/target-arm/translate.c b/target-arm/translate.c index fbe513b40d..56e3b4bf7f 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -6682,17 +6682,33 @@ static int disas_neon_data_insn(CPUARMState * env, DisasContext *s, uint32_t ins break; } case NEON_2RM_VRECPE: - gen_helper_recpe_u32(tmp, tmp, cpu_env); + { + TCGv_ptr fpstatus = get_fpstatus_ptr(1); + gen_helper_recpe_u32(tmp, tmp, fpstatus); + tcg_temp_free_ptr(fpstatus); break; + } case NEON_2RM_VRSQRTE: - gen_helper_rsqrte_u32(tmp, tmp, cpu_env); + { + TCGv_ptr fpstatus = get_fpstatus_ptr(1); + gen_helper_rsqrte_u32(tmp, tmp, fpstatus); + tcg_temp_free_ptr(fpstatus); break; + } case NEON_2RM_VRECPE_F: - gen_helper_recpe_f32(cpu_F0s, cpu_F0s, cpu_env); + { + TCGv_ptr fpstatus = get_fpstatus_ptr(1); + gen_helper_recpe_f32(cpu_F0s, cpu_F0s, fpstatus); + tcg_temp_free_ptr(fpstatus); break; + } case NEON_2RM_VRSQRTE_F: - gen_helper_rsqrte_f32(cpu_F0s, cpu_F0s, cpu_env); + { + TCGv_ptr fpstatus = get_fpstatus_ptr(1); + gen_helper_rsqrte_f32(cpu_F0s, cpu_F0s, 
fpstatus); + tcg_temp_free_ptr(fpstatus); break; + } case NEON_2RM_VCVT_FS: /* VCVT.F32.S32 */ gen_vfp_sito(0, 1); break; @@ -10654,6 +10670,7 @@ static inline void gen_intermediate_code_internal(ARMCPU *cpu, dc->vec_stride = ARM_TBFLAG_VECSTRIDE(tb->flags); dc->cp_regs = cpu->cp_regs; dc->current_pl = arm_current_pl(env); + dc->features = env->features; cpu_F0s = tcg_temp_new_i32(); cpu_F1s = tcg_temp_new_i32(); diff --git a/target-arm/translate.h b/target-arm/translate.h index 2f491f9ff6..3525ffcecb 100644 --- a/target-arm/translate.h +++ b/target-arm/translate.h @@ -26,6 +26,7 @@ typedef struct DisasContext { int aarch64; int current_pl; GHashTable *cp_regs; + uint64_t features; /* CPU features bits */ #define TMP_A64_MAX 16 int tmp_a64_count; TCGv_i64 tmp_a64[TMP_A64_MAX]; @@ -33,6 +34,11 @@ typedef struct DisasContext { extern TCGv_ptr cpu_env; +static inline int arm_dc_feature(DisasContext *dc, int feature) +{ + return (dc->features & (1ULL << feature)) != 0; +} + /* target-specific extra values for is_jmp */ /* These instructions trap after executing, so the A32/T32 decoder must * defer them until after the conditional execution state has been updated.
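
The FCVT[NMPZ]* cases above derive the FP rounding mode straight from two opcode bits: rmode = extract32(opcode, 5, 1) | (extract32(opcode, 0, 1) << 1). Assuming QEMU's FPROUNDING_* constants follow the ARM FPCR.RMode encoding (0 = ties-to-even, 1 = towards +inf, 2 = towards -inf, 3 = towards zero), the mapping works out as in this standalone sketch; extract32 is re-implemented locally, and the opcode table and enum names are illustrative assumptions, not taken from the patch:

#include <stdint.h>
#include <stdio.h>

/* Local stand-in for QEMU's extract32(): return `length` bits of
 * `value` starting at bit `start`.  Valid for 1 <= length <= 32.
 */
static uint32_t extract32(uint32_t value, int start, int length)
{
    return (value >> start) & (~0U >> (32 - length));
}

int main(void)
{
    /* Opcode values after the decoder has folded size[1] into bit 5. */
    static const struct { uint32_t opcode; const char *name; } ops[] = {
        { 0x1a, "FCVTNS" }, { 0x1b, "FCVTMS" },
        { 0x3a, "FCVTPS" }, { 0x3b, "FCVTZS" },
    };
    static const char *const rmodes[] = {
        "TIEEVEN", "POSINF", "NEGINF", "ZERO"
    };

    for (unsigned i = 0; i < sizeof(ops) / sizeof(ops[0]); i++) {
        int rmode = extract32(ops[i].opcode, 5, 1)
                    | (extract32(ops[i].opcode, 0, 1) << 1);
        printf("%s -> FPROUNDING_%s\n", ops[i].name, rmodes[rmode]);
    }
    return 0;
}

Under those assumptions this prints FCVTNS -> FPROUNDING_TIEEVEN, FCVTMS -> FPROUNDING_NEGINF, FCVTPS -> FPROUNDING_POSINF and FCVTZS -> FPROUNDING_ZERO, matching the per-case comments in the decoder.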
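
The FCVTXN comment above describes "von Neumann rounding (round to odd)". The body of gen_helper_fcvtx_f64_to_f32 is not part of this hunk, but the rounding rule itself is simple: truncate, then force the result's least significant bit to 1 whenever any discarded bit was set, which preserves enough information to avoid double-rounding errors when the value is narrowed again later. A minimal integer-level sketch of the rule (editor's illustration, not QEMU code):

#include <stdint.h>

/* Right shift with round-to-odd: OR the "sticky" information from the
 * discarded low bits into the least significant result bit.
 * Assumes 0 < n < 64 so the shifts are well defined.
 */
static uint64_t shr_round_to_odd(uint64_t x, unsigned n)
{
    uint64_t discarded = x & ((1ULL << n) - 1);

    return (x >> n) | (discarded != 0);
}

For example, shr_round_to_odd(0x21, 4) yields 0x3, whereas plain truncation would give 0x2 and lose the fact that low bits were dropped.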
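
SRI, newly decoded via the insert flag in handle_vec_simd_shri, differs from a plain right shift in that the bits vacated at the top of each destination element keep their previous values; that is why the loop reads the destination element even when not accumulating. Per 64-bit element the operation amounts to the following sketch (the function name is the editor's; handle_shri_with_ins itself is not shown in this hunk):

#include <stdint.h>

/* Shift-Right-and-Insert on one 64-bit element: the top `shift` bits
 * of the destination are preserved; the rest comes from rn >> shift.
 * The immediate encoding gives 1 <= shift <= 64.
 */
static uint64_t sri64(uint64_t rd, uint64_t rn, unsigned shift)
{
    if (shift == 64) {
        return rd;                      /* all source bits shifted out */
    }
    uint64_t mask = ~0ULL >> shift;     /* bits supplied by the shift */

    return (rd & ~mask) | ((rn >> shift) & mask);
}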
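
handle_vec_simd_shrn implements SHRN/RSHRN, where the rounded variant adds a round constant of 1 << (shift - 1) before shifting and the deposit then keeps only the low esize bits of each result. Treating the input as unsigned, as the plain (non-saturating) narrow does, one 16-to-8-bit element behaves roughly like this sketch:

#include <stdint.h>

/* RSHRN-style rounding narrow of one 16-bit element to 8 bits:
 * add the rounding constant, shift, keep the low byte.
 * Assumes 1 <= shift <= 8 as the immediate encoding guarantees.
 */
static uint8_t rshrn_16to8(uint16_t x, unsigned shift)
{
    uint32_t rounded = (uint32_t)x + (1u << (shift - 1));

    return (uint8_t)(rounded >> shift);
}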
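
handle_2misc_pairwise sums adjacent input elements into double-width results, optionally accumulating into the destination for SADALP/UADALP. For the unsigned byte case, one 64-bit chunk behaves like the sketch below (the signed SADDLP variant would sign-extend int8_t inputs instead):

#include <stdint.h>

/* UADDLP on one 64-bit chunk: eight u8 inputs -> four u16 pair sums. */
static void uaddlp_u8(const uint8_t src[8], uint16_t dst[4])
{
    for (int i = 0; i < 4; i++) {
        dst[i] = (uint16_t)src[2 * i] + src[2 * i + 1];
    }
}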
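
Finally, handle_pmull_64's comment notes that 64x64 PMULL is the only three-reg-diff instruction with a genuinely 128-bit result, so it is computed as two 64-bit helper calls; the underlying operation, gated on ARM_FEATURE_V8_AES in the decoder, is a carry-less (polynomial, GF(2)) multiply in which partial products are combined with XOR instead of addition. An 8x8-bit sketch of the idea:

#include <stdint.h>

/* Carry-less multiply of two 8-bit polynomials over GF(2):
 * XOR a shifted copy of `a` for each set bit of `b`.
 */
static uint16_t pmull8(uint8_t a, uint8_t b)
{
    uint16_t result = 0;

    for (int i = 0; i < 8; i++) {
        if (b & (1u << i)) {
            result ^= (uint16_t)a << i;
        }
    }
    return result;
}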