author     Peter Maydell <peter.maydell@linaro.org>   2018-03-02 14:37:10 +0000
committer  Peter Maydell <peter.maydell@linaro.org>   2018-03-02 14:37:10 +0000
commit     86f4c7e05b1c44dbe1b329a51f311f10aef6ff34 (patch)
tree       6073147f05719812e5ecb14ffd6994a66fed9a7f /target
parent     2e7b766594e17f786a6b2e5be690bc5b43ce6036 (diff)
parent     e66a67bf28e1b4fce2e3d72a2610dbd48d9d3078 (diff)
Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20180302' into staging
target-arm queue:
* implement FCMA and RDM v8.1 and v8.3 instructions
* enable Cortex-M33 v8M core, and provide new mps2-an505 board model
that uses it
* decodetree: Propagate return value from translate subroutines
* xlnx-zynqmp: Implement the RTC device
# gpg: Signature made Fri 02 Mar 2018 11:05:40 GMT
# gpg: using RSA key 3C2525ED14360CDE
# gpg: Good signature from "Peter Maydell <peter.maydell@linaro.org>"
# gpg: aka "Peter Maydell <pmaydell@gmail.com>"
# gpg: aka "Peter Maydell <pmaydell@chiark.greenend.org.uk>"
# Primary key fingerprint: E1A5 C593 CD41 9DE2 8E83 15CF 3C25 25ED 1436 0CDE
* remotes/pmaydell/tags/pull-target-arm-20180302: (39 commits)
target/arm: Enable ARM_FEATURE_V8_FCMA
target/arm: Decode t32 simd 3reg and 2reg_scalar extension
target/arm: Decode aa32 armv8.3 2-reg-index
target/arm: Decode aa32 armv8.3 3-same
target/arm: Decode aa64 armv8.3 fcmla
target/arm: Decode aa64 armv8.3 fcadd
target/arm: Add ARM_FEATURE_V8_FCMA
target/arm: Enable ARM_FEATURE_V8_RDM
target/arm: Decode aa32 armv8.1 two reg and a scalar
target/arm: Decode aa32 armv8.1 three same
target/arm: Decode aa64 armv8.1 scalar/vector x indexed element
target/arm: Decode aa64 armv8.1 three same extra
target/arm: Decode aa64 armv8.1 scalar three same extra
target/arm: Refactor disas_simd_indexed size checks
target/arm: Refactor disas_simd_indexed decode
target/arm: Add ARM_FEATURE_V8_RDM
mps2-an505: New board model: MPS2 with AN505 Cortex-M33 FPGA image
hw/arm/iotkit: Model Arm IOT Kit
hw/misc/iotkit-secctl: Add remaining simple registers
hw/misc/iotkit-secctl: Add handling for PPCs
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
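
A note for readers of the RDM patches below: the core of SQRDMLAH is a saturating rounding doubling multiply-accumulate that returns the high half of the result. What follows is a minimal standalone sketch of the 16-bit element operation, using the same algebraic folding as the new inl_qrdmlah_s16() in target/arm/vec_helper.c; the function name and the plain bool flag standing in for FPSCR.QC are illustrative, not part of the patch.

#include <stdint.h>
#include <stdbool.h>

/* SQRDMLAH on one 16-bit element.  The architectural operation
 *   sat16(((a << 16) + 2 * (n * m) + (1 << 15)) >> 16)
 * is folded into a single shift, exactly as the patch does:
 *   sat16(((a << 15) + (n * m) + (1 << 14)) >> 15)
 */
static int16_t qrdmlah16(int16_t n, int16_t m, int16_t a, bool *qc)
{
    int32_t ret = (int32_t)n * m;

    ret = ((int32_t)a << 15) + ret + (1 << 14);
    ret >>= 15;
    if (ret != (int16_t)ret) {
        *qc = true;                          /* saturated: set QC */
        ret = ret < 0 ? INT16_MIN : INT16_MAX;
    }
    return (int16_t)ret;
}

SQRDMLSH is identical except that the product is subtracted rather than added, as in the patch's inl_qrdmlsh_s16().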
Diffstat (limited to 'target')
-rw-r--r--  target/arm/Makefile.objs    |    2
-rw-r--r--  target/arm/cpu.c            |   66
-rw-r--r--  target/arm/cpu.h            |    8
-rw-r--r--  target/arm/cpu64.c          |    2
-rw-r--r--  target/arm/helper.c         |   28
-rw-r--r--  target/arm/helper.h         |   31
-rw-r--r--  target/arm/idau.h           |   61
-rw-r--r--  target/arm/translate-a64.c  |  500
-rw-r--r--  target/arm/translate.c      |  271
-rw-r--r--  target/arm/vec_helper.c     |  429
10 files changed, 1274 insertions(+), 124 deletions(-)
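
Likewise for the FCMA patches: FCADD treats each vector as a sequence of (real, imaginary) pairs and adds one operand to the other rotated by 90 or 270 degrees in the complex plane; the new helpers do the negation branch-free by XORing the sign bit. Here is a sketch of the per-pair arithmetic, written with plain C floats rather than the helpers' softfloat calls (so rounding behaviour differs from the emulated FPU) and with illustrative names:

/* One complex pair of FCADD: d = n + rot(m), rot in {90, 270} degrees.
 * Mirrors gvec_fcadds below, where neg_imag == !neg_real.
 */
static void fcadd_pair(float n_re, float n_im,
                       float m_re, float m_im,
                       int rot, float *d_re, float *d_im)
{
    if (rot == 90) {            /* rotate m: (re, im) -> (-im, re) */
        *d_re = n_re - m_im;
        *d_im = n_im + m_re;
    } else {                    /* rot == 270: (re, im) -> (im, -re) */
        *d_re = n_re + m_im;
        *d_im = n_im - m_re;
    }
}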
diff --git a/target/arm/Makefile.objs b/target/arm/Makefile.objs index 847fb52ee0..1297bead5f 100644 --- a/target/arm/Makefile.objs +++ b/target/arm/Makefile.objs @@ -5,7 +5,7 @@ obj-$(call land,$(CONFIG_KVM),$(call lnot,$(TARGET_AARCH64))) += kvm32.o obj-$(call land,$(CONFIG_KVM),$(TARGET_AARCH64)) += kvm64.o obj-$(call lnot,$(CONFIG_KVM)) += kvm-stub.o obj-y += translate.o op_helper.o helper.o cpu.o -obj-y += neon_helper.o iwmmxt_helper.o +obj-y += neon_helper.o iwmmxt_helper.o vec_helper.o obj-y += gdbstub.o obj-$(TARGET_AARCH64) += cpu64.o translate-a64.o helper-a64.o gdbstub64.o obj-y += crypto_helper.o diff --git a/target/arm/cpu.c b/target/arm/cpu.c index 1b3ae62db6..6b77aaa445 100644 --- a/target/arm/cpu.c +++ b/target/arm/cpu.c @@ -19,6 +19,7 @@ */ #include "qemu/osdep.h" +#include "target/arm/idau.h" #include "qemu/error-report.h" #include "qapi/error.h" #include "cpu.h" @@ -186,6 +187,7 @@ static void arm_cpu_reset(CPUState *s) uint32_t initial_msp; /* Loaded from 0x0 */ uint32_t initial_pc; /* Loaded from 0x4 */ uint8_t *rom; + uint32_t vecbase; if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { env->v7m.secure = true; @@ -213,8 +215,11 @@ static void arm_cpu_reset(CPUState *s) /* Unlike A/R profile, M profile defines the reset LR value */ env->regs[14] = 0xffffffff; - /* Load the initial SP and PC from the vector table at address 0 */ - rom = rom_ptr(0); + env->v7m.vecbase[M_REG_S] = cpu->init_svtor & 0xffffff80; + + /* Load the initial SP and PC from offset 0 and 4 in the vector table */ + vecbase = env->v7m.vecbase[env->v7m.secure]; + rom = rom_ptr(vecbase); if (rom) { /* Address zero is covered by ROM which hasn't yet been * copied into physical memory. @@ -227,8 +232,8 @@ static void arm_cpu_reset(CPUState *s) * it got copied into memory. In the latter case, rom_ptr * will return a NULL pointer and we should use ldl_phys instead. 
*/ - initial_msp = ldl_phys(s->as, 0); - initial_pc = ldl_phys(s->as, 4); + initial_msp = ldl_phys(s->as, vecbase); + initial_pc = ldl_phys(s->as, vecbase + 4); } env->regs[13] = initial_msp & 0xFFFFFFFC; @@ -623,6 +628,10 @@ static Property arm_cpu_pmsav7_dregion_property = pmsav7_dregion, qdev_prop_uint32, uint32_t); +/* M profile: initial value of the Secure VTOR */ +static Property arm_cpu_initsvtor_property = + DEFINE_PROP_UINT32("init-svtor", ARMCPU, init_svtor, 0); + static void arm_cpu_post_init(Object *obj) { ARMCPU *cpu = ARM_CPU(obj); @@ -688,6 +697,15 @@ static void arm_cpu_post_init(Object *obj) } } + if (arm_feature(&cpu->env, ARM_FEATURE_M_SECURITY)) { + object_property_add_link(obj, "idau", TYPE_IDAU_INTERFACE, &cpu->idau, + qdev_prop_allow_set_link_before_realize, + OBJ_PROP_LINK_UNREF_ON_RELEASE, + &error_abort); + qdev_property_add_static(DEVICE(obj), &arm_cpu_initsvtor_property, + &error_abort); + } + qdev_property_add_static(DEVICE(obj), &arm_cpu_cfgend_property, &error_abort); } @@ -1188,6 +1206,35 @@ static void cortex_m4_initfn(Object *obj) cpu->id_isar5 = 0x00000000; } +static void cortex_m33_initfn(Object *obj) +{ + ARMCPU *cpu = ARM_CPU(obj); + + set_feature(&cpu->env, ARM_FEATURE_V8); + set_feature(&cpu->env, ARM_FEATURE_M); + set_feature(&cpu->env, ARM_FEATURE_M_SECURITY); + set_feature(&cpu->env, ARM_FEATURE_THUMB_DSP); + cpu->midr = 0x410fd213; /* r0p3 */ + cpu->pmsav7_dregion = 16; + cpu->sau_sregion = 8; + cpu->id_pfr0 = 0x00000030; + cpu->id_pfr1 = 0x00000210; + cpu->id_dfr0 = 0x00200000; + cpu->id_afr0 = 0x00000000; + cpu->id_mmfr0 = 0x00101F40; + cpu->id_mmfr1 = 0x00000000; + cpu->id_mmfr2 = 0x01000000; + cpu->id_mmfr3 = 0x00000000; + cpu->id_isar0 = 0x01101110; + cpu->id_isar1 = 0x02212000; + cpu->id_isar2 = 0x20232232; + cpu->id_isar3 = 0x01111131; + cpu->id_isar4 = 0x01310132; + cpu->id_isar5 = 0x00000000; + cpu->clidr = 0x00000000; + cpu->ctr = 0x8000c000; +} + static void arm_v7m_class_init(ObjectClass *oc, void *data) { CPUClass *cc = CPU_CLASS(oc); @@ -1650,6 +1697,8 @@ static void arm_any_initfn(Object *obj) set_feature(&cpu->env, ARM_FEATURE_V8_SHA256); set_feature(&cpu->env, ARM_FEATURE_V8_PMULL); set_feature(&cpu->env, ARM_FEATURE_CRC); + set_feature(&cpu->env, ARM_FEATURE_V8_RDM); + set_feature(&cpu->env, ARM_FEATURE_V8_FCMA); cpu->midr = 0xffffffff; } #endif @@ -1679,6 +1728,8 @@ static const ARMCPUInfo arm_cpus[] = { .class_init = arm_v7m_class_init }, { .name = "cortex-m4", .initfn = cortex_m4_initfn, .class_init = arm_v7m_class_init }, + { .name = "cortex-m33", .initfn = cortex_m33_initfn, + .class_init = arm_v7m_class_init }, { .name = "cortex-r5", .initfn = cortex_r5_initfn }, { .name = "cortex-a7", .initfn = cortex_a7_initfn }, { .name = "cortex-a8", .initfn = cortex_a8_initfn }, @@ -1821,11 +1872,18 @@ static const TypeInfo arm_cpu_type_info = { .class_init = arm_cpu_class_init, }; +static const TypeInfo idau_interface_type_info = { + .name = TYPE_IDAU_INTERFACE, + .parent = TYPE_INTERFACE, + .class_size = sizeof(IDAUInterfaceClass), +}; + static void arm_cpu_register_types(void) { const ARMCPUInfo *info = arm_cpus; type_register_static(&arm_cpu_type_info); + type_register_static(&idau_interface_type_info); while (info->name) { cpu_register(info); diff --git a/target/arm/cpu.h b/target/arm/cpu.h index 2b9740878b..8dd6b788df 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -694,6 +694,9 @@ struct ARMCPU { /* MemoryRegion to use for secure physical accesses */ MemoryRegion *secure_memory; + /* For v8M, pointer to the IDAU 
interface provided by board/SoC */ + Object *idau; + /* 'compatible' string for this CPU for Linux device trees */ const char *dtb_compatible; @@ -728,6 +731,9 @@ struct ARMCPU { */ uint32_t psci_conduit; + /* For v8M, initial value of the Secure VTOR */ + uint32_t init_svtor; + /* [QEMU_]KVM_ARM_TARGET_* constant for this CPU, or * QEMU_KVM_ARM_TARGET_NONE if the kernel doesn't support this CPU type. */ @@ -1427,7 +1433,9 @@ enum arm_features { ARM_FEATURE_V8_SHA3, /* implements SHA3 part of v8 Crypto Extensions */ ARM_FEATURE_V8_SM3, /* implements SM3 part of v8 Crypto Extensions */ ARM_FEATURE_V8_SM4, /* implements SM4 part of v8 Crypto Extensions */ + ARM_FEATURE_V8_RDM, /* implements v8.1 simd round multiply */ ARM_FEATURE_V8_FP16, /* implements v8.2 half-precision float */ + ARM_FEATURE_V8_FCMA, /* has complex number part of v8.3 extensions. */ }; static inline int arm_feature(CPUARMState *env, int feature) diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c index 9743bdc8c3..4228713b19 100644 --- a/target/arm/cpu64.c +++ b/target/arm/cpu64.c @@ -230,7 +230,9 @@ static void aarch64_any_initfn(Object *obj) set_feature(&cpu->env, ARM_FEATURE_V8_SM4); set_feature(&cpu->env, ARM_FEATURE_V8_PMULL); set_feature(&cpu->env, ARM_FEATURE_CRC); + set_feature(&cpu->env, ARM_FEATURE_V8_RDM); set_feature(&cpu->env, ARM_FEATURE_V8_FP16); + set_feature(&cpu->env, ARM_FEATURE_V8_FCMA); cpu->ctr = 0x80038003; /* 32 byte I and D cacheline size, VIPT icache */ cpu->dcz_blocksize = 7; /* 512 bytes */ } diff --git a/target/arm/helper.c b/target/arm/helper.c index c82f63d440..09893e3f72 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -1,4 +1,5 @@ #include "qemu/osdep.h" +#include "target/arm/idau.h" #include "trace.h" #include "cpu.h" #include "internals.h" @@ -9741,19 +9742,32 @@ static void v8m_security_lookup(CPUARMState *env, uint32_t address, */ ARMCPU *cpu = arm_env_get_cpu(env); int r; + bool idau_exempt = false, idau_ns = true, idau_nsc = true; + int idau_region = IREGION_NOTVALID; - /* TODO: implement IDAU */ + if (cpu->idau) { + IDAUInterfaceClass *iic = IDAU_INTERFACE_GET_CLASS(cpu->idau); + IDAUInterface *ii = IDAU_INTERFACE(cpu->idau); + + iic->check(ii, address, &idau_region, &idau_exempt, &idau_ns, + &idau_nsc); + } if (access_type == MMU_INST_FETCH && extract32(address, 28, 4) == 0xf) { /* 0xf0000000..0xffffffff is always S for insn fetches */ return; } - if (v8m_is_sau_exempt(env, address, access_type)) { + if (idau_exempt || v8m_is_sau_exempt(env, address, access_type)) { sattrs->ns = !regime_is_secure(env, mmu_idx); return; } + if (idau_region != IREGION_NOTVALID) { + sattrs->irvalid = true; + sattrs->iregion = idau_region; + } + switch (env->sau.ctrl & 3) { case 0: /* SAU.ENABLE == 0, SAU.ALLNS == 0 */ break; @@ -9790,7 +9804,15 @@ static void v8m_security_lookup(CPUARMState *env, uint32_t address, } } - /* TODO when we support the IDAU then it may override the result here */ + /* The IDAU will override the SAU lookup results if it specifies + * higher security than the SAU does. 
+ */ + if (!idau_ns) { + if (sattrs->ns || (!idau_nsc && sattrs->nsc)) { + sattrs->ns = false; + sattrs->nsc = idau_nsc; + } + } break; } } diff --git a/target/arm/helper.h b/target/arm/helper.h index 6dd8504ec3..0d2094f2be 100644 --- a/target/arm/helper.h +++ b/target/arm/helper.h @@ -365,8 +365,12 @@ DEF_HELPER_FLAGS_1(neon_rbit_u8, TCG_CALL_NO_RWG_SE, i32, i32) DEF_HELPER_3(neon_qdmulh_s16, i32, env, i32, i32) DEF_HELPER_3(neon_qrdmulh_s16, i32, env, i32, i32) +DEF_HELPER_4(neon_qrdmlah_s16, i32, env, i32, i32, i32) +DEF_HELPER_4(neon_qrdmlsh_s16, i32, env, i32, i32, i32) DEF_HELPER_3(neon_qdmulh_s32, i32, env, i32, i32) DEF_HELPER_3(neon_qrdmulh_s32, i32, env, i32, i32) +DEF_HELPER_4(neon_qrdmlah_s32, i32, env, s32, s32, s32) +DEF_HELPER_4(neon_qrdmlsh_s32, i32, env, s32, s32, s32) DEF_HELPER_1(neon_narrow_u8, i32, i64) DEF_HELPER_1(neon_narrow_u16, i32, i64) @@ -565,6 +569,33 @@ DEF_HELPER_2(dc_zva, void, env, i64) DEF_HELPER_FLAGS_2(neon_pmull_64_lo, TCG_CALL_NO_RWG_SE, i64, i64, i64) DEF_HELPER_FLAGS_2(neon_pmull_64_hi, TCG_CALL_NO_RWG_SE, i64, i64, i64) +DEF_HELPER_FLAGS_5(gvec_qrdmlah_s16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_qrdmlsh_s16, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_qrdmlah_s32, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_qrdmlsh_s32, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(gvec_fcaddh, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fcadds, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fcaddd, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_5(gvec_fcmlah, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fcmlah_idx, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fcmlas, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fcmlas_idx, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_5(gvec_fcmlad, TCG_CALL_NO_RWG, + void, ptr, ptr, ptr, ptr, i32) + #ifdef TARGET_AARCH64 #include "helper-a64.h" #endif diff --git a/target/arm/idau.h b/target/arm/idau.h new file mode 100644 index 0000000000..cac27b95fa --- /dev/null +++ b/target/arm/idau.h @@ -0,0 +1,61 @@ +/* + * QEMU ARM CPU -- interface for the Arm v8M IDAU + * + * Copyright (c) 2018 Linaro Ltd + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see + * <http://www.gnu.org/licenses/gpl-2.0.html> + * + * In the v8M architecture, the IDAU is a small piece of hardware + * typically implemented in the SoC which provides board or SoC + * specific security attribution information for each address that + * the CPU performs MPU/SAU checks on. For QEMU, we model this with a + * QOM interface which is implemented by the board or SoC object and + * connected to the CPU using a link property. 
+ */ + +#ifndef TARGET_ARM_IDAU_H +#define TARGET_ARM_IDAU_H + +#include "qom/object.h" + +#define TYPE_IDAU_INTERFACE "idau-interface" +#define IDAU_INTERFACE(obj) \ + INTERFACE_CHECK(IDAUInterface, (obj), TYPE_IDAU_INTERFACE) +#define IDAU_INTERFACE_CLASS(class) \ + OBJECT_CLASS_CHECK(IDAUInterfaceClass, (class), TYPE_IDAU_INTERFACE) +#define IDAU_INTERFACE_GET_CLASS(obj) \ + OBJECT_GET_CLASS(IDAUInterfaceClass, (obj), TYPE_IDAU_INTERFACE) + +typedef struct IDAUInterface { + Object parent; +} IDAUInterface; + +#define IREGION_NOTVALID -1 + +typedef struct IDAUInterfaceClass { + InterfaceClass parent; + + /* Check the specified address and return the IDAU security information + * for it by filling in iregion, exempt, ns and nsc: + * iregion: IDAU region number, or IREGION_NOTVALID if not valid + * exempt: true if address is exempt from security attribution + * ns: true if the address is NonSecure + * nsc: true if the address is NonSecure-callable + */ + void (*check)(IDAUInterface *ii, uint32_t address, int *iregion, + bool *exempt, bool *ns, bool *nsc); +} IDAUInterfaceClass; + +#endif diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c index 32811dc8b0..31ff0479e6 100644 --- a/target/arm/translate-a64.c +++ b/target/arm/translate-a64.c @@ -701,6 +701,33 @@ static void gen_gvec_op3(DisasContext *s, bool is_q, int rd, vec_full_reg_size(s), gvec_op); } +/* Expand a 3-operand + env pointer operation using + * an out-of-line helper. + */ +static void gen_gvec_op3_env(DisasContext *s, bool is_q, int rd, + int rn, int rm, gen_helper_gvec_3_ptr *fn) +{ + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), cpu_env, + is_q ? 16 : 8, vec_full_reg_size(s), 0, fn); +} + +/* Expand a 3-operand + fpstatus pointer + simd data value operation using + * an out-of-line helper. + */ +static void gen_gvec_op3_fpst(DisasContext *s, bool is_q, int rd, int rn, + int rm, bool is_fp16, int data, + gen_helper_gvec_3_ptr *fn) +{ + TCGv_ptr fpst = get_fpstatus_ptr(is_fp16); + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), fpst, + is_q ? 16 : 8, vec_full_reg_size(s), data, fn); + tcg_temp_free_ptr(fpst); +} + /* Set ZF and NF based on a 64 bit result. This is alas fiddlier * than the 32 bit equivalent. 
*/ @@ -7971,6 +7998,89 @@ static void disas_simd_scalar_three_reg_same_fp16(DisasContext *s, tcg_temp_free_ptr(fpst); } +/* AdvSIMD scalar three same extra + * 31 30 29 28 24 23 22 21 20 16 15 14 11 10 9 5 4 0 + * +-----+---+-----------+------+---+------+---+--------+---+----+----+ + * | 0 1 | U | 1 1 1 1 0 | size | 0 | Rm | 1 | opcode | 1 | Rn | Rd | + * +-----+---+-----------+------+---+------+---+--------+---+----+----+ + */ +static void disas_simd_scalar_three_reg_same_extra(DisasContext *s, + uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int opcode = extract32(insn, 11, 4); + int rm = extract32(insn, 16, 5); + int size = extract32(insn, 22, 2); + bool u = extract32(insn, 29, 1); + TCGv_i32 ele1, ele2, ele3; + TCGv_i64 res; + int feature; + + switch (u * 16 + opcode) { + case 0x10: /* SQRDMLAH (vector) */ + case 0x11: /* SQRDMLSH (vector) */ + if (size != 1 && size != 2) { + unallocated_encoding(s); + return; + } + feature = ARM_FEATURE_V8_RDM; + break; + default: + unallocated_encoding(s); + return; + } + if (!arm_dc_feature(s, feature)) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + + /* Do a single operation on the lowest element in the vector. + * We use the standard Neon helpers and rely on 0 OP 0 == 0 + * with no side effects for all these operations. + * OPTME: special-purpose helpers would avoid doing some + * unnecessary work in the helper for the 16 bit cases. + */ + ele1 = tcg_temp_new_i32(); + ele2 = tcg_temp_new_i32(); + ele3 = tcg_temp_new_i32(); + + read_vec_element_i32(s, ele1, rn, 0, size); + read_vec_element_i32(s, ele2, rm, 0, size); + read_vec_element_i32(s, ele3, rd, 0, size); + + switch (opcode) { + case 0x0: /* SQRDMLAH */ + if (size == 1) { + gen_helper_neon_qrdmlah_s16(ele3, cpu_env, ele1, ele2, ele3); + } else { + gen_helper_neon_qrdmlah_s32(ele3, cpu_env, ele1, ele2, ele3); + } + break; + case 0x1: /* SQRDMLSH */ + if (size == 1) { + gen_helper_neon_qrdmlsh_s16(ele3, cpu_env, ele1, ele2, ele3); + } else { + gen_helper_neon_qrdmlsh_s32(ele3, cpu_env, ele1, ele2, ele3); + } + break; + default: + g_assert_not_reached(); + } + tcg_temp_free_i32(ele1); + tcg_temp_free_i32(ele2); + + res = tcg_temp_new_i64(); + tcg_gen_extu_i32_i64(res, ele3); + tcg_temp_free_i32(ele3); + + write_fp_dreg(s, rd, res); + tcg_temp_free_i64(res); +} + static void handle_2misc_64(DisasContext *s, int opcode, bool u, TCGv_i64 tcg_rd, TCGv_i64 tcg_rn, TCGv_i32 tcg_rmode, TCGv_ptr tcg_fpstatus) @@ -10706,6 +10816,134 @@ static void disas_simd_three_reg_same_fp16(DisasContext *s, uint32_t insn) clear_vec_high(s, is_q, rd); } +/* AdvSIMD three same extra + * 31 30 29 28 24 23 22 21 20 16 15 14 11 10 9 5 4 0 + * +---+---+---+-----------+------+---+------+---+--------+---+----+----+ + * | 0 | Q | U | 0 1 1 1 0 | size | 0 | Rm | 1 | opcode | 1 | Rn | Rd | + * +---+---+---+-----------+------+---+------+---+--------+---+----+----+ + */ +static void disas_simd_three_reg_same_extra(DisasContext *s, uint32_t insn) +{ + int rd = extract32(insn, 0, 5); + int rn = extract32(insn, 5, 5); + int opcode = extract32(insn, 11, 4); + int rm = extract32(insn, 16, 5); + int size = extract32(insn, 22, 2); + bool u = extract32(insn, 29, 1); + bool is_q = extract32(insn, 30, 1); + int feature, rot; + + switch (u * 16 + opcode) { + case 0x10: /* SQRDMLAH (vector) */ + case 0x11: /* SQRDMLSH (vector) */ + if (size != 1 && size != 2) { + unallocated_encoding(s); + return; + } + feature = ARM_FEATURE_V8_RDM; + break; + case 0x8: /* 
FCMLA, #0 */ + case 0x9: /* FCMLA, #90 */ + case 0xa: /* FCMLA, #180 */ + case 0xb: /* FCMLA, #270 */ + case 0xc: /* FCADD, #90 */ + case 0xe: /* FCADD, #270 */ + if (size == 0 + || (size == 1 && !arm_dc_feature(s, ARM_FEATURE_V8_FP16)) + || (size == 3 && !is_q)) { + unallocated_encoding(s); + return; + } + feature = ARM_FEATURE_V8_FCMA; + break; + default: + unallocated_encoding(s); + return; + } + if (!arm_dc_feature(s, feature)) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + + switch (opcode) { + case 0x0: /* SQRDMLAH (vector) */ + switch (size) { + case 1: + gen_gvec_op3_env(s, is_q, rd, rn, rm, gen_helper_gvec_qrdmlah_s16); + break; + case 2: + gen_gvec_op3_env(s, is_q, rd, rn, rm, gen_helper_gvec_qrdmlah_s32); + break; + default: + g_assert_not_reached(); + } + return; + + case 0x1: /* SQRDMLSH (vector) */ + switch (size) { + case 1: + gen_gvec_op3_env(s, is_q, rd, rn, rm, gen_helper_gvec_qrdmlsh_s16); + break; + case 2: + gen_gvec_op3_env(s, is_q, rd, rn, rm, gen_helper_gvec_qrdmlsh_s32); + break; + default: + g_assert_not_reached(); + } + return; + + case 0x8: /* FCMLA, #0 */ + case 0x9: /* FCMLA, #90 */ + case 0xa: /* FCMLA, #180 */ + case 0xb: /* FCMLA, #270 */ + rot = extract32(opcode, 0, 2); + switch (size) { + case 1: + gen_gvec_op3_fpst(s, is_q, rd, rn, rm, true, rot, + gen_helper_gvec_fcmlah); + break; + case 2: + gen_gvec_op3_fpst(s, is_q, rd, rn, rm, false, rot, + gen_helper_gvec_fcmlas); + break; + case 3: + gen_gvec_op3_fpst(s, is_q, rd, rn, rm, false, rot, + gen_helper_gvec_fcmlad); + break; + default: + g_assert_not_reached(); + } + return; + + case 0xc: /* FCADD, #90 */ + case 0xe: /* FCADD, #270 */ + rot = extract32(opcode, 1, 1); + switch (size) { + case 1: + gen_gvec_op3_fpst(s, is_q, rd, rn, rm, size == 1, rot, + gen_helper_gvec_fcaddh); + break; + case 2: + gen_gvec_op3_fpst(s, is_q, rd, rn, rm, size == 1, rot, + gen_helper_gvec_fcadds); + break; + case 3: + gen_gvec_op3_fpst(s, is_q, rd, rn, rm, size == 1, rot, + gen_helper_gvec_fcaddd); + break; + default: + g_assert_not_reached(); + } + return; + + default: + g_assert_not_reached(); + } +} + static void handle_2misc_widening(DisasContext *s, int opcode, bool is_q, int size, int rn, int rd) { @@ -11782,107 +12020,137 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn) int rn = extract32(insn, 5, 5); int rd = extract32(insn, 0, 5); bool is_long = false; - bool is_fp = false; + int is_fp = 0; bool is_fp16 = false; int index; TCGv_ptr fpst; - switch (opcode) { - case 0x0: /* MLA */ - case 0x4: /* MLS */ - if (!u || is_scalar) { + switch (16 * u + opcode) { + case 0x08: /* MUL */ + case 0x10: /* MLA */ + case 0x14: /* MLS */ + if (is_scalar) { unallocated_encoding(s); return; } break; - case 0x2: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */ - case 0x6: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */ - case 0xa: /* SMULL, SMULL2, UMULL, UMULL2 */ + case 0x02: /* SMLAL, SMLAL2 */ + case 0x12: /* UMLAL, UMLAL2 */ + case 0x06: /* SMLSL, SMLSL2 */ + case 0x16: /* UMLSL, UMLSL2 */ + case 0x0a: /* SMULL, SMULL2 */ + case 0x1a: /* UMULL, UMULL2 */ if (is_scalar) { unallocated_encoding(s); return; } is_long = true; break; - case 0x3: /* SQDMLAL, SQDMLAL2 */ - case 0x7: /* SQDMLSL, SQDMLSL2 */ - case 0xb: /* SQDMULL, SQDMULL2 */ + case 0x03: /* SQDMLAL, SQDMLAL2 */ + case 0x07: /* SQDMLSL, SQDMLSL2 */ + case 0x0b: /* SQDMULL, SQDMULL2 */ is_long = true; - /* fall through */ - case 0xc: /* SQDMULH */ - case 0xd: /* SQRDMULH */ - if (u) { - unallocated_encoding(s); - return; - } break; - case 
0x8: /* MUL */ - if (u || is_scalar) { - unallocated_encoding(s); - return; - } + case 0x0c: /* SQDMULH */ + case 0x0d: /* SQRDMULH */ break; - case 0x1: /* FMLA */ - case 0x5: /* FMLS */ - if (u) { + case 0x01: /* FMLA */ + case 0x05: /* FMLS */ + case 0x09: /* FMUL */ + case 0x19: /* FMULX */ + is_fp = 1; + break; + case 0x1d: /* SQRDMLAH */ + case 0x1f: /* SQRDMLSH */ + if (!arm_dc_feature(s, ARM_FEATURE_V8_RDM)) { unallocated_encoding(s); return; } - /* fall through */ - case 0x9: /* FMUL, FMULX */ - if (size == 1) { + break; + case 0x11: /* FCMLA #0 */ + case 0x13: /* FCMLA #90 */ + case 0x15: /* FCMLA #180 */ + case 0x17: /* FCMLA #270 */ + if (!arm_dc_feature(s, ARM_FEATURE_V8_FCMA)) { unallocated_encoding(s); return; } - is_fp = true; + is_fp = 2; break; default: unallocated_encoding(s); return; } - if (is_fp) { + switch (is_fp) { + case 1: /* normal fp */ /* convert insn encoded size to TCGMemOp size */ switch (size) { - case 2: /* single precision */ - size = MO_32; - index = h << 1 | l; - rm |= (m << 4); - break; - case 3: /* double precision */ - size = MO_64; - if (l || !is_q) { - unallocated_encoding(s); - return; - } - index = h; - rm |= (m << 4); - break; - case 0: /* half precision */ + case 0: /* half-precision */ size = MO_16; - index = h << 2 | l << 1 | m; is_fp16 = true; - if (arm_dc_feature(s, ARM_FEATURE_V8_FP16)) { - break; - } - /* fallthru */ - default: /* unallocated */ + break; + case MO_32: /* single precision */ + case MO_64: /* double precision */ + break; + default: unallocated_encoding(s); return; } - } else { + break; + + case 2: /* complex fp */ + /* Each indexable element is a complex pair. */ + size <<= 1; switch (size) { - case 1: - index = h << 2 | l << 1 | m; + case MO_32: + if (h && !is_q) { + unallocated_encoding(s); + return; + } + is_fp16 = true; break; - case 2: - index = h << 1 | l; - rm |= (m << 4); + case MO_64: break; default: unallocated_encoding(s); return; } + break; + + default: /* integer */ + switch (size) { + case MO_8: + case MO_64: + unallocated_encoding(s); + return; + } + break; + } + if (is_fp16 && !arm_dc_feature(s, ARM_FEATURE_V8_FP16)) { + unallocated_encoding(s); + return; + } + + /* Given TCGMemOp size, adjust register and indexing. */ + switch (size) { + case MO_16: + index = h << 2 | l << 1 | m; + break; + case MO_32: + index = h << 1 | l; + rm |= m << 4; + break; + case MO_64: + if (l || !is_q) { + unallocated_encoding(s); + return; + } + index = h; + rm |= m << 4; + break; + default: + g_assert_not_reached(); } if (!fp_access_check(s)) { @@ -11895,6 +12163,23 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn) fpst = NULL; } + switch (16 * u + opcode) { + case 0x11: /* FCMLA #0 */ + case 0x13: /* FCMLA #90 */ + case 0x15: /* FCMLA #180 */ + case 0x17: /* FCMLA #270 */ + tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_reg_offset(s, rm, index, size), fpst, + is_q ? 16 : 8, vec_full_reg_size(s), + extract32(insn, 13, 2), /* rot */ + size == MO_64 + ? 
gen_helper_gvec_fcmlas_idx + : gen_helper_gvec_fcmlah_idx); + tcg_temp_free_ptr(fpst); + return; + } + if (size == 3) { TCGv_i64 tcg_idx = tcg_temp_new_i64(); int pass; @@ -11909,21 +12194,20 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn) read_vec_element(s, tcg_op, rn, pass, MO_64); - switch (opcode) { - case 0x5: /* FMLS */ + switch (16 * u + opcode) { + case 0x05: /* FMLS */ /* As usual for ARM, separate negation for fused multiply-add */ gen_helper_vfp_negd(tcg_op, tcg_op); /* fall through */ - case 0x1: /* FMLA */ + case 0x01: /* FMLA */ read_vec_element(s, tcg_res, rd, pass, MO_64); gen_helper_vfp_muladdd(tcg_res, tcg_op, tcg_idx, tcg_res, fpst); break; - case 0x9: /* FMUL, FMULX */ - if (u) { - gen_helper_vfp_mulxd(tcg_res, tcg_op, tcg_idx, fpst); - } else { - gen_helper_vfp_muld(tcg_res, tcg_op, tcg_idx, fpst); - } + case 0x09: /* FMUL */ + gen_helper_vfp_muld(tcg_res, tcg_op, tcg_idx, fpst); + break; + case 0x19: /* FMULX */ + gen_helper_vfp_mulxd(tcg_res, tcg_op, tcg_idx, fpst); break; default: g_assert_not_reached(); @@ -11966,10 +12250,10 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn) read_vec_element_i32(s, tcg_op, rn, pass, is_scalar ? size : MO_32); - switch (opcode) { - case 0x0: /* MLA */ - case 0x4: /* MLS */ - case 0x8: /* MUL */ + switch (16 * u + opcode) { + case 0x08: /* MUL */ + case 0x10: /* MLA */ + case 0x14: /* MLS */ { static NeonGenTwoOpFn * const fns[2][2] = { { gen_helper_neon_add_u16, gen_helper_neon_sub_u16 }, @@ -11991,8 +12275,8 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn) genfn(tcg_res, tcg_op, tcg_res); break; } - case 0x5: /* FMLS */ - case 0x1: /* FMLA */ + case 0x05: /* FMLS */ + case 0x01: /* FMLA */ read_vec_element_i32(s, tcg_res, rd, pass, is_scalar ? 
size : MO_32); switch (size) { @@ -12023,39 +12307,43 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn) g_assert_not_reached(); } break; - case 0x9: /* FMUL, FMULX */ + case 0x09: /* FMUL */ switch (size) { case 1: - if (u) { - if (is_scalar) { - gen_helper_advsimd_mulxh(tcg_res, tcg_op, - tcg_idx, fpst); - } else { - gen_helper_advsimd_mulx2h(tcg_res, tcg_op, - tcg_idx, fpst); - } + if (is_scalar) { + gen_helper_advsimd_mulh(tcg_res, tcg_op, + tcg_idx, fpst); } else { - if (is_scalar) { - gen_helper_advsimd_mulh(tcg_res, tcg_op, - tcg_idx, fpst); - } else { - gen_helper_advsimd_mul2h(tcg_res, tcg_op, - tcg_idx, fpst); - } + gen_helper_advsimd_mul2h(tcg_res, tcg_op, + tcg_idx, fpst); } break; case 2: - if (u) { - gen_helper_vfp_mulxs(tcg_res, tcg_op, tcg_idx, fpst); + gen_helper_vfp_muls(tcg_res, tcg_op, tcg_idx, fpst); + break; + default: + g_assert_not_reached(); + } + break; + case 0x19: /* FMULX */ + switch (size) { + case 1: + if (is_scalar) { + gen_helper_advsimd_mulxh(tcg_res, tcg_op, + tcg_idx, fpst); } else { - gen_helper_vfp_muls(tcg_res, tcg_op, tcg_idx, fpst); + gen_helper_advsimd_mulx2h(tcg_res, tcg_op, + tcg_idx, fpst); } break; + case 2: + gen_helper_vfp_mulxs(tcg_res, tcg_op, tcg_idx, fpst); + break; default: g_assert_not_reached(); } break; - case 0xc: /* SQDMULH */ + case 0x0c: /* SQDMULH */ if (size == 1) { gen_helper_neon_qdmulh_s16(tcg_res, cpu_env, tcg_op, tcg_idx); @@ -12064,7 +12352,7 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn) tcg_op, tcg_idx); } break; - case 0xd: /* SQRDMULH */ + case 0x0d: /* SQRDMULH */ if (size == 1) { gen_helper_neon_qrdmulh_s16(tcg_res, cpu_env, tcg_op, tcg_idx); @@ -12073,6 +12361,28 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn) tcg_op, tcg_idx); } break; + case 0x1d: /* SQRDMLAH */ + read_vec_element_i32(s, tcg_res, rd, pass, + is_scalar ? size : MO_32); + if (size == 1) { + gen_helper_neon_qrdmlah_s16(tcg_res, cpu_env, + tcg_op, tcg_idx, tcg_res); + } else { + gen_helper_neon_qrdmlah_s32(tcg_res, cpu_env, + tcg_op, tcg_idx, tcg_res); + } + break; + case 0x1f: /* SQRDMLSH */ + read_vec_element_i32(s, tcg_res, rd, pass, + is_scalar ? 
size : MO_32); + if (size == 1) { + gen_helper_neon_qrdmlsh_s16(tcg_res, cpu_env, + tcg_op, tcg_idx, tcg_res); + } else { + gen_helper_neon_qrdmlsh_s32(tcg_res, cpu_env, + tcg_op, tcg_idx, tcg_res); + } + break; default: g_assert_not_reached(); } @@ -12794,6 +13104,7 @@ static void disas_crypto_three_reg_imm2(DisasContext *s, uint32_t insn) static const AArch64DecodeTable data_proc_simd[] = { /* pattern , mask , fn */ { 0x0e200400, 0x9f200400, disas_simd_three_reg_same }, + { 0x0e008400, 0x9f208400, disas_simd_three_reg_same_extra }, { 0x0e200000, 0x9f200c00, disas_simd_three_reg_diff }, { 0x0e200800, 0x9f3e0c00, disas_simd_two_reg_misc }, { 0x0e300800, 0x9f3e0c00, disas_simd_across_lanes }, @@ -12806,6 +13117,7 @@ static const AArch64DecodeTable data_proc_simd[] = { { 0x0e000800, 0xbf208c00, disas_simd_zip_trn }, { 0x2e000000, 0xbf208400, disas_simd_ext }, { 0x5e200400, 0xdf200400, disas_simd_scalar_three_reg_same }, + { 0x5e008400, 0xdf208400, disas_simd_scalar_three_reg_same_extra }, { 0x5e200000, 0xdf200c00, disas_simd_scalar_three_reg_diff }, { 0x5e200800, 0xdf3e0c00, disas_simd_scalar_two_reg_misc }, { 0x5e300800, 0xdf3e0c00, disas_simd_scalar_pairwise }, diff --git a/target/arm/translate.c b/target/arm/translate.c index aa6dcaa577..ba6ab7d287 100644 --- a/target/arm/translate.c +++ b/target/arm/translate.c @@ -25,6 +25,7 @@ #include "disas/disas.h" #include "exec/exec-all.h" #include "tcg-op.h" +#include "tcg-op-gvec.h" #include "qemu/log.h" #include "qemu/bitops.h" #include "arm_ldst.h" @@ -75,6 +76,10 @@ static const char *regnames[] = { "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "pc" }; +/* Function prototypes for gen_ functions calling Neon helpers. */ +typedef void NeonGenThreeOpEnvFn(TCGv_i32, TCGv_env, TCGv_i32, + TCGv_i32, TCGv_i32); + /* initialize TCG globals. */ void arm_translate_init(void) { @@ -5374,9 +5379,9 @@ static void gen_neon_narrow_op(int op, int u, int size, #define NEON_3R_VPMAX 20 #define NEON_3R_VPMIN 21 #define NEON_3R_VQDMULH_VQRDMULH 22 -#define NEON_3R_VPADD 23 +#define NEON_3R_VPADD_VQRDMLAH 23 #define NEON_3R_SHA 24 /* SHA1C,SHA1P,SHA1M,SHA1SU0,SHA256H{2},SHA256SU1 */ -#define NEON_3R_VFM 25 /* VFMA, VFMS : float fused multiply-add */ +#define NEON_3R_VFM_VQRDMLSH 25 /* VFMA, VFMS, VQRDMLSH */ #define NEON_3R_FLOAT_ARITH 26 /* float VADD, VSUB, VPADD, VABD */ #define NEON_3R_FLOAT_MULTIPLY 27 /* float VMLA, VMLS, VMUL */ #define NEON_3R_FLOAT_CMP 28 /* float VCEQ, VCGE, VCGT */ @@ -5408,9 +5413,9 @@ static const uint8_t neon_3r_sizes[] = { [NEON_3R_VPMAX] = 0x7, [NEON_3R_VPMIN] = 0x7, [NEON_3R_VQDMULH_VQRDMULH] = 0x6, - [NEON_3R_VPADD] = 0x7, + [NEON_3R_VPADD_VQRDMLAH] = 0x7, [NEON_3R_SHA] = 0xf, /* size field encodes op type */ - [NEON_3R_VFM] = 0x5, /* size bit 1 encodes op */ + [NEON_3R_VFM_VQRDMLSH] = 0x7, /* For VFM, size bit 1 encodes op */ [NEON_3R_FLOAT_ARITH] = 0x5, /* size bit 1 encodes op */ [NEON_3R_FLOAT_MULTIPLY] = 0x5, /* size bit 1 encodes op */ [NEON_3R_FLOAT_CMP] = 0x5, /* size bit 1 encodes op */ @@ -5589,6 +5594,22 @@ static const uint8_t neon_2rm_sizes[] = { [NEON_2RM_VCVT_UF] = 0x4, }; + +/* Expand v8.1 simd helper. 
*/ +static int do_v81_helper(DisasContext *s, gen_helper_gvec_3_ptr *fn, + int q, int rd, int rn, int rm) +{ + if (arm_dc_feature(s, ARM_FEATURE_V8_RDM)) { + int opr_sz = (1 + q) * 8; + tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd), + vfp_reg_offset(1, rn), + vfp_reg_offset(1, rm), cpu_env, + opr_sz, opr_sz, 0, fn); + return 0; + } + return 1; +} + /* Translate a NEON data processing instruction. Return nonzero if the instruction is invalid. We process data in a mixture of 32-bit and 64-bit chunks. @@ -5641,12 +5662,13 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) if (q && ((rd | rn | rm) & 1)) { return 1; } - /* - * The SHA-1/SHA-256 3-register instructions require special treatment - * here, as their size field is overloaded as an op type selector, and - * they all consume their input in a single pass. - */ - if (op == NEON_3R_SHA) { + switch (op) { + case NEON_3R_SHA: + /* The SHA-1/SHA-256 3-register instructions require special + * treatment here, as their size field is overloaded as an + * op type selector, and they all consume their input in a + * single pass. + */ if (!q) { return 1; } @@ -5683,6 +5705,40 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) tcg_temp_free_ptr(ptr2); tcg_temp_free_ptr(ptr3); return 0; + + case NEON_3R_VPADD_VQRDMLAH: + if (!u) { + break; /* VPADD */ + } + /* VQRDMLAH */ + switch (size) { + case 1: + return do_v81_helper(s, gen_helper_gvec_qrdmlah_s16, + q, rd, rn, rm); + case 2: + return do_v81_helper(s, gen_helper_gvec_qrdmlah_s32, + q, rd, rn, rm); + } + return 1; + + case NEON_3R_VFM_VQRDMLSH: + if (!u) { + /* VFM, VFMS */ + if (size == 1) { + return 1; + } + break; + } + /* VQRDMLSH */ + switch (size) { + case 1: + return do_v81_helper(s, gen_helper_gvec_qrdmlsh_s16, + q, rd, rn, rm); + case 2: + return do_v81_helper(s, gen_helper_gvec_qrdmlsh_s32, + q, rd, rn, rm); + } + return 1; } if (size == 3 && op != NEON_3R_LOGIC) { /* 64-bit element instructions. 
*/ @@ -5768,11 +5824,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) rm = rtmp; } break; - case NEON_3R_VPADD: - if (u) { - return 1; - } - /* Fall through */ + case NEON_3R_VPADD_VQRDMLAH: case NEON_3R_VPMAX: case NEON_3R_VPMIN: pairwise = 1; @@ -5806,8 +5858,8 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) return 1; } break; - case NEON_3R_VFM: - if (!arm_dc_feature(s, ARM_FEATURE_VFP4) || u) { + case NEON_3R_VFM_VQRDMLSH: + if (!arm_dc_feature(s, ARM_FEATURE_VFP4)) { return 1; } break; @@ -6004,7 +6056,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) } } break; - case NEON_3R_VPADD: + case NEON_3R_VPADD_VQRDMLAH: switch (size) { case 0: gen_helper_neon_padd_u8(tmp, tmp, tmp2); break; case 1: gen_helper_neon_padd_u16(tmp, tmp, tmp2); break; @@ -6103,7 +6155,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) } } break; - case NEON_3R_VFM: + case NEON_3R_VFM_VQRDMLSH: { /* VFMA, VFMS: fused multiply-add */ TCGv_ptr fpstatus = get_fpstatus_ptr(1); @@ -6937,11 +6989,45 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) } neon_store_reg64(cpu_V0, rd + pass); } + break; + case 14: /* VQRDMLAH scalar */ + case 15: /* VQRDMLSH scalar */ + { + NeonGenThreeOpEnvFn *fn; + if (!arm_dc_feature(s, ARM_FEATURE_V8_RDM)) { + return 1; + } + if (u && ((rd | rn) & 1)) { + return 1; + } + if (op == 14) { + if (size == 1) { + fn = gen_helper_neon_qrdmlah_s16; + } else { + fn = gen_helper_neon_qrdmlah_s32; + } + } else { + if (size == 1) { + fn = gen_helper_neon_qrdmlsh_s16; + } else { + fn = gen_helper_neon_qrdmlsh_s32; + } + } + tmp2 = neon_get_scalar(size, rm); + for (pass = 0; pass < (u ? 4 : 2); pass++) { + tmp = neon_load_reg(rn, pass); + tmp3 = neon_load_reg(rd, pass); + fn(tmp, cpu_env, tmp, tmp2, tmp3); + tcg_temp_free_i32(tmp3); + neon_store_reg(rd, pass, tmp); + } + tcg_temp_free_i32(tmp2); + } break; - default: /* 14 and 15 are RESERVED */ - return 1; + default: + g_assert_not_reached(); } } } else { /* size == 3 */ @@ -7594,6 +7680,123 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) return 0; } +/* Advanced SIMD three registers of the same length extension. + * 31 25 23 22 20 16 12 11 10 9 8 3 0 + * +---------------+-----+---+-----+----+----+---+----+---+----+---------+----+ + * | 1 1 1 1 1 1 0 | op1 | D | op2 | Vn | Vd | 1 | o3 | 0 | o4 | N Q M U | Vm | + * +---------------+-----+---+-----+----+----+---+----+---+----+---------+----+ + */ +static int disas_neon_insn_3same_ext(DisasContext *s, uint32_t insn) +{ + gen_helper_gvec_3_ptr *fn_gvec_ptr; + int rd, rn, rm, rot, size, opr_sz; + TCGv_ptr fpst; + bool q; + + q = extract32(insn, 6, 1); + VFP_DREG_D(rd, insn); + VFP_DREG_N(rn, insn); + VFP_DREG_M(rm, insn); + if ((rd | rn | rm) & q) { + return 1; + } + + if ((insn & 0xfe200f10) == 0xfc200800) { + /* VCMLA -- 1111 110R R.1S .... .... 1000 ...0 .... */ + size = extract32(insn, 20, 1); + rot = extract32(insn, 23, 2); + if (!arm_dc_feature(s, ARM_FEATURE_V8_FCMA) + || (!size && !arm_dc_feature(s, ARM_FEATURE_V8_FP16))) { + return 1; + } + fn_gvec_ptr = size ? gen_helper_gvec_fcmlas : gen_helper_gvec_fcmlah; + } else if ((insn & 0xfea00f10) == 0xfc800800) { + /* VCADD -- 1111 110R 1.0S .... .... 1000 ...0 .... */ + size = extract32(insn, 20, 1); + rot = extract32(insn, 24, 1); + if (!arm_dc_feature(s, ARM_FEATURE_V8_FCMA) + || (!size && !arm_dc_feature(s, ARM_FEATURE_V8_FP16))) { + return 1; + } + fn_gvec_ptr = size ? 
gen_helper_gvec_fcadds : gen_helper_gvec_fcaddh; + } else { + return 1; + } + + if (s->fp_excp_el) { + gen_exception_insn(s, 4, EXCP_UDEF, + syn_fp_access_trap(1, 0xe, false), s->fp_excp_el); + return 0; + } + if (!s->vfp_enabled) { + return 1; + } + + opr_sz = (1 + q) * 8; + fpst = get_fpstatus_ptr(1); + tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd), + vfp_reg_offset(1, rn), + vfp_reg_offset(1, rm), fpst, + opr_sz, opr_sz, rot, fn_gvec_ptr); + tcg_temp_free_ptr(fpst); + return 0; +} + +/* Advanced SIMD two registers and a scalar extension. + * 31 24 23 22 20 16 12 11 10 9 8 3 0 + * +-----------------+----+---+----+----+----+---+----+---+----+---------+----+ + * | 1 1 1 1 1 1 1 0 | o1 | D | o2 | Vn | Vd | 1 | o3 | 0 | o4 | N Q M U | Vm | + * +-----------------+----+---+----+----+----+---+----+---+----+---------+----+ + * + */ + +static int disas_neon_insn_2reg_scalar_ext(DisasContext *s, uint32_t insn) +{ + int rd, rn, rm, rot, size, opr_sz; + TCGv_ptr fpst; + bool q; + + q = extract32(insn, 6, 1); + VFP_DREG_D(rd, insn); + VFP_DREG_N(rn, insn); + VFP_DREG_M(rm, insn); + if ((rd | rn) & q) { + return 1; + } + + if ((insn & 0xff000f10) == 0xfe000800) { + /* VCMLA (indexed) -- 1111 1110 S.RR .... .... 1000 ...0 .... */ + rot = extract32(insn, 20, 2); + size = extract32(insn, 23, 1); + if (!arm_dc_feature(s, ARM_FEATURE_V8_FCMA) + || (!size && !arm_dc_feature(s, ARM_FEATURE_V8_FP16))) { + return 1; + } + } else { + return 1; + } + + if (s->fp_excp_el) { + gen_exception_insn(s, 4, EXCP_UDEF, + syn_fp_access_trap(1, 0xe, false), s->fp_excp_el); + return 0; + } + if (!s->vfp_enabled) { + return 1; + } + + opr_sz = (1 + q) * 8; + fpst = get_fpstatus_ptr(1); + tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd), + vfp_reg_offset(1, rn), + vfp_reg_offset(1, rm), fpst, + opr_sz, opr_sz, rot, + size ? gen_helper_gvec_fcmlas_idx + : gen_helper_gvec_fcmlah_idx); + tcg_temp_free_ptr(fpst); + return 0; +} + static int disas_coproc_insn(DisasContext *s, uint32_t insn) { int cpnum, is64, crn, crm, opc1, opc2, isread, rt, rt2; @@ -8338,6 +8541,18 @@ static void disas_arm_insn(DisasContext *s, unsigned int insn) } } } + } else if ((insn & 0x0e000a00) == 0x0c000800 + && arm_dc_feature(s, ARM_FEATURE_V8)) { + if (disas_neon_insn_3same_ext(s, insn)) { + goto illegal_op; + } + return; + } else if ((insn & 0x0f000a00) == 0x0e000800 + && arm_dc_feature(s, ARM_FEATURE_V8)) { + if (disas_neon_insn_2reg_scalar_ext(s, insn)) { + goto illegal_op; + } + return; } else if ((insn & 0x0fe00000) == 0x0c400000) { /* Coprocessor double register transfer. */ ARCH(5TE); @@ -10559,7 +10774,19 @@ static void disas_thumb2_insn(DisasContext *s, uint32_t insn) default_exception_el(s)); break; } - if (((insn >> 24) & 3) == 3) { + if ((insn & 0xfe000a00) == 0xfc000800 + && arm_dc_feature(s, ARM_FEATURE_V8)) { + /* The Thumb2 and ARM encodings are identical. */ + if (disas_neon_insn_3same_ext(s, insn)) { + goto illegal_op; + } + } else if ((insn & 0xff000a00) == 0xfe000800 + && arm_dc_feature(s, ARM_FEATURE_V8)) { + /* The Thumb2 and ARM encodings are identical. */ + if (disas_neon_insn_2reg_scalar_ext(s, insn)) { + goto illegal_op; + } + } else if (((insn >> 24) & 3) == 3) { /* Translate into the equivalent ARM encoding. 
*/ insn = (insn & 0xe2ffffff) | ((insn & (1 << 28)) >> 4) | (1 << 28); if (disas_neon_data_insn(s, insn)) { diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c new file mode 100644 index 0000000000..ec705cfca5 --- /dev/null +++ b/target/arm/vec_helper.c @@ -0,0 +1,429 @@ +/* + * ARM AdvSIMD / SVE Vector Operations + * + * Copyright (c) 2018 Linaro + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "cpu.h" +#include "exec/exec-all.h" +#include "exec/helper-proto.h" +#include "tcg/tcg-gvec-desc.h" +#include "fpu/softfloat.h" + + +/* Note that vector data is stored in host-endian 64-bit chunks, + so addressing units smaller than that needs a host-endian fixup. */ +#ifdef HOST_WORDS_BIGENDIAN +#define H1(x) ((x) ^ 7) +#define H2(x) ((x) ^ 3) +#define H4(x) ((x) ^ 1) +#else +#define H1(x) (x) +#define H2(x) (x) +#define H4(x) (x) +#endif + +#define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q + +static void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz) +{ + uint64_t *d = vd + opr_sz; + uintptr_t i; + + for (i = opr_sz; i < max_sz; i += 8) { + *d++ = 0; + } +} + +/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */ +static uint16_t inl_qrdmlah_s16(CPUARMState *env, int16_t src1, + int16_t src2, int16_t src3) +{ + /* Simplify: + * = ((a3 << 16) + ((e1 * e2) << 1) + (1 << 15)) >> 16 + * = ((a3 << 15) + (e1 * e2) + (1 << 14)) >> 15 + */ + int32_t ret = (int32_t)src1 * src2; + ret = ((int32_t)src3 << 15) + ret + (1 << 14); + ret >>= 15; + if (ret != (int16_t)ret) { + SET_QC(); + ret = (ret < 0 ? -0x8000 : 0x7fff); + } + return ret; +} + +uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1, + uint32_t src2, uint32_t src3) +{ + uint16_t e1 = inl_qrdmlah_s16(env, src1, src2, src3); + uint16_t e2 = inl_qrdmlah_s16(env, src1 >> 16, src2 >> 16, src3 >> 16); + return deposit32(e1, 16, 16, e2); +} + +void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm, + void *ve, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + int16_t *d = vd; + int16_t *n = vn; + int16_t *m = vm; + CPUARMState *env = ve; + uintptr_t i; + + for (i = 0; i < opr_sz / 2; ++i) { + d[i] = inl_qrdmlah_s16(env, n[i], m[i], d[i]); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +/* Signed saturating rounding doubling multiply-subtract high half, 16-bit */ +static uint16_t inl_qrdmlsh_s16(CPUARMState *env, int16_t src1, + int16_t src2, int16_t src3) +{ + /* Similarly, using subtraction: + * = ((a3 << 16) - ((e1 * e2) << 1) + (1 << 15)) >> 16 + * = ((a3 << 15) - (e1 * e2) + (1 << 14)) >> 15 + */ + int32_t ret = (int32_t)src1 * src2; + ret = ((int32_t)src3 << 15) - ret + (1 << 14); + ret >>= 15; + if (ret != (int16_t)ret) { + SET_QC(); + ret = (ret < 0 ? 
-0x8000 : 0x7fff); + } + return ret; +} + +uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1, + uint32_t src2, uint32_t src3) +{ + uint16_t e1 = inl_qrdmlsh_s16(env, src1, src2, src3); + uint16_t e2 = inl_qrdmlsh_s16(env, src1 >> 16, src2 >> 16, src3 >> 16); + return deposit32(e1, 16, 16, e2); +} + +void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm, + void *ve, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + int16_t *d = vd; + int16_t *n = vn; + int16_t *m = vm; + CPUARMState *env = ve; + uintptr_t i; + + for (i = 0; i < opr_sz / 2; ++i) { + d[i] = inl_qrdmlsh_s16(env, n[i], m[i], d[i]); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */ +uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1, + int32_t src2, int32_t src3) +{ + /* Simplify similarly to int_qrdmlah_s16 above. */ + int64_t ret = (int64_t)src1 * src2; + ret = ((int64_t)src3 << 31) + ret + (1 << 30); + ret >>= 31; + if (ret != (int32_t)ret) { + SET_QC(); + ret = (ret < 0 ? INT32_MIN : INT32_MAX); + } + return ret; +} + +void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm, + void *ve, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + int32_t *d = vd; + int32_t *n = vn; + int32_t *m = vm; + CPUARMState *env = ve; + uintptr_t i; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = helper_neon_qrdmlah_s32(env, n[i], m[i], d[i]); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +/* Signed saturating rounding doubling multiply-subtract high half, 32-bit */ +uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1, + int32_t src2, int32_t src3) +{ + /* Simplify similarly to int_qrdmlsh_s16 above. */ + int64_t ret = (int64_t)src1 * src2; + ret = ((int64_t)src3 << 31) - ret + (1 << 30); + ret >>= 31; + if (ret != (int32_t)ret) { + SET_QC(); + ret = (ret < 0 ? INT32_MIN : INT32_MAX); + } + return ret; +} + +void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm, + void *ve, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + int32_t *d = vd; + int32_t *n = vn; + int32_t *m = vm; + CPUARMState *env = ve; + uintptr_t i; + + for (i = 0; i < opr_sz / 4; ++i) { + d[i] = helper_neon_qrdmlsh_s32(env, n[i], m[i], d[i]); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm, + void *vfpst, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + float16 *d = vd; + float16 *n = vn; + float16 *m = vm; + float_status *fpst = vfpst; + uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); + uint32_t neg_imag = neg_real ^ 1; + uintptr_t i; + + /* Shift boolean to the sign bit so we can xor to negate. */ + neg_real <<= 15; + neg_imag <<= 15; + + for (i = 0; i < opr_sz / 2; i += 2) { + float16 e0 = n[H2(i)]; + float16 e1 = m[H2(i + 1)] ^ neg_imag; + float16 e2 = n[H2(i + 1)]; + float16 e3 = m[H2(i)] ^ neg_real; + + d[H2(i)] = float16_add(e0, e1, fpst); + d[H2(i + 1)] = float16_add(e2, e3, fpst); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm, + void *vfpst, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + float32 *d = vd; + float32 *n = vn; + float32 *m = vm; + float_status *fpst = vfpst; + uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); + uint32_t neg_imag = neg_real ^ 1; + uintptr_t i; + + /* Shift boolean to the sign bit so we can xor to negate. 
*/ + neg_real <<= 31; + neg_imag <<= 31; + + for (i = 0; i < opr_sz / 4; i += 2) { + float32 e0 = n[H4(i)]; + float32 e1 = m[H4(i + 1)] ^ neg_imag; + float32 e2 = n[H4(i + 1)]; + float32 e3 = m[H4(i)] ^ neg_real; + + d[H4(i)] = float32_add(e0, e1, fpst); + d[H4(i + 1)] = float32_add(e2, e3, fpst); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm, + void *vfpst, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + float64 *d = vd; + float64 *n = vn; + float64 *m = vm; + float_status *fpst = vfpst; + uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1); + uint64_t neg_imag = neg_real ^ 1; + uintptr_t i; + + /* Shift boolean to the sign bit so we can xor to negate. */ + neg_real <<= 63; + neg_imag <<= 63; + + for (i = 0; i < opr_sz / 8; i += 2) { + float64 e0 = n[i]; + float64 e1 = m[i + 1] ^ neg_imag; + float64 e2 = n[i + 1]; + float64 e3 = m[i] ^ neg_real; + + d[i] = float64_add(e0, e1, fpst); + d[i + 1] = float64_add(e2, e3, fpst); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, + void *vfpst, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + float16 *d = vd; + float16 *n = vn; + float16 *m = vm; + float_status *fpst = vfpst; + intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); + uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + uint32_t neg_real = flip ^ neg_imag; + uintptr_t i; + + /* Shift boolean to the sign bit so we can xor to negate. */ + neg_real <<= 15; + neg_imag <<= 15; + + for (i = 0; i < opr_sz / 2; i += 2) { + float16 e2 = n[H2(i + flip)]; + float16 e1 = m[H2(i + flip)] ^ neg_real; + float16 e4 = e2; + float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag; + + d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst); + d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, + void *vfpst, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + float16 *d = vd; + float16 *n = vn; + float16 *m = vm; + float_status *fpst = vfpst; + intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); + uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + uint32_t neg_real = flip ^ neg_imag; + uintptr_t i; + float16 e1 = m[H2(flip)]; + float16 e3 = m[H2(1 - flip)]; + + /* Shift boolean to the sign bit so we can xor to negate. */ + neg_real <<= 15; + neg_imag <<= 15; + e1 ^= neg_real; + e3 ^= neg_imag; + + for (i = 0; i < opr_sz / 2; i += 2) { + float16 e2 = n[H2(i + flip)]; + float16 e4 = e2; + + d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst); + d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst); + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +} + +void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, + void *vfpst, uint32_t desc) +{ + uintptr_t opr_sz = simd_oprsz(desc); + float32 *d = vd; + float32 *n = vn; + float32 *m = vm; + float_status *fpst = vfpst; + intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); + uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); + uint32_t neg_real = flip ^ neg_imag; + uintptr_t i; + + /* Shift boolean to the sign bit so we can xor to negate. 
+ */
+    neg_real <<= 31;
+    neg_imag <<= 31;
+
+    for (i = 0; i < opr_sz / 4; i += 2) {
+        float32 e2 = n[H4(i + flip)];
+        float32 e1 = m[H4(i + flip)] ^ neg_real;
+        float32 e4 = e2;
+        float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
+
+        d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst);
+        d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst);
+    }
+    clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm,
+                             void *vfpst, uint32_t desc)
+{
+    uintptr_t opr_sz = simd_oprsz(desc);
+    float32 *d = vd;
+    float32 *n = vn;
+    float32 *m = vm;
+    float_status *fpst = vfpst;
+    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
+    uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+    uint32_t neg_real = flip ^ neg_imag;
+    uintptr_t i;
+    float32 e1 = m[H4(flip)];
+    float32 e3 = m[H4(1 - flip)];
+
+    /* Shift boolean to the sign bit so we can xor to negate. */
+    neg_real <<= 31;
+    neg_imag <<= 31;
+    e1 ^= neg_real;
+    e3 ^= neg_imag;
+
+    for (i = 0; i < opr_sz / 4; i += 2) {
+        float32 e2 = n[H4(i + flip)];
+        float32 e4 = e2;
+
+        d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst);
+        d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst);
+    }
+    clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm,
+                         void *vfpst, uint32_t desc)
+{
+    uintptr_t opr_sz = simd_oprsz(desc);
+    float64 *d = vd;
+    float64 *n = vn;
+    float64 *m = vm;
+    float_status *fpst = vfpst;
+    intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
+    uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+    uint64_t neg_real = flip ^ neg_imag;
+    uintptr_t i;
+
+    /* Shift boolean to the sign bit so we can xor to negate. */
+    neg_real <<= 63;
+    neg_imag <<= 63;
+
+    for (i = 0; i < opr_sz / 8; i += 2) {
+        float64 e2 = n[i + flip];
+        float64 e1 = m[i + flip] ^ neg_real;
+        float64 e4 = e2;
+        float64 e3 = m[i + 1 - flip] ^ neg_imag;
+
+        d[i] = float64_muladd(e2, e1, d[i], 0, fpst);
+        d[i + 1] = float64_muladd(e4, e3, d[i + 1], 0, fpst);
+    }
+    clear_tail(d, opr_sz, simd_maxsz(desc));
+}
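
On using the new IDAU hook: a board or SoC model implements the QOM interface declared in target/arm/idau.h and is then connected to the CPU through the CPU's "idau" link property (the mps2-an505 board does this via its IoT Kit object). A hypothetical minimal implementer might look like the sketch below; the type name, state struct and attribution policy are invented for illustration, while the check() signature and the IREGION_NOTVALID convention are those defined in idau.h above.

#include "qemu/osdep.h"
#include "qemu/module.h"
#include "hw/sysbus.h"
#include "target/arm/idau.h"

typedef struct MyBoardIDAUState {
    SysBusDevice parent_obj;
} MyBoardIDAUState;

/* Invented policy: the low 256MB of the address map is NonSecure,
 * everything else is Secure; nothing is exempt or NonSecure-callable.
 */
static void my_board_idau_check(IDAUInterface *ii, uint32_t address,
                                int *iregion, bool *exempt,
                                bool *ns, bool *nsc)
{
    *exempt = false;
    *nsc = false;
    *ns = address < 0x10000000;
    *iregion = *ns ? 0 : 1;
}

static void my_board_idau_class_init(ObjectClass *klass, void *data)
{
    IDAUInterfaceClass *iic = IDAU_INTERFACE_CLASS(klass);

    iic->check = my_board_idau_check;
}

static const TypeInfo my_board_idau_info = {
    .name          = "my-board-idau",            /* hypothetical */
    .parent        = TYPE_SYS_BUS_DEVICE,
    .instance_size = sizeof(MyBoardIDAUState),
    .class_init    = my_board_idau_class_init,
    .interfaces    = (InterfaceInfo[]) {
        { TYPE_IDAU_INTERFACE },
        { }
    },
};

static void my_board_idau_register_types(void)
{
    type_register_static(&my_board_idau_info);
}

type_init(my_board_idau_register_types)

The board code would then set the CPU's "idau" link to point at this object before realize, and could also set the "init-svtor" property added by this series to place the initial Secure vector table.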