aboutsummaryrefslogtreecommitdiff
path: root/target
diff options
context:
space:
mode:
Diffstat (limited to 'target')
-rw-r--r--target/arm/Makefile.objs2
-rw-r--r--target/arm/cpu.c66
-rw-r--r--target/arm/cpu.h8
-rw-r--r--target/arm/cpu64.c2
-rw-r--r--target/arm/helper.c28
-rw-r--r--target/arm/helper.h31
-rw-r--r--target/arm/idau.h61
-rw-r--r--target/arm/translate-a64.c500
-rw-r--r--target/arm/translate.c271
-rw-r--r--target/arm/vec_helper.c429
10 files changed, 1274 insertions, 124 deletions
diff --git a/target/arm/Makefile.objs b/target/arm/Makefile.objs
index 847fb52ee0..1297bead5f 100644
--- a/target/arm/Makefile.objs
+++ b/target/arm/Makefile.objs
@@ -5,7 +5,7 @@ obj-$(call land,$(CONFIG_KVM),$(call lnot,$(TARGET_AARCH64))) += kvm32.o
obj-$(call land,$(CONFIG_KVM),$(TARGET_AARCH64)) += kvm64.o
obj-$(call lnot,$(CONFIG_KVM)) += kvm-stub.o
obj-y += translate.o op_helper.o helper.o cpu.o
-obj-y += neon_helper.o iwmmxt_helper.o
+obj-y += neon_helper.o iwmmxt_helper.o vec_helper.o
obj-y += gdbstub.o
obj-$(TARGET_AARCH64) += cpu64.o translate-a64.o helper-a64.o gdbstub64.o
obj-y += crypto_helper.o
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index 1b3ae62db6..6b77aaa445 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -19,6 +19,7 @@
*/
#include "qemu/osdep.h"
+#include "target/arm/idau.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "cpu.h"
@@ -186,6 +187,7 @@ static void arm_cpu_reset(CPUState *s)
uint32_t initial_msp; /* Loaded from 0x0 */
uint32_t initial_pc; /* Loaded from 0x4 */
uint8_t *rom;
+ uint32_t vecbase;
if (arm_feature(env, ARM_FEATURE_M_SECURITY)) {
env->v7m.secure = true;
@@ -213,8 +215,11 @@ static void arm_cpu_reset(CPUState *s)
/* Unlike A/R profile, M profile defines the reset LR value */
env->regs[14] = 0xffffffff;
- /* Load the initial SP and PC from the vector table at address 0 */
- rom = rom_ptr(0);
+ env->v7m.vecbase[M_REG_S] = cpu->init_svtor & 0xffffff80;
+
+ /* Load the initial SP and PC from offset 0 and 4 in the vector table */
+ vecbase = env->v7m.vecbase[env->v7m.secure];
+ rom = rom_ptr(vecbase);
if (rom) {
/* Address zero is covered by ROM which hasn't yet been
* copied into physical memory.
@@ -227,8 +232,8 @@ static void arm_cpu_reset(CPUState *s)
* it got copied into memory. In the latter case, rom_ptr
* will return a NULL pointer and we should use ldl_phys instead.
*/
- initial_msp = ldl_phys(s->as, 0);
- initial_pc = ldl_phys(s->as, 4);
+ initial_msp = ldl_phys(s->as, vecbase);
+ initial_pc = ldl_phys(s->as, vecbase + 4);
}
env->regs[13] = initial_msp & 0xFFFFFFFC;
@@ -623,6 +628,10 @@ static Property arm_cpu_pmsav7_dregion_property =
pmsav7_dregion,
qdev_prop_uint32, uint32_t);
+/* M profile: initial value of the Secure VTOR */
+static Property arm_cpu_initsvtor_property =
+ DEFINE_PROP_UINT32("init-svtor", ARMCPU, init_svtor, 0);
+
static void arm_cpu_post_init(Object *obj)
{
ARMCPU *cpu = ARM_CPU(obj);
@@ -688,6 +697,15 @@ static void arm_cpu_post_init(Object *obj)
}
}
+ if (arm_feature(&cpu->env, ARM_FEATURE_M_SECURITY)) {
+ object_property_add_link(obj, "idau", TYPE_IDAU_INTERFACE, &cpu->idau,
+ qdev_prop_allow_set_link_before_realize,
+ OBJ_PROP_LINK_UNREF_ON_RELEASE,
+ &error_abort);
+ qdev_property_add_static(DEVICE(obj), &arm_cpu_initsvtor_property,
+ &error_abort);
+ }
+
qdev_property_add_static(DEVICE(obj), &arm_cpu_cfgend_property,
&error_abort);
}
@@ -1188,6 +1206,35 @@ static void cortex_m4_initfn(Object *obj)
cpu->id_isar5 = 0x00000000;
}
+static void cortex_m33_initfn(Object *obj)
+{
+ ARMCPU *cpu = ARM_CPU(obj);
+
+ set_feature(&cpu->env, ARM_FEATURE_V8);
+ set_feature(&cpu->env, ARM_FEATURE_M);
+ set_feature(&cpu->env, ARM_FEATURE_M_SECURITY);
+ set_feature(&cpu->env, ARM_FEATURE_THUMB_DSP);
+ cpu->midr = 0x410fd213; /* r0p3 */
+ cpu->pmsav7_dregion = 16;
+ cpu->sau_sregion = 8;
+ cpu->id_pfr0 = 0x00000030;
+ cpu->id_pfr1 = 0x00000210;
+ cpu->id_dfr0 = 0x00200000;
+ cpu->id_afr0 = 0x00000000;
+ cpu->id_mmfr0 = 0x00101F40;
+ cpu->id_mmfr1 = 0x00000000;
+ cpu->id_mmfr2 = 0x01000000;
+ cpu->id_mmfr3 = 0x00000000;
+ cpu->id_isar0 = 0x01101110;
+ cpu->id_isar1 = 0x02212000;
+ cpu->id_isar2 = 0x20232232;
+ cpu->id_isar3 = 0x01111131;
+ cpu->id_isar4 = 0x01310132;
+ cpu->id_isar5 = 0x00000000;
+ cpu->clidr = 0x00000000;
+ cpu->ctr = 0x8000c000;
+}
+
static void arm_v7m_class_init(ObjectClass *oc, void *data)
{
CPUClass *cc = CPU_CLASS(oc);
@@ -1650,6 +1697,8 @@ static void arm_any_initfn(Object *obj)
set_feature(&cpu->env, ARM_FEATURE_V8_SHA256);
set_feature(&cpu->env, ARM_FEATURE_V8_PMULL);
set_feature(&cpu->env, ARM_FEATURE_CRC);
+ set_feature(&cpu->env, ARM_FEATURE_V8_RDM);
+ set_feature(&cpu->env, ARM_FEATURE_V8_FCMA);
cpu->midr = 0xffffffff;
}
#endif
@@ -1679,6 +1728,8 @@ static const ARMCPUInfo arm_cpus[] = {
.class_init = arm_v7m_class_init },
{ .name = "cortex-m4", .initfn = cortex_m4_initfn,
.class_init = arm_v7m_class_init },
+ { .name = "cortex-m33", .initfn = cortex_m33_initfn,
+ .class_init = arm_v7m_class_init },
{ .name = "cortex-r5", .initfn = cortex_r5_initfn },
{ .name = "cortex-a7", .initfn = cortex_a7_initfn },
{ .name = "cortex-a8", .initfn = cortex_a8_initfn },
@@ -1821,11 +1872,18 @@ static const TypeInfo arm_cpu_type_info = {
.class_init = arm_cpu_class_init,
};
+static const TypeInfo idau_interface_type_info = {
+ .name = TYPE_IDAU_INTERFACE,
+ .parent = TYPE_INTERFACE,
+ .class_size = sizeof(IDAUInterfaceClass),
+};
+
static void arm_cpu_register_types(void)
{
const ARMCPUInfo *info = arm_cpus;
type_register_static(&arm_cpu_type_info);
+ type_register_static(&idau_interface_type_info);
while (info->name) {
cpu_register(info);
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 2b9740878b..8dd6b788df 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -694,6 +694,9 @@ struct ARMCPU {
/* MemoryRegion to use for secure physical accesses */
MemoryRegion *secure_memory;
+ /* For v8M, pointer to the IDAU interface provided by board/SoC */
+ Object *idau;
+
/* 'compatible' string for this CPU for Linux device trees */
const char *dtb_compatible;
@@ -728,6 +731,9 @@ struct ARMCPU {
*/
uint32_t psci_conduit;
+ /* For v8M, initial value of the Secure VTOR */
+ uint32_t init_svtor;
+
/* [QEMU_]KVM_ARM_TARGET_* constant for this CPU, or
* QEMU_KVM_ARM_TARGET_NONE if the kernel doesn't support this CPU type.
*/
@@ -1427,7 +1433,9 @@ enum arm_features {
ARM_FEATURE_V8_SHA3, /* implements SHA3 part of v8 Crypto Extensions */
ARM_FEATURE_V8_SM3, /* implements SM3 part of v8 Crypto Extensions */
ARM_FEATURE_V8_SM4, /* implements SM4 part of v8 Crypto Extensions */
+ ARM_FEATURE_V8_RDM, /* implements v8.1 simd round multiply */
ARM_FEATURE_V8_FP16, /* implements v8.2 half-precision float */
+ ARM_FEATURE_V8_FCMA, /* has complex number part of v8.3 extensions. */
};
static inline int arm_feature(CPUARMState *env, int feature)
diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
index 9743bdc8c3..4228713b19 100644
--- a/target/arm/cpu64.c
+++ b/target/arm/cpu64.c
@@ -230,7 +230,9 @@ static void aarch64_any_initfn(Object *obj)
set_feature(&cpu->env, ARM_FEATURE_V8_SM4);
set_feature(&cpu->env, ARM_FEATURE_V8_PMULL);
set_feature(&cpu->env, ARM_FEATURE_CRC);
+ set_feature(&cpu->env, ARM_FEATURE_V8_RDM);
set_feature(&cpu->env, ARM_FEATURE_V8_FP16);
+ set_feature(&cpu->env, ARM_FEATURE_V8_FCMA);
cpu->ctr = 0x80038003; /* 32 byte I and D cacheline size, VIPT icache */
cpu->dcz_blocksize = 7; /* 512 bytes */
}
diff --git a/target/arm/helper.c b/target/arm/helper.c
index c82f63d440..09893e3f72 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -1,4 +1,5 @@
#include "qemu/osdep.h"
+#include "target/arm/idau.h"
#include "trace.h"
#include "cpu.h"
#include "internals.h"
@@ -9741,19 +9742,32 @@ static void v8m_security_lookup(CPUARMState *env, uint32_t address,
*/
ARMCPU *cpu = arm_env_get_cpu(env);
int r;
+ bool idau_exempt = false, idau_ns = true, idau_nsc = true;
+ int idau_region = IREGION_NOTVALID;
- /* TODO: implement IDAU */
+ if (cpu->idau) {
+ IDAUInterfaceClass *iic = IDAU_INTERFACE_GET_CLASS(cpu->idau);
+ IDAUInterface *ii = IDAU_INTERFACE(cpu->idau);
+
+ iic->check(ii, address, &idau_region, &idau_exempt, &idau_ns,
+ &idau_nsc);
+ }
if (access_type == MMU_INST_FETCH && extract32(address, 28, 4) == 0xf) {
/* 0xf0000000..0xffffffff is always S for insn fetches */
return;
}
- if (v8m_is_sau_exempt(env, address, access_type)) {
+ if (idau_exempt || v8m_is_sau_exempt(env, address, access_type)) {
sattrs->ns = !regime_is_secure(env, mmu_idx);
return;
}
+ if (idau_region != IREGION_NOTVALID) {
+ sattrs->irvalid = true;
+ sattrs->iregion = idau_region;
+ }
+
switch (env->sau.ctrl & 3) {
case 0: /* SAU.ENABLE == 0, SAU.ALLNS == 0 */
break;
@@ -9790,7 +9804,15 @@ static void v8m_security_lookup(CPUARMState *env, uint32_t address,
}
}
- /* TODO when we support the IDAU then it may override the result here */
+ /* The IDAU will override the SAU lookup results if it specifies
+ * higher security than the SAU does.
+ */
+ if (!idau_ns) {
+ if (sattrs->ns || (!idau_nsc && sattrs->nsc)) {
+ sattrs->ns = false;
+ sattrs->nsc = idau_nsc;
+ }
+ }
break;
}
}
diff --git a/target/arm/helper.h b/target/arm/helper.h
index 6dd8504ec3..0d2094f2be 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -365,8 +365,12 @@ DEF_HELPER_FLAGS_1(neon_rbit_u8, TCG_CALL_NO_RWG_SE, i32, i32)
DEF_HELPER_3(neon_qdmulh_s16, i32, env, i32, i32)
DEF_HELPER_3(neon_qrdmulh_s16, i32, env, i32, i32)
+DEF_HELPER_4(neon_qrdmlah_s16, i32, env, i32, i32, i32)
+DEF_HELPER_4(neon_qrdmlsh_s16, i32, env, i32, i32, i32)
DEF_HELPER_3(neon_qdmulh_s32, i32, env, i32, i32)
DEF_HELPER_3(neon_qrdmulh_s32, i32, env, i32, i32)
+DEF_HELPER_4(neon_qrdmlah_s32, i32, env, s32, s32, s32)
+DEF_HELPER_4(neon_qrdmlsh_s32, i32, env, s32, s32, s32)
DEF_HELPER_1(neon_narrow_u8, i32, i64)
DEF_HELPER_1(neon_narrow_u16, i32, i64)
@@ -565,6 +569,33 @@ DEF_HELPER_2(dc_zva, void, env, i64)
DEF_HELPER_FLAGS_2(neon_pmull_64_lo, TCG_CALL_NO_RWG_SE, i64, i64, i64)
DEF_HELPER_FLAGS_2(neon_pmull_64_hi, TCG_CALL_NO_RWG_SE, i64, i64, i64)
+DEF_HELPER_FLAGS_5(gvec_qrdmlah_s16, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_qrdmlsh_s16, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_qrdmlah_s32, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_qrdmlsh_s32, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_fcaddh, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fcadds, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fcaddd, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_5(gvec_fcmlah, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fcmlah_idx, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fcmlas, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fcmlas_idx, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_5(gvec_fcmlad, TCG_CALL_NO_RWG,
+ void, ptr, ptr, ptr, ptr, i32)
+
#ifdef TARGET_AARCH64
#include "helper-a64.h"
#endif
diff --git a/target/arm/idau.h b/target/arm/idau.h
new file mode 100644
index 0000000000..cac27b95fa
--- /dev/null
+++ b/target/arm/idau.h
@@ -0,0 +1,61 @@
+/*
+ * QEMU ARM CPU -- interface for the Arm v8M IDAU
+ *
+ * Copyright (c) 2018 Linaro Ltd
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see
+ * <http://www.gnu.org/licenses/gpl-2.0.html>
+ *
+ * In the v8M architecture, the IDAU is a small piece of hardware
+ * typically implemented in the SoC which provides board or SoC
+ * specific security attribution information for each address that
+ * the CPU performs MPU/SAU checks on. For QEMU, we model this with a
+ * QOM interface which is implemented by the board or SoC object and
+ * connected to the CPU using a link property.
+ */
+
+#ifndef TARGET_ARM_IDAU_H
+#define TARGET_ARM_IDAU_H
+
+#include "qom/object.h"
+
+#define TYPE_IDAU_INTERFACE "idau-interface"
+#define IDAU_INTERFACE(obj) \
+ INTERFACE_CHECK(IDAUInterface, (obj), TYPE_IDAU_INTERFACE)
+#define IDAU_INTERFACE_CLASS(class) \
+ OBJECT_CLASS_CHECK(IDAUInterfaceClass, (class), TYPE_IDAU_INTERFACE)
+#define IDAU_INTERFACE_GET_CLASS(obj) \
+ OBJECT_GET_CLASS(IDAUInterfaceClass, (obj), TYPE_IDAU_INTERFACE)
+
+typedef struct IDAUInterface {
+ Object parent;
+} IDAUInterface;
+
+#define IREGION_NOTVALID -1
+
+typedef struct IDAUInterfaceClass {
+ InterfaceClass parent;
+
+ /* Check the specified address and return the IDAU security information
+ * for it by filling in iregion, exempt, ns and nsc:
+ * iregion: IDAU region number, or IREGION_NOTVALID if not valid
+ * exempt: true if address is exempt from security attribution
+ * ns: true if the address is NonSecure
+ * nsc: true if the address is NonSecure-callable
+ */
+ void (*check)(IDAUInterface *ii, uint32_t address, int *iregion,
+ bool *exempt, bool *ns, bool *nsc);
+} IDAUInterfaceClass;
+
+#endif
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 32811dc8b0..31ff0479e6 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -701,6 +701,33 @@ static void gen_gvec_op3(DisasContext *s, bool is_q, int rd,
vec_full_reg_size(s), gvec_op);
}
+/* Expand a 3-operand + env pointer operation using
+ * an out-of-line helper.
+ */
+static void gen_gvec_op3_env(DisasContext *s, bool is_q, int rd,
+ int rn, int rm, gen_helper_gvec_3_ptr *fn)
+{
+ tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm), cpu_env,
+ is_q ? 16 : 8, vec_full_reg_size(s), 0, fn);
+}
+
+/* Expand a 3-operand + fpstatus pointer + simd data value operation using
+ * an out-of-line helper.
+ */
+static void gen_gvec_op3_fpst(DisasContext *s, bool is_q, int rd, int rn,
+ int rm, bool is_fp16, int data,
+ gen_helper_gvec_3_ptr *fn)
+{
+ TCGv_ptr fpst = get_fpstatus_ptr(is_fp16);
+ tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm), fpst,
+ is_q ? 16 : 8, vec_full_reg_size(s), data, fn);
+ tcg_temp_free_ptr(fpst);
+}
+
/* Set ZF and NF based on a 64 bit result. This is alas fiddlier
* than the 32 bit equivalent.
*/
@@ -7971,6 +7998,89 @@ static void disas_simd_scalar_three_reg_same_fp16(DisasContext *s,
tcg_temp_free_ptr(fpst);
}
+/* AdvSIMD scalar three same extra
+ * 31 30 29 28 24 23 22 21 20 16 15 14 11 10 9 5 4 0
+ * +-----+---+-----------+------+---+------+---+--------+---+----+----+
+ * | 0 1 | U | 1 1 1 1 0 | size | 0 | Rm | 1 | opcode | 1 | Rn | Rd |
+ * +-----+---+-----------+------+---+------+---+--------+---+----+----+
+ */
+static void disas_simd_scalar_three_reg_same_extra(DisasContext *s,
+ uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int opcode = extract32(insn, 11, 4);
+ int rm = extract32(insn, 16, 5);
+ int size = extract32(insn, 22, 2);
+ bool u = extract32(insn, 29, 1);
+ TCGv_i32 ele1, ele2, ele3;
+ TCGv_i64 res;
+ int feature;
+
+ switch (u * 16 + opcode) {
+ case 0x10: /* SQRDMLAH (vector) */
+ case 0x11: /* SQRDMLSH (vector) */
+ if (size != 1 && size != 2) {
+ unallocated_encoding(s);
+ return;
+ }
+ feature = ARM_FEATURE_V8_RDM;
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+ if (!arm_dc_feature(s, feature)) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ /* Do a single operation on the lowest element in the vector.
+ * We use the standard Neon helpers and rely on 0 OP 0 == 0
+ * with no side effects for all these operations.
+ * OPTME: special-purpose helpers would avoid doing some
+ * unnecessary work in the helper for the 16 bit cases.
+ */
+ ele1 = tcg_temp_new_i32();
+ ele2 = tcg_temp_new_i32();
+ ele3 = tcg_temp_new_i32();
+
+ read_vec_element_i32(s, ele1, rn, 0, size);
+ read_vec_element_i32(s, ele2, rm, 0, size);
+ read_vec_element_i32(s, ele3, rd, 0, size);
+
+ switch (opcode) {
+ case 0x0: /* SQRDMLAH */
+ if (size == 1) {
+ gen_helper_neon_qrdmlah_s16(ele3, cpu_env, ele1, ele2, ele3);
+ } else {
+ gen_helper_neon_qrdmlah_s32(ele3, cpu_env, ele1, ele2, ele3);
+ }
+ break;
+ case 0x1: /* SQRDMLSH */
+ if (size == 1) {
+ gen_helper_neon_qrdmlsh_s16(ele3, cpu_env, ele1, ele2, ele3);
+ } else {
+ gen_helper_neon_qrdmlsh_s32(ele3, cpu_env, ele1, ele2, ele3);
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ tcg_temp_free_i32(ele1);
+ tcg_temp_free_i32(ele2);
+
+ res = tcg_temp_new_i64();
+ tcg_gen_extu_i32_i64(res, ele3);
+ tcg_temp_free_i32(ele3);
+
+ write_fp_dreg(s, rd, res);
+ tcg_temp_free_i64(res);
+}
+
static void handle_2misc_64(DisasContext *s, int opcode, bool u,
TCGv_i64 tcg_rd, TCGv_i64 tcg_rn,
TCGv_i32 tcg_rmode, TCGv_ptr tcg_fpstatus)
@@ -10706,6 +10816,134 @@ static void disas_simd_three_reg_same_fp16(DisasContext *s, uint32_t insn)
clear_vec_high(s, is_q, rd);
}
+/* AdvSIMD three same extra
+ * 31 30 29 28 24 23 22 21 20 16 15 14 11 10 9 5 4 0
+ * +---+---+---+-----------+------+---+------+---+--------+---+----+----+
+ * | 0 | Q | U | 0 1 1 1 0 | size | 0 | Rm | 1 | opcode | 1 | Rn | Rd |
+ * +---+---+---+-----------+------+---+------+---+--------+---+----+----+
+ */
+static void disas_simd_three_reg_same_extra(DisasContext *s, uint32_t insn)
+{
+ int rd = extract32(insn, 0, 5);
+ int rn = extract32(insn, 5, 5);
+ int opcode = extract32(insn, 11, 4);
+ int rm = extract32(insn, 16, 5);
+ int size = extract32(insn, 22, 2);
+ bool u = extract32(insn, 29, 1);
+ bool is_q = extract32(insn, 30, 1);
+ int feature, rot;
+
+ switch (u * 16 + opcode) {
+ case 0x10: /* SQRDMLAH (vector) */
+ case 0x11: /* SQRDMLSH (vector) */
+ if (size != 1 && size != 2) {
+ unallocated_encoding(s);
+ return;
+ }
+ feature = ARM_FEATURE_V8_RDM;
+ break;
+ case 0x8: /* FCMLA, #0 */
+ case 0x9: /* FCMLA, #90 */
+ case 0xa: /* FCMLA, #180 */
+ case 0xb: /* FCMLA, #270 */
+ case 0xc: /* FCADD, #90 */
+ case 0xe: /* FCADD, #270 */
+ if (size == 0
+ || (size == 1 && !arm_dc_feature(s, ARM_FEATURE_V8_FP16))
+ || (size == 3 && !is_q)) {
+ unallocated_encoding(s);
+ return;
+ }
+ feature = ARM_FEATURE_V8_FCMA;
+ break;
+ default:
+ unallocated_encoding(s);
+ return;
+ }
+ if (!arm_dc_feature(s, feature)) {
+ unallocated_encoding(s);
+ return;
+ }
+ if (!fp_access_check(s)) {
+ return;
+ }
+
+ switch (opcode) {
+ case 0x0: /* SQRDMLAH (vector) */
+ switch (size) {
+ case 1:
+ gen_gvec_op3_env(s, is_q, rd, rn, rm, gen_helper_gvec_qrdmlah_s16);
+ break;
+ case 2:
+ gen_gvec_op3_env(s, is_q, rd, rn, rm, gen_helper_gvec_qrdmlah_s32);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ return;
+
+ case 0x1: /* SQRDMLSH (vector) */
+ switch (size) {
+ case 1:
+ gen_gvec_op3_env(s, is_q, rd, rn, rm, gen_helper_gvec_qrdmlsh_s16);
+ break;
+ case 2:
+ gen_gvec_op3_env(s, is_q, rd, rn, rm, gen_helper_gvec_qrdmlsh_s32);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ return;
+
+ case 0x8: /* FCMLA, #0 */
+ case 0x9: /* FCMLA, #90 */
+ case 0xa: /* FCMLA, #180 */
+ case 0xb: /* FCMLA, #270 */
+ rot = extract32(opcode, 0, 2);
+ switch (size) {
+ case 1:
+ gen_gvec_op3_fpst(s, is_q, rd, rn, rm, true, rot,
+ gen_helper_gvec_fcmlah);
+ break;
+ case 2:
+ gen_gvec_op3_fpst(s, is_q, rd, rn, rm, false, rot,
+ gen_helper_gvec_fcmlas);
+ break;
+ case 3:
+ gen_gvec_op3_fpst(s, is_q, rd, rn, rm, false, rot,
+ gen_helper_gvec_fcmlad);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ return;
+
+ case 0xc: /* FCADD, #90 */
+ case 0xe: /* FCADD, #270 */
+ rot = extract32(opcode, 1, 1);
+ switch (size) {
+ case 1:
+ gen_gvec_op3_fpst(s, is_q, rd, rn, rm, size == 1, rot,
+ gen_helper_gvec_fcaddh);
+ break;
+ case 2:
+ gen_gvec_op3_fpst(s, is_q, rd, rn, rm, size == 1, rot,
+ gen_helper_gvec_fcadds);
+ break;
+ case 3:
+ gen_gvec_op3_fpst(s, is_q, rd, rn, rm, size == 1, rot,
+ gen_helper_gvec_fcaddd);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ return;
+
+ default:
+ g_assert_not_reached();
+ }
+}
+
static void handle_2misc_widening(DisasContext *s, int opcode, bool is_q,
int size, int rn, int rd)
{
@@ -11782,107 +12020,137 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
int rn = extract32(insn, 5, 5);
int rd = extract32(insn, 0, 5);
bool is_long = false;
- bool is_fp = false;
+ int is_fp = 0;
bool is_fp16 = false;
int index;
TCGv_ptr fpst;
- switch (opcode) {
- case 0x0: /* MLA */
- case 0x4: /* MLS */
- if (!u || is_scalar) {
+ switch (16 * u + opcode) {
+ case 0x08: /* MUL */
+ case 0x10: /* MLA */
+ case 0x14: /* MLS */
+ if (is_scalar) {
unallocated_encoding(s);
return;
}
break;
- case 0x2: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */
- case 0x6: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */
- case 0xa: /* SMULL, SMULL2, UMULL, UMULL2 */
+ case 0x02: /* SMLAL, SMLAL2 */
+ case 0x12: /* UMLAL, UMLAL2 */
+ case 0x06: /* SMLSL, SMLSL2 */
+ case 0x16: /* UMLSL, UMLSL2 */
+ case 0x0a: /* SMULL, SMULL2 */
+ case 0x1a: /* UMULL, UMULL2 */
if (is_scalar) {
unallocated_encoding(s);
return;
}
is_long = true;
break;
- case 0x3: /* SQDMLAL, SQDMLAL2 */
- case 0x7: /* SQDMLSL, SQDMLSL2 */
- case 0xb: /* SQDMULL, SQDMULL2 */
+ case 0x03: /* SQDMLAL, SQDMLAL2 */
+ case 0x07: /* SQDMLSL, SQDMLSL2 */
+ case 0x0b: /* SQDMULL, SQDMULL2 */
is_long = true;
- /* fall through */
- case 0xc: /* SQDMULH */
- case 0xd: /* SQRDMULH */
- if (u) {
- unallocated_encoding(s);
- return;
- }
break;
- case 0x8: /* MUL */
- if (u || is_scalar) {
- unallocated_encoding(s);
- return;
- }
+ case 0x0c: /* SQDMULH */
+ case 0x0d: /* SQRDMULH */
break;
- case 0x1: /* FMLA */
- case 0x5: /* FMLS */
- if (u) {
+ case 0x01: /* FMLA */
+ case 0x05: /* FMLS */
+ case 0x09: /* FMUL */
+ case 0x19: /* FMULX */
+ is_fp = 1;
+ break;
+ case 0x1d: /* SQRDMLAH */
+ case 0x1f: /* SQRDMLSH */
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_RDM)) {
unallocated_encoding(s);
return;
}
- /* fall through */
- case 0x9: /* FMUL, FMULX */
- if (size == 1) {
+ break;
+ case 0x11: /* FCMLA #0 */
+ case 0x13: /* FCMLA #90 */
+ case 0x15: /* FCMLA #180 */
+ case 0x17: /* FCMLA #270 */
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_FCMA)) {
unallocated_encoding(s);
return;
}
- is_fp = true;
+ is_fp = 2;
break;
default:
unallocated_encoding(s);
return;
}
- if (is_fp) {
+ switch (is_fp) {
+ case 1: /* normal fp */
/* convert insn encoded size to TCGMemOp size */
switch (size) {
- case 2: /* single precision */
- size = MO_32;
- index = h << 1 | l;
- rm |= (m << 4);
- break;
- case 3: /* double precision */
- size = MO_64;
- if (l || !is_q) {
- unallocated_encoding(s);
- return;
- }
- index = h;
- rm |= (m << 4);
- break;
- case 0: /* half precision */
+ case 0: /* half-precision */
size = MO_16;
- index = h << 2 | l << 1 | m;
is_fp16 = true;
- if (arm_dc_feature(s, ARM_FEATURE_V8_FP16)) {
- break;
- }
- /* fallthru */
- default: /* unallocated */
+ break;
+ case MO_32: /* single precision */
+ case MO_64: /* double precision */
+ break;
+ default:
unallocated_encoding(s);
return;
}
- } else {
+ break;
+
+ case 2: /* complex fp */
+ /* Each indexable element is a complex pair. */
+ size <<= 1;
switch (size) {
- case 1:
- index = h << 2 | l << 1 | m;
+ case MO_32:
+ if (h && !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ is_fp16 = true;
break;
- case 2:
- index = h << 1 | l;
- rm |= (m << 4);
+ case MO_64:
break;
default:
unallocated_encoding(s);
return;
}
+ break;
+
+ default: /* integer */
+ switch (size) {
+ case MO_8:
+ case MO_64:
+ unallocated_encoding(s);
+ return;
+ }
+ break;
+ }
+ if (is_fp16 && !arm_dc_feature(s, ARM_FEATURE_V8_FP16)) {
+ unallocated_encoding(s);
+ return;
+ }
+
+ /* Given TCGMemOp size, adjust register and indexing. */
+ switch (size) {
+ case MO_16:
+ index = h << 2 | l << 1 | m;
+ break;
+ case MO_32:
+ index = h << 1 | l;
+ rm |= m << 4;
+ break;
+ case MO_64:
+ if (l || !is_q) {
+ unallocated_encoding(s);
+ return;
+ }
+ index = h;
+ rm |= m << 4;
+ break;
+ default:
+ g_assert_not_reached();
}
if (!fp_access_check(s)) {
@@ -11895,6 +12163,23 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
fpst = NULL;
}
+ switch (16 * u + opcode) {
+ case 0x11: /* FCMLA #0 */
+ case 0x13: /* FCMLA #90 */
+ case 0x15: /* FCMLA #180 */
+ case 0x17: /* FCMLA #270 */
+ tcg_gen_gvec_3_ptr(vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_reg_offset(s, rm, index, size), fpst,
+ is_q ? 16 : 8, vec_full_reg_size(s),
+ extract32(insn, 13, 2), /* rot */
+ size == MO_64
+ ? gen_helper_gvec_fcmlas_idx
+ : gen_helper_gvec_fcmlah_idx);
+ tcg_temp_free_ptr(fpst);
+ return;
+ }
+
if (size == 3) {
TCGv_i64 tcg_idx = tcg_temp_new_i64();
int pass;
@@ -11909,21 +12194,20 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
read_vec_element(s, tcg_op, rn, pass, MO_64);
- switch (opcode) {
- case 0x5: /* FMLS */
+ switch (16 * u + opcode) {
+ case 0x05: /* FMLS */
/* As usual for ARM, separate negation for fused multiply-add */
gen_helper_vfp_negd(tcg_op, tcg_op);
/* fall through */
- case 0x1: /* FMLA */
+ case 0x01: /* FMLA */
read_vec_element(s, tcg_res, rd, pass, MO_64);
gen_helper_vfp_muladdd(tcg_res, tcg_op, tcg_idx, tcg_res, fpst);
break;
- case 0x9: /* FMUL, FMULX */
- if (u) {
- gen_helper_vfp_mulxd(tcg_res, tcg_op, tcg_idx, fpst);
- } else {
- gen_helper_vfp_muld(tcg_res, tcg_op, tcg_idx, fpst);
- }
+ case 0x09: /* FMUL */
+ gen_helper_vfp_muld(tcg_res, tcg_op, tcg_idx, fpst);
+ break;
+ case 0x19: /* FMULX */
+ gen_helper_vfp_mulxd(tcg_res, tcg_op, tcg_idx, fpst);
break;
default:
g_assert_not_reached();
@@ -11966,10 +12250,10 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
read_vec_element_i32(s, tcg_op, rn, pass, is_scalar ? size : MO_32);
- switch (opcode) {
- case 0x0: /* MLA */
- case 0x4: /* MLS */
- case 0x8: /* MUL */
+ switch (16 * u + opcode) {
+ case 0x08: /* MUL */
+ case 0x10: /* MLA */
+ case 0x14: /* MLS */
{
static NeonGenTwoOpFn * const fns[2][2] = {
{ gen_helper_neon_add_u16, gen_helper_neon_sub_u16 },
@@ -11991,8 +12275,8 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
genfn(tcg_res, tcg_op, tcg_res);
break;
}
- case 0x5: /* FMLS */
- case 0x1: /* FMLA */
+ case 0x05: /* FMLS */
+ case 0x01: /* FMLA */
read_vec_element_i32(s, tcg_res, rd, pass,
is_scalar ? size : MO_32);
switch (size) {
@@ -12023,39 +12307,43 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
g_assert_not_reached();
}
break;
- case 0x9: /* FMUL, FMULX */
+ case 0x09: /* FMUL */
switch (size) {
case 1:
- if (u) {
- if (is_scalar) {
- gen_helper_advsimd_mulxh(tcg_res, tcg_op,
- tcg_idx, fpst);
- } else {
- gen_helper_advsimd_mulx2h(tcg_res, tcg_op,
- tcg_idx, fpst);
- }
+ if (is_scalar) {
+ gen_helper_advsimd_mulh(tcg_res, tcg_op,
+ tcg_idx, fpst);
} else {
- if (is_scalar) {
- gen_helper_advsimd_mulh(tcg_res, tcg_op,
- tcg_idx, fpst);
- } else {
- gen_helper_advsimd_mul2h(tcg_res, tcg_op,
- tcg_idx, fpst);
- }
+ gen_helper_advsimd_mul2h(tcg_res, tcg_op,
+ tcg_idx, fpst);
}
break;
case 2:
- if (u) {
- gen_helper_vfp_mulxs(tcg_res, tcg_op, tcg_idx, fpst);
+ gen_helper_vfp_muls(tcg_res, tcg_op, tcg_idx, fpst);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ break;
+ case 0x19: /* FMULX */
+ switch (size) {
+ case 1:
+ if (is_scalar) {
+ gen_helper_advsimd_mulxh(tcg_res, tcg_op,
+ tcg_idx, fpst);
} else {
- gen_helper_vfp_muls(tcg_res, tcg_op, tcg_idx, fpst);
+ gen_helper_advsimd_mulx2h(tcg_res, tcg_op,
+ tcg_idx, fpst);
}
break;
+ case 2:
+ gen_helper_vfp_mulxs(tcg_res, tcg_op, tcg_idx, fpst);
+ break;
default:
g_assert_not_reached();
}
break;
- case 0xc: /* SQDMULH */
+ case 0x0c: /* SQDMULH */
if (size == 1) {
gen_helper_neon_qdmulh_s16(tcg_res, cpu_env,
tcg_op, tcg_idx);
@@ -12064,7 +12352,7 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
tcg_op, tcg_idx);
}
break;
- case 0xd: /* SQRDMULH */
+ case 0x0d: /* SQRDMULH */
if (size == 1) {
gen_helper_neon_qrdmulh_s16(tcg_res, cpu_env,
tcg_op, tcg_idx);
@@ -12073,6 +12361,28 @@ static void disas_simd_indexed(DisasContext *s, uint32_t insn)
tcg_op, tcg_idx);
}
break;
+ case 0x1d: /* SQRDMLAH */
+ read_vec_element_i32(s, tcg_res, rd, pass,
+ is_scalar ? size : MO_32);
+ if (size == 1) {
+ gen_helper_neon_qrdmlah_s16(tcg_res, cpu_env,
+ tcg_op, tcg_idx, tcg_res);
+ } else {
+ gen_helper_neon_qrdmlah_s32(tcg_res, cpu_env,
+ tcg_op, tcg_idx, tcg_res);
+ }
+ break;
+ case 0x1f: /* SQRDMLSH */
+ read_vec_element_i32(s, tcg_res, rd, pass,
+ is_scalar ? size : MO_32);
+ if (size == 1) {
+ gen_helper_neon_qrdmlsh_s16(tcg_res, cpu_env,
+ tcg_op, tcg_idx, tcg_res);
+ } else {
+ gen_helper_neon_qrdmlsh_s32(tcg_res, cpu_env,
+ tcg_op, tcg_idx, tcg_res);
+ }
+ break;
default:
g_assert_not_reached();
}
@@ -12794,6 +13104,7 @@ static void disas_crypto_three_reg_imm2(DisasContext *s, uint32_t insn)
static const AArch64DecodeTable data_proc_simd[] = {
/* pattern , mask , fn */
{ 0x0e200400, 0x9f200400, disas_simd_three_reg_same },
+ { 0x0e008400, 0x9f208400, disas_simd_three_reg_same_extra },
{ 0x0e200000, 0x9f200c00, disas_simd_three_reg_diff },
{ 0x0e200800, 0x9f3e0c00, disas_simd_two_reg_misc },
{ 0x0e300800, 0x9f3e0c00, disas_simd_across_lanes },
@@ -12806,6 +13117,7 @@ static const AArch64DecodeTable data_proc_simd[] = {
{ 0x0e000800, 0xbf208c00, disas_simd_zip_trn },
{ 0x2e000000, 0xbf208400, disas_simd_ext },
{ 0x5e200400, 0xdf200400, disas_simd_scalar_three_reg_same },
+ { 0x5e008400, 0xdf208400, disas_simd_scalar_three_reg_same_extra },
{ 0x5e200000, 0xdf200c00, disas_simd_scalar_three_reg_diff },
{ 0x5e200800, 0xdf3e0c00, disas_simd_scalar_two_reg_misc },
{ 0x5e300800, 0xdf3e0c00, disas_simd_scalar_pairwise },
diff --git a/target/arm/translate.c b/target/arm/translate.c
index aa6dcaa577..ba6ab7d287 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -25,6 +25,7 @@
#include "disas/disas.h"
#include "exec/exec-all.h"
#include "tcg-op.h"
+#include "tcg-op-gvec.h"
#include "qemu/log.h"
#include "qemu/bitops.h"
#include "arm_ldst.h"
@@ -75,6 +76,10 @@ static const char *regnames[] =
{ "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "pc" };
+/* Function prototypes for gen_ functions calling Neon helpers. */
+typedef void NeonGenThreeOpEnvFn(TCGv_i32, TCGv_env, TCGv_i32,
+ TCGv_i32, TCGv_i32);
+
/* initialize TCG globals. */
void arm_translate_init(void)
{
@@ -5374,9 +5379,9 @@ static void gen_neon_narrow_op(int op, int u, int size,
#define NEON_3R_VPMAX 20
#define NEON_3R_VPMIN 21
#define NEON_3R_VQDMULH_VQRDMULH 22
-#define NEON_3R_VPADD 23
+#define NEON_3R_VPADD_VQRDMLAH 23
#define NEON_3R_SHA 24 /* SHA1C,SHA1P,SHA1M,SHA1SU0,SHA256H{2},SHA256SU1 */
-#define NEON_3R_VFM 25 /* VFMA, VFMS : float fused multiply-add */
+#define NEON_3R_VFM_VQRDMLSH 25 /* VFMA, VFMS, VQRDMLSH */
#define NEON_3R_FLOAT_ARITH 26 /* float VADD, VSUB, VPADD, VABD */
#define NEON_3R_FLOAT_MULTIPLY 27 /* float VMLA, VMLS, VMUL */
#define NEON_3R_FLOAT_CMP 28 /* float VCEQ, VCGE, VCGT */
@@ -5408,9 +5413,9 @@ static const uint8_t neon_3r_sizes[] = {
[NEON_3R_VPMAX] = 0x7,
[NEON_3R_VPMIN] = 0x7,
[NEON_3R_VQDMULH_VQRDMULH] = 0x6,
- [NEON_3R_VPADD] = 0x7,
+ [NEON_3R_VPADD_VQRDMLAH] = 0x7,
[NEON_3R_SHA] = 0xf, /* size field encodes op type */
- [NEON_3R_VFM] = 0x5, /* size bit 1 encodes op */
+ [NEON_3R_VFM_VQRDMLSH] = 0x7, /* For VFM, size bit 1 encodes op */
[NEON_3R_FLOAT_ARITH] = 0x5, /* size bit 1 encodes op */
[NEON_3R_FLOAT_MULTIPLY] = 0x5, /* size bit 1 encodes op */
[NEON_3R_FLOAT_CMP] = 0x5, /* size bit 1 encodes op */
@@ -5589,6 +5594,22 @@ static const uint8_t neon_2rm_sizes[] = {
[NEON_2RM_VCVT_UF] = 0x4,
};
+
+/* Expand v8.1 simd helper. */
+static int do_v81_helper(DisasContext *s, gen_helper_gvec_3_ptr *fn,
+ int q, int rd, int rn, int rm)
+{
+ if (arm_dc_feature(s, ARM_FEATURE_V8_RDM)) {
+ int opr_sz = (1 + q) * 8;
+ tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd),
+ vfp_reg_offset(1, rn),
+ vfp_reg_offset(1, rm), cpu_env,
+ opr_sz, opr_sz, 0, fn);
+ return 0;
+ }
+ return 1;
+}
+
/* Translate a NEON data processing instruction. Return nonzero if the
instruction is invalid.
We process data in a mixture of 32-bit and 64-bit chunks.
@@ -5641,12 +5662,13 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
if (q && ((rd | rn | rm) & 1)) {
return 1;
}
- /*
- * The SHA-1/SHA-256 3-register instructions require special treatment
- * here, as their size field is overloaded as an op type selector, and
- * they all consume their input in a single pass.
- */
- if (op == NEON_3R_SHA) {
+ switch (op) {
+ case NEON_3R_SHA:
+ /* The SHA-1/SHA-256 3-register instructions require special
+ * treatment here, as their size field is overloaded as an
+ * op type selector, and they all consume their input in a
+ * single pass.
+ */
if (!q) {
return 1;
}
@@ -5683,6 +5705,40 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
tcg_temp_free_ptr(ptr2);
tcg_temp_free_ptr(ptr3);
return 0;
+
+ case NEON_3R_VPADD_VQRDMLAH:
+ if (!u) {
+ break; /* VPADD */
+ }
+ /* VQRDMLAH */
+ switch (size) {
+ case 1:
+ return do_v81_helper(s, gen_helper_gvec_qrdmlah_s16,
+ q, rd, rn, rm);
+ case 2:
+ return do_v81_helper(s, gen_helper_gvec_qrdmlah_s32,
+ q, rd, rn, rm);
+ }
+ return 1;
+
+ case NEON_3R_VFM_VQRDMLSH:
+ if (!u) {
+ /* VFM, VFMS */
+ if (size == 1) {
+ return 1;
+ }
+ break;
+ }
+ /* VQRDMLSH */
+ switch (size) {
+ case 1:
+ return do_v81_helper(s, gen_helper_gvec_qrdmlsh_s16,
+ q, rd, rn, rm);
+ case 2:
+ return do_v81_helper(s, gen_helper_gvec_qrdmlsh_s32,
+ q, rd, rn, rm);
+ }
+ return 1;
}
if (size == 3 && op != NEON_3R_LOGIC) {
/* 64-bit element instructions. */
@@ -5768,11 +5824,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
rm = rtmp;
}
break;
- case NEON_3R_VPADD:
- if (u) {
- return 1;
- }
- /* Fall through */
+ case NEON_3R_VPADD_VQRDMLAH:
case NEON_3R_VPMAX:
case NEON_3R_VPMIN:
pairwise = 1;
@@ -5806,8 +5858,8 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
return 1;
}
break;
- case NEON_3R_VFM:
- if (!arm_dc_feature(s, ARM_FEATURE_VFP4) || u) {
+ case NEON_3R_VFM_VQRDMLSH:
+ if (!arm_dc_feature(s, ARM_FEATURE_VFP4)) {
return 1;
}
break;
@@ -6004,7 +6056,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
}
}
break;
- case NEON_3R_VPADD:
+ case NEON_3R_VPADD_VQRDMLAH:
switch (size) {
case 0: gen_helper_neon_padd_u8(tmp, tmp, tmp2); break;
case 1: gen_helper_neon_padd_u16(tmp, tmp, tmp2); break;
@@ -6103,7 +6155,7 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
}
}
break;
- case NEON_3R_VFM:
+ case NEON_3R_VFM_VQRDMLSH:
{
/* VFMA, VFMS: fused multiply-add */
TCGv_ptr fpstatus = get_fpstatus_ptr(1);
@@ -6937,11 +6989,45 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
}
neon_store_reg64(cpu_V0, rd + pass);
}
+ break;
+ case 14: /* VQRDMLAH scalar */
+ case 15: /* VQRDMLSH scalar */
+ {
+ NeonGenThreeOpEnvFn *fn;
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_RDM)) {
+ return 1;
+ }
+ if (u && ((rd | rn) & 1)) {
+ return 1;
+ }
+ if (op == 14) {
+ if (size == 1) {
+ fn = gen_helper_neon_qrdmlah_s16;
+ } else {
+ fn = gen_helper_neon_qrdmlah_s32;
+ }
+ } else {
+ if (size == 1) {
+ fn = gen_helper_neon_qrdmlsh_s16;
+ } else {
+ fn = gen_helper_neon_qrdmlsh_s32;
+ }
+ }
+ tmp2 = neon_get_scalar(size, rm);
+ for (pass = 0; pass < (u ? 4 : 2); pass++) {
+ tmp = neon_load_reg(rn, pass);
+ tmp3 = neon_load_reg(rd, pass);
+ fn(tmp, cpu_env, tmp, tmp2, tmp3);
+ tcg_temp_free_i32(tmp3);
+ neon_store_reg(rd, pass, tmp);
+ }
+ tcg_temp_free_i32(tmp2);
+ }
break;
- default: /* 14 and 15 are RESERVED */
- return 1;
+ default:
+ g_assert_not_reached();
}
}
} else { /* size == 3 */
@@ -7594,6 +7680,123 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
return 0;
}
+/* Advanced SIMD three registers of the same length extension.
+ * 31 25 23 22 20 16 12 11 10 9 8 3 0
+ * +---------------+-----+---+-----+----+----+---+----+---+----+---------+----+
+ * | 1 1 1 1 1 1 0 | op1 | D | op2 | Vn | Vd | 1 | o3 | 0 | o4 | N Q M U | Vm |
+ * +---------------+-----+---+-----+----+----+---+----+---+----+---------+----+
+ */
+static int disas_neon_insn_3same_ext(DisasContext *s, uint32_t insn)
+{
+ gen_helper_gvec_3_ptr *fn_gvec_ptr;
+ int rd, rn, rm, rot, size, opr_sz;
+ TCGv_ptr fpst;
+ bool q;
+
+ q = extract32(insn, 6, 1);
+ VFP_DREG_D(rd, insn);
+ VFP_DREG_N(rn, insn);
+ VFP_DREG_M(rm, insn);
+ if ((rd | rn | rm) & q) {
+ return 1;
+ }
+
+ if ((insn & 0xfe200f10) == 0xfc200800) {
+ /* VCMLA -- 1111 110R R.1S .... .... 1000 ...0 .... */
+ size = extract32(insn, 20, 1);
+ rot = extract32(insn, 23, 2);
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_FCMA)
+ || (!size && !arm_dc_feature(s, ARM_FEATURE_V8_FP16))) {
+ return 1;
+ }
+ fn_gvec_ptr = size ? gen_helper_gvec_fcmlas : gen_helper_gvec_fcmlah;
+ } else if ((insn & 0xfea00f10) == 0xfc800800) {
+ /* VCADD -- 1111 110R 1.0S .... .... 1000 ...0 .... */
+ size = extract32(insn, 20, 1);
+ rot = extract32(insn, 24, 1);
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_FCMA)
+ || (!size && !arm_dc_feature(s, ARM_FEATURE_V8_FP16))) {
+ return 1;
+ }
+ fn_gvec_ptr = size ? gen_helper_gvec_fcadds : gen_helper_gvec_fcaddh;
+ } else {
+ return 1;
+ }
+
+ if (s->fp_excp_el) {
+ gen_exception_insn(s, 4, EXCP_UDEF,
+ syn_fp_access_trap(1, 0xe, false), s->fp_excp_el);
+ return 0;
+ }
+ if (!s->vfp_enabled) {
+ return 1;
+ }
+
+ opr_sz = (1 + q) * 8;
+ fpst = get_fpstatus_ptr(1);
+ tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd),
+ vfp_reg_offset(1, rn),
+ vfp_reg_offset(1, rm), fpst,
+ opr_sz, opr_sz, rot, fn_gvec_ptr);
+ tcg_temp_free_ptr(fpst);
+ return 0;
+}
+
+/* Advanced SIMD two registers and a scalar extension.
+ * 31 24 23 22 20 16 12 11 10 9 8 3 0
+ * +-----------------+----+---+----+----+----+---+----+---+----+---------+----+
+ * | 1 1 1 1 1 1 1 0 | o1 | D | o2 | Vn | Vd | 1 | o3 | 0 | o4 | N Q M U | Vm |
+ * +-----------------+----+---+----+----+----+---+----+---+----+---------+----+
+ *
+ */
+
+static int disas_neon_insn_2reg_scalar_ext(DisasContext *s, uint32_t insn)
+{
+ int rd, rn, rm, rot, size, opr_sz;
+ TCGv_ptr fpst;
+ bool q;
+
+ q = extract32(insn, 6, 1);
+ VFP_DREG_D(rd, insn);
+ VFP_DREG_N(rn, insn);
+ VFP_DREG_M(rm, insn);
+ if ((rd | rn) & q) {
+ return 1;
+ }
+
+ if ((insn & 0xff000f10) == 0xfe000800) {
+ /* VCMLA (indexed) -- 1111 1110 S.RR .... .... 1000 ...0 .... */
+ rot = extract32(insn, 20, 2);
+ size = extract32(insn, 23, 1);
+ if (!arm_dc_feature(s, ARM_FEATURE_V8_FCMA)
+ || (!size && !arm_dc_feature(s, ARM_FEATURE_V8_FP16))) {
+ return 1;
+ }
+ } else {
+ return 1;
+ }
+
+ if (s->fp_excp_el) {
+ gen_exception_insn(s, 4, EXCP_UDEF,
+ syn_fp_access_trap(1, 0xe, false), s->fp_excp_el);
+ return 0;
+ }
+ if (!s->vfp_enabled) {
+ return 1;
+ }
+
+ opr_sz = (1 + q) * 8;
+ fpst = get_fpstatus_ptr(1);
+ tcg_gen_gvec_3_ptr(vfp_reg_offset(1, rd),
+ vfp_reg_offset(1, rn),
+ vfp_reg_offset(1, rm), fpst,
+ opr_sz, opr_sz, rot,
+ size ? gen_helper_gvec_fcmlas_idx
+ : gen_helper_gvec_fcmlah_idx);
+ tcg_temp_free_ptr(fpst);
+ return 0;
+}
+
static int disas_coproc_insn(DisasContext *s, uint32_t insn)
{
int cpnum, is64, crn, crm, opc1, opc2, isread, rt, rt2;
@@ -8338,6 +8541,18 @@ static void disas_arm_insn(DisasContext *s, unsigned int insn)
}
}
}
+ } else if ((insn & 0x0e000a00) == 0x0c000800
+ && arm_dc_feature(s, ARM_FEATURE_V8)) {
+ if (disas_neon_insn_3same_ext(s, insn)) {
+ goto illegal_op;
+ }
+ return;
+ } else if ((insn & 0x0f000a00) == 0x0e000800
+ && arm_dc_feature(s, ARM_FEATURE_V8)) {
+ if (disas_neon_insn_2reg_scalar_ext(s, insn)) {
+ goto illegal_op;
+ }
+ return;
} else if ((insn & 0x0fe00000) == 0x0c400000) {
/* Coprocessor double register transfer. */
ARCH(5TE);
@@ -10559,7 +10774,19 @@ static void disas_thumb2_insn(DisasContext *s, uint32_t insn)
default_exception_el(s));
break;
}
- if (((insn >> 24) & 3) == 3) {
+ if ((insn & 0xfe000a00) == 0xfc000800
+ && arm_dc_feature(s, ARM_FEATURE_V8)) {
+ /* The Thumb2 and ARM encodings are identical. */
+ if (disas_neon_insn_3same_ext(s, insn)) {
+ goto illegal_op;
+ }
+ } else if ((insn & 0xff000a00) == 0xfe000800
+ && arm_dc_feature(s, ARM_FEATURE_V8)) {
+ /* The Thumb2 and ARM encodings are identical. */
+ if (disas_neon_insn_2reg_scalar_ext(s, insn)) {
+ goto illegal_op;
+ }
+ } else if (((insn >> 24) & 3) == 3) {
/* Translate into the equivalent ARM encoding. */
insn = (insn & 0xe2ffffff) | ((insn & (1 << 28)) >> 4) | (1 << 28);
if (disas_neon_data_insn(s, insn)) {
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
new file mode 100644
index 0000000000..ec705cfca5
--- /dev/null
+++ b/target/arm/vec_helper.c
@@ -0,0 +1,429 @@
+/*
+ * ARM AdvSIMD / SVE Vector Operations
+ *
+ * Copyright (c) 2018 Linaro
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "cpu.h"
+#include "exec/exec-all.h"
+#include "exec/helper-proto.h"
+#include "tcg/tcg-gvec-desc.h"
+#include "fpu/softfloat.h"
+
+
+/* Note that vector data is stored in host-endian 64-bit chunks,
+ so addressing units smaller than that needs a host-endian fixup. */
+#ifdef HOST_WORDS_BIGENDIAN
+#define H1(x) ((x) ^ 7)
+#define H2(x) ((x) ^ 3)
+#define H4(x) ((x) ^ 1)
+#else
+#define H1(x) (x)
+#define H2(x) (x)
+#define H4(x) (x)
+#endif
+
+#define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q
+
+static void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz)
+{
+ uint64_t *d = vd + opr_sz;
+ uintptr_t i;
+
+ for (i = opr_sz; i < max_sz; i += 8) {
+ *d++ = 0;
+ }
+}
+
+/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
+static uint16_t inl_qrdmlah_s16(CPUARMState *env, int16_t src1,
+ int16_t src2, int16_t src3)
+{
+ /* Simplify:
+ * = ((a3 << 16) + ((e1 * e2) << 1) + (1 << 15)) >> 16
+ * = ((a3 << 15) + (e1 * e2) + (1 << 14)) >> 15
+ */
+ int32_t ret = (int32_t)src1 * src2;
+ ret = ((int32_t)src3 << 15) + ret + (1 << 14);
+ ret >>= 15;
+ if (ret != (int16_t)ret) {
+ SET_QC();
+ ret = (ret < 0 ? -0x8000 : 0x7fff);
+ }
+ return ret;
+}
+
+uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
+ uint32_t src2, uint32_t src3)
+{
+ uint16_t e1 = inl_qrdmlah_s16(env, src1, src2, src3);
+ uint16_t e2 = inl_qrdmlah_s16(env, src1 >> 16, src2 >> 16, src3 >> 16);
+ return deposit32(e1, 16, 16, e2);
+}
+
+void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
+ void *ve, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ int16_t *d = vd;
+ int16_t *n = vn;
+ int16_t *m = vm;
+ CPUARMState *env = ve;
+ uintptr_t i;
+
+ for (i = 0; i < opr_sz / 2; ++i) {
+ d[i] = inl_qrdmlah_s16(env, n[i], m[i], d[i]);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+/* Signed saturating rounding doubling multiply-subtract high half, 16-bit */
+static uint16_t inl_qrdmlsh_s16(CPUARMState *env, int16_t src1,
+ int16_t src2, int16_t src3)
+{
+ /* Similarly, using subtraction:
+ * = ((a3 << 16) - ((e1 * e2) << 1) + (1 << 15)) >> 16
+ * = ((a3 << 15) - (e1 * e2) + (1 << 14)) >> 15
+ */
+ int32_t ret = (int32_t)src1 * src2;
+ ret = ((int32_t)src3 << 15) - ret + (1 << 14);
+ ret >>= 15;
+ if (ret != (int16_t)ret) {
+ SET_QC();
+ ret = (ret < 0 ? -0x8000 : 0x7fff);
+ }
+ return ret;
+}
+
+uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
+ uint32_t src2, uint32_t src3)
+{
+ uint16_t e1 = inl_qrdmlsh_s16(env, src1, src2, src3);
+ uint16_t e2 = inl_qrdmlsh_s16(env, src1 >> 16, src2 >> 16, src3 >> 16);
+ return deposit32(e1, 16, 16, e2);
+}
+
+void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
+ void *ve, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ int16_t *d = vd;
+ int16_t *n = vn;
+ int16_t *m = vm;
+ CPUARMState *env = ve;
+ uintptr_t i;
+
+ for (i = 0; i < opr_sz / 2; ++i) {
+ d[i] = inl_qrdmlsh_s16(env, n[i], m[i], d[i]);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
+uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
+ int32_t src2, int32_t src3)
+{
+ /* Simplify similarly to int_qrdmlah_s16 above. */
+ int64_t ret = (int64_t)src1 * src2;
+ ret = ((int64_t)src3 << 31) + ret + (1 << 30);
+ ret >>= 31;
+ if (ret != (int32_t)ret) {
+ SET_QC();
+ ret = (ret < 0 ? INT32_MIN : INT32_MAX);
+ }
+ return ret;
+}
+
+void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
+ void *ve, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ int32_t *d = vd;
+ int32_t *n = vn;
+ int32_t *m = vm;
+ CPUARMState *env = ve;
+ uintptr_t i;
+
+ for (i = 0; i < opr_sz / 4; ++i) {
+ d[i] = helper_neon_qrdmlah_s32(env, n[i], m[i], d[i]);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+/* Signed saturating rounding doubling multiply-subtract high half, 32-bit */
+uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
+ int32_t src2, int32_t src3)
+{
+ /* Simplify similarly to int_qrdmlsh_s16 above. */
+ int64_t ret = (int64_t)src1 * src2;
+ ret = ((int64_t)src3 << 31) - ret + (1 << 30);
+ ret >>= 31;
+ if (ret != (int32_t)ret) {
+ SET_QC();
+ ret = (ret < 0 ? INT32_MIN : INT32_MAX);
+ }
+ return ret;
+}
+
+void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
+ void *ve, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ int32_t *d = vd;
+ int32_t *n = vn;
+ int32_t *m = vm;
+ CPUARMState *env = ve;
+ uintptr_t i;
+
+ for (i = 0; i < opr_sz / 4; ++i) {
+ d[i] = helper_neon_qrdmlsh_s32(env, n[i], m[i], d[i]);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
+ void *vfpst, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ float16 *d = vd;
+ float16 *n = vn;
+ float16 *m = vm;
+ float_status *fpst = vfpst;
+ uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
+ uint32_t neg_imag = neg_real ^ 1;
+ uintptr_t i;
+
+ /* Shift boolean to the sign bit so we can xor to negate. */
+ neg_real <<= 15;
+ neg_imag <<= 15;
+
+ for (i = 0; i < opr_sz / 2; i += 2) {
+ float16 e0 = n[H2(i)];
+ float16 e1 = m[H2(i + 1)] ^ neg_imag;
+ float16 e2 = n[H2(i + 1)];
+ float16 e3 = m[H2(i)] ^ neg_real;
+
+ d[H2(i)] = float16_add(e0, e1, fpst);
+ d[H2(i + 1)] = float16_add(e2, e3, fpst);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
+ void *vfpst, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ float32 *d = vd;
+ float32 *n = vn;
+ float32 *m = vm;
+ float_status *fpst = vfpst;
+ uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
+ uint32_t neg_imag = neg_real ^ 1;
+ uintptr_t i;
+
+ /* Shift boolean to the sign bit so we can xor to negate. */
+ neg_real <<= 31;
+ neg_imag <<= 31;
+
+ for (i = 0; i < opr_sz / 4; i += 2) {
+ float32 e0 = n[H4(i)];
+ float32 e1 = m[H4(i + 1)] ^ neg_imag;
+ float32 e2 = n[H4(i + 1)];
+ float32 e3 = m[H4(i)] ^ neg_real;
+
+ d[H4(i)] = float32_add(e0, e1, fpst);
+ d[H4(i + 1)] = float32_add(e2, e3, fpst);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
+ void *vfpst, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ float64 *d = vd;
+ float64 *n = vn;
+ float64 *m = vm;
+ float_status *fpst = vfpst;
+ uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
+ uint64_t neg_imag = neg_real ^ 1;
+ uintptr_t i;
+
+ /* Shift boolean to the sign bit so we can xor to negate. */
+ neg_real <<= 63;
+ neg_imag <<= 63;
+
+ for (i = 0; i < opr_sz / 8; i += 2) {
+ float64 e0 = n[i];
+ float64 e1 = m[i + 1] ^ neg_imag;
+ float64 e2 = n[i + 1];
+ float64 e3 = m[i] ^ neg_real;
+
+ d[i] = float64_add(e0, e1, fpst);
+ d[i + 1] = float64_add(e2, e3, fpst);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm,
+ void *vfpst, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ float16 *d = vd;
+ float16 *n = vn;
+ float16 *m = vm;
+ float_status *fpst = vfpst;
+ intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
+ uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+ uint32_t neg_real = flip ^ neg_imag;
+ uintptr_t i;
+
+ /* Shift boolean to the sign bit so we can xor to negate. */
+ neg_real <<= 15;
+ neg_imag <<= 15;
+
+ for (i = 0; i < opr_sz / 2; i += 2) {
+ float16 e2 = n[H2(i + flip)];
+ float16 e1 = m[H2(i + flip)] ^ neg_real;
+ float16 e4 = e2;
+ float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
+
+ d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst);
+ d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm,
+ void *vfpst, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ float16 *d = vd;
+ float16 *n = vn;
+ float16 *m = vm;
+ float_status *fpst = vfpst;
+ intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
+ uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+ uint32_t neg_real = flip ^ neg_imag;
+ uintptr_t i;
+ float16 e1 = m[H2(flip)];
+ float16 e3 = m[H2(1 - flip)];
+
+ /* Shift boolean to the sign bit so we can xor to negate. */
+ neg_real <<= 15;
+ neg_imag <<= 15;
+ e1 ^= neg_real;
+ e3 ^= neg_imag;
+
+ for (i = 0; i < opr_sz / 2; i += 2) {
+ float16 e2 = n[H2(i + flip)];
+ float16 e4 = e2;
+
+ d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst);
+ d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm,
+ void *vfpst, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ float32 *d = vd;
+ float32 *n = vn;
+ float32 *m = vm;
+ float_status *fpst = vfpst;
+ intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
+ uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+ uint32_t neg_real = flip ^ neg_imag;
+ uintptr_t i;
+
+ /* Shift boolean to the sign bit so we can xor to negate. */
+ neg_real <<= 31;
+ neg_imag <<= 31;
+
+ for (i = 0; i < opr_sz / 4; i += 2) {
+ float32 e2 = n[H4(i + flip)];
+ float32 e1 = m[H4(i + flip)] ^ neg_real;
+ float32 e4 = e2;
+ float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
+
+ d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst);
+ d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm,
+ void *vfpst, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ float32 *d = vd;
+ float32 *n = vn;
+ float32 *m = vm;
+ float_status *fpst = vfpst;
+ intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
+ uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+ uint32_t neg_real = flip ^ neg_imag;
+ uintptr_t i;
+ float32 e1 = m[H4(flip)];
+ float32 e3 = m[H4(1 - flip)];
+
+ /* Shift boolean to the sign bit so we can xor to negate. */
+ neg_real <<= 31;
+ neg_imag <<= 31;
+ e1 ^= neg_real;
+ e3 ^= neg_imag;
+
+ for (i = 0; i < opr_sz / 4; i += 2) {
+ float32 e2 = n[H4(i + flip)];
+ float32 e4 = e2;
+
+ d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst);
+ d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm,
+ void *vfpst, uint32_t desc)
+{
+ uintptr_t opr_sz = simd_oprsz(desc);
+ float64 *d = vd;
+ float64 *n = vn;
+ float64 *m = vm;
+ float_status *fpst = vfpst;
+ intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
+ uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+ uint64_t neg_real = flip ^ neg_imag;
+ uintptr_t i;
+
+ /* Shift boolean to the sign bit so we can xor to negate. */
+ neg_real <<= 63;
+ neg_imag <<= 63;
+
+ for (i = 0; i < opr_sz / 8; i += 2) {
+ float64 e2 = n[i + flip];
+ float64 e1 = m[i + flip] ^ neg_real;
+ float64 e4 = e2;
+ float64 e3 = m[i + 1 - flip] ^ neg_imag;
+
+ d[i] = float64_muladd(e2, e1, d[i], 0, fpst);
+ d[i + 1] = float64_muladd(e4, e3, d[i + 1], 0, fpst);
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}