aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Maydell <peter.maydell@linaro.org>2019-05-14 10:08:47 +0100
committerPeter Maydell <peter.maydell@linaro.org>2019-05-14 10:08:47 +0100
commite329ad2ab72c43b56df88b34954c2c7d839bb373 (patch)
tree69180a9f32bc7cfa7b9645e7d18ba3ff32a37975
parente24f44dbeab8e54c72bdaedbd35453fb2a6c38da (diff)
parenta7b6d286cfb5205b9f5330aefc5727269b3d810f (diff)
Merge remote-tracking branch 'remotes/rth/tags/pull-tcg-20190513' into staging
Improve code generation for vector duplication. Add vector expansions for shifts by non-constant scalar. Add vector expansions for shifts by vector. Add integer and vector expansions for absolute value. Several patches in preparation for Altivec. Bug fix for tcg/aarch64 vs min/max. # gpg: Signature made Tue 14 May 2019 00:58:02 BST # gpg: using RSA key 7A481E78868B4DB6A85A05C064DF38E8AF7E215F # gpg: issuer "richard.henderson@linaro.org" # gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [full] # Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A 05C0 64DF 38E8 AF7E 215F * remotes/rth/tags/pull-tcg-20190513: (31 commits) tcg/aarch64: Do not advertise minmax for MO_64 target/xtensa: Use tcg_gen_abs_i32 target/tricore: Use tcg_gen_abs_tl target/s390x: Use tcg_gen_abs_i64 target/ppc: Use tcg_gen_abs_tl target/ppc: Use tcg_gen_abs_i32 target/cris: Use tcg_gen_abs_tl target/arm: Use tcg_gen_abs_i64 and tcg_gen_gvec_abs tcg/aarch64: Support vector absolute value tcg/i386: Support vector absolute value tcg: Add support for vector absolute value tcg: Add support for integer absolute value tcg/i386: Support vector scalar shift opcodes tcg: Add gvec expanders for vector shift by scalar tcg/aarch64: Support vector variable shift opcodes tcg/i386: Support vector variable shift opcodes tcg: Add gvec expanders for variable shift tcg: Add INDEX_op_dupm_vec tcg/aarch64: Implement tcg_out_dupm_vec tcg/i386: Implement tcg_out_dupm_vec ... Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r--accel/tcg/tcg-runtime-gvec.c192
-rw-r--r--accel/tcg/tcg-runtime.h20
-rw-r--r--target/arm/helper.h2
-rw-r--r--target/arm/neon_helper.c5
-rw-r--r--target/arm/translate-a64.c41
-rw-r--r--target/arm/translate-sve.c9
-rw-r--r--target/arm/translate.c144
-rw-r--r--target/cris/translate.c9
-rw-r--r--target/ppc/translate.c68
-rw-r--r--target/ppc/translate/spe-impl.inc.c14
-rw-r--r--target/ppc/translate/vmx-impl.inc.c7
-rw-r--r--target/s390x/translate.c8
-rw-r--r--target/tricore/translate.c27
-rw-r--r--target/xtensa/translate.c9
-rw-r--r--tcg/README4
-rw-r--r--tcg/aarch64/tcg-target.h3
-rw-r--r--tcg/aarch64/tcg-target.inc.c121
-rw-r--r--tcg/aarch64/tcg-target.opc.h2
-rw-r--r--tcg/arm/tcg-target.inc.c5
-rw-r--r--tcg/i386/tcg-target.h5
-rw-r--r--tcg/i386/tcg-target.inc.c163
-rw-r--r--tcg/mips/tcg-target.inc.c3
-rw-r--r--tcg/optimize.c8
-rw-r--r--tcg/ppc/tcg-target.inc.c3
-rw-r--r--tcg/riscv/tcg-target.inc.c5
-rw-r--r--tcg/s390/tcg-target.inc.c3
-rw-r--r--tcg/sparc/tcg-target.inc.c3
-rw-r--r--tcg/tcg-op-gvec.c945
-rw-r--r--tcg/tcg-op-gvec.h64
-rw-r--r--tcg/tcg-op-vec.c270
-rw-r--r--tcg/tcg-op.c20
-rw-r--r--tcg/tcg-op.h14
-rw-r--r--tcg/tcg-opc.h2
-rw-r--r--tcg/tcg.c271
-rw-r--r--tcg/tcg.h21
-rw-r--r--tcg/tci/tcg-target.inc.c3
36 files changed, 2020 insertions, 473 deletions
diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
index e2c6f24262..0f09e0ef38 100644
--- a/accel/tcg/tcg-runtime-gvec.c
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -398,6 +398,54 @@ void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc)
clear_high(d, oprsz, desc);
}
+void HELPER(gvec_abs8)(void *d, void *a, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(int8_t)) {
+ int8_t aa = *(int8_t *)(a + i);
+ *(int8_t *)(d + i) = aa < 0 ? -aa : aa;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_abs16)(void *d, void *a, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(int16_t)) {
+ int16_t aa = *(int16_t *)(a + i);
+ *(int16_t *)(d + i) = aa < 0 ? -aa : aa;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_abs32)(void *d, void *a, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(int32_t)) {
+ int32_t aa = *(int32_t *)(a + i);
+ *(int32_t *)(d + i) = aa < 0 ? -aa : aa;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_abs64)(void *d, void *a, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(int64_t)) {
+ int64_t aa = *(int64_t *)(a + i);
+ *(int64_t *)(d + i) = aa < 0 ? -aa : aa;
+ }
+ clear_high(d, oprsz, desc);
+}
+
void HELPER(gvec_mov)(void *d, void *a, uint32_t desc)
{
intptr_t oprsz = simd_oprsz(desc);
@@ -725,6 +773,150 @@ void HELPER(gvec_sar64i)(void *d, void *a, uint32_t desc)
clear_high(d, oprsz, desc);
}
+void HELPER(gvec_shl8v)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
+ uint8_t sh = *(uint8_t *)(b + i) & 7;
+ *(uint8_t *)(d + i) = *(uint8_t *)(a + i) << sh;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_shl16v)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
+ uint8_t sh = *(uint16_t *)(b + i) & 15;
+ *(uint16_t *)(d + i) = *(uint16_t *)(a + i) << sh;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_shl32v)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
+ uint8_t sh = *(uint32_t *)(b + i) & 31;
+ *(uint32_t *)(d + i) = *(uint32_t *)(a + i) << sh;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_shl64v)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+ uint8_t sh = *(uint64_t *)(b + i) & 63;
+ *(uint64_t *)(d + i) = *(uint64_t *)(a + i) << sh;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_shr8v)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
+ uint8_t sh = *(uint8_t *)(b + i) & 7;
+ *(uint8_t *)(d + i) = *(uint8_t *)(a + i) >> sh;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_shr16v)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
+ uint8_t sh = *(uint16_t *)(b + i) & 15;
+ *(uint16_t *)(d + i) = *(uint16_t *)(a + i) >> sh;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_shr32v)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
+ uint8_t sh = *(uint32_t *)(b + i) & 31;
+ *(uint32_t *)(d + i) = *(uint32_t *)(a + i) >> sh;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_shr64v)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
+ uint8_t sh = *(uint64_t *)(b + i) & 63;
+ *(uint64_t *)(d + i) = *(uint64_t *)(a + i) >> sh;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sar8v)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec8)) {
+ uint8_t sh = *(uint8_t *)(b + i) & 7;
+ *(int8_t *)(d + i) = *(int8_t *)(a + i) >> sh;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sar16v)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(int16_t)) {
+ uint8_t sh = *(uint16_t *)(b + i) & 15;
+ *(int16_t *)(d + i) = *(int16_t *)(a + i) >> sh;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sar32v)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec32)) {
+ uint8_t sh = *(uint32_t *)(b + i) & 31;
+ *(int32_t *)(d + i) = *(int32_t *)(a + i) >> sh;
+ }
+ clear_high(d, oprsz, desc);
+}
+
+void HELPER(gvec_sar64v)(void *d, void *a, void *b, uint32_t desc)
+{
+ intptr_t oprsz = simd_oprsz(desc);
+ intptr_t i;
+
+ for (i = 0; i < oprsz; i += sizeof(vec64)) {
+ uint8_t sh = *(uint64_t *)(b + i) & 63;
+ *(int64_t *)(d + i) = *(int64_t *)(a + i) >> sh;
+ }
+ clear_high(d, oprsz, desc);
+}
+
/* If vectors are enabled, the compiler fills in -1 for true.
Otherwise, we must take care of this by hand. */
#ifdef CONFIG_VECTOR16
diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
index dfe325625c..6d73dc2d65 100644
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -225,6 +225,11 @@ DEF_HELPER_FLAGS_3(gvec_neg16, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(gvec_neg32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(gvec_neg64, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_abs8, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_abs16, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_abs32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_abs64, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
DEF_HELPER_FLAGS_3(gvec_not, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_and, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_or, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
@@ -254,6 +259,21 @@ DEF_HELPER_FLAGS_3(gvec_sar16i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(gvec_sar32i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
DEF_HELPER_FLAGS_3(gvec_sar64i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_shl8v, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_shl16v, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_shl32v, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_shl64v, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_shr8v, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_shr16v, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_shr32v, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_shr64v, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_sar8v, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sar16v, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sar32v, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sar64v, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
DEF_HELPER_FLAGS_4(gvec_eq8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_eq16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(gvec_eq32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/target/arm/helper.h b/target/arm/helper.h
index 50cb036378..132aa1682e 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -352,8 +352,6 @@ DEF_HELPER_2(neon_ceq_u8, i32, i32, i32)
DEF_HELPER_2(neon_ceq_u16, i32, i32, i32)
DEF_HELPER_2(neon_ceq_u32, i32, i32, i32)
-DEF_HELPER_1(neon_abs_s8, i32, i32)
-DEF_HELPER_1(neon_abs_s16, i32, i32)
DEF_HELPER_1(neon_clz_u8, i32, i32)
DEF_HELPER_1(neon_clz_u16, i32, i32)
DEF_HELPER_1(neon_cls_s8, i32, i32)
diff --git a/target/arm/neon_helper.c b/target/arm/neon_helper.c
index ed1c6fc41c..4259056723 100644
--- a/target/arm/neon_helper.c
+++ b/target/arm/neon_helper.c
@@ -1228,11 +1228,6 @@ NEON_VOP(ceq_u16, neon_u16, 2)
NEON_VOP(ceq_u32, neon_u32, 1)
#undef NEON_FN
-#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
-NEON_VOP1(abs_s8, neon_s8, 4)
-NEON_VOP1(abs_s16, neon_s16, 2)
-#undef NEON_FN
-
/* Count Leading Sign/Zero Bits. */
static inline int do_clz8(uint8_t x)
{
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 9dcc5ff3a3..b7c5a928b4 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -9468,11 +9468,7 @@ static void handle_2misc_64(DisasContext *s, int opcode, bool u,
if (u) {
tcg_gen_neg_i64(tcg_rd, tcg_rn);
} else {
- TCGv_i64 tcg_zero = tcg_const_i64(0);
- tcg_gen_neg_i64(tcg_rd, tcg_rn);
- tcg_gen_movcond_i64(TCG_COND_GT, tcg_rd, tcg_rn, tcg_zero,
- tcg_rn, tcg_rd);
- tcg_temp_free_i64(tcg_zero);
+ tcg_gen_abs_i64(tcg_rd, tcg_rn);
}
break;
case 0x2f: /* FABS */
@@ -12366,11 +12362,12 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
}
break;
case 0xb:
- if (u) { /* NEG */
+ if (u) { /* ABS, NEG */
gen_gvec_fn2(s, is_q, rd, rn, tcg_gen_gvec_neg, size);
- return;
+ } else {
+ gen_gvec_fn2(s, is_q, rd, rn, tcg_gen_gvec_abs, size);
}
- break;
+ return;
}
if (size == 3) {
@@ -12438,17 +12435,6 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
gen_helper_neon_qabs_s32(tcg_res, cpu_env, tcg_op);
}
break;
- case 0xb: /* ABS, NEG */
- if (u) {
- tcg_gen_neg_i32(tcg_res, tcg_op);
- } else {
- TCGv_i32 tcg_zero = tcg_const_i32(0);
- tcg_gen_neg_i32(tcg_res, tcg_op);
- tcg_gen_movcond_i32(TCG_COND_GT, tcg_res, tcg_op,
- tcg_zero, tcg_op, tcg_res);
- tcg_temp_free_i32(tcg_zero);
- }
- break;
case 0x2f: /* FABS */
gen_helper_vfp_abss(tcg_res, tcg_op);
break;
@@ -12561,23 +12547,6 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
tcg_temp_free_i32(tcg_zero);
break;
}
- case 0xb: /* ABS, NEG */
- if (u) {
- TCGv_i32 tcg_zero = tcg_const_i32(0);
- if (size) {
- gen_helper_neon_sub_u16(tcg_res, tcg_zero, tcg_op);
- } else {
- gen_helper_neon_sub_u8(tcg_res, tcg_zero, tcg_op);
- }
- tcg_temp_free_i32(tcg_zero);
- } else {
- if (size) {
- gen_helper_neon_abs_s16(tcg_res, tcg_op);
- } else {
- gen_helper_neon_abs_s8(tcg_res, tcg_op);
- }
- }
- break;
case 0x4: /* CLS, CLZ */
if (u) {
if (size == 0) {
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index 80645db508..fa068b0e47 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -3302,29 +3302,30 @@ static bool trans_SUB_zzi(DisasContext *s, arg_rri_esz *a)
static bool trans_SUBR_zzi(DisasContext *s, arg_rri_esz *a)
{
+ static const TCGOpcode vecop_list[] = { INDEX_op_sub_vec, 0 };
static const GVecGen2s op[4] = {
{ .fni8 = tcg_gen_vec_sub8_i64,
.fniv = tcg_gen_sub_vec,
.fno = gen_helper_sve_subri_b,
- .opc = INDEX_op_sub_vec,
+ .opt_opc = vecop_list,
.vece = MO_8,
.scalar_first = true },
{ .fni8 = tcg_gen_vec_sub16_i64,
.fniv = tcg_gen_sub_vec,
.fno = gen_helper_sve_subri_h,
- .opc = INDEX_op_sub_vec,
+ .opt_opc = vecop_list,
.vece = MO_16,
.scalar_first = true },
{ .fni4 = tcg_gen_sub_i32,
.fniv = tcg_gen_sub_vec,
.fno = gen_helper_sve_subri_s,
- .opc = INDEX_op_sub_vec,
+ .opt_opc = vecop_list,
.vece = MO_32,
.scalar_first = true },
{ .fni8 = tcg_gen_sub_i64,
.fniv = tcg_gen_sub_vec,
.fno = gen_helper_sve_subri_d,
- .opc = INDEX_op_sub_vec,
+ .opt_opc = vecop_list,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
.vece = MO_64,
.scalar_first = true }
diff --git a/target/arm/translate.c b/target/arm/translate.c
index 10bc53f91c..dd053c80d6 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -604,16 +604,6 @@ static void gen_sar(TCGv_i32 dest, TCGv_i32 t0, TCGv_i32 t1)
tcg_temp_free_i32(tmp1);
}
-static void tcg_gen_abs_i32(TCGv_i32 dest, TCGv_i32 src)
-{
- TCGv_i32 c0 = tcg_const_i32(0);
- TCGv_i32 tmp = tcg_temp_new_i32();
- tcg_gen_neg_i32(tmp, src);
- tcg_gen_movcond_i32(TCG_COND_GT, dest, src, c0, src, tmp);
- tcg_temp_free_i32(c0);
- tcg_temp_free_i32(tmp);
-}
-
static void shifter_out_im(TCGv_i32 var, int shift)
{
if (shift == 0) {
@@ -5861,27 +5851,31 @@ static void gen_ssra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
tcg_gen_add_vec(vece, d, d, a);
}
+static const TCGOpcode vecop_list_ssra[] = {
+ INDEX_op_sari_vec, INDEX_op_add_vec, 0
+};
+
const GVecGen2i ssra_op[4] = {
{ .fni8 = gen_ssra8_i64,
.fniv = gen_ssra_vec,
.load_dest = true,
- .opc = INDEX_op_sari_vec,
+ .opt_opc = vecop_list_ssra,
.vece = MO_8 },
{ .fni8 = gen_ssra16_i64,
.fniv = gen_ssra_vec,
.load_dest = true,
- .opc = INDEX_op_sari_vec,
+ .opt_opc = vecop_list_ssra,
.vece = MO_16 },
{ .fni4 = gen_ssra32_i32,
.fniv = gen_ssra_vec,
.load_dest = true,
- .opc = INDEX_op_sari_vec,
+ .opt_opc = vecop_list_ssra,
.vece = MO_32 },
{ .fni8 = gen_ssra64_i64,
.fniv = gen_ssra_vec,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .opt_opc = vecop_list_ssra,
.load_dest = true,
- .opc = INDEX_op_sari_vec,
.vece = MO_64 },
};
@@ -5915,27 +5909,31 @@ static void gen_usra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
tcg_gen_add_vec(vece, d, d, a);
}
+static const TCGOpcode vecop_list_usra[] = {
+ INDEX_op_shri_vec, INDEX_op_add_vec, 0
+};
+
const GVecGen2i usra_op[4] = {
{ .fni8 = gen_usra8_i64,
.fniv = gen_usra_vec,
.load_dest = true,
- .opc = INDEX_op_shri_vec,
+ .opt_opc = vecop_list_usra,
.vece = MO_8, },
{ .fni8 = gen_usra16_i64,
.fniv = gen_usra_vec,
.load_dest = true,
- .opc = INDEX_op_shri_vec,
+ .opt_opc = vecop_list_usra,
.vece = MO_16, },
{ .fni4 = gen_usra32_i32,
.fniv = gen_usra_vec,
.load_dest = true,
- .opc = INDEX_op_shri_vec,
+ .opt_opc = vecop_list_usra,
.vece = MO_32, },
{ .fni8 = gen_usra64_i64,
.fniv = gen_usra_vec,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
.load_dest = true,
- .opc = INDEX_op_shri_vec,
+ .opt_opc = vecop_list_usra,
.vece = MO_64, },
};
@@ -5993,27 +5991,29 @@ static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
}
}
+static const TCGOpcode vecop_list_sri[] = { INDEX_op_shri_vec, 0 };
+
const GVecGen2i sri_op[4] = {
{ .fni8 = gen_shr8_ins_i64,
.fniv = gen_shr_ins_vec,
.load_dest = true,
- .opc = INDEX_op_shri_vec,
+ .opt_opc = vecop_list_sri,
.vece = MO_8 },
{ .fni8 = gen_shr16_ins_i64,
.fniv = gen_shr_ins_vec,
.load_dest = true,
- .opc = INDEX_op_shri_vec,
+ .opt_opc = vecop_list_sri,
.vece = MO_16 },
{ .fni4 = gen_shr32_ins_i32,
.fniv = gen_shr_ins_vec,
.load_dest = true,
- .opc = INDEX_op_shri_vec,
+ .opt_opc = vecop_list_sri,
.vece = MO_32 },
{ .fni8 = gen_shr64_ins_i64,
.fniv = gen_shr_ins_vec,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
.load_dest = true,
- .opc = INDEX_op_shri_vec,
+ .opt_opc = vecop_list_sri,
.vece = MO_64 },
};
@@ -6069,27 +6069,29 @@ static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
}
}
+static const TCGOpcode vecop_list_sli[] = { INDEX_op_shli_vec, 0 };
+
const GVecGen2i sli_op[4] = {
{ .fni8 = gen_shl8_ins_i64,
.fniv = gen_shl_ins_vec,
.load_dest = true,
- .opc = INDEX_op_shli_vec,
+ .opt_opc = vecop_list_sli,
.vece = MO_8 },
{ .fni8 = gen_shl16_ins_i64,
.fniv = gen_shl_ins_vec,
.load_dest = true,
- .opc = INDEX_op_shli_vec,
+ .opt_opc = vecop_list_sli,
.vece = MO_16 },
{ .fni4 = gen_shl32_ins_i32,
.fniv = gen_shl_ins_vec,
.load_dest = true,
- .opc = INDEX_op_shli_vec,
+ .opt_opc = vecop_list_sli,
.vece = MO_32 },
{ .fni8 = gen_shl64_ins_i64,
.fniv = gen_shl_ins_vec,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
.load_dest = true,
- .opc = INDEX_op_shli_vec,
+ .opt_opc = vecop_list_sli,
.vece = MO_64 },
};
@@ -6156,51 +6158,60 @@ static void gen_mls_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
/* Note that while NEON does not support VMLA and VMLS as 64-bit ops,
* these tables are shared with AArch64 which does support them.
*/
+
+static const TCGOpcode vecop_list_mla[] = {
+ INDEX_op_mul_vec, INDEX_op_add_vec, 0
+};
+
+static const TCGOpcode vecop_list_mls[] = {
+ INDEX_op_mul_vec, INDEX_op_sub_vec, 0
+};
+
const GVecGen3 mla_op[4] = {
{ .fni4 = gen_mla8_i32,
.fniv = gen_mla_vec,
- .opc = INDEX_op_mul_vec,
.load_dest = true,
+ .opt_opc = vecop_list_mla,
.vece = MO_8 },
{ .fni4 = gen_mla16_i32,
.fniv = gen_mla_vec,
- .opc = INDEX_op_mul_vec,
.load_dest = true,
+ .opt_opc = vecop_list_mla,
.vece = MO_16 },
{ .fni4 = gen_mla32_i32,
.fniv = gen_mla_vec,
- .opc = INDEX_op_mul_vec,
.load_dest = true,
+ .opt_opc = vecop_list_mla,
.vece = MO_32 },
{ .fni8 = gen_mla64_i64,
.fniv = gen_mla_vec,
- .opc = INDEX_op_mul_vec,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
.load_dest = true,
+ .opt_opc = vecop_list_mla,
.vece = MO_64 },
};
const GVecGen3 mls_op[4] = {
{ .fni4 = gen_mls8_i32,
.fniv = gen_mls_vec,
- .opc = INDEX_op_mul_vec,
.load_dest = true,
+ .opt_opc = vecop_list_mls,
.vece = MO_8 },
{ .fni4 = gen_mls16_i32,
.fniv = gen_mls_vec,
- .opc = INDEX_op_mul_vec,
.load_dest = true,
+ .opt_opc = vecop_list_mls,
.vece = MO_16 },
{ .fni4 = gen_mls32_i32,
.fniv = gen_mls_vec,
- .opc = INDEX_op_mul_vec,
.load_dest = true,
+ .opt_opc = vecop_list_mls,
.vece = MO_32 },
{ .fni8 = gen_mls64_i64,
.fniv = gen_mls_vec,
- .opc = INDEX_op_mul_vec,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
.load_dest = true,
+ .opt_opc = vecop_list_mls,
.vece = MO_64 },
};
@@ -6226,19 +6237,25 @@ static void gen_cmtst_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
tcg_gen_cmp_vec(TCG_COND_NE, vece, d, d, a);
}
+static const TCGOpcode vecop_list_cmtst[] = { INDEX_op_cmp_vec, 0 };
+
const GVecGen3 cmtst_op[4] = {
{ .fni4 = gen_helper_neon_tst_u8,
.fniv = gen_cmtst_vec,
+ .opt_opc = vecop_list_cmtst,
.vece = MO_8 },
{ .fni4 = gen_helper_neon_tst_u16,
.fniv = gen_cmtst_vec,
+ .opt_opc = vecop_list_cmtst,
.vece = MO_16 },
{ .fni4 = gen_cmtst_i32,
.fniv = gen_cmtst_vec,
+ .opt_opc = vecop_list_cmtst,
.vece = MO_32 },
{ .fni8 = gen_cmtst_i64,
.fniv = gen_cmtst_vec,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .opt_opc = vecop_list_cmtst,
.vece = MO_64 },
};
@@ -6253,26 +6270,30 @@ static void gen_uqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec sat,
tcg_temp_free_vec(x);
}
+static const TCGOpcode vecop_list_uqadd[] = {
+ INDEX_op_usadd_vec, INDEX_op_cmp_vec, INDEX_op_add_vec, 0
+};
+
const GVecGen4 uqadd_op[4] = {
{ .fniv = gen_uqadd_vec,
.fno = gen_helper_gvec_uqadd_b,
- .opc = INDEX_op_usadd_vec,
.write_aofs = true,
+ .opt_opc = vecop_list_uqadd,
.vece = MO_8 },
{ .fniv = gen_uqadd_vec,
.fno = gen_helper_gvec_uqadd_h,
- .opc = INDEX_op_usadd_vec,
.write_aofs = true,
+ .opt_opc = vecop_list_uqadd,
.vece = MO_16 },
{ .fniv = gen_uqadd_vec,
.fno = gen_helper_gvec_uqadd_s,
- .opc = INDEX_op_usadd_vec,
.write_aofs = true,
+ .opt_opc = vecop_list_uqadd,
.vece = MO_32 },
{ .fniv = gen_uqadd_vec,
.fno = gen_helper_gvec_uqadd_d,
- .opc = INDEX_op_usadd_vec,
.write_aofs = true,
+ .opt_opc = vecop_list_uqadd,
.vece = MO_64 },
};
@@ -6287,25 +6308,29 @@ static void gen_sqadd_vec(unsigned vece, TCGv_vec t, TCGv_vec sat,
tcg_temp_free_vec(x);
}
+static const TCGOpcode vecop_list_sqadd[] = {
+ INDEX_op_ssadd_vec, INDEX_op_cmp_vec, INDEX_op_add_vec, 0
+};
+
const GVecGen4 sqadd_op[4] = {
{ .fniv = gen_sqadd_vec,
.fno = gen_helper_gvec_sqadd_b,
- .opc = INDEX_op_ssadd_vec,
+ .opt_opc = vecop_list_sqadd,
.write_aofs = true,
.vece = MO_8 },
{ .fniv = gen_sqadd_vec,
.fno = gen_helper_gvec_sqadd_h,
- .opc = INDEX_op_ssadd_vec,
+ .opt_opc = vecop_list_sqadd,
.write_aofs = true,
.vece = MO_16 },
{ .fniv = gen_sqadd_vec,
.fno = gen_helper_gvec_sqadd_s,
- .opc = INDEX_op_ssadd_vec,
+ .opt_opc = vecop_list_sqadd,
.write_aofs = true,
.vece = MO_32 },
{ .fniv = gen_sqadd_vec,
.fno = gen_helper_gvec_sqadd_d,
- .opc = INDEX_op_ssadd_vec,
+ .opt_opc = vecop_list_sqadd,
.write_aofs = true,
.vece = MO_64 },
};
@@ -6321,25 +6346,29 @@ static void gen_uqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec sat,
tcg_temp_free_vec(x);
}
+static const TCGOpcode vecop_list_uqsub[] = {
+ INDEX_op_ussub_vec, INDEX_op_cmp_vec, INDEX_op_sub_vec, 0
+};
+
const GVecGen4 uqsub_op[4] = {
{ .fniv = gen_uqsub_vec,
.fno = gen_helper_gvec_uqsub_b,
- .opc = INDEX_op_ussub_vec,
+ .opt_opc = vecop_list_uqsub,
.write_aofs = true,
.vece = MO_8 },
{ .fniv = gen_uqsub_vec,
.fno = gen_helper_gvec_uqsub_h,
- .opc = INDEX_op_ussub_vec,
+ .opt_opc = vecop_list_uqsub,
.write_aofs = true,
.vece = MO_16 },
{ .fniv = gen_uqsub_vec,
.fno = gen_helper_gvec_uqsub_s,
- .opc = INDEX_op_ussub_vec,
+ .opt_opc = vecop_list_uqsub,
.write_aofs = true,
.vece = MO_32 },
{ .fniv = gen_uqsub_vec,
.fno = gen_helper_gvec_uqsub_d,
- .opc = INDEX_op_ussub_vec,
+ .opt_opc = vecop_list_uqsub,
.write_aofs = true,
.vece = MO_64 },
};
@@ -6355,25 +6384,29 @@ static void gen_sqsub_vec(unsigned vece, TCGv_vec t, TCGv_vec sat,
tcg_temp_free_vec(x);
}
+static const TCGOpcode vecop_list_sqsub[] = {
+ INDEX_op_sssub_vec, INDEX_op_cmp_vec, INDEX_op_sub_vec, 0
+};
+
const GVecGen4 sqsub_op[4] = {
{ .fniv = gen_sqsub_vec,
.fno = gen_helper_gvec_sqsub_b,
- .opc = INDEX_op_sssub_vec,
+ .opt_opc = vecop_list_sqsub,
.write_aofs = true,
.vece = MO_8 },
{ .fniv = gen_sqsub_vec,
.fno = gen_helper_gvec_sqsub_h,
- .opc = INDEX_op_sssub_vec,
+ .opt_opc = vecop_list_sqsub,
.write_aofs = true,
.vece = MO_16 },
{ .fniv = gen_sqsub_vec,
.fno = gen_helper_gvec_sqsub_s,
- .opc = INDEX_op_sssub_vec,
+ .opt_opc = vecop_list_sqsub,
.write_aofs = true,
.vece = MO_32 },
{ .fniv = gen_sqsub_vec,
.fno = gen_helper_gvec_sqsub_d,
- .opc = INDEX_op_sssub_vec,
+ .opt_opc = vecop_list_sqsub,
.write_aofs = true,
.vece = MO_64 },
};
@@ -8087,6 +8120,9 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
case NEON_2RM_VNEG:
tcg_gen_gvec_neg(size, rd_ofs, rm_ofs, vec_size, vec_size);
break;
+ case NEON_2RM_VABS:
+ tcg_gen_gvec_abs(size, rd_ofs, rm_ofs, vec_size, vec_size);
+ break;
default:
elementwise:
@@ -8192,14 +8228,6 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
}
tcg_temp_free_i32(tmp2);
break;
- case NEON_2RM_VABS:
- switch(size) {
- case 0: gen_helper_neon_abs_s8(tmp, tmp); break;
- case 1: gen_helper_neon_abs_s16(tmp, tmp); break;
- case 2: tcg_gen_abs_i32(tmp, tmp); break;
- default: abort();
- }
- break;
case NEON_2RM_VCGT0_F:
{
TCGv_ptr fpstatus = get_fpstatus_ptr(1);
diff --git a/target/cris/translate.c b/target/cris/translate.c
index b005a5c20e..31b40a57f9 100644
--- a/target/cris/translate.c
+++ b/target/cris/translate.c
@@ -1686,18 +1686,11 @@ static int dec_cmp_r(CPUCRISState *env, DisasContext *dc)
static int dec_abs_r(CPUCRISState *env, DisasContext *dc)
{
- TCGv t0;
-
LOG_DIS("abs $r%u, $r%u\n",
dc->op1, dc->op2);
cris_cc_mask(dc, CC_MASK_NZ);
- t0 = tcg_temp_new();
- tcg_gen_sari_tl(t0, cpu_R[dc->op1], 31);
- tcg_gen_xor_tl(cpu_R[dc->op2], cpu_R[dc->op1], t0);
- tcg_gen_sub_tl(cpu_R[dc->op2], cpu_R[dc->op2], t0);
- tcg_temp_free(t0);
-
+ tcg_gen_abs_tl(cpu_R[dc->op2], cpu_R[dc->op1]);
cris_alu(dc, CC_OP_MOVE,
cpu_R[dc->op2], cpu_R[dc->op2], cpu_R[dc->op2], 4);
return 2;
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index 8d08625c33..b5217f632f 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -5075,40 +5075,26 @@ static void gen_ecowx(DisasContext *ctx)
/* abs - abs. */
static void gen_abs(DisasContext *ctx)
{
- TCGLabel *l1 = gen_new_label();
- TCGLabel *l2 = gen_new_label();
- tcg_gen_brcondi_tl(TCG_COND_GE, cpu_gpr[rA(ctx->opcode)], 0, l1);
- tcg_gen_neg_tl(cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)]);
- tcg_gen_br(l2);
- gen_set_label(l1);
- tcg_gen_mov_tl(cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)]);
- gen_set_label(l2);
+ TCGv d = cpu_gpr[rD(ctx->opcode)];
+ TCGv a = cpu_gpr[rA(ctx->opcode)];
+
+ tcg_gen_abs_tl(d, a);
if (unlikely(Rc(ctx->opcode) != 0)) {
- gen_set_Rc0(ctx, cpu_gpr[rD(ctx->opcode)]);
+ gen_set_Rc0(ctx, d);
}
}
/* abso - abso. */
static void gen_abso(DisasContext *ctx)
{
- TCGLabel *l1 = gen_new_label();
- TCGLabel *l2 = gen_new_label();
- TCGLabel *l3 = gen_new_label();
- /* Start with XER OV disabled, the most likely case */
- tcg_gen_movi_tl(cpu_ov, 0);
- tcg_gen_brcondi_tl(TCG_COND_GE, cpu_gpr[rA(ctx->opcode)], 0, l2);
- tcg_gen_brcondi_tl(TCG_COND_NE, cpu_gpr[rA(ctx->opcode)], 0x80000000, l1);
- tcg_gen_movi_tl(cpu_ov, 1);
- tcg_gen_movi_tl(cpu_so, 1);
- tcg_gen_br(l2);
- gen_set_label(l1);
- tcg_gen_neg_tl(cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)]);
- tcg_gen_br(l3);
- gen_set_label(l2);
- tcg_gen_mov_tl(cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)]);
- gen_set_label(l3);
+ TCGv d = cpu_gpr[rD(ctx->opcode)];
+ TCGv a = cpu_gpr[rA(ctx->opcode)];
+
+ tcg_gen_setcondi_tl(TCG_COND_EQ, cpu_ov, a, 0x80000000);
+ tcg_gen_abs_tl(d, a);
+ tcg_gen_or_tl(cpu_so, cpu_so, cpu_ov);
if (unlikely(Rc(ctx->opcode) != 0)) {
- gen_set_Rc0(ctx, cpu_gpr[rD(ctx->opcode)]);
+ gen_set_Rc0(ctx, d);
}
}
@@ -5344,34 +5330,28 @@ static void gen_mulo(DisasContext *ctx)
/* nabs - nabs. */
static void gen_nabs(DisasContext *ctx)
{
- TCGLabel *l1 = gen_new_label();
- TCGLabel *l2 = gen_new_label();
- tcg_gen_brcondi_tl(TCG_COND_GT, cpu_gpr[rA(ctx->opcode)], 0, l1);
- tcg_gen_mov_tl(cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)]);
- tcg_gen_br(l2);
- gen_set_label(l1);
- tcg_gen_neg_tl(cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)]);
- gen_set_label(l2);
+ TCGv d = cpu_gpr[rD(ctx->opcode)];
+ TCGv a = cpu_gpr[rA(ctx->opcode)];
+
+ tcg_gen_abs_tl(d, a);
+ tcg_gen_neg_tl(d, d);
if (unlikely(Rc(ctx->opcode) != 0)) {
- gen_set_Rc0(ctx, cpu_gpr[rD(ctx->opcode)]);
+ gen_set_Rc0(ctx, d);
}
}
/* nabso - nabso. */
static void gen_nabso(DisasContext *ctx)
{
- TCGLabel *l1 = gen_new_label();
- TCGLabel *l2 = gen_new_label();
- tcg_gen_brcondi_tl(TCG_COND_GT, cpu_gpr[rA(ctx->opcode)], 0, l1);
- tcg_gen_mov_tl(cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)]);
- tcg_gen_br(l2);
- gen_set_label(l1);
- tcg_gen_neg_tl(cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)]);
- gen_set_label(l2);
+ TCGv d = cpu_gpr[rD(ctx->opcode)];
+ TCGv a = cpu_gpr[rA(ctx->opcode)];
+
+ tcg_gen_abs_tl(d, a);
+ tcg_gen_neg_tl(d, d);
/* nabs never overflows */
tcg_gen_movi_tl(cpu_ov, 0);
if (unlikely(Rc(ctx->opcode) != 0)) {
- gen_set_Rc0(ctx, cpu_gpr[rD(ctx->opcode)]);
+ gen_set_Rc0(ctx, d);
}
}
diff --git a/target/ppc/translate/spe-impl.inc.c b/target/ppc/translate/spe-impl.inc.c
index 7ab0a29b5f..36b4d5654d 100644
--- a/target/ppc/translate/spe-impl.inc.c
+++ b/target/ppc/translate/spe-impl.inc.c
@@ -126,19 +126,7 @@ static inline void gen_##name(DisasContext *ctx) \
tcg_temp_free_i32(t0); \
}
-static inline void gen_op_evabs(TCGv_i32 ret, TCGv_i32 arg1)
-{
- TCGLabel *l1 = gen_new_label();
- TCGLabel *l2 = gen_new_label();
-
- tcg_gen_brcondi_i32(TCG_COND_GE, arg1, 0, l1);
- tcg_gen_neg_i32(ret, arg1);
- tcg_gen_br(l2);
- gen_set_label(l1);
- tcg_gen_mov_i32(ret, arg1);
- gen_set_label(l2);
-}
-GEN_SPEOP_ARITH1(evabs, gen_op_evabs);
+GEN_SPEOP_ARITH1(evabs, tcg_gen_abs_i32);
GEN_SPEOP_ARITH1(evneg, tcg_gen_neg_i32);
GEN_SPEOP_ARITH1(evextsb, tcg_gen_ext8s_i32);
GEN_SPEOP_ARITH1(evextsh, tcg_gen_ext16s_i32);
diff --git a/target/ppc/translate/vmx-impl.inc.c b/target/ppc/translate/vmx-impl.inc.c
index bd3ff40e68..6861f4c5b9 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -566,10 +566,15 @@ static void glue(glue(gen_, NAME), _vec)(unsigned vece, TCGv_vec t, \
} \
static void glue(gen_, NAME)(DisasContext *ctx) \
{ \
+ static const TCGOpcode vecop_list[] = { \
+ glue(glue(INDEX_op_, NORM), _vec), \
+ glue(glue(INDEX_op_, SAT), _vec), \
+ INDEX_op_cmp_vec, 0 \
+ }; \
static const GVecGen4 g = { \
.fniv = glue(glue(gen_, NAME), _vec), \
.fno = glue(gen_helper_, NAME), \
- .opc = glue(glue(INDEX_op_, SAT), _vec), \
+ .opt_opc = vecop_list, \
.write_aofs = true, \
.vece = VECE, \
}; \
diff --git a/target/s390x/translate.c b/target/s390x/translate.c
index d4951836ad..e8e8a79b7d 100644
--- a/target/s390x/translate.c
+++ b/target/s390x/translate.c
@@ -1407,13 +1407,7 @@ static DisasJumpType help_branch(DisasContext *s, DisasCompare *c,
static DisasJumpType op_abs(DisasContext *s, DisasOps *o)
{
- TCGv_i64 z, n;
- z = tcg_const_i64(0);
- n = tcg_temp_new_i64();
- tcg_gen_neg_i64(n, o->in2);
- tcg_gen_movcond_i64(TCG_COND_LT, o->out, o->in2, z, n, o->in2);
- tcg_temp_free_i64(n);
- tcg_temp_free_i64(z);
+ tcg_gen_abs_i64(o->out, o->in2);
return DISAS_NEXT;
}
diff --git a/target/tricore/translate.c b/target/tricore/translate.c
index 8f6416144e..06c4485e55 100644
--- a/target/tricore/translate.c
+++ b/target/tricore/translate.c
@@ -2415,11 +2415,7 @@ gen_msubadr32s_h(TCGv ret, TCGv r1, TCGv r2, TCGv r3, uint32_t n, uint32_t mode)
static inline void gen_abs(TCGv ret, TCGv r1)
{
- TCGv temp = tcg_temp_new();
- TCGv t0 = tcg_const_i32(0);
-
- tcg_gen_neg_tl(temp, r1);
- tcg_gen_movcond_tl(TCG_COND_GE, ret, r1, t0, r1, temp);
+ tcg_gen_abs_tl(ret, r1);
/* overflow can only happen, if r1 = 0x80000000 */
tcg_gen_setcondi_tl(TCG_COND_EQ, cpu_PSW_V, r1, 0x80000000);
tcg_gen_shli_tl(cpu_PSW_V, cpu_PSW_V, 31);
@@ -2430,9 +2426,6 @@ static inline void gen_abs(TCGv ret, TCGv r1)
tcg_gen_xor_tl(cpu_PSW_AV, ret, cpu_PSW_AV);
/* calc SAV bit */
tcg_gen_or_tl(cpu_PSW_SAV, cpu_PSW_SAV, cpu_PSW_AV);
-
- tcg_temp_free(temp);
- tcg_temp_free(t0);
}
static inline void gen_absdif(TCGv ret, TCGv r1, TCGv r2)
@@ -6617,13 +6610,8 @@ static void decode_rr_divide(CPUTriCoreState *env, DisasContext *ctx)
tcg_gen_movi_tl(cpu_PSW_AV, 0);
if (!tricore_feature(env, TRICORE_FEATURE_131)) {
/* overflow = (abs(D[r3+1]) >= abs(D[r2])) */
- tcg_gen_neg_tl(temp, temp3);
- /* use cpu_PSW_AV to compare against 0 */
- tcg_gen_movcond_tl(TCG_COND_LT, temp, temp3, cpu_PSW_AV,
- temp, temp3);
- tcg_gen_neg_tl(temp2, cpu_gpr_d[r2]);
- tcg_gen_movcond_tl(TCG_COND_LT, temp2, cpu_gpr_d[r2], cpu_PSW_AV,
- temp2, cpu_gpr_d[r2]);
+ tcg_gen_abs_tl(temp, temp3);
+ tcg_gen_abs_tl(temp2, cpu_gpr_d[r2]);
tcg_gen_setcond_tl(TCG_COND_GE, cpu_PSW_V, temp, temp2);
} else {
/* overflow = (D[b] == 0) */
@@ -6655,13 +6643,8 @@ static void decode_rr_divide(CPUTriCoreState *env, DisasContext *ctx)
tcg_gen_movi_tl(cpu_PSW_AV, 0);
if (!tricore_feature(env, TRICORE_FEATURE_131)) {
/* overflow = (abs(D[r3+1]) >= abs(D[r2])) */
- tcg_gen_neg_tl(temp, temp3);
- /* use cpu_PSW_AV to compare against 0 */
- tcg_gen_movcond_tl(TCG_COND_LT, temp, temp3, cpu_PSW_AV,
- temp, temp3);
- tcg_gen_neg_tl(temp2, cpu_gpr_d[r2]);
- tcg_gen_movcond_tl(TCG_COND_LT, temp2, cpu_gpr_d[r2], cpu_PSW_AV,
- temp2, cpu_gpr_d[r2]);
+ tcg_gen_abs_tl(temp, temp3);
+ tcg_gen_abs_tl(temp2, cpu_gpr_d[r2]);
tcg_gen_setcond_tl(TCG_COND_GE, cpu_PSW_V, temp, temp2);
} else {
/* overflow = (D[b] == 0) */
diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c
index 301c8e3161..b063fa85f2 100644
--- a/target/xtensa/translate.c
+++ b/target/xtensa/translate.c
@@ -1709,14 +1709,7 @@ void restore_state_to_opc(CPUXtensaState *env, TranslationBlock *tb,
static void translate_abs(DisasContext *dc, const OpcodeArg arg[],
const uint32_t par[])
{
- TCGv_i32 zero = tcg_const_i32(0);
- TCGv_i32 neg = tcg_temp_new_i32();
-
- tcg_gen_neg_i32(neg, arg[1].in);
- tcg_gen_movcond_i32(TCG_COND_GE, arg[0].out,
- arg[1].in, zero, arg[1].in, neg);
- tcg_temp_free(neg);
- tcg_temp_free(zero);
+ tcg_gen_abs_i32(arg[0].out, arg[1].in);
}
static void translate_add(DisasContext *dc, const OpcodeArg arg[],
diff --git a/tcg/README b/tcg/README
index c30e5418a6..cbdfd3b6bc 100644
--- a/tcg/README
+++ b/tcg/README
@@ -561,6 +561,10 @@ E.g. VECL=1 -> 64 << 1 -> v128, and VECE=2 -> 1 << 2 -> i32.
Similarly, v0 = -v1.
+* abs_vec v0, v1
+
+ Similarly, v0 = v1 < 0 ? -v1 : v1, in elements across the vector.
+
* smin_vec:
* umin_vec:
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index ce2bb1f90b..e43554c3c7 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -132,9 +132,10 @@ typedef enum {
#define TCG_TARGET_HAS_orc_vec 1
#define TCG_TARGET_HAS_not_vec 1
#define TCG_TARGET_HAS_neg_vec 1
+#define TCG_TARGET_HAS_abs_vec 1
#define TCG_TARGET_HAS_shi_vec 1
#define TCG_TARGET_HAS_shs_vec 0
-#define TCG_TARGET_HAS_shv_vec 0
+#define TCG_TARGET_HAS_shv_vec 1
#define TCG_TARGET_HAS_cmp_vec 1
#define TCG_TARGET_HAS_mul_vec 1
#define TCG_TARGET_HAS_sat_vec 1
diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c
index eefa929948..40bf35079a 100644
--- a/tcg/aarch64/tcg-target.inc.c
+++ b/tcg/aarch64/tcg-target.inc.c
@@ -381,6 +381,9 @@ typedef enum {
I3207_BLR = 0xd63f0000,
I3207_RET = 0xd65f0000,
+ /* AdvSIMD load/store single structure. */
+ I3303_LD1R = 0x0d40c000,
+
/* Load literal for loading the address at pc-relative offset */
I3305_LDR = 0x58000000,
I3305_LDR_v64 = 0x5c000000,
@@ -533,12 +536,14 @@ typedef enum {
I3616_CMEQ = 0x2e208c00,
I3616_SMAX = 0x0e206400,
I3616_SMIN = 0x0e206c00,
+ I3616_SSHL = 0x0e204400,
I3616_SQADD = 0x0e200c00,
I3616_SQSUB = 0x0e202c00,
I3616_UMAX = 0x2e206400,
I3616_UMIN = 0x2e206c00,
I3616_UQADD = 0x2e200c00,
I3616_UQSUB = 0x2e202c00,
+ I3616_USHL = 0x2e204400,
/* AdvSIMD two-reg misc. */
I3617_CMGT0 = 0x0e208800,
@@ -547,6 +552,7 @@ typedef enum {
I3617_CMGE0 = 0x2e208800,
I3617_CMLE0 = 0x2e20a800,
I3617_NOT = 0x2e205800,
+ I3617_ABS = 0x0e20b800,
I3617_NEG = 0x2e20b800,
/* System instructions. */
@@ -566,7 +572,14 @@ static inline uint32_t tcg_in32(TCGContext *s)
#define tcg_out_insn(S, FMT, OP, ...) \
glue(tcg_out_insn_,FMT)(S, glue(glue(glue(I,FMT),_),OP), ## __VA_ARGS__)
-static void tcg_out_insn_3305(TCGContext *s, AArch64Insn insn, int imm19, TCGReg rt)
+static void tcg_out_insn_3303(TCGContext *s, AArch64Insn insn, bool q,
+ TCGReg rt, TCGReg rn, unsigned size)
+{
+ tcg_out32(s, insn | (rt & 0x1f) | (rn << 5) | (size << 10) | (q << 30));
+}
+
+static void tcg_out_insn_3305(TCGContext *s, AArch64Insn insn,
+ int imm19, TCGReg rt)
{
tcg_out32(s, insn | (imm19 & 0x7ffff) << 5 | rt);
}
@@ -799,7 +812,7 @@ static void tcg_out_logicali(TCGContext *s, AArch64Insn insn, TCGType ext,
}
static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
- TCGReg rd, uint64_t v64)
+ TCGReg rd, tcg_target_long v64)
{
int op, cmode, imm8;
@@ -814,6 +827,43 @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
}
}
+static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
+ TCGReg rd, TCGReg rs)
+{
+ int is_q = type - TCG_TYPE_V64;
+ tcg_out_insn(s, 3605, DUP, is_q, rd, rs, 1 << vece, 0);
+ return true;
+}
+
+static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
+ TCGReg r, TCGReg base, intptr_t offset)
+{
+ TCGReg temp = TCG_REG_TMP;
+
+ if (offset < -0xffffff || offset > 0xffffff) {
+ tcg_out_movi(s, TCG_TYPE_PTR, temp, offset);
+ tcg_out_insn(s, 3502, ADD, 1, temp, temp, base);
+ base = temp;
+ } else {
+ AArch64Insn add_insn = I3401_ADDI;
+
+ if (offset < 0) {
+ add_insn = I3401_SUBI;
+ offset = -offset;
+ }
+ if (offset & 0xfff000) {
+ tcg_out_insn_3401(s, add_insn, 1, temp, base, offset & 0xfff000);
+ base = temp;
+ }
+ if (offset & 0xfff) {
+ tcg_out_insn_3401(s, add_insn, 1, temp, base, offset & 0xfff);
+ base = temp;
+ }
+ }
+ tcg_out_insn(s, 3303, LD1R, type == TCG_TYPE_V128, r, base, vece);
+ return true;
+}
+
static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
tcg_target_long value)
{
@@ -938,10 +988,10 @@ static void tcg_out_ldst(TCGContext *s, AArch64Insn insn, TCGReg rd,
tcg_out_ldst_r(s, insn, rd, rn, TCG_TYPE_I64, TCG_REG_TMP);
}
-static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
+static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
if (ret == arg) {
- return;
+ return true;
}
switch (type) {
case TCG_TYPE_I32:
@@ -970,6 +1020,7 @@ static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
default:
g_assert_not_reached();
}
+ return true;
}
static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
@@ -2099,10 +2150,8 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
case INDEX_op_mov_i64:
- case INDEX_op_mov_vec:
case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
case INDEX_op_movi_i64:
- case INDEX_op_dupi_vec:
case INDEX_op_call: /* Always emitted via tcg_out_call. */
default:
g_assert_not_reached();
@@ -2145,6 +2194,9 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_st_vec:
tcg_out_st(s, type, a0, a1, a2);
break;
+ case INDEX_op_dupm_vec:
+ tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
+ break;
case INDEX_op_add_vec:
tcg_out_insn(s, 3616, ADD, is_q, vece, a0, a1, a2);
break;
@@ -2157,6 +2209,9 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_neg_vec:
tcg_out_insn(s, 3617, NEG, is_q, vece, a0, a1);
break;
+ case INDEX_op_abs_vec:
+ tcg_out_insn(s, 3617, ABS, is_q, vece, a0, a1);
+ break;
case INDEX_op_and_vec:
tcg_out_insn(s, 3616, AND, is_q, 0, a0, a1, a2);
break;
@@ -2199,9 +2254,6 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_not_vec:
tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a1);
break;
- case INDEX_op_dup_vec:
- tcg_out_insn(s, 3605, DUP, is_q, a0, a1, 1 << vece, 0);
- break;
case INDEX_op_shli_vec:
tcg_out_insn(s, 3614, SHL, is_q, a0, a1, a2 + (8 << vece));
break;
@@ -2211,6 +2263,12 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_sari_vec:
tcg_out_insn(s, 3614, SSHR, is_q, a0, a1, (16 << vece) - a2);
break;
+ case INDEX_op_shlv_vec:
+ tcg_out_insn(s, 3616, USHL, is_q, vece, a0, a1, a2);
+ break;
+ case INDEX_op_aa64_sshl_vec:
+ tcg_out_insn(s, 3616, SSHL, is_q, vece, a0, a1, a2);
+ break;
case INDEX_op_cmp_vec:
{
TCGCond cond = args[3];
@@ -2245,6 +2303,10 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
}
}
break;
+
+ case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */
+ case INDEX_op_dupi_vec: /* Always emitted via tcg_out_movi. */
+ case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */
default:
g_assert_not_reached();
}
@@ -2261,6 +2323,7 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
case INDEX_op_andc_vec:
case INDEX_op_orc_vec:
case INDEX_op_neg_vec:
+ case INDEX_op_abs_vec:
case INDEX_op_not_vec:
case INDEX_op_cmp_vec:
case INDEX_op_shli_vec:
@@ -2270,12 +2333,16 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
case INDEX_op_sssub_vec:
case INDEX_op_usadd_vec:
case INDEX_op_ussub_vec:
+ case INDEX_op_shlv_vec:
+ return 1;
+ case INDEX_op_shrv_vec:
+ case INDEX_op_sarv_vec:
+ return -1;
+ case INDEX_op_mul_vec:
case INDEX_op_smax_vec:
case INDEX_op_smin_vec:
case INDEX_op_umax_vec:
case INDEX_op_umin_vec:
- return 1;
- case INDEX_op_mul_vec:
return vece < MO_64;
default:
@@ -2286,6 +2353,32 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
TCGArg a0, ...)
{
+ va_list va;
+ TCGv_vec v0, v1, v2, t1;
+
+ va_start(va, a0);
+ v0 = temp_tcgv_vec(arg_temp(a0));
+ v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
+ v2 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
+
+ switch (opc) {
+ case INDEX_op_shrv_vec:
+ case INDEX_op_sarv_vec:
+ /* Right shifts are negative left shifts for AArch64. */
+ t1 = tcg_temp_new_vec(type);
+ tcg_gen_neg_vec(vece, t1, v2);
+ opc = (opc == INDEX_op_shrv_vec
+ ? INDEX_op_shlv_vec : INDEX_op_aa64_sshl_vec);
+ vec_gen_3(opc, type, vece, tcgv_vec_arg(v0),
+ tcgv_vec_arg(v1), tcgv_vec_arg(t1));
+ tcg_temp_free_vec(t1);
+ break;
+
+ default:
+ g_assert_not_reached();
+ }
+
+ va_end(va);
}
static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
@@ -2467,15 +2560,21 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
case INDEX_op_smin_vec:
case INDEX_op_umax_vec:
case INDEX_op_umin_vec:
+ case INDEX_op_shlv_vec:
+ case INDEX_op_shrv_vec:
+ case INDEX_op_sarv_vec:
+ case INDEX_op_aa64_sshl_vec:
return &w_w_w;
case INDEX_op_not_vec:
case INDEX_op_neg_vec:
+ case INDEX_op_abs_vec:
case INDEX_op_shli_vec:
case INDEX_op_shri_vec:
case INDEX_op_sari_vec:
return &w_w;
case INDEX_op_ld_vec:
case INDEX_op_st_vec:
+ case INDEX_op_dupm_vec:
return &w_r;
case INDEX_op_dup_vec:
return &w_wr;
diff --git a/tcg/aarch64/tcg-target.opc.h b/tcg/aarch64/tcg-target.opc.h
index 4816a6c3d4..59e1d3f7f7 100644
--- a/tcg/aarch64/tcg-target.opc.h
+++ b/tcg/aarch64/tcg-target.opc.h
@@ -1,3 +1,5 @@
/* Target-specific opcodes for host vector expansion. These will be
emitted by tcg_expand_vec_op. For those familiar with GCC internals,
consider these to be UNSPEC with names. */
+
+DEF(aa64_sshl_vec, 1, 2, 0, IMPLVEC)
diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
index abf0c444b4..7316504c9d 100644
--- a/tcg/arm/tcg-target.inc.c
+++ b/tcg/arm/tcg-target.inc.c
@@ -2264,10 +2264,11 @@ static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
return false;
}
-static inline void tcg_out_mov(TCGContext *s, TCGType type,
+static inline bool tcg_out_mov(TCGContext *s, TCGType type,
TCGReg ret, TCGReg arg)
{
- tcg_out_dat_reg(s, COND_AL, ARITH_MOV, ret, 0, arg, SHIFT_IMM_LSL(0));
+ tcg_out_mov_reg(s, COND_AL, ret, arg);
+ return true;
}
static inline void tcg_out_movi(TCGContext *s, TCGType type,
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 241bf19413..66f16fbe3c 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -182,9 +182,10 @@ extern bool have_avx2;
#define TCG_TARGET_HAS_orc_vec 0
#define TCG_TARGET_HAS_not_vec 0
#define TCG_TARGET_HAS_neg_vec 0
+#define TCG_TARGET_HAS_abs_vec 1
#define TCG_TARGET_HAS_shi_vec 1
-#define TCG_TARGET_HAS_shs_vec 0
-#define TCG_TARGET_HAS_shv_vec 0
+#define TCG_TARGET_HAS_shs_vec 1
+#define TCG_TARGET_HAS_shv_vec have_avx2
#define TCG_TARGET_HAS_cmp_vec 1
#define TCG_TARGET_HAS_mul_vec 1
#define TCG_TARGET_HAS_sat_vec 1
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index d5ed9f1ffd..aafd01cb49 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -358,7 +358,6 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
#define OPC_MOVBE_MyGy (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16)
-#define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
@@ -370,6 +369,9 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
#define OPC_MOVSLQ (0x63 | P_REXW)
#define OPC_MOVZBL (0xb6 | P_EXT)
#define OPC_MOVZWL (0xb7 | P_EXT)
+#define OPC_PABSB (0x1c | P_EXT38 | P_DATA16)
+#define OPC_PABSW (0x1d | P_EXT38 | P_DATA16)
+#define OPC_PABSD (0x1e | P_EXT38 | P_DATA16)
#define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16)
@@ -421,6 +423,14 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
#define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
+#define OPC_PSLLW (0xf1 | P_EXT | P_DATA16)
+#define OPC_PSLLD (0xf2 | P_EXT | P_DATA16)
+#define OPC_PSLLQ (0xf3 | P_EXT | P_DATA16)
+#define OPC_PSRAW (0xe1 | P_EXT | P_DATA16)
+#define OPC_PSRAD (0xe2 | P_EXT | P_DATA16)
+#define OPC_PSRLW (0xd1 | P_EXT | P_DATA16)
+#define OPC_PSRLD (0xd2 | P_EXT | P_DATA16)
+#define OPC_PSRLQ (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD (0xfa | P_EXT | P_DATA16)
@@ -458,12 +468,21 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
#define OPC_UD2 (0x0b | P_EXT)
#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16)
+#define OPC_VPINSRB (0x20 | P_EXT3A | P_DATA16)
+#define OPC_VPINSRW (0xc4 | P_EXT | P_DATA16)
+#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
+#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_REXW)
#define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
+#define OPC_VPSLLVD (0x47 | P_EXT38 | P_DATA16)
+#define OPC_VPSLLVQ (0x47 | P_EXT38 | P_DATA16 | P_REXW)
+#define OPC_VPSRAVD (0x46 | P_EXT38 | P_DATA16)
+#define OPC_VPSRLVD (0x45 | P_EXT38 | P_DATA16)
+#define OPC_VPSRLVQ (0x45 | P_EXT38 | P_DATA16 | P_REXW)
#define OPC_VZEROUPPER (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)
@@ -809,12 +828,12 @@ static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
}
-static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
+static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
int rexw = 0;
if (arg == ret) {
- return;
+ return true;
}
switch (type) {
case TCG_TYPE_I64:
@@ -852,18 +871,20 @@ static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
default:
g_assert_not_reached();
}
+ return true;
}
-static void tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
+static const int avx2_dup_insn[4] = {
+ OPC_VPBROADCASTB, OPC_VPBROADCASTW,
+ OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
+};
+
+static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
TCGReg r, TCGReg a)
{
if (have_avx2) {
- static const int dup_insn[4] = {
- OPC_VPBROADCASTB, OPC_VPBROADCASTW,
- OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
- };
int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
- tcg_out_vex_modrm(s, dup_insn[vece] + vex_l, r, 0, a);
+ tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
} else {
switch (vece) {
case MO_8:
@@ -887,6 +908,39 @@ static void tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
g_assert_not_reached();
}
}
+ return true;
+}
+
+static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
+ TCGReg r, TCGReg base, intptr_t offset)
+{
+ if (have_avx2) {
+ int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
+ tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
+ r, 0, base, offset);
+ } else {
+ switch (vece) {
+ case MO_64:
+ tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSD, r, 0, base, offset);
+ break;
+ case MO_32:
+ tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
+ break;
+ case MO_16:
+ tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
+ tcg_out8(s, 0); /* imm8 */
+ tcg_out_dup_vec(s, type, vece, r, r);
+ break;
+ case MO_8:
+ tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
+ tcg_out8(s, 0); /* imm8 */
+ tcg_out_dup_vec(s, type, vece, r, r);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ }
+ return true;
}
static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
@@ -909,16 +963,16 @@ static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
} else if (have_avx2) {
tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
} else {
- tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
+ tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSD, ret);
}
new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
- } else if (have_avx2) {
- tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
- new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
} else {
- tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy, ret);
+ if (have_avx2) {
+ tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSD + vex_l, ret);
+ } else {
+ tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
+ }
new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
- tcg_out_dup_vec(s, type, MO_32, ret, ret);
}
}
@@ -2601,10 +2655,8 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
break;
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
case INDEX_op_mov_i64:
- case INDEX_op_mov_vec:
case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
case INDEX_op_movi_i64:
- case INDEX_op_dupi_vec:
case INDEX_op_call: /* Always emitted via tcg_out_call. */
default:
tcg_abort();
@@ -2671,6 +2723,31 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
static int const umax_insn[4] = {
OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
};
+ static int const shlv_insn[4] = {
+ /* TODO: AVX512 adds support for MO_16. */
+ OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ
+ };
+ static int const shrv_insn[4] = {
+ /* TODO: AVX512 adds support for MO_16. */
+ OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ
+ };
+ static int const sarv_insn[4] = {
+ /* TODO: AVX512 adds support for MO_16, MO_64. */
+ OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2
+ };
+ static int const shls_insn[4] = {
+ OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
+ };
+ static int const shrs_insn[4] = {
+ OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
+ };
+ static int const sars_insn[4] = {
+ OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2
+ };
+ static int const abs_insn[4] = {
+ /* TODO: AVX512 adds support for MO_64. */
+ OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
+ };
TCGType type = vecl + TCG_TYPE_V64;
int insn, sub;
@@ -2723,6 +2800,24 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_umax_vec:
insn = umax_insn[vece];
goto gen_simd;
+ case INDEX_op_shlv_vec:
+ insn = shlv_insn[vece];
+ goto gen_simd;
+ case INDEX_op_shrv_vec:
+ insn = shrv_insn[vece];
+ goto gen_simd;
+ case INDEX_op_sarv_vec:
+ insn = sarv_insn[vece];
+ goto gen_simd;
+ case INDEX_op_shls_vec:
+ insn = shls_insn[vece];
+ goto gen_simd;
+ case INDEX_op_shrs_vec:
+ insn = shrs_insn[vece];
+ goto gen_simd;
+ case INDEX_op_sars_vec:
+ insn = sars_insn[vece];
+ goto gen_simd;
case INDEX_op_x86_punpckl_vec:
insn = punpckl_insn[vece];
goto gen_simd;
@@ -2741,6 +2836,11 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
insn = OPC_PUNPCKLDQ;
goto gen_simd;
#endif
+ case INDEX_op_abs_vec:
+ insn = abs_insn[vece];
+ a2 = a1;
+ a1 = 0;
+ goto gen_simd;
gen_simd:
tcg_debug_assert(insn != OPC_UD2);
if (type == TCG_TYPE_V256) {
@@ -2793,8 +2893,8 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_st_vec:
tcg_out_st(s, type, a0, a1, a2);
break;
- case INDEX_op_dup_vec:
- tcg_out_dup_vec(s, type, vece, a0, a1);
+ case INDEX_op_dupm_vec:
+ tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
break;
case INDEX_op_x86_shufps_vec:
@@ -2837,6 +2937,9 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
tcg_out8(s, a2);
break;
+ case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */
+ case INDEX_op_dupi_vec: /* Always emitted via tcg_out_movi. */
+ case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */
default:
g_assert_not_reached();
}
@@ -3079,6 +3182,7 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
case INDEX_op_ld_vec:
case INDEX_op_st_vec:
+ case INDEX_op_dupm_vec:
return &x_r;
case INDEX_op_add_vec:
@@ -3096,6 +3200,12 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
case INDEX_op_umin_vec:
case INDEX_op_smax_vec:
case INDEX_op_umax_vec:
+ case INDEX_op_shlv_vec:
+ case INDEX_op_shrv_vec:
+ case INDEX_op_sarv_vec:
+ case INDEX_op_shls_vec:
+ case INDEX_op_shrs_vec:
+ case INDEX_op_sars_vec:
case INDEX_op_cmp_vec:
case INDEX_op_x86_shufps_vec:
case INDEX_op_x86_blend_vec:
@@ -3108,6 +3218,7 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
case INDEX_op_dup2_vec:
#endif
return &x_x_x;
+ case INDEX_op_abs_vec:
case INDEX_op_dup_vec:
case INDEX_op_shli_vec:
case INDEX_op_shri_vec:
@@ -3153,6 +3264,18 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
}
return 1;
+ case INDEX_op_shls_vec:
+ case INDEX_op_shrs_vec:
+ return vece >= MO_16;
+ case INDEX_op_sars_vec:
+ return vece >= MO_16 && vece <= MO_32;
+
+ case INDEX_op_shlv_vec:
+ case INDEX_op_shrv_vec:
+ return have_avx2 && vece >= MO_32;
+ case INDEX_op_sarv_vec:
+ return have_avx2 && vece == MO_32;
+
case INDEX_op_mul_vec:
if (vece == MO_8) {
/* We can expand the operation for MO_8. */
@@ -3173,6 +3296,8 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
case INDEX_op_umin_vec:
case INDEX_op_umax_vec:
return vece <= MO_32 ? 1 : -1;
+ case INDEX_op_abs_vec:
+ return vece <= MO_32;
default:
return 0;
diff --git a/tcg/mips/tcg-target.inc.c b/tcg/mips/tcg-target.inc.c
index 412cacdcb9..7cafd4a790 100644
--- a/tcg/mips/tcg-target.inc.c
+++ b/tcg/mips/tcg-target.inc.c
@@ -558,13 +558,14 @@ static inline void tcg_out_dsra(TCGContext *s, TCGReg rd, TCGReg rt, TCGArg sa)
tcg_out_opc_sa64(s, OPC_DSRA, OPC_DSRA32, rd, rt, sa);
}
-static inline void tcg_out_mov(TCGContext *s, TCGType type,
+static inline bool tcg_out_mov(TCGContext *s, TCGType type,
TCGReg ret, TCGReg arg)
{
/* Simple reg-reg move, optimising out the 'do nothing' case */
if (ret != arg) {
tcg_out_opc_reg(s, OPC_OR, ret, arg, TCG_REG_ZERO);
}
+ return true;
}
static void tcg_out_movi(TCGContext *s, TCGType type,
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 5150c38a25..24faa06260 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -734,9 +734,13 @@ void tcg_optimize(TCGContext *s)
} else if (opc == INDEX_op_sub_i64) {
neg_op = INDEX_op_neg_i64;
have_neg = TCG_TARGET_HAS_neg_i64;
- } else {
+ } else if (TCG_TARGET_HAS_neg_vec) {
+ TCGType type = TCGOP_VECL(op) + TCG_TYPE_V64;
+ unsigned vece = TCGOP_VECE(op);
neg_op = INDEX_op_neg_vec;
- have_neg = TCG_TARGET_HAS_neg_vec;
+ have_neg = tcg_can_emit_vec_op(neg_op, type, vece) > 0;
+ } else {
+ break;
}
if (!have_neg) {
break;
diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c
index 36b4791707..30c095d3d5 100644
--- a/tcg/ppc/tcg-target.inc.c
+++ b/tcg/ppc/tcg-target.inc.c
@@ -559,12 +559,13 @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
static void tcg_out_mem_long(TCGContext *s, int opi, int opx, TCGReg rt,
TCGReg base, tcg_target_long offset);
-static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
+static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || type == TCG_TYPE_I32);
if (ret != arg) {
tcg_out32(s, OR | SAB(arg, ret, arg));
}
+ return true;
}
static inline void tcg_out_rld(TCGContext *s, int op, TCGReg ra, TCGReg rs,
diff --git a/tcg/riscv/tcg-target.inc.c b/tcg/riscv/tcg-target.inc.c
index 2932505094..6497a4dab2 100644
--- a/tcg/riscv/tcg-target.inc.c
+++ b/tcg/riscv/tcg-target.inc.c
@@ -515,10 +515,10 @@ static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
* TCG intrinsics
*/
-static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
+static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
if (ret == arg) {
- return;
+ return true;
}
switch (type) {
case TCG_TYPE_I32:
@@ -528,6 +528,7 @@ static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
default:
g_assert_not_reached();
}
+ return true;
}
static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
diff --git a/tcg/s390/tcg-target.inc.c b/tcg/s390/tcg-target.inc.c
index 3d6150b10e..331d51852c 100644
--- a/tcg/s390/tcg-target.inc.c
+++ b/tcg/s390/tcg-target.inc.c
@@ -548,7 +548,7 @@ static void tcg_out_sh32(TCGContext* s, S390Opcode op, TCGReg dest,
tcg_out_insn_RS(s, op, dest, sh_reg, 0, sh_imm);
}
-static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg dst, TCGReg src)
+static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg dst, TCGReg src)
{
if (src != dst) {
if (type == TCG_TYPE_I32) {
@@ -557,6 +557,7 @@ static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg dst, TCGReg src)
tcg_out_insn(s, RRE, LGR, dst, src);
}
}
+ return true;
}
static const S390Opcode lli_insns[4] = {
diff --git a/tcg/sparc/tcg-target.inc.c b/tcg/sparc/tcg-target.inc.c
index 7a61839dc1..83295955a7 100644
--- a/tcg/sparc/tcg-target.inc.c
+++ b/tcg/sparc/tcg-target.inc.c
@@ -407,12 +407,13 @@ static void tcg_out_arithc(TCGContext *s, TCGReg rd, TCGReg rs1,
| (val2const ? INSN_IMM13(val2) : INSN_RS2(val2)));
}
-static inline void tcg_out_mov(TCGContext *s, TCGType type,
+static inline bool tcg_out_mov(TCGContext *s, TCGType type,
TCGReg ret, TCGReg arg)
{
if (ret != arg) {
tcg_out_arith(s, ret, arg, TCG_REG_G0, ARITH_OR);
}
+ return true;
}
static inline void tcg_out_sethi(TCGContext *s, TCGReg ret, uint32_t arg)
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index 0996ef0812..338ddd9d9e 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -26,6 +26,13 @@
#define MAX_UNROLL 4
+#ifdef CONFIG_DEBUG_TCG
+static const TCGOpcode vecop_list_empty[1] = { 0 };
+#else
+#define vecop_list_empty NULL
+#endif
+
+
/* Verify vector size and alignment rules. OFS should be the OR of all
of the operand offsets so that we can check them all at once. */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
@@ -360,36 +367,69 @@ static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
* on elements of size VECE in the selected type. Do not select V64 if
* PREFER_I64 is true. Return 0 if no vector type is selected.
*/
-static TCGType choose_vector_type(TCGOpcode op, unsigned vece, uint32_t size,
- bool prefer_i64)
+static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
+ uint32_t size, bool prefer_i64)
{
if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
- if (op == 0) {
- return TCG_TYPE_V256;
- }
- /* Recall that ARM SVE allows vector sizes that are not a
+ /*
+ * Recall that ARM SVE allows vector sizes that are not a
* power of 2, but always a multiple of 16. The intent is
* that e.g. size == 80 would be expanded with 2x32 + 1x16.
* It is hard to imagine a case in which v256 is supported
* but v128 is not, but check anyway.
*/
- if (tcg_can_emit_vec_op(op, TCG_TYPE_V256, vece)
+ if (tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece)
&& (size % 32 == 0
- || tcg_can_emit_vec_op(op, TCG_TYPE_V128, vece))) {
+ || tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) {
return TCG_TYPE_V256;
}
}
if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
- && (op == 0 || tcg_can_emit_vec_op(op, TCG_TYPE_V128, vece))) {
+ && tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece)) {
return TCG_TYPE_V128;
}
if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
- && (op == 0 || tcg_can_emit_vec_op(op, TCG_TYPE_V64, vece))) {
+ && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
return TCG_TYPE_V64;
}
return 0;
}
+static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
+ uint32_t maxsz, TCGv_vec t_vec)
+{
+ uint32_t i = 0;
+
+ switch (type) {
+ case TCG_TYPE_V256:
+ /*
+ * Recall that ARM SVE allows vector sizes that are not a
+ * power of 2, but always a multiple of 16. The intent is
+ * that e.g. size == 80 would be expanded with 2x32 + 1x16.
+ */
+ for (; i + 32 <= oprsz; i += 32) {
+ tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
+ }
+ /* fallthru */
+ case TCG_TYPE_V128:
+ for (; i + 16 <= oprsz; i += 16) {
+ tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
+ }
+ break;
+ case TCG_TYPE_V64:
+ for (; i < oprsz; i += 8) {
+ tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ if (oprsz < maxsz) {
+ expand_clr(dofs + oprsz, maxsz - oprsz);
+ }
+}
+
/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
* Only one of IN_32 or IN_64 may be set;
* IN_C is used if IN_32 and IN_64 are unset.
@@ -418,7 +458,7 @@ static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
/* Implement inline with a vector type, if possible.
* Prefer integer when 64-bit host and no variable dup.
*/
- type = choose_vector_type(0, vece, oprsz,
+ type = choose_vector_type(NULL, vece, oprsz,
(TCG_TARGET_REG_BITS == 64 && in_32 == NULL
&& (in_64 == NULL || vece == MO_64)));
if (type != 0) {
@@ -429,49 +469,11 @@ static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
} else if (in_64) {
tcg_gen_dup_i64_vec(vece, t_vec, in_64);
} else {
- switch (vece) {
- case MO_8:
- tcg_gen_dup8i_vec(t_vec, in_c);
- break;
- case MO_16:
- tcg_gen_dup16i_vec(t_vec, in_c);
- break;
- case MO_32:
- tcg_gen_dup32i_vec(t_vec, in_c);
- break;
- default:
- tcg_gen_dup64i_vec(t_vec, in_c);
- break;
- }
- }
-
- i = 0;
- switch (type) {
- case TCG_TYPE_V256:
- /* Recall that ARM SVE allows vector sizes that are not a
- * power of 2, but always a multiple of 16. The intent is
- * that e.g. size == 80 would be expanded with 2x32 + 1x16.
- */
- for (; i + 32 <= oprsz; i += 32) {
- tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
- }
- /* fallthru */
- case TCG_TYPE_V128:
- for (; i + 16 <= oprsz; i += 16) {
- tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
- }
- break;
- case TCG_TYPE_V64:
- for (; i < oprsz; i += 8) {
- tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
- }
- break;
- default:
- g_assert_not_reached();
+ tcg_gen_dupi_vec(vece, t_vec, in_c);
}
-
+ do_dup_store(type, dofs, oprsz, maxsz, t_vec);
tcg_temp_free_vec(t_vec);
- goto done;
+ return;
}
/* Otherwise, inline with an integer type, unless "large". */
@@ -663,6 +665,29 @@ static void expand_3_i32(uint32_t dofs, uint32_t aofs,
tcg_temp_free_i32(t0);
}
+static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t oprsz, int32_t c, bool load_dest,
+ void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
+{
+ TCGv_i32 t0 = tcg_temp_new_i32();
+ TCGv_i32 t1 = tcg_temp_new_i32();
+ TCGv_i32 t2 = tcg_temp_new_i32();
+ uint32_t i;
+
+ for (i = 0; i < oprsz; i += 4) {
+ tcg_gen_ld_i32(t0, cpu_env, aofs + i);
+ tcg_gen_ld_i32(t1, cpu_env, bofs + i);
+ if (load_dest) {
+ tcg_gen_ld_i32(t2, cpu_env, dofs + i);
+ }
+ fni(t2, t0, t1, c);
+ tcg_gen_st_i32(t2, cpu_env, dofs + i);
+ }
+ tcg_temp_free_i32(t0);
+ tcg_temp_free_i32(t1);
+ tcg_temp_free_i32(t2);
+}
+
/* Expand OPSZ bytes worth of three-operand operations using i32 elements. */
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
uint32_t cofs, uint32_t oprsz, bool write_aofs,
@@ -770,6 +795,29 @@ static void expand_3_i64(uint32_t dofs, uint32_t aofs,
tcg_temp_free_i64(t0);
}
+static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t oprsz, int64_t c, bool load_dest,
+ void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
+{
+ TCGv_i64 t0 = tcg_temp_new_i64();
+ TCGv_i64 t1 = tcg_temp_new_i64();
+ TCGv_i64 t2 = tcg_temp_new_i64();
+ uint32_t i;
+
+ for (i = 0; i < oprsz; i += 8) {
+ tcg_gen_ld_i64(t0, cpu_env, aofs + i);
+ tcg_gen_ld_i64(t1, cpu_env, bofs + i);
+ if (load_dest) {
+ tcg_gen_ld_i64(t2, cpu_env, dofs + i);
+ }
+ fni(t2, t0, t1, c);
+ tcg_gen_st_i64(t2, cpu_env, dofs + i);
+ }
+ tcg_temp_free_i64(t0);
+ tcg_temp_free_i64(t1);
+ tcg_temp_free_i64(t2);
+}
+
/* Expand OPSZ bytes worth of three-operand operations using i64 elements. */
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
uint32_t cofs, uint32_t oprsz, bool write_aofs,
@@ -883,6 +931,35 @@ static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
tcg_temp_free_vec(t0);
}
+/*
+ * Expand OPSZ bytes worth of three-vector operands and an immediate operand
+ * using host vectors.
+ */
+static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t tysz,
+ TCGType type, int64_t c, bool load_dest,
+ void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
+ int64_t))
+{
+ TCGv_vec t0 = tcg_temp_new_vec(type);
+ TCGv_vec t1 = tcg_temp_new_vec(type);
+ TCGv_vec t2 = tcg_temp_new_vec(type);
+ uint32_t i;
+
+ for (i = 0; i < oprsz; i += tysz) {
+ tcg_gen_ld_vec(t0, cpu_env, aofs + i);
+ tcg_gen_ld_vec(t1, cpu_env, bofs + i);
+ if (load_dest) {
+ tcg_gen_ld_vec(t2, cpu_env, dofs + i);
+ }
+ fni(vece, t2, t0, t1, c);
+ tcg_gen_st_vec(t2, cpu_env, dofs + i);
+ }
+ tcg_temp_free_vec(t0);
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t2);
+}
+
/* Expand OPSZ bytes worth of four-operand operations using host vectors. */
static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t cofs, uint32_t oprsz,
@@ -916,6 +993,8 @@ static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
{
+ const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
+ const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
TCGType type;
uint32_t some;
@@ -924,7 +1003,7 @@ void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
type = 0;
if (g->fniv) {
- type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
+ type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
}
switch (type) {
case TCG_TYPE_V256:
@@ -957,13 +1036,14 @@ void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
} else {
assert(g->fno != NULL);
tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
- return;
+ oprsz = maxsz;
}
break;
default:
g_assert_not_reached();
}
+ tcg_swap_vecop_list(hold_list);
if (oprsz < maxsz) {
expand_clr(dofs + oprsz, maxsz - oprsz);
@@ -974,6 +1054,8 @@ void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
uint32_t maxsz, int64_t c, const GVecGen2i *g)
{
+ const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
+ const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
TCGType type;
uint32_t some;
@@ -982,7 +1064,7 @@ void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
type = 0;
if (g->fniv) {
- type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
+ type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
}
switch (type) {
case TCG_TYPE_V256:
@@ -1024,13 +1106,14 @@ void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
maxsz, c, g->fnoi);
tcg_temp_free_i64(tcg_c);
}
- return;
+ oprsz = maxsz;
}
break;
default:
g_assert_not_reached();
}
+ tcg_swap_vecop_list(hold_list);
if (oprsz < maxsz) {
expand_clr(dofs + oprsz, maxsz - oprsz);
@@ -1048,9 +1131,11 @@ void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
type = 0;
if (g->fniv) {
- type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
+ type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
}
if (type != 0) {
+ const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
+ const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
TCGv_vec t_vec = tcg_temp_new_vec(type);
uint32_t some;
@@ -1088,6 +1173,7 @@ void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
g_assert_not_reached();
}
tcg_temp_free_vec(t_vec);
+ tcg_swap_vecop_list(hold_list);
} else if (g->fni8 && check_size_impl(oprsz, 8)) {
TCGv_i64 t64 = tcg_temp_new_i64();
@@ -1115,6 +1201,8 @@ void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
{
+ const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
+ const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
TCGType type;
uint32_t some;
@@ -1123,7 +1211,7 @@ void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
type = 0;
if (g->fniv) {
- type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
+ type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
}
switch (type) {
case TCG_TYPE_V256:
@@ -1161,13 +1249,81 @@ void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
assert(g->fno != NULL);
tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
maxsz, g->data, g->fno);
- return;
+ oprsz = maxsz;
+ }
+ break;
+
+ default:
+ g_assert_not_reached();
+ }
+ tcg_swap_vecop_list(hold_list);
+
+ if (oprsz < maxsz) {
+ expand_clr(dofs + oprsz, maxsz - oprsz);
+ }
+}
+
+/* Expand a vector operation with three vectors and an immediate. */
+void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t oprsz, uint32_t maxsz, int64_t c,
+ const GVecGen3i *g)
+{
+ const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
+ const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
+ TCGType type;
+ uint32_t some;
+
+ check_size_align(oprsz, maxsz, dofs | aofs | bofs);
+ check_overlap_3(dofs, aofs, bofs, maxsz);
+
+ type = 0;
+ if (g->fniv) {
+ type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
+ }
+ switch (type) {
+ case TCG_TYPE_V256:
+ /*
+ * Recall that ARM SVE allows vector sizes that are not a
+ * power of 2, but always a multiple of 16. The intent is
+ * that e.g. size == 80 would be expanded with 2x32 + 1x16.
+ */
+ some = QEMU_ALIGN_DOWN(oprsz, 32);
+ expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
+ c, g->load_dest, g->fniv);
+ if (some == oprsz) {
+ break;
+ }
+ dofs += some;
+ aofs += some;
+ bofs += some;
+ oprsz -= some;
+ maxsz -= some;
+ /* fallthru */
+ case TCG_TYPE_V128:
+ expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
+ c, g->load_dest, g->fniv);
+ break;
+ case TCG_TYPE_V64:
+ expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
+ c, g->load_dest, g->fniv);
+ break;
+
+ case 0:
+ if (g->fni8 && check_size_impl(oprsz, 8)) {
+ expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
+ } else if (g->fni4 && check_size_impl(oprsz, 4)) {
+ expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
+ } else {
+ assert(g->fno != NULL);
+ tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
+ oprsz = maxsz;
}
break;
default:
g_assert_not_reached();
}
+ tcg_swap_vecop_list(hold_list);
if (oprsz < maxsz) {
expand_clr(dofs + oprsz, maxsz - oprsz);
@@ -1178,6 +1334,8 @@ void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
{
+ const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
+ const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
TCGType type;
uint32_t some;
@@ -1186,7 +1344,7 @@ void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
type = 0;
if (g->fniv) {
- type = choose_vector_type(g->opc, g->vece, oprsz, g->prefer_i64);
+ type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
}
switch (type) {
case TCG_TYPE_V256:
@@ -1227,13 +1385,14 @@ void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
assert(g->fno != NULL);
tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
oprsz, maxsz, g->data, g->fno);
- return;
+ oprsz = maxsz;
}
break;
default:
g_assert_not_reached();
}
+ tcg_swap_vecop_list(hold_list);
if (oprsz < maxsz) {
expand_clr(dofs + oprsz, maxsz - oprsz);
@@ -1287,6 +1446,16 @@ void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t oprsz, uint32_t maxsz)
{
+ if (vece <= MO_64) {
+ TCGType type = choose_vector_type(0, vece, oprsz, 0);
+ if (type != 0) {
+ TCGv_vec t_vec = tcg_temp_new_vec(type);
+ tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
+ do_dup_store(type, dofs, oprsz, maxsz, t_vec);
+ tcg_temp_free_vec(t_vec);
+ return;
+ }
+ }
if (vece <= MO_32) {
TCGv_i32 in = tcg_temp_new_i32();
switch (vece) {
@@ -1428,6 +1597,8 @@ void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
tcg_temp_free_i64(t2);
}
+static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
+
void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
@@ -1435,22 +1606,22 @@ void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
{ .fni8 = tcg_gen_vec_add8_i64,
.fniv = tcg_gen_add_vec,
.fno = gen_helper_gvec_add8,
- .opc = INDEX_op_add_vec,
+ .opt_opc = vecop_list_add,
.vece = MO_8 },
{ .fni8 = tcg_gen_vec_add16_i64,
.fniv = tcg_gen_add_vec,
.fno = gen_helper_gvec_add16,
- .opc = INDEX_op_add_vec,
+ .opt_opc = vecop_list_add,
.vece = MO_16 },
{ .fni4 = tcg_gen_add_i32,
.fniv = tcg_gen_add_vec,
.fno = gen_helper_gvec_add32,
- .opc = INDEX_op_add_vec,
+ .opt_opc = vecop_list_add,
.vece = MO_32 },
{ .fni8 = tcg_gen_add_i64,
.fniv = tcg_gen_add_vec,
.fno = gen_helper_gvec_add64,
- .opc = INDEX_op_add_vec,
+ .opt_opc = vecop_list_add,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
.vece = MO_64 },
};
@@ -1466,22 +1637,22 @@ void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
{ .fni8 = tcg_gen_vec_add8_i64,
.fniv = tcg_gen_add_vec,
.fno = gen_helper_gvec_adds8,
- .opc = INDEX_op_add_vec,
+ .opt_opc = vecop_list_add,
.vece = MO_8 },
{ .fni8 = tcg_gen_vec_add16_i64,
.fniv = tcg_gen_add_vec,
.fno = gen_helper_gvec_adds16,
- .opc = INDEX_op_add_vec,
+ .opt_opc = vecop_list_add,
.vece = MO_16 },
{ .fni4 = tcg_gen_add_i32,
.fniv = tcg_gen_add_vec,
.fno = gen_helper_gvec_adds32,
- .opc = INDEX_op_add_vec,
+ .opt_opc = vecop_list_add,
.vece = MO_32 },
{ .fni8 = tcg_gen_add_i64,
.fniv = tcg_gen_add_vec,
.fno = gen_helper_gvec_adds64,
- .opc = INDEX_op_add_vec,
+ .opt_opc = vecop_list_add,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
.vece = MO_64 },
};
@@ -1498,6 +1669,8 @@ void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
tcg_temp_free_i64(tmp);
}
+static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
+
void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
{
@@ -1505,22 +1678,22 @@ void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
{ .fni8 = tcg_gen_vec_sub8_i64,
.fniv = tcg_gen_sub_vec,
.fno = gen_helper_gvec_subs8,
- .opc = INDEX_op_sub_vec,
+ .opt_opc = vecop_list_sub,
.vece = MO_8 },
{ .fni8 = tcg_gen_vec_sub16_i64,
.fniv = tcg_gen_sub_vec,
.fno = gen_helper_gvec_subs16,
- .opc = INDEX_op_sub_vec,
+ .opt_opc = vecop_list_sub,
.vece = MO_16 },
{ .fni4 = tcg_gen_sub_i32,
.fniv = tcg_gen_sub_vec,
.fno = gen_helper_gvec_subs32,
- .opc = INDEX_op_sub_vec,
+ .opt_opc = vecop_list_sub,
.vece = MO_32 },
{ .fni8 = tcg_gen_sub_i64,
.fniv = tcg_gen_sub_vec,
.fno = gen_helper_gvec_subs64,
- .opc = INDEX_op_sub_vec,
+ .opt_opc = vecop_list_sub,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
.vece = MO_64 },
};
@@ -1584,22 +1757,22 @@ void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
{ .fni8 = tcg_gen_vec_sub8_i64,
.fniv = tcg_gen_sub_vec,
.fno = gen_helper_gvec_sub8,
- .opc = INDEX_op_sub_vec,
+ .opt_opc = vecop_list_sub,
.vece = MO_8 },
{ .fni8 = tcg_gen_vec_sub16_i64,
.fniv = tcg_gen_sub_vec,
.fno = gen_helper_gvec_sub16,
- .opc = INDEX_op_sub_vec,
+ .opt_opc = vecop_list_sub,
.vece = MO_16 },
{ .fni4 = tcg_gen_sub_i32,
.fniv = tcg_gen_sub_vec,
.fno = gen_helper_gvec_sub32,
- .opc = INDEX_op_sub_vec,
+ .opt_opc = vecop_list_sub,
.vece = MO_32 },
{ .fni8 = tcg_gen_sub_i64,
.fniv = tcg_gen_sub_vec,
.fno = gen_helper_gvec_sub64,
- .opc = INDEX_op_sub_vec,
+ .opt_opc = vecop_list_sub,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
.vece = MO_64 },
};
@@ -1608,27 +1781,29 @@ void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}
+static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
+
void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
static const GVecGen3 g[4] = {
{ .fniv = tcg_gen_mul_vec,
.fno = gen_helper_gvec_mul8,
- .opc = INDEX_op_mul_vec,
+ .opt_opc = vecop_list_mul,
.vece = MO_8 },
{ .fniv = tcg_gen_mul_vec,
.fno = gen_helper_gvec_mul16,
- .opc = INDEX_op_mul_vec,
+ .opt_opc = vecop_list_mul,
.vece = MO_16 },
{ .fni4 = tcg_gen_mul_i32,
.fniv = tcg_gen_mul_vec,
.fno = gen_helper_gvec_mul32,
- .opc = INDEX_op_mul_vec,
+ .opt_opc = vecop_list_mul,
.vece = MO_32 },
{ .fni8 = tcg_gen_mul_i64,
.fniv = tcg_gen_mul_vec,
.fno = gen_helper_gvec_mul64,
- .opc = INDEX_op_mul_vec,
+ .opt_opc = vecop_list_mul,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
.vece = MO_64 },
};
@@ -1643,21 +1818,21 @@ void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
static const GVecGen2s g[4] = {
{ .fniv = tcg_gen_mul_vec,
.fno = gen_helper_gvec_muls8,
- .opc = INDEX_op_mul_vec,
+ .opt_opc = vecop_list_mul,
.vece = MO_8 },
{ .fniv = tcg_gen_mul_vec,
.fno = gen_helper_gvec_muls16,
- .opc = INDEX_op_mul_vec,
+ .opt_opc = vecop_list_mul,
.vece = MO_16 },
{ .fni4 = tcg_gen_mul_i32,
.fniv = tcg_gen_mul_vec,
.fno = gen_helper_gvec_muls32,
- .opc = INDEX_op_mul_vec,
+ .opt_opc = vecop_list_mul,
.vece = MO_32 },
{ .fni8 = tcg_gen_mul_i64,
.fniv = tcg_gen_mul_vec,
.fno = gen_helper_gvec_muls64,
- .opc = INDEX_op_mul_vec,
+ .opt_opc = vecop_list_mul,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
.vece = MO_64 },
};
@@ -1677,22 +1852,23 @@ void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
+ static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
static const GVecGen3 g[4] = {
{ .fniv = tcg_gen_ssadd_vec,
.fno = gen_helper_gvec_ssadd8,
- .opc = INDEX_op_ssadd_vec,
+ .opt_opc = vecop_list,
.vece = MO_8 },
{ .fniv = tcg_gen_ssadd_vec,
.fno = gen_helper_gvec_ssadd16,
- .opc = INDEX_op_ssadd_vec,
+ .opt_opc = vecop_list,
.vece = MO_16 },
{ .fniv = tcg_gen_ssadd_vec,
.fno = gen_helper_gvec_ssadd32,
- .opc = INDEX_op_ssadd_vec,
+ .opt_opc = vecop_list,
.vece = MO_32 },
{ .fniv = tcg_gen_ssadd_vec,
.fno = gen_helper_gvec_ssadd64,
- .opc = INDEX_op_ssadd_vec,
+ .opt_opc = vecop_list,
.vece = MO_64 },
};
tcg_debug_assert(vece <= MO_64);
@@ -1702,22 +1878,23 @@ void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
+ static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
static const GVecGen3 g[4] = {
{ .fniv = tcg_gen_sssub_vec,
.fno = gen_helper_gvec_sssub8,
- .opc = INDEX_op_sssub_vec,
+ .opt_opc = vecop_list,
.vece = MO_8 },
{ .fniv = tcg_gen_sssub_vec,
.fno = gen_helper_gvec_sssub16,
- .opc = INDEX_op_sssub_vec,
+ .opt_opc = vecop_list,
.vece = MO_16 },
{ .fniv = tcg_gen_sssub_vec,
.fno = gen_helper_gvec_sssub32,
- .opc = INDEX_op_sssub_vec,
+ .opt_opc = vecop_list,
.vece = MO_32 },
{ .fniv = tcg_gen_sssub_vec,
.fno = gen_helper_gvec_sssub64,
- .opc = INDEX_op_sssub_vec,
+ .opt_opc = vecop_list,
.vece = MO_64 },
};
tcg_debug_assert(vece <= MO_64);
@@ -1743,24 +1920,25 @@ static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
+ static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
static const GVecGen3 g[4] = {
{ .fniv = tcg_gen_usadd_vec,
.fno = gen_helper_gvec_usadd8,
- .opc = INDEX_op_usadd_vec,
+ .opt_opc = vecop_list,
.vece = MO_8 },
{ .fniv = tcg_gen_usadd_vec,
.fno = gen_helper_gvec_usadd16,
- .opc = INDEX_op_usadd_vec,
+ .opt_opc = vecop_list,
.vece = MO_16 },
{ .fni4 = tcg_gen_usadd_i32,
.fniv = tcg_gen_usadd_vec,
.fno = gen_helper_gvec_usadd32,
- .opc = INDEX_op_usadd_vec,
+ .opt_opc = vecop_list,
.vece = MO_32 },
{ .fni8 = tcg_gen_usadd_i64,
.fniv = tcg_gen_usadd_vec,
.fno = gen_helper_gvec_usadd64,
- .opc = INDEX_op_usadd_vec,
+ .opt_opc = vecop_list,
.vece = MO_64 }
};
tcg_debug_assert(vece <= MO_64);
@@ -1786,24 +1964,25 @@ static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
+ static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
static const GVecGen3 g[4] = {
{ .fniv = tcg_gen_ussub_vec,
.fno = gen_helper_gvec_ussub8,
- .opc = INDEX_op_ussub_vec,
+ .opt_opc = vecop_list,
.vece = MO_8 },
{ .fniv = tcg_gen_ussub_vec,
.fno = gen_helper_gvec_ussub16,
- .opc = INDEX_op_ussub_vec,
+ .opt_opc = vecop_list,
.vece = MO_16 },
{ .fni4 = tcg_gen_ussub_i32,
.fniv = tcg_gen_ussub_vec,
.fno = gen_helper_gvec_ussub32,
- .opc = INDEX_op_ussub_vec,
+ .opt_opc = vecop_list,
.vece = MO_32 },
{ .fni8 = tcg_gen_ussub_i64,
.fniv = tcg_gen_ussub_vec,
.fno = gen_helper_gvec_ussub64,
- .opc = INDEX_op_ussub_vec,
+ .opt_opc = vecop_list,
.vece = MO_64 }
};
tcg_debug_assert(vece <= MO_64);
@@ -1813,24 +1992,25 @@ void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
+ static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
static const GVecGen3 g[4] = {
{ .fniv = tcg_gen_smin_vec,
.fno = gen_helper_gvec_smin8,
- .opc = INDEX_op_smin_vec,
+ .opt_opc = vecop_list,
.vece = MO_8 },
{ .fniv = tcg_gen_smin_vec,
.fno = gen_helper_gvec_smin16,
- .opc = INDEX_op_smin_vec,
+ .opt_opc = vecop_list,
.vece = MO_16 },
{ .fni4 = tcg_gen_smin_i32,
.fniv = tcg_gen_smin_vec,
.fno = gen_helper_gvec_smin32,
- .opc = INDEX_op_smin_vec,
+ .opt_opc = vecop_list,
.vece = MO_32 },
{ .fni8 = tcg_gen_smin_i64,
.fniv = tcg_gen_smin_vec,
.fno = gen_helper_gvec_smin64,
- .opc = INDEX_op_smin_vec,
+ .opt_opc = vecop_list,
.vece = MO_64 }
};
tcg_debug_assert(vece <= MO_64);
@@ -1840,24 +2020,25 @@ void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
+ static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
static const GVecGen3 g[4] = {
{ .fniv = tcg_gen_umin_vec,
.fno = gen_helper_gvec_umin8,
- .opc = INDEX_op_umin_vec,
+ .opt_opc = vecop_list,
.vece = MO_8 },
{ .fniv = tcg_gen_umin_vec,
.fno = gen_helper_gvec_umin16,
- .opc = INDEX_op_umin_vec,
+ .opt_opc = vecop_list,
.vece = MO_16 },
{ .fni4 = tcg_gen_umin_i32,
.fniv = tcg_gen_umin_vec,
.fno = gen_helper_gvec_umin32,
- .opc = INDEX_op_umin_vec,
+ .opt_opc = vecop_list,
.vece = MO_32 },
{ .fni8 = tcg_gen_umin_i64,
.fniv = tcg_gen_umin_vec,
.fno = gen_helper_gvec_umin64,
- .opc = INDEX_op_umin_vec,
+ .opt_opc = vecop_list,
.vece = MO_64 }
};
tcg_debug_assert(vece <= MO_64);
@@ -1867,24 +2048,25 @@ void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
+ static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
static const GVecGen3 g[4] = {
{ .fniv = tcg_gen_smax_vec,
.fno = gen_helper_gvec_smax8,
- .opc = INDEX_op_smax_vec,
+ .opt_opc = vecop_list,
.vece = MO_8 },
{ .fniv = tcg_gen_smax_vec,
.fno = gen_helper_gvec_smax16,
- .opc = INDEX_op_smax_vec,
+ .opt_opc = vecop_list,
.vece = MO_16 },
{ .fni4 = tcg_gen_smax_i32,
.fniv = tcg_gen_smax_vec,
.fno = gen_helper_gvec_smax32,
- .opc = INDEX_op_smax_vec,
+ .opt_opc = vecop_list,
.vece = MO_32 },
{ .fni8 = tcg_gen_smax_i64,
.fniv = tcg_gen_smax_vec,
.fno = gen_helper_gvec_smax64,
- .opc = INDEX_op_smax_vec,
+ .opt_opc = vecop_list,
.vece = MO_64 }
};
tcg_debug_assert(vece <= MO_64);
@@ -1894,24 +2076,25 @@ void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
+ static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
static const GVecGen3 g[4] = {
{ .fniv = tcg_gen_umax_vec,
.fno = gen_helper_gvec_umax8,
- .opc = INDEX_op_umax_vec,
+ .opt_opc = vecop_list,
.vece = MO_8 },
{ .fniv = tcg_gen_umax_vec,
.fno = gen_helper_gvec_umax16,
- .opc = INDEX_op_umax_vec,
+ .opt_opc = vecop_list,
.vece = MO_16 },
{ .fni4 = tcg_gen_umax_i32,
.fniv = tcg_gen_umax_vec,
.fno = gen_helper_gvec_umax32,
- .opc = INDEX_op_umax_vec,
+ .opt_opc = vecop_list,
.vece = MO_32 },
{ .fni8 = tcg_gen_umax_i64,
.fniv = tcg_gen_umax_vec,
.fno = gen_helper_gvec_umax64,
- .opc = INDEX_op_umax_vec,
+ .opt_opc = vecop_list,
.vece = MO_64 }
};
tcg_debug_assert(vece <= MO_64);
@@ -1965,26 +2148,90 @@ void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t oprsz, uint32_t maxsz)
{
+ static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
static const GVecGen2 g[4] = {
{ .fni8 = tcg_gen_vec_neg8_i64,
.fniv = tcg_gen_neg_vec,
.fno = gen_helper_gvec_neg8,
- .opc = INDEX_op_neg_vec,
+ .opt_opc = vecop_list,
.vece = MO_8 },
{ .fni8 = tcg_gen_vec_neg16_i64,
.fniv = tcg_gen_neg_vec,
.fno = gen_helper_gvec_neg16,
- .opc = INDEX_op_neg_vec,
+ .opt_opc = vecop_list,
.vece = MO_16 },
{ .fni4 = tcg_gen_neg_i32,
.fniv = tcg_gen_neg_vec,
.fno = gen_helper_gvec_neg32,
- .opc = INDEX_op_neg_vec,
+ .opt_opc = vecop_list,
.vece = MO_32 },
{ .fni8 = tcg_gen_neg_i64,
.fniv = tcg_gen_neg_vec,
.fno = gen_helper_gvec_neg64,
- .opc = INDEX_op_neg_vec,
+ .opt_opc = vecop_list,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .vece = MO_64 },
+ };
+
+ tcg_debug_assert(vece <= MO_64);
+ tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
+}
+
+static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
+{
+ TCGv_i64 t = tcg_temp_new_i64();
+ int nbit = 8 << vece;
+
+ /* Create -1 for each negative element. */
+ tcg_gen_shri_i64(t, b, nbit - 1);
+ tcg_gen_andi_i64(t, t, dup_const(vece, 1));
+ tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
+
+ /*
+ * Invert (via xor -1) and add one (via sub -1).
+ * Because of the ordering the msb is cleared,
+ * so we never have carry into the next element.
+ */
+ tcg_gen_xor_i64(d, b, t);
+ tcg_gen_sub_i64(d, d, t);
+
+ tcg_temp_free_i64(t);
+}
+
+static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
+{
+ gen_absv_mask(d, b, MO_8);
+}
+
+static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
+{
+ gen_absv_mask(d, b, MO_16);
+}
+
+void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t oprsz, uint32_t maxsz)
+{
+ static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
+ static const GVecGen2 g[4] = {
+ { .fni8 = tcg_gen_vec_abs8_i64,
+ .fniv = tcg_gen_abs_vec,
+ .fno = gen_helper_gvec_abs8,
+ .opt_opc = vecop_list,
+ .vece = MO_8 },
+ { .fni8 = tcg_gen_vec_abs16_i64,
+ .fniv = tcg_gen_abs_vec,
+ .fno = gen_helper_gvec_abs16,
+ .opt_opc = vecop_list,
+ .vece = MO_16 },
+ { .fni4 = tcg_gen_abs_i32,
+ .fniv = tcg_gen_abs_vec,
+ .fno = gen_helper_gvec_abs32,
+ .opt_opc = vecop_list,
+ .vece = MO_32 },
+ { .fni8 = tcg_gen_abs_i64,
+ .fniv = tcg_gen_abs_vec,
+ .fno = gen_helper_gvec_abs64,
+ .opt_opc = vecop_list,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
.vece = MO_64 },
};
@@ -2000,7 +2247,6 @@ void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
.fni8 = tcg_gen_and_i64,
.fniv = tcg_gen_and_vec,
.fno = gen_helper_gvec_and,
- .opc = INDEX_op_and_vec,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
};
@@ -2018,7 +2264,6 @@ void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
.fni8 = tcg_gen_or_i64,
.fniv = tcg_gen_or_vec,
.fno = gen_helper_gvec_or,
- .opc = INDEX_op_or_vec,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
};
@@ -2036,7 +2281,6 @@ void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
.fni8 = tcg_gen_xor_i64,
.fniv = tcg_gen_xor_vec,
.fno = gen_helper_gvec_xor,
- .opc = INDEX_op_xor_vec,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
};
@@ -2054,7 +2298,6 @@ void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
.fni8 = tcg_gen_andc_i64,
.fniv = tcg_gen_andc_vec,
.fno = gen_helper_gvec_andc,
- .opc = INDEX_op_andc_vec,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
};
@@ -2072,7 +2315,6 @@ void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
.fni8 = tcg_gen_orc_i64,
.fniv = tcg_gen_orc_vec,
.fno = gen_helper_gvec_orc,
- .opc = INDEX_op_orc_vec,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
};
@@ -2138,7 +2380,6 @@ static const GVecGen2s gop_ands = {
.fni8 = tcg_gen_and_i64,
.fniv = tcg_gen_and_vec,
.fno = gen_helper_gvec_ands,
- .opc = INDEX_op_and_vec,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
.vece = MO_64
};
@@ -2164,7 +2405,6 @@ static const GVecGen2s gop_xors = {
.fni8 = tcg_gen_xor_i64,
.fniv = tcg_gen_xor_vec,
.fno = gen_helper_gvec_xors,
- .opc = INDEX_op_xor_vec,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
.vece = MO_64
};
@@ -2190,7 +2430,6 @@ static const GVecGen2s gop_ors = {
.fni8 = tcg_gen_or_i64,
.fniv = tcg_gen_or_vec,
.fno = gen_helper_gvec_ors,
- .opc = INDEX_op_or_vec,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
.vece = MO_64
};
@@ -2229,26 +2468,27 @@ void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
+ static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
static const GVecGen2i g[4] = {
{ .fni8 = tcg_gen_vec_shl8i_i64,
.fniv = tcg_gen_shli_vec,
.fno = gen_helper_gvec_shl8i,
- .opc = INDEX_op_shli_vec,
+ .opt_opc = vecop_list,
.vece = MO_8 },
{ .fni8 = tcg_gen_vec_shl16i_i64,
.fniv = tcg_gen_shli_vec,
.fno = gen_helper_gvec_shl16i,
- .opc = INDEX_op_shli_vec,
+ .opt_opc = vecop_list,
.vece = MO_16 },
{ .fni4 = tcg_gen_shli_i32,
.fniv = tcg_gen_shli_vec,
.fno = gen_helper_gvec_shl32i,
- .opc = INDEX_op_shli_vec,
+ .opt_opc = vecop_list,
.vece = MO_32 },
{ .fni8 = tcg_gen_shli_i64,
.fniv = tcg_gen_shli_vec,
.fno = gen_helper_gvec_shl64i,
- .opc = INDEX_op_shli_vec,
+ .opt_opc = vecop_list,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
.vece = MO_64 },
};
@@ -2279,26 +2519,27 @@ void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
+ static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
static const GVecGen2i g[4] = {
{ .fni8 = tcg_gen_vec_shr8i_i64,
.fniv = tcg_gen_shri_vec,
.fno = gen_helper_gvec_shr8i,
- .opc = INDEX_op_shri_vec,
+ .opt_opc = vecop_list,
.vece = MO_8 },
{ .fni8 = tcg_gen_vec_shr16i_i64,
.fniv = tcg_gen_shri_vec,
.fno = gen_helper_gvec_shr16i,
- .opc = INDEX_op_shri_vec,
+ .opt_opc = vecop_list,
.vece = MO_16 },
{ .fni4 = tcg_gen_shri_i32,
.fniv = tcg_gen_shri_vec,
.fno = gen_helper_gvec_shr32i,
- .opc = INDEX_op_shri_vec,
+ .opt_opc = vecop_list,
.vece = MO_32 },
{ .fni8 = tcg_gen_shri_i64,
.fniv = tcg_gen_shri_vec,
.fno = gen_helper_gvec_shr64i,
- .opc = INDEX_op_shri_vec,
+ .opt_opc = vecop_list,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
.vece = MO_64 },
};
@@ -2343,26 +2584,27 @@ void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
+ static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
static const GVecGen2i g[4] = {
{ .fni8 = tcg_gen_vec_sar8i_i64,
.fniv = tcg_gen_sari_vec,
.fno = gen_helper_gvec_sar8i,
- .opc = INDEX_op_sari_vec,
+ .opt_opc = vecop_list,
.vece = MO_8 },
{ .fni8 = tcg_gen_vec_sar16i_i64,
.fniv = tcg_gen_sari_vec,
.fno = gen_helper_gvec_sar16i,
- .opc = INDEX_op_sari_vec,
+ .opt_opc = vecop_list,
.vece = MO_16 },
{ .fni4 = tcg_gen_sari_i32,
.fniv = tcg_gen_sari_vec,
.fno = gen_helper_gvec_sar32i,
- .opc = INDEX_op_sari_vec,
+ .opt_opc = vecop_list,
.vece = MO_32 },
{ .fni8 = tcg_gen_sari_i64,
.fniv = tcg_gen_sari_vec,
.fno = gen_helper_gvec_sar64i,
- .opc = INDEX_op_sari_vec,
+ .opt_opc = vecop_list,
.prefer_i64 = TCG_TARGET_REG_BITS == 64,
.vece = MO_64 },
};
@@ -2376,6 +2618,415 @@ void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
}
}
+/*
+ * Specialized generation vector shifts by a non-constant scalar.
+ */
+
+typedef struct {
+ void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
+ void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
+ void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
+ void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
+ gen_helper_gvec_2 *fno[4];
+ TCGOpcode s_list[2];
+ TCGOpcode v_list[2];
+} GVecGen2sh;
+
+static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t oprsz, uint32_t tysz, TCGType type,
+ TCGv_i32 shift,
+ void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
+{
+ TCGv_vec t0 = tcg_temp_new_vec(type);
+ uint32_t i;
+
+ for (i = 0; i < oprsz; i += tysz) {
+ tcg_gen_ld_vec(t0, cpu_env, aofs + i);
+ fni(vece, t0, t0, shift);
+ tcg_gen_st_vec(t0, cpu_env, dofs + i);
+ }
+ tcg_temp_free_vec(t0);
+}
+
+static void
+do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
+ uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
+{
+ TCGType type;
+ uint32_t some;
+
+ check_size_align(oprsz, maxsz, dofs | aofs);
+ check_overlap_2(dofs, aofs, maxsz);
+
+ /* If the backend has a scalar expansion, great. */
+ type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
+ if (type) {
+ const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
+ switch (type) {
+ case TCG_TYPE_V256:
+ some = QEMU_ALIGN_DOWN(oprsz, 32);
+ expand_2sh_vec(vece, dofs, aofs, some, 32,
+ TCG_TYPE_V256, shift, g->fniv_s);
+ if (some == oprsz) {
+ break;
+ }
+ dofs += some;
+ aofs += some;
+ oprsz -= some;
+ maxsz -= some;
+ /* fallthru */
+ case TCG_TYPE_V128:
+ expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
+ TCG_TYPE_V128, shift, g->fniv_s);
+ break;
+ case TCG_TYPE_V64:
+ expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
+ TCG_TYPE_V64, shift, g->fniv_s);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ tcg_swap_vecop_list(hold_list);
+ goto clear_tail;
+ }
+
+ /* If the backend supports variable vector shifts, also cool. */
+ type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
+ if (type) {
+ const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
+ TCGv_vec v_shift = tcg_temp_new_vec(type);
+
+ if (vece == MO_64) {
+ TCGv_i64 sh64 = tcg_temp_new_i64();
+ tcg_gen_extu_i32_i64(sh64, shift);
+ tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
+ tcg_temp_free_i64(sh64);
+ } else {
+ tcg_gen_dup_i32_vec(vece, v_shift, shift);
+ }
+
+ switch (type) {
+ case TCG_TYPE_V256:
+ some = QEMU_ALIGN_DOWN(oprsz, 32);
+ expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
+ v_shift, false, g->fniv_v);
+ if (some == oprsz) {
+ break;
+ }
+ dofs += some;
+ aofs += some;
+ oprsz -= some;
+ maxsz -= some;
+ /* fallthru */
+ case TCG_TYPE_V128:
+ expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
+ v_shift, false, g->fniv_v);
+ break;
+ case TCG_TYPE_V64:
+ expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
+ v_shift, false, g->fniv_v);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ tcg_temp_free_vec(v_shift);
+ tcg_swap_vecop_list(hold_list);
+ goto clear_tail;
+ }
+
+ /* Otherwise fall back to integral... */
+ if (vece == MO_32 && check_size_impl(oprsz, 4)) {
+ expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
+ } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
+ TCGv_i64 sh64 = tcg_temp_new_i64();
+ tcg_gen_extu_i32_i64(sh64, shift);
+ expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
+ tcg_temp_free_i64(sh64);
+ } else {
+ TCGv_ptr a0 = tcg_temp_new_ptr();
+ TCGv_ptr a1 = tcg_temp_new_ptr();
+ TCGv_i32 desc = tcg_temp_new_i32();
+
+ tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
+ tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
+ tcg_gen_addi_ptr(a0, cpu_env, dofs);
+ tcg_gen_addi_ptr(a1, cpu_env, aofs);
+
+ g->fno[vece](a0, a1, desc);
+
+ tcg_temp_free_ptr(a0);
+ tcg_temp_free_ptr(a1);
+ tcg_temp_free_i32(desc);
+ return;
+ }
+
+ clear_tail:
+ if (oprsz < maxsz) {
+ expand_clr(dofs + oprsz, maxsz - oprsz);
+ }
+}
+
+void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
+ TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
+{
+ static const GVecGen2sh g = {
+ .fni4 = tcg_gen_shl_i32,
+ .fni8 = tcg_gen_shl_i64,
+ .fniv_s = tcg_gen_shls_vec,
+ .fniv_v = tcg_gen_shlv_vec,
+ .fno = {
+ gen_helper_gvec_shl8i,
+ gen_helper_gvec_shl16i,
+ gen_helper_gvec_shl32i,
+ gen_helper_gvec_shl64i,
+ },
+ .s_list = { INDEX_op_shls_vec, 0 },
+ .v_list = { INDEX_op_shlv_vec, 0 },
+ };
+
+ tcg_debug_assert(vece <= MO_64);
+ do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
+}
+
+void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
+ TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
+{
+ static const GVecGen2sh g = {
+ .fni4 = tcg_gen_shr_i32,
+ .fni8 = tcg_gen_shr_i64,
+ .fniv_s = tcg_gen_shrs_vec,
+ .fniv_v = tcg_gen_shrv_vec,
+ .fno = {
+ gen_helper_gvec_shr8i,
+ gen_helper_gvec_shr16i,
+ gen_helper_gvec_shr32i,
+ gen_helper_gvec_shr64i,
+ },
+ .s_list = { INDEX_op_shrs_vec, 0 },
+ .v_list = { INDEX_op_shrv_vec, 0 },
+ };
+
+ tcg_debug_assert(vece <= MO_64);
+ do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
+}
+
+void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
+ TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
+{
+ static const GVecGen2sh g = {
+ .fni4 = tcg_gen_sar_i32,
+ .fni8 = tcg_gen_sar_i64,
+ .fniv_s = tcg_gen_sars_vec,
+ .fniv_v = tcg_gen_sarv_vec,
+ .fno = {
+ gen_helper_gvec_sar8i,
+ gen_helper_gvec_sar16i,
+ gen_helper_gvec_sar32i,
+ gen_helper_gvec_sar64i,
+ },
+ .s_list = { INDEX_op_sars_vec, 0 },
+ .v_list = { INDEX_op_sarv_vec, 0 },
+ };
+
+ tcg_debug_assert(vece <= MO_64);
+ do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
+}
+
+/*
+ * Expand D = A << (B % element bits)
+ *
+ * Unlike scalar shifts, where it is easy for the target front end
+ * to include the modulo as part of the expansion. If the target
+ * naturally includes the modulo as part of the operation, great!
+ * If the target has some other behaviour from out-of-range shifts,
+ * then it could not use this function anyway, and would need to
+ * do it's own expansion with custom functions.
+ */
+static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
+ TCGv_vec a, TCGv_vec b)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+
+ tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
+ tcg_gen_and_vec(vece, t, t, b);
+ tcg_gen_shlv_vec(vece, d, a, t);
+ tcg_temp_free_vec(t);
+}
+
+static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{
+ TCGv_i32 t = tcg_temp_new_i32();
+
+ tcg_gen_andi_i32(t, b, 31);
+ tcg_gen_shl_i32(d, a, t);
+ tcg_temp_free_i32(t);
+}
+
+static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ tcg_gen_andi_i64(t, b, 63);
+ tcg_gen_shl_i64(d, a, t);
+ tcg_temp_free_i64(t);
+}
+
+void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{
+ static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
+ static const GVecGen3 g[4] = {
+ { .fniv = tcg_gen_shlv_mod_vec,
+ .fno = gen_helper_gvec_shl8v,
+ .opt_opc = vecop_list,
+ .vece = MO_8 },
+ { .fniv = tcg_gen_shlv_mod_vec,
+ .fno = gen_helper_gvec_shl16v,
+ .opt_opc = vecop_list,
+ .vece = MO_16 },
+ { .fni4 = tcg_gen_shl_mod_i32,
+ .fniv = tcg_gen_shlv_mod_vec,
+ .fno = gen_helper_gvec_shl32v,
+ .opt_opc = vecop_list,
+ .vece = MO_32 },
+ { .fni8 = tcg_gen_shl_mod_i64,
+ .fniv = tcg_gen_shlv_mod_vec,
+ .fno = gen_helper_gvec_shl64v,
+ .opt_opc = vecop_list,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .vece = MO_64 },
+ };
+
+ tcg_debug_assert(vece <= MO_64);
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
+}
+
+/*
+ * Similarly for logical right shifts.
+ */
+
+static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
+ TCGv_vec a, TCGv_vec b)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+
+ tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
+ tcg_gen_and_vec(vece, t, t, b);
+ tcg_gen_shrv_vec(vece, d, a, t);
+ tcg_temp_free_vec(t);
+}
+
+static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{
+ TCGv_i32 t = tcg_temp_new_i32();
+
+ tcg_gen_andi_i32(t, b, 31);
+ tcg_gen_shr_i32(d, a, t);
+ tcg_temp_free_i32(t);
+}
+
+static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ tcg_gen_andi_i64(t, b, 63);
+ tcg_gen_shr_i64(d, a, t);
+ tcg_temp_free_i64(t);
+}
+
+void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{
+ static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
+ static const GVecGen3 g[4] = {
+ { .fniv = tcg_gen_shrv_mod_vec,
+ .fno = gen_helper_gvec_shr8v,
+ .opt_opc = vecop_list,
+ .vece = MO_8 },
+ { .fniv = tcg_gen_shrv_mod_vec,
+ .fno = gen_helper_gvec_shr16v,
+ .opt_opc = vecop_list,
+ .vece = MO_16 },
+ { .fni4 = tcg_gen_shr_mod_i32,
+ .fniv = tcg_gen_shrv_mod_vec,
+ .fno = gen_helper_gvec_shr32v,
+ .opt_opc = vecop_list,
+ .vece = MO_32 },
+ { .fni8 = tcg_gen_shr_mod_i64,
+ .fniv = tcg_gen_shrv_mod_vec,
+ .fno = gen_helper_gvec_shr64v,
+ .opt_opc = vecop_list,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .vece = MO_64 },
+ };
+
+ tcg_debug_assert(vece <= MO_64);
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
+}
+
+/*
+ * Similarly for arithmetic right shifts.
+ */
+
+static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
+ TCGv_vec a, TCGv_vec b)
+{
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+
+ tcg_gen_dupi_vec(vece, t, (8 << vece) - 1);
+ tcg_gen_and_vec(vece, t, t, b);
+ tcg_gen_sarv_vec(vece, d, a, t);
+ tcg_temp_free_vec(t);
+}
+
+static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{
+ TCGv_i32 t = tcg_temp_new_i32();
+
+ tcg_gen_andi_i32(t, b, 31);
+ tcg_gen_sar_i32(d, a, t);
+ tcg_temp_free_i32(t);
+}
+
+static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ tcg_gen_andi_i64(t, b, 63);
+ tcg_gen_sar_i64(d, a, t);
+ tcg_temp_free_i64(t);
+}
+
+void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{
+ static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
+ static const GVecGen3 g[4] = {
+ { .fniv = tcg_gen_sarv_mod_vec,
+ .fno = gen_helper_gvec_sar8v,
+ .opt_opc = vecop_list,
+ .vece = MO_8 },
+ { .fniv = tcg_gen_sarv_mod_vec,
+ .fno = gen_helper_gvec_sar16v,
+ .opt_opc = vecop_list,
+ .vece = MO_16 },
+ { .fni4 = tcg_gen_sar_mod_i32,
+ .fniv = tcg_gen_sarv_mod_vec,
+ .fno = gen_helper_gvec_sar32v,
+ .opt_opc = vecop_list,
+ .vece = MO_32 },
+ { .fni8 = tcg_gen_sar_mod_i64,
+ .fniv = tcg_gen_sarv_mod_vec,
+ .fno = gen_helper_gvec_sar64v,
+ .opt_opc = vecop_list,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .vece = MO_64 },
+ };
+
+ tcg_debug_assert(vece <= MO_64);
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
+}
+
/* Expand OPSZ bytes worth of three-operand operations using i32 elements. */
static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
uint32_t oprsz, TCGCond cond)
@@ -2435,6 +3086,7 @@ void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
uint32_t aofs, uint32_t bofs,
uint32_t oprsz, uint32_t maxsz)
{
+ static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
static gen_helper_gvec_3 * const eq_fn[4] = {
gen_helper_gvec_eq8, gen_helper_gvec_eq16,
gen_helper_gvec_eq32, gen_helper_gvec_eq64
@@ -2467,6 +3119,8 @@ void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
[TCG_COND_LTU] = ltu_fn,
[TCG_COND_LEU] = leu_fn,
};
+
+ const TCGOpcode *hold_list;
TCGType type;
uint32_t some;
@@ -2479,10 +3133,12 @@ void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
return;
}
- /* Implement inline with a vector type, if possible.
+ /*
+ * Implement inline with a vector type, if possible.
* Prefer integer when 64-bit host and 64-bit comparison.
*/
- type = choose_vector_type(INDEX_op_cmp_vec, vece, oprsz,
+ hold_list = tcg_swap_vecop_list(cmp_list);
+ type = choose_vector_type(cmp_list, vece, oprsz,
TCG_TARGET_REG_BITS == 64 && vece == MO_64);
switch (type) {
case TCG_TYPE_V256:
@@ -2524,13 +3180,14 @@ void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
assert(fn != NULL);
}
tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
- return;
+ oprsz = maxsz;
}
break;
default:
g_assert_not_reached();
}
+ tcg_swap_vecop_list(hold_list);
if (oprsz < maxsz) {
expand_clr(dofs + oprsz, maxsz - oprsz);
diff --git a/tcg/tcg-op-gvec.h b/tcg/tcg-op-gvec.h
index 850da32ded..52a398c190 100644
--- a/tcg/tcg-op-gvec.h
+++ b/tcg/tcg-op-gvec.h
@@ -91,8 +91,8 @@ typedef struct {
void (*fniv)(unsigned, TCGv_vec, TCGv_vec);
/* Expand out-of-line helper w/descriptor. */
gen_helper_gvec_2 *fno;
- /* The opcode, if any, to which this corresponds. */
- TCGOpcode opc;
+ /* The optional opcodes, if any, utilized by .fniv. */
+ const TCGOpcode *opt_opc;
/* The data argument to the out-of-line helper. */
int32_t data;
/* The vector element size, if applicable. */
@@ -112,8 +112,8 @@ typedef struct {
gen_helper_gvec_2 *fno;
/* Expand out-of-line helper w/descriptor, data as argument. */
gen_helper_gvec_2i *fnoi;
- /* The opcode, if any, to which this corresponds. */
- TCGOpcode opc;
+ /* The optional opcodes, if any, utilized by .fniv. */
+ const TCGOpcode *opt_opc;
/* The vector element size, if applicable. */
uint8_t vece;
/* Prefer i64 to v64. */
@@ -131,8 +131,8 @@ typedef struct {
void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
/* Expand out-of-line helper w/descriptor. */
gen_helper_gvec_2i *fno;
- /* The opcode, if any, to which this corresponds. */
- TCGOpcode opc;
+ /* The optional opcodes, if any, utilized by .fniv. */
+ const TCGOpcode *opt_opc;
/* The data argument to the out-of-line helper. */
uint32_t data;
/* The vector element size, if applicable. */
@@ -152,8 +152,8 @@ typedef struct {
void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
/* Expand out-of-line helper w/descriptor. */
gen_helper_gvec_3 *fno;
- /* The opcode, if any, to which this corresponds. */
- TCGOpcode opc;
+ /* The optional opcodes, if any, utilized by .fniv. */
+ const TCGOpcode *opt_opc;
/* The data argument to the out-of-line helper. */
int32_t data;
/* The vector element size, if applicable. */
@@ -165,6 +165,27 @@ typedef struct {
} GVecGen3;
typedef struct {
+ /*
+ * Expand inline as a 64-bit or 32-bit integer. Only one of these will be
+ * non-NULL.
+ */
+ void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t);
+ void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t);
+ /* Expand inline with a host vector type. */
+ void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, int64_t);
+ /* Expand out-of-line helper w/descriptor, data in descriptor. */
+ gen_helper_gvec_3 *fno;
+ /* The optional opcodes, if any, utilized by .fniv. */
+ const TCGOpcode *opt_opc;
+ /* The vector element size, if applicable. */
+ uint8_t vece;
+ /* Prefer i64 to v64. */
+ bool prefer_i64;
+ /* Load dest as a 3rd source operand. */
+ bool load_dest;
+} GVecGen3i;
+
+typedef struct {
/* Expand inline as a 64-bit or 32-bit integer.
Only one of these will be non-NULL. */
void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64);
@@ -173,8 +194,8 @@ typedef struct {
void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, TCGv_vec);
/* Expand out-of-line helper w/descriptor. */
gen_helper_gvec_4 *fno;
- /* The opcode, if any, to which this corresponds. */
- TCGOpcode opc;
+ /* The optional opcodes, if any, utilized by .fniv. */
+ const TCGOpcode *opt_opc;
/* The data argument to the out-of-line helper. */
int32_t data;
/* The vector element size, if applicable. */
@@ -193,6 +214,9 @@ void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
uint32_t maxsz, TCGv_i64 c, const GVecGen2s *);
void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
uint32_t oprsz, uint32_t maxsz, const GVecGen3 *);
+void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t oprsz, uint32_t maxsz, int64_t c,
+ const GVecGen3i *);
void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
uint32_t oprsz, uint32_t maxsz, const GVecGen4 *);
@@ -204,6 +228,8 @@ void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t oprsz, uint32_t maxsz);
void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t oprsz, uint32_t maxsz);
void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
@@ -294,6 +320,24 @@ void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
int64_t shift, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
+ TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
+ TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
+ TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz);
+
+/*
+ * Perform vector shift by vector element, modulo the element size.
+ * E.g. D[i] = A[i] << (B[i] % (8 << vece)).
+ */
+void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+
void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
uint32_t aofs, uint32_t bofs,
uint32_t oprsz, uint32_t maxsz);
diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
index 27f65600c3..543508d545 100644
--- a/tcg/tcg-op-vec.c
+++ b/tcg/tcg-op-vec.c
@@ -34,6 +34,98 @@ extern TCGv_i32 TCGV_HIGH_link_error(TCGv_i64);
#define TCGV_HIGH TCGV_HIGH_link_error
#endif
+/*
+ * Vector optional opcode tracking.
+ * Except for the basic logical operations (and, or, xor), and
+ * data movement (mov, ld, st, dupi), many vector opcodes are
+ * optional and may not be supported on the host. Thank Intel
+ * for the irregularity in their instruction set.
+ *
+ * The gvec expanders allow custom vector operations to be composed,
+ * generally via the .fniv callback in the GVecGen* structures. At
+ * the same time, in deciding whether to use this hook we need to
+ * know if the host supports the required operations. This is
+ * presented as an array of opcodes, terminated by 0. Each opcode
+ * is assumed to be expanded with the given VECE.
+ *
+ * For debugging, we want to validate this array. Therefore, when
+ * tcg_ctx->vec_opt_opc is non-NULL, the tcg_gen_*_vec expanders
+ * will validate that their opcode is present in the list.
+ */
+#ifdef CONFIG_DEBUG_TCG
+void tcg_assert_listed_vecop(TCGOpcode op)
+{
+ const TCGOpcode *p = tcg_ctx->vecop_list;
+ if (p) {
+ for (; *p; ++p) {
+ if (*p == op) {
+ return;
+ }
+ }
+ g_assert_not_reached();
+ }
+}
+#endif
+
+bool tcg_can_emit_vecop_list(const TCGOpcode *list,
+ TCGType type, unsigned vece)
+{
+ if (list == NULL) {
+ return true;
+ }
+
+ for (; *list; ++list) {
+ TCGOpcode opc = *list;
+
+#ifdef CONFIG_DEBUG_TCG
+ switch (opc) {
+ case INDEX_op_and_vec:
+ case INDEX_op_or_vec:
+ case INDEX_op_xor_vec:
+ case INDEX_op_mov_vec:
+ case INDEX_op_dup_vec:
+ case INDEX_op_dupi_vec:
+ case INDEX_op_dup2_vec:
+ case INDEX_op_ld_vec:
+ case INDEX_op_st_vec:
+ /* These opcodes are mandatory and should not be listed. */
+ g_assert_not_reached();
+ default:
+ break;
+ }
+#endif
+
+ if (tcg_can_emit_vec_op(opc, type, vece)) {
+ continue;
+ }
+
+ /*
+ * The opcode list is created by front ends based on what they
+ * actually invoke. We must mirror the logic in the routines
+ * below for generic expansions using other opcodes.
+ */
+ switch (opc) {
+ case INDEX_op_neg_vec:
+ if (tcg_can_emit_vec_op(INDEX_op_sub_vec, type, vece)) {
+ continue;
+ }
+ break;
+ case INDEX_op_abs_vec:
+ if (tcg_can_emit_vec_op(INDEX_op_sub_vec, type, vece)
+ && (tcg_can_emit_vec_op(INDEX_op_smax_vec, type, vece) > 0
+ || tcg_can_emit_vec_op(INDEX_op_sari_vec, type, vece) > 0
+ || tcg_can_emit_vec_op(INDEX_op_cmp_vec, type, vece))) {
+ continue;
+ }
+ break;
+ default:
+ break;
+ }
+ return false;
+ }
+ return true;
+}
+
void vec_gen_2(TCGOpcode opc, TCGType type, unsigned vece, TCGArg r, TCGArg a)
{
TCGOp *op = tcg_emit_op(opc);
@@ -194,6 +286,17 @@ void tcg_gen_dup_i32_vec(unsigned vece, TCGv_vec r, TCGv_i32 a)
vec_gen_2(INDEX_op_dup_vec, type, vece, ri, ai);
}
+void tcg_gen_dup_mem_vec(unsigned vece, TCGv_vec r, TCGv_ptr b,
+ tcg_target_long ofs)
+{
+ TCGArg ri = tcgv_vec_arg(r);
+ TCGArg bi = tcgv_ptr_arg(b);
+ TCGTemp *rt = arg_temp(ri);
+ TCGType type = rt->base_type;
+
+ vec_gen_3(INDEX_op_dupm_vec, type, vece, ri, bi, ofs);
+}
+
static void vec_gen_ldst(TCGOpcode opc, TCGv_vec r, TCGv_ptr b, TCGArg o)
{
TCGArg ri = tcgv_vec_arg(r);
@@ -226,16 +329,6 @@ void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr b, TCGArg o, TCGType low_type)
vec_gen_3(INDEX_op_st_vec, low_type, 0, ri, bi, o);
}
-void tcg_gen_add_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
-{
- vec_gen_op3(INDEX_op_add_vec, vece, r, a, b);
-}
-
-void tcg_gen_sub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
-{
- vec_gen_op3(INDEX_op_sub_vec, vece, r, a, b);
-}
-
void tcg_gen_and_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
{
vec_gen_op3(INDEX_op_and_vec, 0, r, a, b);
@@ -296,11 +389,33 @@ void tcg_gen_eqv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
tcg_gen_not_vec(0, r, r);
}
-void tcg_gen_not_vec(unsigned vece, TCGv_vec r, TCGv_vec a)
+static bool do_op2(unsigned vece, TCGv_vec r, TCGv_vec a, TCGOpcode opc)
{
- if (TCG_TARGET_HAS_not_vec) {
- vec_gen_op2(INDEX_op_not_vec, 0, r, a);
+ TCGTemp *rt = tcgv_vec_temp(r);
+ TCGTemp *at = tcgv_vec_temp(a);
+ TCGArg ri = temp_arg(rt);
+ TCGArg ai = temp_arg(at);
+ TCGType type = rt->base_type;
+ int can;
+
+ tcg_debug_assert(at->base_type >= type);
+ tcg_assert_listed_vecop(opc);
+ can = tcg_can_emit_vec_op(opc, type, vece);
+ if (can > 0) {
+ vec_gen_2(opc, type, vece, ri, ai);
+ } else if (can < 0) {
+ const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
+ tcg_expand_vec_op(opc, type, vece, ri, ai);
+ tcg_swap_vecop_list(hold_list);
} else {
+ return false;
+ }
+ return true;
+}
+
+void tcg_gen_not_vec(unsigned vece, TCGv_vec r, TCGv_vec a)
+{
+ if (!TCG_TARGET_HAS_not_vec || !do_op2(vece, r, a, INDEX_op_not_vec)) {
TCGv_vec t = tcg_const_ones_vec_matching(r);
tcg_gen_xor_vec(0, r, a, t);
tcg_temp_free_vec(t);
@@ -309,13 +424,48 @@ void tcg_gen_not_vec(unsigned vece, TCGv_vec r, TCGv_vec a)
void tcg_gen_neg_vec(unsigned vece, TCGv_vec r, TCGv_vec a)
{
- if (TCG_TARGET_HAS_neg_vec) {
- vec_gen_op2(INDEX_op_neg_vec, vece, r, a);
- } else {
+ const TCGOpcode *hold_list;
+
+ tcg_assert_listed_vecop(INDEX_op_neg_vec);
+ hold_list = tcg_swap_vecop_list(NULL);
+
+ if (!TCG_TARGET_HAS_neg_vec || !do_op2(vece, r, a, INDEX_op_neg_vec)) {
TCGv_vec t = tcg_const_zeros_vec_matching(r);
tcg_gen_sub_vec(vece, r, t, a);
tcg_temp_free_vec(t);
}
+ tcg_swap_vecop_list(hold_list);
+}
+
+void tcg_gen_abs_vec(unsigned vece, TCGv_vec r, TCGv_vec a)
+{
+ const TCGOpcode *hold_list;
+
+ tcg_assert_listed_vecop(INDEX_op_abs_vec);
+ hold_list = tcg_swap_vecop_list(NULL);
+
+ if (!do_op2(vece, r, a, INDEX_op_abs_vec)) {
+ TCGType type = tcgv_vec_temp(r)->base_type;
+ TCGv_vec t = tcg_temp_new_vec(type);
+
+ tcg_debug_assert(tcg_can_emit_vec_op(INDEX_op_sub_vec, type, vece));
+ if (tcg_can_emit_vec_op(INDEX_op_smax_vec, type, vece) > 0) {
+ tcg_gen_neg_vec(vece, t, a);
+ tcg_gen_smax_vec(vece, r, a, t);
+ } else {
+ if (tcg_can_emit_vec_op(INDEX_op_sari_vec, type, vece) > 0) {
+ tcg_gen_sari_vec(vece, t, a, (8 << vece) - 1);
+ } else {
+ do_dupi_vec(t, MO_REG, 0);
+ tcg_gen_cmp_vec(TCG_COND_LT, vece, t, a, t);
+ }
+ tcg_gen_xor_vec(vece, r, a, t);
+ tcg_gen_sub_vec(vece, r, r, t);
+ }
+
+ tcg_temp_free_vec(t);
+ }
+ tcg_swap_vecop_list(hold_list);
}
static void do_shifti(TCGOpcode opc, unsigned vece,
@@ -330,6 +480,7 @@ static void do_shifti(TCGOpcode opc, unsigned vece,
tcg_debug_assert(at->base_type == type);
tcg_debug_assert(i >= 0 && i < (8 << vece));
+ tcg_assert_listed_vecop(opc);
if (i == 0) {
tcg_gen_mov_vec(r, a);
@@ -343,8 +494,10 @@ static void do_shifti(TCGOpcode opc, unsigned vece,
/* We leave the choice of expansion via scalar or vector shift
to the target. Often, but not always, dupi can feed a vector
shift easier than a scalar. */
+ const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
tcg_debug_assert(can < 0);
tcg_expand_vec_op(opc, type, vece, ri, ai, i);
+ tcg_swap_vecop_list(hold_list);
}
}
@@ -377,12 +530,15 @@ void tcg_gen_cmp_vec(TCGCond cond, unsigned vece,
tcg_debug_assert(at->base_type >= type);
tcg_debug_assert(bt->base_type >= type);
+ tcg_assert_listed_vecop(INDEX_op_cmp_vec);
can = tcg_can_emit_vec_op(INDEX_op_cmp_vec, type, vece);
if (can > 0) {
vec_gen_4(INDEX_op_cmp_vec, type, vece, ri, ai, bi, cond);
} else {
+ const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
tcg_debug_assert(can < 0);
tcg_expand_vec_op(INDEX_op_cmp_vec, type, vece, ri, ai, bi, cond);
+ tcg_swap_vecop_list(hold_list);
}
}
@@ -400,15 +556,28 @@ static void do_op3(unsigned vece, TCGv_vec r, TCGv_vec a,
tcg_debug_assert(at->base_type >= type);
tcg_debug_assert(bt->base_type >= type);
+ tcg_assert_listed_vecop(opc);
can = tcg_can_emit_vec_op(opc, type, vece);
if (can > 0) {
vec_gen_3(opc, type, vece, ri, ai, bi);
} else {
+ const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
tcg_debug_assert(can < 0);
tcg_expand_vec_op(opc, type, vece, ri, ai, bi);
+ tcg_swap_vecop_list(hold_list);
}
}
+void tcg_gen_add_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ do_op3(vece, r, a, b, INDEX_op_add_vec);
+}
+
+void tcg_gen_sub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ do_op3(vece, r, a, b, INDEX_op_sub_vec);
+}
+
void tcg_gen_mul_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
{
do_op3(vece, r, a, b, INDEX_op_mul_vec);
@@ -453,3 +622,72 @@ void tcg_gen_umax_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
{
do_op3(vece, r, a, b, INDEX_op_umax_vec);
}
+
+void tcg_gen_shlv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ do_op3(vece, r, a, b, INDEX_op_shlv_vec);
+}
+
+void tcg_gen_shrv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ do_op3(vece, r, a, b, INDEX_op_shrv_vec);
+}
+
+void tcg_gen_sarv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ do_op3(vece, r, a, b, INDEX_op_sarv_vec);
+}
+
+static void do_shifts(unsigned vece, TCGv_vec r, TCGv_vec a,
+ TCGv_i32 s, TCGOpcode opc_s, TCGOpcode opc_v)
+{
+ TCGTemp *rt = tcgv_vec_temp(r);
+ TCGTemp *at = tcgv_vec_temp(a);
+ TCGTemp *st = tcgv_i32_temp(s);
+ TCGArg ri = temp_arg(rt);
+ TCGArg ai = temp_arg(at);
+ TCGArg si = temp_arg(st);
+ TCGType type = rt->base_type;
+ const TCGOpcode *hold_list;
+ int can;
+
+ tcg_debug_assert(at->base_type >= type);
+ tcg_assert_listed_vecop(opc_s);
+ hold_list = tcg_swap_vecop_list(NULL);
+
+ can = tcg_can_emit_vec_op(opc_s, type, vece);
+ if (can > 0) {
+ vec_gen_3(opc_s, type, vece, ri, ai, si);
+ } else if (can < 0) {
+ tcg_expand_vec_op(opc_s, type, vece, ri, ai, si);
+ } else {
+ TCGv_vec vec_s = tcg_temp_new_vec(type);
+
+ if (vece == MO_64) {
+ TCGv_i64 s64 = tcg_temp_new_i64();
+ tcg_gen_extu_i32_i64(s64, s);
+ tcg_gen_dup_i64_vec(MO_64, vec_s, s64);
+ tcg_temp_free_i64(s64);
+ } else {
+ tcg_gen_dup_i32_vec(vece, vec_s, s);
+ }
+ do_op3(vece, r, a, vec_s, opc_v);
+ tcg_temp_free_vec(vec_s);
+ }
+ tcg_swap_vecop_list(hold_list);
+}
+
+void tcg_gen_shls_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 b)
+{
+ do_shifts(vece, r, a, b, INDEX_op_shls_vec, INDEX_op_shlv_vec);
+}
+
+void tcg_gen_shrs_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 b)
+{
+ do_shifts(vece, r, a, b, INDEX_op_shrs_vec, INDEX_op_shrv_vec);
+}
+
+void tcg_gen_sars_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 b)
+{
+ do_shifts(vece, r, a, b, INDEX_op_sars_vec, INDEX_op_sarv_vec);
+}
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index a00d1df37e..0ac291f1c4 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -1091,6 +1091,16 @@ void tcg_gen_umax_i32(TCGv_i32 ret, TCGv_i32 a, TCGv_i32 b)
tcg_gen_movcond_i32(TCG_COND_LTU, ret, a, b, b, a);
}
+void tcg_gen_abs_i32(TCGv_i32 ret, TCGv_i32 a)
+{
+ TCGv_i32 t = tcg_temp_new_i32();
+
+ tcg_gen_sari_i32(t, a, 31);
+ tcg_gen_xor_i32(ret, a, t);
+ tcg_gen_sub_i32(ret, ret, t);
+ tcg_temp_free_i32(t);
+}
+
/* 64-bit ops */
#if TCG_TARGET_REG_BITS == 32
@@ -2548,6 +2558,16 @@ void tcg_gen_umax_i64(TCGv_i64 ret, TCGv_i64 a, TCGv_i64 b)
tcg_gen_movcond_i64(TCG_COND_LTU, ret, a, b, b, a);
}
+void tcg_gen_abs_i64(TCGv_i64 ret, TCGv_i64 a)
+{
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ tcg_gen_sari_i64(t, a, 63);
+ tcg_gen_xor_i64(ret, a, t);
+ tcg_gen_sub_i64(ret, ret, t);
+ tcg_temp_free_i64(t);
+}
+
/* Size changing operations. */
void tcg_gen_extrl_i64_i32(TCGv_i32 ret, TCGv_i64 arg)
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 1f1824c30a..660fe205d0 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -335,6 +335,7 @@ void tcg_gen_smin_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
void tcg_gen_smax_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
void tcg_gen_umin_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
void tcg_gen_umax_i32(TCGv_i32, TCGv_i32 arg1, TCGv_i32 arg2);
+void tcg_gen_abs_i32(TCGv_i32, TCGv_i32);
static inline void tcg_gen_discard_i32(TCGv_i32 arg)
{
@@ -534,6 +535,7 @@ void tcg_gen_smin_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
void tcg_gen_smax_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
void tcg_gen_umin_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
void tcg_gen_umax_i64(TCGv_i64, TCGv_i64 arg1, TCGv_i64 arg2);
+void tcg_gen_abs_i64(TCGv_i64, TCGv_i64);
#if TCG_TARGET_REG_BITS == 64
static inline void tcg_gen_discard_i64(TCGv_i64 arg)
@@ -954,6 +956,7 @@ void tcg_gen_atomic_umax_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
void tcg_gen_mov_vec(TCGv_vec, TCGv_vec);
void tcg_gen_dup_i32_vec(unsigned vece, TCGv_vec, TCGv_i32);
void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec, TCGv_i64);
+void tcg_gen_dup_mem_vec(unsigned vece, TCGv_vec, TCGv_ptr, tcg_target_long);
void tcg_gen_dup8i_vec(TCGv_vec, uint32_t);
void tcg_gen_dup16i_vec(TCGv_vec, uint32_t);
void tcg_gen_dup32i_vec(TCGv_vec, uint32_t);
@@ -972,6 +975,7 @@ void tcg_gen_nor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
void tcg_gen_eqv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
void tcg_gen_not_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
void tcg_gen_neg_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
+void tcg_gen_abs_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
void tcg_gen_ssadd_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
void tcg_gen_usadd_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
void tcg_gen_sssub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
@@ -985,6 +989,14 @@ void tcg_gen_shli_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
void tcg_gen_shri_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
void tcg_gen_sari_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
+void tcg_gen_shls_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);
+void tcg_gen_shrs_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);
+void tcg_gen_sars_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_i32 s);
+
+void tcg_gen_shlv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
+void tcg_gen_shrv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
+void tcg_gen_sarv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec s);
+
void tcg_gen_cmp_vec(TCGCond cond, unsigned vece, TCGv_vec r,
TCGv_vec a, TCGv_vec b);
@@ -1010,6 +1022,7 @@ void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t);
#define tcg_gen_addi_tl tcg_gen_addi_i64
#define tcg_gen_sub_tl tcg_gen_sub_i64
#define tcg_gen_neg_tl tcg_gen_neg_i64
+#define tcg_gen_abs_tl tcg_gen_abs_i64
#define tcg_gen_subfi_tl tcg_gen_subfi_i64
#define tcg_gen_subi_tl tcg_gen_subi_i64
#define tcg_gen_and_tl tcg_gen_and_i64
@@ -1122,6 +1135,7 @@ void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t);
#define tcg_gen_addi_tl tcg_gen_addi_i32
#define tcg_gen_sub_tl tcg_gen_sub_i32
#define tcg_gen_neg_tl tcg_gen_neg_i32
+#define tcg_gen_abs_tl tcg_gen_abs_i32
#define tcg_gen_subfi_tl tcg_gen_subfi_i32
#define tcg_gen_subi_tl tcg_gen_subi_i32
#define tcg_gen_and_tl tcg_gen_and_i32
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index 1bad6e4208..4a2dd116eb 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -219,11 +219,13 @@ DEF(dup2_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_REG_BITS == 32))
DEF(ld_vec, 1, 1, 1, IMPLVEC)
DEF(st_vec, 0, 2, 1, IMPLVEC)
+DEF(dupm_vec, 1, 1, 1, IMPLVEC)
DEF(add_vec, 1, 2, 0, IMPLVEC)
DEF(sub_vec, 1, 2, 0, IMPLVEC)
DEF(mul_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_mul_vec))
DEF(neg_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
+DEF(abs_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_abs_vec))
DEF(ssadd_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_sat_vec))
DEF(usadd_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_sat_vec))
DEF(sssub_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_sat_vec))
diff --git a/tcg/tcg.c b/tcg/tcg.c
index f7bef51de8..24083b8c00 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -103,16 +103,37 @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
const char *ct_str, TCGType type);
static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1,
intptr_t arg2);
-static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg);
+static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg);
static void tcg_out_movi(TCGContext *s, TCGType type,
TCGReg ret, tcg_target_long arg);
static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
const int *const_args);
#if TCG_TARGET_MAYBE_vec
+static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
+ TCGReg dst, TCGReg src);
+static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
+ TCGReg dst, TCGReg base, intptr_t offset);
+static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
+ TCGReg dst, tcg_target_long arg);
static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, unsigned vecl,
unsigned vece, const TCGArg *args,
const int *const_args);
#else
+static inline bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
+ TCGReg dst, TCGReg src)
+{
+ g_assert_not_reached();
+}
+static inline bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
+ TCGReg dst, TCGReg base, intptr_t offset)
+{
+ g_assert_not_reached();
+}
+static inline void tcg_out_dupi_vec(TCGContext *s, TCGType type,
+ TCGReg dst, tcg_target_long arg)
+{
+ g_assert_not_reached();
+}
static inline void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, unsigned vecl,
unsigned vece, const TCGArg *args,
const int *const_args)
@@ -1579,6 +1600,7 @@ bool tcg_op_supported(TCGOpcode op)
case INDEX_op_mov_vec:
case INDEX_op_dup_vec:
case INDEX_op_dupi_vec:
+ case INDEX_op_dupm_vec:
case INDEX_op_ld_vec:
case INDEX_op_st_vec:
case INDEX_op_add_vec:
@@ -1594,6 +1616,8 @@ bool tcg_op_supported(TCGOpcode op)
return have_vec && TCG_TARGET_HAS_not_vec;
case INDEX_op_neg_vec:
return have_vec && TCG_TARGET_HAS_neg_vec;
+ case INDEX_op_abs_vec:
+ return have_vec && TCG_TARGET_HAS_abs_vec;
case INDEX_op_andc_vec:
return have_vec && TCG_TARGET_HAS_andc_vec;
case INDEX_op_orc_vec:
@@ -3270,15 +3294,15 @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
save_globals(s, allocated_regs);
}
+/*
+ * Specialized code generation for INDEX_op_movi_*.
+ */
static void tcg_reg_alloc_do_movi(TCGContext *s, TCGTemp *ots,
tcg_target_ulong val, TCGLifeData arg_life,
TCGRegSet preferred_regs)
{
- if (ots->fixed_reg) {
- /* For fixed registers, we do not do any constant propagation. */
- tcg_out_movi(s, ots->type, ots->reg, val);
- return;
- }
+ /* ENV should not be modified. */
+ tcg_debug_assert(!ots->fixed_reg);
/* The movi is not explicitly generated here. */
if (ots->val_type == TEMP_VAL_REG) {
@@ -3302,6 +3326,9 @@ static void tcg_reg_alloc_movi(TCGContext *s, const TCGOp *op)
tcg_reg_alloc_do_movi(s, ots, val, op->life, op->output_pref[0]);
}
+/*
+ * Specialized code generation for INDEX_op_mov_*.
+ */
static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
{
const TCGLifeData arg_life = op->life;
@@ -3314,6 +3341,9 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
ots = arg_temp(op->args[0]);
ts = arg_temp(op->args[1]);
+ /* ENV should not be modified. */
+ tcg_debug_assert(!ots->fixed_reg);
+
/* Note that otype != itype for no-op truncation. */
otype = ots->type;
itype = ts->type;
@@ -3338,7 +3368,7 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
}
tcg_debug_assert(ts->val_type == TEMP_VAL_REG);
- if (IS_DEAD_ARG(0) && !ots->fixed_reg) {
+ if (IS_DEAD_ARG(0)) {
/* mov to a non-saved dead register makes no sense (even with
liveness analysis disabled). */
tcg_debug_assert(NEED_SYNC_ARG(0));
@@ -3351,7 +3381,7 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
}
temp_dead(s, ots);
} else {
- if (IS_DEAD_ARG(1) && !ts->fixed_reg && !ots->fixed_reg) {
+ if (IS_DEAD_ARG(1) && !ts->fixed_reg) {
/* the mov can be suppressed */
if (ots->val_type == TEMP_VAL_REG) {
s->reg_to_temp[ots->reg] = NULL;
@@ -3367,7 +3397,22 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
allocated_regs, preferred_regs,
ots->indirect_base);
}
- tcg_out_mov(s, otype, ots->reg, ts->reg);
+ if (!tcg_out_mov(s, otype, ots->reg, ts->reg)) {
+ /*
+ * Cross register class move not supported.
+ * Store the source register into the destination slot
+ * and leave the destination temp as TEMP_VAL_MEM.
+ */
+ assert(!ots->fixed_reg);
+ if (!ts->mem_allocated) {
+ temp_allocate_frame(s, ots);
+ }
+ tcg_out_st(s, ts->type, ts->reg,
+ ots->mem_base->reg, ots->mem_offset);
+ ots->mem_coherent = 1;
+ temp_free_or_dead(s, ots, -1);
+ return;
+ }
}
ots->val_type = TEMP_VAL_REG;
ots->mem_coherent = 0;
@@ -3378,6 +3423,118 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp *op)
}
}
+/*
+ * Specialized code generation for INDEX_op_dup_vec.
+ */
+static void tcg_reg_alloc_dup(TCGContext *s, const TCGOp *op)
+{
+ const TCGLifeData arg_life = op->life;
+ TCGRegSet dup_out_regs, dup_in_regs;
+ TCGTemp *its, *ots;
+ TCGType itype, vtype;
+ intptr_t endian_fixup;
+ unsigned vece;
+ bool ok;
+
+ ots = arg_temp(op->args[0]);
+ its = arg_temp(op->args[1]);
+
+ /* ENV should not be modified. */
+ tcg_debug_assert(!ots->fixed_reg);
+
+ itype = its->type;
+ vece = TCGOP_VECE(op);
+ vtype = TCGOP_VECL(op) + TCG_TYPE_V64;
+
+ if (its->val_type == TEMP_VAL_CONST) {
+ /* Propagate constant via movi -> dupi. */
+ tcg_target_ulong val = its->val;
+ if (IS_DEAD_ARG(1)) {
+ temp_dead(s, its);
+ }
+ tcg_reg_alloc_do_movi(s, ots, val, arg_life, op->output_pref[0]);
+ return;
+ }
+
+ dup_out_regs = tcg_op_defs[INDEX_op_dup_vec].args_ct[0].u.regs;
+ dup_in_regs = tcg_op_defs[INDEX_op_dup_vec].args_ct[1].u.regs;
+
+ /* Allocate the output register now. */
+ if (ots->val_type != TEMP_VAL_REG) {
+ TCGRegSet allocated_regs = s->reserved_regs;
+
+ if (!IS_DEAD_ARG(1) && its->val_type == TEMP_VAL_REG) {
+ /* Make sure to not spill the input register. */
+ tcg_regset_set_reg(allocated_regs, its->reg);
+ }
+ ots->reg = tcg_reg_alloc(s, dup_out_regs, allocated_regs,
+ op->output_pref[0], ots->indirect_base);
+ ots->val_type = TEMP_VAL_REG;
+ ots->mem_coherent = 0;
+ s->reg_to_temp[ots->reg] = ots;
+ }
+
+ switch (its->val_type) {
+ case TEMP_VAL_REG:
+ /*
+ * The dup constriaints must be broad, covering all possible VECE.
+ * However, tcg_op_dup_vec() gets to see the VECE and we allow it
+ * to fail, indicating that extra moves are required for that case.
+ */
+ if (tcg_regset_test_reg(dup_in_regs, its->reg)) {
+ if (tcg_out_dup_vec(s, vtype, vece, ots->reg, its->reg)) {
+ goto done;
+ }
+ /* Try again from memory or a vector input register. */
+ }
+ if (!its->mem_coherent) {
+ /*
+ * The input register is not synced, and so an extra store
+ * would be required to use memory. Attempt an integer-vector
+ * register move first. We do not have a TCGRegSet for this.
+ */
+ if (tcg_out_mov(s, itype, ots->reg, its->reg)) {
+ break;
+ }
+ /* Sync the temp back to its slot and load from there. */
+ temp_sync(s, its, s->reserved_regs, 0, 0);
+ }
+ /* fall through */
+
+ case TEMP_VAL_MEM:
+#ifdef HOST_WORDS_BIGENDIAN
+ endian_fixup = itype == TCG_TYPE_I32 ? 4 : 8;
+ endian_fixup -= 1 << vece;
+#else
+ endian_fixup = 0;
+#endif
+ if (tcg_out_dupm_vec(s, vtype, vece, ots->reg, its->mem_base->reg,
+ its->mem_offset + endian_fixup)) {
+ goto done;
+ }
+ tcg_out_ld(s, itype, ots->reg, its->mem_base->reg, its->mem_offset);
+ break;
+
+ default:
+ g_assert_not_reached();
+ }
+
+ /* We now have a vector input register, so dup must succeed. */
+ ok = tcg_out_dup_vec(s, vtype, vece, ots->reg, ots->reg);
+ tcg_debug_assert(ok);
+
+ done:
+ if (IS_DEAD_ARG(1)) {
+ temp_dead(s, its);
+ }
+ if (NEED_SYNC_ARG(0)) {
+ temp_sync(s, ots, s->reserved_regs, 0, 0);
+ }
+ if (IS_DEAD_ARG(0)) {
+ temp_dead(s, ots);
+ }
+}
+
static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
{
const TCGLifeData arg_life = op->life;
@@ -3467,7 +3624,15 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
i_allocated_regs, 0);
reg = tcg_reg_alloc(s, arg_ct->u.regs, i_allocated_regs,
o_preferred_regs, ts->indirect_base);
- tcg_out_mov(s, ts->type, reg, ts->reg);
+ if (!tcg_out_mov(s, ts->type, reg, ts->reg)) {
+ /*
+ * Cross register class move not supported. Sync the
+ * temp back to its slot and load from there.
+ */
+ temp_sync(s, ts, i_allocated_regs, 0, 0);
+ tcg_out_ld(s, ts->type, reg,
+ ts->mem_base->reg, ts->mem_offset);
+ }
}
new_args[i] = reg;
const_args[i] = 0;
@@ -3504,6 +3669,10 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
arg = op->args[i];
arg_ct = &def->args_ct[i];
ts = arg_temp(arg);
+
+ /* ENV should not be modified. */
+ tcg_debug_assert(!ts->fixed_reg);
+
if ((arg_ct->ct & TCG_CT_ALIAS)
&& !const_args[arg_ct->alias_index]) {
reg = new_args[arg_ct->alias_index];
@@ -3512,29 +3681,21 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
i_allocated_regs | o_allocated_regs,
op->output_pref[k], ts->indirect_base);
} else {
- /* if fixed register, we try to use it */
- reg = ts->reg;
- if (ts->fixed_reg &&
- tcg_regset_test_reg(arg_ct->u.regs, reg)) {
- goto oarg_end;
- }
reg = tcg_reg_alloc(s, arg_ct->u.regs, o_allocated_regs,
op->output_pref[k], ts->indirect_base);
}
tcg_regset_set_reg(o_allocated_regs, reg);
- /* if a fixed register is used, then a move will be done afterwards */
- if (!ts->fixed_reg) {
- if (ts->val_type == TEMP_VAL_REG) {
- s->reg_to_temp[ts->reg] = NULL;
- }
- ts->val_type = TEMP_VAL_REG;
- ts->reg = reg;
- /* temp value is modified, so the value kept in memory is
- potentially not the same */
- ts->mem_coherent = 0;
- s->reg_to_temp[reg] = ts;
+ if (ts->val_type == TEMP_VAL_REG) {
+ s->reg_to_temp[ts->reg] = NULL;
}
- oarg_end:
+ ts->val_type = TEMP_VAL_REG;
+ ts->reg = reg;
+ /*
+ * Temp value is modified, so the value kept in memory is
+ * potentially not the same.
+ */
+ ts->mem_coherent = 0;
+ s->reg_to_temp[reg] = ts;
new_args[i] = reg;
}
}
@@ -3550,10 +3711,10 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
/* move the outputs in the correct register if needed */
for(i = 0; i < nb_oargs; i++) {
ts = arg_temp(op->args[i]);
- reg = new_args[i];
- if (ts->fixed_reg && ts->reg != reg) {
- tcg_out_mov(s, ts->type, ts->reg, reg);
- }
+
+ /* ENV should not be modified. */
+ tcg_debug_assert(!ts->fixed_reg);
+
if (NEED_SYNC_ARG(i)) {
temp_sync(s, ts, o_allocated_regs, 0, IS_DEAD_ARG(i));
} else if (IS_DEAD_ARG(i)) {
@@ -3630,7 +3791,15 @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
if (ts->val_type == TEMP_VAL_REG) {
if (ts->reg != reg) {
tcg_reg_free(s, reg, allocated_regs);
- tcg_out_mov(s, ts->type, reg, ts->reg);
+ if (!tcg_out_mov(s, ts->type, reg, ts->reg)) {
+ /*
+ * Cross register class move not supported. Sync the
+ * temp back to its slot and load from there.
+ */
+ temp_sync(s, ts, allocated_regs, 0, 0);
+ tcg_out_ld(s, ts->type, reg,
+ ts->mem_base->reg, ts->mem_offset);
+ }
}
} else {
TCGRegSet arg_set = 0;
@@ -3674,26 +3843,23 @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
for(i = 0; i < nb_oargs; i++) {
arg = op->args[i];
ts = arg_temp(arg);
+
+ /* ENV should not be modified. */
+ tcg_debug_assert(!ts->fixed_reg);
+
reg = tcg_target_call_oarg_regs[i];
tcg_debug_assert(s->reg_to_temp[reg] == NULL);
-
- if (ts->fixed_reg) {
- if (ts->reg != reg) {
- tcg_out_mov(s, ts->type, ts->reg, reg);
- }
- } else {
- if (ts->val_type == TEMP_VAL_REG) {
- s->reg_to_temp[ts->reg] = NULL;
- }
- ts->val_type = TEMP_VAL_REG;
- ts->reg = reg;
- ts->mem_coherent = 0;
- s->reg_to_temp[reg] = ts;
- if (NEED_SYNC_ARG(i)) {
- temp_sync(s, ts, allocated_regs, 0, IS_DEAD_ARG(i));
- } else if (IS_DEAD_ARG(i)) {
- temp_dead(s, ts);
- }
+ if (ts->val_type == TEMP_VAL_REG) {
+ s->reg_to_temp[ts->reg] = NULL;
+ }
+ ts->val_type = TEMP_VAL_REG;
+ ts->reg = reg;
+ ts->mem_coherent = 0;
+ s->reg_to_temp[reg] = ts;
+ if (NEED_SYNC_ARG(i)) {
+ temp_sync(s, ts, allocated_regs, 0, IS_DEAD_ARG(i));
+ } else if (IS_DEAD_ARG(i)) {
+ temp_dead(s, ts);
}
}
}
@@ -3943,6 +4109,9 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
case INDEX_op_dupi_vec:
tcg_reg_alloc_movi(s, op);
break;
+ case INDEX_op_dup_vec:
+ tcg_reg_alloc_dup(s, op);
+ break;
case INDEX_op_insn_start:
if (num_insns >= 0) {
size_t off = tcg_current_code_size(s);
diff --git a/tcg/tcg.h b/tcg/tcg.h
index cfc57110a1..0e01a70d66 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -176,6 +176,7 @@ typedef uint64_t TCGRegSet;
&& !defined(TCG_TARGET_HAS_v128) \
&& !defined(TCG_TARGET_HAS_v256)
#define TCG_TARGET_MAYBE_vec 0
+#define TCG_TARGET_HAS_abs_vec 0
#define TCG_TARGET_HAS_neg_vec 0
#define TCG_TARGET_HAS_not_vec 0
#define TCG_TARGET_HAS_andc_vec 0
@@ -692,6 +693,7 @@ struct TCGContext {
#ifdef CONFIG_DEBUG_TCG
int temps_in_use;
int goto_tb_issue_mask;
+ const TCGOpcode *vecop_list;
#endif
/* Code generation. Note that we specifically do not use tcg_insn_unit
@@ -1492,4 +1494,23 @@ void helper_atomic_sto_le_mmu(CPUArchState *env, target_ulong addr, Int128 val,
void helper_atomic_sto_be_mmu(CPUArchState *env, target_ulong addr, Int128 val,
TCGMemOpIdx oi, uintptr_t retaddr);
+#ifdef CONFIG_DEBUG_TCG
+void tcg_assert_listed_vecop(TCGOpcode);
+#else
+static inline void tcg_assert_listed_vecop(TCGOpcode op) { }
+#endif
+
+static inline const TCGOpcode *tcg_swap_vecop_list(const TCGOpcode *n)
+{
+#ifdef CONFIG_DEBUG_TCG
+ const TCGOpcode *o = tcg_ctx->vecop_list;
+ tcg_ctx->vecop_list = n;
+ return o;
+#else
+ return NULL;
+#endif
+}
+
+bool tcg_can_emit_vecop_list(const TCGOpcode *, TCGType, unsigned);
+
#endif /* TCG_H */
diff --git a/tcg/tci/tcg-target.inc.c b/tcg/tci/tcg-target.inc.c
index 0015a98485..992d50cb1e 100644
--- a/tcg/tci/tcg-target.inc.c
+++ b/tcg/tci/tcg-target.inc.c
@@ -509,7 +509,7 @@ static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1,
old_code_ptr[1] = s->code_ptr - old_code_ptr;
}
-static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
+static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
uint8_t *old_code_ptr = s->code_ptr;
tcg_debug_assert(ret != arg);
@@ -521,6 +521,7 @@ static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
tcg_out_r(s, ret);
tcg_out_r(s, arg);
old_code_ptr[1] = s->code_ptr - old_code_ptr;
+ return true;
}
static void tcg_out_movi(TCGContext *s, TCGType type,