diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2022-03-05 14:43:19 +0000 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2022-03-05 14:43:19 +0000 |
commit | 2acf5e1d0e0f15be1b0ad85cf05b3a6e6307680c (patch) | |
tree | 4fb5f7d0984ea90091db7c464ca7b47d29635bd0 | |
parent | d7e2fe4aac8b74bbfe82b2309536528b4dbe0d34 (diff) | |
parent | cf320769476c3e2820be2a6280bfa1e15baf396f (diff) |
Merge remote-tracking branch 'remotes/rth-gitlab/tags/pull-tcg-20220304' into staging
Reorder do_constant_folding_cond test to satisfy valgrind.
Fix value of MAX_OPC_PARAM_IARGS.
Add opcodes for vector nand, nor, eqv.
Support vector nand, nor, eqv on PPC and S390X hosts.
Support AVX512VL, AVX512BW, AVX512DQ, and AVX512VBMI2.
# gpg: Signature made Fri 04 Mar 2022 18:59:08 GMT
# gpg: using RSA key 7A481E78868B4DB6A85A05C064DF38E8AF7E215F
# gpg: issuer "richard.henderson@linaro.org"
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [full]
# Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A 05C0 64DF 38E8 AF7E 215F
* remotes/rth-gitlab/tags/pull-tcg-20220304: (21 commits)
tcg/i386: Implement bitsel for avx512
tcg/i386: Implement more logical operations for avx512
tcg/i386: Implement avx512 multiply
tcg/i386: Implement avx512 min/max/abs
tcg/i386: Expand scalar rotate with avx512 insns
tcg/i386: Remove rotls_vec from tcg_target_op_def
tcg/i386: Expand vector word rotate as avx512vbmi2 shift-double
tcg/i386: Support avx512vbmi2 vector shift-double instructions
tcg/i386: Implement avx512 variable rotate
tcg/i386: Implement avx512 immediate rotate
tcg/i386: Implement avx512 immediate sari shift
tcg/i386: Implement avx512 scalar shift
tcg/i386: Implement avx512 variable shifts
tcg/i386: Use tcg_can_emit_vec_op in expand_vec_cmp_noinv
tcg/i386: Add tcg_out_evex_opc
tcg/i386: Detect AVX512
tcg/s390x: Implement vector NAND, NOR, EQV
tcg/ppc: Implement vector NAND, NOR, EQV
tcg: Add opcodes for vector nand, nor, eqv
tcg: Set MAX_OPC_PARAM_IARGS to 7
...
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r-- | include/qemu/cpuid.h | 20 | ||||
-rw-r--r-- | include/tcg/tcg-opc.h | 3 | ||||
-rw-r--r-- | include/tcg/tcg.h | 5 | ||||
-rw-r--r-- | tcg/aarch64/tcg-target.h | 3 | ||||
-rw-r--r-- | tcg/arm/tcg-target.h | 3 | ||||
-rw-r--r-- | tcg/i386/tcg-target-con-set.h | 1 | ||||
-rw-r--r-- | tcg/i386/tcg-target.c.inc | 385 | ||||
-rw-r--r-- | tcg/i386/tcg-target.h | 17 | ||||
-rw-r--r-- | tcg/i386/tcg-target.opc.h | 3 | ||||
-rw-r--r-- | tcg/optimize.c | 20 | ||||
-rw-r--r-- | tcg/ppc/tcg-target.c.inc | 15 | ||||
-rw-r--r-- | tcg/ppc/tcg-target.h | 3 | ||||
-rw-r--r-- | tcg/s390x/tcg-target.c.inc | 17 | ||||
-rw-r--r-- | tcg/s390x/tcg-target.h | 3 | ||||
-rw-r--r-- | tcg/tcg-op-vec.c | 27 | ||||
-rw-r--r-- | tcg/tcg.c | 6 | ||||
-rw-r--r-- | tcg/tci/tcg-target.c.inc | 2 |
17 files changed, 440 insertions, 93 deletions
diff --git a/include/qemu/cpuid.h b/include/qemu/cpuid.h index 09fc245b91..7adb12d320 100644 --- a/include/qemu/cpuid.h +++ b/include/qemu/cpuid.h @@ -45,12 +45,26 @@ #ifndef bit_AVX2 #define bit_AVX2 (1 << 5) #endif -#ifndef bit_AVX512F -#define bit_AVX512F (1 << 16) -#endif #ifndef bit_BMI2 #define bit_BMI2 (1 << 8) #endif +#ifndef bit_AVX512F +#define bit_AVX512F (1 << 16) +#endif +#ifndef bit_AVX512DQ +#define bit_AVX512DQ (1 << 17) +#endif +#ifndef bit_AVX512BW +#define bit_AVX512BW (1 << 30) +#endif +#ifndef bit_AVX512VL +#define bit_AVX512VL (1u << 31) +#endif + +/* Leaf 7, %ecx */ +#ifndef bit_AVX512VBMI2 +#define bit_AVX512VBMI2 (1 << 6) +#endif /* Leaf 0x80000001, %ecx */ #ifndef bit_LZCNT diff --git a/include/tcg/tcg-opc.h b/include/tcg/tcg-opc.h index 675873e200..dd444734d9 100644 --- a/include/tcg/tcg-opc.h +++ b/include/tcg/tcg-opc.h @@ -245,6 +245,9 @@ DEF(or_vec, 1, 2, 0, IMPLVEC) DEF(xor_vec, 1, 2, 0, IMPLVEC) DEF(andc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_andc_vec)) DEF(orc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_orc_vec)) +DEF(nand_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_nand_vec)) +DEF(nor_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_nor_vec)) +DEF(eqv_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_eqv_vec)) DEF(not_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_not_vec)) DEF(shli_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_shi_vec)) diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h index 42f5b500ed..73869fd9d0 100644 --- a/include/tcg/tcg.h +++ b/include/tcg/tcg.h @@ -43,7 +43,7 @@ #else #define MAX_OPC_PARAM_PER_ARG 1 #endif -#define MAX_OPC_PARAM_IARGS 6 +#define MAX_OPC_PARAM_IARGS 7 #define MAX_OPC_PARAM_OARGS 1 #define MAX_OPC_PARAM_ARGS (MAX_OPC_PARAM_IARGS + MAX_OPC_PARAM_OARGS) @@ -183,6 +183,9 @@ typedef uint64_t TCGRegSet; #define TCG_TARGET_HAS_not_vec 0 #define TCG_TARGET_HAS_andc_vec 0 #define TCG_TARGET_HAS_orc_vec 0 +#define TCG_TARGET_HAS_nand_vec 0 +#define TCG_TARGET_HAS_nor_vec 0 +#define TCG_TARGET_HAS_eqv_vec 0 #define TCG_TARGET_HAS_roti_vec 0 #define TCG_TARGET_HAS_rots_vec 0 #define TCG_TARGET_HAS_rotv_vec 0 diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h index 876af589ce..485f685bd2 100644 --- a/tcg/aarch64/tcg-target.h +++ b/tcg/aarch64/tcg-target.h @@ -131,6 +131,9 @@ typedef enum { #define TCG_TARGET_HAS_andc_vec 1 #define TCG_TARGET_HAS_orc_vec 1 +#define TCG_TARGET_HAS_nand_vec 0 +#define TCG_TARGET_HAS_nor_vec 0 +#define TCG_TARGET_HAS_eqv_vec 0 #define TCG_TARGET_HAS_not_vec 1 #define TCG_TARGET_HAS_neg_vec 1 #define TCG_TARGET_HAS_abs_vec 1 diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h index 27c27a1f14..7e96495392 100644 --- a/tcg/arm/tcg-target.h +++ b/tcg/arm/tcg-target.h @@ -130,6 +130,9 @@ extern bool use_neon_instructions; #define TCG_TARGET_HAS_andc_vec 1 #define TCG_TARGET_HAS_orc_vec 1 +#define TCG_TARGET_HAS_nand_vec 0 +#define TCG_TARGET_HAS_nor_vec 0 +#define TCG_TARGET_HAS_eqv_vec 0 #define TCG_TARGET_HAS_not_vec 1 #define TCG_TARGET_HAS_neg_vec 1 #define TCG_TARGET_HAS_abs_vec 1 diff --git a/tcg/i386/tcg-target-con-set.h b/tcg/i386/tcg-target-con-set.h index 78774d1005..91ceb0e1da 100644 --- a/tcg/i386/tcg-target-con-set.h +++ b/tcg/i386/tcg-target-con-set.h @@ -45,6 +45,7 @@ C_O1_I2(r, r, rI) C_O1_I2(x, x, x) C_N1_I2(r, r, r) C_N1_I2(r, r, rW) +C_O1_I3(x, 0, x, x) C_O1_I3(x, x, x, x) C_O1_I4(r, r, re, r, 0) C_O1_I4(r, r, r, ri, ri) diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc index faa15eecab..b5c6159853 100644 --- a/tcg/i386/tcg-target.c.inc +++ b/tcg/i386/tcg-target.c.inc @@ -171,6 +171,10 @@ bool have_bmi1; bool have_popcnt; bool have_avx1; bool have_avx2; +bool have_avx512bw; +bool have_avx512dq; +bool have_avx512vbmi2; +bool have_avx512vl; bool have_movbe; #ifdef CONFIG_CPUID_H @@ -258,6 +262,7 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct) #define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */ #define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */ #define P_VEXL 0x80000 /* Set VEX.L = 1 */ +#define P_EVEX 0x100000 /* Requires EVEX encoding */ #define OPC_ARITH_EvIz (0x81) #define OPC_ARITH_EvIb (0x83) @@ -308,6 +313,7 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct) #define OPC_PABSB (0x1c | P_EXT38 | P_DATA16) #define OPC_PABSW (0x1d | P_EXT38 | P_DATA16) #define OPC_PABSD (0x1e | P_EXT38 | P_DATA16) +#define OPC_VPABSQ (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) #define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16) #define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16) #define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16) @@ -334,15 +340,19 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct) #define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16) #define OPC_PMAXSW (0xee | P_EXT | P_DATA16) #define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16) +#define OPC_VPMAXSQ (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) #define OPC_PMAXUB (0xde | P_EXT | P_DATA16) #define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16) #define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16) +#define OPC_VPMAXUQ (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) #define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16) #define OPC_PMINSW (0xea | P_EXT | P_DATA16) #define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16) +#define OPC_VPMINSQ (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) #define OPC_PMINUB (0xda | P_EXT | P_DATA16) #define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16) #define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16) +#define OPC_VPMINUQ (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) #define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16) #define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16) #define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16) @@ -351,19 +361,21 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct) #define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16) #define OPC_PMULLW (0xd5 | P_EXT | P_DATA16) #define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16) +#define OPC_VPMULLQ (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) #define OPC_POR (0xeb | P_EXT | P_DATA16) #define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16) #define OPC_PSHUFD (0x70 | P_EXT | P_DATA16) #define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2) #define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3) #define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */ -#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */ +#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */ #define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */ #define OPC_PSLLW (0xf1 | P_EXT | P_DATA16) #define OPC_PSLLD (0xf2 | P_EXT | P_DATA16) #define OPC_PSLLQ (0xf3 | P_EXT | P_DATA16) #define OPC_PSRAW (0xe1 | P_EXT | P_DATA16) #define OPC_PSRAD (0xe2 | P_EXT | P_DATA16) +#define OPC_VPSRAQ (0x72 | P_EXT | P_DATA16 | P_VEXW | P_EVEX) #define OPC_PSRLW (0xd1 | P_EXT | P_DATA16) #define OPC_PSRLD (0xd2 | P_EXT | P_DATA16) #define OPC_PSRLQ (0xd3 | P_EXT | P_DATA16) @@ -414,11 +426,29 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct) #define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16) #define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_VEXW) #define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL) +#define OPC_VPROLVD (0x15 | P_EXT38 | P_DATA16 | P_EVEX) +#define OPC_VPROLVQ (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) +#define OPC_VPRORVD (0x14 | P_EXT38 | P_DATA16 | P_EVEX) +#define OPC_VPRORVQ (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) +#define OPC_VPSHLDW (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) +#define OPC_VPSHLDD (0x71 | P_EXT3A | P_DATA16 | P_EVEX) +#define OPC_VPSHLDQ (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) +#define OPC_VPSHLDVW (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) +#define OPC_VPSHLDVD (0x71 | P_EXT38 | P_DATA16 | P_EVEX) +#define OPC_VPSHLDVQ (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) +#define OPC_VPSHRDVW (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) +#define OPC_VPSHRDVD (0x73 | P_EXT38 | P_DATA16 | P_EVEX) +#define OPC_VPSHRDVQ (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) +#define OPC_VPSLLVW (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) #define OPC_VPSLLVD (0x47 | P_EXT38 | P_DATA16) #define OPC_VPSLLVQ (0x47 | P_EXT38 | P_DATA16 | P_VEXW) +#define OPC_VPSRAVW (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) #define OPC_VPSRAVD (0x46 | P_EXT38 | P_DATA16) +#define OPC_VPSRAVQ (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) +#define OPC_VPSRLVW (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX) #define OPC_VPSRLVD (0x45 | P_EXT38 | P_DATA16) #define OPC_VPSRLVQ (0x45 | P_EXT38 | P_DATA16 | P_VEXW) +#define OPC_VPTERNLOGQ (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX) #define OPC_VZEROUPPER (0x77 | P_EXT) #define OPC_XCHG_ax_r32 (0x90) @@ -622,9 +652,57 @@ static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v, tcg_out8(s, opc); } +static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v, + int rm, int index) +{ + /* The entire 4-byte evex prefix; with R' and V' set. */ + uint32_t p = 0x08041062; + int mm, pp; + + tcg_debug_assert(have_avx512vl); + + /* EVEX.mm */ + if (opc & P_EXT3A) { + mm = 3; + } else if (opc & P_EXT38) { + mm = 2; + } else if (opc & P_EXT) { + mm = 1; + } else { + g_assert_not_reached(); + } + + /* EVEX.pp */ + if (opc & P_DATA16) { + pp = 1; /* 0x66 */ + } else if (opc & P_SIMDF3) { + pp = 2; /* 0xf3 */ + } else if (opc & P_SIMDF2) { + pp = 3; /* 0xf2 */ + } else { + pp = 0; + } + + p = deposit32(p, 8, 2, mm); + p = deposit32(p, 13, 1, (rm & 8) == 0); /* EVEX.RXB.B */ + p = deposit32(p, 14, 1, (index & 8) == 0); /* EVEX.RXB.X */ + p = deposit32(p, 15, 1, (r & 8) == 0); /* EVEX.RXB.R */ + p = deposit32(p, 16, 2, pp); + p = deposit32(p, 19, 4, ~v); + p = deposit32(p, 23, 1, (opc & P_VEXW) != 0); + p = deposit32(p, 29, 2, (opc & P_VEXL) != 0); + + tcg_out32(s, p); + tcg_out8(s, opc); +} + static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) { - tcg_out_vex_opc(s, opc, r, v, rm, 0); + if (opc & P_EVEX) { + tcg_out_evex_opc(s, opc, r, v, rm, 0); + } else { + tcg_out_vex_opc(s, opc, r, v, rm, 0); + } tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); } @@ -2746,7 +2824,7 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2 }; static int const mul_insn[4] = { - OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2 + OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ }; static int const shift_imm_insn[4] = { OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib @@ -2770,28 +2848,31 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2 }; static int const smin_insn[4] = { - OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2 + OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ }; static int const smax_insn[4] = { - OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2 + OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ }; static int const umin_insn[4] = { - OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2 + OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ }; static int const umax_insn[4] = { - OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2 + OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ + }; + static int const rotlv_insn[4] = { + OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ + }; + static int const rotrv_insn[4] = { + OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ }; static int const shlv_insn[4] = { - /* TODO: AVX512 adds support for MO_16. */ - OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ + OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ }; static int const shrv_insn[4] = { - /* TODO: AVX512 adds support for MO_16. */ - OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ + OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ }; static int const sarv_insn[4] = { - /* TODO: AVX512 adds support for MO_16, MO_64. */ - OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2 + OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ }; static int const shls_insn[4] = { OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ @@ -2800,16 +2881,24 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ }; static int const sars_insn[4] = { - OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2 + OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ + }; + static int const vpshldi_insn[4] = { + OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ + }; + static int const vpshldv_insn[4] = { + OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ + }; + static int const vpshrdv_insn[4] = { + OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ }; static int const abs_insn[4] = { - /* TODO: AVX512 adds support for MO_64. */ - OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2 + OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ }; TCGType type = vecl + TCG_TYPE_V64; int insn, sub; - TCGArg a0, a1, a2; + TCGArg a0, a1, a2, a3; a0 = args[0]; a1 = args[1]; @@ -2867,6 +2956,12 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, case INDEX_op_sarv_vec: insn = sarv_insn[vece]; goto gen_simd; + case INDEX_op_rotlv_vec: + insn = rotlv_insn[vece]; + goto gen_simd; + case INDEX_op_rotrv_vec: + insn = rotrv_insn[vece]; + goto gen_simd; case INDEX_op_shls_vec: insn = shls_insn[vece]; goto gen_simd; @@ -2888,6 +2983,16 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, case INDEX_op_x86_packus_vec: insn = packus_insn[vece]; goto gen_simd; + case INDEX_op_x86_vpshldv_vec: + insn = vpshldv_insn[vece]; + a1 = a2; + a2 = args[3]; + goto gen_simd; + case INDEX_op_x86_vpshrdv_vec: + insn = vpshrdv_insn[vece]; + a1 = a2; + a2 = args[3]; + goto gen_simd; #if TCG_TARGET_REG_BITS == 32 case INDEX_op_dup2_vec: /* First merge the two 32-bit inputs to a single 64-bit element. */ @@ -2931,17 +3036,30 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, break; case INDEX_op_shli_vec: + insn = shift_imm_insn[vece]; sub = 6; goto gen_shift; case INDEX_op_shri_vec: + insn = shift_imm_insn[vece]; sub = 2; goto gen_shift; case INDEX_op_sari_vec: - tcg_debug_assert(vece != MO_64); + if (vece == MO_64) { + insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX; + } else { + insn = shift_imm_insn[vece]; + } sub = 4; + goto gen_shift; + case INDEX_op_rotli_vec: + insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */ + if (vece == MO_64) { + insn |= P_VEXW; + } + sub = 1; + goto gen_shift; gen_shift: tcg_debug_assert(vece != MO_8); - insn = shift_imm_insn[vece]; if (type == TCG_TYPE_V256) { insn |= P_VEXL; } @@ -2977,7 +3095,51 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, insn = OPC_VPERM2I128; sub = args[3]; goto gen_simd_imm8; + case INDEX_op_x86_vpshldi_vec: + insn = vpshldi_insn[vece]; + sub = args[3]; + goto gen_simd_imm8; + + case INDEX_op_not_vec: + insn = OPC_VPTERNLOGQ; + a2 = a1; + sub = 0x33; /* !B */ + goto gen_simd_imm8; + case INDEX_op_nor_vec: + insn = OPC_VPTERNLOGQ; + sub = 0x11; /* norCB */ + goto gen_simd_imm8; + case INDEX_op_nand_vec: + insn = OPC_VPTERNLOGQ; + sub = 0x77; /* nandCB */ + goto gen_simd_imm8; + case INDEX_op_eqv_vec: + insn = OPC_VPTERNLOGQ; + sub = 0x99; /* xnorCB */ + goto gen_simd_imm8; + case INDEX_op_orc_vec: + insn = OPC_VPTERNLOGQ; + sub = 0xdd; /* orB!C */ + goto gen_simd_imm8; + + case INDEX_op_bitsel_vec: + insn = OPC_VPTERNLOGQ; + a3 = args[3]; + if (a0 == a1) { + a1 = a2; + a2 = a3; + sub = 0xca; /* A?B:C */ + } else if (a0 == a2) { + a2 = a3; + sub = 0xe2; /* B?A:C */ + } else { + tcg_out_mov(s, type, a0, a3); + sub = 0xb8; /* B?C:A */ + } + goto gen_simd_imm8; + gen_simd_imm8: + tcg_debug_assert(insn != OPC_UD2); if (type == TCG_TYPE_V256) { insn |= P_VEXL; } @@ -3196,6 +3358,10 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_or_vec: case INDEX_op_xor_vec: case INDEX_op_andc_vec: + case INDEX_op_orc_vec: + case INDEX_op_nand_vec: + case INDEX_op_nor_vec: + case INDEX_op_eqv_vec: case INDEX_op_ssadd_vec: case INDEX_op_usadd_vec: case INDEX_op_sssub_vec: @@ -3207,10 +3373,11 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_shlv_vec: case INDEX_op_shrv_vec: case INDEX_op_sarv_vec: + case INDEX_op_rotlv_vec: + case INDEX_op_rotrv_vec: case INDEX_op_shls_vec: case INDEX_op_shrs_vec: case INDEX_op_sars_vec: - case INDEX_op_rotls_vec: case INDEX_op_cmp_vec: case INDEX_op_x86_shufps_vec: case INDEX_op_x86_blend_vec: @@ -3219,6 +3386,7 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_x86_vperm2i128_vec: case INDEX_op_x86_punpckl_vec: case INDEX_op_x86_punpckh_vec: + case INDEX_op_x86_vpshldi_vec: #if TCG_TARGET_REG_BITS == 32 case INDEX_op_dup2_vec: #endif @@ -3226,12 +3394,19 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_abs_vec: case INDEX_op_dup_vec: + case INDEX_op_not_vec: case INDEX_op_shli_vec: case INDEX_op_shri_vec: case INDEX_op_sari_vec: + case INDEX_op_rotli_vec: case INDEX_op_x86_psrldq_vec: return C_O1_I1(x, x); + case INDEX_op_x86_vpshldv_vec: + case INDEX_op_x86_vpshrdv_vec: + return C_O1_I3(x, 0, x, x); + + case INDEX_op_bitsel_vec: case INDEX_op_x86_vpblendvb_vec: return C_O1_I3(x, x, x, x); @@ -3249,53 +3424,96 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) case INDEX_op_or_vec: case INDEX_op_xor_vec: case INDEX_op_andc_vec: + case INDEX_op_orc_vec: + case INDEX_op_nand_vec: + case INDEX_op_nor_vec: + case INDEX_op_eqv_vec: + case INDEX_op_not_vec: + case INDEX_op_bitsel_vec: return 1; - case INDEX_op_rotli_vec: case INDEX_op_cmp_vec: case INDEX_op_cmpsel_vec: return -1; + case INDEX_op_rotli_vec: + return have_avx512vl && vece >= MO_32 ? 1 : -1; + case INDEX_op_shli_vec: case INDEX_op_shri_vec: /* We must expand the operation for MO_8. */ return vece == MO_8 ? -1 : 1; case INDEX_op_sari_vec: - /* We must expand the operation for MO_8. */ - if (vece == MO_8) { + switch (vece) { + case MO_8: return -1; - } - /* We can emulate this for MO_64, but it does not pay off - unless we're producing at least 4 values. */ - if (vece == MO_64) { + case MO_16: + case MO_32: + return 1; + case MO_64: + if (have_avx512vl) { + return 1; + } + /* + * We can emulate this for MO_64, but it does not pay off + * unless we're producing at least 4 values. + */ return type >= TCG_TYPE_V256 ? -1 : 0; } - return 1; + return 0; case INDEX_op_shls_vec: case INDEX_op_shrs_vec: return vece >= MO_16; case INDEX_op_sars_vec: - return vece >= MO_16 && vece <= MO_32; + switch (vece) { + case MO_16: + case MO_32: + return 1; + case MO_64: + return have_avx512vl; + } + return 0; case INDEX_op_rotls_vec: return vece >= MO_16 ? -1 : 0; case INDEX_op_shlv_vec: case INDEX_op_shrv_vec: - return have_avx2 && vece >= MO_32; + switch (vece) { + case MO_16: + return have_avx512bw; + case MO_32: + case MO_64: + return have_avx2; + } + return 0; case INDEX_op_sarv_vec: - return have_avx2 && vece == MO_32; + switch (vece) { + case MO_16: + return have_avx512bw; + case MO_32: + return have_avx2; + case MO_64: + return have_avx512vl; + } + return 0; case INDEX_op_rotlv_vec: case INDEX_op_rotrv_vec: - return have_avx2 && vece >= MO_32 ? -1 : 0; + switch (vece) { + case MO_16: + return have_avx512vbmi2 ? -1 : 0; + case MO_32: + case MO_64: + return have_avx512vl ? 1 : have_avx2 ? -1 : 0; + } + return 0; case INDEX_op_mul_vec: - if (vece == MO_8) { - /* We can expand the operation for MO_8. */ + switch (vece) { + case MO_8: return -1; - } - if (vece == MO_64) { - return 0; + case MO_64: + return have_avx512dq; } return 1; @@ -3309,7 +3527,7 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) case INDEX_op_umin_vec: case INDEX_op_umax_vec: case INDEX_op_abs_vec: - return vece <= MO_32; + return vece <= MO_32 || have_avx512vl; default: return 0; @@ -3427,6 +3645,12 @@ static void expand_vec_rotli(TCGType type, unsigned vece, return; } + if (have_avx512vbmi2) { + vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece, + tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm); + return; + } + t = tcg_temp_new_vec(type); tcg_gen_shli_vec(vece, t, v1, imm); tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm); @@ -3434,31 +3658,19 @@ static void expand_vec_rotli(TCGType type, unsigned vece, tcg_temp_free_vec(t); } -static void expand_vec_rotls(TCGType type, unsigned vece, - TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh) +static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0, + TCGv_vec v1, TCGv_vec sh, bool right) { - TCGv_i32 rsh; TCGv_vec t; - tcg_debug_assert(vece != MO_8); + if (have_avx512vbmi2) { + vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec, + type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1), + tcgv_vec_arg(v1), tcgv_vec_arg(sh)); + return; + } t = tcg_temp_new_vec(type); - rsh = tcg_temp_new_i32(); - - tcg_gen_neg_i32(rsh, lsh); - tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1); - tcg_gen_shls_vec(vece, t, v1, lsh); - tcg_gen_shrs_vec(vece, v0, v1, rsh); - tcg_gen_or_vec(vece, v0, v0, t); - tcg_temp_free_vec(t); - tcg_temp_free_i32(rsh); -} - -static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0, - TCGv_vec v1, TCGv_vec sh, bool right) -{ - TCGv_vec t = tcg_temp_new_vec(type); - tcg_gen_dupi_vec(vece, t, 8 << vece); tcg_gen_sub_vec(vece, t, t, sh); if (right) { @@ -3472,6 +3684,35 @@ static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0, tcg_temp_free_vec(t); } +static void expand_vec_rotls(TCGType type, unsigned vece, + TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh) +{ + TCGv_vec t = tcg_temp_new_vec(type); + + tcg_debug_assert(vece != MO_8); + + if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) { + tcg_gen_dup_i32_vec(vece, t, lsh); + if (vece >= MO_32) { + tcg_gen_rotlv_vec(vece, v0, v1, t); + } else { + expand_vec_rotv(type, vece, v0, v1, t, false); + } + } else { + TCGv_i32 rsh = tcg_temp_new_i32(); + + tcg_gen_neg_i32(rsh, lsh); + tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1); + tcg_gen_shls_vec(vece, t, v1, lsh); + tcg_gen_shrs_vec(vece, v0, v1, rsh); + tcg_gen_or_vec(vece, v0, v0, t); + + tcg_temp_free_i32(rsh); + } + + tcg_temp_free_vec(t); +} + static void expand_vec_mul(TCGType type, unsigned vece, TCGv_vec v0, TCGv_vec v1, TCGv_vec v2) { @@ -3567,28 +3808,28 @@ static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0, fixup = NEED_SWAP | NEED_INV; break; case TCG_COND_LEU: - if (vece <= MO_32) { + if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) { fixup = NEED_UMIN; } else { fixup = NEED_BIAS | NEED_INV; } break; case TCG_COND_GTU: - if (vece <= MO_32) { + if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) { fixup = NEED_UMIN | NEED_INV; } else { fixup = NEED_BIAS; } break; case TCG_COND_GEU: - if (vece <= MO_32) { + if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) { fixup = NEED_UMAX; } else { fixup = NEED_BIAS | NEED_SWAP | NEED_INV; } break; case TCG_COND_LTU: - if (vece <= MO_32) { + if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) { fixup = NEED_UMAX | NEED_INV; } else { fixup = NEED_BIAS | NEED_SWAP; @@ -3839,12 +4080,12 @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count) static void tcg_target_init(TCGContext *s) { #ifdef CONFIG_CPUID_H - unsigned a, b, c, d, b7 = 0; + unsigned a, b, c, d, b7 = 0, c7 = 0; unsigned max = __get_cpuid_max(0, 0); if (max >= 7) { /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */ - __cpuid_count(7, 0, a, b7, c, d); + __cpuid_count(7, 0, a, b7, c7, d); have_bmi1 = (b7 & bit_BMI) != 0; have_bmi2 = (b7 & bit_BMI2) != 0; } @@ -3874,6 +4115,22 @@ static void tcg_target_init(TCGContext *s) if ((xcrl & 6) == 6) { have_avx1 = (c & bit_AVX) != 0; have_avx2 = (b7 & bit_AVX2) != 0; + + /* + * There are interesting instructions in AVX512, so long + * as we have AVX512VL, which indicates support for EVEX + * on sizes smaller than 512 bits. We are required to + * check that OPMASK and all extended ZMM state are enabled + * even if we're not using them -- the insns will fault. + */ + if ((xcrl & 0xe0) == 0xe0 + && (b7 & bit_AVX512F) + && (b7 & bit_AVX512VL)) { + have_avx512vl = true; + have_avx512bw = (b7 & bit_AVX512BW) != 0; + have_avx512dq = (b7 & bit_AVX512DQ) != 0; + have_avx512vbmi2 = (c7 & bit_AVX512VBMI2) != 0; + } } } } diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h index 3b2c9437a0..00fcbe297d 100644 --- a/tcg/i386/tcg-target.h +++ b/tcg/i386/tcg-target.h @@ -103,6 +103,10 @@ extern bool have_bmi1; extern bool have_popcnt; extern bool have_avx1; extern bool have_avx2; +extern bool have_avx512bw; +extern bool have_avx512dq; +extern bool have_avx512vbmi2; +extern bool have_avx512vl; extern bool have_movbe; /* optional instructions */ @@ -184,20 +188,23 @@ extern bool have_movbe; #define TCG_TARGET_HAS_v256 have_avx2 #define TCG_TARGET_HAS_andc_vec 1 -#define TCG_TARGET_HAS_orc_vec 0 -#define TCG_TARGET_HAS_not_vec 0 +#define TCG_TARGET_HAS_orc_vec have_avx512vl +#define TCG_TARGET_HAS_nand_vec have_avx512vl +#define TCG_TARGET_HAS_nor_vec have_avx512vl +#define TCG_TARGET_HAS_eqv_vec have_avx512vl +#define TCG_TARGET_HAS_not_vec have_avx512vl #define TCG_TARGET_HAS_neg_vec 0 #define TCG_TARGET_HAS_abs_vec 1 -#define TCG_TARGET_HAS_roti_vec 0 +#define TCG_TARGET_HAS_roti_vec have_avx512vl #define TCG_TARGET_HAS_rots_vec 0 -#define TCG_TARGET_HAS_rotv_vec 0 +#define TCG_TARGET_HAS_rotv_vec have_avx512vl #define TCG_TARGET_HAS_shi_vec 1 #define TCG_TARGET_HAS_shs_vec 1 #define TCG_TARGET_HAS_shv_vec have_avx2 #define TCG_TARGET_HAS_mul_vec 1 #define TCG_TARGET_HAS_sat_vec 1 #define TCG_TARGET_HAS_minmax_vec 1 -#define TCG_TARGET_HAS_bitsel_vec 0 +#define TCG_TARGET_HAS_bitsel_vec have_avx512vl #define TCG_TARGET_HAS_cmpsel_vec -1 #define TCG_TARGET_deposit_i32_valid(ofs, len) \ diff --git a/tcg/i386/tcg-target.opc.h b/tcg/i386/tcg-target.opc.h index 1312941800..b5f403e35e 100644 --- a/tcg/i386/tcg-target.opc.h +++ b/tcg/i386/tcg-target.opc.h @@ -33,3 +33,6 @@ DEF(x86_psrldq_vec, 1, 1, 1, IMPLVEC) DEF(x86_vperm2i128_vec, 1, 2, 1, IMPLVEC) DEF(x86_punpckl_vec, 1, 2, 0, IMPLVEC) DEF(x86_punpckh_vec, 1, 2, 0, IMPLVEC) +DEF(x86_vpshldi_vec, 1, 2, 1, IMPLVEC) +DEF(x86_vpshldv_vec, 1, 3, 0, IMPLVEC) +DEF(x86_vpshrdv_vec, 1, 3, 0, IMPLVEC) diff --git a/tcg/optimize.c b/tcg/optimize.c index e573000951..ae081ab29c 100644 --- a/tcg/optimize.c +++ b/tcg/optimize.c @@ -359,13 +359,13 @@ static uint64_t do_constant_folding_2(TCGOpcode op, uint64_t x, uint64_t y) CASE_OP_32_64_VEC(orc): return x | ~y; - CASE_OP_32_64(eqv): + CASE_OP_32_64_VEC(eqv): return ~(x ^ y); - CASE_OP_32_64(nand): + CASE_OP_32_64_VEC(nand): return ~(x & y); - CASE_OP_32_64(nor): + CASE_OP_32_64_VEC(nor): return ~(x | y); case INDEX_op_clz_i32: @@ -552,10 +552,10 @@ static bool do_constant_folding_cond_eq(TCGCond c) static int do_constant_folding_cond(TCGType type, TCGArg x, TCGArg y, TCGCond c) { - uint64_t xv = arg_info(x)->val; - uint64_t yv = arg_info(y)->val; - if (arg_is_const(x) && arg_is_const(y)) { + uint64_t xv = arg_info(x)->val; + uint64_t yv = arg_info(y)->val; + switch (type) { case TCG_TYPE_I32: return do_constant_folding_cond_32(xv, yv, c); @@ -567,7 +567,7 @@ static int do_constant_folding_cond(TCGType type, TCGArg x, } } else if (args_are_copies(x, y)) { return do_constant_folding_cond_eq(c); - } else if (arg_is_const(y) && yv == 0) { + } else if (arg_is_const(y) && arg_info(y)->val == 0) { switch (c) { case TCG_COND_LTU: return 0; @@ -2119,7 +2119,7 @@ void tcg_optimize(TCGContext *s) case INDEX_op_dup2_vec: done = fold_dup2(&ctx, op); break; - CASE_OP_32_64(eqv): + CASE_OP_32_64_VEC(eqv): done = fold_eqv(&ctx, op); break; CASE_OP_32_64(extract): @@ -2170,13 +2170,13 @@ void tcg_optimize(TCGContext *s) CASE_OP_32_64(mulu2): done = fold_multiply2(&ctx, op); break; - CASE_OP_32_64(nand): + CASE_OP_32_64_VEC(nand): done = fold_nand(&ctx, op); break; CASE_OP_32_64(neg): done = fold_neg(&ctx, op); break; - CASE_OP_32_64(nor): + CASE_OP_32_64_VEC(nor): done = fold_nor(&ctx, op); break; CASE_OP_32_64_VEC(not): diff --git a/tcg/ppc/tcg-target.c.inc b/tcg/ppc/tcg-target.c.inc index 69d22e08cb..1f3c5c171c 100644 --- a/tcg/ppc/tcg-target.c.inc +++ b/tcg/ppc/tcg-target.c.inc @@ -3122,6 +3122,9 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) case INDEX_op_xor_vec: case INDEX_op_andc_vec: case INDEX_op_not_vec: + case INDEX_op_nor_vec: + case INDEX_op_eqv_vec: + case INDEX_op_nand_vec: return 1; case INDEX_op_orc_vec: return have_isa_2_07; @@ -3400,6 +3403,15 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, case INDEX_op_orc_vec: insn = VORC; break; + case INDEX_op_nand_vec: + insn = VNAND; + break; + case INDEX_op_nor_vec: + insn = VNOR; + break; + case INDEX_op_eqv_vec: + insn = VEQV; + break; case INDEX_op_cmp_vec: switch (args[3]) { @@ -3787,6 +3799,9 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_xor_vec: case INDEX_op_andc_vec: case INDEX_op_orc_vec: + case INDEX_op_nor_vec: + case INDEX_op_eqv_vec: + case INDEX_op_nand_vec: case INDEX_op_cmp_vec: case INDEX_op_ssadd_vec: case INDEX_op_sssub_vec: diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h index c775c97b61..e6cf72503f 100644 --- a/tcg/ppc/tcg-target.h +++ b/tcg/ppc/tcg-target.h @@ -162,6 +162,9 @@ extern bool have_vsx; #define TCG_TARGET_HAS_andc_vec 1 #define TCG_TARGET_HAS_orc_vec have_isa_2_07 +#define TCG_TARGET_HAS_nand_vec have_isa_2_07 +#define TCG_TARGET_HAS_nor_vec 1 +#define TCG_TARGET_HAS_eqv_vec have_isa_2_07 #define TCG_TARGET_HAS_not_vec 1 #define TCG_TARGET_HAS_neg_vec have_isa_3_00 #define TCG_TARGET_HAS_abs_vec 0 diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc index d56c1e51e4..6e65828c09 100644 --- a/tcg/s390x/tcg-target.c.inc +++ b/tcg/s390x/tcg-target.c.inc @@ -290,7 +290,9 @@ typedef enum S390Opcode { VRRc_VMXL = 0xe7fd, VRRc_VN = 0xe768, VRRc_VNC = 0xe769, + VRRc_VNN = 0xe76e, VRRc_VNO = 0xe76b, + VRRc_VNX = 0xe76c, VRRc_VO = 0xe76a, VRRc_VOC = 0xe76f, VRRc_VPKS = 0xe797, /* we leave the m5 cs field 0 */ @@ -2805,6 +2807,15 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, case INDEX_op_xor_vec: tcg_out_insn(s, VRRc, VX, a0, a1, a2, 0); break; + case INDEX_op_nand_vec: + tcg_out_insn(s, VRRc, VNN, a0, a1, a2, 0); + break; + case INDEX_op_nor_vec: + tcg_out_insn(s, VRRc, VNO, a0, a1, a2, 0); + break; + case INDEX_op_eqv_vec: + tcg_out_insn(s, VRRc, VNX, a0, a1, a2, 0); + break; case INDEX_op_shli_vec: tcg_out_insn(s, VRSa, VESL, a0, a2, TCG_REG_NONE, a1, vece); @@ -2901,7 +2912,10 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) case INDEX_op_and_vec: case INDEX_op_andc_vec: case INDEX_op_bitsel_vec: + case INDEX_op_eqv_vec: + case INDEX_op_nand_vec: case INDEX_op_neg_vec: + case INDEX_op_nor_vec: case INDEX_op_not_vec: case INDEX_op_or_vec: case INDEX_op_orc_vec: @@ -3246,6 +3260,9 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_or_vec: case INDEX_op_orc_vec: case INDEX_op_xor_vec: + case INDEX_op_nand_vec: + case INDEX_op_nor_vec: + case INDEX_op_eqv_vec: case INDEX_op_cmp_vec: case INDEX_op_mul_vec: case INDEX_op_rotlv_vec: diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h index 69217d995b..23e2063667 100644 --- a/tcg/s390x/tcg-target.h +++ b/tcg/s390x/tcg-target.h @@ -145,6 +145,9 @@ extern uint64_t s390_facilities[3]; #define TCG_TARGET_HAS_andc_vec 1 #define TCG_TARGET_HAS_orc_vec HAVE_FACILITY(VECTOR_ENH1) +#define TCG_TARGET_HAS_nand_vec HAVE_FACILITY(VECTOR_ENH1) +#define TCG_TARGET_HAS_nor_vec 1 +#define TCG_TARGET_HAS_eqv_vec HAVE_FACILITY(VECTOR_ENH1) #define TCG_TARGET_HAS_not_vec 1 #define TCG_TARGET_HAS_neg_vec 1 #define TCG_TARGET_HAS_abs_vec 1 diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c index faf30f9cdd..463dabf515 100644 --- a/tcg/tcg-op-vec.c +++ b/tcg/tcg-op-vec.c @@ -371,23 +371,32 @@ void tcg_gen_orc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b) void tcg_gen_nand_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b) { - /* TODO: Add TCG_TARGET_HAS_nand_vec when adding a backend supports it. */ - tcg_gen_and_vec(0, r, a, b); - tcg_gen_not_vec(0, r, r); + if (TCG_TARGET_HAS_nand_vec) { + vec_gen_op3(INDEX_op_nand_vec, 0, r, a, b); + } else { + tcg_gen_and_vec(0, r, a, b); + tcg_gen_not_vec(0, r, r); + } } void tcg_gen_nor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b) { - /* TODO: Add TCG_TARGET_HAS_nor_vec when adding a backend supports it. */ - tcg_gen_or_vec(0, r, a, b); - tcg_gen_not_vec(0, r, r); + if (TCG_TARGET_HAS_nor_vec) { + vec_gen_op3(INDEX_op_nor_vec, 0, r, a, b); + } else { + tcg_gen_or_vec(0, r, a, b); + tcg_gen_not_vec(0, r, r); + } } void tcg_gen_eqv_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b) { - /* TODO: Add TCG_TARGET_HAS_eqv_vec when adding a backend supports it. */ - tcg_gen_xor_vec(0, r, a, b); - tcg_gen_not_vec(0, r, r); + if (TCG_TARGET_HAS_eqv_vec) { + vec_gen_op3(INDEX_op_eqv_vec, 0, r, a, b); + } else { + tcg_gen_xor_vec(0, r, a, b); + tcg_gen_not_vec(0, r, r); + } } static bool do_op2(unsigned vece, TCGv_vec r, TCGv_vec a, TCGOpcode opc) @@ -1407,6 +1407,12 @@ bool tcg_op_supported(TCGOpcode op) return have_vec && TCG_TARGET_HAS_andc_vec; case INDEX_op_orc_vec: return have_vec && TCG_TARGET_HAS_orc_vec; + case INDEX_op_nand_vec: + return have_vec && TCG_TARGET_HAS_nand_vec; + case INDEX_op_nor_vec: + return have_vec && TCG_TARGET_HAS_nor_vec; + case INDEX_op_eqv_vec: + return have_vec && TCG_TARGET_HAS_eqv_vec; case INDEX_op_mul_vec: return have_vec && TCG_TARGET_HAS_mul_vec; case INDEX_op_shli_vec: diff --git a/tcg/tci/tcg-target.c.inc b/tcg/tci/tcg-target.c.inc index 9ff1fa0832..98337c567a 100644 --- a/tcg/tci/tcg-target.c.inc +++ b/tcg/tci/tcg-target.c.inc @@ -197,7 +197,7 @@ static const int tcg_target_reg_alloc_order[] = { TCG_REG_R0, }; -#if MAX_OPC_PARAM_IARGS != 6 +#if MAX_OPC_PARAM_IARGS != 7 # error Fix needed, number of supported input arguments changed! #endif |