diff options
author | Richard Henderson <richard.henderson@linaro.org> | 2024-09-07 17:24:57 -0700 |
---|---|---|
committer | Richard Henderson <richard.henderson@linaro.org> | 2024-09-22 06:54:50 +0200 |
commit | d58967490238f8d4c941102ade649314785d3f48 (patch) | |
tree | d8ac4a6250f5238a51f1b4de129b03af5c862101 | |
parent | c044ec0d85cd94d1a986297c2e1f228408dddd76 (diff) |
tcg/i386: Implement cmpsel_vec with avx512 insns
The avx512 vpblendm* instructions exactly implement cmpsel,
using a predicate input. Of course this matches nicely with
the avx512 predicate comparison instructions.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
-rw-r--r-- | tcg/i386/tcg-target.c.inc | 44 |
1 file changed, 43 insertions, 1 deletion
```diff
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index f94a2a2385..d473dc7a5e 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -413,6 +413,10 @@ static bool tcg_target_const_match(int64_t val, int ct,
 #define OPC_UD2 (0x0b | P_EXT)
 #define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16)
 #define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16)
+#define OPC_VPBLENDMB (0x66 | P_EXT38 | P_DATA16 | P_EVEX)
+#define OPC_VPBLENDMW (0x66 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
+#define OPC_VPBLENDMD (0x64 | P_EXT38 | P_DATA16 | P_EVEX)
+#define OPC_VPBLENDMQ (0x64 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
 #define OPC_VPCMPB (0x3f | P_EXT3A | P_DATA16 | P_EVEX)
 #define OPC_VPCMPUB (0x3e | P_EXT3A | P_DATA16 | P_EVEX)
 #define OPC_VPCMPW (0x3f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
@@ -738,6 +742,16 @@ static void tcg_out_vex_modrm_type(TCGContext *s, int opc,
     tcg_out_vex_modrm(s, opc, r, v, rm);
 }
 
+static void tcg_out_evex_modrm_type(TCGContext *s, int opc, int r, int v,
+                                    int rm, int aaa, bool z, TCGType type)
+{
+    if (type == TCG_TYPE_V256) {
+        opc |= P_VEXL;
+    }
+    tcg_out_evex_opc(s, opc, r, v, rm, 0, aaa, z);
+    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
+}
+
 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
    We handle either RM and INDEX missing with a negative value. In 64-bit
    mode for absolute addresses, ~RM is the size of the immediate operand
@@ -3183,11 +3197,39 @@ static void tcg_out_cmp_vec(TCGContext *s, TCGType type, unsigned vece,
     }
 }
 
+static void tcg_out_cmpsel_vec_k1(TCGContext *s, TCGType type, unsigned vece,
+                                  TCGReg v0, TCGReg c1, TCGReg c2,
+                                  TCGReg v3, TCGReg v4, TCGCond cond)
+{
+    static const int vpblendm_insn[] = {
+        OPC_VPBLENDMB, OPC_VPBLENDMW, OPC_VPBLENDMD, OPC_VPBLENDMQ
+    };
+    bool z = false;
+
+    /* Swap to place constant in V4 to take advantage of zero-masking. */
+    if (!v3) {
+        z = true;
+        v3 = v4;
+        cond = tcg_invert_cond(cond);
+    }
+
+    tcg_out_cmp_vec_k1(s, type, vece, c1, c2, cond);
+    tcg_out_evex_modrm_type(s, vpblendm_insn[vece], v0, v4, v3,
+                            /* k1 */1, z, type);
+}
+
 static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece,
                                TCGReg v0, TCGReg c1, TCGReg c2,
                                TCGReg v3, TCGReg v4, TCGCond cond)
 {
-    bool inv = tcg_out_cmp_vec_noinv(s, type, vece, TCG_TMP_VEC, c1, c2, cond);
+    bool inv;
+
+    if (vece <= MO_16 ? have_avx512bw : have_avx512vl) {
+        tcg_out_cmpsel_vec_k1(s, type, vece, v0, c1, c2, v3, v4, cond);
+        return;
+    }
+
+    inv = tcg_out_cmp_vec_noinv(s, type, vece, TCG_TMP_VEC, c1, c2, cond);
 
     /*
      * Since XMM0 is 16, the only way we get 0 into V3
```