Diffstat (limited to 'tcg/i386/tcg-target.c.inc')
-rw-r--r--  tcg/i386/tcg-target.c.inc | 486
1 file changed, 311 insertions, 175 deletions
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 9a54ef7f8d..1bf50f1f62 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -133,6 +133,7 @@ static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800
#define TCG_CT_CONST_TST 0x1000
+#define TCG_CT_CONST_ZERO 0x2000
/* Registers used with L constraint, which are the first argument
registers on x86_64, and two random call clobbered registers on
@@ -226,6 +227,9 @@ static bool tcg_target_const_match(int64_t val, int ct,
if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
return 1;
}
+ if ((ct & TCG_CT_CONST_ZERO) && val == 0) {
+ return 1;
+ }
return 0;
}
@@ -409,6 +413,18 @@ static bool tcg_target_const_match(int64_t val, int ct,
#define OPC_UD2 (0x0b | P_EXT)
#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16)
+#define OPC_VPBLENDMB (0x66 | P_EXT38 | P_DATA16 | P_EVEX)
+#define OPC_VPBLENDMW (0x66 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
+#define OPC_VPBLENDMD (0x64 | P_EXT38 | P_DATA16 | P_EVEX)
+#define OPC_VPBLENDMQ (0x64 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
+#define OPC_VPCMPB (0x3f | P_EXT3A | P_DATA16 | P_EVEX)
+#define OPC_VPCMPUB (0x3e | P_EXT3A | P_DATA16 | P_EVEX)
+#define OPC_VPCMPW (0x3f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
+#define OPC_VPCMPUW (0x3e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
+#define OPC_VPCMPD (0x1f | P_EXT3A | P_DATA16 | P_EVEX)
+#define OPC_VPCMPUD (0x1e | P_EXT3A | P_DATA16 | P_EVEX)
+#define OPC_VPCMPQ (0x1f | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
+#define OPC_VPCMPUQ (0x1e | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPINSRB (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
@@ -417,6 +433,10 @@ static bool tcg_target_const_match(int64_t val, int ct,
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
+#define OPC_VPMOVM2B (0x28 | P_EXT38 | P_SIMDF3 | P_EVEX)
+#define OPC_VPMOVM2W (0x28 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
+#define OPC_VPMOVM2D (0x38 | P_EXT38 | P_SIMDF3 | P_EVEX)
+#define OPC_VPMOVM2Q (0x38 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
#define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPROLVD (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
@@ -442,6 +462,14 @@ static bool tcg_target_const_match(int64_t val, int ct,
#define OPC_VPSRLVD (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPTERNLOGQ (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
+#define OPC_VPTESTMB (0x26 | P_EXT38 | P_DATA16 | P_EVEX)
+#define OPC_VPTESTMW (0x26 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
+#define OPC_VPTESTMD (0x27 | P_EXT38 | P_DATA16 | P_EVEX)
+#define OPC_VPTESTMQ (0x27 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
+#define OPC_VPTESTNMB (0x26 | P_EXT38 | P_SIMDF3 | P_EVEX)
+#define OPC_VPTESTNMW (0x26 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
+#define OPC_VPTESTNMD (0x27 | P_EXT38 | P_SIMDF3 | P_EVEX)
+#define OPC_VPTESTNMQ (0x27 | P_EXT38 | P_SIMDF3 | P_VEXW | P_EVEX)
#define OPC_VZEROUPPER (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)
#define OPC_XCHG_EvGv (0x87)
@@ -658,7 +686,7 @@ static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
}
static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
- int rm, int index)
+ int rm, int index, int aaa, bool z)
{
/* The entire 4-byte evex prefix; with R' and V' set. */
uint32_t p = 0x08041062;
@@ -695,7 +723,9 @@ static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
p = deposit32(p, 16, 2, pp);
p = deposit32(p, 19, 4, ~v);
p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
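+ /* aaa = EVEX opmask register selector; z = zeroing- vs merging-masking. */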
+ p = deposit32(p, 24, 3, aaa);
p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
+ p = deposit32(p, 31, 1, z);
tcg_out32(s, p);
tcg_out8(s, opc);
@@ -704,13 +734,32 @@ static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
{
if (opc & P_EVEX) {
- tcg_out_evex_opc(s, opc, r, v, rm, 0);
+ tcg_out_evex_opc(s, opc, r, v, rm, 0, 0, false);
} else {
tcg_out_vex_opc(s, opc, r, v, rm, 0);
}
tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
+static void tcg_out_vex_modrm_type(TCGContext *s, int opc,
+ int r, int v, int rm, TCGType type)
+{
+ if (type == TCG_TYPE_V256) {
+ opc |= P_VEXL;
+ }
+ tcg_out_vex_modrm(s, opc, r, v, rm);
+}
+
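+/* As above, but emit an EVEX prefix carrying opmask aaa and zeroing flag z. */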
+static void tcg_out_evex_modrm_type(TCGContext *s, int opc, int r, int v,
+ int rm, int aaa, bool z, TCGType type)
+{
+ if (type == TCG_TYPE_V256) {
+ opc |= P_VEXL;
+ }
+ tcg_out_evex_opc(s, opc, r, v, rm, 0, aaa, z);
+ tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
+}
+
/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
We handle either RM and INDEX missing with a negative value. In 64-bit
mode for absolute addresses, ~RM is the size of the immediate operand
@@ -904,8 +953,7 @@ static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
TCGReg r, TCGReg a)
{
if (have_avx2) {
- int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
- tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
+ tcg_out_vex_modrm_type(s, avx2_dup_insn[vece], r, 0, a, type);
} else {
switch (vece) {
case MO_8:
@@ -3021,6 +3069,214 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
#undef OP_32_64
}
+static int const umin_insn[4] = {
+ OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
+};
+
+static int const umax_insn[4] = {
+ OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
+};
+
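+/*
+ * Emit a vector comparison without using AVX512 mask registers.
+ * Return true if the caller must invert the result in V0.
+ */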
+static bool tcg_out_cmp_vec_noinv(TCGContext *s, TCGType type, unsigned vece,
+ TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond)
+{
+ static int const cmpeq_insn[4] = {
+ OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
+ };
+ static int const cmpgt_insn[4] = {
+ OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
+ };
+
+ enum {
+ NEED_INV = 1,
+ NEED_SWAP = 2,
+ NEED_UMIN = 4,
+ NEED_UMAX = 8,
+ INVALID = 16,
+ };
+ static const uint8_t cond_fixup[16] = {
+ [0 ... 15] = INVALID,
+ [TCG_COND_EQ] = 0,
+ [TCG_COND_GT] = 0,
+ [TCG_COND_NE] = NEED_INV,
+ [TCG_COND_LE] = NEED_INV,
+ [TCG_COND_LT] = NEED_SWAP,
+ [TCG_COND_GE] = NEED_SWAP | NEED_INV,
+ [TCG_COND_LEU] = NEED_UMIN,
+ [TCG_COND_GTU] = NEED_UMIN | NEED_INV,
+ [TCG_COND_GEU] = NEED_UMAX,
+ [TCG_COND_LTU] = NEED_UMAX | NEED_INV,
+ };
+ int fixup = cond_fixup[cond];
+
+ assert(!(fixup & INVALID));
+
+ if (fixup & NEED_INV) {
+ cond = tcg_invert_cond(cond);
+ }
+
+ if (fixup & NEED_SWAP) {
+ TCGReg swap = v1;
+ v1 = v2;
+ v2 = swap;
+ cond = tcg_swap_cond(cond);
+ }
+
+ if (fixup & (NEED_UMIN | NEED_UMAX)) {
+ int op = (fixup & NEED_UMIN ? umin_insn[vece] : umax_insn[vece]);
+
+ /* avx2 does not have 64-bit min/max; adjusted during expand. */
+ assert(vece <= MO_32);
+
+ tcg_out_vex_modrm_type(s, op, TCG_TMP_VEC, v1, v2, type);
+ v2 = TCG_TMP_VEC;
+ cond = TCG_COND_EQ;
+ }
+
+ switch (cond) {
+ case TCG_COND_EQ:
+ tcg_out_vex_modrm_type(s, cmpeq_insn[vece], v0, v1, v2, type);
+ break;
+ case TCG_COND_GT:
+ tcg_out_vex_modrm_type(s, cmpgt_insn[vece], v0, v1, v2, type);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ return fixup & NEED_INV;
+}
+
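+/* Emit an AVX512 comparison of V1 and V2, leaving the result in k1. */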
+static void tcg_out_cmp_vec_k1(TCGContext *s, TCGType type, unsigned vece,
+ TCGReg v1, TCGReg v2, TCGCond cond)
+{
+ static const int cmpm_insn[2][4] = {
+ { OPC_VPCMPB, OPC_VPCMPW, OPC_VPCMPD, OPC_VPCMPQ },
+ { OPC_VPCMPUB, OPC_VPCMPUW, OPC_VPCMPUD, OPC_VPCMPUQ }
+ };
+ static const int testm_insn[4] = {
+ OPC_VPTESTMB, OPC_VPTESTMW, OPC_VPTESTMD, OPC_VPTESTMQ
+ };
+ static const int testnm_insn[4] = {
+ OPC_VPTESTNMB, OPC_VPTESTNMW, OPC_VPTESTNMD, OPC_VPTESTNMQ
+ };
+
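+ /* VPCMP* predicate immediates: 0 eq, 1 lt, 2 le, 4 ne, 5 nlt, 6 nle. */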
+ static const int cond_ext[16] = {
+ [TCG_COND_EQ] = 0,
+ [TCG_COND_NE] = 4,
+ [TCG_COND_LT] = 1,
+ [TCG_COND_LTU] = 1,
+ [TCG_COND_LE] = 2,
+ [TCG_COND_LEU] = 2,
+ [TCG_COND_NEVER] = 3,
+ [TCG_COND_GE] = 5,
+ [TCG_COND_GEU] = 5,
+ [TCG_COND_GT] = 6,
+ [TCG_COND_GTU] = 6,
+ [TCG_COND_ALWAYS] = 7,
+ };
+
+ switch (cond) {
+ case TCG_COND_TSTNE:
+ tcg_out_vex_modrm_type(s, testm_insn[vece], /* k1 */ 1, v1, v2, type);
+ break;
+ case TCG_COND_TSTEQ:
+ tcg_out_vex_modrm_type(s, testnm_insn[vece], /* k1 */ 1, v1, v2, type);
+ break;
+ default:
+ tcg_out_vex_modrm_type(s, cmpm_insn[is_unsigned_cond(cond)][vece],
+ /* k1 */ 1, v1, v2, type);
+ tcg_out8(s, cond_ext[cond]);
+ break;
+ }
+}
+
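+/* Expand the mask in k1 to a vector with -1 or 0 in each element. */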
+static void tcg_out_k1_to_vec(TCGContext *s, TCGType type,
+ unsigned vece, TCGReg dest)
+{
+ static const int movm_insn[] = {
+ OPC_VPMOVM2B, OPC_VPMOVM2W, OPC_VPMOVM2D, OPC_VPMOVM2Q
+ };
+ tcg_out_vex_modrm_type(s, movm_insn[vece], dest, 0, /* k1 */ 1, type);
+}
+
+static void tcg_out_cmp_vec(TCGContext *s, TCGType type, unsigned vece,
+ TCGReg v0, TCGReg v1, TCGReg v2, TCGCond cond)
+{
+ /*
+ * With avx512, we have a complete set of comparisons into mask.
+ * Unless there's a single insn expansion for the comparison,
+ * expand via a mask in k1.
+ */
+ if ((vece <= MO_16 ? have_avx512bw : have_avx512dq)
+ && cond != TCG_COND_EQ
+ && cond != TCG_COND_LT
+ && cond != TCG_COND_GT) {
+ tcg_out_cmp_vec_k1(s, type, vece, v1, v2, cond);
+ tcg_out_k1_to_vec(s, type, vece, v0);
+ return;
+ }
+
+ if (tcg_out_cmp_vec_noinv(s, type, vece, v0, v1, v2, cond)) {
+ tcg_out_dupi_vec(s, type, vece, TCG_TMP_VEC, -1);
+ tcg_out_vex_modrm_type(s, OPC_PXOR, v0, v0, TCG_TMP_VEC, type);
+ }
+}
+
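+/* Emit cmpsel as an AVX512 comparison into k1 followed by VPBLENDM. */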
+static void tcg_out_cmpsel_vec_k1(TCGContext *s, TCGType type, unsigned vece,
+ TCGReg v0, TCGReg c1, TCGReg c2,
+ TCGReg v3, TCGReg v4, TCGCond cond)
+{
+ static const int vpblendm_insn[] = {
+ OPC_VPBLENDMB, OPC_VPBLENDMW, OPC_VPBLENDMD, OPC_VPBLENDMQ
+ };
+ bool z = false;
+
+ /* Swap to place constant in V4 to take advantage of zero-masking. */
+ if (!v3) {
+ z = true;
+ v3 = v4;
+ cond = tcg_invert_cond(cond);
+ }
+
+ tcg_out_cmp_vec_k1(s, type, vece, c1, c2, cond);
+ tcg_out_evex_modrm_type(s, vpblendm_insn[vece], v0, v4, v3,
+ /* k1 */1, z, type);
+}
+
+static void tcg_out_cmpsel_vec(TCGContext *s, TCGType type, unsigned vece,
+ TCGReg v0, TCGReg c1, TCGReg c2,
+ TCGReg v3, TCGReg v4, TCGCond cond)
+{
+ bool inv;
+
+ if (vece <= MO_16 ? have_avx512bw : have_avx512vl) {
+ tcg_out_cmpsel_vec_k1(s, type, vece, v0, c1, c2, v3, v4, cond);
+ return;
+ }
+
+ inv = tcg_out_cmp_vec_noinv(s, type, vece, TCG_TMP_VEC, c1, c2, cond);
+
+ /*
+ * Since XMM0 is 16, the only way we get 0 into V3
+ * is via the constant zero constraint.
+ */
+ if (!v3) {
+ if (inv) {
+ tcg_out_vex_modrm_type(s, OPC_PAND, v0, TCG_TMP_VEC, v4, type);
+ } else {
+ tcg_out_vex_modrm_type(s, OPC_PANDN, v0, TCG_TMP_VEC, v4, type);
+ }
+ } else {
+ if (inv) {
+ TCGReg swap = v3;
+ v3 = v4;
+ v4 = swap;
+ }
+ tcg_out_vex_modrm_type(s, OPC_VPBLENDVB, v0, v4, v3, type);
+ tcg_out8(s, (TCG_TMP_VEC - TCG_REG_XMM0) << 4);
+ }
+}
+
static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
unsigned vecl, unsigned vece,
const TCGArg args[TCG_MAX_OP_ARGS],
@@ -3050,12 +3306,6 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
static int const shift_imm_insn[4] = {
OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
};
- static int const cmpeq_insn[4] = {
- OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
- };
- static int const cmpgt_insn[4] = {
- OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
- };
static int const punpckl_insn[4] = {
OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
};
@@ -3074,12 +3324,6 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
static int const smax_insn[4] = {
OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
};
- static int const umin_insn[4] = {
- OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
- };
- static int const umax_insn[4] = {
- OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
- };
static int const rotlv_insn[4] = {
OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
};
@@ -3231,29 +3475,21 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
goto gen_simd;
gen_simd:
tcg_debug_assert(insn != OPC_UD2);
- if (type == TCG_TYPE_V256) {
- insn |= P_VEXL;
- }
- tcg_out_vex_modrm(s, insn, a0, a1, a2);
+ tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type);
break;
case INDEX_op_cmp_vec:
- sub = args[3];
- if (sub == TCG_COND_EQ) {
- insn = cmpeq_insn[vece];
- } else if (sub == TCG_COND_GT) {
- insn = cmpgt_insn[vece];
- } else {
- g_assert_not_reached();
- }
- goto gen_simd;
+ tcg_out_cmp_vec(s, type, vece, a0, a1, a2, args[3]);
+ break;
+
+ case INDEX_op_cmpsel_vec:
+ tcg_out_cmpsel_vec(s, type, vece, a0, a1, a2,
+ args[3], args[4], args[5]);
+ break;
case INDEX_op_andc_vec:
insn = OPC_PANDN;
- if (type == TCG_TYPE_V256) {
- insn |= P_VEXL;
- }
- tcg_out_vex_modrm(s, insn, a0, a2, a1);
+ tcg_out_vex_modrm_type(s, insn, a0, a2, a1, type);
break;
case INDEX_op_shli_vec:
@@ -3281,10 +3517,7 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
goto gen_shift;
gen_shift:
tcg_debug_assert(vece != MO_8);
- if (type == TCG_TYPE_V256) {
- insn |= P_VEXL;
- }
- tcg_out_vex_modrm(s, insn, sub, a0, a1);
+ tcg_out_vex_modrm_type(s, insn, sub, a0, a1, type);
tcg_out8(s, a2);
break;
@@ -3361,22 +3594,10 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
gen_simd_imm8:
tcg_debug_assert(insn != OPC_UD2);
- if (type == TCG_TYPE_V256) {
- insn |= P_VEXL;
- }
- tcg_out_vex_modrm(s, insn, a0, a1, a2);
+ tcg_out_vex_modrm_type(s, insn, a0, a1, a2, type);
tcg_out8(s, sub);
break;
- case INDEX_op_x86_vpblendvb_vec:
- insn = OPC_VPBLENDVB;
- if (type == TCG_TYPE_V256) {
- insn |= P_VEXL;
- }
- tcg_out_vex_modrm(s, insn, a0, a1, a2);
- tcg_out8(s, args[3] << 4);
- break;
-
case INDEX_op_x86_psrldq_vec:
tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
tcg_out8(s, a2);
@@ -3642,8 +3863,9 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
return C_O1_I3(x, 0, x, x);
case INDEX_op_bitsel_vec:
- case INDEX_op_x86_vpblendvb_vec:
return C_O1_I3(x, x, x, x);
+ case INDEX_op_cmpsel_vec:
+ return C_O1_I4(x, x, x, xO, x);
default:
g_assert_not_reached();
@@ -3979,145 +4201,59 @@ static void expand_vec_mul(TCGType type, unsigned vece,
}
}
-static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
- TCGv_vec v1, TCGv_vec v2, TCGCond cond)
+static TCGCond expand_vec_cond(TCGType type, unsigned vece,
+ TCGArg *a1, TCGArg *a2, TCGCond cond)
{
- enum {
- NEED_INV = 1,
- NEED_SWAP = 2,
- NEED_BIAS = 4,
- NEED_UMIN = 8,
- NEED_UMAX = 16,
- };
- TCGv_vec t1, t2, t3;
- uint8_t fixup;
-
- switch (cond) {
- case TCG_COND_EQ:
- case TCG_COND_GT:
- fixup = 0;
- break;
- case TCG_COND_NE:
- case TCG_COND_LE:
- fixup = NEED_INV;
- break;
- case TCG_COND_LT:
- fixup = NEED_SWAP;
- break;
- case TCG_COND_GE:
- fixup = NEED_SWAP | NEED_INV;
- break;
- case TCG_COND_LEU:
- if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
- fixup = NEED_UMIN;
- } else {
- fixup = NEED_BIAS | NEED_INV;
- }
- break;
- case TCG_COND_GTU:
- if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
- fixup = NEED_UMIN | NEED_INV;
- } else {
- fixup = NEED_BIAS;
- }
- break;
- case TCG_COND_GEU:
- if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
- fixup = NEED_UMAX;
- } else {
- fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
- }
- break;
- case TCG_COND_LTU:
- if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
- fixup = NEED_UMAX | NEED_INV;
- } else {
- fixup = NEED_BIAS | NEED_SWAP;
- }
- break;
- default:
- g_assert_not_reached();
- }
-
- if (fixup & NEED_INV) {
- cond = tcg_invert_cond(cond);
- }
- if (fixup & NEED_SWAP) {
- t1 = v1, v1 = v2, v2 = t1;
- cond = tcg_swap_cond(cond);
- }
+ /*
+ * Without AVX512, there are no 64-bit unsigned comparisons.
+ * We must bias the inputs so that they become signed.
+ * All other swapping and inversion are handled during code generation.
+ */
+ if (vece == MO_64 && !have_avx512dq && is_unsigned_cond(cond)) {
+ TCGv_vec v1 = temp_tcgv_vec(arg_temp(*a1));
+ TCGv_vec v2 = temp_tcgv_vec(arg_temp(*a2));
+ TCGv_vec t1 = tcg_temp_new_vec(type);
+ TCGv_vec t2 = tcg_temp_new_vec(type);
+ TCGv_vec t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
- t1 = t2 = NULL;
- if (fixup & (NEED_UMIN | NEED_UMAX)) {
- t1 = tcg_temp_new_vec(type);
- if (fixup & NEED_UMIN) {
- tcg_gen_umin_vec(vece, t1, v1, v2);
- } else {
- tcg_gen_umax_vec(vece, t1, v1, v2);
- }
- v2 = t1;
- cond = TCG_COND_EQ;
- } else if (fixup & NEED_BIAS) {
- t1 = tcg_temp_new_vec(type);
- t2 = tcg_temp_new_vec(type);
- t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
tcg_gen_sub_vec(vece, t1, v1, t3);
tcg_gen_sub_vec(vece, t2, v2, t3);
- v1 = t1;
- v2 = t2;
+ *a1 = tcgv_vec_arg(t1);
+ *a2 = tcgv_vec_arg(t2);
cond = tcg_signed_cond(cond);
}
-
- tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
- /* Expand directly; do not recurse. */
- vec_gen_4(INDEX_op_cmp_vec, type, vece,
- tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
-
- if (t1) {
- tcg_temp_free_vec(t1);
- if (t2) {
- tcg_temp_free_vec(t2);
- }
- }
- return fixup & NEED_INV;
+ return cond;
}
-static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
- TCGv_vec v1, TCGv_vec v2, TCGCond cond)
+static void expand_vec_cmp(TCGType type, unsigned vece, TCGArg a0,
+ TCGArg a1, TCGArg a2, TCGCond cond)
{
- if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
- tcg_gen_not_vec(vece, v0, v0);
- }
+ cond = expand_vec_cond(type, vece, &a1, &a2, cond);
+ /* Expand directly; do not recurse. */
+ vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond);
}
-static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
- TCGv_vec c1, TCGv_vec c2,
- TCGv_vec v3, TCGv_vec v4, TCGCond cond)
+static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGArg a0,
+ TCGArg a1, TCGArg a2,
+ TCGArg a3, TCGArg a4, TCGCond cond)
{
- TCGv_vec t = tcg_temp_new_vec(type);
-
- if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
- /* Invert the sense of the compare by swapping arguments. */
- TCGv_vec x;
- x = v3, v3 = v4, v4 = x;
- }
- vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
- tcgv_vec_arg(v0), tcgv_vec_arg(v4),
- tcgv_vec_arg(v3), tcgv_vec_arg(t));
- tcg_temp_free_vec(t);
+ cond = expand_vec_cond(type, vece, &a1, &a2, cond);
+ /* Expand directly; do not recurse. */
+ vec_gen_6(INDEX_op_cmpsel_vec, type, vece, a0, a1, a2, a3, a4, cond);
}
void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
TCGArg a0, ...)
{
va_list va;
- TCGArg a2;
- TCGv_vec v0, v1, v2, v3, v4;
+ TCGArg a1, a2, a3, a4, a5;
+ TCGv_vec v0, v1, v2;
va_start(va, a0);
- v0 = temp_tcgv_vec(arg_temp(a0));
- v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
+ a1 = va_arg(va, TCGArg);
a2 = va_arg(va, TCGArg);
+ v0 = temp_tcgv_vec(arg_temp(a0));
+ v1 = temp_tcgv_vec(arg_temp(a1));
switch (opc) {
case INDEX_op_shli_vec:
@@ -4153,15 +4289,15 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
break;
case INDEX_op_cmp_vec:
- v2 = temp_tcgv_vec(arg_temp(a2));
- expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
+ a3 = va_arg(va, TCGArg);
+ expand_vec_cmp(type, vece, a0, a1, a2, a3);
break;
case INDEX_op_cmpsel_vec:
- v2 = temp_tcgv_vec(arg_temp(a2));
- v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
- v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
- expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
+ a3 = va_arg(va, TCGArg);
+ a4 = va_arg(va, TCGArg);
+ a5 = va_arg(va, TCGArg);
+ expand_vec_cmpsel(type, vece, a0, a1, a2, a3, a4, a5);
break;
default: