Diffstat (limited to 'tcg/i386/tcg-target.inc.c')
-rw-r--r-- | tcg/i386/tcg-target.inc.c | 590
1 file changed, 359 insertions, 231 deletions
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c index 1b4e3b80e1..4d84aea3a9 100644 --- a/tcg/i386/tcg-target.inc.c +++ b/tcg/i386/tcg-target.inc.c @@ -329,6 +329,7 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type, #define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */ #define OPC_ANDN (0xf2 | P_EXT38) #define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3)) +#define OPC_AND_GvEv (OPC_ARITH_GvEv | (ARITH_AND << 3)) #define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16) #define OPC_BSF (0xbc | P_EXT) #define OPC_BSR (0xbd | P_EXT) @@ -377,6 +378,10 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type, #define OPC_PADDW (0xfd | P_EXT | P_DATA16) #define OPC_PADDD (0xfe | P_EXT | P_DATA16) #define OPC_PADDQ (0xd4 | P_EXT | P_DATA16) +#define OPC_PADDSB (0xec | P_EXT | P_DATA16) +#define OPC_PADDSW (0xed | P_EXT | P_DATA16) +#define OPC_PADDUB (0xdc | P_EXT | P_DATA16) +#define OPC_PADDUW (0xdd | P_EXT | P_DATA16) #define OPC_PAND (0xdb | P_EXT | P_DATA16) #define OPC_PANDN (0xdf | P_EXT | P_DATA16) #define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16) @@ -388,6 +393,18 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type, #define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16) #define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16) #define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16) +#define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16) +#define OPC_PMAXSW (0xee | P_EXT | P_DATA16) +#define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16) +#define OPC_PMAXUB (0xde | P_EXT | P_DATA16) +#define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16) +#define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16) +#define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16) +#define OPC_PMINSW (0xea | P_EXT | P_DATA16) +#define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16) +#define OPC_PMINUB (0xda | P_EXT | P_DATA16) +#define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16) +#define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16) #define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16) #define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16) #define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16) @@ -408,6 +425,10 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type, #define OPC_PSUBW (0xf9 | P_EXT | P_DATA16) #define OPC_PSUBD (0xfa | P_EXT | P_DATA16) #define OPC_PSUBQ (0xfb | P_EXT | P_DATA16) +#define OPC_PSUBSB (0xe8 | P_EXT | P_DATA16) +#define OPC_PSUBSW (0xe9 | P_EXT | P_DATA16) +#define OPC_PSUBUB (0xd8 | P_EXT | P_DATA16) +#define OPC_PSUBUW (0xd9 | P_EXT | P_DATA16) #define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16) #define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16) #define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16) @@ -1621,7 +1642,7 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi, } if (TCG_TYPE_PTR == TCG_TYPE_I64) { hrexw = P_REXW; - if (TARGET_PAGE_BITS + CPU_TLB_BITS > 32) { + if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) { tlbtype = TCG_TYPE_I64; tlbrexw = P_REXW; } @@ -1629,6 +1650,15 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi, } tcg_out_mov(s, tlbtype, r0, addrlo); + tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0, + TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS); + + tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0, + offsetof(CPUArchState, tlb_mask[mem_index])); + + tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0, + offsetof(CPUArchState, tlb_table[mem_index])); + /* If the required alignment is at least as large as the access, simply copy the address and mask. 
For lesser alignments, check that we don't cross pages for the complete access. */ @@ -1638,20 +1668,10 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi, tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask); } tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask; - - tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0, - TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS); - tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0); - tgen_arithi(s, ARITH_AND + tlbrexw, r0, - (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0); - - tcg_out_modrm_sib_offset(s, OPC_LEA + hrexw, r0, TCG_AREG0, r0, 0, - offsetof(CPUArchState, tlb_table[mem_index][0]) - + which); /* cmp 0(r0), r1 */ - tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, 0); + tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which); /* Prepare for both the fast path add of the tlb addend, and the slow path function argument setup. */ @@ -1664,7 +1684,7 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi, if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) { /* cmp 4(r0), addrhi */ - tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, 4); + tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4); /* jne slow_path */ tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0); @@ -1676,7 +1696,7 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi, /* add addend(r0), r1 */ tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0, - offsetof(CPUTLBEntry, addend) - which); + offsetof(CPUTLBEntry, addend)); } /* @@ -2591,9 +2611,21 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, static int const add_insn[4] = { OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ }; + static int const ssadd_insn[4] = { + OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2 + }; + static int const usadd_insn[4] = { + OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2 + }; static int const sub_insn[4] = { OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ }; + static int const sssub_insn[4] = { + OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2 + }; + static int const ussub_insn[4] = { + OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2 + }; static int const mul_insn[4] = { OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2 }; @@ -2618,6 +2650,18 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, static int const packus_insn[4] = { OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2 }; + static int const smin_insn[4] = { + OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2 + }; + static int const smax_insn[4] = { + OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2 + }; + static int const umin_insn[4] = { + OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2 + }; + static int const umax_insn[4] = { + OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2 + }; TCGType type = vecl + TCG_TYPE_V64; int insn, sub; @@ -2631,9 +2675,21 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, case INDEX_op_add_vec: insn = add_insn[vece]; goto gen_simd; + case INDEX_op_ssadd_vec: + insn = ssadd_insn[vece]; + goto gen_simd; + case INDEX_op_usadd_vec: + insn = usadd_insn[vece]; + goto gen_simd; case INDEX_op_sub_vec: insn = sub_insn[vece]; goto gen_simd; + case INDEX_op_sssub_vec: + insn = sssub_insn[vece]; + goto gen_simd; + case INDEX_op_ussub_vec: + insn = ussub_insn[vece]; + goto gen_simd; case INDEX_op_mul_vec: insn = mul_insn[vece]; goto gen_simd; @@ -2646,6 +2702,18 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, case INDEX_op_xor_vec: insn = OPC_PXOR; goto gen_simd; + case INDEX_op_smin_vec: + insn = smin_insn[vece]; + goto gen_simd; + case INDEX_op_umin_vec: + insn = 
umin_insn[vece]; + goto gen_simd; + case INDEX_op_smax_vec: + insn = smax_insn[vece]; + goto gen_simd; + case INDEX_op_umax_vec: + insn = umax_insn[vece]; + goto gen_simd; case INDEX_op_x86_punpckl_vec: insn = punpckl_insn[vece]; goto gen_simd; @@ -3007,6 +3075,14 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op) case INDEX_op_or_vec: case INDEX_op_xor_vec: case INDEX_op_andc_vec: + case INDEX_op_ssadd_vec: + case INDEX_op_usadd_vec: + case INDEX_op_sssub_vec: + case INDEX_op_ussub_vec: + case INDEX_op_smin_vec: + case INDEX_op_umin_vec: + case INDEX_op_smax_vec: + case INDEX_op_umax_vec: case INDEX_op_cmp_vec: case INDEX_op_x86_shufps_vec: case INDEX_op_x86_blend_vec: @@ -3074,258 +3150,310 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) } return 1; + case INDEX_op_ssadd_vec: + case INDEX_op_usadd_vec: + case INDEX_op_sssub_vec: + case INDEX_op_ussub_vec: + return vece <= MO_16; + case INDEX_op_smin_vec: + case INDEX_op_smax_vec: + case INDEX_op_umin_vec: + case INDEX_op_umax_vec: + return vece <= MO_32 ? 1 : -1; + default: return 0; } } -void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, - TCGArg a0, ...) +static void expand_vec_shi(TCGType type, unsigned vece, bool shr, + TCGv_vec v0, TCGv_vec v1, TCGArg imm) { - va_list va; - TCGArg a1, a2; - TCGv_vec v0, t1, t2, t3, t4; + TCGv_vec t1, t2; - va_start(va, a0); - v0 = temp_tcgv_vec(arg_temp(a0)); + tcg_debug_assert(vece == MO_8); - switch (opc) { - case INDEX_op_shli_vec: - case INDEX_op_shri_vec: - tcg_debug_assert(vece == MO_8); - a1 = va_arg(va, TCGArg); - a2 = va_arg(va, TCGArg); - /* Unpack to W, shift, and repack. Tricky bits: - (1) Use punpck*bw x,x to produce DDCCBBAA, - i.e. duplicate in other half of the 16-bit lane. - (2) For right-shift, add 8 so that the high half of - the lane becomes zero. For left-shift, we must - shift up and down again. - (3) Step 2 leaves high half zero such that PACKUSWB - (pack with unsigned saturation) does not modify - the quantity. */ + t1 = tcg_temp_new_vec(type); + t2 = tcg_temp_new_vec(type); + + /* Unpack to W, shift, and repack. Tricky bits: + (1) Use punpck*bw x,x to produce DDCCBBAA, + i.e. duplicate in other half of the 16-bit lane. + (2) For right-shift, add 8 so that the high half of + the lane becomes zero. For left-shift, we must + shift up and down again. + (3) Step 2 leaves high half zero such that PACKUSWB + (pack with unsigned saturation) does not modify + the quantity. */ + vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, + tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); + vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, + tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); + + if (shr) { + tcg_gen_shri_vec(MO_16, t1, t1, imm + 8); + tcg_gen_shri_vec(MO_16, t2, t2, imm + 8); + } else { + tcg_gen_shli_vec(MO_16, t1, t1, imm + 8); + tcg_gen_shli_vec(MO_16, t2, t2, imm + 8); + tcg_gen_shri_vec(MO_16, t1, t1, 8); + tcg_gen_shri_vec(MO_16, t2, t2, 8); + } + + vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, + tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); + tcg_temp_free_vec(t1); + tcg_temp_free_vec(t2); +} + +static void expand_vec_sari(TCGType type, unsigned vece, + TCGv_vec v0, TCGv_vec v1, TCGArg imm) +{ + TCGv_vec t1, t2; + + switch (vece) { + case MO_8: + /* Unpack to W, shift, and repack, as in expand_vec_shi. 
*/ t1 = tcg_temp_new_vec(type); t2 = tcg_temp_new_vec(type); vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, - tcgv_vec_arg(t1), a1, a1); + tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, - tcgv_vec_arg(t2), a1, a1); - if (opc == INDEX_op_shri_vec) { - vec_gen_3(INDEX_op_shri_vec, type, MO_16, - tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8); - vec_gen_3(INDEX_op_shri_vec, type, MO_16, - tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8); - } else { - vec_gen_3(INDEX_op_shli_vec, type, MO_16, - tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8); - vec_gen_3(INDEX_op_shli_vec, type, MO_16, - tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8); - vec_gen_3(INDEX_op_shri_vec, type, MO_16, - tcgv_vec_arg(t1), tcgv_vec_arg(t1), 8); - vec_gen_3(INDEX_op_shri_vec, type, MO_16, - tcgv_vec_arg(t2), tcgv_vec_arg(t2), 8); - } - vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, - a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2)); + tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1)); + tcg_gen_sari_vec(MO_16, t1, t1, imm + 8); + tcg_gen_sari_vec(MO_16, t2, t2, imm + 8); + vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8, + tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2)); tcg_temp_free_vec(t1); tcg_temp_free_vec(t2); break; - case INDEX_op_sari_vec: - a1 = va_arg(va, TCGArg); - a2 = va_arg(va, TCGArg); - if (vece == MO_8) { - /* Unpack to W, shift, and repack, as above. */ - t1 = tcg_temp_new_vec(type); - t2 = tcg_temp_new_vec(type); - vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, - tcgv_vec_arg(t1), a1, a1); - vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, - tcgv_vec_arg(t2), a1, a1); - vec_gen_3(INDEX_op_sari_vec, type, MO_16, - tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8); - vec_gen_3(INDEX_op_sari_vec, type, MO_16, - tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8); - vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8, - a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2)); - tcg_temp_free_vec(t1); - tcg_temp_free_vec(t2); - break; - } - tcg_debug_assert(vece == MO_64); - /* MO_64: If the shift is <= 32, we can emulate the sign extend by - performing an arithmetic 32-bit shift and overwriting the high - half of the result (note that the ISA says shift of 32 is valid). */ - if (a2 <= 32) { + case MO_64: + if (imm <= 32) { + /* We can emulate a small sign extend by performing an arithmetic + * 32-bit shift and overwriting the high half of a 64-bit logical + * shift (note that the ISA says shift of 32 is valid). + */ t1 = tcg_temp_new_vec(type); - vec_gen_3(INDEX_op_sari_vec, type, MO_32, tcgv_vec_arg(t1), a1, a2); - vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2); + tcg_gen_sari_vec(MO_32, t1, v1, imm); + tcg_gen_shri_vec(MO_64, v0, v1, imm); vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32, - a0, a0, tcgv_vec_arg(t1), 0xaa); + tcgv_vec_arg(v0), tcgv_vec_arg(v0), + tcgv_vec_arg(t1), 0xaa); + tcg_temp_free_vec(t1); + } else { + /* Otherwise we will need to use a compare vs 0 to produce + * the sign-extend, shift and merge. + */ + t1 = tcg_const_zeros_vec(type); + tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1); + tcg_gen_shri_vec(MO_64, v0, v1, imm); + tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm); + tcg_gen_or_vec(MO_64, v0, v0, t1); tcg_temp_free_vec(t1); - break; } - /* Otherwise we will need to use a compare vs 0 to produce the - sign-extend, shift and merge. 
*/ - t1 = tcg_temp_new_vec(type); - t2 = tcg_const_zeros_vec(type); - vec_gen_4(INDEX_op_cmp_vec, type, MO_64, - tcgv_vec_arg(t1), tcgv_vec_arg(t2), a1, TCG_COND_GT); + break; + + default: + g_assert_not_reached(); + } +} + +static void expand_vec_mul(TCGType type, unsigned vece, + TCGv_vec v0, TCGv_vec v1, TCGv_vec v2) +{ + TCGv_vec t1, t2, t3, t4; + + tcg_debug_assert(vece == MO_8); + + /* + * Unpack v1 bytes to words, 0 | x. + * Unpack v2 bytes to words, y | 0. + * This leaves the 8-bit result, x * y, with 8 bits of right padding. + * Shift logical right by 8 bits to clear the high 8 bytes before + * using an unsigned saturated pack. + * + * The difference between the V64, V128 and V256 cases is merely how + * we distribute the expansion between temporaries. + */ + switch (type) { + case TCG_TYPE_V64: + t1 = tcg_temp_new_vec(TCG_TYPE_V128); + t2 = tcg_temp_new_vec(TCG_TYPE_V128); + tcg_gen_dup16i_vec(t2, 0); + vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, + tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t2)); + vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, + tcgv_vec_arg(t2), tcgv_vec_arg(t2), tcgv_vec_arg(v2)); + tcg_gen_mul_vec(MO_16, t1, t1, t2); + tcg_gen_shri_vec(MO_16, t1, t1, 8); + vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8, + tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1)); + tcg_temp_free_vec(t1); tcg_temp_free_vec(t2); - vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2); - vec_gen_3(INDEX_op_shli_vec, type, MO_64, - tcgv_vec_arg(t1), tcgv_vec_arg(t1), 64 - a2); - vec_gen_3(INDEX_op_or_vec, type, MO_64, a0, a0, tcgv_vec_arg(t1)); + break; + + case TCG_TYPE_V128: + case TCG_TYPE_V256: + t1 = tcg_temp_new_vec(type); + t2 = tcg_temp_new_vec(type); + t3 = tcg_temp_new_vec(type); + t4 = tcg_temp_new_vec(type); + tcg_gen_dup16i_vec(t4, 0); + vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, + tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t4)); + vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, + tcgv_vec_arg(t2), tcgv_vec_arg(t4), tcgv_vec_arg(v2)); + vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, + tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(t4)); + vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, + tcgv_vec_arg(t4), tcgv_vec_arg(t4), tcgv_vec_arg(v2)); + tcg_gen_mul_vec(MO_16, t1, t1, t2); + tcg_gen_mul_vec(MO_16, t3, t3, t4); + tcg_gen_shri_vec(MO_16, t1, t1, 8); + tcg_gen_shri_vec(MO_16, t3, t3, 8); + vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, + tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3)); tcg_temp_free_vec(t1); + tcg_temp_free_vec(t2); + tcg_temp_free_vec(t3); + tcg_temp_free_vec(t4); break; - case INDEX_op_mul_vec: - tcg_debug_assert(vece == MO_8); - a1 = va_arg(va, TCGArg); - a2 = va_arg(va, TCGArg); - switch (type) { - case TCG_TYPE_V64: - t1 = tcg_temp_new_vec(TCG_TYPE_V128); - t2 = tcg_temp_new_vec(TCG_TYPE_V128); - tcg_gen_dup16i_vec(t2, 0); - vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, - tcgv_vec_arg(t1), a1, tcgv_vec_arg(t2)); - vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, - tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2); - tcg_gen_mul_vec(MO_16, t1, t1, t2); - tcg_gen_shri_vec(MO_16, t1, t1, 8); - vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8, - a0, tcgv_vec_arg(t1), tcgv_vec_arg(t1)); - tcg_temp_free_vec(t1); - tcg_temp_free_vec(t2); - break; + default: + g_assert_not_reached(); + } +} - case TCG_TYPE_V128: - t1 = tcg_temp_new_vec(TCG_TYPE_V128); - t2 = tcg_temp_new_vec(TCG_TYPE_V128); - t3 = tcg_temp_new_vec(TCG_TYPE_V128); - t4 = tcg_temp_new_vec(TCG_TYPE_V128); - 
tcg_gen_dup16i_vec(t4, 0); - vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, - tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4)); - vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, - tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2); - vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8, - tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4)); - vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8, - tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2); - tcg_gen_mul_vec(MO_16, t1, t1, t2); - tcg_gen_mul_vec(MO_16, t3, t3, t4); - tcg_gen_shri_vec(MO_16, t1, t1, 8); - tcg_gen_shri_vec(MO_16, t3, t3, 8); - vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8, - a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3)); - tcg_temp_free_vec(t1); - tcg_temp_free_vec(t2); - tcg_temp_free_vec(t3); - tcg_temp_free_vec(t4); - break; +static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0, + TCGv_vec v1, TCGv_vec v2, TCGCond cond) +{ + enum { + NEED_SWAP = 1, + NEED_INV = 2, + NEED_BIAS = 4 + }; + static const uint8_t fixups[16] = { + [0 ... 15] = -1, + [TCG_COND_EQ] = 0, + [TCG_COND_NE] = NEED_INV, + [TCG_COND_GT] = 0, + [TCG_COND_LT] = NEED_SWAP, + [TCG_COND_LE] = NEED_INV, + [TCG_COND_GE] = NEED_SWAP | NEED_INV, + [TCG_COND_GTU] = NEED_BIAS, + [TCG_COND_LTU] = NEED_BIAS | NEED_SWAP, + [TCG_COND_LEU] = NEED_BIAS | NEED_INV, + [TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV, + }; + TCGv_vec t1, t2; + uint8_t fixup; - case TCG_TYPE_V256: - t1 = tcg_temp_new_vec(TCG_TYPE_V256); - t2 = tcg_temp_new_vec(TCG_TYPE_V256); - t3 = tcg_temp_new_vec(TCG_TYPE_V256); - t4 = tcg_temp_new_vec(TCG_TYPE_V256); - tcg_gen_dup16i_vec(t4, 0); - /* a1: A[0-7] ... D[0-7]; a2: W[0-7] ... Z[0-7] - t1: extends of B[0-7], D[0-7] - t2: extends of X[0-7], Z[0-7] - t3: extends of A[0-7], C[0-7] - t4: extends of W[0-7], Y[0-7]. */ - vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8, - tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4)); - vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8, - tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2); - vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8, - tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4)); - vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8, - tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2); - /* t1: BX DZ; t2: AW CY. */ - tcg_gen_mul_vec(MO_16, t1, t1, t2); - tcg_gen_mul_vec(MO_16, t3, t3, t4); - tcg_gen_shri_vec(MO_16, t1, t1, 8); - tcg_gen_shri_vec(MO_16, t3, t3, 8); - /* a0: AW BX CY DZ. */ - vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V256, MO_8, - a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3)); - tcg_temp_free_vec(t1); - tcg_temp_free_vec(t2); - tcg_temp_free_vec(t3); - tcg_temp_free_vec(t4); - break; + fixup = fixups[cond & 15]; + tcg_debug_assert(fixup != 0xff); - default: - g_assert_not_reached(); + if (fixup & NEED_INV) { + cond = tcg_invert_cond(cond); + } + if (fixup & NEED_SWAP) { + t1 = v1, v1 = v2, v2 = t1; + cond = tcg_swap_cond(cond); + } + + t1 = t2 = NULL; + if (fixup & NEED_BIAS) { + t1 = tcg_temp_new_vec(type); + t2 = tcg_temp_new_vec(type); + tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1)); + tcg_gen_sub_vec(vece, t1, v1, t2); + tcg_gen_sub_vec(vece, t2, v2, t2); + v1 = t1; + v2 = t2; + cond = tcg_signed_cond(cond); + } + + tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT); + /* Expand directly; do not recurse. 
*/ + vec_gen_4(INDEX_op_cmp_vec, type, vece, + tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond); + + if (t1) { + tcg_temp_free_vec(t1); + if (t2) { + tcg_temp_free_vec(t2); } - break; + } + if (fixup & NEED_INV) { + tcg_gen_not_vec(vece, v0, v0); + } +} - case INDEX_op_cmp_vec: - { - enum { - NEED_SWAP = 1, - NEED_INV = 2, - NEED_BIAS = 4 - }; - static const uint8_t fixups[16] = { - [0 ... 15] = -1, - [TCG_COND_EQ] = 0, - [TCG_COND_NE] = NEED_INV, - [TCG_COND_GT] = 0, - [TCG_COND_LT] = NEED_SWAP, - [TCG_COND_LE] = NEED_INV, - [TCG_COND_GE] = NEED_SWAP | NEED_INV, - [TCG_COND_GTU] = NEED_BIAS, - [TCG_COND_LTU] = NEED_BIAS | NEED_SWAP, - [TCG_COND_LEU] = NEED_BIAS | NEED_INV, - [TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV, - }; +static void expand_vec_minmax(TCGType type, unsigned vece, + TCGCond cond, bool min, + TCGv_vec v0, TCGv_vec v1, TCGv_vec v2) +{ + TCGv_vec t1 = tcg_temp_new_vec(type); - TCGCond cond; - uint8_t fixup; + tcg_debug_assert(vece == MO_64); - a1 = va_arg(va, TCGArg); - a2 = va_arg(va, TCGArg); - cond = va_arg(va, TCGArg); - fixup = fixups[cond & 15]; - tcg_debug_assert(fixup != 0xff); + tcg_gen_cmp_vec(cond, vece, t1, v1, v2); + if (min) { + TCGv_vec t2; + t2 = v1, v1 = v2, v2 = t2; + } + vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece, + tcgv_vec_arg(v0), tcgv_vec_arg(v1), + tcgv_vec_arg(v2), tcgv_vec_arg(t1)); + tcg_temp_free_vec(t1); +} - if (fixup & NEED_INV) { - cond = tcg_invert_cond(cond); - } - if (fixup & NEED_SWAP) { - TCGArg t; - t = a1, a1 = a2, a2 = t; - cond = tcg_swap_cond(cond); - } +void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, + TCGArg a0, ...) +{ + va_list va; + TCGArg a2; + TCGv_vec v0, v1, v2; - t1 = t2 = NULL; - if (fixup & NEED_BIAS) { - t1 = tcg_temp_new_vec(type); - t2 = tcg_temp_new_vec(type); - tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1)); - tcg_gen_sub_vec(vece, t1, temp_tcgv_vec(arg_temp(a1)), t2); - tcg_gen_sub_vec(vece, t2, temp_tcgv_vec(arg_temp(a2)), t2); - a1 = tcgv_vec_arg(t1); - a2 = tcgv_vec_arg(t2); - cond = tcg_signed_cond(cond); - } + va_start(va, a0); + v0 = temp_tcgv_vec(arg_temp(a0)); + v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg))); + a2 = va_arg(va, TCGArg); - tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT); - vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond); + switch (opc) { + case INDEX_op_shli_vec: + case INDEX_op_shri_vec: + expand_vec_shi(type, vece, opc == INDEX_op_shri_vec, v0, v1, a2); + break; - if (fixup & NEED_BIAS) { - tcg_temp_free_vec(t1); - tcg_temp_free_vec(t2); - } - if (fixup & NEED_INV) { - tcg_gen_not_vec(vece, v0, v0); - } - } + case INDEX_op_sari_vec: + expand_vec_sari(type, vece, v0, v1, a2); + break; + + case INDEX_op_mul_vec: + v2 = temp_tcgv_vec(arg_temp(a2)); + expand_vec_mul(type, vece, v0, v1, v2); + break; + + case INDEX_op_cmp_vec: + v2 = temp_tcgv_vec(arg_temp(a2)); + expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg)); + break; + + case INDEX_op_smin_vec: + v2 = temp_tcgv_vec(arg_temp(a2)); + expand_vec_minmax(type, vece, TCG_COND_GT, true, v0, v1, v2); + break; + case INDEX_op_smax_vec: + v2 = temp_tcgv_vec(arg_temp(a2)); + expand_vec_minmax(type, vece, TCG_COND_GT, false, v0, v1, v2); + break; + case INDEX_op_umin_vec: + v2 = temp_tcgv_vec(arg_temp(a2)); + expand_vec_minmax(type, vece, TCG_COND_GTU, true, v0, v1, v2); + break; + case INDEX_op_umax_vec: + v2 = temp_tcgv_vec(arg_temp(a2)); + expand_vec_minmax(type, vece, TCG_COND_GTU, false, v0, v1, v2); break; default: |
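The hunks above make two independent changes: the softmmu TLB fast path now loads a per-MMU-index mask and table base from env instead of masking with the fixed (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS and indexing tlb_table[mem_index][0], and the vector backend gains saturating arithmetic, min/max, and out-of-line expansion helpers. The sketches that follow model the non-obvious arithmetic in plain, standalone C; all names are illustrative, not QEMU APIs. First, a minimal scalar sketch of the entry lookup the emitted shr/and/add now performs, with hypothetical fields standing in for the real CPUArchState layout:

    #include <stdint.h>

    /* Hypothetical stand-ins; the real code loads these from CPUArchState. */
    typedef struct {
        uint64_t tag;      /* compared against the masked guest address */
        uint64_t addend;   /* host - guest offset used on the fast path */
    } TlbEntrySketch;

    typedef struct {
        uintptr_t mask;          /* (n_entries - 1) << CPU_TLB_ENTRY_BITS */
        TlbEntrySketch *table;   /* base of the per-mmu-index TLB */
    } TlbSketch;

    /* shr r0, PAGE_BITS - ENTRY_BITS; and r0, tlb_mask; add r0, tlb_table */
    static inline TlbEntrySketch *tlb_entry_sketch(const TlbSketch *tlb,
                                                   uint64_t addr,
                                                   unsigned page_bits,
                                                   unsigned entry_bits)
    {
        uintptr_t ofs = (addr >> (page_bits - entry_bits)) & tlb->mask;
        return (TlbEntrySketch *)((uintptr_t)tlb->table + ofs);
    }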
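The new ssadd/usadd/sssub/ussub cases cover MO_8 and MO_16 only (tcg_can_emit_vec_op returns true for vece <= MO_16), mapping onto the packed saturating add/subtract instructions whose opcodes are defined at the top of the patch. As a reminder of the per-lane semantics, a scalar sketch (not QEMU code) of signed and unsigned saturating byte addition:

    #include <stdint.h>
    #include <stdio.h>

    static uint8_t usadd8(uint8_t a, uint8_t b)          /* PADDUSB semantics */
    {
        unsigned s = (unsigned)a + b;
        return s > 0xff ? 0xff : (uint8_t)s;             /* clamp, don't wrap */
    }

    static int8_t ssadd8(int8_t a, int8_t b)             /* PADDSB semantics */
    {
        int s = a + b;
        return s > INT8_MAX ? INT8_MAX : s < INT8_MIN ? INT8_MIN : (int8_t)s;
    }

    int main(void)
    {
        printf("%d %d\n", usadd8(200, 100), ssadd8(100, 100));   /* 255 127 */
        return 0;
    }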
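expand_vec_shi handles MO_8 shifts, for which x86 has no byte-granular shift: each byte is duplicated into both halves of a 16-bit lane with punpckl/hbw, shifted in MO_16, and repacked with an unsigned-saturating pack once the high half is zero. A scalar sketch (not QEMU code) of why the imm + 8 and shift-up-then-down counts give the right per-byte result:

    #include <stdint.h>
    #include <stdio.h>

    static uint8_t shr8_via_w(uint8_t x, unsigned imm)       /* 0 < imm < 8 */
    {
        uint16_t w = (uint16_t)((uint16_t)x << 8 | x);  /* punpck*bw x,x: xx */
        return (uint8_t)(w >> (imm + 8));   /* high copy lands in the low byte,
                                               high byte is zero for packuswb */
    }

    static uint8_t shl8_via_w(uint8_t x, unsigned imm)       /* 0 < imm < 8 */
    {
        uint16_t w = (uint16_t)((uint16_t)x << 8 | x);
        w = (uint16_t)(w << (imm + 8));     /* shift up ... */
        return (uint8_t)(w >> 8);           /* ... and back down, clearing the
                                               high byte before the pack */
    }

    int main(void)
    {
        uint8_t x = 0xd3;
        printf("%02x %02x\n", shr8_via_w(x, 3), (uint8_t)(x >> 3));  /* 1a 1a */
        printf("%02x %02x\n", shl8_via_w(x, 3), (uint8_t)(x << 3));  /* 98 98 */
        return 0;
    }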
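expand_vec_sari has no direct instruction for a 64-bit arithmetic right shift. For counts of 32 or less the patch shifts the high 32-bit halves arithmetically and blends them over a 64-bit logical shift; for larger counts it materialises the sign as an all-ones/all-zero mask with a compare against zero and ORs it above the logical shift. A scalar sketch (not QEMU code) of the second identity:

    #include <stdint.h>
    #include <inttypes.h>
    #include <stdio.h>

    static int64_t sar64_by_mask(int64_t x, unsigned imm)    /* 32 < imm < 64 */
    {
        uint64_t sign = (0 > x) ? ~UINT64_C(0) : 0;   /* cmpgt(zero, x) per lane */
        uint64_t lo = (uint64_t)x >> imm;             /* logical shift right */
        return (int64_t)(lo | (sign << (64 - imm)));  /* merge the sign bits */
    }

    int main(void)
    {
        int64_t x = (int64_t)UINT64_C(0xedcba98765432100);    /* negative value */
        unsigned imm = 40;
        /* the reference relies on the usual arithmetic >> on signed types */
        printf("%016" PRIx64 "\n%016" PRIx64 "\n",
               (uint64_t)sar64_by_mask(x, imm), (uint64_t)(x >> imm));
        return 0;
    }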
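expand_vec_mul widens the MO_8 multiply to MO_16: one operand is unpacked so each byte sits in the low half of a word, the other so it sits in the high half; the 16-bit product then carries x*y (mod 256) in its high byte, which a logical right shift by 8 exposes and an unsigned-saturating pack collects. A scalar sketch (not QEMU code):

    #include <stdint.h>
    #include <stdio.h>

    static uint8_t mul8_via_w(uint8_t x, uint8_t y)
    {
        uint16_t wx = x;                      /* punpcklbw v1, zeros: word = x */
        uint16_t wy = (uint16_t)(y << 8);     /* punpcklbw zeros, v2: word = y << 8 */
        uint16_t p = (uint16_t)(wx * wy);     /* pmullw: (x * y) << 8, mod 2^16 */
        return (uint8_t)(p >> 8);             /* psrlw 8, then packuswb keeps it */
    }

    int main(void)
    {
        printf("%d %d\n", mul8_via_w(23, 45), (uint8_t)(23 * 45));   /* 11 11 */
        return 0;
    }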
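expand_vec_cmp only has packed signed greater-than and equality to work with, so the fixups table rewrites every other condition by swapping operands and/or inverting the result, and the NEED_BIAS path turns unsigned comparisons into signed ones by subtracting the per-lane sign bit (the 1ull << ((8 << vece) - 1) splat) from both operands. A scalar sketch (not QEMU code) of the bias trick for 8-bit lanes:

    #include <stdint.h>
    #include <stdio.h>

    static int gtu8_via_signed(uint8_t a, uint8_t b)
    {
        int8_t sa = (int8_t)(uint8_t)(a - 0x80);   /* psubb against a 0x80 splat */
        int8_t sb = (int8_t)(uint8_t)(b - 0x80);
        return sa > sb;                            /* pcmpgtb */
    }

    int main(void)
    {
        printf("%d %d\n", gtu8_via_signed(0xf0, 0x10), 0xf0 > 0x10);   /* 1 1 */
        printf("%d %d\n", gtu8_via_signed(0x10, 0xf0), 0x10 > 0xf0);   /* 0 0 */
        return 0;
    }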
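Finally, the smin/smax/umin/umax cases are native only up to MO_32 (hence the vece <= MO_32 ? 1 : -1 answer in tcg_can_emit_vec_op); the MO_64 forms are expanded in expand_vec_minmax as a compare followed by a variable blend, swapping the operands to pick either the smaller or the larger element. A scalar sketch (not QEMU code) of that compare-and-select:

    #include <stdint.h>
    #include <stdio.h>

    static int64_t smax64_cmp_blend(int64_t a, int64_t b)
    {
        uint64_t m = (a > b) ? ~UINT64_C(0) : 0;             /* pcmpgtq a, b */
        return (int64_t)((m & (uint64_t)a) | (~m & (uint64_t)b));   /* blend */
    }

    static int64_t smin64_cmp_blend(int64_t a, int64_t b)
    {
        uint64_t m = (a > b) ? ~UINT64_C(0) : 0;             /* same compare, ... */
        return (int64_t)((m & (uint64_t)b) | (~m & (uint64_t)a));   /* ... swapped */
    }

    int main(void)
    {
        printf("%lld %lld\n",
               (long long)smax64_cmp_blend(-5, 3),
               (long long)smin64_cmp_blend(-5, 3));           /* 3 -5 */
        return 0;
    }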