author     Richard Henderson <richard.henderson@linaro.org>    2024-03-12 14:28:27 -1000
committer  Richard Henderson <richard.henderson@linaro.org>    2024-05-06 12:55:50 -0700
commit     2623ca6ac11dd1c15ec1c2e87aa2e7f22f0adec8 (patch)
tree       571f3695676d2c636d786f43731584baacac0851
parent     b3ee719e6499987a635332d012f08dc80cd277e0 (diff)
tcg/i386: Simplify immediate 8-bit logical vector shifts
The x86 ISA does not have this operation, so we need an expansion.
Use the same algorithm that we use for expanding this vector
operation with integers: perform the shift with a wider type
and then mask the bits that must be zero.
This reduces the instruction count from 5 to 2.
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
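
The wider-shift-plus-mask trick described above can be sketched in plain C. The snippet below is purely illustrative and not part of the patch: it uses a 64-bit word as a stand-in for a vector of bytes, and the helper names are made up.

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Reference: shift each byte of a 64-bit "vector" independently. */
static uint64_t shr_bytes_ref(uint64_t v, unsigned imm)
{
    uint64_t r = 0;
    for (int i = 0; i < 8; i++) {
        uint8_t b = v >> (i * 8);
        r |= (uint64_t)(uint8_t)(b >> imm) << (i * 8);
    }
    return r;
}

/*
 * Wider-shift-plus-mask: shift the whole word, then clear the bits that
 * leaked across byte boundaries.  The per-byte mask is 0xff >> imm
 * (for a left shift it would be the byte truncation of 0xff << imm),
 * replicated into every lane.  Illustrative only; not QEMU code.
 */
static uint64_t shr_bytes_masked(uint64_t v, unsigned imm)
{
    uint64_t mask = (uint64_t)(0xff >> imm) * 0x0101010101010101ull;
    return (v >> imm) & mask;
}

int main(void)
{
    uint64_t v = 0x8040201008040201ull;
    for (unsigned imm = 0; imm < 8; imm++) {
        assert(shr_bytes_ref(v, imm) == shr_bytes_masked(v, imm));
    }
    printf("wider shift + mask matches per-byte shifts\n");
    return 0;
}
```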
-rw-r--r--   tcg/i386/tcg-target.c.inc | 59
1 file changed, 13 insertions(+), 46 deletions(-)
```diff
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index c6ba498623..6837c519b0 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -3769,49 +3769,20 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
     }
 }
 
-static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
+static void expand_vec_shi(TCGType type, unsigned vece, bool right,
                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
 {
-    TCGv_vec t1, t2;
+    uint8_t mask;
 
     tcg_debug_assert(vece == MO_8);
-
-    t1 = tcg_temp_new_vec(type);
-    t2 = tcg_temp_new_vec(type);
-
-    /*
-     * Unpack to W, shift, and repack.  Tricky bits:
-     * (1) Use punpck*bw x,x to produce DDCCBBAA,
-     *     i.e. duplicate in other half of the 16-bit lane.
-     * (2) For right-shift, add 8 so that the high half of the lane
-     *     becomes zero.  For left-shift, and left-rotate, we must
-     *     shift up and down again.
-     * (3) Step 2 leaves high half zero such that PACKUSWB
-     *     (pack with unsigned saturation) does not modify
-     *     the quantity.
-     */
-    vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
-              tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
-    vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
-              tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
-
-    if (opc != INDEX_op_rotli_vec) {
-        imm += 8;
-    }
-    if (opc == INDEX_op_shri_vec) {
-        tcg_gen_shri_vec(MO_16, t1, t1, imm);
-        tcg_gen_shri_vec(MO_16, t2, t2, imm);
+    if (right) {
+        mask = 0xff >> imm;
+        tcg_gen_shri_vec(MO_16, v0, v1, imm);
     } else {
-        tcg_gen_shli_vec(MO_16, t1, t1, imm);
-        tcg_gen_shli_vec(MO_16, t2, t2, imm);
-        tcg_gen_shri_vec(MO_16, t1, t1, 8);
-        tcg_gen_shri_vec(MO_16, t2, t2, 8);
+        mask = 0xff << imm;
+        tcg_gen_shli_vec(MO_16, v0, v1, imm);
     }
-
-    vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
-              tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
-    tcg_temp_free_vec(t1);
-    tcg_temp_free_vec(t2);
+    tcg_gen_and_vec(MO_8, v0, v0, tcg_constant_vec(type, MO_8, mask));
 }
 
 static void expand_vec_sari(TCGType type, unsigned vece,
@@ -3821,7 +3792,7 @@ static void expand_vec_sari(TCGType type, unsigned vece,
 
     switch (vece) {
     case MO_8:
-        /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
+        /* Unpack to 16-bit, shift, and repack.  */
         t1 = tcg_temp_new_vec(type);
         t2 = tcg_temp_new_vec(type);
         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
@@ -3874,12 +3845,7 @@ static void expand_vec_rotli(TCGType type, unsigned vece,
 {
     TCGv_vec t;
 
-    if (vece == MO_8) {
-        expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
-        return;
-    }
-
-    if (have_avx512vbmi2) {
+    if (vece != MO_8 && have_avx512vbmi2) {
         vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
                   tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
         return;
@@ -4155,10 +4121,11 @@ void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
 
     switch (opc) {
     case INDEX_op_shli_vec:
+        expand_vec_shi(type, vece, false, v0, v1, a2);
+        break;
     case INDEX_op_shri_vec:
-        expand_vec_shi(type, vece, opc, v0, v1, a2);
+        expand_vec_shi(type, vece, true, v0, v1, a2);
         break;
-
     case INDEX_op_sari_vec:
         expand_vec_sari(type, vece, v0, v1, a2);
         break;
```