diff options
Diffstat (limited to 'tcg/tcg-op-gvec.c')
-rw-r--r-- | tcg/tcg-op-gvec.c | 276 |
1 files changed, 276 insertions, 0 deletions
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c index f8ae75b331..0253f77b62 100644 --- a/tcg/tcg-op-gvec.c +++ b/tcg/tcg-op-gvec.c @@ -535,6 +535,26 @@ static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, tcg_temp_free_i32(t0); } +static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, + int32_t c, bool load_dest, + void (*fni)(TCGv_i32, TCGv_i32, int32_t)) +{ + TCGv_i32 t0 = tcg_temp_new_i32(); + TCGv_i32 t1 = tcg_temp_new_i32(); + uint32_t i; + + for (i = 0; i < oprsz; i += 4) { + tcg_gen_ld_i32(t0, cpu_env, aofs + i); + if (load_dest) { + tcg_gen_ld_i32(t1, cpu_env, dofs + i); + } + fni(t1, t0, c); + tcg_gen_st_i32(t1, cpu_env, dofs + i); + } + tcg_temp_free_i32(t0); + tcg_temp_free_i32(t1); +} + /* Expand OPSZ bytes worth of three-operand operations using i32 elements. */ static void expand_3_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t oprsz, bool load_dest, @@ -598,6 +618,26 @@ static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz, tcg_temp_free_i64(t0); } +static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz, + int64_t c, bool load_dest, + void (*fni)(TCGv_i64, TCGv_i64, int64_t)) +{ + TCGv_i64 t0 = tcg_temp_new_i64(); + TCGv_i64 t1 = tcg_temp_new_i64(); + uint32_t i; + + for (i = 0; i < oprsz; i += 8) { + tcg_gen_ld_i64(t0, cpu_env, aofs + i); + if (load_dest) { + tcg_gen_ld_i64(t1, cpu_env, dofs + i); + } + fni(t1, t0, c); + tcg_gen_st_i64(t1, cpu_env, dofs + i); + } + tcg_temp_free_i64(t0); + tcg_temp_free_i64(t1); +} + /* Expand OPSZ bytes worth of three-operand operations using i64 elements. */ static void expand_3_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t oprsz, bool load_dest, @@ -662,6 +702,29 @@ static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs, tcg_temp_free_vec(t0); } +/* Expand OPSZ bytes worth of two-vector operands and an immediate operand + using host vectors. */ +static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t tysz, TCGType type, + int64_t c, bool load_dest, + void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t)) +{ + TCGv_vec t0 = tcg_temp_new_vec(type); + TCGv_vec t1 = tcg_temp_new_vec(type); + uint32_t i; + + for (i = 0; i < oprsz; i += tysz) { + tcg_gen_ld_vec(t0, cpu_env, aofs + i); + if (load_dest) { + tcg_gen_ld_vec(t1, cpu_env, dofs + i); + } + fni(vece, t1, t0, c); + tcg_gen_st_vec(t1, cpu_env, dofs + i); + } + tcg_temp_free_vec(t0); + tcg_temp_free_vec(t1); +} + /* Expand OPSZ bytes worth of three-operand operations using host vectors. */ static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t oprsz, @@ -765,6 +828,55 @@ void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs, } } +void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz, + uint32_t maxsz, int64_t c, const GVecGen2i *g) +{ + check_size_align(oprsz, maxsz, dofs | aofs); + check_overlap_2(dofs, aofs, maxsz); + + /* Recall that ARM SVE allows vector sizes that are not a power of 2. + Expand with successively smaller host vector sizes. The intent is + that e.g. oprsz == 80 would be expanded with 2x32 + 1x16. */ + + if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32) + && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) { + uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32); + expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, + c, g->load_dest, g->fniv); + if (some == oprsz) { + goto done; + } + dofs += some; + aofs += some; + oprsz -= some; + maxsz -= some; + } + + if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16) + && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) { + expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, + c, g->load_dest, g->fniv); + } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64 + && g->fniv && check_size_impl(oprsz, 8) + && (!g->opc + || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) { + expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, + c, g->load_dest, g->fniv); + } else if (g->fni8 && check_size_impl(oprsz, 8)) { + expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8); + } else if (g->fni4 && check_size_impl(oprsz, 4)) { + expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4); + } else { + tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno); + return; + } + + done: + if (oprsz < maxsz) { + expand_clr(dofs + oprsz, maxsz - oprsz); + } +} + /* Expand a vector three-operand operation. */ void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g) @@ -1307,3 +1419,167 @@ void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs, }; tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); } + +void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) +{ + uint64_t mask = dup_const(MO_8, 0xff << c); + tcg_gen_shli_i64(d, a, c); + tcg_gen_andi_i64(d, d, mask); +} + +void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) +{ + uint64_t mask = dup_const(MO_16, 0xffff << c); + tcg_gen_shli_i64(d, a, c); + tcg_gen_andi_i64(d, d, mask); +} + +void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen2i g[4] = { + { .fni8 = tcg_gen_vec_shl8i_i64, + .fniv = tcg_gen_shli_vec, + .fno = gen_helper_gvec_shl8i, + .opc = INDEX_op_shli_vec, + .vece = MO_8 }, + { .fni8 = tcg_gen_vec_shl16i_i64, + .fniv = tcg_gen_shli_vec, + .fno = gen_helper_gvec_shl16i, + .opc = INDEX_op_shli_vec, + .vece = MO_16 }, + { .fni4 = tcg_gen_shli_i32, + .fniv = tcg_gen_shli_vec, + .fno = gen_helper_gvec_shl32i, + .opc = INDEX_op_shli_vec, + .vece = MO_32 }, + { .fni8 = tcg_gen_shli_i64, + .fniv = tcg_gen_shli_vec, + .fno = gen_helper_gvec_shl64i, + .opc = INDEX_op_shli_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + + tcg_debug_assert(vece <= MO_64); + tcg_debug_assert(shift >= 0 && shift < (8 << vece)); + if (shift == 0) { + tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); + } else { + tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); + } +} + +void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) +{ + uint64_t mask = dup_const(MO_8, 0xff >> c); + tcg_gen_shri_i64(d, a, c); + tcg_gen_andi_i64(d, d, mask); +} + +void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) +{ + uint64_t mask = dup_const(MO_16, 0xffff >> c); + tcg_gen_shri_i64(d, a, c); + tcg_gen_andi_i64(d, d, mask); +} + +void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen2i g[4] = { + { .fni8 = tcg_gen_vec_shr8i_i64, + .fniv = tcg_gen_shri_vec, + .fno = gen_helper_gvec_shr8i, + .opc = INDEX_op_shri_vec, + .vece = MO_8 }, + { .fni8 = tcg_gen_vec_shr16i_i64, + .fniv = tcg_gen_shri_vec, + .fno = gen_helper_gvec_shr16i, + .opc = INDEX_op_shri_vec, + .vece = MO_16 }, + { .fni4 = tcg_gen_shri_i32, + .fniv = tcg_gen_shri_vec, + .fno = gen_helper_gvec_shr32i, + .opc = INDEX_op_shri_vec, + .vece = MO_32 }, + { .fni8 = tcg_gen_shri_i64, + .fniv = tcg_gen_shri_vec, + .fno = gen_helper_gvec_shr64i, + .opc = INDEX_op_shri_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + + tcg_debug_assert(vece <= MO_64); + tcg_debug_assert(shift >= 0 && shift < (8 << vece)); + if (shift == 0) { + tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); + } else { + tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); + } +} + +void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) +{ + uint64_t s_mask = dup_const(MO_8, 0x80 >> c); + uint64_t c_mask = dup_const(MO_8, 0xff >> c); + TCGv_i64 s = tcg_temp_new_i64(); + + tcg_gen_shri_i64(d, a, c); + tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ + tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ + tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ + tcg_gen_or_i64(d, d, s); /* include sign extension */ + tcg_temp_free_i64(s); +} + +void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) +{ + uint64_t s_mask = dup_const(MO_16, 0x8000 >> c); + uint64_t c_mask = dup_const(MO_16, 0xffff >> c); + TCGv_i64 s = tcg_temp_new_i64(); + + tcg_gen_shri_i64(d, a, c); + tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ + tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ + tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ + tcg_gen_or_i64(d, d, s); /* include sign extension */ + tcg_temp_free_i64(s); +} + +void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen2i g[4] = { + { .fni8 = tcg_gen_vec_sar8i_i64, + .fniv = tcg_gen_sari_vec, + .fno = gen_helper_gvec_sar8i, + .opc = INDEX_op_sari_vec, + .vece = MO_8 }, + { .fni8 = tcg_gen_vec_sar16i_i64, + .fniv = tcg_gen_sari_vec, + .fno = gen_helper_gvec_sar16i, + .opc = INDEX_op_sari_vec, + .vece = MO_16 }, + { .fni4 = tcg_gen_sari_i32, + .fniv = tcg_gen_sari_vec, + .fno = gen_helper_gvec_sar32i, + .opc = INDEX_op_sari_vec, + .vece = MO_32 }, + { .fni8 = tcg_gen_sari_i64, + .fniv = tcg_gen_sari_vec, + .fno = gen_helper_gvec_sar64i, + .opc = INDEX_op_sari_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + + tcg_debug_assert(vece <= MO_64); + tcg_debug_assert(shift >= 0 && shift < (8 << vece)); + if (shift == 0) { + tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); + } else { + tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); + } +} |