author    Peter Maydell <peter.maydell@linaro.org>    2018-02-08 17:41:15 +0000
committer Peter Maydell <peter.maydell@linaro.org>    2018-02-08 17:41:15 +0000
commit    04bb7fe2bf55bdf66d5b7a5a719b40bbb4048178
tree      d6352968c57e2255ef66f0ba4a696ba82cc9f3ca
parent    008a51bbb343972dd8cf09126da8c3b87f4e1c96
parent    14e4c1e2355473ccb2939afc69ac8f25de103b92
Merge remote-tracking branch 'remotes/rth/tags/pull-tcg-20180208' into staging
tcg generic vectors
# gpg: Signature made Thu 08 Feb 2018 16:47:16 GMT
# gpg: using RSA key 64DF38E8AF7E215F
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>"
# Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A 05C0 64DF 38E8 AF7E 215F
* remotes/rth/tags/pull-tcg-20180208:
tcg/aarch64: Add vector operations
tcg/i386: Add vector operations
target/arm: Use vector infrastructure for aa64 orr/bic immediate
target/arm: Use vector infrastructure for aa64 multiplies
target/arm: Use vector infrastructure for aa64 compares
target/arm: Use vector infrastructure for aa64 constant shifts
target/arm: Use vector infrastructure for aa64 dup/movi
target/arm: Use vector infrastructure for aa64 mov/not/neg
target/arm: Use vector infrastructure for aa64 add/sub/logic
target/arm: Align vector registers
tcg/optimize: Handle vector opcodes during optimize
tcg: Add generic vector helpers with a scalar operand
tcg: Add generic helpers for saturating arithmetic
tcg: Add generic vector ops for multiplication
tcg: Add generic vector ops for comparisons
tcg: Add generic vector ops for constant shifts
tcg: Add generic vector expanders
tcg: Standardize integral arguments to expanders
tcg: Add types and basic operations for host vectors
tcg: Allow multiple word entries into the constant pool
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
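The series adds out-of-line "gvec" runtime helpers that operate on the guest register file through byte offsets plus a 32-bit descriptor carrying the operation size (oprsz) and the maximum size whose tail must be cleared (maxsz). A minimal standalone model of that helper pattern follows; the descriptor layout and the toy_* names are illustrative assumptions only, since the real encoding lives in tcg/tcg-gvec-desc.h, which this excerpt does not show.

/* Minimal standalone model of the gvec runtime-helper pattern introduced
 * in accel/tcg/tcg-runtime-gvec.c below.  The descriptor layout here is an
 * assumption for illustration: low 16 bits = oprsz, high 16 bits = maxsz.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static inline uint32_t toy_oprsz(uint32_t desc) { return desc & 0xffff; }
static inline uint32_t toy_maxsz(uint32_t desc) { return desc >> 16; }

/* Zero the tail of the destination, mirroring clear_high() in the patch. */
static void toy_clear_high(void *d, intptr_t oprsz, uint32_t desc)
{
    intptr_t maxsz = toy_maxsz(desc);
    for (intptr_t i = oprsz; i < maxsz; i += sizeof(uint64_t)) {
        *(uint64_t *)((char *)d + i) = 0;
    }
}

/* Element-wise 32-bit add over a flat byte region, as gvec_add32 does. */
static void toy_gvec_add32(void *d, void *a, void *b, uint32_t desc)
{
    intptr_t oprsz = toy_oprsz(desc);
    for (intptr_t i = 0; i < oprsz; i += sizeof(uint32_t)) {
        *(uint32_t *)((char *)d + i) =
            *(uint32_t *)((char *)a + i) + *(uint32_t *)((char *)b + i);
    }
    toy_clear_high(d, oprsz, desc);
}

int main(void)
{
    uint32_t a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    uint32_t b[8] = {10, 20, 30, 40, 0, 0, 0, 0};
    uint32_t d[8];

    memset(d, 0xff, sizeof(d));
    /* oprsz = 16 bytes (one 128-bit vector), maxsz = 32 bytes: tail is zeroed. */
    toy_gvec_add32(d, a, b, 16 | (32 << 16));
    for (int i = 0; i < 8; i++) {
        printf("%u ", d[i]);
    }
    printf("\n");   /* prints: 11 22 33 44 0 0 0 0 */
    return 0;
}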
-rw-r--r-- | Makefile.target | 4
-rw-r--r-- | accel/tcg/Makefile.objs | 2
-rw-r--r-- | accel/tcg/tcg-runtime-gvec.c | 997
-rw-r--r-- | accel/tcg/tcg-runtime.h | 118
-rwxr-xr-x | configure | 48
-rw-r--r-- | target/arm/cpu.h | 2
-rw-r--r-- | target/arm/translate-a64.c | 975
-rw-r--r-- | tcg/README | 86
-rw-r--r-- | tcg/aarch64/tcg-target.h | 25
-rw-r--r-- | tcg/aarch64/tcg-target.inc.c | 588
-rw-r--r-- | tcg/aarch64/tcg-target.opc.h | 3
-rw-r--r-- | tcg/i386/tcg-target.h | 41
-rw-r--r-- | tcg/i386/tcg-target.inc.c | 987
-rw-r--r-- | tcg/i386/tcg-target.opc.h | 13
-rw-r--r-- | tcg/optimize.c | 150
-rw-r--r-- | tcg/tcg-gvec-desc.h | 49
-rw-r--r-- | tcg/tcg-op-gvec.c | 2216
-rw-r--r-- | tcg/tcg-op-gvec.h | 306
-rw-r--r-- | tcg/tcg-op-vec.c | 389
-rw-r--r-- | tcg/tcg-op.c | 42
-rw-r--r-- | tcg/tcg-op.h | 52
-rw-r--r-- | tcg/tcg-opc.h | 46
-rw-r--r-- | tcg/tcg-pool.inc.c | 113
-rw-r--r-- | tcg/tcg.c | 125
-rw-r--r-- | tcg/tcg.h | 87
25 files changed, 6969 insertions(+), 495 deletions(-)
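One detail of the new runtime worth noting before the diff itself: the 8- and 16-bit saturating helpers compute in a wider int and clamp, while gvec_ssadd32/gvec_ssadd64 detect signed overflow with the sign-bit test ((di ^ ai) & ~(ai ^ bi)) < 0, which is true exactly when both operands share a sign and the wrapped result does not. A small standalone restatement of that check (not QEMU code, just the same logic):

/* Standalone restatement of the signed-saturation test used by
 * gvec_ssadd32/gvec_ssadd64 below: addition overflows only when both
 * operands have the same sign and the wrapped result's sign differs.
 */
#include <stdint.h>
#include <stdio.h>

static int32_t ssadd32(int32_t ai, int32_t bi)
{
    int32_t di = (int32_t)((uint32_t)ai + (uint32_t)bi);  /* wrapping add */
    if (((di ^ ai) & ~(ai ^ bi)) < 0) {
        /* Signed overflow: a negative wrapped result means the true sum
         * was too large, so clamp to INT32_MAX, and vice versa. */
        di = di < 0 ? INT32_MAX : INT32_MIN;
    }
    return di;
}

int main(void)
{
    printf("%d\n", ssadd32(INT32_MAX, 1));    /* 2147483647, clamped */
    printf("%d\n", ssadd32(INT32_MIN, -1));   /* -2147483648, clamped */
    printf("%d\n", ssadd32(100, -250));       /* -150, no overflow */
    return 0;
}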
diff --git a/Makefile.target b/Makefile.target index f9a9da7e7c..6549481096 100644 --- a/Makefile.target +++ b/Makefile.target @@ -93,8 +93,8 @@ all: $(PROGS) stap # cpu emulator library obj-y += exec.o obj-y += accel/ -obj-$(CONFIG_TCG) += tcg/tcg.o tcg/tcg-op.o tcg/optimize.o -obj-$(CONFIG_TCG) += tcg/tcg-common.o +obj-$(CONFIG_TCG) += tcg/tcg.o tcg/tcg-op.o tcg/tcg-op-vec.o tcg/tcg-op-gvec.o +obj-$(CONFIG_TCG) += tcg/tcg-common.o tcg/optimize.o obj-$(CONFIG_TCG_INTERPRETER) += tcg/tci.o obj-$(CONFIG_TCG_INTERPRETER) += disas/tci.o obj-y += fpu/softfloat.o diff --git a/accel/tcg/Makefile.objs b/accel/tcg/Makefile.objs index 228cd84fa4..d381a02f34 100644 --- a/accel/tcg/Makefile.objs +++ b/accel/tcg/Makefile.objs @@ -1,6 +1,6 @@ obj-$(CONFIG_SOFTMMU) += tcg-all.o obj-$(CONFIG_SOFTMMU) += cputlb.o -obj-y += tcg-runtime.o +obj-y += tcg-runtime.o tcg-runtime-gvec.o obj-y += cpu-exec.o cpu-exec-common.o translate-all.o obj-y += translator.o diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c new file mode 100644 index 0000000000..8bf8d63912 --- /dev/null +++ b/accel/tcg/tcg-runtime-gvec.c @@ -0,0 +1,997 @@ +/* + * Generic vectorized operation runtime + * + * Copyright (c) 2018 Linaro + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "qemu/host-utils.h" +#include "cpu.h" +#include "exec/helper-proto.h" +#include "tcg-gvec-desc.h" + + +/* Virtually all hosts support 16-byte vectors. Those that don't can emulate + * them via GCC's generic vector extension. This turns out to be simpler and + * more reliable than getting the compiler to autovectorize. + * + * In tcg-op-gvec.c, we asserted that both the size and alignment of the data + * are multiples of 16. + * + * When the compiler does not support all of the operations we require, the + * loops are written so that we can always fall back on the base types. 
+ */ +#ifdef CONFIG_VECTOR16 +typedef uint8_t vec8 __attribute__((vector_size(16))); +typedef uint16_t vec16 __attribute__((vector_size(16))); +typedef uint32_t vec32 __attribute__((vector_size(16))); +typedef uint64_t vec64 __attribute__((vector_size(16))); + +typedef int8_t svec8 __attribute__((vector_size(16))); +typedef int16_t svec16 __attribute__((vector_size(16))); +typedef int32_t svec32 __attribute__((vector_size(16))); +typedef int64_t svec64 __attribute__((vector_size(16))); + +#define DUP16(X) { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X } +#define DUP8(X) { X, X, X, X, X, X, X, X } +#define DUP4(X) { X, X, X, X } +#define DUP2(X) { X, X } +#else +typedef uint8_t vec8; +typedef uint16_t vec16; +typedef uint32_t vec32; +typedef uint64_t vec64; + +typedef int8_t svec8; +typedef int16_t svec16; +typedef int32_t svec32; +typedef int64_t svec64; + +#define DUP16(X) X +#define DUP8(X) X +#define DUP4(X) X +#define DUP2(X) X +#endif /* CONFIG_VECTOR16 */ + +static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc) +{ + intptr_t maxsz = simd_maxsz(desc); + intptr_t i; + + if (unlikely(maxsz > oprsz)) { + for (i = oprsz; i < maxsz; i += sizeof(uint64_t)) { + *(uint64_t *)(d + i) = 0; + } + } +} + +void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec8)) { + *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec16)) { + *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec32)) { + *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_adds8)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec8 vecb = (vec8)DUP16(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec8)) { + *(vec8 *)(d + i) = *(vec8 *)(a + i) + vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_adds16)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec16 vecb = (vec16)DUP8(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec16)) { + *(vec16 *)(d + i) = *(vec16 *)(a + i) + vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_adds32)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec32 vecb = (vec32)DUP4(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec32)) { + *(vec32 *)(d + i) = *(vec32 *)(a + i) + vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_adds64)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec64 vecb = (vec64)DUP2(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) + vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t 
oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec8)) { + *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec16)) { + *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec32)) { + *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_subs8)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec8 vecb = (vec8)DUP16(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec8)) { + *(vec8 *)(d + i) = *(vec8 *)(a + i) - vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_subs16)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec16 vecb = (vec16)DUP8(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec16)) { + *(vec16 *)(d + i) = *(vec16 *)(a + i) - vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_subs32)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec32 vecb = (vec32)DUP4(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec32)) { + *(vec32 *)(d + i) = *(vec32 *)(a + i) - vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_subs64)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec64 vecb = (vec64)DUP2(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) - vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_mul8)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec8)) { + *(vec8 *)(d + i) = *(vec8 *)(a + i) * *(vec8 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_mul16)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec16)) { + *(vec16 *)(d + i) = *(vec16 *)(a + i) * *(vec16 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_mul32)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec32)) { + *(vec32 *)(d + i) = *(vec32 *)(a + i) * *(vec32 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_mul64)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) * *(vec64 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_muls8)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec8 vecb = (vec8)DUP16(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec8)) { + *(vec8 *)(d + i) = *(vec8 *)(a + i) * vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_muls16)(void *d, 
void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec16 vecb = (vec16)DUP8(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec16)) { + *(vec16 *)(d + i) = *(vec16 *)(a + i) * vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_muls32)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec32 vecb = (vec32)DUP4(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec32)) { + *(vec32 *)(d + i) = *(vec32 *)(a + i) * vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_muls64)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec64 vecb = (vec64)DUP2(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) * vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec8)) { + *(vec8 *)(d + i) = -*(vec8 *)(a + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec16)) { + *(vec16 *)(d + i) = -*(vec16 *)(a + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec32)) { + *(vec32 *)(d + i) = -*(vec32 *)(a + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = -*(vec64 *)(a + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_mov)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + + memcpy(d, a, oprsz); + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_dup64)(void *d, uint32_t desc, uint64_t c) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + if (c == 0) { + oprsz = 0; + } else { + for (i = 0; i < oprsz; i += sizeof(uint64_t)) { + *(uint64_t *)(d + i) = c; + } + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_dup32)(void *d, uint32_t desc, uint32_t c) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + if (c == 0) { + oprsz = 0; + } else { + for (i = 0; i < oprsz; i += sizeof(uint32_t)) { + *(uint32_t *)(d + i) = c; + } + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_dup16)(void *d, uint32_t desc, uint32_t c) +{ + HELPER(gvec_dup32)(d, desc, 0x00010001 * (c & 0xffff)); +} + +void HELPER(gvec_dup8)(void *d, uint32_t desc, uint32_t c) +{ + HELPER(gvec_dup32)(d, desc, 0x01010101 * (c & 0xff)); +} + +void HELPER(gvec_not)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = ~*(vec64 *)(a + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void 
HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec64 vecb = (vec64)DUP2(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) & vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec64 vecb = (vec64)DUP2(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_ors)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec64 vecb = (vec64)DUP2(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) | vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_shl8i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec8)) { + *(vec8 *)(d + i) = *(vec8 *)(a + i) << shift; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_shl16i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec16)) { + *(vec16 *)(d + i) = *(vec16 *)(a + i) << shift; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_shl32i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec32)) { + *(vec32 *)(d + i) = *(vec32 *)(a + i) << shift; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_shl64i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) << shift; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_shr8i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec8)) { + *(vec8 *)(d + i) = *(vec8 *)(a + i) >> shift; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_shr16i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec16)) { + *(vec16 *)(d + i) = *(vec16 *)(a + i) >> shift; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_shr32i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec32)) { + 
*(vec32 *)(d + i) = *(vec32 *)(a + i) >> shift; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_shr64i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) >> shift; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sar8i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec8)) { + *(svec8 *)(d + i) = *(svec8 *)(a + i) >> shift; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sar16i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec16)) { + *(svec16 *)(d + i) = *(svec16 *)(a + i) >> shift; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sar32i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec32)) { + *(svec32 *)(d + i) = *(svec32 *)(a + i) >> shift; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sar64i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(svec64 *)(d + i) = *(svec64 *)(a + i) >> shift; + } + clear_high(d, oprsz, desc); +} + +/* If vectors are enabled, the compiler fills in -1 for true. + Otherwise, we must take care of this by hand. */ +#ifdef CONFIG_VECTOR16 +# define DO_CMP0(X) X +#else +# define DO_CMP0(X) -(X) +#endif + +#define DO_CMP1(NAME, TYPE, OP) \ +void HELPER(NAME)(void *d, void *a, void *b, uint32_t desc) \ +{ \ + intptr_t oprsz = simd_oprsz(desc); \ + intptr_t i; \ + for (i = 0; i < oprsz; i += sizeof(vec64)) { \ + *(TYPE *)(d + i) = DO_CMP0(*(TYPE *)(a + i) OP *(TYPE *)(b + i)); \ + } \ + clear_high(d, oprsz, desc); \ +} + +#define DO_CMP2(SZ) \ + DO_CMP1(gvec_eq##SZ, vec##SZ, ==) \ + DO_CMP1(gvec_ne##SZ, vec##SZ, !=) \ + DO_CMP1(gvec_lt##SZ, svec##SZ, <) \ + DO_CMP1(gvec_le##SZ, svec##SZ, <=) \ + DO_CMP1(gvec_ltu##SZ, vec##SZ, <) \ + DO_CMP1(gvec_leu##SZ, vec##SZ, <=) + +DO_CMP2(8) +DO_CMP2(16) +DO_CMP2(32) +DO_CMP2(64) + +#undef DO_CMP0 +#undef DO_CMP1 +#undef DO_CMP2 + +void HELPER(gvec_ssadd8)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(int8_t)) { + int r = *(int8_t *)(a + i) + *(int8_t *)(b + i); + if (r > INT8_MAX) { + r = INT8_MAX; + } else if (r < INT8_MIN) { + r = INT8_MIN; + } + *(int8_t *)(d + i) = r; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_ssadd16)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(int16_t)) { + int r = *(int16_t *)(a + i) + *(int16_t *)(b + i); + if (r > INT16_MAX) { + r = INT16_MAX; + } else if (r < INT16_MIN) { + r = INT16_MIN; + } + *(int16_t *)(d + i) = r; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_ssadd32)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(int32_t)) { + int32_t ai = *(int32_t *)(a + i); + int32_t bi = *(int32_t *)(b + i); + int32_t di = ai + bi; + if (((di ^ ai) &~ (ai ^ bi)) < 0) { + /* Signed overflow. */ + di = (di < 0 ? 
INT32_MAX : INT32_MIN); + } + *(int32_t *)(d + i) = di; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_ssadd64)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(int64_t)) { + int64_t ai = *(int64_t *)(a + i); + int64_t bi = *(int64_t *)(b + i); + int64_t di = ai + bi; + if (((di ^ ai) &~ (ai ^ bi)) < 0) { + /* Signed overflow. */ + di = (di < 0 ? INT64_MAX : INT64_MIN); + } + *(int64_t *)(d + i) = di; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sssub8)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(uint8_t)) { + int r = *(int8_t *)(a + i) - *(int8_t *)(b + i); + if (r > INT8_MAX) { + r = INT8_MAX; + } else if (r < INT8_MIN) { + r = INT8_MIN; + } + *(uint8_t *)(d + i) = r; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sssub16)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(int16_t)) { + int r = *(int16_t *)(a + i) - *(int16_t *)(b + i); + if (r > INT16_MAX) { + r = INT16_MAX; + } else if (r < INT16_MIN) { + r = INT16_MIN; + } + *(int16_t *)(d + i) = r; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sssub32)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(int32_t)) { + int32_t ai = *(int32_t *)(a + i); + int32_t bi = *(int32_t *)(b + i); + int32_t di = ai - bi; + if (((di ^ ai) & (ai ^ bi)) < 0) { + /* Signed overflow. */ + di = (di < 0 ? INT32_MAX : INT32_MIN); + } + *(int32_t *)(d + i) = di; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sssub64)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(int64_t)) { + int64_t ai = *(int64_t *)(a + i); + int64_t bi = *(int64_t *)(b + i); + int64_t di = ai - bi; + if (((di ^ ai) & (ai ^ bi)) < 0) { + /* Signed overflow. */ + di = (di < 0 ? 
INT64_MAX : INT64_MIN); + } + *(int64_t *)(d + i) = di; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_usadd8)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(uint8_t)) { + unsigned r = *(uint8_t *)(a + i) + *(uint8_t *)(b + i); + if (r > UINT8_MAX) { + r = UINT8_MAX; + } + *(uint8_t *)(d + i) = r; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_usadd16)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(uint16_t)) { + unsigned r = *(uint16_t *)(a + i) + *(uint16_t *)(b + i); + if (r > UINT16_MAX) { + r = UINT16_MAX; + } + *(uint16_t *)(d + i) = r; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_usadd32)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(uint32_t)) { + uint32_t ai = *(uint32_t *)(a + i); + uint32_t bi = *(uint32_t *)(b + i); + uint32_t di = ai + bi; + if (di < ai) { + di = UINT32_MAX; + } + *(uint32_t *)(d + i) = di; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_usadd64)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(uint64_t)) { + uint64_t ai = *(uint64_t *)(a + i); + uint64_t bi = *(uint64_t *)(b + i); + uint64_t di = ai + bi; + if (di < ai) { + di = UINT64_MAX; + } + *(uint64_t *)(d + i) = di; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_ussub8)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(uint8_t)) { + int r = *(uint8_t *)(a + i) - *(uint8_t *)(b + i); + if (r < 0) { + r = 0; + } + *(uint8_t *)(d + i) = r; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_ussub16)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(uint16_t)) { + int r = *(uint16_t *)(a + i) - *(uint16_t *)(b + i); + if (r < 0) { + r = 0; + } + *(uint16_t *)(d + i) = r; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_ussub32)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(uint32_t)) { + uint32_t ai = *(uint32_t *)(a + i); + uint32_t bi = *(uint32_t *)(b + i); + uint32_t di = ai - bi; + if (ai < bi) { + di = 0; + } + *(uint32_t *)(d + i) = di; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_ussub64)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(uint64_t)) { + uint64_t ai = *(uint64_t *)(a + i); + uint64_t bi = *(uint64_t *)(b + i); + uint64_t di = ai - bi; + if (ai < bi) { + di = 0; + } + *(uint64_t *)(d + i) = di; + } + clear_high(d, oprsz, desc); +} diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h index 1df17d0ba9..2536959a18 100644 --- a/accel/tcg/tcg-runtime.h +++ b/accel/tcg/tcg-runtime.h @@ -134,3 +134,121 @@ GEN_ATOMIC_HELPERS(xor_fetch) GEN_ATOMIC_HELPERS(xchg) #undef GEN_ATOMIC_HELPERS + +DEF_HELPER_FLAGS_3(gvec_mov, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(gvec_dup8, TCG_CALL_NO_RWG, void, ptr, i32, i32) +DEF_HELPER_FLAGS_3(gvec_dup16, TCG_CALL_NO_RWG, void, ptr, i32, i32) +DEF_HELPER_FLAGS_3(gvec_dup32, TCG_CALL_NO_RWG, void, ptr, i32, i32) +DEF_HELPER_FLAGS_3(gvec_dup64, TCG_CALL_NO_RWG, void, ptr, i32, 
i64) + +DEF_HELPER_FLAGS_4(gvec_add8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_add16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_add32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_add64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_adds8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(gvec_adds16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(gvec_adds32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(gvec_adds64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) + +DEF_HELPER_FLAGS_4(gvec_sub8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_sub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_sub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_sub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_subs8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(gvec_subs16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(gvec_subs32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(gvec_subs64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) + +DEF_HELPER_FLAGS_4(gvec_mul8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_mul16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_mul32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_mul64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_muls8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(gvec_muls16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(gvec_muls32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(gvec_muls64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) + +DEF_HELPER_FLAGS_4(gvec_ssadd8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ssadd16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ssadd32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ssadd64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_sssub8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_sssub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_sssub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_sssub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_usadd8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_usadd16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_usadd32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_usadd64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_ussub8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ussub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ussub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ussub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(gvec_neg8, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_neg16, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_neg32, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_neg64, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(gvec_not, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_and, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_or, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) 
+DEF_HELPER_FLAGS_4(gvec_xor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_andc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_orc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_ands, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(gvec_xors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(gvec_ors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) + +DEF_HELPER_FLAGS_3(gvec_shl8i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_shl16i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_shl32i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_shl64i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(gvec_shr8i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_shr16i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_shr32i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_shr64i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(gvec_sar8i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_sar16i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_sar32i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_sar64i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_eq8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_eq16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_eq32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_eq64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_ne8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ne16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ne32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ne64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_lt8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_lt16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_lt32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_lt64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_le8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_le16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_le32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_le64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_ltu8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ltu16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ltu32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ltu64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_leu8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_leu16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_leu32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_leu64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) @@ -5001,6 +5001,50 @@ if compile_prog "" "" ; then fi ######################################## +# See if 16-byte vector operations are supported. +# Even without a vector unit the compiler may expand these. +# There is a bug in old GCC for PPC that crashes here. +# Unfortunately it's the system compiler for Centos 7. 
+ +cat > $TMPC << EOF +typedef unsigned char U1 __attribute__((vector_size(16))); +typedef unsigned short U2 __attribute__((vector_size(16))); +typedef unsigned int U4 __attribute__((vector_size(16))); +typedef unsigned long long U8 __attribute__((vector_size(16))); +typedef signed char S1 __attribute__((vector_size(16))); +typedef signed short S2 __attribute__((vector_size(16))); +typedef signed int S4 __attribute__((vector_size(16))); +typedef signed long long S8 __attribute__((vector_size(16))); +static U1 a1, b1; +static U2 a2, b2; +static U4 a4, b4; +static U8 a8, b8; +static S1 c1; +static S2 c2; +static S4 c4; +static S8 c8; +static int i; +int main(void) +{ + a1 += b1; a2 += b2; a4 += b4; a8 += b8; + a1 -= b1; a2 -= b2; a4 -= b4; a8 -= b8; + a1 *= b1; a2 *= b2; a4 *= b4; a8 *= b8; + a1 &= b1; a2 &= b2; a4 &= b4; a8 &= b8; + a1 |= b1; a2 |= b2; a4 |= b4; a8 |= b8; + a1 ^= b1; a2 ^= b2; a4 ^= b4; a8 ^= b8; + a1 <<= i; a2 <<= i; a4 <<= i; a8 <<= i; + a1 >>= i; a2 >>= i; a4 >>= i; a8 >>= i; + c1 >>= i; c2 >>= i; c4 >>= i; c8 >>= i; + return 0; +} +EOF + +vector16=no +if compile_prog "" "" ; then + vector16=yes +fi + +######################################## # check if getauxval is available. getauxval=no @@ -6329,6 +6373,10 @@ if test "$atomic64" = "yes" ; then echo "CONFIG_ATOMIC64=y" >> $config_host_mak fi +if test "$vector16" = "yes" ; then + echo "CONFIG_VECTOR16=y" >> $config_host_mak +fi + if test "$getauxval" = "yes" ; then echo "CONFIG_GETAUXVAL=y" >> $config_host_mak fi diff --git a/target/arm/cpu.h b/target/arm/cpu.h index d2bb59eded..8d41f783dc 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -492,7 +492,7 @@ typedef struct CPUARMState { * the two execution states, and means we do not need to explicitly * map these registers when changing states. */ - uint64_t regs[64]; + uint64_t regs[64] QEMU_ALIGNED(16); uint32_t xregs[16]; /* We store these fpcsr fields separately for convenience. */ diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c index eed64c73e5..0830c3f1c8 100644 --- a/target/arm/translate-a64.c +++ b/target/arm/translate-a64.c @@ -21,6 +21,7 @@ #include "cpu.h" #include "exec/exec-all.h" #include "tcg-op.h" +#include "tcg-op-gvec.h" #include "qemu/log.h" #include "arm_ldst.h" #include "translate.h" @@ -84,6 +85,13 @@ typedef void CryptoTwoOpFn(TCGv_ptr, TCGv_ptr); typedef void CryptoThreeOpIntFn(TCGv_ptr, TCGv_ptr, TCGv_i32); typedef void CryptoThreeOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr); +/* Note that the gvec expanders operate on offsets + sizes. */ +typedef void GVecGen2Fn(unsigned, uint32_t, uint32_t, uint32_t, uint32_t); +typedef void GVecGen2iFn(unsigned, uint32_t, uint32_t, int64_t, + uint32_t, uint32_t); +typedef void GVecGen3Fn(unsigned, uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t); + /* initialize TCG globals. */ void a64_translate_init(void) { @@ -548,6 +556,14 @@ static TCGv_ptr vec_full_reg_ptr(DisasContext *s, int regno) return ret; } +/* Return the byte size of the "whole" vector register, VL / 8. */ +static inline int vec_full_reg_size(DisasContext *s) +{ + /* FIXME SVE: We should put the composite ZCR_EL* value into tb->flags. + In the meantime this is just the AdvSIMD length of 128. */ + return 128 / 8; +} + /* Return the offset into CPUARMState of a slice (from * the least significant end) of FP register Qn (ie * Dn, Sn, Hn or Bn). @@ -618,6 +634,51 @@ static TCGv_ptr get_fpstatus_ptr(void) return statusptr; } +/* Expand a 2-operand AdvSIMD vector operation using an expander function. 
*/ +static void gen_gvec_fn2(DisasContext *s, bool is_q, int rd, int rn, + GVecGen2Fn *gvec_fn, int vece) +{ + gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), + is_q ? 16 : 8, vec_full_reg_size(s)); +} + +/* Expand a 2-operand + immediate AdvSIMD vector operation using + * an expander function. + */ +static void gen_gvec_fn2i(DisasContext *s, bool is_q, int rd, int rn, + int64_t imm, GVecGen2iFn *gvec_fn, int vece) +{ + gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), + imm, is_q ? 16 : 8, vec_full_reg_size(s)); +} + +/* Expand a 3-operand AdvSIMD vector operation using an expander function. */ +static void gen_gvec_fn3(DisasContext *s, bool is_q, int rd, int rn, int rm, + GVecGen3Fn *gvec_fn, int vece) +{ + gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), is_q ? 16 : 8, vec_full_reg_size(s)); +} + +/* Expand a 2-operand + immediate AdvSIMD vector operation using + * an op descriptor. + */ +static void gen_gvec_op2i(DisasContext *s, bool is_q, int rd, + int rn, int64_t imm, const GVecGen2i *gvec_op) +{ + tcg_gen_gvec_2i(vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), + is_q ? 16 : 8, vec_full_reg_size(s), imm, gvec_op); +} + +/* Expand a 3-operand AdvSIMD vector operation using an op descriptor. */ +static void gen_gvec_op3(DisasContext *s, bool is_q, int rd, + int rn, int rm, const GVecGen3 *gvec_op) +{ + tcg_gen_gvec_3(vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), is_q ? 16 : 8, + vec_full_reg_size(s), gvec_op); +} + /* Set ZF and NF based on a 64 bit result. This is alas fiddlier * than the 32 bit equivalent. */ @@ -4566,14 +4627,17 @@ static void handle_fp_1src_double(DisasContext *s, int opcode, int rd, int rn) TCGv_i64 tcg_op; TCGv_i64 tcg_res; + switch (opcode) { + case 0x0: /* FMOV */ + gen_gvec_fn2(s, false, rd, rn, tcg_gen_gvec_mov, 0); + return; + } + fpst = get_fpstatus_ptr(); tcg_op = read_fp_dreg(s, rn); tcg_res = tcg_temp_new_i64(); switch (opcode) { - case 0x0: /* FMOV */ - tcg_gen_mov_i64(tcg_res, tcg_op); - break; case 0x1: /* FABS */ gen_helper_vfp_absd(tcg_res, tcg_op); break; @@ -5848,10 +5912,7 @@ static void handle_simd_dupe(DisasContext *s, int is_q, int rd, int rn, int imm5) { int size = ctz32(imm5); - int esize = 8 << size; - int elements = (is_q ? 128 : 64) / esize; - int index, i; - TCGv_i64 tmp; + int index = imm5 >> (size + 1); if (size > 3 || (size == 3 && !is_q)) { unallocated_encoding(s); @@ -5862,20 +5923,9 @@ static void handle_simd_dupe(DisasContext *s, int is_q, int rd, int rn, return; } - index = imm5 >> (size + 1); - - tmp = tcg_temp_new_i64(); - read_vec_element(s, tmp, rn, index, size); - - for (i = 0; i < elements; i++) { - write_vec_element(s, tmp, rd, i, size); - } - - if (!is_q) { - clear_vec_high(s, rd); - } - - tcg_temp_free_i64(tmp); + tcg_gen_gvec_dup_mem(size, vec_full_reg_offset(s, rd), + vec_reg_offset(s, rn, index, size), + is_q ? 16 : 8, vec_full_reg_size(s)); } /* DUP (element, scalar) @@ -5924,9 +5974,7 @@ static void handle_simd_dupg(DisasContext *s, int is_q, int rd, int rn, int imm5) { int size = ctz32(imm5); - int esize = 8 << size; - int elements = (is_q ? 
128 : 64)/esize; - int i = 0; + uint32_t dofs, oprsz, maxsz; if (size > 3 || ((size == 3) && !is_q)) { unallocated_encoding(s); @@ -5937,12 +5985,11 @@ static void handle_simd_dupg(DisasContext *s, int is_q, int rd, int rn, return; } - for (i = 0; i < elements; i++) { - write_vec_element(s, cpu_reg(s, rn), rd, i, size); - } - if (!is_q) { - clear_vec_high(s, rd); - } + dofs = vec_full_reg_offset(s, rd); + oprsz = is_q ? 16 : 8; + maxsz = vec_full_reg_size(s); + + tcg_gen_gvec_dup_i64(size, dofs, oprsz, maxsz, cpu_reg(s, rn)); } /* INS (Element) @@ -6133,8 +6180,6 @@ static void disas_simd_mod_imm(DisasContext *s, uint32_t insn) bool is_neg = extract32(insn, 29, 1); bool is_q = extract32(insn, 30, 1); uint64_t imm = 0; - TCGv_i64 tcg_rd, tcg_imm; - int i; if (o2 != 0 || ((cmode == 0xf) && is_neg && !is_q)) { unallocated_encoding(s); @@ -6215,32 +6260,18 @@ static void disas_simd_mod_imm(DisasContext *s, uint32_t insn) imm = ~imm; } - tcg_imm = tcg_const_i64(imm); - tcg_rd = new_tmp_a64(s); - - for (i = 0; i < 2; i++) { - int foffs = i ? fp_reg_hi_offset(s, rd) : fp_reg_offset(s, rd, MO_64); - - if (i == 1 && !is_q) { - /* non-quad ops clear high half of vector */ - tcg_gen_movi_i64(tcg_rd, 0); - } else if ((cmode & 0x9) == 0x1 || (cmode & 0xd) == 0x9) { - tcg_gen_ld_i64(tcg_rd, cpu_env, foffs); - if (is_neg) { - /* AND (BIC) */ - tcg_gen_and_i64(tcg_rd, tcg_rd, tcg_imm); - } else { - /* ORR */ - tcg_gen_or_i64(tcg_rd, tcg_rd, tcg_imm); - } + if (!((cmode & 0x9) == 0x1 || (cmode & 0xd) == 0x9)) { + /* MOVI or MVNI, with MVNI negation handled above. */ + tcg_gen_gvec_dup64i(vec_full_reg_offset(s, rd), is_q ? 16 : 8, + vec_full_reg_size(s), imm); + } else { + /* ORR or BIC, with BIC negation to AND handled above. */ + if (is_neg) { + gen_gvec_fn2i(s, is_q, rd, rd, imm, tcg_gen_gvec_andi, MO_64); } else { - /* MOVI */ - tcg_gen_mov_i64(tcg_rd, tcg_imm); + gen_gvec_fn2i(s, is_q, rd, rd, imm, tcg_gen_gvec_ori, MO_64); } - tcg_gen_st_i64(tcg_rd, cpu_env, foffs); } - - tcg_temp_free_i64(tcg_imm); } /* AdvSIMD scalar copy @@ -6485,32 +6516,6 @@ static void handle_shri_with_rndacc(TCGv_i64 tcg_res, TCGv_i64 tcg_src, } } -/* Common SHL/SLI - Shift left with an optional insert */ -static void handle_shli_with_ins(TCGv_i64 tcg_res, TCGv_i64 tcg_src, - bool insert, int shift) -{ - if (insert) { /* SLI */ - tcg_gen_deposit_i64(tcg_res, tcg_res, tcg_src, shift, 64 - shift); - } else { /* SHL */ - tcg_gen_shli_i64(tcg_res, tcg_src, shift); - } -} - -/* SRI: shift right with insert */ -static void handle_shri_with_ins(TCGv_i64 tcg_res, TCGv_i64 tcg_src, - int size, int shift) -{ - int esize = 8 << size; - - /* shift count same as element size is valid but does nothing; - * special case to avoid potential shift by 64. - */ - if (shift != esize) { - tcg_gen_shri_i64(tcg_src, tcg_src, shift); - tcg_gen_deposit_i64(tcg_res, tcg_res, tcg_src, 0, esize - shift); - } -} - /* SSHR[RA]/USHR[RA] - Scalar shift right (optional rounding/accumulate) */ static void handle_scalar_simd_shri(DisasContext *s, bool is_u, int immh, int immb, @@ -6561,7 +6566,14 @@ static void handle_scalar_simd_shri(DisasContext *s, tcg_rd = (accumulate || insert) ? read_fp_dreg(s, rd) : tcg_temp_new_i64(); if (insert) { - handle_shri_with_ins(tcg_rd, tcg_rn, size, shift); + /* shift count same as element size is valid but does nothing; + * special case to avoid potential shift by 64. 
+ */ + int esize = 8 << size; + if (shift != esize) { + tcg_gen_shri_i64(tcg_rn, tcg_rn, shift); + tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, 0, esize - shift); + } } else { handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, accumulate, is_u, size, shift); @@ -6599,7 +6611,11 @@ static void handle_scalar_simd_shli(DisasContext *s, bool insert, tcg_rn = read_fp_dreg(s, rn); tcg_rd = insert ? read_fp_dreg(s, rd) : tcg_temp_new_i64(); - handle_shli_with_ins(tcg_rd, tcg_rn, insert, shift); + if (insert) { + tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, shift, 64 - shift); + } else { + tcg_gen_shli_i64(tcg_rd, tcg_rn, shift); + } write_fp_dreg(s, rd, tcg_rd); @@ -7175,6 +7191,28 @@ static void disas_simd_scalar_three_reg_diff(DisasContext *s, uint32_t insn) } } +/* CMTST : test is "if (X & Y != 0)". */ +static void gen_cmtst_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + tcg_gen_and_i32(d, a, b); + tcg_gen_setcondi_i32(TCG_COND_NE, d, d, 0); + tcg_gen_neg_i32(d, d); +} + +static void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + tcg_gen_and_i64(d, a, b); + tcg_gen_setcondi_i64(TCG_COND_NE, d, d, 0); + tcg_gen_neg_i64(d, d); +} + +static void gen_cmtst_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) +{ + tcg_gen_and_vec(vece, d, a, b); + tcg_gen_dupi_vec(vece, a, 0); + tcg_gen_cmp_vec(TCG_COND_NE, vece, d, d, a); +} + static void handle_3same_64(DisasContext *s, int opcode, bool u, TCGv_i64 tcg_rd, TCGv_i64 tcg_rn, TCGv_i64 tcg_rm) { @@ -7218,10 +7256,7 @@ static void handle_3same_64(DisasContext *s, int opcode, bool u, cond = TCG_COND_EQ; goto do_cmop; } - /* CMTST : test is "if (X & Y != 0)". */ - tcg_gen_and_i64(tcg_rd, tcg_rn, tcg_rm); - tcg_gen_setcondi_i64(TCG_COND_NE, tcg_rd, tcg_rd, 0); - tcg_gen_neg_i64(tcg_rd, tcg_rd); + gen_cmtst_i64(tcg_rd, tcg_rn, tcg_rm); break; case 0x8: /* SSHL, USHL */ if (u) { @@ -8329,16 +8364,195 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn) } } +static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_vec_sar8i_i64(a, a, shift); + tcg_gen_vec_add8_i64(d, d, a); +} + +static void gen_ssra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_vec_sar16i_i64(a, a, shift); + tcg_gen_vec_add16_i64(d, d, a); +} + +static void gen_ssra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) +{ + tcg_gen_sari_i32(a, a, shift); + tcg_gen_add_i32(d, d, a); +} + +static void gen_ssra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_sari_i64(a, a, shift); + tcg_gen_add_i64(d, d, a); +} + +static void gen_ssra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) +{ + tcg_gen_sari_vec(vece, a, a, sh); + tcg_gen_add_vec(vece, d, d, a); +} + +static void gen_usra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_vec_shr8i_i64(a, a, shift); + tcg_gen_vec_add8_i64(d, d, a); +} + +static void gen_usra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_vec_shr16i_i64(a, a, shift); + tcg_gen_vec_add16_i64(d, d, a); +} + +static void gen_usra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) +{ + tcg_gen_shri_i32(a, a, shift); + tcg_gen_add_i32(d, d, a); +} + +static void gen_usra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_shri_i64(a, a, shift); + tcg_gen_add_i64(d, d, a); +} + +static void gen_usra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) +{ + tcg_gen_shri_vec(vece, a, a, sh); + tcg_gen_add_vec(vece, d, d, a); +} + +static void gen_shr8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + uint64_t mask = dup_const(MO_8, 0xff >> shift); + TCGv_i64 t = tcg_temp_new_i64(); + + 
tcg_gen_shri_i64(t, a, shift); + tcg_gen_andi_i64(t, t, mask); + tcg_gen_andi_i64(d, d, ~mask); + tcg_gen_or_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_shr16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + uint64_t mask = dup_const(MO_16, 0xffff >> shift); + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_shri_i64(t, a, shift); + tcg_gen_andi_i64(t, t, mask); + tcg_gen_andi_i64(d, d, ~mask); + tcg_gen_or_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_shr32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) +{ + tcg_gen_shri_i32(a, a, shift); + tcg_gen_deposit_i32(d, d, a, 0, 32 - shift); +} + +static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_shri_i64(a, a, shift); + tcg_gen_deposit_i64(d, d, a, 0, 64 - shift); +} + +static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) +{ + uint64_t mask = (2ull << ((8 << vece) - 1)) - 1; + TCGv_vec t = tcg_temp_new_vec_matching(d); + TCGv_vec m = tcg_temp_new_vec_matching(d); + + tcg_gen_dupi_vec(vece, m, mask ^ (mask >> sh)); + tcg_gen_shri_vec(vece, t, a, sh); + tcg_gen_and_vec(vece, d, d, m); + tcg_gen_or_vec(vece, d, d, t); + + tcg_temp_free_vec(t); + tcg_temp_free_vec(m); +} + /* SSHR[RA]/USHR[RA] - Vector shift right (optional rounding/accumulate) */ static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u, int immh, int immb, int opcode, int rn, int rd) { + static const GVecGen2i ssra_op[4] = { + { .fni8 = gen_ssra8_i64, + .fniv = gen_ssra_vec, + .load_dest = true, + .opc = INDEX_op_sari_vec, + .vece = MO_8 }, + { .fni8 = gen_ssra16_i64, + .fniv = gen_ssra_vec, + .load_dest = true, + .opc = INDEX_op_sari_vec, + .vece = MO_16 }, + { .fni4 = gen_ssra32_i32, + .fniv = gen_ssra_vec, + .load_dest = true, + .opc = INDEX_op_sari_vec, + .vece = MO_32 }, + { .fni8 = gen_ssra64_i64, + .fniv = gen_ssra_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .opc = INDEX_op_sari_vec, + .vece = MO_64 }, + }; + static const GVecGen2i usra_op[4] = { + { .fni8 = gen_usra8_i64, + .fniv = gen_usra_vec, + .load_dest = true, + .opc = INDEX_op_shri_vec, + .vece = MO_8, }, + { .fni8 = gen_usra16_i64, + .fniv = gen_usra_vec, + .load_dest = true, + .opc = INDEX_op_shri_vec, + .vece = MO_16, }, + { .fni4 = gen_usra32_i32, + .fniv = gen_usra_vec, + .load_dest = true, + .opc = INDEX_op_shri_vec, + .vece = MO_32, }, + { .fni8 = gen_usra64_i64, + .fniv = gen_usra_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .opc = INDEX_op_shri_vec, + .vece = MO_64, }, + }; + static const GVecGen2i sri_op[4] = { + { .fni8 = gen_shr8_ins_i64, + .fniv = gen_shr_ins_vec, + .load_dest = true, + .opc = INDEX_op_shri_vec, + .vece = MO_8 }, + { .fni8 = gen_shr16_ins_i64, + .fniv = gen_shr_ins_vec, + .load_dest = true, + .opc = INDEX_op_shri_vec, + .vece = MO_16 }, + { .fni4 = gen_shr32_ins_i32, + .fniv = gen_shr_ins_vec, + .load_dest = true, + .opc = INDEX_op_shri_vec, + .vece = MO_32 }, + { .fni8 = gen_shr64_ins_i64, + .fniv = gen_shr_ins_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .opc = INDEX_op_shri_vec, + .vece = MO_64 }, + }; + int size = 32 - clz32(immh) - 1; int immhb = immh << 3 | immb; int shift = 2 * (8 << size) - immhb; bool accumulate = false; - bool round = false; - bool insert = false; int dsize = is_q ? 
128 : 64; int esize = 8 << size; int elements = dsize/esize; @@ -8346,6 +8560,7 @@ static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u, TCGv_i64 tcg_rn = new_tmp_a64(s); TCGv_i64 tcg_rd = new_tmp_a64(s); TCGv_i64 tcg_round; + uint64_t round_const; int i; if (extract32(immh, 3, 1) && !is_q) { @@ -8364,64 +8579,159 @@ static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u, switch (opcode) { case 0x02: /* SSRA / USRA (accumulate) */ - accumulate = true; - break; + if (is_u) { + /* Shift count same as element size produces zero to add. */ + if (shift == 8 << size) { + goto done; + } + gen_gvec_op2i(s, is_q, rd, rn, shift, &usra_op[size]); + } else { + /* Shift count same as element size produces all sign to add. */ + if (shift == 8 << size) { + shift -= 1; + } + gen_gvec_op2i(s, is_q, rd, rn, shift, &ssra_op[size]); + } + return; + case 0x08: /* SRI */ + /* Shift count same as element size is valid but does nothing. */ + if (shift == 8 << size) { + goto done; + } + gen_gvec_op2i(s, is_q, rd, rn, shift, &sri_op[size]); + return; + + case 0x00: /* SSHR / USHR */ + if (is_u) { + if (shift == 8 << size) { + /* Shift count the same size as element size produces zero. */ + tcg_gen_gvec_dup8i(vec_full_reg_offset(s, rd), + is_q ? 16 : 8, vec_full_reg_size(s), 0); + } else { + gen_gvec_fn2i(s, is_q, rd, rn, shift, tcg_gen_gvec_shri, size); + } + } else { + /* Shift count the same size as element size produces all sign. */ + if (shift == 8 << size) { + shift -= 1; + } + gen_gvec_fn2i(s, is_q, rd, rn, shift, tcg_gen_gvec_sari, size); + } + return; + case 0x04: /* SRSHR / URSHR (rounding) */ - round = true; break; case 0x06: /* SRSRA / URSRA (accum + rounding) */ - accumulate = round = true; - break; - case 0x08: /* SRI */ - insert = true; + accumulate = true; break; + default: + g_assert_not_reached(); } - if (round) { - uint64_t round_const = 1ULL << (shift - 1); - tcg_round = tcg_const_i64(round_const); - } else { - tcg_round = NULL; - } + round_const = 1ULL << (shift - 1); + tcg_round = tcg_const_i64(round_const); for (i = 0; i < elements; i++) { read_vec_element(s, tcg_rn, rn, i, memop); - if (accumulate || insert) { + if (accumulate) { read_vec_element(s, tcg_rd, rd, i, memop); } - if (insert) { - handle_shri_with_ins(tcg_rd, tcg_rn, size, shift); - } else { - handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, - accumulate, is_u, size, shift); - } + handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, + accumulate, is_u, size, shift); write_vec_element(s, tcg_rd, rd, i, size); } + tcg_temp_free_i64(tcg_round); + done: if (!is_q) { clear_vec_high(s, rd); } +} - if (round) { - tcg_temp_free_i64(tcg_round); - } +static void gen_shl8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + uint64_t mask = dup_const(MO_8, 0xff << shift); + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_shli_i64(t, a, shift); + tcg_gen_andi_i64(t, t, mask); + tcg_gen_andi_i64(d, d, ~mask); + tcg_gen_or_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_shl16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + uint64_t mask = dup_const(MO_16, 0xffff << shift); + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_shli_i64(t, a, shift); + tcg_gen_andi_i64(t, t, mask); + tcg_gen_andi_i64(d, d, ~mask); + tcg_gen_or_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_shl32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) +{ + tcg_gen_deposit_i32(d, d, a, shift, 32 - shift); +} + +static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_deposit_i64(d, d, a, shift, 64 - 
shift); +} + +static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) +{ + uint64_t mask = (1ull << sh) - 1; + TCGv_vec t = tcg_temp_new_vec_matching(d); + TCGv_vec m = tcg_temp_new_vec_matching(d); + + tcg_gen_dupi_vec(vece, m, mask); + tcg_gen_shli_vec(vece, t, a, sh); + tcg_gen_and_vec(vece, d, d, m); + tcg_gen_or_vec(vece, d, d, t); + + tcg_temp_free_vec(t); + tcg_temp_free_vec(m); } /* SHL/SLI - Vector shift left */ static void handle_vec_simd_shli(DisasContext *s, bool is_q, bool insert, - int immh, int immb, int opcode, int rn, int rd) + int immh, int immb, int opcode, int rn, int rd) { + static const GVecGen2i shi_op[4] = { + { .fni8 = gen_shl8_ins_i64, + .fniv = gen_shl_ins_vec, + .opc = INDEX_op_shli_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .vece = MO_8 }, + { .fni8 = gen_shl16_ins_i64, + .fniv = gen_shl_ins_vec, + .opc = INDEX_op_shli_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .vece = MO_16 }, + { .fni4 = gen_shl32_ins_i32, + .fniv = gen_shl_ins_vec, + .opc = INDEX_op_shli_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .vece = MO_32 }, + { .fni8 = gen_shl64_ins_i64, + .fniv = gen_shl_ins_vec, + .opc = INDEX_op_shli_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .vece = MO_64 }, + }; int size = 32 - clz32(immh) - 1; int immhb = immh << 3 | immb; int shift = immhb - (8 << size); - int dsize = is_q ? 128 : 64; - int esize = 8 << size; - int elements = dsize/esize; - TCGv_i64 tcg_rn = new_tmp_a64(s); - TCGv_i64 tcg_rd = new_tmp_a64(s); - int i; if (extract32(immh, 3, 1) && !is_q) { unallocated_encoding(s); @@ -8437,19 +8747,10 @@ static void handle_vec_simd_shli(DisasContext *s, bool is_q, bool insert, return; } - for (i = 0; i < elements; i++) { - read_vec_element(s, tcg_rn, rn, i, size); - if (insert) { - read_vec_element(s, tcg_rd, rd, i, size); - } - - handle_shli_with_ins(tcg_rd, tcg_rn, insert, shift); - - write_vec_element(s, tcg_rd, rd, i, size); - } - - if (!is_q) { - clear_vec_high(s, rd); + if (insert) { + gen_gvec_op2i(s, is_q, rd, rn, shift, &shi_op[size]); + } else { + gen_gvec_fn2i(s, is_q, rd, rn, shift, tcg_gen_gvec_shli, size); } } @@ -9072,85 +9373,115 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn) } } +static void gen_bsl_i64(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm) +{ + tcg_gen_xor_i64(rn, rn, rm); + tcg_gen_and_i64(rn, rn, rd); + tcg_gen_xor_i64(rd, rm, rn); +} + +static void gen_bit_i64(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm) +{ + tcg_gen_xor_i64(rn, rn, rd); + tcg_gen_and_i64(rn, rn, rm); + tcg_gen_xor_i64(rd, rd, rn); +} + +static void gen_bif_i64(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm) +{ + tcg_gen_xor_i64(rn, rn, rd); + tcg_gen_andc_i64(rn, rn, rm); + tcg_gen_xor_i64(rd, rd, rn); +} + +static void gen_bsl_vec(unsigned vece, TCGv_vec rd, TCGv_vec rn, TCGv_vec rm) +{ + tcg_gen_xor_vec(vece, rn, rn, rm); + tcg_gen_and_vec(vece, rn, rn, rd); + tcg_gen_xor_vec(vece, rd, rm, rn); +} + +static void gen_bit_vec(unsigned vece, TCGv_vec rd, TCGv_vec rn, TCGv_vec rm) +{ + tcg_gen_xor_vec(vece, rn, rn, rd); + tcg_gen_and_vec(vece, rn, rn, rm); + tcg_gen_xor_vec(vece, rd, rd, rn); +} + +static void gen_bif_vec(unsigned vece, TCGv_vec rd, TCGv_vec rn, TCGv_vec rm) +{ + tcg_gen_xor_vec(vece, rn, rn, rd); + tcg_gen_andc_vec(vece, rn, rn, rm); + tcg_gen_xor_vec(vece, rd, rd, rn); +} + /* Logic op (opcode == 3) subgroup of C3.6.16. 
*/ static void disas_simd_3same_logic(DisasContext *s, uint32_t insn) { + static const GVecGen3 bsl_op = { + .fni8 = gen_bsl_i64, + .fniv = gen_bsl_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true + }; + static const GVecGen3 bit_op = { + .fni8 = gen_bit_i64, + .fniv = gen_bit_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true + }; + static const GVecGen3 bif_op = { + .fni8 = gen_bif_i64, + .fniv = gen_bif_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true + }; + int rd = extract32(insn, 0, 5); int rn = extract32(insn, 5, 5); int rm = extract32(insn, 16, 5); int size = extract32(insn, 22, 2); bool is_u = extract32(insn, 29, 1); bool is_q = extract32(insn, 30, 1); - TCGv_i64 tcg_op1, tcg_op2, tcg_res[2]; - int pass; if (!fp_access_check(s)) { return; } - tcg_op1 = tcg_temp_new_i64(); - tcg_op2 = tcg_temp_new_i64(); - tcg_res[0] = tcg_temp_new_i64(); - tcg_res[1] = tcg_temp_new_i64(); - - for (pass = 0; pass < (is_q ? 2 : 1); pass++) { - read_vec_element(s, tcg_op1, rn, pass, MO_64); - read_vec_element(s, tcg_op2, rm, pass, MO_64); - - if (!is_u) { - switch (size) { - case 0: /* AND */ - tcg_gen_and_i64(tcg_res[pass], tcg_op1, tcg_op2); - break; - case 1: /* BIC */ - tcg_gen_andc_i64(tcg_res[pass], tcg_op1, tcg_op2); - break; - case 2: /* ORR */ - tcg_gen_or_i64(tcg_res[pass], tcg_op1, tcg_op2); - break; - case 3: /* ORN */ - tcg_gen_orc_i64(tcg_res[pass], tcg_op1, tcg_op2); - break; - } + switch (size + 4 * is_u) { + case 0: /* AND */ + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_and, 0); + return; + case 1: /* BIC */ + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_andc, 0); + return; + case 2: /* ORR */ + if (rn == rm) { /* MOV */ + gen_gvec_fn2(s, is_q, rd, rn, tcg_gen_gvec_mov, 0); } else { - if (size != 0) { - /* B* ops need res loaded to operate on */ - read_vec_element(s, tcg_res[pass], rd, pass, MO_64); - } - - switch (size) { - case 0: /* EOR */ - tcg_gen_xor_i64(tcg_res[pass], tcg_op1, tcg_op2); - break; - case 1: /* BSL bitwise select */ - tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_op2); - tcg_gen_and_i64(tcg_op1, tcg_op1, tcg_res[pass]); - tcg_gen_xor_i64(tcg_res[pass], tcg_op2, tcg_op1); - break; - case 2: /* BIT, bitwise insert if true */ - tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_res[pass]); - tcg_gen_and_i64(tcg_op1, tcg_op1, tcg_op2); - tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1); - break; - case 3: /* BIF, bitwise insert if false */ - tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_res[pass]); - tcg_gen_andc_i64(tcg_op1, tcg_op1, tcg_op2); - tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1); - break; - } + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_or, 0); } - } + return; + case 3: /* ORN */ + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_orc, 0); + return; + case 4: /* EOR */ + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_xor, 0); + return; - write_vec_element(s, tcg_res[0], rd, 0, MO_64); - if (!is_q) { - tcg_gen_movi_i64(tcg_res[1], 0); - } - write_vec_element(s, tcg_res[1], rd, 1, MO_64); + case 5: /* BSL bitwise select */ + gen_gvec_op3(s, is_q, rd, rn, rm, &bsl_op); + return; + case 6: /* BIT, bitwise insert if true */ + gen_gvec_op3(s, is_q, rd, rn, rm, &bit_op); + return; + case 7: /* BIF, bitwise insert if false */ + gen_gvec_op3(s, is_q, rd, rn, rm, &bif_op); + return; - tcg_temp_free_i64(tcg_op1); - tcg_temp_free_i64(tcg_op2); - tcg_temp_free_i64(tcg_res[0]); - tcg_temp_free_i64(tcg_res[1]); + default: + g_assert_not_reached(); + } } /* Helper functions for 32 bit comparisons */ @@ -9400,9 +9731,131 @@ 
static void disas_simd_3same_float(DisasContext *s, uint32_t insn) } } +static void gen_mla8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + gen_helper_neon_mul_u8(a, a, b); + gen_helper_neon_add_u8(d, d, a); +} + +static void gen_mla16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + gen_helper_neon_mul_u16(a, a, b); + gen_helper_neon_add_u16(d, d, a); +} + +static void gen_mla32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + tcg_gen_mul_i32(a, a, b); + tcg_gen_add_i32(d, d, a); +} + +static void gen_mla64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + tcg_gen_mul_i64(a, a, b); + tcg_gen_add_i64(d, d, a); +} + +static void gen_mla_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) +{ + tcg_gen_mul_vec(vece, a, a, b); + tcg_gen_add_vec(vece, d, d, a); +} + +static void gen_mls8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + gen_helper_neon_mul_u8(a, a, b); + gen_helper_neon_sub_u8(d, d, a); +} + +static void gen_mls16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + gen_helper_neon_mul_u16(a, a, b); + gen_helper_neon_sub_u16(d, d, a); +} + +static void gen_mls32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + tcg_gen_mul_i32(a, a, b); + tcg_gen_sub_i32(d, d, a); +} + +static void gen_mls64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + tcg_gen_mul_i64(a, a, b); + tcg_gen_sub_i64(d, d, a); +} + +static void gen_mls_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) +{ + tcg_gen_mul_vec(vece, a, a, b); + tcg_gen_sub_vec(vece, d, d, a); +} + /* Integer op subgroup of C3.6.16. */ static void disas_simd_3same_int(DisasContext *s, uint32_t insn) { + static const GVecGen3 cmtst_op[4] = { + { .fni4 = gen_helper_neon_tst_u8, + .fniv = gen_cmtst_vec, + .vece = MO_8 }, + { .fni4 = gen_helper_neon_tst_u16, + .fniv = gen_cmtst_vec, + .vece = MO_16 }, + { .fni4 = gen_cmtst_i32, + .fniv = gen_cmtst_vec, + .vece = MO_32 }, + { .fni8 = gen_cmtst_i64, + .fniv = gen_cmtst_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + static const GVecGen3 mla_op[4] = { + { .fni4 = gen_mla8_i32, + .fniv = gen_mla_vec, + .opc = INDEX_op_mul_vec, + .load_dest = true, + .vece = MO_8 }, + { .fni4 = gen_mla16_i32, + .fniv = gen_mla_vec, + .opc = INDEX_op_mul_vec, + .load_dest = true, + .vece = MO_16 }, + { .fni4 = gen_mla32_i32, + .fniv = gen_mla_vec, + .opc = INDEX_op_mul_vec, + .load_dest = true, + .vece = MO_32 }, + { .fni8 = gen_mla64_i64, + .fniv = gen_mla_vec, + .opc = INDEX_op_mul_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .vece = MO_64 }, + }; + static const GVecGen3 mls_op[4] = { + { .fni4 = gen_mls8_i32, + .fniv = gen_mls_vec, + .opc = INDEX_op_mul_vec, + .load_dest = true, + .vece = MO_8 }, + { .fni4 = gen_mls16_i32, + .fniv = gen_mls_vec, + .opc = INDEX_op_mul_vec, + .load_dest = true, + .vece = MO_16 }, + { .fni4 = gen_mls32_i32, + .fniv = gen_mls_vec, + .opc = INDEX_op_mul_vec, + .load_dest = true, + .vece = MO_32 }, + { .fni8 = gen_mls64_i64, + .fniv = gen_mls_vec, + .opc = INDEX_op_mul_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .vece = MO_64 }, + }; + int is_q = extract32(insn, 30, 1); int u = extract32(insn, 29, 1); int size = extract32(insn, 22, 2); @@ -9411,6 +9864,7 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn) int rn = extract32(insn, 5, 5); int rd = extract32(insn, 0, 5); int pass; + TCGCond cond; switch (opcode) { case 0x13: /* MUL, PMUL */ @@ -9450,6 +9904,48 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn) return; } + switch (opcode) { + case 0x10: /* ADD, SUB */ + if (u) { + gen_gvec_fn3(s, is_q, 
rd, rn, rm, tcg_gen_gvec_sub, size); + } else { + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_add, size); + } + return; + case 0x13: /* MUL, PMUL */ + if (!u) { /* MUL */ + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_mul, size); + return; + } + break; + case 0x12: /* MLA, MLS */ + if (u) { + gen_gvec_op3(s, is_q, rd, rn, rm, &mls_op[size]); + } else { + gen_gvec_op3(s, is_q, rd, rn, rm, &mla_op[size]); + } + return; + case 0x11: + if (!u) { /* CMTST */ + gen_gvec_op3(s, is_q, rd, rn, rm, &cmtst_op[size]); + return; + } + /* else CMEQ */ + cond = TCG_COND_EQ; + goto do_gvec_cmp; + case 0x06: /* CMGT, CMHI */ + cond = u ? TCG_COND_GTU : TCG_COND_GT; + goto do_gvec_cmp; + case 0x07: /* CMGE, CMHS */ + cond = u ? TCG_COND_GEU : TCG_COND_GE; + do_gvec_cmp: + tcg_gen_gvec_cmp(cond, size, vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + is_q ? 16 : 8, vec_full_reg_size(s)); + return; + } + if (size == 3) { assert(is_q); for (pass = 0; pass < 2; pass++) { @@ -9530,26 +10026,6 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn) genenvfn = fns[size][u]; break; } - case 0x6: /* CMGT, CMHI */ - { - static NeonGenTwoOpFn * const fns[3][2] = { - { gen_helper_neon_cgt_s8, gen_helper_neon_cgt_u8 }, - { gen_helper_neon_cgt_s16, gen_helper_neon_cgt_u16 }, - { gen_helper_neon_cgt_s32, gen_helper_neon_cgt_u32 }, - }; - genfn = fns[size][u]; - break; - } - case 0x7: /* CMGE, CMHS */ - { - static NeonGenTwoOpFn * const fns[3][2] = { - { gen_helper_neon_cge_s8, gen_helper_neon_cge_u8 }, - { gen_helper_neon_cge_s16, gen_helper_neon_cge_u16 }, - { gen_helper_neon_cge_s32, gen_helper_neon_cge_u32 }, - }; - genfn = fns[size][u]; - break; - } case 0x8: /* SSHL, USHL */ { static NeonGenTwoOpFn * const fns[3][2] = { @@ -9622,44 +10098,11 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn) genfn = fns[size][u]; break; } - case 0x10: /* ADD, SUB */ - { - static NeonGenTwoOpFn * const fns[3][2] = { - { gen_helper_neon_add_u8, gen_helper_neon_sub_u8 }, - { gen_helper_neon_add_u16, gen_helper_neon_sub_u16 }, - { tcg_gen_add_i32, tcg_gen_sub_i32 }, - }; - genfn = fns[size][u]; - break; - } - case 0x11: /* CMTST, CMEQ */ - { - static NeonGenTwoOpFn * const fns[3][2] = { - { gen_helper_neon_tst_u8, gen_helper_neon_ceq_u8 }, - { gen_helper_neon_tst_u16, gen_helper_neon_ceq_u16 }, - { gen_helper_neon_tst_u32, gen_helper_neon_ceq_u32 }, - }; - genfn = fns[size][u]; - break; - } case 0x13: /* MUL, PMUL */ - if (u) { - /* PMUL */ - assert(size == 0); - genfn = gen_helper_neon_mul_p8; - break; - } - /* fall through : MUL */ - case 0x12: /* MLA, MLS */ - { - static NeonGenTwoOpFn * const fns[3] = { - gen_helper_neon_mul_u8, - gen_helper_neon_mul_u16, - tcg_gen_mul_i32, - }; - genfn = fns[size]; + assert(u); /* PMUL */ + assert(size == 0); + genfn = gen_helper_neon_mul_p8; break; - } case 0x16: /* SQDMULH, SQRDMULH */ { static NeonGenTwoOpEnvFn * const fns[2][2] = { @@ -9680,18 +10123,16 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn) genfn(tcg_res, tcg_op1, tcg_op2); } - if (opcode == 0xf || opcode == 0x12) { - /* SABA, UABA, MLA, MLS: accumulating ops */ - static NeonGenTwoOpFn * const fns[3][2] = { - { gen_helper_neon_add_u8, gen_helper_neon_sub_u8 }, - { gen_helper_neon_add_u16, gen_helper_neon_sub_u16 }, - { tcg_gen_add_i32, tcg_gen_sub_i32 }, + if (opcode == 0xf) { + /* SABA, UABA: accumulating ops */ + static NeonGenTwoOpFn * const fns[3] = { + gen_helper_neon_add_u8, + gen_helper_neon_add_u16, + tcg_gen_add_i32, }; - bool is_sub 
= (opcode == 0x12 && u); /* MLS */ - genfn = fns[size][is_sub]; read_vec_element_i32(s, tcg_op1, rd, pass, MO_32); - genfn(tcg_res, tcg_op1, tcg_res); + fns[size](tcg_res, tcg_op1, tcg_res); } write_vec_element_i32(s, tcg_res, rd, pass, MO_32); @@ -10003,8 +10444,7 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn) return; case 0x5: /* CNT, NOT, RBIT */ if (u && size == 0) { - /* NOT: adjust size so we can use the 64-bits-at-a-time loop. */ - size = 3; + /* NOT */ break; } else if (u && size == 1) { /* RBIT */ @@ -10256,6 +10696,21 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn) tcg_rmode = NULL; } + switch (opcode) { + case 0x5: + if (u && size == 0) { /* NOT */ + gen_gvec_fn2(s, is_q, rd, rn, tcg_gen_gvec_not, 0); + return; + } + break; + case 0xb: + if (u) { /* NEG */ + gen_gvec_fn2(s, is_q, rd, rn, tcg_gen_gvec_neg, size); + return; + } + break; + } + if (size == 3) { /* All 64-bit element operations can be shared with scalar 2misc */ int pass; diff --git a/tcg/README b/tcg/README index 03bfb6acd4..bb2ea5121b 100644 --- a/tcg/README +++ b/tcg/README @@ -503,6 +503,92 @@ of the memory access. For a 32-bit host, qemu_ld/st_i64 is guaranteed to only be used with a 64-bit memory access specified in flags. +********* Host vector operations + +All of the vector ops have two parameters, TCGOP_VECL & TCGOP_VECE. +The former specifies the length of the vector in log2 64-bit units; the +later specifies the length of the element (if applicable) in log2 8-bit units. +E.g. VECL=1 -> 64 << 1 -> v128, and VECE=2 -> 1 << 2 -> i32. + +* mov_vec v0, v1 +* ld_vec v0, t1 +* st_vec v0, t1 + + Move, load and store. + +* dup_vec v0, r1 + + Duplicate the low N bits of R1 into VECL/VECE copies across V0. + +* dupi_vec v0, c + + Similarly, for a constant. + Smaller values will be replicated to host register size by the expanders. + +* dup2_vec v0, r1, r2 + + Duplicate r2:r1 into VECL/64 copies across V0. This opcode is + only present for 32-bit hosts. + +* add_vec v0, v1, v2 + + v0 = v1 + v2, in elements across the vector. + +* sub_vec v0, v1, v2 + + Similarly, v0 = v1 - v2. + +* mul_vec v0, v1, v2 + + Similarly, v0 = v1 * v2. + +* neg_vec v0, v1 + + Similarly, v0 = -v1. + +* and_vec v0, v1, v2 +* or_vec v0, v1, v2 +* xor_vec v0, v1, v2 +* andc_vec v0, v1, v2 +* orc_vec v0, v1, v2 +* not_vec v0, v1 + + Similarly, logical operations with and without compliment. + Note that VECE is unused. + +* shli_vec v0, v1, i2 +* shls_vec v0, v1, s2 + + Shift all elements from v1 by a scalar i2/s2. I.e. + + for (i = 0; i < VECL/VECE; ++i) { + v0[i] = v1[i] << s2; + } + +* shri_vec v0, v1, i2 +* sari_vec v0, v1, i2 +* shrs_vec v0, v1, s2 +* sars_vec v0, v1, s2 + + Similarly for logical and arithmetic right shift. + +* shlv_vec v0, v1, v2 + + Shift elements from v1 by elements from v2. I.e. + + for (i = 0; i < VECL/VECE; ++i) { + v0[i] = v1[i] << v2[i]; + } + +* shrv_vec v0, v1, v2 +* sarv_vec v0, v1, v2 + + Similarly for logical and arithmetic right shift. + +* cmp_vec v0, v1, v2, cond + + Compare vectors by element, storing -1 for true and 0 for false. 
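As a concrete reading of the VECL/VECE encoding described above, the sizes can be recovered as in the following sketch (illustration only, not part of this patch; the helper name is invented):

  /* Recover the sizes implied by the TCGOP_VECL/TCGOP_VECE parameters. */
  static unsigned vec_lanes(unsigned vecl, unsigned vece)
  {
      unsigned vec_bits = 64 << vecl;   /* VECL = 1 -> 128-bit vector (v128) */
      unsigned elt_bits = 8 << vece;    /* VECE = 2 -> 32-bit element (i32)  */
      return vec_bits / elt_bits;       /* e.g. 128 / 32 = 4 lanes           */
  }

  With VECL=1 and VECE=2, cmp_vec therefore compares four 32-bit lanes and
  writes -1 (true) or 0 (false) into each.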
+ ********* Note 1: Some shortcuts are defined when the last operand is known to be diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h index c2525066ab..9aea1d1771 100644 --- a/tcg/aarch64/tcg-target.h +++ b/tcg/aarch64/tcg-target.h @@ -31,13 +31,22 @@ typedef enum { TCG_REG_SP = 31, TCG_REG_XZR = 31, + TCG_REG_V0 = 32, TCG_REG_V1, TCG_REG_V2, TCG_REG_V3, + TCG_REG_V4, TCG_REG_V5, TCG_REG_V6, TCG_REG_V7, + TCG_REG_V8, TCG_REG_V9, TCG_REG_V10, TCG_REG_V11, + TCG_REG_V12, TCG_REG_V13, TCG_REG_V14, TCG_REG_V15, + TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19, + TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23, + TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27, + TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31, + /* Aliases. */ TCG_REG_FP = TCG_REG_X29, TCG_REG_LR = TCG_REG_X30, TCG_AREG0 = TCG_REG_X19, } TCGReg; -#define TCG_TARGET_NB_REGS 32 +#define TCG_TARGET_NB_REGS 64 /* used for function call generation */ #define TCG_REG_CALL_STACK TCG_REG_SP @@ -113,6 +122,20 @@ typedef enum { #define TCG_TARGET_HAS_mulsh_i64 1 #define TCG_TARGET_HAS_direct_jump 1 +#define TCG_TARGET_HAS_v64 1 +#define TCG_TARGET_HAS_v128 1 +#define TCG_TARGET_HAS_v256 0 + +#define TCG_TARGET_HAS_andc_vec 1 +#define TCG_TARGET_HAS_orc_vec 1 +#define TCG_TARGET_HAS_not_vec 1 +#define TCG_TARGET_HAS_neg_vec 1 +#define TCG_TARGET_HAS_shi_vec 1 +#define TCG_TARGET_HAS_shs_vec 0 +#define TCG_TARGET_HAS_shv_vec 0 +#define TCG_TARGET_HAS_cmp_vec 1 +#define TCG_TARGET_HAS_mul_vec 1 + #define TCG_TARGET_DEFAULT_MO (0) static inline void flush_icache_range(uintptr_t start, uintptr_t stop) diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c index 150530f30e..be3192078d 100644 --- a/tcg/aarch64/tcg-target.inc.c +++ b/tcg/aarch64/tcg-target.inc.c @@ -20,10 +20,15 @@ QEMU_BUILD_BUG_ON(TCG_TYPE_I32 != 0 || TCG_TYPE_I64 != 1); #ifdef CONFIG_DEBUG_TCG static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { - "%x0", "%x1", "%x2", "%x3", "%x4", "%x5", "%x6", "%x7", - "%x8", "%x9", "%x10", "%x11", "%x12", "%x13", "%x14", "%x15", - "%x16", "%x17", "%x18", "%x19", "%x20", "%x21", "%x22", "%x23", - "%x24", "%x25", "%x26", "%x27", "%x28", "%fp", "%x30", "%sp", + "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", + "x24", "x25", "x26", "x27", "x28", "fp", "x30", "sp", + + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "fp", "v30", "v31", }; #endif /* CONFIG_DEBUG_TCG */ @@ -43,6 +48,14 @@ static const int tcg_target_reg_alloc_order[] = { /* X19 reserved for AREG0 */ /* X29 reserved as fp */ /* X30 reserved as temporary */ + + TCG_REG_V0, TCG_REG_V1, TCG_REG_V2, TCG_REG_V3, + TCG_REG_V4, TCG_REG_V5, TCG_REG_V6, TCG_REG_V7, + /* V8 - V15 are call-saved, and skipped. 
*/ + TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19, + TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23, + TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27, + TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31, }; static const int tcg_target_call_iarg_regs[8] = { @@ -54,6 +67,7 @@ static const int tcg_target_call_oarg_regs[1] = { }; #define TCG_REG_TMP TCG_REG_X30 +#define TCG_VEC_TMP TCG_REG_V31 #ifndef CONFIG_SOFTMMU /* Note that XZR cannot be encoded in the address base register slot, @@ -119,9 +133,13 @@ static const char *target_parse_constraint(TCGArgConstraint *ct, const char *ct_str, TCGType type) { switch (*ct_str++) { - case 'r': + case 'r': /* general registers */ ct->ct |= TCG_CT_REG; - ct->u.regs = 0xffffffffu; + ct->u.regs |= 0xffffffffu; + break; + case 'w': /* advsimd registers */ + ct->ct |= TCG_CT_REG; + ct->u.regs |= 0xffffffff00000000ull; break; case 'l': /* qemu_ld / qemu_st address, data_reg */ ct->ct |= TCG_CT_REG; @@ -153,11 +171,13 @@ static const char *target_parse_constraint(TCGArgConstraint *ct, return ct_str; } +/* Match a constant valid for addition (12-bit, optionally shifted). */ static inline bool is_aimm(uint64_t val) { return (val & ~0xfff) == 0 || (val & ~0xfff000) == 0; } +/* Match a constant valid for logical operations. */ static inline bool is_limm(uint64_t val) { /* Taking a simplified view of the logical immediates for now, ignoring @@ -178,6 +198,106 @@ static inline bool is_limm(uint64_t val) return (val & (val - 1)) == 0; } +/* Match a constant that is valid for vectors. */ +static bool is_fimm(uint64_t v64, int *op, int *cmode, int *imm8) +{ + int i; + + *op = 0; + /* Match replication across 8 bits. */ + if (v64 == dup_const(MO_8, v64)) { + *cmode = 0xe; + *imm8 = v64 & 0xff; + return true; + } + /* Match replication across 16 bits. */ + if (v64 == dup_const(MO_16, v64)) { + uint16_t v16 = v64; + + if (v16 == (v16 & 0xff)) { + *cmode = 0x8; + *imm8 = v16 & 0xff; + return true; + } else if (v16 == (v16 & 0xff00)) { + *cmode = 0xa; + *imm8 = v16 >> 8; + return true; + } + } + /* Match replication across 32 bits. */ + if (v64 == dup_const(MO_32, v64)) { + uint32_t v32 = v64; + + if (v32 == (v32 & 0xff)) { + *cmode = 0x0; + *imm8 = v32 & 0xff; + return true; + } else if (v32 == (v32 & 0xff00)) { + *cmode = 0x2; + *imm8 = (v32 >> 8) & 0xff; + return true; + } else if (v32 == (v32 & 0xff0000)) { + *cmode = 0x4; + *imm8 = (v32 >> 16) & 0xff; + return true; + } else if (v32 == (v32 & 0xff000000)) { + *cmode = 0x6; + *imm8 = v32 >> 24; + return true; + } else if ((v32 & 0xffff00ff) == 0xff) { + *cmode = 0xc; + *imm8 = (v32 >> 8) & 0xff; + return true; + } else if ((v32 & 0xff00ffff) == 0xffff) { + *cmode = 0xd; + *imm8 = (v32 >> 16) & 0xff; + return true; + } + /* Match forms of a float32. */ + if (extract32(v32, 0, 19) == 0 + && (extract32(v32, 25, 6) == 0x20 + || extract32(v32, 25, 6) == 0x1f)) { + *cmode = 0xf; + *imm8 = (extract32(v32, 31, 1) << 7) + | (extract32(v32, 25, 1) << 6) + | extract32(v32, 19, 6); + return true; + } + } + /* Match forms of a float64. */ + if (extract64(v64, 0, 48) == 0 + && (extract64(v64, 54, 9) == 0x100 + || extract64(v64, 54, 9) == 0x0ff)) { + *cmode = 0xf; + *op = 1; + *imm8 = (extract64(v64, 63, 1) << 7) + | (extract64(v64, 54, 1) << 6) + | extract64(v64, 48, 6); + return true; + } + /* Match bytes of 0x00 and 0xff. 
*/ + for (i = 0; i < 64; i += 8) { + uint64_t byte = extract64(v64, i, 8); + if (byte != 0 && byte != 0xff) { + break; + } + } + if (i == 64) { + *cmode = 0xe; + *op = 1; + *imm8 = (extract64(v64, 0, 1) << 0) + | (extract64(v64, 8, 1) << 1) + | (extract64(v64, 16, 1) << 2) + | (extract64(v64, 24, 1) << 3) + | (extract64(v64, 32, 1) << 4) + | (extract64(v64, 40, 1) << 5) + | (extract64(v64, 48, 1) << 6) + | (extract64(v64, 56, 1) << 7); + return true; + } + return false; +} + static int tcg_target_const_match(tcg_target_long val, TCGType type, const TCGArgConstraint *arg_ct) { @@ -271,6 +391,9 @@ typedef enum { /* Load literal for loading the address at pc-relative offset */ I3305_LDR = 0x58000000, + I3305_LDR_v64 = 0x5c000000, + I3305_LDR_v128 = 0x9c000000, + /* Load/store register. Described here as 3.3.12, but the helper that emits them can transform to 3.3.10 or 3.3.13. */ I3312_STRB = 0x38000000 | LDST_ST << 22 | MO_8 << 30, @@ -290,6 +413,15 @@ typedef enum { I3312_LDRSHX = 0x38000000 | LDST_LD_S_X << 22 | MO_16 << 30, I3312_LDRSWX = 0x38000000 | LDST_LD_S_X << 22 | MO_32 << 30, + I3312_LDRVS = 0x3c000000 | LDST_LD << 22 | MO_32 << 30, + I3312_STRVS = 0x3c000000 | LDST_ST << 22 | MO_32 << 30, + + I3312_LDRVD = 0x3c000000 | LDST_LD << 22 | MO_64 << 30, + I3312_STRVD = 0x3c000000 | LDST_ST << 22 | MO_64 << 30, + + I3312_LDRVQ = 0x3c000000 | 3 << 22 | 0 << 30, + I3312_STRVQ = 0x3c000000 | 2 << 22 | 0 << 30, + I3312_TO_I3310 = 0x00200800, I3312_TO_I3313 = 0x01000000, @@ -374,8 +506,48 @@ typedef enum { I3510_EON = 0x4a200000, I3510_ANDS = 0x6a000000, - NOP = 0xd503201f, + /* AdvSIMD copy */ + I3605_DUP = 0x0e000400, + I3605_INS = 0x4e001c00, + I3605_UMOV = 0x0e003c00, + + /* AdvSIMD modified immediate */ + I3606_MOVI = 0x0f000400, + + /* AdvSIMD shift by immediate */ + I3614_SSHR = 0x0f000400, + I3614_SSRA = 0x0f001400, + I3614_SHL = 0x0f005400, + I3614_USHR = 0x2f000400, + I3614_USRA = 0x2f001400, + + /* AdvSIMD three same. */ + I3616_ADD = 0x0e208400, + I3616_AND = 0x0e201c00, + I3616_BIC = 0x0e601c00, + I3616_EOR = 0x2e201c00, + I3616_MUL = 0x0e209c00, + I3616_ORR = 0x0ea01c00, + I3616_ORN = 0x0ee01c00, + I3616_SUB = 0x2e208400, + I3616_CMGT = 0x0e203400, + I3616_CMGE = 0x0e203c00, + I3616_CMTST = 0x0e208c00, + I3616_CMHI = 0x2e203400, + I3616_CMHS = 0x2e203c00, + I3616_CMEQ = 0x2e208c00, + + /* AdvSIMD two-reg misc. */ + I3617_CMGT0 = 0x0e208800, + I3617_CMEQ0 = 0x0e209800, + I3617_CMLT0 = 0x0e20a800, + I3617_CMGE0 = 0x2e208800, + I3617_CMLE0 = 0x2e20a800, + I3617_NOT = 0x2e205800, + I3617_NEG = 0x2e20b800, + /* System instructions. */ + NOP = 0xd503201f, DMB_ISH = 0xd50338bf, DMB_LD = 0x00000100, DMB_ST = 0x00000200, @@ -520,26 +692,64 @@ static void tcg_out_insn_3509(TCGContext *s, AArch64Insn insn, TCGType ext, tcg_out32(s, insn | ext << 31 | rm << 16 | ra << 10 | rn << 5 | rd); } +static void tcg_out_insn_3605(TCGContext *s, AArch64Insn insn, bool q, + TCGReg rd, TCGReg rn, int dst_idx, int src_idx) +{ + /* Note that bit 11 set means general register input. Therefore + we can handle both register sets with one function. 
*/ + tcg_out32(s, insn | q << 30 | (dst_idx << 16) | (src_idx << 11) + | (rd & 0x1f) | (~rn & 0x20) << 6 | (rn & 0x1f) << 5); +} + +static void tcg_out_insn_3606(TCGContext *s, AArch64Insn insn, bool q, + TCGReg rd, bool op, int cmode, uint8_t imm8) +{ + tcg_out32(s, insn | q << 30 | op << 29 | cmode << 12 | (rd & 0x1f) + | (imm8 & 0xe0) << (16 - 5) | (imm8 & 0x1f) << 5); +} + +static void tcg_out_insn_3614(TCGContext *s, AArch64Insn insn, bool q, + TCGReg rd, TCGReg rn, unsigned immhb) +{ + tcg_out32(s, insn | q << 30 | immhb << 16 + | (rn & 0x1f) << 5 | (rd & 0x1f)); +} + +static void tcg_out_insn_3616(TCGContext *s, AArch64Insn insn, bool q, + unsigned size, TCGReg rd, TCGReg rn, TCGReg rm) +{ + tcg_out32(s, insn | q << 30 | (size << 22) | (rm & 0x1f) << 16 + | (rn & 0x1f) << 5 | (rd & 0x1f)); +} + +static void tcg_out_insn_3617(TCGContext *s, AArch64Insn insn, bool q, + unsigned size, TCGReg rd, TCGReg rn) +{ + tcg_out32(s, insn | q << 30 | (size << 22) + | (rn & 0x1f) << 5 | (rd & 0x1f)); +} + static void tcg_out_insn_3310(TCGContext *s, AArch64Insn insn, TCGReg rd, TCGReg base, TCGType ext, TCGReg regoff) { /* Note the AArch64Insn constants above are for C3.3.12. Adjust. */ tcg_out32(s, insn | I3312_TO_I3310 | regoff << 16 | - 0x4000 | ext << 13 | base << 5 | rd); + 0x4000 | ext << 13 | base << 5 | (rd & 0x1f)); } static void tcg_out_insn_3312(TCGContext *s, AArch64Insn insn, TCGReg rd, TCGReg rn, intptr_t offset) { - tcg_out32(s, insn | (offset & 0x1ff) << 12 | rn << 5 | rd); + tcg_out32(s, insn | (offset & 0x1ff) << 12 | rn << 5 | (rd & 0x1f)); } static void tcg_out_insn_3313(TCGContext *s, AArch64Insn insn, TCGReg rd, TCGReg rn, uintptr_t scaled_uimm) { /* Note the AArch64Insn constants above are for C3.3.12. Adjust. */ - tcg_out32(s, insn | I3312_TO_I3313 | scaled_uimm << 10 | rn << 5 | rd); + tcg_out32(s, insn | I3312_TO_I3313 | scaled_uimm << 10 + | rn << 5 | (rd & 0x1f)); } /* Register to register move using ORR (shifted register with no shift). */ @@ -585,6 +795,22 @@ static void tcg_out_logicali(TCGContext *s, AArch64Insn insn, TCGType ext, tcg_out_insn_3404(s, insn, ext, rd, rn, ext, r, c); } +static void tcg_out_dupi_vec(TCGContext *s, TCGType type, + TCGReg rd, uint64_t v64) +{ + int op, cmode, imm8; + + if (is_fimm(v64, &op, &cmode, &imm8)) { + tcg_out_insn(s, 3606, MOVI, type == TCG_TYPE_V128, rd, op, cmode, imm8); + } else if (type == TCG_TYPE_V128) { + new_pool_l2(s, R_AARCH64_CONDBR19, s->code_ptr, 0, v64, v64); + tcg_out_insn(s, 3305, LDR_v128, 0, rd); + } else { + new_pool_label(s, v64, R_AARCH64_CONDBR19, s->code_ptr, 0); + tcg_out_insn(s, 3305, LDR_v64, 0, rd); + } +} + static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd, tcg_target_long value) { @@ -594,6 +820,22 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd, int s0, s1; AArch64Insn opc; + switch (type) { + case TCG_TYPE_I32: + case TCG_TYPE_I64: + tcg_debug_assert(rd < 32); + break; + + case TCG_TYPE_V64: + case TCG_TYPE_V128: + tcg_debug_assert(rd >= 32); + tcg_out_dupi_vec(s, type, rd, value); + return; + + default: + g_assert_not_reached(); + } + /* For 32-bit values, discard potential garbage in value. For 64-bit values within [2**31, 2**32-1], we can create smaller sequences by interpreting this as a negative 32-bit number, while ensuring that @@ -669,15 +911,13 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd, /* Define something more legible for general use. 
*/ #define tcg_out_ldst_r tcg_out_insn_3310 -static void tcg_out_ldst(TCGContext *s, AArch64Insn insn, - TCGReg rd, TCGReg rn, intptr_t offset) +static void tcg_out_ldst(TCGContext *s, AArch64Insn insn, TCGReg rd, + TCGReg rn, intptr_t offset, int lgsize) { - TCGMemOp size = (uint32_t)insn >> 30; - /* If the offset is naturally aligned and in range, then we can use the scaled uimm12 encoding */ - if (offset >= 0 && !(offset & ((1 << size) - 1))) { - uintptr_t scaled_uimm = offset >> size; + if (offset >= 0 && !(offset & ((1 << lgsize) - 1))) { + uintptr_t scaled_uimm = offset >> lgsize; if (scaled_uimm <= 0xfff) { tcg_out_insn_3313(s, insn, rd, rn, scaled_uimm); return; @@ -695,32 +935,102 @@ static void tcg_out_ldst(TCGContext *s, AArch64Insn insn, tcg_out_ldst_r(s, insn, rd, rn, TCG_TYPE_I64, TCG_REG_TMP); } -static inline void tcg_out_mov(TCGContext *s, - TCGType type, TCGReg ret, TCGReg arg) +static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) { - if (ret != arg) { - tcg_out_movr(s, type, ret, arg); + if (ret == arg) { + return; + } + switch (type) { + case TCG_TYPE_I32: + case TCG_TYPE_I64: + if (ret < 32 && arg < 32) { + tcg_out_movr(s, type, ret, arg); + break; + } else if (ret < 32) { + tcg_out_insn(s, 3605, UMOV, type, ret, arg, 0, 0); + break; + } else if (arg < 32) { + tcg_out_insn(s, 3605, INS, 0, ret, arg, 4 << type, 0); + break; + } + /* FALLTHRU */ + + case TCG_TYPE_V64: + tcg_debug_assert(ret >= 32 && arg >= 32); + tcg_out_insn(s, 3616, ORR, 0, 0, ret, arg, arg); + break; + case TCG_TYPE_V128: + tcg_debug_assert(ret >= 32 && arg >= 32); + tcg_out_insn(s, 3616, ORR, 1, 0, ret, arg, arg); + break; + + default: + g_assert_not_reached(); } } -static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg, - TCGReg arg1, intptr_t arg2) +static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, + TCGReg base, intptr_t ofs) { - tcg_out_ldst(s, type == TCG_TYPE_I32 ? I3312_LDRW : I3312_LDRX, - arg, arg1, arg2); + AArch64Insn insn; + int lgsz; + + switch (type) { + case TCG_TYPE_I32: + insn = (ret < 32 ? I3312_LDRW : I3312_LDRVS); + lgsz = 2; + break; + case TCG_TYPE_I64: + insn = (ret < 32 ? I3312_LDRX : I3312_LDRVD); + lgsz = 3; + break; + case TCG_TYPE_V64: + insn = I3312_LDRVD; + lgsz = 3; + break; + case TCG_TYPE_V128: + insn = I3312_LDRVQ; + lgsz = 4; + break; + default: + g_assert_not_reached(); + } + tcg_out_ldst(s, insn, ret, base, ofs, lgsz); } -static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, - TCGReg arg1, intptr_t arg2) +static void tcg_out_st(TCGContext *s, TCGType type, TCGReg src, + TCGReg base, intptr_t ofs) { - tcg_out_ldst(s, type == TCG_TYPE_I32 ? I3312_STRW : I3312_STRX, - arg, arg1, arg2); + AArch64Insn insn; + int lgsz; + + switch (type) { + case TCG_TYPE_I32: + insn = (src < 32 ? I3312_STRW : I3312_STRVS); + lgsz = 2; + break; + case TCG_TYPE_I64: + insn = (src < 32 ? 
I3312_STRX : I3312_STRVD); + lgsz = 3; + break; + case TCG_TYPE_V64: + insn = I3312_STRVD; + lgsz = 3; + break; + case TCG_TYPE_V128: + insn = I3312_STRVQ; + lgsz = 4; + break; + default: + g_assert_not_reached(); + } + tcg_out_ldst(s, insn, src, base, ofs, lgsz); } static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, TCGReg base, intptr_t ofs) { - if (val == 0) { + if (type <= TCG_TYPE_I64 && val == 0) { tcg_out_st(s, type, TCG_REG_XZR, base, ofs); return true; } @@ -1210,14 +1520,15 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, TCGMemOp opc, /* Merge "low bits" from tlb offset, load the tlb comparator into X0. X0 = load [X2 + (tlb_offset & 0x000fff)] */ tcg_out_ldst(s, TARGET_LONG_BITS == 32 ? I3312_LDRW : I3312_LDRX, - TCG_REG_X0, TCG_REG_X2, tlb_offset & 0xfff); + TCG_REG_X0, TCG_REG_X2, tlb_offset & 0xfff, + TARGET_LONG_BITS == 32 ? 2 : 3); /* Load the tlb addend. Do that early to avoid stalling. X1 = load [X2 + (tlb_offset & 0xfff) + offsetof(addend)] */ tcg_out_ldst(s, I3312_LDRX, TCG_REG_X1, TCG_REG_X2, (tlb_offset & 0xfff) + (offsetof(CPUTLBEntry, addend)) - (is_read ? offsetof(CPUTLBEntry, addr_read) - : offsetof(CPUTLBEntry, addr_write))); + : offsetof(CPUTLBEntry, addr_write)), 3); /* Perform the address comparison. */ tcg_out_cmp(s, (TARGET_LONG_BITS == 64), TCG_REG_X0, TCG_REG_X3, 0); @@ -1435,49 +1746,49 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, case INDEX_op_ld8u_i32: case INDEX_op_ld8u_i64: - tcg_out_ldst(s, I3312_LDRB, a0, a1, a2); + tcg_out_ldst(s, I3312_LDRB, a0, a1, a2, 0); break; case INDEX_op_ld8s_i32: - tcg_out_ldst(s, I3312_LDRSBW, a0, a1, a2); + tcg_out_ldst(s, I3312_LDRSBW, a0, a1, a2, 0); break; case INDEX_op_ld8s_i64: - tcg_out_ldst(s, I3312_LDRSBX, a0, a1, a2); + tcg_out_ldst(s, I3312_LDRSBX, a0, a1, a2, 0); break; case INDEX_op_ld16u_i32: case INDEX_op_ld16u_i64: - tcg_out_ldst(s, I3312_LDRH, a0, a1, a2); + tcg_out_ldst(s, I3312_LDRH, a0, a1, a2, 1); break; case INDEX_op_ld16s_i32: - tcg_out_ldst(s, I3312_LDRSHW, a0, a1, a2); + tcg_out_ldst(s, I3312_LDRSHW, a0, a1, a2, 1); break; case INDEX_op_ld16s_i64: - tcg_out_ldst(s, I3312_LDRSHX, a0, a1, a2); + tcg_out_ldst(s, I3312_LDRSHX, a0, a1, a2, 1); break; case INDEX_op_ld_i32: case INDEX_op_ld32u_i64: - tcg_out_ldst(s, I3312_LDRW, a0, a1, a2); + tcg_out_ldst(s, I3312_LDRW, a0, a1, a2, 2); break; case INDEX_op_ld32s_i64: - tcg_out_ldst(s, I3312_LDRSWX, a0, a1, a2); + tcg_out_ldst(s, I3312_LDRSWX, a0, a1, a2, 2); break; case INDEX_op_ld_i64: - tcg_out_ldst(s, I3312_LDRX, a0, a1, a2); + tcg_out_ldst(s, I3312_LDRX, a0, a1, a2, 3); break; case INDEX_op_st8_i32: case INDEX_op_st8_i64: - tcg_out_ldst(s, I3312_STRB, REG0(0), a1, a2); + tcg_out_ldst(s, I3312_STRB, REG0(0), a1, a2, 0); break; case INDEX_op_st16_i32: case INDEX_op_st16_i64: - tcg_out_ldst(s, I3312_STRH, REG0(0), a1, a2); + tcg_out_ldst(s, I3312_STRH, REG0(0), a1, a2, 1); break; case INDEX_op_st_i32: case INDEX_op_st32_i64: - tcg_out_ldst(s, I3312_STRW, REG0(0), a1, a2); + tcg_out_ldst(s, I3312_STRW, REG0(0), a1, a2, 2); break; case INDEX_op_st_i64: - tcg_out_ldst(s, I3312_STRX, REG0(0), a1, a2); + tcg_out_ldst(s, I3312_STRX, REG0(0), a1, a2, 3); break; case INDEX_op_add_i32: @@ -1776,25 +2087,176 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */ case INDEX_op_mov_i64: + case INDEX_op_mov_vec: case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. 
*/ case INDEX_op_movi_i64: + case INDEX_op_dupi_vec: case INDEX_op_call: /* Always emitted via tcg_out_call. */ default: - tcg_abort(); + g_assert_not_reached(); } #undef REG0 } +static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, + unsigned vecl, unsigned vece, + const TCGArg *args, const int *const_args) +{ + static const AArch64Insn cmp_insn[16] = { + [TCG_COND_EQ] = I3616_CMEQ, + [TCG_COND_GT] = I3616_CMGT, + [TCG_COND_GE] = I3616_CMGE, + [TCG_COND_GTU] = I3616_CMHI, + [TCG_COND_GEU] = I3616_CMHS, + }; + static const AArch64Insn cmp0_insn[16] = { + [TCG_COND_EQ] = I3617_CMEQ0, + [TCG_COND_GT] = I3617_CMGT0, + [TCG_COND_GE] = I3617_CMGE0, + [TCG_COND_LT] = I3617_CMLT0, + [TCG_COND_LE] = I3617_CMLE0, + }; + + TCGType type = vecl + TCG_TYPE_V64; + unsigned is_q = vecl; + TCGArg a0, a1, a2; + + a0 = args[0]; + a1 = args[1]; + a2 = args[2]; + + switch (opc) { + case INDEX_op_ld_vec: + tcg_out_ld(s, type, a0, a1, a2); + break; + case INDEX_op_st_vec: + tcg_out_st(s, type, a0, a1, a2); + break; + case INDEX_op_add_vec: + tcg_out_insn(s, 3616, ADD, is_q, vece, a0, a1, a2); + break; + case INDEX_op_sub_vec: + tcg_out_insn(s, 3616, SUB, is_q, vece, a0, a1, a2); + break; + case INDEX_op_mul_vec: + tcg_out_insn(s, 3616, MUL, is_q, vece, a0, a1, a2); + break; + case INDEX_op_neg_vec: + tcg_out_insn(s, 3617, NEG, is_q, vece, a0, a1); + break; + case INDEX_op_and_vec: + tcg_out_insn(s, 3616, AND, is_q, 0, a0, a1, a2); + break; + case INDEX_op_or_vec: + tcg_out_insn(s, 3616, ORR, is_q, 0, a0, a1, a2); + break; + case INDEX_op_xor_vec: + tcg_out_insn(s, 3616, EOR, is_q, 0, a0, a1, a2); + break; + case INDEX_op_andc_vec: + tcg_out_insn(s, 3616, BIC, is_q, 0, a0, a1, a2); + break; + case INDEX_op_orc_vec: + tcg_out_insn(s, 3616, ORN, is_q, 0, a0, a1, a2); + break; + case INDEX_op_not_vec: + tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a1); + break; + case INDEX_op_dup_vec: + tcg_out_insn(s, 3605, DUP, is_q, a0, a1, 1 << vece, 0); + break; + case INDEX_op_shli_vec: + tcg_out_insn(s, 3614, SHL, is_q, a0, a1, a2 + (8 << vece)); + break; + case INDEX_op_shri_vec: + tcg_out_insn(s, 3614, USHR, is_q, a0, a1, (16 << vece) - a2); + break; + case INDEX_op_sari_vec: + tcg_out_insn(s, 3614, SSHR, is_q, a0, a1, (16 << vece) - a2); + break; + case INDEX_op_cmp_vec: + { + TCGCond cond = args[3]; + AArch64Insn insn; + + if (cond == TCG_COND_NE) { + if (const_args[2]) { + tcg_out_insn(s, 3616, CMTST, is_q, vece, a0, a1, a1); + } else { + tcg_out_insn(s, 3616, CMEQ, is_q, vece, a0, a1, a2); + tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a0); + } + } else { + if (const_args[2]) { + insn = cmp0_insn[cond]; + if (insn) { + tcg_out_insn_3617(s, insn, is_q, vece, a0, a1); + break; + } + tcg_out_dupi_vec(s, type, TCG_VEC_TMP, 0); + a2 = TCG_VEC_TMP; + } + insn = cmp_insn[cond]; + if (insn == 0) { + TCGArg t; + t = a1, a1 = a2, a2 = t; + cond = tcg_swap_cond(cond); + insn = cmp_insn[cond]; + tcg_debug_assert(insn != 0); + } + tcg_out_insn_3616(s, insn, is_q, vece, a0, a1, a2); + } + } + break; + default: + g_assert_not_reached(); + } +} + +int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) +{ + switch (opc) { + case INDEX_op_add_vec: + case INDEX_op_sub_vec: + case INDEX_op_mul_vec: + case INDEX_op_and_vec: + case INDEX_op_or_vec: + case INDEX_op_xor_vec: + case INDEX_op_andc_vec: + case INDEX_op_orc_vec: + case INDEX_op_neg_vec: + case INDEX_op_not_vec: + case INDEX_op_cmp_vec: + case INDEX_op_shli_vec: + case INDEX_op_shri_vec: + case INDEX_op_sari_vec: + return 1; + + default: + return 0; + } +} + +void 
tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, + TCGArg a0, ...) +{ +} + static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op) { static const TCGTargetOpDef r = { .args_ct_str = { "r" } }; static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } }; + static const TCGTargetOpDef w_w = { .args_ct_str = { "w", "w" } }; + static const TCGTargetOpDef w_r = { .args_ct_str = { "w", "r" } }; + static const TCGTargetOpDef w_wr = { .args_ct_str = { "w", "wr" } }; static const TCGTargetOpDef r_l = { .args_ct_str = { "r", "l" } }; static const TCGTargetOpDef r_rA = { .args_ct_str = { "r", "rA" } }; static const TCGTargetOpDef rZ_r = { .args_ct_str = { "rZ", "r" } }; static const TCGTargetOpDef lZ_l = { .args_ct_str = { "lZ", "l" } }; static const TCGTargetOpDef r_r_r = { .args_ct_str = { "r", "r", "r" } }; + static const TCGTargetOpDef w_w_w = { .args_ct_str = { "w", "w", "w" } }; + static const TCGTargetOpDef w_w_wZ = { .args_ct_str = { "w", "w", "wZ" } }; static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } }; static const TCGTargetOpDef r_r_rA = { .args_ct_str = { "r", "r", "rA" } }; static const TCGTargetOpDef r_r_rL = { .args_ct_str = { "r", "r", "rL" } }; @@ -1938,6 +2400,29 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op) case INDEX_op_sub2_i64: return &add2; + case INDEX_op_add_vec: + case INDEX_op_sub_vec: + case INDEX_op_mul_vec: + case INDEX_op_and_vec: + case INDEX_op_or_vec: + case INDEX_op_xor_vec: + case INDEX_op_andc_vec: + case INDEX_op_orc_vec: + return &w_w_w; + case INDEX_op_not_vec: + case INDEX_op_neg_vec: + case INDEX_op_shli_vec: + case INDEX_op_shri_vec: + case INDEX_op_sari_vec: + return &w_w; + case INDEX_op_ld_vec: + case INDEX_op_st_vec: + return &w_r; + case INDEX_op_dup_vec: + return &w_wr; + case INDEX_op_cmp_vec: + return &w_w_wZ; + default: return NULL; } @@ -1947,8 +2432,10 @@ static void tcg_target_init(TCGContext *s) { tcg_target_available_regs[TCG_TYPE_I32] = 0xffffffffu; tcg_target_available_regs[TCG_TYPE_I64] = 0xffffffffu; + tcg_target_available_regs[TCG_TYPE_V64] = 0xffffffff00000000ull; + tcg_target_available_regs[TCG_TYPE_V128] = 0xffffffff00000000ull; - tcg_target_call_clobber_regs = 0xfffffffu; + tcg_target_call_clobber_regs = -1ull; tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X19); tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X20); tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X21); @@ -1960,12 +2447,21 @@ static void tcg_target_init(TCGContext *s) tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X27); tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X28); tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X29); + tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V8); + tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V9); + tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V10); + tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V11); + tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V12); + tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V13); + tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V14); + tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V15); s->reserved_regs = 0; tcg_regset_set_reg(s->reserved_regs, TCG_REG_SP); tcg_regset_set_reg(s->reserved_regs, TCG_REG_FP); tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP); tcg_regset_set_reg(s->reserved_regs, TCG_REG_X18); /* platform register */ + tcg_regset_set_reg(s->reserved_regs, 
TCG_VEC_TMP); } /* Saving pairs: (X19, X20) .. (X27, X28), (X29(fp), X30(lr)). */ diff --git a/tcg/aarch64/tcg-target.opc.h b/tcg/aarch64/tcg-target.opc.h new file mode 100644 index 0000000000..4816a6c3d4 --- /dev/null +++ b/tcg/aarch64/tcg-target.opc.h @@ -0,0 +1,3 @@ +/* Target-specific opcodes for host vector expansion. These will be + emitted by tcg_expand_vec_op. For those familiar with GCC internals, + consider these to be UNSPEC with names. */ diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h index b89dababf4..9fdf37f23c 100644 --- a/tcg/i386/tcg-target.h +++ b/tcg/i386/tcg-target.h @@ -30,10 +30,10 @@ #ifdef __x86_64__ # define TCG_TARGET_REG_BITS 64 -# define TCG_TARGET_NB_REGS 16 +# define TCG_TARGET_NB_REGS 32 #else # define TCG_TARGET_REG_BITS 32 -# define TCG_TARGET_NB_REGS 8 +# define TCG_TARGET_NB_REGS 24 #endif typedef enum { @@ -56,6 +56,26 @@ typedef enum { TCG_REG_R13, TCG_REG_R14, TCG_REG_R15, + + TCG_REG_XMM0, + TCG_REG_XMM1, + TCG_REG_XMM2, + TCG_REG_XMM3, + TCG_REG_XMM4, + TCG_REG_XMM5, + TCG_REG_XMM6, + TCG_REG_XMM7, + + /* 64-bit registers; likewise always define. */ + TCG_REG_XMM8, + TCG_REG_XMM9, + TCG_REG_XMM10, + TCG_REG_XMM11, + TCG_REG_XMM12, + TCG_REG_XMM13, + TCG_REG_XMM14, + TCG_REG_XMM15, + TCG_REG_RAX = TCG_REG_EAX, TCG_REG_RCX = TCG_REG_ECX, TCG_REG_RDX = TCG_REG_EDX, @@ -77,6 +97,8 @@ typedef enum { extern bool have_bmi1; extern bool have_popcnt; +extern bool have_avx1; +extern bool have_avx2; /* optional instructions */ #define TCG_TARGET_HAS_div2_i32 1 @@ -146,6 +168,21 @@ extern bool have_popcnt; #define TCG_TARGET_HAS_mulsh_i64 0 #endif +/* We do not support older SSE systems, only beginning with AVX1. */ +#define TCG_TARGET_HAS_v64 have_avx1 +#define TCG_TARGET_HAS_v128 have_avx1 +#define TCG_TARGET_HAS_v256 have_avx2 + +#define TCG_TARGET_HAS_andc_vec 1 +#define TCG_TARGET_HAS_orc_vec 0 +#define TCG_TARGET_HAS_not_vec 0 +#define TCG_TARGET_HAS_neg_vec 0 +#define TCG_TARGET_HAS_shi_vec 1 +#define TCG_TARGET_HAS_shs_vec 0 +#define TCG_TARGET_HAS_shv_vec 0 +#define TCG_TARGET_HAS_cmp_vec 1 +#define TCG_TARGET_HAS_mul_vec 1 + #define TCG_TARGET_deposit_i32_valid(ofs, len) \ (((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \ ((ofs) == 0 && (len) == 16)) diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c index 63d27f10e7..fc05909d1d 100644 --- a/tcg/i386/tcg-target.inc.c +++ b/tcg/i386/tcg-target.inc.c @@ -28,10 +28,15 @@ static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { #if TCG_TARGET_REG_BITS == 64 "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi", - "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", #else "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi", #endif + "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", +#if TCG_TARGET_REG_BITS == 64 + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", +#endif }; #endif @@ -61,6 +66,28 @@ static const int tcg_target_reg_alloc_order[] = { TCG_REG_EDX, TCG_REG_EAX, #endif + TCG_REG_XMM0, + TCG_REG_XMM1, + TCG_REG_XMM2, + TCG_REG_XMM3, + TCG_REG_XMM4, + TCG_REG_XMM5, +#ifndef _WIN64 + /* The Win64 ABI has xmm6-xmm15 as caller-saves, and we do not save + any of them. Therefore only allow xmm0-xmm5 to be allocated. 
*/ + TCG_REG_XMM6, + TCG_REG_XMM7, +#if TCG_TARGET_REG_BITS == 64 + TCG_REG_XMM8, + TCG_REG_XMM9, + TCG_REG_XMM10, + TCG_REG_XMM11, + TCG_REG_XMM12, + TCG_REG_XMM13, + TCG_REG_XMM14, + TCG_REG_XMM15, +#endif +#endif }; static const int tcg_target_call_iarg_regs[] = { @@ -94,7 +121,7 @@ static const int tcg_target_call_oarg_regs[] = { #define TCG_CT_CONST_I32 0x400 #define TCG_CT_CONST_WSZ 0x800 -/* Registers used with L constraint, which are the first argument +/* Registers used with L constraint, which are the first argument registers on x86_64, and two random call clobbered registers on i386. */ #if TCG_TARGET_REG_BITS == 64 @@ -125,6 +152,8 @@ static bool have_cmov; it there. Therefore we always define the variable. */ bool have_bmi1; bool have_popcnt; +bool have_avx1; +bool have_avx2; #ifdef CONFIG_CPUID_H static bool have_movbe; @@ -148,6 +177,8 @@ static void patch_reloc(tcg_insn_unit *code_ptr, int type, if (value != (int32_t)value) { tcg_abort(); } + /* FALLTHRU */ + case R_386_32: tcg_patch32(code_ptr, value); break; case R_386_PC8: @@ -162,6 +193,14 @@ static void patch_reloc(tcg_insn_unit *code_ptr, int type, } } +#if TCG_TARGET_REG_BITS == 64 +#define ALL_GENERAL_REGS 0x0000ffffu +#define ALL_VECTOR_REGS 0xffff0000u +#else +#define ALL_GENERAL_REGS 0x000000ffu +#define ALL_VECTOR_REGS 0x00ff0000u +#endif + /* parse target specific constraints */ static const char *target_parse_constraint(TCGArgConstraint *ct, const char *ct_str, TCGType type) @@ -192,21 +231,29 @@ static const char *target_parse_constraint(TCGArgConstraint *ct, tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI); break; case 'q': + /* A register that can be used as a byte operand. */ ct->ct |= TCG_CT_REG; ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf; break; case 'Q': + /* A register with an addressable second byte (e.g. %ah). */ ct->ct |= TCG_CT_REG; ct->u.regs = 0xf; break; case 'r': + /* A general register. */ ct->ct |= TCG_CT_REG; - ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff; + ct->u.regs |= ALL_GENERAL_REGS; break; case 'W': /* With TZCNT/LZCNT, we can have operand-size as an input. */ ct->ct |= TCG_CT_CONST_WSZ; break; + case 'x': + /* A vector register. */ + ct->ct |= TCG_CT_REG; + ct->u.regs |= ALL_VECTOR_REGS; + break; /* qemu_ld/st address constraint */ case 'L': @@ -277,14 +324,17 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type, # define P_REXB_RM 0 # define P_GS 0 #endif -#define P_SIMDF3 0x10000 /* 0xf3 opcode prefix */ -#define P_SIMDF2 0x20000 /* 0xf2 opcode prefix */ +#define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */ +#define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */ +#define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */ +#define P_VEXL 0x80000 /* Set VEX.L = 1 */ #define OPC_ARITH_EvIz (0x81) #define OPC_ARITH_EvIb (0x83) #define OPC_ARITH_GvEv (0x03) /* ... 
plus (ARITH_FOO << 3) */ #define OPC_ANDN (0xf2 | P_EXT38) #define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3)) +#define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16) #define OPC_BSF (0xbc | P_EXT) #define OPC_BSR (0xbd | P_EXT) #define OPC_BSWAP (0xc8 | P_EXT) @@ -310,11 +360,68 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type, #define OPC_MOVL_Iv (0xb8) #define OPC_MOVBE_GyMy (0xf0 | P_EXT38) #define OPC_MOVBE_MyGy (0xf1 | P_EXT38) +#define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16) +#define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16) +#define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2) +#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16) +#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16) +#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3) +#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3) +#define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3) +#define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16) #define OPC_MOVSBL (0xbe | P_EXT) #define OPC_MOVSWL (0xbf | P_EXT) #define OPC_MOVSLQ (0x63 | P_REXW) #define OPC_MOVZBL (0xb6 | P_EXT) #define OPC_MOVZWL (0xb7 | P_EXT) +#define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16) +#define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16) +#define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16) +#define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16) +#define OPC_PADDB (0xfc | P_EXT | P_DATA16) +#define OPC_PADDW (0xfd | P_EXT | P_DATA16) +#define OPC_PADDD (0xfe | P_EXT | P_DATA16) +#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16) +#define OPC_PAND (0xdb | P_EXT | P_DATA16) +#define OPC_PANDN (0xdf | P_EXT | P_DATA16) +#define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16) +#define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16) +#define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16) +#define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16) +#define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16) +#define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16) +#define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16) +#define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16) +#define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16) +#define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16) +#define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16) +#define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16) +#define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16) +#define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16) +#define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16) +#define OPC_PMULLW (0xd5 | P_EXT | P_DATA16) +#define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16) +#define OPC_POR (0xeb | P_EXT | P_DATA16) +#define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16) +#define OPC_PSHUFD (0x70 | P_EXT | P_DATA16) +#define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2) +#define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3) +#define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */ +#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */ +#define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */ +#define OPC_PSUBB (0xf8 | P_EXT | P_DATA16) +#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16) +#define OPC_PSUBD (0xfa | P_EXT | P_DATA16) +#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16) +#define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16) +#define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16) +#define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16) +#define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16) +#define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16) +#define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16) +#define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16) +#define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16) +#define OPC_PXOR (0xef | P_EXT | P_DATA16) #define OPC_POP_r32 (0x58) #define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3) #define OPC_PUSH_r32 (0x50) @@ -326,14 +433,26 @@ static inline int 
tcg_target_const_match(tcg_target_long val, TCGType type, #define OPC_SHIFT_Ib (0xc1) #define OPC_SHIFT_cl (0xd3) #define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3) +#define OPC_SHUFPS (0xc6 | P_EXT) #define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16) #define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2) #define OPC_TESTL (0x85) #define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3) +#define OPC_UD2 (0x0b | P_EXT) +#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16) +#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16) +#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16) +#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16) +#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16) +#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16) +#define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_REXW) +#define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL) +#define OPC_VZEROUPPER (0x77 | P_EXT) #define OPC_XCHG_ax_r32 (0x90) #define OPC_GRP3_Ev (0xf7) #define OPC_GRP5 (0xff) +#define OPC_GRP14 (0x73 | P_EXT | P_DATA16) /* Group 1 opcode extensions for 0x80-0x83. These are also used as modifiers for OPC_ARITH. */ @@ -439,10 +558,12 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x) tcg_out8(s, (uint8_t)(rex | 0x40)); } - if (opc & (P_EXT | P_EXT38)) { + if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { tcg_out8(s, 0x0f); if (opc & P_EXT38) { tcg_out8(s, 0x38); + } else if (opc & P_EXT3A) { + tcg_out8(s, 0x3a); } } @@ -459,10 +580,12 @@ static void tcg_out_opc(TCGContext *s, int opc) } else if (opc & P_SIMDF2) { tcg_out8(s, 0xf2); } - if (opc & (P_EXT | P_EXT38)) { + if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { tcg_out8(s, 0x0f); if (opc & P_EXT38) { tcg_out8(s, 0x38); + } else if (opc & P_EXT3A) { + tcg_out8(s, 0x3a); } } tcg_out8(s, opc); @@ -479,34 +602,42 @@ static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm) tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); } -static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) +static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v, + int rm, int index) { int tmp; - if ((opc & (P_REXW | P_EXT | P_EXT38)) || (rm & 8)) { + /* Use the two byte form if possible, which cannot encode + VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */ + if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT + && ((rm | index) & 8) == 0) { + /* Two byte VEX prefix. */ + tcg_out8(s, 0xc5); + + tmp = (r & 8 ? 0 : 0x80); /* VEX.R */ + } else { /* Three byte VEX prefix. */ tcg_out8(s, 0xc4); /* VEX.m-mmmm */ - if (opc & P_EXT38) { + if (opc & P_EXT3A) { + tmp = 3; + } else if (opc & P_EXT38) { tmp = 2; } else if (opc & P_EXT) { tmp = 1; } else { - tcg_abort(); + g_assert_not_reached(); } - tmp |= 0x40; /* VEX.X */ - tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */ - tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */ + tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */ + tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */ + tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */ tcg_out8(s, tmp); - tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */ - } else { - /* Two byte VEX prefix. */ - tcg_out8(s, 0xc5); - - tmp = (r & 8 ? 0 : 0x80); /* VEX.R */ + tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */ } + + tmp |= (opc & P_VEXL ? 
0x04 : 0); /* VEX.L */ /* VEX.pp */ if (opc & P_DATA16) { tmp |= 1; /* 0x66 */ @@ -518,6 +649,11 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) tmp |= (~v & 15) << 3; /* VEX.vvvv */ tcg_out8(s, tmp); tcg_out8(s, opc); +} + +static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) +{ + tcg_out_vex_opc(s, opc, r, v, rm, 0); tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); } @@ -526,8 +662,8 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) mode for absolute addresses, ~RM is the size of the immediate operand that will follow the instruction. */ -static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, - int index, int shift, intptr_t offset) +static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index, + int shift, intptr_t offset) { int mod, len; @@ -538,7 +674,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm; intptr_t disp = offset - pc; if (disp == (int32_t)disp) { - tcg_out_opc(s, opc, r, 0, 0); tcg_out8(s, (LOWREGMASK(r) << 3) | 5); tcg_out32(s, disp); return; @@ -548,7 +683,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, use of the MODRM+SIB encoding and is therefore larger than rip-relative addressing. */ if (offset == (int32_t)offset) { - tcg_out_opc(s, opc, r, 0, 0); tcg_out8(s, (LOWREGMASK(r) << 3) | 4); tcg_out8(s, (4 << 3) | 5); tcg_out32(s, offset); @@ -556,10 +690,9 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, } /* ??? The memory isn't directly addressable. */ - tcg_abort(); + g_assert_not_reached(); } else { /* Absolute address. */ - tcg_out_opc(s, opc, r, 0, 0); tcg_out8(s, (r << 3) | 5); tcg_out32(s, offset); return; @@ -582,7 +715,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, that would be used for %esp is the escape to the two byte form. */ if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) { /* Single byte MODRM format. */ - tcg_out_opc(s, opc, r, rm, 0); tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); } else { /* Two byte MODRM+SIB format. */ @@ -596,7 +728,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, tcg_debug_assert(index != TCG_REG_ESP); } - tcg_out_opc(s, opc, r, rm, index); tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4); tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm)); } @@ -608,6 +739,21 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, } } +static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, + int index, int shift, intptr_t offset) +{ + tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index); + tcg_out_sib_offset(s, r, rm, index, shift, offset); +} + +static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v, + int rm, int index, int shift, + intptr_t offset) +{ + tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index); + tcg_out_sib_offset(s, r, rm, index, shift, offset); +} + /* A simplification of the above with no index or shift. 
*/ static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, int rm, intptr_t offset) @@ -615,6 +761,30 @@ static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset); } +static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r, + int v, int rm, intptr_t offset) +{ + tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset); +} + +/* Output an opcode with an expected reference to the constant pool. */ +static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r) +{ + tcg_out_opc(s, opc, r, 0, 0); + /* Absolute for 32-bit, pc-relative for 64-bit. */ + tcg_out8(s, LOWREGMASK(r) << 3 | 5); + tcg_out32(s, 0); +} + +/* Output an opcode with an expected reference to the constant pool. */ +static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r) +{ + tcg_out_vex_opc(s, opc, r, 0, 0, 0); + /* Absolute for 32-bit, pc-relative for 64-bit. */ + tcg_out8(s, LOWREGMASK(r) << 3 | 5); + tcg_out32(s, 0); +} + /* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */ static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src) { @@ -625,12 +795,116 @@ static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src) tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src); } -static inline void tcg_out_mov(TCGContext *s, TCGType type, - TCGReg ret, TCGReg arg) +static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) { - if (arg != ret) { - int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0); - tcg_out_modrm(s, opc, ret, arg); + int rexw = 0; + + if (arg == ret) { + return; + } + switch (type) { + case TCG_TYPE_I64: + rexw = P_REXW; + /* fallthru */ + case TCG_TYPE_I32: + if (ret < 16) { + if (arg < 16) { + tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg); + } else { + tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret); + } + } else { + if (arg < 16) { + tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg); + } else { + tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); + } + } + break; + + case TCG_TYPE_V64: + tcg_debug_assert(ret >= 16 && arg >= 16); + tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); + break; + case TCG_TYPE_V128: + tcg_debug_assert(ret >= 16 && arg >= 16); + tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg); + break; + case TCG_TYPE_V256: + tcg_debug_assert(ret >= 16 && arg >= 16); + tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg); + break; + + default: + g_assert_not_reached(); + } +} + +static void tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece, + TCGReg r, TCGReg a) +{ + if (have_avx2) { + static const int dup_insn[4] = { + OPC_VPBROADCASTB, OPC_VPBROADCASTW, + OPC_VPBROADCASTD, OPC_VPBROADCASTQ, + }; + int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0); + tcg_out_vex_modrm(s, dup_insn[vece] + vex_l, r, 0, a); + } else { + switch (vece) { + case MO_8: + /* ??? With zero in a register, use PSHUFB. */ + tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, 0, a); + a = r; + /* FALLTHRU */ + case MO_16: + tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, 0, a); + a = r; + /* FALLTHRU */ + case MO_32: + tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a); + /* imm8 operand: all output lanes selected from input lane 0. 
*/ + tcg_out8(s, 0); + break; + case MO_64: + tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, 0, a); + break; + default: + g_assert_not_reached(); + } + } +} + +static void tcg_out_dupi_vec(TCGContext *s, TCGType type, + TCGReg ret, tcg_target_long arg) +{ + int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0); + + if (arg == 0) { + tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); + return; + } + if (arg == -1) { + tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret); + return; + } + + if (TCG_TARGET_REG_BITS == 64) { + if (type == TCG_TYPE_V64) { + tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret); + } else if (have_avx2) { + tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret); + } else { + tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret); + } + new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); + } else if (have_avx2) { + tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret); + new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); + } else { + tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy, ret); + new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); + tcg_out_dup_vec(s, type, MO_32, ret, ret); } } @@ -639,6 +913,25 @@ static void tcg_out_movi(TCGContext *s, TCGType type, { tcg_target_long diff; + switch (type) { + case TCG_TYPE_I32: +#if TCG_TARGET_REG_BITS == 64 + case TCG_TYPE_I64: +#endif + if (ret < 16) { + break; + } + /* fallthru */ + case TCG_TYPE_V64: + case TCG_TYPE_V128: + case TCG_TYPE_V256: + tcg_debug_assert(ret >= 16); + tcg_out_dupi_vec(s, type, ret, arg); + return; + default: + g_assert_not_reached(); + } + if (arg == 0) { tgen_arithr(s, ARITH_XOR, ret, ret); return; @@ -702,18 +995,74 @@ static inline void tcg_out_pop(TCGContext *s, int reg) tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0); } -static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, - TCGReg arg1, intptr_t arg2) +static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, + TCGReg arg1, intptr_t arg2) { - int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0); - tcg_out_modrm_offset(s, opc, ret, arg1, arg2); + switch (type) { + case TCG_TYPE_I32: + if (ret < 16) { + tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2); + } else { + tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2); + } + break; + case TCG_TYPE_I64: + if (ret < 16) { + tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2); + break; + } + /* FALLTHRU */ + case TCG_TYPE_V64: + tcg_debug_assert(ret >= 16); + tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2); + break; + case TCG_TYPE_V128: + tcg_debug_assert(ret >= 16); + tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx, ret, 0, arg1, arg2); + break; + case TCG_TYPE_V256: + tcg_debug_assert(ret >= 16); + tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL, + ret, 0, arg1, arg2); + break; + default: + g_assert_not_reached(); + } } -static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, - TCGReg arg1, intptr_t arg2) +static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, + TCGReg arg1, intptr_t arg2) { - int opc = OPC_MOVL_EvGv + (type == TCG_TYPE_I64 ? 
P_REXW : 0); - tcg_out_modrm_offset(s, opc, arg, arg1, arg2); + switch (type) { + case TCG_TYPE_I32: + if (arg < 16) { + tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2); + } else { + tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2); + } + break; + case TCG_TYPE_I64: + if (arg < 16) { + tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2); + break; + } + /* FALLTHRU */ + case TCG_TYPE_V64: + tcg_debug_assert(arg >= 16); + tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2); + break; + case TCG_TYPE_V128: + tcg_debug_assert(arg >= 16); + tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx, arg, 0, arg1, arg2); + break; + case TCG_TYPE_V256: + tcg_debug_assert(arg >= 16); + tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL, + arg, 0, arg1, arg2); + break; + default: + g_assert_not_reached(); + } } static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, @@ -725,6 +1074,8 @@ static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, return false; } rexw = P_REXW; + } else if (type != TCG_TYPE_I32) { + return false; } tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs); tcg_out32(s, val); @@ -2259,8 +2610,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, break; case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */ case INDEX_op_mov_i64: + case INDEX_op_mov_vec: case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */ case INDEX_op_movi_i64: + case INDEX_op_dupi_vec: case INDEX_op_call: /* Always emitted via tcg_out_call. */ default: tcg_abort(); @@ -2269,6 +2622,181 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, #undef OP_32_64 } +static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, + unsigned vecl, unsigned vece, + const TCGArg *args, const int *const_args) +{ + static int const add_insn[4] = { + OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ + }; + static int const sub_insn[4] = { + OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ + }; + static int const mul_insn[4] = { + OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2 + }; + static int const shift_imm_insn[4] = { + OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib + }; + static int const cmpeq_insn[4] = { + OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ + }; + static int const cmpgt_insn[4] = { + OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ + }; + static int const punpckl_insn[4] = { + OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ + }; + static int const punpckh_insn[4] = { + OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ + }; + static int const packss_insn[4] = { + OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2 + }; + static int const packus_insn[4] = { + OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2 + }; + + TCGType type = vecl + TCG_TYPE_V64; + int insn, sub; + TCGArg a0, a1, a2; + + a0 = args[0]; + a1 = args[1]; + a2 = args[2]; + + switch (opc) { + case INDEX_op_add_vec: + insn = add_insn[vece]; + goto gen_simd; + case INDEX_op_sub_vec: + insn = sub_insn[vece]; + goto gen_simd; + case INDEX_op_mul_vec: + insn = mul_insn[vece]; + goto gen_simd; + case INDEX_op_and_vec: + insn = OPC_PAND; + goto gen_simd; + case INDEX_op_or_vec: + insn = OPC_POR; + goto gen_simd; + case INDEX_op_xor_vec: + insn = OPC_PXOR; + goto gen_simd; + case INDEX_op_x86_punpckl_vec: + insn = punpckl_insn[vece]; + goto gen_simd; + case INDEX_op_x86_punpckh_vec: + insn = punpckh_insn[vece]; + goto gen_simd; + case INDEX_op_x86_packss_vec: + insn = packss_insn[vece]; + goto gen_simd; + case INDEX_op_x86_packus_vec: + insn = 
packus_insn[vece]; + goto gen_simd; + gen_simd: + tcg_debug_assert(insn != OPC_UD2); + if (type == TCG_TYPE_V256) { + insn |= P_VEXL; + } + tcg_out_vex_modrm(s, insn, a0, a1, a2); + break; + + case INDEX_op_cmp_vec: + sub = args[3]; + if (sub == TCG_COND_EQ) { + insn = cmpeq_insn[vece]; + } else if (sub == TCG_COND_GT) { + insn = cmpgt_insn[vece]; + } else { + g_assert_not_reached(); + } + goto gen_simd; + + case INDEX_op_andc_vec: + insn = OPC_PANDN; + if (type == TCG_TYPE_V256) { + insn |= P_VEXL; + } + tcg_out_vex_modrm(s, insn, a0, a2, a1); + break; + + case INDEX_op_shli_vec: + sub = 6; + goto gen_shift; + case INDEX_op_shri_vec: + sub = 2; + goto gen_shift; + case INDEX_op_sari_vec: + tcg_debug_assert(vece != MO_64); + sub = 4; + gen_shift: + tcg_debug_assert(vece != MO_8); + insn = shift_imm_insn[vece]; + if (type == TCG_TYPE_V256) { + insn |= P_VEXL; + } + tcg_out_vex_modrm(s, insn, sub, a0, a1); + tcg_out8(s, a2); + break; + + case INDEX_op_ld_vec: + tcg_out_ld(s, type, a0, a1, a2); + break; + case INDEX_op_st_vec: + tcg_out_st(s, type, a0, a1, a2); + break; + case INDEX_op_dup_vec: + tcg_out_dup_vec(s, type, vece, a0, a1); + break; + + case INDEX_op_x86_shufps_vec: + insn = OPC_SHUFPS; + sub = args[3]; + goto gen_simd_imm8; + case INDEX_op_x86_blend_vec: + if (vece == MO_16) { + insn = OPC_PBLENDW; + } else if (vece == MO_32) { + insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS); + } else { + g_assert_not_reached(); + } + sub = args[3]; + goto gen_simd_imm8; + case INDEX_op_x86_vperm2i128_vec: + insn = OPC_VPERM2I128; + sub = args[3]; + goto gen_simd_imm8; + gen_simd_imm8: + if (type == TCG_TYPE_V256) { + insn |= P_VEXL; + } + tcg_out_vex_modrm(s, insn, a0, a1, a2); + tcg_out8(s, sub); + break; + + case INDEX_op_x86_vpblendvb_vec: + insn = OPC_VPBLENDVB; + if (type == TCG_TYPE_V256) { + insn |= P_VEXL; + } + tcg_out_vex_modrm(s, insn, a0, a1, a2); + tcg_out8(s, args[3] << 4); + break; + + case INDEX_op_x86_psrldq_vec: + tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1); + tcg_out8(s, a2); + break; + + default: + g_assert_not_reached(); + } +} + static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op) { static const TCGTargetOpDef r = { .args_ct_str = { "r" } }; @@ -2292,6 +2820,11 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op) = { .args_ct_str = { "r", "r", "L", "L" } }; static const TCGTargetOpDef L_L_L_L = { .args_ct_str = { "L", "L", "L", "L" } }; + static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } }; + static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } }; + static const TCGTargetOpDef x_x_x_x + = { .args_ct_str = { "x", "x", "x", "x" } }; + static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } }; switch (op) { case INDEX_op_goto_ptr: @@ -2493,12 +3026,342 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op) return &s2; } + case INDEX_op_ld_vec: + case INDEX_op_st_vec: + return &x_r; + + case INDEX_op_add_vec: + case INDEX_op_sub_vec: + case INDEX_op_mul_vec: + case INDEX_op_and_vec: + case INDEX_op_or_vec: + case INDEX_op_xor_vec: + case INDEX_op_andc_vec: + case INDEX_op_cmp_vec: + case INDEX_op_x86_shufps_vec: + case INDEX_op_x86_blend_vec: + case INDEX_op_x86_packss_vec: + case INDEX_op_x86_packus_vec: + case INDEX_op_x86_vperm2i128_vec: + case INDEX_op_x86_punpckl_vec: + case INDEX_op_x86_punpckh_vec: + return &x_x_x; + case INDEX_op_dup_vec: + case INDEX_op_shli_vec: + case INDEX_op_shri_vec: + case INDEX_op_sari_vec: + case INDEX_op_x86_psrldq_vec: + return &x_x; + case INDEX_op_x86_vpblendvb_vec: + 
return &x_x_x_x; + default: break; } return NULL; } +int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) +{ + switch (opc) { + case INDEX_op_add_vec: + case INDEX_op_sub_vec: + case INDEX_op_and_vec: + case INDEX_op_or_vec: + case INDEX_op_xor_vec: + case INDEX_op_andc_vec: + return 1; + case INDEX_op_cmp_vec: + return -1; + + case INDEX_op_shli_vec: + case INDEX_op_shri_vec: + /* We must expand the operation for MO_8. */ + return vece == MO_8 ? -1 : 1; + + case INDEX_op_sari_vec: + /* We must expand the operation for MO_8. */ + if (vece == MO_8) { + return -1; + } + /* We can emulate this for MO_64, but it does not pay off + unless we're producing at least 4 values. */ + if (vece == MO_64) { + return type >= TCG_TYPE_V256 ? -1 : 0; + } + return 1; + + case INDEX_op_mul_vec: + if (vece == MO_8) { + /* We can expand the operation for MO_8. */ + return -1; + } + if (vece == MO_64) { + return 0; + } + return 1; + + default: + return 0; + } +} + +void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, + TCGArg a0, ...) +{ + va_list va; + TCGArg a1, a2; + TCGv_vec v0, t1, t2, t3, t4; + + va_start(va, a0); + v0 = temp_tcgv_vec(arg_temp(a0)); + + switch (opc) { + case INDEX_op_shli_vec: + case INDEX_op_shri_vec: + tcg_debug_assert(vece == MO_8); + a1 = va_arg(va, TCGArg); + a2 = va_arg(va, TCGArg); + /* Unpack to W, shift, and repack. Tricky bits: + (1) Use punpck*bw x,x to produce DDCCBBAA, + i.e. duplicate in other half of the 16-bit lane. + (2) For right-shift, add 8 so that the high half of + the lane becomes zero. For left-shift, we must + shift up and down again. + (3) Step 2 leaves high half zero such that PACKUSWB + (pack with unsigned saturation) does not modify + the quantity. */ + t1 = tcg_temp_new_vec(type); + t2 = tcg_temp_new_vec(type); + vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, + tcgv_vec_arg(t1), a1, a1); + vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, + tcgv_vec_arg(t2), a1, a1); + if (opc == INDEX_op_shri_vec) { + vec_gen_3(INDEX_op_shri_vec, type, MO_16, + tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8); + vec_gen_3(INDEX_op_shri_vec, type, MO_16, + tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8); + } else { + vec_gen_3(INDEX_op_shli_vec, type, MO_16, + tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8); + vec_gen_3(INDEX_op_shli_vec, type, MO_16, + tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8); + vec_gen_3(INDEX_op_shri_vec, type, MO_16, + tcgv_vec_arg(t1), tcgv_vec_arg(t1), 8); + vec_gen_3(INDEX_op_shri_vec, type, MO_16, + tcgv_vec_arg(t2), tcgv_vec_arg(t2), 8); + } + vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, + a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2)); + tcg_temp_free_vec(t1); + tcg_temp_free_vec(t2); + break; + + case INDEX_op_sari_vec: + a1 = va_arg(va, TCGArg); + a2 = va_arg(va, TCGArg); + if (vece == MO_8) { + /* Unpack to W, shift, and repack, as above. 
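A worked byte-lane example, assuming a shift count of 3 and an input byte of 0x90 (-112): punpck duplicates it into a 16-bit lane as 0x9090 (-28528); an arithmetic 16-bit shift by 3 + 8 = 11 gives 0xfff2 (-14), which is exactly -112 >> 3; PACKSSWB then narrows this back to the byte 0xf2 without saturating, because the value already fits in 8 signed bits.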
*/ + t1 = tcg_temp_new_vec(type); + t2 = tcg_temp_new_vec(type); + vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, + tcgv_vec_arg(t1), a1, a1); + vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, + tcgv_vec_arg(t2), a1, a1); + vec_gen_3(INDEX_op_sari_vec, type, MO_16, + tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8); + vec_gen_3(INDEX_op_sari_vec, type, MO_16, + tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8); + vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8, + a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2)); + tcg_temp_free_vec(t1); + tcg_temp_free_vec(t2); + break; + } + tcg_debug_assert(vece == MO_64); + /* MO_64: If the shift is <= 32, we can emulate the sign extend by + performing an arithmetic 32-bit shift and overwriting the high + half of the result (note that the ISA says shift of 32 is valid). */ + if (a2 <= 32) { + t1 = tcg_temp_new_vec(type); + vec_gen_3(INDEX_op_sari_vec, type, MO_32, tcgv_vec_arg(t1), a1, a2); + vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2); + vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32, + a0, a0, tcgv_vec_arg(t1), 0xaa); + tcg_temp_free_vec(t1); + break; + } + /* Otherwise we will need to use a compare vs 0 to produce the + sign-extend, shift and merge. */ + t1 = tcg_temp_new_vec(type); + t2 = tcg_const_zeros_vec(type); + vec_gen_4(INDEX_op_cmp_vec, type, MO_64, + tcgv_vec_arg(t1), tcgv_vec_arg(t2), a1, TCG_COND_GT); + tcg_temp_free_vec(t2); + vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2); + vec_gen_3(INDEX_op_shli_vec, type, MO_64, + tcgv_vec_arg(t1), tcgv_vec_arg(t1), 64 - a2); + vec_gen_3(INDEX_op_or_vec, type, MO_64, a0, a0, tcgv_vec_arg(t1)); + tcg_temp_free_vec(t1); + break; + + case INDEX_op_mul_vec: + tcg_debug_assert(vece == MO_8); + a1 = va_arg(va, TCGArg); + a2 = va_arg(va, TCGArg); + switch (type) { + case TCG_TYPE_V64: + t1 = tcg_temp_new_vec(TCG_TYPE_V128); + t2 = tcg_temp_new_vec(TCG_TYPE_V128); + tcg_gen_dup16i_vec(t2, 0); + vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, + tcgv_vec_arg(t1), a1, tcgv_vec_arg(t2)); + vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, + tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2); + tcg_gen_mul_vec(MO_16, t1, t1, t2); + tcg_gen_shri_vec(MO_16, t1, t1, 8); + vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8, + a0, tcgv_vec_arg(t1), tcgv_vec_arg(t1)); + tcg_temp_free_vec(t1); + tcg_temp_free_vec(t2); + break; + + case TCG_TYPE_V128: + t1 = tcg_temp_new_vec(TCG_TYPE_V128); + t2 = tcg_temp_new_vec(TCG_TYPE_V128); + t3 = tcg_temp_new_vec(TCG_TYPE_V128); + t4 = tcg_temp_new_vec(TCG_TYPE_V128); + tcg_gen_dup16i_vec(t4, 0); + vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, + tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4)); + vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, + tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2); + vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8, + tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4)); + vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8, + tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2); + tcg_gen_mul_vec(MO_16, t1, t1, t2); + tcg_gen_mul_vec(MO_16, t3, t3, t4); + tcg_gen_shri_vec(MO_16, t1, t1, 8); + tcg_gen_shri_vec(MO_16, t3, t3, 8); + vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8, + a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3)); + tcg_temp_free_vec(t1); + tcg_temp_free_vec(t2); + tcg_temp_free_vec(t3); + tcg_temp_free_vec(t4); + break; + + case TCG_TYPE_V256: + t1 = tcg_temp_new_vec(TCG_TYPE_V256); + t2 = tcg_temp_new_vec(TCG_TYPE_V256); + t3 = tcg_temp_new_vec(TCG_TYPE_V256); + t4 = tcg_temp_new_vec(TCG_TYPE_V256); + tcg_gen_dup16i_vec(t4, 0); + /* a1: A[0-7] 
... D[0-7]; a2: W[0-7] ... Z[0-7] + t1: extends of B[0-7], D[0-7] + t2: extends of X[0-7], Z[0-7] + t3: extends of A[0-7], C[0-7] + t4: extends of W[0-7], Y[0-7]. */ + vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8, + tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4)); + vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8, + tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2); + vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8, + tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4)); + vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8, + tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2); + /* t1: BX DZ; t2: AW CY. */ + tcg_gen_mul_vec(MO_16, t1, t1, t2); + tcg_gen_mul_vec(MO_16, t3, t3, t4); + tcg_gen_shri_vec(MO_16, t1, t1, 8); + tcg_gen_shri_vec(MO_16, t3, t3, 8); + /* a0: AW BX CY DZ. */ + vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V256, MO_8, + a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3)); + tcg_temp_free_vec(t1); + tcg_temp_free_vec(t2); + tcg_temp_free_vec(t3); + tcg_temp_free_vec(t4); + break; + + default: + g_assert_not_reached(); + } + break; + + case INDEX_op_cmp_vec: + { + enum { + NEED_SWAP = 1, + NEED_INV = 2, + NEED_BIAS = 4 + }; + static const uint8_t fixups[16] = { + [0 ... 15] = -1, + [TCG_COND_EQ] = 0, + [TCG_COND_NE] = NEED_INV, + [TCG_COND_GT] = 0, + [TCG_COND_LT] = NEED_SWAP, + [TCG_COND_LE] = NEED_INV, + [TCG_COND_GE] = NEED_SWAP | NEED_INV, + [TCG_COND_GTU] = NEED_BIAS, + [TCG_COND_LTU] = NEED_BIAS | NEED_SWAP, + [TCG_COND_LEU] = NEED_BIAS | NEED_INV, + [TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV, + }; + + TCGCond cond; + uint8_t fixup; + + a1 = va_arg(va, TCGArg); + a2 = va_arg(va, TCGArg); + cond = va_arg(va, TCGArg); + fixup = fixups[cond & 15]; + tcg_debug_assert(fixup != 0xff); + + if (fixup & NEED_INV) { + cond = tcg_invert_cond(cond); + } + if (fixup & NEED_SWAP) { + TCGArg t; + t = a1, a1 = a2, a2 = t; + cond = tcg_swap_cond(cond); + } + + t1 = t2 = NULL; + if (fixup & NEED_BIAS) { + t1 = tcg_temp_new_vec(type); + t2 = tcg_temp_new_vec(type); + tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1)); + tcg_gen_sub_vec(vece, t1, temp_tcgv_vec(arg_temp(a1)), t2); + tcg_gen_sub_vec(vece, t2, temp_tcgv_vec(arg_temp(a2)), t2); + a1 = tcgv_vec_arg(t1); + a2 = tcgv_vec_arg(t2); + cond = tcg_signed_cond(cond); + } + + tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT); + vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond); + + if (fixup & NEED_BIAS) { + tcg_temp_free_vec(t1); + tcg_temp_free_vec(t2); + } + if (fixup & NEED_INV) { + tcg_gen_not_vec(vece, v0, v0); + } + } + break; + + default: + break; + } + + va_end(va); +} + static const int tcg_target_callee_save_regs[] = { #if TCG_TARGET_REG_BITS == 64 TCG_REG_RBP, @@ -2577,6 +3440,9 @@ static void tcg_target_qemu_prologue(TCGContext *s) tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend); + if (have_avx2) { + tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0); + } for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) { tcg_out_pop(s, tcg_target_callee_save_regs[i]); } @@ -2598,9 +3464,16 @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count) static void tcg_target_init(TCGContext *s) { #ifdef CONFIG_CPUID_H - unsigned a, b, c, d; + unsigned a, b, c, d, b7 = 0; int max = __get_cpuid_max(0, 0); + if (max >= 7) { + /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. 
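For reference, these feature flags live in CPUID leaf 7, sub-leaf 0, EBX: bit 3 is BMI1, bit 5 is AVX2 and bit 8 is BMI2, matching the bit_BMI, bit_AVX2 and bit_BMI2 constants used here; EBX is kept in b7 so that the AVX2 bit can still be tested after the OSXSAVE check below.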
*/ + __cpuid_count(7, 0, a, b7, c, d); + have_bmi1 = (b7 & bit_BMI) != 0; + have_bmi2 = (b7 & bit_BMI2) != 0; + } + if (max >= 1) { __cpuid(1, a, b, c, d); #ifndef have_cmov @@ -2609,17 +3482,22 @@ static void tcg_target_init(TCGContext *s) available, we'll use a small forward branch. */ have_cmov = (d & bit_CMOV) != 0; #endif + /* MOVBE is only available on Intel Atom and Haswell CPUs, so we need to probe for it. */ have_movbe = (c & bit_MOVBE) != 0; have_popcnt = (c & bit_POPCNT) != 0; - } - if (max >= 7) { - /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */ - __cpuid_count(7, 0, a, b, c, d); - have_bmi1 = (b & bit_BMI) != 0; - have_bmi2 = (b & bit_BMI2) != 0; + /* There are a number of things we must check before we can be + sure of not hitting invalid opcode. */ + if (c & bit_OSXSAVE) { + unsigned xcrl, xcrh; + asm ("xgetbv" : "=a" (xcrl), "=d" (xcrh) : "c" (0)); + if ((xcrl & 6) == 6) { + have_avx1 = (c & bit_AVX) != 0; + have_avx2 = (b7 & bit_AVX2) != 0; + } + } } max = __get_cpuid_max(0x8000000, 0); @@ -2630,11 +3508,16 @@ static void tcg_target_init(TCGContext *s) } #endif /* CONFIG_CPUID_H */ + tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS; if (TCG_TARGET_REG_BITS == 64) { - tcg_target_available_regs[TCG_TYPE_I32] = 0xffff; - tcg_target_available_regs[TCG_TYPE_I64] = 0xffff; - } else { - tcg_target_available_regs[TCG_TYPE_I32] = 0xff; + tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS; + } + if (have_avx1) { + tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS; + tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS; + } + if (have_avx2) { + tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS; } tcg_target_call_clobber_regs = 0; diff --git a/tcg/i386/tcg-target.opc.h b/tcg/i386/tcg-target.opc.h new file mode 100644 index 0000000000..e5fa88ba25 --- /dev/null +++ b/tcg/i386/tcg-target.opc.h @@ -0,0 +1,13 @@ +/* Target-specific opcodes for host vector expansion. These will be + emitted by tcg_expand_vec_op. For those familiar with GCC internals, + consider these to be UNSPEC with names. */ + +DEF(x86_shufps_vec, 1, 2, 1, IMPLVEC) +DEF(x86_vpblendvb_vec, 1, 3, 0, IMPLVEC) +DEF(x86_blend_vec, 1, 2, 1, IMPLVEC) +DEF(x86_packss_vec, 1, 2, 0, IMPLVEC) +DEF(x86_packus_vec, 1, 2, 0, IMPLVEC) +DEF(x86_psrldq_vec, 1, 1, 1, IMPLVEC) +DEF(x86_vperm2i128_vec, 1, 2, 1, IMPLVEC) +DEF(x86_punpckl_vec, 1, 2, 0, IMPLVEC) +DEF(x86_punpckh_vec, 1, 2, 0, IMPLVEC) diff --git a/tcg/optimize.c b/tcg/optimize.c index 2cbbeefd53..d4ea67e541 100644 --- a/tcg/optimize.c +++ b/tcg/optimize.c @@ -32,6 +32,11 @@ glue(glue(case INDEX_op_, x), _i32): \ glue(glue(case INDEX_op_, x), _i64) +#define CASE_OP_32_64_VEC(x) \ + glue(glue(case INDEX_op_, x), _i32): \ + glue(glue(case INDEX_op_, x), _i64): \ + glue(glue(case INDEX_op_, x), _vec) + struct tcg_temp_info { bool is_const; TCGTemp *prev_copy; @@ -108,40 +113,6 @@ static void init_arg_info(struct tcg_temp_info *infos, init_ts_info(infos, temps_used, arg_temp(arg)); } -static int op_bits(TCGOpcode op) -{ - const TCGOpDef *def = &tcg_op_defs[op]; - return def->flags & TCG_OPF_64BIT ? 
64 : 32; -} - -static TCGOpcode op_to_mov(TCGOpcode op) -{ - switch (op_bits(op)) { - case 32: - return INDEX_op_mov_i32; - case 64: - return INDEX_op_mov_i64; - default: - fprintf(stderr, "op_to_mov: unexpected return value of " - "function op_bits.\n"); - tcg_abort(); - } -} - -static TCGOpcode op_to_movi(TCGOpcode op) -{ - switch (op_bits(op)) { - case 32: - return INDEX_op_movi_i32; - case 64: - return INDEX_op_movi_i64; - default: - fprintf(stderr, "op_to_movi: unexpected return value of " - "function op_bits.\n"); - tcg_abort(); - } -} - static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts) { TCGTemp *i; @@ -199,11 +170,23 @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2) static void tcg_opt_gen_movi(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg val) { - TCGOpcode new_op = op_to_movi(op->opc); + const TCGOpDef *def; + TCGOpcode new_op; tcg_target_ulong mask; struct tcg_temp_info *di = arg_info(dst); + def = &tcg_op_defs[op->opc]; + if (def->flags & TCG_OPF_VECTOR) { + new_op = INDEX_op_dupi_vec; + } else if (def->flags & TCG_OPF_64BIT) { + new_op = INDEX_op_movi_i64; + } else { + new_op = INDEX_op_movi_i32; + } op->opc = new_op; + /* TCGOP_VECL and TCGOP_VECE remain unchanged. */ + op->args[0] = dst; + op->args[1] = val; reset_temp(dst); di->is_const = true; @@ -214,15 +197,13 @@ static void tcg_opt_gen_movi(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg val) mask |= ~0xffffffffull; } di->mask = mask; - - op->args[0] = dst; - op->args[1] = val; } static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src) { TCGTemp *dst_ts = arg_temp(dst); TCGTemp *src_ts = arg_temp(src); + const TCGOpDef *def; struct tcg_temp_info *di; struct tcg_temp_info *si; tcg_target_ulong mask; @@ -236,9 +217,16 @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src) reset_ts(dst_ts); di = ts_info(dst_ts); si = ts_info(src_ts); - new_op = op_to_mov(op->opc); - + def = &tcg_op_defs[op->opc]; + if (def->flags & TCG_OPF_VECTOR) { + new_op = INDEX_op_mov_vec; + } else if (def->flags & TCG_OPF_64BIT) { + new_op = INDEX_op_mov_i64; + } else { + new_op = INDEX_op_mov_i32; + } op->opc = new_op; + /* TCGOP_VECL and TCGOP_VECE remain unchanged. 
*/ op->args[0] = dst; op->args[1] = src; @@ -417,8 +405,9 @@ static TCGArg do_constant_folding_2(TCGOpcode op, TCGArg x, TCGArg y) static TCGArg do_constant_folding(TCGOpcode op, TCGArg x, TCGArg y) { + const TCGOpDef *def = &tcg_op_defs[op]; TCGArg res = do_constant_folding_2(op, x, y); - if (op_bits(op) == 32) { + if (!(def->flags & TCG_OPF_64BIT)) { res = (int32_t)res; } return res; @@ -508,13 +497,12 @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x, tcg_target_ulong xv = arg_info(x)->val; tcg_target_ulong yv = arg_info(y)->val; if (arg_is_const(x) && arg_is_const(y)) { - switch (op_bits(op)) { - case 32: - return do_constant_folding_cond_32(xv, yv, c); - case 64: + const TCGOpDef *def = &tcg_op_defs[op]; + tcg_debug_assert(!(def->flags & TCG_OPF_VECTOR)); + if (def->flags & TCG_OPF_64BIT) { return do_constant_folding_cond_64(xv, yv, c); - default: - tcg_abort(); + } else { + return do_constant_folding_cond_32(xv, yv, c); } } else if (args_are_copies(x, y)) { return do_constant_folding_cond_eq(c); @@ -653,11 +641,11 @@ void tcg_optimize(TCGContext *s) /* For commutative operations make constant second argument */ switch (opc) { - CASE_OP_32_64(add): - CASE_OP_32_64(mul): - CASE_OP_32_64(and): - CASE_OP_32_64(or): - CASE_OP_32_64(xor): + CASE_OP_32_64_VEC(add): + CASE_OP_32_64_VEC(mul): + CASE_OP_32_64_VEC(and): + CASE_OP_32_64_VEC(or): + CASE_OP_32_64_VEC(xor): CASE_OP_32_64(eqv): CASE_OP_32_64(nand): CASE_OP_32_64(nor): @@ -722,7 +710,7 @@ void tcg_optimize(TCGContext *s) continue; } break; - CASE_OP_32_64(sub): + CASE_OP_32_64_VEC(sub): { TCGOpcode neg_op; bool have_neg; @@ -734,9 +722,12 @@ void tcg_optimize(TCGContext *s) if (opc == INDEX_op_sub_i32) { neg_op = INDEX_op_neg_i32; have_neg = TCG_TARGET_HAS_neg_i32; - } else { + } else if (opc == INDEX_op_sub_i64) { neg_op = INDEX_op_neg_i64; have_neg = TCG_TARGET_HAS_neg_i64; + } else { + neg_op = INDEX_op_neg_vec; + have_neg = TCG_TARGET_HAS_neg_vec; } if (!have_neg) { break; @@ -750,7 +741,7 @@ void tcg_optimize(TCGContext *s) } } break; - CASE_OP_32_64(xor): + CASE_OP_32_64_VEC(xor): CASE_OP_32_64(nand): if (!arg_is_const(op->args[1]) && arg_is_const(op->args[2]) @@ -767,7 +758,7 @@ void tcg_optimize(TCGContext *s) goto try_not; } break; - CASE_OP_32_64(andc): + CASE_OP_32_64_VEC(andc): if (!arg_is_const(op->args[2]) && arg_is_const(op->args[1]) && arg_info(op->args[1])->val == -1) { @@ -775,7 +766,7 @@ void tcg_optimize(TCGContext *s) goto try_not; } break; - CASE_OP_32_64(orc): + CASE_OP_32_64_VEC(orc): CASE_OP_32_64(eqv): if (!arg_is_const(op->args[2]) && arg_is_const(op->args[1]) @@ -789,7 +780,10 @@ void tcg_optimize(TCGContext *s) TCGOpcode not_op; bool have_not; - if (def->flags & TCG_OPF_64BIT) { + if (def->flags & TCG_OPF_VECTOR) { + not_op = INDEX_op_not_vec; + have_not = TCG_TARGET_HAS_not_vec; + } else if (def->flags & TCG_OPF_64BIT) { not_op = INDEX_op_not_i64; have_not = TCG_TARGET_HAS_not_i64; } else { @@ -810,16 +804,16 @@ void tcg_optimize(TCGContext *s) /* Simplify expression for "op r, a, const => mov r, a" cases */ switch (opc) { - CASE_OP_32_64(add): - CASE_OP_32_64(sub): + CASE_OP_32_64_VEC(add): + CASE_OP_32_64_VEC(sub): + CASE_OP_32_64_VEC(or): + CASE_OP_32_64_VEC(xor): + CASE_OP_32_64_VEC(andc): CASE_OP_32_64(shl): CASE_OP_32_64(shr): CASE_OP_32_64(sar): CASE_OP_32_64(rotl): CASE_OP_32_64(rotr): - CASE_OP_32_64(or): - CASE_OP_32_64(xor): - CASE_OP_32_64(andc): if (!arg_is_const(op->args[1]) && arg_is_const(op->args[2]) && arg_info(op->args[2])->val == 0) { @@ -827,8 +821,8 @@ void 
tcg_optimize(TCGContext *s) continue; } break; - CASE_OP_32_64(and): - CASE_OP_32_64(orc): + CASE_OP_32_64_VEC(and): + CASE_OP_32_64_VEC(orc): CASE_OP_32_64(eqv): if (!arg_is_const(op->args[1]) && arg_is_const(op->args[2]) @@ -1042,8 +1036,8 @@ void tcg_optimize(TCGContext *s) /* Simplify expression for "op r, a, 0 => movi r, 0" cases */ switch (opc) { - CASE_OP_32_64(and): - CASE_OP_32_64(mul): + CASE_OP_32_64_VEC(and): + CASE_OP_32_64_VEC(mul): CASE_OP_32_64(muluh): CASE_OP_32_64(mulsh): if (arg_is_const(op->args[2]) @@ -1058,8 +1052,8 @@ void tcg_optimize(TCGContext *s) /* Simplify expression for "op r, a, a => mov r, a" cases */ switch (opc) { - CASE_OP_32_64(or): - CASE_OP_32_64(and): + CASE_OP_32_64_VEC(or): + CASE_OP_32_64_VEC(and): if (args_are_copies(op->args[1], op->args[2])) { tcg_opt_gen_mov(s, op, op->args[0], op->args[1]); continue; @@ -1071,9 +1065,9 @@ void tcg_optimize(TCGContext *s) /* Simplify expression for "op r, a, a => movi r, 0" cases */ switch (opc) { - CASE_OP_32_64(andc): - CASE_OP_32_64(sub): - CASE_OP_32_64(xor): + CASE_OP_32_64_VEC(andc): + CASE_OP_32_64_VEC(sub): + CASE_OP_32_64_VEC(xor): if (args_are_copies(op->args[1], op->args[2])) { tcg_opt_gen_movi(s, op, op->args[0], 0); continue; @@ -1087,13 +1081,23 @@ void tcg_optimize(TCGContext *s) folding. Constants will be substituted to arguments by register allocator where needed and possible. Also detect copies. */ switch (opc) { - CASE_OP_32_64(mov): + CASE_OP_32_64_VEC(mov): tcg_opt_gen_mov(s, op, op->args[0], op->args[1]); break; CASE_OP_32_64(movi): + case INDEX_op_dupi_vec: tcg_opt_gen_movi(s, op, op->args[0], op->args[1]); break; + case INDEX_op_dup_vec: + if (arg_is_const(op->args[1])) { + tmp = arg_info(op->args[1])->val; + tmp = dup_const(TCGOP_VECE(op), tmp); + tcg_opt_gen_movi(s, op, op->args[0], tmp); + continue; + } + break; + CASE_OP_32_64(not): CASE_OP_32_64(neg): CASE_OP_32_64(ext8s): diff --git a/tcg/tcg-gvec-desc.h b/tcg/tcg-gvec-desc.h new file mode 100644 index 0000000000..3b4c2d9c69 --- /dev/null +++ b/tcg/tcg-gvec-desc.h @@ -0,0 +1,49 @@ +/* + * Generic vector operation descriptor + * + * Copyright (c) 2018 Linaro + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* ??? These bit widths are set for ARM SVE, maxing out at 256 byte vectors. */ +#define SIMD_OPRSZ_SHIFT 0 +#define SIMD_OPRSZ_BITS 5 + +#define SIMD_MAXSZ_SHIFT (SIMD_OPRSZ_SHIFT + SIMD_OPRSZ_BITS) +#define SIMD_MAXSZ_BITS 5 + +#define SIMD_DATA_SHIFT (SIMD_MAXSZ_SHIFT + SIMD_MAXSZ_BITS) +#define SIMD_DATA_BITS (32 - SIMD_DATA_SHIFT) + +/* Create a descriptor from components. */ +uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data); + +/* Extract the operation size from a descriptor. */ +static inline intptr_t simd_oprsz(uint32_t desc) +{ + return (extract32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS) + 1) * 8; +} + +/* Extract the max vector size from a descriptor. 
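As a worked example of the encoding, simd_desc(16, 16, 0) stores 16 / 8 - 1 = 1 in both the OPRSZ and MAXSZ fields, producing the descriptor 0x21; simd_oprsz() and simd_maxsz() each recover (1 + 1) * 8 = 16 from it, and simd_data() returns 0.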
*/ +static inline intptr_t simd_maxsz(uint32_t desc) +{ + return (extract32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS) + 1) * 8; +} + +/* Extract the operation-specific data from a descriptor. */ +static inline int32_t simd_data(uint32_t desc) +{ + return sextract32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS); +} diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c new file mode 100644 index 0000000000..bfe44bba81 --- /dev/null +++ b/tcg/tcg-op-gvec.c @@ -0,0 +1,2216 @@ +/* + * Generic vector operation expansion + * + * Copyright (c) 2018 Linaro + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "tcg.h" +#include "tcg-op.h" +#include "tcg-op-gvec.h" +#include "tcg-gvec-desc.h" + +#define MAX_UNROLL 4 + +/* Verify vector size and alignment rules. OFS should be the OR of all + of the operand offsets so that we can check them all at once. */ +static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs) +{ + uint32_t opr_align = oprsz >= 16 ? 15 : 7; + uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7; + tcg_debug_assert(oprsz > 0); + tcg_debug_assert(oprsz <= maxsz); + tcg_debug_assert((oprsz & opr_align) == 0); + tcg_debug_assert((maxsz & max_align) == 0); + tcg_debug_assert((ofs & max_align) == 0); +} + +/* Verify vector overlap rules for two operands. */ +static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s) +{ + tcg_debug_assert(d == a || d + s <= a || a + s <= d); +} + +/* Verify vector overlap rules for three operands. */ +static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s) +{ + check_overlap_2(d, a, s); + check_overlap_2(d, b, s); + check_overlap_2(a, b, s); +} + +/* Verify vector overlap rules for four operands. */ +static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b, + uint32_t c, uint32_t s) +{ + check_overlap_2(d, a, s); + check_overlap_2(d, b, s); + check_overlap_2(d, c, s); + check_overlap_2(a, b, s); + check_overlap_2(a, c, s); + check_overlap_2(b, c, s); +} + +/* Create a descriptor from components. */ +uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data) +{ + uint32_t desc = 0; + + assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS)); + assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS)); + assert(data == sextract32(data, 0, SIMD_DATA_BITS)); + + oprsz = (oprsz / 8) - 1; + maxsz = (maxsz / 8) - 1; + desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz); + desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz); + desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data); + + return desc; +} + +/* Generate a call to a gvec-style helper with two vector operands. 
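As a usage sketch with a hypothetical helper name, a front end could expand a 16-byte unary operation entirely out of line as tcg_gen_gvec_2_ool(dofs, aofs, 16, 16, 0, gen_helper_gvec_foo); the helper then receives pointers to the destination and source slices of env plus the descriptor built by simd_desc(16, 16, 0).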
*/ +void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t maxsz, int32_t data, + gen_helper_gvec_2 *fn) +{ + TCGv_ptr a0, a1; + TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); + + a0 = tcg_temp_new_ptr(); + a1 = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(a0, cpu_env, dofs); + tcg_gen_addi_ptr(a1, cpu_env, aofs); + + fn(a0, a1, desc); + + tcg_temp_free_ptr(a0); + tcg_temp_free_ptr(a1); + tcg_temp_free_i32(desc); +} + +/* Generate a call to a gvec-style helper with two vector operands + and one scalar operand. */ +void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c, + uint32_t oprsz, uint32_t maxsz, int32_t data, + gen_helper_gvec_2i *fn) +{ + TCGv_ptr a0, a1; + TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); + + a0 = tcg_temp_new_ptr(); + a1 = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(a0, cpu_env, dofs); + tcg_gen_addi_ptr(a1, cpu_env, aofs); + + fn(a0, a1, c, desc); + + tcg_temp_free_ptr(a0); + tcg_temp_free_ptr(a1); + tcg_temp_free_i32(desc); +} + +/* Generate a call to a gvec-style helper with three vector operands. */ +void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t oprsz, uint32_t maxsz, int32_t data, + gen_helper_gvec_3 *fn) +{ + TCGv_ptr a0, a1, a2; + TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); + + a0 = tcg_temp_new_ptr(); + a1 = tcg_temp_new_ptr(); + a2 = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(a0, cpu_env, dofs); + tcg_gen_addi_ptr(a1, cpu_env, aofs); + tcg_gen_addi_ptr(a2, cpu_env, bofs); + + fn(a0, a1, a2, desc); + + tcg_temp_free_ptr(a0); + tcg_temp_free_ptr(a1); + tcg_temp_free_ptr(a2); + tcg_temp_free_i32(desc); +} + +/* Generate a call to a gvec-style helper with four vector operands. */ +void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t cofs, uint32_t oprsz, uint32_t maxsz, + int32_t data, gen_helper_gvec_4 *fn) +{ + TCGv_ptr a0, a1, a2, a3; + TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); + + a0 = tcg_temp_new_ptr(); + a1 = tcg_temp_new_ptr(); + a2 = tcg_temp_new_ptr(); + a3 = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(a0, cpu_env, dofs); + tcg_gen_addi_ptr(a1, cpu_env, aofs); + tcg_gen_addi_ptr(a2, cpu_env, bofs); + tcg_gen_addi_ptr(a3, cpu_env, cofs); + + fn(a0, a1, a2, a3, desc); + + tcg_temp_free_ptr(a0); + tcg_temp_free_ptr(a1); + tcg_temp_free_ptr(a2); + tcg_temp_free_ptr(a3); + tcg_temp_free_i32(desc); +} + +/* Generate a call to a gvec-style helper with five vector operands. */ +void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t cofs, uint32_t xofs, uint32_t oprsz, + uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn) +{ + TCGv_ptr a0, a1, a2, a3, a4; + TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); + + a0 = tcg_temp_new_ptr(); + a1 = tcg_temp_new_ptr(); + a2 = tcg_temp_new_ptr(); + a3 = tcg_temp_new_ptr(); + a4 = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(a0, cpu_env, dofs); + tcg_gen_addi_ptr(a1, cpu_env, aofs); + tcg_gen_addi_ptr(a2, cpu_env, bofs); + tcg_gen_addi_ptr(a3, cpu_env, cofs); + tcg_gen_addi_ptr(a4, cpu_env, xofs); + + fn(a0, a1, a2, a3, a4, desc); + + tcg_temp_free_ptr(a0); + tcg_temp_free_ptr(a1); + tcg_temp_free_ptr(a2); + tcg_temp_free_ptr(a3); + tcg_temp_free_ptr(a4); + tcg_temp_free_i32(desc); +} + +/* Generate a call to a gvec-style helper with three vector operands + and an extra pointer operand. 
*/ +void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs, + TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz, + int32_t data, gen_helper_gvec_2_ptr *fn) +{ + TCGv_ptr a0, a1; + TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); + + a0 = tcg_temp_new_ptr(); + a1 = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(a0, cpu_env, dofs); + tcg_gen_addi_ptr(a1, cpu_env, aofs); + + fn(a0, a1, ptr, desc); + + tcg_temp_free_ptr(a0); + tcg_temp_free_ptr(a1); + tcg_temp_free_i32(desc); +} + +/* Generate a call to a gvec-style helper with three vector operands + and an extra pointer operand. */ +void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs, + TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz, + int32_t data, gen_helper_gvec_3_ptr *fn) +{ + TCGv_ptr a0, a1, a2; + TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); + + a0 = tcg_temp_new_ptr(); + a1 = tcg_temp_new_ptr(); + a2 = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(a0, cpu_env, dofs); + tcg_gen_addi_ptr(a1, cpu_env, aofs); + tcg_gen_addi_ptr(a2, cpu_env, bofs); + + fn(a0, a1, a2, ptr, desc); + + tcg_temp_free_ptr(a0); + tcg_temp_free_ptr(a1); + tcg_temp_free_ptr(a2); + tcg_temp_free_i32(desc); +} + +/* Generate a call to a gvec-style helper with four vector operands + and an extra pointer operand. */ +void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz, + uint32_t maxsz, int32_t data, + gen_helper_gvec_4_ptr *fn) +{ + TCGv_ptr a0, a1, a2, a3; + TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); + + a0 = tcg_temp_new_ptr(); + a1 = tcg_temp_new_ptr(); + a2 = tcg_temp_new_ptr(); + a3 = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(a0, cpu_env, dofs); + tcg_gen_addi_ptr(a1, cpu_env, aofs); + tcg_gen_addi_ptr(a2, cpu_env, bofs); + tcg_gen_addi_ptr(a3, cpu_env, cofs); + + fn(a0, a1, a2, a3, ptr, desc); + + tcg_temp_free_ptr(a0); + tcg_temp_free_ptr(a1); + tcg_temp_free_ptr(a2); + tcg_temp_free_ptr(a3); + tcg_temp_free_i32(desc); +} + +/* Return true if we want to implement something of OPRSZ bytes + in units of LNSZ. This limits the expansion of inline code. */ +static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz) +{ + uint32_t lnct = oprsz / lnsz; + return lnct >= 1 && lnct <= MAX_UNROLL; +} + +static void expand_clr(uint32_t dofs, uint32_t maxsz); + +/* Duplicate C as per VECE. */ +uint64_t (dup_const)(unsigned vece, uint64_t c) +{ + switch (vece) { + case MO_8: + return 0x0101010101010101ull * (uint8_t)c; + case MO_16: + return 0x0001000100010001ull * (uint16_t)c; + case MO_32: + return 0x0000000100000001ull * (uint32_t)c; + case MO_64: + return c; + default: + g_assert_not_reached(); + } +} + +/* Duplicate IN into OUT as per VECE. 
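For example, dup_const(MO_8, 0xab) is 0xabababababababab and dup_const(MO_16, 0x1234) is 0x1234123412341234; this helper and gen_dup_i64 below perform the same replication on run-time TCG values at translation time.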
*/ +static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in) +{ + switch (vece) { + case MO_8: + tcg_gen_ext8u_i32(out, in); + tcg_gen_muli_i32(out, out, 0x01010101); + break; + case MO_16: + tcg_gen_deposit_i32(out, in, in, 16, 16); + break; + case MO_32: + tcg_gen_mov_i32(out, in); + break; + default: + g_assert_not_reached(); + } +} + +static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in) +{ + switch (vece) { + case MO_8: + tcg_gen_ext8u_i64(out, in); + tcg_gen_muli_i64(out, out, 0x0101010101010101ull); + break; + case MO_16: + tcg_gen_ext16u_i64(out, in); + tcg_gen_muli_i64(out, out, 0x0001000100010001ull); + break; + case MO_32: + tcg_gen_deposit_i64(out, in, in, 32, 32); + break; + case MO_64: + tcg_gen_mov_i64(out, in); + break; + default: + g_assert_not_reached(); + } +} + +/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C. + * Only one of IN_32 or IN_64 may be set; + * IN_C is used if IN_32 and IN_64 are unset. + */ +static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz, + uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64, + uint64_t in_c) +{ + TCGType type; + TCGv_i64 t_64; + TCGv_i32 t_32, t_desc; + TCGv_ptr t_ptr; + uint32_t i; + + assert(vece <= (in_32 ? MO_32 : MO_64)); + assert(in_32 == NULL || in_64 == NULL); + + /* If we're storing 0, expand oprsz to maxsz. */ + if (in_32 == NULL && in_64 == NULL) { + in_c = dup_const(vece, in_c); + if (in_c == 0) { + oprsz = maxsz; + } + } + + type = 0; + if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) { + type = TCG_TYPE_V256; + } else if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) { + type = TCG_TYPE_V128; + } else if (TCG_TARGET_HAS_v64 && check_size_impl(oprsz, 8) + /* Prefer integer when 64-bit host and no variable dup. */ + && !(TCG_TARGET_REG_BITS == 64 && in_32 == NULL + && (in_64 == NULL || vece == MO_64))) { + type = TCG_TYPE_V64; + } + + /* Implement inline with a vector type, if possible. */ + if (type != 0) { + TCGv_vec t_vec = tcg_temp_new_vec(type); + + if (in_32) { + tcg_gen_dup_i32_vec(vece, t_vec, in_32); + } else if (in_64) { + tcg_gen_dup_i64_vec(vece, t_vec, in_64); + } else { + switch (vece) { + case MO_8: + tcg_gen_dup8i_vec(t_vec, in_c); + break; + case MO_16: + tcg_gen_dup16i_vec(t_vec, in_c); + break; + case MO_32: + tcg_gen_dup32i_vec(t_vec, in_c); + break; + default: + tcg_gen_dup64i_vec(t_vec, in_c); + break; + } + } + + i = 0; + if (TCG_TARGET_HAS_v256) { + for (; i + 32 <= oprsz; i += 32) { + tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256); + } + } + if (TCG_TARGET_HAS_v128) { + for (; i + 16 <= oprsz; i += 16) { + tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128); + } + } + if (TCG_TARGET_HAS_v64) { + for (; i < oprsz; i += 8) { + tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64); + } + } + tcg_temp_free_vec(t_vec); + goto done; + } + + /* Otherwise, inline with an integer type, unless "large". */ + if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) { + t_64 = NULL; + t_32 = NULL; + + if (in_32) { + /* We are given a 32-bit variable input. For a 64-bit host, + use a 64-bit operation unless the 32-bit operation would + be simple enough. */ + if (TCG_TARGET_REG_BITS == 64 + && (vece != MO_32 || !check_size_impl(oprsz, 4))) { + t_64 = tcg_temp_new_i64(); + tcg_gen_extu_i32_i64(t_64, in_32); + gen_dup_i64(vece, t_64, t_64); + } else { + t_32 = tcg_temp_new_i32(); + gen_dup_i32(vece, t_32, in_32); + } + } else if (in_64) { + /* We are given a 64-bit variable input. 
*/ + t_64 = tcg_temp_new_i64(); + gen_dup_i64(vece, t_64, in_64); + } else { + /* We are given a constant input. */ + /* For 64-bit hosts, use 64-bit constants for "simple" constants + or when we'd need too many 32-bit stores, or when a 64-bit + constant is really required. */ + if (vece == MO_64 + || (TCG_TARGET_REG_BITS == 64 + && (in_c == 0 || in_c == -1 + || !check_size_impl(oprsz, 4)))) { + t_64 = tcg_const_i64(in_c); + } else { + t_32 = tcg_const_i32(in_c); + } + } + + /* Implement inline if we picked an implementation size above. */ + if (t_32) { + for (i = 0; i < oprsz; i += 4) { + tcg_gen_st_i32(t_32, cpu_env, dofs + i); + } + tcg_temp_free_i32(t_32); + goto done; + } + if (t_64) { + for (i = 0; i < oprsz; i += 8) { + tcg_gen_st_i64(t_64, cpu_env, dofs + i); + } + tcg_temp_free_i64(t_64); + goto done; + } + } + + /* Otherwise implement out of line. */ + t_ptr = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(t_ptr, cpu_env, dofs); + t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0)); + + if (vece == MO_64) { + if (in_64) { + gen_helper_gvec_dup64(t_ptr, t_desc, in_64); + } else { + t_64 = tcg_const_i64(in_c); + gen_helper_gvec_dup64(t_ptr, t_desc, t_64); + tcg_temp_free_i64(t_64); + } + } else { + typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32); + static dup_fn * const fns[3] = { + gen_helper_gvec_dup8, + gen_helper_gvec_dup16, + gen_helper_gvec_dup32 + }; + + if (in_32) { + fns[vece](t_ptr, t_desc, in_32); + } else { + t_32 = tcg_temp_new_i32(); + if (in_64) { + tcg_gen_extrl_i64_i32(t_32, in_64); + } else if (vece == MO_8) { + tcg_gen_movi_i32(t_32, in_c & 0xff); + } else if (vece == MO_16) { + tcg_gen_movi_i32(t_32, in_c & 0xffff); + } else { + tcg_gen_movi_i32(t_32, in_c); + } + fns[vece](t_ptr, t_desc, t_32); + tcg_temp_free_i32(t_32); + } + } + + tcg_temp_free_ptr(t_ptr); + tcg_temp_free_i32(t_desc); + return; + + done: + if (oprsz < maxsz) { + expand_clr(dofs + oprsz, maxsz - oprsz); + } +} + +/* Likewise, but with zero. */ +static void expand_clr(uint32_t dofs, uint32_t maxsz) +{ + do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0); +} + +/* Expand OPSZ bytes worth of two-operand operations using i32 elements. */ +static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, + void (*fni)(TCGv_i32, TCGv_i32)) +{ + TCGv_i32 t0 = tcg_temp_new_i32(); + uint32_t i; + + for (i = 0; i < oprsz; i += 4) { + tcg_gen_ld_i32(t0, cpu_env, aofs + i); + fni(t0, t0); + tcg_gen_st_i32(t0, cpu_env, dofs + i); + } + tcg_temp_free_i32(t0); +} + +static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, + int32_t c, bool load_dest, + void (*fni)(TCGv_i32, TCGv_i32, int32_t)) +{ + TCGv_i32 t0 = tcg_temp_new_i32(); + TCGv_i32 t1 = tcg_temp_new_i32(); + uint32_t i; + + for (i = 0; i < oprsz; i += 4) { + tcg_gen_ld_i32(t0, cpu_env, aofs + i); + if (load_dest) { + tcg_gen_ld_i32(t1, cpu_env, dofs + i); + } + fni(t1, t0, c); + tcg_gen_st_i32(t1, cpu_env, dofs + i); + } + tcg_temp_free_i32(t0); + tcg_temp_free_i32(t1); +} + +static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, + TCGv_i32 c, bool scalar_first, + void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32)) +{ + TCGv_i32 t0 = tcg_temp_new_i32(); + TCGv_i32 t1 = tcg_temp_new_i32(); + uint32_t i; + + for (i = 0; i < oprsz; i += 4) { + tcg_gen_ld_i32(t0, cpu_env, aofs + i); + if (scalar_first) { + fni(t1, c, t0); + } else { + fni(t1, t0, c); + } + tcg_gen_st_i32(t1, cpu_env, dofs + i); + } + tcg_temp_free_i32(t0); + tcg_temp_free_i32(t1); +} + +/* Expand OPSZ bytes worth of three-operand operations using i32 elements. 
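With oprsz == 16, for instance, this emits four iterations, each loading the two source elements (plus the destination when load_dest is set), applying fni and storing the result, at offsets 0, 4, 8 and 12 from dofs/aofs/bofs; 16 bytes is also the largest size that the MAX_UNROLL = 4 limit permits for 32-bit elements.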
*/ +static void expand_3_i32(uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, bool load_dest, + void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32)) +{ + TCGv_i32 t0 = tcg_temp_new_i32(); + TCGv_i32 t1 = tcg_temp_new_i32(); + TCGv_i32 t2 = tcg_temp_new_i32(); + uint32_t i; + + for (i = 0; i < oprsz; i += 4) { + tcg_gen_ld_i32(t0, cpu_env, aofs + i); + tcg_gen_ld_i32(t1, cpu_env, bofs + i); + if (load_dest) { + tcg_gen_ld_i32(t2, cpu_env, dofs + i); + } + fni(t2, t0, t1); + tcg_gen_st_i32(t2, cpu_env, dofs + i); + } + tcg_temp_free_i32(t2); + tcg_temp_free_i32(t1); + tcg_temp_free_i32(t0); +} + +/* Expand OPSZ bytes worth of three-operand operations using i32 elements. */ +static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t cofs, uint32_t oprsz, + void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32)) +{ + TCGv_i32 t0 = tcg_temp_new_i32(); + TCGv_i32 t1 = tcg_temp_new_i32(); + TCGv_i32 t2 = tcg_temp_new_i32(); + TCGv_i32 t3 = tcg_temp_new_i32(); + uint32_t i; + + for (i = 0; i < oprsz; i += 4) { + tcg_gen_ld_i32(t1, cpu_env, aofs + i); + tcg_gen_ld_i32(t2, cpu_env, bofs + i); + tcg_gen_ld_i32(t3, cpu_env, cofs + i); + fni(t0, t1, t2, t3); + tcg_gen_st_i32(t0, cpu_env, dofs + i); + } + tcg_temp_free_i32(t3); + tcg_temp_free_i32(t2); + tcg_temp_free_i32(t1); + tcg_temp_free_i32(t0); +} + +/* Expand OPSZ bytes worth of two-operand operations using i64 elements. */ +static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz, + void (*fni)(TCGv_i64, TCGv_i64)) +{ + TCGv_i64 t0 = tcg_temp_new_i64(); + uint32_t i; + + for (i = 0; i < oprsz; i += 8) { + tcg_gen_ld_i64(t0, cpu_env, aofs + i); + fni(t0, t0); + tcg_gen_st_i64(t0, cpu_env, dofs + i); + } + tcg_temp_free_i64(t0); +} + +static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz, + int64_t c, bool load_dest, + void (*fni)(TCGv_i64, TCGv_i64, int64_t)) +{ + TCGv_i64 t0 = tcg_temp_new_i64(); + TCGv_i64 t1 = tcg_temp_new_i64(); + uint32_t i; + + for (i = 0; i < oprsz; i += 8) { + tcg_gen_ld_i64(t0, cpu_env, aofs + i); + if (load_dest) { + tcg_gen_ld_i64(t1, cpu_env, dofs + i); + } + fni(t1, t0, c); + tcg_gen_st_i64(t1, cpu_env, dofs + i); + } + tcg_temp_free_i64(t0); + tcg_temp_free_i64(t1); +} + +static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz, + TCGv_i64 c, bool scalar_first, + void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64)) +{ + TCGv_i64 t0 = tcg_temp_new_i64(); + TCGv_i64 t1 = tcg_temp_new_i64(); + uint32_t i; + + for (i = 0; i < oprsz; i += 8) { + tcg_gen_ld_i64(t0, cpu_env, aofs + i); + if (scalar_first) { + fni(t1, c, t0); + } else { + fni(t1, t0, c); + } + tcg_gen_st_i64(t1, cpu_env, dofs + i); + } + tcg_temp_free_i64(t0); + tcg_temp_free_i64(t1); +} + +/* Expand OPSZ bytes worth of three-operand operations using i64 elements. */ +static void expand_3_i64(uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, bool load_dest, + void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64)) +{ + TCGv_i64 t0 = tcg_temp_new_i64(); + TCGv_i64 t1 = tcg_temp_new_i64(); + TCGv_i64 t2 = tcg_temp_new_i64(); + uint32_t i; + + for (i = 0; i < oprsz; i += 8) { + tcg_gen_ld_i64(t0, cpu_env, aofs + i); + tcg_gen_ld_i64(t1, cpu_env, bofs + i); + if (load_dest) { + tcg_gen_ld_i64(t2, cpu_env, dofs + i); + } + fni(t2, t0, t1); + tcg_gen_st_i64(t2, cpu_env, dofs + i); + } + tcg_temp_free_i64(t2); + tcg_temp_free_i64(t1); + tcg_temp_free_i64(t0); +} + +/* Expand OPSZ bytes worth of three-operand operations using i64 elements. 
*/ +static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t cofs, uint32_t oprsz, + void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64)) +{ + TCGv_i64 t0 = tcg_temp_new_i64(); + TCGv_i64 t1 = tcg_temp_new_i64(); + TCGv_i64 t2 = tcg_temp_new_i64(); + TCGv_i64 t3 = tcg_temp_new_i64(); + uint32_t i; + + for (i = 0; i < oprsz; i += 8) { + tcg_gen_ld_i64(t1, cpu_env, aofs + i); + tcg_gen_ld_i64(t2, cpu_env, bofs + i); + tcg_gen_ld_i64(t3, cpu_env, cofs + i); + fni(t0, t1, t2, t3); + tcg_gen_st_i64(t0, cpu_env, dofs + i); + } + tcg_temp_free_i64(t3); + tcg_temp_free_i64(t2); + tcg_temp_free_i64(t1); + tcg_temp_free_i64(t0); +} + +/* Expand OPSZ bytes worth of two-operand operations using host vectors. */ +static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t tysz, TCGType type, + void (*fni)(unsigned, TCGv_vec, TCGv_vec)) +{ + TCGv_vec t0 = tcg_temp_new_vec(type); + uint32_t i; + + for (i = 0; i < oprsz; i += tysz) { + tcg_gen_ld_vec(t0, cpu_env, aofs + i); + fni(vece, t0, t0); + tcg_gen_st_vec(t0, cpu_env, dofs + i); + } + tcg_temp_free_vec(t0); +} + +/* Expand OPSZ bytes worth of two-vector operands and an immediate operand + using host vectors. */ +static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t tysz, TCGType type, + int64_t c, bool load_dest, + void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t)) +{ + TCGv_vec t0 = tcg_temp_new_vec(type); + TCGv_vec t1 = tcg_temp_new_vec(type); + uint32_t i; + + for (i = 0; i < oprsz; i += tysz) { + tcg_gen_ld_vec(t0, cpu_env, aofs + i); + if (load_dest) { + tcg_gen_ld_vec(t1, cpu_env, dofs + i); + } + fni(vece, t1, t0, c); + tcg_gen_st_vec(t1, cpu_env, dofs + i); + } + tcg_temp_free_vec(t0); + tcg_temp_free_vec(t1); +} + +static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t tysz, TCGType type, + TCGv_vec c, bool scalar_first, + void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec)) +{ + TCGv_vec t0 = tcg_temp_new_vec(type); + TCGv_vec t1 = tcg_temp_new_vec(type); + uint32_t i; + + for (i = 0; i < oprsz; i += tysz) { + tcg_gen_ld_vec(t0, cpu_env, aofs + i); + if (scalar_first) { + fni(vece, t1, c, t0); + } else { + fni(vece, t1, t0, c); + } + tcg_gen_st_vec(t1, cpu_env, dofs + i); + } + tcg_temp_free_vec(t0); + tcg_temp_free_vec(t1); +} + +/* Expand OPSZ bytes worth of three-operand operations using host vectors. */ +static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, + uint32_t tysz, TCGType type, bool load_dest, + void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec)) +{ + TCGv_vec t0 = tcg_temp_new_vec(type); + TCGv_vec t1 = tcg_temp_new_vec(type); + TCGv_vec t2 = tcg_temp_new_vec(type); + uint32_t i; + + for (i = 0; i < oprsz; i += tysz) { + tcg_gen_ld_vec(t0, cpu_env, aofs + i); + tcg_gen_ld_vec(t1, cpu_env, bofs + i); + if (load_dest) { + tcg_gen_ld_vec(t2, cpu_env, dofs + i); + } + fni(vece, t2, t0, t1); + tcg_gen_st_vec(t2, cpu_env, dofs + i); + } + tcg_temp_free_vec(t2); + tcg_temp_free_vec(t1); + tcg_temp_free_vec(t0); +} + +/* Expand OPSZ bytes worth of four-operand operations using host vectors. 
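Here tysz is the host vector width in bytes, so e.g. an oprsz of 32 expanded with TCG_TYPE_V128 runs the loop twice, at offsets 0 and 16, while with TCG_TYPE_V256 a single 32-byte iteration suffices.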
*/ +static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t cofs, uint32_t oprsz, + uint32_t tysz, TCGType type, + void (*fni)(unsigned, TCGv_vec, TCGv_vec, + TCGv_vec, TCGv_vec)) +{ + TCGv_vec t0 = tcg_temp_new_vec(type); + TCGv_vec t1 = tcg_temp_new_vec(type); + TCGv_vec t2 = tcg_temp_new_vec(type); + TCGv_vec t3 = tcg_temp_new_vec(type); + uint32_t i; + + for (i = 0; i < oprsz; i += tysz) { + tcg_gen_ld_vec(t1, cpu_env, aofs + i); + tcg_gen_ld_vec(t2, cpu_env, bofs + i); + tcg_gen_ld_vec(t3, cpu_env, cofs + i); + fni(vece, t0, t1, t2, t3); + tcg_gen_st_vec(t0, cpu_env, dofs + i); + } + tcg_temp_free_vec(t3); + tcg_temp_free_vec(t2); + tcg_temp_free_vec(t1); + tcg_temp_free_vec(t0); +} + +/* Expand a vector two-operand operation. */ +void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g) +{ + check_size_align(oprsz, maxsz, dofs | aofs); + check_overlap_2(dofs, aofs, maxsz); + + /* Recall that ARM SVE allows vector sizes that are not a power of 2. + Expand with successively smaller host vector sizes. The intent is + that e.g. oprsz == 80 would be expanded with 2x32 + 1x16. */ + /* ??? For maxsz > oprsz, the host may be able to use an opr-sized + operation, zeroing the balance of the register. We can then + use a max-sized store to implement the clearing without an extra + store operation. This is true for aarch64 and x86_64 hosts. */ + + if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32) + && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) { + uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32); + expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv); + if (some == oprsz) { + goto done; + } + dofs += some; + aofs += some; + oprsz -= some; + maxsz -= some; + } + + if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16) + && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) { + expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv); + } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64 + && g->fniv && check_size_impl(oprsz, 8) + && (!g->opc + || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) { + expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv); + } else if (g->fni8 && check_size_impl(oprsz, 8)) { + expand_2_i64(dofs, aofs, oprsz, g->fni8); + } else if (g->fni4 && check_size_impl(oprsz, 4)) { + expand_2_i32(dofs, aofs, oprsz, g->fni4); + } else { + assert(g->fno != NULL); + tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno); + return; + } + + done: + if (oprsz < maxsz) { + expand_clr(dofs + oprsz, maxsz - oprsz); + } +} + +/* Expand a vector operation with two vectors and an immediate. */ +void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz, + uint32_t maxsz, int64_t c, const GVecGen2i *g) +{ + check_size_align(oprsz, maxsz, dofs | aofs); + check_overlap_2(dofs, aofs, maxsz); + + /* Recall that ARM SVE allows vector sizes that are not a power of 2. + Expand with successively smaller host vector sizes. The intent is + that e.g. oprsz == 80 would be expanded with 2x32 + 1x16. 
*/ + + if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32) + && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) { + uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32); + expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, + c, g->load_dest, g->fniv); + if (some == oprsz) { + goto done; + } + dofs += some; + aofs += some; + oprsz -= some; + maxsz -= some; + } + + if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16) + && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) { + expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, + c, g->load_dest, g->fniv); + } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64 + && g->fniv && check_size_impl(oprsz, 8) + && (!g->opc + || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) { + expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, + c, g->load_dest, g->fniv); + } else if (g->fni8 && check_size_impl(oprsz, 8)) { + expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8); + } else if (g->fni4 && check_size_impl(oprsz, 4)) { + expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4); + } else { + if (g->fno) { + tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno); + } else { + TCGv_i64 tcg_c = tcg_const_i64(c); + tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz, maxsz, c, g->fnoi); + tcg_temp_free_i64(tcg_c); + } + return; + } + + done: + if (oprsz < maxsz) { + expand_clr(dofs + oprsz, maxsz - oprsz); + } +} + +/* Expand a vector operation with two vectors and a scalar. */ +void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz, + uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g) +{ + TCGType type; + + check_size_align(oprsz, maxsz, dofs | aofs); + check_overlap_2(dofs, aofs, maxsz); + + type = 0; + if (g->fniv) { + if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) { + type = TCG_TYPE_V256; + } else if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) { + type = TCG_TYPE_V128; + } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64 + && check_size_impl(oprsz, 8)) { + type = TCG_TYPE_V64; + } + } + if (type != 0) { + TCGv_vec t_vec = tcg_temp_new_vec(type); + + tcg_gen_dup_i64_vec(g->vece, t_vec, c); + + /* Recall that ARM SVE allows vector sizes that are not a power of 2. + Expand with successively smaller host vector sizes. The intent is + that e.g. oprsz == 80 would be expanded with 2x32 + 1x16. 
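Before the per-chunk loop runs, the scalar operand is broadcast once into a vector temporary with tcg_gen_dup_i64_vec, so each iteration only costs a load, the operation, and a store. A self-contained illustration of that broadcast for a 64-bit element size (the byte buffer stands in for a V256 temporary; nothing here is a QEMU interface):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void)
    {
        uint64_t c = 0x0123456789abcdefull;  /* the scalar operand */
        uint8_t v256[32];                    /* stand-in for a TCG_TYPE_V256 temp */

        for (uint32_t i = 0; i < sizeof(v256); i += 8) {
            memcpy(v256 + i, &c, 8);         /* vece == MO_64: one copy per lane */
        }

        uint64_t lane;
        memcpy(&lane, v256 + 24, 8);
        assert(lane == c);                   /* every lane now holds the scalar */
        return 0;
    }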
*/ + switch (type) { + case TCG_TYPE_V256: + { + uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32); + expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, + t_vec, g->scalar_first, g->fniv); + if (some == oprsz) { + break; + } + dofs += some; + aofs += some; + oprsz -= some; + maxsz -= some; + } + /* fallthru */ + + case TCG_TYPE_V128: + expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, + t_vec, g->scalar_first, g->fniv); + break; + + case TCG_TYPE_V64: + expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, + t_vec, g->scalar_first, g->fniv); + break; + + default: + g_assert_not_reached(); + } + tcg_temp_free_vec(t_vec); + } else if (g->fni8 && check_size_impl(oprsz, 8)) { + TCGv_i64 t64 = tcg_temp_new_i64(); + + gen_dup_i64(g->vece, t64, c); + expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8); + tcg_temp_free_i64(t64); + } else if (g->fni4 && check_size_impl(oprsz, 4)) { + TCGv_i32 t32 = tcg_temp_new_i32(); + + tcg_gen_extrl_i64_i32(t32, c); + gen_dup_i32(g->vece, t32, t32); + expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4); + tcg_temp_free_i32(t32); + } else { + tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno); + return; + } + + if (oprsz < maxsz) { + expand_clr(dofs + oprsz, maxsz - oprsz); + } +} + +/* Expand a vector three-operand operation. */ +void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g) +{ + check_size_align(oprsz, maxsz, dofs | aofs | bofs); + check_overlap_3(dofs, aofs, bofs, maxsz); + + /* Recall that ARM SVE allows vector sizes that are not a power of 2. + Expand with successively smaller host vector sizes. The intent is + that e.g. oprsz == 80 would be expanded with 2x32 + 1x16. */ + + if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32) + && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) { + uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32); + expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, + g->load_dest, g->fniv); + if (some == oprsz) { + goto done; + } + dofs += some; + aofs += some; + bofs += some; + oprsz -= some; + maxsz -= some; + } + + if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16) + && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) { + expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, + g->load_dest, g->fniv); + } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64 + && g->fniv && check_size_impl(oprsz, 8) + && (!g->opc + || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) { + expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, + g->load_dest, g->fniv); + } else if (g->fni8 && check_size_impl(oprsz, 8)) { + expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8); + } else if (g->fni4 && check_size_impl(oprsz, 4)) { + expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4); + } else { + assert(g->fno != NULL); + tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, g->data, g->fno); + } + + done: + if (oprsz < maxsz) { + expand_clr(dofs + oprsz, maxsz - oprsz); + } +} + +/* Expand a vector four-operand operation. */ +void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs, + uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g) +{ + check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs); + check_overlap_4(dofs, aofs, bofs, cofs, maxsz); + + /* Recall that ARM SVE allows vector sizes that are not a power of 2. + Expand with successively smaller host vector sizes. The intent is + that e.g. 
oprsz == 80 would be expanded with 2x32 + 1x16. */ + + if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32) + && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) { + uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32); + expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some, + 32, TCG_TYPE_V256, g->fniv); + if (some == oprsz) { + goto done; + } + dofs += some; + aofs += some; + bofs += some; + cofs += some; + oprsz -= some; + maxsz -= some; + } + + if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16) + && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) { + expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, + 16, TCG_TYPE_V128, g->fniv); + } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64 + && g->fniv && check_size_impl(oprsz, 8) + && (!g->opc + || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) { + expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, + 8, TCG_TYPE_V64, g->fniv); + } else if (g->fni8 && check_size_impl(oprsz, 8)) { + expand_4_i64(dofs, aofs, bofs, cofs, oprsz, g->fni8); + } else if (g->fni4 && check_size_impl(oprsz, 4)) { + expand_4_i32(dofs, aofs, bofs, cofs, oprsz, g->fni4); + } else { + assert(g->fno != NULL); + tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs, + oprsz, maxsz, g->data, g->fno); + return; + } + + done: + if (oprsz < maxsz) { + expand_clr(dofs + oprsz, maxsz - oprsz); + } +} + +/* + * Expand specific vector operations. + */ + +static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b) +{ + tcg_gen_mov_vec(a, b); +} + +void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen2 g = { + .fni8 = tcg_gen_mov_i64, + .fniv = vec_mov2, + .fno = gen_helper_gvec_mov, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + if (dofs != aofs) { + tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g); + } else { + check_size_align(oprsz, maxsz, dofs); + if (oprsz < maxsz) { + expand_clr(dofs + oprsz, maxsz - oprsz); + } + } +} + +void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz, + uint32_t maxsz, TCGv_i32 in) +{ + check_size_align(oprsz, maxsz, dofs); + tcg_debug_assert(vece <= MO_32); + do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0); +} + +void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz, + uint32_t maxsz, TCGv_i64 in) +{ + check_size_align(oprsz, maxsz, dofs); + tcg_debug_assert(vece <= MO_64); + do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0); +} + +void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t maxsz) +{ + if (vece <= MO_32) { + TCGv_i32 in = tcg_temp_new_i32(); + switch (vece) { + case MO_8: + tcg_gen_ld8u_i32(in, cpu_env, aofs); + break; + case MO_16: + tcg_gen_ld16u_i32(in, cpu_env, aofs); + break; + case MO_32: + tcg_gen_ld_i32(in, cpu_env, aofs); + break; + } + tcg_gen_gvec_dup_i32(vece, dofs, oprsz, maxsz, in); + tcg_temp_free_i32(in); + } else if (vece == MO_64) { + TCGv_i64 in = tcg_temp_new_i64(); + tcg_gen_ld_i64(in, cpu_env, aofs); + tcg_gen_gvec_dup_i64(MO_64, dofs, oprsz, maxsz, in); + tcg_temp_free_i64(in); + } else { + /* 128-bit duplicate. */ + /* ??? Dup to 256-bit vector. 
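For a 128-bit element the duplicate below cannot go through a single integer temporary, so it copies the 16-byte source slot across the destination, either as one V128 load replayed by stores or as a pair of i64 loads and stores. A minimal model of the effect (the flat env array and offsets are illustrative; complete overlap of source and destination is the only overlap the API permits):

    #include <stdint.h>
    #include <string.h>

    /* Model only: replicate the 16-byte element at aofs over oprsz bytes. */
    static void dup128_model(uint8_t *env, uint32_t dofs, uint32_t aofs,
                             uint32_t oprsz)
    {
        uint8_t elt[16];

        memcpy(elt, env + aofs, 16);               /* load the element once */
        for (uint32_t i = 0; i < oprsz; i += 16) {
            memcpy(env + dofs + i, elt, 16);       /* store per 16-byte slot */
        }
    }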
*/ + int i; + + tcg_debug_assert(vece == 4); + tcg_debug_assert(oprsz >= 16); + if (TCG_TARGET_HAS_v128) { + TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128); + + tcg_gen_ld_vec(in, cpu_env, aofs); + for (i = 0; i < oprsz; i += 16) { + tcg_gen_st_vec(in, cpu_env, dofs + i); + } + tcg_temp_free_vec(in); + } else { + TCGv_i64 in0 = tcg_temp_new_i64(); + TCGv_i64 in1 = tcg_temp_new_i64(); + + tcg_gen_ld_i64(in0, cpu_env, aofs); + tcg_gen_ld_i64(in1, cpu_env, aofs + 8); + for (i = 0; i < oprsz; i += 16) { + tcg_gen_st_i64(in0, cpu_env, dofs + i); + tcg_gen_st_i64(in1, cpu_env, dofs + i + 8); + } + tcg_temp_free_i64(in0); + tcg_temp_free_i64(in1); + } + } +} + +void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz, + uint32_t maxsz, uint64_t x) +{ + check_size_align(oprsz, maxsz, dofs); + do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x); +} + +void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz, + uint32_t maxsz, uint32_t x) +{ + check_size_align(oprsz, maxsz, dofs); + do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x); +} + +void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz, + uint32_t maxsz, uint16_t x) +{ + check_size_align(oprsz, maxsz, dofs); + do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x); +} + +void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz, + uint32_t maxsz, uint8_t x) +{ + check_size_align(oprsz, maxsz, dofs); + do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x); +} + +void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen2 g = { + .fni8 = tcg_gen_not_i64, + .fniv = tcg_gen_not_vec, + .fno = gen_helper_gvec_not, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g); +} + +/* Perform a vector addition using normal addition and a mask. The mask + should be the sign bit of each lane. This 6-operation form is more + efficient than separate additions when there are 4 or more lanes in + the 64-bit operation. 
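The trick described here is worth a worked example: clearing each lane's sign bit before one 64-bit add stops carries from crossing lane boundaries, and the discarded sign bits are then folded back in with a carry-less xor. The sketch below (ordinary host C, nothing QEMU-specific) checks the identity used by gen_addv_mask against eight independent byte additions:

    #include <assert.h>
    #include <stdint.h>

    /* dup_const(MO_8, 0x80): the sign bit of every byte lane. */
    #define MASK8 0x8080808080808080ull

    static uint64_t addv8_mask(uint64_t a, uint64_t b)
    {
        uint64_t t1 = a & ~MASK8;        /* operands with lane sign bits cleared */
        uint64_t t2 = b & ~MASK8;
        uint64_t t3 = (a ^ b) & MASK8;   /* sign bits combined without carries */
        return (t1 + t2) ^ t3;           /* carries now stop at lane boundaries */
    }

    int main(void)
    {
        uint64_t a = 0x00ff7f80d1c3b5a7ull, b = 0x01019001ff024466ull, r = 0;

        for (int i = 0; i < 8; i++) {    /* reference: per-byte adds */
            uint8_t lane = (uint8_t)(a >> (8 * i)) + (uint8_t)(b >> (8 * i));
            r |= (uint64_t)lane << (8 * i);
        }
        assert(addv8_mask(a, b) == r);
        return 0;
    }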
*/ +static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m) +{ + TCGv_i64 t1 = tcg_temp_new_i64(); + TCGv_i64 t2 = tcg_temp_new_i64(); + TCGv_i64 t3 = tcg_temp_new_i64(); + + tcg_gen_andc_i64(t1, a, m); + tcg_gen_andc_i64(t2, b, m); + tcg_gen_xor_i64(t3, a, b); + tcg_gen_add_i64(d, t1, t2); + tcg_gen_and_i64(t3, t3, m); + tcg_gen_xor_i64(d, d, t3); + + tcg_temp_free_i64(t1); + tcg_temp_free_i64(t2); + tcg_temp_free_i64(t3); +} + +void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); + gen_addv_mask(d, a, b, m); + tcg_temp_free_i64(m); +} + +void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); + gen_addv_mask(d, a, b, m); + tcg_temp_free_i64(m); +} + +void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 t1 = tcg_temp_new_i64(); + TCGv_i64 t2 = tcg_temp_new_i64(); + + tcg_gen_andi_i64(t1, a, ~0xffffffffull); + tcg_gen_add_i64(t2, a, b); + tcg_gen_add_i64(t1, t1, b); + tcg_gen_deposit_i64(d, t1, t2, 0, 32); + + tcg_temp_free_i64(t1); + tcg_temp_free_i64(t2); +} + +void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g[4] = { + { .fni8 = tcg_gen_vec_add8_i64, + .fniv = tcg_gen_add_vec, + .fno = gen_helper_gvec_add8, + .opc = INDEX_op_add_vec, + .vece = MO_8 }, + { .fni8 = tcg_gen_vec_add16_i64, + .fniv = tcg_gen_add_vec, + .fno = gen_helper_gvec_add16, + .opc = INDEX_op_add_vec, + .vece = MO_16 }, + { .fni4 = tcg_gen_add_i32, + .fniv = tcg_gen_add_vec, + .fno = gen_helper_gvec_add32, + .opc = INDEX_op_add_vec, + .vece = MO_32 }, + { .fni8 = tcg_gen_add_i64, + .fniv = tcg_gen_add_vec, + .fno = gen_helper_gvec_add64, + .opc = INDEX_op_add_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); +} + +void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen2s g[4] = { + { .fni8 = tcg_gen_vec_add8_i64, + .fniv = tcg_gen_add_vec, + .fno = gen_helper_gvec_adds8, + .opc = INDEX_op_add_vec, + .vece = MO_8 }, + { .fni8 = tcg_gen_vec_add16_i64, + .fniv = tcg_gen_add_vec, + .fno = gen_helper_gvec_adds16, + .opc = INDEX_op_add_vec, + .vece = MO_16 }, + { .fni4 = tcg_gen_add_i32, + .fniv = tcg_gen_add_vec, + .fno = gen_helper_gvec_adds32, + .opc = INDEX_op_add_vec, + .vece = MO_32 }, + { .fni8 = tcg_gen_add_i64, + .fniv = tcg_gen_add_vec, + .fno = gen_helper_gvec_adds64, + .opc = INDEX_op_add_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); +} + +void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz) +{ + TCGv_i64 tmp = tcg_const_i64(c); + tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz); + tcg_temp_free_i64(tmp); +} + +void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen2s g[4] = { + { .fni8 = tcg_gen_vec_sub8_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_gvec_subs8, + .opc = INDEX_op_sub_vec, + .vece = MO_8 }, + { .fni8 = tcg_gen_vec_sub16_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_gvec_subs16, + .opc = INDEX_op_sub_vec, + .vece = MO_16 }, + { .fni4 = tcg_gen_sub_i32, + 
.fniv = tcg_gen_sub_vec, + .fno = gen_helper_gvec_subs32, + .opc = INDEX_op_sub_vec, + .vece = MO_32 }, + { .fni8 = tcg_gen_sub_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_gvec_subs64, + .opc = INDEX_op_sub_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); +} + +/* Perform a vector subtraction using normal subtraction and a mask. + Compare gen_addv_mask above. */ +static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m) +{ + TCGv_i64 t1 = tcg_temp_new_i64(); + TCGv_i64 t2 = tcg_temp_new_i64(); + TCGv_i64 t3 = tcg_temp_new_i64(); + + tcg_gen_or_i64(t1, a, m); + tcg_gen_andc_i64(t2, b, m); + tcg_gen_eqv_i64(t3, a, b); + tcg_gen_sub_i64(d, t1, t2); + tcg_gen_and_i64(t3, t3, m); + tcg_gen_xor_i64(d, d, t3); + + tcg_temp_free_i64(t1); + tcg_temp_free_i64(t2); + tcg_temp_free_i64(t3); +} + +void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); + gen_subv_mask(d, a, b, m); + tcg_temp_free_i64(m); +} + +void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); + gen_subv_mask(d, a, b, m); + tcg_temp_free_i64(m); +} + +void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 t1 = tcg_temp_new_i64(); + TCGv_i64 t2 = tcg_temp_new_i64(); + + tcg_gen_andi_i64(t1, b, ~0xffffffffull); + tcg_gen_sub_i64(t2, a, b); + tcg_gen_sub_i64(t1, a, t1); + tcg_gen_deposit_i64(d, t1, t2, 0, 32); + + tcg_temp_free_i64(t1); + tcg_temp_free_i64(t2); +} + +void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g[4] = { + { .fni8 = tcg_gen_vec_sub8_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_gvec_sub8, + .opc = INDEX_op_sub_vec, + .vece = MO_8 }, + { .fni8 = tcg_gen_vec_sub16_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_gvec_sub16, + .opc = INDEX_op_sub_vec, + .vece = MO_16 }, + { .fni4 = tcg_gen_sub_i32, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_gvec_sub32, + .opc = INDEX_op_sub_vec, + .vece = MO_32 }, + { .fni8 = tcg_gen_sub_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_gvec_sub64, + .opc = INDEX_op_sub_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); +} + +void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g[4] = { + { .fniv = tcg_gen_mul_vec, + .fno = gen_helper_gvec_mul8, + .opc = INDEX_op_mul_vec, + .vece = MO_8 }, + { .fniv = tcg_gen_mul_vec, + .fno = gen_helper_gvec_mul16, + .opc = INDEX_op_mul_vec, + .vece = MO_16 }, + { .fni4 = tcg_gen_mul_i32, + .fniv = tcg_gen_mul_vec, + .fno = gen_helper_gvec_mul32, + .opc = INDEX_op_mul_vec, + .vece = MO_32 }, + { .fni8 = tcg_gen_mul_i64, + .fniv = tcg_gen_mul_vec, + .fno = gen_helper_gvec_mul64, + .opc = INDEX_op_mul_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); +} + +void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen2s g[4] = { + { .fniv = tcg_gen_mul_vec, + .fno = gen_helper_gvec_muls8, + .opc = INDEX_op_mul_vec, + .vece = MO_8 }, + { .fniv = tcg_gen_mul_vec, 
+ .fno = gen_helper_gvec_muls16, + .opc = INDEX_op_mul_vec, + .vece = MO_16 }, + { .fni4 = tcg_gen_mul_i32, + .fniv = tcg_gen_mul_vec, + .fno = gen_helper_gvec_muls32, + .opc = INDEX_op_mul_vec, + .vece = MO_32 }, + { .fni8 = tcg_gen_mul_i64, + .fniv = tcg_gen_mul_vec, + .fno = gen_helper_gvec_muls64, + .opc = INDEX_op_mul_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); +} + +void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz) +{ + TCGv_i64 tmp = tcg_const_i64(c); + tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz); + tcg_temp_free_i64(tmp); +} + +void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g[4] = { + { .fno = gen_helper_gvec_ssadd8, .vece = MO_8 }, + { .fno = gen_helper_gvec_ssadd16, .vece = MO_16 }, + { .fno = gen_helper_gvec_ssadd32, .vece = MO_32 }, + { .fno = gen_helper_gvec_ssadd64, .vece = MO_64 } + }; + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); +} + +void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g[4] = { + { .fno = gen_helper_gvec_sssub8, .vece = MO_8 }, + { .fno = gen_helper_gvec_sssub16, .vece = MO_16 }, + { .fno = gen_helper_gvec_sssub32, .vece = MO_32 }, + { .fno = gen_helper_gvec_sssub64, .vece = MO_64 } + }; + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); +} + +static void tcg_gen_vec_usadd32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + TCGv_i32 max = tcg_const_i32(-1); + tcg_gen_add_i32(d, a, b); + tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d); + tcg_temp_free_i32(max); +} + +static void tcg_gen_vec_usadd32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 max = tcg_const_i64(-1); + tcg_gen_add_i64(d, a, b); + tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d); + tcg_temp_free_i64(max); +} + +void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g[4] = { + { .fno = gen_helper_gvec_usadd8, .vece = MO_8 }, + { .fno = gen_helper_gvec_usadd16, .vece = MO_16 }, + { .fni4 = tcg_gen_vec_usadd32_i32, + .fno = gen_helper_gvec_usadd32, + .vece = MO_32 }, + { .fni8 = tcg_gen_vec_usadd32_i64, + .fno = gen_helper_gvec_usadd64, + .vece = MO_64 } + }; + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); +} + +static void tcg_gen_vec_ussub32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + TCGv_i32 min = tcg_const_i32(0); + tcg_gen_sub_i32(d, a, b); + tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d); + tcg_temp_free_i32(min); +} + +static void tcg_gen_vec_ussub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 min = tcg_const_i64(0); + tcg_gen_sub_i64(d, a, b); + tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d); + tcg_temp_free_i64(min); +} + +void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g[4] = { + { .fno = gen_helper_gvec_ussub8, .vece = MO_8 }, + { .fno = gen_helper_gvec_ussub16, .vece = MO_16 }, + { .fni4 = tcg_gen_vec_ussub32_i32, + .fno = gen_helper_gvec_ussub32, + .vece = MO_32 }, + { .fni8 = tcg_gen_vec_ussub32_i64, + .fno = gen_helper_gvec_ussub64, + .vece = MO_64 } 
+ }; + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); +} + +/* Perform a vector negation using normal negation and a mask. + Compare gen_subv_mask above. */ +static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m) +{ + TCGv_i64 t2 = tcg_temp_new_i64(); + TCGv_i64 t3 = tcg_temp_new_i64(); + + tcg_gen_andc_i64(t3, m, b); + tcg_gen_andc_i64(t2, b, m); + tcg_gen_sub_i64(d, m, t2); + tcg_gen_xor_i64(d, d, t3); + + tcg_temp_free_i64(t2); + tcg_temp_free_i64(t3); +} + +void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b) +{ + TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); + gen_negv_mask(d, b, m); + tcg_temp_free_i64(m); +} + +void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b) +{ + TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); + gen_negv_mask(d, b, m); + tcg_temp_free_i64(m); +} + +void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b) +{ + TCGv_i64 t1 = tcg_temp_new_i64(); + TCGv_i64 t2 = tcg_temp_new_i64(); + + tcg_gen_andi_i64(t1, b, ~0xffffffffull); + tcg_gen_neg_i64(t2, b); + tcg_gen_neg_i64(t1, t1); + tcg_gen_deposit_i64(d, t1, t2, 0, 32); + + tcg_temp_free_i64(t1); + tcg_temp_free_i64(t2); +} + +void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen2 g[4] = { + { .fni8 = tcg_gen_vec_neg8_i64, + .fniv = tcg_gen_neg_vec, + .fno = gen_helper_gvec_neg8, + .opc = INDEX_op_neg_vec, + .vece = MO_8 }, + { .fni8 = tcg_gen_vec_neg16_i64, + .fniv = tcg_gen_neg_vec, + .fno = gen_helper_gvec_neg16, + .opc = INDEX_op_neg_vec, + .vece = MO_16 }, + { .fni4 = tcg_gen_neg_i32, + .fniv = tcg_gen_neg_vec, + .fno = gen_helper_gvec_neg32, + .opc = INDEX_op_neg_vec, + .vece = MO_32 }, + { .fni8 = tcg_gen_neg_i64, + .fniv = tcg_gen_neg_vec, + .fno = gen_helper_gvec_neg64, + .opc = INDEX_op_neg_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); +} + +void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g = { + .fni8 = tcg_gen_and_i64, + .fniv = tcg_gen_and_vec, + .fno = gen_helper_gvec_and, + .opc = INDEX_op_and_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); +} + +void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g = { + .fni8 = tcg_gen_or_i64, + .fniv = tcg_gen_or_vec, + .fno = gen_helper_gvec_or, + .opc = INDEX_op_or_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); +} + +void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g = { + .fni8 = tcg_gen_xor_i64, + .fniv = tcg_gen_xor_vec, + .fno = gen_helper_gvec_xor, + .opc = INDEX_op_xor_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); +} + +void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g = { + .fni8 = tcg_gen_andc_i64, + .fniv = tcg_gen_andc_vec, + .fno = gen_helper_gvec_andc, + .opc = INDEX_op_andc_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); +} + +void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, 
uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g = { + .fni8 = tcg_gen_orc_i64, + .fniv = tcg_gen_orc_vec, + .fno = gen_helper_gvec_orc, + .opc = INDEX_op_orc_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); +} + +static const GVecGen2s gop_ands = { + .fni8 = tcg_gen_and_i64, + .fniv = tcg_gen_and_vec, + .fno = gen_helper_gvec_ands, + .opc = INDEX_op_and_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 +}; + +void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) +{ + TCGv_i64 tmp = tcg_temp_new_i64(); + gen_dup_i64(vece, tmp, c); + tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); + tcg_temp_free_i64(tmp); +} + +void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz) +{ + TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); + tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); + tcg_temp_free_i64(tmp); +} + +static const GVecGen2s gop_xors = { + .fni8 = tcg_gen_xor_i64, + .fniv = tcg_gen_xor_vec, + .fno = gen_helper_gvec_xors, + .opc = INDEX_op_xor_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 +}; + +void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) +{ + TCGv_i64 tmp = tcg_temp_new_i64(); + gen_dup_i64(vece, tmp, c); + tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); + tcg_temp_free_i64(tmp); +} + +void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz) +{ + TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); + tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); + tcg_temp_free_i64(tmp); +} + +static const GVecGen2s gop_ors = { + .fni8 = tcg_gen_or_i64, + .fniv = tcg_gen_or_vec, + .fno = gen_helper_gvec_ors, + .opc = INDEX_op_or_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 +}; + +void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) +{ + TCGv_i64 tmp = tcg_temp_new_i64(); + gen_dup_i64(vece, tmp, c); + tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); + tcg_temp_free_i64(tmp); +} + +void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz) +{ + TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); + tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); + tcg_temp_free_i64(tmp); +} + +void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) +{ + uint64_t mask = dup_const(MO_8, 0xff << c); + tcg_gen_shli_i64(d, a, c); + tcg_gen_andi_i64(d, d, mask); +} + +void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) +{ + uint64_t mask = dup_const(MO_16, 0xffff << c); + tcg_gen_shli_i64(d, a, c); + tcg_gen_andi_i64(d, d, mask); +} + +void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen2i g[4] = { + { .fni8 = tcg_gen_vec_shl8i_i64, + .fniv = tcg_gen_shli_vec, + .fno = gen_helper_gvec_shl8i, + .opc = INDEX_op_shli_vec, + .vece = MO_8 }, + { .fni8 = tcg_gen_vec_shl16i_i64, + .fniv = tcg_gen_shli_vec, + .fno = gen_helper_gvec_shl16i, + .opc = INDEX_op_shli_vec, + .vece = MO_16 }, + { .fni4 = tcg_gen_shli_i32, + .fniv = tcg_gen_shli_vec, + .fno = gen_helper_gvec_shl32i, + .opc = INDEX_op_shli_vec, + .vece = MO_32 }, + { .fni8 = tcg_gen_shli_i64, + .fniv = tcg_gen_shli_vec, + .fno = 
gen_helper_gvec_shl64i, + .opc = INDEX_op_shli_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + + tcg_debug_assert(vece <= MO_64); + tcg_debug_assert(shift >= 0 && shift < (8 << vece)); + if (shift == 0) { + tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); + } else { + tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); + } +} + +void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) +{ + uint64_t mask = dup_const(MO_8, 0xff >> c); + tcg_gen_shri_i64(d, a, c); + tcg_gen_andi_i64(d, d, mask); +} + +void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) +{ + uint64_t mask = dup_const(MO_16, 0xffff >> c); + tcg_gen_shri_i64(d, a, c); + tcg_gen_andi_i64(d, d, mask); +} + +void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen2i g[4] = { + { .fni8 = tcg_gen_vec_shr8i_i64, + .fniv = tcg_gen_shri_vec, + .fno = gen_helper_gvec_shr8i, + .opc = INDEX_op_shri_vec, + .vece = MO_8 }, + { .fni8 = tcg_gen_vec_shr16i_i64, + .fniv = tcg_gen_shri_vec, + .fno = gen_helper_gvec_shr16i, + .opc = INDEX_op_shri_vec, + .vece = MO_16 }, + { .fni4 = tcg_gen_shri_i32, + .fniv = tcg_gen_shri_vec, + .fno = gen_helper_gvec_shr32i, + .opc = INDEX_op_shri_vec, + .vece = MO_32 }, + { .fni8 = tcg_gen_shri_i64, + .fniv = tcg_gen_shri_vec, + .fno = gen_helper_gvec_shr64i, + .opc = INDEX_op_shri_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + + tcg_debug_assert(vece <= MO_64); + tcg_debug_assert(shift >= 0 && shift < (8 << vece)); + if (shift == 0) { + tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); + } else { + tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); + } +} + +void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) +{ + uint64_t s_mask = dup_const(MO_8, 0x80 >> c); + uint64_t c_mask = dup_const(MO_8, 0xff >> c); + TCGv_i64 s = tcg_temp_new_i64(); + + tcg_gen_shri_i64(d, a, c); + tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ + tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ + tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ + tcg_gen_or_i64(d, d, s); /* include sign extension */ + tcg_temp_free_i64(s); +} + +void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) +{ + uint64_t s_mask = dup_const(MO_16, 0x8000 >> c); + uint64_t c_mask = dup_const(MO_16, 0xffff >> c); + TCGv_i64 s = tcg_temp_new_i64(); + + tcg_gen_shri_i64(d, a, c); + tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ + tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ + tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ + tcg_gen_or_i64(d, d, s); /* include sign extension */ + tcg_temp_free_i64(s); +} + +void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen2i g[4] = { + { .fni8 = tcg_gen_vec_sar8i_i64, + .fniv = tcg_gen_sari_vec, + .fno = gen_helper_gvec_sar8i, + .opc = INDEX_op_sari_vec, + .vece = MO_8 }, + { .fni8 = tcg_gen_vec_sar16i_i64, + .fniv = tcg_gen_sari_vec, + .fno = gen_helper_gvec_sar16i, + .opc = INDEX_op_sari_vec, + .vece = MO_16 }, + { .fni4 = tcg_gen_sari_i32, + .fniv = tcg_gen_sari_vec, + .fno = gen_helper_gvec_sar32i, + .opc = INDEX_op_sari_vec, + .vece = MO_32 }, + { .fni8 = tcg_gen_sari_i64, + .fniv = tcg_gen_sari_vec, + .fno = gen_helper_gvec_sar64i, + .opc = INDEX_op_sari_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + + 
tcg_debug_assert(vece <= MO_64); + tcg_debug_assert(shift >= 0 && shift < (8 << vece)); + if (shift == 0) { + tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); + } else { + tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); + } +} + +/* Expand OPSZ bytes worth of three-operand operations using i32 elements. */ +static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t oprsz, TCGCond cond) +{ + TCGv_i32 t0 = tcg_temp_new_i32(); + TCGv_i32 t1 = tcg_temp_new_i32(); + uint32_t i; + + for (i = 0; i < oprsz; i += 4) { + tcg_gen_ld_i32(t0, cpu_env, aofs + i); + tcg_gen_ld_i32(t1, cpu_env, bofs + i); + tcg_gen_setcond_i32(cond, t0, t0, t1); + tcg_gen_neg_i32(t0, t0); + tcg_gen_st_i32(t0, cpu_env, dofs + i); + } + tcg_temp_free_i32(t1); + tcg_temp_free_i32(t0); +} + +static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t oprsz, TCGCond cond) +{ + TCGv_i64 t0 = tcg_temp_new_i64(); + TCGv_i64 t1 = tcg_temp_new_i64(); + uint32_t i; + + for (i = 0; i < oprsz; i += 8) { + tcg_gen_ld_i64(t0, cpu_env, aofs + i); + tcg_gen_ld_i64(t1, cpu_env, bofs + i); + tcg_gen_setcond_i64(cond, t0, t0, t1); + tcg_gen_neg_i64(t0, t0); + tcg_gen_st_i64(t0, cpu_env, dofs + i); + } + tcg_temp_free_i64(t1); + tcg_temp_free_i64(t0); +} + +static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t tysz, + TCGType type, TCGCond cond) +{ + TCGv_vec t0 = tcg_temp_new_vec(type); + TCGv_vec t1 = tcg_temp_new_vec(type); + uint32_t i; + + for (i = 0; i < oprsz; i += tysz) { + tcg_gen_ld_vec(t0, cpu_env, aofs + i); + tcg_gen_ld_vec(t1, cpu_env, bofs + i); + tcg_gen_cmp_vec(cond, vece, t0, t0, t1); + tcg_gen_st_vec(t0, cpu_env, dofs + i); + } + tcg_temp_free_vec(t1); + tcg_temp_free_vec(t0); +} + +void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs, + uint32_t aofs, uint32_t bofs, + uint32_t oprsz, uint32_t maxsz) +{ + static gen_helper_gvec_3 * const eq_fn[4] = { + gen_helper_gvec_eq8, gen_helper_gvec_eq16, + gen_helper_gvec_eq32, gen_helper_gvec_eq64 + }; + static gen_helper_gvec_3 * const ne_fn[4] = { + gen_helper_gvec_ne8, gen_helper_gvec_ne16, + gen_helper_gvec_ne32, gen_helper_gvec_ne64 + }; + static gen_helper_gvec_3 * const lt_fn[4] = { + gen_helper_gvec_lt8, gen_helper_gvec_lt16, + gen_helper_gvec_lt32, gen_helper_gvec_lt64 + }; + static gen_helper_gvec_3 * const le_fn[4] = { + gen_helper_gvec_le8, gen_helper_gvec_le16, + gen_helper_gvec_le32, gen_helper_gvec_le64 + }; + static gen_helper_gvec_3 * const ltu_fn[4] = { + gen_helper_gvec_ltu8, gen_helper_gvec_ltu16, + gen_helper_gvec_ltu32, gen_helper_gvec_ltu64 + }; + static gen_helper_gvec_3 * const leu_fn[4] = { + gen_helper_gvec_leu8, gen_helper_gvec_leu16, + gen_helper_gvec_leu32, gen_helper_gvec_leu64 + }; + static gen_helper_gvec_3 * const * const fns[16] = { + [TCG_COND_EQ] = eq_fn, + [TCG_COND_NE] = ne_fn, + [TCG_COND_LT] = lt_fn, + [TCG_COND_LE] = le_fn, + [TCG_COND_LTU] = ltu_fn, + [TCG_COND_LEU] = leu_fn, + }; + + check_size_align(oprsz, maxsz, dofs | aofs | bofs); + check_overlap_3(dofs, aofs, bofs, maxsz); + + if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) { + do_dup(MO_8, dofs, oprsz, maxsz, + NULL, NULL, -(cond == TCG_COND_ALWAYS)); + return; + } + + /* Recall that ARM SVE allows vector sizes that are not a power of 2. + Expand with successively smaller host vector sizes. The intent is + that e.g. oprsz == 80 would be expanded with 2x32 + 1x16. 
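In the integral fallbacks above, setcond produces 0 or 1 per element and the following negation widens that into the all-zeros or all-ones element that a vector compare is defined to produce. In plain C the same two steps look like:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        int32_t a = 5, b = 9;

        uint32_t d = -(uint32_t)(a < b);     /* setcond ...; neg */
        assert(d == 0xffffffffu);            /* condition true  -> all ones  */

        d = -(uint32_t)(a > b);
        assert(d == 0);                      /* condition false -> all zeros */
        return 0;
    }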
*/ + + if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32) + && tcg_can_emit_vec_op(INDEX_op_cmp_vec, TCG_TYPE_V256, vece)) { + uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32); + expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond); + if (some == oprsz) { + goto done; + } + dofs += some; + aofs += some; + bofs += some; + oprsz -= some; + maxsz -= some; + } + + if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16) + && tcg_can_emit_vec_op(INDEX_op_cmp_vec, TCG_TYPE_V128, vece)) { + expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond); + } else if (TCG_TARGET_HAS_v64 + && check_size_impl(oprsz, 8) + && (TCG_TARGET_REG_BITS == 32 || vece != MO_64) + && tcg_can_emit_vec_op(INDEX_op_cmp_vec, TCG_TYPE_V64, vece)) { + expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond); + } else if (vece == MO_64 && check_size_impl(oprsz, 8)) { + expand_cmp_i64(dofs, aofs, bofs, oprsz, cond); + } else if (vece == MO_32 && check_size_impl(oprsz, 4)) { + expand_cmp_i32(dofs, aofs, bofs, oprsz, cond); + } else { + gen_helper_gvec_3 * const *fn = fns[cond]; + + if (fn == NULL) { + uint32_t tmp; + tmp = aofs, aofs = bofs, bofs = tmp; + cond = tcg_swap_cond(cond); + fn = fns[cond]; + assert(fn != NULL); + } + tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]); + return; + } + + done: + if (oprsz < maxsz) { + expand_clr(dofs + oprsz, maxsz - oprsz); + } +} diff --git a/tcg/tcg-op-gvec.h b/tcg/tcg-op-gvec.h new file mode 100644 index 0000000000..ff43a29a0b --- /dev/null +++ b/tcg/tcg-op-gvec.h @@ -0,0 +1,306 @@ +/* + * Generic vector operation expansion + * + * Copyright (c) 2018 Linaro + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * "Generic" vectors. All operands are given as offsets from ENV, + * and therefore cannot also be allocated via tcg_global_mem_new_*. + * OPRSZ is the byte size of the vector upon which the operation is performed. + * MAXSZ is the byte size of the full vector; bytes beyond OPSZ are cleared. + * + * All sizes must be 8 or any multiple of 16. + * When OPRSZ is 8, the alignment may be 8, otherwise must be 16. + * Operands may completely, but not partially, overlap. + */ + +/* Expand a call to a gvec-style helper, with pointers to two vector + operands, and a descriptor (see tcg-gvec-desc.h). */ +typedef void gen_helper_gvec_2(TCGv_ptr, TCGv_ptr, TCGv_i32); +void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t maxsz, int32_t data, + gen_helper_gvec_2 *fn); + +/* Similarly, passing an extra data value. */ +typedef void gen_helper_gvec_2i(TCGv_ptr, TCGv_ptr, TCGv_i64, TCGv_i32); +void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c, + uint32_t oprsz, uint32_t maxsz, int32_t data, + gen_helper_gvec_2i *fn); + +/* Similarly, passing an extra pointer (e.g. env or float_status). 
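When none of the inline strategies apply, the _ool entry points emit a single call to a helper that receives host pointers into env plus a packed descriptor. The descriptor encoding lives in tcg-gvec-desc.h and is not reproduced here, so the sketch below passes the operation size directly; it shows only the general shape such a helper takes, not the actual body of any helper in this series:

    #include <stdint.h>
    #include <string.h>

    /* Shape of an out-of-line three-operand helper, with the size passed
     * explicitly instead of via the packed descriptor to keep it standalone. */
    static void helper_model_xor32(void *d, void *a, void *b, uint32_t oprsz)
    {
        for (uint32_t i = 0; i < oprsz; i += 4) {
            uint32_t x, y;
            memcpy(&x, (char *)a + i, 4);
            memcpy(&y, (char *)b + i, 4);
            x ^= y;
            memcpy((char *)d + i, &x, 4);
        }
    }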
*/ +typedef void gen_helper_gvec_2_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); +void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs, + TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz, + int32_t data, gen_helper_gvec_2_ptr *fn); + +/* Similarly, with three vector operands. */ +typedef void gen_helper_gvec_3(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); +void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t oprsz, uint32_t maxsz, int32_t data, + gen_helper_gvec_3 *fn); + +/* Similarly, with four vector operands. */ +typedef void gen_helper_gvec_4(TCGv_ptr, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_i32); +void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t cofs, uint32_t oprsz, uint32_t maxsz, + int32_t data, gen_helper_gvec_4 *fn); + +/* Similarly, with five vector operands. */ +typedef void gen_helper_gvec_5(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_i32); +void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t cofs, uint32_t xofs, uint32_t oprsz, + uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn); + +typedef void gen_helper_gvec_3_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_i32); +void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs, + TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz, + int32_t data, gen_helper_gvec_3_ptr *fn); + +typedef void gen_helper_gvec_4_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_ptr, TCGv_i32); +void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz, + uint32_t maxsz, int32_t data, + gen_helper_gvec_4_ptr *fn); + +/* Expand a gvec operation. Either inline or out-of-line depending on + the actual vector size and the operations supported by the host. */ +typedef struct { + /* Expand inline as a 64-bit or 32-bit integer. + Only one of these will be non-NULL. */ + void (*fni8)(TCGv_i64, TCGv_i64); + void (*fni4)(TCGv_i32, TCGv_i32); + /* Expand inline with a host vector type. */ + void (*fniv)(unsigned, TCGv_vec, TCGv_vec); + /* Expand out-of-line helper w/descriptor. */ + gen_helper_gvec_2 *fno; + /* The opcode, if any, to which this corresponds. */ + TCGOpcode opc; + /* The data argument to the out-of-line helper. */ + int32_t data; + /* The vector element size, if applicable. */ + uint8_t vece; + /* Prefer i64 to v64. */ + bool prefer_i64; +} GVecGen2; + +typedef struct { + /* Expand inline as a 64-bit or 32-bit integer. + Only one of these will be non-NULL. */ + void (*fni8)(TCGv_i64, TCGv_i64, int64_t); + void (*fni4)(TCGv_i32, TCGv_i32, int32_t); + /* Expand inline with a host vector type. */ + void (*fniv)(unsigned, TCGv_vec, TCGv_vec, int64_t); + /* Expand out-of-line helper w/descriptor, data in descriptor. */ + gen_helper_gvec_2 *fno; + /* Expand out-of-line helper w/descriptor, data as argument. */ + gen_helper_gvec_2i *fnoi; + /* The opcode, if any, to which this corresponds. */ + TCGOpcode opc; + /* The vector element size, if applicable. */ + uint8_t vece; + /* Prefer i64 to v64. */ + bool prefer_i64; + /* Load dest as a 3rd source operand. */ + bool load_dest; +} GVecGen2i; + +typedef struct { + /* Expand inline as a 64-bit or 32-bit integer. + Only one of these will be non-NULL. */ + void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64); + void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32); + /* Expand inline with a host vector type. */ + void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec); + /* Expand out-of-line helper w/descriptor. 
*/ + gen_helper_gvec_2i *fno; + /* The opcode, if any, to which this corresponds. */ + TCGOpcode opc; + /* The data argument to the out-of-line helper. */ + uint32_t data; + /* The vector element size, if applicable. */ + uint8_t vece; + /* Prefer i64 to v64. */ + bool prefer_i64; + /* Load scalar as 1st source operand. */ + bool scalar_first; +} GVecGen2s; + +typedef struct { + /* Expand inline as a 64-bit or 32-bit integer. + Only one of these will be non-NULL. */ + void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64); + void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32); + /* Expand inline with a host vector type. */ + void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec); + /* Expand out-of-line helper w/descriptor. */ + gen_helper_gvec_3 *fno; + /* The opcode, if any, to which this corresponds. */ + TCGOpcode opc; + /* The data argument to the out-of-line helper. */ + int32_t data; + /* The vector element size, if applicable. */ + uint8_t vece; + /* Prefer i64 to v64. */ + bool prefer_i64; + /* Load dest as a 3rd source operand. */ + bool load_dest; +} GVecGen3; + +typedef struct { + /* Expand inline as a 64-bit or 32-bit integer. + Only one of these will be non-NULL. */ + void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64); + void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32); + /* Expand inline with a host vector type. */ + void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, TCGv_vec); + /* Expand out-of-line helper w/descriptor. */ + gen_helper_gvec_4 *fno; + /* The opcode, if any, to which this corresponds. */ + TCGOpcode opc; + /* The data argument to the out-of-line helper. */ + int32_t data; + /* The vector element size, if applicable. */ + uint8_t vece; + /* Prefer i64 to v64. */ + bool prefer_i64; +} GVecGen4; + +void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t maxsz, const GVecGen2 *); +void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz, + uint32_t maxsz, int64_t c, const GVecGen2i *); +void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz, + uint32_t maxsz, TCGv_i64 c, const GVecGen2s *); +void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t oprsz, uint32_t maxsz, const GVecGen3 *); +void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs, + uint32_t oprsz, uint32_t maxsz, const GVecGen4 *); + +/* Expand a specific vector operation. 
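These operator descriptions are what a target fills in to get a new expansion without touching the dispatch logic. As a hedged usage sketch (the op, its helper name, and the calling context are invented; a real target would also declare and implement the out-of-line helper), a composite bitwise "nand" could be wired up through the GVecGen3 description defined above:

    /* Hypothetical target code: gen_helper_gvec_nand is assumed to be
     * declared and implemented elsewhere; it is the mandatory fallback. */
    static void gen_nand_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
    {
        tcg_gen_and_i64(d, a, b);
        tcg_gen_not_i64(d, d);
    }

    static void gen_nand_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
    {
        tcg_gen_and_vec(vece, d, a, b);
        tcg_gen_not_vec(vece, d, d);
    }

    static void gen_gvec_nand(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                              uint32_t oprsz, uint32_t maxsz)
    {
        static const GVecGen3 g = {
            .fni8 = gen_nand_i64,
            .fniv = gen_nand_vec,
            .fno = gen_helper_gvec_nand,        /* hypothetical helper */
            .prefer_i64 = TCG_TARGET_REG_BITS == 64,
        };
        tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
    }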
*/ + +void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t maxsz); + +void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); + +void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz); + +void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz); + +/* Saturated arithmetic. */ +void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); + +void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); + +void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz); + +void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz); + +void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t s, uint32_t m); +void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t s, + uint32_t m, TCGv_i32); +void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t s, + uint32_t m, TCGv_i64); + +void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t s, uint32_t m, uint8_t x); +void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t s, uint32_t m, uint16_t x); +void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t s, 
uint32_t m, uint32_t x); +void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t s, uint32_t m, uint64_t x); + +void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz); + +void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs, + uint32_t aofs, uint32_t bofs, + uint32_t oprsz, uint32_t maxsz); + +/* + * 64-bit vector operations. Use these when the register has been allocated + * with tcg_global_mem_new_i64, and so we cannot also address it via pointer. + * OPRSZ = MAXSZ = 8. + */ + +void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 a); +void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 a); +void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 a); + +void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b); +void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b); +void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b); + +void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b); +void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b); +void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b); + +void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t); +void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t); +void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t); +void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t); +void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t); +void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t); diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c new file mode 100644 index 0000000000..70ec889bc1 --- /dev/null +++ b/tcg/tcg-op-vec.c @@ -0,0 +1,389 @@ +/* + * Tiny Code Generator for QEMU + * + * Copyright (c) 2018 Linaro, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "cpu.h" +#include "exec/exec-all.h" +#include "tcg.h" +#include "tcg-op.h" +#include "tcg-mo.h" + +/* Reduce the number of ifdefs below. This assumes that all uses of + TCGV_HIGH and TCGV_LOW are properly protected by a conditional that + the compiler can eliminate. 
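The guard that follows turns a would-be #ifdef into a link-time check: on 64-bit hosts the TCGV_LOW/TCGV_HIGH accessors are redirected to deliberately undefined functions, so any path that could still reach them fails at link time rather than silently misbehaving. The same pattern in miniature (it relies on the dead branch being compiled out, e.g. by building with optimization; names here are illustrative):

    #include <stdio.h>

    /* Never defined anywhere: referencing it is a link error. */
    extern int thirty_two_bit_only_link_error(void);
    #define thirty_two_bit_only thirty_two_bit_only_link_error

    #define HOST_BITS 64   /* stand-in for TCG_TARGET_REG_BITS */

    int main(void)
    {
        if (HOST_BITS == 64) {
            puts("64-bit path");
        } else {
            /* Dead when HOST_BITS == 64; the call, and with it the
             * undefined symbol, disappears along with the branch. */
            printf("%d\n", thirty_two_bit_only());
        }
        return 0;
    }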
*/ +#if TCG_TARGET_REG_BITS == 64 +extern TCGv_i32 TCGV_LOW_link_error(TCGv_i64); +extern TCGv_i32 TCGV_HIGH_link_error(TCGv_i64); +#define TCGV_LOW TCGV_LOW_link_error +#define TCGV_HIGH TCGV_HIGH_link_error +#endif + +void vec_gen_2(TCGOpcode opc, TCGType type, unsigned vece, TCGArg r, TCGArg a) +{ + TCGOp *op = tcg_emit_op(opc); + TCGOP_VECL(op) = type - TCG_TYPE_V64; + TCGOP_VECE(op) = vece; + op->args[0] = r; + op->args[1] = a; +} + +void vec_gen_3(TCGOpcode opc, TCGType type, unsigned vece, + TCGArg r, TCGArg a, TCGArg b) +{ + TCGOp *op = tcg_emit_op(opc); + TCGOP_VECL(op) = type - TCG_TYPE_V64; + TCGOP_VECE(op) = vece; + op->args[0] = r; + op->args[1] = a; + op->args[2] = b; +} + +void vec_gen_4(TCGOpcode opc, TCGType type, unsigned vece, + TCGArg r, TCGArg a, TCGArg b, TCGArg c) +{ + TCGOp *op = tcg_emit_op(opc); + TCGOP_VECL(op) = type - TCG_TYPE_V64; + TCGOP_VECE(op) = vece; + op->args[0] = r; + op->args[1] = a; + op->args[2] = b; + op->args[3] = c; +} + +static void vec_gen_op2(TCGOpcode opc, unsigned vece, TCGv_vec r, TCGv_vec a) +{ + TCGTemp *rt = tcgv_vec_temp(r); + TCGTemp *at = tcgv_vec_temp(a); + TCGType type = rt->base_type; + + /* Must enough inputs for the output. */ + tcg_debug_assert(at->base_type >= type); + vec_gen_2(opc, type, vece, temp_arg(rt), temp_arg(at)); +} + +static void vec_gen_op3(TCGOpcode opc, unsigned vece, + TCGv_vec r, TCGv_vec a, TCGv_vec b) +{ + TCGTemp *rt = tcgv_vec_temp(r); + TCGTemp *at = tcgv_vec_temp(a); + TCGTemp *bt = tcgv_vec_temp(b); + TCGType type = rt->base_type; + + /* Must enough inputs for the output. */ + tcg_debug_assert(at->base_type >= type); + tcg_debug_assert(bt->base_type >= type); + vec_gen_3(opc, type, vece, temp_arg(rt), temp_arg(at), temp_arg(bt)); +} + +void tcg_gen_mov_vec(TCGv_vec r, TCGv_vec a) +{ + if (r != a) { + vec_gen_op2(INDEX_op_mov_vec, 0, r, a); + } +} + +#define MO_REG (TCG_TARGET_REG_BITS == 64 ? 
MO_64 : MO_32) + +static void do_dupi_vec(TCGv_vec r, unsigned vece, TCGArg a) +{ + TCGTemp *rt = tcgv_vec_temp(r); + vec_gen_2(INDEX_op_dupi_vec, rt->base_type, vece, temp_arg(rt), a); +} + +TCGv_vec tcg_const_zeros_vec(TCGType type) +{ + TCGv_vec ret = tcg_temp_new_vec(type); + do_dupi_vec(ret, MO_REG, 0); + return ret; +} + +TCGv_vec tcg_const_ones_vec(TCGType type) +{ + TCGv_vec ret = tcg_temp_new_vec(type); + do_dupi_vec(ret, MO_REG, -1); + return ret; +} + +TCGv_vec tcg_const_zeros_vec_matching(TCGv_vec m) +{ + TCGTemp *t = tcgv_vec_temp(m); + return tcg_const_zeros_vec(t->base_type); +} + +TCGv_vec tcg_const_ones_vec_matching(TCGv_vec m) +{ + TCGTemp *t = tcgv_vec_temp(m); + return tcg_const_ones_vec(t->base_type); +} + +void tcg_gen_dup64i_vec(TCGv_vec r, uint64_t a) +{ + if (TCG_TARGET_REG_BITS == 32 && a == deposit64(a, 32, 32, a)) { + do_dupi_vec(r, MO_32, a); + } else if (TCG_TARGET_REG_BITS == 64 || a == (uint64_t)(int32_t)a) { + do_dupi_vec(r, MO_64, a); + } else { + TCGv_i64 c = tcg_const_i64(a); + tcg_gen_dup_i64_vec(MO_64, r, c); + tcg_temp_free_i64(c); + } +} + +void tcg_gen_dup32i_vec(TCGv_vec r, uint32_t a) +{ + do_dupi_vec(r, MO_REG, dup_const(MO_32, a)); +} + +void tcg_gen_dup16i_vec(TCGv_vec r, uint32_t a) +{ + do_dupi_vec(r, MO_REG, dup_const(MO_16, a)); +} + +void tcg_gen_dup8i_vec(TCGv_vec r, uint32_t a) +{ + do_dupi_vec(r, MO_REG, dup_const(MO_8, a)); +} + +void tcg_gen_dupi_vec(unsigned vece, TCGv_vec r, uint64_t a) +{ + do_dupi_vec(r, MO_REG, dup_const(vece, a)); +} + +void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec r, TCGv_i64 a) +{ + TCGArg ri = tcgv_vec_arg(r); + TCGTemp *rt = arg_temp(ri); + TCGType type = rt->base_type; + + if (TCG_TARGET_REG_BITS == 64) { + TCGArg ai = tcgv_i64_arg(a); + vec_gen_2(INDEX_op_dup_vec, type, vece, ri, ai); + } else if (vece == MO_64) { + TCGArg al = tcgv_i32_arg(TCGV_LOW(a)); + TCGArg ah = tcgv_i32_arg(TCGV_HIGH(a)); + vec_gen_3(INDEX_op_dup2_vec, type, MO_64, ri, al, ah); + } else { + TCGArg ai = tcgv_i32_arg(TCGV_LOW(a)); + vec_gen_2(INDEX_op_dup_vec, type, vece, ri, ai); + } +} + +void tcg_gen_dup_i32_vec(unsigned vece, TCGv_vec r, TCGv_i32 a) +{ + TCGArg ri = tcgv_vec_arg(r); + TCGArg ai = tcgv_i32_arg(a); + TCGTemp *rt = arg_temp(ri); + TCGType type = rt->base_type; + + vec_gen_2(INDEX_op_dup_vec, type, vece, ri, ai); +} + +static void vec_gen_ldst(TCGOpcode opc, TCGv_vec r, TCGv_ptr b, TCGArg o) +{ + TCGArg ri = tcgv_vec_arg(r); + TCGArg bi = tcgv_ptr_arg(b); + TCGTemp *rt = arg_temp(ri); + TCGType type = rt->base_type; + + vec_gen_3(opc, type, 0, ri, bi, o); +} + +void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr b, TCGArg o) +{ + vec_gen_ldst(INDEX_op_ld_vec, r, b, o); +} + +void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr b, TCGArg o) +{ + vec_gen_ldst(INDEX_op_st_vec, r, b, o); +} + +void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr b, TCGArg o, TCGType low_type) +{ + TCGArg ri = tcgv_vec_arg(r); + TCGArg bi = tcgv_ptr_arg(b); + TCGTemp *rt = arg_temp(ri); + TCGType type = rt->base_type; + + tcg_debug_assert(low_type >= TCG_TYPE_V64); + tcg_debug_assert(low_type <= type); + vec_gen_3(INDEX_op_st_vec, low_type, 0, ri, bi, o); +} + +void tcg_gen_add_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b) +{ + vec_gen_op3(INDEX_op_add_vec, vece, r, a, b); +} + +void tcg_gen_sub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b) +{ + vec_gen_op3(INDEX_op_sub_vec, vece, r, a, b); +} + +void tcg_gen_and_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b) +{ + vec_gen_op3(INDEX_op_and_vec, 0, r, a, b); +} + +void tcg_gen_or_vec(unsigned 
vece, TCGv_vec r, TCGv_vec a, TCGv_vec b) +{ + vec_gen_op3(INDEX_op_or_vec, 0, r, a, b); +} + +void tcg_gen_xor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b) +{ + vec_gen_op3(INDEX_op_xor_vec, 0, r, a, b); +} + +void tcg_gen_andc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b) +{ + if (TCG_TARGET_HAS_andc_vec) { + vec_gen_op3(INDEX_op_andc_vec, 0, r, a, b); + } else { + TCGv_vec t = tcg_temp_new_vec_matching(r); + tcg_gen_not_vec(0, t, b); + tcg_gen_and_vec(0, r, a, t); + tcg_temp_free_vec(t); + } +} + +void tcg_gen_orc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b) +{ + if (TCG_TARGET_HAS_orc_vec) { + vec_gen_op3(INDEX_op_orc_vec, 0, r, a, b); + } else { + TCGv_vec t = tcg_temp_new_vec_matching(r); + tcg_gen_not_vec(0, t, b); + tcg_gen_or_vec(0, r, a, t); + tcg_temp_free_vec(t); + } +} + +void tcg_gen_not_vec(unsigned vece, TCGv_vec r, TCGv_vec a) +{ + if (TCG_TARGET_HAS_not_vec) { + vec_gen_op2(INDEX_op_not_vec, 0, r, a); + } else { + TCGv_vec t = tcg_const_ones_vec_matching(r); + tcg_gen_xor_vec(0, r, a, t); + tcg_temp_free_vec(t); + } +} + +void tcg_gen_neg_vec(unsigned vece, TCGv_vec r, TCGv_vec a) +{ + if (TCG_TARGET_HAS_neg_vec) { + vec_gen_op2(INDEX_op_neg_vec, vece, r, a); + } else { + TCGv_vec t = tcg_const_zeros_vec_matching(r); + tcg_gen_sub_vec(vece, r, t, a); + tcg_temp_free_vec(t); + } +} + +static void do_shifti(TCGOpcode opc, unsigned vece, + TCGv_vec r, TCGv_vec a, int64_t i) +{ + TCGTemp *rt = tcgv_vec_temp(r); + TCGTemp *at = tcgv_vec_temp(a); + TCGArg ri = temp_arg(rt); + TCGArg ai = temp_arg(at); + TCGType type = rt->base_type; + int can; + + tcg_debug_assert(at->base_type == type); + tcg_debug_assert(i >= 0 && i < (8 << vece)); + + if (i == 0) { + tcg_gen_mov_vec(r, a); + return; + } + + can = tcg_can_emit_vec_op(opc, type, vece); + if (can > 0) { + vec_gen_3(opc, type, vece, ri, ai, i); + } else { + /* We leave the choice of expansion via scalar or vector shift + to the target. Often, but not always, dupi can feed a vector + shift easier than a scalar. 
*/ + tcg_debug_assert(can < 0); + tcg_expand_vec_op(opc, type, vece, ri, ai, i); + } +} + +void tcg_gen_shli_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i) +{ + do_shifti(INDEX_op_shli_vec, vece, r, a, i); +} + +void tcg_gen_shri_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i) +{ + do_shifti(INDEX_op_shri_vec, vece, r, a, i); +} + +void tcg_gen_sari_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i) +{ + do_shifti(INDEX_op_sari_vec, vece, r, a, i); +} + +void tcg_gen_cmp_vec(TCGCond cond, unsigned vece, + TCGv_vec r, TCGv_vec a, TCGv_vec b) +{ + TCGTemp *rt = tcgv_vec_temp(r); + TCGTemp *at = tcgv_vec_temp(a); + TCGTemp *bt = tcgv_vec_temp(b); + TCGArg ri = temp_arg(rt); + TCGArg ai = temp_arg(at); + TCGArg bi = temp_arg(bt); + TCGType type = rt->base_type; + int can; + + tcg_debug_assert(at->base_type == type); + tcg_debug_assert(bt->base_type == type); + can = tcg_can_emit_vec_op(INDEX_op_cmp_vec, type, vece); + if (can > 0) { + vec_gen_4(INDEX_op_cmp_vec, type, vece, ri, ai, bi, cond); + } else { + tcg_debug_assert(can < 0); + tcg_expand_vec_op(INDEX_op_cmp_vec, type, vece, ri, ai, bi, cond); + } +} + +void tcg_gen_mul_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b) +{ + TCGTemp *rt = tcgv_vec_temp(r); + TCGTemp *at = tcgv_vec_temp(a); + TCGTemp *bt = tcgv_vec_temp(b); + TCGArg ri = temp_arg(rt); + TCGArg ai = temp_arg(at); + TCGArg bi = temp_arg(bt); + TCGType type = rt->base_type; + int can; + + tcg_debug_assert(at->base_type == type); + tcg_debug_assert(bt->base_type == type); + can = tcg_can_emit_vec_op(INDEX_op_mul_vec, type, vece); + if (can > 0) { + vec_gen_3(INDEX_op_mul_vec, type, vece, ri, ai, bi); + } else { + tcg_debug_assert(can < 0); + tcg_expand_vec_op(INDEX_op_mul_vec, type, vece, ri, ai, bi); + } +} diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c index 0c509bfe46..3467787323 100644 --- a/tcg/tcg-op.c +++ b/tcg/tcg-op.c @@ -140,7 +140,7 @@ void tcg_gen_subi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) } } -void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2) +void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) { TCGv_i32 t0; /* Some cases can be optimized here. */ @@ -148,17 +148,17 @@ void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2) case 0: tcg_gen_movi_i32(ret, 0); return; - case 0xffffffffu: + case -1: tcg_gen_mov_i32(ret, arg1); return; - case 0xffu: + case 0xff: /* Don't recurse with tcg_gen_ext8u_i32. 
*/ if (TCG_TARGET_HAS_ext8u_i32) { tcg_gen_op2_i32(INDEX_op_ext8u_i32, ret, arg1); return; } break; - case 0xffffu: + case 0xffff: if (TCG_TARGET_HAS_ext16u_i32) { tcg_gen_op2_i32(INDEX_op_ext16u_i32, ret, arg1); return; @@ -199,9 +199,9 @@ void tcg_gen_xori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) } } -void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2) +void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) { - tcg_debug_assert(arg2 < 32); + tcg_debug_assert(arg2 >= 0 && arg2 < 32); if (arg2 == 0) { tcg_gen_mov_i32(ret, arg1); } else { @@ -211,9 +211,9 @@ void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2) } } -void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2) +void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) { - tcg_debug_assert(arg2 < 32); + tcg_debug_assert(arg2 >= 0 && arg2 < 32); if (arg2 == 0) { tcg_gen_mov_i32(ret, arg1); } else { @@ -223,9 +223,9 @@ void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2) } } -void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2) +void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) { - tcg_debug_assert(arg2 < 32); + tcg_debug_assert(arg2 >= 0 && arg2 < 32); if (arg2 == 0) { tcg_gen_mov_i32(ret, arg1); } else { @@ -1201,7 +1201,7 @@ void tcg_gen_subi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) } } -void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2) +void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) { TCGv_i64 t0; @@ -1216,23 +1216,23 @@ void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2) case 0: tcg_gen_movi_i64(ret, 0); return; - case 0xffffffffffffffffull: + case -1: tcg_gen_mov_i64(ret, arg1); return; - case 0xffull: + case 0xff: /* Don't recurse with tcg_gen_ext8u_i64. 
*/ if (TCG_TARGET_HAS_ext8u_i64) { tcg_gen_op2_i64(INDEX_op_ext8u_i64, ret, arg1); return; } break; - case 0xffffu: + case 0xffff: if (TCG_TARGET_HAS_ext16u_i64) { tcg_gen_op2_i64(INDEX_op_ext16u_i64, ret, arg1); return; } break; - case 0xffffffffull: + case 0xffffffffu: if (TCG_TARGET_HAS_ext32u_i64) { tcg_gen_op2_i64(INDEX_op_ext32u_i64, ret, arg1); return; @@ -1332,9 +1332,9 @@ static inline void tcg_gen_shifti_i64(TCGv_i64 ret, TCGv_i64 arg1, } } -void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2) +void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) { - tcg_debug_assert(arg2 < 64); + tcg_debug_assert(arg2 >= 0 && arg2 < 64); if (TCG_TARGET_REG_BITS == 32) { tcg_gen_shifti_i64(ret, arg1, arg2, 0, 0); } else if (arg2 == 0) { @@ -1346,9 +1346,9 @@ void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2) } } -void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2) +void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) { - tcg_debug_assert(arg2 < 64); + tcg_debug_assert(arg2 >= 0 && arg2 < 64); if (TCG_TARGET_REG_BITS == 32) { tcg_gen_shifti_i64(ret, arg1, arg2, 1, 0); } else if (arg2 == 0) { @@ -1360,9 +1360,9 @@ void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2) } } -void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2) +void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) { - tcg_debug_assert(arg2 < 64); + tcg_debug_assert(arg2 >= 0 && arg2 < 64); if (TCG_TARGET_REG_BITS == 32) { tcg_gen_shifti_i64(ret, arg1, arg2, 1, 1); } else if (arg2 == 0) { diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h index ca07b32b65..75bb55aeac 100644 --- a/tcg/tcg-op.h +++ b/tcg/tcg-op.h @@ -35,6 +35,10 @@ void tcg_gen_op4(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg); void tcg_gen_op5(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg); void tcg_gen_op6(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg); +void vec_gen_2(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg); +void vec_gen_3(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg, TCGArg); +void vec_gen_4(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg, TCGArg, TCGArg); + static inline void tcg_gen_op1_i32(TCGOpcode opc, TCGv_i32 a1) { tcg_gen_op1(opc, tcgv_i32_arg(a1)); @@ -265,12 +269,12 @@ void tcg_gen_mb(TCGBar); void tcg_gen_addi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2); void tcg_gen_subfi_i32(TCGv_i32 ret, int32_t arg1, TCGv_i32 arg2); void tcg_gen_subi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2); -void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2); +void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2); void tcg_gen_ori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2); void tcg_gen_xori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2); -void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2); -void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2); -void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2); +void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2); +void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2); +void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2); void tcg_gen_muli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2); void tcg_gen_div_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2); void tcg_gen_rem_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2); @@ -454,12 +458,12 @@ static inline void tcg_gen_not_i32(TCGv_i32 ret, TCGv_i32 arg) void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2); void tcg_gen_subfi_i64(TCGv_i64 ret, int64_t arg1, TCGv_i64 
arg2); void tcg_gen_subi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2); -void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2); +void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2); void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2); void tcg_gen_xori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2); -void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2); -void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2); -void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2); +void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2); +void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2); +void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2); void tcg_gen_muli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2); void tcg_gen_div_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2); void tcg_gen_rem_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2); @@ -903,6 +907,36 @@ void tcg_gen_atomic_or_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp); void tcg_gen_atomic_xor_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp); void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp); +void tcg_gen_mov_vec(TCGv_vec, TCGv_vec); +void tcg_gen_dup_i32_vec(unsigned vece, TCGv_vec, TCGv_i32); +void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec, TCGv_i64); +void tcg_gen_dup8i_vec(TCGv_vec, uint32_t); +void tcg_gen_dup16i_vec(TCGv_vec, uint32_t); +void tcg_gen_dup32i_vec(TCGv_vec, uint32_t); +void tcg_gen_dup64i_vec(TCGv_vec, uint64_t); +void tcg_gen_dupi_vec(unsigned vece, TCGv_vec, uint64_t); +void tcg_gen_add_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b); +void tcg_gen_sub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b); +void tcg_gen_mul_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b); +void tcg_gen_and_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b); +void tcg_gen_or_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b); +void tcg_gen_xor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b); +void tcg_gen_andc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b); +void tcg_gen_orc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b); +void tcg_gen_not_vec(unsigned vece, TCGv_vec r, TCGv_vec a); +void tcg_gen_neg_vec(unsigned vece, TCGv_vec r, TCGv_vec a); + +void tcg_gen_shli_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i); +void tcg_gen_shri_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i); +void tcg_gen_sari_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i); + +void tcg_gen_cmp_vec(TCGCond cond, unsigned vece, TCGv_vec r, + TCGv_vec a, TCGv_vec b); + +void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset); +void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset); +void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t); + #if TARGET_LONG_BITS == 64 #define tcg_gen_movi_tl tcg_gen_movi_i64 #define tcg_gen_mov_tl tcg_gen_mov_i64 @@ -1001,6 +1035,7 @@ void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp); #define tcg_gen_atomic_and_fetch_tl tcg_gen_atomic_and_fetch_i64 #define tcg_gen_atomic_or_fetch_tl tcg_gen_atomic_or_fetch_i64 #define tcg_gen_atomic_xor_fetch_tl tcg_gen_atomic_xor_fetch_i64 +#define tcg_gen_dup_tl_vec tcg_gen_dup_i64_vec #else #define tcg_gen_movi_tl tcg_gen_movi_i32 #define tcg_gen_mov_tl tcg_gen_mov_i32 @@ -1098,6 +1133,7 @@ void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp); #define tcg_gen_atomic_and_fetch_tl tcg_gen_atomic_and_fetch_i32 
#define tcg_gen_atomic_or_fetch_tl tcg_gen_atomic_or_fetch_i32 #define tcg_gen_atomic_xor_fetch_tl tcg_gen_atomic_xor_fetch_i32 +#define tcg_gen_dup_tl_vec tcg_gen_dup_i32_vec #endif #if UINTPTR_MAX == UINT32_MAX diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h index 956fb1e9f3..d81a6c4535 100644 --- a/tcg/tcg-opc.h +++ b/tcg/tcg-opc.h @@ -204,8 +204,54 @@ DEF(qemu_ld_i64, DATA64_ARGS, TLADDR_ARGS, 1, DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT) +/* Host vector support. */ + +#define IMPLVEC TCG_OPF_VECTOR | IMPL(TCG_TARGET_MAYBE_vec) + +DEF(mov_vec, 1, 1, 0, TCG_OPF_VECTOR | TCG_OPF_NOT_PRESENT) +DEF(dupi_vec, 1, 0, 1, TCG_OPF_VECTOR | TCG_OPF_NOT_PRESENT) + +DEF(dup_vec, 1, 1, 0, IMPLVEC) +DEF(dup2_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_REG_BITS == 32)) + +DEF(ld_vec, 1, 1, 1, IMPLVEC) +DEF(st_vec, 0, 2, 1, IMPLVEC) + +DEF(add_vec, 1, 2, 0, IMPLVEC) +DEF(sub_vec, 1, 2, 0, IMPLVEC) +DEF(mul_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_mul_vec)) +DEF(neg_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec)) + +DEF(and_vec, 1, 2, 0, IMPLVEC) +DEF(or_vec, 1, 2, 0, IMPLVEC) +DEF(xor_vec, 1, 2, 0, IMPLVEC) +DEF(andc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_andc_vec)) +DEF(orc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_orc_vec)) +DEF(not_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_not_vec)) + +DEF(shli_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_shi_vec)) +DEF(shri_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_shi_vec)) +DEF(sari_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_shi_vec)) + +DEF(shls_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_shs_vec)) +DEF(shrs_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_shs_vec)) +DEF(sars_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_shs_vec)) + +DEF(shlv_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_shv_vec)) +DEF(shrv_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_shv_vec)) +DEF(sarv_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_shv_vec)) + +DEF(cmp_vec, 1, 2, 1, IMPLVEC) + +DEF(last_generic, 0, 0, 0, TCG_OPF_NOT_PRESENT) + +#if TCG_TARGET_MAYBE_vec +#include "tcg-target.opc.h" +#endif + #undef TLADDR_ARGS #undef DATA64_ARGS #undef IMPL #undef IMPL64 +#undef IMPLVEC #undef DEF diff --git a/tcg/tcg-pool.inc.c b/tcg/tcg-pool.inc.c index 8a85131405..7af5513ff3 100644 --- a/tcg/tcg-pool.inc.c +++ b/tcg/tcg-pool.inc.c @@ -22,39 +22,110 @@ typedef struct TCGLabelPoolData { struct TCGLabelPoolData *next; - tcg_target_ulong data; tcg_insn_unit *label; intptr_t addend; - int type; + int rtype; + unsigned nlong; + tcg_target_ulong data[]; } TCGLabelPoolData; -static void new_pool_label(TCGContext *s, tcg_target_ulong data, int type, - tcg_insn_unit *label, intptr_t addend) +static TCGLabelPoolData *new_pool_alloc(TCGContext *s, int nlong, int rtype, + tcg_insn_unit *label, intptr_t addend) { - TCGLabelPoolData *n = tcg_malloc(sizeof(*n)); - TCGLabelPoolData *i, **pp; + TCGLabelPoolData *n = tcg_malloc(sizeof(TCGLabelPoolData) + + sizeof(tcg_target_ulong) * nlong); - n->data = data; n->label = label; - n->type = type; n->addend = addend; + n->rtype = rtype; + n->nlong = nlong; + return n; +} + +static void new_pool_insert(TCGContext *s, TCGLabelPoolData *n) +{ + TCGLabelPoolData *i, **pp; + int nlong = n->nlong; /* Insertion sort on the pool. 
*/ - for (pp = &s->pool_labels; (i = *pp) && i->data < data; pp = &i->next) { - continue; + for (pp = &s->pool_labels; (i = *pp) != NULL; pp = &i->next) { + if (nlong > i->nlong) { + break; + } + if (nlong < i->nlong) { + continue; + } + if (memcmp(n->data, i->data, sizeof(tcg_target_ulong) * nlong) >= 0) { + break; + } } n->next = *pp; *pp = n; } +/* The "usual" for generic integer code. */ +static inline void new_pool_label(TCGContext *s, tcg_target_ulong d, int rtype, + tcg_insn_unit *label, intptr_t addend) +{ + TCGLabelPoolData *n = new_pool_alloc(s, 1, rtype, label, addend); + n->data[0] = d; + new_pool_insert(s, n); +} + +/* For v64 or v128, depending on the host. */ +static inline void new_pool_l2(TCGContext *s, int rtype, tcg_insn_unit *label, + intptr_t addend, tcg_target_ulong d0, + tcg_target_ulong d1) +{ + TCGLabelPoolData *n = new_pool_alloc(s, 2, rtype, label, addend); + n->data[0] = d0; + n->data[1] = d1; + new_pool_insert(s, n); +} + +/* For v128 or v256, depending on the host. */ +static inline void new_pool_l4(TCGContext *s, int rtype, tcg_insn_unit *label, + intptr_t addend, tcg_target_ulong d0, + tcg_target_ulong d1, tcg_target_ulong d2, + tcg_target_ulong d3) +{ + TCGLabelPoolData *n = new_pool_alloc(s, 4, rtype, label, addend); + n->data[0] = d0; + n->data[1] = d1; + n->data[2] = d2; + n->data[3] = d3; + new_pool_insert(s, n); +} + +/* For v256, for 32-bit host. */ +static inline void new_pool_l8(TCGContext *s, int rtype, tcg_insn_unit *label, + intptr_t addend, tcg_target_ulong d0, + tcg_target_ulong d1, tcg_target_ulong d2, + tcg_target_ulong d3, tcg_target_ulong d4, + tcg_target_ulong d5, tcg_target_ulong d6, + tcg_target_ulong d7) +{ + TCGLabelPoolData *n = new_pool_alloc(s, 8, rtype, label, addend); + n->data[0] = d0; + n->data[1] = d1; + n->data[2] = d2; + n->data[3] = d3; + n->data[4] = d4; + n->data[5] = d5; + n->data[6] = d6; + n->data[7] = d7; + new_pool_insert(s, n); +} + /* To be provided by cpu/tcg-target.inc.c. */ static void tcg_out_nop_fill(tcg_insn_unit *p, int count); static bool tcg_out_pool_finalize(TCGContext *s) { TCGLabelPoolData *p = s->pool_labels; - tcg_target_ulong d, *a; + TCGLabelPoolData *l = NULL; + void *a; if (p == NULL) { return true; @@ -62,24 +133,24 @@ static bool tcg_out_pool_finalize(TCGContext *s) /* ??? Round up to qemu_icache_linesize, but then do not round again when allocating the next TranslationBlock structure. */ - a = (void *)ROUND_UP((uintptr_t)s->code_ptr, sizeof(tcg_target_ulong)); + a = (void *)ROUND_UP((uintptr_t)s->code_ptr, + sizeof(tcg_target_ulong) * p->nlong); tcg_out_nop_fill(s->code_ptr, (tcg_insn_unit *)a - s->code_ptr); s->data_gen_ptr = a; - /* Ensure the first comparison fails. 
*/ - d = p->data + 1; - for (; p != NULL; p = p->next) { - if (p->data != d) { - d = p->data; - if (unlikely((void *)a > s->code_gen_highwater)) { + size_t size = sizeof(tcg_target_ulong) * p->nlong; + if (!l || l->nlong != p->nlong || memcmp(l->data, p->data, size)) { + if (unlikely(a > s->code_gen_highwater)) { return false; } - *a++ = d; + memcpy(a, p->data, size); + a += size; + l = p; } - patch_reloc(p->label, p->type, (intptr_t)(a - 1), p->addend); + patch_reloc(p->label, p->rtype, (intptr_t)a - size, p->addend); } - s->code_ptr = (void *)a; + s->code_ptr = a; return true; } @@ -106,6 +106,18 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg ret, tcg_target_long arg); static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *const_args); +#if TCG_TARGET_MAYBE_vec +static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, unsigned vecl, + unsigned vece, const TCGArg *args, + const int *const_args); +#else +static inline void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, unsigned vecl, + unsigned vece, const TCGArg *args, + const int *const_args) +{ + g_assert_not_reached(); +} +#endif static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, TCGReg arg1, intptr_t arg2); static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, @@ -146,8 +158,7 @@ struct tcg_region_state { }; static struct tcg_region_state region; - -static TCGRegSet tcg_target_available_regs[2]; +static TCGRegSet tcg_target_available_regs[TCG_TYPE_COUNT]; static TCGRegSet tcg_target_call_clobber_regs; #if TCG_TARGET_INSN_UNIT_SIZE == 1 @@ -1026,6 +1037,41 @@ TCGv_i64 tcg_temp_new_internal_i64(int temp_local) return temp_tcgv_i64(t); } +TCGv_vec tcg_temp_new_vec(TCGType type) +{ + TCGTemp *t; + +#ifdef CONFIG_DEBUG_TCG + switch (type) { + case TCG_TYPE_V64: + assert(TCG_TARGET_HAS_v64); + break; + case TCG_TYPE_V128: + assert(TCG_TARGET_HAS_v128); + break; + case TCG_TYPE_V256: + assert(TCG_TARGET_HAS_v256); + break; + default: + g_assert_not_reached(); + } +#endif + + t = tcg_temp_new_internal(type, 0); + return temp_tcgv_vec(t); +} + +/* Create a new temp of the same type as an existing temp. */ +TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match) +{ + TCGTemp *t = tcgv_vec_temp(match); + + tcg_debug_assert(t->temp_allocated != 0); + + t = tcg_temp_new_internal(t->base_type, 0); + return temp_tcgv_vec(t); +} + static void tcg_temp_free_internal(TCGTemp *ts) { TCGContext *s = tcg_ctx; @@ -1057,6 +1103,11 @@ void tcg_temp_free_i64(TCGv_i64 arg) tcg_temp_free_internal(tcgv_i64_temp(arg)); } +void tcg_temp_free_vec(TCGv_vec arg) +{ + tcg_temp_free_internal(tcgv_vec_temp(arg)); +} + TCGv_i32 tcg_const_i32(int32_t val) { TCGv_i32 t0; @@ -1114,6 +1165,9 @@ int tcg_check_temp_count(void) Test the runtime variable that controls each opcode. 
*/ bool tcg_op_supported(TCGOpcode op) { + const bool have_vec + = TCG_TARGET_HAS_v64 | TCG_TARGET_HAS_v128 | TCG_TARGET_HAS_v256; + switch (op) { case INDEX_op_discard: case INDEX_op_set_label: @@ -1327,10 +1381,47 @@ bool tcg_op_supported(TCGOpcode op) case INDEX_op_mulsh_i64: return TCG_TARGET_HAS_mulsh_i64; - case NB_OPS: - break; + case INDEX_op_mov_vec: + case INDEX_op_dup_vec: + case INDEX_op_dupi_vec: + case INDEX_op_ld_vec: + case INDEX_op_st_vec: + case INDEX_op_add_vec: + case INDEX_op_sub_vec: + case INDEX_op_and_vec: + case INDEX_op_or_vec: + case INDEX_op_xor_vec: + case INDEX_op_cmp_vec: + return have_vec; + case INDEX_op_dup2_vec: + return have_vec && TCG_TARGET_REG_BITS == 32; + case INDEX_op_not_vec: + return have_vec && TCG_TARGET_HAS_not_vec; + case INDEX_op_neg_vec: + return have_vec && TCG_TARGET_HAS_neg_vec; + case INDEX_op_andc_vec: + return have_vec && TCG_TARGET_HAS_andc_vec; + case INDEX_op_orc_vec: + return have_vec && TCG_TARGET_HAS_orc_vec; + case INDEX_op_mul_vec: + return have_vec && TCG_TARGET_HAS_mul_vec; + case INDEX_op_shli_vec: + case INDEX_op_shri_vec: + case INDEX_op_sari_vec: + return have_vec && TCG_TARGET_HAS_shi_vec; + case INDEX_op_shls_vec: + case INDEX_op_shrs_vec: + case INDEX_op_sars_vec: + return have_vec && TCG_TARGET_HAS_shs_vec; + case INDEX_op_shlv_vec: + case INDEX_op_shrv_vec: + case INDEX_op_sarv_vec: + return have_vec && TCG_TARGET_HAS_shv_vec; + + default: + tcg_debug_assert(op > INDEX_op_last_generic && op < NB_OPS); + return true; } - g_assert_not_reached(); } /* Note: we convert the 64 bit args to 32 bit and do some alignment @@ -1661,6 +1752,11 @@ void tcg_dump_ops(TCGContext *s) nb_iargs = def->nb_iargs; nb_cargs = def->nb_cargs; + if (def->flags & TCG_OPF_VECTOR) { + col += qemu_log("v%d,e%d,", 64 << TCGOP_VECL(op), + 8 << TCGOP_VECE(op)); + } + k = 0; for (i = 0; i < nb_oargs; i++) { if (k != 0) { @@ -1685,6 +1781,7 @@ void tcg_dump_ops(TCGContext *s) case INDEX_op_brcond_i64: case INDEX_op_setcond_i64: case INDEX_op_movcond_i64: + case INDEX_op_cmp_vec: if (op->args[k] < ARRAY_SIZE(cond_name) && cond_name[op->args[k]]) { col += qemu_log(",%s", cond_name[op->args[k++]]); @@ -2890,8 +2987,13 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op) } /* emit instruction */ - tcg_out_op(s, op->opc, new_args, const_args); - + if (def->flags & TCG_OPF_VECTOR) { + tcg_out_vec_op(s, op->opc, TCGOP_VECL(op), TCGOP_VECE(op), + new_args, const_args); + } else { + tcg_out_op(s, op->opc, new_args, const_args); + } + /* move the outputs in the correct register if needed */ for(i = 0; i < nb_oargs; i++) { ts = arg_temp(op->args[i]); @@ -3239,10 +3341,12 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) switch (opc) { case INDEX_op_mov_i32: case INDEX_op_mov_i64: + case INDEX_op_mov_vec: tcg_reg_alloc_mov(s, op); break; case INDEX_op_movi_i32: case INDEX_op_movi_i64: + case INDEX_op_dupi_vec: tcg_reg_alloc_movi(s, op); break; case INDEX_op_insn_start: @@ -3645,3 +3749,10 @@ void tcg_register_jit(void *buf, size_t buf_size) { } #endif /* ELF_HOST_MACHINE */ + +#if !TCG_TARGET_MAYBE_vec +void tcg_expand_vec_op(TCGOpcode o, TCGType t, unsigned e, TCGArg a0, ...) 
+{ + g_assert_not_reached(); +} +#endif @@ -170,6 +170,31 @@ typedef uint64_t TCGRegSet; # error "Missing unsigned widening multiply" #endif +#if !defined(TCG_TARGET_HAS_v64) \ + && !defined(TCG_TARGET_HAS_v128) \ + && !defined(TCG_TARGET_HAS_v256) +#define TCG_TARGET_MAYBE_vec 0 +#define TCG_TARGET_HAS_neg_vec 0 +#define TCG_TARGET_HAS_not_vec 0 +#define TCG_TARGET_HAS_andc_vec 0 +#define TCG_TARGET_HAS_orc_vec 0 +#define TCG_TARGET_HAS_shi_vec 0 +#define TCG_TARGET_HAS_shs_vec 0 +#define TCG_TARGET_HAS_shv_vec 0 +#define TCG_TARGET_HAS_mul_vec 0 +#else +#define TCG_TARGET_MAYBE_vec 1 +#endif +#ifndef TCG_TARGET_HAS_v64 +#define TCG_TARGET_HAS_v64 0 +#endif +#ifndef TCG_TARGET_HAS_v128 +#define TCG_TARGET_HAS_v128 0 +#endif +#ifndef TCG_TARGET_HAS_v256 +#define TCG_TARGET_HAS_v256 0 +#endif + #ifndef TARGET_INSN_START_EXTRA_WORDS # define TARGET_INSN_START_WORDS 1 #else @@ -246,6 +271,11 @@ typedef struct TCGPool { typedef enum TCGType { TCG_TYPE_I32, TCG_TYPE_I64, + + TCG_TYPE_V64, + TCG_TYPE_V128, + TCG_TYPE_V256, + TCG_TYPE_COUNT, /* number of different types */ /* An alias for the size of the host register. */ @@ -396,6 +426,8 @@ typedef tcg_target_ulong TCGArg; * TCGv_i32 : 32 bit integer type * TCGv_i64 : 64 bit integer type * TCGv_ptr : a host pointer type + * TCGv_vec : a host vector type; the exact size is not exposed + to the CPU front-end code. * TCGv : an integer type the same size as target_ulong (an alias for either TCGv_i32 or TCGv_i64) The compiler's type checking will complain if you mix them @@ -418,6 +450,7 @@ typedef tcg_target_ulong TCGArg; typedef struct TCGv_i32_d *TCGv_i32; typedef struct TCGv_i64_d *TCGv_i64; typedef struct TCGv_ptr_d *TCGv_ptr; +typedef struct TCGv_vec_d *TCGv_vec; typedef TCGv_ptr TCGv_env; #if TARGET_LONG_BITS == 32 #define TCGv TCGv_i32 @@ -589,6 +622,9 @@ typedef struct TCGOp { #define TCGOP_CALLI(X) (X)->param1 #define TCGOP_CALLO(X) (X)->param2 +#define TCGOP_VECL(X) (X)->param1 +#define TCGOP_VECE(X) (X)->param2 + /* Make sure operands fit in the bitfields above. 
*/ QEMU_BUILD_BUG_ON(NB_OPS > (1 << 8)); @@ -726,6 +762,11 @@ static inline TCGTemp *tcgv_ptr_temp(TCGv_ptr v) return tcgv_i32_temp((TCGv_i32)v); } +static inline TCGTemp *tcgv_vec_temp(TCGv_vec v) +{ + return tcgv_i32_temp((TCGv_i32)v); +} + static inline TCGArg tcgv_i32_arg(TCGv_i32 v) { return temp_arg(tcgv_i32_temp(v)); @@ -741,6 +782,11 @@ static inline TCGArg tcgv_ptr_arg(TCGv_ptr v) return temp_arg(tcgv_ptr_temp(v)); } +static inline TCGArg tcgv_vec_arg(TCGv_vec v) +{ + return temp_arg(tcgv_vec_temp(v)); +} + static inline TCGv_i32 temp_tcgv_i32(TCGTemp *t) { (void)temp_idx(t); /* trigger embedded assert */ @@ -757,6 +803,11 @@ static inline TCGv_ptr temp_tcgv_ptr(TCGTemp *t) return (TCGv_ptr)temp_tcgv_i32(t); } +static inline TCGv_vec temp_tcgv_vec(TCGTemp *t) +{ + return (TCGv_vec)temp_tcgv_i32(t); +} + #if TCG_TARGET_REG_BITS == 32 static inline TCGv_i32 TCGV_LOW(TCGv_i64 t) { @@ -832,9 +883,12 @@ TCGTemp *tcg_global_mem_new_internal(TCGType, TCGv_ptr, TCGv_i32 tcg_temp_new_internal_i32(int temp_local); TCGv_i64 tcg_temp_new_internal_i64(int temp_local); +TCGv_vec tcg_temp_new_vec(TCGType type); +TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match); void tcg_temp_free_i32(TCGv_i32 arg); void tcg_temp_free_i64(TCGv_i64 arg); +void tcg_temp_free_vec(TCGv_vec arg); static inline TCGv_i32 tcg_global_mem_new_i32(TCGv_ptr reg, intptr_t offset, const char *name) @@ -916,6 +970,8 @@ enum { /* Instruction is optional and not implemented by the host, or insn is generic and should not be implemened by the host. */ TCG_OPF_NOT_PRESENT = 0x10, + /* Instruction operands are vectors. */ + TCG_OPF_VECTOR = 0x20, }; typedef struct TCGOpDef { @@ -981,6 +1037,10 @@ TCGv_i32 tcg_const_i32(int32_t val); TCGv_i64 tcg_const_i64(int64_t val); TCGv_i32 tcg_const_local_i32(int32_t val); TCGv_i64 tcg_const_local_i64(int64_t val); +TCGv_vec tcg_const_zeros_vec(TCGType); +TCGv_vec tcg_const_ones_vec(TCGType); +TCGv_vec tcg_const_zeros_vec_matching(TCGv_vec); +TCGv_vec tcg_const_ones_vec_matching(TCGv_vec); TCGLabel *gen_new_label(void); @@ -1151,6 +1211,33 @@ uintptr_t tcg_qemu_tb_exec(CPUArchState *env, uint8_t *tb_ptr); void tcg_register_jit(void *buf, size_t buf_size); +#if TCG_TARGET_MAYBE_vec +/* Return zero if the tuple (opc, type, vece) is unsupportable; + return > 0 if it is directly supportable; + return < 0 if we must call tcg_expand_vec_op. */ +int tcg_can_emit_vec_op(TCGOpcode, TCGType, unsigned); +#else +static inline int tcg_can_emit_vec_op(TCGOpcode o, TCGType t, unsigned ve) +{ + return 0; +} +#endif + +/* Expand the tuple (opc, type, vece) on the given arguments. */ +void tcg_expand_vec_op(TCGOpcode, TCGType, unsigned, TCGArg, ...); + +/* Replicate a constant C accoring to the log2 of the element size. */ +uint64_t dup_const(unsigned vece, uint64_t c); + +#define dup_const(VECE, C) \ + (__builtin_constant_p(VECE) \ + ? ( (VECE) == MO_8 ? 0x0101010101010101ull * (uint8_t)(C) \ + : (VECE) == MO_16 ? 0x0001000100010001ull * (uint16_t)(C) \ + : (VECE) == MO_32 ? 0x0000000100000001ull * (uint32_t)(C) \ + : dup_const(VECE, C)) \ + : dup_const(VECE, C)) + + /* * Memory helpers that will be used by TCG generated code. */ |
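tcg_gen_dup64i_vec() above picks between a 32-bit and a 64-bit dupi by testing a == deposit64(a, 32, 32, a), which holds exactly when the high half of the constant equals its low half. A small stand-alone check of that predicate; deposit64() is re-implemented here for just this 32/32 case (the real helper comes from QEMU's bitops and is not part of this patch):

#include <assert.h>
#include <stdint.h>

/* Minimal deposit64(value, pos, len, field): replace len bits of value
 * starting at pos with the low len bits of field.  Only valid for
 * 0 < len < 64, which covers the 32/32 use above. */
static uint64_t deposit64(uint64_t value, int pos, int len, uint64_t field)
{
    uint64_t mask = (~0ull >> (64 - len)) << pos;
    return (value & ~mask) | ((field << pos) & mask);
}

int main(void)
{
    /* High half equals low half: eligible for the MO_32 dupi path. */
    assert(0x1234567812345678ull ==
           deposit64(0x1234567812345678ull, 32, 32, 0x1234567812345678ull));

    /* Halves differ: falls through to the MO_64 path, or to a dup from
     * a TCGv_i64 on a 32-bit host. */
    assert(0xdeadbeef00000001ull !=
           deposit64(0xdeadbeef00000001ull, 32, 32, 0xdeadbeef00000001ull));
    return 0;
}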
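do_shifti(), tcg_gen_cmp_vec() and tcg_gen_mul_vec() above all dispatch on the sign of tcg_can_emit_vec_op(): a positive result means the backend implements the (opc, type, vece) tuple directly, a negative one means it has to be synthesized via tcg_expand_vec_op(), and zero means the tuple is unsupportable and should never have been requested. A compressed sketch of that pattern, with the TCG types and the two backend hooks replaced by trivial stubs so it compiles on its own:

#include <assert.h>
#include <stdio.h>

typedef int TCGOpcode;   /* stand-ins for the real TCG definitions */
typedef int TCGType;
typedef long TCGArg;

/* Backend's answer for (opc, type, vece): >0 direct, <0 expandable, 0 never. */
static int can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
{
    (void)opc; (void)type; (void)vece;
    return -1;                      /* pretend we need the generic expansion */
}

static void emit_native(TCGOpcode opc, TCGArg r, TCGArg a, TCGArg b)
{
    printf("native opc %d: %ld <- %ld, %ld\n", opc, r, a, b);
}

static void expand_op(TCGOpcode opc, TCGArg r, TCGArg a, TCGArg b)
{
    printf("expanded opc %d: %ld <- %ld, %ld\n", opc, r, a, b);
}

static void gen_binary_vec(TCGOpcode opc, TCGType type, unsigned vece,
                           TCGArg r, TCGArg a, TCGArg b)
{
    int can = can_emit_vec_op(opc, type, vece);

    if (can > 0) {
        emit_native(opc, r, a, b);  /* backend has a native encoding */
    } else {
        assert(can < 0);            /* 0 would be a front-end bug */
        expand_op(opc, r, a, b);    /* build it out of supported vector ops */
    }
}

int main(void)
{
    gen_binary_vec(1 /* opc */, 3 /* type */, 2 /* vece */, 0, 1, 2);
    return 0;
}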
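vec_gen_2/3/4() above record the vector shape in the op itself: param1 (TCGOP_VECL) holds the type biased by TCG_TYPE_V64 and param2 (TCGOP_VECE) the log2 of the element size, which is how tcg_dump_ops() can print them back as 64 << VECL and 8 << VECE. A tiny worked example, using stand-in enums that mirror the order declared in the tcg.h hunk:

#include <stdio.h>

/* Mirrors the TCGType order added above (I32, I64, V64, V128, V256) and the
 * usual MO_8..MO_64 = 0..3 log2 element sizes; local stand-ins, not the
 * real headers. */
enum { TCG_TYPE_I32, TCG_TYPE_I64, TCG_TYPE_V64, TCG_TYPE_V128, TCG_TYPE_V256 };
enum { MO_8, MO_16, MO_32, MO_64 };

int main(void)
{
    /* What vec_gen_3() would store for a 128-bit op on 32-bit elements. */
    unsigned vecl = TCG_TYPE_V128 - TCG_TYPE_V64;   /* == 1 */
    unsigned vece = MO_32;                          /* == 2 */

    /* What tcg_dump_ops() prints for that op. */
    printf("v%d,e%d\n", 64 << vecl, 8 << vece);     /* "v128,e32" */
    return 0;
}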
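The dup_const() macro at the end of the tcg.h hunk folds the replication at compile time when the element size is a compile-time constant, multiplying the truncated constant by a lane-repeating pattern, and otherwise calls the out-of-line helper. A stand-alone sketch of the values it produces (assuming the usual MO_8..MO_64 = 0..3 encoding; MO_64 simply returns the constant unchanged):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

enum { MO_8, MO_16, MO_32, MO_64 };

/* Same arithmetic as the constant-folding arm of the dup_const() macro. */
static uint64_t dup_const_sketch(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:  return 0x0101010101010101ull * (uint8_t)c;
    case MO_16: return 0x0001000100010001ull * (uint16_t)c;
    case MO_32: return 0x0000000100000001ull * (uint32_t)c;
    default:    return c;
    }
}

int main(void)
{
    printf("%016" PRIx64 "\n", dup_const_sketch(MO_8,  0x7f));       /* 7f7f7f7f7f7f7f7f */
    printf("%016" PRIx64 "\n", dup_const_sketch(MO_16, 0x1234));     /* 1234123412341234 */
    printf("%016" PRIx64 "\n", dup_const_sketch(MO_32, 0x80000001)); /* 8000000180000001 */
    return 0;
}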