aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Maydell <peter.maydell@linaro.org>2018-02-08 17:41:15 +0000
committerPeter Maydell <peter.maydell@linaro.org>2018-02-08 17:41:15 +0000
commit04bb7fe2bf55bdf66d5b7a5a719b40bbb4048178 (patch)
treed6352968c57e2255ef66f0ba4a696ba82cc9f3ca
parent008a51bbb343972dd8cf09126da8c3b87f4e1c96 (diff)
parent14e4c1e2355473ccb2939afc69ac8f25de103b92 (diff)
Merge remote-tracking branch 'remotes/rth/tags/pull-tcg-20180208' into staging
tcg generic vectors # gpg: Signature made Thu 08 Feb 2018 16:47:16 GMT # gpg: using RSA key 64DF38E8AF7E215F # gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" # Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A 05C0 64DF 38E8 AF7E 215F * remotes/rth/tags/pull-tcg-20180208: tcg/aarch64: Add vector operations tcg/i386: Add vector operations target/arm: Use vector infrastructure for aa64 orr/bic immediate target/arm: Use vector infrastructure for aa64 multiplies target/arm: Use vector infrastructure for aa64 compares target/arm: Use vector infrastructure for aa64 constant shifts target/arm: Use vector infrastructure for aa64 dup/movi target/arm: Use vector infrastructure for aa64 mov/not/neg target/arm: Use vector infrastructure for aa64 add/sub/logic target/arm: Align vector registers tcg/optimize: Handle vector opcodes during optimize tcg: Add generic vector helpers with a scalar operand tcg: Add generic helpers for saturating arithmetic tcg: Add generic vector ops for multiplication tcg: Add generic vector ops for comparisons tcg: Add generic vector ops for constant shifts tcg: Add generic vector expanders tcg: Standardize integral arguments to expanders tcg: Add types and basic operations for host vectors tcg: Allow multiple word entries into the constant pool Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r--Makefile.target4
-rw-r--r--accel/tcg/Makefile.objs2
-rw-r--r--accel/tcg/tcg-runtime-gvec.c997
-rw-r--r--accel/tcg/tcg-runtime.h118
-rwxr-xr-xconfigure48
-rw-r--r--target/arm/cpu.h2
-rw-r--r--target/arm/translate-a64.c975
-rw-r--r--tcg/README86
-rw-r--r--tcg/aarch64/tcg-target.h25
-rw-r--r--tcg/aarch64/tcg-target.inc.c588
-rw-r--r--tcg/aarch64/tcg-target.opc.h3
-rw-r--r--tcg/i386/tcg-target.h41
-rw-r--r--tcg/i386/tcg-target.inc.c987
-rw-r--r--tcg/i386/tcg-target.opc.h13
-rw-r--r--tcg/optimize.c150
-rw-r--r--tcg/tcg-gvec-desc.h49
-rw-r--r--tcg/tcg-op-gvec.c2216
-rw-r--r--tcg/tcg-op-gvec.h306
-rw-r--r--tcg/tcg-op-vec.c389
-rw-r--r--tcg/tcg-op.c42
-rw-r--r--tcg/tcg-op.h52
-rw-r--r--tcg/tcg-opc.h46
-rw-r--r--tcg/tcg-pool.inc.c113
-rw-r--r--tcg/tcg.c125
-rw-r--r--tcg/tcg.h87
25 files changed, 6969 insertions, 495 deletions
diff --git a/Makefile.target b/Makefile.target
index f9a9da7e7c..6549481096 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -93,8 +93,8 @@ all: $(PROGS) stap
# cpu emulator library
obj-y += exec.o
obj-y += accel/
-obj-$(CONFIG_TCG) += tcg/tcg.o tcg/tcg-op.o tcg/optimize.o
-obj-$(CONFIG_TCG) += tcg/tcg-common.o
+obj-$(CONFIG_TCG) += tcg/tcg.o tcg/tcg-op.o tcg/tcg-op-vec.o tcg/tcg-op-gvec.o
+obj-$(CONFIG_TCG) += tcg/tcg-common.o tcg/optimize.o
obj-$(CONFIG_TCG_INTERPRETER) += tcg/tci.o
obj-$(CONFIG_TCG_INTERPRETER) += disas/tci.o
obj-y += fpu/softfloat.o
diff --git a/accel/tcg/Makefile.objs b/accel/tcg/Makefile.objs
index 228cd84fa4..d381a02f34 100644
--- a/accel/tcg/Makefile.objs
+++ b/accel/tcg/Makefile.objs
@@ -1,6 +1,6 @@
obj-$(CONFIG_SOFTMMU) += tcg-all.o
obj-$(CONFIG_SOFTMMU) += cputlb.o
-obj-y += tcg-runtime.o
+obj-y += tcg-runtime.o tcg-runtime-gvec.o
obj-y += cpu-exec.o cpu-exec-common.o translate-all.o
obj-y += translator.o
diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c
new file mode 100644
index 0000000000..8bf8d63912
--- /dev/null
+++ b/accel/tcg/tcg-runtime-gvec.c
@@ -0,0 +1,997 @@
+/*
+ * Generic vectorized operation runtime
+ *
+ * Copyright (c) 2018 Linaro
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+#include "cpu.h"
+#include "exec/helper-proto.h"
+#include "tcg-gvec-desc.h"
+
+
+/* Virtually all hosts support 16-byte vectors. Those that don't can emulate
+ * them via GCC's generic vector extension. This turns out to be simpler and
+ * more reliable than getting the compiler to autovectorize.
+ *
+ * In tcg-op-gvec.c, we asserted that both the size and alignment of the data
+ * are multiples of 16.
+ *
+ * When the compiler does not support all of the operations we require, the
+ * loops are written so that we can always fall back on the base types.
+ */
+/* With CONFIG_VECTOR16 each vecN/svecN is a 16-byte GCC vector of unsigned/
+   signed N-bit elements, and DUPn(X) is a brace initializer repeating X n
+   times (n = number of elements in the matching vector type).  */
+#ifdef CONFIG_VECTOR16
+typedef uint8_t vec8 __attribute__((vector_size(16)));
+typedef uint16_t vec16 __attribute__((vector_size(16)));
+typedef uint32_t vec32 __attribute__((vector_size(16)));
+typedef uint64_t vec64 __attribute__((vector_size(16)));
+
+typedef int8_t svec8 __attribute__((vector_size(16)));
+typedef int16_t svec16 __attribute__((vector_size(16)));
+typedef int32_t svec32 __attribute__((vector_size(16)));
+typedef int64_t svec64 __attribute__((vector_size(16)));
+
+#define DUP16(X) { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X }
+#define DUP8(X) { X, X, X, X, X, X, X, X }
+#define DUP4(X) { X, X, X, X }
+#define DUP2(X) { X, X }
+#else
+/* Fallback: each "vector" type is just the scalar element type, so
+   sizeof(vecN) is the element size and DUPn(X) is simply X.  */
+typedef uint8_t vec8;
+typedef uint16_t vec16;
+typedef uint32_t vec32;
+typedef uint64_t vec64;
+
+typedef int8_t svec8;
+typedef int16_t svec16;
+typedef int32_t svec32;
+typedef int64_t svec64;
+
+#define DUP16(X) X
+#define DUP8(X) X
+#define DUP4(X) X
+#define DUP2(X) X
+#endif /* CONFIG_VECTOR16 */
+
+/* Zero the tail of the destination: every 8-byte word from OPRSZ up to
+   simd_maxsz(DESC).  Nothing is written when maxsz does not exceed OPRSZ.  */
+static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc)
+{
+    const intptr_t max_sz = simd_maxsz(desc);
+    intptr_t ofs = oprsz;
+
+    if (unlikely(max_sz > oprsz)) {
+        while (ofs < max_sz) {
+            *(uint64_t *)(d + ofs) = 0;
+            ofs += sizeof(uint64_t);
+        }
+    }
+}
+
+/* d = a + b on each 8-bit element of the first simd_oprsz(desc) bytes;
+   the remainder of d is handled by clear_high().  */
+void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec8 lhs = *(vec8 *)(a + ofs);
+        vec8 rhs = *(vec8 *)(b + ofs);
+
+        *(vec8 *)(d + ofs) = lhs + rhs;
+        ofs += sizeof(vec8);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a + b on each 16-bit element of the first simd_oprsz(desc) bytes;
+   the remainder of d is handled by clear_high().  */
+void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec16 lhs = *(vec16 *)(a + ofs);
+        vec16 rhs = *(vec16 *)(b + ofs);
+
+        *(vec16 *)(d + ofs) = lhs + rhs;
+        ofs += sizeof(vec16);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a + b on each 32-bit element of the first simd_oprsz(desc) bytes;
+   the remainder of d is handled by clear_high().  */
+void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec32 lhs = *(vec32 *)(a + ofs);
+        vec32 rhs = *(vec32 *)(b + ofs);
+
+        *(vec32 *)(d + ofs) = lhs + rhs;
+        ofs += sizeof(vec32);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a + b on each 64-bit element of the first simd_oprsz(desc) bytes;
+   the remainder of d is handled by clear_high().  */
+void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec64 lhs = *(vec64 *)(a + ofs);
+        vec64 rhs = *(vec64 *)(b + ofs);
+
+        *(vec64 *)(d + ofs) = lhs + rhs;
+        ofs += sizeof(vec64);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a + b, with the scalar b converted to vec8 (replicated into every
+   8-bit element when vectors are enabled).  */
+void HELPER(gvec_adds8)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const vec8 splat = (vec8)DUP16(b);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(vec8)) {
+        vec8 src = *(vec8 *)(a + ofs);
+
+        *(vec8 *)(d + ofs) = src + splat;
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a + b, with the scalar b converted to vec16 (replicated into every
+   16-bit element when vectors are enabled).  */
+void HELPER(gvec_adds16)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const vec16 splat = (vec16)DUP8(b);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(vec16)) {
+        vec16 src = *(vec16 *)(a + ofs);
+
+        *(vec16 *)(d + ofs) = src + splat;
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a + b, with the scalar b converted to vec32 (replicated into every
+   32-bit element when vectors are enabled).  */
+void HELPER(gvec_adds32)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const vec32 splat = (vec32)DUP4(b);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(vec32)) {
+        vec32 src = *(vec32 *)(a + ofs);
+
+        *(vec32 *)(d + ofs) = src + splat;
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a + b, with the scalar b converted to vec64 (replicated into every
+   64-bit element when vectors are enabled).  */
+void HELPER(gvec_adds64)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const vec64 splat = (vec64)DUP2(b);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(vec64)) {
+        vec64 src = *(vec64 *)(a + ofs);
+
+        *(vec64 *)(d + ofs) = src + splat;
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a - b on each 8-bit element of the first simd_oprsz(desc) bytes;
+   the remainder of d is handled by clear_high().  */
+void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec8 lhs = *(vec8 *)(a + ofs);
+        vec8 rhs = *(vec8 *)(b + ofs);
+
+        *(vec8 *)(d + ofs) = lhs - rhs;
+        ofs += sizeof(vec8);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a - b on each 16-bit element of the first simd_oprsz(desc) bytes;
+   the remainder of d is handled by clear_high().  */
+void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec16 lhs = *(vec16 *)(a + ofs);
+        vec16 rhs = *(vec16 *)(b + ofs);
+
+        *(vec16 *)(d + ofs) = lhs - rhs;
+        ofs += sizeof(vec16);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a - b on each 32-bit element of the first simd_oprsz(desc) bytes;
+   the remainder of d is handled by clear_high().  */
+void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec32 lhs = *(vec32 *)(a + ofs);
+        vec32 rhs = *(vec32 *)(b + ofs);
+
+        *(vec32 *)(d + ofs) = lhs - rhs;
+        ofs += sizeof(vec32);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a - b on each 64-bit element of the first simd_oprsz(desc) bytes;
+   the remainder of d is handled by clear_high().  */
+void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec64 lhs = *(vec64 *)(a + ofs);
+        vec64 rhs = *(vec64 *)(b + ofs);
+
+        *(vec64 *)(d + ofs) = lhs - rhs;
+        ofs += sizeof(vec64);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a - b, with the scalar b converted to vec8 (replicated into every
+   8-bit element when vectors are enabled).  */
+void HELPER(gvec_subs8)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const vec8 splat = (vec8)DUP16(b);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(vec8)) {
+        vec8 src = *(vec8 *)(a + ofs);
+
+        *(vec8 *)(d + ofs) = src - splat;
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a - b, with the scalar b converted to vec16 (replicated into every
+   16-bit element when vectors are enabled).  */
+void HELPER(gvec_subs16)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const vec16 splat = (vec16)DUP8(b);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(vec16)) {
+        vec16 src = *(vec16 *)(a + ofs);
+
+        *(vec16 *)(d + ofs) = src - splat;
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a - b, with the scalar b converted to vec32 (replicated into every
+   32-bit element when vectors are enabled).  */
+void HELPER(gvec_subs32)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const vec32 splat = (vec32)DUP4(b);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(vec32)) {
+        vec32 src = *(vec32 *)(a + ofs);
+
+        *(vec32 *)(d + ofs) = src - splat;
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a - b, with the scalar b converted to vec64 (replicated into every
+   64-bit element when vectors are enabled).  */
+void HELPER(gvec_subs64)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const vec64 splat = (vec64)DUP2(b);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(vec64)) {
+        vec64 src = *(vec64 *)(a + ofs);
+
+        *(vec64 *)(d + ofs) = src - splat;
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a * b on each 8-bit element of the first simd_oprsz(desc) bytes;
+   the remainder of d is handled by clear_high().  */
+void HELPER(gvec_mul8)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec8 lhs = *(vec8 *)(a + ofs);
+        vec8 rhs = *(vec8 *)(b + ofs);
+
+        *(vec8 *)(d + ofs) = lhs * rhs;
+        ofs += sizeof(vec8);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a * b on each 16-bit element of the first simd_oprsz(desc) bytes;
+   the remainder of d is handled by clear_high().  */
+void HELPER(gvec_mul16)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec16 lhs = *(vec16 *)(a + ofs);
+        vec16 rhs = *(vec16 *)(b + ofs);
+
+        *(vec16 *)(d + ofs) = lhs * rhs;
+        ofs += sizeof(vec16);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a * b on each 32-bit element of the first simd_oprsz(desc) bytes;
+   the remainder of d is handled by clear_high().  */
+void HELPER(gvec_mul32)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec32 lhs = *(vec32 *)(a + ofs);
+        vec32 rhs = *(vec32 *)(b + ofs);
+
+        *(vec32 *)(d + ofs) = lhs * rhs;
+        ofs += sizeof(vec32);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a * b on each 64-bit element of the first simd_oprsz(desc) bytes;
+   the remainder of d is handled by clear_high().  */
+void HELPER(gvec_mul64)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec64 lhs = *(vec64 *)(a + ofs);
+        vec64 rhs = *(vec64 *)(b + ofs);
+
+        *(vec64 *)(d + ofs) = lhs * rhs;
+        ofs += sizeof(vec64);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a * b, with the scalar b converted to vec8 (replicated into every
+   8-bit element when vectors are enabled).  */
+void HELPER(gvec_muls8)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const vec8 splat = (vec8)DUP16(b);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(vec8)) {
+        vec8 src = *(vec8 *)(a + ofs);
+
+        *(vec8 *)(d + ofs) = src * splat;
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a * b, with the scalar b converted to vec16 (replicated into every
+   16-bit element when vectors are enabled).  */
+void HELPER(gvec_muls16)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const vec16 splat = (vec16)DUP8(b);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(vec16)) {
+        vec16 src = *(vec16 *)(a + ofs);
+
+        *(vec16 *)(d + ofs) = src * splat;
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a * b, with the scalar b converted to vec32 (replicated into every
+   32-bit element when vectors are enabled).  */
+void HELPER(gvec_muls32)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const vec32 splat = (vec32)DUP4(b);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(vec32)) {
+        vec32 src = *(vec32 *)(a + ofs);
+
+        *(vec32 *)(d + ofs) = src * splat;
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a * b, with the scalar b converted to vec64 (replicated into every
+   64-bit element when vectors are enabled).  */
+void HELPER(gvec_muls64)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const vec64 splat = (vec64)DUP2(b);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(vec64)) {
+        vec64 src = *(vec64 *)(a + ofs);
+
+        *(vec64 *)(d + ofs) = src * splat;
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = -a on each 8-bit element of the first simd_oprsz(desc) bytes.  */
+void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec8 src = *(vec8 *)(a + ofs);
+
+        *(vec8 *)(d + ofs) = -src;
+        ofs += sizeof(vec8);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = -a on each 16-bit element of the first simd_oprsz(desc) bytes.  */
+void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec16 src = *(vec16 *)(a + ofs);
+
+        *(vec16 *)(d + ofs) = -src;
+        ofs += sizeof(vec16);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = -a on each 32-bit element of the first simd_oprsz(desc) bytes.  */
+void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec32 src = *(vec32 *)(a + ofs);
+
+        *(vec32 *)(d + ofs) = -src;
+        ofs += sizeof(vec32);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = -a on each 64-bit element of the first simd_oprsz(desc) bytes.  */
+void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec64 src = *(vec64 *)(a + ofs);
+
+        *(vec64 *)(d + ofs) = -src;
+        ofs += sizeof(vec64);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* Copy the first simd_oprsz(desc) bytes of a into d with memcpy(), then let
+   clear_high() deal with the rest of d.  */
+void HELPER(gvec_mov)(void *d, void *a, uint32_t desc)
+{
+    const intptr_t nbytes = simd_oprsz(desc);
+
+    memcpy(d, a, nbytes);
+    clear_high(d, nbytes, desc);
+}
+
+/* Fill the first simd_oprsz(desc) bytes of d with the 64-bit value c.
+   For c == 0 the fill loop is skipped and clear_high() is asked to zero
+   from offset 0 instead.  */
+void HELPER(gvec_dup64)(void *d, uint32_t desc, uint64_t c)
+{
+    intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs;
+
+    if (c != 0) {
+        for (ofs = 0; ofs < opr_sz; ofs += sizeof(uint64_t)) {
+            *(uint64_t *)(d + ofs) = c;
+        }
+    } else {
+        opr_sz = 0;
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* Fill the first simd_oprsz(desc) bytes of d with the 32-bit value c.
+   For c == 0 the fill loop is skipped and clear_high() is asked to zero
+   from offset 0 instead.  */
+void HELPER(gvec_dup32)(void *d, uint32_t desc, uint32_t c)
+{
+    intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs;
+
+    if (c != 0) {
+        for (ofs = 0; ofs < opr_sz; ofs += sizeof(uint32_t)) {
+            *(uint32_t *)(d + ofs) = c;
+        }
+    } else {
+        opr_sz = 0;
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* Replicate the low 16 bits of c into both halves of a 32-bit word and
+   defer to gvec_dup32.  */
+void HELPER(gvec_dup16)(void *d, uint32_t desc, uint32_t c)
+{
+    const uint32_t half = c & 0xffff;
+    const uint32_t word = 0x00010001 * half;
+
+    HELPER(gvec_dup32)(d, desc, word);
+}
+
+/* Replicate the low 8 bits of c into all four bytes of a 32-bit word and
+   defer to gvec_dup32.  */
+void HELPER(gvec_dup8)(void *d, uint32_t desc, uint32_t c)
+{
+    const uint32_t byte = c & 0xff;
+    const uint32_t word = 0x01010101 * byte;
+
+    HELPER(gvec_dup32)(d, desc, word);
+}
+
+/* d = ~a, processed in vec64-sized chunks over simd_oprsz(desc) bytes.  */
+void HELPER(gvec_not)(void *d, void *a, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec64 src = *(vec64 *)(a + ofs);
+
+        *(vec64 *)(d + ofs) = ~src;
+        ofs += sizeof(vec64);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a & b, processed in vec64-sized chunks over simd_oprsz(desc) bytes.  */
+void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec64 lhs = *(vec64 *)(a + ofs);
+        vec64 rhs = *(vec64 *)(b + ofs);
+
+        *(vec64 *)(d + ofs) = lhs & rhs;
+        ofs += sizeof(vec64);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a | b, processed in vec64-sized chunks over simd_oprsz(desc) bytes.  */
+void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec64 lhs = *(vec64 *)(a + ofs);
+        vec64 rhs = *(vec64 *)(b + ofs);
+
+        *(vec64 *)(d + ofs) = lhs | rhs;
+        ofs += sizeof(vec64);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a ^ b, processed in vec64-sized chunks over simd_oprsz(desc) bytes.  */
+void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec64 lhs = *(vec64 *)(a + ofs);
+        vec64 rhs = *(vec64 *)(b + ofs);
+
+        *(vec64 *)(d + ofs) = lhs ^ rhs;
+        ofs += sizeof(vec64);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a & ~b, processed in vec64-sized chunks over simd_oprsz(desc) bytes.  */
+void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec64 lhs = *(vec64 *)(a + ofs);
+        vec64 rhs = *(vec64 *)(b + ofs);
+
+        *(vec64 *)(d + ofs) = lhs & ~rhs;
+        ofs += sizeof(vec64);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a | ~b, processed in vec64-sized chunks over simd_oprsz(desc) bytes.  */
+void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec64 lhs = *(vec64 *)(a + ofs);
+        vec64 rhs = *(vec64 *)(b + ofs);
+
+        *(vec64 *)(d + ofs) = lhs | ~rhs;
+        ofs += sizeof(vec64);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a & b, with the 64-bit scalar b converted to vec64 (replicated into
+   both 64-bit elements when vectors are enabled).  */
+void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const vec64 splat = (vec64)DUP2(b);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(vec64)) {
+        vec64 src = *(vec64 *)(a + ofs);
+
+        *(vec64 *)(d + ofs) = src & splat;
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a ^ b, with the 64-bit scalar b converted to vec64 (replicated into
+   both 64-bit elements when vectors are enabled).  */
+void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const vec64 splat = (vec64)DUP2(b);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(vec64)) {
+        vec64 src = *(vec64 *)(a + ofs);
+
+        *(vec64 *)(d + ofs) = src ^ splat;
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a | b, with the 64-bit scalar b converted to vec64 (replicated into
+   both 64-bit elements when vectors are enabled).  */
+void HELPER(gvec_ors)(void *d, void *a, uint64_t b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const vec64 splat = (vec64)DUP2(b);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(vec64)) {
+        vec64 src = *(vec64 *)(a + ofs);
+
+        *(vec64 *)(d + ofs) = src | splat;
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a << n on each 8-bit element, where n = simd_data(desc).  */
+void HELPER(gvec_shl8i)(void *d, void *a, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const int count = simd_data(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec8 src = *(vec8 *)(a + ofs);
+
+        *(vec8 *)(d + ofs) = src << count;
+        ofs += sizeof(vec8);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a << n on each 16-bit element, where n = simd_data(desc).  */
+void HELPER(gvec_shl16i)(void *d, void *a, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const int count = simd_data(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec16 src = *(vec16 *)(a + ofs);
+
+        *(vec16 *)(d + ofs) = src << count;
+        ofs += sizeof(vec16);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a << n on each 32-bit element, where n = simd_data(desc).  */
+void HELPER(gvec_shl32i)(void *d, void *a, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const int count = simd_data(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec32 src = *(vec32 *)(a + ofs);
+
+        *(vec32 *)(d + ofs) = src << count;
+        ofs += sizeof(vec32);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a << n on each 64-bit element, where n = simd_data(desc).  */
+void HELPER(gvec_shl64i)(void *d, void *a, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const int count = simd_data(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec64 src = *(vec64 *)(a + ofs);
+
+        *(vec64 *)(d + ofs) = src << count;
+        ofs += sizeof(vec64);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a >> n on each unsigned 8-bit element, where n = simd_data(desc).  */
+void HELPER(gvec_shr8i)(void *d, void *a, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const int count = simd_data(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec8 src = *(vec8 *)(a + ofs);
+
+        *(vec8 *)(d + ofs) = src >> count;
+        ofs += sizeof(vec8);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a >> n on each unsigned 16-bit element, where n = simd_data(desc).  */
+void HELPER(gvec_shr16i)(void *d, void *a, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const int count = simd_data(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec16 src = *(vec16 *)(a + ofs);
+
+        *(vec16 *)(d + ofs) = src >> count;
+        ofs += sizeof(vec16);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a >> n on each unsigned 32-bit element, where n = simd_data(desc).  */
+void HELPER(gvec_shr32i)(void *d, void *a, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const int count = simd_data(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec32 src = *(vec32 *)(a + ofs);
+
+        *(vec32 *)(d + ofs) = src >> count;
+        ofs += sizeof(vec32);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a >> n on each unsigned 64-bit element, where n = simd_data(desc).  */
+void HELPER(gvec_shr64i)(void *d, void *a, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const int count = simd_data(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        vec64 src = *(vec64 *)(a + ofs);
+
+        *(vec64 *)(d + ofs) = src >> count;
+        ofs += sizeof(vec64);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a >> n on each signed 8-bit element (svec8), n = simd_data(desc).  */
+void HELPER(gvec_sar8i)(void *d, void *a, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const int count = simd_data(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        svec8 src = *(svec8 *)(a + ofs);
+
+        *(svec8 *)(d + ofs) = src >> count;
+        ofs += sizeof(vec8);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a >> n on each signed 16-bit element (svec16), n = simd_data(desc).  */
+void HELPER(gvec_sar16i)(void *d, void *a, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const int count = simd_data(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        svec16 src = *(svec16 *)(a + ofs);
+
+        *(svec16 *)(d + ofs) = src >> count;
+        ofs += sizeof(vec16);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a >> n on each signed 32-bit element (svec32), n = simd_data(desc).  */
+void HELPER(gvec_sar32i)(void *d, void *a, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const int count = simd_data(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        svec32 src = *(svec32 *)(a + ofs);
+
+        *(svec32 *)(d + ofs) = src >> count;
+        ofs += sizeof(vec32);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* d = a >> n on each signed 64-bit element (svec64), n = simd_data(desc).  */
+void HELPER(gvec_sar64i)(void *d, void *a, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    const int count = simd_data(desc);
+    intptr_t ofs = 0;
+
+    while (ofs < opr_sz) {
+        svec64 src = *(svec64 *)(a + ofs);
+
+        *(svec64 *)(d + ofs) = src >> count;
+        ofs += sizeof(vec64);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* If vectors are enabled, the compiler fills in -1 for true.
+   Otherwise, we must take care of this by hand.  */
+#ifdef CONFIG_VECTOR16
+# define DO_CMP0(X)  X
+#else
+# define DO_CMP0(X)  -(X)
+#endif
+
+/* Define helper NAME, which stores "a OP b" for every TYPE-sized chunk of
+ * the first simd_oprsz(desc) bytes: all-ones where the comparison holds,
+ * zero where it does not.
+ *
+ * The loop must advance by sizeof(TYPE), not sizeof(vec64).  The two are
+ * equal (16) when CONFIG_VECTOR16 is defined, but in the scalar fallback
+ * vec64 is 8 bytes while TYPE is 1, 2 or 4 bytes for the narrower helpers;
+ * stepping by sizeof(vec64) there would compare only one element in every
+ * 8 bytes and leave the rest of d unwritten.
+ */
+#define DO_CMP1(NAME, TYPE, OP)                                            \
+void HELPER(NAME)(void *d, void *a, void *b, uint32_t desc)                \
+{                                                                          \
+    intptr_t oprsz = simd_oprsz(desc);                                     \
+    intptr_t i;                                                            \
+    for (i = 0; i < oprsz; i += sizeof(TYPE)) {                            \
+        *(TYPE *)(d + i) = DO_CMP0(*(TYPE *)(a + i) OP *(TYPE *)(b + i));  \
+    }                                                                      \
+    clear_high(d, oprsz, desc);                                            \
+}
+
+/* Instantiate eq/ne and unsigned lt/le on vecSZ, signed lt/le on svecSZ.  */
+#define DO_CMP2(SZ) \
+    DO_CMP1(gvec_eq##SZ, vec##SZ, ==)    \
+    DO_CMP1(gvec_ne##SZ, vec##SZ, !=)    \
+    DO_CMP1(gvec_lt##SZ, svec##SZ, <)    \
+    DO_CMP1(gvec_le##SZ, svec##SZ, <=)   \
+    DO_CMP1(gvec_ltu##SZ, vec##SZ, <)    \
+    DO_CMP1(gvec_leu##SZ, vec##SZ, <=)
+
+DO_CMP2(8)
+DO_CMP2(16)
+DO_CMP2(32)
+DO_CMP2(64)
+
+#undef DO_CMP0
+#undef DO_CMP1
+#undef DO_CMP2
+
+/* Signed saturating add of 8-bit elements: the sum is formed in int and
+   clamped to [INT8_MIN, INT8_MAX].  */
+void HELPER(gvec_ssadd8)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(int8_t)) {
+        int sum = *(int8_t *)(a + ofs) + *(int8_t *)(b + ofs);
+
+        if (sum < INT8_MIN) {
+            sum = INT8_MIN;
+        } else if (sum > INT8_MAX) {
+            sum = INT8_MAX;
+        }
+        *(int8_t *)(d + ofs) = sum;
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* Signed saturating add of 16-bit elements: the sum is formed in int and
+   clamped to [INT16_MIN, INT16_MAX].  */
+void HELPER(gvec_ssadd16)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(int16_t)) {
+        int sum = *(int16_t *)(a + ofs) + *(int16_t *)(b + ofs);
+
+        if (sum < INT16_MIN) {
+            sum = INT16_MIN;
+        } else if (sum > INT16_MAX) {
+            sum = INT16_MAX;
+        }
+        *(int16_t *)(d + ofs) = sum;
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* Signed saturating add of 32-bit elements over the first
+   simd_oprsz(desc) bytes; results that overflow are replaced by
+   INT32_MAX or INT32_MIN.  */
+void HELPER(gvec_ssadd32)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
+        int32_t ai = *(int32_t *)(a + i);
+        int32_t bi = *(int32_t *)(b + i);
+        /* Add in unsigned arithmetic so that wraparound is well defined;
+           a plain signed "ai + bi" may overflow, which ISO C leaves
+           undefined unless the build forces wrapping semantics.  */
+        int32_t di = (int32_t)((uint32_t)ai + (uint32_t)bi);
+        if (((di ^ ai) &~ (ai ^ bi)) < 0) {
+            /* Signed overflow: the operands share a sign that the wrapped
+               sum lacks.  A negative wrapped sum means we ran off the top.  */
+            di = (di < 0 ? INT32_MAX : INT32_MIN);
+        }
+        *(int32_t *)(d + i) = di;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+/* Signed saturating add of 64-bit elements over the first
+   simd_oprsz(desc) bytes; results that overflow are replaced by
+   INT64_MAX or INT64_MIN.  */
+void HELPER(gvec_ssadd64)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
+        int64_t ai = *(int64_t *)(a + i);
+        int64_t bi = *(int64_t *)(b + i);
+        /* Add in unsigned arithmetic so that wraparound is well defined;
+           a plain signed "ai + bi" may overflow, which ISO C leaves
+           undefined unless the build forces wrapping semantics.  */
+        int64_t di = (int64_t)((uint64_t)ai + (uint64_t)bi);
+        if (((di ^ ai) &~ (ai ^ bi)) < 0) {
+            /* Signed overflow: the operands share a sign that the wrapped
+               sum lacks.  A negative wrapped sum means we ran off the top.  */
+            di = (di < 0 ? INT64_MAX : INT64_MIN);
+        }
+        *(int64_t *)(d + i) = di;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+/* Signed saturating subtract of 8-bit elements: the difference is formed
+   in int and clamped to [INT8_MIN, INT8_MAX].  */
+void HELPER(gvec_sssub8)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(int8_t)) {
+        int diff = *(int8_t *)(a + ofs) - *(int8_t *)(b + ofs);
+
+        if (diff < INT8_MIN) {
+            diff = INT8_MIN;
+        } else if (diff > INT8_MAX) {
+            diff = INT8_MAX;
+        }
+        *(int8_t *)(d + ofs) = diff;
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* Signed saturating subtract of 16-bit elements: the difference is formed
+   in int and clamped to [INT16_MIN, INT16_MAX].  */
+void HELPER(gvec_sssub16)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(int16_t)) {
+        int diff = *(int16_t *)(a + ofs) - *(int16_t *)(b + ofs);
+
+        if (diff < INT16_MIN) {
+            diff = INT16_MIN;
+        } else if (diff > INT16_MAX) {
+            diff = INT16_MAX;
+        }
+        *(int16_t *)(d + ofs) = diff;
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* Signed saturating subtract of 32-bit elements over the first
+   simd_oprsz(desc) bytes; results that overflow are replaced by
+   INT32_MAX or INT32_MIN.  */
+void HELPER(gvec_sssub32)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
+        int32_t ai = *(int32_t *)(a + i);
+        int32_t bi = *(int32_t *)(b + i);
+        /* Subtract in unsigned arithmetic so that wraparound is well
+           defined; a plain signed "ai - bi" may overflow, which ISO C
+           leaves undefined unless the build forces wrapping semantics.  */
+        int32_t di = (int32_t)((uint32_t)ai - (uint32_t)bi);
+        if (((di ^ ai) & (ai ^ bi)) < 0) {
+            /* Signed overflow: the operands differ in sign and the wrapped
+               difference has lost the sign of ai.  */
+            di = (di < 0 ? INT32_MAX : INT32_MIN);
+        }
+        *(int32_t *)(d + i) = di;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+/* Signed saturating subtract of 64-bit elements over the first
+   simd_oprsz(desc) bytes; results that overflow are replaced by
+   INT64_MAX or INT64_MIN.  */
+void HELPER(gvec_sssub64)(void *d, void *a, void *b, uint32_t desc)
+{
+    intptr_t oprsz = simd_oprsz(desc);
+    intptr_t i;
+
+    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
+        int64_t ai = *(int64_t *)(a + i);
+        int64_t bi = *(int64_t *)(b + i);
+        /* Subtract in unsigned arithmetic so that wraparound is well
+           defined; a plain signed "ai - bi" may overflow, which ISO C
+           leaves undefined unless the build forces wrapping semantics.  */
+        int64_t di = (int64_t)((uint64_t)ai - (uint64_t)bi);
+        if (((di ^ ai) & (ai ^ bi)) < 0) {
+            /* Signed overflow: the operands differ in sign and the wrapped
+               difference has lost the sign of ai.  */
+            di = (di < 0 ? INT64_MAX : INT64_MIN);
+        }
+        *(int64_t *)(d + i) = di;
+    }
+    clear_high(d, oprsz, desc);
+}
+
+/* Unsigned saturating add of 8-bit elements: sums above UINT8_MAX are
+   stored as UINT8_MAX.  */
+void HELPER(gvec_usadd8)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(uint8_t)) {
+        unsigned sum = *(uint8_t *)(a + ofs) + *(uint8_t *)(b + ofs);
+
+        *(uint8_t *)(d + ofs) = (sum > UINT8_MAX ? UINT8_MAX : sum);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* Unsigned saturating add of 16-bit elements: sums above UINT16_MAX are
+   stored as UINT16_MAX.  */
+void HELPER(gvec_usadd16)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(uint16_t)) {
+        unsigned sum = *(uint16_t *)(a + ofs) + *(uint16_t *)(b + ofs);
+
+        *(uint16_t *)(d + ofs) = (sum > UINT16_MAX ? UINT16_MAX : sum);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* Unsigned saturating add of 32-bit elements: a wrapped sum (smaller than
+   the first operand) is stored as UINT32_MAX.  */
+void HELPER(gvec_usadd32)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(uint32_t)) {
+        const uint32_t x = *(uint32_t *)(a + ofs);
+        const uint32_t y = *(uint32_t *)(b + ofs);
+        const uint32_t sum = x + y;
+
+        *(uint32_t *)(d + ofs) = (sum < x ? UINT32_MAX : sum);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* Unsigned saturating add of 64-bit elements: a wrapped sum (smaller than
+   the first operand) is stored as UINT64_MAX.  */
+void HELPER(gvec_usadd64)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(uint64_t)) {
+        const uint64_t x = *(uint64_t *)(a + ofs);
+        const uint64_t y = *(uint64_t *)(b + ofs);
+        const uint64_t sum = x + y;
+
+        *(uint64_t *)(d + ofs) = (sum < x ? UINT64_MAX : sum);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* Unsigned saturating subtract of 8-bit elements: negative differences
+   are stored as 0.  */
+void HELPER(gvec_ussub8)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(uint8_t)) {
+        int diff = *(uint8_t *)(a + ofs) - *(uint8_t *)(b + ofs);
+
+        *(uint8_t *)(d + ofs) = (diff < 0 ? 0 : diff);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* Unsigned saturating subtract of 16-bit elements: negative differences
+   are stored as 0.  */
+void HELPER(gvec_ussub16)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(uint16_t)) {
+        int diff = *(uint16_t *)(a + ofs) - *(uint16_t *)(b + ofs);
+
+        *(uint16_t *)(d + ofs) = (diff < 0 ? 0 : diff);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* Unsigned saturating subtract of 32-bit elements: the result is 0
+   whenever the first operand is smaller than the second.  */
+void HELPER(gvec_ussub32)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(uint32_t)) {
+        const uint32_t x = *(uint32_t *)(a + ofs);
+        const uint32_t y = *(uint32_t *)(b + ofs);
+
+        *(uint32_t *)(d + ofs) = (x < y ? 0 : x - y);
+    }
+    clear_high(d, opr_sz, desc);
+}
+
+/* Unsigned saturating subtract of 64-bit elements: the result is 0
+   whenever the first operand is smaller than the second.  */
+void HELPER(gvec_ussub64)(void *d, void *a, void *b, uint32_t desc)
+{
+    const intptr_t opr_sz = simd_oprsz(desc);
+    intptr_t ofs;
+
+    for (ofs = 0; ofs < opr_sz; ofs += sizeof(uint64_t)) {
+        const uint64_t x = *(uint64_t *)(a + ofs);
+        const uint64_t y = *(uint64_t *)(b + ofs);
+
+        *(uint64_t *)(d + ofs) = (x < y ? 0 : x - y);
+    }
+    clear_high(d, opr_sz, desc);
+}
diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h
index 1df17d0ba9..2536959a18 100644
--- a/accel/tcg/tcg-runtime.h
+++ b/accel/tcg/tcg-runtime.h
@@ -134,3 +134,121 @@ GEN_ATOMIC_HELPERS(xor_fetch)
GEN_ATOMIC_HELPERS(xchg)
#undef GEN_ATOMIC_HELPERS
+
+DEF_HELPER_FLAGS_3(gvec_mov, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_dup8, TCG_CALL_NO_RWG, void, ptr, i32, i32)
+DEF_HELPER_FLAGS_3(gvec_dup16, TCG_CALL_NO_RWG, void, ptr, i32, i32)
+DEF_HELPER_FLAGS_3(gvec_dup32, TCG_CALL_NO_RWG, void, ptr, i32, i32)
+DEF_HELPER_FLAGS_3(gvec_dup64, TCG_CALL_NO_RWG, void, ptr, i32, i64)
+
+DEF_HELPER_FLAGS_4(gvec_add8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_add16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_add32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_add64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_adds8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_adds16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_adds32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_adds64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
+DEF_HELPER_FLAGS_4(gvec_sub8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_subs8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_subs16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_subs32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_subs64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
+DEF_HELPER_FLAGS_4(gvec_mul8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_mul16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_mul32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_mul64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_muls8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_muls16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_muls32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_muls64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
+DEF_HELPER_FLAGS_4(gvec_ssadd8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ssadd16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ssadd32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ssadd64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_sssub8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sssub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sssub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_sssub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_usadd8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_usadd16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_usadd32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_usadd64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_ussub8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ussub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ussub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ussub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_neg8, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_neg16, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_neg32, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_neg64, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_not, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_and, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_or, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_xor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_andc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_orc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_ands, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_xors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(gvec_ors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+
+DEF_HELPER_FLAGS_3(gvec_shl8i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_shl16i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_shl32i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_shl64i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_shr8i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_shr16i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_shr32i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_shr64i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_3(gvec_sar8i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_sar16i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_sar32i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+DEF_HELPER_FLAGS_3(gvec_sar64i, TCG_CALL_NO_RWG, void, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_eq8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_eq16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_eq32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_eq64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_ne8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ne16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ne32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ne64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_lt8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_lt16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_lt32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_lt64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_le8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_le16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_le32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_le64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_ltu8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ltu16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ltu32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_ltu64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+
+DEF_HELPER_FLAGS_4(gvec_leu8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_leu16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_leu32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_leu64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
diff --git a/configure b/configure
index 831ebf248f..00695c7ea7 100755
--- a/configure
+++ b/configure
@@ -5001,6 +5001,50 @@ if compile_prog "" "" ; then
fi
########################################
+# See if 16-byte vector operations are supported.
+# Even without a vector unit the compiler may expand these.
+# There is a bug in old GCC for PPC that makes the compiler crash on this test.
+# Unfortunately it's the system compiler for CentOS 7.
+
+cat > $TMPC << EOF
+typedef unsigned char U1 __attribute__((vector_size(16)));
+typedef unsigned short U2 __attribute__((vector_size(16)));
+typedef unsigned int U4 __attribute__((vector_size(16)));
+typedef unsigned long long U8 __attribute__((vector_size(16)));
+typedef signed char S1 __attribute__((vector_size(16)));
+typedef signed short S2 __attribute__((vector_size(16)));
+typedef signed int S4 __attribute__((vector_size(16)));
+typedef signed long long S8 __attribute__((vector_size(16)));
+static U1 a1, b1;
+static U2 a2, b2;
+static U4 a4, b4;
+static U8 a8, b8;
+static S1 c1;
+static S2 c2;
+static S4 c4;
+static S8 c8;
+static int i;
+int main(void)
+{
+ a1 += b1; a2 += b2; a4 += b4; a8 += b8;
+ a1 -= b1; a2 -= b2; a4 -= b4; a8 -= b8;
+ a1 *= b1; a2 *= b2; a4 *= b4; a8 *= b8;
+ a1 &= b1; a2 &= b2; a4 &= b4; a8 &= b8;
+ a1 |= b1; a2 |= b2; a4 |= b4; a8 |= b8;
+ a1 ^= b1; a2 ^= b2; a4 ^= b4; a8 ^= b8;
+ a1 <<= i; a2 <<= i; a4 <<= i; a8 <<= i;
+ a1 >>= i; a2 >>= i; a4 >>= i; a8 >>= i;
+ c1 >>= i; c2 >>= i; c4 >>= i; c8 >>= i;
+ return 0;
+}
+EOF
+
+vector16=no
+if compile_prog "" "" ; then
+ vector16=yes
+fi
+
+########################################
# check if getauxval is available.
getauxval=no
@@ -6329,6 +6373,10 @@ if test "$atomic64" = "yes" ; then
echo "CONFIG_ATOMIC64=y" >> $config_host_mak
fi
+if test "$vector16" = "yes" ; then
+ echo "CONFIG_VECTOR16=y" >> $config_host_mak
+fi
+
if test "$getauxval" = "yes" ; then
echo "CONFIG_GETAUXVAL=y" >> $config_host_mak
fi
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index d2bb59eded..8d41f783dc 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -492,7 +492,7 @@ typedef struct CPUARMState {
* the two execution states, and means we do not need to explicitly
* map these registers when changing states.
*/
- uint64_t regs[64];
+ uint64_t regs[64] QEMU_ALIGNED(16);
uint32_t xregs[16];
/* We store these fpcsr fields separately for convenience. */
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index eed64c73e5..0830c3f1c8 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -21,6 +21,7 @@
#include "cpu.h"
#include "exec/exec-all.h"
#include "tcg-op.h"
+#include "tcg-op-gvec.h"
#include "qemu/log.h"
#include "arm_ldst.h"
#include "translate.h"
@@ -84,6 +85,13 @@ typedef void CryptoTwoOpFn(TCGv_ptr, TCGv_ptr);
typedef void CryptoThreeOpIntFn(TCGv_ptr, TCGv_ptr, TCGv_i32);
typedef void CryptoThreeOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr);
+/* Note that the gvec expanders operate on offsets + sizes. */
+typedef void GVecGen2Fn(unsigned, uint32_t, uint32_t, uint32_t, uint32_t);
+typedef void GVecGen2iFn(unsigned, uint32_t, uint32_t, int64_t,
+ uint32_t, uint32_t);
+typedef void GVecGen3Fn(unsigned, uint32_t, uint32_t,
+ uint32_t, uint32_t, uint32_t);
+
/* initialize TCG globals. */
void a64_translate_init(void)
{
@@ -548,6 +556,14 @@ static TCGv_ptr vec_full_reg_ptr(DisasContext *s, int regno)
return ret;
}
+/* Return the byte size of the "whole" vector register, VL / 8. */
+static inline int vec_full_reg_size(DisasContext *s)
+{
+ /* FIXME SVE: We should put the composite ZCR_EL* value into tb->flags.
+ In the meantime this is just the AdvSIMD length of 128. */
+ return 128 / 8; /* constant 16 bytes for now; 's' is not consulted yet */
+}
+
/* Return the offset into CPUARMState of a slice (from
* the least significant end) of FP register Qn (ie
* Dn, Sn, Hn or Bn).
@@ -618,6 +634,51 @@ static TCGv_ptr get_fpstatus_ptr(void)
return statusptr;
}
+/* Expand a 2-operand AdvSIMD vector operation using an expander function. */
+static void gen_gvec_fn2(DisasContext *s, bool is_q, int rd, int rn,
+ GVecGen2Fn *gvec_fn, int vece)
+{ /* Operation size is 16 bytes if is_q, else 8; the full register size is passed last. */
+ gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn),
+ is_q ? 16 : 8, vec_full_reg_size(s));
+}
+
+/* Expand a 2-operand + immediate AdvSIMD vector operation using
+ * an expander function.
+ */
+static void gen_gvec_fn2i(DisasContext *s, bool is_q, int rd, int rn,
+ int64_t imm, GVecGen2iFn *gvec_fn, int vece)
+{ /* imm is forwarded after the two register offsets and before the two sizes. */
+ gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn),
+ imm, is_q ? 16 : 8, vec_full_reg_size(s));
+}
+
+/* Expand a 3-operand AdvSIMD vector operation using an expander function. */
+static void gen_gvec_fn3(DisasContext *s, bool is_q, int rd, int rn, int rm,
+ GVecGen3Fn *gvec_fn, int vece)
+{ /* rd, rn, rm become full-register offsets; 16 bytes are operated on if is_q, else 8. */
+ gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm), is_q ? 16 : 8, vec_full_reg_size(s));
+}
+
+/* Expand a 2-operand + immediate AdvSIMD vector operation using
+ * an op descriptor.
+ */
+static void gen_gvec_op2i(DisasContext *s, bool is_q, int rd,
+ int rn, int64_t imm, const GVecGen2i *gvec_op)
+{ /* Unlike the GVecGen2iFn call, tcg_gen_gvec_2i takes both sizes before imm. */
+ tcg_gen_gvec_2i(vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn),
+ is_q ? 16 : 8, vec_full_reg_size(s), imm, gvec_op);
+}
+
+/* Expand a 3-operand AdvSIMD vector operation using an op descriptor. */
+static void gen_gvec_op3(DisasContext *s, bool is_q, int rd,
+ int rn, int rm, const GVecGen3 *gvec_op)
+{ /* Three register offsets, then 8/16-byte operation size, full register size, descriptor. */
+ tcg_gen_gvec_3(vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm), is_q ? 16 : 8,
+ vec_full_reg_size(s), gvec_op);
+}
+
/* Set ZF and NF based on a 64 bit result. This is alas fiddlier
* than the 32 bit equivalent.
*/
@@ -4566,14 +4627,17 @@ static void handle_fp_1src_double(DisasContext *s, int opcode, int rd, int rn)
TCGv_i64 tcg_op;
TCGv_i64 tcg_res;
+ switch (opcode) {
+ case 0x0: /* FMOV */
+ gen_gvec_fn2(s, false, rd, rn, tcg_gen_gvec_mov, 0);
+ return;
+ }
+
fpst = get_fpstatus_ptr();
tcg_op = read_fp_dreg(s, rn);
tcg_res = tcg_temp_new_i64();
switch (opcode) {
- case 0x0: /* FMOV */
- tcg_gen_mov_i64(tcg_res, tcg_op);
- break;
case 0x1: /* FABS */
gen_helper_vfp_absd(tcg_res, tcg_op);
break;
@@ -5848,10 +5912,7 @@ static void handle_simd_dupe(DisasContext *s, int is_q, int rd, int rn,
int imm5)
{
int size = ctz32(imm5);
- int esize = 8 << size;
- int elements = (is_q ? 128 : 64) / esize;
- int index, i;
- TCGv_i64 tmp;
+ int index = size > 3 ? 0 : imm5 >> (size + 1); /* imm5 == 0 gives size 32: avoid UB shift */
if (size > 3 || (size == 3 && !is_q)) {
unallocated_encoding(s);
@@ -5862,20 +5923,9 @@ static void handle_simd_dupe(DisasContext *s, int is_q, int rd, int rn,
return;
}
- index = imm5 >> (size + 1);
-
- tmp = tcg_temp_new_i64();
- read_vec_element(s, tmp, rn, index, size);
-
- for (i = 0; i < elements; i++) {
- write_vec_element(s, tmp, rd, i, size);
- }
-
- if (!is_q) {
- clear_vec_high(s, rd);
- }
-
- tcg_temp_free_i64(tmp);
+ tcg_gen_gvec_dup_mem(size, vec_full_reg_offset(s, rd),
+ vec_reg_offset(s, rn, index, size),
+ is_q ? 16 : 8, vec_full_reg_size(s));
}
/* DUP (element, scalar)
@@ -5924,9 +5974,7 @@ static void handle_simd_dupg(DisasContext *s, int is_q, int rd, int rn,
int imm5)
{
int size = ctz32(imm5);
- int esize = 8 << size;
- int elements = (is_q ? 128 : 64)/esize;
- int i = 0;
+ uint32_t dofs, oprsz, maxsz;
if (size > 3 || ((size == 3) && !is_q)) {
unallocated_encoding(s);
@@ -5937,12 +5985,11 @@ static void handle_simd_dupg(DisasContext *s, int is_q, int rd, int rn,
return;
}
- for (i = 0; i < elements; i++) {
- write_vec_element(s, cpu_reg(s, rn), rd, i, size);
- }
- if (!is_q) {
- clear_vec_high(s, rd);
- }
+ dofs = vec_full_reg_offset(s, rd);
+ oprsz = is_q ? 16 : 8;
+ maxsz = vec_full_reg_size(s);
+
+ tcg_gen_gvec_dup_i64(size, dofs, oprsz, maxsz, cpu_reg(s, rn));
}
/* INS (Element)
@@ -6133,8 +6180,6 @@ static void disas_simd_mod_imm(DisasContext *s, uint32_t insn)
bool is_neg = extract32(insn, 29, 1);
bool is_q = extract32(insn, 30, 1);
uint64_t imm = 0;
- TCGv_i64 tcg_rd, tcg_imm;
- int i;
if (o2 != 0 || ((cmode == 0xf) && is_neg && !is_q)) {
unallocated_encoding(s);
@@ -6215,32 +6260,18 @@ static void disas_simd_mod_imm(DisasContext *s, uint32_t insn)
imm = ~imm;
}
- tcg_imm = tcg_const_i64(imm);
- tcg_rd = new_tmp_a64(s);
-
- for (i = 0; i < 2; i++) {
- int foffs = i ? fp_reg_hi_offset(s, rd) : fp_reg_offset(s, rd, MO_64);
-
- if (i == 1 && !is_q) {
- /* non-quad ops clear high half of vector */
- tcg_gen_movi_i64(tcg_rd, 0);
- } else if ((cmode & 0x9) == 0x1 || (cmode & 0xd) == 0x9) {
- tcg_gen_ld_i64(tcg_rd, cpu_env, foffs);
- if (is_neg) {
- /* AND (BIC) */
- tcg_gen_and_i64(tcg_rd, tcg_rd, tcg_imm);
- } else {
- /* ORR */
- tcg_gen_or_i64(tcg_rd, tcg_rd, tcg_imm);
- }
+ if (!((cmode & 0x9) == 0x1 || (cmode & 0xd) == 0x9)) {
+ /* MOVI or MVNI, with MVNI negation handled above. */
+ tcg_gen_gvec_dup64i(vec_full_reg_offset(s, rd), is_q ? 16 : 8,
+ vec_full_reg_size(s), imm);
+ } else {
+ /* ORR or BIC, with BIC negation to AND handled above. */
+ if (is_neg) {
+ gen_gvec_fn2i(s, is_q, rd, rd, imm, tcg_gen_gvec_andi, MO_64);
} else {
- /* MOVI */
- tcg_gen_mov_i64(tcg_rd, tcg_imm);
+ gen_gvec_fn2i(s, is_q, rd, rd, imm, tcg_gen_gvec_ori, MO_64);
}
- tcg_gen_st_i64(tcg_rd, cpu_env, foffs);
}
-
- tcg_temp_free_i64(tcg_imm);
}
/* AdvSIMD scalar copy
@@ -6485,32 +6516,6 @@ static void handle_shri_with_rndacc(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
}
}
-/* Common SHL/SLI - Shift left with an optional insert */
-static void handle_shli_with_ins(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
- bool insert, int shift)
-{
- if (insert) { /* SLI */
- tcg_gen_deposit_i64(tcg_res, tcg_res, tcg_src, shift, 64 - shift);
- } else { /* SHL */
- tcg_gen_shli_i64(tcg_res, tcg_src, shift);
- }
-}
-
-/* SRI: shift right with insert */
-static void handle_shri_with_ins(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
- int size, int shift)
-{
- int esize = 8 << size;
-
- /* shift count same as element size is valid but does nothing;
- * special case to avoid potential shift by 64.
- */
- if (shift != esize) {
- tcg_gen_shri_i64(tcg_src, tcg_src, shift);
- tcg_gen_deposit_i64(tcg_res, tcg_res, tcg_src, 0, esize - shift);
- }
-}
-
/* SSHR[RA]/USHR[RA] - Scalar shift right (optional rounding/accumulate) */
static void handle_scalar_simd_shri(DisasContext *s,
bool is_u, int immh, int immb,
@@ -6561,7 +6566,14 @@ static void handle_scalar_simd_shri(DisasContext *s,
tcg_rd = (accumulate || insert) ? read_fp_dreg(s, rd) : tcg_temp_new_i64();
if (insert) {
- handle_shri_with_ins(tcg_rd, tcg_rn, size, shift);
+ /* shift count same as element size is valid but does nothing;
+ * special case to avoid potential shift by 64.
+ */
+ int esize = 8 << size;
+ if (shift != esize) {
+ tcg_gen_shri_i64(tcg_rn, tcg_rn, shift);
+ tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, 0, esize - shift);
+ }
} else {
handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
accumulate, is_u, size, shift);
@@ -6599,7 +6611,11 @@ static void handle_scalar_simd_shli(DisasContext *s, bool insert,
tcg_rn = read_fp_dreg(s, rn);
tcg_rd = insert ? read_fp_dreg(s, rd) : tcg_temp_new_i64();
- handle_shli_with_ins(tcg_rd, tcg_rn, insert, shift);
+ if (insert) {
+ tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, shift, 64 - shift);
+ } else {
+ tcg_gen_shli_i64(tcg_rd, tcg_rn, shift);
+ }
write_fp_dreg(s, rd, tcg_rd);
@@ -7175,6 +7191,28 @@ static void disas_simd_scalar_three_reg_diff(DisasContext *s, uint32_t insn)
}
}
+/* CMTST : test is "if ((X & Y) != 0)". */
+static void gen_cmtst_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{ /* d = -((a & b) != 0): the 0/1 setcond result is negated to give 0 or all-ones. */
+ tcg_gen_and_i32(d, a, b);
+ tcg_gen_setcondi_i32(TCG_COND_NE, d, d, 0);
+ tcg_gen_neg_i32(d, d);
+}
+
+static void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{ /* 64-bit CMTST: d = -((a & b) != 0), i.e. 0 or all-ones. */
+ tcg_gen_and_i64(d, a, b);
+ tcg_gen_setcondi_i64(TCG_COND_NE, d, d, 0);
+ tcg_gen_neg_i64(d, d);
+}
+
+static void gen_cmtst_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
+{ /* Vector CMTST: d = a & b, then compared NE against zero; 'a' is reused to hold the zero. */
+ tcg_gen_and_vec(vece, d, a, b);
+ tcg_gen_dupi_vec(vece, a, 0);
+ tcg_gen_cmp_vec(TCG_COND_NE, vece, d, d, a);
+}
+
static void handle_3same_64(DisasContext *s, int opcode, bool u,
TCGv_i64 tcg_rd, TCGv_i64 tcg_rn, TCGv_i64 tcg_rm)
{
@@ -7218,10 +7256,7 @@ static void handle_3same_64(DisasContext *s, int opcode, bool u,
cond = TCG_COND_EQ;
goto do_cmop;
}
- /* CMTST : test is "if (X & Y != 0)". */
- tcg_gen_and_i64(tcg_rd, tcg_rn, tcg_rm);
- tcg_gen_setcondi_i64(TCG_COND_NE, tcg_rd, tcg_rd, 0);
- tcg_gen_neg_i64(tcg_rd, tcg_rd);
+ gen_cmtst_i64(tcg_rd, tcg_rn, tcg_rm);
break;
case 0x8: /* SSHL, USHL */
if (u) {
@@ -8329,16 +8364,195 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn)
}
}
+static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{ /* SSRA expander used for MO_8 (ssra_op[]): sar8i on a, then add8 into d; 'a' is clobbered. */
+ tcg_gen_vec_sar8i_i64(a, a, shift);
+ tcg_gen_vec_add8_i64(d, d, a);
+}
+
+static void gen_ssra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{ /* SSRA expander used for MO_16 (ssra_op[]): sar16i on a, then add16 into d; 'a' is clobbered. */
+ tcg_gen_vec_sar16i_i64(a, a, shift);
+ tcg_gen_vec_add16_i64(d, d, a);
+}
+
+static void gen_ssra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
+{ /* SSRA on one 32-bit element: d += a >> shift (arithmetic); 'a' is clobbered. */
+ tcg_gen_sari_i32(a, a, shift);
+ tcg_gen_add_i32(d, d, a);
+}
+
+static void gen_ssra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{ /* SSRA on one 64-bit element: d += a >> shift (arithmetic); 'a' is clobbered. */
+ tcg_gen_sari_i64(a, a, shift);
+ tcg_gen_add_i64(d, d, a);
+}
+
+static void gen_ssra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
+{ /* Vector SSRA: arithmetic shift right of a by sh, then d += a; 'a' is clobbered. */
+ tcg_gen_sari_vec(vece, a, a, sh);
+ tcg_gen_add_vec(vece, d, d, a);
+}
+
+static void gen_usra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{ /* USRA expander used for MO_8 (usra_op[]): shr8i on a, then add8 into d; 'a' is clobbered. */
+ tcg_gen_vec_shr8i_i64(a, a, shift);
+ tcg_gen_vec_add8_i64(d, d, a);
+}
+
+static void gen_usra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{ /* USRA expander used for MO_16 (usra_op[]): shr16i on a, then add16 into d; 'a' is clobbered. */
+ tcg_gen_vec_shr16i_i64(a, a, shift);
+ tcg_gen_vec_add16_i64(d, d, a);
+}
+
+static void gen_usra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
+{ /* USRA on one 32-bit element: d += a >> shift (logical); 'a' is clobbered. */
+ tcg_gen_shri_i32(a, a, shift);
+ tcg_gen_add_i32(d, d, a);
+}
+
+static void gen_usra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{ /* USRA on one 64-bit element: d += a >> shift (logical); 'a' is clobbered. */
+ tcg_gen_shri_i64(a, a, shift);
+ tcg_gen_add_i64(d, d, a);
+}
+
+static void gen_usra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
+{ /* Vector USRA: logical shift right of a by sh, then d += a; 'a' is clobbered. */
+ tcg_gen_shri_vec(vece, a, a, sh);
+ tcg_gen_add_vec(vece, d, d, a);
+}
+
+static void gen_shr8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{ /* SRI for MO_8 (sri_op[]): d = (d & ~mask) | ((a >> shift) & mask); 'a' is preserved. */
+ uint64_t mask = dup_const(MO_8, 0xff >> shift);
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ tcg_gen_shri_i64(t, a, shift);
+ tcg_gen_andi_i64(t, t, mask);
+ tcg_gen_andi_i64(d, d, ~mask);
+ tcg_gen_or_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_shr16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{ /* SRI for MO_16 (sri_op[]): d = (d & ~mask) | ((a >> shift) & mask); 'a' is preserved. */
+ uint64_t mask = dup_const(MO_16, 0xffff >> shift);
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ tcg_gen_shri_i64(t, a, shift);
+ tcg_gen_andi_i64(t, t, mask);
+ tcg_gen_andi_i64(d, d, ~mask);
+ tcg_gen_or_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_shr32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
+{ /* SRI on a 32-bit element: a >>= shift, then its low (32 - shift) bits are deposited at bit 0 of d. */
+ tcg_gen_shri_i32(a, a, shift);
+ tcg_gen_deposit_i32(d, d, a, 0, 32 - shift);
+}
+
+static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{ /* SRI on a 64-bit element: a >>= shift, then its low (64 - shift) bits are deposited at bit 0 of d. */
+ tcg_gen_shri_i64(a, a, shift);
+ tcg_gen_deposit_i64(d, d, a, 0, 64 - shift);
+}
+
+static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
+{ /* Vector SRI: m = top sh bits of each element; d = (d & m) | (a >> sh). */
+ uint64_t mask = (2ull << ((8 << vece) - 1)) - 1; /* element-wide all-ones; 2ull << (n - 1) avoids a shift by 64 */
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ TCGv_vec m = tcg_temp_new_vec_matching(d);
+
+ tcg_gen_dupi_vec(vece, m, mask ^ (mask >> sh));
+ tcg_gen_shri_vec(vece, t, a, sh);
+ tcg_gen_and_vec(vece, d, d, m);
+ tcg_gen_or_vec(vece, d, d, t);
+
+ tcg_temp_free_vec(t);
+ tcg_temp_free_vec(m);
+}
+
/* SSHR[RA]/USHR[RA] - Vector shift right (optional rounding/accumulate) */
static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u,
int immh, int immb, int opcode, int rn, int rd)
{
+ static const GVecGen2i ssra_op[4] = {
+ { .fni8 = gen_ssra8_i64,
+ .fniv = gen_ssra_vec,
+ .load_dest = true,
+ .opc = INDEX_op_sari_vec,
+ .vece = MO_8 },
+ { .fni8 = gen_ssra16_i64,
+ .fniv = gen_ssra_vec,
+ .load_dest = true,
+ .opc = INDEX_op_sari_vec,
+ .vece = MO_16 },
+ { .fni4 = gen_ssra32_i32,
+ .fniv = gen_ssra_vec,
+ .load_dest = true,
+ .opc = INDEX_op_sari_vec,
+ .vece = MO_32 },
+ { .fni8 = gen_ssra64_i64,
+ .fniv = gen_ssra_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .load_dest = true,
+ .opc = INDEX_op_sari_vec,
+ .vece = MO_64 },
+ };
+ static const GVecGen2i usra_op[4] = {
+ { .fni8 = gen_usra8_i64,
+ .fniv = gen_usra_vec,
+ .load_dest = true,
+ .opc = INDEX_op_shri_vec,
+ .vece = MO_8, },
+ { .fni8 = gen_usra16_i64,
+ .fniv = gen_usra_vec,
+ .load_dest = true,
+ .opc = INDEX_op_shri_vec,
+ .vece = MO_16, },
+ { .fni4 = gen_usra32_i32,
+ .fniv = gen_usra_vec,
+ .load_dest = true,
+ .opc = INDEX_op_shri_vec,
+ .vece = MO_32, },
+ { .fni8 = gen_usra64_i64,
+ .fniv = gen_usra_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .load_dest = true,
+ .opc = INDEX_op_shri_vec,
+ .vece = MO_64, },
+ };
+ static const GVecGen2i sri_op[4] = {
+ { .fni8 = gen_shr8_ins_i64,
+ .fniv = gen_shr_ins_vec,
+ .load_dest = true,
+ .opc = INDEX_op_shri_vec,
+ .vece = MO_8 },
+ { .fni8 = gen_shr16_ins_i64,
+ .fniv = gen_shr_ins_vec,
+ .load_dest = true,
+ .opc = INDEX_op_shri_vec,
+ .vece = MO_16 },
+ { .fni4 = gen_shr32_ins_i32,
+ .fniv = gen_shr_ins_vec,
+ .load_dest = true,
+ .opc = INDEX_op_shri_vec,
+ .vece = MO_32 },
+ { .fni8 = gen_shr64_ins_i64,
+ .fniv = gen_shr_ins_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .load_dest = true,
+ .opc = INDEX_op_shri_vec,
+ .vece = MO_64 },
+ };
+
int size = 32 - clz32(immh) - 1;
int immhb = immh << 3 | immb;
int shift = 2 * (8 << size) - immhb;
bool accumulate = false;
- bool round = false;
- bool insert = false;
int dsize = is_q ? 128 : 64;
int esize = 8 << size;
int elements = dsize/esize;
@@ -8346,6 +8560,7 @@ static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u,
TCGv_i64 tcg_rn = new_tmp_a64(s);
TCGv_i64 tcg_rd = new_tmp_a64(s);
TCGv_i64 tcg_round;
+ uint64_t round_const;
int i;
if (extract32(immh, 3, 1) && !is_q) {
@@ -8364,64 +8579,159 @@ static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u,
switch (opcode) {
case 0x02: /* SSRA / USRA (accumulate) */
- accumulate = true;
- break;
+ if (is_u) {
+ /* Shift count same as element size produces zero to add. */
+ if (shift == 8 << size) {
+ goto done;
+ }
+ gen_gvec_op2i(s, is_q, rd, rn, shift, &usra_op[size]);
+ } else {
+ /* Shift count same as element size produces all sign to add. */
+ if (shift == 8 << size) {
+ shift -= 1;
+ }
+ gen_gvec_op2i(s, is_q, rd, rn, shift, &ssra_op[size]);
+ }
+ return;
+ case 0x08: /* SRI */
+ /* Shift count same as element size is valid but does nothing. */
+ if (shift == 8 << size) {
+ goto done;
+ }
+ gen_gvec_op2i(s, is_q, rd, rn, shift, &sri_op[size]);
+ return;
+
+ case 0x00: /* SSHR / USHR */
+ if (is_u) {
+ if (shift == 8 << size) {
+ /* Shift count the same size as element size produces zero. */
+ tcg_gen_gvec_dup8i(vec_full_reg_offset(s, rd),
+ is_q ? 16 : 8, vec_full_reg_size(s), 0);
+ } else {
+ gen_gvec_fn2i(s, is_q, rd, rn, shift, tcg_gen_gvec_shri, size);
+ }
+ } else {
+ /* Shift count the same size as element size produces all sign. */
+ if (shift == 8 << size) {
+ shift -= 1;
+ }
+ gen_gvec_fn2i(s, is_q, rd, rn, shift, tcg_gen_gvec_sari, size);
+ }
+ return;
+
case 0x04: /* SRSHR / URSHR (rounding) */
- round = true;
break;
case 0x06: /* SRSRA / URSRA (accum + rounding) */
- accumulate = round = true;
- break;
- case 0x08: /* SRI */
- insert = true;
+ accumulate = true;
break;
+ default:
+ g_assert_not_reached();
}
- if (round) {
- uint64_t round_const = 1ULL << (shift - 1);
- tcg_round = tcg_const_i64(round_const);
- } else {
- tcg_round = NULL;
- }
+ round_const = 1ULL << (shift - 1);
+ tcg_round = tcg_const_i64(round_const);
for (i = 0; i < elements; i++) {
read_vec_element(s, tcg_rn, rn, i, memop);
- if (accumulate || insert) {
+ if (accumulate) {
read_vec_element(s, tcg_rd, rd, i, memop);
}
- if (insert) {
- handle_shri_with_ins(tcg_rd, tcg_rn, size, shift);
- } else {
- handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
- accumulate, is_u, size, shift);
- }
+ handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
+ accumulate, is_u, size, shift);
write_vec_element(s, tcg_rd, rd, i, size);
}
+ tcg_temp_free_i64(tcg_round);
+ done:
if (!is_q) {
clear_vec_high(s, rd);
}
+}
- if (round) {
- tcg_temp_free_i64(tcg_round);
- }
+static void gen_shl8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{ /* SLI for MO_8 (shi_op[]): d = (d & ~mask) | ((a << shift) & mask); 'a' is preserved. */
+ uint64_t mask = dup_const(MO_8, 0xff << shift); /* NOTE(review): presumably only the low 8 bits are replicated - confirm */
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ tcg_gen_shli_i64(t, a, shift);
+ tcg_gen_andi_i64(t, t, mask);
+ tcg_gen_andi_i64(d, d, ~mask);
+ tcg_gen_or_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_shl16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{ /* SLI for MO_16 (shi_op[]): d = (d & ~mask) | ((a << shift) & mask); 'a' is preserved. */
+ uint64_t mask = dup_const(MO_16, 0xffff << shift); /* NOTE(review): presumably only the low 16 bits are replicated - confirm */
+ TCGv_i64 t = tcg_temp_new_i64();
+
+ tcg_gen_shli_i64(t, a, shift);
+ tcg_gen_andi_i64(t, t, mask);
+ tcg_gen_andi_i64(d, d, ~mask);
+ tcg_gen_or_i64(d, d, t);
+ tcg_temp_free_i64(t);
+}
+
+static void gen_shl32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift)
+{ /* SLI on a 32-bit element: the low (32 - shift) bits of a are deposited into d at bit 'shift'. */
+ tcg_gen_deposit_i32(d, d, a, shift, 32 - shift);
+}
+
+static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift)
+{ /* SLI on a 64-bit element: the low (64 - shift) bits of a are deposited into d at bit 'shift'. */
+ tcg_gen_deposit_i64(d, d, a, shift, 64 - shift);
+}
+
+static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh)
+{ /* Vector SLI: m = low sh bits of each element; d = (d & m) | (a << sh). */
+ uint64_t mask = (1ull << sh) - 1;
+ TCGv_vec t = tcg_temp_new_vec_matching(d);
+ TCGv_vec m = tcg_temp_new_vec_matching(d);
+
+ tcg_gen_dupi_vec(vece, m, mask);
+ tcg_gen_shli_vec(vece, t, a, sh);
+ tcg_gen_and_vec(vece, d, d, m);
+ tcg_gen_or_vec(vece, d, d, t);
+
+ tcg_temp_free_vec(t);
+ tcg_temp_free_vec(m);
+}
/* SHL/SLI - Vector shift left */
static void handle_vec_simd_shli(DisasContext *s, bool is_q, bool insert,
- int immh, int immb, int opcode, int rn, int rd)
+ int immh, int immb, int opcode, int rn, int rd)
{
+ static const GVecGen2i shi_op[4] = {
+ { .fni8 = gen_shl8_ins_i64,
+ .fniv = gen_shl_ins_vec,
+ .opc = INDEX_op_shli_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .load_dest = true,
+ .vece = MO_8 },
+ { .fni8 = gen_shl16_ins_i64,
+ .fniv = gen_shl_ins_vec,
+ .opc = INDEX_op_shli_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .load_dest = true,
+ .vece = MO_16 },
+ { .fni4 = gen_shl32_ins_i32,
+ .fniv = gen_shl_ins_vec,
+ .opc = INDEX_op_shli_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .load_dest = true,
+ .vece = MO_32 },
+ { .fni8 = gen_shl64_ins_i64,
+ .fniv = gen_shl_ins_vec,
+ .opc = INDEX_op_shli_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .load_dest = true,
+ .vece = MO_64 },
+ };
int size = 32 - clz32(immh) - 1;
int immhb = immh << 3 | immb;
int shift = immhb - (8 << size);
- int dsize = is_q ? 128 : 64;
- int esize = 8 << size;
- int elements = dsize/esize;
- TCGv_i64 tcg_rn = new_tmp_a64(s);
- TCGv_i64 tcg_rd = new_tmp_a64(s);
- int i;
if (extract32(immh, 3, 1) && !is_q) {
unallocated_encoding(s);
@@ -8437,19 +8747,10 @@ static void handle_vec_simd_shli(DisasContext *s, bool is_q, bool insert,
return;
}
- for (i = 0; i < elements; i++) {
- read_vec_element(s, tcg_rn, rn, i, size);
- if (insert) {
- read_vec_element(s, tcg_rd, rd, i, size);
- }
-
- handle_shli_with_ins(tcg_rd, tcg_rn, insert, shift);
-
- write_vec_element(s, tcg_rd, rd, i, size);
- }
-
- if (!is_q) {
- clear_vec_high(s, rd);
+ if (insert) {
+ gen_gvec_op2i(s, is_q, rd, rn, shift, &shi_op[size]);
+ } else {
+ gen_gvec_fn2i(s, is_q, rd, rn, shift, tcg_gen_gvec_shli, size);
}
}
@@ -9072,85 +9373,115 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
}
}
+static void gen_bsl_i64(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
+{ /* BSL: rd = rm ^ ((rn ^ rm) & rd), i.e. (rn & rd) | (rm & ~rd); rn is clobbered. */
+ tcg_gen_xor_i64(rn, rn, rm);
+ tcg_gen_and_i64(rn, rn, rd);
+ tcg_gen_xor_i64(rd, rm, rn);
+}
+
+static void gen_bit_i64(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
+{ /* BIT: rd = rd ^ ((rn ^ rd) & rm), i.e. (rn & rm) | (rd & ~rm); rn is clobbered. */
+ tcg_gen_xor_i64(rn, rn, rd);
+ tcg_gen_and_i64(rn, rn, rm);
+ tcg_gen_xor_i64(rd, rd, rn);
+}
+
+static void gen_bif_i64(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
+{ /* BIF: rd = rd ^ ((rn ^ rd) & ~rm), i.e. (rn & ~rm) | (rd & rm); rn is clobbered. */
+ tcg_gen_xor_i64(rn, rn, rd);
+ tcg_gen_andc_i64(rn, rn, rm);
+ tcg_gen_xor_i64(rd, rd, rn);
+}
+
+static void gen_bsl_vec(unsigned vece, TCGv_vec rd, TCGv_vec rn, TCGv_vec rm)
+{ /* Vector BSL: rd = rm ^ ((rn ^ rm) & rd); same sequence as gen_bsl_i64, rn is clobbered. */
+ tcg_gen_xor_vec(vece, rn, rn, rm);
+ tcg_gen_and_vec(vece, rn, rn, rd);
+ tcg_gen_xor_vec(vece, rd, rm, rn);
+}
+
+static void gen_bit_vec(unsigned vece, TCGv_vec rd, TCGv_vec rn, TCGv_vec rm)
+{ /* Vector BIT: rd = rd ^ ((rn ^ rd) & rm); same sequence as gen_bit_i64, rn is clobbered. */
+ tcg_gen_xor_vec(vece, rn, rn, rd);
+ tcg_gen_and_vec(vece, rn, rn, rm);
+ tcg_gen_xor_vec(vece, rd, rd, rn);
+}
+
+static void gen_bif_vec(unsigned vece, TCGv_vec rd, TCGv_vec rn, TCGv_vec rm)
+{ /* Vector BIF: rd = rd ^ ((rn ^ rd) & ~rm); same sequence as gen_bif_i64, rn is clobbered. */
+ tcg_gen_xor_vec(vece, rn, rn, rd);
+ tcg_gen_andc_vec(vece, rn, rn, rm);
+ tcg_gen_xor_vec(vece, rd, rd, rn);
+}
+
/* Logic op (opcode == 3) subgroup of C3.6.16. */
static void disas_simd_3same_logic(DisasContext *s, uint32_t insn)
{
+ static const GVecGen3 bsl_op = {
+ .fni8 = gen_bsl_i64,
+ .fniv = gen_bsl_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .load_dest = true
+ };
+ static const GVecGen3 bit_op = {
+ .fni8 = gen_bit_i64,
+ .fniv = gen_bit_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .load_dest = true
+ };
+ static const GVecGen3 bif_op = {
+ .fni8 = gen_bif_i64,
+ .fniv = gen_bif_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .load_dest = true
+ };
+
int rd = extract32(insn, 0, 5);
int rn = extract32(insn, 5, 5);
int rm = extract32(insn, 16, 5);
int size = extract32(insn, 22, 2);
bool is_u = extract32(insn, 29, 1);
bool is_q = extract32(insn, 30, 1);
- TCGv_i64 tcg_op1, tcg_op2, tcg_res[2];
- int pass;
if (!fp_access_check(s)) {
return;
}
- tcg_op1 = tcg_temp_new_i64();
- tcg_op2 = tcg_temp_new_i64();
- tcg_res[0] = tcg_temp_new_i64();
- tcg_res[1] = tcg_temp_new_i64();
-
- for (pass = 0; pass < (is_q ? 2 : 1); pass++) {
- read_vec_element(s, tcg_op1, rn, pass, MO_64);
- read_vec_element(s, tcg_op2, rm, pass, MO_64);
-
- if (!is_u) {
- switch (size) {
- case 0: /* AND */
- tcg_gen_and_i64(tcg_res[pass], tcg_op1, tcg_op2);
- break;
- case 1: /* BIC */
- tcg_gen_andc_i64(tcg_res[pass], tcg_op1, tcg_op2);
- break;
- case 2: /* ORR */
- tcg_gen_or_i64(tcg_res[pass], tcg_op1, tcg_op2);
- break;
- case 3: /* ORN */
- tcg_gen_orc_i64(tcg_res[pass], tcg_op1, tcg_op2);
- break;
- }
+ switch (size + 4 * is_u) {
+ case 0: /* AND */
+ gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_and, 0);
+ return;
+ case 1: /* BIC */
+ gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_andc, 0);
+ return;
+ case 2: /* ORR */
+ if (rn == rm) { /* MOV */
+ gen_gvec_fn2(s, is_q, rd, rn, tcg_gen_gvec_mov, 0);
} else {
- if (size != 0) {
- /* B* ops need res loaded to operate on */
- read_vec_element(s, tcg_res[pass], rd, pass, MO_64);
- }
-
- switch (size) {
- case 0: /* EOR */
- tcg_gen_xor_i64(tcg_res[pass], tcg_op1, tcg_op2);
- break;
- case 1: /* BSL bitwise select */
- tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_op2);
- tcg_gen_and_i64(tcg_op1, tcg_op1, tcg_res[pass]);
- tcg_gen_xor_i64(tcg_res[pass], tcg_op2, tcg_op1);
- break;
- case 2: /* BIT, bitwise insert if true */
- tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_res[pass]);
- tcg_gen_and_i64(tcg_op1, tcg_op1, tcg_op2);
- tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1);
- break;
- case 3: /* BIF, bitwise insert if false */
- tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_res[pass]);
- tcg_gen_andc_i64(tcg_op1, tcg_op1, tcg_op2);
- tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1);
- break;
- }
+ gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_or, 0);
}
- }
+ return;
+ case 3: /* ORN */
+ gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_orc, 0);
+ return;
+ case 4: /* EOR */
+ gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_xor, 0);
+ return;
- write_vec_element(s, tcg_res[0], rd, 0, MO_64);
- if (!is_q) {
- tcg_gen_movi_i64(tcg_res[1], 0);
- }
- write_vec_element(s, tcg_res[1], rd, 1, MO_64);
+ case 5: /* BSL bitwise select */
+ gen_gvec_op3(s, is_q, rd, rn, rm, &bsl_op);
+ return;
+ case 6: /* BIT, bitwise insert if true */
+ gen_gvec_op3(s, is_q, rd, rn, rm, &bit_op);
+ return;
+ case 7: /* BIF, bitwise insert if false */
+ gen_gvec_op3(s, is_q, rd, rn, rm, &bif_op);
+ return;
- tcg_temp_free_i64(tcg_op1);
- tcg_temp_free_i64(tcg_op2);
- tcg_temp_free_i64(tcg_res[0]);
- tcg_temp_free_i64(tcg_res[1]);
+ default:
+ g_assert_not_reached();
+ }
}
/* Helper functions for 32 bit comparisons */
@@ -9400,9 +9731,131 @@ static void disas_simd_3same_float(DisasContext *s, uint32_t insn)
}
}
+static void gen_mla8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{ /* MLA expander used for MO_8 (mla_op[]): neon mul_u8 of a and b, then add_u8 into d; 'a' is clobbered. */
+ gen_helper_neon_mul_u8(a, a, b);
+ gen_helper_neon_add_u8(d, d, a);
+}
+
+static void gen_mla16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{ /* MLA expander used for MO_16 (mla_op[]): neon mul_u16 of a and b, then add_u16 into d; 'a' is clobbered. */
+ gen_helper_neon_mul_u16(a, a, b);
+ gen_helper_neon_add_u16(d, d, a);
+}
+
+static void gen_mla32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{ /* MLA on one 32-bit element: d += a * b; 'a' is clobbered with the product. */
+ tcg_gen_mul_i32(a, a, b);
+ tcg_gen_add_i32(d, d, a);
+}
+
+static void gen_mla64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{ /* MLA on one 64-bit element: d += a * b; 'a' is clobbered with the product. */
+ tcg_gen_mul_i64(a, a, b);
+ tcg_gen_add_i64(d, d, a);
+}
+
+static void gen_mla_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
+{ /* Vector MLA: d += a * b per element of size vece; 'a' is clobbered with the product. */
+ tcg_gen_mul_vec(vece, a, a, b);
+ tcg_gen_add_vec(vece, d, d, a);
+}
+
+static void gen_mls8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{ /* MLS expander used for MO_8 (mls_op[]): neon mul_u8 of a and b, then sub_u8 from d; 'a' is clobbered. */
+ gen_helper_neon_mul_u8(a, a, b);
+ gen_helper_neon_sub_u8(d, d, a);
+}
+
+static void gen_mls16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{ /* MLS expander used for MO_16 (mls_op[]): neon mul_u16 of a and b, then sub_u16 from d; 'a' is clobbered. */
+ gen_helper_neon_mul_u16(a, a, b);
+ gen_helper_neon_sub_u16(d, d, a);
+}
+
+static void gen_mls32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{ /* MLS on one 32-bit element: d -= a * b; 'a' is clobbered with the product. */
+ tcg_gen_mul_i32(a, a, b);
+ tcg_gen_sub_i32(d, d, a);
+}
+
+static void gen_mls64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{ /* MLS on one 64-bit element: d -= a * b; 'a' is clobbered with the product. */
+ tcg_gen_mul_i64(a, a, b);
+ tcg_gen_sub_i64(d, d, a);
+}
+
+static void gen_mls_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b)
+{ /* Vector MLS: d -= a * b per element of size vece; 'a' is clobbered with the product. */
+ tcg_gen_mul_vec(vece, a, a, b);
+ tcg_gen_sub_vec(vece, d, d, a);
+}
+
/* Integer op subgroup of C3.6.16. */
static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
{
+ static const GVecGen3 cmtst_op[4] = {
+ { .fni4 = gen_helper_neon_tst_u8,
+ .fniv = gen_cmtst_vec,
+ .vece = MO_8 },
+ { .fni4 = gen_helper_neon_tst_u16,
+ .fniv = gen_cmtst_vec,
+ .vece = MO_16 },
+ { .fni4 = gen_cmtst_i32,
+ .fniv = gen_cmtst_vec,
+ .vece = MO_32 },
+ { .fni8 = gen_cmtst_i64,
+ .fniv = gen_cmtst_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .vece = MO_64 },
+ };
+ static const GVecGen3 mla_op[4] = {
+ { .fni4 = gen_mla8_i32,
+ .fniv = gen_mla_vec,
+ .opc = INDEX_op_mul_vec,
+ .load_dest = true,
+ .vece = MO_8 },
+ { .fni4 = gen_mla16_i32,
+ .fniv = gen_mla_vec,
+ .opc = INDEX_op_mul_vec,
+ .load_dest = true,
+ .vece = MO_16 },
+ { .fni4 = gen_mla32_i32,
+ .fniv = gen_mla_vec,
+ .opc = INDEX_op_mul_vec,
+ .load_dest = true,
+ .vece = MO_32 },
+ { .fni8 = gen_mla64_i64,
+ .fniv = gen_mla_vec,
+ .opc = INDEX_op_mul_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .load_dest = true,
+ .vece = MO_64 },
+ };
+ static const GVecGen3 mls_op[4] = {
+ { .fni4 = gen_mls8_i32,
+ .fniv = gen_mls_vec,
+ .opc = INDEX_op_mul_vec,
+ .load_dest = true,
+ .vece = MO_8 },
+ { .fni4 = gen_mls16_i32,
+ .fniv = gen_mls_vec,
+ .opc = INDEX_op_mul_vec,
+ .load_dest = true,
+ .vece = MO_16 },
+ { .fni4 = gen_mls32_i32,
+ .fniv = gen_mls_vec,
+ .opc = INDEX_op_mul_vec,
+ .load_dest = true,
+ .vece = MO_32 },
+ { .fni8 = gen_mls64_i64,
+ .fniv = gen_mls_vec,
+ .opc = INDEX_op_mul_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .load_dest = true,
+ .vece = MO_64 },
+ };
+
int is_q = extract32(insn, 30, 1);
int u = extract32(insn, 29, 1);
int size = extract32(insn, 22, 2);
@@ -9411,6 +9864,7 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
int rn = extract32(insn, 5, 5);
int rd = extract32(insn, 0, 5);
int pass;
+ TCGCond cond;
switch (opcode) {
case 0x13: /* MUL, PMUL */
@@ -9450,6 +9904,48 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
return;
}
+ switch (opcode) {
+ case 0x10: /* ADD, SUB */
+ if (u) {
+ gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_sub, size);
+ } else {
+ gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_add, size);
+ }
+ return;
+ case 0x13: /* MUL, PMUL */
+ if (!u) { /* MUL */
+ gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_mul, size);
+ return;
+ }
+ break;
+ case 0x12: /* MLA, MLS */
+ if (u) {
+ gen_gvec_op3(s, is_q, rd, rn, rm, &mls_op[size]);
+ } else {
+ gen_gvec_op3(s, is_q, rd, rn, rm, &mla_op[size]);
+ }
+ return;
+ case 0x11:
+ if (!u) { /* CMTST */
+ gen_gvec_op3(s, is_q, rd, rn, rm, &cmtst_op[size]);
+ return;
+ }
+ /* else CMEQ */
+ cond = TCG_COND_EQ;
+ goto do_gvec_cmp;
+ case 0x06: /* CMGT, CMHI */
+ cond = u ? TCG_COND_GTU : TCG_COND_GT;
+ goto do_gvec_cmp;
+ case 0x07: /* CMGE, CMHS */
+ cond = u ? TCG_COND_GEU : TCG_COND_GE;
+ do_gvec_cmp:
+ tcg_gen_gvec_cmp(cond, size, vec_full_reg_offset(s, rd),
+ vec_full_reg_offset(s, rn),
+ vec_full_reg_offset(s, rm),
+ is_q ? 16 : 8, vec_full_reg_size(s));
+ return;
+ }
+
if (size == 3) {
assert(is_q);
for (pass = 0; pass < 2; pass++) {
@@ -9530,26 +10026,6 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
genenvfn = fns[size][u];
break;
}
- case 0x6: /* CMGT, CMHI */
- {
- static NeonGenTwoOpFn * const fns[3][2] = {
- { gen_helper_neon_cgt_s8, gen_helper_neon_cgt_u8 },
- { gen_helper_neon_cgt_s16, gen_helper_neon_cgt_u16 },
- { gen_helper_neon_cgt_s32, gen_helper_neon_cgt_u32 },
- };
- genfn = fns[size][u];
- break;
- }
- case 0x7: /* CMGE, CMHS */
- {
- static NeonGenTwoOpFn * const fns[3][2] = {
- { gen_helper_neon_cge_s8, gen_helper_neon_cge_u8 },
- { gen_helper_neon_cge_s16, gen_helper_neon_cge_u16 },
- { gen_helper_neon_cge_s32, gen_helper_neon_cge_u32 },
- };
- genfn = fns[size][u];
- break;
- }
case 0x8: /* SSHL, USHL */
{
static NeonGenTwoOpFn * const fns[3][2] = {
@@ -9622,44 +10098,11 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
genfn = fns[size][u];
break;
}
- case 0x10: /* ADD, SUB */
- {
- static NeonGenTwoOpFn * const fns[3][2] = {
- { gen_helper_neon_add_u8, gen_helper_neon_sub_u8 },
- { gen_helper_neon_add_u16, gen_helper_neon_sub_u16 },
- { tcg_gen_add_i32, tcg_gen_sub_i32 },
- };
- genfn = fns[size][u];
- break;
- }
- case 0x11: /* CMTST, CMEQ */
- {
- static NeonGenTwoOpFn * const fns[3][2] = {
- { gen_helper_neon_tst_u8, gen_helper_neon_ceq_u8 },
- { gen_helper_neon_tst_u16, gen_helper_neon_ceq_u16 },
- { gen_helper_neon_tst_u32, gen_helper_neon_ceq_u32 },
- };
- genfn = fns[size][u];
- break;
- }
case 0x13: /* MUL, PMUL */
- if (u) {
- /* PMUL */
- assert(size == 0);
- genfn = gen_helper_neon_mul_p8;
- break;
- }
- /* fall through : MUL */
- case 0x12: /* MLA, MLS */
- {
- static NeonGenTwoOpFn * const fns[3] = {
- gen_helper_neon_mul_u8,
- gen_helper_neon_mul_u16,
- tcg_gen_mul_i32,
- };
- genfn = fns[size];
+ assert(u); /* PMUL */
+ assert(size == 0);
+ genfn = gen_helper_neon_mul_p8;
break;
- }
case 0x16: /* SQDMULH, SQRDMULH */
{
static NeonGenTwoOpEnvFn * const fns[2][2] = {
@@ -9680,18 +10123,16 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
genfn(tcg_res, tcg_op1, tcg_op2);
}
- if (opcode == 0xf || opcode == 0x12) {
- /* SABA, UABA, MLA, MLS: accumulating ops */
- static NeonGenTwoOpFn * const fns[3][2] = {
- { gen_helper_neon_add_u8, gen_helper_neon_sub_u8 },
- { gen_helper_neon_add_u16, gen_helper_neon_sub_u16 },
- { tcg_gen_add_i32, tcg_gen_sub_i32 },
+ if (opcode == 0xf) {
+ /* SABA, UABA: accumulating ops */
+ static NeonGenTwoOpFn * const fns[3] = {
+ gen_helper_neon_add_u8,
+ gen_helper_neon_add_u16,
+ tcg_gen_add_i32,
};
- bool is_sub = (opcode == 0x12 && u); /* MLS */
- genfn = fns[size][is_sub];
read_vec_element_i32(s, tcg_op1, rd, pass, MO_32);
- genfn(tcg_res, tcg_op1, tcg_res);
+ fns[size](tcg_res, tcg_op1, tcg_res);
}
write_vec_element_i32(s, tcg_res, rd, pass, MO_32);
@@ -10003,8 +10444,7 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
return;
case 0x5: /* CNT, NOT, RBIT */
if (u && size == 0) {
- /* NOT: adjust size so we can use the 64-bits-at-a-time loop. */
- size = 3;
+ /* NOT */
break;
} else if (u && size == 1) {
/* RBIT */
@@ -10256,6 +10696,21 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn)
tcg_rmode = NULL;
}
+ switch (opcode) {
+ case 0x5:
+ if (u && size == 0) { /* NOT */
+ gen_gvec_fn2(s, is_q, rd, rn, tcg_gen_gvec_not, 0);
+ return;
+ }
+ break;
+ case 0xb:
+ if (u) { /* NEG */
+ gen_gvec_fn2(s, is_q, rd, rn, tcg_gen_gvec_neg, size);
+ return;
+ }
+ break;
+ }
+
if (size == 3) {
/* All 64-bit element operations can be shared with scalar 2misc */
int pass;
diff --git a/tcg/README b/tcg/README
index 03bfb6acd4..bb2ea5121b 100644
--- a/tcg/README
+++ b/tcg/README
@@ -503,6 +503,92 @@ of the memory access.
For a 32-bit host, qemu_ld/st_i64 is guaranteed to only be used with a
64-bit memory access specified in flags.
+********* Host vector operations
+
+All of the vector ops have two parameters, TCGOP_VECL & TCGOP_VECE.
+The former specifies the length of the vector in log2 64-bit units; the
+latter specifies the length of the element (if applicable) in log2 8-bit units.
+E.g. VECL=1 -> 64 << 1 -> v128, and VECE=2 -> 1 << 2 -> i32.
+
+* mov_vec v0, v1
+* ld_vec v0, t1
+* st_vec v0, t1
+
+ Move, load and store.
+
+* dup_vec v0, r1
+
+ Duplicate the low N bits of R1 into VECL/VECE copies across V0.
+
+* dupi_vec v0, c
+
+ Similarly, for a constant.
+ Smaller values will be replicated to host register size by the expanders.
+
+* dup2_vec v0, r1, r2
+
+ Duplicate r2:r1 into VECL/64 copies across V0. This opcode is
+ only present for 32-bit hosts.
+
+* add_vec v0, v1, v2
+
+ v0 = v1 + v2, in elements across the vector.
+
+* sub_vec v0, v1, v2
+
+ Similarly, v0 = v1 - v2.
+
+* mul_vec v0, v1, v2
+
+ Similarly, v0 = v1 * v2.
+
+* neg_vec v0, v1
+
+ Similarly, v0 = -v1.
+
+* and_vec v0, v1, v2
+* or_vec v0, v1, v2
+* xor_vec v0, v1, v2
+* andc_vec v0, v1, v2
+* orc_vec v0, v1, v2
+* not_vec v0, v1
+
+ Similarly, logical operations with and without complement.
+ Note that VECE is unused.
+
+* shli_vec v0, v1, i2
+* shls_vec v0, v1, s2
+
+ Shift all elements from v1 by a scalar i2/s2. I.e.
+
+ for (i = 0; i < VECL/VECE; ++i) {
+ v0[i] = v1[i] << s2;
+ }
+
+* shri_vec v0, v1, i2
+* sari_vec v0, v1, i2
+* shrs_vec v0, v1, s2
+* sars_vec v0, v1, s2
+
+ Similarly for logical and arithmetic right shift.
+
+* shlv_vec v0, v1, v2
+
+ Shift elements from v1 by elements from v2. I.e.
+
+ for (i = 0; i < VECL/VECE; ++i) {
+ v0[i] = v1[i] << v2[i];
+ }
+
+* shrv_vec v0, v1, v2
+* sarv_vec v0, v1, v2
+
+ Similarly for logical and arithmetic right shift.
+
+* cmp_vec v0, v1, v2, cond
+
+ Compare vectors by element, storing -1 for true and 0 for false.
+
*********
Note 1: Some shortcuts are defined when the last operand is known to be
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index c2525066ab..9aea1d1771 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -31,13 +31,22 @@ typedef enum {
TCG_REG_SP = 31,
TCG_REG_XZR = 31,
+ TCG_REG_V0 = 32, TCG_REG_V1, TCG_REG_V2, TCG_REG_V3,
+ TCG_REG_V4, TCG_REG_V5, TCG_REG_V6, TCG_REG_V7,
+ TCG_REG_V8, TCG_REG_V9, TCG_REG_V10, TCG_REG_V11,
+ TCG_REG_V12, TCG_REG_V13, TCG_REG_V14, TCG_REG_V15,
+ TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19,
+ TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23,
+ TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27,
+ TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31,
+
/* Aliases. */
TCG_REG_FP = TCG_REG_X29,
TCG_REG_LR = TCG_REG_X30,
TCG_AREG0 = TCG_REG_X19,
} TCGReg;
-#define TCG_TARGET_NB_REGS 32
+#define TCG_TARGET_NB_REGS 64
/* used for function call generation */
#define TCG_REG_CALL_STACK TCG_REG_SP
@@ -113,6 +122,20 @@ typedef enum {
#define TCG_TARGET_HAS_mulsh_i64 1
#define TCG_TARGET_HAS_direct_jump 1
+#define TCG_TARGET_HAS_v64 1
+#define TCG_TARGET_HAS_v128 1
+#define TCG_TARGET_HAS_v256 0
+
+#define TCG_TARGET_HAS_andc_vec 1
+#define TCG_TARGET_HAS_orc_vec 1
+#define TCG_TARGET_HAS_not_vec 1
+#define TCG_TARGET_HAS_neg_vec 1
+#define TCG_TARGET_HAS_shi_vec 1
+#define TCG_TARGET_HAS_shs_vec 0
+#define TCG_TARGET_HAS_shv_vec 0
+#define TCG_TARGET_HAS_cmp_vec 1
+#define TCG_TARGET_HAS_mul_vec 1
+
#define TCG_TARGET_DEFAULT_MO (0)
static inline void flush_icache_range(uintptr_t start, uintptr_t stop)
diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c
index 150530f30e..be3192078d 100644
--- a/tcg/aarch64/tcg-target.inc.c
+++ b/tcg/aarch64/tcg-target.inc.c
@@ -20,10 +20,15 @@ QEMU_BUILD_BUG_ON(TCG_TYPE_I32 != 0 || TCG_TYPE_I64 != 1);
#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
- "%x0", "%x1", "%x2", "%x3", "%x4", "%x5", "%x6", "%x7",
- "%x8", "%x9", "%x10", "%x11", "%x12", "%x13", "%x14", "%x15",
- "%x16", "%x17", "%x18", "%x19", "%x20", "%x21", "%x22", "%x23",
- "%x24", "%x25", "%x26", "%x27", "%x28", "%fp", "%x30", "%sp",
+ /* Debug names, indexed by TCGReg. Entries 0-31 are the general
+ registers; x29 is printed as "fp" and register 31 as "sp". */
+ "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
+ "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
+ "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
+ "x24", "x25", "x26", "x27", "x28", "fp", "x30", "sp",
+
+ /* Entries 32-63 are the AdvSIMD registers v0-v31. Only x29 is
+ aliased to the frame pointer, so entry 61 must read "v29". */
+ "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+ "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
};
#endif /* CONFIG_DEBUG_TCG */
@@ -43,6 +48,14 @@ static const int tcg_target_reg_alloc_order[] = {
/* X19 reserved for AREG0 */
/* X29 reserved as fp */
/* X30 reserved as temporary */
+
+ TCG_REG_V0, TCG_REG_V1, TCG_REG_V2, TCG_REG_V3,
+ TCG_REG_V4, TCG_REG_V5, TCG_REG_V6, TCG_REG_V7,
+ /* V8 - V15 are call-saved, and skipped. */
+ TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19,
+ TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23,
+ TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27,
+ TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31,
};
static const int tcg_target_call_iarg_regs[8] = {
@@ -54,6 +67,7 @@ static const int tcg_target_call_oarg_regs[1] = {
};
#define TCG_REG_TMP TCG_REG_X30
+#define TCG_VEC_TMP TCG_REG_V31
#ifndef CONFIG_SOFTMMU
/* Note that XZR cannot be encoded in the address base register slot,
@@ -119,9 +133,13 @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
const char *ct_str, TCGType type)
{
switch (*ct_str++) {
- case 'r':
+ case 'r': /* general registers */
ct->ct |= TCG_CT_REG;
- ct->u.regs = 0xffffffffu;
+ ct->u.regs |= 0xffffffffu;
+ break;
+ case 'w': /* advsimd registers */
+ ct->ct |= TCG_CT_REG;
+ ct->u.regs |= 0xffffffff00000000ull;
break;
case 'l': /* qemu_ld / qemu_st address, data_reg */
ct->ct |= TCG_CT_REG;
@@ -153,11 +171,13 @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
return ct_str;
}
+/* Match a constant valid for addition (12-bit, optionally shifted):
+ * true when every set bit of VAL lies within bits [11:0], or when
+ * every set bit lies within bits [23:12]. */
static inline bool is_aimm(uint64_t val)
{
return (val & ~0xfff) == 0 || (val & ~0xfff000) == 0;
}
+/* Match a constant valid for logical operations. */
static inline bool is_limm(uint64_t val)
{
/* Taking a simplified view of the logical immediates for now, ignoring
@@ -178,6 +198,106 @@ static inline bool is_limm(uint64_t val)
return (val & (val - 1)) == 0;
}
+/* Match a constant that is valid for vectors.
+ * Returns true when V64 fits one of the patterns tested below and then
+ * fills in *OP, *CMODE and *IMM8, which tcg_out_dupi_vec forwards to the
+ * MOVI (AdvSIMD modified immediate) emitter. Returns false otherwise.
+ * *OP is always written (0 unless a later pattern sets it to 1);
+ * *CMODE and *IMM8 are only written when a pattern matches. */
+static bool is_fimm(uint64_t v64, int *op, int *cmode, int *imm8)
+{
+ int i;
+
+ *op = 0;
+ /* Match replication across 8 bits. */
+ if (v64 == dup_const(MO_8, v64)) {
+ *cmode = 0xe;
+ *imm8 = v64 & 0xff;
+ return true;
+ }
+ /* Match replication across 16 bits. */
+ if (v64 == dup_const(MO_16, v64)) {
+ uint16_t v16 = v64;
+
+ if (v16 == (v16 & 0xff)) { /* only the low byte is non-zero */
+ *cmode = 0x8;
+ *imm8 = v16 & 0xff;
+ return true;
+ } else if (v16 == (v16 & 0xff00)) { /* only the high byte is non-zero */
+ *cmode = 0xa;
+ *imm8 = v16 >> 8;
+ return true;
+ }
+ }
+ /* Match replication across 32 bits. */
+ if (v64 == dup_const(MO_32, v64)) {
+ uint32_t v32 = v64;
+
+ if (v32 == (v32 & 0xff)) { /* one non-zero byte, at bits [7:0] */
+ *cmode = 0x0;
+ *imm8 = v32 & 0xff;
+ return true;
+ } else if (v32 == (v32 & 0xff00)) { /* ... at bits [15:8] */
+ *cmode = 0x2;
+ *imm8 = (v32 >> 8) & 0xff;
+ return true;
+ } else if (v32 == (v32 & 0xff0000)) { /* ... at bits [23:16] */
+ *cmode = 0x4;
+ *imm8 = (v32 >> 16) & 0xff;
+ return true;
+ } else if (v32 == (v32 & 0xff000000)) { /* ... at bits [31:24] */
+ *cmode = 0x6;
+ *imm8 = v32 >> 24;
+ return true;
+ } else if ((v32 & 0xffff00ff) == 0xff) { /* value is 0x0000XXff */
+ *cmode = 0xc;
+ *imm8 = (v32 >> 8) & 0xff;
+ return true;
+ } else if ((v32 & 0xff00ffff) == 0xffff) { /* value is 0x00XXffff */
+ *cmode = 0xd;
+ *imm8 = (v32 >> 16) & 0xff;
+ return true;
+ }
+ /* Match forms of a float32: low 19 bits zero and bits [30:25]
+ * equal to 0x20 or 0x1f. imm8 packs bit 31, bit 25 and
+ * bits [24:19]. */
+ if (extract32(v32, 0, 19) == 0
+ && (extract32(v32, 25, 6) == 0x20
+ || extract32(v32, 25, 6) == 0x1f)) {
+ *cmode = 0xf;
+ *imm8 = (extract32(v32, 31, 1) << 7)
+ | (extract32(v32, 25, 1) << 6)
+ | extract32(v32, 19, 6);
+ return true;
+ }
+ }
+ /* Match forms of a float64: low 48 bits zero and bits [62:54]
+ * equal to 0x100 or 0x0ff. imm8 packs bit 63, bit 54 and
+ * bits [53:48]. */
+ if (extract64(v64, 0, 48) == 0
+ && (extract64(v64, 54, 9) == 0x100
+ || extract64(v64, 54, 9) == 0x0ff)) {
+ *cmode = 0xf;
+ *op = 1;
+ *imm8 = (extract64(v64, 63, 1) << 7)
+ | (extract64(v64, 54, 1) << 6)
+ | extract64(v64, 48, 6);
+ return true;
+ }
+ /* Match bytes of 0x00 and 0xff. */
+ for (i = 0; i < 64; i += 8) {
+ uint64_t byte = extract64(v64, i, 8);
+ if (byte != 0 && byte != 0xff) {
+ break;
+ }
+ }
+ if (i == 64) { /* loop ran to completion: every byte is 0x00 or 0xff */
+ *cmode = 0xe;
+ *op = 1;
+ /* One imm8 bit per byte of v64, taken from that byte's low bit. */
+ *imm8 = (extract64(v64, 0, 1) << 0)
+ | (extract64(v64, 8, 1) << 1)
+ | (extract64(v64, 16, 1) << 2)
+ | (extract64(v64, 24, 1) << 3)
+ | (extract64(v64, 32, 1) << 4)
+ | (extract64(v64, 40, 1) << 5)
+ | (extract64(v64, 48, 1) << 6)
+ | (extract64(v64, 56, 1) << 7);
+ return true;
+ }
+ return false;
+}
+
static int tcg_target_const_match(tcg_target_long val, TCGType type,
const TCGArgConstraint *arg_ct)
{
@@ -271,6 +391,9 @@ typedef enum {
/* Load literal for loading the address at pc-relative offset */
I3305_LDR = 0x58000000,
+ I3305_LDR_v64 = 0x5c000000,
+ I3305_LDR_v128 = 0x9c000000,
+
/* Load/store register. Described here as 3.3.12, but the helper
that emits them can transform to 3.3.10 or 3.3.13. */
I3312_STRB = 0x38000000 | LDST_ST << 22 | MO_8 << 30,
@@ -290,6 +413,15 @@ typedef enum {
I3312_LDRSHX = 0x38000000 | LDST_LD_S_X << 22 | MO_16 << 30,
I3312_LDRSWX = 0x38000000 | LDST_LD_S_X << 22 | MO_32 << 30,
+ I3312_LDRVS = 0x3c000000 | LDST_LD << 22 | MO_32 << 30,
+ I3312_STRVS = 0x3c000000 | LDST_ST << 22 | MO_32 << 30,
+
+ I3312_LDRVD = 0x3c000000 | LDST_LD << 22 | MO_64 << 30,
+ I3312_STRVD = 0x3c000000 | LDST_ST << 22 | MO_64 << 30,
+
+ I3312_LDRVQ = 0x3c000000 | 3 << 22 | 0 << 30,
+ I3312_STRVQ = 0x3c000000 | 2 << 22 | 0 << 30,
+
I3312_TO_I3310 = 0x00200800,
I3312_TO_I3313 = 0x01000000,
@@ -374,8 +506,48 @@ typedef enum {
I3510_EON = 0x4a200000,
I3510_ANDS = 0x6a000000,
- NOP = 0xd503201f,
+ /* AdvSIMD copy */
+ I3605_DUP = 0x0e000400,
+ I3605_INS = 0x4e001c00,
+ I3605_UMOV = 0x0e003c00,
+
+ /* AdvSIMD modified immediate */
+ I3606_MOVI = 0x0f000400,
+
+ /* AdvSIMD shift by immediate */
+ I3614_SSHR = 0x0f000400,
+ I3614_SSRA = 0x0f001400,
+ I3614_SHL = 0x0f005400,
+ I3614_USHR = 0x2f000400,
+ I3614_USRA = 0x2f001400,
+
+ /* AdvSIMD three same. */
+ I3616_ADD = 0x0e208400,
+ I3616_AND = 0x0e201c00,
+ I3616_BIC = 0x0e601c00,
+ I3616_EOR = 0x2e201c00,
+ I3616_MUL = 0x0e209c00,
+ I3616_ORR = 0x0ea01c00,
+ I3616_ORN = 0x0ee01c00,
+ I3616_SUB = 0x2e208400,
+ I3616_CMGT = 0x0e203400,
+ I3616_CMGE = 0x0e203c00,
+ I3616_CMTST = 0x0e208c00,
+ I3616_CMHI = 0x2e203400,
+ I3616_CMHS = 0x2e203c00,
+ I3616_CMEQ = 0x2e208c00,
+
+ /* AdvSIMD two-reg misc. */
+ I3617_CMGT0 = 0x0e208800,
+ I3617_CMEQ0 = 0x0e209800,
+ I3617_CMLT0 = 0x0e20a800,
+ I3617_CMGE0 = 0x2e208800,
+ I3617_CMLE0 = 0x2e20a800,
+ I3617_NOT = 0x2e205800,
+ I3617_NEG = 0x2e20b800,
+
/* System instructions. */
+ NOP = 0xd503201f,
DMB_ISH = 0xd50338bf,
DMB_LD = 0x00000100,
DMB_ST = 0x00000200,
@@ -520,26 +692,64 @@ static void tcg_out_insn_3509(TCGContext *s, AArch64Insn insn, TCGType ext,
tcg_out32(s, insn | ext << 31 | rm << 16 | ra << 10 | rn << 5 | rd);
}
+static void tcg_out_insn_3605(TCGContext *s, AArch64Insn insn, bool q,
+ TCGReg rd, TCGReg rn, int dst_idx, int src_idx)
+{
+ /* Emit one instruction of the "AdvSIMD copy" group (the I3605_*
+ opcodes: DUP, INS, UMOV). Q goes to bit 30, DST_IDX to bit 16
+ upward and SRC_IDX to bit 11 upward; RD and RN are reduced to
+ their low five bits.
+ Note that bit 11 set means general register input. Therefore
+ we can handle both register sets with one function:
+ (~rn & 0x20) << 6 sets bit 11 exactly when bit 5 of RN is clear,
+ i.e. when RN is below TCG_REG_V0 (32). */
+ tcg_out32(s, insn | q << 30 | (dst_idx << 16) | (src_idx << 11)
+ | (rd & 0x1f) | (~rn & 0x20) << 6 | (rn & 0x1f) << 5);
+}
+
+static void tcg_out_insn_3606(TCGContext *s, AArch64Insn insn, bool q,
+ TCGReg rd, bool op, int cmode, uint8_t imm8)
+{ /* Emit an "AdvSIMD modified immediate" instruction (I3606_MOVI).
+ * Q goes to bit 30, OP to bit 29 and CMODE to bit 12 upward. IMM8 is
+ * split: its top three bits land at bits [18:16] and its low five
+ * bits at bits [9:5]. RD is reduced to its low five bits. */
+ tcg_out32(s, insn | q << 30 | op << 29 | cmode << 12 | (rd & 0x1f)
+ | (imm8 & 0xe0) << (16 - 5) | (imm8 & 0x1f) << 5);
+}
+
+static void tcg_out_insn_3614(TCGContext *s, AArch64Insn insn, bool q,
+ TCGReg rd, TCGReg rn, unsigned immhb)
+{ /* Emit an "AdvSIMD shift by immediate" instruction (the I3614_*
+ * opcodes). Q goes to bit 30 and IMMHB, the combined shift/size
+ * immediate, to bit 16 upward; RN and RD are reduced to their low
+ * five bits. */
+ tcg_out32(s, insn | q << 30 | immhb << 16
+ | (rn & 0x1f) << 5 | (rd & 0x1f));
+}
+
+static void tcg_out_insn_3616(TCGContext *s, AArch64Insn insn, bool q,
+ unsigned size, TCGReg rd, TCGReg rn, TCGReg rm)
+{ /* Emit an "AdvSIMD three same" instruction (the I3616_* opcodes).
+ * Q goes to bit 30, SIZE to bit 22 upward, RM to bit 16 upward;
+ * all three registers are reduced to their low five bits. */
+ tcg_out32(s, insn | q << 30 | (size << 22) | (rm & 0x1f) << 16
+ | (rn & 0x1f) << 5 | (rd & 0x1f));
+}
+
+static void tcg_out_insn_3617(TCGContext *s, AArch64Insn insn, bool q,
+ unsigned size, TCGReg rd, TCGReg rn)
+{ /* Emit an "AdvSIMD two-reg misc" instruction (the I3617_* opcodes).
+ * Q goes to bit 30 and SIZE to bit 22 upward; RN and RD are reduced
+ * to their low five bits. */
+ tcg_out32(s, insn | q << 30 | (size << 22)
+ | (rn & 0x1f) << 5 | (rd & 0x1f));
+}
+
static void tcg_out_insn_3310(TCGContext *s, AArch64Insn insn,
TCGReg rd, TCGReg base, TCGType ext,
TCGReg regoff)
{
/* Note the AArch64Insn constants above are for C3.3.12. Adjust.
   The new version masks RD to five bits, so a TCGReg numbered 32 or
   above (the vector registers) encodes as its register number. */
tcg_out32(s, insn | I3312_TO_I3310 | regoff << 16 |
- 0x4000 | ext << 13 | base << 5 | rd);
+ 0x4000 | ext << 13 | base << 5 | (rd & 0x1f));
}
static void tcg_out_insn_3312(TCGContext *s, AArch64Insn insn,
TCGReg rd, TCGReg rn, intptr_t offset)
{
- tcg_out32(s, insn | (offset & 0x1ff) << 12 | rn << 5 | rd);
+ tcg_out32(s, insn | (offset & 0x1ff) << 12 | rn << 5 | (rd & 0x1f)); /* RD masked to five bits so vector registers (32 and above) encode correctly; OFFSET is truncated to nine bits */
}
static void tcg_out_insn_3313(TCGContext *s, AArch64Insn insn,
TCGReg rd, TCGReg rn, uintptr_t scaled_uimm)
{
/* Note the AArch64Insn constants above are for C3.3.12. Adjust.
   SCALED_UIMM is placed at bit 10 upward; the new version masks RD to
   five bits so vector registers (32 and above) encode correctly. */
- tcg_out32(s, insn | I3312_TO_I3313 | scaled_uimm << 10 | rn << 5 | rd);
+ tcg_out32(s, insn | I3312_TO_I3313 | scaled_uimm << 10
+ | rn << 5 | (rd & 0x1f));
}
/* Register to register move using ORR (shifted register with no shift). */
@@ -585,6 +795,22 @@ static void tcg_out_logicali(TCGContext *s, AArch64Insn insn, TCGType ext,
tcg_out_insn_3404(s, insn, ext, rd, rn, ext, r, c);
}
+static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
+ TCGReg rd, uint64_t v64)
+{ /* Load the 64-bit constant pattern V64 into vector register RD.
+ * A single MOVI is used when is_fimm can encode the pattern;
+ * otherwise the value goes in the constant pool and is fetched
+ * with a load-literal instruction. */
+ int op, cmode, imm8;
+
+ if (is_fimm(v64, &op, &cmode, &imm8)) {
+ tcg_out_insn(s, 3606, MOVI, type == TCG_TYPE_V128, rd, op, cmode, imm8);
+ } else if (type == TCG_TYPE_V128) {
+ /* 128-bit vector: the pool entry holds V64 twice. */
+ new_pool_l2(s, R_AARCH64_CONDBR19, s->code_ptr, 0, v64, v64);
+ tcg_out_insn(s, 3305, LDR_v128, 0, rd);
+ } else {
+ /* Any other type: a single 64-bit pool entry. */
+ new_pool_label(s, v64, R_AARCH64_CONDBR19, s->code_ptr, 0);
+ tcg_out_insn(s, 3305, LDR_v64, 0, rd);
+ }
+}
+
static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
tcg_target_long value)
{
@@ -594,6 +820,22 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
int s0, s1;
AArch64Insn opc;
+ switch (type) {
+ case TCG_TYPE_I32:
+ case TCG_TYPE_I64:
+ tcg_debug_assert(rd < 32);
+ break;
+
+ case TCG_TYPE_V64:
+ case TCG_TYPE_V128:
+ tcg_debug_assert(rd >= 32);
+ tcg_out_dupi_vec(s, type, rd, value);
+ return;
+
+ default:
+ g_assert_not_reached();
+ }
+
/* For 32-bit values, discard potential garbage in value. For 64-bit
values within [2**31, 2**32-1], we can create smaller sequences by
interpreting this as a negative 32-bit number, while ensuring that
@@ -669,15 +911,13 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
/* Define something more legible for general use. */
#define tcg_out_ldst_r tcg_out_insn_3310
-static void tcg_out_ldst(TCGContext *s, AArch64Insn insn,
- TCGReg rd, TCGReg rn, intptr_t offset)
+static void tcg_out_ldst(TCGContext *s, AArch64Insn insn, TCGReg rd,
+ TCGReg rn, intptr_t offset, int lgsize)
{
- TCGMemOp size = (uint32_t)insn >> 30;
-
/* If the offset is naturally aligned and in range, then we can
use the scaled uimm12 encoding */
- if (offset >= 0 && !(offset & ((1 << size) - 1))) {
- uintptr_t scaled_uimm = offset >> size;
+ if (offset >= 0 && !(offset & ((1 << lgsize) - 1))) {
+ uintptr_t scaled_uimm = offset >> lgsize;
if (scaled_uimm <= 0xfff) {
tcg_out_insn_3313(s, insn, rd, rn, scaled_uimm);
return;
@@ -695,32 +935,102 @@ static void tcg_out_ldst(TCGContext *s, AArch64Insn insn,
tcg_out_ldst_r(s, insn, rd, rn, TCG_TYPE_I64, TCG_REG_TMP);
}
-static inline void tcg_out_mov(TCGContext *s,
- TCGType type, TCGReg ret, TCGReg arg)
+static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
- if (ret != arg) {
- tcg_out_movr(s, type, ret, arg);
+ if (ret == arg) { /* same register: nothing to emit */
+ return;
+ }
+ switch (type) { /* choose the move by value type and register class; registers below 32 are general, 32 and above are vector */
+ case TCG_TYPE_I32:
+ case TCG_TYPE_I64:
+ if (ret < 32 && arg < 32) { /* general -> general */
+ tcg_out_movr(s, type, ret, arg);
+ break;
+ } else if (ret < 32) { /* vector -> general */
+ tcg_out_insn(s, 3605, UMOV, type, ret, arg, 0, 0);
+ break;
+ } else if (arg < 32) { /* general -> vector; 4 << type is 4 for I32, 8 for I64 */
+ tcg_out_insn(s, 3605, INS, 0, ret, arg, 4 << type, 0);
+ break;
+ }
+ /* FALLTHRU: both registers are vector registers, so the integer
+ value is copied with the 64-bit vector ORR below. */
+
+ case TCG_TYPE_V64:
+ tcg_debug_assert(ret >= 32 && arg >= 32);
+ tcg_out_insn(s, 3616, ORR, 0, 0, ret, arg, arg); /* ORR of arg with itself acts as a move */
+ break;
+ case TCG_TYPE_V128:
+ tcg_debug_assert(ret >= 32 && arg >= 32);
+ tcg_out_insn(s, 3616, ORR, 1, 0, ret, arg, arg); /* same, with Q set for 128 bits */
+ break;
+
+ default:
+ g_assert_not_reached();
}
}
-static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg,
- TCGReg arg1, intptr_t arg2)
+static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
+ TCGReg base, intptr_t ofs)
{
- tcg_out_ldst(s, type == TCG_TYPE_I32 ? I3312_LDRW : I3312_LDRX,
- arg, arg1, arg2);
+ AArch64Insn insn;
+ int lgsz; /* log2 of the access size in bytes; tcg_out_ldst uses it to test alignment and scale the offset */
+
+ switch (type) { /* integer types use the vector-register form when RET is 32 or above */
+ case TCG_TYPE_I32:
+ insn = (ret < 32 ? I3312_LDRW : I3312_LDRVS);
+ lgsz = 2;
+ break;
+ case TCG_TYPE_I64:
+ insn = (ret < 32 ? I3312_LDRX : I3312_LDRVD);
+ lgsz = 3;
+ break;
+ case TCG_TYPE_V64:
+ insn = I3312_LDRVD;
+ lgsz = 3;
+ break;
+ case TCG_TYPE_V128:
+ insn = I3312_LDRVQ;
+ lgsz = 4;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ tcg_out_ldst(s, insn, ret, base, ofs, lgsz);
}
-static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
- TCGReg arg1, intptr_t arg2)
+static void tcg_out_st(TCGContext *s, TCGType type, TCGReg src,
+ TCGReg base, intptr_t ofs)
{
- tcg_out_ldst(s, type == TCG_TYPE_I32 ? I3312_STRW : I3312_STRX,
- arg, arg1, arg2);
+ AArch64Insn insn;
+ int lgsz; /* log2 of the access size in bytes; tcg_out_ldst uses it to test alignment and scale the offset */
+
+ switch (type) { /* integer types use the vector-register form when SRC is 32 or above */
+ case TCG_TYPE_I32:
+ insn = (src < 32 ? I3312_STRW : I3312_STRVS);
+ lgsz = 2;
+ break;
+ case TCG_TYPE_I64:
+ insn = (src < 32 ? I3312_STRX : I3312_STRVD);
+ lgsz = 3;
+ break;
+ case TCG_TYPE_V64:
+ insn = I3312_STRVD;
+ lgsz = 3;
+ break;
+ case TCG_TYPE_V128:
+ insn = I3312_STRVQ;
+ lgsz = 4;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ tcg_out_ldst(s, insn, src, base, ofs, lgsz);
}
static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
TCGReg base, intptr_t ofs)
{
- if (val == 0) {
+ if (type <= TCG_TYPE_I64 && val == 0) {
tcg_out_st(s, type, TCG_REG_XZR, base, ofs);
return true;
}
@@ -1210,14 +1520,15 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, TCGMemOp opc,
/* Merge "low bits" from tlb offset, load the tlb comparator into X0.
X0 = load [X2 + (tlb_offset & 0x000fff)] */
tcg_out_ldst(s, TARGET_LONG_BITS == 32 ? I3312_LDRW : I3312_LDRX,
- TCG_REG_X0, TCG_REG_X2, tlb_offset & 0xfff);
+ TCG_REG_X0, TCG_REG_X2, tlb_offset & 0xfff,
+ TARGET_LONG_BITS == 32 ? 2 : 3);
/* Load the tlb addend. Do that early to avoid stalling.
X1 = load [X2 + (tlb_offset & 0xfff) + offsetof(addend)] */
tcg_out_ldst(s, I3312_LDRX, TCG_REG_X1, TCG_REG_X2,
(tlb_offset & 0xfff) + (offsetof(CPUTLBEntry, addend)) -
(is_read ? offsetof(CPUTLBEntry, addr_read)
- : offsetof(CPUTLBEntry, addr_write)));
+ : offsetof(CPUTLBEntry, addr_write)), 3);
/* Perform the address comparison. */
tcg_out_cmp(s, (TARGET_LONG_BITS == 64), TCG_REG_X0, TCG_REG_X3, 0);
@@ -1435,49 +1746,49 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_ld8u_i32:
case INDEX_op_ld8u_i64:
- tcg_out_ldst(s, I3312_LDRB, a0, a1, a2);
+ tcg_out_ldst(s, I3312_LDRB, a0, a1, a2, 0);
break;
case INDEX_op_ld8s_i32:
- tcg_out_ldst(s, I3312_LDRSBW, a0, a1, a2);
+ tcg_out_ldst(s, I3312_LDRSBW, a0, a1, a2, 0);
break;
case INDEX_op_ld8s_i64:
- tcg_out_ldst(s, I3312_LDRSBX, a0, a1, a2);
+ tcg_out_ldst(s, I3312_LDRSBX, a0, a1, a2, 0);
break;
case INDEX_op_ld16u_i32:
case INDEX_op_ld16u_i64:
- tcg_out_ldst(s, I3312_LDRH, a0, a1, a2);
+ tcg_out_ldst(s, I3312_LDRH, a0, a1, a2, 1);
break;
case INDEX_op_ld16s_i32:
- tcg_out_ldst(s, I3312_LDRSHW, a0, a1, a2);
+ tcg_out_ldst(s, I3312_LDRSHW, a0, a1, a2, 1);
break;
case INDEX_op_ld16s_i64:
- tcg_out_ldst(s, I3312_LDRSHX, a0, a1, a2);
+ tcg_out_ldst(s, I3312_LDRSHX, a0, a1, a2, 1);
break;
case INDEX_op_ld_i32:
case INDEX_op_ld32u_i64:
- tcg_out_ldst(s, I3312_LDRW, a0, a1, a2);
+ tcg_out_ldst(s, I3312_LDRW, a0, a1, a2, 2);
break;
case INDEX_op_ld32s_i64:
- tcg_out_ldst(s, I3312_LDRSWX, a0, a1, a2);
+ tcg_out_ldst(s, I3312_LDRSWX, a0, a1, a2, 2);
break;
case INDEX_op_ld_i64:
- tcg_out_ldst(s, I3312_LDRX, a0, a1, a2);
+ tcg_out_ldst(s, I3312_LDRX, a0, a1, a2, 3);
break;
case INDEX_op_st8_i32:
case INDEX_op_st8_i64:
- tcg_out_ldst(s, I3312_STRB, REG0(0), a1, a2);
+ tcg_out_ldst(s, I3312_STRB, REG0(0), a1, a2, 0);
break;
case INDEX_op_st16_i32:
case INDEX_op_st16_i64:
- tcg_out_ldst(s, I3312_STRH, REG0(0), a1, a2);
+ tcg_out_ldst(s, I3312_STRH, REG0(0), a1, a2, 1);
break;
case INDEX_op_st_i32:
case INDEX_op_st32_i64:
- tcg_out_ldst(s, I3312_STRW, REG0(0), a1, a2);
+ tcg_out_ldst(s, I3312_STRW, REG0(0), a1, a2, 2);
break;
case INDEX_op_st_i64:
- tcg_out_ldst(s, I3312_STRX, REG0(0), a1, a2);
+ tcg_out_ldst(s, I3312_STRX, REG0(0), a1, a2, 3);
break;
case INDEX_op_add_i32:
@@ -1776,25 +2087,176 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
case INDEX_op_mov_i64:
+ case INDEX_op_mov_vec:
case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
case INDEX_op_movi_i64:
+ case INDEX_op_dupi_vec:
case INDEX_op_call: /* Always emitted via tcg_out_call. */
default:
- tcg_abort();
+ g_assert_not_reached();
}
#undef REG0
}
+static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
+ unsigned vecl, unsigned vece,
+ const TCGArg *args, const int *const_args)
+{ /* Emit host code for one vector opcode OPC. VECL selects the vector
+ * type (0 gives TCG_TYPE_V64, 1 gives TCG_TYPE_V128) and doubles as
+ * the Q bit; VECE is passed as the element-size field where the
+ * operation depends on element size. ARGS holds the operands and
+ * CONST_ARGS flags which of them are constants. */
+ /* Comparisons with a direct register-register encoding. Entries
+ * left zero have none; those are handled by swapping operands. */
+ static const AArch64Insn cmp_insn[16] = {
+ [TCG_COND_EQ] = I3616_CMEQ,
+ [TCG_COND_GT] = I3616_CMGT,
+ [TCG_COND_GE] = I3616_CMGE,
+ [TCG_COND_GTU] = I3616_CMHI,
+ [TCG_COND_GEU] = I3616_CMHS,
+ };
+ /* Comparisons that have a compare-against-zero encoding. */
+ static const AArch64Insn cmp0_insn[16] = {
+ [TCG_COND_EQ] = I3617_CMEQ0,
+ [TCG_COND_GT] = I3617_CMGT0,
+ [TCG_COND_GE] = I3617_CMGE0,
+ [TCG_COND_LT] = I3617_CMLT0,
+ [TCG_COND_LE] = I3617_CMLE0,
+ };
+
+ TCGType type = vecl + TCG_TYPE_V64;
+ unsigned is_q = vecl;
+ TCGArg a0, a1, a2;
+
+ a0 = args[0];
+ a1 = args[1];
+ a2 = args[2];
+
+ switch (opc) {
+ case INDEX_op_ld_vec:
+ tcg_out_ld(s, type, a0, a1, a2);
+ break;
+ case INDEX_op_st_vec:
+ tcg_out_st(s, type, a0, a1, a2);
+ break;
+ case INDEX_op_add_vec:
+ tcg_out_insn(s, 3616, ADD, is_q, vece, a0, a1, a2);
+ break;
+ case INDEX_op_sub_vec:
+ tcg_out_insn(s, 3616, SUB, is_q, vece, a0, a1, a2);
+ break;
+ case INDEX_op_mul_vec:
+ tcg_out_insn(s, 3616, MUL, is_q, vece, a0, a1, a2);
+ break;
+ case INDEX_op_neg_vec:
+ tcg_out_insn(s, 3617, NEG, is_q, vece, a0, a1);
+ break;
+ case INDEX_op_and_vec: /* the logical ops pass 0 for size, not VECE */
+ tcg_out_insn(s, 3616, AND, is_q, 0, a0, a1, a2);
+ break;
+ case INDEX_op_or_vec:
+ tcg_out_insn(s, 3616, ORR, is_q, 0, a0, a1, a2);
+ break;
+ case INDEX_op_xor_vec:
+ tcg_out_insn(s, 3616, EOR, is_q, 0, a0, a1, a2);
+ break;
+ case INDEX_op_andc_vec:
+ tcg_out_insn(s, 3616, BIC, is_q, 0, a0, a1, a2);
+ break;
+ case INDEX_op_orc_vec:
+ tcg_out_insn(s, 3616, ORN, is_q, 0, a0, a1, a2);
+ break;
+ case INDEX_op_not_vec:
+ tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a1);
+ break;
+ case INDEX_op_dup_vec:
+ tcg_out_insn(s, 3605, DUP, is_q, a0, a1, 1 << vece, 0); /* 1 << vece in the dst_idx field conveys the element size */
+ break;
+ case INDEX_op_shli_vec:
+ tcg_out_insn(s, 3614, SHL, is_q, a0, a1, a2 + (8 << vece)); /* immediate = element bits + shift count */
+ break;
+ case INDEX_op_shri_vec:
+ tcg_out_insn(s, 3614, USHR, is_q, a0, a1, (16 << vece) - a2); /* immediate = 2 * element bits - shift count */
+ break;
+ case INDEX_op_sari_vec:
+ tcg_out_insn(s, 3614, SSHR, is_q, a0, a1, (16 << vece) - a2); /* same immediate form as USHR */
+ break;
+ case INDEX_op_cmp_vec:
+ {
+ TCGCond cond = args[3];
+ AArch64Insn insn;
+
+ if (cond == TCG_COND_NE) { /* neither table has an NE entry */
+ if (const_args[2]) { /* constant operand; presumably always zero (the Z constraint) - confirm */
+ tcg_out_insn(s, 3616, CMTST, is_q, vece, a0, a1, a1);
+ } else { /* compare equal, then invert the result */
+ tcg_out_insn(s, 3616, CMEQ, is_q, vece, a0, a1, a2);
+ tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a0);
+ }
+ } else {
+ if (const_args[2]) {
+ insn = cmp0_insn[cond];
+ if (insn) { /* direct compare-against-zero form exists */
+ tcg_out_insn_3617(s, insn, is_q, vece, a0, a1);
+ break;
+ }
+ /* No zero form: load zero into the reserved vector
+ temporary and use the register-register path. */
+ tcg_out_dupi_vec(s, type, TCG_VEC_TMP, 0);
+ a2 = TCG_VEC_TMP;
+ }
+ insn = cmp_insn[cond];
+ if (insn == 0) { /* no direct form: swap operands and condition */
+ TCGArg t;
+ t = a1, a1 = a2, a2 = t;
+ cond = tcg_swap_cond(cond);
+ insn = cmp_insn[cond];
+ tcg_debug_assert(insn != 0);
+ }
+ tcg_out_insn_3616(s, insn, is_q, vece, a0, a1, a2);
+ }
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+}
+
+int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
+{ /* Report whether vector opcode OPC can be emitted: 1 for the
+ * opcodes listed below, 0 for every other opcode. TYPE and VECE
+ * are not consulted. */
+ switch (opc) {
+ case INDEX_op_add_vec:
+ case INDEX_op_sub_vec:
+ case INDEX_op_mul_vec:
+ case INDEX_op_and_vec:
+ case INDEX_op_or_vec:
+ case INDEX_op_xor_vec:
+ case INDEX_op_andc_vec:
+ case INDEX_op_orc_vec:
+ case INDEX_op_neg_vec:
+ case INDEX_op_not_vec:
+ case INDEX_op_cmp_vec:
+ case INDEX_op_shli_vec:
+ case INDEX_op_shri_vec:
+ case INDEX_op_sari_vec:
+ return 1;
+
+ default:
+ return 0;
+ }
+}
+
+void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
+ TCGArg a0, ...)
+{ /* Intentionally empty: this backend performs no expansion here.
+ * NOTE(review): presumably never reached, since
+ * tcg_can_emit_vec_op only returns 0 or 1 - confirm against the
+ * generic caller. */
+}
+
static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
{
static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
+ static const TCGTargetOpDef w_w = { .args_ct_str = { "w", "w" } };
+ static const TCGTargetOpDef w_r = { .args_ct_str = { "w", "r" } };
+ static const TCGTargetOpDef w_wr = { .args_ct_str = { "w", "wr" } };
static const TCGTargetOpDef r_l = { .args_ct_str = { "r", "l" } };
static const TCGTargetOpDef r_rA = { .args_ct_str = { "r", "rA" } };
static const TCGTargetOpDef rZ_r = { .args_ct_str = { "rZ", "r" } };
static const TCGTargetOpDef lZ_l = { .args_ct_str = { "lZ", "l" } };
static const TCGTargetOpDef r_r_r = { .args_ct_str = { "r", "r", "r" } };
+ static const TCGTargetOpDef w_w_w = { .args_ct_str = { "w", "w", "w" } };
+ static const TCGTargetOpDef w_w_wZ = { .args_ct_str = { "w", "w", "wZ" } };
static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
static const TCGTargetOpDef r_r_rA = { .args_ct_str = { "r", "r", "rA" } };
static const TCGTargetOpDef r_r_rL = { .args_ct_str = { "r", "r", "rL" } };
@@ -1938,6 +2400,29 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
case INDEX_op_sub2_i64:
return &add2;
+ case INDEX_op_add_vec:
+ case INDEX_op_sub_vec:
+ case INDEX_op_mul_vec:
+ case INDEX_op_and_vec:
+ case INDEX_op_or_vec:
+ case INDEX_op_xor_vec:
+ case INDEX_op_andc_vec:
+ case INDEX_op_orc_vec:
+ return &w_w_w;
+ case INDEX_op_not_vec:
+ case INDEX_op_neg_vec:
+ case INDEX_op_shli_vec:
+ case INDEX_op_shri_vec:
+ case INDEX_op_sari_vec:
+ return &w_w;
+ case INDEX_op_ld_vec:
+ case INDEX_op_st_vec:
+ return &w_r;
+ case INDEX_op_dup_vec:
+ return &w_wr;
+ case INDEX_op_cmp_vec:
+ return &w_w_wZ;
+
default:
return NULL;
}
@@ -1947,8 +2432,10 @@ static void tcg_target_init(TCGContext *s)
{
tcg_target_available_regs[TCG_TYPE_I32] = 0xffffffffu;
tcg_target_available_regs[TCG_TYPE_I64] = 0xffffffffu;
+ tcg_target_available_regs[TCG_TYPE_V64] = 0xffffffff00000000ull;
+ tcg_target_available_regs[TCG_TYPE_V128] = 0xffffffff00000000ull;
- tcg_target_call_clobber_regs = 0xfffffffu;
+ tcg_target_call_clobber_regs = -1ull;
tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X19);
tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X20);
tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X21);
@@ -1960,12 +2447,21 @@ static void tcg_target_init(TCGContext *s)
tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X27);
tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X28);
tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X29);
+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V8);
+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V9);
+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V10);
+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V11);
+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V12);
+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V13);
+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V14);
+ tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V15);
s->reserved_regs = 0;
tcg_regset_set_reg(s->reserved_regs, TCG_REG_SP);
tcg_regset_set_reg(s->reserved_regs, TCG_REG_FP);
tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP);
tcg_regset_set_reg(s->reserved_regs, TCG_REG_X18); /* platform register */
+ tcg_regset_set_reg(s->reserved_regs, TCG_VEC_TMP);
}
/* Saving pairs: (X19, X20) .. (X27, X28), (X29(fp), X30(lr)). */
diff --git a/tcg/aarch64/tcg-target.opc.h b/tcg/aarch64/tcg-target.opc.h
new file mode 100644
index 0000000000..4816a6c3d4
--- /dev/null
+++ b/tcg/aarch64/tcg-target.opc.h
@@ -0,0 +1,3 @@
+/* Target-specific opcodes for host vector expansion. These will be
+ emitted by tcg_expand_vec_op. For those familiar with GCC internals,
+ consider these to be UNSPEC with names. */
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index b89dababf4..9fdf37f23c 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -30,10 +30,10 @@
#ifdef __x86_64__
# define TCG_TARGET_REG_BITS 64
-# define TCG_TARGET_NB_REGS 16
+# define TCG_TARGET_NB_REGS 32
#else
# define TCG_TARGET_REG_BITS 32
-# define TCG_TARGET_NB_REGS 8
+# define TCG_TARGET_NB_REGS 24
#endif
typedef enum {
@@ -56,6 +56,26 @@ typedef enum {
TCG_REG_R13,
TCG_REG_R14,
TCG_REG_R15,
+
+ TCG_REG_XMM0,
+ TCG_REG_XMM1,
+ TCG_REG_XMM2,
+ TCG_REG_XMM3,
+ TCG_REG_XMM4,
+ TCG_REG_XMM5,
+ TCG_REG_XMM6,
+ TCG_REG_XMM7,
+
+ /* 64-bit registers; likewise always define. */
+ TCG_REG_XMM8,
+ TCG_REG_XMM9,
+ TCG_REG_XMM10,
+ TCG_REG_XMM11,
+ TCG_REG_XMM12,
+ TCG_REG_XMM13,
+ TCG_REG_XMM14,
+ TCG_REG_XMM15,
+
TCG_REG_RAX = TCG_REG_EAX,
TCG_REG_RCX = TCG_REG_ECX,
TCG_REG_RDX = TCG_REG_EDX,
@@ -77,6 +97,8 @@ typedef enum {
extern bool have_bmi1;
extern bool have_popcnt;
+extern bool have_avx1;
+extern bool have_avx2;
/* optional instructions */
#define TCG_TARGET_HAS_div2_i32 1
@@ -146,6 +168,21 @@ extern bool have_popcnt;
#define TCG_TARGET_HAS_mulsh_i64 0
#endif
+/* We do not support older SSE systems, only beginning with AVX1. */
+#define TCG_TARGET_HAS_v64 have_avx1
+#define TCG_TARGET_HAS_v128 have_avx1
+#define TCG_TARGET_HAS_v256 have_avx2
+
+#define TCG_TARGET_HAS_andc_vec 1
+#define TCG_TARGET_HAS_orc_vec 0
+#define TCG_TARGET_HAS_not_vec 0
+#define TCG_TARGET_HAS_neg_vec 0
+#define TCG_TARGET_HAS_shi_vec 1
+#define TCG_TARGET_HAS_shs_vec 0
+#define TCG_TARGET_HAS_shv_vec 0
+#define TCG_TARGET_HAS_cmp_vec 1
+#define TCG_TARGET_HAS_mul_vec 1
+
#define TCG_TARGET_deposit_i32_valid(ofs, len) \
(((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \
((ofs) == 0 && (len) == 16))
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 63d27f10e7..fc05909d1d 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -28,10 +28,15 @@
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
"%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
- "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
#else
"%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+#if TCG_TARGET_REG_BITS == 64
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+#endif
};
#endif
@@ -61,6 +66,28 @@ static const int tcg_target_reg_alloc_order[] = {
TCG_REG_EDX,
TCG_REG_EAX,
#endif
+ TCG_REG_XMM0,
+ TCG_REG_XMM1,
+ TCG_REG_XMM2,
+ TCG_REG_XMM3,
+ TCG_REG_XMM4,
+ TCG_REG_XMM5,
+#ifndef _WIN64
+ /* The Win64 ABI has xmm6-xmm15 as callee-saved, and we do not save
+ any of them. Therefore only allow xmm0-xmm5 to be allocated. */
+ TCG_REG_XMM6,
+ TCG_REG_XMM7,
+#if TCG_TARGET_REG_BITS == 64
+ TCG_REG_XMM8,
+ TCG_REG_XMM9,
+ TCG_REG_XMM10,
+ TCG_REG_XMM11,
+ TCG_REG_XMM12,
+ TCG_REG_XMM13,
+ TCG_REG_XMM14,
+ TCG_REG_XMM15,
+#endif
+#endif
};
static const int tcg_target_call_iarg_regs[] = {
@@ -94,7 +121,7 @@ static const int tcg_target_call_oarg_regs[] = {
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800
-/* Registers used with L constraint, which are the first argument
+/* Registers used with L constraint, which are the first argument
registers on x86_64, and two random call clobbered registers on
i386. */
#if TCG_TARGET_REG_BITS == 64
@@ -125,6 +152,8 @@ static bool have_cmov;
it there. Therefore we always define the variable. */
bool have_bmi1;
bool have_popcnt;
+bool have_avx1;
+bool have_avx2;
#ifdef CONFIG_CPUID_H
static bool have_movbe;
@@ -148,6 +177,8 @@ static void patch_reloc(tcg_insn_unit *code_ptr, int type,
if (value != (int32_t)value) {
tcg_abort();
}
+ /* FALLTHRU */
+ case R_386_32:
tcg_patch32(code_ptr, value);
break;
case R_386_PC8:
@@ -162,6 +193,14 @@ static void patch_reloc(tcg_insn_unit *code_ptr, int type,
}
}
+#if TCG_TARGET_REG_BITS == 64
+#define ALL_GENERAL_REGS 0x0000ffffu
+#define ALL_VECTOR_REGS 0xffff0000u
+#else
+#define ALL_GENERAL_REGS 0x000000ffu
+#define ALL_VECTOR_REGS 0x00ff0000u
+#endif
+
/* parse target specific constraints */
static const char *target_parse_constraint(TCGArgConstraint *ct,
const char *ct_str, TCGType type)
@@ -192,21 +231,29 @@ static const char *target_parse_constraint(TCGArgConstraint *ct,
tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
break;
case 'q':
+ /* A register that can be used as a byte operand. */
ct->ct |= TCG_CT_REG;
ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
break;
case 'Q':
+ /* A register with an addressable second byte (e.g. %ah). */
ct->ct |= TCG_CT_REG;
ct->u.regs = 0xf;
break;
case 'r':
+ /* A general register. */
ct->ct |= TCG_CT_REG;
- ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
+ ct->u.regs |= ALL_GENERAL_REGS;
break;
case 'W':
/* With TZCNT/LZCNT, we can have operand-size as an input. */
ct->ct |= TCG_CT_CONST_WSZ;
break;
+ case 'x':
+ /* A vector register. */
+ ct->ct |= TCG_CT_REG;
+ ct->u.regs |= ALL_VECTOR_REGS;
+ break;
/* qemu_ld/st address constraint */
case 'L':
@@ -277,14 +324,17 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
# define P_REXB_RM 0
# define P_GS 0
#endif
-#define P_SIMDF3 0x10000 /* 0xf3 opcode prefix */
-#define P_SIMDF2 0x20000 /* 0xf2 opcode prefix */
+#define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */
+#define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */
+#define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */
+#define P_VEXL 0x80000 /* Set VEX.L = 1 */
#define OPC_ARITH_EvIz (0x81)
#define OPC_ARITH_EvIb (0x83)
#define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN (0xf2 | P_EXT38)
#define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3))
+#define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF (0xbc | P_EXT)
#define OPC_BSR (0xbd | P_EXT)
#define OPC_BSWAP (0xc8 | P_EXT)
@@ -310,11 +360,68 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
#define OPC_MOVL_Iv (0xb8)
#define OPC_MOVBE_GyMy (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy (0xf1 | P_EXT38)
+#define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16)
+#define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16)
+#define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2)
+#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
+#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
+#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
+#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
+#define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3)
+#define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL (0xbe | P_EXT)
#define OPC_MOVSWL (0xbf | P_EXT)
#define OPC_MOVSLQ (0x63 | P_REXW)
#define OPC_MOVZBL (0xb6 | P_EXT)
#define OPC_MOVZWL (0xb7 | P_EXT)
+#define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16)
+#define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16)
+#define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16)
+#define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16)
+#define OPC_PADDB (0xfc | P_EXT | P_DATA16)
+#define OPC_PADDW (0xfd | P_EXT | P_DATA16)
+#define OPC_PADDD (0xfe | P_EXT | P_DATA16)
+#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16)
+#define OPC_PAND (0xdb | P_EXT | P_DATA16)
+#define OPC_PANDN (0xdf | P_EXT | P_DATA16)
+#define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16)
+#define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16)
+#define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16)
+#define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16)
+#define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16)
+#define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16)
+#define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16)
+#define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16)
+#define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16)
+#define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16)
+#define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16)
+#define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16)
+#define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16)
+#define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16)
+#define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16)
+#define OPC_PMULLW (0xd5 | P_EXT | P_DATA16)
+#define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16)
+#define OPC_POR (0xeb | P_EXT | P_DATA16)
+#define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16)
+#define OPC_PSHUFD (0x70 | P_EXT | P_DATA16)
+#define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2)
+#define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3)
+#define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
+#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
+#define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
+#define OPC_PSUBB (0xf8 | P_EXT | P_DATA16)
+#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16)
+#define OPC_PSUBD (0xfa | P_EXT | P_DATA16)
+#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16)
+#define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16)
+#define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16)
+#define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16)
+#define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16)
+#define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16)
+#define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16)
+#define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16)
+#define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16)
+#define OPC_PXOR (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32 (0x58)
#define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32 (0x50)
@@ -326,14 +433,26 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
#define OPC_SHIFT_Ib (0xc1)
#define OPC_SHIFT_cl (0xd3)
#define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3)
+#define OPC_SHUFPS (0xc6 | P_EXT)
#define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_TESTL (0x85)
#define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3)
+#define OPC_UD2 (0x0b | P_EXT)
+#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16)
+#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16)
+#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
+#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
+#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
+#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
+#define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_REXW)
+#define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
+#define OPC_VZEROUPPER (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)
#define OPC_GRP3_Ev (0xf7)
#define OPC_GRP5 (0xff)
+#define OPC_GRP14 (0x73 | P_EXT | P_DATA16)
/* Group 1 opcode extensions for 0x80-0x83.
These are also used as modifiers for OPC_ARITH. */
@@ -439,10 +558,12 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
tcg_out8(s, (uint8_t)(rex | 0x40));
}
- if (opc & (P_EXT | P_EXT38)) {
+ if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
tcg_out8(s, 0x0f);
if (opc & P_EXT38) {
tcg_out8(s, 0x38);
+ } else if (opc & P_EXT3A) {
+ tcg_out8(s, 0x3a);
}
}
@@ -459,10 +580,12 @@ static void tcg_out_opc(TCGContext *s, int opc)
} else if (opc & P_SIMDF2) {
tcg_out8(s, 0xf2);
}
- if (opc & (P_EXT | P_EXT38)) {
+ if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
tcg_out8(s, 0x0f);
if (opc & P_EXT38) {
tcg_out8(s, 0x38);
+ } else if (opc & P_EXT3A) {
+ tcg_out8(s, 0x3a);
}
}
tcg_out8(s, opc);
@@ -479,34 +602,42 @@ static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
-static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
+static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
+ int rm, int index)
{
int tmp;
- if ((opc & (P_REXW | P_EXT | P_EXT38)) || (rm & 8)) {
+ /* Use the two byte form if possible, which cannot encode
+ VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */
+ if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
+ && ((rm | index) & 8) == 0) {
+ /* Two byte VEX prefix. */
+ tcg_out8(s, 0xc5);
+
+ tmp = (r & 8 ? 0 : 0x80); /* VEX.R */
+ } else {
/* Three byte VEX prefix. */
tcg_out8(s, 0xc4);
/* VEX.m-mmmm */
- if (opc & P_EXT38) {
+ if (opc & P_EXT3A) {
+ tmp = 3;
+ } else if (opc & P_EXT38) {
tmp = 2;
} else if (opc & P_EXT) {
tmp = 1;
} else {
- tcg_abort();
+ g_assert_not_reached();
}
- tmp |= 0x40; /* VEX.X */
- tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */
- tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */
+ tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */
+ tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */
+ tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */
tcg_out8(s, tmp);
- tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */
- } else {
- /* Two byte VEX prefix. */
- tcg_out8(s, 0xc5);
-
- tmp = (r & 8 ? 0 : 0x80); /* VEX.R */
+ tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */
}
+
+ tmp |= (opc & P_VEXL ? 0x04 : 0); /* VEX.L */
/* VEX.pp */
if (opc & P_DATA16) {
tmp |= 1; /* 0x66 */
@@ -518,6 +649,11 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
tmp |= (~v & 15) << 3; /* VEX.vvvv */
tcg_out8(s, tmp);
tcg_out8(s, opc);
+}
+
+/* Emit a VEX-encoded register-to-register instruction: the VEX prefix and
+ opcode byte come from tcg_out_vex_opc (called with index 0), and are
+ followed by a ModRM byte with mod=3 (0xc0) holding R in the reg field
+ and RM in the r/m field. V is forwarded as the VEX.vvvv operand. */
+static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
+{
+ tcg_out_vex_opc(s, opc, r, v, rm, 0);
tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}
@@ -526,8 +662,8 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
mode for absolute addresses, ~RM is the size of the immediate operand
that will follow the instruction. */
-static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
- int index, int shift, intptr_t offset)
+static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
+ int shift, intptr_t offset)
{
int mod, len;
@@ -538,7 +674,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
intptr_t disp = offset - pc;
if (disp == (int32_t)disp) {
- tcg_out_opc(s, opc, r, 0, 0);
tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
tcg_out32(s, disp);
return;
@@ -548,7 +683,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
use of the MODRM+SIB encoding and is therefore larger than
rip-relative addressing. */
if (offset == (int32_t)offset) {
- tcg_out_opc(s, opc, r, 0, 0);
tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
tcg_out8(s, (4 << 3) | 5);
tcg_out32(s, offset);
@@ -556,10 +690,9 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
}
/* ??? The memory isn't directly addressable. */
- tcg_abort();
+ g_assert_not_reached();
} else {
/* Absolute address. */
- tcg_out_opc(s, opc, r, 0, 0);
tcg_out8(s, (r << 3) | 5);
tcg_out32(s, offset);
return;
@@ -582,7 +715,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
that would be used for %esp is the escape to the two byte form. */
if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
/* Single byte MODRM format. */
- tcg_out_opc(s, opc, r, rm, 0);
tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
} else {
/* Two byte MODRM+SIB format. */
@@ -596,7 +728,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
tcg_debug_assert(index != TCG_REG_ESP);
}
- tcg_out_opc(s, opc, r, rm, index);
tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
}
@@ -608,6 +739,21 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
}
}
+/* Legacy-encoded memory operand: emit the prefixes and opcode with
+ tcg_out_opc, then the ModRM/SIB/displacement bytes with
+ tcg_out_sib_offset. Negative RM or INDEX values are replaced by 0 for
+ the tcg_out_opc call only; tcg_out_sib_offset still receives the
+ original values. */
+static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
+ int index, int shift, intptr_t offset)
+{
+ tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
+ tcg_out_sib_offset(s, r, rm, index, shift, offset);
+}
+
+/* VEX-encoded memory operand: emit the VEX prefix and opcode with
+ tcg_out_vex_opc (V is the VEX.vvvv operand), then the
+ ModRM/SIB/displacement bytes with tcg_out_sib_offset. Negative RM or
+ INDEX values are replaced by 0 for the prefix computation only, so
+ they cannot set the bit-3 tests used for VEX.B and VEX.X. */
+static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
+ int rm, int index, int shift,
+ intptr_t offset)
+{
+ tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
+ tcg_out_sib_offset(s, r, rm, index, shift, offset);
+}
+
/* A simplification of the above with no index or shift. */
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
int rm, intptr_t offset)
@@ -615,6 +761,30 @@ static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
}
+/* VEX-encoded base-plus-displacement operand: a wrapper around
+ tcg_out_vex_modrm_sib_offset with no index register (-1) and a shift
+ of 0. */
+static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
+ int v, int rm, intptr_t offset)
+{
+ tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
+}
+
+/* Output a legacy-encoded opcode with an expected reference to the
+ constant pool. The ModRM byte uses mod=0, r/m=5 (disp32 addressing)
+ and the 32-bit displacement is emitted as 0.
+ NOTE(review): presumably the caller registers a pool relocation over
+ those last four bytes -- confirm against callers. */
+static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
+{
+ tcg_out_opc(s, opc, r, 0, 0);
+ /* Absolute for 32-bit, pc-relative for 64-bit. */
+ tcg_out8(s, LOWREGMASK(r) << 3 | 5);
+ tcg_out32(s, 0);
+}
+
+/* Output a VEX-encoded opcode with an expected reference to the constant
+ pool. VEX.vvvv, rm and index are all passed as 0. The ModRM byte uses
+ mod=0, r/m=5 (disp32 addressing) and the 32-bit displacement is emitted
+ as 0; tcg_out_dupi_vec follows each call with new_pool_label() at
+ code_ptr - 4, i.e. over that displacement. */
+static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
+{
+ tcg_out_vex_opc(s, opc, r, 0, 0, 0);
+ /* Absolute for 32-bit, pc-relative for 64-bit. */
+ tcg_out8(s, LOWREGMASK(r) << 3 | 5);
+ tcg_out32(s, 0);
+}
+
/* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */
static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
{
@@ -625,12 +795,116 @@ static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
}
-static inline void tcg_out_mov(TCGContext *s, TCGType type,
- TCGReg ret, TCGReg arg)
+/* Emit a register-to-register move of TYPE from ARG to RET; nothing is
+ emitted when the two registers are the same. Register numbers below
+ 16 are treated as general registers and 16 and above as vector
+ registers. Integer types may move between either class; vector types
+ assert that both operands are vector registers. */
+static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
- if (arg != ret) {
- int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
- tcg_out_modrm(s, opc, ret, arg);
+ int rexw = 0;
+
+ if (arg == ret) {
+ return;
+ }
+ switch (type) {
+ case TCG_TYPE_I64:
+ rexw = P_REXW;
+ /* fallthru */
+ case TCG_TYPE_I32:
+ if (ret < 16) {
+ if (arg < 16) {
+ tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
+ } else {
+ /* Vector to general: the vector source goes in the ModRM
+ reg field and the general destination in r/m. */
+ tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
+ }
+ } else {
+ if (arg < 16) {
+ /* General to vector. */
+ tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
+ } else {
+ /* Vector to vector: MOVQ is used for both I32 and I64. */
+ tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
+ }
+ }
+ break;
+
+ case TCG_TYPE_V64:
+ tcg_debug_assert(ret >= 16 && arg >= 16);
+ tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
+ break;
+ case TCG_TYPE_V128:
+ tcg_debug_assert(ret >= 16 && arg >= 16);
+ tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
+ break;
+ case TCG_TYPE_V256:
+ tcg_debug_assert(ret >= 16 && arg >= 16);
+ /* Same opcode as V128 with VEX.L set. */
+ tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
+ break;
+
+ default:
+ g_assert_not_reached();
+ }
+}
+
+/* Replicate the low element of vector register A, of size VECE, into
+ every element of vector register R.
+
+ With AVX2 a single VPBROADCAST{B,W,D,Q} is emitted (with VEX.L set for
+ TCG_TYPE_V256). Otherwise the element is widened in steps: interleaving
+ a register with itself doubles the low element, and PSHUFD with an
+ immediate of 0 finally replicates the low 32 bits. */
+static void tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
+ TCGReg r, TCGReg a)
+{
+ if (have_avx2) {
+ static const int dup_insn[4] = {
+ OPC_VPBROADCASTB, OPC_VPBROADCASTW,
+ OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
+ };
+ int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
+ tcg_out_vex_modrm(s, dup_insn[vece] + vex_l, r, 0, a);
+ } else {
+ switch (vece) {
+ case MO_8:
+ /* ??? With zero in a register, use PSHUFB. */
+ /* The unpack insns take two sources; VEX.vvvv names the first.
+ Passing 0 there would interleave %xmm0 with A, so A must be
+ given for both operands. */
+ tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
+ a = r;
+ /* FALLTHRU */
+ case MO_16:
+ tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
+ a = r;
+ /* FALLTHRU */
+ case MO_32:
+ /* PSHUFD has a single register source, so VEX.vvvv stays 0. */
+ tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
+ /* imm8 operand: all output lanes selected from input lane 0. */
+ tcg_out8(s, 0);
+ break;
+ case MO_64:
+ tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ }
+}
+
+/* Load the constant ARG, replicated, into vector register RET.
+ Zero and all-ones are generated in-register; any other value is
+ loaded through a constant pool reference registered with
+ new_pool_label(). */
+static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
+ TCGReg ret, tcg_target_long arg)
+{
+ int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
+
+ if (arg == 0) {
+ /* x ^ x == 0.
+ NOTE(review): P_VEXL is not applied here even for V256;
+ presumably the 128-bit VEX form clears the upper half --
+ confirm. */
+ tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
+ return;
+ }
+ if (arg == -1) {
+ /* Comparing a register with itself for equality sets all bits. */
+ tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
+ return;
+ }
+
+ if (TCG_TARGET_REG_BITS == 64) {
+ /* 64-bit host: V64 uses a plain MOVQ load, wider types use a
+ broadcasting load (VPBROADCASTQ with AVX2, else MOVDDUP). */
+ if (type == TCG_TYPE_V64) {
+ tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
+ } else if (have_avx2) {
+ tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
+ } else {
+ tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
+ }
+ /* The relocation covers the 4-byte displacement just emitted. */
+ new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
+ } else if (have_avx2) {
+ /* 32-bit host with AVX2: broadcast a 32-bit pool entry. */
+ tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
+ new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
+ } else {
+ /* 32-bit host without AVX2: load 32 bits, then replicate them
+ with tcg_out_dup_vec. */
+ tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy, ret);
+ new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
+ tcg_out_dup_vec(s, type, MO_32, ret, ret);
}
}
@@ -639,6 +913,25 @@ static void tcg_out_movi(TCGContext *s, TCGType type,
{
tcg_target_long diff;
+ switch (type) {
+ case TCG_TYPE_I32:
+#if TCG_TARGET_REG_BITS == 64
+ case TCG_TYPE_I64:
+#endif
+ if (ret < 16) {
+ break;
+ }
+ /* fallthru */
+ case TCG_TYPE_V64:
+ case TCG_TYPE_V128:
+ case TCG_TYPE_V256:
+ tcg_debug_assert(ret >= 16);
+ tcg_out_dupi_vec(s, type, ret, arg);
+ return;
+ default:
+ g_assert_not_reached();
+ }
+
if (arg == 0) {
tgen_arithr(s, ARITH_XOR, ret, ret);
return;
@@ -702,18 +995,74 @@ static inline void tcg_out_pop(TCGContext *s, int reg)
tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}
-static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
- TCGReg arg1, intptr_t arg2)
+/* Emit a load of TYPE into RET from the address ARG1 + ARG2.
+ Integer types go to a general register (RET < 16) with a legacy MOV,
+ or to a vector register with a VEX-encoded MOVD/MOVQ. Vector types
+ assert that RET is a vector register; V128 and V256 use MOVDQU. */
+static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
+ TCGReg arg1, intptr_t arg2)
{
- int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
- tcg_out_modrm_offset(s, opc, ret, arg1, arg2);
+ switch (type) {
+ case TCG_TYPE_I32:
+ if (ret < 16) {
+ tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
+ } else {
+ tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
+ }
+ break;
+ case TCG_TYPE_I64:
+ if (ret < 16) {
+ tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
+ break;
+ }
+ /* An I64 destined for a vector register is loaded like a V64. */
+ /* FALLTHRU */
+ case TCG_TYPE_V64:
+ tcg_debug_assert(ret >= 16);
+ tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
+ break;
+ case TCG_TYPE_V128:
+ tcg_debug_assert(ret >= 16);
+ tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx, ret, 0, arg1, arg2);
+ break;
+ case TCG_TYPE_V256:
+ tcg_debug_assert(ret >= 16);
+ tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
+ ret, 0, arg1, arg2);
+ break;
+ default:
+ g_assert_not_reached();
+ }
}
-static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
- TCGReg arg1, intptr_t arg2)
+/* Emit a store of TYPE from ARG to the address ARG1 + ARG2.
+ Integer types come from a general register (ARG < 16) with a legacy
+ MOV, or from a vector register with a VEX-encoded MOVD/MOVQ. Vector
+ types assert that ARG is a vector register; V128 and V256 use MOVDQU. */
+static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
+ TCGReg arg1, intptr_t arg2)
{
- int opc = OPC_MOVL_EvGv + (type == TCG_TYPE_I64 ? P_REXW : 0);
- tcg_out_modrm_offset(s, opc, arg, arg1, arg2);
+ switch (type) {
+ case TCG_TYPE_I32:
+ if (arg < 16) {
+ tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
+ } else {
+ tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
+ }
+ break;
+ case TCG_TYPE_I64:
+ if (arg < 16) {
+ tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
+ break;
+ }
+ /* An I64 held in a vector register is stored like a V64. */
+ /* FALLTHRU */
+ case TCG_TYPE_V64:
+ tcg_debug_assert(arg >= 16);
+ tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
+ break;
+ case TCG_TYPE_V128:
+ tcg_debug_assert(arg >= 16);
+ tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx, arg, 0, arg1, arg2);
+ break;
+ case TCG_TYPE_V256:
+ tcg_debug_assert(arg >= 16);
+ tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
+ arg, 0, arg1, arg2);
+ break;
+ default:
+ g_assert_not_reached();
+ }
}
static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
@@ -725,6 +1074,8 @@ static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
return false;
}
rexw = P_REXW;
+ } else if (type != TCG_TYPE_I32) {
+ return false;
}
tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
tcg_out32(s, val);
@@ -2259,8 +2610,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
break;
case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
case INDEX_op_mov_i64:
+ case INDEX_op_mov_vec:
case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
case INDEX_op_movi_i64:
+ case INDEX_op_dupi_vec:
case INDEX_op_call: /* Always emitted via tcg_out_call. */
default:
tcg_abort();
@@ -2269,6 +2622,181 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
#undef OP_32_64
}
+/* Emit host code for one vector opcode OPC.
+ VECL selects the vector type as an offset from TCG_TYPE_V64, and VECE
+ (MO_8 .. MO_64) indexes the per-element-size opcode tables below.
+ ARGS holds the operands; args[0..2] are read unconditionally and
+ args[3] only by the opcodes that need a fourth operand. CONST_ARGS is
+ not used in this function. */
+static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
+ unsigned vecl, unsigned vece,
+ const TCGArg *args, const int *const_args)
+{
+ /* In the tables below OPC_UD2 marks an element size with no direct
+ instruction; the gen_simd path asserts it is never selected. */
+ static int const add_insn[4] = {
+ OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
+ };
+ static int const sub_insn[4] = {
+ OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
+ };
+ static int const mul_insn[4] = {
+ OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
+ };
+ static int const shift_imm_insn[4] = {
+ OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
+ };
+ static int const cmpeq_insn[4] = {
+ OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
+ };
+ static int const cmpgt_insn[4] = {
+ OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
+ };
+ static int const punpckl_insn[4] = {
+ OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
+ };
+ static int const punpckh_insn[4] = {
+ OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
+ };
+ static int const packss_insn[4] = {
+ OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
+ };
+ static int const packus_insn[4] = {
+ OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
+ };
+
+ TCGType type = vecl + TCG_TYPE_V64;
+ int insn, sub;
+ TCGArg a0, a1, a2;
+
+ a0 = args[0];
+ a1 = args[1];
+ a2 = args[2];
+
+ switch (opc) {
+ case INDEX_op_add_vec:
+ insn = add_insn[vece];
+ goto gen_simd;
+ case INDEX_op_sub_vec:
+ insn = sub_insn[vece];
+ goto gen_simd;
+ case INDEX_op_mul_vec:
+ insn = mul_insn[vece];
+ goto gen_simd;
+ case INDEX_op_and_vec:
+ insn = OPC_PAND;
+ goto gen_simd;
+ case INDEX_op_or_vec:
+ insn = OPC_POR;
+ goto gen_simd;
+ case INDEX_op_xor_vec:
+ insn = OPC_PXOR;
+ goto gen_simd;
+ case INDEX_op_x86_punpckl_vec:
+ insn = punpckl_insn[vece];
+ goto gen_simd;
+ case INDEX_op_x86_punpckh_vec:
+ insn = punpckh_insn[vece];
+ goto gen_simd;
+ case INDEX_op_x86_packss_vec:
+ insn = packss_insn[vece];
+ goto gen_simd;
+ case INDEX_op_x86_packus_vec:
+ insn = packus_insn[vece];
+ goto gen_simd;
+ gen_simd:
+ /* Common three-operand form: a0 = a1 op a2, with a1 in VEX.vvvv. */
+ tcg_debug_assert(insn != OPC_UD2);
+ if (type == TCG_TYPE_V256) {
+ insn |= P_VEXL;
+ }
+ tcg_out_vex_modrm(s, insn, a0, a1, a2);
+ break;
+
+ case INDEX_op_cmp_vec:
+ /* Only the EQ and GT conditions can be emitted directly. */
+ sub = args[3];
+ if (sub == TCG_COND_EQ) {
+ insn = cmpeq_insn[vece];
+ } else if (sub == TCG_COND_GT) {
+ insn = cmpgt_insn[vece];
+ } else {
+ g_assert_not_reached();
+ }
+ goto gen_simd;
+
+ case INDEX_op_andc_vec:
+ /* Operands are passed swapped (a2 in VEX.vvvv, a1 in r/m).
+ NOTE(review): presumably because PANDN complements its first
+ source -- confirm against the instruction reference. */
+ insn = OPC_PANDN;
+ if (type == TCG_TYPE_V256) {
+ insn |= P_VEXL;
+ }
+ tcg_out_vex_modrm(s, insn, a0, a2, a1);
+ break;
+
+ case INDEX_op_shli_vec:
+ sub = 6;
+ goto gen_shift;
+ case INDEX_op_shri_vec:
+ sub = 2;
+ goto gen_shift;
+ case INDEX_op_sari_vec:
+ tcg_debug_assert(vece != MO_64);
+ sub = 4;
+ gen_shift:
+ /* Immediate shifts: SUB occupies the ModRM reg field, the
+ destination a0 goes in VEX.vvvv, the source a1 in r/m, and the
+ shift count a2 follows as an imm8. */
+ tcg_debug_assert(vece != MO_8);
+ insn = shift_imm_insn[vece];
+ if (type == TCG_TYPE_V256) {
+ insn |= P_VEXL;
+ }
+ tcg_out_vex_modrm(s, insn, sub, a0, a1);
+ tcg_out8(s, a2);
+ break;
+
+ case INDEX_op_ld_vec:
+ tcg_out_ld(s, type, a0, a1, a2);
+ break;
+ case INDEX_op_st_vec:
+ tcg_out_st(s, type, a0, a1, a2);
+ break;
+ case INDEX_op_dup_vec:
+ tcg_out_dup_vec(s, type, vece, a0, a1);
+ break;
+
+ case INDEX_op_x86_shufps_vec:
+ insn = OPC_SHUFPS;
+ sub = args[3];
+ goto gen_simd_imm8;
+ case INDEX_op_x86_blend_vec:
+ if (vece == MO_16) {
+ insn = OPC_PBLENDW;
+ } else if (vece == MO_32) {
+ insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
+ } else {
+ g_assert_not_reached();
+ }
+ sub = args[3];
+ goto gen_simd_imm8;
+ case INDEX_op_x86_vperm2i128_vec:
+ insn = OPC_VPERM2I128;
+ sub = args[3];
+ goto gen_simd_imm8;
+ gen_simd_imm8:
+ /* Three-operand form followed by an imm8 taken from args[3]. */
+ if (type == TCG_TYPE_V256) {
+ insn |= P_VEXL;
+ }
+ tcg_out_vex_modrm(s, insn, a0, a1, a2);
+ tcg_out8(s, sub);
+ break;
+
+ case INDEX_op_x86_vpblendvb_vec:
+ insn = OPC_VPBLENDVB;
+ if (type == TCG_TYPE_V256) {
+ insn |= P_VEXL;
+ }
+ tcg_out_vex_modrm(s, insn, a0, a1, a2);
+ /* The fourth register operand is placed in the high four bits of
+ the trailing immediate byte. */
+ tcg_out8(s, args[3] << 4);
+ break;
+
+ case INDEX_op_x86_psrldq_vec:
+ /* Group 14 with 3 in the ModRM reg field; a2 is the imm8 count.
+ P_VEXL is never set on this path. */
+ tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
+ tcg_out8(s, a2);
+ break;
+
+ default:
+ g_assert_not_reached();
+ }
+}
+
static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
{
static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
@@ -2292,6 +2820,11 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
= { .args_ct_str = { "r", "r", "L", "L" } };
static const TCGTargetOpDef L_L_L_L
= { .args_ct_str = { "L", "L", "L", "L" } };
+ static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
+ static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
+ static const TCGTargetOpDef x_x_x_x
+ = { .args_ct_str = { "x", "x", "x", "x" } };
+ static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } };
switch (op) {
case INDEX_op_goto_ptr:
@@ -2493,12 +3026,342 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
return &s2;
}
+ case INDEX_op_ld_vec:
+ case INDEX_op_st_vec:
+ return &x_r;
+
+ case INDEX_op_add_vec:
+ case INDEX_op_sub_vec:
+ case INDEX_op_mul_vec:
+ case INDEX_op_and_vec:
+ case INDEX_op_or_vec:
+ case INDEX_op_xor_vec:
+ case INDEX_op_andc_vec:
+ case INDEX_op_cmp_vec:
+ case INDEX_op_x86_shufps_vec:
+ case INDEX_op_x86_blend_vec:
+ case INDEX_op_x86_packss_vec:
+ case INDEX_op_x86_packus_vec:
+ case INDEX_op_x86_vperm2i128_vec:
+ case INDEX_op_x86_punpckl_vec:
+ case INDEX_op_x86_punpckh_vec:
+ return &x_x_x;
+ case INDEX_op_dup_vec:
+ case INDEX_op_shli_vec:
+ case INDEX_op_shri_vec:
+ case INDEX_op_sari_vec:
+ case INDEX_op_x86_psrldq_vec:
+ return &x_x;
+ case INDEX_op_x86_vpblendvb_vec:
+ return &x_x_x_x;
+
default:
break;
}
return NULL;
}
+int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
+{ /* Report how this backend handles vector opcode OPC for vector type
+ TYPE and element size VECE. Returns 1, -1 or 0.
+ NOTE(review): judging from the comments below and from the cases
+ handled by tcg_expand_vec_op, 1 means the opcode is emitted as is,
+ -1 means it must go through tcg_expand_vec_op, and 0 means it is
+ not supported; confirm against the declaration of this hook. */
+ switch (opc) {
+ case INDEX_op_add_vec:
+ case INDEX_op_sub_vec:
+ case INDEX_op_and_vec:
+ case INDEX_op_or_vec:
+ case INDEX_op_xor_vec:
+ case INDEX_op_andc_vec:
+ return 1; /* Same answer for every TYPE and VECE. */
+ case INDEX_op_cmp_vec:
+ return -1; /* Always -1, whatever the condition or element size. */
+
+ case INDEX_op_shli_vec:
+ case INDEX_op_shri_vec:
+ /* We must expand the operation for MO_8. */
+ return vece == MO_8 ? -1 : 1;
+
+ case INDEX_op_sari_vec:
+ /* We must expand the operation for MO_8. */
+ if (vece == MO_8) {
+ return -1;
+ }
+ /* We can emulate this for MO_64, but it does not pay off
+ unless we're producing at least 4 values. */
+ if (vece == MO_64) {
+ return type >= TCG_TYPE_V256 ? -1 : 0;
+ }
+ return 1;
+
+ case INDEX_op_mul_vec:
+ if (vece == MO_8) {
+ /* We can expand the operation for MO_8. */
+ return -1;
+ }
+ if (vece == MO_64) {
+ return 0; /* No MO_64 multiply, neither direct nor expanded. */
+ }
+ return 1;
+
+ default:
+ return 0; /* Any opcode not listed above. */
+ }
+}
+
+void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
+ TCGArg a0, ...)
+{ /* Expand vector opcode OPC, for vector type TYPE and element size
+ VECE, into a sequence of other vector opcodes. The cases handled
+ here are the ones for which tcg_can_emit_vec_op returns -1.
+ A0 is the destination; the remaining operands are fetched from
+ the variable argument list as TCGArg values, in operand order. */
+ va_list va;
+ TCGArg a1, a2;
+ TCGv_vec v0, t1, t2, t3, t4;
+
+ va_start(va, a0);
+ v0 = temp_tcgv_vec(arg_temp(a0)); /* Destination as a TCGv_vec; only
+ the cmp_vec NEED_INV fixup uses it. */
+
+ switch (opc) {
+ case INDEX_op_shli_vec:
+ case INDEX_op_shri_vec:
+ tcg_debug_assert(vece == MO_8);
+ a1 = va_arg(va, TCGArg);
+ a2 = va_arg(va, TCGArg);
+ /* Unpack to W, shift, and repack. Tricky bits:
+ (1) Use punpck*bw x,x to produce DDCCBBAA,
+ i.e. duplicate in other half of the 16-bit lane.
+ (2) For right-shift, add 8 so that the high half of
+ the lane becomes zero. For left-shift, we must
+ shift up and down again.
+ (3) Step 2 leaves high half zero such that PACKUSWB
+ (pack with unsigned saturation) does not modify
+ the quantity. */
+ t1 = tcg_temp_new_vec(type);
+ t2 = tcg_temp_new_vec(type);
+ vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
+ tcgv_vec_arg(t1), a1, a1);
+ vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
+ tcgv_vec_arg(t2), a1, a1);
+ if (opc == INDEX_op_shri_vec) {
+ vec_gen_3(INDEX_op_shri_vec, type, MO_16,
+ tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
+ vec_gen_3(INDEX_op_shri_vec, type, MO_16,
+ tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
+ } else {
+ vec_gen_3(INDEX_op_shli_vec, type, MO_16,
+ tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
+ vec_gen_3(INDEX_op_shli_vec, type, MO_16,
+ tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
+ vec_gen_3(INDEX_op_shri_vec, type, MO_16,
+ tcgv_vec_arg(t1), tcgv_vec_arg(t1), 8);
+ vec_gen_3(INDEX_op_shri_vec, type, MO_16,
+ tcgv_vec_arg(t2), tcgv_vec_arg(t2), 8);
+ }
+ vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
+ a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t2);
+ break;
+
+ case INDEX_op_sari_vec:
+ a1 = va_arg(va, TCGArg);
+ a2 = va_arg(va, TCGArg);
+ if (vece == MO_8) {
+ /* Unpack to W, shift, and repack, as above. The shift is
+ arithmetic here and the repack uses the signed-saturating
+ pack opcode instead of the unsigned one. */
+ t1 = tcg_temp_new_vec(type);
+ t2 = tcg_temp_new_vec(type);
+ vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
+ tcgv_vec_arg(t1), a1, a1);
+ vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
+ tcgv_vec_arg(t2), a1, a1);
+ vec_gen_3(INDEX_op_sari_vec, type, MO_16,
+ tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8);
+ vec_gen_3(INDEX_op_sari_vec, type, MO_16,
+ tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8);
+ vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
+ a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2));
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t2);
+ break;
+ }
+ tcg_debug_assert(vece == MO_64);
+ /* MO_64: If the shift is <= 32, we can emulate the sign extend by
+ performing an arithmetic 32-bit shift and overwriting the high
+ half of the result (note that the ISA says shift of 32 is valid). */
+ if (a2 <= 32) {
+ t1 = tcg_temp_new_vec(type);
+ vec_gen_3(INDEX_op_sari_vec, type, MO_32, tcgv_vec_arg(t1), a1, a2);
+ vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
+ vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
+ a0, a0, tcgv_vec_arg(t1), 0xaa); /* Per the comment above,
+ the 0xaa mask is meant to take the high 32 bits of each
+ 64-bit element from t1; confirm against the blend
+ instruction definition. */
+ tcg_temp_free_vec(t1);
+ break;
+ }
+ /* Otherwise we will need to use a compare vs 0 to produce the
+ sign-extend, shift and merge. The compare result (0 > a1) is
+ shifted left by 64 - a2 and ORed over the logical right shift. */
+ t1 = tcg_temp_new_vec(type);
+ t2 = tcg_const_zeros_vec(type);
+ vec_gen_4(INDEX_op_cmp_vec, type, MO_64,
+ tcgv_vec_arg(t1), tcgv_vec_arg(t2), a1, TCG_COND_GT);
+ tcg_temp_free_vec(t2);
+ vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2);
+ vec_gen_3(INDEX_op_shli_vec, type, MO_64,
+ tcgv_vec_arg(t1), tcgv_vec_arg(t1), 64 - a2);
+ vec_gen_3(INDEX_op_or_vec, type, MO_64, a0, a0, tcgv_vec_arg(t1));
+ tcg_temp_free_vec(t1);
+ break;
+
+ case INDEX_op_mul_vec:
+ tcg_debug_assert(vece == MO_8);
+ a1 = va_arg(va, TCGArg);
+ a2 = va_arg(va, TCGArg);
+ switch (type) { /* In every case below the bytes are unpacked
+ against a zero vector, multiplied as 16-bit elements, shifted
+ right by 8 and packed back to bytes. Note that a1 and a2 are
+ unpacked with the zero vector on opposite sides.
+ NOTE(review): presumably this puts a1 in the low byte and a2
+ in the high byte of each 16-bit element so that the low 8 bits
+ of the byte product end up in the high byte; confirm against
+ the PUNPCK operand order. */
+ case TCG_TYPE_V64:
+ t1 = tcg_temp_new_vec(TCG_TYPE_V128);
+ t2 = tcg_temp_new_vec(TCG_TYPE_V128);
+ tcg_gen_dup16i_vec(t2, 0);
+ vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
+ tcgv_vec_arg(t1), a1, tcgv_vec_arg(t2));
+ vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
+ tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2);
+ tcg_gen_mul_vec(MO_16, t1, t1, t2);
+ tcg_gen_shri_vec(MO_16, t1, t1, 8);
+ vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
+ a0, tcgv_vec_arg(t1), tcgv_vec_arg(t1));
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t2);
+ break;
+
+ case TCG_TYPE_V128:
+ t1 = tcg_temp_new_vec(TCG_TYPE_V128);
+ t2 = tcg_temp_new_vec(TCG_TYPE_V128);
+ t3 = tcg_temp_new_vec(TCG_TYPE_V128);
+ t4 = tcg_temp_new_vec(TCG_TYPE_V128);
+ tcg_gen_dup16i_vec(t4, 0);
+ vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
+ tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
+ vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
+ tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
+ vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
+ tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
+ vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8,
+ tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
+ tcg_gen_mul_vec(MO_16, t1, t1, t2);
+ tcg_gen_mul_vec(MO_16, t3, t3, t4);
+ tcg_gen_shri_vec(MO_16, t1, t1, 8);
+ tcg_gen_shri_vec(MO_16, t3, t3, 8);
+ vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
+ a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t2);
+ tcg_temp_free_vec(t3);
+ tcg_temp_free_vec(t4);
+ break;
+
+ case TCG_TYPE_V256:
+ t1 = tcg_temp_new_vec(TCG_TYPE_V256);
+ t2 = tcg_temp_new_vec(TCG_TYPE_V256);
+ t3 = tcg_temp_new_vec(TCG_TYPE_V256);
+ t4 = tcg_temp_new_vec(TCG_TYPE_V256);
+ tcg_gen_dup16i_vec(t4, 0);
+ /* a1: A[0-7] ... D[0-7]; a2: W[0-7] ... Z[0-7]
+ t1: extends of B[0-7], D[0-7]
+ t2: extends of X[0-7], Z[0-7]
+ t3: extends of A[0-7], C[0-7]
+ t4: extends of W[0-7], Y[0-7]. */
+ vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
+ tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4));
+ vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8,
+ tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2);
+ vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
+ tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4));
+ vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8,
+ tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2);
+ /* t1: BX DZ; t3: AW CY. */
+ tcg_gen_mul_vec(MO_16, t1, t1, t2);
+ tcg_gen_mul_vec(MO_16, t3, t3, t4);
+ tcg_gen_shri_vec(MO_16, t1, t1, 8);
+ tcg_gen_shri_vec(MO_16, t3, t3, 8);
+ /* a0: AW BX CY DZ. */
+ vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V256, MO_8,
+ a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3));
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t2);
+ tcg_temp_free_vec(t3);
+ tcg_temp_free_vec(t4);
+ break;
+
+ default:
+ g_assert_not_reached();
+ }
+ break;
+
+ case INDEX_op_cmp_vec:
+ { /* Only EQ and signed GT are emitted as a compare (see the assert
+ before the final vec_gen_4). Every other condition is reduced
+ to one of those two by the fixups looked up below. */
+ enum {
+ NEED_SWAP = 1, /* exchange the two operands */
+ NEED_INV = 2, /* test the inverse condition, then invert the result */
+ NEED_BIAS = 4 /* subtract the sign-bit constant from both
+ operands, then compare as signed */
+ };
+ static const uint8_t fixups[16] = {
+ [0 ... 15] = -1, /* 0xff marks a condition with no fixup entry */
+ [TCG_COND_EQ] = 0,
+ [TCG_COND_NE] = NEED_INV,
+ [TCG_COND_GT] = 0,
+ [TCG_COND_LT] = NEED_SWAP,
+ [TCG_COND_LE] = NEED_INV,
+ [TCG_COND_GE] = NEED_SWAP | NEED_INV,
+ [TCG_COND_GTU] = NEED_BIAS,
+ [TCG_COND_LTU] = NEED_BIAS | NEED_SWAP,
+ [TCG_COND_LEU] = NEED_BIAS | NEED_INV,
+ [TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV,
+ };
+
+ TCGCond cond;
+ uint8_t fixup;
+
+ a1 = va_arg(va, TCGArg);
+ a2 = va_arg(va, TCGArg);
+ cond = va_arg(va, TCGArg);
+ fixup = fixups[cond & 15];
+ tcg_debug_assert(fixup != 0xff);
+
+ if (fixup & NEED_INV) {
+ cond = tcg_invert_cond(cond);
+ }
+ if (fixup & NEED_SWAP) {
+ TCGArg t;
+ t = a1, a1 = a2, a2 = t;
+ cond = tcg_swap_cond(cond);
+ }
+
+ t1 = t2 = NULL;
+ if (fixup & NEED_BIAS) {
+ t1 = tcg_temp_new_vec(type);
+ t2 = tcg_temp_new_vec(type);
+ tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1)); /* sign
+ bit of one element, replicated */
+ tcg_gen_sub_vec(vece, t1, temp_tcgv_vec(arg_temp(a1)), t2);
+ tcg_gen_sub_vec(vece, t2, temp_tcgv_vec(arg_temp(a2)), t2);
+ a1 = tcgv_vec_arg(t1);
+ a2 = tcgv_vec_arg(t2);
+ cond = tcg_signed_cond(cond);
+ }
+
+ tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
+ vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond);
+
+ if (fixup & NEED_BIAS) {
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t2);
+ }
+ if (fixup & NEED_INV) {
+ tcg_gen_not_vec(vece, v0, v0);
+ }
+ }
+ break;
+
+ default:
+ break; /* NOTE(review): any other opcode is silently ignored and
+ nothing is generated. */
+ }
+
+ va_end(va);
+}
+
static const int tcg_target_callee_save_regs[] = {
#if TCG_TARGET_REG_BITS == 64
TCG_REG_RBP,
@@ -2577,6 +3440,9 @@ static void tcg_target_qemu_prologue(TCGContext *s)
tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
+ if (have_avx2) {
+ tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
+ }
for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
tcg_out_pop(s, tcg_target_callee_save_regs[i]);
}
@@ -2598,9 +3464,16 @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
static void tcg_target_init(TCGContext *s)
{
#ifdef CONFIG_CPUID_H
- unsigned a, b, c, d;
+ unsigned a, b, c, d, b7 = 0;
int max = __get_cpuid_max(0, 0);
+ if (max >= 7) {
+ /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */
+ __cpuid_count(7, 0, a, b7, c, d);
+ have_bmi1 = (b7 & bit_BMI) != 0;
+ have_bmi2 = (b7 & bit_BMI2) != 0;
+ }
+
if (max >= 1) {
__cpuid(1, a, b, c, d);
#ifndef have_cmov
@@ -2609,17 +3482,22 @@ static void tcg_target_init(TCGContext *s)
available, we'll use a small forward branch. */
have_cmov = (d & bit_CMOV) != 0;
#endif
+
/* MOVBE is only available on Intel Atom and Haswell CPUs, so we
need to probe for it. */
have_movbe = (c & bit_MOVBE) != 0;
have_popcnt = (c & bit_POPCNT) != 0;
- }
- if (max >= 7) {
- /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */
- __cpuid_count(7, 0, a, b, c, d);
- have_bmi1 = (b & bit_BMI) != 0;
- have_bmi2 = (b & bit_BMI2) != 0;
+ /* There are a number of things we must check before we can be
+ sure of not hitting invalid opcode. */
+ if (c & bit_OSXSAVE) {
+ unsigned xcrl, xcrh;
+ asm ("xgetbv" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
+ if ((xcrl & 6) == 6) {
+ have_avx1 = (c & bit_AVX) != 0;
+ have_avx2 = (b7 & bit_AVX2) != 0;
+ }
+ }
}
max = __get_cpuid_max(0x8000000, 0);
@@ -2630,11 +3508,16 @@ static void tcg_target_init(TCGContext *s)
}
#endif /* CONFIG_CPUID_H */
+ tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
if (TCG_TARGET_REG_BITS == 64) {
- tcg_target_available_regs[TCG_TYPE_I32] = 0xffff;
- tcg_target_available_regs[TCG_TYPE_I64] = 0xffff;
- } else {
- tcg_target_available_regs[TCG_TYPE_I32] = 0xff;
+ tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
+ }
+ if (have_avx1) {
+ tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
+ tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
+ }
+ if (have_avx2) {
+ tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
}
tcg_target_call_clobber_regs = 0;
diff --git a/tcg/i386/tcg-target.opc.h b/tcg/i386/tcg-target.opc.h
new file mode 100644
index 0000000000..e5fa88ba25
--- /dev/null
+++ b/tcg/i386/tcg-target.opc.h
@@ -0,0 +1,13 @@
+/* Target-specific opcodes for host vector expansion. These will be
+ emitted by tcg_expand_vec_op. For those familiar with GCC internals,
+ consider these to be UNSPEC with names.
+ NOTE(review): the three numbers in each DEF look like the counts of
+ output, input and constant arguments (this matches how the i386
+ backend passes registers and immediates to these opcodes); confirm
+ against the definition of DEF where this file is included. */
+
+DEF(x86_shufps_vec, 1, 2, 1, IMPLVEC)
+DEF(x86_vpblendvb_vec, 1, 3, 0, IMPLVEC)
+DEF(x86_blend_vec, 1, 2, 1, IMPLVEC)
+DEF(x86_packss_vec, 1, 2, 0, IMPLVEC)
+DEF(x86_packus_vec, 1, 2, 0, IMPLVEC)
+DEF(x86_psrldq_vec, 1, 1, 1, IMPLVEC)
+DEF(x86_vperm2i128_vec, 1, 2, 1, IMPLVEC)
+DEF(x86_punpckl_vec, 1, 2, 0, IMPLVEC)
+DEF(x86_punpckh_vec, 1, 2, 0, IMPLVEC)
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 2cbbeefd53..d4ea67e541 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -32,6 +32,11 @@
glue(glue(case INDEX_op_, x), _i32): \
glue(glue(case INDEX_op_, x), _i64)
+#define CASE_OP_32_64_VEC(x) \
+ glue(glue(case INDEX_op_, x), _i32): \
+ glue(glue(case INDEX_op_, x), _i64): \
+ glue(glue(case INDEX_op_, x), _vec)
+
struct tcg_temp_info {
bool is_const;
TCGTemp *prev_copy;
@@ -108,40 +113,6 @@ static void init_arg_info(struct tcg_temp_info *infos,
init_ts_info(infos, temps_used, arg_temp(arg));
}
-static int op_bits(TCGOpcode op)
-{
- const TCGOpDef *def = &tcg_op_defs[op];
- return def->flags & TCG_OPF_64BIT ? 64 : 32;
-}
-
-static TCGOpcode op_to_mov(TCGOpcode op)
-{
- switch (op_bits(op)) {
- case 32:
- return INDEX_op_mov_i32;
- case 64:
- return INDEX_op_mov_i64;
- default:
- fprintf(stderr, "op_to_mov: unexpected return value of "
- "function op_bits.\n");
- tcg_abort();
- }
-}
-
-static TCGOpcode op_to_movi(TCGOpcode op)
-{
- switch (op_bits(op)) {
- case 32:
- return INDEX_op_movi_i32;
- case 64:
- return INDEX_op_movi_i64;
- default:
- fprintf(stderr, "op_to_movi: unexpected return value of "
- "function op_bits.\n");
- tcg_abort();
- }
-}
-
static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts)
{
TCGTemp *i;
@@ -199,11 +170,23 @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2)
static void tcg_opt_gen_movi(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg val)
{
- TCGOpcode new_op = op_to_movi(op->opc);
+ const TCGOpDef *def;
+ TCGOpcode new_op;
tcg_target_ulong mask;
struct tcg_temp_info *di = arg_info(dst);
+ def = &tcg_op_defs[op->opc];
+ if (def->flags & TCG_OPF_VECTOR) {
+ new_op = INDEX_op_dupi_vec;
+ } else if (def->flags & TCG_OPF_64BIT) {
+ new_op = INDEX_op_movi_i64;
+ } else {
+ new_op = INDEX_op_movi_i32;
+ }
op->opc = new_op;
+ /* TCGOP_VECL and TCGOP_VECE remain unchanged. */
+ op->args[0] = dst;
+ op->args[1] = val;
reset_temp(dst);
di->is_const = true;
@@ -214,15 +197,13 @@ static void tcg_opt_gen_movi(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg val)
mask |= ~0xffffffffull;
}
di->mask = mask;
-
- op->args[0] = dst;
- op->args[1] = val;
}
static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
{
TCGTemp *dst_ts = arg_temp(dst);
TCGTemp *src_ts = arg_temp(src);
+ const TCGOpDef *def;
struct tcg_temp_info *di;
struct tcg_temp_info *si;
tcg_target_ulong mask;
@@ -236,9 +217,16 @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src)
reset_ts(dst_ts);
di = ts_info(dst_ts);
si = ts_info(src_ts);
- new_op = op_to_mov(op->opc);
-
+ def = &tcg_op_defs[op->opc];
+ if (def->flags & TCG_OPF_VECTOR) {
+ new_op = INDEX_op_mov_vec;
+ } else if (def->flags & TCG_OPF_64BIT) {
+ new_op = INDEX_op_mov_i64;
+ } else {
+ new_op = INDEX_op_mov_i32;
+ }
op->opc = new_op;
+ /* TCGOP_VECL and TCGOP_VECE remain unchanged. */
op->args[0] = dst;
op->args[1] = src;
@@ -417,8 +405,9 @@ static TCGArg do_constant_folding_2(TCGOpcode op, TCGArg x, TCGArg y)
static TCGArg do_constant_folding(TCGOpcode op, TCGArg x, TCGArg y)
{
+ const TCGOpDef *def = &tcg_op_defs[op];
TCGArg res = do_constant_folding_2(op, x, y);
- if (op_bits(op) == 32) {
+ if (!(def->flags & TCG_OPF_64BIT)) {
res = (int32_t)res;
}
return res;
@@ -508,13 +497,12 @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x,
tcg_target_ulong xv = arg_info(x)->val;
tcg_target_ulong yv = arg_info(y)->val;
if (arg_is_const(x) && arg_is_const(y)) {
- switch (op_bits(op)) {
- case 32:
- return do_constant_folding_cond_32(xv, yv, c);
- case 64:
+ const TCGOpDef *def = &tcg_op_defs[op];
+ tcg_debug_assert(!(def->flags & TCG_OPF_VECTOR));
+ if (def->flags & TCG_OPF_64BIT) {
return do_constant_folding_cond_64(xv, yv, c);
- default:
- tcg_abort();
+ } else {
+ return do_constant_folding_cond_32(xv, yv, c);
}
} else if (args_are_copies(x, y)) {
return do_constant_folding_cond_eq(c);
@@ -653,11 +641,11 @@ void tcg_optimize(TCGContext *s)
/* For commutative operations make constant second argument */
switch (opc) {
- CASE_OP_32_64(add):
- CASE_OP_32_64(mul):
- CASE_OP_32_64(and):
- CASE_OP_32_64(or):
- CASE_OP_32_64(xor):
+ CASE_OP_32_64_VEC(add):
+ CASE_OP_32_64_VEC(mul):
+ CASE_OP_32_64_VEC(and):
+ CASE_OP_32_64_VEC(or):
+ CASE_OP_32_64_VEC(xor):
CASE_OP_32_64(eqv):
CASE_OP_32_64(nand):
CASE_OP_32_64(nor):
@@ -722,7 +710,7 @@ void tcg_optimize(TCGContext *s)
continue;
}
break;
- CASE_OP_32_64(sub):
+ CASE_OP_32_64_VEC(sub):
{
TCGOpcode neg_op;
bool have_neg;
@@ -734,9 +722,12 @@ void tcg_optimize(TCGContext *s)
if (opc == INDEX_op_sub_i32) {
neg_op = INDEX_op_neg_i32;
have_neg = TCG_TARGET_HAS_neg_i32;
- } else {
+ } else if (opc == INDEX_op_sub_i64) {
neg_op = INDEX_op_neg_i64;
have_neg = TCG_TARGET_HAS_neg_i64;
+ } else {
+ neg_op = INDEX_op_neg_vec;
+ have_neg = TCG_TARGET_HAS_neg_vec;
}
if (!have_neg) {
break;
@@ -750,7 +741,7 @@ void tcg_optimize(TCGContext *s)
}
}
break;
- CASE_OP_32_64(xor):
+ CASE_OP_32_64_VEC(xor):
CASE_OP_32_64(nand):
if (!arg_is_const(op->args[1])
&& arg_is_const(op->args[2])
@@ -767,7 +758,7 @@ void tcg_optimize(TCGContext *s)
goto try_not;
}
break;
- CASE_OP_32_64(andc):
+ CASE_OP_32_64_VEC(andc):
if (!arg_is_const(op->args[2])
&& arg_is_const(op->args[1])
&& arg_info(op->args[1])->val == -1) {
@@ -775,7 +766,7 @@ void tcg_optimize(TCGContext *s)
goto try_not;
}
break;
- CASE_OP_32_64(orc):
+ CASE_OP_32_64_VEC(orc):
CASE_OP_32_64(eqv):
if (!arg_is_const(op->args[2])
&& arg_is_const(op->args[1])
@@ -789,7 +780,10 @@ void tcg_optimize(TCGContext *s)
TCGOpcode not_op;
bool have_not;
- if (def->flags & TCG_OPF_64BIT) {
+ if (def->flags & TCG_OPF_VECTOR) {
+ not_op = INDEX_op_not_vec;
+ have_not = TCG_TARGET_HAS_not_vec;
+ } else if (def->flags & TCG_OPF_64BIT) {
not_op = INDEX_op_not_i64;
have_not = TCG_TARGET_HAS_not_i64;
} else {
@@ -810,16 +804,16 @@ void tcg_optimize(TCGContext *s)
/* Simplify expression for "op r, a, const => mov r, a" cases */
switch (opc) {
- CASE_OP_32_64(add):
- CASE_OP_32_64(sub):
+ CASE_OP_32_64_VEC(add):
+ CASE_OP_32_64_VEC(sub):
+ CASE_OP_32_64_VEC(or):
+ CASE_OP_32_64_VEC(xor):
+ CASE_OP_32_64_VEC(andc):
CASE_OP_32_64(shl):
CASE_OP_32_64(shr):
CASE_OP_32_64(sar):
CASE_OP_32_64(rotl):
CASE_OP_32_64(rotr):
- CASE_OP_32_64(or):
- CASE_OP_32_64(xor):
- CASE_OP_32_64(andc):
if (!arg_is_const(op->args[1])
&& arg_is_const(op->args[2])
&& arg_info(op->args[2])->val == 0) {
@@ -827,8 +821,8 @@ void tcg_optimize(TCGContext *s)
continue;
}
break;
- CASE_OP_32_64(and):
- CASE_OP_32_64(orc):
+ CASE_OP_32_64_VEC(and):
+ CASE_OP_32_64_VEC(orc):
CASE_OP_32_64(eqv):
if (!arg_is_const(op->args[1])
&& arg_is_const(op->args[2])
@@ -1042,8 +1036,8 @@ void tcg_optimize(TCGContext *s)
/* Simplify expression for "op r, a, 0 => movi r, 0" cases */
switch (opc) {
- CASE_OP_32_64(and):
- CASE_OP_32_64(mul):
+ CASE_OP_32_64_VEC(and):
+ CASE_OP_32_64_VEC(mul):
CASE_OP_32_64(muluh):
CASE_OP_32_64(mulsh):
if (arg_is_const(op->args[2])
@@ -1058,8 +1052,8 @@ void tcg_optimize(TCGContext *s)
/* Simplify expression for "op r, a, a => mov r, a" cases */
switch (opc) {
- CASE_OP_32_64(or):
- CASE_OP_32_64(and):
+ CASE_OP_32_64_VEC(or):
+ CASE_OP_32_64_VEC(and):
if (args_are_copies(op->args[1], op->args[2])) {
tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
continue;
@@ -1071,9 +1065,9 @@ void tcg_optimize(TCGContext *s)
/* Simplify expression for "op r, a, a => movi r, 0" cases */
switch (opc) {
- CASE_OP_32_64(andc):
- CASE_OP_32_64(sub):
- CASE_OP_32_64(xor):
+ CASE_OP_32_64_VEC(andc):
+ CASE_OP_32_64_VEC(sub):
+ CASE_OP_32_64_VEC(xor):
if (args_are_copies(op->args[1], op->args[2])) {
tcg_opt_gen_movi(s, op, op->args[0], 0);
continue;
@@ -1087,13 +1081,23 @@ void tcg_optimize(TCGContext *s)
folding. Constants will be substituted to arguments by register
allocator where needed and possible. Also detect copies. */
switch (opc) {
- CASE_OP_32_64(mov):
+ CASE_OP_32_64_VEC(mov):
tcg_opt_gen_mov(s, op, op->args[0], op->args[1]);
break;
CASE_OP_32_64(movi):
+ case INDEX_op_dupi_vec:
tcg_opt_gen_movi(s, op, op->args[0], op->args[1]);
break;
+ case INDEX_op_dup_vec:
+ if (arg_is_const(op->args[1])) {
+ tmp = arg_info(op->args[1])->val;
+ tmp = dup_const(TCGOP_VECE(op), tmp);
+ tcg_opt_gen_movi(s, op, op->args[0], tmp);
+ continue;
+ }
+ break;
+
CASE_OP_32_64(not):
CASE_OP_32_64(neg):
CASE_OP_32_64(ext8s):
diff --git a/tcg/tcg-gvec-desc.h b/tcg/tcg-gvec-desc.h
new file mode 100644
index 0000000000..3b4c2d9c69
--- /dev/null
+++ b/tcg/tcg-gvec-desc.h
@@ -0,0 +1,49 @@
+/*
+ * Generic vector operation descriptor
+ *
+ * Copyright (c) 2018 Linaro
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* Layout of the 32-bit descriptor, from bit 0 upward: a 5-bit operation
+ size field, a 5-bit maximum size field, and the remaining 22 bits of
+ operation-specific data.
+ ??? These bit widths are set for ARM SVE, maxing out at 256 byte vectors. */
+#define SIMD_OPRSZ_SHIFT 0
+#define SIMD_OPRSZ_BITS 5
+
+#define SIMD_MAXSZ_SHIFT (SIMD_OPRSZ_SHIFT + SIMD_OPRSZ_BITS)
+#define SIMD_MAXSZ_BITS 5
+
+#define SIMD_DATA_SHIFT (SIMD_MAXSZ_SHIFT + SIMD_MAXSZ_BITS)
+#define SIMD_DATA_BITS (32 - SIMD_DATA_SHIFT)
+
+/* Create a descriptor from components. */
+uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data);
+
+/* Extract the operation size from a descriptor. The field holds
+ (size / 8) - 1, so the result is a multiple of 8 between 8 and 256. */
+static inline intptr_t simd_oprsz(uint32_t desc)
+{
+ return (extract32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS) + 1) * 8;
+}
+
+/* Extract the max vector size from a descriptor. The field holds
+ (size / 8) - 1, so the result is a multiple of 8 between 8 and 256. */
+static inline intptr_t simd_maxsz(uint32_t desc)
+{
+ return (extract32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS) + 1) * 8;
+}
+
+/* Extract the operation-specific data from a descriptor. The field is
+ read with sextract32, i.e. as a signed value of SIMD_DATA_BITS bits. */
+static inline int32_t simd_data(uint32_t desc)
+{
+ return sextract32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS);
+}
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
new file mode 100644
index 0000000000..bfe44bba81
--- /dev/null
+++ b/tcg/tcg-op-gvec.c
@@ -0,0 +1,2216 @@
+/*
+ * Generic vector operation expansion
+ *
+ * Copyright (c) 2018 Linaro
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "tcg.h"
+#include "tcg-op.h"
+#include "tcg-op-gvec.h"
+#include "tcg-gvec-desc.h"
+
+#define MAX_UNROLL 4
+
+/* Verify vector size and alignment rules. OFS should be the OR of all
+ of the operand offsets so that we can check them all at once.
+ OPRSZ must be non-zero and no larger than MAXSZ. All checks use
+ tcg_debug_assert. */
+static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
+{
+ uint32_t opr_align = oprsz >= 16 ? 15 : 7; /* mask: multiple of 16
+ once the size reaches 16, multiple of 8 below that */
+ uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7; /* same
+ rule, applied to MAXSZ and to the offsets */
+ tcg_debug_assert(oprsz > 0);
+ tcg_debug_assert(oprsz <= maxsz);
+ tcg_debug_assert((oprsz & opr_align) == 0);
+ tcg_debug_assert((maxsz & max_align) == 0);
+ tcg_debug_assert((ofs & max_align) == 0);
+}
+
+/* Verify vector overlap rules for two operands: the S-byte regions at
+ offsets D and A must either start at the same offset or not overlap
+ at all. */
+static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
+{
+ tcg_debug_assert(d == a || d + s <= a || a + s <= d);
+}
+
+/* Verify vector overlap rules for three operands, by applying the
+ two-operand rule to each of the three pairs. */
+static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
+{
+ check_overlap_2(d, a, s);
+ check_overlap_2(d, b, s);
+ check_overlap_2(a, b, s);
+}
+
+/* Verify vector overlap rules for four operands, by applying the
+ two-operand rule to each of the six pairs. */
+static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
+ uint32_t c, uint32_t s)
+{
+ check_overlap_2(d, a, s);
+ check_overlap_2(d, b, s);
+ check_overlap_2(d, c, s);
+ check_overlap_2(a, b, s);
+ check_overlap_2(a, c, s);
+ check_overlap_2(b, c, s);
+}
+
+/* Create a descriptor from components. OPRSZ and MAXSZ must be
+ multiples of 8 no larger than 256; DATA must fit in a signed field
+ of SIMD_DATA_BITS bits. Each size is stored as (size / 8) - 1.
+ NOTE(review): a size of 0 passes the asserts below; (0 / 8) - 1 then
+ wraps around and, assuming deposit32 keeps only the low bits of the
+ field, is stored as the encoding for 256. */
+uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
+{
+ uint32_t desc = 0;
+
+ assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
+ assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
+ assert(data == sextract32(data, 0, SIMD_DATA_BITS));
+
+ oprsz = (oprsz / 8) - 1;
+ maxsz = (maxsz / 8) - 1;
+ desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
+ desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
+ desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
+
+ return desc;
+}
+
+/* Generate a call to a gvec-style helper with two vector operands.
+ DOFS and AOFS are offsets from cpu_env; they are turned into pointers
+ and passed to FN together with a descriptor constant built by
+ simd_desc from OPRSZ, MAXSZ and DATA. All temporaries created here
+ are freed after the call. */
+void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
+ uint32_t oprsz, uint32_t maxsz, int32_t data,
+ gen_helper_gvec_2 *fn)
+{
+ TCGv_ptr a0, a1;
+ TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+
+ a0 = tcg_temp_new_ptr();
+ a1 = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(a0, cpu_env, dofs);
+ tcg_gen_addi_ptr(a1, cpu_env, aofs);
+
+ fn(a0, a1, desc);
+
+ tcg_temp_free_ptr(a0);
+ tcg_temp_free_ptr(a1);
+ tcg_temp_free_i32(desc);
+}
+
+/* Generate a call to a gvec-style helper with two vector operands
+ and one scalar operand. DOFS and AOFS are offsets from cpu_env and
+ are turned into pointers; the scalar C is passed through to FN
+ unchanged, followed by a descriptor constant built by simd_desc from
+ OPRSZ, MAXSZ and DATA. C is not freed here. */
+void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
+ uint32_t oprsz, uint32_t maxsz, int32_t data,
+ gen_helper_gvec_2i *fn)
+{
+ TCGv_ptr a0, a1;
+ TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+
+ a0 = tcg_temp_new_ptr();
+ a1 = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(a0, cpu_env, dofs);
+ tcg_gen_addi_ptr(a1, cpu_env, aofs);
+
+ fn(a0, a1, c, desc);
+
+ tcg_temp_free_ptr(a0);
+ tcg_temp_free_ptr(a1);
+ tcg_temp_free_i32(desc);
+}
+
+/* Generate a call to a gvec-style helper with three vector operands.
+ DOFS, AOFS and BOFS are offsets from cpu_env; they are turned into
+ pointers and passed to FN together with a descriptor constant built
+ by simd_desc from OPRSZ, MAXSZ and DATA. All temporaries created
+ here are freed after the call. */
+void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t oprsz, uint32_t maxsz, int32_t data,
+ gen_helper_gvec_3 *fn)
+{
+ TCGv_ptr a0, a1, a2;
+ TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+
+ a0 = tcg_temp_new_ptr();
+ a1 = tcg_temp_new_ptr();
+ a2 = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(a0, cpu_env, dofs);
+ tcg_gen_addi_ptr(a1, cpu_env, aofs);
+ tcg_gen_addi_ptr(a2, cpu_env, bofs);
+
+ fn(a0, a1, a2, desc);
+
+ tcg_temp_free_ptr(a0);
+ tcg_temp_free_ptr(a1);
+ tcg_temp_free_ptr(a2);
+ tcg_temp_free_i32(desc);
+}
+
+/* Generate a call to a gvec-style helper with four vector operands.
+ DOFS, AOFS, BOFS and COFS are offsets from cpu_env; they are turned
+ into pointers and passed to FN together with a descriptor constant
+ built by simd_desc from OPRSZ, MAXSZ and DATA. All temporaries
+ created here are freed after the call. */
+void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
+ int32_t data, gen_helper_gvec_4 *fn)
+{
+ TCGv_ptr a0, a1, a2, a3;
+ TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+
+ a0 = tcg_temp_new_ptr();
+ a1 = tcg_temp_new_ptr();
+ a2 = tcg_temp_new_ptr();
+ a3 = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(a0, cpu_env, dofs);
+ tcg_gen_addi_ptr(a1, cpu_env, aofs);
+ tcg_gen_addi_ptr(a2, cpu_env, bofs);
+ tcg_gen_addi_ptr(a3, cpu_env, cofs);
+
+ fn(a0, a1, a2, a3, desc);
+
+ tcg_temp_free_ptr(a0);
+ tcg_temp_free_ptr(a1);
+ tcg_temp_free_ptr(a2);
+ tcg_temp_free_ptr(a3);
+ tcg_temp_free_i32(desc);
+}
+
+/* Generate a call to a gvec-style helper with five vector operands.
+ DOFS, AOFS, BOFS, COFS and XOFS are offsets from cpu_env; they are
+ turned into pointers and passed to FN together with a descriptor
+ constant built by simd_desc from OPRSZ, MAXSZ and DATA. All
+ temporaries created here are freed after the call. */
+void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t cofs, uint32_t xofs, uint32_t oprsz,
+ uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
+{
+ TCGv_ptr a0, a1, a2, a3, a4;
+ TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+
+ a0 = tcg_temp_new_ptr();
+ a1 = tcg_temp_new_ptr();
+ a2 = tcg_temp_new_ptr();
+ a3 = tcg_temp_new_ptr();
+ a4 = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(a0, cpu_env, dofs);
+ tcg_gen_addi_ptr(a1, cpu_env, aofs);
+ tcg_gen_addi_ptr(a2, cpu_env, bofs);
+ tcg_gen_addi_ptr(a3, cpu_env, cofs);
+ tcg_gen_addi_ptr(a4, cpu_env, xofs);
+
+ fn(a0, a1, a2, a3, a4, desc);
+
+ tcg_temp_free_ptr(a0);
+ tcg_temp_free_ptr(a1);
+ tcg_temp_free_ptr(a2);
+ tcg_temp_free_ptr(a3);
+ tcg_temp_free_ptr(a4);
+ tcg_temp_free_i32(desc);
+}
+
+/* Generate a call to a gvec-style helper with two vector operands
+ and an extra pointer operand. DOFS and AOFS are offsets from
+ cpu_env and are turned into pointers; PTR is passed through to FN
+ unchanged, followed by a descriptor constant built by simd_desc from
+ OPRSZ, MAXSZ and DATA. PTR is not freed here. */
+void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
+ TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
+ int32_t data, gen_helper_gvec_2_ptr *fn)
+{
+ TCGv_ptr a0, a1;
+ TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+
+ a0 = tcg_temp_new_ptr();
+ a1 = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(a0, cpu_env, dofs);
+ tcg_gen_addi_ptr(a1, cpu_env, aofs);
+
+ fn(a0, a1, ptr, desc);
+
+ tcg_temp_free_ptr(a0);
+ tcg_temp_free_ptr(a1);
+ tcg_temp_free_i32(desc);
+}
+
+/* Generate a call to a gvec-style helper with three vector operands
+ and an extra pointer operand. DOFS, AOFS and BOFS are offsets from
+ cpu_env and are turned into pointers; PTR is passed through to FN
+ unchanged, followed by a descriptor constant built by simd_desc from
+ OPRSZ, MAXSZ and DATA. PTR is not freed here. */
+void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
+ int32_t data, gen_helper_gvec_3_ptr *fn)
+{
+ TCGv_ptr a0, a1, a2;
+ TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+
+ a0 = tcg_temp_new_ptr();
+ a1 = tcg_temp_new_ptr();
+ a2 = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(a0, cpu_env, dofs);
+ tcg_gen_addi_ptr(a1, cpu_env, aofs);
+ tcg_gen_addi_ptr(a2, cpu_env, bofs);
+
+ fn(a0, a1, a2, ptr, desc);
+
+ tcg_temp_free_ptr(a0);
+ tcg_temp_free_ptr(a1);
+ tcg_temp_free_ptr(a2);
+ tcg_temp_free_i32(desc);
+}
+
+/* Generate a call to a gvec-style helper with four vector operands
+ and an extra pointer operand. DOFS, AOFS, BOFS and COFS are offsets
+ from cpu_env and are turned into pointers; PTR is passed through to
+ FN unchanged, followed by a descriptor constant built by simd_desc
+ from OPRSZ, MAXSZ and DATA. PTR is not freed here. */
+void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
+ uint32_t maxsz, int32_t data,
+ gen_helper_gvec_4_ptr *fn)
+{
+ TCGv_ptr a0, a1, a2, a3;
+ TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));
+
+ a0 = tcg_temp_new_ptr();
+ a1 = tcg_temp_new_ptr();
+ a2 = tcg_temp_new_ptr();
+ a3 = tcg_temp_new_ptr();
+
+ tcg_gen_addi_ptr(a0, cpu_env, dofs);
+ tcg_gen_addi_ptr(a1, cpu_env, aofs);
+ tcg_gen_addi_ptr(a2, cpu_env, bofs);
+ tcg_gen_addi_ptr(a3, cpu_env, cofs);
+
+ fn(a0, a1, a2, a3, ptr, desc);
+
+ tcg_temp_free_ptr(a0);
+ tcg_temp_free_ptr(a1);
+ tcg_temp_free_ptr(a2);
+ tcg_temp_free_ptr(a3);
+ tcg_temp_free_i32(desc);
+}
+
+/* Return true if we want to implement something of OPRSZ bytes
+ in units of LNSZ. This limits the expansion of inline code.
+ The answer is true when OPRSZ holds between 1 and MAX_UNROLL units;
+ the division truncates, so a partial trailing unit is not counted. */
+static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
+{
+ uint32_t lnct = oprsz / lnsz;
+ return lnct >= 1 && lnct <= MAX_UNROLL;
+}
+
+static void expand_clr(uint32_t dofs, uint32_t maxsz);
+
+/* Duplicate C as per VECE: truncate C to one element of size VECE and
+ replicate it across a 64-bit value. For MO_64, C is returned as is;
+ any other VECE is a programming error.
+ NOTE(review): the function name is parenthesized, presumably so that
+ a function-like macro of the same name in a header is not expanded
+ at this definition; confirm. */
+uint64_t (dup_const)(unsigned vece, uint64_t c)
+{
+ switch (vece) {
+ case MO_8:
+ return 0x0101010101010101ull * (uint8_t)c;
+ case MO_16:
+ return 0x0001000100010001ull * (uint16_t)c;
+ case MO_32:
+ return 0x0000000100000001ull * (uint32_t)c;
+ case MO_64:
+ return c;
+ default:
+ g_assert_not_reached();
+ }
+}
+
+/* Emit ops that replicate the low VECE-sized element of IN across the
+   32-bit value OUT.  Only MO_8, MO_16 and MO_32 are accepted.  */
+static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
+{
+    if (vece == MO_8) {
+        /* Isolate the byte, then multiply to copy it into every lane.  */
+        tcg_gen_ext8u_i32(out, in);
+        tcg_gen_muli_i32(out, out, 0x01010101);
+    } else if (vece == MO_16) {
+        /* Copy the low half into the high half.  */
+        tcg_gen_deposit_i32(out, in, in, 16, 16);
+    } else if (vece == MO_32) {
+        tcg_gen_mov_i32(out, in);
+    } else {
+        g_assert_not_reached();
+    }
+}
+
+/* Emit ops that replicate the low VECE-sized element of IN across the
+   64-bit value OUT.  MO_8 through MO_64 are accepted.  */
+static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
+{
+    if (vece == MO_8) {
+        /* Isolate the byte, then multiply to copy it into every lane.  */
+        tcg_gen_ext8u_i64(out, in);
+        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
+    } else if (vece == MO_16) {
+        /* Same trick with 16-bit lanes.  */
+        tcg_gen_ext16u_i64(out, in);
+        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
+    } else if (vece == MO_32) {
+        /* Copy the low half into the high half.  */
+        tcg_gen_deposit_i64(out, in, in, 32, 32);
+    } else if (vece == MO_64) {
+        tcg_gen_mov_i64(out, in);
+    } else {
+        g_assert_not_reached();
+    }
+}
+
+/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
+ * Only one of IN_32 or IN_64 may be set;
+ * IN_C is used if IN_32 and IN_64 are unset.
+ *
+ * Three strategies are tried in order: inline stores of a host vector
+ * register, inline stores of a 32-bit or 64-bit integer, and finally a
+ * call to an out-of-line gvec_dup helper.  The two inline strategies
+ * jump to "done", where any bytes between OPRSZ and MAXSZ are cleared
+ * with expand_clr.  The out-of-line path returns without that step and
+ * instead passes both sizes to the helper through the descriptor.
+ */
+static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
+ uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
+ uint64_t in_c)
+{
+ TCGType type;
+ TCGv_i64 t_64;
+ TCGv_i32 t_32, t_desc;
+ TCGv_ptr t_ptr;
+ uint32_t i;
+
+ assert(vece <= (in_32 ? MO_32 : MO_64));
+ assert(in_32 == NULL || in_64 == NULL);
+
+ /* If we're storing 0, expand oprsz to maxsz. */
+ if (in_32 == NULL && in_64 == NULL) {
+ in_c = dup_const(vece, in_c); /* from here on IN_C is a full 64-bit pattern */
+ if (in_c == 0) {
+ oprsz = maxsz;
+ }
+ }
+
+ type = 0; /* 0 means "no host vector type selected" */
+ if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) {
+ type = TCG_TYPE_V256;
+ } else if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) {
+ type = TCG_TYPE_V128;
+ } else if (TCG_TARGET_HAS_v64 && check_size_impl(oprsz, 8)
+ /* Prefer integer when 64-bit host and no variable dup. */
+ && !(TCG_TARGET_REG_BITS == 64 && in_32 == NULL
+ && (in_64 == NULL || vece == MO_64))) {
+ type = TCG_TYPE_V64;
+ }
+
+ /* Implement inline with a vector type, if possible. */
+ if (type != 0) {
+ TCGv_vec t_vec = tcg_temp_new_vec(type);
+
+ if (in_32) {
+ tcg_gen_dup_i32_vec(vece, t_vec, in_32);
+ } else if (in_64) {
+ tcg_gen_dup_i64_vec(vece, t_vec, in_64);
+ } else {
+ switch (vece) {
+ case MO_8:
+ tcg_gen_dup8i_vec(t_vec, in_c);
+ break;
+ case MO_16:
+ tcg_gen_dup16i_vec(t_vec, in_c);
+ break;
+ case MO_32:
+ tcg_gen_dup32i_vec(t_vec, in_c);
+ break;
+ default:
+ tcg_gen_dup64i_vec(t_vec, in_c);
+ break;
+ }
+ }
+
+ /* Store with successively narrower vector widths; I carries over
+ from one loop to the next so each picks up where the last ended. */
+ i = 0;
+ if (TCG_TARGET_HAS_v256) {
+ for (; i + 32 <= oprsz; i += 32) {
+ tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
+ }
+ }
+ if (TCG_TARGET_HAS_v128) {
+ for (; i + 16 <= oprsz; i += 16) {
+ tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
+ }
+ }
+ if (TCG_TARGET_HAS_v64) {
+ for (; i < oprsz; i += 8) {
+ tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
+ }
+ }
+ tcg_temp_free_vec(t_vec);
+ goto done;
+ }
+
+ /* Otherwise, inline with an integer type, unless "large". */
+ if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
+ t_64 = NULL; /* exactly one of t_64 / t_32 is set below */
+ t_32 = NULL;
+
+ if (in_32) {
+ /* We are given a 32-bit variable input. For a 64-bit host,
+ use a 64-bit operation unless the 32-bit operation would
+ be simple enough. */
+ if (TCG_TARGET_REG_BITS == 64
+ && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
+ t_64 = tcg_temp_new_i64();
+ tcg_gen_extu_i32_i64(t_64, in_32);
+ gen_dup_i64(vece, t_64, t_64);
+ } else {
+ t_32 = tcg_temp_new_i32();
+ gen_dup_i32(vece, t_32, in_32);
+ }
+ } else if (in_64) {
+ /* We are given a 64-bit variable input. */
+ t_64 = tcg_temp_new_i64();
+ gen_dup_i64(vece, t_64, in_64);
+ } else {
+ /* We are given a constant input. */
+ /* For 64-bit hosts, use 64-bit constants for "simple" constants
+ or when we'd need too many 32-bit stores, or when a 64-bit
+ constant is really required. */
+ if (vece == MO_64
+ || (TCG_TARGET_REG_BITS == 64
+ && (in_c == 0 || in_c == -1
+ || !check_size_impl(oprsz, 4)))) {
+ t_64 = tcg_const_i64(in_c);
+ } else {
+ t_32 = tcg_const_i32(in_c);
+ }
+ }
+
+ /* Implement inline if we picked an implementation size above. */
+ if (t_32) {
+ for (i = 0; i < oprsz; i += 4) {
+ tcg_gen_st_i32(t_32, cpu_env, dofs + i);
+ }
+ tcg_temp_free_i32(t_32);
+ goto done;
+ }
+ if (t_64) {
+ for (i = 0; i < oprsz; i += 8) {
+ tcg_gen_st_i64(t_64, cpu_env, dofs + i);
+ }
+ tcg_temp_free_i64(t_64);
+ goto done;
+ }
+ }
+
+ /* Otherwise implement out of line. */
+ t_ptr = tcg_temp_new_ptr();
+ tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
+ t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));
+
+ if (vece == MO_64) {
+ if (in_64) {
+ gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
+ } else {
+ t_64 = tcg_const_i64(in_c);
+ gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
+ tcg_temp_free_i64(t_64);
+ }
+ } else {
+ typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
+ static dup_fn * const fns[3] = { /* indexed by VECE: MO_8, MO_16, MO_32 */
+ gen_helper_gvec_dup8,
+ gen_helper_gvec_dup16,
+ gen_helper_gvec_dup32
+ };
+
+ if (in_32) {
+ fns[vece](t_ptr, t_desc, in_32);
+ } else {
+ t_32 = tcg_temp_new_i32();
+ if (in_64) {
+ tcg_gen_extrl_i64_i32(t_32, in_64);
+ } else if (vece == MO_8) {
+ tcg_gen_movi_i32(t_32, in_c & 0xff);
+ } else if (vece == MO_16) {
+ tcg_gen_movi_i32(t_32, in_c & 0xffff);
+ } else {
+ tcg_gen_movi_i32(t_32, in_c);
+ }
+ fns[vece](t_ptr, t_desc, t_32);
+ tcg_temp_free_i32(t_32);
+ }
+ }
+
+ tcg_temp_free_ptr(t_ptr);
+ tcg_temp_free_i32(t_desc);
+ return;
+
+ done:
+ if (oprsz < maxsz) {
+ expand_clr(dofs + oprsz, maxsz - oprsz);
+ }
+}
+
+/* Clear MAXSZ bytes starting at DOFS.  This is do_dup() with a constant
+   zero byte and with the operation size equal to the maximum size. */
+static void expand_clr(uint32_t dofs, uint32_t maxsz)
+{
+ do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
+}
+
+/* Expand OPRSZ bytes worth of two-operand operations using i32 elements.
+   Each 32-bit word at AOFS is loaded, transformed in place by FNI, and
+   stored to the same index at DOFS.  */
+static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
+                         void (*fni)(TCGv_i32, TCGv_i32))
+{
+    TCGv_i32 elt = tcg_temp_new_i32();
+    uint32_t ofs = 0;
+
+    while (ofs < oprsz) {
+        tcg_gen_ld_i32(elt, cpu_env, aofs + ofs);
+        fni(elt, elt);
+        tcg_gen_st_i32(elt, cpu_env, dofs + ofs);
+        ofs += 4;
+    }
+    tcg_temp_free_i32(elt);
+}
+
+/* Expand OPRSZ bytes worth of operations with one vector input and an
+   immediate C, using i32 elements.  When LOAD_DEST is set, the current
+   destination word is loaded into the output temporary before FNI is
+   called, so FNI can read it.  */
+static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
+                          int32_t c, bool load_dest,
+                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
+{
+    TCGv_i32 src = tcg_temp_new_i32();
+    TCGv_i32 dst = tcg_temp_new_i32();
+    uint32_t ofs = 0;
+
+    while (ofs < oprsz) {
+        tcg_gen_ld_i32(src, cpu_env, aofs + ofs);
+        if (load_dest) {
+            tcg_gen_ld_i32(dst, cpu_env, dofs + ofs);
+        }
+        fni(dst, src, c);
+        tcg_gen_st_i32(dst, cpu_env, dofs + ofs);
+        ofs += 4;
+    }
+    tcg_temp_free_i32(src);
+    tcg_temp_free_i32(dst);
+}
+
+/* Expand OPRSZ bytes worth of operations with one vector input and a
+   scalar C, using i32 elements.  SCALAR_FIRST selects whether C is the
+   first or the second input operand handed to FNI.  */
+static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
+                          TCGv_i32 c, bool scalar_first,
+                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
+{
+    TCGv_i32 src = tcg_temp_new_i32();
+    TCGv_i32 dst = tcg_temp_new_i32();
+    /* The operand order is fixed for the whole expansion.  */
+    TCGv_i32 lhs = scalar_first ? c : src;
+    TCGv_i32 rhs = scalar_first ? src : c;
+    uint32_t ofs = 0;
+
+    while (ofs < oprsz) {
+        tcg_gen_ld_i32(src, cpu_env, aofs + ofs);
+        fni(dst, lhs, rhs);
+        tcg_gen_st_i32(dst, cpu_env, dofs + ofs);
+        ofs += 4;
+    }
+    tcg_temp_free_i32(src);
+    tcg_temp_free_i32(dst);
+}
+
+/* Expand OPRSZ bytes worth of three-operand operations using i32
+   elements.  When LOAD_DEST is set, the current destination word is
+   loaded into the output temporary before FNI is called.  */
+static void expand_3_i32(uint32_t dofs, uint32_t aofs,
+                         uint32_t bofs, uint32_t oprsz, bool load_dest,
+                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
+{
+    TCGv_i32 src_a = tcg_temp_new_i32();
+    TCGv_i32 src_b = tcg_temp_new_i32();
+    TCGv_i32 dst = tcg_temp_new_i32();
+    uint32_t ofs = 0;
+
+    while (ofs < oprsz) {
+        tcg_gen_ld_i32(src_a, cpu_env, aofs + ofs);
+        tcg_gen_ld_i32(src_b, cpu_env, bofs + ofs);
+        if (load_dest) {
+            tcg_gen_ld_i32(dst, cpu_env, dofs + ofs);
+        }
+        fni(dst, src_a, src_b);
+        tcg_gen_st_i32(dst, cpu_env, dofs + ofs);
+        ofs += 4;
+    }
+    tcg_temp_free_i32(dst);
+    tcg_temp_free_i32(src_b);
+    tcg_temp_free_i32(src_a);
+}
+
+/* Expand OPRSZ bytes worth of four-operand operations using i32
+   elements: one output at DOFS and three inputs at AOFS, BOFS, COFS.  */
+static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                         uint32_t cofs, uint32_t oprsz,
+                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
+{
+    TCGv_i32 dst = tcg_temp_new_i32();
+    TCGv_i32 src_a = tcg_temp_new_i32();
+    TCGv_i32 src_b = tcg_temp_new_i32();
+    TCGv_i32 src_c = tcg_temp_new_i32();
+    uint32_t ofs = 0;
+
+    while (ofs < oprsz) {
+        tcg_gen_ld_i32(src_a, cpu_env, aofs + ofs);
+        tcg_gen_ld_i32(src_b, cpu_env, bofs + ofs);
+        tcg_gen_ld_i32(src_c, cpu_env, cofs + ofs);
+        fni(dst, src_a, src_b, src_c);
+        tcg_gen_st_i32(dst, cpu_env, dofs + ofs);
+        ofs += 4;
+    }
+    tcg_temp_free_i32(src_c);
+    tcg_temp_free_i32(src_b);
+    tcg_temp_free_i32(src_a);
+    tcg_temp_free_i32(dst);
+}
+
+/* Expand OPRSZ bytes worth of two-operand operations using i64 elements.
+   Each 64-bit word at AOFS is loaded, transformed in place by FNI, and
+   stored to the same index at DOFS.  */
+static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
+                         void (*fni)(TCGv_i64, TCGv_i64))
+{
+    TCGv_i64 elt = tcg_temp_new_i64();
+    uint32_t ofs = 0;
+
+    while (ofs < oprsz) {
+        tcg_gen_ld_i64(elt, cpu_env, aofs + ofs);
+        fni(elt, elt);
+        tcg_gen_st_i64(elt, cpu_env, dofs + ofs);
+        ofs += 8;
+    }
+    tcg_temp_free_i64(elt);
+}
+
+/* Expand OPRSZ bytes worth of operations with one vector input and an
+   immediate C, using i64 elements.  When LOAD_DEST is set, the current
+   destination word is loaded into the output temporary before FNI is
+   called, so FNI can read it.  */
+static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
+                          int64_t c, bool load_dest,
+                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
+{
+    TCGv_i64 src = tcg_temp_new_i64();
+    TCGv_i64 dst = tcg_temp_new_i64();
+    uint32_t ofs = 0;
+
+    while (ofs < oprsz) {
+        tcg_gen_ld_i64(src, cpu_env, aofs + ofs);
+        if (load_dest) {
+            tcg_gen_ld_i64(dst, cpu_env, dofs + ofs);
+        }
+        fni(dst, src, c);
+        tcg_gen_st_i64(dst, cpu_env, dofs + ofs);
+        ofs += 8;
+    }
+    tcg_temp_free_i64(src);
+    tcg_temp_free_i64(dst);
+}
+
+/* Expand OPRSZ bytes worth of operations with one vector input and a
+   scalar C, using i64 elements.  SCALAR_FIRST selects whether C is the
+   first or the second input operand handed to FNI.  */
+static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
+                          TCGv_i64 c, bool scalar_first,
+                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
+{
+    TCGv_i64 src = tcg_temp_new_i64();
+    TCGv_i64 dst = tcg_temp_new_i64();
+    /* The operand order is fixed for the whole expansion.  */
+    TCGv_i64 lhs = scalar_first ? c : src;
+    TCGv_i64 rhs = scalar_first ? src : c;
+    uint32_t ofs = 0;
+
+    while (ofs < oprsz) {
+        tcg_gen_ld_i64(src, cpu_env, aofs + ofs);
+        fni(dst, lhs, rhs);
+        tcg_gen_st_i64(dst, cpu_env, dofs + ofs);
+        ofs += 8;
+    }
+    tcg_temp_free_i64(src);
+    tcg_temp_free_i64(dst);
+}
+
+/* Expand OPRSZ bytes worth of three-operand operations using i64
+   elements.  When LOAD_DEST is set, the current destination word is
+   loaded into the output temporary before FNI is called.  */
+static void expand_3_i64(uint32_t dofs, uint32_t aofs,
+                         uint32_t bofs, uint32_t oprsz, bool load_dest,
+                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
+{
+    TCGv_i64 src_a = tcg_temp_new_i64();
+    TCGv_i64 src_b = tcg_temp_new_i64();
+    TCGv_i64 dst = tcg_temp_new_i64();
+    uint32_t ofs = 0;
+
+    while (ofs < oprsz) {
+        tcg_gen_ld_i64(src_a, cpu_env, aofs + ofs);
+        tcg_gen_ld_i64(src_b, cpu_env, bofs + ofs);
+        if (load_dest) {
+            tcg_gen_ld_i64(dst, cpu_env, dofs + ofs);
+        }
+        fni(dst, src_a, src_b);
+        tcg_gen_st_i64(dst, cpu_env, dofs + ofs);
+        ofs += 8;
+    }
+    tcg_temp_free_i64(dst);
+    tcg_temp_free_i64(src_b);
+    tcg_temp_free_i64(src_a);
+}
+
+/* Expand OPRSZ bytes worth of four-operand operations using i64
+   elements: one output at DOFS and three inputs at AOFS, BOFS, COFS.  */
+static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                         uint32_t cofs, uint32_t oprsz,
+                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
+{
+    TCGv_i64 dst = tcg_temp_new_i64();
+    TCGv_i64 src_a = tcg_temp_new_i64();
+    TCGv_i64 src_b = tcg_temp_new_i64();
+    TCGv_i64 src_c = tcg_temp_new_i64();
+    uint32_t ofs = 0;
+
+    while (ofs < oprsz) {
+        tcg_gen_ld_i64(src_a, cpu_env, aofs + ofs);
+        tcg_gen_ld_i64(src_b, cpu_env, bofs + ofs);
+        tcg_gen_ld_i64(src_c, cpu_env, cofs + ofs);
+        fni(dst, src_a, src_b, src_c);
+        tcg_gen_st_i64(dst, cpu_env, dofs + ofs);
+        ofs += 8;
+    }
+    tcg_temp_free_i64(src_c);
+    tcg_temp_free_i64(src_b);
+    tcg_temp_free_i64(src_a);
+    tcg_temp_free_i64(dst);
+}
+
+/* Expand OPRSZ bytes worth of two-operand operations using host vectors
+   of type TYPE, stepping TYSZ bytes per iteration.  Each chunk at AOFS
+   is loaded, transformed in place by FNI, and stored at DOFS.  */
+static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
+                         uint32_t oprsz, uint32_t tysz, TCGType type,
+                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
+{
+    TCGv_vec chunk = tcg_temp_new_vec(type);
+    uint32_t ofs = 0;
+
+    while (ofs < oprsz) {
+        tcg_gen_ld_vec(chunk, cpu_env, aofs + ofs);
+        fni(vece, chunk, chunk);
+        tcg_gen_st_vec(chunk, cpu_env, dofs + ofs);
+        ofs += tysz;
+    }
+    tcg_temp_free_vec(chunk);
+}
+
+/* Expand OPRSZ bytes worth of operations with one vector input and an
+   immediate C, using host vectors of type TYPE and stepping TYSZ bytes
+   per iteration.  When LOAD_DEST is set, the current destination chunk
+   is loaded into the output temporary before FNI is called.  */
+static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
+                          uint32_t oprsz, uint32_t tysz, TCGType type,
+                          int64_t c, bool load_dest,
+                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
+{
+    TCGv_vec src = tcg_temp_new_vec(type);
+    TCGv_vec dst = tcg_temp_new_vec(type);
+    uint32_t ofs = 0;
+
+    while (ofs < oprsz) {
+        tcg_gen_ld_vec(src, cpu_env, aofs + ofs);
+        if (load_dest) {
+            tcg_gen_ld_vec(dst, cpu_env, dofs + ofs);
+        }
+        fni(vece, dst, src, c);
+        tcg_gen_st_vec(dst, cpu_env, dofs + ofs);
+        ofs += tysz;
+    }
+    tcg_temp_free_vec(src);
+    tcg_temp_free_vec(dst);
+}
+
+/* Expand OPRSZ bytes worth of operations with one vector input and a
+   pre-duplicated scalar vector C, using host vectors of type TYPE and
+   stepping TYSZ bytes per iteration.  SCALAR_FIRST selects whether C is
+   the first or the second input operand handed to FNI.  */
+static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
+                          uint32_t oprsz, uint32_t tysz, TCGType type,
+                          TCGv_vec c, bool scalar_first,
+                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
+{
+    TCGv_vec src = tcg_temp_new_vec(type);
+    TCGv_vec dst = tcg_temp_new_vec(type);
+    /* The operand order is fixed for the whole expansion.  */
+    TCGv_vec lhs = scalar_first ? c : src;
+    TCGv_vec rhs = scalar_first ? src : c;
+    uint32_t ofs = 0;
+
+    while (ofs < oprsz) {
+        tcg_gen_ld_vec(src, cpu_env, aofs + ofs);
+        fni(vece, dst, lhs, rhs);
+        tcg_gen_st_vec(dst, cpu_env, dofs + ofs);
+        ofs += tysz;
+    }
+    tcg_temp_free_vec(src);
+    tcg_temp_free_vec(dst);
+}
+
+/* Expand OPRSZ bytes worth of three-operand operations using host
+   vectors of type TYPE, stepping TYSZ bytes per iteration.  When
+   LOAD_DEST is set, the current destination chunk is loaded into the
+   output temporary before FNI is called.  */
+static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
+                         uint32_t bofs, uint32_t oprsz,
+                         uint32_t tysz, TCGType type, bool load_dest,
+                         void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
+{
+    TCGv_vec src_a = tcg_temp_new_vec(type);
+    TCGv_vec src_b = tcg_temp_new_vec(type);
+    TCGv_vec dst = tcg_temp_new_vec(type);
+    uint32_t ofs = 0;
+
+    while (ofs < oprsz) {
+        tcg_gen_ld_vec(src_a, cpu_env, aofs + ofs);
+        tcg_gen_ld_vec(src_b, cpu_env, bofs + ofs);
+        if (load_dest) {
+            tcg_gen_ld_vec(dst, cpu_env, dofs + ofs);
+        }
+        fni(vece, dst, src_a, src_b);
+        tcg_gen_st_vec(dst, cpu_env, dofs + ofs);
+        ofs += tysz;
+    }
+    tcg_temp_free_vec(dst);
+    tcg_temp_free_vec(src_b);
+    tcg_temp_free_vec(src_a);
+}
+
+/* Expand OPRSZ bytes worth of four-operand operations using host
+   vectors of type TYPE, stepping TYSZ bytes per iteration: one output
+   at DOFS and three inputs at AOFS, BOFS and COFS.  */
+static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
+                         uint32_t bofs, uint32_t cofs, uint32_t oprsz,
+                         uint32_t tysz, TCGType type,
+                         void (*fni)(unsigned, TCGv_vec, TCGv_vec,
+                                     TCGv_vec, TCGv_vec))
+{
+    TCGv_vec dst = tcg_temp_new_vec(type);
+    TCGv_vec src_a = tcg_temp_new_vec(type);
+    TCGv_vec src_b = tcg_temp_new_vec(type);
+    TCGv_vec src_c = tcg_temp_new_vec(type);
+    uint32_t ofs = 0;
+
+    while (ofs < oprsz) {
+        tcg_gen_ld_vec(src_a, cpu_env, aofs + ofs);
+        tcg_gen_ld_vec(src_b, cpu_env, bofs + ofs);
+        tcg_gen_ld_vec(src_c, cpu_env, cofs + ofs);
+        fni(vece, dst, src_a, src_b, src_c);
+        tcg_gen_st_vec(dst, cpu_env, dofs + ofs);
+        ofs += tysz;
+    }
+    tcg_temp_free_vec(src_c);
+    tcg_temp_free_vec(src_b);
+    tcg_temp_free_vec(src_a);
+    tcg_temp_free_vec(dst);
+}
+
+/* Expand a vector two-operand operation.
+   DOFS and AOFS are the offsets of the destination and source vectors
+   from cpu_env.  OPRSZ bytes are operated on, and the bytes from OPRSZ
+   up to MAXSZ are cleared on the inline paths.  G supplies the
+   candidate implementations: fniv (host vector), fni8 (64-bit integer),
+   fni4 (32-bit integer) and fno (out-of-line helper), tried in that
+   order of preference. */
+void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
+ uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
+{
+ check_size_align(oprsz, maxsz, dofs | aofs);
+ check_overlap_2(dofs, aofs, maxsz);
+
+ /* Recall that ARM SVE allows vector sizes that are not a power of 2.
+ Expand with successively smaller host vector sizes. The intent is
+ that e.g. oprsz == 80 would be expanded with 2x32 + 1x16. */
+ /* ??? For maxsz > oprsz, the host may be able to use an opr-sized
+ operation, zeroing the balance of the register. We can then
+ use a max-sized store to implement the clearing without an extra
+ store operation. This is true for aarch64 and x86_64 hosts. */
+
+ if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
+ && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
+ uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32); /* bytes coverable by whole 256-bit ops */
+ expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
+ if (some == oprsz) {
+ goto done;
+ }
+ /* Advance past the part already expanded; the remainder is handled
+ by the narrower choices below. */
+ dofs += some;
+ aofs += some;
+ oprsz -= some;
+ maxsz -= some;
+ }
+
+ if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
+ && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
+ expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
+ } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
+ && g->fniv && check_size_impl(oprsz, 8)
+ && (!g->opc
+ || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
+ expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
+ } else if (g->fni8 && check_size_impl(oprsz, 8)) {
+ expand_2_i64(dofs, aofs, oprsz, g->fni8);
+ } else if (g->fni4 && check_size_impl(oprsz, 4)) {
+ expand_2_i32(dofs, aofs, oprsz, g->fni4);
+ } else {
+ assert(g->fno != NULL);
+ tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
+ return; /* the helper is given maxsz; no inline tail clear here */
+ }
+
+ done:
+ if (oprsz < maxsz) {
+ expand_clr(dofs + oprsz, maxsz - oprsz);
+ }
+}
+
+/* Expand a vector operation with two vectors and an immediate.
+   DOFS and AOFS are the offsets of the destination and source vectors
+   from cpu_env, and C is the immediate.  OPRSZ bytes are operated on,
+   and the bytes from OPRSZ up to MAXSZ are cleared on the inline paths.
+   G supplies the candidate implementations: fniv (host vector), fni8
+   (64-bit integer), fni4 (32-bit integer), then the out-of-line helpers
+   fno (C passed as the descriptor data) or fnoi (C passed as an i64
+   argument as well). */
+void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
+ uint32_t maxsz, int64_t c, const GVecGen2i *g)
+{
+ check_size_align(oprsz, maxsz, dofs | aofs);
+ check_overlap_2(dofs, aofs, maxsz);
+
+ /* Recall that ARM SVE allows vector sizes that are not a power of 2.
+ Expand with successively smaller host vector sizes. The intent is
+ that e.g. oprsz == 80 would be expanded with 2x32 + 1x16. */
+
+ if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
+ && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
+ uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32); /* bytes coverable by whole 256-bit ops */
+ expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
+ c, g->load_dest, g->fniv);
+ if (some == oprsz) {
+ goto done;
+ }
+ /* Advance past the part already expanded; the remainder is handled
+ by the narrower choices below. */
+ dofs += some;
+ aofs += some;
+ oprsz -= some;
+ maxsz -= some;
+ }
+
+ if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
+ && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
+ expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
+ c, g->load_dest, g->fniv);
+ } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
+ && g->fniv && check_size_impl(oprsz, 8)
+ && (!g->opc
+ || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
+ expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
+ c, g->load_dest, g->fniv);
+ } else if (g->fni8 && check_size_impl(oprsz, 8)) {
+ expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
+ } else if (g->fni4 && check_size_impl(oprsz, 4)) {
+ expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
+ } else {
+ if (g->fno) {
+ tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
+ } else {
+ TCGv_i64 tcg_c = tcg_const_i64(c);
+ tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz, maxsz, c, g->fnoi);
+ tcg_temp_free_i64(tcg_c);
+ }
+ return; /* the helper is given maxsz; no inline tail clear here */
+ }
+
+ done:
+ if (oprsz < maxsz) {
+ expand_clr(dofs + oprsz, maxsz - oprsz);
+ }
+}
+
+/* Expand a vector operation with two vectors and a scalar.
+   DOFS and AOFS are the offsets of the destination and source vectors
+   from cpu_env, and C is the scalar operand, which is duplicated across
+   the lanes selected by G->vece.  OPRSZ bytes are operated on, and the
+   bytes from OPRSZ up to MAXSZ are cleared on the inline paths.
+
+   G supplies the candidate implementations: fniv (host vector), fni8
+   (64-bit integer), fni4 (32-bit integer) and fno (out-of-line helper).
+   As in the other expanders, a host vector type is only selected when
+   G->opc is unset or the backend reports that it can emit that opcode
+   for the type and element size; otherwise we fall back to the integer
+   or out-of-line implementations.  */
+void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
+                     uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
+{
+    TCGType type;
+
+    check_size_align(oprsz, maxsz, dofs | aofs);
+    check_overlap_2(dofs, aofs, maxsz);
+
+    /* Pick the widest usable host vector type; 0 means none.  */
+    type = 0;
+    if (g->fniv) {
+        if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)
+            && (!g->opc
+                || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
+            type = TCG_TYPE_V256;
+        } else if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)
+                   && (!g->opc
+                       || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128,
+                                              g->vece))) {
+            type = TCG_TYPE_V128;
+        } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
+                   && check_size_impl(oprsz, 8)
+                   && (!g->opc
+                       || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64,
+                                              g->vece))) {
+            type = TCG_TYPE_V64;
+        }
+    }
+    if (type != 0) {
+        TCGv_vec t_vec = tcg_temp_new_vec(type);
+
+        tcg_gen_dup_i64_vec(g->vece, t_vec, c);
+
+        /* Recall that ARM SVE allows vector sizes that are not a power of 2.
+           Expand with successively smaller host vector sizes.  The intent is
+           that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
+        switch (type) {
+        case TCG_TYPE_V256:
+            {
+                uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
+                expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
+                              t_vec, g->scalar_first, g->fniv);
+                if (some == oprsz) {
+                    break;
+                }
+                /* Advance past the part already expanded.  */
+                dofs += some;
+                aofs += some;
+                oprsz -= some;
+                maxsz -= some;
+            }
+            /* fallthru */
+
+        case TCG_TYPE_V128:
+            expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
+                          t_vec, g->scalar_first, g->fniv);
+            break;
+
+        case TCG_TYPE_V64:
+            expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
+                          t_vec, g->scalar_first, g->fniv);
+            break;
+
+        default:
+            g_assert_not_reached();
+        }
+        tcg_temp_free_vec(t_vec);
+    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
+        TCGv_i64 t64 = tcg_temp_new_i64();
+
+        gen_dup_i64(g->vece, t64, c);
+        expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
+        tcg_temp_free_i64(t64);
+    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
+        TCGv_i32 t32 = tcg_temp_new_i32();
+
+        tcg_gen_extrl_i64_i32(t32, c);
+        gen_dup_i32(g->vece, t32, t32);
+        expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
+        tcg_temp_free_i32(t32);
+    } else {
+        /* The helper is given maxsz; no inline tail clear here.  */
+        tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
+        return;
+    }
+
+    if (oprsz < maxsz) {
+        expand_clr(dofs + oprsz, maxsz - oprsz);
+    }
+}
+
+/* Expand a vector three-operand operation.
+   DOFS, AOFS and BOFS are the offsets of the destination and the two
+   source vectors from cpu_env.  OPRSZ bytes are operated on, and the
+   bytes from OPRSZ up to MAXSZ are cleared on the inline paths.  G
+   supplies the candidate implementations: fniv (host vector), fni8
+   (64-bit integer), fni4 (32-bit integer) and fno (out-of-line helper),
+   tried in that order of preference.  */
+void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
+{
+    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
+    check_overlap_3(dofs, aofs, bofs, maxsz);
+
+    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
+       Expand with successively smaller host vector sizes.  The intent is
+       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
+
+    if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
+        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
+        uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
+        expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
+                     g->load_dest, g->fniv);
+        if (some == oprsz) {
+            goto done;
+        }
+        /* Advance past the part already expanded.  */
+        dofs += some;
+        aofs += some;
+        bofs += some;
+        oprsz -= some;
+        maxsz -= some;
+    }
+
+    if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
+        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
+        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
+                     g->load_dest, g->fniv);
+    } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
+               && g->fniv && check_size_impl(oprsz, 8)
+               && (!g->opc
+                   || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
+        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
+                     g->load_dest, g->fniv);
+    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
+        expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
+    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
+        expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
+    } else {
+        assert(g->fno != NULL);
+        tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, g->data, g->fno);
+        /* The helper is given maxsz, so skip the inline tail clear,
+           matching the two- and four-operand expanders.  */
+        return;
+    }
+
+ done:
+    if (oprsz < maxsz) {
+        expand_clr(dofs + oprsz, maxsz - oprsz);
+    }
+}
+
+/* Expand a vector four-operand operation.
+   DOFS is the offset of the destination vector from cpu_env and AOFS,
+   BOFS, COFS are the offsets of the three source vectors.  OPRSZ bytes
+   are operated on, and the bytes from OPRSZ up to MAXSZ are cleared on
+   the inline paths.  G supplies the candidate implementations: fniv
+   (host vector), fni8 (64-bit integer), fni4 (32-bit integer) and fno
+   (out-of-line helper), tried in that order of preference. */
+void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
+ uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
+{
+ check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
+ check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
+
+ /* Recall that ARM SVE allows vector sizes that are not a power of 2.
+ Expand with successively smaller host vector sizes. The intent is
+ that e.g. oprsz == 80 would be expanded with 2x32 + 1x16. */
+
+ if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
+ && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
+ uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32); /* bytes coverable by whole 256-bit ops */
+ expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
+ 32, TCG_TYPE_V256, g->fniv);
+ if (some == oprsz) {
+ goto done;
+ }
+ /* Advance past the part already expanded; the remainder is handled
+ by the narrower choices below. */
+ dofs += some;
+ aofs += some;
+ bofs += some;
+ cofs += some;
+ oprsz -= some;
+ maxsz -= some;
+ }
+
+ if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
+ && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
+ expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
+ 16, TCG_TYPE_V128, g->fniv);
+ } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
+ && g->fniv && check_size_impl(oprsz, 8)
+ && (!g->opc
+ || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
+ expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
+ 8, TCG_TYPE_V64, g->fniv);
+ } else if (g->fni8 && check_size_impl(oprsz, 8)) {
+ expand_4_i64(dofs, aofs, bofs, cofs, oprsz, g->fni8);
+ } else if (g->fni4 && check_size_impl(oprsz, 4)) {
+ expand_4_i32(dofs, aofs, bofs, cofs, oprsz, g->fni4);
+ } else {
+ assert(g->fno != NULL);
+ tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
+ oprsz, maxsz, g->data, g->fno);
+ return; /* the helper is given maxsz; no inline tail clear here */
+ }
+
+ done:
+ if (oprsz < maxsz) {
+ expand_clr(dofs + oprsz, maxsz - oprsz);
+ }
+}
+
+/*
+ * Expand specific vector operations.
+ */
+
+/* Adapter giving tcg_gen_mov_vec the (vece, dst, src) signature used by
+   the GVecGen2 fniv hook; VECE is not used by a plain move. */
+static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
+{
+ tcg_gen_mov_vec(a, b);
+}
+
+/* Copy OPRSZ bytes from the vector at AOFS to the vector at DOFS and
+   clear the destination bytes from OPRSZ up to MAXSZ.  VECE is not
+   consulted.  */
+void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
+                      uint32_t oprsz, uint32_t maxsz)
+{
+    static const GVecGen2 mov_ops = {
+        .fni8 = tcg_gen_mov_i64,
+        .fniv = vec_mov2,
+        .fno = gen_helper_gvec_mov,
+        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+    };
+
+    if (dofs == aofs) {
+        /* Source and destination coincide: nothing to copy, but the
+           tail beyond OPRSZ must still be cleared.  */
+        check_size_align(oprsz, maxsz, dofs);
+        if (oprsz < maxsz) {
+            expand_clr(dofs + oprsz, maxsz - oprsz);
+        }
+        return;
+    }
+    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &mov_ops);
+}
+
+/* Fill OPRSZ bytes at DOFS with copies of the VECE-sized element taken
+   from the 32-bit variable IN; VECE must not exceed MO_32.  The work is
+   delegated to do_dup(), which also receives MAXSZ. */
+void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
+ uint32_t maxsz, TCGv_i32 in)
+{
+ check_size_align(oprsz, maxsz, dofs);
+ tcg_debug_assert(vece <= MO_32);
+ do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
+}
+
+/* Fill OPRSZ bytes at DOFS with copies of the VECE-sized element taken
+   from the 64-bit variable IN; VECE must not exceed MO_64.  The work is
+   delegated to do_dup(), which also receives MAXSZ. */
+void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
+ uint32_t maxsz, TCGv_i64 in)
+{
+ check_size_align(oprsz, maxsz, dofs);
+ tcg_debug_assert(vece <= MO_64);
+ do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
+}
+
+/* Fill OPRSZ bytes at DOFS with copies of the element stored at AOFS
+   (both offsets from cpu_env), and clear the destination bytes from
+   OPRSZ up to MAXSZ.  VECE selects the element size: MO_8 through MO_64
+   are loaded into an integer temporary and handed to the variable
+   duplication expanders; the value 4 selects a 128-bit element, which
+   is copied with 128-bit vector stores when the host has them and with
+   pairs of 64-bit stores otherwise.  */
+void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
+                          uint32_t oprsz, uint32_t maxsz)
+{
+    if (vece <= MO_32) {
+        TCGv_i32 in = tcg_temp_new_i32();
+        switch (vece) {
+        case MO_8:
+            tcg_gen_ld8u_i32(in, cpu_env, aofs);
+            break;
+        case MO_16:
+            tcg_gen_ld16u_i32(in, cpu_env, aofs);
+            break;
+        case MO_32:
+            tcg_gen_ld_i32(in, cpu_env, aofs);
+            break;
+        default:
+            g_assert_not_reached();
+        }
+        tcg_gen_gvec_dup_i32(vece, dofs, oprsz, maxsz, in);
+        tcg_temp_free_i32(in);
+    } else if (vece == MO_64) {
+        TCGv_i64 in = tcg_temp_new_i64();
+        tcg_gen_ld_i64(in, cpu_env, aofs);
+        tcg_gen_gvec_dup_i64(MO_64, dofs, oprsz, maxsz, in);
+        tcg_temp_free_i64(in);
+    } else {
+        /* 128-bit duplicate.  */
+        /* ??? Dup to 256-bit vector.  */
+        uint32_t i;
+
+        tcg_debug_assert(vece == 4);
+        tcg_debug_assert(oprsz >= 16);
+        if (TCG_TARGET_HAS_v128) {
+            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
+
+            tcg_gen_ld_vec(in, cpu_env, aofs);
+            for (i = 0; i < oprsz; i += 16) {
+                tcg_gen_st_vec(in, cpu_env, dofs + i);
+            }
+            tcg_temp_free_vec(in);
+        } else {
+            TCGv_i64 in0 = tcg_temp_new_i64();
+            TCGv_i64 in1 = tcg_temp_new_i64();
+
+            tcg_gen_ld_i64(in0, cpu_env, aofs);
+            tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
+            for (i = 0; i < oprsz; i += 16) {
+                tcg_gen_st_i64(in0, cpu_env, dofs + i);
+                tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
+            }
+            tcg_temp_free_i64(in0);
+            tcg_temp_free_i64(in1);
+        }
+        /* The smaller element sizes get their tail cleared by do_dup();
+           this path stores directly, so clear the tail here.  */
+        if (oprsz < maxsz) {
+            expand_clr(dofs + oprsz, maxsz - oprsz);
+        }
+    }
+}
+
+/* Fill OPRSZ bytes at DOFS with copies of the 64-bit constant X.  The
+   work is delegated to do_dup(), which also receives MAXSZ. */
+void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
+ uint32_t maxsz, uint64_t x)
+{
+ check_size_align(oprsz, maxsz, dofs);
+ do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
+}
+
+/* Fill OPRSZ bytes at DOFS with copies of the 32-bit constant X.  The
+   work is delegated to do_dup(), which also receives MAXSZ. */
+void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
+ uint32_t maxsz, uint32_t x)
+{
+ check_size_align(oprsz, maxsz, dofs);
+ do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
+}
+
+/* Fill OPRSZ bytes at DOFS with copies of the 16-bit constant X.  The
+   work is delegated to do_dup(), which also receives MAXSZ. */
+void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
+ uint32_t maxsz, uint16_t x)
+{
+ check_size_align(oprsz, maxsz, dofs);
+ do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
+}
+
+/* Fill OPRSZ bytes at DOFS with copies of the 8-bit constant X.  The
+   work is delegated to do_dup(), which also receives MAXSZ. */
+void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
+ uint32_t maxsz, uint8_t x)
+{
+ check_size_align(oprsz, maxsz, dofs);
+ do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
+}
+
+/* Store the bitwise complement of the vector at AOFS into the vector
+   at DOFS.  VECE is not consulted; the expansion goes through
+   tcg_gen_gvec_2().  */
+void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
+                      uint32_t oprsz, uint32_t maxsz)
+{
+    static const GVecGen2 not_ops = {
+        .fni8 = tcg_gen_not_i64,
+        .fniv = tcg_gen_not_vec,
+        .fno = gen_helper_gvec_not,
+        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+    };
+
+    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &not_ops);
+}
+
+/* Lane-wise addition inside a single 64-bit value, done with one
+   ordinary addition plus masking.  M must hold the sign bit of every
+   lane.  The sign bits are removed from both inputs before adding, so
+   a carry out of one lane cannot reach the next; the correct sign bit
+   of each lane is then restored with an XOR.  This 6-operation form is
+   more efficient than separate additions when there are 4 or more
+   lanes in the 64-bit operation.  */
+static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
+{
+    TCGv_i64 a_nosign = tcg_temp_new_i64();
+    TCGv_i64 b_nosign = tcg_temp_new_i64();
+    TCGv_i64 sign_fix = tcg_temp_new_i64();
+
+    /* All reads of A and B happen before D is written, so D may alias
+       either input.  */
+    tcg_gen_andc_i64(a_nosign, a, m);
+    tcg_gen_andc_i64(b_nosign, b, m);
+    tcg_gen_xor_i64(sign_fix, a, b);
+    tcg_gen_add_i64(d, a_nosign, b_nosign);
+    tcg_gen_and_i64(sign_fix, sign_fix, m);
+    tcg_gen_xor_i64(d, d, sign_fix);
+
+    tcg_temp_free_i64(a_nosign);
+    tcg_temp_free_i64(b_nosign);
+    tcg_temp_free_i64(sign_fix);
+}
+
+/* Add A and B as eight independent 8-bit lanes packed in 64 bits.  */
+void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+    /* Bit 7 of every byte.  */
+    TCGv_i64 sign_mask = tcg_const_i64(dup_const(MO_8, 0x80));
+
+    gen_addv_mask(d, a, b, sign_mask);
+    tcg_temp_free_i64(sign_mask);
+}
+
+/* Add A and B as four independent 16-bit lanes packed in 64 bits.  */
+void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+    /* Bit 15 of every halfword.  */
+    TCGv_i64 sign_mask = tcg_const_i64(dup_const(MO_16, 0x8000));
+
+    gen_addv_mask(d, a, b, sign_mask);
+    tcg_temp_free_i64(sign_mask);
+}
+
+/* Add A and B as two independent 32-bit lanes packed in 64 bits.  The
+   low lane comes from a plain 64-bit add.  The high lane comes from an
+   add in which A's low half has been zeroed, so that no carry from the
+   low lane reaches it.  */
+void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{
+    TCGv_i64 hi_sum = tcg_temp_new_i64();
+    TCGv_i64 lo_sum = tcg_temp_new_i64();
+
+    tcg_gen_andi_i64(hi_sum, a, ~0xffffffffull);
+    tcg_gen_add_i64(lo_sum, a, b);
+    tcg_gen_add_i64(hi_sum, hi_sum, b);
+    /* Combine: high half from hi_sum, low 32 bits from lo_sum.  */
+    tcg_gen_deposit_i64(d, hi_sum, lo_sum, 0, 32);
+
+    tcg_temp_free_i64(hi_sum);
+    tcg_temp_free_i64(lo_sum);
+}
+
+/* Element-wise addition of the vectors at AOFS and BOFS into DOFS, with
+   elements of size VECE (MO_8 through MO_64).  The table below gives,
+   per element size, the implementations handed to tcg_gen_gvec_3().  */
+void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
+                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{
+    static const GVecGen3 add_ops[4] = {
+        { .vece = MO_8,
+          .opc = INDEX_op_add_vec,
+          .fniv = tcg_gen_add_vec,
+          .fni8 = tcg_gen_vec_add8_i64,
+          .fno = gen_helper_gvec_add8 },
+        { .vece = MO_16,
+          .opc = INDEX_op_add_vec,
+          .fniv = tcg_gen_add_vec,
+          .fni8 = tcg_gen_vec_add16_i64,
+          .fno = gen_helper_gvec_add16 },
+        { .vece = MO_32,
+          .opc = INDEX_op_add_vec,
+          .fniv = tcg_gen_add_vec,
+          .fni4 = tcg_gen_add_i32,
+          .fno = gen_helper_gvec_add32 },
+        { .vece = MO_64,
+          .opc = INDEX_op_add_vec,
+          .fniv = tcg_gen_add_vec,
+          .fni8 = tcg_gen_add_i64,
+          .fno = gen_helper_gvec_add64,
+          .prefer_i64 = TCG_TARGET_REG_BITS == 64 },
+    };
+    const GVecGen3 *ops;
+
+    tcg_debug_assert(vece <= MO_64);
+    ops = &add_ops[vece];
+    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, ops);
+}
+
+/* Add the scalar C to every VECE-sized element of the vector at AOFS,
+   storing the result at DOFS.  The table below gives, per element size,
+   the implementations handed to tcg_gen_gvec_2s().  */
+void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
+{
+    static const GVecGen2s adds_ops[4] = {
+        { .vece = MO_8,
+          .opc = INDEX_op_add_vec,
+          .fniv = tcg_gen_add_vec,
+          .fni8 = tcg_gen_vec_add8_i64,
+          .fno = gen_helper_gvec_adds8 },
+        { .vece = MO_16,
+          .opc = INDEX_op_add_vec,
+          .fniv = tcg_gen_add_vec,
+          .fni8 = tcg_gen_vec_add16_i64,
+          .fno = gen_helper_gvec_adds16 },
+        { .vece = MO_32,
+          .opc = INDEX_op_add_vec,
+          .fniv = tcg_gen_add_vec,
+          .fni4 = tcg_gen_add_i32,
+          .fno = gen_helper_gvec_adds32 },
+        { .vece = MO_64,
+          .opc = INDEX_op_add_vec,
+          .fniv = tcg_gen_add_vec,
+          .fni8 = tcg_gen_add_i64,
+          .fno = gen_helper_gvec_adds64,
+          .prefer_i64 = TCG_TARGET_REG_BITS == 64 },
+    };
+    const GVecGen2s *ops;
+
+    tcg_debug_assert(vece <= MO_64);
+    ops = &adds_ops[vece];
+    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, ops);
+}
+
+/* Add the immediate C to every VECE-sized element of the vector at
+   AOFS, storing the result at DOFS.  The immediate is materialized as
+   a constant temporary and passed to tcg_gen_gvec_adds().  */
+void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       int64_t c, uint32_t oprsz, uint32_t maxsz)
+{
+    TCGv_i64 t_imm = tcg_const_i64(c);
+
+    tcg_gen_gvec_adds(vece, dofs, aofs, t_imm, oprsz, maxsz);
+    tcg_temp_free_i64(t_imm);
+}
+
+/* Subtract the scalar C from every VECE-sized element of the vector at
+   AOFS, storing the result at DOFS.  The table below gives, per element
+   size, the implementations handed to tcg_gen_gvec_2s().  */
+void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
+                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
+{
+    static const GVecGen2s subs_ops[4] = {
+        { .vece = MO_8,
+          .opc = INDEX_op_sub_vec,
+          .fniv = tcg_gen_sub_vec,
+          .fni8 = tcg_gen_vec_sub8_i64,
+          .fno = gen_helper_gvec_subs8 },
+        { .vece = MO_16,
+          .opc = INDEX_op_sub_vec,
+          .fniv = tcg_gen_sub_vec,
+          .fni8 = tcg_gen_vec_sub16_i64,
+          .fno = gen_helper_gvec_subs16 },
+        { .vece = MO_32,
+          .opc = INDEX_op_sub_vec,
+          .fniv = tcg_gen_sub_vec,
+          .fni4 = tcg_gen_sub_i32,
+          .fno = gen_helper_gvec_subs32 },
+        { .vece = MO_64,
+          .opc = INDEX_op_sub_vec,
+          .fniv = tcg_gen_sub_vec,
+          .fni8 = tcg_gen_sub_i64,
+          .fno = gen_helper_gvec_subs64,
+          .prefer_i64 = TCG_TARGET_REG_BITS == 64 },
+    };
+    const GVecGen2s *ops;
+
+    tcg_debug_assert(vece <= MO_64);
+    ops = &subs_ops[vece];
+    tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, ops);
+}
+
+/* Perform a vector subtraction using normal subtraction and a mask.
+ Compare gen_addv_mask above. */
+static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
+{ /* d = a - b per element; the callers below pass M built from each element's top bit */
+ TCGv_i64 t1 = tcg_temp_new_i64();
+ TCGv_i64 t2 = tcg_temp_new_i64();
+ TCGv_i64 t3 = tcg_temp_new_i64();
+
+ tcg_gen_or_i64(t1, a, m); /* t1 = a with every mask bit forced to 1 */
+ tcg_gen_andc_i64(t2, b, m); /* t2 = b with every mask bit forced to 0 */
+ tcg_gen_eqv_i64(t3, a, b); /* t3 = ~(a ^ b) */
+ tcg_gen_sub_i64(d, t1, t2); /* a 1-minus-0 mask bit absorbs any borrow, so none crosses it */
+ tcg_gen_and_i64(t3, t3, m); /* keep t3 only at the mask-bit positions */
+ tcg_gen_xor_i64(d, d, t3); /* correct the mask-bit positions of the difference */
+
+ tcg_temp_free_i64(t1);
+ tcg_temp_free_i64(t2);
+ tcg_temp_free_i64(t3);
+}
+
+void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{ /* Subtract 8-bit elements packed in 64-bit values via gen_subv_mask. */
+ TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); /* presumably 0x80 replicated into every byte */
+ gen_subv_mask(d, a, b, m);
+ tcg_temp_free_i64(m);
+}
+
+void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{ /* Subtract 16-bit elements packed in 64-bit values via gen_subv_mask. */
+ TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); /* presumably 0x8000 replicated into every halfword */
+ gen_subv_mask(d, a, b, m);
+ tcg_temp_free_i64(m);
+}
+
+void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{ /* Subtract two 32-bit elements packed in 64-bit values using two 64-bit subtractions. */
+ TCGv_i64 t1 = tcg_temp_new_i64();
+ TCGv_i64 t2 = tcg_temp_new_i64();
+
+ tcg_gen_andi_i64(t1, b, ~0xffffffffull); /* t1 = b with its low 32 bits cleared */
+ tcg_gen_sub_i64(t2, a, b); /* low 32 bits of t2 are the low-element difference */
+ tcg_gen_sub_i64(t1, a, t1); /* low half of t1 is zero, so no borrow reaches the high element */
+ tcg_gen_deposit_i64(d, t1, t2, 0, 32); /* d = high half from t1, low 32 bits from t2 */
+
+ tcg_temp_free_i64(t1);
+ tcg_temp_free_i64(t2);
+}
+
+void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{ /* Expand a three-operand vector subtract through tcg_gen_gvec_3. */
+ static const GVecGen3 g[4] = { /* one descriptor per element size, indexed by VECE */
+ { .fni8 = tcg_gen_vec_sub8_i64,
+ .fniv = tcg_gen_sub_vec,
+ .fno = gen_helper_gvec_sub8,
+ .opc = INDEX_op_sub_vec,
+ .vece = MO_8 },
+ { .fni8 = tcg_gen_vec_sub16_i64,
+ .fniv = tcg_gen_sub_vec,
+ .fno = gen_helper_gvec_sub16,
+ .opc = INDEX_op_sub_vec,
+ .vece = MO_16 },
+ { .fni4 = tcg_gen_sub_i32,
+ .fniv = tcg_gen_sub_vec,
+ .fno = gen_helper_gvec_sub32,
+ .opc = INDEX_op_sub_vec,
+ .vece = MO_32 },
+ { .fni8 = tcg_gen_sub_i64,
+ .fniv = tcg_gen_sub_vec,
+ .fno = gen_helper_gvec_sub64,
+ .opc = INDEX_op_sub_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .vece = MO_64 },
+ };
+
+ tcg_debug_assert(vece <= MO_64); /* VECE is used directly as the index into g[] */
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
+}
+
+void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{ /* Expand a three-operand vector multiply through tcg_gen_gvec_3. */
+ static const GVecGen3 g[4] = { /* MO_8 and MO_16 entries supply no fni8/fni4 integer expander */
+ { .fniv = tcg_gen_mul_vec,
+ .fno = gen_helper_gvec_mul8,
+ .opc = INDEX_op_mul_vec,
+ .vece = MO_8 },
+ { .fniv = tcg_gen_mul_vec,
+ .fno = gen_helper_gvec_mul16,
+ .opc = INDEX_op_mul_vec,
+ .vece = MO_16 },
+ { .fni4 = tcg_gen_mul_i32,
+ .fniv = tcg_gen_mul_vec,
+ .fno = gen_helper_gvec_mul32,
+ .opc = INDEX_op_mul_vec,
+ .vece = MO_32 },
+ { .fni8 = tcg_gen_mul_i64,
+ .fniv = tcg_gen_mul_vec,
+ .fno = gen_helper_gvec_mul64,
+ .opc = INDEX_op_mul_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .vece = MO_64 },
+ };
+
+ tcg_debug_assert(vece <= MO_64); /* VECE is used directly as the index into g[] */
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
+}
+
+void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
+ TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
+{ /* Expand a vector multiply with scalar operand C through tcg_gen_gvec_2s. */
+ static const GVecGen2s g[4] = { /* MO_8 and MO_16 entries supply no fni8/fni4 integer expander */
+ { .fniv = tcg_gen_mul_vec,
+ .fno = gen_helper_gvec_muls8,
+ .opc = INDEX_op_mul_vec,
+ .vece = MO_8 },
+ { .fniv = tcg_gen_mul_vec,
+ .fno = gen_helper_gvec_muls16,
+ .opc = INDEX_op_mul_vec,
+ .vece = MO_16 },
+ { .fni4 = tcg_gen_mul_i32,
+ .fniv = tcg_gen_mul_vec,
+ .fno = gen_helper_gvec_muls32,
+ .opc = INDEX_op_mul_vec,
+ .vece = MO_32 },
+ { .fni8 = tcg_gen_mul_i64,
+ .fniv = tcg_gen_mul_vec,
+ .fno = gen_helper_gvec_muls64,
+ .opc = INDEX_op_mul_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .vece = MO_64 },
+ };
+
+ tcg_debug_assert(vece <= MO_64); /* VECE is used directly as the index into g[] */
+ tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
+}
+
+void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t c, uint32_t oprsz, uint32_t maxsz)
+{ /* Immediate form: wrap C in a temporary and reuse tcg_gen_gvec_muls. */
+ TCGv_i64 tmp = tcg_const_i64(c);
+ tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
+ tcg_temp_free_i64(tmp); /* release the temporary created above */
+}
+
+void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{ /* Saturating add (signed, per the "ss" helper names); only out-of-line helpers are listed. */
+ static const GVecGen3 g[4] = {
+ { .fno = gen_helper_gvec_ssadd8, .vece = MO_8 },
+ { .fno = gen_helper_gvec_ssadd16, .vece = MO_16 },
+ { .fno = gen_helper_gvec_ssadd32, .vece = MO_32 },
+ { .fno = gen_helper_gvec_ssadd64, .vece = MO_64 }
+ };
+ tcg_debug_assert(vece <= MO_64); /* VECE is used directly as the index into g[] */
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
+}
+
+void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{ /* Saturating subtract (signed, per the "ss" helper names); only out-of-line helpers are listed. */
+ static const GVecGen3 g[4] = {
+ { .fno = gen_helper_gvec_sssub8, .vece = MO_8 },
+ { .fno = gen_helper_gvec_sssub16, .vece = MO_16 },
+ { .fno = gen_helper_gvec_sssub32, .vece = MO_32 },
+ { .fno = gen_helper_gvec_sssub64, .vece = MO_64 }
+ };
+ tcg_debug_assert(vece <= MO_64); /* VECE is used directly as the index into g[] */
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
+}
+
+static void tcg_gen_vec_usadd32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{ /* Unsigned saturating add of a single 32-bit value: d = min(a + b, all-ones). */
+ TCGv_i32 max = tcg_const_i32(-1); /* all-ones, the saturated result */
+ tcg_gen_add_i32(d, a, b); /* wrapping sum */
+ tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d); /* sum < a (unsigned) means it wrapped; NOTE(review): needs d distinct from a */
+ tcg_temp_free_i32(max);
+}
+
+static void tcg_gen_vec_usadd32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{ /* Unsigned saturating add of a full 64-bit value; NOTE(review): "32" in the name, yet all ops are i64 */
+ TCGv_i64 max = tcg_const_i64(-1); /* all-ones, the saturated result */
+ tcg_gen_add_i64(d, a, b); /* wrapping sum */
+ tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d); /* sum < a (unsigned) means it wrapped; NOTE(review): needs d distinct from a */
+ tcg_temp_free_i64(max);
+}
+
+void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{ /* Unsigned saturating add; inline integer expanders exist only for MO_32 and MO_64. */
+ static const GVecGen3 g[4] = {
+ { .fno = gen_helper_gvec_usadd8, .vece = MO_8 },
+ { .fno = gen_helper_gvec_usadd16, .vece = MO_16 },
+ { .fni4 = tcg_gen_vec_usadd32_i32,
+ .fno = gen_helper_gvec_usadd32,
+ .vece = MO_32 },
+ { .fni8 = tcg_gen_vec_usadd32_i64, /* the i64 expander handles one whole 64-bit element */
+ .fno = gen_helper_gvec_usadd64,
+ .vece = MO_64 }
+ };
+ tcg_debug_assert(vece <= MO_64); /* VECE is used directly as the index into g[] */
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
+}
+
+static void tcg_gen_vec_ussub32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
+{ /* Unsigned saturating subtract of a single 32-bit value: d = a < b ? 0 : a - b. */
+ TCGv_i32 min = tcg_const_i32(0); /* zero, the saturated result */
+ tcg_gen_sub_i32(d, a, b); /* wrapping difference */
+ tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d); /* a < b (unsigned) means underflow; NOTE(review): needs d distinct from a and b */
+ tcg_temp_free_i32(min);
+}
+
+static void tcg_gen_vec_ussub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
+{ /* Unsigned saturating subtract of a full 64-bit value; NOTE(review): "32" in the name, yet all ops are i64 */
+ TCGv_i64 min = tcg_const_i64(0); /* zero, the saturated result */
+ tcg_gen_sub_i64(d, a, b); /* wrapping difference */
+ tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d); /* a < b (unsigned) means underflow; NOTE(review): needs d distinct from a and b */
+ tcg_temp_free_i64(min);
+}
+
+void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{ /* Unsigned saturating subtract; inline integer expanders exist only for MO_32 and MO_64. */
+ static const GVecGen3 g[4] = {
+ { .fno = gen_helper_gvec_ussub8, .vece = MO_8 },
+ { .fno = gen_helper_gvec_ussub16, .vece = MO_16 },
+ { .fni4 = tcg_gen_vec_ussub32_i32,
+ .fno = gen_helper_gvec_ussub32,
+ .vece = MO_32 },
+ { .fni8 = tcg_gen_vec_ussub32_i64, /* the i64 expander handles one whole 64-bit element */
+ .fno = gen_helper_gvec_ussub64,
+ .vece = MO_64 }
+ };
+ tcg_debug_assert(vece <= MO_64); /* VECE is used directly as the index into g[] */
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
+}
+
+/* Perform a vector negation using normal negation and a mask.
+ Compare gen_subv_mask above. */
+static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
+{ /* d = -b per element; the callers below pass M built from each element's top bit */
+ TCGv_i64 t2 = tcg_temp_new_i64();
+ TCGv_i64 t3 = tcg_temp_new_i64();
+
+ tcg_gen_andc_i64(t3, m, b); /* t3 = mask bits at positions where b has a 0 */
+ tcg_gen_andc_i64(t2, b, m); /* t2 = b with every mask bit forced to 0 */
+ tcg_gen_sub_i64(d, m, t2); /* the set mask bit in the minuend absorbs any borrow */
+ tcg_gen_xor_i64(d, d, t3); /* correct the mask-bit positions of the result */
+
+ tcg_temp_free_i64(t2);
+ tcg_temp_free_i64(t3);
+}
+
+void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
+{ /* Negate 8-bit elements packed in a 64-bit value via gen_negv_mask. */
+ TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); /* presumably 0x80 replicated into every byte */
+ gen_negv_mask(d, b, m);
+ tcg_temp_free_i64(m);
+}
+
+void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
+{ /* Negate 16-bit elements packed in a 64-bit value via gen_negv_mask. */
+ TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); /* presumably 0x8000 replicated into every halfword */
+ gen_negv_mask(d, b, m);
+ tcg_temp_free_i64(m);
+}
+
+void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
+{ /* Negate two 32-bit elements packed in a 64-bit value using two 64-bit negations. */
+ TCGv_i64 t1 = tcg_temp_new_i64();
+ TCGv_i64 t2 = tcg_temp_new_i64();
+
+ tcg_gen_andi_i64(t1, b, ~0xffffffffull); /* t1 = b with its low 32 bits cleared */
+ tcg_gen_neg_i64(t2, b); /* low 32 bits of t2 are the negated low element */
+ tcg_gen_neg_i64(t1, t1); /* low half of t1 is zero, so the high half is the negated high element */
+ tcg_gen_deposit_i64(d, t1, t2, 0, 32); /* d = high half from t1, low 32 bits from t2 */
+
+ tcg_temp_free_i64(t1);
+ tcg_temp_free_i64(t2);
+}
+
+void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t oprsz, uint32_t maxsz)
+{ /* Expand a two-operand vector negate through tcg_gen_gvec_2. */
+ static const GVecGen2 g[4] = { /* one descriptor per element size, indexed by VECE */
+ { .fni8 = tcg_gen_vec_neg8_i64,
+ .fniv = tcg_gen_neg_vec,
+ .fno = gen_helper_gvec_neg8,
+ .opc = INDEX_op_neg_vec,
+ .vece = MO_8 },
+ { .fni8 = tcg_gen_vec_neg16_i64,
+ .fniv = tcg_gen_neg_vec,
+ .fno = gen_helper_gvec_neg16,
+ .opc = INDEX_op_neg_vec,
+ .vece = MO_16 },
+ { .fni4 = tcg_gen_neg_i32,
+ .fniv = tcg_gen_neg_vec,
+ .fno = gen_helper_gvec_neg32,
+ .opc = INDEX_op_neg_vec,
+ .vece = MO_32 },
+ { .fni8 = tcg_gen_neg_i64,
+ .fniv = tcg_gen_neg_vec,
+ .fno = gen_helper_gvec_neg64,
+ .opc = INDEX_op_neg_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .vece = MO_64 },
+ };
+
+ tcg_debug_assert(vece <= MO_64); /* VECE is used directly as the index into g[] */
+ tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
+}
+
+void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{ /* Bitwise AND of two vectors; VECE is not referenced, a single descriptor serves all sizes. */
+ static const GVecGen3 g = {
+ .fni8 = tcg_gen_and_i64,
+ .fniv = tcg_gen_and_vec,
+ .fno = gen_helper_gvec_and,
+ .opc = INDEX_op_and_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+}
+
+void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{ /* Bitwise OR of two vectors; VECE is not referenced, a single descriptor serves all sizes. */
+ static const GVecGen3 g = {
+ .fni8 = tcg_gen_or_i64,
+ .fniv = tcg_gen_or_vec,
+ .fno = gen_helper_gvec_or,
+ .opc = INDEX_op_or_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+}
+
+void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{ /* Bitwise XOR of two vectors; VECE is not referenced, a single descriptor serves all sizes. */
+ static const GVecGen3 g = {
+ .fni8 = tcg_gen_xor_i64,
+ .fniv = tcg_gen_xor_vec,
+ .fno = gen_helper_gvec_xor,
+ .opc = INDEX_op_xor_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+}
+
+void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{ /* AND-with-complement of two vectors; VECE is not referenced, a single descriptor serves all sizes. */
+ static const GVecGen3 g = {
+ .fni8 = tcg_gen_andc_i64,
+ .fniv = tcg_gen_andc_vec,
+ .fno = gen_helper_gvec_andc,
+ .opc = INDEX_op_andc_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+}
+
+void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
+{ /* OR-with-complement of two vectors; VECE is not referenced, a single descriptor serves all sizes. */
+ static const GVecGen3 g = {
+ .fni8 = tcg_gen_orc_i64,
+ .fniv = tcg_gen_orc_vec,
+ .fno = gen_helper_gvec_orc,
+ .opc = INDEX_op_orc_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ };
+ tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
+}
+
+static const GVecGen2s gop_ands = { /* shared by tcg_gen_gvec_ands and tcg_gen_gvec_andi */
+ .fni8 = tcg_gen_and_i64,
+ .fniv = tcg_gen_and_vec,
+ .fno = gen_helper_gvec_ands,
+ .opc = INDEX_op_and_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .vece = MO_64 /* fixed at MO_64: both users pre-duplicate the scalar for the requested VECE */
+};
+
+void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
+ TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
+{ /* AND a vector with scalar C using the MO_64 descriptor gop_ands. */
+ TCGv_i64 tmp = tcg_temp_new_i64();
+ gen_dup_i64(vece, tmp, c); /* presumably replicates the VECE-sized value of C across tmp */
+ tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
+ tcg_temp_free_i64(tmp);
+}
+
+void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t c, uint32_t oprsz, uint32_t maxsz)
+{ /* Immediate form: the constant is duplicated at translation time by dup_const. */
+ TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
+ tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
+ tcg_temp_free_i64(tmp);
+}
+
+static const GVecGen2s gop_xors = { /* shared by tcg_gen_gvec_xors and tcg_gen_gvec_xori */
+ .fni8 = tcg_gen_xor_i64,
+ .fniv = tcg_gen_xor_vec,
+ .fno = gen_helper_gvec_xors,
+ .opc = INDEX_op_xor_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .vece = MO_64 /* fixed at MO_64: both users pre-duplicate the scalar for the requested VECE */
+};
+
+void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
+ TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
+{ /* XOR a vector with scalar C using the MO_64 descriptor gop_xors. */
+ TCGv_i64 tmp = tcg_temp_new_i64();
+ gen_dup_i64(vece, tmp, c); /* presumably replicates the VECE-sized value of C across tmp */
+ tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
+ tcg_temp_free_i64(tmp);
+}
+
+void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t c, uint32_t oprsz, uint32_t maxsz)
+{ /* Immediate form: the constant is duplicated at translation time by dup_const. */
+ TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
+ tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
+ tcg_temp_free_i64(tmp);
+}
+
+static const GVecGen2s gop_ors = { /* shared by tcg_gen_gvec_ors and tcg_gen_gvec_ori */
+ .fni8 = tcg_gen_or_i64,
+ .fniv = tcg_gen_or_vec,
+ .fno = gen_helper_gvec_ors,
+ .opc = INDEX_op_or_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .vece = MO_64 /* fixed at MO_64: both users pre-duplicate the scalar for the requested VECE */
+};
+
+void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
+ TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
+{ /* OR a vector with scalar C using the MO_64 descriptor gop_ors. */
+ TCGv_i64 tmp = tcg_temp_new_i64();
+ gen_dup_i64(vece, tmp, c); /* presumably replicates the VECE-sized value of C across tmp */
+ tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
+ tcg_temp_free_i64(tmp);
+}
+
+void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t c, uint32_t oprsz, uint32_t maxsz)
+{ /* Immediate form: the constant is duplicated at translation time by dup_const. */
+ TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c));
+ tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
+ tcg_temp_free_i64(tmp);
+}
+
+void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
+{ /* Shift left by C each 8-bit element packed in a 64-bit value. */
+ uint64_t mask = dup_const(MO_8, 0xff << c); /* per-byte mask of the bits a byte keeps after the shift; presumably dup_const uses only the low 8 bits */
+ tcg_gen_shli_i64(d, a, c); /* one 64-bit shift; bits spill into the neighbouring byte */
+ tcg_gen_andi_i64(d, d, mask); /* drop the bits that crossed an element boundary */
+}
+
+void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
+{ /* Shift left by C each 16-bit element packed in a 64-bit value. */
+ uint64_t mask = dup_const(MO_16, 0xffff << c); /* per-halfword mask of the bits kept after the shift; presumably dup_const uses only the low 16 bits */
+ tcg_gen_shli_i64(d, a, c); /* one 64-bit shift; bits spill into the neighbouring halfword */
+ tcg_gen_andi_i64(d, d, mask); /* drop the bits that crossed an element boundary */
+}
+
+void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t shift, uint32_t oprsz, uint32_t maxsz)
+{ /* Expand a vector shift-left by the immediate SHIFT. */
+ static const GVecGen2i g[4] = { /* one descriptor per element size, indexed by VECE */
+ { .fni8 = tcg_gen_vec_shl8i_i64,
+ .fniv = tcg_gen_shli_vec,
+ .fno = gen_helper_gvec_shl8i,
+ .opc = INDEX_op_shli_vec,
+ .vece = MO_8 },
+ { .fni8 = tcg_gen_vec_shl16i_i64,
+ .fniv = tcg_gen_shli_vec,
+ .fno = gen_helper_gvec_shl16i,
+ .opc = INDEX_op_shli_vec,
+ .vece = MO_16 },
+ { .fni4 = tcg_gen_shli_i32,
+ .fniv = tcg_gen_shli_vec,
+ .fno = gen_helper_gvec_shl32i,
+ .opc = INDEX_op_shli_vec,
+ .vece = MO_32 },
+ { .fni8 = tcg_gen_shli_i64,
+ .fniv = tcg_gen_shli_vec,
+ .fno = gen_helper_gvec_shl64i,
+ .opc = INDEX_op_shli_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .vece = MO_64 },
+ };
+
+ tcg_debug_assert(vece <= MO_64); /* VECE is used directly as the index into g[] */
+ tcg_debug_assert(shift >= 0 && shift < (8 << vece)); /* SHIFT must be below the element width in bits */
+ if (shift == 0) {
+ tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); /* a zero shift is expanded as a move */
+ } else {
+ tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
+ }
+}
+
+void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
+{ /* Logical shift right by C of each 8-bit element packed in a 64-bit value. */
+ uint64_t mask = dup_const(MO_8, 0xff >> c); /* per-byte mask of the bits a byte keeps after the shift */
+ tcg_gen_shri_i64(d, a, c); /* one 64-bit shift; bits spill in from the neighbouring byte */
+ tcg_gen_andi_i64(d, d, mask); /* drop the bits that crossed an element boundary */
+}
+
+void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
+{ /* Logical shift right by C of each 16-bit element packed in a 64-bit value. */
+ uint64_t mask = dup_const(MO_16, 0xffff >> c); /* per-halfword mask of the bits kept after the shift */
+ tcg_gen_shri_i64(d, a, c); /* one 64-bit shift; bits spill in from the neighbouring halfword */
+ tcg_gen_andi_i64(d, d, mask); /* drop the bits that crossed an element boundary */
+}
+
+void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t shift, uint32_t oprsz, uint32_t maxsz)
+{ /* Expand a vector logical shift-right by the immediate SHIFT. */
+ static const GVecGen2i g[4] = { /* one descriptor per element size, indexed by VECE */
+ { .fni8 = tcg_gen_vec_shr8i_i64,
+ .fniv = tcg_gen_shri_vec,
+ .fno = gen_helper_gvec_shr8i,
+ .opc = INDEX_op_shri_vec,
+ .vece = MO_8 },
+ { .fni8 = tcg_gen_vec_shr16i_i64,
+ .fniv = tcg_gen_shri_vec,
+ .fno = gen_helper_gvec_shr16i,
+ .opc = INDEX_op_shri_vec,
+ .vece = MO_16 },
+ { .fni4 = tcg_gen_shri_i32,
+ .fniv = tcg_gen_shri_vec,
+ .fno = gen_helper_gvec_shr32i,
+ .opc = INDEX_op_shri_vec,
+ .vece = MO_32 },
+ { .fni8 = tcg_gen_shri_i64,
+ .fniv = tcg_gen_shri_vec,
+ .fno = gen_helper_gvec_shr64i,
+ .opc = INDEX_op_shri_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .vece = MO_64 },
+ };
+
+ tcg_debug_assert(vece <= MO_64); /* VECE is used directly as the index into g[] */
+ tcg_debug_assert(shift >= 0 && shift < (8 << vece)); /* SHIFT must be below the element width in bits */
+ if (shift == 0) {
+ tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); /* a zero shift is expanded as a move */
+ } else {
+ tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
+ }
+}
+
+void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
+{ /* Arithmetic shift right by C of each 8-bit element packed in a 64-bit value. */
+ uint64_t s_mask = dup_const(MO_8, 0x80 >> c); /* where each byte's sign bit lands after the shift */
+ uint64_t c_mask = dup_const(MO_8, 0xff >> c); /* bits of each byte that hold shifted-in data */
+ TCGv_i64 s = tcg_temp_new_i64();
+
+ tcg_gen_shri_i64(d, a, c); /* logical 64-bit shift; sign fill is added below */
+ tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */
+ tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
+ tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */
+ tcg_gen_or_i64(d, d, s); /* include sign extension */
+ tcg_temp_free_i64(s);
+}
+
+void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
+{ /* Arithmetic shift right by C of each 16-bit element packed in a 64-bit value. */
+ uint64_t s_mask = dup_const(MO_16, 0x8000 >> c); /* where each halfword's sign bit lands after the shift */
+ uint64_t c_mask = dup_const(MO_16, 0xffff >> c); /* bits of each halfword that hold shifted-in data */
+ TCGv_i64 s = tcg_temp_new_i64();
+
+ tcg_gen_shri_i64(d, a, c); /* logical 64-bit shift; sign fill is added below */
+ tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */
+ tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */
+ tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
+ tcg_gen_or_i64(d, d, s); /* include sign extension */
+ tcg_temp_free_i64(s);
+}
+
+void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t shift, uint32_t oprsz, uint32_t maxsz)
+{ /* Expand a vector arithmetic shift-right by the immediate SHIFT. */
+ static const GVecGen2i g[4] = { /* one descriptor per element size, indexed by VECE */
+ { .fni8 = tcg_gen_vec_sar8i_i64,
+ .fniv = tcg_gen_sari_vec,
+ .fno = gen_helper_gvec_sar8i,
+ .opc = INDEX_op_sari_vec,
+ .vece = MO_8 },
+ { .fni8 = tcg_gen_vec_sar16i_i64,
+ .fniv = tcg_gen_sari_vec,
+ .fno = gen_helper_gvec_sar16i,
+ .opc = INDEX_op_sari_vec,
+ .vece = MO_16 },
+ { .fni4 = tcg_gen_sari_i32,
+ .fniv = tcg_gen_sari_vec,
+ .fno = gen_helper_gvec_sar32i,
+ .opc = INDEX_op_sari_vec,
+ .vece = MO_32 },
+ { .fni8 = tcg_gen_sari_i64,
+ .fniv = tcg_gen_sari_vec,
+ .fno = gen_helper_gvec_sar64i,
+ .opc = INDEX_op_sari_vec,
+ .prefer_i64 = TCG_TARGET_REG_BITS == 64,
+ .vece = MO_64 },
+ };
+
+ tcg_debug_assert(vece <= MO_64); /* VECE is used directly as the index into g[] */
+ tcg_debug_assert(shift >= 0 && shift < (8 << vece)); /* SHIFT must be below the element width in bits */
+ if (shift == 0) {
+ tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); /* a zero shift is expanded as a move */
+ } else {
+ tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
+ }
+}
+
+/* Expand OPRSZ bytes worth of three-operand comparisons using i32 elements. */
+static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t oprsz, TCGCond cond)
+{ /* Each 32-bit result element is all-ones when COND holds, otherwise zero. */
+ TCGv_i32 t0 = tcg_temp_new_i32();
+ TCGv_i32 t1 = tcg_temp_new_i32();
+ uint32_t i;
+
+ for (i = 0; i < oprsz; i += 4) { /* one 4-byte element per iteration */
+ tcg_gen_ld_i32(t0, cpu_env, aofs + i);
+ tcg_gen_ld_i32(t1, cpu_env, bofs + i);
+ tcg_gen_setcond_i32(cond, t0, t0, t1); /* t0 = 1 if the comparison holds, else 0 */
+ tcg_gen_neg_i32(t0, t0); /* turn 1 into -1 (all bits set) */
+ tcg_gen_st_i32(t0, cpu_env, dofs + i);
+ }
+ tcg_temp_free_i32(t1);
+ tcg_temp_free_i32(t0);
+}
+
+static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t oprsz, TCGCond cond)
+{ /* As expand_cmp_i32, but with 64-bit elements: results are all-ones or zero. */
+ TCGv_i64 t0 = tcg_temp_new_i64();
+ TCGv_i64 t1 = tcg_temp_new_i64();
+ uint32_t i;
+
+ for (i = 0; i < oprsz; i += 8) { /* one 8-byte element per iteration */
+ tcg_gen_ld_i64(t0, cpu_env, aofs + i);
+ tcg_gen_ld_i64(t1, cpu_env, bofs + i);
+ tcg_gen_setcond_i64(cond, t0, t0, t1); /* t0 = 1 if the comparison holds, else 0 */
+ tcg_gen_neg_i64(t0, t0); /* turn 1 into -1 (all bits set) */
+ tcg_gen_st_i64(t0, cpu_env, dofs + i);
+ }
+ tcg_temp_free_i64(t1);
+ tcg_temp_free_i64(t0);
+}
+
+static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t tysz,
+ TCGType type, TCGCond cond)
+{ /* Compare OPRSZ bytes in host-vector chunks of TYSZ bytes, using vectors of TYPE. */
+ TCGv_vec t0 = tcg_temp_new_vec(type);
+ TCGv_vec t1 = tcg_temp_new_vec(type);
+ uint32_t i;
+
+ for (i = 0; i < oprsz; i += tysz) { /* one host vector per iteration */
+ tcg_gen_ld_vec(t0, cpu_env, aofs + i);
+ tcg_gen_ld_vec(t1, cpu_env, bofs + i);
+ tcg_gen_cmp_vec(cond, vece, t0, t0, t1); /* no negate step here, unlike the integer expanders */
+ tcg_gen_st_vec(t0, cpu_env, dofs + i);
+ }
+ tcg_temp_free_vec(t1);
+ tcg_temp_free_vec(t0);
+}
+
+void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
+ uint32_t aofs, uint32_t bofs,
+ uint32_t oprsz, uint32_t maxsz)
+{ /* Expand an element-wise comparison: host vectors first, then integers, then a helper call. */
+ static gen_helper_gvec_3 * const eq_fn[4] = {
+ gen_helper_gvec_eq8, gen_helper_gvec_eq16,
+ gen_helper_gvec_eq32, gen_helper_gvec_eq64
+ };
+ static gen_helper_gvec_3 * const ne_fn[4] = {
+ gen_helper_gvec_ne8, gen_helper_gvec_ne16,
+ gen_helper_gvec_ne32, gen_helper_gvec_ne64
+ };
+ static gen_helper_gvec_3 * const lt_fn[4] = {
+ gen_helper_gvec_lt8, gen_helper_gvec_lt16,
+ gen_helper_gvec_lt32, gen_helper_gvec_lt64
+ };
+ static gen_helper_gvec_3 * const le_fn[4] = {
+ gen_helper_gvec_le8, gen_helper_gvec_le16,
+ gen_helper_gvec_le32, gen_helper_gvec_le64
+ };
+ static gen_helper_gvec_3 * const ltu_fn[4] = {
+ gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
+ gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
+ };
+ static gen_helper_gvec_3 * const leu_fn[4] = {
+ gen_helper_gvec_leu8, gen_helper_gvec_leu16,
+ gen_helper_gvec_leu32, gen_helper_gvec_leu64
+ };
+ static gen_helper_gvec_3 * const * const fns[16] = { /* conditions not listed stay NULL and are handled by operand swap below */
+ [TCG_COND_EQ] = eq_fn,
+ [TCG_COND_NE] = ne_fn,
+ [TCG_COND_LT] = lt_fn,
+ [TCG_COND_LE] = le_fn,
+ [TCG_COND_LTU] = ltu_fn,
+ [TCG_COND_LEU] = leu_fn,
+ };
+
+ check_size_align(oprsz, maxsz, dofs | aofs | bofs);
+ check_overlap_3(dofs, aofs, bofs, maxsz);
+
+ if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) { /* result is constant: no comparison needed */
+ do_dup(MO_8, dofs, oprsz, maxsz,
+ NULL, NULL, -(cond == TCG_COND_ALWAYS)); /* fill value is -1 for ALWAYS, 0 for NEVER */
+ return;
+ }
+
+ /* Recall that ARM SVE allows vector sizes that are not a power of 2.
+ Expand with successively smaller host vector sizes. The intent is
+ that e.g. oprsz == 80 would be expanded with 2x32 + 1x16. */
+
+ if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)
+ && tcg_can_emit_vec_op(INDEX_op_cmp_vec, TCG_TYPE_V256, vece)) {
+ uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32); /* the part that is a whole number of 32-byte vectors */
+ expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
+ if (some == oprsz) {
+ goto done;
+ }
+ dofs += some; /* advance all offsets and sizes past the part already expanded */
+ aofs += some;
+ bofs += some;
+ oprsz -= some;
+ maxsz -= some;
+ }
+
+ if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)
+ && tcg_can_emit_vec_op(INDEX_op_cmp_vec, TCG_TYPE_V128, vece)) {
+ expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
+ } else if (TCG_TARGET_HAS_v64
+ && check_size_impl(oprsz, 8)
+ && (TCG_TARGET_REG_BITS == 32 || vece != MO_64) /* on 64-bit hosts MO_64 skips V64 and falls to the branches below */
+ && tcg_can_emit_vec_op(INDEX_op_cmp_vec, TCG_TYPE_V64, vece)) {
+ expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
+ } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
+ expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
+ } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
+ expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
+ } else { /* no inline expansion applies: call an out-of-line helper */
+ gen_helper_gvec_3 * const *fn = fns[cond];
+
+ if (fn == NULL) { /* no helper for this condition: swap operands and use the swapped condition */
+ uint32_t tmp;
+ tmp = aofs, aofs = bofs, bofs = tmp;
+ cond = tcg_swap_cond(cond);
+ fn = fns[cond];
+ assert(fn != NULL);
+ }
+ tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
+ return; /* the helper receives MAXSZ, so the tail clear below is skipped */
+ }
+
+ done:
+ if (oprsz < maxsz) {
+ expand_clr(dofs + oprsz, maxsz - oprsz); /* presumably zeroes the bytes between OPRSZ and MAXSZ */
+ }
+}
diff --git a/tcg/tcg-op-gvec.h b/tcg/tcg-op-gvec.h
new file mode 100644
index 0000000000..ff43a29a0b
--- /dev/null
+++ b/tcg/tcg-op-gvec.h
@@ -0,0 +1,306 @@
+/*
+ * Generic vector operation expansion
+ *
+ * Copyright (c) 2018 Linaro
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * "Generic" vectors. All operands are given as offsets from ENV,
+ * and therefore cannot also be allocated via tcg_global_mem_new_*.
+ * OPRSZ is the byte size of the vector upon which the operation is performed.
+ * MAXSZ is the byte size of the full vector; bytes beyond OPRSZ are cleared.
+ *
+ * All sizes must be 8 or any multiple of 16.
+ * When OPRSZ is 8, the alignment may be 8, otherwise must be 16.
+ * Operands may completely, but not partially, overlap.
+ */
+
+/* Expand a call to a gvec-style helper, with pointers to two vector
+ operands, and a descriptor (see tcg-gvec-desc.h). */
+typedef void gen_helper_gvec_2(TCGv_ptr, TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
+ uint32_t oprsz, uint32_t maxsz, int32_t data,
+ gen_helper_gvec_2 *fn);
+
+/* Similarly, passing an extra data value. */
+typedef void gen_helper_gvec_2i(TCGv_ptr, TCGv_ptr, TCGv_i64, TCGv_i32);
+void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
+ uint32_t oprsz, uint32_t maxsz, int32_t data,
+ gen_helper_gvec_2i *fn);
+
+/* Similarly, passing an extra pointer (e.g. env or float_status). */
+typedef void gen_helper_gvec_2_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
+ TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
+ int32_t data, gen_helper_gvec_2_ptr *fn);
+
+/* Similarly, with three vector operands. */
+typedef void gen_helper_gvec_3(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t oprsz, uint32_t maxsz, int32_t data,
+ gen_helper_gvec_3 *fn);
+
+/* Similarly, with four vector operands. */
+typedef void gen_helper_gvec_4(TCGv_ptr, TCGv_ptr, TCGv_ptr,
+ TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
+ int32_t data, gen_helper_gvec_4 *fn);
+
+/* Similarly, with five vector operands. */
+typedef void gen_helper_gvec_5(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr,
+ TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t cofs, uint32_t xofs, uint32_t oprsz,
+ uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn);
+
+typedef void gen_helper_gvec_3_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr,
+ TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
+ int32_t data, gen_helper_gvec_3_ptr *fn);
+
+typedef void gen_helper_gvec_4_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr,
+ TCGv_ptr, TCGv_ptr, TCGv_i32);
+void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
+ uint32_t maxsz, int32_t data,
+ gen_helper_gvec_4_ptr *fn);
+
+/* Expand a gvec operation. Either inline or out-of-line depending on
+ the actual vector size and the operations supported by the host. */
+typedef struct { /* Expansion descriptor for a two-operand (dest, source) operation. */
+ /* Expand inline as a 64-bit or 32-bit integer.
+ Only one of these will be non-NULL. */
+ void (*fni8)(TCGv_i64, TCGv_i64);
+ void (*fni4)(TCGv_i32, TCGv_i32);
+ /* Expand inline with a host vector type. */
+ void (*fniv)(unsigned, TCGv_vec, TCGv_vec);
+ /* Expand out-of-line helper w/descriptor. */
+ gen_helper_gvec_2 *fno;
+ /* The opcode, if any, to which this corresponds. */
+ TCGOpcode opc;
+ /* The data argument to the out-of-line helper. */
+ int32_t data;
+ /* The vector element size, if applicable. */
+ uint8_t vece;
+ /* Prefer i64 to v64. */
+ bool prefer_i64;
+} GVecGen2; /* passed to tcg_gen_gvec_2 */
+
+typedef struct { /* Expansion descriptor for a two-operand operation with an immediate. */
+ /* Expand inline as a 64-bit or 32-bit integer.
+ Only one of these will be non-NULL. */
+ void (*fni8)(TCGv_i64, TCGv_i64, int64_t);
+ void (*fni4)(TCGv_i32, TCGv_i32, int32_t);
+ /* Expand inline with a host vector type. */
+ void (*fniv)(unsigned, TCGv_vec, TCGv_vec, int64_t);
+ /* Expand out-of-line helper w/descriptor, data in descriptor. */
+ gen_helper_gvec_2 *fno;
+ /* Expand out-of-line helper w/descriptor, data as argument. */
+ gen_helper_gvec_2i *fnoi;
+ /* The opcode, if any, to which this corresponds. */
+ TCGOpcode opc;
+ /* The vector element size, if applicable. */
+ uint8_t vece;
+ /* Prefer i64 to v64. */
+ bool prefer_i64;
+ /* Load dest as a 3rd source operand. */
+ bool load_dest;
+} GVecGen2i; /* passed to tcg_gen_gvec_2i */
+
+typedef struct { /* Expansion descriptor for a two-operand operation with a scalar TCGv_i64. */
+ /* Expand inline as a 64-bit or 32-bit integer.
+ Only one of these will be non-NULL. */
+ void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
+ void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
+ /* Expand inline with a host vector type. */
+ void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
+ /* Expand out-of-line helper w/descriptor. */
+ gen_helper_gvec_2i *fno;
+ /* The opcode, if any, to which this corresponds. */
+ TCGOpcode opc;
+ /* The data argument to the out-of-line helper. */
+ uint32_t data; /* NOTE(review): unsigned here, while GVecGen2/3/4 declare data as int32_t */
+ /* The vector element size, if applicable. */
+ uint8_t vece;
+ /* Prefer i64 to v64. */
+ bool prefer_i64;
+ /* Load scalar as 1st source operand. */
+ bool scalar_first;
+} GVecGen2s; /* passed to tcg_gen_gvec_2s */
+
+typedef struct { /* Expansion descriptor for a three-operand (dest, two sources) operation. */
+ /* Expand inline as a 64-bit or 32-bit integer.
+ Only one of these will be non-NULL. */
+ void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
+ void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
+ /* Expand inline with a host vector type. */
+ void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
+ /* Expand out-of-line helper w/descriptor. */
+ gen_helper_gvec_3 *fno;
+ /* The opcode, if any, to which this corresponds. */
+ TCGOpcode opc;
+ /* The data argument to the out-of-line helper. */
+ int32_t data;
+ /* The vector element size, if applicable. */
+ uint8_t vece;
+ /* Prefer i64 to v64. */
+ bool prefer_i64;
+ /* Load dest as a 3rd source operand. */
+ bool load_dest;
+} GVecGen3; /* passed to tcg_gen_gvec_3 */
+
+typedef struct { /* Expansion descriptor for a four-operand (dest, three sources) operation. */
+ /* Expand inline as a 64-bit or 32-bit integer.
+ Only one of these will be non-NULL. */
+ void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64);
+ void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32);
+ /* Expand inline with a host vector type. */
+ void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, TCGv_vec);
+ /* Expand out-of-line helper w/descriptor. */
+ gen_helper_gvec_4 *fno;
+ /* The opcode, if any, to which this corresponds. */
+ TCGOpcode opc;
+ /* The data argument to the out-of-line helper. */
+ int32_t data;
+ /* The vector element size, if applicable. */
+ uint8_t vece;
+ /* Prefer i64 to v64. */
+ bool prefer_i64;
+} GVecGen4; /* passed to tcg_gen_gvec_4 */
+
+void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
+ uint32_t oprsz, uint32_t maxsz, const GVecGen2 *);
+void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
+ uint32_t maxsz, int64_t c, const GVecGen2i *);
+void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
+ uint32_t maxsz, TCGv_i64 c, const GVecGen2s *);
+void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
+ uint32_t oprsz, uint32_t maxsz, const GVecGen3 *);
+void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
+ uint32_t oprsz, uint32_t maxsz, const GVecGen4 *);
+
+/* Expand a specific vector operation. */
+
+void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t oprsz, uint32_t maxsz);
+
+void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+
+void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t c, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t c, uint32_t oprsz, uint32_t maxsz);
+
+void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
+ TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
+ TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
+ TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
+
+/* Saturated arithmetic. */
+void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+
+void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t bofs, uint32_t oprsz, uint32_t maxsz);
+
+void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t c, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t c, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t c, uint32_t oprsz, uint32_t maxsz);
+
+void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
+ TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
+ TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
+ TCGv_i64 c, uint32_t oprsz, uint32_t maxsz);
+
+void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
+ uint32_t s, uint32_t m);
+void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t s,
+ uint32_t m, TCGv_i32);
+void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t s,
+ uint32_t m, TCGv_i64);
+
+void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t s, uint32_t m, uint8_t x);
+void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t s, uint32_t m, uint16_t x);
+void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t s, uint32_t m, uint32_t x);
+void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t s, uint32_t m, uint64_t x);
+
+void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t shift, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t shift, uint32_t oprsz, uint32_t maxsz);
+void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
+ int64_t shift, uint32_t oprsz, uint32_t maxsz);
+
+void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
+ uint32_t aofs, uint32_t bofs,
+ uint32_t oprsz, uint32_t maxsz);
+
+/*
+ * 64-bit vector operations. Use these when the register has been allocated
+ * with tcg_global_mem_new_i64, and so we cannot also address it via pointer.
+ * OPRSZ = MAXSZ = 8.
+ */
+
+void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 a);
+void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 a);
+void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 a);
+
+void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+
+void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b);
+
+void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
+void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
+void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
+void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
+void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
+void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t);
diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c
new file mode 100644
index 0000000000..70ec889bc1
--- /dev/null
+++ b/tcg/tcg-op-vec.c
@@ -0,0 +1,389 @@
+/*
+ * Tiny Code Generator for QEMU
+ *
+ * Copyright (c) 2018 Linaro, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "cpu.h"
+#include "exec/exec-all.h"
+#include "tcg.h"
+#include "tcg-op.h"
+#include "tcg-mo.h"
+
+/* Reduce the number of ifdefs below. This assumes that all uses of
+ TCGV_HIGH and TCGV_LOW are properly protected by a conditional that
+ the compiler can eliminate. */
+#if TCG_TARGET_REG_BITS == 64
+extern TCGv_i32 TCGV_LOW_link_error(TCGv_i64);
+extern TCGv_i32 TCGV_HIGH_link_error(TCGv_i64);
+#define TCGV_LOW TCGV_LOW_link_error
+#define TCGV_HIGH TCGV_HIGH_link_error
+#endif
+
+/*
+ * Emit a vector op with two arguments.
+ *
+ * TCGOP_VECL of the new op is set to TYPE - TCG_TYPE_V64 and TCGOP_VECE
+ * to VECE; R and A are stored unchanged as args[0] and args[1].
+ */
+void vec_gen_2(TCGOpcode opc, TCGType type, unsigned vece, TCGArg r, TCGArg a)
+{
+ TCGOp *op = tcg_emit_op(opc);
+ TCGOP_VECL(op) = type - TCG_TYPE_V64;
+ TCGOP_VECE(op) = vece;
+ op->args[0] = r;
+ op->args[1] = a;
+}
+
+/*
+ * Emit a vector op with three arguments.
+ *
+ * TCGOP_VECL of the new op is set to TYPE - TCG_TYPE_V64 and TCGOP_VECE
+ * to VECE; R, A and B are stored unchanged as args[0..2].
+ */
+void vec_gen_3(TCGOpcode opc, TCGType type, unsigned vece,
+ TCGArg r, TCGArg a, TCGArg b)
+{
+ TCGOp *op = tcg_emit_op(opc);
+ TCGOP_VECL(op) = type - TCG_TYPE_V64;
+ TCGOP_VECE(op) = vece;
+ op->args[0] = r;
+ op->args[1] = a;
+ op->args[2] = b;
+}
+
+/*
+ * Emit a vector op with four arguments.
+ *
+ * TCGOP_VECL of the new op is set to TYPE - TCG_TYPE_V64 and TCGOP_VECE
+ * to VECE; R, A, B and C are stored unchanged as args[0..3].
+ */
+void vec_gen_4(TCGOpcode opc, TCGType type, unsigned vece,
+ TCGArg r, TCGArg a, TCGArg b, TCGArg c)
+{
+ TCGOp *op = tcg_emit_op(opc);
+ TCGOP_VECL(op) = type - TCG_TYPE_V64;
+ TCGOP_VECE(op) = vece;
+ op->args[0] = r;
+ op->args[1] = a;
+ op->args[2] = b;
+ op->args[3] = c;
+}
+
+/*
+ * Emit a two-operand vector op on TCGv_vec values.
+ *
+ * The op is typed by the base type of the output R.  The input A is
+ * checked (tcg_debug_assert) to have a base type that is not smaller
+ * in TCGType order.
+ */
+static void vec_gen_op2(TCGOpcode opc, unsigned vece, TCGv_vec r, TCGv_vec a)
+{
+ TCGTemp *rt = tcgv_vec_temp(r);
+ TCGTemp *at = tcgv_vec_temp(a);
+ TCGType type = rt->base_type;
+
+ /* Must have enough inputs for the output. */
+ tcg_debug_assert(at->base_type >= type);
+ vec_gen_2(opc, type, vece, temp_arg(rt), temp_arg(at));
+}
+
+/*
+ * Emit a three-operand vector op on TCGv_vec values.
+ *
+ * The op is typed by the base type of the output R.  Both inputs are
+ * checked (tcg_debug_assert) to have a base type that is not smaller
+ * in TCGType order.
+ */
+static void vec_gen_op3(TCGOpcode opc, unsigned vece,
+ TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ TCGTemp *rt = tcgv_vec_temp(r);
+ TCGTemp *at = tcgv_vec_temp(a);
+ TCGTemp *bt = tcgv_vec_temp(b);
+ TCGType type = rt->base_type;
+
+ /* Must have enough inputs for the output. */
+ tcg_debug_assert(at->base_type >= type);
+ tcg_debug_assert(bt->base_type >= type);
+ vec_gen_3(opc, type, vece, temp_arg(rt), temp_arg(at), temp_arg(bt));
+}
+
+/*
+ * Emit mov_vec from A to R.  Nothing is emitted when R and A are the
+ * same value.  The op is emitted with an element size of 0.
+ */
+void tcg_gen_mov_vec(TCGv_vec r, TCGv_vec a)
+{
+ if (r != a) {
+ vec_gen_op2(INDEX_op_mov_vec, 0, r, a);
+ }
+}
+
+#define MO_REG (TCG_TARGET_REG_BITS == 64 ? MO_64 : MO_32)
+
+/*
+ * Emit dupi_vec into R, typed by R's base type, with element size VECE.
+ * The immediate A is stored directly as the op's second argument, so it
+ * is limited to the width of TCGArg.
+ */
+static void do_dupi_vec(TCGv_vec r, unsigned vece, TCGArg a)
+{
+ TCGTemp *rt = tcgv_vec_temp(r);
+ vec_gen_2(INDEX_op_dupi_vec, rt->base_type, vece, temp_arg(rt), a);
+}
+
+/*
+ * Allocate a new vector temporary of TYPE and emit a dupi_vec of 0 into
+ * it at host-register element size (MO_REG).  Returns the new temporary;
+ * callers in this file release it with tcg_temp_free_vec.
+ */
+TCGv_vec tcg_const_zeros_vec(TCGType type)
+{
+ TCGv_vec ret = tcg_temp_new_vec(type);
+ do_dupi_vec(ret, MO_REG, 0);
+ return ret;
+}
+
+/*
+ * Allocate a new vector temporary of TYPE and emit a dupi_vec of -1
+ * (all bits set) into it at host-register element size (MO_REG).
+ * Returns the new temporary; callers in this file release it with
+ * tcg_temp_free_vec.
+ */
+TCGv_vec tcg_const_ones_vec(TCGType type)
+{
+ TCGv_vec ret = tcg_temp_new_vec(type);
+ do_dupi_vec(ret, MO_REG, -1);
+ return ret;
+}
+
+/* As tcg_const_zeros_vec, using the base type of the existing value M. */
+TCGv_vec tcg_const_zeros_vec_matching(TCGv_vec m)
+{
+ TCGTemp *t = tcgv_vec_temp(m);
+ return tcg_const_zeros_vec(t->base_type);
+}
+
+/* As tcg_const_ones_vec, using the base type of the existing value M. */
+TCGv_vec tcg_const_ones_vec_matching(TCGv_vec m)
+{
+ TCGTemp *t = tcgv_vec_temp(m);
+ return tcg_const_ones_vec(t->base_type);
+}
+
+/*
+ * Set R from the 64-bit constant A, choosing among three forms:
+ *
+ *  - 32-bit host and A is unchanged by deposit64(a, 32, 32, a)
+ *    (presumably: both 32-bit halves are equal): dupi_vec with MO_32;
+ *  - 64-bit host, or A is unchanged by truncation to int32_t and
+ *    conversion back to uint64_t: dupi_vec with MO_64;
+ *  - otherwise: materialize A in a temporary i64 constant, broadcast it
+ *    with tcg_gen_dup_i64_vec, and free the temporary.
+ */
+void tcg_gen_dup64i_vec(TCGv_vec r, uint64_t a)
+{
+ if (TCG_TARGET_REG_BITS == 32 && a == deposit64(a, 32, 32, a)) {
+ do_dupi_vec(r, MO_32, a);
+ } else if (TCG_TARGET_REG_BITS == 64 || a == (uint64_t)(int32_t)a) {
+ do_dupi_vec(r, MO_64, a);
+ } else {
+ TCGv_i64 c = tcg_const_i64(a);
+ tcg_gen_dup_i64_vec(MO_64, r, c);
+ tcg_temp_free_i64(c);
+ }
+}
+
+/*
+ * Set R from the constant A: the value is expanded with
+ * dup_const(MO_32, a) and emitted as a dupi_vec at MO_REG element size.
+ */
+void tcg_gen_dup32i_vec(TCGv_vec r, uint32_t a)
+{
+ do_dupi_vec(r, MO_REG, dup_const(MO_32, a));
+}
+
+/*
+ * Set R from the constant A: the value is expanded with
+ * dup_const(MO_16, a) and emitted as a dupi_vec at MO_REG element size.
+ */
+void tcg_gen_dup16i_vec(TCGv_vec r, uint32_t a)
+{
+ do_dupi_vec(r, MO_REG, dup_const(MO_16, a));
+}
+
+/*
+ * Set R from the constant A: the value is expanded with
+ * dup_const(MO_8, a) and emitted as a dupi_vec at MO_REG element size.
+ */
+void tcg_gen_dup8i_vec(TCGv_vec r, uint32_t a)
+{
+ do_dupi_vec(r, MO_REG, dup_const(MO_8, a));
+}
+
+/*
+ * Set R from the constant A replicated at element size VECE.
+ *
+ * MO_64 is routed through tcg_gen_dup64i_vec.  do_dupi_vec takes its
+ * immediate as a TCGArg (tcg_target_ulong), so passing a full 64-bit
+ * constant straight through would drop the high half on a 32-bit host.
+ * tcg_gen_dup64i_vec already handles that case, and on a 64-bit host it
+ * emits the same dupi_vec as before (MO_REG == MO_64 there).
+ *
+ * Smaller element sizes are expanded with dup_const and emitted as a
+ * dupi_vec at MO_REG element size.
+ */
+void tcg_gen_dupi_vec(unsigned vece, TCGv_vec r, uint64_t a)
+{
+    if (vece == MO_64) {
+        tcg_gen_dup64i_vec(r, a);
+    } else {
+        do_dupi_vec(r, MO_REG, dup_const(vece, a));
+    }
+}
+
+/*
+ * Set R from the i64 value A at element size VECE.  The op is typed by
+ * R's base type.
+ *
+ *  - 64-bit host: dup_vec taking A directly;
+ *  - 32-bit host, VECE == MO_64: dup2_vec taking the low and high
+ *    halves of A;
+ *  - 32-bit host, smaller VECE: dup_vec taking only the low half of A.
+ *
+ * On 64-bit hosts TCGV_LOW/TCGV_HIGH are mapped to link-error symbols
+ * at the top of this file, so the last two branches rely on the
+ * compiler eliminating them as dead code.
+ */
+void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec r, TCGv_i64 a)
+{
+ TCGArg ri = tcgv_vec_arg(r);
+ TCGTemp *rt = arg_temp(ri);
+ TCGType type = rt->base_type;
+
+ if (TCG_TARGET_REG_BITS == 64) {
+ TCGArg ai = tcgv_i64_arg(a);
+ vec_gen_2(INDEX_op_dup_vec, type, vece, ri, ai);
+ } else if (vece == MO_64) {
+ TCGArg al = tcgv_i32_arg(TCGV_LOW(a));
+ TCGArg ah = tcgv_i32_arg(TCGV_HIGH(a));
+ vec_gen_3(INDEX_op_dup2_vec, type, MO_64, ri, al, ah);
+ } else {
+ TCGArg ai = tcgv_i32_arg(TCGV_LOW(a));
+ vec_gen_2(INDEX_op_dup_vec, type, vece, ri, ai);
+ }
+}
+
+/*
+ * Emit dup_vec into R from the i32 value A at element size VECE.  The
+ * op is typed by R's base type.  VECE is not range-checked here.
+ */
+void tcg_gen_dup_i32_vec(unsigned vece, TCGv_vec r, TCGv_i32 a)
+{
+ TCGArg ri = tcgv_vec_arg(r);
+ TCGArg ai = tcgv_i32_arg(a);
+ TCGTemp *rt = arg_temp(ri);
+ TCGType type = rt->base_type;
+
+ vec_gen_2(INDEX_op_dup_vec, type, vece, ri, ai);
+}
+
+/*
+ * Common emitter for vector load/store ops: emits OPC with the vector R,
+ * the pointer B and the constant O as its three arguments.  The op is
+ * typed by R's base type and emitted with an element size of 0.
+ */
+static void vec_gen_ldst(TCGOpcode opc, TCGv_vec r, TCGv_ptr b, TCGArg o)
+{
+ TCGArg ri = tcgv_vec_arg(r);
+ TCGArg bi = tcgv_ptr_arg(b);
+ TCGTemp *rt = arg_temp(ri);
+ TCGType type = rt->base_type;
+
+ vec_gen_3(opc, type, 0, ri, bi, o);
+}
+
+/* Emit ld_vec for R with base pointer B and offset O. */
+void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr b, TCGArg o)
+{
+ vec_gen_ldst(INDEX_op_ld_vec, r, b, o);
+}
+
+/* Emit st_vec for R with base pointer B and offset O. */
+void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr b, TCGArg o)
+{
+ vec_gen_ldst(INDEX_op_st_vec, r, b, o);
+}
+
+/*
+ * Emit st_vec for R with base pointer B and offset O, typing the op as
+ * LOW_TYPE instead of R's own base type.  LOW_TYPE is checked
+ * (tcg_debug_assert) to be a vector type no larger than R's base type.
+ * NOTE(review): presumably this stores only the low LOW_TYPE-sized part
+ * of R -- confirm against the backend implementation of st_vec.
+ */
+void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr b, TCGArg o, TCGType low_type)
+{
+ TCGArg ri = tcgv_vec_arg(r);
+ TCGArg bi = tcgv_ptr_arg(b);
+ TCGTemp *rt = arg_temp(ri);
+ TCGType type = rt->base_type;
+
+ tcg_debug_assert(low_type >= TCG_TYPE_V64);
+ tcg_debug_assert(low_type <= type);
+ vec_gen_3(INDEX_op_st_vec, low_type, 0, ri, bi, o);
+}
+
+/* Emit add_vec of A and B into R at element size VECE. */
+void tcg_gen_add_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ vec_gen_op3(INDEX_op_add_vec, vece, r, a, b);
+}
+
+/* Emit sub_vec of A and B into R at element size VECE. */
+void tcg_gen_sub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ vec_gen_op3(INDEX_op_sub_vec, vece, r, a, b);
+}
+
+/*
+ * Emit and_vec of A and B into R.  VECE is accepted for a uniform
+ * signature but ignored: the op is always emitted with element size 0.
+ */
+void tcg_gen_and_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ vec_gen_op3(INDEX_op_and_vec, 0, r, a, b);
+}
+
+/*
+ * Emit or_vec of A and B into R.  VECE is accepted for a uniform
+ * signature but ignored: the op is always emitted with element size 0.
+ */
+void tcg_gen_or_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ vec_gen_op3(INDEX_op_or_vec, 0, r, a, b);
+}
+
+/*
+ * Emit xor_vec of A and B into R.  VECE is accepted for a uniform
+ * signature but ignored: the op is always emitted with element size 0.
+ */
+void tcg_gen_xor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ vec_gen_op3(INDEX_op_xor_vec, 0, r, a, b);
+}
+
+/*
+ * Compute R = A and-complement B.  VECE is ignored.
+ *
+ * When the host provides andc_vec it is emitted directly.  Otherwise
+ * the op is expanded as not_vec of B into a temporary matching R,
+ * followed by and_vec of A with that temporary, which is then freed.
+ */
+void tcg_gen_andc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ if (TCG_TARGET_HAS_andc_vec) {
+ vec_gen_op3(INDEX_op_andc_vec, 0, r, a, b);
+ } else {
+ TCGv_vec t = tcg_temp_new_vec_matching(r);
+ tcg_gen_not_vec(0, t, b);
+ tcg_gen_and_vec(0, r, a, t);
+ tcg_temp_free_vec(t);
+ }
+}
+
+/*
+ * Compute R = A or-complement B.  VECE is ignored.
+ *
+ * When the host provides orc_vec it is emitted directly.  Otherwise
+ * the op is expanded as not_vec of B into a temporary matching R,
+ * followed by or_vec of A with that temporary, which is then freed.
+ */
+void tcg_gen_orc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ if (TCG_TARGET_HAS_orc_vec) {
+ vec_gen_op3(INDEX_op_orc_vec, 0, r, a, b);
+ } else {
+ TCGv_vec t = tcg_temp_new_vec_matching(r);
+ tcg_gen_not_vec(0, t, b);
+ tcg_gen_or_vec(0, r, a, t);
+ tcg_temp_free_vec(t);
+ }
+}
+
+/*
+ * Compute R = bitwise complement of A.  VECE is ignored.
+ *
+ * When the host provides not_vec it is emitted directly.  Otherwise
+ * the op is expanded as xor_vec of A with an all-ones temporary
+ * matching R, which is then freed.
+ */
+void tcg_gen_not_vec(unsigned vece, TCGv_vec r, TCGv_vec a)
+{
+ if (TCG_TARGET_HAS_not_vec) {
+ vec_gen_op2(INDEX_op_not_vec, 0, r, a);
+ } else {
+ TCGv_vec t = tcg_const_ones_vec_matching(r);
+ tcg_gen_xor_vec(0, r, a, t);
+ tcg_temp_free_vec(t);
+ }
+}
+
+/*
+ * Compute R = negation of A at element size VECE.
+ *
+ * When the host provides neg_vec it is emitted directly.  Otherwise
+ * the op is expanded as sub_vec of A from an all-zeros temporary
+ * matching R, which is then freed.
+ */
+void tcg_gen_neg_vec(unsigned vece, TCGv_vec r, TCGv_vec a)
+{
+ if (TCG_TARGET_HAS_neg_vec) {
+ vec_gen_op2(INDEX_op_neg_vec, vece, r, a);
+ } else {
+ TCGv_vec t = tcg_const_zeros_vec_matching(r);
+ tcg_gen_sub_vec(vece, r, t, a);
+ tcg_temp_free_vec(t);
+ }
+}
+
+/*
+ * Common emitter for the immediate vector shifts (OPC is one of the
+ * shli/shri/sari vector opcodes passed by the wrappers below).
+ *
+ * R and A must have the same base type and the count I must satisfy
+ * 0 <= I < (8 << VECE); both are checked with tcg_debug_assert.
+ * A count of zero degenerates to a vector move.
+ *
+ * Otherwise tcg_can_emit_vec_op decides the form: a positive result
+ * emits OPC directly with I as the third argument; any other result
+ * must be negative (asserted) and hands the expansion to
+ * tcg_expand_vec_op.
+ */
+static void do_shifti(TCGOpcode opc, unsigned vece,
+ TCGv_vec r, TCGv_vec a, int64_t i)
+{
+ TCGTemp *rt = tcgv_vec_temp(r);
+ TCGTemp *at = tcgv_vec_temp(a);
+ TCGArg ri = temp_arg(rt);
+ TCGArg ai = temp_arg(at);
+ TCGType type = rt->base_type;
+ int can;
+
+ tcg_debug_assert(at->base_type == type);
+ tcg_debug_assert(i >= 0 && i < (8 << vece));
+
+ if (i == 0) {
+ tcg_gen_mov_vec(r, a);
+ return;
+ }
+
+ can = tcg_can_emit_vec_op(opc, type, vece);
+ if (can > 0) {
+ vec_gen_3(opc, type, vece, ri, ai, i);
+ } else {
+ /* We leave the choice of expansion via scalar or vector shift
+ to the target. Often, but not always, dupi can feed a vector
+ shift easier than a scalar. */
+ tcg_debug_assert(can < 0);
+ tcg_expand_vec_op(opc, type, vece, ri, ai, i);
+ }
+}
+
+/* Emit shli_vec of A by the immediate I into R; see do_shifti. */
+void tcg_gen_shli_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i)
+{
+ do_shifti(INDEX_op_shli_vec, vece, r, a, i);
+}
+
+/* Emit shri_vec of A by the immediate I into R; see do_shifti. */
+void tcg_gen_shri_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i)
+{
+ do_shifti(INDEX_op_shri_vec, vece, r, a, i);
+}
+
+/* Emit sari_vec of A by the immediate I into R; see do_shifti. */
+void tcg_gen_sari_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i)
+{
+ do_shifti(INDEX_op_sari_vec, vece, r, a, i);
+}
+
+/*
+ * Emit a vector comparison of A and B under condition COND into R, at
+ * element size VECE.
+ *
+ * R, A and B must share one base type (tcg_debug_assert).  When
+ * tcg_can_emit_vec_op returns a positive value, cmp_vec is emitted
+ * directly with COND as its fourth argument; any other result must be
+ * negative (asserted) and hands the expansion to tcg_expand_vec_op.
+ */
+void tcg_gen_cmp_vec(TCGCond cond, unsigned vece,
+ TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ TCGTemp *rt = tcgv_vec_temp(r);
+ TCGTemp *at = tcgv_vec_temp(a);
+ TCGTemp *bt = tcgv_vec_temp(b);
+ TCGArg ri = temp_arg(rt);
+ TCGArg ai = temp_arg(at);
+ TCGArg bi = temp_arg(bt);
+ TCGType type = rt->base_type;
+ int can;
+
+ tcg_debug_assert(at->base_type == type);
+ tcg_debug_assert(bt->base_type == type);
+ can = tcg_can_emit_vec_op(INDEX_op_cmp_vec, type, vece);
+ if (can > 0) {
+ vec_gen_4(INDEX_op_cmp_vec, type, vece, ri, ai, bi, cond);
+ } else {
+ tcg_debug_assert(can < 0);
+ tcg_expand_vec_op(INDEX_op_cmp_vec, type, vece, ri, ai, bi, cond);
+ }
+}
+
+/*
+ * Emit a vector multiply of A and B into R at element size VECE.
+ *
+ * R, A and B must share one base type (tcg_debug_assert).  When
+ * tcg_can_emit_vec_op returns a positive value, mul_vec is emitted
+ * directly; any other result must be negative (asserted) and hands
+ * the expansion to tcg_expand_vec_op.
+ */
+void tcg_gen_mul_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
+{
+ TCGTemp *rt = tcgv_vec_temp(r);
+ TCGTemp *at = tcgv_vec_temp(a);
+ TCGTemp *bt = tcgv_vec_temp(b);
+ TCGArg ri = temp_arg(rt);
+ TCGArg ai = temp_arg(at);
+ TCGArg bi = temp_arg(bt);
+ TCGType type = rt->base_type;
+ int can;
+
+ tcg_debug_assert(at->base_type == type);
+ tcg_debug_assert(bt->base_type == type);
+ can = tcg_can_emit_vec_op(INDEX_op_mul_vec, type, vece);
+ if (can > 0) {
+ vec_gen_3(INDEX_op_mul_vec, type, vece, ri, ai, bi);
+ } else {
+ tcg_debug_assert(can < 0);
+ tcg_expand_vec_op(INDEX_op_mul_vec, type, vece, ri, ai, bi);
+ }
+}
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 0c509bfe46..3467787323 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -140,7 +140,7 @@ void tcg_gen_subi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
}
}
-void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2)
+void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
{
TCGv_i32 t0;
/* Some cases can be optimized here. */
@@ -148,17 +148,17 @@ void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2)
case 0:
tcg_gen_movi_i32(ret, 0);
return;
- case 0xffffffffu:
+ case -1:
tcg_gen_mov_i32(ret, arg1);
return;
- case 0xffu:
+ case 0xff:
/* Don't recurse with tcg_gen_ext8u_i32. */
if (TCG_TARGET_HAS_ext8u_i32) {
tcg_gen_op2_i32(INDEX_op_ext8u_i32, ret, arg1);
return;
}
break;
- case 0xffffu:
+ case 0xffff:
if (TCG_TARGET_HAS_ext16u_i32) {
tcg_gen_op2_i32(INDEX_op_ext16u_i32, ret, arg1);
return;
@@ -199,9 +199,9 @@ void tcg_gen_xori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
}
}
-void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2)
+void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
{
- tcg_debug_assert(arg2 < 32);
+ tcg_debug_assert(arg2 >= 0 && arg2 < 32);
if (arg2 == 0) {
tcg_gen_mov_i32(ret, arg1);
} else {
@@ -211,9 +211,9 @@ void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2)
}
}
-void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2)
+void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
{
- tcg_debug_assert(arg2 < 32);
+ tcg_debug_assert(arg2 >= 0 && arg2 < 32);
if (arg2 == 0) {
tcg_gen_mov_i32(ret, arg1);
} else {
@@ -223,9 +223,9 @@ void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2)
}
}
-void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2)
+void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
{
- tcg_debug_assert(arg2 < 32);
+ tcg_debug_assert(arg2 >= 0 && arg2 < 32);
if (arg2 == 0) {
tcg_gen_mov_i32(ret, arg1);
} else {
@@ -1201,7 +1201,7 @@ void tcg_gen_subi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
}
}
-void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2)
+void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
{
TCGv_i64 t0;
@@ -1216,23 +1216,23 @@ void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2)
case 0:
tcg_gen_movi_i64(ret, 0);
return;
- case 0xffffffffffffffffull:
+ case -1:
tcg_gen_mov_i64(ret, arg1);
return;
- case 0xffull:
+ case 0xff:
/* Don't recurse with tcg_gen_ext8u_i64. */
if (TCG_TARGET_HAS_ext8u_i64) {
tcg_gen_op2_i64(INDEX_op_ext8u_i64, ret, arg1);
return;
}
break;
- case 0xffffu:
+ case 0xffff:
if (TCG_TARGET_HAS_ext16u_i64) {
tcg_gen_op2_i64(INDEX_op_ext16u_i64, ret, arg1);
return;
}
break;
- case 0xffffffffull:
+ case 0xffffffffu:
if (TCG_TARGET_HAS_ext32u_i64) {
tcg_gen_op2_i64(INDEX_op_ext32u_i64, ret, arg1);
return;
@@ -1332,9 +1332,9 @@ static inline void tcg_gen_shifti_i64(TCGv_i64 ret, TCGv_i64 arg1,
}
}
-void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2)
+void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
{
- tcg_debug_assert(arg2 < 64);
+ tcg_debug_assert(arg2 >= 0 && arg2 < 64);
if (TCG_TARGET_REG_BITS == 32) {
tcg_gen_shifti_i64(ret, arg1, arg2, 0, 0);
} else if (arg2 == 0) {
@@ -1346,9 +1346,9 @@ void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2)
}
}
-void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2)
+void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
{
- tcg_debug_assert(arg2 < 64);
+ tcg_debug_assert(arg2 >= 0 && arg2 < 64);
if (TCG_TARGET_REG_BITS == 32) {
tcg_gen_shifti_i64(ret, arg1, arg2, 1, 0);
} else if (arg2 == 0) {
@@ -1360,9 +1360,9 @@ void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2)
}
}
-void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2)
+void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
{
- tcg_debug_assert(arg2 < 64);
+ tcg_debug_assert(arg2 >= 0 && arg2 < 64);
if (TCG_TARGET_REG_BITS == 32) {
tcg_gen_shifti_i64(ret, arg1, arg2, 1, 1);
} else if (arg2 == 0) {
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index ca07b32b65..75bb55aeac 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -35,6 +35,10 @@ void tcg_gen_op4(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg);
void tcg_gen_op5(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg);
void tcg_gen_op6(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg);
+void vec_gen_2(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg);
+void vec_gen_3(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg, TCGArg);
+void vec_gen_4(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg, TCGArg, TCGArg);
+
static inline void tcg_gen_op1_i32(TCGOpcode opc, TCGv_i32 a1)
{
tcg_gen_op1(opc, tcgv_i32_arg(a1));
@@ -265,12 +269,12 @@ void tcg_gen_mb(TCGBar);
void tcg_gen_addi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
void tcg_gen_subfi_i32(TCGv_i32 ret, int32_t arg1, TCGv_i32 arg2);
void tcg_gen_subi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2);
+void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
void tcg_gen_ori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
void tcg_gen_xori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
-void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2);
-void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2);
-void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2);
+void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
+void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
void tcg_gen_muli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
void tcg_gen_div_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
void tcg_gen_rem_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
@@ -454,12 +458,12 @@ static inline void tcg_gen_not_i32(TCGv_i32 ret, TCGv_i32 arg)
void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
void tcg_gen_subfi_i64(TCGv_i64 ret, int64_t arg1, TCGv_i64 arg2);
void tcg_gen_subi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2);
+void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
void tcg_gen_xori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
-void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2);
-void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2);
-void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2);
+void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
+void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
void tcg_gen_muli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
void tcg_gen_div_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
void tcg_gen_rem_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
@@ -903,6 +907,36 @@ void tcg_gen_atomic_or_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
void tcg_gen_atomic_xor_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp);
void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
+void tcg_gen_mov_vec(TCGv_vec, TCGv_vec);
+void tcg_gen_dup_i32_vec(unsigned vece, TCGv_vec, TCGv_i32);
+void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec, TCGv_i64);
+void tcg_gen_dup8i_vec(TCGv_vec, uint32_t);
+void tcg_gen_dup16i_vec(TCGv_vec, uint32_t);
+void tcg_gen_dup32i_vec(TCGv_vec, uint32_t);
+void tcg_gen_dup64i_vec(TCGv_vec, uint64_t);
+void tcg_gen_dupi_vec(unsigned vece, TCGv_vec, uint64_t);
+void tcg_gen_add_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_sub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_mul_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_and_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_or_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_xor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_andc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_orc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b);
+void tcg_gen_not_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
+void tcg_gen_neg_vec(unsigned vece, TCGv_vec r, TCGv_vec a);
+
+void tcg_gen_shli_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
+void tcg_gen_shri_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
+void tcg_gen_sari_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i);
+
+void tcg_gen_cmp_vec(TCGCond cond, unsigned vece, TCGv_vec r,
+ TCGv_vec a, TCGv_vec b);
+
+void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
+void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset);
+void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t);
+
#if TARGET_LONG_BITS == 64
#define tcg_gen_movi_tl tcg_gen_movi_i64
#define tcg_gen_mov_tl tcg_gen_mov_i64
@@ -1001,6 +1035,7 @@ void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
#define tcg_gen_atomic_and_fetch_tl tcg_gen_atomic_and_fetch_i64
#define tcg_gen_atomic_or_fetch_tl tcg_gen_atomic_or_fetch_i64
#define tcg_gen_atomic_xor_fetch_tl tcg_gen_atomic_xor_fetch_i64
+#define tcg_gen_dup_tl_vec tcg_gen_dup_i64_vec
#else
#define tcg_gen_movi_tl tcg_gen_movi_i32
#define tcg_gen_mov_tl tcg_gen_mov_i32
@@ -1098,6 +1133,7 @@ void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp);
#define tcg_gen_atomic_and_fetch_tl tcg_gen_atomic_and_fetch_i32
#define tcg_gen_atomic_or_fetch_tl tcg_gen_atomic_or_fetch_i32
#define tcg_gen_atomic_xor_fetch_tl tcg_gen_atomic_xor_fetch_i32
+#define tcg_gen_dup_tl_vec tcg_gen_dup_i32_vec
#endif
#if UINTPTR_MAX == UINT32_MAX
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index 956fb1e9f3..d81a6c4535 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -204,8 +204,54 @@ DEF(qemu_ld_i64, DATA64_ARGS, TLADDR_ARGS, 1,
DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1,
TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT)
+/* Host vector support. */
+
+#define IMPLVEC TCG_OPF_VECTOR | IMPL(TCG_TARGET_MAYBE_vec)
+
+DEF(mov_vec, 1, 1, 0, TCG_OPF_VECTOR | TCG_OPF_NOT_PRESENT)
+DEF(dupi_vec, 1, 0, 1, TCG_OPF_VECTOR | TCG_OPF_NOT_PRESENT)
+
+DEF(dup_vec, 1, 1, 0, IMPLVEC)
+DEF(dup2_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_REG_BITS == 32))
+
+DEF(ld_vec, 1, 1, 1, IMPLVEC)
+DEF(st_vec, 0, 2, 1, IMPLVEC)
+
+DEF(add_vec, 1, 2, 0, IMPLVEC)
+DEF(sub_vec, 1, 2, 0, IMPLVEC)
+DEF(mul_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_mul_vec))
+DEF(neg_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec))
+
+DEF(and_vec, 1, 2, 0, IMPLVEC)
+DEF(or_vec, 1, 2, 0, IMPLVEC)
+DEF(xor_vec, 1, 2, 0, IMPLVEC)
+DEF(andc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_andc_vec))
+DEF(orc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_orc_vec))
+DEF(not_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_not_vec))
+
+DEF(shli_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_shi_vec))
+DEF(shri_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_shi_vec))
+DEF(sari_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_shi_vec))
+
+DEF(shls_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_shs_vec))
+DEF(shrs_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_shs_vec))
+DEF(sars_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_shs_vec))
+
+DEF(shlv_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_shv_vec))
+DEF(shrv_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_shv_vec))
+DEF(sarv_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_shv_vec))
+
+DEF(cmp_vec, 1, 2, 1, IMPLVEC)
+
+DEF(last_generic, 0, 0, 0, TCG_OPF_NOT_PRESENT)
+
+#if TCG_TARGET_MAYBE_vec
+#include "tcg-target.opc.h"
+#endif
+
#undef TLADDR_ARGS
#undef DATA64_ARGS
#undef IMPL
#undef IMPL64
+#undef IMPLVEC
#undef DEF
diff --git a/tcg/tcg-pool.inc.c b/tcg/tcg-pool.inc.c
index 8a85131405..7af5513ff3 100644
--- a/tcg/tcg-pool.inc.c
+++ b/tcg/tcg-pool.inc.c
@@ -22,39 +22,110 @@
typedef struct TCGLabelPoolData {
struct TCGLabelPoolData *next;
- tcg_target_ulong data;
tcg_insn_unit *label;
intptr_t addend;
- int type;
+ int rtype;
+ unsigned nlong;
+ tcg_target_ulong data[];
} TCGLabelPoolData;
-static void new_pool_label(TCGContext *s, tcg_target_ulong data, int type,
- tcg_insn_unit *label, intptr_t addend)
+static TCGLabelPoolData *new_pool_alloc(TCGContext *s, int nlong, int rtype,
+ tcg_insn_unit *label, intptr_t addend)
{
- TCGLabelPoolData *n = tcg_malloc(sizeof(*n));
- TCGLabelPoolData *i, **pp;
+ TCGLabelPoolData *n = tcg_malloc(sizeof(TCGLabelPoolData)
+ + sizeof(tcg_target_ulong) * nlong);
- n->data = data;
n->label = label;
- n->type = type;
n->addend = addend;
+ n->rtype = rtype;
+ n->nlong = nlong;
+ return n;
+}
+
+static void new_pool_insert(TCGContext *s, TCGLabelPoolData *n)
+{
+ TCGLabelPoolData *i, **pp;
+ int nlong = n->nlong;
/* Insertion sort on the pool. */
- for (pp = &s->pool_labels; (i = *pp) && i->data < data; pp = &i->next) {
- continue;
+ for (pp = &s->pool_labels; (i = *pp) != NULL; pp = &i->next) {
+ if (nlong > i->nlong) {
+ break;
+ }
+ if (nlong < i->nlong) {
+ continue;
+ }
+ if (memcmp(n->data, i->data, sizeof(tcg_target_ulong) * nlong) >= 0) {
+ break;
+ }
}
n->next = *pp;
*pp = n;
}
+/*
+ * The "usual" for generic integer code: add the single-word constant D
+ * to the pool.  A one-entry record is allocated with new_pool_alloc,
+ * filled in, and linked into the pool with new_pool_insert.
+ */
+static inline void new_pool_label(TCGContext *s, tcg_target_ulong d, int rtype,
+ tcg_insn_unit *label, intptr_t addend)
+{
+ TCGLabelPoolData *n = new_pool_alloc(s, 1, rtype, label, addend);
+ n->data[0] = d;
+ new_pool_insert(s, n);
+}
+
+/*
+ * Add a two-word constant (D0, D1) to the pool.
+ * For v64 or v128, depending on the host.
+ */
+static inline void new_pool_l2(TCGContext *s, int rtype, tcg_insn_unit *label,
+ intptr_t addend, tcg_target_ulong d0,
+ tcg_target_ulong d1)
+{
+ TCGLabelPoolData *n = new_pool_alloc(s, 2, rtype, label, addend);
+ n->data[0] = d0;
+ n->data[1] = d1;
+ new_pool_insert(s, n);
+}
+
+/*
+ * Add a four-word constant (D0..D3) to the pool.
+ * For v128 or v256, depending on the host.
+ */
+static inline void new_pool_l4(TCGContext *s, int rtype, tcg_insn_unit *label,
+ intptr_t addend, tcg_target_ulong d0,
+ tcg_target_ulong d1, tcg_target_ulong d2,
+ tcg_target_ulong d3)
+{
+ TCGLabelPoolData *n = new_pool_alloc(s, 4, rtype, label, addend);
+ n->data[0] = d0;
+ n->data[1] = d1;
+ n->data[2] = d2;
+ n->data[3] = d3;
+ new_pool_insert(s, n);
+}
+
+/*
+ * Add an eight-word constant (D0..D7) to the pool.
+ * For v256, for 32-bit host.
+ */
+static inline void new_pool_l8(TCGContext *s, int rtype, tcg_insn_unit *label,
+ intptr_t addend, tcg_target_ulong d0,
+ tcg_target_ulong d1, tcg_target_ulong d2,
+ tcg_target_ulong d3, tcg_target_ulong d4,
+ tcg_target_ulong d5, tcg_target_ulong d6,
+ tcg_target_ulong d7)
+{
+ TCGLabelPoolData *n = new_pool_alloc(s, 8, rtype, label, addend);
+ n->data[0] = d0;
+ n->data[1] = d1;
+ n->data[2] = d2;
+ n->data[3] = d3;
+ n->data[4] = d4;
+ n->data[5] = d5;
+ n->data[6] = d6;
+ n->data[7] = d7;
+ new_pool_insert(s, n);
+}
+
/* To be provided by cpu/tcg-target.inc.c. */
static void tcg_out_nop_fill(tcg_insn_unit *p, int count);
static bool tcg_out_pool_finalize(TCGContext *s)
{
TCGLabelPoolData *p = s->pool_labels;
- tcg_target_ulong d, *a;
+ TCGLabelPoolData *l = NULL;
+ void *a;
if (p == NULL) {
return true;
@@ -62,24 +133,24 @@ static bool tcg_out_pool_finalize(TCGContext *s)
/* ??? Round up to qemu_icache_linesize, but then do not round
again when allocating the next TranslationBlock structure. */
- a = (void *)ROUND_UP((uintptr_t)s->code_ptr, sizeof(tcg_target_ulong));
+ a = (void *)ROUND_UP((uintptr_t)s->code_ptr,
+ sizeof(tcg_target_ulong) * p->nlong);
tcg_out_nop_fill(s->code_ptr, (tcg_insn_unit *)a - s->code_ptr);
s->data_gen_ptr = a;
- /* Ensure the first comparison fails. */
- d = p->data + 1;
-
for (; p != NULL; p = p->next) {
- if (p->data != d) {
- d = p->data;
- if (unlikely((void *)a > s->code_gen_highwater)) {
+ size_t size = sizeof(tcg_target_ulong) * p->nlong;
+ if (!l || l->nlong != p->nlong || memcmp(l->data, p->data, size)) {
+ if (unlikely(a > s->code_gen_highwater)) {
return false;
}
- *a++ = d;
+ memcpy(a, p->data, size);
+ a += size;
+ l = p;
}
- patch_reloc(p->label, p->type, (intptr_t)(a - 1), p->addend);
+ patch_reloc(p->label, p->rtype, (intptr_t)a - size, p->addend);
}
- s->code_ptr = (void *)a;
+ s->code_ptr = a;
return true;
}
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 93caa0be93..bb24526c93 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -106,6 +106,18 @@ static void tcg_out_movi(TCGContext *s, TCGType type,
TCGReg ret, tcg_target_long arg);
static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args,
const int *const_args);
+/*
+ * Emit host code for a vector opcode.  When the host may support
+ * vectors only the prototype is given here (the definition is not in
+ * this file section -- presumably supplied by the backend).  Otherwise
+ * a stub is provided that must never be reached.
+ */
+#if TCG_TARGET_MAYBE_vec
+static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, unsigned vecl,
+ unsigned vece, const TCGArg *args,
+ const int *const_args);
+#else
+static inline void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, unsigned vecl,
+ unsigned vece, const TCGArg *args,
+ const int *const_args)
+{
+ g_assert_not_reached();
+}
+#endif
static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, TCGReg arg1,
intptr_t arg2);
static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
@@ -146,8 +158,7 @@ struct tcg_region_state {
};
static struct tcg_region_state region;
-
-static TCGRegSet tcg_target_available_regs[2];
+static TCGRegSet tcg_target_available_regs[TCG_TYPE_COUNT];
static TCGRegSet tcg_target_call_clobber_regs;
#if TCG_TARGET_INSN_UNIT_SIZE == 1
@@ -1026,6 +1037,41 @@ TCGv_i64 tcg_temp_new_internal_i64(int temp_local)
return temp_tcgv_i64(t);
}
+/*
+ * Allocate a new vector temporary of the given TYPE.
+ *
+ * With CONFIG_DEBUG_TCG, TYPE is checked to be one of the three vector
+ * types and the matching TCG_TARGET_HAS_v* must be non-zero; any other
+ * type aborts.  Without it, TYPE is passed through unchecked.
+ */
+TCGv_vec tcg_temp_new_vec(TCGType type)
+{
+ TCGTemp *t;
+
+#ifdef CONFIG_DEBUG_TCG
+ switch (type) {
+ case TCG_TYPE_V64:
+ assert(TCG_TARGET_HAS_v64);
+ break;
+ case TCG_TYPE_V128:
+ assert(TCG_TARGET_HAS_v128);
+ break;
+ case TCG_TYPE_V256:
+ assert(TCG_TARGET_HAS_v256);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+#endif
+
+ t = tcg_temp_new_internal(type, 0);
+ return temp_tcgv_vec(t);
+}
+
+/*
+ * Create a new temp of the same type as an existing temp.
+ * MATCH must itself be a currently allocated temp (tcg_debug_assert);
+ * its base type is used for the new allocation.
+ */
+TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match)
+{
+ TCGTemp *t = tcgv_vec_temp(match);
+
+ tcg_debug_assert(t->temp_allocated != 0);
+
+ t = tcg_temp_new_internal(t->base_type, 0);
+ return temp_tcgv_vec(t);
+}
+
static void tcg_temp_free_internal(TCGTemp *ts)
{
TCGContext *s = tcg_ctx;
@@ -1057,6 +1103,11 @@ void tcg_temp_free_i64(TCGv_i64 arg)
tcg_temp_free_internal(tcgv_i64_temp(arg));
}
+/* Release the vector temporary ARG via tcg_temp_free_internal. */
+void tcg_temp_free_vec(TCGv_vec arg)
+{
+ tcg_temp_free_internal(tcgv_vec_temp(arg));
+}
+
TCGv_i32 tcg_const_i32(int32_t val)
{
TCGv_i32 t0;
@@ -1114,6 +1165,9 @@ int tcg_check_temp_count(void)
Test the runtime variable that controls each opcode. */
bool tcg_op_supported(TCGOpcode op)
{
+ const bool have_vec
+ = TCG_TARGET_HAS_v64 | TCG_TARGET_HAS_v128 | TCG_TARGET_HAS_v256;
+
switch (op) {
case INDEX_op_discard:
case INDEX_op_set_label:
@@ -1327,10 +1381,47 @@ bool tcg_op_supported(TCGOpcode op)
case INDEX_op_mulsh_i64:
return TCG_TARGET_HAS_mulsh_i64;
- case NB_OPS:
- break;
+ case INDEX_op_mov_vec:
+ case INDEX_op_dup_vec:
+ case INDEX_op_dupi_vec:
+ case INDEX_op_ld_vec:
+ case INDEX_op_st_vec:
+ case INDEX_op_add_vec:
+ case INDEX_op_sub_vec:
+ case INDEX_op_and_vec:
+ case INDEX_op_or_vec:
+ case INDEX_op_xor_vec:
+ case INDEX_op_cmp_vec:
+ return have_vec;
+ case INDEX_op_dup2_vec:
+ return have_vec && TCG_TARGET_REG_BITS == 32;
+ case INDEX_op_not_vec:
+ return have_vec && TCG_TARGET_HAS_not_vec;
+ case INDEX_op_neg_vec:
+ return have_vec && TCG_TARGET_HAS_neg_vec;
+ case INDEX_op_andc_vec:
+ return have_vec && TCG_TARGET_HAS_andc_vec;
+ case INDEX_op_orc_vec:
+ return have_vec && TCG_TARGET_HAS_orc_vec;
+ case INDEX_op_mul_vec:
+ return have_vec && TCG_TARGET_HAS_mul_vec;
+ case INDEX_op_shli_vec:
+ case INDEX_op_shri_vec:
+ case INDEX_op_sari_vec:
+ return have_vec && TCG_TARGET_HAS_shi_vec;
+ case INDEX_op_shls_vec:
+ case INDEX_op_shrs_vec:
+ case INDEX_op_sars_vec:
+ return have_vec && TCG_TARGET_HAS_shs_vec;
+ case INDEX_op_shlv_vec:
+ case INDEX_op_shrv_vec:
+ case INDEX_op_sarv_vec:
+ return have_vec && TCG_TARGET_HAS_shv_vec;
+
+ default:
+ tcg_debug_assert(op > INDEX_op_last_generic && op < NB_OPS);
+ return true;
}
- g_assert_not_reached();
}
/* Note: we convert the 64 bit args to 32 bit and do some alignment
@@ -1661,6 +1752,11 @@ void tcg_dump_ops(TCGContext *s)
nb_iargs = def->nb_iargs;
nb_cargs = def->nb_cargs;
+ if (def->flags & TCG_OPF_VECTOR) {
+ col += qemu_log("v%d,e%d,", 64 << TCGOP_VECL(op),
+ 8 << TCGOP_VECE(op));
+ }
+
k = 0;
for (i = 0; i < nb_oargs; i++) {
if (k != 0) {
@@ -1685,6 +1781,7 @@ void tcg_dump_ops(TCGContext *s)
case INDEX_op_brcond_i64:
case INDEX_op_setcond_i64:
case INDEX_op_movcond_i64:
+ case INDEX_op_cmp_vec:
if (op->args[k] < ARRAY_SIZE(cond_name)
&& cond_name[op->args[k]]) {
col += qemu_log(",%s", cond_name[op->args[k++]]);
@@ -2890,8 +2987,13 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
}
/* emit instruction */
- tcg_out_op(s, op->opc, new_args, const_args);
-
+ if (def->flags & TCG_OPF_VECTOR) {
+ tcg_out_vec_op(s, op->opc, TCGOP_VECL(op), TCGOP_VECE(op),
+ new_args, const_args);
+ } else {
+ tcg_out_op(s, op->opc, new_args, const_args);
+ }
+
/* move the outputs in the correct register if needed */
for(i = 0; i < nb_oargs; i++) {
ts = arg_temp(op->args[i]);
@@ -3239,10 +3341,12 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
switch (opc) {
case INDEX_op_mov_i32:
case INDEX_op_mov_i64:
+ case INDEX_op_mov_vec:
tcg_reg_alloc_mov(s, op);
break;
case INDEX_op_movi_i32:
case INDEX_op_movi_i64:
+ case INDEX_op_dupi_vec:
tcg_reg_alloc_movi(s, op);
break;
case INDEX_op_insn_start:
@@ -3645,3 +3749,10 @@ void tcg_register_jit(void *buf, size_t buf_size)
{
}
#endif /* ELF_HOST_MACHINE */
+
+#if !TCG_TARGET_MAYBE_vec
+void tcg_expand_vec_op(TCGOpcode o, TCGType t, unsigned e, TCGArg a0, ...)
+{
+ g_assert_not_reached();
+}
+#endif
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 2ce497cebf..9e2d909a4a 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -170,6 +170,31 @@ typedef uint64_t TCGRegSet;
# error "Missing unsigned widening multiply"
#endif
+#if !defined(TCG_TARGET_HAS_v64) \
+ && !defined(TCG_TARGET_HAS_v128) \
+ && !defined(TCG_TARGET_HAS_v256)
+#define TCG_TARGET_MAYBE_vec 0
+#define TCG_TARGET_HAS_neg_vec 0
+#define TCG_TARGET_HAS_not_vec 0
+#define TCG_TARGET_HAS_andc_vec 0
+#define TCG_TARGET_HAS_orc_vec 0
+#define TCG_TARGET_HAS_shi_vec 0
+#define TCG_TARGET_HAS_shs_vec 0
+#define TCG_TARGET_HAS_shv_vec 0
+#define TCG_TARGET_HAS_mul_vec 0
+#else
+#define TCG_TARGET_MAYBE_vec 1
+#endif
+#ifndef TCG_TARGET_HAS_v64
+#define TCG_TARGET_HAS_v64 0
+#endif
+#ifndef TCG_TARGET_HAS_v128
+#define TCG_TARGET_HAS_v128 0
+#endif
+#ifndef TCG_TARGET_HAS_v256
+#define TCG_TARGET_HAS_v256 0
+#endif
+
#ifndef TARGET_INSN_START_EXTRA_WORDS
# define TARGET_INSN_START_WORDS 1
#else
@@ -246,6 +271,11 @@ typedef struct TCGPool {
typedef enum TCGType {
TCG_TYPE_I32,
TCG_TYPE_I64,
+
+ TCG_TYPE_V64,
+ TCG_TYPE_V128,
+ TCG_TYPE_V256,
+
TCG_TYPE_COUNT, /* number of different types */
/* An alias for the size of the host register. */
@@ -396,6 +426,8 @@ typedef tcg_target_ulong TCGArg;
* TCGv_i32 : 32 bit integer type
* TCGv_i64 : 64 bit integer type
* TCGv_ptr : a host pointer type
+ * TCGv_vec : a host vector type; the exact size is not exposed
+ to the CPU front-end code.
* TCGv : an integer type the same size as target_ulong
(an alias for either TCGv_i32 or TCGv_i64)
The compiler's type checking will complain if you mix them
@@ -418,6 +450,7 @@ typedef tcg_target_ulong TCGArg;
typedef struct TCGv_i32_d *TCGv_i32;
typedef struct TCGv_i64_d *TCGv_i64;
typedef struct TCGv_ptr_d *TCGv_ptr;
+typedef struct TCGv_vec_d *TCGv_vec;
typedef TCGv_ptr TCGv_env;
#if TARGET_LONG_BITS == 32
#define TCGv TCGv_i32
@@ -589,6 +622,9 @@ typedef struct TCGOp {
#define TCGOP_CALLI(X) (X)->param1
#define TCGOP_CALLO(X) (X)->param2
+#define TCGOP_VECL(X) (X)->param1
+#define TCGOP_VECE(X) (X)->param2
+
/* Make sure operands fit in the bitfields above. */
QEMU_BUILD_BUG_ON(NB_OPS > (1 << 8));
@@ -726,6 +762,11 @@ static inline TCGTemp *tcgv_ptr_temp(TCGv_ptr v)
return tcgv_i32_temp((TCGv_i32)v);
}
+static inline TCGTemp *tcgv_vec_temp(TCGv_vec v)
+{
+ return tcgv_i32_temp((TCGv_i32)v);
+}
+
static inline TCGArg tcgv_i32_arg(TCGv_i32 v)
{
return temp_arg(tcgv_i32_temp(v));
@@ -741,6 +782,11 @@ static inline TCGArg tcgv_ptr_arg(TCGv_ptr v)
return temp_arg(tcgv_ptr_temp(v));
}
+static inline TCGArg tcgv_vec_arg(TCGv_vec v)
+{
+ return temp_arg(tcgv_vec_temp(v));
+}
+
static inline TCGv_i32 temp_tcgv_i32(TCGTemp *t)
{
(void)temp_idx(t); /* trigger embedded assert */
@@ -757,6 +803,11 @@ static inline TCGv_ptr temp_tcgv_ptr(TCGTemp *t)
return (TCGv_ptr)temp_tcgv_i32(t);
}
+static inline TCGv_vec temp_tcgv_vec(TCGTemp *t)
+{
+ return (TCGv_vec)temp_tcgv_i32(t);
+}
+
#if TCG_TARGET_REG_BITS == 32
static inline TCGv_i32 TCGV_LOW(TCGv_i64 t)
{
@@ -832,9 +883,12 @@ TCGTemp *tcg_global_mem_new_internal(TCGType, TCGv_ptr,
TCGv_i32 tcg_temp_new_internal_i32(int temp_local);
TCGv_i64 tcg_temp_new_internal_i64(int temp_local);
+TCGv_vec tcg_temp_new_vec(TCGType type);
+TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match);
void tcg_temp_free_i32(TCGv_i32 arg);
void tcg_temp_free_i64(TCGv_i64 arg);
+void tcg_temp_free_vec(TCGv_vec arg);
static inline TCGv_i32 tcg_global_mem_new_i32(TCGv_ptr reg, intptr_t offset,
const char *name)
@@ -916,6 +970,8 @@ enum {
/* Instruction is optional and not implemented by the host, or insn
is generic and should not be implemented by the host. */
TCG_OPF_NOT_PRESENT = 0x10,
+ /* Instruction operands are vectors. */
+ TCG_OPF_VECTOR = 0x20,
};
typedef struct TCGOpDef {
@@ -981,6 +1037,10 @@ TCGv_i32 tcg_const_i32(int32_t val);
TCGv_i64 tcg_const_i64(int64_t val);
TCGv_i32 tcg_const_local_i32(int32_t val);
TCGv_i64 tcg_const_local_i64(int64_t val);
+TCGv_vec tcg_const_zeros_vec(TCGType);
+TCGv_vec tcg_const_ones_vec(TCGType);
+TCGv_vec tcg_const_zeros_vec_matching(TCGv_vec);
+TCGv_vec tcg_const_ones_vec_matching(TCGv_vec);
TCGLabel *gen_new_label(void);
@@ -1151,6 +1211,33 @@ uintptr_t tcg_qemu_tb_exec(CPUArchState *env, uint8_t *tb_ptr);
void tcg_register_jit(void *buf, size_t buf_size);
+#if TCG_TARGET_MAYBE_vec
+/* Return zero if the tuple (opc, type, vece) is unsupportable;
+ return > 0 if it is directly supportable;
+ return < 0 if we must call tcg_expand_vec_op. */
+int tcg_can_emit_vec_op(TCGOpcode, TCGType, unsigned);
+#else
+static inline int tcg_can_emit_vec_op(TCGOpcode o, TCGType t, unsigned ve)
+{
+ return 0;
+}
+#endif
+
+/* Expand the tuple (opc, type, vece) on the given arguments. */
+void tcg_expand_vec_op(TCGOpcode, TCGType, unsigned, TCGArg, ...);
+
+/* Replicate a constant C according to the log2 of the element size. */
+uint64_t dup_const(unsigned vece, uint64_t c);
+
+#define dup_const(VECE, C) \
+ (__builtin_constant_p(VECE) \
+ ? ( (VECE) == MO_8 ? 0x0101010101010101ull * (uint8_t)(C) \
+ : (VECE) == MO_16 ? 0x0001000100010001ull * (uint16_t)(C) \
+ : (VECE) == MO_32 ? 0x0000000100000001ull * (uint32_t)(C) \
+ : dup_const(VECE, C)) \
+ : dup_const(VECE, C))
+
+
/*
* Memory helpers that will be used by TCG generated code.
*/