88 files changed, 11140 insertions(+), 1233 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS index 301b6996e1..54feb95646 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -76,6 +76,29 @@ K: ^Subject:.*(?i)trivial T: git git://git.corpit.ru/qemu.git trivial-patches T: git git://github.com/vivier/qemu.git trivial-patches +Architecture support +-------------------- +S390 +M: Cornelia Huck <cohuck@redhat.com> +S: Supported +F: default-configs/s390x-softmmu.mak +F: gdb-xml/s390*.xml +F: hw/char/sclp*.[hc] +F: hw/char/terminal3270.c +F: hw/intc/s390_flic.c +F: hw/intc/s390_flic_kvm.c +F: hw/s390x/ +F: hw/vfio/ccw.c +F: hw/watchdog/wdt_diag288.c +F: include/hw/s390x/ +F: include/hw/watchdog/wdt_diag288.h +F: pc-bios/s390-ccw/ +F: pc-bios/s390-ccw.img +F: target/s390x/ +K: ^Subject:.*(?i)s390x? +T: git git://github.com/cohuck/qemu.git s390-next +L: qemu-s390x@nongnu.org + Guest CPU cores (TCG): ---------------------- Overall @@ -213,6 +236,7 @@ F: disas/ppc.c S390 M: Richard Henderson <rth@twiddle.net> M: Alexander Graf <agraf@suse.de> +M: David Hildenbrand <david@redhat.com> S: Maintained F: target/s390x/ F: hw/s390x/ @@ -832,15 +856,22 @@ F: hw/char/sclp*.[hc] F: hw/char/terminal3270.c F: hw/s390x/ F: include/hw/s390x/ -F: pc-bios/s390-ccw/ F: hw/watchdog/wdt_diag288.c F: include/hw/watchdog/wdt_diag288.h -F: pc-bios/s390-ccw.img F: default-configs/s390x-softmmu.mak T: git git://github.com/cohuck/qemu.git s390-next T: git git://github.com/borntraeger/qemu.git s390-next L: qemu-s390x@nongnu.org +S390-ccw Bios +M: Christian Borntraeger <borntraeger@de.ibm.com> +M: Thomas Huth <thuth@redhat.com> +S: Supported +F: pc-bios/s390-ccw/ +F: pc-bios/s390-ccw.img +T: git git://github.com/borntraeger/qemu.git s390-next +L: qemu-s390x@nongnu.org + UniCore32 Machines ------------- PKUnity-3 SoC initramfs-with-busybox diff --git a/Makefile.target b/Makefile.target index f9a9da7e7c..6549481096 100644 --- a/Makefile.target +++ b/Makefile.target @@ -93,8 +93,8 @@ all: $(PROGS) stap # cpu emulator library obj-y += exec.o obj-y += accel/ -obj-$(CONFIG_TCG) += tcg/tcg.o tcg/tcg-op.o tcg/optimize.o -obj-$(CONFIG_TCG) += tcg/tcg-common.o +obj-$(CONFIG_TCG) += tcg/tcg.o tcg/tcg-op.o tcg/tcg-op-vec.o tcg/tcg-op-gvec.o +obj-$(CONFIG_TCG) += tcg/tcg-common.o tcg/optimize.o obj-$(CONFIG_TCG_INTERPRETER) += tcg/tci.o obj-$(CONFIG_TCG_INTERPRETER) += disas/tci.o obj-y += fpu/softfloat.o diff --git a/accel/tcg/Makefile.objs b/accel/tcg/Makefile.objs index 228cd84fa4..d381a02f34 100644 --- a/accel/tcg/Makefile.objs +++ b/accel/tcg/Makefile.objs @@ -1,6 +1,6 @@ obj-$(CONFIG_SOFTMMU) += tcg-all.o obj-$(CONFIG_SOFTMMU) += cputlb.o -obj-y += tcg-runtime.o +obj-y += tcg-runtime.o tcg-runtime-gvec.o obj-y += cpu-exec.o cpu-exec-common.o translate-all.o obj-y += translator.o diff --git a/accel/tcg/tcg-runtime-gvec.c b/accel/tcg/tcg-runtime-gvec.c new file mode 100644 index 0000000000..8bf8d63912 --- /dev/null +++ b/accel/tcg/tcg-runtime-gvec.c @@ -0,0 +1,997 @@ +/* + * Generic vectorized operation runtime + * + * Copyright (c) 2018 Linaro + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "qemu/host-utils.h" +#include "cpu.h" +#include "exec/helper-proto.h" +#include "tcg-gvec-desc.h" + + +/* Virtually all hosts support 16-byte vectors. Those that don't can emulate + * them via GCC's generic vector extension. This turns out to be simpler and + * more reliable than getting the compiler to autovectorize. + * + * In tcg-op-gvec.c, we asserted that both the size and alignment of the data + * are multiples of 16. + * + * When the compiler does not support all of the operations we require, the + * loops are written so that we can always fall back on the base types. + */ +#ifdef CONFIG_VECTOR16 +typedef uint8_t vec8 __attribute__((vector_size(16))); +typedef uint16_t vec16 __attribute__((vector_size(16))); +typedef uint32_t vec32 __attribute__((vector_size(16))); +typedef uint64_t vec64 __attribute__((vector_size(16))); + +typedef int8_t svec8 __attribute__((vector_size(16))); +typedef int16_t svec16 __attribute__((vector_size(16))); +typedef int32_t svec32 __attribute__((vector_size(16))); +typedef int64_t svec64 __attribute__((vector_size(16))); + +#define DUP16(X) { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X } +#define DUP8(X) { X, X, X, X, X, X, X, X } +#define DUP4(X) { X, X, X, X } +#define DUP2(X) { X, X } +#else +typedef uint8_t vec8; +typedef uint16_t vec16; +typedef uint32_t vec32; +typedef uint64_t vec64; + +typedef int8_t svec8; +typedef int16_t svec16; +typedef int32_t svec32; +typedef int64_t svec64; + +#define DUP16(X) X +#define DUP8(X) X +#define DUP4(X) X +#define DUP2(X) X +#endif /* CONFIG_VECTOR16 */ + +static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc) +{ + intptr_t maxsz = simd_maxsz(desc); + intptr_t i; + + if (unlikely(maxsz > oprsz)) { + for (i = oprsz; i < maxsz; i += sizeof(uint64_t)) { + *(uint64_t *)(d + i) = 0; + } + } +} + +void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec8)) { + *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec16)) { + *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec32)) { + *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_adds8)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec8 vecb = (vec8)DUP16(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec8)) { + *(vec8 *)(d + i) = *(vec8 *)(a + i) + vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_adds16)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec16 vecb = (vec16)DUP8(b); + intptr_t i; + + for (i = 0; i < 
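The desc word that every helper receives packs the operation size, the maximum (register) size, and an operation-specific immediate into 32 bits. A minimal sketch of the decode side, with assumed field widths (the real layout is defined in tcg-gvec-desc.h and may differ):

    #include <stdint.h>

    /* Assumed layout: 5 bits of (oprsz/8 - 1), 5 bits of (maxsz/8 - 1),
     * remaining bits free for per-op data such as a shift count. */
    static inline intptr_t demo_oprsz(uint32_t desc)
    {
        return ((desc & 0x1f) + 1) * 8;
    }

    static inline intptr_t demo_maxsz(uint32_t desc)
    {
        return (((desc >> 5) & 0x1f) + 1) * 8;
    }

    static inline int32_t demo_data(uint32_t desc)
    {
        return (int32_t)desc >> 10;   /* sign-extended immediate */
    }

Everything between oprsz and maxsz is the tail that clear_high() zeroes after each operation, mirroring hardware that clears the upper part of a wide register on a narrower write.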
oprsz; i += sizeof(vec16)) { + *(vec16 *)(d + i) = *(vec16 *)(a + i) + vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_adds32)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec32 vecb = (vec32)DUP4(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec32)) { + *(vec32 *)(d + i) = *(vec32 *)(a + i) + vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_adds64)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec64 vecb = (vec64)DUP2(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) + vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec8)) { + *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec16)) { + *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec32)) { + *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_subs8)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec8 vecb = (vec8)DUP16(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec8)) { + *(vec8 *)(d + i) = *(vec8 *)(a + i) - vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_subs16)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec16 vecb = (vec16)DUP8(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec16)) { + *(vec16 *)(d + i) = *(vec16 *)(a + i) - vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_subs32)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec32 vecb = (vec32)DUP4(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec32)) { + *(vec32 *)(d + i) = *(vec32 *)(a + i) - vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_subs64)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec64 vecb = (vec64)DUP2(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) - vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_mul8)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec8)) { + *(vec8 *)(d + i) = *(vec8 *)(a + i) * *(vec8 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_mul16)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec16)) { + *(vec16 *)(d + i) = *(vec16 *)(a + i) * *(vec16 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_mul32)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t 
oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec32)) { + *(vec32 *)(d + i) = *(vec32 *)(a + i) * *(vec32 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_mul64)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) * *(vec64 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_muls8)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec8 vecb = (vec8)DUP16(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec8)) { + *(vec8 *)(d + i) = *(vec8 *)(a + i) * vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_muls16)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec16 vecb = (vec16)DUP8(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec16)) { + *(vec16 *)(d + i) = *(vec16 *)(a + i) * vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_muls32)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec32 vecb = (vec32)DUP4(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec32)) { + *(vec32 *)(d + i) = *(vec32 *)(a + i) * vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_muls64)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec64 vecb = (vec64)DUP2(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) * vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec8)) { + *(vec8 *)(d + i) = -*(vec8 *)(a + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec16)) { + *(vec16 *)(d + i) = -*(vec16 *)(a + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec32)) { + *(vec32 *)(d + i) = -*(vec32 *)(a + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = -*(vec64 *)(a + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_mov)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + + memcpy(d, a, oprsz); + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_dup64)(void *d, uint32_t desc, uint64_t c) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + if (c == 0) { + oprsz = 0; + } else { + for (i = 0; i < oprsz; i += sizeof(uint64_t)) { + *(uint64_t *)(d + i) = c; + } + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_dup32)(void *d, uint32_t desc, uint32_t c) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + if (c == 0) { + oprsz = 0; + } else { + for (i = 0; i < oprsz; i += sizeof(uint32_t)) { + *(uint32_t *)(d + i) = c; + } + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_dup16)(void *d, uint32_t desc, uint32_t c) +{ + HELPER(gvec_dup32)(d, desc, 0x00010001 * (c & 0xffff)); +} + +void HELPER(gvec_dup8)(void *d, uint32_t desc, uint32_t c) +{ + HELPER(gvec_dup32)(d, desc, 0x01010101 * (c & 0xff)); +} + +void 
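gvec_dup16 and gvec_dup8 above replicate a narrow constant by multiplication: 0x00010001 * x copies a halfword into both halves of a word, and 0x01010101 * x copies a byte into all four. A quick standalone check:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        assert(0x00010001u * 0xabcdu == 0xabcdabcdu);  /* halfword splat */
        assert(0x01010101u * 0x5au == 0x5a5a5a5au);    /* byte splat */
        return 0;
    }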
HELPER(gvec_not)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = ~*(vec64 *)(a + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i); + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec64 vecb = (vec64)DUP2(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) & vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec64 vecb = (vec64)DUP2(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_ors)(void *d, void *a, uint64_t b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + vec64 vecb = (vec64)DUP2(b); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) | vecb; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_shl8i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec8)) { + *(vec8 *)(d + i) = *(vec8 *)(a + i) << shift; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_shl16i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec16)) { + *(vec16 *)(d + i) = *(vec16 *)(a + i) << shift; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_shl32i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec32)) { + *(vec32 *)(d + i) = *(vec32 *)(a + i) << shift; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_shl64i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) << shift; + } + clear_high(d, 
oprsz, desc); +} + +void HELPER(gvec_shr8i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec8)) { + *(vec8 *)(d + i) = *(vec8 *)(a + i) >> shift; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_shr16i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec16)) { + *(vec16 *)(d + i) = *(vec16 *)(a + i) >> shift; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_shr32i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec32)) { + *(vec32 *)(d + i) = *(vec32 *)(a + i) >> shift; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_shr64i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(vec64 *)(d + i) = *(vec64 *)(a + i) >> shift; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sar8i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec8)) { + *(svec8 *)(d + i) = *(svec8 *)(a + i) >> shift; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sar16i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec16)) { + *(svec16 *)(d + i) = *(svec16 *)(a + i) >> shift; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sar32i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec32)) { + *(svec32 *)(d + i) = *(svec32 *)(a + i) >> shift; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sar64i)(void *d, void *a, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + int shift = simd_data(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(vec64)) { + *(svec64 *)(d + i) = *(svec64 *)(a + i) >> shift; + } + clear_high(d, oprsz, desc); +} + +/* If vectors are enabled, the compiler fills in -1 for true. + Otherwise, we must take care of this by hand. 
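The distinction drawn in the DO_CMP0 fallback defined just below exists because GCC's vector extension already yields all-ones for true lanes, while a scalar C comparison yields 1. A standalone illustration, compilable with GCC or Clang:

    #include <assert.h>
    #include <stdint.h>

    typedef uint32_t v4u __attribute__((vector_size(16)));
    typedef int32_t v4s __attribute__((vector_size(16)));

    int main(void)
    {
        v4u a = {1, 2, 3, 4};
        v4u b = {1, 9, 3, 9};
        v4s eq = (a == b);        /* vector compare: -1 in each true lane */
        assert(eq[0] == -1 && eq[1] == 0);

        uint32_t x = 7, y = 7;
        assert((x == y) == 1);    /* scalar compare: 1, hence the -(X) */
        return 0;
    }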
*/ +#ifdef CONFIG_VECTOR16 +# define DO_CMP0(X) X +#else +# define DO_CMP0(X) -(X) +#endif + +#define DO_CMP1(NAME, TYPE, OP) \ +void HELPER(NAME)(void *d, void *a, void *b, uint32_t desc) \ +{ \ + intptr_t oprsz = simd_oprsz(desc); \ + intptr_t i; \ + for (i = 0; i < oprsz; i += sizeof(vec64)) { \ + *(TYPE *)(d + i) = DO_CMP0(*(TYPE *)(a + i) OP *(TYPE *)(b + i)); \ + } \ + clear_high(d, oprsz, desc); \ +} + +#define DO_CMP2(SZ) \ + DO_CMP1(gvec_eq##SZ, vec##SZ, ==) \ + DO_CMP1(gvec_ne##SZ, vec##SZ, !=) \ + DO_CMP1(gvec_lt##SZ, svec##SZ, <) \ + DO_CMP1(gvec_le##SZ, svec##SZ, <=) \ + DO_CMP1(gvec_ltu##SZ, vec##SZ, <) \ + DO_CMP1(gvec_leu##SZ, vec##SZ, <=) + +DO_CMP2(8) +DO_CMP2(16) +DO_CMP2(32) +DO_CMP2(64) + +#undef DO_CMP0 +#undef DO_CMP1 +#undef DO_CMP2 + +void HELPER(gvec_ssadd8)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(int8_t)) { + int r = *(int8_t *)(a + i) + *(int8_t *)(b + i); + if (r > INT8_MAX) { + r = INT8_MAX; + } else if (r < INT8_MIN) { + r = INT8_MIN; + } + *(int8_t *)(d + i) = r; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_ssadd16)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(int16_t)) { + int r = *(int16_t *)(a + i) + *(int16_t *)(b + i); + if (r > INT16_MAX) { + r = INT16_MAX; + } else if (r < INT16_MIN) { + r = INT16_MIN; + } + *(int16_t *)(d + i) = r; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_ssadd32)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(int32_t)) { + int32_t ai = *(int32_t *)(a + i); + int32_t bi = *(int32_t *)(b + i); + int32_t di = ai + bi; + if (((di ^ ai) &~ (ai ^ bi)) < 0) { + /* Signed overflow. */ + di = (di < 0 ? INT32_MAX : INT32_MIN); + } + *(int32_t *)(d + i) = di; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_ssadd64)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(int64_t)) { + int64_t ai = *(int64_t *)(a + i); + int64_t bi = *(int64_t *)(b + i); + int64_t di = ai + bi; + if (((di ^ ai) &~ (ai ^ bi)) < 0) { + /* Signed overflow. */ + di = (di < 0 ? INT64_MAX : INT64_MIN); + } + *(int64_t *)(d + i) = di; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sssub8)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(uint8_t)) { + int r = *(int8_t *)(a + i) - *(int8_t *)(b + i); + if (r > INT8_MAX) { + r = INT8_MAX; + } else if (r < INT8_MIN) { + r = INT8_MIN; + } + *(uint8_t *)(d + i) = r; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sssub16)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(int16_t)) { + int r = *(int16_t *)(a + i) - *(int16_t *)(b + i); + if (r > INT16_MAX) { + r = INT16_MAX; + } else if (r < INT16_MIN) { + r = INT16_MIN; + } + *(int16_t *)(d + i) = r; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sssub32)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(int32_t)) { + int32_t ai = *(int32_t *)(a + i); + int32_t bi = *(int32_t *)(b + i); + int32_t di = ai - bi; + if (((di ^ ai) & (ai ^ bi)) < 0) { + /* Signed overflow. */ + di = (di < 0 ? 
INT32_MAX : INT32_MIN); + } + *(int32_t *)(d + i) = di; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_sssub64)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(int64_t)) { + int64_t ai = *(int64_t *)(a + i); + int64_t bi = *(int64_t *)(b + i); + int64_t di = ai - bi; + if (((di ^ ai) & (ai ^ bi)) < 0) { + /* Signed overflow. */ + di = (di < 0 ? INT64_MAX : INT64_MIN); + } + *(int64_t *)(d + i) = di; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_usadd8)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(uint8_t)) { + unsigned r = *(uint8_t *)(a + i) + *(uint8_t *)(b + i); + if (r > UINT8_MAX) { + r = UINT8_MAX; + } + *(uint8_t *)(d + i) = r; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_usadd16)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(uint16_t)) { + unsigned r = *(uint16_t *)(a + i) + *(uint16_t *)(b + i); + if (r > UINT16_MAX) { + r = UINT16_MAX; + } + *(uint16_t *)(d + i) = r; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_usadd32)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(uint32_t)) { + uint32_t ai = *(uint32_t *)(a + i); + uint32_t bi = *(uint32_t *)(b + i); + uint32_t di = ai + bi; + if (di < ai) { + di = UINT32_MAX; + } + *(uint32_t *)(d + i) = di; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_usadd64)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(uint64_t)) { + uint64_t ai = *(uint64_t *)(a + i); + uint64_t bi = *(uint64_t *)(b + i); + uint64_t di = ai + bi; + if (di < ai) { + di = UINT64_MAX; + } + *(uint64_t *)(d + i) = di; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_ussub8)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(uint8_t)) { + int r = *(uint8_t *)(a + i) - *(uint8_t *)(b + i); + if (r < 0) { + r = 0; + } + *(uint8_t *)(d + i) = r; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_ussub16)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(uint16_t)) { + int r = *(uint16_t *)(a + i) - *(uint16_t *)(b + i); + if (r < 0) { + r = 0; + } + *(uint16_t *)(d + i) = r; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_ussub32)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(uint32_t)) { + uint32_t ai = *(uint32_t *)(a + i); + uint32_t bi = *(uint32_t *)(b + i); + uint32_t di = ai - bi; + if (ai < bi) { + di = 0; + } + *(uint32_t *)(d + i) = di; + } + clear_high(d, oprsz, desc); +} + +void HELPER(gvec_ussub64)(void *d, void *a, void *b, uint32_t desc) +{ + intptr_t oprsz = simd_oprsz(desc); + intptr_t i; + + for (i = 0; i < oprsz; i += sizeof(uint64_t)) { + uint64_t ai = *(uint64_t *)(a + i); + uint64_t bi = *(uint64_t *)(b + i); + uint64_t di = ai - bi; + if (ai < bi) { + di = 0; + } + *(uint64_t *)(d + i) = di; + } + clear_high(d, oprsz, desc); +} diff --git a/accel/tcg/tcg-runtime.h b/accel/tcg/tcg-runtime.h index 1df17d0ba9..2536959a18 100644 --- a/accel/tcg/tcg-runtime.h +++ b/accel/tcg/tcg-runtime.h @@ 
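The 32- and 64-bit saturating helpers above detect overflow without a wider type: for addition, overflow happened exactly when the operands agree in sign but the result does not, i.e. ((di ^ ai) & ~(ai ^ bi)) < 0; subtraction needs the operands to differ in sign, hence ((di ^ ai) & (ai ^ bi)) < 0. A self-contained check of the addition rule (wrapping arithmetic done via unsigned so it stays defined in plain C):

    #include <assert.h>
    #include <stdint.h>

    static int32_t demo_ssadd32(int32_t a, int32_t b)
    {
        int32_t d = (int32_t)((uint32_t)a + (uint32_t)b);  /* wrapping add */
        if (((d ^ a) & ~(a ^ b)) < 0) {
            /* same-signed operands, opposite-signed result: overflow */
            d = d < 0 ? INT32_MAX : INT32_MIN;
        }
        return d;
    }

    int main(void)
    {
        assert(demo_ssadd32(INT32_MAX, 1) == INT32_MAX);
        assert(demo_ssadd32(INT32_MIN, -1) == INT32_MIN);
        assert(demo_ssadd32(100, -200) == -100);
        return 0;
    }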
-134,3 +134,121 @@ GEN_ATOMIC_HELPERS(xor_fetch) GEN_ATOMIC_HELPERS(xchg) #undef GEN_ATOMIC_HELPERS + +DEF_HELPER_FLAGS_3(gvec_mov, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(gvec_dup8, TCG_CALL_NO_RWG, void, ptr, i32, i32) +DEF_HELPER_FLAGS_3(gvec_dup16, TCG_CALL_NO_RWG, void, ptr, i32, i32) +DEF_HELPER_FLAGS_3(gvec_dup32, TCG_CALL_NO_RWG, void, ptr, i32, i32) +DEF_HELPER_FLAGS_3(gvec_dup64, TCG_CALL_NO_RWG, void, ptr, i32, i64) + +DEF_HELPER_FLAGS_4(gvec_add8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_add16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_add32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_add64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_adds8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(gvec_adds16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(gvec_adds32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(gvec_adds64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) + +DEF_HELPER_FLAGS_4(gvec_sub8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_sub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_sub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_sub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_subs8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(gvec_subs16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(gvec_subs32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(gvec_subs64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) + +DEF_HELPER_FLAGS_4(gvec_mul8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_mul16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_mul32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_mul64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_muls8, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(gvec_muls16, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(gvec_muls32, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(gvec_muls64, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) + +DEF_HELPER_FLAGS_4(gvec_ssadd8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ssadd16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ssadd32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ssadd64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_sssub8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_sssub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_sssub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_sssub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_usadd8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_usadd16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_usadd32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_usadd64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_ussub8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ussub16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ussub32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ussub64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(gvec_neg8, TCG_CALL_NO_RWG, 
void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_neg16, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_neg32, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_neg64, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(gvec_not, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_and, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_or, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_xor, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_andc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_orc, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_ands, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(gvec_xors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) +DEF_HELPER_FLAGS_4(gvec_ors, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32) + +DEF_HELPER_FLAGS_3(gvec_shl8i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_shl16i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_shl32i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_shl64i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(gvec_shr8i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_shr16i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_shr32i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_shr64i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_3(gvec_sar8i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_sar16i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_sar32i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) +DEF_HELPER_FLAGS_3(gvec_sar64i, TCG_CALL_NO_RWG, void, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_eq8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_eq16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_eq32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_eq64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_ne8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ne16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ne32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ne64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_lt8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_lt16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_lt32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_lt64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_le8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_le16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_le32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_le64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_ltu8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ltu16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ltu32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_ltu64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) + +DEF_HELPER_FLAGS_4(gvec_leu8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_leu16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_leu32, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_leu64, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) @@ -1933,9 +1933,9 
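Each DEF_HELPER_FLAGS_N entry above is a declaration macro rather than a function definition; QEMU's helper-proto machinery expands it into an ordinary C prototype that the HELPER() definitions in tcg-runtime-gvec.c then match. Approximately (a sketch of the expansion, not the verbatim generated text):

    #include <stdint.h>

    /* DEF_HELPER_FLAGS_4(gvec_add8, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
     * declares, roughly: */
    void helper_gvec_add8(void *d, void *a, void *b, uint32_t desc);

TCG_CALL_NO_RWG asserts that the call neither reads nor writes TCG globals (guest CPU state held in TCG temporaries), which lets the code generator keep cached register values live across the call.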
@@ int main(int argc, char *argv[]) { EOF if compile_object ; then - if grep -q BiGeNdIaN $TMPO ; then + if strings -a $TMPO | grep -q BiGeNdIaN ; then bigendian="yes" - elif grep -q LiTtLeEnDiAn $TMPO ; then + elif strings -a $TMPO | grep -q LiTtLeEnDiAn ; then bigendian="no" else echo big/little test failed @@ -5001,6 +5001,50 @@ if compile_prog "" "" ; then fi ######################################## +# See if 16-byte vector operations are supported. +# Even without a vector unit the compiler may expand these. +# There is a bug in old GCC for PPC that crashes here. +# Unfortunately it's the system compiler for Centos 7. + +cat > $TMPC << EOF +typedef unsigned char U1 __attribute__((vector_size(16))); +typedef unsigned short U2 __attribute__((vector_size(16))); +typedef unsigned int U4 __attribute__((vector_size(16))); +typedef unsigned long long U8 __attribute__((vector_size(16))); +typedef signed char S1 __attribute__((vector_size(16))); +typedef signed short S2 __attribute__((vector_size(16))); +typedef signed int S4 __attribute__((vector_size(16))); +typedef signed long long S8 __attribute__((vector_size(16))); +static U1 a1, b1; +static U2 a2, b2; +static U4 a4, b4; +static U8 a8, b8; +static S1 c1; +static S2 c2; +static S4 c4; +static S8 c8; +static int i; +int main(void) +{ + a1 += b1; a2 += b2; a4 += b4; a8 += b8; + a1 -= b1; a2 -= b2; a4 -= b4; a8 -= b8; + a1 *= b1; a2 *= b2; a4 *= b4; a8 *= b8; + a1 &= b1; a2 &= b2; a4 &= b4; a8 &= b8; + a1 |= b1; a2 |= b2; a4 |= b4; a8 |= b8; + a1 ^= b1; a2 ^= b2; a4 ^= b4; a8 ^= b8; + a1 <<= i; a2 <<= i; a4 <<= i; a8 <<= i; + a1 >>= i; a2 >>= i; a4 >>= i; a8 >>= i; + c1 >>= i; c2 >>= i; c4 >>= i; c8 >>= i; + return 0; +} +EOF + +vector16=no +if compile_prog "" "" ; then + vector16=yes +fi + +######################################## # check if getauxval is available. 
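The probe above only has to prove that the toolchain accepts the whole operator set on 16-byte vectors; no SIMD hardware is required, since GCC lowers the operations to scalar code when necessary. A runnable distillation of what CONFIG_VECTOR16 guards:

    #include <stdio.h>

    typedef unsigned int u32x4 __attribute__((vector_size(16)));

    int main(void)
    {
        u32x4 a = {1, 2, 3, 4};
        u32x4 b = {10, 20, 30, 40};
        u32x4 c = (a + b) << 1;   /* element-wise add and shift */
        printf("%u %u %u %u\n", c[0], c[1], c[2], c[3]);  /* 22 44 66 88 */
        return 0;
    }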
getauxval=no @@ -6329,6 +6373,10 @@ if test "$atomic64" = "yes" ; then echo "CONFIG_ATOMIC64=y" >> $config_host_mak fi +if test "$vector16" = "yes" ; then + echo "CONFIG_VECTOR16=y" >> $config_host_mak +fi + if test "$getauxval" = "yes" ; then echo "CONFIG_GETAUXVAL=y" >> $config_host_mak fi @@ -6731,6 +6779,7 @@ case "$target_name" in echo "TARGET_ABI32=y" >> $config_target_mak ;; s390x) + mttcg=yes gdb_xml_files="s390x-core64.xml s390-acr.xml s390-fpr.xml s390-vx.xml s390-cr.xml s390-virt.xml s390-gs.xml" ;; tilegx) diff --git a/hw/arm/boot.c b/hw/arm/boot.c index 54f8f576b8..05108bc42f 100644 --- a/hw/arm/boot.c +++ b/hw/arm/boot.c @@ -386,6 +386,69 @@ static void set_kernel_args_old(const struct arm_boot_info *info) } } +static void fdt_add_psci_node(void *fdt) +{ + uint32_t cpu_suspend_fn; + uint32_t cpu_off_fn; + uint32_t cpu_on_fn; + uint32_t migrate_fn; + ARMCPU *armcpu = ARM_CPU(qemu_get_cpu(0)); + const char *psci_method; + int64_t psci_conduit; + + psci_conduit = object_property_get_int(OBJECT(armcpu), + "psci-conduit", + &error_abort); + switch (psci_conduit) { + case QEMU_PSCI_CONDUIT_DISABLED: + return; + case QEMU_PSCI_CONDUIT_HVC: + psci_method = "hvc"; + break; + case QEMU_PSCI_CONDUIT_SMC: + psci_method = "smc"; + break; + default: + g_assert_not_reached(); + } + + qemu_fdt_add_subnode(fdt, "/psci"); + if (armcpu->psci_version == 2) { + const char comp[] = "arm,psci-0.2\0arm,psci"; + qemu_fdt_setprop(fdt, "/psci", "compatible", comp, sizeof(comp)); + + cpu_off_fn = QEMU_PSCI_0_2_FN_CPU_OFF; + if (arm_feature(&armcpu->env, ARM_FEATURE_AARCH64)) { + cpu_suspend_fn = QEMU_PSCI_0_2_FN64_CPU_SUSPEND; + cpu_on_fn = QEMU_PSCI_0_2_FN64_CPU_ON; + migrate_fn = QEMU_PSCI_0_2_FN64_MIGRATE; + } else { + cpu_suspend_fn = QEMU_PSCI_0_2_FN_CPU_SUSPEND; + cpu_on_fn = QEMU_PSCI_0_2_FN_CPU_ON; + migrate_fn = QEMU_PSCI_0_2_FN_MIGRATE; + } + } else { + qemu_fdt_setprop_string(fdt, "/psci", "compatible", "arm,psci"); + + cpu_suspend_fn = QEMU_PSCI_0_1_FN_CPU_SUSPEND; + cpu_off_fn = QEMU_PSCI_0_1_FN_CPU_OFF; + cpu_on_fn = QEMU_PSCI_0_1_FN_CPU_ON; + migrate_fn = QEMU_PSCI_0_1_FN_MIGRATE; + } + + /* We adopt the PSCI spec's nomenclature, and use 'conduit' to refer + * to the instruction that should be used to invoke PSCI functions. + * However, the device tree binding uses 'method' instead, so that is + * what we should use here. 
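For a PSCI 0.2 guest using the HVC conduit on AArch64, the node written by fdt_add_psci_node comes out roughly as the following device-tree fragment (illustrative; the function IDs shown are the standard PSCI 0.2 SMC64 values that the QEMU_PSCI_0_2_FN64_* constants encode):

    psci {
        compatible = "arm,psci-0.2", "arm,psci";
        method = "hvc";
        cpu_suspend = <0xc4000001>;
        cpu_off = <0x84000002>;
        cpu_on = <0xc4000003>;
        migrate = <0xc4000005>;
    };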
+ */ + qemu_fdt_setprop_string(fdt, "/psci", "method", psci_method); + + qemu_fdt_setprop_cell(fdt, "/psci", "cpu_suspend", cpu_suspend_fn); + qemu_fdt_setprop_cell(fdt, "/psci", "cpu_off", cpu_off_fn); + qemu_fdt_setprop_cell(fdt, "/psci", "cpu_on", cpu_on_fn); + qemu_fdt_setprop_cell(fdt, "/psci", "migrate", migrate_fn); +} + /** * load_dtb() - load a device tree binary image into memory * @addr: the address to load the image at @@ -542,6 +605,8 @@ static int load_dtb(hwaddr addr, const struct arm_boot_info *binfo, } } + fdt_add_psci_node(fdt); + if (binfo->modify_dtb) { binfo->modify_dtb(binfo, fdt); } diff --git a/hw/arm/fsl-imx6.c b/hw/arm/fsl-imx6.c index b0d4088290..e6559a8b12 100644 --- a/hw/arm/fsl-imx6.c +++ b/hw/arm/fsl-imx6.c @@ -93,7 +93,7 @@ static void fsl_imx6_init(Object *obj) } for (i = 0; i < FSL_IMX6_NUM_ESDHCS; i++) { - object_initialize(&s->esdhc[i], sizeof(s->esdhc[i]), TYPE_SYSBUS_SDHCI); + object_initialize(&s->esdhc[i], sizeof(s->esdhc[i]), TYPE_IMX_USDHC); qdev_set_parent_bus(DEVICE(&s->esdhc[i]), sysbus_get_default()); snprintf(name, NAME_SIZE, "sdhc%d", i + 1); object_property_add_child(obj, name, OBJECT(&s->esdhc[i]), NULL); diff --git a/hw/arm/virt.c b/hw/arm/virt.c index b334c82eda..dbb3c8036a 100644 --- a/hw/arm/virt.c +++ b/hw/arm/virt.c @@ -244,66 +244,6 @@ static void create_fdt(VirtMachineState *vms) } } -static void fdt_add_psci_node(const VirtMachineState *vms) -{ - uint32_t cpu_suspend_fn; - uint32_t cpu_off_fn; - uint32_t cpu_on_fn; - uint32_t migrate_fn; - void *fdt = vms->fdt; - ARMCPU *armcpu = ARM_CPU(qemu_get_cpu(0)); - const char *psci_method; - - switch (vms->psci_conduit) { - case QEMU_PSCI_CONDUIT_DISABLED: - return; - case QEMU_PSCI_CONDUIT_HVC: - psci_method = "hvc"; - break; - case QEMU_PSCI_CONDUIT_SMC: - psci_method = "smc"; - break; - default: - g_assert_not_reached(); - } - - qemu_fdt_add_subnode(fdt, "/psci"); - if (armcpu->psci_version == 2) { - const char comp[] = "arm,psci-0.2\0arm,psci"; - qemu_fdt_setprop(fdt, "/psci", "compatible", comp, sizeof(comp)); - - cpu_off_fn = QEMU_PSCI_0_2_FN_CPU_OFF; - if (arm_feature(&armcpu->env, ARM_FEATURE_AARCH64)) { - cpu_suspend_fn = QEMU_PSCI_0_2_FN64_CPU_SUSPEND; - cpu_on_fn = QEMU_PSCI_0_2_FN64_CPU_ON; - migrate_fn = QEMU_PSCI_0_2_FN64_MIGRATE; - } else { - cpu_suspend_fn = QEMU_PSCI_0_2_FN_CPU_SUSPEND; - cpu_on_fn = QEMU_PSCI_0_2_FN_CPU_ON; - migrate_fn = QEMU_PSCI_0_2_FN_MIGRATE; - } - } else { - qemu_fdt_setprop_string(fdt, "/psci", "compatible", "arm,psci"); - - cpu_suspend_fn = QEMU_PSCI_0_1_FN_CPU_SUSPEND; - cpu_off_fn = QEMU_PSCI_0_1_FN_CPU_OFF; - cpu_on_fn = QEMU_PSCI_0_1_FN_CPU_ON; - migrate_fn = QEMU_PSCI_0_1_FN_MIGRATE; - } - - /* We adopt the PSCI spec's nomenclature, and use 'conduit' to refer - * to the instruction that should be used to invoke PSCI functions. - * However, the device tree binding uses 'method' instead, so that is - * what we should use here. - */ - qemu_fdt_setprop_string(fdt, "/psci", "method", psci_method); - - qemu_fdt_setprop_cell(fdt, "/psci", "cpu_suspend", cpu_suspend_fn); - qemu_fdt_setprop_cell(fdt, "/psci", "cpu_off", cpu_off_fn); - qemu_fdt_setprop_cell(fdt, "/psci", "cpu_on", cpu_on_fn); - qemu_fdt_setprop_cell(fdt, "/psci", "migrate", migrate_fn); -} - static void fdt_add_timer_nodes(const VirtMachineState *vms) { /* On real hardware these interrupts are level-triggered. 
@@ -1409,7 +1349,6 @@ static void machvirt_init(MachineState *machine) } fdt_add_timer_nodes(vms); fdt_add_cpu_nodes(vms); - fdt_add_psci_node(vms); memory_region_allocate_system_memory(ram, NULL, "mach-virt.ram", machine->ram_size); diff --git a/hw/core/generic-loader.c b/hw/core/generic-loader.c index 46012673c3..cb0e68486d 100644 --- a/hw/core/generic-loader.c +++ b/hw/core/generic-loader.c @@ -105,7 +105,7 @@ static void generic_loader_realize(DeviceState *dev, Error **errp) error_setg(errp, "data can not be specified when setting a " "program counter"); return; - } else if (!s->cpu_num) { + } else if (s->cpu_num == CPU_NONE) { error_setg(errp, "cpu_num must be specified when setting a " "program counter"); return; diff --git a/hw/intc/Makefile.objs b/hw/intc/Makefile.objs index 571e094a14..0e9963f5ee 100644 --- a/hw/intc/Makefile.objs +++ b/hw/intc/Makefile.objs @@ -6,7 +6,7 @@ common-obj-$(CONFIG_XILINX) += xilinx_intc.o common-obj-$(CONFIG_XLNX_ZYNQMP) += xlnx-pmu-iomod-intc.o common-obj-$(CONFIG_XLNX_ZYNQMP) += xlnx-zynqmp-ipi.o common-obj-$(CONFIG_ETRAXFS) += etraxfs_pic.o -common-obj-$(CONFIG_IMX) += imx_avic.o +common-obj-$(CONFIG_IMX) += imx_avic.o imx_gpcv2.o common-obj-$(CONFIG_LM32) += lm32_pic.o common-obj-$(CONFIG_REALVIEW) += realview_gic.o common-obj-$(CONFIG_SLAVIO) += slavio_intctl.o diff --git a/hw/intc/armv7m_nvic.c b/hw/intc/armv7m_nvic.c index 8ca6ceeb9b..360889d30b 100644 --- a/hw/intc/armv7m_nvic.c +++ b/hw/intc/armv7m_nvic.c @@ -503,8 +503,25 @@ static void armv7m_nvic_clear_pending(void *opaque, int irq, bool secure) } } -void armv7m_nvic_set_pending(void *opaque, int irq, bool secure) +static void do_armv7m_nvic_set_pending(void *opaque, int irq, bool secure, + bool derived) { + /* Pend an exception, including possibly escalating it to HardFault. + * + * This function handles both "normal" pending of interrupts and + * exceptions, and also derived exceptions (ones which occur as + * a result of trying to take some other exception). + * + * If derived == true, the caller guarantees that we are part way through + * trying to take an exception (but have not yet called + * armv7m_nvic_acknowledge_irq() to make it active), and so: + * - s->vectpending is the "original exception" we were trying to take + * - irq is the "derived exception" + * - nvic_exec_prio(s) gives the priority before exception entry + * Here we handle the prioritization logic which the pseudocode puts + * in the DerivedLateArrival() function. + */ + NVICState *s = (NVICState *)opaque; bool banked = exc_is_banked(irq); VecInfo *vec; @@ -514,7 +531,44 @@ void armv7m_nvic_set_pending(void *opaque, int irq, bool secure) vec = (banked && secure) ? &s->sec_vectors[irq] : &s->vectors[irq]; - trace_nvic_set_pending(irq, secure, vec->enabled, vec->prio); + trace_nvic_set_pending(irq, secure, derived, vec->enabled, vec->prio); + + if (derived) { + /* Derived exceptions are always synchronous. */ + assert(irq >= ARMV7M_EXCP_HARD && irq < ARMV7M_EXCP_PENDSV); + + if (irq == ARMV7M_EXCP_DEBUG && + exc_group_prio(s, vec->prio, secure) >= nvic_exec_prio(s)) { + /* DebugMonitorFault, but its priority is lower than the + * preempted exception priority: just ignore it. + */ + return; + } + + if (irq == ARMV7M_EXCP_HARD && vec->prio >= s->vectpending_prio) { + /* If this is a terminal exception (one which means we cannot + * take the original exception, like a failure to read its + * vector table entry), then we must take the derived exception. 
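Spelled out, the DerivedLateArrival handling that this hunk implements makes a three-way decision; a condensed sketch (priorities follow the NVIC convention that lower numbers are higher priority, and "terminal" means the derived exception was reported as HardFault):

    #include <stdbool.h>

    typedef enum { PEND_AND_PRIORITIZE, IGNORE_DERIVED, LOCKUP } Outcome;

    static Outcome derived_late_arrival(bool is_debug_monitor, bool is_terminal,
                                        int derived_prio, int original_prio,
                                        int exec_prio)
    {
        if (is_debug_monitor && derived_prio >= exec_prio) {
            return IGNORE_DERIVED;      /* DebugMonitor that can't preempt */
        }
        if (is_terminal && derived_prio >= original_prio) {
            return LOCKUP;              /* can't take either exception */
        }
        /* otherwise pend the derived exception and let the usual
         * highest-priority selection pick between it and the original */
        return PEND_AND_PRIORITIZE;
    }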
+ * If the derived exception can't take priority over the + * original exception, then we go into Lockup. + * + * For QEMU, we rely on the fact that a derived exception is + * terminal if and only if it's reported to us as HardFault, + * which saves having to have an extra argument is_terminal + * that we'd only use in one place. + */ + cpu_abort(&s->cpu->parent_obj, + "Lockup: can't take terminal derived exception " + "(original exception priority %d)\n", + s->vectpending_prio); + } + /* We now continue with the same code as for a normal pending + * exception, which will cause us to pend the derived exception. + * We'll then take either the original or the derived exception + * based on which is higher priority by the usual mechanism + * for selecting the highest priority pending interrupt. + */ + } if (irq >= ARMV7M_EXCP_HARD && irq < ARMV7M_EXCP_PENDSV) { /* If a synchronous exception is pending then it may be @@ -585,25 +639,31 @@ void armv7m_nvic_set_pending(void *opaque, int irq, bool secure) } } +void armv7m_nvic_set_pending(void *opaque, int irq, bool secure) +{ + do_armv7m_nvic_set_pending(opaque, irq, secure, false); +} + +void armv7m_nvic_set_pending_derived(void *opaque, int irq, bool secure) +{ + do_armv7m_nvic_set_pending(opaque, irq, secure, true); +} + /* Make pending IRQ active. */ -bool armv7m_nvic_acknowledge_irq(void *opaque) +void armv7m_nvic_acknowledge_irq(void *opaque) { NVICState *s = (NVICState *)opaque; CPUARMState *env = &s->cpu->env; const int pending = s->vectpending; const int running = nvic_exec_prio(s); VecInfo *vec; - bool targets_secure; assert(pending > ARMV7M_EXCP_RESET && pending < s->num_irq); if (s->vectpending_is_s_banked) { vec = &s->sec_vectors[pending]; - targets_secure = true; } else { vec = &s->vectors[pending]; - targets_secure = !exc_is_banked(s->vectpending) && - exc_targets_secure(s, s->vectpending); } assert(vec->enabled); @@ -611,7 +671,7 @@ bool armv7m_nvic_acknowledge_irq(void *opaque) assert(s->vectpending_prio < running); - trace_nvic_acknowledge_irq(pending, s->vectpending_prio, targets_secure); + trace_nvic_acknowledge_irq(pending, s->vectpending_prio); vec->active = 1; vec->pending = 0; @@ -619,8 +679,28 @@ bool armv7m_nvic_acknowledge_irq(void *opaque) write_v7m_exception(env, s->vectpending); nvic_irq_update(s); +} + +void armv7m_nvic_get_pending_irq_info(void *opaque, + int *pirq, bool *ptargets_secure) +{ + NVICState *s = (NVICState *)opaque; + const int pending = s->vectpending; + bool targets_secure; + + assert(pending > ARMV7M_EXCP_RESET && pending < s->num_irq); + + if (s->vectpending_is_s_banked) { + targets_secure = true; + } else { + targets_secure = !exc_is_banked(pending) && + exc_targets_secure(s, pending); + } + + trace_nvic_get_pending_irq_info(pending, targets_secure); - return targets_secure; + *ptargets_secure = targets_secure; + *pirq = pending; } int armv7m_nvic_complete_irq(void *opaque, int irq, bool secure) diff --git a/hw/intc/imx_gpcv2.c b/hw/intc/imx_gpcv2.c new file mode 100644 index 0000000000..4eb9ce2668 --- /dev/null +++ b/hw/intc/imx_gpcv2.c @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2018, Impinj, Inc. + * + * i.MX7 GPCv2 block emulation code + * + * Author: Andrey Smirnov <andrew.smirnov@gmail.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
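The GPCv2 model below completes power-gating requests instantly: the write handler clears the *_SW_Pxx_REQ bits before the guest can read them back, so firmware that polls for completion falls straight through. A hypothetical bare-metal caller (register base assumed, not taken from this patch):

    #include <stdint.h>

    #define GPC_BASE                0x303a0000u  /* assumed i.MX7 GPC base */
    #define GPC_PU_PGC_SW_PUP_REQ   0x0f8u
    #define PCIE_PHY_SW_Pxx_REQ     (1u << 1)

    static volatile uint32_t *gpc_reg(uint32_t off)
    {
        return (volatile uint32_t *)(uintptr_t)(GPC_BASE + off);
    }

    void power_up_pcie_phy(void)
    {
        *gpc_reg(GPC_PU_PGC_SW_PUP_REQ) = PCIE_PHY_SW_Pxx_REQ;
        /* real hardware clears the bit when the power-up completes;
         * the model has already cleared it by the first read */
        while (*gpc_reg(GPC_PU_PGC_SW_PUP_REQ) & PCIE_PHY_SW_Pxx_REQ) {
        }
    }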
+ */ + +#include "qemu/osdep.h" +#include "hw/intc/imx_gpcv2.h" +#include "qemu/log.h" + +#define GPC_PU_PGC_SW_PUP_REQ 0x0f8 +#define GPC_PU_PGC_SW_PDN_REQ 0x104 + +#define USB_HSIC_PHY_SW_Pxx_REQ BIT(4) +#define USB_OTG2_PHY_SW_Pxx_REQ BIT(3) +#define USB_OTG1_PHY_SW_Pxx_REQ BIT(2) +#define PCIE_PHY_SW_Pxx_REQ BIT(1) +#define MIPI_PHY_SW_Pxx_REQ BIT(0) + + +static void imx_gpcv2_reset(DeviceState *dev) +{ + IMXGPCv2State *s = IMX_GPCV2(dev); + + memset(s->regs, 0, sizeof(s->regs)); +} + +static uint64_t imx_gpcv2_read(void *opaque, hwaddr offset, + unsigned size) +{ + IMXGPCv2State *s = opaque; + + return s->regs[offset / sizeof(uint32_t)]; +} + +static void imx_gpcv2_write(void *opaque, hwaddr offset, + uint64_t value, unsigned size) +{ + IMXGPCv2State *s = opaque; + const size_t idx = offset / sizeof(uint32_t); + + s->regs[idx] = value; + + /* + * Real HW will clear those bits once as a way to indicate that + * power up request is complete + */ + if (offset == GPC_PU_PGC_SW_PUP_REQ || + offset == GPC_PU_PGC_SW_PDN_REQ) { + s->regs[idx] &= ~(USB_HSIC_PHY_SW_Pxx_REQ | + USB_OTG2_PHY_SW_Pxx_REQ | + USB_OTG1_PHY_SW_Pxx_REQ | + PCIE_PHY_SW_Pxx_REQ | + MIPI_PHY_SW_Pxx_REQ); + } +} + +static const struct MemoryRegionOps imx_gpcv2_ops = { + .read = imx_gpcv2_read, + .write = imx_gpcv2_write, + .endianness = DEVICE_NATIVE_ENDIAN, + .impl = { + /* + * Our device would not work correctly if the guest was doing + * unaligned access. This might not be a limitation on the real + * device but in practice there is no reason for a guest to access + * this device unaligned. + */ + .min_access_size = 4, + .max_access_size = 4, + .unaligned = false, + }, +}; + +static void imx_gpcv2_init(Object *obj) +{ + SysBusDevice *sd = SYS_BUS_DEVICE(obj); + IMXGPCv2State *s = IMX_GPCV2(obj); + + memory_region_init_io(&s->iomem, + obj, + &imx_gpcv2_ops, + s, + TYPE_IMX_GPCV2 ".iomem", + sizeof(s->regs)); + sysbus_init_mmio(sd, &s->iomem); +} + +static const VMStateDescription vmstate_imx_gpcv2 = { + .name = TYPE_IMX_GPCV2, + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32_ARRAY(regs, IMXGPCv2State, GPC_NUM), + VMSTATE_END_OF_LIST() + }, +}; + +static void imx_gpcv2_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->reset = imx_gpcv2_reset; + dc->vmsd = &vmstate_imx_gpcv2; + dc->desc = "i.MX GPCv2 Module"; +} + +static const TypeInfo imx_gpcv2_info = { + .name = TYPE_IMX_GPCV2, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(IMXGPCv2State), + .instance_init = imx_gpcv2_init, + .class_init = imx_gpcv2_class_init, +}; + +static void imx_gpcv2_register_type(void) +{ + type_register_static(&imx_gpcv2_info); +} +type_init(imx_gpcv2_register_type) diff --git a/hw/intc/s390_flic.c b/hw/intc/s390_flic.c index 6eaf178d79..a85a149c6d 100644 --- a/hw/intc/s390_flic.c +++ b/hw/intc/s390_flic.c @@ -22,16 +22,36 @@ #include "qapi/error.h" #include "hw/s390x/s390-virtio-ccw.h" +S390FLICStateClass *s390_get_flic_class(S390FLICState *fs) +{ + static S390FLICStateClass *class; + + if (!class) { + /* we only have one flic device, so this is fine to cache */ + class = S390_FLIC_COMMON_GET_CLASS(fs); + } + return class; +} + +QEMUS390FLICState *s390_get_qemu_flic(S390FLICState *fs) +{ + static QEMUS390FLICState *flic; + + if (!flic) { + /* we only have one flic device, so this is fine to cache */ + flic = QEMU_S390_FLIC(fs); + } + return flic; +} + S390FLICState *s390_get_flic(void) { static S390FLICState *fs; if (!fs) { - fs = 
S390_FLIC_COMMON(object_resolve_path(TYPE_KVM_S390_FLIC, NULL)); - if (!fs) { - fs = S390_FLIC_COMMON(object_resolve_path(TYPE_QEMU_S390_FLIC, - NULL)); - } + fs = S390_FLIC_COMMON(object_resolve_path_type("", + TYPE_S390_FLIC_COMMON, + NULL)); } return fs; } @@ -40,8 +60,11 @@ void s390_flic_init(void) { DeviceState *dev; - dev = s390_flic_kvm_create(); - if (!dev) { + if (kvm_enabled()) { + dev = qdev_create(NULL, TYPE_KVM_S390_FLIC); + object_property_add_child(qdev_get_machine(), TYPE_KVM_S390_FLIC, + OBJECT(dev), NULL); + } else { dev = qdev_create(NULL, TYPE_QEMU_S390_FLIC); object_property_add_child(qdev_get_machine(), TYPE_QEMU_S390_FLIC, OBJECT(dev), NULL); @@ -78,14 +101,41 @@ static void qemu_s390_release_adapter_routes(S390FLICState *fs, static int qemu_s390_clear_io_flic(S390FLICState *fs, uint16_t subchannel_id, uint16_t subchannel_nr) { - /* Fixme TCG */ - return -ENOSYS; + QEMUS390FLICState *flic = s390_get_qemu_flic(fs); + QEMUS390FlicIO *cur, *next; + uint8_t isc; + + g_assert(qemu_mutex_iothread_locked()); + if (!(flic->pending & FLIC_PENDING_IO)) { + return 0; + } + + /* check all iscs */ + for (isc = 0; isc < 8; isc++) { + if (QLIST_EMPTY(&flic->io[isc])) { + continue; + } + + /* search and delete any matching one */ + QLIST_FOREACH_SAFE(cur, &flic->io[isc], next, next) { + if (cur->id == subchannel_id && cur->nr == subchannel_nr) { + QLIST_REMOVE(cur, next); + g_free(cur); + } + } + + /* update our indicator bit */ + if (QLIST_EMPTY(&flic->io[isc])) { + flic->pending &= ~ISC_TO_PENDING_IO(isc); + } + } + return 0; } static int qemu_s390_modify_ais_mode(S390FLICState *fs, uint8_t isc, uint16_t mode) { - QEMUS390FLICState *flic = QEMU_S390_FLIC(fs); + QEMUS390FLICState *flic = s390_get_qemu_flic(fs); switch (mode) { case SIC_IRQ_MODE_ALL: @@ -106,7 +156,8 @@ static int qemu_s390_modify_ais_mode(S390FLICState *fs, uint8_t isc, static int qemu_s390_inject_airq(S390FLICState *fs, uint8_t type, uint8_t isc, uint8_t flags) { - QEMUS390FLICState *flic = QEMU_S390_FLIC(fs); + QEMUS390FLICState *flic = s390_get_qemu_flic(fs); + S390FLICStateClass *fsc = s390_get_flic_class(fs); bool flag = flags & S390_ADAPTER_SUPPRESSIBLE; uint32_t io_int_word = (isc << 27) | IO_INT_WORD_AI; @@ -115,7 +166,7 @@ static int qemu_s390_inject_airq(S390FLICState *fs, uint8_t type, return 0; } - s390_io_interrupt(0, 0, 0, io_int_word); + fsc->inject_io(fs, 0, 0, 0, io_int_word); if (flag && (flic->simm & AIS_MODE_MASK(isc))) { flic->nimm |= AIS_MODE_MASK(isc); @@ -126,12 +177,180 @@ static int qemu_s390_inject_airq(S390FLICState *fs, uint8_t type, return 0; } +static void qemu_s390_flic_notify(uint32_t type) +{ + CPUState *cs; + + /* + * We have to make all CPUs see CPU_INTERRUPT_HARD, so they might + * consider it. We will kick all running CPUs and only relevant + * sleeping ones. 
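The filtering that follows mirrors the s390 PSW interruption masks: a waiting CPU is only kicked if the pending interrupt class is actually enabled in its PSW. Reduced to its decision table (stand-in constants, not the real FLIC_PENDING_*/PSW_MASK_* values):

    #include <stdbool.h>
    #include <stdint.h>

    enum { PEND_SERVICE = 1, PEND_IO = 2, PEND_MCHK = 4 };
    enum { MASK_EXT = 1, MASK_IO = 2, MASK_MCHK = 4 };

    /* would this pending class wake a CPU waiting with the given PSW mask? */
    static bool wakes(uint32_t pending, uint32_t psw_mask)
    {
        if (pending & PEND_SERVICE) {
            return psw_mask & MASK_EXT;  /* service signal is an external irq */
        }
        if (pending & PEND_IO) {
            return psw_mask & MASK_IO;
        }
        if (pending & PEND_MCHK) {
            return psw_mask & MASK_MCHK;
        }
        return false;
    }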
+ */ + CPU_FOREACH(cs) { + S390CPU *cpu = S390_CPU(cs); + + cs->interrupt_request |= CPU_INTERRUPT_HARD; + + /* ignore CPUs that are not sleeping */ + if (s390_cpu_get_state(cpu) != CPU_STATE_OPERATING && + s390_cpu_get_state(cpu) != CPU_STATE_LOAD) { + continue; + } + + /* we always kick running CPUs for now, this is tricky */ + if (cs->halted) { + /* don't check for subclasses, CPUs double check when waking up */ + if (type & FLIC_PENDING_SERVICE) { + if (!(cpu->env.psw.mask & PSW_MASK_EXT)) { + continue; + } + } else if (type & FLIC_PENDING_IO) { + if (!(cpu->env.psw.mask & PSW_MASK_IO)) { + continue; + } + } else if (type & FLIC_PENDING_MCHK_CR) { + if (!(cpu->env.psw.mask & PSW_MASK_MCHECK)) { + continue; + } + } + } + cpu_interrupt(cs, CPU_INTERRUPT_HARD); + } +} + +uint32_t qemu_s390_flic_dequeue_service(QEMUS390FLICState *flic) +{ + uint32_t tmp; + + g_assert(qemu_mutex_iothread_locked()); + g_assert(flic->pending & FLIC_PENDING_SERVICE); + tmp = flic->service_param; + flic->service_param = 0; + flic->pending &= ~FLIC_PENDING_SERVICE; + + return tmp; +} + +/* caller has to free the returned object */ +QEMUS390FlicIO *qemu_s390_flic_dequeue_io(QEMUS390FLICState *flic, uint64_t cr6) +{ + QEMUS390FlicIO *io; + uint8_t isc; + + g_assert(qemu_mutex_iothread_locked()); + if (!(flic->pending & CR6_TO_PENDING_IO(cr6))) { + return NULL; + } + + for (isc = 0; isc < 8; isc++) { + if (QLIST_EMPTY(&flic->io[isc]) || !(cr6 & ISC_TO_ISC_BITS(isc))) { + continue; + } + io = QLIST_FIRST(&flic->io[isc]); + QLIST_REMOVE(io, next); + + /* update our indicator bit */ + if (QLIST_EMPTY(&flic->io[isc])) { + flic->pending &= ~ISC_TO_PENDING_IO(isc); + } + return io; + } + + return NULL; +} + +void qemu_s390_flic_dequeue_crw_mchk(QEMUS390FLICState *flic) +{ + g_assert(qemu_mutex_iothread_locked()); + g_assert(flic->pending & FLIC_PENDING_MCHK_CR); + flic->pending &= ~FLIC_PENDING_MCHK_CR; +} + +static void qemu_s390_inject_service(S390FLICState *fs, uint32_t parm) +{ + QEMUS390FLICState *flic = s390_get_qemu_flic(fs); + + g_assert(qemu_mutex_iothread_locked()); + /* multiplexing is good enough for sclp - kvm does it internally as well */ + flic->service_param |= parm; + flic->pending |= FLIC_PENDING_SERVICE; + + qemu_s390_flic_notify(FLIC_PENDING_SERVICE); +} + +static void qemu_s390_inject_io(S390FLICState *fs, uint16_t subchannel_id, + uint16_t subchannel_nr, uint32_t io_int_parm, + uint32_t io_int_word) +{ + const uint8_t isc = IO_INT_WORD_ISC(io_int_word); + QEMUS390FLICState *flic = s390_get_qemu_flic(fs); + QEMUS390FlicIO *io; + + g_assert(qemu_mutex_iothread_locked()); + io = g_new0(QEMUS390FlicIO, 1); + io->id = subchannel_id; + io->nr = subchannel_nr; + io->parm = io_int_parm; + io->word = io_int_word; + + QLIST_INSERT_HEAD(&flic->io[isc], io, next); + flic->pending |= ISC_TO_PENDING_IO(isc); + + qemu_s390_flic_notify(ISC_TO_PENDING_IO(isc)); +} + +static void qemu_s390_inject_crw_mchk(S390FLICState *fs) +{ + QEMUS390FLICState *flic = s390_get_qemu_flic(fs); + + g_assert(qemu_mutex_iothread_locked()); + flic->pending |= FLIC_PENDING_MCHK_CR; + + qemu_s390_flic_notify(FLIC_PENDING_MCHK_CR); +} + +bool qemu_s390_flic_has_service(QEMUS390FLICState *flic) +{ + /* called without lock via cc->has_work, will be validated under lock */ + return !!(flic->pending & FLIC_PENDING_SERVICE); +} + +bool qemu_s390_flic_has_io(QEMUS390FLICState *flic, uint64_t cr6) +{ + /* called without lock via cc->has_work, will be validated under lock */ + return !!(flic->pending & CR6_TO_PENDING_IO(cr6)); +} + +bool 
qemu_s390_flic_has_crw_mchk(QEMUS390FLICState *flic) +{ + /* called without lock via cc->has_work, will be validated under lock */ + return !!(flic->pending & FLIC_PENDING_MCHK_CR); +} + +bool qemu_s390_flic_has_any(QEMUS390FLICState *flic) +{ + g_assert(qemu_mutex_iothread_locked()); + return !!flic->pending; +} + static void qemu_s390_flic_reset(DeviceState *dev) { QEMUS390FLICState *flic = QEMU_S390_FLIC(dev); + QEMUS390FlicIO *cur, *next; + int isc; + g_assert(qemu_mutex_iothread_locked()); flic->simm = 0; flic->nimm = 0; + flic->pending = 0; + + /* remove all pending io interrupts */ + for (isc = 0; isc < 8; isc++) { + QLIST_FOREACH_SAFE(cur, &flic->io[isc], next, next) { + QLIST_REMOVE(cur, next); + g_free(cur); + } + } } bool ais_needed(void *opaque) @@ -153,6 +372,16 @@ static const VMStateDescription qemu_s390_flic_vmstate = { } }; +static void qemu_s390_flic_instance_init(Object *obj) +{ + QEMUS390FLICState *flic = QEMU_S390_FLIC(obj); + int isc; + + for (isc = 0; isc < 8; isc++) { + QLIST_INIT(&flic->io[isc]); + } +} + static void qemu_s390_flic_class_init(ObjectClass *oc, void *data) { DeviceClass *dc = DEVICE_CLASS(oc); @@ -167,6 +396,9 @@ static void qemu_s390_flic_class_init(ObjectClass *oc, void *data) fsc->clear_io_irq = qemu_s390_clear_io_flic; fsc->modify_ais_mode = qemu_s390_modify_ais_mode; fsc->inject_airq = qemu_s390_inject_airq; + fsc->inject_service = qemu_s390_inject_service; + fsc->inject_io = qemu_s390_inject_io; + fsc->inject_crw_mchk = qemu_s390_inject_crw_mchk; } static Property s390_flic_common_properties[] = { @@ -201,6 +433,7 @@ static const TypeInfo qemu_s390_flic_info = { .name = TYPE_QEMU_S390_FLIC, .parent = TYPE_S390_FLIC_COMMON, .instance_size = sizeof(QEMUS390FLICState), + .instance_init = qemu_s390_flic_instance_init, .class_init = qemu_s390_flic_class_init, }; diff --git a/hw/intc/s390_flic_kvm.c b/hw/intc/s390_flic_kvm.c index d208cb81c4..3f804ad52e 100644 --- a/hw/intc/s390_flic_kvm.c +++ b/hw/intc/s390_flic_kvm.c @@ -35,16 +35,15 @@ typedef struct KVMS390FLICState { bool clear_io_supported; } KVMS390FLICState; -DeviceState *s390_flic_kvm_create(void) +static KVMS390FLICState *s390_get_kvm_flic(S390FLICState *fs) { - DeviceState *dev = NULL; + static KVMS390FLICState *flic; - if (kvm_enabled()) { - dev = qdev_create(NULL, TYPE_KVM_S390_FLIC); - object_property_add_child(qdev_get_machine(), TYPE_KVM_S390_FLIC, - OBJECT(dev), NULL); + if (!flic) { + /* we only have one flic device, so this is fine to cache */ + flic = KVM_S390_FLIC(fs); } - return dev; + return flic; } /** @@ -123,20 +122,70 @@ static int flic_enqueue_irqs(void *buf, uint64_t len, return rc ? 
-errno : 0; } -int kvm_s390_inject_flic(struct kvm_s390_irq *irq) +static void kvm_s390_inject_flic(S390FLICState *fs, struct kvm_s390_irq *irq) { - static KVMS390FLICState *flic; + static bool use_flic = true; + int r; + + if (use_flic) { + r = flic_enqueue_irqs(irq, sizeof(*irq), s390_get_kvm_flic(fs)); + if (r == -ENOSYS) { + use_flic = false; + } + if (!r) { + return; + } + } + /* fallback to legacy KVM IOCTL in case FLIC fails */ + kvm_s390_floating_interrupt_legacy(irq); +} + +static void kvm_s390_inject_service(S390FLICState *fs, uint32_t parm) +{ + struct kvm_s390_irq irq = { + .type = KVM_S390_INT_SERVICE, + .u.ext.ext_params = parm, + }; + + kvm_s390_inject_flic(fs, &irq); +} - if (unlikely(!flic)) { - flic = KVM_S390_FLIC(s390_get_flic()); +static void kvm_s390_inject_io(S390FLICState *fs, uint16_t subchannel_id, + uint16_t subchannel_nr, uint32_t io_int_parm, + uint32_t io_int_word) +{ + struct kvm_s390_irq irq = { + .u.io.subchannel_id = subchannel_id, + .u.io.subchannel_nr = subchannel_nr, + .u.io.io_int_parm = io_int_parm, + .u.io.io_int_word = io_int_word, + }; + + if (io_int_word & IO_INT_WORD_AI) { + irq.type = KVM_S390_INT_IO(1, 0, 0, 0); + } else { + irq.type = KVM_S390_INT_IO(0, (subchannel_id & 0xff00) >> 8, + (subchannel_id & 0x0006), + subchannel_nr); } - return flic_enqueue_irqs(irq, sizeof(*irq), flic); + kvm_s390_inject_flic(fs, &irq); +} + +static void kvm_s390_inject_crw_mchk(S390FLICState *fs) +{ + struct kvm_s390_irq irq = { + .type = KVM_S390_MCHK, + .u.mchk.cr14 = CR14_CHANNEL_REPORT_SC, + .u.mchk.mcic = s390_build_validity_mcic() | MCIC_SC_CP, + }; + + kvm_s390_inject_flic(fs, &irq); } static int kvm_s390_clear_io_flic(S390FLICState *fs, uint16_t subchannel_id, uint16_t subchannel_nr) { - KVMS390FLICState *flic = KVM_S390_FLIC(fs); + KVMS390FLICState *flic = s390_get_kvm_flic(fs); int rc; uint32_t sid = subchannel_id << 16 | subchannel_nr; struct kvm_device_attr attr = { @@ -154,7 +203,7 @@ static int kvm_s390_clear_io_flic(S390FLICState *fs, uint16_t subchannel_id, static int kvm_s390_modify_ais_mode(S390FLICState *fs, uint8_t isc, uint16_t mode) { - KVMS390FLICState *flic = KVM_S390_FLIC(fs); + KVMS390FLICState *flic = s390_get_kvm_flic(fs); struct kvm_s390_ais_req req = { .isc = isc, .mode = mode, @@ -174,7 +223,7 @@ static int kvm_s390_modify_ais_mode(S390FLICState *fs, uint8_t isc, static int kvm_s390_inject_airq(S390FLICState *fs, uint8_t type, uint8_t isc, uint8_t flags) { - KVMS390FLICState *flic = KVM_S390_FLIC(fs); + KVMS390FLICState *flic = s390_get_kvm_flic(fs); uint32_t id = css_get_adapter_id(type, isc); struct kvm_device_attr attr = { .group = KVM_DEV_FLIC_AIRQ_INJECT, @@ -263,7 +312,7 @@ static int kvm_s390_io_adapter_map(S390FLICState *fs, uint32_t id, .group = KVM_DEV_FLIC_ADAPTER_MODIFY, .addr = (uint64_t)&req, }; - KVMS390FLICState *flic = KVM_S390_FLIC(fs); + KVMS390FLICState *flic = s390_get_kvm_flic(fs); int r; if (!kvm_gsi_routing_enabled()) { @@ -614,6 +663,9 @@ static void kvm_s390_flic_class_init(ObjectClass *oc, void *data) fsc->clear_io_irq = kvm_s390_clear_io_flic; fsc->modify_ais_mode = kvm_s390_modify_ais_mode; fsc->inject_airq = kvm_s390_inject_airq; + fsc->inject_service = kvm_s390_inject_service; + fsc->inject_io = kvm_s390_inject_io; + fsc->inject_crw_mchk = kvm_s390_inject_crw_mchk; } static const TypeInfo kvm_s390_flic_info = { diff --git a/hw/intc/trace-events b/hw/intc/trace-events index be769186fc..4092d2825e 100644 --- a/hw/intc/trace-events +++ b/hw/intc/trace-events @@ -177,10 +177,11 @@ nvic_set_prio(int irq, 
bool secure, uint8_t prio) "NVIC set irq %d secure-bank % nvic_irq_update(int vectpending, int pendprio, int exception_prio, int level) "NVIC vectpending %d pending prio %d exception_prio %d: setting irq line to %d" nvic_escalate_prio(int irq, int irqprio, int runprio) "NVIC escalating irq %d to HardFault: insufficient priority %d >= %d" nvic_escalate_disabled(int irq) "NVIC escalating irq %d to HardFault: disabled" -nvic_set_pending(int irq, bool secure, int en, int prio) "NVIC set pending irq %d secure-bank %d (enabled: %d priority %d)" +nvic_set_pending(int irq, bool secure, bool derived, int en, int prio) "NVIC set pending irq %d secure-bank %d derived %d (enabled: %d priority %d)" nvic_clear_pending(int irq, bool secure, int en, int prio) "NVIC clear pending irq %d secure-bank %d (enabled: %d priority %d)" nvic_set_pending_level(int irq) "NVIC set pending: irq %d higher prio than vectpending: setting irq line to 1" -nvic_acknowledge_irq(int irq, int prio, bool targets_secure) "NVIC acknowledge IRQ: %d now active (prio %d targets_secure %d)" +nvic_acknowledge_irq(int irq, int prio) "NVIC acknowledge IRQ: %d now active (prio %d)" +nvic_get_pending_irq_info(int irq, bool secure) "NVIC next IRQ %d: targets_secure: %d" nvic_complete_irq(int irq, bool secure) "NVIC complete IRQ %d (secure %d)" nvic_set_irq_level(int irq, int level) "NVIC external irq %d level set to %d" nvic_sysreg_read(uint64_t addr, uint32_t value, unsigned size) "NVIC sysreg read addr 0x%" PRIx64 " data 0x%" PRIx32 " size %u" diff --git a/hw/misc/Makefile.objs b/hw/misc/Makefile.objs index d517f83e81..fce426eb75 100644 --- a/hw/misc/Makefile.objs +++ b/hw/misc/Makefile.objs @@ -33,6 +33,10 @@ obj-$(CONFIG_IMX) += imx31_ccm.o obj-$(CONFIG_IMX) += imx25_ccm.o obj-$(CONFIG_IMX) += imx6_ccm.o obj-$(CONFIG_IMX) += imx6_src.o +obj-$(CONFIG_IMX) += imx7_ccm.o +obj-$(CONFIG_IMX) += imx2_wdt.o +obj-$(CONFIG_IMX) += imx7_snvs.o +obj-$(CONFIG_IMX) += imx7_gpr.o obj-$(CONFIG_MILKYMIST) += milkymist-hpdmc.o obj-$(CONFIG_MILKYMIST) += milkymist-pfpu.o obj-$(CONFIG_MAINSTONE) += mst_fpga.o diff --git a/hw/misc/imx2_wdt.c b/hw/misc/imx2_wdt.c new file mode 100644 index 0000000000..e47e442592 --- /dev/null +++ b/hw/misc/imx2_wdt.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2018, Impinj, Inc. + * + * i.MX2 Watchdog IP block + * + * Author: Andrey Smirnov <andrew.smirnov@gmail.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/bitops.h" +#include "sysemu/watchdog.h" + +#include "hw/misc/imx2_wdt.h" + +#define IMX2_WDT_WCR_WDA BIT(5) /* -> External Reset WDOG_B */ +#define IMX2_WDT_WCR_SRS BIT(4) /* -> Software Reset Signal */ + +static uint64_t imx2_wdt_read(void *opaque, hwaddr addr, + unsigned int size) +{ + return 0; +} + +static void imx2_wdt_write(void *opaque, hwaddr addr, + uint64_t value, unsigned int size) +{ + if (addr == IMX2_WDT_WCR && + (value & (IMX2_WDT_WCR_WDA | IMX2_WDT_WCR_SRS))) { + watchdog_perform_action(); + } +} + +static const MemoryRegionOps imx2_wdt_ops = { + .read = imx2_wdt_read, + .write = imx2_wdt_write, + .endianness = DEVICE_NATIVE_ENDIAN, + .impl = { + /* + * Our device would not work correctly if the guest was doing + * unaligned access. This might not be a limitation on the + * real device but in practice there is no reason for a guest + * to access this device unaligned. 
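The .impl block here tells QEMU's memory core that the model only implements aligned 32-bit accesses, and the core synthesizes anything narrower on top of that. For a one-byte guest store this amounts to a read-modify-write of the containing word, roughly like this simplified standalone sketch (little-endian byte placement chosen for concreteness; this is not the actual core code):

    #include <assert.h>
    #include <stdint.h>

    static uint32_t reg;                    /* stand-in for one device register */

    static uint64_t dev_read4(void)        { return reg; }
    static void     dev_write4(uint64_t v) { reg = (uint32_t)v; }

    /* what the core does for a 1-byte store at byte offset off */
    static void store_byte(unsigned off, uint8_t byte)
    {
        uint64_t word = dev_read4();                  /* aligned 32-bit read  */
        unsigned shift = (off & 3) * 8;
        word = (word & ~(0xffULL << shift)) | ((uint64_t)byte << shift);
        dev_write4(word);                             /* aligned 32-bit write */
    }

    int main(void)
    {
        reg = 0x11223344;
        store_byte(1, 0xab);
        assert(reg == 0x1122ab44);
        return 0;
    }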
+ */ + .min_access_size = 4, + .max_access_size = 4, + .unaligned = false, + }, +}; + +static void imx2_wdt_realize(DeviceState *dev, Error **errp) +{ + IMX2WdtState *s = IMX2_WDT(dev); + + memory_region_init_io(&s->mmio, OBJECT(dev), + &imx2_wdt_ops, s, + TYPE_IMX2_WDT".mmio", + IMX2_WDT_REG_NUM * sizeof(uint16_t)); + sysbus_init_mmio(SYS_BUS_DEVICE(dev), &s->mmio); +} + +static void imx2_wdt_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->realize = imx2_wdt_realize; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); +} + +static const TypeInfo imx2_wdt_info = { + .name = TYPE_IMX2_WDT, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(IMX2WdtState), + .class_init = imx2_wdt_class_init, +}; + +static WatchdogTimerModel model = { + .wdt_name = "imx2-watchdog", + .wdt_description = "i.MX2 Watchdog", +}; + +static void imx2_wdt_register_type(void) +{ + watchdog_add_model(&model); + type_register_static(&imx2_wdt_info); +} +type_init(imx2_wdt_register_type) diff --git a/hw/misc/imx7_ccm.c b/hw/misc/imx7_ccm.c new file mode 100644 index 0000000000..d90c48bfec --- /dev/null +++ b/hw/misc/imx7_ccm.c @@ -0,0 +1,277 @@ +/* + * Copyright (c) 2018, Impinj, Inc. + * + * i.MX7 CCM, PMU and ANALOG IP blocks emulation code + * + * Author: Andrey Smirnov <andrew.smirnov@gmail.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/log.h" + +#include "hw/misc/imx7_ccm.h" + +static void imx7_analog_reset(DeviceState *dev) +{ + IMX7AnalogState *s = IMX7_ANALOG(dev); + + memset(s->pmu, 0, sizeof(s->pmu)); + memset(s->analog, 0, sizeof(s->analog)); + + s->analog[ANALOG_PLL_ARM] = 0x00002042; + s->analog[ANALOG_PLL_DDR] = 0x0060302c; + s->analog[ANALOG_PLL_DDR_SS] = 0x00000000; + s->analog[ANALOG_PLL_DDR_NUM] = 0x06aaac4d; + s->analog[ANALOG_PLL_DDR_DENOM] = 0x100003ec; + s->analog[ANALOG_PLL_480] = 0x00002000; + s->analog[ANALOG_PLL_480A] = 0x52605a56; + s->analog[ANALOG_PLL_480B] = 0x52525216; + s->analog[ANALOG_PLL_ENET] = 0x00001fc0; + s->analog[ANALOG_PLL_AUDIO] = 0x0001301b; + s->analog[ANALOG_PLL_AUDIO_SS] = 0x00000000; + s->analog[ANALOG_PLL_AUDIO_NUM] = 0x05f5e100; + s->analog[ANALOG_PLL_AUDIO_DENOM] = 0x2964619c; + s->analog[ANALOG_PLL_VIDEO] = 0x0008201b; + s->analog[ANALOG_PLL_VIDEO_SS] = 0x00000000; + s->analog[ANALOG_PLL_VIDEO_NUM] = 0x0000f699; + s->analog[ANALOG_PLL_VIDEO_DENOM] = 0x000f4240; + s->analog[ANALOG_PLL_MISC0] = 0x00000000; + + /* all PLLs need to be locked */ + s->analog[ANALOG_PLL_ARM] |= ANALOG_PLL_LOCK; + s->analog[ANALOG_PLL_DDR] |= ANALOG_PLL_LOCK; + s->analog[ANALOG_PLL_480] |= ANALOG_PLL_LOCK; + s->analog[ANALOG_PLL_480A] |= ANALOG_PLL_LOCK; + s->analog[ANALOG_PLL_480B] |= ANALOG_PLL_LOCK; + s->analog[ANALOG_PLL_ENET] |= ANALOG_PLL_LOCK; + s->analog[ANALOG_PLL_AUDIO] |= ANALOG_PLL_LOCK; + s->analog[ANALOG_PLL_VIDEO] |= ANALOG_PLL_LOCK; + s->analog[ANALOG_PLL_MISC0] |= ANALOG_PLL_LOCK; + + /* + * Since I couldn't find any info about this in the reference + * manual the value of this register is based strictly on matching + * what Linux kernel expects it to be. + */ + s->analog[ANALOG_DIGPROG] = 0x720000; + /* + * Set revision to be 1.0 (Arbitrary choice, no particular + * reason). 
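The ANALOG_DIGPROG value assembled here (0x720000 identifying the SoC, with 0x10 OR-ed in just below as silicon revision 1.0) is what the Linux SoC-revision probe parses. A sketch of the usual decode, assuming the field layout implied by these two constants:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t digprog = 0x720000 | 0x000010;   /* reset value set above */

        unsigned soc       = digprog >> 16;       /* 0x72: i.MX7 family id */
        unsigned rev_major = (digprog >> 4) & 0xf;
        unsigned rev_minor = digprog & 0xf;

        assert(soc == 0x72 && rev_major == 1 && rev_minor == 0);   /* "1.0" */
        return 0;
    }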
+ */ + s->analog[ANALOG_DIGPROG] |= 0x000010; +} + +static void imx7_ccm_reset(DeviceState *dev) +{ + IMX7CCMState *s = IMX7_CCM(dev); + + memset(s->ccm, 0, sizeof(s->ccm)); +} + +#define CCM_INDEX(offset) (((offset) & ~(hwaddr)0xF) / sizeof(uint32_t)) +#define CCM_BITOP(offset) ((offset) & (hwaddr)0xF) + +enum { + CCM_BITOP_NONE = 0x00, + CCM_BITOP_SET = 0x04, + CCM_BITOP_CLR = 0x08, + CCM_BITOP_TOG = 0x0C, +}; + +static uint64_t imx7_set_clr_tog_read(void *opaque, hwaddr offset, + unsigned size) +{ + const uint32_t *mmio = opaque; + + return mmio[CCM_INDEX(offset)]; +} + +static void imx7_set_clr_tog_write(void *opaque, hwaddr offset, + uint64_t value, unsigned size) +{ + const uint8_t bitop = CCM_BITOP(offset); + const uint32_t index = CCM_INDEX(offset); + uint32_t *mmio = opaque; + + switch (bitop) { + case CCM_BITOP_NONE: + mmio[index] = value; + break; + case CCM_BITOP_SET: + mmio[index] |= value; + break; + case CCM_BITOP_CLR: + mmio[index] &= ~value; + break; + case CCM_BITOP_TOG: + mmio[index] ^= value; + break; + }; +} + +static const struct MemoryRegionOps imx7_set_clr_tog_ops = { + .read = imx7_set_clr_tog_read, + .write = imx7_set_clr_tog_write, + .endianness = DEVICE_NATIVE_ENDIAN, + .impl = { + /* + * Our device would not work correctly if the guest was doing + * unaligned access. This might not be a limitation on the real + * device but in practice there is no reason for a guest to access + * this device unaligned. + */ + .min_access_size = 4, + .max_access_size = 4, + .unaligned = false, + }, +}; + +static const struct MemoryRegionOps imx7_digprog_ops = { + .read = imx7_set_clr_tog_read, + .endianness = DEVICE_NATIVE_ENDIAN, + .impl = { + .min_access_size = 4, + .max_access_size = 4, + .unaligned = false, + }, +}; + +static void imx7_ccm_init(Object *obj) +{ + SysBusDevice *sd = SYS_BUS_DEVICE(obj); + IMX7CCMState *s = IMX7_CCM(obj); + + memory_region_init_io(&s->iomem, + obj, + &imx7_set_clr_tog_ops, + s->ccm, + TYPE_IMX7_CCM ".ccm", + sizeof(s->ccm)); + + sysbus_init_mmio(sd, &s->iomem); +} + +static void imx7_analog_init(Object *obj) +{ + SysBusDevice *sd = SYS_BUS_DEVICE(obj); + IMX7AnalogState *s = IMX7_ANALOG(obj); + + memory_region_init(&s->mmio.container, obj, TYPE_IMX7_ANALOG, + 0x10000); + + memory_region_init_io(&s->mmio.analog, + obj, + &imx7_set_clr_tog_ops, + s->analog, + TYPE_IMX7_ANALOG, + sizeof(s->analog)); + + memory_region_add_subregion(&s->mmio.container, + 0x60, &s->mmio.analog); + + memory_region_init_io(&s->mmio.pmu, + obj, + &imx7_set_clr_tog_ops, + s->pmu, + TYPE_IMX7_ANALOG ".pmu", + sizeof(s->pmu)); + + memory_region_add_subregion(&s->mmio.container, + 0x200, &s->mmio.pmu); + + memory_region_init_io(&s->mmio.digprog, + obj, + &imx7_digprog_ops, + &s->analog[ANALOG_DIGPROG], + TYPE_IMX7_ANALOG ".digprog", + sizeof(uint32_t)); + + memory_region_add_subregion_overlap(&s->mmio.container, + 0x800, &s->mmio.digprog, 10); + + + sysbus_init_mmio(sd, &s->mmio.container); +} + +static const VMStateDescription vmstate_imx7_ccm = { + .name = TYPE_IMX7_CCM, + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32_ARRAY(ccm, IMX7CCMState, CCM_MAX), + VMSTATE_END_OF_LIST() + }, +}; + +static uint32_t imx7_ccm_get_clock_frequency(IMXCCMState *dev, IMXClk clock) +{ + /* + * This function is "consumed" by GPT emulation code, however on + * i.MX7 each GPT block can have their own clock root. 
This means
+     * that this function somehow needs to know the requester's
+     * identity; whether to pass it via additional IMXClk constants or
+     * via an extra argument to this method still needs to be figured
+     * out.
+     */
+    qemu_log_mask(LOG_GUEST_ERROR, "[%s]%s: Not implemented\n",
+                  TYPE_IMX7_CCM, __func__);
+    return 0;
+}
+
+static void imx7_ccm_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    IMXCCMClass *ccm = IMX_CCM_CLASS(klass);
+
+    dc->reset = imx7_ccm_reset;
+    dc->vmsd = &vmstate_imx7_ccm;
+    dc->desc = "i.MX7 Clock Control Module";
+
+    ccm->get_clock_frequency = imx7_ccm_get_clock_frequency;
+}
+
+static const TypeInfo imx7_ccm_info = {
+    .name = TYPE_IMX7_CCM,
+    .parent = TYPE_IMX_CCM,
+    .instance_size = sizeof(IMX7CCMState),
+    .instance_init = imx7_ccm_init,
+    .class_init = imx7_ccm_class_init,
+};
+
+static const VMStateDescription vmstate_imx7_analog = {
+    .name = TYPE_IMX7_ANALOG,
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT32_ARRAY(analog, IMX7AnalogState, ANALOG_MAX),
+        VMSTATE_UINT32_ARRAY(pmu, IMX7AnalogState, PMU_MAX),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
+static void imx7_analog_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    dc->reset = imx7_analog_reset;
+    dc->vmsd = &vmstate_imx7_analog;
+    dc->desc = "i.MX7 Analog Module";
+}
+
+static const TypeInfo imx7_analog_info = {
+    .name = TYPE_IMX7_ANALOG,
+    .parent = TYPE_SYS_BUS_DEVICE,
+    .instance_size = sizeof(IMX7AnalogState),
+    .instance_init = imx7_analog_init,
+    .class_init = imx7_analog_class_init,
+};
+
+static void imx7_ccm_register_type(void)
+{
+    type_register_static(&imx7_ccm_info);
+    type_register_static(&imx7_analog_info);
+}
+type_init(imx7_ccm_register_type)
diff --git a/hw/misc/imx7_gpr.c b/hw/misc/imx7_gpr.c
new file mode 100644
index 0000000000..c2a9df29c6
--- /dev/null
+++ b/hw/misc/imx7_gpr.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2018, Impinj, Inc.
+ *
+ * i.MX7 GPR IP block emulation code
+ *
+ * Author: Andrey Smirnov <andrew.smirnov@gmail.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ * Bare minimum emulation code needed to support being able to shut
+ * down a Linux guest gracefully.
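imx7_set_clr_tog_write() above implements the i.MX "SCT" register convention: every 32-bit register occupies a 16-byte stride in which offset +0x0 is a plain store, +0x4 sets the written bits, +0x8 clears them and +0xC toggles them, which is also why the register enum in imx7_ccm.h comes in _SET/_CLR/_TOG quadruplets. A standalone sketch of that dispatch (names invented; the offset decode mirrors CCM_BITOP()):

    #include <assert.h>
    #include <stdint.h>

    static uint32_t reg;                      /* one 32-bit SCT register */

    static void sct_write(uint32_t offset, uint32_t val)
    {
        switch (offset & 0xc) {               /* CCM_BITOP() equivalent */
        case 0x0: reg  = val;  break;         /* plain store         */
        case 0x4: reg |= val;  break;         /* write-one-to-set    */
        case 0x8: reg &= ~val; break;         /* write-one-to-clear  */
        case 0xc: reg ^= val;  break;         /* write-one-to-toggle */
        }
    }

    int main(void)
    {
        sct_write(0x0, 0xff00);
        sct_write(0x4, 0x000f);    /* set    -> 0xff0f */
        sct_write(0x8, 0x0f00);    /* clear  -> 0xf00f */
        sct_write(0xc, 0xffff);    /* toggle -> 0x0ff0 */
        assert(reg == 0x0ff0);
        return 0;
    }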
+ */ + +#include "qemu/osdep.h" +#include "hw/misc/imx7_gpr.h" +#include "qemu/log.h" +#include "sysemu/sysemu.h" + +#include "trace.h" + +enum IMX7GPRRegisters { + IOMUXC_GPR0 = 0x00, + IOMUXC_GPR1 = 0x04, + IOMUXC_GPR2 = 0x08, + IOMUXC_GPR3 = 0x0c, + IOMUXC_GPR4 = 0x10, + IOMUXC_GPR5 = 0x14, + IOMUXC_GPR6 = 0x18, + IOMUXC_GPR7 = 0x1c, + IOMUXC_GPR8 = 0x20, + IOMUXC_GPR9 = 0x24, + IOMUXC_GPR10 = 0x28, + IOMUXC_GPR11 = 0x2c, + IOMUXC_GPR12 = 0x30, + IOMUXC_GPR13 = 0x34, + IOMUXC_GPR14 = 0x38, + IOMUXC_GPR15 = 0x3c, + IOMUXC_GPR16 = 0x40, + IOMUXC_GPR17 = 0x44, + IOMUXC_GPR18 = 0x48, + IOMUXC_GPR19 = 0x4c, + IOMUXC_GPR20 = 0x50, + IOMUXC_GPR21 = 0x54, + IOMUXC_GPR22 = 0x58, +}; + +#define IMX7D_GPR1_IRQ_MASK BIT(12) +#define IMX7D_GPR1_ENET1_TX_CLK_SEL_MASK BIT(13) +#define IMX7D_GPR1_ENET2_TX_CLK_SEL_MASK BIT(14) +#define IMX7D_GPR1_ENET_TX_CLK_SEL_MASK (0x3 << 13) +#define IMX7D_GPR1_ENET1_CLK_DIR_MASK BIT(17) +#define IMX7D_GPR1_ENET2_CLK_DIR_MASK BIT(18) +#define IMX7D_GPR1_ENET_CLK_DIR_MASK (0x3 << 17) + +#define IMX7D_GPR5_CSI_MUX_CONTROL_MIPI BIT(4) +#define IMX7D_GPR12_PCIE_PHY_REFCLK_SEL BIT(5) +#define IMX7D_GPR22_PCIE_PHY_PLL_LOCKED BIT(31) + + +static uint64_t imx7_gpr_read(void *opaque, hwaddr offset, unsigned size) +{ + trace_imx7_gpr_read(offset); + + if (offset == IOMUXC_GPR22) { + return IMX7D_GPR22_PCIE_PHY_PLL_LOCKED; + } + + return 0; +} + +static void imx7_gpr_write(void *opaque, hwaddr offset, + uint64_t v, unsigned size) +{ + trace_imx7_gpr_write(offset, v); +} + +static const struct MemoryRegionOps imx7_gpr_ops = { + .read = imx7_gpr_read, + .write = imx7_gpr_write, + .endianness = DEVICE_NATIVE_ENDIAN, + .impl = { + /* + * Our device would not work correctly if the guest was doing + * unaligned access. This might not be a limitation on the + * real device but in practice there is no reason for a guest + * to access this device unaligned. + */ + .min_access_size = 4, + .max_access_size = 4, + .unaligned = false, + }, +}; + +static void imx7_gpr_init(Object *obj) +{ + SysBusDevice *sd = SYS_BUS_DEVICE(obj); + IMX7GPRState *s = IMX7_GPR(obj); + + memory_region_init_io(&s->mmio, obj, &imx7_gpr_ops, s, + TYPE_IMX7_GPR, 64 * 1024); + sysbus_init_mmio(sd, &s->mmio); +} + +static void imx7_gpr_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->desc = "i.MX7 General Purpose Registers Module"; +} + +static const TypeInfo imx7_gpr_info = { + .name = TYPE_IMX7_GPR, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(IMX7GPRState), + .instance_init = imx7_gpr_init, + .class_init = imx7_gpr_class_init, +}; + +static void imx7_gpr_register_type(void) +{ + type_register_static(&imx7_gpr_info); +} +type_init(imx7_gpr_register_type) diff --git a/hw/misc/imx7_snvs.c b/hw/misc/imx7_snvs.c new file mode 100644 index 0000000000..4df482b282 --- /dev/null +++ b/hw/misc/imx7_snvs.c @@ -0,0 +1,83 @@ +/* + * IMX7 Secure Non-Volatile Storage + * + * Copyright (c) 2018, Impinj, Inc. + * + * Author: Andrey Smirnov <andrew.smirnov@gmail.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + * Bare minimum emulation code needed to support being able to shut + * down linux guest gracefully. 
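The single non-zero read in imx7_gpr_read() above, IOMUXC_GPR22 with IMX7D_GPR22_PCIE_PHY_PLL_LOCKED set, exists so that a guest PCIe driver polling for PHY PLL lock makes progress instead of spinning forever. A standalone sketch of the kind of poll loop this stub satisfies (driver-style code invented for illustration):

    #include <assert.h>
    #include <stdint.h>

    /* stand-in for an MMIO read of the IOMUXC GPR block modelled above */
    static uint32_t gpr_read(uint32_t offset)
    {
        return offset == 0x58 ? (1u << 31) : 0;   /* GPR22: PLL locked */
    }

    int main(void)
    {
        int spins = 0;
        while (!(gpr_read(0x58) & (1u << 31))) {  /* guest-style busy wait */
            spins++;
        }
        assert(spins == 0);            /* the stub reports lock immediately */
        return 0;
    }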
+ */ + +#include "qemu/osdep.h" +#include "hw/misc/imx7_snvs.h" +#include "qemu/log.h" +#include "sysemu/sysemu.h" + +static uint64_t imx7_snvs_read(void *opaque, hwaddr offset, unsigned size) +{ + return 0; +} + +static void imx7_snvs_write(void *opaque, hwaddr offset, + uint64_t v, unsigned size) +{ + const uint32_t value = v; + const uint32_t mask = SNVS_LPCR_TOP | SNVS_LPCR_DP_EN; + + if (offset == SNVS_LPCR && ((value & mask) == mask)) { + qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); + } +} + +static const struct MemoryRegionOps imx7_snvs_ops = { + .read = imx7_snvs_read, + .write = imx7_snvs_write, + .endianness = DEVICE_NATIVE_ENDIAN, + .impl = { + /* + * Our device would not work correctly if the guest was doing + * unaligned access. This might not be a limitation on the real + * device but in practice there is no reason for a guest to access + * this device unaligned. + */ + .min_access_size = 4, + .max_access_size = 4, + .unaligned = false, + }, +}; + +static void imx7_snvs_init(Object *obj) +{ + SysBusDevice *sd = SYS_BUS_DEVICE(obj); + IMX7SNVSState *s = IMX7_SNVS(obj); + + memory_region_init_io(&s->mmio, obj, &imx7_snvs_ops, s, + TYPE_IMX7_SNVS, 0x1000); + + sysbus_init_mmio(sd, &s->mmio); +} + +static void imx7_snvs_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + + dc->desc = "i.MX7 Secure Non-Volatile Storage Module"; +} + +static const TypeInfo imx7_snvs_info = { + .name = TYPE_IMX7_SNVS, + .parent = TYPE_SYS_BUS_DEVICE, + .instance_size = sizeof(IMX7SNVSState), + .instance_init = imx7_snvs_init, + .class_init = imx7_snvs_class_init, +}; + +static void imx7_snvs_register_type(void) +{ + type_register_static(&imx7_snvs_info); +} +type_init(imx7_snvs_register_type) diff --git a/hw/misc/trace-events b/hw/misc/trace-events index 616579a403..e6070f280d 100644 --- a/hw/misc/trace-events +++ b/hw/misc/trace-events @@ -66,3 +66,7 @@ mps2_scc_cfg_read(unsigned function, unsigned device, uint32_t value) "MPS2 SCC msf2_sysreg_write(uint64_t offset, uint32_t val, uint32_t prev) "msf2-sysreg write: addr 0x%08" HWADDR_PRIx " data 0x%" PRIx32 " prev 0x%" PRIx32 msf2_sysreg_read(uint64_t offset, uint32_t val) "msf2-sysreg read: addr 0x%08" HWADDR_PRIx " data 0x%08" PRIx32 msf2_sysreg_write_pll_status(void) "Invalid write to read only PLL status register" + +#hw/misc/imx7_gpr.c +imx7_gpr_read(uint64_t offset) "addr 0x%08" HWADDR_PRIx +imx7_gpr_write(uint64_t offset, uint64_t value) "addr 0x%08" HWADDR_PRIx "value 0x%08" HWADDR_PRIx diff --git a/hw/s390x/css.c b/hw/s390x/css.c index 1c526fd7e2..301bf1772f 100644 --- a/hw/s390x/css.c +++ b/hw/s390x/css.c @@ -439,7 +439,7 @@ static int s390_io_adapter_map(AdapterInfo *adapter, uint64_t map_addr, bool do_map) { S390FLICState *fs = s390_get_flic(); - S390FLICStateClass *fsc = S390_FLIC_COMMON_GET_CLASS(fs); + S390FLICStateClass *fsc = s390_get_flic_class(fs); return fsc->io_adapter_map(fs, adapter->adapter_id, map_addr, do_map); } @@ -520,7 +520,7 @@ void css_register_io_adapters(CssIoAdapterType type, bool swap, bool maskable, int ret, isc; IoAdapter *adapter; S390FLICState *fs = s390_get_flic(); - S390FLICStateClass *fsc = S390_FLIC_COMMON_GET_CLASS(fs); + S390FLICStateClass *fsc = s390_get_flic_class(fs); /* * Disallow multiple registrations for the same device type. 
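The imx7_snvs_write() handler earlier in this patch reduces the architected poweroff sequence to its two trigger bits: Linux's SNVS poweroff driver sets TOP and DP_EN in LPCR, and only that exact pair causes a guest shutdown request. A standalone sketch of the contract (the register offset and bit values are copied from imx7_snvs.h; the harness is illustrative):

    #include <assert.h>
    #include <stdint.h>

    #define SNVS_LPCR       0x38
    #define SNVS_LPCR_TOP   (1u << 6)
    #define SNVS_LPCR_DP_EN (1u << 5)

    static int shutdown_requested;  /* qemu_system_shutdown_request() stand-in */

    static void snvs_write(uint32_t offset, uint32_t value)
    {
        const uint32_t mask = SNVS_LPCR_TOP | SNVS_LPCR_DP_EN;

        if (offset == SNVS_LPCR && (value & mask) == mask) {
            shutdown_requested = 1;
        }
    }

    int main(void)
    {
        snvs_write(SNVS_LPCR, SNVS_LPCR_TOP);                   /* not enough */
        assert(!shutdown_requested);
        snvs_write(SNVS_LPCR, SNVS_LPCR_TOP | SNVS_LPCR_DP_EN); /* poweroff   */
        assert(shutdown_requested);
        return 0;
    }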
@@ -566,7 +566,7 @@ static void css_clear_io_interrupt(uint16_t subchannel_id, Error *err = NULL; static bool no_clear_irq; S390FLICState *fs = s390_get_flic(); - S390FLICStateClass *fsc = S390_FLIC_COMMON_GET_CLASS(fs); + S390FLICStateClass *fsc = s390_get_flic_class(fs); int r; if (unlikely(no_clear_irq)) { @@ -640,7 +640,7 @@ void css_conditional_io_interrupt(SubchDev *sch) int css_do_sic(CPUS390XState *env, uint8_t isc, uint16_t mode) { S390FLICState *fs = s390_get_flic(); - S390FLICStateClass *fsc = S390_FLIC_COMMON_GET_CLASS(fs); + S390FLICStateClass *fsc = s390_get_flic_class(fs); int r; if (env->psw.mask & PSW_MASK_PSTATE) { @@ -666,7 +666,7 @@ out: void css_adapter_interrupt(CssIoAdapterType type, uint8_t isc) { S390FLICState *fs = s390_get_flic(); - S390FLICStateClass *fsc = S390_FLIC_COMMON_GET_CLASS(fs); + S390FLICStateClass *fsc = s390_get_flic_class(fs); uint32_t io_int_word = (isc << 27) | IO_INT_WORD_AI; IoAdapter *adapter = channel_subsys.io_adapters[type][isc]; diff --git a/hw/s390x/event-facility.c b/hw/s390x/event-facility.c index b0f71f4554..155a69467b 100644 --- a/hw/s390x/event-facility.c +++ b/hw/s390x/event-facility.c @@ -293,10 +293,10 @@ static void write_event_mask(SCLPEventFacility *ef, SCCB *sccb) ef->receive_mask = be32_to_cpu(tmp_mask); /* return the SCLP's capability masks to the guest */ - tmp_mask = cpu_to_be32(get_host_send_mask(ef)); + tmp_mask = cpu_to_be32(get_host_receive_mask(ef)); copy_mask(WEM_RECEIVE_MASK(we_mask, mask_length), (uint8_t *)&tmp_mask, mask_length, sizeof(tmp_mask)); - tmp_mask = cpu_to_be32(get_host_receive_mask(ef)); + tmp_mask = cpu_to_be32(get_host_send_mask(ef)); copy_mask(WEM_SEND_MASK(we_mask, mask_length), (uint8_t *)&tmp_mask, mask_length, sizeof(tmp_mask)); diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c index 7d9c65e719..77a50cab36 100644 --- a/hw/s390x/s390-pci-bus.c +++ b/hw/s390x/s390-pci-bus.c @@ -309,49 +309,187 @@ static uint64_t get_st_pto(uint64_t entry) : 0; } -static uint64_t s390_guest_io_table_walk(uint64_t guest_iota, - uint64_t guest_dma_address) +static bool rt_entry_isvalid(uint64_t entry) { - uint64_t sto_a, pto_a, px_a; - uint64_t sto, pto, pte; - uint32_t rtx, sx, px; - - rtx = calc_rtx(guest_dma_address); - sx = calc_sx(guest_dma_address); - px = calc_px(guest_dma_address); - - sto_a = guest_iota + rtx * sizeof(uint64_t); - sto = address_space_ldq(&address_space_memory, sto_a, - MEMTXATTRS_UNSPECIFIED, NULL); - sto = get_rt_sto(sto); - if (!sto) { - pte = 0; + return (entry & ZPCI_TABLE_VALID_MASK) == ZPCI_TABLE_VALID; +} + +static bool pt_entry_isvalid(uint64_t entry) +{ + return (entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID; +} + +static bool entry_isprotected(uint64_t entry) +{ + return (entry & ZPCI_TABLE_PROT_MASK) == ZPCI_TABLE_PROTECTED; +} + +/* ett is expected table type, -1 page table, 0 segment table, 1 region table */ +static uint64_t get_table_index(uint64_t iova, int8_t ett) +{ + switch (ett) { + case ZPCI_ETT_PT: + return calc_px(iova); + case ZPCI_ETT_ST: + return calc_sx(iova); + case ZPCI_ETT_RT: + return calc_rtx(iova); + } + + return -1; +} + +static bool entry_isvalid(uint64_t entry, int8_t ett) +{ + switch (ett) { + case ZPCI_ETT_PT: + return pt_entry_isvalid(entry); + case ZPCI_ETT_ST: + case ZPCI_ETT_RT: + return rt_entry_isvalid(entry); + } + + return false; +} + +/* Return true if address translation is done */ +static bool translate_iscomplete(uint64_t entry, int8_t ett) +{ + switch (ett) { + case 0: + return (entry & ZPCI_TABLE_FC) ? 
true : false; + case 1: + return false; + } + + return true; +} + +static uint64_t get_frame_size(int8_t ett) +{ + switch (ett) { + case ZPCI_ETT_PT: + return 1ULL << 12; + case ZPCI_ETT_ST: + return 1ULL << 20; + case ZPCI_ETT_RT: + return 1ULL << 31; + } + + return 0; +} + +static uint64_t get_next_table_origin(uint64_t entry, int8_t ett) +{ + switch (ett) { + case ZPCI_ETT_PT: + return entry & ZPCI_PTE_ADDR_MASK; + case ZPCI_ETT_ST: + return get_st_pto(entry); + case ZPCI_ETT_RT: + return get_rt_sto(entry); + } + + return 0; +} + +/** + * table_translate: do translation within one table and return the following + * table origin + * + * @entry: the entry being translated, the result is stored in this. + * @to: the address of table origin. + * @ett: expected table type, 1 region table, 0 segment table and -1 page table. + * @error: error code + */ +static uint64_t table_translate(S390IOTLBEntry *entry, uint64_t to, int8_t ett, + uint16_t *error) +{ + uint64_t tx, te, nto = 0; + uint16_t err = 0; + + tx = get_table_index(entry->iova, ett); + te = address_space_ldq(&address_space_memory, to + tx * sizeof(uint64_t), + MEMTXATTRS_UNSPECIFIED, NULL); + + if (!te) { + err = ERR_EVENT_INVALTE; + goto out; + } + + if (!entry_isvalid(te, ett)) { + entry->perm &= IOMMU_NONE; + goto out; + } + + if (ett == ZPCI_ETT_RT && ((te & ZPCI_TABLE_LEN_RTX) != ZPCI_TABLE_LEN_RTX + || te & ZPCI_TABLE_OFFSET_MASK)) { + err = ERR_EVENT_INVALTL; goto out; } - pto_a = sto + sx * sizeof(uint64_t); - pto = address_space_ldq(&address_space_memory, pto_a, - MEMTXATTRS_UNSPECIFIED, NULL); - pto = get_st_pto(pto); - if (!pto) { - pte = 0; + nto = get_next_table_origin(te, ett); + if (!nto) { + err = ERR_EVENT_TT; goto out; } - px_a = pto + px * sizeof(uint64_t); - pte = address_space_ldq(&address_space_memory, px_a, - MEMTXATTRS_UNSPECIFIED, NULL); + if (entry_isprotected(te)) { + entry->perm &= IOMMU_RO; + } else { + entry->perm &= IOMMU_RW; + } + if (translate_iscomplete(te, ett)) { + switch (ett) { + case ZPCI_ETT_PT: + entry->translated_addr = te & ZPCI_PTE_ADDR_MASK; + break; + case ZPCI_ETT_ST: + entry->translated_addr = (te & ZPCI_SFAA_MASK) | + (entry->iova & ~ZPCI_SFAA_MASK); + break; + } + nto = 0; + } out: - return pte; + if (err) { + entry->perm = IOMMU_NONE; + *error = err; + } + entry->len = get_frame_size(ett); + return nto; +} + +uint16_t s390_guest_io_table_walk(uint64_t g_iota, hwaddr addr, + S390IOTLBEntry *entry) +{ + uint64_t to = s390_pci_get_table_origin(g_iota); + int8_t ett = 1; + uint16_t error = 0; + + entry->iova = addr & PAGE_MASK; + entry->translated_addr = 0; + entry->perm = IOMMU_RW; + + if (entry_isprotected(g_iota)) { + entry->perm &= IOMMU_RO; + } + + while (to) { + to = table_translate(entry, to, ett--, &error); + } + + return error; } static IOMMUTLBEntry s390_translate_iommu(IOMMUMemoryRegion *mr, hwaddr addr, IOMMUAccessFlags flag) { - uint64_t pte; - uint32_t flags; S390PCIIOMMU *iommu = container_of(mr, S390PCIIOMMU, iommu_mr); + S390IOTLBEntry *entry; + uint64_t iova = addr & PAGE_MASK; + uint16_t error = 0; IOMMUTLBEntry ret = { .target_as = &address_space_memory, .iova = 0, @@ -374,26 +512,31 @@ static IOMMUTLBEntry s390_translate_iommu(IOMMUMemoryRegion *mr, hwaddr addr, DPRINTF("iommu trans addr 0x%" PRIx64 "\n", addr); if (addr < iommu->pba || addr > iommu->pal) { - return ret; + error = ERR_EVENT_OORANGE; + goto err; } - pte = s390_guest_io_table_walk(s390_pci_get_table_origin(iommu->g_iota), - addr); - if (!pte) { - return ret; - } - - flags = pte & ZPCI_PTE_FLAG_MASK; - 
ret.iova = addr; - ret.translated_addr = pte & ZPCI_PTE_ADDR_MASK; - ret.addr_mask = 0xfff; - - if (flags & ZPCI_PTE_INVALID) { - ret.perm = IOMMU_NONE; + entry = g_hash_table_lookup(iommu->iotlb, &iova); + if (entry) { + ret.iova = entry->iova; + ret.translated_addr = entry->translated_addr; + ret.addr_mask = entry->len - 1; + ret.perm = entry->perm; } else { - ret.perm = IOMMU_RW; + ret.iova = iova; + ret.addr_mask = ~PAGE_MASK; + ret.perm = IOMMU_NONE; } + if (flag != IOMMU_NONE && !(flag & ret.perm)) { + error = ERR_EVENT_TPROTE; + } +err: + if (error) { + iommu->pbdev->state = ZPCI_FS_ERROR; + s390_pci_generate_error_event(error, iommu->pbdev->fh, + iommu->pbdev->fid, addr, 0); + } return ret; } @@ -435,6 +578,8 @@ static S390PCIIOMMU *s390_pci_get_iommu(S390pciState *s, PCIBus *bus, PCI_FUNC(devfn)); memory_region_init(&iommu->mr, OBJECT(iommu), mr_name, UINT64_MAX); address_space_init(&iommu->as, &iommu->mr, as_name); + iommu->iotlb = g_hash_table_new_full(g_int64_hash, g_int64_equal, + NULL, g_free); table->iommu[PCI_SLOT(devfn)] = iommu; g_free(mr_name); @@ -524,6 +669,7 @@ void s390_pci_iommu_enable(S390PCIIOMMU *iommu) void s390_pci_iommu_disable(S390PCIIOMMU *iommu) { iommu->enabled = false; + g_hash_table_remove_all(iommu->iotlb); memory_region_del_subregion(&iommu->mr, MEMORY_REGION(&iommu->iommu_mr)); object_unparent(OBJECT(&iommu->iommu_mr)); } @@ -539,6 +685,7 @@ static void s390_pci_iommu_free(S390pciState *s, PCIBus *bus, int32_t devfn) } table->iommu[PCI_SLOT(devfn)] = NULL; + g_hash_table_destroy(iommu->iotlb); address_space_destroy(&iommu->as); object_unparent(OBJECT(&iommu->mr)); object_unparent(OBJECT(iommu)); diff --git a/hw/s390x/s390-pci-bus.h b/hw/s390x/s390-pci-bus.h index 2993f0ddef..1f7f9b5814 100644 --- a/hw/s390x/s390-pci-bus.h +++ b/hw/s390x/s390-pci-bus.h @@ -148,6 +148,8 @@ enum ZpciIoatDtype { #define ZPCI_STE_FLAG_MASK 0x7ffULL #define ZPCI_STE_ADDR_MASK (~ZPCI_STE_FLAG_MASK) +#define ZPCI_SFAA_MASK (~((1ULL << 20) - 1)) + /* I/O Page tables */ #define ZPCI_PTE_VALID_MASK 0x400 #define ZPCI_PTE_INVALID 0x400 @@ -165,10 +167,15 @@ enum ZpciIoatDtype { #define ZPCI_TABLE_INVALID 0x20 #define ZPCI_TABLE_PROTECTED 0x200 #define ZPCI_TABLE_UNPROTECTED 0x000 +#define ZPCI_TABLE_FC 0x400 #define ZPCI_TABLE_VALID_MASK 0x20 #define ZPCI_TABLE_PROT_MASK 0x200 +#define ZPCI_ETT_RT 1 +#define ZPCI_ETT_ST 0 +#define ZPCI_ETT_PT -1 + /* PCI Function States * * reserved: default; device has just been plugged or is in progress of being @@ -253,6 +260,13 @@ typedef struct S390MsixInfo { uint32_t pba_offset; } S390MsixInfo; +typedef struct S390IOTLBEntry { + uint64_t iova; + uint64_t translated_addr; + uint64_t len; + uint64_t perm; +} S390IOTLBEntry; + typedef struct S390PCIBusDevice S390PCIBusDevice; typedef struct S390PCIIOMMU { Object parent_obj; @@ -264,6 +278,7 @@ typedef struct S390PCIIOMMU { uint64_t g_iota; uint64_t pba; uint64_t pal; + GHashTable *iotlb; } S390PCIIOMMU; typedef struct S390PCIIOMMUTable { @@ -320,6 +335,8 @@ void s390_pci_iommu_enable(S390PCIIOMMU *iommu); void s390_pci_iommu_disable(S390PCIIOMMU *iommu); void s390_pci_generate_error_event(uint16_t pec, uint32_t fh, uint32_t fid, uint64_t faddr, uint32_t e); +uint16_t s390_guest_io_table_walk(uint64_t g_iota, hwaddr addr, + S390IOTLBEntry *entry); S390PCIBusDevice *s390_pci_find_dev_by_idx(S390pciState *s, uint32_t idx); S390PCIBusDevice *s390_pci_find_dev_by_fh(S390pciState *s, uint32_t fh); S390PCIBusDevice *s390_pci_find_dev_by_fid(S390pciState *s, uint32_t fid); diff --git 
a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c index be449210d9..3fcc330fe3 100644 --- a/hw/s390x/s390-pci-inst.c +++ b/hw/s390x/s390-pci-inst.c @@ -571,27 +571,65 @@ int pcistg_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) return 0; } +static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry) +{ + S390IOTLBEntry *cache = g_hash_table_lookup(iommu->iotlb, &entry->iova); + IOMMUTLBEntry notify = { + .target_as = &address_space_memory, + .iova = entry->iova, + .translated_addr = entry->translated_addr, + .perm = entry->perm, + .addr_mask = ~PAGE_MASK, + }; + + if (entry->perm == IOMMU_NONE) { + if (!cache) { + return; + } + g_hash_table_remove(iommu->iotlb, &entry->iova); + } else { + if (cache) { + if (cache->perm == entry->perm && + cache->translated_addr == entry->translated_addr) { + return; + } + + notify.perm = IOMMU_NONE; + memory_region_notify_iommu(&iommu->iommu_mr, notify); + notify.perm = entry->perm; + } + + cache = g_new(S390IOTLBEntry, 1); + cache->iova = entry->iova; + cache->translated_addr = entry->translated_addr; + cache->len = PAGE_SIZE; + cache->perm = entry->perm; + g_hash_table_replace(iommu->iotlb, &cache->iova, cache); + } + + memory_region_notify_iommu(&iommu->iommu_mr, notify); +} + int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) { CPUS390XState *env = &cpu->env; uint32_t fh; + uint16_t error = 0; S390PCIBusDevice *pbdev; S390PCIIOMMU *iommu; + S390IOTLBEntry entry; hwaddr start, end; - IOMMUTLBEntry entry; - IOMMUMemoryRegion *iommu_mr; - IOMMUMemoryRegionClass *imrc; cpu_synchronize_state(CPU(cpu)); if (env->psw.mask & PSW_MASK_PSTATE) { s390_program_interrupt(env, PGM_PRIVILEGED, 4, ra); - goto out; + return 0; } if (r2 & 0x1) { s390_program_interrupt(env, PGM_SPECIFICATION, 4, ra); - goto out; + return 0; } fh = env->regs[r1] >> 32; @@ -602,7 +640,7 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) if (!pbdev) { DPRINTF("rpcit no pci dev\n"); setcc(cpu, ZPCI_PCI_LS_INVAL_HANDLE); - goto out; + return 0; } switch (pbdev->state) { @@ -622,44 +660,37 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra) iommu = pbdev->iommu; if (!iommu->g_iota) { - pbdev->state = ZPCI_FS_ERROR; - setcc(cpu, ZPCI_PCI_LS_ERR); - s390_set_status_code(env, r1, ZPCI_PCI_ST_INSUF_RES); - s390_pci_generate_error_event(ERR_EVENT_INVALAS, pbdev->fh, pbdev->fid, - start, 0); - goto out; + error = ERR_EVENT_INVALAS; + goto err; } if (end < iommu->pba || start > iommu->pal) { - pbdev->state = ZPCI_FS_ERROR; - setcc(cpu, ZPCI_PCI_LS_ERR); - s390_set_status_code(env, r1, ZPCI_PCI_ST_INSUF_RES); - s390_pci_generate_error_event(ERR_EVENT_OORANGE, pbdev->fh, pbdev->fid, - start, 0); - goto out; + error = ERR_EVENT_OORANGE; + goto err; } - iommu_mr = &iommu->iommu_mr; - imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr); - while (start < end) { - entry = imrc->translate(iommu_mr, start, IOMMU_NONE); - - if (!entry.translated_addr) { - pbdev->state = ZPCI_FS_ERROR; - setcc(cpu, ZPCI_PCI_LS_ERR); - s390_set_status_code(env, r1, ZPCI_PCI_ST_INSUF_RES); - s390_pci_generate_error_event(ERR_EVENT_SERR, pbdev->fh, pbdev->fid, - start, ERR_EVENT_Q_BIT); - goto out; + error = s390_guest_io_table_walk(iommu->g_iota, start, &entry); + if (error) { + break; } - memory_region_notify_iommu(iommu_mr, entry); - start += entry.addr_mask + 1; + start += entry.len; + while (entry.iova < start && entry.iova < end) { + s390_pci_update_iotlb(iommu, &entry); + entry.iova += PAGE_SIZE; + 
entry.translated_addr += PAGE_SIZE; + } + } +err: + if (error) { + pbdev->state = ZPCI_FS_ERROR; + setcc(cpu, ZPCI_PCI_LS_ERR); + s390_set_status_code(env, r1, ZPCI_PCI_ST_FUNC_IN_ERR); + s390_pci_generate_error_event(error, pbdev->fh, pbdev->fid, start, 0); + } else { + setcc(cpu, ZPCI_PCI_LS_OK); } - - setcc(cpu, ZPCI_PCI_LS_OK); -out: return 0; } @@ -834,6 +865,8 @@ static int reg_ioat(CPUS390XState *env, S390PCIIOMMU *iommu, ZpciFib fib, uint8_t dt = (g_iota >> 2) & 0x7; uint8_t t = (g_iota >> 11) & 0x1; + pba &= ~0xfff; + pal |= 0xfff; if (pba > pal || pba < ZPCI_SDMA_ADDR || pal > ZPCI_EDMA_ADDR) { s390_program_interrupt(env, PGM_OPERAND, 6, ra); return -EINVAL; diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c index f2db448233..4abbe89847 100644 --- a/hw/s390x/s390-virtio-ccw.c +++ b/hw/s390x/s390-virtio-ccw.c @@ -77,10 +77,6 @@ static void s390_init_cpus(MachineState *machine) MachineClass *mc = MACHINE_GET_CLASS(machine); int i; - if (tcg_enabled() && max_cpus > 1) { - error_report("WARNING: SMP support on s390x is experimental!"); - } - /* initialize possible_cpus */ mc->possible_cpu_arch_ids(machine); diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c index 3d8f26949b..8f7fbc2ab7 100644 --- a/hw/s390x/virtio-ccw.c +++ b/hw/s390x/virtio-ccw.c @@ -1111,7 +1111,7 @@ static int virtio_ccw_setup_irqroutes(VirtioCcwDevice *dev, int nvqs) VirtIODevice *vdev = virtio_bus_get_device(&dev->bus); int ret; S390FLICState *fs = s390_get_flic(); - S390FLICStateClass *fsc = S390_FLIC_COMMON_GET_CLASS(fs); + S390FLICStateClass *fsc = s390_get_flic_class(fs); ret = virtio_ccw_get_mappings(dev); if (ret) { @@ -1129,7 +1129,7 @@ static int virtio_ccw_setup_irqroutes(VirtioCcwDevice *dev, int nvqs) static void virtio_ccw_release_irqroutes(VirtioCcwDevice *dev, int nvqs) { S390FLICState *fs = s390_get_flic(); - S390FLICStateClass *fsc = S390_FLIC_COMMON_GET_CLASS(fs); + S390FLICStateClass *fsc = s390_get_flic_class(fs); fsc->release_adapter_routes(fs, &dev->routes); } diff --git a/hw/sd/sdhci-internal.h b/hw/sd/sdhci-internal.h index fc807f08f3..0991acd724 100644 --- a/hw/sd/sdhci-internal.h +++ b/hw/sd/sdhci-internal.h @@ -84,12 +84,18 @@ /* R/W Host control Register 0x0 */ #define SDHC_HOSTCTL 0x28 +#define SDHC_CTRL_LED 0x01 #define SDHC_CTRL_DMA_CHECK_MASK 0x18 #define SDHC_CTRL_SDMA 0x00 #define SDHC_CTRL_ADMA1_32 0x08 #define SDHC_CTRL_ADMA2_32 0x10 #define SDHC_CTRL_ADMA2_64 0x18 #define SDHC_DMA_TYPE(x) ((x) & SDHC_CTRL_DMA_CHECK_MASK) +#define SDHC_CTRL_4BITBUS 0x02 +#define SDHC_CTRL_8BITBUS 0x20 +#define SDHC_CTRL_CDTEST_INS 0x40 +#define SDHC_CTRL_CDTEST_EN 0x80 + /* R/W Power Control Register 0x0 */ #define SDHC_PWRCON 0x29 @@ -226,4 +232,21 @@ enum { sdhc_gap_write = 2 /* SDHC stopped at block gap during write operation */ }; +extern const VMStateDescription sdhci_vmstate; + + +#define ESDHC_MIX_CTRL 0x48 +#define ESDHC_VENDOR_SPEC 0xc0 +#define ESDHC_DLL_CTRL 0x60 + +#define ESDHC_TUNING_CTRL 0xcc +#define ESDHC_TUNE_CTRL_STATUS 0x68 +#define ESDHC_WTMK_LVL 0x44 + +/* Undocumented register used by guests working around erratum ERR004536 */ +#define ESDHC_UNDOCUMENTED_REG27 0x6c + +#define ESDHC_CTRL_4BITBUS (0x1 << 1) +#define ESDHC_CTRL_8BITBUS (0x2 << 1) + #endif diff --git a/hw/sd/sdhci.c b/hw/sd/sdhci.c index 37b217269b..ee95e78aeb 100644 --- a/hw/sd/sdhci.c +++ b/hw/sd/sdhci.c @@ -243,7 +243,8 @@ static void sdhci_send_command(SDHCIState *s) } } - if ((s->norintstsen & SDHC_NISEN_TRSCMP) && + if (!(s->quirks & SDHCI_QUIRK_NO_BUSY_IRQ) && + 
(s->norintstsen & SDHC_NISEN_TRSCMP) &&
         (s->cmdreg & SDHC_CMD_RESPONSE) == SDHC_CMD_RSP_WITH_BUSY) {
         s->norintsts |= SDHC_NIS_TRSCMP;
     }
@@ -1188,6 +1189,8 @@ static void sdhci_initfn(SDHCIState *s)
     s->insert_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, sdhci_raise_insertion_irq, s);
     s->transfer_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, sdhci_data_transfer, s);
+
+    s->io_ops = &sdhci_mmio_ops;
 }
 
 static void sdhci_uninitfn(SDHCIState *s)
@@ -1395,6 +1398,10 @@ static void sdhci_sysbus_realize(DeviceState *dev, Error ** errp)
     }
 
     sysbus_init_irq(sbd, &s->irq);
+
+    memory_region_init_io(&s->iomem, OBJECT(s), s->io_ops, s, "sdhci",
+                          SDHC_REGISTERS_MAP_SIZE);
+
     sysbus_init_mmio(sbd, &s->iomem);
 }
@@ -1446,11 +1453,232 @@ static const TypeInfo sdhci_bus_info = {
     .class_init = sdhci_bus_class_init,
 };
 
+static uint64_t usdhc_read(void *opaque, hwaddr offset, unsigned size)
+{
+    SDHCIState *s = SYSBUS_SDHCI(opaque);
+    uint32_t ret;
+    uint16_t hostctl;
+
+    switch (offset) {
+    default:
+        return sdhci_read(opaque, offset, size);
+
+    case SDHC_HOSTCTL:
+        /*
+         * For a detailed explanation on the following bit
+         * manipulation code see comments in a similar part of
+         * usdhc_write()
+         */
+        hostctl = SDHC_DMA_TYPE(s->hostctl) << (8 - 3);
+
+        if (s->hostctl & SDHC_CTRL_8BITBUS) {
+            hostctl |= ESDHC_CTRL_8BITBUS;
+        }
+
+        if (s->hostctl & SDHC_CTRL_4BITBUS) {
+            hostctl |= ESDHC_CTRL_4BITBUS;
+        }
+
+        ret = hostctl;
+        ret |= (uint32_t)s->blkgap << 16;
+        ret |= (uint32_t)s->wakcon << 24;
+
+        break;
+
+    case ESDHC_DLL_CTRL:
+    case ESDHC_TUNE_CTRL_STATUS:
+    case ESDHC_UNDOCUMENTED_REG27:
+    case ESDHC_TUNING_CTRL:
+    case ESDHC_VENDOR_SPEC:
+    case ESDHC_MIX_CTRL:
+    case ESDHC_WTMK_LVL:
+        ret = 0;
+        break;
+    }
+
+    return ret;
+}
+
+static void
+usdhc_write(void *opaque, hwaddr offset, uint64_t val, unsigned size)
+{
+    SDHCIState *s = SYSBUS_SDHCI(opaque);
+    uint8_t hostctl;
+    uint32_t value = (uint32_t)val;
+
+    switch (offset) {
+    case ESDHC_DLL_CTRL:
+    case ESDHC_TUNE_CTRL_STATUS:
+    case ESDHC_UNDOCUMENTED_REG27:
+    case ESDHC_TUNING_CTRL:
+    case ESDHC_WTMK_LVL:
+    case ESDHC_VENDOR_SPEC:
+        break;
+
+    case SDHC_HOSTCTL:
+        /*
+         * Here's what ESDHCI has at offset 0x28 (SDHC_HOSTCTL)
+         *
+         *       7         6      5        4          3      2        1      0
+         * |-----------+--------+--------+-----------+----------+---------|
+         * | Card      | Card   | Endian | DATA3     | Data     | Led     |
+         * | Detect    | Detect | Mode   | as Card   | Transfer | Control |
+         * | Signal    | Test   |        | Detection | Width    |         |
+         * | Selection | Level  |        | Pin       |          |         |
+         * |-----------+--------+--------+-----------+----------+---------|
+         *
+         * and 0x29
+         *
+         * 15      10 9    8
+         * |----------+------|
+         * | Reserved | DMA  |
+         * |          | Sel. |
+         * |          |      |
+         * |----------+------|
+         *
+         * and here's what the SDHCI spec expects those offsets to be:
+         *
+         * 0x28 (Host Control Register)
+         *
+         *     7        6      5         4     3       2         1       0
+         * |--------+--------+----------+------+--------+----------+---------|
+         * | Card   | Card   | Extended | DMA  | High   | Data     | LED     |
+         * | Detect | Detect | Data     | Sel. | Speed  | Transfer | Control |
+         * | Signal | Test   | Transfer |      | Enable | Width    |         |
+         * | Sel.   | Level  | Width    |      |        |          |         |
+         * |--------+--------+----------+------+--------+----------+---------|
+         *
+         * and 0x29 (Power Control Register)
+         *
+         * |----------------------------------|
+         * | Power Control Register           |
+         * |                                  |
+         * | Description omitted,             |
+         * | since it has no analog in ESDHCI |
+         * |                                  |
+         * |----------------------------------|
+         *
+         * Since offsets 0x2A and 0x2B should be compatible between
+         * both IP specs we only need to reconcile the least
+         * significant 16 bits of the word we've been given.
+         */
+
+        /*
+         * First, save bits 7, 6 and 0 since they are identical
+         */
+        hostctl = value & (SDHC_CTRL_LED |
+                           SDHC_CTRL_CDTEST_INS |
+                           SDHC_CTRL_CDTEST_EN);
+        /*
+         * Second, split "Data Transfer Width" from bits 2 and 1 into
+         * bits 5 and 1
+         */
+        if (value & ESDHC_CTRL_8BITBUS) {
+            hostctl |= SDHC_CTRL_8BITBUS;
+        }
+
+        if (value & ESDHC_CTRL_4BITBUS) {
+            hostctl |= SDHC_CTRL_4BITBUS;
+        }
+
+        /*
+         * Third, move DMA select from bits 9 and 8 to bits 4 and 3
+         */
+        hostctl |= SDHC_DMA_TYPE(value >> (8 - 3));
+
+        /*
+         * Now place the corrected value into the low 16 bits of the
+         * value we are going to give the standard SDHCI write
+         * function
+         *
+         * NOTE: This transformation should be the inverse of what can
+         * be found in drivers/mmc/host/sdhci-esdhc-imx.c in the Linux
+         * kernel
+         */
+        value &= ~UINT16_MAX;
+        value |= hostctl;
+        value |= (uint16_t)s->pwrcon << 8;
+
+        sdhci_write(opaque, offset, value, size);
+        break;
+
+    case ESDHC_MIX_CTRL:
+        /*
+         * So, when the SD/MMC stack in Linux tries to write to the
+         * "Transfer Mode Register", the ESDHC i.MX quirk code will
+         * translate it into a write to ESDHC_MIX_CTRL, so we do the
+         * opposite in order to get where we started
+         *
+         * Note that the Auto CMD23 Enable bit is located in the wrong
+         * place on i.MX, but since it is not used by QEMU we do not
+         * care.
+         *
+         * We don't want to call sdhci_write(.., SDHC_TRNMOD, ...)
+         * here because it will result in a call to
+         * sdhci_send_command(s) which we don't want.
+         */
+        s->trnmod = value & UINT16_MAX;
+        break;
+    case SDHC_TRNMOD:
+        /*
+         * Similar to above, but this time a write to the "Command
+         * Register" will be translated into a 4-byte write to the
+         * "Transfer Mode register" where the lower 16 bits of the
+         * value would be set to zero. So what we do is fill those
+         * bits with the cached value from s->trnmod and let the SDHCI
+         * infrastructure handle the rest
+         */
+        sdhci_write(opaque, offset, val | s->trnmod, size);
+        break;
+    case SDHC_BLKSIZE:
+        /*
+         * ESDHCI does not implement the "Host SDMA Buffer Boundary"
+         * field, and the Linux driver will try to zero it out, which
+         * would break the rest of the SDHCI emulation.
+         *
+         * Linux defaults to the maximum possible setting (512K
+         * boundary) and it seems to be the only option that the i.MX
+         * IP implements, so we artificially set it to that value.
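To make the bit juggling above concrete, here is a worked example of both conversions: an ESDHC-layout host-control write folded into SDHCI layout, and the SDMA boundary field forced in SDHC_BLKSIZE (the masks are copied from sdhci-internal.h; the register values are invented):

    #include <assert.h>
    #include <stdint.h>

    #define SDHC_CTRL_DMA_CHECK_MASK 0x18
    #define SDHC_DMA_TYPE(x)         ((x) & SDHC_CTRL_DMA_CHECK_MASK)
    #define SDHC_CTRL_8BITBUS        0x20
    #define ESDHC_CTRL_8BITBUS       (0x2 << 1)

    int main(void)
    {
        /* ESDHC layout: 8-bit bus (bits 2:1 = 0b10), ADMA2 32-bit (bits 9:8 = 0b10) */
        uint32_t value = ESDHC_CTRL_8BITBUS | (0x2 << 8);
        uint8_t hostctl = 0;

        if (value & ESDHC_CTRL_8BITBUS) {
            hostctl |= SDHC_CTRL_8BITBUS;           /* bit 5 in SDHCI layout */
        }
        hostctl |= SDHC_DMA_TYPE(value >> (8 - 3)); /* bits 9:8 -> bits 4:3  */
        assert(hostctl == 0x30);                    /* 8-bit bus + ADMA2-32  */

        /* SDHC_BLKSIZE: bits 14:12 forced to 0x7, i.e. 4K << 7 = 512K boundary */
        uint32_t blksize = 0x0200 | (0x7 << 12);
        assert((4096u << ((blksize >> 12) & 0x7)) == 512 * 1024);
        return 0;
    }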
+ */ + val |= 0x7 << 12; + /* FALLTHROUGH */ + default: + sdhci_write(opaque, offset, val, size); + break; + } +} + + +static const MemoryRegionOps usdhc_mmio_ops = { + .read = usdhc_read, + .write = usdhc_write, + .valid = { + .min_access_size = 1, + .max_access_size = 4, + .unaligned = false + }, + .endianness = DEVICE_LITTLE_ENDIAN, +}; + +static void imx_usdhc_init(Object *obj) +{ + SDHCIState *s = SYSBUS_SDHCI(obj); + + s->io_ops = &usdhc_mmio_ops; + s->quirks = SDHCI_QUIRK_NO_BUSY_IRQ; +} + +static const TypeInfo imx_usdhc_info = { + .name = TYPE_IMX_USDHC, + .parent = TYPE_SYSBUS_SDHCI, + .instance_init = imx_usdhc_init, +}; + static void sdhci_register_types(void) { type_register_static(&sdhci_pci_info); type_register_static(&sdhci_sysbus_info); type_register_static(&sdhci_bus_info); + type_register_static(&imx_usdhc_info); } type_init(sdhci_register_types) diff --git a/hw/timer/imx_gpt.c b/hw/timer/imx_gpt.c index 4b9b54bf2e..65e4ee6bcf 100644 --- a/hw/timer/imx_gpt.c +++ b/hw/timer/imx_gpt.c @@ -113,6 +113,17 @@ static const IMXClk imx6_gpt_clocks[] = { CLK_HIGH, /* 111 reference clock */ }; +static const IMXClk imx7_gpt_clocks[] = { + CLK_NONE, /* 000 No clock source */ + CLK_IPG, /* 001 ipg_clk, 532MHz*/ + CLK_IPG_HIGH, /* 010 ipg_clk_highfreq */ + CLK_EXT, /* 011 External clock */ + CLK_32k, /* 100 ipg_clk_32k */ + CLK_HIGH, /* 101 reference clock */ + CLK_NONE, /* 110 not defined */ + CLK_NONE, /* 111 not defined */ +}; + static void imx_gpt_set_freq(IMXGPTState *s) { uint32_t clksrc = extract32(s->cr, GPT_CR_CLKSRC_SHIFT, 3); @@ -512,6 +523,13 @@ static void imx6_gpt_init(Object *obj) s->clocks = imx6_gpt_clocks; } +static void imx7_gpt_init(Object *obj) +{ + IMXGPTState *s = IMX_GPT(obj); + + s->clocks = imx7_gpt_clocks; +} + static const TypeInfo imx25_gpt_info = { .name = TYPE_IMX25_GPT, .parent = TYPE_SYS_BUS_DEVICE, @@ -532,11 +550,18 @@ static const TypeInfo imx6_gpt_info = { .instance_init = imx6_gpt_init, }; +static const TypeInfo imx7_gpt_info = { + .name = TYPE_IMX7_GPT, + .parent = TYPE_IMX25_GPT, + .instance_init = imx7_gpt_init, +}; + static void imx_gpt_register_types(void) { type_register_static(&imx25_gpt_info); type_register_static(&imx31_gpt_info); type_register_static(&imx6_gpt_info); + type_register_static(&imx7_gpt_info); } type_init(imx_gpt_register_types) diff --git a/hw/usb/Makefile.objs b/hw/usb/Makefile.objs index fbcd498c59..41be700812 100644 --- a/hw/usb/Makefile.objs +++ b/hw/usb/Makefile.objs @@ -12,6 +12,7 @@ common-obj-$(CONFIG_USB_XHCI_NEC) += hcd-xhci-nec.o common-obj-$(CONFIG_USB_MUSB) += hcd-musb.o obj-$(CONFIG_TUSB6010) += tusb6010.o +obj-$(CONFIG_IMX) += chipidea.o # emulated usb devices common-obj-$(CONFIG_USB) += dev-hub.o diff --git a/hw/usb/chipidea.c b/hw/usb/chipidea.c new file mode 100644 index 0000000000..60d67f88b8 --- /dev/null +++ b/hw/usb/chipidea.c @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2018, Impinj, Inc. + * + * Chipidea USB block emulation code + * + * Author: Andrey Smirnov <andrew.smirnov@gmail.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
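The imx7_gpt_clocks[] table earlier in this hunk is consumed by the shared i.MX GPT model: the 3-bit CLKSRC field of the timer's control register indexes the per-SoC table to select a clock root. A standalone sketch of that lookup (the CLKSRC position at bits 8:6 is an assumption carried over from imx_gpt.c, and the enum values are stand-ins):

    #include <assert.h>
    #include <stdint.h>

    enum { CLK_NONE, CLK_IPG, CLK_IPG_HIGH, CLK_EXT, CLK_32k, CLK_HIGH };

    static const int imx7_gpt_clocks[8] = {
        CLK_NONE, CLK_IPG,  CLK_IPG_HIGH, CLK_EXT,
        CLK_32k,  CLK_HIGH, CLK_NONE,     CLK_NONE,
    };

    int main(void)
    {
        uint32_t cr = 0x4 << 6;              /* CLKSRC = 0b100 (assumed bits 8:6) */
        unsigned clksrc = (cr >> 6) & 0x7;
        assert(imx7_gpt_clocks[clksrc] == CLK_32k);   /* 100: ipg_clk_32k */
        return 0;
    }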
+ */ + +#include "qemu/osdep.h" +#include "hw/usb/hcd-ehci.h" +#include "hw/usb/chipidea.h" +#include "qemu/log.h" + +enum { + CHIPIDEA_USBx_DCIVERSION = 0x000, + CHIPIDEA_USBx_DCCPARAMS = 0x004, + CHIPIDEA_USBx_DCCPARAMS_HC = BIT(8), +}; + +static uint64_t chipidea_read(void *opaque, hwaddr offset, + unsigned size) +{ + return 0; +} + +static void chipidea_write(void *opaque, hwaddr offset, + uint64_t value, unsigned size) +{ +} + +static const struct MemoryRegionOps chipidea_ops = { + .read = chipidea_read, + .write = chipidea_write, + .endianness = DEVICE_NATIVE_ENDIAN, + .impl = { + /* + * Our device would not work correctly if the guest was doing + * unaligned access. This might not be a limitation on the + * real device but in practice there is no reason for a guest + * to access this device unaligned. + */ + .min_access_size = 4, + .max_access_size = 4, + .unaligned = false, + }, +}; + +static uint64_t chipidea_dc_read(void *opaque, hwaddr offset, + unsigned size) +{ + switch (offset) { + case CHIPIDEA_USBx_DCIVERSION: + return 0x1; + case CHIPIDEA_USBx_DCCPARAMS: + /* + * Real hardware (at least i.MX7) will also report the + * controller as "Device Capable" (and 8 supported endpoints), + * but there doesn't seem to be much point in doing so, since + * we don't emulate that part. + */ + return CHIPIDEA_USBx_DCCPARAMS_HC; + } + + return 0; +} + +static void chipidea_dc_write(void *opaque, hwaddr offset, + uint64_t value, unsigned size) +{ +} + +static const struct MemoryRegionOps chipidea_dc_ops = { + .read = chipidea_dc_read, + .write = chipidea_dc_write, + .endianness = DEVICE_NATIVE_ENDIAN, + .impl = { + /* + * Our device would not work correctly if the guest was doing + * unaligned access. This might not be a limitation on the real + * device but in practice there is no reason for a guest to access + * this device unaligned. + */ + .min_access_size = 4, + .max_access_size = 4, + .unaligned = false, + }, +}; + +static void chipidea_init(Object *obj) +{ + EHCIState *ehci = &SYS_BUS_EHCI(obj)->ehci; + ChipideaState *ci = CHIPIDEA(obj); + int i; + + for (i = 0; i < ARRAY_SIZE(ci->iomem); i++) { + const struct { + const char *name; + hwaddr offset; + uint64_t size; + const struct MemoryRegionOps *ops; + } regions[ARRAY_SIZE(ci->iomem)] = { + /* + * Registers located between offsets 0x000 and 0xFC + */ + { + .name = TYPE_CHIPIDEA ".misc", + .offset = 0x000, + .size = 0x100, + .ops = &chipidea_ops, + }, + /* + * Registers located between offsets 0x1A4 and 0x1DC + */ + { + .name = TYPE_CHIPIDEA ".endpoints", + .offset = 0x1A4, + .size = 0x1DC - 0x1A4 + 4, + .ops = &chipidea_ops, + }, + /* + * USB_x_DCIVERSION and USB_x_DCCPARAMS + */ + { + .name = TYPE_CHIPIDEA ".dc", + .offset = 0x120, + .size = 8, + .ops = &chipidea_dc_ops, + }, + }; + + memory_region_init_io(&ci->iomem[i], + obj, + regions[i].ops, + ci, + regions[i].name, + regions[i].size); + + memory_region_add_subregion(&ehci->mem, + regions[i].offset, + &ci->iomem[i]); + } +} + +static void chipidea_class_init(ObjectClass *klass, void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + SysBusEHCIClass *sec = SYS_BUS_EHCI_CLASS(klass); + + /* + * Offsets used were taken from i.MX7Dual Applications Processor + * Reference Manual, Rev 0.1, p. 
3177, Table 11-59
+     */
+    sec->capsbase = 0x100;
+    sec->opregbase = 0x140;
+    sec->portnr = 1;
+
+    set_bit(DEVICE_CATEGORY_USB, dc->categories);
+    dc->desc = "Chipidea USB Module";
+}
+
+static const TypeInfo chipidea_info = {
+    .name = TYPE_CHIPIDEA,
+    .parent = TYPE_SYS_BUS_EHCI,
+    .instance_size = sizeof(ChipideaState),
+    .instance_init = chipidea_init,
+    .class_init = chipidea_class_init,
+};
+
+static void chipidea_register_type(void)
+{
+    type_register_static(&chipidea_info);
+}
+type_init(chipidea_register_type)
diff --git a/include/hw/intc/imx_gpcv2.h b/include/hw/intc/imx_gpcv2.h
new file mode 100644
index 0000000000..ed978b24bb
--- /dev/null
+++ b/include/hw/intc/imx_gpcv2.h
@@ -0,0 +1,22 @@
+#ifndef IMX_GPCV2_H
+#define IMX_GPCV2_H
+
+#include "hw/sysbus.h"
+
+enum IMXGPCv2Registers {
+    GPC_NUM = 0xE00 / sizeof(uint32_t),
+};
+
+typedef struct IMXGPCv2State {
+    /*< private >*/
+    SysBusDevice parent_obj;
+
+    /*< public >*/
+    MemoryRegion iomem;
+    uint32_t regs[GPC_NUM];
+} IMXGPCv2State;
+
+#define TYPE_IMX_GPCV2 "imx-gpcv2"
+#define IMX_GPCV2(obj) OBJECT_CHECK(IMXGPCv2State, (obj), TYPE_IMX_GPCV2)
+
+#endif /* IMX_GPCV2_H */
diff --git a/include/hw/misc/imx2_wdt.h b/include/hw/misc/imx2_wdt.h
new file mode 100644
index 0000000000..8afc99a10e
--- /dev/null
+++ b/include/hw/misc/imx2_wdt.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2017, Impinj, Inc.
+ *
+ * i.MX2 Watchdog IP block
+ *
+ * Author: Andrey Smirnov <andrew.smirnov@gmail.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef IMX2_WDT_H
+#define IMX2_WDT_H
+
+#include "hw/sysbus.h"
+
+#define TYPE_IMX2_WDT "imx2.wdt"
+#define IMX2_WDT(obj) OBJECT_CHECK(IMX2WdtState, (obj), TYPE_IMX2_WDT)
+
+enum IMX2WdtRegisters {
+    IMX2_WDT_WCR = 0x0000,
+    IMX2_WDT_REG_NUM = 0x0008 / sizeof(uint16_t) + 1,
+};
+
+
+typedef struct IMX2WdtState {
+    /* <private> */
+    SysBusDevice parent_obj;
+
+    MemoryRegion mmio;
+} IMX2WdtState;
+
+#endif /* IMX2_WDT_H */
diff --git a/include/hw/misc/imx7_ccm.h b/include/hw/misc/imx7_ccm.h
new file mode 100644
index 0000000000..9538f37d98
--- /dev/null
+++ b/include/hw/misc/imx7_ccm.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2017, Impinj, Inc.
+ *
+ * i.MX7 CCM, PMU and ANALOG IP blocks emulation code
+ *
+ * Author: Andrey Smirnov <andrew.smirnov@gmail.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
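The DCCPARAMS handling in the ChipIdea model matters because these cores are dual-role: a guest reads USB_x_DCIVERSION/USB_x_DCCPARAMS (mapped at offset 0x120 by the region table in chipidea_init()) to learn which roles the controller offers, and this model deliberately advertises host-capable only. A sketch of such a guest-side probe (the HC bit comes from the patch; the DC bit position is an assumption based on ChipIdea documentation):

    #include <assert.h>
    #include <stdint.h>

    #define USBX_DCCPARAMS 0x124         /* 0x120 region base + 0x4 */
    #define DCCPARAMS_HC   (1u << 8)     /* host capable */
    #define DCCPARAMS_DC   (1u << 7)     /* device capable (assumed position) */

    /* stand-in for an MMIO read of the model */
    static uint32_t usb_read(uint32_t offset)
    {
        return offset == USBX_DCCPARAMS ? DCCPARAMS_HC : 0;
    }

    int main(void)
    {
        uint32_t params = usb_read(USBX_DCCPARAMS);
        assert(params & DCCPARAMS_HC);      /* bind an EHCI host driver   */
        assert(!(params & DCCPARAMS_DC));   /* no device/gadget role here */
        return 0;
    }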
+ */ + +#ifndef IMX7_CCM_H +#define IMX7_CCM_H + +#include "hw/misc/imx_ccm.h" +#include "qemu/bitops.h" + +enum IMX7AnalogRegisters { + ANALOG_PLL_ARM, + ANALOG_PLL_ARM_SET, + ANALOG_PLL_ARM_CLR, + ANALOG_PLL_ARM_TOG, + ANALOG_PLL_DDR, + ANALOG_PLL_DDR_SET, + ANALOG_PLL_DDR_CLR, + ANALOG_PLL_DDR_TOG, + ANALOG_PLL_DDR_SS, + ANALOG_PLL_DDR_SS_SET, + ANALOG_PLL_DDR_SS_CLR, + ANALOG_PLL_DDR_SS_TOG, + ANALOG_PLL_DDR_NUM, + ANALOG_PLL_DDR_NUM_SET, + ANALOG_PLL_DDR_NUM_CLR, + ANALOG_PLL_DDR_NUM_TOG, + ANALOG_PLL_DDR_DENOM, + ANALOG_PLL_DDR_DENOM_SET, + ANALOG_PLL_DDR_DENOM_CLR, + ANALOG_PLL_DDR_DENOM_TOG, + ANALOG_PLL_480, + ANALOG_PLL_480_SET, + ANALOG_PLL_480_CLR, + ANALOG_PLL_480_TOG, + ANALOG_PLL_480A, + ANALOG_PLL_480A_SET, + ANALOG_PLL_480A_CLR, + ANALOG_PLL_480A_TOG, + ANALOG_PLL_480B, + ANALOG_PLL_480B_SET, + ANALOG_PLL_480B_CLR, + ANALOG_PLL_480B_TOG, + ANALOG_PLL_ENET, + ANALOG_PLL_ENET_SET, + ANALOG_PLL_ENET_CLR, + ANALOG_PLL_ENET_TOG, + ANALOG_PLL_AUDIO, + ANALOG_PLL_AUDIO_SET, + ANALOG_PLL_AUDIO_CLR, + ANALOG_PLL_AUDIO_TOG, + ANALOG_PLL_AUDIO_SS, + ANALOG_PLL_AUDIO_SS_SET, + ANALOG_PLL_AUDIO_SS_CLR, + ANALOG_PLL_AUDIO_SS_TOG, + ANALOG_PLL_AUDIO_NUM, + ANALOG_PLL_AUDIO_NUM_SET, + ANALOG_PLL_AUDIO_NUM_CLR, + ANALOG_PLL_AUDIO_NUM_TOG, + ANALOG_PLL_AUDIO_DENOM, + ANALOG_PLL_AUDIO_DENOM_SET, + ANALOG_PLL_AUDIO_DENOM_CLR, + ANALOG_PLL_AUDIO_DENOM_TOG, + ANALOG_PLL_VIDEO, + ANALOG_PLL_VIDEO_SET, + ANALOG_PLL_VIDEO_CLR, + ANALOG_PLL_VIDEO_TOG, + ANALOG_PLL_VIDEO_SS, + ANALOG_PLL_VIDEO_SS_SET, + ANALOG_PLL_VIDEO_SS_CLR, + ANALOG_PLL_VIDEO_SS_TOG, + ANALOG_PLL_VIDEO_NUM, + ANALOG_PLL_VIDEO_NUM_SET, + ANALOG_PLL_VIDEO_NUM_CLR, + ANALOG_PLL_VIDEO_NUM_TOG, + ANALOG_PLL_VIDEO_DENOM, + ANALOG_PLL_VIDEO_DENOM_SET, + ANALOG_PLL_VIDEO_DENOM_CLR, + ANALOG_PLL_VIDEO_DENOM_TOG, + ANALOG_PLL_MISC0, + ANALOG_PLL_MISC0_SET, + ANALOG_PLL_MISC0_CLR, + ANALOG_PLL_MISC0_TOG, + + ANALOG_DIGPROG = 0x800 / sizeof(uint32_t), + ANALOG_MAX, + + ANALOG_PLL_LOCK = BIT(31) +}; + +enum IMX7CCMRegisters { + CCM_MAX = 0xBE00 / sizeof(uint32_t) + 1, +}; + +enum IMX7PMURegisters { + PMU_MAX = 0x140 / sizeof(uint32_t), +}; + +#define TYPE_IMX7_CCM "imx7.ccm" +#define IMX7_CCM(obj) OBJECT_CHECK(IMX7CCMState, (obj), TYPE_IMX7_CCM) + +typedef struct IMX7CCMState { + /* <private> */ + IMXCCMState parent_obj; + + /* <public> */ + MemoryRegion iomem; + + uint32_t ccm[CCM_MAX]; +} IMX7CCMState; + + +#define TYPE_IMX7_ANALOG "imx7.analog" +#define IMX7_ANALOG(obj) OBJECT_CHECK(IMX7AnalogState, (obj), TYPE_IMX7_ANALOG) + +typedef struct IMX7AnalogState { + /* <private> */ + IMXCCMState parent_obj; + + /* <public> */ + struct { + MemoryRegion container; + MemoryRegion analog; + MemoryRegion digprog; + MemoryRegion pmu; + } mmio; + + uint32_t analog[ANALOG_MAX]; + uint32_t pmu[PMU_MAX]; +} IMX7AnalogState; + +#endif /* IMX7_CCM_H */ diff --git a/include/hw/misc/imx7_gpr.h b/include/hw/misc/imx7_gpr.h new file mode 100644 index 0000000000..e19373d274 --- /dev/null +++ b/include/hw/misc/imx7_gpr.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2017, Impinj, Inc. + * + * i.MX7 GPR IP block emulation code + * + * Author: Andrey Smirnov <andrew.smirnov@gmail.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#ifndef IMX7_GPR_H +#define IMX7_GPR_H + +#include "qemu/bitops.h" +#include "hw/sysbus.h" + +#define TYPE_IMX7_GPR "imx7.gpr" +#define IMX7_GPR(obj) OBJECT_CHECK(IMX7GPRState, (obj), TYPE_IMX7_GPR) + +typedef struct IMX7GPRState { + /* <private> */ + SysBusDevice parent_obj; + + MemoryRegion mmio; +} IMX7GPRState; + +#endif /* IMX7_GPR_H */ diff --git a/include/hw/misc/imx7_snvs.h b/include/hw/misc/imx7_snvs.h new file mode 100644 index 0000000000..255f8f26f9 --- /dev/null +++ b/include/hw/misc/imx7_snvs.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2017, Impinj, Inc. + * + * i.MX7 SNVS block emulation code + * + * Author: Andrey Smirnov <andrew.smirnov@gmail.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef IMX7_SNVS_H +#define IMX7_SNVS_H + +#include "qemu/bitops.h" +#include "hw/sysbus.h" + + +enum IMX7SNVSRegisters { + SNVS_LPCR = 0x38, + SNVS_LPCR_TOP = BIT(6), + SNVS_LPCR_DP_EN = BIT(5) +}; + +#define TYPE_IMX7_SNVS "imx7.snvs" +#define IMX7_SNVS(obj) OBJECT_CHECK(IMX7SNVSState, (obj), TYPE_IMX7_SNVS) + +typedef struct IMX7SNVSState { + /* <private> */ + SysBusDevice parent_obj; + + MemoryRegion mmio; +} IMX7SNVSState; + +#endif /* IMX7_SNVS_H */ diff --git a/include/hw/s390x/s390_flic.h b/include/hw/s390x/s390_flic.h index 7aab6ef7f0..4687ecfe83 100644 --- a/include/hw/s390x/s390_flic.h +++ b/include/hw/s390x/s390_flic.h @@ -16,6 +16,7 @@ #include "hw/sysbus.h" #include "hw/s390x/adapter.h" #include "hw/virtio/virtio.h" +#include "qemu/queue.h" /* * Reserve enough gsis to accommodate all virtio devices. @@ -66,6 +67,11 @@ typedef struct S390FLICStateClass { int (*modify_ais_mode)(S390FLICState *fs, uint8_t isc, uint16_t mode); int (*inject_airq)(S390FLICState *fs, uint8_t type, uint8_t isc, uint8_t flags); + void (*inject_service)(S390FLICState *fs, uint32_t parm); + void (*inject_io)(S390FLICState *fs, uint16_t subchannel_id, + uint16_t subchannel_nr, uint32_t io_int_parm, + uint32_t io_int_word); + void (*inject_crw_mchk)(S390FLICState *fs); } S390FLICStateClass; #define TYPE_KVM_S390_FLIC "s390-flic-kvm" @@ -80,24 +86,57 @@ typedef struct S390FLICStateClass { #define SIC_IRQ_MODE_SINGLE 1 #define AIS_MODE_MASK(isc) (0x80 >> isc) +#define ISC_TO_PENDING_IO(_isc) (0x80 >> (_isc)) +#define CR6_TO_PENDING_IO(_cr6) (((_cr6) >> 24) & 0xff) + +/* organize the ISC bits so that the macros above work */ +#define FLIC_PENDING_IO_ISC7 (1 << 0) +#define FLIC_PENDING_IO_ISC6 (1 << 1) +#define FLIC_PENDING_IO_ISC5 (1 << 2) +#define FLIC_PENDING_IO_ISC4 (1 << 3) +#define FLIC_PENDING_IO_ISC3 (1 << 4) +#define FLIC_PENDING_IO_ISC2 (1 << 5) +#define FLIC_PENDING_IO_ISC1 (1 << 6) +#define FLIC_PENDING_IO_ISC0 (1 << 7) +#define FLIC_PENDING_SERVICE (1 << 8) +#define FLIC_PENDING_MCHK_CR (1 << 9) + +#define FLIC_PENDING_IO (FLIC_PENDING_IO_ISC0 | FLIC_PENDING_IO_ISC1 | \ + FLIC_PENDING_IO_ISC2 | FLIC_PENDING_IO_ISC3 | \ + FLIC_PENDING_IO_ISC4 | FLIC_PENDING_IO_ISC5 | \ + FLIC_PENDING_IO_ISC6 | FLIC_PENDING_IO_ISC7) + +typedef struct QEMUS390FlicIO { + uint16_t id; + uint16_t nr; + uint32_t parm; + uint32_t word; + QLIST_ENTRY(QEMUS390FlicIO) next; +} QEMUS390FlicIO; + typedef struct QEMUS390FLICState { S390FLICState parent_obj; + uint32_t pending; + uint32_t service_param; uint8_t simm; uint8_t nimm; + QLIST_HEAD(, QEMUS390FlicIO) io[8]; } QEMUS390FLICState; +uint32_t qemu_s390_flic_dequeue_service(QEMUS390FLICState *flic); +QEMUS390FlicIO *qemu_s390_flic_dequeue_io(QEMUS390FLICState *flic, + 
uint64_t cr6); +void qemu_s390_flic_dequeue_crw_mchk(QEMUS390FLICState *flic); +bool qemu_s390_flic_has_service(QEMUS390FLICState *flic); +bool qemu_s390_flic_has_io(QEMUS390FLICState *fs, uint64_t cr6); +bool qemu_s390_flic_has_crw_mchk(QEMUS390FLICState *flic); +bool qemu_s390_flic_has_any(QEMUS390FLICState *flic); + void s390_flic_init(void); S390FLICState *s390_get_flic(void); +QEMUS390FLICState *s390_get_qemu_flic(S390FLICState *fs); +S390FLICStateClass *s390_get_flic_class(S390FLICState *fs); bool ais_needed(void *opaque); -#ifdef CONFIG_KVM -DeviceState *s390_flic_kvm_create(void); -#else -static inline DeviceState *s390_flic_kvm_create(void) -{ - return NULL; -} -#endif - #endif /* HW_S390_FLIC_H */ diff --git a/include/hw/sd/sdhci.h b/include/hw/sd/sdhci.h index 1cf70f8c23..f8d1ba3538 100644 --- a/include/hw/sd/sdhci.h +++ b/include/hw/sd/sdhci.h @@ -44,6 +44,7 @@ typedef struct SDHCIState { AddressSpace sysbus_dma_as; AddressSpace *dma_as; MemoryRegion *dma_mr; + const MemoryRegionOps *io_ops; QEMUTimer *insert_timer; /* timer for 'changing' sd card. */ QEMUTimer *transfer_timer; @@ -91,8 +92,18 @@ typedef struct SDHCIState { /* Configurable properties */ bool pending_insert_quirk; /* Quirk for Raspberry Pi card insert int */ + uint32_t quirks; } SDHCIState; +/* + * Controller does not provide transfer-complete interrupt when not + * busy. + * + * NOTE: This definition is taken out of Linux kernel and so the + * original bit number is preserved + */ +#define SDHCI_QUIRK_NO_BUSY_IRQ BIT(14) + #define TYPE_PCI_SDHCI "sdhci-pci" #define PCI_SDHCI(obj) OBJECT_CHECK(SDHCIState, (obj), TYPE_PCI_SDHCI) @@ -100,4 +111,6 @@ typedef struct SDHCIState { #define SYSBUS_SDHCI(obj) \ OBJECT_CHECK(SDHCIState, (obj), TYPE_SYSBUS_SDHCI) +#define TYPE_IMX_USDHC "imx-usdhc" + #endif /* SDHCI_H */ diff --git a/include/hw/timer/imx_gpt.h b/include/hw/timer/imx_gpt.h index eac59b2a70..20ccb327c4 100644 --- a/include/hw/timer/imx_gpt.h +++ b/include/hw/timer/imx_gpt.h @@ -77,6 +77,7 @@ #define TYPE_IMX25_GPT "imx25.gpt" #define TYPE_IMX31_GPT "imx31.gpt" #define TYPE_IMX6_GPT "imx6.gpt" +#define TYPE_IMX7_GPT "imx7.gpt" #define TYPE_IMX_GPT TYPE_IMX25_GPT diff --git a/include/hw/usb/chipidea.h b/include/hw/usb/chipidea.h new file mode 100644 index 0000000000..1ec2e9dbda --- /dev/null +++ b/include/hw/usb/chipidea.h @@ -0,0 +1,16 @@ +#ifndef CHIPIDEA_H +#define CHIPIDEA_H + +#include "hw/usb/hcd-ehci.h" + +typedef struct ChipideaState { + /*< private >*/ + EHCISysBusState parent_obj; + + MemoryRegion iomem[3]; +} ChipideaState; + +#define TYPE_CHIPIDEA "usb-chipidea" +#define CHIPIDEA(obj) OBJECT_CHECK(ChipideaState, (obj), TYPE_CHIPIDEA) + +#endif /* CHIPIDEA_H */ diff --git a/linux-user/elfload.c b/linux-user/elfload.c index 32a47674e6..8bb9a2c3e8 100644 --- a/linux-user/elfload.c +++ b/linux-user/elfload.c @@ -512,6 +512,21 @@ enum { ARM_HWCAP_A64_SHA1 = 1 << 5, ARM_HWCAP_A64_SHA2 = 1 << 6, ARM_HWCAP_A64_CRC32 = 1 << 7, + ARM_HWCAP_A64_ATOMICS = 1 << 8, + ARM_HWCAP_A64_FPHP = 1 << 9, + ARM_HWCAP_A64_ASIMDHP = 1 << 10, + ARM_HWCAP_A64_CPUID = 1 << 11, + ARM_HWCAP_A64_ASIMDRDM = 1 << 12, + ARM_HWCAP_A64_JSCVT = 1 << 13, + ARM_HWCAP_A64_FCMA = 1 << 14, + ARM_HWCAP_A64_LRCPC = 1 << 15, + ARM_HWCAP_A64_DCPOP = 1 << 16, + ARM_HWCAP_A64_SHA3 = 1 << 17, + ARM_HWCAP_A64_SM3 = 1 << 18, + ARM_HWCAP_A64_SM4 = 1 << 19, + ARM_HWCAP_A64_ASIMDDP = 1 << 20, + ARM_HWCAP_A64_SHA512 = 1 << 21, + ARM_HWCAP_A64_SVE = 1 << 22, }; #define ELF_HWCAP get_elf_hwcap() @@ -532,6 +547,10 @@ static uint32_t get_elf_hwcap(void) 
GET_FEATURE(ARM_FEATURE_V8_SHA1, ARM_HWCAP_A64_SHA1); GET_FEATURE(ARM_FEATURE_V8_SHA256, ARM_HWCAP_A64_SHA2); GET_FEATURE(ARM_FEATURE_CRC, ARM_HWCAP_A64_CRC32); + GET_FEATURE(ARM_FEATURE_V8_SHA3, ARM_HWCAP_A64_SHA3); + GET_FEATURE(ARM_FEATURE_V8_SM3, ARM_HWCAP_A64_SM3); + GET_FEATURE(ARM_FEATURE_V8_SM4, ARM_HWCAP_A64_SM4); + GET_FEATURE(ARM_FEATURE_V8_SHA512, ARM_HWCAP_A64_SHA512); #undef GET_FEATURE return hwcaps; diff --git a/target/arm/cpu.h b/target/arm/cpu.h index d2bb59eded..521444a5a1 100644 --- a/target/arm/cpu.h +++ b/target/arm/cpu.h @@ -153,6 +153,49 @@ typedef struct { uint32_t base_mask; } TCR; +/* Define a maximum sized vector register. + * For 32-bit, this is a 128-bit NEON/AdvSIMD register. + * For 64-bit, this is a 2048-bit SVE register. + * + * Note that the mapping between S, D, and Q views of the register bank + * differs between AArch64 and AArch32. + * In AArch32: + * Qn = regs[n].d[1]:regs[n].d[0] + * Dn = regs[n / 2].d[n & 1] + * Sn = regs[n / 4].d[n % 4 / 2], + * bits 31..0 for even n, and bits 63..32 for odd n + * (and regs[16] to regs[31] are inaccessible) + * In AArch64: + * Zn = regs[n].d[*] + * Qn = regs[n].d[1]:regs[n].d[0] + * Dn = regs[n].d[0] + * Sn = regs[n].d[0] bits 31..0 + * + * This corresponds to the architecturally defined mapping between + * the two execution states, and means we do not need to explicitly + * map these registers when changing states. + * + * Align the data for use with TCG host vector operations. + */ + +#ifdef TARGET_AARCH64 +# define ARM_MAX_VQ 16 +#else +# define ARM_MAX_VQ 1 +#endif + +typedef struct ARMVectorReg { + uint64_t d[2 * ARM_MAX_VQ] QEMU_ALIGNED(16); +} ARMVectorReg; + +/* In AArch32 mode, predicate registers do not exist at all. */ +#ifdef TARGET_AARCH64 +typedef struct ARMPredicateReg { + uint64_t p[2 * ARM_MAX_VQ / 8] QEMU_ALIGNED(16); +} ARMPredicateReg; +#endif + + typedef struct CPUARMState { /* Regs for current mode. */ uint32_t regs[16]; @@ -477,22 +520,12 @@ typedef struct CPUARMState { /* VFP coprocessor state. */ struct { - /* VFP/Neon register state. Note that the mapping between S, D and Q - * views of the register bank differs between AArch64 and AArch32: - * In AArch32: - * Qn = regs[2n+1]:regs[2n] - * Dn = regs[n] - * Sn = regs[n/2] bits 31..0 for even n, and bits 63..32 for odd n - * (and regs[32] to regs[63] are inaccessible) - * In AArch64: - * Qn = regs[2n+1]:regs[2n] - * Dn = regs[2n] - * Sn = regs[2n] bits 31..0 - * This corresponds to the architecturally defined mapping between - * the two execution states, and means we do not need to explicitly - * map these registers when changing states. - */ - uint64_t regs[64]; + ARMVectorReg zregs[32]; + +#ifdef TARGET_AARCH64 + /* Store FFR as pregs[16] to make it easier to treat as any other. */ + ARMPredicateReg pregs[17]; +#endif uint32_t xregs[16]; /* We store these fpcsr fields separately for convenience. 
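A couple of worked instances of the AArch32 mapping described above, as plain index arithmetic:

#include <assert.h>

int main(void)
{
    /* Dn = regs[n / 2].d[n & 1]: D5 lives in zregs[2].d[1]. */
    assert(5 / 2 == 2 && (5 & 1) == 1);

    /* Sn = regs[n / 4].d[n % 4 / 2], even n in bits 31..0:
     * S6 is the low word of zregs[1].d[1]. */
    assert(6 / 4 == 1 && 6 % 4 / 2 == 1);
    return 0;
}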
*/ @@ -516,6 +549,9 @@ typedef struct CPUARMState { */ float_status fp_status; float_status standard_fp_status; + + /* ZCR_EL[1-3] */ + uint64_t zcr_el[4]; } vfp; uint64_t exclusive_addr; uint64_t exclusive_val; @@ -890,6 +926,8 @@ void pmccntr_sync(CPUARMState *env); #define CPTR_TCPAC (1U << 31) #define CPTR_TTA (1U << 20) #define CPTR_TFP (1U << 10) +#define CPTR_TZ (1U << 8) /* CPTR_EL2 */ +#define CPTR_EZ (1U << 8) /* CPTR_EL3 */ #define MDCR_EPMAD (1U << 21) #define MDCR_EDAD (1U << 20) @@ -1341,6 +1379,10 @@ enum arm_features { ARM_FEATURE_M_SECURITY, /* M profile Security Extension */ ARM_FEATURE_JAZELLE, /* has (trivial) Jazelle implementation */ ARM_FEATURE_SVE, /* has Scalable Vector Extension */ + ARM_FEATURE_V8_SHA512, /* implements SHA512 part of v8 Crypto Extensions */ + ARM_FEATURE_V8_SHA3, /* implements SHA3 part of v8 Crypto Extensions */ + ARM_FEATURE_V8_SM3, /* implements SM3 part of v8 Crypto Extensions */ + ARM_FEATURE_V8_SM4, /* implements SM4 part of v8 Crypto Extensions */ }; static inline int arm_feature(CPUARMState *env, int feature) @@ -1506,16 +1548,42 @@ static inline bool armv7m_nvic_can_take_pending_exception(void *opaque) */ void armv7m_nvic_set_pending(void *opaque, int irq, bool secure); /** + * armv7m_nvic_set_pending_derived: mark this derived exception as pending + * @opaque: the NVIC + * @irq: the exception number to mark pending + * @secure: false for non-banked exceptions or for the nonsecure + * version of a banked exception, true for the secure version of a banked + * exception. + * + * Similar to armv7m_nvic_set_pending(), but specifically for derived + * exceptions (exceptions generated in the course of trying to take + * a different exception). + */ +void armv7m_nvic_set_pending_derived(void *opaque, int irq, bool secure); +/** + * armv7m_nvic_get_pending_irq_info: return highest priority pending + * exception, and whether it targets Secure state + * @opaque: the NVIC + * @pirq: set to pending exception number + * @ptargets_secure: set to whether pending exception targets Secure + * + * This function writes the number of the highest priority pending + * exception (the one which would be made active by + * armv7m_nvic_acknowledge_irq()) to @pirq, and sets @ptargets_secure + * to true if the current highest priority pending exception should + * be taken to Secure state, false for NS. + */ +void armv7m_nvic_get_pending_irq_info(void *opaque, int *pirq, + bool *ptargets_secure); +/** * armv7m_nvic_acknowledge_irq: make highest priority pending exception active * @opaque: the NVIC * * Move the current highest priority pending exception from the pending * state to the active state, and update v7m.exception to indicate that * it is the exception currently being handled. 
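Splitting armv7m_nvic_get_pending_irq_info() from armv7m_nvic_acknowledge_irq() is what lets a derived exception win before entry commits; in outline, the sequence that v7m_exception_taken() below ends up with is:

/*
 *   armv7m_nvic_get_pending_irq_info(nvic, &exc, &targets_secure);
 *   push callee/caller stack frames  -- a failure pends a derived exception
 *   fetch the vector table entry     -- a failure pends a derived exception
 *   if anything was derived, restart entry so the new winner is chosen
 *   armv7m_nvic_acknowledge_irq(nvic);  -- only now activate the exception
 */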
- * - * Returns: true if exception should be taken to Secure state, false for NS */ -bool armv7m_nvic_acknowledge_irq(void *opaque); +void armv7m_nvic_acknowledge_irq(void *opaque); /** * armv7m_nvic_complete_irq: complete specified interrupt or exception * @opaque: the NVIC @@ -2610,6 +2678,10 @@ static inline bool arm_cpu_data_is_big_endian(CPUARMState *env) #define ARM_TBFLAG_TBI0_MASK (0x1ull << ARM_TBFLAG_TBI0_SHIFT) #define ARM_TBFLAG_TBI1_SHIFT 1 /* TBI1 for EL0/1 */ #define ARM_TBFLAG_TBI1_MASK (0x1ull << ARM_TBFLAG_TBI1_SHIFT) +#define ARM_TBFLAG_SVEEXC_EL_SHIFT 2 +#define ARM_TBFLAG_SVEEXC_EL_MASK (0x3 << ARM_TBFLAG_SVEEXC_EL_SHIFT) +#define ARM_TBFLAG_ZCR_LEN_SHIFT 4 +#define ARM_TBFLAG_ZCR_LEN_MASK (0xf << ARM_TBFLAG_ZCR_LEN_SHIFT) /* some convenience accessor macros */ #define ARM_TBFLAG_AARCH64_STATE(F) \ @@ -2646,6 +2718,10 @@ static inline bool arm_cpu_data_is_big_endian(CPUARMState *env) (((F) & ARM_TBFLAG_TBI0_MASK) >> ARM_TBFLAG_TBI0_SHIFT) #define ARM_TBFLAG_TBI1(F) \ (((F) & ARM_TBFLAG_TBI1_MASK) >> ARM_TBFLAG_TBI1_SHIFT) +#define ARM_TBFLAG_SVEEXC_EL(F) \ + (((F) & ARM_TBFLAG_SVEEXC_EL_MASK) >> ARM_TBFLAG_SVEEXC_EL_SHIFT) +#define ARM_TBFLAG_ZCR_LEN(F) \ + (((F) & ARM_TBFLAG_ZCR_LEN_MASK) >> ARM_TBFLAG_ZCR_LEN_SHIFT) static inline bool bswap_code(bool sctlr_b) { @@ -2769,7 +2845,7 @@ static inline void *arm_get_el_change_hook_opaque(ARMCPU *cpu) */ static inline uint64_t *aa32_vfp_dreg(CPUARMState *env, unsigned regno) { - return &env->vfp.regs[regno]; + return &env->vfp.zregs[regno >> 1].d[regno & 1]; } /** @@ -2778,7 +2854,7 @@ static inline uint64_t *aa32_vfp_dreg(CPUARMState *env, unsigned regno) */ static inline uint64_t *aa32_vfp_qreg(CPUARMState *env, unsigned regno) { - return &env->vfp.regs[2 * regno]; + return &env->vfp.zregs[regno].d[0]; } /** @@ -2787,7 +2863,7 @@ static inline uint64_t *aa32_vfp_qreg(CPUARMState *env, unsigned regno) */ static inline uint64_t *aa64_vfp_qreg(CPUARMState *env, unsigned regno) { - return &env->vfp.regs[2 * regno]; + return &env->vfp.zregs[regno].d[0]; } #endif diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c index 670c07ab6e..1c330adc28 100644 --- a/target/arm/cpu64.c +++ b/target/arm/cpu64.c @@ -224,6 +224,10 @@ static void aarch64_any_initfn(Object *obj) set_feature(&cpu->env, ARM_FEATURE_V8_AES); set_feature(&cpu->env, ARM_FEATURE_V8_SHA1); set_feature(&cpu->env, ARM_FEATURE_V8_SHA256); + set_feature(&cpu->env, ARM_FEATURE_V8_SHA512); + set_feature(&cpu->env, ARM_FEATURE_V8_SHA3); + set_feature(&cpu->env, ARM_FEATURE_V8_SM3); + set_feature(&cpu->env, ARM_FEATURE_V8_SM4); set_feature(&cpu->env, ARM_FEATURE_V8_PMULL); set_feature(&cpu->env, ARM_FEATURE_CRC); cpu->ctr = 0x80038003; /* 32 byte I and D cacheline size, VIPT icache */ diff --git a/target/arm/crypto_helper.c b/target/arm/crypto_helper.c index 9ca0bdead7..cc339ea7e0 100644 --- a/target/arm/crypto_helper.c +++ b/target/arm/crypto_helper.c @@ -1,7 +1,7 @@ /* * crypto_helper.c - emulate v8 Crypto Extensions instructions * - * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * Copyright (C) 2013 - 2018 Linaro Ltd <ard.biesheuvel@linaro.org> * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -419,3 +419,278 @@ void HELPER(crypto_sha256su1)(void *vd, void *vn, void *vm) rd[0] = d.l[0]; rd[1] = d.l[1]; } + +/* + * The SHA-512 logical functions (same as above but using 64-bit operands) + */ + +static uint64_t cho512(uint64_t x, uint64_t y, uint64_t z) +{ + return (x & (y ^ 
z)) ^ z; +} + +static uint64_t maj512(uint64_t x, uint64_t y, uint64_t z) +{ + return (x & y) | ((x | y) & z); +} + +static uint64_t S0_512(uint64_t x) +{ + return ror64(x, 28) ^ ror64(x, 34) ^ ror64(x, 39); +} + +static uint64_t S1_512(uint64_t x) +{ + return ror64(x, 14) ^ ror64(x, 18) ^ ror64(x, 41); +} + +static uint64_t s0_512(uint64_t x) +{ + return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7); +} + +static uint64_t s1_512(uint64_t x) +{ + return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6); +} + +void HELPER(crypto_sha512h)(void *vd, void *vn, void *vm) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t *rm = vm; + uint64_t d0 = rd[0]; + uint64_t d1 = rd[1]; + + d1 += S1_512(rm[1]) + cho512(rm[1], rn[0], rn[1]); + d0 += S1_512(d1 + rm[0]) + cho512(d1 + rm[0], rm[1], rn[0]); + + rd[0] = d0; + rd[1] = d1; +} + +void HELPER(crypto_sha512h2)(void *vd, void *vn, void *vm) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t *rm = vm; + uint64_t d0 = rd[0]; + uint64_t d1 = rd[1]; + + d1 += S0_512(rm[0]) + maj512(rn[0], rm[1], rm[0]); + d0 += S0_512(d1) + maj512(d1, rm[0], rm[1]); + + rd[0] = d0; + rd[1] = d1; +} + +void HELPER(crypto_sha512su0)(void *vd, void *vn) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t d0 = rd[0]; + uint64_t d1 = rd[1]; + + d0 += s0_512(rd[1]); + d1 += s0_512(rn[0]); + + rd[0] = d0; + rd[1] = d1; +} + +void HELPER(crypto_sha512su1)(void *vd, void *vn, void *vm) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t *rm = vm; + + rd[0] += s1_512(rn[0]) + rm[0]; + rd[1] += s1_512(rn[1]) + rm[1]; +} + +void HELPER(crypto_sm3partw1)(void *vd, void *vn, void *vm) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t *rm = vm; + union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; + union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + uint32_t t; + + t = CR_ST_WORD(d, 0) ^ CR_ST_WORD(n, 0) ^ ror32(CR_ST_WORD(m, 1), 17); + CR_ST_WORD(d, 0) = t ^ ror32(t, 17) ^ ror32(t, 9); + + t = CR_ST_WORD(d, 1) ^ CR_ST_WORD(n, 1) ^ ror32(CR_ST_WORD(m, 2), 17); + CR_ST_WORD(d, 1) = t ^ ror32(t, 17) ^ ror32(t, 9); + + t = CR_ST_WORD(d, 2) ^ CR_ST_WORD(n, 2) ^ ror32(CR_ST_WORD(m, 3), 17); + CR_ST_WORD(d, 2) = t ^ ror32(t, 17) ^ ror32(t, 9); + + t = CR_ST_WORD(d, 3) ^ CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(d, 0), 17); + CR_ST_WORD(d, 3) = t ^ ror32(t, 17) ^ ror32(t, 9); + + rd[0] = d.l[0]; + rd[1] = d.l[1]; +} + +void HELPER(crypto_sm3partw2)(void *vd, void *vn, void *vm) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t *rm = vm; + union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; + union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + uint32_t t = CR_ST_WORD(n, 0) ^ ror32(CR_ST_WORD(m, 0), 25); + + CR_ST_WORD(d, 0) ^= t; + CR_ST_WORD(d, 1) ^= CR_ST_WORD(n, 1) ^ ror32(CR_ST_WORD(m, 1), 25); + CR_ST_WORD(d, 2) ^= CR_ST_WORD(n, 2) ^ ror32(CR_ST_WORD(m, 2), 25); + CR_ST_WORD(d, 3) ^= CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(m, 3), 25) ^ + ror32(t, 17) ^ ror32(t, 2) ^ ror32(t, 26); + + rd[0] = d.l[0]; + rd[1] = d.l[1]; +} + +void HELPER(crypto_sm3tt)(void *vd, void *vn, void *vm, uint32_t imm2, + uint32_t opcode) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t *rm = vm; + union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; + union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + uint32_t t; + + assert(imm2 < 4); + + if (opcode == 0 || opcode == 2) { + /* SM3TT1A, SM3TT2A */ + t = par(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1)); 
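The six 64-bit helpers above are the SHA-512 logical functions of FIPS 180-4 (Ch, Maj, Sigma0, Sigma1, sigma0, sigma1). A standalone spot check of S0_512 on a one-bit input, with the rotate open-coded so nothing from qemu/bitops.h is assumed:

#include <assert.h>
#include <stdint.h>

static uint64_t ror64(uint64_t x, unsigned n)  /* stand-in for QEMU's ror64 */
{
    return (x >> n) | (x << (64 - n));
}

int main(void)
{
    /* Rotating bit 63 right by 28, 34 and 39 leaves bits 35, 29 and 24
     * set, matching Sigma0 in FIPS 180-4. */
    uint64_t s0 = ror64(1ULL << 63, 28) ^ ror64(1ULL << 63, 34)
                ^ ror64(1ULL << 63, 39);

    assert(s0 == ((1ULL << 35) | (1ULL << 29) | (1ULL << 24)));
    return 0;
}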
+ } else if (opcode == 1) { + /* SM3TT1B */ + t = maj(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1)); + } else if (opcode == 3) { + /* SM3TT2B */ + t = cho(CR_ST_WORD(d, 3), CR_ST_WORD(d, 2), CR_ST_WORD(d, 1)); + } else { + g_assert_not_reached(); + } + + t += CR_ST_WORD(d, 0) + CR_ST_WORD(m, imm2); + + CR_ST_WORD(d, 0) = CR_ST_WORD(d, 1); + + if (opcode < 2) { + /* SM3TT1A, SM3TT1B */ + t += CR_ST_WORD(n, 3) ^ ror32(CR_ST_WORD(d, 3), 20); + + CR_ST_WORD(d, 1) = ror32(CR_ST_WORD(d, 2), 23); + } else { + /* SM3TT2A, SM3TT2B */ + t += CR_ST_WORD(n, 3); + t ^= rol32(t, 9) ^ rol32(t, 17); + + CR_ST_WORD(d, 1) = ror32(CR_ST_WORD(d, 2), 13); + } + + CR_ST_WORD(d, 2) = CR_ST_WORD(d, 3); + CR_ST_WORD(d, 3) = t; + + rd[0] = d.l[0]; + rd[1] = d.l[1]; +} + +static uint8_t const sm4_sbox[] = { + 0xd6, 0x90, 0xe9, 0xfe, 0xcc, 0xe1, 0x3d, 0xb7, + 0x16, 0xb6, 0x14, 0xc2, 0x28, 0xfb, 0x2c, 0x05, + 0x2b, 0x67, 0x9a, 0x76, 0x2a, 0xbe, 0x04, 0xc3, + 0xaa, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99, + 0x9c, 0x42, 0x50, 0xf4, 0x91, 0xef, 0x98, 0x7a, + 0x33, 0x54, 0x0b, 0x43, 0xed, 0xcf, 0xac, 0x62, + 0xe4, 0xb3, 0x1c, 0xa9, 0xc9, 0x08, 0xe8, 0x95, + 0x80, 0xdf, 0x94, 0xfa, 0x75, 0x8f, 0x3f, 0xa6, + 0x47, 0x07, 0xa7, 0xfc, 0xf3, 0x73, 0x17, 0xba, + 0x83, 0x59, 0x3c, 0x19, 0xe6, 0x85, 0x4f, 0xa8, + 0x68, 0x6b, 0x81, 0xb2, 0x71, 0x64, 0xda, 0x8b, + 0xf8, 0xeb, 0x0f, 0x4b, 0x70, 0x56, 0x9d, 0x35, + 0x1e, 0x24, 0x0e, 0x5e, 0x63, 0x58, 0xd1, 0xa2, + 0x25, 0x22, 0x7c, 0x3b, 0x01, 0x21, 0x78, 0x87, + 0xd4, 0x00, 0x46, 0x57, 0x9f, 0xd3, 0x27, 0x52, + 0x4c, 0x36, 0x02, 0xe7, 0xa0, 0xc4, 0xc8, 0x9e, + 0xea, 0xbf, 0x8a, 0xd2, 0x40, 0xc7, 0x38, 0xb5, + 0xa3, 0xf7, 0xf2, 0xce, 0xf9, 0x61, 0x15, 0xa1, + 0xe0, 0xae, 0x5d, 0xa4, 0x9b, 0x34, 0x1a, 0x55, + 0xad, 0x93, 0x32, 0x30, 0xf5, 0x8c, 0xb1, 0xe3, + 0x1d, 0xf6, 0xe2, 0x2e, 0x82, 0x66, 0xca, 0x60, + 0xc0, 0x29, 0x23, 0xab, 0x0d, 0x53, 0x4e, 0x6f, + 0xd5, 0xdb, 0x37, 0x45, 0xde, 0xfd, 0x8e, 0x2f, + 0x03, 0xff, 0x6a, 0x72, 0x6d, 0x6c, 0x5b, 0x51, + 0x8d, 0x1b, 0xaf, 0x92, 0xbb, 0xdd, 0xbc, 0x7f, + 0x11, 0xd9, 0x5c, 0x41, 0x1f, 0x10, 0x5a, 0xd8, + 0x0a, 0xc1, 0x31, 0x88, 0xa5, 0xcd, 0x7b, 0xbd, + 0x2d, 0x74, 0xd0, 0x12, 0xb8, 0xe5, 0xb4, 0xb0, + 0x89, 0x69, 0x97, 0x4a, 0x0c, 0x96, 0x77, 0x7e, + 0x65, 0xb9, 0xf1, 0x09, 0xc5, 0x6e, 0xc6, 0x84, + 0x18, 0xf0, 0x7d, 0xec, 0x3a, 0xdc, 0x4d, 0x20, + 0x79, 0xee, 0x5f, 0x3e, 0xd7, 0xcb, 0x39, 0x48, +}; + +void HELPER(crypto_sm4e)(void *vd, void *vn) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + union CRYPTO_STATE d = { .l = { rd[0], rd[1] } }; + union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; + uint32_t t, i; + + for (i = 0; i < 4; i++) { + t = CR_ST_WORD(d, (i + 1) % 4) ^ + CR_ST_WORD(d, (i + 2) % 4) ^ + CR_ST_WORD(d, (i + 3) % 4) ^ + CR_ST_WORD(n, i); + + t = sm4_sbox[t & 0xff] | + sm4_sbox[(t >> 8) & 0xff] << 8 | + sm4_sbox[(t >> 16) & 0xff] << 16 | + sm4_sbox[(t >> 24) & 0xff] << 24; + + CR_ST_WORD(d, i) ^= t ^ rol32(t, 2) ^ rol32(t, 10) ^ rol32(t, 18) ^ + rol32(t, 24); + } + + rd[0] = d.l[0]; + rd[1] = d.l[1]; +} + +void HELPER(crypto_sm4ekey)(void *vd, void *vn, void* vm) +{ + uint64_t *rd = vd; + uint64_t *rn = vn; + uint64_t *rm = vm; + union CRYPTO_STATE d; + union CRYPTO_STATE n = { .l = { rn[0], rn[1] } }; + union CRYPTO_STATE m = { .l = { rm[0], rm[1] } }; + uint32_t t, i; + + d = n; + for (i = 0; i < 4; i++) { + t = CR_ST_WORD(d, (i + 1) % 4) ^ + CR_ST_WORD(d, (i + 2) % 4) ^ + CR_ST_WORD(d, (i + 3) % 4) ^ + CR_ST_WORD(m, i); + + t = sm4_sbox[t & 0xff] | + sm4_sbox[(t >> 8) & 0xff] << 8 | + sm4_sbox[(t >> 16) & 
0xff] << 16 | + sm4_sbox[(t >> 24) & 0xff] << 24; + + CR_ST_WORD(d, i) ^= t ^ rol32(t, 13) ^ rol32(t, 23); + } + + rd[0] = d.l[0]; + rd[1] = d.l[1]; +} diff --git a/target/arm/helper.c b/target/arm/helper.c index bfce09643b..180ab75458 100644 --- a/target/arm/helper.c +++ b/target/arm/helper.c @@ -4266,6 +4266,125 @@ static const ARMCPRegInfo debug_lpae_cp_reginfo[] = { REGINFO_SENTINEL }; +/* Return the exception level to which SVE-disabled exceptions should + * be taken, or 0 if SVE is enabled. + */ +static int sve_exception_el(CPUARMState *env) +{ +#ifndef CONFIG_USER_ONLY + unsigned current_el = arm_current_el(env); + + /* The CPACR.ZEN controls traps to EL1: + * 0, 2 : trap EL0 and EL1 accesses + * 1 : trap only EL0 accesses + * 3 : trap no accesses + */ + switch (extract32(env->cp15.cpacr_el1, 16, 2)) { + default: + if (current_el <= 1) { + /* Trap to PL1, which might be EL1 or EL3 */ + if (arm_is_secure(env) && !arm_el_is_aa64(env, 3)) { + return 3; + } + return 1; + } + break; + case 1: + if (current_el == 0) { + return 1; + } + break; + case 3: + break; + } + + /* Similarly for CPACR.FPEN, after having checked ZEN. */ + switch (extract32(env->cp15.cpacr_el1, 20, 2)) { + default: + if (current_el <= 1) { + if (arm_is_secure(env) && !arm_el_is_aa64(env, 3)) { + return 3; + } + return 1; + } + break; + case 1: + if (current_el == 0) { + return 1; + } + break; + case 3: + break; + } + + /* CPTR_EL2. Check both TZ and TFP. */ + if (current_el <= 2 + && (env->cp15.cptr_el[2] & (CPTR_TFP | CPTR_TZ)) + && !arm_is_secure_below_el3(env)) { + return 2; + } + + /* CPTR_EL3. Check both EZ and TFP. */ + if (!(env->cp15.cptr_el[3] & CPTR_EZ) + || (env->cp15.cptr_el[3] & CPTR_TFP)) { + return 3; + } +#endif + return 0; +} + +static CPAccessResult zcr_access(CPUARMState *env, const ARMCPRegInfo *ri, + bool isread) +{ + switch (sve_exception_el(env)) { + case 3: + return CP_ACCESS_TRAP_EL3; + case 2: + return CP_ACCESS_TRAP_EL2; + case 1: + return CP_ACCESS_TRAP; + } + return CP_ACCESS_OK; +} + +static void zcr_write(CPUARMState *env, const ARMCPRegInfo *ri, + uint64_t value) +{ + /* Bits other than [3:0] are RAZ/WI. 
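ZCR_ELx.LEN holds the vector length in quadwords minus one, and higher exception levels can only constrain it downwards. A minimal sketch of the clamping that cpu_get_tb_cpu_state() performs later in this patch (the function name here is illustrative):

#include <assert.h>

static unsigned effective_len(unsigned zcr_el1, unsigned zcr_el2,
                              unsigned zcr_el3)
{
    unsigned len = zcr_el1 & 0xf;

    if ((zcr_el2 & 0xf) < len) {   /* applies only when EL2 is implemented */
        len = zcr_el2 & 0xf;
    }
    if ((zcr_el3 & 0xf) < len) {   /* applies only when EL3 is implemented */
        len = zcr_el3 & 0xf;
    }
    return len;
}

int main(void)
{
    /* LEN = quadwords - 1, so a result of 3 means a 4 * 128 = 512-bit VL. */
    assert(effective_len(5, 3, 0xf) == 3);
    return 0;
}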
*/ + raw_write(env, ri, value & 0xf); +} + +static const ARMCPRegInfo zcr_el1_reginfo = { + .name = "ZCR_EL1", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 0, .crn = 1, .crm = 2, .opc2 = 0, + .access = PL1_RW, .accessfn = zcr_access, .type = ARM_CP_64BIT, + .fieldoffset = offsetof(CPUARMState, vfp.zcr_el[1]), + .writefn = zcr_write, .raw_writefn = raw_write +}; + +static const ARMCPRegInfo zcr_el2_reginfo = { + .name = "ZCR_EL2", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 4, .crn = 1, .crm = 2, .opc2 = 0, + .access = PL2_RW, .accessfn = zcr_access, .type = ARM_CP_64BIT, + .fieldoffset = offsetof(CPUARMState, vfp.zcr_el[2]), + .writefn = zcr_write, .raw_writefn = raw_write +}; + +static const ARMCPRegInfo zcr_no_el2_reginfo = { + .name = "ZCR_EL2", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 4, .crn = 1, .crm = 2, .opc2 = 0, + .access = PL2_RW, .type = ARM_CP_64BIT, + .readfn = arm_cp_read_zero, .writefn = arm_cp_write_ignore +}; + +static const ARMCPRegInfo zcr_el3_reginfo = { + .name = "ZCR_EL3", .state = ARM_CP_STATE_AA64, + .opc0 = 3, .opc1 = 6, .crn = 1, .crm = 2, .opc2 = 0, + .access = PL3_RW, .accessfn = zcr_access, .type = ARM_CP_64BIT, + .fieldoffset = offsetof(CPUARMState, vfp.zcr_el[3]), + .writefn = zcr_write, .raw_writefn = raw_write +}; + void hw_watchpoint_update(ARMCPU *cpu, int n) { CPUARMState *env = &cpu->env; @@ -5332,6 +5451,18 @@ void register_cp_regs_for_features(ARMCPU *cpu) } define_one_arm_cp_reg(cpu, &sctlr); } + + if (arm_feature(env, ARM_FEATURE_SVE)) { + define_one_arm_cp_reg(cpu, &zcr_el1_reginfo); + if (arm_feature(env, ARM_FEATURE_EL2)) { + define_one_arm_cp_reg(cpu, &zcr_el2_reginfo); + } else { + define_one_arm_cp_reg(cpu, &zcr_no_el2_reginfo); + } + if (arm_feature(env, ARM_FEATURE_EL3)) { + define_one_arm_cp_reg(cpu, &zcr_el3_reginfo); + } + } } void arm_cpu_register_gdb_regs_for_features(ARMCPU *cpu) @@ -6161,12 +6292,127 @@ uint32_t arm_phys_excp_target_el(CPUState *cs, uint32_t excp_idx, return target_el; } -static void v7m_push(CPUARMState *env, uint32_t val) +static bool v7m_stack_write(ARMCPU *cpu, uint32_t addr, uint32_t value, + ARMMMUIdx mmu_idx, bool ignfault) { - CPUState *cs = CPU(arm_env_get_cpu(env)); + CPUState *cs = CPU(cpu); + CPUARMState *env = &cpu->env; + MemTxAttrs attrs = {}; + MemTxResult txres; + target_ulong page_size; + hwaddr physaddr; + int prot; + ARMMMUFaultInfo fi; + bool secure = mmu_idx & ARM_MMU_IDX_M_S; + int exc; + bool exc_secure; + + if (get_phys_addr(env, addr, MMU_DATA_STORE, mmu_idx, &physaddr, + &attrs, &prot, &page_size, &fi, NULL)) { + /* MPU/SAU lookup failed */ + if (fi.type == ARMFault_QEMU_SFault) { + qemu_log_mask(CPU_LOG_INT, + "...SecureFault with SFSR.AUVIOL during stacking\n"); + env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK | R_V7M_SFSR_SFARVALID_MASK; + env->v7m.sfar = addr; + exc = ARMV7M_EXCP_SECURE; + exc_secure = false; + } else { + qemu_log_mask(CPU_LOG_INT, "...MemManageFault with CFSR.MSTKERR\n"); + env->v7m.cfsr[secure] |= R_V7M_CFSR_MSTKERR_MASK; + exc = ARMV7M_EXCP_MEM; + exc_secure = secure; + } + goto pend_fault; + } + address_space_stl_le(arm_addressspace(cs, attrs), physaddr, value, + attrs, &txres); + if (txres != MEMTX_OK) { + /* BusFault trying to write the data */ + qemu_log_mask(CPU_LOG_INT, "...BusFault with BFSR.STKERR\n"); + env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_STKERR_MASK; + exc = ARMV7M_EXCP_BUS; + exc_secure = false; + goto pend_fault; + } + return true; + +pend_fault: + /* By pending the exception at this point we are making + * the IMPDEF choice "overridden 
exceptions pended" (see the + * MergeExcInfo() pseudocode). The other choice would be to not + * pend them now and then make a choice about which to throw away + * later if we have two derived exceptions. + * The only case when we must not pend the exception but instead + * throw it away is if we are doing the push of the callee registers + * and we've already generated a derived exception. Even in this + * case we will still update the fault status registers. + */ + if (!ignfault) { + armv7m_nvic_set_pending_derived(env->nvic, exc, exc_secure); + } + return false; +} + +static bool v7m_stack_read(ARMCPU *cpu, uint32_t *dest, uint32_t addr, + ARMMMUIdx mmu_idx) +{ + CPUState *cs = CPU(cpu); + CPUARMState *env = &cpu->env; + MemTxAttrs attrs = {}; + MemTxResult txres; + target_ulong page_size; + hwaddr physaddr; + int prot; + ARMMMUFaultInfo fi; + bool secure = mmu_idx & ARM_MMU_IDX_M_S; + int exc; + bool exc_secure; + uint32_t value; + + if (get_phys_addr(env, addr, MMU_DATA_LOAD, mmu_idx, &physaddr, + &attrs, &prot, &page_size, &fi, NULL)) { + /* MPU/SAU lookup failed */ + if (fi.type == ARMFault_QEMU_SFault) { + qemu_log_mask(CPU_LOG_INT, + "...SecureFault with SFSR.AUVIOL during unstack\n"); + env->v7m.sfsr |= R_V7M_SFSR_AUVIOL_MASK | R_V7M_SFSR_SFARVALID_MASK; + env->v7m.sfar = addr; + exc = ARMV7M_EXCP_SECURE; + exc_secure = false; + } else { + qemu_log_mask(CPU_LOG_INT, + "...MemManageFault with CFSR.MUNSTKERR\n"); + env->v7m.cfsr[secure] |= R_V7M_CFSR_MUNSTKERR_MASK; + exc = ARMV7M_EXCP_MEM; + exc_secure = secure; + } + goto pend_fault; + } - env->regs[13] -= 4; - stl_phys(cs->as, env->regs[13], val); + value = address_space_ldl(arm_addressspace(cs, attrs), physaddr, + attrs, &txres); + if (txres != MEMTX_OK) { + /* BusFault trying to read the data */ + qemu_log_mask(CPU_LOG_INT, "...BusFault with BFSR.UNSTKERR\n"); + env->v7m.cfsr[M_REG_NS] |= R_V7M_CFSR_UNSTKERR_MASK; + exc = ARMV7M_EXCP_BUS; + exc_secure = false; + goto pend_fault; + } + + *dest = value; + return true; + +pend_fault: + /* By pending the exception at this point we are making + * the IMPDEF choice "overridden exceptions pended" (see the + * MergeExcInfo() pseudocode). The other choice would be to not + * pend them now and then make a choice about which to throw away + * later if we have two derived exceptions. + */ + armv7m_nvic_set_pending(env->nvic, exc, exc_secure); + return false; } /* Return true if we're using the process stack pointer (not the MSP) */ @@ -6395,65 +6641,126 @@ static uint32_t *get_v7m_sp_ptr(CPUARMState *env, bool secure, bool threadmode, } } -static uint32_t arm_v7m_load_vector(ARMCPU *cpu, bool targets_secure) +static bool arm_v7m_load_vector(ARMCPU *cpu, int exc, bool targets_secure, + uint32_t *pvec) { CPUState *cs = CPU(cpu); CPUARMState *env = &cpu->env; MemTxResult result; - hwaddr vec = env->v7m.vecbase[targets_secure] + env->v7m.exception * 4; - uint32_t addr; + uint32_t addr = env->v7m.vecbase[targets_secure] + exc * 4; + uint32_t vector_entry; + MemTxAttrs attrs = {}; + ARMMMUIdx mmu_idx; + bool exc_secure; + + mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, targets_secure, true); + + /* We don't do a get_phys_addr() here because the rules for vector + * loads are special: they always use the default memory map, and + * the default memory map permits reads from all addresses. + * Since there's no easy way to pass through to pmsav8_mpu_lookup() + * that we want this special case which would always say "yes", + * we just do the SAU lookup here followed by a direct physical load. 
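In outline, the security handling of the vector fetch below works out as:

/*
 *   SAU marks the address NS   -> fetch with non-secure attributes
 *   NS caller, address is S    -> fail the fetch (NS access to S memory)
 *   memory transaction error   -> fail the fetch
 *   any failure                -> HFSR.VECTTBL | HFSR.FORCED and a
 *                                 derived HardFault, pended not taken
 */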
+ */ + attrs.secure = targets_secure; + attrs.user = false; - addr = address_space_ldl(cs->as, vec, - MEMTXATTRS_UNSPECIFIED, &result); + if (arm_feature(env, ARM_FEATURE_M_SECURITY)) { + V8M_SAttributes sattrs = {}; + + v8m_security_lookup(env, addr, MMU_DATA_LOAD, mmu_idx, &sattrs); + if (sattrs.ns) { + attrs.secure = false; + } else if (!targets_secure) { + /* NS access to S memory */ + goto load_fail; + } + } + + vector_entry = address_space_ldl(arm_addressspace(cs, attrs), addr, + attrs, &result); if (result != MEMTX_OK) { - /* Architecturally this should cause a HardFault setting HSFR.VECTTBL, - * which would then be immediately followed by our failing to load - * the entry vector for that HardFault, which is a Lockup case. - * Since we don't model Lockup, we just report this guest error - * via cpu_abort(). - */ - cpu_abort(cs, "Failed to read from %s exception vector table " - "entry %08x\n", targets_secure ? "secure" : "nonsecure", - (unsigned)vec); + goto load_fail; } - return addr; + *pvec = vector_entry; + return true; + +load_fail: + /* All vector table fetch fails are reported as HardFault, with + * HFSR.VECTTBL and .FORCED set. (FORCED is set because + * technically the underlying exception is a MemManage or BusFault + * that is escalated to HardFault.) This is a terminal exception, + * so we will either take the HardFault immediately or else enter + * lockup (the latter case is handled in armv7m_nvic_set_pending_derived()). + */ + exc_secure = targets_secure || + !(cpu->env.v7m.aircr & R_V7M_AIRCR_BFHFNMINS_MASK); + env->v7m.hfsr |= R_V7M_HFSR_VECTTBL_MASK | R_V7M_HFSR_FORCED_MASK; + armv7m_nvic_set_pending_derived(env->nvic, ARMV7M_EXCP_HARD, exc_secure); + return false; } -static void v7m_push_callee_stack(ARMCPU *cpu, uint32_t lr, bool dotailchain) +static bool v7m_push_callee_stack(ARMCPU *cpu, uint32_t lr, bool dotailchain, + bool ignore_faults) { /* For v8M, push the callee-saves register part of the stack frame. * Compare the v8M pseudocode PushCalleeStack(). * In the tailchaining case this may not be the current stack. */ CPUARMState *env = &cpu->env; - CPUState *cs = CPU(cpu); uint32_t *frame_sp_p; uint32_t frameptr; + ARMMMUIdx mmu_idx; + bool stacked_ok; if (dotailchain) { - frame_sp_p = get_v7m_sp_ptr(env, true, - lr & R_V7M_EXCRET_MODE_MASK, + bool mode = lr & R_V7M_EXCRET_MODE_MASK; + bool priv = !(env->v7m.control[M_REG_S] & R_V7M_CONTROL_NPRIV_MASK) || + !mode; + + mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, M_REG_S, priv); + frame_sp_p = get_v7m_sp_ptr(env, M_REG_S, mode, lr & R_V7M_EXCRET_SPSEL_MASK); } else { + mmu_idx = core_to_arm_mmu_idx(env, cpu_mmu_index(env, false)); frame_sp_p = &env->regs[13]; } frameptr = *frame_sp_p - 0x28; - stl_phys(cs->as, frameptr, 0xfefa125b); - stl_phys(cs->as, frameptr + 0x8, env->regs[4]); - stl_phys(cs->as, frameptr + 0xc, env->regs[5]); - stl_phys(cs->as, frameptr + 0x10, env->regs[6]); - stl_phys(cs->as, frameptr + 0x14, env->regs[7]); - stl_phys(cs->as, frameptr + 0x18, env->regs[8]); - stl_phys(cs->as, frameptr + 0x1c, env->regs[9]); - stl_phys(cs->as, frameptr + 0x20, env->regs[10]); - stl_phys(cs->as, frameptr + 0x24, env->regs[11]); - + /* Write as much of the stack frame as we can. A write failure may + * cause us to pend a derived exception. 
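For reference, the 0x28-byte callee-saves frame laid out below is:

/*
 *   frameptr + 0x00          integrity signature 0xfefa125b
 *   frameptr + 0x04          (reserved, not written)
 *   frameptr + 0x08 .. 0x24  r4 r5 r6 r7 r8 r9 r10 r11
 */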
+ */ + stacked_ok = + v7m_stack_write(cpu, frameptr, 0xfefa125b, mmu_idx, ignore_faults) && + v7m_stack_write(cpu, frameptr + 0x8, env->regs[4], mmu_idx, + ignore_faults) && + v7m_stack_write(cpu, frameptr + 0xc, env->regs[5], mmu_idx, + ignore_faults) && + v7m_stack_write(cpu, frameptr + 0x10, env->regs[6], mmu_idx, + ignore_faults) && + v7m_stack_write(cpu, frameptr + 0x14, env->regs[7], mmu_idx, + ignore_faults) && + v7m_stack_write(cpu, frameptr + 0x18, env->regs[8], mmu_idx, + ignore_faults) && + v7m_stack_write(cpu, frameptr + 0x1c, env->regs[9], mmu_idx, + ignore_faults) && + v7m_stack_write(cpu, frameptr + 0x20, env->regs[10], mmu_idx, + ignore_faults) && + v7m_stack_write(cpu, frameptr + 0x24, env->regs[11], mmu_idx, + ignore_faults); + + /* Update SP regardless of whether any of the stack accesses failed. + * When we implement v8M stack limit checking then this attempt to + * update SP might also fail and result in a derived exception. + */ *frame_sp_p = frameptr; + + return !stacked_ok; } -static void v7m_exception_taken(ARMCPU *cpu, uint32_t lr, bool dotailchain) +static void v7m_exception_taken(ARMCPU *cpu, uint32_t lr, bool dotailchain, + bool ignore_stackfaults) { /* Do the "take the exception" parts of exception entry, * but not the pushing of state to the stack. This is @@ -6462,8 +6769,10 @@ static void v7m_exception_taken(ARMCPU *cpu, uint32_t lr, bool dotailchain) CPUARMState *env = &cpu->env; uint32_t addr; bool targets_secure; + int exc; + bool push_failed = false; - targets_secure = armv7m_nvic_acknowledge_irq(env->nvic); + armv7m_nvic_get_pending_irq_info(env->nvic, &exc, &targets_secure); if (arm_feature(env, ARM_FEATURE_V8)) { if (arm_feature(env, ARM_FEATURE_M_SECURITY) && @@ -6489,7 +6798,8 @@ static void v7m_exception_taken(ARMCPU *cpu, uint32_t lr, bool dotailchain) */ if (lr & R_V7M_EXCRET_DCRS_MASK && !(dotailchain && (lr & R_V7M_EXCRET_ES_MASK))) { - v7m_push_callee_stack(cpu, lr, dotailchain); + push_failed = v7m_push_callee_stack(cpu, lr, dotailchain, + ignore_stackfaults); } lr |= R_V7M_EXCRET_DCRS_MASK; } @@ -6531,6 +6841,27 @@ static void v7m_exception_taken(ARMCPU *cpu, uint32_t lr, bool dotailchain) } } + if (push_failed && !ignore_stackfaults) { + /* Derived exception on callee-saves register stacking: + * we might now want to take a different exception which + * targets a different security state, so try again from the top. + */ + v7m_exception_taken(cpu, lr, true, true); + return; + } + + if (!arm_v7m_load_vector(cpu, exc, targets_secure, &addr)) { + /* Vector load failed: derived exception */ + v7m_exception_taken(cpu, lr, true, true); + return; + } + + /* Now we've done everything that might cause a derived exception + * we can go ahead and activate whichever exception we're going to + * take (which might now be the derived exception). + */ + armv7m_nvic_acknowledge_irq(env->nvic); + /* Switch to target security state -- must do this before writing SPSEL */ switch_v7m_security_state(env, targets_secure); write_v7m_control_spsel(env, 0); @@ -6538,34 +6869,55 @@ static void v7m_exception_taken(ARMCPU *cpu, uint32_t lr, bool dotailchain) /* Clear IT bits */ env->condexec_bits = 0; env->regs[14] = lr; - addr = arm_v7m_load_vector(cpu, targets_secure); env->regs[15] = addr & 0xfffffffe; env->thumb = addr & 1; } -static void v7m_push_stack(ARMCPU *cpu) +static bool v7m_push_stack(ARMCPU *cpu) { /* Do the "set up stack frame" part of exception entry, * similar to pseudocode PushStack(). 
+ * Return true if we generate a derived exception (and so + * should ignore further stack faults trying to process + * that derived exception.) */ + bool stacked_ok; CPUARMState *env = &cpu->env; uint32_t xpsr = xpsr_read(env); + uint32_t frameptr = env->regs[13]; + ARMMMUIdx mmu_idx = core_to_arm_mmu_idx(env, cpu_mmu_index(env, false)); /* Align stack pointer if the guest wants that */ - if ((env->regs[13] & 4) && + if ((frameptr & 4) && (env->v7m.ccr[env->v7m.secure] & R_V7M_CCR_STKALIGN_MASK)) { - env->regs[13] -= 4; + frameptr -= 4; xpsr |= XPSR_SPREALIGN; } - /* Switch to the handler mode. */ - v7m_push(env, xpsr); - v7m_push(env, env->regs[15]); - v7m_push(env, env->regs[14]); - v7m_push(env, env->regs[12]); - v7m_push(env, env->regs[3]); - v7m_push(env, env->regs[2]); - v7m_push(env, env->regs[1]); - v7m_push(env, env->regs[0]); + + frameptr -= 0x20; + + /* Write as much of the stack frame as we can. If we fail a stack + * write this will result in a derived exception being pended + * (which may be taken in preference to the one we started with + * if it has higher priority). + */ + stacked_ok = + v7m_stack_write(cpu, frameptr, env->regs[0], mmu_idx, false) && + v7m_stack_write(cpu, frameptr + 4, env->regs[1], mmu_idx, false) && + v7m_stack_write(cpu, frameptr + 8, env->regs[2], mmu_idx, false) && + v7m_stack_write(cpu, frameptr + 12, env->regs[3], mmu_idx, false) && + v7m_stack_write(cpu, frameptr + 16, env->regs[12], mmu_idx, false) && + v7m_stack_write(cpu, frameptr + 20, env->regs[14], mmu_idx, false) && + v7m_stack_write(cpu, frameptr + 24, env->regs[15], mmu_idx, false) && + v7m_stack_write(cpu, frameptr + 28, xpsr, mmu_idx, false); + + /* Update SP regardless of whether any of the stack accesses failed. + * When we implement v8M stack limit checking then this attempt to + * update SP might also fail and result in a derived exception. 
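For reference, the 0x20-byte hardware frame written above is, low address first:

/*
 *   +0x00 r0    +0x04 r1    +0x08 r2              +0x0c r3
 *   +0x10 r12   +0x14 lr    +0x18 return address  +0x1c xPSR
 *
 * XPSR_SPREALIGN in the pushed xPSR records whether the optional 4-byte
 * realignment above was applied.
 */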
+ */ + env->regs[13] = frameptr; + + return !stacked_ok; } static void do_v7m_exception_exit(ARMCPU *cpu) @@ -6711,7 +7063,7 @@ static void do_v7m_exception_exit(ARMCPU *cpu) if (sfault) { env->v7m.sfsr |= R_V7M_SFSR_INVER_MASK; armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); - v7m_exception_taken(cpu, excret, true); + v7m_exception_taken(cpu, excret, true, false); qemu_log_mask(CPU_LOG_INT, "...taking SecureFault on existing " "stackframe: failed EXC_RETURN.ES validity check\n"); return; @@ -6723,7 +7075,7 @@ static void do_v7m_exception_exit(ARMCPU *cpu) */ env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK; armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); - v7m_exception_taken(cpu, excret, true); + v7m_exception_taken(cpu, excret, true, false); qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing " "stackframe: failed exception return integrity check\n"); return; @@ -6752,6 +7104,11 @@ static void do_v7m_exception_exit(ARMCPU *cpu) !return_to_handler, return_to_sp_process); uint32_t frameptr = *frame_sp_p; + bool pop_ok = true; + ARMMMUIdx mmu_idx; + + mmu_idx = arm_v7m_mmu_idx_for_secstate_and_priv(env, return_to_secure, + !return_to_handler); if (!QEMU_IS_ALIGNED(frameptr, 8) && arm_feature(env, ARM_FEATURE_V8)) { @@ -6771,36 +7128,45 @@ static void do_v7m_exception_exit(ARMCPU *cpu) /* Take a SecureFault on the current stack */ env->v7m.sfsr |= R_V7M_SFSR_INVIS_MASK; armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_SECURE, false); - v7m_exception_taken(cpu, excret, true); + v7m_exception_taken(cpu, excret, true, false); qemu_log_mask(CPU_LOG_INT, "...taking SecureFault on existing " "stackframe: failed exception return integrity " "signature check\n"); return; } - env->regs[4] = ldl_phys(cs->as, frameptr + 0x8); - env->regs[5] = ldl_phys(cs->as, frameptr + 0xc); - env->regs[6] = ldl_phys(cs->as, frameptr + 0x10); - env->regs[7] = ldl_phys(cs->as, frameptr + 0x14); - env->regs[8] = ldl_phys(cs->as, frameptr + 0x18); - env->regs[9] = ldl_phys(cs->as, frameptr + 0x1c); - env->regs[10] = ldl_phys(cs->as, frameptr + 0x20); - env->regs[11] = ldl_phys(cs->as, frameptr + 0x24); + pop_ok = + v7m_stack_read(cpu, &env->regs[4], frameptr + 0x8, mmu_idx) && + v7m_stack_read(cpu, &env->regs[4], frameptr + 0x8, mmu_idx) && + v7m_stack_read(cpu, &env->regs[5], frameptr + 0xc, mmu_idx) && + v7m_stack_read(cpu, &env->regs[6], frameptr + 0x10, mmu_idx) && + v7m_stack_read(cpu, &env->regs[7], frameptr + 0x14, mmu_idx) && + v7m_stack_read(cpu, &env->regs[8], frameptr + 0x18, mmu_idx) && + v7m_stack_read(cpu, &env->regs[9], frameptr + 0x1c, mmu_idx) && + v7m_stack_read(cpu, &env->regs[10], frameptr + 0x20, mmu_idx) && + v7m_stack_read(cpu, &env->regs[11], frameptr + 0x24, mmu_idx); frameptr += 0x28; } - /* Pop registers. TODO: make these accesses use the correct - * attributes and address space (S/NS, priv/unpriv) and handle - * memory transaction failures. 
- */ - env->regs[0] = ldl_phys(cs->as, frameptr); - env->regs[1] = ldl_phys(cs->as, frameptr + 0x4); - env->regs[2] = ldl_phys(cs->as, frameptr + 0x8); - env->regs[3] = ldl_phys(cs->as, frameptr + 0xc); - env->regs[12] = ldl_phys(cs->as, frameptr + 0x10); - env->regs[14] = ldl_phys(cs->as, frameptr + 0x14); - env->regs[15] = ldl_phys(cs->as, frameptr + 0x18); + /* Pop registers */ + pop_ok = pop_ok && + v7m_stack_read(cpu, &env->regs[0], frameptr, mmu_idx) && + v7m_stack_read(cpu, &env->regs[1], frameptr + 0x4, mmu_idx) && + v7m_stack_read(cpu, &env->regs[2], frameptr + 0x8, mmu_idx) && + v7m_stack_read(cpu, &env->regs[3], frameptr + 0xc, mmu_idx) && + v7m_stack_read(cpu, &env->regs[12], frameptr + 0x10, mmu_idx) && + v7m_stack_read(cpu, &env->regs[14], frameptr + 0x14, mmu_idx) && + v7m_stack_read(cpu, &env->regs[15], frameptr + 0x18, mmu_idx) && + v7m_stack_read(cpu, &xpsr, frameptr + 0x1c, mmu_idx); + + if (!pop_ok) { + /* v7m_stack_read() pended a fault, so take it (as a tail + * chained exception on the same stack frame) + */ + v7m_exception_taken(cpu, excret, true, false); + return; + } /* Returning from an exception with a PC with bit 0 set is defined * behaviour on v8M (bit 0 is ignored), but for v7M it was specified @@ -6819,8 +7185,6 @@ static void do_v7m_exception_exit(ARMCPU *cpu) } } - xpsr = ldl_phys(cs->as, frameptr + 0x1c); - if (arm_feature(env, ARM_FEATURE_V8)) { /* For v8M we have to check whether the xPSR exception field * matches the EXCRET value for return to handler/thread @@ -6836,7 +7200,7 @@ static void do_v7m_exception_exit(ARMCPU *cpu) armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, env->v7m.secure); env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK; - v7m_exception_taken(cpu, excret, true); + v7m_exception_taken(cpu, excret, true, false); qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on existing " "stackframe: failed exception return integrity " "check\n"); @@ -6869,11 +7233,13 @@ static void do_v7m_exception_exit(ARMCPU *cpu) /* Take an INVPC UsageFault by pushing the stack again; * we know we're v7M so this is never a Secure UsageFault. */ + bool ignore_stackfaults; + assert(!arm_feature(env, ARM_FEATURE_V8)); armv7m_nvic_set_pending(env->nvic, ARMV7M_EXCP_USAGE, false); env->v7m.cfsr[env->v7m.secure] |= R_V7M_CFSR_INVPC_MASK; - v7m_push_stack(cpu); - v7m_exception_taken(cpu, excret, false); + ignore_stackfaults = v7m_push_stack(cpu); + v7m_exception_taken(cpu, excret, false, ignore_stackfaults); qemu_log_mask(CPU_LOG_INT, "...taking UsageFault on new stackframe: " "failed exception return integrity check\n"); return; @@ -7114,6 +7480,7 @@ void arm_v7m_cpu_do_interrupt(CPUState *cs) ARMCPU *cpu = ARM_CPU(cs); CPUARMState *env = &cpu->env; uint32_t lr; + bool ignore_stackfaults; arm_log_exception(cs->exception_index); @@ -7288,8 +7655,8 @@ void arm_v7m_cpu_do_interrupt(CPUState *cs) lr |= R_V7M_EXCRET_MODE_MASK; } - v7m_push_stack(cpu); - v7m_exception_taken(cpu, lr, false); + ignore_stackfaults = v7m_push_stack(cpu); + v7m_exception_taken(cpu, lr, false, ignore_stackfaults); qemu_log_mask(CPU_LOG_INT, "... 
as %d\n", env->v7m.exception); } @@ -11692,14 +12059,37 @@ void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc, target_ulong *cs_base, uint32_t *pflags) { ARMMMUIdx mmu_idx = core_to_arm_mmu_idx(env, cpu_mmu_index(env, false)); + int fp_el = fp_exception_el(env); uint32_t flags; if (is_a64(env)) { + int sve_el = sve_exception_el(env); + uint32_t zcr_len; + *pc = env->pc; flags = ARM_TBFLAG_AARCH64_STATE_MASK; /* Get control bits for tagged addresses */ flags |= (arm_regime_tbi0(env, mmu_idx) << ARM_TBFLAG_TBI0_SHIFT); flags |= (arm_regime_tbi1(env, mmu_idx) << ARM_TBFLAG_TBI1_SHIFT); + flags |= sve_el << ARM_TBFLAG_SVEEXC_EL_SHIFT; + + /* If SVE is disabled, but FP is enabled, + then the effective len is 0. */ + if (sve_el != 0 && fp_el == 0) { + zcr_len = 0; + } else { + int current_el = arm_current_el(env); + + zcr_len = env->vfp.zcr_el[current_el <= 1 ? 1 : current_el]; + zcr_len &= 0xf; + if (current_el < 2 && arm_feature(env, ARM_FEATURE_EL2)) { + zcr_len = MIN(zcr_len, 0xf & (uint32_t)env->vfp.zcr_el[2]); + } + if (current_el < 3 && arm_feature(env, ARM_FEATURE_EL3)) { + zcr_len = MIN(zcr_len, 0xf & (uint32_t)env->vfp.zcr_el[3]); + } + } + flags |= zcr_len << ARM_TBFLAG_ZCR_LEN_SHIFT; } else { *pc = env->regs[15]; flags = (env->thumb << ARM_TBFLAG_THUMB_SHIFT) @@ -11742,7 +12132,7 @@ void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc, if (arm_cpu_data_is_big_endian(env)) { flags |= ARM_TBFLAG_BE_DATA_MASK; } - flags |= fp_exception_el(env) << ARM_TBFLAG_FPEXC_EL_SHIFT; + flags |= fp_el << ARM_TBFLAG_FPEXC_EL_SHIFT; if (arm_v7m_is_handler_mode(env)) { flags |= ARM_TBFLAG_HANDLER_MASK; diff --git a/target/arm/helper.h b/target/arm/helper.h index 5dec2e6262..6383d7d09e 100644 --- a/target/arm/helper.h +++ b/target/arm/helper.h @@ -534,6 +534,18 @@ DEF_HELPER_FLAGS_3(crypto_sha256h2, TCG_CALL_NO_RWG, void, ptr, ptr, ptr) DEF_HELPER_FLAGS_2(crypto_sha256su0, TCG_CALL_NO_RWG, void, ptr, ptr) DEF_HELPER_FLAGS_3(crypto_sha256su1, TCG_CALL_NO_RWG, void, ptr, ptr, ptr) +DEF_HELPER_FLAGS_3(crypto_sha512h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr) +DEF_HELPER_FLAGS_3(crypto_sha512h2, TCG_CALL_NO_RWG, void, ptr, ptr, ptr) +DEF_HELPER_FLAGS_2(crypto_sha512su0, TCG_CALL_NO_RWG, void, ptr, ptr) +DEF_HELPER_FLAGS_3(crypto_sha512su1, TCG_CALL_NO_RWG, void, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_5(crypto_sm3tt, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32, i32) +DEF_HELPER_FLAGS_3(crypto_sm3partw1, TCG_CALL_NO_RWG, void, ptr, ptr, ptr) +DEF_HELPER_FLAGS_3(crypto_sm3partw2, TCG_CALL_NO_RWG, void, ptr, ptr, ptr) + +DEF_HELPER_FLAGS_2(crypto_sm4e, TCG_CALL_NO_RWG, void, ptr, ptr) +DEF_HELPER_FLAGS_3(crypto_sm4ekey, TCG_CALL_NO_RWG, void, ptr, ptr, ptr) + DEF_HELPER_FLAGS_3(crc32, TCG_CALL_NO_RWG_SE, i32, i32, i32, i32) DEF_HELPER_FLAGS_3(crc32c, TCG_CALL_NO_RWG_SE, i32, i32, i32, i32) DEF_HELPER_2(dc_zva, void, env, i64) diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h index ff53e9fafb..cfb7e5af72 100644 --- a/target/arm/kvm_arm.h +++ b/target/arm/kvm_arm.h @@ -234,6 +234,10 @@ static inline const char *gicv3_class_name(void) exit(1); #endif } else { + if (kvm_enabled()) { + error_report("Userspace GICv3 is not supported with KVM"); + exit(1); + } return "arm-gicv3"; } } diff --git a/target/arm/machine.c b/target/arm/machine.c index a85c2430d3..2c8b43062f 100644 --- a/target/arm/machine.c +++ b/target/arm/machine.c @@ -50,7 +50,40 @@ static const VMStateDescription vmstate_vfp = { .minimum_version_id = 3, .needed = vfp_needed, .fields = (VMStateField[]) { - 
VMSTATE_UINT64_ARRAY(env.vfp.regs, ARMCPU, 64),
+        /* For compatibility, store Qn out of Zn here. */
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[0].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[1].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[2].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[3].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[4].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[5].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[6].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[7].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[8].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[9].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[10].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[11].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[12].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[13].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[14].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[15].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[16].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[17].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[18].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[19].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[20].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[21].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[22].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[23].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[24].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[25].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[26].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[27].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[28].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[29].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[30].d, ARMCPU, 0, 2),
+        VMSTATE_UINT64_SUB_ARRAY(env.vfp.zregs[31].d, ARMCPU, 0, 2),
+
         /* The xregs array is a little awkward because element 1 (FPSCR)
          * requires a specific accessor, so we have to split it up in
          * the vmstate:
@@ -89,6 +122,56 @@ static const VMStateDescription vmstate_iwmmxt = {
     }
 };
 
+#ifdef TARGET_AARCH64
+/* The expression ARM_MAX_VQ - 2 is 0 for pure AArch32 build,
+ * and ARMPredicateReg is actively empty. This triggers errors
+ * in the expansion of the VMSTATE macros.
+ */
+
+static bool sve_needed(void *opaque)
+{
+    ARMCPU *cpu = opaque;
+    CPUARMState *env = &cpu->env;
+
+    return arm_feature(env, ARM_FEATURE_SVE);
+}
+
+/* The first two words of each Zreg are stored in VFP state.
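 * As a sketch of the resulting split for one Zn at ARM_MAX_VQ == 16,
 * where d[] has 32 elements:
 *     d[0..1]   -> "cpu/vfp"  (the legacy Qn slot, migration-compatible)
 *     d[2..31]  -> "cpu/sve"  zreg_hi subsection below
 *     pregs[]   -> "cpu/sve"  preg subsection (17 registers, FFR last)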
*/ +static const VMStateDescription vmstate_zreg_hi_reg = { + .name = "cpu/sve/zreg_hi", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT64_SUB_ARRAY(d, ARMVectorReg, 2, ARM_MAX_VQ - 2), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_preg_reg = { + .name = "cpu/sve/preg", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT64_ARRAY(p, ARMPredicateReg, 2 * ARM_MAX_VQ / 8), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_sve = { + .name = "cpu/sve", + .version_id = 1, + .minimum_version_id = 1, + .needed = sve_needed, + .fields = (VMStateField[]) { + VMSTATE_STRUCT_ARRAY(env.vfp.zregs, ARMCPU, 32, 0, + vmstate_zreg_hi_reg, ARMVectorReg), + VMSTATE_STRUCT_ARRAY(env.vfp.pregs, ARMCPU, 17, 0, + vmstate_preg_reg, ARMPredicateReg), + VMSTATE_END_OF_LIST() + } +}; +#endif /* AARCH64 */ + static bool m_needed(void *opaque) { ARMCPU *cpu = opaque; @@ -553,6 +636,9 @@ const VMStateDescription vmstate_arm_cpu = { &vmstate_pmsav7, &vmstate_pmsav8, &vmstate_m_security, +#ifdef TARGET_AARCH64 + &vmstate_sve, +#endif NULL } }; diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c index eed64c73e5..fb1a4cb532 100644 --- a/target/arm/translate-a64.c +++ b/target/arm/translate-a64.c @@ -21,6 +21,7 @@ #include "cpu.h" #include "exec/exec-all.h" #include "tcg-op.h" +#include "tcg-op-gvec.h" #include "qemu/log.h" #include "arm_ldst.h" #include "translate.h" @@ -84,6 +85,13 @@ typedef void CryptoTwoOpFn(TCGv_ptr, TCGv_ptr); typedef void CryptoThreeOpIntFn(TCGv_ptr, TCGv_ptr, TCGv_i32); typedef void CryptoThreeOpFn(TCGv_ptr, TCGv_ptr, TCGv_ptr); +/* Note that the gvec expanders operate on offsets + sizes. */ +typedef void GVecGen2Fn(unsigned, uint32_t, uint32_t, uint32_t, uint32_t); +typedef void GVecGen2iFn(unsigned, uint32_t, uint32_t, int64_t, + uint32_t, uint32_t); +typedef void GVecGen3Fn(unsigned, uint32_t, uint32_t, + uint32_t, uint32_t, uint32_t); + /* initialize TCG globals. */ void a64_translate_init(void) { @@ -517,8 +525,8 @@ static inline int vec_reg_offset(DisasContext *s, int regno, { int offs = 0; #ifdef HOST_WORDS_BIGENDIAN - /* This is complicated slightly because vfp.regs[2n] is - * still the low half and vfp.regs[2n+1] the high half + /* This is complicated slightly because vfp.zregs[n].d[0] is + * still the low half and vfp.zregs[n].d[1] the high half * of the 128 bit vector, even on big endian systems. * Calculate the offset assuming a fully bigendian 128 bits, * then XOR to account for the order of the two 64 bit halves. @@ -528,7 +536,7 @@ static inline int vec_reg_offset(DisasContext *s, int regno, #else offs += element * (1 << size); #endif - offs += offsetof(CPUARMState, vfp.regs[regno * 2]); + offs += offsetof(CPUARMState, vfp.zregs[regno]); assert_fp_access_checked(s); return offs; } @@ -537,7 +545,7 @@ static inline int vec_reg_offset(DisasContext *s, int regno, static inline int vec_full_reg_offset(DisasContext *s, int regno) { assert_fp_access_checked(s); - return offsetof(CPUARMState, vfp.regs[regno * 2]); + return offsetof(CPUARMState, vfp.zregs[regno]); } /* Return a newly allocated pointer to the vector register. */ @@ -548,6 +556,14 @@ static TCGv_ptr vec_full_reg_ptr(DisasContext *s, int regno) return ret; } +/* Return the byte size of the "whole" vector register, VL / 8. */ +static inline int vec_full_reg_size(DisasContext *s) +{ + /* FIXME SVE: We should put the composite ZCR_EL* value into tb->flags. 
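(cpu_get_tb_cpu_state() above already stores such a composite value via ARM_TBFLAG_ZCR_LEN, and aarch64_tr_init_disas_context() below decodes it into dc->sve_len.)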
+ In the meantime this is just the AdvSIMD length of 128. */ + return 128 / 8; +} + /* Return the offset into CPUARMState of a slice (from * the least significant end) of FP register Qn (ie * Dn, Sn, Hn or Bn). @@ -618,6 +634,51 @@ static TCGv_ptr get_fpstatus_ptr(void) return statusptr; } +/* Expand a 2-operand AdvSIMD vector operation using an expander function. */ +static void gen_gvec_fn2(DisasContext *s, bool is_q, int rd, int rn, + GVecGen2Fn *gvec_fn, int vece) +{ + gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), + is_q ? 16 : 8, vec_full_reg_size(s)); +} + +/* Expand a 2-operand + immediate AdvSIMD vector operation using + * an expander function. + */ +static void gen_gvec_fn2i(DisasContext *s, bool is_q, int rd, int rn, + int64_t imm, GVecGen2iFn *gvec_fn, int vece) +{ + gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), + imm, is_q ? 16 : 8, vec_full_reg_size(s)); +} + +/* Expand a 3-operand AdvSIMD vector operation using an expander function. */ +static void gen_gvec_fn3(DisasContext *s, bool is_q, int rd, int rn, int rm, + GVecGen3Fn *gvec_fn, int vece) +{ + gvec_fn(vece, vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), is_q ? 16 : 8, vec_full_reg_size(s)); +} + +/* Expand a 2-operand + immediate AdvSIMD vector operation using + * an op descriptor. + */ +static void gen_gvec_op2i(DisasContext *s, bool is_q, int rd, + int rn, int64_t imm, const GVecGen2i *gvec_op) +{ + tcg_gen_gvec_2i(vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), + is_q ? 16 : 8, vec_full_reg_size(s), imm, gvec_op); +} + +/* Expand a 3-operand AdvSIMD vector operation using an op descriptor. */ +static void gen_gvec_op3(DisasContext *s, bool is_q, int rd, + int rn, int rm, const GVecGen3 *gvec_op) +{ + tcg_gen_gvec_3(vec_full_reg_offset(s, rd), vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), is_q ? 16 : 8, + vec_full_reg_size(s), gvec_op); +} + /* Set ZF and NF based on a 64 bit result. This is alas fiddlier * than the 32 bit equivalent. */ @@ -4566,14 +4627,17 @@ static void handle_fp_1src_double(DisasContext *s, int opcode, int rd, int rn) TCGv_i64 tcg_op; TCGv_i64 tcg_res; + switch (opcode) { + case 0x0: /* FMOV */ + gen_gvec_fn2(s, false, rd, rn, tcg_gen_gvec_mov, 0); + return; + } + fpst = get_fpstatus_ptr(); tcg_op = read_fp_dreg(s, rn); tcg_res = tcg_temp_new_i64(); switch (opcode) { - case 0x0: /* FMOV */ - tcg_gen_mov_i64(tcg_res, tcg_op); - break; case 0x1: /* FABS */ gen_helper_vfp_absd(tcg_res, tcg_op); break; @@ -5848,10 +5912,7 @@ static void handle_simd_dupe(DisasContext *s, int is_q, int rd, int rn, int imm5) { int size = ctz32(imm5); - int esize = 8 << size; - int elements = (is_q ? 128 : 64) / esize; - int index, i; - TCGv_i64 tmp; + int index = imm5 >> (size + 1); if (size > 3 || (size == 3 && !is_q)) { unallocated_encoding(s); @@ -5862,20 +5923,9 @@ static void handle_simd_dupe(DisasContext *s, int is_q, int rd, int rn, return; } - index = imm5 >> (size + 1); - - tmp = tcg_temp_new_i64(); - read_vec_element(s, tmp, rn, index, size); - - for (i = 0; i < elements; i++) { - write_vec_element(s, tmp, rd, i, size); - } - - if (!is_q) { - clear_vec_high(s, rd); - } - - tcg_temp_free_i64(tmp); + tcg_gen_gvec_dup_mem(size, vec_full_reg_offset(s, rd), + vec_reg_offset(s, rn, index, size), + is_q ? 
16 : 8, vec_full_reg_size(s)); } /* DUP (element, scalar) @@ -5924,9 +5974,7 @@ static void handle_simd_dupg(DisasContext *s, int is_q, int rd, int rn, int imm5) { int size = ctz32(imm5); - int esize = 8 << size; - int elements = (is_q ? 128 : 64)/esize; - int i = 0; + uint32_t dofs, oprsz, maxsz; if (size > 3 || ((size == 3) && !is_q)) { unallocated_encoding(s); @@ -5937,12 +5985,11 @@ static void handle_simd_dupg(DisasContext *s, int is_q, int rd, int rn, return; } - for (i = 0; i < elements; i++) { - write_vec_element(s, cpu_reg(s, rn), rd, i, size); - } - if (!is_q) { - clear_vec_high(s, rd); - } + dofs = vec_full_reg_offset(s, rd); + oprsz = is_q ? 16 : 8; + maxsz = vec_full_reg_size(s); + + tcg_gen_gvec_dup_i64(size, dofs, oprsz, maxsz, cpu_reg(s, rn)); } /* INS (Element) @@ -6133,8 +6180,6 @@ static void disas_simd_mod_imm(DisasContext *s, uint32_t insn) bool is_neg = extract32(insn, 29, 1); bool is_q = extract32(insn, 30, 1); uint64_t imm = 0; - TCGv_i64 tcg_rd, tcg_imm; - int i; if (o2 != 0 || ((cmode == 0xf) && is_neg && !is_q)) { unallocated_encoding(s); @@ -6215,32 +6260,18 @@ static void disas_simd_mod_imm(DisasContext *s, uint32_t insn) imm = ~imm; } - tcg_imm = tcg_const_i64(imm); - tcg_rd = new_tmp_a64(s); - - for (i = 0; i < 2; i++) { - int foffs = i ? fp_reg_hi_offset(s, rd) : fp_reg_offset(s, rd, MO_64); - - if (i == 1 && !is_q) { - /* non-quad ops clear high half of vector */ - tcg_gen_movi_i64(tcg_rd, 0); - } else if ((cmode & 0x9) == 0x1 || (cmode & 0xd) == 0x9) { - tcg_gen_ld_i64(tcg_rd, cpu_env, foffs); - if (is_neg) { - /* AND (BIC) */ - tcg_gen_and_i64(tcg_rd, tcg_rd, tcg_imm); - } else { - /* ORR */ - tcg_gen_or_i64(tcg_rd, tcg_rd, tcg_imm); - } + if (!((cmode & 0x9) == 0x1 || (cmode & 0xd) == 0x9)) { + /* MOVI or MVNI, with MVNI negation handled above. */ + tcg_gen_gvec_dup64i(vec_full_reg_offset(s, rd), is_q ? 16 : 8, + vec_full_reg_size(s), imm); + } else { + /* ORR or BIC, with BIC negation to AND handled above. */ + if (is_neg) { + gen_gvec_fn2i(s, is_q, rd, rd, imm, tcg_gen_gvec_andi, MO_64); } else { - /* MOVI */ - tcg_gen_mov_i64(tcg_rd, tcg_imm); + gen_gvec_fn2i(s, is_q, rd, rd, imm, tcg_gen_gvec_ori, MO_64); } - tcg_gen_st_i64(tcg_rd, cpu_env, foffs); } - - tcg_temp_free_i64(tcg_imm); } /* AdvSIMD scalar copy @@ -6485,32 +6516,6 @@ static void handle_shri_with_rndacc(TCGv_i64 tcg_res, TCGv_i64 tcg_src, } } -/* Common SHL/SLI - Shift left with an optional insert */ -static void handle_shli_with_ins(TCGv_i64 tcg_res, TCGv_i64 tcg_src, - bool insert, int shift) -{ - if (insert) { /* SLI */ - tcg_gen_deposit_i64(tcg_res, tcg_res, tcg_src, shift, 64 - shift); - } else { /* SHL */ - tcg_gen_shli_i64(tcg_res, tcg_src, shift); - } -} - -/* SRI: shift right with insert */ -static void handle_shri_with_ins(TCGv_i64 tcg_res, TCGv_i64 tcg_src, - int size, int shift) -{ - int esize = 8 << size; - - /* shift count same as element size is valid but does nothing; - * special case to avoid potential shift by 64. - */ - if (shift != esize) { - tcg_gen_shri_i64(tcg_src, tcg_src, shift); - tcg_gen_deposit_i64(tcg_res, tcg_res, tcg_src, 0, esize - shift); - } -} - /* SSHR[RA]/USHR[RA] - Scalar shift right (optional rounding/accumulate) */ static void handle_scalar_simd_shri(DisasContext *s, bool is_u, int immh, int immb, @@ -6561,7 +6566,14 @@ static void handle_scalar_simd_shri(DisasContext *s, tcg_rd = (accumulate || insert) ? 
read_fp_dreg(s, rd) : tcg_temp_new_i64(); if (insert) { - handle_shri_with_ins(tcg_rd, tcg_rn, size, shift); + /* shift count same as element size is valid but does nothing; + * special case to avoid potential shift by 64. + */ + int esize = 8 << size; + if (shift != esize) { + tcg_gen_shri_i64(tcg_rn, tcg_rn, shift); + tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, 0, esize - shift); + } } else { handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, accumulate, is_u, size, shift); @@ -6599,7 +6611,11 @@ static void handle_scalar_simd_shli(DisasContext *s, bool insert, tcg_rn = read_fp_dreg(s, rn); tcg_rd = insert ? read_fp_dreg(s, rd) : tcg_temp_new_i64(); - handle_shli_with_ins(tcg_rd, tcg_rn, insert, shift); + if (insert) { + tcg_gen_deposit_i64(tcg_rd, tcg_rd, tcg_rn, shift, 64 - shift); + } else { + tcg_gen_shli_i64(tcg_rd, tcg_rn, shift); + } write_fp_dreg(s, rd, tcg_rd); @@ -7175,6 +7191,28 @@ static void disas_simd_scalar_three_reg_diff(DisasContext *s, uint32_t insn) } } +/* CMTST : test is "if (X & Y != 0)". */ +static void gen_cmtst_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + tcg_gen_and_i32(d, a, b); + tcg_gen_setcondi_i32(TCG_COND_NE, d, d, 0); + tcg_gen_neg_i32(d, d); +} + +static void gen_cmtst_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + tcg_gen_and_i64(d, a, b); + tcg_gen_setcondi_i64(TCG_COND_NE, d, d, 0); + tcg_gen_neg_i64(d, d); +} + +static void gen_cmtst_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) +{ + tcg_gen_and_vec(vece, d, a, b); + tcg_gen_dupi_vec(vece, a, 0); + tcg_gen_cmp_vec(TCG_COND_NE, vece, d, d, a); +} + static void handle_3same_64(DisasContext *s, int opcode, bool u, TCGv_i64 tcg_rd, TCGv_i64 tcg_rn, TCGv_i64 tcg_rm) { @@ -7218,10 +7256,7 @@ static void handle_3same_64(DisasContext *s, int opcode, bool u, cond = TCG_COND_EQ; goto do_cmop; } - /* CMTST : test is "if (X & Y != 0)". 
*/ - tcg_gen_and_i64(tcg_rd, tcg_rn, tcg_rm); - tcg_gen_setcondi_i64(TCG_COND_NE, tcg_rd, tcg_rd, 0); - tcg_gen_neg_i64(tcg_rd, tcg_rd); + gen_cmtst_i64(tcg_rd, tcg_rn, tcg_rm); break; case 0x8: /* SSHL, USHL */ if (u) { @@ -8329,16 +8364,195 @@ static void disas_simd_scalar_two_reg_misc(DisasContext *s, uint32_t insn) } } +static void gen_ssra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_vec_sar8i_i64(a, a, shift); + tcg_gen_vec_add8_i64(d, d, a); +} + +static void gen_ssra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_vec_sar16i_i64(a, a, shift); + tcg_gen_vec_add16_i64(d, d, a); +} + +static void gen_ssra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) +{ + tcg_gen_sari_i32(a, a, shift); + tcg_gen_add_i32(d, d, a); +} + +static void gen_ssra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_sari_i64(a, a, shift); + tcg_gen_add_i64(d, d, a); +} + +static void gen_ssra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) +{ + tcg_gen_sari_vec(vece, a, a, sh); + tcg_gen_add_vec(vece, d, d, a); +} + +static void gen_usra8_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_vec_shr8i_i64(a, a, shift); + tcg_gen_vec_add8_i64(d, d, a); +} + +static void gen_usra16_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_vec_shr16i_i64(a, a, shift); + tcg_gen_vec_add16_i64(d, d, a); +} + +static void gen_usra32_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) +{ + tcg_gen_shri_i32(a, a, shift); + tcg_gen_add_i32(d, d, a); +} + +static void gen_usra64_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_shri_i64(a, a, shift); + tcg_gen_add_i64(d, d, a); +} + +static void gen_usra_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) +{ + tcg_gen_shri_vec(vece, a, a, sh); + tcg_gen_add_vec(vece, d, d, a); +} + +static void gen_shr8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + uint64_t mask = dup_const(MO_8, 0xff >> shift); + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_shri_i64(t, a, shift); + tcg_gen_andi_i64(t, t, mask); + tcg_gen_andi_i64(d, d, ~mask); + tcg_gen_or_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_shr16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + uint64_t mask = dup_const(MO_16, 0xffff >> shift); + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_shri_i64(t, a, shift); + tcg_gen_andi_i64(t, t, mask); + tcg_gen_andi_i64(d, d, ~mask); + tcg_gen_or_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_shr32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) +{ + tcg_gen_shri_i32(a, a, shift); + tcg_gen_deposit_i32(d, d, a, 0, 32 - shift); +} + +static void gen_shr64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_shri_i64(a, a, shift); + tcg_gen_deposit_i64(d, d, a, 0, 64 - shift); +} + +static void gen_shr_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) +{ + uint64_t mask = (2ull << ((8 << vece) - 1)) - 1; + TCGv_vec t = tcg_temp_new_vec_matching(d); + TCGv_vec m = tcg_temp_new_vec_matching(d); + + tcg_gen_dupi_vec(vece, m, mask ^ (mask >> sh)); + tcg_gen_shri_vec(vece, t, a, sh); + tcg_gen_and_vec(vece, d, d, m); + tcg_gen_or_vec(vece, d, d, t); + + tcg_temp_free_vec(t); + tcg_temp_free_vec(m); +} + /* SSHR[RA]/USHR[RA] - Vector shift right (optional rounding/accumulate) */ static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u, int immh, int immb, int opcode, int rn, int rd) { + static const GVecGen2i ssra_op[4] = { + { .fni8 = gen_ssra8_i64, + .fniv = gen_ssra_vec, + .load_dest = true, + .opc = INDEX_op_sari_vec, + .vece = MO_8 }, + { .fni8 = gen_ssra16_i64, + .fniv = gen_ssra_vec, + 
.load_dest = true, + .opc = INDEX_op_sari_vec, + .vece = MO_16 }, + { .fni4 = gen_ssra32_i32, + .fniv = gen_ssra_vec, + .load_dest = true, + .opc = INDEX_op_sari_vec, + .vece = MO_32 }, + { .fni8 = gen_ssra64_i64, + .fniv = gen_ssra_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .opc = INDEX_op_sari_vec, + .vece = MO_64 }, + }; + static const GVecGen2i usra_op[4] = { + { .fni8 = gen_usra8_i64, + .fniv = gen_usra_vec, + .load_dest = true, + .opc = INDEX_op_shri_vec, + .vece = MO_8, }, + { .fni8 = gen_usra16_i64, + .fniv = gen_usra_vec, + .load_dest = true, + .opc = INDEX_op_shri_vec, + .vece = MO_16, }, + { .fni4 = gen_usra32_i32, + .fniv = gen_usra_vec, + .load_dest = true, + .opc = INDEX_op_shri_vec, + .vece = MO_32, }, + { .fni8 = gen_usra64_i64, + .fniv = gen_usra_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .opc = INDEX_op_shri_vec, + .vece = MO_64, }, + }; + static const GVecGen2i sri_op[4] = { + { .fni8 = gen_shr8_ins_i64, + .fniv = gen_shr_ins_vec, + .load_dest = true, + .opc = INDEX_op_shri_vec, + .vece = MO_8 }, + { .fni8 = gen_shr16_ins_i64, + .fniv = gen_shr_ins_vec, + .load_dest = true, + .opc = INDEX_op_shri_vec, + .vece = MO_16 }, + { .fni4 = gen_shr32_ins_i32, + .fniv = gen_shr_ins_vec, + .load_dest = true, + .opc = INDEX_op_shri_vec, + .vece = MO_32 }, + { .fni8 = gen_shr64_ins_i64, + .fniv = gen_shr_ins_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .opc = INDEX_op_shri_vec, + .vece = MO_64 }, + }; + int size = 32 - clz32(immh) - 1; int immhb = immh << 3 | immb; int shift = 2 * (8 << size) - immhb; bool accumulate = false; - bool round = false; - bool insert = false; int dsize = is_q ? 128 : 64; int esize = 8 << size; int elements = dsize/esize; @@ -8346,6 +8560,7 @@ static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u, TCGv_i64 tcg_rn = new_tmp_a64(s); TCGv_i64 tcg_rd = new_tmp_a64(s); TCGv_i64 tcg_round; + uint64_t round_const; int i; if (extract32(immh, 3, 1) && !is_q) { @@ -8364,64 +8579,159 @@ static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u, switch (opcode) { case 0x02: /* SSRA / USRA (accumulate) */ - accumulate = true; - break; + if (is_u) { + /* Shift count same as element size produces zero to add. */ + if (shift == 8 << size) { + goto done; + } + gen_gvec_op2i(s, is_q, rd, rn, shift, &usra_op[size]); + } else { + /* Shift count same as element size produces all sign to add. */ + if (shift == 8 << size) { + shift -= 1; + } + gen_gvec_op2i(s, is_q, rd, rn, shift, &ssra_op[size]); + } + return; + case 0x08: /* SRI */ + /* Shift count same as element size is valid but does nothing. */ + if (shift == 8 << size) { + goto done; + } + gen_gvec_op2i(s, is_q, rd, rn, shift, &sri_op[size]); + return; + + case 0x00: /* SSHR / USHR */ + if (is_u) { + if (shift == 8 << size) { + /* Shift count the same size as element size produces zero. */ + tcg_gen_gvec_dup8i(vec_full_reg_offset(s, rd), + is_q ? 16 : 8, vec_full_reg_size(s), 0); + } else { + gen_gvec_fn2i(s, is_q, rd, rn, shift, tcg_gen_gvec_shri, size); + } + } else { + /* Shift count the same size as element size produces all sign. 
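Clamping the shift to esize - 1 below yields the same all-sign result while keeping the shift amount legal.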
*/ + if (shift == 8 << size) { + shift -= 1; + } + gen_gvec_fn2i(s, is_q, rd, rn, shift, tcg_gen_gvec_sari, size); + } + return; + case 0x04: /* SRSHR / URSHR (rounding) */ - round = true; break; case 0x06: /* SRSRA / URSRA (accum + rounding) */ - accumulate = round = true; - break; - case 0x08: /* SRI */ - insert = true; + accumulate = true; break; + default: + g_assert_not_reached(); } - if (round) { - uint64_t round_const = 1ULL << (shift - 1); - tcg_round = tcg_const_i64(round_const); - } else { - tcg_round = NULL; - } + round_const = 1ULL << (shift - 1); + tcg_round = tcg_const_i64(round_const); for (i = 0; i < elements; i++) { read_vec_element(s, tcg_rn, rn, i, memop); - if (accumulate || insert) { + if (accumulate) { read_vec_element(s, tcg_rd, rd, i, memop); } - if (insert) { - handle_shri_with_ins(tcg_rd, tcg_rn, size, shift); - } else { - handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, - accumulate, is_u, size, shift); - } + handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round, + accumulate, is_u, size, shift); write_vec_element(s, tcg_rd, rd, i, size); } + tcg_temp_free_i64(tcg_round); + done: if (!is_q) { clear_vec_high(s, rd); } +} - if (round) { - tcg_temp_free_i64(tcg_round); - } +static void gen_shl8_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + uint64_t mask = dup_const(MO_8, 0xff << shift); + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_shli_i64(t, a, shift); + tcg_gen_andi_i64(t, t, mask); + tcg_gen_andi_i64(d, d, ~mask); + tcg_gen_or_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_shl16_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + uint64_t mask = dup_const(MO_16, 0xffff << shift); + TCGv_i64 t = tcg_temp_new_i64(); + + tcg_gen_shli_i64(t, a, shift); + tcg_gen_andi_i64(t, t, mask); + tcg_gen_andi_i64(d, d, ~mask); + tcg_gen_or_i64(d, d, t); + tcg_temp_free_i64(t); +} + +static void gen_shl32_ins_i32(TCGv_i32 d, TCGv_i32 a, int32_t shift) +{ + tcg_gen_deposit_i32(d, d, a, shift, 32 - shift); +} + +static void gen_shl64_ins_i64(TCGv_i64 d, TCGv_i64 a, int64_t shift) +{ + tcg_gen_deposit_i64(d, d, a, shift, 64 - shift); +} + +static void gen_shl_ins_vec(unsigned vece, TCGv_vec d, TCGv_vec a, int64_t sh) +{ + uint64_t mask = (1ull << sh) - 1; + TCGv_vec t = tcg_temp_new_vec_matching(d); + TCGv_vec m = tcg_temp_new_vec_matching(d); + + tcg_gen_dupi_vec(vece, m, mask); + tcg_gen_shli_vec(vece, t, a, sh); + tcg_gen_and_vec(vece, d, d, m); + tcg_gen_or_vec(vece, d, d, t); + + tcg_temp_free_vec(t); + tcg_temp_free_vec(m); } /* SHL/SLI - Vector shift left */ static void handle_vec_simd_shli(DisasContext *s, bool is_q, bool insert, - int immh, int immb, int opcode, int rn, int rd) + int immh, int immb, int opcode, int rn, int rd) { + static const GVecGen2i shi_op[4] = { + { .fni8 = gen_shl8_ins_i64, + .fniv = gen_shl_ins_vec, + .opc = INDEX_op_shli_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .vece = MO_8 }, + { .fni8 = gen_shl16_ins_i64, + .fniv = gen_shl_ins_vec, + .opc = INDEX_op_shli_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .vece = MO_16 }, + { .fni4 = gen_shl32_ins_i32, + .fniv = gen_shl_ins_vec, + .opc = INDEX_op_shli_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .vece = MO_32 }, + { .fni8 = gen_shl64_ins_i64, + .fniv = gen_shl_ins_vec, + .opc = INDEX_op_shli_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .vece = MO_64 }, + }; int size = 32 - clz32(immh) - 1; int immhb = immh << 3 | immb; int shift = immhb - (8 << size); - int dsize = is_q ? 
128 : 64; - int esize = 8 << size; - int elements = dsize/esize; - TCGv_i64 tcg_rn = new_tmp_a64(s); - TCGv_i64 tcg_rd = new_tmp_a64(s); - int i; if (extract32(immh, 3, 1) && !is_q) { unallocated_encoding(s); @@ -8437,19 +8747,10 @@ static void handle_vec_simd_shli(DisasContext *s, bool is_q, bool insert, return; } - for (i = 0; i < elements; i++) { - read_vec_element(s, tcg_rn, rn, i, size); - if (insert) { - read_vec_element(s, tcg_rd, rd, i, size); - } - - handle_shli_with_ins(tcg_rd, tcg_rn, insert, shift); - - write_vec_element(s, tcg_rd, rd, i, size); - } - - if (!is_q) { - clear_vec_high(s, rd); + if (insert) { + gen_gvec_op2i(s, is_q, rd, rn, shift, &shi_op[size]); + } else { + gen_gvec_fn2i(s, is_q, rd, rn, shift, tcg_gen_gvec_shli, size); } } @@ -9072,85 +9373,115 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn) } } +static void gen_bsl_i64(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm) +{ + tcg_gen_xor_i64(rn, rn, rm); + tcg_gen_and_i64(rn, rn, rd); + tcg_gen_xor_i64(rd, rm, rn); +} + +static void gen_bit_i64(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm) +{ + tcg_gen_xor_i64(rn, rn, rd); + tcg_gen_and_i64(rn, rn, rm); + tcg_gen_xor_i64(rd, rd, rn); +} + +static void gen_bif_i64(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm) +{ + tcg_gen_xor_i64(rn, rn, rd); + tcg_gen_andc_i64(rn, rn, rm); + tcg_gen_xor_i64(rd, rd, rn); +} + +static void gen_bsl_vec(unsigned vece, TCGv_vec rd, TCGv_vec rn, TCGv_vec rm) +{ + tcg_gen_xor_vec(vece, rn, rn, rm); + tcg_gen_and_vec(vece, rn, rn, rd); + tcg_gen_xor_vec(vece, rd, rm, rn); +} + +static void gen_bit_vec(unsigned vece, TCGv_vec rd, TCGv_vec rn, TCGv_vec rm) +{ + tcg_gen_xor_vec(vece, rn, rn, rd); + tcg_gen_and_vec(vece, rn, rn, rm); + tcg_gen_xor_vec(vece, rd, rd, rn); +} + +static void gen_bif_vec(unsigned vece, TCGv_vec rd, TCGv_vec rn, TCGv_vec rm) +{ + tcg_gen_xor_vec(vece, rn, rn, rd); + tcg_gen_andc_vec(vece, rn, rn, rm); + tcg_gen_xor_vec(vece, rd, rd, rn); +} + /* Logic op (opcode == 3) subgroup of C3.6.16. */ static void disas_simd_3same_logic(DisasContext *s, uint32_t insn) { + static const GVecGen3 bsl_op = { + .fni8 = gen_bsl_i64, + .fniv = gen_bsl_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true + }; + static const GVecGen3 bit_op = { + .fni8 = gen_bit_i64, + .fniv = gen_bit_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true + }; + static const GVecGen3 bif_op = { + .fni8 = gen_bif_i64, + .fniv = gen_bif_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true + }; + int rd = extract32(insn, 0, 5); int rn = extract32(insn, 5, 5); int rm = extract32(insn, 16, 5); int size = extract32(insn, 22, 2); bool is_u = extract32(insn, 29, 1); bool is_q = extract32(insn, 30, 1); - TCGv_i64 tcg_op1, tcg_op2, tcg_res[2]; - int pass; if (!fp_access_check(s)) { return; } - tcg_op1 = tcg_temp_new_i64(); - tcg_op2 = tcg_temp_new_i64(); - tcg_res[0] = tcg_temp_new_i64(); - tcg_res[1] = tcg_temp_new_i64(); - - for (pass = 0; pass < (is_q ? 
2 : 1); pass++) { - read_vec_element(s, tcg_op1, rn, pass, MO_64); - read_vec_element(s, tcg_op2, rm, pass, MO_64); - - if (!is_u) { - switch (size) { - case 0: /* AND */ - tcg_gen_and_i64(tcg_res[pass], tcg_op1, tcg_op2); - break; - case 1: /* BIC */ - tcg_gen_andc_i64(tcg_res[pass], tcg_op1, tcg_op2); - break; - case 2: /* ORR */ - tcg_gen_or_i64(tcg_res[pass], tcg_op1, tcg_op2); - break; - case 3: /* ORN */ - tcg_gen_orc_i64(tcg_res[pass], tcg_op1, tcg_op2); - break; - } + switch (size + 4 * is_u) { + case 0: /* AND */ + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_and, 0); + return; + case 1: /* BIC */ + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_andc, 0); + return; + case 2: /* ORR */ + if (rn == rm) { /* MOV */ + gen_gvec_fn2(s, is_q, rd, rn, tcg_gen_gvec_mov, 0); } else { - if (size != 0) { - /* B* ops need res loaded to operate on */ - read_vec_element(s, tcg_res[pass], rd, pass, MO_64); - } - - switch (size) { - case 0: /* EOR */ - tcg_gen_xor_i64(tcg_res[pass], tcg_op1, tcg_op2); - break; - case 1: /* BSL bitwise select */ - tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_op2); - tcg_gen_and_i64(tcg_op1, tcg_op1, tcg_res[pass]); - tcg_gen_xor_i64(tcg_res[pass], tcg_op2, tcg_op1); - break; - case 2: /* BIT, bitwise insert if true */ - tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_res[pass]); - tcg_gen_and_i64(tcg_op1, tcg_op1, tcg_op2); - tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1); - break; - case 3: /* BIF, bitwise insert if false */ - tcg_gen_xor_i64(tcg_op1, tcg_op1, tcg_res[pass]); - tcg_gen_andc_i64(tcg_op1, tcg_op1, tcg_op2); - tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1); - break; - } + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_or, 0); } - } + return; + case 3: /* ORN */ + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_orc, 0); + return; + case 4: /* EOR */ + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_xor, 0); + return; - write_vec_element(s, tcg_res[0], rd, 0, MO_64); - if (!is_q) { - tcg_gen_movi_i64(tcg_res[1], 0); - } - write_vec_element(s, tcg_res[1], rd, 1, MO_64); + case 5: /* BSL bitwise select */ + gen_gvec_op3(s, is_q, rd, rn, rm, &bsl_op); + return; + case 6: /* BIT, bitwise insert if true */ + gen_gvec_op3(s, is_q, rd, rn, rm, &bit_op); + return; + case 7: /* BIF, bitwise insert if false */ + gen_gvec_op3(s, is_q, rd, rn, rm, &bif_op); + return; - tcg_temp_free_i64(tcg_op1); - tcg_temp_free_i64(tcg_op2); - tcg_temp_free_i64(tcg_res[0]); - tcg_temp_free_i64(tcg_res[1]); + default: + g_assert_not_reached(); + } } /* Helper functions for 32 bit comparisons */ @@ -9400,9 +9731,131 @@ static void disas_simd_3same_float(DisasContext *s, uint32_t insn) } } +static void gen_mla8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + gen_helper_neon_mul_u8(a, a, b); + gen_helper_neon_add_u8(d, d, a); +} + +static void gen_mla16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + gen_helper_neon_mul_u16(a, a, b); + gen_helper_neon_add_u16(d, d, a); +} + +static void gen_mla32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + tcg_gen_mul_i32(a, a, b); + tcg_gen_add_i32(d, d, a); +} + +static void gen_mla64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + tcg_gen_mul_i64(a, a, b); + tcg_gen_add_i64(d, d, a); +} + +static void gen_mla_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) +{ + tcg_gen_mul_vec(vece, a, a, b); + tcg_gen_add_vec(vece, d, d, a); +} + +static void gen_mls8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + gen_helper_neon_mul_u8(a, a, b); + gen_helper_neon_sub_u8(d, d, a); +} + +static void gen_mls16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + 
gen_helper_neon_mul_u16(a, a, b); + gen_helper_neon_sub_u16(d, d, a); +} + +static void gen_mls32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + tcg_gen_mul_i32(a, a, b); + tcg_gen_sub_i32(d, d, a); +} + +static void gen_mls64_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + tcg_gen_mul_i64(a, a, b); + tcg_gen_sub_i64(d, d, a); +} + +static void gen_mls_vec(unsigned vece, TCGv_vec d, TCGv_vec a, TCGv_vec b) +{ + tcg_gen_mul_vec(vece, a, a, b); + tcg_gen_sub_vec(vece, d, d, a); +} + /* Integer op subgroup of C3.6.16. */ static void disas_simd_3same_int(DisasContext *s, uint32_t insn) { + static const GVecGen3 cmtst_op[4] = { + { .fni4 = gen_helper_neon_tst_u8, + .fniv = gen_cmtst_vec, + .vece = MO_8 }, + { .fni4 = gen_helper_neon_tst_u16, + .fniv = gen_cmtst_vec, + .vece = MO_16 }, + { .fni4 = gen_cmtst_i32, + .fniv = gen_cmtst_vec, + .vece = MO_32 }, + { .fni8 = gen_cmtst_i64, + .fniv = gen_cmtst_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + static const GVecGen3 mla_op[4] = { + { .fni4 = gen_mla8_i32, + .fniv = gen_mla_vec, + .opc = INDEX_op_mul_vec, + .load_dest = true, + .vece = MO_8 }, + { .fni4 = gen_mla16_i32, + .fniv = gen_mla_vec, + .opc = INDEX_op_mul_vec, + .load_dest = true, + .vece = MO_16 }, + { .fni4 = gen_mla32_i32, + .fniv = gen_mla_vec, + .opc = INDEX_op_mul_vec, + .load_dest = true, + .vece = MO_32 }, + { .fni8 = gen_mla64_i64, + .fniv = gen_mla_vec, + .opc = INDEX_op_mul_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .vece = MO_64 }, + }; + static const GVecGen3 mls_op[4] = { + { .fni4 = gen_mls8_i32, + .fniv = gen_mls_vec, + .opc = INDEX_op_mul_vec, + .load_dest = true, + .vece = MO_8 }, + { .fni4 = gen_mls16_i32, + .fniv = gen_mls_vec, + .opc = INDEX_op_mul_vec, + .load_dest = true, + .vece = MO_16 }, + { .fni4 = gen_mls32_i32, + .fniv = gen_mls_vec, + .opc = INDEX_op_mul_vec, + .load_dest = true, + .vece = MO_32 }, + { .fni8 = gen_mls64_i64, + .fniv = gen_mls_vec, + .opc = INDEX_op_mul_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .load_dest = true, + .vece = MO_64 }, + }; + int is_q = extract32(insn, 30, 1); int u = extract32(insn, 29, 1); int size = extract32(insn, 22, 2); @@ -9411,6 +9864,7 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn) int rn = extract32(insn, 5, 5); int rd = extract32(insn, 0, 5); int pass; + TCGCond cond; switch (opcode) { case 0x13: /* MUL, PMUL */ @@ -9450,6 +9904,48 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn) return; } + switch (opcode) { + case 0x10: /* ADD, SUB */ + if (u) { + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_sub, size); + } else { + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_add, size); + } + return; + case 0x13: /* MUL, PMUL */ + if (!u) { /* MUL */ + gen_gvec_fn3(s, is_q, rd, rn, rm, tcg_gen_gvec_mul, size); + return; + } + break; + case 0x12: /* MLA, MLS */ + if (u) { + gen_gvec_op3(s, is_q, rd, rn, rm, &mls_op[size]); + } else { + gen_gvec_op3(s, is_q, rd, rn, rm, &mla_op[size]); + } + return; + case 0x11: + if (!u) { /* CMTST */ + gen_gvec_op3(s, is_q, rd, rn, rm, &cmtst_op[size]); + return; + } + /* else CMEQ */ + cond = TCG_COND_EQ; + goto do_gvec_cmp; + case 0x06: /* CMGT, CMHI */ + cond = u ? TCG_COND_GTU : TCG_COND_GT; + goto do_gvec_cmp; + case 0x07: /* CMGE, CMHS */ + cond = u ? TCG_COND_GEU : TCG_COND_GE; + do_gvec_cmp: + tcg_gen_gvec_cmp(cond, size, vec_full_reg_offset(s, rd), + vec_full_reg_offset(s, rn), + vec_full_reg_offset(s, rm), + is_q ? 
16 : 8, vec_full_reg_size(s)); + return; + } + if (size == 3) { assert(is_q); for (pass = 0; pass < 2; pass++) { @@ -9530,26 +10026,6 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn) genenvfn = fns[size][u]; break; } - case 0x6: /* CMGT, CMHI */ - { - static NeonGenTwoOpFn * const fns[3][2] = { - { gen_helper_neon_cgt_s8, gen_helper_neon_cgt_u8 }, - { gen_helper_neon_cgt_s16, gen_helper_neon_cgt_u16 }, - { gen_helper_neon_cgt_s32, gen_helper_neon_cgt_u32 }, - }; - genfn = fns[size][u]; - break; - } - case 0x7: /* CMGE, CMHS */ - { - static NeonGenTwoOpFn * const fns[3][2] = { - { gen_helper_neon_cge_s8, gen_helper_neon_cge_u8 }, - { gen_helper_neon_cge_s16, gen_helper_neon_cge_u16 }, - { gen_helper_neon_cge_s32, gen_helper_neon_cge_u32 }, - }; - genfn = fns[size][u]; - break; - } case 0x8: /* SSHL, USHL */ { static NeonGenTwoOpFn * const fns[3][2] = { @@ -9622,44 +10098,11 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn) genfn = fns[size][u]; break; } - case 0x10: /* ADD, SUB */ - { - static NeonGenTwoOpFn * const fns[3][2] = { - { gen_helper_neon_add_u8, gen_helper_neon_sub_u8 }, - { gen_helper_neon_add_u16, gen_helper_neon_sub_u16 }, - { tcg_gen_add_i32, tcg_gen_sub_i32 }, - }; - genfn = fns[size][u]; - break; - } - case 0x11: /* CMTST, CMEQ */ - { - static NeonGenTwoOpFn * const fns[3][2] = { - { gen_helper_neon_tst_u8, gen_helper_neon_ceq_u8 }, - { gen_helper_neon_tst_u16, gen_helper_neon_ceq_u16 }, - { gen_helper_neon_tst_u32, gen_helper_neon_ceq_u32 }, - }; - genfn = fns[size][u]; - break; - } case 0x13: /* MUL, PMUL */ - if (u) { - /* PMUL */ - assert(size == 0); - genfn = gen_helper_neon_mul_p8; - break; - } - /* fall through : MUL */ - case 0x12: /* MLA, MLS */ - { - static NeonGenTwoOpFn * const fns[3] = { - gen_helper_neon_mul_u8, - gen_helper_neon_mul_u16, - tcg_gen_mul_i32, - }; - genfn = fns[size]; + assert(u); /* PMUL */ + assert(size == 0); + genfn = gen_helper_neon_mul_p8; break; - } case 0x16: /* SQDMULH, SQRDMULH */ { static NeonGenTwoOpEnvFn * const fns[2][2] = { @@ -9680,18 +10123,16 @@ static void disas_simd_3same_int(DisasContext *s, uint32_t insn) genfn(tcg_res, tcg_op1, tcg_op2); } - if (opcode == 0xf || opcode == 0x12) { - /* SABA, UABA, MLA, MLS: accumulating ops */ - static NeonGenTwoOpFn * const fns[3][2] = { - { gen_helper_neon_add_u8, gen_helper_neon_sub_u8 }, - { gen_helper_neon_add_u16, gen_helper_neon_sub_u16 }, - { tcg_gen_add_i32, tcg_gen_sub_i32 }, + if (opcode == 0xf) { + /* SABA, UABA: accumulating ops */ + static NeonGenTwoOpFn * const fns[3] = { + gen_helper_neon_add_u8, + gen_helper_neon_add_u16, + tcg_gen_add_i32, }; - bool is_sub = (opcode == 0x12 && u); /* MLS */ - genfn = fns[size][is_sub]; read_vec_element_i32(s, tcg_op1, rd, pass, MO_32); - genfn(tcg_res, tcg_op1, tcg_res); + fns[size](tcg_res, tcg_op1, tcg_res); } write_vec_element_i32(s, tcg_res, rd, pass, MO_32); @@ -10003,8 +10444,7 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn) return; case 0x5: /* CNT, NOT, RBIT */ if (u && size == 0) { - /* NOT: adjust size so we can use the 64-bits-at-a-time loop. 
*/ - size = 3; + /* NOT */ break; } else if (u && size == 1) { /* RBIT */ @@ -10256,6 +10696,21 @@ static void disas_simd_two_reg_misc(DisasContext *s, uint32_t insn) tcg_rmode = NULL; } + switch (opcode) { + case 0x5: + if (u && size == 0) { /* NOT */ + gen_gvec_fn2(s, is_q, rd, rn, tcg_gen_gvec_not, 0); + return; + } + break; + case 0xb: + if (u) { /* NEG */ + gen_gvec_fn2(s, is_q, rd, rn, tcg_gen_gvec_neg, size); + return; + } + break; + } + if (size == 3) { /* All 64-bit element operations can be shared with scalar 2misc */ int pass; @@ -11132,6 +11587,341 @@ static void disas_crypto_two_reg_sha(DisasContext *s, uint32_t insn) tcg_temp_free_ptr(tcg_rn_ptr); } +/* Crypto three-reg SHA512 + * 31 21 20 16 15 14 13 12 11 10 9 5 4 0 + * +-----------------------+------+---+---+-----+--------+------+------+ + * | 1 1 0 0 1 1 1 0 0 1 1 | Rm | 1 | O | 0 0 | opcode | Rn | Rd | + * +-----------------------+------+---+---+-----+--------+------+------+ + */ +static void disas_crypto_three_reg_sha512(DisasContext *s, uint32_t insn) +{ + int opcode = extract32(insn, 10, 2); + int o = extract32(insn, 14, 1); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + int feature; + CryptoThreeOpFn *genfn; + + if (o == 0) { + switch (opcode) { + case 0: /* SHA512H */ + feature = ARM_FEATURE_V8_SHA512; + genfn = gen_helper_crypto_sha512h; + break; + case 1: /* SHA512H2 */ + feature = ARM_FEATURE_V8_SHA512; + genfn = gen_helper_crypto_sha512h2; + break; + case 2: /* SHA512SU1 */ + feature = ARM_FEATURE_V8_SHA512; + genfn = gen_helper_crypto_sha512su1; + break; + case 3: /* RAX1 */ + feature = ARM_FEATURE_V8_SHA3; + genfn = NULL; + break; + } + } else { + switch (opcode) { + case 0: /* SM3PARTW1 */ + feature = ARM_FEATURE_V8_SM3; + genfn = gen_helper_crypto_sm3partw1; + break; + case 1: /* SM3PARTW2 */ + feature = ARM_FEATURE_V8_SM3; + genfn = gen_helper_crypto_sm3partw2; + break; + case 2: /* SM4EKEY */ + feature = ARM_FEATURE_V8_SM4; + genfn = gen_helper_crypto_sm4ekey; + break; + default: + unallocated_encoding(s); + return; + } + } + + if (!arm_dc_feature(s, feature)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + if (genfn) { + TCGv_ptr tcg_rd_ptr, tcg_rn_ptr, tcg_rm_ptr; + + tcg_rd_ptr = vec_full_reg_ptr(s, rd); + tcg_rn_ptr = vec_full_reg_ptr(s, rn); + tcg_rm_ptr = vec_full_reg_ptr(s, rm); + + genfn(tcg_rd_ptr, tcg_rn_ptr, tcg_rm_ptr); + + tcg_temp_free_ptr(tcg_rd_ptr); + tcg_temp_free_ptr(tcg_rn_ptr); + tcg_temp_free_ptr(tcg_rm_ptr); + } else { + TCGv_i64 tcg_op1, tcg_op2, tcg_res[2]; + int pass; + + tcg_op1 = tcg_temp_new_i64(); + tcg_op2 = tcg_temp_new_i64(); + tcg_res[0] = tcg_temp_new_i64(); + tcg_res[1] = tcg_temp_new_i64(); + + for (pass = 0; pass < 2; pass++) { + read_vec_element(s, tcg_op1, rn, pass, MO_64); + read_vec_element(s, tcg_op2, rm, pass, MO_64); + + tcg_gen_rotli_i64(tcg_res[pass], tcg_op2, 1); + tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1); + } + write_vec_element(s, tcg_res[0], rd, 0, MO_64); + write_vec_element(s, tcg_res[1], rd, 1, MO_64); + + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + tcg_temp_free_i64(tcg_res[0]); + tcg_temp_free_i64(tcg_res[1]); + } +} + +/* Crypto two-reg SHA512 + * 31 12 11 10 9 5 4 0 + * +-----------------------------------------+--------+------+------+ + * | 1 1 0 0 1 1 1 0 1 1 0 0 0 0 0 0 1 0 0 0 | opcode | Rn | Rd | + * +-----------------------------------------+--------+------+------+ + */ +static void 
disas_crypto_two_reg_sha512(DisasContext *s, uint32_t insn) +{ + int opcode = extract32(insn, 10, 2); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + TCGv_ptr tcg_rd_ptr, tcg_rn_ptr; + int feature; + CryptoTwoOpFn *genfn; + + switch (opcode) { + case 0: /* SHA512SU0 */ + feature = ARM_FEATURE_V8_SHA512; + genfn = gen_helper_crypto_sha512su0; + break; + case 1: /* SM4E */ + feature = ARM_FEATURE_V8_SM4; + genfn = gen_helper_crypto_sm4e; + break; + default: + unallocated_encoding(s); + return; + } + + if (!arm_dc_feature(s, feature)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + tcg_rd_ptr = vec_full_reg_ptr(s, rd); + tcg_rn_ptr = vec_full_reg_ptr(s, rn); + + genfn(tcg_rd_ptr, tcg_rn_ptr); + + tcg_temp_free_ptr(tcg_rd_ptr); + tcg_temp_free_ptr(tcg_rn_ptr); +} + +/* Crypto four-register + * 31 23 22 21 20 16 15 14 10 9 5 4 0 + * +-------------------+-----+------+---+------+------+------+ + * | 1 1 0 0 1 1 1 0 0 | Op0 | Rm | 0 | Ra | Rn | Rd | + * +-------------------+-----+------+---+------+------+------+ + */ +static void disas_crypto_four_reg(DisasContext *s, uint32_t insn) +{ + int op0 = extract32(insn, 21, 2); + int rm = extract32(insn, 16, 5); + int ra = extract32(insn, 10, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + int feature; + + switch (op0) { + case 0: /* EOR3 */ + case 1: /* BCAX */ + feature = ARM_FEATURE_V8_SHA3; + break; + case 2: /* SM3SS1 */ + feature = ARM_FEATURE_V8_SM3; + break; + default: + unallocated_encoding(s); + return; + } + + if (!arm_dc_feature(s, feature)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + if (op0 < 2) { + TCGv_i64 tcg_op1, tcg_op2, tcg_op3, tcg_res[2]; + int pass; + + tcg_op1 = tcg_temp_new_i64(); + tcg_op2 = tcg_temp_new_i64(); + tcg_op3 = tcg_temp_new_i64(); + tcg_res[0] = tcg_temp_new_i64(); + tcg_res[1] = tcg_temp_new_i64(); + + for (pass = 0; pass < 2; pass++) { + read_vec_element(s, tcg_op1, rn, pass, MO_64); + read_vec_element(s, tcg_op2, rm, pass, MO_64); + read_vec_element(s, tcg_op3, ra, pass, MO_64); + + if (op0 == 0) { + /* EOR3 */ + tcg_gen_xor_i64(tcg_res[pass], tcg_op2, tcg_op3); + } else { + /* BCAX */ + tcg_gen_andc_i64(tcg_res[pass], tcg_op2, tcg_op3); + } + tcg_gen_xor_i64(tcg_res[pass], tcg_res[pass], tcg_op1); + } + write_vec_element(s, tcg_res[0], rd, 0, MO_64); + write_vec_element(s, tcg_res[1], rd, 1, MO_64); + + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + tcg_temp_free_i64(tcg_op3); + tcg_temp_free_i64(tcg_res[0]); + tcg_temp_free_i64(tcg_res[1]); + } else { + TCGv_i32 tcg_op1, tcg_op2, tcg_op3, tcg_res, tcg_zero; + + tcg_op1 = tcg_temp_new_i32(); + tcg_op2 = tcg_temp_new_i32(); + tcg_op3 = tcg_temp_new_i32(); + tcg_res = tcg_temp_new_i32(); + tcg_zero = tcg_const_i32(0); + + read_vec_element_i32(s, tcg_op1, rn, 3, MO_32); + read_vec_element_i32(s, tcg_op2, rm, 3, MO_32); + read_vec_element_i32(s, tcg_op3, ra, 3, MO_32); + + tcg_gen_rotri_i32(tcg_res, tcg_op1, 20); + tcg_gen_add_i32(tcg_res, tcg_res, tcg_op2); + tcg_gen_add_i32(tcg_res, tcg_res, tcg_op3); + tcg_gen_rotri_i32(tcg_res, tcg_res, 25); + + write_vec_element_i32(s, tcg_zero, rd, 0, MO_32); + write_vec_element_i32(s, tcg_zero, rd, 1, MO_32); + write_vec_element_i32(s, tcg_zero, rd, 2, MO_32); + write_vec_element_i32(s, tcg_res, rd, 3, MO_32); + + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + tcg_temp_free_i32(tcg_op3); + tcg_temp_free_i32(tcg_res); + tcg_temp_free_i32(tcg_zero); + } +} + +/* 
Crypto XAR + * 31 21 20 16 15 10 9 5 4 0 + * +-----------------------+------+--------+------+------+ + * | 1 1 0 0 1 1 1 0 1 0 0 | Rm | imm6 | Rn | Rd | + * +-----------------------+------+--------+------+------+ + */ +static void disas_crypto_xar(DisasContext *s, uint32_t insn) +{ + int rm = extract32(insn, 16, 5); + int imm6 = extract32(insn, 10, 6); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + TCGv_i64 tcg_op1, tcg_op2, tcg_res[2]; + int pass; + + if (!arm_dc_feature(s, ARM_FEATURE_V8_SHA3)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + tcg_op1 = tcg_temp_new_i64(); + tcg_op2 = tcg_temp_new_i64(); + tcg_res[0] = tcg_temp_new_i64(); + tcg_res[1] = tcg_temp_new_i64(); + + for (pass = 0; pass < 2; pass++) { + read_vec_element(s, tcg_op1, rn, pass, MO_64); + read_vec_element(s, tcg_op2, rm, pass, MO_64); + + tcg_gen_xor_i64(tcg_res[pass], tcg_op1, tcg_op2); + tcg_gen_rotri_i64(tcg_res[pass], tcg_res[pass], imm6); + } + write_vec_element(s, tcg_res[0], rd, 0, MO_64); + write_vec_element(s, tcg_res[1], rd, 1, MO_64); + + tcg_temp_free_i64(tcg_op1); + tcg_temp_free_i64(tcg_op2); + tcg_temp_free_i64(tcg_res[0]); + tcg_temp_free_i64(tcg_res[1]); +} + +/* Crypto three-reg imm2 + * 31 21 20 16 15 14 13 12 11 10 9 5 4 0 + * +-----------------------+------+-----+------+--------+------+------+ + * | 1 1 0 0 1 1 1 0 0 1 0 | Rm | 1 0 | imm2 | opcode | Rn | Rd | + * +-----------------------+------+-----+------+--------+------+------+ + */ +static void disas_crypto_three_reg_imm2(DisasContext *s, uint32_t insn) +{ + int opcode = extract32(insn, 10, 2); + int imm2 = extract32(insn, 12, 2); + int rm = extract32(insn, 16, 5); + int rn = extract32(insn, 5, 5); + int rd = extract32(insn, 0, 5); + TCGv_ptr tcg_rd_ptr, tcg_rn_ptr, tcg_rm_ptr; + TCGv_i32 tcg_imm2, tcg_opcode; + + if (!arm_dc_feature(s, ARM_FEATURE_V8_SM3)) { + unallocated_encoding(s); + return; + } + + if (!fp_access_check(s)) { + return; + } + + tcg_rd_ptr = vec_full_reg_ptr(s, rd); + tcg_rn_ptr = vec_full_reg_ptr(s, rn); + tcg_rm_ptr = vec_full_reg_ptr(s, rm); + tcg_imm2 = tcg_const_i32(imm2); + tcg_opcode = tcg_const_i32(opcode); + + gen_helper_crypto_sm3tt(tcg_rd_ptr, tcg_rn_ptr, tcg_rm_ptr, tcg_imm2, + tcg_opcode); + + tcg_temp_free_ptr(tcg_rd_ptr); + tcg_temp_free_ptr(tcg_rn_ptr); + tcg_temp_free_ptr(tcg_rm_ptr); + tcg_temp_free_i32(tcg_imm2); + tcg_temp_free_i32(tcg_opcode); +} + /* C3.6 Data processing - SIMD, inc Crypto * * As the decode gets a little complex we are using a table based @@ -11161,6 +11951,11 @@ static const AArch64DecodeTable data_proc_simd[] = { { 0x4e280800, 0xff3e0c00, disas_crypto_aes }, { 0x5e000000, 0xff208c00, disas_crypto_three_reg_sha }, { 0x5e280800, 0xff3e0c00, disas_crypto_two_reg_sha }, + { 0xce608000, 0xffe0b000, disas_crypto_three_reg_sha512 }, + { 0xcec08000, 0xfffff000, disas_crypto_two_reg_sha512 }, + { 0xce000000, 0xff808000, disas_crypto_four_reg }, + { 0xce800000, 0xffe00000, disas_crypto_xar }, + { 0xce408000, 0xffe0c000, disas_crypto_three_reg_imm2 }, { 0x00000000, 0x00000000, NULL } }; @@ -11263,6 +12058,8 @@ static int aarch64_tr_init_disas_context(DisasContextBase *dcbase, dc->user = (dc->current_el == 0); #endif dc->fp_excp_el = ARM_TBFLAG_FPEXC_EL(dc->base.tb->flags); + dc->sve_excp_el = ARM_TBFLAG_SVEEXC_EL(dc->base.tb->flags); + dc->sve_len = (ARM_TBFLAG_ZCR_LEN(dc->base.tb->flags) + 1) * 16; dc->vec_len = 0; dc->vec_stride = 0; dc->cp_regs = arm_cpu->cp_regs; diff --git a/target/arm/translate.c b/target/arm/translate.c 
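
As a cross-check on the logic-op rewrite in disas_simd_3same_logic() above: BSL, BIT and BIF all reduce to the same xor/and/xor pattern, differing only in which operand acts as the selector. The following is a small, self-contained C sketch of ours (not code from the patch; the function names are invented) that verifies each identity against the per-bit mux it implements:

/* Per-lane bitwise selects used by the AdvSIMD logic ops:
 *   BSL: d selects, result = n where d is 1, else m
 *   BIT: m selects, insert n into d where m is 1
 *   BIF: m selects, insert n into d where m is 0
 */
#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t bsl(uint64_t d, uint64_t n, uint64_t m)
{
    return ((n ^ m) & d) ^ m;   /* same op sequence as gen_bsl_i64() */
}

static uint64_t bit(uint64_t d, uint64_t n, uint64_t m)
{
    return ((n ^ d) & m) ^ d;   /* same op sequence as gen_bit_i64() */
}

static uint64_t bif(uint64_t d, uint64_t n, uint64_t m)
{
    return ((n ^ d) & ~m) ^ d;  /* same op sequence as gen_bif_i64() */
}

int main(void)
{
    uint64_t d = 0xff00ff00ff00ff00ull;
    uint64_t n = 0x1234567890abcdefull;
    uint64_t m = 0xfedcba9876543210ull;

    /* Compare against the naive per-bit mux definitions. */
    assert(bsl(d, n, m) == ((n & d) | (m & ~d)));
    assert(bit(d, n, m) == ((n & m) | (d & ~m)));
    assert(bif(d, n, m) == ((n & ~m) | (d & m)));

    printf("bsl=%016" PRIx64 "\n", bsl(d, n, m));
    return 0;
}

This dual role of rd, as selector or insertion target as well as destination, is why the bsl_op, bit_op and bif_op descriptors above all set .load_dest = true.
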
index 55826b7e5a..1270022289 100644 --- a/target/arm/translate.c +++ b/target/arm/translate.c @@ -1512,13 +1512,12 @@ static inline void gen_vfp_st(DisasContext *s, int dp, TCGv_i32 addr) } } -static inline long -vfp_reg_offset (int dp, int reg) +static inline long vfp_reg_offset(bool dp, unsigned reg) { if (dp) { - return offsetof(CPUARMState, vfp.regs[reg]); + return offsetof(CPUARMState, vfp.zregs[reg >> 1].d[reg & 1]); } else { - long ofs = offsetof(CPUARMState, vfp.regs[reg >> 1]); + long ofs = offsetof(CPUARMState, vfp.zregs[reg >> 2].d[(reg >> 1) & 1]); if (reg & 1) { ofs += offsetof(CPU_DoubleU, l.upper); } else { @@ -9926,6 +9925,7 @@ static void disas_thumb2_insn(DisasContext *s, uint32_t insn) tcg_temp_free_i32(addr); tcg_temp_free_i32(op); store_reg(s, rd, ttresp); + break; } goto illegal_op; } diff --git a/target/arm/translate.h b/target/arm/translate.h index 3f4df91e5e..c47febf99d 100644 --- a/target/arm/translate.h +++ b/target/arm/translate.h @@ -29,6 +29,8 @@ typedef struct DisasContext { bool tbi1; /* TBI1 for EL0/1, not used for EL2/3 */ bool ns; /* Use non-secure CPREG bank on access */ int fp_excp_el; /* FP exception EL or 0 if enabled */ + int sve_excp_el; /* SVE exception EL or 0 if enabled */ + int sve_len; /* SVE vector length in bytes */ /* Flag indicating that exceptions from secure mode are routed to EL3. */ bool secure_routed_to_el3; bool vfp_enabled; /* FP enabled via FPSCR.EN */ diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c index 979469dc3c..da7cb9c278 100644 --- a/target/s390x/cpu.c +++ b/target/s390x/cpu.c @@ -100,7 +100,6 @@ static void s390_cpu_initial_reset(CPUState *s) { S390CPU *cpu = S390_CPU(s); CPUS390XState *env = &cpu->env; - int i; s390_cpu_reset(s); /* initial reset does not clear everything! */ @@ -116,10 +115,6 @@ static void s390_cpu_initial_reset(CPUState *s) env->gbea = 1; env->pfault_token = -1UL; - for (i = 0; i < ARRAY_SIZE(env->io_index); i++) { - env->io_index[i] = -1; - } - env->mchk_index = -1; /* tininess for underflow is detected before rounding */ set_float_detect_tininess(float_tininess_before_rounding, @@ -137,7 +132,6 @@ static void s390_cpu_full_reset(CPUState *s) S390CPU *cpu = S390_CPU(s); S390CPUClass *scc = S390_CPU_GET_CLASS(cpu); CPUS390XState *env = &cpu->env; - int i; scc->parent_reset(s); cpu->env.sigp_order = 0; @@ -153,10 +147,6 @@ static void s390_cpu_full_reset(CPUState *s) env->gbea = 1; env->pfault_token = -1UL; - for (i = 0; i < ARRAY_SIZE(env->io_index); i++) { - env->io_index[i] = -1; - } - env->mchk_index = -1; /* tininess for underflow is detected before rounding */ set_float_detect_tininess(float_tininess_before_rounding, diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h index a1123ad621..21ce40d5b6 100644 --- a/target/s390x/cpu.h +++ b/target/s390x/cpu.h @@ -53,12 +53,6 @@ #define MMU_USER_IDX 0 -#define MAX_IO_QUEUE 16 -#define MAX_MCHK_QUEUE 16 - -#define PSW_MCHK_MASK 0x0004000000000000 -#define PSW_IO_MASK 0x0200000000000000 - #define S390_MAX_CPUS 248 typedef struct PSW { @@ -66,17 +60,6 @@ typedef struct PSW { uint64_t addr; } PSW; -typedef struct IOIntQueue { - uint16_t id; - uint16_t nr; - uint32_t parm; - uint32_t word; -} IOIntQueue; - -typedef struct MchkQueue { - uint16_t type; -} MchkQueue; - struct CPUS390XState { uint64_t regs[16]; /* GP registers */ /* @@ -122,15 +105,9 @@ struct CPUS390XState { uint64_t cregs[16]; /* control registers */ - IOIntQueue io_queue[MAX_IO_QUEUE][8]; - MchkQueue mchk_queue[MAX_MCHK_QUEUE]; - int pending_int; - uint32_t service_param; uint16_t 
external_call_addr; DECLARE_BITMAP(emergency_signals, S390_MAX_CPUS); - int io_index[8]; - int mchk_index; uint64_t ckc; uint64_t cputm; @@ -409,9 +386,6 @@ static inline void cpu_get_tb_cpu_state(CPUS390XState* env, target_ulong *pc, #define EXCP_IO 7 /* I/O interrupt */ #define EXCP_MCHK 8 /* machine check */ -#define INTERRUPT_IO (1 << 0) -#define INTERRUPT_MCHK (1 << 1) -#define INTERRUPT_EXT_SERVICE (1 << 2) #define INTERRUPT_EXT_CPU_TIMER (1 << 3) #define INTERRUPT_EXT_CLOCK_COMPARATOR (1 << 4) #define INTERRUPT_EXTERNAL_CALL (1 << 5) @@ -452,62 +426,66 @@ static inline void setcc(S390CPU *cpu, uint64_t cc) } /* STSI */ -#define STSI_LEVEL_MASK 0x00000000f0000000ULL -#define STSI_LEVEL_CURRENT 0x0000000000000000ULL -#define STSI_LEVEL_1 0x0000000010000000ULL -#define STSI_LEVEL_2 0x0000000020000000ULL -#define STSI_LEVEL_3 0x0000000030000000ULL +#define STSI_R0_FC_MASK 0x00000000f0000000ULL +#define STSI_R0_FC_CURRENT 0x0000000000000000ULL +#define STSI_R0_FC_LEVEL_1 0x0000000010000000ULL +#define STSI_R0_FC_LEVEL_2 0x0000000020000000ULL +#define STSI_R0_FC_LEVEL_3 0x0000000030000000ULL #define STSI_R0_RESERVED_MASK 0x000000000fffff00ULL #define STSI_R0_SEL1_MASK 0x00000000000000ffULL #define STSI_R1_RESERVED_MASK 0x00000000ffff0000ULL #define STSI_R1_SEL2_MASK 0x000000000000ffffULL /* Basic Machine Configuration */ -struct sysib_111 { - uint32_t res1[8]; +typedef struct SysIB_111 { + uint8_t res1[32]; uint8_t manuf[16]; uint8_t type[4]; uint8_t res2[12]; uint8_t model[16]; uint8_t sequence[16]; uint8_t plant[4]; - uint8_t res3[156]; -}; + uint8_t res3[3996]; +} SysIB_111; +QEMU_BUILD_BUG_ON(sizeof(SysIB_111) != 4096); /* Basic Machine CPU */ -struct sysib_121 { - uint32_t res1[80]; +typedef struct SysIB_121 { + uint8_t res1[80]; uint8_t sequence[16]; uint8_t plant[4]; uint8_t res2[2]; uint16_t cpu_addr; - uint8_t res3[152]; -}; + uint8_t res3[3992]; +} SysIB_121; +QEMU_BUILD_BUG_ON(sizeof(SysIB_121) != 4096); /* Basic Machine CPUs */ -struct sysib_122 { +typedef struct SysIB_122 { uint8_t res1[32]; uint32_t capability; uint16_t total_cpus; - uint16_t active_cpus; + uint16_t conf_cpus; uint16_t standby_cpus; uint16_t reserved_cpus; uint16_t adjustments[2026]; -}; +} SysIB_122; +QEMU_BUILD_BUG_ON(sizeof(SysIB_122) != 4096); /* LPAR CPU */ -struct sysib_221 { - uint32_t res1[80]; +typedef struct SysIB_221 { + uint8_t res1[80]; uint8_t sequence[16]; uint8_t plant[4]; uint16_t cpu_id; uint16_t cpu_addr; - uint8_t res3[152]; -}; + uint8_t res3[3992]; +} SysIB_221; +QEMU_BUILD_BUG_ON(sizeof(SysIB_221) != 4096); /* LPAR CPUs */ -struct sysib_222 { - uint32_t res1[32]; +typedef struct SysIB_222 { + uint8_t res1[32]; uint16_t lpar_num; uint8_t res2; uint8_t lcpuc; @@ -520,11 +498,12 @@ struct sysib_222 { uint8_t res3[16]; uint16_t dedicated_cpus; uint16_t shared_cpus; - uint8_t res4[180]; -}; + uint8_t res4[4020]; +} SysIB_222; +QEMU_BUILD_BUG_ON(sizeof(SysIB_222) != 4096); /* VM CPUs */ -struct sysib_322 { +typedef struct SysIB_322 { uint8_t res1[31]; uint8_t count; struct { @@ -543,7 +522,18 @@ struct sysib_322 { } vm[8]; uint8_t res4[1504]; uint8_t ext_names[8][256]; -}; +} SysIB_322; +QEMU_BUILD_BUG_ON(sizeof(SysIB_322) != 4096); + +typedef union SysIB { + SysIB_111 sysib_111; + SysIB_121 sysib_121; + SysIB_122 sysib_122; + SysIB_221 sysib_221; + SysIB_222 sysib_222; + SysIB_322 sysib_322; +} SysIB; +QEMU_BUILD_BUG_ON(sizeof(SysIB) != 4096); /* MMU defines */ #define _ASCE_ORIGIN ~0xfffULL /* segment table origin */ @@ -718,6 +708,10 @@ static inline unsigned int 
s390_cpu_set_state(uint8_t cpu_state, S390CPU *cpu) return 0; } #endif /* CONFIG_USER_ONLY */ +static inline uint8_t s390_cpu_get_state(S390CPU *cpu) +{ + return cpu->env.cpu_state; +} /* cpu_models.c */ @@ -752,7 +746,6 @@ void s390_program_interrupt(CPUS390XState *env, uint32_t code, int ilen, /* service interrupts are floating therefore we must not pass an cpustate */ void s390_sclp_extint(uint32_t parm); - /* mmu_helper.c */ int s390_cpu_virt_mem_rw(S390CPU *cpu, vaddr laddr, uint8_t ar, void *hostbuf, int len, bool is_write); diff --git a/target/s390x/cpu_features.c b/target/s390x/cpu_features.c index 85d10b5710..a5619f2893 100644 --- a/target/s390x/cpu_features.c +++ b/target/s390x/cpu_features.c @@ -156,8 +156,12 @@ static const S390FeatDef s390_features[] = { FEAT_INIT("ptff-qpc", S390_FEAT_TYPE_PTFF, 3, "PTFF Query Physical Clock"), FEAT_INIT("ptff-qui", S390_FEAT_TYPE_PTFF, 4, "PTFF Query UTC Information"), FEAT_INIT("ptff-qtou", S390_FEAT_TYPE_PTFF, 5, "PTFF Query TOD Offset User"), + FEAT_INIT("ptff-qsie", S390_FEAT_TYPE_PTFF, 10, "PTFF Query Steering Information Extended"), + FEAT_INIT("ptff-qtoue", S390_FEAT_TYPE_PTFF, 13, "PTFF Query TOD Offset User Extended"), FEAT_INIT("ptff-sto", S390_FEAT_TYPE_PTFF, 65, "PTFF Set TOD Offset"), FEAT_INIT("ptff-stou", S390_FEAT_TYPE_PTFF, 69, "PTFF Set TOD Offset User"), + FEAT_INIT("ptff-stoe", S390_FEAT_TYPE_PTFF, 73, "PTFF Set TOD Offset Extended"), + FEAT_INIT("ptff-stoue", S390_FEAT_TYPE_PTFF, 77, "PTFF Set TOD Offset User Extended"), FEAT_INIT("kmac-dea", S390_FEAT_TYPE_KMAC, 1, "KMAC DEA"), FEAT_INIT("kmac-tdea-128", S390_FEAT_TYPE_KMAC, 2, "KMAC TDEA-128"), @@ -445,6 +449,7 @@ static S390FeatGroupDef s390_feature_groups[] = { FEAT_GROUP_INIT("plo", PLO, "Perform-locked-operation facility"), FEAT_GROUP_INIT("tods", TOD_CLOCK_STEERING, "Tod-clock-steering facility"), FEAT_GROUP_INIT("gen13ptff", GEN13_PTFF, "PTFF enhancements introduced with z13"), + FEAT_GROUP_INIT("mepochptff", MULTIPLE_EPOCH_PTFF, "PTFF enhancements introduced with Multiple-epoch facility"), FEAT_GROUP_INIT("msa", MSA, "Message-security-assist facility"), FEAT_GROUP_INIT("msa1", MSA_EXT_1, "Message-security-assist-extension 1 facility"), FEAT_GROUP_INIT("msa2", MSA_EXT_2, "Message-security-assist-extension 2 facility"), diff --git a/target/s390x/cpu_features_def.h b/target/s390x/cpu_features_def.h index 4d930871b4..7c5915c7b2 100644 --- a/target/s390x/cpu_features_def.h +++ b/target/s390x/cpu_features_def.h @@ -151,8 +151,12 @@ typedef enum { S390_FEAT_PTFF_QPT, S390_FEAT_PTFF_QUI, S390_FEAT_PTFF_QTOU, + S390_FEAT_PTFF_QSIE, + S390_FEAT_PTFF_QTOUE, S390_FEAT_PTFF_STO, S390_FEAT_PTFF_STOU, + S390_FEAT_PTFF_STOE, + S390_FEAT_PTFF_STOUE, /* KMAC */ S390_FEAT_KMAC_DEA, diff --git a/target/s390x/cpu_models.c b/target/s390x/cpu_models.c index 584c409a19..1d5f0da4fe 100644 --- a/target/s390x/cpu_models.c +++ b/target/s390x/cpu_models.c @@ -23,6 +23,7 @@ #include "qapi/qmp/qdict.h" #ifndef CONFIG_USER_ONLY #include "sysemu/arch_init.h" +#include "hw/pci/pci.h" #endif #define CPUDEF_INIT(_type, _gen, _ec_ga, _mha_pow, _hmfai, _name, _desc) \ @@ -1271,6 +1272,11 @@ static void register_types(void) /* init all bitmaps from gnerated data initially */ s390_init_feat_bitmap(qemu_max_cpu_feat_init, qemu_max_cpu_feat); +#ifndef CONFIG_USER_ONLY + if (!pci_available) { + clear_bit(S390_FEAT_ZPCI, qemu_max_cpu_feat); + } +#endif for (i = 0; i < ARRAY_SIZE(s390_cpu_defs); i++) { s390_init_feat_bitmap(s390_cpu_defs[i].base_init, s390_cpu_defs[i].base_feat); diff --git 
a/target/s390x/excp_helper.c b/target/s390x/excp_helper.c index 6967fbfa25..411051edc3 100644 --- a/target/s390x/excp_helper.c +++ b/target/s390x/excp_helper.c @@ -28,6 +28,7 @@ #include "exec/address-spaces.h" #ifndef CONFIG_USER_ONLY #include "sysemu/sysemu.h" +#include "hw/s390x/s390_flic.h" #endif /* #define DEBUG_S390 */ @@ -236,6 +237,7 @@ static void do_svc_interrupt(CPUS390XState *env) static void do_ext_interrupt(CPUS390XState *env) { + QEMUS390FLICState *flic = QEMU_S390_FLIC(s390_get_flic()); S390CPU *cpu = s390_env_get_cpu(env); uint64_t mask, addr; uint16_t cpu_addr; @@ -272,17 +274,14 @@ static void do_ext_interrupt(CPUS390XState *env) lowcore->ext_int_code = cpu_to_be16(EXT_CPU_TIMER); lowcore->cpu_addr = 0; env->pending_int &= ~INTERRUPT_EXT_CPU_TIMER; - } else if ((env->pending_int & INTERRUPT_EXT_SERVICE) && + } else if (qemu_s390_flic_has_service(flic) && (env->cregs[0] & CR0_SERVICE_SC)) { - /* - * FIXME: floating IRQs should be considered by all CPUs and - * shuld not get cleared by CPU reset. - */ + uint32_t param; + + param = qemu_s390_flic_dequeue_service(flic); lowcore->ext_int_code = cpu_to_be16(EXT_SERVICE); - lowcore->ext_params = cpu_to_be32(env->service_param); + lowcore->ext_params = cpu_to_be32(param); lowcore->cpu_addr = 0; - env->service_param = 0; - env->pending_int &= ~INTERRUPT_EXT_SERVICE; } else { g_assert_not_reached(); } @@ -302,95 +301,46 @@ static void do_ext_interrupt(CPUS390XState *env) static void do_io_interrupt(CPUS390XState *env) { - S390CPU *cpu = s390_env_get_cpu(env); + QEMUS390FLICState *flic = QEMU_S390_FLIC(s390_get_flic()); + uint64_t mask, addr; + QEMUS390FlicIO *io; LowCore *lowcore; - IOIntQueue *q; - uint8_t isc; - int disable = 1; - int found = 0; - - if (!(env->psw.mask & PSW_MASK_IO)) { - cpu_abort(CPU(cpu), "I/O int w/o I/O mask\n"); - } - - for (isc = 0; isc < ARRAY_SIZE(env->io_index); isc++) { - uint64_t isc_bits; - - if (env->io_index[isc] < 0) { - continue; - } - if (env->io_index[isc] >= MAX_IO_QUEUE) { - cpu_abort(CPU(cpu), "I/O queue overrun for isc %d: %d\n", - isc, env->io_index[isc]); - } - - q = &env->io_queue[env->io_index[isc]][isc]; - isc_bits = ISC_TO_ISC_BITS(IO_INT_WORD_ISC(q->word)); - if (!(env->cregs[6] & isc_bits)) { - disable = 0; - continue; - } - if (!found) { - uint64_t mask, addr; - - found = 1; - lowcore = cpu_map_lowcore(env); - - lowcore->subchannel_id = cpu_to_be16(q->id); - lowcore->subchannel_nr = cpu_to_be16(q->nr); - lowcore->io_int_parm = cpu_to_be32(q->parm); - lowcore->io_int_word = cpu_to_be32(q->word); - lowcore->io_old_psw.mask = cpu_to_be64(get_psw_mask(env)); - lowcore->io_old_psw.addr = cpu_to_be64(env->psw.addr); - mask = be64_to_cpu(lowcore->io_new_psw.mask); - addr = be64_to_cpu(lowcore->io_new_psw.addr); - cpu_unmap_lowcore(lowcore); + g_assert(env->psw.mask & PSW_MASK_IO); + io = qemu_s390_flic_dequeue_io(flic, env->cregs[6]); + g_assert(io); - env->io_index[isc]--; + lowcore = cpu_map_lowcore(env); - DPRINTF("%s: %" PRIx64 " %" PRIx64 "\n", __func__, - env->psw.mask, env->psw.addr); - load_psw(env, mask, addr); - } - if (env->io_index[isc] >= 0) { - disable = 0; - } - continue; - } + lowcore->subchannel_id = cpu_to_be16(io->id); + lowcore->subchannel_nr = cpu_to_be16(io->nr); + lowcore->io_int_parm = cpu_to_be32(io->parm); + lowcore->io_int_word = cpu_to_be32(io->word); + lowcore->io_old_psw.mask = cpu_to_be64(get_psw_mask(env)); + lowcore->io_old_psw.addr = cpu_to_be64(env->psw.addr); + mask = be64_to_cpu(lowcore->io_new_psw.mask); + addr = 
be64_to_cpu(lowcore->io_new_psw.addr); - if (disable) { - env->pending_int &= ~INTERRUPT_IO; - } + cpu_unmap_lowcore(lowcore); + g_free(io); + DPRINTF("%s: %" PRIx64 " %" PRIx64 "\n", __func__, env->psw.mask, + env->psw.addr); + load_psw(env, mask, addr); } static void do_mchk_interrupt(CPUS390XState *env) { - S390CPU *cpu = s390_env_get_cpu(env); + QEMUS390FLICState *flic = QEMU_S390_FLIC(s390_get_flic()); uint64_t mask, addr; LowCore *lowcore; - MchkQueue *q; int i; - if (!(env->psw.mask & PSW_MASK_MCHECK)) { - cpu_abort(CPU(cpu), "Machine check w/o mchk mask\n"); - } - - if (env->mchk_index < 0 || env->mchk_index >= MAX_MCHK_QUEUE) { - cpu_abort(CPU(cpu), "Mchk queue overrun: %d\n", env->mchk_index); - } - - q = &env->mchk_queue[env->mchk_index]; + /* for now we only support channel report machine checks (floating) */ + g_assert(env->psw.mask & PSW_MASK_MCHECK); + g_assert(env->cregs[14] & CR14_CHANNEL_REPORT_SC); - if (q->type != 1) { - /* Don't know how to handle this... */ - cpu_abort(CPU(cpu), "Unknown machine check type %d\n", q->type); - } - if (!(env->cregs[14] & (1 << 28))) { - /* CRW machine checks disabled */ - return; - } + qemu_s390_flic_dequeue_crw_mchk(flic); lowcore = cpu_map_lowcore(env); @@ -417,11 +367,6 @@ static void do_mchk_interrupt(CPUS390XState *env) cpu_unmap_lowcore(lowcore); - env->mchk_index--; - if (env->mchk_index == -1) { - env->pending_int &= ~INTERRUPT_MCHK; - } - DPRINTF("%s: %" PRIx64 " %" PRIx64 "\n", __func__, env->psw.mask, env->psw.addr); @@ -430,12 +375,15 @@ void s390_cpu_do_interrupt(CPUState *cs) { + QEMUS390FLICState *flic = QEMU_S390_FLIC(s390_get_flic()); S390CPU *cpu = S390_CPU(cs); CPUS390XState *env = &cpu->env; + bool stopped = false; qemu_log_mask(CPU_LOG_INT, "%s: %d at pc=%" PRIx64 "\n", __func__, cs->exception_index, env->psw.addr); +try_deliver: /* handle machine checks */ if (cs->exception_index == -1 && s390_cpu_has_mcck_int(cpu)) { cs->exception_index = EXCP_MCHK; @@ -478,20 +426,30 @@ void s390_cpu_do_interrupt(CPUState *cs) break; case EXCP_STOP: do_stop_interrupt(env); + stopped = true; break; } - /* WAIT PSW during interrupt injection or STOP interrupt */ - if (cs->exception_index == EXCP_HLT) { - /* don't trigger a cpu_loop_exit(), use an interrupt instead */ - cpu_interrupt(CPU(cpu), CPU_INTERRUPT_HALT); + if (cs->exception_index != -1 && !stopped) { + /* check if there are more pending interrupts to deliver */ + cs->exception_index = -1; + goto try_deliver; } cs->exception_index = -1; /* we might still have pending interrupts, but not deliverable */ - if (!env->pending_int) { + if (!env->pending_int && !qemu_s390_flic_has_any(flic)) { cs->interrupt_request &= ~CPU_INTERRUPT_HARD; } + + /* WAIT PSW during interrupt injection or STOP interrupt */ + if ((env->psw.mask & PSW_MASK_WAIT) || stopped) { + /* don't trigger a cpu_loop_exit(), use an interrupt instead */ + cpu_interrupt(CPU(cpu), CPU_INTERRUPT_HALT); + } else if (cs->halted) { + /* unhalt if we had a WAIT PSW somewhere in our injection chain */ + s390_cpu_unhalt(cpu); + } } bool s390_cpu_exec_interrupt(CPUState *cs, int interrupt_request) @@ -509,6 +467,11 @@ bool s390_cpu_exec_interrupt(CPUState *cs, int interrupt_request) s390_cpu_do_interrupt(cs); return true; } + if (env->psw.mask & PSW_MASK_WAIT) { + /* Woken up because of a floating interrupt but it has already + * been delivered. Go back to sleep. 
*/ + cpu_interrupt(CPU(cpu), CPU_INTERRUPT_HALT); + } } return false; } diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c index bd483333ae..0cdbc15378 100644 --- a/target/s390x/gen-features.c +++ b/target/s390x/gen-features.c @@ -57,6 +57,12 @@ S390_FEAT_PTFF_QTOU, \ S390_FEAT_PTFF_STOU +#define S390_FEAT_GROUP_MULTIPLE_EPOCH_PTFF \ + S390_FEAT_PTFF_QSIE, \ + S390_FEAT_PTFF_QTOUE, \ + S390_FEAT_PTFF_STOE, \ + S390_FEAT_PTFF_STOUE + #define S390_FEAT_GROUP_MSA \ S390_FEAT_MSA, \ S390_FEAT_KMAC_DEA, \ @@ -217,6 +223,9 @@ static uint16_t group_TOD_CLOCK_STEERING[] = { static uint16_t group_GEN13_PTFF[] = { S390_FEAT_GROUP_GEN13_PTFF, }; +static uint16_t group_MULTIPLE_EPOCH_PTFF[] = { + S390_FEAT_GROUP_MULTIPLE_EPOCH_PTFF, +}; static uint16_t group_MSA[] = { S390_FEAT_GROUP_MSA, }; @@ -464,6 +473,7 @@ static uint16_t full_GEN14_GA1[] = { S390_FEAT_CMM_NT, S390_FEAT_HPMA2, S390_FEAT_SIE_KSS, + S390_FEAT_GROUP_MULTIPLE_EPOCH_PTFF, }; /* Default features (in order of release) @@ -570,8 +580,10 @@ static uint16_t qemu_LATEST[] = { S390_FEAT_STFLE_49, S390_FEAT_LOCAL_TLB_CLEARING, S390_FEAT_INTERLOCKED_ACCESS_2, - S390_FEAT_MSA_EXT_4, + S390_FEAT_ADAPTER_EVENT_NOTIFICATION, + S390_FEAT_ADAPTER_INT_SUPPRESSION, S390_FEAT_MSA_EXT_3, + S390_FEAT_MSA_EXT_4, }; /* add all new definitions before this point */ @@ -580,6 +592,8 @@ static uint16_t qemu_MAX[] = { S390_FEAT_STFLE_53, /* generates a dependency warning, leave it out for now */ S390_FEAT_MSA_EXT_5, + /* only with CONFIG_PCI */ + S390_FEAT_ZPCI, }; /****** END FEATURE DEFS ******/ @@ -662,6 +676,7 @@ static FeatGroupDefSpec FeatGroupDef[] = { FEAT_GROUP_INITIALIZER(PLO), FEAT_GROUP_INITIALIZER(TOD_CLOCK_STEERING), FEAT_GROUP_INITIALIZER(GEN13_PTFF), + FEAT_GROUP_INITIALIZER(MULTIPLE_EPOCH_PTFF), FEAT_GROUP_INITIALIZER(MSA), FEAT_GROUP_INITIALIZER(MSA_EXT_1), FEAT_GROUP_INITIALIZER(MSA_EXT_2), diff --git a/target/s390x/helper.h b/target/s390x/helper.h index 59a1d9869b..59cba86a27 100644 --- a/target/s390x/helper.h +++ b/target/s390x/helper.h @@ -170,6 +170,16 @@ DEF_HELPER_4(schm, void, env, i64, i64, i64) DEF_HELPER_3(ssch, void, env, i64, i64) DEF_HELPER_2(stcrw, void, env, i64) DEF_HELPER_3(stsch, void, env, i64, i64) +DEF_HELPER_2(tpi, i32, env, i64) DEF_HELPER_3(tsch, void, env, i64, i64) DEF_HELPER_2(chsc, void, env, i64) + +DEF_HELPER_2(clp, void, env, i32) +DEF_HELPER_3(pcilg, void, env, i32, i32) +DEF_HELPER_3(pcistg, void, env, i32, i32) +DEF_HELPER_4(stpcifc, void, env, i32, i64, i32) +DEF_HELPER_3(sic, void, env, i64, i64) +DEF_HELPER_3(rpcit, void, env, i32, i32) +DEF_HELPER_5(pcistb, void, env, i32, i32, i64, i32) +DEF_HELPER_4(mpcifc, void, env, i32, i64, i32) #endif diff --git a/target/s390x/insn-data.def b/target/s390x/insn-data.def index 11ee43dcbc..621e10d615 100644 --- a/target/s390x/insn-data.def +++ b/target/s390x/insn-data.def @@ -1063,8 +1063,22 @@ C(0xb233, SSCH, S, Z, 0, insn, 0, 0, ssch, 0) C(0xb239, STCRW, S, Z, 0, insn, 0, 0, stcrw, 0) C(0xb234, STSCH, S, Z, 0, insn, 0, 0, stsch, 0) + C(0xb236, TPI , S, Z, la2, 0, 0, 0, tpi, 0) C(0xb235, TSCH, S, Z, 0, insn, 0, 0, tsch, 0) /* ??? Not listed in PoO ninth edition, but there's a linux driver that uses it: "A CHSC subchannel is usually present on LPAR only." 
*/ C(0xb25f, CHSC, RRE, Z, 0, insn, 0, 0, chsc, 0) + +/* zPCI Instructions */ + /* None of these instructions are documented in the PoP, so this is all + based upon target/s390x/kvm.c and Linux code and likely incomplete */ + C(0xebd0, PCISTB, RSY_a, PCI, la2, 0, 0, 0, pcistb, 0) + C(0xebd1, SIC, RSY_a, AIS, r1, r3, 0, 0, sic, 0) + C(0xb9a0, CLP, RRF_c, PCI, 0, 0, 0, 0, clp, 0) + C(0xb9d0, PCISTG, RRE, PCI, 0, 0, 0, 0, pcistg, 0) + C(0xb9d2, PCILG, RRE, PCI, 0, 0, 0, 0, pcilg, 0) + C(0xb9d3, RPCIT, RRE, PCI, 0, 0, 0, 0, rpcit, 0) + C(0xe3d0, MPCIFC, RXY_a, PCI, la2, 0, 0, 0, mpcifc, 0) + C(0xe3d4, STPCIFC, RXY_a, PCI, la2, 0, 0, 0, stpcifc, 0) + #endif /* CONFIG_USER_ONLY */ diff --git a/target/s390x/internal.h b/target/s390x/internal.h index fea165ffe4..d911e84958 100644 --- a/target/s390x/internal.h +++ b/target/s390x/internal.h @@ -278,11 +278,6 @@ static inline void s390_do_cpu_full_reset(CPUState *cs, run_on_cpu_data arg) cpu_reset(cs); } -static inline uint8_t s390_cpu_get_state(S390CPU *cpu) -{ - return cpu->env.cpu_state; -} - /* arch_dump.c */ int s390_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cs, diff --git a/target/s390x/interrupt.c b/target/s390x/interrupt.c index 39c026b8b5..25cfb3eef8 100644 --- a/target/s390x/interrupt.c +++ b/target/s390x/interrupt.c @@ -15,6 +15,9 @@ #include "exec/exec-all.h" #include "sysemu/kvm.h" #include "hw/s390x/ioinst.h" +#if !defined(CONFIG_USER_ONLY) +#include "hw/s390x/s390_flic.h" +#endif /* Ensure to exit the TB after this call! */ void trigger_pgm_exception(CPUS390XState *env, uint32_t code, uint32_t ilen) @@ -55,17 +58,6 @@ void s390_program_interrupt(CPUS390XState *env, uint32_t code, int ilen, } #if !defined(CONFIG_USER_ONLY) -static void cpu_inject_service(S390CPU *cpu, uint32_t param) -{ - CPUS390XState *env = &cpu->env; - - /* multiplexing is good enough for sclp - kvm does it internally as well*/ - env->service_param |= param; - - env->pending_int |= INTERRUPT_EXT_SERVICE; - cpu_interrupt(CPU(cpu), CPU_INTERRUPT_HARD); -} - void cpu_inject_clock_comparator(S390CPU *cpu) { CPUS390XState *env = &cpu->env; @@ -134,48 +126,6 @@ void cpu_inject_stop(S390CPU *cpu) cpu_interrupt(CPU(cpu), CPU_INTERRUPT_HARD); } -static void cpu_inject_io(S390CPU *cpu, uint16_t subchannel_id, - uint16_t subchannel_number, - uint32_t io_int_parm, uint32_t io_int_word) -{ - CPUS390XState *env = &cpu->env; - int isc = IO_INT_WORD_ISC(io_int_word); - - if (env->io_index[isc] == MAX_IO_QUEUE - 1) { - /* ugh - can't queue anymore. Let's drop. */ - return; - } - - env->io_index[isc]++; - assert(env->io_index[isc] < MAX_IO_QUEUE); - - env->io_queue[env->io_index[isc]][isc].id = subchannel_id; - env->io_queue[env->io_index[isc]][isc].nr = subchannel_number; - env->io_queue[env->io_index[isc]][isc].parm = io_int_parm; - env->io_queue[env->io_index[isc]][isc].word = io_int_word; - - env->pending_int |= INTERRUPT_IO; - cpu_interrupt(CPU(cpu), CPU_INTERRUPT_HARD); -} - -static void cpu_inject_crw_mchk(S390CPU *cpu) -{ - CPUS390XState *env = &cpu->env; - - if (env->mchk_index == MAX_MCHK_QUEUE - 1) { - /* ugh - can't queue anymore. Let's drop. */ - return; - } - - env->mchk_index++; - assert(env->mchk_index < MAX_MCHK_QUEUE); - - env->mchk_queue[env->mchk_index].type = 1; - - env->pending_int |= INTERRUPT_MCHK; - cpu_interrupt(CPU(cpu), CPU_INTERRUPT_HARD); -} - /* * All of the following interrupts are floating, i.e. not per-vcpu. 
* We just need a dummy cpustate in order to be able to inject in the @@ -183,53 +133,50 @@ static void cpu_inject_crw_mchk(S390CPU *cpu) */ void s390_sclp_extint(uint32_t parm) { - if (kvm_enabled()) { - kvm_s390_service_interrupt(parm); - } else { - S390CPU *dummy_cpu = s390_cpu_addr2state(0); + S390FLICState *fs = s390_get_flic(); + S390FLICStateClass *fsc = s390_get_flic_class(fs); - cpu_inject_service(dummy_cpu, parm); - } + fsc->inject_service(fs, parm); } void s390_io_interrupt(uint16_t subchannel_id, uint16_t subchannel_nr, uint32_t io_int_parm, uint32_t io_int_word) { - if (kvm_enabled()) { - kvm_s390_io_interrupt(subchannel_id, subchannel_nr, io_int_parm, - io_int_word); - } else { - S390CPU *dummy_cpu = s390_cpu_addr2state(0); + S390FLICState *fs = s390_get_flic(); + S390FLICStateClass *fsc = s390_get_flic_class(fs); - cpu_inject_io(dummy_cpu, subchannel_id, subchannel_nr, io_int_parm, - io_int_word); - } + fsc->inject_io(fs, subchannel_id, subchannel_nr, io_int_parm, io_int_word); } void s390_crw_mchk(void) { - if (kvm_enabled()) { - kvm_s390_crw_mchk(); - } else { - S390CPU *dummy_cpu = s390_cpu_addr2state(0); + S390FLICState *fs = s390_get_flic(); + S390FLICStateClass *fsc = s390_get_flic_class(fs); - cpu_inject_crw_mchk(dummy_cpu); - } + fsc->inject_crw_mchk(fs); } bool s390_cpu_has_mcck_int(S390CPU *cpu) { + QEMUS390FLICState *flic = s390_get_qemu_flic(s390_get_flic()); CPUS390XState *env = &cpu->env; if (!(env->psw.mask & PSW_MASK_MCHECK)) { return false; } - return env->pending_int & INTERRUPT_MCHK; + /* for now we only support channel report machine checks (floating) */ + if (qemu_s390_flic_has_crw_mchk(flic) && + (env->cregs[14] & CR14_CHANNEL_REPORT_SC)) { + return true; + } + + return false; } bool s390_cpu_has_ext_int(S390CPU *cpu) { + QEMUS390FLICState *flic = s390_get_qemu_flic(s390_get_flic()); CPUS390XState *env = &cpu->env; if (!(env->psw.mask & PSW_MASK_EXT)) { @@ -261,7 +208,7 @@ bool s390_cpu_has_ext_int(S390CPU *cpu) return true; } - if ((env->pending_int & INTERRUPT_EXT_SERVICE) && + if (qemu_s390_flic_has_service(flic) && (env->cregs[0] & CR0_SERVICE_SC)) { return true; } @@ -271,13 +218,14 @@ bool s390_cpu_has_ext_int(S390CPU *cpu) bool s390_cpu_has_io_int(S390CPU *cpu) { + QEMUS390FLICState *flic = s390_get_qemu_flic(s390_get_flic()); CPUS390XState *env = &cpu->env; if (!(env->psw.mask & PSW_MASK_IO)) { return false; } - return env->pending_int & INTERRUPT_IO; + return qemu_s390_flic_has_io(flic, env->cregs[6]); } bool s390_cpu_has_restart_int(S390CPU *cpu) diff --git a/target/s390x/kvm-stub.c b/target/s390x/kvm-stub.c index 6bae3e99d3..8cdcf83845 100644 --- a/target/s390x/kvm-stub.c +++ b/target/s390x/kvm-stub.c @@ -12,10 +12,6 @@ #include "cpu.h" #include "kvm_s390x.h" -void kvm_s390_service_interrupt(uint32_t parm) -{ -} - void kvm_s390_access_exception(S390CPU *cpu, uint16_t code, uint64_t te_code) { } @@ -30,15 +26,6 @@ void kvm_s390_program_interrupt(S390CPU *cpu, uint16_t code) { } -void kvm_s390_io_interrupt(uint16_t subchannel_id, uint16_t subchannel_nr, - uint32_t io_int_parm, uint32_t io_int_word) -{ -} - -void kvm_s390_crw_mchk(void) -{ -} - int kvm_s390_set_cpu_state(S390CPU *cpu, uint8_t cpu_state) { return -ENOSYS; diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c index 8fbbf72e4b..0301e9d519 100644 --- a/target/s390x/kvm.c +++ b/target/s390x/kvm.c @@ -1034,7 +1034,7 @@ void kvm_s390_vcpu_interrupt(S390CPU *cpu, struct kvm_s390_irq *irq) inject_vcpu_irq_legacy(cs, irq); } -static void __kvm_s390_floating_interrupt(struct kvm_s390_irq 
*irq) +void kvm_s390_floating_interrupt_legacy(struct kvm_s390_irq *irq) { struct kvm_s390_interrupt kvmint = {}; int r; @@ -1052,33 +1052,6 @@ static void __kvm_s390_floating_interrupt(struct kvm_s390_irq *irq) } } -void kvm_s390_floating_interrupt(struct kvm_s390_irq *irq) -{ - static bool use_flic = true; - int r; - - if (use_flic) { - r = kvm_s390_inject_flic(irq); - if (r == -ENOSYS) { - use_flic = false; - } - if (!r) { - return; - } - } - __kvm_s390_floating_interrupt(irq); -} - -void kvm_s390_service_interrupt(uint32_t parm) -{ - struct kvm_s390_irq irq = { - .type = KVM_S390_INT_SERVICE, - .u.ext.ext_params = parm, - }; - - kvm_s390_floating_interrupt(&irq); -} - void kvm_s390_program_interrupt(S390CPU *cpu, uint16_t code) { struct kvm_s390_irq irq = { @@ -1690,10 +1663,10 @@ static int handle_tsch(S390CPU *cpu) * If an I/O interrupt had been dequeued, we have to reinject it. */ if (run->s390_tsch.dequeued) { - kvm_s390_io_interrupt(run->s390_tsch.subchannel_id, - run->s390_tsch.subchannel_nr, - run->s390_tsch.io_int_parm, - run->s390_tsch.io_int_word); + s390_io_interrupt(run->s390_tsch.subchannel_id, + run->s390_tsch.subchannel_nr, + run->s390_tsch.io_int_parm, + run->s390_tsch.io_int_word); } ret = 0; } @@ -1702,7 +1675,7 @@ static int handle_tsch(S390CPU *cpu) static void insert_stsi_3_2_2(S390CPU *cpu, __u64 addr, uint8_t ar) { - struct sysib_322 sysib; + SysIB_322 sysib; int del; if (s390_cpu_virt_mem_read(cpu, addr, ar, &sysib, sizeof(sysib))) { @@ -1840,37 +1813,6 @@ bool kvm_arch_stop_on_emulation_error(CPUState *cpu) return true; } -void kvm_s390_io_interrupt(uint16_t subchannel_id, - uint16_t subchannel_nr, uint32_t io_int_parm, - uint32_t io_int_word) -{ - struct kvm_s390_irq irq = { - .u.io.subchannel_id = subchannel_id, - .u.io.subchannel_nr = subchannel_nr, - .u.io.io_int_parm = io_int_parm, - .u.io.io_int_word = io_int_word, - }; - - if (io_int_word & IO_INT_WORD_AI) { - irq.type = KVM_S390_INT_IO(1, 0, 0, 0); - } else { - irq.type = KVM_S390_INT_IO(0, (subchannel_id & 0xff00) >> 8, - (subchannel_id & 0x0006), - subchannel_nr); - } - kvm_s390_floating_interrupt(&irq); -} - -void kvm_s390_crw_mchk(void) -{ - struct kvm_s390_irq irq = { - .type = KVM_S390_MCHK, - .u.mchk.cr14 = CR14_CHANNEL_REPORT_SC, - .u.mchk.mcic = s390_build_validity_mcic() | MCIC_SC_CP, - }; - kvm_s390_floating_interrupt(&irq); -} - void kvm_s390_enable_css_support(S390CPU *cpu) { int r; @@ -2279,6 +2221,14 @@ void kvm_s390_get_host_cpu_model(S390CPUModel *model, Error **errp) return; } + /* PTFF subfunctions might be indicated although kernel support missing */ + if (!test_bit(S390_FEAT_MULTIPLE_EPOCH, model->features)) { + clear_bit(S390_FEAT_PTFF_QSIE, model->features); + clear_bit(S390_FEAT_PTFF_QTOUE, model->features); + clear_bit(S390_FEAT_PTFF_STOE, model->features); + clear_bit(S390_FEAT_PTFF_STOUE, model->features); + } + /* with cpu model support, CMM is only indicated if really available */ if (kvm_s390_cmma_available()) { set_bit(S390_FEAT_CMM, model->features); diff --git a/target/s390x/kvm_s390x.h b/target/s390x/kvm_s390x.h index 79b35946f3..7a3b862eea 100644 --- a/target/s390x/kvm_s390x.h +++ b/target/s390x/kvm_s390x.h @@ -12,17 +12,12 @@ struct kvm_s390_irq; -void kvm_s390_floating_interrupt(struct kvm_s390_irq *irq); -void kvm_s390_service_interrupt(uint32_t parm); +void kvm_s390_floating_interrupt_legacy(struct kvm_s390_irq *irq); void kvm_s390_vcpu_interrupt(S390CPU *cpu, struct kvm_s390_irq *irq); void kvm_s390_access_exception(S390CPU *cpu, uint16_t code, uint64_t te_code); 
int kvm_s390_mem_op(S390CPU *cpu, vaddr addr, uint8_t ar, void *hostbuf, int len, bool is_write); void kvm_s390_program_interrupt(S390CPU *cpu, uint16_t code); -void kvm_s390_io_interrupt(uint16_t subchannel_id, - uint16_t subchannel_nr, uint32_t io_int_parm, - uint32_t io_int_word); -void kvm_s390_crw_mchk(void); int kvm_s390_set_cpu_state(S390CPU *cpu, uint8_t cpu_state); void kvm_s390_vcpu_interrupt_pre_save(S390CPU *cpu); int kvm_s390_vcpu_interrupt_post_load(S390CPU *cpu); @@ -44,7 +39,4 @@ void kvm_s390_crypto_reset(void); void kvm_s390_restart_interrupt(S390CPU *cpu); void kvm_s390_stop_interrupt(S390CPU *cpu); -/* implemented outside of target/s390x/ */ -int kvm_s390_inject_flic(struct kvm_s390_irq *irq); - #endif /* KVM_S390X_H */ diff --git a/target/s390x/misc_helper.c b/target/s390x/misc_helper.c index 86da6aab7e..e0b23c1fd1 100644 --- a/target/s390x/misc_helper.c +++ b/target/s390x/misc_helper.c @@ -36,6 +36,10 @@ #include "hw/s390x/ebcdic.h" #include "hw/s390x/s390-virtio-hcall.h" #include "hw/s390x/sclp.h" +#include "hw/s390x/s390_flic.h" +#include "hw/s390x/ioinst.h" +#include "hw/s390x/s390-pci-inst.h" +#include "hw/boards.h" #endif /* #define DEBUG_HELPER */ @@ -194,132 +198,148 @@ void HELPER(spt)(CPUS390XState *env, uint64_t time) } /* Store System Information */ -uint32_t HELPER(stsi)(CPUS390XState *env, uint64_t a0, - uint64_t r0, uint64_t r1) +uint32_t HELPER(stsi)(CPUS390XState *env, uint64_t a0, uint64_t r0, uint64_t r1) { + const uintptr_t ra = GETPC(); + const uint32_t sel1 = r0 & STSI_R0_SEL1_MASK; + const uint32_t sel2 = r1 & STSI_R1_SEL2_MASK; + const MachineState *ms = MACHINE(qdev_get_machine()); + uint16_t total_cpus = 0, conf_cpus = 0, reserved_cpus = 0; S390CPU *cpu = s390_env_get_cpu(env); - int cc = 0; - int sel1, sel2; + SysIB sysib = { 0 }; + int i, cc = 0; + + if ((r0 & STSI_R0_FC_MASK) > STSI_R0_FC_LEVEL_3) { + /* invalid function code: no other checks are performed */ + return 3; + } - if ((r0 & STSI_LEVEL_MASK) <= STSI_LEVEL_3 && - ((r0 & STSI_R0_RESERVED_MASK) || (r1 & STSI_R1_RESERVED_MASK))) { - /* valid function code, invalid reserved bits */ - s390_program_interrupt(env, PGM_SPECIFICATION, 4, GETPC()); + if ((r0 & STSI_R0_RESERVED_MASK) || (r1 & STSI_R1_RESERVED_MASK)) { + s390_program_interrupt(env, PGM_SPECIFICATION, 4, ra); } - sel1 = r0 & STSI_R0_SEL1_MASK; - sel2 = r1 & STSI_R1_SEL2_MASK; + if ((r0 & STSI_R0_FC_MASK) == STSI_R0_FC_CURRENT) { + /* query the current level: no further checks are performed */ + env->regs[0] = STSI_R0_FC_LEVEL_3; + return 0; + } + + if (a0 & ~TARGET_PAGE_MASK) { + s390_program_interrupt(env, PGM_SPECIFICATION, 4, ra); + } - /* XXX: spec exception if sysib is not 4k-aligned */ + /* count the cpus and split them into configured and reserved ones */ + for (i = 0; i < ms->possible_cpus->len; i++) { + total_cpus++; + if (ms->possible_cpus->cpus[i].cpu) { + conf_cpus++; + } else { + reserved_cpus++; + } + } - switch (r0 & STSI_LEVEL_MASK) { - case STSI_LEVEL_1: + /* + * In theory, we could report Level 1 / Level 2 as current. However, + * the Linux kernel will detect this as running under LPAR and assume + * that we have an sclp linemode console (which is always present on + * LPAR, but not the default for QEMU), therefore not displaying boot + * messages and making booting a Linux kernel under TCG harder. + * + * For now we fake the same SMP configuration on all levels. 
+ * + * TODO: We could later make the level configurable via the machine + * and change defaults (linemode console) based on machine type + * and accelerator. + */ + switch (r0 & STSI_R0_FC_MASK) { + case STSI_R0_FC_LEVEL_1: if ((sel1 == 1) && (sel2 == 1)) { /* Basic Machine Configuration */ - struct sysib_111 sysib; char type[5] = {}; - memset(&sysib, 0, sizeof(sysib)); - ebcdic_put(sysib.manuf, "QEMU ", 16); + ebcdic_put(sysib.sysib_111.manuf, "QEMU ", 16); /* same as machine type number in STORE CPU ID, but in EBCDIC */ snprintf(type, ARRAY_SIZE(type), "%X", cpu->model->def->type); - ebcdic_put(sysib.type, type, 4); + ebcdic_put(sysib.sysib_111.type, type, 4); /* model number (not stored in STORE CPU ID for z/Architecure) */ - ebcdic_put(sysib.model, "QEMU ", 16); - ebcdic_put(sysib.sequence, "QEMU ", 16); - ebcdic_put(sysib.plant, "QEMU", 4); - cpu_physical_memory_write(a0, &sysib, sizeof(sysib)); + ebcdic_put(sysib.sysib_111.model, "QEMU ", 16); + ebcdic_put(sysib.sysib_111.sequence, "QEMU ", 16); + ebcdic_put(sysib.sysib_111.plant, "QEMU", 4); } else if ((sel1 == 2) && (sel2 == 1)) { /* Basic Machine CPU */ - struct sysib_121 sysib; - - memset(&sysib, 0, sizeof(sysib)); - /* XXX make different for different CPUs? */ - ebcdic_put(sysib.sequence, "QEMUQEMUQEMUQEMU", 16); - ebcdic_put(sysib.plant, "QEMU", 4); - stw_p(&sysib.cpu_addr, env->core_id); - cpu_physical_memory_write(a0, &sysib, sizeof(sysib)); + ebcdic_put(sysib.sysib_121.sequence, "QEMUQEMUQEMUQEMU", 16); + ebcdic_put(sysib.sysib_121.plant, "QEMU", 4); + sysib.sysib_121.cpu_addr = cpu_to_be16(env->core_id); } else if ((sel1 == 2) && (sel2 == 2)) { /* Basic Machine CPUs */ - struct sysib_122 sysib; - - memset(&sysib, 0, sizeof(sysib)); - stl_p(&sysib.capability, 0x443afc29); - /* XXX change when SMP comes */ - stw_p(&sysib.total_cpus, 1); - stw_p(&sysib.active_cpus, 1); - stw_p(&sysib.standby_cpus, 0); - stw_p(&sysib.reserved_cpus, 0); - cpu_physical_memory_write(a0, &sysib, sizeof(sysib)); + sysib.sysib_122.capability = cpu_to_be32(0x443afc29); + sysib.sysib_122.total_cpus = cpu_to_be16(total_cpus); + sysib.sysib_122.conf_cpus = cpu_to_be16(conf_cpus); + sysib.sysib_122.reserved_cpus = cpu_to_be16(reserved_cpus); } else { cc = 3; } break; - case STSI_LEVEL_2: - { - if ((sel1 == 2) && (sel2 == 1)) { - /* LPAR CPU */ - struct sysib_221 sysib; - - memset(&sysib, 0, sizeof(sysib)); - /* XXX make different for different CPUs? 
*/ - ebcdic_put(sysib.sequence, "QEMUQEMUQEMUQEMU", 16); - ebcdic_put(sysib.plant, "QEMU", 4); - stw_p(&sysib.cpu_addr, env->core_id); - stw_p(&sysib.cpu_id, 0); - cpu_physical_memory_write(a0, &sysib, sizeof(sysib)); - } else if ((sel1 == 2) && (sel2 == 2)) { - /* LPAR CPUs */ - struct sysib_222 sysib; - - memset(&sysib, 0, sizeof(sysib)); - stw_p(&sysib.lpar_num, 0); - sysib.lcpuc = 0; - /* XXX change when SMP comes */ - stw_p(&sysib.total_cpus, 1); - stw_p(&sysib.conf_cpus, 1); - stw_p(&sysib.standby_cpus, 0); - stw_p(&sysib.reserved_cpus, 0); - ebcdic_put(sysib.name, "QEMU ", 8); - stl_p(&sysib.caf, 1000); - stw_p(&sysib.dedicated_cpus, 0); - stw_p(&sysib.shared_cpus, 0); - cpu_physical_memory_write(a0, &sysib, sizeof(sysib)); - } else { - cc = 3; - } - break; + case STSI_R0_FC_LEVEL_2: + if ((sel1 == 2) && (sel2 == 1)) { + /* LPAR CPU */ + ebcdic_put(sysib.sysib_221.sequence, "QEMUQEMUQEMUQEMU", 16); + ebcdic_put(sysib.sysib_221.plant, "QEMU", 4); + sysib.sysib_221.cpu_addr = cpu_to_be16(env->core_id); + } else if ((sel1 == 2) && (sel2 == 2)) { + /* LPAR CPUs */ + sysib.sysib_222.lcpuc = 0x80; /* dedicated */ + sysib.sysib_222.total_cpus = cpu_to_be16(total_cpus); + sysib.sysib_222.conf_cpus = cpu_to_be16(conf_cpus); + sysib.sysib_222.reserved_cpus = cpu_to_be16(reserved_cpus); + ebcdic_put(sysib.sysib_222.name, "QEMU ", 8); + sysib.sysib_222.caf = cpu_to_be32(1000); + sysib.sysib_222.dedicated_cpus = cpu_to_be16(conf_cpus); + } else { + cc = 3; } - case STSI_LEVEL_3: - { - if ((sel1 == 2) && (sel2 == 2)) { - /* VM CPUs */ - struct sysib_322 sysib; - - memset(&sysib, 0, sizeof(sysib)); - sysib.count = 1; - /* XXX change when SMP comes */ - stw_p(&sysib.vm[0].total_cpus, 1); - stw_p(&sysib.vm[0].conf_cpus, 1); - stw_p(&sysib.vm[0].standby_cpus, 0); - stw_p(&sysib.vm[0].reserved_cpus, 0); - ebcdic_put(sysib.vm[0].name, "KVMguest", 8); - stl_p(&sysib.vm[0].caf, 1000); - ebcdic_put(sysib.vm[0].cpi, "KVM/Linux ", 16); - cpu_physical_memory_write(a0, &sysib, sizeof(sysib)); + break; + case STSI_R0_FC_LEVEL_3: + if ((sel1 == 2) && (sel2 == 2)) { + /* VM CPUs */ + sysib.sysib_322.count = 1; + sysib.sysib_322.vm[0].total_cpus = cpu_to_be16(total_cpus); + sysib.sysib_322.vm[0].conf_cpus = cpu_to_be16(conf_cpus); + sysib.sysib_322.vm[0].reserved_cpus = cpu_to_be16(reserved_cpus); + sysib.sysib_322.vm[0].caf = cpu_to_be32(1000); + /* Linux kernel uses this to distinguish us from z/VM */ + ebcdic_put(sysib.sysib_322.vm[0].cpi, "KVM/Linux ", 16); + sysib.sysib_322.vm[0].ext_name_encoding = 2; /* UTF-8 */ + + /* If our VM has a name, use the real name */ + if (qemu_name) { + memset(sysib.sysib_322.vm[0].name, 0x40, + sizeof(sysib.sysib_322.vm[0].name)); + ebcdic_put(sysib.sysib_322.vm[0].name, qemu_name, + MIN(sizeof(sysib.sysib_322.vm[0].name), + strlen(qemu_name))); + strncpy((char *)sysib.sysib_322.ext_names[0], qemu_name, + sizeof(sysib.sysib_322.ext_names[0])); } else { - cc = 3; + ebcdic_put(sysib.sysib_322.vm[0].name, "TCGguest", 8); + strcpy((char *)sysib.sysib_322.ext_names[0], "TCGguest"); } - break; + + /* add the uuid */ + memcpy(sysib.sysib_322.vm[0].uuid, &qemu_uuid, + sizeof(sysib.sysib_322.vm[0].uuid)); + } else { + cc = 3; } - case STSI_LEVEL_CURRENT: - env->regs[0] = STSI_LEVEL_3; - break; - default: - cc = 3; break; } + if (cc == 0) { + if (s390_cpu_virt_mem_write(cpu, a0, 0, &sysib, sizeof(sysib))) { + s390_cpu_virt_mem_handle_exc(cpu, ra); + } + } + return cc; } @@ -429,6 +449,59 @@ void HELPER(stsch)(CPUS390XState *env, uint64_t r1, uint64_t inst) qemu_mutex_unlock_iothread(); 
} +uint32_t HELPER(tpi)(CPUS390XState *env, uint64_t addr) +{ + const uintptr_t ra = GETPC(); + S390CPU *cpu = s390_env_get_cpu(env); + QEMUS390FLICState *flic = s390_get_qemu_flic(s390_get_flic()); + QEMUS390FlicIO *io = NULL; + LowCore *lowcore; + + if (addr & 0x3) { + s390_program_interrupt(env, PGM_SPECIFICATION, 4, ra); + } + + qemu_mutex_lock_iothread(); + io = qemu_s390_flic_dequeue_io(flic, env->cregs[6]); + if (!io) { + qemu_mutex_unlock_iothread(); + return 0; + } + + if (addr) { + struct { + uint16_t id; + uint16_t nr; + uint32_t parm; + } intc = { + .id = cpu_to_be16(io->id), + .nr = cpu_to_be16(io->nr), + .parm = cpu_to_be32(io->parm), + }; + + if (s390_cpu_virt_mem_write(cpu, addr, 0, &intc, sizeof(intc))) { + /* writing failed, reinject and properly clean up */ + s390_io_interrupt(io->id, io->nr, io->parm, io->word); + qemu_mutex_unlock_iothread(); + g_free(io); + s390_cpu_virt_mem_handle_exc(cpu, ra); + return 0; + } + } else { + /* no protection applies */ + lowcore = cpu_map_lowcore(env); + lowcore->subchannel_id = cpu_to_be16(io->id); + lowcore->subchannel_nr = cpu_to_be16(io->nr); + lowcore->io_int_parm = cpu_to_be32(io->parm); + lowcore->io_int_word = cpu_to_be32(io->word); + cpu_unmap_lowcore(lowcore); + } + + g_free(io); + qemu_mutex_unlock_iothread(); + return 1; +} + void HELPER(tsch)(CPUS390XState *env, uint64_t r1, uint64_t inst) { S390CPU *cpu = s390_env_get_cpu(env); @@ -560,3 +633,91 @@ uint32_t HELPER(stfle)(CPUS390XState *env, uint64_t addr) env->regs[0] = deposit64(env->regs[0], 0, 8, (max_bytes / 8) - 1); return count_bytes >= max_bytes ? 0 : 3; } + +#ifndef CONFIG_USER_ONLY +/* + * Note: we ignore any return code of the functions called for the pci + * instructions, as the only time they return !0 is when the stub is + * called, and in that case we didn't even offer the zpci facility. + * The only exception is SIC, where program checks need to be handled + * by the caller. 
+ */ +void HELPER(clp)(CPUS390XState *env, uint32_t r2) +{ + S390CPU *cpu = s390_env_get_cpu(env); + + qemu_mutex_lock_iothread(); + clp_service_call(cpu, r2, GETPC()); + qemu_mutex_unlock_iothread(); +} + +void HELPER(pcilg)(CPUS390XState *env, uint32_t r1, uint32_t r2) +{ + S390CPU *cpu = s390_env_get_cpu(env); + + qemu_mutex_lock_iothread(); + pcilg_service_call(cpu, r1, r2, GETPC()); + qemu_mutex_unlock_iothread(); +} + +void HELPER(pcistg)(CPUS390XState *env, uint32_t r1, uint32_t r2) +{ + S390CPU *cpu = s390_env_get_cpu(env); + + qemu_mutex_lock_iothread(); + pcistg_service_call(cpu, r1, r2, GETPC()); + qemu_mutex_unlock_iothread(); +} + +void HELPER(stpcifc)(CPUS390XState *env, uint32_t r1, uint64_t fiba, + uint32_t ar) +{ + S390CPU *cpu = s390_env_get_cpu(env); + + qemu_mutex_lock_iothread(); + stpcifc_service_call(cpu, r1, fiba, ar, GETPC()); + qemu_mutex_unlock_iothread(); +} + +void HELPER(sic)(CPUS390XState *env, uint64_t r1, uint64_t r3) +{ + int r; + + qemu_mutex_lock_iothread(); + r = css_do_sic(env, (r3 >> 27) & 0x7, r1 & 0xffff); + qemu_mutex_unlock_iothread(); + /* css_do_sic() may actually return a PGM_xxx value to inject */ + if (r) { + s390_program_interrupt(env, -r, 4, GETPC()); + } +} + +void HELPER(rpcit)(CPUS390XState *env, uint32_t r1, uint32_t r2) +{ + S390CPU *cpu = s390_env_get_cpu(env); + + qemu_mutex_lock_iothread(); + rpcit_service_call(cpu, r1, r2, GETPC()); + qemu_mutex_unlock_iothread(); +} + +void HELPER(pcistb)(CPUS390XState *env, uint32_t r1, uint32_t r3, + uint64_t gaddr, uint32_t ar) +{ + S390CPU *cpu = s390_env_get_cpu(env); + + qemu_mutex_lock_iothread(); + pcistb_service_call(cpu, r1, r3, gaddr, ar, GETPC()); + qemu_mutex_unlock_iothread(); +} + +void HELPER(mpcifc)(CPUS390XState *env, uint32_t r1, uint64_t fiba, + uint32_t ar) +{ + S390CPU *cpu = s390_env_get_cpu(env); + + qemu_mutex_lock_iothread(); + mpcifc_service_call(cpu, r1, fiba, ar, GETPC()); + qemu_mutex_unlock_iothread(); +} +#endif diff --git a/target/s390x/translate.c b/target/s390x/translate.c index df0b41606d..b470d691d3 100644 --- a/target/s390x/translate.c +++ b/target/s390x/translate.c @@ -4199,6 +4199,14 @@ static ExitStatus op_stcrw(DisasContext *s, DisasOps *o) return NO_EXIT; } +static ExitStatus op_tpi(DisasContext *s, DisasOps *o) +{ + check_privileged(s); + gen_helper_tpi(cc_op, cpu_env, o->addr1); + set_cc_static(s); + return NO_EXIT; +} + static ExitStatus op_tsch(DisasContext *s, DisasOps *o) { check_privileged(s); @@ -4777,6 +4785,106 @@ static ExitStatus op_zero2(DisasContext *s, DisasOps *o) return NO_EXIT; } +#ifndef CONFIG_USER_ONLY +static ExitStatus op_clp(DisasContext *s, DisasOps *o) +{ + TCGv_i32 r2 = tcg_const_i32(get_field(s->fields, r2)); + + check_privileged(s); + gen_helper_clp(cpu_env, r2); + tcg_temp_free_i32(r2); + set_cc_static(s); + return NO_EXIT; +} + +static ExitStatus op_pcilg(DisasContext *s, DisasOps *o) +{ + TCGv_i32 r1 = tcg_const_i32(get_field(s->fields, r1)); + TCGv_i32 r2 = tcg_const_i32(get_field(s->fields, r2)); + + check_privileged(s); + gen_helper_pcilg(cpu_env, r1, r2); + tcg_temp_free_i32(r1); + tcg_temp_free_i32(r2); + set_cc_static(s); + return NO_EXIT; +} + +static ExitStatus op_pcistg(DisasContext *s, DisasOps *o) +{ + TCGv_i32 r1 = tcg_const_i32(get_field(s->fields, r1)); + TCGv_i32 r2 = tcg_const_i32(get_field(s->fields, r2)); + + check_privileged(s); + gen_helper_pcistg(cpu_env, r1, r2); + tcg_temp_free_i32(r1); + tcg_temp_free_i32(r2); + set_cc_static(s); + return NO_EXIT; +} + +static ExitStatus op_stpcifc(DisasContext *s, 
DisasOps *o) +{ + TCGv_i32 r1 = tcg_const_i32(get_field(s->fields, r1)); + TCGv_i32 ar = tcg_const_i32(get_field(s->fields, b2)); + + check_privileged(s); + gen_helper_stpcifc(cpu_env, r1, o->addr1, ar); + tcg_temp_free_i32(ar); + tcg_temp_free_i32(r1); + set_cc_static(s); + return NO_EXIT; +} + +static ExitStatus op_sic(DisasContext *s, DisasOps *o) +{ + check_privileged(s); + gen_helper_sic(cpu_env, o->in1, o->in2); + return NO_EXIT; +} + +static ExitStatus op_rpcit(DisasContext *s, DisasOps *o) +{ + TCGv_i32 r1 = tcg_const_i32(get_field(s->fields, r1)); + TCGv_i32 r2 = tcg_const_i32(get_field(s->fields, r2)); + + check_privileged(s); + gen_helper_rpcit(cpu_env, r1, r2); + tcg_temp_free_i32(r1); + tcg_temp_free_i32(r2); + set_cc_static(s); + return NO_EXIT; +} + +static ExitStatus op_pcistb(DisasContext *s, DisasOps *o) +{ + TCGv_i32 r1 = tcg_const_i32(get_field(s->fields, r1)); + TCGv_i32 r3 = tcg_const_i32(get_field(s->fields, r3)); + TCGv_i32 ar = tcg_const_i32(get_field(s->fields, b2)); + + check_privileged(s); + gen_helper_pcistb(cpu_env, r1, r3, o->addr1, ar); + tcg_temp_free_i32(ar); + tcg_temp_free_i32(r1); + tcg_temp_free_i32(r3); + set_cc_static(s); + return NO_EXIT; +} + +static ExitStatus op_mpcifc(DisasContext *s, DisasOps *o) +{ + TCGv_i32 r1 = tcg_const_i32(get_field(s->fields, r1)); + TCGv_i32 ar = tcg_const_i32(get_field(s->fields, b2)); + + check_privileged(s); + gen_helper_mpcifc(cpu_env, r1, o->addr1, ar); + tcg_temp_free_i32(ar); + tcg_temp_free_i32(r1); + set_cc_static(s); + return NO_EXIT; +} +#endif + /* ====================================================================== */ /* The "Cc OUTput" generators. Given the generated output (and in some cases the original inputs), update the various cc data structures in order to @@ -5708,6 +5816,8 @@ enum DisasInsnEnum { #define FAC_MSA4 S390_FEAT_MSA_EXT_4 /* msa-extension-4 facility */ #define FAC_MSA5 S390_FEAT_MSA_EXT_5 /* msa-extension-5 facility */ #define FAC_ECT S390_FEAT_EXTRACT_CPU_TIME +#define FAC_PCI S390_FEAT_ZPCI /* z/PCI facility */ +#define FAC_AIS S390_FEAT_ADAPTER_INT_SUPPRESSION static const DisasInsn insn_info[] = { #include "insn-data.def" diff --git a/tcg/README b/tcg/README index 03bfb6acd4..bb2ea5121b 100644 --- a/tcg/README +++ b/tcg/README @@ -503,6 +503,92 @@ of the memory access. For a 32-bit host, qemu_ld/st_i64 is guaranteed to only be used with a 64-bit memory access specified in flags. +********* Host vector operations + +All of the vector ops have two parameters, TCGOP_VECL & TCGOP_VECE. +The former specifies the length of the vector in log2 64-bit units; the +latter specifies the length of the element (if applicable) in log2 8-bit units. +E.g. VECL=1 -> 64 << 1 -> v128, and VECE=2 -> 1 << 2 -> i32. + +* mov_vec v0, v1 +* ld_vec v0, t1 +* st_vec v0, t1 + + Move, load and store. + +* dup_vec v0, r1 + + Duplicate the low N bits of R1 into VECL/VECE copies across V0. + +* dupi_vec v0, c + + Similarly, for a constant. + Smaller values will be replicated to host register size by the expanders. + +* dup2_vec v0, r1, r2 + + Duplicate r2:r1 into VECL/64 copies across V0. This opcode is + only present for 32-bit hosts. + +* add_vec v0, v1, v2 + + v0 = v1 + v2, in elements across the vector. + +* sub_vec v0, v1, v2 + + Similarly, v0 = v1 - v2. + +* mul_vec v0, v1, v2 + + Similarly, v0 = v1 * v2. + +* neg_vec v0, v1 + + Similarly, v0 = -v1. 
+ +* and_vec v0, v1, v2 +* or_vec v0, v1, v2 +* xor_vec v0, v1, v2 +* andc_vec v0, v1, v2 +* orc_vec v0, v1, v2 +* not_vec v0, v1 + + Similarly, logical operations with and without complement. + Note that VECE is unused. + +* shli_vec v0, v1, i2 +* shls_vec v0, v1, s2 + + Shift all elements from v1 by a scalar i2/s2. I.e. + + for (i = 0; i < VECL/VECE; ++i) { + v0[i] = v1[i] << s2; + } + +* shri_vec v0, v1, i2 +* sari_vec v0, v1, i2 +* shrs_vec v0, v1, s2 +* sars_vec v0, v1, s2 + + Similarly for logical and arithmetic right shift. + +* shlv_vec v0, v1, v2 + + Shift elements from v1 by elements from v2. I.e. + + for (i = 0; i < VECL/VECE; ++i) { + v0[i] = v1[i] << v2[i]; + } + +* shrv_vec v0, v1, v2 +* sarv_vec v0, v1, v2 + + Similarly for logical and arithmetic right shift. + +* cmp_vec v0, v1, v2, cond + + Compare vectors by element, storing -1 for true and 0 for false. + ********* Note 1: Some shortcuts are defined when the last operand is known to be diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h index c2525066ab..9aea1d1771 100644 --- a/tcg/aarch64/tcg-target.h +++ b/tcg/aarch64/tcg-target.h @@ -31,13 +31,22 @@ typedef enum { TCG_REG_SP = 31, TCG_REG_XZR = 31, + TCG_REG_V0 = 32, TCG_REG_V1, TCG_REG_V2, TCG_REG_V3, + TCG_REG_V4, TCG_REG_V5, TCG_REG_V6, TCG_REG_V7, + TCG_REG_V8, TCG_REG_V9, TCG_REG_V10, TCG_REG_V11, + TCG_REG_V12, TCG_REG_V13, TCG_REG_V14, TCG_REG_V15, + TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19, + TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23, + TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27, + TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31, + /* Aliases. */ TCG_REG_FP = TCG_REG_X29, TCG_REG_LR = TCG_REG_X30, TCG_AREG0 = TCG_REG_X19, } TCGReg; -#define TCG_TARGET_NB_REGS 32 +#define TCG_TARGET_NB_REGS 64 /* used for function call generation */ #define TCG_REG_CALL_STACK TCG_REG_SP @@ -113,6 +122,20 @@ typedef enum { #define TCG_TARGET_HAS_mulsh_i64 1 #define TCG_TARGET_HAS_direct_jump 1 +#define TCG_TARGET_HAS_v64 1 +#define TCG_TARGET_HAS_v128 1 +#define TCG_TARGET_HAS_v256 0 + +#define TCG_TARGET_HAS_andc_vec 1 +#define TCG_TARGET_HAS_orc_vec 1 +#define TCG_TARGET_HAS_not_vec 1 +#define TCG_TARGET_HAS_neg_vec 1 +#define TCG_TARGET_HAS_shi_vec 1 +#define TCG_TARGET_HAS_shs_vec 0 +#define TCG_TARGET_HAS_shv_vec 0 +#define TCG_TARGET_HAS_cmp_vec 1 +#define TCG_TARGET_HAS_mul_vec 1 + #define TCG_TARGET_DEFAULT_MO (0) static inline void flush_icache_range(uintptr_t start, uintptr_t stop) diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c index 150530f30e..be3192078d 100644 --- a/tcg/aarch64/tcg-target.inc.c +++ b/tcg/aarch64/tcg-target.inc.c @@ -20,10 +20,15 @@ QEMU_BUILD_BUG_ON(TCG_TYPE_I32 != 0 || TCG_TYPE_I64 != 1); #ifdef CONFIG_DEBUG_TCG static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { - "%x0", "%x1", "%x2", "%x3", "%x4", "%x5", "%x6", "%x7", - "%x8", "%x9", "%x10", "%x11", "%x12", "%x13", "%x14", "%x15", - "%x16", "%x17", "%x18", "%x19", "%x20", "%x21", "%x22", "%x23", - "%x24", "%x25", "%x26", "%x27", "%x28", "%fp", "%x30", "%sp", + "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", + "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", + "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", + "x24", "x25", "x26", "x27", "x28", "fp", "x30", "sp", + + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", }; #endif /* 
CONFIG_DEBUG_TCG */ @@ -43,6 +48,14 @@ static const int tcg_target_reg_alloc_order[] = { /* X19 reserved for AREG0 */ /* X29 reserved as fp */ /* X30 reserved as temporary */ + + TCG_REG_V0, TCG_REG_V1, TCG_REG_V2, TCG_REG_V3, + TCG_REG_V4, TCG_REG_V5, TCG_REG_V6, TCG_REG_V7, + /* V8 - V15 are call-saved, and skipped. */ + TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19, + TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23, + TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27, + TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31, }; static const int tcg_target_call_iarg_regs[8] = { @@ -54,6 +67,7 @@ static const int tcg_target_call_oarg_regs[1] = { }; #define TCG_REG_TMP TCG_REG_X30 +#define TCG_VEC_TMP TCG_REG_V31 #ifndef CONFIG_SOFTMMU /* Note that XZR cannot be encoded in the address base register slot, @@ -119,9 +133,13 @@ static const char *target_parse_constraint(TCGArgConstraint *ct, const char *ct_str, TCGType type) { switch (*ct_str++) { - case 'r': + case 'r': /* general registers */ ct->ct |= TCG_CT_REG; - ct->u.regs = 0xffffffffu; + ct->u.regs |= 0xffffffffu; + break; + case 'w': /* advsimd registers */ + ct->ct |= TCG_CT_REG; + ct->u.regs |= 0xffffffff00000000ull; break; case 'l': /* qemu_ld / qemu_st address, data_reg */ ct->ct |= TCG_CT_REG; @@ -153,11 +171,13 @@ static const char *target_parse_constraint(TCGArgConstraint *ct, return ct_str; } +/* Match a constant valid for addition (12-bit, optionally shifted). */ static inline bool is_aimm(uint64_t val) { return (val & ~0xfff) == 0 || (val & ~0xfff000) == 0; } +/* Match a constant valid for logical operations. */ static inline bool is_limm(uint64_t val) { /* Taking a simplified view of the logical immediates for now, ignoring @@ -178,6 +198,106 @@ static inline bool is_limm(uint64_t val) return (val & (val - 1)) == 0; } +/* Match a constant that is valid for vectors. */ +static bool is_fimm(uint64_t v64, int *op, int *cmode, int *imm8) +{ + int i; + + *op = 0; + /* Match replication across 8 bits. */ + if (v64 == dup_const(MO_8, v64)) { + *cmode = 0xe; + *imm8 = v64 & 0xff; + return true; + } + /* Match replication across 16 bits. */ + if (v64 == dup_const(MO_16, v64)) { + uint16_t v16 = v64; + + if (v16 == (v16 & 0xff)) { + *cmode = 0x8; + *imm8 = v16 & 0xff; + return true; + } else if (v16 == (v16 & 0xff00)) { + *cmode = 0xa; + *imm8 = v16 >> 8; + return true; + } + } + /* Match replication across 32 bits. */ + if (v64 == dup_const(MO_32, v64)) { + uint32_t v32 = v64; + + if (v32 == (v32 & 0xff)) { + *cmode = 0x0; + *imm8 = v32 & 0xff; + return true; + } else if (v32 == (v32 & 0xff00)) { + *cmode = 0x2; + *imm8 = (v32 >> 8) & 0xff; + return true; + } else if (v32 == (v32 & 0xff0000)) { + *cmode = 0x4; + *imm8 = (v32 >> 16) & 0xff; + return true; + } else if (v32 == (v32 & 0xff000000)) { + *cmode = 0x6; + *imm8 = v32 >> 24; + return true; + } else if ((v32 & 0xffff00ff) == 0xff) { + *cmode = 0xc; + *imm8 = (v32 >> 8) & 0xff; + return true; + } else if ((v32 & 0xff00ffff) == 0xffff) { + *cmode = 0xd; + *imm8 = (v32 >> 16) & 0xff; + return true; + } + /* Match forms of a float32. */ + if (extract32(v32, 0, 19) == 0 + && (extract32(v32, 25, 6) == 0x20 + || extract32(v32, 25, 6) == 0x1f)) { + *cmode = 0xf; + *imm8 = (extract32(v32, 31, 1) << 7) + | (extract32(v32, 25, 1) << 6) + | extract32(v32, 19, 6); + return true; + } + } + /* Match forms of a float64. 
*/ + if (extract64(v64, 0, 48) == 0 + && (extract64(v64, 54, 9) == 0x100 + || extract64(v64, 54, 9) == 0x0ff)) { + *cmode = 0xf; + *op = 1; + *imm8 = (extract64(v64, 63, 1) << 7) + | (extract64(v64, 54, 1) << 6) + | extract64(v64, 48, 6); + return true; + } + /* Match bytes of 0x00 and 0xff. */ + for (i = 0; i < 64; i += 8) { + uint64_t byte = extract64(v64, i, 8); + if (byte != 0 && byte != 0xff) { + break; + } + } + if (i == 64) { + *cmode = 0xe; + *op = 1; + *imm8 = (extract64(v64, 0, 1) << 0) + | (extract64(v64, 8, 1) << 1) + | (extract64(v64, 16, 1) << 2) + | (extract64(v64, 24, 1) << 3) + | (extract64(v64, 32, 1) << 4) + | (extract64(v64, 40, 1) << 5) + | (extract64(v64, 48, 1) << 6) + | (extract64(v64, 56, 1) << 7); + return true; + } + return false; +} + static int tcg_target_const_match(tcg_target_long val, TCGType type, const TCGArgConstraint *arg_ct) { @@ -271,6 +391,9 @@ typedef enum { /* Load literal for loading the address at pc-relative offset */ I3305_LDR = 0x58000000, + I3305_LDR_v64 = 0x5c000000, + I3305_LDR_v128 = 0x9c000000, + /* Load/store register. Described here as 3.3.12, but the helper that emits them can transform to 3.3.10 or 3.3.13. */ I3312_STRB = 0x38000000 | LDST_ST << 22 | MO_8 << 30, @@ -290,6 +413,15 @@ typedef enum { I3312_LDRSHX = 0x38000000 | LDST_LD_S_X << 22 | MO_16 << 30, I3312_LDRSWX = 0x38000000 | LDST_LD_S_X << 22 | MO_32 << 30, + I3312_LDRVS = 0x3c000000 | LDST_LD << 22 | MO_32 << 30, + I3312_STRVS = 0x3c000000 | LDST_ST << 22 | MO_32 << 30, + + I3312_LDRVD = 0x3c000000 | LDST_LD << 22 | MO_64 << 30, + I3312_STRVD = 0x3c000000 | LDST_ST << 22 | MO_64 << 30, + + I3312_LDRVQ = 0x3c000000 | 3 << 22 | 0 << 30, + I3312_STRVQ = 0x3c000000 | 2 << 22 | 0 << 30, + I3312_TO_I3310 = 0x00200800, I3312_TO_I3313 = 0x01000000, @@ -374,8 +506,48 @@ typedef enum { I3510_EON = 0x4a200000, I3510_ANDS = 0x6a000000, - NOP = 0xd503201f, + /* AdvSIMD copy */ + I3605_DUP = 0x0e000400, + I3605_INS = 0x4e001c00, + I3605_UMOV = 0x0e003c00, + + /* AdvSIMD modified immediate */ + I3606_MOVI = 0x0f000400, + + /* AdvSIMD shift by immediate */ + I3614_SSHR = 0x0f000400, + I3614_SSRA = 0x0f001400, + I3614_SHL = 0x0f005400, + I3614_USHR = 0x2f000400, + I3614_USRA = 0x2f001400, + + /* AdvSIMD three same. */ + I3616_ADD = 0x0e208400, + I3616_AND = 0x0e201c00, + I3616_BIC = 0x0e601c00, + I3616_EOR = 0x2e201c00, + I3616_MUL = 0x0e209c00, + I3616_ORR = 0x0ea01c00, + I3616_ORN = 0x0ee01c00, + I3616_SUB = 0x2e208400, + I3616_CMGT = 0x0e203400, + I3616_CMGE = 0x0e203c00, + I3616_CMTST = 0x0e208c00, + I3616_CMHI = 0x2e203400, + I3616_CMHS = 0x2e203c00, + I3616_CMEQ = 0x2e208c00, + + /* AdvSIMD two-reg misc. */ + I3617_CMGT0 = 0x0e208800, + I3617_CMEQ0 = 0x0e209800, + I3617_CMLT0 = 0x0e20a800, + I3617_CMGE0 = 0x2e208800, + I3617_CMLE0 = 0x2e20a800, + I3617_NOT = 0x2e205800, + I3617_NEG = 0x2e20b800, + /* System instructions. */ + NOP = 0xd503201f, DMB_ISH = 0xd50338bf, DMB_LD = 0x00000100, DMB_ST = 0x00000200, @@ -520,26 +692,64 @@ static void tcg_out_insn_3509(TCGContext *s, AArch64Insn insn, TCGType ext, tcg_out32(s, insn | ext << 31 | rm << 16 | ra << 10 | rn << 5 | rd); } +static void tcg_out_insn_3605(TCGContext *s, AArch64Insn insn, bool q, + TCGReg rd, TCGReg rn, int dst_idx, int src_idx) +{ + /* Note that bit 11 set means general register input. Therefore + we can handle both register sets with one function. 
*/ + tcg_out32(s, insn | q << 30 | (dst_idx << 16) | (src_idx << 11) + | (rd & 0x1f) | (~rn & 0x20) << 6 | (rn & 0x1f) << 5); +} + +static void tcg_out_insn_3606(TCGContext *s, AArch64Insn insn, bool q, + TCGReg rd, bool op, int cmode, uint8_t imm8) +{ + tcg_out32(s, insn | q << 30 | op << 29 | cmode << 12 | (rd & 0x1f) + | (imm8 & 0xe0) << (16 - 5) | (imm8 & 0x1f) << 5); +} + +static void tcg_out_insn_3614(TCGContext *s, AArch64Insn insn, bool q, + TCGReg rd, TCGReg rn, unsigned immhb) +{ + tcg_out32(s, insn | q << 30 | immhb << 16 + | (rn & 0x1f) << 5 | (rd & 0x1f)); +} + +static void tcg_out_insn_3616(TCGContext *s, AArch64Insn insn, bool q, + unsigned size, TCGReg rd, TCGReg rn, TCGReg rm) +{ + tcg_out32(s, insn | q << 30 | (size << 22) | (rm & 0x1f) << 16 + | (rn & 0x1f) << 5 | (rd & 0x1f)); +} + +static void tcg_out_insn_3617(TCGContext *s, AArch64Insn insn, bool q, + unsigned size, TCGReg rd, TCGReg rn) +{ + tcg_out32(s, insn | q << 30 | (size << 22) + | (rn & 0x1f) << 5 | (rd & 0x1f)); +} + static void tcg_out_insn_3310(TCGContext *s, AArch64Insn insn, TCGReg rd, TCGReg base, TCGType ext, TCGReg regoff) { /* Note the AArch64Insn constants above are for C3.3.12. Adjust. */ tcg_out32(s, insn | I3312_TO_I3310 | regoff << 16 | - 0x4000 | ext << 13 | base << 5 | rd); + 0x4000 | ext << 13 | base << 5 | (rd & 0x1f)); } static void tcg_out_insn_3312(TCGContext *s, AArch64Insn insn, TCGReg rd, TCGReg rn, intptr_t offset) { - tcg_out32(s, insn | (offset & 0x1ff) << 12 | rn << 5 | rd); + tcg_out32(s, insn | (offset & 0x1ff) << 12 | rn << 5 | (rd & 0x1f)); } static void tcg_out_insn_3313(TCGContext *s, AArch64Insn insn, TCGReg rd, TCGReg rn, uintptr_t scaled_uimm) { /* Note the AArch64Insn constants above are for C3.3.12. Adjust. */ - tcg_out32(s, insn | I3312_TO_I3313 | scaled_uimm << 10 | rn << 5 | rd); + tcg_out32(s, insn | I3312_TO_I3313 | scaled_uimm << 10 + | rn << 5 | (rd & 0x1f)); } /* Register to register move using ORR (shifted register with no shift). */ @@ -585,6 +795,22 @@ static void tcg_out_logicali(TCGContext *s, AArch64Insn insn, TCGType ext, tcg_out_insn_3404(s, insn, ext, rd, rn, ext, r, c); } +static void tcg_out_dupi_vec(TCGContext *s, TCGType type, + TCGReg rd, uint64_t v64) +{ + int op, cmode, imm8; + + if (is_fimm(v64, &op, &cmode, &imm8)) { + tcg_out_insn(s, 3606, MOVI, type == TCG_TYPE_V128, rd, op, cmode, imm8); + } else if (type == TCG_TYPE_V128) { + new_pool_l2(s, R_AARCH64_CONDBR19, s->code_ptr, 0, v64, v64); + tcg_out_insn(s, 3305, LDR_v128, 0, rd); + } else { + new_pool_label(s, v64, R_AARCH64_CONDBR19, s->code_ptr, 0); + tcg_out_insn(s, 3305, LDR_v64, 0, rd); + } +} + static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd, tcg_target_long value) { @@ -594,6 +820,22 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd, int s0, s1; AArch64Insn opc; + switch (type) { + case TCG_TYPE_I32: + case TCG_TYPE_I64: + tcg_debug_assert(rd < 32); + break; + + case TCG_TYPE_V64: + case TCG_TYPE_V128: + tcg_debug_assert(rd >= 32); + tcg_out_dupi_vec(s, type, rd, value); + return; + + default: + g_assert_not_reached(); + } + /* For 32-bit values, discard potential garbage in value. For 64-bit values within [2**31, 2**32-1], we can create smaller sequences by interpreting this as a negative 32-bit number, while ensuring that @@ -669,15 +911,13 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd, /* Define something more legible for general use. 
*/ #define tcg_out_ldst_r tcg_out_insn_3310 -static void tcg_out_ldst(TCGContext *s, AArch64Insn insn, - TCGReg rd, TCGReg rn, intptr_t offset) +static void tcg_out_ldst(TCGContext *s, AArch64Insn insn, TCGReg rd, + TCGReg rn, intptr_t offset, int lgsize) { - TCGMemOp size = (uint32_t)insn >> 30; - /* If the offset is naturally aligned and in range, then we can use the scaled uimm12 encoding */ - if (offset >= 0 && !(offset & ((1 << size) - 1))) { - uintptr_t scaled_uimm = offset >> size; + if (offset >= 0 && !(offset & ((1 << lgsize) - 1))) { + uintptr_t scaled_uimm = offset >> lgsize; if (scaled_uimm <= 0xfff) { tcg_out_insn_3313(s, insn, rd, rn, scaled_uimm); return; @@ -695,32 +935,102 @@ static void tcg_out_ldst(TCGContext *s, AArch64Insn insn, tcg_out_ldst_r(s, insn, rd, rn, TCG_TYPE_I64, TCG_REG_TMP); } -static inline void tcg_out_mov(TCGContext *s, - TCGType type, TCGReg ret, TCGReg arg) +static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) { - if (ret != arg) { - tcg_out_movr(s, type, ret, arg); + if (ret == arg) { + return; + } + switch (type) { + case TCG_TYPE_I32: + case TCG_TYPE_I64: + if (ret < 32 && arg < 32) { + tcg_out_movr(s, type, ret, arg); + break; + } else if (ret < 32) { + tcg_out_insn(s, 3605, UMOV, type, ret, arg, 0, 0); + break; + } else if (arg < 32) { + tcg_out_insn(s, 3605, INS, 0, ret, arg, 4 << type, 0); + break; + } + /* FALLTHRU */ + + case TCG_TYPE_V64: + tcg_debug_assert(ret >= 32 && arg >= 32); + tcg_out_insn(s, 3616, ORR, 0, 0, ret, arg, arg); + break; + case TCG_TYPE_V128: + tcg_debug_assert(ret >= 32 && arg >= 32); + tcg_out_insn(s, 3616, ORR, 1, 0, ret, arg, arg); + break; + + default: + g_assert_not_reached(); } } -static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg arg, - TCGReg arg1, intptr_t arg2) +static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, + TCGReg base, intptr_t ofs) { - tcg_out_ldst(s, type == TCG_TYPE_I32 ? I3312_LDRW : I3312_LDRX, - arg, arg1, arg2); + AArch64Insn insn; + int lgsz; + + switch (type) { + case TCG_TYPE_I32: + insn = (ret < 32 ? I3312_LDRW : I3312_LDRVS); + lgsz = 2; + break; + case TCG_TYPE_I64: + insn = (ret < 32 ? I3312_LDRX : I3312_LDRVD); + lgsz = 3; + break; + case TCG_TYPE_V64: + insn = I3312_LDRVD; + lgsz = 3; + break; + case TCG_TYPE_V128: + insn = I3312_LDRVQ; + lgsz = 4; + break; + default: + g_assert_not_reached(); + } + tcg_out_ldst(s, insn, ret, base, ofs, lgsz); } -static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, - TCGReg arg1, intptr_t arg2) +static void tcg_out_st(TCGContext *s, TCGType type, TCGReg src, + TCGReg base, intptr_t ofs) { - tcg_out_ldst(s, type == TCG_TYPE_I32 ? I3312_STRW : I3312_STRX, - arg, arg1, arg2); + AArch64Insn insn; + int lgsz; + + switch (type) { + case TCG_TYPE_I32: + insn = (src < 32 ? I3312_STRW : I3312_STRVS); + lgsz = 2; + break; + case TCG_TYPE_I64: + insn = (src < 32 ? 
I3312_STRX : I3312_STRVD); + lgsz = 3; + break; + case TCG_TYPE_V64: + insn = I3312_STRVD; + lgsz = 3; + break; + case TCG_TYPE_V128: + insn = I3312_STRVQ; + lgsz = 4; + break; + default: + g_assert_not_reached(); + } + tcg_out_ldst(s, insn, src, base, ofs, lgsz); } static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, TCGReg base, intptr_t ofs) { - if (val == 0) { + if (type <= TCG_TYPE_I64 && val == 0) { tcg_out_st(s, type, TCG_REG_XZR, base, ofs); return true; } @@ -1210,14 +1520,15 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg addr_reg, TCGMemOp opc, /* Merge "low bits" from tlb offset, load the tlb comparator into X0. X0 = load [X2 + (tlb_offset & 0x000fff)] */ tcg_out_ldst(s, TARGET_LONG_BITS == 32 ? I3312_LDRW : I3312_LDRX, - TCG_REG_X0, TCG_REG_X2, tlb_offset & 0xfff); + TCG_REG_X0, TCG_REG_X2, tlb_offset & 0xfff, + TARGET_LONG_BITS == 32 ? 2 : 3); /* Load the tlb addend. Do that early to avoid stalling. X1 = load [X2 + (tlb_offset & 0xfff) + offsetof(addend)] */ tcg_out_ldst(s, I3312_LDRX, TCG_REG_X1, TCG_REG_X2, (tlb_offset & 0xfff) + (offsetof(CPUTLBEntry, addend)) - (is_read ? offsetof(CPUTLBEntry, addr_read) - : offsetof(CPUTLBEntry, addr_write))); + : offsetof(CPUTLBEntry, addr_write)), 3); /* Perform the address comparison. */ tcg_out_cmp(s, (TARGET_LONG_BITS == 64), TCG_REG_X0, TCG_REG_X3, 0); @@ -1435,49 +1746,49 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, case INDEX_op_ld8u_i32: case INDEX_op_ld8u_i64: - tcg_out_ldst(s, I3312_LDRB, a0, a1, a2); + tcg_out_ldst(s, I3312_LDRB, a0, a1, a2, 0); break; case INDEX_op_ld8s_i32: - tcg_out_ldst(s, I3312_LDRSBW, a0, a1, a2); + tcg_out_ldst(s, I3312_LDRSBW, a0, a1, a2, 0); break; case INDEX_op_ld8s_i64: - tcg_out_ldst(s, I3312_LDRSBX, a0, a1, a2); + tcg_out_ldst(s, I3312_LDRSBX, a0, a1, a2, 0); break; case INDEX_op_ld16u_i32: case INDEX_op_ld16u_i64: - tcg_out_ldst(s, I3312_LDRH, a0, a1, a2); + tcg_out_ldst(s, I3312_LDRH, a0, a1, a2, 1); break; case INDEX_op_ld16s_i32: - tcg_out_ldst(s, I3312_LDRSHW, a0, a1, a2); + tcg_out_ldst(s, I3312_LDRSHW, a0, a1, a2, 1); break; case INDEX_op_ld16s_i64: - tcg_out_ldst(s, I3312_LDRSHX, a0, a1, a2); + tcg_out_ldst(s, I3312_LDRSHX, a0, a1, a2, 1); break; case INDEX_op_ld_i32: case INDEX_op_ld32u_i64: - tcg_out_ldst(s, I3312_LDRW, a0, a1, a2); + tcg_out_ldst(s, I3312_LDRW, a0, a1, a2, 2); break; case INDEX_op_ld32s_i64: - tcg_out_ldst(s, I3312_LDRSWX, a0, a1, a2); + tcg_out_ldst(s, I3312_LDRSWX, a0, a1, a2, 2); break; case INDEX_op_ld_i64: - tcg_out_ldst(s, I3312_LDRX, a0, a1, a2); + tcg_out_ldst(s, I3312_LDRX, a0, a1, a2, 3); break; case INDEX_op_st8_i32: case INDEX_op_st8_i64: - tcg_out_ldst(s, I3312_STRB, REG0(0), a1, a2); + tcg_out_ldst(s, I3312_STRB, REG0(0), a1, a2, 0); break; case INDEX_op_st16_i32: case INDEX_op_st16_i64: - tcg_out_ldst(s, I3312_STRH, REG0(0), a1, a2); + tcg_out_ldst(s, I3312_STRH, REG0(0), a1, a2, 1); break; case INDEX_op_st_i32: case INDEX_op_st32_i64: - tcg_out_ldst(s, I3312_STRW, REG0(0), a1, a2); + tcg_out_ldst(s, I3312_STRW, REG0(0), a1, a2, 2); break; case INDEX_op_st_i64: - tcg_out_ldst(s, I3312_STRX, REG0(0), a1, a2); + tcg_out_ldst(s, I3312_STRX, REG0(0), a1, a2, 3); break; case INDEX_op_add_i32: @@ -1776,25 +2087,176 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc, case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */ case INDEX_op_mov_i64: + case INDEX_op_mov_vec: case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. 
*/ case INDEX_op_movi_i64: + case INDEX_op_dupi_vec: case INDEX_op_call: /* Always emitted via tcg_out_call. */ default: - tcg_abort(); + g_assert_not_reached(); } #undef REG0 } +static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, + unsigned vecl, unsigned vece, + const TCGArg *args, const int *const_args) +{ + static const AArch64Insn cmp_insn[16] = { + [TCG_COND_EQ] = I3616_CMEQ, + [TCG_COND_GT] = I3616_CMGT, + [TCG_COND_GE] = I3616_CMGE, + [TCG_COND_GTU] = I3616_CMHI, + [TCG_COND_GEU] = I3616_CMHS, + }; + static const AArch64Insn cmp0_insn[16] = { + [TCG_COND_EQ] = I3617_CMEQ0, + [TCG_COND_GT] = I3617_CMGT0, + [TCG_COND_GE] = I3617_CMGE0, + [TCG_COND_LT] = I3617_CMLT0, + [TCG_COND_LE] = I3617_CMLE0, + }; + + TCGType type = vecl + TCG_TYPE_V64; + unsigned is_q = vecl; + TCGArg a0, a1, a2; + + a0 = args[0]; + a1 = args[1]; + a2 = args[2]; + + switch (opc) { + case INDEX_op_ld_vec: + tcg_out_ld(s, type, a0, a1, a2); + break; + case INDEX_op_st_vec: + tcg_out_st(s, type, a0, a1, a2); + break; + case INDEX_op_add_vec: + tcg_out_insn(s, 3616, ADD, is_q, vece, a0, a1, a2); + break; + case INDEX_op_sub_vec: + tcg_out_insn(s, 3616, SUB, is_q, vece, a0, a1, a2); + break; + case INDEX_op_mul_vec: + tcg_out_insn(s, 3616, MUL, is_q, vece, a0, a1, a2); + break; + case INDEX_op_neg_vec: + tcg_out_insn(s, 3617, NEG, is_q, vece, a0, a1); + break; + case INDEX_op_and_vec: + tcg_out_insn(s, 3616, AND, is_q, 0, a0, a1, a2); + break; + case INDEX_op_or_vec: + tcg_out_insn(s, 3616, ORR, is_q, 0, a0, a1, a2); + break; + case INDEX_op_xor_vec: + tcg_out_insn(s, 3616, EOR, is_q, 0, a0, a1, a2); + break; + case INDEX_op_andc_vec: + tcg_out_insn(s, 3616, BIC, is_q, 0, a0, a1, a2); + break; + case INDEX_op_orc_vec: + tcg_out_insn(s, 3616, ORN, is_q, 0, a0, a1, a2); + break; + case INDEX_op_not_vec: + tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a1); + break; + case INDEX_op_dup_vec: + tcg_out_insn(s, 3605, DUP, is_q, a0, a1, 1 << vece, 0); + break; + case INDEX_op_shli_vec: + tcg_out_insn(s, 3614, SHL, is_q, a0, a1, a2 + (8 << vece)); + break; + case INDEX_op_shri_vec: + tcg_out_insn(s, 3614, USHR, is_q, a0, a1, (16 << vece) - a2); + break; + case INDEX_op_sari_vec: + tcg_out_insn(s, 3614, SSHR, is_q, a0, a1, (16 << vece) - a2); + break; + case INDEX_op_cmp_vec: + { + TCGCond cond = args[3]; + AArch64Insn insn; + + if (cond == TCG_COND_NE) { + if (const_args[2]) { + tcg_out_insn(s, 3616, CMTST, is_q, vece, a0, a1, a1); + } else { + tcg_out_insn(s, 3616, CMEQ, is_q, vece, a0, a1, a2); + tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a0); + } + } else { + if (const_args[2]) { + insn = cmp0_insn[cond]; + if (insn) { + tcg_out_insn_3617(s, insn, is_q, vece, a0, a1); + break; + } + tcg_out_dupi_vec(s, type, TCG_VEC_TMP, 0); + a2 = TCG_VEC_TMP; + } + insn = cmp_insn[cond]; + if (insn == 0) { + TCGArg t; + t = a1, a1 = a2, a2 = t; + cond = tcg_swap_cond(cond); + insn = cmp_insn[cond]; + tcg_debug_assert(insn != 0); + } + tcg_out_insn_3616(s, insn, is_q, vece, a0, a1, a2); + } + } + break; + default: + g_assert_not_reached(); + } +} + +int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) +{ + switch (opc) { + case INDEX_op_add_vec: + case INDEX_op_sub_vec: + case INDEX_op_mul_vec: + case INDEX_op_and_vec: + case INDEX_op_or_vec: + case INDEX_op_xor_vec: + case INDEX_op_andc_vec: + case INDEX_op_orc_vec: + case INDEX_op_neg_vec: + case INDEX_op_not_vec: + case INDEX_op_cmp_vec: + case INDEX_op_shli_vec: + case INDEX_op_shri_vec: + case INDEX_op_sari_vec: + return 1; + + default: + return 0; + } +} + +void 
tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, + TCGArg a0, ...) +{ +} + static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op) { static const TCGTargetOpDef r = { .args_ct_str = { "r" } }; static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } }; + static const TCGTargetOpDef w_w = { .args_ct_str = { "w", "w" } }; + static const TCGTargetOpDef w_r = { .args_ct_str = { "w", "r" } }; + static const TCGTargetOpDef w_wr = { .args_ct_str = { "w", "wr" } }; static const TCGTargetOpDef r_l = { .args_ct_str = { "r", "l" } }; static const TCGTargetOpDef r_rA = { .args_ct_str = { "r", "rA" } }; static const TCGTargetOpDef rZ_r = { .args_ct_str = { "rZ", "r" } }; static const TCGTargetOpDef lZ_l = { .args_ct_str = { "lZ", "l" } }; static const TCGTargetOpDef r_r_r = { .args_ct_str = { "r", "r", "r" } }; + static const TCGTargetOpDef w_w_w = { .args_ct_str = { "w", "w", "w" } }; + static const TCGTargetOpDef w_w_wZ = { .args_ct_str = { "w", "w", "wZ" } }; static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } }; static const TCGTargetOpDef r_r_rA = { .args_ct_str = { "r", "r", "rA" } }; static const TCGTargetOpDef r_r_rL = { .args_ct_str = { "r", "r", "rL" } }; @@ -1938,6 +2400,29 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op) case INDEX_op_sub2_i64: return &add2; + case INDEX_op_add_vec: + case INDEX_op_sub_vec: + case INDEX_op_mul_vec: + case INDEX_op_and_vec: + case INDEX_op_or_vec: + case INDEX_op_xor_vec: + case INDEX_op_andc_vec: + case INDEX_op_orc_vec: + return &w_w_w; + case INDEX_op_not_vec: + case INDEX_op_neg_vec: + case INDEX_op_shli_vec: + case INDEX_op_shri_vec: + case INDEX_op_sari_vec: + return &w_w; + case INDEX_op_ld_vec: + case INDEX_op_st_vec: + return &w_r; + case INDEX_op_dup_vec: + return &w_wr; + case INDEX_op_cmp_vec: + return &w_w_wZ; + default: return NULL; } @@ -1947,8 +2432,10 @@ static void tcg_target_init(TCGContext *s) { tcg_target_available_regs[TCG_TYPE_I32] = 0xffffffffu; tcg_target_available_regs[TCG_TYPE_I64] = 0xffffffffu; + tcg_target_available_regs[TCG_TYPE_V64] = 0xffffffff00000000ull; + tcg_target_available_regs[TCG_TYPE_V128] = 0xffffffff00000000ull; - tcg_target_call_clobber_regs = 0xfffffffu; + tcg_target_call_clobber_regs = -1ull; tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X19); tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X20); tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X21); @@ -1960,12 +2447,21 @@ static void tcg_target_init(TCGContext *s) tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X27); tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X28); tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X29); + tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V8); + tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V9); + tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V10); + tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V11); + tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V12); + tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V13); + tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V14); + tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V15); s->reserved_regs = 0; tcg_regset_set_reg(s->reserved_regs, TCG_REG_SP); tcg_regset_set_reg(s->reserved_regs, TCG_REG_FP); tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP); tcg_regset_set_reg(s->reserved_regs, TCG_REG_X18); /* platform register */ + tcg_regset_set_reg(s->reserved_regs, 
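/* Keep one vector register out of the allocator as scratch: the cmp_vec
   path above materializes zero into TCG_VEC_TMP via tcg_out_dupi_vec when
   comparing against a constant. */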
TCG_VEC_TMP); } /* Saving pairs: (X19, X20) .. (X27, X28), (X29(fp), X30(lr)). */ diff --git a/tcg/aarch64/tcg-target.opc.h b/tcg/aarch64/tcg-target.opc.h new file mode 100644 index 0000000000..4816a6c3d4 --- /dev/null +++ b/tcg/aarch64/tcg-target.opc.h @@ -0,0 +1,3 @@ +/* Target-specific opcodes for host vector expansion. These will be + emitted by tcg_expand_vec_op. For those familiar with GCC internals, + consider these to be UNSPEC with names. */ diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h index b89dababf4..9fdf37f23c 100644 --- a/tcg/i386/tcg-target.h +++ b/tcg/i386/tcg-target.h @@ -30,10 +30,10 @@ #ifdef __x86_64__ # define TCG_TARGET_REG_BITS 64 -# define TCG_TARGET_NB_REGS 16 +# define TCG_TARGET_NB_REGS 32 #else # define TCG_TARGET_REG_BITS 32 -# define TCG_TARGET_NB_REGS 8 +# define TCG_TARGET_NB_REGS 24 #endif typedef enum { @@ -56,6 +56,26 @@ typedef enum { TCG_REG_R13, TCG_REG_R14, TCG_REG_R15, + + TCG_REG_XMM0, + TCG_REG_XMM1, + TCG_REG_XMM2, + TCG_REG_XMM3, + TCG_REG_XMM4, + TCG_REG_XMM5, + TCG_REG_XMM6, + TCG_REG_XMM7, + + /* 64-bit registers; likewise always define. */ + TCG_REG_XMM8, + TCG_REG_XMM9, + TCG_REG_XMM10, + TCG_REG_XMM11, + TCG_REG_XMM12, + TCG_REG_XMM13, + TCG_REG_XMM14, + TCG_REG_XMM15, + TCG_REG_RAX = TCG_REG_EAX, TCG_REG_RCX = TCG_REG_ECX, TCG_REG_RDX = TCG_REG_EDX, @@ -77,6 +97,8 @@ typedef enum { extern bool have_bmi1; extern bool have_popcnt; +extern bool have_avx1; +extern bool have_avx2; /* optional instructions */ #define TCG_TARGET_HAS_div2_i32 1 @@ -146,6 +168,21 @@ extern bool have_popcnt; #define TCG_TARGET_HAS_mulsh_i64 0 #endif +/* We do not support older SSE systems, only beginning with AVX1. */ +#define TCG_TARGET_HAS_v64 have_avx1 +#define TCG_TARGET_HAS_v128 have_avx1 +#define TCG_TARGET_HAS_v256 have_avx2 + +#define TCG_TARGET_HAS_andc_vec 1 +#define TCG_TARGET_HAS_orc_vec 0 +#define TCG_TARGET_HAS_not_vec 0 +#define TCG_TARGET_HAS_neg_vec 0 +#define TCG_TARGET_HAS_shi_vec 1 +#define TCG_TARGET_HAS_shs_vec 0 +#define TCG_TARGET_HAS_shv_vec 0 +#define TCG_TARGET_HAS_cmp_vec 1 +#define TCG_TARGET_HAS_mul_vec 1 + #define TCG_TARGET_deposit_i32_valid(ofs, len) \ (((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \ ((ofs) == 0 && (len) == 16)) diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c index 63d27f10e7..fc05909d1d 100644 --- a/tcg/i386/tcg-target.inc.c +++ b/tcg/i386/tcg-target.inc.c @@ -28,10 +28,15 @@ static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = { #if TCG_TARGET_REG_BITS == 64 "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi", - "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", #else "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi", #endif + "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", +#if TCG_TARGET_REG_BITS == 64 + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", +#endif }; #endif @@ -61,6 +66,28 @@ static const int tcg_target_reg_alloc_order[] = { TCG_REG_EDX, TCG_REG_EAX, #endif + TCG_REG_XMM0, + TCG_REG_XMM1, + TCG_REG_XMM2, + TCG_REG_XMM3, + TCG_REG_XMM4, + TCG_REG_XMM5, +#ifndef _WIN64 + /* The Win64 ABI has xmm6-xmm15 as caller-saves, and we do not save + any of them. Therefore only allow xmm0-xmm5 to be allocated. 
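In ABI terms these registers are callee-saved: a function that touches xmm6-xmm15 must preserve them, and since the TCG prologue emits no such spills, the allocator must simply never hand them out.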
*/ + TCG_REG_XMM6, + TCG_REG_XMM7, +#if TCG_TARGET_REG_BITS == 64 + TCG_REG_XMM8, + TCG_REG_XMM9, + TCG_REG_XMM10, + TCG_REG_XMM11, + TCG_REG_XMM12, + TCG_REG_XMM13, + TCG_REG_XMM14, + TCG_REG_XMM15, +#endif +#endif }; static const int tcg_target_call_iarg_regs[] = { @@ -94,7 +121,7 @@ static const int tcg_target_call_oarg_regs[] = { #define TCG_CT_CONST_I32 0x400 #define TCG_CT_CONST_WSZ 0x800 -/* Registers used with L constraint, which are the first argument +/* Registers used with L constraint, which are the first argument registers on x86_64, and two random call clobbered registers on i386. */ #if TCG_TARGET_REG_BITS == 64 @@ -125,6 +152,8 @@ static bool have_cmov; it there. Therefore we always define the variable. */ bool have_bmi1; bool have_popcnt; +bool have_avx1; +bool have_avx2; #ifdef CONFIG_CPUID_H static bool have_movbe; @@ -148,6 +177,8 @@ static void patch_reloc(tcg_insn_unit *code_ptr, int type, if (value != (int32_t)value) { tcg_abort(); } + /* FALLTHRU */ + case R_386_32: tcg_patch32(code_ptr, value); break; case R_386_PC8: @@ -162,6 +193,14 @@ static void patch_reloc(tcg_insn_unit *code_ptr, int type, } } +#if TCG_TARGET_REG_BITS == 64 +#define ALL_GENERAL_REGS 0x0000ffffu +#define ALL_VECTOR_REGS 0xffff0000u +#else +#define ALL_GENERAL_REGS 0x000000ffu +#define ALL_VECTOR_REGS 0x00ff0000u +#endif + /* parse target specific constraints */ static const char *target_parse_constraint(TCGArgConstraint *ct, const char *ct_str, TCGType type) @@ -192,21 +231,29 @@ static const char *target_parse_constraint(TCGArgConstraint *ct, tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI); break; case 'q': + /* A register that can be used as a byte operand. */ ct->ct |= TCG_CT_REG; ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf; break; case 'Q': + /* A register with an addressable second byte (e.g. %ah). */ ct->ct |= TCG_CT_REG; ct->u.regs = 0xf; break; case 'r': + /* A general register. */ ct->ct |= TCG_CT_REG; - ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff; + ct->u.regs |= ALL_GENERAL_REGS; break; case 'W': /* With TZCNT/LZCNT, we can have operand-size as an input. */ ct->ct |= TCG_CT_CONST_WSZ; break; + case 'x': + /* A vector register. */ + ct->ct |= TCG_CT_REG; + ct->u.regs |= ALL_VECTOR_REGS; + break; /* qemu_ld/st address constraint */ case 'L': @@ -277,14 +324,17 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type, # define P_REXB_RM 0 # define P_GS 0 #endif -#define P_SIMDF3 0x10000 /* 0xf3 opcode prefix */ -#define P_SIMDF2 0x20000 /* 0xf2 opcode prefix */ +#define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */ +#define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */ +#define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */ +#define P_VEXL 0x80000 /* Set VEX.L = 1 */ #define OPC_ARITH_EvIz (0x81) #define OPC_ARITH_EvIb (0x83) #define OPC_ARITH_GvEv (0x03) /* ... 
plus (ARITH_FOO << 3) */ #define OPC_ANDN (0xf2 | P_EXT38) #define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3)) +#define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16) #define OPC_BSF (0xbc | P_EXT) #define OPC_BSR (0xbd | P_EXT) #define OPC_BSWAP (0xc8 | P_EXT) @@ -310,11 +360,68 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type, #define OPC_MOVL_Iv (0xb8) #define OPC_MOVBE_GyMy (0xf0 | P_EXT38) #define OPC_MOVBE_MyGy (0xf1 | P_EXT38) +#define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16) +#define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16) +#define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2) +#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16) +#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16) +#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3) +#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3) +#define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3) +#define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16) #define OPC_MOVSBL (0xbe | P_EXT) #define OPC_MOVSWL (0xbf | P_EXT) #define OPC_MOVSLQ (0x63 | P_REXW) #define OPC_MOVZBL (0xb6 | P_EXT) #define OPC_MOVZWL (0xb7 | P_EXT) +#define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16) +#define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16) +#define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16) +#define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16) +#define OPC_PADDB (0xfc | P_EXT | P_DATA16) +#define OPC_PADDW (0xfd | P_EXT | P_DATA16) +#define OPC_PADDD (0xfe | P_EXT | P_DATA16) +#define OPC_PADDQ (0xd4 | P_EXT | P_DATA16) +#define OPC_PAND (0xdb | P_EXT | P_DATA16) +#define OPC_PANDN (0xdf | P_EXT | P_DATA16) +#define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16) +#define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16) +#define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16) +#define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16) +#define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16) +#define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16) +#define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16) +#define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16) +#define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16) +#define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16) +#define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16) +#define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16) +#define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16) +#define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16) +#define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16) +#define OPC_PMULLW (0xd5 | P_EXT | P_DATA16) +#define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16) +#define OPC_POR (0xeb | P_EXT | P_DATA16) +#define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16) +#define OPC_PSHUFD (0x70 | P_EXT | P_DATA16) +#define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2) +#define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3) +#define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */ +#define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */ +#define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */ +#define OPC_PSUBB (0xf8 | P_EXT | P_DATA16) +#define OPC_PSUBW (0xf9 | P_EXT | P_DATA16) +#define OPC_PSUBD (0xfa | P_EXT | P_DATA16) +#define OPC_PSUBQ (0xfb | P_EXT | P_DATA16) +#define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16) +#define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16) +#define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16) +#define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16) +#define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16) +#define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16) +#define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16) +#define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16) +#define OPC_PXOR (0xef | P_EXT | P_DATA16) #define OPC_POP_r32 (0x58) #define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3) #define OPC_PUSH_r32 (0x50) @@ -326,14 +433,26 @@ static inline int 
tcg_target_const_match(tcg_target_long val, TCGType type, #define OPC_SHIFT_Ib (0xc1) #define OPC_SHIFT_cl (0xd3) #define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3) +#define OPC_SHUFPS (0xc6 | P_EXT) #define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16) #define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2) #define OPC_TESTL (0x85) #define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3) +#define OPC_UD2 (0x0b | P_EXT) +#define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16) +#define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16) +#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16) +#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16) +#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16) +#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16) +#define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_REXW) +#define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL) +#define OPC_VZEROUPPER (0x77 | P_EXT) #define OPC_XCHG_ax_r32 (0x90) #define OPC_GRP3_Ev (0xf7) #define OPC_GRP5 (0xff) +#define OPC_GRP14 (0x73 | P_EXT | P_DATA16) /* Group 1 opcode extensions for 0x80-0x83. These are also used as modifiers for OPC_ARITH. */ @@ -439,10 +558,12 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x) tcg_out8(s, (uint8_t)(rex | 0x40)); } - if (opc & (P_EXT | P_EXT38)) { + if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { tcg_out8(s, 0x0f); if (opc & P_EXT38) { tcg_out8(s, 0x38); + } else if (opc & P_EXT3A) { + tcg_out8(s, 0x3a); } } @@ -459,10 +580,12 @@ static void tcg_out_opc(TCGContext *s, int opc) } else if (opc & P_SIMDF2) { tcg_out8(s, 0xf2); } - if (opc & (P_EXT | P_EXT38)) { + if (opc & (P_EXT | P_EXT38 | P_EXT3A)) { tcg_out8(s, 0x0f); if (opc & P_EXT38) { tcg_out8(s, 0x38); + } else if (opc & P_EXT3A) { + tcg_out8(s, 0x3a); } } tcg_out8(s, opc); @@ -479,34 +602,42 @@ static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm) tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); } -static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) +static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v, + int rm, int index) { int tmp; - if ((opc & (P_REXW | P_EXT | P_EXT38)) || (rm & 8)) { + /* Use the two byte form if possible, which cannot encode + VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */ + if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT + && ((rm | index) & 8) == 0) { + /* Two byte VEX prefix. */ + tcg_out8(s, 0xc5); + + tmp = (r & 8 ? 0 : 0x80); /* VEX.R */ + } else { /* Three byte VEX prefix. */ tcg_out8(s, 0xc4); /* VEX.m-mmmm */ - if (opc & P_EXT38) { + if (opc & P_EXT3A) { + tmp = 3; + } else if (opc & P_EXT38) { tmp = 2; } else if (opc & P_EXT) { tmp = 1; } else { - tcg_abort(); + g_assert_not_reached(); } - tmp |= 0x40; /* VEX.X */ - tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */ - tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */ + tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */ + tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */ + tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */ tcg_out8(s, tmp); - tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */ - } else { - /* Two byte VEX prefix. */ - tcg_out8(s, 0xc5); - - tmp = (r & 8 ? 0 : 0x80); /* VEX.R */ + tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */ } + + tmp |= (opc & P_VEXL ? 
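/* This byte is laid out as W/R, ~vvvv, L, pp, with VEX.L at bit 2.  As a
   worked example: a 256-bit VPADDW (OPC_PADDW | P_VEXL) with vvvv = ymm1
   and a low destination register assembles to c5 f5 fd /r -- pp = 01 for
   the 0x66 prefix, L = 1 to select ymm operand width. */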
0x04 : 0); /* VEX.L */ /* VEX.pp */ if (opc & P_DATA16) { tmp |= 1; /* 0x66 */ @@ -518,6 +649,11 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) tmp |= (~v & 15) << 3; /* VEX.vvvv */ tcg_out8(s, tmp); tcg_out8(s, opc); +} + +static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) +{ + tcg_out_vex_opc(s, opc, r, v, rm, 0); tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); } @@ -526,8 +662,8 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) mode for absolute addresses, ~RM is the size of the immediate operand that will follow the instruction. */ -static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, - int index, int shift, intptr_t offset) +static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index, + int shift, intptr_t offset) { int mod, len; @@ -538,7 +674,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm; intptr_t disp = offset - pc; if (disp == (int32_t)disp) { - tcg_out_opc(s, opc, r, 0, 0); tcg_out8(s, (LOWREGMASK(r) << 3) | 5); tcg_out32(s, disp); return; @@ -548,7 +683,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, use of the MODRM+SIB encoding and is therefore larger than rip-relative addressing. */ if (offset == (int32_t)offset) { - tcg_out_opc(s, opc, r, 0, 0); tcg_out8(s, (LOWREGMASK(r) << 3) | 4); tcg_out8(s, (4 << 3) | 5); tcg_out32(s, offset); @@ -556,10 +690,9 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, } /* ??? The memory isn't directly addressable. */ - tcg_abort(); + g_assert_not_reached(); } else { /* Absolute address. */ - tcg_out_opc(s, opc, r, 0, 0); tcg_out8(s, (r << 3) | 5); tcg_out32(s, offset); return; @@ -582,7 +715,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, that would be used for %esp is the escape to the two byte form. */ if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) { /* Single byte MODRM format. */ - tcg_out_opc(s, opc, r, rm, 0); tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); } else { /* Two byte MODRM+SIB format. */ @@ -596,7 +728,6 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, tcg_debug_assert(index != TCG_REG_ESP); } - tcg_out_opc(s, opc, r, rm, index); tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4); tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm)); } @@ -608,6 +739,21 @@ static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, } } +static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm, + int index, int shift, intptr_t offset) +{ + tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index); + tcg_out_sib_offset(s, r, rm, index, shift, offset); +} + +static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v, + int rm, int index, int shift, + intptr_t offset) +{ + tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index); + tcg_out_sib_offset(s, r, rm, index, shift, offset); +} + /* A simplification of the above with no index or shift. 
*/ static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, int rm, intptr_t offset) @@ -615,6 +761,30 @@ static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r, tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset); } +static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r, + int v, int rm, intptr_t offset) +{ + tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset); +} + +/* Output an opcode with an expected reference to the constant pool. */ +static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r) +{ + tcg_out_opc(s, opc, r, 0, 0); + /* Absolute for 32-bit, pc-relative for 64-bit. */ + tcg_out8(s, LOWREGMASK(r) << 3 | 5); + tcg_out32(s, 0); +} + +/* Output an opcode with an expected reference to the constant pool. */ +static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r) +{ + tcg_out_vex_opc(s, opc, r, 0, 0, 0); + /* Absolute for 32-bit, pc-relative for 64-bit. */ + tcg_out8(s, LOWREGMASK(r) << 3 | 5); + tcg_out32(s, 0); +} + /* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */ static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src) { @@ -625,12 +795,116 @@ static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src) tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src); } -static inline void tcg_out_mov(TCGContext *s, TCGType type, - TCGReg ret, TCGReg arg) +static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) { - if (arg != ret) { - int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0); - tcg_out_modrm(s, opc, ret, arg); + int rexw = 0; + + if (arg == ret) { + return; + } + switch (type) { + case TCG_TYPE_I64: + rexw = P_REXW; + /* fallthru */ + case TCG_TYPE_I32: + if (ret < 16) { + if (arg < 16) { + tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg); + } else { + tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret); + } + } else { + if (arg < 16) { + tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg); + } else { + tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); + } + } + break; + + case TCG_TYPE_V64: + tcg_debug_assert(ret >= 16 && arg >= 16); + tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg); + break; + case TCG_TYPE_V128: + tcg_debug_assert(ret >= 16 && arg >= 16); + tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg); + break; + case TCG_TYPE_V256: + tcg_debug_assert(ret >= 16 && arg >= 16); + tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg); + break; + + default: + g_assert_not_reached(); + } +} + +static void tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece, + TCGReg r, TCGReg a) +{ + if (have_avx2) { + static const int dup_insn[4] = { + OPC_VPBROADCASTB, OPC_VPBROADCASTW, + OPC_VPBROADCASTD, OPC_VPBROADCASTQ, + }; + int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0); + tcg_out_vex_modrm(s, dup_insn[vece] + vex_l, r, 0, a); + } else { + switch (vece) { + case MO_8: + /* ??? With zero in a register, use PSHUFB. */ + tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, 0, a); + a = r; + /* FALLTHRU */ + case MO_16: + tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, 0, a); + a = r; + /* FALLTHRU */ + case MO_32: + tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a); + /* imm8 operand: all output lanes selected from input lane 0. 
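The immediate packs four 2-bit source-lane indices, one per output dword, so 0x00 broadcasts lane 0, while 0xe4 (binary 11 10 01 00) would be the identity shuffle.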
*/ + tcg_out8(s, 0); + break; + case MO_64: + tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, 0, a); + break; + default: + g_assert_not_reached(); + } + } +} + +static void tcg_out_dupi_vec(TCGContext *s, TCGType type, + TCGReg ret, tcg_target_long arg) +{ + int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0); + + if (arg == 0) { + tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret); + return; + } + if (arg == -1) { + tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret); + return; + } + + if (TCG_TARGET_REG_BITS == 64) { + if (type == TCG_TYPE_V64) { + tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret); + } else if (have_avx2) { + tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret); + } else { + tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret); + } + new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4); + } else if (have_avx2) { + tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret); + new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); + } else { + tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy, ret); + new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0); + tcg_out_dup_vec(s, type, MO_32, ret, ret); } } @@ -639,6 +913,25 @@ static void tcg_out_movi(TCGContext *s, TCGType type, { tcg_target_long diff; + switch (type) { + case TCG_TYPE_I32: +#if TCG_TARGET_REG_BITS == 64 + case TCG_TYPE_I64: +#endif + if (ret < 16) { + break; + } + /* fallthru */ + case TCG_TYPE_V64: + case TCG_TYPE_V128: + case TCG_TYPE_V256: + tcg_debug_assert(ret >= 16); + tcg_out_dupi_vec(s, type, ret, arg); + return; + default: + g_assert_not_reached(); + } + if (arg == 0) { tgen_arithr(s, ARITH_XOR, ret, ret); return; @@ -702,18 +995,74 @@ static inline void tcg_out_pop(TCGContext *s, int reg) tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0); } -static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, - TCGReg arg1, intptr_t arg2) +static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret, + TCGReg arg1, intptr_t arg2) { - int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0); - tcg_out_modrm_offset(s, opc, ret, arg1, arg2); + switch (type) { + case TCG_TYPE_I32: + if (ret < 16) { + tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2); + } else { + tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2); + } + break; + case TCG_TYPE_I64: + if (ret < 16) { + tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2); + break; + } + /* FALLTHRU */ + case TCG_TYPE_V64: + tcg_debug_assert(ret >= 16); + tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2); + break; + case TCG_TYPE_V128: + tcg_debug_assert(ret >= 16); + tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx, ret, 0, arg1, arg2); + break; + case TCG_TYPE_V256: + tcg_debug_assert(ret >= 16); + tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL, + ret, 0, arg1, arg2); + break; + default: + g_assert_not_reached(); + } } -static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, - TCGReg arg1, intptr_t arg2) +static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, + TCGReg arg1, intptr_t arg2) { - int opc = OPC_MOVL_EvGv + (type == TCG_TYPE_I64 ? 
P_REXW : 0); - tcg_out_modrm_offset(s, opc, arg, arg1, arg2); + switch (type) { + case TCG_TYPE_I32: + if (arg < 16) { + tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2); + } else { + tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2); + } + break; + case TCG_TYPE_I64: + if (arg < 16) { + tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2); + break; + } + /* FALLTHRU */ + case TCG_TYPE_V64: + tcg_debug_assert(arg >= 16); + tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2); + break; + case TCG_TYPE_V128: + tcg_debug_assert(arg >= 16); + tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx, arg, 0, arg1, arg2); + break; + case TCG_TYPE_V256: + tcg_debug_assert(arg >= 16); + tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL, + arg, 0, arg1, arg2); + break; + default: + g_assert_not_reached(); + } } static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, @@ -725,6 +1074,8 @@ static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, return false; } rexw = P_REXW; + } else if (type != TCG_TYPE_I32) { + return false; } tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs); tcg_out32(s, val); @@ -2259,8 +2610,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, break; case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */ case INDEX_op_mov_i64: + case INDEX_op_mov_vec: case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */ case INDEX_op_movi_i64: + case INDEX_op_dupi_vec: case INDEX_op_call: /* Always emitted via tcg_out_call. */ default: tcg_abort(); @@ -2269,6 +2622,181 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, #undef OP_32_64 } +static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, + unsigned vecl, unsigned vece, + const TCGArg *args, const int *const_args) +{ + static int const add_insn[4] = { + OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ + }; + static int const sub_insn[4] = { + OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ + }; + static int const mul_insn[4] = { + OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2 + }; + static int const shift_imm_insn[4] = { + OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib + }; + static int const cmpeq_insn[4] = { + OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ + }; + static int const cmpgt_insn[4] = { + OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ + }; + static int const punpckl_insn[4] = { + OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ + }; + static int const punpckh_insn[4] = { + OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ + }; + static int const packss_insn[4] = { + OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2 + }; + static int const packus_insn[4] = { + OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2 + }; + + TCGType type = vecl + TCG_TYPE_V64; + int insn, sub; + TCGArg a0, a1, a2; + + a0 = args[0]; + a1 = args[1]; + a2 = args[2]; + + switch (opc) { + case INDEX_op_add_vec: + insn = add_insn[vece]; + goto gen_simd; + case INDEX_op_sub_vec: + insn = sub_insn[vece]; + goto gen_simd; + case INDEX_op_mul_vec: + insn = mul_insn[vece]; + goto gen_simd; + case INDEX_op_and_vec: + insn = OPC_PAND; + goto gen_simd; + case INDEX_op_or_vec: + insn = OPC_POR; + goto gen_simd; + case INDEX_op_xor_vec: + insn = OPC_PXOR; + goto gen_simd; + case INDEX_op_x86_punpckl_vec: + insn = punpckl_insn[vece]; + goto gen_simd; + case INDEX_op_x86_punpckh_vec: + insn = punpckh_insn[vece]; + goto gen_simd; + case INDEX_op_x86_packss_vec: + insn = packss_insn[vece]; + goto gen_simd; + case INDEX_op_x86_packus_vec: + insn = 
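/* pack with unsigned saturation, clamping each wide lane to the narrow
   lane's unsigned range; the shift expansions below pre-zero the high
   half of every lane so the pack is value-preserving */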
packus_insn[vece]; + goto gen_simd; + gen_simd: + tcg_debug_assert(insn != OPC_UD2); + if (type == TCG_TYPE_V256) { + insn |= P_VEXL; + } + tcg_out_vex_modrm(s, insn, a0, a1, a2); + break; + + case INDEX_op_cmp_vec: + sub = args[3]; + if (sub == TCG_COND_EQ) { + insn = cmpeq_insn[vece]; + } else if (sub == TCG_COND_GT) { + insn = cmpgt_insn[vece]; + } else { + g_assert_not_reached(); + } + goto gen_simd; + + case INDEX_op_andc_vec: + insn = OPC_PANDN; + if (type == TCG_TYPE_V256) { + insn |= P_VEXL; + } + tcg_out_vex_modrm(s, insn, a0, a2, a1); + break; + + case INDEX_op_shli_vec: + sub = 6; + goto gen_shift; + case INDEX_op_shri_vec: + sub = 2; + goto gen_shift; + case INDEX_op_sari_vec: + tcg_debug_assert(vece != MO_64); + sub = 4; + gen_shift: + tcg_debug_assert(vece != MO_8); + insn = shift_imm_insn[vece]; + if (type == TCG_TYPE_V256) { + insn |= P_VEXL; + } + tcg_out_vex_modrm(s, insn, sub, a0, a1); + tcg_out8(s, a2); + break; + + case INDEX_op_ld_vec: + tcg_out_ld(s, type, a0, a1, a2); + break; + case INDEX_op_st_vec: + tcg_out_st(s, type, a0, a1, a2); + break; + case INDEX_op_dup_vec: + tcg_out_dup_vec(s, type, vece, a0, a1); + break; + + case INDEX_op_x86_shufps_vec: + insn = OPC_SHUFPS; + sub = args[3]; + goto gen_simd_imm8; + case INDEX_op_x86_blend_vec: + if (vece == MO_16) { + insn = OPC_PBLENDW; + } else if (vece == MO_32) { + insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS); + } else { + g_assert_not_reached(); + } + sub = args[3]; + goto gen_simd_imm8; + case INDEX_op_x86_vperm2i128_vec: + insn = OPC_VPERM2I128; + sub = args[3]; + goto gen_simd_imm8; + gen_simd_imm8: + if (type == TCG_TYPE_V256) { + insn |= P_VEXL; + } + tcg_out_vex_modrm(s, insn, a0, a1, a2); + tcg_out8(s, sub); + break; + + case INDEX_op_x86_vpblendvb_vec: + insn = OPC_VPBLENDVB; + if (type == TCG_TYPE_V256) { + insn |= P_VEXL; + } + tcg_out_vex_modrm(s, insn, a0, a1, a2); + tcg_out8(s, args[3] << 4); + break; + + case INDEX_op_x86_psrldq_vec: + tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1); + tcg_out8(s, a2); + break; + + default: + g_assert_not_reached(); + } +} + static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op) { static const TCGTargetOpDef r = { .args_ct_str = { "r" } }; @@ -2292,6 +2820,11 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op) = { .args_ct_str = { "r", "r", "L", "L" } }; static const TCGTargetOpDef L_L_L_L = { .args_ct_str = { "L", "L", "L", "L" } }; + static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } }; + static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } }; + static const TCGTargetOpDef x_x_x_x + = { .args_ct_str = { "x", "x", "x", "x" } }; + static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } }; switch (op) { case INDEX_op_goto_ptr: @@ -2493,12 +3026,342 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op) return &s2; } + case INDEX_op_ld_vec: + case INDEX_op_st_vec: + return &x_r; + + case INDEX_op_add_vec: + case INDEX_op_sub_vec: + case INDEX_op_mul_vec: + case INDEX_op_and_vec: + case INDEX_op_or_vec: + case INDEX_op_xor_vec: + case INDEX_op_andc_vec: + case INDEX_op_cmp_vec: + case INDEX_op_x86_shufps_vec: + case INDEX_op_x86_blend_vec: + case INDEX_op_x86_packss_vec: + case INDEX_op_x86_packus_vec: + case INDEX_op_x86_vperm2i128_vec: + case INDEX_op_x86_punpckl_vec: + case INDEX_op_x86_punpckh_vec: + return &x_x_x; + case INDEX_op_dup_vec: + case INDEX_op_shli_vec: + case INDEX_op_shri_vec: + case INDEX_op_sari_vec: + case INDEX_op_x86_psrldq_vec: + return &x_x; + case INDEX_op_x86_vpblendvb_vec: + 
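/* VPBLENDVB takes its selector as a fourth register operand, encoded in
   the immediate byte above as args[3] << 4 -- hence the only
   four-operand vector constraint in this table. */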
return &x_x_x_x; + default: break; } return NULL; } +int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) +{ + switch (opc) { + case INDEX_op_add_vec: + case INDEX_op_sub_vec: + case INDEX_op_and_vec: + case INDEX_op_or_vec: + case INDEX_op_xor_vec: + case INDEX_op_andc_vec: + return 1; + case INDEX_op_cmp_vec: + return -1; + + case INDEX_op_shli_vec: + case INDEX_op_shri_vec: + /* We must expand the operation for MO_8. */ + return vece == MO_8 ? -1 : 1; + + case INDEX_op_sari_vec: + /* We must expand the operation for MO_8. */ + if (vece == MO_8) { + return -1; + } + /* We can emulate this for MO_64, but it does not pay off + unless we're producing at least 4 values. */ + if (vece == MO_64) { + return type >= TCG_TYPE_V256 ? -1 : 0; + } + return 1; + + case INDEX_op_mul_vec: + if (vece == MO_8) { + /* We can expand the operation for MO_8. */ + return -1; + } + if (vece == MO_64) { + return 0; + } + return 1; + + default: + return 0; + } +} + +void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece, + TCGArg a0, ...) +{ + va_list va; + TCGArg a1, a2; + TCGv_vec v0, t1, t2, t3, t4; + + va_start(va, a0); + v0 = temp_tcgv_vec(arg_temp(a0)); + + switch (opc) { + case INDEX_op_shli_vec: + case INDEX_op_shri_vec: + tcg_debug_assert(vece == MO_8); + a1 = va_arg(va, TCGArg); + a2 = va_arg(va, TCGArg); + /* Unpack to W, shift, and repack. Tricky bits: + (1) Use punpck*bw x,x to produce DDCCBBAA, + i.e. duplicate in other half of the 16-bit lane. + (2) For right-shift, add 8 so that the high half of + the lane becomes zero. For left-shift, we must + shift up and down again. + (3) Step 2 leaves high half zero such that PACKUSWB + (pack with unsigned saturation) does not modify + the quantity. */ + t1 = tcg_temp_new_vec(type); + t2 = tcg_temp_new_vec(type); + vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, + tcgv_vec_arg(t1), a1, a1); + vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, + tcgv_vec_arg(t2), a1, a1); + if (opc == INDEX_op_shri_vec) { + vec_gen_3(INDEX_op_shri_vec, type, MO_16, + tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8); + vec_gen_3(INDEX_op_shri_vec, type, MO_16, + tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8); + } else { + vec_gen_3(INDEX_op_shli_vec, type, MO_16, + tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8); + vec_gen_3(INDEX_op_shli_vec, type, MO_16, + tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8); + vec_gen_3(INDEX_op_shri_vec, type, MO_16, + tcgv_vec_arg(t1), tcgv_vec_arg(t1), 8); + vec_gen_3(INDEX_op_shri_vec, type, MO_16, + tcgv_vec_arg(t2), tcgv_vec_arg(t2), 8); + } + vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8, + a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2)); + tcg_temp_free_vec(t1); + tcg_temp_free_vec(t2); + break; + + case INDEX_op_sari_vec: + a1 = va_arg(va, TCGArg); + a2 = va_arg(va, TCGArg); + if (vece == MO_8) { + /* Unpack to W, shift, and repack, as above. 
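Interleaving a byte with itself leaves (b << 8) | b in each 16-bit lane, and an arithmetic shift of that value by a2 + 8 equals b >> a2 in the low byte; every result fits in a signed byte, so PACKSSWB repacks without saturating.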
*/ + t1 = tcg_temp_new_vec(type); + t2 = tcg_temp_new_vec(type); + vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8, + tcgv_vec_arg(t1), a1, a1); + vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8, + tcgv_vec_arg(t2), a1, a1); + vec_gen_3(INDEX_op_sari_vec, type, MO_16, + tcgv_vec_arg(t1), tcgv_vec_arg(t1), a2 + 8); + vec_gen_3(INDEX_op_sari_vec, type, MO_16, + tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2 + 8); + vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8, + a0, tcgv_vec_arg(t1), tcgv_vec_arg(t2)); + tcg_temp_free_vec(t1); + tcg_temp_free_vec(t2); + break; + } + tcg_debug_assert(vece == MO_64); + /* MO_64: If the shift is <= 32, we can emulate the sign extend by + performing an arithmetic 32-bit shift and overwriting the high + half of the result (note that the ISA says shift of 32 is valid). */ + if (a2 <= 32) { + t1 = tcg_temp_new_vec(type); + vec_gen_3(INDEX_op_sari_vec, type, MO_32, tcgv_vec_arg(t1), a1, a2); + vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2); + vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32, + a0, a0, tcgv_vec_arg(t1), 0xaa); + tcg_temp_free_vec(t1); + break; + } + /* Otherwise we will need to use a compare vs 0 to produce the + sign-extend, shift and merge. */ + t1 = tcg_temp_new_vec(type); + t2 = tcg_const_zeros_vec(type); + vec_gen_4(INDEX_op_cmp_vec, type, MO_64, + tcgv_vec_arg(t1), tcgv_vec_arg(t2), a1, TCG_COND_GT); + tcg_temp_free_vec(t2); + vec_gen_3(INDEX_op_shri_vec, type, MO_64, a0, a1, a2); + vec_gen_3(INDEX_op_shli_vec, type, MO_64, + tcgv_vec_arg(t1), tcgv_vec_arg(t1), 64 - a2); + vec_gen_3(INDEX_op_or_vec, type, MO_64, a0, a0, tcgv_vec_arg(t1)); + tcg_temp_free_vec(t1); + break; + + case INDEX_op_mul_vec: + tcg_debug_assert(vece == MO_8); + a1 = va_arg(va, TCGArg); + a2 = va_arg(va, TCGArg); + switch (type) { + case TCG_TYPE_V64: + t1 = tcg_temp_new_vec(TCG_TYPE_V128); + t2 = tcg_temp_new_vec(TCG_TYPE_V128); + tcg_gen_dup16i_vec(t2, 0); + vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, + tcgv_vec_arg(t1), a1, tcgv_vec_arg(t2)); + vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, + tcgv_vec_arg(t2), tcgv_vec_arg(t2), a2); + tcg_gen_mul_vec(MO_16, t1, t1, t2); + tcg_gen_shri_vec(MO_16, t1, t1, 8); + vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8, + a0, tcgv_vec_arg(t1), tcgv_vec_arg(t1)); + tcg_temp_free_vec(t1); + tcg_temp_free_vec(t2); + break; + + case TCG_TYPE_V128: + t1 = tcg_temp_new_vec(TCG_TYPE_V128); + t2 = tcg_temp_new_vec(TCG_TYPE_V128); + t3 = tcg_temp_new_vec(TCG_TYPE_V128); + t4 = tcg_temp_new_vec(TCG_TYPE_V128); + tcg_gen_dup16i_vec(t4, 0); + vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, + tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4)); + vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8, + tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2); + vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8, + tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4)); + vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V128, MO_8, + tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2); + tcg_gen_mul_vec(MO_16, t1, t1, t2); + tcg_gen_mul_vec(MO_16, t3, t3, t4); + tcg_gen_shri_vec(MO_16, t1, t1, 8); + tcg_gen_shri_vec(MO_16, t3, t3, 8); + vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8, + a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3)); + tcg_temp_free_vec(t1); + tcg_temp_free_vec(t2); + tcg_temp_free_vec(t3); + tcg_temp_free_vec(t4); + break; + + case TCG_TYPE_V256: + t1 = tcg_temp_new_vec(TCG_TYPE_V256); + t2 = tcg_temp_new_vec(TCG_TYPE_V256); + t3 = tcg_temp_new_vec(TCG_TYPE_V256); + t4 = tcg_temp_new_vec(TCG_TYPE_V256); + tcg_gen_dup16i_vec(t4, 0); + /* a1: A[0-7] 
... D[0-7]; a2: W[0-7] ... Z[0-7] + t1: extends of B[0-7], D[0-7] + t2: extends of X[0-7], Z[0-7] + t3: extends of A[0-7], C[0-7] + t4: extends of W[0-7], Y[0-7]. */ + vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8, + tcgv_vec_arg(t1), a1, tcgv_vec_arg(t4)); + vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V256, MO_8, + tcgv_vec_arg(t2), tcgv_vec_arg(t4), a2); + vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8, + tcgv_vec_arg(t3), a1, tcgv_vec_arg(t4)); + vec_gen_3(INDEX_op_x86_punpckh_vec, TCG_TYPE_V256, MO_8, + tcgv_vec_arg(t4), tcgv_vec_arg(t4), a2); + /* t1: BX DZ; t2: AW CY. */ + tcg_gen_mul_vec(MO_16, t1, t1, t2); + tcg_gen_mul_vec(MO_16, t3, t3, t4); + tcg_gen_shri_vec(MO_16, t1, t1, 8); + tcg_gen_shri_vec(MO_16, t3, t3, 8); + /* a0: AW BX CY DZ. */ + vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V256, MO_8, + a0, tcgv_vec_arg(t1), tcgv_vec_arg(t3)); + tcg_temp_free_vec(t1); + tcg_temp_free_vec(t2); + tcg_temp_free_vec(t3); + tcg_temp_free_vec(t4); + break; + + default: + g_assert_not_reached(); + } + break; + + case INDEX_op_cmp_vec: + { + enum { + NEED_SWAP = 1, + NEED_INV = 2, + NEED_BIAS = 4 + }; + static const uint8_t fixups[16] = { + [0 ... 15] = -1, + [TCG_COND_EQ] = 0, + [TCG_COND_NE] = NEED_INV, + [TCG_COND_GT] = 0, + [TCG_COND_LT] = NEED_SWAP, + [TCG_COND_LE] = NEED_INV, + [TCG_COND_GE] = NEED_SWAP | NEED_INV, + [TCG_COND_GTU] = NEED_BIAS, + [TCG_COND_LTU] = NEED_BIAS | NEED_SWAP, + [TCG_COND_LEU] = NEED_BIAS | NEED_INV, + [TCG_COND_GEU] = NEED_BIAS | NEED_SWAP | NEED_INV, + }; + + TCGCond cond; + uint8_t fixup; + + a1 = va_arg(va, TCGArg); + a2 = va_arg(va, TCGArg); + cond = va_arg(va, TCGArg); + fixup = fixups[cond & 15]; + tcg_debug_assert(fixup != 0xff); + + if (fixup & NEED_INV) { + cond = tcg_invert_cond(cond); + } + if (fixup & NEED_SWAP) { + TCGArg t; + t = a1, a1 = a2, a2 = t; + cond = tcg_swap_cond(cond); + } + + t1 = t2 = NULL; + if (fixup & NEED_BIAS) { + t1 = tcg_temp_new_vec(type); + t2 = tcg_temp_new_vec(type); + tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1)); + tcg_gen_sub_vec(vece, t1, temp_tcgv_vec(arg_temp(a1)), t2); + tcg_gen_sub_vec(vece, t2, temp_tcgv_vec(arg_temp(a2)), t2); + a1 = tcgv_vec_arg(t1); + a2 = tcgv_vec_arg(t2); + cond = tcg_signed_cond(cond); + } + + tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT); + vec_gen_4(INDEX_op_cmp_vec, type, vece, a0, a1, a2, cond); + + if (fixup & NEED_BIAS) { + tcg_temp_free_vec(t1); + tcg_temp_free_vec(t2); + } + if (fixup & NEED_INV) { + tcg_gen_not_vec(vece, v0, v0); + } + } + break; + + default: + break; + } + + va_end(va); +} + static const int tcg_target_callee_save_regs[] = { #if TCG_TARGET_REG_BITS == 64 TCG_REG_RBP, @@ -2577,6 +3440,9 @@ static void tcg_target_qemu_prologue(TCGContext *s) tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend); + if (have_avx2) { + tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0); + } for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) { tcg_out_pop(s, tcg_target_callee_save_regs[i]); } @@ -2598,9 +3464,16 @@ static void tcg_out_nop_fill(tcg_insn_unit *p, int count) static void tcg_target_init(TCGContext *s) { #ifdef CONFIG_CPUID_H - unsigned a, b, c, d; + unsigned a, b, c, d, b7 = 0; int max = __get_cpuid_max(0, 0); + if (max >= 7) { + /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. 
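Leaf 7's ebx is kept in b7 so the AVX2 bit can be tested later, once the leaf-1 OSXSAVE path below has confirmed via xgetbv that the OS enabled both XMM and YMM state: xcr0 bits 1 and 2, which is what the (xcrl & 6) == 6 test checks.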
*/ + __cpuid_count(7, 0, a, b7, c, d); + have_bmi1 = (b7 & bit_BMI) != 0; + have_bmi2 = (b7 & bit_BMI2) != 0; + } + if (max >= 1) { __cpuid(1, a, b, c, d); #ifndef have_cmov @@ -2609,17 +3482,22 @@ static void tcg_target_init(TCGContext *s) available, we'll use a small forward branch. */ have_cmov = (d & bit_CMOV) != 0; #endif + /* MOVBE is only available on Intel Atom and Haswell CPUs, so we need to probe for it. */ have_movbe = (c & bit_MOVBE) != 0; have_popcnt = (c & bit_POPCNT) != 0; - } - if (max >= 7) { - /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */ - __cpuid_count(7, 0, a, b, c, d); - have_bmi1 = (b & bit_BMI) != 0; - have_bmi2 = (b & bit_BMI2) != 0; + /* There are a number of things we must check before we can be + sure of not hitting invalid opcode. */ + if (c & bit_OSXSAVE) { + unsigned xcrl, xcrh; + asm ("xgetbv" : "=a" (xcrl), "=d" (xcrh) : "c" (0)); + if ((xcrl & 6) == 6) { + have_avx1 = (c & bit_AVX) != 0; + have_avx2 = (b7 & bit_AVX2) != 0; + } + } } max = __get_cpuid_max(0x8000000, 0); @@ -2630,11 +3508,16 @@ static void tcg_target_init(TCGContext *s) } #endif /* CONFIG_CPUID_H */ + tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS; if (TCG_TARGET_REG_BITS == 64) { - tcg_target_available_regs[TCG_TYPE_I32] = 0xffff; - tcg_target_available_regs[TCG_TYPE_I64] = 0xffff; - } else { - tcg_target_available_regs[TCG_TYPE_I32] = 0xff; + tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS; + } + if (have_avx1) { + tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS; + tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS; + } + if (have_avx2) { + tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS; } tcg_target_call_clobber_regs = 0; diff --git a/tcg/i386/tcg-target.opc.h b/tcg/i386/tcg-target.opc.h new file mode 100644 index 0000000000..e5fa88ba25 --- /dev/null +++ b/tcg/i386/tcg-target.opc.h @@ -0,0 +1,13 @@ +/* Target-specific opcodes for host vector expansion. These will be + emitted by tcg_expand_vec_op. For those familiar with GCC internals, + consider these to be UNSPEC with names. */ + +DEF(x86_shufps_vec, 1, 2, 1, IMPLVEC) +DEF(x86_vpblendvb_vec, 1, 3, 0, IMPLVEC) +DEF(x86_blend_vec, 1, 2, 1, IMPLVEC) +DEF(x86_packss_vec, 1, 2, 0, IMPLVEC) +DEF(x86_packus_vec, 1, 2, 0, IMPLVEC) +DEF(x86_psrldq_vec, 1, 1, 1, IMPLVEC) +DEF(x86_vperm2i128_vec, 1, 2, 1, IMPLVEC) +DEF(x86_punpckl_vec, 1, 2, 0, IMPLVEC) +DEF(x86_punpckh_vec, 1, 2, 0, IMPLVEC) diff --git a/tcg/optimize.c b/tcg/optimize.c index 2cbbeefd53..d4ea67e541 100644 --- a/tcg/optimize.c +++ b/tcg/optimize.c @@ -32,6 +32,11 @@ glue(glue(case INDEX_op_, x), _i32): \ glue(glue(case INDEX_op_, x), _i64) +#define CASE_OP_32_64_VEC(x) \ + glue(glue(case INDEX_op_, x), _i32): \ + glue(glue(case INDEX_op_, x), _i64): \ + glue(glue(case INDEX_op_, x), _vec) + struct tcg_temp_info { bool is_const; TCGTemp *prev_copy; @@ -108,40 +113,6 @@ static void init_arg_info(struct tcg_temp_info *infos, init_ts_info(infos, temps_used, arg_temp(arg)); } -static int op_bits(TCGOpcode op) -{ - const TCGOpDef *def = &tcg_op_defs[op]; - return def->flags & TCG_OPF_64BIT ? 
64 : 32; -} - -static TCGOpcode op_to_mov(TCGOpcode op) -{ - switch (op_bits(op)) { - case 32: - return INDEX_op_mov_i32; - case 64: - return INDEX_op_mov_i64; - default: - fprintf(stderr, "op_to_mov: unexpected return value of " - "function op_bits.\n"); - tcg_abort(); - } -} - -static TCGOpcode op_to_movi(TCGOpcode op) -{ - switch (op_bits(op)) { - case 32: - return INDEX_op_movi_i32; - case 64: - return INDEX_op_movi_i64; - default: - fprintf(stderr, "op_to_movi: unexpected return value of " - "function op_bits.\n"); - tcg_abort(); - } -} - static TCGTemp *find_better_copy(TCGContext *s, TCGTemp *ts) { TCGTemp *i; @@ -199,11 +170,23 @@ static bool args_are_copies(TCGArg arg1, TCGArg arg2) static void tcg_opt_gen_movi(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg val) { - TCGOpcode new_op = op_to_movi(op->opc); + const TCGOpDef *def; + TCGOpcode new_op; tcg_target_ulong mask; struct tcg_temp_info *di = arg_info(dst); + def = &tcg_op_defs[op->opc]; + if (def->flags & TCG_OPF_VECTOR) { + new_op = INDEX_op_dupi_vec; + } else if (def->flags & TCG_OPF_64BIT) { + new_op = INDEX_op_movi_i64; + } else { + new_op = INDEX_op_movi_i32; + } op->opc = new_op; + /* TCGOP_VECL and TCGOP_VECE remain unchanged. */ + op->args[0] = dst; + op->args[1] = val; reset_temp(dst); di->is_const = true; @@ -214,15 +197,13 @@ static void tcg_opt_gen_movi(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg val) mask |= ~0xffffffffull; } di->mask = mask; - - op->args[0] = dst; - op->args[1] = val; } static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src) { TCGTemp *dst_ts = arg_temp(dst); TCGTemp *src_ts = arg_temp(src); + const TCGOpDef *def; struct tcg_temp_info *di; struct tcg_temp_info *si; tcg_target_ulong mask; @@ -236,9 +217,16 @@ static void tcg_opt_gen_mov(TCGContext *s, TCGOp *op, TCGArg dst, TCGArg src) reset_ts(dst_ts); di = ts_info(dst_ts); si = ts_info(src_ts); - new_op = op_to_mov(op->opc); - + def = &tcg_op_defs[op->opc]; + if (def->flags & TCG_OPF_VECTOR) { + new_op = INDEX_op_mov_vec; + } else if (def->flags & TCG_OPF_64BIT) { + new_op = INDEX_op_mov_i64; + } else { + new_op = INDEX_op_mov_i32; + } op->opc = new_op; + /* TCGOP_VECL and TCGOP_VECE remain unchanged. 
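Rewriting opc in place is safe because mov_vec shares the two-operand layout, and the vector length and element size are carried in the op's own bitfields rather than in its argument list.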
*/ op->args[0] = dst; op->args[1] = src; @@ -417,8 +405,9 @@ static TCGArg do_constant_folding_2(TCGOpcode op, TCGArg x, TCGArg y) static TCGArg do_constant_folding(TCGOpcode op, TCGArg x, TCGArg y) { + const TCGOpDef *def = &tcg_op_defs[op]; TCGArg res = do_constant_folding_2(op, x, y); - if (op_bits(op) == 32) { + if (!(def->flags & TCG_OPF_64BIT)) { res = (int32_t)res; } return res; @@ -508,13 +497,12 @@ static TCGArg do_constant_folding_cond(TCGOpcode op, TCGArg x, tcg_target_ulong xv = arg_info(x)->val; tcg_target_ulong yv = arg_info(y)->val; if (arg_is_const(x) && arg_is_const(y)) { - switch (op_bits(op)) { - case 32: - return do_constant_folding_cond_32(xv, yv, c); - case 64: + const TCGOpDef *def = &tcg_op_defs[op]; + tcg_debug_assert(!(def->flags & TCG_OPF_VECTOR)); + if (def->flags & TCG_OPF_64BIT) { return do_constant_folding_cond_64(xv, yv, c); - default: - tcg_abort(); + } else { + return do_constant_folding_cond_32(xv, yv, c); } } else if (args_are_copies(x, y)) { return do_constant_folding_cond_eq(c); @@ -653,11 +641,11 @@ void tcg_optimize(TCGContext *s) /* For commutative operations make constant second argument */ switch (opc) { - CASE_OP_32_64(add): - CASE_OP_32_64(mul): - CASE_OP_32_64(and): - CASE_OP_32_64(or): - CASE_OP_32_64(xor): + CASE_OP_32_64_VEC(add): + CASE_OP_32_64_VEC(mul): + CASE_OP_32_64_VEC(and): + CASE_OP_32_64_VEC(or): + CASE_OP_32_64_VEC(xor): CASE_OP_32_64(eqv): CASE_OP_32_64(nand): CASE_OP_32_64(nor): @@ -722,7 +710,7 @@ void tcg_optimize(TCGContext *s) continue; } break; - CASE_OP_32_64(sub): + CASE_OP_32_64_VEC(sub): { TCGOpcode neg_op; bool have_neg; @@ -734,9 +722,12 @@ void tcg_optimize(TCGContext *s) if (opc == INDEX_op_sub_i32) { neg_op = INDEX_op_neg_i32; have_neg = TCG_TARGET_HAS_neg_i32; - } else { + } else if (opc == INDEX_op_sub_i64) { neg_op = INDEX_op_neg_i64; have_neg = TCG_TARGET_HAS_neg_i64; + } else { + neg_op = INDEX_op_neg_vec; + have_neg = TCG_TARGET_HAS_neg_vec; } if (!have_neg) { break; @@ -750,7 +741,7 @@ void tcg_optimize(TCGContext *s) } } break; - CASE_OP_32_64(xor): + CASE_OP_32_64_VEC(xor): CASE_OP_32_64(nand): if (!arg_is_const(op->args[1]) && arg_is_const(op->args[2]) @@ -767,7 +758,7 @@ void tcg_optimize(TCGContext *s) goto try_not; } break; - CASE_OP_32_64(andc): + CASE_OP_32_64_VEC(andc): if (!arg_is_const(op->args[2]) && arg_is_const(op->args[1]) && arg_info(op->args[1])->val == -1) { @@ -775,7 +766,7 @@ void tcg_optimize(TCGContext *s) goto try_not; } break; - CASE_OP_32_64(orc): + CASE_OP_32_64_VEC(orc): CASE_OP_32_64(eqv): if (!arg_is_const(op->args[2]) && arg_is_const(op->args[1]) @@ -789,7 +780,10 @@ void tcg_optimize(TCGContext *s) TCGOpcode not_op; bool have_not; - if (def->flags & TCG_OPF_64BIT) { + if (def->flags & TCG_OPF_VECTOR) { + not_op = INDEX_op_not_vec; + have_not = TCG_TARGET_HAS_not_vec; + } else if (def->flags & TCG_OPF_64BIT) { not_op = INDEX_op_not_i64; have_not = TCG_TARGET_HAS_not_i64; } else { @@ -810,16 +804,16 @@ void tcg_optimize(TCGContext *s) /* Simplify expression for "op r, a, const => mov r, a" cases */ switch (opc) { - CASE_OP_32_64(add): - CASE_OP_32_64(sub): + CASE_OP_32_64_VEC(add): + CASE_OP_32_64_VEC(sub): + CASE_OP_32_64_VEC(or): + CASE_OP_32_64_VEC(xor): + CASE_OP_32_64_VEC(andc): CASE_OP_32_64(shl): CASE_OP_32_64(shr): CASE_OP_32_64(sar): CASE_OP_32_64(rotl): CASE_OP_32_64(rotr): - CASE_OP_32_64(or): - CASE_OP_32_64(xor): - CASE_OP_32_64(andc): if (!arg_is_const(op->args[1]) && arg_is_const(op->args[2]) && arg_info(op->args[2])->val == 0) { @@ -827,8 +821,8 @@ void 
tcg_optimize(TCGContext *s) continue; } break; - CASE_OP_32_64(and): - CASE_OP_32_64(orc): + CASE_OP_32_64_VEC(and): + CASE_OP_32_64_VEC(orc): CASE_OP_32_64(eqv): if (!arg_is_const(op->args[1]) && arg_is_const(op->args[2]) @@ -1042,8 +1036,8 @@ void tcg_optimize(TCGContext *s) /* Simplify expression for "op r, a, 0 => movi r, 0" cases */ switch (opc) { - CASE_OP_32_64(and): - CASE_OP_32_64(mul): + CASE_OP_32_64_VEC(and): + CASE_OP_32_64_VEC(mul): CASE_OP_32_64(muluh): CASE_OP_32_64(mulsh): if (arg_is_const(op->args[2]) @@ -1058,8 +1052,8 @@ void tcg_optimize(TCGContext *s) /* Simplify expression for "op r, a, a => mov r, a" cases */ switch (opc) { - CASE_OP_32_64(or): - CASE_OP_32_64(and): + CASE_OP_32_64_VEC(or): + CASE_OP_32_64_VEC(and): if (args_are_copies(op->args[1], op->args[2])) { tcg_opt_gen_mov(s, op, op->args[0], op->args[1]); continue; @@ -1071,9 +1065,9 @@ void tcg_optimize(TCGContext *s) /* Simplify expression for "op r, a, a => movi r, 0" cases */ switch (opc) { - CASE_OP_32_64(andc): - CASE_OP_32_64(sub): - CASE_OP_32_64(xor): + CASE_OP_32_64_VEC(andc): + CASE_OP_32_64_VEC(sub): + CASE_OP_32_64_VEC(xor): if (args_are_copies(op->args[1], op->args[2])) { tcg_opt_gen_movi(s, op, op->args[0], 0); continue; @@ -1087,13 +1081,23 @@ void tcg_optimize(TCGContext *s) folding. Constants will be substituted to arguments by register allocator where needed and possible. Also detect copies. */ switch (opc) { - CASE_OP_32_64(mov): + CASE_OP_32_64_VEC(mov): tcg_opt_gen_mov(s, op, op->args[0], op->args[1]); break; CASE_OP_32_64(movi): + case INDEX_op_dupi_vec: tcg_opt_gen_movi(s, op, op->args[0], op->args[1]); break; + case INDEX_op_dup_vec: + if (arg_is_const(op->args[1])) { + tmp = arg_info(op->args[1])->val; + tmp = dup_const(TCGOP_VECE(op), tmp); + tcg_opt_gen_movi(s, op, op->args[0], tmp); + continue; + } + break; + CASE_OP_32_64(not): CASE_OP_32_64(neg): CASE_OP_32_64(ext8s): diff --git a/tcg/tcg-gvec-desc.h b/tcg/tcg-gvec-desc.h new file mode 100644 index 0000000000..3b4c2d9c69 --- /dev/null +++ b/tcg/tcg-gvec-desc.h @@ -0,0 +1,49 @@ +/* + * Generic vector operation descriptor + * + * Copyright (c) 2018 Linaro + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* ??? These bit widths are set for ARM SVE, maxing out at 256 byte vectors. */ +#define SIMD_OPRSZ_SHIFT 0 +#define SIMD_OPRSZ_BITS 5 + +#define SIMD_MAXSZ_SHIFT (SIMD_OPRSZ_SHIFT + SIMD_OPRSZ_BITS) +#define SIMD_MAXSZ_BITS 5 + +#define SIMD_DATA_SHIFT (SIMD_MAXSZ_SHIFT + SIMD_MAXSZ_BITS) +#define SIMD_DATA_BITS (32 - SIMD_DATA_SHIFT) + +/* Create a descriptor from components. */ +uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data); + +/* Extract the operation size from a descriptor. */ +static inline intptr_t simd_oprsz(uint32_t desc) +{ + return (extract32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS) + 1) * 8; +} + +/* Extract the max vector size from a descriptor. 
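As a concrete encoding: simd_desc(16, 16, 0) stores (16 / 8) - 1 = 1 in each 5-bit size field, giving desc = 0x21, from which simd_oprsz and simd_maxsz both recover (1 + 1) * 8 = 16 bytes.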
*/ +static inline intptr_t simd_maxsz(uint32_t desc) +{ + return (extract32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS) + 1) * 8; +} + +/* Extract the operation-specific data from a descriptor. */ +static inline int32_t simd_data(uint32_t desc) +{ + return sextract32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS); +} diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c new file mode 100644 index 0000000000..bfe44bba81 --- /dev/null +++ b/tcg/tcg-op-gvec.c @@ -0,0 +1,2216 @@ +/* + * Generic vector operation expansion + * + * Copyright (c) 2018 Linaro + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "tcg.h" +#include "tcg-op.h" +#include "tcg-op-gvec.h" +#include "tcg-gvec-desc.h" + +#define MAX_UNROLL 4 + +/* Verify vector size and alignment rules. OFS should be the OR of all + of the operand offsets so that we can check them all at once. */ +static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs) +{ + uint32_t opr_align = oprsz >= 16 ? 15 : 7; + uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7; + tcg_debug_assert(oprsz > 0); + tcg_debug_assert(oprsz <= maxsz); + tcg_debug_assert((oprsz & opr_align) == 0); + tcg_debug_assert((maxsz & max_align) == 0); + tcg_debug_assert((ofs & max_align) == 0); +} + +/* Verify vector overlap rules for two operands. */ +static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s) +{ + tcg_debug_assert(d == a || d + s <= a || a + s <= d); +} + +/* Verify vector overlap rules for three operands. */ +static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s) +{ + check_overlap_2(d, a, s); + check_overlap_2(d, b, s); + check_overlap_2(a, b, s); +} + +/* Verify vector overlap rules for four operands. */ +static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b, + uint32_t c, uint32_t s) +{ + check_overlap_2(d, a, s); + check_overlap_2(d, b, s); + check_overlap_2(d, c, s); + check_overlap_2(a, b, s); + check_overlap_2(a, c, s); + check_overlap_2(b, c, s); +} + +/* Create a descriptor from components. */ +uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data) +{ + uint32_t desc = 0; + + assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS)); + assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS)); + assert(data == sextract32(data, 0, SIMD_DATA_BITS)); + + oprsz = (oprsz / 8) - 1; + maxsz = (maxsz / 8) - 1; + desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz); + desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz); + desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data); + + return desc; +} + +/* Generate a call to a gvec-style helper with two vector operands. 
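The out-of-line helper on the receiving end follows a fixed shape, sketched here with an illustrative name rather than any real helper:

    void helper_gvec_foo(void *d, void *a, uint32_t desc)
    {
        intptr_t i, opr_sz = simd_oprsz(desc);
        for (i = 0; i < opr_sz; i += sizeof(uint64_t)) {
            *(uint64_t *)(d + i) = ~*(uint64_t *)(a + i);
        }
    }

i.e. raw env-relative pointers plus the 32-bit descriptor built above.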
*/ +void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t maxsz, int32_t data, + gen_helper_gvec_2 *fn) +{ + TCGv_ptr a0, a1; + TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); + + a0 = tcg_temp_new_ptr(); + a1 = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(a0, cpu_env, dofs); + tcg_gen_addi_ptr(a1, cpu_env, aofs); + + fn(a0, a1, desc); + + tcg_temp_free_ptr(a0); + tcg_temp_free_ptr(a1); + tcg_temp_free_i32(desc); +} + +/* Generate a call to a gvec-style helper with two vector operands + and one scalar operand. */ +void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c, + uint32_t oprsz, uint32_t maxsz, int32_t data, + gen_helper_gvec_2i *fn) +{ + TCGv_ptr a0, a1; + TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); + + a0 = tcg_temp_new_ptr(); + a1 = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(a0, cpu_env, dofs); + tcg_gen_addi_ptr(a1, cpu_env, aofs); + + fn(a0, a1, c, desc); + + tcg_temp_free_ptr(a0); + tcg_temp_free_ptr(a1); + tcg_temp_free_i32(desc); +} + +/* Generate a call to a gvec-style helper with three vector operands. */ +void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t oprsz, uint32_t maxsz, int32_t data, + gen_helper_gvec_3 *fn) +{ + TCGv_ptr a0, a1, a2; + TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); + + a0 = tcg_temp_new_ptr(); + a1 = tcg_temp_new_ptr(); + a2 = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(a0, cpu_env, dofs); + tcg_gen_addi_ptr(a1, cpu_env, aofs); + tcg_gen_addi_ptr(a2, cpu_env, bofs); + + fn(a0, a1, a2, desc); + + tcg_temp_free_ptr(a0); + tcg_temp_free_ptr(a1); + tcg_temp_free_ptr(a2); + tcg_temp_free_i32(desc); +} + +/* Generate a call to a gvec-style helper with four vector operands. */ +void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t cofs, uint32_t oprsz, uint32_t maxsz, + int32_t data, gen_helper_gvec_4 *fn) +{ + TCGv_ptr a0, a1, a2, a3; + TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); + + a0 = tcg_temp_new_ptr(); + a1 = tcg_temp_new_ptr(); + a2 = tcg_temp_new_ptr(); + a3 = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(a0, cpu_env, dofs); + tcg_gen_addi_ptr(a1, cpu_env, aofs); + tcg_gen_addi_ptr(a2, cpu_env, bofs); + tcg_gen_addi_ptr(a3, cpu_env, cofs); + + fn(a0, a1, a2, a3, desc); + + tcg_temp_free_ptr(a0); + tcg_temp_free_ptr(a1); + tcg_temp_free_ptr(a2); + tcg_temp_free_ptr(a3); + tcg_temp_free_i32(desc); +} + +/* Generate a call to a gvec-style helper with five vector operands. */ +void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t cofs, uint32_t xofs, uint32_t oprsz, + uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn) +{ + TCGv_ptr a0, a1, a2, a3, a4; + TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); + + a0 = tcg_temp_new_ptr(); + a1 = tcg_temp_new_ptr(); + a2 = tcg_temp_new_ptr(); + a3 = tcg_temp_new_ptr(); + a4 = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(a0, cpu_env, dofs); + tcg_gen_addi_ptr(a1, cpu_env, aofs); + tcg_gen_addi_ptr(a2, cpu_env, bofs); + tcg_gen_addi_ptr(a3, cpu_env, cofs); + tcg_gen_addi_ptr(a4, cpu_env, xofs); + + fn(a0, a1, a2, a3, a4, desc); + + tcg_temp_free_ptr(a0); + tcg_temp_free_ptr(a1); + tcg_temp_free_ptr(a2); + tcg_temp_free_ptr(a3); + tcg_temp_free_ptr(a4); + tcg_temp_free_i32(desc); +} + +/* Generate a call to a gvec-style helper with two vector operands + and an extra pointer operand.
*/ +void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs, + TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz, + int32_t data, gen_helper_gvec_2_ptr *fn) +{ + TCGv_ptr a0, a1; + TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); + + a0 = tcg_temp_new_ptr(); + a1 = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(a0, cpu_env, dofs); + tcg_gen_addi_ptr(a1, cpu_env, aofs); + + fn(a0, a1, ptr, desc); + + tcg_temp_free_ptr(a0); + tcg_temp_free_ptr(a1); + tcg_temp_free_i32(desc); +} + +/* Generate a call to a gvec-style helper with three vector operands + and an extra pointer operand. */ +void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs, + TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz, + int32_t data, gen_helper_gvec_3_ptr *fn) +{ + TCGv_ptr a0, a1, a2; + TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); + + a0 = tcg_temp_new_ptr(); + a1 = tcg_temp_new_ptr(); + a2 = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(a0, cpu_env, dofs); + tcg_gen_addi_ptr(a1, cpu_env, aofs); + tcg_gen_addi_ptr(a2, cpu_env, bofs); + + fn(a0, a1, a2, ptr, desc); + + tcg_temp_free_ptr(a0); + tcg_temp_free_ptr(a1); + tcg_temp_free_ptr(a2); + tcg_temp_free_i32(desc); +} + +/* Generate a call to a gvec-style helper with four vector operands + and an extra pointer operand. */ +void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz, + uint32_t maxsz, int32_t data, + gen_helper_gvec_4_ptr *fn) +{ + TCGv_ptr a0, a1, a2, a3; + TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data)); + + a0 = tcg_temp_new_ptr(); + a1 = tcg_temp_new_ptr(); + a2 = tcg_temp_new_ptr(); + a3 = tcg_temp_new_ptr(); + + tcg_gen_addi_ptr(a0, cpu_env, dofs); + tcg_gen_addi_ptr(a1, cpu_env, aofs); + tcg_gen_addi_ptr(a2, cpu_env, bofs); + tcg_gen_addi_ptr(a3, cpu_env, cofs); + + fn(a0, a1, a2, a3, ptr, desc); + + tcg_temp_free_ptr(a0); + tcg_temp_free_ptr(a1); + tcg_temp_free_ptr(a2); + tcg_temp_free_ptr(a3); + tcg_temp_free_i32(desc); +} + +/* Return true if we want to implement something of OPRSZ bytes + in units of LNSZ. This limits the expansion of inline code. */ +static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz) +{ + uint32_t lnct = oprsz / lnsz; + return lnct >= 1 && lnct <= MAX_UNROLL; +} + +static void expand_clr(uint32_t dofs, uint32_t maxsz); + +/* Duplicate C as per VECE. */ +uint64_t (dup_const)(unsigned vece, uint64_t c) +{ + switch (vece) { + case MO_8: + return 0x0101010101010101ull * (uint8_t)c; + case MO_16: + return 0x0001000100010001ull * (uint16_t)c; + case MO_32: + return 0x0000000100000001ull * (uint32_t)c; + case MO_64: + return c; + default: + g_assert_not_reached(); + } +} + +/* Duplicate IN into OUT as per VECE. 
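The multiply-by-replicated-ones trick used below (and in dup_const above) is easiest to see with numbers: for MO_8, (uint8_t)0xab * 0x0101010101010101ull == 0xababababababababull, i.e. a single multiplication broadcasts the byte into every lane.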
*/ +static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in) +{ + switch (vece) { + case MO_8: + tcg_gen_ext8u_i32(out, in); + tcg_gen_muli_i32(out, out, 0x01010101); + break; + case MO_16: + tcg_gen_deposit_i32(out, in, in, 16, 16); + break; + case MO_32: + tcg_gen_mov_i32(out, in); + break; + default: + g_assert_not_reached(); + } +} + +static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in) +{ + switch (vece) { + case MO_8: + tcg_gen_ext8u_i64(out, in); + tcg_gen_muli_i64(out, out, 0x0101010101010101ull); + break; + case MO_16: + tcg_gen_ext16u_i64(out, in); + tcg_gen_muli_i64(out, out, 0x0001000100010001ull); + break; + case MO_32: + tcg_gen_deposit_i64(out, in, in, 32, 32); + break; + case MO_64: + tcg_gen_mov_i64(out, in); + break; + default: + g_assert_not_reached(); + } +} + +/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C. + * Only one of IN_32 or IN_64 may be set; + * IN_C is used if IN_32 and IN_64 are unset. + */ +static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz, + uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64, + uint64_t in_c) +{ + TCGType type; + TCGv_i64 t_64; + TCGv_i32 t_32, t_desc; + TCGv_ptr t_ptr; + uint32_t i; + + assert(vece <= (in_32 ? MO_32 : MO_64)); + assert(in_32 == NULL || in_64 == NULL); + + /* If we're storing 0, expand oprsz to maxsz. */ + if (in_32 == NULL && in_64 == NULL) { + in_c = dup_const(vece, in_c); + if (in_c == 0) { + oprsz = maxsz; + } + } + + type = 0; + if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) { + type = TCG_TYPE_V256; + } else if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) { + type = TCG_TYPE_V128; + } else if (TCG_TARGET_HAS_v64 && check_size_impl(oprsz, 8) + /* Prefer integer when 64-bit host and no variable dup. */ + && !(TCG_TARGET_REG_BITS == 64 && in_32 == NULL + && (in_64 == NULL || vece == MO_64))) { + type = TCG_TYPE_V64; + } + + /* Implement inline with a vector type, if possible. */ + if (type != 0) { + TCGv_vec t_vec = tcg_temp_new_vec(type); + + if (in_32) { + tcg_gen_dup_i32_vec(vece, t_vec, in_32); + } else if (in_64) { + tcg_gen_dup_i64_vec(vece, t_vec, in_64); + } else { + switch (vece) { + case MO_8: + tcg_gen_dup8i_vec(t_vec, in_c); + break; + case MO_16: + tcg_gen_dup16i_vec(t_vec, in_c); + break; + case MO_32: + tcg_gen_dup32i_vec(t_vec, in_c); + break; + default: + tcg_gen_dup64i_vec(t_vec, in_c); + break; + } + } + + i = 0; + if (TCG_TARGET_HAS_v256) { + for (; i + 32 <= oprsz; i += 32) { + tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256); + } + } + if (TCG_TARGET_HAS_v128) { + for (; i + 16 <= oprsz; i += 16) { + tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128); + } + } + if (TCG_TARGET_HAS_v64) { + for (; i < oprsz; i += 8) { + tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64); + } + } + tcg_temp_free_vec(t_vec); + goto done; + } + + /* Otherwise, inline with an integer type, unless "large". */ + if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) { + t_64 = NULL; + t_32 = NULL; + + if (in_32) { + /* We are given a 32-bit variable input. For a 64-bit host, + use a 64-bit operation unless the 32-bit operation would + be simple enough. */ + if (TCG_TARGET_REG_BITS == 64 + && (vece != MO_32 || !check_size_impl(oprsz, 4))) { + t_64 = tcg_temp_new_i64(); + tcg_gen_extu_i32_i64(t_64, in_32); + gen_dup_i64(vece, t_64, t_64); + } else { + t_32 = tcg_temp_new_i32(); + gen_dup_i32(vece, t_32, in_32); + } + } else if (in_64) { + /* We are given a 64-bit variable input. 
*/ + t_64 = tcg_temp_new_i64(); + gen_dup_i64(vece, t_64, in_64); + } else { + /* We are given a constant input. */ + /* For 64-bit hosts, use 64-bit constants for "simple" constants + or when we'd need too many 32-bit stores, or when a 64-bit + constant is really required. */ + if (vece == MO_64 + || (TCG_TARGET_REG_BITS == 64 + && (in_c == 0 || in_c == -1 + || !check_size_impl(oprsz, 4)))) { + t_64 = tcg_const_i64(in_c); + } else { + t_32 = tcg_const_i32(in_c); + } + } + + /* Implement inline if we picked an implementation size above. */ + if (t_32) { + for (i = 0; i < oprsz; i += 4) { + tcg_gen_st_i32(t_32, cpu_env, dofs + i); + } + tcg_temp_free_i32(t_32); + goto done; + } + if (t_64) { + for (i = 0; i < oprsz; i += 8) { + tcg_gen_st_i64(t_64, cpu_env, dofs + i); + } + tcg_temp_free_i64(t_64); + goto done; + } + } + + /* Otherwise implement out of line. */ + t_ptr = tcg_temp_new_ptr(); + tcg_gen_addi_ptr(t_ptr, cpu_env, dofs); + t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0)); + + if (vece == MO_64) { + if (in_64) { + gen_helper_gvec_dup64(t_ptr, t_desc, in_64); + } else { + t_64 = tcg_const_i64(in_c); + gen_helper_gvec_dup64(t_ptr, t_desc, t_64); + tcg_temp_free_i64(t_64); + } + } else { + typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32); + static dup_fn * const fns[3] = { + gen_helper_gvec_dup8, + gen_helper_gvec_dup16, + gen_helper_gvec_dup32 + }; + + if (in_32) { + fns[vece](t_ptr, t_desc, in_32); + } else { + t_32 = tcg_temp_new_i32(); + if (in_64) { + tcg_gen_extrl_i64_i32(t_32, in_64); + } else if (vece == MO_8) { + tcg_gen_movi_i32(t_32, in_c & 0xff); + } else if (vece == MO_16) { + tcg_gen_movi_i32(t_32, in_c & 0xffff); + } else { + tcg_gen_movi_i32(t_32, in_c); + } + fns[vece](t_ptr, t_desc, t_32); + tcg_temp_free_i32(t_32); + } + } + + tcg_temp_free_ptr(t_ptr); + tcg_temp_free_i32(t_desc); + return; + + done: + if (oprsz < maxsz) { + expand_clr(dofs + oprsz, maxsz - oprsz); + } +} + +/* Likewise, but with zero. */ +static void expand_clr(uint32_t dofs, uint32_t maxsz) +{ + do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0); +} + +/* Expand OPSZ bytes worth of two-operand operations using i32 elements. */ +static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, + void (*fni)(TCGv_i32, TCGv_i32)) +{ + TCGv_i32 t0 = tcg_temp_new_i32(); + uint32_t i; + + for (i = 0; i < oprsz; i += 4) { + tcg_gen_ld_i32(t0, cpu_env, aofs + i); + fni(t0, t0); + tcg_gen_st_i32(t0, cpu_env, dofs + i); + } + tcg_temp_free_i32(t0); +} + +static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, + int32_t c, bool load_dest, + void (*fni)(TCGv_i32, TCGv_i32, int32_t)) +{ + TCGv_i32 t0 = tcg_temp_new_i32(); + TCGv_i32 t1 = tcg_temp_new_i32(); + uint32_t i; + + for (i = 0; i < oprsz; i += 4) { + tcg_gen_ld_i32(t0, cpu_env, aofs + i); + if (load_dest) { + tcg_gen_ld_i32(t1, cpu_env, dofs + i); + } + fni(t1, t0, c); + tcg_gen_st_i32(t1, cpu_env, dofs + i); + } + tcg_temp_free_i32(t0); + tcg_temp_free_i32(t1); +} + +static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz, + TCGv_i32 c, bool scalar_first, + void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32)) +{ + TCGv_i32 t0 = tcg_temp_new_i32(); + TCGv_i32 t1 = tcg_temp_new_i32(); + uint32_t i; + + for (i = 0; i < oprsz; i += 4) { + tcg_gen_ld_i32(t0, cpu_env, aofs + i); + if (scalar_first) { + fni(t1, c, t0); + } else { + fni(t1, t0, c); + } + tcg_gen_st_i32(t1, cpu_env, dofs + i); + } + tcg_temp_free_i32(t0); + tcg_temp_free_i32(t1); +} + +/* Expand OPSZ bytes worth of three-operand operations using i32 elements. 
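As a concrete (informal) example: with oprsz == 16, load_dest false and fni = tcg_gen_and_i32, the loop below emits four load/load/op/store groups at byte offsets 0, 4, 8 and 12; that is the complete inline expansion of a 128-bit AND done with i32 arithmetic.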
*/ +static void expand_3_i32(uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, bool load_dest, + void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32)) +{ + TCGv_i32 t0 = tcg_temp_new_i32(); + TCGv_i32 t1 = tcg_temp_new_i32(); + TCGv_i32 t2 = tcg_temp_new_i32(); + uint32_t i; + + for (i = 0; i < oprsz; i += 4) { + tcg_gen_ld_i32(t0, cpu_env, aofs + i); + tcg_gen_ld_i32(t1, cpu_env, bofs + i); + if (load_dest) { + tcg_gen_ld_i32(t2, cpu_env, dofs + i); + } + fni(t2, t0, t1); + tcg_gen_st_i32(t2, cpu_env, dofs + i); + } + tcg_temp_free_i32(t2); + tcg_temp_free_i32(t1); + tcg_temp_free_i32(t0); +} + +/* Expand OPSZ bytes worth of four-operand operations using i32 elements. */ +static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t cofs, uint32_t oprsz, + void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32)) +{ + TCGv_i32 t0 = tcg_temp_new_i32(); + TCGv_i32 t1 = tcg_temp_new_i32(); + TCGv_i32 t2 = tcg_temp_new_i32(); + TCGv_i32 t3 = tcg_temp_new_i32(); + uint32_t i; + + for (i = 0; i < oprsz; i += 4) { + tcg_gen_ld_i32(t1, cpu_env, aofs + i); + tcg_gen_ld_i32(t2, cpu_env, bofs + i); + tcg_gen_ld_i32(t3, cpu_env, cofs + i); + fni(t0, t1, t2, t3); + tcg_gen_st_i32(t0, cpu_env, dofs + i); + } + tcg_temp_free_i32(t3); + tcg_temp_free_i32(t2); + tcg_temp_free_i32(t1); + tcg_temp_free_i32(t0); +} + +/* Expand OPSZ bytes worth of two-operand operations using i64 elements. */ +static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz, + void (*fni)(TCGv_i64, TCGv_i64)) +{ + TCGv_i64 t0 = tcg_temp_new_i64(); + uint32_t i; + + for (i = 0; i < oprsz; i += 8) { + tcg_gen_ld_i64(t0, cpu_env, aofs + i); + fni(t0, t0); + tcg_gen_st_i64(t0, cpu_env, dofs + i); + } + tcg_temp_free_i64(t0); +} + +static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz, + int64_t c, bool load_dest, + void (*fni)(TCGv_i64, TCGv_i64, int64_t)) +{ + TCGv_i64 t0 = tcg_temp_new_i64(); + TCGv_i64 t1 = tcg_temp_new_i64(); + uint32_t i; + + for (i = 0; i < oprsz; i += 8) { + tcg_gen_ld_i64(t0, cpu_env, aofs + i); + if (load_dest) { + tcg_gen_ld_i64(t1, cpu_env, dofs + i); + } + fni(t1, t0, c); + tcg_gen_st_i64(t1, cpu_env, dofs + i); + } + tcg_temp_free_i64(t0); + tcg_temp_free_i64(t1); +} + +static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz, + TCGv_i64 c, bool scalar_first, + void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64)) +{ + TCGv_i64 t0 = tcg_temp_new_i64(); + TCGv_i64 t1 = tcg_temp_new_i64(); + uint32_t i; + + for (i = 0; i < oprsz; i += 8) { + tcg_gen_ld_i64(t0, cpu_env, aofs + i); + if (scalar_first) { + fni(t1, c, t0); + } else { + fni(t1, t0, c); + } + tcg_gen_st_i64(t1, cpu_env, dofs + i); + } + tcg_temp_free_i64(t0); + tcg_temp_free_i64(t1); +} + +/* Expand OPSZ bytes worth of three-operand operations using i64 elements. */ +static void expand_3_i64(uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, bool load_dest, + void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64)) +{ + TCGv_i64 t0 = tcg_temp_new_i64(); + TCGv_i64 t1 = tcg_temp_new_i64(); + TCGv_i64 t2 = tcg_temp_new_i64(); + uint32_t i; + + for (i = 0; i < oprsz; i += 8) { + tcg_gen_ld_i64(t0, cpu_env, aofs + i); + tcg_gen_ld_i64(t1, cpu_env, bofs + i); + if (load_dest) { + tcg_gen_ld_i64(t2, cpu_env, dofs + i); + } + fni(t2, t0, t1); + tcg_gen_st_i64(t2, cpu_env, dofs + i); + } + tcg_temp_free_i64(t2); + tcg_temp_free_i64(t1); + tcg_temp_free_i64(t0); +} + +/* Expand OPSZ bytes worth of four-operand operations using i64 elements.
*/ +static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t cofs, uint32_t oprsz, + void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64)) +{ + TCGv_i64 t0 = tcg_temp_new_i64(); + TCGv_i64 t1 = tcg_temp_new_i64(); + TCGv_i64 t2 = tcg_temp_new_i64(); + TCGv_i64 t3 = tcg_temp_new_i64(); + uint32_t i; + + for (i = 0; i < oprsz; i += 8) { + tcg_gen_ld_i64(t1, cpu_env, aofs + i); + tcg_gen_ld_i64(t2, cpu_env, bofs + i); + tcg_gen_ld_i64(t3, cpu_env, cofs + i); + fni(t0, t1, t2, t3); + tcg_gen_st_i64(t0, cpu_env, dofs + i); + } + tcg_temp_free_i64(t3); + tcg_temp_free_i64(t2); + tcg_temp_free_i64(t1); + tcg_temp_free_i64(t0); +} + +/* Expand OPSZ bytes worth of two-operand operations using host vectors. */ +static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t tysz, TCGType type, + void (*fni)(unsigned, TCGv_vec, TCGv_vec)) +{ + TCGv_vec t0 = tcg_temp_new_vec(type); + uint32_t i; + + for (i = 0; i < oprsz; i += tysz) { + tcg_gen_ld_vec(t0, cpu_env, aofs + i); + fni(vece, t0, t0); + tcg_gen_st_vec(t0, cpu_env, dofs + i); + } + tcg_temp_free_vec(t0); +} + +/* Expand OPSZ bytes worth of two-vector operands and an immediate operand + using host vectors. */ +static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t tysz, TCGType type, + int64_t c, bool load_dest, + void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t)) +{ + TCGv_vec t0 = tcg_temp_new_vec(type); + TCGv_vec t1 = tcg_temp_new_vec(type); + uint32_t i; + + for (i = 0; i < oprsz; i += tysz) { + tcg_gen_ld_vec(t0, cpu_env, aofs + i); + if (load_dest) { + tcg_gen_ld_vec(t1, cpu_env, dofs + i); + } + fni(vece, t1, t0, c); + tcg_gen_st_vec(t1, cpu_env, dofs + i); + } + tcg_temp_free_vec(t0); + tcg_temp_free_vec(t1); +} + +static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t tysz, TCGType type, + TCGv_vec c, bool scalar_first, + void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec)) +{ + TCGv_vec t0 = tcg_temp_new_vec(type); + TCGv_vec t1 = tcg_temp_new_vec(type); + uint32_t i; + + for (i = 0; i < oprsz; i += tysz) { + tcg_gen_ld_vec(t0, cpu_env, aofs + i); + if (scalar_first) { + fni(vece, t1, c, t0); + } else { + fni(vece, t1, t0, c); + } + tcg_gen_st_vec(t1, cpu_env, dofs + i); + } + tcg_temp_free_vec(t0); + tcg_temp_free_vec(t1); +} + +/* Expand OPSZ bytes worth of three-operand operations using host vectors. */ +static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, + uint32_t tysz, TCGType type, bool load_dest, + void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec)) +{ + TCGv_vec t0 = tcg_temp_new_vec(type); + TCGv_vec t1 = tcg_temp_new_vec(type); + TCGv_vec t2 = tcg_temp_new_vec(type); + uint32_t i; + + for (i = 0; i < oprsz; i += tysz) { + tcg_gen_ld_vec(t0, cpu_env, aofs + i); + tcg_gen_ld_vec(t1, cpu_env, bofs + i); + if (load_dest) { + tcg_gen_ld_vec(t2, cpu_env, dofs + i); + } + fni(vece, t2, t0, t1); + tcg_gen_st_vec(t2, cpu_env, dofs + i); + } + tcg_temp_free_vec(t2); + tcg_temp_free_vec(t1); + tcg_temp_free_vec(t0); +} + +/* Expand OPSZ bytes worth of four-operand operations using host vectors. 
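Note that TYSZ is the byte stride per loop iteration and must match TYPE; the callers below always pair them as 32 with TCG_TYPE_V256, 16 with TCG_TYPE_V128, or 8 with TCG_TYPE_V64.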
*/ +static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t cofs, uint32_t oprsz, + uint32_t tysz, TCGType type, + void (*fni)(unsigned, TCGv_vec, TCGv_vec, + TCGv_vec, TCGv_vec)) +{ + TCGv_vec t0 = tcg_temp_new_vec(type); + TCGv_vec t1 = tcg_temp_new_vec(type); + TCGv_vec t2 = tcg_temp_new_vec(type); + TCGv_vec t3 = tcg_temp_new_vec(type); + uint32_t i; + + for (i = 0; i < oprsz; i += tysz) { + tcg_gen_ld_vec(t1, cpu_env, aofs + i); + tcg_gen_ld_vec(t2, cpu_env, bofs + i); + tcg_gen_ld_vec(t3, cpu_env, cofs + i); + fni(vece, t0, t1, t2, t3); + tcg_gen_st_vec(t0, cpu_env, dofs + i); + } + tcg_temp_free_vec(t3); + tcg_temp_free_vec(t2); + tcg_temp_free_vec(t1); + tcg_temp_free_vec(t0); +} + +/* Expand a vector two-operand operation. */ +void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g) +{ + check_size_align(oprsz, maxsz, dofs | aofs); + check_overlap_2(dofs, aofs, maxsz); + + /* Recall that ARM SVE allows vector sizes that are not a power of 2. + Expand with successively smaller host vector sizes. The intent is + that e.g. oprsz == 80 would be expanded with 2x32 + 1x16. */ + /* ??? For maxsz > oprsz, the host may be able to use an opr-sized + operation, zeroing the balance of the register. We can then + use a max-sized store to implement the clearing without an extra + store operation. This is true for aarch64 and x86_64 hosts. */ + + if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32) + && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) { + uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32); + expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv); + if (some == oprsz) { + goto done; + } + dofs += some; + aofs += some; + oprsz -= some; + maxsz -= some; + } + + if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16) + && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) { + expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv); + } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64 + && g->fniv && check_size_impl(oprsz, 8) + && (!g->opc + || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) { + expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv); + } else if (g->fni8 && check_size_impl(oprsz, 8)) { + expand_2_i64(dofs, aofs, oprsz, g->fni8); + } else if (g->fni4 && check_size_impl(oprsz, 4)) { + expand_2_i32(dofs, aofs, oprsz, g->fni4); + } else { + assert(g->fno != NULL); + tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno); + return; + } + + done: + if (oprsz < maxsz) { + expand_clr(dofs + oprsz, maxsz - oprsz); + } +} + +/* Expand a vector operation with two vectors and an immediate. */ +void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz, + uint32_t maxsz, int64_t c, const GVecGen2i *g) +{ + check_size_align(oprsz, maxsz, dofs | aofs); + check_overlap_2(dofs, aofs, maxsz); + + /* Recall that ARM SVE allows vector sizes that are not a power of 2. + Expand with successively smaller host vector sizes. The intent is + that e.g. oprsz == 80 would be expanded with 2x32 + 1x16. 
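Walking the 80-byte case through the code below: QEMU_ALIGN_DOWN(80, 32) == 64, so two V256 iterations cover bytes [0, 64), the offsets then advance by 64, oprsz drops to 16, and the V128 branch finishes the remainder.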
*/ + + if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32) + && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) { + uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32); + expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, + c, g->load_dest, g->fniv); + if (some == oprsz) { + goto done; + } + dofs += some; + aofs += some; + oprsz -= some; + maxsz -= some; + } + + if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16) + && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) { + expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, + c, g->load_dest, g->fniv); + } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64 + && g->fniv && check_size_impl(oprsz, 8) + && (!g->opc + || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) { + expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, + c, g->load_dest, g->fniv); + } else if (g->fni8 && check_size_impl(oprsz, 8)) { + expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8); + } else if (g->fni4 && check_size_impl(oprsz, 4)) { + expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4); + } else { + if (g->fno) { + tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno); + } else { + TCGv_i64 tcg_c = tcg_const_i64(c); + tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz, maxsz, c, g->fnoi); + tcg_temp_free_i64(tcg_c); + } + return; + } + + done: + if (oprsz < maxsz) { + expand_clr(dofs + oprsz, maxsz - oprsz); + } +} + +/* Expand a vector operation with two vectors and a scalar. */ +void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz, + uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g) +{ + TCGType type; + + check_size_align(oprsz, maxsz, dofs | aofs); + check_overlap_2(dofs, aofs, maxsz); + + type = 0; + if (g->fniv) { + if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) { + type = TCG_TYPE_V256; + } else if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) { + type = TCG_TYPE_V128; + } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64 + && check_size_impl(oprsz, 8)) { + type = TCG_TYPE_V64; + } + } + if (type != 0) { + TCGv_vec t_vec = tcg_temp_new_vec(type); + + tcg_gen_dup_i64_vec(g->vece, t_vec, c); + + /* Recall that ARM SVE allows vector sizes that are not a power of 2. + Expand with successively smaller host vector sizes. The intent is + that e.g. oprsz == 80 would be expanded with 2x32 + 1x16. 
*/ + switch (type) { + case TCG_TYPE_V256: + { + uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32); + expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, + t_vec, g->scalar_first, g->fniv); + if (some == oprsz) { + break; + } + dofs += some; + aofs += some; + oprsz -= some; + maxsz -= some; + } + /* fallthru */ + + case TCG_TYPE_V128: + expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, + t_vec, g->scalar_first, g->fniv); + break; + + case TCG_TYPE_V64: + expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, + t_vec, g->scalar_first, g->fniv); + break; + + default: + g_assert_not_reached(); + } + tcg_temp_free_vec(t_vec); + } else if (g->fni8 && check_size_impl(oprsz, 8)) { + TCGv_i64 t64 = tcg_temp_new_i64(); + + gen_dup_i64(g->vece, t64, c); + expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8); + tcg_temp_free_i64(t64); + } else if (g->fni4 && check_size_impl(oprsz, 4)) { + TCGv_i32 t32 = tcg_temp_new_i32(); + + tcg_gen_extrl_i64_i32(t32, c); + gen_dup_i32(g->vece, t32, t32); + expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4); + tcg_temp_free_i32(t32); + } else { + tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno); + return; + } + + if (oprsz < maxsz) { + expand_clr(dofs + oprsz, maxsz - oprsz); + } +} + +/* Expand a vector three-operand operation. */ +void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g) +{ + check_size_align(oprsz, maxsz, dofs | aofs | bofs); + check_overlap_3(dofs, aofs, bofs, maxsz); + + /* Recall that ARM SVE allows vector sizes that are not a power of 2. + Expand with successively smaller host vector sizes. The intent is + that e.g. oprsz == 80 would be expanded with 2x32 + 1x16. */ + + if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32) + && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) { + uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32); + expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, + g->load_dest, g->fniv); + if (some == oprsz) { + goto done; + } + dofs += some; + aofs += some; + bofs += some; + oprsz -= some; + maxsz -= some; + } + + if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16) + && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) { + expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, + g->load_dest, g->fniv); + } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64 + && g->fniv && check_size_impl(oprsz, 8) + && (!g->opc + || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) { + expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, + g->load_dest, g->fniv); + } else if (g->fni8 && check_size_impl(oprsz, 8)) { + expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8); + } else if (g->fni4 && check_size_impl(oprsz, 4)) { + expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4); + } else { + assert(g->fno != NULL); + tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, g->data, g->fno); + } + + done: + if (oprsz < maxsz) { + expand_clr(dofs + oprsz, maxsz - oprsz); + } +} + +/* Expand a vector four-operand operation. */ +void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs, + uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g) +{ + check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs); + check_overlap_4(dofs, aofs, bofs, cofs, maxsz); + + /* Recall that ARM SVE allows vector sizes that are not a power of 2. + Expand with successively smaller host vector sizes. The intent is + that e.g. 
oprsz == 80 would be expanded with 2x32 + 1x16. */ + + if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32) + && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) { + uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32); + expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some, + 32, TCG_TYPE_V256, g->fniv); + if (some == oprsz) { + goto done; + } + dofs += some; + aofs += some; + bofs += some; + cofs += some; + oprsz -= some; + maxsz -= some; + } + + if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16) + && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) { + expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, + 16, TCG_TYPE_V128, g->fniv); + } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64 + && g->fniv && check_size_impl(oprsz, 8) + && (!g->opc + || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) { + expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz, + 8, TCG_TYPE_V64, g->fniv); + } else if (g->fni8 && check_size_impl(oprsz, 8)) { + expand_4_i64(dofs, aofs, bofs, cofs, oprsz, g->fni8); + } else if (g->fni4 && check_size_impl(oprsz, 4)) { + expand_4_i32(dofs, aofs, bofs, cofs, oprsz, g->fni4); + } else { + assert(g->fno != NULL); + tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs, + oprsz, maxsz, g->data, g->fno); + return; + } + + done: + if (oprsz < maxsz) { + expand_clr(dofs + oprsz, maxsz - oprsz); + } +} + +/* + * Expand specific vector operations. + */ + +static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b) +{ + tcg_gen_mov_vec(a, b); +} + +void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen2 g = { + .fni8 = tcg_gen_mov_i64, + .fniv = vec_mov2, + .fno = gen_helper_gvec_mov, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + if (dofs != aofs) { + tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g); + } else { + check_size_align(oprsz, maxsz, dofs); + if (oprsz < maxsz) { + expand_clr(dofs + oprsz, maxsz - oprsz); + } + } +} + +void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz, + uint32_t maxsz, TCGv_i32 in) +{ + check_size_align(oprsz, maxsz, dofs); + tcg_debug_assert(vece <= MO_32); + do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0); +} + +void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz, + uint32_t maxsz, TCGv_i64 in) +{ + check_size_align(oprsz, maxsz, dofs); + tcg_debug_assert(vece <= MO_64); + do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0); +} + +void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t maxsz) +{ + if (vece <= MO_32) { + TCGv_i32 in = tcg_temp_new_i32(); + switch (vece) { + case MO_8: + tcg_gen_ld8u_i32(in, cpu_env, aofs); + break; + case MO_16: + tcg_gen_ld16u_i32(in, cpu_env, aofs); + break; + case MO_32: + tcg_gen_ld_i32(in, cpu_env, aofs); + break; + } + tcg_gen_gvec_dup_i32(vece, dofs, oprsz, maxsz, in); + tcg_temp_free_i32(in); + } else if (vece == MO_64) { + TCGv_i64 in = tcg_temp_new_i64(); + tcg_gen_ld_i64(in, cpu_env, aofs); + tcg_gen_gvec_dup_i64(MO_64, dofs, oprsz, maxsz, in); + tcg_temp_free_i64(in); + } else { + /* 128-bit duplicate. */ + /* ??? Dup to 256-bit vector. 
*/ + int i; + + tcg_debug_assert(vece == 4); + tcg_debug_assert(oprsz >= 16); + if (TCG_TARGET_HAS_v128) { + TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128); + + tcg_gen_ld_vec(in, cpu_env, aofs); + for (i = 0; i < oprsz; i += 16) { + tcg_gen_st_vec(in, cpu_env, dofs + i); + } + tcg_temp_free_vec(in); + } else { + TCGv_i64 in0 = tcg_temp_new_i64(); + TCGv_i64 in1 = tcg_temp_new_i64(); + + tcg_gen_ld_i64(in0, cpu_env, aofs); + tcg_gen_ld_i64(in1, cpu_env, aofs + 8); + for (i = 0; i < oprsz; i += 16) { + tcg_gen_st_i64(in0, cpu_env, dofs + i); + tcg_gen_st_i64(in1, cpu_env, dofs + i + 8); + } + tcg_temp_free_i64(in0); + tcg_temp_free_i64(in1); + } + } +} + +void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz, + uint32_t maxsz, uint64_t x) +{ + check_size_align(oprsz, maxsz, dofs); + do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x); +} + +void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz, + uint32_t maxsz, uint32_t x) +{ + check_size_align(oprsz, maxsz, dofs); + do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x); +} + +void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz, + uint32_t maxsz, uint16_t x) +{ + check_size_align(oprsz, maxsz, dofs); + do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x); +} + +void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz, + uint32_t maxsz, uint8_t x) +{ + check_size_align(oprsz, maxsz, dofs); + do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x); +} + +void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen2 g = { + .fni8 = tcg_gen_not_i64, + .fniv = tcg_gen_not_vec, + .fno = gen_helper_gvec_not, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g); +} + +/* Perform a vector addition using normal addition and a mask. The mask + should be the sign bit of each lane. This 6-operation form is more + efficient than separate additions when there are 4 or more lanes in + the 64-bit operation. 
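In formula form, the sequence below computes d = ((a & ~m) + (b & ~m)) ^ ((a ^ b) & m): clearing the sign bit of every lane before the add means no carry can cross a lane boundary, and the final xor reinserts each lane's correct sign-bit sum while discarding its carry-out.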
*/ +static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m) +{ + TCGv_i64 t1 = tcg_temp_new_i64(); + TCGv_i64 t2 = tcg_temp_new_i64(); + TCGv_i64 t3 = tcg_temp_new_i64(); + + tcg_gen_andc_i64(t1, a, m); + tcg_gen_andc_i64(t2, b, m); + tcg_gen_xor_i64(t3, a, b); + tcg_gen_add_i64(d, t1, t2); + tcg_gen_and_i64(t3, t3, m); + tcg_gen_xor_i64(d, d, t3); + + tcg_temp_free_i64(t1); + tcg_temp_free_i64(t2); + tcg_temp_free_i64(t3); +} + +void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); + gen_addv_mask(d, a, b, m); + tcg_temp_free_i64(m); +} + +void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); + gen_addv_mask(d, a, b, m); + tcg_temp_free_i64(m); +} + +void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 t1 = tcg_temp_new_i64(); + TCGv_i64 t2 = tcg_temp_new_i64(); + + tcg_gen_andi_i64(t1, a, ~0xffffffffull); + tcg_gen_add_i64(t2, a, b); + tcg_gen_add_i64(t1, t1, b); + tcg_gen_deposit_i64(d, t1, t2, 0, 32); + + tcg_temp_free_i64(t1); + tcg_temp_free_i64(t2); +} + +void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g[4] = { + { .fni8 = tcg_gen_vec_add8_i64, + .fniv = tcg_gen_add_vec, + .fno = gen_helper_gvec_add8, + .opc = INDEX_op_add_vec, + .vece = MO_8 }, + { .fni8 = tcg_gen_vec_add16_i64, + .fniv = tcg_gen_add_vec, + .fno = gen_helper_gvec_add16, + .opc = INDEX_op_add_vec, + .vece = MO_16 }, + { .fni4 = tcg_gen_add_i32, + .fniv = tcg_gen_add_vec, + .fno = gen_helper_gvec_add32, + .opc = INDEX_op_add_vec, + .vece = MO_32 }, + { .fni8 = tcg_gen_add_i64, + .fniv = tcg_gen_add_vec, + .fno = gen_helper_gvec_add64, + .opc = INDEX_op_add_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); +} + +void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen2s g[4] = { + { .fni8 = tcg_gen_vec_add8_i64, + .fniv = tcg_gen_add_vec, + .fno = gen_helper_gvec_adds8, + .opc = INDEX_op_add_vec, + .vece = MO_8 }, + { .fni8 = tcg_gen_vec_add16_i64, + .fniv = tcg_gen_add_vec, + .fno = gen_helper_gvec_adds16, + .opc = INDEX_op_add_vec, + .vece = MO_16 }, + { .fni4 = tcg_gen_add_i32, + .fniv = tcg_gen_add_vec, + .fno = gen_helper_gvec_adds32, + .opc = INDEX_op_add_vec, + .vece = MO_32 }, + { .fni8 = tcg_gen_add_i64, + .fniv = tcg_gen_add_vec, + .fno = gen_helper_gvec_adds64, + .opc = INDEX_op_add_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); +} + +void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz) +{ + TCGv_i64 tmp = tcg_const_i64(c); + tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz); + tcg_temp_free_i64(tmp); +} + +void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen2s g[4] = { + { .fni8 = tcg_gen_vec_sub8_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_gvec_subs8, + .opc = INDEX_op_sub_vec, + .vece = MO_8 }, + { .fni8 = tcg_gen_vec_sub16_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_gvec_subs16, + .opc = INDEX_op_sub_vec, + .vece = MO_16 }, + { .fni4 = tcg_gen_sub_i32, + 
.fniv = tcg_gen_sub_vec, + .fno = gen_helper_gvec_subs32, + .opc = INDEX_op_sub_vec, + .vece = MO_32 }, + { .fni8 = tcg_gen_sub_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_gvec_subs64, + .opc = INDEX_op_sub_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); +} + +/* Perform a vector subtraction using normal subtraction and a mask. + Compare gen_addv_mask above. */ +static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m) +{ + TCGv_i64 t1 = tcg_temp_new_i64(); + TCGv_i64 t2 = tcg_temp_new_i64(); + TCGv_i64 t3 = tcg_temp_new_i64(); + + tcg_gen_or_i64(t1, a, m); + tcg_gen_andc_i64(t2, b, m); + tcg_gen_eqv_i64(t3, a, b); + tcg_gen_sub_i64(d, t1, t2); + tcg_gen_and_i64(t3, t3, m); + tcg_gen_xor_i64(d, d, t3); + + tcg_temp_free_i64(t1); + tcg_temp_free_i64(t2); + tcg_temp_free_i64(t3); +} + +void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); + gen_subv_mask(d, a, b, m); + tcg_temp_free_i64(m); +} + +void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); + gen_subv_mask(d, a, b, m); + tcg_temp_free_i64(m); +} + +void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 t1 = tcg_temp_new_i64(); + TCGv_i64 t2 = tcg_temp_new_i64(); + + tcg_gen_andi_i64(t1, b, ~0xffffffffull); + tcg_gen_sub_i64(t2, a, b); + tcg_gen_sub_i64(t1, a, t1); + tcg_gen_deposit_i64(d, t1, t2, 0, 32); + + tcg_temp_free_i64(t1); + tcg_temp_free_i64(t2); +} + +void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g[4] = { + { .fni8 = tcg_gen_vec_sub8_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_gvec_sub8, + .opc = INDEX_op_sub_vec, + .vece = MO_8 }, + { .fni8 = tcg_gen_vec_sub16_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_gvec_sub16, + .opc = INDEX_op_sub_vec, + .vece = MO_16 }, + { .fni4 = tcg_gen_sub_i32, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_gvec_sub32, + .opc = INDEX_op_sub_vec, + .vece = MO_32 }, + { .fni8 = tcg_gen_sub_i64, + .fniv = tcg_gen_sub_vec, + .fno = gen_helper_gvec_sub64, + .opc = INDEX_op_sub_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); +} + +void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g[4] = { + { .fniv = tcg_gen_mul_vec, + .fno = gen_helper_gvec_mul8, + .opc = INDEX_op_mul_vec, + .vece = MO_8 }, + { .fniv = tcg_gen_mul_vec, + .fno = gen_helper_gvec_mul16, + .opc = INDEX_op_mul_vec, + .vece = MO_16 }, + { .fni4 = tcg_gen_mul_i32, + .fniv = tcg_gen_mul_vec, + .fno = gen_helper_gvec_mul32, + .opc = INDEX_op_mul_vec, + .vece = MO_32 }, + { .fni8 = tcg_gen_mul_i64, + .fniv = tcg_gen_mul_vec, + .fno = gen_helper_gvec_mul64, + .opc = INDEX_op_mul_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); +} + +void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen2s g[4] = { + { .fniv = tcg_gen_mul_vec, + .fno = gen_helper_gvec_muls8, + .opc = INDEX_op_mul_vec, + .vece = MO_8 }, + { .fniv = tcg_gen_mul_vec, 
+ .fno = gen_helper_gvec_muls16, + .opc = INDEX_op_mul_vec, + .vece = MO_16 }, + { .fni4 = tcg_gen_mul_i32, + .fniv = tcg_gen_mul_vec, + .fno = gen_helper_gvec_muls32, + .opc = INDEX_op_mul_vec, + .vece = MO_32 }, + { .fni8 = tcg_gen_mul_i64, + .fniv = tcg_gen_mul_vec, + .fno = gen_helper_gvec_muls64, + .opc = INDEX_op_mul_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]); +} + +void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz) +{ + TCGv_i64 tmp = tcg_const_i64(c); + tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz); + tcg_temp_free_i64(tmp); +} + +void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g[4] = { + { .fno = gen_helper_gvec_ssadd8, .vece = MO_8 }, + { .fno = gen_helper_gvec_ssadd16, .vece = MO_16 }, + { .fno = gen_helper_gvec_ssadd32, .vece = MO_32 }, + { .fno = gen_helper_gvec_ssadd64, .vece = MO_64 } + }; + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); +} + +void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g[4] = { + { .fno = gen_helper_gvec_sssub8, .vece = MO_8 }, + { .fno = gen_helper_gvec_sssub16, .vece = MO_16 }, + { .fno = gen_helper_gvec_sssub32, .vece = MO_32 }, + { .fno = gen_helper_gvec_sssub64, .vece = MO_64 } + }; + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); +} + +static void tcg_gen_vec_usadd32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + TCGv_i32 max = tcg_const_i32(-1); + tcg_gen_add_i32(d, a, b); + tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d); + tcg_temp_free_i32(max); +} + +static void tcg_gen_vec_usadd32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 max = tcg_const_i64(-1); + tcg_gen_add_i64(d, a, b); + tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d); + tcg_temp_free_i64(max); +} + +void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g[4] = { + { .fno = gen_helper_gvec_usadd8, .vece = MO_8 }, + { .fno = gen_helper_gvec_usadd16, .vece = MO_16 }, + { .fni4 = tcg_gen_vec_usadd32_i32, + .fno = gen_helper_gvec_usadd32, + .vece = MO_32 }, + { .fni8 = tcg_gen_vec_usadd32_i64, + .fno = gen_helper_gvec_usadd64, + .vece = MO_64 } + }; + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); +} + +static void tcg_gen_vec_ussub32_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b) +{ + TCGv_i32 min = tcg_const_i32(0); + tcg_gen_sub_i32(d, a, b); + tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d); + tcg_temp_free_i32(min); +} + +static void tcg_gen_vec_ussub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b) +{ + TCGv_i64 min = tcg_const_i64(0); + tcg_gen_sub_i64(d, a, b); + tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d); + tcg_temp_free_i64(min); +} + +void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g[4] = { + { .fno = gen_helper_gvec_ussub8, .vece = MO_8 }, + { .fno = gen_helper_gvec_ussub16, .vece = MO_16 }, + { .fni4 = tcg_gen_vec_ussub32_i32, + .fno = gen_helper_gvec_ussub32, + .vece = MO_32 }, + { .fni8 = tcg_gen_vec_ussub32_i64, + .fno = gen_helper_gvec_ussub64, + .vece = MO_64 } 
+ }; + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]); +} + +/* Perform a vector negation using normal negation and a mask. + Compare gen_subv_mask above. */ +static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m) +{ + TCGv_i64 t2 = tcg_temp_new_i64(); + TCGv_i64 t3 = tcg_temp_new_i64(); + + tcg_gen_andc_i64(t3, m, b); + tcg_gen_andc_i64(t2, b, m); + tcg_gen_sub_i64(d, m, t2); + tcg_gen_xor_i64(d, d, t3); + + tcg_temp_free_i64(t2); + tcg_temp_free_i64(t3); +} + +void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b) +{ + TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80)); + gen_negv_mask(d, b, m); + tcg_temp_free_i64(m); +} + +void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b) +{ + TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000)); + gen_negv_mask(d, b, m); + tcg_temp_free_i64(m); +} + +void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b) +{ + TCGv_i64 t1 = tcg_temp_new_i64(); + TCGv_i64 t2 = tcg_temp_new_i64(); + + tcg_gen_andi_i64(t1, b, ~0xffffffffull); + tcg_gen_neg_i64(t2, b); + tcg_gen_neg_i64(t1, t1); + tcg_gen_deposit_i64(d, t1, t2, 0, 32); + + tcg_temp_free_i64(t1); + tcg_temp_free_i64(t2); +} + +void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen2 g[4] = { + { .fni8 = tcg_gen_vec_neg8_i64, + .fniv = tcg_gen_neg_vec, + .fno = gen_helper_gvec_neg8, + .opc = INDEX_op_neg_vec, + .vece = MO_8 }, + { .fni8 = tcg_gen_vec_neg16_i64, + .fniv = tcg_gen_neg_vec, + .fno = gen_helper_gvec_neg16, + .opc = INDEX_op_neg_vec, + .vece = MO_16 }, + { .fni4 = tcg_gen_neg_i32, + .fniv = tcg_gen_neg_vec, + .fno = gen_helper_gvec_neg32, + .opc = INDEX_op_neg_vec, + .vece = MO_32 }, + { .fni8 = tcg_gen_neg_i64, + .fniv = tcg_gen_neg_vec, + .fno = gen_helper_gvec_neg64, + .opc = INDEX_op_neg_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + + tcg_debug_assert(vece <= MO_64); + tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]); +} + +void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g = { + .fni8 = tcg_gen_and_i64, + .fniv = tcg_gen_and_vec, + .fno = gen_helper_gvec_and, + .opc = INDEX_op_and_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); +} + +void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g = { + .fni8 = tcg_gen_or_i64, + .fniv = tcg_gen_or_vec, + .fno = gen_helper_gvec_or, + .opc = INDEX_op_or_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); +} + +void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g = { + .fni8 = tcg_gen_xor_i64, + .fniv = tcg_gen_xor_vec, + .fno = gen_helper_gvec_xor, + .opc = INDEX_op_xor_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); +} + +void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g = { + .fni8 = tcg_gen_andc_i64, + .fniv = tcg_gen_andc_vec, + .fno = gen_helper_gvec_andc, + .opc = INDEX_op_andc_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); +} + +void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, 
uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen3 g = { + .fni8 = tcg_gen_orc_i64, + .fniv = tcg_gen_orc_vec, + .fno = gen_helper_gvec_orc, + .opc = INDEX_op_orc_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + }; + tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g); +} + +static const GVecGen2s gop_ands = { + .fni8 = tcg_gen_and_i64, + .fniv = tcg_gen_and_vec, + .fno = gen_helper_gvec_ands, + .opc = INDEX_op_and_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 +}; + +void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) +{ + TCGv_i64 tmp = tcg_temp_new_i64(); + gen_dup_i64(vece, tmp, c); + tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); + tcg_temp_free_i64(tmp); +} + +void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz) +{ + TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); + tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands); + tcg_temp_free_i64(tmp); +} + +static const GVecGen2s gop_xors = { + .fni8 = tcg_gen_xor_i64, + .fniv = tcg_gen_xor_vec, + .fno = gen_helper_gvec_xors, + .opc = INDEX_op_xor_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 +}; + +void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) +{ + TCGv_i64 tmp = tcg_temp_new_i64(); + gen_dup_i64(vece, tmp, c); + tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); + tcg_temp_free_i64(tmp); +} + +void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz) +{ + TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); + tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors); + tcg_temp_free_i64(tmp); +} + +static const GVecGen2s gop_ors = { + .fni8 = tcg_gen_or_i64, + .fniv = tcg_gen_or_vec, + .fno = gen_helper_gvec_ors, + .opc = INDEX_op_or_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 +}; + +void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz) +{ + TCGv_i64 tmp = tcg_temp_new_i64(); + gen_dup_i64(vece, tmp, c); + tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); + tcg_temp_free_i64(tmp); +} + +void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz) +{ + TCGv_i64 tmp = tcg_const_i64(dup_const(vece, c)); + tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors); + tcg_temp_free_i64(tmp); +} + +void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) +{ + uint64_t mask = dup_const(MO_8, 0xff << c); + tcg_gen_shli_i64(d, a, c); + tcg_gen_andi_i64(d, d, mask); +} + +void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) +{ + uint64_t mask = dup_const(MO_16, 0xffff << c); + tcg_gen_shli_i64(d, a, c); + tcg_gen_andi_i64(d, d, mask); +} + +void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen2i g[4] = { + { .fni8 = tcg_gen_vec_shl8i_i64, + .fniv = tcg_gen_shli_vec, + .fno = gen_helper_gvec_shl8i, + .opc = INDEX_op_shli_vec, + .vece = MO_8 }, + { .fni8 = tcg_gen_vec_shl16i_i64, + .fniv = tcg_gen_shli_vec, + .fno = gen_helper_gvec_shl16i, + .opc = INDEX_op_shli_vec, + .vece = MO_16 }, + { .fni4 = tcg_gen_shli_i32, + .fniv = tcg_gen_shli_vec, + .fno = gen_helper_gvec_shl32i, + .opc = INDEX_op_shli_vec, + .vece = MO_32 }, + { .fni8 = tcg_gen_shli_i64, + .fniv = tcg_gen_shli_vec, + .fno = 
gen_helper_gvec_shl64i, + .opc = INDEX_op_shli_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + + tcg_debug_assert(vece <= MO_64); + tcg_debug_assert(shift >= 0 && shift < (8 << vece)); + if (shift == 0) { + tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); + } else { + tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); + } +} + +void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) +{ + uint64_t mask = dup_const(MO_8, 0xff >> c); + tcg_gen_shri_i64(d, a, c); + tcg_gen_andi_i64(d, d, mask); +} + +void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) +{ + uint64_t mask = dup_const(MO_16, 0xffff >> c); + tcg_gen_shri_i64(d, a, c); + tcg_gen_andi_i64(d, d, mask); +} + +void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen2i g[4] = { + { .fni8 = tcg_gen_vec_shr8i_i64, + .fniv = tcg_gen_shri_vec, + .fno = gen_helper_gvec_shr8i, + .opc = INDEX_op_shri_vec, + .vece = MO_8 }, + { .fni8 = tcg_gen_vec_shr16i_i64, + .fniv = tcg_gen_shri_vec, + .fno = gen_helper_gvec_shr16i, + .opc = INDEX_op_shri_vec, + .vece = MO_16 }, + { .fni4 = tcg_gen_shri_i32, + .fniv = tcg_gen_shri_vec, + .fno = gen_helper_gvec_shr32i, + .opc = INDEX_op_shri_vec, + .vece = MO_32 }, + { .fni8 = tcg_gen_shri_i64, + .fniv = tcg_gen_shri_vec, + .fno = gen_helper_gvec_shr64i, + .opc = INDEX_op_shri_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + + tcg_debug_assert(vece <= MO_64); + tcg_debug_assert(shift >= 0 && shift < (8 << vece)); + if (shift == 0) { + tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); + } else { + tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); + } +} + +void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) +{ + uint64_t s_mask = dup_const(MO_8, 0x80 >> c); + uint64_t c_mask = dup_const(MO_8, 0xff >> c); + TCGv_i64 s = tcg_temp_new_i64(); + + tcg_gen_shri_i64(d, a, c); + tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ + tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ + tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ + tcg_gen_or_i64(d, d, s); /* include sign extension */ + tcg_temp_free_i64(s); +} + +void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c) +{ + uint64_t s_mask = dup_const(MO_16, 0x8000 >> c); + uint64_t c_mask = dup_const(MO_16, 0xffff >> c); + TCGv_i64 s = tcg_temp_new_i64(); + + tcg_gen_shri_i64(d, a, c); + tcg_gen_andi_i64(s, d, s_mask); /* isolate (shifted) sign bit */ + tcg_gen_andi_i64(d, d, c_mask); /* clear out bits above sign */ + tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */ + tcg_gen_or_i64(d, d, s); /* include sign extension */ + tcg_temp_free_i64(s); +} + +void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz) +{ + static const GVecGen2i g[4] = { + { .fni8 = tcg_gen_vec_sar8i_i64, + .fniv = tcg_gen_sari_vec, + .fno = gen_helper_gvec_sar8i, + .opc = INDEX_op_sari_vec, + .vece = MO_8 }, + { .fni8 = tcg_gen_vec_sar16i_i64, + .fniv = tcg_gen_sari_vec, + .fno = gen_helper_gvec_sar16i, + .opc = INDEX_op_sari_vec, + .vece = MO_16 }, + { .fni4 = tcg_gen_sari_i32, + .fniv = tcg_gen_sari_vec, + .fno = gen_helper_gvec_sar32i, + .opc = INDEX_op_sari_vec, + .vece = MO_32 }, + { .fni8 = tcg_gen_sari_i64, + .fniv = tcg_gen_sari_vec, + .fno = gen_helper_gvec_sar64i, + .opc = INDEX_op_sari_vec, + .prefer_i64 = TCG_TARGET_REG_BITS == 64, + .vece = MO_64 }, + }; + + 
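/* A worked example of the sar8i sequence above, for c == 1 on a 0x80
   byte lane (informal): the logical shift gives 0x40, s_mask isolates
   the shifted sign bit 0x40, multiplying by (2 << 1) - 2 == 2 smears
   it up to 0x80, c_mask keeps 0x40 of the shifted value, and the
   final OR yields 0xc0, i.e. (int8_t)0x80 >> 1. */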
tcg_debug_assert(vece <= MO_64); + tcg_debug_assert(shift >= 0 && shift < (8 << vece)); + if (shift == 0) { + tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz); + } else { + tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]); + } +} + +/* Expand OPSZ bytes worth of three-operand operations using i32 elements. */ +static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t oprsz, TCGCond cond) +{ + TCGv_i32 t0 = tcg_temp_new_i32(); + TCGv_i32 t1 = tcg_temp_new_i32(); + uint32_t i; + + for (i = 0; i < oprsz; i += 4) { + tcg_gen_ld_i32(t0, cpu_env, aofs + i); + tcg_gen_ld_i32(t1, cpu_env, bofs + i); + tcg_gen_setcond_i32(cond, t0, t0, t1); + tcg_gen_neg_i32(t0, t0); + tcg_gen_st_i32(t0, cpu_env, dofs + i); + } + tcg_temp_free_i32(t1); + tcg_temp_free_i32(t0); +} + +static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t oprsz, TCGCond cond) +{ + TCGv_i64 t0 = tcg_temp_new_i64(); + TCGv_i64 t1 = tcg_temp_new_i64(); + uint32_t i; + + for (i = 0; i < oprsz; i += 8) { + tcg_gen_ld_i64(t0, cpu_env, aofs + i); + tcg_gen_ld_i64(t1, cpu_env, bofs + i); + tcg_gen_setcond_i64(cond, t0, t0, t1); + tcg_gen_neg_i64(t0, t0); + tcg_gen_st_i64(t0, cpu_env, dofs + i); + } + tcg_temp_free_i64(t1); + tcg_temp_free_i64(t0); +} + +static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t tysz, + TCGType type, TCGCond cond) +{ + TCGv_vec t0 = tcg_temp_new_vec(type); + TCGv_vec t1 = tcg_temp_new_vec(type); + uint32_t i; + + for (i = 0; i < oprsz; i += tysz) { + tcg_gen_ld_vec(t0, cpu_env, aofs + i); + tcg_gen_ld_vec(t1, cpu_env, bofs + i); + tcg_gen_cmp_vec(cond, vece, t0, t0, t1); + tcg_gen_st_vec(t0, cpu_env, dofs + i); + } + tcg_temp_free_vec(t1); + tcg_temp_free_vec(t0); +} + +void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs, + uint32_t aofs, uint32_t bofs, + uint32_t oprsz, uint32_t maxsz) +{ + static gen_helper_gvec_3 * const eq_fn[4] = { + gen_helper_gvec_eq8, gen_helper_gvec_eq16, + gen_helper_gvec_eq32, gen_helper_gvec_eq64 + }; + static gen_helper_gvec_3 * const ne_fn[4] = { + gen_helper_gvec_ne8, gen_helper_gvec_ne16, + gen_helper_gvec_ne32, gen_helper_gvec_ne64 + }; + static gen_helper_gvec_3 * const lt_fn[4] = { + gen_helper_gvec_lt8, gen_helper_gvec_lt16, + gen_helper_gvec_lt32, gen_helper_gvec_lt64 + }; + static gen_helper_gvec_3 * const le_fn[4] = { + gen_helper_gvec_le8, gen_helper_gvec_le16, + gen_helper_gvec_le32, gen_helper_gvec_le64 + }; + static gen_helper_gvec_3 * const ltu_fn[4] = { + gen_helper_gvec_ltu8, gen_helper_gvec_ltu16, + gen_helper_gvec_ltu32, gen_helper_gvec_ltu64 + }; + static gen_helper_gvec_3 * const leu_fn[4] = { + gen_helper_gvec_leu8, gen_helper_gvec_leu16, + gen_helper_gvec_leu32, gen_helper_gvec_leu64 + }; + static gen_helper_gvec_3 * const * const fns[16] = { + [TCG_COND_EQ] = eq_fn, + [TCG_COND_NE] = ne_fn, + [TCG_COND_LT] = lt_fn, + [TCG_COND_LE] = le_fn, + [TCG_COND_LTU] = ltu_fn, + [TCG_COND_LEU] = leu_fn, + }; + + check_size_align(oprsz, maxsz, dofs | aofs | bofs); + check_overlap_3(dofs, aofs, bofs, maxsz); + + if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) { + do_dup(MO_8, dofs, oprsz, maxsz, + NULL, NULL, -(cond == TCG_COND_ALWAYS)); + return; + } + + /* Recall that ARM SVE allows vector sizes that are not a power of 2. + Expand with successively smaller host vector sizes. The intent is + that e.g. oprsz == 80 would be expanded with 2x32 + 1x16. 
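One more note on the out-of-line fallback below: only EQ/NE/LT/LE/LTU/LEU have direct helpers, so e.g. a GT comparison is handled by swapping the two input offsets and converting the condition with tcg_swap_cond, turning GT(a, b) into LT(b, a).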
*/ + + if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32) + && tcg_can_emit_vec_op(INDEX_op_cmp_vec, TCG_TYPE_V256, vece)) { + uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32); + expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond); + if (some == oprsz) { + goto done; + } + dofs += some; + aofs += some; + bofs += some; + oprsz -= some; + maxsz -= some; + } + + if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16) + && tcg_can_emit_vec_op(INDEX_op_cmp_vec, TCG_TYPE_V128, vece)) { + expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond); + } else if (TCG_TARGET_HAS_v64 + && check_size_impl(oprsz, 8) + && (TCG_TARGET_REG_BITS == 32 || vece != MO_64) + && tcg_can_emit_vec_op(INDEX_op_cmp_vec, TCG_TYPE_V64, vece)) { + expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond); + } else if (vece == MO_64 && check_size_impl(oprsz, 8)) { + expand_cmp_i64(dofs, aofs, bofs, oprsz, cond); + } else if (vece == MO_32 && check_size_impl(oprsz, 4)) { + expand_cmp_i32(dofs, aofs, bofs, oprsz, cond); + } else { + gen_helper_gvec_3 * const *fn = fns[cond]; + + if (fn == NULL) { + uint32_t tmp; + tmp = aofs, aofs = bofs, bofs = tmp; + cond = tcg_swap_cond(cond); + fn = fns[cond]; + assert(fn != NULL); + } + tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]); + return; + } + + done: + if (oprsz < maxsz) { + expand_clr(dofs + oprsz, maxsz - oprsz); + } +} diff --git a/tcg/tcg-op-gvec.h b/tcg/tcg-op-gvec.h new file mode 100644 index 0000000000..ff43a29a0b --- /dev/null +++ b/tcg/tcg-op-gvec.h @@ -0,0 +1,306 @@ +/* + * Generic vector operation expansion + * + * Copyright (c) 2018 Linaro + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * "Generic" vectors. All operands are given as offsets from ENV, + * and therefore cannot also be allocated via tcg_global_mem_new_*. + * OPRSZ is the byte size of the vector upon which the operation is performed. + * MAXSZ is the byte size of the full vector; bytes beyond OPRSZ are cleared. + * + * All sizes must be 8 or any multiple of 16. + * When OPRSZ is 8, the alignment may be 8, otherwise must be 16. + * Operands may completely, but not partially, overlap. + */ + +/* Expand a call to a gvec-style helper, with pointers to two vector + operands, and a descriptor (see tcg-gvec-desc.h). */ +typedef void gen_helper_gvec_2(TCGv_ptr, TCGv_ptr, TCGv_i32); +void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t maxsz, int32_t data, + gen_helper_gvec_2 *fn); + +/* Similarly, passing an extra data value. */ +typedef void gen_helper_gvec_2i(TCGv_ptr, TCGv_ptr, TCGv_i64, TCGv_i32); +void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c, + uint32_t oprsz, uint32_t maxsz, int32_t data, + gen_helper_gvec_2i *fn); + +/* Similarly, passing an extra pointer (e.g. env or float_status). 
*/ +typedef void gen_helper_gvec_2_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); +void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs, + TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz, + int32_t data, gen_helper_gvec_2_ptr *fn); + +/* Similarly, with three vector operands. */ +typedef void gen_helper_gvec_3(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_i32); +void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t oprsz, uint32_t maxsz, int32_t data, + gen_helper_gvec_3 *fn); + +/* Similarly, with four vector operands. */ +typedef void gen_helper_gvec_4(TCGv_ptr, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_i32); +void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t cofs, uint32_t oprsz, uint32_t maxsz, + int32_t data, gen_helper_gvec_4 *fn); + +/* Similarly, with five vector operands. */ +typedef void gen_helper_gvec_5(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_i32); +void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t cofs, uint32_t xofs, uint32_t oprsz, + uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn); + +typedef void gen_helper_gvec_3_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_i32); +void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs, + TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz, + int32_t data, gen_helper_gvec_3_ptr *fn); + +typedef void gen_helper_gvec_4_ptr(TCGv_ptr, TCGv_ptr, TCGv_ptr, + TCGv_ptr, TCGv_ptr, TCGv_i32); +void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz, + uint32_t maxsz, int32_t data, + gen_helper_gvec_4_ptr *fn); + +/* Expand a gvec operation. Either inline or out-of-line depending on + the actual vector size and the operations supported by the host. */ +typedef struct { + /* Expand inline as a 64-bit or 32-bit integer. + Only one of these will be non-NULL. */ + void (*fni8)(TCGv_i64, TCGv_i64); + void (*fni4)(TCGv_i32, TCGv_i32); + /* Expand inline with a host vector type. */ + void (*fniv)(unsigned, TCGv_vec, TCGv_vec); + /* Expand out-of-line helper w/descriptor. */ + gen_helper_gvec_2 *fno; + /* The opcode, if any, to which this corresponds. */ + TCGOpcode opc; + /* The data argument to the out-of-line helper. */ + int32_t data; + /* The vector element size, if applicable. */ + uint8_t vece; + /* Prefer i64 to v64. */ + bool prefer_i64; +} GVecGen2; + +typedef struct { + /* Expand inline as a 64-bit or 32-bit integer. + Only one of these will be non-NULL. */ + void (*fni8)(TCGv_i64, TCGv_i64, int64_t); + void (*fni4)(TCGv_i32, TCGv_i32, int32_t); + /* Expand inline with a host vector type. */ + void (*fniv)(unsigned, TCGv_vec, TCGv_vec, int64_t); + /* Expand out-of-line helper w/descriptor, data in descriptor. */ + gen_helper_gvec_2 *fno; + /* Expand out-of-line helper w/descriptor, data as argument. */ + gen_helper_gvec_2i *fnoi; + /* The opcode, if any, to which this corresponds. */ + TCGOpcode opc; + /* The vector element size, if applicable. */ + uint8_t vece; + /* Prefer i64 to v64. */ + bool prefer_i64; + /* Load dest as a 3rd source operand. */ + bool load_dest; +} GVecGen2i; + +typedef struct { + /* Expand inline as a 64-bit or 32-bit integer. + Only one of these will be non-NULL. */ + void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64); + void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32); + /* Expand inline with a host vector type. */ + void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec); + /* Expand out-of-line helper w/descriptor. 
*/ + gen_helper_gvec_2i *fno; + /* The opcode, if any, to which this corresponds. */ + TCGOpcode opc; + /* The data argument to the out-of-line helper. */ + uint32_t data; + /* The vector element size, if applicable. */ + uint8_t vece; + /* Prefer i64 to v64. */ + bool prefer_i64; + /* Load scalar as 1st source operand. */ + bool scalar_first; +} GVecGen2s; + +typedef struct { + /* Expand inline as a 64-bit or 32-bit integer. + Only one of these will be non-NULL. */ + void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64); + void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32); + /* Expand inline with a host vector type. */ + void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec); + /* Expand out-of-line helper w/descriptor. */ + gen_helper_gvec_3 *fno; + /* The opcode, if any, to which this corresponds. */ + TCGOpcode opc; + /* The data argument to the out-of-line helper. */ + int32_t data; + /* The vector element size, if applicable. */ + uint8_t vece; + /* Prefer i64 to v64. */ + bool prefer_i64; + /* Load dest as a 3rd source operand. */ + bool load_dest; +} GVecGen3; + +typedef struct { + /* Expand inline as a 64-bit or 32-bit integer. + Only one of these will be non-NULL. */ + void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64); + void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32); + /* Expand inline with a host vector type. */ + void (*fniv)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec, TCGv_vec); + /* Expand out-of-line helper w/descriptor. */ + gen_helper_gvec_4 *fno; + /* The opcode, if any, to which this corresponds. */ + TCGOpcode opc; + /* The data argument to the out-of-line helper. */ + int32_t data; + /* The vector element size, if applicable. */ + uint8_t vece; + /* Prefer i64 to v64. */ + bool prefer_i64; +} GVecGen4; + +void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t maxsz, const GVecGen2 *); +void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz, + uint32_t maxsz, int64_t c, const GVecGen2i *); +void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz, + uint32_t maxsz, TCGv_i64 c, const GVecGen2s *); +void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs, + uint32_t oprsz, uint32_t maxsz, const GVecGen3 *); +void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs, + uint32_t oprsz, uint32_t maxsz, const GVecGen4 *); + +/* Expand a specific vector operation. 
*/ + +void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t oprsz, uint32_t maxsz); + +void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); + +void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz); + +void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz); + +/* Saturated arithmetic. */ +void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); + +void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t bofs, uint32_t oprsz, uint32_t maxsz); + +void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t c, uint32_t oprsz, uint32_t maxsz); + +void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs, + TCGv_i64 c, uint32_t oprsz, uint32_t maxsz); + +void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs, + uint32_t s, uint32_t m); +void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t s, + uint32_t m, TCGv_i32); +void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t s, + uint32_t m, TCGv_i64); + +void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t s, uint32_t m, uint8_t x); +void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t s, uint32_t m, uint16_t x); +void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t s, 
uint32_t m, uint32_t x); +void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t s, uint32_t m, uint64_t x); + +void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz); +void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs, + int64_t shift, uint32_t oprsz, uint32_t maxsz); + +void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs, + uint32_t aofs, uint32_t bofs, + uint32_t oprsz, uint32_t maxsz); + +/* + * 64-bit vector operations. Use these when the register has been allocated + * with tcg_global_mem_new_i64, and so we cannot also address it via pointer. + * OPRSZ = MAXSZ = 8. + */ + +void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 a); +void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 a); +void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 a); + +void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b); +void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b); +void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b); + +void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b); +void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b); +void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b); + +void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t); +void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t); +void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t); +void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t); +void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t); +void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t); diff --git a/tcg/tcg-op-vec.c b/tcg/tcg-op-vec.c new file mode 100644 index 0000000000..70ec889bc1 --- /dev/null +++ b/tcg/tcg-op-vec.c @@ -0,0 +1,389 @@ +/* + * Tiny Code Generator for QEMU + * + * Copyright (c) 2018 Linaro, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "cpu.h" +#include "exec/exec-all.h" +#include "tcg.h" +#include "tcg-op.h" +#include "tcg-mo.h" + +/* Reduce the number of ifdefs below. This assumes that all uses of + TCGV_HIGH and TCGV_LOW are properly protected by a conditional that + the compiler can eliminate. 
*/ +#if TCG_TARGET_REG_BITS == 64 +extern TCGv_i32 TCGV_LOW_link_error(TCGv_i64); +extern TCGv_i32 TCGV_HIGH_link_error(TCGv_i64); +#define TCGV_LOW TCGV_LOW_link_error +#define TCGV_HIGH TCGV_HIGH_link_error +#endif + +void vec_gen_2(TCGOpcode opc, TCGType type, unsigned vece, TCGArg r, TCGArg a) +{ + TCGOp *op = tcg_emit_op(opc); + TCGOP_VECL(op) = type - TCG_TYPE_V64; + TCGOP_VECE(op) = vece; + op->args[0] = r; + op->args[1] = a; +} + +void vec_gen_3(TCGOpcode opc, TCGType type, unsigned vece, + TCGArg r, TCGArg a, TCGArg b) +{ + TCGOp *op = tcg_emit_op(opc); + TCGOP_VECL(op) = type - TCG_TYPE_V64; + TCGOP_VECE(op) = vece; + op->args[0] = r; + op->args[1] = a; + op->args[2] = b; +} + +void vec_gen_4(TCGOpcode opc, TCGType type, unsigned vece, + TCGArg r, TCGArg a, TCGArg b, TCGArg c) +{ + TCGOp *op = tcg_emit_op(opc); + TCGOP_VECL(op) = type - TCG_TYPE_V64; + TCGOP_VECE(op) = vece; + op->args[0] = r; + op->args[1] = a; + op->args[2] = b; + op->args[3] = c; +} + +static void vec_gen_op2(TCGOpcode opc, unsigned vece, TCGv_vec r, TCGv_vec a) +{ + TCGTemp *rt = tcgv_vec_temp(r); + TCGTemp *at = tcgv_vec_temp(a); + TCGType type = rt->base_type; + + /* Must have enough inputs for the output. */ + tcg_debug_assert(at->base_type >= type); + vec_gen_2(opc, type, vece, temp_arg(rt), temp_arg(at)); +} + +static void vec_gen_op3(TCGOpcode opc, unsigned vece, + TCGv_vec r, TCGv_vec a, TCGv_vec b) +{ + TCGTemp *rt = tcgv_vec_temp(r); + TCGTemp *at = tcgv_vec_temp(a); + TCGTemp *bt = tcgv_vec_temp(b); + TCGType type = rt->base_type; + + /* Must have enough inputs for the output. */ + tcg_debug_assert(at->base_type >= type); + tcg_debug_assert(bt->base_type >= type); + vec_gen_3(opc, type, vece, temp_arg(rt), temp_arg(at), temp_arg(bt)); +} + +void tcg_gen_mov_vec(TCGv_vec r, TCGv_vec a) +{ + if (r != a) { + vec_gen_op2(INDEX_op_mov_vec, 0, r, a); + } +} + +#define MO_REG (TCG_TARGET_REG_BITS == 64 ? 
MO_64 : MO_32) + +static void do_dupi_vec(TCGv_vec r, unsigned vece, TCGArg a) +{ + TCGTemp *rt = tcgv_vec_temp(r); + vec_gen_2(INDEX_op_dupi_vec, rt->base_type, vece, temp_arg(rt), a); +} + +TCGv_vec tcg_const_zeros_vec(TCGType type) +{ + TCGv_vec ret = tcg_temp_new_vec(type); + do_dupi_vec(ret, MO_REG, 0); + return ret; +} + +TCGv_vec tcg_const_ones_vec(TCGType type) +{ + TCGv_vec ret = tcg_temp_new_vec(type); + do_dupi_vec(ret, MO_REG, -1); + return ret; +} + +TCGv_vec tcg_const_zeros_vec_matching(TCGv_vec m) +{ + TCGTemp *t = tcgv_vec_temp(m); + return tcg_const_zeros_vec(t->base_type); +} + +TCGv_vec tcg_const_ones_vec_matching(TCGv_vec m) +{ + TCGTemp *t = tcgv_vec_temp(m); + return tcg_const_ones_vec(t->base_type); +} + +void tcg_gen_dup64i_vec(TCGv_vec r, uint64_t a) +{ + if (TCG_TARGET_REG_BITS == 32 && a == deposit64(a, 32, 32, a)) { + do_dupi_vec(r, MO_32, a); + } else if (TCG_TARGET_REG_BITS == 64 || a == (uint64_t)(int32_t)a) { + do_dupi_vec(r, MO_64, a); + } else { + TCGv_i64 c = tcg_const_i64(a); + tcg_gen_dup_i64_vec(MO_64, r, c); + tcg_temp_free_i64(c); + } +} + +void tcg_gen_dup32i_vec(TCGv_vec r, uint32_t a) +{ + do_dupi_vec(r, MO_REG, dup_const(MO_32, a)); +} + +void tcg_gen_dup16i_vec(TCGv_vec r, uint32_t a) +{ + do_dupi_vec(r, MO_REG, dup_const(MO_16, a)); +} + +void tcg_gen_dup8i_vec(TCGv_vec r, uint32_t a) +{ + do_dupi_vec(r, MO_REG, dup_const(MO_8, a)); +} + +void tcg_gen_dupi_vec(unsigned vece, TCGv_vec r, uint64_t a) +{ + do_dupi_vec(r, MO_REG, dup_const(vece, a)); +} + +void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec r, TCGv_i64 a) +{ + TCGArg ri = tcgv_vec_arg(r); + TCGTemp *rt = arg_temp(ri); + TCGType type = rt->base_type; + + if (TCG_TARGET_REG_BITS == 64) { + TCGArg ai = tcgv_i64_arg(a); + vec_gen_2(INDEX_op_dup_vec, type, vece, ri, ai); + } else if (vece == MO_64) { + TCGArg al = tcgv_i32_arg(TCGV_LOW(a)); + TCGArg ah = tcgv_i32_arg(TCGV_HIGH(a)); + vec_gen_3(INDEX_op_dup2_vec, type, MO_64, ri, al, ah); + } else { + TCGArg ai = tcgv_i32_arg(TCGV_LOW(a)); + vec_gen_2(INDEX_op_dup_vec, type, vece, ri, ai); + } +} + +void tcg_gen_dup_i32_vec(unsigned vece, TCGv_vec r, TCGv_i32 a) +{ + TCGArg ri = tcgv_vec_arg(r); + TCGArg ai = tcgv_i32_arg(a); + TCGTemp *rt = arg_temp(ri); + TCGType type = rt->base_type; + + vec_gen_2(INDEX_op_dup_vec, type, vece, ri, ai); +} + +static void vec_gen_ldst(TCGOpcode opc, TCGv_vec r, TCGv_ptr b, TCGArg o) +{ + TCGArg ri = tcgv_vec_arg(r); + TCGArg bi = tcgv_ptr_arg(b); + TCGTemp *rt = arg_temp(ri); + TCGType type = rt->base_type; + + vec_gen_3(opc, type, 0, ri, bi, o); +} + +void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr b, TCGArg o) +{ + vec_gen_ldst(INDEX_op_ld_vec, r, b, o); +} + +void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr b, TCGArg o) +{ + vec_gen_ldst(INDEX_op_st_vec, r, b, o); +} + +void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr b, TCGArg o, TCGType low_type) +{ + TCGArg ri = tcgv_vec_arg(r); + TCGArg bi = tcgv_ptr_arg(b); + TCGTemp *rt = arg_temp(ri); + TCGType type = rt->base_type; + + tcg_debug_assert(low_type >= TCG_TYPE_V64); + tcg_debug_assert(low_type <= type); + vec_gen_3(INDEX_op_st_vec, low_type, 0, ri, bi, o); +} + +void tcg_gen_add_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b) +{ + vec_gen_op3(INDEX_op_add_vec, vece, r, a, b); +} + +void tcg_gen_sub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b) +{ + vec_gen_op3(INDEX_op_sub_vec, vece, r, a, b); +} + +void tcg_gen_and_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b) +{ + vec_gen_op3(INDEX_op_and_vec, 0, r, a, b); +} + +void tcg_gen_or_vec(unsigned 
vece, TCGv_vec r, TCGv_vec a, TCGv_vec b) +{ + vec_gen_op3(INDEX_op_or_vec, 0, r, a, b); +} + +void tcg_gen_xor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b) +{ + vec_gen_op3(INDEX_op_xor_vec, 0, r, a, b); +} + +void tcg_gen_andc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b) +{ + if (TCG_TARGET_HAS_andc_vec) { + vec_gen_op3(INDEX_op_andc_vec, 0, r, a, b); + } else { + TCGv_vec t = tcg_temp_new_vec_matching(r); + tcg_gen_not_vec(0, t, b); + tcg_gen_and_vec(0, r, a, t); + tcg_temp_free_vec(t); + } +} + +void tcg_gen_orc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b) +{ + if (TCG_TARGET_HAS_orc_vec) { + vec_gen_op3(INDEX_op_orc_vec, 0, r, a, b); + } else { + TCGv_vec t = tcg_temp_new_vec_matching(r); + tcg_gen_not_vec(0, t, b); + tcg_gen_or_vec(0, r, a, t); + tcg_temp_free_vec(t); + } +} + +void tcg_gen_not_vec(unsigned vece, TCGv_vec r, TCGv_vec a) +{ + if (TCG_TARGET_HAS_not_vec) { + vec_gen_op2(INDEX_op_not_vec, 0, r, a); + } else { + TCGv_vec t = tcg_const_ones_vec_matching(r); + tcg_gen_xor_vec(0, r, a, t); + tcg_temp_free_vec(t); + } +} + +void tcg_gen_neg_vec(unsigned vece, TCGv_vec r, TCGv_vec a) +{ + if (TCG_TARGET_HAS_neg_vec) { + vec_gen_op2(INDEX_op_neg_vec, vece, r, a); + } else { + TCGv_vec t = tcg_const_zeros_vec_matching(r); + tcg_gen_sub_vec(vece, r, t, a); + tcg_temp_free_vec(t); + } +} + +static void do_shifti(TCGOpcode opc, unsigned vece, + TCGv_vec r, TCGv_vec a, int64_t i) +{ + TCGTemp *rt = tcgv_vec_temp(r); + TCGTemp *at = tcgv_vec_temp(a); + TCGArg ri = temp_arg(rt); + TCGArg ai = temp_arg(at); + TCGType type = rt->base_type; + int can; + + tcg_debug_assert(at->base_type == type); + tcg_debug_assert(i >= 0 && i < (8 << vece)); + + if (i == 0) { + tcg_gen_mov_vec(r, a); + return; + } + + can = tcg_can_emit_vec_op(opc, type, vece); + if (can > 0) { + vec_gen_3(opc, type, vece, ri, ai, i); + } else { + /* We leave the choice of expansion via scalar or vector shift + to the target. Often, but not always, dupi can feed a vector + shift more easily than a scalar. 
*/ + tcg_debug_assert(can < 0); + tcg_expand_vec_op(opc, type, vece, ri, ai, i); + } +} + +void tcg_gen_shli_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i) +{ + do_shifti(INDEX_op_shli_vec, vece, r, a, i); +} + +void tcg_gen_shri_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i) +{ + do_shifti(INDEX_op_shri_vec, vece, r, a, i); +} + +void tcg_gen_sari_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i) +{ + do_shifti(INDEX_op_sari_vec, vece, r, a, i); +} + +void tcg_gen_cmp_vec(TCGCond cond, unsigned vece, + TCGv_vec r, TCGv_vec a, TCGv_vec b) +{ + TCGTemp *rt = tcgv_vec_temp(r); + TCGTemp *at = tcgv_vec_temp(a); + TCGTemp *bt = tcgv_vec_temp(b); + TCGArg ri = temp_arg(rt); + TCGArg ai = temp_arg(at); + TCGArg bi = temp_arg(bt); + TCGType type = rt->base_type; + int can; + + tcg_debug_assert(at->base_type == type); + tcg_debug_assert(bt->base_type == type); + can = tcg_can_emit_vec_op(INDEX_op_cmp_vec, type, vece); + if (can > 0) { + vec_gen_4(INDEX_op_cmp_vec, type, vece, ri, ai, bi, cond); + } else { + tcg_debug_assert(can < 0); + tcg_expand_vec_op(INDEX_op_cmp_vec, type, vece, ri, ai, bi, cond); + } +} + +void tcg_gen_mul_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b) +{ + TCGTemp *rt = tcgv_vec_temp(r); + TCGTemp *at = tcgv_vec_temp(a); + TCGTemp *bt = tcgv_vec_temp(b); + TCGArg ri = temp_arg(rt); + TCGArg ai = temp_arg(at); + TCGArg bi = temp_arg(bt); + TCGType type = rt->base_type; + int can; + + tcg_debug_assert(at->base_type == type); + tcg_debug_assert(bt->base_type == type); + can = tcg_can_emit_vec_op(INDEX_op_mul_vec, type, vece); + if (can > 0) { + vec_gen_3(INDEX_op_mul_vec, type, vece, ri, ai, bi); + } else { + tcg_debug_assert(can < 0); + tcg_expand_vec_op(INDEX_op_mul_vec, type, vece, ri, ai, bi); + } +} diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c index 0c509bfe46..3467787323 100644 --- a/tcg/tcg-op.c +++ b/tcg/tcg-op.c @@ -140,7 +140,7 @@ void tcg_gen_subi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) } } -void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2) +void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) { TCGv_i32 t0; /* Some cases can be optimized here. */ @@ -148,17 +148,17 @@ void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2) case 0: tcg_gen_movi_i32(ret, 0); return; - case 0xffffffffu: + case -1: tcg_gen_mov_i32(ret, arg1); return; - case 0xffu: + case 0xff: /* Don't recurse with tcg_gen_ext8u_i32. 
*/ if (TCG_TARGET_HAS_ext8u_i32) { tcg_gen_op2_i32(INDEX_op_ext8u_i32, ret, arg1); return; } break; - case 0xffffu: + case 0xffff: if (TCG_TARGET_HAS_ext16u_i32) { tcg_gen_op2_i32(INDEX_op_ext16u_i32, ret, arg1); return; @@ -199,9 +199,9 @@ void tcg_gen_xori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) } } -void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2) +void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) { - tcg_debug_assert(arg2 < 32); + tcg_debug_assert(arg2 >= 0 && arg2 < 32); if (arg2 == 0) { tcg_gen_mov_i32(ret, arg1); } else { @@ -211,9 +211,9 @@ void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2) } } -void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2) +void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) { - tcg_debug_assert(arg2 < 32); + tcg_debug_assert(arg2 >= 0 && arg2 < 32); if (arg2 == 0) { tcg_gen_mov_i32(ret, arg1); } else { @@ -223,9 +223,9 @@ void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2) } } -void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2) +void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2) { - tcg_debug_assert(arg2 < 32); + tcg_debug_assert(arg2 >= 0 && arg2 < 32); if (arg2 == 0) { tcg_gen_mov_i32(ret, arg1); } else { @@ -1201,7 +1201,7 @@ void tcg_gen_subi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) } } -void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2) +void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) { TCGv_i64 t0; @@ -1216,23 +1216,23 @@ void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2) case 0: tcg_gen_movi_i64(ret, 0); return; - case 0xffffffffffffffffull: + case -1: tcg_gen_mov_i64(ret, arg1); return; - case 0xffull: + case 0xff: /* Don't recurse with tcg_gen_ext8u_i64. 
*/ if (TCG_TARGET_HAS_ext8u_i64) { tcg_gen_op2_i64(INDEX_op_ext8u_i64, ret, arg1); return; } break; - case 0xffffu: + case 0xffff: if (TCG_TARGET_HAS_ext16u_i64) { tcg_gen_op2_i64(INDEX_op_ext16u_i64, ret, arg1); return; } break; - case 0xffffffffull: + case 0xffffffffu: if (TCG_TARGET_HAS_ext32u_i64) { tcg_gen_op2_i64(INDEX_op_ext32u_i64, ret, arg1); return; @@ -1332,9 +1332,9 @@ static inline void tcg_gen_shifti_i64(TCGv_i64 ret, TCGv_i64 arg1, } } -void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2) +void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) { - tcg_debug_assert(arg2 < 64); + tcg_debug_assert(arg2 >= 0 && arg2 < 64); if (TCG_TARGET_REG_BITS == 32) { tcg_gen_shifti_i64(ret, arg1, arg2, 0, 0); } else if (arg2 == 0) { @@ -1346,9 +1346,9 @@ void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2) } } -void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2) +void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) { - tcg_debug_assert(arg2 < 64); + tcg_debug_assert(arg2 >= 0 && arg2 < 64); if (TCG_TARGET_REG_BITS == 32) { tcg_gen_shifti_i64(ret, arg1, arg2, 1, 0); } else if (arg2 == 0) { @@ -1360,9 +1360,9 @@ void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2) } } -void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2) +void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2) { - tcg_debug_assert(arg2 < 64); + tcg_debug_assert(arg2 >= 0 && arg2 < 64); if (TCG_TARGET_REG_BITS == 32) { tcg_gen_shifti_i64(ret, arg1, arg2, 1, 1); } else if (arg2 == 0) { diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h index ca07b32b65..75bb55aeac 100644 --- a/tcg/tcg-op.h +++ b/tcg/tcg-op.h @@ -35,6 +35,10 @@ void tcg_gen_op4(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg); void tcg_gen_op5(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg); void tcg_gen_op6(TCGOpcode, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg, TCGArg); +void vec_gen_2(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg); +void vec_gen_3(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg, TCGArg); +void vec_gen_4(TCGOpcode, TCGType, unsigned, TCGArg, TCGArg, TCGArg, TCGArg); + static inline void tcg_gen_op1_i32(TCGOpcode opc, TCGv_i32 a1) { tcg_gen_op1(opc, tcgv_i32_arg(a1)); @@ -265,12 +269,12 @@ void tcg_gen_mb(TCGBar); void tcg_gen_addi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2); void tcg_gen_subfi_i32(TCGv_i32 ret, int32_t arg1, TCGv_i32 arg2); void tcg_gen_subi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2); -void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t arg2); +void tcg_gen_andi_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2); void tcg_gen_ori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2); void tcg_gen_xori_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2); -void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2); -void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2); -void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2); +void tcg_gen_shli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2); +void tcg_gen_shri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2); +void tcg_gen_sari_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2); void tcg_gen_muli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2); void tcg_gen_div_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2); void tcg_gen_rem_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2); @@ -454,12 +458,12 @@ static inline void tcg_gen_not_i32(TCGv_i32 ret, TCGv_i32 arg) void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2); void tcg_gen_subfi_i64(TCGv_i64 ret, int64_t arg1, TCGv_i64 
arg2); void tcg_gen_subi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2); -void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t arg2); +void tcg_gen_andi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2); void tcg_gen_ori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2); void tcg_gen_xori_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2); -void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2); -void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2); -void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2); +void tcg_gen_shli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2); +void tcg_gen_shri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2); +void tcg_gen_sari_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2); void tcg_gen_muli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2); void tcg_gen_div_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2); void tcg_gen_rem_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2); @@ -903,6 +907,36 @@ void tcg_gen_atomic_or_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp); void tcg_gen_atomic_xor_fetch_i32(TCGv_i32, TCGv, TCGv_i32, TCGArg, TCGMemOp); void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp); +void tcg_gen_mov_vec(TCGv_vec, TCGv_vec); +void tcg_gen_dup_i32_vec(unsigned vece, TCGv_vec, TCGv_i32); +void tcg_gen_dup_i64_vec(unsigned vece, TCGv_vec, TCGv_i64); +void tcg_gen_dup8i_vec(TCGv_vec, uint32_t); +void tcg_gen_dup16i_vec(TCGv_vec, uint32_t); +void tcg_gen_dup32i_vec(TCGv_vec, uint32_t); +void tcg_gen_dup64i_vec(TCGv_vec, uint64_t); +void tcg_gen_dupi_vec(unsigned vece, TCGv_vec, uint64_t); +void tcg_gen_add_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b); +void tcg_gen_sub_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b); +void tcg_gen_mul_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b); +void tcg_gen_and_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b); +void tcg_gen_or_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b); +void tcg_gen_xor_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b); +void tcg_gen_andc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b); +void tcg_gen_orc_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b); +void tcg_gen_not_vec(unsigned vece, TCGv_vec r, TCGv_vec a); +void tcg_gen_neg_vec(unsigned vece, TCGv_vec r, TCGv_vec a); + +void tcg_gen_shli_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i); +void tcg_gen_shri_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i); +void tcg_gen_sari_vec(unsigned vece, TCGv_vec r, TCGv_vec a, int64_t i); + +void tcg_gen_cmp_vec(TCGCond cond, unsigned vece, TCGv_vec r, + TCGv_vec a, TCGv_vec b); + +void tcg_gen_ld_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset); +void tcg_gen_st_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset); +void tcg_gen_stl_vec(TCGv_vec r, TCGv_ptr base, TCGArg offset, TCGType t); + #if TARGET_LONG_BITS == 64 #define tcg_gen_movi_tl tcg_gen_movi_i64 #define tcg_gen_mov_tl tcg_gen_mov_i64 @@ -1001,6 +1035,7 @@ void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp); #define tcg_gen_atomic_and_fetch_tl tcg_gen_atomic_and_fetch_i64 #define tcg_gen_atomic_or_fetch_tl tcg_gen_atomic_or_fetch_i64 #define tcg_gen_atomic_xor_fetch_tl tcg_gen_atomic_xor_fetch_i64 +#define tcg_gen_dup_tl_vec tcg_gen_dup_i64_vec #else #define tcg_gen_movi_tl tcg_gen_movi_i32 #define tcg_gen_mov_tl tcg_gen_mov_i32 @@ -1098,6 +1133,7 @@ void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, TCGv_i64, TCGArg, TCGMemOp); #define tcg_gen_atomic_and_fetch_tl tcg_gen_atomic_and_fetch_i32 
#define tcg_gen_atomic_or_fetch_tl tcg_gen_atomic_or_fetch_i32 #define tcg_gen_atomic_xor_fetch_tl tcg_gen_atomic_xor_fetch_i32 +#define tcg_gen_dup_tl_vec tcg_gen_dup_i32_vec #endif #if UINTPTR_MAX == UINT32_MAX diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h index 956fb1e9f3..d81a6c4535 100644 --- a/tcg/tcg-opc.h +++ b/tcg/tcg-opc.h @@ -204,8 +204,54 @@ DEF(qemu_ld_i64, DATA64_ARGS, TLADDR_ARGS, 1, DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1, TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT) +/* Host vector support. */ + +#define IMPLVEC TCG_OPF_VECTOR | IMPL(TCG_TARGET_MAYBE_vec) + +DEF(mov_vec, 1, 1, 0, TCG_OPF_VECTOR | TCG_OPF_NOT_PRESENT) +DEF(dupi_vec, 1, 0, 1, TCG_OPF_VECTOR | TCG_OPF_NOT_PRESENT) + +DEF(dup_vec, 1, 1, 0, IMPLVEC) +DEF(dup2_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_REG_BITS == 32)) + +DEF(ld_vec, 1, 1, 1, IMPLVEC) +DEF(st_vec, 0, 2, 1, IMPLVEC) + +DEF(add_vec, 1, 2, 0, IMPLVEC) +DEF(sub_vec, 1, 2, 0, IMPLVEC) +DEF(mul_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_mul_vec)) +DEF(neg_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_neg_vec)) + +DEF(and_vec, 1, 2, 0, IMPLVEC) +DEF(or_vec, 1, 2, 0, IMPLVEC) +DEF(xor_vec, 1, 2, 0, IMPLVEC) +DEF(andc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_andc_vec)) +DEF(orc_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_orc_vec)) +DEF(not_vec, 1, 1, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_not_vec)) + +DEF(shli_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_shi_vec)) +DEF(shri_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_shi_vec)) +DEF(sari_vec, 1, 1, 1, IMPLVEC | IMPL(TCG_TARGET_HAS_shi_vec)) + +DEF(shls_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_shs_vec)) +DEF(shrs_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_shs_vec)) +DEF(sars_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_shs_vec)) + +DEF(shlv_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_shv_vec)) +DEF(shrv_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_shv_vec)) +DEF(sarv_vec, 1, 2, 0, IMPLVEC | IMPL(TCG_TARGET_HAS_shv_vec)) + +DEF(cmp_vec, 1, 2, 1, IMPLVEC) + +DEF(last_generic, 0, 0, 0, TCG_OPF_NOT_PRESENT) + +#if TCG_TARGET_MAYBE_vec +#include "tcg-target.opc.h" +#endif + #undef TLADDR_ARGS #undef DATA64_ARGS #undef IMPL #undef IMPL64 +#undef IMPLVEC #undef DEF diff --git a/tcg/tcg-pool.inc.c b/tcg/tcg-pool.inc.c index 8a85131405..7af5513ff3 100644 --- a/tcg/tcg-pool.inc.c +++ b/tcg/tcg-pool.inc.c @@ -22,39 +22,110 @@ typedef struct TCGLabelPoolData { struct TCGLabelPoolData *next; - tcg_target_ulong data; tcg_insn_unit *label; intptr_t addend; - int type; + int rtype; + unsigned nlong; + tcg_target_ulong data[]; } TCGLabelPoolData; -static void new_pool_label(TCGContext *s, tcg_target_ulong data, int type, - tcg_insn_unit *label, intptr_t addend) +static TCGLabelPoolData *new_pool_alloc(TCGContext *s, int nlong, int rtype, + tcg_insn_unit *label, intptr_t addend) { - TCGLabelPoolData *n = tcg_malloc(sizeof(*n)); - TCGLabelPoolData *i, **pp; + TCGLabelPoolData *n = tcg_malloc(sizeof(TCGLabelPoolData) + + sizeof(tcg_target_ulong) * nlong); - n->data = data; n->label = label; - n->type = type; n->addend = addend; + n->rtype = rtype; + n->nlong = nlong; + return n; +} + +static void new_pool_insert(TCGContext *s, TCGLabelPoolData *n) +{ + TCGLabelPoolData *i, **pp; + int nlong = n->nlong; /* Insertion sort on the pool. 
*/ - for (pp = &s->pool_labels; (i = *pp) && i->data < data; pp = &i->next) { - continue; + for (pp = &s->pool_labels; (i = *pp) != NULL; pp = &i->next) { + if (nlong > i->nlong) { + break; + } + if (nlong < i->nlong) { + continue; + } + if (memcmp(n->data, i->data, sizeof(tcg_target_ulong) * nlong) >= 0) { + break; + } } n->next = *pp; *pp = n; } +/* The "usual" for generic integer code. */ +static inline void new_pool_label(TCGContext *s, tcg_target_ulong d, int rtype, + tcg_insn_unit *label, intptr_t addend) +{ + TCGLabelPoolData *n = new_pool_alloc(s, 1, rtype, label, addend); + n->data[0] = d; + new_pool_insert(s, n); +} + +/* For v64 or v128, depending on the host. */ +static inline void new_pool_l2(TCGContext *s, int rtype, tcg_insn_unit *label, + intptr_t addend, tcg_target_ulong d0, + tcg_target_ulong d1) +{ + TCGLabelPoolData *n = new_pool_alloc(s, 2, rtype, label, addend); + n->data[0] = d0; + n->data[1] = d1; + new_pool_insert(s, n); +} + +/* For v128 or v256, depending on the host. */ +static inline void new_pool_l4(TCGContext *s, int rtype, tcg_insn_unit *label, + intptr_t addend, tcg_target_ulong d0, + tcg_target_ulong d1, tcg_target_ulong d2, + tcg_target_ulong d3) +{ + TCGLabelPoolData *n = new_pool_alloc(s, 4, rtype, label, addend); + n->data[0] = d0; + n->data[1] = d1; + n->data[2] = d2; + n->data[3] = d3; + new_pool_insert(s, n); +} + +/* For v256, for 32-bit host. */ +static inline void new_pool_l8(TCGContext *s, int rtype, tcg_insn_unit *label, + intptr_t addend, tcg_target_ulong d0, + tcg_target_ulong d1, tcg_target_ulong d2, + tcg_target_ulong d3, tcg_target_ulong d4, + tcg_target_ulong d5, tcg_target_ulong d6, + tcg_target_ulong d7) +{ + TCGLabelPoolData *n = new_pool_alloc(s, 8, rtype, label, addend); + n->data[0] = d0; + n->data[1] = d1; + n->data[2] = d2; + n->data[3] = d3; + n->data[4] = d4; + n->data[5] = d5; + n->data[6] = d6; + n->data[7] = d7; + new_pool_insert(s, n); +} + /* To be provided by cpu/tcg-target.inc.c. */ static void tcg_out_nop_fill(tcg_insn_unit *p, int count); static bool tcg_out_pool_finalize(TCGContext *s) { TCGLabelPoolData *p = s->pool_labels; - tcg_target_ulong d, *a; + TCGLabelPoolData *l = NULL; + void *a; if (p == NULL) { return true; @@ -62,24 +133,24 @@ static bool tcg_out_pool_finalize(TCGContext *s) /* ??? Round up to qemu_icache_linesize, but then do not round again when allocating the next TranslationBlock structure. */ - a = (void *)ROUND_UP((uintptr_t)s->code_ptr, sizeof(tcg_target_ulong)); + a = (void *)ROUND_UP((uintptr_t)s->code_ptr, + sizeof(tcg_target_ulong) * p->nlong); tcg_out_nop_fill(s->code_ptr, (tcg_insn_unit *)a - s->code_ptr); s->data_gen_ptr = a; - /* Ensure the first comparison fails. 
*/ - d = p->data + 1; - for (; p != NULL; p = p->next) { - if (p->data != d) { - d = p->data; - if (unlikely((void *)a > s->code_gen_highwater)) { + size_t size = sizeof(tcg_target_ulong) * p->nlong; + if (!l || l->nlong != p->nlong || memcmp(l->data, p->data, size)) { + if (unlikely(a > s->code_gen_highwater)) { return false; } - *a++ = d; + memcpy(a, p->data, size); + a += size; + l = p; } - patch_reloc(p->label, p->type, (intptr_t)(a - 1), p->addend); + patch_reloc(p->label, p->rtype, (intptr_t)a - size, p->addend); } - s->code_ptr = (void *)a; + s->code_ptr = a; return true; } @@ -106,6 +106,18 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg ret, tcg_target_long arg); static void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *const_args); +#if TCG_TARGET_MAYBE_vec +static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, unsigned vecl, + unsigned vece, const TCGArg *args, + const int *const_args); +#else +static inline void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, unsigned vecl, + unsigned vece, const TCGArg *args, + const int *const_args) +{ + g_assert_not_reached(); +} +#endif static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, TCGReg arg1, intptr_t arg2); static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val, @@ -146,8 +158,7 @@ struct tcg_region_state { }; static struct tcg_region_state region; - -static TCGRegSet tcg_target_available_regs[2]; +static TCGRegSet tcg_target_available_regs[TCG_TYPE_COUNT]; static TCGRegSet tcg_target_call_clobber_regs; #if TCG_TARGET_INSN_UNIT_SIZE == 1 @@ -1026,6 +1037,41 @@ TCGv_i64 tcg_temp_new_internal_i64(int temp_local) return temp_tcgv_i64(t); } +TCGv_vec tcg_temp_new_vec(TCGType type) +{ + TCGTemp *t; + +#ifdef CONFIG_DEBUG_TCG + switch (type) { + case TCG_TYPE_V64: + assert(TCG_TARGET_HAS_v64); + break; + case TCG_TYPE_V128: + assert(TCG_TARGET_HAS_v128); + break; + case TCG_TYPE_V256: + assert(TCG_TARGET_HAS_v256); + break; + default: + g_assert_not_reached(); + } +#endif + + t = tcg_temp_new_internal(type, 0); + return temp_tcgv_vec(t); +} + +/* Create a new temp of the same type as an existing temp. */ +TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match) +{ + TCGTemp *t = tcgv_vec_temp(match); + + tcg_debug_assert(t->temp_allocated != 0); + + t = tcg_temp_new_internal(t->base_type, 0); + return temp_tcgv_vec(t); +} + static void tcg_temp_free_internal(TCGTemp *ts) { TCGContext *s = tcg_ctx; @@ -1057,6 +1103,11 @@ void tcg_temp_free_i64(TCGv_i64 arg) tcg_temp_free_internal(tcgv_i64_temp(arg)); } +void tcg_temp_free_vec(TCGv_vec arg) +{ + tcg_temp_free_internal(tcgv_vec_temp(arg)); +} + TCGv_i32 tcg_const_i32(int32_t val) { TCGv_i32 t0; @@ -1114,6 +1165,9 @@ int tcg_check_temp_count(void) Test the runtime variable that controls each opcode. 
*/ bool tcg_op_supported(TCGOpcode op) { + const bool have_vec + = TCG_TARGET_HAS_v64 | TCG_TARGET_HAS_v128 | TCG_TARGET_HAS_v256; + switch (op) { case INDEX_op_discard: case INDEX_op_set_label: @@ -1327,10 +1381,47 @@ bool tcg_op_supported(TCGOpcode op) case INDEX_op_mulsh_i64: return TCG_TARGET_HAS_mulsh_i64; - case NB_OPS: - break; + case INDEX_op_mov_vec: + case INDEX_op_dup_vec: + case INDEX_op_dupi_vec: + case INDEX_op_ld_vec: + case INDEX_op_st_vec: + case INDEX_op_add_vec: + case INDEX_op_sub_vec: + case INDEX_op_and_vec: + case INDEX_op_or_vec: + case INDEX_op_xor_vec: + case INDEX_op_cmp_vec: + return have_vec; + case INDEX_op_dup2_vec: + return have_vec && TCG_TARGET_REG_BITS == 32; + case INDEX_op_not_vec: + return have_vec && TCG_TARGET_HAS_not_vec; + case INDEX_op_neg_vec: + return have_vec && TCG_TARGET_HAS_neg_vec; + case INDEX_op_andc_vec: + return have_vec && TCG_TARGET_HAS_andc_vec; + case INDEX_op_orc_vec: + return have_vec && TCG_TARGET_HAS_orc_vec; + case INDEX_op_mul_vec: + return have_vec && TCG_TARGET_HAS_mul_vec; + case INDEX_op_shli_vec: + case INDEX_op_shri_vec: + case INDEX_op_sari_vec: + return have_vec && TCG_TARGET_HAS_shi_vec; + case INDEX_op_shls_vec: + case INDEX_op_shrs_vec: + case INDEX_op_sars_vec: + return have_vec && TCG_TARGET_HAS_shs_vec; + case INDEX_op_shlv_vec: + case INDEX_op_shrv_vec: + case INDEX_op_sarv_vec: + return have_vec && TCG_TARGET_HAS_shv_vec; + + default: + tcg_debug_assert(op > INDEX_op_last_generic && op < NB_OPS); + return true; } - g_assert_not_reached(); } /* Note: we convert the 64 bit args to 32 bit and do some alignment @@ -1661,6 +1752,11 @@ void tcg_dump_ops(TCGContext *s) nb_iargs = def->nb_iargs; nb_cargs = def->nb_cargs; + if (def->flags & TCG_OPF_VECTOR) { + col += qemu_log("v%d,e%d,", 64 << TCGOP_VECL(op), + 8 << TCGOP_VECE(op)); + } + k = 0; for (i = 0; i < nb_oargs; i++) { if (k != 0) { @@ -1685,6 +1781,7 @@ void tcg_dump_ops(TCGContext *s) case INDEX_op_brcond_i64: case INDEX_op_setcond_i64: case INDEX_op_movcond_i64: + case INDEX_op_cmp_vec: if (op->args[k] < ARRAY_SIZE(cond_name) && cond_name[op->args[k]]) { col += qemu_log(",%s", cond_name[op->args[k++]]); @@ -2890,8 +2987,13 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op) } /* emit instruction */ - tcg_out_op(s, op->opc, new_args, const_args); - + if (def->flags & TCG_OPF_VECTOR) { + tcg_out_vec_op(s, op->opc, TCGOP_VECL(op), TCGOP_VECE(op), + new_args, const_args); + } else { + tcg_out_op(s, op->opc, new_args, const_args); + } + /* move the outputs in the correct register if needed */ for(i = 0; i < nb_oargs; i++) { ts = arg_temp(op->args[i]); @@ -3239,10 +3341,12 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) switch (opc) { case INDEX_op_mov_i32: case INDEX_op_mov_i64: + case INDEX_op_mov_vec: tcg_reg_alloc_mov(s, op); break; case INDEX_op_movi_i32: case INDEX_op_movi_i64: + case INDEX_op_dupi_vec: tcg_reg_alloc_movi(s, op); break; case INDEX_op_insn_start: @@ -3645,3 +3749,10 @@ void tcg_register_jit(void *buf, size_t buf_size) { } #endif /* ELF_HOST_MACHINE */ + +#if !TCG_TARGET_MAYBE_vec +void tcg_expand_vec_op(TCGOpcode o, TCGType t, unsigned e, TCGArg a0, ...) 
+{ + g_assert_not_reached(); +} +#endif @@ -170,6 +170,31 @@ typedef uint64_t TCGRegSet; # error "Missing unsigned widening multiply" #endif +#if !defined(TCG_TARGET_HAS_v64) \ + && !defined(TCG_TARGET_HAS_v128) \ + && !defined(TCG_TARGET_HAS_v256) +#define TCG_TARGET_MAYBE_vec 0 +#define TCG_TARGET_HAS_neg_vec 0 +#define TCG_TARGET_HAS_not_vec 0 +#define TCG_TARGET_HAS_andc_vec 0 +#define TCG_TARGET_HAS_orc_vec 0 +#define TCG_TARGET_HAS_shi_vec 0 +#define TCG_TARGET_HAS_shs_vec 0 +#define TCG_TARGET_HAS_shv_vec 0 +#define TCG_TARGET_HAS_mul_vec 0 +#else +#define TCG_TARGET_MAYBE_vec 1 +#endif +#ifndef TCG_TARGET_HAS_v64 +#define TCG_TARGET_HAS_v64 0 +#endif +#ifndef TCG_TARGET_HAS_v128 +#define TCG_TARGET_HAS_v128 0 +#endif +#ifndef TCG_TARGET_HAS_v256 +#define TCG_TARGET_HAS_v256 0 +#endif + #ifndef TARGET_INSN_START_EXTRA_WORDS # define TARGET_INSN_START_WORDS 1 #else @@ -246,6 +271,11 @@ typedef struct TCGPool { typedef enum TCGType { TCG_TYPE_I32, TCG_TYPE_I64, + + TCG_TYPE_V64, + TCG_TYPE_V128, + TCG_TYPE_V256, + TCG_TYPE_COUNT, /* number of different types */ /* An alias for the size of the host register. */ @@ -396,6 +426,8 @@ typedef tcg_target_ulong TCGArg; * TCGv_i32 : 32 bit integer type * TCGv_i64 : 64 bit integer type * TCGv_ptr : a host pointer type + * TCGv_vec : a host vector type; the exact size is not exposed + to the CPU front-end code. * TCGv : an integer type the same size as target_ulong (an alias for either TCGv_i32 or TCGv_i64) The compiler's type checking will complain if you mix them @@ -418,6 +450,7 @@ typedef tcg_target_ulong TCGArg; typedef struct TCGv_i32_d *TCGv_i32; typedef struct TCGv_i64_d *TCGv_i64; typedef struct TCGv_ptr_d *TCGv_ptr; +typedef struct TCGv_vec_d *TCGv_vec; typedef TCGv_ptr TCGv_env; #if TARGET_LONG_BITS == 32 #define TCGv TCGv_i32 @@ -589,6 +622,9 @@ typedef struct TCGOp { #define TCGOP_CALLI(X) (X)->param1 #define TCGOP_CALLO(X) (X)->param2 +#define TCGOP_VECL(X) (X)->param1 +#define TCGOP_VECE(X) (X)->param2 + /* Make sure operands fit in the bitfields above. 
*/ QEMU_BUILD_BUG_ON(NB_OPS > (1 << 8)); @@ -726,6 +762,11 @@ static inline TCGTemp *tcgv_ptr_temp(TCGv_ptr v) return tcgv_i32_temp((TCGv_i32)v); } +static inline TCGTemp *tcgv_vec_temp(TCGv_vec v) +{ + return tcgv_i32_temp((TCGv_i32)v); +} + static inline TCGArg tcgv_i32_arg(TCGv_i32 v) { return temp_arg(tcgv_i32_temp(v)); @@ -741,6 +782,11 @@ static inline TCGArg tcgv_ptr_arg(TCGv_ptr v) return temp_arg(tcgv_ptr_temp(v)); } +static inline TCGArg tcgv_vec_arg(TCGv_vec v) +{ + return temp_arg(tcgv_vec_temp(v)); +} + static inline TCGv_i32 temp_tcgv_i32(TCGTemp *t) { (void)temp_idx(t); /* trigger embedded assert */ @@ -757,6 +803,11 @@ static inline TCGv_ptr temp_tcgv_ptr(TCGTemp *t) return (TCGv_ptr)temp_tcgv_i32(t); } +static inline TCGv_vec temp_tcgv_vec(TCGTemp *t) +{ + return (TCGv_vec)temp_tcgv_i32(t); +} + #if TCG_TARGET_REG_BITS == 32 static inline TCGv_i32 TCGV_LOW(TCGv_i64 t) { @@ -832,9 +883,12 @@ TCGTemp *tcg_global_mem_new_internal(TCGType, TCGv_ptr, TCGv_i32 tcg_temp_new_internal_i32(int temp_local); TCGv_i64 tcg_temp_new_internal_i64(int temp_local); +TCGv_vec tcg_temp_new_vec(TCGType type); +TCGv_vec tcg_temp_new_vec_matching(TCGv_vec match); void tcg_temp_free_i32(TCGv_i32 arg); void tcg_temp_free_i64(TCGv_i64 arg); +void tcg_temp_free_vec(TCGv_vec arg); static inline TCGv_i32 tcg_global_mem_new_i32(TCGv_ptr reg, intptr_t offset, const char *name) @@ -916,6 +970,8 @@ enum { /* Instruction is optional and not implemented by the host, or insn is generic and should not be implemented by the host. */ TCG_OPF_NOT_PRESENT = 0x10, + /* Instruction operands are vectors. */ + TCG_OPF_VECTOR = 0x20, }; typedef struct TCGOpDef { @@ -981,6 +1037,10 @@ TCGv_i32 tcg_const_i32(int32_t val); TCGv_i64 tcg_const_i64(int64_t val); TCGv_i32 tcg_const_local_i32(int32_t val); TCGv_i64 tcg_const_local_i64(int64_t val); +TCGv_vec tcg_const_zeros_vec(TCGType); +TCGv_vec tcg_const_ones_vec(TCGType); +TCGv_vec tcg_const_zeros_vec_matching(TCGv_vec); +TCGv_vec tcg_const_ones_vec_matching(TCGv_vec); TCGLabel *gen_new_label(void); @@ -1151,6 +1211,33 @@ uintptr_t tcg_qemu_tb_exec(CPUArchState *env, uint8_t *tb_ptr); void tcg_register_jit(void *buf, size_t buf_size); +#if TCG_TARGET_MAYBE_vec +/* Return zero if the tuple (opc, type, vece) is unsupportable; + return > 0 if it is directly supportable; + return < 0 if we must call tcg_expand_vec_op. */ +int tcg_can_emit_vec_op(TCGOpcode, TCGType, unsigned); +#else +static inline int tcg_can_emit_vec_op(TCGOpcode o, TCGType t, unsigned ve) +{ + return 0; +} +#endif + +/* Expand the tuple (opc, type, vece) on the given arguments. */ +void tcg_expand_vec_op(TCGOpcode, TCGType, unsigned, TCGArg, ...); + +/* Replicate a constant C according to the log2 of the element size. */ +uint64_t dup_const(unsigned vece, uint64_t c); + +#define dup_const(VECE, C) \ + (__builtin_constant_p(VECE) \ + ? ( (VECE) == MO_8 ? 0x0101010101010101ull * (uint8_t)(C) \ + : (VECE) == MO_16 ? 0x0001000100010001ull * (uint16_t)(C) \ + : (VECE) == MO_32 ? 0x0000000100000001ull * (uint32_t)(C) \ + : dup_const(VECE, C)) \ + : dup_const(VECE, C)) + + /* * Memory helpers that will be used by TCG generated code. */
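
As a rough usage sketch (not part of the patch itself): the gvec entry points above take byte offsets from env rather than TCG temporaries, so a front end that keeps its guest vector registers inside CPUArchState can expand a full 128-bit element-wise add with a single call. The "vregs" field and the helper name below are hypothetical, for illustration only.

    /* Hypothetical front-end code; assumes CPUArchState contains an
       array of 16-byte guest vector registers named "vregs". */
    static void gen_vadd32(unsigned rd, unsigned rn, unsigned rm)
    {
        tcg_gen_gvec_add(MO_32,
                         offsetof(CPUArchState, vregs[rd]),  /* dofs */
                         offsetof(CPUArchState, vregs[rn]),  /* aofs */
                         offsetof(CPUArchState, vregs[rm]),  /* bofs */
                         16, 16);                            /* oprsz, maxsz */
    }

Whether this becomes a host vector op, a pair of i64 ops, or an out-of-line helper call is decided by the expander from the sizes and TCG_TARGET_HAS_v* flags, not by the front end. For immediates, dup_const() replicates an element across a 64-bit word according to the element size, e.g. dup_const(MO_8, 0xab) == 0xababababababababull and dup_const(MO_16, 0x1234) == 0x1234123412341234ull.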