aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--target/i386/ops_sse.h7
-rw-r--r--target/i386/ops_sse_header.h3
-rw-r--r--target/i386/tcg/decode-new.c.inc126
-rw-r--r--target/i386/tcg/emit.c.inc127
-rw-r--r--target/i386/tcg/translate.c1
5 files changed, 264 insertions, 0 deletions
diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index e3cc6948dd..0037f92e5f 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -1683,6 +1683,10 @@ void glue(helper_ptest, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C);
}
+#define FMOVSLDUP(i) s->L((i) & ~1)
+#define FMOVSHDUP(i) s->L((i) | 1)
+#define FMOVDLDUP(i) s->Q((i) & ~1)
+
#define SSE_HELPER_F(name, elem, num, F) \
void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
{ \
@@ -1705,6 +1709,9 @@ SSE_HELPER_F(helper_pmovzxbq, Q, 1 << SHIFT, s->B)
SSE_HELPER_F(helper_pmovzxwd, L, 2 << SHIFT, s->W)
SSE_HELPER_F(helper_pmovzxwq, Q, 1 << SHIFT, s->W)
SSE_HELPER_F(helper_pmovzxdq, Q, 1 << SHIFT, s->L)
+SSE_HELPER_F(helper_pmovsldup, L, 2 << SHIFT, FMOVSLDUP)
+SSE_HELPER_F(helper_pmovshdup, L, 2 << SHIFT, FMOVSHDUP)
+SSE_HELPER_F(helper_pmovdldup, Q, 1 << SHIFT, FMOVDLDUP)
#endif
void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index dd8dcebc23..00de6d69f1 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -355,6 +355,9 @@ DEF_HELPER_3(glue(pmovzxbq, SUFFIX), void, env, Reg, Reg)
DEF_HELPER_3(glue(pmovzxwd, SUFFIX), void, env, Reg, Reg)
DEF_HELPER_3(glue(pmovzxwq, SUFFIX), void, env, Reg, Reg)
DEF_HELPER_3(glue(pmovzxdq, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmovsldup, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmovshdup, SUFFIX), void, env, Reg, Reg)
+DEF_HELPER_3(glue(pmovdldup, SUFFIX), void, env, Reg, Reg)
DEF_HELPER_4(glue(pmuldq, SUFFIX), void, env, Reg, Reg, Reg)
DEF_HELPER_4(glue(pcmpeqq, SUFFIX), void, env, Reg, Reg, Reg)
DEF_HELPER_4(glue(packusdw, SUFFIX), void, env, Reg, Reg, Reg)
diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index 6a82d58b23..5435447e07 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -556,6 +556,122 @@ static void decode_0F3A(DisasContext *s, CPUX86State *env, X86OpEntry *entry, ui
*entry = opcodes_0F3A[*b];
}
+/*
+ * There are some mistakes in the operands in the manual, and the load/store/register
+ * cases are easiest to keep separate, so the entries for 10-17 follow simplicity and
+ * efficiency of implementation rather than copying what the manual says.
+ *
+ * In particular:
+ *
+ * 1) "VMOVSS m32, xmm1" and "VMOVSD m64, xmm1" do not support VEX.vvvv != 1111b,
+ * but this is not mentioned in the tables.
+ *
+ * 2) MOVHLPS, MOVHPS, MOVHPD, MOVLPD, MOVLPS read the high quadword of one of their
+ * operands, which must therefore be dq; MOVLPD and MOVLPS also write the high
+ * quadword of the V operand.
+ */
+static void decode_0F10(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+ static const X86OpEntry opcodes_0F10_reg[4] = {
+ X86_OP_ENTRY3(MOVDQ, V,x, None,None, W,x, vex4_unal), /* MOVUPS */
+ X86_OP_ENTRY3(MOVDQ, V,x, None,None, W,x, vex4_unal), /* MOVUPD */
+ X86_OP_ENTRY3(VMOVSS, V,x, H,x, W,x, vex4),
+ X86_OP_ENTRY3(VMOVLPx, V,x, H,x, W,x, vex4), /* MOVSD */
+ };
+
+ static const X86OpEntry opcodes_0F10_mem[4] = {
+ X86_OP_ENTRY3(MOVDQ, V,x, None,None, W,x, vex4_unal), /* MOVUPS */
+ X86_OP_ENTRY3(MOVDQ, V,x, None,None, W,x, vex4_unal), /* MOVUPD */
+ X86_OP_ENTRY3(VMOVSS_ld, V,x, H,x, M,ss, vex4),
+ X86_OP_ENTRY3(VMOVSD_ld, V,x, H,x, M,sd, vex4),
+ };
+
+ if ((get_modrm(s, env) >> 6) == 3) {
+ *entry = *decode_by_prefix(s, opcodes_0F10_reg);
+ } else {
+ *entry = *decode_by_prefix(s, opcodes_0F10_mem);
+ }
+}
+
+static void decode_0F11(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+ static const X86OpEntry opcodes_0F11_reg[4] = {
+ X86_OP_ENTRY3(MOVDQ, W,x, None,None, V,x, vex4), /* MOVPS */
+ X86_OP_ENTRY3(MOVDQ, W,x, None,None, V,x, vex4), /* MOVPD */
+ X86_OP_ENTRY3(VMOVSS, W,x, H,x, V,x, vex4),
+ X86_OP_ENTRY3(VMOVLPx, W,x, H,x, V,q, vex4), /* MOVSD */
+ };
+
+ static const X86OpEntry opcodes_0F11_mem[4] = {
+ X86_OP_ENTRY3(MOVDQ, W,x, None,None, V,x, vex4), /* MOVPS */
+ X86_OP_ENTRY3(MOVDQ, W,x, None,None, V,x, vex4), /* MOVPD */
+ X86_OP_ENTRY3(VMOVSS_st, M,ss, None,None, V,x, vex4),
+ X86_OP_ENTRY3(VMOVLPx_st, M,sd, None,None, V,x, vex4), /* MOVSD */
+ };
+
+ if ((get_modrm(s, env) >> 6) == 3) {
+ *entry = *decode_by_prefix(s, opcodes_0F11_reg);
+ } else {
+ *entry = *decode_by_prefix(s, opcodes_0F11_mem);
+ }
+}
+
+static void decode_0F12(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+ static const X86OpEntry opcodes_0F12_mem[4] = {
+ /*
+ * Use dq for operand for compatibility with gen_MOVSD and
+ * to allow VEX128 only.
+ */
+ X86_OP_ENTRY3(VMOVLPx_ld, V,dq, H,dq, M,q, vex4), /* MOVLPS */
+ X86_OP_ENTRY3(VMOVLPx_ld, V,dq, H,dq, M,q, vex4), /* MOVLPD */
+ X86_OP_ENTRY3(VMOVSLDUP, V,x, None,None, W,x, vex4 cpuid(SSE3)),
+ X86_OP_ENTRY3(VMOVDDUP, V,x, None,None, WM,q, vex4 cpuid(SSE3)), /* qq if VEX.256 */
+ };
+ static const X86OpEntry opcodes_0F12_reg[4] = {
+ X86_OP_ENTRY3(VMOVHLPS, V,dq, H,dq, U,dq, vex4),
+ X86_OP_ENTRY3(VMOVLPx, W,x, H,x, U,q, vex4), /* MOVLPD */
+ X86_OP_ENTRY3(VMOVSLDUP, V,x, None,None, U,x, vex4 cpuid(SSE3)),
+ X86_OP_ENTRY3(VMOVDDUP, V,x, None,None, U,x, vex4 cpuid(SSE3)),
+ };
+
+ if ((get_modrm(s, env) >> 6) == 3) {
+ *entry = *decode_by_prefix(s, opcodes_0F12_reg);
+ } else {
+ *entry = *decode_by_prefix(s, opcodes_0F12_mem);
+ if ((s->prefix & PREFIX_REPNZ) && s->vex_l) {
+ entry->s2 = X86_SIZE_qq;
+ }
+ }
+}
+
+static void decode_0F16(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+ static const X86OpEntry opcodes_0F16_mem[4] = {
+ /*
+ * Operand 1 technically only reads the low 64 bits, but uses dq so that
+ * it is easier to check for op0 == op1 in an endianness-neutral manner.
+ */
+ X86_OP_ENTRY3(VMOVHPx_ld, V,dq, H,dq, M,q, vex4), /* MOVHPS */
+ X86_OP_ENTRY3(VMOVHPx_ld, V,dq, H,dq, M,q, vex4), /* MOVHPD */
+ X86_OP_ENTRY3(VMOVSHDUP, V,x, None,None, W,x, vex4 cpuid(SSE3)),
+ {},
+ };
+ static const X86OpEntry opcodes_0F16_reg[4] = {
+ /* Same as above, operand 1 could be Hq if it wasn't for big-endian. */
+ X86_OP_ENTRY3(VMOVLHPS, V,dq, H,dq, U,q, vex4),
+ X86_OP_ENTRY3(VMOVHPx, V,x, H,x, U,x, vex4), /* MOVHPD */
+ X86_OP_ENTRY3(VMOVSHDUP, V,x, None,None, U,x, vex4 cpuid(SSE3)),
+ {},
+ };
+
+ if ((get_modrm(s, env) >> 6) == 3) {
+ *entry = *decode_by_prefix(s, opcodes_0F16_reg);
+ } else {
+ *entry = *decode_by_prefix(s, opcodes_0F16_mem);
+ }
+}
+
static void decode_sse_unary(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
{
if (!(s->prefix & (PREFIX_REPZ | PREFIX_REPNZ))) {
@@ -593,6 +709,16 @@ static void decode_0FE6(DisasContext *s, CPUX86State *env, X86OpEntry *entry, ui
}
static const X86OpEntry opcodes_0F[256] = {
+ [0x10] = X86_OP_GROUP0(0F10),
+ [0x11] = X86_OP_GROUP0(0F11),
+ [0x12] = X86_OP_GROUP0(0F12),
+ [0x13] = X86_OP_ENTRY3(VMOVLPx_st, M,q, None,None, V,q, vex4 p_00_66),
+ [0x14] = X86_OP_ENTRY3(VUNPCKLPx, V,x, H,x, W,x, vex4 p_00_66),
+ [0x15] = X86_OP_ENTRY3(VUNPCKHPx, V,x, H,x, W,x, vex4 p_00_66),
+ [0x16] = X86_OP_GROUP0(0F16),
+ /* Incorrectly listed as Mq,Vq in the manual */
+ [0x17] = X86_OP_ENTRY3(VMOVHPx_st, M,q, None,None, V,dq, vex4 p_00_66),
+
[0x50] = X86_OP_ENTRY3(MOVMSK, G,y, None,None, U,x, vex7 p_00_66),
[0x51] = X86_OP_GROUP3(sse_unary, V,x, H,x, W,x, vex2_rep3 p_00_66_f3_f2),
[0x52] = X86_OP_GROUP3(sse_unary, V,x, H,x, W,x, vex5 p_00_f3),
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index f7ac481203..d87f6016d9 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -394,6 +394,7 @@ static inline void gen_fp_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn
gen_illegal_opcode(s);
}
}
+
#define FP_SSE(uname, lname) \
static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
{ \
@@ -412,6 +413,20 @@ FP_SSE(VMIN, min)
FP_SSE(VDIV, div)
FP_SSE(VMAX, max)
+#define FP_UNPACK_SSE(uname, lname) \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{ \
+ /* PS maps to the DQ integer instruction, PD maps to QDQ. */ \
+ gen_fp_sse(s, env, decode, \
+ gen_helper_##lname##qdq_xmm, \
+ gen_helper_##lname##dq_xmm, \
+ gen_helper_##lname##qdq_ymm, \
+ gen_helper_##lname##dq_ymm, \
+ NULL, NULL); \
+}
+FP_UNPACK_SSE(VUNPCKLPx, punpckl)
+FP_UNPACK_SSE(VUNPCKHPx, punpckh)
+
/*
* 00 = v*ps Vps, Wpd
* f3 = v*ss Vss, Wps
@@ -749,6 +764,10 @@ UNARY_INT_SSE(VPMOVZXWD, pmovzxwd)
UNARY_INT_SSE(VPMOVZXWQ, pmovzxwq)
UNARY_INT_SSE(VPMOVZXDQ, pmovzxdq)
+UNARY_INT_SSE(VMOVSLDUP, pmovsldup)
+UNARY_INT_SSE(VMOVSHDUP, pmovshdup)
+UNARY_INT_SSE(VMOVDDUP, pmovdldup)
+
UNARY_INT_SSE(VCVTDQ2PD, cvtdq2pd)
UNARY_INT_SSE(VCVTPD2DQ, cvtpd2dq)
UNARY_INT_SSE(VCVTTPD2DQ, cvttpd2dq)
@@ -1808,6 +1827,114 @@ static void gen_VMASKMOVPS_st(DisasContext *s, CPUX86State *env, X86DecodedInsn
gen_maskmov(s, env, decode, gen_helper_vpmaskmovd_st_xmm, gen_helper_vpmaskmovd_st_ymm);
}
+static void gen_VMOVHPx_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ gen_ldq_env_A0(s, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
+ if (decode->op[0].offset != decode->op[1].offset) {
+ tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(0)));
+ tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
+ }
+}
+
+static void gen_VMOVHPx_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ gen_stq_env_A0(s, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1)));
+}
+
+static void gen_VMOVHPx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ if (decode->op[0].offset != decode->op[2].offset) {
+ tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1)));
+ tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
+ }
+ if (decode->op[0].offset != decode->op[1].offset) {
+ tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(0)));
+ tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
+ }
+}
+
+static void gen_VMOVHLPS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(1)));
+ tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
+ if (decode->op[0].offset != decode->op[1].offset) {
+ tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(1)));
+ tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
+ }
+}
+
+static void gen_VMOVLHPS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[2].offset);
+ tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(1)));
+ if (decode->op[0].offset != decode->op[1].offset) {
+ tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[1].offset + offsetof(XMMReg, XMM_Q(0)));
+ tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
+ }
+}
+
+/*
+ * Note that MOVLPx supports 256-bit operation unlike MOVHLPx, MOVLHPx, MOXHPx.
+ * Use a gvec move to move everything above the bottom 64 bits.
+ */
+
+static void gen_VMOVLPx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int vec_len = vector_len(s, decode);
+
+ tcg_gen_ld_i64(s->tmp1_i64, cpu_env, decode->op[2].offset + offsetof(XMMReg, XMM_Q(0)));
+ tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len);
+ tcg_gen_st_i64(s->tmp1_i64, cpu_env, decode->op[0].offset + offsetof(XMMReg, XMM_Q(0)));
+}
+
+static void gen_VMOVLPx_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int vec_len = vector_len(s, decode);
+
+ tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
+ tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len);
+ tcg_gen_st_i64(s->tmp1_i64, OP_PTR0, offsetof(ZMMReg, ZMM_Q(0)));
+}
+
+static void gen_VMOVLPx_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ tcg_gen_ld_i64(s->tmp1_i64, OP_PTR2, offsetof(ZMMReg, ZMM_Q(0)));
+ tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
+}
+
+static void gen_VMOVSD_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ TCGv_i64 zero = tcg_constant_i64(0);
+
+ tcg_gen_qemu_ld_i64(s->tmp1_i64, s->A0, s->mem_index, MO_LEUQ);
+ tcg_gen_st_i64(zero, OP_PTR0, offsetof(ZMMReg, ZMM_Q(1)));
+ tcg_gen_st_i64(s->tmp1_i64, OP_PTR0, offsetof(ZMMReg, ZMM_Q(0)));
+}
+
+static void gen_VMOVSS(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int vec_len = vector_len(s, decode);
+
+ tcg_gen_ld_i32(s->tmp2_i32, OP_PTR2, offsetof(ZMMReg, ZMM_L(0)));
+ tcg_gen_gvec_mov(MO_64, decode->op[0].offset, decode->op[1].offset, vec_len, vec_len);
+ tcg_gen_st_i32(s->tmp2_i32, OP_PTR0, offsetof(ZMMReg, ZMM_L(0)));
+}
+
+static void gen_VMOVSS_ld(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ int vec_len = vector_len(s, decode);
+
+ tcg_gen_qemu_ld_i32(s->tmp2_i32, s->A0, s->mem_index, MO_LEUL);
+ tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
+ tcg_gen_st_i32(s->tmp2_i32, OP_PTR0, offsetof(ZMMReg, ZMM_L(0)));
+}
+
+static void gen_VMOVSS_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+ tcg_gen_ld_i32(s->tmp2_i32, OP_PTR2, offsetof(ZMMReg, ZMM_L(0)));
+ tcg_gen_qemu_st_i32(s->tmp2_i32, s->A0, s->mem_index, MO_LEUL);
+}
+
static void gen_VPMASKMOV_st(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
{
if (s->vex_w) {
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index c79dffd6f9..90bdd0994e 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -4783,6 +4783,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
#endif
if (use_new &&
(b == 0x138 || b == 0x13a ||
+ (b >= 0x110 && b <= 0x117) ||
(b >= 0x150 && b <= 0x17f) ||
b == 0x1c2 || (b >= 0x1c4 && b <= 0x1c6) ||
(b >= 0x1d0 && b <= 0x1ff))) {