author    Andre Przywara <andre.przywara@amd.com>   2009-09-19 00:30:48 +0200
committer Aurelien Jarno <aurelien@aurel32.net>     2009-10-04 14:09:41 +0200
commit    d9f4bb27dbff2e40ec2e36eb8017c9dedce77f30 (patch)
tree      60e4374cd0a5fd95cfadb397f0413468d11a1f14
parent    ccd59d09a9d0c75b86185b89d8246e40b5f01168 (diff)
target-i386: add SSE4a instruction support
This adds support for the AMD Phenom/Barcelona's SSE4a instructions. Those include
insertq and extrq, which perform shift-and-mask operations on XMM registers, in two
versions (with immediate shift/length values, or with both values stored in another
XMM register). Additionally it implements movntss and movntsd, which are scalar
non-temporal stores (avoiding cache thrashing); these are implemented as normal
stores here, though. SSE4a is guarded by the SSE4A CPUID bit (Fn8000_0001:ECX[6]).

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
-rw-r--r--  target-i386/ops_sse.h         44
-rw-r--r--  target-i386/ops_sse_header.h   4
-rw-r--r--  target-i386/translate.c       39
3 files changed, 85 insertions(+), 2 deletions(-)
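For orientation, here is a minimal standalone sketch (not part of the patch; function
names are illustrative) of the mask-and-shift logic that the helper_extrq()/helper_insertq()
helpers below implement; a field length of 0 is treated as the full 64 bits:

    /* Illustrative only: mirrors the bit-field extract/insert done by the
     * SSE4a helpers added in this patch. */
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t extrq_field(uint64_t src, int shift, int len)
    {
        uint64_t mask = len ? (1ULL << len) - 1 : ~0ULL;
        return (src >> shift) & mask;       /* extract len bits at bit position shift */
    }

    static uint64_t insertq_field(uint64_t src, int shift, int len)
    {
        uint64_t mask = len ? (1ULL << len) - 1 : ~0ULL;
        /* clear the target field, then move the low len bits of src into it */
        return (src & ~(mask << shift)) | ((src & mask) << shift);
    }

    int main(void)
    {
        uint64_t v = 0x1122334455667788ULL;
        /* extract 8 bits starting at bit 16: yields 0x66 */
        printf("extrq:   %llx\n", (unsigned long long)extrq_field(v, 16, 8));
        /* move the low 8 bits (0x88) up to bit 32: yields 0x1122338855667788 */
        printf("insertq: %llx\n", (unsigned long long)insertq_field(v, 32, 8));
        return 0;
    }

In the patch itself, the register forms take length and shift from the low bytes of the
second XMM operand (bytes 0/1 for extrq, bytes 8/9 for insertq), while the immediate
forms read them from two immediate bytes in the instruction stream (see translate.c).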
diff --git a/target-i386/ops_sse.h b/target-i386/ops_sse.h
index 709732a4a7..3232abd965 100644
--- a/target-i386/ops_sse.h
+++ b/target-i386/ops_sse.h
@@ -802,6 +802,50 @@ void helper_rcpss(XMMReg *d, XMMReg *s)
d->XMM_S(0) = approx_rcp(s->XMM_S(0));
}
+static inline uint64_t helper_extrq(uint64_t src, int shift, int len)
+{
+ uint64_t mask;
+
+ if (len == 0) {
+ mask = ~0LL;
+ } else {
+ mask = (1ULL << len) - 1;
+ }
+ return (src >> shift) & mask;
+}
+
+void helper_extrq_r(XMMReg *d, XMMReg *s)
+{
+ d->XMM_Q(0) = helper_extrq(d->XMM_Q(0), s->XMM_B(1), s->XMM_B(0));
+}
+
+void helper_extrq_i(XMMReg *d, int index, int length)
+{
+ d->XMM_Q(0) = helper_extrq(d->XMM_Q(0), index, length);
+}
+
+static inline uint64_t helper_insertq(uint64_t src, int shift, int len)
+{
+ uint64_t mask;
+
+ if (len == 0) {
+ mask = ~0ULL;
+ } else {
+ mask = (1ULL << len) - 1;
+ }
+ return (src & ~(mask << shift)) | ((src & mask) << shift);
+}
+
+void helper_insertq_r(XMMReg *d, XMMReg *s)
+{
+ d->XMM_Q(0) = helper_insertq(s->XMM_Q(0), s->XMM_B(9), s->XMM_B(8));
+}
+
+void helper_insertq_i(XMMReg *d, int index, int length)
+{
+ d->XMM_Q(0) = helper_insertq(d->XMM_Q(0), index, length);
+}
+
void helper_haddps(XMMReg *d, XMMReg *s)
{
XMMReg r;
diff --git a/target-i386/ops_sse_header.h b/target-i386/ops_sse_header.h
index 53add99f99..a0a63613a1 100644
--- a/target-i386/ops_sse_header.h
+++ b/target-i386/ops_sse_header.h
@@ -187,6 +187,10 @@ DEF_HELPER_2(rsqrtps, void, XMMReg, XMMReg)
DEF_HELPER_2(rsqrtss, void, XMMReg, XMMReg)
DEF_HELPER_2(rcpps, void, XMMReg, XMMReg)
DEF_HELPER_2(rcpss, void, XMMReg, XMMReg)
+DEF_HELPER_2(extrq_r, void, XMMReg, XMMReg)
+DEF_HELPER_3(extrq_i, void, XMMReg, int, int)
+DEF_HELPER_2(insertq_r, void, XMMReg, XMMReg)
+DEF_HELPER_3(insertq_i, void, XMMReg, int, int)
DEF_HELPER_2(haddps, void, XMMReg, XMMReg)
DEF_HELPER_2(haddpd, void, XMMReg, XMMReg)
DEF_HELPER_2(hsubps, void, XMMReg, XMMReg)
diff --git a/target-i386/translate.c b/target-i386/translate.c
index 0f62a97286..66aa21bb65 100644
--- a/target-i386/translate.c
+++ b/target-i386/translate.c
@@ -2826,7 +2826,7 @@ static void *sse_op_table1[256][4] = {
[0x28] = { SSE_SPECIAL, SSE_SPECIAL }, /* movaps, movapd */
[0x29] = { SSE_SPECIAL, SSE_SPECIAL }, /* movaps, movapd */
[0x2a] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvtpi2ps, cvtpi2pd, cvtsi2ss, cvtsi2sd */
- [0x2b] = { SSE_SPECIAL, SSE_SPECIAL }, /* movntps, movntpd */
+ [0x2b] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movntps, movntpd, movntss, movntsd */
[0x2c] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvttps2pi, cvttpd2pi, cvttsd2si, cvttss2si */
[0x2d] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* cvtps2pi, cvtpd2pi, cvtsd2si, cvtss2si */
[0x2e] = { gen_helper_ucomiss, gen_helper_ucomisd },
@@ -2883,6 +2883,8 @@ static void *sse_op_table1[256][4] = {
[0x75] = MMX_OP2(pcmpeqw),
[0x76] = MMX_OP2(pcmpeql),
[0x77] = { SSE_DUMMY }, /* emms */
+ [0x78] = { NULL, SSE_SPECIAL, NULL, SSE_SPECIAL }, /* extrq_i, insertq_i */
+ [0x79] = { NULL, gen_helper_extrq_r, NULL, gen_helper_insertq_r },
[0x7c] = { NULL, gen_helper_haddpd, NULL, gen_helper_haddps },
[0x7d] = { NULL, gen_helper_hsubpd, NULL, gen_helper_hsubps },
[0x7e] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL }, /* movd, movd, , movq */
@@ -3169,6 +3171,20 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
gen_sto_env_A0(s->mem_index, offsetof(CPUX86State,xmm_regs[reg]));
break;
+ case 0x22b: /* movntss */
+ case 0x32b: /* movntsd */
+ if (mod == 3)
+ goto illegal_op;
+ gen_lea_modrm(s, modrm, &reg_addr, &offset_addr);
+ if (b1 & 1) {
+ gen_stq_env_A0(s->mem_index, offsetof(CPUX86State,
+ xmm_regs[reg]));
+ } else {
+ tcg_gen_ld32u_tl(cpu_T[0], cpu_env, offsetof(CPUX86State,
+ xmm_regs[reg].XMM_L(0)));
+ gen_op_st_T0_A0(OT_LONG + s->mem_index);
+ }
+ break;
case 0x6e: /* movd mm, ea */
#ifdef TARGET_X86_64
if (s->dflag == 2) {
@@ -3324,6 +3340,25 @@ static void gen_sse(DisasContext *s, int b, target_ulong pc_start, int rex_r)
gen_op_movl(offsetof(CPUX86State,xmm_regs[reg].XMM_L(2)),
offsetof(CPUX86State,xmm_regs[reg].XMM_L(3)));
break;
+ case 0x178:
+ case 0x378:
+ {
+ int bit_index, field_length;
+
+ if (b1 == 1 && reg != 0)
+ goto illegal_op;
+ field_length = ldub_code(s->pc++) & 0x3F;
+ bit_index = ldub_code(s->pc++) & 0x3F;
+ tcg_gen_addi_ptr(cpu_ptr0, cpu_env,
+ offsetof(CPUX86State,xmm_regs[reg]));
+ if (b1 == 1)
+ gen_helper_extrq_i(cpu_ptr0, tcg_const_i32(bit_index),
+ tcg_const_i32(field_length));
+ else
+ gen_helper_insertq_i(cpu_ptr0, tcg_const_i32(bit_index),
+ tcg_const_i32(field_length));
+ }
+ break;
case 0x7e: /* movd ea, mm */
#ifdef TARGET_X86_64
if (s->dflag == 2) {
@@ -7555,7 +7590,7 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
case 0x110 ... 0x117:
case 0x128 ... 0x12f:
case 0x138 ... 0x13a:
- case 0x150 ... 0x177:
+ case 0x150 ... 0x179:
case 0x17c ... 0x17f:
case 0x1c2:
case 0x1c4 ... 0x1c6: