From 988c3eb0d6f41ac13f4ec145c637f12c776de602 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Wed, 23 Jan 2013 16:03:16 -0800 Subject: target-i386: Use CC_SRC2 for ADC and SBB Add another slot in ENV and store two of the three inputs. This lets us do less work when carry-out is not needed, and avoids the unpredictable CC_OP after translating these insns. Signed-off-by: Richard Henderson --- target-i386/cc_helper.c | 40 ++++++++++---------- target-i386/cc_helper_template.h | 26 +++++++------ target-i386/cpu.h | 10 +++-- target-i386/helper.h | 4 +- target-i386/translate.c | 80 ++++++++++++++++------------------------ 5 files changed, 75 insertions(+), 85 deletions(-) diff --git a/target-i386/cc_helper.c b/target-i386/cc_helper.c index a5d8181804..218a9b519f 100644 --- a/target-i386/cc_helper.c +++ b/target-i386/cc_helper.c @@ -75,7 +75,8 @@ const uint8_t parity_table[256] = { #endif -target_ulong helper_cc_compute_all(target_ulong dst, target_ulong src1, int op) +target_ulong helper_cc_compute_all(target_ulong dst, target_ulong src1, + target_ulong src2, int op) { switch (op) { default: /* should never happen */ @@ -99,11 +100,11 @@ target_ulong helper_cc_compute_all(target_ulong dst, target_ulong src1, int op) return compute_all_addl(dst, src1); case CC_OP_ADCB: - return compute_all_adcb(dst, src1); + return compute_all_adcb(dst, src1, src2); case CC_OP_ADCW: - return compute_all_adcw(dst, src1); + return compute_all_adcw(dst, src1, src2); case CC_OP_ADCL: - return compute_all_adcl(dst, src1); + return compute_all_adcl(dst, src1, src2); case CC_OP_SUBB: return compute_all_subb(dst, src1); @@ -113,11 +114,11 @@ target_ulong helper_cc_compute_all(target_ulong dst, target_ulong src1, int op) return compute_all_subl(dst, src1); case CC_OP_SBBB: - return compute_all_sbbb(dst, src1); + return compute_all_sbbb(dst, src1, src2); case CC_OP_SBBW: - return compute_all_sbbw(dst, src1); + return compute_all_sbbw(dst, src1, src2); case CC_OP_SBBL: - return compute_all_sbbl(dst, src1); + return compute_all_sbbl(dst, src1, src2); case CC_OP_LOGICB: return compute_all_logicb(dst, src1); @@ -160,11 +161,11 @@ target_ulong helper_cc_compute_all(target_ulong dst, target_ulong src1, int op) case CC_OP_ADDQ: return compute_all_addq(dst, src1); case CC_OP_ADCQ: - return compute_all_adcq(dst, src1); + return compute_all_adcq(dst, src1, src2); case CC_OP_SUBQ: return compute_all_subq(dst, src1); case CC_OP_SBBQ: - return compute_all_sbbq(dst, src1); + return compute_all_sbbq(dst, src1, src2); case CC_OP_LOGICQ: return compute_all_logicq(dst, src1); case CC_OP_INCQ: @@ -181,10 +182,11 @@ target_ulong helper_cc_compute_all(target_ulong dst, target_ulong src1, int op) uint32_t cpu_cc_compute_all(CPUX86State *env, int op) { - return helper_cc_compute_all(CC_DST, CC_SRC, op); + return helper_cc_compute_all(CC_DST, CC_SRC, CC_SRC2, op); } -target_ulong helper_cc_compute_c(target_ulong dst, target_ulong src1, int op) +target_ulong helper_cc_compute_c(target_ulong dst, target_ulong src1, + target_ulong src2, int op) { switch (op) { default: /* should never happen */ @@ -225,11 +227,11 @@ target_ulong helper_cc_compute_c(target_ulong dst, target_ulong src1, int op) return compute_c_addl(dst, src1); case CC_OP_ADCB: - return compute_c_adcb(dst, src1); + return compute_c_adcb(dst, src1, src2); case CC_OP_ADCW: - return compute_c_adcw(dst, src1); + return compute_c_adcw(dst, src1, src2); case CC_OP_ADCL: - return compute_c_adcl(dst, src1); + return compute_c_adcl(dst, src1, src2); case CC_OP_SUBB: return compute_c_subb(dst, src1); @@ -239,11 +241,11 @@ target_ulong helper_cc_compute_c(target_ulong dst, target_ulong src1, int op) return compute_c_subl(dst, src1); case CC_OP_SBBB: - return compute_c_sbbb(dst, src1); + return compute_c_sbbb(dst, src1, src2); case CC_OP_SBBW: - return compute_c_sbbw(dst, src1); + return compute_c_sbbw(dst, src1, src2); case CC_OP_SBBL: - return compute_c_sbbl(dst, src1); + return compute_c_sbbl(dst, src1, src2); case CC_OP_SHLB: return compute_c_shlb(dst, src1); @@ -256,11 +258,11 @@ target_ulong helper_cc_compute_c(target_ulong dst, target_ulong src1, int op) case CC_OP_ADDQ: return compute_c_addq(dst, src1); case CC_OP_ADCQ: - return compute_c_adcq(dst, src1); + return compute_c_adcq(dst, src1, src2); case CC_OP_SUBQ: return compute_c_subq(dst, src1); case CC_OP_SBBQ: - return compute_c_sbbq(dst, src1); + return compute_c_sbbq(dst, src1, src2); case CC_OP_SHLQ: return compute_c_shlq(dst, src1); #endif diff --git a/target-i386/cc_helper_template.h b/target-i386/cc_helper_template.h index 522b462285..87f47d2e97 100644 --- a/target-i386/cc_helper_template.h +++ b/target-i386/cc_helper_template.h @@ -58,12 +58,13 @@ static int glue(compute_c_add, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1) return dst < src1; } -static int glue(compute_all_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1) +static int glue(compute_all_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1, + DATA_TYPE src3) { int cf, pf, af, zf, sf, of; - DATA_TYPE src2 = dst - src1 - 1; + DATA_TYPE src2 = dst - src1 - src3; - cf = dst <= src1; + cf = (src3 ? dst <= src1 : dst < src1); pf = parity_table[(uint8_t)dst]; af = (dst ^ src1 ^ src2) & 0x10; zf = (dst == 0) << 6; @@ -72,9 +73,10 @@ static int glue(compute_all_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1) return cf | pf | af | zf | sf | of; } -static int glue(compute_c_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1) +static int glue(compute_c_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1, + DATA_TYPE src3) { - return dst <= src1; + return src3 ? dst <= src1 : dst < src1; } static int glue(compute_all_sub, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2) @@ -98,12 +100,13 @@ static int glue(compute_c_sub, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2) return src1 < src2; } -static int glue(compute_all_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2) +static int glue(compute_all_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2, + DATA_TYPE src3) { int cf, pf, af, zf, sf, of; - DATA_TYPE src1 = dst + src2 + 1; + DATA_TYPE src1 = dst + src2 + src3; - cf = src1 <= src2; + cf = (src3 ? src1 <= src2 : src1 < src2); pf = parity_table[(uint8_t)dst]; af = (dst ^ src1 ^ src2) & 0x10; zf = (dst == 0) << 6; @@ -112,11 +115,12 @@ static int glue(compute_all_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2) return cf | pf | af | zf | sf | of; } -static int glue(compute_c_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2) +static int glue(compute_c_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2, + DATA_TYPE src3) { - DATA_TYPE src1 = dst + src2 + 1; + DATA_TYPE src1 = dst + src2 + src3; - return src1 <= src2; + return (src3 ? src1 <= src2 : src1 < src2); } static int glue(compute_all_logic, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1) diff --git a/target-i386/cpu.h b/target-i386/cpu.h index 8c4c605299..1fa9dc8267 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -725,8 +725,9 @@ typedef struct CPUX86State { stored elsewhere */ /* emulator internal eflags handling */ - target_ulong cc_src; target_ulong cc_dst; + target_ulong cc_src; + target_ulong cc_src2; uint32_t cc_op; int32_t df; /* D flag : 1 if D = 0, -1 if D = 1 */ uint32_t hflags; /* TB flags, see HF_xxx constants. These flags @@ -1116,9 +1117,10 @@ static inline int cpu_mmu_index (CPUX86State *env) #define EIP (env->eip) #define DF (env->df) -#define CC_SRC (env->cc_src) -#define CC_DST (env->cc_dst) -#define CC_OP (env->cc_op) +#define CC_DST (env->cc_dst) +#define CC_SRC (env->cc_src) +#define CC_SRC2 (env->cc_src2) +#define CC_OP (env->cc_op) /* n must be a constant to be efficient */ static inline target_long lshift(target_long x, int n) diff --git a/target-i386/helper.h b/target-i386/helper.h index 901ff73c12..4c46ab1b40 100644 --- a/target-i386/helper.h +++ b/target-i386/helper.h @@ -1,7 +1,7 @@ #include "exec/def-helper.h" -DEF_HELPER_FLAGS_3(cc_compute_all, TCG_CALL_NO_RWG_SE, tl, tl, tl, int) -DEF_HELPER_FLAGS_3(cc_compute_c, TCG_CALL_NO_RWG_SE, tl, tl, tl, int) +DEF_HELPER_FLAGS_4(cc_compute_all, TCG_CALL_NO_RWG_SE, tl, tl, tl, tl, int) +DEF_HELPER_FLAGS_4(cc_compute_c, TCG_CALL_NO_RWG_SE, tl, tl, tl, tl, int) DEF_HELPER_0(lock, void) DEF_HELPER_0(unlock, void) diff --git a/target-i386/translate.c b/target-i386/translate.c index 5235aff15e..f667f9333b 100644 --- a/target-i386/translate.c +++ b/target-i386/translate.c @@ -61,7 +61,7 @@ /* global register indexes */ static TCGv_ptr cpu_env; static TCGv cpu_A0; -static TCGv cpu_cc_src, cpu_cc_dst, cpu_cc_srcT; +static TCGv cpu_cc_dst, cpu_cc_src, cpu_cc_src2, cpu_cc_srcT; static TCGv_i32 cpu_cc_op; static TCGv cpu_regs[CPU_NB_REGS]; /* local temps */ @@ -188,18 +188,19 @@ enum { enum { USES_CC_DST = 1, USES_CC_SRC = 2, - USES_CC_SRCT = 4, + USES_CC_SRC2 = 4, + USES_CC_SRCT = 8, }; /* Bit set if the global variable is live after setting CC_OP to X. */ static const uint8_t cc_op_live[CC_OP_NB] = { - [CC_OP_DYNAMIC] = USES_CC_DST | USES_CC_SRC, + [CC_OP_DYNAMIC] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2, [CC_OP_EFLAGS] = USES_CC_SRC, [CC_OP_MULB ... CC_OP_MULQ] = USES_CC_DST | USES_CC_SRC, [CC_OP_ADDB ... CC_OP_ADDQ] = USES_CC_DST | USES_CC_SRC, - [CC_OP_ADCB ... CC_OP_ADCQ] = USES_CC_DST | USES_CC_SRC, + [CC_OP_ADCB ... CC_OP_ADCQ] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2, [CC_OP_SUBB ... CC_OP_SUBQ] = USES_CC_DST | USES_CC_SRC | USES_CC_SRCT, - [CC_OP_SBBB ... CC_OP_SBBQ] = USES_CC_DST | USES_CC_SRC, + [CC_OP_SBBB ... CC_OP_SBBQ] = USES_CC_DST | USES_CC_SRC | USES_CC_SRC2, [CC_OP_LOGICB ... CC_OP_LOGICQ] = USES_CC_DST, [CC_OP_INCB ... CC_OP_INCQ] = USES_CC_DST | USES_CC_SRC, [CC_OP_DECB ... CC_OP_DECQ] = USES_CC_DST | USES_CC_SRC, @@ -223,6 +224,9 @@ static void set_cc_op(DisasContext *s, CCOp op) if (dead & USES_CC_SRC) { tcg_gen_discard_tl(cpu_cc_src); } + if (dead & USES_CC_SRC2) { + tcg_gen_discard_tl(cpu_cc_src2); + } if (dead & USES_CC_SRCT) { tcg_gen_discard_tl(cpu_cc_srcT); } @@ -867,6 +871,13 @@ static void gen_op_update2_cc(void) tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); } +static void gen_op_update3_cc(TCGv reg) +{ + tcg_gen_mov_tl(cpu_cc_src2, reg); + tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]); + tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); +} + static inline void gen_op_testl_T0_T1_cc(void) { tcg_gen_and_tl(cpu_cc_dst, cpu_T[0], cpu_T[1]); @@ -882,7 +893,7 @@ static void gen_op_update_neg_cc(void) /* compute all eflags to cc_src */ static void gen_compute_eflags(DisasContext *s) { - TCGv zero, dst, src1; + TCGv zero, dst, src1, src2; int live, dead; if (s->cc_op == CC_OP_EFLAGS) { @@ -892,10 +903,11 @@ static void gen_compute_eflags(DisasContext *s) TCGV_UNUSED(zero); dst = cpu_cc_dst; src1 = cpu_cc_src; + src2 = cpu_cc_src2; /* Take care to not read values that are not live. */ live = cc_op_live[s->cc_op] & ~USES_CC_SRCT; - dead = live ^ (USES_CC_DST | USES_CC_SRC); + dead = live ^ (USES_CC_DST | USES_CC_SRC | USES_CC_SRC2); if (dead) { zero = tcg_const_tl(0); if (dead & USES_CC_DST) { @@ -904,10 +916,13 @@ static void gen_compute_eflags(DisasContext *s) if (dead & USES_CC_SRC) { src1 = zero; } + if (dead & USES_CC_SRC2) { + src2 = zero; + } } gen_update_cc_op(s); - gen_helper_cc_compute_all(cpu_cc_src, dst, src1, cpu_cc_op); + gen_helper_cc_compute_all(cpu_cc_src, dst, src1, src2, cpu_cc_op); set_cc_op(s, CC_OP_EFLAGS); if (dead) { @@ -951,30 +966,6 @@ static CCPrepare gen_prepare_eflags_c(DisasContext *s, TCGv reg) return (CCPrepare) { .cond = TCG_COND_LTU, .reg = t0, .reg2 = t1, .mask = -1, .use_reg2 = true }; - case CC_OP_SBBB ... CC_OP_SBBQ: - /* (DATA_TYPE)(CC_DST + CC_SRC + 1) <= (DATA_TYPE)CC_SRC */ - size = s->cc_op - CC_OP_SBBB; - t1 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false); - if (TCGV_EQUAL(t1, reg) && TCGV_EQUAL(reg, cpu_cc_src)) { - tcg_gen_mov_tl(cpu_tmp0, cpu_cc_src); - t1 = cpu_tmp0; - } - - tcg_gen_add_tl(reg, cpu_cc_dst, cpu_cc_src); - tcg_gen_addi_tl(reg, reg, 1); - gen_extu(size, reg); - t0 = reg; - goto adc_sbb; - - case CC_OP_ADCB ... CC_OP_ADCQ: - /* (DATA_TYPE)CC_DST <= (DATA_TYPE)CC_SRC */ - size = s->cc_op - CC_OP_ADCB; - t1 = gen_ext_tl(cpu_tmp0, cpu_cc_src, size, false); - t0 = gen_ext_tl(reg, cpu_cc_dst, size, false); - adc_sbb: - return (CCPrepare) { .cond = TCG_COND_LEU, .reg = t0, - .reg2 = t1, .mask = -1, .use_reg2 = true }; - case CC_OP_LOGICB ... CC_OP_LOGICQ: return (CCPrepare) { .cond = TCG_COND_NEVER, .mask = -1 }; @@ -1004,7 +995,8 @@ static CCPrepare gen_prepare_eflags_c(DisasContext *s, TCGv reg) /* The need to compute only C from CC_OP_DYNAMIC is important in efficiently implementing e.g. INC at the start of a TB. */ gen_update_cc_op(s); - gen_helper_cc_compute_c(reg, cpu_cc_dst, cpu_cc_src, cpu_cc_op); + gen_helper_cc_compute_c(reg, cpu_cc_dst, cpu_cc_src, + cpu_cc_src2, cpu_cc_op); return (CCPrepare) { .cond = TCG_COND_NE, .reg = reg, .mask = -1, .no_setcond = true }; } @@ -1442,18 +1434,10 @@ static void gen_op(DisasContext *s1, int op, int ot, int d) gen_op_mov_reg_T0(ot, d); else gen_op_st_T0_A0(ot + s1->mem_index); - tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]); - tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); - tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_tmp4); - tcg_gen_shli_i32(cpu_tmp2_i32, cpu_tmp2_i32, 2); - tcg_gen_addi_i32(cpu_cc_op, cpu_tmp2_i32, CC_OP_ADDB + ot); - set_cc_op(s1, CC_OP_DYNAMIC); + gen_op_update3_cc(cpu_tmp4); + set_cc_op(s1, CC_OP_ADCB + ot); break; case OP_SBBL: - /* - * No need to store cpu_cc_srcT, because it is used only - * when the cc_op is known. - */ gen_compute_eflags_c(s1, cpu_tmp4); tcg_gen_sub_tl(cpu_T[0], cpu_T[0], cpu_T[1]); tcg_gen_sub_tl(cpu_T[0], cpu_T[0], cpu_tmp4); @@ -1461,12 +1445,8 @@ static void gen_op(DisasContext *s1, int op, int ot, int d) gen_op_mov_reg_T0(ot, d); else gen_op_st_T0_A0(ot + s1->mem_index); - tcg_gen_mov_tl(cpu_cc_src, cpu_T[1]); - tcg_gen_mov_tl(cpu_cc_dst, cpu_T[0]); - tcg_gen_trunc_tl_i32(cpu_tmp2_i32, cpu_tmp4); - tcg_gen_shli_i32(cpu_tmp2_i32, cpu_tmp2_i32, 2); - tcg_gen_addi_i32(cpu_cc_op, cpu_tmp2_i32, CC_OP_SUBB + ot); - set_cc_op(s1, CC_OP_DYNAMIC); + gen_op_update3_cc(cpu_tmp4); + set_cc_op(s1, CC_OP_SBBB + ot); break; case OP_ADDL: gen_op_addl_T0_T1(); @@ -7788,6 +7768,8 @@ void optimize_flags_init(void) "cc_dst"); cpu_cc_src = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_src), "cc_src"); + cpu_cc_src2 = tcg_global_mem_new(TCG_AREG0, offsetof(CPUX86State, cc_src2), + "cc_src2"); #ifdef TARGET_X86_64 cpu_regs[R_EAX] = tcg_global_mem_new_i64(TCG_AREG0, -- cgit v1.2.3