Diffstat (limited to 'target')
-rw-r--r--  target/i386/fpu_helper.c  211
1 file changed, 202 insertions(+), 9 deletions(-)
diff --git a/target/i386/fpu_helper.c b/target/i386/fpu_helper.c
index 8f34ea9776..884e84c804 100644
--- a/target/i386/fpu_helper.c
+++ b/target/i386/fpu_helper.c
@@ -1373,19 +1373,212 @@ void helper_fprem(CPUX86State *env)
     helper_fprem_common(env, true);
 }
 
+/* 128-bit significand of log2(e). */
+#define log2_e_sig_high 0xb8aa3b295c17f0bbULL
+#define log2_e_sig_low 0xbe87fed0691d3e89ULL
+
+/*
+ * Polynomial coefficients for an approximation to log2((1+x)/(1-x)),
+ * with only odd powers of x used, for x in the interval [2*sqrt(2)-3,
+ * 3-2*sqrt(2)], which corresponds to logarithms of numbers in the
+ * interval [sqrt(2)/2, sqrt(2)].
+ */
+#define fyl2x_coeff_0 make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL)
+#define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL)
+#define fyl2x_coeff_1 make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL)
+#define fyl2x_coeff_2 make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL)
+#define fyl2x_coeff_3 make_floatx80(0x3ffd, 0xd30bb153d808f269ULL)
+#define fyl2x_coeff_4 make_floatx80(0x3ffd, 0xa42589eaf451499eULL)
+#define fyl2x_coeff_5 make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL)
+#define fyl2x_coeff_6 make_floatx80(0x3ffc, 0xe3476578adf26272ULL)
+#define fyl2x_coeff_7 make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL)
+#define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
+#define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
+
 void helper_fyl2xp1(CPUX86State *env)
 {
-    double fptemp = floatx80_to_double(env, ST0);
-
-    if ((fptemp + 1.0) > 0.0) {
-        fptemp = log(fptemp + 1.0) / log(2.0); /* log2(ST + 1.0) */
-        fptemp *= floatx80_to_double(env, ST1);
-        ST1 = double_to_floatx80(env, fptemp);
-        fpop(env);
+    uint8_t old_flags = save_exception_flags(env);
+    uint64_t arg0_sig = extractFloatx80Frac(ST0);
+    int32_t arg0_exp = extractFloatx80Exp(ST0);
+    bool arg0_sign = extractFloatx80Sign(ST0);
+    uint64_t arg1_sig = extractFloatx80Frac(ST1);
+    int32_t arg1_exp = extractFloatx80Exp(ST1);
+    bool arg1_sign = extractFloatx80Sign(ST1);
+
+    if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
+        float_raise(float_flag_invalid, &env->fp_status);
+        ST1 = floatx80_silence_nan(ST0, &env->fp_status);
+    } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
+        float_raise(float_flag_invalid, &env->fp_status);
+        ST1 = floatx80_silence_nan(ST1, &env->fp_status);
+    } else if (floatx80_invalid_encoding(ST0) ||
+               floatx80_invalid_encoding(ST1)) {
+        float_raise(float_flag_invalid, &env->fp_status);
+        ST1 = floatx80_default_nan(&env->fp_status);
+    } else if (floatx80_is_any_nan(ST0)) {
+        ST1 = ST0;
+    } else if (floatx80_is_any_nan(ST1)) {
+        /* Pass this NaN through. */
+    } else if (arg0_exp > 0x3ffd ||
+               (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ?
+                                                  0x95f619980c4336f7ULL :
+                                                  0xd413cccfe7799211ULL))) {
+        /*
+         * Out of range for the instruction (ST0 must have absolute
+         * value less than 1 - sqrt(2)/2 = 0.292..., according to
+         * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1
+         * to sqrt(2) - 1, which we allow here), treat as invalid.
+         */
+        float_raise(float_flag_invalid, &env->fp_status);
+        ST1 = floatx80_default_nan(&env->fp_status);
+    } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
+               arg1_exp == 0x7fff) {
+        /*
+         * One argument is zero, or multiplying by infinity; correct
+         * result is exact and can be obtained by multiplying the
+         * arguments.
+         */
+        ST1 = floatx80_mul(ST0, ST1, &env->fp_status);
+    } else if (arg0_exp < 0x3fb0) {
+        /*
+         * Multiplying both arguments and an extra-precision version
+         * of log2(e) is sufficiently precise.
+         */
+        uint64_t sig0, sig1, sig2;
+        int32_t exp;
+        if (arg0_exp == 0) {
+            normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
+        }
+        if (arg1_exp == 0) {
+            normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
+        }
+        mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig,
+                        &sig0, &sig1, &sig2);
+        exp = arg0_exp + 1;
+        mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2);
+        exp += arg1_exp - 0x3ffe;
+        /* This result is inexact. */
+        sig1 |= 1;
+        ST1 = normalizeRoundAndPackFloatx80(80, arg0_sign ^ arg1_sign, exp,
+                                            sig0, sig1, &env->fp_status);
     } else {
-        env->fpus &= ~0x4700;
-        env->fpus |= 0x400;
+        bool asign;
+        uint32_t dexp, texp, aexp;
+        uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
+        uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
+        uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
+        floatx80 t2, accum;
+        FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
+        signed char save_prec = env->fp_status.floatx80_rounding_precision;
+        env->fp_status.float_rounding_mode = float_round_nearest_even;
+        env->fp_status.floatx80_rounding_precision = 80;
+
+        /*
+         * Compute an approximation of ST0/(2+ST0), with extra
+         * precision, as the argument to a polynomial approximation.
+         * The extra precision is only needed for the first term of
+         * the approximation, with subsequent terms being
+         * significantly smaller; the approximation only uses odd
+         * exponents, and the square of ST0/(2+ST0) is at most
+         * 17-12*sqrt(2) = 0.029....
+         */
+        if (arg0_sign) {
+            dexp = 0x3fff;
+            shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
+            sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
+        } else {
+            dexp = 0x4000;
+            shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
+            dsig0 |= 0x8000000000000000ULL;
+        }
+        texp = arg0_exp - dexp + 0x3ffe;
+        rsig0 = arg0_sig;
+        rsig1 = 0;
+        rsig2 = 0;
+        if (dsig0 <= rsig0) {
+            shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
+            ++texp;
+        }
+        tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
+        mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
+        sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
+               &rsig0, &rsig1, &rsig2);
+        while ((int64_t) rsig0 < 0) {
+            --tsig0;
+            add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
+                   &rsig0, &rsig1, &rsig2);
+        }
+        tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
+        /*
+         * No need to correct any estimation error in tsig1; even with
+         * such error, it is accurate enough. Now compute the square
+         * of that approximation.
+         */
+        mul128To256(tsig0, tsig1, tsig0, tsig1,
+                    &t2sig0, &t2sig1, &t2sig2, &t2sig3);
+        t2 = normalizeRoundAndPackFloatx80(80, false, texp + texp - 0x3ffe,
+                                           t2sig0, t2sig1, &env->fp_status);
+
+        /* Compute the lower parts of the polynomial expansion. */
+        accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
+        accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
+        accum = floatx80_mul(accum, t2, &env->fp_status);
+        accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
+        accum = floatx80_mul(accum, t2, &env->fp_status);
+        accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
+        accum = floatx80_mul(accum, t2, &env->fp_status);
+        accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
+        accum = floatx80_mul(accum, t2, &env->fp_status);
+        accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
+        accum = floatx80_mul(accum, t2, &env->fp_status);
+        accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
+        accum = floatx80_mul(accum, t2, &env->fp_status);
+        accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
+        accum = floatx80_mul(accum, t2, &env->fp_status);
+        accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
+        accum = floatx80_mul(accum, t2, &env->fp_status);
+        accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);
+
+        /*
+         * The full polynomial expansion is fyl2x_coeff_0 + accum
+         * (where accum has much lower magnitude, and so, in
+         * particular, carry out of the addition is not possible),
+         * multiplied by t. (This expansion is only accurate to about
+         * 70 bits, not 128 bits.)
+         */
+        aexp = extractFloatx80Exp(fyl2x_coeff_0);
+        asign = extractFloatx80Sign(fyl2x_coeff_0);
+        shift128RightJamming(extractFloatx80Frac(accum), 0,
+                             aexp - extractFloatx80Exp(accum),
+                             &asig0, &asig1);
+        bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
+        bsig1 = 0;
+        if (asign == extractFloatx80Sign(accum)) {
+            add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
+        } else {
+            sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
+        }
+        /*
+         * Multiply by t and by the second argument to compute the
+         * required result.
+         */
+        mul128To256(asig0, asig1, tsig0, tsig1,
+                    &asig0, &asig1, &asig2, &asig3);
+        aexp += texp - 0x3ffe;
+        if (arg1_exp == 0) {
+            normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
+        }
+        mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
+        aexp += arg1_exp - 0x3ffe;
+        /* This result is inexact. */
+        asig1 |= 1;
+        env->fp_status.float_rounding_mode = save_mode;
+        ST1 = normalizeRoundAndPackFloatx80(80, arg0_sign ^ arg1_sign, aexp,
+                                            asig0, asig1, &env->fp_status);
+        env->fp_status.floatx80_rounding_precision = save_prec;
     }
+    fpop(env);
+    merge_exception_flags(env, old_flags);
 }
 
 void helper_fsqrt(CPUX86State *env)
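The argument reduction above rests on the identity that, for t = x/(2+x), (1+t)/(1-t) = 1+x, so log2(1+x) = log2((1+t)/(1-t)) = (2/ln 2)*atanh(t), which expands in odd powers of t only; the patch evaluates a fixed-degree polynomial in t^2 (the fyl2x_coeff_* constants) with 128-bit intermediates. The standalone sketch below is not part of the commit: it checks the same reduction in plain double precision, substituting the exact Taylor series of atanh(t) for the patch's coefficients, and fyl2xp1_ref() is a hypothetical name used only for this illustration.

/*
 * Sketch only: double-precision check of the t = x/(2+x) reduction used
 * by helper_fyl2xp1 above.  Build with: cc -O2 fyl2xp1_sketch.c -lm
 */
#include <math.h>
#include <stdio.h>

/* y * log2(x + 1) via the odd-power expansion in t = x / (2 + x). */
static double fyl2xp1_ref(double x, double y)
{
    double t = x / (2.0 + x);
    double t2 = t * t;
    double sum = 0.0;
    double pw = t;                  /* current odd power of t */

    for (int k = 1; k <= 19; k += 2) {
        sum += pw / k;              /* t^k / k, the atanh(t) series */
        pw *= t2;
    }
    /* log2(1+x) = 2 * atanh(t) / ln(2), then scale by y. */
    return y * (2.0 / log(2.0)) * sum;
}

int main(void)
{
    double xs[] = { -0.25, -1e-6, 0.1, 0.375, 0.414 };

    for (size_t i = 0; i < sizeof(xs) / sizeof(xs[0]); i++) {
        double x = xs[i];
        double ref = 3.0 * log2(1.0 + x);   /* libm reference, y = 3 */
        double got = fyl2xp1_ref(x, 3.0);
        printf("x=%+.6f  ref=%+.17g  sketch=%+.17g  diff=%g\n",
               x, ref, got, got - ref);
    }
    return 0;
}

Because |t| <= 3 - 2*sqrt(2) ~= 0.17 over the instruction's accepted range, ten odd terms already converge below double-precision rounding error, which is why a short fixed-degree polynomial also suffices in the extended-precision helper.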