Diffstat (limited to 'fpu/softfloat.c')
-rw-r--r-- | fpu/softfloat.c | 865 |
1 file changed, 817 insertions, 48 deletions
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index e1eef954e6..59eac97d10 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -83,6 +83,7 @@ this code that are retained.
  * target-dependent and needs the TARGET_* macros.
  */
 #include "qemu/osdep.h"
+#include <math.h>
 #include "qemu/bitops.h"
 #include "fpu/softfloat.h"

@@ -95,6 +96,324 @@ this code that are retained.
 *----------------------------------------------------------------------------*/
 #include "fpu/softfloat-macros.h"

+/*
+ * Hardfloat
+ *
+ * Fast emulation of guest FP instructions is challenging for two reasons.
+ * First, FP instruction semantics are similar but not identical, particularly
+ * when handling NaNs. Second, emulating at reasonable speed the guest FP
+ * exception flags is not trivial: reading the host's flags register with a
+ * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
+ * and trapping on every FP exception is not fast nor pleasant to work with.
+ *
+ * We address these challenges by leveraging the host FPU for a subset of the
+ * operations. To do this we expand on the idea presented in this paper:
+ *
+ * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
+ * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
+ *
+ * The idea is thus to leverage the host FPU to (1) compute FP operations
+ * and (2) identify whether FP exceptions occurred while avoiding
+ * expensive exception flag register accesses.
+ *
+ * An important optimization shown in the paper is that given that exception
+ * flags are rarely cleared by the guest, we can avoid recomputing some flags.
+ * This is particularly useful for the inexact flag, which is very frequently
+ * raised in floating-point workloads.
+ *
+ * We optimize the code further by deferring to soft-fp whenever FP exception
+ * detection might get hairy. Two examples: (1) when at least one operand is
+ * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
+ * and the result is < the minimum normal.
+ */ +#define GEN_INPUT_FLUSH__NOCHECK(name, soft_t) \ + static inline void name(soft_t *a, float_status *s) \ + { \ + if (unlikely(soft_t ## _is_denormal(*a))) { \ + *a = soft_t ## _set_sign(soft_t ## _zero, \ + soft_t ## _is_neg(*a)); \ + s->float_exception_flags |= float_flag_input_denormal; \ + } \ + } + +GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32) +GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64) +#undef GEN_INPUT_FLUSH__NOCHECK + +#define GEN_INPUT_FLUSH1(name, soft_t) \ + static inline void name(soft_t *a, float_status *s) \ + { \ + if (likely(!s->flush_inputs_to_zero)) { \ + return; \ + } \ + soft_t ## _input_flush__nocheck(a, s); \ + } + +GEN_INPUT_FLUSH1(float32_input_flush1, float32) +GEN_INPUT_FLUSH1(float64_input_flush1, float64) +#undef GEN_INPUT_FLUSH1 + +#define GEN_INPUT_FLUSH2(name, soft_t) \ + static inline void name(soft_t *a, soft_t *b, float_status *s) \ + { \ + if (likely(!s->flush_inputs_to_zero)) { \ + return; \ + } \ + soft_t ## _input_flush__nocheck(a, s); \ + soft_t ## _input_flush__nocheck(b, s); \ + } + +GEN_INPUT_FLUSH2(float32_input_flush2, float32) +GEN_INPUT_FLUSH2(float64_input_flush2, float64) +#undef GEN_INPUT_FLUSH2 + +#define GEN_INPUT_FLUSH3(name, soft_t) \ + static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \ + { \ + if (likely(!s->flush_inputs_to_zero)) { \ + return; \ + } \ + soft_t ## _input_flush__nocheck(a, s); \ + soft_t ## _input_flush__nocheck(b, s); \ + soft_t ## _input_flush__nocheck(c, s); \ + } + +GEN_INPUT_FLUSH3(float32_input_flush3, float32) +GEN_INPUT_FLUSH3(float64_input_flush3, float64) +#undef GEN_INPUT_FLUSH3 + +/* + * Choose whether to use fpclassify or float32/64_* primitives in the generated + * hardfloat functions. Each combination of number of inputs and float size + * gets its own value. + */ +#if defined(__x86_64__) +# define QEMU_HARDFLOAT_1F32_USE_FP 0 +# define QEMU_HARDFLOAT_1F64_USE_FP 1 +# define QEMU_HARDFLOAT_2F32_USE_FP 0 +# define QEMU_HARDFLOAT_2F64_USE_FP 1 +# define QEMU_HARDFLOAT_3F32_USE_FP 0 +# define QEMU_HARDFLOAT_3F64_USE_FP 1 +#else +# define QEMU_HARDFLOAT_1F32_USE_FP 0 +# define QEMU_HARDFLOAT_1F64_USE_FP 0 +# define QEMU_HARDFLOAT_2F32_USE_FP 0 +# define QEMU_HARDFLOAT_2F64_USE_FP 0 +# define QEMU_HARDFLOAT_3F32_USE_FP 0 +# define QEMU_HARDFLOAT_3F64_USE_FP 0 +#endif + +/* + * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over + * float{32,64}_is_infinity when !USE_FP. + * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup. + * On power64 however, using isinf() reduces fp-bench performance by up to 50%. + */ +#if defined(__x86_64__) || defined(__aarch64__) +# define QEMU_HARDFLOAT_USE_ISINF 1 +#else +# define QEMU_HARDFLOAT_USE_ISINF 0 +#endif + +/* + * Some targets clear the FP flags before most FP operations. This prevents + * the use of hardfloat, since hardfloat relies on the inexact flag being + * already set. 
+ */ +#if defined(TARGET_PPC) || defined(__FAST_MATH__) +# if defined(__FAST_MATH__) +# warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \ + IEEE implementation +# endif +# define QEMU_NO_HARDFLOAT 1 +# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN +#else +# define QEMU_NO_HARDFLOAT 0 +# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline)) +#endif + +static inline bool can_use_fpu(const float_status *s) +{ + if (QEMU_NO_HARDFLOAT) { + return false; + } + return likely(s->float_exception_flags & float_flag_inexact && + s->float_rounding_mode == float_round_nearest_even); +} + +/* + * Hardfloat generation functions. Each operation can have two flavors: + * either using softfloat primitives (e.g. float32_is_zero_or_normal) for + * most condition checks, or native ones (e.g. fpclassify). + * + * The flavor is chosen by the callers. Instead of using macros, we rely on the + * compiler to propagate constants and inline everything into the callers. + * + * We only generate functions for operations with two inputs, since only + * these are common enough to justify consolidating them into common code. + */ + +typedef union { + float32 s; + float h; +} union_float32; + +typedef union { + float64 s; + double h; +} union_float64; + +typedef bool (*f32_check_fn)(union_float32 a, union_float32 b); +typedef bool (*f64_check_fn)(union_float64 a, union_float64 b); + +typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s); +typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s); +typedef float (*hard_f32_op2_fn)(float a, float b); +typedef double (*hard_f64_op2_fn)(double a, double b); + +/* 2-input is-zero-or-normal */ +static inline bool f32_is_zon2(union_float32 a, union_float32 b) +{ + if (QEMU_HARDFLOAT_2F32_USE_FP) { + /* + * Not using a temp variable for consecutive fpclassify calls ends up + * generating faster code. 
+ */ + return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && + (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); + } + return float32_is_zero_or_normal(a.s) && + float32_is_zero_or_normal(b.s); +} + +static inline bool f64_is_zon2(union_float64 a, union_float64 b) +{ + if (QEMU_HARDFLOAT_2F64_USE_FP) { + return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && + (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO); + } + return float64_is_zero_or_normal(a.s) && + float64_is_zero_or_normal(b.s); +} + +/* 3-input is-zero-or-normal */ +static inline +bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c) +{ + if (QEMU_HARDFLOAT_3F32_USE_FP) { + return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && + (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && + (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); + } + return float32_is_zero_or_normal(a.s) && + float32_is_zero_or_normal(b.s) && + float32_is_zero_or_normal(c.s); +} + +static inline +bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c) +{ + if (QEMU_HARDFLOAT_3F64_USE_FP) { + return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && + (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) && + (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO); + } + return float64_is_zero_or_normal(a.s) && + float64_is_zero_or_normal(b.s) && + float64_is_zero_or_normal(c.s); +} + +static inline bool f32_is_inf(union_float32 a) +{ + if (QEMU_HARDFLOAT_USE_ISINF) { + return isinf(a.h); + } + return float32_is_infinity(a.s); +} + +static inline bool f64_is_inf(union_float64 a) +{ + if (QEMU_HARDFLOAT_USE_ISINF) { + return isinf(a.h); + } + return float64_is_infinity(a.s); +} + +/* Note: @fast_test and @post can be NULL */ +static inline float32 +float32_gen2(float32 xa, float32 xb, float_status *s, + hard_f32_op2_fn hard, soft_f32_op2_fn soft, + f32_check_fn pre, f32_check_fn post, + f32_check_fn fast_test, soft_f32_op2_fn fast_op) +{ + union_float32 ua, ub, ur; + + ua.s = xa; + ub.s = xb; + + if (unlikely(!can_use_fpu(s))) { + goto soft; + } + + float32_input_flush2(&ua.s, &ub.s, s); + if (unlikely(!pre(ua, ub))) { + goto soft; + } + if (fast_test && fast_test(ua, ub)) { + return fast_op(ua.s, ub.s, s); + } + + ur.h = hard(ua.h, ub.h); + if (unlikely(f32_is_inf(ur))) { + s->float_exception_flags |= float_flag_overflow; + } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { + if (post == NULL || post(ua, ub)) { + goto soft; + } + } + return ur.s; + + soft: + return soft(ua.s, ub.s, s); +} + +static inline float64 +float64_gen2(float64 xa, float64 xb, float_status *s, + hard_f64_op2_fn hard, soft_f64_op2_fn soft, + f64_check_fn pre, f64_check_fn post, + f64_check_fn fast_test, soft_f64_op2_fn fast_op) +{ + union_float64 ua, ub, ur; + + ua.s = xa; + ub.s = xb; + + if (unlikely(!can_use_fpu(s))) { + goto soft; + } + + float64_input_flush2(&ua.s, &ub.s, s); + if (unlikely(!pre(ua, ub))) { + goto soft; + } + if (fast_test && fast_test(ua, ub)) { + return fast_op(ua.s, ub.s, s); + } + + ur.h = hard(ua.h, ub.h); + if (unlikely(f64_is_inf(ur))) { + s->float_exception_flags |= float_flag_overflow; + } else if (unlikely(fabs(ur.h) <= DBL_MIN)) { + if (post == NULL || post(ua, ub)) { + goto soft; + } + } + return ur.s; + + soft: + return soft(ua.s, ub.s, s); +} + /*---------------------------------------------------------------------------- | Returns the fraction bits of the half-precision floating-point value `a'. 
*----------------------------------------------------------------------------*/ @@ -336,8 +655,8 @@ static inline float64 float64_pack_raw(FloatParts p) #include "softfloat-specialize.h" /* Canonicalize EXP and FRAC, setting CLS. */ -static FloatParts canonicalize(FloatParts part, const FloatFmt *parm, - float_status *status) +static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm, + float_status *status) { if (part.exp == parm->exp_max && !parm->arm_althp) { if (part.frac == 0) { @@ -513,7 +832,7 @@ static FloatParts round_canonical(FloatParts p, float_status *s, static FloatParts float16a_unpack_canonical(float16 f, float_status *s, const FloatFmt *params) { - return canonicalize(float16_unpack_raw(f), params, s); + return sf_canonicalize(float16_unpack_raw(f), params, s); } static FloatParts float16_unpack_canonical(float16 f, float_status *s) @@ -534,7 +853,7 @@ static float16 float16_round_pack_canonical(FloatParts p, float_status *s) static FloatParts float32_unpack_canonical(float32 f, float_status *s) { - return canonicalize(float32_unpack_raw(f), &float32_params, s); + return sf_canonicalize(float32_unpack_raw(f), &float32_params, s); } static float32 float32_round_pack_canonical(FloatParts p, float_status *s) @@ -544,7 +863,7 @@ static float32 float32_round_pack_canonical(FloatParts p, float_status *s) static FloatParts float64_unpack_canonical(float64 f, float_status *s) { - return canonicalize(float64_unpack_raw(f), &float64_params, s); + return sf_canonicalize(float64_unpack_raw(f), &float64_params, s); } static float64 float64_round_pack_canonical(FloatParts p, float_status *s) @@ -735,49 +1054,128 @@ float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status) return float16_round_pack_canonical(pr, status); } -float32 QEMU_FLATTEN float32_add(float32 a, float32 b, float_status *status) +float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status) +{ + FloatParts pa = float16_unpack_canonical(a, status); + FloatParts pb = float16_unpack_canonical(b, status); + FloatParts pr = addsub_floats(pa, pb, true, status); + + return float16_round_pack_canonical(pr, status); +} + +static float32 QEMU_SOFTFLOAT_ATTR +soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status) { FloatParts pa = float32_unpack_canonical(a, status); FloatParts pb = float32_unpack_canonical(b, status); - FloatParts pr = addsub_floats(pa, pb, false, status); + FloatParts pr = addsub_floats(pa, pb, subtract, status); return float32_round_pack_canonical(pr, status); } -float64 QEMU_FLATTEN float64_add(float64 a, float64 b, float_status *status) +static inline float32 soft_f32_add(float32 a, float32 b, float_status *status) +{ + return soft_f32_addsub(a, b, false, status); +} + +static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status) +{ + return soft_f32_addsub(a, b, true, status); +} + +static float64 QEMU_SOFTFLOAT_ATTR +soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status) { FloatParts pa = float64_unpack_canonical(a, status); FloatParts pb = float64_unpack_canonical(b, status); - FloatParts pr = addsub_floats(pa, pb, false, status); + FloatParts pr = addsub_floats(pa, pb, subtract, status); return float64_round_pack_canonical(pr, status); } -float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status) +static inline float64 soft_f64_add(float64 a, float64 b, float_status *status) { - FloatParts pa = float16_unpack_canonical(a, status); - FloatParts pb = float16_unpack_canonical(b, 
status); - FloatParts pr = addsub_floats(pa, pb, true, status); + return soft_f64_addsub(a, b, false, status); +} - return float16_round_pack_canonical(pr, status); +static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status) +{ + return soft_f64_addsub(a, b, true, status); } -float32 QEMU_FLATTEN float32_sub(float32 a, float32 b, float_status *status) +static float hard_f32_add(float a, float b) { - FloatParts pa = float32_unpack_canonical(a, status); - FloatParts pb = float32_unpack_canonical(b, status); - FloatParts pr = addsub_floats(pa, pb, true, status); + return a + b; +} - return float32_round_pack_canonical(pr, status); +static float hard_f32_sub(float a, float b) +{ + return a - b; } -float64 QEMU_FLATTEN float64_sub(float64 a, float64 b, float_status *status) +static double hard_f64_add(double a, double b) { - FloatParts pa = float64_unpack_canonical(a, status); - FloatParts pb = float64_unpack_canonical(b, status); - FloatParts pr = addsub_floats(pa, pb, true, status); + return a + b; +} - return float64_round_pack_canonical(pr, status); +static double hard_f64_sub(double a, double b) +{ + return a - b; +} + +static bool f32_addsub_post(union_float32 a, union_float32 b) +{ + if (QEMU_HARDFLOAT_2F32_USE_FP) { + return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); + } + return !(float32_is_zero(a.s) && float32_is_zero(b.s)); +} + +static bool f64_addsub_post(union_float64 a, union_float64 b) +{ + if (QEMU_HARDFLOAT_2F64_USE_FP) { + return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO); + } else { + return !(float64_is_zero(a.s) && float64_is_zero(b.s)); + } +} + +static float32 float32_addsub(float32 a, float32 b, float_status *s, + hard_f32_op2_fn hard, soft_f32_op2_fn soft) +{ + return float32_gen2(a, b, s, hard, soft, + f32_is_zon2, f32_addsub_post, NULL, NULL); +} + +static float64 float64_addsub(float64 a, float64 b, float_status *s, + hard_f64_op2_fn hard, soft_f64_op2_fn soft) +{ + return float64_gen2(a, b, s, hard, soft, + f64_is_zon2, f64_addsub_post, NULL, NULL); +} + +float32 QEMU_FLATTEN +float32_add(float32 a, float32 b, float_status *s) +{ + return float32_addsub(a, b, s, hard_f32_add, soft_f32_add); +} + +float32 QEMU_FLATTEN +float32_sub(float32 a, float32 b, float_status *s) +{ + return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub); +} + +float64 QEMU_FLATTEN +float64_add(float64 a, float64 b, float_status *s) +{ + return float64_addsub(a, b, s, hard_f64_add, soft_f64_add); +} + +float64 QEMU_FLATTEN +float64_sub(float64 a, float64 b, float_status *s) +{ + return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub); } /* @@ -838,7 +1236,8 @@ float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status) return float16_round_pack_canonical(pr, status); } -float32 QEMU_FLATTEN float32_mul(float32 a, float32 b, float_status *status) +static float32 QEMU_SOFTFLOAT_ATTR +soft_f32_mul(float32 a, float32 b, float_status *status) { FloatParts pa = float32_unpack_canonical(a, status); FloatParts pb = float32_unpack_canonical(b, status); @@ -847,7 +1246,8 @@ float32 QEMU_FLATTEN float32_mul(float32 a, float32 b, float_status *status) return float32_round_pack_canonical(pr, status); } -float64 QEMU_FLATTEN float64_mul(float64 a, float64 b, float_status *status) +static float64 QEMU_SOFTFLOAT_ATTR +soft_f64_mul(float64 a, float64 b, float_status *status) { FloatParts pa = float64_unpack_canonical(a, status); FloatParts pb = float64_unpack_canonical(b, status); @@ -856,6 +1256,54 @@ float64 QEMU_FLATTEN 
float64_mul(float64 a, float64 b, float_status *status) return float64_round_pack_canonical(pr, status); } +static float hard_f32_mul(float a, float b) +{ + return a * b; +} + +static double hard_f64_mul(double a, double b) +{ + return a * b; +} + +static bool f32_mul_fast_test(union_float32 a, union_float32 b) +{ + return float32_is_zero(a.s) || float32_is_zero(b.s); +} + +static bool f64_mul_fast_test(union_float64 a, union_float64 b) +{ + return float64_is_zero(a.s) || float64_is_zero(b.s); +} + +static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s) +{ + bool signbit = float32_is_neg(a) ^ float32_is_neg(b); + + return float32_set_sign(float32_zero, signbit); +} + +static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s) +{ + bool signbit = float64_is_neg(a) ^ float64_is_neg(b); + + return float64_set_sign(float64_zero, signbit); +} + +float32 QEMU_FLATTEN +float32_mul(float32 a, float32 b, float_status *s) +{ + return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul, + f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op); +} + +float64 QEMU_FLATTEN +float64_mul(float64 a, float64 b, float_status *s) +{ + return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul, + f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op); +} + /* * Returns the result of multiplying the floating-point values `a' and * `b' then adding 'c', with no intermediate rounding step after the @@ -1070,8 +1518,9 @@ float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c, return float16_round_pack_canonical(pr, status); } -float32 QEMU_FLATTEN float32_muladd(float32 a, float32 b, float32 c, - int flags, float_status *status) +static float32 QEMU_SOFTFLOAT_ATTR +soft_f32_muladd(float32 a, float32 b, float32 c, int flags, + float_status *status) { FloatParts pa = float32_unpack_canonical(a, status); FloatParts pb = float32_unpack_canonical(b, status); @@ -1081,8 +1530,9 @@ float32 QEMU_FLATTEN float32_muladd(float32 a, float32 b, float32 c, return float32_round_pack_canonical(pr, status); } -float64 QEMU_FLATTEN float64_muladd(float64 a, float64 b, float64 c, - int flags, float_status *status) +static float64 QEMU_SOFTFLOAT_ATTR +soft_f64_muladd(float64 a, float64 b, float64 c, int flags, + float_status *status) { FloatParts pa = float64_unpack_canonical(a, status); FloatParts pb = float64_unpack_canonical(b, status); @@ -1092,6 +1542,128 @@ float64 QEMU_FLATTEN float64_muladd(float64 a, float64 b, float64 c, return float64_round_pack_canonical(pr, status); } +float32 QEMU_FLATTEN +float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s) +{ + union_float32 ua, ub, uc, ur; + + ua.s = xa; + ub.s = xb; + uc.s = xc; + + if (unlikely(!can_use_fpu(s))) { + goto soft; + } + if (unlikely(flags & float_muladd_halve_result)) { + goto soft; + } + + float32_input_flush3(&ua.s, &ub.s, &uc.s, s); + if (unlikely(!f32_is_zon3(ua, ub, uc))) { + goto soft; + } + /* + * When (a || b) == 0, there's no need to check for under/over flow, + * since we know the addend is (normal || 0) and the product is 0. 
+ */ + if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) { + union_float32 up; + bool prod_sign; + + prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s); + prod_sign ^= !!(flags & float_muladd_negate_product); + up.s = float32_set_sign(float32_zero, prod_sign); + + if (flags & float_muladd_negate_c) { + uc.h = -uc.h; + } + ur.h = up.h + uc.h; + } else { + if (flags & float_muladd_negate_product) { + ua.h = -ua.h; + } + if (flags & float_muladd_negate_c) { + uc.h = -uc.h; + } + + ur.h = fmaf(ua.h, ub.h, uc.h); + + if (unlikely(f32_is_inf(ur))) { + s->float_exception_flags |= float_flag_overflow; + } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) { + goto soft; + } + } + if (flags & float_muladd_negate_result) { + return float32_chs(ur.s); + } + return ur.s; + + soft: + return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s); +} + +float64 QEMU_FLATTEN +float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s) +{ + union_float64 ua, ub, uc, ur; + + ua.s = xa; + ub.s = xb; + uc.s = xc; + + if (unlikely(!can_use_fpu(s))) { + goto soft; + } + if (unlikely(flags & float_muladd_halve_result)) { + goto soft; + } + + float64_input_flush3(&ua.s, &ub.s, &uc.s, s); + if (unlikely(!f64_is_zon3(ua, ub, uc))) { + goto soft; + } + /* + * When (a || b) == 0, there's no need to check for under/over flow, + * since we know the addend is (normal || 0) and the product is 0. + */ + if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) { + union_float64 up; + bool prod_sign; + + prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s); + prod_sign ^= !!(flags & float_muladd_negate_product); + up.s = float64_set_sign(float64_zero, prod_sign); + + if (flags & float_muladd_negate_c) { + uc.h = -uc.h; + } + ur.h = up.h + uc.h; + } else { + if (flags & float_muladd_negate_product) { + ua.h = -ua.h; + } + if (flags & float_muladd_negate_c) { + uc.h = -uc.h; + } + + ur.h = fma(ua.h, ub.h, uc.h); + + if (unlikely(f64_is_inf(ur))) { + s->float_exception_flags |= float_flag_overflow; + } else if (unlikely(fabs(ur.h) <= FLT_MIN)) { + goto soft; + } + } + if (flags & float_muladd_negate_result) { + return float64_chs(ur.s); + } + return ur.s; + + soft: + return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s); +} + /* * Returns the result of dividing the floating-point value `a' by the * corresponding value `b'. 
The operation is performed according to @@ -1180,7 +1752,8 @@ float16 float16_div(float16 a, float16 b, float_status *status) return float16_round_pack_canonical(pr, status); } -float32 float32_div(float32 a, float32 b, float_status *status) +static float32 QEMU_SOFTFLOAT_ATTR +soft_f32_div(float32 a, float32 b, float_status *status) { FloatParts pa = float32_unpack_canonical(a, status); FloatParts pb = float32_unpack_canonical(b, status); @@ -1189,7 +1762,8 @@ float32 float32_div(float32 a, float32 b, float_status *status) return float32_round_pack_canonical(pr, status); } -float64 float64_div(float64 a, float64 b, float_status *status) +static float64 QEMU_SOFTFLOAT_ATTR +soft_f64_div(float64 a, float64 b, float_status *status) { FloatParts pa = float64_unpack_canonical(a, status); FloatParts pb = float64_unpack_canonical(b, status); @@ -1198,6 +1772,64 @@ float64 float64_div(float64 a, float64 b, float_status *status) return float64_round_pack_canonical(pr, status); } +static float hard_f32_div(float a, float b) +{ + return a / b; +} + +static double hard_f64_div(double a, double b) +{ + return a / b; +} + +static bool f32_div_pre(union_float32 a, union_float32 b) +{ + if (QEMU_HARDFLOAT_2F32_USE_FP) { + return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && + fpclassify(b.h) == FP_NORMAL; + } + return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s); +} + +static bool f64_div_pre(union_float64 a, union_float64 b) +{ + if (QEMU_HARDFLOAT_2F64_USE_FP) { + return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) && + fpclassify(b.h) == FP_NORMAL; + } + return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s); +} + +static bool f32_div_post(union_float32 a, union_float32 b) +{ + if (QEMU_HARDFLOAT_2F32_USE_FP) { + return fpclassify(a.h) != FP_ZERO; + } + return !float32_is_zero(a.s); +} + +static bool f64_div_post(union_float64 a, union_float64 b) +{ + if (QEMU_HARDFLOAT_2F64_USE_FP) { + return fpclassify(a.h) != FP_ZERO; + } + return !float64_is_zero(a.s); +} + +float32 QEMU_FLATTEN +float32_div(float32 a, float32 b, float_status *s) +{ + return float32_gen2(a, b, s, hard_f32_div, soft_f32_div, + f32_div_pre, f32_div_post, NULL, NULL); +} + +float64 QEMU_FLATTEN +float64_div(float64 a, float64 b, float_status *s) +{ + return float64_gen2(a, b, s, hard_f64_div, soft_f64_div, + f64_div_pre, f64_div_post, NULL, NULL); +} + /* * Float to Float conversions * @@ -2271,28 +2903,109 @@ static int compare_floats(FloatParts a, FloatParts b, bool is_quiet, } } -#define COMPARE(sz) \ -int float ## sz ## _compare(float ## sz a, float ## sz b, \ - float_status *s) \ +#define COMPARE(name, attr, sz) \ +static int attr \ +name(float ## sz a, float ## sz b, bool is_quiet, float_status *s) \ { \ FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ - return compare_floats(pa, pb, false, s); \ -} \ -int float ## sz ## _compare_quiet(float ## sz a, float ## sz b, \ - float_status *s) \ -{ \ - FloatParts pa = float ## sz ## _unpack_canonical(a, s); \ - FloatParts pb = float ## sz ## _unpack_canonical(b, s); \ - return compare_floats(pa, pb, true, s); \ + return compare_floats(pa, pb, is_quiet, s); \ } -COMPARE(16) -COMPARE(32) -COMPARE(64) +COMPARE(soft_f16_compare, QEMU_FLATTEN, 16) +COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32) +COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64) #undef COMPARE +int float16_compare(float16 a, float16 b, float_status *s) +{ + return soft_f16_compare(a, b, false, s); +} 
+ +int float16_compare_quiet(float16 a, float16 b, float_status *s) +{ + return soft_f16_compare(a, b, true, s); +} + +static int QEMU_FLATTEN +f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s) +{ + union_float32 ua, ub; + + ua.s = xa; + ub.s = xb; + + if (QEMU_NO_HARDFLOAT) { + goto soft; + } + + float32_input_flush2(&ua.s, &ub.s, s); + if (isgreaterequal(ua.h, ub.h)) { + if (isgreater(ua.h, ub.h)) { + return float_relation_greater; + } + return float_relation_equal; + } + if (likely(isless(ua.h, ub.h))) { + return float_relation_less; + } + /* The only condition remaining is unordered. + * Fall through to set flags. + */ + soft: + return soft_f32_compare(ua.s, ub.s, is_quiet, s); +} + +int float32_compare(float32 a, float32 b, float_status *s) +{ + return f32_compare(a, b, false, s); +} + +int float32_compare_quiet(float32 a, float32 b, float_status *s) +{ + return f32_compare(a, b, true, s); +} + +static int QEMU_FLATTEN +f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s) +{ + union_float64 ua, ub; + + ua.s = xa; + ub.s = xb; + + if (QEMU_NO_HARDFLOAT) { + goto soft; + } + + float64_input_flush2(&ua.s, &ub.s, s); + if (isgreaterequal(ua.h, ub.h)) { + if (isgreater(ua.h, ub.h)) { + return float_relation_greater; + } + return float_relation_equal; + } + if (likely(isless(ua.h, ub.h))) { + return float_relation_less; + } + /* The only condition remaining is unordered. + * Fall through to set flags. + */ + soft: + return soft_f64_compare(ua.s, ub.s, is_quiet, s); +} + +int float64_compare(float64 a, float64 b, float_status *s) +{ + return f64_compare(a, b, false, s); +} + +int float64_compare_quiet(float64 a, float64 b, float_status *s) +{ + return f64_compare(a, b, true, s); +} + /* Multiply A by 2 raised to the power N. 
 */
 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
 {
@@ -2412,20 +3125,76 @@ float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
     return float16_round_pack_canonical(pr, status);
 }

-float32 QEMU_FLATTEN float32_sqrt(float32 a, float_status *status)
+static float32 QEMU_SOFTFLOAT_ATTR
+soft_f32_sqrt(float32 a, float_status *status)
 {
     FloatParts pa = float32_unpack_canonical(a, status);
     FloatParts pr = sqrt_float(pa, status, &float32_params);
     return float32_round_pack_canonical(pr, status);
 }

-float64 QEMU_FLATTEN float64_sqrt(float64 a, float_status *status)
+static float64 QEMU_SOFTFLOAT_ATTR
+soft_f64_sqrt(float64 a, float_status *status)
 {
     FloatParts pa = float64_unpack_canonical(a, status);
     FloatParts pr = sqrt_float(pa, status, &float64_params);
     return float64_round_pack_canonical(pr, status);
 }

+float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
+{
+    union_float32 ua, ur;
+
+    ua.s = xa;
+    if (unlikely(!can_use_fpu(s))) {
+        goto soft;
+    }
+
+    float32_input_flush1(&ua.s, s);
+    if (QEMU_HARDFLOAT_1F32_USE_FP) {
+        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
+                       fpclassify(ua.h) == FP_ZERO) ||
+                     signbit(ua.h))) {
+            goto soft;
+        }
+    } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
+                        float32_is_neg(ua.s))) {
+        goto soft;
+    }
+    ur.h = sqrtf(ua.h);
+    return ur.s;
+
+ soft:
+    return soft_f32_sqrt(ua.s, s);
+}
+
+float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
+{
+    union_float64 ua, ur;
+
+    ua.s = xa;
+    if (unlikely(!can_use_fpu(s))) {
+        goto soft;
+    }
+
+    float64_input_flush1(&ua.s, s);
+    if (QEMU_HARDFLOAT_1F64_USE_FP) {
+        if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
+                       fpclassify(ua.h) == FP_ZERO) ||
+                     signbit(ua.h))) {
+            goto soft;
+        }
+    } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
+                        float64_is_neg(ua.s))) {
+        goto soft;
+    }
+    ur.h = sqrt(ua.h);
+    return ur.s;
+
+ soft:
+    return soft_f64_sqrt(ua.s, s);
+}
+
 /*----------------------------------------------------------------------------
 | The pattern for a default generated NaN.
 *----------------------------------------------------------------------------*/
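For readers skimming the patch, the dispatch logic it introduces can be boiled down to a short, self-contained C sketch. This is not QEMU code: names such as my_status, can_use_host_fpu and the trivial soft_add32 stub are made up for illustration, and QEMU's real fallback is the FloatParts-based softfloat path added above. The shape of the decision is the same, though: use the host FPU only when the guest already has inexact set and uses round-to-nearest-even, only when all inputs are zero or normal, and redo the operation in softfloat whenever the hard result hints at overflow or underflow.

/*
 * Standalone sketch of the hardfloat dispatch idea (not QEMU code).
 * All names are illustrative; the real implementation is the
 * can_use_fpu()/float32_gen2() machinery added by this patch.
 */
#include <float.h>
#include <math.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum { MY_FLAG_INEXACT = 1, MY_FLAG_OVERFLOW = 2 };
enum { MY_ROUND_NEAREST_EVEN = 0 };

struct my_status {
    int flags;          /* guest FP exception flags */
    int rounding_mode;  /* guest rounding mode */
};

/* Bit-for-bit view of a guest binary32 value as a host float. */
union f32 {
    uint32_t s;         /* guest representation */
    float h;            /* host representation */
};

/* Stand-in for the slow, exact soft-float path: a real implementation
 * would emulate IEEE-754 bit by bit and raise every relevant flag;
 * this stub only marks inexact. */
static float soft_add32(float a, float b, struct my_status *st)
{
    st->flags |= MY_FLAG_INEXACT;
    return a + b;
}

/* The host FPU is usable only if no guest-visible information can be
 * lost: inexact is already set (so we need not detect it again) and
 * the guest rounding mode matches the host default. */
static bool can_use_host_fpu(const struct my_status *st)
{
    return (st->flags & MY_FLAG_INEXACT) &&
           st->rounding_mode == MY_ROUND_NEAREST_EVEN;
}

static float add32(union f32 a, union f32 b, struct my_status *st)
{
    union f32 r;

    if (!can_use_host_fpu(st)) {
        goto soft;
    }
    /* Defer to soft-float when exception detection might get hairy:
     * any operand that is not zero or a normal number. */
    if (!(fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) ||
        !(fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO)) {
        goto soft;
    }

    r.h = a.h + b.h;    /* fast path: a single host FP instruction */
    if (isinf(r.h)) {
        st->flags |= MY_FLAG_OVERFLOW;
    } else if (fabsf(r.h) <= FLT_MIN &&
               !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO)) {
        goto soft;      /* possible underflow: recompute exactly */
    }
    return r.h;

soft:
    return soft_add32(a.h, b.h, st);
}

int main(void)
{
    struct my_status st = { .flags = MY_FLAG_INEXACT,
                            .rounding_mode = MY_ROUND_NEAREST_EVEN };
    union f32 a = { .h = 1.5f }, b = { .h = 2.25f };

    printf("%f (flags=%#x)\n", add32(a, b, &st), st.flags);
    return 0;
}

The key design point is the reuse of the already-set inexact flag: since guests rarely clear it, the expensive feclearexcept/fetestexcept round-trip on the host can be skipped entirely on the fast path, and only overflow and potential underflow need an explicit check on the result.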