diff options
Diffstat (limited to 'target-arm/op_neon.h')
-rw-r--r-- | target-arm/op_neon.h | 1690 |
1 files changed, 0 insertions, 1690 deletions
diff --git a/target-arm/op_neon.h b/target-arm/op_neon.h deleted file mode 100644 index df3b7cb2a6..0000000000 --- a/target-arm/op_neon.h +++ /dev/null @@ -1,1690 +0,0 @@ -/* - * ARM NEON vector operations. - * - * Copyright (c) 2007 CodeSourcery. - * Written by Paul Brook - * - * This code is licenced under the GPL. - */ -/* Note that for NEON an "l" prefix means it is a wide operation, unlike - scalar arm ops where it means a word size operation. */ - -#define SIGNBIT (uint32_t)0x80000000 -/* ??? NEON ops should probably have their own float status. */ -#define NFS &env->vfp.fp_status -#define NEON_OP(name) void OPPROTO op_neon_##name (void) - -/* Helper routines to perform bitwise copies between float and int. */ -static inline float32 vfp_itos(uint32_t i) -{ - union { - uint32_t i; - float32 s; - } v; - - v.i = i; - return v.s; -} - -static inline uint32_t vfp_stoi(float32 s) -{ - union { - uint32_t i; - float32 s; - } v; - - v.s = s; - return v.i; -} - -NEON_OP(getreg_T0) -{ - T0 = *(uint32_t *)((char *) env + PARAM1); -} - -NEON_OP(getreg_T1) -{ - T1 = *(uint32_t *)((char *) env + PARAM1); -} - -NEON_OP(setreg_T0) -{ - *(uint32_t *)((char *) env + PARAM1) = T0; -} - -NEON_OP(setreg_T1) -{ - *(uint32_t *)((char *) env + PARAM1) = T1; -} - -#define NEON_TYPE1(name, type) \ -typedef struct \ -{ \ - type v1; \ -} neon_##name; -#ifdef WORDS_BIGENDIAN -#define NEON_TYPE2(name, type) \ -typedef struct \ -{ \ - type v2; \ - type v1; \ -} neon_##name; -#define NEON_TYPE4(name, type) \ -typedef struct \ -{ \ - type v4; \ - type v3; \ - type v2; \ - type v1; \ -} neon_##name; -#else -#define NEON_TYPE2(name, type) \ -typedef struct \ -{ \ - type v1; \ - type v2; \ -} neon_##name; -#define NEON_TYPE4(name, type) \ -typedef struct \ -{ \ - type v1; \ - type v2; \ - type v3; \ - type v4; \ -} neon_##name; -#endif - -NEON_TYPE4(s8, int8_t) -NEON_TYPE4(u8, uint8_t) -NEON_TYPE2(s16, int16_t) -NEON_TYPE2(u16, uint16_t) -NEON_TYPE1(s32, int32_t) -NEON_TYPE1(u32, uint32_t) -#undef NEON_TYPE4 -#undef NEON_TYPE2 -#undef NEON_TYPE1 - -/* Copy from a uint32_t to a vector structure type. */ -#define NEON_UNPACK(vtype, dest, val) do { \ - union { \ - vtype v; \ - uint32_t i; \ - } conv_u; \ - conv_u.i = (val); \ - dest = conv_u.v; \ - } while(0) - -/* Copy from a vector structure type to a uint32_t. */ -#define NEON_PACK(vtype, dest, val) do { \ - union { \ - vtype v; \ - uint32_t i; \ - } conv_u; \ - conv_u.v = (val); \ - dest = conv_u.i; \ - } while(0) - -#define NEON_DO1 \ - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); -#define NEON_DO2 \ - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ - NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); -#define NEON_DO4 \ - NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ - NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \ - NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \ - NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4); - -#define NEON_VOP(name, vtype, n) \ -NEON_OP(name) \ -{ \ - vtype vsrc1; \ - vtype vsrc2; \ - vtype vdest; \ - NEON_UNPACK(vtype, vsrc1, T0); \ - NEON_UNPACK(vtype, vsrc2, T1); \ - NEON_DO##n; \ - NEON_PACK(vtype, T0, vdest); \ - FORCE_RET(); \ -} - -#define NEON_VOP1(name, vtype, n) \ -NEON_OP(name) \ -{ \ - vtype vsrc1; \ - vtype vdest; \ - NEON_UNPACK(vtype, vsrc1, T0); \ - NEON_DO##n; \ - NEON_PACK(vtype, T0, vdest); \ - FORCE_RET(); \ -} - -/* Pairwise operations. */ -/* For 32-bit elements each segment only contains a single element, so - the elementwise and pairwise operations are the same. */ -#define NEON_PDO2 \ - NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ - NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2); -#define NEON_PDO4 \ - NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ - NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \ - NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \ - NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \ - -#define NEON_POP(name, vtype, n) \ -NEON_OP(name) \ -{ \ - vtype vsrc1; \ - vtype vsrc2; \ - vtype vdest; \ - NEON_UNPACK(vtype, vsrc1, T0); \ - NEON_UNPACK(vtype, vsrc2, T1); \ - NEON_PDO##n; \ - NEON_PACK(vtype, T0, vdest); \ - FORCE_RET(); \ -} - -#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1 -NEON_VOP(hadd_s8, neon_s8, 4) -NEON_VOP(hadd_u8, neon_u8, 4) -NEON_VOP(hadd_s16, neon_s16, 2) -NEON_VOP(hadd_u16, neon_u16, 2) -#undef NEON_FN - -NEON_OP(hadd_s32) -{ - int32_t src1 = T0; - int32_t src2 = T1; - int32_t dest; - - dest = (src1 >> 1) + (src2 >> 1); - if (src1 & src2 & 1) - dest++; - T0 = dest; - FORCE_RET(); -} - -NEON_OP(hadd_u32) -{ - uint32_t src1 = T0; - uint32_t src2 = T1; - uint32_t dest; - - dest = (src1 >> 1) + (src2 >> 1); - if (src1 & src2 & 1) - dest++; - T0 = dest; - FORCE_RET(); -} - -#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1 -NEON_VOP(rhadd_s8, neon_s8, 4) -NEON_VOP(rhadd_u8, neon_u8, 4) -NEON_VOP(rhadd_s16, neon_s16, 2) -NEON_VOP(rhadd_u16, neon_u16, 2) -#undef NEON_FN - -NEON_OP(rhadd_s32) -{ - int32_t src1 = T0; - int32_t src2 = T1; - int32_t dest; - - dest = (src1 >> 1) + (src2 >> 1); - if ((src1 | src2) & 1) - dest++; - T0 = dest; - FORCE_RET(); -} - -NEON_OP(rhadd_u32) -{ - uint32_t src1 = T0; - uint32_t src2 = T1; - uint32_t dest; - - dest = (src1 >> 1) + (src2 >> 1); - if ((src1 | src2) & 1) - dest++; - T0 = dest; - FORCE_RET(); -} - -#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1 -NEON_VOP(hsub_s8, neon_s8, 4) -NEON_VOP(hsub_u8, neon_u8, 4) -NEON_VOP(hsub_s16, neon_s16, 2) -NEON_VOP(hsub_u16, neon_u16, 2) -#undef NEON_FN - -NEON_OP(hsub_s32) -{ - int32_t src1 = T0; - int32_t src2 = T1; - int32_t dest; - - dest = (src1 >> 1) - (src2 >> 1); - if ((~src1) & src2 & 1) - dest--; - T0 = dest; - FORCE_RET(); -} - -NEON_OP(hsub_u32) -{ - uint32_t src1 = T0; - uint32_t src2 = T1; - uint32_t dest; - - dest = (src1 >> 1) - (src2 >> 1); - if ((~src1) & src2 & 1) - dest--; - T0 = dest; - FORCE_RET(); -} - -#define NEON_USAT(dest, src1, src2, type) do { \ - uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \ - if (tmp != (type)tmp) { \ - env->QF = 1; \ - dest = ~0; \ - } else { \ - dest = tmp; \ - }} while(0) -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t) -NEON_VOP(qadd_u8, neon_u8, 4) -#undef NEON_FN -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t) -NEON_VOP(qadd_u16, neon_u16, 2) -#undef NEON_FN -#undef NEON_USAT - -#define NEON_SSAT(dest, src1, src2, type) do { \ - int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \ - if (tmp != (type)tmp) { \ - env->QF = 1; \ - if (src2 > 0) { \ - tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \ - } else { \ - tmp = 1 << (sizeof(type) * 8 - 1); \ - } \ - } \ - dest = tmp; \ - } while(0) -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t) -NEON_VOP(qadd_s8, neon_s8, 4) -#undef NEON_FN -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t) -NEON_VOP(qadd_s16, neon_s16, 2) -#undef NEON_FN -#undef NEON_SSAT - -#define NEON_USAT(dest, src1, src2, type) do { \ - uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \ - if (tmp != (type)tmp) { \ - env->QF = 1; \ - dest = 0; \ - } else { \ - dest = tmp; \ - }} while(0) -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t) -NEON_VOP(qsub_u8, neon_u8, 4) -#undef NEON_FN -#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t) -NEON_VOP(qsub_u16, neon_u16, 2) -#undef NEON_FN -#undef NEON_USAT - -#define NEON_SSAT(dest, src1, src2, type) do { \ - int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \ - if (tmp != (type)tmp) { \ - env->QF = 1; \ - if (src2 < 0) { \ - tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \ - } else { \ - tmp = 1 << (sizeof(type) * 8 - 1); \ - } \ - } \ - dest = tmp; \ - } while(0) -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t) -NEON_VOP(qsub_s8, neon_s8, 4) -#undef NEON_FN -#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t) -NEON_VOP(qsub_s16, neon_s16, 2) -#undef NEON_FN -#undef NEON_SSAT - -#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0 -NEON_VOP(cgt_s8, neon_s8, 4) -NEON_VOP(cgt_u8, neon_u8, 4) -NEON_VOP(cgt_s16, neon_s16, 2) -NEON_VOP(cgt_u16, neon_u16, 2) -NEON_VOP(cgt_s32, neon_s32, 1) -NEON_VOP(cgt_u32, neon_u32, 1) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0 -NEON_VOP(cge_s8, neon_s8, 4) -NEON_VOP(cge_u8, neon_u8, 4) -NEON_VOP(cge_s16, neon_s16, 2) -NEON_VOP(cge_u16, neon_u16, 2) -NEON_VOP(cge_s32, neon_s32, 1) -NEON_VOP(cge_u32, neon_u32, 1) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) do { \ - int8_t tmp; \ - tmp = (int8_t)src2; \ - if (tmp < 0) { \ - dest = src1 >> -tmp; \ - } else { \ - dest = src1 << tmp; \ - }} while (0) -NEON_VOP(shl_s8, neon_s8, 4) -NEON_VOP(shl_u8, neon_u8, 4) -NEON_VOP(shl_s16, neon_s16, 2) -NEON_VOP(shl_u16, neon_u16, 2) -NEON_VOP(shl_s32, neon_s32, 1) -NEON_VOP(shl_u32, neon_u32, 1) -#undef NEON_FN - -NEON_OP(shl_u64) -{ - int8_t shift = env->vfp.scratch[0]; - uint64_t val = T0 | ((uint64_t)T1 << 32); - if (shift < 0) { - val >>= -shift; - } else { - val <<= shift; - } - T0 = val; - T1 = val >> 32; - FORCE_RET(); -} - -NEON_OP(shl_s64) -{ - int8_t shift = env->vfp.scratch[0]; - int64_t val = T0 | ((uint64_t)T1 << 32); - if (shift < 0) { - val >>= -shift; - } else { - val <<= shift; - } - T0 = val; - T1 = val >> 32; - FORCE_RET(); -} - -#define NEON_FN(dest, src1, src2) do { \ - int8_t tmp; \ - tmp = (int8_t)src1; \ - if (tmp < 0) { \ - dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \ - } else { \ - dest = src2 << tmp; \ - }} while (0) - -NEON_VOP(rshl_s8, neon_s8, 4) -NEON_VOP(rshl_u8, neon_u8, 4) -NEON_VOP(rshl_s16, neon_s16, 2) -NEON_VOP(rshl_u16, neon_u16, 2) -NEON_VOP(rshl_s32, neon_s32, 1) -NEON_VOP(rshl_u32, neon_u32, 1) -#undef NEON_FN - -NEON_OP(rshl_u64) -{ - int8_t shift = env->vfp.scratch[0]; - uint64_t val = T0 | ((uint64_t)T1 << 32); - if (shift < 0) { - val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift; - val >>= -shift; - } else { - val <<= shift; - } - T0 = val; - T1 = val >> 32; - FORCE_RET(); -} - -NEON_OP(rshl_s64) -{ - int8_t shift = env->vfp.scratch[0]; - int64_t val = T0 | ((uint64_t)T1 << 32); - if (shift < 0) { - val = (val + ((int64_t)1 << (-1 - shift))) >> -shift; - } else { - val <<= shift; - } - T0 = val; - T1 = val >> 32; - FORCE_RET(); -} - -#define NEON_FN(dest, src1, src2) do { \ - int8_t tmp; \ - tmp = (int8_t)src1; \ - if (tmp < 0) { \ - dest = src2 >> -tmp; \ - } else { \ - dest = src2 << tmp; \ - if ((dest >> tmp) != src2) { \ - env->QF = 1; \ - dest = ~0; \ - } \ - }} while (0) -NEON_VOP(qshl_s8, neon_s8, 4) -NEON_VOP(qshl_s16, neon_s16, 2) -NEON_VOP(qshl_s32, neon_s32, 1) -#undef NEON_FN - -NEON_OP(qshl_s64) -{ - int8_t shift = env->vfp.scratch[0]; - int64_t val = T0 | ((uint64_t)T1 << 32); - if (shift < 0) { - val >>= -shift; - } else { - int64_t tmp = val; - val <<= shift; - if ((val >> shift) != tmp) { - env->QF = 1; - val = (tmp >> 63) ^ 0x7fffffffffffffffULL; - } - } - T0 = val; - T1 = val >> 32; - FORCE_RET(); -} - -#define NEON_FN(dest, src1, src2) do { \ - int8_t tmp; \ - tmp = (int8_t)src1; \ - if (tmp < 0) { \ - dest = src2 >> -tmp; \ - } else { \ - dest = src2 << tmp; \ - if ((dest >> tmp) != src2) { \ - env->QF = 1; \ - dest = src2 >> 31; \ - } \ - }} while (0) -NEON_VOP(qshl_u8, neon_u8, 4) -NEON_VOP(qshl_u16, neon_u16, 2) -NEON_VOP(qshl_u32, neon_u32, 1) -#undef NEON_FN - -NEON_OP(qshl_u64) -{ - int8_t shift = env->vfp.scratch[0]; - uint64_t val = T0 | ((uint64_t)T1 << 32); - if (shift < 0) { - val >>= -shift; - } else { - uint64_t tmp = val; - val <<= shift; - if ((val >> shift) != tmp) { - env->QF = 1; - val = ~(uint64_t)0; - } - } - T0 = val; - T1 = val >> 32; - FORCE_RET(); -} - -#define NEON_FN(dest, src1, src2) do { \ - int8_t tmp; \ - tmp = (int8_t)src1; \ - if (tmp < 0) { \ - dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \ - } else { \ - dest = src2 << tmp; \ - if ((dest >> tmp) != src2) { \ - dest = ~0; \ - } \ - }} while (0) -NEON_VOP(qrshl_s8, neon_s8, 4) -NEON_VOP(qrshl_s16, neon_s16, 2) -NEON_VOP(qrshl_s32, neon_s32, 1) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) do { \ - int8_t tmp; \ - tmp = (int8_t)src1; \ - if (tmp < 0) { \ - dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \ - } else { \ - dest = src2 << tmp; \ - if ((dest >> tmp) != src2) { \ - env->QF = 1; \ - dest = src2 >> 31; \ - } \ - }} while (0) -NEON_VOP(qrshl_u8, neon_u8, 4) -NEON_VOP(qrshl_u16, neon_u16, 2) -NEON_VOP(qrshl_u32, neon_u32, 1) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2 -NEON_VOP(max_s8, neon_s8, 4) -NEON_VOP(max_u8, neon_u8, 4) -NEON_VOP(max_s16, neon_s16, 2) -NEON_VOP(max_u16, neon_u16, 2) -NEON_VOP(max_s32, neon_s32, 1) -NEON_VOP(max_u32, neon_u32, 1) -NEON_POP(pmax_s8, neon_s8, 4) -NEON_POP(pmax_u8, neon_u8, 4) -NEON_POP(pmax_s16, neon_s16, 2) -NEON_POP(pmax_u16, neon_u16, 2) -#undef NEON_FN - -NEON_OP(max_f32) -{ - float32 f0 = vfp_itos(T0); - float32 f1 = vfp_itos(T1); - T0 = (float32_compare_quiet(f0, f1, NFS) == 1) ? T0 : T1; - FORCE_RET(); -} - -#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2 -NEON_VOP(min_s8, neon_s8, 4) -NEON_VOP(min_u8, neon_u8, 4) -NEON_VOP(min_s16, neon_s16, 2) -NEON_VOP(min_u16, neon_u16, 2) -NEON_VOP(min_s32, neon_s32, 1) -NEON_VOP(min_u32, neon_u32, 1) -NEON_POP(pmin_s8, neon_s8, 4) -NEON_POP(pmin_u8, neon_u8, 4) -NEON_POP(pmin_s16, neon_s16, 2) -NEON_POP(pmin_u16, neon_u16, 2) -#undef NEON_FN - -NEON_OP(min_f32) -{ - float32 f0 = vfp_itos(T0); - float32 f1 = vfp_itos(T1); - T0 = (float32_compare_quiet(f0, f1, NFS) == -1) ? T0 : T1; - FORCE_RET(); -} - -#define NEON_FN(dest, src1, src2) \ - dest = (src1 > src2) ? (src1 - src2) : (src2 - src1) -NEON_VOP(abd_s8, neon_s8, 4) -NEON_VOP(abd_u8, neon_u8, 4) -NEON_VOP(abd_s16, neon_s16, 2) -NEON_VOP(abd_u16, neon_u16, 2) -NEON_VOP(abd_s32, neon_s32, 1) -NEON_VOP(abd_u32, neon_u32, 1) -#undef NEON_FN - -NEON_OP(abd_f32) -{ - float32 f0 = vfp_itos(T0); - float32 f1 = vfp_itos(T1); - T0 = vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1) - ? float32_sub(f0, f1, NFS) - : float32_sub(f1, f0, NFS)); - FORCE_RET(); -} - -#define NEON_FN(dest, src1, src2) dest = src1 + src2 -NEON_VOP(add_u8, neon_u8, 4) -NEON_VOP(add_u16, neon_u16, 2) -NEON_POP(padd_u8, neon_u8, 4) -NEON_POP(padd_u16, neon_u16, 2) -#undef NEON_FN - -NEON_OP(add_f32) -{ - T0 = vfp_stoi(float32_add(vfp_itos(T0), vfp_itos(T1), NFS)); - FORCE_RET(); -} - -#define NEON_FN(dest, src1, src2) dest = src1 - src2 -NEON_VOP(sub_u8, neon_u8, 4) -NEON_VOP(sub_u16, neon_u16, 2) -#undef NEON_FN - -NEON_OP(sub_f32) -{ - T0 = vfp_stoi(float32_sub(vfp_itos(T0), vfp_itos(T1), NFS)); - FORCE_RET(); -} - -#define NEON_FN(dest, src1, src2) dest = src2 - src1 -NEON_VOP(rsb_u8, neon_u8, 4) -NEON_VOP(rsb_u16, neon_u16, 2) -#undef NEON_FN - -NEON_OP(rsb_f32) -{ - T0 = vfp_stoi(float32_sub(vfp_itos(T1), vfp_itos(T0), NFS)); - FORCE_RET(); -} - -#define NEON_FN(dest, src1, src2) dest = src1 * src2 -NEON_VOP(mul_u8, neon_u8, 4) -NEON_VOP(mul_u16, neon_u16, 2) -#undef NEON_FN - -NEON_OP(mul_f32) -{ - T0 = vfp_stoi(float32_mul(vfp_itos(T0), vfp_itos(T1), NFS)); - FORCE_RET(); -} - -NEON_OP(mul_p8) -{ - T0 = helper_neon_mul_p8(T0, T1); -} - -#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0 -NEON_VOP(tst_u8, neon_u8, 4) -NEON_VOP(tst_u16, neon_u16, 2) -NEON_VOP(tst_u32, neon_u32, 1) -#undef NEON_FN - -#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0 -NEON_VOP(ceq_u8, neon_u8, 4) -NEON_VOP(ceq_u16, neon_u16, 2) -NEON_VOP(ceq_u32, neon_u32, 1) -#undef NEON_FN - -#define NEON_QDMULH16(dest, src1, src2, round) do { \ - uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \ - if ((tmp ^ (tmp << 1)) & SIGNBIT) { \ - env->QF = 1; \ - tmp = (tmp >> 31) ^ ~SIGNBIT; \ - } \ - tmp <<= 1; \ - if (round) { \ - int32_t old = tmp; \ - tmp += 1 << 15; \ - if ((int32_t)tmp < old) { \ - env->QF = 1; \ - tmp = SIGNBIT - 1; \ - } \ - } \ - dest = tmp >> 16; \ - } while(0) -#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0) -NEON_VOP(qdmulh_s16, neon_s16, 2) -#undef NEON_FN -#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1) -NEON_VOP(qrdmulh_s16, neon_s16, 2) -#undef NEON_FN -#undef NEON_QDMULH16 - -#define SIGNBIT64 ((uint64_t)1 << 63) -#define NEON_QDMULH32(dest, src1, src2, round) do { \ - uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \ - if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \ - env->QF = 1; \ - tmp = (tmp >> 63) ^ ~SIGNBIT64; \ - } else { \ - tmp <<= 1; \ - } \ - if (round) { \ - int64_t old = tmp; \ - tmp += (int64_t)1 << 31; \ - if ((int64_t)tmp < old) { \ - env->QF = 1; \ - tmp = SIGNBIT64 - 1; \ - } \ - } \ - dest = tmp >> 32; \ - } while(0) -#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0) -NEON_VOP(qdmulh_s32, neon_s32, 1) -#undef NEON_FN -#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1) -NEON_VOP(qrdmulh_s32, neon_s32, 1) -#undef NEON_FN -#undef NEON_QDMULH32 - -/* Floating point comparisons produce an integer result. */ -#define NEON_VOP_FCMP(name, cmp) \ -NEON_OP(name) \ -{ \ - if (float32_compare_quiet(vfp_itos(T0), vfp_itos(T1), NFS) cmp 0) \ - T0 = -1; \ - else \ - T0 = 0; \ - FORCE_RET(); \ -} - -NEON_VOP_FCMP(ceq_f32, ==) -NEON_VOP_FCMP(cge_f32, >=) -NEON_VOP_FCMP(cgt_f32, >) - -NEON_OP(acge_f32) -{ - float32 f0 = float32_abs(vfp_itos(T0)); - float32 f1 = float32_abs(vfp_itos(T1)); - T0 = (float32_compare_quiet(f0, f1,NFS) >= 0) ? -1 : 0; - FORCE_RET(); -} - -NEON_OP(acgt_f32) -{ - float32 f0 = float32_abs(vfp_itos(T0)); - float32 f1 = float32_abs(vfp_itos(T1)); - T0 = (float32_compare_quiet(f0, f1, NFS) > 0) ? -1 : 0; - FORCE_RET(); -} - -/* Narrowing instructions. The named type is the destination type. */ -NEON_OP(narrow_u8) -{ - T0 = (T0 & 0xff) | ((T0 >> 8) & 0xff00) - | ((T1 << 16) & 0xff0000) | (T1 << 24); - FORCE_RET(); -} - -NEON_OP(narrow_sat_u8) -{ - neon_u16 src; - neon_u8 dest; -#define SAT8(d, s) \ - if (s > 0xff) { \ - d = 0xff; \ - env->QF = 1; \ - } else { \ - d = s; \ - } - - NEON_UNPACK(neon_u16, src, T0); - SAT8(dest.v1, src.v1); - SAT8(dest.v2, src.v2); - NEON_UNPACK(neon_u16, src, T1); - SAT8(dest.v3, src.v1); - SAT8(dest.v4, src.v2); - NEON_PACK(neon_u8, T0, dest); - FORCE_RET(); -#undef SAT8 -} - -NEON_OP(narrow_sat_s8) -{ - neon_s16 src; - neon_s8 dest; -#define SAT8(d, s) \ - if (s != (uint8_t)s) { \ - d = (s >> 15) ^ 0x7f; \ - env->QF = 1; \ - } else { \ - d = s; \ - } - - NEON_UNPACK(neon_s16, src, T0); - SAT8(dest.v1, src.v1); - SAT8(dest.v2, src.v2); - NEON_UNPACK(neon_s16, src, T1); - SAT8(dest.v3, src.v1); - SAT8(dest.v4, src.v2); - NEON_PACK(neon_s8, T0, dest); - FORCE_RET(); -#undef SAT8 -} - -NEON_OP(narrow_u16) -{ - T0 = (T0 & 0xffff) | (T1 << 16); -} - -NEON_OP(narrow_sat_u16) -{ - if (T0 > 0xffff) { - T0 = 0xffff; - env->QF = 1; - } - if (T1 > 0xffff) { - T1 = 0xffff; - env->QF = 1; - } - T0 |= T1 << 16; - FORCE_RET(); -} - -NEON_OP(narrow_sat_s16) -{ - if ((int32_t)T0 != (int16_t)T0) { - T0 = ((int32_t)T0 >> 31) ^ 0x7fff; - env->QF = 1; - } - if ((int32_t)T1 != (int16_t) T1) { - T1 = ((int32_t)T1 >> 31) ^ 0x7fff; - env->QF = 1; - } - T0 = (uint16_t)T0 | (T1 << 16); - FORCE_RET(); -} - -NEON_OP(narrow_sat_u32) -{ - if (T1) { - T0 = 0xffffffffu; - env->QF = 1; - } - FORCE_RET(); -} - -NEON_OP(narrow_sat_s32) -{ - int32_t sign = (int32_t)T1 >> 31; - - if ((int32_t)T1 != sign) { - T0 = sign ^ 0x7fffffff; - env->QF = 1; - } - FORCE_RET(); -} - -/* Narrowing instructions. Named type is the narrow type. */ -NEON_OP(narrow_high_u8) -{ - T0 = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00) - | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000); - FORCE_RET(); -} - -NEON_OP(narrow_high_u16) -{ - T0 = (T0 >> 16) | (T1 & 0xffff0000); - FORCE_RET(); -} - -NEON_OP(narrow_high_round_u8) -{ - T0 = (((T0 + 0x80) >> 8) & 0xff) | (((T0 + 0x800000) >> 16) & 0xff00) - | (((T1 + 0x80) << 8) & 0xff0000) | ((T1 + 0x800000) & 0xff000000); - FORCE_RET(); -} - -NEON_OP(narrow_high_round_u16) -{ - T0 = ((T0 + 0x8000) >> 16) | ((T1 + 0x8000) & 0xffff0000); - FORCE_RET(); -} - -NEON_OP(narrow_high_round_u32) -{ - if (T0 >= 0x80000000u) - T0 = T1 + 1; - else - T0 = T1; - FORCE_RET(); -} - -/* Widening instructions. Named type is source type. */ -NEON_OP(widen_s8) -{ - uint32_t src; - - src = T0; - T0 = (uint16_t)(int8_t)src | ((int8_t)(src >> 8) << 16); - T1 = (uint16_t)(int8_t)(src >> 16) | ((int8_t)(src >> 24) << 16); -} - -NEON_OP(widen_u8) -{ - T1 = ((T0 >> 8) & 0xff0000) | ((T0 >> 16) & 0xff); - T0 = ((T0 << 8) & 0xff0000) | (T0 & 0xff); -} - -NEON_OP(widen_s16) -{ - int32_t src; - - src = T0; - T0 = (int16_t)src; - T1 = src >> 16; -} - -NEON_OP(widen_u16) -{ - T1 = T0 >> 16; - T0 &= 0xffff; -} - -NEON_OP(widen_s32) -{ - T1 = (int32_t)T0 >> 31; - FORCE_RET(); -} - -NEON_OP(widen_high_u8) -{ - T1 = (T0 & 0xff000000) | ((T0 >> 8) & 0xff00); - T0 = ((T0 << 16) & 0xff000000) | ((T0 << 8) & 0xff00); -} - -NEON_OP(widen_high_u16) -{ - T1 = T0 & 0xffff0000; - T0 <<= 16; -} - -/* Long operations. The type is the wide type. */ -NEON_OP(shll_u16) -{ - int shift = PARAM1; - uint32_t mask; - - mask = 0xffff >> (16 - shift); - mask |= mask << 16; - mask = ~mask; - - T0 = (T0 << shift) & mask; - T1 = (T1 << shift) & mask; - FORCE_RET(); -} - -NEON_OP(shll_u64) -{ - int shift = PARAM1; - - T1 <<= shift; - T1 |= T0 >> (32 - shift); - T0 <<= shift; - FORCE_RET(); -} - -NEON_OP(addl_u16) -{ - uint32_t tmp; - uint32_t high; - - tmp = env->vfp.scratch[0]; - high = (T0 >> 16) + (tmp >> 16); - T0 = (uint16_t)(T0 + tmp); - T0 |= (high << 16); - tmp = env->vfp.scratch[1]; - high = (T1 >> 16) + (tmp >> 16); - T1 = (uint16_t)(T1 + tmp); - T1 |= (high << 16); - FORCE_RET(); -} - -NEON_OP(addl_u32) -{ - T0 += env->vfp.scratch[0]; - T1 += env->vfp.scratch[1]; - FORCE_RET(); -} - -NEON_OP(addl_u64) -{ - uint64_t tmp; - tmp = T0 | ((uint64_t)T1 << 32); - tmp += env->vfp.scratch[0]; - tmp += (uint64_t)env->vfp.scratch[1] << 32; - T0 = tmp; - T1 = tmp >> 32; - FORCE_RET(); -} - -NEON_OP(subl_u16) -{ - uint32_t tmp; - uint32_t high; - - tmp = env->vfp.scratch[0]; - high = (T0 >> 16) - (tmp >> 16); - T0 = (uint16_t)(T0 - tmp); - T0 |= (high << 16); - tmp = env->vfp.scratch[1]; - high = (T1 >> 16) - (tmp >> 16); - T1 = (uint16_t)(T1 - tmp); - T1 |= (high << 16); - FORCE_RET(); -} - -NEON_OP(subl_u32) -{ - T0 -= env->vfp.scratch[0]; - T1 -= env->vfp.scratch[1]; - FORCE_RET(); -} - -NEON_OP(subl_u64) -{ - uint64_t tmp; - tmp = T0 | ((uint64_t)T1 << 32); - tmp -= env->vfp.scratch[0]; - tmp -= (uint64_t)env->vfp.scratch[1] << 32; - T0 = tmp; - T1 = tmp >> 32; - FORCE_RET(); -} - -#define DO_ABD(dest, x, y, type) do { \ - type tmp_x = x; \ - type tmp_y = y; \ - dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \ - } while(0) - -NEON_OP(abdl_u16) -{ - uint32_t tmp; - uint32_t low; - uint32_t high; - - DO_ABD(low, T0, T1, uint8_t); - DO_ABD(tmp, T0 >> 8, T1 >> 8, uint8_t); - low |= tmp << 16; - DO_ABD(high, T0 >> 16, T1 >> 16, uint8_t); - DO_ABD(tmp, T0 >> 24, T1 >> 24, uint8_t); - high |= tmp << 16; - T0 = low; - T1 = high; - FORCE_RET(); -} - -NEON_OP(abdl_s16) -{ - uint32_t tmp; - uint32_t low; - uint32_t high; - - DO_ABD(low, T0, T1, int8_t); - DO_ABD(tmp, T0 >> 8, T1 >> 8, int8_t); - low |= tmp << 16; - DO_ABD(high, T0 >> 16, T1 >> 16, int8_t); - DO_ABD(tmp, T0 >> 24, T1 >> 24, int8_t); - high |= tmp << 16; - T0 = low; - T1 = high; - FORCE_RET(); -} - -NEON_OP(abdl_u32) -{ - uint32_t low; - uint32_t high; - - DO_ABD(low, T0, T1, uint16_t); - DO_ABD(high, T0 >> 16, T1 >> 16, uint16_t); - T0 = low; - T1 = high; - FORCE_RET(); -} - -NEON_OP(abdl_s32) -{ - uint32_t low; - uint32_t high; - - DO_ABD(low, T0, T1, int16_t); - DO_ABD(high, T0 >> 16, T1 >> 16, int16_t); - T0 = low; - T1 = high; - FORCE_RET(); -} - -NEON_OP(abdl_u64) -{ - DO_ABD(T0, T0, T1, uint32_t); - T1 = 0; -} - -NEON_OP(abdl_s64) -{ - DO_ABD(T0, T0, T1, int32_t); - T1 = 0; -} -#undef DO_ABD - -/* Widening multiple. Named type is the source type. */ -#define DO_MULL(dest, x, y, type1, type2) do { \ - type1 tmp_x = x; \ - type1 tmp_y = y; \ - dest = (type2)((type2)tmp_x * (type2)tmp_y); \ - } while(0) - -NEON_OP(mull_u8) -{ - uint32_t tmp; - uint32_t low; - uint32_t high; - - DO_MULL(low, T0, T1, uint8_t, uint16_t); - DO_MULL(tmp, T0 >> 8, T1 >> 8, uint8_t, uint16_t); - low |= tmp << 16; - DO_MULL(high, T0 >> 16, T1 >> 16, uint8_t, uint16_t); - DO_MULL(tmp, T0 >> 24, T1 >> 24, uint8_t, uint16_t); - high |= tmp << 16; - T0 = low; - T1 = high; - FORCE_RET(); -} - -NEON_OP(mull_s8) -{ - uint32_t tmp; - uint32_t low; - uint32_t high; - - DO_MULL(low, T0, T1, int8_t, uint16_t); - DO_MULL(tmp, T0 >> 8, T1 >> 8, int8_t, uint16_t); - low |= tmp << 16; - DO_MULL(high, T0 >> 16, T1 >> 16, int8_t, uint16_t); - DO_MULL(tmp, T0 >> 24, T1 >> 24, int8_t, uint16_t); - high |= tmp << 16; - T0 = low; - T1 = high; - FORCE_RET(); -} - -NEON_OP(mull_u16) -{ - uint32_t low; - uint32_t high; - - DO_MULL(low, T0, T1, uint16_t, uint32_t); - DO_MULL(high, T0 >> 16, T1 >> 16, uint16_t, uint32_t); - T0 = low; - T1 = high; - FORCE_RET(); -} - -NEON_OP(mull_s16) -{ - uint32_t low; - uint32_t high; - - DO_MULL(low, T0, T1, int16_t, uint32_t); - DO_MULL(high, T0 >> 16, T1 >> 16, int16_t, uint32_t); - T0 = low; - T1 = high; - FORCE_RET(); -} - -NEON_OP(addl_saturate_s32) -{ - uint32_t tmp; - uint32_t res; - - tmp = env->vfp.scratch[0]; - res = T0 + tmp; - if (((res ^ T0) & SIGNBIT) && !((T0 ^ tmp) & SIGNBIT)) { - env->QF = 1; - T0 = (T0 >> 31) ^ 0x7fffffff; - } else { - T0 = res; - } - tmp = env->vfp.scratch[1]; - res = T1 + tmp; - if (((res ^ T1) & SIGNBIT) && !((T1 ^ tmp) & SIGNBIT)) { - env->QF = 1; - T1 = (T1 >> 31) ^ 0x7fffffff; - } else { - T1 = res; - } - FORCE_RET(); -} - -NEON_OP(addl_saturate_s64) -{ - uint64_t src1; - uint64_t src2; - uint64_t res; - - src1 = T0 + ((uint64_t)T1 << 32); - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32); - res = src1 + src2; - if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) { - env->QF = 1; - T0 = ~(int64_t)src1 >> 63; - T1 = T0 ^ 0x80000000; - } else { - T0 = res; - T1 = res >> 32; - } - FORCE_RET(); -} - -NEON_OP(addl_saturate_u64) -{ - uint64_t src1; - uint64_t src2; - uint64_t res; - - src1 = T0 + ((uint64_t)T1 << 32); - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32); - res = src1 + src2; - if (res < src1) { - env->QF = 1; - T0 = 0xffffffff; - T1 = 0xffffffff; - } else { - T0 = res; - T1 = res >> 32; - } - FORCE_RET(); -} - -NEON_OP(subl_saturate_s64) -{ - uint64_t src1; - uint64_t src2; - uint64_t res; - - src1 = T0 + ((uint64_t)T1 << 32); - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32); - res = src1 - src2; - if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) { - env->QF = 1; - T0 = ~(int64_t)src1 >> 63; - T1 = T0 ^ 0x80000000; - } else { - T0 = res; - T1 = res >> 32; - } - FORCE_RET(); -} - -NEON_OP(subl_saturate_u64) -{ - uint64_t src1; - uint64_t src2; - uint64_t res; - - src1 = T0 + ((uint64_t)T1 << 32); - src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32); - if (src1 < src2) { - env->QF = 1; - T0 = 0; - T1 = 0; - } else { - res = src1 - src2; - T0 = res; - T1 = res >> 32; - } - FORCE_RET(); -} - -NEON_OP(negl_u16) -{ - uint32_t tmp; - tmp = T0 >> 16; - tmp = -tmp; - T0 = (-T0 & 0xffff) | (tmp << 16); - tmp = T1 >> 16; - tmp = -tmp; - T1 = (-T1 & 0xffff) | (tmp << 16); - FORCE_RET(); -} - -NEON_OP(negl_u32) -{ - T0 = -T0; - T1 = -T1; - FORCE_RET(); -} - -NEON_OP(negl_u64) -{ - uint64_t val; - - val = T0 | ((uint64_t)T1 << 32); - val = -val; - T0 = val; - T1 = val >> 32; - FORCE_RET(); -} - -/* Scalar operations. */ -NEON_OP(dup_low16) -{ - T0 = (T0 & 0xffff) | (T0 << 16); - FORCE_RET(); -} - -NEON_OP(dup_high16) -{ - T0 = (T0 >> 16) | (T0 & 0xffff0000); - FORCE_RET(); -} - -/* Helper for VEXT */ -NEON_OP(extract) -{ - int shift = PARAM1; - T0 = (T0 >> shift) | (T1 << (32 - shift)); - FORCE_RET(); -} - -/* Pairwise add long. Named type is source type. */ -NEON_OP(paddl_s8) -{ - int8_t src1; - int8_t src2; - uint16_t result; - src1 = T0 >> 24; - src2 = T0 >> 16; - result = (uint16_t)src1 + src2; - src1 = T0 >> 8; - src2 = T0; - T0 = (uint16_t)((uint16_t)src1 + src2) | ((uint32_t)result << 16); - FORCE_RET(); -} - -NEON_OP(paddl_u8) -{ - uint8_t src1; - uint8_t src2; - uint16_t result; - src1 = T0 >> 24; - src2 = T0 >> 16; - result = (uint16_t)src1 + src2; - src1 = T0 >> 8; - src2 = T0; - T0 = (uint16_t)((uint16_t)src1 + src2) | ((uint32_t)result << 16); - FORCE_RET(); -} - -NEON_OP(paddl_s16) -{ - T0 = (uint32_t)(int16_t)T0 + (uint32_t)(int16_t)(T0 >> 16); - FORCE_RET(); -} - -NEON_OP(paddl_u16) -{ - T0 = (uint32_t)(uint16_t)T0 + (uint32_t)(uint16_t)(T0 >> 16); - FORCE_RET(); -} - -NEON_OP(paddl_s32) -{ - int64_t tmp; - tmp = (int64_t)(int32_t)T0 + (int64_t)(int32_t)T1; - T0 = tmp; - T1 = tmp >> 32; - FORCE_RET(); -} - -NEON_OP(paddl_u32) -{ - uint64_t tmp; - tmp = (uint64_t)T0 + (uint64_t)T1; - T0 = tmp; - T1 = tmp >> 32; - FORCE_RET(); -} - -/* Count Leading Sign/Zero Bits. */ -static inline int do_clz8(uint8_t x) -{ - int n; - for (n = 8; x; n--) - x >>= 1; - return n; -} - -static inline int do_clz16(uint16_t x) -{ - int n; - for (n = 16; x; n--) - x >>= 1; - return n; -} - -NEON_OP(clz_u8) -{ - uint32_t result; - uint32_t tmp; - - tmp = T0; - result = do_clz8(tmp); - result |= do_clz8(tmp >> 8) << 8; - result |= do_clz8(tmp >> 16) << 16; - result |= do_clz8(tmp >> 24) << 24; - T0 = result; - FORCE_RET(); -} - -NEON_OP(clz_u16) -{ - uint32_t result; - uint32_t tmp; - tmp = T0; - result = do_clz16(tmp); - result |= do_clz16(tmp >> 16) << 16; - T0 = result; - FORCE_RET(); -} - -NEON_OP(cls_s8) -{ - uint32_t result; - int8_t tmp; - tmp = T0; - result = do_clz8((tmp < 0) ? ~tmp : tmp) - 1; - tmp = T0 >> 8; - result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 8; - tmp = T0 >> 16; - result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 16; - tmp = T0 >> 24; - result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 24; - T0 = result; - FORCE_RET(); -} - -NEON_OP(cls_s16) -{ - uint32_t result; - int16_t tmp; - tmp = T0; - result = do_clz16((tmp < 0) ? ~tmp : tmp) - 1; - tmp = T0 >> 16; - result |= (do_clz16((tmp < 0) ? ~tmp : tmp) - 1) << 16; - T0 = result; - FORCE_RET(); -} - -NEON_OP(cls_s32) -{ - int count; - if ((int32_t)T0 < 0) - T0 = ~T0; - for (count = 32; T0 > 0; count--) - T0 = T0 >> 1; - T0 = count - 1; - FORCE_RET(); -} - -/* Bit count. */ -NEON_OP(cnt_u8) -{ - T0 = (T0 & 0x55555555) + ((T0 >> 1) & 0x55555555); - T0 = (T0 & 0x33333333) + ((T0 >> 2) & 0x33333333); - T0 = (T0 & 0x0f0f0f0f) + ((T0 >> 4) & 0x0f0f0f0f); - FORCE_RET(); -} - -/* Saturnating negation. */ -/* ??? Make these use NEON_VOP1 */ -#define DO_QABS8(x) do { \ - if (x == (int8_t)0x80) { \ - x = 0x7f; \ - env->QF = 1; \ - } else if (x < 0) { \ - x = -x; \ - }} while (0) -NEON_OP(qabs_s8) -{ - neon_s8 vec; - NEON_UNPACK(neon_s8, vec, T0); - DO_QABS8(vec.v1); - DO_QABS8(vec.v2); - DO_QABS8(vec.v3); - DO_QABS8(vec.v4); - NEON_PACK(neon_s8, T0, vec); - FORCE_RET(); -} -#undef DO_QABS8 - -#define DO_QNEG8(x) do { \ - if (x == (int8_t)0x80) { \ - x = 0x7f; \ - env->QF = 1; \ - } else { \ - x = -x; \ - }} while (0) -NEON_OP(qneg_s8) -{ - neon_s8 vec; - NEON_UNPACK(neon_s8, vec, T0); - DO_QNEG8(vec.v1); - DO_QNEG8(vec.v2); - DO_QNEG8(vec.v3); - DO_QNEG8(vec.v4); - NEON_PACK(neon_s8, T0, vec); - FORCE_RET(); -} -#undef DO_QNEG8 - -#define DO_QABS16(x) do { \ - if (x == (int16_t)0x8000) { \ - x = 0x7fff; \ - env->QF = 1; \ - } else if (x < 0) { \ - x = -x; \ - }} while (0) -NEON_OP(qabs_s16) -{ - neon_s16 vec; - NEON_UNPACK(neon_s16, vec, T0); - DO_QABS16(vec.v1); - DO_QABS16(vec.v2); - NEON_PACK(neon_s16, T0, vec); - FORCE_RET(); -} -#undef DO_QABS16 - -#define DO_QNEG16(x) do { \ - if (x == (int16_t)0x8000) { \ - x = 0x7fff; \ - env->QF = 1; \ - } else { \ - x = -x; \ - }} while (0) -NEON_OP(qneg_s16) -{ - neon_s16 vec; - NEON_UNPACK(neon_s16, vec, T0); - DO_QNEG16(vec.v1); - DO_QNEG16(vec.v2); - NEON_PACK(neon_s16, T0, vec); - FORCE_RET(); -} -#undef DO_QNEG16 - -NEON_OP(qabs_s32) -{ - if (T0 == 0x80000000) { - T0 = 0x7fffffff; - env->QF = 1; - } else if ((int32_t)T0 < 0) { - T0 = -T0; - } - FORCE_RET(); -} - -NEON_OP(qneg_s32) -{ - if (T0 == 0x80000000) { - T0 = 0x7fffffff; - env->QF = 1; - } else { - T0 = -T0; - } - FORCE_RET(); -} - -/* Unary opperations */ -#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src -NEON_VOP1(abs_s8, neon_s8, 4) -NEON_VOP1(abs_s16, neon_s16, 2) -NEON_OP(abs_s32) -{ - if ((int32_t)T0 < 0) - T0 = -T0; - FORCE_RET(); -} -#undef NEON_FN - -/* Transpose. Argument order is rather strange to avoid special casing - the tranlation code. - On input T0 = rm, T1 = rd. On output T0 = rd, T1 = rm */ -NEON_OP(trn_u8) -{ - uint32_t rd; - uint32_t rm; - rd = ((T0 & 0x00ff00ff) << 8) | (T1 & 0x00ff00ff); - rm = ((T1 & 0xff00ff00) >> 8) | (T0 & 0xff00ff00); - T0 = rd; - T1 = rm; - FORCE_RET(); -} - -NEON_OP(trn_u16) -{ - uint32_t rd; - uint32_t rm; - rd = (T0 << 16) | (T1 & 0xffff); - rm = (T1 >> 16) | (T0 & 0xffff0000); - T0 = rd; - T1 = rm; - FORCE_RET(); -} - -/* Worker routines for zip and unzip. */ -NEON_OP(unzip_u8) -{ - uint32_t rd; - uint32_t rm; - rd = (T0 & 0xff) | ((T0 >> 8) & 0xff00) - | ((T1 << 16) & 0xff0000) | ((T1 << 8) & 0xff000000); - rm = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00) - | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000); - T0 = rd; - T1 = rm; - FORCE_RET(); -} - -NEON_OP(zip_u8) -{ - uint32_t rd; - uint32_t rm; - rd = (T0 & 0xff) | ((T1 << 8) & 0xff00) - | ((T0 << 16) & 0xff0000) | ((T1 << 24) & 0xff000000); - rm = ((T0 >> 16) & 0xff) | ((T1 >> 8) & 0xff00) - | ((T0 >> 8) & 0xff0000) | (T1 & 0xff000000); - T0 = rd; - T1 = rm; - FORCE_RET(); -} - -NEON_OP(zip_u16) -{ - uint32_t tmp; - - tmp = (T0 & 0xffff) | (T1 << 16); - T1 = (T1 & 0xffff0000) | (T0 >> 16); - T0 = tmp; - FORCE_RET(); -} - -NEON_OP(dup_u8) -{ - T0 = (T0 >> PARAM1) & 0xff; - T0 |= T0 << 8; - T0 |= T0 << 16; - FORCE_RET(); -} |