Diffstat (limited to 'target/arm/vec_helper.c')
-rw-r--r--  target/arm/vec_helper.c | 311
1 file changed, 291 insertions, 20 deletions
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index f504dd53c8..37f338732e 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -194,6 +194,197 @@ void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
clear_tail(d, opr_sz, simd_maxsz(desc));
}
+/* Integer 8 and 16-bit dot-product.
+ *
+ * Note that for the loops herein, host endianness does not matter
+ * with respect to the ordering of data within the 64-bit lanes.
+ * All elements are treated equally, no matter where they are.
+ */
+
+void HELPER(gvec_sdot_b)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ uint32_t *d = vd;
+ int8_t *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 4; ++i) {
+ d[i] += n[i * 4 + 0] * m[i * 4 + 0]
+ + n[i * 4 + 1] * m[i * 4 + 1]
+ + n[i * 4 + 2] * m[i * 4 + 2]
+ + n[i * 4 + 3] * m[i * 4 + 3];
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_udot_b)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ uint32_t *d = vd;
+ uint8_t *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 4; ++i) {
+ d[i] += n[i * 4 + 0] * m[i * 4 + 0]
+ + n[i * 4 + 1] * m[i * 4 + 1]
+ + n[i * 4 + 2] * m[i * 4 + 2]
+ + n[i * 4 + 3] * m[i * 4 + 3];
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_sdot_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ uint64_t *d = vd;
+ int16_t *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 8; ++i) {
+ d[i] += (int64_t)n[i * 4 + 0] * m[i * 4 + 0]
+ + (int64_t)n[i * 4 + 1] * m[i * 4 + 1]
+ + (int64_t)n[i * 4 + 2] * m[i * 4 + 2]
+ + (int64_t)n[i * 4 + 3] * m[i * 4 + 3];
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_udot_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc);
+ uint64_t *d = vd;
+ uint16_t *n = vn, *m = vm;
+
+ for (i = 0; i < opr_sz / 8; ++i) {
+ d[i] += (uint64_t)n[i * 4 + 0] * m[i * 4 + 0]
+ + (uint64_t)n[i * 4 + 1] * m[i * 4 + 1]
+ + (uint64_t)n[i * 4 + 2] * m[i * 4 + 2]
+ + (uint64_t)n[i * 4 + 3] * m[i * 4 + 3];
+ }
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
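
A minimal standalone sketch of the per-lane arithmetic implemented by the four helpers above (hypothetical names, plain C, not part of the patch): each destination lane accumulates the sum of four element-wise products.

#include <stdint.h>

/* One 32-bit lane of SDOT: accumulate four signed 8-bit products. */
static uint32_t dot_lane_s8(uint32_t acc, const int8_t n[4], const int8_t m[4])
{
    return acc + n[0] * m[0] + n[1] * m[1] + n[2] * m[2] + n[3] * m[3];
}

/* e.g. dot_lane_s8(0, (int8_t[]){1, -2, 3, -4}, (int8_t[]){5, 6, 7, 8})
 * == 5 - 12 + 21 - 32 == -18, i.e. 0xffffffee in the uint32_t lane.
 */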
+
+void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4;
+ intptr_t index = simd_data(desc);
+ uint32_t *d = vd;
+ int8_t *n = vn;
+ int8_t *m_indexed = (int8_t *)vm + index * 4;
+
+ /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd.
+ * Otherwise opr_sz is a multiple of 16.
+ */
+ segend = MIN(4, opr_sz_4);
+ i = 0;
+ do {
+ int8_t m0 = m_indexed[i * 4 + 0];
+ int8_t m1 = m_indexed[i * 4 + 1];
+ int8_t m2 = m_indexed[i * 4 + 2];
+ int8_t m3 = m_indexed[i * 4 + 3];
+
+ do {
+ d[i] += n[i * 4 + 0] * m0
+ + n[i * 4 + 1] * m1
+ + n[i * 4 + 2] * m2
+ + n[i * 4 + 3] * m3;
+ } while (++i < segend);
+ segend = i + 4;
+ } while (i < opr_sz_4);
+
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_udot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4;
+ intptr_t index = simd_data(desc);
+ uint32_t *d = vd;
+ uint8_t *n = vn;
+ uint8_t *m_indexed = (uint8_t *)vm + index * 4;
+
+ /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd.
+ * Otherwise opr_sz is a multiple of 16.
+ */
+ segend = MIN(4, opr_sz_4);
+ i = 0;
+ do {
+ uint8_t m0 = m_indexed[i * 4 + 0];
+ uint8_t m1 = m_indexed[i * 4 + 1];
+ uint8_t m2 = m_indexed[i * 4 + 2];
+ uint8_t m3 = m_indexed[i * 4 + 3];
+
+ do {
+ d[i] += n[i * 4 + 0] * m0
+ + n[i * 4 + 1] * m1
+ + n[i * 4 + 2] * m2
+ + n[i * 4 + 3] * m3;
+ } while (++i < segend);
+ segend = i + 4;
+ } while (i < opr_sz_4);
+
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
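
A sketch of the addressing used by gvec_sdot_idx_b/gvec_udot_idx_b above: index selects one 4-byte group of m within each 128-bit segment, and that group is reused for all four 32-bit lanes of the segment (hypothetical helper name, illustration only).

#include <stdint.h>

/* Byte offset into vm of the m group that feeds 32-bit lane `lane`. */
static inline intptr_t dot_idx_b_m_offset(intptr_t lane, intptr_t index)
{
    intptr_t segment = lane / 4;     /* 128-bit segment containing the lane */
    return segment * 16 + index * 4; /* matches vm + index*4 + i*4 with i at
                                      * the segment's first lane */
}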
+
+void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8;
+ intptr_t index = simd_data(desc);
+ uint64_t *d = vd;
+ int16_t *n = vn;
+ int16_t *m_indexed = (int16_t *)vm + index * 4;
+
+ /* This is supported by SVE only, so opr_sz is always a multiple of 16.
+ * Process the entire segment all at once, writing back the results
+ * only after we've consumed all of the inputs.
+ */
+ for (i = 0; i < opr_sz_8; i += 2) {
+ uint64_t d0, d1;
+
+ d0 = n[i * 4 + 0] * (int64_t)m_indexed[i * 4 + 0];
+ d0 += n[i * 4 + 1] * (int64_t)m_indexed[i * 4 + 1];
+ d0 += n[i * 4 + 2] * (int64_t)m_indexed[i * 4 + 2];
+ d0 += n[i * 4 + 3] * (int64_t)m_indexed[i * 4 + 3];
+ d1 = n[i * 4 + 4] * (int64_t)m_indexed[i * 4 + 0];
+ d1 += n[i * 4 + 5] * (int64_t)m_indexed[i * 4 + 1];
+ d1 += n[i * 4 + 6] * (int64_t)m_indexed[i * 4 + 2];
+ d1 += n[i * 4 + 7] * (int64_t)m_indexed[i * 4 + 3];
+
+ d[i + 0] += d0;
+ d[i + 1] += d1;
+ }
+
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
+
+void HELPER(gvec_udot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+ intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8;
+ intptr_t index = simd_data(desc);
+ uint64_t *d = vd;
+ uint16_t *n = vn;
+ uint16_t *m_indexed = (uint16_t *)vm + index * 4;
+
+ /* This is supported by SVE only, so opr_sz is always a multiple of 16.
+ * Process the entire segment all at once, writing back the results
+ * only after we've consumed all of the inputs.
+ */
+ for (i = 0; i < opr_sz_8; i += 2) {
+ uint64_t d0, d1;
+
+ d0 = n[i * 4 + 0] * (uint64_t)m_indexed[i * 4 + 0];
+ d0 += n[i * 4 + 1] * (uint64_t)m_indexed[i * 4 + 1];
+ d0 += n[i * 4 + 2] * (uint64_t)m_indexed[i * 4 + 2];
+ d0 += n[i * 4 + 3] * (uint64_t)m_indexed[i * 4 + 3];
+ d1 = n[i * 4 + 4] * (uint64_t)m_indexed[i * 4 + 0];
+ d1 += n[i * 4 + 5] * (uint64_t)m_indexed[i * 4 + 1];
+ d1 += n[i * 4 + 6] * (uint64_t)m_indexed[i * 4 + 2];
+ d1 += n[i * 4 + 7] * (uint64_t)m_indexed[i * 4 + 3];
+
+ d[i + 0] += d0;
+ d[i + 1] += d1;
+ }
+
+ clear_tail(d, opr_sz, simd_maxsz(desc));
+}
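
The d0/d1 temporaries defer the write-back until the whole segment's inputs have been read, as the comment says, presumably so the helpers stay correct when vd aliases vm. The m addressing mirrors the byte case, now in halfwords, with each 128-bit segment covering two 64-bit result lanes (hypothetical helper name, illustration only).

#include <stdint.h>

/* Halfword offset into vm of the m group used for 64-bit lanes i and i+1. */
static inline intptr_t dot_idx_h_m_offset(intptr_t i, intptr_t index)
{
    intptr_t segment = i / 2;       /* one 128-bit segment = two uint64 lanes */
    return segment * 8 + index * 4; /* matches vm + index*4 + i*4 with i even */
}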
+
void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
void *vfpst, uint32_t desc)
{
@@ -317,23 +508,29 @@ void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm,
float_status *fpst = vfpst;
intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+ intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
uint32_t neg_real = flip ^ neg_imag;
- uintptr_t i;
- float16 e1 = m[H2(flip)];
- float16 e3 = m[H2(1 - flip)];
+ intptr_t elements = opr_sz / sizeof(float16);
+ intptr_t eltspersegment = 16 / sizeof(float16);
+ intptr_t i, j;
/* Shift boolean to the sign bit so we can xor to negate. */
neg_real <<= 15;
neg_imag <<= 15;
- e1 ^= neg_real;
- e3 ^= neg_imag;
- for (i = 0; i < opr_sz / 2; i += 2) {
- float16 e2 = n[H2(i + flip)];
- float16 e4 = e2;
+ for (i = 0; i < elements; i += eltspersegment) {
+ float16 mr = m[H2(i + 2 * index + 0)];
+ float16 mi = m[H2(i + 2 * index + 1)];
+ float16 e1 = neg_real ^ (flip ? mi : mr);
+ float16 e3 = neg_imag ^ (flip ? mr : mi);
- d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst);
- d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst);
+ for (j = i; j < i + eltspersegment; j += 2) {
+ float16 e2 = n[H2(j + flip)];
+ float16 e4 = e2;
+
+ d[H2(j)] = float16_muladd(e2, e1, d[H2(j)], 0, fpst);
+ d[H2(j + 1)] = float16_muladd(e4, e3, d[H2(j + 1)], 0, fpst);
+ }
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
@@ -377,23 +574,29 @@ void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm,
float_status *fpst = vfpst;
intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
+ intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
uint32_t neg_real = flip ^ neg_imag;
- uintptr_t i;
- float32 e1 = m[H4(flip)];
- float32 e3 = m[H4(1 - flip)];
+ intptr_t elements = opr_sz / sizeof(float32);
+ intptr_t eltspersegment = 16 / sizeof(float32);
+ intptr_t i, j;
/* Shift boolean to the sign bit so we can xor to negate. */
neg_real <<= 31;
neg_imag <<= 31;
- e1 ^= neg_real;
- e3 ^= neg_imag;
- for (i = 0; i < opr_sz / 4; i += 2) {
- float32 e2 = n[H4(i + flip)];
- float32 e4 = e2;
+ for (i = 0; i < elements; i += eltspersegment) {
+ float32 mr = m[H4(i + 2 * index + 0)];
+ float32 mi = m[H4(i + 2 * index + 1)];
+ float32 e1 = neg_real ^ (flip ? mi : mr);
+ float32 e3 = neg_imag ^ (flip ? mr : mi);
- d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst);
- d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst);
+ for (j = i; j < i + eltspersegment; j += 2) {
+ float32 e2 = n[H4(j + flip)];
+ float32 e4 = e2;
+
+ d[H4(j)] = float32_muladd(e2, e1, d[H4(j)], 0, fpst);
+ d[H4(j + 1)] = float32_muladd(e4, e3, d[H4(j + 1)], 0, fpst);
+ }
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
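
The restructured loops re-read the indexed complex operand (mr, mi) once per 128-bit segment rather than once per vector. A plain-float mirror of the per-pair arithmetic, with the sign-bit XOR written as an explicit negation and the float_status-aware muladd reduced to fmaf (hypothetical names, illustration only):

#include <math.h>

/* d is one (real, imag) pair of the destination, n the matching pair of
 * the first operand; mr/mi come from the indexed pair of m.
 */
static void fcmla_pair(float *d, const float *n, float mr, float mi,
                       int flip, int neg_real, int neg_imag)
{
    float e1 = flip ? mi : mr;
    float e3 = flip ? mr : mi;
    float e2 = n[flip];             /* the same n element feeds both results */

    if (neg_real) {
        e1 = -e1;
    }
    if (neg_imag) {
        e3 = -e3;
    }
    d[0] = fmaf(e2, e1, d[0]);
    d[1] = fmaf(e2, e3, d[1]);
}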
@@ -427,6 +630,26 @@ void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm,
clear_tail(d, opr_sz, simd_maxsz(desc));
}
+#define DO_2OP(NAME, FUNC, TYPE) \
+void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
+{ \
+ intptr_t i, oprsz = simd_oprsz(desc); \
+ TYPE *d = vd, *n = vn; \
+ for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
+ d[i] = FUNC(n[i], stat); \
+ } \
+}
+
+DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
+DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
+DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
+
+DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
+DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
+DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
+
+#undef DO_2OP
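
For reference, DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32) expands to the following (HELPER() supplies the helper_ prefix):

void helper_gvec_frecpe_s(void *vd, void *vn, void *stat, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    float32 *d = vd, *n = vn;
    for (i = 0; i < oprsz / sizeof(float32); i++) {
        d[i] = helper_recpe_f32(n[i], stat);
    }
}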
+
/* Floating-point trigonometric starting value.
* See the ARM ARM pseudocode function FPTrigSMul.
*/
@@ -495,3 +718,51 @@ DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
#endif
#undef DO_3OP
+
+/* For the indexed ops, SVE applies the index per 128-bit vector segment.
+ * For AdvSIMD, there is of course only one such vector segment.
+ */
+
+#define DO_MUL_IDX(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
+{ \
+ intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
+ intptr_t idx = simd_data(desc); \
+ TYPE *d = vd, *n = vn, *m = vm; \
+ for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
+ TYPE mm = m[H(i + idx)]; \
+ for (j = 0; j < segment; j++) { \
+ d[i + j] = TYPE##_mul(n[i + j], mm, stat); \
+ } \
+ } \
+}
+
+DO_MUL_IDX(gvec_fmul_idx_h, float16, H2)
+DO_MUL_IDX(gvec_fmul_idx_s, float32, H4)
+DO_MUL_IDX(gvec_fmul_idx_d, float64, )
+
+#undef DO_MUL_IDX
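
A sketch of which m element multiplies a given output lane in DO_MUL_IDX: the index is applied within each 128-bit segment, so an SVE vector longer than 128 bits picks a fresh element per segment (hypothetical helper, H()-style byte-order indexing omitted, illustration only).

#include <stdint.h>

/* Element index into m used for output lane `lane`, for elements of
 * `esize` bytes (2, 4 or 8).
 */
static inline intptr_t mul_idx_m_element(intptr_t lane, intptr_t idx,
                                         intptr_t esize)
{
    intptr_t segment = 16 / esize;   /* elements per 128-bit segment */
    return (lane / segment) * segment + idx;
}

/* e.g. float32 (esize 4), idx == 1, 256-bit vector:
 * lanes 0..3 use m[1], lanes 4..7 use m[5].
 */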
+
+#define DO_FMLA_IDX(NAME, TYPE, H) \
+void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
+ void *stat, uint32_t desc) \
+{ \
+ intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
+ TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \
+ intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \
+ TYPE *d = vd, *n = vn, *m = vm, *a = va; \
+ op1_neg <<= (8 * sizeof(TYPE) - 1); \
+ for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
+ TYPE mm = m[H(i + idx)]; \
+ for (j = 0; j < segment; j++) { \
+ d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \
+ mm, a[i + j], 0, stat); \
+ } \
+ } \
+}
+
+DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
+DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
+DO_FMLA_IDX(gvec_fmla_idx_d, float64, )
+
+#undef DO_FMLA_IDX
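
In DO_FMLA_IDX, bit 0 of the data field is the negate flag and the remaining bits hold the index; op1_neg is then widened into a sign-bit mask so the first multiplicand can be negated by XOR. A plain-float mirror of one inner step, with the fused multiply-add reduced to fmaf (hypothetical name, illustration only):

#include <math.h>
#include <stdbool.h>

/* d[i + j] = (neg ? -n : n) * mm + a[i + j], as a fused multiply-add. */
static inline float fmla_idx_step(float n, float mm, float a, bool neg)
{
    return fmaf(neg ? -n : n, mm, a);
}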