about | summary | refs | log | tree | commit | diff
path: root/target/arm/vec_helper.c
diff options
context:
space:
mode:
Diffstat (limited to 'target/arm/vec_helper.c')
-rw-r--r-- target/arm/vec_helper.c 120
1 files changed, 67 insertions, 53 deletions
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index 48e3addd81..f88e572132 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -375,71 +375,76 @@ void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
* All elements are treated equally, no matter where they are.
*/
-void HELPER(gvec_sdot_b)(void *vd, void *vn, void *vm, uint32_t desc)
+void HELPER(gvec_sdot_b)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
intptr_t i, opr_sz = simd_oprsz(desc);
- int32_t *d = vd;
+ int32_t *d = vd, *a = va;
int8_t *n = vn, *m = vm;
for (i = 0; i < opr_sz / 4; ++i) {
- d[i] += n[i * 4 + 0] * m[i * 4 + 0]
- + n[i * 4 + 1] * m[i * 4 + 1]
- + n[i * 4 + 2] * m[i * 4 + 2]
- + n[i * 4 + 3] * m[i * 4 + 3];
+ d[i] = (a[i] +
+ n[i * 4 + 0] * m[i * 4 + 0] +
+ n[i * 4 + 1] * m[i * 4 + 1] +
+ n[i * 4 + 2] * m[i * 4 + 2] +
+ n[i * 4 + 3] * m[i * 4 + 3]);
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
-void HELPER(gvec_udot_b)(void *vd, void *vn, void *vm, uint32_t desc)
+void HELPER(gvec_udot_b)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
intptr_t i, opr_sz = simd_oprsz(desc);
- uint32_t *d = vd;
+ uint32_t *d = vd, *a = va;
uint8_t *n = vn, *m = vm;
for (i = 0; i < opr_sz / 4; ++i) {
- d[i] += n[i * 4 + 0] * m[i * 4 + 0]
- + n[i * 4 + 1] * m[i * 4 + 1]
- + n[i * 4 + 2] * m[i * 4 + 2]
- + n[i * 4 + 3] * m[i * 4 + 3];
+ d[i] = (a[i] +
+ n[i * 4 + 0] * m[i * 4 + 0] +
+ n[i * 4 + 1] * m[i * 4 + 1] +
+ n[i * 4 + 2] * m[i * 4 + 2] +
+ n[i * 4 + 3] * m[i * 4 + 3]);
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
-void HELPER(gvec_sdot_h)(void *vd, void *vn, void *vm, uint32_t desc)
+void HELPER(gvec_sdot_h)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
intptr_t i, opr_sz = simd_oprsz(desc);
- int64_t *d = vd;
+ int64_t *d = vd, *a = va;
int16_t *n = vn, *m = vm;
for (i = 0; i < opr_sz / 8; ++i) {
- d[i] += (int64_t)n[i * 4 + 0] * m[i * 4 + 0]
- + (int64_t)n[i * 4 + 1] * m[i * 4 + 1]
- + (int64_t)n[i * 4 + 2] * m[i * 4 + 2]
- + (int64_t)n[i * 4 + 3] * m[i * 4 + 3];
+ d[i] = (a[i] +
+ (int64_t)n[i * 4 + 0] * m[i * 4 + 0] +
+ (int64_t)n[i * 4 + 1] * m[i * 4 + 1] +
+ (int64_t)n[i * 4 + 2] * m[i * 4 + 2] +
+ (int64_t)n[i * 4 + 3] * m[i * 4 + 3]);
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
-void HELPER(gvec_udot_h)(void *vd, void *vn, void *vm, uint32_t desc)
+void HELPER(gvec_udot_h)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
intptr_t i, opr_sz = simd_oprsz(desc);
- uint64_t *d = vd;
+ uint64_t *d = vd, *a = va;
uint16_t *n = vn, *m = vm;
for (i = 0; i < opr_sz / 8; ++i) {
- d[i] += (uint64_t)n[i * 4 + 0] * m[i * 4 + 0]
- + (uint64_t)n[i * 4 + 1] * m[i * 4 + 1]
- + (uint64_t)n[i * 4 + 2] * m[i * 4 + 2]
- + (uint64_t)n[i * 4 + 3] * m[i * 4 + 3];
+ d[i] = (a[i] +
+ (uint64_t)n[i * 4 + 0] * m[i * 4 + 0] +
+ (uint64_t)n[i * 4 + 1] * m[i * 4 + 1] +
+ (uint64_t)n[i * 4 + 2] * m[i * 4 + 2] +
+ (uint64_t)n[i * 4 + 3] * m[i * 4 + 3]);
}
clear_tail(d, opr_sz, simd_maxsz(desc));
}
-void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
+void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
{
intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4;
intptr_t index = simd_data(desc);
- int32_t *d = vd;
+ int32_t *d = vd, *a = va;
int8_t *n = vn;
int8_t *m_indexed = (int8_t *)vm + H4(index) * 4;
@@ -455,10 +460,11 @@ void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
int8_t m3 = m_indexed[i * 4 + 3];
do {
- d[i] += n[i * 4 + 0] * m0
- + n[i * 4 + 1] * m1
- + n[i * 4 + 2] * m2
- + n[i * 4 + 3] * m3;
+ d[i] = (a[i] +
+ n[i * 4 + 0] * m0 +
+ n[i * 4 + 1] * m1 +
+ n[i * 4 + 2] * m2 +
+ n[i * 4 + 3] * m3);
} while (++i < segend);
segend = i + 4;
} while (i < opr_sz_4);
@@ -466,11 +472,12 @@ void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
clear_tail(d, opr_sz, simd_maxsz(desc));
}
-void HELPER(gvec_udot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
+void HELPER(gvec_udot_idx_b)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
{
intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4;
intptr_t index = simd_data(desc);
- uint32_t *d = vd;
+ uint32_t *d = vd, *a = va;
uint8_t *n = vn;
uint8_t *m_indexed = (uint8_t *)vm + H4(index) * 4;
@@ -486,10 +493,11 @@ void HELPER(gvec_udot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
uint8_t m3 = m_indexed[i * 4 + 3];
do {
- d[i] += n[i * 4 + 0] * m0
- + n[i * 4 + 1] * m1
- + n[i * 4 + 2] * m2
- + n[i * 4 + 3] * m3;
+ d[i] = (a[i] +
+ n[i * 4 + 0] * m0 +
+ n[i * 4 + 1] * m1 +
+ n[i * 4 + 2] * m2 +
+ n[i * 4 + 3] * m3);
} while (++i < segend);
segend = i + 4;
} while (i < opr_sz_4);
@@ -497,11 +505,12 @@ void HELPER(gvec_udot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc)
clear_tail(d, opr_sz, simd_maxsz(desc));
}
-void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
+void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
{
intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8;
intptr_t index = simd_data(desc);
- int64_t *d = vd;
+ int64_t *d = vd, *a = va;
int16_t *n = vn;
int16_t *m_indexed = (int16_t *)vm + index * 4;
@@ -509,30 +518,33 @@ void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
* Process the entire segment all at once, writing back the results
* only after we've consumed all of the inputs.
*/
- for (i = 0; i < opr_sz_8 ; i += 2) {
- uint64_t d0, d1;
+ for (i = 0; i < opr_sz_8; i += 2) {
+ int64_t d0, d1;
- d0 = n[i * 4 + 0] * (int64_t)m_indexed[i * 4 + 0];
+ d0 = a[i + 0];
+ d0 += n[i * 4 + 0] * (int64_t)m_indexed[i * 4 + 0];
d0 += n[i * 4 + 1] * (int64_t)m_indexed[i * 4 + 1];
d0 += n[i * 4 + 2] * (int64_t)m_indexed[i * 4 + 2];
d0 += n[i * 4 + 3] * (int64_t)m_indexed[i * 4 + 3];
- d1 = n[i * 4 + 4] * (int64_t)m_indexed[i * 4 + 0];
+
+ d1 = a[i + 1];
+ d1 += n[i * 4 + 4] * (int64_t)m_indexed[i * 4 + 0];
d1 += n[i * 4 + 5] * (int64_t)m_indexed[i * 4 + 1];
d1 += n[i * 4 + 6] * (int64_t)m_indexed[i * 4 + 2];
d1 += n[i * 4 + 7] * (int64_t)m_indexed[i * 4 + 3];
- d[i + 0] += d0;
- d[i + 1] += d1;
+ d[i + 0] = d0;
+ d[i + 1] = d1;
}
-
clear_tail(d, opr_sz, simd_maxsz(desc));
}
-void HELPER(gvec_udot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
+void HELPER(gvec_udot_idx_h)(void *vd, void *vn, void *vm,
+ void *va, uint32_t desc)
{
intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8;
intptr_t index = simd_data(desc);
- uint64_t *d = vd;
+ uint64_t *d = vd, *a = va;
uint16_t *n = vn;
uint16_t *m_indexed = (uint16_t *)vm + index * 4;
@@ -540,22 +552,24 @@ void HELPER(gvec_udot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
* Process the entire segment all at once, writing back the results
* only after we've consumed all of the inputs.
*/
- for (i = 0; i < opr_sz_8 ; i += 2) {
+ for (i = 0; i < opr_sz_8; i += 2) {
uint64_t d0, d1;
- d0 = n[i * 4 + 0] * (uint64_t)m_indexed[i * 4 + 0];
+ d0 = a[i + 0];
+ d0 += n[i * 4 + 0] * (uint64_t)m_indexed[i * 4 + 0];
d0 += n[i * 4 + 1] * (uint64_t)m_indexed[i * 4 + 1];
d0 += n[i * 4 + 2] * (uint64_t)m_indexed[i * 4 + 2];
d0 += n[i * 4 + 3] * (uint64_t)m_indexed[i * 4 + 3];
- d1 = n[i * 4 + 4] * (uint64_t)m_indexed[i * 4 + 0];
+
+ d1 = a[i + 1];
+ d1 += n[i * 4 + 4] * (uint64_t)m_indexed[i * 4 + 0];
d1 += n[i * 4 + 5] * (uint64_t)m_indexed[i * 4 + 1];
d1 += n[i * 4 + 6] * (uint64_t)m_indexed[i * 4 + 2];
d1 += n[i * 4 + 7] * (uint64_t)m_indexed[i * 4 + 3];
- d[i + 0] += d0;
- d[i + 1] += d1;
+ d[i + 0] = d0;
+ d[i + 1] = d1;
}
-
clear_tail(d, opr_sz, simd_maxsz(desc));
}