aboutsummaryrefslogtreecommitdiff
path: root/libraries/atlas/atlas.patch
diff options
context:
space:
mode:
Diffstat (limited to 'libraries/atlas/atlas.patch')
-rw-r--r--libraries/atlas/atlas.patch5072
1 files changed, 0 insertions, 5072 deletions
diff --git a/libraries/atlas/atlas.patch b/libraries/atlas/atlas.patch
deleted file mode 100644
index dea4dcc0b2ee..000000000000
--- a/libraries/atlas/atlas.patch
+++ /dev/null
@@ -1,5072 +0,0 @@
-diff -rupN ATLAS/CONFIG/src/backend/archinfo_x86.c atlas-3.8.3/CONFIG/src/backend/archinfo_x86.c
---- ATLAS/CONFIG/src/backend/archinfo_x86.c 2009-02-18 19:47:37.000000000 +0100
-+++ atlas-3.8.3/CONFIG/src/backend/archinfo_x86.c 2009-11-12 13:47:23.777451677 +0100
-@@ -320,7 +320,7 @@ enum MACHTYPE Chip2Mach(enum CHIP chip,
- iret = IntP4;
- break;
- case 3:
-- case 4:
-+ case 4: ; case 6:
- iret = IntP4E;
- break;
- default:
-diff -rupN ATLAS/include/atlas_lvl3.h atlas-3.8.3/include/atlas_lvl3.h
---- ATLAS/include/atlas_lvl3.h 2009-02-18 19:47:35.000000000 +0100
-+++ atlas-3.8.3/include/atlas_lvl3.h 2009-11-12 13:52:49.308496090 +0100
-@@ -126,7 +126,7 @@
- #define CPAT Mjoin(C_ATL_, PRE);
-
- #ifndef ATL_MaxMalloc
-- #define ATL_MaxMalloc 67108864
-+ #define ATL_MaxMalloc XXX_MaxMalloc_XXX
- #endif
-
- typedef void (*MAT2BLK)(int, int, const TYPE*, int, TYPE*, const SCALAR);
-diff -rupN ATLAS/src/blas/gemm/ATL_cmmJITcp.c atlas-3.8.3/src/blas/gemm/ATL_cmmJITcp.c
---- ATLAS/src/blas/gemm/ATL_cmmJITcp.c 2009-02-18 19:47:44.000000000 +0100
-+++ atlas-3.8.3/src/blas/gemm/ATL_cmmJITcp.c 2009-11-12 12:44:34.816529051 +0100
-@@ -268,7 +268,8 @@ static void Mjoin(PATL,mmK)
- {
- NBmm0 = NBmm1 = NBmmX = Mjoin(PATLU,pKBmm);
- if (SCALAR_IS_ZERO(beta))
-- Mjoin(PATL,gezero)(M, N, C, ldc);
-+ /* Mjoin(PATL,gezero)(M, N, C, ldc); */
-+ { Mjoin(PATLU,gezero)(M, N, pC, ldpc); Mjoin(PATLU,gezero)(M, N, pC+ipc, ldpc); }
- }
- if (nblk)
- {
-diff -rupN ATLAS/src/blas/gemm/ATL_gereal2cplx.c atlas-3.8.3/src/blas/gemm/ATL_gereal2cplx.c
---- ATLAS/src/blas/gemm/ATL_gereal2cplx.c 2009-02-18 19:47:44.000000000 +0100
-+++ atlas-3.8.3/src/blas/gemm/ATL_gereal2cplx.c 2009-11-12 12:49:49.331651677 +0100
-@@ -43,7 +43,53 @@ void Mjoin(PATL,gereal2cplx)
- const int ldc2 = (ldc-M)<<1;
- int i, j;
-
-- if (ialp == ATL_rzero && ibet == ATL_rzero)
-+/*
-+ * Cannot read C if BETA is 0
-+ */
-+ if (rbet == ATL_rzero && ibet == ATL_rzero)
-+ {
-+ if (ialp == ATL_rzero) /* alpha is a real number */
-+ {
-+ if (ralp == ATL_rone) /* alpha = 1.0 */
-+ {
-+ for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2)
-+ {
-+ for (i=0; i < M; i++, C += 2)
-+ {
-+ *C = R[i];
-+ C[1] = I[i];
-+ }
-+ }
-+ }
-+ else
-+ {
-+ for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2)
-+ {
-+ for (i=0; i < M; i++, C += 2)
-+ {
-+ *C = ralp * R[i];
-+ C[1] = ralp * I[i];
-+ }
-+ }
-+ }
-+ }
-+ else /* alpha is a complex number */
-+ {
-+ for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2)
-+ {
-+ for (i=0; i < M; i++, C += 2)
-+ {
-+ ra = R[i]; ia = I[i];
-+ C[0] = ralp * ra - ialp * ia;
-+ C[1] = ralp * ia + ialp * ra;
-+ }
-+ }
-+ }
-+ }
-+/*
-+ * If alpha and beta are both real numbers
-+ */
-+ else if (ialp == ATL_rzero && ibet == ATL_rzero)
- {
- if (ralp == ATL_rone && rbet == ATL_rone)
- {
-diff -rupN ATLAS/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c atlas-3.8.3/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c
---- ATLAS/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c 2009-02-18 19:48:26.000000000 +0100
-+++ atlas-3.8.3/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c 2009-11-12 12:35:50.453038827 +0100
-@@ -27,6 +27,13 @@
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-+#if KB > 84
-+ #error "KB cannot exceed 84!"
-+#endif
-+#if (KB/4)*4 != KB
-+ #error "KB must be a multiple of 4!"
-+#endif
-+
- #ifndef ATL_GAS_x8664
- #error "This kernel requires x86-64 assembly!"
- #endif
-@@ -58,25 +65,25 @@
- * Integer register usage shown be these defines
- */
- #define pA %rcx
--#define pA10 %rbx
--#define ldab %rbp
--#define mldab %rdx
-+#define pA10 %rbx
-+#define ldab %rbp
-+#define mldab %rdx
- #define mldab5 %rax
- #define pB %rdi
- #define pC %rsi
- #define incCn %r10
- #define stM %r9
- #define stN %r11
--#define pfA %r8
--#define pA5 pA
--#define pB0 pB
-+#define pfA %r8
-+#define pA5 pA
-+#define pB0 pB
- #if MB == 0
-- #define stM0 %r12
-- #define incAm %r13
-+ #define stM0 %r12
-+ #define incAm %r13
- #endif
- /* rax used in 32/64 conversion */
-
--#define NBso (KB*4)
-+#define NBso (KB*4)
- #define MBKBso (MB*KB*4)
- #define NB2so (NBso+NBso)
- #define NB3so (NBso+NBso+NBso)
-@@ -95,22 +102,22 @@
- /*
- * SSE2 register usage shown be these defines
- */
--#define rA0 %xmm0
--#define rB0 %xmm1
--#define rC0 %xmm2
--#define rC1 %xmm3
--#define rC2 %xmm4
--#define rC3 %xmm5
--#define rC4 %xmm6
--#define rC5 %xmm7
--#define rC6 %xmm8
--#define rC7 %xmm9
--#define rC8 %xmm10
--#define rC9 %xmm11
--#define rC10 %xmm12
--#define rC11 %xmm13
--#define rC12 %xmm14
--#define rC13 %xmm15
-+#define rA0 %xmm0
-+#define rB0 %xmm1
-+#define rC0 %xmm2
-+#define rC1 %xmm3
-+#define rC2 %xmm4
-+#define rC3 %xmm5
-+#define rC4 %xmm6
-+#define rC5 %xmm7
-+#define rC6 %xmm8
-+#define rC7 %xmm9
-+#define rC8 %xmm10
-+#define rC9 %xmm11
-+#define rC10 %xmm12
-+#define rC11 %xmm13
-+#define rC12 %xmm14
-+#define rC13 %xmm15
- /*
- * Prefetch defines
- */
-@@ -127,99 +134,99 @@
- #if MB != 0
- #define incAm $MBKBso-NB14so+176
- #endif
-- .text
-+ .text
- .global ATL_asmdecor(ATL_USERMM)
- ATL_asmdecor(ATL_USERMM):
- /*
- * Save callee-saved iregs
- */
-- movq %rbp, -8(%rsp)
-- movq %rbx, -16(%rsp)
-+ movq %rbp, -8(%rsp)
-+ movq %rbx, -16(%rsp)
- #if MB == 0
-- movq %r12, -32(%rsp)
-- movq %r13, -40(%rsp)
-+ movq %r12, -32(%rsp)
-+ movq %r13, -40(%rsp)
- #endif
- #ifdef BETAX
- #define BOF -56
-- movss %xmm1, BOF(%rsp)
-- movss %xmm1, BOF+4(%rsp)
-- movss %xmm1, BOF+8(%rsp)
-- movss %xmm1, BOF+12(%rsp)
-+ movss %xmm1, BOF(%rsp)
-+ movss %xmm1, BOF+4(%rsp)
-+ movss %xmm1, BOF+8(%rsp)
-+ movss %xmm1, BOF+12(%rsp)
- #endif
- /*
- * pA already comes in right reg
- * Initialize pB = B; pC = C; NBso = NB * sizeof;
- */
-- movq %rsi, stN
-- movq %rdi, %rax
-- movq 16(%rsp), pC
-- prefC((pC))
-- prefC(64(pC))
-- movq %r9, pB
-- prefB((pB))
-- prefB(64(pB))
-- movq %rax, stM
-+ movq %rsi, stN
-+ movq %rdi, %rax
-+ movq 16(%rsp), pC
-+ prefC((pC))
-+ prefC(64(pC))
-+ movq %r9, pB
-+ prefB((pB))
-+ prefB(64(pB))
-+ movq %rax, stM
- /*
- * stM = pA + NBNBso; stN = pB + NBNBso;
- */
- #if MB == 0
-- movq stM, pfA
-- imulq $NBso, pfA
-- prefB(128(pB))
-- movq pfA, incAm
-- addq pA5, pfA
-- addq $176-NB14so, incAm
-+ movq stM, pfA
-+ imulq $NBso, pfA
-+ prefB(128(pB))
-+ movq pfA, incAm
-+ addq pA5, pfA
-+ addq $176-NB14so, incAm
- #else
-- movq $MBKBso, pfA
-- addq pA5, pfA
-- prefB(128(pB))
-+ movq $MBKBso, pfA
-+ addq pA5, pfA
-+ prefB(128(pB))
- #endif
- /*
- * convert ldc to 64 bits, and then set incCn = (ldc - MB)*sizeof
- */
-- movl 24(%rsp), %eax
-- cltq
-- movq %rax, incCn
-- subq stM, incCn
-- addq $14, incCn
-+ movl 24(%rsp), %eax
-+ cltq
-+ movq %rax, incCn
-+ subq stM, incCn
-+ addq $14, incCn
- #ifdef SREAL
-- shl $2, incCn
-+ shl $2, incCn
- #else
-- shl $3, incCn
-- prefC(128(pC))
-- prefC(192(pC))
-+ shl $3, incCn
-+ prefC(128(pC))
-+ prefC(192(pC))
- #endif
- /*
- * Find M/14 if MB is not set
- */
- #if MB == 0
-- cmp $84, stM
-- jne MB_LT84
--/* movq $84/14, stM */
-- movq $6, stM
-+ cmp $84, stM
-+ jne MB_LT84
-+/* movq $84/14, stM */
-+ movq $6, stM
- MBFOUND:
-- subq $1, stM
-- movq stM, stM0
-+ subq $1, stM
-+ movq stM, stM0
- #endif
-- addq $120, pA5
-- addq $120, pB0
-- movq $KB*4, ldab
-- movq $-KB*5*4, mldab5
-- movq $-KB*4, mldab
-- subq mldab5, pA5
-- lea KB*4(pA5, ldab,4), pA10
--/* movq $NB, stN */
-+ addq $120, pA5
-+ addq $120, pB0
-+ movq $KB*4, ldab
-+ movq $-KB*5*4, mldab5
-+ movq $-KB*4, mldab
-+ subq mldab5, pA5
-+ lea KB*4(pA5, ldab,4), pA10
-+/* movq $NB, stN */
-
- UNLOOP:
- #if MB == 0
-- movq stM0, stM
-- cmp $0, stM
-- je MLAST
-+ movq stM0, stM
-+ cmp $0, stM
-+ je MLAST
- #else
- #ifdef ATL_DivAns
-- movq $ATL_DivAns-1, stM
-+ movq $ATL_DivAns-1, stM
- #else
-- movq $MB/14-1, stM
-+ movq $MB/14-1, stM
- #endif
- #endif
- #if MB == 0 || MB > 14
-@@ -227,992 +234,992 @@ UMLOOP:
- /*
- * rC[0-13] = pC[0-13] * beta
- */
-- ALIGN16
-+ ALIGN16
- /*UKLOOP: */
- #ifdef BETA1
-- movaps 0-120(pA10,mldab5,2), rC0
-- movaps 0-120(pB0), rB0
-- mulps rB0, rC0
-- addss (pC), rC0
-- movaps 0-120(pA5, mldab,4), rC1
-- mulps rB0, rC1
-- addss CMUL(4)(pC), rC1
-- movaps 0-120(pA10, mldab,8), rC2
-- mulps rB0, rC2
-- addss CMUL(8)(pC), rC2
-- movaps 0-120(pA5, mldab,2), rC3
-- mulps rB0, rC3
-- addss CMUL(12)(pC), rC3
-- movaps 0-120(pA5, mldab), rC4
-- mulps rB0, rC4
-- addss CMUL(16)(pC), rC4
-- movaps 0-120(pA5), rC5
-- mulps rB0, rC5
-- addss CMUL(20)(pC), rC5
-- movaps 0-120(pA5, ldab), rC6
-- mulps rB0, rC6
-- addss CMUL(24)(pC), rC6
-- movaps 0-120(pA5, ldab,2), rC7
-- mulps rB0, rC7
-- addss CMUL(28)(pC), rC7
-- movaps 0-120(pA10, mldab,2), rC8
-- mulps rB0, rC8
-- addss CMUL(32)(pC), rC8
-- movaps 0-120(pA5,ldab,4), rC9
-- mulps rB0, rC9
-- addss CMUL(36)(pC), rC9
-- movaps 0-120(pA10), rC10
-- mulps rB0, rC10
-- addss CMUL(40)(pC), rC10
-- movaps 0-120(pA10,ldab), rC11
-- mulps rB0, rC11
-- addss CMUL(44)(pC), rC11
-- movaps 0-120(pA10,ldab,2), rC12
-- mulps rB0, rC12
-- addss CMUL(48)(pC), rC12
-- movaps 0-120(pA5,ldab,8), rC13
-- mulps rB0, rC13
-- addss CMUL(52)(pC), rC13
-+ movaps 0-120(pA10,mldab5,2), rC0
-+ movaps 0-120(pB0), rB0
-+ mulps rB0, rC0
-+ addss (pC), rC0
-+ movaps 0-120(pA5, mldab,4), rC1
-+ mulps rB0, rC1
-+ addss CMUL(4)(pC), rC1
-+ movaps 0-120(pA10, mldab,8), rC2
-+ mulps rB0, rC2
-+ addss CMUL(8)(pC), rC2
-+ movaps 0-120(pA5, mldab,2), rC3
-+ mulps rB0, rC3
-+ addss CMUL(12)(pC), rC3
-+ movaps 0-120(pA5, mldab), rC4
-+ mulps rB0, rC4
-+ addss CMUL(16)(pC), rC4
-+ movaps 0-120(pA5), rC5
-+ mulps rB0, rC5
-+ addss CMUL(20)(pC), rC5
-+ movaps 0-120(pA5, ldab), rC6
-+ mulps rB0, rC6
-+ addss CMUL(24)(pC), rC6
-+ movaps 0-120(pA5, ldab,2), rC7
-+ mulps rB0, rC7
-+ addss CMUL(28)(pC), rC7
-+ movaps 0-120(pA10, mldab,2), rC8
-+ mulps rB0, rC8
-+ addss CMUL(32)(pC), rC8
-+ movaps 0-120(pA5,ldab,4), rC9
-+ mulps rB0, rC9
-+ addss CMUL(36)(pC), rC9
-+ movaps 0-120(pA10), rC10
-+ mulps rB0, rC10
-+ addss CMUL(40)(pC), rC10
-+ movaps 0-120(pA10,ldab), rC11
-+ mulps rB0, rC11
-+ addss CMUL(44)(pC), rC11
-+ movaps 0-120(pA10,ldab,2), rC12
-+ mulps rB0, rC12
-+ addss CMUL(48)(pC), rC12
-+ movaps 0-120(pA5,ldab,8), rC13
-+ mulps rB0, rC13
-+ addss CMUL(52)(pC), rC13
- #else
-- movaps 0-120(pA10,mldab5,2), rC0
-- movaps 0-120(pB0), rC13
-- mulps rC13, rC0
-- movaps 0-120(pA5, mldab,4), rC1
-- mulps rC13, rC1
-- movaps 0-120(pA10, mldab,8), rC2
-- mulps rC13, rC2
-- movaps 0-120(pA5, mldab,2), rC3
-- mulps rC13, rC3
-- movaps 0-120(pA5, mldab), rC4
-- mulps rC13, rC4
-- movaps 0-120(pA5), rC5
-- mulps rC13, rC5
-- movaps 0-120(pA5, ldab), rC6
-- mulps rC13, rC6
-- movaps 0-120(pA5, ldab,2), rC7
-- mulps rC13, rC7
-- movaps 0-120(pA10, mldab,2), rC8
-- mulps rC13, rC8
-- movaps 0-120(pA5,ldab,4), rC9
-- mulps rC13, rC9
-- movaps 0-120(pA10), rC10
-- mulps rC13, rC10
-- movaps 0-120(pA10,ldab), rC11
-- mulps rC13, rC11
-- movaps 0-120(pA10,ldab,2), rC12
-- mulps rC13, rC12
-- mulps 0-120(pA5,ldab,8), rC13
-+ movaps 0-120(pA10,mldab5,2), rC0
-+ movaps 0-120(pB0), rC13
-+ mulps rC13, rC0
-+ movaps 0-120(pA5, mldab,4), rC1
-+ mulps rC13, rC1
-+ movaps 0-120(pA10, mldab,8), rC2
-+ mulps rC13, rC2
-+ movaps 0-120(pA5, mldab,2), rC3
-+ mulps rC13, rC3
-+ movaps 0-120(pA5, mldab), rC4
-+ mulps rC13, rC4
-+ movaps 0-120(pA5), rC5
-+ mulps rC13, rC5
-+ movaps 0-120(pA5, ldab), rC6
-+ mulps rC13, rC6
-+ movaps 0-120(pA5, ldab,2), rC7
-+ mulps rC13, rC7
-+ movaps 0-120(pA10, mldab,2), rC8
-+ mulps rC13, rC8
-+ movaps 0-120(pA5,ldab,4), rC9
-+ mulps rC13, rC9
-+ movaps 0-120(pA10), rC10
-+ mulps rC13, rC10
-+ movaps 0-120(pA10,ldab), rC11
-+ mulps rC13, rC11
-+ movaps 0-120(pA10,ldab,2), rC12
-+ mulps rC13, rC12
-+ mulps 0-120(pA5,ldab,8), rC13
- #endif
-
- #if KB > 4
-- movaps 16-120(pA10,mldab5,2), rA0
-- movaps 16-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 16-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 16-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 16-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 16-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 16-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 16-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 16-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 16-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 16-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 16-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 16-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 16-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 16-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 16-120(pA10,mldab5,2), rA0
-+ movaps 16-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 16-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 16-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 16-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 16-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 16-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 16-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 16-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 16-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 16-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 16-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 16-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 16-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 16-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 8
-- movaps 32-120(pA10,mldab5,2), rA0
-- movaps 32-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 32-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 32-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 32-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 32-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 32-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 32-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 32-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 32-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 32-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 32-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 32-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 32-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 32-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 32-120(pA10,mldab5,2), rA0
-+ movaps 32-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 32-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 32-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 32-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 32-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 32-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 32-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 32-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 32-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 32-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 32-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 32-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 32-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 32-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 12
-- movaps 48-120(pA10,mldab5,2), rA0
-- movaps 48-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 48-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 48-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 48-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 48-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 48-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 48-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 48-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 48-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 48-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 48-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 48-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 48-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 48-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 48-120(pA10,mldab5,2), rA0
-+ movaps 48-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 48-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 48-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 48-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 48-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 48-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 48-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 48-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 48-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 48-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 48-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 48-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 48-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 48-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 16
-- movaps 64-120(pA10,mldab5,2), rA0
-- movaps 64-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 64-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 64-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 64-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 64-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 64-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 64-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 64-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 64-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 64-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 64-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 64-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 64-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 64-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 64-120(pA10,mldab5,2), rA0
-+ movaps 64-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 64-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 64-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 64-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 64-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 64-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 64-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 64-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 64-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 64-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 64-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 64-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 64-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 64-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 20
-- movaps 80-120(pA10,mldab5,2), rA0
-- movaps 80-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 80-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 80-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 80-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 80-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 80-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 80-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 80-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 80-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 80-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 80-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 80-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 80-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 80-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 80-120(pA10,mldab5,2), rA0
-+ movaps 80-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 80-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 80-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 80-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 80-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 80-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 80-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 80-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 80-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 80-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 80-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 80-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 80-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 80-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 24
-- movaps 96-120(pA10,mldab5,2), rA0
-- movaps 96-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 96-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 96-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 96-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 96-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 96-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 96-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 96-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 96-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 96-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 96-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 96-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 96-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 96-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 96-120(pA10,mldab5,2), rA0
-+ movaps 96-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 96-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 96-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 96-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 96-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 96-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 96-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 96-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 96-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 96-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 96-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 96-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 96-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 96-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 28
-- movaps 112-120(pA10,mldab5,2), rA0
-- movaps 112-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 112-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 112-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 112-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 112-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 112-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 112-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 112-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 112-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 112-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 112-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 112-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 112-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 112-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 112-120(pA10,mldab5,2), rA0
-+ movaps 112-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 112-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 112-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 112-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 112-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 112-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 112-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 112-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 112-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 112-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 112-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 112-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 112-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 112-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
- #ifndef SREAL
-- pref2((pfA))
-- pref2(64(pfA))
-+ pref2((pfA))
-+ pref2(64(pfA))
- #endif
-
- #if KB > 32
-- movaps 128-120(pA10,mldab5,2), rA0
-- movaps 128-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 128-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 128-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 128-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 128-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 128-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 128-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 128-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 128-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 128-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 128-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 128-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 128-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 128-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 128-120(pA10,mldab5,2), rA0
-+ movaps 128-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 128-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 128-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 128-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 128-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 128-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 128-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 128-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 128-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 128-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 128-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 128-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 128-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 128-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 36
-- movaps 144-120(pA10,mldab5,2), rA0
-- movaps 144-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 144-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 144-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 144-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 144-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 144-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 144-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 144-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 144-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 144-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 144-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 144-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 144-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 144-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 144-120(pA10,mldab5,2), rA0
-+ movaps 144-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 144-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 144-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 144-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 144-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 144-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 144-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 144-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 144-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 144-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 144-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 144-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 144-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 144-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 40
-- movaps 160-120(pA10,mldab5,2), rA0
-- movaps 160-120(pB0), rB0
-- mulps rB0, rA0
-- addq $176, pB0
-- addps rA0, rC0
-- movaps 160-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 160-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 160-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 160-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 160-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 160-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 160-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 160-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 160-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 160-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 160-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 160-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addq $176, pA10
-- addps rA0, rC12
-- mulps 160-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-- addq $176, pA5
-+ movaps 160-120(pA10,mldab5,2), rA0
-+ movaps 160-120(pB0), rB0
-+ mulps rB0, rA0
-+ addq $176, pB0
-+ addps rA0, rC0
-+ movaps 160-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 160-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 160-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 160-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 160-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 160-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 160-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 160-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 160-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 160-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 160-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 160-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addq $176, pA10
-+ addps rA0, rC12
-+ mulps 160-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
-+ addq $176, pA5
- #else
-- addq $176, pB0
-- addq $176, pA10
-- addq $176, pA5
-+ addq $176, pB0
-+ addq $176, pA10
-+ addq $176, pA5
- #endif
-
- #if KB > 44
-- movaps 0-120(pA10,mldab5,2), rA0
-- movaps 0-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 0-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 0-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 0-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 0-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 0-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 0-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 0-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 0-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 0-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 0-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 0-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 0-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 0-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 0-120(pA10,mldab5,2), rA0
-+ movaps 0-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 0-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 0-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 0-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 0-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 0-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 0-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 0-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 0-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 0-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 0-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 0-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 0-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 0-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 48
-- movaps 16-120(pA10,mldab5,2), rA0
-- movaps 16-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 16-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 16-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 16-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 16-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 16-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 16-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 16-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 16-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 16-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 16-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 16-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 16-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 16-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 16-120(pA10,mldab5,2), rA0
-+ movaps 16-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 16-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 16-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 16-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 16-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 16-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 16-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 16-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 16-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 16-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 16-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 16-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 16-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 16-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 52
-- movaps 32-120(pA10,mldab5,2), rA0
-- movaps 32-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 32-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 32-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 32-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 32-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 32-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 32-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 32-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 32-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 32-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 32-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 32-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 32-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 32-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 32-120(pA10,mldab5,2), rA0
-+ movaps 32-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 32-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 32-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 32-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 32-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 32-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 32-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 32-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 32-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 32-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 32-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 32-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 32-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 32-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 56
-- movaps 48-120(pA10,mldab5,2), rA0
-- movaps 48-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 48-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 48-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 48-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 48-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 48-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 48-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 48-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 48-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 48-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 48-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 48-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 48-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 48-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 48-120(pA10,mldab5,2), rA0
-+ movaps 48-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 48-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 48-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 48-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 48-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 48-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 48-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 48-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 48-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 48-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 48-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 48-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 48-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 48-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 60
-- movaps 64-120(pA10,mldab5,2), rA0
-- movaps 64-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 64-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 64-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 64-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 64-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 64-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 64-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 64-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 64-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 64-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 64-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 64-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 64-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 64-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 64-120(pA10,mldab5,2), rA0
-+ movaps 64-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 64-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 64-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 64-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 64-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 64-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 64-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 64-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 64-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 64-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 64-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 64-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 64-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 64-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 64
-- movaps 80-120(pA10,mldab5,2), rA0
-- movaps 80-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 80-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 80-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 80-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 80-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 80-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 80-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 80-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 80-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 80-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 80-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 80-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 80-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 80-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 80-120(pA10,mldab5,2), rA0
-+ movaps 80-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 80-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 80-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 80-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 80-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 80-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 80-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 80-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 80-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 80-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 80-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 80-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 80-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 80-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 68
-- movaps 96-120(pA10,mldab5,2), rA0
-- movaps 96-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 96-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 96-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 96-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 96-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 96-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 96-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 96-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 96-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 96-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 96-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 96-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 96-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 96-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 96-120(pA10,mldab5,2), rA0
-+ movaps 96-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 96-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 96-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 96-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 96-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 96-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 96-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 96-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 96-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 96-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 96-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 96-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 96-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 96-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 72
-- movaps 112-120(pA10,mldab5,2), rA0
-- movaps 112-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 112-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 112-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 112-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 112-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 112-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 112-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 112-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 112-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 112-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 112-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 112-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 112-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 112-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 112-120(pA10,mldab5,2), rA0
-+ movaps 112-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 112-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 112-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 112-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 112-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 112-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 112-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 112-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 112-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 112-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 112-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 112-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 112-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 112-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 76
-- movaps 128-120(pA10,mldab5,2), rA0
-- movaps 128-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 128-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 128-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 128-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 128-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 128-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 128-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 128-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 128-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 128-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 128-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 128-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 128-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 128-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 128-120(pA10,mldab5,2), rA0
-+ movaps 128-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 128-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 128-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 128-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 128-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 128-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 128-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 128-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 128-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 128-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 128-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 128-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 128-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 128-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 80
-- movaps 144-120(pA10,mldab5,2), rA0
-- movaps 144-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 144-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 144-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 144-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 144-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 144-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 144-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 144-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 144-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 144-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 144-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 144-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 144-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 144-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 144-120(pA10,mldab5,2), rA0
-+ movaps 144-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 144-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 144-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 144-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 144-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 144-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 144-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 144-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 144-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 144-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 144-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 144-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 144-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 144-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- /*UKLOOP */
-@@ -1220,234 +1227,234 @@ UMLOOP:
- * Get these bastard things summed up correctly
- */
-
-- /* rC0 = c0a c0b c0c c0d */
-- /* rC1 = c1a c1b c1c c1d */
-- /* rC2 = c2a c2b c2c c2d */
-- /* rC3 = c3a c3b c3c c3d */
-+ /* rC0 = c0a c0b c0c c0d */
-+ /* rC1 = c1a c1b c1c c1d */
-+ /* rC2 = c2a c2b c2c c2d */
-+ /* rC3 = c3a c3b c3c c3d */
- /* */
-- movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */
-- prefC((pC))
-- prefC(64(pC))
-- movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */
-- unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */
-- unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */
-- unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */
-- movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */
-- unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */
-- movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */
-- movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */
-- movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */
-- addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */
-- movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */
-- movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */
-- movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */
-- addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */
-- movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */
-- addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */
--
--
-- /* rC4 = c4a c4b c4c c4d */
-- /* rC5 = c5a c5b c5c c5d */
-- /* rC6 = c6a c6b c6c c6d */
-- /* rC7 = c7a c7b c7c c7d */
-- /* rC8 = c08a c08b c08c c08d */
-- /* rC9 = c09a c09b c09c c09d */
-- /* rC10 = c10a c10b c10c c10d */
-- /* rC11 = c11a c11b c11c c11d */
-- /* rC12 = c12a c12b c12c c12d */
-- /* rC13 = c13a c13b c13c c13d */
-+ movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */
-+ prefC((pC))
-+ prefC(64(pC))
-+ movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */
-+ unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */
-+ unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */
-+ unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */
-+ movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */
-+ unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */
-+ movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */
-+ movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */
-+ movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */
-+ addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */
-+ movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */
-+ movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */
-+ movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */
-+ addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */
-+ movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */
-+ addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */
-+
-+
-+ /* rC4 = c4a c4b c4c c4d */
-+ /* rC5 = c5a c5b c5c c5d */
-+ /* rC6 = c6a c6b c6c c6d */
-+ /* rC7 = c7a c7b c7c c7d */
-+ /* rC8 = c08a c08b c08c c08d */
-+ /* rC9 = c09a c09b c09c c09d */
-+ /* rC10 = c10a c10b c10c c10d */
-+ /* rC11 = c11a c11b c11c c11d */
-+ /* rC12 = c12a c12b c12c c12d */
-+ /* rC13 = c13a c13b c13c c13d */
- /* */
-- movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */
-- prefC(128(pC))
-+ movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */
-+ prefC(128(pC))
- #ifdef SREAL
-- pref2((pfA))
-+ pref2((pfA))
- #else
-- prefC(192(pC))
-+ prefC(192(pC))
- #endif
-- movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */
-- movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */
-- unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */
-- unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */
-- unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */
-- unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */
-- unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */
-- movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */
-- unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */
-- movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */
-- movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */
-- unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */
-- movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */
-- movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */
-- addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */
-+ movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */
-+ movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */
-+ unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */
-+ unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */
-+ unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */
-+ unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */
-+ unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */
-+ movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */
-+ unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */
-+ movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */
-+ movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */
-+ unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */
-+ movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */
-+ movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */
-+ addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */
- #ifdef BETAX
- #ifdef SREAL
-- movups (pC), rA0
-- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
-- movups 16(pC), rC4
-- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
-- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
-- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
-- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
-- movups 32(pC), rC5
-- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
-- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
-- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
-- movlps 48(pC), rC1
-- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
-- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
-- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
-- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
-- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
-- pref2(64(pfA))
-- mulps BOF(%rsp), rA0
-- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
-- mulps BOF(%rsp), rC4
-- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
-- mulps BOF(%rsp), rC5
-- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
-- mulps BOF(%rsp), rC1
-+ movups (pC), rA0
-+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
-+ movups 16(pC), rC4
-+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
-+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
-+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
-+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
-+ movups 32(pC), rC5
-+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
-+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
-+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
-+ movlps 48(pC), rC1
-+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
-+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
-+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
-+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
-+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
-+ pref2(64(pfA))
-+ mulps BOF(%rsp), rA0
-+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
-+ mulps BOF(%rsp), rC4
-+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
-+ mulps BOF(%rsp), rC5
-+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
-+ mulps BOF(%rsp), rC1
-
- /* */
-
-- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
-- addps rA0, rC3
-- addq $68, pfA
-- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
-- addps rC4, rC7
-- addps rC5, rC11
-- addps rC1, rC12
-+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
-+ addps rA0, rC3
-+ addq $68, pfA
-+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
-+ addps rC4, rC7
-+ addps rC5, rC11
-+ addps rC1, rC12
- #else /* BETA = X, complex type */
-- movups (pC), rA0
-- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
-- movups 16(pC), rC4
-- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
-- shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */
-- movups 32(pC), rC4 /* rC4 = c4 X c5 X */
-- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
-- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
-- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
-- movups 48(pC), rC5 /* rC5 = c6 X c7 X */
-- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
-- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
-- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
-- shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */
-- movups 64(pC), rC5 /* rC5 = c8 X c9 X */
-- movups 80(pC), rC1 /* rC1 = c10 X c11 X */
-- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
-- shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */
-- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
-- movss 96(pC), rC1
-- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
-- movss 104(pC), rB0
-- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
-- unpcklps rB0, rC1
-- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
-- prefC(256(pC))
-- mulps BOF(%rsp), rA0
-- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
-- mulps BOF(%rsp), rC4
-- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
-- mulps BOF(%rsp), rC5
-- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
-- mulps BOF(%rsp), rC1
-+ movups (pC), rA0
-+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
-+ movups 16(pC), rC4
-+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
-+ shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */
-+ movups 32(pC), rC4 /* rC4 = c4 X c5 X */
-+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
-+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
-+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
-+ movups 48(pC), rC5 /* rC5 = c6 X c7 X */
-+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
-+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
-+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
-+ shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */
-+ movups 64(pC), rC5 /* rC5 = c8 X c9 X */
-+ movups 80(pC), rC1 /* rC1 = c10 X c11 X */
-+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
-+ shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */
-+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
-+ movss 96(pC), rC1
-+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
-+ movss 104(pC), rB0
-+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
-+ unpcklps rB0, rC1
-+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
-+ prefC(256(pC))
-+ mulps BOF(%rsp), rA0
-+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
-+ mulps BOF(%rsp), rC4
-+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
-+ mulps BOF(%rsp), rC5
-+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
-+ mulps BOF(%rsp), rC1
-
- /* */
-
-- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
-- addps rA0, rC3
-- prefC(192(pC))
-- addq $68, pfA
-- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
-- addps rC4, rC7
-- addps rC5, rC11
-- addps rC1, rC12
-+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
-+ addps rA0, rC3
-+ prefC(192(pC))
-+ addq $68, pfA
-+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
-+ addps rC4, rC7
-+ addps rC5, rC11
-+ addps rC1, rC12
- #endif
-
- #else
-- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
-- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
-- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
-- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
-- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
-- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
-- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
-- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
-- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
-- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
-- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
-- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
-- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
-+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
-+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
-+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
-+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
-+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
-+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
-+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
-+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
-+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
-+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
-+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
-+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
-+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
- #ifdef SREAL
-- pref2(64(pfA))
-+ pref2(64(pfA))
- #else
-- prefC(256(pC))
-+ prefC(256(pC))
- #endif
-- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
-- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
-- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
-+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
-+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
-+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
-
- /* */
-
-- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
-+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
- #ifndef SREAL
-- prefC(192(pC))
-+ prefC(192(pC))
- #endif
-- addq $68, pfA
-- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
-+ addq $68, pfA
-+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
-
- #endif
- /*
- * Write results back to C; pC += 14;
- */
- #ifdef SREAL
-- movups rC3, (pC)
-- movups rC7, 16(pC)
-- movups rC11, 32(pC)
-- movlps rC12, 48(pC)
-- addq $56, pC
-+ movups rC3, (pC)
-+ movups rC7, 16(pC)
-+ movups rC11, 32(pC)
-+ movlps rC12, 48(pC)
-+ addq $56, pC
- #else
-- movss rC3, (pC)
-- movss rC7, 32(pC)
-- movhlps rC3, rC0
-- movhlps rC7, rC6
-- movss rC0, 16(pC)
-- movss rC6, 48(pC)
-- shufps $0x55, rC3, rC3
-- shufps $0x55, rC7, rC7
-- movss rC3, 8(pC)
-- movss rC7, 40(pC)
-- shufps $0x55, rC0, rC0
-- shufps $0x55, rC6, rC6
-- movss rC0, 24(pC)
-- movss rC6, 56(pC)
--
-- movss rC11, 64(pC)
-- movhlps rC11, rC2
-- movss rC12, 96(pC)
-- movss rC2, 80(pC)
-- shufps $0x55, rC11, rC11
-- shufps $0x55, rC12, rC12
-- movss rC11, 72(pC)
-- shufps $0x55, rC2, rC2
-- movss rC12, 104(pC)
-- movss rC2, 88(pC)
-+ movss rC3, (pC)
-+ movss rC7, 32(pC)
-+ movhlps rC3, rC0
-+ movhlps rC7, rC6
-+ movss rC0, 16(pC)
-+ movss rC6, 48(pC)
-+ shufps $0x55, rC3, rC3
-+ shufps $0x55, rC7, rC7
-+ movss rC3, 8(pC)
-+ movss rC7, 40(pC)
-+ shufps $0x55, rC0, rC0
-+ shufps $0x55, rC6, rC6
-+ movss rC0, 24(pC)
-+ movss rC6, 56(pC)
-+
-+ movss rC11, 64(pC)
-+ movhlps rC11, rC2
-+ movss rC12, 96(pC)
-+ movss rC2, 80(pC)
-+ shufps $0x55, rC11, rC11
-+ shufps $0x55, rC12, rC12
-+ movss rC11, 72(pC)
-+ shufps $0x55, rC2, rC2
-+ movss rC12, 104(pC)
-+ movss rC2, 88(pC)
-
-- addq $112, pC
-+ addq $112, pC
- #endif
- /*
- * Write results back to C
- */
-- addq $NB14so-176, pA5
-- addq $NB14so-176, pA10
-- subq $176, pB0
-+ addq $NB14so-176, pA5
-+ addq $NB14so-176, pA10
-+ subq $176, pB0
- /*
- * pC += 14; pA += 14*NB; pB -= NB;
- */
- /*
- * while (pA != stM);
- */
-- subq $1, stM
-- jne UMLOOP
-+ subq $1, stM
-+ jne UMLOOP
- #endif
-
- /*
-@@ -1459,994 +1466,994 @@ MLAST:
- #endif
- /*UKLOOP: */
- #ifdef BETA1
-- movaps 0-120(pA10,mldab5,2), rC0
-- movaps 0-120(pB0), rB0
-- mulps rB0, rC0
-- addss (pC), rC0
-- movaps 0-120(pA5, mldab,4), rC1
-- mulps rB0, rC1
-- addss CMUL(4)(pC), rC1
-- movaps 0-120(pA10, mldab,8), rC2
-- mulps rB0, rC2
-- addss CMUL(8)(pC), rC2
-- movaps 0-120(pA5, mldab,2), rC3
-- mulps rB0, rC3
-- addss CMUL(12)(pC), rC3
-- movaps 0-120(pA5, mldab), rC4
-- mulps rB0, rC4
-- addss CMUL(16)(pC), rC4
-- movaps 0-120(pA5), rC5
-- mulps rB0, rC5
-- addss CMUL(20)(pC), rC5
-- movaps 0-120(pA5, ldab), rC6
-- mulps rB0, rC6
-- addss CMUL(24)(pC), rC6
-- movaps 0-120(pA5, ldab,2), rC7
-- mulps rB0, rC7
-- addss CMUL(28)(pC), rC7
-- movaps 0-120(pA10, mldab,2), rC8
-- mulps rB0, rC8
-- addss CMUL(32)(pC), rC8
-- movaps 0-120(pA5,ldab,4), rC9
-- mulps rB0, rC9
-- addss CMUL(36)(pC), rC9
-- movaps 0-120(pA10), rC10
-- mulps rB0, rC10
-- addss CMUL(40)(pC), rC10
-- movaps 0-120(pA10,ldab), rC11
-- mulps rB0, rC11
-- addss CMUL(44)(pC), rC11
-- movaps 0-120(pA10,ldab,2), rC12
-- mulps rB0, rC12
-- addss CMUL(48)(pC), rC12
-- movaps 0-120(pA5,ldab,8), rC13
-- mulps rB0, rC13
-- addss CMUL(52)(pC), rC13
-+ movaps 0-120(pA10,mldab5,2), rC0
-+ movaps 0-120(pB0), rB0
-+ mulps rB0, rC0
-+ addss (pC), rC0
-+ movaps 0-120(pA5, mldab,4), rC1
-+ mulps rB0, rC1
-+ addss CMUL(4)(pC), rC1
-+ movaps 0-120(pA10, mldab,8), rC2
-+ mulps rB0, rC2
-+ addss CMUL(8)(pC), rC2
-+ movaps 0-120(pA5, mldab,2), rC3
-+ mulps rB0, rC3
-+ addss CMUL(12)(pC), rC3
-+ movaps 0-120(pA5, mldab), rC4
-+ mulps rB0, rC4
-+ addss CMUL(16)(pC), rC4
-+ movaps 0-120(pA5), rC5
-+ mulps rB0, rC5
-+ addss CMUL(20)(pC), rC5
-+ movaps 0-120(pA5, ldab), rC6
-+ mulps rB0, rC6
-+ addss CMUL(24)(pC), rC6
-+ movaps 0-120(pA5, ldab,2), rC7
-+ mulps rB0, rC7
-+ addss CMUL(28)(pC), rC7
-+ movaps 0-120(pA10, mldab,2), rC8
-+ mulps rB0, rC8
-+ addss CMUL(32)(pC), rC8
-+ movaps 0-120(pA5,ldab,4), rC9
-+ mulps rB0, rC9
-+ addss CMUL(36)(pC), rC9
-+ movaps 0-120(pA10), rC10
-+ mulps rB0, rC10
-+ addss CMUL(40)(pC), rC10
-+ movaps 0-120(pA10,ldab), rC11
-+ mulps rB0, rC11
-+ addss CMUL(44)(pC), rC11
-+ movaps 0-120(pA10,ldab,2), rC12
-+ mulps rB0, rC12
-+ addss CMUL(48)(pC), rC12
-+ movaps 0-120(pA5,ldab,8), rC13
-+ mulps rB0, rC13
-+ addss CMUL(52)(pC), rC13
- #else
-- movaps 0-120(pA10,mldab5,2), rC0
-- movaps 0-120(pB0), rC13
-- mulps rC13, rC0
-- movaps 0-120(pA5, mldab,4), rC1
-- mulps rC13, rC1
-- movaps 0-120(pA10, mldab,8), rC2
-- mulps rC13, rC2
-- movaps 0-120(pA5, mldab,2), rC3
-- mulps rC13, rC3
-- movaps 0-120(pA5, mldab), rC4
-- mulps rC13, rC4
-- movaps 0-120(pA5), rC5
-- mulps rC13, rC5
-- movaps 0-120(pA5, ldab), rC6
-- mulps rC13, rC6
-- movaps 0-120(pA5, ldab,2), rC7
-- mulps rC13, rC7
-- movaps 0-120(pA10, mldab,2), rC8
-- mulps rC13, rC8
-- movaps 0-120(pA5,ldab,4), rC9
-- mulps rC13, rC9
-- movaps 0-120(pA10), rC10
-- mulps rC13, rC10
-- movaps 0-120(pA10,ldab), rC11
-- mulps rC13, rC11
-- movaps 0-120(pA10,ldab,2), rC12
-- mulps rC13, rC12
-- mulps 0-120(pA5,ldab,8), rC13
-+ movaps 0-120(pA10,mldab5,2), rC0
-+ movaps 0-120(pB0), rC13
-+ mulps rC13, rC0
-+ movaps 0-120(pA5, mldab,4), rC1
-+ mulps rC13, rC1
-+ movaps 0-120(pA10, mldab,8), rC2
-+ mulps rC13, rC2
-+ movaps 0-120(pA5, mldab,2), rC3
-+ mulps rC13, rC3
-+ movaps 0-120(pA5, mldab), rC4
-+ mulps rC13, rC4
-+ movaps 0-120(pA5), rC5
-+ mulps rC13, rC5
-+ movaps 0-120(pA5, ldab), rC6
-+ mulps rC13, rC6
-+ movaps 0-120(pA5, ldab,2), rC7
-+ mulps rC13, rC7
-+ movaps 0-120(pA10, mldab,2), rC8
-+ mulps rC13, rC8
-+ movaps 0-120(pA5,ldab,4), rC9
-+ mulps rC13, rC9
-+ movaps 0-120(pA10), rC10
-+ mulps rC13, rC10
-+ movaps 0-120(pA10,ldab), rC11
-+ mulps rC13, rC11
-+ movaps 0-120(pA10,ldab,2), rC12
-+ mulps rC13, rC12
-+ mulps 0-120(pA5,ldab,8), rC13
- #endif
-
- #if KB > 4
-- movaps 16-120(pA10,mldab5,2), rA0
-- movaps 16-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 16-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 16-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 16-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 16-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 16-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 16-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 16-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 16-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 16-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 16-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 16-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 16-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 16-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 16-120(pA10,mldab5,2), rA0
-+ movaps 16-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 16-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 16-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 16-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 16-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 16-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 16-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 16-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 16-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 16-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 16-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 16-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 16-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 16-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 8
-- movaps 32-120(pA10,mldab5,2), rA0
-- movaps 32-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 32-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 32-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 32-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 32-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 32-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 32-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 32-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 32-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 32-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 32-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 32-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 32-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 32-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 32-120(pA10,mldab5,2), rA0
-+ movaps 32-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 32-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 32-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 32-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 32-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 32-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 32-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 32-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 32-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 32-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 32-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 32-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 32-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 32-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 12
-- movaps 48-120(pA10,mldab5,2), rA0
-- movaps 48-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 48-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 48-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 48-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 48-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 48-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 48-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 48-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 48-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 48-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 48-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 48-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 48-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 48-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 48-120(pA10,mldab5,2), rA0
-+ movaps 48-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 48-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 48-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 48-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 48-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 48-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 48-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 48-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 48-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 48-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 48-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 48-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 48-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 48-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 16
-- movaps 64-120(pA10,mldab5,2), rA0
-- movaps 64-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 64-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 64-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 64-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 64-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 64-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 64-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 64-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 64-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 64-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 64-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 64-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 64-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 64-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 64-120(pA10,mldab5,2), rA0
-+ movaps 64-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 64-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 64-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 64-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 64-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 64-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 64-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 64-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 64-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 64-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 64-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 64-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 64-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 64-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 20
-- movaps 80-120(pA10,mldab5,2), rA0
-- movaps 80-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 80-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 80-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 80-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 80-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 80-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 80-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 80-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 80-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 80-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 80-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 80-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 80-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 80-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 80-120(pA10,mldab5,2), rA0
-+ movaps 80-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 80-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 80-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 80-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 80-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 80-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 80-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 80-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 80-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 80-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 80-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 80-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 80-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 80-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 24
-- movaps 96-120(pA10,mldab5,2), rA0
-- movaps 96-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 96-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 96-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 96-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 96-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 96-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 96-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 96-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 96-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 96-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 96-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 96-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 96-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 96-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 96-120(pA10,mldab5,2), rA0
-+ movaps 96-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 96-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 96-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 96-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 96-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 96-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 96-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 96-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 96-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 96-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 96-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 96-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 96-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 96-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 28
-- movaps 112-120(pA10,mldab5,2), rA0
-- movaps 112-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 112-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 112-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 112-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 112-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 112-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 112-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 112-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 112-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 112-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 112-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 112-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 112-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 112-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 112-120(pA10,mldab5,2), rA0
-+ movaps 112-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 112-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 112-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 112-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 112-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 112-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 112-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 112-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 112-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 112-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 112-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 112-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 112-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 112-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 32
-- movaps 128-120(pA10,mldab5,2), rA0
-- movaps 128-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 128-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 128-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 128-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 128-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 128-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 128-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 128-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 128-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 128-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 128-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 128-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 128-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 128-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 128-120(pA10,mldab5,2), rA0
-+ movaps 128-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 128-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 128-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 128-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 128-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 128-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 128-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 128-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 128-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 128-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 128-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 128-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 128-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 128-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 36
-- movaps 144-120(pA10,mldab5,2), rA0
-- movaps 144-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 144-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 144-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 144-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 144-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 144-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 144-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 144-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 144-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 144-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 144-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 144-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 144-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 144-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 144-120(pA10,mldab5,2), rA0
-+ movaps 144-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 144-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 144-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 144-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 144-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 144-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 144-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 144-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 144-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 144-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 144-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 144-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 144-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 144-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-- prefB((pB,ldab))
-- prefB(64(pB,ldab))
-+ prefB((pB,ldab))
-+ prefB(64(pB,ldab))
-
- #if KB > 40
-- movaps 160-120(pA10,mldab5,2), rA0
-- movaps 160-120(pB0), rB0
-- mulps rB0, rA0
-- addq $176, pB0
-- addps rA0, rC0
-- movaps 160-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 160-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 160-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 160-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 160-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 160-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 160-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 160-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 160-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 160-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 160-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 160-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addq $176, pA10
-- addps rA0, rC12
-- mulps 160-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-- addq $176, pA5
-+ movaps 160-120(pA10,mldab5,2), rA0
-+ movaps 160-120(pB0), rB0
-+ mulps rB0, rA0
-+ addq $176, pB0
-+ addps rA0, rC0
-+ movaps 160-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 160-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 160-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 160-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 160-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 160-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 160-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 160-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 160-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 160-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 160-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 160-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addq $176, pA10
-+ addps rA0, rC12
-+ mulps 160-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
-+ addq $176, pA5
- #else
-- addq $176, pB0
-- addq $176, pA10
-- addq $176, pA5
-+ addq $176, pB0
-+ addq $176, pA10
-+ addq $176, pA5
- #endif
-
- #if KB > 44
-- movaps 0-120(pA10,mldab5,2), rA0
-- movaps 0-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 0-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 0-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 0-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 0-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 0-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 0-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 0-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 0-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 0-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 0-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 0-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 0-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 0-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 0-120(pA10,mldab5,2), rA0
-+ movaps 0-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 0-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 0-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 0-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 0-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 0-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 0-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 0-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 0-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 0-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 0-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 0-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 0-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 0-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 48
-- movaps 16-120(pA10,mldab5,2), rA0
-- movaps 16-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 16-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 16-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 16-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 16-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 16-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 16-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 16-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 16-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 16-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 16-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 16-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 16-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 16-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 16-120(pA10,mldab5,2), rA0
-+ movaps 16-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 16-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 16-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 16-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 16-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 16-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 16-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 16-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 16-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 16-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 16-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 16-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 16-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 16-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 52
-- movaps 32-120(pA10,mldab5,2), rA0
-- movaps 32-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 32-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 32-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 32-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 32-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 32-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 32-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 32-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 32-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 32-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 32-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 32-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 32-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 32-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 32-120(pA10,mldab5,2), rA0
-+ movaps 32-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 32-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 32-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 32-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 32-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 32-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 32-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 32-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 32-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 32-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 32-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 32-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 32-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 32-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 56
-- movaps 48-120(pA10,mldab5,2), rA0
-- movaps 48-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 48-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 48-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 48-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 48-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 48-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 48-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 48-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 48-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 48-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 48-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 48-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 48-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 48-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 48-120(pA10,mldab5,2), rA0
-+ movaps 48-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 48-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 48-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 48-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 48-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 48-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 48-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 48-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 48-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 48-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 48-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 48-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 48-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 48-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 60
-- movaps 64-120(pA10,mldab5,2), rA0
-- movaps 64-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 64-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 64-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 64-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 64-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 64-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 64-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 64-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 64-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 64-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 64-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 64-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 64-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 64-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 64-120(pA10,mldab5,2), rA0
-+ movaps 64-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 64-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 64-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 64-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 64-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 64-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 64-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 64-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 64-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 64-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 64-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 64-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 64-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 64-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-- prefB(128-176(pB,ldab))
-- prefB(192-176(pB,ldab))
-+ prefB(128-176(pB,ldab))
-+ prefB(192-176(pB,ldab))
-
- #if KB > 64
-- movaps 80-120(pA10,mldab5,2), rA0
-- movaps 80-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 80-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 80-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 80-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 80-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 80-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 80-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 80-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 80-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 80-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 80-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 80-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 80-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 80-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 80-120(pA10,mldab5,2), rA0
-+ movaps 80-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 80-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 80-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 80-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 80-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 80-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 80-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 80-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 80-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 80-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 80-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 80-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 80-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 80-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 68
-- movaps 96-120(pA10,mldab5,2), rA0
-- movaps 96-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 96-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 96-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 96-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 96-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 96-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 96-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 96-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 96-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 96-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 96-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 96-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 96-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 96-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 96-120(pA10,mldab5,2), rA0
-+ movaps 96-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 96-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 96-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 96-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 96-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 96-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 96-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 96-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 96-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 96-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 96-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 96-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 96-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 96-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 72
-- movaps 112-120(pA10,mldab5,2), rA0
-- movaps 112-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 112-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 112-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 112-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 112-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 112-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 112-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 112-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 112-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 112-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 112-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 112-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 112-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 112-120(pA5,ldab,8), rB0
-- prefC((pC))
-- prefC((pC,incCn))
-- addps rB0, rC13
-+ movaps 112-120(pA10,mldab5,2), rA0
-+ movaps 112-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 112-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 112-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 112-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 112-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 112-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 112-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 112-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 112-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 112-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 112-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 112-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 112-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 112-120(pA5,ldab,8), rB0
-+ prefC((pC))
-+ prefC((pC,incCn))
-+ addps rB0, rC13
- #else
-- prefC((pC))
-- prefC((pC,incCn))
-+ prefC((pC))
-+ prefC((pC,incCn))
- #endif
-
- #if KB > 76
-- movaps 128-120(pA10,mldab5,2), rA0
-- movaps 128-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 128-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 128-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 128-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 128-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 128-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 128-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 128-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 128-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 128-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 128-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 128-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 128-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 128-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 128-120(pA10,mldab5,2), rA0
-+ movaps 128-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 128-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 128-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 128-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 128-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 128-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 128-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 128-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 128-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 128-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 128-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 128-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 128-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 128-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- #if KB > 80
-- movaps 144-120(pA10,mldab5,2), rA0
-- movaps 144-120(pB0), rB0
-- mulps rB0, rA0
-- addps rA0, rC0
-- movaps 144-120(pA5, mldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC1
-- movaps 144-120(pA10, mldab,8), rA0
-- mulps rB0, rA0
-- addps rA0, rC2
-- movaps 144-120(pA5, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC3
-- movaps 144-120(pA5, mldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC4
-- movaps 144-120(pA5), rA0
-- mulps rB0, rA0
-- addps rA0, rC5
-- movaps 144-120(pA5, ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC6
-- movaps 144-120(pA5, ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC7
-- movaps 144-120(pA10, mldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC8
-- movaps 144-120(pA5,ldab,4), rA0
-- mulps rB0, rA0
-- addps rA0, rC9
-- movaps 144-120(pA10), rA0
-- mulps rB0, rA0
-- addps rA0, rC10
-- movaps 144-120(pA10,ldab), rA0
-- mulps rB0, rA0
-- addps rA0, rC11
-- movaps 144-120(pA10,ldab,2), rA0
-- mulps rB0, rA0
-- addps rA0, rC12
-- mulps 144-120(pA5,ldab,8), rB0
-- addps rB0, rC13
-+ movaps 144-120(pA10,mldab5,2), rA0
-+ movaps 144-120(pB0), rB0
-+ mulps rB0, rA0
-+ addps rA0, rC0
-+ movaps 144-120(pA5, mldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC1
-+ movaps 144-120(pA10, mldab,8), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC2
-+ movaps 144-120(pA5, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC3
-+ movaps 144-120(pA5, mldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC4
-+ movaps 144-120(pA5), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC5
-+ movaps 144-120(pA5, ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC6
-+ movaps 144-120(pA5, ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC7
-+ movaps 144-120(pA10, mldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC8
-+ movaps 144-120(pA5,ldab,4), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC9
-+ movaps 144-120(pA10), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC10
-+ movaps 144-120(pA10,ldab), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC11
-+ movaps 144-120(pA10,ldab,2), rA0
-+ mulps rB0, rA0
-+ addps rA0, rC12
-+ mulps 144-120(pA5,ldab,8), rB0
-+ addps rB0, rC13
- #endif
-
- /*UKLOOP */
-@@ -2454,202 +2461,202 @@ MLAST:
- * Get these bastard things summed up correctly
- */
-
-- /* rC0 = c0a c0b c0c c0d */
-- /* rC1 = c1a c1b c1c c1d */
-- /* rC2 = c2a c2b c2c c2d */
-- /* rC3 = c3a c3b c3c c3d */
-+ /* rC0 = c0a c0b c0c c0d */
-+ /* rC1 = c1a c1b c1c c1d */
-+ /* rC2 = c2a c2b c2c c2d */
-+ /* rC3 = c3a c3b c3c c3d */
- /* */
-- movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */
-- prefC(64(pC,incCn))
-- prefB(256-176(pB,ldab))
-- movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */
-- unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */
-- unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */
-- unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */
-- movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */
-- unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */
-- movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */
-- movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */
-- movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */
-- addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */
-- movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */
-- movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */
-- movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */
-- addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */
-- movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */
-- addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */
--
--
-- /* rC4 = c4a c4b c4c c4d */
-- /* rC5 = c5a c5b c5c c5d */
-- /* rC6 = c6a c6b c6c c6d */
-- /* rC7 = c7a c7b c7c c7d */
-- /* rC8 = c08a c08b c08c c08d */
-- /* rC9 = c09a c09b c09c c09d */
-- /* rC10 = c10a c10b c10c c10d */
-- /* rC11 = c11a c11b c11c c11d */
-- /* rC12 = c12a c12b c12c c12d */
-- /* rC13 = c13a c13b c13c c13d */
-+ movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */
-+ prefC(64(pC,incCn))
-+ prefB(256-176(pB,ldab))
-+ movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */
-+ unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */
-+ unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */
-+ unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */
-+ movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */
-+ unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */
-+ movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */
-+ movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */
-+ movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */
-+ addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */
-+ movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */
-+ movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */
-+ movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */
-+ addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */
-+ movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */
-+ addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */
-+
-+
-+ /* rC4 = c4a c4b c4c c4d */
-+ /* rC5 = c5a c5b c5c c5d */
-+ /* rC6 = c6a c6b c6c c6d */
-+ /* rC7 = c7a c7b c7c c7d */
-+ /* rC8 = c08a c08b c08c c08d */
-+ /* rC9 = c09a c09b c09c c09d */
-+ /* rC10 = c10a c10b c10c c10d */
-+ /* rC11 = c11a c11b c11c c11d */
-+ /* rC12 = c12a c12b c12c c12d */
-+ /* rC13 = c13a c13b c13c c13d */
- /* */
-- movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */
-- movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */
-- movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */
-- unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */
-- unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */
-- unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */
-- unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */
-- unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */
-- movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */
-- unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */
-- movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */
-- movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */
-- unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */
-- movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */
-- movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */
-- addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */
-+ movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */
-+ movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */
-+ movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */
-+ unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */
-+ unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */
-+ unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */
-+ unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */
-+ unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */
-+ movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */
-+ unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */
-+ movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */
-+ movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */
-+ unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */
-+ movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */
-+ movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */
-+ addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */
- #ifdef BETAX
- #ifdef SREAL
-- movups (pC), rA0
-- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
-- movups 16(pC), rC4
-- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
-- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
-- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
-- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
-- movups 32(pC), rC5
-- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
-- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
-- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
-- movlps 48(pC), rC1
-- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
-- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
-- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
-- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
-- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
-- mulps BOF(%rsp), rA0
-- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
-- mulps BOF(%rsp), rC4
-- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
-- mulps BOF(%rsp), rC5
-- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
-- mulps BOF(%rsp), rC1
-+ movups (pC), rA0
-+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
-+ movups 16(pC), rC4
-+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
-+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
-+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
-+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
-+ movups 32(pC), rC5
-+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
-+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
-+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
-+ movlps 48(pC), rC1
-+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
-+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
-+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
-+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
-+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
-+ mulps BOF(%rsp), rA0
-+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
-+ mulps BOF(%rsp), rC4
-+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
-+ mulps BOF(%rsp), rC5
-+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
-+ mulps BOF(%rsp), rC1
-
- /* */
-
-- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
-- addps rA0, rC3
-- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
-- addps rC4, rC7
-- addps rC5, rC11
-- prefB(320-176(pB,ldab))
-- addps rC1, rC12
-+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
-+ addps rA0, rC3
-+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
-+ addps rC4, rC7
-+ addps rC5, rC11
-+ prefB(320-176(pB,ldab))
-+ addps rC1, rC12
- #else /* BETA = X, complex type */
-- movups (pC), rA0
-- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
-- movups 16(pC), rC4
-- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
-- shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */
-- movups 32(pC), rC4 /* rC4 = c4 X c5 X */
-- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
-- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
-- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
-- movups 48(pC), rC5 /* rC5 = c6 X c7 X */
-- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
-- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
-- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
-- shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */
-- movups 64(pC), rC5 /* rC5 = c8 X c9 X */
-- movups 80(pC), rC1 /* rC1 = c10 X c11 X */
-- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
-- shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */
-- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
-- movss 96(pC), rC1
-- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
-- movss 104(pC), rB0
-- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
-- unpcklps rB0, rC1
-- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
-- mulps BOF(%rsp), rA0
-- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
-- mulps BOF(%rsp), rC4
-- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
-- mulps BOF(%rsp), rC5
-- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
-- mulps BOF(%rsp), rC1
-+ movups (pC), rA0
-+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
-+ movups 16(pC), rC4
-+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
-+ shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */
-+ movups 32(pC), rC4 /* rC4 = c4 X c5 X */
-+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
-+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
-+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
-+ movups 48(pC), rC5 /* rC5 = c6 X c7 X */
-+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
-+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
-+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
-+ shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */
-+ movups 64(pC), rC5 /* rC5 = c8 X c9 X */
-+ movups 80(pC), rC1 /* rC1 = c10 X c11 X */
-+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
-+ shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */
-+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
-+ movss 96(pC), rC1
-+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
-+ movss 104(pC), rB0
-+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
-+ unpcklps rB0, rC1
-+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
-+ mulps BOF(%rsp), rA0
-+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
-+ mulps BOF(%rsp), rC4
-+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
-+ mulps BOF(%rsp), rC5
-+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
-+ mulps BOF(%rsp), rC1
-
- /* */
-
-- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
-- addps rA0, rC3
-- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
-- addps rC4, rC7
-- addps rC5, rC11
-- prefB(320-176(pB,ldab))
-- addps rC1, rC12
-+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
-+ addps rA0, rC3
-+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
-+ addps rC4, rC7
-+ addps rC5, rC11
-+ prefB(320-176(pB,ldab))
-+ addps rC1, rC12
- #endif
-
- #else
-- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
-- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
-- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
-- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
-- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
-- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
-- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
-- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
-- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
-- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
-- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
-- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
-- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
-- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
-- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
-- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
-+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */
-+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */
-+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */
-+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */
-+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */
-+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */
-+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */
-+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */
-+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */
-+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */
-+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */
-+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */
-+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */
-+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */
-+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */
-+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */
-
- /* */
-
-- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
-- prefB(320-176(pB,ldab))
-- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
-+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */
-+ prefB(320-176(pB,ldab))
-+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */
-
- #endif
- /*
- * Write results back to C; pC += 14;
- */
- #ifdef SREAL
-- movups rC3, (pC)
-- movups rC7, 16(pC)
-- movups rC11, 32(pC)
-- movlps rC12, 48(pC)
--/* addq $56, pC */
-+ movups rC3, (pC)
-+ movups rC7, 16(pC)
-+ movups rC11, 32(pC)
-+ movlps rC12, 48(pC)
-+/* addq $56, pC */
- #else
-- movss rC3, (pC)
-- movss rC7, 32(pC)
-- movhlps rC3, rC0
-- movhlps rC7, rC6
-- movss rC0, 16(pC)
-- movss rC6, 48(pC)
-- shufps $0x55, rC3, rC3
-- shufps $0x55, rC7, rC7
-- movss rC3, 8(pC)
-- movss rC7, 40(pC)
-- shufps $0x55, rC0, rC0
-- shufps $0x55, rC6, rC6
-- movss rC0, 24(pC)
-- movss rC6, 56(pC)
--
-- movss rC11, 64(pC)
-- movhlps rC11, rC2
-- movss rC12, 96(pC)
-- movss rC2, 80(pC)
-- shufps $0x55, rC11, rC11
-- shufps $0x55, rC12, rC12
-- movss rC11, 72(pC)
-- shufps $0x55, rC2, rC2
-- movss rC12, 104(pC)
-- movss rC2, 88(pC)
-+ movss rC3, (pC)
-+ movss rC7, 32(pC)
-+ movhlps rC3, rC0
-+ movhlps rC7, rC6
-+ movss rC0, 16(pC)
-+ movss rC6, 48(pC)
-+ shufps $0x55, rC3, rC3
-+ shufps $0x55, rC7, rC7
-+ movss rC3, 8(pC)
-+ movss rC7, 40(pC)
-+ shufps $0x55, rC0, rC0
-+ shufps $0x55, rC6, rC6
-+ movss rC0, 24(pC)
-+ movss rC6, 56(pC)
-+
-+ movss rC11, 64(pC)
-+ movhlps rC11, rC2
-+ movss rC12, 96(pC)
-+ movss rC2, 80(pC)
-+ shufps $0x55, rC11, rC11
-+ shufps $0x55, rC12, rC12
-+ movss rC11, 72(pC)
-+ shufps $0x55, rC2, rC2
-+ movss rC12, 104(pC)
-+ movss rC2, 88(pC)
-
--/* addq $112, pC */
-+/* addq $112, pC */
- #endif
- /*
- * Write results back to C
-@@ -2660,55 +2667,55 @@ MLAST:
- /*
- * while (pA != stM);
- */
--/* subq $1, stM */
--/* jne UMLOOP */
-+/* subq $1, stM */
-+/* jne UMLOOP */
- /*
- * pC += 14; pA += 14*NB; pB -= NB;
- */
--/* subq $MBKBso-NB14so+176, pA5 */
--/* subq $MBKBso-NB14so+176, pA10 */
-- subq incAm, pA5
-- subq incAm, pA10
-- addq $NBso-176, pB0
-+/* subq $MBKBso-NB14so+176, pA5 */
-+/* subq $MBKBso-NB14so+176, pA10 */
-+ subq incAm, pA5
-+ subq incAm, pA10
-+ addq $NBso-176, pB0
- /*
- * while (pA != stM);
- */
--/* subq $1, stM */
--/* jne UMLOOP */
-+/* subq $1, stM */
-+/* jne UMLOOP */
- /*
- * pC += incCn; pA -= NBNB; pB += NB;
- */
-- addq incCn, pC
-+ addq incCn, pC
- /*
- * while (pB != stN);
- */
-- sub $1, stN
-- jne UNLOOP
-+ sub $1, stN
-+ jne UNLOOP
-
- /*
- * Restore callee-saved iregs
- */
- DONE:
-- movq -8(%rsp), %rbp
-- movq -16(%rsp), %rbx
-+ movq -8(%rsp), %rbp
-+ movq -16(%rsp), %rbx
- #if MB == 0
-- movq -32(%rsp), %r12
-- movq -40(%rsp), %r13
-+ movq -32(%rsp), %r12
-+ movq -40(%rsp), %r13
- #endif
-- ret
-+ ret
- #if MB == 0
- MB_LT84:
-- cmp $70, stM
-- jne MB_LT70
--/* movq $70/14, stM */
-- movq $5, stM
-- jmp MBFOUND
-+ cmp $70, stM
-+ jne MB_LT70
-+/* movq $70/14, stM */
-+ movq $5, stM
-+ jmp MBFOUND
- MB_LT70:
-- cmp $56, stM
-- jne MB_LT56
--/* movq $56/14, stM */
-- movq $4, stM
-- jmp MBFOUND
-+ cmp $56, stM
-+ jne MB_LT56
-+/* movq $56/14, stM */
-+ movq $4, stM
-+ jmp MBFOUND
- MB_LT56:
- cmp $42, stM
- jne MB_LT42
-diff -rupN ATLAS/tune/blas/level1/scalsrch.c atlas-3.8.3/tune/blas/level1/scalsrch.c
---- ATLAS/tune/blas/level1/scalsrch.c 2009-02-18 19:48:25.000000000 +0100
-+++ atlas-3.8.3/tune/blas/level1/scalsrch.c 2009-11-12 13:45:48.141174024 +0100
-@@ -747,7 +747,7 @@ void GenMainRout(char pre, int n, int *i
- /*
- * Handle all special alpha cases
- */
-- fprintf(fpout, "%sif ( SCALAR_IS_ZERO(alpha) )\n", spc);
-+ /* fprintf(fpout, "%sif ( SCALAR_IS_ZERO(alpha) )\n", spc);
- fprintf(fpout, "%s{\n", spc);
- if (pre == 'c' || pre == 'z')
- {
-@@ -756,7 +756,7 @@ void GenMainRout(char pre, int n, int *i
- }
- else fprintf(fpout, "%s Mjoin(PATL,set)(N, ATL_rzero, X, incx);\n", spc);
- fprintf(fpout, "%s return;\n", spc);
-- fprintf(fpout, "%s}\n", spc);
-+ fprintf(fpout, "%s}\n", spc); */
- GenAlphCase(pre, spc, fpout, 1, n, ix, iy, ia, ib);
- GenAlphCase(pre, spc, fpout, -1, n, ix, iy, ia, ib);
- if (pre == 'c' || pre == 'z')