diff options
Diffstat (limited to 'libraries/atlas/atlas.patch')
-rw-r--r-- | libraries/atlas/atlas.patch | 5072 |
1 files changed, 0 insertions, 5072 deletions
diff --git a/libraries/atlas/atlas.patch b/libraries/atlas/atlas.patch deleted file mode 100644 index dea4dcc0b2ee..000000000000 --- a/libraries/atlas/atlas.patch +++ /dev/null @@ -1,5072 +0,0 @@ -diff -rupN ATLAS/CONFIG/src/backend/archinfo_x86.c atlas-3.8.3/CONFIG/src/backend/archinfo_x86.c ---- ATLAS/CONFIG/src/backend/archinfo_x86.c 2009-02-18 19:47:37.000000000 +0100 -+++ atlas-3.8.3/CONFIG/src/backend/archinfo_x86.c 2009-11-12 13:47:23.777451677 +0100 -@@ -320,7 +320,7 @@ enum MACHTYPE Chip2Mach(enum CHIP chip, - iret = IntP4; - break; - case 3: -- case 4: -+ case 4: ; case 6: - iret = IntP4E; - break; - default: -diff -rupN ATLAS/include/atlas_lvl3.h atlas-3.8.3/include/atlas_lvl3.h ---- ATLAS/include/atlas_lvl3.h 2009-02-18 19:47:35.000000000 +0100 -+++ atlas-3.8.3/include/atlas_lvl3.h 2009-11-12 13:52:49.308496090 +0100 -@@ -126,7 +126,7 @@ - #define CPAT Mjoin(C_ATL_, PRE); - - #ifndef ATL_MaxMalloc -- #define ATL_MaxMalloc 67108864 -+ #define ATL_MaxMalloc XXX_MaxMalloc_XXX - #endif - - typedef void (*MAT2BLK)(int, int, const TYPE*, int, TYPE*, const SCALAR); -diff -rupN ATLAS/src/blas/gemm/ATL_cmmJITcp.c atlas-3.8.3/src/blas/gemm/ATL_cmmJITcp.c ---- ATLAS/src/blas/gemm/ATL_cmmJITcp.c 2009-02-18 19:47:44.000000000 +0100 -+++ atlas-3.8.3/src/blas/gemm/ATL_cmmJITcp.c 2009-11-12 12:44:34.816529051 +0100 -@@ -268,7 +268,8 @@ static void Mjoin(PATL,mmK) - { - NBmm0 = NBmm1 = NBmmX = Mjoin(PATLU,pKBmm); - if (SCALAR_IS_ZERO(beta)) -- Mjoin(PATL,gezero)(M, N, C, ldc); -+ /* Mjoin(PATL,gezero)(M, N, C, ldc); */ -+ { Mjoin(PATLU,gezero)(M, N, pC, ldpc); Mjoin(PATLU,gezero)(M, N, pC+ipc, ldpc); } - } - if (nblk) - { -diff -rupN ATLAS/src/blas/gemm/ATL_gereal2cplx.c atlas-3.8.3/src/blas/gemm/ATL_gereal2cplx.c ---- ATLAS/src/blas/gemm/ATL_gereal2cplx.c 2009-02-18 19:47:44.000000000 +0100 -+++ atlas-3.8.3/src/blas/gemm/ATL_gereal2cplx.c 2009-11-12 12:49:49.331651677 +0100 -@@ -43,7 +43,53 @@ void Mjoin(PATL,gereal2cplx) - const int ldc2 = (ldc-M)<<1; - int i, j; - -- if (ialp == ATL_rzero && ibet == ATL_rzero) -+/* -+ * Cannot read C if BETA is 0 -+ */ -+ if (rbet == ATL_rzero && ibet == ATL_rzero) -+ { -+ if (ialp == ATL_rzero) /* alpha is a real number */ -+ { -+ if (ralp == ATL_rone) /* alpha = 1.0 */ -+ { -+ for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2) -+ { -+ for (i=0; i < M; i++, C += 2) -+ { -+ *C = R[i]; -+ C[1] = I[i]; -+ } -+ } -+ } -+ else -+ { -+ for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2) -+ { -+ for (i=0; i < M; i++, C += 2) -+ { -+ *C = ralp * R[i]; -+ C[1] = ralp * I[i]; -+ } -+ } -+ } -+ } -+ else /* alpha is a complex number */ -+ { -+ for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2) -+ { -+ for (i=0; i < M; i++, C += 2) -+ { -+ ra = R[i]; ia = I[i]; -+ C[0] = ralp * ra - ialp * ia; -+ C[1] = ralp * ia + ialp * ra; -+ } -+ } -+ } -+ } -+/* -+ * If alpha and beta are both real numbers -+ */ -+ else if (ialp == ATL_rzero && ibet == ATL_rzero) - { - if (ralp == ATL_rone && rbet == ATL_rone) - { -diff -rupN ATLAS/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c atlas-3.8.3/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c ---- ATLAS/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c 2009-02-18 19:48:26.000000000 +0100 -+++ atlas-3.8.3/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c 2009-11-12 12:35:50.453038827 +0100 -@@ -27,6 +27,13 @@ - * POSSIBILITY OF SUCH DAMAGE. - * - */ -+#if KB > 84 -+ #error "KB cannot exceed 84!" -+#endif -+#if (KB/4)*4 != KB -+ #error "KB must be a multiple of 4!" -+#endif -+ - #ifndef ATL_GAS_x8664 - #error "This kernel requires x86-64 assembly!" - #endif -@@ -58,25 +65,25 @@ - * Integer register usage shown be these defines - */ - #define pA %rcx --#define pA10 %rbx --#define ldab %rbp --#define mldab %rdx -+#define pA10 %rbx -+#define ldab %rbp -+#define mldab %rdx - #define mldab5 %rax - #define pB %rdi - #define pC %rsi - #define incCn %r10 - #define stM %r9 - #define stN %r11 --#define pfA %r8 --#define pA5 pA --#define pB0 pB -+#define pfA %r8 -+#define pA5 pA -+#define pB0 pB - #if MB == 0 -- #define stM0 %r12 -- #define incAm %r13 -+ #define stM0 %r12 -+ #define incAm %r13 - #endif - /* rax used in 32/64 conversion */ - --#define NBso (KB*4) -+#define NBso (KB*4) - #define MBKBso (MB*KB*4) - #define NB2so (NBso+NBso) - #define NB3so (NBso+NBso+NBso) -@@ -95,22 +102,22 @@ - /* - * SSE2 register usage shown be these defines - */ --#define rA0 %xmm0 --#define rB0 %xmm1 --#define rC0 %xmm2 --#define rC1 %xmm3 --#define rC2 %xmm4 --#define rC3 %xmm5 --#define rC4 %xmm6 --#define rC5 %xmm7 --#define rC6 %xmm8 --#define rC7 %xmm9 --#define rC8 %xmm10 --#define rC9 %xmm11 --#define rC10 %xmm12 --#define rC11 %xmm13 --#define rC12 %xmm14 --#define rC13 %xmm15 -+#define rA0 %xmm0 -+#define rB0 %xmm1 -+#define rC0 %xmm2 -+#define rC1 %xmm3 -+#define rC2 %xmm4 -+#define rC3 %xmm5 -+#define rC4 %xmm6 -+#define rC5 %xmm7 -+#define rC6 %xmm8 -+#define rC7 %xmm9 -+#define rC8 %xmm10 -+#define rC9 %xmm11 -+#define rC10 %xmm12 -+#define rC11 %xmm13 -+#define rC12 %xmm14 -+#define rC13 %xmm15 - /* - * Prefetch defines - */ -@@ -127,99 +134,99 @@ - #if MB != 0 - #define incAm $MBKBso-NB14so+176 - #endif -- .text -+ .text - .global ATL_asmdecor(ATL_USERMM) - ATL_asmdecor(ATL_USERMM): - /* - * Save callee-saved iregs - */ -- movq %rbp, -8(%rsp) -- movq %rbx, -16(%rsp) -+ movq %rbp, -8(%rsp) -+ movq %rbx, -16(%rsp) - #if MB == 0 -- movq %r12, -32(%rsp) -- movq %r13, -40(%rsp) -+ movq %r12, -32(%rsp) -+ movq %r13, -40(%rsp) - #endif - #ifdef BETAX - #define BOF -56 -- movss %xmm1, BOF(%rsp) -- movss %xmm1, BOF+4(%rsp) -- movss %xmm1, BOF+8(%rsp) -- movss %xmm1, BOF+12(%rsp) -+ movss %xmm1, BOF(%rsp) -+ movss %xmm1, BOF+4(%rsp) -+ movss %xmm1, BOF+8(%rsp) -+ movss %xmm1, BOF+12(%rsp) - #endif - /* - * pA already comes in right reg - * Initialize pB = B; pC = C; NBso = NB * sizeof; - */ -- movq %rsi, stN -- movq %rdi, %rax -- movq 16(%rsp), pC -- prefC((pC)) -- prefC(64(pC)) -- movq %r9, pB -- prefB((pB)) -- prefB(64(pB)) -- movq %rax, stM -+ movq %rsi, stN -+ movq %rdi, %rax -+ movq 16(%rsp), pC -+ prefC((pC)) -+ prefC(64(pC)) -+ movq %r9, pB -+ prefB((pB)) -+ prefB(64(pB)) -+ movq %rax, stM - /* - * stM = pA + NBNBso; stN = pB + NBNBso; - */ - #if MB == 0 -- movq stM, pfA -- imulq $NBso, pfA -- prefB(128(pB)) -- movq pfA, incAm -- addq pA5, pfA -- addq $176-NB14so, incAm -+ movq stM, pfA -+ imulq $NBso, pfA -+ prefB(128(pB)) -+ movq pfA, incAm -+ addq pA5, pfA -+ addq $176-NB14so, incAm - #else -- movq $MBKBso, pfA -- addq pA5, pfA -- prefB(128(pB)) -+ movq $MBKBso, pfA -+ addq pA5, pfA -+ prefB(128(pB)) - #endif - /* - * convert ldc to 64 bits, and then set incCn = (ldc - MB)*sizeof - */ -- movl 24(%rsp), %eax -- cltq -- movq %rax, incCn -- subq stM, incCn -- addq $14, incCn -+ movl 24(%rsp), %eax -+ cltq -+ movq %rax, incCn -+ subq stM, incCn -+ addq $14, incCn - #ifdef SREAL -- shl $2, incCn -+ shl $2, incCn - #else -- shl $3, incCn -- prefC(128(pC)) -- prefC(192(pC)) -+ shl $3, incCn -+ prefC(128(pC)) -+ prefC(192(pC)) - #endif - /* - * Find M/14 if MB is not set - */ - #if MB == 0 -- cmp $84, stM -- jne MB_LT84 --/* movq $84/14, stM */ -- movq $6, stM -+ cmp $84, stM -+ jne MB_LT84 -+/* movq $84/14, stM */ -+ movq $6, stM - MBFOUND: -- subq $1, stM -- movq stM, stM0 -+ subq $1, stM -+ movq stM, stM0 - #endif -- addq $120, pA5 -- addq $120, pB0 -- movq $KB*4, ldab -- movq $-KB*5*4, mldab5 -- movq $-KB*4, mldab -- subq mldab5, pA5 -- lea KB*4(pA5, ldab,4), pA10 --/* movq $NB, stN */ -+ addq $120, pA5 -+ addq $120, pB0 -+ movq $KB*4, ldab -+ movq $-KB*5*4, mldab5 -+ movq $-KB*4, mldab -+ subq mldab5, pA5 -+ lea KB*4(pA5, ldab,4), pA10 -+/* movq $NB, stN */ - - UNLOOP: - #if MB == 0 -- movq stM0, stM -- cmp $0, stM -- je MLAST -+ movq stM0, stM -+ cmp $0, stM -+ je MLAST - #else - #ifdef ATL_DivAns -- movq $ATL_DivAns-1, stM -+ movq $ATL_DivAns-1, stM - #else -- movq $MB/14-1, stM -+ movq $MB/14-1, stM - #endif - #endif - #if MB == 0 || MB > 14 -@@ -227,992 +234,992 @@ UMLOOP: - /* - * rC[0-13] = pC[0-13] * beta - */ -- ALIGN16 -+ ALIGN16 - /*UKLOOP: */ - #ifdef BETA1 -- movaps 0-120(pA10,mldab5,2), rC0 -- movaps 0-120(pB0), rB0 -- mulps rB0, rC0 -- addss (pC), rC0 -- movaps 0-120(pA5, mldab,4), rC1 -- mulps rB0, rC1 -- addss CMUL(4)(pC), rC1 -- movaps 0-120(pA10, mldab,8), rC2 -- mulps rB0, rC2 -- addss CMUL(8)(pC), rC2 -- movaps 0-120(pA5, mldab,2), rC3 -- mulps rB0, rC3 -- addss CMUL(12)(pC), rC3 -- movaps 0-120(pA5, mldab), rC4 -- mulps rB0, rC4 -- addss CMUL(16)(pC), rC4 -- movaps 0-120(pA5), rC5 -- mulps rB0, rC5 -- addss CMUL(20)(pC), rC5 -- movaps 0-120(pA5, ldab), rC6 -- mulps rB0, rC6 -- addss CMUL(24)(pC), rC6 -- movaps 0-120(pA5, ldab,2), rC7 -- mulps rB0, rC7 -- addss CMUL(28)(pC), rC7 -- movaps 0-120(pA10, mldab,2), rC8 -- mulps rB0, rC8 -- addss CMUL(32)(pC), rC8 -- movaps 0-120(pA5,ldab,4), rC9 -- mulps rB0, rC9 -- addss CMUL(36)(pC), rC9 -- movaps 0-120(pA10), rC10 -- mulps rB0, rC10 -- addss CMUL(40)(pC), rC10 -- movaps 0-120(pA10,ldab), rC11 -- mulps rB0, rC11 -- addss CMUL(44)(pC), rC11 -- movaps 0-120(pA10,ldab,2), rC12 -- mulps rB0, rC12 -- addss CMUL(48)(pC), rC12 -- movaps 0-120(pA5,ldab,8), rC13 -- mulps rB0, rC13 -- addss CMUL(52)(pC), rC13 -+ movaps 0-120(pA10,mldab5,2), rC0 -+ movaps 0-120(pB0), rB0 -+ mulps rB0, rC0 -+ addss (pC), rC0 -+ movaps 0-120(pA5, mldab,4), rC1 -+ mulps rB0, rC1 -+ addss CMUL(4)(pC), rC1 -+ movaps 0-120(pA10, mldab,8), rC2 -+ mulps rB0, rC2 -+ addss CMUL(8)(pC), rC2 -+ movaps 0-120(pA5, mldab,2), rC3 -+ mulps rB0, rC3 -+ addss CMUL(12)(pC), rC3 -+ movaps 0-120(pA5, mldab), rC4 -+ mulps rB0, rC4 -+ addss CMUL(16)(pC), rC4 -+ movaps 0-120(pA5), rC5 -+ mulps rB0, rC5 -+ addss CMUL(20)(pC), rC5 -+ movaps 0-120(pA5, ldab), rC6 -+ mulps rB0, rC6 -+ addss CMUL(24)(pC), rC6 -+ movaps 0-120(pA5, ldab,2), rC7 -+ mulps rB0, rC7 -+ addss CMUL(28)(pC), rC7 -+ movaps 0-120(pA10, mldab,2), rC8 -+ mulps rB0, rC8 -+ addss CMUL(32)(pC), rC8 -+ movaps 0-120(pA5,ldab,4), rC9 -+ mulps rB0, rC9 -+ addss CMUL(36)(pC), rC9 -+ movaps 0-120(pA10), rC10 -+ mulps rB0, rC10 -+ addss CMUL(40)(pC), rC10 -+ movaps 0-120(pA10,ldab), rC11 -+ mulps rB0, rC11 -+ addss CMUL(44)(pC), rC11 -+ movaps 0-120(pA10,ldab,2), rC12 -+ mulps rB0, rC12 -+ addss CMUL(48)(pC), rC12 -+ movaps 0-120(pA5,ldab,8), rC13 -+ mulps rB0, rC13 -+ addss CMUL(52)(pC), rC13 - #else -- movaps 0-120(pA10,mldab5,2), rC0 -- movaps 0-120(pB0), rC13 -- mulps rC13, rC0 -- movaps 0-120(pA5, mldab,4), rC1 -- mulps rC13, rC1 -- movaps 0-120(pA10, mldab,8), rC2 -- mulps rC13, rC2 -- movaps 0-120(pA5, mldab,2), rC3 -- mulps rC13, rC3 -- movaps 0-120(pA5, mldab), rC4 -- mulps rC13, rC4 -- movaps 0-120(pA5), rC5 -- mulps rC13, rC5 -- movaps 0-120(pA5, ldab), rC6 -- mulps rC13, rC6 -- movaps 0-120(pA5, ldab,2), rC7 -- mulps rC13, rC7 -- movaps 0-120(pA10, mldab,2), rC8 -- mulps rC13, rC8 -- movaps 0-120(pA5,ldab,4), rC9 -- mulps rC13, rC9 -- movaps 0-120(pA10), rC10 -- mulps rC13, rC10 -- movaps 0-120(pA10,ldab), rC11 -- mulps rC13, rC11 -- movaps 0-120(pA10,ldab,2), rC12 -- mulps rC13, rC12 -- mulps 0-120(pA5,ldab,8), rC13 -+ movaps 0-120(pA10,mldab5,2), rC0 -+ movaps 0-120(pB0), rC13 -+ mulps rC13, rC0 -+ movaps 0-120(pA5, mldab,4), rC1 -+ mulps rC13, rC1 -+ movaps 0-120(pA10, mldab,8), rC2 -+ mulps rC13, rC2 -+ movaps 0-120(pA5, mldab,2), rC3 -+ mulps rC13, rC3 -+ movaps 0-120(pA5, mldab), rC4 -+ mulps rC13, rC4 -+ movaps 0-120(pA5), rC5 -+ mulps rC13, rC5 -+ movaps 0-120(pA5, ldab), rC6 -+ mulps rC13, rC6 -+ movaps 0-120(pA5, ldab,2), rC7 -+ mulps rC13, rC7 -+ movaps 0-120(pA10, mldab,2), rC8 -+ mulps rC13, rC8 -+ movaps 0-120(pA5,ldab,4), rC9 -+ mulps rC13, rC9 -+ movaps 0-120(pA10), rC10 -+ mulps rC13, rC10 -+ movaps 0-120(pA10,ldab), rC11 -+ mulps rC13, rC11 -+ movaps 0-120(pA10,ldab,2), rC12 -+ mulps rC13, rC12 -+ mulps 0-120(pA5,ldab,8), rC13 - #endif - - #if KB > 4 -- movaps 16-120(pA10,mldab5,2), rA0 -- movaps 16-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 16-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 16-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 16-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 16-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 16-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 16-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 16-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 16-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 16-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 16-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 16-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 16-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 16-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 16-120(pA10,mldab5,2), rA0 -+ movaps 16-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 16-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 16-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 16-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 16-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 16-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 16-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 16-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 16-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 16-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 16-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 16-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 16-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 16-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 8 -- movaps 32-120(pA10,mldab5,2), rA0 -- movaps 32-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 32-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 32-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 32-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 32-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 32-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 32-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 32-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 32-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 32-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 32-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 32-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 32-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 32-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 32-120(pA10,mldab5,2), rA0 -+ movaps 32-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 32-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 32-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 32-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 32-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 32-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 32-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 32-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 32-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 32-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 32-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 32-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 32-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 32-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 12 -- movaps 48-120(pA10,mldab5,2), rA0 -- movaps 48-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 48-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 48-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 48-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 48-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 48-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 48-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 48-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 48-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 48-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 48-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 48-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 48-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 48-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 48-120(pA10,mldab5,2), rA0 -+ movaps 48-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 48-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 48-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 48-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 48-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 48-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 48-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 48-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 48-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 48-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 48-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 48-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 48-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 48-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 16 -- movaps 64-120(pA10,mldab5,2), rA0 -- movaps 64-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 64-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 64-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 64-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 64-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 64-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 64-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 64-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 64-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 64-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 64-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 64-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 64-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 64-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 64-120(pA10,mldab5,2), rA0 -+ movaps 64-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 64-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 64-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 64-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 64-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 64-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 64-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 64-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 64-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 64-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 64-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 64-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 64-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 64-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 20 -- movaps 80-120(pA10,mldab5,2), rA0 -- movaps 80-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 80-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 80-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 80-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 80-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 80-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 80-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 80-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 80-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 80-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 80-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 80-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 80-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 80-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 80-120(pA10,mldab5,2), rA0 -+ movaps 80-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 80-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 80-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 80-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 80-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 80-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 80-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 80-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 80-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 80-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 80-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 80-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 80-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 80-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 24 -- movaps 96-120(pA10,mldab5,2), rA0 -- movaps 96-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 96-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 96-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 96-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 96-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 96-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 96-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 96-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 96-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 96-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 96-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 96-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 96-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 96-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 96-120(pA10,mldab5,2), rA0 -+ movaps 96-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 96-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 96-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 96-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 96-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 96-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 96-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 96-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 96-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 96-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 96-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 96-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 96-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 96-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 28 -- movaps 112-120(pA10,mldab5,2), rA0 -- movaps 112-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 112-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 112-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 112-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 112-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 112-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 112-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 112-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 112-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 112-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 112-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 112-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 112-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 112-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 112-120(pA10,mldab5,2), rA0 -+ movaps 112-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 112-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 112-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 112-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 112-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 112-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 112-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 112-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 112-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 112-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 112-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 112-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 112-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 112-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - #ifndef SREAL -- pref2((pfA)) -- pref2(64(pfA)) -+ pref2((pfA)) -+ pref2(64(pfA)) - #endif - - #if KB > 32 -- movaps 128-120(pA10,mldab5,2), rA0 -- movaps 128-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 128-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 128-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 128-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 128-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 128-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 128-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 128-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 128-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 128-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 128-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 128-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 128-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 128-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 128-120(pA10,mldab5,2), rA0 -+ movaps 128-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 128-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 128-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 128-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 128-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 128-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 128-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 128-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 128-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 128-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 128-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 128-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 128-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 128-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 36 -- movaps 144-120(pA10,mldab5,2), rA0 -- movaps 144-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 144-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 144-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 144-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 144-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 144-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 144-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 144-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 144-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 144-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 144-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 144-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 144-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 144-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 144-120(pA10,mldab5,2), rA0 -+ movaps 144-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 144-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 144-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 144-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 144-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 144-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 144-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 144-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 144-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 144-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 144-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 144-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 144-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 144-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 40 -- movaps 160-120(pA10,mldab5,2), rA0 -- movaps 160-120(pB0), rB0 -- mulps rB0, rA0 -- addq $176, pB0 -- addps rA0, rC0 -- movaps 160-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 160-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 160-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 160-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 160-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 160-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 160-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 160-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 160-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 160-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 160-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 160-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addq $176, pA10 -- addps rA0, rC12 -- mulps 160-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -- addq $176, pA5 -+ movaps 160-120(pA10,mldab5,2), rA0 -+ movaps 160-120(pB0), rB0 -+ mulps rB0, rA0 -+ addq $176, pB0 -+ addps rA0, rC0 -+ movaps 160-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 160-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 160-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 160-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 160-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 160-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 160-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 160-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 160-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 160-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 160-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 160-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addq $176, pA10 -+ addps rA0, rC12 -+ mulps 160-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 -+ addq $176, pA5 - #else -- addq $176, pB0 -- addq $176, pA10 -- addq $176, pA5 -+ addq $176, pB0 -+ addq $176, pA10 -+ addq $176, pA5 - #endif - - #if KB > 44 -- movaps 0-120(pA10,mldab5,2), rA0 -- movaps 0-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 0-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 0-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 0-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 0-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 0-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 0-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 0-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 0-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 0-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 0-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 0-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 0-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 0-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 0-120(pA10,mldab5,2), rA0 -+ movaps 0-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 0-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 0-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 0-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 0-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 0-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 0-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 0-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 0-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 0-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 0-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 0-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 0-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 0-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 48 -- movaps 16-120(pA10,mldab5,2), rA0 -- movaps 16-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 16-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 16-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 16-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 16-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 16-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 16-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 16-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 16-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 16-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 16-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 16-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 16-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 16-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 16-120(pA10,mldab5,2), rA0 -+ movaps 16-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 16-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 16-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 16-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 16-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 16-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 16-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 16-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 16-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 16-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 16-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 16-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 16-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 16-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 52 -- movaps 32-120(pA10,mldab5,2), rA0 -- movaps 32-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 32-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 32-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 32-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 32-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 32-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 32-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 32-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 32-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 32-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 32-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 32-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 32-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 32-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 32-120(pA10,mldab5,2), rA0 -+ movaps 32-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 32-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 32-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 32-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 32-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 32-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 32-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 32-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 32-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 32-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 32-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 32-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 32-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 32-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 56 -- movaps 48-120(pA10,mldab5,2), rA0 -- movaps 48-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 48-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 48-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 48-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 48-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 48-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 48-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 48-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 48-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 48-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 48-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 48-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 48-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 48-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 48-120(pA10,mldab5,2), rA0 -+ movaps 48-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 48-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 48-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 48-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 48-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 48-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 48-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 48-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 48-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 48-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 48-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 48-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 48-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 48-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 60 -- movaps 64-120(pA10,mldab5,2), rA0 -- movaps 64-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 64-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 64-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 64-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 64-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 64-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 64-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 64-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 64-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 64-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 64-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 64-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 64-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 64-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 64-120(pA10,mldab5,2), rA0 -+ movaps 64-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 64-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 64-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 64-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 64-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 64-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 64-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 64-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 64-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 64-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 64-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 64-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 64-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 64-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 64 -- movaps 80-120(pA10,mldab5,2), rA0 -- movaps 80-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 80-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 80-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 80-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 80-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 80-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 80-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 80-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 80-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 80-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 80-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 80-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 80-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 80-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 80-120(pA10,mldab5,2), rA0 -+ movaps 80-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 80-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 80-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 80-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 80-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 80-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 80-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 80-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 80-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 80-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 80-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 80-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 80-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 80-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 68 -- movaps 96-120(pA10,mldab5,2), rA0 -- movaps 96-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 96-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 96-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 96-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 96-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 96-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 96-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 96-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 96-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 96-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 96-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 96-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 96-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 96-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 96-120(pA10,mldab5,2), rA0 -+ movaps 96-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 96-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 96-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 96-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 96-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 96-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 96-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 96-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 96-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 96-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 96-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 96-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 96-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 96-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 72 -- movaps 112-120(pA10,mldab5,2), rA0 -- movaps 112-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 112-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 112-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 112-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 112-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 112-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 112-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 112-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 112-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 112-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 112-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 112-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 112-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 112-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 112-120(pA10,mldab5,2), rA0 -+ movaps 112-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 112-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 112-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 112-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 112-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 112-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 112-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 112-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 112-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 112-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 112-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 112-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 112-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 112-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 76 -- movaps 128-120(pA10,mldab5,2), rA0 -- movaps 128-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 128-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 128-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 128-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 128-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 128-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 128-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 128-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 128-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 128-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 128-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 128-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 128-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 128-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 128-120(pA10,mldab5,2), rA0 -+ movaps 128-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 128-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 128-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 128-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 128-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 128-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 128-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 128-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 128-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 128-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 128-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 128-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 128-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 128-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 80 -- movaps 144-120(pA10,mldab5,2), rA0 -- movaps 144-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 144-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 144-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 144-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 144-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 144-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 144-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 144-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 144-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 144-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 144-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 144-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 144-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 144-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 144-120(pA10,mldab5,2), rA0 -+ movaps 144-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 144-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 144-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 144-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 144-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 144-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 144-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 144-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 144-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 144-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 144-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 144-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 144-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 144-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - /*UKLOOP */ -@@ -1220,234 +1227,234 @@ UMLOOP: - * Get these bastard things summed up correctly - */ - -- /* rC0 = c0a c0b c0c c0d */ -- /* rC1 = c1a c1b c1c c1d */ -- /* rC2 = c2a c2b c2c c2d */ -- /* rC3 = c3a c3b c3c c3d */ -+ /* rC0 = c0a c0b c0c c0d */ -+ /* rC1 = c1a c1b c1c c1d */ -+ /* rC2 = c2a c2b c2c c2d */ -+ /* rC3 = c3a c3b c3c c3d */ - /* */ -- movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */ -- prefC((pC)) -- prefC(64(pC)) -- movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */ -- unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */ -- unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */ -- unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */ -- movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */ -- unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */ -- movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */ -- movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */ -- movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */ -- addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */ -- movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */ -- movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */ -- movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */ -- addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */ -- movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */ -- addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */ -- -- -- /* rC4 = c4a c4b c4c c4d */ -- /* rC5 = c5a c5b c5c c5d */ -- /* rC6 = c6a c6b c6c c6d */ -- /* rC7 = c7a c7b c7c c7d */ -- /* rC8 = c08a c08b c08c c08d */ -- /* rC9 = c09a c09b c09c c09d */ -- /* rC10 = c10a c10b c10c c10d */ -- /* rC11 = c11a c11b c11c c11d */ -- /* rC12 = c12a c12b c12c c12d */ -- /* rC13 = c13a c13b c13c c13d */ -+ movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */ -+ prefC((pC)) -+ prefC(64(pC)) -+ movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */ -+ unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */ -+ unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */ -+ unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */ -+ movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */ -+ unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */ -+ movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */ -+ movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */ -+ movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */ -+ addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */ -+ movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */ -+ movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */ -+ movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */ -+ addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */ -+ movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */ -+ addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */ -+ -+ -+ /* rC4 = c4a c4b c4c c4d */ -+ /* rC5 = c5a c5b c5c c5d */ -+ /* rC6 = c6a c6b c6c c6d */ -+ /* rC7 = c7a c7b c7c c7d */ -+ /* rC8 = c08a c08b c08c c08d */ -+ /* rC9 = c09a c09b c09c c09d */ -+ /* rC10 = c10a c10b c10c c10d */ -+ /* rC11 = c11a c11b c11c c11d */ -+ /* rC12 = c12a c12b c12c c12d */ -+ /* rC13 = c13a c13b c13c c13d */ - /* */ -- movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */ -- prefC(128(pC)) -+ movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */ -+ prefC(128(pC)) - #ifdef SREAL -- pref2((pfA)) -+ pref2((pfA)) - #else -- prefC(192(pC)) -+ prefC(192(pC)) - #endif -- movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */ -- movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */ -- unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */ -- unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */ -- unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */ -- unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */ -- unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */ -- movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */ -- unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */ -- movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */ -- movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */ -- unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */ -- movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */ -- movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */ -- addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */ -+ movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */ -+ movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */ -+ unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */ -+ unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */ -+ unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */ -+ unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */ -+ unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */ -+ movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */ -+ unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */ -+ movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */ -+ movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */ -+ unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */ -+ movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */ -+ movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */ -+ addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */ - #ifdef BETAX - #ifdef SREAL -- movups (pC), rA0 -- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ -- movups 16(pC), rC4 -- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ -- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ -- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ -- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ -- movups 32(pC), rC5 -- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ -- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ -- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ -- movlps 48(pC), rC1 -- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ -- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ -- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ -- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ -- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ -- pref2(64(pfA)) -- mulps BOF(%rsp), rA0 -- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ -- mulps BOF(%rsp), rC4 -- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ -- mulps BOF(%rsp), rC5 -- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ -- mulps BOF(%rsp), rC1 -+ movups (pC), rA0 -+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ -+ movups 16(pC), rC4 -+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ -+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ -+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ -+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ -+ movups 32(pC), rC5 -+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ -+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ -+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ -+ movlps 48(pC), rC1 -+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ -+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ -+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ -+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ -+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ -+ pref2(64(pfA)) -+ mulps BOF(%rsp), rA0 -+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ -+ mulps BOF(%rsp), rC4 -+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ -+ mulps BOF(%rsp), rC5 -+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ -+ mulps BOF(%rsp), rC1 - - /* */ - -- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ -- addps rA0, rC3 -- addq $68, pfA -- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ -- addps rC4, rC7 -- addps rC5, rC11 -- addps rC1, rC12 -+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ -+ addps rA0, rC3 -+ addq $68, pfA -+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ -+ addps rC4, rC7 -+ addps rC5, rC11 -+ addps rC1, rC12 - #else /* BETA = X, complex type */ -- movups (pC), rA0 -- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ -- movups 16(pC), rC4 -- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ -- shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */ -- movups 32(pC), rC4 /* rC4 = c4 X c5 X */ -- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ -- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ -- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ -- movups 48(pC), rC5 /* rC5 = c6 X c7 X */ -- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ -- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ -- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ -- shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */ -- movups 64(pC), rC5 /* rC5 = c8 X c9 X */ -- movups 80(pC), rC1 /* rC1 = c10 X c11 X */ -- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ -- shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */ -- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ -- movss 96(pC), rC1 -- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ -- movss 104(pC), rB0 -- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ -- unpcklps rB0, rC1 -- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ -- prefC(256(pC)) -- mulps BOF(%rsp), rA0 -- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ -- mulps BOF(%rsp), rC4 -- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ -- mulps BOF(%rsp), rC5 -- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ -- mulps BOF(%rsp), rC1 -+ movups (pC), rA0 -+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ -+ movups 16(pC), rC4 -+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ -+ shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */ -+ movups 32(pC), rC4 /* rC4 = c4 X c5 X */ -+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ -+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ -+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ -+ movups 48(pC), rC5 /* rC5 = c6 X c7 X */ -+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ -+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ -+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ -+ shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */ -+ movups 64(pC), rC5 /* rC5 = c8 X c9 X */ -+ movups 80(pC), rC1 /* rC1 = c10 X c11 X */ -+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ -+ shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */ -+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ -+ movss 96(pC), rC1 -+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ -+ movss 104(pC), rB0 -+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ -+ unpcklps rB0, rC1 -+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ -+ prefC(256(pC)) -+ mulps BOF(%rsp), rA0 -+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ -+ mulps BOF(%rsp), rC4 -+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ -+ mulps BOF(%rsp), rC5 -+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ -+ mulps BOF(%rsp), rC1 - - /* */ - -- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ -- addps rA0, rC3 -- prefC(192(pC)) -- addq $68, pfA -- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ -- addps rC4, rC7 -- addps rC5, rC11 -- addps rC1, rC12 -+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ -+ addps rA0, rC3 -+ prefC(192(pC)) -+ addq $68, pfA -+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ -+ addps rC4, rC7 -+ addps rC5, rC11 -+ addps rC1, rC12 - #endif - - #else -- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ -- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ -- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ -- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ -- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ -- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ -- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ -- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ -- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ -- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ -- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ -- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ -- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ -+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ -+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ -+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ -+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ -+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ -+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ -+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ -+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ -+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ -+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ -+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ -+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ -+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ - #ifdef SREAL -- pref2(64(pfA)) -+ pref2(64(pfA)) - #else -- prefC(256(pC)) -+ prefC(256(pC)) - #endif -- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ -- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ -- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ -+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ -+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ -+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ - - /* */ - -- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ -+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ - #ifndef SREAL -- prefC(192(pC)) -+ prefC(192(pC)) - #endif -- addq $68, pfA -- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ -+ addq $68, pfA -+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ - - #endif - /* - * Write results back to C; pC += 14; - */ - #ifdef SREAL -- movups rC3, (pC) -- movups rC7, 16(pC) -- movups rC11, 32(pC) -- movlps rC12, 48(pC) -- addq $56, pC -+ movups rC3, (pC) -+ movups rC7, 16(pC) -+ movups rC11, 32(pC) -+ movlps rC12, 48(pC) -+ addq $56, pC - #else -- movss rC3, (pC) -- movss rC7, 32(pC) -- movhlps rC3, rC0 -- movhlps rC7, rC6 -- movss rC0, 16(pC) -- movss rC6, 48(pC) -- shufps $0x55, rC3, rC3 -- shufps $0x55, rC7, rC7 -- movss rC3, 8(pC) -- movss rC7, 40(pC) -- shufps $0x55, rC0, rC0 -- shufps $0x55, rC6, rC6 -- movss rC0, 24(pC) -- movss rC6, 56(pC) -- -- movss rC11, 64(pC) -- movhlps rC11, rC2 -- movss rC12, 96(pC) -- movss rC2, 80(pC) -- shufps $0x55, rC11, rC11 -- shufps $0x55, rC12, rC12 -- movss rC11, 72(pC) -- shufps $0x55, rC2, rC2 -- movss rC12, 104(pC) -- movss rC2, 88(pC) -+ movss rC3, (pC) -+ movss rC7, 32(pC) -+ movhlps rC3, rC0 -+ movhlps rC7, rC6 -+ movss rC0, 16(pC) -+ movss rC6, 48(pC) -+ shufps $0x55, rC3, rC3 -+ shufps $0x55, rC7, rC7 -+ movss rC3, 8(pC) -+ movss rC7, 40(pC) -+ shufps $0x55, rC0, rC0 -+ shufps $0x55, rC6, rC6 -+ movss rC0, 24(pC) -+ movss rC6, 56(pC) -+ -+ movss rC11, 64(pC) -+ movhlps rC11, rC2 -+ movss rC12, 96(pC) -+ movss rC2, 80(pC) -+ shufps $0x55, rC11, rC11 -+ shufps $0x55, rC12, rC12 -+ movss rC11, 72(pC) -+ shufps $0x55, rC2, rC2 -+ movss rC12, 104(pC) -+ movss rC2, 88(pC) - -- addq $112, pC -+ addq $112, pC - #endif - /* - * Write results back to C - */ -- addq $NB14so-176, pA5 -- addq $NB14so-176, pA10 -- subq $176, pB0 -+ addq $NB14so-176, pA5 -+ addq $NB14so-176, pA10 -+ subq $176, pB0 - /* - * pC += 14; pA += 14*NB; pB -= NB; - */ - /* - * while (pA != stM); - */ -- subq $1, stM -- jne UMLOOP -+ subq $1, stM -+ jne UMLOOP - #endif - - /* -@@ -1459,994 +1466,994 @@ MLAST: - #endif - /*UKLOOP: */ - #ifdef BETA1 -- movaps 0-120(pA10,mldab5,2), rC0 -- movaps 0-120(pB0), rB0 -- mulps rB0, rC0 -- addss (pC), rC0 -- movaps 0-120(pA5, mldab,4), rC1 -- mulps rB0, rC1 -- addss CMUL(4)(pC), rC1 -- movaps 0-120(pA10, mldab,8), rC2 -- mulps rB0, rC2 -- addss CMUL(8)(pC), rC2 -- movaps 0-120(pA5, mldab,2), rC3 -- mulps rB0, rC3 -- addss CMUL(12)(pC), rC3 -- movaps 0-120(pA5, mldab), rC4 -- mulps rB0, rC4 -- addss CMUL(16)(pC), rC4 -- movaps 0-120(pA5), rC5 -- mulps rB0, rC5 -- addss CMUL(20)(pC), rC5 -- movaps 0-120(pA5, ldab), rC6 -- mulps rB0, rC6 -- addss CMUL(24)(pC), rC6 -- movaps 0-120(pA5, ldab,2), rC7 -- mulps rB0, rC7 -- addss CMUL(28)(pC), rC7 -- movaps 0-120(pA10, mldab,2), rC8 -- mulps rB0, rC8 -- addss CMUL(32)(pC), rC8 -- movaps 0-120(pA5,ldab,4), rC9 -- mulps rB0, rC9 -- addss CMUL(36)(pC), rC9 -- movaps 0-120(pA10), rC10 -- mulps rB0, rC10 -- addss CMUL(40)(pC), rC10 -- movaps 0-120(pA10,ldab), rC11 -- mulps rB0, rC11 -- addss CMUL(44)(pC), rC11 -- movaps 0-120(pA10,ldab,2), rC12 -- mulps rB0, rC12 -- addss CMUL(48)(pC), rC12 -- movaps 0-120(pA5,ldab,8), rC13 -- mulps rB0, rC13 -- addss CMUL(52)(pC), rC13 -+ movaps 0-120(pA10,mldab5,2), rC0 -+ movaps 0-120(pB0), rB0 -+ mulps rB0, rC0 -+ addss (pC), rC0 -+ movaps 0-120(pA5, mldab,4), rC1 -+ mulps rB0, rC1 -+ addss CMUL(4)(pC), rC1 -+ movaps 0-120(pA10, mldab,8), rC2 -+ mulps rB0, rC2 -+ addss CMUL(8)(pC), rC2 -+ movaps 0-120(pA5, mldab,2), rC3 -+ mulps rB0, rC3 -+ addss CMUL(12)(pC), rC3 -+ movaps 0-120(pA5, mldab), rC4 -+ mulps rB0, rC4 -+ addss CMUL(16)(pC), rC4 -+ movaps 0-120(pA5), rC5 -+ mulps rB0, rC5 -+ addss CMUL(20)(pC), rC5 -+ movaps 0-120(pA5, ldab), rC6 -+ mulps rB0, rC6 -+ addss CMUL(24)(pC), rC6 -+ movaps 0-120(pA5, ldab,2), rC7 -+ mulps rB0, rC7 -+ addss CMUL(28)(pC), rC7 -+ movaps 0-120(pA10, mldab,2), rC8 -+ mulps rB0, rC8 -+ addss CMUL(32)(pC), rC8 -+ movaps 0-120(pA5,ldab,4), rC9 -+ mulps rB0, rC9 -+ addss CMUL(36)(pC), rC9 -+ movaps 0-120(pA10), rC10 -+ mulps rB0, rC10 -+ addss CMUL(40)(pC), rC10 -+ movaps 0-120(pA10,ldab), rC11 -+ mulps rB0, rC11 -+ addss CMUL(44)(pC), rC11 -+ movaps 0-120(pA10,ldab,2), rC12 -+ mulps rB0, rC12 -+ addss CMUL(48)(pC), rC12 -+ movaps 0-120(pA5,ldab,8), rC13 -+ mulps rB0, rC13 -+ addss CMUL(52)(pC), rC13 - #else -- movaps 0-120(pA10,mldab5,2), rC0 -- movaps 0-120(pB0), rC13 -- mulps rC13, rC0 -- movaps 0-120(pA5, mldab,4), rC1 -- mulps rC13, rC1 -- movaps 0-120(pA10, mldab,8), rC2 -- mulps rC13, rC2 -- movaps 0-120(pA5, mldab,2), rC3 -- mulps rC13, rC3 -- movaps 0-120(pA5, mldab), rC4 -- mulps rC13, rC4 -- movaps 0-120(pA5), rC5 -- mulps rC13, rC5 -- movaps 0-120(pA5, ldab), rC6 -- mulps rC13, rC6 -- movaps 0-120(pA5, ldab,2), rC7 -- mulps rC13, rC7 -- movaps 0-120(pA10, mldab,2), rC8 -- mulps rC13, rC8 -- movaps 0-120(pA5,ldab,4), rC9 -- mulps rC13, rC9 -- movaps 0-120(pA10), rC10 -- mulps rC13, rC10 -- movaps 0-120(pA10,ldab), rC11 -- mulps rC13, rC11 -- movaps 0-120(pA10,ldab,2), rC12 -- mulps rC13, rC12 -- mulps 0-120(pA5,ldab,8), rC13 -+ movaps 0-120(pA10,mldab5,2), rC0 -+ movaps 0-120(pB0), rC13 -+ mulps rC13, rC0 -+ movaps 0-120(pA5, mldab,4), rC1 -+ mulps rC13, rC1 -+ movaps 0-120(pA10, mldab,8), rC2 -+ mulps rC13, rC2 -+ movaps 0-120(pA5, mldab,2), rC3 -+ mulps rC13, rC3 -+ movaps 0-120(pA5, mldab), rC4 -+ mulps rC13, rC4 -+ movaps 0-120(pA5), rC5 -+ mulps rC13, rC5 -+ movaps 0-120(pA5, ldab), rC6 -+ mulps rC13, rC6 -+ movaps 0-120(pA5, ldab,2), rC7 -+ mulps rC13, rC7 -+ movaps 0-120(pA10, mldab,2), rC8 -+ mulps rC13, rC8 -+ movaps 0-120(pA5,ldab,4), rC9 -+ mulps rC13, rC9 -+ movaps 0-120(pA10), rC10 -+ mulps rC13, rC10 -+ movaps 0-120(pA10,ldab), rC11 -+ mulps rC13, rC11 -+ movaps 0-120(pA10,ldab,2), rC12 -+ mulps rC13, rC12 -+ mulps 0-120(pA5,ldab,8), rC13 - #endif - - #if KB > 4 -- movaps 16-120(pA10,mldab5,2), rA0 -- movaps 16-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 16-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 16-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 16-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 16-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 16-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 16-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 16-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 16-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 16-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 16-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 16-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 16-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 16-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 16-120(pA10,mldab5,2), rA0 -+ movaps 16-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 16-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 16-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 16-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 16-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 16-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 16-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 16-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 16-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 16-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 16-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 16-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 16-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 16-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 8 -- movaps 32-120(pA10,mldab5,2), rA0 -- movaps 32-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 32-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 32-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 32-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 32-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 32-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 32-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 32-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 32-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 32-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 32-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 32-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 32-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 32-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 32-120(pA10,mldab5,2), rA0 -+ movaps 32-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 32-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 32-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 32-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 32-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 32-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 32-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 32-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 32-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 32-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 32-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 32-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 32-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 32-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 12 -- movaps 48-120(pA10,mldab5,2), rA0 -- movaps 48-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 48-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 48-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 48-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 48-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 48-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 48-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 48-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 48-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 48-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 48-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 48-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 48-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 48-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 48-120(pA10,mldab5,2), rA0 -+ movaps 48-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 48-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 48-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 48-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 48-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 48-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 48-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 48-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 48-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 48-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 48-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 48-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 48-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 48-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 16 -- movaps 64-120(pA10,mldab5,2), rA0 -- movaps 64-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 64-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 64-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 64-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 64-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 64-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 64-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 64-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 64-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 64-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 64-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 64-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 64-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 64-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 64-120(pA10,mldab5,2), rA0 -+ movaps 64-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 64-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 64-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 64-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 64-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 64-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 64-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 64-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 64-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 64-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 64-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 64-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 64-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 64-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 20 -- movaps 80-120(pA10,mldab5,2), rA0 -- movaps 80-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 80-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 80-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 80-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 80-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 80-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 80-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 80-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 80-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 80-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 80-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 80-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 80-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 80-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 80-120(pA10,mldab5,2), rA0 -+ movaps 80-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 80-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 80-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 80-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 80-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 80-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 80-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 80-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 80-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 80-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 80-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 80-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 80-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 80-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 24 -- movaps 96-120(pA10,mldab5,2), rA0 -- movaps 96-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 96-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 96-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 96-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 96-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 96-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 96-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 96-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 96-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 96-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 96-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 96-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 96-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 96-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 96-120(pA10,mldab5,2), rA0 -+ movaps 96-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 96-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 96-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 96-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 96-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 96-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 96-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 96-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 96-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 96-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 96-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 96-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 96-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 96-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 28 -- movaps 112-120(pA10,mldab5,2), rA0 -- movaps 112-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 112-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 112-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 112-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 112-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 112-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 112-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 112-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 112-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 112-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 112-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 112-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 112-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 112-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 112-120(pA10,mldab5,2), rA0 -+ movaps 112-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 112-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 112-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 112-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 112-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 112-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 112-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 112-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 112-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 112-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 112-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 112-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 112-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 112-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 32 -- movaps 128-120(pA10,mldab5,2), rA0 -- movaps 128-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 128-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 128-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 128-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 128-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 128-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 128-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 128-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 128-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 128-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 128-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 128-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 128-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 128-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 128-120(pA10,mldab5,2), rA0 -+ movaps 128-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 128-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 128-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 128-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 128-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 128-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 128-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 128-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 128-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 128-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 128-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 128-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 128-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 128-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 36 -- movaps 144-120(pA10,mldab5,2), rA0 -- movaps 144-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 144-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 144-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 144-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 144-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 144-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 144-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 144-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 144-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 144-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 144-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 144-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 144-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 144-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 144-120(pA10,mldab5,2), rA0 -+ movaps 144-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 144-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 144-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 144-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 144-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 144-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 144-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 144-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 144-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 144-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 144-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 144-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 144-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 144-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif -- prefB((pB,ldab)) -- prefB(64(pB,ldab)) -+ prefB((pB,ldab)) -+ prefB(64(pB,ldab)) - - #if KB > 40 -- movaps 160-120(pA10,mldab5,2), rA0 -- movaps 160-120(pB0), rB0 -- mulps rB0, rA0 -- addq $176, pB0 -- addps rA0, rC0 -- movaps 160-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 160-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 160-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 160-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 160-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 160-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 160-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 160-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 160-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 160-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 160-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 160-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addq $176, pA10 -- addps rA0, rC12 -- mulps 160-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -- addq $176, pA5 -+ movaps 160-120(pA10,mldab5,2), rA0 -+ movaps 160-120(pB0), rB0 -+ mulps rB0, rA0 -+ addq $176, pB0 -+ addps rA0, rC0 -+ movaps 160-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 160-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 160-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 160-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 160-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 160-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 160-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 160-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 160-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 160-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 160-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 160-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addq $176, pA10 -+ addps rA0, rC12 -+ mulps 160-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 -+ addq $176, pA5 - #else -- addq $176, pB0 -- addq $176, pA10 -- addq $176, pA5 -+ addq $176, pB0 -+ addq $176, pA10 -+ addq $176, pA5 - #endif - - #if KB > 44 -- movaps 0-120(pA10,mldab5,2), rA0 -- movaps 0-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 0-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 0-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 0-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 0-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 0-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 0-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 0-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 0-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 0-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 0-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 0-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 0-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 0-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 0-120(pA10,mldab5,2), rA0 -+ movaps 0-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 0-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 0-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 0-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 0-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 0-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 0-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 0-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 0-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 0-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 0-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 0-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 0-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 0-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 48 -- movaps 16-120(pA10,mldab5,2), rA0 -- movaps 16-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 16-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 16-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 16-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 16-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 16-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 16-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 16-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 16-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 16-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 16-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 16-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 16-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 16-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 16-120(pA10,mldab5,2), rA0 -+ movaps 16-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 16-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 16-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 16-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 16-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 16-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 16-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 16-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 16-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 16-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 16-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 16-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 16-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 16-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 52 -- movaps 32-120(pA10,mldab5,2), rA0 -- movaps 32-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 32-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 32-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 32-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 32-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 32-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 32-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 32-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 32-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 32-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 32-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 32-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 32-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 32-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 32-120(pA10,mldab5,2), rA0 -+ movaps 32-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 32-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 32-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 32-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 32-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 32-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 32-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 32-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 32-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 32-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 32-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 32-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 32-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 32-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 56 -- movaps 48-120(pA10,mldab5,2), rA0 -- movaps 48-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 48-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 48-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 48-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 48-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 48-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 48-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 48-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 48-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 48-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 48-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 48-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 48-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 48-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 48-120(pA10,mldab5,2), rA0 -+ movaps 48-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 48-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 48-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 48-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 48-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 48-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 48-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 48-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 48-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 48-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 48-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 48-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 48-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 48-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 60 -- movaps 64-120(pA10,mldab5,2), rA0 -- movaps 64-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 64-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 64-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 64-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 64-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 64-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 64-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 64-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 64-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 64-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 64-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 64-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 64-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 64-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 64-120(pA10,mldab5,2), rA0 -+ movaps 64-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 64-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 64-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 64-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 64-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 64-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 64-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 64-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 64-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 64-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 64-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 64-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 64-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 64-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif -- prefB(128-176(pB,ldab)) -- prefB(192-176(pB,ldab)) -+ prefB(128-176(pB,ldab)) -+ prefB(192-176(pB,ldab)) - - #if KB > 64 -- movaps 80-120(pA10,mldab5,2), rA0 -- movaps 80-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 80-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 80-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 80-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 80-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 80-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 80-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 80-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 80-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 80-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 80-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 80-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 80-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 80-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 80-120(pA10,mldab5,2), rA0 -+ movaps 80-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 80-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 80-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 80-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 80-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 80-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 80-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 80-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 80-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 80-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 80-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 80-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 80-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 80-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 68 -- movaps 96-120(pA10,mldab5,2), rA0 -- movaps 96-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 96-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 96-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 96-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 96-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 96-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 96-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 96-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 96-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 96-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 96-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 96-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 96-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 96-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 96-120(pA10,mldab5,2), rA0 -+ movaps 96-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 96-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 96-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 96-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 96-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 96-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 96-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 96-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 96-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 96-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 96-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 96-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 96-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 96-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 72 -- movaps 112-120(pA10,mldab5,2), rA0 -- movaps 112-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 112-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 112-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 112-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 112-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 112-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 112-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 112-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 112-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 112-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 112-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 112-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 112-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 112-120(pA5,ldab,8), rB0 -- prefC((pC)) -- prefC((pC,incCn)) -- addps rB0, rC13 -+ movaps 112-120(pA10,mldab5,2), rA0 -+ movaps 112-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 112-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 112-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 112-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 112-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 112-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 112-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 112-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 112-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 112-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 112-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 112-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 112-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 112-120(pA5,ldab,8), rB0 -+ prefC((pC)) -+ prefC((pC,incCn)) -+ addps rB0, rC13 - #else -- prefC((pC)) -- prefC((pC,incCn)) -+ prefC((pC)) -+ prefC((pC,incCn)) - #endif - - #if KB > 76 -- movaps 128-120(pA10,mldab5,2), rA0 -- movaps 128-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 128-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 128-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 128-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 128-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 128-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 128-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 128-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 128-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 128-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 128-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 128-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 128-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 128-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 128-120(pA10,mldab5,2), rA0 -+ movaps 128-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 128-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 128-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 128-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 128-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 128-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 128-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 128-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 128-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 128-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 128-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 128-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 128-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 128-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - #if KB > 80 -- movaps 144-120(pA10,mldab5,2), rA0 -- movaps 144-120(pB0), rB0 -- mulps rB0, rA0 -- addps rA0, rC0 -- movaps 144-120(pA5, mldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC1 -- movaps 144-120(pA10, mldab,8), rA0 -- mulps rB0, rA0 -- addps rA0, rC2 -- movaps 144-120(pA5, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC3 -- movaps 144-120(pA5, mldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC4 -- movaps 144-120(pA5), rA0 -- mulps rB0, rA0 -- addps rA0, rC5 -- movaps 144-120(pA5, ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC6 -- movaps 144-120(pA5, ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC7 -- movaps 144-120(pA10, mldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC8 -- movaps 144-120(pA5,ldab,4), rA0 -- mulps rB0, rA0 -- addps rA0, rC9 -- movaps 144-120(pA10), rA0 -- mulps rB0, rA0 -- addps rA0, rC10 -- movaps 144-120(pA10,ldab), rA0 -- mulps rB0, rA0 -- addps rA0, rC11 -- movaps 144-120(pA10,ldab,2), rA0 -- mulps rB0, rA0 -- addps rA0, rC12 -- mulps 144-120(pA5,ldab,8), rB0 -- addps rB0, rC13 -+ movaps 144-120(pA10,mldab5,2), rA0 -+ movaps 144-120(pB0), rB0 -+ mulps rB0, rA0 -+ addps rA0, rC0 -+ movaps 144-120(pA5, mldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC1 -+ movaps 144-120(pA10, mldab,8), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC2 -+ movaps 144-120(pA5, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC3 -+ movaps 144-120(pA5, mldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC4 -+ movaps 144-120(pA5), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC5 -+ movaps 144-120(pA5, ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC6 -+ movaps 144-120(pA5, ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC7 -+ movaps 144-120(pA10, mldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC8 -+ movaps 144-120(pA5,ldab,4), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC9 -+ movaps 144-120(pA10), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC10 -+ movaps 144-120(pA10,ldab), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC11 -+ movaps 144-120(pA10,ldab,2), rA0 -+ mulps rB0, rA0 -+ addps rA0, rC12 -+ mulps 144-120(pA5,ldab,8), rB0 -+ addps rB0, rC13 - #endif - - /*UKLOOP */ -@@ -2454,202 +2461,202 @@ MLAST: - * Get these bastard things summed up correctly - */ - -- /* rC0 = c0a c0b c0c c0d */ -- /* rC1 = c1a c1b c1c c1d */ -- /* rC2 = c2a c2b c2c c2d */ -- /* rC3 = c3a c3b c3c c3d */ -+ /* rC0 = c0a c0b c0c c0d */ -+ /* rC1 = c1a c1b c1c c1d */ -+ /* rC2 = c2a c2b c2c c2d */ -+ /* rC3 = c3a c3b c3c c3d */ - /* */ -- movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */ -- prefC(64(pC,incCn)) -- prefB(256-176(pB,ldab)) -- movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */ -- unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */ -- unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */ -- unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */ -- movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */ -- unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */ -- movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */ -- movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */ -- movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */ -- addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */ -- movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */ -- movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */ -- movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */ -- addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */ -- movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */ -- addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */ -- -- -- /* rC4 = c4a c4b c4c c4d */ -- /* rC5 = c5a c5b c5c c5d */ -- /* rC6 = c6a c6b c6c c6d */ -- /* rC7 = c7a c7b c7c c7d */ -- /* rC8 = c08a c08b c08c c08d */ -- /* rC9 = c09a c09b c09c c09d */ -- /* rC10 = c10a c10b c10c c10d */ -- /* rC11 = c11a c11b c11c c11d */ -- /* rC12 = c12a c12b c12c c12d */ -- /* rC13 = c13a c13b c13c c13d */ -+ movaps rC2, rB0 /* rB0 = c2a c2b c2c c2d */ -+ prefC(64(pC,incCn)) -+ prefB(256-176(pB,ldab)) -+ movaps rC0, rA0 /* rA0 = c0a c0b c0c c0d */ -+ unpckhps rC3, rB0 /* rB0 = c2c c3c c2d c3d */ -+ unpckhps rC1, rA0 /* rA0 = c0c c1c c0d c1d */ -+ unpcklps rC3, rC2 /* rC2 = c2a c3a c2b c3b */ -+ movlhps rB0, rC3 /* rC3 = c3a c3b c2c c3c */ -+ unpcklps rC1, rC0 /* rC0 = c0a c1a c0b c1b */ -+ movhlps rA0, rC3 /* rC3 = c0d c1d c2c c3c */ -+ movlhps rC2, rA0 /* rA0 = c0c c1c c2a c3a */ -+ movhlps rC0, rB0 /* rB0 = c0b c1b c2d c3d */ -+ addps rA0, rC3 /* rC3 = c0cd c1cd c2ac c3ac */ -+ movlhps rC0, rC1 /* rC1 = c1a c1b c0a c1a */ -+ movhlps rC1, rC2 /* rC2 = c0a c1a c2b c3b */ -+ movaps rC4, rA0 /* rA0 = c4a c4b c4c c4d */ -+ addps rB0, rC2 /* rC2 = c0ab c1ab c2bd c3bd */ -+ movaps rC6, rB0 /* rB0 = c6a c6b c6c c6d */ -+ addps rC2, rC3 /* rC3 = c0abcd c1abcd c2bdac c3bdac */ -+ -+ -+ /* rC4 = c4a c4b c4c c4d */ -+ /* rC5 = c5a c5b c5c c5d */ -+ /* rC6 = c6a c6b c6c c6d */ -+ /* rC7 = c7a c7b c7c c7d */ -+ /* rC8 = c08a c08b c08c c08d */ -+ /* rC9 = c09a c09b c09c c09d */ -+ /* rC10 = c10a c10b c10c c10d */ -+ /* rC11 = c11a c11b c11c c11d */ -+ /* rC12 = c12a c12b c12c c12d */ -+ /* rC13 = c13a c13b c13c c13d */ - /* */ -- movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */ -- movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */ -- movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */ -- unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */ -- unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */ -- unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */ -- unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */ -- unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */ -- movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */ -- unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */ -- movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */ -- movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */ -- unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */ -- movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */ -- movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */ -- addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */ -+ movaps rC10, rC0 /* rC0 = c10a c10b c10c c10d */ -+ movaps rC8 , rC1 /* rC1 = c08a c08b c08c c08d */ -+ movaps rC12, rC2 /* rC2 = c12a c12b c12c c12d */ -+ unpckhps rC7, rB0 /* rB0 = c6c c7c c6d c7d */ -+ unpckhps rC5, rA0 /* rA0 = c4c c5c c4d c5d */ -+ unpcklps rC7, rC6 /* rC6 = c6a c7a c6b c7b */ -+ unpckhps rC11, rC0 /* rC0 = c10c c11c c10d c11d */ -+ unpckhps rC9 , rC1 /* rC1 = c08c c09c c08d c09d */ -+ movlhps rB0, rC7 /* rC7 = c7a c7b c6c c7c */ -+ unpcklps rC5, rC4 /* rC4 = c4a c5a c4b c5b */ -+ movhlps rA0, rC7 /* rC7 = c4d c5d c6c c7c */ -+ movlhps rC6, rA0 /* rA0 = c4c c5c c6a c7a */ -+ unpcklps rC11, rC10 /* rC10 = c10a c11a c10b c11b */ -+ movhlps rC4, rB0 /* rB0 = c4b c5b c6d c7d */ -+ movlhps rC0, rC11 /* rC11 = c11a c11b c10c c11c */ -+ addps rA0, rC7 /* rC7 = c4cd c5cd c6ac c7ac */ - #ifdef BETAX - #ifdef SREAL -- movups (pC), rA0 -- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ -- movups 16(pC), rC4 -- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ -- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ -- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ -- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ -- movups 32(pC), rC5 -- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ -- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ -- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ -- movlps 48(pC), rC1 -- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ -- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ -- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ -- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ -- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ -- mulps BOF(%rsp), rA0 -- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ -- mulps BOF(%rsp), rC4 -- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ -- mulps BOF(%rsp), rC5 -- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ -- mulps BOF(%rsp), rC1 -+ movups (pC), rA0 -+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ -+ movups 16(pC), rC4 -+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ -+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ -+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ -+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ -+ movups 32(pC), rC5 -+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ -+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ -+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ -+ movlps 48(pC), rC1 -+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ -+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ -+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ -+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ -+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ -+ mulps BOF(%rsp), rA0 -+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ -+ mulps BOF(%rsp), rC4 -+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ -+ mulps BOF(%rsp), rC5 -+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ -+ mulps BOF(%rsp), rC1 - - /* */ - -- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ -- addps rA0, rC3 -- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ -- addps rC4, rC7 -- addps rC5, rC11 -- prefB(320-176(pB,ldab)) -- addps rC1, rC12 -+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ -+ addps rA0, rC3 -+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ -+ addps rC4, rC7 -+ addps rC5, rC11 -+ prefB(320-176(pB,ldab)) -+ addps rC1, rC12 - #else /* BETA = X, complex type */ -- movups (pC), rA0 -- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ -- movups 16(pC), rC4 -- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ -- shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */ -- movups 32(pC), rC4 /* rC4 = c4 X c5 X */ -- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ -- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ -- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ -- movups 48(pC), rC5 /* rC5 = c6 X c7 X */ -- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ -- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ -- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ -- shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */ -- movups 64(pC), rC5 /* rC5 = c8 X c9 X */ -- movups 80(pC), rC1 /* rC1 = c10 X c11 X */ -- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ -- shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */ -- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ -- movss 96(pC), rC1 -- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ -- movss 104(pC), rB0 -- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ -- unpcklps rB0, rC1 -- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ -- mulps BOF(%rsp), rA0 -- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ -- mulps BOF(%rsp), rC4 -- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ -- mulps BOF(%rsp), rC5 -- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ -- mulps BOF(%rsp), rC1 -+ movups (pC), rA0 -+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ -+ movups 16(pC), rC4 -+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ -+ shufps $0x88, rC4, rA0 /* rA0 = c0 c1 c2 c3 */ -+ movups 32(pC), rC4 /* rC4 = c4 X c5 X */ -+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ -+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ -+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ -+ movups 48(pC), rC5 /* rC5 = c6 X c7 X */ -+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ -+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ -+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ -+ shufps $0x88, rC5, rC4 /* rC4 = c4 c5 c6 c7 */ -+ movups 64(pC), rC5 /* rC5 = c8 X c9 X */ -+ movups 80(pC), rC1 /* rC1 = c10 X c11 X */ -+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ -+ shufps $0x88, rC1, rC5 /* rC5 = c8 c9 c10 c11 */ -+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ -+ movss 96(pC), rC1 -+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ -+ movss 104(pC), rB0 -+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ -+ unpcklps rB0, rC1 -+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ -+ mulps BOF(%rsp), rA0 -+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ -+ mulps BOF(%rsp), rC4 -+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ -+ mulps BOF(%rsp), rC5 -+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ -+ mulps BOF(%rsp), rC1 - - /* */ - -- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ -- addps rA0, rC3 -- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ -- addps rC4, rC7 -- addps rC5, rC11 -- prefB(320-176(pB,ldab)) -- addps rC1, rC12 -+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ -+ addps rA0, rC3 -+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ -+ addps rC4, rC7 -+ addps rC5, rC11 -+ prefB(320-176(pB,ldab)) -+ addps rC1, rC12 - #endif - - #else -- movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ -- unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ -- movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ -- movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ -- movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ -- movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ -- unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ -- addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ -- addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ -- movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ -- unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ -- movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ -- addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ -- addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ -- addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ -- addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ -+ movlhps rC4, rC5 /* rC5 = c5a c5b c4a c5a */ -+ unpcklps rC9 , rC8 /* rC8 = c08a c09a c08b c09b */ -+ movhlps rC1, rC11 /* rC11 = c08d c09d c10c c11c */ -+ movlhps rC10, rC1 /* rC1 = c08c c09c c10a c11a */ -+ movhlps rC5, rC6 /* rC6 = c4a c5a c6b c7b */ -+ movhlps rC8 , rC0 /* rC0 = c08b c09b c10d c11d */ -+ unpcklps rC13, rC2 /* rC2 = c12a c13a c12b c13b */ -+ addps rC1, rC11 /* rC11 = c08cd c09cd c10ac c11ac */ -+ addps rB0, rC6 /* rC6 = c4ab c5ab c6bd c7bd */ -+ movlhps rC8 , rC9 /* rC9 = c09a c09b c08a c09a */ -+ unpckhps rC13, rC12 /* rC12 = c12c c13c c12d c13d */ -+ movhlps rC9 , rC10 /* rC10 = c08a c09a c10b c11b */ -+ addps rC6, rC7 /* rC7 = c4abcd c5abcd c6bdac c7bdac */ -+ addps rC0, rC10 /* rC10 = c08ab c09ab c10bd c11bd */ -+ addps rC2, rC12 /* rC12 = c12ac c13ac c12bd c13bd */ -+ addps rC10, rC11 /* rC11 = c08abcd c09abcd c10bdac c11bdac */ - - /* */ - -- movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ -- prefB(320-176(pB,ldab)) -- addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ -+ movhlps rC12, rC13 /* rC13 = c12bd c13bd X X */ -+ prefB(320-176(pB,ldab)) -+ addps rC13, rC12 /* rC12 = c12abcd c13abcd X X */ - - #endif - /* - * Write results back to C; pC += 14; - */ - #ifdef SREAL -- movups rC3, (pC) -- movups rC7, 16(pC) -- movups rC11, 32(pC) -- movlps rC12, 48(pC) --/* addq $56, pC */ -+ movups rC3, (pC) -+ movups rC7, 16(pC) -+ movups rC11, 32(pC) -+ movlps rC12, 48(pC) -+/* addq $56, pC */ - #else -- movss rC3, (pC) -- movss rC7, 32(pC) -- movhlps rC3, rC0 -- movhlps rC7, rC6 -- movss rC0, 16(pC) -- movss rC6, 48(pC) -- shufps $0x55, rC3, rC3 -- shufps $0x55, rC7, rC7 -- movss rC3, 8(pC) -- movss rC7, 40(pC) -- shufps $0x55, rC0, rC0 -- shufps $0x55, rC6, rC6 -- movss rC0, 24(pC) -- movss rC6, 56(pC) -- -- movss rC11, 64(pC) -- movhlps rC11, rC2 -- movss rC12, 96(pC) -- movss rC2, 80(pC) -- shufps $0x55, rC11, rC11 -- shufps $0x55, rC12, rC12 -- movss rC11, 72(pC) -- shufps $0x55, rC2, rC2 -- movss rC12, 104(pC) -- movss rC2, 88(pC) -+ movss rC3, (pC) -+ movss rC7, 32(pC) -+ movhlps rC3, rC0 -+ movhlps rC7, rC6 -+ movss rC0, 16(pC) -+ movss rC6, 48(pC) -+ shufps $0x55, rC3, rC3 -+ shufps $0x55, rC7, rC7 -+ movss rC3, 8(pC) -+ movss rC7, 40(pC) -+ shufps $0x55, rC0, rC0 -+ shufps $0x55, rC6, rC6 -+ movss rC0, 24(pC) -+ movss rC6, 56(pC) -+ -+ movss rC11, 64(pC) -+ movhlps rC11, rC2 -+ movss rC12, 96(pC) -+ movss rC2, 80(pC) -+ shufps $0x55, rC11, rC11 -+ shufps $0x55, rC12, rC12 -+ movss rC11, 72(pC) -+ shufps $0x55, rC2, rC2 -+ movss rC12, 104(pC) -+ movss rC2, 88(pC) - --/* addq $112, pC */ -+/* addq $112, pC */ - #endif - /* - * Write results back to C -@@ -2660,55 +2667,55 @@ MLAST: - /* - * while (pA != stM); - */ --/* subq $1, stM */ --/* jne UMLOOP */ -+/* subq $1, stM */ -+/* jne UMLOOP */ - /* - * pC += 14; pA += 14*NB; pB -= NB; - */ --/* subq $MBKBso-NB14so+176, pA5 */ --/* subq $MBKBso-NB14so+176, pA10 */ -- subq incAm, pA5 -- subq incAm, pA10 -- addq $NBso-176, pB0 -+/* subq $MBKBso-NB14so+176, pA5 */ -+/* subq $MBKBso-NB14so+176, pA10 */ -+ subq incAm, pA5 -+ subq incAm, pA10 -+ addq $NBso-176, pB0 - /* - * while (pA != stM); - */ --/* subq $1, stM */ --/* jne UMLOOP */ -+/* subq $1, stM */ -+/* jne UMLOOP */ - /* - * pC += incCn; pA -= NBNB; pB += NB; - */ -- addq incCn, pC -+ addq incCn, pC - /* - * while (pB != stN); - */ -- sub $1, stN -- jne UNLOOP -+ sub $1, stN -+ jne UNLOOP - - /* - * Restore callee-saved iregs - */ - DONE: -- movq -8(%rsp), %rbp -- movq -16(%rsp), %rbx -+ movq -8(%rsp), %rbp -+ movq -16(%rsp), %rbx - #if MB == 0 -- movq -32(%rsp), %r12 -- movq -40(%rsp), %r13 -+ movq -32(%rsp), %r12 -+ movq -40(%rsp), %r13 - #endif -- ret -+ ret - #if MB == 0 - MB_LT84: -- cmp $70, stM -- jne MB_LT70 --/* movq $70/14, stM */ -- movq $5, stM -- jmp MBFOUND -+ cmp $70, stM -+ jne MB_LT70 -+/* movq $70/14, stM */ -+ movq $5, stM -+ jmp MBFOUND - MB_LT70: -- cmp $56, stM -- jne MB_LT56 --/* movq $56/14, stM */ -- movq $4, stM -- jmp MBFOUND -+ cmp $56, stM -+ jne MB_LT56 -+/* movq $56/14, stM */ -+ movq $4, stM -+ jmp MBFOUND - MB_LT56: - cmp $42, stM - jne MB_LT42 -diff -rupN ATLAS/tune/blas/level1/scalsrch.c atlas-3.8.3/tune/blas/level1/scalsrch.c ---- ATLAS/tune/blas/level1/scalsrch.c 2009-02-18 19:48:25.000000000 +0100 -+++ atlas-3.8.3/tune/blas/level1/scalsrch.c 2009-11-12 13:45:48.141174024 +0100 -@@ -747,7 +747,7 @@ void GenMainRout(char pre, int n, int *i - /* - * Handle all special alpha cases - */ -- fprintf(fpout, "%sif ( SCALAR_IS_ZERO(alpha) )\n", spc); -+ /* fprintf(fpout, "%sif ( SCALAR_IS_ZERO(alpha) )\n", spc); - fprintf(fpout, "%s{\n", spc); - if (pre == 'c' || pre == 'z') - { -@@ -756,7 +756,7 @@ void GenMainRout(char pre, int n, int *i - } - else fprintf(fpout, "%s Mjoin(PATL,set)(N, ATL_rzero, X, incx);\n", spc); - fprintf(fpout, "%s return;\n", spc); -- fprintf(fpout, "%s}\n", spc); -+ fprintf(fpout, "%s}\n", spc); */ - GenAlphCase(pre, spc, fpout, 1, n, ix, iy, ia, ib); - GenAlphCase(pre, spc, fpout, -1, n, ix, iy, ia, ib); - if (pre == 'c' || pre == 'z') |